diff --git "a/out/checkpoint-17000/trainer_state.json" "b/out/checkpoint-17000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/out/checkpoint-17000/trainer_state.json" @@ -0,0 +1,119169 @@ +{ + "best_metric": 2.3730249404907227, + "best_model_checkpoint": "./out/checkpoint-17000", + "epoch": 1.3719635219110644, + "eval_steps": 1000, + "global_step": 17000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 8.07037365830038e-05, + "grad_norm": 0.8911969065666199, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.6759, + "step": 1 + }, + { + "epoch": 0.0001614074731660076, + "grad_norm": 0.8724873661994934, + "learning_rate": 4.000000000000001e-06, + "loss": 2.7001, + "step": 2 + }, + { + "epoch": 0.00024211120974901139, + "grad_norm": 0.9050428867340088, + "learning_rate": 6e-06, + "loss": 2.6291, + "step": 3 + }, + { + "epoch": 0.0003228149463320152, + "grad_norm": 0.9249712824821472, + "learning_rate": 8.000000000000001e-06, + "loss": 2.7174, + "step": 4 + }, + { + "epoch": 0.000403518682915019, + "grad_norm": 0.9102846384048462, + "learning_rate": 1e-05, + "loss": 2.6831, + "step": 5 + }, + { + "epoch": 0.00048422241949802277, + "grad_norm": 0.9129141569137573, + "learning_rate": 1.2e-05, + "loss": 2.684, + "step": 6 + }, + { + "epoch": 0.0005649261560810266, + "grad_norm": 0.8648065328598022, + "learning_rate": 1.4000000000000001e-05, + "loss": 2.6488, + "step": 7 + }, + { + "epoch": 0.0006456298926640304, + "grad_norm": 0.8677545785903931, + "learning_rate": 1.6000000000000003e-05, + "loss": 2.7143, + "step": 8 + }, + { + "epoch": 0.0007263336292470342, + "grad_norm": 0.919029712677002, + "learning_rate": 1.8e-05, + "loss": 2.631, + "step": 9 + }, + { + "epoch": 0.000807037365830038, + "grad_norm": 0.9289683103561401, + "learning_rate": 2e-05, + "loss": 2.6564, + "step": 10 + }, + { + "epoch": 0.0008877411024130417, + "grad_norm": 0.8810267448425293, + 
"learning_rate": 2.2000000000000003e-05, + "loss": 2.6395, + "step": 11 + }, + { + "epoch": 0.0009684448389960455, + "grad_norm": 0.8185754418373108, + "learning_rate": 2.4e-05, + "loss": 2.6871, + "step": 12 + }, + { + "epoch": 0.0010491485755790492, + "grad_norm": 0.9476913213729858, + "learning_rate": 2.6000000000000002e-05, + "loss": 2.7011, + "step": 13 + }, + { + "epoch": 0.0011298523121620531, + "grad_norm": 0.9616057872772217, + "learning_rate": 2.8000000000000003e-05, + "loss": 2.7373, + "step": 14 + }, + { + "epoch": 0.0012105560487450568, + "grad_norm": 0.9429686665534973, + "learning_rate": 3e-05, + "loss": 2.7556, + "step": 15 + }, + { + "epoch": 0.0012912597853280607, + "grad_norm": 1.0331422090530396, + "learning_rate": 3.2000000000000005e-05, + "loss": 2.7756, + "step": 16 + }, + { + "epoch": 0.0013719635219110644, + "grad_norm": 0.906057596206665, + "learning_rate": 3.4000000000000007e-05, + "loss": 2.7053, + "step": 17 + }, + { + "epoch": 0.0014526672584940683, + "grad_norm": 0.8677626252174377, + "learning_rate": 3.6e-05, + "loss": 2.7012, + "step": 18 + }, + { + "epoch": 0.001533370995077072, + "grad_norm": 0.9378079175949097, + "learning_rate": 3.8e-05, + "loss": 2.6786, + "step": 19 + }, + { + "epoch": 0.001614074731660076, + "grad_norm": 1.0333882570266724, + "learning_rate": 4e-05, + "loss": 2.689, + "step": 20 + }, + { + "epoch": 0.0016947784682430796, + "grad_norm": 0.9435378909111023, + "learning_rate": 4.2e-05, + "loss": 2.7084, + "step": 21 + }, + { + "epoch": 0.0017754822048260835, + "grad_norm": 0.9530225396156311, + "learning_rate": 4.4000000000000006e-05, + "loss": 2.7039, + "step": 22 + }, + { + "epoch": 0.0018561859414090872, + "grad_norm": 1.0154749155044556, + "learning_rate": 4.600000000000001e-05, + "loss": 2.6623, + "step": 23 + }, + { + "epoch": 0.001936889677992091, + "grad_norm": 1.0341671705245972, + "learning_rate": 4.8e-05, + "loss": 2.7072, + "step": 24 + }, + { + "epoch": 0.002017593414575095, + "grad_norm": 
0.9185739159584045, + "learning_rate": 5e-05, + "loss": 2.6595, + "step": 25 + }, + { + "epoch": 0.0020982971511580985, + "grad_norm": 1.060390591621399, + "learning_rate": 5.2000000000000004e-05, + "loss": 2.7045, + "step": 26 + }, + { + "epoch": 0.0021790008877411024, + "grad_norm": 0.9720118641853333, + "learning_rate": 5.4000000000000005e-05, + "loss": 2.6513, + "step": 27 + }, + { + "epoch": 0.0022597046243241063, + "grad_norm": 0.9426784515380859, + "learning_rate": 5.6000000000000006e-05, + "loss": 2.6541, + "step": 28 + }, + { + "epoch": 0.00234040836090711, + "grad_norm": 0.9736170768737793, + "learning_rate": 5.8e-05, + "loss": 2.7324, + "step": 29 + }, + { + "epoch": 0.0024211120974901136, + "grad_norm": 0.9831354022026062, + "learning_rate": 6e-05, + "loss": 2.6651, + "step": 30 + }, + { + "epoch": 0.0025018158340731175, + "grad_norm": 1.0222605466842651, + "learning_rate": 6.2e-05, + "loss": 2.7375, + "step": 31 + }, + { + "epoch": 0.0025825195706561214, + "grad_norm": 0.9182235598564148, + "learning_rate": 6.400000000000001e-05, + "loss": 2.7142, + "step": 32 + }, + { + "epoch": 0.0026632233072391254, + "grad_norm": 1.0200958251953125, + "learning_rate": 6.6e-05, + "loss": 2.6785, + "step": 33 + }, + { + "epoch": 0.002743927043822129, + "grad_norm": 1.0153381824493408, + "learning_rate": 6.800000000000001e-05, + "loss": 2.6737, + "step": 34 + }, + { + "epoch": 0.0028246307804051327, + "grad_norm": 0.8998087644577026, + "learning_rate": 7e-05, + "loss": 2.7594, + "step": 35 + }, + { + "epoch": 0.0029053345169881366, + "grad_norm": 0.9005621671676636, + "learning_rate": 7.2e-05, + "loss": 2.713, + "step": 36 + }, + { + "epoch": 0.0029860382535711405, + "grad_norm": 1.0165663957595825, + "learning_rate": 7.4e-05, + "loss": 2.7197, + "step": 37 + }, + { + "epoch": 0.003066741990154144, + "grad_norm": 1.0011894702911377, + "learning_rate": 7.6e-05, + "loss": 2.6315, + "step": 38 + }, + { + "epoch": 0.003147445726737148, + "grad_norm": 1.141209602355957, + 
"learning_rate": 7.800000000000001e-05, + "loss": 2.7249, + "step": 39 + }, + { + "epoch": 0.003228149463320152, + "grad_norm": 0.9114719033241272, + "learning_rate": 8e-05, + "loss": 2.7039, + "step": 40 + }, + { + "epoch": 0.0033088531999031557, + "grad_norm": 1.0193392038345337, + "learning_rate": 8.2e-05, + "loss": 2.6501, + "step": 41 + }, + { + "epoch": 0.003389556936486159, + "grad_norm": 0.9458270072937012, + "learning_rate": 8.4e-05, + "loss": 2.725, + "step": 42 + }, + { + "epoch": 0.003470260673069163, + "grad_norm": 0.9667492508888245, + "learning_rate": 8.6e-05, + "loss": 2.7232, + "step": 43 + }, + { + "epoch": 0.003550964409652167, + "grad_norm": 0.9987972378730774, + "learning_rate": 8.800000000000001e-05, + "loss": 2.6554, + "step": 44 + }, + { + "epoch": 0.003631668146235171, + "grad_norm": 1.0166393518447876, + "learning_rate": 9e-05, + "loss": 2.7291, + "step": 45 + }, + { + "epoch": 0.0037123718828181744, + "grad_norm": 0.9557009935379028, + "learning_rate": 9.200000000000001e-05, + "loss": 2.7194, + "step": 46 + }, + { + "epoch": 0.0037930756194011783, + "grad_norm": 0.9575492143630981, + "learning_rate": 9.4e-05, + "loss": 2.6671, + "step": 47 + }, + { + "epoch": 0.003873779355984182, + "grad_norm": 0.9614555239677429, + "learning_rate": 9.6e-05, + "loss": 2.6865, + "step": 48 + }, + { + "epoch": 0.003954483092567186, + "grad_norm": 0.9245515465736389, + "learning_rate": 9.8e-05, + "loss": 2.7821, + "step": 49 + }, + { + "epoch": 0.00403518682915019, + "grad_norm": 0.9756044745445251, + "learning_rate": 0.0001, + "loss": 2.7608, + "step": 50 + }, + { + "epoch": 0.0041158905657331935, + "grad_norm": 0.95787513256073, + "learning_rate": 0.00010200000000000001, + "loss": 2.6458, + "step": 51 + }, + { + "epoch": 0.004196594302316197, + "grad_norm": 1.0102490186691284, + "learning_rate": 0.00010400000000000001, + "loss": 2.7835, + "step": 52 + }, + { + "epoch": 0.004277298038899201, + "grad_norm": 0.9676176309585571, + "learning_rate": 
0.00010600000000000002, + "loss": 2.702, + "step": 53 + }, + { + "epoch": 0.004358001775482205, + "grad_norm": 0.9724096655845642, + "learning_rate": 0.00010800000000000001, + "loss": 2.714, + "step": 54 + }, + { + "epoch": 0.004438705512065208, + "grad_norm": 0.9482994675636292, + "learning_rate": 0.00011000000000000002, + "loss": 2.8069, + "step": 55 + }, + { + "epoch": 0.0045194092486482125, + "grad_norm": 0.9886480569839478, + "learning_rate": 0.00011200000000000001, + "loss": 2.7468, + "step": 56 + }, + { + "epoch": 0.004600112985231216, + "grad_norm": 0.9696247577667236, + "learning_rate": 0.00011399999999999999, + "loss": 2.7486, + "step": 57 + }, + { + "epoch": 0.00468081672181422, + "grad_norm": 1.0638912916183472, + "learning_rate": 0.000116, + "loss": 2.7747, + "step": 58 + }, + { + "epoch": 0.004761520458397224, + "grad_norm": 1.016483187675476, + "learning_rate": 0.000118, + "loss": 2.6925, + "step": 59 + }, + { + "epoch": 0.004842224194980227, + "grad_norm": 1.0298779010772705, + "learning_rate": 0.00012, + "loss": 2.7487, + "step": 60 + }, + { + "epoch": 0.004922927931563232, + "grad_norm": 1.1082268953323364, + "learning_rate": 0.000122, + "loss": 2.7697, + "step": 61 + }, + { + "epoch": 0.005003631668146235, + "grad_norm": 0.9202101826667786, + "learning_rate": 0.000124, + "loss": 2.7429, + "step": 62 + }, + { + "epoch": 0.0050843354047292386, + "grad_norm": 1.0140503644943237, + "learning_rate": 0.000126, + "loss": 2.7492, + "step": 63 + }, + { + "epoch": 0.005165039141312243, + "grad_norm": 1.0689163208007812, + "learning_rate": 0.00012800000000000002, + "loss": 2.7353, + "step": 64 + }, + { + "epoch": 0.005245742877895246, + "grad_norm": 0.9947141408920288, + "learning_rate": 0.00013000000000000002, + "loss": 2.7385, + "step": 65 + }, + { + "epoch": 0.005326446614478251, + "grad_norm": 1.2034410238265991, + "learning_rate": 0.000132, + "loss": 2.7632, + "step": 66 + }, + { + "epoch": 0.005407150351061254, + "grad_norm": 0.9450412392616272, + 
"learning_rate": 0.000134, + "loss": 2.7547, + "step": 67 + }, + { + "epoch": 0.005487854087644258, + "grad_norm": 1.1818269491195679, + "learning_rate": 0.00013600000000000003, + "loss": 2.7663, + "step": 68 + }, + { + "epoch": 0.005568557824227262, + "grad_norm": 1.003347396850586, + "learning_rate": 0.000138, + "loss": 2.7299, + "step": 69 + }, + { + "epoch": 0.0056492615608102655, + "grad_norm": 1.0105760097503662, + "learning_rate": 0.00014, + "loss": 2.7261, + "step": 70 + }, + { + "epoch": 0.005729965297393269, + "grad_norm": 0.9459090232849121, + "learning_rate": 0.000142, + "loss": 2.7237, + "step": 71 + }, + { + "epoch": 0.005810669033976273, + "grad_norm": 0.9716219305992126, + "learning_rate": 0.000144, + "loss": 2.8175, + "step": 72 + }, + { + "epoch": 0.005891372770559277, + "grad_norm": 0.9968419075012207, + "learning_rate": 0.000146, + "loss": 2.7828, + "step": 73 + }, + { + "epoch": 0.005972076507142281, + "grad_norm": 1.099680781364441, + "learning_rate": 0.000148, + "loss": 2.7111, + "step": 74 + }, + { + "epoch": 0.0060527802437252845, + "grad_norm": 1.004846453666687, + "learning_rate": 0.00015000000000000001, + "loss": 2.7508, + "step": 75 + }, + { + "epoch": 0.006133483980308288, + "grad_norm": 1.0568128824234009, + "learning_rate": 0.000152, + "loss": 2.7341, + "step": 76 + }, + { + "epoch": 0.006214187716891292, + "grad_norm": 0.9871000051498413, + "learning_rate": 0.000154, + "loss": 2.7831, + "step": 77 + }, + { + "epoch": 0.006294891453474296, + "grad_norm": 1.005947232246399, + "learning_rate": 0.00015600000000000002, + "loss": 2.6798, + "step": 78 + }, + { + "epoch": 0.006375595190057299, + "grad_norm": 0.9984713792800903, + "learning_rate": 0.00015800000000000002, + "loss": 2.8126, + "step": 79 + }, + { + "epoch": 0.006456298926640304, + "grad_norm": 0.9805751442909241, + "learning_rate": 0.00016, + "loss": 2.7826, + "step": 80 + }, + { + "epoch": 0.006537002663223307, + "grad_norm": 1.02998685836792, + "learning_rate": 0.000162, + 
"loss": 2.7636, + "step": 81 + }, + { + "epoch": 0.006617706399806311, + "grad_norm": 1.0790135860443115, + "learning_rate": 0.000164, + "loss": 2.7809, + "step": 82 + }, + { + "epoch": 0.006698410136389315, + "grad_norm": 1.1058307886123657, + "learning_rate": 0.000166, + "loss": 2.787, + "step": 83 + }, + { + "epoch": 0.006779113872972318, + "grad_norm": 1.0199624300003052, + "learning_rate": 0.000168, + "loss": 2.7171, + "step": 84 + }, + { + "epoch": 0.006859817609555323, + "grad_norm": 1.006494402885437, + "learning_rate": 0.00017, + "loss": 2.7791, + "step": 85 + }, + { + "epoch": 0.006940521346138326, + "grad_norm": 0.9672449827194214, + "learning_rate": 0.000172, + "loss": 2.6929, + "step": 86 + }, + { + "epoch": 0.00702122508272133, + "grad_norm": 0.9747781157493591, + "learning_rate": 0.000174, + "loss": 2.7676, + "step": 87 + }, + { + "epoch": 0.007101928819304334, + "grad_norm": 0.9193839430809021, + "learning_rate": 0.00017600000000000002, + "loss": 2.7124, + "step": 88 + }, + { + "epoch": 0.0071826325558873375, + "grad_norm": 1.078499436378479, + "learning_rate": 0.00017800000000000002, + "loss": 2.8018, + "step": 89 + }, + { + "epoch": 0.007263336292470342, + "grad_norm": 1.070957899093628, + "learning_rate": 0.00018, + "loss": 2.7889, + "step": 90 + }, + { + "epoch": 0.007344040029053345, + "grad_norm": 1.160942554473877, + "learning_rate": 0.000182, + "loss": 2.8026, + "step": 91 + }, + { + "epoch": 0.007424743765636349, + "grad_norm": 0.9988501071929932, + "learning_rate": 0.00018400000000000003, + "loss": 2.7746, + "step": 92 + }, + { + "epoch": 0.007505447502219353, + "grad_norm": 1.0882319211959839, + "learning_rate": 0.00018600000000000002, + "loss": 2.8105, + "step": 93 + }, + { + "epoch": 0.0075861512388023565, + "grad_norm": 1.1882357597351074, + "learning_rate": 0.000188, + "loss": 2.8294, + "step": 94 + }, + { + "epoch": 0.00766685497538536, + "grad_norm": 1.0761829614639282, + "learning_rate": 0.00019, + "loss": 2.7846, + "step": 95 + }, 
+ { + "epoch": 0.007747558711968364, + "grad_norm": 1.0665982961654663, + "learning_rate": 0.000192, + "loss": 2.8542, + "step": 96 + }, + { + "epoch": 0.007828262448551369, + "grad_norm": 1.206127405166626, + "learning_rate": 0.000194, + "loss": 2.7711, + "step": 97 + }, + { + "epoch": 0.007908966185134371, + "grad_norm": 1.095150113105774, + "learning_rate": 0.000196, + "loss": 2.732, + "step": 98 + }, + { + "epoch": 0.007989669921717376, + "grad_norm": 1.118348240852356, + "learning_rate": 0.00019800000000000002, + "loss": 2.7736, + "step": 99 + }, + { + "epoch": 0.00807037365830038, + "grad_norm": 1.0646461248397827, + "learning_rate": 0.0002, + "loss": 2.8584, + "step": 100 + }, + { + "epoch": 0.008151077394883383, + "grad_norm": 1.0387661457061768, + "learning_rate": 0.0001999999987538693, + "loss": 2.7961, + "step": 101 + }, + { + "epoch": 0.008231781131466387, + "grad_norm": 1.1905474662780762, + "learning_rate": 0.00019999999501547723, + "loss": 2.8615, + "step": 102 + }, + { + "epoch": 0.008312484868049391, + "grad_norm": 0.9630722999572754, + "learning_rate": 0.0001999999887848239, + "loss": 2.8076, + "step": 103 + }, + { + "epoch": 0.008393188604632394, + "grad_norm": 1.1034537553787231, + "learning_rate": 0.00019999998006190942, + "loss": 2.8402, + "step": 104 + }, + { + "epoch": 0.008473892341215398, + "grad_norm": 1.0679295063018799, + "learning_rate": 0.00019999996884673403, + "loss": 2.7948, + "step": 105 + }, + { + "epoch": 0.008554596077798403, + "grad_norm": 1.0108860731124878, + "learning_rate": 0.00019999995513929802, + "loss": 2.7996, + "step": 106 + }, + { + "epoch": 0.008635299814381405, + "grad_norm": 1.3762084245681763, + "learning_rate": 0.0001999999389396017, + "loss": 2.8023, + "step": 107 + }, + { + "epoch": 0.00871600355096441, + "grad_norm": 1.1320533752441406, + "learning_rate": 0.00019999992024764555, + "loss": 2.793, + "step": 108 + }, + { + "epoch": 0.008796707287547414, + "grad_norm": 1.1752389669418335, + "learning_rate": 
0.00019999989906342998, + "loss": 2.8274, + "step": 109 + }, + { + "epoch": 0.008877411024130416, + "grad_norm": 1.2734956741333008, + "learning_rate": 0.00019999987538695552, + "loss": 2.8017, + "step": 110 + }, + { + "epoch": 0.00895811476071342, + "grad_norm": 1.3703055381774902, + "learning_rate": 0.00019999984921822273, + "loss": 2.8699, + "step": 111 + }, + { + "epoch": 0.009038818497296425, + "grad_norm": 1.0079127550125122, + "learning_rate": 0.0001999998205572323, + "loss": 2.8845, + "step": 112 + }, + { + "epoch": 0.00911952223387943, + "grad_norm": 1.28025484085083, + "learning_rate": 0.000199999789403985, + "loss": 2.8636, + "step": 113 + }, + { + "epoch": 0.009200225970462432, + "grad_norm": 1.1057093143463135, + "learning_rate": 0.00019999975575848148, + "loss": 2.8484, + "step": 114 + }, + { + "epoch": 0.009280929707045436, + "grad_norm": 1.0874677896499634, + "learning_rate": 0.00019999971962072265, + "loss": 2.7314, + "step": 115 + }, + { + "epoch": 0.00936163344362844, + "grad_norm": 1.0909658670425415, + "learning_rate": 0.00019999968099070943, + "loss": 2.7827, + "step": 116 + }, + { + "epoch": 0.009442337180211443, + "grad_norm": 1.0881624221801758, + "learning_rate": 0.00019999963986844273, + "loss": 2.827, + "step": 117 + }, + { + "epoch": 0.009523040916794448, + "grad_norm": 1.2498180866241455, + "learning_rate": 0.00019999959625392362, + "loss": 2.8695, + "step": 118 + }, + { + "epoch": 0.009603744653377452, + "grad_norm": 1.1344549655914307, + "learning_rate": 0.00019999955014715317, + "loss": 2.8079, + "step": 119 + }, + { + "epoch": 0.009684448389960455, + "grad_norm": 1.032563328742981, + "learning_rate": 0.00019999950154813253, + "loss": 2.7787, + "step": 120 + }, + { + "epoch": 0.009765152126543459, + "grad_norm": 0.9630110263824463, + "learning_rate": 0.0001999994504568629, + "loss": 2.8103, + "step": 121 + }, + { + "epoch": 0.009845855863126463, + "grad_norm": 1.0418641567230225, + "learning_rate": 0.0001999993968733456, + "loss": 
2.8679, + "step": 122 + }, + { + "epoch": 0.009926559599709466, + "grad_norm": 0.9797310829162598, + "learning_rate": 0.00019999934079758188, + "loss": 2.7792, + "step": 123 + }, + { + "epoch": 0.01000726333629247, + "grad_norm": 1.0494028329849243, + "learning_rate": 0.00019999928222957323, + "loss": 2.8007, + "step": 124 + }, + { + "epoch": 0.010087967072875475, + "grad_norm": 1.1570640802383423, + "learning_rate": 0.00019999922116932105, + "loss": 2.8331, + "step": 125 + }, + { + "epoch": 0.010168670809458477, + "grad_norm": 1.2753098011016846, + "learning_rate": 0.00019999915761682684, + "loss": 2.8533, + "step": 126 + }, + { + "epoch": 0.010249374546041481, + "grad_norm": 0.9804013967514038, + "learning_rate": 0.00019999909157209227, + "loss": 2.841, + "step": 127 + }, + { + "epoch": 0.010330078282624486, + "grad_norm": 1.320839285850525, + "learning_rate": 0.00019999902303511892, + "loss": 2.8738, + "step": 128 + }, + { + "epoch": 0.01041078201920749, + "grad_norm": 1.1105059385299683, + "learning_rate": 0.0001999989520059085, + "loss": 2.8458, + "step": 129 + }, + { + "epoch": 0.010491485755790493, + "grad_norm": 1.2869762182235718, + "learning_rate": 0.0001999988784844628, + "loss": 2.7951, + "step": 130 + }, + { + "epoch": 0.010572189492373497, + "grad_norm": 1.1609153747558594, + "learning_rate": 0.00019999880247078368, + "loss": 2.8147, + "step": 131 + }, + { + "epoch": 0.010652893228956501, + "grad_norm": 1.066728115081787, + "learning_rate": 0.00019999872396487297, + "loss": 2.863, + "step": 132 + }, + { + "epoch": 0.010733596965539504, + "grad_norm": 1.2868720293045044, + "learning_rate": 0.0001999986429667327, + "loss": 2.7765, + "step": 133 + }, + { + "epoch": 0.010814300702122508, + "grad_norm": 1.0064955949783325, + "learning_rate": 0.00019999855947636485, + "loss": 2.7834, + "step": 134 + }, + { + "epoch": 0.010895004438705513, + "grad_norm": 1.146589756011963, + "learning_rate": 0.00019999847349377143, + "loss": 2.7966, + "step": 135 + }, + { + 
"epoch": 0.010975708175288515, + "grad_norm": 0.9831073880195618, + "learning_rate": 0.0001999983850189547, + "loss": 2.8877, + "step": 136 + }, + { + "epoch": 0.01105641191187152, + "grad_norm": 1.1690322160720825, + "learning_rate": 0.0001999982940519168, + "loss": 2.8514, + "step": 137 + }, + { + "epoch": 0.011137115648454524, + "grad_norm": 1.0014944076538086, + "learning_rate": 0.00019999820059266003, + "loss": 2.7846, + "step": 138 + }, + { + "epoch": 0.011217819385037527, + "grad_norm": 0.9581566452980042, + "learning_rate": 0.0001999981046411867, + "loss": 2.7907, + "step": 139 + }, + { + "epoch": 0.011298523121620531, + "grad_norm": 1.1300675868988037, + "learning_rate": 0.00019999800619749922, + "loss": 2.8099, + "step": 140 + }, + { + "epoch": 0.011379226858203535, + "grad_norm": 0.9845526814460754, + "learning_rate": 0.0001999979052616, + "loss": 2.8607, + "step": 141 + }, + { + "epoch": 0.011459930594786538, + "grad_norm": 1.0781387090682983, + "learning_rate": 0.0001999978018334916, + "loss": 2.831, + "step": 142 + }, + { + "epoch": 0.011540634331369542, + "grad_norm": 1.1142648458480835, + "learning_rate": 0.00019999769591317658, + "loss": 2.9194, + "step": 143 + }, + { + "epoch": 0.011621338067952547, + "grad_norm": 0.9972650408744812, + "learning_rate": 0.00019999758750065757, + "loss": 2.8253, + "step": 144 + }, + { + "epoch": 0.01170204180453555, + "grad_norm": 1.040738582611084, + "learning_rate": 0.0001999974765959373, + "loss": 2.7378, + "step": 145 + }, + { + "epoch": 0.011782745541118553, + "grad_norm": 0.9824327826499939, + "learning_rate": 0.00019999736319901848, + "loss": 2.8263, + "step": 146 + }, + { + "epoch": 0.011863449277701558, + "grad_norm": 1.0531679391860962, + "learning_rate": 0.00019999724730990402, + "loss": 2.7975, + "step": 147 + }, + { + "epoch": 0.011944153014284562, + "grad_norm": 1.0699561834335327, + "learning_rate": 0.0001999971289285967, + "loss": 2.8199, + "step": 148 + }, + { + "epoch": 0.012024856750867565, + 
"grad_norm": 1.0203633308410645, + "learning_rate": 0.0001999970080550996, + "loss": 2.8479, + "step": 149 + }, + { + "epoch": 0.012105560487450569, + "grad_norm": 1.035589575767517, + "learning_rate": 0.00019999688468941564, + "loss": 2.8263, + "step": 150 + }, + { + "epoch": 0.012186264224033573, + "grad_norm": 0.9706670641899109, + "learning_rate": 0.00019999675883154792, + "loss": 2.8324, + "step": 151 + }, + { + "epoch": 0.012266967960616576, + "grad_norm": 1.1565446853637695, + "learning_rate": 0.00019999663048149958, + "loss": 2.8098, + "step": 152 + }, + { + "epoch": 0.01234767169719958, + "grad_norm": 1.025796890258789, + "learning_rate": 0.0001999964996392738, + "loss": 2.7906, + "step": 153 + }, + { + "epoch": 0.012428375433782585, + "grad_norm": 1.117438554763794, + "learning_rate": 0.00019999636630487386, + "loss": 2.8276, + "step": 154 + }, + { + "epoch": 0.012509079170365587, + "grad_norm": 1.025159478187561, + "learning_rate": 0.00019999623047830308, + "loss": 2.8089, + "step": 155 + }, + { + "epoch": 0.012589782906948592, + "grad_norm": 1.007582664489746, + "learning_rate": 0.00019999609215956487, + "loss": 2.8147, + "step": 156 + }, + { + "epoch": 0.012670486643531596, + "grad_norm": 1.0504885911941528, + "learning_rate": 0.0001999959513486626, + "loss": 2.8329, + "step": 157 + }, + { + "epoch": 0.012751190380114599, + "grad_norm": 0.918382465839386, + "learning_rate": 0.00019999580804559987, + "loss": 2.878, + "step": 158 + }, + { + "epoch": 0.012831894116697603, + "grad_norm": 0.9397236704826355, + "learning_rate": 0.0001999956622503802, + "loss": 2.8254, + "step": 159 + }, + { + "epoch": 0.012912597853280607, + "grad_norm": 0.9985697269439697, + "learning_rate": 0.00019999551396300723, + "loss": 2.8417, + "step": 160 + }, + { + "epoch": 0.01299330158986361, + "grad_norm": 0.9866878390312195, + "learning_rate": 0.00019999536318348465, + "loss": 2.7524, + "step": 161 + }, + { + "epoch": 0.013074005326446614, + "grad_norm": 1.0707440376281738, + 
"learning_rate": 0.00019999520991181627, + "loss": 2.8171, + "step": 162 + }, + { + "epoch": 0.013154709063029619, + "grad_norm": 0.9359755516052246, + "learning_rate": 0.00019999505414800583, + "loss": 2.8463, + "step": 163 + }, + { + "epoch": 0.013235412799612623, + "grad_norm": 1.056647777557373, + "learning_rate": 0.00019999489589205726, + "loss": 2.8602, + "step": 164 + }, + { + "epoch": 0.013316116536195625, + "grad_norm": 0.975370466709137, + "learning_rate": 0.0001999947351439745, + "loss": 2.8292, + "step": 165 + }, + { + "epoch": 0.01339682027277863, + "grad_norm": 0.9241237044334412, + "learning_rate": 0.00019999457190376157, + "loss": 2.7827, + "step": 166 + }, + { + "epoch": 0.013477524009361634, + "grad_norm": 0.9478302001953125, + "learning_rate": 0.00019999440617142247, + "loss": 2.7708, + "step": 167 + }, + { + "epoch": 0.013558227745944637, + "grad_norm": 0.9804863333702087, + "learning_rate": 0.00019999423794696142, + "loss": 2.7696, + "step": 168 + }, + { + "epoch": 0.013638931482527641, + "grad_norm": 0.9764013886451721, + "learning_rate": 0.00019999406723038255, + "loss": 2.8521, + "step": 169 + }, + { + "epoch": 0.013719635219110645, + "grad_norm": 1.026532769203186, + "learning_rate": 0.00019999389402169016, + "loss": 2.8507, + "step": 170 + }, + { + "epoch": 0.013800338955693648, + "grad_norm": 0.9983204007148743, + "learning_rate": 0.00019999371832088854, + "loss": 2.8761, + "step": 171 + }, + { + "epoch": 0.013881042692276652, + "grad_norm": 0.9914593696594238, + "learning_rate": 0.00019999354012798206, + "loss": 2.8723, + "step": 172 + }, + { + "epoch": 0.013961746428859657, + "grad_norm": 1.066962718963623, + "learning_rate": 0.00019999335944297517, + "loss": 2.8635, + "step": 173 + }, + { + "epoch": 0.01404245016544266, + "grad_norm": 1.0848973989486694, + "learning_rate": 0.0001999931762658724, + "loss": 2.8645, + "step": 174 + }, + { + "epoch": 0.014123153902025664, + "grad_norm": 1.0245702266693115, + "learning_rate": 
0.0001999929905966783, + "loss": 2.8463, + "step": 175 + }, + { + "epoch": 0.014203857638608668, + "grad_norm": 1.2363669872283936, + "learning_rate": 0.00019999280243539747, + "loss": 2.8345, + "step": 176 + }, + { + "epoch": 0.01428456137519167, + "grad_norm": 1.0224756002426147, + "learning_rate": 0.0001999926117820346, + "loss": 2.8309, + "step": 177 + }, + { + "epoch": 0.014365265111774675, + "grad_norm": 1.0882402658462524, + "learning_rate": 0.0001999924186365945, + "loss": 2.8619, + "step": 178 + }, + { + "epoch": 0.01444596884835768, + "grad_norm": 1.0384254455566406, + "learning_rate": 0.00019999222299908192, + "loss": 2.8477, + "step": 179 + }, + { + "epoch": 0.014526672584940684, + "grad_norm": 0.9662587642669678, + "learning_rate": 0.00019999202486950177, + "loss": 2.8087, + "step": 180 + }, + { + "epoch": 0.014607376321523686, + "grad_norm": 0.9086892604827881, + "learning_rate": 0.000199991824247859, + "loss": 2.7688, + "step": 181 + }, + { + "epoch": 0.01468808005810669, + "grad_norm": 1.004185676574707, + "learning_rate": 0.00019999162113415854, + "loss": 2.8237, + "step": 182 + }, + { + "epoch": 0.014768783794689695, + "grad_norm": 0.997965395450592, + "learning_rate": 0.00019999141552840552, + "loss": 2.8228, + "step": 183 + }, + { + "epoch": 0.014849487531272697, + "grad_norm": 0.9844975471496582, + "learning_rate": 0.00019999120743060503, + "loss": 2.8582, + "step": 184 + }, + { + "epoch": 0.014930191267855702, + "grad_norm": 1.0531272888183594, + "learning_rate": 0.00019999099684076232, + "loss": 2.8571, + "step": 185 + }, + { + "epoch": 0.015010895004438706, + "grad_norm": 1.1178920269012451, + "learning_rate": 0.00019999078375888257, + "loss": 2.85, + "step": 186 + }, + { + "epoch": 0.015091598741021709, + "grad_norm": 1.0773903131484985, + "learning_rate": 0.0001999905681849711, + "loss": 2.826, + "step": 187 + }, + { + "epoch": 0.015172302477604713, + "grad_norm": 1.1573486328125, + "learning_rate": 0.00019999035011903325, + "loss": 
2.8866, + "step": 188 + }, + { + "epoch": 0.015253006214187717, + "grad_norm": 1.0401980876922607, + "learning_rate": 0.00019999012956107456, + "loss": 2.788, + "step": 189 + }, + { + "epoch": 0.01533370995077072, + "grad_norm": 1.0150686502456665, + "learning_rate": 0.00019998990651110045, + "loss": 2.8542, + "step": 190 + }, + { + "epoch": 0.015414413687353724, + "grad_norm": 1.1902797222137451, + "learning_rate": 0.0001999896809691165, + "loss": 2.9209, + "step": 191 + }, + { + "epoch": 0.015495117423936729, + "grad_norm": 1.0177555084228516, + "learning_rate": 0.0001999894529351283, + "loss": 2.7852, + "step": 192 + }, + { + "epoch": 0.015575821160519731, + "grad_norm": 1.062322974205017, + "learning_rate": 0.00019998922240914159, + "loss": 2.8328, + "step": 193 + }, + { + "epoch": 0.015656524897102737, + "grad_norm": 1.0937334299087524, + "learning_rate": 0.00019998898939116205, + "loss": 2.8069, + "step": 194 + }, + { + "epoch": 0.015737228633685738, + "grad_norm": 0.9553198218345642, + "learning_rate": 0.00019998875388119554, + "loss": 2.8402, + "step": 195 + }, + { + "epoch": 0.015817932370268743, + "grad_norm": 1.1802356243133545, + "learning_rate": 0.0001999885158792479, + "loss": 2.945, + "step": 196 + }, + { + "epoch": 0.015898636106851747, + "grad_norm": 1.160346269607544, + "learning_rate": 0.0001999882753853251, + "loss": 2.8341, + "step": 197 + }, + { + "epoch": 0.01597933984343475, + "grad_norm": 1.0379278659820557, + "learning_rate": 0.00019998803239943305, + "loss": 2.898, + "step": 198 + }, + { + "epoch": 0.016060043580017756, + "grad_norm": 1.2022395133972168, + "learning_rate": 0.00019998778692157792, + "loss": 2.8302, + "step": 199 + }, + { + "epoch": 0.01614074731660076, + "grad_norm": 1.057017207145691, + "learning_rate": 0.00019998753895176575, + "loss": 2.8474, + "step": 200 + }, + { + "epoch": 0.01622145105318376, + "grad_norm": 0.9299072027206421, + "learning_rate": 0.00019998728849000271, + "loss": 2.8266, + "step": 201 + }, + { + 
"epoch": 0.016302154789766765, + "grad_norm": 1.0296592712402344, + "learning_rate": 0.00019998703553629512, + "loss": 2.8106, + "step": 202 + }, + { + "epoch": 0.01638285852634977, + "grad_norm": 0.9641671180725098, + "learning_rate": 0.0001999867800906492, + "loss": 2.8089, + "step": 203 + }, + { + "epoch": 0.016463562262932774, + "grad_norm": 0.9951125383377075, + "learning_rate": 0.00019998652215307136, + "loss": 2.813, + "step": 204 + }, + { + "epoch": 0.016544265999515778, + "grad_norm": 1.0089969635009766, + "learning_rate": 0.00019998626172356804, + "loss": 2.8021, + "step": 205 + }, + { + "epoch": 0.016624969736098782, + "grad_norm": 0.9916231632232666, + "learning_rate": 0.00019998599880214566, + "loss": 2.8455, + "step": 206 + }, + { + "epoch": 0.016705673472681787, + "grad_norm": 0.9612492322921753, + "learning_rate": 0.00019998573338881088, + "loss": 2.8653, + "step": 207 + }, + { + "epoch": 0.016786377209264788, + "grad_norm": 0.984578013420105, + "learning_rate": 0.00019998546548357022, + "loss": 2.8359, + "step": 208 + }, + { + "epoch": 0.016867080945847792, + "grad_norm": 0.9457565546035767, + "learning_rate": 0.0001999851950864304, + "loss": 2.8507, + "step": 209 + }, + { + "epoch": 0.016947784682430796, + "grad_norm": 1.0219026803970337, + "learning_rate": 0.00019998492219739817, + "loss": 2.8326, + "step": 210 + }, + { + "epoch": 0.0170284884190138, + "grad_norm": 0.971570611000061, + "learning_rate": 0.00019998464681648032, + "loss": 2.8079, + "step": 211 + }, + { + "epoch": 0.017109192155596805, + "grad_norm": 0.9731320738792419, + "learning_rate": 0.00019998436894368368, + "loss": 2.8536, + "step": 212 + }, + { + "epoch": 0.01718989589217981, + "grad_norm": 1.0519105195999146, + "learning_rate": 0.00019998408857901525, + "loss": 2.8589, + "step": 213 + }, + { + "epoch": 0.01727059962876281, + "grad_norm": 0.9725883603096008, + "learning_rate": 0.00019998380572248194, + "loss": 2.7937, + "step": 214 + }, + { + "epoch": 0.017351303365345815, + 
"grad_norm": 1.0397064685821533, + "learning_rate": 0.00019998352037409084, + "loss": 2.9145, + "step": 215 + }, + { + "epoch": 0.01743200710192882, + "grad_norm": 0.9094852209091187, + "learning_rate": 0.00019998323253384904, + "loss": 2.7692, + "step": 216 + }, + { + "epoch": 0.017512710838511823, + "grad_norm": 0.941646158695221, + "learning_rate": 0.00019998294220176374, + "loss": 2.7975, + "step": 217 + }, + { + "epoch": 0.017593414575094828, + "grad_norm": 0.9939892888069153, + "learning_rate": 0.00019998264937784216, + "loss": 2.8421, + "step": 218 + }, + { + "epoch": 0.017674118311677832, + "grad_norm": 0.8985795378684998, + "learning_rate": 0.0001999823540620916, + "loss": 2.8146, + "step": 219 + }, + { + "epoch": 0.017754822048260833, + "grad_norm": 1.0436078310012817, + "learning_rate": 0.00019998205625451943, + "loss": 2.8416, + "step": 220 + }, + { + "epoch": 0.017835525784843837, + "grad_norm": 0.9941675066947937, + "learning_rate": 0.00019998175595513305, + "loss": 2.8723, + "step": 221 + }, + { + "epoch": 0.01791622952142684, + "grad_norm": 0.9203903675079346, + "learning_rate": 0.00019998145316393995, + "loss": 2.7791, + "step": 222 + }, + { + "epoch": 0.017996933258009846, + "grad_norm": 0.9325969815254211, + "learning_rate": 0.00019998114788094768, + "loss": 2.8664, + "step": 223 + }, + { + "epoch": 0.01807763699459285, + "grad_norm": 0.9483599662780762, + "learning_rate": 0.00019998084010616388, + "loss": 2.7782, + "step": 224 + }, + { + "epoch": 0.018158340731175854, + "grad_norm": 0.9555078744888306, + "learning_rate": 0.00019998052983959615, + "loss": 2.7771, + "step": 225 + }, + { + "epoch": 0.01823904446775886, + "grad_norm": 0.9452421069145203, + "learning_rate": 0.00019998021708125233, + "loss": 2.8878, + "step": 226 + }, + { + "epoch": 0.01831974820434186, + "grad_norm": 0.9784894585609436, + "learning_rate": 0.00019997990183114007, + "loss": 2.8382, + "step": 227 + }, + { + "epoch": 0.018400451940924864, + "grad_norm": 
1.0844931602478027, + "learning_rate": 0.00019997958408926735, + "loss": 2.8015, + "step": 228 + }, + { + "epoch": 0.01848115567750787, + "grad_norm": 1.0416710376739502, + "learning_rate": 0.00019997926385564207, + "loss": 2.8364, + "step": 229 + }, + { + "epoch": 0.018561859414090873, + "grad_norm": 0.9213813543319702, + "learning_rate": 0.00019997894113027215, + "loss": 2.8489, + "step": 230 + }, + { + "epoch": 0.018642563150673877, + "grad_norm": 1.0186388492584229, + "learning_rate": 0.00019997861591316567, + "loss": 2.914, + "step": 231 + }, + { + "epoch": 0.01872326688725688, + "grad_norm": 1.0032236576080322, + "learning_rate": 0.00019997828820433072, + "loss": 2.8733, + "step": 232 + }, + { + "epoch": 0.018803970623839882, + "grad_norm": 0.9783569574356079, + "learning_rate": 0.0001999779580037755, + "loss": 2.851, + "step": 233 + }, + { + "epoch": 0.018884674360422887, + "grad_norm": 0.8471441268920898, + "learning_rate": 0.00019997762531150825, + "loss": 2.7923, + "step": 234 + }, + { + "epoch": 0.01896537809700589, + "grad_norm": 0.8912937641143799, + "learning_rate": 0.00019997729012753717, + "loss": 2.8725, + "step": 235 + }, + { + "epoch": 0.019046081833588895, + "grad_norm": 1.2453325986862183, + "learning_rate": 0.00019997695245187075, + "loss": 2.9292, + "step": 236 + }, + { + "epoch": 0.0191267855701719, + "grad_norm": 0.8870908617973328, + "learning_rate": 0.0001999766122845173, + "loss": 2.8008, + "step": 237 + }, + { + "epoch": 0.019207489306754904, + "grad_norm": 1.0679768323898315, + "learning_rate": 0.0001999762696254853, + "loss": 2.8919, + "step": 238 + }, + { + "epoch": 0.01928819304333791, + "grad_norm": 0.9769917130470276, + "learning_rate": 0.00019997592447478337, + "loss": 2.7937, + "step": 239 + }, + { + "epoch": 0.01936889677992091, + "grad_norm": 1.066183090209961, + "learning_rate": 0.00019997557683242004, + "loss": 2.8375, + "step": 240 + }, + { + "epoch": 0.019449600516503913, + "grad_norm": 0.9834103584289551, + 
"learning_rate": 0.000199975226698404, + "loss": 2.8577, + "step": 241 + }, + { + "epoch": 0.019530304253086918, + "grad_norm": 1.102211833000183, + "learning_rate": 0.00019997487407274396, + "loss": 2.8466, + "step": 242 + }, + { + "epoch": 0.019611007989669922, + "grad_norm": 0.9936226606369019, + "learning_rate": 0.00019997451895544872, + "loss": 2.7729, + "step": 243 + }, + { + "epoch": 0.019691711726252926, + "grad_norm": 1.0995992422103882, + "learning_rate": 0.00019997416134652713, + "loss": 2.8425, + "step": 244 + }, + { + "epoch": 0.01977241546283593, + "grad_norm": 0.94181889295578, + "learning_rate": 0.00019997380124598814, + "loss": 2.8495, + "step": 245 + }, + { + "epoch": 0.01985311919941893, + "grad_norm": 0.9791487455368042, + "learning_rate": 0.00019997343865384067, + "loss": 2.8919, + "step": 246 + }, + { + "epoch": 0.019933822936001936, + "grad_norm": 0.9173399209976196, + "learning_rate": 0.00019997307357009375, + "loss": 2.8593, + "step": 247 + }, + { + "epoch": 0.02001452667258494, + "grad_norm": 0.9675281047821045, + "learning_rate": 0.00019997270599475653, + "loss": 2.8226, + "step": 248 + }, + { + "epoch": 0.020095230409167945, + "grad_norm": 0.8928244113922119, + "learning_rate": 0.00019997233592783812, + "loss": 2.8296, + "step": 249 + }, + { + "epoch": 0.02017593414575095, + "grad_norm": 0.928601861000061, + "learning_rate": 0.0001999719633693478, + "loss": 2.8399, + "step": 250 + }, + { + "epoch": 0.020256637882333953, + "grad_norm": 0.9378123879432678, + "learning_rate": 0.00019997158831929482, + "loss": 2.8711, + "step": 251 + }, + { + "epoch": 0.020337341618916954, + "grad_norm": 0.9041047692298889, + "learning_rate": 0.00019997121077768853, + "loss": 2.8338, + "step": 252 + }, + { + "epoch": 0.02041804535549996, + "grad_norm": 0.9673274755477905, + "learning_rate": 0.00019997083074453832, + "loss": 2.8556, + "step": 253 + }, + { + "epoch": 0.020498749092082963, + "grad_norm": 0.9204083681106567, + "learning_rate": 
0.0001999704482198537, + "loss": 2.7954, + "step": 254 + }, + { + "epoch": 0.020579452828665967, + "grad_norm": 0.9267606735229492, + "learning_rate": 0.00019997006320364417, + "loss": 2.8656, + "step": 255 + }, + { + "epoch": 0.02066015656524897, + "grad_norm": 0.9562919735908508, + "learning_rate": 0.00019996967569591936, + "loss": 2.8406, + "step": 256 + }, + { + "epoch": 0.020740860301831976, + "grad_norm": 0.9065950512886047, + "learning_rate": 0.0001999692856966889, + "loss": 2.7856, + "step": 257 + }, + { + "epoch": 0.02082156403841498, + "grad_norm": 0.9136463403701782, + "learning_rate": 0.0001999688932059625, + "loss": 2.8083, + "step": 258 + }, + { + "epoch": 0.02090226777499798, + "grad_norm": 0.9785570502281189, + "learning_rate": 0.00019996849822374998, + "loss": 2.7984, + "step": 259 + }, + { + "epoch": 0.020982971511580985, + "grad_norm": 0.9549168348312378, + "learning_rate": 0.00019996810075006117, + "loss": 2.8048, + "step": 260 + }, + { + "epoch": 0.02106367524816399, + "grad_norm": 0.8923975825309753, + "learning_rate": 0.00019996770078490594, + "loss": 2.8559, + "step": 261 + }, + { + "epoch": 0.021144378984746994, + "grad_norm": 0.9516206383705139, + "learning_rate": 0.0001999672983282943, + "loss": 2.9171, + "step": 262 + }, + { + "epoch": 0.02122508272133, + "grad_norm": 0.9101666808128357, + "learning_rate": 0.0001999668933802363, + "loss": 2.8746, + "step": 263 + }, + { + "epoch": 0.021305786457913003, + "grad_norm": 0.9081267714500427, + "learning_rate": 0.00019996648594074195, + "loss": 2.8637, + "step": 264 + }, + { + "epoch": 0.021386490194496004, + "grad_norm": 1.0048178434371948, + "learning_rate": 0.0001999660760098215, + "loss": 2.8783, + "step": 265 + }, + { + "epoch": 0.021467193931079008, + "grad_norm": 0.9625924229621887, + "learning_rate": 0.0001999656635874851, + "loss": 2.8226, + "step": 266 + }, + { + "epoch": 0.021547897667662012, + "grad_norm": 0.9911805391311646, + "learning_rate": 0.00019996524867374306, + "loss": 
2.8135, + "step": 267 + }, + { + "epoch": 0.021628601404245017, + "grad_norm": 0.8920134902000427, + "learning_rate": 0.00019996483126860572, + "loss": 2.7934, + "step": 268 + }, + { + "epoch": 0.02170930514082802, + "grad_norm": 1.0806514024734497, + "learning_rate": 0.00019996441137208346, + "loss": 2.8435, + "step": 269 + }, + { + "epoch": 0.021790008877411025, + "grad_norm": 0.9426547884941101, + "learning_rate": 0.00019996398898418675, + "loss": 2.7919, + "step": 270 + }, + { + "epoch": 0.021870712613994026, + "grad_norm": 0.9893020987510681, + "learning_rate": 0.00019996356410492615, + "loss": 2.8616, + "step": 271 + }, + { + "epoch": 0.02195141635057703, + "grad_norm": 1.0196046829223633, + "learning_rate": 0.00019996313673431218, + "loss": 2.8101, + "step": 272 + }, + { + "epoch": 0.022032120087160035, + "grad_norm": 0.9556699991226196, + "learning_rate": 0.00019996270687235558, + "loss": 2.8669, + "step": 273 + }, + { + "epoch": 0.02211282382374304, + "grad_norm": 0.8985902667045593, + "learning_rate": 0.00019996227451906702, + "loss": 2.8078, + "step": 274 + }, + { + "epoch": 0.022193527560326044, + "grad_norm": 1.0198246240615845, + "learning_rate": 0.00019996183967445726, + "loss": 2.8314, + "step": 275 + }, + { + "epoch": 0.022274231296909048, + "grad_norm": 0.9360179901123047, + "learning_rate": 0.00019996140233853715, + "loss": 2.7969, + "step": 276 + }, + { + "epoch": 0.022354935033492052, + "grad_norm": 1.0250160694122314, + "learning_rate": 0.00019996096251131759, + "loss": 2.7897, + "step": 277 + }, + { + "epoch": 0.022435638770075053, + "grad_norm": 0.934582531452179, + "learning_rate": 0.00019996052019280954, + "loss": 2.8667, + "step": 278 + }, + { + "epoch": 0.022516342506658057, + "grad_norm": 0.9394461512565613, + "learning_rate": 0.00019996007538302407, + "loss": 2.7681, + "step": 279 + }, + { + "epoch": 0.022597046243241062, + "grad_norm": 0.9468861222267151, + "learning_rate": 0.00019995962808197216, + "loss": 2.7709, + "step": 280 + }, 
+ { + "epoch": 0.022677749979824066, + "grad_norm": 0.9798515439033508, + "learning_rate": 0.00019995917828966506, + "loss": 2.8274, + "step": 281 + }, + { + "epoch": 0.02275845371640707, + "grad_norm": 1.0403941869735718, + "learning_rate": 0.00019995872600611395, + "loss": 2.8897, + "step": 282 + }, + { + "epoch": 0.022839157452990075, + "grad_norm": 0.9795030951499939, + "learning_rate": 0.00019995827123133006, + "loss": 2.8792, + "step": 283 + }, + { + "epoch": 0.022919861189573076, + "grad_norm": 0.9162538647651672, + "learning_rate": 0.00019995781396532479, + "loss": 2.8339, + "step": 284 + }, + { + "epoch": 0.02300056492615608, + "grad_norm": 1.0864707231521606, + "learning_rate": 0.00019995735420810947, + "loss": 2.8599, + "step": 285 + }, + { + "epoch": 0.023081268662739084, + "grad_norm": 0.9181776642799377, + "learning_rate": 0.0001999568919596956, + "loss": 2.8736, + "step": 286 + }, + { + "epoch": 0.02316197239932209, + "grad_norm": 0.8880531191825867, + "learning_rate": 0.00019995642722009472, + "loss": 2.8215, + "step": 287 + }, + { + "epoch": 0.023242676135905093, + "grad_norm": 0.9287240505218506, + "learning_rate": 0.00019995595998931835, + "loss": 2.844, + "step": 288 + }, + { + "epoch": 0.023323379872488097, + "grad_norm": 0.886894941329956, + "learning_rate": 0.0001999554902673782, + "loss": 2.8319, + "step": 289 + }, + { + "epoch": 0.0234040836090711, + "grad_norm": 0.9564458131790161, + "learning_rate": 0.0001999550180542859, + "loss": 2.8126, + "step": 290 + }, + { + "epoch": 0.023484787345654103, + "grad_norm": 0.8745970726013184, + "learning_rate": 0.00019995454335005334, + "loss": 2.8344, + "step": 291 + }, + { + "epoch": 0.023565491082237107, + "grad_norm": 1.0343137979507446, + "learning_rate": 0.00019995406615469217, + "loss": 2.8498, + "step": 292 + }, + { + "epoch": 0.02364619481882011, + "grad_norm": 0.9951575994491577, + "learning_rate": 0.0001999535864682145, + "loss": 2.8655, + "step": 293 + }, + { + "epoch": 
0.023726898555403116, + "grad_norm": 0.8457592725753784, + "learning_rate": 0.0001999531042906321, + "loss": 2.8189, + "step": 294 + }, + { + "epoch": 0.02380760229198612, + "grad_norm": 0.9126954674720764, + "learning_rate": 0.00019995261962195708, + "loss": 2.8272, + "step": 295 + }, + { + "epoch": 0.023888306028569124, + "grad_norm": 1.0171937942504883, + "learning_rate": 0.0001999521324622015, + "loss": 2.869, + "step": 296 + }, + { + "epoch": 0.023969009765152125, + "grad_norm": 0.9887226223945618, + "learning_rate": 0.00019995164281137753, + "loss": 2.7643, + "step": 297 + }, + { + "epoch": 0.02404971350173513, + "grad_norm": 1.4240798950195312, + "learning_rate": 0.00019995115066949733, + "loss": 2.8332, + "step": 298 + }, + { + "epoch": 0.024130417238318134, + "grad_norm": 0.9856921434402466, + "learning_rate": 0.00019995065603657316, + "loss": 2.8283, + "step": 299 + }, + { + "epoch": 0.024211120974901138, + "grad_norm": 0.997164785861969, + "learning_rate": 0.0001999501589126174, + "loss": 2.9164, + "step": 300 + }, + { + "epoch": 0.024291824711484142, + "grad_norm": 1.6480412483215332, + "learning_rate": 0.00019994965929764238, + "loss": 2.8941, + "step": 301 + }, + { + "epoch": 0.024372528448067147, + "grad_norm": 1.1590758562088013, + "learning_rate": 0.0001999491571916606, + "loss": 2.8127, + "step": 302 + }, + { + "epoch": 0.024453232184650148, + "grad_norm": 1.1228376626968384, + "learning_rate": 0.00019994865259468454, + "loss": 2.8439, + "step": 303 + }, + { + "epoch": 0.024533935921233152, + "grad_norm": 1.0426349639892578, + "learning_rate": 0.0001999481455067268, + "loss": 2.8671, + "step": 304 + }, + { + "epoch": 0.024614639657816156, + "grad_norm": 1.0911917686462402, + "learning_rate": 0.00019994763592779996, + "loss": 2.8297, + "step": 305 + }, + { + "epoch": 0.02469534339439916, + "grad_norm": 1.0493195056915283, + "learning_rate": 0.00019994712385791683, + "loss": 2.7996, + "step": 306 + }, + { + "epoch": 0.024776047130982165, + 
"grad_norm": 0.9275023341178894, + "learning_rate": 0.00019994660929709008, + "loss": 2.7949, + "step": 307 + }, + { + "epoch": 0.02485675086756517, + "grad_norm": 1.1074799299240112, + "learning_rate": 0.00019994609224533255, + "loss": 2.8364, + "step": 308 + }, + { + "epoch": 0.024937454604148174, + "grad_norm": 0.9189429879188538, + "learning_rate": 0.00019994557270265717, + "loss": 2.8293, + "step": 309 + }, + { + "epoch": 0.025018158340731175, + "grad_norm": 0.9577780961990356, + "learning_rate": 0.00019994505066907683, + "loss": 2.8295, + "step": 310 + }, + { + "epoch": 0.02509886207731418, + "grad_norm": 1.0707277059555054, + "learning_rate": 0.0001999445261446046, + "loss": 2.795, + "step": 311 + }, + { + "epoch": 0.025179565813897183, + "grad_norm": 0.9211257696151733, + "learning_rate": 0.0001999439991292535, + "loss": 2.8355, + "step": 312 + }, + { + "epoch": 0.025260269550480188, + "grad_norm": 0.987779438495636, + "learning_rate": 0.00019994346962303667, + "loss": 2.8175, + "step": 313 + }, + { + "epoch": 0.025340973287063192, + "grad_norm": 0.9317128658294678, + "learning_rate": 0.00019994293762596734, + "loss": 2.8205, + "step": 314 + }, + { + "epoch": 0.025421677023646196, + "grad_norm": 0.8989154100418091, + "learning_rate": 0.00019994240313805873, + "loss": 2.8257, + "step": 315 + }, + { + "epoch": 0.025502380760229197, + "grad_norm": 0.8391042351722717, + "learning_rate": 0.00019994186615932423, + "loss": 2.8105, + "step": 316 + }, + { + "epoch": 0.0255830844968122, + "grad_norm": 0.8908089995384216, + "learning_rate": 0.00019994132668977715, + "loss": 2.7894, + "step": 317 + }, + { + "epoch": 0.025663788233395206, + "grad_norm": 0.8666881322860718, + "learning_rate": 0.00019994078472943097, + "loss": 2.7934, + "step": 318 + }, + { + "epoch": 0.02574449196997821, + "grad_norm": 0.8834616541862488, + "learning_rate": 0.00019994024027829914, + "loss": 2.8166, + "step": 319 + }, + { + "epoch": 0.025825195706561214, + "grad_norm": 0.9831370115280151, 
+ "learning_rate": 0.00019993969333639532, + "loss": 2.889, + "step": 320 + }, + { + "epoch": 0.02590589944314422, + "grad_norm": 0.9171644449234009, + "learning_rate": 0.00019993914390373308, + "loss": 2.8582, + "step": 321 + }, + { + "epoch": 0.02598660317972722, + "grad_norm": 0.9624861478805542, + "learning_rate": 0.00019993859198032615, + "loss": 2.8574, + "step": 322 + }, + { + "epoch": 0.026067306916310224, + "grad_norm": 0.8826586008071899, + "learning_rate": 0.00019993803756618826, + "loss": 2.8544, + "step": 323 + }, + { + "epoch": 0.02614801065289323, + "grad_norm": 0.9286447763442993, + "learning_rate": 0.0001999374806613332, + "loss": 2.7937, + "step": 324 + }, + { + "epoch": 0.026228714389476233, + "grad_norm": 0.9901685118675232, + "learning_rate": 0.00019993692126577493, + "loss": 2.7654, + "step": 325 + }, + { + "epoch": 0.026309418126059237, + "grad_norm": 0.9624341130256653, + "learning_rate": 0.00019993635937952734, + "loss": 2.8804, + "step": 326 + }, + { + "epoch": 0.02639012186264224, + "grad_norm": 0.8867596387863159, + "learning_rate": 0.0001999357950026044, + "loss": 2.8254, + "step": 327 + }, + { + "epoch": 0.026470825599225246, + "grad_norm": 0.9243817925453186, + "learning_rate": 0.00019993522813502022, + "loss": 2.8177, + "step": 328 + }, + { + "epoch": 0.026551529335808247, + "grad_norm": 0.9322247505187988, + "learning_rate": 0.00019993465877678895, + "loss": 2.9023, + "step": 329 + }, + { + "epoch": 0.02663223307239125, + "grad_norm": 0.8768174648284912, + "learning_rate": 0.00019993408692792474, + "loss": 2.8184, + "step": 330 + }, + { + "epoch": 0.026712936808974255, + "grad_norm": 0.9436870813369751, + "learning_rate": 0.00019993351258844184, + "loss": 2.8319, + "step": 331 + }, + { + "epoch": 0.02679364054555726, + "grad_norm": 0.9970327019691467, + "learning_rate": 0.0001999329357583546, + "loss": 2.7946, + "step": 332 + }, + { + "epoch": 0.026874344282140264, + "grad_norm": 0.9100088477134705, + "learning_rate": 
0.00019993235643767736, + "loss": 2.782, + "step": 333 + }, + { + "epoch": 0.02695504801872327, + "grad_norm": 0.9693402051925659, + "learning_rate": 0.00019993177462642456, + "loss": 2.8182, + "step": 334 + }, + { + "epoch": 0.02703575175530627, + "grad_norm": 0.8761965036392212, + "learning_rate": 0.00019993119032461073, + "loss": 2.8058, + "step": 335 + }, + { + "epoch": 0.027116455491889273, + "grad_norm": 1.0699270963668823, + "learning_rate": 0.00019993060353225043, + "loss": 2.9211, + "step": 336 + }, + { + "epoch": 0.027197159228472278, + "grad_norm": 1.0094172954559326, + "learning_rate": 0.00019993001424935822, + "loss": 2.8837, + "step": 337 + }, + { + "epoch": 0.027277862965055282, + "grad_norm": 0.9683573842048645, + "learning_rate": 0.00019992942247594887, + "loss": 2.8523, + "step": 338 + }, + { + "epoch": 0.027358566701638286, + "grad_norm": 1.3243813514709473, + "learning_rate": 0.00019992882821203708, + "loss": 2.7891, + "step": 339 + }, + { + "epoch": 0.02743927043822129, + "grad_norm": 1.0227056741714478, + "learning_rate": 0.0001999282314576377, + "loss": 2.8396, + "step": 340 + }, + { + "epoch": 0.027519974174804295, + "grad_norm": 1.03257417678833, + "learning_rate": 0.00019992763221276556, + "loss": 2.824, + "step": 341 + }, + { + "epoch": 0.027600677911387296, + "grad_norm": 0.86456698179245, + "learning_rate": 0.00019992703047743562, + "loss": 2.8006, + "step": 342 + }, + { + "epoch": 0.0276813816479703, + "grad_norm": 0.965339720249176, + "learning_rate": 0.00019992642625166286, + "loss": 2.8658, + "step": 343 + }, + { + "epoch": 0.027762085384553305, + "grad_norm": 1.0028942823410034, + "learning_rate": 0.00019992581953546236, + "loss": 2.8311, + "step": 344 + }, + { + "epoch": 0.02784278912113631, + "grad_norm": 0.984307050704956, + "learning_rate": 0.0001999252103288492, + "loss": 2.8748, + "step": 345 + }, + { + "epoch": 0.027923492857719313, + "grad_norm": 0.9405032396316528, + "learning_rate": 0.00019992459863183858, + "loss": 
2.8371, + "step": 346 + }, + { + "epoch": 0.028004196594302318, + "grad_norm": 0.9867002367973328, + "learning_rate": 0.0001999239844444458, + "loss": 2.7914, + "step": 347 + }, + { + "epoch": 0.02808490033088532, + "grad_norm": 0.9224951267242432, + "learning_rate": 0.00019992336776668613, + "loss": 2.7986, + "step": 348 + }, + { + "epoch": 0.028165604067468323, + "grad_norm": 1.002838134765625, + "learning_rate": 0.0001999227485985749, + "loss": 2.8207, + "step": 349 + }, + { + "epoch": 0.028246307804051327, + "grad_norm": 0.8922045826911926, + "learning_rate": 0.00019992212694012757, + "loss": 2.8264, + "step": 350 + }, + { + "epoch": 0.02832701154063433, + "grad_norm": 1.0860323905944824, + "learning_rate": 0.00019992150279135964, + "loss": 2.8778, + "step": 351 + }, + { + "epoch": 0.028407715277217336, + "grad_norm": 1.0995604991912842, + "learning_rate": 0.0001999208761522867, + "loss": 2.8599, + "step": 352 + }, + { + "epoch": 0.02848841901380034, + "grad_norm": 0.8741658926010132, + "learning_rate": 0.0001999202470229243, + "loss": 2.7757, + "step": 353 + }, + { + "epoch": 0.02856912275038334, + "grad_norm": 0.9142587184906006, + "learning_rate": 0.00019991961540328815, + "loss": 2.8235, + "step": 354 + }, + { + "epoch": 0.028649826486966345, + "grad_norm": 1.0000953674316406, + "learning_rate": 0.000199918981293394, + "loss": 2.8, + "step": 355 + }, + { + "epoch": 0.02873053022354935, + "grad_norm": 0.9416046738624573, + "learning_rate": 0.00019991834469325763, + "loss": 2.7941, + "step": 356 + }, + { + "epoch": 0.028811233960132354, + "grad_norm": 0.9135935306549072, + "learning_rate": 0.00019991770560289496, + "loss": 2.8315, + "step": 357 + }, + { + "epoch": 0.02889193769671536, + "grad_norm": 0.8867244124412537, + "learning_rate": 0.00019991706402232184, + "loss": 2.8649, + "step": 358 + }, + { + "epoch": 0.028972641433298363, + "grad_norm": 0.9360243678092957, + "learning_rate": 0.00019991641995155431, + "loss": 2.7556, + "step": 359 + }, + { + 
"epoch": 0.029053345169881367, + "grad_norm": 0.8903766870498657, + "learning_rate": 0.00019991577339060842, + "loss": 2.8379, + "step": 360 + }, + { + "epoch": 0.029134048906464368, + "grad_norm": 1.0178784132003784, + "learning_rate": 0.00019991512433950023, + "loss": 2.8045, + "step": 361 + }, + { + "epoch": 0.029214752643047372, + "grad_norm": 0.9318631887435913, + "learning_rate": 0.000199914472798246, + "loss": 2.823, + "step": 362 + }, + { + "epoch": 0.029295456379630377, + "grad_norm": 0.9384647011756897, + "learning_rate": 0.00019991381876686195, + "loss": 2.9379, + "step": 363 + }, + { + "epoch": 0.02937616011621338, + "grad_norm": 0.9318633675575256, + "learning_rate": 0.00019991316224536433, + "loss": 2.8222, + "step": 364 + }, + { + "epoch": 0.029456863852796385, + "grad_norm": 0.8653938174247742, + "learning_rate": 0.00019991250323376952, + "loss": 2.8447, + "step": 365 + }, + { + "epoch": 0.02953756758937939, + "grad_norm": 0.8997991681098938, + "learning_rate": 0.00019991184173209398, + "loss": 2.8523, + "step": 366 + }, + { + "epoch": 0.02961827132596239, + "grad_norm": 0.8587092161178589, + "learning_rate": 0.00019991117774035416, + "loss": 2.8141, + "step": 367 + }, + { + "epoch": 0.029698975062545395, + "grad_norm": 0.8740741014480591, + "learning_rate": 0.00019991051125856663, + "loss": 2.7487, + "step": 368 + }, + { + "epoch": 0.0297796787991284, + "grad_norm": 0.9099416732788086, + "learning_rate": 0.00019990984228674798, + "loss": 2.834, + "step": 369 + }, + { + "epoch": 0.029860382535711404, + "grad_norm": 0.8675365447998047, + "learning_rate": 0.0001999091708249149, + "loss": 2.8259, + "step": 370 + }, + { + "epoch": 0.029941086272294408, + "grad_norm": 1.0141092538833618, + "learning_rate": 0.00019990849687308412, + "loss": 2.8369, + "step": 371 + }, + { + "epoch": 0.030021790008877412, + "grad_norm": 0.849155604839325, + "learning_rate": 0.00019990782043127243, + "loss": 2.7505, + "step": 372 + }, + { + "epoch": 0.030102493745460413, + 
"grad_norm": 1.073754072189331, + "learning_rate": 0.0001999071414994967, + "loss": 2.8939, + "step": 373 + }, + { + "epoch": 0.030183197482043417, + "grad_norm": 0.8615279197692871, + "learning_rate": 0.00019990646007777383, + "loss": 2.7662, + "step": 374 + }, + { + "epoch": 0.030263901218626422, + "grad_norm": 0.8803398609161377, + "learning_rate": 0.0001999057761661208, + "loss": 2.7992, + "step": 375 + }, + { + "epoch": 0.030344604955209426, + "grad_norm": 0.8901834487915039, + "learning_rate": 0.00019990508976455473, + "loss": 2.8222, + "step": 376 + }, + { + "epoch": 0.03042530869179243, + "grad_norm": 0.9443284869194031, + "learning_rate": 0.00019990440087309263, + "loss": 2.8326, + "step": 377 + }, + { + "epoch": 0.030506012428375435, + "grad_norm": 0.9122868180274963, + "learning_rate": 0.0001999037094917517, + "loss": 2.7653, + "step": 378 + }, + { + "epoch": 0.03058671616495844, + "grad_norm": 0.8764635920524597, + "learning_rate": 0.0001999030156205492, + "loss": 2.7813, + "step": 379 + }, + { + "epoch": 0.03066741990154144, + "grad_norm": 0.8466865420341492, + "learning_rate": 0.0001999023192595024, + "loss": 2.8338, + "step": 380 + }, + { + "epoch": 0.030748123638124444, + "grad_norm": 0.8833961486816406, + "learning_rate": 0.00019990162040862863, + "loss": 2.78, + "step": 381 + }, + { + "epoch": 0.03082882737470745, + "grad_norm": 1.0298357009887695, + "learning_rate": 0.00019990091906794537, + "loss": 2.8059, + "step": 382 + }, + { + "epoch": 0.030909531111290453, + "grad_norm": 0.8651318550109863, + "learning_rate": 0.00019990021523747005, + "loss": 2.8608, + "step": 383 + }, + { + "epoch": 0.030990234847873457, + "grad_norm": 1.0262864828109741, + "learning_rate": 0.0001998995089172202, + "loss": 2.8226, + "step": 384 + }, + { + "epoch": 0.03107093858445646, + "grad_norm": 0.9266276955604553, + "learning_rate": 0.00019989880010721348, + "loss": 2.9414, + "step": 385 + }, + { + "epoch": 0.031151642321039463, + "grad_norm": 0.8762117028236389, + 
"learning_rate": 0.00019989808880746749, + "loss": 2.8023, + "step": 386 + }, + { + "epoch": 0.031232346057622467, + "grad_norm": 0.8531816601753235, + "learning_rate": 0.00019989737501800004, + "loss": 2.777, + "step": 387 + }, + { + "epoch": 0.031313049794205475, + "grad_norm": 0.8999545574188232, + "learning_rate": 0.0001998966587388288, + "loss": 2.8656, + "step": 388 + }, + { + "epoch": 0.03139375353078847, + "grad_norm": 0.932248055934906, + "learning_rate": 0.00019989593996997177, + "loss": 2.8212, + "step": 389 + }, + { + "epoch": 0.031474457267371476, + "grad_norm": 0.9059134125709534, + "learning_rate": 0.00019989521871144672, + "loss": 2.7945, + "step": 390 + }, + { + "epoch": 0.03155516100395448, + "grad_norm": 0.9323028922080994, + "learning_rate": 0.00019989449496327172, + "loss": 2.8338, + "step": 391 + }, + { + "epoch": 0.031635864740537485, + "grad_norm": 0.9141251444816589, + "learning_rate": 0.0001998937687254648, + "loss": 2.7935, + "step": 392 + }, + { + "epoch": 0.03171656847712049, + "grad_norm": 1.0026880502700806, + "learning_rate": 0.000199893039998044, + "loss": 2.8811, + "step": 393 + }, + { + "epoch": 0.031797272213703494, + "grad_norm": 1.0178622007369995, + "learning_rate": 0.00019989230878102756, + "loss": 2.9003, + "step": 394 + }, + { + "epoch": 0.0318779759502865, + "grad_norm": 0.9111912846565247, + "learning_rate": 0.00019989157507443363, + "loss": 2.8399, + "step": 395 + }, + { + "epoch": 0.0319586796868695, + "grad_norm": 1.054563283920288, + "learning_rate": 0.00019989083887828052, + "loss": 2.9088, + "step": 396 + }, + { + "epoch": 0.03203938342345251, + "grad_norm": 0.9459816217422485, + "learning_rate": 0.00019989010019258663, + "loss": 2.805, + "step": 397 + }, + { + "epoch": 0.03212008716003551, + "grad_norm": 1.0139873027801514, + "learning_rate": 0.00019988935901737033, + "loss": 2.8452, + "step": 398 + }, + { + "epoch": 0.032200790896618516, + "grad_norm": 0.986325204372406, + "learning_rate": 0.00019988861535265006, 
+ "loss": 2.8311, + "step": 399 + }, + { + "epoch": 0.03228149463320152, + "grad_norm": 0.9565223455429077, + "learning_rate": 0.00019988786919844436, + "loss": 2.7766, + "step": 400 + }, + { + "epoch": 0.032362198369784524, + "grad_norm": 0.8901559710502625, + "learning_rate": 0.0001998871205547719, + "loss": 2.7966, + "step": 401 + }, + { + "epoch": 0.03244290210636752, + "grad_norm": 1.0959528684616089, + "learning_rate": 0.00019988636942165123, + "loss": 2.8377, + "step": 402 + }, + { + "epoch": 0.032523605842950526, + "grad_norm": 1.0768988132476807, + "learning_rate": 0.00019988561579910118, + "loss": 2.8267, + "step": 403 + }, + { + "epoch": 0.03260430957953353, + "grad_norm": 0.9563855528831482, + "learning_rate": 0.00019988485968714048, + "loss": 2.8459, + "step": 404 + }, + { + "epoch": 0.032685013316116535, + "grad_norm": 0.930927038192749, + "learning_rate": 0.00019988410108578796, + "loss": 2.8053, + "step": 405 + }, + { + "epoch": 0.03276571705269954, + "grad_norm": 1.0658363103866577, + "learning_rate": 0.00019988333999506255, + "loss": 2.8512, + "step": 406 + }, + { + "epoch": 0.03284642078928254, + "grad_norm": 0.9258090257644653, + "learning_rate": 0.0001998825764149832, + "loss": 2.8541, + "step": 407 + }, + { + "epoch": 0.03292712452586555, + "grad_norm": 1.18158757686615, + "learning_rate": 0.00019988181034556895, + "loss": 2.8838, + "step": 408 + }, + { + "epoch": 0.03300782826244855, + "grad_norm": 0.9506754875183105, + "learning_rate": 0.00019988104178683891, + "loss": 2.7733, + "step": 409 + }, + { + "epoch": 0.033088531999031556, + "grad_norm": 0.9559460282325745, + "learning_rate": 0.0001998802707388122, + "loss": 2.9259, + "step": 410 + }, + { + "epoch": 0.03316923573561456, + "grad_norm": 0.9322298765182495, + "learning_rate": 0.00019987949720150808, + "loss": 2.8318, + "step": 411 + }, + { + "epoch": 0.033249939472197565, + "grad_norm": 0.9226691722869873, + "learning_rate": 0.00019987872117494576, + "loss": 2.9063, + "step": 412 + }, 
+ { + "epoch": 0.03333064320878057, + "grad_norm": 1.0543674230575562, + "learning_rate": 0.00019987794265914464, + "loss": 2.7877, + "step": 413 + }, + { + "epoch": 0.033411346945363574, + "grad_norm": 0.989986002445221, + "learning_rate": 0.00019987716165412408, + "loss": 2.8354, + "step": 414 + }, + { + "epoch": 0.03349205068194657, + "grad_norm": 0.8703451752662659, + "learning_rate": 0.0001998763781599036, + "loss": 2.8127, + "step": 415 + }, + { + "epoch": 0.033572754418529575, + "grad_norm": 0.974943220615387, + "learning_rate": 0.0001998755921765027, + "loss": 2.9272, + "step": 416 + }, + { + "epoch": 0.03365345815511258, + "grad_norm": 0.8714169859886169, + "learning_rate": 0.000199874803703941, + "loss": 2.8027, + "step": 417 + }, + { + "epoch": 0.033734161891695584, + "grad_norm": 0.9251161217689514, + "learning_rate": 0.00019987401274223804, + "loss": 2.8186, + "step": 418 + }, + { + "epoch": 0.03381486562827859, + "grad_norm": 0.9657236933708191, + "learning_rate": 0.00019987321929141366, + "loss": 2.8297, + "step": 419 + }, + { + "epoch": 0.03389556936486159, + "grad_norm": 0.9022002816200256, + "learning_rate": 0.00019987242335148757, + "loss": 2.881, + "step": 420 + }, + { + "epoch": 0.0339762731014446, + "grad_norm": 0.9479621052742004, + "learning_rate": 0.0001998716249224796, + "loss": 2.8288, + "step": 421 + }, + { + "epoch": 0.0340569768380276, + "grad_norm": 0.9458955526351929, + "learning_rate": 0.00019987082400440968, + "loss": 2.8861, + "step": 422 + }, + { + "epoch": 0.034137680574610606, + "grad_norm": 0.9444572329521179, + "learning_rate": 0.0001998700205972978, + "loss": 2.8877, + "step": 423 + }, + { + "epoch": 0.03421838431119361, + "grad_norm": 0.9263925552368164, + "learning_rate": 0.00019986921470116392, + "loss": 2.8028, + "step": 424 + }, + { + "epoch": 0.034299088047776614, + "grad_norm": 1.0690566301345825, + "learning_rate": 0.00019986840631602812, + "loss": 2.882, + "step": 425 + }, + { + "epoch": 0.03437979178435962, + 
"grad_norm": 0.8999007940292358, + "learning_rate": 0.0001998675954419106, + "loss": 2.8179, + "step": 426 + }, + { + "epoch": 0.03446049552094262, + "grad_norm": 0.894395112991333, + "learning_rate": 0.00019986678207883153, + "loss": 2.814, + "step": 427 + }, + { + "epoch": 0.03454119925752562, + "grad_norm": 0.8621550798416138, + "learning_rate": 0.00019986596622681123, + "loss": 2.7584, + "step": 428 + }, + { + "epoch": 0.034621902994108625, + "grad_norm": 0.9452527165412903, + "learning_rate": 0.00019986514788587, + "loss": 2.8949, + "step": 429 + }, + { + "epoch": 0.03470260673069163, + "grad_norm": 0.8973272442817688, + "learning_rate": 0.0001998643270560282, + "loss": 2.868, + "step": 430 + }, + { + "epoch": 0.034783310467274633, + "grad_norm": 0.9887418150901794, + "learning_rate": 0.00019986350373730634, + "loss": 2.8009, + "step": 431 + }, + { + "epoch": 0.03486401420385764, + "grad_norm": 0.9449994564056396, + "learning_rate": 0.0001998626779297249, + "loss": 2.8305, + "step": 432 + }, + { + "epoch": 0.03494471794044064, + "grad_norm": 1.052871823310852, + "learning_rate": 0.0001998618496333045, + "loss": 2.8136, + "step": 433 + }, + { + "epoch": 0.035025421677023647, + "grad_norm": 0.9600724577903748, + "learning_rate": 0.00019986101884806576, + "loss": 2.7857, + "step": 434 + }, + { + "epoch": 0.03510612541360665, + "grad_norm": 0.874043345451355, + "learning_rate": 0.00019986018557402942, + "loss": 2.8524, + "step": 435 + }, + { + "epoch": 0.035186829150189655, + "grad_norm": 0.9810616374015808, + "learning_rate": 0.0001998593498112162, + "loss": 2.7506, + "step": 436 + }, + { + "epoch": 0.03526753288677266, + "grad_norm": 0.9163016080856323, + "learning_rate": 0.00019985851155964693, + "loss": 2.798, + "step": 437 + }, + { + "epoch": 0.035348236623355664, + "grad_norm": 1.0688380002975464, + "learning_rate": 0.00019985767081934252, + "loss": 2.8916, + "step": 438 + }, + { + "epoch": 0.03542894035993867, + "grad_norm": 0.925020158290863, + 
"learning_rate": 0.00019985682759032393, + "loss": 2.8017, + "step": 439 + }, + { + "epoch": 0.035509644096521666, + "grad_norm": 0.9429430961608887, + "learning_rate": 0.0001998559818726122, + "loss": 2.837, + "step": 440 + }, + { + "epoch": 0.03559034783310467, + "grad_norm": 0.9135627150535583, + "learning_rate": 0.00019985513366622832, + "loss": 2.8423, + "step": 441 + }, + { + "epoch": 0.035671051569687674, + "grad_norm": 0.9218924045562744, + "learning_rate": 0.00019985428297119353, + "loss": 2.854, + "step": 442 + }, + { + "epoch": 0.03575175530627068, + "grad_norm": 0.9307878613471985, + "learning_rate": 0.00019985342978752897, + "loss": 2.8591, + "step": 443 + }, + { + "epoch": 0.03583245904285368, + "grad_norm": 0.935394287109375, + "learning_rate": 0.00019985257411525592, + "loss": 2.8388, + "step": 444 + }, + { + "epoch": 0.03591316277943669, + "grad_norm": 0.890959620475769, + "learning_rate": 0.0001998517159543957, + "loss": 2.78, + "step": 445 + }, + { + "epoch": 0.03599386651601969, + "grad_norm": 1.110924482345581, + "learning_rate": 0.0001998508553049697, + "loss": 2.8117, + "step": 446 + }, + { + "epoch": 0.036074570252602696, + "grad_norm": 0.8774176239967346, + "learning_rate": 0.0001998499921669994, + "loss": 2.8368, + "step": 447 + }, + { + "epoch": 0.0361552739891857, + "grad_norm": 0.9766948819160461, + "learning_rate": 0.00019984912654050625, + "loss": 2.764, + "step": 448 + }, + { + "epoch": 0.036235977725768705, + "grad_norm": 1.1439398527145386, + "learning_rate": 0.00019984825842551187, + "loss": 2.84, + "step": 449 + }, + { + "epoch": 0.03631668146235171, + "grad_norm": 0.8995118737220764, + "learning_rate": 0.0001998473878220379, + "loss": 2.834, + "step": 450 + }, + { + "epoch": 0.03639738519893471, + "grad_norm": 0.9810060858726501, + "learning_rate": 0.000199846514730106, + "loss": 2.9338, + "step": 451 + }, + { + "epoch": 0.03647808893551772, + "grad_norm": 1.0862053632736206, + "learning_rate": 0.00019984563914973795, + "loss": 
2.837, + "step": 452 + }, + { + "epoch": 0.036558792672100715, + "grad_norm": 0.9456702470779419, + "learning_rate": 0.0001998447610809556, + "loss": 2.7664, + "step": 453 + }, + { + "epoch": 0.03663949640868372, + "grad_norm": 1.0714432001113892, + "learning_rate": 0.0001998438805237808, + "loss": 2.8339, + "step": 454 + }, + { + "epoch": 0.036720200145266724, + "grad_norm": 0.89134281873703, + "learning_rate": 0.00019984299747823547, + "loss": 2.7818, + "step": 455 + }, + { + "epoch": 0.03680090388184973, + "grad_norm": 0.869742214679718, + "learning_rate": 0.0001998421119443417, + "loss": 2.7916, + "step": 456 + }, + { + "epoch": 0.03688160761843273, + "grad_norm": 0.9307265281677246, + "learning_rate": 0.00019984122392212149, + "loss": 2.8485, + "step": 457 + }, + { + "epoch": 0.03696231135501574, + "grad_norm": 0.900215744972229, + "learning_rate": 0.00019984033341159698, + "loss": 2.8536, + "step": 458 + }, + { + "epoch": 0.03704301509159874, + "grad_norm": 0.8679699897766113, + "learning_rate": 0.00019983944041279038, + "loss": 2.8344, + "step": 459 + }, + { + "epoch": 0.037123718828181745, + "grad_norm": 0.9540488719940186, + "learning_rate": 0.00019983854492572394, + "loss": 2.873, + "step": 460 + }, + { + "epoch": 0.03720442256476475, + "grad_norm": 0.8697962760925293, + "learning_rate": 0.00019983764695042, + "loss": 2.8122, + "step": 461 + }, + { + "epoch": 0.037285126301347754, + "grad_norm": 0.9534483551979065, + "learning_rate": 0.0001998367464869009, + "loss": 2.8842, + "step": 462 + }, + { + "epoch": 0.03736583003793076, + "grad_norm": 0.8402275443077087, + "learning_rate": 0.00019983584353518911, + "loss": 2.8135, + "step": 463 + }, + { + "epoch": 0.03744653377451376, + "grad_norm": 0.8226146697998047, + "learning_rate": 0.0001998349380953071, + "loss": 2.8036, + "step": 464 + }, + { + "epoch": 0.03752723751109677, + "grad_norm": 0.9292199611663818, + "learning_rate": 0.0001998340301672775, + "loss": 2.7887, + "step": 465 + }, + { + "epoch": 
0.037607941247679764, + "grad_norm": 0.9035555124282837, + "learning_rate": 0.0001998331197511229, + "loss": 2.7851, + "step": 466 + }, + { + "epoch": 0.03768864498426277, + "grad_norm": 0.9411706328392029, + "learning_rate": 0.00019983220684686596, + "loss": 2.7782, + "step": 467 + }, + { + "epoch": 0.03776934872084577, + "grad_norm": 0.9867696166038513, + "learning_rate": 0.0001998312914545295, + "loss": 2.8125, + "step": 468 + }, + { + "epoch": 0.03785005245742878, + "grad_norm": 0.9683675169944763, + "learning_rate": 0.00019983037357413624, + "loss": 2.8325, + "step": 469 + }, + { + "epoch": 0.03793075619401178, + "grad_norm": 0.963941752910614, + "learning_rate": 0.00019982945320570913, + "loss": 2.8281, + "step": 470 + }, + { + "epoch": 0.038011459930594786, + "grad_norm": 0.9812459349632263, + "learning_rate": 0.0001998285303492711, + "loss": 2.765, + "step": 471 + }, + { + "epoch": 0.03809216366717779, + "grad_norm": 0.9681405425071716, + "learning_rate": 0.00019982760500484516, + "loss": 2.8882, + "step": 472 + }, + { + "epoch": 0.038172867403760795, + "grad_norm": 0.8983948826789856, + "learning_rate": 0.00019982667717245432, + "loss": 2.8182, + "step": 473 + }, + { + "epoch": 0.0382535711403438, + "grad_norm": 0.9875261783599854, + "learning_rate": 0.00019982574685212178, + "loss": 2.8072, + "step": 474 + }, + { + "epoch": 0.038334274876926804, + "grad_norm": 0.8889442086219788, + "learning_rate": 0.00019982481404387064, + "loss": 2.8635, + "step": 475 + }, + { + "epoch": 0.03841497861350981, + "grad_norm": 0.8904242515563965, + "learning_rate": 0.00019982387874772418, + "loss": 2.829, + "step": 476 + }, + { + "epoch": 0.03849568235009281, + "grad_norm": 1.0182000398635864, + "learning_rate": 0.00019982294096370574, + "loss": 2.8552, + "step": 477 + }, + { + "epoch": 0.03857638608667582, + "grad_norm": 0.9867151975631714, + "learning_rate": 0.00019982200069183867, + "loss": 2.8201, + "step": 478 + }, + { + "epoch": 0.038657089823258814, + "grad_norm": 
0.9785345196723938, + "learning_rate": 0.0001998210579321464, + "loss": 2.8652, + "step": 479 + }, + { + "epoch": 0.03873779355984182, + "grad_norm": 0.9696915149688721, + "learning_rate": 0.00019982011268465243, + "loss": 2.8276, + "step": 480 + }, + { + "epoch": 0.03881849729642482, + "grad_norm": 0.9257470965385437, + "learning_rate": 0.00019981916494938033, + "loss": 2.8321, + "step": 481 + }, + { + "epoch": 0.03889920103300783, + "grad_norm": 0.9394895434379578, + "learning_rate": 0.00019981821472635369, + "loss": 2.8747, + "step": 482 + }, + { + "epoch": 0.03897990476959083, + "grad_norm": 0.9888504147529602, + "learning_rate": 0.00019981726201559626, + "loss": 2.8201, + "step": 483 + }, + { + "epoch": 0.039060608506173836, + "grad_norm": 0.8957003951072693, + "learning_rate": 0.0001998163068171317, + "loss": 2.8255, + "step": 484 + }, + { + "epoch": 0.03914131224275684, + "grad_norm": 0.9792008996009827, + "learning_rate": 0.00019981534913098383, + "loss": 2.7985, + "step": 485 + }, + { + "epoch": 0.039222015979339844, + "grad_norm": 0.8689060211181641, + "learning_rate": 0.00019981438895717656, + "loss": 2.7945, + "step": 486 + }, + { + "epoch": 0.03930271971592285, + "grad_norm": 0.9932593703269958, + "learning_rate": 0.0001998134262957338, + "loss": 2.9041, + "step": 487 + }, + { + "epoch": 0.03938342345250585, + "grad_norm": 0.8496069312095642, + "learning_rate": 0.00019981246114667955, + "loss": 2.8433, + "step": 488 + }, + { + "epoch": 0.03946412718908886, + "grad_norm": 0.8484126925468445, + "learning_rate": 0.00019981149351003786, + "loss": 2.7872, + "step": 489 + }, + { + "epoch": 0.03954483092567186, + "grad_norm": 0.9208858013153076, + "learning_rate": 0.00019981052338583283, + "loss": 2.7776, + "step": 490 + }, + { + "epoch": 0.03962553466225486, + "grad_norm": 0.9305418729782104, + "learning_rate": 0.00019980955077408865, + "loss": 2.7851, + "step": 491 + }, + { + "epoch": 0.03970623839883786, + "grad_norm": 0.9803212881088257, + 
"learning_rate": 0.00019980857567482955, + "loss": 2.8469, + "step": 492 + }, + { + "epoch": 0.03978694213542087, + "grad_norm": 0.9165790677070618, + "learning_rate": 0.00019980759808807985, + "loss": 2.8513, + "step": 493 + }, + { + "epoch": 0.03986764587200387, + "grad_norm": 0.9153794050216675, + "learning_rate": 0.00019980661801386393, + "loss": 2.8322, + "step": 494 + }, + { + "epoch": 0.039948349608586876, + "grad_norm": 0.89347904920578, + "learning_rate": 0.00019980563545220616, + "loss": 2.8316, + "step": 495 + }, + { + "epoch": 0.04002905334516988, + "grad_norm": 0.9882236123085022, + "learning_rate": 0.00019980465040313105, + "loss": 2.7471, + "step": 496 + }, + { + "epoch": 0.040109757081752885, + "grad_norm": 0.9391099810600281, + "learning_rate": 0.00019980366286666322, + "loss": 2.8182, + "step": 497 + }, + { + "epoch": 0.04019046081833589, + "grad_norm": 1.0155293941497803, + "learning_rate": 0.00019980267284282717, + "loss": 2.8721, + "step": 498 + }, + { + "epoch": 0.040271164554918894, + "grad_norm": 0.9952930212020874, + "learning_rate": 0.00019980168033164765, + "loss": 2.8538, + "step": 499 + }, + { + "epoch": 0.0403518682915019, + "grad_norm": 0.8385666608810425, + "learning_rate": 0.00019980068533314934, + "loss": 2.8242, + "step": 500 + }, + { + "epoch": 0.0404325720280849, + "grad_norm": 0.8747559785842896, + "learning_rate": 0.0001997996878473571, + "loss": 2.7908, + "step": 501 + }, + { + "epoch": 0.04051327576466791, + "grad_norm": 0.9267926216125488, + "learning_rate": 0.00019979868787429575, + "loss": 2.8359, + "step": 502 + }, + { + "epoch": 0.04059397950125091, + "grad_norm": 0.8194155693054199, + "learning_rate": 0.00019979768541399022, + "loss": 2.8161, + "step": 503 + }, + { + "epoch": 0.04067468323783391, + "grad_norm": 0.8923258185386658, + "learning_rate": 0.00019979668046646548, + "loss": 2.7547, + "step": 504 + }, + { + "epoch": 0.04075538697441691, + "grad_norm": 0.8965646028518677, + "learning_rate": 
0.00019979567303174663, + "loss": 2.8432, + "step": 505 + }, + { + "epoch": 0.04083609071099992, + "grad_norm": 0.814481794834137, + "learning_rate": 0.0001997946631098587, + "loss": 2.8327, + "step": 506 + }, + { + "epoch": 0.04091679444758292, + "grad_norm": 0.8806928396224976, + "learning_rate": 0.00019979365070082694, + "loss": 2.8573, + "step": 507 + }, + { + "epoch": 0.040997498184165926, + "grad_norm": 0.8546919822692871, + "learning_rate": 0.00019979263580467653, + "loss": 2.8618, + "step": 508 + }, + { + "epoch": 0.04107820192074893, + "grad_norm": 0.8557277321815491, + "learning_rate": 0.00019979161842143274, + "loss": 2.8454, + "step": 509 + }, + { + "epoch": 0.041158905657331935, + "grad_norm": 0.9153180122375488, + "learning_rate": 0.00019979059855112098, + "loss": 2.8027, + "step": 510 + }, + { + "epoch": 0.04123960939391494, + "grad_norm": 0.8616741895675659, + "learning_rate": 0.00019978957619376666, + "loss": 2.7628, + "step": 511 + }, + { + "epoch": 0.04132031313049794, + "grad_norm": 0.8777137398719788, + "learning_rate": 0.00019978855134939524, + "loss": 2.8443, + "step": 512 + }, + { + "epoch": 0.04140101686708095, + "grad_norm": 0.852100133895874, + "learning_rate": 0.0001997875240180323, + "loss": 2.8125, + "step": 513 + }, + { + "epoch": 0.04148172060366395, + "grad_norm": 0.8470742702484131, + "learning_rate": 0.00019978649419970338, + "loss": 2.8139, + "step": 514 + }, + { + "epoch": 0.041562424340246956, + "grad_norm": 0.8890305161476135, + "learning_rate": 0.0001997854618944342, + "loss": 2.8633, + "step": 515 + }, + { + "epoch": 0.04164312807682996, + "grad_norm": 0.8893599510192871, + "learning_rate": 0.00019978442710225043, + "loss": 2.8066, + "step": 516 + }, + { + "epoch": 0.04172383181341296, + "grad_norm": 0.9093891382217407, + "learning_rate": 0.00019978338982317792, + "loss": 2.8026, + "step": 517 + }, + { + "epoch": 0.04180453554999596, + "grad_norm": 0.9775434136390686, + "learning_rate": 0.00019978235005724252, + "loss": 
2.849, + "step": 518 + }, + { + "epoch": 0.04188523928657897, + "grad_norm": 1.0014091730117798, + "learning_rate": 0.00019978130780447012, + "loss": 2.8572, + "step": 519 + }, + { + "epoch": 0.04196594302316197, + "grad_norm": 0.8487632870674133, + "learning_rate": 0.00019978026306488668, + "loss": 2.7611, + "step": 520 + }, + { + "epoch": 0.042046646759744975, + "grad_norm": 0.86592698097229, + "learning_rate": 0.00019977921583851825, + "loss": 2.7616, + "step": 521 + }, + { + "epoch": 0.04212735049632798, + "grad_norm": 1.0285916328430176, + "learning_rate": 0.00019977816612539093, + "loss": 2.8049, + "step": 522 + }, + { + "epoch": 0.042208054232910984, + "grad_norm": 0.9716495871543884, + "learning_rate": 0.00019977711392553092, + "loss": 2.8459, + "step": 523 + }, + { + "epoch": 0.04228875796949399, + "grad_norm": 0.8842264413833618, + "learning_rate": 0.0001997760592389644, + "loss": 2.7934, + "step": 524 + }, + { + "epoch": 0.04236946170607699, + "grad_norm": 0.8839964866638184, + "learning_rate": 0.00019977500206571765, + "loss": 2.8135, + "step": 525 + }, + { + "epoch": 0.04245016544266, + "grad_norm": 0.870331346988678, + "learning_rate": 0.00019977394240581705, + "loss": 2.8684, + "step": 526 + }, + { + "epoch": 0.042530869179243, + "grad_norm": 0.8844720125198364, + "learning_rate": 0.000199772880259289, + "loss": 2.7867, + "step": 527 + }, + { + "epoch": 0.042611572915826006, + "grad_norm": 0.9353455901145935, + "learning_rate": 0.00019977181562615994, + "loss": 2.8051, + "step": 528 + }, + { + "epoch": 0.04269227665240901, + "grad_norm": 0.9530816078186035, + "learning_rate": 0.00019977074850645646, + "loss": 2.7915, + "step": 529 + }, + { + "epoch": 0.04277298038899201, + "grad_norm": 0.8984190821647644, + "learning_rate": 0.00019976967890020507, + "loss": 2.7957, + "step": 530 + }, + { + "epoch": 0.04285368412557501, + "grad_norm": 0.9146613478660583, + "learning_rate": 0.00019976860680743252, + "loss": 2.9053, + "step": 531 + }, + { + "epoch": 
0.042934387862158016, + "grad_norm": 0.9228026866912842, + "learning_rate": 0.0001997675322281655, + "loss": 2.8578, + "step": 532 + }, + { + "epoch": 0.04301509159874102, + "grad_norm": 0.8266343474388123, + "learning_rate": 0.0001997664551624308, + "loss": 2.7393, + "step": 533 + }, + { + "epoch": 0.043095795335324025, + "grad_norm": 0.9197628498077393, + "learning_rate": 0.0001997653756102552, + "loss": 2.8828, + "step": 534 + }, + { + "epoch": 0.04317649907190703, + "grad_norm": 0.9145991802215576, + "learning_rate": 0.00019976429357166566, + "loss": 2.7767, + "step": 535 + }, + { + "epoch": 0.04325720280849003, + "grad_norm": 0.9123281240463257, + "learning_rate": 0.00019976320904668913, + "loss": 2.7993, + "step": 536 + }, + { + "epoch": 0.04333790654507304, + "grad_norm": 0.8597636818885803, + "learning_rate": 0.00019976212203535266, + "loss": 2.8148, + "step": 537 + }, + { + "epoch": 0.04341861028165604, + "grad_norm": 0.8963296413421631, + "learning_rate": 0.00019976103253768334, + "loss": 2.7722, + "step": 538 + }, + { + "epoch": 0.043499314018239046, + "grad_norm": 0.9480688571929932, + "learning_rate": 0.0001997599405537083, + "loss": 2.8038, + "step": 539 + }, + { + "epoch": 0.04358001775482205, + "grad_norm": 0.8115736842155457, + "learning_rate": 0.00019975884608345476, + "loss": 2.8069, + "step": 540 + }, + { + "epoch": 0.043660721491405055, + "grad_norm": 0.9642506837844849, + "learning_rate": 0.00019975774912695, + "loss": 2.8703, + "step": 541 + }, + { + "epoch": 0.04374142522798805, + "grad_norm": 0.9638697504997253, + "learning_rate": 0.0001997566496842214, + "loss": 2.8223, + "step": 542 + }, + { + "epoch": 0.04382212896457106, + "grad_norm": 0.9478490352630615, + "learning_rate": 0.00019975554775529628, + "loss": 2.8164, + "step": 543 + }, + { + "epoch": 0.04390283270115406, + "grad_norm": 1.1771583557128906, + "learning_rate": 0.00019975444334020215, + "loss": 2.7969, + "step": 544 + }, + { + "epoch": 0.043983536437737066, + "grad_norm": 
0.9597339034080505, + "learning_rate": 0.00019975333643896655, + "loss": 2.8025, + "step": 545 + }, + { + "epoch": 0.04406424017432007, + "grad_norm": 0.981595516204834, + "learning_rate": 0.00019975222705161704, + "loss": 2.7994, + "step": 546 + }, + { + "epoch": 0.044144943910903074, + "grad_norm": 0.9581133723258972, + "learning_rate": 0.00019975111517818127, + "loss": 2.802, + "step": 547 + }, + { + "epoch": 0.04422564764748608, + "grad_norm": 0.8643878698348999, + "learning_rate": 0.00019975000081868697, + "loss": 2.7958, + "step": 548 + }, + { + "epoch": 0.04430635138406908, + "grad_norm": 1.2188652753829956, + "learning_rate": 0.0001997488839731619, + "loss": 2.8786, + "step": 549 + }, + { + "epoch": 0.04438705512065209, + "grad_norm": 0.9138071537017822, + "learning_rate": 0.00019974776464163387, + "loss": 2.809, + "step": 550 + }, + { + "epoch": 0.04446775885723509, + "grad_norm": 0.9604587554931641, + "learning_rate": 0.00019974664282413083, + "loss": 2.8009, + "step": 551 + }, + { + "epoch": 0.044548462593818096, + "grad_norm": 1.0271116495132446, + "learning_rate": 0.00019974551852068072, + "loss": 2.8689, + "step": 552 + }, + { + "epoch": 0.0446291663304011, + "grad_norm": 0.9330877065658569, + "learning_rate": 0.00019974439173131155, + "loss": 2.7613, + "step": 553 + }, + { + "epoch": 0.044709870066984105, + "grad_norm": 0.9549325108528137, + "learning_rate": 0.00019974326245605136, + "loss": 2.8314, + "step": 554 + }, + { + "epoch": 0.0447905738035671, + "grad_norm": 0.8928439021110535, + "learning_rate": 0.00019974213069492836, + "loss": 2.8097, + "step": 555 + }, + { + "epoch": 0.044871277540150106, + "grad_norm": 0.8705076575279236, + "learning_rate": 0.00019974099644797075, + "loss": 2.8112, + "step": 556 + }, + { + "epoch": 0.04495198127673311, + "grad_norm": 0.988345742225647, + "learning_rate": 0.00019973985971520676, + "loss": 2.7648, + "step": 557 + }, + { + "epoch": 0.045032685013316115, + "grad_norm": 0.9161957502365112, + "learning_rate": 
0.00019973872049666475, + "loss": 2.8691, + "step": 558 + }, + { + "epoch": 0.04511338874989912, + "grad_norm": 0.8404076099395752, + "learning_rate": 0.00019973757879237312, + "loss": 2.7708, + "step": 559 + }, + { + "epoch": 0.045194092486482124, + "grad_norm": 1.05247962474823, + "learning_rate": 0.0001997364346023603, + "loss": 2.8638, + "step": 560 + }, + { + "epoch": 0.04527479622306513, + "grad_norm": 0.9235066175460815, + "learning_rate": 0.00019973528792665483, + "loss": 2.7876, + "step": 561 + }, + { + "epoch": 0.04535549995964813, + "grad_norm": 1.220075249671936, + "learning_rate": 0.00019973413876528526, + "loss": 2.8563, + "step": 562 + }, + { + "epoch": 0.04543620369623114, + "grad_norm": 0.9098384976387024, + "learning_rate": 0.00019973298711828025, + "loss": 2.8427, + "step": 563 + }, + { + "epoch": 0.04551690743281414, + "grad_norm": 0.8792217969894409, + "learning_rate": 0.00019973183298566848, + "loss": 2.8673, + "step": 564 + }, + { + "epoch": 0.045597611169397145, + "grad_norm": 0.9895235896110535, + "learning_rate": 0.00019973067636747875, + "loss": 2.8262, + "step": 565 + }, + { + "epoch": 0.04567831490598015, + "grad_norm": 0.9191479086875916, + "learning_rate": 0.00019972951726373984, + "loss": 2.8005, + "step": 566 + }, + { + "epoch": 0.045759018642563154, + "grad_norm": 0.9631491899490356, + "learning_rate": 0.0001997283556744807, + "loss": 2.8438, + "step": 567 + }, + { + "epoch": 0.04583972237914615, + "grad_norm": 0.8302746415138245, + "learning_rate": 0.00019972719159973024, + "loss": 2.8221, + "step": 568 + }, + { + "epoch": 0.045920426115729156, + "grad_norm": 0.8238534927368164, + "learning_rate": 0.00019972602503951748, + "loss": 2.7674, + "step": 569 + }, + { + "epoch": 0.04600112985231216, + "grad_norm": 0.9675811529159546, + "learning_rate": 0.00019972485599387146, + "loss": 2.8457, + "step": 570 + }, + { + "epoch": 0.046081833588895164, + "grad_norm": 0.8663914203643799, + "learning_rate": 0.00019972368446282134, + "loss": 
2.7851, + "step": 571 + }, + { + "epoch": 0.04616253732547817, + "grad_norm": 0.9904592633247375, + "learning_rate": 0.00019972251044639636, + "loss": 2.8792, + "step": 572 + }, + { + "epoch": 0.04624324106206117, + "grad_norm": 0.907600462436676, + "learning_rate": 0.0001997213339446257, + "loss": 2.7991, + "step": 573 + }, + { + "epoch": 0.04632394479864418, + "grad_norm": 0.871362566947937, + "learning_rate": 0.00019972015495753876, + "loss": 2.7959, + "step": 574 + }, + { + "epoch": 0.04640464853522718, + "grad_norm": 0.9664937853813171, + "learning_rate": 0.00019971897348516486, + "loss": 2.7847, + "step": 575 + }, + { + "epoch": 0.046485352271810186, + "grad_norm": 1.0670619010925293, + "learning_rate": 0.0001997177895275335, + "loss": 2.8864, + "step": 576 + }, + { + "epoch": 0.04656605600839319, + "grad_norm": 0.9281025528907776, + "learning_rate": 0.00019971660308467414, + "loss": 2.8568, + "step": 577 + }, + { + "epoch": 0.046646759744976195, + "grad_norm": 0.8964822888374329, + "learning_rate": 0.00019971541415661639, + "loss": 2.7246, + "step": 578 + }, + { + "epoch": 0.0467274634815592, + "grad_norm": 0.8921917676925659, + "learning_rate": 0.00019971422274338985, + "loss": 2.8513, + "step": 579 + }, + { + "epoch": 0.0468081672181422, + "grad_norm": 0.9550159573554993, + "learning_rate": 0.0001997130288450242, + "loss": 2.7615, + "step": 580 + }, + { + "epoch": 0.0468888709547252, + "grad_norm": 0.9330170154571533, + "learning_rate": 0.00019971183246154925, + "loss": 2.9017, + "step": 581 + }, + { + "epoch": 0.046969574691308205, + "grad_norm": 0.9125271439552307, + "learning_rate": 0.00019971063359299477, + "loss": 2.8263, + "step": 582 + }, + { + "epoch": 0.04705027842789121, + "grad_norm": 1.0005927085876465, + "learning_rate": 0.00019970943223939066, + "loss": 2.8371, + "step": 583 + }, + { + "epoch": 0.047130982164474214, + "grad_norm": 1.0333613157272339, + "learning_rate": 0.00019970822840076685, + "loss": 2.8275, + "step": 584 + }, + { + 
"epoch": 0.04721168590105722, + "grad_norm": 0.8684708476066589, + "learning_rate": 0.00019970702207715334, + "loss": 2.8343, + "step": 585 + }, + { + "epoch": 0.04729238963764022, + "grad_norm": 1.1112761497497559, + "learning_rate": 0.00019970581326858025, + "loss": 2.9012, + "step": 586 + }, + { + "epoch": 0.04737309337422323, + "grad_norm": 1.0187962055206299, + "learning_rate": 0.00019970460197507763, + "loss": 2.8423, + "step": 587 + }, + { + "epoch": 0.04745379711080623, + "grad_norm": 0.9802024960517883, + "learning_rate": 0.00019970338819667567, + "loss": 2.867, + "step": 588 + }, + { + "epoch": 0.047534500847389236, + "grad_norm": 0.9825551509857178, + "learning_rate": 0.00019970217193340467, + "loss": 2.8359, + "step": 589 + }, + { + "epoch": 0.04761520458397224, + "grad_norm": 1.1399210691452026, + "learning_rate": 0.00019970095318529494, + "loss": 2.8356, + "step": 590 + }, + { + "epoch": 0.047695908320555244, + "grad_norm": 1.0373995304107666, + "learning_rate": 0.00019969973195237684, + "loss": 2.8005, + "step": 591 + }, + { + "epoch": 0.04777661205713825, + "grad_norm": 1.133596420288086, + "learning_rate": 0.00019969850823468077, + "loss": 2.8778, + "step": 592 + }, + { + "epoch": 0.047857315793721246, + "grad_norm": 1.0187327861785889, + "learning_rate": 0.00019969728203223728, + "loss": 2.8291, + "step": 593 + }, + { + "epoch": 0.04793801953030425, + "grad_norm": 1.0588128566741943, + "learning_rate": 0.00019969605334507688, + "loss": 2.9396, + "step": 594 + }, + { + "epoch": 0.048018723266887255, + "grad_norm": 0.8783230781555176, + "learning_rate": 0.00019969482217323026, + "loss": 2.8076, + "step": 595 + }, + { + "epoch": 0.04809942700347026, + "grad_norm": 1.0500195026397705, + "learning_rate": 0.00019969358851672805, + "loss": 2.9099, + "step": 596 + }, + { + "epoch": 0.04818013074005326, + "grad_norm": 0.9523593187332153, + "learning_rate": 0.000199692352375601, + "loss": 2.7448, + "step": 597 + }, + { + "epoch": 0.04826083447663627, + 
"grad_norm": 1.0008500814437866, + "learning_rate": 0.00019969111374987995, + "loss": 2.8212, + "step": 598 + }, + { + "epoch": 0.04834153821321927, + "grad_norm": 0.8992626070976257, + "learning_rate": 0.00019968987263959575, + "loss": 2.8698, + "step": 599 + }, + { + "epoch": 0.048422241949802276, + "grad_norm": 0.9914852380752563, + "learning_rate": 0.00019968862904477935, + "loss": 2.8221, + "step": 600 + }, + { + "epoch": 0.04850294568638528, + "grad_norm": 0.9633241295814514, + "learning_rate": 0.00019968738296546168, + "loss": 2.8835, + "step": 601 + }, + { + "epoch": 0.048583649422968285, + "grad_norm": 1.055831789970398, + "learning_rate": 0.00019968613440167387, + "loss": 2.8781, + "step": 602 + }, + { + "epoch": 0.04866435315955129, + "grad_norm": 0.913856029510498, + "learning_rate": 0.000199684883353447, + "loss": 2.7863, + "step": 603 + }, + { + "epoch": 0.048745056896134294, + "grad_norm": 0.8429243564605713, + "learning_rate": 0.00019968362982081226, + "loss": 2.7753, + "step": 604 + }, + { + "epoch": 0.0488257606327173, + "grad_norm": 0.9324761629104614, + "learning_rate": 0.0001996823738038009, + "loss": 2.8058, + "step": 605 + }, + { + "epoch": 0.048906464369300295, + "grad_norm": 1.0004981756210327, + "learning_rate": 0.0001996811153024442, + "loss": 2.8537, + "step": 606 + }, + { + "epoch": 0.0489871681058833, + "grad_norm": 0.9438043236732483, + "learning_rate": 0.00019967985431677354, + "loss": 2.8828, + "step": 607 + }, + { + "epoch": 0.049067871842466304, + "grad_norm": 0.9359340071678162, + "learning_rate": 0.00019967859084682034, + "loss": 2.8149, + "step": 608 + }, + { + "epoch": 0.04914857557904931, + "grad_norm": 1.0400227308273315, + "learning_rate": 0.00019967732489261609, + "loss": 2.8489, + "step": 609 + }, + { + "epoch": 0.04922927931563231, + "grad_norm": 0.8978031277656555, + "learning_rate": 0.00019967605645419237, + "loss": 2.8599, + "step": 610 + }, + { + "epoch": 0.04930998305221532, + "grad_norm": 0.9982689619064331, + 
"learning_rate": 0.00019967478553158073, + "loss": 2.9024, + "step": 611 + }, + { + "epoch": 0.04939068678879832, + "grad_norm": 1.0695222616195679, + "learning_rate": 0.00019967351212481292, + "loss": 2.8483, + "step": 612 + }, + { + "epoch": 0.049471390525381326, + "grad_norm": 1.0615525245666504, + "learning_rate": 0.0001996722362339206, + "loss": 2.806, + "step": 613 + }, + { + "epoch": 0.04955209426196433, + "grad_norm": 0.9624890089035034, + "learning_rate": 0.0001996709578589356, + "loss": 2.8641, + "step": 614 + }, + { + "epoch": 0.049632797998547334, + "grad_norm": 0.9156595468521118, + "learning_rate": 0.00019966967699988985, + "loss": 2.7991, + "step": 615 + }, + { + "epoch": 0.04971350173513034, + "grad_norm": 0.8687645196914673, + "learning_rate": 0.00019966839365681517, + "loss": 2.774, + "step": 616 + }, + { + "epoch": 0.04979420547171334, + "grad_norm": 0.9175437688827515, + "learning_rate": 0.00019966710782974359, + "loss": 2.8064, + "step": 617 + }, + { + "epoch": 0.04987490920829635, + "grad_norm": 0.8897463083267212, + "learning_rate": 0.00019966581951870715, + "loss": 2.8487, + "step": 618 + }, + { + "epoch": 0.049955612944879345, + "grad_norm": 0.8908397555351257, + "learning_rate": 0.00019966452872373795, + "loss": 2.8523, + "step": 619 + }, + { + "epoch": 0.05003631668146235, + "grad_norm": 0.95484858751297, + "learning_rate": 0.00019966323544486818, + "loss": 2.8471, + "step": 620 + }, + { + "epoch": 0.050117020418045354, + "grad_norm": 0.9995831251144409, + "learning_rate": 0.00019966193968213008, + "loss": 2.8341, + "step": 621 + }, + { + "epoch": 0.05019772415462836, + "grad_norm": 0.8731706142425537, + "learning_rate": 0.00019966064143555587, + "loss": 2.8491, + "step": 622 + }, + { + "epoch": 0.05027842789121136, + "grad_norm": 0.9213298559188843, + "learning_rate": 0.000199659340705178, + "loss": 2.8256, + "step": 623 + }, + { + "epoch": 0.050359131627794367, + "grad_norm": 0.9565179347991943, + "learning_rate": 
0.00019965803749102885, + "loss": 2.8177, + "step": 624 + }, + { + "epoch": 0.05043983536437737, + "grad_norm": 1.0076881647109985, + "learning_rate": 0.00019965673179314086, + "loss": 2.7812, + "step": 625 + }, + { + "epoch": 0.050520539100960375, + "grad_norm": 0.989647388458252, + "learning_rate": 0.00019965542361154666, + "loss": 2.9226, + "step": 626 + }, + { + "epoch": 0.05060124283754338, + "grad_norm": 0.9671580791473389, + "learning_rate": 0.00019965411294627878, + "loss": 2.8204, + "step": 627 + }, + { + "epoch": 0.050681946574126384, + "grad_norm": 0.9275986552238464, + "learning_rate": 0.00019965279979736989, + "loss": 2.8481, + "step": 628 + }, + { + "epoch": 0.05076265031070939, + "grad_norm": 0.9949543476104736, + "learning_rate": 0.00019965148416485273, + "loss": 2.8606, + "step": 629 + }, + { + "epoch": 0.05084335404729239, + "grad_norm": 0.9506482481956482, + "learning_rate": 0.0001996501660487601, + "loss": 2.8088, + "step": 630 + }, + { + "epoch": 0.0509240577838754, + "grad_norm": 0.9147887229919434, + "learning_rate": 0.00019964884544912488, + "loss": 2.7997, + "step": 631 + }, + { + "epoch": 0.051004761520458394, + "grad_norm": 0.8964840769767761, + "learning_rate": 0.00019964752236597993, + "loss": 2.8342, + "step": 632 + }, + { + "epoch": 0.0510854652570414, + "grad_norm": 0.931811511516571, + "learning_rate": 0.00019964619679935824, + "loss": 2.8229, + "step": 633 + }, + { + "epoch": 0.0511661689936244, + "grad_norm": 0.8634423017501831, + "learning_rate": 0.00019964486874929282, + "loss": 2.803, + "step": 634 + }, + { + "epoch": 0.05124687273020741, + "grad_norm": 0.892223596572876, + "learning_rate": 0.00019964353821581683, + "loss": 2.802, + "step": 635 + }, + { + "epoch": 0.05132757646679041, + "grad_norm": 0.8373630046844482, + "learning_rate": 0.00019964220519896338, + "loss": 2.7693, + "step": 636 + }, + { + "epoch": 0.051408280203373416, + "grad_norm": 0.8729730248451233, + "learning_rate": 0.0001996408696987657, + "loss": 2.8467, 
+ "step": 637 + }, + { + "epoch": 0.05148898393995642, + "grad_norm": 0.8994413614273071, + "learning_rate": 0.0001996395317152571, + "loss": 2.8837, + "step": 638 + }, + { + "epoch": 0.051569687676539425, + "grad_norm": 0.9146113395690918, + "learning_rate": 0.0001996381912484709, + "loss": 2.8189, + "step": 639 + }, + { + "epoch": 0.05165039141312243, + "grad_norm": 0.9330562353134155, + "learning_rate": 0.00019963684829844052, + "loss": 2.7873, + "step": 640 + }, + { + "epoch": 0.05173109514970543, + "grad_norm": 0.9076224565505981, + "learning_rate": 0.00019963550286519944, + "loss": 2.802, + "step": 641 + }, + { + "epoch": 0.05181179888628844, + "grad_norm": 0.9580704569816589, + "learning_rate": 0.00019963415494878115, + "loss": 2.8173, + "step": 642 + }, + { + "epoch": 0.05189250262287144, + "grad_norm": 0.9291248917579651, + "learning_rate": 0.00019963280454921928, + "loss": 2.7866, + "step": 643 + }, + { + "epoch": 0.05197320635945444, + "grad_norm": 0.9815296530723572, + "learning_rate": 0.0001996314516665475, + "loss": 2.7903, + "step": 644 + }, + { + "epoch": 0.052053910096037444, + "grad_norm": 0.9461820721626282, + "learning_rate": 0.00019963009630079949, + "loss": 2.7854, + "step": 645 + }, + { + "epoch": 0.05213461383262045, + "grad_norm": 0.9660771489143372, + "learning_rate": 0.00019962873845200908, + "loss": 2.9187, + "step": 646 + }, + { + "epoch": 0.05221531756920345, + "grad_norm": 0.8987802863121033, + "learning_rate": 0.00019962737812021002, + "loss": 2.8854, + "step": 647 + }, + { + "epoch": 0.05229602130578646, + "grad_norm": 0.9810429215431213, + "learning_rate": 0.0001996260153054363, + "loss": 2.8974, + "step": 648 + }, + { + "epoch": 0.05237672504236946, + "grad_norm": 0.8185738325119019, + "learning_rate": 0.00019962465000772183, + "loss": 2.797, + "step": 649 + }, + { + "epoch": 0.052457428778952465, + "grad_norm": 0.8976237773895264, + "learning_rate": 0.0001996232822271007, + "loss": 2.8557, + "step": 650 + }, + { + "epoch": 
0.05253813251553547, + "grad_norm": 0.8591496348381042, + "learning_rate": 0.0001996219119636069, + "loss": 2.8521, + "step": 651 + }, + { + "epoch": 0.052618836252118474, + "grad_norm": 0.8907031416893005, + "learning_rate": 0.00019962053921727472, + "loss": 2.8117, + "step": 652 + }, + { + "epoch": 0.05269953998870148, + "grad_norm": 0.9034241437911987, + "learning_rate": 0.00019961916398813823, + "loss": 2.741, + "step": 653 + }, + { + "epoch": 0.05278024372528448, + "grad_norm": 0.8284802436828613, + "learning_rate": 0.00019961778627623176, + "loss": 2.776, + "step": 654 + }, + { + "epoch": 0.05286094746186749, + "grad_norm": 0.8459529876708984, + "learning_rate": 0.00019961640608158967, + "loss": 2.8027, + "step": 655 + }, + { + "epoch": 0.05294165119845049, + "grad_norm": 0.9720042943954468, + "learning_rate": 0.00019961502340424636, + "loss": 2.9086, + "step": 656 + }, + { + "epoch": 0.05302235493503349, + "grad_norm": 0.8581427335739136, + "learning_rate": 0.00019961363824423626, + "loss": 2.8347, + "step": 657 + }, + { + "epoch": 0.05310305867161649, + "grad_norm": 0.9545331597328186, + "learning_rate": 0.00019961225060159386, + "loss": 2.828, + "step": 658 + }, + { + "epoch": 0.0531837624081995, + "grad_norm": 1.0303562879562378, + "learning_rate": 0.00019961086047635385, + "loss": 2.8461, + "step": 659 + }, + { + "epoch": 0.0532644661447825, + "grad_norm": 0.86605304479599, + "learning_rate": 0.0001996094678685508, + "loss": 2.8355, + "step": 660 + }, + { + "epoch": 0.053345169881365506, + "grad_norm": 0.8146334886550903, + "learning_rate": 0.0001996080727782194, + "loss": 2.8638, + "step": 661 + }, + { + "epoch": 0.05342587361794851, + "grad_norm": 0.9434560537338257, + "learning_rate": 0.00019960667520539446, + "loss": 2.8196, + "step": 662 + }, + { + "epoch": 0.053506577354531515, + "grad_norm": 0.9362602829933167, + "learning_rate": 0.00019960527515011084, + "loss": 2.8452, + "step": 663 + }, + { + "epoch": 0.05358728109111452, + "grad_norm": 
0.828713059425354, + "learning_rate": 0.00019960387261240334, + "loss": 2.8079, + "step": 664 + }, + { + "epoch": 0.053667984827697524, + "grad_norm": 0.8610214591026306, + "learning_rate": 0.00019960246759230697, + "loss": 2.8197, + "step": 665 + }, + { + "epoch": 0.05374868856428053, + "grad_norm": 0.8913124799728394, + "learning_rate": 0.00019960106008985674, + "loss": 2.8392, + "step": 666 + }, + { + "epoch": 0.05382939230086353, + "grad_norm": 0.8109759092330933, + "learning_rate": 0.00019959965010508778, + "loss": 2.7961, + "step": 667 + }, + { + "epoch": 0.05391009603744654, + "grad_norm": 0.8714832663536072, + "learning_rate": 0.00019959823763803514, + "loss": 2.7984, + "step": 668 + }, + { + "epoch": 0.05399079977402954, + "grad_norm": 0.9008125066757202, + "learning_rate": 0.00019959682268873408, + "loss": 2.8319, + "step": 669 + }, + { + "epoch": 0.05407150351061254, + "grad_norm": 0.8718584775924683, + "learning_rate": 0.00019959540525721985, + "loss": 2.7973, + "step": 670 + }, + { + "epoch": 0.05415220724719554, + "grad_norm": 0.8666327595710754, + "learning_rate": 0.00019959398534352774, + "loss": 2.8296, + "step": 671 + }, + { + "epoch": 0.05423291098377855, + "grad_norm": 0.9755229949951172, + "learning_rate": 0.00019959256294769322, + "loss": 2.8358, + "step": 672 + }, + { + "epoch": 0.05431361472036155, + "grad_norm": 1.193708062171936, + "learning_rate": 0.0001995911380697517, + "loss": 2.7672, + "step": 673 + }, + { + "epoch": 0.054394318456944556, + "grad_norm": 0.9104088544845581, + "learning_rate": 0.00019958971070973866, + "loss": 2.8389, + "step": 674 + }, + { + "epoch": 0.05447502219352756, + "grad_norm": 0.9266251921653748, + "learning_rate": 0.0001995882808676897, + "loss": 2.8226, + "step": 675 + }, + { + "epoch": 0.054555725930110564, + "grad_norm": 1.1161282062530518, + "learning_rate": 0.00019958684854364046, + "loss": 2.8236, + "step": 676 + }, + { + "epoch": 0.05463642966669357, + "grad_norm": 0.9200586080551147, + 
"learning_rate": 0.00019958541373762666, + "loss": 2.8074, + "step": 677 + }, + { + "epoch": 0.05471713340327657, + "grad_norm": 1.0372560024261475, + "learning_rate": 0.000199583976449684, + "loss": 2.815, + "step": 678 + }, + { + "epoch": 0.05479783713985958, + "grad_norm": 0.8822301030158997, + "learning_rate": 0.0001995825366798483, + "loss": 2.7985, + "step": 679 + }, + { + "epoch": 0.05487854087644258, + "grad_norm": 0.9226076006889343, + "learning_rate": 0.00019958109442815553, + "loss": 2.7649, + "step": 680 + }, + { + "epoch": 0.054959244613025586, + "grad_norm": 0.8769479990005493, + "learning_rate": 0.00019957964969464156, + "loss": 2.8483, + "step": 681 + }, + { + "epoch": 0.05503994834960859, + "grad_norm": 0.8601027727127075, + "learning_rate": 0.0001995782024793424, + "loss": 2.8072, + "step": 682 + }, + { + "epoch": 0.05512065208619159, + "grad_norm": 0.9684911370277405, + "learning_rate": 0.00019957675278229416, + "loss": 2.8693, + "step": 683 + }, + { + "epoch": 0.05520135582277459, + "grad_norm": 0.9119890928268433, + "learning_rate": 0.00019957530060353294, + "loss": 2.853, + "step": 684 + }, + { + "epoch": 0.055282059559357596, + "grad_norm": 0.9588247537612915, + "learning_rate": 0.0001995738459430949, + "loss": 2.8435, + "step": 685 + }, + { + "epoch": 0.0553627632959406, + "grad_norm": 0.8317441940307617, + "learning_rate": 0.00019957238880101636, + "loss": 2.8208, + "step": 686 + }, + { + "epoch": 0.055443467032523605, + "grad_norm": 0.92695152759552, + "learning_rate": 0.00019957092917733361, + "loss": 2.8378, + "step": 687 + }, + { + "epoch": 0.05552417076910661, + "grad_norm": 0.8908315300941467, + "learning_rate": 0.00019956946707208305, + "loss": 2.8041, + "step": 688 + }, + { + "epoch": 0.055604874505689614, + "grad_norm": 0.9787055253982544, + "learning_rate": 0.00019956800248530107, + "loss": 2.8604, + "step": 689 + }, + { + "epoch": 0.05568557824227262, + "grad_norm": 0.8707631826400757, + "learning_rate": 0.00019956653541702415, + 
"loss": 2.7763, + "step": 690 + }, + { + "epoch": 0.05576628197885562, + "grad_norm": 1.0059715509414673, + "learning_rate": 0.00019956506586728896, + "loss": 2.8267, + "step": 691 + }, + { + "epoch": 0.05584698571543863, + "grad_norm": 0.88490891456604, + "learning_rate": 0.00019956359383613203, + "loss": 2.8278, + "step": 692 + }, + { + "epoch": 0.05592768945202163, + "grad_norm": 0.9527923464775085, + "learning_rate": 0.00019956211932359007, + "loss": 2.8251, + "step": 693 + }, + { + "epoch": 0.056008393188604635, + "grad_norm": 0.9612617492675781, + "learning_rate": 0.00019956064232969987, + "loss": 2.8148, + "step": 694 + }, + { + "epoch": 0.05608909692518763, + "grad_norm": 0.9261285066604614, + "learning_rate": 0.0001995591628544982, + "loss": 2.8176, + "step": 695 + }, + { + "epoch": 0.05616980066177064, + "grad_norm": 0.9766250252723694, + "learning_rate": 0.0001995576808980219, + "loss": 2.7968, + "step": 696 + }, + { + "epoch": 0.05625050439835364, + "grad_norm": 0.9287495017051697, + "learning_rate": 0.00019955619646030802, + "loss": 2.7679, + "step": 697 + }, + { + "epoch": 0.056331208134936646, + "grad_norm": 0.9182924032211304, + "learning_rate": 0.00019955470954139345, + "loss": 2.8295, + "step": 698 + }, + { + "epoch": 0.05641191187151965, + "grad_norm": 0.8650663495063782, + "learning_rate": 0.00019955322014131524, + "loss": 2.7928, + "step": 699 + }, + { + "epoch": 0.056492615608102655, + "grad_norm": 0.9543934464454651, + "learning_rate": 0.00019955172826011062, + "loss": 2.8049, + "step": 700 + }, + { + "epoch": 0.05657331934468566, + "grad_norm": 0.9060636162757874, + "learning_rate": 0.00019955023389781664, + "loss": 2.871, + "step": 701 + }, + { + "epoch": 0.05665402308126866, + "grad_norm": 0.9824137091636658, + "learning_rate": 0.00019954873705447065, + "loss": 2.816, + "step": 702 + }, + { + "epoch": 0.05673472681785167, + "grad_norm": 0.8831053972244263, + "learning_rate": 0.00019954723773010988, + "loss": 2.8207, + "step": 703 + }, + { 
+ "epoch": 0.05681543055443467, + "grad_norm": 0.9603390693664551, + "learning_rate": 0.00019954573592477173, + "loss": 2.831, + "step": 704 + }, + { + "epoch": 0.056896134291017676, + "grad_norm": 0.911556601524353, + "learning_rate": 0.00019954423163849364, + "loss": 2.7679, + "step": 705 + }, + { + "epoch": 0.05697683802760068, + "grad_norm": 0.8558745384216309, + "learning_rate": 0.00019954272487131305, + "loss": 2.7934, + "step": 706 + }, + { + "epoch": 0.057057541764183685, + "grad_norm": 1.0175282955169678, + "learning_rate": 0.00019954121562326758, + "loss": 2.905, + "step": 707 + }, + { + "epoch": 0.05713824550076668, + "grad_norm": 0.9480875730514526, + "learning_rate": 0.00019953970389439483, + "loss": 2.85, + "step": 708 + }, + { + "epoch": 0.05721894923734969, + "grad_norm": 0.9271003603935242, + "learning_rate": 0.0001995381896847324, + "loss": 2.8237, + "step": 709 + }, + { + "epoch": 0.05729965297393269, + "grad_norm": 0.8439653515815735, + "learning_rate": 0.00019953667299431815, + "loss": 2.821, + "step": 710 + }, + { + "epoch": 0.057380356710515695, + "grad_norm": 0.9750552177429199, + "learning_rate": 0.0001995351538231898, + "loss": 2.8613, + "step": 711 + }, + { + "epoch": 0.0574610604470987, + "grad_norm": 0.9409266710281372, + "learning_rate": 0.0001995336321713852, + "loss": 2.7876, + "step": 712 + }, + { + "epoch": 0.057541764183681704, + "grad_norm": 0.811138927936554, + "learning_rate": 0.00019953210803894233, + "loss": 2.7957, + "step": 713 + }, + { + "epoch": 0.05762246792026471, + "grad_norm": 0.9504825472831726, + "learning_rate": 0.00019953058142589916, + "loss": 2.8536, + "step": 714 + }, + { + "epoch": 0.05770317165684771, + "grad_norm": 0.8183554410934448, + "learning_rate": 0.00019952905233229368, + "loss": 2.7697, + "step": 715 + }, + { + "epoch": 0.05778387539343072, + "grad_norm": 1.1146113872528076, + "learning_rate": 0.0001995275207581641, + "loss": 2.8629, + "step": 716 + }, + { + "epoch": 0.05786457913001372, + 
"grad_norm": 0.8797986507415771, + "learning_rate": 0.00019952598670354852, + "loss": 2.7962, + "step": 717 + }, + { + "epoch": 0.057945282866596726, + "grad_norm": 0.8771101832389832, + "learning_rate": 0.00019952445016848517, + "loss": 2.8323, + "step": 718 + }, + { + "epoch": 0.05802598660317973, + "grad_norm": 0.9003355503082275, + "learning_rate": 0.00019952291115301235, + "loss": 2.777, + "step": 719 + }, + { + "epoch": 0.058106690339762734, + "grad_norm": 0.846125602722168, + "learning_rate": 0.00019952136965716846, + "loss": 2.7875, + "step": 720 + }, + { + "epoch": 0.05818739407634573, + "grad_norm": 0.908833920955658, + "learning_rate": 0.00019951982568099187, + "loss": 2.7975, + "step": 721 + }, + { + "epoch": 0.058268097812928736, + "grad_norm": 0.8616230487823486, + "learning_rate": 0.00019951827922452106, + "loss": 2.7486, + "step": 722 + }, + { + "epoch": 0.05834880154951174, + "grad_norm": 0.8791850805282593, + "learning_rate": 0.00019951673028779462, + "loss": 2.8301, + "step": 723 + }, + { + "epoch": 0.058429505286094745, + "grad_norm": 0.9437321424484253, + "learning_rate": 0.00019951517887085112, + "loss": 2.7956, + "step": 724 + }, + { + "epoch": 0.05851020902267775, + "grad_norm": 0.9263394474983215, + "learning_rate": 0.00019951362497372922, + "loss": 2.867, + "step": 725 + }, + { + "epoch": 0.05859091275926075, + "grad_norm": 0.9442462921142578, + "learning_rate": 0.00019951206859646764, + "loss": 2.8447, + "step": 726 + }, + { + "epoch": 0.05867161649584376, + "grad_norm": 0.9286711812019348, + "learning_rate": 0.0001995105097391052, + "loss": 2.7588, + "step": 727 + }, + { + "epoch": 0.05875232023242676, + "grad_norm": 0.9338774085044861, + "learning_rate": 0.00019950894840168072, + "loss": 2.7394, + "step": 728 + }, + { + "epoch": 0.058833023969009766, + "grad_norm": 0.8880760073661804, + "learning_rate": 0.00019950738458423314, + "loss": 2.7949, + "step": 729 + }, + { + "epoch": 0.05891372770559277, + "grad_norm": 1.0091183185577393, + 
"learning_rate": 0.00019950581828680143, + "loss": 2.8633, + "step": 730 + }, + { + "epoch": 0.058994431442175775, + "grad_norm": 0.8657729625701904, + "learning_rate": 0.0001995042495094246, + "loss": 2.8649, + "step": 731 + }, + { + "epoch": 0.05907513517875878, + "grad_norm": 1.0084047317504883, + "learning_rate": 0.00019950267825214176, + "loss": 2.8422, + "step": 732 + }, + { + "epoch": 0.059155838915341784, + "grad_norm": 0.9096506237983704, + "learning_rate": 0.00019950110451499208, + "loss": 2.7908, + "step": 733 + }, + { + "epoch": 0.05923654265192478, + "grad_norm": 1.1338937282562256, + "learning_rate": 0.0001994995282980148, + "loss": 2.8093, + "step": 734 + }, + { + "epoch": 0.059317246388507786, + "grad_norm": 0.8813811540603638, + "learning_rate": 0.00019949794960124915, + "loss": 2.8866, + "step": 735 + }, + { + "epoch": 0.05939795012509079, + "grad_norm": 0.8457592129707336, + "learning_rate": 0.00019949636842473453, + "loss": 2.7744, + "step": 736 + }, + { + "epoch": 0.059478653861673794, + "grad_norm": 0.8731856346130371, + "learning_rate": 0.0001994947847685103, + "loss": 2.7822, + "step": 737 + }, + { + "epoch": 0.0595593575982568, + "grad_norm": 0.8915185332298279, + "learning_rate": 0.00019949319863261597, + "loss": 2.773, + "step": 738 + }, + { + "epoch": 0.0596400613348398, + "grad_norm": 0.9478987455368042, + "learning_rate": 0.00019949161001709106, + "loss": 2.8462, + "step": 739 + }, + { + "epoch": 0.05972076507142281, + "grad_norm": 0.8903716206550598, + "learning_rate": 0.00019949001892197515, + "loss": 2.7741, + "step": 740 + }, + { + "epoch": 0.05980146880800581, + "grad_norm": 0.8870117664337158, + "learning_rate": 0.00019948842534730786, + "loss": 2.8255, + "step": 741 + }, + { + "epoch": 0.059882172544588816, + "grad_norm": 1.0766080617904663, + "learning_rate": 0.00019948682929312898, + "loss": 2.8865, + "step": 742 + }, + { + "epoch": 0.05996287628117182, + "grad_norm": 0.846447229385376, + "learning_rate": 
0.00019948523075947824, + "loss": 2.8441, + "step": 743 + }, + { + "epoch": 0.060043580017754825, + "grad_norm": 0.9847991466522217, + "learning_rate": 0.00019948362974639552, + "loss": 2.8099, + "step": 744 + }, + { + "epoch": 0.06012428375433783, + "grad_norm": 0.9170514941215515, + "learning_rate": 0.00019948202625392068, + "loss": 2.8797, + "step": 745 + }, + { + "epoch": 0.060204987490920826, + "grad_norm": 0.8564898371696472, + "learning_rate": 0.0001994804202820937, + "loss": 2.7993, + "step": 746 + }, + { + "epoch": 0.06028569122750383, + "grad_norm": 0.8527392148971558, + "learning_rate": 0.00019947881183095457, + "loss": 2.7816, + "step": 747 + }, + { + "epoch": 0.060366394964086835, + "grad_norm": 0.9170876145362854, + "learning_rate": 0.00019947720090054342, + "loss": 2.8031, + "step": 748 + }, + { + "epoch": 0.06044709870066984, + "grad_norm": 0.8891414403915405, + "learning_rate": 0.0001994755874909004, + "loss": 2.8072, + "step": 749 + }, + { + "epoch": 0.060527802437252844, + "grad_norm": 0.8853670358657837, + "learning_rate": 0.0001994739716020657, + "loss": 2.8857, + "step": 750 + }, + { + "epoch": 0.06060850617383585, + "grad_norm": 0.9011211395263672, + "learning_rate": 0.0001994723532340796, + "loss": 2.8519, + "step": 751 + }, + { + "epoch": 0.06068920991041885, + "grad_norm": 0.8843330144882202, + "learning_rate": 0.00019947073238698243, + "loss": 2.7882, + "step": 752 + }, + { + "epoch": 0.06076991364700186, + "grad_norm": 0.8712944984436035, + "learning_rate": 0.00019946910906081463, + "loss": 2.791, + "step": 753 + }, + { + "epoch": 0.06085061738358486, + "grad_norm": 0.8296090364456177, + "learning_rate": 0.00019946748325561656, + "loss": 2.8073, + "step": 754 + }, + { + "epoch": 0.060931321120167865, + "grad_norm": 0.9239117503166199, + "learning_rate": 0.00019946585497142885, + "loss": 2.8209, + "step": 755 + }, + { + "epoch": 0.06101202485675087, + "grad_norm": 0.8885170221328735, + "learning_rate": 0.000199464224208292, + "loss": 
2.8391, + "step": 756 + }, + { + "epoch": 0.061092728593333874, + "grad_norm": 0.933720588684082, + "learning_rate": 0.0001994625909662467, + "loss": 2.7635, + "step": 757 + }, + { + "epoch": 0.06117343232991688, + "grad_norm": 0.9751253724098206, + "learning_rate": 0.00019946095524533362, + "loss": 2.7933, + "step": 758 + }, + { + "epoch": 0.061254136066499876, + "grad_norm": 0.9469670057296753, + "learning_rate": 0.00019945931704559353, + "loss": 2.7652, + "step": 759 + }, + { + "epoch": 0.06133483980308288, + "grad_norm": 0.8559684157371521, + "learning_rate": 0.00019945767636706728, + "loss": 2.8258, + "step": 760 + }, + { + "epoch": 0.061415543539665884, + "grad_norm": 1.021478295326233, + "learning_rate": 0.00019945603320979574, + "loss": 2.8047, + "step": 761 + }, + { + "epoch": 0.06149624727624889, + "grad_norm": 0.8421681523323059, + "learning_rate": 0.00019945438757381986, + "loss": 2.8233, + "step": 762 + }, + { + "epoch": 0.06157695101283189, + "grad_norm": 0.900654137134552, + "learning_rate": 0.0001994527394591807, + "loss": 2.7591, + "step": 763 + }, + { + "epoch": 0.0616576547494149, + "grad_norm": 0.878300666809082, + "learning_rate": 0.0001994510888659193, + "loss": 2.715, + "step": 764 + }, + { + "epoch": 0.0617383584859979, + "grad_norm": 0.9170855283737183, + "learning_rate": 0.00019944943579407678, + "loss": 2.8604, + "step": 765 + }, + { + "epoch": 0.061819062222580906, + "grad_norm": 0.8532859683036804, + "learning_rate": 0.00019944778024369434, + "loss": 2.8124, + "step": 766 + }, + { + "epoch": 0.06189976595916391, + "grad_norm": 0.8549049496650696, + "learning_rate": 0.00019944612221481332, + "loss": 2.8066, + "step": 767 + }, + { + "epoch": 0.061980469695746915, + "grad_norm": 0.9602857828140259, + "learning_rate": 0.00019944446170747492, + "loss": 2.8424, + "step": 768 + }, + { + "epoch": 0.06206117343232992, + "grad_norm": 0.910953164100647, + "learning_rate": 0.0001994427987217206, + "loss": 2.8093, + "step": 769 + }, + { + "epoch": 
0.06214187716891292, + "grad_norm": 0.8536386489868164, + "learning_rate": 0.0001994411332575918, + "loss": 2.802, + "step": 770 + }, + { + "epoch": 0.06222258090549593, + "grad_norm": 0.9166232347488403, + "learning_rate": 0.00019943946531513, + "loss": 2.783, + "step": 771 + }, + { + "epoch": 0.062303284642078925, + "grad_norm": 0.9954056739807129, + "learning_rate": 0.00019943779489437678, + "loss": 2.8198, + "step": 772 + }, + { + "epoch": 0.06238398837866193, + "grad_norm": 0.8527171015739441, + "learning_rate": 0.0001994361219953738, + "loss": 2.8159, + "step": 773 + }, + { + "epoch": 0.062464692115244934, + "grad_norm": 0.8951592445373535, + "learning_rate": 0.00019943444661816274, + "loss": 2.7969, + "step": 774 + }, + { + "epoch": 0.06254539585182795, + "grad_norm": 0.9348207116127014, + "learning_rate": 0.00019943276876278532, + "loss": 2.8403, + "step": 775 + }, + { + "epoch": 0.06262609958841095, + "grad_norm": 0.866318941116333, + "learning_rate": 0.00019943108842928342, + "loss": 2.7886, + "step": 776 + }, + { + "epoch": 0.06270680332499395, + "grad_norm": 0.8571285605430603, + "learning_rate": 0.00019942940561769884, + "loss": 2.771, + "step": 777 + }, + { + "epoch": 0.06278750706157694, + "grad_norm": 0.8384295105934143, + "learning_rate": 0.00019942772032807357, + "loss": 2.7885, + "step": 778 + }, + { + "epoch": 0.06286821079815995, + "grad_norm": 0.9934808611869812, + "learning_rate": 0.00019942603256044961, + "loss": 2.8399, + "step": 779 + }, + { + "epoch": 0.06294891453474295, + "grad_norm": 0.8275915384292603, + "learning_rate": 0.00019942434231486902, + "loss": 2.8983, + "step": 780 + }, + { + "epoch": 0.06302961827132596, + "grad_norm": 0.9073596000671387, + "learning_rate": 0.0001994226495913739, + "loss": 2.7886, + "step": 781 + }, + { + "epoch": 0.06311032200790896, + "grad_norm": 0.9091461300849915, + "learning_rate": 0.00019942095439000646, + "loss": 2.814, + "step": 782 + }, + { + "epoch": 0.06319102574449197, + "grad_norm": 
0.9356934428215027, + "learning_rate": 0.000199419256710809, + "loss": 2.8238, + "step": 783 + }, + { + "epoch": 0.06327172948107497, + "grad_norm": 0.883514940738678, + "learning_rate": 0.00019941755655382374, + "loss": 2.7912, + "step": 784 + }, + { + "epoch": 0.06335243321765797, + "grad_norm": 0.8770506381988525, + "learning_rate": 0.00019941585391909308, + "loss": 2.7774, + "step": 785 + }, + { + "epoch": 0.06343313695424098, + "grad_norm": 0.8891726136207581, + "learning_rate": 0.00019941414880665948, + "loss": 2.7975, + "step": 786 + }, + { + "epoch": 0.06351384069082398, + "grad_norm": 0.9280585050582886, + "learning_rate": 0.00019941244121656545, + "loss": 2.9468, + "step": 787 + }, + { + "epoch": 0.06359454442740699, + "grad_norm": 0.8545510768890381, + "learning_rate": 0.00019941073114885347, + "loss": 2.8165, + "step": 788 + }, + { + "epoch": 0.06367524816398999, + "grad_norm": 0.8631312847137451, + "learning_rate": 0.0001994090186035662, + "loss": 2.7955, + "step": 789 + }, + { + "epoch": 0.063755951900573, + "grad_norm": 0.8883851170539856, + "learning_rate": 0.00019940730358074634, + "loss": 2.7828, + "step": 790 + }, + { + "epoch": 0.063836655637156, + "grad_norm": 0.8421074748039246, + "learning_rate": 0.00019940558608043664, + "loss": 2.7999, + "step": 791 + }, + { + "epoch": 0.063917359373739, + "grad_norm": 0.918134868144989, + "learning_rate": 0.0001994038661026799, + "loss": 2.7888, + "step": 792 + }, + { + "epoch": 0.06399806311032201, + "grad_norm": 0.8513637781143188, + "learning_rate": 0.00019940214364751896, + "loss": 2.7719, + "step": 793 + }, + { + "epoch": 0.06407876684690501, + "grad_norm": 0.9181898236274719, + "learning_rate": 0.00019940041871499675, + "loss": 2.8345, + "step": 794 + }, + { + "epoch": 0.06415947058348802, + "grad_norm": 0.8129134774208069, + "learning_rate": 0.00019939869130515626, + "loss": 2.7316, + "step": 795 + }, + { + "epoch": 0.06424017432007102, + "grad_norm": 0.8782191872596741, + "learning_rate": 
0.00019939696141804057, + "loss": 2.7852, + "step": 796 + }, + { + "epoch": 0.06432087805665403, + "grad_norm": 0.9064851403236389, + "learning_rate": 0.00019939522905369276, + "loss": 2.8105, + "step": 797 + }, + { + "epoch": 0.06440158179323703, + "grad_norm": 0.9888454675674438, + "learning_rate": 0.00019939349421215603, + "loss": 2.8496, + "step": 798 + }, + { + "epoch": 0.06448228552982004, + "grad_norm": 0.8717427253723145, + "learning_rate": 0.0001993917568934736, + "loss": 2.8227, + "step": 799 + }, + { + "epoch": 0.06456298926640304, + "grad_norm": 0.922980010509491, + "learning_rate": 0.0001993900170976888, + "loss": 2.8571, + "step": 800 + }, + { + "epoch": 0.06464369300298604, + "grad_norm": 0.8311850428581238, + "learning_rate": 0.00019938827482484492, + "loss": 2.7905, + "step": 801 + }, + { + "epoch": 0.06472439673956905, + "grad_norm": 0.9274900555610657, + "learning_rate": 0.0001993865300749855, + "loss": 2.8526, + "step": 802 + }, + { + "epoch": 0.06480510047615205, + "grad_norm": 0.9072165489196777, + "learning_rate": 0.00019938478284815388, + "loss": 2.8384, + "step": 803 + }, + { + "epoch": 0.06488580421273504, + "grad_norm": 0.854099452495575, + "learning_rate": 0.0001993830331443937, + "loss": 2.8459, + "step": 804 + }, + { + "epoch": 0.06496650794931805, + "grad_norm": 0.824126660823822, + "learning_rate": 0.00019938128096374854, + "loss": 2.7845, + "step": 805 + }, + { + "epoch": 0.06504721168590105, + "grad_norm": 0.8570442795753479, + "learning_rate": 0.0001993795263062621, + "loss": 2.8446, + "step": 806 + }, + { + "epoch": 0.06512791542248406, + "grad_norm": 0.8998628854751587, + "learning_rate": 0.00019937776917197805, + "loss": 2.8604, + "step": 807 + }, + { + "epoch": 0.06520861915906706, + "grad_norm": 0.9189189076423645, + "learning_rate": 0.00019937600956094023, + "loss": 2.7866, + "step": 808 + }, + { + "epoch": 0.06528932289565006, + "grad_norm": 0.9471604824066162, + "learning_rate": 0.00019937424747319248, + "loss": 2.7619, + 
"step": 809 + }, + { + "epoch": 0.06537002663223307, + "grad_norm": 0.8507755994796753, + "learning_rate": 0.00019937248290877874, + "loss": 2.8259, + "step": 810 + }, + { + "epoch": 0.06545073036881607, + "grad_norm": 0.8800963759422302, + "learning_rate": 0.00019937071586774292, + "loss": 2.827, + "step": 811 + }, + { + "epoch": 0.06553143410539908, + "grad_norm": 0.8851124048233032, + "learning_rate": 0.00019936894635012915, + "loss": 2.793, + "step": 812 + }, + { + "epoch": 0.06561213784198208, + "grad_norm": 0.88127601146698, + "learning_rate": 0.00019936717435598144, + "loss": 2.8885, + "step": 813 + }, + { + "epoch": 0.06569284157856509, + "grad_norm": 0.9115073084831238, + "learning_rate": 0.000199365399885344, + "loss": 2.8278, + "step": 814 + }, + { + "epoch": 0.06577354531514809, + "grad_norm": 0.8722662925720215, + "learning_rate": 0.00019936362293826107, + "loss": 2.8125, + "step": 815 + }, + { + "epoch": 0.0658542490517311, + "grad_norm": 0.8332365155220032, + "learning_rate": 0.0001993618435147769, + "loss": 2.7682, + "step": 816 + }, + { + "epoch": 0.0659349527883141, + "grad_norm": 0.9524003863334656, + "learning_rate": 0.0001993600616149359, + "loss": 2.8166, + "step": 817 + }, + { + "epoch": 0.0660156565248971, + "grad_norm": 0.8402767181396484, + "learning_rate": 0.0001993582772387824, + "loss": 2.8192, + "step": 818 + }, + { + "epoch": 0.06609636026148011, + "grad_norm": 0.8589913249015808, + "learning_rate": 0.0001993564903863609, + "loss": 2.7785, + "step": 819 + }, + { + "epoch": 0.06617706399806311, + "grad_norm": 1.034550428390503, + "learning_rate": 0.00019935470105771598, + "loss": 2.8407, + "step": 820 + }, + { + "epoch": 0.06625776773464612, + "grad_norm": 0.856490969657898, + "learning_rate": 0.0001993529092528921, + "loss": 2.794, + "step": 821 + }, + { + "epoch": 0.06633847147122912, + "grad_norm": 0.897498369216919, + "learning_rate": 0.0001993511149719341, + "loss": 2.7959, + "step": 822 + }, + { + "epoch": 0.06641917520781213, + 
"grad_norm": 0.8495277166366577, + "learning_rate": 0.00019934931821488658, + "loss": 2.783, + "step": 823 + }, + { + "epoch": 0.06649987894439513, + "grad_norm": 0.8362239599227905, + "learning_rate": 0.00019934751898179436, + "loss": 2.8628, + "step": 824 + }, + { + "epoch": 0.06658058268097813, + "grad_norm": 0.8702061176300049, + "learning_rate": 0.00019934571727270225, + "loss": 2.7878, + "step": 825 + }, + { + "epoch": 0.06666128641756114, + "grad_norm": 0.8341560363769531, + "learning_rate": 0.0001993439130876552, + "loss": 2.7345, + "step": 826 + }, + { + "epoch": 0.06674199015414414, + "grad_norm": 0.880181074142456, + "learning_rate": 0.00019934210642669813, + "loss": 2.7789, + "step": 827 + }, + { + "epoch": 0.06682269389072715, + "grad_norm": 0.9088126420974731, + "learning_rate": 0.00019934029728987607, + "loss": 2.7893, + "step": 828 + }, + { + "epoch": 0.06690339762731014, + "grad_norm": 0.8087106347084045, + "learning_rate": 0.00019933848567723416, + "loss": 2.7967, + "step": 829 + }, + { + "epoch": 0.06698410136389314, + "grad_norm": 0.8970876336097717, + "learning_rate": 0.00019933667158881745, + "loss": 2.8837, + "step": 830 + }, + { + "epoch": 0.06706480510047615, + "grad_norm": 0.9344804883003235, + "learning_rate": 0.00019933485502467128, + "loss": 2.7754, + "step": 831 + }, + { + "epoch": 0.06714550883705915, + "grad_norm": 0.8119301795959473, + "learning_rate": 0.00019933303598484084, + "loss": 2.7919, + "step": 832 + }, + { + "epoch": 0.06722621257364216, + "grad_norm": 0.9370681047439575, + "learning_rate": 0.00019933121446937148, + "loss": 2.8011, + "step": 833 + }, + { + "epoch": 0.06730691631022516, + "grad_norm": 0.8358973264694214, + "learning_rate": 0.00019932939047830858, + "loss": 2.8339, + "step": 834 + }, + { + "epoch": 0.06738762004680816, + "grad_norm": 0.8565972447395325, + "learning_rate": 0.00019932756401169765, + "loss": 2.8269, + "step": 835 + }, + { + "epoch": 0.06746832378339117, + "grad_norm": 0.8405514359474182, + 
"learning_rate": 0.00019932573506958417, + "loss": 2.7621, + "step": 836 + }, + { + "epoch": 0.06754902751997417, + "grad_norm": 0.8217617869377136, + "learning_rate": 0.00019932390365201373, + "loss": 2.8363, + "step": 837 + }, + { + "epoch": 0.06762973125655718, + "grad_norm": 0.9121438264846802, + "learning_rate": 0.00019932206975903198, + "loss": 2.8033, + "step": 838 + }, + { + "epoch": 0.06771043499314018, + "grad_norm": 0.9113054871559143, + "learning_rate": 0.00019932023339068464, + "loss": 2.8696, + "step": 839 + }, + { + "epoch": 0.06779113872972319, + "grad_norm": 0.8638293743133545, + "learning_rate": 0.00019931839454701743, + "loss": 2.8008, + "step": 840 + }, + { + "epoch": 0.06787184246630619, + "grad_norm": 0.862932562828064, + "learning_rate": 0.0001993165532280762, + "loss": 2.8092, + "step": 841 + }, + { + "epoch": 0.0679525462028892, + "grad_norm": 0.9089607000350952, + "learning_rate": 0.00019931470943390685, + "loss": 2.8921, + "step": 842 + }, + { + "epoch": 0.0680332499394722, + "grad_norm": 0.9233555793762207, + "learning_rate": 0.00019931286316455537, + "loss": 2.9025, + "step": 843 + }, + { + "epoch": 0.0681139536760552, + "grad_norm": 0.9403017163276672, + "learning_rate": 0.0001993110144200677, + "loss": 2.7875, + "step": 844 + }, + { + "epoch": 0.06819465741263821, + "grad_norm": 0.9194290637969971, + "learning_rate": 0.00019930916320048996, + "loss": 2.8254, + "step": 845 + }, + { + "epoch": 0.06827536114922121, + "grad_norm": 0.8238688111305237, + "learning_rate": 0.00019930730950586828, + "loss": 2.82, + "step": 846 + }, + { + "epoch": 0.06835606488580422, + "grad_norm": 0.8560660481452942, + "learning_rate": 0.00019930545333624885, + "loss": 2.8516, + "step": 847 + }, + { + "epoch": 0.06843676862238722, + "grad_norm": 0.9127222895622253, + "learning_rate": 0.0001993035946916779, + "loss": 2.7674, + "step": 848 + }, + { + "epoch": 0.06851747235897022, + "grad_norm": 0.8679420948028564, + "learning_rate": 0.00019930173357220182, + 
"loss": 2.777, + "step": 849 + }, + { + "epoch": 0.06859817609555323, + "grad_norm": 0.9686945676803589, + "learning_rate": 0.00019929986997786699, + "loss": 2.7841, + "step": 850 + }, + { + "epoch": 0.06867887983213623, + "grad_norm": 0.8366333246231079, + "learning_rate": 0.00019929800390871977, + "loss": 2.7993, + "step": 851 + }, + { + "epoch": 0.06875958356871924, + "grad_norm": 0.8374585509300232, + "learning_rate": 0.00019929613536480675, + "loss": 2.7545, + "step": 852 + }, + { + "epoch": 0.06884028730530224, + "grad_norm": 0.9843763709068298, + "learning_rate": 0.00019929426434617451, + "loss": 2.8118, + "step": 853 + }, + { + "epoch": 0.06892099104188525, + "grad_norm": 0.8093454241752625, + "learning_rate": 0.0001992923908528696, + "loss": 2.7301, + "step": 854 + }, + { + "epoch": 0.06900169477846824, + "grad_norm": 0.8374418020248413, + "learning_rate": 0.00019929051488493877, + "loss": 2.7745, + "step": 855 + }, + { + "epoch": 0.06908239851505124, + "grad_norm": 0.869965136051178, + "learning_rate": 0.00019928863644242875, + "loss": 2.7637, + "step": 856 + }, + { + "epoch": 0.06916310225163425, + "grad_norm": 0.9280590415000916, + "learning_rate": 0.00019928675552538638, + "loss": 2.7792, + "step": 857 + }, + { + "epoch": 0.06924380598821725, + "grad_norm": 0.8624193668365479, + "learning_rate": 0.00019928487213385852, + "loss": 2.7755, + "step": 858 + }, + { + "epoch": 0.06932450972480025, + "grad_norm": 0.8379972577095032, + "learning_rate": 0.00019928298626789212, + "loss": 2.8563, + "step": 859 + }, + { + "epoch": 0.06940521346138326, + "grad_norm": 0.9272914528846741, + "learning_rate": 0.00019928109792753418, + "loss": 2.836, + "step": 860 + }, + { + "epoch": 0.06948591719796626, + "grad_norm": 0.9239040613174438, + "learning_rate": 0.00019927920711283175, + "loss": 2.7999, + "step": 861 + }, + { + "epoch": 0.06956662093454927, + "grad_norm": 0.9125113487243652, + "learning_rate": 0.00019927731382383195, + "loss": 2.8494, + "step": 862 + }, + { + 
"epoch": 0.06964732467113227, + "grad_norm": 0.8782855868339539, + "learning_rate": 0.00019927541806058198, + "loss": 2.767, + "step": 863 + }, + { + "epoch": 0.06972802840771528, + "grad_norm": 0.8815447092056274, + "learning_rate": 0.00019927351982312907, + "loss": 2.7877, + "step": 864 + }, + { + "epoch": 0.06980873214429828, + "grad_norm": 0.8555476069450378, + "learning_rate": 0.00019927161911152056, + "loss": 2.8057, + "step": 865 + }, + { + "epoch": 0.06988943588088128, + "grad_norm": 0.8562924265861511, + "learning_rate": 0.00019926971592580382, + "loss": 2.8049, + "step": 866 + }, + { + "epoch": 0.06997013961746429, + "grad_norm": 0.846503734588623, + "learning_rate": 0.00019926781026602625, + "loss": 2.8545, + "step": 867 + }, + { + "epoch": 0.07005084335404729, + "grad_norm": 0.8439623713493347, + "learning_rate": 0.00019926590213223535, + "loss": 2.7451, + "step": 868 + }, + { + "epoch": 0.0701315470906303, + "grad_norm": 0.8471730351448059, + "learning_rate": 0.00019926399152447868, + "loss": 2.7879, + "step": 869 + }, + { + "epoch": 0.0702122508272133, + "grad_norm": 0.8721400499343872, + "learning_rate": 0.00019926207844280387, + "loss": 2.8594, + "step": 870 + }, + { + "epoch": 0.0702929545637963, + "grad_norm": 0.8110925555229187, + "learning_rate": 0.0001992601628872586, + "loss": 2.7789, + "step": 871 + }, + { + "epoch": 0.07037365830037931, + "grad_norm": 0.9593119025230408, + "learning_rate": 0.0001992582448578906, + "loss": 2.8792, + "step": 872 + }, + { + "epoch": 0.07045436203696231, + "grad_norm": 0.8553354144096375, + "learning_rate": 0.00019925632435474765, + "loss": 2.8056, + "step": 873 + }, + { + "epoch": 0.07053506577354532, + "grad_norm": 0.8062612414360046, + "learning_rate": 0.00019925440137787768, + "loss": 2.7762, + "step": 874 + }, + { + "epoch": 0.07061576951012832, + "grad_norm": 0.8264921307563782, + "learning_rate": 0.00019925247592732858, + "loss": 2.8435, + "step": 875 + }, + { + "epoch": 0.07069647324671133, + 
"grad_norm": 0.7770401835441589, + "learning_rate": 0.00019925054800314828, + "loss": 2.7846, + "step": 876 + }, + { + "epoch": 0.07077717698329433, + "grad_norm": 0.8426765203475952, + "learning_rate": 0.0001992486176053849, + "loss": 2.782, + "step": 877 + }, + { + "epoch": 0.07085788071987734, + "grad_norm": 0.855330228805542, + "learning_rate": 0.00019924668473408655, + "loss": 2.8051, + "step": 878 + }, + { + "epoch": 0.07093858445646034, + "grad_norm": 0.8762049674987793, + "learning_rate": 0.00019924474938930135, + "loss": 2.7634, + "step": 879 + }, + { + "epoch": 0.07101928819304333, + "grad_norm": 0.9226812124252319, + "learning_rate": 0.0001992428115710776, + "loss": 2.8342, + "step": 880 + }, + { + "epoch": 0.07109999192962634, + "grad_norm": 0.9031660556793213, + "learning_rate": 0.00019924087127946353, + "loss": 2.7953, + "step": 881 + }, + { + "epoch": 0.07118069566620934, + "grad_norm": 1.0151792764663696, + "learning_rate": 0.00019923892851450757, + "loss": 2.8225, + "step": 882 + }, + { + "epoch": 0.07126139940279234, + "grad_norm": 0.9805678725242615, + "learning_rate": 0.00019923698327625806, + "loss": 2.7727, + "step": 883 + }, + { + "epoch": 0.07134210313937535, + "grad_norm": 0.8831729888916016, + "learning_rate": 0.00019923503556476356, + "loss": 2.7682, + "step": 884 + }, + { + "epoch": 0.07142280687595835, + "grad_norm": 1.0311404466629028, + "learning_rate": 0.00019923308538007253, + "loss": 2.8422, + "step": 885 + }, + { + "epoch": 0.07150351061254136, + "grad_norm": 0.8143388628959656, + "learning_rate": 0.0001992311327222336, + "loss": 2.7876, + "step": 886 + }, + { + "epoch": 0.07158421434912436, + "grad_norm": 0.877017617225647, + "learning_rate": 0.00019922917759129552, + "loss": 2.7486, + "step": 887 + }, + { + "epoch": 0.07166491808570737, + "grad_norm": 0.930646538734436, + "learning_rate": 0.0001992272199873069, + "loss": 2.8022, + "step": 888 + }, + { + "epoch": 0.07174562182229037, + "grad_norm": 0.934753954410553, + 
"learning_rate": 0.00019922525991031655, + "loss": 2.8485, + "step": 889 + }, + { + "epoch": 0.07182632555887337, + "grad_norm": 0.9564220905303955, + "learning_rate": 0.00019922329736037339, + "loss": 2.761, + "step": 890 + }, + { + "epoch": 0.07190702929545638, + "grad_norm": 0.9457311630249023, + "learning_rate": 0.00019922133233752626, + "loss": 2.8279, + "step": 891 + }, + { + "epoch": 0.07198773303203938, + "grad_norm": 0.9385658502578735, + "learning_rate": 0.0001992193648418242, + "loss": 2.8222, + "step": 892 + }, + { + "epoch": 0.07206843676862239, + "grad_norm": 1.0157524347305298, + "learning_rate": 0.00019921739487331616, + "loss": 2.9166, + "step": 893 + }, + { + "epoch": 0.07214914050520539, + "grad_norm": 0.9143860340118408, + "learning_rate": 0.00019921542243205132, + "loss": 2.8139, + "step": 894 + }, + { + "epoch": 0.0722298442417884, + "grad_norm": 0.8769320249557495, + "learning_rate": 0.00019921344751807878, + "loss": 2.8023, + "step": 895 + }, + { + "epoch": 0.0723105479783714, + "grad_norm": 0.9647517204284668, + "learning_rate": 0.0001992114701314478, + "loss": 2.8872, + "step": 896 + }, + { + "epoch": 0.0723912517149544, + "grad_norm": 1.025978446006775, + "learning_rate": 0.00019920949027220762, + "loss": 2.837, + "step": 897 + }, + { + "epoch": 0.07247195545153741, + "grad_norm": 0.8848521113395691, + "learning_rate": 0.0001992075079404076, + "loss": 2.7498, + "step": 898 + }, + { + "epoch": 0.07255265918812041, + "grad_norm": 0.9395595788955688, + "learning_rate": 0.0001992055231360972, + "loss": 2.8752, + "step": 899 + }, + { + "epoch": 0.07263336292470342, + "grad_norm": 0.8711572885513306, + "learning_rate": 0.00019920353585932578, + "loss": 2.8608, + "step": 900 + }, + { + "epoch": 0.07271406666128642, + "grad_norm": 0.8606846332550049, + "learning_rate": 0.00019920154611014295, + "loss": 2.829, + "step": 901 + }, + { + "epoch": 0.07279477039786943, + "grad_norm": 0.859354555606842, + "learning_rate": 0.0001991995538885983, + 
"loss": 2.8102, + "step": 902 + }, + { + "epoch": 0.07287547413445243, + "grad_norm": 0.9063243865966797, + "learning_rate": 0.00019919755919474143, + "loss": 2.8509, + "step": 903 + }, + { + "epoch": 0.07295617787103544, + "grad_norm": 0.8321940898895264, + "learning_rate": 0.00019919556202862207, + "loss": 2.796, + "step": 904 + }, + { + "epoch": 0.07303688160761844, + "grad_norm": 0.8875191807746887, + "learning_rate": 0.00019919356239029003, + "loss": 2.8672, + "step": 905 + }, + { + "epoch": 0.07311758534420143, + "grad_norm": 0.9028071165084839, + "learning_rate": 0.0001991915602797951, + "loss": 2.8926, + "step": 906 + }, + { + "epoch": 0.07319828908078443, + "grad_norm": 0.9449291825294495, + "learning_rate": 0.0001991895556971872, + "loss": 2.8159, + "step": 907 + }, + { + "epoch": 0.07327899281736744, + "grad_norm": 0.871576189994812, + "learning_rate": 0.0001991875486425163, + "loss": 2.8162, + "step": 908 + }, + { + "epoch": 0.07335969655395044, + "grad_norm": 0.818423330783844, + "learning_rate": 0.0001991855391158324, + "loss": 2.8882, + "step": 909 + }, + { + "epoch": 0.07344040029053345, + "grad_norm": 0.8802343606948853, + "learning_rate": 0.0001991835271171856, + "loss": 2.8245, + "step": 910 + }, + { + "epoch": 0.07352110402711645, + "grad_norm": 0.916023313999176, + "learning_rate": 0.000199181512646626, + "loss": 2.8966, + "step": 911 + }, + { + "epoch": 0.07360180776369946, + "grad_norm": 1.0663317441940308, + "learning_rate": 0.0001991794957042039, + "loss": 2.7736, + "step": 912 + }, + { + "epoch": 0.07368251150028246, + "grad_norm": 0.9212445616722107, + "learning_rate": 0.00019917747628996947, + "loss": 2.7924, + "step": 913 + }, + { + "epoch": 0.07376321523686546, + "grad_norm": 0.9785256385803223, + "learning_rate": 0.00019917545440397308, + "loss": 2.8021, + "step": 914 + }, + { + "epoch": 0.07384391897344847, + "grad_norm": 0.8510444760322571, + "learning_rate": 0.00019917343004626514, + "loss": 2.7991, + "step": 915 + }, + { + 
"epoch": 0.07392462271003147, + "grad_norm": 0.8967106342315674, + "learning_rate": 0.0001991714032168961, + "loss": 2.8838, + "step": 916 + }, + { + "epoch": 0.07400532644661448, + "grad_norm": 0.8940563797950745, + "learning_rate": 0.0001991693739159164, + "loss": 2.8124, + "step": 917 + }, + { + "epoch": 0.07408603018319748, + "grad_norm": 0.9270479679107666, + "learning_rate": 0.0001991673421433767, + "loss": 2.7627, + "step": 918 + }, + { + "epoch": 0.07416673391978049, + "grad_norm": 0.905805230140686, + "learning_rate": 0.0001991653078993276, + "loss": 2.781, + "step": 919 + }, + { + "epoch": 0.07424743765636349, + "grad_norm": 0.9295129179954529, + "learning_rate": 0.00019916327118381982, + "loss": 2.8332, + "step": 920 + }, + { + "epoch": 0.0743281413929465, + "grad_norm": 0.863331139087677, + "learning_rate": 0.00019916123199690408, + "loss": 2.8489, + "step": 921 + }, + { + "epoch": 0.0744088451295295, + "grad_norm": 0.9966896772384644, + "learning_rate": 0.00019915919033863127, + "loss": 2.9107, + "step": 922 + }, + { + "epoch": 0.0744895488661125, + "grad_norm": 0.8921390771865845, + "learning_rate": 0.00019915714620905218, + "loss": 2.7668, + "step": 923 + }, + { + "epoch": 0.07457025260269551, + "grad_norm": 0.9378434419631958, + "learning_rate": 0.00019915509960821782, + "loss": 2.8305, + "step": 924 + }, + { + "epoch": 0.07465095633927851, + "grad_norm": 1.0351817607879639, + "learning_rate": 0.0001991530505361792, + "loss": 2.9412, + "step": 925 + }, + { + "epoch": 0.07473166007586152, + "grad_norm": 0.7995476722717285, + "learning_rate": 0.0001991509989929874, + "loss": 2.7872, + "step": 926 + }, + { + "epoch": 0.07481236381244452, + "grad_norm": 0.858830988407135, + "learning_rate": 0.0001991489449786935, + "loss": 2.7775, + "step": 927 + }, + { + "epoch": 0.07489306754902753, + "grad_norm": 1.1254682540893555, + "learning_rate": 0.00019914688849334867, + "loss": 2.7913, + "step": 928 + }, + { + "epoch": 0.07497377128561053, + "grad_norm": 
0.9475330710411072, + "learning_rate": 0.00019914482953700428, + "loss": 2.7945, + "step": 929 + }, + { + "epoch": 0.07505447502219353, + "grad_norm": 0.8427290916442871, + "learning_rate": 0.00019914276810971152, + "loss": 2.8297, + "step": 930 + }, + { + "epoch": 0.07513517875877652, + "grad_norm": 0.9308956265449524, + "learning_rate": 0.00019914070421152183, + "loss": 2.8534, + "step": 931 + }, + { + "epoch": 0.07521588249535953, + "grad_norm": 0.9264787435531616, + "learning_rate": 0.00019913863784248664, + "loss": 2.7959, + "step": 932 + }, + { + "epoch": 0.07529658623194253, + "grad_norm": 0.8432087302207947, + "learning_rate": 0.00019913656900265742, + "loss": 2.8479, + "step": 933 + }, + { + "epoch": 0.07537728996852554, + "grad_norm": 0.8237274885177612, + "learning_rate": 0.0001991344976920858, + "loss": 2.782, + "step": 934 + }, + { + "epoch": 0.07545799370510854, + "grad_norm": 0.8143243789672852, + "learning_rate": 0.0001991324239108233, + "loss": 2.7567, + "step": 935 + }, + { + "epoch": 0.07553869744169155, + "grad_norm": 0.8824434280395508, + "learning_rate": 0.0001991303476589217, + "loss": 2.7971, + "step": 936 + }, + { + "epoch": 0.07561940117827455, + "grad_norm": 0.8202407360076904, + "learning_rate": 0.00019912826893643272, + "loss": 2.7825, + "step": 937 + }, + { + "epoch": 0.07570010491485755, + "grad_norm": 0.8001337647438049, + "learning_rate": 0.00019912618774340813, + "loss": 2.8294, + "step": 938 + }, + { + "epoch": 0.07578080865144056, + "grad_norm": 0.8875572085380554, + "learning_rate": 0.00019912410407989982, + "loss": 2.8013, + "step": 939 + }, + { + "epoch": 0.07586151238802356, + "grad_norm": 0.8676280379295349, + "learning_rate": 0.0001991220179459597, + "loss": 2.767, + "step": 940 + }, + { + "epoch": 0.07594221612460657, + "grad_norm": 0.9767136573791504, + "learning_rate": 0.00019911992934163982, + "loss": 2.8315, + "step": 941 + }, + { + "epoch": 0.07602291986118957, + "grad_norm": 0.8690733909606934, + "learning_rate": 
0.0001991178382669922, + "loss": 2.8042, + "step": 942 + }, + { + "epoch": 0.07610362359777258, + "grad_norm": 0.862978458404541, + "learning_rate": 0.00019911574472206893, + "loss": 2.8243, + "step": 943 + }, + { + "epoch": 0.07618432733435558, + "grad_norm": 0.9116127490997314, + "learning_rate": 0.00019911364870692225, + "loss": 2.7377, + "step": 944 + }, + { + "epoch": 0.07626503107093859, + "grad_norm": 0.8765420317649841, + "learning_rate": 0.00019911155022160433, + "loss": 2.7673, + "step": 945 + }, + { + "epoch": 0.07634573480752159, + "grad_norm": 0.8229342699050903, + "learning_rate": 0.0001991094492661675, + "loss": 2.7749, + "step": 946 + }, + { + "epoch": 0.0764264385441046, + "grad_norm": 0.8340098261833191, + "learning_rate": 0.00019910734584066412, + "loss": 2.7871, + "step": 947 + }, + { + "epoch": 0.0765071422806876, + "grad_norm": 0.8116940259933472, + "learning_rate": 0.0001991052399451466, + "loss": 2.8202, + "step": 948 + }, + { + "epoch": 0.0765878460172706, + "grad_norm": 0.8730412721633911, + "learning_rate": 0.00019910313157966747, + "loss": 2.8661, + "step": 949 + }, + { + "epoch": 0.07666854975385361, + "grad_norm": 0.8272213339805603, + "learning_rate": 0.0001991010207442792, + "loss": 2.8352, + "step": 950 + }, + { + "epoch": 0.07674925349043661, + "grad_norm": 0.8586944937705994, + "learning_rate": 0.0001990989074390345, + "loss": 2.8018, + "step": 951 + }, + { + "epoch": 0.07682995722701962, + "grad_norm": 0.81830894947052, + "learning_rate": 0.00019909679166398592, + "loss": 2.8154, + "step": 952 + }, + { + "epoch": 0.07691066096360262, + "grad_norm": 0.8158484101295471, + "learning_rate": 0.00019909467341918627, + "loss": 2.7618, + "step": 953 + }, + { + "epoch": 0.07699136470018562, + "grad_norm": 0.816834032535553, + "learning_rate": 0.00019909255270468833, + "loss": 2.8125, + "step": 954 + }, + { + "epoch": 0.07707206843676863, + "grad_norm": 0.944790780544281, + "learning_rate": 0.00019909042952054496, + "loss": 2.8054, + 
"step": 955 + }, + { + "epoch": 0.07715277217335163, + "grad_norm": 0.9281302690505981, + "learning_rate": 0.00019908830386680904, + "loss": 2.8724, + "step": 956 + }, + { + "epoch": 0.07723347590993462, + "grad_norm": 0.8850300908088684, + "learning_rate": 0.00019908617574353356, + "loss": 2.7906, + "step": 957 + }, + { + "epoch": 0.07731417964651763, + "grad_norm": 0.8997938632965088, + "learning_rate": 0.00019908404515077158, + "loss": 2.7814, + "step": 958 + }, + { + "epoch": 0.07739488338310063, + "grad_norm": 0.8814194798469543, + "learning_rate": 0.0001990819120885762, + "loss": 2.7423, + "step": 959 + }, + { + "epoch": 0.07747558711968364, + "grad_norm": 0.8759928345680237, + "learning_rate": 0.00019907977655700054, + "loss": 2.7803, + "step": 960 + }, + { + "epoch": 0.07755629085626664, + "grad_norm": 0.8439476490020752, + "learning_rate": 0.00019907763855609787, + "loss": 2.8277, + "step": 961 + }, + { + "epoch": 0.07763699459284965, + "grad_norm": 0.8745121955871582, + "learning_rate": 0.00019907549808592144, + "loss": 2.8152, + "step": 962 + }, + { + "epoch": 0.07771769832943265, + "grad_norm": 1.0439598560333252, + "learning_rate": 0.00019907335514652465, + "loss": 2.7882, + "step": 963 + }, + { + "epoch": 0.07779840206601565, + "grad_norm": 0.9516503810882568, + "learning_rate": 0.00019907120973796082, + "loss": 2.8555, + "step": 964 + }, + { + "epoch": 0.07787910580259866, + "grad_norm": 0.928717315196991, + "learning_rate": 0.0001990690618602835, + "loss": 2.8214, + "step": 965 + }, + { + "epoch": 0.07795980953918166, + "grad_norm": 0.7923071384429932, + "learning_rate": 0.00019906691151354617, + "loss": 2.8153, + "step": 966 + }, + { + "epoch": 0.07804051327576467, + "grad_norm": 0.8783324956893921, + "learning_rate": 0.00019906475869780246, + "loss": 2.7691, + "step": 967 + }, + { + "epoch": 0.07812121701234767, + "grad_norm": 0.8974801301956177, + "learning_rate": 0.000199062603413106, + "loss": 2.8156, + "step": 968 + }, + { + "epoch": 
0.07820192074893068, + "grad_norm": 0.9304391741752625, + "learning_rate": 0.00019906044565951052, + "loss": 2.8489, + "step": 969 + }, + { + "epoch": 0.07828262448551368, + "grad_norm": 0.8351098895072937, + "learning_rate": 0.00019905828543706976, + "loss": 2.7744, + "step": 970 + }, + { + "epoch": 0.07836332822209668, + "grad_norm": 0.8634265065193176, + "learning_rate": 0.0001990561227458376, + "loss": 2.8193, + "step": 971 + }, + { + "epoch": 0.07844403195867969, + "grad_norm": 0.8969653248786926, + "learning_rate": 0.00019905395758586792, + "loss": 2.7548, + "step": 972 + }, + { + "epoch": 0.07852473569526269, + "grad_norm": 0.8964852094650269, + "learning_rate": 0.0001990517899572147, + "loss": 2.8037, + "step": 973 + }, + { + "epoch": 0.0786054394318457, + "grad_norm": 0.8567596077919006, + "learning_rate": 0.00019904961985993196, + "loss": 2.7942, + "step": 974 + }, + { + "epoch": 0.0786861431684287, + "grad_norm": 0.8275273442268372, + "learning_rate": 0.00019904744729407374, + "loss": 2.8359, + "step": 975 + }, + { + "epoch": 0.0787668469050117, + "grad_norm": 0.9458810091018677, + "learning_rate": 0.00019904527225969424, + "loss": 2.8354, + "step": 976 + }, + { + "epoch": 0.07884755064159471, + "grad_norm": 0.8690593838691711, + "learning_rate": 0.00019904309475684767, + "loss": 2.7894, + "step": 977 + }, + { + "epoch": 0.07892825437817771, + "grad_norm": 0.810279130935669, + "learning_rate": 0.00019904091478558823, + "loss": 2.7939, + "step": 978 + }, + { + "epoch": 0.07900895811476072, + "grad_norm": 0.8779012560844421, + "learning_rate": 0.0001990387323459703, + "loss": 2.7551, + "step": 979 + }, + { + "epoch": 0.07908966185134372, + "grad_norm": 0.7936381101608276, + "learning_rate": 0.00019903654743804833, + "loss": 2.814, + "step": 980 + }, + { + "epoch": 0.07917036558792673, + "grad_norm": 0.9567989110946655, + "learning_rate": 0.00019903436006187667, + "loss": 2.7715, + "step": 981 + }, + { + "epoch": 0.07925106932450972, + "grad_norm": 
0.9250255823135376, + "learning_rate": 0.00019903217021750987, + "loss": 2.8967, + "step": 982 + }, + { + "epoch": 0.07933177306109272, + "grad_norm": 0.8342804312705994, + "learning_rate": 0.00019902997790500256, + "loss": 2.7728, + "step": 983 + }, + { + "epoch": 0.07941247679767573, + "grad_norm": 0.8321473598480225, + "learning_rate": 0.00019902778312440932, + "loss": 2.8479, + "step": 984 + }, + { + "epoch": 0.07949318053425873, + "grad_norm": 0.894727885723114, + "learning_rate": 0.00019902558587578484, + "loss": 2.8211, + "step": 985 + }, + { + "epoch": 0.07957388427084174, + "grad_norm": 0.8093457221984863, + "learning_rate": 0.0001990233861591839, + "loss": 2.7481, + "step": 986 + }, + { + "epoch": 0.07965458800742474, + "grad_norm": 0.8626284599304199, + "learning_rate": 0.00019902118397466132, + "loss": 2.8368, + "step": 987 + }, + { + "epoch": 0.07973529174400774, + "grad_norm": 0.799648642539978, + "learning_rate": 0.00019901897932227204, + "loss": 2.8713, + "step": 988 + }, + { + "epoch": 0.07981599548059075, + "grad_norm": 0.9658265709877014, + "learning_rate": 0.00019901677220207092, + "loss": 2.7284, + "step": 989 + }, + { + "epoch": 0.07989669921717375, + "grad_norm": 0.877299427986145, + "learning_rate": 0.00019901456261411303, + "loss": 2.7916, + "step": 990 + }, + { + "epoch": 0.07997740295375676, + "grad_norm": 0.926450252532959, + "learning_rate": 0.00019901235055845337, + "loss": 2.8207, + "step": 991 + }, + { + "epoch": 0.08005810669033976, + "grad_norm": 0.8858455419540405, + "learning_rate": 0.00019901013603514716, + "loss": 2.795, + "step": 992 + }, + { + "epoch": 0.08013881042692277, + "grad_norm": 0.8619922995567322, + "learning_rate": 0.0001990079190442495, + "loss": 2.8163, + "step": 993 + }, + { + "epoch": 0.08021951416350577, + "grad_norm": 0.859200656414032, + "learning_rate": 0.00019900569958581572, + "loss": 2.7715, + "step": 994 + }, + { + "epoch": 0.08030021790008877, + "grad_norm": 0.8346282839775085, + "learning_rate": 
0.0001990034776599011, + "loss": 2.8312, + "step": 995 + }, + { + "epoch": 0.08038092163667178, + "grad_norm": 0.9188725352287292, + "learning_rate": 0.00019900125326656102, + "loss": 2.799, + "step": 996 + }, + { + "epoch": 0.08046162537325478, + "grad_norm": 0.8548648953437805, + "learning_rate": 0.00019899902640585092, + "loss": 2.7778, + "step": 997 + }, + { + "epoch": 0.08054232910983779, + "grad_norm": 0.8883183002471924, + "learning_rate": 0.00019899679707782624, + "loss": 2.809, + "step": 998 + }, + { + "epoch": 0.08062303284642079, + "grad_norm": 0.8915852308273315, + "learning_rate": 0.00019899456528254267, + "loss": 2.8309, + "step": 999 + }, + { + "epoch": 0.0807037365830038, + "grad_norm": 0.8092094659805298, + "learning_rate": 0.00019899233102005573, + "loss": 2.7753, + "step": 1000 + }, + { + "epoch": 0.0807037365830038, + "eval_loss": 2.7104671001434326, + "eval_runtime": 773.7354, + "eval_samples_per_second": 3.386, + "eval_steps_per_second": 0.565, + "step": 1000 + }, + { + "epoch": 0.0807844403195868, + "grad_norm": 0.8744900226593018, + "learning_rate": 0.00019899009429042114, + "loss": 2.7948, + "step": 1001 + }, + { + "epoch": 0.0808651440561698, + "grad_norm": 0.8749974370002747, + "learning_rate": 0.0001989878550936946, + "loss": 2.7609, + "step": 1002 + }, + { + "epoch": 0.08094584779275281, + "grad_norm": 0.8622820377349854, + "learning_rate": 0.000198985613429932, + "loss": 2.8023, + "step": 1003 + }, + { + "epoch": 0.08102655152933581, + "grad_norm": 0.9404367208480835, + "learning_rate": 0.00019898336929918915, + "loss": 2.7992, + "step": 1004 + }, + { + "epoch": 0.08110725526591882, + "grad_norm": 0.8846708536148071, + "learning_rate": 0.000198981122701522, + "loss": 2.8084, + "step": 1005 + }, + { + "epoch": 0.08118795900250182, + "grad_norm": 0.8105908036231995, + "learning_rate": 0.0001989788736369865, + "loss": 2.8504, + "step": 1006 + }, + { + "epoch": 0.08126866273908483, + "grad_norm": 1.0107187032699585, + "learning_rate": 
0.0001989766221056388, + "loss": 2.7935, + "step": 1007 + }, + { + "epoch": 0.08134936647566782, + "grad_norm": 0.7825451493263245, + "learning_rate": 0.0001989743681075349, + "loss": 2.8024, + "step": 1008 + }, + { + "epoch": 0.08143007021225082, + "grad_norm": 0.8478613495826721, + "learning_rate": 0.000198972111642731, + "loss": 2.8645, + "step": 1009 + }, + { + "epoch": 0.08151077394883383, + "grad_norm": 0.8432144522666931, + "learning_rate": 0.0001989698527112834, + "loss": 2.8469, + "step": 1010 + }, + { + "epoch": 0.08159147768541683, + "grad_norm": 0.8147936463356018, + "learning_rate": 0.00019896759131324835, + "loss": 2.7799, + "step": 1011 + }, + { + "epoch": 0.08167218142199983, + "grad_norm": 0.8446993827819824, + "learning_rate": 0.00019896532744868224, + "loss": 2.7685, + "step": 1012 + }, + { + "epoch": 0.08175288515858284, + "grad_norm": 0.7635807394981384, + "learning_rate": 0.00019896306111764146, + "loss": 2.7823, + "step": 1013 + }, + { + "epoch": 0.08183358889516584, + "grad_norm": 0.8272855877876282, + "learning_rate": 0.00019896079232018253, + "loss": 2.7877, + "step": 1014 + }, + { + "epoch": 0.08191429263174885, + "grad_norm": 0.8079700469970703, + "learning_rate": 0.00019895852105636193, + "loss": 2.7849, + "step": 1015 + }, + { + "epoch": 0.08199499636833185, + "grad_norm": 0.8518063426017761, + "learning_rate": 0.0001989562473262363, + "loss": 2.8622, + "step": 1016 + }, + { + "epoch": 0.08207570010491486, + "grad_norm": 0.8646622896194458, + "learning_rate": 0.00019895397112986235, + "loss": 2.8224, + "step": 1017 + }, + { + "epoch": 0.08215640384149786, + "grad_norm": 0.8764398097991943, + "learning_rate": 0.00019895169246729672, + "loss": 2.938, + "step": 1018 + }, + { + "epoch": 0.08223710757808086, + "grad_norm": 0.8304057717323303, + "learning_rate": 0.0001989494113385963, + "loss": 2.7586, + "step": 1019 + }, + { + "epoch": 0.08231781131466387, + "grad_norm": 0.8569272756576538, + "learning_rate": 0.00019894712774381787, + 
"loss": 2.7803, + "step": 1020 + }, + { + "epoch": 0.08239851505124687, + "grad_norm": 0.8788578510284424, + "learning_rate": 0.00019894484168301836, + "loss": 2.8138, + "step": 1021 + }, + { + "epoch": 0.08247921878782988, + "grad_norm": 0.9113569855690002, + "learning_rate": 0.0001989425531562548, + "loss": 2.8023, + "step": 1022 + }, + { + "epoch": 0.08255992252441288, + "grad_norm": 0.8630590438842773, + "learning_rate": 0.00019894026216358413, + "loss": 2.791, + "step": 1023 + }, + { + "epoch": 0.08264062626099589, + "grad_norm": 0.8691157698631287, + "learning_rate": 0.00019893796870506348, + "loss": 2.811, + "step": 1024 + }, + { + "epoch": 0.08272132999757889, + "grad_norm": 0.9078284502029419, + "learning_rate": 0.00019893567278075007, + "loss": 2.8282, + "step": 1025 + }, + { + "epoch": 0.0828020337341619, + "grad_norm": 0.867511510848999, + "learning_rate": 0.00019893337439070105, + "loss": 2.7862, + "step": 1026 + }, + { + "epoch": 0.0828827374707449, + "grad_norm": 0.8016698360443115, + "learning_rate": 0.00019893107353497372, + "loss": 2.8083, + "step": 1027 + }, + { + "epoch": 0.0829634412073279, + "grad_norm": 0.8583545684814453, + "learning_rate": 0.00019892877021362543, + "loss": 2.8041, + "step": 1028 + }, + { + "epoch": 0.08304414494391091, + "grad_norm": 0.8302493691444397, + "learning_rate": 0.0001989264644267136, + "loss": 2.7866, + "step": 1029 + }, + { + "epoch": 0.08312484868049391, + "grad_norm": 0.9628411531448364, + "learning_rate": 0.00019892415617429567, + "loss": 2.8187, + "step": 1030 + }, + { + "epoch": 0.08320555241707692, + "grad_norm": 0.874840259552002, + "learning_rate": 0.0001989218454564292, + "loss": 2.7475, + "step": 1031 + }, + { + "epoch": 0.08328625615365992, + "grad_norm": 0.8641294836997986, + "learning_rate": 0.0001989195322731717, + "loss": 2.7795, + "step": 1032 + }, + { + "epoch": 0.08336695989024291, + "grad_norm": 0.8219757080078125, + "learning_rate": 0.0001989172166245809, + "loss": 2.7683, + "step": 1033 + }, 
+ { + "epoch": 0.08344766362682592, + "grad_norm": 0.7905694246292114, + "learning_rate": 0.00019891489851071455, + "loss": 2.7668, + "step": 1034 + }, + { + "epoch": 0.08352836736340892, + "grad_norm": 0.8180816173553467, + "learning_rate": 0.0001989125779316303, + "loss": 2.7661, + "step": 1035 + }, + { + "epoch": 0.08360907109999192, + "grad_norm": 0.8337293267250061, + "learning_rate": 0.00019891025488738605, + "loss": 2.7823, + "step": 1036 + }, + { + "epoch": 0.08368977483657493, + "grad_norm": 0.9673140048980713, + "learning_rate": 0.00019890792937803973, + "loss": 2.8164, + "step": 1037 + }, + { + "epoch": 0.08377047857315793, + "grad_norm": 0.8810501098632812, + "learning_rate": 0.00019890560140364922, + "loss": 2.7904, + "step": 1038 + }, + { + "epoch": 0.08385118230974094, + "grad_norm": 0.9507614374160767, + "learning_rate": 0.0001989032709642726, + "loss": 2.7928, + "step": 1039 + }, + { + "epoch": 0.08393188604632394, + "grad_norm": 0.953738808631897, + "learning_rate": 0.00019890093805996793, + "loss": 2.7922, + "step": 1040 + }, + { + "epoch": 0.08401258978290695, + "grad_norm": 0.8079931139945984, + "learning_rate": 0.00019889860269079336, + "loss": 2.7909, + "step": 1041 + }, + { + "epoch": 0.08409329351948995, + "grad_norm": 1.0330647230148315, + "learning_rate": 0.0001988962648568071, + "loss": 2.7526, + "step": 1042 + }, + { + "epoch": 0.08417399725607295, + "grad_norm": 0.8988988399505615, + "learning_rate": 0.00019889392455806738, + "loss": 2.7471, + "step": 1043 + }, + { + "epoch": 0.08425470099265596, + "grad_norm": 0.7986348271369934, + "learning_rate": 0.00019889158179463255, + "loss": 2.7208, + "step": 1044 + }, + { + "epoch": 0.08433540472923896, + "grad_norm": 0.9231631755828857, + "learning_rate": 0.000198889236566561, + "loss": 2.7953, + "step": 1045 + }, + { + "epoch": 0.08441610846582197, + "grad_norm": 0.8438155055046082, + "learning_rate": 0.00019888688887391117, + "loss": 2.8006, + "step": 1046 + }, + { + "epoch": 
0.08449681220240497, + "grad_norm": 0.8915219306945801, + "learning_rate": 0.0001988845387167416, + "loss": 2.8184, + "step": 1047 + }, + { + "epoch": 0.08457751593898798, + "grad_norm": 0.924401581287384, + "learning_rate": 0.0001988821860951108, + "loss": 2.8411, + "step": 1048 + }, + { + "epoch": 0.08465821967557098, + "grad_norm": 0.8144630193710327, + "learning_rate": 0.00019887983100907745, + "loss": 2.8258, + "step": 1049 + }, + { + "epoch": 0.08473892341215399, + "grad_norm": 0.9974459409713745, + "learning_rate": 0.00019887747345870028, + "loss": 2.7567, + "step": 1050 + }, + { + "epoch": 0.08481962714873699, + "grad_norm": 0.944526195526123, + "learning_rate": 0.00019887511344403796, + "loss": 2.8657, + "step": 1051 + }, + { + "epoch": 0.08490033088532, + "grad_norm": 0.8204831480979919, + "learning_rate": 0.00019887275096514936, + "loss": 2.8054, + "step": 1052 + }, + { + "epoch": 0.084981034621903, + "grad_norm": 0.8855900168418884, + "learning_rate": 0.00019887038602209336, + "loss": 2.8019, + "step": 1053 + }, + { + "epoch": 0.085061738358486, + "grad_norm": 0.9025108814239502, + "learning_rate": 0.0001988680186149289, + "loss": 2.7934, + "step": 1054 + }, + { + "epoch": 0.08514244209506901, + "grad_norm": 0.8486441373825073, + "learning_rate": 0.00019886564874371494, + "loss": 2.809, + "step": 1055 + }, + { + "epoch": 0.08522314583165201, + "grad_norm": 0.778364896774292, + "learning_rate": 0.00019886327640851058, + "loss": 2.7783, + "step": 1056 + }, + { + "epoch": 0.08530384956823502, + "grad_norm": 0.8515299558639526, + "learning_rate": 0.00019886090160937497, + "loss": 2.8122, + "step": 1057 + }, + { + "epoch": 0.08538455330481802, + "grad_norm": 0.8466131091117859, + "learning_rate": 0.00019885852434636724, + "loss": 2.7798, + "step": 1058 + }, + { + "epoch": 0.08546525704140101, + "grad_norm": 0.8856541514396667, + "learning_rate": 0.00019885614461954667, + "loss": 2.8033, + "step": 1059 + }, + { + "epoch": 0.08554596077798401, + "grad_norm": 
0.8853924870491028, + "learning_rate": 0.00019885376242897258, + "loss": 2.8368, + "step": 1060 + }, + { + "epoch": 0.08562666451456702, + "grad_norm": 0.7858660221099854, + "learning_rate": 0.0001988513777747043, + "loss": 2.7806, + "step": 1061 + }, + { + "epoch": 0.08570736825115002, + "grad_norm": 0.8601513504981995, + "learning_rate": 0.0001988489906568013, + "loss": 2.8434, + "step": 1062 + }, + { + "epoch": 0.08578807198773303, + "grad_norm": 0.9126001596450806, + "learning_rate": 0.00019884660107532306, + "loss": 2.8469, + "step": 1063 + }, + { + "epoch": 0.08586877572431603, + "grad_norm": 0.9016061425209045, + "learning_rate": 0.00019884420903032912, + "loss": 2.7907, + "step": 1064 + }, + { + "epoch": 0.08594947946089904, + "grad_norm": 0.9134494066238403, + "learning_rate": 0.00019884181452187915, + "loss": 2.8426, + "step": 1065 + }, + { + "epoch": 0.08603018319748204, + "grad_norm": 0.8891138434410095, + "learning_rate": 0.00019883941755003272, + "loss": 2.8092, + "step": 1066 + }, + { + "epoch": 0.08611088693406505, + "grad_norm": 0.822884202003479, + "learning_rate": 0.0001988370181148497, + "loss": 2.8454, + "step": 1067 + }, + { + "epoch": 0.08619159067064805, + "grad_norm": 0.8341901898384094, + "learning_rate": 0.0001988346162163898, + "loss": 2.8027, + "step": 1068 + }, + { + "epoch": 0.08627229440723105, + "grad_norm": 0.8653229475021362, + "learning_rate": 0.00019883221185471291, + "loss": 2.7487, + "step": 1069 + }, + { + "epoch": 0.08635299814381406, + "grad_norm": 0.8065966367721558, + "learning_rate": 0.00019882980502987894, + "loss": 2.7847, + "step": 1070 + }, + { + "epoch": 0.08643370188039706, + "grad_norm": 0.9106903076171875, + "learning_rate": 0.0001988273957419479, + "loss": 2.7962, + "step": 1071 + }, + { + "epoch": 0.08651440561698007, + "grad_norm": 0.953815221786499, + "learning_rate": 0.0001988249839909798, + "loss": 2.8168, + "step": 1072 + }, + { + "epoch": 0.08659510935356307, + "grad_norm": 0.8642842173576355, + 
"learning_rate": 0.00019882256977703477, + "loss": 2.8205, + "step": 1073 + }, + { + "epoch": 0.08667581309014608, + "grad_norm": 0.8500350117683411, + "learning_rate": 0.000198820153100173, + "loss": 2.8798, + "step": 1074 + }, + { + "epoch": 0.08675651682672908, + "grad_norm": 0.9212989807128906, + "learning_rate": 0.00019881773396045467, + "loss": 2.8088, + "step": 1075 + }, + { + "epoch": 0.08683722056331208, + "grad_norm": 0.8897970914840698, + "learning_rate": 0.0001988153123579401, + "loss": 2.7983, + "step": 1076 + }, + { + "epoch": 0.08691792429989509, + "grad_norm": 0.7942636609077454, + "learning_rate": 0.00019881288829268968, + "loss": 2.7711, + "step": 1077 + }, + { + "epoch": 0.08699862803647809, + "grad_norm": 0.8286700248718262, + "learning_rate": 0.00019881046176476374, + "loss": 2.7995, + "step": 1078 + }, + { + "epoch": 0.0870793317730611, + "grad_norm": 0.9436343908309937, + "learning_rate": 0.00019880803277422281, + "loss": 2.8399, + "step": 1079 + }, + { + "epoch": 0.0871600355096441, + "grad_norm": 0.9592518210411072, + "learning_rate": 0.00019880560132112742, + "loss": 2.7888, + "step": 1080 + }, + { + "epoch": 0.0872407392462271, + "grad_norm": 0.8956589698791504, + "learning_rate": 0.00019880316740553816, + "loss": 2.7635, + "step": 1081 + }, + { + "epoch": 0.08732144298281011, + "grad_norm": 1.055312156677246, + "learning_rate": 0.00019880073102751574, + "loss": 2.7778, + "step": 1082 + }, + { + "epoch": 0.08740214671939311, + "grad_norm": 0.783273458480835, + "learning_rate": 0.00019879829218712075, + "loss": 2.735, + "step": 1083 + }, + { + "epoch": 0.0874828504559761, + "grad_norm": 0.8315421938896179, + "learning_rate": 0.00019879585088441413, + "loss": 2.7973, + "step": 1084 + }, + { + "epoch": 0.08756355419255911, + "grad_norm": 0.9550945162773132, + "learning_rate": 0.00019879340711945662, + "loss": 2.8083, + "step": 1085 + }, + { + "epoch": 0.08764425792914211, + "grad_norm": 0.9579277634620667, + "learning_rate": 
0.00019879096089230915, + "loss": 2.7411, + "step": 1086 + }, + { + "epoch": 0.08772496166572512, + "grad_norm": 0.8602219223976135, + "learning_rate": 0.0001987885122030327, + "loss": 2.7461, + "step": 1087 + }, + { + "epoch": 0.08780566540230812, + "grad_norm": 0.9749068021774292, + "learning_rate": 0.00019878606105168829, + "loss": 2.7701, + "step": 1088 + }, + { + "epoch": 0.08788636913889113, + "grad_norm": 0.8128982186317444, + "learning_rate": 0.00019878360743833703, + "loss": 2.7949, + "step": 1089 + }, + { + "epoch": 0.08796707287547413, + "grad_norm": 0.9177080988883972, + "learning_rate": 0.00019878115136304003, + "loss": 2.7471, + "step": 1090 + }, + { + "epoch": 0.08804777661205714, + "grad_norm": 0.9052132368087769, + "learning_rate": 0.0001987786928258585, + "loss": 2.8356, + "step": 1091 + }, + { + "epoch": 0.08812848034864014, + "grad_norm": 0.8972994089126587, + "learning_rate": 0.00019877623182685378, + "loss": 2.8304, + "step": 1092 + }, + { + "epoch": 0.08820918408522314, + "grad_norm": 0.861251950263977, + "learning_rate": 0.0001987737683660871, + "loss": 2.8436, + "step": 1093 + }, + { + "epoch": 0.08828988782180615, + "grad_norm": 0.9139869809150696, + "learning_rate": 0.00019877130244361996, + "loss": 2.7583, + "step": 1094 + }, + { + "epoch": 0.08837059155838915, + "grad_norm": 0.8441170454025269, + "learning_rate": 0.00019876883405951377, + "loss": 2.7508, + "step": 1095 + }, + { + "epoch": 0.08845129529497216, + "grad_norm": 0.8624769449234009, + "learning_rate": 0.00019876636321383004, + "loss": 2.8003, + "step": 1096 + }, + { + "epoch": 0.08853199903155516, + "grad_norm": 0.9033877849578857, + "learning_rate": 0.00019876388990663037, + "loss": 2.7934, + "step": 1097 + }, + { + "epoch": 0.08861270276813817, + "grad_norm": 0.9492632746696472, + "learning_rate": 0.0001987614141379764, + "loss": 2.7852, + "step": 1098 + }, + { + "epoch": 0.08869340650472117, + "grad_norm": 0.9004682302474976, + "learning_rate": 0.00019875893590792982, + 
"loss": 2.7518, + "step": 1099 + }, + { + "epoch": 0.08877411024130417, + "grad_norm": 0.8352272510528564, + "learning_rate": 0.0001987564552165524, + "loss": 2.8035, + "step": 1100 + }, + { + "epoch": 0.08885481397788718, + "grad_norm": 0.8488562107086182, + "learning_rate": 0.00019875397206390593, + "loss": 2.7672, + "step": 1101 + }, + { + "epoch": 0.08893551771447018, + "grad_norm": 0.9450985193252563, + "learning_rate": 0.00019875148645005238, + "loss": 2.7558, + "step": 1102 + }, + { + "epoch": 0.08901622145105319, + "grad_norm": 0.9203561544418335, + "learning_rate": 0.0001987489983750536, + "loss": 2.7983, + "step": 1103 + }, + { + "epoch": 0.08909692518763619, + "grad_norm": 0.8761897087097168, + "learning_rate": 0.0001987465078389717, + "loss": 2.7536, + "step": 1104 + }, + { + "epoch": 0.0891776289242192, + "grad_norm": 0.9064637422561646, + "learning_rate": 0.00019874401484186867, + "loss": 2.8104, + "step": 1105 + }, + { + "epoch": 0.0892583326608022, + "grad_norm": 0.8394999504089355, + "learning_rate": 0.00019874151938380666, + "loss": 2.7459, + "step": 1106 + }, + { + "epoch": 0.0893390363973852, + "grad_norm": 0.8782099485397339, + "learning_rate": 0.00019873902146484785, + "loss": 2.8675, + "step": 1107 + }, + { + "epoch": 0.08941974013396821, + "grad_norm": 0.8564850091934204, + "learning_rate": 0.00019873652108505458, + "loss": 2.8561, + "step": 1108 + }, + { + "epoch": 0.08950044387055121, + "grad_norm": 0.8343809843063354, + "learning_rate": 0.0001987340182444891, + "loss": 2.8406, + "step": 1109 + }, + { + "epoch": 0.0895811476071342, + "grad_norm": 1.096273422241211, + "learning_rate": 0.00019873151294321376, + "loss": 2.8264, + "step": 1110 + }, + { + "epoch": 0.08966185134371721, + "grad_norm": 0.8654618263244629, + "learning_rate": 0.00019872900518129103, + "loss": 2.7956, + "step": 1111 + }, + { + "epoch": 0.08974255508030021, + "grad_norm": 0.8868138194084167, + "learning_rate": 0.00019872649495878344, + "loss": 2.8028, + "step": 1112 + 
}, + { + "epoch": 0.08982325881688322, + "grad_norm": 0.8139104843139648, + "learning_rate": 0.00019872398227575348, + "loss": 2.7502, + "step": 1113 + }, + { + "epoch": 0.08990396255346622, + "grad_norm": 0.8277762532234192, + "learning_rate": 0.00019872146713226384, + "loss": 2.7913, + "step": 1114 + }, + { + "epoch": 0.08998466629004923, + "grad_norm": 0.8470397591590881, + "learning_rate": 0.00019871894952837717, + "loss": 2.7982, + "step": 1115 + }, + { + "epoch": 0.09006537002663223, + "grad_norm": 0.8424760103225708, + "learning_rate": 0.00019871642946415625, + "loss": 2.8067, + "step": 1116 + }, + { + "epoch": 0.09014607376321523, + "grad_norm": 0.8253894448280334, + "learning_rate": 0.00019871390693966382, + "loss": 2.8339, + "step": 1117 + }, + { + "epoch": 0.09022677749979824, + "grad_norm": 0.8120691776275635, + "learning_rate": 0.00019871138195496282, + "loss": 2.7938, + "step": 1118 + }, + { + "epoch": 0.09030748123638124, + "grad_norm": 0.920189619064331, + "learning_rate": 0.00019870885451011617, + "loss": 2.8083, + "step": 1119 + }, + { + "epoch": 0.09038818497296425, + "grad_norm": 0.8990969657897949, + "learning_rate": 0.0001987063246051868, + "loss": 2.7481, + "step": 1120 + }, + { + "epoch": 0.09046888870954725, + "grad_norm": 0.8280801773071289, + "learning_rate": 0.0001987037922402378, + "loss": 2.8536, + "step": 1121 + }, + { + "epoch": 0.09054959244613026, + "grad_norm": 0.8510503768920898, + "learning_rate": 0.0001987012574153323, + "loss": 2.758, + "step": 1122 + }, + { + "epoch": 0.09063029618271326, + "grad_norm": 0.9103946685791016, + "learning_rate": 0.00019869872013053344, + "loss": 2.7594, + "step": 1123 + }, + { + "epoch": 0.09071099991929626, + "grad_norm": 0.804916262626648, + "learning_rate": 0.00019869618038590448, + "loss": 2.7489, + "step": 1124 + }, + { + "epoch": 0.09079170365587927, + "grad_norm": 0.7542802095413208, + "learning_rate": 0.00019869363818150867, + "loss": 2.76, + "step": 1125 + }, + { + "epoch": 
0.09087240739246227, + "grad_norm": 0.7725108861923218, + "learning_rate": 0.00019869109351740947, + "loss": 2.8124, + "step": 1126 + }, + { + "epoch": 0.09095311112904528, + "grad_norm": 0.8533692955970764, + "learning_rate": 0.0001986885463936702, + "loss": 2.8499, + "step": 1127 + }, + { + "epoch": 0.09103381486562828, + "grad_norm": 0.8351541757583618, + "learning_rate": 0.0001986859968103544, + "loss": 2.8075, + "step": 1128 + }, + { + "epoch": 0.09111451860221129, + "grad_norm": 0.8780044913291931, + "learning_rate": 0.0001986834447675256, + "loss": 2.7587, + "step": 1129 + }, + { + "epoch": 0.09119522233879429, + "grad_norm": 0.9587519764900208, + "learning_rate": 0.00019868089026524736, + "loss": 2.8069, + "step": 1130 + }, + { + "epoch": 0.0912759260753773, + "grad_norm": 0.8285651206970215, + "learning_rate": 0.00019867833330358342, + "loss": 2.8209, + "step": 1131 + }, + { + "epoch": 0.0913566298119603, + "grad_norm": 0.8589211106300354, + "learning_rate": 0.00019867577388259745, + "loss": 2.8144, + "step": 1132 + }, + { + "epoch": 0.0914373335485433, + "grad_norm": 0.8740364909172058, + "learning_rate": 0.00019867321200235324, + "loss": 2.858, + "step": 1133 + }, + { + "epoch": 0.09151803728512631, + "grad_norm": 0.8368108868598938, + "learning_rate": 0.00019867064766291467, + "loss": 2.7997, + "step": 1134 + }, + { + "epoch": 0.0915987410217093, + "grad_norm": 0.8243690133094788, + "learning_rate": 0.00019866808086434564, + "loss": 2.7925, + "step": 1135 + }, + { + "epoch": 0.0916794447582923, + "grad_norm": 0.8296996355056763, + "learning_rate": 0.0001986655116067101, + "loss": 2.7953, + "step": 1136 + }, + { + "epoch": 0.09176014849487531, + "grad_norm": 0.9255942702293396, + "learning_rate": 0.0001986629398900721, + "loss": 2.844, + "step": 1137 + }, + { + "epoch": 0.09184085223145831, + "grad_norm": 0.7498174905776978, + "learning_rate": 0.00019866036571449574, + "loss": 2.7372, + "step": 1138 + }, + { + "epoch": 0.09192155596804132, + "grad_norm": 
0.8170139193534851, + "learning_rate": 0.00019865778908004513, + "loss": 2.7656, + "step": 1139 + }, + { + "epoch": 0.09200225970462432, + "grad_norm": 0.8858106732368469, + "learning_rate": 0.00019865520998678458, + "loss": 2.7657, + "step": 1140 + }, + { + "epoch": 0.09208296344120732, + "grad_norm": 0.8789847493171692, + "learning_rate": 0.00019865262843477826, + "loss": 2.8419, + "step": 1141 + }, + { + "epoch": 0.09216366717779033, + "grad_norm": 0.8433314561843872, + "learning_rate": 0.00019865004442409058, + "loss": 2.7981, + "step": 1142 + }, + { + "epoch": 0.09224437091437333, + "grad_norm": 0.8822595477104187, + "learning_rate": 0.0001986474579547859, + "loss": 2.8368, + "step": 1143 + }, + { + "epoch": 0.09232507465095634, + "grad_norm": 0.9067013263702393, + "learning_rate": 0.00019864486902692872, + "loss": 2.7807, + "step": 1144 + }, + { + "epoch": 0.09240577838753934, + "grad_norm": 0.9551558494567871, + "learning_rate": 0.00019864227764058355, + "loss": 2.7617, + "step": 1145 + }, + { + "epoch": 0.09248648212412235, + "grad_norm": 0.8337206244468689, + "learning_rate": 0.00019863968379581494, + "loss": 2.8289, + "step": 1146 + }, + { + "epoch": 0.09256718586070535, + "grad_norm": 0.952702522277832, + "learning_rate": 0.0001986370874926876, + "loss": 2.8508, + "step": 1147 + }, + { + "epoch": 0.09264788959728835, + "grad_norm": 0.8586699366569519, + "learning_rate": 0.00019863448873126615, + "loss": 2.8784, + "step": 1148 + }, + { + "epoch": 0.09272859333387136, + "grad_norm": 0.7625309228897095, + "learning_rate": 0.00019863188751161544, + "loss": 2.7936, + "step": 1149 + }, + { + "epoch": 0.09280929707045436, + "grad_norm": 0.8912700414657593, + "learning_rate": 0.0001986292838338003, + "loss": 2.8745, + "step": 1150 + }, + { + "epoch": 0.09289000080703737, + "grad_norm": 0.8618904948234558, + "learning_rate": 0.00019862667769788553, + "loss": 2.8086, + "step": 1151 + }, + { + "epoch": 0.09297070454362037, + "grad_norm": 1.0013352632522583, + 
"learning_rate": 0.00019862406910393617, + "loss": 2.8211, + "step": 1152 + }, + { + "epoch": 0.09305140828020338, + "grad_norm": 0.7922475337982178, + "learning_rate": 0.0001986214580520172, + "loss": 2.7668, + "step": 1153 + }, + { + "epoch": 0.09313211201678638, + "grad_norm": 0.9490330815315247, + "learning_rate": 0.00019861884454219365, + "loss": 2.7571, + "step": 1154 + }, + { + "epoch": 0.09321281575336939, + "grad_norm": 0.8780270218849182, + "learning_rate": 0.00019861622857453076, + "loss": 2.7598, + "step": 1155 + }, + { + "epoch": 0.09329351948995239, + "grad_norm": 0.9220066070556641, + "learning_rate": 0.00019861361014909365, + "loss": 2.7609, + "step": 1156 + }, + { + "epoch": 0.0933742232265354, + "grad_norm": 0.8299020528793335, + "learning_rate": 0.0001986109892659476, + "loss": 2.8655, + "step": 1157 + }, + { + "epoch": 0.0934549269631184, + "grad_norm": 0.9700348377227783, + "learning_rate": 0.0001986083659251579, + "loss": 2.8597, + "step": 1158 + }, + { + "epoch": 0.0935356306997014, + "grad_norm": 0.8820784687995911, + "learning_rate": 0.00019860574012679001, + "loss": 2.8776, + "step": 1159 + }, + { + "epoch": 0.0936163344362844, + "grad_norm": 0.8134172558784485, + "learning_rate": 0.0001986031118709093, + "loss": 2.8163, + "step": 1160 + }, + { + "epoch": 0.0936970381728674, + "grad_norm": 0.885974109172821, + "learning_rate": 0.00019860048115758123, + "loss": 2.752, + "step": 1161 + }, + { + "epoch": 0.0937777419094504, + "grad_norm": 0.9650186896324158, + "learning_rate": 0.0001985978479868715, + "loss": 2.7587, + "step": 1162 + }, + { + "epoch": 0.0938584456460334, + "grad_norm": 0.8550445437431335, + "learning_rate": 0.00019859521235884563, + "loss": 2.7887, + "step": 1163 + }, + { + "epoch": 0.09393914938261641, + "grad_norm": 0.9686560034751892, + "learning_rate": 0.00019859257427356933, + "loss": 2.7974, + "step": 1164 + }, + { + "epoch": 0.09401985311919941, + "grad_norm": 0.9185387492179871, + "learning_rate": 
0.00019858993373110837, + "loss": 2.7933, + "step": 1165 + }, + { + "epoch": 0.09410055685578242, + "grad_norm": 0.9549610018730164, + "learning_rate": 0.00019858729073152852, + "loss": 2.7698, + "step": 1166 + }, + { + "epoch": 0.09418126059236542, + "grad_norm": 1.0523492097854614, + "learning_rate": 0.0001985846452748957, + "loss": 2.7215, + "step": 1167 + }, + { + "epoch": 0.09426196432894843, + "grad_norm": 0.8551118969917297, + "learning_rate": 0.00019858199736127582, + "loss": 2.805, + "step": 1168 + }, + { + "epoch": 0.09434266806553143, + "grad_norm": 1.021374225616455, + "learning_rate": 0.0001985793469907349, + "loss": 2.794, + "step": 1169 + }, + { + "epoch": 0.09442337180211444, + "grad_norm": 0.8745501041412354, + "learning_rate": 0.0001985766941633389, + "loss": 2.7793, + "step": 1170 + }, + { + "epoch": 0.09450407553869744, + "grad_norm": 0.7426434755325317, + "learning_rate": 0.00019857403887915402, + "loss": 2.7808, + "step": 1171 + }, + { + "epoch": 0.09458477927528045, + "grad_norm": 0.9183726906776428, + "learning_rate": 0.0001985713811382464, + "loss": 2.8001, + "step": 1172 + }, + { + "epoch": 0.09466548301186345, + "grad_norm": 0.8136709928512573, + "learning_rate": 0.00019856872094068233, + "loss": 2.7394, + "step": 1173 + }, + { + "epoch": 0.09474618674844645, + "grad_norm": 0.9399348497390747, + "learning_rate": 0.00019856605828652807, + "loss": 2.7733, + "step": 1174 + }, + { + "epoch": 0.09482689048502946, + "grad_norm": 0.8233176469802856, + "learning_rate": 0.00019856339317584997, + "loss": 2.7672, + "step": 1175 + }, + { + "epoch": 0.09490759422161246, + "grad_norm": 0.9157048463821411, + "learning_rate": 0.00019856072560871447, + "loss": 2.7992, + "step": 1176 + }, + { + "epoch": 0.09498829795819547, + "grad_norm": 0.8729545474052429, + "learning_rate": 0.00019855805558518803, + "loss": 2.749, + "step": 1177 + }, + { + "epoch": 0.09506900169477847, + "grad_norm": 0.8592300415039062, + "learning_rate": 0.00019855538310533722, + 
"loss": 2.7257, + "step": 1178 + }, + { + "epoch": 0.09514970543136148, + "grad_norm": 0.8470803499221802, + "learning_rate": 0.00019855270816922867, + "loss": 2.7479, + "step": 1179 + }, + { + "epoch": 0.09523040916794448, + "grad_norm": 0.8538667559623718, + "learning_rate": 0.00019855003077692897, + "loss": 2.7576, + "step": 1180 + }, + { + "epoch": 0.09531111290452748, + "grad_norm": 0.8890984654426575, + "learning_rate": 0.0001985473509285049, + "loss": 2.7961, + "step": 1181 + }, + { + "epoch": 0.09539181664111049, + "grad_norm": 0.7769411206245422, + "learning_rate": 0.00019854466862402324, + "loss": 2.8087, + "step": 1182 + }, + { + "epoch": 0.09547252037769349, + "grad_norm": 0.8892520666122437, + "learning_rate": 0.00019854198386355085, + "loss": 2.7935, + "step": 1183 + }, + { + "epoch": 0.0955532241142765, + "grad_norm": 0.8675585389137268, + "learning_rate": 0.00019853929664715464, + "loss": 2.833, + "step": 1184 + }, + { + "epoch": 0.0956339278508595, + "grad_norm": 0.8053853511810303, + "learning_rate": 0.00019853660697490154, + "loss": 2.8002, + "step": 1185 + }, + { + "epoch": 0.09571463158744249, + "grad_norm": 0.9237198829650879, + "learning_rate": 0.00019853391484685865, + "loss": 2.8281, + "step": 1186 + }, + { + "epoch": 0.0957953353240255, + "grad_norm": 0.8432926535606384, + "learning_rate": 0.000198531220263093, + "loss": 2.8131, + "step": 1187 + }, + { + "epoch": 0.0958760390606085, + "grad_norm": 0.796380341053009, + "learning_rate": 0.0001985285232236718, + "loss": 2.753, + "step": 1188 + }, + { + "epoch": 0.0959567427971915, + "grad_norm": 0.9183037281036377, + "learning_rate": 0.00019852582372866225, + "loss": 2.7625, + "step": 1189 + }, + { + "epoch": 0.09603744653377451, + "grad_norm": 0.8194435238838196, + "learning_rate": 0.0001985231217781316, + "loss": 2.7906, + "step": 1190 + }, + { + "epoch": 0.09611815027035751, + "grad_norm": 0.8430871367454529, + "learning_rate": 0.00019852041737214725, + "loss": 2.8457, + "step": 1191 + }, 
+ { + "epoch": 0.09619885400694052, + "grad_norm": 1.0237345695495605, + "learning_rate": 0.0001985177105107765, + "loss": 2.789, + "step": 1192 + }, + { + "epoch": 0.09627955774352352, + "grad_norm": 0.8721581101417542, + "learning_rate": 0.00019851500119408692, + "loss": 2.7187, + "step": 1193 + }, + { + "epoch": 0.09636026148010653, + "grad_norm": 0.8089142441749573, + "learning_rate": 0.00019851228942214603, + "loss": 2.7544, + "step": 1194 + }, + { + "epoch": 0.09644096521668953, + "grad_norm": 1.1076842546463013, + "learning_rate": 0.0001985095751950213, + "loss": 2.7859, + "step": 1195 + }, + { + "epoch": 0.09652166895327254, + "grad_norm": 0.84585040807724, + "learning_rate": 0.0001985068585127805, + "loss": 2.8005, + "step": 1196 + }, + { + "epoch": 0.09660237268985554, + "grad_norm": 0.8231167197227478, + "learning_rate": 0.00019850413937549127, + "loss": 2.8561, + "step": 1197 + }, + { + "epoch": 0.09668307642643854, + "grad_norm": 1.0028103590011597, + "learning_rate": 0.00019850141778322136, + "loss": 2.8049, + "step": 1198 + }, + { + "epoch": 0.09676378016302155, + "grad_norm": 0.8575148582458496, + "learning_rate": 0.0001984986937360387, + "loss": 2.7723, + "step": 1199 + }, + { + "epoch": 0.09684448389960455, + "grad_norm": 0.8567116260528564, + "learning_rate": 0.00019849596723401107, + "loss": 2.7418, + "step": 1200 + }, + { + "epoch": 0.09692518763618756, + "grad_norm": 1.1159218549728394, + "learning_rate": 0.00019849323827720645, + "loss": 2.8352, + "step": 1201 + }, + { + "epoch": 0.09700589137277056, + "grad_norm": 0.849656879901886, + "learning_rate": 0.0001984905068656929, + "loss": 2.7875, + "step": 1202 + }, + { + "epoch": 0.09708659510935357, + "grad_norm": 0.8479150533676147, + "learning_rate": 0.00019848777299953847, + "loss": 2.7828, + "step": 1203 + }, + { + "epoch": 0.09716729884593657, + "grad_norm": 0.9143954515457153, + "learning_rate": 0.00019848503667881125, + "loss": 2.7978, + "step": 1204 + }, + { + "epoch": 
0.09724800258251957, + "grad_norm": 0.8162297010421753, + "learning_rate": 0.0001984822979035795, + "loss": 2.7621, + "step": 1205 + }, + { + "epoch": 0.09732870631910258, + "grad_norm": 0.8625509142875671, + "learning_rate": 0.00019847955667391144, + "loss": 2.7484, + "step": 1206 + }, + { + "epoch": 0.09740941005568558, + "grad_norm": 0.8485168218612671, + "learning_rate": 0.00019847681298987543, + "loss": 2.7599, + "step": 1207 + }, + { + "epoch": 0.09749011379226859, + "grad_norm": 0.8962678909301758, + "learning_rate": 0.00019847406685153976, + "loss": 2.7753, + "step": 1208 + }, + { + "epoch": 0.09757081752885159, + "grad_norm": 0.8890791535377502, + "learning_rate": 0.00019847131825897297, + "loss": 2.7635, + "step": 1209 + }, + { + "epoch": 0.0976515212654346, + "grad_norm": 0.8461710810661316, + "learning_rate": 0.00019846856721224355, + "loss": 2.796, + "step": 1210 + }, + { + "epoch": 0.0977322250020176, + "grad_norm": 0.912738025188446, + "learning_rate": 0.00019846581371141996, + "loss": 2.7889, + "step": 1211 + }, + { + "epoch": 0.09781292873860059, + "grad_norm": 0.8530749082565308, + "learning_rate": 0.00019846305775657097, + "loss": 2.8298, + "step": 1212 + }, + { + "epoch": 0.0978936324751836, + "grad_norm": 0.8890148401260376, + "learning_rate": 0.00019846029934776516, + "loss": 2.7491, + "step": 1213 + }, + { + "epoch": 0.0979743362117666, + "grad_norm": 0.8936887979507446, + "learning_rate": 0.0001984575384850713, + "loss": 2.7759, + "step": 1214 + }, + { + "epoch": 0.0980550399483496, + "grad_norm": 0.7811321020126343, + "learning_rate": 0.00019845477516855823, + "loss": 2.8126, + "step": 1215 + }, + { + "epoch": 0.09813574368493261, + "grad_norm": 0.8751768469810486, + "learning_rate": 0.00019845200939829484, + "loss": 2.792, + "step": 1216 + }, + { + "epoch": 0.09821644742151561, + "grad_norm": 0.8749501705169678, + "learning_rate": 0.00019844924117434998, + "loss": 2.7818, + "step": 1217 + }, + { + "epoch": 0.09829715115809862, + 
"grad_norm": 0.8130955100059509, + "learning_rate": 0.0001984464704967927, + "loss": 2.8581, + "step": 1218 + }, + { + "epoch": 0.09837785489468162, + "grad_norm": 0.8158220648765564, + "learning_rate": 0.00019844369736569196, + "loss": 2.7704, + "step": 1219 + }, + { + "epoch": 0.09845855863126463, + "grad_norm": 0.9351849555969238, + "learning_rate": 0.00019844092178111702, + "loss": 2.7857, + "step": 1220 + }, + { + "epoch": 0.09853926236784763, + "grad_norm": 0.8373914957046509, + "learning_rate": 0.00019843814374313697, + "loss": 2.8217, + "step": 1221 + }, + { + "epoch": 0.09861996610443063, + "grad_norm": 0.8919960856437683, + "learning_rate": 0.00019843536325182104, + "loss": 2.7914, + "step": 1222 + }, + { + "epoch": 0.09870066984101364, + "grad_norm": 0.9994316697120667, + "learning_rate": 0.00019843258030723858, + "loss": 2.7981, + "step": 1223 + }, + { + "epoch": 0.09878137357759664, + "grad_norm": 0.8144915699958801, + "learning_rate": 0.0001984297949094589, + "loss": 2.811, + "step": 1224 + }, + { + "epoch": 0.09886207731417965, + "grad_norm": 0.8957876563072205, + "learning_rate": 0.0001984270070585514, + "loss": 2.7752, + "step": 1225 + }, + { + "epoch": 0.09894278105076265, + "grad_norm": 0.9426520466804504, + "learning_rate": 0.0001984242167545856, + "loss": 2.8139, + "step": 1226 + }, + { + "epoch": 0.09902348478734566, + "grad_norm": 0.888769268989563, + "learning_rate": 0.00019842142399763106, + "loss": 2.8305, + "step": 1227 + }, + { + "epoch": 0.09910418852392866, + "grad_norm": 0.9497748613357544, + "learning_rate": 0.00019841862878775736, + "loss": 2.748, + "step": 1228 + }, + { + "epoch": 0.09918489226051166, + "grad_norm": 0.8715065717697144, + "learning_rate": 0.00019841583112503416, + "loss": 2.7794, + "step": 1229 + }, + { + "epoch": 0.09926559599709467, + "grad_norm": 0.875599205493927, + "learning_rate": 0.00019841303100953116, + "loss": 2.8016, + "step": 1230 + }, + { + "epoch": 0.09934629973367767, + "grad_norm": 
0.8631919622421265, + "learning_rate": 0.0001984102284413182, + "loss": 2.8239, + "step": 1231 + }, + { + "epoch": 0.09942700347026068, + "grad_norm": 0.9028074741363525, + "learning_rate": 0.0001984074234204651, + "loss": 2.8372, + "step": 1232 + }, + { + "epoch": 0.09950770720684368, + "grad_norm": 0.890933096408844, + "learning_rate": 0.00019840461594704175, + "loss": 2.799, + "step": 1233 + }, + { + "epoch": 0.09958841094342669, + "grad_norm": 0.9626480340957642, + "learning_rate": 0.00019840180602111816, + "loss": 2.8207, + "step": 1234 + }, + { + "epoch": 0.09966911468000969, + "grad_norm": 0.798394501209259, + "learning_rate": 0.00019839899364276433, + "loss": 2.7784, + "step": 1235 + }, + { + "epoch": 0.0997498184165927, + "grad_norm": 0.8246447443962097, + "learning_rate": 0.00019839617881205036, + "loss": 2.8193, + "step": 1236 + }, + { + "epoch": 0.09983052215317569, + "grad_norm": 0.8315989375114441, + "learning_rate": 0.0001983933615290464, + "loss": 2.8036, + "step": 1237 + }, + { + "epoch": 0.09991122588975869, + "grad_norm": 0.8889075517654419, + "learning_rate": 0.00019839054179382267, + "loss": 2.7606, + "step": 1238 + }, + { + "epoch": 0.0999919296263417, + "grad_norm": 0.7558645009994507, + "learning_rate": 0.00019838771960644942, + "loss": 2.7666, + "step": 1239 + }, + { + "epoch": 0.1000726333629247, + "grad_norm": 0.8876601457595825, + "learning_rate": 0.00019838489496699704, + "loss": 2.8778, + "step": 1240 + }, + { + "epoch": 0.1001533370995077, + "grad_norm": 0.8609516620635986, + "learning_rate": 0.00019838206787553588, + "loss": 2.8189, + "step": 1241 + }, + { + "epoch": 0.10023404083609071, + "grad_norm": 0.8521148562431335, + "learning_rate": 0.00019837923833213644, + "loss": 2.8159, + "step": 1242 + }, + { + "epoch": 0.10031474457267371, + "grad_norm": 0.9155359268188477, + "learning_rate": 0.0001983764063368692, + "loss": 2.8351, + "step": 1243 + }, + { + "epoch": 0.10039544830925672, + "grad_norm": 0.8595378398895264, + 
"learning_rate": 0.00019837357188980475, + "loss": 2.8447, + "step": 1244 + }, + { + "epoch": 0.10047615204583972, + "grad_norm": 0.900244951248169, + "learning_rate": 0.00019837073499101373, + "loss": 2.8646, + "step": 1245 + }, + { + "epoch": 0.10055685578242272, + "grad_norm": 0.8404260277748108, + "learning_rate": 0.00019836789564056689, + "loss": 2.7824, + "step": 1246 + }, + { + "epoch": 0.10063755951900573, + "grad_norm": 0.8776196241378784, + "learning_rate": 0.0001983650538385349, + "loss": 2.8045, + "step": 1247 + }, + { + "epoch": 0.10071826325558873, + "grad_norm": 0.8889327049255371, + "learning_rate": 0.00019836220958498868, + "loss": 2.7967, + "step": 1248 + }, + { + "epoch": 0.10079896699217174, + "grad_norm": 0.8905191421508789, + "learning_rate": 0.00019835936287999906, + "loss": 2.8167, + "step": 1249 + }, + { + "epoch": 0.10087967072875474, + "grad_norm": 0.839970052242279, + "learning_rate": 0.000198356513723637, + "loss": 2.8643, + "step": 1250 + }, + { + "epoch": 0.10096037446533775, + "grad_norm": 0.7989531755447388, + "learning_rate": 0.00019835366211597353, + "loss": 2.8493, + "step": 1251 + }, + { + "epoch": 0.10104107820192075, + "grad_norm": 0.7960095405578613, + "learning_rate": 0.0001983508080570797, + "loss": 2.7377, + "step": 1252 + }, + { + "epoch": 0.10112178193850375, + "grad_norm": 0.7989903092384338, + "learning_rate": 0.00019834795154702661, + "loss": 2.7409, + "step": 1253 + }, + { + "epoch": 0.10120248567508676, + "grad_norm": 0.8557813167572021, + "learning_rate": 0.0001983450925858855, + "loss": 2.7945, + "step": 1254 + }, + { + "epoch": 0.10128318941166976, + "grad_norm": 0.948357880115509, + "learning_rate": 0.0001983422311737276, + "loss": 2.826, + "step": 1255 + }, + { + "epoch": 0.10136389314825277, + "grad_norm": 0.8356020450592041, + "learning_rate": 0.00019833936731062423, + "loss": 2.8157, + "step": 1256 + }, + { + "epoch": 0.10144459688483577, + "grad_norm": 0.8199872970581055, + "learning_rate": 
0.00019833650099664678, + "loss": 2.7273, + "step": 1257 + }, + { + "epoch": 0.10152530062141878, + "grad_norm": 0.8178466558456421, + "learning_rate": 0.00019833363223186669, + "loss": 2.7513, + "step": 1258 + }, + { + "epoch": 0.10160600435800178, + "grad_norm": 0.8165889978408813, + "learning_rate": 0.00019833076101635538, + "loss": 2.7689, + "step": 1259 + }, + { + "epoch": 0.10168670809458479, + "grad_norm": 0.8240275979042053, + "learning_rate": 0.0001983278873501845, + "loss": 2.7477, + "step": 1260 + }, + { + "epoch": 0.10176741183116779, + "grad_norm": 0.8470584750175476, + "learning_rate": 0.00019832501123342563, + "loss": 2.7414, + "step": 1261 + }, + { + "epoch": 0.1018481155677508, + "grad_norm": 0.819063663482666, + "learning_rate": 0.00019832213266615046, + "loss": 2.7335, + "step": 1262 + }, + { + "epoch": 0.10192881930433378, + "grad_norm": 0.8045673370361328, + "learning_rate": 0.00019831925164843071, + "loss": 2.8141, + "step": 1263 + }, + { + "epoch": 0.10200952304091679, + "grad_norm": 0.7827214598655701, + "learning_rate": 0.00019831636818033824, + "loss": 2.7549, + "step": 1264 + }, + { + "epoch": 0.10209022677749979, + "grad_norm": 0.9596436619758606, + "learning_rate": 0.00019831348226194485, + "loss": 2.7327, + "step": 1265 + }, + { + "epoch": 0.1021709305140828, + "grad_norm": 0.826909601688385, + "learning_rate": 0.0001983105938933225, + "loss": 2.7166, + "step": 1266 + }, + { + "epoch": 0.1022516342506658, + "grad_norm": 0.8060985207557678, + "learning_rate": 0.00019830770307454313, + "loss": 2.7514, + "step": 1267 + }, + { + "epoch": 0.1023323379872488, + "grad_norm": 0.8257390856742859, + "learning_rate": 0.00019830480980567887, + "loss": 2.77, + "step": 1268 + }, + { + "epoch": 0.10241304172383181, + "grad_norm": 0.844406008720398, + "learning_rate": 0.00019830191408680173, + "loss": 2.8548, + "step": 1269 + }, + { + "epoch": 0.10249374546041481, + "grad_norm": 0.84171462059021, + "learning_rate": 0.00019829901591798398, + "loss": 
2.7404, + "step": 1270 + }, + { + "epoch": 0.10257444919699782, + "grad_norm": 0.8084118962287903, + "learning_rate": 0.00019829611529929774, + "loss": 2.8078, + "step": 1271 + }, + { + "epoch": 0.10265515293358082, + "grad_norm": 0.8273561000823975, + "learning_rate": 0.00019829321223081538, + "loss": 2.787, + "step": 1272 + }, + { + "epoch": 0.10273585667016383, + "grad_norm": 0.799098551273346, + "learning_rate": 0.00019829030671260925, + "loss": 2.7563, + "step": 1273 + }, + { + "epoch": 0.10281656040674683, + "grad_norm": 0.885866105556488, + "learning_rate": 0.00019828739874475172, + "loss": 2.7313, + "step": 1274 + }, + { + "epoch": 0.10289726414332984, + "grad_norm": 0.7702760696411133, + "learning_rate": 0.00019828448832731529, + "loss": 2.7919, + "step": 1275 + }, + { + "epoch": 0.10297796787991284, + "grad_norm": 0.7577444911003113, + "learning_rate": 0.0001982815754603725, + "loss": 2.7149, + "step": 1276 + }, + { + "epoch": 0.10305867161649584, + "grad_norm": 0.8439713716506958, + "learning_rate": 0.00019827866014399592, + "loss": 2.7881, + "step": 1277 + }, + { + "epoch": 0.10313937535307885, + "grad_norm": 0.8504937291145325, + "learning_rate": 0.00019827574237825827, + "loss": 2.7611, + "step": 1278 + }, + { + "epoch": 0.10322007908966185, + "grad_norm": 0.7775665521621704, + "learning_rate": 0.00019827282216323218, + "loss": 2.7312, + "step": 1279 + }, + { + "epoch": 0.10330078282624486, + "grad_norm": 0.8671591281890869, + "learning_rate": 0.00019826989949899048, + "loss": 2.836, + "step": 1280 + }, + { + "epoch": 0.10338148656282786, + "grad_norm": 0.9308713674545288, + "learning_rate": 0.00019826697438560603, + "loss": 2.7494, + "step": 1281 + }, + { + "epoch": 0.10346219029941087, + "grad_norm": 0.9145268797874451, + "learning_rate": 0.0001982640468231517, + "loss": 2.8054, + "step": 1282 + }, + { + "epoch": 0.10354289403599387, + "grad_norm": 0.8150805234909058, + "learning_rate": 0.00019826111681170043, + "loss": 2.7879, + "step": 1283 + }, + 
{ + "epoch": 0.10362359777257688, + "grad_norm": 0.8576685786247253, + "learning_rate": 0.00019825818435132531, + "loss": 2.8184, + "step": 1284 + }, + { + "epoch": 0.10370430150915988, + "grad_norm": 0.8838599920272827, + "learning_rate": 0.00019825524944209937, + "loss": 2.7838, + "step": 1285 + }, + { + "epoch": 0.10378500524574288, + "grad_norm": 0.9119304418563843, + "learning_rate": 0.00019825231208409576, + "loss": 2.8392, + "step": 1286 + }, + { + "epoch": 0.10386570898232589, + "grad_norm": 0.8112398982048035, + "learning_rate": 0.00019824937227738771, + "loss": 2.7844, + "step": 1287 + }, + { + "epoch": 0.10394641271890888, + "grad_norm": 0.8714308738708496, + "learning_rate": 0.00019824643002204847, + "loss": 2.7765, + "step": 1288 + }, + { + "epoch": 0.10402711645549188, + "grad_norm": 0.8733358979225159, + "learning_rate": 0.00019824348531815138, + "loss": 2.771, + "step": 1289 + }, + { + "epoch": 0.10410782019207489, + "grad_norm": 0.8218281269073486, + "learning_rate": 0.00019824053816576981, + "loss": 2.8099, + "step": 1290 + }, + { + "epoch": 0.10418852392865789, + "grad_norm": 0.8647308945655823, + "learning_rate": 0.00019823758856497725, + "loss": 2.7738, + "step": 1291 + }, + { + "epoch": 0.1042692276652409, + "grad_norm": 0.8358582854270935, + "learning_rate": 0.00019823463651584718, + "loss": 2.8021, + "step": 1292 + }, + { + "epoch": 0.1043499314018239, + "grad_norm": 0.7943673133850098, + "learning_rate": 0.00019823168201845318, + "loss": 2.8293, + "step": 1293 + }, + { + "epoch": 0.1044306351384069, + "grad_norm": 0.8501425981521606, + "learning_rate": 0.0001982287250728689, + "loss": 2.7701, + "step": 1294 + }, + { + "epoch": 0.10451133887498991, + "grad_norm": 0.8503665328025818, + "learning_rate": 0.00019822576567916797, + "loss": 2.7881, + "step": 1295 + }, + { + "epoch": 0.10459204261157291, + "grad_norm": 0.9687628149986267, + "learning_rate": 0.0001982228038374242, + "loss": 2.7623, + "step": 1296 + }, + { + "epoch": 
0.10467274634815592, + "grad_norm": 0.8034376502037048, + "learning_rate": 0.00019821983954771146, + "loss": 2.8072, + "step": 1297 + }, + { + "epoch": 0.10475345008473892, + "grad_norm": 0.817135214805603, + "learning_rate": 0.00019821687281010352, + "loss": 2.7572, + "step": 1298 + }, + { + "epoch": 0.10483415382132193, + "grad_norm": 0.7961457371711731, + "learning_rate": 0.0001982139036246744, + "loss": 2.8405, + "step": 1299 + }, + { + "epoch": 0.10491485755790493, + "grad_norm": 0.7572407722473145, + "learning_rate": 0.00019821093199149804, + "loss": 2.7495, + "step": 1300 + }, + { + "epoch": 0.10499556129448794, + "grad_norm": 0.7990664839744568, + "learning_rate": 0.00019820795791064856, + "loss": 2.7567, + "step": 1301 + }, + { + "epoch": 0.10507626503107094, + "grad_norm": 0.8197236061096191, + "learning_rate": 0.0001982049813822, + "loss": 2.7807, + "step": 1302 + }, + { + "epoch": 0.10515696876765394, + "grad_norm": 0.9491304159164429, + "learning_rate": 0.00019820200240622664, + "loss": 2.8531, + "step": 1303 + }, + { + "epoch": 0.10523767250423695, + "grad_norm": 0.8143845200538635, + "learning_rate": 0.00019819902098280268, + "loss": 2.7542, + "step": 1304 + }, + { + "epoch": 0.10531837624081995, + "grad_norm": 0.9055941104888916, + "learning_rate": 0.0001981960371120024, + "loss": 2.863, + "step": 1305 + }, + { + "epoch": 0.10539907997740296, + "grad_norm": 0.7804721593856812, + "learning_rate": 0.0001981930507939002, + "loss": 2.8213, + "step": 1306 + }, + { + "epoch": 0.10547978371398596, + "grad_norm": 0.8375318050384521, + "learning_rate": 0.00019819006202857046, + "loss": 2.8222, + "step": 1307 + }, + { + "epoch": 0.10556048745056897, + "grad_norm": 0.9145569801330566, + "learning_rate": 0.00019818707081608773, + "loss": 2.805, + "step": 1308 + }, + { + "epoch": 0.10564119118715197, + "grad_norm": 0.7899324893951416, + "learning_rate": 0.00019818407715652654, + "loss": 2.8246, + "step": 1309 + }, + { + "epoch": 0.10572189492373497, + 
"grad_norm": 0.7843480110168457, + "learning_rate": 0.0001981810810499615, + "loss": 2.7909, + "step": 1310 + }, + { + "epoch": 0.10580259866031798, + "grad_norm": 0.8071008920669556, + "learning_rate": 0.00019817808249646723, + "loss": 2.7434, + "step": 1311 + }, + { + "epoch": 0.10588330239690098, + "grad_norm": 0.8682011961936951, + "learning_rate": 0.0001981750814961185, + "loss": 2.8387, + "step": 1312 + }, + { + "epoch": 0.10596400613348399, + "grad_norm": 0.7501091361045837, + "learning_rate": 0.0001981720780489902, + "loss": 2.7633, + "step": 1313 + }, + { + "epoch": 0.10604470987006698, + "grad_norm": 0.9259567856788635, + "learning_rate": 0.000198169072155157, + "loss": 2.8309, + "step": 1314 + }, + { + "epoch": 0.10612541360664998, + "grad_norm": 0.8018674254417419, + "learning_rate": 0.00019816606381469393, + "loss": 2.8647, + "step": 1315 + }, + { + "epoch": 0.10620611734323299, + "grad_norm": 0.8218088746070862, + "learning_rate": 0.00019816305302767595, + "loss": 2.823, + "step": 1316 + }, + { + "epoch": 0.10628682107981599, + "grad_norm": 0.812125027179718, + "learning_rate": 0.00019816003979417808, + "loss": 2.7216, + "step": 1317 + }, + { + "epoch": 0.106367524816399, + "grad_norm": 0.787407636642456, + "learning_rate": 0.0001981570241142754, + "loss": 2.7639, + "step": 1318 + }, + { + "epoch": 0.106448228552982, + "grad_norm": 0.7982528805732727, + "learning_rate": 0.00019815400598804312, + "loss": 2.8597, + "step": 1319 + }, + { + "epoch": 0.106528932289565, + "grad_norm": 0.8490404486656189, + "learning_rate": 0.00019815098541555646, + "loss": 2.7947, + "step": 1320 + }, + { + "epoch": 0.10660963602614801, + "grad_norm": 0.8743172883987427, + "learning_rate": 0.00019814796239689064, + "loss": 2.8674, + "step": 1321 + }, + { + "epoch": 0.10669033976273101, + "grad_norm": 0.8338125348091125, + "learning_rate": 0.00019814493693212106, + "loss": 2.781, + "step": 1322 + }, + { + "epoch": 0.10677104349931402, + "grad_norm": 0.871516764163971, + 
"learning_rate": 0.00019814190902132307, + "loss": 2.8742, + "step": 1323 + }, + { + "epoch": 0.10685174723589702, + "grad_norm": 0.8935555815696716, + "learning_rate": 0.00019813887866457216, + "loss": 2.7991, + "step": 1324 + }, + { + "epoch": 0.10693245097248003, + "grad_norm": 0.840067446231842, + "learning_rate": 0.00019813584586194388, + "loss": 2.7922, + "step": 1325 + }, + { + "epoch": 0.10701315470906303, + "grad_norm": 0.7919262647628784, + "learning_rate": 0.0001981328106135138, + "loss": 2.7912, + "step": 1326 + }, + { + "epoch": 0.10709385844564603, + "grad_norm": 0.7974550127983093, + "learning_rate": 0.00019812977291935752, + "loss": 2.8497, + "step": 1327 + }, + { + "epoch": 0.10717456218222904, + "grad_norm": 0.9126157164573669, + "learning_rate": 0.00019812673277955082, + "loss": 2.7698, + "step": 1328 + }, + { + "epoch": 0.10725526591881204, + "grad_norm": 0.8329752683639526, + "learning_rate": 0.0001981236901941694, + "loss": 2.8366, + "step": 1329 + }, + { + "epoch": 0.10733596965539505, + "grad_norm": 0.8313524127006531, + "learning_rate": 0.00019812064516328915, + "loss": 2.6863, + "step": 1330 + }, + { + "epoch": 0.10741667339197805, + "grad_norm": 0.8917783498764038, + "learning_rate": 0.0001981175976869859, + "loss": 2.7817, + "step": 1331 + }, + { + "epoch": 0.10749737712856106, + "grad_norm": 0.8370450735092163, + "learning_rate": 0.00019811454776533566, + "loss": 2.837, + "step": 1332 + }, + { + "epoch": 0.10757808086514406, + "grad_norm": 0.8415676355361938, + "learning_rate": 0.00019811149539841443, + "loss": 2.7399, + "step": 1333 + }, + { + "epoch": 0.10765878460172706, + "grad_norm": 0.8576632142066956, + "learning_rate": 0.00019810844058629825, + "loss": 2.7747, + "step": 1334 + }, + { + "epoch": 0.10773948833831007, + "grad_norm": 0.8943549394607544, + "learning_rate": 0.00019810538332906328, + "loss": 2.7368, + "step": 1335 + }, + { + "epoch": 0.10782019207489307, + "grad_norm": 0.8878718018531799, + "learning_rate": 
0.00019810232362678568, + "loss": 2.7907, + "step": 1336 + }, + { + "epoch": 0.10790089581147608, + "grad_norm": 0.8131409287452698, + "learning_rate": 0.00019809926147954174, + "loss": 2.7782, + "step": 1337 + }, + { + "epoch": 0.10798159954805908, + "grad_norm": 0.8733747005462646, + "learning_rate": 0.0001980961968874078, + "loss": 2.8552, + "step": 1338 + }, + { + "epoch": 0.10806230328464207, + "grad_norm": 0.8997320532798767, + "learning_rate": 0.0001980931298504602, + "loss": 2.8452, + "step": 1339 + }, + { + "epoch": 0.10814300702122508, + "grad_norm": 0.8400282263755798, + "learning_rate": 0.00019809006036877538, + "loss": 2.786, + "step": 1340 + }, + { + "epoch": 0.10822371075780808, + "grad_norm": 0.8173925280570984, + "learning_rate": 0.00019808698844242983, + "loss": 2.8363, + "step": 1341 + }, + { + "epoch": 0.10830441449439109, + "grad_norm": 0.872278094291687, + "learning_rate": 0.00019808391407150015, + "loss": 2.7789, + "step": 1342 + }, + { + "epoch": 0.10838511823097409, + "grad_norm": 0.8939952254295349, + "learning_rate": 0.00019808083725606293, + "loss": 2.7453, + "step": 1343 + }, + { + "epoch": 0.1084658219675571, + "grad_norm": 0.8351218104362488, + "learning_rate": 0.00019807775799619484, + "loss": 2.8004, + "step": 1344 + }, + { + "epoch": 0.1085465257041401, + "grad_norm": 0.8381102681159973, + "learning_rate": 0.00019807467629197266, + "loss": 2.8155, + "step": 1345 + }, + { + "epoch": 0.1086272294407231, + "grad_norm": 0.869458019733429, + "learning_rate": 0.00019807159214347317, + "loss": 2.8219, + "step": 1346 + }, + { + "epoch": 0.10870793317730611, + "grad_norm": 0.8251017928123474, + "learning_rate": 0.00019806850555077326, + "loss": 2.7978, + "step": 1347 + }, + { + "epoch": 0.10878863691388911, + "grad_norm": 0.8056492209434509, + "learning_rate": 0.0001980654165139498, + "loss": 2.7994, + "step": 1348 + }, + { + "epoch": 0.10886934065047212, + "grad_norm": 0.9566174745559692, + "learning_rate": 0.00019806232503307984, + 
"loss": 2.794, + "step": 1349 + }, + { + "epoch": 0.10895004438705512, + "grad_norm": 0.7891408801078796, + "learning_rate": 0.0001980592311082404, + "loss": 2.7134, + "step": 1350 + }, + { + "epoch": 0.10903074812363812, + "grad_norm": 0.8894741535186768, + "learning_rate": 0.00019805613473950862, + "loss": 2.7829, + "step": 1351 + }, + { + "epoch": 0.10911145186022113, + "grad_norm": 0.893086850643158, + "learning_rate": 0.0001980530359269616, + "loss": 2.7475, + "step": 1352 + }, + { + "epoch": 0.10919215559680413, + "grad_norm": 0.8758537173271179, + "learning_rate": 0.00019804993467067666, + "loss": 2.8715, + "step": 1353 + }, + { + "epoch": 0.10927285933338714, + "grad_norm": 0.9304648041725159, + "learning_rate": 0.00019804683097073098, + "loss": 2.8051, + "step": 1354 + }, + { + "epoch": 0.10935356306997014, + "grad_norm": 0.8465876579284668, + "learning_rate": 0.00019804372482720202, + "loss": 2.7879, + "step": 1355 + }, + { + "epoch": 0.10943426680655315, + "grad_norm": 0.8485612273216248, + "learning_rate": 0.00019804061624016713, + "loss": 2.7783, + "step": 1356 + }, + { + "epoch": 0.10951497054313615, + "grad_norm": 0.835630476474762, + "learning_rate": 0.0001980375052097038, + "loss": 2.8116, + "step": 1357 + }, + { + "epoch": 0.10959567427971915, + "grad_norm": 0.8404836058616638, + "learning_rate": 0.00019803439173588956, + "loss": 2.8257, + "step": 1358 + }, + { + "epoch": 0.10967637801630216, + "grad_norm": 0.8048505783081055, + "learning_rate": 0.00019803127581880206, + "loss": 2.7762, + "step": 1359 + }, + { + "epoch": 0.10975708175288516, + "grad_norm": 0.8481776118278503, + "learning_rate": 0.00019802815745851885, + "loss": 2.8243, + "step": 1360 + }, + { + "epoch": 0.10983778548946817, + "grad_norm": 0.8565996885299683, + "learning_rate": 0.00019802503665511775, + "loss": 2.7958, + "step": 1361 + }, + { + "epoch": 0.10991848922605117, + "grad_norm": 0.8867515921592712, + "learning_rate": 0.0001980219134086765, + "loss": 2.7973, + "step": 1362 
+ }, + { + "epoch": 0.10999919296263418, + "grad_norm": 0.8459765911102295, + "learning_rate": 0.0001980187877192729, + "loss": 2.848, + "step": 1363 + }, + { + "epoch": 0.11007989669921718, + "grad_norm": 0.7929832339286804, + "learning_rate": 0.0001980156595869849, + "loss": 2.8583, + "step": 1364 + }, + { + "epoch": 0.11016060043580017, + "grad_norm": 0.8475651741027832, + "learning_rate": 0.00019801252901189043, + "loss": 2.8436, + "step": 1365 + }, + { + "epoch": 0.11024130417238318, + "grad_norm": 0.8545576333999634, + "learning_rate": 0.00019800939599406755, + "loss": 2.7457, + "step": 1366 + }, + { + "epoch": 0.11032200790896618, + "grad_norm": 1.0093715190887451, + "learning_rate": 0.00019800626053359435, + "loss": 2.8198, + "step": 1367 + }, + { + "epoch": 0.11040271164554918, + "grad_norm": 0.8728145956993103, + "learning_rate": 0.0001980031226305489, + "loss": 2.7794, + "step": 1368 + }, + { + "epoch": 0.11048341538213219, + "grad_norm": 0.8538581728935242, + "learning_rate": 0.00019799998228500946, + "loss": 2.8018, + "step": 1369 + }, + { + "epoch": 0.11056411911871519, + "grad_norm": 0.9452785849571228, + "learning_rate": 0.00019799683949705432, + "loss": 2.8173, + "step": 1370 + }, + { + "epoch": 0.1106448228552982, + "grad_norm": 0.806508481502533, + "learning_rate": 0.00019799369426676174, + "loss": 2.8192, + "step": 1371 + }, + { + "epoch": 0.1107255265918812, + "grad_norm": 0.8952856063842773, + "learning_rate": 0.00019799054659421018, + "loss": 2.8072, + "step": 1372 + }, + { + "epoch": 0.1108062303284642, + "grad_norm": 0.8863561749458313, + "learning_rate": 0.00019798739647947802, + "loss": 2.7836, + "step": 1373 + }, + { + "epoch": 0.11088693406504721, + "grad_norm": 0.8544357419013977, + "learning_rate": 0.00019798424392264378, + "loss": 2.7714, + "step": 1374 + }, + { + "epoch": 0.11096763780163021, + "grad_norm": 0.807546854019165, + "learning_rate": 0.00019798108892378607, + "loss": 2.7635, + "step": 1375 + }, + { + "epoch": 
0.11104834153821322, + "grad_norm": 0.8198233246803284, + "learning_rate": 0.0001979779314829835, + "loss": 2.8253, + "step": 1376 + }, + { + "epoch": 0.11112904527479622, + "grad_norm": 0.9268671870231628, + "learning_rate": 0.00019797477160031477, + "loss": 2.8007, + "step": 1377 + }, + { + "epoch": 0.11120974901137923, + "grad_norm": 0.8547680974006653, + "learning_rate": 0.0001979716092758586, + "loss": 2.7749, + "step": 1378 + }, + { + "epoch": 0.11129045274796223, + "grad_norm": 0.8052394390106201, + "learning_rate": 0.00019796844450969384, + "loss": 2.763, + "step": 1379 + }, + { + "epoch": 0.11137115648454524, + "grad_norm": 0.8291144371032715, + "learning_rate": 0.00019796527730189936, + "loss": 2.8053, + "step": 1380 + }, + { + "epoch": 0.11145186022112824, + "grad_norm": 0.8114006519317627, + "learning_rate": 0.00019796210765255404, + "loss": 2.8047, + "step": 1381 + }, + { + "epoch": 0.11153256395771124, + "grad_norm": 0.9326293468475342, + "learning_rate": 0.00019795893556173697, + "loss": 2.8199, + "step": 1382 + }, + { + "epoch": 0.11161326769429425, + "grad_norm": 0.7702555656433105, + "learning_rate": 0.00019795576102952714, + "loss": 2.7909, + "step": 1383 + }, + { + "epoch": 0.11169397143087725, + "grad_norm": 0.8115492463111877, + "learning_rate": 0.0001979525840560037, + "loss": 2.748, + "step": 1384 + }, + { + "epoch": 0.11177467516746026, + "grad_norm": 0.8926187753677368, + "learning_rate": 0.0001979494046412458, + "loss": 2.7791, + "step": 1385 + }, + { + "epoch": 0.11185537890404326, + "grad_norm": 0.8549754023551941, + "learning_rate": 0.0001979462227853327, + "loss": 2.7989, + "step": 1386 + }, + { + "epoch": 0.11193608264062627, + "grad_norm": 0.8625262975692749, + "learning_rate": 0.0001979430384883437, + "loss": 2.7202, + "step": 1387 + }, + { + "epoch": 0.11201678637720927, + "grad_norm": 0.8134698867797852, + "learning_rate": 0.00019793985175035813, + "loss": 2.8008, + "step": 1388 + }, + { + "epoch": 0.11209749011379228, + 
"grad_norm": 0.8546617031097412, + "learning_rate": 0.00019793666257145547, + "loss": 2.8076, + "step": 1389 + }, + { + "epoch": 0.11217819385037527, + "grad_norm": 0.8003748059272766, + "learning_rate": 0.00019793347095171514, + "loss": 2.826, + "step": 1390 + }, + { + "epoch": 0.11225889758695827, + "grad_norm": 0.8116614818572998, + "learning_rate": 0.00019793027689121674, + "loss": 2.7096, + "step": 1391 + }, + { + "epoch": 0.11233960132354127, + "grad_norm": 0.7785829901695251, + "learning_rate": 0.00019792708039003984, + "loss": 2.748, + "step": 1392 + }, + { + "epoch": 0.11242030506012428, + "grad_norm": 0.7999277710914612, + "learning_rate": 0.0001979238814482641, + "loss": 2.7671, + "step": 1393 + }, + { + "epoch": 0.11250100879670728, + "grad_norm": 0.8862190842628479, + "learning_rate": 0.00019792068006596925, + "loss": 2.8484, + "step": 1394 + }, + { + "epoch": 0.11258171253329029, + "grad_norm": 0.8747627139091492, + "learning_rate": 0.00019791747624323512, + "loss": 2.7477, + "step": 1395 + }, + { + "epoch": 0.11266241626987329, + "grad_norm": 0.8280831575393677, + "learning_rate": 0.0001979142699801415, + "loss": 2.87, + "step": 1396 + }, + { + "epoch": 0.1127431200064563, + "grad_norm": 0.8069074153900146, + "learning_rate": 0.00019791106127676832, + "loss": 2.7724, + "step": 1397 + }, + { + "epoch": 0.1128238237430393, + "grad_norm": 0.8253301382064819, + "learning_rate": 0.00019790785013319557, + "loss": 2.7351, + "step": 1398 + }, + { + "epoch": 0.1129045274796223, + "grad_norm": 0.8298853635787964, + "learning_rate": 0.00019790463654950323, + "loss": 2.7709, + "step": 1399 + }, + { + "epoch": 0.11298523121620531, + "grad_norm": 0.7796407341957092, + "learning_rate": 0.0001979014205257715, + "loss": 2.7766, + "step": 1400 + }, + { + "epoch": 0.11306593495278831, + "grad_norm": 0.8922166228294373, + "learning_rate": 0.00019789820206208037, + "loss": 2.8473, + "step": 1401 + }, + { + "epoch": 0.11314663868937132, + "grad_norm": 0.7763219475746155, 
+ "learning_rate": 0.00019789498115851015, + "loss": 2.8629, + "step": 1402 + }, + { + "epoch": 0.11322734242595432, + "grad_norm": 0.8679928779602051, + "learning_rate": 0.0001978917578151411, + "loss": 2.8017, + "step": 1403 + }, + { + "epoch": 0.11330804616253733, + "grad_norm": 0.8491933941841125, + "learning_rate": 0.00019788853203205357, + "loss": 2.7156, + "step": 1404 + }, + { + "epoch": 0.11338874989912033, + "grad_norm": 0.8271194696426392, + "learning_rate": 0.00019788530380932792, + "loss": 2.7892, + "step": 1405 + }, + { + "epoch": 0.11346945363570334, + "grad_norm": 0.9224163293838501, + "learning_rate": 0.00019788207314704463, + "loss": 2.7824, + "step": 1406 + }, + { + "epoch": 0.11355015737228634, + "grad_norm": 0.7662777900695801, + "learning_rate": 0.00019787884004528422, + "loss": 2.7364, + "step": 1407 + }, + { + "epoch": 0.11363086110886934, + "grad_norm": 0.8750362396240234, + "learning_rate": 0.00019787560450412728, + "loss": 2.7546, + "step": 1408 + }, + { + "epoch": 0.11371156484545235, + "grad_norm": 0.9158821105957031, + "learning_rate": 0.0001978723665236544, + "loss": 2.8304, + "step": 1409 + }, + { + "epoch": 0.11379226858203535, + "grad_norm": 0.8291050791740417, + "learning_rate": 0.0001978691261039463, + "loss": 2.758, + "step": 1410 + }, + { + "epoch": 0.11387297231861836, + "grad_norm": 0.801886796951294, + "learning_rate": 0.00019786588324508374, + "loss": 2.7805, + "step": 1411 + }, + { + "epoch": 0.11395367605520136, + "grad_norm": 0.8140222430229187, + "learning_rate": 0.00019786263794714757, + "loss": 2.8155, + "step": 1412 + }, + { + "epoch": 0.11403437979178437, + "grad_norm": 0.7747580409049988, + "learning_rate": 0.00019785939021021865, + "loss": 2.778, + "step": 1413 + }, + { + "epoch": 0.11411508352836737, + "grad_norm": 0.8954138159751892, + "learning_rate": 0.0001978561400343779, + "loss": 2.7756, + "step": 1414 + }, + { + "epoch": 0.11419578726495037, + "grad_norm": 0.9038921594619751, + "learning_rate": 
0.00019785288741970634, + "loss": 2.7181, + "step": 1415 + }, + { + "epoch": 0.11427649100153336, + "grad_norm": 0.8284393548965454, + "learning_rate": 0.000197849632366285, + "loss": 2.7467, + "step": 1416 + }, + { + "epoch": 0.11435719473811637, + "grad_norm": 0.8996441960334778, + "learning_rate": 0.00019784637487419514, + "loss": 2.7918, + "step": 1417 + }, + { + "epoch": 0.11443789847469937, + "grad_norm": 0.9868448376655579, + "learning_rate": 0.00019784311494351777, + "loss": 2.7687, + "step": 1418 + }, + { + "epoch": 0.11451860221128238, + "grad_norm": 0.8491402864456177, + "learning_rate": 0.0001978398525743342, + "loss": 2.8492, + "step": 1419 + }, + { + "epoch": 0.11459930594786538, + "grad_norm": 1.06125807762146, + "learning_rate": 0.0001978365877667258, + "loss": 2.8041, + "step": 1420 + }, + { + "epoch": 0.11468000968444839, + "grad_norm": 0.8194011449813843, + "learning_rate": 0.00019783332052077386, + "loss": 2.7109, + "step": 1421 + }, + { + "epoch": 0.11476071342103139, + "grad_norm": 0.972620964050293, + "learning_rate": 0.00019783005083655984, + "loss": 2.8107, + "step": 1422 + }, + { + "epoch": 0.1148414171576144, + "grad_norm": 0.925410270690918, + "learning_rate": 0.0001978267787141652, + "loss": 2.7603, + "step": 1423 + }, + { + "epoch": 0.1149221208941974, + "grad_norm": 0.920156717300415, + "learning_rate": 0.00019782350415367152, + "loss": 2.7644, + "step": 1424 + }, + { + "epoch": 0.1150028246307804, + "grad_norm": 0.8617576360702515, + "learning_rate": 0.00019782022715516043, + "loss": 2.769, + "step": 1425 + }, + { + "epoch": 0.11508352836736341, + "grad_norm": 1.0987342596054077, + "learning_rate": 0.00019781694771871356, + "loss": 2.8224, + "step": 1426 + }, + { + "epoch": 0.11516423210394641, + "grad_norm": 0.8418076634407043, + "learning_rate": 0.00019781366584441264, + "loss": 2.7947, + "step": 1427 + }, + { + "epoch": 0.11524493584052942, + "grad_norm": 0.8010901808738708, + "learning_rate": 0.0001978103815323395, + "loss": 
2.733, + "step": 1428 + }, + { + "epoch": 0.11532563957711242, + "grad_norm": 0.8649042844772339, + "learning_rate": 0.00019780709478257598, + "loss": 2.7681, + "step": 1429 + }, + { + "epoch": 0.11540634331369543, + "grad_norm": 0.7728127837181091, + "learning_rate": 0.00019780380559520397, + "loss": 2.7795, + "step": 1430 + }, + { + "epoch": 0.11548704705027843, + "grad_norm": 0.7770940065383911, + "learning_rate": 0.00019780051397030545, + "loss": 2.743, + "step": 1431 + }, + { + "epoch": 0.11556775078686143, + "grad_norm": 0.8341890573501587, + "learning_rate": 0.0001977972199079625, + "loss": 2.8047, + "step": 1432 + }, + { + "epoch": 0.11564845452344444, + "grad_norm": 0.7894187569618225, + "learning_rate": 0.00019779392340825717, + "loss": 2.7757, + "step": 1433 + }, + { + "epoch": 0.11572915826002744, + "grad_norm": 0.8002873063087463, + "learning_rate": 0.00019779062447127164, + "loss": 2.7816, + "step": 1434 + }, + { + "epoch": 0.11580986199661045, + "grad_norm": 0.8256075978279114, + "learning_rate": 0.0001977873230970881, + "loss": 2.7839, + "step": 1435 + }, + { + "epoch": 0.11589056573319345, + "grad_norm": 0.8695322871208191, + "learning_rate": 0.0001977840192857889, + "loss": 2.746, + "step": 1436 + }, + { + "epoch": 0.11597126946977646, + "grad_norm": 0.767425537109375, + "learning_rate": 0.00019778071303745628, + "loss": 2.797, + "step": 1437 + }, + { + "epoch": 0.11605197320635946, + "grad_norm": 0.8263241052627563, + "learning_rate": 0.0001977774043521727, + "loss": 2.7702, + "step": 1438 + }, + { + "epoch": 0.11613267694294246, + "grad_norm": 0.8108638525009155, + "learning_rate": 0.0001977740932300206, + "loss": 2.6981, + "step": 1439 + }, + { + "epoch": 0.11621338067952547, + "grad_norm": 0.7945007681846619, + "learning_rate": 0.00019777077967108255, + "loss": 2.7357, + "step": 1440 + }, + { + "epoch": 0.11629408441610846, + "grad_norm": 0.8480326533317566, + "learning_rate": 0.00019776746367544107, + "loss": 2.8563, + "step": 1441 + }, + { + 
"epoch": 0.11637478815269146, + "grad_norm": 0.8202071785926819, + "learning_rate": 0.00019776414524317882, + "loss": 2.7955, + "step": 1442 + }, + { + "epoch": 0.11645549188927447, + "grad_norm": 0.8202874660491943, + "learning_rate": 0.00019776082437437852, + "loss": 2.765, + "step": 1443 + }, + { + "epoch": 0.11653619562585747, + "grad_norm": 0.8053051829338074, + "learning_rate": 0.00019775750106912294, + "loss": 2.6866, + "step": 1444 + }, + { + "epoch": 0.11661689936244048, + "grad_norm": 0.831968367099762, + "learning_rate": 0.00019775417532749486, + "loss": 2.7022, + "step": 1445 + }, + { + "epoch": 0.11669760309902348, + "grad_norm": 0.8903129696846008, + "learning_rate": 0.00019775084714957725, + "loss": 2.7308, + "step": 1446 + }, + { + "epoch": 0.11677830683560649, + "grad_norm": 0.8178622722625732, + "learning_rate": 0.000197747516535453, + "loss": 2.7446, + "step": 1447 + }, + { + "epoch": 0.11685901057218949, + "grad_norm": 0.8270576596260071, + "learning_rate": 0.00019774418348520508, + "loss": 2.7716, + "step": 1448 + }, + { + "epoch": 0.1169397143087725, + "grad_norm": 0.7965807914733887, + "learning_rate": 0.00019774084799891662, + "loss": 2.7305, + "step": 1449 + }, + { + "epoch": 0.1170204180453555, + "grad_norm": 0.8499472737312317, + "learning_rate": 0.00019773751007667073, + "loss": 2.7584, + "step": 1450 + }, + { + "epoch": 0.1171011217819385, + "grad_norm": 0.8961663842201233, + "learning_rate": 0.0001977341697185506, + "loss": 2.7729, + "step": 1451 + }, + { + "epoch": 0.1171818255185215, + "grad_norm": 1.0203527212142944, + "learning_rate": 0.0001977308269246395, + "loss": 2.727, + "step": 1452 + }, + { + "epoch": 0.11726252925510451, + "grad_norm": 0.953289806842804, + "learning_rate": 0.0001977274816950207, + "loss": 2.8158, + "step": 1453 + }, + { + "epoch": 0.11734323299168752, + "grad_norm": 1.0064597129821777, + "learning_rate": 0.0001977241340297776, + "loss": 2.8743, + "step": 1454 + }, + { + "epoch": 0.11742393672827052, + 
"grad_norm": 0.8541988730430603, + "learning_rate": 0.00019772078392899363, + "loss": 2.8532, + "step": 1455 + }, + { + "epoch": 0.11750464046485352, + "grad_norm": 0.8351433873176575, + "learning_rate": 0.00019771743139275228, + "loss": 2.7749, + "step": 1456 + }, + { + "epoch": 0.11758534420143653, + "grad_norm": 0.9555812478065491, + "learning_rate": 0.00019771407642113712, + "loss": 2.7408, + "step": 1457 + }, + { + "epoch": 0.11766604793801953, + "grad_norm": 0.7943894267082214, + "learning_rate": 0.0001977107190142317, + "loss": 2.7265, + "step": 1458 + }, + { + "epoch": 0.11774675167460254, + "grad_norm": 0.8636460900306702, + "learning_rate": 0.0001977073591721198, + "loss": 2.8178, + "step": 1459 + }, + { + "epoch": 0.11782745541118554, + "grad_norm": 0.8673834800720215, + "learning_rate": 0.00019770399689488506, + "loss": 2.7928, + "step": 1460 + }, + { + "epoch": 0.11790815914776855, + "grad_norm": 0.9463722705841064, + "learning_rate": 0.00019770063218261133, + "loss": 2.7448, + "step": 1461 + }, + { + "epoch": 0.11798886288435155, + "grad_norm": 0.8429726362228394, + "learning_rate": 0.00019769726503538246, + "loss": 2.7564, + "step": 1462 + }, + { + "epoch": 0.11806956662093455, + "grad_norm": 0.9412201642990112, + "learning_rate": 0.00019769389545328236, + "loss": 2.793, + "step": 1463 + }, + { + "epoch": 0.11815027035751756, + "grad_norm": 0.9112111926078796, + "learning_rate": 0.000197690523436395, + "loss": 2.7787, + "step": 1464 + }, + { + "epoch": 0.11823097409410056, + "grad_norm": 0.8417023420333862, + "learning_rate": 0.00019768714898480444, + "loss": 2.7654, + "step": 1465 + }, + { + "epoch": 0.11831167783068357, + "grad_norm": 0.8275290727615356, + "learning_rate": 0.00019768377209859476, + "loss": 2.7914, + "step": 1466 + }, + { + "epoch": 0.11839238156726656, + "grad_norm": 0.8113142848014832, + "learning_rate": 0.00019768039277785017, + "loss": 2.7516, + "step": 1467 + }, + { + "epoch": 0.11847308530384956, + "grad_norm": 
0.8655288219451904, + "learning_rate": 0.0001976770110226548, + "loss": 2.8158, + "step": 1468 + }, + { + "epoch": 0.11855378904043257, + "grad_norm": 0.8063547611236572, + "learning_rate": 0.000197673626833093, + "loss": 2.7624, + "step": 1469 + }, + { + "epoch": 0.11863449277701557, + "grad_norm": 0.843772292137146, + "learning_rate": 0.00019767024020924908, + "loss": 2.86, + "step": 1470 + }, + { + "epoch": 0.11871519651359858, + "grad_norm": 0.7942481637001038, + "learning_rate": 0.0001976668511512075, + "loss": 2.758, + "step": 1471 + }, + { + "epoch": 0.11879590025018158, + "grad_norm": 0.841275155544281, + "learning_rate": 0.00019766345965905268, + "loss": 2.8014, + "step": 1472 + }, + { + "epoch": 0.11887660398676458, + "grad_norm": 0.8003600835800171, + "learning_rate": 0.00019766006573286915, + "loss": 2.7829, + "step": 1473 + }, + { + "epoch": 0.11895730772334759, + "grad_norm": 0.8437239527702332, + "learning_rate": 0.00019765666937274147, + "loss": 2.7706, + "step": 1474 + }, + { + "epoch": 0.11903801145993059, + "grad_norm": 0.8118240833282471, + "learning_rate": 0.00019765327057875433, + "loss": 2.8185, + "step": 1475 + }, + { + "epoch": 0.1191187151965136, + "grad_norm": 0.8051649928092957, + "learning_rate": 0.00019764986935099244, + "loss": 2.7676, + "step": 1476 + }, + { + "epoch": 0.1191994189330966, + "grad_norm": 0.7786862850189209, + "learning_rate": 0.00019764646568954053, + "loss": 2.8069, + "step": 1477 + }, + { + "epoch": 0.1192801226696796, + "grad_norm": 0.8199592232704163, + "learning_rate": 0.0001976430595944834, + "loss": 2.7718, + "step": 1478 + }, + { + "epoch": 0.11936082640626261, + "grad_norm": 0.8696652054786682, + "learning_rate": 0.00019763965106590604, + "loss": 2.7682, + "step": 1479 + }, + { + "epoch": 0.11944153014284561, + "grad_norm": 0.7993931174278259, + "learning_rate": 0.00019763624010389334, + "loss": 2.7607, + "step": 1480 + }, + { + "epoch": 0.11952223387942862, + "grad_norm": 0.8107055425643921, + 
"learning_rate": 0.0001976328267085303, + "loss": 2.7885, + "step": 1481 + }, + { + "epoch": 0.11960293761601162, + "grad_norm": 0.8189423084259033, + "learning_rate": 0.000197629410879902, + "loss": 2.7332, + "step": 1482 + }, + { + "epoch": 0.11968364135259463, + "grad_norm": 0.9134814143180847, + "learning_rate": 0.0001976259926180936, + "loss": 2.7691, + "step": 1483 + }, + { + "epoch": 0.11976434508917763, + "grad_norm": 0.8642883896827698, + "learning_rate": 0.00019762257192319023, + "loss": 2.7876, + "step": 1484 + }, + { + "epoch": 0.11984504882576064, + "grad_norm": 0.7411352396011353, + "learning_rate": 0.0001976191487952772, + "loss": 2.7577, + "step": 1485 + }, + { + "epoch": 0.11992575256234364, + "grad_norm": 0.7741669416427612, + "learning_rate": 0.00019761572323443978, + "loss": 2.8005, + "step": 1486 + }, + { + "epoch": 0.12000645629892664, + "grad_norm": 0.8195405602455139, + "learning_rate": 0.0001976122952407634, + "loss": 2.7421, + "step": 1487 + }, + { + "epoch": 0.12008716003550965, + "grad_norm": 0.8355886936187744, + "learning_rate": 0.00019760886481433345, + "loss": 2.8156, + "step": 1488 + }, + { + "epoch": 0.12016786377209265, + "grad_norm": 0.8321093916893005, + "learning_rate": 0.00019760543195523542, + "loss": 2.7261, + "step": 1489 + }, + { + "epoch": 0.12024856750867566, + "grad_norm": 0.7792446613311768, + "learning_rate": 0.0001976019966635549, + "loss": 2.7319, + "step": 1490 + }, + { + "epoch": 0.12032927124525866, + "grad_norm": 0.770535409450531, + "learning_rate": 0.00019759855893937748, + "loss": 2.7727, + "step": 1491 + }, + { + "epoch": 0.12040997498184165, + "grad_norm": 0.8168532252311707, + "learning_rate": 0.00019759511878278887, + "loss": 2.7763, + "step": 1492 + }, + { + "epoch": 0.12049067871842466, + "grad_norm": 0.8395755290985107, + "learning_rate": 0.00019759167619387476, + "loss": 2.8382, + "step": 1493 + }, + { + "epoch": 0.12057138245500766, + "grad_norm": 0.8682762384414673, + "learning_rate": 
0.00019758823117272097, + "loss": 2.8056, + "step": 1494 + }, + { + "epoch": 0.12065208619159067, + "grad_norm": 0.815192699432373, + "learning_rate": 0.00019758478371941337, + "loss": 2.7602, + "step": 1495 + }, + { + "epoch": 0.12073278992817367, + "grad_norm": 0.7919273376464844, + "learning_rate": 0.00019758133383403786, + "loss": 2.7989, + "step": 1496 + }, + { + "epoch": 0.12081349366475667, + "grad_norm": 1.004387378692627, + "learning_rate": 0.00019757788151668045, + "loss": 2.7765, + "step": 1497 + }, + { + "epoch": 0.12089419740133968, + "grad_norm": 1.0032062530517578, + "learning_rate": 0.00019757442676742715, + "loss": 2.7751, + "step": 1498 + }, + { + "epoch": 0.12097490113792268, + "grad_norm": 0.8797723054885864, + "learning_rate": 0.00019757096958636407, + "loss": 2.7798, + "step": 1499 + }, + { + "epoch": 0.12105560487450569, + "grad_norm": 0.9239820241928101, + "learning_rate": 0.0001975675099735774, + "loss": 2.7976, + "step": 1500 + }, + { + "epoch": 0.12113630861108869, + "grad_norm": 0.9903601408004761, + "learning_rate": 0.00019756404792915328, + "loss": 2.7891, + "step": 1501 + }, + { + "epoch": 0.1212170123476717, + "grad_norm": 0.8402895331382751, + "learning_rate": 0.0001975605834531781, + "loss": 2.8037, + "step": 1502 + }, + { + "epoch": 0.1212977160842547, + "grad_norm": 0.8986102342605591, + "learning_rate": 0.00019755711654573813, + "loss": 2.8375, + "step": 1503 + }, + { + "epoch": 0.1213784198208377, + "grad_norm": 0.8795471787452698, + "learning_rate": 0.0001975536472069198, + "loss": 2.7916, + "step": 1504 + }, + { + "epoch": 0.12145912355742071, + "grad_norm": 0.866278350353241, + "learning_rate": 0.00019755017543680962, + "loss": 2.7884, + "step": 1505 + }, + { + "epoch": 0.12153982729400371, + "grad_norm": 0.7877952456474304, + "learning_rate": 0.00019754670123549398, + "loss": 2.7659, + "step": 1506 + }, + { + "epoch": 0.12162053103058672, + "grad_norm": 0.857155978679657, + "learning_rate": 0.00019754322460305962, + "loss": 
2.8029, + "step": 1507 + }, + { + "epoch": 0.12170123476716972, + "grad_norm": 0.8323284387588501, + "learning_rate": 0.00019753974553959314, + "loss": 2.7764, + "step": 1508 + }, + { + "epoch": 0.12178193850375273, + "grad_norm": 0.8557485938072205, + "learning_rate": 0.00019753626404518117, + "loss": 2.7448, + "step": 1509 + }, + { + "epoch": 0.12186264224033573, + "grad_norm": 0.8026818037033081, + "learning_rate": 0.00019753278011991058, + "loss": 2.7323, + "step": 1510 + }, + { + "epoch": 0.12194334597691874, + "grad_norm": 0.8578904271125793, + "learning_rate": 0.00019752929376386816, + "loss": 2.759, + "step": 1511 + }, + { + "epoch": 0.12202404971350174, + "grad_norm": 0.8617175221443176, + "learning_rate": 0.00019752580497714076, + "loss": 2.7641, + "step": 1512 + }, + { + "epoch": 0.12210475345008474, + "grad_norm": 0.8261943459510803, + "learning_rate": 0.00019752231375981538, + "loss": 2.7554, + "step": 1513 + }, + { + "epoch": 0.12218545718666775, + "grad_norm": 0.9984099268913269, + "learning_rate": 0.00019751882011197902, + "loss": 2.763, + "step": 1514 + }, + { + "epoch": 0.12226616092325075, + "grad_norm": 0.8014064431190491, + "learning_rate": 0.00019751532403371874, + "loss": 2.8083, + "step": 1515 + }, + { + "epoch": 0.12234686465983376, + "grad_norm": 0.9276653528213501, + "learning_rate": 0.0001975118255251217, + "loss": 2.8055, + "step": 1516 + }, + { + "epoch": 0.12242756839641676, + "grad_norm": 0.9365193843841553, + "learning_rate": 0.00019750832458627503, + "loss": 2.7397, + "step": 1517 + }, + { + "epoch": 0.12250827213299975, + "grad_norm": 0.8952646851539612, + "learning_rate": 0.00019750482121726605, + "loss": 2.8305, + "step": 1518 + }, + { + "epoch": 0.12258897586958276, + "grad_norm": 0.8395531177520752, + "learning_rate": 0.00019750131541818204, + "loss": 2.7852, + "step": 1519 + }, + { + "epoch": 0.12266967960616576, + "grad_norm": 0.8123572468757629, + "learning_rate": 0.0001974978071891104, + "loss": 2.831, + "step": 1520 + }, 
+ { + "epoch": 0.12275038334274876, + "grad_norm": 0.8716141581535339, + "learning_rate": 0.00019749429653013851, + "loss": 2.8012, + "step": 1521 + }, + { + "epoch": 0.12283108707933177, + "grad_norm": 0.7848379611968994, + "learning_rate": 0.0001974907834413539, + "loss": 2.7812, + "step": 1522 + }, + { + "epoch": 0.12291179081591477, + "grad_norm": 0.834072470664978, + "learning_rate": 0.00019748726792284414, + "loss": 2.7442, + "step": 1523 + }, + { + "epoch": 0.12299249455249778, + "grad_norm": 0.8377225399017334, + "learning_rate": 0.0001974837499746968, + "loss": 2.7967, + "step": 1524 + }, + { + "epoch": 0.12307319828908078, + "grad_norm": 0.8809494376182556, + "learning_rate": 0.0001974802295969996, + "loss": 2.8042, + "step": 1525 + }, + { + "epoch": 0.12315390202566379, + "grad_norm": 0.8504741787910461, + "learning_rate": 0.00019747670678984028, + "loss": 2.7909, + "step": 1526 + }, + { + "epoch": 0.12323460576224679, + "grad_norm": 0.9444355368614197, + "learning_rate": 0.00019747318155330663, + "loss": 2.8567, + "step": 1527 + }, + { + "epoch": 0.1233153094988298, + "grad_norm": 0.859166145324707, + "learning_rate": 0.00019746965388748645, + "loss": 2.8305, + "step": 1528 + }, + { + "epoch": 0.1233960132354128, + "grad_norm": 0.8431086540222168, + "learning_rate": 0.00019746612379246777, + "loss": 2.7799, + "step": 1529 + }, + { + "epoch": 0.1234767169719958, + "grad_norm": 0.8872438669204712, + "learning_rate": 0.00019746259126833846, + "loss": 2.8413, + "step": 1530 + }, + { + "epoch": 0.12355742070857881, + "grad_norm": 0.8698925375938416, + "learning_rate": 0.0001974590563151866, + "loss": 2.8446, + "step": 1531 + }, + { + "epoch": 0.12363812444516181, + "grad_norm": 0.8926429152488708, + "learning_rate": 0.0001974555189331003, + "loss": 2.7859, + "step": 1532 + }, + { + "epoch": 0.12371882818174482, + "grad_norm": 0.8089048862457275, + "learning_rate": 0.00019745197912216775, + "loss": 2.7985, + "step": 1533 + }, + { + "epoch": 
0.12379953191832782, + "grad_norm": 0.8180400729179382, + "learning_rate": 0.0001974484368824771, + "loss": 2.7587, + "step": 1534 + }, + { + "epoch": 0.12388023565491083, + "grad_norm": 0.9584212303161621, + "learning_rate": 0.00019744489221411668, + "loss": 2.766, + "step": 1535 + }, + { + "epoch": 0.12396093939149383, + "grad_norm": 0.8425920009613037, + "learning_rate": 0.00019744134511717485, + "loss": 2.8125, + "step": 1536 + }, + { + "epoch": 0.12404164312807683, + "grad_norm": 0.9109299182891846, + "learning_rate": 0.00019743779559173996, + "loss": 2.8613, + "step": 1537 + }, + { + "epoch": 0.12412234686465984, + "grad_norm": 0.8840214610099792, + "learning_rate": 0.0001974342436379005, + "loss": 2.7603, + "step": 1538 + }, + { + "epoch": 0.12420305060124284, + "grad_norm": 0.8128962516784668, + "learning_rate": 0.00019743068925574502, + "loss": 2.7593, + "step": 1539 + }, + { + "epoch": 0.12428375433782585, + "grad_norm": 0.8150052428245544, + "learning_rate": 0.00019742713244536204, + "loss": 2.8099, + "step": 1540 + }, + { + "epoch": 0.12436445807440885, + "grad_norm": 0.8442968130111694, + "learning_rate": 0.00019742357320684027, + "loss": 2.7746, + "step": 1541 + }, + { + "epoch": 0.12444516181099186, + "grad_norm": 0.9347402453422546, + "learning_rate": 0.00019742001154026838, + "loss": 2.8247, + "step": 1542 + }, + { + "epoch": 0.12452586554757485, + "grad_norm": 0.8305966854095459, + "learning_rate": 0.00019741644744573512, + "loss": 2.7398, + "step": 1543 + }, + { + "epoch": 0.12460656928415785, + "grad_norm": 0.8811129927635193, + "learning_rate": 0.00019741288092332935, + "loss": 2.8014, + "step": 1544 + }, + { + "epoch": 0.12468727302074085, + "grad_norm": 1.0287303924560547, + "learning_rate": 0.00019740931197313996, + "loss": 2.8449, + "step": 1545 + }, + { + "epoch": 0.12476797675732386, + "grad_norm": 0.8499771356582642, + "learning_rate": 0.00019740574059525588, + "loss": 2.7845, + "step": 1546 + }, + { + "epoch": 0.12484868049390686, + 
"grad_norm": 0.8110969066619873, + "learning_rate": 0.00019740216678976614, + "loss": 2.7565, + "step": 1547 + }, + { + "epoch": 0.12492938423048987, + "grad_norm": 0.8530771136283875, + "learning_rate": 0.00019739859055675977, + "loss": 2.8098, + "step": 1548 + }, + { + "epoch": 0.12501008796707289, + "grad_norm": 0.8483901619911194, + "learning_rate": 0.00019739501189632591, + "loss": 2.812, + "step": 1549 + }, + { + "epoch": 0.1250907917036559, + "grad_norm": 0.7894467711448669, + "learning_rate": 0.00019739143080855378, + "loss": 2.8576, + "step": 1550 + }, + { + "epoch": 0.1251714954402389, + "grad_norm": 0.8270247578620911, + "learning_rate": 0.0001973878472935326, + "loss": 2.7613, + "step": 1551 + }, + { + "epoch": 0.1252521991768219, + "grad_norm": 0.8496212959289551, + "learning_rate": 0.00019738426135135174, + "loss": 2.8375, + "step": 1552 + }, + { + "epoch": 0.1253329029134049, + "grad_norm": 0.8465524911880493, + "learning_rate": 0.00019738067298210045, + "loss": 2.8023, + "step": 1553 + }, + { + "epoch": 0.1254136066499879, + "grad_norm": 0.7843824028968811, + "learning_rate": 0.00019737708218586826, + "loss": 2.7424, + "step": 1554 + }, + { + "epoch": 0.1254943103865709, + "grad_norm": 0.8310040235519409, + "learning_rate": 0.00019737348896274462, + "loss": 2.7608, + "step": 1555 + }, + { + "epoch": 0.1255750141231539, + "grad_norm": 0.7895017266273499, + "learning_rate": 0.00019736989331281914, + "loss": 2.7549, + "step": 1556 + }, + { + "epoch": 0.1256557178597369, + "grad_norm": 0.8140431642532349, + "learning_rate": 0.00019736629523618138, + "loss": 2.802, + "step": 1557 + }, + { + "epoch": 0.1257364215963199, + "grad_norm": 0.8026889562606812, + "learning_rate": 0.000197362694732921, + "loss": 2.7758, + "step": 1558 + }, + { + "epoch": 0.1258171253329029, + "grad_norm": 0.8018048405647278, + "learning_rate": 0.0001973590918031278, + "loss": 2.7729, + "step": 1559 + }, + { + "epoch": 0.1258978290694859, + "grad_norm": 0.8394612073898315, + 
"learning_rate": 0.00019735548644689147, + "loss": 2.7692, + "step": 1560 + }, + { + "epoch": 0.1259785328060689, + "grad_norm": 0.819804310798645, + "learning_rate": 0.00019735187866430198, + "loss": 2.6933, + "step": 1561 + }, + { + "epoch": 0.12605923654265191, + "grad_norm": 0.8094257116317749, + "learning_rate": 0.0001973482684554492, + "loss": 2.7722, + "step": 1562 + }, + { + "epoch": 0.12613994027923492, + "grad_norm": 0.8647315502166748, + "learning_rate": 0.00019734465582042305, + "loss": 2.787, + "step": 1563 + }, + { + "epoch": 0.12622064401581792, + "grad_norm": 0.8439335823059082, + "learning_rate": 0.00019734104075931367, + "loss": 2.8, + "step": 1564 + }, + { + "epoch": 0.12630134775240093, + "grad_norm": 0.852480947971344, + "learning_rate": 0.00019733742327221105, + "loss": 2.8656, + "step": 1565 + }, + { + "epoch": 0.12638205148898393, + "grad_norm": 0.813846230506897, + "learning_rate": 0.00019733380335920542, + "loss": 2.7733, + "step": 1566 + }, + { + "epoch": 0.12646275522556694, + "grad_norm": 0.7860896587371826, + "learning_rate": 0.00019733018102038698, + "loss": 2.8201, + "step": 1567 + }, + { + "epoch": 0.12654345896214994, + "grad_norm": 0.7857748866081238, + "learning_rate": 0.00019732655625584602, + "loss": 2.8726, + "step": 1568 + }, + { + "epoch": 0.12662416269873294, + "grad_norm": 0.8152899146080017, + "learning_rate": 0.00019732292906567286, + "loss": 2.7738, + "step": 1569 + }, + { + "epoch": 0.12670486643531595, + "grad_norm": 0.8281696438789368, + "learning_rate": 0.00019731929944995788, + "loss": 2.7966, + "step": 1570 + }, + { + "epoch": 0.12678557017189895, + "grad_norm": 0.8070773482322693, + "learning_rate": 0.00019731566740879158, + "loss": 2.6988, + "step": 1571 + }, + { + "epoch": 0.12686627390848196, + "grad_norm": 0.7859680652618408, + "learning_rate": 0.00019731203294226445, + "loss": 2.7241, + "step": 1572 + }, + { + "epoch": 0.12694697764506496, + "grad_norm": 0.7753982543945312, + "learning_rate": 
0.0001973083960504671, + "loss": 2.7621, + "step": 1573 + }, + { + "epoch": 0.12702768138164797, + "grad_norm": 0.8063471913337708, + "learning_rate": 0.00019730475673349014, + "loss": 2.7298, + "step": 1574 + }, + { + "epoch": 0.12710838511823097, + "grad_norm": 0.7943962812423706, + "learning_rate": 0.0001973011149914243, + "loss": 2.7714, + "step": 1575 + }, + { + "epoch": 0.12718908885481398, + "grad_norm": 0.8297483325004578, + "learning_rate": 0.00019729747082436033, + "loss": 2.7743, + "step": 1576 + }, + { + "epoch": 0.12726979259139698, + "grad_norm": 0.8728111386299133, + "learning_rate": 0.000197293824232389, + "loss": 2.8251, + "step": 1577 + }, + { + "epoch": 0.12735049632797998, + "grad_norm": 0.8762480020523071, + "learning_rate": 0.00019729017521560128, + "loss": 2.8036, + "step": 1578 + }, + { + "epoch": 0.127431200064563, + "grad_norm": 0.9266185164451599, + "learning_rate": 0.00019728652377408806, + "loss": 2.7335, + "step": 1579 + }, + { + "epoch": 0.127511903801146, + "grad_norm": 0.9289839267730713, + "learning_rate": 0.00019728286990794037, + "loss": 2.7715, + "step": 1580 + }, + { + "epoch": 0.127592607537729, + "grad_norm": 0.8811823725700378, + "learning_rate": 0.0001972792136172493, + "loss": 2.7389, + "step": 1581 + }, + { + "epoch": 0.127673311274312, + "grad_norm": 0.8174294233322144, + "learning_rate": 0.00019727555490210588, + "loss": 2.7483, + "step": 1582 + }, + { + "epoch": 0.127754015010895, + "grad_norm": 0.8254107236862183, + "learning_rate": 0.00019727189376260137, + "loss": 2.7897, + "step": 1583 + }, + { + "epoch": 0.127834718747478, + "grad_norm": 0.8478763699531555, + "learning_rate": 0.000197268230198827, + "loss": 2.7394, + "step": 1584 + }, + { + "epoch": 0.12791542248406101, + "grad_norm": 0.8356192111968994, + "learning_rate": 0.00019726456421087404, + "loss": 2.7518, + "step": 1585 + }, + { + "epoch": 0.12799612622064402, + "grad_norm": 0.8523107767105103, + "learning_rate": 0.00019726089579883392, + "loss": 2.7893, 
+ "step": 1586 + }, + { + "epoch": 0.12807682995722702, + "grad_norm": 0.9048579931259155, + "learning_rate": 0.00019725722496279804, + "loss": 2.7488, + "step": 1587 + }, + { + "epoch": 0.12815753369381003, + "grad_norm": 0.8242251873016357, + "learning_rate": 0.00019725355170285787, + "loss": 2.7544, + "step": 1588 + }, + { + "epoch": 0.12823823743039303, + "grad_norm": 0.8343983888626099, + "learning_rate": 0.00019724987601910497, + "loss": 2.7317, + "step": 1589 + }, + { + "epoch": 0.12831894116697604, + "grad_norm": 0.8084509372711182, + "learning_rate": 0.00019724619791163095, + "loss": 2.7822, + "step": 1590 + }, + { + "epoch": 0.12839964490355904, + "grad_norm": 0.8397380113601685, + "learning_rate": 0.00019724251738052745, + "loss": 2.8188, + "step": 1591 + }, + { + "epoch": 0.12848034864014204, + "grad_norm": 0.8558558821678162, + "learning_rate": 0.00019723883442588624, + "loss": 2.7623, + "step": 1592 + }, + { + "epoch": 0.12856105237672505, + "grad_norm": 0.7602639198303223, + "learning_rate": 0.0001972351490477991, + "loss": 2.7932, + "step": 1593 + }, + { + "epoch": 0.12864175611330805, + "grad_norm": 0.8379851579666138, + "learning_rate": 0.00019723146124635786, + "loss": 2.8296, + "step": 1594 + }, + { + "epoch": 0.12872245984989106, + "grad_norm": 0.8454548716545105, + "learning_rate": 0.00019722777102165444, + "loss": 2.8192, + "step": 1595 + }, + { + "epoch": 0.12880316358647406, + "grad_norm": 0.8344082832336426, + "learning_rate": 0.0001972240783737808, + "loss": 2.7628, + "step": 1596 + }, + { + "epoch": 0.12888386732305707, + "grad_norm": 0.809093713760376, + "learning_rate": 0.000197220383302829, + "loss": 2.8055, + "step": 1597 + }, + { + "epoch": 0.12896457105964007, + "grad_norm": 0.7909694910049438, + "learning_rate": 0.0001972166858088911, + "loss": 2.7292, + "step": 1598 + }, + { + "epoch": 0.12904527479622308, + "grad_norm": 0.8350280523300171, + "learning_rate": 0.00019721298589205928, + "loss": 2.7671, + "step": 1599 + }, + { + 
"epoch": 0.12912597853280608, + "grad_norm": 0.7857616543769836, + "learning_rate": 0.00019720928355242568, + "loss": 2.729, + "step": 1600 + }, + { + "epoch": 0.12920668226938908, + "grad_norm": 0.7899746298789978, + "learning_rate": 0.0001972055787900827, + "loss": 2.8023, + "step": 1601 + }, + { + "epoch": 0.1292873860059721, + "grad_norm": 0.8604246377944946, + "learning_rate": 0.00019720187160512256, + "loss": 2.749, + "step": 1602 + }, + { + "epoch": 0.1293680897425551, + "grad_norm": 0.8517864942550659, + "learning_rate": 0.0001971981619976377, + "loss": 2.7203, + "step": 1603 + }, + { + "epoch": 0.1294487934791381, + "grad_norm": 0.8860471248626709, + "learning_rate": 0.00019719444996772056, + "loss": 2.7372, + "step": 1604 + }, + { + "epoch": 0.1295294972157211, + "grad_norm": 0.8355888724327087, + "learning_rate": 0.00019719073551546367, + "loss": 2.7284, + "step": 1605 + }, + { + "epoch": 0.1296102009523041, + "grad_norm": 0.7998479604721069, + "learning_rate": 0.00019718701864095955, + "loss": 2.7726, + "step": 1606 + }, + { + "epoch": 0.12969090468888708, + "grad_norm": 0.8564549088478088, + "learning_rate": 0.00019718329934430092, + "loss": 2.7334, + "step": 1607 + }, + { + "epoch": 0.1297716084254701, + "grad_norm": 0.8594443798065186, + "learning_rate": 0.00019717957762558044, + "loss": 2.7865, + "step": 1608 + }, + { + "epoch": 0.1298523121620531, + "grad_norm": 0.804553210735321, + "learning_rate": 0.00019717585348489082, + "loss": 2.8094, + "step": 1609 + }, + { + "epoch": 0.1299330158986361, + "grad_norm": 0.7892553806304932, + "learning_rate": 0.0001971721269223249, + "loss": 2.7969, + "step": 1610 + }, + { + "epoch": 0.1300137196352191, + "grad_norm": 0.8703331351280212, + "learning_rate": 0.0001971683979379756, + "loss": 2.8192, + "step": 1611 + }, + { + "epoch": 0.1300944233718021, + "grad_norm": 0.8176589012145996, + "learning_rate": 0.00019716466653193582, + "loss": 2.7902, + "step": 1612 + }, + { + "epoch": 0.1301751271083851, + 
"grad_norm": 0.8305137157440186, + "learning_rate": 0.00019716093270429855, + "loss": 2.8202, + "step": 1613 + }, + { + "epoch": 0.1302558308449681, + "grad_norm": 0.8261505365371704, + "learning_rate": 0.00019715719645515688, + "loss": 2.7905, + "step": 1614 + }, + { + "epoch": 0.13033653458155112, + "grad_norm": 0.9465535879135132, + "learning_rate": 0.00019715345778460389, + "loss": 2.7965, + "step": 1615 + }, + { + "epoch": 0.13041723831813412, + "grad_norm": 0.8847100138664246, + "learning_rate": 0.00019714971669273275, + "loss": 2.8177, + "step": 1616 + }, + { + "epoch": 0.13049794205471713, + "grad_norm": 0.9768328666687012, + "learning_rate": 0.0001971459731796367, + "loss": 2.7668, + "step": 1617 + }, + { + "epoch": 0.13057864579130013, + "grad_norm": 0.7498586177825928, + "learning_rate": 0.0001971422272454091, + "loss": 2.761, + "step": 1618 + }, + { + "epoch": 0.13065934952788313, + "grad_norm": 1.0455373525619507, + "learning_rate": 0.00019713847889014325, + "loss": 2.7652, + "step": 1619 + }, + { + "epoch": 0.13074005326446614, + "grad_norm": 0.8484631180763245, + "learning_rate": 0.00019713472811393258, + "loss": 2.7858, + "step": 1620 + }, + { + "epoch": 0.13082075700104914, + "grad_norm": 0.8190686702728271, + "learning_rate": 0.00019713097491687057, + "loss": 2.7217, + "step": 1621 + }, + { + "epoch": 0.13090146073763215, + "grad_norm": 0.8866000175476074, + "learning_rate": 0.00019712721929905077, + "loss": 2.7868, + "step": 1622 + }, + { + "epoch": 0.13098216447421515, + "grad_norm": 0.8026713132858276, + "learning_rate": 0.00019712346126056677, + "loss": 2.7276, + "step": 1623 + }, + { + "epoch": 0.13106286821079816, + "grad_norm": 0.8306462168693542, + "learning_rate": 0.00019711970080151225, + "loss": 2.7747, + "step": 1624 + }, + { + "epoch": 0.13114357194738116, + "grad_norm": 0.8276618123054504, + "learning_rate": 0.0001971159379219809, + "loss": 2.7146, + "step": 1625 + }, + { + "epoch": 0.13122427568396416, + "grad_norm": 
0.9749011993408203, + "learning_rate": 0.00019711217262206648, + "loss": 2.8731, + "step": 1626 + }, + { + "epoch": 0.13130497942054717, + "grad_norm": 0.828484058380127, + "learning_rate": 0.00019710840490186292, + "loss": 2.803, + "step": 1627 + }, + { + "epoch": 0.13138568315713017, + "grad_norm": 0.8095957636833191, + "learning_rate": 0.00019710463476146402, + "loss": 2.7751, + "step": 1628 + }, + { + "epoch": 0.13146638689371318, + "grad_norm": 0.8731853365898132, + "learning_rate": 0.0001971008622009638, + "loss": 2.8274, + "step": 1629 + }, + { + "epoch": 0.13154709063029618, + "grad_norm": 0.8180200457572937, + "learning_rate": 0.00019709708722045628, + "loss": 2.813, + "step": 1630 + }, + { + "epoch": 0.13162779436687919, + "grad_norm": 0.7740067839622498, + "learning_rate": 0.00019709330982003553, + "loss": 2.7319, + "step": 1631 + }, + { + "epoch": 0.1317084981034622, + "grad_norm": 0.8439326882362366, + "learning_rate": 0.0001970895299997957, + "loss": 2.8182, + "step": 1632 + }, + { + "epoch": 0.1317892018400452, + "grad_norm": 0.8254802823066711, + "learning_rate": 0.000197085747759831, + "loss": 2.7874, + "step": 1633 + }, + { + "epoch": 0.1318699055766282, + "grad_norm": 0.8128175139427185, + "learning_rate": 0.00019708196310023562, + "loss": 2.8125, + "step": 1634 + }, + { + "epoch": 0.1319506093132112, + "grad_norm": 0.8664820790290833, + "learning_rate": 0.00019707817602110402, + "loss": 2.8446, + "step": 1635 + }, + { + "epoch": 0.1320313130497942, + "grad_norm": 0.8101332783699036, + "learning_rate": 0.00019707438652253044, + "loss": 2.8027, + "step": 1636 + }, + { + "epoch": 0.1321120167863772, + "grad_norm": 0.8296725153923035, + "learning_rate": 0.00019707059460460945, + "loss": 2.7677, + "step": 1637 + }, + { + "epoch": 0.13219272052296022, + "grad_norm": 0.7321150898933411, + "learning_rate": 0.0001970668002674355, + "loss": 2.6991, + "step": 1638 + }, + { + "epoch": 0.13227342425954322, + "grad_norm": 0.8321375250816345, + 
"learning_rate": 0.0001970630035111031, + "loss": 2.6948, + "step": 1639 + }, + { + "epoch": 0.13235412799612623, + "grad_norm": 0.7622714042663574, + "learning_rate": 0.00019705920433570694, + "loss": 2.6957, + "step": 1640 + }, + { + "epoch": 0.13243483173270923, + "grad_norm": 0.8413416147232056, + "learning_rate": 0.00019705540274134173, + "loss": 2.7277, + "step": 1641 + }, + { + "epoch": 0.13251553546929223, + "grad_norm": 0.8798941373825073, + "learning_rate": 0.00019705159872810218, + "loss": 2.7699, + "step": 1642 + }, + { + "epoch": 0.13259623920587524, + "grad_norm": 0.788287341594696, + "learning_rate": 0.00019704779229608304, + "loss": 2.7933, + "step": 1643 + }, + { + "epoch": 0.13267694294245824, + "grad_norm": 0.8547430634498596, + "learning_rate": 0.00019704398344537927, + "loss": 2.7706, + "step": 1644 + }, + { + "epoch": 0.13275764667904125, + "grad_norm": 0.8474008440971375, + "learning_rate": 0.00019704017217608575, + "loss": 2.8005, + "step": 1645 + }, + { + "epoch": 0.13283835041562425, + "grad_norm": 0.8636945486068726, + "learning_rate": 0.00019703635848829747, + "loss": 2.8241, + "step": 1646 + }, + { + "epoch": 0.13291905415220726, + "grad_norm": 0.8158168792724609, + "learning_rate": 0.00019703254238210947, + "loss": 2.7576, + "step": 1647 + }, + { + "epoch": 0.13299975788879026, + "grad_norm": 0.8420887589454651, + "learning_rate": 0.0001970287238576169, + "loss": 2.7677, + "step": 1648 + }, + { + "epoch": 0.13308046162537326, + "grad_norm": 0.7910059690475464, + "learning_rate": 0.00019702490291491486, + "loss": 2.7807, + "step": 1649 + }, + { + "epoch": 0.13316116536195627, + "grad_norm": 0.8308143615722656, + "learning_rate": 0.00019702107955409863, + "loss": 2.7698, + "step": 1650 + }, + { + "epoch": 0.13324186909853927, + "grad_norm": 0.8215764760971069, + "learning_rate": 0.00019701725377526349, + "loss": 2.8263, + "step": 1651 + }, + { + "epoch": 0.13332257283512228, + "grad_norm": 0.8780504465103149, + "learning_rate": 
0.00019701342557850476, + "loss": 2.8032, + "step": 1652 + }, + { + "epoch": 0.13340327657170528, + "grad_norm": 0.8125136494636536, + "learning_rate": 0.0001970095949639179, + "loss": 2.8317, + "step": 1653 + }, + { + "epoch": 0.13348398030828829, + "grad_norm": 0.8170902132987976, + "learning_rate": 0.00019700576193159831, + "loss": 2.7528, + "step": 1654 + }, + { + "epoch": 0.1335646840448713, + "grad_norm": 0.8318637013435364, + "learning_rate": 0.00019700192648164157, + "loss": 2.7963, + "step": 1655 + }, + { + "epoch": 0.1336453877814543, + "grad_norm": 0.8445270657539368, + "learning_rate": 0.00019699808861414327, + "loss": 2.772, + "step": 1656 + }, + { + "epoch": 0.1337260915180373, + "grad_norm": 0.7908959984779358, + "learning_rate": 0.00019699424832919906, + "loss": 2.7528, + "step": 1657 + }, + { + "epoch": 0.13380679525462028, + "grad_norm": 0.8153900504112244, + "learning_rate": 0.00019699040562690462, + "loss": 2.7643, + "step": 1658 + }, + { + "epoch": 0.13388749899120328, + "grad_norm": 0.86302250623703, + "learning_rate": 0.0001969865605073557, + "loss": 2.8037, + "step": 1659 + }, + { + "epoch": 0.13396820272778628, + "grad_norm": 0.8373419046401978, + "learning_rate": 0.0001969827129706482, + "loss": 2.7647, + "step": 1660 + }, + { + "epoch": 0.1340489064643693, + "grad_norm": 0.8166481852531433, + "learning_rate": 0.00019697886301687798, + "loss": 2.8333, + "step": 1661 + }, + { + "epoch": 0.1341296102009523, + "grad_norm": 0.7807812094688416, + "learning_rate": 0.00019697501064614098, + "loss": 2.7495, + "step": 1662 + }, + { + "epoch": 0.1342103139375353, + "grad_norm": 0.8375338315963745, + "learning_rate": 0.00019697115585853324, + "loss": 2.7518, + "step": 1663 + }, + { + "epoch": 0.1342910176741183, + "grad_norm": 0.7392182350158691, + "learning_rate": 0.00019696729865415077, + "loss": 2.758, + "step": 1664 + }, + { + "epoch": 0.1343717214107013, + "grad_norm": 0.8041971921920776, + "learning_rate": 0.00019696343903308978, + "loss": 
2.7485, + "step": 1665 + }, + { + "epoch": 0.1344524251472843, + "grad_norm": 0.789310097694397, + "learning_rate": 0.00019695957699544643, + "loss": 2.8179, + "step": 1666 + }, + { + "epoch": 0.13453312888386731, + "grad_norm": 0.7643609642982483, + "learning_rate": 0.00019695571254131693, + "loss": 2.7791, + "step": 1667 + }, + { + "epoch": 0.13461383262045032, + "grad_norm": 0.8284661769866943, + "learning_rate": 0.00019695184567079766, + "loss": 2.717, + "step": 1668 + }, + { + "epoch": 0.13469453635703332, + "grad_norm": 0.7620903253555298, + "learning_rate": 0.00019694797638398494, + "loss": 2.7808, + "step": 1669 + }, + { + "epoch": 0.13477524009361633, + "grad_norm": 0.9123913645744324, + "learning_rate": 0.00019694410468097524, + "loss": 2.7648, + "step": 1670 + }, + { + "epoch": 0.13485594383019933, + "grad_norm": 0.735518217086792, + "learning_rate": 0.000196940230561865, + "loss": 2.7653, + "step": 1671 + }, + { + "epoch": 0.13493664756678234, + "grad_norm": 0.8363413214683533, + "learning_rate": 0.00019693635402675085, + "loss": 2.766, + "step": 1672 + }, + { + "epoch": 0.13501735130336534, + "grad_norm": 0.8206491470336914, + "learning_rate": 0.00019693247507572936, + "loss": 2.7829, + "step": 1673 + }, + { + "epoch": 0.13509805503994834, + "grad_norm": 0.7726099491119385, + "learning_rate": 0.0001969285937088972, + "loss": 2.7381, + "step": 1674 + }, + { + "epoch": 0.13517875877653135, + "grad_norm": 0.8970316052436829, + "learning_rate": 0.0001969247099263511, + "loss": 2.7836, + "step": 1675 + }, + { + "epoch": 0.13525946251311435, + "grad_norm": 0.7966172099113464, + "learning_rate": 0.00019692082372818788, + "loss": 2.7135, + "step": 1676 + }, + { + "epoch": 0.13534016624969736, + "grad_norm": 0.8583024740219116, + "learning_rate": 0.00019691693511450438, + "loss": 2.7908, + "step": 1677 + }, + { + "epoch": 0.13542086998628036, + "grad_norm": 0.9430457353591919, + "learning_rate": 0.0001969130440853975, + "loss": 2.7311, + "step": 1678 + }, + { + 
"epoch": 0.13550157372286337, + "grad_norm": 0.8066009879112244, + "learning_rate": 0.00019690915064096424, + "loss": 2.7039, + "step": 1679 + }, + { + "epoch": 0.13558227745944637, + "grad_norm": 1.0169655084609985, + "learning_rate": 0.0001969052547813016, + "loss": 2.7832, + "step": 1680 + }, + { + "epoch": 0.13566298119602938, + "grad_norm": 0.8606080412864685, + "learning_rate": 0.00019690135650650672, + "loss": 2.751, + "step": 1681 + }, + { + "epoch": 0.13574368493261238, + "grad_norm": 0.8625333905220032, + "learning_rate": 0.00019689745581667674, + "loss": 2.761, + "step": 1682 + }, + { + "epoch": 0.13582438866919538, + "grad_norm": 0.9304285645484924, + "learning_rate": 0.00019689355271190886, + "loss": 2.7566, + "step": 1683 + }, + { + "epoch": 0.1359050924057784, + "grad_norm": 0.793397068977356, + "learning_rate": 0.00019688964719230035, + "loss": 2.7648, + "step": 1684 + }, + { + "epoch": 0.1359857961423614, + "grad_norm": 0.8496749401092529, + "learning_rate": 0.00019688573925794858, + "loss": 2.7461, + "step": 1685 + }, + { + "epoch": 0.1360664998789444, + "grad_norm": 0.7807914018630981, + "learning_rate": 0.0001968818289089509, + "loss": 2.8266, + "step": 1686 + }, + { + "epoch": 0.1361472036155274, + "grad_norm": 0.8186607956886292, + "learning_rate": 0.0001968779161454048, + "loss": 2.8447, + "step": 1687 + }, + { + "epoch": 0.1362279073521104, + "grad_norm": 0.8007118701934814, + "learning_rate": 0.0001968740009674078, + "loss": 2.7888, + "step": 1688 + }, + { + "epoch": 0.1363086110886934, + "grad_norm": 0.8735570311546326, + "learning_rate": 0.00019687008337505749, + "loss": 2.7152, + "step": 1689 + }, + { + "epoch": 0.13638931482527641, + "grad_norm": 0.8546476364135742, + "learning_rate": 0.00019686616336845144, + "loss": 2.8113, + "step": 1690 + }, + { + "epoch": 0.13647001856185942, + "grad_norm": 0.9156736135482788, + "learning_rate": 0.0001968622409476874, + "loss": 2.7561, + "step": 1691 + }, + { + "epoch": 0.13655072229844242, + 
"grad_norm": 0.8091925382614136, + "learning_rate": 0.0001968583161128631, + "loss": 2.7384, + "step": 1692 + }, + { + "epoch": 0.13663142603502543, + "grad_norm": 0.7871039509773254, + "learning_rate": 0.0001968543888640764, + "loss": 2.7138, + "step": 1693 + }, + { + "epoch": 0.13671212977160843, + "grad_norm": 0.9537062048912048, + "learning_rate": 0.00019685045920142516, + "loss": 2.7726, + "step": 1694 + }, + { + "epoch": 0.13679283350819144, + "grad_norm": 0.8663280010223389, + "learning_rate": 0.00019684652712500728, + "loss": 2.7509, + "step": 1695 + }, + { + "epoch": 0.13687353724477444, + "grad_norm": 0.8717214465141296, + "learning_rate": 0.0001968425926349208, + "loss": 2.791, + "step": 1696 + }, + { + "epoch": 0.13695424098135744, + "grad_norm": 0.8942584991455078, + "learning_rate": 0.00019683865573126374, + "loss": 2.77, + "step": 1697 + }, + { + "epoch": 0.13703494471794045, + "grad_norm": 0.8243421316146851, + "learning_rate": 0.00019683471641413424, + "loss": 2.8063, + "step": 1698 + }, + { + "epoch": 0.13711564845452345, + "grad_norm": 0.8618699908256531, + "learning_rate": 0.0001968307746836305, + "loss": 2.6872, + "step": 1699 + }, + { + "epoch": 0.13719635219110646, + "grad_norm": 0.7931695580482483, + "learning_rate": 0.00019682683053985072, + "loss": 2.7495, + "step": 1700 + }, + { + "epoch": 0.13727705592768946, + "grad_norm": 0.7549482583999634, + "learning_rate": 0.00019682288398289324, + "loss": 2.7543, + "step": 1701 + }, + { + "epoch": 0.13735775966427247, + "grad_norm": 0.7953789234161377, + "learning_rate": 0.00019681893501285636, + "loss": 2.6895, + "step": 1702 + }, + { + "epoch": 0.13743846340085547, + "grad_norm": 0.7916574478149414, + "learning_rate": 0.00019681498362983857, + "loss": 2.819, + "step": 1703 + }, + { + "epoch": 0.13751916713743847, + "grad_norm": 0.7986735105514526, + "learning_rate": 0.0001968110298339383, + "loss": 2.8062, + "step": 1704 + }, + { + "epoch": 0.13759987087402148, + "grad_norm": 0.8601658940315247, 
+ "learning_rate": 0.00019680707362525407, + "loss": 2.7625, + "step": 1705 + }, + { + "epoch": 0.13768057461060448, + "grad_norm": 0.8888362050056458, + "learning_rate": 0.00019680311500388454, + "loss": 2.7747, + "step": 1706 + }, + { + "epoch": 0.1377612783471875, + "grad_norm": 0.7762896418571472, + "learning_rate": 0.00019679915396992833, + "loss": 2.7959, + "step": 1707 + }, + { + "epoch": 0.1378419820837705, + "grad_norm": 0.8942253589630127, + "learning_rate": 0.00019679519052348416, + "loss": 2.7717, + "step": 1708 + }, + { + "epoch": 0.13792268582035347, + "grad_norm": 0.8388909697532654, + "learning_rate": 0.00019679122466465082, + "loss": 2.7448, + "step": 1709 + }, + { + "epoch": 0.13800338955693647, + "grad_norm": 0.8826024532318115, + "learning_rate": 0.00019678725639352712, + "loss": 2.7307, + "step": 1710 + }, + { + "epoch": 0.13808409329351948, + "grad_norm": 0.8972313404083252, + "learning_rate": 0.00019678328571021204, + "loss": 2.7619, + "step": 1711 + }, + { + "epoch": 0.13816479703010248, + "grad_norm": 0.9373044371604919, + "learning_rate": 0.00019677931261480444, + "loss": 2.7664, + "step": 1712 + }, + { + "epoch": 0.1382455007666855, + "grad_norm": 0.8060994148254395, + "learning_rate": 0.00019677533710740343, + "loss": 2.7707, + "step": 1713 + }, + { + "epoch": 0.1383262045032685, + "grad_norm": 0.8324100971221924, + "learning_rate": 0.000196771359188108, + "loss": 2.8249, + "step": 1714 + }, + { + "epoch": 0.1384069082398515, + "grad_norm": 0.879176676273346, + "learning_rate": 0.00019676737885701738, + "loss": 2.7767, + "step": 1715 + }, + { + "epoch": 0.1384876119764345, + "grad_norm": 0.8823966979980469, + "learning_rate": 0.0001967633961142307, + "loss": 2.791, + "step": 1716 + }, + { + "epoch": 0.1385683157130175, + "grad_norm": 0.8176039457321167, + "learning_rate": 0.00019675941095984728, + "loss": 2.8225, + "step": 1717 + }, + { + "epoch": 0.1386490194496005, + "grad_norm": 0.8005076050758362, + "learning_rate": 
0.00019675542339396635, + "loss": 2.8175, + "step": 1718 + }, + { + "epoch": 0.1387297231861835, + "grad_norm": 0.800854504108429, + "learning_rate": 0.0001967514334166874, + "loss": 2.8226, + "step": 1719 + }, + { + "epoch": 0.13881042692276652, + "grad_norm": 0.7941261529922485, + "learning_rate": 0.00019674744102810978, + "loss": 2.7488, + "step": 1720 + }, + { + "epoch": 0.13889113065934952, + "grad_norm": 0.7955947518348694, + "learning_rate": 0.00019674344622833302, + "loss": 2.7749, + "step": 1721 + }, + { + "epoch": 0.13897183439593253, + "grad_norm": 0.8353856205940247, + "learning_rate": 0.00019673944901745674, + "loss": 2.7982, + "step": 1722 + }, + { + "epoch": 0.13905253813251553, + "grad_norm": 0.8711503744125366, + "learning_rate": 0.00019673544939558047, + "loss": 2.8007, + "step": 1723 + }, + { + "epoch": 0.13913324186909853, + "grad_norm": 0.8525274991989136, + "learning_rate": 0.00019673144736280396, + "loss": 2.7423, + "step": 1724 + }, + { + "epoch": 0.13921394560568154, + "grad_norm": 0.8143991231918335, + "learning_rate": 0.0001967274429192269, + "loss": 2.7752, + "step": 1725 + }, + { + "epoch": 0.13929464934226454, + "grad_norm": 0.8508228063583374, + "learning_rate": 0.00019672343606494912, + "loss": 2.7422, + "step": 1726 + }, + { + "epoch": 0.13937535307884755, + "grad_norm": 0.8320932984352112, + "learning_rate": 0.0001967194268000705, + "loss": 2.7598, + "step": 1727 + }, + { + "epoch": 0.13945605681543055, + "grad_norm": 0.8233908414840698, + "learning_rate": 0.00019671541512469092, + "loss": 2.7834, + "step": 1728 + }, + { + "epoch": 0.13953676055201356, + "grad_norm": 0.8097162246704102, + "learning_rate": 0.00019671140103891038, + "loss": 2.7856, + "step": 1729 + }, + { + "epoch": 0.13961746428859656, + "grad_norm": 0.9043141007423401, + "learning_rate": 0.0001967073845428289, + "loss": 2.8047, + "step": 1730 + }, + { + "epoch": 0.13969816802517956, + "grad_norm": 0.9118517637252808, + "learning_rate": 0.00019670336563654662, + 
"loss": 2.789, + "step": 1731 + }, + { + "epoch": 0.13977887176176257, + "grad_norm": 0.8016074895858765, + "learning_rate": 0.00019669934432016368, + "loss": 2.7506, + "step": 1732 + }, + { + "epoch": 0.13985957549834557, + "grad_norm": 0.8376848697662354, + "learning_rate": 0.0001966953205937803, + "loss": 2.7832, + "step": 1733 + }, + { + "epoch": 0.13994027923492858, + "grad_norm": 0.8511834144592285, + "learning_rate": 0.0001966912944574968, + "loss": 2.7564, + "step": 1734 + }, + { + "epoch": 0.14002098297151158, + "grad_norm": 0.7796351909637451, + "learning_rate": 0.00019668726591141344, + "loss": 2.7489, + "step": 1735 + }, + { + "epoch": 0.14010168670809459, + "grad_norm": 0.8204767107963562, + "learning_rate": 0.00019668323495563068, + "loss": 2.7634, + "step": 1736 + }, + { + "epoch": 0.1401823904446776, + "grad_norm": 0.9049975872039795, + "learning_rate": 0.000196679201590249, + "loss": 2.7863, + "step": 1737 + }, + { + "epoch": 0.1402630941812606, + "grad_norm": 0.7473673224449158, + "learning_rate": 0.0001966751658153689, + "loss": 2.7557, + "step": 1738 + }, + { + "epoch": 0.1403437979178436, + "grad_norm": 0.7765525579452515, + "learning_rate": 0.0001966711276310909, + "loss": 2.7865, + "step": 1739 + }, + { + "epoch": 0.1404245016544266, + "grad_norm": 0.8766517043113708, + "learning_rate": 0.00019666708703751576, + "loss": 2.7873, + "step": 1740 + }, + { + "epoch": 0.1405052053910096, + "grad_norm": 0.8351505994796753, + "learning_rate": 0.00019666304403474408, + "loss": 2.7355, + "step": 1741 + }, + { + "epoch": 0.1405859091275926, + "grad_norm": 0.7612324953079224, + "learning_rate": 0.00019665899862287667, + "loss": 2.7608, + "step": 1742 + }, + { + "epoch": 0.14066661286417562, + "grad_norm": 0.894249439239502, + "learning_rate": 0.00019665495080201434, + "loss": 2.7469, + "step": 1743 + }, + { + "epoch": 0.14074731660075862, + "grad_norm": 0.8528907895088196, + "learning_rate": 0.00019665090057225803, + "loss": 2.773, + "step": 1744 + }, + 
{ + "epoch": 0.14082802033734163, + "grad_norm": 0.7718498706817627, + "learning_rate": 0.00019664684793370855, + "loss": 2.8045, + "step": 1745 + }, + { + "epoch": 0.14090872407392463, + "grad_norm": 0.8013718128204346, + "learning_rate": 0.00019664279288646706, + "loss": 2.7665, + "step": 1746 + }, + { + "epoch": 0.14098942781050763, + "grad_norm": 0.828803539276123, + "learning_rate": 0.00019663873543063448, + "loss": 2.7846, + "step": 1747 + }, + { + "epoch": 0.14107013154709064, + "grad_norm": 0.8349393606185913, + "learning_rate": 0.00019663467556631204, + "loss": 2.7405, + "step": 1748 + }, + { + "epoch": 0.14115083528367364, + "grad_norm": 0.8273345232009888, + "learning_rate": 0.00019663061329360085, + "loss": 2.7578, + "step": 1749 + }, + { + "epoch": 0.14123153902025665, + "grad_norm": 0.7989444136619568, + "learning_rate": 0.0001966265486126022, + "loss": 2.739, + "step": 1750 + }, + { + "epoch": 0.14131224275683965, + "grad_norm": 0.8690519332885742, + "learning_rate": 0.00019662248152341736, + "loss": 2.7566, + "step": 1751 + }, + { + "epoch": 0.14139294649342266, + "grad_norm": 0.8453623056411743, + "learning_rate": 0.0001966184120261477, + "loss": 2.8572, + "step": 1752 + }, + { + "epoch": 0.14147365023000566, + "grad_norm": 0.8396254777908325, + "learning_rate": 0.00019661434012089468, + "loss": 2.786, + "step": 1753 + }, + { + "epoch": 0.14155435396658866, + "grad_norm": 0.7643738389015198, + "learning_rate": 0.00019661026580775973, + "loss": 2.8193, + "step": 1754 + }, + { + "epoch": 0.14163505770317167, + "grad_norm": 0.8124154806137085, + "learning_rate": 0.00019660618908684443, + "loss": 2.7754, + "step": 1755 + }, + { + "epoch": 0.14171576143975467, + "grad_norm": 0.8620683550834656, + "learning_rate": 0.00019660210995825036, + "loss": 2.7827, + "step": 1756 + }, + { + "epoch": 0.14179646517633768, + "grad_norm": 0.8241196274757385, + "learning_rate": 0.0001965980284220792, + "loss": 2.7573, + "step": 1757 + }, + { + "epoch": 
0.14187716891292068, + "grad_norm": 0.8264089822769165, + "learning_rate": 0.00019659394447843262, + "loss": 2.8214, + "step": 1758 + }, + { + "epoch": 0.14195787264950369, + "grad_norm": 0.9129722118377686, + "learning_rate": 0.00019658985812741247, + "loss": 2.7962, + "step": 1759 + }, + { + "epoch": 0.14203857638608666, + "grad_norm": 0.7976365089416504, + "learning_rate": 0.00019658576936912057, + "loss": 2.7534, + "step": 1760 + }, + { + "epoch": 0.14211928012266967, + "grad_norm": 0.7587228417396545, + "learning_rate": 0.00019658167820365882, + "loss": 2.7083, + "step": 1761 + }, + { + "epoch": 0.14219998385925267, + "grad_norm": 0.757882833480835, + "learning_rate": 0.00019657758463112918, + "loss": 2.7135, + "step": 1762 + }, + { + "epoch": 0.14228068759583568, + "grad_norm": 0.8541501760482788, + "learning_rate": 0.00019657348865163369, + "loss": 2.7833, + "step": 1763 + }, + { + "epoch": 0.14236139133241868, + "grad_norm": 0.7708966135978699, + "learning_rate": 0.00019656939026527442, + "loss": 2.7128, + "step": 1764 + }, + { + "epoch": 0.14244209506900168, + "grad_norm": 0.8733000159263611, + "learning_rate": 0.00019656528947215347, + "loss": 2.7597, + "step": 1765 + }, + { + "epoch": 0.1425227988055847, + "grad_norm": 0.7913360595703125, + "learning_rate": 0.0001965611862723731, + "loss": 2.7681, + "step": 1766 + }, + { + "epoch": 0.1426035025421677, + "grad_norm": 0.8692380785942078, + "learning_rate": 0.00019655708066603555, + "loss": 2.7587, + "step": 1767 + }, + { + "epoch": 0.1426842062787507, + "grad_norm": 0.8231006860733032, + "learning_rate": 0.00019655297265324317, + "loss": 2.772, + "step": 1768 + }, + { + "epoch": 0.1427649100153337, + "grad_norm": 0.7373722791671753, + "learning_rate": 0.0001965488622340983, + "loss": 2.7875, + "step": 1769 + }, + { + "epoch": 0.1428456137519167, + "grad_norm": 0.8614751696586609, + "learning_rate": 0.0001965447494087034, + "loss": 2.7962, + "step": 1770 + }, + { + "epoch": 0.1429263174884997, + 
"grad_norm": 0.8336494565010071, + "learning_rate": 0.000196540634177161, + "loss": 2.7072, + "step": 1771 + }, + { + "epoch": 0.14300702122508271, + "grad_norm": 0.844292163848877, + "learning_rate": 0.00019653651653957362, + "loss": 2.8043, + "step": 1772 + }, + { + "epoch": 0.14308772496166572, + "grad_norm": 0.7366824150085449, + "learning_rate": 0.0001965323964960439, + "loss": 2.7296, + "step": 1773 + }, + { + "epoch": 0.14316842869824872, + "grad_norm": 0.75767982006073, + "learning_rate": 0.0001965282740466745, + "loss": 2.7946, + "step": 1774 + }, + { + "epoch": 0.14324913243483173, + "grad_norm": 0.8361382484436035, + "learning_rate": 0.00019652414919156823, + "loss": 2.7232, + "step": 1775 + }, + { + "epoch": 0.14332983617141473, + "grad_norm": 0.8473719358444214, + "learning_rate": 0.0001965200219308278, + "loss": 2.774, + "step": 1776 + }, + { + "epoch": 0.14341053990799774, + "grad_norm": 0.7446423172950745, + "learning_rate": 0.00019651589226455613, + "loss": 2.7439, + "step": 1777 + }, + { + "epoch": 0.14349124364458074, + "grad_norm": 0.8332851529121399, + "learning_rate": 0.00019651176019285616, + "loss": 2.7891, + "step": 1778 + }, + { + "epoch": 0.14357194738116374, + "grad_norm": 0.885313868522644, + "learning_rate": 0.0001965076257158308, + "loss": 2.7677, + "step": 1779 + }, + { + "epoch": 0.14365265111774675, + "grad_norm": 0.8506965637207031, + "learning_rate": 0.00019650348883358315, + "loss": 2.8112, + "step": 1780 + }, + { + "epoch": 0.14373335485432975, + "grad_norm": 0.8415799736976624, + "learning_rate": 0.0001964993495462163, + "loss": 2.8242, + "step": 1781 + }, + { + "epoch": 0.14381405859091276, + "grad_norm": 0.8501513004302979, + "learning_rate": 0.00019649520785383338, + "loss": 2.8352, + "step": 1782 + }, + { + "epoch": 0.14389476232749576, + "grad_norm": 0.7839778065681458, + "learning_rate": 0.00019649106375653767, + "loss": 2.7194, + "step": 1783 + }, + { + "epoch": 0.14397546606407877, + "grad_norm": 0.8013346195220947, + 
"learning_rate": 0.00019648691725443243, + "loss": 2.7665, + "step": 1784 + }, + { + "epoch": 0.14405616980066177, + "grad_norm": 1.0338317155838013, + "learning_rate": 0.00019648276834762095, + "loss": 2.8599, + "step": 1785 + }, + { + "epoch": 0.14413687353724478, + "grad_norm": 0.898417592048645, + "learning_rate": 0.0001964786170362067, + "loss": 2.7192, + "step": 1786 + }, + { + "epoch": 0.14421757727382778, + "grad_norm": 0.8876320123672485, + "learning_rate": 0.00019647446332029313, + "loss": 2.7722, + "step": 1787 + }, + { + "epoch": 0.14429828101041078, + "grad_norm": 0.819461464881897, + "learning_rate": 0.00019647030719998373, + "loss": 2.7698, + "step": 1788 + }, + { + "epoch": 0.1443789847469938, + "grad_norm": 0.848380446434021, + "learning_rate": 0.0001964661486753821, + "loss": 2.7894, + "step": 1789 + }, + { + "epoch": 0.1444596884835768, + "grad_norm": 0.8343753814697266, + "learning_rate": 0.0001964619877465919, + "loss": 2.699, + "step": 1790 + }, + { + "epoch": 0.1445403922201598, + "grad_norm": 0.8718340396881104, + "learning_rate": 0.0001964578244137168, + "loss": 2.7313, + "step": 1791 + }, + { + "epoch": 0.1446210959567428, + "grad_norm": 0.866122841835022, + "learning_rate": 0.00019645365867686056, + "loss": 2.7112, + "step": 1792 + }, + { + "epoch": 0.1447017996933258, + "grad_norm": 0.8351789712905884, + "learning_rate": 0.000196449490536127, + "loss": 2.7765, + "step": 1793 + }, + { + "epoch": 0.1447825034299088, + "grad_norm": 0.8628408312797546, + "learning_rate": 0.00019644531999162004, + "loss": 2.7375, + "step": 1794 + }, + { + "epoch": 0.14486320716649181, + "grad_norm": 0.8414484858512878, + "learning_rate": 0.00019644114704344358, + "loss": 2.7502, + "step": 1795 + }, + { + "epoch": 0.14494391090307482, + "grad_norm": 0.9092586636543274, + "learning_rate": 0.00019643697169170166, + "loss": 2.7714, + "step": 1796 + }, + { + "epoch": 0.14502461463965782, + "grad_norm": 0.8458060622215271, + "learning_rate": 0.0001964327939364983, 
+ "loss": 2.8376, + "step": 1797 + }, + { + "epoch": 0.14510531837624083, + "grad_norm": 0.8150759935379028, + "learning_rate": 0.00019642861377793764, + "loss": 2.7147, + "step": 1798 + }, + { + "epoch": 0.14518602211282383, + "grad_norm": 0.9008790850639343, + "learning_rate": 0.00019642443121612387, + "loss": 2.7786, + "step": 1799 + }, + { + "epoch": 0.14526672584940684, + "grad_norm": 0.848671555519104, + "learning_rate": 0.00019642024625116117, + "loss": 2.7813, + "step": 1800 + }, + { + "epoch": 0.14534742958598984, + "grad_norm": 0.8035007119178772, + "learning_rate": 0.00019641605888315393, + "loss": 2.7988, + "step": 1801 + }, + { + "epoch": 0.14542813332257284, + "grad_norm": 0.8210242390632629, + "learning_rate": 0.00019641186911220645, + "loss": 2.8451, + "step": 1802 + }, + { + "epoch": 0.14550883705915585, + "grad_norm": 0.8852066397666931, + "learning_rate": 0.00019640767693842318, + "loss": 2.7492, + "step": 1803 + }, + { + "epoch": 0.14558954079573885, + "grad_norm": 0.8421196937561035, + "learning_rate": 0.0001964034823619086, + "loss": 2.759, + "step": 1804 + }, + { + "epoch": 0.14567024453232186, + "grad_norm": 0.8166298866271973, + "learning_rate": 0.00019639928538276724, + "loss": 2.7942, + "step": 1805 + }, + { + "epoch": 0.14575094826890486, + "grad_norm": 0.8502809405326843, + "learning_rate": 0.00019639508600110368, + "loss": 2.7829, + "step": 1806 + }, + { + "epoch": 0.14583165200548787, + "grad_norm": 0.8371078372001648, + "learning_rate": 0.0001963908842170226, + "loss": 2.7168, + "step": 1807 + }, + { + "epoch": 0.14591235574207087, + "grad_norm": 0.8148230910301208, + "learning_rate": 0.0001963866800306287, + "loss": 2.7706, + "step": 1808 + }, + { + "epoch": 0.14599305947865387, + "grad_norm": 0.8984564542770386, + "learning_rate": 0.0001963824734420268, + "loss": 2.7761, + "step": 1809 + }, + { + "epoch": 0.14607376321523688, + "grad_norm": 0.9357183575630188, + "learning_rate": 0.00019637826445132172, + "loss": 2.7738, + "step": 
1810 + }, + { + "epoch": 0.14615446695181986, + "grad_norm": 0.8545449376106262, + "learning_rate": 0.00019637405305861834, + "loss": 2.772, + "step": 1811 + }, + { + "epoch": 0.14623517068840286, + "grad_norm": 1.1674948930740356, + "learning_rate": 0.00019636983926402165, + "loss": 2.8988, + "step": 1812 + }, + { + "epoch": 0.14631587442498586, + "grad_norm": 0.7875451445579529, + "learning_rate": 0.00019636562306763665, + "loss": 2.7053, + "step": 1813 + }, + { + "epoch": 0.14639657816156887, + "grad_norm": 0.8980962038040161, + "learning_rate": 0.0001963614044695684, + "loss": 2.7731, + "step": 1814 + }, + { + "epoch": 0.14647728189815187, + "grad_norm": 0.8403381705284119, + "learning_rate": 0.00019635718346992207, + "loss": 2.8555, + "step": 1815 + }, + { + "epoch": 0.14655798563473488, + "grad_norm": 0.8736433982849121, + "learning_rate": 0.00019635296006880284, + "loss": 2.7918, + "step": 1816 + }, + { + "epoch": 0.14663868937131788, + "grad_norm": 0.8604151606559753, + "learning_rate": 0.000196348734266316, + "loss": 2.7493, + "step": 1817 + }, + { + "epoch": 0.1467193931079009, + "grad_norm": 0.8329424262046814, + "learning_rate": 0.00019634450606256681, + "loss": 2.7348, + "step": 1818 + }, + { + "epoch": 0.1468000968444839, + "grad_norm": 0.9835913181304932, + "learning_rate": 0.0001963402754576607, + "loss": 2.7651, + "step": 1819 + }, + { + "epoch": 0.1468808005810669, + "grad_norm": 0.7968378067016602, + "learning_rate": 0.0001963360424517031, + "loss": 2.7672, + "step": 1820 + }, + { + "epoch": 0.1469615043176499, + "grad_norm": 0.8012512922286987, + "learning_rate": 0.00019633180704479948, + "loss": 2.8022, + "step": 1821 + }, + { + "epoch": 0.1470422080542329, + "grad_norm": 0.7656376957893372, + "learning_rate": 0.0001963275692370554, + "loss": 2.7561, + "step": 1822 + }, + { + "epoch": 0.1471229117908159, + "grad_norm": 0.8030453324317932, + "learning_rate": 0.00019632332902857656, + "loss": 2.8048, + "step": 1823 + }, + { + "epoch": 
0.1472036155273989, + "grad_norm": 0.8050903677940369, + "learning_rate": 0.0001963190864194685, + "loss": 2.7846, + "step": 1824 + }, + { + "epoch": 0.14728431926398192, + "grad_norm": 0.8001886606216431, + "learning_rate": 0.00019631484140983705, + "loss": 2.7382, + "step": 1825 + }, + { + "epoch": 0.14736502300056492, + "grad_norm": 0.8589862585067749, + "learning_rate": 0.00019631059399978796, + "loss": 2.8376, + "step": 1826 + }, + { + "epoch": 0.14744572673714793, + "grad_norm": 0.86325603723526, + "learning_rate": 0.00019630634418942714, + "loss": 2.7643, + "step": 1827 + }, + { + "epoch": 0.14752643047373093, + "grad_norm": 0.7893280386924744, + "learning_rate": 0.00019630209197886046, + "loss": 2.713, + "step": 1828 + }, + { + "epoch": 0.14760713421031393, + "grad_norm": 0.8890528082847595, + "learning_rate": 0.00019629783736819394, + "loss": 2.7435, + "step": 1829 + }, + { + "epoch": 0.14768783794689694, + "grad_norm": 0.794924795627594, + "learning_rate": 0.00019629358035753357, + "loss": 2.7703, + "step": 1830 + }, + { + "epoch": 0.14776854168347994, + "grad_norm": 0.7712973952293396, + "learning_rate": 0.00019628932094698545, + "loss": 2.7487, + "step": 1831 + }, + { + "epoch": 0.14784924542006295, + "grad_norm": 0.7810670137405396, + "learning_rate": 0.00019628505913665576, + "loss": 2.7687, + "step": 1832 + }, + { + "epoch": 0.14792994915664595, + "grad_norm": 0.8331059813499451, + "learning_rate": 0.0001962807949266507, + "loss": 2.7166, + "step": 1833 + }, + { + "epoch": 0.14801065289322896, + "grad_norm": 0.8983452916145325, + "learning_rate": 0.00019627652831707656, + "loss": 2.8096, + "step": 1834 + }, + { + "epoch": 0.14809135662981196, + "grad_norm": 0.8387179374694824, + "learning_rate": 0.00019627225930803963, + "loss": 2.8252, + "step": 1835 + }, + { + "epoch": 0.14817206036639496, + "grad_norm": 0.8619294762611389, + "learning_rate": 0.0001962679878996464, + "loss": 2.7623, + "step": 1836 + }, + { + "epoch": 0.14825276410297797, + 
"grad_norm": 0.8195026516914368, + "learning_rate": 0.0001962637140920032, + "loss": 2.7295, + "step": 1837 + }, + { + "epoch": 0.14833346783956097, + "grad_norm": 0.806216835975647, + "learning_rate": 0.00019625943788521664, + "loss": 2.7184, + "step": 1838 + }, + { + "epoch": 0.14841417157614398, + "grad_norm": 0.7758379578590393, + "learning_rate": 0.00019625515927939327, + "loss": 2.7675, + "step": 1839 + }, + { + "epoch": 0.14849487531272698, + "grad_norm": 0.7617168426513672, + "learning_rate": 0.0001962508782746397, + "loss": 2.8041, + "step": 1840 + }, + { + "epoch": 0.14857557904930999, + "grad_norm": 0.9630066156387329, + "learning_rate": 0.00019624659487106264, + "loss": 2.814, + "step": 1841 + }, + { + "epoch": 0.148656282785893, + "grad_norm": 0.7656112313270569, + "learning_rate": 0.00019624230906876888, + "loss": 2.7564, + "step": 1842 + }, + { + "epoch": 0.148736986522476, + "grad_norm": 0.9394779801368713, + "learning_rate": 0.0001962380208678652, + "loss": 2.7958, + "step": 1843 + }, + { + "epoch": 0.148817690259059, + "grad_norm": 0.7647004127502441, + "learning_rate": 0.00019623373026845842, + "loss": 2.72, + "step": 1844 + }, + { + "epoch": 0.148898393995642, + "grad_norm": 0.809079647064209, + "learning_rate": 0.00019622943727065555, + "loss": 2.7732, + "step": 1845 + }, + { + "epoch": 0.148979097732225, + "grad_norm": 0.8241337537765503, + "learning_rate": 0.00019622514187456357, + "loss": 2.759, + "step": 1846 + }, + { + "epoch": 0.149059801468808, + "grad_norm": 0.8979619145393372, + "learning_rate": 0.00019622084408028948, + "loss": 2.8307, + "step": 1847 + }, + { + "epoch": 0.14914050520539102, + "grad_norm": 0.8058865666389465, + "learning_rate": 0.00019621654388794047, + "loss": 2.807, + "step": 1848 + }, + { + "epoch": 0.14922120894197402, + "grad_norm": 0.81967693567276, + "learning_rate": 0.00019621224129762364, + "loss": 2.7762, + "step": 1849 + }, + { + "epoch": 0.14930191267855702, + "grad_norm": 0.7385755777359009, + 
"learning_rate": 0.0001962079363094463, + "loss": 2.7854, + "step": 1850 + }, + { + "epoch": 0.14938261641514003, + "grad_norm": 0.8585657477378845, + "learning_rate": 0.00019620362892351566, + "loss": 2.7781, + "step": 1851 + }, + { + "epoch": 0.14946332015172303, + "grad_norm": 0.8328986763954163, + "learning_rate": 0.00019619931913993912, + "loss": 2.8245, + "step": 1852 + }, + { + "epoch": 0.14954402388830604, + "grad_norm": 0.749727189540863, + "learning_rate": 0.0001961950069588241, + "loss": 2.8049, + "step": 1853 + }, + { + "epoch": 0.14962472762488904, + "grad_norm": 0.7886502742767334, + "learning_rate": 0.00019619069238027803, + "loss": 2.7521, + "step": 1854 + }, + { + "epoch": 0.14970543136147205, + "grad_norm": 0.816137433052063, + "learning_rate": 0.00019618637540440848, + "loss": 2.8383, + "step": 1855 + }, + { + "epoch": 0.14978613509805505, + "grad_norm": 0.80442214012146, + "learning_rate": 0.000196182056031323, + "loss": 2.7227, + "step": 1856 + }, + { + "epoch": 0.14986683883463806, + "grad_norm": 0.7605221271514893, + "learning_rate": 0.00019617773426112924, + "loss": 2.7494, + "step": 1857 + }, + { + "epoch": 0.14994754257122106, + "grad_norm": 0.8745137453079224, + "learning_rate": 0.00019617341009393497, + "loss": 2.6978, + "step": 1858 + }, + { + "epoch": 0.15002824630780406, + "grad_norm": 0.8151741623878479, + "learning_rate": 0.00019616908352984789, + "loss": 2.7817, + "step": 1859 + }, + { + "epoch": 0.15010895004438707, + "grad_norm": 0.773876428604126, + "learning_rate": 0.0001961647545689759, + "loss": 2.812, + "step": 1860 + }, + { + "epoch": 0.15018965378097007, + "grad_norm": 0.8216966390609741, + "learning_rate": 0.00019616042321142683, + "loss": 2.8181, + "step": 1861 + }, + { + "epoch": 0.15027035751755305, + "grad_norm": 0.8097409605979919, + "learning_rate": 0.00019615608945730862, + "loss": 2.8336, + "step": 1862 + }, + { + "epoch": 0.15035106125413605, + "grad_norm": 0.8085697293281555, + "learning_rate": 
0.00019615175330672932, + "loss": 2.8176, + "step": 1863 + }, + { + "epoch": 0.15043176499071906, + "grad_norm": 0.7658133506774902, + "learning_rate": 0.00019614741475979701, + "loss": 2.7543, + "step": 1864 + }, + { + "epoch": 0.15051246872730206, + "grad_norm": 0.7193909883499146, + "learning_rate": 0.00019614307381661978, + "loss": 2.7475, + "step": 1865 + }, + { + "epoch": 0.15059317246388507, + "grad_norm": 0.835608959197998, + "learning_rate": 0.0001961387304773058, + "loss": 2.8017, + "step": 1866 + }, + { + "epoch": 0.15067387620046807, + "grad_norm": 0.7898489832878113, + "learning_rate": 0.0001961343847419634, + "loss": 2.7613, + "step": 1867 + }, + { + "epoch": 0.15075457993705108, + "grad_norm": 0.8031982183456421, + "learning_rate": 0.0001961300366107008, + "loss": 2.7442, + "step": 1868 + }, + { + "epoch": 0.15083528367363408, + "grad_norm": 0.8427363634109497, + "learning_rate": 0.00019612568608362642, + "loss": 2.8095, + "step": 1869 + }, + { + "epoch": 0.15091598741021708, + "grad_norm": 0.8282802700996399, + "learning_rate": 0.00019612133316084863, + "loss": 2.7216, + "step": 1870 + }, + { + "epoch": 0.1509966911468001, + "grad_norm": 0.7799758911132812, + "learning_rate": 0.000196116977842476, + "loss": 2.793, + "step": 1871 + }, + { + "epoch": 0.1510773948833831, + "grad_norm": 0.8151525259017944, + "learning_rate": 0.00019611262012861702, + "loss": 2.7641, + "step": 1872 + }, + { + "epoch": 0.1511580986199661, + "grad_norm": 0.7926812767982483, + "learning_rate": 0.0001961082600193803, + "loss": 2.7523, + "step": 1873 + }, + { + "epoch": 0.1512388023565491, + "grad_norm": 0.8737135529518127, + "learning_rate": 0.0001961038975148745, + "loss": 2.7965, + "step": 1874 + }, + { + "epoch": 0.1513195060931321, + "grad_norm": 0.7948090434074402, + "learning_rate": 0.00019609953261520837, + "loss": 2.7737, + "step": 1875 + }, + { + "epoch": 0.1514002098297151, + "grad_norm": 0.8161277770996094, + "learning_rate": 0.0001960951653204907, + "loss": 
2.7423, + "step": 1876 + }, + { + "epoch": 0.15148091356629811, + "grad_norm": 0.8904973864555359, + "learning_rate": 0.00019609079563083026, + "loss": 2.7066, + "step": 1877 + }, + { + "epoch": 0.15156161730288112, + "grad_norm": 0.8107061982154846, + "learning_rate": 0.00019608642354633604, + "loss": 2.7939, + "step": 1878 + }, + { + "epoch": 0.15164232103946412, + "grad_norm": 0.8410987854003906, + "learning_rate": 0.00019608204906711694, + "loss": 2.7521, + "step": 1879 + }, + { + "epoch": 0.15172302477604713, + "grad_norm": 0.8336483836174011, + "learning_rate": 0.0001960776721932821, + "loss": 2.7613, + "step": 1880 + }, + { + "epoch": 0.15180372851263013, + "grad_norm": 0.730549156665802, + "learning_rate": 0.00019607329292494044, + "loss": 2.8019, + "step": 1881 + }, + { + "epoch": 0.15188443224921314, + "grad_norm": 0.7543070912361145, + "learning_rate": 0.0001960689112622012, + "loss": 2.6907, + "step": 1882 + }, + { + "epoch": 0.15196513598579614, + "grad_norm": 0.848414421081543, + "learning_rate": 0.00019606452720517359, + "loss": 2.7278, + "step": 1883 + }, + { + "epoch": 0.15204583972237914, + "grad_norm": 0.8331718444824219, + "learning_rate": 0.00019606014075396682, + "loss": 2.6994, + "step": 1884 + }, + { + "epoch": 0.15212654345896215, + "grad_norm": 0.9192764759063721, + "learning_rate": 0.00019605575190869025, + "loss": 2.7095, + "step": 1885 + }, + { + "epoch": 0.15220724719554515, + "grad_norm": 0.8377116322517395, + "learning_rate": 0.00019605136066945324, + "loss": 2.7925, + "step": 1886 + }, + { + "epoch": 0.15228795093212816, + "grad_norm": 0.7302869558334351, + "learning_rate": 0.00019604696703636525, + "loss": 2.7286, + "step": 1887 + }, + { + "epoch": 0.15236865466871116, + "grad_norm": 0.7972438335418701, + "learning_rate": 0.00019604257100953577, + "loss": 2.7732, + "step": 1888 + }, + { + "epoch": 0.15244935840529417, + "grad_norm": 1.0350826978683472, + "learning_rate": 0.00019603817258907435, + "loss": 2.8211, + "step": 1889 + }, 
+ { + "epoch": 0.15253006214187717, + "grad_norm": 0.782755970954895, + "learning_rate": 0.00019603377177509067, + "loss": 2.8489, + "step": 1890 + }, + { + "epoch": 0.15261076587846018, + "grad_norm": 0.9072603583335876, + "learning_rate": 0.0001960293685676943, + "loss": 2.7764, + "step": 1891 + }, + { + "epoch": 0.15269146961504318, + "grad_norm": 0.7878704071044922, + "learning_rate": 0.0001960249629669951, + "loss": 2.7494, + "step": 1892 + }, + { + "epoch": 0.15277217335162618, + "grad_norm": 0.8770418167114258, + "learning_rate": 0.00019602055497310278, + "loss": 2.7318, + "step": 1893 + }, + { + "epoch": 0.1528528770882092, + "grad_norm": 0.8004975914955139, + "learning_rate": 0.00019601614458612723, + "loss": 2.7272, + "step": 1894 + }, + { + "epoch": 0.1529335808247922, + "grad_norm": 0.8511070013046265, + "learning_rate": 0.00019601173180617835, + "loss": 2.7876, + "step": 1895 + }, + { + "epoch": 0.1530142845613752, + "grad_norm": 0.7946128845214844, + "learning_rate": 0.00019600731663336617, + "loss": 2.7435, + "step": 1896 + }, + { + "epoch": 0.1530949882979582, + "grad_norm": 0.8155317902565002, + "learning_rate": 0.00019600289906780067, + "loss": 2.7642, + "step": 1897 + }, + { + "epoch": 0.1531756920345412, + "grad_norm": 0.8086098432540894, + "learning_rate": 0.000195998479109592, + "loss": 2.7358, + "step": 1898 + }, + { + "epoch": 0.1532563957711242, + "grad_norm": 0.8698278665542603, + "learning_rate": 0.00019599405675885026, + "loss": 2.725, + "step": 1899 + }, + { + "epoch": 0.15333709950770721, + "grad_norm": 0.8756006360054016, + "learning_rate": 0.00019598963201568573, + "loss": 2.7209, + "step": 1900 + }, + { + "epoch": 0.15341780324429022, + "grad_norm": 0.7984628081321716, + "learning_rate": 0.0001959852048802086, + "loss": 2.7685, + "step": 1901 + }, + { + "epoch": 0.15349850698087322, + "grad_norm": 0.8244056105613708, + "learning_rate": 0.0001959807753525293, + "loss": 2.7692, + "step": 1902 + }, + { + "epoch": 0.15357921071745623, + 
"grad_norm": 0.8577731251716614, + "learning_rate": 0.00019597634343275814, + "loss": 2.7571, + "step": 1903 + }, + { + "epoch": 0.15365991445403923, + "grad_norm": 0.8410975933074951, + "learning_rate": 0.00019597190912100566, + "loss": 2.7862, + "step": 1904 + }, + { + "epoch": 0.15374061819062224, + "grad_norm": 0.9094158411026001, + "learning_rate": 0.0001959674724173823, + "loss": 2.7655, + "step": 1905 + }, + { + "epoch": 0.15382132192720524, + "grad_norm": 0.8375208973884583, + "learning_rate": 0.00019596303332199868, + "loss": 2.8129, + "step": 1906 + }, + { + "epoch": 0.15390202566378824, + "grad_norm": 0.8335977792739868, + "learning_rate": 0.00019595859183496543, + "loss": 2.7835, + "step": 1907 + }, + { + "epoch": 0.15398272940037125, + "grad_norm": 0.7973531484603882, + "learning_rate": 0.0001959541479563932, + "loss": 2.7785, + "step": 1908 + }, + { + "epoch": 0.15406343313695425, + "grad_norm": 0.7808824181556702, + "learning_rate": 0.0001959497016863928, + "loss": 2.7862, + "step": 1909 + }, + { + "epoch": 0.15414413687353726, + "grad_norm": 0.853824257850647, + "learning_rate": 0.00019594525302507504, + "loss": 2.6721, + "step": 1910 + }, + { + "epoch": 0.15422484061012026, + "grad_norm": 0.8589324355125427, + "learning_rate": 0.00019594080197255073, + "loss": 2.7948, + "step": 1911 + }, + { + "epoch": 0.15430554434670327, + "grad_norm": 0.7951898574829102, + "learning_rate": 0.00019593634852893086, + "loss": 2.7903, + "step": 1912 + }, + { + "epoch": 0.15438624808328624, + "grad_norm": 0.8333349227905273, + "learning_rate": 0.0001959318926943264, + "loss": 2.8073, + "step": 1913 + }, + { + "epoch": 0.15446695181986925, + "grad_norm": 0.8552380800247192, + "learning_rate": 0.0001959274344688484, + "loss": 2.8199, + "step": 1914 + }, + { + "epoch": 0.15454765555645225, + "grad_norm": 0.8356214165687561, + "learning_rate": 0.000195922973852608, + "loss": 2.7985, + "step": 1915 + }, + { + "epoch": 0.15462835929303526, + "grad_norm": 
0.7167248725891113, + "learning_rate": 0.00019591851084571634, + "loss": 2.6802, + "step": 1916 + }, + { + "epoch": 0.15470906302961826, + "grad_norm": 0.7980726361274719, + "learning_rate": 0.00019591404544828464, + "loss": 2.692, + "step": 1917 + }, + { + "epoch": 0.15478976676620126, + "grad_norm": 0.7766004800796509, + "learning_rate": 0.00019590957766042424, + "loss": 2.7219, + "step": 1918 + }, + { + "epoch": 0.15487047050278427, + "grad_norm": 0.828852653503418, + "learning_rate": 0.0001959051074822464, + "loss": 2.7369, + "step": 1919 + }, + { + "epoch": 0.15495117423936727, + "grad_norm": 0.7818129062652588, + "learning_rate": 0.0001959006349138626, + "loss": 2.7778, + "step": 1920 + }, + { + "epoch": 0.15503187797595028, + "grad_norm": 0.8428593873977661, + "learning_rate": 0.00019589615995538432, + "loss": 2.8257, + "step": 1921 + }, + { + "epoch": 0.15511258171253328, + "grad_norm": 0.8756616115570068, + "learning_rate": 0.00019589168260692307, + "loss": 2.7692, + "step": 1922 + }, + { + "epoch": 0.15519328544911629, + "grad_norm": 0.7802519202232361, + "learning_rate": 0.0001958872028685904, + "loss": 2.7811, + "step": 1923 + }, + { + "epoch": 0.1552739891856993, + "grad_norm": 0.7787032723426819, + "learning_rate": 0.00019588272074049797, + "loss": 2.7546, + "step": 1924 + }, + { + "epoch": 0.1553546929222823, + "grad_norm": 0.848479151725769, + "learning_rate": 0.0001958782362227575, + "loss": 2.7759, + "step": 1925 + }, + { + "epoch": 0.1554353966588653, + "grad_norm": 0.8331353664398193, + "learning_rate": 0.00019587374931548076, + "loss": 2.7881, + "step": 1926 + }, + { + "epoch": 0.1555161003954483, + "grad_norm": 0.8646424412727356, + "learning_rate": 0.00019586926001877958, + "loss": 2.8059, + "step": 1927 + }, + { + "epoch": 0.1555968041320313, + "grad_norm": 0.912253737449646, + "learning_rate": 0.00019586476833276584, + "loss": 2.7446, + "step": 1928 + }, + { + "epoch": 0.1556775078686143, + "grad_norm": 0.9256471395492554, + 
"learning_rate": 0.00019586027425755147, + "loss": 2.8, + "step": 1929 + }, + { + "epoch": 0.15575821160519732, + "grad_norm": 1.0984607934951782, + "learning_rate": 0.0001958557777932485, + "loss": 2.7759, + "step": 1930 + }, + { + "epoch": 0.15583891534178032, + "grad_norm": 0.8736081123352051, + "learning_rate": 0.00019585127893996895, + "loss": 2.7464, + "step": 1931 + }, + { + "epoch": 0.15591961907836333, + "grad_norm": 0.932538628578186, + "learning_rate": 0.00019584677769782498, + "loss": 2.7874, + "step": 1932 + }, + { + "epoch": 0.15600032281494633, + "grad_norm": 0.9742087125778198, + "learning_rate": 0.0001958422740669288, + "loss": 2.7727, + "step": 1933 + }, + { + "epoch": 0.15608102655152933, + "grad_norm": 0.8975874781608582, + "learning_rate": 0.00019583776804739256, + "loss": 2.7812, + "step": 1934 + }, + { + "epoch": 0.15616173028811234, + "grad_norm": 0.9380232691764832, + "learning_rate": 0.00019583325963932864, + "loss": 2.7284, + "step": 1935 + }, + { + "epoch": 0.15624243402469534, + "grad_norm": 0.8332872986793518, + "learning_rate": 0.00019582874884284938, + "loss": 2.7792, + "step": 1936 + }, + { + "epoch": 0.15632313776127835, + "grad_norm": 1.0017194747924805, + "learning_rate": 0.0001958242356580672, + "loss": 2.7187, + "step": 1937 + }, + { + "epoch": 0.15640384149786135, + "grad_norm": 0.9433515667915344, + "learning_rate": 0.0001958197200850946, + "loss": 2.8394, + "step": 1938 + }, + { + "epoch": 0.15648454523444436, + "grad_norm": 0.8781030178070068, + "learning_rate": 0.00019581520212404407, + "loss": 2.7667, + "step": 1939 + }, + { + "epoch": 0.15656524897102736, + "grad_norm": 0.895656168460846, + "learning_rate": 0.00019581068177502826, + "loss": 2.799, + "step": 1940 + }, + { + "epoch": 0.15664595270761036, + "grad_norm": 0.8336960673332214, + "learning_rate": 0.0001958061590381598, + "loss": 2.8152, + "step": 1941 + }, + { + "epoch": 0.15672665644419337, + "grad_norm": 0.9184536337852478, + "learning_rate": 
0.00019580163391355143, + "loss": 2.7746, + "step": 1942 + }, + { + "epoch": 0.15680736018077637, + "grad_norm": 0.8564908504486084, + "learning_rate": 0.00019579710640131587, + "loss": 2.7674, + "step": 1943 + }, + { + "epoch": 0.15688806391735938, + "grad_norm": 0.7491608262062073, + "learning_rate": 0.00019579257650156605, + "loss": 2.7665, + "step": 1944 + }, + { + "epoch": 0.15696876765394238, + "grad_norm": 0.9165031313896179, + "learning_rate": 0.00019578804421441478, + "loss": 2.7343, + "step": 1945 + }, + { + "epoch": 0.15704947139052539, + "grad_norm": 0.8413978815078735, + "learning_rate": 0.00019578350953997512, + "loss": 2.7503, + "step": 1946 + }, + { + "epoch": 0.1571301751271084, + "grad_norm": 0.7820419073104858, + "learning_rate": 0.00019577897247835993, + "loss": 2.7535, + "step": 1947 + }, + { + "epoch": 0.1572108788636914, + "grad_norm": 0.8134996294975281, + "learning_rate": 0.00019577443302968246, + "loss": 2.7504, + "step": 1948 + }, + { + "epoch": 0.1572915826002744, + "grad_norm": 0.8201301097869873, + "learning_rate": 0.00019576989119405574, + "loss": 2.6927, + "step": 1949 + }, + { + "epoch": 0.1573722863368574, + "grad_norm": 0.8343217372894287, + "learning_rate": 0.00019576534697159296, + "loss": 2.7742, + "step": 1950 + }, + { + "epoch": 0.1574529900734404, + "grad_norm": 0.8161751627922058, + "learning_rate": 0.0001957608003624074, + "loss": 2.8236, + "step": 1951 + }, + { + "epoch": 0.1575336938100234, + "grad_norm": 0.8626808524131775, + "learning_rate": 0.00019575625136661242, + "loss": 2.7305, + "step": 1952 + }, + { + "epoch": 0.15761439754660642, + "grad_norm": 0.8238986730575562, + "learning_rate": 0.0001957516999843213, + "loss": 2.7641, + "step": 1953 + }, + { + "epoch": 0.15769510128318942, + "grad_norm": 0.7806095480918884, + "learning_rate": 0.00019574714621564755, + "loss": 2.7155, + "step": 1954 + }, + { + "epoch": 0.15777580501977242, + "grad_norm": 0.8137761950492859, + "learning_rate": 0.0001957425900607046, + 
"loss": 2.7529, + "step": 1955 + }, + { + "epoch": 0.15785650875635543, + "grad_norm": 0.8383988738059998, + "learning_rate": 0.00019573803151960606, + "loss": 2.7726, + "step": 1956 + }, + { + "epoch": 0.15793721249293843, + "grad_norm": 0.8734413385391235, + "learning_rate": 0.00019573347059246549, + "loss": 2.8563, + "step": 1957 + }, + { + "epoch": 0.15801791622952144, + "grad_norm": 0.8018438816070557, + "learning_rate": 0.0001957289072793966, + "loss": 2.8031, + "step": 1958 + }, + { + "epoch": 0.15809861996610444, + "grad_norm": 0.8175764083862305, + "learning_rate": 0.0001957243415805131, + "loss": 2.7824, + "step": 1959 + }, + { + "epoch": 0.15817932370268745, + "grad_norm": 0.7642164826393127, + "learning_rate": 0.00019571977349592878, + "loss": 2.7666, + "step": 1960 + }, + { + "epoch": 0.15826002743927045, + "grad_norm": 0.7584841847419739, + "learning_rate": 0.0001957152030257575, + "loss": 2.7211, + "step": 1961 + }, + { + "epoch": 0.15834073117585346, + "grad_norm": 0.822610080242157, + "learning_rate": 0.00019571063017011312, + "loss": 2.7025, + "step": 1962 + }, + { + "epoch": 0.15842143491243646, + "grad_norm": 0.7553817629814148, + "learning_rate": 0.00019570605492910968, + "loss": 2.8122, + "step": 1963 + }, + { + "epoch": 0.15850213864901944, + "grad_norm": 0.7224497199058533, + "learning_rate": 0.0001957014773028612, + "loss": 2.7613, + "step": 1964 + }, + { + "epoch": 0.15858284238560244, + "grad_norm": 0.8563623428344727, + "learning_rate": 0.00019569689729148168, + "loss": 2.8005, + "step": 1965 + }, + { + "epoch": 0.15866354612218544, + "grad_norm": 0.7665508389472961, + "learning_rate": 0.00019569231489508537, + "loss": 2.7387, + "step": 1966 + }, + { + "epoch": 0.15874424985876845, + "grad_norm": 0.7788479328155518, + "learning_rate": 0.0001956877301137864, + "loss": 2.7229, + "step": 1967 + }, + { + "epoch": 0.15882495359535145, + "grad_norm": 0.7326748371124268, + "learning_rate": 0.00019568314294769908, + "loss": 2.7728, + "step": 
1968 + }, + { + "epoch": 0.15890565733193446, + "grad_norm": 0.790492594242096, + "learning_rate": 0.00019567855339693772, + "loss": 2.7809, + "step": 1969 + }, + { + "epoch": 0.15898636106851746, + "grad_norm": 0.8026898503303528, + "learning_rate": 0.0001956739614616167, + "loss": 2.7267, + "step": 1970 + }, + { + "epoch": 0.15906706480510047, + "grad_norm": 0.7963770627975464, + "learning_rate": 0.00019566936714185046, + "loss": 2.7161, + "step": 1971 + }, + { + "epoch": 0.15914776854168347, + "grad_norm": 0.7708200216293335, + "learning_rate": 0.00019566477043775354, + "loss": 2.7223, + "step": 1972 + }, + { + "epoch": 0.15922847227826648, + "grad_norm": 0.8036624789237976, + "learning_rate": 0.00019566017134944042, + "loss": 2.7644, + "step": 1973 + }, + { + "epoch": 0.15930917601484948, + "grad_norm": 0.8221341967582703, + "learning_rate": 0.00019565556987702581, + "loss": 2.7629, + "step": 1974 + }, + { + "epoch": 0.15938987975143248, + "grad_norm": 0.7685462832450867, + "learning_rate": 0.00019565096602062435, + "loss": 2.8016, + "step": 1975 + }, + { + "epoch": 0.1594705834880155, + "grad_norm": 0.8173574209213257, + "learning_rate": 0.00019564635978035075, + "loss": 2.761, + "step": 1976 + }, + { + "epoch": 0.1595512872245985, + "grad_norm": 0.7567519545555115, + "learning_rate": 0.00019564175115631988, + "loss": 2.7794, + "step": 1977 + }, + { + "epoch": 0.1596319909611815, + "grad_norm": 0.8754587173461914, + "learning_rate": 0.00019563714014864654, + "loss": 2.7769, + "step": 1978 + }, + { + "epoch": 0.1597126946977645, + "grad_norm": 0.753871738910675, + "learning_rate": 0.00019563252675744569, + "loss": 2.7489, + "step": 1979 + }, + { + "epoch": 0.1597933984343475, + "grad_norm": 0.777103841304779, + "learning_rate": 0.00019562791098283225, + "loss": 2.7667, + "step": 1980 + }, + { + "epoch": 0.1598741021709305, + "grad_norm": 0.8227293491363525, + "learning_rate": 0.00019562329282492131, + "loss": 2.7904, + "step": 1981 + }, + { + "epoch": 
0.15995480590751351, + "grad_norm": 0.7595541477203369, + "learning_rate": 0.00019561867228382797, + "loss": 2.7654, + "step": 1982 + }, + { + "epoch": 0.16003550964409652, + "grad_norm": 0.8330550789833069, + "learning_rate": 0.00019561404935966733, + "loss": 2.7533, + "step": 1983 + }, + { + "epoch": 0.16011621338067952, + "grad_norm": 0.8213297128677368, + "learning_rate": 0.0001956094240525547, + "loss": 2.8103, + "step": 1984 + }, + { + "epoch": 0.16019691711726253, + "grad_norm": 0.8046056628227234, + "learning_rate": 0.00019560479636260527, + "loss": 2.7666, + "step": 1985 + }, + { + "epoch": 0.16027762085384553, + "grad_norm": 0.7886037230491638, + "learning_rate": 0.0001956001662899344, + "loss": 2.7066, + "step": 1986 + }, + { + "epoch": 0.16035832459042854, + "grad_norm": 0.8300043940544128, + "learning_rate": 0.00019559553383465748, + "loss": 2.7617, + "step": 1987 + }, + { + "epoch": 0.16043902832701154, + "grad_norm": 0.7963815331459045, + "learning_rate": 0.00019559089899688994, + "loss": 2.6891, + "step": 1988 + }, + { + "epoch": 0.16051973206359454, + "grad_norm": 0.7794002294540405, + "learning_rate": 0.00019558626177674734, + "loss": 2.8012, + "step": 1989 + }, + { + "epoch": 0.16060043580017755, + "grad_norm": 0.8345863819122314, + "learning_rate": 0.00019558162217434526, + "loss": 2.7715, + "step": 1990 + }, + { + "epoch": 0.16068113953676055, + "grad_norm": 0.8883393406867981, + "learning_rate": 0.00019557698018979927, + "loss": 2.7863, + "step": 1991 + }, + { + "epoch": 0.16076184327334356, + "grad_norm": 0.8069450259208679, + "learning_rate": 0.0001955723358232251, + "loss": 2.759, + "step": 1992 + }, + { + "epoch": 0.16084254700992656, + "grad_norm": 0.9014191031455994, + "learning_rate": 0.00019556768907473852, + "loss": 2.711, + "step": 1993 + }, + { + "epoch": 0.16092325074650957, + "grad_norm": 0.8429470658302307, + "learning_rate": 0.0001955630399444553, + "loss": 2.6936, + "step": 1994 + }, + { + "epoch": 0.16100395448309257, + 
"grad_norm": 0.7859500050544739, + "learning_rate": 0.00019555838843249128, + "loss": 2.7343, + "step": 1995 + }, + { + "epoch": 0.16108465821967557, + "grad_norm": 0.8068249821662903, + "learning_rate": 0.00019555373453896245, + "loss": 2.7492, + "step": 1996 + }, + { + "epoch": 0.16116536195625858, + "grad_norm": 0.8194023370742798, + "learning_rate": 0.00019554907826398478, + "loss": 2.7265, + "step": 1997 + }, + { + "epoch": 0.16124606569284158, + "grad_norm": 0.8139404654502869, + "learning_rate": 0.00019554441960767434, + "loss": 2.7311, + "step": 1998 + }, + { + "epoch": 0.1613267694294246, + "grad_norm": 0.8210673928260803, + "learning_rate": 0.00019553975857014718, + "loss": 2.7095, + "step": 1999 + }, + { + "epoch": 0.1614074731660076, + "grad_norm": 0.8615561723709106, + "learning_rate": 0.0001955350951515195, + "loss": 2.7458, + "step": 2000 + }, + { + "epoch": 0.1614074731660076, + "eval_loss": 2.6739437580108643, + "eval_runtime": 813.8274, + "eval_samples_per_second": 3.219, + "eval_steps_per_second": 0.537, + "step": 2000 + }, + { + "epoch": 0.1614881769025906, + "grad_norm": 0.8945594429969788, + "learning_rate": 0.0001955304293519075, + "loss": 2.776, + "step": 2001 + }, + { + "epoch": 0.1615688806391736, + "grad_norm": 0.7943438291549683, + "learning_rate": 0.00019552576117142748, + "loss": 2.7484, + "step": 2002 + }, + { + "epoch": 0.1616495843757566, + "grad_norm": 0.8264374136924744, + "learning_rate": 0.00019552109061019582, + "loss": 2.7725, + "step": 2003 + }, + { + "epoch": 0.1617302881123396, + "grad_norm": 0.7591681480407715, + "learning_rate": 0.00019551641766832887, + "loss": 2.7217, + "step": 2004 + }, + { + "epoch": 0.16181099184892261, + "grad_norm": 0.8275293707847595, + "learning_rate": 0.0001955117423459431, + "loss": 2.7279, + "step": 2005 + }, + { + "epoch": 0.16189169558550562, + "grad_norm": 0.8109650611877441, + "learning_rate": 0.00019550706464315504, + "loss": 2.8111, + "step": 2006 + }, + { + "epoch": 0.16197239932208862, 
+ "grad_norm": 0.8710397481918335, + "learning_rate": 0.00019550238456008127, + "loss": 2.7166, + "step": 2007 + }, + { + "epoch": 0.16205310305867163, + "grad_norm": 0.8569270968437195, + "learning_rate": 0.00019549770209683845, + "loss": 2.7739, + "step": 2008 + }, + { + "epoch": 0.16213380679525463, + "grad_norm": 0.7927817702293396, + "learning_rate": 0.00019549301725354325, + "loss": 2.7154, + "step": 2009 + }, + { + "epoch": 0.16221451053183764, + "grad_norm": 0.7576590776443481, + "learning_rate": 0.00019548833003031244, + "loss": 2.7276, + "step": 2010 + }, + { + "epoch": 0.16229521426842064, + "grad_norm": 0.8092780709266663, + "learning_rate": 0.00019548364042726283, + "loss": 2.7494, + "step": 2011 + }, + { + "epoch": 0.16237591800500364, + "grad_norm": 0.7643424868583679, + "learning_rate": 0.0001954789484445113, + "loss": 2.7877, + "step": 2012 + }, + { + "epoch": 0.16245662174158665, + "grad_norm": 0.8235166072845459, + "learning_rate": 0.0001954742540821748, + "loss": 2.7884, + "step": 2013 + }, + { + "epoch": 0.16253732547816965, + "grad_norm": 0.9297853708267212, + "learning_rate": 0.00019546955734037034, + "loss": 2.765, + "step": 2014 + }, + { + "epoch": 0.16261802921475263, + "grad_norm": 0.7778275609016418, + "learning_rate": 0.0001954648582192149, + "loss": 2.7178, + "step": 2015 + }, + { + "epoch": 0.16269873295133563, + "grad_norm": 0.8767017126083374, + "learning_rate": 0.00019546015671882566, + "loss": 2.8254, + "step": 2016 + }, + { + "epoch": 0.16277943668791864, + "grad_norm": 0.7870603203773499, + "learning_rate": 0.0001954554528393198, + "loss": 2.797, + "step": 2017 + }, + { + "epoch": 0.16286014042450164, + "grad_norm": 0.8112391233444214, + "learning_rate": 0.00019545074658081454, + "loss": 2.8562, + "step": 2018 + }, + { + "epoch": 0.16294084416108465, + "grad_norm": 0.8216677308082581, + "learning_rate": 0.00019544603794342713, + "loss": 2.7894, + "step": 2019 + }, + { + "epoch": 0.16302154789766765, + "grad_norm": 
0.8445515632629395, + "learning_rate": 0.00019544132692727497, + "loss": 2.8618, + "step": 2020 + }, + { + "epoch": 0.16310225163425066, + "grad_norm": 0.8275444507598877, + "learning_rate": 0.00019543661353247548, + "loss": 2.8087, + "step": 2021 + }, + { + "epoch": 0.16318295537083366, + "grad_norm": 0.8142833709716797, + "learning_rate": 0.00019543189775914608, + "loss": 2.8075, + "step": 2022 + }, + { + "epoch": 0.16326365910741666, + "grad_norm": 0.8182976245880127, + "learning_rate": 0.0001954271796074043, + "loss": 2.8312, + "step": 2023 + }, + { + "epoch": 0.16334436284399967, + "grad_norm": 0.7629228234291077, + "learning_rate": 0.0001954224590773678, + "loss": 2.7191, + "step": 2024 + }, + { + "epoch": 0.16342506658058267, + "grad_norm": 0.8630000948905945, + "learning_rate": 0.00019541773616915418, + "loss": 2.8013, + "step": 2025 + }, + { + "epoch": 0.16350577031716568, + "grad_norm": 0.8917906880378723, + "learning_rate": 0.00019541301088288115, + "loss": 2.7573, + "step": 2026 + }, + { + "epoch": 0.16358647405374868, + "grad_norm": 0.8641694188117981, + "learning_rate": 0.00019540828321866648, + "loss": 2.7509, + "step": 2027 + }, + { + "epoch": 0.16366717779033169, + "grad_norm": 0.7687639594078064, + "learning_rate": 0.00019540355317662798, + "loss": 2.7266, + "step": 2028 + }, + { + "epoch": 0.1637478815269147, + "grad_norm": 0.7870400547981262, + "learning_rate": 0.00019539882075688355, + "loss": 2.8217, + "step": 2029 + }, + { + "epoch": 0.1638285852634977, + "grad_norm": 0.9373054504394531, + "learning_rate": 0.0001953940859595511, + "loss": 2.7562, + "step": 2030 + }, + { + "epoch": 0.1639092890000807, + "grad_norm": 0.7941255569458008, + "learning_rate": 0.00019538934878474872, + "loss": 2.7553, + "step": 2031 + }, + { + "epoch": 0.1639899927366637, + "grad_norm": 0.735977053642273, + "learning_rate": 0.00019538460923259438, + "loss": 2.7058, + "step": 2032 + }, + { + "epoch": 0.1640706964732467, + "grad_norm": 0.7812782526016235, + 
"learning_rate": 0.00019537986730320625, + "loss": 2.7885, + "step": 2033 + }, + { + "epoch": 0.1641514002098297, + "grad_norm": 1.1534128189086914, + "learning_rate": 0.0001953751229967025, + "loss": 2.7139, + "step": 2034 + }, + { + "epoch": 0.16423210394641272, + "grad_norm": 0.9139814972877502, + "learning_rate": 0.00019537037631320135, + "loss": 2.7869, + "step": 2035 + }, + { + "epoch": 0.16431280768299572, + "grad_norm": 0.8330421447753906, + "learning_rate": 0.00019536562725282116, + "loss": 2.7491, + "step": 2036 + }, + { + "epoch": 0.16439351141957873, + "grad_norm": 0.9040594696998596, + "learning_rate": 0.00019536087581568026, + "loss": 2.7637, + "step": 2037 + }, + { + "epoch": 0.16447421515616173, + "grad_norm": 0.9158666729927063, + "learning_rate": 0.00019535612200189705, + "loss": 2.7709, + "step": 2038 + }, + { + "epoch": 0.16455491889274473, + "grad_norm": 0.8668088912963867, + "learning_rate": 0.00019535136581158997, + "loss": 2.7994, + "step": 2039 + }, + { + "epoch": 0.16463562262932774, + "grad_norm": 0.9179345369338989, + "learning_rate": 0.00019534660724487764, + "loss": 2.747, + "step": 2040 + }, + { + "epoch": 0.16471632636591074, + "grad_norm": 0.9690881967544556, + "learning_rate": 0.00019534184630187862, + "loss": 2.742, + "step": 2041 + }, + { + "epoch": 0.16479703010249375, + "grad_norm": 0.8478729724884033, + "learning_rate": 0.00019533708298271157, + "loss": 2.7824, + "step": 2042 + }, + { + "epoch": 0.16487773383907675, + "grad_norm": 0.8286584615707397, + "learning_rate": 0.00019533231728749518, + "loss": 2.7263, + "step": 2043 + }, + { + "epoch": 0.16495843757565976, + "grad_norm": 0.8095324039459229, + "learning_rate": 0.00019532754921634826, + "loss": 2.7845, + "step": 2044 + }, + { + "epoch": 0.16503914131224276, + "grad_norm": 0.9552872776985168, + "learning_rate": 0.0001953227787693896, + "loss": 2.7676, + "step": 2045 + }, + { + "epoch": 0.16511984504882576, + "grad_norm": 1.021515130996704, + "learning_rate": 
0.00019531800594673815, + "loss": 2.784, + "step": 2046 + }, + { + "epoch": 0.16520054878540877, + "grad_norm": 0.7847293019294739, + "learning_rate": 0.00019531323074851276, + "loss": 2.7319, + "step": 2047 + }, + { + "epoch": 0.16528125252199177, + "grad_norm": 0.7803899049758911, + "learning_rate": 0.0001953084531748326, + "loss": 2.8321, + "step": 2048 + }, + { + "epoch": 0.16536195625857478, + "grad_norm": 0.8687692880630493, + "learning_rate": 0.0001953036732258166, + "loss": 2.763, + "step": 2049 + }, + { + "epoch": 0.16544265999515778, + "grad_norm": 0.8212031126022339, + "learning_rate": 0.00019529889090158392, + "loss": 2.7262, + "step": 2050 + }, + { + "epoch": 0.16552336373174079, + "grad_norm": 0.8460689187049866, + "learning_rate": 0.0001952941062022538, + "loss": 2.8018, + "step": 2051 + }, + { + "epoch": 0.1656040674683238, + "grad_norm": 0.9189361929893494, + "learning_rate": 0.00019528931912794547, + "loss": 2.8079, + "step": 2052 + }, + { + "epoch": 0.1656847712049068, + "grad_norm": 0.9529987573623657, + "learning_rate": 0.00019528452967877816, + "loss": 2.8015, + "step": 2053 + }, + { + "epoch": 0.1657654749414898, + "grad_norm": 0.8468493223190308, + "learning_rate": 0.00019527973785487133, + "loss": 2.8013, + "step": 2054 + }, + { + "epoch": 0.1658461786780728, + "grad_norm": 0.8150945901870728, + "learning_rate": 0.00019527494365634436, + "loss": 2.7975, + "step": 2055 + }, + { + "epoch": 0.1659268824146558, + "grad_norm": 0.814942479133606, + "learning_rate": 0.00019527014708331674, + "loss": 2.7503, + "step": 2056 + }, + { + "epoch": 0.1660075861512388, + "grad_norm": 0.7841517329216003, + "learning_rate": 0.000195265348135908, + "loss": 2.7921, + "step": 2057 + }, + { + "epoch": 0.16608828988782182, + "grad_norm": 0.7603738903999329, + "learning_rate": 0.0001952605468142378, + "loss": 2.7658, + "step": 2058 + }, + { + "epoch": 0.16616899362440482, + "grad_norm": 0.8460882902145386, + "learning_rate": 0.00019525574311842574, + "loss": 
2.7644, + "step": 2059 + }, + { + "epoch": 0.16624969736098782, + "grad_norm": 0.8633555173873901, + "learning_rate": 0.00019525093704859156, + "loss": 2.7956, + "step": 2060 + }, + { + "epoch": 0.16633040109757083, + "grad_norm": 0.7700977325439453, + "learning_rate": 0.00019524612860485503, + "loss": 2.7103, + "step": 2061 + }, + { + "epoch": 0.16641110483415383, + "grad_norm": 0.888770580291748, + "learning_rate": 0.00019524131778733602, + "loss": 2.7325, + "step": 2062 + }, + { + "epoch": 0.16649180857073684, + "grad_norm": 0.8338149189949036, + "learning_rate": 0.00019523650459615438, + "loss": 2.7533, + "step": 2063 + }, + { + "epoch": 0.16657251230731984, + "grad_norm": 0.7723987698554993, + "learning_rate": 0.0001952316890314301, + "loss": 2.7316, + "step": 2064 + }, + { + "epoch": 0.16665321604390285, + "grad_norm": 0.8952934145927429, + "learning_rate": 0.0001952268710932832, + "loss": 2.7825, + "step": 2065 + }, + { + "epoch": 0.16673391978048582, + "grad_norm": 0.8201496601104736, + "learning_rate": 0.00019522205078183378, + "loss": 2.7162, + "step": 2066 + }, + { + "epoch": 0.16681462351706883, + "grad_norm": 0.7733781337738037, + "learning_rate": 0.00019521722809720188, + "loss": 2.7834, + "step": 2067 + }, + { + "epoch": 0.16689532725365183, + "grad_norm": 0.8285118937492371, + "learning_rate": 0.0001952124030395078, + "loss": 2.8475, + "step": 2068 + }, + { + "epoch": 0.16697603099023484, + "grad_norm": 0.84097820520401, + "learning_rate": 0.00019520757560887174, + "loss": 2.784, + "step": 2069 + }, + { + "epoch": 0.16705673472681784, + "grad_norm": 0.7336563467979431, + "learning_rate": 0.000195202745805414, + "loss": 2.7663, + "step": 2070 + }, + { + "epoch": 0.16713743846340084, + "grad_norm": 0.8359388113021851, + "learning_rate": 0.000195197913629255, + "loss": 2.7931, + "step": 2071 + }, + { + "epoch": 0.16721814219998385, + "grad_norm": 0.8272559642791748, + "learning_rate": 0.0001951930790805151, + "loss": 2.8578, + "step": 2072 + }, + { + 
"epoch": 0.16729884593656685, + "grad_norm": 0.7970743179321289, + "learning_rate": 0.00019518824215931487, + "loss": 2.8148, + "step": 2073 + }, + { + "epoch": 0.16737954967314986, + "grad_norm": 0.856200098991394, + "learning_rate": 0.00019518340286577482, + "loss": 2.8067, + "step": 2074 + }, + { + "epoch": 0.16746025340973286, + "grad_norm": 0.7581893801689148, + "learning_rate": 0.00019517856120001556, + "loss": 2.7339, + "step": 2075 + }, + { + "epoch": 0.16754095714631587, + "grad_norm": 0.8488386869430542, + "learning_rate": 0.00019517371716215774, + "loss": 2.7332, + "step": 2076 + }, + { + "epoch": 0.16762166088289887, + "grad_norm": 0.7488275170326233, + "learning_rate": 0.00019516887075232212, + "loss": 2.7734, + "step": 2077 + }, + { + "epoch": 0.16770236461948188, + "grad_norm": 0.9173932075500488, + "learning_rate": 0.00019516402197062945, + "loss": 2.7792, + "step": 2078 + }, + { + "epoch": 0.16778306835606488, + "grad_norm": 0.8200702667236328, + "learning_rate": 0.0001951591708172006, + "loss": 2.8046, + "step": 2079 + }, + { + "epoch": 0.16786377209264788, + "grad_norm": 0.8270781636238098, + "learning_rate": 0.00019515431729215642, + "loss": 2.7467, + "step": 2080 + }, + { + "epoch": 0.1679444758292309, + "grad_norm": 0.8660609722137451, + "learning_rate": 0.00019514946139561799, + "loss": 2.8169, + "step": 2081 + }, + { + "epoch": 0.1680251795658139, + "grad_norm": 0.78753262758255, + "learning_rate": 0.0001951446031277062, + "loss": 2.7388, + "step": 2082 + }, + { + "epoch": 0.1681058833023969, + "grad_norm": 0.791593074798584, + "learning_rate": 0.00019513974248854224, + "loss": 2.8776, + "step": 2083 + }, + { + "epoch": 0.1681865870389799, + "grad_norm": 0.7883535623550415, + "learning_rate": 0.0001951348794782472, + "loss": 2.78, + "step": 2084 + }, + { + "epoch": 0.1682672907755629, + "grad_norm": 0.7877013087272644, + "learning_rate": 0.00019513001409694224, + "loss": 2.7559, + "step": 2085 + }, + { + "epoch": 0.1683479945121459, + 
"grad_norm": 0.8838450908660889, + "learning_rate": 0.00019512514634474864, + "loss": 2.7489, + "step": 2086 + }, + { + "epoch": 0.16842869824872891, + "grad_norm": 0.7751588821411133, + "learning_rate": 0.00019512027622178775, + "loss": 2.6832, + "step": 2087 + }, + { + "epoch": 0.16850940198531192, + "grad_norm": 0.90345299243927, + "learning_rate": 0.00019511540372818095, + "loss": 2.8189, + "step": 2088 + }, + { + "epoch": 0.16859010572189492, + "grad_norm": 0.7820938229560852, + "learning_rate": 0.00019511052886404966, + "loss": 2.7655, + "step": 2089 + }, + { + "epoch": 0.16867080945847793, + "grad_norm": 0.8250375986099243, + "learning_rate": 0.00019510565162951537, + "loss": 2.7866, + "step": 2090 + }, + { + "epoch": 0.16875151319506093, + "grad_norm": 0.8063845634460449, + "learning_rate": 0.00019510077202469962, + "loss": 2.7774, + "step": 2091 + }, + { + "epoch": 0.16883221693164394, + "grad_norm": 0.7627965807914734, + "learning_rate": 0.00019509589004972403, + "loss": 2.7201, + "step": 2092 + }, + { + "epoch": 0.16891292066822694, + "grad_norm": 0.8392470479011536, + "learning_rate": 0.00019509100570471027, + "loss": 2.7613, + "step": 2093 + }, + { + "epoch": 0.16899362440480994, + "grad_norm": 0.7807552814483643, + "learning_rate": 0.0001950861189897801, + "loss": 2.7451, + "step": 2094 + }, + { + "epoch": 0.16907432814139295, + "grad_norm": 0.7829259634017944, + "learning_rate": 0.00019508122990505528, + "loss": 2.7128, + "step": 2095 + }, + { + "epoch": 0.16915503187797595, + "grad_norm": 0.7793046832084656, + "learning_rate": 0.00019507633845065766, + "loss": 2.7849, + "step": 2096 + }, + { + "epoch": 0.16923573561455896, + "grad_norm": 0.869752824306488, + "learning_rate": 0.00019507144462670915, + "loss": 2.7882, + "step": 2097 + }, + { + "epoch": 0.16931643935114196, + "grad_norm": 0.7550783753395081, + "learning_rate": 0.00019506654843333174, + "loss": 2.7211, + "step": 2098 + }, + { + "epoch": 0.16939714308772497, + "grad_norm": 
0.8364891409873962, + "learning_rate": 0.0001950616498706474, + "loss": 2.7171, + "step": 2099 + }, + { + "epoch": 0.16947784682430797, + "grad_norm": 0.8026537299156189, + "learning_rate": 0.0001950567489387783, + "loss": 2.8362, + "step": 2100 + }, + { + "epoch": 0.16955855056089097, + "grad_norm": 0.8073398470878601, + "learning_rate": 0.00019505184563784652, + "loss": 2.7635, + "step": 2101 + }, + { + "epoch": 0.16963925429747398, + "grad_norm": 0.8168368935585022, + "learning_rate": 0.00019504693996797424, + "loss": 2.7553, + "step": 2102 + }, + { + "epoch": 0.16971995803405698, + "grad_norm": 0.7933681011199951, + "learning_rate": 0.0001950420319292838, + "loss": 2.7887, + "step": 2103 + }, + { + "epoch": 0.16980066177064, + "grad_norm": 0.8326540589332581, + "learning_rate": 0.00019503712152189748, + "loss": 2.7844, + "step": 2104 + }, + { + "epoch": 0.169881365507223, + "grad_norm": 0.8357202410697937, + "learning_rate": 0.00019503220874593765, + "loss": 2.7744, + "step": 2105 + }, + { + "epoch": 0.169962069243806, + "grad_norm": 0.8541022539138794, + "learning_rate": 0.00019502729360152676, + "loss": 2.7867, + "step": 2106 + }, + { + "epoch": 0.170042772980389, + "grad_norm": 0.8338841795921326, + "learning_rate": 0.0001950223760887873, + "loss": 2.7208, + "step": 2107 + }, + { + "epoch": 0.170123476716972, + "grad_norm": 0.8824255466461182, + "learning_rate": 0.00019501745620784187, + "loss": 2.7658, + "step": 2108 + }, + { + "epoch": 0.170204180453555, + "grad_norm": 0.7710463404655457, + "learning_rate": 0.00019501253395881306, + "loss": 2.7167, + "step": 2109 + }, + { + "epoch": 0.17028488419013801, + "grad_norm": 0.7740076184272766, + "learning_rate": 0.0001950076093418235, + "loss": 2.7251, + "step": 2110 + }, + { + "epoch": 0.17036558792672102, + "grad_norm": 0.8258434534072876, + "learning_rate": 0.00019500268235699597, + "loss": 2.7533, + "step": 2111 + }, + { + "epoch": 0.17044629166330402, + "grad_norm": 0.8347997069358826, + "learning_rate": 
0.00019499775300445326, + "loss": 2.7372, + "step": 2112 + }, + { + "epoch": 0.17052699539988703, + "grad_norm": 0.8246529698371887, + "learning_rate": 0.00019499282128431823, + "loss": 2.7458, + "step": 2113 + }, + { + "epoch": 0.17060769913647003, + "grad_norm": 0.8510704040527344, + "learning_rate": 0.00019498788719671378, + "loss": 2.8144, + "step": 2114 + }, + { + "epoch": 0.17068840287305304, + "grad_norm": 0.7793454527854919, + "learning_rate": 0.00019498295074176286, + "loss": 2.7927, + "step": 2115 + }, + { + "epoch": 0.17076910660963604, + "grad_norm": 0.7888665199279785, + "learning_rate": 0.00019497801191958853, + "loss": 2.7156, + "step": 2116 + }, + { + "epoch": 0.17084981034621902, + "grad_norm": 0.8502812385559082, + "learning_rate": 0.00019497307073031386, + "loss": 2.7906, + "step": 2117 + }, + { + "epoch": 0.17093051408280202, + "grad_norm": 0.8376502990722656, + "learning_rate": 0.00019496812717406203, + "loss": 2.7354, + "step": 2118 + }, + { + "epoch": 0.17101121781938503, + "grad_norm": 0.7974401116371155, + "learning_rate": 0.0001949631812509562, + "loss": 2.7755, + "step": 2119 + }, + { + "epoch": 0.17109192155596803, + "grad_norm": 0.7760190963745117, + "learning_rate": 0.00019495823296111965, + "loss": 2.7694, + "step": 2120 + }, + { + "epoch": 0.17117262529255103, + "grad_norm": 0.7721701860427856, + "learning_rate": 0.00019495328230467575, + "loss": 2.7474, + "step": 2121 + }, + { + "epoch": 0.17125332902913404, + "grad_norm": 0.7360577583312988, + "learning_rate": 0.0001949483292817478, + "loss": 2.8044, + "step": 2122 + }, + { + "epoch": 0.17133403276571704, + "grad_norm": 0.7536107301712036, + "learning_rate": 0.0001949433738924593, + "loss": 2.8165, + "step": 2123 + }, + { + "epoch": 0.17141473650230005, + "grad_norm": 0.7668276429176331, + "learning_rate": 0.00019493841613693375, + "loss": 2.7964, + "step": 2124 + }, + { + "epoch": 0.17149544023888305, + "grad_norm": 0.8323161602020264, + "learning_rate": 0.0001949334560152947, + 
"loss": 2.7395, + "step": 2125 + }, + { + "epoch": 0.17157614397546606, + "grad_norm": 0.8132179975509644, + "learning_rate": 0.00019492849352766576, + "loss": 2.7511, + "step": 2126 + }, + { + "epoch": 0.17165684771204906, + "grad_norm": 0.7806998491287231, + "learning_rate": 0.0001949235286741706, + "loss": 2.7649, + "step": 2127 + }, + { + "epoch": 0.17173755144863206, + "grad_norm": 0.8315939903259277, + "learning_rate": 0.00019491856145493298, + "loss": 2.7742, + "step": 2128 + }, + { + "epoch": 0.17181825518521507, + "grad_norm": 0.8368063569068909, + "learning_rate": 0.00019491359187007672, + "loss": 2.7667, + "step": 2129 + }, + { + "epoch": 0.17189895892179807, + "grad_norm": 0.9183431267738342, + "learning_rate": 0.0001949086199197256, + "loss": 2.7444, + "step": 2130 + }, + { + "epoch": 0.17197966265838108, + "grad_norm": 0.7824065089225769, + "learning_rate": 0.0001949036456040036, + "loss": 2.7455, + "step": 2131 + }, + { + "epoch": 0.17206036639496408, + "grad_norm": 0.777974009513855, + "learning_rate": 0.00019489866892303468, + "loss": 2.7466, + "step": 2132 + }, + { + "epoch": 0.17214107013154709, + "grad_norm": 0.8068816065788269, + "learning_rate": 0.00019489368987694286, + "loss": 2.7081, + "step": 2133 + }, + { + "epoch": 0.1722217738681301, + "grad_norm": 0.8757622838020325, + "learning_rate": 0.00019488870846585222, + "loss": 2.8005, + "step": 2134 + }, + { + "epoch": 0.1723024776047131, + "grad_norm": 0.7967162728309631, + "learning_rate": 0.00019488372468988693, + "loss": 2.7737, + "step": 2135 + }, + { + "epoch": 0.1723831813412961, + "grad_norm": 0.7700283527374268, + "learning_rate": 0.00019487873854917117, + "loss": 2.7431, + "step": 2136 + }, + { + "epoch": 0.1724638850778791, + "grad_norm": 0.8259130716323853, + "learning_rate": 0.00019487375004382927, + "loss": 2.7635, + "step": 2137 + }, + { + "epoch": 0.1725445888144621, + "grad_norm": 0.8253815770149231, + "learning_rate": 0.0001948687591739855, + "loss": 2.7046, + "step": 2138 + 
}, + { + "epoch": 0.1726252925510451, + "grad_norm": 0.8087987303733826, + "learning_rate": 0.00019486376593976426, + "loss": 2.7728, + "step": 2139 + }, + { + "epoch": 0.17270599628762812, + "grad_norm": 0.8437588214874268, + "learning_rate": 0.00019485877034128998, + "loss": 2.7606, + "step": 2140 + }, + { + "epoch": 0.17278670002421112, + "grad_norm": 0.8416075110435486, + "learning_rate": 0.00019485377237868723, + "loss": 2.7396, + "step": 2141 + }, + { + "epoch": 0.17286740376079412, + "grad_norm": 0.784275472164154, + "learning_rate": 0.00019484877205208046, + "loss": 2.766, + "step": 2142 + }, + { + "epoch": 0.17294810749737713, + "grad_norm": 0.8082472681999207, + "learning_rate": 0.0001948437693615944, + "loss": 2.8, + "step": 2143 + }, + { + "epoch": 0.17302881123396013, + "grad_norm": 0.8904329538345337, + "learning_rate": 0.00019483876430735365, + "loss": 2.6579, + "step": 2144 + }, + { + "epoch": 0.17310951497054314, + "grad_norm": 0.7864851355552673, + "learning_rate": 0.000194833756889483, + "loss": 2.8231, + "step": 2145 + }, + { + "epoch": 0.17319021870712614, + "grad_norm": 0.7445049285888672, + "learning_rate": 0.00019482874710810723, + "loss": 2.7498, + "step": 2146 + }, + { + "epoch": 0.17327092244370915, + "grad_norm": 0.8266116380691528, + "learning_rate": 0.00019482373496335117, + "loss": 2.7152, + "step": 2147 + }, + { + "epoch": 0.17335162618029215, + "grad_norm": 0.7712300419807434, + "learning_rate": 0.0001948187204553398, + "loss": 2.7751, + "step": 2148 + }, + { + "epoch": 0.17343232991687516, + "grad_norm": 0.7472708225250244, + "learning_rate": 0.00019481370358419807, + "loss": 2.7397, + "step": 2149 + }, + { + "epoch": 0.17351303365345816, + "grad_norm": 0.763454020023346, + "learning_rate": 0.00019480868435005095, + "loss": 2.7174, + "step": 2150 + }, + { + "epoch": 0.17359373739004116, + "grad_norm": 0.8187674283981323, + "learning_rate": 0.00019480366275302362, + "loss": 2.8424, + "step": 2151 + }, + { + "epoch": 
0.17367444112662417, + "grad_norm": 0.8183228373527527, + "learning_rate": 0.0001947986387932412, + "loss": 2.7351, + "step": 2152 + }, + { + "epoch": 0.17375514486320717, + "grad_norm": 0.807231605052948, + "learning_rate": 0.00019479361247082884, + "loss": 2.8054, + "step": 2153 + }, + { + "epoch": 0.17383584859979018, + "grad_norm": 0.8383626341819763, + "learning_rate": 0.00019478858378591194, + "loss": 2.7181, + "step": 2154 + }, + { + "epoch": 0.17391655233637318, + "grad_norm": 0.8330298662185669, + "learning_rate": 0.0001947835527386157, + "loss": 2.748, + "step": 2155 + }, + { + "epoch": 0.17399725607295619, + "grad_norm": 0.8433073163032532, + "learning_rate": 0.0001947785193290656, + "loss": 2.8115, + "step": 2156 + }, + { + "epoch": 0.1740779598095392, + "grad_norm": 0.8873384594917297, + "learning_rate": 0.000194773483557387, + "loss": 2.8288, + "step": 2157 + }, + { + "epoch": 0.1741586635461222, + "grad_norm": 0.8399423360824585, + "learning_rate": 0.00019476844542370546, + "loss": 2.7514, + "step": 2158 + }, + { + "epoch": 0.1742393672827052, + "grad_norm": 0.7808830738067627, + "learning_rate": 0.00019476340492814655, + "loss": 2.7003, + "step": 2159 + }, + { + "epoch": 0.1743200710192882, + "grad_norm": 0.8268750905990601, + "learning_rate": 0.00019475836207083589, + "loss": 2.7961, + "step": 2160 + }, + { + "epoch": 0.1744007747558712, + "grad_norm": 0.9144260883331299, + "learning_rate": 0.0001947533168518991, + "loss": 2.769, + "step": 2161 + }, + { + "epoch": 0.1744814784924542, + "grad_norm": 0.8409113883972168, + "learning_rate": 0.000194748269271462, + "loss": 2.8004, + "step": 2162 + }, + { + "epoch": 0.17456218222903722, + "grad_norm": 0.8747037649154663, + "learning_rate": 0.00019474321932965035, + "loss": 2.7602, + "step": 2163 + }, + { + "epoch": 0.17464288596562022, + "grad_norm": 0.8582575917243958, + "learning_rate": 0.00019473816702659, + "loss": 2.7292, + "step": 2164 + }, + { + "epoch": 0.17472358970220322, + "grad_norm": 
0.7402843832969666, + "learning_rate": 0.0001947331123624069, + "loss": 2.7287, + "step": 2165 + }, + { + "epoch": 0.17480429343878623, + "grad_norm": 0.8019410967826843, + "learning_rate": 0.000194728055337227, + "loss": 2.7451, + "step": 2166 + }, + { + "epoch": 0.17488499717536923, + "grad_norm": 0.9137046337127686, + "learning_rate": 0.0001947229959511763, + "loss": 2.808, + "step": 2167 + }, + { + "epoch": 0.1749657009119522, + "grad_norm": 0.7539177536964417, + "learning_rate": 0.000194717934204381, + "loss": 2.7031, + "step": 2168 + }, + { + "epoch": 0.17504640464853521, + "grad_norm": 0.8611089587211609, + "learning_rate": 0.00019471287009696715, + "loss": 2.8751, + "step": 2169 + }, + { + "epoch": 0.17512710838511822, + "grad_norm": 0.906134843826294, + "learning_rate": 0.000194707803629061, + "loss": 2.9163, + "step": 2170 + }, + { + "epoch": 0.17520781212170122, + "grad_norm": 0.8066667318344116, + "learning_rate": 0.00019470273480078879, + "loss": 2.7549, + "step": 2171 + }, + { + "epoch": 0.17528851585828423, + "grad_norm": 0.7962325215339661, + "learning_rate": 0.00019469766361227692, + "loss": 2.7964, + "step": 2172 + }, + { + "epoch": 0.17536921959486723, + "grad_norm": 0.7802287340164185, + "learning_rate": 0.0001946925900636517, + "loss": 2.7022, + "step": 2173 + }, + { + "epoch": 0.17544992333145024, + "grad_norm": 0.783478319644928, + "learning_rate": 0.0001946875141550396, + "loss": 2.7798, + "step": 2174 + }, + { + "epoch": 0.17553062706803324, + "grad_norm": 0.8006815314292908, + "learning_rate": 0.00019468243588656713, + "loss": 2.7345, + "step": 2175 + }, + { + "epoch": 0.17561133080461624, + "grad_norm": 0.7566428184509277, + "learning_rate": 0.00019467735525836085, + "loss": 2.7822, + "step": 2176 + }, + { + "epoch": 0.17569203454119925, + "grad_norm": 0.772282600402832, + "learning_rate": 0.0001946722722705474, + "loss": 2.7346, + "step": 2177 + }, + { + "epoch": 0.17577273827778225, + "grad_norm": 0.7808345556259155, + "learning_rate": 
0.00019466718692325347, + "loss": 2.755, + "step": 2178 + }, + { + "epoch": 0.17585344201436526, + "grad_norm": 0.8150362372398376, + "learning_rate": 0.00019466209921660576, + "loss": 2.7691, + "step": 2179 + }, + { + "epoch": 0.17593414575094826, + "grad_norm": 0.7952939867973328, + "learning_rate": 0.0001946570091507311, + "loss": 2.8175, + "step": 2180 + }, + { + "epoch": 0.17601484948753127, + "grad_norm": 0.8211334347724915, + "learning_rate": 0.00019465191672575634, + "loss": 2.7561, + "step": 2181 + }, + { + "epoch": 0.17609555322411427, + "grad_norm": 0.7726178765296936, + "learning_rate": 0.00019464682194180838, + "loss": 2.7435, + "step": 2182 + }, + { + "epoch": 0.17617625696069728, + "grad_norm": 0.7614372372627258, + "learning_rate": 0.00019464172479901422, + "loss": 2.7301, + "step": 2183 + }, + { + "epoch": 0.17625696069728028, + "grad_norm": 0.7818898558616638, + "learning_rate": 0.00019463662529750083, + "loss": 2.6964, + "step": 2184 + }, + { + "epoch": 0.17633766443386328, + "grad_norm": 0.7849796414375305, + "learning_rate": 0.0001946315234373954, + "loss": 2.7431, + "step": 2185 + }, + { + "epoch": 0.1764183681704463, + "grad_norm": 0.7939459085464478, + "learning_rate": 0.00019462641921882506, + "loss": 2.7126, + "step": 2186 + }, + { + "epoch": 0.1764990719070293, + "grad_norm": 0.8391629457473755, + "learning_rate": 0.00019462131264191696, + "loss": 2.8394, + "step": 2187 + }, + { + "epoch": 0.1765797756436123, + "grad_norm": 0.7548067569732666, + "learning_rate": 0.0001946162037067984, + "loss": 2.7315, + "step": 2188 + }, + { + "epoch": 0.1766604793801953, + "grad_norm": 0.8278634548187256, + "learning_rate": 0.00019461109241359674, + "loss": 2.8298, + "step": 2189 + }, + { + "epoch": 0.1767411831167783, + "grad_norm": 0.8275949954986572, + "learning_rate": 0.00019460597876243933, + "loss": 2.8072, + "step": 2190 + }, + { + "epoch": 0.1768218868533613, + "grad_norm": 0.7720363140106201, + "learning_rate": 0.00019460086275345363, + "loss": 
2.7478, + "step": 2191 + }, + { + "epoch": 0.17690259058994431, + "grad_norm": 0.7795925140380859, + "learning_rate": 0.00019459574438676714, + "loss": 2.7633, + "step": 2192 + }, + { + "epoch": 0.17698329432652732, + "grad_norm": 0.7722043991088867, + "learning_rate": 0.00019459062366250743, + "loss": 2.8001, + "step": 2193 + }, + { + "epoch": 0.17706399806311032, + "grad_norm": 0.8560587763786316, + "learning_rate": 0.00019458550058080212, + "loss": 2.7494, + "step": 2194 + }, + { + "epoch": 0.17714470179969333, + "grad_norm": 0.7473754286766052, + "learning_rate": 0.00019458037514177886, + "loss": 2.7112, + "step": 2195 + }, + { + "epoch": 0.17722540553627633, + "grad_norm": 0.7625827789306641, + "learning_rate": 0.00019457524734556542, + "loss": 2.7496, + "step": 2196 + }, + { + "epoch": 0.17730610927285934, + "grad_norm": 0.7809351682662964, + "learning_rate": 0.00019457011719228962, + "loss": 2.7764, + "step": 2197 + }, + { + "epoch": 0.17738681300944234, + "grad_norm": 0.7846190333366394, + "learning_rate": 0.00019456498468207927, + "loss": 2.7189, + "step": 2198 + }, + { + "epoch": 0.17746751674602534, + "grad_norm": 0.7919551134109497, + "learning_rate": 0.0001945598498150623, + "loss": 2.7798, + "step": 2199 + }, + { + "epoch": 0.17754822048260835, + "grad_norm": 0.796183705329895, + "learning_rate": 0.0001945547125913667, + "loss": 2.7498, + "step": 2200 + }, + { + "epoch": 0.17762892421919135, + "grad_norm": 0.791668176651001, + "learning_rate": 0.0001945495730111205, + "loss": 2.7638, + "step": 2201 + }, + { + "epoch": 0.17770962795577436, + "grad_norm": 0.8303191661834717, + "learning_rate": 0.0001945444310744518, + "loss": 2.8079, + "step": 2202 + }, + { + "epoch": 0.17779033169235736, + "grad_norm": 0.8245917558670044, + "learning_rate": 0.00019453928678148872, + "loss": 2.7222, + "step": 2203 + }, + { + "epoch": 0.17787103542894037, + "grad_norm": 0.793456494808197, + "learning_rate": 0.0001945341401323595, + "loss": 2.8532, + "step": 2204 + }, + { 
+ "epoch": 0.17795173916552337, + "grad_norm": 0.7574856877326965, + "learning_rate": 0.00019452899112719235, + "loss": 2.7361, + "step": 2205 + }, + { + "epoch": 0.17803244290210637, + "grad_norm": 0.7748556733131409, + "learning_rate": 0.0001945238397661157, + "loss": 2.7423, + "step": 2206 + }, + { + "epoch": 0.17811314663868938, + "grad_norm": 0.8973588347434998, + "learning_rate": 0.00019451868604925782, + "loss": 2.7604, + "step": 2207 + }, + { + "epoch": 0.17819385037527238, + "grad_norm": 0.7613589763641357, + "learning_rate": 0.00019451352997674722, + "loss": 2.7168, + "step": 2208 + }, + { + "epoch": 0.1782745541118554, + "grad_norm": 0.8152763247489929, + "learning_rate": 0.00019450837154871243, + "loss": 2.7904, + "step": 2209 + }, + { + "epoch": 0.1783552578484384, + "grad_norm": 0.8115083575248718, + "learning_rate": 0.00019450321076528194, + "loss": 2.7595, + "step": 2210 + }, + { + "epoch": 0.1784359615850214, + "grad_norm": 0.772665798664093, + "learning_rate": 0.00019449804762658438, + "loss": 2.7125, + "step": 2211 + }, + { + "epoch": 0.1785166653216044, + "grad_norm": 0.8002723455429077, + "learning_rate": 0.0001944928821327485, + "loss": 2.8121, + "step": 2212 + }, + { + "epoch": 0.1785973690581874, + "grad_norm": 0.8354858160018921, + "learning_rate": 0.00019448771428390296, + "loss": 2.8662, + "step": 2213 + }, + { + "epoch": 0.1786780727947704, + "grad_norm": 0.7799130082130432, + "learning_rate": 0.0001944825440801766, + "loss": 2.7247, + "step": 2214 + }, + { + "epoch": 0.1787587765313534, + "grad_norm": 0.810265302658081, + "learning_rate": 0.00019447737152169828, + "loss": 2.7095, + "step": 2215 + }, + { + "epoch": 0.17883948026793642, + "grad_norm": 0.8305599093437195, + "learning_rate": 0.00019447219660859687, + "loss": 2.7448, + "step": 2216 + }, + { + "epoch": 0.17892018400451942, + "grad_norm": 0.7899554371833801, + "learning_rate": 0.00019446701934100138, + "loss": 2.7295, + "step": 2217 + }, + { + "epoch": 0.17900088774110243, + 
"grad_norm": 0.7675672173500061, + "learning_rate": 0.00019446183971904082, + "loss": 2.7236, + "step": 2218 + }, + { + "epoch": 0.1790815914776854, + "grad_norm": 0.8717279434204102, + "learning_rate": 0.0001944566577428443, + "loss": 2.8044, + "step": 2219 + }, + { + "epoch": 0.1791622952142684, + "grad_norm": 0.8151431679725647, + "learning_rate": 0.00019445147341254094, + "loss": 2.7753, + "step": 2220 + }, + { + "epoch": 0.1792429989508514, + "grad_norm": 0.8481619358062744, + "learning_rate": 0.00019444628672825998, + "loss": 2.7954, + "step": 2221 + }, + { + "epoch": 0.17932370268743442, + "grad_norm": 0.8133199214935303, + "learning_rate": 0.00019444109769013065, + "loss": 2.7235, + "step": 2222 + }, + { + "epoch": 0.17940440642401742, + "grad_norm": 0.8250097036361694, + "learning_rate": 0.00019443590629828232, + "loss": 2.8352, + "step": 2223 + }, + { + "epoch": 0.17948511016060043, + "grad_norm": 0.8279787302017212, + "learning_rate": 0.00019443071255284433, + "loss": 2.7513, + "step": 2224 + }, + { + "epoch": 0.17956581389718343, + "grad_norm": 0.7781538963317871, + "learning_rate": 0.00019442551645394612, + "loss": 2.7239, + "step": 2225 + }, + { + "epoch": 0.17964651763376643, + "grad_norm": 0.7718615531921387, + "learning_rate": 0.00019442031800171727, + "loss": 2.7387, + "step": 2226 + }, + { + "epoch": 0.17972722137034944, + "grad_norm": 0.7704512476921082, + "learning_rate": 0.00019441511719628724, + "loss": 2.792, + "step": 2227 + }, + { + "epoch": 0.17980792510693244, + "grad_norm": 0.8290835618972778, + "learning_rate": 0.00019440991403778566, + "loss": 2.7745, + "step": 2228 + }, + { + "epoch": 0.17988862884351545, + "grad_norm": 0.8408392667770386, + "learning_rate": 0.00019440470852634227, + "loss": 2.7688, + "step": 2229 + }, + { + "epoch": 0.17996933258009845, + "grad_norm": 0.8503465056419373, + "learning_rate": 0.00019439950066208676, + "loss": 2.6747, + "step": 2230 + }, + { + "epoch": 0.18005003631668146, + "grad_norm": 
0.8213364481925964, + "learning_rate": 0.0001943942904451489, + "loss": 2.7212, + "step": 2231 + }, + { + "epoch": 0.18013074005326446, + "grad_norm": 0.8511209487915039, + "learning_rate": 0.0001943890778756586, + "loss": 2.701, + "step": 2232 + }, + { + "epoch": 0.18021144378984746, + "grad_norm": 0.8034417033195496, + "learning_rate": 0.00019438386295374577, + "loss": 2.7029, + "step": 2233 + }, + { + "epoch": 0.18029214752643047, + "grad_norm": 0.7603715658187866, + "learning_rate": 0.0001943786456795403, + "loss": 2.7201, + "step": 2234 + }, + { + "epoch": 0.18037285126301347, + "grad_norm": 0.9210647940635681, + "learning_rate": 0.0001943734260531723, + "loss": 2.7847, + "step": 2235 + }, + { + "epoch": 0.18045355499959648, + "grad_norm": 0.7429665923118591, + "learning_rate": 0.00019436820407477186, + "loss": 2.7493, + "step": 2236 + }, + { + "epoch": 0.18053425873617948, + "grad_norm": 0.8290510773658752, + "learning_rate": 0.00019436297974446905, + "loss": 2.7711, + "step": 2237 + }, + { + "epoch": 0.18061496247276249, + "grad_norm": 0.7593570947647095, + "learning_rate": 0.0001943577530623941, + "loss": 2.7539, + "step": 2238 + }, + { + "epoch": 0.1806956662093455, + "grad_norm": 0.8222225308418274, + "learning_rate": 0.00019435252402867734, + "loss": 2.7703, + "step": 2239 + }, + { + "epoch": 0.1807763699459285, + "grad_norm": 0.8280842900276184, + "learning_rate": 0.00019434729264344898, + "loss": 2.7966, + "step": 2240 + }, + { + "epoch": 0.1808570736825115, + "grad_norm": 0.8258495926856995, + "learning_rate": 0.00019434205890683952, + "loss": 2.759, + "step": 2241 + }, + { + "epoch": 0.1809377774190945, + "grad_norm": 0.8294420838356018, + "learning_rate": 0.00019433682281897932, + "loss": 2.6996, + "step": 2242 + }, + { + "epoch": 0.1810184811556775, + "grad_norm": 0.8258811235427856, + "learning_rate": 0.0001943315843799989, + "loss": 2.774, + "step": 2243 + }, + { + "epoch": 0.1810991848922605, + "grad_norm": 0.8035838007926941, + "learning_rate": 
0.0001943263435900288, + "loss": 2.7806, + "step": 2244 + }, + { + "epoch": 0.18117988862884352, + "grad_norm": 0.7900332808494568, + "learning_rate": 0.00019432110044919964, + "loss": 2.7462, + "step": 2245 + }, + { + "epoch": 0.18126059236542652, + "grad_norm": 0.8126730918884277, + "learning_rate": 0.00019431585495764212, + "loss": 2.6913, + "step": 2246 + }, + { + "epoch": 0.18134129610200952, + "grad_norm": 0.8411321043968201, + "learning_rate": 0.00019431060711548695, + "loss": 2.7503, + "step": 2247 + }, + { + "epoch": 0.18142199983859253, + "grad_norm": 0.7712867856025696, + "learning_rate": 0.0001943053569228649, + "loss": 2.7703, + "step": 2248 + }, + { + "epoch": 0.18150270357517553, + "grad_norm": 0.9093566536903381, + "learning_rate": 0.00019430010437990688, + "loss": 2.7838, + "step": 2249 + }, + { + "epoch": 0.18158340731175854, + "grad_norm": 0.8184913396835327, + "learning_rate": 0.00019429484948674372, + "loss": 2.8167, + "step": 2250 + }, + { + "epoch": 0.18166411104834154, + "grad_norm": 0.7215915322303772, + "learning_rate": 0.00019428959224350643, + "loss": 2.739, + "step": 2251 + }, + { + "epoch": 0.18174481478492455, + "grad_norm": 0.7842726111412048, + "learning_rate": 0.000194284332650326, + "loss": 2.8547, + "step": 2252 + }, + { + "epoch": 0.18182551852150755, + "grad_norm": 0.7758263349533081, + "learning_rate": 0.00019427907070733357, + "loss": 2.7746, + "step": 2253 + }, + { + "epoch": 0.18190622225809056, + "grad_norm": 0.7710500359535217, + "learning_rate": 0.00019427380641466027, + "loss": 2.7415, + "step": 2254 + }, + { + "epoch": 0.18198692599467356, + "grad_norm": 0.8233851194381714, + "learning_rate": 0.00019426853977243724, + "loss": 2.7471, + "step": 2255 + }, + { + "epoch": 0.18206762973125656, + "grad_norm": 0.7856284379959106, + "learning_rate": 0.00019426327078079578, + "loss": 2.6892, + "step": 2256 + }, + { + "epoch": 0.18214833346783957, + "grad_norm": 0.7978290915489197, + "learning_rate": 0.00019425799943986722, + 
"loss": 2.7346, + "step": 2257 + }, + { + "epoch": 0.18222903720442257, + "grad_norm": 0.8339362740516663, + "learning_rate": 0.00019425272574978293, + "loss": 2.7403, + "step": 2258 + }, + { + "epoch": 0.18230974094100558, + "grad_norm": 0.8035171031951904, + "learning_rate": 0.0001942474497106743, + "loss": 2.7444, + "step": 2259 + }, + { + "epoch": 0.18239044467758858, + "grad_norm": 0.7950475811958313, + "learning_rate": 0.0001942421713226729, + "loss": 2.7218, + "step": 2260 + }, + { + "epoch": 0.18247114841417159, + "grad_norm": 0.8439741730690002, + "learning_rate": 0.00019423689058591022, + "loss": 2.7498, + "step": 2261 + }, + { + "epoch": 0.1825518521507546, + "grad_norm": 0.8585919737815857, + "learning_rate": 0.00019423160750051789, + "loss": 2.7459, + "step": 2262 + }, + { + "epoch": 0.1826325558873376, + "grad_norm": 0.857276201248169, + "learning_rate": 0.00019422632206662755, + "loss": 2.8404, + "step": 2263 + }, + { + "epoch": 0.1827132596239206, + "grad_norm": 0.7692707777023315, + "learning_rate": 0.000194221034284371, + "loss": 2.8069, + "step": 2264 + }, + { + "epoch": 0.1827939633605036, + "grad_norm": 0.9107782244682312, + "learning_rate": 0.00019421574415387998, + "loss": 2.7554, + "step": 2265 + }, + { + "epoch": 0.1828746670970866, + "grad_norm": 0.763300895690918, + "learning_rate": 0.00019421045167528628, + "loss": 2.8031, + "step": 2266 + }, + { + "epoch": 0.1829553708336696, + "grad_norm": 0.8625530004501343, + "learning_rate": 0.0001942051568487219, + "loss": 2.7622, + "step": 2267 + }, + { + "epoch": 0.18303607457025262, + "grad_norm": 0.8483080863952637, + "learning_rate": 0.00019419985967431875, + "loss": 2.7726, + "step": 2268 + }, + { + "epoch": 0.18311677830683562, + "grad_norm": 0.8295309543609619, + "learning_rate": 0.00019419456015220884, + "loss": 2.7676, + "step": 2269 + }, + { + "epoch": 0.1831974820434186, + "grad_norm": 0.812976062297821, + "learning_rate": 0.0001941892582825243, + "loss": 2.745, + "step": 2270 + }, + { 
+ "epoch": 0.1832781857800016, + "grad_norm": 0.799846351146698, + "learning_rate": 0.00019418395406539717, + "loss": 2.7474, + "step": 2271 + }, + { + "epoch": 0.1833588895165846, + "grad_norm": 0.7825174331665039, + "learning_rate": 0.00019417864750095976, + "loss": 2.7982, + "step": 2272 + }, + { + "epoch": 0.1834395932531676, + "grad_norm": 0.8331060409545898, + "learning_rate": 0.00019417333858934424, + "loss": 2.7279, + "step": 2273 + }, + { + "epoch": 0.18352029698975061, + "grad_norm": 0.8579809665679932, + "learning_rate": 0.00019416802733068295, + "loss": 2.7425, + "step": 2274 + }, + { + "epoch": 0.18360100072633362, + "grad_norm": 0.8643589019775391, + "learning_rate": 0.0001941627137251083, + "loss": 2.7369, + "step": 2275 + }, + { + "epoch": 0.18368170446291662, + "grad_norm": 0.9086846113204956, + "learning_rate": 0.00019415739777275265, + "loss": 2.7681, + "step": 2276 + }, + { + "epoch": 0.18376240819949963, + "grad_norm": 0.8442896604537964, + "learning_rate": 0.00019415207947374853, + "loss": 2.7733, + "step": 2277 + }, + { + "epoch": 0.18384311193608263, + "grad_norm": 0.7858592867851257, + "learning_rate": 0.00019414675882822846, + "loss": 2.7726, + "step": 2278 + }, + { + "epoch": 0.18392381567266564, + "grad_norm": 0.8191118240356445, + "learning_rate": 0.00019414143583632503, + "loss": 2.8142, + "step": 2279 + }, + { + "epoch": 0.18400451940924864, + "grad_norm": 0.8093815445899963, + "learning_rate": 0.00019413611049817097, + "loss": 2.7068, + "step": 2280 + }, + { + "epoch": 0.18408522314583164, + "grad_norm": 0.80247563123703, + "learning_rate": 0.00019413078281389895, + "loss": 2.7459, + "step": 2281 + }, + { + "epoch": 0.18416592688241465, + "grad_norm": 0.8200877904891968, + "learning_rate": 0.00019412545278364176, + "loss": 2.6963, + "step": 2282 + }, + { + "epoch": 0.18424663061899765, + "grad_norm": 0.870662271976471, + "learning_rate": 0.00019412012040753224, + "loss": 2.8636, + "step": 2283 + }, + { + "epoch": 0.18432733435558066, 
+ "grad_norm": 0.7626601457595825, + "learning_rate": 0.00019411478568570332, + "loss": 2.8082, + "step": 2284 + }, + { + "epoch": 0.18440803809216366, + "grad_norm": 0.7492787837982178, + "learning_rate": 0.00019410944861828787, + "loss": 2.7231, + "step": 2285 + }, + { + "epoch": 0.18448874182874667, + "grad_norm": 0.8172419667243958, + "learning_rate": 0.000194104109205419, + "loss": 2.7054, + "step": 2286 + }, + { + "epoch": 0.18456944556532967, + "grad_norm": 0.7749670147895813, + "learning_rate": 0.0001940987674472297, + "loss": 2.6907, + "step": 2287 + }, + { + "epoch": 0.18465014930191267, + "grad_norm": 0.8855465054512024, + "learning_rate": 0.00019409342334385316, + "loss": 2.7439, + "step": 2288 + }, + { + "epoch": 0.18473085303849568, + "grad_norm": 0.8066419363021851, + "learning_rate": 0.00019408807689542257, + "loss": 2.7126, + "step": 2289 + }, + { + "epoch": 0.18481155677507868, + "grad_norm": 0.7759004235267639, + "learning_rate": 0.00019408272810207114, + "loss": 2.7207, + "step": 2290 + }, + { + "epoch": 0.1848922605116617, + "grad_norm": 0.8593513369560242, + "learning_rate": 0.00019407737696393215, + "loss": 2.7375, + "step": 2291 + }, + { + "epoch": 0.1849729642482447, + "grad_norm": 0.8154759407043457, + "learning_rate": 0.00019407202348113904, + "loss": 2.7608, + "step": 2292 + }, + { + "epoch": 0.1850536679848277, + "grad_norm": 0.7912892699241638, + "learning_rate": 0.0001940666676538252, + "loss": 2.7886, + "step": 2293 + }, + { + "epoch": 0.1851343717214107, + "grad_norm": 0.9184576272964478, + "learning_rate": 0.0001940613094821241, + "loss": 2.7867, + "step": 2294 + }, + { + "epoch": 0.1852150754579937, + "grad_norm": 0.8114588856697083, + "learning_rate": 0.0001940559489661693, + "loss": 2.8105, + "step": 2295 + }, + { + "epoch": 0.1852957791945767, + "grad_norm": 0.7681595683097839, + "learning_rate": 0.00019405058610609438, + "loss": 2.7707, + "step": 2296 + }, + { + "epoch": 0.18537648293115971, + "grad_norm": 0.7719643712043762, 
+ "learning_rate": 0.000194045220902033, + "loss": 2.6767, + "step": 2297 + }, + { + "epoch": 0.18545718666774272, + "grad_norm": 0.7602487206459045, + "learning_rate": 0.00019403985335411888, + "loss": 2.7698, + "step": 2298 + }, + { + "epoch": 0.18553789040432572, + "grad_norm": 0.8044554591178894, + "learning_rate": 0.00019403448346248578, + "loss": 2.7578, + "step": 2299 + }, + { + "epoch": 0.18561859414090873, + "grad_norm": 0.7830328345298767, + "learning_rate": 0.00019402911122726757, + "loss": 2.7113, + "step": 2300 + }, + { + "epoch": 0.18569929787749173, + "grad_norm": 0.7793100476264954, + "learning_rate": 0.0001940237366485981, + "loss": 2.7388, + "step": 2301 + }, + { + "epoch": 0.18578000161407474, + "grad_norm": 0.9127374887466431, + "learning_rate": 0.00019401835972661133, + "loss": 2.7459, + "step": 2302 + }, + { + "epoch": 0.18586070535065774, + "grad_norm": 0.8007177114486694, + "learning_rate": 0.00019401298046144128, + "loss": 2.776, + "step": 2303 + }, + { + "epoch": 0.18594140908724074, + "grad_norm": 0.7384614944458008, + "learning_rate": 0.000194007598853222, + "loss": 2.6819, + "step": 2304 + }, + { + "epoch": 0.18602211282382375, + "grad_norm": 0.798909068107605, + "learning_rate": 0.0001940022149020876, + "loss": 2.7218, + "step": 2305 + }, + { + "epoch": 0.18610281656040675, + "grad_norm": 0.8388963341712952, + "learning_rate": 0.0001939968286081723, + "loss": 2.8248, + "step": 2306 + }, + { + "epoch": 0.18618352029698976, + "grad_norm": 0.8411754369735718, + "learning_rate": 0.0001939914399716103, + "loss": 2.7575, + "step": 2307 + }, + { + "epoch": 0.18626422403357276, + "grad_norm": 0.7936103343963623, + "learning_rate": 0.00019398604899253594, + "loss": 2.7488, + "step": 2308 + }, + { + "epoch": 0.18634492777015577, + "grad_norm": 0.7913734912872314, + "learning_rate": 0.00019398065567108357, + "loss": 2.7963, + "step": 2309 + }, + { + "epoch": 0.18642563150673877, + "grad_norm": 0.8341575860977173, + "learning_rate": 
0.00019397526000738754, + "loss": 2.7698, + "step": 2310 + }, + { + "epoch": 0.18650633524332177, + "grad_norm": 0.8323128819465637, + "learning_rate": 0.00019396986200158244, + "loss": 2.7218, + "step": 2311 + }, + { + "epoch": 0.18658703897990478, + "grad_norm": 0.748073160648346, + "learning_rate": 0.0001939644616538027, + "loss": 2.7798, + "step": 2312 + }, + { + "epoch": 0.18666774271648778, + "grad_norm": 0.8166958689689636, + "learning_rate": 0.00019395905896418296, + "loss": 2.661, + "step": 2313 + }, + { + "epoch": 0.1867484464530708, + "grad_norm": 0.796791672706604, + "learning_rate": 0.00019395365393285786, + "loss": 2.7297, + "step": 2314 + }, + { + "epoch": 0.1868291501896538, + "grad_norm": 0.7851170897483826, + "learning_rate": 0.0001939482465599621, + "loss": 2.7798, + "step": 2315 + }, + { + "epoch": 0.1869098539262368, + "grad_norm": 0.7545836567878723, + "learning_rate": 0.00019394283684563045, + "loss": 2.7327, + "step": 2316 + }, + { + "epoch": 0.1869905576628198, + "grad_norm": 0.8100360631942749, + "learning_rate": 0.00019393742478999776, + "loss": 2.7901, + "step": 2317 + }, + { + "epoch": 0.1870712613994028, + "grad_norm": 0.7874314785003662, + "learning_rate": 0.00019393201039319887, + "loss": 2.7597, + "step": 2318 + }, + { + "epoch": 0.1871519651359858, + "grad_norm": 0.7698730826377869, + "learning_rate": 0.00019392659365536876, + "loss": 2.7327, + "step": 2319 + }, + { + "epoch": 0.1872326688725688, + "grad_norm": 0.7417994141578674, + "learning_rate": 0.0001939211745766424, + "loss": 2.7413, + "step": 2320 + }, + { + "epoch": 0.1873133726091518, + "grad_norm": 0.7823258638381958, + "learning_rate": 0.00019391575315715485, + "loss": 2.7577, + "step": 2321 + }, + { + "epoch": 0.1873940763457348, + "grad_norm": 0.82382732629776, + "learning_rate": 0.00019391032939704124, + "loss": 2.7769, + "step": 2322 + }, + { + "epoch": 0.1874747800823178, + "grad_norm": 0.8405026197433472, + "learning_rate": 0.0001939049032964367, + "loss": 2.8402, 
+ "step": 2323 + }, + { + "epoch": 0.1875554838189008, + "grad_norm": 0.8307906985282898, + "learning_rate": 0.00019389947485547654, + "loss": 2.7642, + "step": 2324 + }, + { + "epoch": 0.1876361875554838, + "grad_norm": 0.8618248701095581, + "learning_rate": 0.000193894044074296, + "loss": 2.7853, + "step": 2325 + }, + { + "epoch": 0.1877168912920668, + "grad_norm": 0.8040831685066223, + "learning_rate": 0.00019388861095303046, + "loss": 2.7467, + "step": 2326 + }, + { + "epoch": 0.18779759502864982, + "grad_norm": 0.7723637223243713, + "learning_rate": 0.0001938831754918153, + "loss": 2.7222, + "step": 2327 + }, + { + "epoch": 0.18787829876523282, + "grad_norm": 0.8189084529876709, + "learning_rate": 0.000193877737690786, + "loss": 2.7857, + "step": 2328 + }, + { + "epoch": 0.18795900250181583, + "grad_norm": 0.8335791826248169, + "learning_rate": 0.00019387229755007805, + "loss": 2.6997, + "step": 2329 + }, + { + "epoch": 0.18803970623839883, + "grad_norm": 0.7732782959938049, + "learning_rate": 0.00019386685506982707, + "loss": 2.7155, + "step": 2330 + }, + { + "epoch": 0.18812040997498183, + "grad_norm": 0.8262906670570374, + "learning_rate": 0.0001938614102501687, + "loss": 2.7638, + "step": 2331 + }, + { + "epoch": 0.18820111371156484, + "grad_norm": 0.7969058156013489, + "learning_rate": 0.00019385596309123862, + "loss": 2.7363, + "step": 2332 + }, + { + "epoch": 0.18828181744814784, + "grad_norm": 0.7834853529930115, + "learning_rate": 0.0001938505135931726, + "loss": 2.7205, + "step": 2333 + }, + { + "epoch": 0.18836252118473085, + "grad_norm": 0.748481810092926, + "learning_rate": 0.00019384506175610647, + "loss": 2.7759, + "step": 2334 + }, + { + "epoch": 0.18844322492131385, + "grad_norm": 0.8137786984443665, + "learning_rate": 0.00019383960758017604, + "loss": 2.828, + "step": 2335 + }, + { + "epoch": 0.18852392865789686, + "grad_norm": 0.8065745234489441, + "learning_rate": 0.00019383415106551734, + "loss": 2.7408, + "step": 2336 + }, + { + "epoch": 
0.18860463239447986, + "grad_norm": 0.768643856048584, + "learning_rate": 0.0001938286922122663, + "loss": 2.6503, + "step": 2337 + }, + { + "epoch": 0.18868533613106286, + "grad_norm": 0.7677921652793884, + "learning_rate": 0.00019382323102055897, + "loss": 2.7088, + "step": 2338 + }, + { + "epoch": 0.18876603986764587, + "grad_norm": 0.7648717164993286, + "learning_rate": 0.0001938177674905315, + "loss": 2.7015, + "step": 2339 + }, + { + "epoch": 0.18884674360422887, + "grad_norm": 0.7517116665840149, + "learning_rate": 0.00019381230162231997, + "loss": 2.7095, + "step": 2340 + }, + { + "epoch": 0.18892744734081188, + "grad_norm": 0.8147841691970825, + "learning_rate": 0.00019380683341606067, + "loss": 2.8563, + "step": 2341 + }, + { + "epoch": 0.18900815107739488, + "grad_norm": 0.7849822640419006, + "learning_rate": 0.00019380136287188988, + "loss": 2.7432, + "step": 2342 + }, + { + "epoch": 0.18908885481397789, + "grad_norm": 0.813811719417572, + "learning_rate": 0.0001937958899899439, + "loss": 2.7419, + "step": 2343 + }, + { + "epoch": 0.1891695585505609, + "grad_norm": 0.8142707943916321, + "learning_rate": 0.00019379041477035923, + "loss": 2.7658, + "step": 2344 + }, + { + "epoch": 0.1892502622871439, + "grad_norm": 0.7594506740570068, + "learning_rate": 0.00019378493721327217, + "loss": 2.7298, + "step": 2345 + }, + { + "epoch": 0.1893309660237269, + "grad_norm": 0.8374232053756714, + "learning_rate": 0.00019377945731881936, + "loss": 2.8112, + "step": 2346 + }, + { + "epoch": 0.1894116697603099, + "grad_norm": 0.783608615398407, + "learning_rate": 0.00019377397508713734, + "loss": 2.8168, + "step": 2347 + }, + { + "epoch": 0.1894923734968929, + "grad_norm": 0.720214307308197, + "learning_rate": 0.0001937684905183627, + "loss": 2.7516, + "step": 2348 + }, + { + "epoch": 0.1895730772334759, + "grad_norm": 0.7939600944519043, + "learning_rate": 0.0001937630036126322, + "loss": 2.7609, + "step": 2349 + }, + { + "epoch": 0.18965378097005892, + "grad_norm": 
0.787315309047699, + "learning_rate": 0.00019375751437008252, + "loss": 2.758, + "step": 2350 + }, + { + "epoch": 0.18973448470664192, + "grad_norm": 0.7862411141395569, + "learning_rate": 0.00019375202279085053, + "loss": 2.6866, + "step": 2351 + }, + { + "epoch": 0.18981518844322492, + "grad_norm": 0.8651136159896851, + "learning_rate": 0.000193746528875073, + "loss": 2.7488, + "step": 2352 + }, + { + "epoch": 0.18989589217980793, + "grad_norm": 0.8150602579116821, + "learning_rate": 0.00019374103262288696, + "loss": 2.7417, + "step": 2353 + }, + { + "epoch": 0.18997659591639093, + "grad_norm": 0.9053540229797363, + "learning_rate": 0.00019373553403442934, + "loss": 2.7587, + "step": 2354 + }, + { + "epoch": 0.19005729965297394, + "grad_norm": 0.8775703310966492, + "learning_rate": 0.0001937300331098372, + "loss": 2.733, + "step": 2355 + }, + { + "epoch": 0.19013800338955694, + "grad_norm": 0.7714357972145081, + "learning_rate": 0.0001937245298492476, + "loss": 2.7595, + "step": 2356 + }, + { + "epoch": 0.19021870712613995, + "grad_norm": 0.8648017048835754, + "learning_rate": 0.0001937190242527977, + "loss": 2.7944, + "step": 2357 + }, + { + "epoch": 0.19029941086272295, + "grad_norm": 0.9367388486862183, + "learning_rate": 0.00019371351632062477, + "loss": 2.7902, + "step": 2358 + }, + { + "epoch": 0.19038011459930596, + "grad_norm": 0.8116368651390076, + "learning_rate": 0.00019370800605286604, + "loss": 2.7291, + "step": 2359 + }, + { + "epoch": 0.19046081833588896, + "grad_norm": 0.7892753481864929, + "learning_rate": 0.00019370249344965882, + "loss": 2.8192, + "step": 2360 + }, + { + "epoch": 0.19054152207247196, + "grad_norm": 0.8109372854232788, + "learning_rate": 0.00019369697851114056, + "loss": 2.6982, + "step": 2361 + }, + { + "epoch": 0.19062222580905497, + "grad_norm": 0.8756314516067505, + "learning_rate": 0.00019369146123744864, + "loss": 2.744, + "step": 2362 + }, + { + "epoch": 0.19070292954563797, + "grad_norm": 0.7400399446487427, + 
"learning_rate": 0.00019368594162872058, + "loss": 2.7328, + "step": 2363 + }, + { + "epoch": 0.19078363328222098, + "grad_norm": 0.8223158717155457, + "learning_rate": 0.000193680419685094, + "loss": 2.7614, + "step": 2364 + }, + { + "epoch": 0.19086433701880398, + "grad_norm": 0.7350139617919922, + "learning_rate": 0.00019367489540670645, + "loss": 2.7074, + "step": 2365 + }, + { + "epoch": 0.19094504075538699, + "grad_norm": 0.7915631532669067, + "learning_rate": 0.00019366936879369563, + "loss": 2.7835, + "step": 2366 + }, + { + "epoch": 0.19102574449197, + "grad_norm": 0.7765628099441528, + "learning_rate": 0.00019366383984619932, + "loss": 2.765, + "step": 2367 + }, + { + "epoch": 0.191106448228553, + "grad_norm": 0.8127059936523438, + "learning_rate": 0.00019365830856435525, + "loss": 2.7753, + "step": 2368 + }, + { + "epoch": 0.191187151965136, + "grad_norm": 0.8652897477149963, + "learning_rate": 0.0001936527749483013, + "loss": 2.7137, + "step": 2369 + }, + { + "epoch": 0.191267855701719, + "grad_norm": 0.8086774945259094, + "learning_rate": 0.00019364723899817541, + "loss": 2.7209, + "step": 2370 + }, + { + "epoch": 0.191348559438302, + "grad_norm": 0.7965098023414612, + "learning_rate": 0.00019364170071411554, + "loss": 2.786, + "step": 2371 + }, + { + "epoch": 0.19142926317488498, + "grad_norm": 0.7954064607620239, + "learning_rate": 0.00019363616009625967, + "loss": 2.7508, + "step": 2372 + }, + { + "epoch": 0.191509966911468, + "grad_norm": 0.7835928201675415, + "learning_rate": 0.00019363061714474595, + "loss": 2.7423, + "step": 2373 + }, + { + "epoch": 0.191590670648051, + "grad_norm": 0.8720580339431763, + "learning_rate": 0.0001936250718597125, + "loss": 2.7877, + "step": 2374 + }, + { + "epoch": 0.191671374384634, + "grad_norm": 0.836066484451294, + "learning_rate": 0.00019361952424129747, + "loss": 2.8456, + "step": 2375 + }, + { + "epoch": 0.191752078121217, + "grad_norm": 0.793666660785675, + "learning_rate": 0.00019361397428963923, + "loss": 
2.786, + "step": 2376 + }, + { + "epoch": 0.1918327818578, + "grad_norm": 0.8573217391967773, + "learning_rate": 0.000193608422004876, + "loss": 2.7569, + "step": 2377 + }, + { + "epoch": 0.191913485594383, + "grad_norm": 0.81243896484375, + "learning_rate": 0.00019360286738714623, + "loss": 2.771, + "step": 2378 + }, + { + "epoch": 0.19199418933096601, + "grad_norm": 0.7449626326560974, + "learning_rate": 0.00019359731043658832, + "loss": 2.7479, + "step": 2379 + }, + { + "epoch": 0.19207489306754902, + "grad_norm": 0.8124165534973145, + "learning_rate": 0.00019359175115334076, + "loss": 2.7602, + "step": 2380 + }, + { + "epoch": 0.19215559680413202, + "grad_norm": 0.7786986827850342, + "learning_rate": 0.00019358618953754211, + "loss": 2.6926, + "step": 2381 + }, + { + "epoch": 0.19223630054071503, + "grad_norm": 0.7987258434295654, + "learning_rate": 0.000193580625589331, + "loss": 2.7573, + "step": 2382 + }, + { + "epoch": 0.19231700427729803, + "grad_norm": 0.8236463665962219, + "learning_rate": 0.00019357505930884606, + "loss": 2.6755, + "step": 2383 + }, + { + "epoch": 0.19239770801388104, + "grad_norm": 0.8285779356956482, + "learning_rate": 0.00019356949069622602, + "loss": 2.7658, + "step": 2384 + }, + { + "epoch": 0.19247841175046404, + "grad_norm": 0.7823960781097412, + "learning_rate": 0.0001935639197516097, + "loss": 2.7404, + "step": 2385 + }, + { + "epoch": 0.19255911548704704, + "grad_norm": 0.968638002872467, + "learning_rate": 0.00019355834647513591, + "loss": 2.7836, + "step": 2386 + }, + { + "epoch": 0.19263981922363005, + "grad_norm": 0.8170328736305237, + "learning_rate": 0.00019355277086694357, + "loss": 2.7816, + "step": 2387 + }, + { + "epoch": 0.19272052296021305, + "grad_norm": 0.8342583179473877, + "learning_rate": 0.00019354719292717163, + "loss": 2.8204, + "step": 2388 + }, + { + "epoch": 0.19280122669679606, + "grad_norm": 0.8160435557365417, + "learning_rate": 0.0001935416126559591, + "loss": 2.6938, + "step": 2389 + }, + { + 
"epoch": 0.19288193043337906, + "grad_norm": 0.7888174653053284, + "learning_rate": 0.00019353603005344504, + "loss": 2.6804, + "step": 2390 + }, + { + "epoch": 0.19296263416996207, + "grad_norm": 0.8389205932617188, + "learning_rate": 0.00019353044511976865, + "loss": 2.7571, + "step": 2391 + }, + { + "epoch": 0.19304333790654507, + "grad_norm": 0.7920562028884888, + "learning_rate": 0.00019352485785506906, + "loss": 2.7174, + "step": 2392 + }, + { + "epoch": 0.19312404164312807, + "grad_norm": 0.7853459715843201, + "learning_rate": 0.00019351926825948555, + "loss": 2.7626, + "step": 2393 + }, + { + "epoch": 0.19320474537971108, + "grad_norm": 0.9109459519386292, + "learning_rate": 0.0001935136763331574, + "loss": 2.7568, + "step": 2394 + }, + { + "epoch": 0.19328544911629408, + "grad_norm": 0.7983853816986084, + "learning_rate": 0.00019350808207622397, + "loss": 2.7412, + "step": 2395 + }, + { + "epoch": 0.1933661528528771, + "grad_norm": 0.7416854500770569, + "learning_rate": 0.00019350248548882472, + "loss": 2.7335, + "step": 2396 + }, + { + "epoch": 0.1934468565894601, + "grad_norm": 0.7305171489715576, + "learning_rate": 0.0001934968865710991, + "loss": 2.7295, + "step": 2397 + }, + { + "epoch": 0.1935275603260431, + "grad_norm": 0.7717033624649048, + "learning_rate": 0.0001934912853231867, + "loss": 2.7568, + "step": 2398 + }, + { + "epoch": 0.1936082640626261, + "grad_norm": 0.7833831906318665, + "learning_rate": 0.00019348568174522705, + "loss": 2.736, + "step": 2399 + }, + { + "epoch": 0.1936889677992091, + "grad_norm": 0.872831404209137, + "learning_rate": 0.00019348007583735983, + "loss": 2.7719, + "step": 2400 + }, + { + "epoch": 0.1937696715357921, + "grad_norm": 0.8389193415641785, + "learning_rate": 0.0001934744675997248, + "loss": 2.7572, + "step": 2401 + }, + { + "epoch": 0.19385037527237511, + "grad_norm": 0.8442249298095703, + "learning_rate": 0.00019346885703246165, + "loss": 2.8117, + "step": 2402 + }, + { + "epoch": 0.19393107900895812, + 
"grad_norm": 0.8451170325279236, + "learning_rate": 0.00019346324413571027, + "loss": 2.7216, + "step": 2403 + }, + { + "epoch": 0.19401178274554112, + "grad_norm": 0.898529052734375, + "learning_rate": 0.00019345762890961052, + "loss": 2.8119, + "step": 2404 + }, + { + "epoch": 0.19409248648212413, + "grad_norm": 0.8302313685417175, + "learning_rate": 0.00019345201135430236, + "loss": 2.76, + "step": 2405 + }, + { + "epoch": 0.19417319021870713, + "grad_norm": 0.8975207209587097, + "learning_rate": 0.00019344639146992582, + "loss": 2.8043, + "step": 2406 + }, + { + "epoch": 0.19425389395529014, + "grad_norm": 0.8972581028938293, + "learning_rate": 0.0001934407692566209, + "loss": 2.7487, + "step": 2407 + }, + { + "epoch": 0.19433459769187314, + "grad_norm": 0.8311447501182556, + "learning_rate": 0.00019343514471452776, + "loss": 2.7653, + "step": 2408 + }, + { + "epoch": 0.19441530142845614, + "grad_norm": 0.8336243033409119, + "learning_rate": 0.0001934295178437866, + "loss": 2.753, + "step": 2409 + }, + { + "epoch": 0.19449600516503915, + "grad_norm": 0.8339207172393799, + "learning_rate": 0.0001934238886445376, + "loss": 2.7643, + "step": 2410 + }, + { + "epoch": 0.19457670890162215, + "grad_norm": 0.906074583530426, + "learning_rate": 0.0001934182571169211, + "loss": 2.7777, + "step": 2411 + }, + { + "epoch": 0.19465741263820516, + "grad_norm": 0.8759943246841431, + "learning_rate": 0.00019341262326107742, + "loss": 2.77, + "step": 2412 + }, + { + "epoch": 0.19473811637478816, + "grad_norm": 0.8399369716644287, + "learning_rate": 0.00019340698707714699, + "loss": 2.752, + "step": 2413 + }, + { + "epoch": 0.19481882011137117, + "grad_norm": 0.8551808595657349, + "learning_rate": 0.00019340134856527026, + "loss": 2.6727, + "step": 2414 + }, + { + "epoch": 0.19489952384795417, + "grad_norm": 0.7660732865333557, + "learning_rate": 0.00019339570772558778, + "loss": 2.7491, + "step": 2415 + }, + { + "epoch": 0.19498022758453717, + "grad_norm": 0.8257685303688049, + 
"learning_rate": 0.00019339006455824015, + "loss": 2.7584, + "step": 2416 + }, + { + "epoch": 0.19506093132112018, + "grad_norm": 0.797275960445404, + "learning_rate": 0.00019338441906336794, + "loss": 2.7051, + "step": 2417 + }, + { + "epoch": 0.19514163505770318, + "grad_norm": 0.8311913013458252, + "learning_rate": 0.00019337877124111193, + "loss": 2.8084, + "step": 2418 + }, + { + "epoch": 0.1952223387942862, + "grad_norm": 0.7995893359184265, + "learning_rate": 0.0001933731210916128, + "loss": 2.7556, + "step": 2419 + }, + { + "epoch": 0.1953030425308692, + "grad_norm": 0.792850136756897, + "learning_rate": 0.00019336746861501147, + "loss": 2.7289, + "step": 2420 + }, + { + "epoch": 0.1953837462674522, + "grad_norm": 0.8058848977088928, + "learning_rate": 0.00019336181381144873, + "loss": 2.7394, + "step": 2421 + }, + { + "epoch": 0.1954644500040352, + "grad_norm": 0.8267124891281128, + "learning_rate": 0.00019335615668106555, + "loss": 2.771, + "step": 2422 + }, + { + "epoch": 0.19554515374061818, + "grad_norm": 0.7641060948371887, + "learning_rate": 0.00019335049722400292, + "loss": 2.7311, + "step": 2423 + }, + { + "epoch": 0.19562585747720118, + "grad_norm": 0.8023245930671692, + "learning_rate": 0.00019334483544040186, + "loss": 2.7658, + "step": 2424 + }, + { + "epoch": 0.19570656121378419, + "grad_norm": 0.8341927528381348, + "learning_rate": 0.00019333917133040348, + "loss": 2.7476, + "step": 2425 + }, + { + "epoch": 0.1957872649503672, + "grad_norm": 0.7985726594924927, + "learning_rate": 0.000193333504894149, + "loss": 2.7362, + "step": 2426 + }, + { + "epoch": 0.1958679686869502, + "grad_norm": 0.7267594933509827, + "learning_rate": 0.0001933278361317796, + "loss": 2.6875, + "step": 2427 + }, + { + "epoch": 0.1959486724235332, + "grad_norm": 0.8292990326881409, + "learning_rate": 0.00019332216504343652, + "loss": 2.7619, + "step": 2428 + }, + { + "epoch": 0.1960293761601162, + "grad_norm": 0.7549588680267334, + "learning_rate": 
0.00019331649162926116, + "loss": 2.7385, + "step": 2429 + }, + { + "epoch": 0.1961100798966992, + "grad_norm": 0.7688446640968323, + "learning_rate": 0.0001933108158893949, + "loss": 2.7544, + "step": 2430 + }, + { + "epoch": 0.1961907836332822, + "grad_norm": 0.8168436884880066, + "learning_rate": 0.00019330513782397918, + "loss": 2.8013, + "step": 2431 + }, + { + "epoch": 0.19627148736986522, + "grad_norm": 0.8405759334564209, + "learning_rate": 0.00019329945743315556, + "loss": 2.7299, + "step": 2432 + }, + { + "epoch": 0.19635219110644822, + "grad_norm": 0.79430091381073, + "learning_rate": 0.00019329377471706554, + "loss": 2.7293, + "step": 2433 + }, + { + "epoch": 0.19643289484303122, + "grad_norm": 0.8428656458854675, + "learning_rate": 0.0001932880896758508, + "loss": 2.8211, + "step": 2434 + }, + { + "epoch": 0.19651359857961423, + "grad_norm": 0.7883139252662659, + "learning_rate": 0.00019328240230965298, + "loss": 2.6943, + "step": 2435 + }, + { + "epoch": 0.19659430231619723, + "grad_norm": 0.7539335489273071, + "learning_rate": 0.00019327671261861387, + "loss": 2.6926, + "step": 2436 + }, + { + "epoch": 0.19667500605278024, + "grad_norm": 0.9986057281494141, + "learning_rate": 0.00019327102060287524, + "loss": 2.7851, + "step": 2437 + }, + { + "epoch": 0.19675570978936324, + "grad_norm": 0.7716113924980164, + "learning_rate": 0.000193265326262579, + "loss": 2.752, + "step": 2438 + }, + { + "epoch": 0.19683641352594625, + "grad_norm": 0.9134296774864197, + "learning_rate": 0.000193259629597867, + "loss": 2.7698, + "step": 2439 + }, + { + "epoch": 0.19691711726252925, + "grad_norm": 0.7966345548629761, + "learning_rate": 0.00019325393060888124, + "loss": 2.7839, + "step": 2440 + }, + { + "epoch": 0.19699782099911226, + "grad_norm": 0.8051251173019409, + "learning_rate": 0.0001932482292957638, + "loss": 2.7322, + "step": 2441 + }, + { + "epoch": 0.19707852473569526, + "grad_norm": 0.843169629573822, + "learning_rate": 0.0001932425256586567, + "loss": 
2.8263, + "step": 2442 + }, + { + "epoch": 0.19715922847227826, + "grad_norm": 0.7552370429039001, + "learning_rate": 0.00019323681969770213, + "loss": 2.7342, + "step": 2443 + }, + { + "epoch": 0.19723993220886127, + "grad_norm": 0.844473123550415, + "learning_rate": 0.0001932311114130423, + "loss": 2.776, + "step": 2444 + }, + { + "epoch": 0.19732063594544427, + "grad_norm": 0.8002473711967468, + "learning_rate": 0.00019322540080481945, + "loss": 2.7382, + "step": 2445 + }, + { + "epoch": 0.19740133968202728, + "grad_norm": 0.8564329147338867, + "learning_rate": 0.00019321968787317594, + "loss": 2.7592, + "step": 2446 + }, + { + "epoch": 0.19748204341861028, + "grad_norm": 0.7853825688362122, + "learning_rate": 0.00019321397261825408, + "loss": 2.7101, + "step": 2447 + }, + { + "epoch": 0.19756274715519329, + "grad_norm": 0.8482939004898071, + "learning_rate": 0.0001932082550401964, + "loss": 2.7891, + "step": 2448 + }, + { + "epoch": 0.1976434508917763, + "grad_norm": 0.8361770510673523, + "learning_rate": 0.00019320253513914536, + "loss": 2.7341, + "step": 2449 + }, + { + "epoch": 0.1977241546283593, + "grad_norm": 0.7814618945121765, + "learning_rate": 0.0001931968129152435, + "loss": 2.771, + "step": 2450 + }, + { + "epoch": 0.1978048583649423, + "grad_norm": 0.7588146924972534, + "learning_rate": 0.00019319108836863343, + "loss": 2.7577, + "step": 2451 + }, + { + "epoch": 0.1978855621015253, + "grad_norm": 0.9184895157814026, + "learning_rate": 0.00019318536149945785, + "loss": 2.7711, + "step": 2452 + }, + { + "epoch": 0.1979662658381083, + "grad_norm": 0.8454298973083496, + "learning_rate": 0.00019317963230785947, + "loss": 2.7748, + "step": 2453 + }, + { + "epoch": 0.1980469695746913, + "grad_norm": 0.7662420868873596, + "learning_rate": 0.0001931739007939811, + "loss": 2.7704, + "step": 2454 + }, + { + "epoch": 0.19812767331127432, + "grad_norm": 0.837888777256012, + "learning_rate": 0.0001931681669579655, + "loss": 2.7613, + "step": 2455 + }, + { + 
"epoch": 0.19820837704785732, + "grad_norm": 0.7835226058959961, + "learning_rate": 0.0001931624307999557, + "loss": 2.6888, + "step": 2456 + }, + { + "epoch": 0.19828908078444032, + "grad_norm": 0.8491464257240295, + "learning_rate": 0.00019315669232009456, + "loss": 2.7521, + "step": 2457 + }, + { + "epoch": 0.19836978452102333, + "grad_norm": 0.7590088248252869, + "learning_rate": 0.00019315095151852516, + "loss": 2.7441, + "step": 2458 + }, + { + "epoch": 0.19845048825760633, + "grad_norm": 0.9316127300262451, + "learning_rate": 0.00019314520839539052, + "loss": 2.786, + "step": 2459 + }, + { + "epoch": 0.19853119199418934, + "grad_norm": 0.7819615006446838, + "learning_rate": 0.0001931394629508338, + "loss": 2.7003, + "step": 2460 + }, + { + "epoch": 0.19861189573077234, + "grad_norm": 0.7675932049751282, + "learning_rate": 0.0001931337151849982, + "loss": 2.7065, + "step": 2461 + }, + { + "epoch": 0.19869259946735535, + "grad_norm": 0.7797678112983704, + "learning_rate": 0.000193127965098027, + "loss": 2.7605, + "step": 2462 + }, + { + "epoch": 0.19877330320393835, + "grad_norm": 0.789544403553009, + "learning_rate": 0.00019312221269006345, + "loss": 2.7913, + "step": 2463 + }, + { + "epoch": 0.19885400694052136, + "grad_norm": 0.9594957232475281, + "learning_rate": 0.00019311645796125094, + "loss": 2.785, + "step": 2464 + }, + { + "epoch": 0.19893471067710436, + "grad_norm": 0.8154739141464233, + "learning_rate": 0.00019311070091173287, + "loss": 2.6716, + "step": 2465 + }, + { + "epoch": 0.19901541441368736, + "grad_norm": 0.9042142629623413, + "learning_rate": 0.00019310494154165274, + "loss": 2.734, + "step": 2466 + }, + { + "epoch": 0.19909611815027037, + "grad_norm": 0.7803483605384827, + "learning_rate": 0.0001930991798511541, + "loss": 2.7052, + "step": 2467 + }, + { + "epoch": 0.19917682188685337, + "grad_norm": 0.7917614579200745, + "learning_rate": 0.00019309341584038055, + "loss": 2.728, + "step": 2468 + }, + { + "epoch": 0.19925752562343638, + 
"grad_norm": 0.8295063376426697, + "learning_rate": 0.00019308764950947568, + "loss": 2.7496, + "step": 2469 + }, + { + "epoch": 0.19933822936001938, + "grad_norm": 0.790831983089447, + "learning_rate": 0.0001930818808585833, + "loss": 2.7356, + "step": 2470 + }, + { + "epoch": 0.19941893309660239, + "grad_norm": 0.8527843952178955, + "learning_rate": 0.0001930761098878471, + "loss": 2.718, + "step": 2471 + }, + { + "epoch": 0.1994996368331854, + "grad_norm": 0.8518494367599487, + "learning_rate": 0.00019307033659741096, + "loss": 2.7189, + "step": 2472 + }, + { + "epoch": 0.1995803405697684, + "grad_norm": 0.8027220368385315, + "learning_rate": 0.00019306456098741872, + "loss": 2.7272, + "step": 2473 + }, + { + "epoch": 0.19966104430635137, + "grad_norm": 0.7516468167304993, + "learning_rate": 0.00019305878305801434, + "loss": 2.798, + "step": 2474 + }, + { + "epoch": 0.19974174804293438, + "grad_norm": 0.7676397562026978, + "learning_rate": 0.00019305300280934187, + "loss": 2.8076, + "step": 2475 + }, + { + "epoch": 0.19982245177951738, + "grad_norm": 0.8237762451171875, + "learning_rate": 0.00019304722024154528, + "loss": 2.6998, + "step": 2476 + }, + { + "epoch": 0.19990315551610038, + "grad_norm": 0.8397759199142456, + "learning_rate": 0.0001930414353547688, + "loss": 2.806, + "step": 2477 + }, + { + "epoch": 0.1999838592526834, + "grad_norm": 0.8911117911338806, + "learning_rate": 0.00019303564814915645, + "loss": 2.7566, + "step": 2478 + }, + { + "epoch": 0.2000645629892664, + "grad_norm": 0.765404999256134, + "learning_rate": 0.00019302985862485264, + "loss": 2.7363, + "step": 2479 + }, + { + "epoch": 0.2001452667258494, + "grad_norm": 0.7898589372634888, + "learning_rate": 0.0001930240667820015, + "loss": 2.7007, + "step": 2480 + }, + { + "epoch": 0.2002259704624324, + "grad_norm": 0.7581521272659302, + "learning_rate": 0.0001930182726207475, + "loss": 2.7508, + "step": 2481 + }, + { + "epoch": 0.2003066741990154, + "grad_norm": 0.8179795742034912, + 
"learning_rate": 0.00019301247614123495, + "loss": 2.7327, + "step": 2482 + }, + { + "epoch": 0.2003873779355984, + "grad_norm": 0.8103611469268799, + "learning_rate": 0.00019300667734360838, + "loss": 2.7869, + "step": 2483 + }, + { + "epoch": 0.20046808167218141, + "grad_norm": 0.7368054389953613, + "learning_rate": 0.0001930008762280123, + "loss": 2.73, + "step": 2484 + }, + { + "epoch": 0.20054878540876442, + "grad_norm": 0.7679662108421326, + "learning_rate": 0.00019299507279459127, + "loss": 2.7905, + "step": 2485 + }, + { + "epoch": 0.20062948914534742, + "grad_norm": 0.7783839702606201, + "learning_rate": 0.0001929892670434899, + "loss": 2.6816, + "step": 2486 + }, + { + "epoch": 0.20071019288193043, + "grad_norm": 0.7575809359550476, + "learning_rate": 0.00019298345897485298, + "loss": 2.7351, + "step": 2487 + }, + { + "epoch": 0.20079089661851343, + "grad_norm": 0.7674959301948547, + "learning_rate": 0.00019297764858882514, + "loss": 2.7682, + "step": 2488 + }, + { + "epoch": 0.20087160035509644, + "grad_norm": 0.7972592115402222, + "learning_rate": 0.00019297183588555127, + "loss": 2.782, + "step": 2489 + }, + { + "epoch": 0.20095230409167944, + "grad_norm": 0.8417105674743652, + "learning_rate": 0.00019296602086517624, + "loss": 2.8173, + "step": 2490 + }, + { + "epoch": 0.20103300782826244, + "grad_norm": 0.7194239497184753, + "learning_rate": 0.00019296020352784496, + "loss": 2.7735, + "step": 2491 + }, + { + "epoch": 0.20111371156484545, + "grad_norm": 0.801895022392273, + "learning_rate": 0.00019295438387370237, + "loss": 2.7018, + "step": 2492 + }, + { + "epoch": 0.20119441530142845, + "grad_norm": 0.900943398475647, + "learning_rate": 0.0001929485619028936, + "loss": 2.77, + "step": 2493 + }, + { + "epoch": 0.20127511903801146, + "grad_norm": 0.7882106304168701, + "learning_rate": 0.00019294273761556366, + "loss": 2.7195, + "step": 2494 + }, + { + "epoch": 0.20135582277459446, + "grad_norm": 0.7471950054168701, + "learning_rate": 
0.00019293691101185775, + "loss": 2.7346, + "step": 2495 + }, + { + "epoch": 0.20143652651117747, + "grad_norm": 0.7498352527618408, + "learning_rate": 0.00019293108209192104, + "loss": 2.7255, + "step": 2496 + }, + { + "epoch": 0.20151723024776047, + "grad_norm": 0.8233164548873901, + "learning_rate": 0.0001929252508558989, + "loss": 2.8253, + "step": 2497 + }, + { + "epoch": 0.20159793398434347, + "grad_norm": 0.7533289790153503, + "learning_rate": 0.00019291941730393658, + "loss": 2.7487, + "step": 2498 + }, + { + "epoch": 0.20167863772092648, + "grad_norm": 0.7372691035270691, + "learning_rate": 0.0001929135814361795, + "loss": 2.6799, + "step": 2499 + }, + { + "epoch": 0.20175934145750948, + "grad_norm": 0.7760890126228333, + "learning_rate": 0.00019290774325277305, + "loss": 2.8366, + "step": 2500 + }, + { + "epoch": 0.2018400451940925, + "grad_norm": 0.7653746008872986, + "learning_rate": 0.0001929019027538628, + "loss": 2.7413, + "step": 2501 + }, + { + "epoch": 0.2019207489306755, + "grad_norm": 0.7364951372146606, + "learning_rate": 0.0001928960599395943, + "loss": 2.7405, + "step": 2502 + }, + { + "epoch": 0.2020014526672585, + "grad_norm": 0.8317872285842896, + "learning_rate": 0.00019289021481011314, + "loss": 2.7186, + "step": 2503 + }, + { + "epoch": 0.2020821564038415, + "grad_norm": 0.8325691223144531, + "learning_rate": 0.00019288436736556502, + "loss": 2.7305, + "step": 2504 + }, + { + "epoch": 0.2021628601404245, + "grad_norm": 0.7674683332443237, + "learning_rate": 0.00019287851760609566, + "loss": 2.7171, + "step": 2505 + }, + { + "epoch": 0.2022435638770075, + "grad_norm": 0.8043155074119568, + "learning_rate": 0.00019287266553185084, + "loss": 2.7425, + "step": 2506 + }, + { + "epoch": 0.2023242676135905, + "grad_norm": 0.8522058725357056, + "learning_rate": 0.00019286681114297642, + "loss": 2.7764, + "step": 2507 + }, + { + "epoch": 0.20240497135017352, + "grad_norm": 0.7700086236000061, + "learning_rate": 0.00019286095443961832, + "loss": 
2.7499, + "step": 2508 + }, + { + "epoch": 0.20248567508675652, + "grad_norm": 0.8078013062477112, + "learning_rate": 0.0001928550954219225, + "loss": 2.7863, + "step": 2509 + }, + { + "epoch": 0.20256637882333953, + "grad_norm": 0.7431712746620178, + "learning_rate": 0.00019284923409003496, + "loss": 2.8296, + "step": 2510 + }, + { + "epoch": 0.20264708255992253, + "grad_norm": 0.753754734992981, + "learning_rate": 0.00019284337044410182, + "loss": 2.722, + "step": 2511 + }, + { + "epoch": 0.20272778629650554, + "grad_norm": 0.8117631077766418, + "learning_rate": 0.00019283750448426918, + "loss": 2.7718, + "step": 2512 + }, + { + "epoch": 0.20280849003308854, + "grad_norm": 0.9149020910263062, + "learning_rate": 0.00019283163621068325, + "loss": 2.7416, + "step": 2513 + }, + { + "epoch": 0.20288919376967154, + "grad_norm": 0.8240262866020203, + "learning_rate": 0.0001928257656234903, + "loss": 2.811, + "step": 2514 + }, + { + "epoch": 0.20296989750625455, + "grad_norm": 0.7394035458564758, + "learning_rate": 0.00019281989272283657, + "loss": 2.7345, + "step": 2515 + }, + { + "epoch": 0.20305060124283755, + "grad_norm": 0.7827345132827759, + "learning_rate": 0.00019281401750886854, + "loss": 2.7955, + "step": 2516 + }, + { + "epoch": 0.20313130497942056, + "grad_norm": 0.7482333183288574, + "learning_rate": 0.00019280813998173252, + "loss": 2.6963, + "step": 2517 + }, + { + "epoch": 0.20321200871600356, + "grad_norm": 0.8187180757522583, + "learning_rate": 0.00019280226014157509, + "loss": 2.7413, + "step": 2518 + }, + { + "epoch": 0.20329271245258657, + "grad_norm": 0.7708666920661926, + "learning_rate": 0.00019279637798854274, + "loss": 2.7636, + "step": 2519 + }, + { + "epoch": 0.20337341618916957, + "grad_norm": 0.7414180040359497, + "learning_rate": 0.00019279049352278208, + "loss": 2.7321, + "step": 2520 + }, + { + "epoch": 0.20345411992575257, + "grad_norm": 0.8172248601913452, + "learning_rate": 0.00019278460674443975, + "loss": 2.8026, + "step": 2521 + }, 
+ { + "epoch": 0.20353482366233558, + "grad_norm": 0.7463089227676392, + "learning_rate": 0.0001927787176536625, + "loss": 2.74, + "step": 2522 + }, + { + "epoch": 0.20361552739891858, + "grad_norm": 0.7684210538864136, + "learning_rate": 0.00019277282625059704, + "loss": 2.782, + "step": 2523 + }, + { + "epoch": 0.2036962311355016, + "grad_norm": 0.9246797561645508, + "learning_rate": 0.00019276693253539027, + "loss": 2.8546, + "step": 2524 + }, + { + "epoch": 0.20377693487208456, + "grad_norm": 0.753753125667572, + "learning_rate": 0.00019276103650818906, + "loss": 2.7422, + "step": 2525 + }, + { + "epoch": 0.20385763860866757, + "grad_norm": 0.7461897134780884, + "learning_rate": 0.00019275513816914032, + "loss": 2.7575, + "step": 2526 + }, + { + "epoch": 0.20393834234525057, + "grad_norm": 0.7555257081985474, + "learning_rate": 0.00019274923751839106, + "loss": 2.7423, + "step": 2527 + }, + { + "epoch": 0.20401904608183358, + "grad_norm": 0.7628511786460876, + "learning_rate": 0.00019274333455608837, + "loss": 2.7386, + "step": 2528 + }, + { + "epoch": 0.20409974981841658, + "grad_norm": 0.7529371976852417, + "learning_rate": 0.00019273742928237937, + "loss": 2.6852, + "step": 2529 + }, + { + "epoch": 0.20418045355499959, + "grad_norm": 0.7466779351234436, + "learning_rate": 0.00019273152169741118, + "loss": 2.6996, + "step": 2530 + }, + { + "epoch": 0.2042611572915826, + "grad_norm": 0.7916153073310852, + "learning_rate": 0.0001927256118013311, + "loss": 2.7644, + "step": 2531 + }, + { + "epoch": 0.2043418610281656, + "grad_norm": 0.7662972211837769, + "learning_rate": 0.00019271969959428636, + "loss": 2.7497, + "step": 2532 + }, + { + "epoch": 0.2044225647647486, + "grad_norm": 0.8244680166244507, + "learning_rate": 0.00019271378507642432, + "loss": 2.7598, + "step": 2533 + }, + { + "epoch": 0.2045032685013316, + "grad_norm": 0.7721532583236694, + "learning_rate": 0.00019270786824789244, + "loss": 2.7303, + "step": 2534 + }, + { + "epoch": 0.2045839722379146, 
+ "grad_norm": 0.7598209381103516, + "learning_rate": 0.0001927019491088381, + "loss": 2.734, + "step": 2535 + }, + { + "epoch": 0.2046646759744976, + "grad_norm": 0.7778685092926025, + "learning_rate": 0.00019269602765940887, + "loss": 2.7113, + "step": 2536 + }, + { + "epoch": 0.20474537971108062, + "grad_norm": 0.7447141408920288, + "learning_rate": 0.00019269010389975235, + "loss": 2.7205, + "step": 2537 + }, + { + "epoch": 0.20482608344766362, + "grad_norm": 0.8066664338111877, + "learning_rate": 0.00019268417783001613, + "loss": 2.7637, + "step": 2538 + }, + { + "epoch": 0.20490678718424662, + "grad_norm": 0.7055318355560303, + "learning_rate": 0.00019267824945034794, + "loss": 2.6936, + "step": 2539 + }, + { + "epoch": 0.20498749092082963, + "grad_norm": 0.832647979259491, + "learning_rate": 0.0001926723187608955, + "loss": 2.7423, + "step": 2540 + }, + { + "epoch": 0.20506819465741263, + "grad_norm": 0.7316983938217163, + "learning_rate": 0.0001926663857618066, + "loss": 2.7136, + "step": 2541 + }, + { + "epoch": 0.20514889839399564, + "grad_norm": 0.8115554451942444, + "learning_rate": 0.00019266045045322915, + "loss": 2.6964, + "step": 2542 + }, + { + "epoch": 0.20522960213057864, + "grad_norm": 0.802573025226593, + "learning_rate": 0.00019265451283531108, + "loss": 2.7989, + "step": 2543 + }, + { + "epoch": 0.20531030586716165, + "grad_norm": 0.7073348164558411, + "learning_rate": 0.00019264857290820033, + "loss": 2.7399, + "step": 2544 + }, + { + "epoch": 0.20539100960374465, + "grad_norm": 0.7749258279800415, + "learning_rate": 0.00019264263067204495, + "loss": 2.7321, + "step": 2545 + }, + { + "epoch": 0.20547171334032766, + "grad_norm": 0.7473557591438293, + "learning_rate": 0.00019263668612699305, + "loss": 2.7774, + "step": 2546 + }, + { + "epoch": 0.20555241707691066, + "grad_norm": 0.8073423504829407, + "learning_rate": 0.0001926307392731928, + "loss": 2.7429, + "step": 2547 + }, + { + "epoch": 0.20563312081349366, + "grad_norm": 
0.9106586575508118, + "learning_rate": 0.00019262479011079235, + "loss": 2.7972, + "step": 2548 + }, + { + "epoch": 0.20571382455007667, + "grad_norm": 0.7975970506668091, + "learning_rate": 0.00019261883863994002, + "loss": 2.7561, + "step": 2549 + }, + { + "epoch": 0.20579452828665967, + "grad_norm": 0.8967030048370361, + "learning_rate": 0.00019261288486078414, + "loss": 2.7368, + "step": 2550 + }, + { + "epoch": 0.20587523202324268, + "grad_norm": 0.7157345414161682, + "learning_rate": 0.00019260692877347304, + "loss": 2.7329, + "step": 2551 + }, + { + "epoch": 0.20595593575982568, + "grad_norm": 0.8758620619773865, + "learning_rate": 0.00019260097037815524, + "loss": 2.7522, + "step": 2552 + }, + { + "epoch": 0.20603663949640869, + "grad_norm": 0.7948124408721924, + "learning_rate": 0.00019259500967497916, + "loss": 2.7675, + "step": 2553 + }, + { + "epoch": 0.2061173432329917, + "grad_norm": 0.8233941197395325, + "learning_rate": 0.00019258904666409344, + "loss": 2.7728, + "step": 2554 + }, + { + "epoch": 0.2061980469695747, + "grad_norm": 0.8084299564361572, + "learning_rate": 0.0001925830813456466, + "loss": 2.7728, + "step": 2555 + }, + { + "epoch": 0.2062787507061577, + "grad_norm": 0.8004557490348816, + "learning_rate": 0.00019257711371978737, + "loss": 2.7783, + "step": 2556 + }, + { + "epoch": 0.2063594544427407, + "grad_norm": 0.7999755144119263, + "learning_rate": 0.0001925711437866645, + "loss": 2.7632, + "step": 2557 + }, + { + "epoch": 0.2064401581793237, + "grad_norm": 0.7317264080047607, + "learning_rate": 0.0001925651715464267, + "loss": 2.7101, + "step": 2558 + }, + { + "epoch": 0.2065208619159067, + "grad_norm": 0.7906385660171509, + "learning_rate": 0.00019255919699922287, + "loss": 2.7258, + "step": 2559 + }, + { + "epoch": 0.20660156565248972, + "grad_norm": 0.7932917475700378, + "learning_rate": 0.0001925532201452019, + "loss": 2.7714, + "step": 2560 + }, + { + "epoch": 0.20668226938907272, + "grad_norm": 0.8039286732673645, + 
"learning_rate": 0.00019254724098451275, + "loss": 2.7469, + "step": 2561 + }, + { + "epoch": 0.20676297312565572, + "grad_norm": 0.79400634765625, + "learning_rate": 0.00019254125951730444, + "loss": 2.7499, + "step": 2562 + }, + { + "epoch": 0.20684367686223873, + "grad_norm": 0.8072263598442078, + "learning_rate": 0.00019253527574372603, + "loss": 2.7805, + "step": 2563 + }, + { + "epoch": 0.20692438059882173, + "grad_norm": 0.7117579579353333, + "learning_rate": 0.00019252928966392667, + "loss": 2.7321, + "step": 2564 + }, + { + "epoch": 0.20700508433540474, + "grad_norm": 0.7080324292182922, + "learning_rate": 0.00019252330127805554, + "loss": 2.7225, + "step": 2565 + }, + { + "epoch": 0.20708578807198774, + "grad_norm": 0.7276670336723328, + "learning_rate": 0.00019251731058626186, + "loss": 2.7592, + "step": 2566 + }, + { + "epoch": 0.20716649180857075, + "grad_norm": 0.8030811548233032, + "learning_rate": 0.00019251131758869495, + "loss": 2.7184, + "step": 2567 + }, + { + "epoch": 0.20724719554515375, + "grad_norm": 0.7808283567428589, + "learning_rate": 0.0001925053222855042, + "loss": 2.7504, + "step": 2568 + }, + { + "epoch": 0.20732789928173675, + "grad_norm": 0.783225953578949, + "learning_rate": 0.00019249932467683902, + "loss": 2.7125, + "step": 2569 + }, + { + "epoch": 0.20740860301831976, + "grad_norm": 0.7440134286880493, + "learning_rate": 0.00019249332476284887, + "loss": 2.7938, + "step": 2570 + }, + { + "epoch": 0.20748930675490276, + "grad_norm": 0.8729553818702698, + "learning_rate": 0.00019248732254368328, + "loss": 2.8338, + "step": 2571 + }, + { + "epoch": 0.20757001049148577, + "grad_norm": 0.8170497417449951, + "learning_rate": 0.0001924813180194918, + "loss": 2.7254, + "step": 2572 + }, + { + "epoch": 0.20765071422806877, + "grad_norm": 0.733220100402832, + "learning_rate": 0.00019247531119042418, + "loss": 2.6401, + "step": 2573 + }, + { + "epoch": 0.20773141796465178, + "grad_norm": 0.7247937917709351, + "learning_rate": 
0.00019246930205663008, + "loss": 2.736, + "step": 2574 + }, + { + "epoch": 0.20781212170123478, + "grad_norm": 0.7880212068557739, + "learning_rate": 0.00019246329061825925, + "loss": 2.7173, + "step": 2575 + }, + { + "epoch": 0.20789282543781776, + "grad_norm": 0.820808470249176, + "learning_rate": 0.00019245727687546149, + "loss": 2.7331, + "step": 2576 + }, + { + "epoch": 0.20797352917440076, + "grad_norm": 0.8605412840843201, + "learning_rate": 0.00019245126082838673, + "loss": 2.761, + "step": 2577 + }, + { + "epoch": 0.20805423291098377, + "grad_norm": 0.763506293296814, + "learning_rate": 0.00019244524247718486, + "loss": 2.7053, + "step": 2578 + }, + { + "epoch": 0.20813493664756677, + "grad_norm": 0.8428114652633667, + "learning_rate": 0.00019243922182200592, + "loss": 2.724, + "step": 2579 + }, + { + "epoch": 0.20821564038414977, + "grad_norm": 0.821986734867096, + "learning_rate": 0.0001924331988629999, + "loss": 2.7615, + "step": 2580 + }, + { + "epoch": 0.20829634412073278, + "grad_norm": 0.8177430629730225, + "learning_rate": 0.00019242717360031693, + "loss": 2.7012, + "step": 2581 + }, + { + "epoch": 0.20837704785731578, + "grad_norm": 0.7584180235862732, + "learning_rate": 0.00019242114603410724, + "loss": 2.7372, + "step": 2582 + }, + { + "epoch": 0.2084577515938988, + "grad_norm": 0.9384645223617554, + "learning_rate": 0.00019241511616452096, + "loss": 2.695, + "step": 2583 + }, + { + "epoch": 0.2085384553304818, + "grad_norm": 0.8518964648246765, + "learning_rate": 0.00019240908399170844, + "loss": 2.8216, + "step": 2584 + }, + { + "epoch": 0.2086191590670648, + "grad_norm": 0.9082949161529541, + "learning_rate": 0.00019240304951581995, + "loss": 2.777, + "step": 2585 + }, + { + "epoch": 0.2086998628036478, + "grad_norm": 0.7906371355056763, + "learning_rate": 0.00019239701273700597, + "loss": 2.7083, + "step": 2586 + }, + { + "epoch": 0.2087805665402308, + "grad_norm": 0.7711954712867737, + "learning_rate": 0.00019239097365541686, + "loss": 
2.6907, + "step": 2587 + }, + { + "epoch": 0.2088612702768138, + "grad_norm": 0.8155506253242493, + "learning_rate": 0.0001923849322712032, + "loss": 2.7602, + "step": 2588 + }, + { + "epoch": 0.20894197401339681, + "grad_norm": 0.8843441009521484, + "learning_rate": 0.0001923788885845155, + "loss": 2.7525, + "step": 2589 + }, + { + "epoch": 0.20902267774997982, + "grad_norm": 0.7336379289627075, + "learning_rate": 0.00019237284259550444, + "loss": 2.731, + "step": 2590 + }, + { + "epoch": 0.20910338148656282, + "grad_norm": 0.8261263370513916, + "learning_rate": 0.00019236679430432066, + "loss": 2.6493, + "step": 2591 + }, + { + "epoch": 0.20918408522314583, + "grad_norm": 0.7716216444969177, + "learning_rate": 0.00019236074371111497, + "loss": 2.7775, + "step": 2592 + }, + { + "epoch": 0.20926478895972883, + "grad_norm": 0.8390100598335266, + "learning_rate": 0.00019235469081603808, + "loss": 2.7532, + "step": 2593 + }, + { + "epoch": 0.20934549269631184, + "grad_norm": 0.8388446569442749, + "learning_rate": 0.00019234863561924087, + "loss": 2.8171, + "step": 2594 + }, + { + "epoch": 0.20942619643289484, + "grad_norm": 0.8003209829330444, + "learning_rate": 0.00019234257812087425, + "loss": 2.7385, + "step": 2595 + }, + { + "epoch": 0.20950690016947784, + "grad_norm": 0.8008458018302917, + "learning_rate": 0.00019233651832108918, + "loss": 2.7366, + "step": 2596 + }, + { + "epoch": 0.20958760390606085, + "grad_norm": 0.7701897025108337, + "learning_rate": 0.00019233045622003676, + "loss": 2.69, + "step": 2597 + }, + { + "epoch": 0.20966830764264385, + "grad_norm": 0.8106730580329895, + "learning_rate": 0.00019232439181786796, + "loss": 2.6911, + "step": 2598 + }, + { + "epoch": 0.20974901137922686, + "grad_norm": 0.9580766558647156, + "learning_rate": 0.00019231832511473401, + "loss": 2.7663, + "step": 2599 + }, + { + "epoch": 0.20982971511580986, + "grad_norm": 0.7851876616477966, + "learning_rate": 0.0001923122561107861, + "loss": 2.7632, + "step": 2600 + }, + 
{ + "epoch": 0.20991041885239287, + "grad_norm": 0.8160942196846008, + "learning_rate": 0.0001923061848061754, + "loss": 2.8533, + "step": 2601 + }, + { + "epoch": 0.20999112258897587, + "grad_norm": 0.8540663719177246, + "learning_rate": 0.00019230011120105334, + "loss": 2.7083, + "step": 2602 + }, + { + "epoch": 0.21007182632555887, + "grad_norm": 0.8273833394050598, + "learning_rate": 0.0001922940352955712, + "loss": 2.7916, + "step": 2603 + }, + { + "epoch": 0.21015253006214188, + "grad_norm": 0.8394255638122559, + "learning_rate": 0.00019228795708988046, + "loss": 2.8561, + "step": 2604 + }, + { + "epoch": 0.21023323379872488, + "grad_norm": 0.8291410803794861, + "learning_rate": 0.00019228187658413258, + "loss": 2.7462, + "step": 2605 + }, + { + "epoch": 0.2103139375353079, + "grad_norm": 0.7984235286712646, + "learning_rate": 0.00019227579377847912, + "loss": 2.7459, + "step": 2606 + }, + { + "epoch": 0.2103946412718909, + "grad_norm": 0.8343340158462524, + "learning_rate": 0.00019226970867307163, + "loss": 2.6963, + "step": 2607 + }, + { + "epoch": 0.2104753450084739, + "grad_norm": 0.6982808709144592, + "learning_rate": 0.00019226362126806184, + "loss": 2.7333, + "step": 2608 + }, + { + "epoch": 0.2105560487450569, + "grad_norm": 0.8039572834968567, + "learning_rate": 0.0001922575315636014, + "loss": 2.7253, + "step": 2609 + }, + { + "epoch": 0.2106367524816399, + "grad_norm": 0.8708705902099609, + "learning_rate": 0.00019225143955984214, + "loss": 2.7555, + "step": 2610 + }, + { + "epoch": 0.2107174562182229, + "grad_norm": 0.8773347735404968, + "learning_rate": 0.00019224534525693585, + "loss": 2.7598, + "step": 2611 + }, + { + "epoch": 0.2107981599548059, + "grad_norm": 0.8151054978370667, + "learning_rate": 0.0001922392486550344, + "loss": 2.7398, + "step": 2612 + }, + { + "epoch": 0.21087886369138892, + "grad_norm": 0.7922329306602478, + "learning_rate": 0.0001922331497542898, + "loss": 2.7296, + "step": 2613 + }, + { + "epoch": 0.21095956742797192, + 
"grad_norm": 0.7536506652832031, + "learning_rate": 0.00019222704855485396, + "loss": 2.7897, + "step": 2614 + }, + { + "epoch": 0.21104027116455493, + "grad_norm": 0.7539274096488953, + "learning_rate": 0.000192220945056879, + "loss": 2.7809, + "step": 2615 + }, + { + "epoch": 0.21112097490113793, + "grad_norm": 0.7737646698951721, + "learning_rate": 0.00019221483926051705, + "loss": 2.7195, + "step": 2616 + }, + { + "epoch": 0.21120167863772094, + "grad_norm": 0.7421913743019104, + "learning_rate": 0.00019220873116592024, + "loss": 2.6817, + "step": 2617 + }, + { + "epoch": 0.21128238237430394, + "grad_norm": 0.7872927784919739, + "learning_rate": 0.0001922026207732408, + "loss": 2.7379, + "step": 2618 + }, + { + "epoch": 0.21136308611088694, + "grad_norm": 0.7950671315193176, + "learning_rate": 0.00019219650808263104, + "loss": 2.7135, + "step": 2619 + }, + { + "epoch": 0.21144378984746995, + "grad_norm": 0.7711792588233948, + "learning_rate": 0.0001921903930942433, + "loss": 2.7021, + "step": 2620 + }, + { + "epoch": 0.21152449358405295, + "grad_norm": 0.9030743837356567, + "learning_rate": 0.00019218427580822996, + "loss": 2.8083, + "step": 2621 + }, + { + "epoch": 0.21160519732063596, + "grad_norm": 0.8191907405853271, + "learning_rate": 0.0001921781562247435, + "loss": 2.6998, + "step": 2622 + }, + { + "epoch": 0.21168590105721896, + "grad_norm": 0.7883538603782654, + "learning_rate": 0.00019217203434393644, + "loss": 2.7573, + "step": 2623 + }, + { + "epoch": 0.21176660479380197, + "grad_norm": 0.7565868496894836, + "learning_rate": 0.00019216591016596134, + "loss": 2.7725, + "step": 2624 + }, + { + "epoch": 0.21184730853038497, + "grad_norm": 0.8579828143119812, + "learning_rate": 0.00019215978369097086, + "loss": 2.7529, + "step": 2625 + }, + { + "epoch": 0.21192801226696797, + "grad_norm": 0.7835422158241272, + "learning_rate": 0.0001921536549191176, + "loss": 2.6926, + "step": 2626 + }, + { + "epoch": 0.21200871600355095, + "grad_norm": 
0.8041907548904419, + "learning_rate": 0.00019214752385055442, + "loss": 2.7541, + "step": 2627 + }, + { + "epoch": 0.21208941974013396, + "grad_norm": 0.7754014730453491, + "learning_rate": 0.00019214139048543406, + "loss": 2.6807, + "step": 2628 + }, + { + "epoch": 0.21217012347671696, + "grad_norm": 0.8222344517707825, + "learning_rate": 0.00019213525482390936, + "loss": 2.7339, + "step": 2629 + }, + { + "epoch": 0.21225082721329996, + "grad_norm": 0.8083673715591431, + "learning_rate": 0.0001921291168661333, + "loss": 2.739, + "step": 2630 + }, + { + "epoch": 0.21233153094988297, + "grad_norm": 0.8039100766181946, + "learning_rate": 0.0001921229766122588, + "loss": 2.7372, + "step": 2631 + }, + { + "epoch": 0.21241223468646597, + "grad_norm": 0.7513072490692139, + "learning_rate": 0.00019211683406243892, + "loss": 2.7284, + "step": 2632 + }, + { + "epoch": 0.21249293842304898, + "grad_norm": 0.7653890252113342, + "learning_rate": 0.00019211068921682673, + "loss": 2.6911, + "step": 2633 + }, + { + "epoch": 0.21257364215963198, + "grad_norm": 0.7210217714309692, + "learning_rate": 0.00019210454207557542, + "loss": 2.6989, + "step": 2634 + }, + { + "epoch": 0.21265434589621499, + "grad_norm": 0.7389202117919922, + "learning_rate": 0.00019209839263883814, + "loss": 2.7016, + "step": 2635 + }, + { + "epoch": 0.212735049632798, + "grad_norm": 0.8069031238555908, + "learning_rate": 0.00019209224090676813, + "loss": 2.8213, + "step": 2636 + }, + { + "epoch": 0.212815753369381, + "grad_norm": 0.8019161224365234, + "learning_rate": 0.00019208608687951877, + "loss": 2.7413, + "step": 2637 + }, + { + "epoch": 0.212896457105964, + "grad_norm": 0.775572657585144, + "learning_rate": 0.00019207993055724343, + "loss": 2.7016, + "step": 2638 + }, + { + "epoch": 0.212977160842547, + "grad_norm": 0.7482941746711731, + "learning_rate": 0.0001920737719400955, + "loss": 2.7991, + "step": 2639 + }, + { + "epoch": 0.21305786457913, + "grad_norm": 0.8467636704444885, + "learning_rate": 
0.0001920676110282285, + "loss": 2.7401, + "step": 2640 + }, + { + "epoch": 0.213138568315713, + "grad_norm": 0.8726305365562439, + "learning_rate": 0.00019206144782179597, + "loss": 2.7599, + "step": 2641 + }, + { + "epoch": 0.21321927205229602, + "grad_norm": 0.740527868270874, + "learning_rate": 0.00019205528232095148, + "loss": 2.7326, + "step": 2642 + }, + { + "epoch": 0.21329997578887902, + "grad_norm": 0.7932354211807251, + "learning_rate": 0.00019204911452584873, + "loss": 2.7873, + "step": 2643 + }, + { + "epoch": 0.21338067952546202, + "grad_norm": 0.7994125485420227, + "learning_rate": 0.00019204294443664143, + "loss": 2.7305, + "step": 2644 + }, + { + "epoch": 0.21346138326204503, + "grad_norm": 0.880557656288147, + "learning_rate": 0.00019203677205348338, + "loss": 2.7295, + "step": 2645 + }, + { + "epoch": 0.21354208699862803, + "grad_norm": 0.8269557952880859, + "learning_rate": 0.00019203059737652836, + "loss": 2.765, + "step": 2646 + }, + { + "epoch": 0.21362279073521104, + "grad_norm": 0.8732784986495972, + "learning_rate": 0.00019202442040593026, + "loss": 2.6742, + "step": 2647 + }, + { + "epoch": 0.21370349447179404, + "grad_norm": 0.7921704649925232, + "learning_rate": 0.0001920182411418431, + "loss": 2.7144, + "step": 2648 + }, + { + "epoch": 0.21378419820837705, + "grad_norm": 0.8097628355026245, + "learning_rate": 0.00019201205958442082, + "loss": 2.7513, + "step": 2649 + }, + { + "epoch": 0.21386490194496005, + "grad_norm": 0.8230542540550232, + "learning_rate": 0.00019200587573381744, + "loss": 2.7648, + "step": 2650 + }, + { + "epoch": 0.21394560568154306, + "grad_norm": 0.7719153761863708, + "learning_rate": 0.0001919996895901872, + "loss": 2.7637, + "step": 2651 + }, + { + "epoch": 0.21402630941812606, + "grad_norm": 0.9022669792175293, + "learning_rate": 0.00019199350115368415, + "loss": 2.7707, + "step": 2652 + }, + { + "epoch": 0.21410701315470906, + "grad_norm": 0.8111257553100586, + "learning_rate": 0.00019198731042446263, + 
"loss": 2.7423, + "step": 2653 + }, + { + "epoch": 0.21418771689129207, + "grad_norm": 0.7534981966018677, + "learning_rate": 0.00019198111740267683, + "loss": 2.7474, + "step": 2654 + }, + { + "epoch": 0.21426842062787507, + "grad_norm": 0.761411190032959, + "learning_rate": 0.00019197492208848117, + "loss": 2.7541, + "step": 2655 + }, + { + "epoch": 0.21434912436445808, + "grad_norm": 0.8076324462890625, + "learning_rate": 0.00019196872448203002, + "loss": 2.7198, + "step": 2656 + }, + { + "epoch": 0.21442982810104108, + "grad_norm": 0.7987746000289917, + "learning_rate": 0.00019196252458347784, + "loss": 2.7164, + "step": 2657 + }, + { + "epoch": 0.21451053183762409, + "grad_norm": 0.7581545114517212, + "learning_rate": 0.0001919563223929792, + "loss": 2.6837, + "step": 2658 + }, + { + "epoch": 0.2145912355742071, + "grad_norm": 0.8773601055145264, + "learning_rate": 0.00019195011791068857, + "loss": 2.8248, + "step": 2659 + }, + { + "epoch": 0.2146719393107901, + "grad_norm": 0.7027503252029419, + "learning_rate": 0.00019194391113676066, + "loss": 2.6726, + "step": 2660 + }, + { + "epoch": 0.2147526430473731, + "grad_norm": 0.8650866746902466, + "learning_rate": 0.00019193770207135015, + "loss": 2.7348, + "step": 2661 + }, + { + "epoch": 0.2148333467839561, + "grad_norm": 0.8521862030029297, + "learning_rate": 0.0001919314907146118, + "loss": 2.7409, + "step": 2662 + }, + { + "epoch": 0.2149140505205391, + "grad_norm": 0.8098535537719727, + "learning_rate": 0.00019192527706670033, + "loss": 2.7615, + "step": 2663 + }, + { + "epoch": 0.2149947542571221, + "grad_norm": 0.7396193146705627, + "learning_rate": 0.0001919190611277707, + "loss": 2.7191, + "step": 2664 + }, + { + "epoch": 0.21507545799370512, + "grad_norm": 0.8245799541473389, + "learning_rate": 0.00019191284289797776, + "loss": 2.7429, + "step": 2665 + }, + { + "epoch": 0.21515616173028812, + "grad_norm": 0.791646420955658, + "learning_rate": 0.00019190662237747656, + "loss": 2.7197, + "step": 2666 + 
}, + { + "epoch": 0.21523686546687112, + "grad_norm": 0.7850802540779114, + "learning_rate": 0.00019190039956642205, + "loss": 2.7353, + "step": 2667 + }, + { + "epoch": 0.21531756920345413, + "grad_norm": 0.7657971978187561, + "learning_rate": 0.00019189417446496937, + "loss": 2.7083, + "step": 2668 + }, + { + "epoch": 0.21539827294003713, + "grad_norm": 0.7704403400421143, + "learning_rate": 0.00019188794707327363, + "loss": 2.7813, + "step": 2669 + }, + { + "epoch": 0.21547897667662014, + "grad_norm": 0.7345917224884033, + "learning_rate": 0.00019188171739149005, + "loss": 2.7098, + "step": 2670 + }, + { + "epoch": 0.21555968041320314, + "grad_norm": 0.728831946849823, + "learning_rate": 0.00019187548541977392, + "loss": 2.6745, + "step": 2671 + }, + { + "epoch": 0.21564038414978615, + "grad_norm": 0.8079627156257629, + "learning_rate": 0.0001918692511582805, + "loss": 2.6427, + "step": 2672 + }, + { + "epoch": 0.21572108788636915, + "grad_norm": 0.766808032989502, + "learning_rate": 0.0001918630146071652, + "loss": 2.6956, + "step": 2673 + }, + { + "epoch": 0.21580179162295215, + "grad_norm": 0.7555391192436218, + "learning_rate": 0.00019185677576658345, + "loss": 2.6499, + "step": 2674 + }, + { + "epoch": 0.21588249535953516, + "grad_norm": 0.7740229964256287, + "learning_rate": 0.00019185053463669074, + "loss": 2.7685, + "step": 2675 + }, + { + "epoch": 0.21596319909611816, + "grad_norm": 0.8272803425788879, + "learning_rate": 0.00019184429121764257, + "loss": 2.7272, + "step": 2676 + }, + { + "epoch": 0.21604390283270117, + "grad_norm": 0.870625376701355, + "learning_rate": 0.00019183804550959463, + "loss": 2.7509, + "step": 2677 + }, + { + "epoch": 0.21612460656928414, + "grad_norm": 0.8021238446235657, + "learning_rate": 0.0001918317975127025, + "loss": 2.7058, + "step": 2678 + }, + { + "epoch": 0.21620531030586715, + "grad_norm": 0.729918897151947, + "learning_rate": 0.00019182554722712192, + "loss": 2.6145, + "step": 2679 + }, + { + "epoch": 
0.21628601404245015, + "grad_norm": 0.7658380270004272, + "learning_rate": 0.00019181929465300867, + "loss": 2.712, + "step": 2680 + }, + { + "epoch": 0.21636671777903316, + "grad_norm": 0.7702174186706543, + "learning_rate": 0.00019181303979051858, + "loss": 2.8257, + "step": 2681 + }, + { + "epoch": 0.21644742151561616, + "grad_norm": 0.7782231569290161, + "learning_rate": 0.00019180678263980755, + "loss": 2.8226, + "step": 2682 + }, + { + "epoch": 0.21652812525219917, + "grad_norm": 0.7448495626449585, + "learning_rate": 0.0001918005232010315, + "loss": 2.7877, + "step": 2683 + }, + { + "epoch": 0.21660882898878217, + "grad_norm": 0.7273527979850769, + "learning_rate": 0.00019179426147434647, + "loss": 2.7169, + "step": 2684 + }, + { + "epoch": 0.21668953272536517, + "grad_norm": 0.7730992436408997, + "learning_rate": 0.00019178799745990846, + "loss": 2.717, + "step": 2685 + }, + { + "epoch": 0.21677023646194818, + "grad_norm": 0.7709231376647949, + "learning_rate": 0.0001917817311578736, + "loss": 2.7676, + "step": 2686 + }, + { + "epoch": 0.21685094019853118, + "grad_norm": 0.7825181484222412, + "learning_rate": 0.00019177546256839812, + "loss": 2.7473, + "step": 2687 + }, + { + "epoch": 0.2169316439351142, + "grad_norm": 0.8133581280708313, + "learning_rate": 0.0001917691916916382, + "loss": 2.7242, + "step": 2688 + }, + { + "epoch": 0.2170123476716972, + "grad_norm": 0.7833015322685242, + "learning_rate": 0.00019176291852775011, + "loss": 2.8128, + "step": 2689 + }, + { + "epoch": 0.2170930514082802, + "grad_norm": 0.7423487305641174, + "learning_rate": 0.00019175664307689028, + "loss": 2.6999, + "step": 2690 + }, + { + "epoch": 0.2171737551448632, + "grad_norm": 0.7881289124488831, + "learning_rate": 0.000191750365339215, + "loss": 2.7349, + "step": 2691 + }, + { + "epoch": 0.2172544588814462, + "grad_norm": 0.8316197395324707, + "learning_rate": 0.00019174408531488077, + "loss": 2.7654, + "step": 2692 + }, + { + "epoch": 0.2173351626180292, + "grad_norm": 
0.7589917778968811, + "learning_rate": 0.00019173780300404413, + "loss": 2.6815, + "step": 2693 + }, + { + "epoch": 0.21741586635461221, + "grad_norm": 0.7752439975738525, + "learning_rate": 0.00019173151840686163, + "loss": 2.7804, + "step": 2694 + }, + { + "epoch": 0.21749657009119522, + "grad_norm": 0.8156552910804749, + "learning_rate": 0.0001917252315234899, + "loss": 2.7325, + "step": 2695 + }, + { + "epoch": 0.21757727382777822, + "grad_norm": 0.8886982798576355, + "learning_rate": 0.00019171894235408564, + "loss": 2.7257, + "step": 2696 + }, + { + "epoch": 0.21765797756436123, + "grad_norm": 0.8270704746246338, + "learning_rate": 0.00019171265089880558, + "loss": 2.7357, + "step": 2697 + }, + { + "epoch": 0.21773868130094423, + "grad_norm": 0.807700514793396, + "learning_rate": 0.00019170635715780651, + "loss": 2.7488, + "step": 2698 + }, + { + "epoch": 0.21781938503752724, + "grad_norm": 0.8195288181304932, + "learning_rate": 0.00019170006113124533, + "loss": 2.7048, + "step": 2699 + }, + { + "epoch": 0.21790008877411024, + "grad_norm": 0.817097008228302, + "learning_rate": 0.00019169376281927888, + "loss": 2.7148, + "step": 2700 + }, + { + "epoch": 0.21798079251069324, + "grad_norm": 0.8415588140487671, + "learning_rate": 0.0001916874622220642, + "loss": 2.7376, + "step": 2701 + }, + { + "epoch": 0.21806149624727625, + "grad_norm": 0.8004198670387268, + "learning_rate": 0.00019168115933975826, + "loss": 2.7145, + "step": 2702 + }, + { + "epoch": 0.21814219998385925, + "grad_norm": 0.8167368769645691, + "learning_rate": 0.0001916748541725182, + "loss": 2.6923, + "step": 2703 + }, + { + "epoch": 0.21822290372044226, + "grad_norm": 0.8877980709075928, + "learning_rate": 0.0001916685467205011, + "loss": 2.8232, + "step": 2704 + }, + { + "epoch": 0.21830360745702526, + "grad_norm": 0.7835622429847717, + "learning_rate": 0.00019166223698386422, + "loss": 2.7797, + "step": 2705 + }, + { + "epoch": 0.21838431119360827, + "grad_norm": 0.8023552894592285, + 
"learning_rate": 0.00019165592496276477, + "loss": 2.6697, + "step": 2706 + }, + { + "epoch": 0.21846501493019127, + "grad_norm": 0.8549069166183472, + "learning_rate": 0.00019164961065736008, + "loss": 2.729, + "step": 2707 + }, + { + "epoch": 0.21854571866677427, + "grad_norm": 0.8561950325965881, + "learning_rate": 0.00019164329406780753, + "loss": 2.772, + "step": 2708 + }, + { + "epoch": 0.21862642240335728, + "grad_norm": 0.6979276537895203, + "learning_rate": 0.00019163697519426453, + "loss": 2.7195, + "step": 2709 + }, + { + "epoch": 0.21870712613994028, + "grad_norm": 0.7659175395965576, + "learning_rate": 0.00019163065403688856, + "loss": 2.7742, + "step": 2710 + }, + { + "epoch": 0.2187878298765233, + "grad_norm": 0.8621466755867004, + "learning_rate": 0.00019162433059583718, + "loss": 2.721, + "step": 2711 + }, + { + "epoch": 0.2188685336131063, + "grad_norm": 0.8086833357810974, + "learning_rate": 0.00019161800487126795, + "loss": 2.7356, + "step": 2712 + }, + { + "epoch": 0.2189492373496893, + "grad_norm": 0.816215455532074, + "learning_rate": 0.00019161167686333855, + "loss": 2.7159, + "step": 2713 + }, + { + "epoch": 0.2190299410862723, + "grad_norm": 0.9180822968482971, + "learning_rate": 0.0001916053465722067, + "loss": 2.7162, + "step": 2714 + }, + { + "epoch": 0.2191106448228553, + "grad_norm": 0.7547199130058289, + "learning_rate": 0.00019159901399803014, + "loss": 2.7338, + "step": 2715 + }, + { + "epoch": 0.2191913485594383, + "grad_norm": 0.7380769848823547, + "learning_rate": 0.00019159267914096675, + "loss": 2.7149, + "step": 2716 + }, + { + "epoch": 0.2192720522960213, + "grad_norm": 0.7242285013198853, + "learning_rate": 0.00019158634200117433, + "loss": 2.724, + "step": 2717 + }, + { + "epoch": 0.21935275603260432, + "grad_norm": 0.8400316834449768, + "learning_rate": 0.00019158000257881087, + "loss": 2.7528, + "step": 2718 + }, + { + "epoch": 0.21943345976918732, + "grad_norm": 0.8437172770500183, + "learning_rate": 
0.00019157366087403435, + "loss": 2.7872, + "step": 2719 + }, + { + "epoch": 0.21951416350577033, + "grad_norm": 0.7428301572799683, + "learning_rate": 0.00019156731688700282, + "loss": 2.6831, + "step": 2720 + }, + { + "epoch": 0.21959486724235333, + "grad_norm": 0.7589641213417053, + "learning_rate": 0.00019156097061787445, + "loss": 2.7105, + "step": 2721 + }, + { + "epoch": 0.21967557097893634, + "grad_norm": 0.7607305645942688, + "learning_rate": 0.00019155462206680727, + "loss": 2.7913, + "step": 2722 + }, + { + "epoch": 0.21975627471551934, + "grad_norm": 0.7455689311027527, + "learning_rate": 0.00019154827123395963, + "loss": 2.6321, + "step": 2723 + }, + { + "epoch": 0.21983697845210234, + "grad_norm": 0.7860318422317505, + "learning_rate": 0.00019154191811948974, + "loss": 2.7907, + "step": 2724 + }, + { + "epoch": 0.21991768218868535, + "grad_norm": 0.8101385235786438, + "learning_rate": 0.00019153556272355596, + "loss": 2.7682, + "step": 2725 + }, + { + "epoch": 0.21999838592526835, + "grad_norm": 0.7437283396720886, + "learning_rate": 0.00019152920504631667, + "loss": 2.7271, + "step": 2726 + }, + { + "epoch": 0.22007908966185136, + "grad_norm": 0.7390851974487305, + "learning_rate": 0.00019152284508793034, + "loss": 2.7492, + "step": 2727 + }, + { + "epoch": 0.22015979339843436, + "grad_norm": 0.9074966311454773, + "learning_rate": 0.0001915164828485555, + "loss": 2.8076, + "step": 2728 + }, + { + "epoch": 0.22024049713501734, + "grad_norm": 0.7644218802452087, + "learning_rate": 0.00019151011832835063, + "loss": 2.7238, + "step": 2729 + }, + { + "epoch": 0.22032120087160034, + "grad_norm": 0.823567807674408, + "learning_rate": 0.0001915037515274744, + "loss": 2.7701, + "step": 2730 + }, + { + "epoch": 0.22040190460818335, + "grad_norm": 0.7601858377456665, + "learning_rate": 0.00019149738244608552, + "loss": 2.6981, + "step": 2731 + }, + { + "epoch": 0.22048260834476635, + "grad_norm": 0.8242961764335632, + "learning_rate": 0.00019149101108434269, + 
"loss": 2.6916, + "step": 2732 + }, + { + "epoch": 0.22056331208134936, + "grad_norm": 0.7970656156539917, + "learning_rate": 0.0001914846374424047, + "loss": 2.7858, + "step": 2733 + }, + { + "epoch": 0.22064401581793236, + "grad_norm": 0.7844050526618958, + "learning_rate": 0.0001914782615204304, + "loss": 2.6782, + "step": 2734 + }, + { + "epoch": 0.22072471955451536, + "grad_norm": 0.7965044975280762, + "learning_rate": 0.00019147188331857868, + "loss": 2.7563, + "step": 2735 + }, + { + "epoch": 0.22080542329109837, + "grad_norm": 0.8189071416854858, + "learning_rate": 0.00019146550283700856, + "loss": 2.7587, + "step": 2736 + }, + { + "epoch": 0.22088612702768137, + "grad_norm": 0.7610960602760315, + "learning_rate": 0.00019145912007587898, + "loss": 2.663, + "step": 2737 + }, + { + "epoch": 0.22096683076426438, + "grad_norm": 0.7642313838005066, + "learning_rate": 0.00019145273503534907, + "loss": 2.78, + "step": 2738 + }, + { + "epoch": 0.22104753450084738, + "grad_norm": 0.7699539065361023, + "learning_rate": 0.0001914463477155779, + "loss": 2.7429, + "step": 2739 + }, + { + "epoch": 0.22112823823743039, + "grad_norm": 0.7674413919448853, + "learning_rate": 0.00019143995811672477, + "loss": 2.7048, + "step": 2740 + }, + { + "epoch": 0.2212089419740134, + "grad_norm": 0.7871866226196289, + "learning_rate": 0.00019143356623894882, + "loss": 2.7769, + "step": 2741 + }, + { + "epoch": 0.2212896457105964, + "grad_norm": 0.8453468680381775, + "learning_rate": 0.00019142717208240937, + "loss": 2.7677, + "step": 2742 + }, + { + "epoch": 0.2213703494471794, + "grad_norm": 0.8050780892372131, + "learning_rate": 0.00019142077564726582, + "loss": 2.7809, + "step": 2743 + }, + { + "epoch": 0.2214510531837624, + "grad_norm": 0.811287522315979, + "learning_rate": 0.0001914143769336776, + "loss": 2.7201, + "step": 2744 + }, + { + "epoch": 0.2215317569203454, + "grad_norm": 0.823106050491333, + "learning_rate": 0.00019140797594180412, + "loss": 2.7371, + "step": 2745 + }, + 
{ + "epoch": 0.2216124606569284, + "grad_norm": 0.778126060962677, + "learning_rate": 0.0001914015726718049, + "loss": 2.6925, + "step": 2746 + }, + { + "epoch": 0.22169316439351142, + "grad_norm": 0.8240278959274292, + "learning_rate": 0.0001913951671238396, + "loss": 2.7227, + "step": 2747 + }, + { + "epoch": 0.22177386813009442, + "grad_norm": 0.8061805963516235, + "learning_rate": 0.0001913887592980678, + "loss": 2.7092, + "step": 2748 + }, + { + "epoch": 0.22185457186667742, + "grad_norm": 0.9111800789833069, + "learning_rate": 0.00019138234919464925, + "loss": 2.7364, + "step": 2749 + }, + { + "epoch": 0.22193527560326043, + "grad_norm": 0.8154863715171814, + "learning_rate": 0.0001913759368137437, + "loss": 2.6983, + "step": 2750 + }, + { + "epoch": 0.22201597933984343, + "grad_norm": 0.8547734022140503, + "learning_rate": 0.0001913695221555109, + "loss": 2.7016, + "step": 2751 + }, + { + "epoch": 0.22209668307642644, + "grad_norm": 0.7488531470298767, + "learning_rate": 0.00019136310522011079, + "loss": 2.6641, + "step": 2752 + }, + { + "epoch": 0.22217738681300944, + "grad_norm": 0.9118027091026306, + "learning_rate": 0.00019135668600770326, + "loss": 2.6965, + "step": 2753 + }, + { + "epoch": 0.22225809054959245, + "grad_norm": 0.7629117369651794, + "learning_rate": 0.00019135026451844834, + "loss": 2.7836, + "step": 2754 + }, + { + "epoch": 0.22233879428617545, + "grad_norm": 0.8081222176551819, + "learning_rate": 0.000191343840752506, + "loss": 2.7339, + "step": 2755 + }, + { + "epoch": 0.22241949802275846, + "grad_norm": 0.9143899083137512, + "learning_rate": 0.00019133741471003636, + "loss": 2.7051, + "step": 2756 + }, + { + "epoch": 0.22250020175934146, + "grad_norm": 0.8096790909767151, + "learning_rate": 0.00019133098639119962, + "loss": 2.6884, + "step": 2757 + }, + { + "epoch": 0.22258090549592446, + "grad_norm": 0.7959297895431519, + "learning_rate": 0.00019132455579615597, + "loss": 2.7127, + "step": 2758 + }, + { + "epoch": 
0.22266160923250747, + "grad_norm": 0.7111356854438782, + "learning_rate": 0.00019131812292506563, + "loss": 2.7418, + "step": 2759 + }, + { + "epoch": 0.22274231296909047, + "grad_norm": 0.7584012150764465, + "learning_rate": 0.00019131168777808898, + "loss": 2.6705, + "step": 2760 + }, + { + "epoch": 0.22282301670567348, + "grad_norm": 0.7646663784980774, + "learning_rate": 0.0001913052503553864, + "loss": 2.7166, + "step": 2761 + }, + { + "epoch": 0.22290372044225648, + "grad_norm": 0.7643954157829285, + "learning_rate": 0.00019129881065711827, + "loss": 2.7967, + "step": 2762 + }, + { + "epoch": 0.22298442417883949, + "grad_norm": 0.7591429948806763, + "learning_rate": 0.0001912923686834451, + "loss": 2.6611, + "step": 2763 + }, + { + "epoch": 0.2230651279154225, + "grad_norm": 0.7182386517524719, + "learning_rate": 0.00019128592443452749, + "loss": 2.6808, + "step": 2764 + }, + { + "epoch": 0.2231458316520055, + "grad_norm": 0.7689648270606995, + "learning_rate": 0.00019127947791052602, + "loss": 2.7288, + "step": 2765 + }, + { + "epoch": 0.2232265353885885, + "grad_norm": 0.7851321697235107, + "learning_rate": 0.00019127302911160136, + "loss": 2.7227, + "step": 2766 + }, + { + "epoch": 0.2233072391251715, + "grad_norm": 0.8419411182403564, + "learning_rate": 0.00019126657803791424, + "loss": 2.7397, + "step": 2767 + }, + { + "epoch": 0.2233879428617545, + "grad_norm": 0.7657596468925476, + "learning_rate": 0.0001912601246896254, + "loss": 2.7223, + "step": 2768 + }, + { + "epoch": 0.2234686465983375, + "grad_norm": 0.8033619523048401, + "learning_rate": 0.00019125366906689567, + "loss": 2.7256, + "step": 2769 + }, + { + "epoch": 0.22354935033492052, + "grad_norm": 0.7784682512283325, + "learning_rate": 0.00019124721116988601, + "loss": 2.7692, + "step": 2770 + }, + { + "epoch": 0.22363005407150352, + "grad_norm": 0.7842707633972168, + "learning_rate": 0.00019124075099875731, + "loss": 2.7707, + "step": 2771 + }, + { + "epoch": 0.22371075780808652, + 
"grad_norm": 0.7864845395088196, + "learning_rate": 0.0001912342885536706, + "loss": 2.6912, + "step": 2772 + }, + { + "epoch": 0.22379146154466953, + "grad_norm": 0.8544312715530396, + "learning_rate": 0.0001912278238347869, + "loss": 2.8345, + "step": 2773 + }, + { + "epoch": 0.22387216528125253, + "grad_norm": 0.7210882306098938, + "learning_rate": 0.0001912213568422674, + "loss": 2.6933, + "step": 2774 + }, + { + "epoch": 0.22395286901783554, + "grad_norm": 0.8877022862434387, + "learning_rate": 0.00019121488757627318, + "loss": 2.7583, + "step": 2775 + }, + { + "epoch": 0.22403357275441854, + "grad_norm": 0.902886688709259, + "learning_rate": 0.00019120841603696554, + "loss": 2.8, + "step": 2776 + }, + { + "epoch": 0.22411427649100155, + "grad_norm": 0.771294355392456, + "learning_rate": 0.0001912019422245058, + "loss": 2.7712, + "step": 2777 + }, + { + "epoch": 0.22419498022758455, + "grad_norm": 0.7973463535308838, + "learning_rate": 0.0001911954661390552, + "loss": 2.7368, + "step": 2778 + }, + { + "epoch": 0.22427568396416755, + "grad_norm": 0.776836633682251, + "learning_rate": 0.00019118898778077524, + "loss": 2.7126, + "step": 2779 + }, + { + "epoch": 0.22435638770075053, + "grad_norm": 0.8286641240119934, + "learning_rate": 0.00019118250714982731, + "loss": 2.7148, + "step": 2780 + }, + { + "epoch": 0.22443709143733354, + "grad_norm": 0.7848700284957886, + "learning_rate": 0.00019117602424637294, + "loss": 2.7284, + "step": 2781 + }, + { + "epoch": 0.22451779517391654, + "grad_norm": 0.7658216953277588, + "learning_rate": 0.0001911695390705737, + "loss": 2.7186, + "step": 2782 + }, + { + "epoch": 0.22459849891049954, + "grad_norm": 0.7596792578697205, + "learning_rate": 0.00019116305162259124, + "loss": 2.6854, + "step": 2783 + }, + { + "epoch": 0.22467920264708255, + "grad_norm": 0.7901157140731812, + "learning_rate": 0.00019115656190258726, + "loss": 2.7347, + "step": 2784 + }, + { + "epoch": 0.22475990638366555, + "grad_norm": 0.7499287128448486, + 
"learning_rate": 0.00019115006991072346, + "loss": 2.7219, + "step": 2785 + }, + { + "epoch": 0.22484061012024856, + "grad_norm": 0.7427374124526978, + "learning_rate": 0.00019114357564716162, + "loss": 2.7147, + "step": 2786 + }, + { + "epoch": 0.22492131385683156, + "grad_norm": 0.8305855393409729, + "learning_rate": 0.00019113707911206363, + "loss": 2.7587, + "step": 2787 + }, + { + "epoch": 0.22500201759341457, + "grad_norm": 0.8266459703445435, + "learning_rate": 0.00019113058030559142, + "loss": 2.7275, + "step": 2788 + }, + { + "epoch": 0.22508272132999757, + "grad_norm": 0.7338323593139648, + "learning_rate": 0.0001911240792279069, + "loss": 2.762, + "step": 2789 + }, + { + "epoch": 0.22516342506658057, + "grad_norm": 0.7653434872627258, + "learning_rate": 0.00019111757587917216, + "loss": 2.6715, + "step": 2790 + }, + { + "epoch": 0.22524412880316358, + "grad_norm": 0.76301509141922, + "learning_rate": 0.00019111107025954923, + "loss": 2.698, + "step": 2791 + }, + { + "epoch": 0.22532483253974658, + "grad_norm": 0.7810547947883606, + "learning_rate": 0.00019110456236920024, + "loss": 2.7295, + "step": 2792 + }, + { + "epoch": 0.2254055362763296, + "grad_norm": 0.7885214686393738, + "learning_rate": 0.00019109805220828742, + "loss": 2.7724, + "step": 2793 + }, + { + "epoch": 0.2254862400129126, + "grad_norm": 0.8087031841278076, + "learning_rate": 0.00019109153977697301, + "loss": 2.7888, + "step": 2794 + }, + { + "epoch": 0.2255669437494956, + "grad_norm": 0.795101523399353, + "learning_rate": 0.00019108502507541933, + "loss": 2.6815, + "step": 2795 + }, + { + "epoch": 0.2256476474860786, + "grad_norm": 0.8337482213973999, + "learning_rate": 0.0001910785081037887, + "loss": 2.8192, + "step": 2796 + }, + { + "epoch": 0.2257283512226616, + "grad_norm": 0.8357288241386414, + "learning_rate": 0.00019107198886224357, + "loss": 2.7867, + "step": 2797 + }, + { + "epoch": 0.2258090549592446, + "grad_norm": 0.80678391456604, + "learning_rate": 
0.00019106546735094644, + "loss": 2.7313, + "step": 2798 + }, + { + "epoch": 0.2258897586958276, + "grad_norm": 0.7481401562690735, + "learning_rate": 0.00019105894357005979, + "loss": 2.7073, + "step": 2799 + }, + { + "epoch": 0.22597046243241062, + "grad_norm": 0.8025074005126953, + "learning_rate": 0.00019105241751974622, + "loss": 2.6922, + "step": 2800 + }, + { + "epoch": 0.22605116616899362, + "grad_norm": 0.7308986186981201, + "learning_rate": 0.00019104588920016842, + "loss": 2.7511, + "step": 2801 + }, + { + "epoch": 0.22613186990557663, + "grad_norm": 0.7727689146995544, + "learning_rate": 0.00019103935861148905, + "loss": 2.707, + "step": 2802 + }, + { + "epoch": 0.22621257364215963, + "grad_norm": 0.8611076474189758, + "learning_rate": 0.0001910328257538709, + "loss": 2.8494, + "step": 2803 + }, + { + "epoch": 0.22629327737874264, + "grad_norm": 0.8487605452537537, + "learning_rate": 0.00019102629062747677, + "loss": 2.7698, + "step": 2804 + }, + { + "epoch": 0.22637398111532564, + "grad_norm": 0.7495502233505249, + "learning_rate": 0.00019101975323246952, + "loss": 2.7091, + "step": 2805 + }, + { + "epoch": 0.22645468485190864, + "grad_norm": 0.7334234118461609, + "learning_rate": 0.0001910132135690121, + "loss": 2.7375, + "step": 2806 + }, + { + "epoch": 0.22653538858849165, + "grad_norm": 0.879912257194519, + "learning_rate": 0.00019100667163726747, + "loss": 2.7278, + "step": 2807 + }, + { + "epoch": 0.22661609232507465, + "grad_norm": 0.8087306618690491, + "learning_rate": 0.0001910001274373987, + "loss": 2.8065, + "step": 2808 + }, + { + "epoch": 0.22669679606165766, + "grad_norm": 0.7548169493675232, + "learning_rate": 0.00019099358096956887, + "loss": 2.7235, + "step": 2809 + }, + { + "epoch": 0.22677749979824066, + "grad_norm": 0.7505785822868347, + "learning_rate": 0.00019098703223394118, + "loss": 2.6633, + "step": 2810 + }, + { + "epoch": 0.22685820353482367, + "grad_norm": 0.829075813293457, + "learning_rate": 0.00019098048123067875, + 
"loss": 2.7389, + "step": 2811 + }, + { + "epoch": 0.22693890727140667, + "grad_norm": 0.7731673121452332, + "learning_rate": 0.00019097392795994493, + "loss": 2.7639, + "step": 2812 + }, + { + "epoch": 0.22701961100798967, + "grad_norm": 0.7389004826545715, + "learning_rate": 0.00019096737242190303, + "loss": 2.717, + "step": 2813 + }, + { + "epoch": 0.22710031474457268, + "grad_norm": 0.7520460486412048, + "learning_rate": 0.0001909608146167164, + "loss": 2.7203, + "step": 2814 + }, + { + "epoch": 0.22718101848115568, + "grad_norm": 0.7272354364395142, + "learning_rate": 0.00019095425454454849, + "loss": 2.7306, + "step": 2815 + }, + { + "epoch": 0.2272617222177387, + "grad_norm": 0.7593528032302856, + "learning_rate": 0.00019094769220556282, + "loss": 2.7565, + "step": 2816 + }, + { + "epoch": 0.2273424259543217, + "grad_norm": 0.7312695384025574, + "learning_rate": 0.0001909411275999229, + "loss": 2.744, + "step": 2817 + }, + { + "epoch": 0.2274231296909047, + "grad_norm": 0.7483308911323547, + "learning_rate": 0.00019093456072779238, + "loss": 2.7938, + "step": 2818 + }, + { + "epoch": 0.2275038334274877, + "grad_norm": 0.8515620231628418, + "learning_rate": 0.00019092799158933486, + "loss": 2.7392, + "step": 2819 + }, + { + "epoch": 0.2275845371640707, + "grad_norm": 0.7119776606559753, + "learning_rate": 0.00019092142018471415, + "loss": 2.6985, + "step": 2820 + }, + { + "epoch": 0.2276652409006537, + "grad_norm": 0.7549445033073425, + "learning_rate": 0.00019091484651409394, + "loss": 2.7621, + "step": 2821 + }, + { + "epoch": 0.2277459446372367, + "grad_norm": 0.8728097081184387, + "learning_rate": 0.00019090827057763814, + "loss": 2.8321, + "step": 2822 + }, + { + "epoch": 0.22782664837381972, + "grad_norm": 0.755043089389801, + "learning_rate": 0.00019090169237551057, + "loss": 2.7341, + "step": 2823 + }, + { + "epoch": 0.22790735211040272, + "grad_norm": 0.7949401140213013, + "learning_rate": 0.00019089511190787523, + "loss": 2.7646, + "step": 2824 + }, 
+ { + "epoch": 0.22798805584698573, + "grad_norm": 0.8027622103691101, + "learning_rate": 0.00019088852917489607, + "loss": 2.7606, + "step": 2825 + }, + { + "epoch": 0.22806875958356873, + "grad_norm": 0.8609418869018555, + "learning_rate": 0.0001908819441767372, + "loss": 2.7433, + "step": 2826 + }, + { + "epoch": 0.22814946332015174, + "grad_norm": 0.8021805882453918, + "learning_rate": 0.00019087535691356271, + "loss": 2.7723, + "step": 2827 + }, + { + "epoch": 0.22823016705673474, + "grad_norm": 0.8104252219200134, + "learning_rate": 0.00019086876738553675, + "loss": 2.7229, + "step": 2828 + }, + { + "epoch": 0.22831087079331774, + "grad_norm": 0.8714433908462524, + "learning_rate": 0.00019086217559282362, + "loss": 2.75, + "step": 2829 + }, + { + "epoch": 0.22839157452990075, + "grad_norm": 0.7598714828491211, + "learning_rate": 0.0001908555815355875, + "loss": 2.6979, + "step": 2830 + }, + { + "epoch": 0.22847227826648372, + "grad_norm": 0.859708309173584, + "learning_rate": 0.00019084898521399283, + "loss": 2.7863, + "step": 2831 + }, + { + "epoch": 0.22855298200306673, + "grad_norm": 0.7798011302947998, + "learning_rate": 0.00019084238662820397, + "loss": 2.7623, + "step": 2832 + }, + { + "epoch": 0.22863368573964973, + "grad_norm": 0.7869576811790466, + "learning_rate": 0.00019083578577838535, + "loss": 2.7341, + "step": 2833 + }, + { + "epoch": 0.22871438947623274, + "grad_norm": 0.7486738562583923, + "learning_rate": 0.0001908291826647015, + "loss": 2.7615, + "step": 2834 + }, + { + "epoch": 0.22879509321281574, + "grad_norm": 0.8270190954208374, + "learning_rate": 0.00019082257728731704, + "loss": 2.7515, + "step": 2835 + }, + { + "epoch": 0.22887579694939875, + "grad_norm": 0.9060254693031311, + "learning_rate": 0.00019081596964639648, + "loss": 2.874, + "step": 2836 + }, + { + "epoch": 0.22895650068598175, + "grad_norm": 0.7802320122718811, + "learning_rate": 0.00019080935974210458, + "loss": 2.7224, + "step": 2837 + }, + { + "epoch": 
0.22903720442256476, + "grad_norm": 0.9513018131256104, + "learning_rate": 0.00019080274757460607, + "loss": 2.7168, + "step": 2838 + }, + { + "epoch": 0.22911790815914776, + "grad_norm": 0.7139711976051331, + "learning_rate": 0.0001907961331440657, + "loss": 2.676, + "step": 2839 + }, + { + "epoch": 0.22919861189573076, + "grad_norm": 0.8635632395744324, + "learning_rate": 0.00019078951645064838, + "loss": 2.6979, + "step": 2840 + }, + { + "epoch": 0.22927931563231377, + "grad_norm": 0.8823218941688538, + "learning_rate": 0.000190782897494519, + "loss": 2.7345, + "step": 2841 + }, + { + "epoch": 0.22936001936889677, + "grad_norm": 0.8139359354972839, + "learning_rate": 0.00019077627627584246, + "loss": 2.6988, + "step": 2842 + }, + { + "epoch": 0.22944072310547978, + "grad_norm": 0.8935994505882263, + "learning_rate": 0.00019076965279478383, + "loss": 2.7706, + "step": 2843 + }, + { + "epoch": 0.22952142684206278, + "grad_norm": 0.8362705111503601, + "learning_rate": 0.00019076302705150816, + "loss": 2.7593, + "step": 2844 + }, + { + "epoch": 0.22960213057864579, + "grad_norm": 0.7534157633781433, + "learning_rate": 0.00019075639904618066, + "loss": 2.7501, + "step": 2845 + }, + { + "epoch": 0.2296828343152288, + "grad_norm": 0.8826640248298645, + "learning_rate": 0.00019074976877896642, + "loss": 2.7758, + "step": 2846 + }, + { + "epoch": 0.2297635380518118, + "grad_norm": 0.8395571112632751, + "learning_rate": 0.0001907431362500307, + "loss": 2.7625, + "step": 2847 + }, + { + "epoch": 0.2298442417883948, + "grad_norm": 0.7927684783935547, + "learning_rate": 0.00019073650145953885, + "loss": 2.7392, + "step": 2848 + }, + { + "epoch": 0.2299249455249778, + "grad_norm": 0.823208749294281, + "learning_rate": 0.00019072986440765618, + "loss": 2.7259, + "step": 2849 + }, + { + "epoch": 0.2300056492615608, + "grad_norm": 0.889416515827179, + "learning_rate": 0.00019072322509454815, + "loss": 2.7539, + "step": 2850 + }, + { + "epoch": 0.2300863529981438, + "grad_norm": 
0.7957748770713806, + "learning_rate": 0.0001907165835203802, + "loss": 2.7756, + "step": 2851 + }, + { + "epoch": 0.23016705673472682, + "grad_norm": 0.7924029231071472, + "learning_rate": 0.00019070993968531782, + "loss": 2.7439, + "step": 2852 + }, + { + "epoch": 0.23024776047130982, + "grad_norm": 0.7811052799224854, + "learning_rate": 0.0001907032935895266, + "loss": 2.7479, + "step": 2853 + }, + { + "epoch": 0.23032846420789282, + "grad_norm": 0.7973877191543579, + "learning_rate": 0.00019069664523317225, + "loss": 2.7502, + "step": 2854 + }, + { + "epoch": 0.23040916794447583, + "grad_norm": 0.7524267435073853, + "learning_rate": 0.0001906899946164204, + "loss": 2.75, + "step": 2855 + }, + { + "epoch": 0.23048987168105883, + "grad_norm": 0.7594791054725647, + "learning_rate": 0.00019068334173943683, + "loss": 2.6534, + "step": 2856 + }, + { + "epoch": 0.23057057541764184, + "grad_norm": 0.7253785729408264, + "learning_rate": 0.00019067668660238733, + "loss": 2.7246, + "step": 2857 + }, + { + "epoch": 0.23065127915422484, + "grad_norm": 0.788737416267395, + "learning_rate": 0.00019067002920543775, + "loss": 2.757, + "step": 2858 + }, + { + "epoch": 0.23073198289080785, + "grad_norm": 0.7577618956565857, + "learning_rate": 0.00019066336954875403, + "loss": 2.674, + "step": 2859 + }, + { + "epoch": 0.23081268662739085, + "grad_norm": 0.7682929635047913, + "learning_rate": 0.0001906567076325022, + "loss": 2.8193, + "step": 2860 + }, + { + "epoch": 0.23089339036397385, + "grad_norm": 0.7742112874984741, + "learning_rate": 0.00019065004345684817, + "loss": 2.6969, + "step": 2861 + }, + { + "epoch": 0.23097409410055686, + "grad_norm": 0.7981678247451782, + "learning_rate": 0.00019064337702195814, + "loss": 2.7681, + "step": 2862 + }, + { + "epoch": 0.23105479783713986, + "grad_norm": 0.7608500123023987, + "learning_rate": 0.00019063670832799817, + "loss": 2.7459, + "step": 2863 + }, + { + "epoch": 0.23113550157372287, + "grad_norm": 0.7563463449478149, + 
"learning_rate": 0.00019063003737513455, + "loss": 2.7678, + "step": 2864 + }, + { + "epoch": 0.23121620531030587, + "grad_norm": 0.7915034890174866, + "learning_rate": 0.00019062336416353343, + "loss": 2.7577, + "step": 2865 + }, + { + "epoch": 0.23129690904688888, + "grad_norm": 0.7229592204093933, + "learning_rate": 0.00019061668869336122, + "loss": 2.7308, + "step": 2866 + }, + { + "epoch": 0.23137761278347188, + "grad_norm": 0.7910905480384827, + "learning_rate": 0.00019061001096478425, + "loss": 2.7571, + "step": 2867 + }, + { + "epoch": 0.23145831652005489, + "grad_norm": 0.8474656939506531, + "learning_rate": 0.00019060333097796895, + "loss": 2.7011, + "step": 2868 + }, + { + "epoch": 0.2315390202566379, + "grad_norm": 0.8005419373512268, + "learning_rate": 0.00019059664873308178, + "loss": 2.7441, + "step": 2869 + }, + { + "epoch": 0.2316197239932209, + "grad_norm": 0.7728021740913391, + "learning_rate": 0.00019058996423028935, + "loss": 2.7753, + "step": 2870 + }, + { + "epoch": 0.2317004277298039, + "grad_norm": 0.7338094115257263, + "learning_rate": 0.00019058327746975816, + "loss": 2.7009, + "step": 2871 + }, + { + "epoch": 0.2317811314663869, + "grad_norm": 0.7746245265007019, + "learning_rate": 0.00019057658845165494, + "loss": 2.6938, + "step": 2872 + }, + { + "epoch": 0.2318618352029699, + "grad_norm": 0.7474356293678284, + "learning_rate": 0.00019056989717614636, + "loss": 2.7161, + "step": 2873 + }, + { + "epoch": 0.2319425389395529, + "grad_norm": 0.9540585279464722, + "learning_rate": 0.00019056320364339917, + "loss": 2.7753, + "step": 2874 + }, + { + "epoch": 0.23202324267613592, + "grad_norm": 0.799726665019989, + "learning_rate": 0.00019055650785358024, + "loss": 2.7301, + "step": 2875 + }, + { + "epoch": 0.23210394641271892, + "grad_norm": 0.8087828159332275, + "learning_rate": 0.0001905498098068564, + "loss": 2.7305, + "step": 2876 + }, + { + "epoch": 0.23218465014930192, + "grad_norm": 0.8177600502967834, + "learning_rate": 
0.00019054310950339457, + "loss": 2.7462, + "step": 2877 + }, + { + "epoch": 0.23226535388588493, + "grad_norm": 0.7106238603591919, + "learning_rate": 0.00019053640694336181, + "loss": 2.7183, + "step": 2878 + }, + { + "epoch": 0.23234605762246793, + "grad_norm": 0.884185791015625, + "learning_rate": 0.00019052970212692514, + "loss": 2.7549, + "step": 2879 + }, + { + "epoch": 0.23242676135905094, + "grad_norm": 0.7532132267951965, + "learning_rate": 0.00019052299505425163, + "loss": 2.7524, + "step": 2880 + }, + { + "epoch": 0.23250746509563394, + "grad_norm": 0.7295021414756775, + "learning_rate": 0.00019051628572550842, + "loss": 2.6928, + "step": 2881 + }, + { + "epoch": 0.23258816883221692, + "grad_norm": 0.8475896716117859, + "learning_rate": 0.00019050957414086278, + "loss": 2.7138, + "step": 2882 + }, + { + "epoch": 0.23266887256879992, + "grad_norm": 0.7219378352165222, + "learning_rate": 0.00019050286030048198, + "loss": 2.7034, + "step": 2883 + }, + { + "epoch": 0.23274957630538293, + "grad_norm": 0.8410176634788513, + "learning_rate": 0.0001904961442045333, + "loss": 2.7413, + "step": 2884 + }, + { + "epoch": 0.23283028004196593, + "grad_norm": 0.7792301177978516, + "learning_rate": 0.00019048942585318414, + "loss": 2.6771, + "step": 2885 + }, + { + "epoch": 0.23291098377854894, + "grad_norm": 0.7457073926925659, + "learning_rate": 0.00019048270524660196, + "loss": 2.7325, + "step": 2886 + }, + { + "epoch": 0.23299168751513194, + "grad_norm": 0.8258858323097229, + "learning_rate": 0.00019047598238495424, + "loss": 2.7434, + "step": 2887 + }, + { + "epoch": 0.23307239125171494, + "grad_norm": 0.8188657164573669, + "learning_rate": 0.00019046925726840853, + "loss": 2.732, + "step": 2888 + }, + { + "epoch": 0.23315309498829795, + "grad_norm": 0.8084142208099365, + "learning_rate": 0.00019046252989713246, + "loss": 2.7537, + "step": 2889 + }, + { + "epoch": 0.23323379872488095, + "grad_norm": 0.75553297996521, + "learning_rate": 0.00019045580027129364, + 
"loss": 2.6685, + "step": 2890 + }, + { + "epoch": 0.23331450246146396, + "grad_norm": 0.8145995736122131, + "learning_rate": 0.00019044906839105986, + "loss": 2.7654, + "step": 2891 + }, + { + "epoch": 0.23339520619804696, + "grad_norm": 0.8433949947357178, + "learning_rate": 0.0001904423342565988, + "loss": 2.7713, + "step": 2892 + }, + { + "epoch": 0.23347590993462997, + "grad_norm": 0.7826054096221924, + "learning_rate": 0.0001904355978680784, + "loss": 2.7108, + "step": 2893 + }, + { + "epoch": 0.23355661367121297, + "grad_norm": 0.7281686663627625, + "learning_rate": 0.0001904288592256665, + "loss": 2.7606, + "step": 2894 + }, + { + "epoch": 0.23363731740779597, + "grad_norm": 0.8282813429832458, + "learning_rate": 0.00019042211832953103, + "loss": 2.6662, + "step": 2895 + }, + { + "epoch": 0.23371802114437898, + "grad_norm": 0.8227263689041138, + "learning_rate": 0.00019041537517984, + "loss": 2.7493, + "step": 2896 + }, + { + "epoch": 0.23379872488096198, + "grad_norm": 0.839350700378418, + "learning_rate": 0.0001904086297767615, + "loss": 2.7258, + "step": 2897 + }, + { + "epoch": 0.233879428617545, + "grad_norm": 0.713231086730957, + "learning_rate": 0.00019040188212046357, + "loss": 2.6722, + "step": 2898 + }, + { + "epoch": 0.233960132354128, + "grad_norm": 0.8314552903175354, + "learning_rate": 0.00019039513221111447, + "loss": 2.8509, + "step": 2899 + }, + { + "epoch": 0.234040836090711, + "grad_norm": 0.8885688781738281, + "learning_rate": 0.0001903883800488824, + "loss": 2.7608, + "step": 2900 + }, + { + "epoch": 0.234121539827294, + "grad_norm": 0.755308210849762, + "learning_rate": 0.00019038162563393555, + "loss": 2.7065, + "step": 2901 + }, + { + "epoch": 0.234202243563877, + "grad_norm": 0.7436641454696655, + "learning_rate": 0.00019037486896644236, + "loss": 2.6865, + "step": 2902 + }, + { + "epoch": 0.23428294730046, + "grad_norm": 0.7861987948417664, + "learning_rate": 0.0001903681100465712, + "loss": 2.7238, + "step": 2903 + }, + { + 
"epoch": 0.234363651037043, + "grad_norm": 0.7481045126914978, + "learning_rate": 0.0001903613488744905, + "loss": 2.7038, + "step": 2904 + }, + { + "epoch": 0.23444435477362602, + "grad_norm": 0.790765106678009, + "learning_rate": 0.0001903545854503688, + "loss": 2.6865, + "step": 2905 + }, + { + "epoch": 0.23452505851020902, + "grad_norm": 0.8594793677330017, + "learning_rate": 0.0001903478197743746, + "loss": 2.7324, + "step": 2906 + }, + { + "epoch": 0.23460576224679203, + "grad_norm": 0.7504310011863708, + "learning_rate": 0.00019034105184667662, + "loss": 2.6535, + "step": 2907 + }, + { + "epoch": 0.23468646598337503, + "grad_norm": 0.7824578881263733, + "learning_rate": 0.00019033428166744342, + "loss": 2.7113, + "step": 2908 + }, + { + "epoch": 0.23476716971995804, + "grad_norm": 0.7766899466514587, + "learning_rate": 0.0001903275092368438, + "loss": 2.6907, + "step": 2909 + }, + { + "epoch": 0.23484787345654104, + "grad_norm": 0.8082600235939026, + "learning_rate": 0.00019032073455504657, + "loss": 2.6781, + "step": 2910 + }, + { + "epoch": 0.23492857719312404, + "grad_norm": 0.7790517210960388, + "learning_rate": 0.0001903139576222205, + "loss": 2.7277, + "step": 2911 + }, + { + "epoch": 0.23500928092970705, + "grad_norm": 0.7449578046798706, + "learning_rate": 0.00019030717843853453, + "loss": 2.7078, + "step": 2912 + }, + { + "epoch": 0.23508998466629005, + "grad_norm": 0.7931632399559021, + "learning_rate": 0.0001903003970041576, + "loss": 2.7165, + "step": 2913 + }, + { + "epoch": 0.23517068840287306, + "grad_norm": 0.7970653176307678, + "learning_rate": 0.00019029361331925873, + "loss": 2.7993, + "step": 2914 + }, + { + "epoch": 0.23525139213945606, + "grad_norm": 0.8497335314750671, + "learning_rate": 0.00019028682738400697, + "loss": 2.7564, + "step": 2915 + }, + { + "epoch": 0.23533209587603907, + "grad_norm": 0.7840128540992737, + "learning_rate": 0.0001902800391985715, + "loss": 2.7546, + "step": 2916 + }, + { + "epoch": 0.23541279961262207, + 
"grad_norm": 0.8237372636795044, + "learning_rate": 0.00019027324876312146, + "loss": 2.7507, + "step": 2917 + }, + { + "epoch": 0.23549350334920507, + "grad_norm": 0.8445321917533875, + "learning_rate": 0.00019026645607782603, + "loss": 2.7287, + "step": 2918 + }, + { + "epoch": 0.23557420708578808, + "grad_norm": 0.8380417227745056, + "learning_rate": 0.0001902596611428546, + "loss": 2.7778, + "step": 2919 + }, + { + "epoch": 0.23565491082237108, + "grad_norm": 0.7989064455032349, + "learning_rate": 0.00019025286395837646, + "loss": 2.7254, + "step": 2920 + }, + { + "epoch": 0.2357356145589541, + "grad_norm": 0.8223496079444885, + "learning_rate": 0.00019024606452456102, + "loss": 2.7028, + "step": 2921 + }, + { + "epoch": 0.2358163182955371, + "grad_norm": 0.8090229630470276, + "learning_rate": 0.00019023926284157775, + "loss": 2.6911, + "step": 2922 + }, + { + "epoch": 0.2358970220321201, + "grad_norm": 0.7556560635566711, + "learning_rate": 0.00019023245890959615, + "loss": 2.7183, + "step": 2923 + }, + { + "epoch": 0.2359777257687031, + "grad_norm": 0.7907983660697937, + "learning_rate": 0.00019022565272878582, + "loss": 2.6805, + "step": 2924 + }, + { + "epoch": 0.2360584295052861, + "grad_norm": 0.9404142498970032, + "learning_rate": 0.0001902188442993164, + "loss": 2.8081, + "step": 2925 + }, + { + "epoch": 0.2361391332418691, + "grad_norm": 0.8349069952964783, + "learning_rate": 0.0001902120336213575, + "loss": 2.8329, + "step": 2926 + }, + { + "epoch": 0.2362198369784521, + "grad_norm": 0.8557522892951965, + "learning_rate": 0.00019020522069507892, + "loss": 2.704, + "step": 2927 + }, + { + "epoch": 0.23630054071503512, + "grad_norm": 0.7557278275489807, + "learning_rate": 0.00019019840552065044, + "loss": 2.7071, + "step": 2928 + }, + { + "epoch": 0.23638124445161812, + "grad_norm": 0.8810723423957825, + "learning_rate": 0.00019019158809824193, + "loss": 2.7535, + "step": 2929 + }, + { + "epoch": 0.23646194818820113, + "grad_norm": 0.7845562100410461, + 
"learning_rate": 0.00019018476842802326, + "loss": 2.7254, + "step": 2930 + }, + { + "epoch": 0.23654265192478413, + "grad_norm": 0.7566044926643372, + "learning_rate": 0.00019017794651016444, + "loss": 2.7295, + "step": 2931 + }, + { + "epoch": 0.23662335566136714, + "grad_norm": 0.8083382248878479, + "learning_rate": 0.00019017112234483545, + "loss": 2.7305, + "step": 2932 + }, + { + "epoch": 0.2367040593979501, + "grad_norm": 0.7924187183380127, + "learning_rate": 0.00019016429593220638, + "loss": 2.7659, + "step": 2933 + }, + { + "epoch": 0.23678476313453312, + "grad_norm": 0.8400307297706604, + "learning_rate": 0.00019015746727244737, + "loss": 2.7293, + "step": 2934 + }, + { + "epoch": 0.23686546687111612, + "grad_norm": 0.6931199431419373, + "learning_rate": 0.0001901506363657286, + "loss": 2.7189, + "step": 2935 + }, + { + "epoch": 0.23694617060769912, + "grad_norm": 0.8263585567474365, + "learning_rate": 0.0001901438032122203, + "loss": 2.7368, + "step": 2936 + }, + { + "epoch": 0.23702687434428213, + "grad_norm": 0.8001893162727356, + "learning_rate": 0.0001901369678120928, + "loss": 2.7793, + "step": 2937 + }, + { + "epoch": 0.23710757808086513, + "grad_norm": 0.7724235653877258, + "learning_rate": 0.00019013013016551644, + "loss": 2.717, + "step": 2938 + }, + { + "epoch": 0.23718828181744814, + "grad_norm": 0.7617147564888, + "learning_rate": 0.00019012329027266164, + "loss": 2.7275, + "step": 2939 + }, + { + "epoch": 0.23726898555403114, + "grad_norm": 0.80738765001297, + "learning_rate": 0.00019011644813369884, + "loss": 2.7444, + "step": 2940 + }, + { + "epoch": 0.23734968929061415, + "grad_norm": 0.7885528802871704, + "learning_rate": 0.00019010960374879861, + "loss": 2.7377, + "step": 2941 + }, + { + "epoch": 0.23743039302719715, + "grad_norm": 0.720268964767456, + "learning_rate": 0.00019010275711813147, + "loss": 2.6897, + "step": 2942 + }, + { + "epoch": 0.23751109676378016, + "grad_norm": 0.7532111406326294, + "learning_rate": 
0.00019009590824186815, + "loss": 2.8117, + "step": 2943 + }, + { + "epoch": 0.23759180050036316, + "grad_norm": 0.780777633190155, + "learning_rate": 0.00019008905712017925, + "loss": 2.7565, + "step": 2944 + }, + { + "epoch": 0.23767250423694616, + "grad_norm": 0.8721919059753418, + "learning_rate": 0.00019008220375323553, + "loss": 2.801, + "step": 2945 + }, + { + "epoch": 0.23775320797352917, + "grad_norm": 0.8258914947509766, + "learning_rate": 0.00019007534814120786, + "loss": 2.7696, + "step": 2946 + }, + { + "epoch": 0.23783391171011217, + "grad_norm": 0.7292730808258057, + "learning_rate": 0.00019006849028426704, + "loss": 2.7512, + "step": 2947 + }, + { + "epoch": 0.23791461544669518, + "grad_norm": 0.7789164185523987, + "learning_rate": 0.00019006163018258398, + "loss": 2.7489, + "step": 2948 + }, + { + "epoch": 0.23799531918327818, + "grad_norm": 0.8049725294113159, + "learning_rate": 0.00019005476783632967, + "loss": 2.672, + "step": 2949 + }, + { + "epoch": 0.23807602291986119, + "grad_norm": 0.7440119981765747, + "learning_rate": 0.00019004790324567519, + "loss": 2.7208, + "step": 2950 + }, + { + "epoch": 0.2381567266564442, + "grad_norm": 0.7695925235748291, + "learning_rate": 0.00019004103641079154, + "loss": 2.7816, + "step": 2951 + }, + { + "epoch": 0.2382374303930272, + "grad_norm": 0.7623234391212463, + "learning_rate": 0.00019003416733184988, + "loss": 2.7034, + "step": 2952 + }, + { + "epoch": 0.2383181341296102, + "grad_norm": 0.8136502504348755, + "learning_rate": 0.00019002729600902141, + "loss": 2.7638, + "step": 2953 + }, + { + "epoch": 0.2383988378661932, + "grad_norm": 0.7813066840171814, + "learning_rate": 0.00019002042244247743, + "loss": 2.7606, + "step": 2954 + }, + { + "epoch": 0.2384795416027762, + "grad_norm": 0.7863059043884277, + "learning_rate": 0.0001900135466323892, + "loss": 2.7219, + "step": 2955 + }, + { + "epoch": 0.2385602453393592, + "grad_norm": 0.8712359070777893, + "learning_rate": 0.00019000666857892806, + "loss": 
2.7485, + "step": 2956 + }, + { + "epoch": 0.23864094907594222, + "grad_norm": 0.8130611777305603, + "learning_rate": 0.00018999978828226547, + "loss": 2.7195, + "step": 2957 + }, + { + "epoch": 0.23872165281252522, + "grad_norm": 0.759503960609436, + "learning_rate": 0.00018999290574257292, + "loss": 2.6856, + "step": 2958 + }, + { + "epoch": 0.23880235654910822, + "grad_norm": 0.7490882277488708, + "learning_rate": 0.0001899860209600219, + "loss": 2.7587, + "step": 2959 + }, + { + "epoch": 0.23888306028569123, + "grad_norm": 0.8111297488212585, + "learning_rate": 0.000189979133934784, + "loss": 2.7688, + "step": 2960 + }, + { + "epoch": 0.23896376402227423, + "grad_norm": 0.844894289970398, + "learning_rate": 0.0001899722446670309, + "loss": 2.7706, + "step": 2961 + }, + { + "epoch": 0.23904446775885724, + "grad_norm": 0.7875459790229797, + "learning_rate": 0.00018996535315693423, + "loss": 2.7535, + "step": 2962 + }, + { + "epoch": 0.23912517149544024, + "grad_norm": 0.7768518328666687, + "learning_rate": 0.0001899584594046658, + "loss": 2.7268, + "step": 2963 + }, + { + "epoch": 0.23920587523202325, + "grad_norm": 0.8645716309547424, + "learning_rate": 0.00018995156341039744, + "loss": 2.7856, + "step": 2964 + }, + { + "epoch": 0.23928657896860625, + "grad_norm": 0.7816600799560547, + "learning_rate": 0.00018994466517430097, + "loss": 2.757, + "step": 2965 + }, + { + "epoch": 0.23936728270518925, + "grad_norm": 0.7967644333839417, + "learning_rate": 0.00018993776469654832, + "loss": 2.7021, + "step": 2966 + }, + { + "epoch": 0.23944798644177226, + "grad_norm": 0.800589919090271, + "learning_rate": 0.00018993086197731146, + "loss": 2.6838, + "step": 2967 + }, + { + "epoch": 0.23952869017835526, + "grad_norm": 0.7658529281616211, + "learning_rate": 0.00018992395701676246, + "loss": 2.6992, + "step": 2968 + }, + { + "epoch": 0.23960939391493827, + "grad_norm": 0.848456621170044, + "learning_rate": 0.00018991704981507338, + "loss": 2.7249, + "step": 2969 + }, + { + 
"epoch": 0.23969009765152127, + "grad_norm": 0.7365427017211914, + "learning_rate": 0.00018991014037241638, + "loss": 2.7044, + "step": 2970 + }, + { + "epoch": 0.23977080138810428, + "grad_norm": 0.8026351928710938, + "learning_rate": 0.00018990322868896365, + "loss": 2.7409, + "step": 2971 + }, + { + "epoch": 0.23985150512468728, + "grad_norm": 0.788646936416626, + "learning_rate": 0.00018989631476488744, + "loss": 2.7331, + "step": 2972 + }, + { + "epoch": 0.23993220886127029, + "grad_norm": 0.8388644456863403, + "learning_rate": 0.00018988939860036007, + "loss": 2.7478, + "step": 2973 + }, + { + "epoch": 0.2400129125978533, + "grad_norm": 0.7479026913642883, + "learning_rate": 0.00018988248019555394, + "loss": 2.7248, + "step": 2974 + }, + { + "epoch": 0.2400936163344363, + "grad_norm": 0.7313364744186401, + "learning_rate": 0.00018987555955064144, + "loss": 2.7323, + "step": 2975 + }, + { + "epoch": 0.2401743200710193, + "grad_norm": 0.7858260273933411, + "learning_rate": 0.00018986863666579505, + "loss": 2.6845, + "step": 2976 + }, + { + "epoch": 0.2402550238076023, + "grad_norm": 0.8090949654579163, + "learning_rate": 0.00018986171154118732, + "loss": 2.8094, + "step": 2977 + }, + { + "epoch": 0.2403357275441853, + "grad_norm": 0.7917135953903198, + "learning_rate": 0.00018985478417699085, + "loss": 2.7106, + "step": 2978 + }, + { + "epoch": 0.2404164312807683, + "grad_norm": 0.8192126154899597, + "learning_rate": 0.00018984785457337825, + "loss": 2.7729, + "step": 2979 + }, + { + "epoch": 0.24049713501735132, + "grad_norm": 0.797922670841217, + "learning_rate": 0.00018984092273052226, + "loss": 2.7747, + "step": 2980 + }, + { + "epoch": 0.24057783875393432, + "grad_norm": 0.9050948023796082, + "learning_rate": 0.00018983398864859564, + "loss": 2.7453, + "step": 2981 + }, + { + "epoch": 0.24065854249051732, + "grad_norm": 0.7827617526054382, + "learning_rate": 0.0001898270523277712, + "loss": 2.7371, + "step": 2982 + }, + { + "epoch": 0.24073924622710033, + 
"grad_norm": 0.7530156373977661, + "learning_rate": 0.0001898201137682218, + "loss": 2.7397, + "step": 2983 + }, + { + "epoch": 0.2408199499636833, + "grad_norm": 0.7989545464515686, + "learning_rate": 0.00018981317297012034, + "loss": 2.7532, + "step": 2984 + }, + { + "epoch": 0.2409006537002663, + "grad_norm": 0.7501168847084045, + "learning_rate": 0.00018980622993363988, + "loss": 2.7395, + "step": 2985 + }, + { + "epoch": 0.2409813574368493, + "grad_norm": 0.8073468208312988, + "learning_rate": 0.0001897992846589534, + "loss": 2.7673, + "step": 2986 + }, + { + "epoch": 0.24106206117343232, + "grad_norm": 0.9155512452125549, + "learning_rate": 0.00018979233714623401, + "loss": 2.6608, + "step": 2987 + }, + { + "epoch": 0.24114276491001532, + "grad_norm": 0.7461311221122742, + "learning_rate": 0.00018978538739565485, + "loss": 2.7657, + "step": 2988 + }, + { + "epoch": 0.24122346864659833, + "grad_norm": 0.8011443018913269, + "learning_rate": 0.00018977843540738914, + "loss": 2.7363, + "step": 2989 + }, + { + "epoch": 0.24130417238318133, + "grad_norm": 0.7602998614311218, + "learning_rate": 0.0001897714811816101, + "loss": 2.7285, + "step": 2990 + }, + { + "epoch": 0.24138487611976434, + "grad_norm": 0.8283531069755554, + "learning_rate": 0.00018976452471849116, + "loss": 2.7614, + "step": 2991 + }, + { + "epoch": 0.24146557985634734, + "grad_norm": 0.7358889579772949, + "learning_rate": 0.00018975756601820556, + "loss": 2.7429, + "step": 2992 + }, + { + "epoch": 0.24154628359293034, + "grad_norm": 0.7749240398406982, + "learning_rate": 0.0001897506050809268, + "loss": 2.6884, + "step": 2993 + }, + { + "epoch": 0.24162698732951335, + "grad_norm": 0.7529963254928589, + "learning_rate": 0.00018974364190682837, + "loss": 2.7619, + "step": 2994 + }, + { + "epoch": 0.24170769106609635, + "grad_norm": 0.7946054935455322, + "learning_rate": 0.00018973667649608376, + "loss": 2.7403, + "step": 2995 + }, + { + "epoch": 0.24178839480267936, + "grad_norm": 
0.735870897769928, + "learning_rate": 0.0001897297088488666, + "loss": 2.7158, + "step": 2996 + }, + { + "epoch": 0.24186909853926236, + "grad_norm": 0.8409188985824585, + "learning_rate": 0.00018972273896535055, + "loss": 2.768, + "step": 2997 + }, + { + "epoch": 0.24194980227584537, + "grad_norm": 0.8351938724517822, + "learning_rate": 0.0001897157668457093, + "loss": 2.7548, + "step": 2998 + }, + { + "epoch": 0.24203050601242837, + "grad_norm": 0.8339046239852905, + "learning_rate": 0.00018970879249011663, + "loss": 2.7842, + "step": 2999 + }, + { + "epoch": 0.24211120974901137, + "grad_norm": 0.8092730641365051, + "learning_rate": 0.00018970181589874637, + "loss": 2.7141, + "step": 3000 + }, + { + "epoch": 0.24211120974901137, + "eval_loss": 2.643277406692505, + "eval_runtime": 784.7512, + "eval_samples_per_second": 3.339, + "eval_steps_per_second": 0.557, + "step": 3000 + }, + { + "epoch": 0.24219191348559438, + "grad_norm": 0.8014447093009949, + "learning_rate": 0.00018969483707177235, + "loss": 2.7341, + "step": 3001 + }, + { + "epoch": 0.24227261722217738, + "grad_norm": 0.744153618812561, + "learning_rate": 0.00018968785600936855, + "loss": 2.678, + "step": 3002 + }, + { + "epoch": 0.2423533209587604, + "grad_norm": 0.7264240384101868, + "learning_rate": 0.0001896808727117089, + "loss": 2.7321, + "step": 3003 + }, + { + "epoch": 0.2424340246953434, + "grad_norm": 0.8214067220687866, + "learning_rate": 0.00018967388717896748, + "loss": 2.7311, + "step": 3004 + }, + { + "epoch": 0.2425147284319264, + "grad_norm": 0.7871330976486206, + "learning_rate": 0.00018966689941131838, + "loss": 2.7184, + "step": 3005 + }, + { + "epoch": 0.2425954321685094, + "grad_norm": 0.7301360964775085, + "learning_rate": 0.00018965990940893575, + "loss": 2.7039, + "step": 3006 + }, + { + "epoch": 0.2426761359050924, + "grad_norm": 0.8290385603904724, + "learning_rate": 0.00018965291717199382, + "loss": 2.7848, + "step": 3007 + }, + { + "epoch": 0.2427568396416754, + "grad_norm": 
0.7465909123420715, + "learning_rate": 0.00018964592270066683, + "loss": 2.7271, + "step": 3008 + }, + { + "epoch": 0.2428375433782584, + "grad_norm": 0.7992933988571167, + "learning_rate": 0.00018963892599512913, + "loss": 2.7749, + "step": 3009 + }, + { + "epoch": 0.24291824711484142, + "grad_norm": 0.7879100441932678, + "learning_rate": 0.00018963192705555507, + "loss": 2.6844, + "step": 3010 + }, + { + "epoch": 0.24299895085142442, + "grad_norm": 0.7895401120185852, + "learning_rate": 0.00018962492588211905, + "loss": 2.725, + "step": 3011 + }, + { + "epoch": 0.24307965458800743, + "grad_norm": 0.7699374556541443, + "learning_rate": 0.00018961792247499564, + "loss": 2.7408, + "step": 3012 + }, + { + "epoch": 0.24316035832459043, + "grad_norm": 0.828372597694397, + "learning_rate": 0.0001896109168343593, + "loss": 2.7527, + "step": 3013 + }, + { + "epoch": 0.24324106206117344, + "grad_norm": 0.7611951231956482, + "learning_rate": 0.0001896039089603847, + "loss": 2.7294, + "step": 3014 + }, + { + "epoch": 0.24332176579775644, + "grad_norm": 0.8214892148971558, + "learning_rate": 0.00018959689885324646, + "loss": 2.6931, + "step": 3015 + }, + { + "epoch": 0.24340246953433944, + "grad_norm": 0.7472538352012634, + "learning_rate": 0.00018958988651311928, + "loss": 2.7316, + "step": 3016 + }, + { + "epoch": 0.24348317327092245, + "grad_norm": 0.7574933171272278, + "learning_rate": 0.00018958287194017795, + "loss": 2.7764, + "step": 3017 + }, + { + "epoch": 0.24356387700750545, + "grad_norm": 0.739152729511261, + "learning_rate": 0.00018957585513459723, + "loss": 2.7949, + "step": 3018 + }, + { + "epoch": 0.24364458074408846, + "grad_norm": 0.824097752571106, + "learning_rate": 0.00018956883609655208, + "loss": 2.6612, + "step": 3019 + }, + { + "epoch": 0.24372528448067146, + "grad_norm": 0.7891144156455994, + "learning_rate": 0.00018956181482621744, + "loss": 2.7139, + "step": 3020 + }, + { + "epoch": 0.24380598821725447, + "grad_norm": 0.7364415526390076, + 
"learning_rate": 0.0001895547913237682, + "loss": 2.6984, + "step": 3021 + }, + { + "epoch": 0.24388669195383747, + "grad_norm": 0.7631362080574036, + "learning_rate": 0.0001895477655893795, + "loss": 2.7015, + "step": 3022 + }, + { + "epoch": 0.24396739569042047, + "grad_norm": 0.780541181564331, + "learning_rate": 0.00018954073762322637, + "loss": 2.7716, + "step": 3023 + }, + { + "epoch": 0.24404809942700348, + "grad_norm": 0.7877349853515625, + "learning_rate": 0.00018953370742548403, + "loss": 2.6654, + "step": 3024 + }, + { + "epoch": 0.24412880316358648, + "grad_norm": 0.7786216139793396, + "learning_rate": 0.00018952667499632763, + "loss": 2.7491, + "step": 3025 + }, + { + "epoch": 0.2442095069001695, + "grad_norm": 0.8207663893699646, + "learning_rate": 0.00018951964033593247, + "loss": 2.7212, + "step": 3026 + }, + { + "epoch": 0.2442902106367525, + "grad_norm": 0.8271831274032593, + "learning_rate": 0.00018951260344447386, + "loss": 2.7456, + "step": 3027 + }, + { + "epoch": 0.2443709143733355, + "grad_norm": 0.7610505819320679, + "learning_rate": 0.00018950556432212722, + "loss": 2.7472, + "step": 3028 + }, + { + "epoch": 0.2444516181099185, + "grad_norm": 0.7521701455116272, + "learning_rate": 0.00018949852296906792, + "loss": 2.7263, + "step": 3029 + }, + { + "epoch": 0.2445323218465015, + "grad_norm": 0.7518337965011597, + "learning_rate": 0.00018949147938547144, + "loss": 2.7069, + "step": 3030 + }, + { + "epoch": 0.2446130255830845, + "grad_norm": 0.7823107838630676, + "learning_rate": 0.00018948443357151343, + "loss": 2.7858, + "step": 3031 + }, + { + "epoch": 0.2446937293196675, + "grad_norm": 0.733132004737854, + "learning_rate": 0.00018947738552736938, + "loss": 2.7194, + "step": 3032 + }, + { + "epoch": 0.24477443305625052, + "grad_norm": 0.7756488919258118, + "learning_rate": 0.00018947033525321501, + "loss": 2.7299, + "step": 3033 + }, + { + "epoch": 0.24485513679283352, + "grad_norm": 0.7971112728118896, + "learning_rate": 
0.00018946328274922598, + "loss": 2.7474, + "step": 3034 + }, + { + "epoch": 0.2449358405294165, + "grad_norm": 0.7871260643005371, + "learning_rate": 0.0001894562280155781, + "loss": 2.6994, + "step": 3035 + }, + { + "epoch": 0.2450165442659995, + "grad_norm": 0.7431116104125977, + "learning_rate": 0.00018944917105244717, + "loss": 2.6834, + "step": 3036 + }, + { + "epoch": 0.2450972480025825, + "grad_norm": 0.7372273206710815, + "learning_rate": 0.00018944211186000906, + "loss": 2.6988, + "step": 3037 + }, + { + "epoch": 0.2451779517391655, + "grad_norm": 0.8161508440971375, + "learning_rate": 0.00018943505043843975, + "loss": 2.7595, + "step": 3038 + }, + { + "epoch": 0.24525865547574852, + "grad_norm": 0.8062586784362793, + "learning_rate": 0.00018942798678791518, + "loss": 2.6893, + "step": 3039 + }, + { + "epoch": 0.24533935921233152, + "grad_norm": 0.824023425579071, + "learning_rate": 0.0001894209209086114, + "loss": 2.7188, + "step": 3040 + }, + { + "epoch": 0.24542006294891452, + "grad_norm": 0.740466833114624, + "learning_rate": 0.00018941385280070455, + "loss": 2.674, + "step": 3041 + }, + { + "epoch": 0.24550076668549753, + "grad_norm": 0.8543577194213867, + "learning_rate": 0.00018940678246437073, + "loss": 2.7423, + "step": 3042 + }, + { + "epoch": 0.24558147042208053, + "grad_norm": 0.7059324979782104, + "learning_rate": 0.0001893997098997862, + "loss": 2.6669, + "step": 3043 + }, + { + "epoch": 0.24566217415866354, + "grad_norm": 0.7739956974983215, + "learning_rate": 0.00018939263510712721, + "loss": 2.7118, + "step": 3044 + }, + { + "epoch": 0.24574287789524654, + "grad_norm": 0.7701205611228943, + "learning_rate": 0.00018938555808657007, + "loss": 2.7653, + "step": 3045 + }, + { + "epoch": 0.24582358163182955, + "grad_norm": 0.7243000864982605, + "learning_rate": 0.00018937847883829115, + "loss": 2.6789, + "step": 3046 + }, + { + "epoch": 0.24590428536841255, + "grad_norm": 0.7645598649978638, + "learning_rate": 0.00018937139736246693, + "loss": 
2.7108, + "step": 3047 + }, + { + "epoch": 0.24598498910499556, + "grad_norm": 0.7544745802879333, + "learning_rate": 0.00018936431365927385, + "loss": 2.6958, + "step": 3048 + }, + { + "epoch": 0.24606569284157856, + "grad_norm": 0.709282398223877, + "learning_rate": 0.00018935722772888848, + "loss": 2.6728, + "step": 3049 + }, + { + "epoch": 0.24614639657816156, + "grad_norm": 0.7524243593215942, + "learning_rate": 0.00018935013957148742, + "loss": 2.7283, + "step": 3050 + }, + { + "epoch": 0.24622710031474457, + "grad_norm": 0.7959655523300171, + "learning_rate": 0.0001893430491872473, + "loss": 2.7384, + "step": 3051 + }, + { + "epoch": 0.24630780405132757, + "grad_norm": 0.7252553105354309, + "learning_rate": 0.00018933595657634486, + "loss": 2.7226, + "step": 3052 + }, + { + "epoch": 0.24638850778791058, + "grad_norm": 0.7387316226959229, + "learning_rate": 0.00018932886173895686, + "loss": 2.7546, + "step": 3053 + }, + { + "epoch": 0.24646921152449358, + "grad_norm": 0.804856538772583, + "learning_rate": 0.0001893217646752601, + "loss": 2.7321, + "step": 3054 + }, + { + "epoch": 0.24654991526107659, + "grad_norm": 0.6929069757461548, + "learning_rate": 0.0001893146653854315, + "loss": 2.6735, + "step": 3055 + }, + { + "epoch": 0.2466306189976596, + "grad_norm": 0.7076159715652466, + "learning_rate": 0.00018930756386964794, + "loss": 2.7368, + "step": 3056 + }, + { + "epoch": 0.2467113227342426, + "grad_norm": 0.7522851228713989, + "learning_rate": 0.00018930046012808648, + "loss": 2.7448, + "step": 3057 + }, + { + "epoch": 0.2467920264708256, + "grad_norm": 0.8347200155258179, + "learning_rate": 0.00018929335416092408, + "loss": 2.6837, + "step": 3058 + }, + { + "epoch": 0.2468727302074086, + "grad_norm": 0.737503707408905, + "learning_rate": 0.00018928624596833786, + "loss": 2.693, + "step": 3059 + }, + { + "epoch": 0.2469534339439916, + "grad_norm": 0.7836787104606628, + "learning_rate": 0.00018927913555050503, + "loss": 2.7335, + "step": 3060 + }, + { + 
"epoch": 0.2470341376805746, + "grad_norm": 0.7823840975761414, + "learning_rate": 0.00018927202290760278, + "loss": 2.6736, + "step": 3061 + }, + { + "epoch": 0.24711484141715762, + "grad_norm": 0.7894529700279236, + "learning_rate": 0.00018926490803980833, + "loss": 2.7112, + "step": 3062 + }, + { + "epoch": 0.24719554515374062, + "grad_norm": 0.8289024829864502, + "learning_rate": 0.000189257790947299, + "loss": 2.7667, + "step": 3063 + }, + { + "epoch": 0.24727624889032362, + "grad_norm": 0.70560222864151, + "learning_rate": 0.00018925067163025227, + "loss": 2.6946, + "step": 3064 + }, + { + "epoch": 0.24735695262690663, + "grad_norm": 0.6954196095466614, + "learning_rate": 0.00018924355008884548, + "loss": 2.7237, + "step": 3065 + }, + { + "epoch": 0.24743765636348963, + "grad_norm": 0.7975121736526489, + "learning_rate": 0.0001892364263232561, + "loss": 2.6392, + "step": 3066 + }, + { + "epoch": 0.24751836010007264, + "grad_norm": 0.777350902557373, + "learning_rate": 0.00018922930033366174, + "loss": 2.7284, + "step": 3067 + }, + { + "epoch": 0.24759906383665564, + "grad_norm": 0.738240659236908, + "learning_rate": 0.00018922217212023995, + "loss": 2.6884, + "step": 3068 + }, + { + "epoch": 0.24767976757323865, + "grad_norm": 0.8077268600463867, + "learning_rate": 0.0001892150416831684, + "loss": 2.7205, + "step": 3069 + }, + { + "epoch": 0.24776047130982165, + "grad_norm": 0.8108188509941101, + "learning_rate": 0.00018920790902262483, + "loss": 2.7592, + "step": 3070 + }, + { + "epoch": 0.24784117504640465, + "grad_norm": 0.7842642664909363, + "learning_rate": 0.00018920077413878695, + "loss": 2.7474, + "step": 3071 + }, + { + "epoch": 0.24792187878298766, + "grad_norm": 0.7644543051719666, + "learning_rate": 0.0001891936370318326, + "loss": 2.7179, + "step": 3072 + }, + { + "epoch": 0.24800258251957066, + "grad_norm": 0.7761854529380798, + "learning_rate": 0.00018918649770193965, + "loss": 2.71, + "step": 3073 + }, + { + "epoch": 0.24808328625615367, + 
"grad_norm": 0.7724074125289917, + "learning_rate": 0.00018917935614928607, + "loss": 2.7359, + "step": 3074 + }, + { + "epoch": 0.24816398999273667, + "grad_norm": 0.7360609173774719, + "learning_rate": 0.0001891722123740498, + "loss": 2.7342, + "step": 3075 + }, + { + "epoch": 0.24824469372931968, + "grad_norm": 0.757561206817627, + "learning_rate": 0.00018916506637640894, + "loss": 2.7647, + "step": 3076 + }, + { + "epoch": 0.24832539746590268, + "grad_norm": 0.7180947065353394, + "learning_rate": 0.00018915791815654148, + "loss": 2.6771, + "step": 3077 + }, + { + "epoch": 0.24840610120248569, + "grad_norm": 0.7219653129577637, + "learning_rate": 0.0001891507677146257, + "loss": 2.7772, + "step": 3078 + }, + { + "epoch": 0.2484868049390687, + "grad_norm": 0.749113917350769, + "learning_rate": 0.0001891436150508397, + "loss": 2.6996, + "step": 3079 + }, + { + "epoch": 0.2485675086756517, + "grad_norm": 0.766180157661438, + "learning_rate": 0.00018913646016536183, + "loss": 2.7896, + "step": 3080 + }, + { + "epoch": 0.2486482124122347, + "grad_norm": 0.7672411799430847, + "learning_rate": 0.00018912930305837032, + "loss": 2.7307, + "step": 3081 + }, + { + "epoch": 0.2487289161488177, + "grad_norm": 0.7639018297195435, + "learning_rate": 0.00018912214373004364, + "loss": 2.6569, + "step": 3082 + }, + { + "epoch": 0.2488096198854007, + "grad_norm": 0.8935483694076538, + "learning_rate": 0.00018911498218056013, + "loss": 2.6897, + "step": 3083 + }, + { + "epoch": 0.2488903236219837, + "grad_norm": 0.8506368398666382, + "learning_rate": 0.00018910781841009836, + "loss": 2.778, + "step": 3084 + }, + { + "epoch": 0.24897102735856672, + "grad_norm": 0.8026999235153198, + "learning_rate": 0.0001891006524188368, + "loss": 2.7799, + "step": 3085 + }, + { + "epoch": 0.2490517310951497, + "grad_norm": 0.784637987613678, + "learning_rate": 0.00018909348420695406, + "loss": 2.673, + "step": 3086 + }, + { + "epoch": 0.2491324348317327, + "grad_norm": 0.8949337601661682, + 
"learning_rate": 0.00018908631377462882, + "loss": 2.7726, + "step": 3087 + }, + { + "epoch": 0.2492131385683157, + "grad_norm": 0.73841792345047, + "learning_rate": 0.00018907914112203974, + "loss": 2.7403, + "step": 3088 + }, + { + "epoch": 0.2492938423048987, + "grad_norm": 0.7305924296379089, + "learning_rate": 0.00018907196624936564, + "loss": 2.6713, + "step": 3089 + }, + { + "epoch": 0.2493745460414817, + "grad_norm": 0.7707394361495972, + "learning_rate": 0.0001890647891567853, + "loss": 2.7306, + "step": 3090 + }, + { + "epoch": 0.2494552497780647, + "grad_norm": 0.8691473603248596, + "learning_rate": 0.00018905760984447759, + "loss": 2.6775, + "step": 3091 + }, + { + "epoch": 0.24953595351464772, + "grad_norm": 0.7466028332710266, + "learning_rate": 0.00018905042831262144, + "loss": 2.7196, + "step": 3092 + }, + { + "epoch": 0.24961665725123072, + "grad_norm": 0.7785150408744812, + "learning_rate": 0.0001890432445613958, + "loss": 2.7099, + "step": 3093 + }, + { + "epoch": 0.24969736098781373, + "grad_norm": 0.7775028347969055, + "learning_rate": 0.0001890360585909798, + "loss": 2.698, + "step": 3094 + }, + { + "epoch": 0.24977806472439673, + "grad_norm": 0.829257071018219, + "learning_rate": 0.00018902887040155245, + "loss": 2.711, + "step": 3095 + }, + { + "epoch": 0.24985876846097974, + "grad_norm": 0.8492234945297241, + "learning_rate": 0.00018902167999329295, + "loss": 2.7164, + "step": 3096 + }, + { + "epoch": 0.24993947219756274, + "grad_norm": 0.7332174777984619, + "learning_rate": 0.00018901448736638045, + "loss": 2.6925, + "step": 3097 + }, + { + "epoch": 0.25002017593414577, + "grad_norm": 0.7494251728057861, + "learning_rate": 0.00018900729252099426, + "loss": 2.6899, + "step": 3098 + }, + { + "epoch": 0.25010087967072875, + "grad_norm": 0.7760747075080872, + "learning_rate": 0.00018900009545731367, + "loss": 2.6626, + "step": 3099 + }, + { + "epoch": 0.2501815834073118, + "grad_norm": 0.7270001173019409, + "learning_rate": 
0.00018899289617551804, + "loss": 2.7338, + "step": 3100 + }, + { + "epoch": 0.25026228714389476, + "grad_norm": 0.7832693457603455, + "learning_rate": 0.0001889856946757868, + "loss": 2.6668, + "step": 3101 + }, + { + "epoch": 0.2503429908804778, + "grad_norm": 0.8833239674568176, + "learning_rate": 0.00018897849095829945, + "loss": 2.7219, + "step": 3102 + }, + { + "epoch": 0.25042369461706077, + "grad_norm": 0.8144814372062683, + "learning_rate": 0.0001889712850232355, + "loss": 2.724, + "step": 3103 + }, + { + "epoch": 0.2505043983536438, + "grad_norm": 0.9466180801391602, + "learning_rate": 0.0001889640768707746, + "loss": 2.7499, + "step": 3104 + }, + { + "epoch": 0.2505851020902268, + "grad_norm": 0.926292359828949, + "learning_rate": 0.00018895686650109632, + "loss": 2.7391, + "step": 3105 + }, + { + "epoch": 0.2506658058268098, + "grad_norm": 0.8214002251625061, + "learning_rate": 0.00018894965391438038, + "loss": 2.7546, + "step": 3106 + }, + { + "epoch": 0.2507465095633928, + "grad_norm": 0.9021030068397522, + "learning_rate": 0.00018894243911080655, + "loss": 2.7188, + "step": 3107 + }, + { + "epoch": 0.2508272132999758, + "grad_norm": 0.778366208076477, + "learning_rate": 0.00018893522209055465, + "loss": 2.7852, + "step": 3108 + }, + { + "epoch": 0.2509079170365588, + "grad_norm": 0.8780209422111511, + "learning_rate": 0.00018892800285380456, + "loss": 2.7344, + "step": 3109 + }, + { + "epoch": 0.2509886207731418, + "grad_norm": 0.7581839561462402, + "learning_rate": 0.00018892078140073614, + "loss": 2.6697, + "step": 3110 + }, + { + "epoch": 0.2510693245097248, + "grad_norm": 0.7818635702133179, + "learning_rate": 0.00018891355773152944, + "loss": 2.6969, + "step": 3111 + }, + { + "epoch": 0.2511500282463078, + "grad_norm": 0.7528424859046936, + "learning_rate": 0.0001889063318463644, + "loss": 2.7359, + "step": 3112 + }, + { + "epoch": 0.2512307319828908, + "grad_norm": 0.8274288773536682, + "learning_rate": 0.0001888991037454212, + "loss": 2.7124, 
+ "step": 3113 + }, + { + "epoch": 0.2513114357194738, + "grad_norm": 0.7186813354492188, + "learning_rate": 0.00018889187342888, + "loss": 2.7037, + "step": 3114 + }, + { + "epoch": 0.2513921394560568, + "grad_norm": 0.7458071112632751, + "learning_rate": 0.00018888464089692088, + "loss": 2.7178, + "step": 3115 + }, + { + "epoch": 0.2514728431926398, + "grad_norm": 0.7814257740974426, + "learning_rate": 0.00018887740614972418, + "loss": 2.7554, + "step": 3116 + }, + { + "epoch": 0.2515535469292228, + "grad_norm": 0.7706831097602844, + "learning_rate": 0.0001888701691874702, + "loss": 2.7441, + "step": 3117 + }, + { + "epoch": 0.2516342506658058, + "grad_norm": 0.8177775740623474, + "learning_rate": 0.0001888629300103393, + "loss": 2.7257, + "step": 3118 + }, + { + "epoch": 0.25171495440238884, + "grad_norm": 0.791097104549408, + "learning_rate": 0.00018885568861851188, + "loss": 2.6937, + "step": 3119 + }, + { + "epoch": 0.2517956581389718, + "grad_norm": 0.7521430850028992, + "learning_rate": 0.00018884844501216845, + "loss": 2.7723, + "step": 3120 + }, + { + "epoch": 0.25187636187555484, + "grad_norm": 0.8119359016418457, + "learning_rate": 0.00018884119919148948, + "loss": 2.7573, + "step": 3121 + }, + { + "epoch": 0.2519570656121378, + "grad_norm": 0.7579830288887024, + "learning_rate": 0.00018883395115665562, + "loss": 2.6943, + "step": 3122 + }, + { + "epoch": 0.25203776934872085, + "grad_norm": 0.7718791365623474, + "learning_rate": 0.00018882670090784748, + "loss": 2.6911, + "step": 3123 + }, + { + "epoch": 0.25211847308530383, + "grad_norm": 0.7718087434768677, + "learning_rate": 0.00018881944844524576, + "loss": 2.7505, + "step": 3124 + }, + { + "epoch": 0.25219917682188686, + "grad_norm": 0.7696875333786011, + "learning_rate": 0.0001888121937690312, + "loss": 2.7272, + "step": 3125 + }, + { + "epoch": 0.25227988055846984, + "grad_norm": 0.8082131743431091, + "learning_rate": 0.00018880493687938464, + "loss": 2.6677, + "step": 3126 + }, + { + "epoch": 
0.25236058429505287, + "grad_norm": 0.857224702835083, + "learning_rate": 0.00018879767777648686, + "loss": 2.7237, + "step": 3127 + }, + { + "epoch": 0.25244128803163585, + "grad_norm": 0.8135749697685242, + "learning_rate": 0.00018879041646051886, + "loss": 2.7298, + "step": 3128 + }, + { + "epoch": 0.2525219917682189, + "grad_norm": 0.7772457003593445, + "learning_rate": 0.0001887831529316616, + "loss": 2.7723, + "step": 3129 + }, + { + "epoch": 0.25260269550480186, + "grad_norm": 0.795555055141449, + "learning_rate": 0.00018877588719009607, + "loss": 2.7207, + "step": 3130 + }, + { + "epoch": 0.2526833992413849, + "grad_norm": 0.7677939534187317, + "learning_rate": 0.00018876861923600337, + "loss": 2.6649, + "step": 3131 + }, + { + "epoch": 0.25276410297796786, + "grad_norm": 0.7706151008605957, + "learning_rate": 0.00018876134906956464, + "loss": 2.7154, + "step": 3132 + }, + { + "epoch": 0.2528448067145509, + "grad_norm": 0.8230584859848022, + "learning_rate": 0.00018875407669096105, + "loss": 2.7871, + "step": 3133 + }, + { + "epoch": 0.2529255104511339, + "grad_norm": 0.7037158608436584, + "learning_rate": 0.0001887468021003739, + "loss": 2.669, + "step": 3134 + }, + { + "epoch": 0.2530062141877169, + "grad_norm": 0.8485400080680847, + "learning_rate": 0.00018873952529798441, + "loss": 2.7517, + "step": 3135 + }, + { + "epoch": 0.2530869179242999, + "grad_norm": 0.7803399562835693, + "learning_rate": 0.000188732246283974, + "loss": 2.6987, + "step": 3136 + }, + { + "epoch": 0.2531676216608829, + "grad_norm": 0.7884016633033752, + "learning_rate": 0.0001887249650585241, + "loss": 2.7348, + "step": 3137 + }, + { + "epoch": 0.2532483253974659, + "grad_norm": 0.7794530987739563, + "learning_rate": 0.0001887176816218161, + "loss": 2.6934, + "step": 3138 + }, + { + "epoch": 0.2533290291340489, + "grad_norm": 0.7905173301696777, + "learning_rate": 0.00018871039597403156, + "loss": 2.714, + "step": 3139 + }, + { + "epoch": 0.2534097328706319, + "grad_norm": 
0.7857949137687683, + "learning_rate": 0.0001887031081153521, + "loss": 2.7591, + "step": 3140 + }, + { + "epoch": 0.25349043660721493, + "grad_norm": 0.8602419495582581, + "learning_rate": 0.00018869581804595927, + "loss": 2.7819, + "step": 3141 + }, + { + "epoch": 0.2535711403437979, + "grad_norm": 0.7845202088356018, + "learning_rate": 0.00018868852576603483, + "loss": 2.6796, + "step": 3142 + }, + { + "epoch": 0.25365184408038094, + "grad_norm": 0.7600612640380859, + "learning_rate": 0.00018868123127576048, + "loss": 2.6785, + "step": 3143 + }, + { + "epoch": 0.2537325478169639, + "grad_norm": 0.7731521725654602, + "learning_rate": 0.000188673934575318, + "loss": 2.7435, + "step": 3144 + }, + { + "epoch": 0.25381325155354695, + "grad_norm": 0.8214225172996521, + "learning_rate": 0.0001886666356648893, + "loss": 2.7264, + "step": 3145 + }, + { + "epoch": 0.2538939552901299, + "grad_norm": 0.7623010277748108, + "learning_rate": 0.00018865933454465628, + "loss": 2.73, + "step": 3146 + }, + { + "epoch": 0.25397465902671296, + "grad_norm": 0.7864633798599243, + "learning_rate": 0.00018865203121480088, + "loss": 2.7654, + "step": 3147 + }, + { + "epoch": 0.25405536276329593, + "grad_norm": 0.7654051780700684, + "learning_rate": 0.0001886447256755051, + "loss": 2.7171, + "step": 3148 + }, + { + "epoch": 0.25413606649987897, + "grad_norm": 0.8045486211776733, + "learning_rate": 0.0001886374179269511, + "loss": 2.7385, + "step": 3149 + }, + { + "epoch": 0.25421677023646194, + "grad_norm": 0.8504971861839294, + "learning_rate": 0.0001886301079693209, + "loss": 2.6719, + "step": 3150 + }, + { + "epoch": 0.254297473973045, + "grad_norm": 0.771538496017456, + "learning_rate": 0.0001886227958027967, + "loss": 2.6707, + "step": 3151 + }, + { + "epoch": 0.25437817770962795, + "grad_norm": 0.8472220301628113, + "learning_rate": 0.0001886154814275608, + "loss": 2.7201, + "step": 3152 + }, + { + "epoch": 0.254458881446211, + "grad_norm": 0.7639158368110657, + "learning_rate": 
0.00018860816484379545, + "loss": 2.76, + "step": 3153 + }, + { + "epoch": 0.25453958518279396, + "grad_norm": 0.8042064905166626, + "learning_rate": 0.000188600846051683, + "loss": 2.6862, + "step": 3154 + }, + { + "epoch": 0.254620288919377, + "grad_norm": 0.7481087446212769, + "learning_rate": 0.0001885935250514059, + "loss": 2.7394, + "step": 3155 + }, + { + "epoch": 0.25470099265595997, + "grad_norm": 0.7826097011566162, + "learning_rate": 0.00018858620184314653, + "loss": 2.596, + "step": 3156 + }, + { + "epoch": 0.254781696392543, + "grad_norm": 0.7477610111236572, + "learning_rate": 0.00018857887642708743, + "loss": 2.7385, + "step": 3157 + }, + { + "epoch": 0.254862400129126, + "grad_norm": 0.7347466945648193, + "learning_rate": 0.00018857154880341122, + "loss": 2.722, + "step": 3158 + }, + { + "epoch": 0.254943103865709, + "grad_norm": 0.7853806018829346, + "learning_rate": 0.00018856421897230048, + "loss": 2.7675, + "step": 3159 + }, + { + "epoch": 0.255023807602292, + "grad_norm": 0.7497034072875977, + "learning_rate": 0.0001885568869339379, + "loss": 2.6882, + "step": 3160 + }, + { + "epoch": 0.255104511338875, + "grad_norm": 0.7932263612747192, + "learning_rate": 0.0001885495526885062, + "loss": 2.7938, + "step": 3161 + }, + { + "epoch": 0.255185215075458, + "grad_norm": 0.7776823043823242, + "learning_rate": 0.00018854221623618815, + "loss": 2.6955, + "step": 3162 + }, + { + "epoch": 0.25526591881204097, + "grad_norm": 0.7564878463745117, + "learning_rate": 0.00018853487757716666, + "loss": 2.7644, + "step": 3163 + }, + { + "epoch": 0.255346622548624, + "grad_norm": 0.836270809173584, + "learning_rate": 0.00018852753671162454, + "loss": 2.7119, + "step": 3164 + }, + { + "epoch": 0.255427326285207, + "grad_norm": 0.7540388703346252, + "learning_rate": 0.00018852019363974485, + "loss": 2.797, + "step": 3165 + }, + { + "epoch": 0.25550803002179, + "grad_norm": 0.7943860292434692, + "learning_rate": 0.0001885128483617105, + "loss": 2.7973, + "step": 3166 
+ }, + { + "epoch": 0.255588733758373, + "grad_norm": 0.7743831276893616, + "learning_rate": 0.00018850550087770463, + "loss": 2.7403, + "step": 3167 + }, + { + "epoch": 0.255669437494956, + "grad_norm": 0.7593801021575928, + "learning_rate": 0.00018849815118791028, + "loss": 2.7203, + "step": 3168 + }, + { + "epoch": 0.255750141231539, + "grad_norm": 0.7663586139678955, + "learning_rate": 0.00018849079929251068, + "loss": 2.7481, + "step": 3169 + }, + { + "epoch": 0.25583084496812203, + "grad_norm": 0.7218170166015625, + "learning_rate": 0.00018848344519168905, + "loss": 2.6698, + "step": 3170 + }, + { + "epoch": 0.255911548704705, + "grad_norm": 0.8374441266059875, + "learning_rate": 0.00018847608888562868, + "loss": 2.8121, + "step": 3171 + }, + { + "epoch": 0.25599225244128804, + "grad_norm": 0.7488373517990112, + "learning_rate": 0.00018846873037451286, + "loss": 2.6871, + "step": 3172 + }, + { + "epoch": 0.256072956177871, + "grad_norm": 0.7513325810432434, + "learning_rate": 0.00018846136965852505, + "loss": 2.6924, + "step": 3173 + }, + { + "epoch": 0.25615365991445405, + "grad_norm": 0.7467690706253052, + "learning_rate": 0.00018845400673784865, + "loss": 2.714, + "step": 3174 + }, + { + "epoch": 0.256234363651037, + "grad_norm": 0.7717954516410828, + "learning_rate": 0.0001884466416126672, + "loss": 2.6679, + "step": 3175 + }, + { + "epoch": 0.25631506738762005, + "grad_norm": 0.7086547613143921, + "learning_rate": 0.0001884392742831642, + "loss": 2.7046, + "step": 3176 + }, + { + "epoch": 0.25639577112420303, + "grad_norm": 0.7024885416030884, + "learning_rate": 0.00018843190474952337, + "loss": 2.6724, + "step": 3177 + }, + { + "epoch": 0.25647647486078606, + "grad_norm": 0.8376390933990479, + "learning_rate": 0.00018842453301192827, + "loss": 2.7818, + "step": 3178 + }, + { + "epoch": 0.25655717859736904, + "grad_norm": 0.8190221190452576, + "learning_rate": 0.00018841715907056265, + "loss": 2.7455, + "step": 3179 + }, + { + "epoch": 
0.25663788233395207, + "grad_norm": 0.8029047846794128, + "learning_rate": 0.0001884097829256103, + "loss": 2.7102, + "step": 3180 + }, + { + "epoch": 0.25671858607053505, + "grad_norm": 0.7467923760414124, + "learning_rate": 0.00018840240457725508, + "loss": 2.7051, + "step": 3181 + }, + { + "epoch": 0.2567992898071181, + "grad_norm": 0.7850394248962402, + "learning_rate": 0.00018839502402568086, + "loss": 2.6826, + "step": 3182 + }, + { + "epoch": 0.25687999354370106, + "grad_norm": 0.7144927978515625, + "learning_rate": 0.00018838764127107155, + "loss": 2.6694, + "step": 3183 + }, + { + "epoch": 0.2569606972802841, + "grad_norm": 0.7580311894416809, + "learning_rate": 0.0001883802563136112, + "loss": 2.7191, + "step": 3184 + }, + { + "epoch": 0.25704140101686707, + "grad_norm": 0.7366482615470886, + "learning_rate": 0.0001883728691534838, + "loss": 2.7175, + "step": 3185 + }, + { + "epoch": 0.2571221047534501, + "grad_norm": 0.6961715817451477, + "learning_rate": 0.0001883654797908735, + "loss": 2.7705, + "step": 3186 + }, + { + "epoch": 0.2572028084900331, + "grad_norm": 0.7473716735839844, + "learning_rate": 0.00018835808822596445, + "loss": 2.707, + "step": 3187 + }, + { + "epoch": 0.2572835122266161, + "grad_norm": 0.8376151919364929, + "learning_rate": 0.00018835069445894087, + "loss": 2.7424, + "step": 3188 + }, + { + "epoch": 0.2573642159631991, + "grad_norm": 0.7950237393379211, + "learning_rate": 0.00018834329848998706, + "loss": 2.7593, + "step": 3189 + }, + { + "epoch": 0.2574449196997821, + "grad_norm": 0.7637122869491577, + "learning_rate": 0.0001883359003192873, + "loss": 2.6708, + "step": 3190 + }, + { + "epoch": 0.2575256234363651, + "grad_norm": 0.709516704082489, + "learning_rate": 0.00018832849994702597, + "loss": 2.6988, + "step": 3191 + }, + { + "epoch": 0.2576063271729481, + "grad_norm": 0.7465435266494751, + "learning_rate": 0.00018832109737338757, + "loss": 2.7183, + "step": 3192 + }, + { + "epoch": 0.2576870309095311, + "grad_norm": 
0.7619186043739319, + "learning_rate": 0.00018831369259855653, + "loss": 2.6833, + "step": 3193 + }, + { + "epoch": 0.25776773464611413, + "grad_norm": 0.7501961588859558, + "learning_rate": 0.0001883062856227174, + "loss": 2.725, + "step": 3194 + }, + { + "epoch": 0.2578484383826971, + "grad_norm": 0.7720133066177368, + "learning_rate": 0.00018829887644605483, + "loss": 2.7988, + "step": 3195 + }, + { + "epoch": 0.25792914211928014, + "grad_norm": 0.7253942489624023, + "learning_rate": 0.00018829146506875344, + "loss": 2.6999, + "step": 3196 + }, + { + "epoch": 0.2580098458558631, + "grad_norm": 0.7759599685668945, + "learning_rate": 0.00018828405149099792, + "loss": 2.6831, + "step": 3197 + }, + { + "epoch": 0.25809054959244615, + "grad_norm": 0.7250547409057617, + "learning_rate": 0.0001882766357129731, + "loss": 2.6742, + "step": 3198 + }, + { + "epoch": 0.2581712533290291, + "grad_norm": 0.7565183043479919, + "learning_rate": 0.00018826921773486372, + "loss": 2.6777, + "step": 3199 + }, + { + "epoch": 0.25825195706561216, + "grad_norm": 0.7183675169944763, + "learning_rate": 0.0001882617975568547, + "loss": 2.6743, + "step": 3200 + }, + { + "epoch": 0.25833266080219514, + "grad_norm": 0.7021663784980774, + "learning_rate": 0.00018825437517913098, + "loss": 2.727, + "step": 3201 + }, + { + "epoch": 0.25841336453877817, + "grad_norm": 0.7406932711601257, + "learning_rate": 0.00018824695060187753, + "loss": 2.7448, + "step": 3202 + }, + { + "epoch": 0.25849406827536114, + "grad_norm": 0.7766773104667664, + "learning_rate": 0.0001882395238252794, + "loss": 2.69, + "step": 3203 + }, + { + "epoch": 0.2585747720119442, + "grad_norm": 0.7483372688293457, + "learning_rate": 0.00018823209484952164, + "loss": 2.6611, + "step": 3204 + }, + { + "epoch": 0.25865547574852715, + "grad_norm": 0.781831681728363, + "learning_rate": 0.0001882246636747895, + "loss": 2.7292, + "step": 3205 + }, + { + "epoch": 0.2587361794851102, + "grad_norm": 0.7188203930854797, + "learning_rate": 
0.00018821723030126806, + "loss": 2.718, + "step": 3206 + }, + { + "epoch": 0.25881688322169316, + "grad_norm": 0.7332054972648621, + "learning_rate": 0.00018820979472914263, + "loss": 2.6492, + "step": 3207 + }, + { + "epoch": 0.2588975869582762, + "grad_norm": 0.7044041156768799, + "learning_rate": 0.00018820235695859858, + "loss": 2.7047, + "step": 3208 + }, + { + "epoch": 0.25897829069485917, + "grad_norm": 0.8651862740516663, + "learning_rate": 0.00018819491698982121, + "loss": 2.6301, + "step": 3209 + }, + { + "epoch": 0.2590589944314422, + "grad_norm": 0.8118106126785278, + "learning_rate": 0.00018818747482299598, + "loss": 2.6522, + "step": 3210 + }, + { + "epoch": 0.2591396981680252, + "grad_norm": 0.7239218354225159, + "learning_rate": 0.00018818003045830832, + "loss": 2.7058, + "step": 3211 + }, + { + "epoch": 0.2592204019046082, + "grad_norm": 0.8557687997817993, + "learning_rate": 0.00018817258389594382, + "loss": 2.7125, + "step": 3212 + }, + { + "epoch": 0.2593011056411912, + "grad_norm": 0.7685148119926453, + "learning_rate": 0.00018816513513608801, + "loss": 2.7516, + "step": 3213 + }, + { + "epoch": 0.25938180937777416, + "grad_norm": 0.7497698664665222, + "learning_rate": 0.00018815768417892664, + "loss": 2.6536, + "step": 3214 + }, + { + "epoch": 0.2594625131143572, + "grad_norm": 0.7041923403739929, + "learning_rate": 0.0001881502310246453, + "loss": 2.7031, + "step": 3215 + }, + { + "epoch": 0.2595432168509402, + "grad_norm": 0.7815428376197815, + "learning_rate": 0.00018814277567342976, + "loss": 2.7291, + "step": 3216 + }, + { + "epoch": 0.2596239205875232, + "grad_norm": 0.7285065650939941, + "learning_rate": 0.00018813531812546583, + "loss": 2.7712, + "step": 3217 + }, + { + "epoch": 0.2597046243241062, + "grad_norm": 0.7606547474861145, + "learning_rate": 0.0001881278583809394, + "loss": 2.6714, + "step": 3218 + }, + { + "epoch": 0.2597853280606892, + "grad_norm": 0.7166680097579956, + "learning_rate": 0.00018812039644003638, + "loss": 
2.7147, + "step": 3219 + }, + { + "epoch": 0.2598660317972722, + "grad_norm": 0.8977978229522705, + "learning_rate": 0.0001881129323029427, + "loss": 2.7743, + "step": 3220 + }, + { + "epoch": 0.2599467355338552, + "grad_norm": 0.7447277307510376, + "learning_rate": 0.00018810546596984446, + "loss": 2.7049, + "step": 3221 + }, + { + "epoch": 0.2600274392704382, + "grad_norm": 0.7343515157699585, + "learning_rate": 0.00018809799744092768, + "loss": 2.6999, + "step": 3222 + }, + { + "epoch": 0.26010814300702123, + "grad_norm": 0.7303341627120972, + "learning_rate": 0.00018809052671637852, + "loss": 2.7222, + "step": 3223 + }, + { + "epoch": 0.2601888467436042, + "grad_norm": 0.7412950396537781, + "learning_rate": 0.00018808305379638314, + "loss": 2.6957, + "step": 3224 + }, + { + "epoch": 0.26026955048018724, + "grad_norm": 0.7495343089103699, + "learning_rate": 0.00018807557868112781, + "loss": 2.7123, + "step": 3225 + }, + { + "epoch": 0.2603502542167702, + "grad_norm": 0.8137524724006653, + "learning_rate": 0.00018806810137079886, + "loss": 2.7191, + "step": 3226 + }, + { + "epoch": 0.26043095795335325, + "grad_norm": 0.786374568939209, + "learning_rate": 0.0001880606218655826, + "loss": 2.7237, + "step": 3227 + }, + { + "epoch": 0.2605116616899362, + "grad_norm": 0.9969484806060791, + "learning_rate": 0.00018805314016566543, + "loss": 2.7603, + "step": 3228 + }, + { + "epoch": 0.26059236542651926, + "grad_norm": 0.8132432103157043, + "learning_rate": 0.00018804565627123386, + "loss": 2.6807, + "step": 3229 + }, + { + "epoch": 0.26067306916310223, + "grad_norm": 0.7604904174804688, + "learning_rate": 0.00018803817018247436, + "loss": 2.7105, + "step": 3230 + }, + { + "epoch": 0.26075377289968527, + "grad_norm": 0.743505597114563, + "learning_rate": 0.00018803068189957354, + "loss": 2.7152, + "step": 3231 + }, + { + "epoch": 0.26083447663626824, + "grad_norm": 0.7780006527900696, + "learning_rate": 0.000188023191422718, + "loss": 2.7043, + "step": 3232 + }, + { + 
"epoch": 0.2609151803728513, + "grad_norm": 0.7683089375495911, + "learning_rate": 0.00018801569875209447, + "loss": 2.7033, + "step": 3233 + }, + { + "epoch": 0.26099588410943425, + "grad_norm": 0.7540118098258972, + "learning_rate": 0.0001880082038878896, + "loss": 2.7121, + "step": 3234 + }, + { + "epoch": 0.2610765878460173, + "grad_norm": 0.7509592771530151, + "learning_rate": 0.00018800070683029025, + "loss": 2.6575, + "step": 3235 + }, + { + "epoch": 0.26115729158260026, + "grad_norm": 0.8015461564064026, + "learning_rate": 0.00018799320757948327, + "loss": 2.6956, + "step": 3236 + }, + { + "epoch": 0.2612379953191833, + "grad_norm": 0.7586383819580078, + "learning_rate": 0.00018798570613565553, + "loss": 2.6719, + "step": 3237 + }, + { + "epoch": 0.26131869905576627, + "grad_norm": 0.7833155989646912, + "learning_rate": 0.000187978202498994, + "loss": 2.7317, + "step": 3238 + }, + { + "epoch": 0.2613994027923493, + "grad_norm": 0.7976018786430359, + "learning_rate": 0.00018797069666968565, + "loss": 2.7514, + "step": 3239 + }, + { + "epoch": 0.2614801065289323, + "grad_norm": 0.8388968706130981, + "learning_rate": 0.00018796318864791763, + "loss": 2.6845, + "step": 3240 + }, + { + "epoch": 0.2615608102655153, + "grad_norm": 0.8082842230796814, + "learning_rate": 0.00018795567843387701, + "loss": 2.7204, + "step": 3241 + }, + { + "epoch": 0.2616415140020983, + "grad_norm": 0.7514800429344177, + "learning_rate": 0.00018794816602775094, + "loss": 2.7117, + "step": 3242 + }, + { + "epoch": 0.2617222177386813, + "grad_norm": 0.8676564693450928, + "learning_rate": 0.00018794065142972664, + "loss": 2.6596, + "step": 3243 + }, + { + "epoch": 0.2618029214752643, + "grad_norm": 0.7449865341186523, + "learning_rate": 0.0001879331346399915, + "loss": 2.7089, + "step": 3244 + }, + { + "epoch": 0.2618836252118473, + "grad_norm": 0.8020811676979065, + "learning_rate": 0.00018792561565873274, + "loss": 2.7293, + "step": 3245 + }, + { + "epoch": 0.2619643289484303, + 
"grad_norm": 0.7961642146110535, + "learning_rate": 0.00018791809448613783, + "loss": 2.7269, + "step": 3246 + }, + { + "epoch": 0.26204503268501333, + "grad_norm": 0.7842351198196411, + "learning_rate": 0.00018791057112239415, + "loss": 2.6773, + "step": 3247 + }, + { + "epoch": 0.2621257364215963, + "grad_norm": 0.7494246959686279, + "learning_rate": 0.00018790304556768925, + "loss": 2.7317, + "step": 3248 + }, + { + "epoch": 0.26220644015817934, + "grad_norm": 0.7822836637496948, + "learning_rate": 0.0001878955178222107, + "loss": 2.6834, + "step": 3249 + }, + { + "epoch": 0.2622871438947623, + "grad_norm": 0.8432494401931763, + "learning_rate": 0.00018788798788614607, + "loss": 2.7048, + "step": 3250 + }, + { + "epoch": 0.26236784763134535, + "grad_norm": 0.9599446058273315, + "learning_rate": 0.000187880455759683, + "loss": 2.7793, + "step": 3251 + }, + { + "epoch": 0.26244855136792833, + "grad_norm": 0.8097226023674011, + "learning_rate": 0.00018787292144300928, + "loss": 2.7177, + "step": 3252 + }, + { + "epoch": 0.26252925510451136, + "grad_norm": 0.8423499464988708, + "learning_rate": 0.00018786538493631265, + "loss": 2.7265, + "step": 3253 + }, + { + "epoch": 0.26260995884109434, + "grad_norm": 0.7388847470283508, + "learning_rate": 0.00018785784623978095, + "loss": 2.6778, + "step": 3254 + }, + { + "epoch": 0.26269066257767737, + "grad_norm": 0.766368567943573, + "learning_rate": 0.0001878503053536021, + "loss": 2.654, + "step": 3255 + }, + { + "epoch": 0.26277136631426035, + "grad_norm": 0.8181266188621521, + "learning_rate": 0.00018784276227796394, + "loss": 2.7568, + "step": 3256 + }, + { + "epoch": 0.2628520700508434, + "grad_norm": 0.8235312104225159, + "learning_rate": 0.00018783521701305452, + "loss": 2.7317, + "step": 3257 + }, + { + "epoch": 0.26293277378742635, + "grad_norm": 0.7103183269500732, + "learning_rate": 0.00018782766955906195, + "loss": 2.6919, + "step": 3258 + }, + { + "epoch": 0.2630134775240094, + "grad_norm": 0.7202538251876831, 
+ "learning_rate": 0.0001878201199161742, + "loss": 2.7179, + "step": 3259 + }, + { + "epoch": 0.26309418126059236, + "grad_norm": 0.8402286171913147, + "learning_rate": 0.00018781256808457952, + "loss": 2.7789, + "step": 3260 + }, + { + "epoch": 0.2631748849971754, + "grad_norm": 0.8136829137802124, + "learning_rate": 0.00018780501406446613, + "loss": 2.6872, + "step": 3261 + }, + { + "epoch": 0.26325558873375837, + "grad_norm": 0.8017000555992126, + "learning_rate": 0.00018779745785602224, + "loss": 2.7527, + "step": 3262 + }, + { + "epoch": 0.2633362924703414, + "grad_norm": 0.7880774140357971, + "learning_rate": 0.00018778989945943619, + "loss": 2.7348, + "step": 3263 + }, + { + "epoch": 0.2634169962069244, + "grad_norm": 0.7402438521385193, + "learning_rate": 0.00018778233887489635, + "loss": 2.6946, + "step": 3264 + }, + { + "epoch": 0.26349769994350736, + "grad_norm": 0.7450907230377197, + "learning_rate": 0.0001877747761025912, + "loss": 2.7502, + "step": 3265 + }, + { + "epoch": 0.2635784036800904, + "grad_norm": 0.7504056692123413, + "learning_rate": 0.00018776721114270917, + "loss": 2.832, + "step": 3266 + }, + { + "epoch": 0.26365910741667337, + "grad_norm": 0.7710226774215698, + "learning_rate": 0.00018775964399543878, + "loss": 2.6895, + "step": 3267 + }, + { + "epoch": 0.2637398111532564, + "grad_norm": 0.769927978515625, + "learning_rate": 0.00018775207466096867, + "loss": 2.6801, + "step": 3268 + }, + { + "epoch": 0.2638205148898394, + "grad_norm": 0.7210869193077087, + "learning_rate": 0.0001877445031394875, + "loss": 2.6966, + "step": 3269 + }, + { + "epoch": 0.2639012186264224, + "grad_norm": 0.7731119990348816, + "learning_rate": 0.00018773692943118393, + "loss": 2.6965, + "step": 3270 + }, + { + "epoch": 0.2639819223630054, + "grad_norm": 0.7539728283882141, + "learning_rate": 0.00018772935353624672, + "loss": 2.753, + "step": 3271 + }, + { + "epoch": 0.2640626260995884, + "grad_norm": 0.7993821501731873, + "learning_rate": 
0.00018772177545486472, + "loss": 2.7177, + "step": 3272 + }, + { + "epoch": 0.2641433298361714, + "grad_norm": 0.7880005240440369, + "learning_rate": 0.00018771419518722672, + "loss": 2.6854, + "step": 3273 + }, + { + "epoch": 0.2642240335727544, + "grad_norm": 0.8079188466072083, + "learning_rate": 0.0001877066127335217, + "loss": 2.734, + "step": 3274 + }, + { + "epoch": 0.2643047373093374, + "grad_norm": 0.8241428732872009, + "learning_rate": 0.00018769902809393865, + "loss": 2.7156, + "step": 3275 + }, + { + "epoch": 0.26438544104592043, + "grad_norm": 0.8007158041000366, + "learning_rate": 0.00018769144126866657, + "loss": 2.693, + "step": 3276 + }, + { + "epoch": 0.2644661447825034, + "grad_norm": 0.8360451459884644, + "learning_rate": 0.00018768385225789456, + "loss": 2.6919, + "step": 3277 + }, + { + "epoch": 0.26454684851908644, + "grad_norm": 0.7596627473831177, + "learning_rate": 0.00018767626106181172, + "loss": 2.7861, + "step": 3278 + }, + { + "epoch": 0.2646275522556694, + "grad_norm": 0.7469248175621033, + "learning_rate": 0.00018766866768060727, + "loss": 2.7305, + "step": 3279 + }, + { + "epoch": 0.26470825599225245, + "grad_norm": 0.7103936076164246, + "learning_rate": 0.00018766107211447045, + "loss": 2.6456, + "step": 3280 + }, + { + "epoch": 0.2647889597288354, + "grad_norm": 0.7595266103744507, + "learning_rate": 0.00018765347436359056, + "loss": 2.7235, + "step": 3281 + }, + { + "epoch": 0.26486966346541846, + "grad_norm": 0.786648154258728, + "learning_rate": 0.00018764587442815698, + "loss": 2.7182, + "step": 3282 + }, + { + "epoch": 0.26495036720200144, + "grad_norm": 0.7152618169784546, + "learning_rate": 0.00018763827230835908, + "loss": 2.6842, + "step": 3283 + }, + { + "epoch": 0.26503107093858447, + "grad_norm": 0.89169842004776, + "learning_rate": 0.00018763066800438636, + "loss": 2.7661, + "step": 3284 + }, + { + "epoch": 0.26511177467516744, + "grad_norm": 0.8148171305656433, + "learning_rate": 0.00018762306151642833, + "loss": 
2.7264, + "step": 3285 + }, + { + "epoch": 0.2651924784117505, + "grad_norm": 0.8070533871650696, + "learning_rate": 0.00018761545284467454, + "loss": 2.7425, + "step": 3286 + }, + { + "epoch": 0.26527318214833345, + "grad_norm": 0.8536118268966675, + "learning_rate": 0.00018760784198931465, + "loss": 2.702, + "step": 3287 + }, + { + "epoch": 0.2653538858849165, + "grad_norm": 0.7422329783439636, + "learning_rate": 0.00018760022895053833, + "loss": 2.6913, + "step": 3288 + }, + { + "epoch": 0.26543458962149946, + "grad_norm": 0.7415527105331421, + "learning_rate": 0.0001875926137285353, + "loss": 2.6472, + "step": 3289 + }, + { + "epoch": 0.2655152933580825, + "grad_norm": 0.8432031273841858, + "learning_rate": 0.00018758499632349538, + "loss": 2.7506, + "step": 3290 + }, + { + "epoch": 0.26559599709466547, + "grad_norm": 0.8113259077072144, + "learning_rate": 0.0001875773767356084, + "loss": 2.6866, + "step": 3291 + }, + { + "epoch": 0.2656767008312485, + "grad_norm": 0.7898122668266296, + "learning_rate": 0.00018756975496506424, + "loss": 2.6516, + "step": 3292 + }, + { + "epoch": 0.2657574045678315, + "grad_norm": 0.7627275586128235, + "learning_rate": 0.0001875621310120529, + "loss": 2.7065, + "step": 3293 + }, + { + "epoch": 0.2658381083044145, + "grad_norm": 0.8227291107177734, + "learning_rate": 0.00018755450487676435, + "loss": 2.7614, + "step": 3294 + }, + { + "epoch": 0.2659188120409975, + "grad_norm": 0.8162109851837158, + "learning_rate": 0.00018754687655938868, + "loss": 2.7924, + "step": 3295 + }, + { + "epoch": 0.2659995157775805, + "grad_norm": 0.7231846451759338, + "learning_rate": 0.00018753924606011602, + "loss": 2.7505, + "step": 3296 + }, + { + "epoch": 0.2660802195141635, + "grad_norm": 0.8635944724082947, + "learning_rate": 0.00018753161337913647, + "loss": 2.7505, + "step": 3297 + }, + { + "epoch": 0.26616092325074653, + "grad_norm": 0.8131890892982483, + "learning_rate": 0.00018752397851664031, + "loss": 2.7872, + "step": 3298 + }, + { + 
"epoch": 0.2662416269873295, + "grad_norm": 0.7336695790290833, + "learning_rate": 0.00018751634147281786, + "loss": 2.7517, + "step": 3299 + }, + { + "epoch": 0.26632233072391254, + "grad_norm": 0.7541754841804504, + "learning_rate": 0.00018750870224785939, + "loss": 2.7807, + "step": 3300 + }, + { + "epoch": 0.2664030344604955, + "grad_norm": 0.9347110390663147, + "learning_rate": 0.0001875010608419553, + "loss": 2.6954, + "step": 3301 + }, + { + "epoch": 0.26648373819707855, + "grad_norm": 0.7591213583946228, + "learning_rate": 0.00018749341725529604, + "loss": 2.7019, + "step": 3302 + }, + { + "epoch": 0.2665644419336615, + "grad_norm": 0.811527669429779, + "learning_rate": 0.00018748577148807211, + "loss": 2.7123, + "step": 3303 + }, + { + "epoch": 0.26664514567024455, + "grad_norm": 0.7419980764389038, + "learning_rate": 0.00018747812354047408, + "loss": 2.7383, + "step": 3304 + }, + { + "epoch": 0.26672584940682753, + "grad_norm": 0.7801192402839661, + "learning_rate": 0.00018747047341269256, + "loss": 2.7245, + "step": 3305 + }, + { + "epoch": 0.26680655314341056, + "grad_norm": 0.7392756938934326, + "learning_rate": 0.00018746282110491816, + "loss": 2.6992, + "step": 3306 + }, + { + "epoch": 0.26688725687999354, + "grad_norm": 0.7085927724838257, + "learning_rate": 0.00018745516661734161, + "loss": 2.739, + "step": 3307 + }, + { + "epoch": 0.26696796061657657, + "grad_norm": 0.7218676209449768, + "learning_rate": 0.00018744750995015373, + "loss": 2.7091, + "step": 3308 + }, + { + "epoch": 0.26704866435315955, + "grad_norm": 0.847872257232666, + "learning_rate": 0.0001874398511035453, + "loss": 2.699, + "step": 3309 + }, + { + "epoch": 0.2671293680897426, + "grad_norm": 0.8280770778656006, + "learning_rate": 0.00018743219007770723, + "loss": 2.763, + "step": 3310 + }, + { + "epoch": 0.26721007182632556, + "grad_norm": 0.7271165251731873, + "learning_rate": 0.0001874245268728304, + "loss": 2.7219, + "step": 3311 + }, + { + "epoch": 0.2672907755629086, + 
"grad_norm": 0.7342363595962524, + "learning_rate": 0.00018741686148910586, + "loss": 2.6765, + "step": 3312 + }, + { + "epoch": 0.26737147929949157, + "grad_norm": 0.7260174751281738, + "learning_rate": 0.0001874091939267246, + "loss": 2.7003, + "step": 3313 + }, + { + "epoch": 0.2674521830360746, + "grad_norm": 0.742494523525238, + "learning_rate": 0.00018740152418587775, + "loss": 2.7371, + "step": 3314 + }, + { + "epoch": 0.2675328867726576, + "grad_norm": 0.7238131165504456, + "learning_rate": 0.00018739385226675646, + "loss": 2.7486, + "step": 3315 + }, + { + "epoch": 0.26761359050924055, + "grad_norm": 0.7329363226890564, + "learning_rate": 0.0001873861781695519, + "loss": 2.6414, + "step": 3316 + }, + { + "epoch": 0.2676942942458236, + "grad_norm": 0.7078117728233337, + "learning_rate": 0.00018737850189445534, + "loss": 2.7271, + "step": 3317 + }, + { + "epoch": 0.26777499798240656, + "grad_norm": 0.7945309281349182, + "learning_rate": 0.00018737082344165814, + "loss": 2.7323, + "step": 3318 + }, + { + "epoch": 0.2678557017189896, + "grad_norm": 0.7510890364646912, + "learning_rate": 0.0001873631428113516, + "loss": 2.6563, + "step": 3319 + }, + { + "epoch": 0.26793640545557257, + "grad_norm": 0.7790820002555847, + "learning_rate": 0.0001873554600037272, + "loss": 2.7445, + "step": 3320 + }, + { + "epoch": 0.2680171091921556, + "grad_norm": 0.7689393162727356, + "learning_rate": 0.00018734777501897636, + "loss": 2.669, + "step": 3321 + }, + { + "epoch": 0.2680978129287386, + "grad_norm": 0.8227118253707886, + "learning_rate": 0.00018734008785729065, + "loss": 2.7279, + "step": 3322 + }, + { + "epoch": 0.2681785166653216, + "grad_norm": 0.7551290392875671, + "learning_rate": 0.00018733239851886162, + "loss": 2.6864, + "step": 3323 + }, + { + "epoch": 0.2682592204019046, + "grad_norm": 0.8572004437446594, + "learning_rate": 0.00018732470700388097, + "loss": 2.8159, + "step": 3324 + }, + { + "epoch": 0.2683399241384876, + "grad_norm": 0.7509044408798218, + 
"learning_rate": 0.00018731701331254033, + "loss": 2.7698, + "step": 3325 + }, + { + "epoch": 0.2684206278750706, + "grad_norm": 0.8474129438400269, + "learning_rate": 0.00018730931744503148, + "loss": 2.6745, + "step": 3326 + }, + { + "epoch": 0.2685013316116536, + "grad_norm": 0.8310953378677368, + "learning_rate": 0.00018730161940154618, + "loss": 2.712, + "step": 3327 + }, + { + "epoch": 0.2685820353482366, + "grad_norm": 0.8820717334747314, + "learning_rate": 0.00018729391918227632, + "loss": 2.7776, + "step": 3328 + }, + { + "epoch": 0.26866273908481964, + "grad_norm": 0.8827663064002991, + "learning_rate": 0.00018728621678741384, + "loss": 2.7115, + "step": 3329 + }, + { + "epoch": 0.2687434428214026, + "grad_norm": 0.7896323800086975, + "learning_rate": 0.00018727851221715064, + "loss": 2.6799, + "step": 3330 + }, + { + "epoch": 0.26882414655798564, + "grad_norm": 0.7775614261627197, + "learning_rate": 0.0001872708054716788, + "loss": 2.7021, + "step": 3331 + }, + { + "epoch": 0.2689048502945686, + "grad_norm": 0.8150187134742737, + "learning_rate": 0.0001872630965511903, + "loss": 2.679, + "step": 3332 + }, + { + "epoch": 0.26898555403115165, + "grad_norm": 0.7821844220161438, + "learning_rate": 0.00018725538545587736, + "loss": 2.7067, + "step": 3333 + }, + { + "epoch": 0.26906625776773463, + "grad_norm": 0.8390234112739563, + "learning_rate": 0.00018724767218593216, + "loss": 2.7133, + "step": 3334 + }, + { + "epoch": 0.26914696150431766, + "grad_norm": 0.8150694370269775, + "learning_rate": 0.00018723995674154687, + "loss": 2.7022, + "step": 3335 + }, + { + "epoch": 0.26922766524090064, + "grad_norm": 0.7473872900009155, + "learning_rate": 0.0001872322391229138, + "loss": 2.7268, + "step": 3336 + }, + { + "epoch": 0.26930836897748367, + "grad_norm": 0.7591951489448547, + "learning_rate": 0.0001872245193302253, + "loss": 2.7516, + "step": 3337 + }, + { + "epoch": 0.26938907271406665, + "grad_norm": 0.7914662957191467, + "learning_rate": 
0.00018721679736367382, + "loss": 2.6613, + "step": 3338 + }, + { + "epoch": 0.2694697764506497, + "grad_norm": 0.7823428511619568, + "learning_rate": 0.00018720907322345172, + "loss": 2.6661, + "step": 3339 + }, + { + "epoch": 0.26955048018723266, + "grad_norm": 0.8428264260292053, + "learning_rate": 0.00018720134690975156, + "loss": 2.672, + "step": 3340 + }, + { + "epoch": 0.2696311839238157, + "grad_norm": 0.71320641040802, + "learning_rate": 0.00018719361842276587, + "loss": 2.7326, + "step": 3341 + }, + { + "epoch": 0.26971188766039866, + "grad_norm": 0.7972821593284607, + "learning_rate": 0.00018718588776268731, + "loss": 2.7182, + "step": 3342 + }, + { + "epoch": 0.2697925913969817, + "grad_norm": 0.7924500107765198, + "learning_rate": 0.0001871781549297085, + "loss": 2.7308, + "step": 3343 + }, + { + "epoch": 0.2698732951335647, + "grad_norm": 0.7668356895446777, + "learning_rate": 0.0001871704199240222, + "loss": 2.678, + "step": 3344 + }, + { + "epoch": 0.2699539988701477, + "grad_norm": 0.866973876953125, + "learning_rate": 0.00018716268274582114, + "loss": 2.7802, + "step": 3345 + }, + { + "epoch": 0.2700347026067307, + "grad_norm": 0.7709557414054871, + "learning_rate": 0.0001871549433952982, + "loss": 2.7418, + "step": 3346 + }, + { + "epoch": 0.2701154063433137, + "grad_norm": 0.7707573771476746, + "learning_rate": 0.00018714720187264626, + "loss": 2.7486, + "step": 3347 + }, + { + "epoch": 0.2701961100798967, + "grad_norm": 0.8007768392562866, + "learning_rate": 0.00018713945817805822, + "loss": 2.7106, + "step": 3348 + }, + { + "epoch": 0.2702768138164797, + "grad_norm": 0.7239583134651184, + "learning_rate": 0.0001871317123117271, + "loss": 2.7209, + "step": 3349 + }, + { + "epoch": 0.2703575175530627, + "grad_norm": 0.775104820728302, + "learning_rate": 0.00018712396427384594, + "loss": 2.6503, + "step": 3350 + }, + { + "epoch": 0.27043822128964573, + "grad_norm": 0.7492741346359253, + "learning_rate": 0.0001871162140646079, + "loss": 2.699, + 
"step": 3351 + }, + { + "epoch": 0.2705189250262287, + "grad_norm": 0.7550846338272095, + "learning_rate": 0.00018710846168420604, + "loss": 2.7458, + "step": 3352 + }, + { + "epoch": 0.27059962876281174, + "grad_norm": 0.807996928691864, + "learning_rate": 0.0001871007071328336, + "loss": 2.7604, + "step": 3353 + }, + { + "epoch": 0.2706803324993947, + "grad_norm": 0.7381845116615295, + "learning_rate": 0.00018709295041068386, + "loss": 2.6833, + "step": 3354 + }, + { + "epoch": 0.27076103623597775, + "grad_norm": 0.7542420625686646, + "learning_rate": 0.00018708519151795016, + "loss": 2.6462, + "step": 3355 + }, + { + "epoch": 0.2708417399725607, + "grad_norm": 0.7675846219062805, + "learning_rate": 0.00018707743045482582, + "loss": 2.7068, + "step": 3356 + }, + { + "epoch": 0.27092244370914376, + "grad_norm": 0.7437357902526855, + "learning_rate": 0.0001870696672215043, + "loss": 2.73, + "step": 3357 + }, + { + "epoch": 0.27100314744572673, + "grad_norm": 0.7880852222442627, + "learning_rate": 0.00018706190181817903, + "loss": 2.759, + "step": 3358 + }, + { + "epoch": 0.27108385118230977, + "grad_norm": 0.7403178811073303, + "learning_rate": 0.00018705413424504363, + "loss": 2.7538, + "step": 3359 + }, + { + "epoch": 0.27116455491889274, + "grad_norm": 0.7601225972175598, + "learning_rate": 0.00018704636450229164, + "loss": 2.7331, + "step": 3360 + }, + { + "epoch": 0.2712452586554758, + "grad_norm": 0.7810701727867126, + "learning_rate": 0.0001870385925901167, + "loss": 2.7736, + "step": 3361 + }, + { + "epoch": 0.27132596239205875, + "grad_norm": 0.8934530019760132, + "learning_rate": 0.0001870308185087125, + "loss": 2.7214, + "step": 3362 + }, + { + "epoch": 0.2714066661286418, + "grad_norm": 0.7468441128730774, + "learning_rate": 0.0001870230422582728, + "loss": 2.6957, + "step": 3363 + }, + { + "epoch": 0.27148736986522476, + "grad_norm": 0.7643293142318726, + "learning_rate": 0.00018701526383899144, + "loss": 2.6773, + "step": 3364 + }, + { + "epoch": 
0.2715680736018078, + "grad_norm": 0.7602033615112305, + "learning_rate": 0.0001870074832510622, + "loss": 2.7095, + "step": 3365 + }, + { + "epoch": 0.27164877733839077, + "grad_norm": 0.772065281867981, + "learning_rate": 0.00018699970049467908, + "loss": 2.6753, + "step": 3366 + }, + { + "epoch": 0.27172948107497374, + "grad_norm": 0.7718359231948853, + "learning_rate": 0.00018699191557003598, + "loss": 2.6857, + "step": 3367 + }, + { + "epoch": 0.2718101848115568, + "grad_norm": 0.8207093477249146, + "learning_rate": 0.00018698412847732693, + "loss": 2.7549, + "step": 3368 + }, + { + "epoch": 0.27189088854813975, + "grad_norm": 0.7393590807914734, + "learning_rate": 0.00018697633921674605, + "loss": 2.6884, + "step": 3369 + }, + { + "epoch": 0.2719715922847228, + "grad_norm": 0.7955869436264038, + "learning_rate": 0.0001869685477884874, + "loss": 2.708, + "step": 3370 + }, + { + "epoch": 0.27205229602130576, + "grad_norm": 0.7392188906669617, + "learning_rate": 0.00018696075419274527, + "loss": 2.717, + "step": 3371 + }, + { + "epoch": 0.2721329997578888, + "grad_norm": 0.800204873085022, + "learning_rate": 0.00018695295842971376, + "loss": 2.7184, + "step": 3372 + }, + { + "epoch": 0.27221370349447177, + "grad_norm": 0.8195740580558777, + "learning_rate": 0.00018694516049958725, + "loss": 2.6865, + "step": 3373 + }, + { + "epoch": 0.2722944072310548, + "grad_norm": 0.8617578148841858, + "learning_rate": 0.00018693736040256007, + "loss": 2.7098, + "step": 3374 + }, + { + "epoch": 0.2723751109676378, + "grad_norm": 0.8184413909912109, + "learning_rate": 0.00018692955813882662, + "loss": 2.7449, + "step": 3375 + }, + { + "epoch": 0.2724558147042208, + "grad_norm": 0.990275502204895, + "learning_rate": 0.00018692175370858133, + "loss": 2.7891, + "step": 3376 + }, + { + "epoch": 0.2725365184408038, + "grad_norm": 0.7857810854911804, + "learning_rate": 0.0001869139471120187, + "loss": 2.6884, + "step": 3377 + }, + { + "epoch": 0.2726172221773868, + "grad_norm": 
0.8040915131568909, + "learning_rate": 0.00018690613834933335, + "loss": 2.7047, + "step": 3378 + }, + { + "epoch": 0.2726979259139698, + "grad_norm": 0.7512348294258118, + "learning_rate": 0.00018689832742071983, + "loss": 2.6898, + "step": 3379 + }, + { + "epoch": 0.27277862965055283, + "grad_norm": 0.6781859397888184, + "learning_rate": 0.00018689051432637288, + "loss": 2.6396, + "step": 3380 + }, + { + "epoch": 0.2728593333871358, + "grad_norm": 0.7858247756958008, + "learning_rate": 0.00018688269906648716, + "loss": 2.6785, + "step": 3381 + }, + { + "epoch": 0.27294003712371884, + "grad_norm": 0.7342140674591064, + "learning_rate": 0.00018687488164125744, + "loss": 2.6778, + "step": 3382 + }, + { + "epoch": 0.2730207408603018, + "grad_norm": 0.8113372921943665, + "learning_rate": 0.00018686706205087858, + "loss": 2.6982, + "step": 3383 + }, + { + "epoch": 0.27310144459688485, + "grad_norm": 0.7904205918312073, + "learning_rate": 0.0001868592402955455, + "loss": 2.7891, + "step": 3384 + }, + { + "epoch": 0.2731821483334678, + "grad_norm": 0.7274135947227478, + "learning_rate": 0.00018685141637545308, + "loss": 2.6908, + "step": 3385 + }, + { + "epoch": 0.27326285207005085, + "grad_norm": 0.7675744295120239, + "learning_rate": 0.0001868435902907963, + "loss": 2.6987, + "step": 3386 + }, + { + "epoch": 0.27334355580663383, + "grad_norm": 0.8085030913352966, + "learning_rate": 0.00018683576204177026, + "loss": 2.7798, + "step": 3387 + }, + { + "epoch": 0.27342425954321686, + "grad_norm": 0.7498135566711426, + "learning_rate": 0.00018682793162857006, + "loss": 2.7216, + "step": 3388 + }, + { + "epoch": 0.27350496327979984, + "grad_norm": 0.900741696357727, + "learning_rate": 0.0001868200990513908, + "loss": 2.6871, + "step": 3389 + }, + { + "epoch": 0.27358566701638287, + "grad_norm": 0.7948571443557739, + "learning_rate": 0.00018681226431042772, + "loss": 2.6985, + "step": 3390 + }, + { + "epoch": 0.27366637075296585, + "grad_norm": 0.8739100098609924, + 
"learning_rate": 0.00018680442740587612, + "loss": 2.6922, + "step": 3391 + }, + { + "epoch": 0.2737470744895489, + "grad_norm": 0.730084240436554, + "learning_rate": 0.00018679658833793125, + "loss": 2.7029, + "step": 3392 + }, + { + "epoch": 0.27382777822613186, + "grad_norm": 0.7560603022575378, + "learning_rate": 0.00018678874710678853, + "loss": 2.7429, + "step": 3393 + }, + { + "epoch": 0.2739084819627149, + "grad_norm": 0.8331460356712341, + "learning_rate": 0.00018678090371264334, + "loss": 2.7157, + "step": 3394 + }, + { + "epoch": 0.27398918569929787, + "grad_norm": 0.8070168495178223, + "learning_rate": 0.00018677305815569122, + "loss": 2.7629, + "step": 3395 + }, + { + "epoch": 0.2740698894358809, + "grad_norm": 0.7922534346580505, + "learning_rate": 0.00018676521043612762, + "loss": 2.7159, + "step": 3396 + }, + { + "epoch": 0.2741505931724639, + "grad_norm": 0.7838901281356812, + "learning_rate": 0.0001867573605541482, + "loss": 2.6721, + "step": 3397 + }, + { + "epoch": 0.2742312969090469, + "grad_norm": 0.8912512063980103, + "learning_rate": 0.00018674950850994856, + "loss": 2.7243, + "step": 3398 + }, + { + "epoch": 0.2743120006456299, + "grad_norm": 0.7205448150634766, + "learning_rate": 0.0001867416543037244, + "loss": 2.7152, + "step": 3399 + }, + { + "epoch": 0.2743927043822129, + "grad_norm": 0.6992877721786499, + "learning_rate": 0.00018673379793567146, + "loss": 2.7183, + "step": 3400 + }, + { + "epoch": 0.2744734081187959, + "grad_norm": 0.8009448051452637, + "learning_rate": 0.00018672593940598556, + "loss": 2.715, + "step": 3401 + }, + { + "epoch": 0.2745541118553789, + "grad_norm": 0.7812647819519043, + "learning_rate": 0.0001867180787148626, + "loss": 2.7579, + "step": 3402 + }, + { + "epoch": 0.2746348155919619, + "grad_norm": 0.7300555109977722, + "learning_rate": 0.00018671021586249835, + "loss": 2.694, + "step": 3403 + }, + { + "epoch": 0.27471551932854493, + "grad_norm": 0.8082736134529114, + "learning_rate": 
0.00018670235084908887, + "loss": 2.768, + "step": 3404 + }, + { + "epoch": 0.2747962230651279, + "grad_norm": 0.7729581594467163, + "learning_rate": 0.0001866944836748302, + "loss": 2.7256, + "step": 3405 + }, + { + "epoch": 0.27487692680171094, + "grad_norm": 0.8113458752632141, + "learning_rate": 0.00018668661433991835, + "loss": 2.6692, + "step": 3406 + }, + { + "epoch": 0.2749576305382939, + "grad_norm": 0.7757337689399719, + "learning_rate": 0.00018667874284454948, + "loss": 2.6769, + "step": 3407 + }, + { + "epoch": 0.27503833427487695, + "grad_norm": 0.7896093726158142, + "learning_rate": 0.00018667086918891976, + "loss": 2.7118, + "step": 3408 + }, + { + "epoch": 0.2751190380114599, + "grad_norm": 0.7764071822166443, + "learning_rate": 0.00018666299337322543, + "loss": 2.7284, + "step": 3409 + }, + { + "epoch": 0.27519974174804296, + "grad_norm": 0.794815182685852, + "learning_rate": 0.00018665511539766273, + "loss": 2.7232, + "step": 3410 + }, + { + "epoch": 0.27528044548462594, + "grad_norm": 0.8134122490882874, + "learning_rate": 0.0001866472352624281, + "loss": 2.7023, + "step": 3411 + }, + { + "epoch": 0.27536114922120897, + "grad_norm": 0.7654025554656982, + "learning_rate": 0.00018663935296771782, + "loss": 2.7002, + "step": 3412 + }, + { + "epoch": 0.27544185295779194, + "grad_norm": 0.6930806636810303, + "learning_rate": 0.0001866314685137284, + "loss": 2.6764, + "step": 3413 + }, + { + "epoch": 0.275522556694375, + "grad_norm": 0.7535184621810913, + "learning_rate": 0.00018662358190065631, + "loss": 2.6657, + "step": 3414 + }, + { + "epoch": 0.27560326043095795, + "grad_norm": 0.7775620818138123, + "learning_rate": 0.00018661569312869816, + "loss": 2.6931, + "step": 3415 + }, + { + "epoch": 0.275683964167541, + "grad_norm": 0.7209072113037109, + "learning_rate": 0.00018660780219805048, + "loss": 2.7293, + "step": 3416 + }, + { + "epoch": 0.27576466790412396, + "grad_norm": 0.7182055711746216, + "learning_rate": 0.00018659990910891, + "loss": 
2.6561, + "step": 3417 + }, + { + "epoch": 0.27584537164070694, + "grad_norm": 0.7130969166755676, + "learning_rate": 0.00018659201386147338, + "loss": 2.7156, + "step": 3418 + }, + { + "epoch": 0.27592607537728997, + "grad_norm": 0.7296265959739685, + "learning_rate": 0.00018658411645593745, + "loss": 2.6894, + "step": 3419 + }, + { + "epoch": 0.27600677911387295, + "grad_norm": 0.7707972526550293, + "learning_rate": 0.000186576216892499, + "loss": 2.7528, + "step": 3420 + }, + { + "epoch": 0.276087482850456, + "grad_norm": 0.6945170164108276, + "learning_rate": 0.0001865683151713549, + "loss": 2.6762, + "step": 3421 + }, + { + "epoch": 0.27616818658703896, + "grad_norm": 0.7664114236831665, + "learning_rate": 0.0001865604112927021, + "loss": 2.7212, + "step": 3422 + }, + { + "epoch": 0.276248890323622, + "grad_norm": 0.6950399875640869, + "learning_rate": 0.0001865525052567376, + "loss": 2.7035, + "step": 3423 + }, + { + "epoch": 0.27632959406020496, + "grad_norm": 0.7307506799697876, + "learning_rate": 0.00018654459706365838, + "loss": 2.7296, + "step": 3424 + }, + { + "epoch": 0.276410297796788, + "grad_norm": 0.720912516117096, + "learning_rate": 0.0001865366867136616, + "loss": 2.6884, + "step": 3425 + }, + { + "epoch": 0.276491001533371, + "grad_norm": 0.7581072449684143, + "learning_rate": 0.00018652877420694436, + "loss": 2.705, + "step": 3426 + }, + { + "epoch": 0.276571705269954, + "grad_norm": 0.7473136186599731, + "learning_rate": 0.0001865208595437039, + "loss": 2.7316, + "step": 3427 + }, + { + "epoch": 0.276652409006537, + "grad_norm": 0.7272855639457703, + "learning_rate": 0.00018651294272413745, + "loss": 2.6834, + "step": 3428 + }, + { + "epoch": 0.27673311274312, + "grad_norm": 0.7046366930007935, + "learning_rate": 0.0001865050237484423, + "loss": 2.6491, + "step": 3429 + }, + { + "epoch": 0.276813816479703, + "grad_norm": 0.7521376609802246, + "learning_rate": 0.00018649710261681586, + "loss": 2.708, + "step": 3430 + }, + { + "epoch": 
0.276894520216286, + "grad_norm": 0.7372453808784485, + "learning_rate": 0.0001864891793294555, + "loss": 2.682, + "step": 3431 + }, + { + "epoch": 0.276975223952869, + "grad_norm": 0.7381749749183655, + "learning_rate": 0.0001864812538865587, + "loss": 2.7526, + "step": 3432 + }, + { + "epoch": 0.27705592768945203, + "grad_norm": 0.7891514301300049, + "learning_rate": 0.00018647332628832298, + "loss": 2.6904, + "step": 3433 + }, + { + "epoch": 0.277136631426035, + "grad_norm": 0.7942724823951721, + "learning_rate": 0.00018646539653494596, + "loss": 2.7873, + "step": 3434 + }, + { + "epoch": 0.27721733516261804, + "grad_norm": 0.7365398406982422, + "learning_rate": 0.0001864574646266252, + "loss": 2.6684, + "step": 3435 + }, + { + "epoch": 0.277298038899201, + "grad_norm": 0.7802249193191528, + "learning_rate": 0.00018644953056355846, + "loss": 2.7152, + "step": 3436 + }, + { + "epoch": 0.27737874263578405, + "grad_norm": 0.7801448106765747, + "learning_rate": 0.0001864415943459434, + "loss": 2.7034, + "step": 3437 + }, + { + "epoch": 0.277459446372367, + "grad_norm": 0.7722738981246948, + "learning_rate": 0.00018643365597397786, + "loss": 2.7135, + "step": 3438 + }, + { + "epoch": 0.27754015010895006, + "grad_norm": 0.7847445011138916, + "learning_rate": 0.00018642571544785967, + "loss": 2.6999, + "step": 3439 + }, + { + "epoch": 0.27762085384553303, + "grad_norm": 0.7226125597953796, + "learning_rate": 0.00018641777276778675, + "loss": 2.7613, + "step": 3440 + }, + { + "epoch": 0.27770155758211607, + "grad_norm": 0.713188111782074, + "learning_rate": 0.000186409827933957, + "loss": 2.6953, + "step": 3441 + }, + { + "epoch": 0.27778226131869904, + "grad_norm": 0.7308298349380493, + "learning_rate": 0.0001864018809465685, + "loss": 2.7045, + "step": 3442 + }, + { + "epoch": 0.2778629650552821, + "grad_norm": 0.7606719732284546, + "learning_rate": 0.00018639393180581925, + "loss": 2.7883, + "step": 3443 + }, + { + "epoch": 0.27794366879186505, + "grad_norm": 
0.7583296895027161, + "learning_rate": 0.00018638598051190738, + "loss": 2.6734, + "step": 3444 + }, + { + "epoch": 0.2780243725284481, + "grad_norm": 0.7147012948989868, + "learning_rate": 0.00018637802706503108, + "loss": 2.7223, + "step": 3445 + }, + { + "epoch": 0.27810507626503106, + "grad_norm": 0.7812997102737427, + "learning_rate": 0.00018637007146538853, + "loss": 2.7277, + "step": 3446 + }, + { + "epoch": 0.2781857800016141, + "grad_norm": 0.7460772395133972, + "learning_rate": 0.000186362113713178, + "loss": 2.6875, + "step": 3447 + }, + { + "epoch": 0.27826648373819707, + "grad_norm": 0.7359143495559692, + "learning_rate": 0.0001863541538085979, + "loss": 2.7122, + "step": 3448 + }, + { + "epoch": 0.2783471874747801, + "grad_norm": 0.7122978568077087, + "learning_rate": 0.00018634619175184655, + "loss": 2.6381, + "step": 3449 + }, + { + "epoch": 0.2784278912113631, + "grad_norm": 0.6965885758399963, + "learning_rate": 0.00018633822754312234, + "loss": 2.6957, + "step": 3450 + }, + { + "epoch": 0.2785085949479461, + "grad_norm": 0.7737082242965698, + "learning_rate": 0.00018633026118262385, + "loss": 2.7579, + "step": 3451 + }, + { + "epoch": 0.2785892986845291, + "grad_norm": 0.6925420165061951, + "learning_rate": 0.00018632229267054958, + "loss": 2.6226, + "step": 3452 + }, + { + "epoch": 0.2786700024211121, + "grad_norm": 0.7496356964111328, + "learning_rate": 0.0001863143220070981, + "loss": 2.7059, + "step": 3453 + }, + { + "epoch": 0.2787507061576951, + "grad_norm": 0.7066817283630371, + "learning_rate": 0.0001863063491924681, + "loss": 2.681, + "step": 3454 + }, + { + "epoch": 0.2788314098942781, + "grad_norm": 0.8143237829208374, + "learning_rate": 0.0001862983742268583, + "loss": 2.6698, + "step": 3455 + }, + { + "epoch": 0.2789121136308611, + "grad_norm": 0.7518483996391296, + "learning_rate": 0.00018629039711046737, + "loss": 2.7041, + "step": 3456 + }, + { + "epoch": 0.27899281736744413, + "grad_norm": 0.8756366968154907, + "learning_rate": 
0.00018628241784349422, + "loss": 2.7547, + "step": 3457 + }, + { + "epoch": 0.2790735211040271, + "grad_norm": 0.8709446787834167, + "learning_rate": 0.0001862744364261377, + "loss": 2.7068, + "step": 3458 + }, + { + "epoch": 0.27915422484061014, + "grad_norm": 0.8121913075447083, + "learning_rate": 0.00018626645285859666, + "loss": 2.673, + "step": 3459 + }, + { + "epoch": 0.2792349285771931, + "grad_norm": 0.7685909271240234, + "learning_rate": 0.00018625846714107012, + "loss": 2.7389, + "step": 3460 + }, + { + "epoch": 0.27931563231377615, + "grad_norm": 0.7098073363304138, + "learning_rate": 0.0001862504792737571, + "loss": 2.6942, + "step": 3461 + }, + { + "epoch": 0.27939633605035913, + "grad_norm": 0.7718049883842468, + "learning_rate": 0.00018624248925685666, + "loss": 2.7359, + "step": 3462 + }, + { + "epoch": 0.27947703978694216, + "grad_norm": 0.7912909984588623, + "learning_rate": 0.00018623449709056797, + "loss": 2.6658, + "step": 3463 + }, + { + "epoch": 0.27955774352352514, + "grad_norm": 0.7255454659461975, + "learning_rate": 0.0001862265027750902, + "loss": 2.771, + "step": 3464 + }, + { + "epoch": 0.27963844726010817, + "grad_norm": 0.7542218565940857, + "learning_rate": 0.00018621850631062254, + "loss": 2.6741, + "step": 3465 + }, + { + "epoch": 0.27971915099669115, + "grad_norm": 0.8386052846908569, + "learning_rate": 0.00018621050769736437, + "loss": 2.67, + "step": 3466 + }, + { + "epoch": 0.2797998547332742, + "grad_norm": 0.8563781976699829, + "learning_rate": 0.00018620250693551495, + "loss": 2.7461, + "step": 3467 + }, + { + "epoch": 0.27988055846985715, + "grad_norm": 0.7490699291229248, + "learning_rate": 0.00018619450402527376, + "loss": 2.6863, + "step": 3468 + }, + { + "epoch": 0.27996126220644013, + "grad_norm": 0.8008999824523926, + "learning_rate": 0.00018618649896684017, + "loss": 2.7769, + "step": 3469 + }, + { + "epoch": 0.28004196594302316, + "grad_norm": 0.7678235769271851, + "learning_rate": 0.00018617849176041378, + "loss": 
2.7237, + "step": 3470 + }, + { + "epoch": 0.28012266967960614, + "grad_norm": 0.8774877786636353, + "learning_rate": 0.00018617048240619408, + "loss": 2.7502, + "step": 3471 + }, + { + "epoch": 0.28020337341618917, + "grad_norm": 0.8150283098220825, + "learning_rate": 0.00018616247090438073, + "loss": 2.6941, + "step": 3472 + }, + { + "epoch": 0.28028407715277215, + "grad_norm": 0.7330089807510376, + "learning_rate": 0.00018615445725517332, + "loss": 2.7002, + "step": 3473 + }, + { + "epoch": 0.2803647808893552, + "grad_norm": 0.748275101184845, + "learning_rate": 0.00018614644145877168, + "loss": 2.6996, + "step": 3474 + }, + { + "epoch": 0.28044548462593816, + "grad_norm": 0.7718296647071838, + "learning_rate": 0.0001861384235153755, + "loss": 2.7333, + "step": 3475 + }, + { + "epoch": 0.2805261883625212, + "grad_norm": 0.7751123309135437, + "learning_rate": 0.00018613040342518465, + "loss": 2.7362, + "step": 3476 + }, + { + "epoch": 0.28060689209910417, + "grad_norm": 0.70979243516922, + "learning_rate": 0.000186122381188399, + "loss": 2.6651, + "step": 3477 + }, + { + "epoch": 0.2806875958356872, + "grad_norm": 0.9607138633728027, + "learning_rate": 0.00018611435680521848, + "loss": 2.7779, + "step": 3478 + }, + { + "epoch": 0.2807682995722702, + "grad_norm": 0.709671676158905, + "learning_rate": 0.0001861063302758431, + "loss": 2.6994, + "step": 3479 + }, + { + "epoch": 0.2808490033088532, + "grad_norm": 0.8765757083892822, + "learning_rate": 0.00018609830160047283, + "loss": 2.7107, + "step": 3480 + }, + { + "epoch": 0.2809297070454362, + "grad_norm": 0.7996764183044434, + "learning_rate": 0.0001860902707793079, + "loss": 2.7921, + "step": 3481 + }, + { + "epoch": 0.2810104107820192, + "grad_norm": 0.7094513177871704, + "learning_rate": 0.0001860822378125483, + "loss": 2.7211, + "step": 3482 + }, + { + "epoch": 0.2810911145186022, + "grad_norm": 0.8068607449531555, + "learning_rate": 0.0001860742027003944, + "loss": 2.675, + "step": 3483 + }, + { + "epoch": 
0.2811718182551852, + "grad_norm": 0.7737938165664673, + "learning_rate": 0.00018606616544304628, + "loss": 2.7538, + "step": 3484 + }, + { + "epoch": 0.2812525219917682, + "grad_norm": 0.7979975342750549, + "learning_rate": 0.0001860581260407044, + "loss": 2.7894, + "step": 3485 + }, + { + "epoch": 0.28133322572835123, + "grad_norm": 0.7671655416488647, + "learning_rate": 0.00018605008449356904, + "loss": 2.7097, + "step": 3486 + }, + { + "epoch": 0.2814139294649342, + "grad_norm": 0.7284159064292908, + "learning_rate": 0.00018604204080184062, + "loss": 2.7447, + "step": 3487 + }, + { + "epoch": 0.28149463320151724, + "grad_norm": 0.7425351142883301, + "learning_rate": 0.00018603399496571968, + "loss": 2.7302, + "step": 3488 + }, + { + "epoch": 0.2815753369381002, + "grad_norm": 0.7709810733795166, + "learning_rate": 0.00018602594698540663, + "loss": 2.6979, + "step": 3489 + }, + { + "epoch": 0.28165604067468325, + "grad_norm": 0.744628369808197, + "learning_rate": 0.00018601789686110214, + "loss": 2.7279, + "step": 3490 + }, + { + "epoch": 0.2817367444112662, + "grad_norm": 0.7679976224899292, + "learning_rate": 0.00018600984459300678, + "loss": 2.6862, + "step": 3491 + }, + { + "epoch": 0.28181744814784926, + "grad_norm": 0.7923497557640076, + "learning_rate": 0.0001860017901813213, + "loss": 2.6975, + "step": 3492 + }, + { + "epoch": 0.28189815188443224, + "grad_norm": 0.7896692156791687, + "learning_rate": 0.00018599373362624636, + "loss": 2.7052, + "step": 3493 + }, + { + "epoch": 0.28197885562101527, + "grad_norm": 0.7913276553153992, + "learning_rate": 0.00018598567492798284, + "loss": 2.7233, + "step": 3494 + }, + { + "epoch": 0.28205955935759824, + "grad_norm": 0.7385257482528687, + "learning_rate": 0.00018597761408673146, + "loss": 2.7616, + "step": 3495 + }, + { + "epoch": 0.2821402630941813, + "grad_norm": 0.7181909084320068, + "learning_rate": 0.00018596955110269323, + "loss": 2.718, + "step": 3496 + }, + { + "epoch": 0.28222096683076425, + 
"grad_norm": 0.8313151597976685, + "learning_rate": 0.00018596148597606907, + "loss": 2.6775, + "step": 3497 + }, + { + "epoch": 0.2823016705673473, + "grad_norm": 0.7235481142997742, + "learning_rate": 0.00018595341870705995, + "loss": 2.7085, + "step": 3498 + }, + { + "epoch": 0.28238237430393026, + "grad_norm": 0.7092145085334778, + "learning_rate": 0.00018594534929586697, + "loss": 2.7167, + "step": 3499 + }, + { + "epoch": 0.2824630780405133, + "grad_norm": 0.7929207682609558, + "learning_rate": 0.0001859372777426912, + "loss": 2.663, + "step": 3500 + }, + { + "epoch": 0.28254378177709627, + "grad_norm": 0.7488871216773987, + "learning_rate": 0.00018592920404773383, + "loss": 2.7911, + "step": 3501 + }, + { + "epoch": 0.2826244855136793, + "grad_norm": 0.8230419158935547, + "learning_rate": 0.0001859211282111961, + "loss": 2.754, + "step": 3502 + }, + { + "epoch": 0.2827051892502623, + "grad_norm": 0.731971025466919, + "learning_rate": 0.00018591305023327924, + "loss": 2.7142, + "step": 3503 + }, + { + "epoch": 0.2827858929868453, + "grad_norm": 0.8159881234169006, + "learning_rate": 0.00018590497011418457, + "loss": 2.7046, + "step": 3504 + }, + { + "epoch": 0.2828665967234283, + "grad_norm": 0.750266432762146, + "learning_rate": 0.0001858968878541135, + "loss": 2.6951, + "step": 3505 + }, + { + "epoch": 0.2829473004600113, + "grad_norm": 0.7750049233436584, + "learning_rate": 0.00018588880345326748, + "loss": 2.6958, + "step": 3506 + }, + { + "epoch": 0.2830280041965943, + "grad_norm": 0.8559218049049377, + "learning_rate": 0.00018588071691184795, + "loss": 2.7205, + "step": 3507 + }, + { + "epoch": 0.28310870793317733, + "grad_norm": 0.7334830164909363, + "learning_rate": 0.00018587262823005642, + "loss": 2.7134, + "step": 3508 + }, + { + "epoch": 0.2831894116697603, + "grad_norm": 0.8749497532844543, + "learning_rate": 0.00018586453740809456, + "loss": 2.6811, + "step": 3509 + }, + { + "epoch": 0.28327011540634334, + "grad_norm": 0.8800753355026245, + 
"learning_rate": 0.00018585644444616396, + "loss": 2.7427, + "step": 3510 + }, + { + "epoch": 0.2833508191429263, + "grad_norm": 0.8666185736656189, + "learning_rate": 0.00018584834934446632, + "loss": 2.6828, + "step": 3511 + }, + { + "epoch": 0.28343152287950935, + "grad_norm": 0.7451635003089905, + "learning_rate": 0.00018584025210320343, + "loss": 2.6784, + "step": 3512 + }, + { + "epoch": 0.2835122266160923, + "grad_norm": 0.8512656688690186, + "learning_rate": 0.00018583215272257708, + "loss": 2.7762, + "step": 3513 + }, + { + "epoch": 0.28359293035267535, + "grad_norm": 0.9298297166824341, + "learning_rate": 0.00018582405120278907, + "loss": 2.7714, + "step": 3514 + }, + { + "epoch": 0.28367363408925833, + "grad_norm": 0.7968065738677979, + "learning_rate": 0.0001858159475440414, + "loss": 2.7286, + "step": 3515 + }, + { + "epoch": 0.28375433782584136, + "grad_norm": 0.7381564378738403, + "learning_rate": 0.00018580784174653596, + "loss": 2.6697, + "step": 3516 + }, + { + "epoch": 0.28383504156242434, + "grad_norm": 0.8199222683906555, + "learning_rate": 0.00018579973381047481, + "loss": 2.7463, + "step": 3517 + }, + { + "epoch": 0.28391574529900737, + "grad_norm": 0.8022071123123169, + "learning_rate": 0.00018579162373606002, + "loss": 2.6898, + "step": 3518 + }, + { + "epoch": 0.28399644903559035, + "grad_norm": 0.7899700999259949, + "learning_rate": 0.0001857835115234937, + "loss": 2.7074, + "step": 3519 + }, + { + "epoch": 0.2840771527721733, + "grad_norm": 0.7237183451652527, + "learning_rate": 0.00018577539717297805, + "loss": 2.6699, + "step": 3520 + }, + { + "epoch": 0.28415785650875636, + "grad_norm": 0.7627314329147339, + "learning_rate": 0.00018576728068471526, + "loss": 2.7745, + "step": 3521 + }, + { + "epoch": 0.28423856024533933, + "grad_norm": 0.7301654815673828, + "learning_rate": 0.00018575916205890766, + "loss": 2.7191, + "step": 3522 + }, + { + "epoch": 0.28431926398192237, + "grad_norm": 0.7441647052764893, + "learning_rate": 
0.00018575104129575753, + "loss": 2.7529, + "step": 3523 + }, + { + "epoch": 0.28439996771850534, + "grad_norm": 0.7715914249420166, + "learning_rate": 0.0001857429183954673, + "loss": 2.6893, + "step": 3524 + }, + { + "epoch": 0.2844806714550884, + "grad_norm": 0.7464057207107544, + "learning_rate": 0.00018573479335823944, + "loss": 2.7169, + "step": 3525 + }, + { + "epoch": 0.28456137519167135, + "grad_norm": 0.753198504447937, + "learning_rate": 0.00018572666618427638, + "loss": 2.7144, + "step": 3526 + }, + { + "epoch": 0.2846420789282544, + "grad_norm": 0.7681953310966492, + "learning_rate": 0.00018571853687378073, + "loss": 2.709, + "step": 3527 + }, + { + "epoch": 0.28472278266483736, + "grad_norm": 0.7591876983642578, + "learning_rate": 0.0001857104054269551, + "loss": 2.7519, + "step": 3528 + }, + { + "epoch": 0.2848034864014204, + "grad_norm": 0.7417709827423096, + "learning_rate": 0.00018570227184400205, + "loss": 2.6756, + "step": 3529 + }, + { + "epoch": 0.28488419013800337, + "grad_norm": 0.7641329169273376, + "learning_rate": 0.0001856941361251244, + "loss": 2.6614, + "step": 3530 + }, + { + "epoch": 0.2849648938745864, + "grad_norm": 0.7813490033149719, + "learning_rate": 0.0001856859982705249, + "loss": 2.7145, + "step": 3531 + }, + { + "epoch": 0.2850455976111694, + "grad_norm": 0.7777202129364014, + "learning_rate": 0.00018567785828040628, + "loss": 2.7015, + "step": 3532 + }, + { + "epoch": 0.2851263013477524, + "grad_norm": 0.7647144794464111, + "learning_rate": 0.0001856697161549715, + "loss": 2.7311, + "step": 3533 + }, + { + "epoch": 0.2852070050843354, + "grad_norm": 0.7477256655693054, + "learning_rate": 0.00018566157189442342, + "loss": 2.6832, + "step": 3534 + }, + { + "epoch": 0.2852877088209184, + "grad_norm": 0.7037049531936646, + "learning_rate": 0.00018565342549896506, + "loss": 2.6942, + "step": 3535 + }, + { + "epoch": 0.2853684125575014, + "grad_norm": 0.7309197783470154, + "learning_rate": 0.00018564527696879945, + "loss": 
2.6797, + "step": 3536 + }, + { + "epoch": 0.2854491162940844, + "grad_norm": 0.798075795173645, + "learning_rate": 0.00018563712630412967, + "loss": 2.6926, + "step": 3537 + }, + { + "epoch": 0.2855298200306674, + "grad_norm": 0.7831682562828064, + "learning_rate": 0.0001856289735051588, + "loss": 2.7537, + "step": 3538 + }, + { + "epoch": 0.28561052376725043, + "grad_norm": 0.7983096241950989, + "learning_rate": 0.0001856208185720901, + "loss": 2.7037, + "step": 3539 + }, + { + "epoch": 0.2856912275038334, + "grad_norm": 0.7250573635101318, + "learning_rate": 0.00018561266150512678, + "loss": 2.7282, + "step": 3540 + }, + { + "epoch": 0.28577193124041644, + "grad_norm": 0.7800211906433105, + "learning_rate": 0.00018560450230447218, + "loss": 2.6541, + "step": 3541 + }, + { + "epoch": 0.2858526349769994, + "grad_norm": 0.7624209523200989, + "learning_rate": 0.00018559634097032953, + "loss": 2.7041, + "step": 3542 + }, + { + "epoch": 0.28593333871358245, + "grad_norm": 0.7212036848068237, + "learning_rate": 0.0001855881775029024, + "loss": 2.7287, + "step": 3543 + }, + { + "epoch": 0.28601404245016543, + "grad_norm": 0.7774164080619812, + "learning_rate": 0.00018558001190239408, + "loss": 2.6515, + "step": 3544 + }, + { + "epoch": 0.28609474618674846, + "grad_norm": 0.7169588208198547, + "learning_rate": 0.0001855718441690082, + "loss": 2.7111, + "step": 3545 + }, + { + "epoch": 0.28617544992333144, + "grad_norm": 0.7473909258842468, + "learning_rate": 0.00018556367430294827, + "loss": 2.7405, + "step": 3546 + }, + { + "epoch": 0.28625615365991447, + "grad_norm": 0.7213929295539856, + "learning_rate": 0.0001855555023044179, + "loss": 2.7336, + "step": 3547 + }, + { + "epoch": 0.28633685739649745, + "grad_norm": 0.701816201210022, + "learning_rate": 0.00018554732817362078, + "loss": 2.721, + "step": 3548 + }, + { + "epoch": 0.2864175611330805, + "grad_norm": 0.8158134818077087, + "learning_rate": 0.00018553915191076064, + "loss": 2.6979, + "step": 3549 + }, + { + 
"epoch": 0.28649826486966345, + "grad_norm": 0.7303084135055542, + "learning_rate": 0.00018553097351604118, + "loss": 2.6734, + "step": 3550 + }, + { + "epoch": 0.2865789686062465, + "grad_norm": 0.8140435814857483, + "learning_rate": 0.00018552279298966634, + "loss": 2.6832, + "step": 3551 + }, + { + "epoch": 0.28665967234282946, + "grad_norm": 0.7024678587913513, + "learning_rate": 0.00018551461033183988, + "loss": 2.7118, + "step": 3552 + }, + { + "epoch": 0.2867403760794125, + "grad_norm": 0.7277806401252747, + "learning_rate": 0.00018550642554276582, + "loss": 2.6362, + "step": 3553 + }, + { + "epoch": 0.28682107981599547, + "grad_norm": 0.8376575112342834, + "learning_rate": 0.00018549823862264812, + "loss": 2.744, + "step": 3554 + }, + { + "epoch": 0.2869017835525785, + "grad_norm": 0.712195098400116, + "learning_rate": 0.00018549004957169082, + "loss": 2.6715, + "step": 3555 + }, + { + "epoch": 0.2869824872891615, + "grad_norm": 0.7511523962020874, + "learning_rate": 0.00018548185839009805, + "loss": 2.7655, + "step": 3556 + }, + { + "epoch": 0.2870631910257445, + "grad_norm": 0.7397211790084839, + "learning_rate": 0.00018547366507807388, + "loss": 2.6813, + "step": 3557 + }, + { + "epoch": 0.2871438947623275, + "grad_norm": 0.6926341652870178, + "learning_rate": 0.00018546546963582253, + "loss": 2.6477, + "step": 3558 + }, + { + "epoch": 0.2872245984989105, + "grad_norm": 0.7776244878768921, + "learning_rate": 0.00018545727206354827, + "loss": 2.6979, + "step": 3559 + }, + { + "epoch": 0.2873053022354935, + "grad_norm": 0.7639400959014893, + "learning_rate": 0.00018544907236145542, + "loss": 2.6913, + "step": 3560 + }, + { + "epoch": 0.28738600597207653, + "grad_norm": 0.7738329768180847, + "learning_rate": 0.0001854408705297483, + "loss": 2.7231, + "step": 3561 + }, + { + "epoch": 0.2874667097086595, + "grad_norm": 0.7182422876358032, + "learning_rate": 0.00018543266656863137, + "loss": 2.718, + "step": 3562 + }, + { + "epoch": 0.28754741344524254, + 
"grad_norm": 0.7257261276245117, + "learning_rate": 0.00018542446047830903, + "loss": 2.7354, + "step": 3563 + }, + { + "epoch": 0.2876281171818255, + "grad_norm": 0.7761391997337341, + "learning_rate": 0.00018541625225898588, + "loss": 2.705, + "step": 3564 + }, + { + "epoch": 0.28770882091840855, + "grad_norm": 0.9272314310073853, + "learning_rate": 0.0001854080419108664, + "loss": 2.7278, + "step": 3565 + }, + { + "epoch": 0.2877895246549915, + "grad_norm": 0.7622589468955994, + "learning_rate": 0.00018539982943415527, + "loss": 2.7224, + "step": 3566 + }, + { + "epoch": 0.28787022839157456, + "grad_norm": 0.725349485874176, + "learning_rate": 0.0001853916148290572, + "loss": 2.6782, + "step": 3567 + }, + { + "epoch": 0.28795093212815753, + "grad_norm": 0.776242733001709, + "learning_rate": 0.0001853833980957768, + "loss": 2.6467, + "step": 3568 + }, + { + "epoch": 0.28803163586474057, + "grad_norm": 0.8461112976074219, + "learning_rate": 0.00018537517923451896, + "loss": 2.6763, + "step": 3569 + }, + { + "epoch": 0.28811233960132354, + "grad_norm": 0.8161221742630005, + "learning_rate": 0.00018536695824548848, + "loss": 2.7057, + "step": 3570 + }, + { + "epoch": 0.2881930433379065, + "grad_norm": 0.7404211759567261, + "learning_rate": 0.00018535873512889024, + "loss": 2.7083, + "step": 3571 + }, + { + "epoch": 0.28827374707448955, + "grad_norm": 0.831042468547821, + "learning_rate": 0.00018535050988492918, + "loss": 2.6121, + "step": 3572 + }, + { + "epoch": 0.2883544508110725, + "grad_norm": 0.7286352515220642, + "learning_rate": 0.00018534228251381035, + "loss": 2.7165, + "step": 3573 + }, + { + "epoch": 0.28843515454765556, + "grad_norm": 0.7951883673667908, + "learning_rate": 0.00018533405301573872, + "loss": 2.6794, + "step": 3574 + }, + { + "epoch": 0.28851585828423854, + "grad_norm": 0.7431079149246216, + "learning_rate": 0.00018532582139091944, + "loss": 2.6758, + "step": 3575 + }, + { + "epoch": 0.28859656202082157, + "grad_norm": 0.7408809065818787, + 
"learning_rate": 0.0001853175876395576, + "loss": 2.6901, + "step": 3576 + }, + { + "epoch": 0.28867726575740454, + "grad_norm": 0.7428708672523499, + "learning_rate": 0.00018530935176185848, + "loss": 2.6679, + "step": 3577 + }, + { + "epoch": 0.2887579694939876, + "grad_norm": 0.7670302987098694, + "learning_rate": 0.00018530111375802735, + "loss": 2.7306, + "step": 3578 + }, + { + "epoch": 0.28883867323057055, + "grad_norm": 0.7582474946975708, + "learning_rate": 0.00018529287362826943, + "loss": 2.7715, + "step": 3579 + }, + { + "epoch": 0.2889193769671536, + "grad_norm": 0.750973105430603, + "learning_rate": 0.0001852846313727902, + "loss": 2.7147, + "step": 3580 + }, + { + "epoch": 0.28900008070373656, + "grad_norm": 0.771854043006897, + "learning_rate": 0.00018527638699179498, + "loss": 2.6874, + "step": 3581 + }, + { + "epoch": 0.2890807844403196, + "grad_norm": 0.785469651222229, + "learning_rate": 0.00018526814048548928, + "loss": 2.6858, + "step": 3582 + }, + { + "epoch": 0.28916148817690257, + "grad_norm": 0.7601101398468018, + "learning_rate": 0.00018525989185407864, + "loss": 2.6927, + "step": 3583 + }, + { + "epoch": 0.2892421919134856, + "grad_norm": 0.7313411831855774, + "learning_rate": 0.00018525164109776861, + "loss": 2.6813, + "step": 3584 + }, + { + "epoch": 0.2893228956500686, + "grad_norm": 0.7471718192100525, + "learning_rate": 0.00018524338821676483, + "loss": 2.6791, + "step": 3585 + }, + { + "epoch": 0.2894035993866516, + "grad_norm": 0.7615204453468323, + "learning_rate": 0.00018523513321127302, + "loss": 2.7767, + "step": 3586 + }, + { + "epoch": 0.2894843031232346, + "grad_norm": 0.766793966293335, + "learning_rate": 0.00018522687608149886, + "loss": 2.664, + "step": 3587 + }, + { + "epoch": 0.2895650068598176, + "grad_norm": 0.7897932529449463, + "learning_rate": 0.00018521861682764816, + "loss": 2.7148, + "step": 3588 + }, + { + "epoch": 0.2896457105964006, + "grad_norm": 0.7366818785667419, + "learning_rate": 
0.00018521035544992679, + "loss": 2.69, + "step": 3589 + }, + { + "epoch": 0.28972641433298363, + "grad_norm": 0.7503829598426819, + "learning_rate": 0.00018520209194854058, + "loss": 2.7141, + "step": 3590 + }, + { + "epoch": 0.2898071180695666, + "grad_norm": 0.8064351081848145, + "learning_rate": 0.00018519382632369556, + "loss": 2.6738, + "step": 3591 + }, + { + "epoch": 0.28988782180614964, + "grad_norm": 0.7364048361778259, + "learning_rate": 0.00018518555857559768, + "loss": 2.6731, + "step": 3592 + }, + { + "epoch": 0.2899685255427326, + "grad_norm": 0.7065430283546448, + "learning_rate": 0.00018517728870445297, + "loss": 2.7314, + "step": 3593 + }, + { + "epoch": 0.29004922927931565, + "grad_norm": 0.8233428001403809, + "learning_rate": 0.0001851690167104676, + "loss": 2.727, + "step": 3594 + }, + { + "epoch": 0.2901299330158986, + "grad_norm": 0.7563758492469788, + "learning_rate": 0.00018516074259384768, + "loss": 2.665, + "step": 3595 + }, + { + "epoch": 0.29021063675248165, + "grad_norm": 0.7451249361038208, + "learning_rate": 0.00018515246635479943, + "loss": 2.7686, + "step": 3596 + }, + { + "epoch": 0.29029134048906463, + "grad_norm": 0.7374305725097656, + "learning_rate": 0.00018514418799352918, + "loss": 2.6466, + "step": 3597 + }, + { + "epoch": 0.29037204422564766, + "grad_norm": 0.7596983909606934, + "learning_rate": 0.00018513590751024315, + "loss": 2.6763, + "step": 3598 + }, + { + "epoch": 0.29045274796223064, + "grad_norm": 0.7808190584182739, + "learning_rate": 0.0001851276249051478, + "loss": 2.7362, + "step": 3599 + }, + { + "epoch": 0.29053345169881367, + "grad_norm": 0.765785276889801, + "learning_rate": 0.00018511934017844948, + "loss": 2.7049, + "step": 3600 + }, + { + "epoch": 0.29061415543539665, + "grad_norm": 0.7503563165664673, + "learning_rate": 0.0001851110533303547, + "loss": 2.6262, + "step": 3601 + }, + { + "epoch": 0.2906948591719797, + "grad_norm": 0.7287782430648804, + "learning_rate": 0.00018510276436107, + "loss": 
2.7076, + "step": 3602 + }, + { + "epoch": 0.29077556290856266, + "grad_norm": 0.7748721837997437, + "learning_rate": 0.00018509447327080193, + "loss": 2.6945, + "step": 3603 + }, + { + "epoch": 0.2908562666451457, + "grad_norm": 0.7482423186302185, + "learning_rate": 0.00018508618005975714, + "loss": 2.7326, + "step": 3604 + }, + { + "epoch": 0.29093697038172867, + "grad_norm": 0.7708765864372253, + "learning_rate": 0.00018507788472814238, + "loss": 2.7602, + "step": 3605 + }, + { + "epoch": 0.2910176741183117, + "grad_norm": 0.7308060526847839, + "learning_rate": 0.0001850695872761643, + "loss": 2.6735, + "step": 3606 + }, + { + "epoch": 0.2910983778548947, + "grad_norm": 0.7512951493263245, + "learning_rate": 0.00018506128770402972, + "loss": 2.6877, + "step": 3607 + }, + { + "epoch": 0.2911790815914777, + "grad_norm": 0.6806616187095642, + "learning_rate": 0.00018505298601194552, + "loss": 2.6689, + "step": 3608 + }, + { + "epoch": 0.2912597853280607, + "grad_norm": 0.7825661301612854, + "learning_rate": 0.00018504468220011857, + "loss": 2.7108, + "step": 3609 + }, + { + "epoch": 0.2913404890646437, + "grad_norm": 0.8243381977081299, + "learning_rate": 0.00018503637626875584, + "loss": 2.6789, + "step": 3610 + }, + { + "epoch": 0.2914211928012267, + "grad_norm": 0.745012640953064, + "learning_rate": 0.00018502806821806429, + "loss": 2.7658, + "step": 3611 + }, + { + "epoch": 0.2915018965378097, + "grad_norm": 0.7091341018676758, + "learning_rate": 0.00018501975804825104, + "loss": 2.7046, + "step": 3612 + }, + { + "epoch": 0.2915826002743927, + "grad_norm": 0.729026734828949, + "learning_rate": 0.0001850114457595232, + "loss": 2.6692, + "step": 3613 + }, + { + "epoch": 0.29166330401097573, + "grad_norm": 0.8098071813583374, + "learning_rate": 0.00018500313135208786, + "loss": 2.712, + "step": 3614 + }, + { + "epoch": 0.2917440077475587, + "grad_norm": 0.7387483716011047, + "learning_rate": 0.0001849948148261523, + "loss": 2.6705, + "step": 3615 + }, + { + 
"epoch": 0.29182471148414174, + "grad_norm": 0.7904576659202576, + "learning_rate": 0.0001849864961819238, + "loss": 2.5969, + "step": 3616 + }, + { + "epoch": 0.2919054152207247, + "grad_norm": 0.7560681700706482, + "learning_rate": 0.00018497817541960964, + "loss": 2.6971, + "step": 3617 + }, + { + "epoch": 0.29198611895730775, + "grad_norm": 0.8488430976867676, + "learning_rate": 0.00018496985253941723, + "loss": 2.7367, + "step": 3618 + }, + { + "epoch": 0.2920668226938907, + "grad_norm": 0.7641268372535706, + "learning_rate": 0.00018496152754155399, + "loss": 2.6948, + "step": 3619 + }, + { + "epoch": 0.29214752643047376, + "grad_norm": 0.7219721674919128, + "learning_rate": 0.00018495320042622736, + "loss": 2.7225, + "step": 3620 + }, + { + "epoch": 0.29222823016705674, + "grad_norm": 0.7583872675895691, + "learning_rate": 0.00018494487119364493, + "loss": 2.7335, + "step": 3621 + }, + { + "epoch": 0.2923089339036397, + "grad_norm": 0.7771418690681458, + "learning_rate": 0.00018493653984401424, + "loss": 2.6712, + "step": 3622 + }, + { + "epoch": 0.29238963764022274, + "grad_norm": 0.7537891268730164, + "learning_rate": 0.00018492820637754296, + "loss": 2.7282, + "step": 3623 + }, + { + "epoch": 0.2924703413768057, + "grad_norm": 0.7334226965904236, + "learning_rate": 0.00018491987079443875, + "loss": 2.7072, + "step": 3624 + }, + { + "epoch": 0.29255104511338875, + "grad_norm": 0.7768076658248901, + "learning_rate": 0.00018491153309490942, + "loss": 2.7176, + "step": 3625 + }, + { + "epoch": 0.29263174884997173, + "grad_norm": 0.6831281185150146, + "learning_rate": 0.0001849031932791627, + "loss": 2.6982, + "step": 3626 + }, + { + "epoch": 0.29271245258655476, + "grad_norm": 0.7150557637214661, + "learning_rate": 0.00018489485134740648, + "loss": 2.7325, + "step": 3627 + }, + { + "epoch": 0.29279315632313774, + "grad_norm": 0.782667338848114, + "learning_rate": 0.00018488650729984863, + "loss": 2.7146, + "step": 3628 + }, + { + "epoch": 0.29287386005972077, 
+ "grad_norm": 0.7718524932861328, + "learning_rate": 0.0001848781611366971, + "loss": 2.746, + "step": 3629 + }, + { + "epoch": 0.29295456379630375, + "grad_norm": 0.7066439390182495, + "learning_rate": 0.00018486981285815998, + "loss": 2.7497, + "step": 3630 + }, + { + "epoch": 0.2930352675328868, + "grad_norm": 0.7705665826797485, + "learning_rate": 0.00018486146246444522, + "loss": 2.6448, + "step": 3631 + }, + { + "epoch": 0.29311597126946976, + "grad_norm": 0.7334863543510437, + "learning_rate": 0.000184853109955761, + "loss": 2.6931, + "step": 3632 + }, + { + "epoch": 0.2931966750060528, + "grad_norm": 0.7903133630752563, + "learning_rate": 0.0001848447553323155, + "loss": 2.6954, + "step": 3633 + }, + { + "epoch": 0.29327737874263576, + "grad_norm": 0.6821191310882568, + "learning_rate": 0.00018483639859431689, + "loss": 2.6165, + "step": 3634 + }, + { + "epoch": 0.2933580824792188, + "grad_norm": 0.7187811136245728, + "learning_rate": 0.00018482803974197344, + "loss": 2.6387, + "step": 3635 + }, + { + "epoch": 0.2934387862158018, + "grad_norm": 0.7429843544960022, + "learning_rate": 0.00018481967877549354, + "loss": 2.6848, + "step": 3636 + }, + { + "epoch": 0.2935194899523848, + "grad_norm": 0.7431524395942688, + "learning_rate": 0.0001848113156950855, + "loss": 2.7044, + "step": 3637 + }, + { + "epoch": 0.2936001936889678, + "grad_norm": 0.7008687853813171, + "learning_rate": 0.00018480295050095778, + "loss": 2.6922, + "step": 3638 + }, + { + "epoch": 0.2936808974255508, + "grad_norm": 0.7106652855873108, + "learning_rate": 0.00018479458319331884, + "loss": 2.6845, + "step": 3639 + }, + { + "epoch": 0.2937616011621338, + "grad_norm": 0.7288951873779297, + "learning_rate": 0.00018478621377237723, + "loss": 2.7017, + "step": 3640 + }, + { + "epoch": 0.2938423048987168, + "grad_norm": 0.7228607535362244, + "learning_rate": 0.00018477784223834155, + "loss": 2.7449, + "step": 3641 + }, + { + "epoch": 0.2939230086352998, + "grad_norm": 0.7180825471878052, + 
"learning_rate": 0.00018476946859142043, + "loss": 2.7291, + "step": 3642 + }, + { + "epoch": 0.29400371237188283, + "grad_norm": 0.7854947447776794, + "learning_rate": 0.00018476109283182258, + "loss": 2.7619, + "step": 3643 + }, + { + "epoch": 0.2940844161084658, + "grad_norm": 0.7871318459510803, + "learning_rate": 0.00018475271495975673, + "loss": 2.6695, + "step": 3644 + }, + { + "epoch": 0.29416511984504884, + "grad_norm": 0.7813127636909485, + "learning_rate": 0.00018474433497543165, + "loss": 2.735, + "step": 3645 + }, + { + "epoch": 0.2942458235816318, + "grad_norm": 0.7835291028022766, + "learning_rate": 0.00018473595287905623, + "loss": 2.7336, + "step": 3646 + }, + { + "epoch": 0.29432652731821485, + "grad_norm": 0.6970148682594299, + "learning_rate": 0.00018472756867083935, + "loss": 2.6912, + "step": 3647 + }, + { + "epoch": 0.2944072310547978, + "grad_norm": 0.7968462109565735, + "learning_rate": 0.00018471918235098998, + "loss": 2.6889, + "step": 3648 + }, + { + "epoch": 0.29448793479138086, + "grad_norm": 0.7011313438415527, + "learning_rate": 0.00018471079391971714, + "loss": 2.6989, + "step": 3649 + }, + { + "epoch": 0.29456863852796383, + "grad_norm": 0.8047335743904114, + "learning_rate": 0.00018470240337722991, + "loss": 2.6827, + "step": 3650 + }, + { + "epoch": 0.29464934226454687, + "grad_norm": 0.7446332573890686, + "learning_rate": 0.00018469401072373733, + "loss": 2.7089, + "step": 3651 + }, + { + "epoch": 0.29473004600112984, + "grad_norm": 0.7610359191894531, + "learning_rate": 0.00018468561595944862, + "loss": 2.6766, + "step": 3652 + }, + { + "epoch": 0.2948107497377129, + "grad_norm": 0.7705755233764648, + "learning_rate": 0.000184677219084573, + "loss": 2.7445, + "step": 3653 + }, + { + "epoch": 0.29489145347429585, + "grad_norm": 0.7466446757316589, + "learning_rate": 0.00018466882009931973, + "loss": 2.726, + "step": 3654 + }, + { + "epoch": 0.2949721572108789, + "grad_norm": 0.7912059426307678, + "learning_rate": 
0.00018466041900389813, + "loss": 2.6865, + "step": 3655 + }, + { + "epoch": 0.29505286094746186, + "grad_norm": 0.722588837146759, + "learning_rate": 0.00018465201579851757, + "loss": 2.7039, + "step": 3656 + }, + { + "epoch": 0.2951335646840449, + "grad_norm": 0.739311933517456, + "learning_rate": 0.00018464361048338752, + "loss": 2.6991, + "step": 3657 + }, + { + "epoch": 0.29521426842062787, + "grad_norm": 0.7784128785133362, + "learning_rate": 0.00018463520305871743, + "loss": 2.753, + "step": 3658 + }, + { + "epoch": 0.2952949721572109, + "grad_norm": 0.8261777758598328, + "learning_rate": 0.00018462679352471682, + "loss": 2.7257, + "step": 3659 + }, + { + "epoch": 0.2953756758937939, + "grad_norm": 0.7510927319526672, + "learning_rate": 0.0001846183818815953, + "loss": 2.6981, + "step": 3660 + }, + { + "epoch": 0.2954563796303769, + "grad_norm": 0.7403035163879395, + "learning_rate": 0.00018460996812956254, + "loss": 2.744, + "step": 3661 + }, + { + "epoch": 0.2955370833669599, + "grad_norm": 0.7927733063697815, + "learning_rate": 0.00018460155226882817, + "loss": 2.6304, + "step": 3662 + }, + { + "epoch": 0.2956177871035429, + "grad_norm": 0.7923495769500732, + "learning_rate": 0.000184593134299602, + "loss": 2.7882, + "step": 3663 + }, + { + "epoch": 0.2956984908401259, + "grad_norm": 0.7639210224151611, + "learning_rate": 0.00018458471422209377, + "loss": 2.7171, + "step": 3664 + }, + { + "epoch": 0.2957791945767089, + "grad_norm": 0.736652672290802, + "learning_rate": 0.00018457629203651337, + "loss": 2.7479, + "step": 3665 + }, + { + "epoch": 0.2958598983132919, + "grad_norm": 0.7718610763549805, + "learning_rate": 0.00018456786774307066, + "loss": 2.7135, + "step": 3666 + }, + { + "epoch": 0.29594060204987493, + "grad_norm": 0.7711780071258545, + "learning_rate": 0.00018455944134197565, + "loss": 2.6867, + "step": 3667 + }, + { + "epoch": 0.2960213057864579, + "grad_norm": 0.7202491760253906, + "learning_rate": 0.0001845510128334383, + "loss": 2.6657, 
+ "step": 3668 + }, + { + "epoch": 0.29610200952304094, + "grad_norm": 0.8155657649040222, + "learning_rate": 0.00018454258221766869, + "loss": 2.7342, + "step": 3669 + }, + { + "epoch": 0.2961827132596239, + "grad_norm": 0.7972069382667542, + "learning_rate": 0.00018453414949487696, + "loss": 2.7351, + "step": 3670 + }, + { + "epoch": 0.29626341699620695, + "grad_norm": 0.8645625710487366, + "learning_rate": 0.00018452571466527325, + "loss": 2.6778, + "step": 3671 + }, + { + "epoch": 0.29634412073278993, + "grad_norm": 0.7410334944725037, + "learning_rate": 0.00018451727772906775, + "loss": 2.7228, + "step": 3672 + }, + { + "epoch": 0.2964248244693729, + "grad_norm": 0.7845733165740967, + "learning_rate": 0.0001845088386864708, + "loss": 2.7068, + "step": 3673 + }, + { + "epoch": 0.29650552820595594, + "grad_norm": 0.7709881067276001, + "learning_rate": 0.00018450039753769266, + "loss": 2.676, + "step": 3674 + }, + { + "epoch": 0.2965862319425389, + "grad_norm": 0.7214749455451965, + "learning_rate": 0.00018449195428294371, + "loss": 2.6488, + "step": 3675 + }, + { + "epoch": 0.29666693567912195, + "grad_norm": 0.7467561960220337, + "learning_rate": 0.00018448350892243443, + "loss": 2.7262, + "step": 3676 + }, + { + "epoch": 0.2967476394157049, + "grad_norm": 0.8412678241729736, + "learning_rate": 0.00018447506145637522, + "loss": 2.7898, + "step": 3677 + }, + { + "epoch": 0.29682834315228795, + "grad_norm": 0.7130109071731567, + "learning_rate": 0.00018446661188497668, + "loss": 2.7344, + "step": 3678 + }, + { + "epoch": 0.29690904688887093, + "grad_norm": 0.7807374000549316, + "learning_rate": 0.00018445816020844937, + "loss": 2.7198, + "step": 3679 + }, + { + "epoch": 0.29698975062545396, + "grad_norm": 0.8497760891914368, + "learning_rate": 0.00018444970642700394, + "loss": 2.7479, + "step": 3680 + }, + { + "epoch": 0.29707045436203694, + "grad_norm": 0.6827178001403809, + "learning_rate": 0.0001844412505408511, + "loss": 2.727, + "step": 3681 + }, + { + 
"epoch": 0.29715115809861997, + "grad_norm": 0.8063304424285889, + "learning_rate": 0.00018443279255020152, + "loss": 2.7896, + "step": 3682 + }, + { + "epoch": 0.29723186183520295, + "grad_norm": 0.7759353518486023, + "learning_rate": 0.00018442433245526604, + "loss": 2.7014, + "step": 3683 + }, + { + "epoch": 0.297312565571786, + "grad_norm": 0.7380958199501038, + "learning_rate": 0.00018441587025625554, + "loss": 2.6665, + "step": 3684 + }, + { + "epoch": 0.29739326930836896, + "grad_norm": 0.7623556852340698, + "learning_rate": 0.00018440740595338087, + "loss": 2.6955, + "step": 3685 + }, + { + "epoch": 0.297473973044952, + "grad_norm": 0.8204537630081177, + "learning_rate": 0.000184398939546853, + "loss": 2.6854, + "step": 3686 + }, + { + "epoch": 0.29755467678153497, + "grad_norm": 0.7346726655960083, + "learning_rate": 0.00018439047103688293, + "loss": 2.6664, + "step": 3687 + }, + { + "epoch": 0.297635380518118, + "grad_norm": 0.777860701084137, + "learning_rate": 0.00018438200042368173, + "loss": 2.6423, + "step": 3688 + }, + { + "epoch": 0.297716084254701, + "grad_norm": 0.7331553101539612, + "learning_rate": 0.00018437352770746054, + "loss": 2.6137, + "step": 3689 + }, + { + "epoch": 0.297796787991284, + "grad_norm": 0.7634466290473938, + "learning_rate": 0.00018436505288843043, + "loss": 2.7266, + "step": 3690 + }, + { + "epoch": 0.297877491727867, + "grad_norm": 0.8151016235351562, + "learning_rate": 0.00018435657596680268, + "loss": 2.7373, + "step": 3691 + }, + { + "epoch": 0.29795819546445, + "grad_norm": 0.7806773781776428, + "learning_rate": 0.00018434809694278857, + "loss": 2.7011, + "step": 3692 + }, + { + "epoch": 0.298038899201033, + "grad_norm": 0.7575243711471558, + "learning_rate": 0.00018433961581659935, + "loss": 2.6601, + "step": 3693 + }, + { + "epoch": 0.298119602937616, + "grad_norm": 0.7527276873588562, + "learning_rate": 0.00018433113258844647, + "loss": 2.6864, + "step": 3694 + }, + { + "epoch": 0.298200306674199, + "grad_norm": 
0.8024318218231201, + "learning_rate": 0.0001843226472585413, + "loss": 2.728, + "step": 3695 + }, + { + "epoch": 0.29828101041078203, + "grad_norm": 0.7549982666969299, + "learning_rate": 0.0001843141598270954, + "loss": 2.6834, + "step": 3696 + }, + { + "epoch": 0.298361714147365, + "grad_norm": 0.7699971199035645, + "learning_rate": 0.0001843056702943202, + "loss": 2.7209, + "step": 3697 + }, + { + "epoch": 0.29844241788394804, + "grad_norm": 0.823842465877533, + "learning_rate": 0.0001842971786604273, + "loss": 2.6924, + "step": 3698 + }, + { + "epoch": 0.298523121620531, + "grad_norm": 0.7645791172981262, + "learning_rate": 0.00018428868492562837, + "loss": 2.6821, + "step": 3699 + }, + { + "epoch": 0.29860382535711405, + "grad_norm": 0.7530989050865173, + "learning_rate": 0.00018428018909013506, + "loss": 2.7592, + "step": 3700 + }, + { + "epoch": 0.298684529093697, + "grad_norm": 0.7958168387413025, + "learning_rate": 0.00018427169115415914, + "loss": 2.6925, + "step": 3701 + }, + { + "epoch": 0.29876523283028006, + "grad_norm": 0.7777522802352905, + "learning_rate": 0.00018426319111791242, + "loss": 2.6757, + "step": 3702 + }, + { + "epoch": 0.29884593656686304, + "grad_norm": 0.7418079972267151, + "learning_rate": 0.00018425468898160667, + "loss": 2.6445, + "step": 3703 + }, + { + "epoch": 0.29892664030344607, + "grad_norm": 0.7591132521629333, + "learning_rate": 0.00018424618474545382, + "loss": 2.7157, + "step": 3704 + }, + { + "epoch": 0.29900734404002904, + "grad_norm": 0.7591627836227417, + "learning_rate": 0.00018423767840966586, + "loss": 2.6691, + "step": 3705 + }, + { + "epoch": 0.2990880477766121, + "grad_norm": 0.7934779524803162, + "learning_rate": 0.00018422916997445476, + "loss": 2.7262, + "step": 3706 + }, + { + "epoch": 0.29916875151319505, + "grad_norm": 0.7964254021644592, + "learning_rate": 0.00018422065944003252, + "loss": 2.6196, + "step": 3707 + }, + { + "epoch": 0.2992494552497781, + "grad_norm": 0.7448374032974243, + 
"learning_rate": 0.0001842121468066113, + "loss": 2.6732, + "step": 3708 + }, + { + "epoch": 0.29933015898636106, + "grad_norm": 0.7813000679016113, + "learning_rate": 0.00018420363207440329, + "loss": 2.6978, + "step": 3709 + }, + { + "epoch": 0.2994108627229441, + "grad_norm": 0.7760851979255676, + "learning_rate": 0.00018419511524362064, + "loss": 2.7466, + "step": 3710 + }, + { + "epoch": 0.29949156645952707, + "grad_norm": 0.7786797881126404, + "learning_rate": 0.00018418659631447564, + "loss": 2.7044, + "step": 3711 + }, + { + "epoch": 0.2995722701961101, + "grad_norm": 0.7860158085823059, + "learning_rate": 0.00018417807528718055, + "loss": 2.6587, + "step": 3712 + }, + { + "epoch": 0.2996529739326931, + "grad_norm": 0.8327339291572571, + "learning_rate": 0.0001841695521619478, + "loss": 2.7112, + "step": 3713 + }, + { + "epoch": 0.2997336776692761, + "grad_norm": 0.7535735368728638, + "learning_rate": 0.00018416102693898982, + "loss": 2.726, + "step": 3714 + }, + { + "epoch": 0.2998143814058591, + "grad_norm": 0.7781090140342712, + "learning_rate": 0.000184152499618519, + "loss": 2.7238, + "step": 3715 + }, + { + "epoch": 0.2998950851424421, + "grad_norm": 0.7700545191764832, + "learning_rate": 0.00018414397020074795, + "loss": 2.7081, + "step": 3716 + }, + { + "epoch": 0.2999757888790251, + "grad_norm": 0.7578303217887878, + "learning_rate": 0.0001841354386858892, + "loss": 2.6591, + "step": 3717 + }, + { + "epoch": 0.30005649261560813, + "grad_norm": 0.7506501078605652, + "learning_rate": 0.00018412690507415538, + "loss": 2.6551, + "step": 3718 + }, + { + "epoch": 0.3001371963521911, + "grad_norm": 0.7869547009468079, + "learning_rate": 0.00018411836936575918, + "loss": 2.7169, + "step": 3719 + }, + { + "epoch": 0.30021790008877414, + "grad_norm": 0.7547428607940674, + "learning_rate": 0.00018410983156091332, + "loss": 2.7498, + "step": 3720 + }, + { + "epoch": 0.3002986038253571, + "grad_norm": 0.7829383015632629, + "learning_rate": 
0.0001841012916598306, + "loss": 2.6885, + "step": 3721 + }, + { + "epoch": 0.30037930756194015, + "grad_norm": 0.8469082117080688, + "learning_rate": 0.00018409274966272386, + "loss": 2.7594, + "step": 3722 + }, + { + "epoch": 0.3004600112985231, + "grad_norm": 0.7690171599388123, + "learning_rate": 0.00018408420556980596, + "loss": 2.7892, + "step": 3723 + }, + { + "epoch": 0.3005407150351061, + "grad_norm": 0.7295899987220764, + "learning_rate": 0.00018407565938128987, + "loss": 2.7023, + "step": 3724 + }, + { + "epoch": 0.30062141877168913, + "grad_norm": 0.7249528169631958, + "learning_rate": 0.00018406711109738856, + "loss": 2.7135, + "step": 3725 + }, + { + "epoch": 0.3007021225082721, + "grad_norm": 0.7237234711647034, + "learning_rate": 0.0001840585607183151, + "loss": 2.6117, + "step": 3726 + }, + { + "epoch": 0.30078282624485514, + "grad_norm": 0.7426557540893555, + "learning_rate": 0.00018405000824428256, + "loss": 2.7202, + "step": 3727 + }, + { + "epoch": 0.3008635299814381, + "grad_norm": 0.7572938799858093, + "learning_rate": 0.00018404145367550414, + "loss": 2.7373, + "step": 3728 + }, + { + "epoch": 0.30094423371802115, + "grad_norm": 0.7198675274848938, + "learning_rate": 0.00018403289701219295, + "loss": 2.6675, + "step": 3729 + }, + { + "epoch": 0.3010249374546041, + "grad_norm": 0.722532331943512, + "learning_rate": 0.00018402433825456235, + "loss": 2.6933, + "step": 3730 + }, + { + "epoch": 0.30110564119118716, + "grad_norm": 0.7621530890464783, + "learning_rate": 0.0001840157774028256, + "loss": 2.6951, + "step": 3731 + }, + { + "epoch": 0.30118634492777013, + "grad_norm": 0.7435615062713623, + "learning_rate": 0.00018400721445719604, + "loss": 2.7323, + "step": 3732 + }, + { + "epoch": 0.30126704866435317, + "grad_norm": 0.7233619689941406, + "learning_rate": 0.00018399864941788708, + "loss": 2.6789, + "step": 3733 + }, + { + "epoch": 0.30134775240093614, + "grad_norm": 0.7421496510505676, + "learning_rate": 0.00018399008228511224, + 
"loss": 2.72, + "step": 3734 + }, + { + "epoch": 0.3014284561375192, + "grad_norm": 0.7250909805297852, + "learning_rate": 0.000183981513059085, + "loss": 2.6717, + "step": 3735 + }, + { + "epoch": 0.30150915987410215, + "grad_norm": 0.7642899751663208, + "learning_rate": 0.0001839729417400189, + "loss": 2.6823, + "step": 3736 + }, + { + "epoch": 0.3015898636106852, + "grad_norm": 0.7434508204460144, + "learning_rate": 0.00018396436832812758, + "loss": 2.6441, + "step": 3737 + }, + { + "epoch": 0.30167056734726816, + "grad_norm": 0.7163311839103699, + "learning_rate": 0.00018395579282362473, + "loss": 2.6736, + "step": 3738 + }, + { + "epoch": 0.3017512710838512, + "grad_norm": 0.6936792731285095, + "learning_rate": 0.00018394721522672404, + "loss": 2.6792, + "step": 3739 + }, + { + "epoch": 0.30183197482043417, + "grad_norm": 0.7791975736618042, + "learning_rate": 0.0001839386355376393, + "loss": 2.653, + "step": 3740 + }, + { + "epoch": 0.3019126785570172, + "grad_norm": 0.7902694940567017, + "learning_rate": 0.00018393005375658437, + "loss": 2.7448, + "step": 3741 + }, + { + "epoch": 0.3019933822936002, + "grad_norm": 0.7405624389648438, + "learning_rate": 0.0001839214698837731, + "loss": 2.6977, + "step": 3742 + }, + { + "epoch": 0.3020740860301832, + "grad_norm": 0.8033632040023804, + "learning_rate": 0.00018391288391941943, + "loss": 2.7468, + "step": 3743 + }, + { + "epoch": 0.3021547897667662, + "grad_norm": 0.8148884177207947, + "learning_rate": 0.00018390429586373735, + "loss": 2.6992, + "step": 3744 + }, + { + "epoch": 0.3022354935033492, + "grad_norm": 0.7633625268936157, + "learning_rate": 0.00018389570571694089, + "loss": 2.6604, + "step": 3745 + }, + { + "epoch": 0.3023161972399322, + "grad_norm": 0.8687180876731873, + "learning_rate": 0.00018388711347924413, + "loss": 2.6808, + "step": 3746 + }, + { + "epoch": 0.3023969009765152, + "grad_norm": 0.6974104046821594, + "learning_rate": 0.0001838785191508612, + "loss": 2.7613, + "step": 3747 + }, + { + 
"epoch": 0.3024776047130982, + "grad_norm": 0.7919288873672485, + "learning_rate": 0.00018386992273200633, + "loss": 2.664, + "step": 3748 + }, + { + "epoch": 0.30255830844968123, + "grad_norm": 0.7708829045295715, + "learning_rate": 0.00018386132422289374, + "loss": 2.7703, + "step": 3749 + }, + { + "epoch": 0.3026390121862642, + "grad_norm": 0.7099813222885132, + "learning_rate": 0.00018385272362373775, + "loss": 2.6485, + "step": 3750 + }, + { + "epoch": 0.30271971592284724, + "grad_norm": 0.7629622220993042, + "learning_rate": 0.0001838441209347527, + "loss": 2.7339, + "step": 3751 + }, + { + "epoch": 0.3028004196594302, + "grad_norm": 0.727275550365448, + "learning_rate": 0.00018383551615615295, + "loss": 2.7194, + "step": 3752 + }, + { + "epoch": 0.30288112339601325, + "grad_norm": 0.7158832550048828, + "learning_rate": 0.00018382690928815302, + "loss": 2.6698, + "step": 3753 + }, + { + "epoch": 0.30296182713259623, + "grad_norm": 0.8075565099716187, + "learning_rate": 0.00018381830033096735, + "loss": 2.7198, + "step": 3754 + }, + { + "epoch": 0.30304253086917926, + "grad_norm": 0.7949094176292419, + "learning_rate": 0.00018380968928481057, + "loss": 2.7048, + "step": 3755 + }, + { + "epoch": 0.30312323460576224, + "grad_norm": 0.7009503841400146, + "learning_rate": 0.00018380107614989724, + "loss": 2.709, + "step": 3756 + }, + { + "epoch": 0.30320393834234527, + "grad_norm": 0.668574869632721, + "learning_rate": 0.00018379246092644204, + "loss": 2.6515, + "step": 3757 + }, + { + "epoch": 0.30328464207892825, + "grad_norm": 0.7470806241035461, + "learning_rate": 0.00018378384361465968, + "loss": 2.7577, + "step": 3758 + }, + { + "epoch": 0.3033653458155113, + "grad_norm": 0.7529913783073425, + "learning_rate": 0.0001837752242147649, + "loss": 2.7189, + "step": 3759 + }, + { + "epoch": 0.30344604955209425, + "grad_norm": 0.7373302578926086, + "learning_rate": 0.00018376660272697258, + "loss": 2.7197, + "step": 3760 + }, + { + "epoch": 0.3035267532886773, + 
"grad_norm": 0.7650466561317444, + "learning_rate": 0.0001837579791514975, + "loss": 2.6613, + "step": 3761 + }, + { + "epoch": 0.30360745702526026, + "grad_norm": 0.775209903717041, + "learning_rate": 0.00018374935348855468, + "loss": 2.6454, + "step": 3762 + }, + { + "epoch": 0.3036881607618433, + "grad_norm": 0.7049290537834167, + "learning_rate": 0.00018374072573835903, + "loss": 2.6663, + "step": 3763 + }, + { + "epoch": 0.30376886449842627, + "grad_norm": 0.7060630917549133, + "learning_rate": 0.0001837320959011256, + "loss": 2.6908, + "step": 3764 + }, + { + "epoch": 0.3038495682350093, + "grad_norm": 0.7561464905738831, + "learning_rate": 0.00018372346397706944, + "loss": 2.673, + "step": 3765 + }, + { + "epoch": 0.3039302719715923, + "grad_norm": 0.7293568849563599, + "learning_rate": 0.0001837148299664057, + "loss": 2.6431, + "step": 3766 + }, + { + "epoch": 0.3040109757081753, + "grad_norm": 0.8460379838943481, + "learning_rate": 0.00018370619386934962, + "loss": 2.7493, + "step": 3767 + }, + { + "epoch": 0.3040916794447583, + "grad_norm": 0.8136082291603088, + "learning_rate": 0.00018369755568611632, + "loss": 2.7298, + "step": 3768 + }, + { + "epoch": 0.3041723831813413, + "grad_norm": 0.6916636824607849, + "learning_rate": 0.00018368891541692116, + "loss": 2.7173, + "step": 3769 + }, + { + "epoch": 0.3042530869179243, + "grad_norm": 0.7547643780708313, + "learning_rate": 0.0001836802730619795, + "loss": 2.6343, + "step": 3770 + }, + { + "epoch": 0.30433379065450733, + "grad_norm": 0.7439205050468445, + "learning_rate": 0.00018367162862150665, + "loss": 2.6627, + "step": 3771 + }, + { + "epoch": 0.3044144943910903, + "grad_norm": 0.7781087756156921, + "learning_rate": 0.0001836629820957181, + "loss": 2.7223, + "step": 3772 + }, + { + "epoch": 0.30449519812767334, + "grad_norm": 0.7876880764961243, + "learning_rate": 0.00018365433348482935, + "loss": 2.7139, + "step": 3773 + }, + { + "epoch": 0.3045759018642563, + "grad_norm": 0.7571346163749695, + 
"learning_rate": 0.00018364568278905595, + "loss": 2.6939, + "step": 3774 + }, + { + "epoch": 0.3046566056008393, + "grad_norm": 0.9011813402175903, + "learning_rate": 0.00018363703000861346, + "loss": 2.7516, + "step": 3775 + }, + { + "epoch": 0.3047373093374223, + "grad_norm": 0.7809761762619019, + "learning_rate": 0.00018362837514371755, + "loss": 2.7587, + "step": 3776 + }, + { + "epoch": 0.3048180130740053, + "grad_norm": 0.7486867308616638, + "learning_rate": 0.00018361971819458393, + "loss": 2.6617, + "step": 3777 + }, + { + "epoch": 0.30489871681058833, + "grad_norm": 0.7434267401695251, + "learning_rate": 0.00018361105916142836, + "loss": 2.7328, + "step": 3778 + }, + { + "epoch": 0.3049794205471713, + "grad_norm": 0.7895822525024414, + "learning_rate": 0.0001836023980444666, + "loss": 2.7038, + "step": 3779 + }, + { + "epoch": 0.30506012428375434, + "grad_norm": 0.7329267263412476, + "learning_rate": 0.00018359373484391458, + "loss": 2.6533, + "step": 3780 + }, + { + "epoch": 0.3051408280203373, + "grad_norm": 0.7578477263450623, + "learning_rate": 0.00018358506955998817, + "loss": 2.723, + "step": 3781 + }, + { + "epoch": 0.30522153175692035, + "grad_norm": 0.7174215316772461, + "learning_rate": 0.0001835764021929033, + "loss": 2.7665, + "step": 3782 + }, + { + "epoch": 0.3053022354935033, + "grad_norm": 0.7261673808097839, + "learning_rate": 0.00018356773274287605, + "loss": 2.7239, + "step": 3783 + }, + { + "epoch": 0.30538293923008636, + "grad_norm": 0.7550768852233887, + "learning_rate": 0.00018355906121012244, + "loss": 2.6952, + "step": 3784 + }, + { + "epoch": 0.30546364296666934, + "grad_norm": 0.7805373668670654, + "learning_rate": 0.0001835503875948586, + "loss": 2.6453, + "step": 3785 + }, + { + "epoch": 0.30554434670325237, + "grad_norm": 0.7753674983978271, + "learning_rate": 0.0001835417118973007, + "loss": 2.7188, + "step": 3786 + }, + { + "epoch": 0.30562505043983534, + "grad_norm": 0.719774603843689, + "learning_rate": 
0.00018353303411766496, + "loss": 2.69, + "step": 3787 + }, + { + "epoch": 0.3057057541764184, + "grad_norm": 0.786780059337616, + "learning_rate": 0.00018352435425616763, + "loss": 2.7015, + "step": 3788 + }, + { + "epoch": 0.30578645791300135, + "grad_norm": 0.7481613159179688, + "learning_rate": 0.00018351567231302508, + "loss": 2.6267, + "step": 3789 + }, + { + "epoch": 0.3058671616495844, + "grad_norm": 0.8138384222984314, + "learning_rate": 0.00018350698828845365, + "loss": 2.7301, + "step": 3790 + }, + { + "epoch": 0.30594786538616736, + "grad_norm": 0.7911081314086914, + "learning_rate": 0.00018349830218266982, + "loss": 2.6661, + "step": 3791 + }, + { + "epoch": 0.3060285691227504, + "grad_norm": 0.763179361820221, + "learning_rate": 0.00018348961399588997, + "loss": 2.6509, + "step": 3792 + }, + { + "epoch": 0.30610927285933337, + "grad_norm": 0.8214982748031616, + "learning_rate": 0.00018348092372833072, + "loss": 2.6951, + "step": 3793 + }, + { + "epoch": 0.3061899765959164, + "grad_norm": 0.7271003127098083, + "learning_rate": 0.00018347223138020865, + "loss": 2.7227, + "step": 3794 + }, + { + "epoch": 0.3062706803324994, + "grad_norm": 0.7727730870246887, + "learning_rate": 0.00018346353695174037, + "loss": 2.721, + "step": 3795 + }, + { + "epoch": 0.3063513840690824, + "grad_norm": 0.844895601272583, + "learning_rate": 0.00018345484044314257, + "loss": 2.6757, + "step": 3796 + }, + { + "epoch": 0.3064320878056654, + "grad_norm": 0.7409898638725281, + "learning_rate": 0.00018344614185463197, + "loss": 2.6798, + "step": 3797 + }, + { + "epoch": 0.3065127915422484, + "grad_norm": 0.8284425139427185, + "learning_rate": 0.00018343744118642542, + "loss": 2.7573, + "step": 3798 + }, + { + "epoch": 0.3065934952788314, + "grad_norm": 0.7535427808761597, + "learning_rate": 0.00018342873843873973, + "loss": 2.7026, + "step": 3799 + }, + { + "epoch": 0.30667419901541443, + "grad_norm": 0.8013898730278015, + "learning_rate": 0.00018342003361179176, + "loss": 
2.7331, + "step": 3800 + }, + { + "epoch": 0.3067549027519974, + "grad_norm": 0.7458386421203613, + "learning_rate": 0.0001834113267057985, + "loss": 2.6976, + "step": 3801 + }, + { + "epoch": 0.30683560648858044, + "grad_norm": 0.8333673477172852, + "learning_rate": 0.00018340261772097695, + "loss": 2.7064, + "step": 3802 + }, + { + "epoch": 0.3069163102251634, + "grad_norm": 0.7273485064506531, + "learning_rate": 0.00018339390665754414, + "loss": 2.6619, + "step": 3803 + }, + { + "epoch": 0.30699701396174645, + "grad_norm": 0.8199014067649841, + "learning_rate": 0.0001833851935157172, + "loss": 2.654, + "step": 3804 + }, + { + "epoch": 0.3070777176983294, + "grad_norm": 0.780197024345398, + "learning_rate": 0.00018337647829571324, + "loss": 2.6814, + "step": 3805 + }, + { + "epoch": 0.30715842143491245, + "grad_norm": 0.7214049100875854, + "learning_rate": 0.0001833677609977495, + "loss": 2.709, + "step": 3806 + }, + { + "epoch": 0.30723912517149543, + "grad_norm": 0.7680457830429077, + "learning_rate": 0.00018335904162204326, + "loss": 2.6628, + "step": 3807 + }, + { + "epoch": 0.30731982890807846, + "grad_norm": 0.760728120803833, + "learning_rate": 0.00018335032016881178, + "loss": 2.7005, + "step": 3808 + }, + { + "epoch": 0.30740053264466144, + "grad_norm": 0.7631687521934509, + "learning_rate": 0.00018334159663827243, + "loss": 2.7012, + "step": 3809 + }, + { + "epoch": 0.30748123638124447, + "grad_norm": 0.7515785694122314, + "learning_rate": 0.00018333287103064266, + "loss": 2.7062, + "step": 3810 + }, + { + "epoch": 0.30756194011782745, + "grad_norm": 0.804500162601471, + "learning_rate": 0.00018332414334613987, + "loss": 2.7888, + "step": 3811 + }, + { + "epoch": 0.3076426438544105, + "grad_norm": 0.7551451325416565, + "learning_rate": 0.00018331541358498164, + "loss": 2.6345, + "step": 3812 + }, + { + "epoch": 0.30772334759099346, + "grad_norm": 0.7342958450317383, + "learning_rate": 0.0001833066817473855, + "loss": 2.6601, + "step": 3813 + }, + { + 
"epoch": 0.3078040513275765, + "grad_norm": 0.8059296607971191, + "learning_rate": 0.0001832979478335691, + "loss": 2.7694, + "step": 3814 + }, + { + "epoch": 0.30788475506415947, + "grad_norm": 0.7037352919578552, + "learning_rate": 0.0001832892118437501, + "loss": 2.6788, + "step": 3815 + }, + { + "epoch": 0.3079654588007425, + "grad_norm": 0.759509801864624, + "learning_rate": 0.0001832804737781462, + "loss": 2.7115, + "step": 3816 + }, + { + "epoch": 0.3080461625373255, + "grad_norm": 0.7911720871925354, + "learning_rate": 0.00018327173363697524, + "loss": 2.6676, + "step": 3817 + }, + { + "epoch": 0.3081268662739085, + "grad_norm": 0.7592991590499878, + "learning_rate": 0.00018326299142045496, + "loss": 2.7245, + "step": 3818 + }, + { + "epoch": 0.3082075700104915, + "grad_norm": 0.7620227932929993, + "learning_rate": 0.00018325424712880333, + "loss": 2.7224, + "step": 3819 + }, + { + "epoch": 0.3082882737470745, + "grad_norm": 0.7834638953208923, + "learning_rate": 0.0001832455007622382, + "loss": 2.7469, + "step": 3820 + }, + { + "epoch": 0.3083689774836575, + "grad_norm": 0.7765992879867554, + "learning_rate": 0.00018323675232097757, + "loss": 2.7193, + "step": 3821 + }, + { + "epoch": 0.3084496812202405, + "grad_norm": 0.7334728837013245, + "learning_rate": 0.00018322800180523949, + "loss": 2.667, + "step": 3822 + }, + { + "epoch": 0.3085303849568235, + "grad_norm": 0.7674607634544373, + "learning_rate": 0.00018321924921524207, + "loss": 2.6479, + "step": 3823 + }, + { + "epoch": 0.30861108869340653, + "grad_norm": 0.7616469860076904, + "learning_rate": 0.0001832104945512034, + "loss": 2.6535, + "step": 3824 + }, + { + "epoch": 0.3086917924299895, + "grad_norm": 0.7693164944648743, + "learning_rate": 0.00018320173781334172, + "loss": 2.7616, + "step": 3825 + }, + { + "epoch": 0.3087724961665725, + "grad_norm": 0.7099221348762512, + "learning_rate": 0.0001831929790018752, + "loss": 2.6729, + "step": 3826 + }, + { + "epoch": 0.3088531999031555, + 
"grad_norm": 0.7389346957206726, + "learning_rate": 0.00018318421811702222, + "loss": 2.6396, + "step": 3827 + }, + { + "epoch": 0.3089339036397385, + "grad_norm": 0.8302628397941589, + "learning_rate": 0.00018317545515900106, + "loss": 2.6786, + "step": 3828 + }, + { + "epoch": 0.3090146073763215, + "grad_norm": 0.7441998720169067, + "learning_rate": 0.00018316669012803015, + "loss": 2.6769, + "step": 3829 + }, + { + "epoch": 0.3090953111129045, + "grad_norm": 0.8454675674438477, + "learning_rate": 0.00018315792302432788, + "loss": 2.7275, + "step": 3830 + }, + { + "epoch": 0.30917601484948753, + "grad_norm": 0.8129739761352539, + "learning_rate": 0.00018314915384811282, + "loss": 2.7603, + "step": 3831 + }, + { + "epoch": 0.3092567185860705, + "grad_norm": 0.7525617480278015, + "learning_rate": 0.00018314038259960349, + "loss": 2.7156, + "step": 3832 + }, + { + "epoch": 0.30933742232265354, + "grad_norm": 0.7319022417068481, + "learning_rate": 0.0001831316092790185, + "loss": 2.676, + "step": 3833 + }, + { + "epoch": 0.3094181260592365, + "grad_norm": 0.7767768502235413, + "learning_rate": 0.00018312283388657646, + "loss": 2.7022, + "step": 3834 + }, + { + "epoch": 0.30949882979581955, + "grad_norm": 0.709293007850647, + "learning_rate": 0.00018311405642249616, + "loss": 2.6241, + "step": 3835 + }, + { + "epoch": 0.30957953353240253, + "grad_norm": 0.715360701084137, + "learning_rate": 0.0001831052768869963, + "loss": 2.6777, + "step": 3836 + }, + { + "epoch": 0.30966023726898556, + "grad_norm": 0.7361319065093994, + "learning_rate": 0.0001830964952802957, + "loss": 2.6539, + "step": 3837 + }, + { + "epoch": 0.30974094100556854, + "grad_norm": 0.7243087291717529, + "learning_rate": 0.0001830877116026132, + "loss": 2.7506, + "step": 3838 + }, + { + "epoch": 0.30982164474215157, + "grad_norm": 0.7361106872558594, + "learning_rate": 0.00018307892585416776, + "loss": 2.697, + "step": 3839 + }, + { + "epoch": 0.30990234847873455, + "grad_norm": 0.7541893720626831, + 
"learning_rate": 0.00018307013803517833, + "loss": 2.694, + "step": 3840 + }, + { + "epoch": 0.3099830522153176, + "grad_norm": 0.7235575914382935, + "learning_rate": 0.00018306134814586388, + "loss": 2.6711, + "step": 3841 + }, + { + "epoch": 0.31006375595190055, + "grad_norm": 0.7868196368217468, + "learning_rate": 0.00018305255618644354, + "loss": 2.7177, + "step": 3842 + }, + { + "epoch": 0.3101444596884836, + "grad_norm": 0.8074443340301514, + "learning_rate": 0.00018304376215713637, + "loss": 2.7293, + "step": 3843 + }, + { + "epoch": 0.31022516342506656, + "grad_norm": 0.6993385553359985, + "learning_rate": 0.00018303496605816158, + "loss": 2.6942, + "step": 3844 + }, + { + "epoch": 0.3103058671616496, + "grad_norm": 0.7272824645042419, + "learning_rate": 0.00018302616788973839, + "loss": 2.7093, + "step": 3845 + }, + { + "epoch": 0.31038657089823257, + "grad_norm": 0.7496963143348694, + "learning_rate": 0.00018301736765208605, + "loss": 2.7096, + "step": 3846 + }, + { + "epoch": 0.3104672746348156, + "grad_norm": 0.7407644987106323, + "learning_rate": 0.00018300856534542387, + "loss": 2.6956, + "step": 3847 + }, + { + "epoch": 0.3105479783713986, + "grad_norm": 0.742382287979126, + "learning_rate": 0.00018299976096997132, + "loss": 2.6744, + "step": 3848 + }, + { + "epoch": 0.3106286821079816, + "grad_norm": 0.7314567565917969, + "learning_rate": 0.0001829909545259477, + "loss": 2.7544, + "step": 3849 + }, + { + "epoch": 0.3107093858445646, + "grad_norm": 0.7550896406173706, + "learning_rate": 0.0001829821460135726, + "loss": 2.714, + "step": 3850 + }, + { + "epoch": 0.3107900895811476, + "grad_norm": 0.7496031522750854, + "learning_rate": 0.00018297333543306548, + "loss": 2.6718, + "step": 3851 + }, + { + "epoch": 0.3108707933177306, + "grad_norm": 0.7600073218345642, + "learning_rate": 0.00018296452278464596, + "loss": 2.7141, + "step": 3852 + }, + { + "epoch": 0.31095149705431363, + "grad_norm": 0.7242388129234314, + "learning_rate": 
0.00018295570806853366, + "loss": 2.7407, + "step": 3853 + }, + { + "epoch": 0.3110322007908966, + "grad_norm": 0.723874568939209, + "learning_rate": 0.00018294689128494824, + "loss": 2.7253, + "step": 3854 + }, + { + "epoch": 0.31111290452747964, + "grad_norm": 0.7902834415435791, + "learning_rate": 0.00018293807243410947, + "loss": 2.7118, + "step": 3855 + }, + { + "epoch": 0.3111936082640626, + "grad_norm": 0.7676794528961182, + "learning_rate": 0.00018292925151623717, + "loss": 2.684, + "step": 3856 + }, + { + "epoch": 0.31127431200064565, + "grad_norm": 0.767431378364563, + "learning_rate": 0.0001829204285315511, + "loss": 2.6936, + "step": 3857 + }, + { + "epoch": 0.3113550157372286, + "grad_norm": 0.7802234888076782, + "learning_rate": 0.00018291160348027122, + "loss": 2.7181, + "step": 3858 + }, + { + "epoch": 0.31143571947381166, + "grad_norm": 0.7823610305786133, + "learning_rate": 0.00018290277636261743, + "loss": 2.7014, + "step": 3859 + }, + { + "epoch": 0.31151642321039463, + "grad_norm": 0.8199869394302368, + "learning_rate": 0.00018289394717880978, + "loss": 2.73, + "step": 3860 + }, + { + "epoch": 0.31159712694697766, + "grad_norm": 0.7725761532783508, + "learning_rate": 0.00018288511592906822, + "loss": 2.6978, + "step": 3861 + }, + { + "epoch": 0.31167783068356064, + "grad_norm": 0.752034068107605, + "learning_rate": 0.00018287628261361296, + "loss": 2.6635, + "step": 3862 + }, + { + "epoch": 0.3117585344201437, + "grad_norm": 0.7961714267730713, + "learning_rate": 0.0001828674472326641, + "loss": 2.7047, + "step": 3863 + }, + { + "epoch": 0.31183923815672665, + "grad_norm": 0.7413069605827332, + "learning_rate": 0.00018285860978644182, + "loss": 2.6872, + "step": 3864 + }, + { + "epoch": 0.3119199418933097, + "grad_norm": 0.8943146467208862, + "learning_rate": 0.00018284977027516636, + "loss": 2.7611, + "step": 3865 + }, + { + "epoch": 0.31200064562989266, + "grad_norm": 0.7663856744766235, + "learning_rate": 0.0001828409286990581, + "loss": 
2.7541, + "step": 3866 + }, + { + "epoch": 0.3120813493664757, + "grad_norm": 0.7557348608970642, + "learning_rate": 0.00018283208505833731, + "loss": 2.6633, + "step": 3867 + }, + { + "epoch": 0.31216205310305867, + "grad_norm": 0.7690094113349915, + "learning_rate": 0.00018282323935322445, + "loss": 2.7117, + "step": 3868 + }, + { + "epoch": 0.3122427568396417, + "grad_norm": 0.8059033751487732, + "learning_rate": 0.00018281439158393997, + "loss": 2.6743, + "step": 3869 + }, + { + "epoch": 0.3123234605762247, + "grad_norm": 0.7877150774002075, + "learning_rate": 0.00018280554175070438, + "loss": 2.6546, + "step": 3870 + }, + { + "epoch": 0.3124041643128077, + "grad_norm": 0.799670934677124, + "learning_rate": 0.0001827966898537382, + "loss": 2.7184, + "step": 3871 + }, + { + "epoch": 0.3124848680493907, + "grad_norm": 0.8353915214538574, + "learning_rate": 0.0001827878358932621, + "loss": 2.7235, + "step": 3872 + }, + { + "epoch": 0.3125655717859737, + "grad_norm": 0.7954776883125305, + "learning_rate": 0.00018277897986949672, + "loss": 2.5992, + "step": 3873 + }, + { + "epoch": 0.3126462755225567, + "grad_norm": 0.7959856986999512, + "learning_rate": 0.00018277012178266277, + "loss": 2.6877, + "step": 3874 + }, + { + "epoch": 0.3127269792591397, + "grad_norm": 0.8220208883285522, + "learning_rate": 0.00018276126163298102, + "loss": 2.6891, + "step": 3875 + }, + { + "epoch": 0.3128076829957227, + "grad_norm": 0.7827965021133423, + "learning_rate": 0.0001827523994206723, + "loss": 2.7271, + "step": 3876 + }, + { + "epoch": 0.3128883867323057, + "grad_norm": 0.764369010925293, + "learning_rate": 0.00018274353514595746, + "loss": 2.6661, + "step": 3877 + }, + { + "epoch": 0.3129690904688887, + "grad_norm": 0.7440944314002991, + "learning_rate": 0.00018273466880905744, + "loss": 2.6621, + "step": 3878 + }, + { + "epoch": 0.3130497942054717, + "grad_norm": 0.8544813394546509, + "learning_rate": 0.00018272580041019319, + "loss": 2.7168, + "step": 3879 + }, + { + 
"epoch": 0.3131304979420547, + "grad_norm": 0.7232592701911926, + "learning_rate": 0.00018271692994958577, + "loss": 2.6666, + "step": 3880 + }, + { + "epoch": 0.3132112016786377, + "grad_norm": 0.750525712966919, + "learning_rate": 0.00018270805742745617, + "loss": 2.6984, + "step": 3881 + }, + { + "epoch": 0.31329190541522073, + "grad_norm": 0.8195550441741943, + "learning_rate": 0.00018269918284402565, + "loss": 2.7183, + "step": 3882 + }, + { + "epoch": 0.3133726091518037, + "grad_norm": 0.7695632576942444, + "learning_rate": 0.0001826903061995153, + "loss": 2.7092, + "step": 3883 + }, + { + "epoch": 0.31345331288838674, + "grad_norm": 0.7631582617759705, + "learning_rate": 0.0001826814274941463, + "loss": 2.7061, + "step": 3884 + }, + { + "epoch": 0.3135340166249697, + "grad_norm": 0.8318471908569336, + "learning_rate": 0.0001826725467281401, + "loss": 2.694, + "step": 3885 + }, + { + "epoch": 0.31361472036155275, + "grad_norm": 0.7313492298126221, + "learning_rate": 0.00018266366390171784, + "loss": 2.6729, + "step": 3886 + }, + { + "epoch": 0.3136954240981357, + "grad_norm": 0.7508631944656372, + "learning_rate": 0.00018265477901510105, + "loss": 2.731, + "step": 3887 + }, + { + "epoch": 0.31377612783471875, + "grad_norm": 0.8106402158737183, + "learning_rate": 0.00018264589206851107, + "loss": 2.7113, + "step": 3888 + }, + { + "epoch": 0.31385683157130173, + "grad_norm": 0.771542489528656, + "learning_rate": 0.00018263700306216945, + "loss": 2.644, + "step": 3889 + }, + { + "epoch": 0.31393753530788476, + "grad_norm": 0.812441885471344, + "learning_rate": 0.00018262811199629768, + "loss": 2.6889, + "step": 3890 + }, + { + "epoch": 0.31401823904446774, + "grad_norm": 0.8231199979782104, + "learning_rate": 0.00018261921887111738, + "loss": 2.6466, + "step": 3891 + }, + { + "epoch": 0.31409894278105077, + "grad_norm": 0.7492454051971436, + "learning_rate": 0.00018261032368685012, + "loss": 2.6693, + "step": 3892 + }, + { + "epoch": 0.31417964651763375, + 
"grad_norm": 0.7651814222335815, + "learning_rate": 0.00018260142644371772, + "loss": 2.6569, + "step": 3893 + }, + { + "epoch": 0.3142603502542168, + "grad_norm": 0.7504465579986572, + "learning_rate": 0.0001825925271419418, + "loss": 2.684, + "step": 3894 + }, + { + "epoch": 0.31434105399079976, + "grad_norm": 0.749650239944458, + "learning_rate": 0.00018258362578174424, + "loss": 2.6482, + "step": 3895 + }, + { + "epoch": 0.3144217577273828, + "grad_norm": 0.8445256352424622, + "learning_rate": 0.00018257472236334686, + "loss": 2.727, + "step": 3896 + }, + { + "epoch": 0.31450246146396577, + "grad_norm": 0.7628257870674133, + "learning_rate": 0.0001825658168869715, + "loss": 2.7314, + "step": 3897 + }, + { + "epoch": 0.3145831652005488, + "grad_norm": 0.7738446593284607, + "learning_rate": 0.00018255690935284019, + "loss": 2.7478, + "step": 3898 + }, + { + "epoch": 0.3146638689371318, + "grad_norm": 0.7578958868980408, + "learning_rate": 0.00018254799976117486, + "loss": 2.6922, + "step": 3899 + }, + { + "epoch": 0.3147445726737148, + "grad_norm": 0.8367362022399902, + "learning_rate": 0.00018253908811219764, + "loss": 2.7347, + "step": 3900 + }, + { + "epoch": 0.3148252764102978, + "grad_norm": 0.7530354857444763, + "learning_rate": 0.00018253017440613057, + "loss": 2.7151, + "step": 3901 + }, + { + "epoch": 0.3149059801468808, + "grad_norm": 0.7168053388595581, + "learning_rate": 0.00018252125864319578, + "loss": 2.7072, + "step": 3902 + }, + { + "epoch": 0.3149866838834638, + "grad_norm": 0.7480056285858154, + "learning_rate": 0.00018251234082361555, + "loss": 2.6489, + "step": 3903 + }, + { + "epoch": 0.3150673876200468, + "grad_norm": 0.8563880324363708, + "learning_rate": 0.0001825034209476121, + "loss": 2.7384, + "step": 3904 + }, + { + "epoch": 0.3151480913566298, + "grad_norm": 0.7959346771240234, + "learning_rate": 0.0001824944990154077, + "loss": 2.631, + "step": 3905 + }, + { + "epoch": 0.31522879509321283, + "grad_norm": 0.7385980486869812, + 
"learning_rate": 0.00018248557502722476, + "loss": 2.7394, + "step": 3906 + }, + { + "epoch": 0.3153094988297958, + "grad_norm": 0.7682650685310364, + "learning_rate": 0.00018247664898328567, + "loss": 2.7327, + "step": 3907 + }, + { + "epoch": 0.31539020256637884, + "grad_norm": 0.7720316648483276, + "learning_rate": 0.0001824677208838129, + "loss": 2.6442, + "step": 3908 + }, + { + "epoch": 0.3154709063029618, + "grad_norm": 0.7927379608154297, + "learning_rate": 0.00018245879072902895, + "loss": 2.7738, + "step": 3909 + }, + { + "epoch": 0.31555161003954485, + "grad_norm": 0.7506012916564941, + "learning_rate": 0.00018244985851915637, + "loss": 2.6825, + "step": 3910 + }, + { + "epoch": 0.3156323137761278, + "grad_norm": 0.6996353268623352, + "learning_rate": 0.00018244092425441781, + "loss": 2.6783, + "step": 3911 + }, + { + "epoch": 0.31571301751271086, + "grad_norm": 0.8039344549179077, + "learning_rate": 0.00018243198793503588, + "loss": 2.7628, + "step": 3912 + }, + { + "epoch": 0.31579372124929384, + "grad_norm": 0.7890963554382324, + "learning_rate": 0.0001824230495612334, + "loss": 2.7512, + "step": 3913 + }, + { + "epoch": 0.31587442498587687, + "grad_norm": 0.7470870614051819, + "learning_rate": 0.00018241410913323301, + "loss": 2.7058, + "step": 3914 + }, + { + "epoch": 0.31595512872245984, + "grad_norm": 0.7056336402893066, + "learning_rate": 0.0001824051666512576, + "loss": 2.6091, + "step": 3915 + }, + { + "epoch": 0.3160358324590429, + "grad_norm": 0.7818490862846375, + "learning_rate": 0.00018239622211553002, + "loss": 2.7509, + "step": 3916 + }, + { + "epoch": 0.31611653619562585, + "grad_norm": 0.7590607404708862, + "learning_rate": 0.0001823872755262732, + "loss": 2.7238, + "step": 3917 + }, + { + "epoch": 0.3161972399322089, + "grad_norm": 0.7157841920852661, + "learning_rate": 0.00018237832688371014, + "loss": 2.6639, + "step": 3918 + }, + { + "epoch": 0.31627794366879186, + "grad_norm": 0.7515804171562195, + "learning_rate": 
0.00018236937618806382, + "loss": 2.6973, + "step": 3919 + }, + { + "epoch": 0.3163586474053749, + "grad_norm": 0.6691949963569641, + "learning_rate": 0.00018236042343955733, + "loss": 2.727, + "step": 3920 + }, + { + "epoch": 0.31643935114195787, + "grad_norm": 0.8122327327728271, + "learning_rate": 0.0001823514686384138, + "loss": 2.7513, + "step": 3921 + }, + { + "epoch": 0.3165200548785409, + "grad_norm": 0.7813653349876404, + "learning_rate": 0.0001823425117848564, + "loss": 2.7037, + "step": 3922 + }, + { + "epoch": 0.3166007586151239, + "grad_norm": 0.6869354844093323, + "learning_rate": 0.00018233355287910834, + "loss": 2.693, + "step": 3923 + }, + { + "epoch": 0.3166814623517069, + "grad_norm": 0.7773037552833557, + "learning_rate": 0.00018232459192139296, + "loss": 2.687, + "step": 3924 + }, + { + "epoch": 0.3167621660882899, + "grad_norm": 0.7644256949424744, + "learning_rate": 0.00018231562891193352, + "loss": 2.6753, + "step": 3925 + }, + { + "epoch": 0.3168428698248729, + "grad_norm": 0.8427005410194397, + "learning_rate": 0.00018230666385095343, + "loss": 2.6641, + "step": 3926 + }, + { + "epoch": 0.3169235735614559, + "grad_norm": 0.7194599509239197, + "learning_rate": 0.0001822976967386761, + "loss": 2.7091, + "step": 3927 + }, + { + "epoch": 0.3170042772980389, + "grad_norm": 0.7710655331611633, + "learning_rate": 0.00018228872757532512, + "loss": 2.6938, + "step": 3928 + }, + { + "epoch": 0.3170849810346219, + "grad_norm": 0.8003759980201721, + "learning_rate": 0.0001822797563611239, + "loss": 2.7019, + "step": 3929 + }, + { + "epoch": 0.3171656847712049, + "grad_norm": 0.7960470914840698, + "learning_rate": 0.00018227078309629606, + "loss": 2.661, + "step": 3930 + }, + { + "epoch": 0.3172463885077879, + "grad_norm": 0.7731126546859741, + "learning_rate": 0.00018226180778106526, + "loss": 2.7023, + "step": 3931 + }, + { + "epoch": 0.3173270922443709, + "grad_norm": 0.7561383843421936, + "learning_rate": 0.00018225283041565515, + "loss": 2.6768, + 
"step": 3932 + }, + { + "epoch": 0.3174077959809539, + "grad_norm": 0.7578409910202026, + "learning_rate": 0.0001822438510002895, + "loss": 2.7145, + "step": 3933 + }, + { + "epoch": 0.3174884997175369, + "grad_norm": 0.7901952862739563, + "learning_rate": 0.00018223486953519214, + "loss": 2.7121, + "step": 3934 + }, + { + "epoch": 0.31756920345411993, + "grad_norm": 0.82305908203125, + "learning_rate": 0.0001822258860205868, + "loss": 2.7553, + "step": 3935 + }, + { + "epoch": 0.3176499071907029, + "grad_norm": 0.748055636882782, + "learning_rate": 0.0001822169004566975, + "loss": 2.7236, + "step": 3936 + }, + { + "epoch": 0.31773061092728594, + "grad_norm": 0.7981358766555786, + "learning_rate": 0.0001822079128437481, + "loss": 2.7444, + "step": 3937 + }, + { + "epoch": 0.3178113146638689, + "grad_norm": 0.7938945889472961, + "learning_rate": 0.0001821989231819626, + "loss": 2.7512, + "step": 3938 + }, + { + "epoch": 0.31789201840045195, + "grad_norm": 0.7250397205352783, + "learning_rate": 0.0001821899314715651, + "loss": 2.6843, + "step": 3939 + }, + { + "epoch": 0.3179727221370349, + "grad_norm": 0.8844723701477051, + "learning_rate": 0.00018218093771277965, + "loss": 2.6295, + "step": 3940 + }, + { + "epoch": 0.31805342587361796, + "grad_norm": 0.7545698881149292, + "learning_rate": 0.0001821719419058304, + "loss": 2.7478, + "step": 3941 + }, + { + "epoch": 0.31813412961020093, + "grad_norm": 0.7254738807678223, + "learning_rate": 0.00018216294405094157, + "loss": 2.665, + "step": 3942 + }, + { + "epoch": 0.31821483334678397, + "grad_norm": 0.7664754390716553, + "learning_rate": 0.00018215394414833737, + "loss": 2.7431, + "step": 3943 + }, + { + "epoch": 0.31829553708336694, + "grad_norm": 0.8250303864479065, + "learning_rate": 0.00018214494219824217, + "loss": 2.6957, + "step": 3944 + }, + { + "epoch": 0.31837624081995, + "grad_norm": 0.7425532341003418, + "learning_rate": 0.00018213593820088026, + "loss": 2.666, + "step": 3945 + }, + { + "epoch": 
0.31845694455653295, + "grad_norm": 0.6943121552467346, + "learning_rate": 0.00018212693215647604, + "loss": 2.716, + "step": 3946 + }, + { + "epoch": 0.318537648293116, + "grad_norm": 0.732829213142395, + "learning_rate": 0.00018211792406525403, + "loss": 2.6557, + "step": 3947 + }, + { + "epoch": 0.31861835202969896, + "grad_norm": 0.7666537165641785, + "learning_rate": 0.00018210891392743866, + "loss": 2.7275, + "step": 3948 + }, + { + "epoch": 0.318699055766282, + "grad_norm": 0.7652621865272522, + "learning_rate": 0.00018209990174325455, + "loss": 2.6372, + "step": 3949 + }, + { + "epoch": 0.31877975950286497, + "grad_norm": 0.7416055202484131, + "learning_rate": 0.00018209088751292626, + "loss": 2.6688, + "step": 3950 + }, + { + "epoch": 0.318860463239448, + "grad_norm": 0.7504609227180481, + "learning_rate": 0.00018208187123667848, + "loss": 2.6912, + "step": 3951 + }, + { + "epoch": 0.318941166976031, + "grad_norm": 0.7308809757232666, + "learning_rate": 0.00018207285291473588, + "loss": 2.7272, + "step": 3952 + }, + { + "epoch": 0.319021870712614, + "grad_norm": 0.8031618595123291, + "learning_rate": 0.00018206383254732326, + "loss": 2.7354, + "step": 3953 + }, + { + "epoch": 0.319102574449197, + "grad_norm": 0.81386798620224, + "learning_rate": 0.00018205481013466542, + "loss": 2.676, + "step": 3954 + }, + { + "epoch": 0.31918327818578, + "grad_norm": 0.7845911383628845, + "learning_rate": 0.0001820457856769872, + "loss": 2.7094, + "step": 3955 + }, + { + "epoch": 0.319263981922363, + "grad_norm": 0.7189298272132874, + "learning_rate": 0.00018203675917451357, + "loss": 2.6764, + "step": 3956 + }, + { + "epoch": 0.319344685658946, + "grad_norm": 0.8253228664398193, + "learning_rate": 0.00018202773062746944, + "loss": 2.6805, + "step": 3957 + }, + { + "epoch": 0.319425389395529, + "grad_norm": 0.7965289950370789, + "learning_rate": 0.0001820187000360798, + "loss": 2.7148, + "step": 3958 + }, + { + "epoch": 0.31950609313211203, + "grad_norm": 
0.7505398988723755, + "learning_rate": 0.0001820096674005698, + "loss": 2.6732, + "step": 3959 + }, + { + "epoch": 0.319586796868695, + "grad_norm": 0.7554877400398254, + "learning_rate": 0.0001820006327211645, + "loss": 2.7467, + "step": 3960 + }, + { + "epoch": 0.31966750060527804, + "grad_norm": 0.7836194038391113, + "learning_rate": 0.00018199159599808907, + "loss": 2.7252, + "step": 3961 + }, + { + "epoch": 0.319748204341861, + "grad_norm": 0.7967261672019958, + "learning_rate": 0.00018198255723156877, + "loss": 2.6814, + "step": 3962 + }, + { + "epoch": 0.31982890807844405, + "grad_norm": 0.7411713600158691, + "learning_rate": 0.00018197351642182882, + "loss": 2.6928, + "step": 3963 + }, + { + "epoch": 0.31990961181502703, + "grad_norm": 0.6961422562599182, + "learning_rate": 0.00018196447356909454, + "loss": 2.6651, + "step": 3964 + }, + { + "epoch": 0.31999031555161006, + "grad_norm": 0.7245771884918213, + "learning_rate": 0.00018195542867359134, + "loss": 2.6726, + "step": 3965 + }, + { + "epoch": 0.32007101928819304, + "grad_norm": 0.784654974937439, + "learning_rate": 0.00018194638173554462, + "loss": 2.6829, + "step": 3966 + }, + { + "epoch": 0.32015172302477607, + "grad_norm": 0.7373329997062683, + "learning_rate": 0.00018193733275517985, + "loss": 2.6481, + "step": 3967 + }, + { + "epoch": 0.32023242676135905, + "grad_norm": 0.7878682613372803, + "learning_rate": 0.00018192828173272258, + "loss": 2.6701, + "step": 3968 + }, + { + "epoch": 0.3203131304979421, + "grad_norm": 0.759676992893219, + "learning_rate": 0.00018191922866839835, + "loss": 2.7218, + "step": 3969 + }, + { + "epoch": 0.32039383423452505, + "grad_norm": 0.7923088669776917, + "learning_rate": 0.00018191017356243282, + "loss": 2.6841, + "step": 3970 + }, + { + "epoch": 0.3204745379711081, + "grad_norm": 0.7084882855415344, + "learning_rate": 0.00018190111641505164, + "loss": 2.7167, + "step": 3971 + }, + { + "epoch": 0.32055524170769106, + "grad_norm": 0.7166235446929932, + 
"learning_rate": 0.00018189205722648054, + "loss": 2.6647, + "step": 3972 + }, + { + "epoch": 0.3206359454442741, + "grad_norm": 0.7997722029685974, + "learning_rate": 0.0001818829959969453, + "loss": 2.7199, + "step": 3973 + }, + { + "epoch": 0.32071664918085707, + "grad_norm": 0.8309516310691833, + "learning_rate": 0.0001818739327266718, + "loss": 2.8006, + "step": 3974 + }, + { + "epoch": 0.3207973529174401, + "grad_norm": 0.7164002656936646, + "learning_rate": 0.00018186486741588582, + "loss": 2.6258, + "step": 3975 + }, + { + "epoch": 0.3208780566540231, + "grad_norm": 0.7715865969657898, + "learning_rate": 0.0001818558000648134, + "loss": 2.7034, + "step": 3976 + }, + { + "epoch": 0.3209587603906061, + "grad_norm": 0.7806593775749207, + "learning_rate": 0.0001818467306736804, + "loss": 2.6758, + "step": 3977 + }, + { + "epoch": 0.3210394641271891, + "grad_norm": 0.8026594519615173, + "learning_rate": 0.00018183765924271298, + "loss": 2.6976, + "step": 3978 + }, + { + "epoch": 0.32112016786377207, + "grad_norm": 0.7971245050430298, + "learning_rate": 0.00018182858577213716, + "loss": 2.7312, + "step": 3979 + }, + { + "epoch": 0.3212008716003551, + "grad_norm": 0.7347297072410583, + "learning_rate": 0.00018181951026217908, + "loss": 2.6664, + "step": 3980 + }, + { + "epoch": 0.3212815753369381, + "grad_norm": 0.7929779291152954, + "learning_rate": 0.0001818104327130649, + "loss": 2.6603, + "step": 3981 + }, + { + "epoch": 0.3213622790735211, + "grad_norm": 0.7465224862098694, + "learning_rate": 0.00018180135312502089, + "loss": 2.6566, + "step": 3982 + }, + { + "epoch": 0.3214429828101041, + "grad_norm": 0.7114695906639099, + "learning_rate": 0.00018179227149827334, + "loss": 2.6492, + "step": 3983 + }, + { + "epoch": 0.3215236865466871, + "grad_norm": 0.7179337739944458, + "learning_rate": 0.00018178318783304857, + "loss": 2.6778, + "step": 3984 + }, + { + "epoch": 0.3216043902832701, + "grad_norm": 0.7182629704475403, + "learning_rate": 0.000181774102129573, 
+ "loss": 2.7057, + "step": 3985 + }, + { + "epoch": 0.3216850940198531, + "grad_norm": 0.7383119463920593, + "learning_rate": 0.000181765014388073, + "loss": 2.6633, + "step": 3986 + }, + { + "epoch": 0.3217657977564361, + "grad_norm": 0.7340527176856995, + "learning_rate": 0.00018175592460877512, + "loss": 2.6838, + "step": 3987 + }, + { + "epoch": 0.32184650149301913, + "grad_norm": 0.7934359312057495, + "learning_rate": 0.00018174683279190593, + "loss": 2.6795, + "step": 3988 + }, + { + "epoch": 0.3219272052296021, + "grad_norm": 0.6960840821266174, + "learning_rate": 0.00018173773893769192, + "loss": 2.6669, + "step": 3989 + }, + { + "epoch": 0.32200790896618514, + "grad_norm": 0.7513574361801147, + "learning_rate": 0.00018172864304635985, + "loss": 2.6744, + "step": 3990 + }, + { + "epoch": 0.3220886127027681, + "grad_norm": 0.7516636848449707, + "learning_rate": 0.00018171954511813629, + "loss": 2.6652, + "step": 3991 + }, + { + "epoch": 0.32216931643935115, + "grad_norm": 0.7817716002464294, + "learning_rate": 0.00018171044515324808, + "loss": 2.6671, + "step": 3992 + }, + { + "epoch": 0.3222500201759341, + "grad_norm": 0.6859925389289856, + "learning_rate": 0.000181701343151922, + "loss": 2.6984, + "step": 3993 + }, + { + "epoch": 0.32233072391251716, + "grad_norm": 0.7669627666473389, + "learning_rate": 0.00018169223911438485, + "loss": 2.7102, + "step": 3994 + }, + { + "epoch": 0.32241142764910014, + "grad_norm": 0.784724235534668, + "learning_rate": 0.00018168313304086357, + "loss": 2.7413, + "step": 3995 + }, + { + "epoch": 0.32249213138568317, + "grad_norm": 0.7341497540473938, + "learning_rate": 0.00018167402493158509, + "loss": 2.706, + "step": 3996 + }, + { + "epoch": 0.32257283512226614, + "grad_norm": 0.7975730299949646, + "learning_rate": 0.00018166491478677641, + "loss": 2.6896, + "step": 3997 + }, + { + "epoch": 0.3226535388588492, + "grad_norm": 0.8138537406921387, + "learning_rate": 0.00018165580260666458, + "loss": 2.6986, + "step": 3998 + 
}, + { + "epoch": 0.32273424259543215, + "grad_norm": 0.6734997034072876, + "learning_rate": 0.0001816466883914767, + "loss": 2.6686, + "step": 3999 + }, + { + "epoch": 0.3228149463320152, + "grad_norm": 0.7742779850959778, + "learning_rate": 0.00018163757214143992, + "loss": 2.7222, + "step": 4000 + }, + { + "epoch": 0.3228149463320152, + "eval_loss": 2.615234375, + "eval_runtime": 783.0394, + "eval_samples_per_second": 3.346, + "eval_steps_per_second": 0.558, + "step": 4000 + }, + { + "epoch": 0.32289565006859816, + "grad_norm": 0.7654715180397034, + "learning_rate": 0.00018162845385678145, + "loss": 2.7016, + "step": 4001 + }, + { + "epoch": 0.3229763538051812, + "grad_norm": 0.8698763251304626, + "learning_rate": 0.0001816193335377285, + "loss": 2.6709, + "step": 4002 + }, + { + "epoch": 0.32305705754176417, + "grad_norm": 0.758056640625, + "learning_rate": 0.00018161021118450843, + "loss": 2.7277, + "step": 4003 + }, + { + "epoch": 0.3231377612783472, + "grad_norm": 0.7462654113769531, + "learning_rate": 0.00018160108679734856, + "loss": 2.623, + "step": 4004 + }, + { + "epoch": 0.3232184650149302, + "grad_norm": 0.7274953722953796, + "learning_rate": 0.00018159196037647628, + "loss": 2.6875, + "step": 4005 + }, + { + "epoch": 0.3232991687515132, + "grad_norm": 0.7737346887588501, + "learning_rate": 0.0001815828319221191, + "loss": 2.6967, + "step": 4006 + }, + { + "epoch": 0.3233798724880962, + "grad_norm": 0.7793172001838684, + "learning_rate": 0.00018157370143450448, + "loss": 2.724, + "step": 4007 + }, + { + "epoch": 0.3234605762246792, + "grad_norm": 0.7791805863380432, + "learning_rate": 0.00018156456891385995, + "loss": 2.6653, + "step": 4008 + }, + { + "epoch": 0.3235412799612622, + "grad_norm": 0.7225624918937683, + "learning_rate": 0.0001815554343604132, + "loss": 2.745, + "step": 4009 + }, + { + "epoch": 0.32362198369784523, + "grad_norm": 0.6958494782447815, + "learning_rate": 0.0001815462977743918, + "loss": 2.6856, + "step": 4010 + }, + { + 
"epoch": 0.3237026874344282, + "grad_norm": 0.7572030425071716, + "learning_rate": 0.0001815371591560235, + "loss": 2.7053, + "step": 4011 + }, + { + "epoch": 0.32378339117101124, + "grad_norm": 0.7133952975273132, + "learning_rate": 0.00018152801850553605, + "loss": 2.6984, + "step": 4012 + }, + { + "epoch": 0.3238640949075942, + "grad_norm": 0.7598705291748047, + "learning_rate": 0.00018151887582315728, + "loss": 2.6632, + "step": 4013 + }, + { + "epoch": 0.32394479864417725, + "grad_norm": 0.7670698165893555, + "learning_rate": 0.00018150973110911503, + "loss": 2.7035, + "step": 4014 + }, + { + "epoch": 0.3240255023807602, + "grad_norm": 0.7547060251235962, + "learning_rate": 0.00018150058436363723, + "loss": 2.6531, + "step": 4015 + }, + { + "epoch": 0.32410620611734325, + "grad_norm": 0.7943035364151001, + "learning_rate": 0.00018149143558695178, + "loss": 2.766, + "step": 4016 + }, + { + "epoch": 0.32418690985392623, + "grad_norm": 0.864356517791748, + "learning_rate": 0.00018148228477928675, + "loss": 2.7134, + "step": 4017 + }, + { + "epoch": 0.32426761359050926, + "grad_norm": 0.7773902416229248, + "learning_rate": 0.00018147313194087018, + "loss": 2.6948, + "step": 4018 + }, + { + "epoch": 0.32434831732709224, + "grad_norm": 0.839131772518158, + "learning_rate": 0.0001814639770719302, + "loss": 2.7393, + "step": 4019 + }, + { + "epoch": 0.32442902106367527, + "grad_norm": 0.807837963104248, + "learning_rate": 0.00018145482017269498, + "loss": 2.7835, + "step": 4020 + }, + { + "epoch": 0.32450972480025825, + "grad_norm": 0.7133228182792664, + "learning_rate": 0.00018144566124339272, + "loss": 2.6859, + "step": 4021 + }, + { + "epoch": 0.3245904285368413, + "grad_norm": 0.8450621962547302, + "learning_rate": 0.00018143650028425162, + "loss": 2.7548, + "step": 4022 + }, + { + "epoch": 0.32467113227342426, + "grad_norm": 0.8594980835914612, + "learning_rate": 0.00018142733729550013, + "loss": 2.6636, + "step": 4023 + }, + { + "epoch": 0.3247518360100073, + 
"grad_norm": 0.7134621739387512, + "learning_rate": 0.0001814181722773665, + "loss": 2.6501, + "step": 4024 + }, + { + "epoch": 0.32483253974659027, + "grad_norm": 0.8630430698394775, + "learning_rate": 0.0001814090052300792, + "loss": 2.6994, + "step": 4025 + }, + { + "epoch": 0.3249132434831733, + "grad_norm": 0.7044873237609863, + "learning_rate": 0.00018139983615386666, + "loss": 2.6603, + "step": 4026 + }, + { + "epoch": 0.3249939472197563, + "grad_norm": 0.6896052360534668, + "learning_rate": 0.00018139066504895744, + "loss": 2.6649, + "step": 4027 + }, + { + "epoch": 0.3250746509563393, + "grad_norm": 0.802855372428894, + "learning_rate": 0.00018138149191558012, + "loss": 2.7067, + "step": 4028 + }, + { + "epoch": 0.3251553546929223, + "grad_norm": 0.7555437088012695, + "learning_rate": 0.00018137231675396324, + "loss": 2.6471, + "step": 4029 + }, + { + "epoch": 0.32523605842950526, + "grad_norm": 0.6846967339515686, + "learning_rate": 0.00018136313956433552, + "loss": 2.6774, + "step": 4030 + }, + { + "epoch": 0.3253167621660883, + "grad_norm": 0.7435858249664307, + "learning_rate": 0.0001813539603469257, + "loss": 2.7135, + "step": 4031 + }, + { + "epoch": 0.32539746590267127, + "grad_norm": 0.7669098377227783, + "learning_rate": 0.00018134477910196253, + "loss": 2.7014, + "step": 4032 + }, + { + "epoch": 0.3254781696392543, + "grad_norm": 0.7797521352767944, + "learning_rate": 0.00018133559582967482, + "loss": 2.7229, + "step": 4033 + }, + { + "epoch": 0.3255588733758373, + "grad_norm": 0.7377886176109314, + "learning_rate": 0.00018132641053029142, + "loss": 2.7196, + "step": 4034 + }, + { + "epoch": 0.3256395771124203, + "grad_norm": 0.7387986779212952, + "learning_rate": 0.0001813172232040413, + "loss": 2.687, + "step": 4035 + }, + { + "epoch": 0.3257202808490033, + "grad_norm": 0.7276624441146851, + "learning_rate": 0.0001813080338511534, + "loss": 2.6954, + "step": 4036 + }, + { + "epoch": 0.3258009845855863, + "grad_norm": 0.7929670214653015, + 
"learning_rate": 0.00018129884247185683, + "loss": 2.7431, + "step": 4037 + }, + { + "epoch": 0.3258816883221693, + "grad_norm": 0.7896441221237183, + "learning_rate": 0.0001812896490663805, + "loss": 2.6823, + "step": 4038 + }, + { + "epoch": 0.3259623920587523, + "grad_norm": 0.8642957210540771, + "learning_rate": 0.00018128045363495368, + "loss": 2.7334, + "step": 4039 + }, + { + "epoch": 0.3260430957953353, + "grad_norm": 0.7156081795692444, + "learning_rate": 0.00018127125617780542, + "loss": 2.6886, + "step": 4040 + }, + { + "epoch": 0.32612379953191833, + "grad_norm": 0.8260853290557861, + "learning_rate": 0.00018126205669516507, + "loss": 2.6802, + "step": 4041 + }, + { + "epoch": 0.3262045032685013, + "grad_norm": 0.6853542327880859, + "learning_rate": 0.00018125285518726182, + "loss": 2.6392, + "step": 4042 + }, + { + "epoch": 0.32628520700508434, + "grad_norm": 0.7574017643928528, + "learning_rate": 0.00018124365165432505, + "loss": 2.7412, + "step": 4043 + }, + { + "epoch": 0.3263659107416673, + "grad_norm": 0.8656191825866699, + "learning_rate": 0.00018123444609658408, + "loss": 2.6903, + "step": 4044 + }, + { + "epoch": 0.32644661447825035, + "grad_norm": 0.7443257570266724, + "learning_rate": 0.00018122523851426837, + "loss": 2.682, + "step": 4045 + }, + { + "epoch": 0.32652731821483333, + "grad_norm": 0.7222229242324829, + "learning_rate": 0.0001812160289076074, + "loss": 2.6196, + "step": 4046 + }, + { + "epoch": 0.32660802195141636, + "grad_norm": 0.8531985878944397, + "learning_rate": 0.00018120681727683066, + "loss": 2.6777, + "step": 4047 + }, + { + "epoch": 0.32668872568799934, + "grad_norm": 0.7380290627479553, + "learning_rate": 0.0001811976036221678, + "loss": 2.6847, + "step": 4048 + }, + { + "epoch": 0.32676942942458237, + "grad_norm": 0.7250707149505615, + "learning_rate": 0.00018118838794384837, + "loss": 2.6846, + "step": 4049 + }, + { + "epoch": 0.32685013316116535, + "grad_norm": 0.763504147529602, + "learning_rate": 
0.00018117917024210208, + "loss": 2.69, + "step": 4050 + }, + { + "epoch": 0.3269308368977484, + "grad_norm": 0.7740737795829773, + "learning_rate": 0.00018116995051715867, + "loss": 2.6945, + "step": 4051 + }, + { + "epoch": 0.32701154063433135, + "grad_norm": 0.7777624726295471, + "learning_rate": 0.00018116072876924792, + "loss": 2.6918, + "step": 4052 + }, + { + "epoch": 0.3270922443709144, + "grad_norm": 0.7957910895347595, + "learning_rate": 0.0001811515049985997, + "loss": 2.7237, + "step": 4053 + }, + { + "epoch": 0.32717294810749736, + "grad_norm": 0.7828991413116455, + "learning_rate": 0.00018114227920544375, + "loss": 2.7008, + "step": 4054 + }, + { + "epoch": 0.3272536518440804, + "grad_norm": 0.6695161461830139, + "learning_rate": 0.00018113305139001016, + "loss": 2.7311, + "step": 4055 + }, + { + "epoch": 0.32733435558066337, + "grad_norm": 0.7693436145782471, + "learning_rate": 0.00018112382155252883, + "loss": 2.7102, + "step": 4056 + }, + { + "epoch": 0.3274150593172464, + "grad_norm": 0.7520042657852173, + "learning_rate": 0.0001811145896932298, + "loss": 2.6455, + "step": 4057 + }, + { + "epoch": 0.3274957630538294, + "grad_norm": 0.786834716796875, + "learning_rate": 0.00018110535581234317, + "loss": 2.6965, + "step": 4058 + }, + { + "epoch": 0.3275764667904124, + "grad_norm": 0.742001473903656, + "learning_rate": 0.00018109611991009905, + "loss": 2.7341, + "step": 4059 + }, + { + "epoch": 0.3276571705269954, + "grad_norm": 0.813522219657898, + "learning_rate": 0.00018108688198672766, + "loss": 2.8116, + "step": 4060 + }, + { + "epoch": 0.3277378742635784, + "grad_norm": 0.7611314058303833, + "learning_rate": 0.00018107764204245916, + "loss": 2.6741, + "step": 4061 + }, + { + "epoch": 0.3278185780001614, + "grad_norm": 0.7285993695259094, + "learning_rate": 0.00018106840007752392, + "loss": 2.671, + "step": 4062 + }, + { + "epoch": 0.32789928173674443, + "grad_norm": 0.773151695728302, + "learning_rate": 0.0001810591560921522, + "loss": 2.7106, 
+ "step": 4063 + }, + { + "epoch": 0.3279799854733274, + "grad_norm": 0.7448920011520386, + "learning_rate": 0.00018104991008657445, + "loss": 2.7176, + "step": 4064 + }, + { + "epoch": 0.32806068920991044, + "grad_norm": 0.7088467478752136, + "learning_rate": 0.0001810406620610211, + "loss": 2.7085, + "step": 4065 + }, + { + "epoch": 0.3281413929464934, + "grad_norm": 0.7507789731025696, + "learning_rate": 0.00018103141201572255, + "loss": 2.7361, + "step": 4066 + }, + { + "epoch": 0.32822209668307645, + "grad_norm": 0.7065643072128296, + "learning_rate": 0.00018102215995090943, + "loss": 2.6573, + "step": 4067 + }, + { + "epoch": 0.3283028004196594, + "grad_norm": 0.6888713836669922, + "learning_rate": 0.0001810129058668123, + "loss": 2.6699, + "step": 4068 + }, + { + "epoch": 0.32838350415624246, + "grad_norm": 0.736347496509552, + "learning_rate": 0.00018100364976366174, + "loss": 2.7089, + "step": 4069 + }, + { + "epoch": 0.32846420789282543, + "grad_norm": 0.6854562759399414, + "learning_rate": 0.0001809943916416885, + "loss": 2.7051, + "step": 4070 + }, + { + "epoch": 0.32854491162940846, + "grad_norm": 0.7481048107147217, + "learning_rate": 0.0001809851315011233, + "loss": 2.7428, + "step": 4071 + }, + { + "epoch": 0.32862561536599144, + "grad_norm": 0.7600961923599243, + "learning_rate": 0.0001809758693421969, + "loss": 2.7153, + "step": 4072 + }, + { + "epoch": 0.3287063191025745, + "grad_norm": 0.7545063495635986, + "learning_rate": 0.00018096660516514024, + "loss": 2.6736, + "step": 4073 + }, + { + "epoch": 0.32878702283915745, + "grad_norm": 0.7967175841331482, + "learning_rate": 0.0001809573389701841, + "loss": 2.6711, + "step": 4074 + }, + { + "epoch": 0.3288677265757405, + "grad_norm": 0.7115446925163269, + "learning_rate": 0.00018094807075755943, + "loss": 2.6761, + "step": 4075 + }, + { + "epoch": 0.32894843031232346, + "grad_norm": 0.8230876326560974, + "learning_rate": 0.00018093880052749725, + "loss": 2.6749, + "step": 4076 + }, + { + "epoch": 
0.3290291340489065, + "grad_norm": 0.8549706935882568, + "learning_rate": 0.00018092952828022856, + "loss": 2.7084, + "step": 4077 + }, + { + "epoch": 0.32910983778548947, + "grad_norm": 0.7379534244537354, + "learning_rate": 0.00018092025401598448, + "loss": 2.7241, + "step": 4078 + }, + { + "epoch": 0.3291905415220725, + "grad_norm": 0.7659998536109924, + "learning_rate": 0.00018091097773499616, + "loss": 2.7108, + "step": 4079 + }, + { + "epoch": 0.3292712452586555, + "grad_norm": 0.8074536323547363, + "learning_rate": 0.00018090169943749476, + "loss": 2.676, + "step": 4080 + }, + { + "epoch": 0.32935194899523845, + "grad_norm": 0.7588536143302917, + "learning_rate": 0.00018089241912371153, + "loss": 2.639, + "step": 4081 + }, + { + "epoch": 0.3294326527318215, + "grad_norm": 0.7510811686515808, + "learning_rate": 0.00018088313679387775, + "loss": 2.6722, + "step": 4082 + }, + { + "epoch": 0.32951335646840446, + "grad_norm": 0.7538900971412659, + "learning_rate": 0.0001808738524482248, + "loss": 2.6917, + "step": 4083 + }, + { + "epoch": 0.3295940602049875, + "grad_norm": 0.8071155548095703, + "learning_rate": 0.00018086456608698402, + "loss": 2.6964, + "step": 4084 + }, + { + "epoch": 0.32967476394157047, + "grad_norm": 0.7778098583221436, + "learning_rate": 0.00018085527771038686, + "loss": 2.7301, + "step": 4085 + }, + { + "epoch": 0.3297554676781535, + "grad_norm": 0.7717564702033997, + "learning_rate": 0.00018084598731866485, + "loss": 2.7484, + "step": 4086 + }, + { + "epoch": 0.3298361714147365, + "grad_norm": 0.7361736297607422, + "learning_rate": 0.00018083669491204948, + "loss": 2.6299, + "step": 4087 + }, + { + "epoch": 0.3299168751513195, + "grad_norm": 0.736681342124939, + "learning_rate": 0.00018082740049077238, + "loss": 2.7521, + "step": 4088 + }, + { + "epoch": 0.3299975788879025, + "grad_norm": 0.8011857867240906, + "learning_rate": 0.00018081810405506517, + "loss": 2.724, + "step": 4089 + }, + { + "epoch": 0.3300782826244855, + "grad_norm": 
0.7741932272911072, + "learning_rate": 0.00018080880560515956, + "loss": 2.6766, + "step": 4090 + }, + { + "epoch": 0.3301589863610685, + "grad_norm": 0.7321778535842896, + "learning_rate": 0.00018079950514128724, + "loss": 2.6614, + "step": 4091 + }, + { + "epoch": 0.33023969009765153, + "grad_norm": 0.7916514277458191, + "learning_rate": 0.00018079020266368006, + "loss": 2.7177, + "step": 4092 + }, + { + "epoch": 0.3303203938342345, + "grad_norm": 0.7961388826370239, + "learning_rate": 0.00018078089817256986, + "loss": 2.6671, + "step": 4093 + }, + { + "epoch": 0.33040109757081754, + "grad_norm": 0.7167038321495056, + "learning_rate": 0.0001807715916681885, + "loss": 2.6989, + "step": 4094 + }, + { + "epoch": 0.3304818013074005, + "grad_norm": 0.6924864649772644, + "learning_rate": 0.00018076228315076794, + "loss": 2.6484, + "step": 4095 + }, + { + "epoch": 0.33056250504398355, + "grad_norm": 0.777881383895874, + "learning_rate": 0.00018075297262054013, + "loss": 2.6498, + "step": 4096 + }, + { + "epoch": 0.3306432087805665, + "grad_norm": 0.7878376841545105, + "learning_rate": 0.0001807436600777372, + "loss": 2.7745, + "step": 4097 + }, + { + "epoch": 0.33072391251714955, + "grad_norm": 0.8418465256690979, + "learning_rate": 0.0001807343455225912, + "loss": 2.7195, + "step": 4098 + }, + { + "epoch": 0.33080461625373253, + "grad_norm": 0.7780830264091492, + "learning_rate": 0.00018072502895533424, + "loss": 2.6652, + "step": 4099 + }, + { + "epoch": 0.33088531999031556, + "grad_norm": 0.7102445960044861, + "learning_rate": 0.00018071571037619853, + "loss": 2.6618, + "step": 4100 + }, + { + "epoch": 0.33096602372689854, + "grad_norm": 0.7028098106384277, + "learning_rate": 0.00018070638978541633, + "loss": 2.7114, + "step": 4101 + }, + { + "epoch": 0.33104672746348157, + "grad_norm": 0.7529525756835938, + "learning_rate": 0.00018069706718321996, + "loss": 2.7231, + "step": 4102 + }, + { + "epoch": 0.33112743120006455, + "grad_norm": 0.7404564023017883, + 
"learning_rate": 0.0001806877425698417, + "loss": 2.6564, + "step": 4103 + }, + { + "epoch": 0.3312081349366476, + "grad_norm": 0.7725130319595337, + "learning_rate": 0.00018067841594551401, + "loss": 2.677, + "step": 4104 + }, + { + "epoch": 0.33128883867323056, + "grad_norm": 0.7616425156593323, + "learning_rate": 0.00018066908731046927, + "loss": 2.6586, + "step": 4105 + }, + { + "epoch": 0.3313695424098136, + "grad_norm": 0.7318183779716492, + "learning_rate": 0.00018065975666494002, + "loss": 2.6624, + "step": 4106 + }, + { + "epoch": 0.33145024614639657, + "grad_norm": 0.7012802958488464, + "learning_rate": 0.00018065042400915878, + "loss": 2.6663, + "step": 4107 + }, + { + "epoch": 0.3315309498829796, + "grad_norm": 0.815226674079895, + "learning_rate": 0.00018064108934335814, + "loss": 2.7248, + "step": 4108 + }, + { + "epoch": 0.3316116536195626, + "grad_norm": 0.68972247838974, + "learning_rate": 0.00018063175266777077, + "loss": 2.6961, + "step": 4109 + }, + { + "epoch": 0.3316923573561456, + "grad_norm": 0.7563794255256653, + "learning_rate": 0.00018062241398262937, + "loss": 2.6526, + "step": 4110 + }, + { + "epoch": 0.3317730610927286, + "grad_norm": 0.7878836989402771, + "learning_rate": 0.00018061307328816662, + "loss": 2.7316, + "step": 4111 + }, + { + "epoch": 0.3318537648293116, + "grad_norm": 0.7189129590988159, + "learning_rate": 0.00018060373058461537, + "loss": 2.6577, + "step": 4112 + }, + { + "epoch": 0.3319344685658946, + "grad_norm": 0.7517561912536621, + "learning_rate": 0.00018059438587220847, + "loss": 2.668, + "step": 4113 + }, + { + "epoch": 0.3320151723024776, + "grad_norm": 0.7602595686912537, + "learning_rate": 0.00018058503915117878, + "loss": 2.6741, + "step": 4114 + }, + { + "epoch": 0.3320958760390606, + "grad_norm": 0.7702187299728394, + "learning_rate": 0.00018057569042175927, + "loss": 2.7082, + "step": 4115 + }, + { + "epoch": 0.33217657977564363, + "grad_norm": 0.7289660573005676, + "learning_rate": 
0.00018056633968418294, + "loss": 2.6728, + "step": 4116 + }, + { + "epoch": 0.3322572835122266, + "grad_norm": 0.6936683654785156, + "learning_rate": 0.0001805569869386828, + "loss": 2.6735, + "step": 4117 + }, + { + "epoch": 0.33233798724880964, + "grad_norm": 0.7128138542175293, + "learning_rate": 0.000180547632185492, + "loss": 2.646, + "step": 4118 + }, + { + "epoch": 0.3324186909853926, + "grad_norm": 0.7234248518943787, + "learning_rate": 0.00018053827542484363, + "loss": 2.6497, + "step": 4119 + }, + { + "epoch": 0.33249939472197565, + "grad_norm": 0.7084202170372009, + "learning_rate": 0.0001805289166569709, + "loss": 2.6328, + "step": 4120 + }, + { + "epoch": 0.3325800984585586, + "grad_norm": 0.8068051934242249, + "learning_rate": 0.00018051955588210708, + "loss": 2.6576, + "step": 4121 + }, + { + "epoch": 0.33266080219514166, + "grad_norm": 0.787680447101593, + "learning_rate": 0.00018051019310048544, + "loss": 2.7091, + "step": 4122 + }, + { + "epoch": 0.33274150593172463, + "grad_norm": 0.698946475982666, + "learning_rate": 0.00018050082831233931, + "loss": 2.6657, + "step": 4123 + }, + { + "epoch": 0.33282220966830767, + "grad_norm": 0.7946122288703918, + "learning_rate": 0.00018049146151790215, + "loss": 2.6981, + "step": 4124 + }, + { + "epoch": 0.33290291340489064, + "grad_norm": 0.8025123476982117, + "learning_rate": 0.00018048209271740736, + "loss": 2.6878, + "step": 4125 + }, + { + "epoch": 0.3329836171414737, + "grad_norm": 0.7493376135826111, + "learning_rate": 0.0001804727219110884, + "loss": 2.6556, + "step": 4126 + }, + { + "epoch": 0.33306432087805665, + "grad_norm": 0.7143186926841736, + "learning_rate": 0.00018046334909917886, + "loss": 2.6879, + "step": 4127 + }, + { + "epoch": 0.3331450246146397, + "grad_norm": 0.7375641465187073, + "learning_rate": 0.00018045397428191235, + "loss": 2.6817, + "step": 4128 + }, + { + "epoch": 0.33322572835122266, + "grad_norm": 0.7201291918754578, + "learning_rate": 0.00018044459745952248, + "loss": 
2.6765, + "step": 4129 + }, + { + "epoch": 0.3333064320878057, + "grad_norm": 0.7924519777297974, + "learning_rate": 0.00018043521863224296, + "loss": 2.7748, + "step": 4130 + }, + { + "epoch": 0.33338713582438867, + "grad_norm": 0.7773354053497314, + "learning_rate": 0.00018042583780030752, + "loss": 2.6839, + "step": 4131 + }, + { + "epoch": 0.33346783956097165, + "grad_norm": 0.7527397274971008, + "learning_rate": 0.00018041645496394998, + "loss": 2.6749, + "step": 4132 + }, + { + "epoch": 0.3335485432975547, + "grad_norm": 0.7329208254814148, + "learning_rate": 0.00018040707012340418, + "loss": 2.7535, + "step": 4133 + }, + { + "epoch": 0.33362924703413765, + "grad_norm": 0.7637773752212524, + "learning_rate": 0.00018039768327890397, + "loss": 2.632, + "step": 4134 + }, + { + "epoch": 0.3337099507707207, + "grad_norm": 0.823623776435852, + "learning_rate": 0.00018038829443068333, + "loss": 2.7122, + "step": 4135 + }, + { + "epoch": 0.33379065450730366, + "grad_norm": 0.8040826916694641, + "learning_rate": 0.00018037890357897632, + "loss": 2.7197, + "step": 4136 + }, + { + "epoch": 0.3338713582438867, + "grad_norm": 0.7483998537063599, + "learning_rate": 0.00018036951072401686, + "loss": 2.6535, + "step": 4137 + }, + { + "epoch": 0.33395206198046967, + "grad_norm": 0.8141106367111206, + "learning_rate": 0.00018036011586603914, + "loss": 2.7127, + "step": 4138 + }, + { + "epoch": 0.3340327657170527, + "grad_norm": 0.7226041555404663, + "learning_rate": 0.00018035071900527724, + "loss": 2.6846, + "step": 4139 + }, + { + "epoch": 0.3341134694536357, + "grad_norm": 0.7624794840812683, + "learning_rate": 0.00018034132014196541, + "loss": 2.6725, + "step": 4140 + }, + { + "epoch": 0.3341941731902187, + "grad_norm": 0.7299962043762207, + "learning_rate": 0.00018033191927633785, + "loss": 2.6728, + "step": 4141 + }, + { + "epoch": 0.3342748769268017, + "grad_norm": 0.7920462489128113, + "learning_rate": 0.0001803225164086289, + "loss": 2.6544, + "step": 4142 + }, + { + 
"epoch": 0.3343555806633847, + "grad_norm": 0.7469778656959534, + "learning_rate": 0.00018031311153907282, + "loss": 2.7356, + "step": 4143 + }, + { + "epoch": 0.3344362843999677, + "grad_norm": 0.8831696510314941, + "learning_rate": 0.0001803037046679041, + "loss": 2.6584, + "step": 4144 + }, + { + "epoch": 0.33451698813655073, + "grad_norm": 0.8047679662704468, + "learning_rate": 0.00018029429579535715, + "loss": 2.6213, + "step": 4145 + }, + { + "epoch": 0.3345976918731337, + "grad_norm": 0.7109517455101013, + "learning_rate": 0.00018028488492166645, + "loss": 2.6622, + "step": 4146 + }, + { + "epoch": 0.33467839560971674, + "grad_norm": 0.7240141034126282, + "learning_rate": 0.0001802754720470665, + "loss": 2.6794, + "step": 4147 + }, + { + "epoch": 0.3347590993462997, + "grad_norm": 0.7292990684509277, + "learning_rate": 0.000180266057171792, + "loss": 2.6079, + "step": 4148 + }, + { + "epoch": 0.33483980308288275, + "grad_norm": 0.8055328130722046, + "learning_rate": 0.00018025664029607756, + "loss": 2.7044, + "step": 4149 + }, + { + "epoch": 0.3349205068194657, + "grad_norm": 0.8348979949951172, + "learning_rate": 0.00018024722142015781, + "loss": 2.6757, + "step": 4150 + }, + { + "epoch": 0.33500121055604876, + "grad_norm": 0.7797044515609741, + "learning_rate": 0.00018023780054426754, + "loss": 2.7125, + "step": 4151 + }, + { + "epoch": 0.33508191429263173, + "grad_norm": 0.802442729473114, + "learning_rate": 0.00018022837766864153, + "loss": 2.7121, + "step": 4152 + }, + { + "epoch": 0.33516261802921476, + "grad_norm": 0.7248829007148743, + "learning_rate": 0.00018021895279351463, + "loss": 2.7344, + "step": 4153 + }, + { + "epoch": 0.33524332176579774, + "grad_norm": 0.7458582520484924, + "learning_rate": 0.00018020952591912175, + "loss": 2.665, + "step": 4154 + }, + { + "epoch": 0.3353240255023808, + "grad_norm": 0.8153703808784485, + "learning_rate": 0.0001802000970456978, + "loss": 2.7416, + "step": 4155 + }, + { + "epoch": 0.33540472923896375, + 
"grad_norm": 0.7583708763122559, + "learning_rate": 0.00018019066617347779, + "loss": 2.7002, + "step": 4156 + }, + { + "epoch": 0.3354854329755468, + "grad_norm": 0.7522469162940979, + "learning_rate": 0.00018018123330269678, + "loss": 2.7196, + "step": 4157 + }, + { + "epoch": 0.33556613671212976, + "grad_norm": 0.7386923432350159, + "learning_rate": 0.00018017179843358983, + "loss": 2.6947, + "step": 4158 + }, + { + "epoch": 0.3356468404487128, + "grad_norm": 0.7366231083869934, + "learning_rate": 0.00018016236156639205, + "loss": 2.7377, + "step": 4159 + }, + { + "epoch": 0.33572754418529577, + "grad_norm": 0.7727232575416565, + "learning_rate": 0.00018015292270133872, + "loss": 2.7566, + "step": 4160 + }, + { + "epoch": 0.3358082479218788, + "grad_norm": 0.6781843304634094, + "learning_rate": 0.000180143481838665, + "loss": 2.6796, + "step": 4161 + }, + { + "epoch": 0.3358889516584618, + "grad_norm": 0.7036039233207703, + "learning_rate": 0.00018013403897860624, + "loss": 2.7012, + "step": 4162 + }, + { + "epoch": 0.3359696553950448, + "grad_norm": 0.8252625465393066, + "learning_rate": 0.00018012459412139776, + "loss": 2.6613, + "step": 4163 + }, + { + "epoch": 0.3360503591316278, + "grad_norm": 0.6924486756324768, + "learning_rate": 0.00018011514726727493, + "loss": 2.6425, + "step": 4164 + }, + { + "epoch": 0.3361310628682108, + "grad_norm": 0.7735962271690369, + "learning_rate": 0.0001801056984164732, + "loss": 2.7235, + "step": 4165 + }, + { + "epoch": 0.3362117666047938, + "grad_norm": 0.7439951300621033, + "learning_rate": 0.0001800962475692281, + "loss": 2.7428, + "step": 4166 + }, + { + "epoch": 0.3362924703413768, + "grad_norm": 0.6830539107322693, + "learning_rate": 0.0001800867947257751, + "loss": 2.5907, + "step": 4167 + }, + { + "epoch": 0.3363731740779598, + "grad_norm": 0.8355144262313843, + "learning_rate": 0.00018007733988634986, + "loss": 2.6978, + "step": 4168 + }, + { + "epoch": 0.33645387781454283, + "grad_norm": 0.6880978941917419, + 
"learning_rate": 0.00018006788305118798, + "loss": 2.6934, + "step": 4169 + }, + { + "epoch": 0.3365345815511258, + "grad_norm": 0.762709379196167, + "learning_rate": 0.0001800584242205251, + "loss": 2.684, + "step": 4170 + }, + { + "epoch": 0.33661528528770884, + "grad_norm": 0.7543070912361145, + "learning_rate": 0.0001800489633945971, + "loss": 2.6857, + "step": 4171 + }, + { + "epoch": 0.3366959890242918, + "grad_norm": 0.787651777267456, + "learning_rate": 0.00018003950057363964, + "loss": 2.6979, + "step": 4172 + }, + { + "epoch": 0.33677669276087485, + "grad_norm": 0.7831481099128723, + "learning_rate": 0.00018003003575788856, + "loss": 2.7158, + "step": 4173 + }, + { + "epoch": 0.33685739649745783, + "grad_norm": 0.844904363155365, + "learning_rate": 0.00018002056894757986, + "loss": 2.6459, + "step": 4174 + }, + { + "epoch": 0.33693810023404086, + "grad_norm": 0.7529420852661133, + "learning_rate": 0.00018001110014294937, + "loss": 2.685, + "step": 4175 + }, + { + "epoch": 0.33701880397062384, + "grad_norm": 0.776719868183136, + "learning_rate": 0.0001800016293442331, + "loss": 2.6353, + "step": 4176 + }, + { + "epoch": 0.33709950770720687, + "grad_norm": 0.7988671660423279, + "learning_rate": 0.00017999215655166716, + "loss": 2.7241, + "step": 4177 + }, + { + "epoch": 0.33718021144378985, + "grad_norm": 0.7190617918968201, + "learning_rate": 0.00017998268176548752, + "loss": 2.7278, + "step": 4178 + }, + { + "epoch": 0.3372609151803729, + "grad_norm": 0.8337060809135437, + "learning_rate": 0.0001799732049859304, + "loss": 2.7059, + "step": 4179 + }, + { + "epoch": 0.33734161891695585, + "grad_norm": 0.7547435164451599, + "learning_rate": 0.0001799637262132319, + "loss": 2.7782, + "step": 4180 + }, + { + "epoch": 0.3374223226535389, + "grad_norm": 0.8067883253097534, + "learning_rate": 0.0001799542454476284, + "loss": 2.7978, + "step": 4181 + }, + { + "epoch": 0.33750302639012186, + "grad_norm": 0.7451581358909607, + "learning_rate": 
0.00017994476268935609, + "loss": 2.6931, + "step": 4182 + }, + { + "epoch": 0.33758373012670484, + "grad_norm": 0.7521898746490479, + "learning_rate": 0.00017993527793865125, + "loss": 2.6939, + "step": 4183 + }, + { + "epoch": 0.33766443386328787, + "grad_norm": 0.7608996033668518, + "learning_rate": 0.0001799257911957504, + "loss": 2.715, + "step": 4184 + }, + { + "epoch": 0.33774513759987085, + "grad_norm": 0.7459948658943176, + "learning_rate": 0.00017991630246088987, + "loss": 2.6951, + "step": 4185 + }, + { + "epoch": 0.3378258413364539, + "grad_norm": 0.7549717426300049, + "learning_rate": 0.00017990681173430618, + "loss": 2.7353, + "step": 4186 + }, + { + "epoch": 0.33790654507303686, + "grad_norm": 0.7234344482421875, + "learning_rate": 0.0001798973190162359, + "loss": 2.6491, + "step": 4187 + }, + { + "epoch": 0.3379872488096199, + "grad_norm": 0.7652330994606018, + "learning_rate": 0.00017988782430691553, + "loss": 2.765, + "step": 4188 + }, + { + "epoch": 0.33806795254620287, + "grad_norm": 0.742953360080719, + "learning_rate": 0.00017987832760658177, + "loss": 2.7079, + "step": 4189 + }, + { + "epoch": 0.3381486562827859, + "grad_norm": 0.7440767288208008, + "learning_rate": 0.00017986882891547125, + "loss": 2.6751, + "step": 4190 + }, + { + "epoch": 0.3382293600193689, + "grad_norm": 0.7141925096511841, + "learning_rate": 0.00017985932823382078, + "loss": 2.6249, + "step": 4191 + }, + { + "epoch": 0.3383100637559519, + "grad_norm": 0.7200489044189453, + "learning_rate": 0.00017984982556186707, + "loss": 2.6811, + "step": 4192 + }, + { + "epoch": 0.3383907674925349, + "grad_norm": 0.7677409648895264, + "learning_rate": 0.00017984032089984696, + "loss": 2.6641, + "step": 4193 + }, + { + "epoch": 0.3384714712291179, + "grad_norm": 0.7386545538902283, + "learning_rate": 0.00017983081424799741, + "loss": 2.6504, + "step": 4194 + }, + { + "epoch": 0.3385521749657009, + "grad_norm": 0.7528583407402039, + "learning_rate": 0.00017982130560655526, + "loss": 
2.6422, + "step": 4195 + }, + { + "epoch": 0.3386328787022839, + "grad_norm": 0.7339407801628113, + "learning_rate": 0.0001798117949757575, + "loss": 2.7047, + "step": 4196 + }, + { + "epoch": 0.3387135824388669, + "grad_norm": 0.7655882239341736, + "learning_rate": 0.00017980228235584117, + "loss": 2.7644, + "step": 4197 + }, + { + "epoch": 0.33879428617544993, + "grad_norm": 0.7602109909057617, + "learning_rate": 0.00017979276774704342, + "loss": 2.697, + "step": 4198 + }, + { + "epoch": 0.3388749899120329, + "grad_norm": 0.7188911437988281, + "learning_rate": 0.00017978325114960126, + "loss": 2.7147, + "step": 4199 + }, + { + "epoch": 0.33895569364861594, + "grad_norm": 0.7672597765922546, + "learning_rate": 0.00017977373256375194, + "loss": 2.6558, + "step": 4200 + }, + { + "epoch": 0.3390363973851989, + "grad_norm": 0.784187912940979, + "learning_rate": 0.0001797642119897327, + "loss": 2.7005, + "step": 4201 + }, + { + "epoch": 0.33911710112178195, + "grad_norm": 0.7359703779220581, + "learning_rate": 0.00017975468942778075, + "loss": 2.6578, + "step": 4202 + }, + { + "epoch": 0.3391978048583649, + "grad_norm": 0.7776080965995789, + "learning_rate": 0.00017974516487813345, + "loss": 2.6747, + "step": 4203 + }, + { + "epoch": 0.33927850859494796, + "grad_norm": 0.6934135556221008, + "learning_rate": 0.00017973563834102824, + "loss": 2.6335, + "step": 4204 + }, + { + "epoch": 0.33935921233153094, + "grad_norm": 0.7715818881988525, + "learning_rate": 0.00017972610981670245, + "loss": 2.6062, + "step": 4205 + }, + { + "epoch": 0.33943991606811397, + "grad_norm": 0.7466367483139038, + "learning_rate": 0.0001797165793053936, + "loss": 2.7243, + "step": 4206 + }, + { + "epoch": 0.33952061980469694, + "grad_norm": 0.7485085129737854, + "learning_rate": 0.00017970704680733926, + "loss": 2.6603, + "step": 4207 + }, + { + "epoch": 0.33960132354128, + "grad_norm": 0.7365782856941223, + "learning_rate": 0.0001796975123227769, + "loss": 2.7179, + "step": 4208 + }, + { + 
"epoch": 0.33968202727786295, + "grad_norm": 0.8405506014823914, + "learning_rate": 0.00017968797585194422, + "loss": 2.7413, + "step": 4209 + }, + { + "epoch": 0.339762731014446, + "grad_norm": 0.8227888941764832, + "learning_rate": 0.00017967843739507888, + "loss": 2.6814, + "step": 4210 + }, + { + "epoch": 0.33984343475102896, + "grad_norm": 0.8247283697128296, + "learning_rate": 0.0001796688969524186, + "loss": 2.6802, + "step": 4211 + }, + { + "epoch": 0.339924138487612, + "grad_norm": 0.7639476656913757, + "learning_rate": 0.00017965935452420116, + "loss": 2.7422, + "step": 4212 + }, + { + "epoch": 0.34000484222419497, + "grad_norm": 0.7846776247024536, + "learning_rate": 0.00017964981011066436, + "loss": 2.7443, + "step": 4213 + }, + { + "epoch": 0.340085545960778, + "grad_norm": 0.7593334913253784, + "learning_rate": 0.00017964026371204608, + "loss": 2.7179, + "step": 4214 + }, + { + "epoch": 0.340166249697361, + "grad_norm": 0.7878177165985107, + "learning_rate": 0.00017963071532858425, + "loss": 2.7118, + "step": 4215 + }, + { + "epoch": 0.340246953433944, + "grad_norm": 0.7728220224380493, + "learning_rate": 0.00017962116496051685, + "loss": 2.6646, + "step": 4216 + }, + { + "epoch": 0.340327657170527, + "grad_norm": 0.8419308066368103, + "learning_rate": 0.00017961161260808187, + "loss": 2.7829, + "step": 4217 + }, + { + "epoch": 0.34040836090711, + "grad_norm": 0.7066153883934021, + "learning_rate": 0.0001796020582715174, + "loss": 2.6498, + "step": 4218 + }, + { + "epoch": 0.340489064643693, + "grad_norm": 0.7976264953613281, + "learning_rate": 0.00017959250195106156, + "loss": 2.7496, + "step": 4219 + }, + { + "epoch": 0.34056976838027603, + "grad_norm": 0.736595630645752, + "learning_rate": 0.0001795829436469525, + "loss": 2.6497, + "step": 4220 + }, + { + "epoch": 0.340650472116859, + "grad_norm": 0.818550705909729, + "learning_rate": 0.0001795733833594285, + "loss": 2.6793, + "step": 4221 + }, + { + "epoch": 0.34073117585344204, + "grad_norm": 
0.7712778449058533, + "learning_rate": 0.00017956382108872773, + "loss": 2.6215, + "step": 4222 + }, + { + "epoch": 0.340811879590025, + "grad_norm": 0.746306300163269, + "learning_rate": 0.00017955425683508858, + "loss": 2.7372, + "step": 4223 + }, + { + "epoch": 0.34089258332660805, + "grad_norm": 0.7269306778907776, + "learning_rate": 0.00017954469059874937, + "loss": 2.6438, + "step": 4224 + }, + { + "epoch": 0.340973287063191, + "grad_norm": 0.7426211833953857, + "learning_rate": 0.00017953512237994855, + "loss": 2.6539, + "step": 4225 + }, + { + "epoch": 0.34105399079977405, + "grad_norm": 0.7269948124885559, + "learning_rate": 0.0001795255521789246, + "loss": 2.6833, + "step": 4226 + }, + { + "epoch": 0.34113469453635703, + "grad_norm": 0.7279343605041504, + "learning_rate": 0.00017951597999591598, + "loss": 2.7011, + "step": 4227 + }, + { + "epoch": 0.34121539827294006, + "grad_norm": 0.7554663419723511, + "learning_rate": 0.0001795064058311613, + "loss": 2.7036, + "step": 4228 + }, + { + "epoch": 0.34129610200952304, + "grad_norm": 0.7516502141952515, + "learning_rate": 0.00017949682968489912, + "loss": 2.6699, + "step": 4229 + }, + { + "epoch": 0.34137680574610607, + "grad_norm": 0.7931745052337646, + "learning_rate": 0.00017948725155736818, + "loss": 2.6655, + "step": 4230 + }, + { + "epoch": 0.34145750948268905, + "grad_norm": 0.6981344223022461, + "learning_rate": 0.0001794776714488071, + "loss": 2.6987, + "step": 4231 + }, + { + "epoch": 0.3415382132192721, + "grad_norm": 0.7513911724090576, + "learning_rate": 0.00017946808935945474, + "loss": 2.6985, + "step": 4232 + }, + { + "epoch": 0.34161891695585506, + "grad_norm": 0.7373185753822327, + "learning_rate": 0.00017945850528954983, + "loss": 2.7269, + "step": 4233 + }, + { + "epoch": 0.34169962069243803, + "grad_norm": 0.6990259289741516, + "learning_rate": 0.0001794489192393313, + "loss": 2.6763, + "step": 4234 + }, + { + "epoch": 0.34178032442902107, + "grad_norm": 0.7661817669868469, + 
"learning_rate": 0.00017943933120903797, + "loss": 2.7057, + "step": 4235 + }, + { + "epoch": 0.34186102816560404, + "grad_norm": 0.7570027112960815, + "learning_rate": 0.0001794297411989089, + "loss": 2.7358, + "step": 4236 + }, + { + "epoch": 0.3419417319021871, + "grad_norm": 0.7751824855804443, + "learning_rate": 0.000179420149209183, + "loss": 2.6771, + "step": 4237 + }, + { + "epoch": 0.34202243563877005, + "grad_norm": 0.8028360605239868, + "learning_rate": 0.0001794105552400994, + "loss": 2.6399, + "step": 4238 + }, + { + "epoch": 0.3421031393753531, + "grad_norm": 0.7398171424865723, + "learning_rate": 0.00017940095929189716, + "loss": 2.6532, + "step": 4239 + }, + { + "epoch": 0.34218384311193606, + "grad_norm": 0.8300225138664246, + "learning_rate": 0.0001793913613648155, + "loss": 2.6798, + "step": 4240 + }, + { + "epoch": 0.3422645468485191, + "grad_norm": 0.7501145005226135, + "learning_rate": 0.00017938176145909356, + "loss": 2.7132, + "step": 4241 + }, + { + "epoch": 0.34234525058510207, + "grad_norm": 0.7178483605384827, + "learning_rate": 0.00017937215957497063, + "loss": 2.7172, + "step": 4242 + }, + { + "epoch": 0.3424259543216851, + "grad_norm": 0.7207306027412415, + "learning_rate": 0.00017936255571268599, + "loss": 2.629, + "step": 4243 + }, + { + "epoch": 0.3425066580582681, + "grad_norm": 0.7339839935302734, + "learning_rate": 0.00017935294987247899, + "loss": 2.6262, + "step": 4244 + }, + { + "epoch": 0.3425873617948511, + "grad_norm": 0.6977292895317078, + "learning_rate": 0.00017934334205458907, + "loss": 2.6949, + "step": 4245 + }, + { + "epoch": 0.3426680655314341, + "grad_norm": 0.7368096113204956, + "learning_rate": 0.00017933373225925564, + "loss": 2.681, + "step": 4246 + }, + { + "epoch": 0.3427487692680171, + "grad_norm": 0.7234459519386292, + "learning_rate": 0.00017932412048671825, + "loss": 2.6891, + "step": 4247 + }, + { + "epoch": 0.3428294730046001, + "grad_norm": 0.7659995555877686, + "learning_rate": 
0.00017931450673721642, + "loss": 2.7394, + "step": 4248 + }, + { + "epoch": 0.3429101767411831, + "grad_norm": 0.7799893617630005, + "learning_rate": 0.00017930489101098974, + "loss": 2.7707, + "step": 4249 + }, + { + "epoch": 0.3429908804777661, + "grad_norm": 0.7063946723937988, + "learning_rate": 0.00017929527330827786, + "loss": 2.6573, + "step": 4250 + }, + { + "epoch": 0.34307158421434913, + "grad_norm": 0.7090561389923096, + "learning_rate": 0.0001792856536293205, + "loss": 2.7095, + "step": 4251 + }, + { + "epoch": 0.3431522879509321, + "grad_norm": 0.8020029067993164, + "learning_rate": 0.0001792760319743574, + "loss": 2.6905, + "step": 4252 + }, + { + "epoch": 0.34323299168751514, + "grad_norm": 0.7221484780311584, + "learning_rate": 0.00017926640834362836, + "loss": 2.6853, + "step": 4253 + }, + { + "epoch": 0.3433136954240981, + "grad_norm": 0.7102623581886292, + "learning_rate": 0.00017925678273737324, + "loss": 2.6821, + "step": 4254 + }, + { + "epoch": 0.34339439916068115, + "grad_norm": 0.7702807784080505, + "learning_rate": 0.00017924715515583187, + "loss": 2.6986, + "step": 4255 + }, + { + "epoch": 0.34347510289726413, + "grad_norm": 0.7938152551651001, + "learning_rate": 0.00017923752559924425, + "loss": 2.7162, + "step": 4256 + }, + { + "epoch": 0.34355580663384716, + "grad_norm": 0.7340937852859497, + "learning_rate": 0.00017922789406785036, + "loss": 2.6904, + "step": 4257 + }, + { + "epoch": 0.34363651037043014, + "grad_norm": 0.7010839581489563, + "learning_rate": 0.00017921826056189026, + "loss": 2.6969, + "step": 4258 + }, + { + "epoch": 0.34371721410701317, + "grad_norm": 0.758178174495697, + "learning_rate": 0.00017920862508160403, + "loss": 2.6391, + "step": 4259 + }, + { + "epoch": 0.34379791784359615, + "grad_norm": 0.7861726880073547, + "learning_rate": 0.0001791989876272318, + "loss": 2.7088, + "step": 4260 + }, + { + "epoch": 0.3438786215801792, + "grad_norm": 0.6764364242553711, + "learning_rate": 0.00017918934819901377, + 
"loss": 2.6221, + "step": 4261 + }, + { + "epoch": 0.34395932531676215, + "grad_norm": 0.76728355884552, + "learning_rate": 0.00017917970679719018, + "loss": 2.6854, + "step": 4262 + }, + { + "epoch": 0.3440400290533452, + "grad_norm": 0.7161166071891785, + "learning_rate": 0.00017917006342200133, + "loss": 2.7048, + "step": 4263 + }, + { + "epoch": 0.34412073278992816, + "grad_norm": 0.7182073593139648, + "learning_rate": 0.00017916041807368753, + "loss": 2.7559, + "step": 4264 + }, + { + "epoch": 0.3442014365265112, + "grad_norm": 0.832258403301239, + "learning_rate": 0.0001791507707524892, + "loss": 2.6743, + "step": 4265 + }, + { + "epoch": 0.34428214026309417, + "grad_norm": 0.7048495411872864, + "learning_rate": 0.00017914112145864675, + "loss": 2.693, + "step": 4266 + }, + { + "epoch": 0.3443628439996772, + "grad_norm": 0.7475518584251404, + "learning_rate": 0.00017913147019240068, + "loss": 2.6881, + "step": 4267 + }, + { + "epoch": 0.3444435477362602, + "grad_norm": 0.72830730676651, + "learning_rate": 0.00017912181695399154, + "loss": 2.659, + "step": 4268 + }, + { + "epoch": 0.3445242514728432, + "grad_norm": 0.7183662056922913, + "learning_rate": 0.00017911216174365988, + "loss": 2.6611, + "step": 4269 + }, + { + "epoch": 0.3446049552094262, + "grad_norm": 0.7487103343009949, + "learning_rate": 0.0001791025045616463, + "loss": 2.6518, + "step": 4270 + }, + { + "epoch": 0.3446856589460092, + "grad_norm": 0.7733812928199768, + "learning_rate": 0.0001790928454081916, + "loss": 2.6359, + "step": 4271 + }, + { + "epoch": 0.3447663626825922, + "grad_norm": 0.7774991393089294, + "learning_rate": 0.00017908318428353642, + "loss": 2.6654, + "step": 4272 + }, + { + "epoch": 0.34484706641917523, + "grad_norm": 0.6882895827293396, + "learning_rate": 0.00017907352118792157, + "loss": 2.686, + "step": 4273 + }, + { + "epoch": 0.3449277701557582, + "grad_norm": 0.7571535110473633, + "learning_rate": 0.00017906385612158785, + "loss": 2.7108, + "step": 4274 + }, + { + 
"epoch": 0.34500847389234124, + "grad_norm": 0.7324517369270325, + "learning_rate": 0.00017905418908477615, + "loss": 2.6663, + "step": 4275 + }, + { + "epoch": 0.3450891776289242, + "grad_norm": 0.7476221919059753, + "learning_rate": 0.00017904452007772744, + "loss": 2.7202, + "step": 4276 + }, + { + "epoch": 0.34516988136550725, + "grad_norm": 0.7648386359214783, + "learning_rate": 0.00017903484910068268, + "loss": 2.6759, + "step": 4277 + }, + { + "epoch": 0.3452505851020902, + "grad_norm": 0.7375434637069702, + "learning_rate": 0.00017902517615388282, + "loss": 2.6603, + "step": 4278 + }, + { + "epoch": 0.34533128883867326, + "grad_norm": 0.7248519062995911, + "learning_rate": 0.00017901550123756906, + "loss": 2.7147, + "step": 4279 + }, + { + "epoch": 0.34541199257525623, + "grad_norm": 0.7264916896820068, + "learning_rate": 0.0001790058243519824, + "loss": 2.6992, + "step": 4280 + }, + { + "epoch": 0.34549269631183926, + "grad_norm": 0.8370026350021362, + "learning_rate": 0.0001789961454973641, + "loss": 2.7114, + "step": 4281 + }, + { + "epoch": 0.34557340004842224, + "grad_norm": 0.72071373462677, + "learning_rate": 0.00017898646467395538, + "loss": 2.6957, + "step": 4282 + }, + { + "epoch": 0.3456541037850053, + "grad_norm": 0.7355397343635559, + "learning_rate": 0.0001789767818819975, + "loss": 2.6744, + "step": 4283 + }, + { + "epoch": 0.34573480752158825, + "grad_norm": 0.734756588935852, + "learning_rate": 0.00017896709712173173, + "loss": 2.726, + "step": 4284 + }, + { + "epoch": 0.3458155112581712, + "grad_norm": 0.7890543341636658, + "learning_rate": 0.00017895741039339945, + "loss": 2.6726, + "step": 4285 + }, + { + "epoch": 0.34589621499475426, + "grad_norm": 0.7768735885620117, + "learning_rate": 0.00017894772169724216, + "loss": 2.7617, + "step": 4286 + }, + { + "epoch": 0.34597691873133724, + "grad_norm": 0.7306547164916992, + "learning_rate": 0.00017893803103350125, + "loss": 2.6253, + "step": 4287 + }, + { + "epoch": 0.34605762246792027, + 
"grad_norm": 0.767066478729248, + "learning_rate": 0.00017892833840241828, + "loss": 2.6522, + "step": 4288 + }, + { + "epoch": 0.34613832620450324, + "grad_norm": 0.7018097639083862, + "learning_rate": 0.00017891864380423477, + "loss": 2.7111, + "step": 4289 + }, + { + "epoch": 0.3462190299410863, + "grad_norm": 0.7305615544319153, + "learning_rate": 0.00017890894723919236, + "loss": 2.6924, + "step": 4290 + }, + { + "epoch": 0.34629973367766925, + "grad_norm": 0.7588002681732178, + "learning_rate": 0.00017889924870753275, + "loss": 2.6952, + "step": 4291 + }, + { + "epoch": 0.3463804374142523, + "grad_norm": 0.7162861824035645, + "learning_rate": 0.0001788895482094976, + "loss": 2.6239, + "step": 4292 + }, + { + "epoch": 0.34646114115083526, + "grad_norm": 0.7494024634361267, + "learning_rate": 0.00017887984574532868, + "loss": 2.6763, + "step": 4293 + }, + { + "epoch": 0.3465418448874183, + "grad_norm": 0.7100037336349487, + "learning_rate": 0.0001788701413152678, + "loss": 2.6378, + "step": 4294 + }, + { + "epoch": 0.34662254862400127, + "grad_norm": 0.7316900491714478, + "learning_rate": 0.00017886043491955684, + "loss": 2.7001, + "step": 4295 + }, + { + "epoch": 0.3467032523605843, + "grad_norm": 0.8467028737068176, + "learning_rate": 0.00017885072655843772, + "loss": 2.7536, + "step": 4296 + }, + { + "epoch": 0.3467839560971673, + "grad_norm": 0.7248796820640564, + "learning_rate": 0.00017884101623215237, + "loss": 2.6956, + "step": 4297 + }, + { + "epoch": 0.3468646598337503, + "grad_norm": 0.7183107137680054, + "learning_rate": 0.0001788313039409428, + "loss": 2.743, + "step": 4298 + }, + { + "epoch": 0.3469453635703333, + "grad_norm": 0.6835163831710815, + "learning_rate": 0.00017882158968505105, + "loss": 2.7016, + "step": 4299 + }, + { + "epoch": 0.3470260673069163, + "grad_norm": 0.7973365783691406, + "learning_rate": 0.00017881187346471925, + "loss": 2.6927, + "step": 4300 + }, + { + "epoch": 0.3471067710434993, + "grad_norm": 0.700040876865387, + 
"learning_rate": 0.00017880215528018954, + "loss": 2.6961, + "step": 4301 + }, + { + "epoch": 0.34718747478008233, + "grad_norm": 0.8180583119392395, + "learning_rate": 0.00017879243513170415, + "loss": 2.642, + "step": 4302 + }, + { + "epoch": 0.3472681785166653, + "grad_norm": 0.7134599685668945, + "learning_rate": 0.0001787827130195053, + "loss": 2.6901, + "step": 4303 + }, + { + "epoch": 0.34734888225324834, + "grad_norm": 0.767998218536377, + "learning_rate": 0.0001787729889438353, + "loss": 2.6472, + "step": 4304 + }, + { + "epoch": 0.3474295859898313, + "grad_norm": 0.7260780930519104, + "learning_rate": 0.0001787632629049365, + "loss": 2.6791, + "step": 4305 + }, + { + "epoch": 0.34751028972641435, + "grad_norm": 0.6918236613273621, + "learning_rate": 0.00017875353490305132, + "loss": 2.6596, + "step": 4306 + }, + { + "epoch": 0.3475909934629973, + "grad_norm": 0.7734197974205017, + "learning_rate": 0.00017874380493842216, + "loss": 2.6402, + "step": 4307 + }, + { + "epoch": 0.34767169719958035, + "grad_norm": 0.7051037549972534, + "learning_rate": 0.00017873407301129154, + "loss": 2.7517, + "step": 4308 + }, + { + "epoch": 0.34775240093616333, + "grad_norm": 0.7026919722557068, + "learning_rate": 0.00017872433912190203, + "loss": 2.7058, + "step": 4309 + }, + { + "epoch": 0.34783310467274636, + "grad_norm": 0.7248546481132507, + "learning_rate": 0.00017871460327049618, + "loss": 2.666, + "step": 4310 + }, + { + "epoch": 0.34791380840932934, + "grad_norm": 0.7348842620849609, + "learning_rate": 0.0001787048654573167, + "loss": 2.7712, + "step": 4311 + }, + { + "epoch": 0.34799451214591237, + "grad_norm": 0.7923693656921387, + "learning_rate": 0.00017869512568260618, + "loss": 2.6469, + "step": 4312 + }, + { + "epoch": 0.34807521588249535, + "grad_norm": 0.7604066729545593, + "learning_rate": 0.00017868538394660743, + "loss": 2.7152, + "step": 4313 + }, + { + "epoch": 0.3481559196190784, + "grad_norm": 0.6811137795448303, + "learning_rate": 
0.00017867564024956324, + "loss": 2.715, + "step": 4314 + }, + { + "epoch": 0.34823662335566136, + "grad_norm": 0.7292799353599548, + "learning_rate": 0.00017866589459171643, + "loss": 2.6374, + "step": 4315 + }, + { + "epoch": 0.3483173270922444, + "grad_norm": 0.6961250901222229, + "learning_rate": 0.0001786561469733099, + "loss": 2.6592, + "step": 4316 + }, + { + "epoch": 0.34839803082882737, + "grad_norm": 0.7447086572647095, + "learning_rate": 0.00017864639739458658, + "loss": 2.6965, + "step": 4317 + }, + { + "epoch": 0.3484787345654104, + "grad_norm": 0.7107378244400024, + "learning_rate": 0.00017863664585578942, + "loss": 2.7057, + "step": 4318 + }, + { + "epoch": 0.3485594383019934, + "grad_norm": 0.7372235655784607, + "learning_rate": 0.00017862689235716153, + "loss": 2.6289, + "step": 4319 + }, + { + "epoch": 0.3486401420385764, + "grad_norm": 0.7360481023788452, + "learning_rate": 0.00017861713689894593, + "loss": 2.7208, + "step": 4320 + }, + { + "epoch": 0.3487208457751594, + "grad_norm": 0.7378106713294983, + "learning_rate": 0.00017860737948138575, + "loss": 2.6836, + "step": 4321 + }, + { + "epoch": 0.3488015495117424, + "grad_norm": 0.7110548615455627, + "learning_rate": 0.00017859762010472423, + "loss": 2.6941, + "step": 4322 + }, + { + "epoch": 0.3488822532483254, + "grad_norm": 0.7419706583023071, + "learning_rate": 0.00017858785876920455, + "loss": 2.6591, + "step": 4323 + }, + { + "epoch": 0.3489629569849084, + "grad_norm": 0.7759542465209961, + "learning_rate": 0.00017857809547506997, + "loss": 2.6966, + "step": 4324 + }, + { + "epoch": 0.3490436607214914, + "grad_norm": 0.7894207239151001, + "learning_rate": 0.0001785683302225639, + "loss": 2.7298, + "step": 4325 + }, + { + "epoch": 0.34912436445807443, + "grad_norm": 0.7342399954795837, + "learning_rate": 0.0001785585630119296, + "loss": 2.6998, + "step": 4326 + }, + { + "epoch": 0.3492050681946574, + "grad_norm": 0.8684173822402954, + "learning_rate": 0.0001785487938434106, + "loss": 
2.7179, + "step": 4327 + }, + { + "epoch": 0.34928577193124044, + "grad_norm": 0.7557523846626282, + "learning_rate": 0.00017853902271725033, + "loss": 2.7081, + "step": 4328 + }, + { + "epoch": 0.3493664756678234, + "grad_norm": 0.7910173535346985, + "learning_rate": 0.0001785292496336923, + "loss": 2.718, + "step": 4329 + }, + { + "epoch": 0.34944717940440645, + "grad_norm": 0.7878917455673218, + "learning_rate": 0.00017851947459298007, + "loss": 2.674, + "step": 4330 + }, + { + "epoch": 0.3495278831409894, + "grad_norm": 0.7290656566619873, + "learning_rate": 0.0001785096975953573, + "loss": 2.6962, + "step": 4331 + }, + { + "epoch": 0.34960858687757246, + "grad_norm": 0.8465737104415894, + "learning_rate": 0.00017849991864106763, + "loss": 2.6793, + "step": 4332 + }, + { + "epoch": 0.34968929061415543, + "grad_norm": 0.7183132171630859, + "learning_rate": 0.0001784901377303548, + "loss": 2.6902, + "step": 4333 + }, + { + "epoch": 0.34976999435073847, + "grad_norm": 0.7535461783409119, + "learning_rate": 0.00017848035486346255, + "loss": 2.7153, + "step": 4334 + }, + { + "epoch": 0.34985069808732144, + "grad_norm": 0.778734028339386, + "learning_rate": 0.0001784705700406347, + "loss": 2.6316, + "step": 4335 + }, + { + "epoch": 0.3499314018239044, + "grad_norm": 0.6937401294708252, + "learning_rate": 0.00017846078326211516, + "loss": 2.6902, + "step": 4336 + }, + { + "epoch": 0.35001210556048745, + "grad_norm": 0.7450751066207886, + "learning_rate": 0.00017845099452814774, + "loss": 2.6898, + "step": 4337 + }, + { + "epoch": 0.35009280929707043, + "grad_norm": 0.7535614967346191, + "learning_rate": 0.0001784412038389765, + "loss": 2.6969, + "step": 4338 + }, + { + "epoch": 0.35017351303365346, + "grad_norm": 0.6971385478973389, + "learning_rate": 0.00017843141119484543, + "loss": 2.6517, + "step": 4339 + }, + { + "epoch": 0.35025421677023644, + "grad_norm": 0.7233202457427979, + "learning_rate": 0.00017842161659599858, + "loss": 2.7332, + "step": 4340 + }, + { + 
"epoch": 0.35033492050681947, + "grad_norm": 0.7870340347290039, + "learning_rate": 0.00017841182004268, + "loss": 2.6485, + "step": 4341 + }, + { + "epoch": 0.35041562424340245, + "grad_norm": 0.7387053966522217, + "learning_rate": 0.0001784020215351339, + "loss": 2.6945, + "step": 4342 + }, + { + "epoch": 0.3504963279799855, + "grad_norm": 0.8357887268066406, + "learning_rate": 0.00017839222107360453, + "loss": 2.703, + "step": 4343 + }, + { + "epoch": 0.35057703171656845, + "grad_norm": 0.7197332978248596, + "learning_rate": 0.000178382418658336, + "loss": 2.6649, + "step": 4344 + }, + { + "epoch": 0.3506577354531515, + "grad_norm": 0.7416980862617493, + "learning_rate": 0.0001783726142895728, + "loss": 2.7393, + "step": 4345 + }, + { + "epoch": 0.35073843918973446, + "grad_norm": 0.6807832717895508, + "learning_rate": 0.00017836280796755912, + "loss": 2.6619, + "step": 4346 + }, + { + "epoch": 0.3508191429263175, + "grad_norm": 0.6858795285224915, + "learning_rate": 0.00017835299969253945, + "loss": 2.6266, + "step": 4347 + }, + { + "epoch": 0.35089984666290047, + "grad_norm": 0.8432363867759705, + "learning_rate": 0.0001783431894647582, + "loss": 2.6534, + "step": 4348 + }, + { + "epoch": 0.3509805503994835, + "grad_norm": 0.7240749001502991, + "learning_rate": 0.0001783333772844599, + "loss": 2.6851, + "step": 4349 + }, + { + "epoch": 0.3510612541360665, + "grad_norm": 0.7814531326293945, + "learning_rate": 0.00017832356315188906, + "loss": 2.7085, + "step": 4350 + }, + { + "epoch": 0.3511419578726495, + "grad_norm": 0.6989716291427612, + "learning_rate": 0.00017831374706729026, + "loss": 2.6674, + "step": 4351 + }, + { + "epoch": 0.3512226616092325, + "grad_norm": 0.7118446230888367, + "learning_rate": 0.0001783039290309082, + "loss": 2.6837, + "step": 4352 + }, + { + "epoch": 0.3513033653458155, + "grad_norm": 0.7641892433166504, + "learning_rate": 0.00017829410904298754, + "loss": 2.6415, + "step": 4353 + }, + { + "epoch": 0.3513840690823985, + 
"grad_norm": 0.6975794434547424, + "learning_rate": 0.000178284287103773, + "loss": 2.6679, + "step": 4354 + }, + { + "epoch": 0.35146477281898153, + "grad_norm": 0.7192546725273132, + "learning_rate": 0.00017827446321350943, + "loss": 2.6539, + "step": 4355 + }, + { + "epoch": 0.3515454765555645, + "grad_norm": 0.8749549388885498, + "learning_rate": 0.00017826463737244155, + "loss": 2.7254, + "step": 4356 + }, + { + "epoch": 0.35162618029214754, + "grad_norm": 0.8509732484817505, + "learning_rate": 0.0001782548095808144, + "loss": 2.7679, + "step": 4357 + }, + { + "epoch": 0.3517068840287305, + "grad_norm": 0.7647901773452759, + "learning_rate": 0.00017824497983887278, + "loss": 2.7049, + "step": 4358 + }, + { + "epoch": 0.35178758776531355, + "grad_norm": 0.7551973462104797, + "learning_rate": 0.00017823514814686178, + "loss": 2.7086, + "step": 4359 + }, + { + "epoch": 0.3518682915018965, + "grad_norm": 0.730140209197998, + "learning_rate": 0.00017822531450502633, + "loss": 2.6334, + "step": 4360 + }, + { + "epoch": 0.35194899523847956, + "grad_norm": 0.8210160136222839, + "learning_rate": 0.00017821547891361158, + "loss": 2.7248, + "step": 4361 + }, + { + "epoch": 0.35202969897506253, + "grad_norm": 0.761972963809967, + "learning_rate": 0.00017820564137286264, + "loss": 2.6502, + "step": 4362 + }, + { + "epoch": 0.35211040271164556, + "grad_norm": 0.7564061284065247, + "learning_rate": 0.00017819580188302466, + "loss": 2.6795, + "step": 4363 + }, + { + "epoch": 0.35219110644822854, + "grad_norm": 0.7382947206497192, + "learning_rate": 0.00017818596044434293, + "loss": 2.6754, + "step": 4364 + }, + { + "epoch": 0.3522718101848116, + "grad_norm": 0.737194836139679, + "learning_rate": 0.00017817611705706266, + "loss": 2.7098, + "step": 4365 + }, + { + "epoch": 0.35235251392139455, + "grad_norm": 0.7183281779289246, + "learning_rate": 0.0001781662717214292, + "loss": 2.6528, + "step": 4366 + }, + { + "epoch": 0.3524332176579776, + "grad_norm": 0.7785990238189697, + 
"learning_rate": 0.00017815642443768794, + "loss": 2.6419, + "step": 4367 + }, + { + "epoch": 0.35251392139456056, + "grad_norm": 0.7114452719688416, + "learning_rate": 0.00017814657520608427, + "loss": 2.7088, + "step": 4368 + }, + { + "epoch": 0.3525946251311436, + "grad_norm": 0.746969997882843, + "learning_rate": 0.00017813672402686365, + "loss": 2.7199, + "step": 4369 + }, + { + "epoch": 0.35267532886772657, + "grad_norm": 0.7700605988502502, + "learning_rate": 0.00017812687090027165, + "loss": 2.6713, + "step": 4370 + }, + { + "epoch": 0.3527560326043096, + "grad_norm": 0.7733504772186279, + "learning_rate": 0.0001781170158265538, + "loss": 2.6916, + "step": 4371 + }, + { + "epoch": 0.3528367363408926, + "grad_norm": 0.7769689559936523, + "learning_rate": 0.00017810715880595566, + "loss": 2.7787, + "step": 4372 + }, + { + "epoch": 0.3529174400774756, + "grad_norm": 0.7538996934890747, + "learning_rate": 0.000178097299838723, + "loss": 2.6964, + "step": 4373 + }, + { + "epoch": 0.3529981438140586, + "grad_norm": 0.7777890563011169, + "learning_rate": 0.00017808743892510146, + "loss": 2.6882, + "step": 4374 + }, + { + "epoch": 0.3530788475506416, + "grad_norm": 0.8331751823425293, + "learning_rate": 0.00017807757606533683, + "loss": 2.7113, + "step": 4375 + }, + { + "epoch": 0.3531595512872246, + "grad_norm": 0.8039207458496094, + "learning_rate": 0.00017806771125967492, + "loss": 2.6694, + "step": 4376 + }, + { + "epoch": 0.3532402550238076, + "grad_norm": 0.7727575898170471, + "learning_rate": 0.00017805784450836154, + "loss": 2.6639, + "step": 4377 + }, + { + "epoch": 0.3533209587603906, + "grad_norm": 0.8247967958450317, + "learning_rate": 0.00017804797581164264, + "loss": 2.6539, + "step": 4378 + }, + { + "epoch": 0.35340166249697363, + "grad_norm": 0.7574009299278259, + "learning_rate": 0.0001780381051697642, + "loss": 2.7163, + "step": 4379 + }, + { + "epoch": 0.3534823662335566, + "grad_norm": 0.7304368615150452, + "learning_rate": 
0.0001780282325829721, + "loss": 2.5759, + "step": 4380 + }, + { + "epoch": 0.35356306997013964, + "grad_norm": 0.7133963704109192, + "learning_rate": 0.00017801835805151257, + "loss": 2.7008, + "step": 4381 + }, + { + "epoch": 0.3536437737067226, + "grad_norm": 0.7525407075881958, + "learning_rate": 0.00017800848157563157, + "loss": 2.6785, + "step": 4382 + }, + { + "epoch": 0.35372447744330565, + "grad_norm": 0.7306779623031616, + "learning_rate": 0.00017799860315557528, + "loss": 2.6454, + "step": 4383 + }, + { + "epoch": 0.35380518117988863, + "grad_norm": 0.6657043695449829, + "learning_rate": 0.00017798872279158994, + "loss": 2.708, + "step": 4384 + }, + { + "epoch": 0.35388588491647166, + "grad_norm": 0.7655978202819824, + "learning_rate": 0.00017797884048392177, + "loss": 2.727, + "step": 4385 + }, + { + "epoch": 0.35396658865305464, + "grad_norm": 0.6802939176559448, + "learning_rate": 0.00017796895623281702, + "loss": 2.659, + "step": 4386 + }, + { + "epoch": 0.3540472923896376, + "grad_norm": 0.7191160917282104, + "learning_rate": 0.00017795907003852207, + "loss": 2.6335, + "step": 4387 + }, + { + "epoch": 0.35412799612622065, + "grad_norm": 0.7771886587142944, + "learning_rate": 0.00017794918190128337, + "loss": 2.6658, + "step": 4388 + }, + { + "epoch": 0.3542086998628036, + "grad_norm": 0.7133512496948242, + "learning_rate": 0.00017793929182134723, + "loss": 2.6701, + "step": 4389 + }, + { + "epoch": 0.35428940359938665, + "grad_norm": 0.7795221209526062, + "learning_rate": 0.00017792939979896022, + "loss": 2.6932, + "step": 4390 + }, + { + "epoch": 0.35437010733596963, + "grad_norm": 0.726767897605896, + "learning_rate": 0.00017791950583436887, + "loss": 2.676, + "step": 4391 + }, + { + "epoch": 0.35445081107255266, + "grad_norm": 0.7447288632392883, + "learning_rate": 0.00017790960992781972, + "loss": 2.7195, + "step": 4392 + }, + { + "epoch": 0.35453151480913564, + "grad_norm": 0.8053649663925171, + "learning_rate": 0.0001778997120795595, + "loss": 
2.6851, + "step": 4393 + }, + { + "epoch": 0.35461221854571867, + "grad_norm": 0.7258884906768799, + "learning_rate": 0.00017788981228983474, + "loss": 2.6819, + "step": 4394 + }, + { + "epoch": 0.35469292228230165, + "grad_norm": 0.7279395461082458, + "learning_rate": 0.0001778799105588923, + "loss": 2.6954, + "step": 4395 + }, + { + "epoch": 0.3547736260188847, + "grad_norm": 0.7372962236404419, + "learning_rate": 0.0001778700068869789, + "loss": 2.7049, + "step": 4396 + }, + { + "epoch": 0.35485432975546766, + "grad_norm": 0.712003767490387, + "learning_rate": 0.00017786010127434135, + "loss": 2.7413, + "step": 4397 + }, + { + "epoch": 0.3549350334920507, + "grad_norm": 0.7487424612045288, + "learning_rate": 0.0001778501937212266, + "loss": 2.7231, + "step": 4398 + }, + { + "epoch": 0.35501573722863367, + "grad_norm": 0.73053377866745, + "learning_rate": 0.00017784028422788146, + "loss": 2.7029, + "step": 4399 + }, + { + "epoch": 0.3550964409652167, + "grad_norm": 0.697062611579895, + "learning_rate": 0.00017783037279455298, + "loss": 2.7139, + "step": 4400 + }, + { + "epoch": 0.3551771447017997, + "grad_norm": 0.7750880718231201, + "learning_rate": 0.00017782045942148819, + "loss": 2.6601, + "step": 4401 + }, + { + "epoch": 0.3552578484383827, + "grad_norm": 0.7124977111816406, + "learning_rate": 0.00017781054410893413, + "loss": 2.6119, + "step": 4402 + }, + { + "epoch": 0.3553385521749657, + "grad_norm": 0.7773111462593079, + "learning_rate": 0.00017780062685713785, + "loss": 2.7181, + "step": 4403 + }, + { + "epoch": 0.3554192559115487, + "grad_norm": 0.7282142639160156, + "learning_rate": 0.00017779070766634663, + "loss": 2.7141, + "step": 4404 + }, + { + "epoch": 0.3554999596481317, + "grad_norm": 0.8578598499298096, + "learning_rate": 0.0001777807865368076, + "loss": 2.7628, + "step": 4405 + }, + { + "epoch": 0.3555806633847147, + "grad_norm": 0.7126399874687195, + "learning_rate": 0.00017777086346876809, + "loss": 2.6914, + "step": 4406 + }, + { + 
"epoch": 0.3556613671212977, + "grad_norm": 0.8026365637779236, + "learning_rate": 0.00017776093846247533, + "loss": 2.7059, + "step": 4407 + }, + { + "epoch": 0.35574207085788073, + "grad_norm": 0.7839884161949158, + "learning_rate": 0.0001777510115181767, + "loss": 2.7265, + "step": 4408 + }, + { + "epoch": 0.3558227745944637, + "grad_norm": 0.7498767971992493, + "learning_rate": 0.00017774108263611966, + "loss": 2.7201, + "step": 4409 + }, + { + "epoch": 0.35590347833104674, + "grad_norm": 0.6996301412582397, + "learning_rate": 0.0001777311518165516, + "loss": 2.6271, + "step": 4410 + }, + { + "epoch": 0.3559841820676297, + "grad_norm": 0.7721461057662964, + "learning_rate": 0.00017772121905972003, + "loss": 2.6739, + "step": 4411 + }, + { + "epoch": 0.35606488580421275, + "grad_norm": 0.8018803000450134, + "learning_rate": 0.00017771128436587256, + "loss": 2.7092, + "step": 4412 + }, + { + "epoch": 0.3561455895407957, + "grad_norm": 0.7185639142990112, + "learning_rate": 0.0001777013477352567, + "loss": 2.6996, + "step": 4413 + }, + { + "epoch": 0.35622629327737876, + "grad_norm": 0.7218519449234009, + "learning_rate": 0.0001776914091681202, + "loss": 2.6555, + "step": 4414 + }, + { + "epoch": 0.35630699701396173, + "grad_norm": 0.7234479188919067, + "learning_rate": 0.00017768146866471062, + "loss": 2.6762, + "step": 4415 + }, + { + "epoch": 0.35638770075054477, + "grad_norm": 0.6723350286483765, + "learning_rate": 0.00017767152622527582, + "loss": 2.6272, + "step": 4416 + }, + { + "epoch": 0.35646840448712774, + "grad_norm": 0.7281947731971741, + "learning_rate": 0.00017766158185006356, + "loss": 2.7216, + "step": 4417 + }, + { + "epoch": 0.3565491082237108, + "grad_norm": 0.8350874781608582, + "learning_rate": 0.00017765163553932166, + "loss": 2.6619, + "step": 4418 + }, + { + "epoch": 0.35662981196029375, + "grad_norm": 0.7454007267951965, + "learning_rate": 0.00017764168729329801, + "loss": 2.6623, + "step": 4419 + }, + { + "epoch": 0.3567105156968768, + 
"grad_norm": 0.7419041395187378, + "learning_rate": 0.00017763173711224058, + "loss": 2.6773, + "step": 4420 + }, + { + "epoch": 0.35679121943345976, + "grad_norm": 0.7965987920761108, + "learning_rate": 0.0001776217849963973, + "loss": 2.6426, + "step": 4421 + }, + { + "epoch": 0.3568719231700428, + "grad_norm": 0.7093302607536316, + "learning_rate": 0.00017761183094601622, + "loss": 2.6745, + "step": 4422 + }, + { + "epoch": 0.35695262690662577, + "grad_norm": 0.7937216758728027, + "learning_rate": 0.00017760187496134548, + "loss": 2.7275, + "step": 4423 + }, + { + "epoch": 0.3570333306432088, + "grad_norm": 0.9185259938240051, + "learning_rate": 0.00017759191704263313, + "loss": 2.7055, + "step": 4424 + }, + { + "epoch": 0.3571140343797918, + "grad_norm": 0.7365124821662903, + "learning_rate": 0.00017758195719012743, + "loss": 2.6504, + "step": 4425 + }, + { + "epoch": 0.3571947381163748, + "grad_norm": 0.6992416977882385, + "learning_rate": 0.0001775719954040765, + "loss": 2.6684, + "step": 4426 + }, + { + "epoch": 0.3572754418529578, + "grad_norm": 0.7742372751235962, + "learning_rate": 0.00017756203168472866, + "loss": 2.6877, + "step": 4427 + }, + { + "epoch": 0.3573561455895408, + "grad_norm": 0.7448472380638123, + "learning_rate": 0.0001775520660323323, + "loss": 2.7027, + "step": 4428 + }, + { + "epoch": 0.3574368493261238, + "grad_norm": 0.7201915979385376, + "learning_rate": 0.00017754209844713569, + "loss": 2.7046, + "step": 4429 + }, + { + "epoch": 0.3575175530627068, + "grad_norm": 0.6675081253051758, + "learning_rate": 0.0001775321289293873, + "loss": 2.6503, + "step": 4430 + }, + { + "epoch": 0.3575982567992898, + "grad_norm": 0.7252706289291382, + "learning_rate": 0.0001775221574793356, + "loss": 2.6053, + "step": 4431 + }, + { + "epoch": 0.35767896053587284, + "grad_norm": 0.7134702801704407, + "learning_rate": 0.00017751218409722906, + "loss": 2.6857, + "step": 4432 + }, + { + "epoch": 0.3577596642724558, + "grad_norm": 0.7074102163314819, + 
"learning_rate": 0.0001775022087833163, + "loss": 2.6871, + "step": 4433 + }, + { + "epoch": 0.35784036800903885, + "grad_norm": 0.693520724773407, + "learning_rate": 0.00017749223153784588, + "loss": 2.6629, + "step": 4434 + }, + { + "epoch": 0.3579210717456218, + "grad_norm": 0.6933221817016602, + "learning_rate": 0.0001774822523610665, + "loss": 2.6793, + "step": 4435 + }, + { + "epoch": 0.35800177548220485, + "grad_norm": 0.75307297706604, + "learning_rate": 0.00017747227125322685, + "loss": 2.7012, + "step": 4436 + }, + { + "epoch": 0.35808247921878783, + "grad_norm": 0.7732915282249451, + "learning_rate": 0.0001774622882145757, + "loss": 2.6908, + "step": 4437 + }, + { + "epoch": 0.3581631829553708, + "grad_norm": 0.7067054510116577, + "learning_rate": 0.0001774523032453618, + "loss": 2.7494, + "step": 4438 + }, + { + "epoch": 0.35824388669195384, + "grad_norm": 0.7412838935852051, + "learning_rate": 0.00017744231634583406, + "loss": 2.6734, + "step": 4439 + }, + { + "epoch": 0.3583245904285368, + "grad_norm": 0.7663930654525757, + "learning_rate": 0.00017743232751624136, + "loss": 2.6952, + "step": 4440 + }, + { + "epoch": 0.35840529416511985, + "grad_norm": 0.70650714635849, + "learning_rate": 0.00017742233675683268, + "loss": 2.6806, + "step": 4441 + }, + { + "epoch": 0.3584859979017028, + "grad_norm": 0.698310375213623, + "learning_rate": 0.00017741234406785692, + "loss": 2.6471, + "step": 4442 + }, + { + "epoch": 0.35856670163828586, + "grad_norm": 0.7274026274681091, + "learning_rate": 0.00017740234944956323, + "loss": 2.6688, + "step": 4443 + }, + { + "epoch": 0.35864740537486883, + "grad_norm": 0.6944074034690857, + "learning_rate": 0.00017739235290220067, + "loss": 2.6954, + "step": 4444 + }, + { + "epoch": 0.35872810911145186, + "grad_norm": 0.841995358467102, + "learning_rate": 0.00017738235442601834, + "loss": 2.7169, + "step": 4445 + }, + { + "epoch": 0.35880881284803484, + "grad_norm": 0.74863201379776, + "learning_rate": 0.00017737235402126545, 
+ "loss": 2.6534, + "step": 4446 + }, + { + "epoch": 0.3588895165846179, + "grad_norm": 0.7260422110557556, + "learning_rate": 0.00017736235168819126, + "loss": 2.6266, + "step": 4447 + }, + { + "epoch": 0.35897022032120085, + "grad_norm": 0.7450951337814331, + "learning_rate": 0.00017735234742704504, + "loss": 2.7328, + "step": 4448 + }, + { + "epoch": 0.3590509240577839, + "grad_norm": 0.6942493319511414, + "learning_rate": 0.00017734234123807614, + "loss": 2.7219, + "step": 4449 + }, + { + "epoch": 0.35913162779436686, + "grad_norm": 0.7676761746406555, + "learning_rate": 0.00017733233312153393, + "loss": 2.6594, + "step": 4450 + }, + { + "epoch": 0.3592123315309499, + "grad_norm": 0.7446104288101196, + "learning_rate": 0.00017732232307766778, + "loss": 2.6877, + "step": 4451 + }, + { + "epoch": 0.35929303526753287, + "grad_norm": 0.7551130056381226, + "learning_rate": 0.00017731231110672727, + "loss": 2.672, + "step": 4452 + }, + { + "epoch": 0.3593737390041159, + "grad_norm": 0.6876464486122131, + "learning_rate": 0.00017730229720896182, + "loss": 2.6658, + "step": 4453 + }, + { + "epoch": 0.3594544427406989, + "grad_norm": 0.6992844343185425, + "learning_rate": 0.00017729228138462107, + "loss": 2.6805, + "step": 4454 + }, + { + "epoch": 0.3595351464772819, + "grad_norm": 0.8437497615814209, + "learning_rate": 0.00017728226363395466, + "loss": 2.6884, + "step": 4455 + }, + { + "epoch": 0.3596158502138649, + "grad_norm": 0.7669322490692139, + "learning_rate": 0.00017727224395721217, + "loss": 2.6432, + "step": 4456 + }, + { + "epoch": 0.3596965539504479, + "grad_norm": 0.7613428831100464, + "learning_rate": 0.0001772622223546434, + "loss": 2.6124, + "step": 4457 + }, + { + "epoch": 0.3597772576870309, + "grad_norm": 0.719932496547699, + "learning_rate": 0.00017725219882649807, + "loss": 2.6623, + "step": 4458 + }, + { + "epoch": 0.3598579614236139, + "grad_norm": 0.7650800347328186, + "learning_rate": 0.000177242173373026, + "loss": 2.7551, + "step": 4459 + }, 
+ { + "epoch": 0.3599386651601969, + "grad_norm": 0.7423754930496216, + "learning_rate": 0.0001772321459944771, + "loss": 2.7375, + "step": 4460 + }, + { + "epoch": 0.36001936889677993, + "grad_norm": 0.7602835297584534, + "learning_rate": 0.0001772221166911012, + "loss": 2.7086, + "step": 4461 + }, + { + "epoch": 0.3601000726333629, + "grad_norm": 0.7246943712234497, + "learning_rate": 0.00017721208546314827, + "loss": 2.7068, + "step": 4462 + }, + { + "epoch": 0.36018077636994594, + "grad_norm": 0.715965211391449, + "learning_rate": 0.00017720205231086837, + "loss": 2.689, + "step": 4463 + }, + { + "epoch": 0.3602614801065289, + "grad_norm": 0.7696218490600586, + "learning_rate": 0.00017719201723451151, + "loss": 2.611, + "step": 4464 + }, + { + "epoch": 0.36034218384311195, + "grad_norm": 0.7599236369132996, + "learning_rate": 0.00017718198023432779, + "loss": 2.6504, + "step": 4465 + }, + { + "epoch": 0.36042288757969493, + "grad_norm": 0.7674956321716309, + "learning_rate": 0.0001771719413105674, + "loss": 2.7559, + "step": 4466 + }, + { + "epoch": 0.36050359131627796, + "grad_norm": 0.7263289093971252, + "learning_rate": 0.00017716190046348045, + "loss": 2.6822, + "step": 4467 + }, + { + "epoch": 0.36058429505286094, + "grad_norm": 0.7564195990562439, + "learning_rate": 0.0001771518576933173, + "loss": 2.7319, + "step": 4468 + }, + { + "epoch": 0.36066499878944397, + "grad_norm": 0.7291253805160522, + "learning_rate": 0.00017714181300032813, + "loss": 2.704, + "step": 4469 + }, + { + "epoch": 0.36074570252602695, + "grad_norm": 0.7354169487953186, + "learning_rate": 0.00017713176638476332, + "loss": 2.6344, + "step": 4470 + }, + { + "epoch": 0.36082640626261, + "grad_norm": 0.7104110717773438, + "learning_rate": 0.0001771217178468733, + "loss": 2.665, + "step": 4471 + }, + { + "epoch": 0.36090710999919295, + "grad_norm": 0.6913934350013733, + "learning_rate": 0.00017711166738690847, + "loss": 2.6674, + "step": 4472 + }, + { + "epoch": 0.360987813735776, + 
"grad_norm": 0.7999634742736816, + "learning_rate": 0.0001771016150051193, + "loss": 2.6847, + "step": 4473 + }, + { + "epoch": 0.36106851747235896, + "grad_norm": 0.7878915667533875, + "learning_rate": 0.00017709156070175634, + "loss": 2.7125, + "step": 4474 + }, + { + "epoch": 0.361149221208942, + "grad_norm": 0.7145688533782959, + "learning_rate": 0.00017708150447707017, + "loss": 2.6863, + "step": 4475 + }, + { + "epoch": 0.36122992494552497, + "grad_norm": 0.7518604397773743, + "learning_rate": 0.00017707144633131143, + "loss": 2.6616, + "step": 4476 + }, + { + "epoch": 0.361310628682108, + "grad_norm": 0.735634982585907, + "learning_rate": 0.0001770613862647308, + "loss": 2.6315, + "step": 4477 + }, + { + "epoch": 0.361391332418691, + "grad_norm": 0.7925180196762085, + "learning_rate": 0.00017705132427757895, + "loss": 2.6951, + "step": 4478 + }, + { + "epoch": 0.361472036155274, + "grad_norm": 0.6949547529220581, + "learning_rate": 0.00017704126037010667, + "loss": 2.6934, + "step": 4479 + }, + { + "epoch": 0.361552739891857, + "grad_norm": 0.7233577966690063, + "learning_rate": 0.00017703119454256483, + "loss": 2.6773, + "step": 4480 + }, + { + "epoch": 0.36163344362844, + "grad_norm": 0.7303269505500793, + "learning_rate": 0.00017702112679520424, + "loss": 2.6351, + "step": 4481 + }, + { + "epoch": 0.361714147365023, + "grad_norm": 0.7620660066604614, + "learning_rate": 0.00017701105712827583, + "loss": 2.6748, + "step": 4482 + }, + { + "epoch": 0.36179485110160603, + "grad_norm": 0.7744965553283691, + "learning_rate": 0.00017700098554203057, + "loss": 2.7013, + "step": 4483 + }, + { + "epoch": 0.361875554838189, + "grad_norm": 0.8017357587814331, + "learning_rate": 0.00017699091203671947, + "loss": 2.7273, + "step": 4484 + }, + { + "epoch": 0.36195625857477204, + "grad_norm": 0.8014432191848755, + "learning_rate": 0.0001769808366125936, + "loss": 2.6864, + "step": 4485 + }, + { + "epoch": 0.362036962311355, + "grad_norm": 0.6914888620376587, + 
"learning_rate": 0.00017697075926990406, + "loss": 2.6851, + "step": 4486 + }, + { + "epoch": 0.36211766604793805, + "grad_norm": 0.7472698092460632, + "learning_rate": 0.00017696068000890196, + "loss": 2.695, + "step": 4487 + }, + { + "epoch": 0.362198369784521, + "grad_norm": 0.7506285309791565, + "learning_rate": 0.00017695059882983855, + "loss": 2.7055, + "step": 4488 + }, + { + "epoch": 0.362279073521104, + "grad_norm": 0.7501141428947449, + "learning_rate": 0.00017694051573296507, + "loss": 2.7109, + "step": 4489 + }, + { + "epoch": 0.36235977725768703, + "grad_norm": 0.6654670834541321, + "learning_rate": 0.00017693043071853284, + "loss": 2.6165, + "step": 4490 + }, + { + "epoch": 0.36244048099427, + "grad_norm": 0.7894664406776428, + "learning_rate": 0.00017692034378679315, + "loss": 2.7274, + "step": 4491 + }, + { + "epoch": 0.36252118473085304, + "grad_norm": 0.7206711173057556, + "learning_rate": 0.00017691025493799743, + "loss": 2.7047, + "step": 4492 + }, + { + "epoch": 0.362601888467436, + "grad_norm": 0.7656282186508179, + "learning_rate": 0.00017690016417239708, + "loss": 2.696, + "step": 4493 + }, + { + "epoch": 0.36268259220401905, + "grad_norm": 0.7357437610626221, + "learning_rate": 0.00017689007149024362, + "loss": 2.7279, + "step": 4494 + }, + { + "epoch": 0.362763295940602, + "grad_norm": 0.7262146472930908, + "learning_rate": 0.00017687997689178864, + "loss": 2.6964, + "step": 4495 + }, + { + "epoch": 0.36284399967718506, + "grad_norm": 0.7839891910552979, + "learning_rate": 0.00017686988037728365, + "loss": 2.651, + "step": 4496 + }, + { + "epoch": 0.36292470341376803, + "grad_norm": 0.7150306105613708, + "learning_rate": 0.00017685978194698028, + "loss": 2.6481, + "step": 4497 + }, + { + "epoch": 0.36300540715035107, + "grad_norm": 0.7144685387611389, + "learning_rate": 0.00017684968160113025, + "loss": 2.7169, + "step": 4498 + }, + { + "epoch": 0.36308611088693404, + "grad_norm": 0.7593061327934265, + "learning_rate": 
0.00017683957933998525, + "loss": 2.7543, + "step": 4499 + }, + { + "epoch": 0.3631668146235171, + "grad_norm": 0.7301446199417114, + "learning_rate": 0.00017682947516379707, + "loss": 2.6806, + "step": 4500 + }, + { + "epoch": 0.36324751836010005, + "grad_norm": 0.7314243316650391, + "learning_rate": 0.00017681936907281757, + "loss": 2.7227, + "step": 4501 + }, + { + "epoch": 0.3633282220966831, + "grad_norm": 0.7695817351341248, + "learning_rate": 0.00017680926106729852, + "loss": 2.7229, + "step": 4502 + }, + { + "epoch": 0.36340892583326606, + "grad_norm": 0.6885762810707092, + "learning_rate": 0.00017679915114749198, + "loss": 2.7246, + "step": 4503 + }, + { + "epoch": 0.3634896295698491, + "grad_norm": 0.6893608570098877, + "learning_rate": 0.0001767890393136498, + "loss": 2.6572, + "step": 4504 + }, + { + "epoch": 0.36357033330643207, + "grad_norm": 0.7011978626251221, + "learning_rate": 0.00017677892556602402, + "loss": 2.6775, + "step": 4505 + }, + { + "epoch": 0.3636510370430151, + "grad_norm": 0.6693406105041504, + "learning_rate": 0.00017676880990486672, + "loss": 2.6183, + "step": 4506 + }, + { + "epoch": 0.3637317407795981, + "grad_norm": 0.7023048996925354, + "learning_rate": 0.00017675869233043002, + "loss": 2.6772, + "step": 4507 + }, + { + "epoch": 0.3638124445161811, + "grad_norm": 0.6903806328773499, + "learning_rate": 0.00017674857284296605, + "loss": 2.6486, + "step": 4508 + }, + { + "epoch": 0.3638931482527641, + "grad_norm": 0.6799258589744568, + "learning_rate": 0.000176738451442727, + "loss": 2.6305, + "step": 4509 + }, + { + "epoch": 0.3639738519893471, + "grad_norm": 0.7935682535171509, + "learning_rate": 0.00017672832812996517, + "loss": 2.7365, + "step": 4510 + }, + { + "epoch": 0.3640545557259301, + "grad_norm": 0.7593684196472168, + "learning_rate": 0.00017671820290493284, + "loss": 2.7029, + "step": 4511 + }, + { + "epoch": 0.36413525946251313, + "grad_norm": 0.7185288667678833, + "learning_rate": 0.00017670807576788234, + "loss": 
2.6646, + "step": 4512 + }, + { + "epoch": 0.3642159631990961, + "grad_norm": 0.7260291576385498, + "learning_rate": 0.00017669794671906606, + "loss": 2.6615, + "step": 4513 + }, + { + "epoch": 0.36429666693567914, + "grad_norm": 0.6933417916297913, + "learning_rate": 0.00017668781575873646, + "loss": 2.6678, + "step": 4514 + }, + { + "epoch": 0.3643773706722621, + "grad_norm": 0.7657343149185181, + "learning_rate": 0.00017667768288714603, + "loss": 2.7155, + "step": 4515 + }, + { + "epoch": 0.36445807440884515, + "grad_norm": 0.7326949834823608, + "learning_rate": 0.0001766675481045473, + "loss": 2.732, + "step": 4516 + }, + { + "epoch": 0.3645387781454281, + "grad_norm": 0.7370324730873108, + "learning_rate": 0.0001766574114111929, + "loss": 2.6124, + "step": 4517 + }, + { + "epoch": 0.36461948188201115, + "grad_norm": 0.7280072569847107, + "learning_rate": 0.00017664727280733536, + "loss": 2.6793, + "step": 4518 + }, + { + "epoch": 0.36470018561859413, + "grad_norm": 0.7174237370491028, + "learning_rate": 0.00017663713229322748, + "loss": 2.629, + "step": 4519 + }, + { + "epoch": 0.36478088935517716, + "grad_norm": 0.6660771369934082, + "learning_rate": 0.0001766269898691219, + "loss": 2.6862, + "step": 4520 + }, + { + "epoch": 0.36486159309176014, + "grad_norm": 0.7024446725845337, + "learning_rate": 0.00017661684553527143, + "loss": 2.6602, + "step": 4521 + }, + { + "epoch": 0.36494229682834317, + "grad_norm": 0.7419618964195251, + "learning_rate": 0.0001766066992919289, + "loss": 2.6904, + "step": 4522 + }, + { + "epoch": 0.36502300056492615, + "grad_norm": 0.7425804138183594, + "learning_rate": 0.00017659655113934716, + "loss": 2.7312, + "step": 4523 + }, + { + "epoch": 0.3651037043015092, + "grad_norm": 0.7117013931274414, + "learning_rate": 0.00017658640107777915, + "loss": 2.6411, + "step": 4524 + }, + { + "epoch": 0.36518440803809216, + "grad_norm": 0.719613254070282, + "learning_rate": 0.00017657624910747782, + "loss": 2.6799, + "step": 4525 + }, + { + 
"epoch": 0.3652651117746752, + "grad_norm": 0.7654159665107727, + "learning_rate": 0.0001765660952286962, + "loss": 2.6675, + "step": 4526 + }, + { + "epoch": 0.36534581551125817, + "grad_norm": 0.7111814022064209, + "learning_rate": 0.00017655593944168734, + "loss": 2.6717, + "step": 4527 + }, + { + "epoch": 0.3654265192478412, + "grad_norm": 0.7494712471961975, + "learning_rate": 0.00017654578174670436, + "loss": 2.7181, + "step": 4528 + }, + { + "epoch": 0.3655072229844242, + "grad_norm": 0.8062291145324707, + "learning_rate": 0.0001765356221440004, + "loss": 2.6563, + "step": 4529 + }, + { + "epoch": 0.3655879267210072, + "grad_norm": 0.7923303842544556, + "learning_rate": 0.00017652546063382866, + "loss": 2.6295, + "step": 4530 + }, + { + "epoch": 0.3656686304575902, + "grad_norm": 0.7417340278625488, + "learning_rate": 0.00017651529721644238, + "loss": 2.6727, + "step": 4531 + }, + { + "epoch": 0.3657493341941732, + "grad_norm": 0.7326166033744812, + "learning_rate": 0.0001765051318920949, + "loss": 2.702, + "step": 4532 + }, + { + "epoch": 0.3658300379307562, + "grad_norm": 0.8133745193481445, + "learning_rate": 0.00017649496466103957, + "loss": 2.7157, + "step": 4533 + }, + { + "epoch": 0.3659107416673392, + "grad_norm": 0.710502564907074, + "learning_rate": 0.00017648479552352973, + "loss": 2.6668, + "step": 4534 + }, + { + "epoch": 0.3659914454039222, + "grad_norm": 0.6947012543678284, + "learning_rate": 0.00017647462447981885, + "loss": 2.6865, + "step": 4535 + }, + { + "epoch": 0.36607214914050523, + "grad_norm": 0.8432720899581909, + "learning_rate": 0.0001764644515301604, + "loss": 2.6226, + "step": 4536 + }, + { + "epoch": 0.3661528528770882, + "grad_norm": 0.7321269512176514, + "learning_rate": 0.00017645427667480802, + "loss": 2.662, + "step": 4537 + }, + { + "epoch": 0.36623355661367124, + "grad_norm": 0.8099743723869324, + "learning_rate": 0.00017644409991401515, + "loss": 2.6853, + "step": 4538 + }, + { + "epoch": 0.3663142603502542, + 
"grad_norm": 0.6885355114936829, + "learning_rate": 0.0001764339212480355, + "loss": 2.6672, + "step": 4539 + }, + { + "epoch": 0.3663949640868372, + "grad_norm": 0.911396324634552, + "learning_rate": 0.00017642374067712276, + "loss": 2.5778, + "step": 4540 + }, + { + "epoch": 0.3664756678234202, + "grad_norm": 0.7461941838264465, + "learning_rate": 0.0001764135582015306, + "loss": 2.6629, + "step": 4541 + }, + { + "epoch": 0.3665563715600032, + "grad_norm": 0.772741436958313, + "learning_rate": 0.0001764033738215128, + "loss": 2.725, + "step": 4542 + }, + { + "epoch": 0.36663707529658623, + "grad_norm": 0.7256152629852295, + "learning_rate": 0.0001763931875373232, + "loss": 2.6439, + "step": 4543 + }, + { + "epoch": 0.3667177790331692, + "grad_norm": 0.8089167475700378, + "learning_rate": 0.0001763829993492157, + "loss": 2.5972, + "step": 4544 + }, + { + "epoch": 0.36679848276975224, + "grad_norm": 0.7115232944488525, + "learning_rate": 0.0001763728092574442, + "loss": 2.633, + "step": 4545 + }, + { + "epoch": 0.3668791865063352, + "grad_norm": 0.7189347147941589, + "learning_rate": 0.00017636261726226266, + "loss": 2.619, + "step": 4546 + }, + { + "epoch": 0.36695989024291825, + "grad_norm": 0.7667742967605591, + "learning_rate": 0.00017635242336392506, + "loss": 2.667, + "step": 4547 + }, + { + "epoch": 0.36704059397950123, + "grad_norm": 0.7982457876205444, + "learning_rate": 0.00017634222756268545, + "loss": 2.6667, + "step": 4548 + }, + { + "epoch": 0.36712129771608426, + "grad_norm": 0.7465574145317078, + "learning_rate": 0.00017633202985879804, + "loss": 2.6436, + "step": 4549 + }, + { + "epoch": 0.36720200145266724, + "grad_norm": 0.7297804951667786, + "learning_rate": 0.00017632183025251686, + "loss": 2.6464, + "step": 4550 + }, + { + "epoch": 0.36728270518925027, + "grad_norm": 0.6885054111480713, + "learning_rate": 0.0001763116287440962, + "loss": 2.6742, + "step": 4551 + }, + { + "epoch": 0.36736340892583325, + "grad_norm": 0.7341574430465698, + 
"learning_rate": 0.00017630142533379023, + "loss": 2.6688, + "step": 4552 + }, + { + "epoch": 0.3674441126624163, + "grad_norm": 0.8565430045127869, + "learning_rate": 0.0001762912200218533, + "loss": 2.6889, + "step": 4553 + }, + { + "epoch": 0.36752481639899925, + "grad_norm": 0.7509489059448242, + "learning_rate": 0.00017628101280853974, + "loss": 2.6177, + "step": 4554 + }, + { + "epoch": 0.3676055201355823, + "grad_norm": 0.8128334879875183, + "learning_rate": 0.00017627080369410396, + "loss": 2.7301, + "step": 4555 + }, + { + "epoch": 0.36768622387216526, + "grad_norm": 0.7511637210845947, + "learning_rate": 0.00017626059267880035, + "loss": 2.7327, + "step": 4556 + }, + { + "epoch": 0.3677669276087483, + "grad_norm": 0.8350822925567627, + "learning_rate": 0.00017625037976288347, + "loss": 2.6073, + "step": 4557 + }, + { + "epoch": 0.36784763134533127, + "grad_norm": 0.7743313312530518, + "learning_rate": 0.00017624016494660776, + "loss": 2.7055, + "step": 4558 + }, + { + "epoch": 0.3679283350819143, + "grad_norm": 0.8196439146995544, + "learning_rate": 0.00017622994823022787, + "loss": 2.6565, + "step": 4559 + }, + { + "epoch": 0.3680090388184973, + "grad_norm": 0.7223393321037292, + "learning_rate": 0.00017621972961399837, + "loss": 2.68, + "step": 4560 + }, + { + "epoch": 0.3680897425550803, + "grad_norm": 0.7215418219566345, + "learning_rate": 0.000176209509098174, + "loss": 2.6627, + "step": 4561 + }, + { + "epoch": 0.3681704462916633, + "grad_norm": 0.8050473928451538, + "learning_rate": 0.00017619928668300946, + "loss": 2.5802, + "step": 4562 + }, + { + "epoch": 0.3682511500282463, + "grad_norm": 0.7452750205993652, + "learning_rate": 0.00017618906236875948, + "loss": 2.6524, + "step": 4563 + }, + { + "epoch": 0.3683318537648293, + "grad_norm": 0.7950742244720459, + "learning_rate": 0.00017617883615567888, + "loss": 2.6371, + "step": 4564 + }, + { + "epoch": 0.36841255750141233, + "grad_norm": 0.7185397744178772, + "learning_rate": 
0.00017616860804402261, + "loss": 2.6531, + "step": 4565 + }, + { + "epoch": 0.3684932612379953, + "grad_norm": 0.7480553388595581, + "learning_rate": 0.0001761583780340455, + "loss": 2.6727, + "step": 4566 + }, + { + "epoch": 0.36857396497457834, + "grad_norm": 0.7740724086761475, + "learning_rate": 0.00017614814612600251, + "loss": 2.6095, + "step": 4567 + }, + { + "epoch": 0.3686546687111613, + "grad_norm": 0.9159810543060303, + "learning_rate": 0.00017613791232014866, + "loss": 2.7039, + "step": 4568 + }, + { + "epoch": 0.36873537244774435, + "grad_norm": 0.7478305697441101, + "learning_rate": 0.00017612767661673905, + "loss": 2.6307, + "step": 4569 + }, + { + "epoch": 0.3688160761843273, + "grad_norm": 0.9154726266860962, + "learning_rate": 0.00017611743901602874, + "loss": 2.675, + "step": 4570 + }, + { + "epoch": 0.36889677992091036, + "grad_norm": 0.7903287410736084, + "learning_rate": 0.0001761071995182728, + "loss": 2.6938, + "step": 4571 + }, + { + "epoch": 0.36897748365749333, + "grad_norm": 0.7919119596481323, + "learning_rate": 0.0001760969581237266, + "loss": 2.7092, + "step": 4572 + }, + { + "epoch": 0.36905818739407636, + "grad_norm": 0.8052253723144531, + "learning_rate": 0.00017608671483264522, + "loss": 2.6914, + "step": 4573 + }, + { + "epoch": 0.36913889113065934, + "grad_norm": 0.7660435438156128, + "learning_rate": 0.00017607646964528403, + "loss": 2.674, + "step": 4574 + }, + { + "epoch": 0.3692195948672424, + "grad_norm": 0.8554383516311646, + "learning_rate": 0.00017606622256189836, + "loss": 2.6792, + "step": 4575 + }, + { + "epoch": 0.36930029860382535, + "grad_norm": 0.7719140648841858, + "learning_rate": 0.00017605597358274358, + "loss": 2.6836, + "step": 4576 + }, + { + "epoch": 0.3693810023404084, + "grad_norm": 0.733068585395813, + "learning_rate": 0.00017604572270807513, + "loss": 2.6496, + "step": 4577 + }, + { + "epoch": 0.36946170607699136, + "grad_norm": 0.7622445225715637, + "learning_rate": 0.00017603546993814849, + "loss": 
2.7097, + "step": 4578 + }, + { + "epoch": 0.3695424098135744, + "grad_norm": 0.7326679825782776, + "learning_rate": 0.00017602521527321913, + "loss": 2.6786, + "step": 4579 + }, + { + "epoch": 0.36962311355015737, + "grad_norm": 0.7579432129859924, + "learning_rate": 0.00017601495871354272, + "loss": 2.6618, + "step": 4580 + }, + { + "epoch": 0.3697038172867404, + "grad_norm": 0.8812715411186218, + "learning_rate": 0.00017600470025937485, + "loss": 2.6942, + "step": 4581 + }, + { + "epoch": 0.3697845210233234, + "grad_norm": 0.7230449318885803, + "learning_rate": 0.00017599443991097116, + "loss": 2.6374, + "step": 4582 + }, + { + "epoch": 0.3698652247599064, + "grad_norm": 0.8347739577293396, + "learning_rate": 0.00017598417766858735, + "loss": 2.6653, + "step": 4583 + }, + { + "epoch": 0.3699459284964894, + "grad_norm": 0.7826598882675171, + "learning_rate": 0.0001759739135324792, + "loss": 2.6342, + "step": 4584 + }, + { + "epoch": 0.3700266322330724, + "grad_norm": 0.749060332775116, + "learning_rate": 0.00017596364750290254, + "loss": 2.7256, + "step": 4585 + }, + { + "epoch": 0.3701073359696554, + "grad_norm": 0.7470815181732178, + "learning_rate": 0.00017595337958011323, + "loss": 2.6485, + "step": 4586 + }, + { + "epoch": 0.3701880397062384, + "grad_norm": 0.7251530289649963, + "learning_rate": 0.00017594310976436716, + "loss": 2.6613, + "step": 4587 + }, + { + "epoch": 0.3702687434428214, + "grad_norm": 0.7143718004226685, + "learning_rate": 0.00017593283805592027, + "loss": 2.6101, + "step": 4588 + }, + { + "epoch": 0.37034944717940443, + "grad_norm": 0.7378203272819519, + "learning_rate": 0.00017592256445502855, + "loss": 2.6735, + "step": 4589 + }, + { + "epoch": 0.3704301509159874, + "grad_norm": 0.7193629741668701, + "learning_rate": 0.00017591228896194808, + "loss": 2.719, + "step": 4590 + }, + { + "epoch": 0.3705108546525704, + "grad_norm": 0.7377258539199829, + "learning_rate": 0.00017590201157693494, + "loss": 2.6789, + "step": 4591 + }, + { + 
"epoch": 0.3705915583891534, + "grad_norm": 0.7468351721763611, + "learning_rate": 0.00017589173230024522, + "loss": 2.6389, + "step": 4592 + }, + { + "epoch": 0.3706722621257364, + "grad_norm": 0.7612246870994568, + "learning_rate": 0.0001758814511321352, + "loss": 2.7045, + "step": 4593 + }, + { + "epoch": 0.37075296586231943, + "grad_norm": 0.7603838443756104, + "learning_rate": 0.00017587116807286102, + "loss": 2.7323, + "step": 4594 + }, + { + "epoch": 0.3708336695989024, + "grad_norm": 0.7436477541923523, + "learning_rate": 0.000175860883122679, + "loss": 2.7331, + "step": 4595 + }, + { + "epoch": 0.37091437333548544, + "grad_norm": 0.7004369497299194, + "learning_rate": 0.0001758505962818455, + "loss": 2.6418, + "step": 4596 + }, + { + "epoch": 0.3709950770720684, + "grad_norm": 0.711980938911438, + "learning_rate": 0.00017584030755061683, + "loss": 2.6184, + "step": 4597 + }, + { + "epoch": 0.37107578080865145, + "grad_norm": 0.6999367475509644, + "learning_rate": 0.0001758300169292495, + "loss": 2.6584, + "step": 4598 + }, + { + "epoch": 0.3711564845452344, + "grad_norm": 0.6755785942077637, + "learning_rate": 0.0001758197244179999, + "loss": 2.664, + "step": 4599 + }, + { + "epoch": 0.37123718828181745, + "grad_norm": 0.7174055576324463, + "learning_rate": 0.00017580943001712455, + "loss": 2.6821, + "step": 4600 + }, + { + "epoch": 0.37131789201840043, + "grad_norm": 0.8218933343887329, + "learning_rate": 0.00017579913372688005, + "loss": 2.6355, + "step": 4601 + }, + { + "epoch": 0.37139859575498346, + "grad_norm": 0.7417960166931152, + "learning_rate": 0.000175788835547523, + "loss": 2.7226, + "step": 4602 + }, + { + "epoch": 0.37147929949156644, + "grad_norm": 0.824421763420105, + "learning_rate": 0.00017577853547931006, + "loss": 2.6526, + "step": 4603 + }, + { + "epoch": 0.37156000322814947, + "grad_norm": 0.7391949892044067, + "learning_rate": 0.00017576823352249794, + "loss": 2.6702, + "step": 4604 + }, + { + "epoch": 0.37164070696473245, + 
"grad_norm": 0.7890247106552124, + "learning_rate": 0.00017575792967734337, + "loss": 2.7281, + "step": 4605 + }, + { + "epoch": 0.3717214107013155, + "grad_norm": 0.785527765750885, + "learning_rate": 0.00017574762394410317, + "loss": 2.6728, + "step": 4606 + }, + { + "epoch": 0.37180211443789846, + "grad_norm": 0.7195863127708435, + "learning_rate": 0.00017573731632303415, + "loss": 2.6329, + "step": 4607 + }, + { + "epoch": 0.3718828181744815, + "grad_norm": 0.7896780371665955, + "learning_rate": 0.0001757270068143932, + "loss": 2.6776, + "step": 4608 + }, + { + "epoch": 0.37196352191106447, + "grad_norm": 0.7568275332450867, + "learning_rate": 0.00017571669541843735, + "loss": 2.6668, + "step": 4609 + }, + { + "epoch": 0.3720442256476475, + "grad_norm": 0.7923939228057861, + "learning_rate": 0.00017570638213542348, + "loss": 2.7033, + "step": 4610 + }, + { + "epoch": 0.3721249293842305, + "grad_norm": 0.7586569786071777, + "learning_rate": 0.00017569606696560868, + "loss": 2.7286, + "step": 4611 + }, + { + "epoch": 0.3722056331208135, + "grad_norm": 0.8222009539604187, + "learning_rate": 0.00017568574990925004, + "loss": 2.6448, + "step": 4612 + }, + { + "epoch": 0.3722863368573965, + "grad_norm": 0.7144019603729248, + "learning_rate": 0.00017567543096660466, + "loss": 2.6671, + "step": 4613 + }, + { + "epoch": 0.3723670405939795, + "grad_norm": 0.7602240443229675, + "learning_rate": 0.00017566511013792973, + "loss": 2.6492, + "step": 4614 + }, + { + "epoch": 0.3724477443305625, + "grad_norm": 0.7949689626693726, + "learning_rate": 0.00017565478742348245, + "loss": 2.7002, + "step": 4615 + }, + { + "epoch": 0.3725284480671455, + "grad_norm": 0.6922519207000732, + "learning_rate": 0.00017564446282352012, + "loss": 2.6917, + "step": 4616 + }, + { + "epoch": 0.3726091518037285, + "grad_norm": 0.7382915616035461, + "learning_rate": 0.0001756341363383, + "loss": 2.6375, + "step": 4617 + }, + { + "epoch": 0.37268985554031153, + "grad_norm": 0.7511888742446899, + 
"learning_rate": 0.00017562380796807956, + "loss": 2.6823, + "step": 4618 + }, + { + "epoch": 0.3727705592768945, + "grad_norm": 0.7273457646369934, + "learning_rate": 0.00017561347771311608, + "loss": 2.6124, + "step": 4619 + }, + { + "epoch": 0.37285126301347754, + "grad_norm": 0.689440131187439, + "learning_rate": 0.0001756031455736671, + "loss": 2.6931, + "step": 4620 + }, + { + "epoch": 0.3729319667500605, + "grad_norm": 0.7755659222602844, + "learning_rate": 0.00017559281154999013, + "loss": 2.6273, + "step": 4621 + }, + { + "epoch": 0.37301267048664355, + "grad_norm": 0.6940193176269531, + "learning_rate": 0.00017558247564234265, + "loss": 2.641, + "step": 4622 + }, + { + "epoch": 0.3730933742232265, + "grad_norm": 0.7387529015541077, + "learning_rate": 0.00017557213785098232, + "loss": 2.7229, + "step": 4623 + }, + { + "epoch": 0.37317407795980956, + "grad_norm": 0.6807727217674255, + "learning_rate": 0.00017556179817616678, + "loss": 2.6469, + "step": 4624 + }, + { + "epoch": 0.37325478169639253, + "grad_norm": 0.7203819751739502, + "learning_rate": 0.0001755514566181537, + "loss": 2.6239, + "step": 4625 + }, + { + "epoch": 0.37333548543297557, + "grad_norm": 0.9345876574516296, + "learning_rate": 0.0001755411131772008, + "loss": 2.7154, + "step": 4626 + }, + { + "epoch": 0.37341618916955854, + "grad_norm": 0.6787357330322266, + "learning_rate": 0.00017553076785356594, + "loss": 2.6374, + "step": 4627 + }, + { + "epoch": 0.3734968929061416, + "grad_norm": 0.7153670191764832, + "learning_rate": 0.0001755204206475069, + "loss": 2.6734, + "step": 4628 + }, + { + "epoch": 0.37357759664272455, + "grad_norm": 0.736464262008667, + "learning_rate": 0.00017551007155928154, + "loss": 2.7241, + "step": 4629 + }, + { + "epoch": 0.3736583003793076, + "grad_norm": 0.7134939432144165, + "learning_rate": 0.0001754997205891478, + "loss": 2.682, + "step": 4630 + }, + { + "epoch": 0.37373900411589056, + "grad_norm": 0.7071199417114258, + "learning_rate": 
0.0001754893677373637, + "loss": 2.7361, + "step": 4631 + }, + { + "epoch": 0.3738197078524736, + "grad_norm": 0.7040621638298035, + "learning_rate": 0.00017547901300418722, + "loss": 2.7031, + "step": 4632 + }, + { + "epoch": 0.37390041158905657, + "grad_norm": 0.7179287075996399, + "learning_rate": 0.00017546865638987642, + "loss": 2.6755, + "step": 4633 + }, + { + "epoch": 0.3739811153256396, + "grad_norm": 0.7579259276390076, + "learning_rate": 0.00017545829789468944, + "loss": 2.6514, + "step": 4634 + }, + { + "epoch": 0.3740618190622226, + "grad_norm": 0.7825835347175598, + "learning_rate": 0.0001754479375188844, + "loss": 2.6876, + "step": 4635 + }, + { + "epoch": 0.3741425227988056, + "grad_norm": 0.7913421988487244, + "learning_rate": 0.00017543757526271956, + "loss": 2.7153, + "step": 4636 + }, + { + "epoch": 0.3742232265353886, + "grad_norm": 0.7766042947769165, + "learning_rate": 0.00017542721112645313, + "loss": 2.645, + "step": 4637 + }, + { + "epoch": 0.3743039302719716, + "grad_norm": 0.7363953590393066, + "learning_rate": 0.00017541684511034343, + "loss": 2.6376, + "step": 4638 + }, + { + "epoch": 0.3743846340085546, + "grad_norm": 0.6928617358207703, + "learning_rate": 0.00017540647721464881, + "loss": 2.6882, + "step": 4639 + }, + { + "epoch": 0.3744653377451376, + "grad_norm": 0.7832257747650146, + "learning_rate": 0.0001753961074396277, + "loss": 2.7305, + "step": 4640 + }, + { + "epoch": 0.3745460414817206, + "grad_norm": 0.7180350422859192, + "learning_rate": 0.00017538573578553844, + "loss": 2.6783, + "step": 4641 + }, + { + "epoch": 0.3746267452183036, + "grad_norm": 0.718209981918335, + "learning_rate": 0.00017537536225263964, + "loss": 2.6961, + "step": 4642 + }, + { + "epoch": 0.3747074489548866, + "grad_norm": 0.7056655287742615, + "learning_rate": 0.00017536498684118975, + "loss": 2.7096, + "step": 4643 + }, + { + "epoch": 0.3747881526914696, + "grad_norm": 0.8004828691482544, + "learning_rate": 0.0001753546095514474, + "loss": 2.7168, 
+ "step": 4644 + }, + { + "epoch": 0.3748688564280526, + "grad_norm": 0.7630821466445923, + "learning_rate": 0.0001753442303836712, + "loss": 2.7091, + "step": 4645 + }, + { + "epoch": 0.3749495601646356, + "grad_norm": 0.7539668083190918, + "learning_rate": 0.0001753338493381198, + "loss": 2.651, + "step": 4646 + }, + { + "epoch": 0.37503026390121863, + "grad_norm": 0.7243319749832153, + "learning_rate": 0.000175323466415052, + "loss": 2.6765, + "step": 4647 + }, + { + "epoch": 0.3751109676378016, + "grad_norm": 0.8906281590461731, + "learning_rate": 0.00017531308161472647, + "loss": 2.5938, + "step": 4648 + }, + { + "epoch": 0.37519167137438464, + "grad_norm": 0.787966251373291, + "learning_rate": 0.0001753026949374021, + "loss": 2.6011, + "step": 4649 + }, + { + "epoch": 0.3752723751109676, + "grad_norm": 0.7763915061950684, + "learning_rate": 0.00017529230638333772, + "loss": 2.7197, + "step": 4650 + }, + { + "epoch": 0.37535307884755065, + "grad_norm": 0.7717103362083435, + "learning_rate": 0.00017528191595279224, + "loss": 2.6605, + "step": 4651 + }, + { + "epoch": 0.3754337825841336, + "grad_norm": 0.7340055108070374, + "learning_rate": 0.00017527152364602464, + "loss": 2.6856, + "step": 4652 + }, + { + "epoch": 0.37551448632071666, + "grad_norm": 0.7805169820785522, + "learning_rate": 0.0001752611294632939, + "loss": 2.7088, + "step": 4653 + }, + { + "epoch": 0.37559519005729963, + "grad_norm": 0.7894891500473022, + "learning_rate": 0.00017525073340485912, + "loss": 2.6691, + "step": 4654 + }, + { + "epoch": 0.37567589379388266, + "grad_norm": 0.7627872824668884, + "learning_rate": 0.0001752403354709793, + "loss": 2.6536, + "step": 4655 + }, + { + "epoch": 0.37575659753046564, + "grad_norm": 0.8097225427627563, + "learning_rate": 0.00017522993566191367, + "loss": 2.7108, + "step": 4656 + }, + { + "epoch": 0.3758373012670487, + "grad_norm": 0.834449827671051, + "learning_rate": 0.00017521953397792137, + "loss": 2.7565, + "step": 4657 + }, + { + "epoch": 
0.37591800500363165, + "grad_norm": 0.7924147844314575, + "learning_rate": 0.00017520913041926166, + "loss": 2.7101, + "step": 4658 + }, + { + "epoch": 0.3759987087402147, + "grad_norm": 0.7407249808311462, + "learning_rate": 0.00017519872498619385, + "loss": 2.6501, + "step": 4659 + }, + { + "epoch": 0.37607941247679766, + "grad_norm": 0.7251791954040527, + "learning_rate": 0.0001751883176789772, + "loss": 2.6786, + "step": 4660 + }, + { + "epoch": 0.3761601162133807, + "grad_norm": 0.7120431661605835, + "learning_rate": 0.00017517790849787116, + "loss": 2.7244, + "step": 4661 + }, + { + "epoch": 0.37624081994996367, + "grad_norm": 0.724836528301239, + "learning_rate": 0.00017516749744313513, + "loss": 2.7099, + "step": 4662 + }, + { + "epoch": 0.3763215236865467, + "grad_norm": 0.7788939476013184, + "learning_rate": 0.00017515708451502855, + "loss": 2.6206, + "step": 4663 + }, + { + "epoch": 0.3764022274231297, + "grad_norm": 0.7518914341926575, + "learning_rate": 0.00017514666971381099, + "loss": 2.7505, + "step": 4664 + }, + { + "epoch": 0.3764829311597127, + "grad_norm": 0.8004730939865112, + "learning_rate": 0.00017513625303974194, + "loss": 2.6119, + "step": 4665 + }, + { + "epoch": 0.3765636348962957, + "grad_norm": 0.7661109566688538, + "learning_rate": 0.00017512583449308107, + "loss": 2.724, + "step": 4666 + }, + { + "epoch": 0.3766443386328787, + "grad_norm": 0.7669692635536194, + "learning_rate": 0.00017511541407408805, + "loss": 2.7109, + "step": 4667 + }, + { + "epoch": 0.3767250423694617, + "grad_norm": 0.738608181476593, + "learning_rate": 0.00017510499178302253, + "loss": 2.6642, + "step": 4668 + }, + { + "epoch": 0.3768057461060447, + "grad_norm": 0.7194661498069763, + "learning_rate": 0.00017509456762014432, + "loss": 2.6906, + "step": 4669 + }, + { + "epoch": 0.3768864498426277, + "grad_norm": 0.7025040984153748, + "learning_rate": 0.00017508414158571314, + "loss": 2.6596, + "step": 4670 + }, + { + "epoch": 0.37696715357921073, + "grad_norm": 
0.7756575345993042, + "learning_rate": 0.00017507371367998892, + "loss": 2.7114, + "step": 4671 + }, + { + "epoch": 0.3770478573157937, + "grad_norm": 0.834966778755188, + "learning_rate": 0.00017506328390323148, + "loss": 2.7554, + "step": 4672 + }, + { + "epoch": 0.37712856105237674, + "grad_norm": 0.6997280120849609, + "learning_rate": 0.0001750528522557008, + "loss": 2.6285, + "step": 4673 + }, + { + "epoch": 0.3772092647889597, + "grad_norm": 0.7101716995239258, + "learning_rate": 0.0001750424187376569, + "loss": 2.6465, + "step": 4674 + }, + { + "epoch": 0.37728996852554275, + "grad_norm": 0.6577222347259521, + "learning_rate": 0.0001750319833493597, + "loss": 2.6372, + "step": 4675 + }, + { + "epoch": 0.37737067226212573, + "grad_norm": 0.7402529120445251, + "learning_rate": 0.00017502154609106937, + "loss": 2.6464, + "step": 4676 + }, + { + "epoch": 0.37745137599870876, + "grad_norm": 0.6858490705490112, + "learning_rate": 0.00017501110696304596, + "loss": 2.6141, + "step": 4677 + }, + { + "epoch": 0.37753207973529174, + "grad_norm": 0.729468822479248, + "learning_rate": 0.0001750006659655497, + "loss": 2.6671, + "step": 4678 + }, + { + "epoch": 0.37761278347187477, + "grad_norm": 0.7197559475898743, + "learning_rate": 0.0001749902230988408, + "loss": 2.6462, + "step": 4679 + }, + { + "epoch": 0.37769348720845775, + "grad_norm": 0.7171144485473633, + "learning_rate": 0.00017497977836317957, + "loss": 2.6427, + "step": 4680 + }, + { + "epoch": 0.3777741909450408, + "grad_norm": 0.7423805594444275, + "learning_rate": 0.00017496933175882617, + "loss": 2.662, + "step": 4681 + }, + { + "epoch": 0.37785489468162375, + "grad_norm": 0.7498061060905457, + "learning_rate": 0.0001749588832860411, + "loss": 2.6243, + "step": 4682 + }, + { + "epoch": 0.3779355984182068, + "grad_norm": 0.7706165909767151, + "learning_rate": 0.0001749484329450847, + "loss": 2.6928, + "step": 4683 + }, + { + "epoch": 0.37801630215478976, + "grad_norm": 0.723363995552063, + "learning_rate": 
0.00017493798073621745, + "loss": 2.6787, + "step": 4684 + }, + { + "epoch": 0.3780970058913728, + "grad_norm": 0.7444875836372375, + "learning_rate": 0.00017492752665969983, + "loss": 2.6789, + "step": 4685 + }, + { + "epoch": 0.37817770962795577, + "grad_norm": 0.6946491599082947, + "learning_rate": 0.00017491707071579237, + "loss": 2.6761, + "step": 4686 + }, + { + "epoch": 0.3782584133645388, + "grad_norm": 0.7171412706375122, + "learning_rate": 0.00017490661290475568, + "loss": 2.6788, + "step": 4687 + }, + { + "epoch": 0.3783391171011218, + "grad_norm": 0.7503272891044617, + "learning_rate": 0.00017489615322685038, + "loss": 2.7057, + "step": 4688 + }, + { + "epoch": 0.3784198208377048, + "grad_norm": 0.7458747625350952, + "learning_rate": 0.00017488569168233714, + "loss": 2.6857, + "step": 4689 + }, + { + "epoch": 0.3785005245742878, + "grad_norm": 0.7030516266822815, + "learning_rate": 0.0001748752282714768, + "loss": 2.6522, + "step": 4690 + }, + { + "epoch": 0.3785812283108708, + "grad_norm": 0.7717545628547668, + "learning_rate": 0.00017486476299452994, + "loss": 2.6527, + "step": 4691 + }, + { + "epoch": 0.3786619320474538, + "grad_norm": 0.6788322925567627, + "learning_rate": 0.0001748542958517575, + "loss": 2.6362, + "step": 4692 + }, + { + "epoch": 0.3787426357840368, + "grad_norm": 0.8518630266189575, + "learning_rate": 0.0001748438268434204, + "loss": 2.6812, + "step": 4693 + }, + { + "epoch": 0.3788233395206198, + "grad_norm": 0.7167141437530518, + "learning_rate": 0.00017483335596977945, + "loss": 2.6414, + "step": 4694 + }, + { + "epoch": 0.3789040432572028, + "grad_norm": 0.7748053073883057, + "learning_rate": 0.00017482288323109567, + "loss": 2.7291, + "step": 4695 + }, + { + "epoch": 0.3789847469937858, + "grad_norm": 0.7203041911125183, + "learning_rate": 0.00017481240862763002, + "loss": 2.6957, + "step": 4696 + }, + { + "epoch": 0.3790654507303688, + "grad_norm": 0.7973119020462036, + "learning_rate": 0.00017480193215964362, + "loss": 
2.7456, + "step": 4697 + }, + { + "epoch": 0.3791461544669518, + "grad_norm": 0.7851223945617676, + "learning_rate": 0.00017479145382739755, + "loss": 2.6525, + "step": 4698 + }, + { + "epoch": 0.3792268582035348, + "grad_norm": 0.7012068629264832, + "learning_rate": 0.0001747809736311529, + "loss": 2.6662, + "step": 4699 + }, + { + "epoch": 0.37930756194011783, + "grad_norm": 0.7266128659248352, + "learning_rate": 0.00017477049157117093, + "loss": 2.5853, + "step": 4700 + }, + { + "epoch": 0.3793882656767008, + "grad_norm": 0.7264416217803955, + "learning_rate": 0.00017476000764771285, + "loss": 2.6972, + "step": 4701 + }, + { + "epoch": 0.37946896941328384, + "grad_norm": 0.797709047794342, + "learning_rate": 0.00017474952186103995, + "loss": 2.6997, + "step": 4702 + }, + { + "epoch": 0.3795496731498668, + "grad_norm": 0.7552568912506104, + "learning_rate": 0.00017473903421141358, + "loss": 2.7178, + "step": 4703 + }, + { + "epoch": 0.37963037688644985, + "grad_norm": 0.7611108422279358, + "learning_rate": 0.0001747285446990951, + "loss": 2.6997, + "step": 4704 + }, + { + "epoch": 0.3797110806230328, + "grad_norm": 0.8081753253936768, + "learning_rate": 0.00017471805332434595, + "loss": 2.7242, + "step": 4705 + }, + { + "epoch": 0.37979178435961586, + "grad_norm": 0.728301465511322, + "learning_rate": 0.0001747075600874276, + "loss": 2.5885, + "step": 4706 + }, + { + "epoch": 0.37987248809619883, + "grad_norm": 0.7548539638519287, + "learning_rate": 0.00017469706498860155, + "loss": 2.7038, + "step": 4707 + }, + { + "epoch": 0.37995319183278187, + "grad_norm": 0.7054354548454285, + "learning_rate": 0.00017468656802812938, + "loss": 2.6566, + "step": 4708 + }, + { + "epoch": 0.38003389556936484, + "grad_norm": 0.7231585383415222, + "learning_rate": 0.0001746760692062727, + "loss": 2.6564, + "step": 4709 + }, + { + "epoch": 0.3801145993059479, + "grad_norm": 0.6931934952735901, + "learning_rate": 0.00017466556852329318, + "loss": 2.6403, + "step": 4710 + }, + { + 
"epoch": 0.38019530304253085, + "grad_norm": 0.7882393598556519, + "learning_rate": 0.00017465506597945255, + "loss": 2.6337, + "step": 4711 + }, + { + "epoch": 0.3802760067791139, + "grad_norm": 0.7015109658241272, + "learning_rate": 0.0001746445615750125, + "loss": 2.6742, + "step": 4712 + }, + { + "epoch": 0.38035671051569686, + "grad_norm": 0.7653505802154541, + "learning_rate": 0.0001746340553102348, + "loss": 2.6742, + "step": 4713 + }, + { + "epoch": 0.3804374142522799, + "grad_norm": 0.7166270613670349, + "learning_rate": 0.0001746235471853814, + "loss": 2.5995, + "step": 4714 + }, + { + "epoch": 0.38051811798886287, + "grad_norm": 0.7612236738204956, + "learning_rate": 0.0001746130372007141, + "loss": 2.7595, + "step": 4715 + }, + { + "epoch": 0.3805988217254459, + "grad_norm": 0.6783852577209473, + "learning_rate": 0.00017460252535649493, + "loss": 2.6156, + "step": 4716 + }, + { + "epoch": 0.3806795254620289, + "grad_norm": 0.7495827078819275, + "learning_rate": 0.00017459201165298578, + "loss": 2.6847, + "step": 4717 + }, + { + "epoch": 0.3807602291986119, + "grad_norm": 0.814798891544342, + "learning_rate": 0.0001745814960904487, + "loss": 2.6211, + "step": 4718 + }, + { + "epoch": 0.3808409329351949, + "grad_norm": 0.7541367411613464, + "learning_rate": 0.0001745709786691458, + "loss": 2.6214, + "step": 4719 + }, + { + "epoch": 0.3809216366717779, + "grad_norm": 0.7065702676773071, + "learning_rate": 0.00017456045938933921, + "loss": 2.6699, + "step": 4720 + }, + { + "epoch": 0.3810023404083609, + "grad_norm": 0.751960813999176, + "learning_rate": 0.000174549938251291, + "loss": 2.6085, + "step": 4721 + }, + { + "epoch": 0.3810830441449439, + "grad_norm": 0.72068190574646, + "learning_rate": 0.00017453941525526353, + "loss": 2.6201, + "step": 4722 + }, + { + "epoch": 0.3811637478815269, + "grad_norm": 0.7201167941093445, + "learning_rate": 0.00017452889040151892, + "loss": 2.6775, + "step": 4723 + }, + { + "epoch": 0.38124445161810994, + "grad_norm": 
0.7904958128929138, + "learning_rate": 0.00017451836369031956, + "loss": 2.7217, + "step": 4724 + }, + { + "epoch": 0.3813251553546929, + "grad_norm": 0.7096366882324219, + "learning_rate": 0.0001745078351219278, + "loss": 2.7004, + "step": 4725 + }, + { + "epoch": 0.38140585909127594, + "grad_norm": 0.6812441945075989, + "learning_rate": 0.00017449730469660602, + "loss": 2.6555, + "step": 4726 + }, + { + "epoch": 0.3814865628278589, + "grad_norm": 0.8037428855895996, + "learning_rate": 0.00017448677241461665, + "loss": 2.7094, + "step": 4727 + }, + { + "epoch": 0.38156726656444195, + "grad_norm": 0.7282679677009583, + "learning_rate": 0.00017447623827622223, + "loss": 2.6699, + "step": 4728 + }, + { + "epoch": 0.38164797030102493, + "grad_norm": 0.745705783367157, + "learning_rate": 0.00017446570228168523, + "loss": 2.6098, + "step": 4729 + }, + { + "epoch": 0.38172867403760796, + "grad_norm": 0.7098714113235474, + "learning_rate": 0.00017445516443126828, + "loss": 2.6628, + "step": 4730 + }, + { + "epoch": 0.38180937777419094, + "grad_norm": 0.7376620769500732, + "learning_rate": 0.00017444462472523405, + "loss": 2.7086, + "step": 4731 + }, + { + "epoch": 0.38189008151077397, + "grad_norm": 0.717800498008728, + "learning_rate": 0.00017443408316384512, + "loss": 2.6582, + "step": 4732 + }, + { + "epoch": 0.38197078524735695, + "grad_norm": 0.7061530947685242, + "learning_rate": 0.00017442353974736428, + "loss": 2.6817, + "step": 4733 + }, + { + "epoch": 0.38205148898394, + "grad_norm": 0.744667112827301, + "learning_rate": 0.0001744129944760543, + "loss": 2.6649, + "step": 4734 + }, + { + "epoch": 0.38213219272052296, + "grad_norm": 0.7302529215812683, + "learning_rate": 0.00017440244735017797, + "loss": 2.7313, + "step": 4735 + }, + { + "epoch": 0.382212896457106, + "grad_norm": 0.6845258474349976, + "learning_rate": 0.00017439189836999816, + "loss": 2.637, + "step": 4736 + }, + { + "epoch": 0.38229360019368896, + "grad_norm": 0.7060490250587463, + 
"learning_rate": 0.0001743813475357778, + "loss": 2.6674, + "step": 4737 + }, + { + "epoch": 0.382374303930272, + "grad_norm": 0.7146841287612915, + "learning_rate": 0.00017437079484777977, + "loss": 2.6607, + "step": 4738 + }, + { + "epoch": 0.382455007666855, + "grad_norm": 0.7107662558555603, + "learning_rate": 0.00017436024030626719, + "loss": 2.6777, + "step": 4739 + }, + { + "epoch": 0.382535711403438, + "grad_norm": 0.7356777191162109, + "learning_rate": 0.00017434968391150303, + "loss": 2.5801, + "step": 4740 + }, + { + "epoch": 0.382616415140021, + "grad_norm": 0.6839054226875305, + "learning_rate": 0.00017433912566375037, + "loss": 2.6319, + "step": 4741 + }, + { + "epoch": 0.382697118876604, + "grad_norm": 0.7049627900123596, + "learning_rate": 0.00017432856556327236, + "loss": 2.741, + "step": 4742 + }, + { + "epoch": 0.382777822613187, + "grad_norm": 0.7926551103591919, + "learning_rate": 0.00017431800361033224, + "loss": 2.64, + "step": 4743 + }, + { + "epoch": 0.38285852634976997, + "grad_norm": 0.734272301197052, + "learning_rate": 0.0001743074398051932, + "loss": 2.6575, + "step": 4744 + }, + { + "epoch": 0.382939230086353, + "grad_norm": 0.6959543824195862, + "learning_rate": 0.00017429687414811847, + "loss": 2.664, + "step": 4745 + }, + { + "epoch": 0.383019933822936, + "grad_norm": 0.7258255481719971, + "learning_rate": 0.00017428630663937148, + "loss": 2.6597, + "step": 4746 + }, + { + "epoch": 0.383100637559519, + "grad_norm": 0.8067473769187927, + "learning_rate": 0.0001742757372792155, + "loss": 2.6798, + "step": 4747 + }, + { + "epoch": 0.383181341296102, + "grad_norm": 0.7000626921653748, + "learning_rate": 0.000174265166067914, + "loss": 2.6561, + "step": 4748 + }, + { + "epoch": 0.383262045032685, + "grad_norm": 0.818914532661438, + "learning_rate": 0.00017425459300573045, + "loss": 2.6491, + "step": 4749 + }, + { + "epoch": 0.383342748769268, + "grad_norm": 0.7060543298721313, + "learning_rate": 0.00017424401809292833, + "loss": 2.6825, 
+ "step": 4750 + }, + { + "epoch": 0.383423452505851, + "grad_norm": 0.893488883972168, + "learning_rate": 0.0001742334413297712, + "loss": 2.7201, + "step": 4751 + }, + { + "epoch": 0.383504156242434, + "grad_norm": 0.8131078481674194, + "learning_rate": 0.00017422286271652265, + "loss": 2.7828, + "step": 4752 + }, + { + "epoch": 0.38358485997901703, + "grad_norm": 0.7735587954521179, + "learning_rate": 0.00017421228225344634, + "loss": 2.6489, + "step": 4753 + }, + { + "epoch": 0.3836655637156, + "grad_norm": 0.713800311088562, + "learning_rate": 0.000174201699940806, + "loss": 2.6686, + "step": 4754 + }, + { + "epoch": 0.38374626745218304, + "grad_norm": 0.8246580362319946, + "learning_rate": 0.00017419111577886528, + "loss": 2.6771, + "step": 4755 + }, + { + "epoch": 0.383826971188766, + "grad_norm": 0.694542646408081, + "learning_rate": 0.00017418052976788805, + "loss": 2.6632, + "step": 4756 + }, + { + "epoch": 0.38390767492534905, + "grad_norm": 0.7200453281402588, + "learning_rate": 0.0001741699419081381, + "loss": 2.6386, + "step": 4757 + }, + { + "epoch": 0.38398837866193203, + "grad_norm": 0.7002073526382446, + "learning_rate": 0.00017415935219987933, + "loss": 2.6399, + "step": 4758 + }, + { + "epoch": 0.38406908239851506, + "grad_norm": 0.7056967616081238, + "learning_rate": 0.00017414876064337565, + "loss": 2.7048, + "step": 4759 + }, + { + "epoch": 0.38414978613509804, + "grad_norm": 0.7406448721885681, + "learning_rate": 0.000174138167238891, + "loss": 2.6256, + "step": 4760 + }, + { + "epoch": 0.38423048987168107, + "grad_norm": 0.7280529737472534, + "learning_rate": 0.00017412757198668945, + "loss": 2.6393, + "step": 4761 + }, + { + "epoch": 0.38431119360826405, + "grad_norm": 0.7626908421516418, + "learning_rate": 0.00017411697488703502, + "loss": 2.6717, + "step": 4762 + }, + { + "epoch": 0.3843918973448471, + "grad_norm": 0.716345489025116, + "learning_rate": 0.00017410637594019184, + "loss": 2.6457, + "step": 4763 + }, + { + "epoch": 
0.38447260108143005, + "grad_norm": 0.8825077414512634, + "learning_rate": 0.00017409577514642405, + "loss": 2.7042, + "step": 4764 + }, + { + "epoch": 0.3845533048180131, + "grad_norm": 0.7301186919212341, + "learning_rate": 0.00017408517250599585, + "loss": 2.7065, + "step": 4765 + }, + { + "epoch": 0.38463400855459606, + "grad_norm": 0.8235788345336914, + "learning_rate": 0.0001740745680191715, + "loss": 2.6315, + "step": 4766 + }, + { + "epoch": 0.3847147122911791, + "grad_norm": 0.7355515956878662, + "learning_rate": 0.00017406396168621527, + "loss": 2.6939, + "step": 4767 + }, + { + "epoch": 0.38479541602776207, + "grad_norm": 0.6781682372093201, + "learning_rate": 0.0001740533535073915, + "loss": 2.6071, + "step": 4768 + }, + { + "epoch": 0.3848761197643451, + "grad_norm": 0.801191508769989, + "learning_rate": 0.0001740427434829646, + "loss": 2.6635, + "step": 4769 + }, + { + "epoch": 0.3849568235009281, + "grad_norm": 0.759682297706604, + "learning_rate": 0.00017403213161319903, + "loss": 2.6823, + "step": 4770 + }, + { + "epoch": 0.3850375272375111, + "grad_norm": 0.806498110294342, + "learning_rate": 0.00017402151789835916, + "loss": 2.7111, + "step": 4771 + }, + { + "epoch": 0.3851182309740941, + "grad_norm": 0.7677996158599854, + "learning_rate": 0.00017401090233870958, + "loss": 2.6701, + "step": 4772 + }, + { + "epoch": 0.3851989347106771, + "grad_norm": 0.7449933290481567, + "learning_rate": 0.00017400028493451487, + "loss": 2.7037, + "step": 4773 + }, + { + "epoch": 0.3852796384472601, + "grad_norm": 0.7506107091903687, + "learning_rate": 0.0001739896656860396, + "loss": 2.6587, + "step": 4774 + }, + { + "epoch": 0.38536034218384313, + "grad_norm": 0.8781036734580994, + "learning_rate": 0.00017397904459354844, + "loss": 2.7634, + "step": 4775 + }, + { + "epoch": 0.3854410459204261, + "grad_norm": 0.7067514657974243, + "learning_rate": 0.0001739684216573061, + "loss": 2.638, + "step": 4776 + }, + { + "epoch": 0.38552174965700914, + "grad_norm": 
0.7742886543273926, + "learning_rate": 0.00017395779687757735, + "loss": 2.7043, + "step": 4777 + }, + { + "epoch": 0.3856024533935921, + "grad_norm": 0.7348291277885437, + "learning_rate": 0.00017394717025462697, + "loss": 2.7404, + "step": 4778 + }, + { + "epoch": 0.38568315713017515, + "grad_norm": 0.7449346780776978, + "learning_rate": 0.00017393654178871984, + "loss": 2.631, + "step": 4779 + }, + { + "epoch": 0.3857638608667581, + "grad_norm": 0.7191200256347656, + "learning_rate": 0.00017392591148012078, + "loss": 2.6776, + "step": 4780 + }, + { + "epoch": 0.38584456460334116, + "grad_norm": 0.7055533528327942, + "learning_rate": 0.00017391527932909476, + "loss": 2.6219, + "step": 4781 + }, + { + "epoch": 0.38592526833992413, + "grad_norm": 0.73755943775177, + "learning_rate": 0.0001739046453359068, + "loss": 2.6692, + "step": 4782 + }, + { + "epoch": 0.38600597207650716, + "grad_norm": 0.7469369769096375, + "learning_rate": 0.00017389400950082185, + "loss": 2.6572, + "step": 4783 + }, + { + "epoch": 0.38608667581309014, + "grad_norm": 0.7552534341812134, + "learning_rate": 0.00017388337182410504, + "loss": 2.6853, + "step": 4784 + }, + { + "epoch": 0.3861673795496732, + "grad_norm": 0.7453532814979553, + "learning_rate": 0.00017387273230602145, + "loss": 2.6601, + "step": 4785 + }, + { + "epoch": 0.38624808328625615, + "grad_norm": 0.7259301543235779, + "learning_rate": 0.0001738620909468363, + "loss": 2.6997, + "step": 4786 + }, + { + "epoch": 0.3863287870228392, + "grad_norm": 0.6970019936561584, + "learning_rate": 0.00017385144774681476, + "loss": 2.7497, + "step": 4787 + }, + { + "epoch": 0.38640949075942216, + "grad_norm": 0.7172032594680786, + "learning_rate": 0.00017384080270622208, + "loss": 2.7182, + "step": 4788 + }, + { + "epoch": 0.3864901944960052, + "grad_norm": 0.7184371948242188, + "learning_rate": 0.00017383015582532357, + "loss": 2.6358, + "step": 4789 + }, + { + "epoch": 0.38657089823258817, + "grad_norm": 0.7302096486091614, + 
"learning_rate": 0.00017381950710438458, + "loss": 2.6066, + "step": 4790 + }, + { + "epoch": 0.3866516019691712, + "grad_norm": 0.7043540477752686, + "learning_rate": 0.00017380885654367053, + "loss": 2.699, + "step": 4791 + }, + { + "epoch": 0.3867323057057542, + "grad_norm": 0.6919732689857483, + "learning_rate": 0.0001737982041434468, + "loss": 2.6025, + "step": 4792 + }, + { + "epoch": 0.3868130094423372, + "grad_norm": 0.7277705669403076, + "learning_rate": 0.00017378754990397894, + "loss": 2.6764, + "step": 4793 + }, + { + "epoch": 0.3868937131789202, + "grad_norm": 0.7546190619468689, + "learning_rate": 0.00017377689382553247, + "loss": 2.5865, + "step": 4794 + }, + { + "epoch": 0.38697441691550316, + "grad_norm": 0.7636401653289795, + "learning_rate": 0.00017376623590837294, + "loss": 2.6488, + "step": 4795 + }, + { + "epoch": 0.3870551206520862, + "grad_norm": 0.6945658922195435, + "learning_rate": 0.00017375557615276595, + "loss": 2.6739, + "step": 4796 + }, + { + "epoch": 0.38713582438866917, + "grad_norm": 0.7503637075424194, + "learning_rate": 0.00017374491455897722, + "loss": 2.6854, + "step": 4797 + }, + { + "epoch": 0.3872165281252522, + "grad_norm": 0.7457373142242432, + "learning_rate": 0.00017373425112727247, + "loss": 2.6659, + "step": 4798 + }, + { + "epoch": 0.3872972318618352, + "grad_norm": 0.7742534875869751, + "learning_rate": 0.0001737235858579174, + "loss": 2.6461, + "step": 4799 + }, + { + "epoch": 0.3873779355984182, + "grad_norm": 0.7397909760475159, + "learning_rate": 0.0001737129187511779, + "loss": 2.6779, + "step": 4800 + }, + { + "epoch": 0.3874586393350012, + "grad_norm": 0.7922031879425049, + "learning_rate": 0.00017370224980731974, + "loss": 2.6417, + "step": 4801 + }, + { + "epoch": 0.3875393430715842, + "grad_norm": 0.8503968715667725, + "learning_rate": 0.00017369157902660887, + "loss": 2.7063, + "step": 4802 + }, + { + "epoch": 0.3876200468081672, + "grad_norm": 0.7143701314926147, + "learning_rate": 
0.00017368090640931125, + "loss": 2.6152, + "step": 4803 + }, + { + "epoch": 0.38770075054475023, + "grad_norm": 0.8016753196716309, + "learning_rate": 0.0001736702319556928, + "loss": 2.6005, + "step": 4804 + }, + { + "epoch": 0.3877814542813332, + "grad_norm": 0.7329538464546204, + "learning_rate": 0.00017365955566601962, + "loss": 2.6027, + "step": 4805 + }, + { + "epoch": 0.38786215801791624, + "grad_norm": 0.7005148530006409, + "learning_rate": 0.00017364887754055773, + "loss": 2.6585, + "step": 4806 + }, + { + "epoch": 0.3879428617544992, + "grad_norm": 0.7092769145965576, + "learning_rate": 0.00017363819757957333, + "loss": 2.6763, + "step": 4807 + }, + { + "epoch": 0.38802356549108225, + "grad_norm": 0.7475202679634094, + "learning_rate": 0.0001736275157833325, + "loss": 2.5969, + "step": 4808 + }, + { + "epoch": 0.3881042692276652, + "grad_norm": 0.822496235370636, + "learning_rate": 0.0001736168321521016, + "loss": 2.6758, + "step": 4809 + }, + { + "epoch": 0.38818497296424825, + "grad_norm": 0.7756842374801636, + "learning_rate": 0.0001736061466861467, + "loss": 2.6676, + "step": 4810 + }, + { + "epoch": 0.38826567670083123, + "grad_norm": 0.7192497849464417, + "learning_rate": 0.00017359545938573428, + "loss": 2.7045, + "step": 4811 + }, + { + "epoch": 0.38834638043741426, + "grad_norm": 0.7064149379730225, + "learning_rate": 0.00017358477025113063, + "loss": 2.6169, + "step": 4812 + }, + { + "epoch": 0.38842708417399724, + "grad_norm": 0.7297258973121643, + "learning_rate": 0.00017357407928260215, + "loss": 2.612, + "step": 4813 + }, + { + "epoch": 0.38850778791058027, + "grad_norm": 0.7011935114860535, + "learning_rate": 0.00017356338648041528, + "loss": 2.6507, + "step": 4814 + }, + { + "epoch": 0.38858849164716325, + "grad_norm": 0.7647256255149841, + "learning_rate": 0.00017355269184483651, + "loss": 2.6838, + "step": 4815 + }, + { + "epoch": 0.3886691953837463, + "grad_norm": 0.690182089805603, + "learning_rate": 0.0001735419953761324, + "loss": 
2.6996, + "step": 4816 + }, + { + "epoch": 0.38874989912032926, + "grad_norm": 0.7142173647880554, + "learning_rate": 0.00017353129707456955, + "loss": 2.6705, + "step": 4817 + }, + { + "epoch": 0.3888306028569123, + "grad_norm": 0.801369309425354, + "learning_rate": 0.00017352059694041456, + "loss": 2.7002, + "step": 4818 + }, + { + "epoch": 0.38891130659349527, + "grad_norm": 0.7021649479866028, + "learning_rate": 0.0001735098949739341, + "loss": 2.7042, + "step": 4819 + }, + { + "epoch": 0.3889920103300783, + "grad_norm": 0.6802586317062378, + "learning_rate": 0.00017349919117539488, + "loss": 2.7186, + "step": 4820 + }, + { + "epoch": 0.3890727140666613, + "grad_norm": 0.7723212838172913, + "learning_rate": 0.0001734884855450637, + "loss": 2.608, + "step": 4821 + }, + { + "epoch": 0.3891534178032443, + "grad_norm": 0.7037193179130554, + "learning_rate": 0.00017347777808320735, + "loss": 2.6198, + "step": 4822 + }, + { + "epoch": 0.3892341215398273, + "grad_norm": 0.7172731161117554, + "learning_rate": 0.00017346706879009272, + "loss": 2.7037, + "step": 4823 + }, + { + "epoch": 0.3893148252764103, + "grad_norm": 0.7421539425849915, + "learning_rate": 0.00017345635766598667, + "loss": 2.6619, + "step": 4824 + }, + { + "epoch": 0.3893955290129933, + "grad_norm": 0.7587071061134338, + "learning_rate": 0.0001734456447111562, + "loss": 2.6229, + "step": 4825 + }, + { + "epoch": 0.3894762327495763, + "grad_norm": 0.6981459259986877, + "learning_rate": 0.00017343492992586822, + "loss": 2.5927, + "step": 4826 + }, + { + "epoch": 0.3895569364861593, + "grad_norm": 0.7628491520881653, + "learning_rate": 0.00017342421331038987, + "loss": 2.7047, + "step": 4827 + }, + { + "epoch": 0.38963764022274233, + "grad_norm": 0.8005064129829407, + "learning_rate": 0.00017341349486498818, + "loss": 2.6918, + "step": 4828 + }, + { + "epoch": 0.3897183439593253, + "grad_norm": 0.7756431102752686, + "learning_rate": 0.0001734027745899303, + "loss": 2.6621, + "step": 4829 + }, + { + 
"epoch": 0.38979904769590834, + "grad_norm": 0.7317833304405212, + "learning_rate": 0.00017339205248548338, + "loss": 2.7134, + "step": 4830 + }, + { + "epoch": 0.3898797514324913, + "grad_norm": 0.7293959259986877, + "learning_rate": 0.0001733813285519147, + "loss": 2.6865, + "step": 4831 + }, + { + "epoch": 0.38996045516907435, + "grad_norm": 0.7120299935340881, + "learning_rate": 0.00017337060278949147, + "loss": 2.6915, + "step": 4832 + }, + { + "epoch": 0.3900411589056573, + "grad_norm": 0.7255397439002991, + "learning_rate": 0.00017335987519848103, + "loss": 2.6671, + "step": 4833 + }, + { + "epoch": 0.39012186264224036, + "grad_norm": 0.7849408388137817, + "learning_rate": 0.0001733491457791507, + "loss": 2.6301, + "step": 4834 + }, + { + "epoch": 0.39020256637882333, + "grad_norm": 0.6998472809791565, + "learning_rate": 0.00017333841453176797, + "loss": 2.6587, + "step": 4835 + }, + { + "epoch": 0.39028327011540637, + "grad_norm": 0.7530023455619812, + "learning_rate": 0.00017332768145660024, + "loss": 2.7011, + "step": 4836 + }, + { + "epoch": 0.39036397385198934, + "grad_norm": 0.7251207828521729, + "learning_rate": 0.00017331694655391497, + "loss": 2.6416, + "step": 4837 + }, + { + "epoch": 0.3904446775885724, + "grad_norm": 0.7016854882240295, + "learning_rate": 0.00017330620982397975, + "loss": 2.7224, + "step": 4838 + }, + { + "epoch": 0.39052538132515535, + "grad_norm": 0.7253310084342957, + "learning_rate": 0.00017329547126706217, + "loss": 2.6747, + "step": 4839 + }, + { + "epoch": 0.3906060850617384, + "grad_norm": 0.7114601731300354, + "learning_rate": 0.00017328473088342987, + "loss": 2.6654, + "step": 4840 + }, + { + "epoch": 0.39068678879832136, + "grad_norm": 0.7773289680480957, + "learning_rate": 0.00017327398867335048, + "loss": 2.6625, + "step": 4841 + }, + { + "epoch": 0.3907674925349044, + "grad_norm": 0.7541868686676025, + "learning_rate": 0.00017326324463709175, + "loss": 2.667, + "step": 4842 + }, + { + "epoch": 0.39084819627148737, + 
"grad_norm": 0.8095890283584595, + "learning_rate": 0.00017325249877492147, + "loss": 2.706, + "step": 4843 + }, + { + "epoch": 0.3909289000080704, + "grad_norm": 0.7019474506378174, + "learning_rate": 0.00017324175108710742, + "loss": 2.6125, + "step": 4844 + }, + { + "epoch": 0.3910096037446534, + "grad_norm": 0.7055396437644958, + "learning_rate": 0.00017323100157391746, + "loss": 2.6373, + "step": 4845 + }, + { + "epoch": 0.39109030748123635, + "grad_norm": 0.7332476377487183, + "learning_rate": 0.00017322025023561955, + "loss": 2.6559, + "step": 4846 + }, + { + "epoch": 0.3911710112178194, + "grad_norm": 0.7740387916564941, + "learning_rate": 0.00017320949707248158, + "loss": 2.7341, + "step": 4847 + }, + { + "epoch": 0.39125171495440236, + "grad_norm": 0.7371044754981995, + "learning_rate": 0.0001731987420847716, + "loss": 2.7318, + "step": 4848 + }, + { + "epoch": 0.3913324186909854, + "grad_norm": 0.7897786498069763, + "learning_rate": 0.00017318798527275758, + "loss": 2.6759, + "step": 4849 + }, + { + "epoch": 0.39141312242756837, + "grad_norm": 0.7149896621704102, + "learning_rate": 0.0001731772266367077, + "loss": 2.7097, + "step": 4850 + }, + { + "epoch": 0.3914938261641514, + "grad_norm": 0.7824358344078064, + "learning_rate": 0.00017316646617689002, + "loss": 2.6376, + "step": 4851 + }, + { + "epoch": 0.3915745299007344, + "grad_norm": 0.7704496383666992, + "learning_rate": 0.00017315570389357272, + "loss": 2.6539, + "step": 4852 + }, + { + "epoch": 0.3916552336373174, + "grad_norm": 0.7489706873893738, + "learning_rate": 0.00017314493978702407, + "loss": 2.6716, + "step": 4853 + }, + { + "epoch": 0.3917359373739004, + "grad_norm": 0.7368690967559814, + "learning_rate": 0.00017313417385751234, + "loss": 2.7171, + "step": 4854 + }, + { + "epoch": 0.3918166411104834, + "grad_norm": 0.7215858697891235, + "learning_rate": 0.00017312340610530579, + "loss": 2.6306, + "step": 4855 + }, + { + "epoch": 0.3918973448470664, + "grad_norm": 0.7622217535972595, + 
"learning_rate": 0.00017311263653067285, + "loss": 2.6089, + "step": 4856 + }, + { + "epoch": 0.39197804858364943, + "grad_norm": 0.7317889332771301, + "learning_rate": 0.00017310186513388185, + "loss": 2.6831, + "step": 4857 + }, + { + "epoch": 0.3920587523202324, + "grad_norm": 0.894185483455658, + "learning_rate": 0.0001730910919152013, + "loss": 2.684, + "step": 4858 + }, + { + "epoch": 0.39213945605681544, + "grad_norm": 0.7313157916069031, + "learning_rate": 0.00017308031687489968, + "loss": 2.6465, + "step": 4859 + }, + { + "epoch": 0.3922201597933984, + "grad_norm": 0.7765825390815735, + "learning_rate": 0.00017306954001324552, + "loss": 2.6526, + "step": 4860 + }, + { + "epoch": 0.39230086352998145, + "grad_norm": 0.7171424031257629, + "learning_rate": 0.00017305876133050742, + "loss": 2.6212, + "step": 4861 + }, + { + "epoch": 0.3923815672665644, + "grad_norm": 0.7215112447738647, + "learning_rate": 0.000173047980826954, + "loss": 2.6329, + "step": 4862 + }, + { + "epoch": 0.39246227100314746, + "grad_norm": 0.7393578886985779, + "learning_rate": 0.00017303719850285396, + "loss": 2.7264, + "step": 4863 + }, + { + "epoch": 0.39254297473973043, + "grad_norm": 0.7620136737823486, + "learning_rate": 0.00017302641435847603, + "loss": 2.6686, + "step": 4864 + }, + { + "epoch": 0.39262367847631346, + "grad_norm": 0.7290963530540466, + "learning_rate": 0.00017301562839408893, + "loss": 2.578, + "step": 4865 + }, + { + "epoch": 0.39270438221289644, + "grad_norm": 0.6978541612625122, + "learning_rate": 0.00017300484060996153, + "loss": 2.6783, + "step": 4866 + }, + { + "epoch": 0.3927850859494795, + "grad_norm": 0.7212007641792297, + "learning_rate": 0.00017299405100636264, + "loss": 2.6282, + "step": 4867 + }, + { + "epoch": 0.39286578968606245, + "grad_norm": 0.757324755191803, + "learning_rate": 0.0001729832595835612, + "loss": 2.6933, + "step": 4868 + }, + { + "epoch": 0.3929464934226455, + "grad_norm": 0.7052869200706482, + "learning_rate": 
0.00017297246634182618, + "loss": 2.7152, + "step": 4869 + }, + { + "epoch": 0.39302719715922846, + "grad_norm": 0.7326259016990662, + "learning_rate": 0.0001729616712814265, + "loss": 2.6792, + "step": 4870 + }, + { + "epoch": 0.3931079008958115, + "grad_norm": 0.7540302276611328, + "learning_rate": 0.00017295087440263128, + "loss": 2.6621, + "step": 4871 + }, + { + "epoch": 0.39318860463239447, + "grad_norm": 0.765454888343811, + "learning_rate": 0.00017294007570570956, + "loss": 2.7049, + "step": 4872 + }, + { + "epoch": 0.3932693083689775, + "grad_norm": 0.7303065061569214, + "learning_rate": 0.0001729292751909305, + "loss": 2.6867, + "step": 4873 + }, + { + "epoch": 0.3933500121055605, + "grad_norm": 0.7049854397773743, + "learning_rate": 0.00017291847285856325, + "loss": 2.7052, + "step": 4874 + }, + { + "epoch": 0.3934307158421435, + "grad_norm": 0.7199053764343262, + "learning_rate": 0.00017290766870887704, + "loss": 2.7195, + "step": 4875 + }, + { + "epoch": 0.3935114195787265, + "grad_norm": 0.7536180019378662, + "learning_rate": 0.00017289686274214118, + "loss": 2.6861, + "step": 4876 + }, + { + "epoch": 0.3935921233153095, + "grad_norm": 0.7295238971710205, + "learning_rate": 0.00017288605495862492, + "loss": 2.6684, + "step": 4877 + }, + { + "epoch": 0.3936728270518925, + "grad_norm": 0.7575719952583313, + "learning_rate": 0.00017287524535859763, + "loss": 2.6439, + "step": 4878 + }, + { + "epoch": 0.3937535307884755, + "grad_norm": 0.678909182548523, + "learning_rate": 0.00017286443394232874, + "loss": 2.6562, + "step": 4879 + }, + { + "epoch": 0.3938342345250585, + "grad_norm": 0.6908892393112183, + "learning_rate": 0.00017285362071008768, + "loss": 2.6364, + "step": 4880 + }, + { + "epoch": 0.39391493826164153, + "grad_norm": 0.7414079904556274, + "learning_rate": 0.00017284280566214397, + "loss": 2.5872, + "step": 4881 + }, + { + "epoch": 0.3939956419982245, + "grad_norm": 0.6824749112129211, + "learning_rate": 0.0001728319887987671, + "loss": 
2.641, + "step": 4882 + }, + { + "epoch": 0.39407634573480754, + "grad_norm": 0.6908513903617859, + "learning_rate": 0.0001728211701202267, + "loss": 2.6977, + "step": 4883 + }, + { + "epoch": 0.3941570494713905, + "grad_norm": 0.7214735746383667, + "learning_rate": 0.0001728103496267924, + "loss": 2.5826, + "step": 4884 + }, + { + "epoch": 0.39423775320797355, + "grad_norm": 0.812781572341919, + "learning_rate": 0.00017279952731873385, + "loss": 2.6806, + "step": 4885 + }, + { + "epoch": 0.39431845694455653, + "grad_norm": 0.7610746026039124, + "learning_rate": 0.00017278870319632078, + "loss": 2.6046, + "step": 4886 + }, + { + "epoch": 0.39439916068113956, + "grad_norm": 0.7151652574539185, + "learning_rate": 0.00017277787725982293, + "loss": 2.6543, + "step": 4887 + }, + { + "epoch": 0.39447986441772254, + "grad_norm": 0.7293612360954285, + "learning_rate": 0.00017276704950951017, + "loss": 2.6384, + "step": 4888 + }, + { + "epoch": 0.39456056815430557, + "grad_norm": 0.8138254284858704, + "learning_rate": 0.00017275621994565233, + "loss": 2.7208, + "step": 4889 + }, + { + "epoch": 0.39464127189088855, + "grad_norm": 0.7557196021080017, + "learning_rate": 0.00017274538856851924, + "loss": 2.6571, + "step": 4890 + }, + { + "epoch": 0.3947219756274716, + "grad_norm": 0.7297266721725464, + "learning_rate": 0.00017273455537838097, + "loss": 2.6222, + "step": 4891 + }, + { + "epoch": 0.39480267936405455, + "grad_norm": 0.7838431596755981, + "learning_rate": 0.00017272372037550743, + "loss": 2.782, + "step": 4892 + }, + { + "epoch": 0.3948833831006376, + "grad_norm": 0.7799673676490784, + "learning_rate": 0.00017271288356016866, + "loss": 2.6658, + "step": 4893 + }, + { + "epoch": 0.39496408683722056, + "grad_norm": 0.8495545387268066, + "learning_rate": 0.0001727020449326348, + "loss": 2.6552, + "step": 4894 + }, + { + "epoch": 0.3950447905738036, + "grad_norm": 0.7317770719528198, + "learning_rate": 0.00017269120449317588, + "loss": 2.6616, + "step": 4895 + }, + { + 
"epoch": 0.39512549431038657, + "grad_norm": 0.7518885731697083, + "learning_rate": 0.00017268036224206217, + "loss": 2.6864, + "step": 4896 + }, + { + "epoch": 0.39520619804696955, + "grad_norm": 0.83487468957901, + "learning_rate": 0.00017266951817956382, + "loss": 2.7535, + "step": 4897 + }, + { + "epoch": 0.3952869017835526, + "grad_norm": 0.7440658211708069, + "learning_rate": 0.00017265867230595113, + "loss": 2.6584, + "step": 4898 + }, + { + "epoch": 0.39536760552013556, + "grad_norm": 0.7060485482215881, + "learning_rate": 0.00017264782462149438, + "loss": 2.6892, + "step": 4899 + }, + { + "epoch": 0.3954483092567186, + "grad_norm": 0.8410428166389465, + "learning_rate": 0.00017263697512646394, + "loss": 2.6425, + "step": 4900 + }, + { + "epoch": 0.39552901299330157, + "grad_norm": 0.757046639919281, + "learning_rate": 0.0001726261238211302, + "loss": 2.6159, + "step": 4901 + }, + { + "epoch": 0.3956097167298846, + "grad_norm": 0.7288908958435059, + "learning_rate": 0.00017261527070576365, + "loss": 2.6753, + "step": 4902 + }, + { + "epoch": 0.3956904204664676, + "grad_norm": 0.8194541335105896, + "learning_rate": 0.0001726044157806347, + "loss": 2.6673, + "step": 4903 + }, + { + "epoch": 0.3957711242030506, + "grad_norm": 0.7957740426063538, + "learning_rate": 0.00017259355904601393, + "loss": 2.6662, + "step": 4904 + }, + { + "epoch": 0.3958518279396336, + "grad_norm": 0.8790122270584106, + "learning_rate": 0.0001725827005021719, + "loss": 2.7513, + "step": 4905 + }, + { + "epoch": 0.3959325316762166, + "grad_norm": 0.7674984335899353, + "learning_rate": 0.00017257184014937924, + "loss": 2.6375, + "step": 4906 + }, + { + "epoch": 0.3960132354127996, + "grad_norm": 0.7250992655754089, + "learning_rate": 0.00017256097798790663, + "loss": 2.63, + "step": 4907 + }, + { + "epoch": 0.3960939391493826, + "grad_norm": 0.8578312397003174, + "learning_rate": 0.00017255011401802475, + "loss": 2.702, + "step": 4908 + }, + { + "epoch": 0.3961746428859656, + 
"grad_norm": 0.7365253567695618, + "learning_rate": 0.00017253924824000438, + "loss": 2.6156, + "step": 4909 + }, + { + "epoch": 0.39625534662254863, + "grad_norm": 0.7148925065994263, + "learning_rate": 0.00017252838065411633, + "loss": 2.6658, + "step": 4910 + }, + { + "epoch": 0.3963360503591316, + "grad_norm": 0.7517829537391663, + "learning_rate": 0.00017251751126063148, + "loss": 2.6347, + "step": 4911 + }, + { + "epoch": 0.39641675409571464, + "grad_norm": 0.7880864143371582, + "learning_rate": 0.00017250664005982066, + "loss": 2.7045, + "step": 4912 + }, + { + "epoch": 0.3964974578322976, + "grad_norm": 0.7460693120956421, + "learning_rate": 0.00017249576705195482, + "loss": 2.6976, + "step": 4913 + }, + { + "epoch": 0.39657816156888065, + "grad_norm": 0.7179895043373108, + "learning_rate": 0.00017248489223730496, + "loss": 2.6366, + "step": 4914 + }, + { + "epoch": 0.3966588653054636, + "grad_norm": 0.7737421989440918, + "learning_rate": 0.00017247401561614213, + "loss": 2.7116, + "step": 4915 + }, + { + "epoch": 0.39673956904204666, + "grad_norm": 0.8561483025550842, + "learning_rate": 0.0001724631371887374, + "loss": 2.6591, + "step": 4916 + }, + { + "epoch": 0.39682027277862963, + "grad_norm": 0.7616356611251831, + "learning_rate": 0.00017245225695536182, + "loss": 2.6436, + "step": 4917 + }, + { + "epoch": 0.39690097651521267, + "grad_norm": 0.7754645943641663, + "learning_rate": 0.0001724413749162866, + "loss": 2.6699, + "step": 4918 + }, + { + "epoch": 0.39698168025179564, + "grad_norm": 0.800165593624115, + "learning_rate": 0.000172430491071783, + "loss": 2.7155, + "step": 4919 + }, + { + "epoch": 0.3970623839883787, + "grad_norm": 0.8448799848556519, + "learning_rate": 0.00017241960542212223, + "loss": 2.6991, + "step": 4920 + }, + { + "epoch": 0.39714308772496165, + "grad_norm": 0.7106496095657349, + "learning_rate": 0.00017240871796757556, + "loss": 2.628, + "step": 4921 + }, + { + "epoch": 0.3972237914615447, + "grad_norm": 0.7332959175109863, + 
"learning_rate": 0.00017239782870841436, + "loss": 2.6159, + "step": 4922 + }, + { + "epoch": 0.39730449519812766, + "grad_norm": 0.7573551535606384, + "learning_rate": 0.00017238693764491002, + "loss": 2.67, + "step": 4923 + }, + { + "epoch": 0.3973851989347107, + "grad_norm": 0.7833136320114136, + "learning_rate": 0.00017237604477733399, + "loss": 2.7276, + "step": 4924 + }, + { + "epoch": 0.39746590267129367, + "grad_norm": 0.7233073711395264, + "learning_rate": 0.00017236515010595773, + "loss": 2.6654, + "step": 4925 + }, + { + "epoch": 0.3975466064078767, + "grad_norm": 0.7920324206352234, + "learning_rate": 0.00017235425363105273, + "loss": 2.7611, + "step": 4926 + }, + { + "epoch": 0.3976273101444597, + "grad_norm": 0.7096883058547974, + "learning_rate": 0.00017234335535289063, + "loss": 2.687, + "step": 4927 + }, + { + "epoch": 0.3977080138810427, + "grad_norm": 0.7231960296630859, + "learning_rate": 0.000172332455271743, + "loss": 2.6441, + "step": 4928 + }, + { + "epoch": 0.3977887176176257, + "grad_norm": 0.7852105498313904, + "learning_rate": 0.00017232155338788146, + "loss": 2.5948, + "step": 4929 + }, + { + "epoch": 0.3978694213542087, + "grad_norm": 0.788789689540863, + "learning_rate": 0.0001723106497015778, + "loss": 2.6797, + "step": 4930 + }, + { + "epoch": 0.3979501250907917, + "grad_norm": 0.7082793116569519, + "learning_rate": 0.00017229974421310377, + "loss": 2.6787, + "step": 4931 + }, + { + "epoch": 0.3980308288273747, + "grad_norm": 0.8157992362976074, + "learning_rate": 0.00017228883692273106, + "loss": 2.6367, + "step": 4932 + }, + { + "epoch": 0.3981115325639577, + "grad_norm": 0.7576673030853271, + "learning_rate": 0.00017227792783073157, + "loss": 2.6826, + "step": 4933 + }, + { + "epoch": 0.39819223630054074, + "grad_norm": 0.7225388884544373, + "learning_rate": 0.00017226701693737718, + "loss": 2.668, + "step": 4934 + }, + { + "epoch": 0.3982729400371237, + "grad_norm": 0.7029562592506409, + "learning_rate": 0.00017225610424293985, 
+ "loss": 2.6613, + "step": 4935 + }, + { + "epoch": 0.39835364377370674, + "grad_norm": 0.73081374168396, + "learning_rate": 0.0001722451897476915, + "loss": 2.6378, + "step": 4936 + }, + { + "epoch": 0.3984343475102897, + "grad_norm": 0.744008481502533, + "learning_rate": 0.0001722342734519042, + "loss": 2.6501, + "step": 4937 + }, + { + "epoch": 0.39851505124687275, + "grad_norm": 0.7482618093490601, + "learning_rate": 0.00017222335535584996, + "loss": 2.7287, + "step": 4938 + }, + { + "epoch": 0.39859575498345573, + "grad_norm": 0.6487892866134644, + "learning_rate": 0.00017221243545980093, + "loss": 2.6417, + "step": 4939 + }, + { + "epoch": 0.39867645872003876, + "grad_norm": 0.7894789576530457, + "learning_rate": 0.00017220151376402923, + "loss": 2.7431, + "step": 4940 + }, + { + "epoch": 0.39875716245662174, + "grad_norm": 0.8232294321060181, + "learning_rate": 0.00017219059026880708, + "loss": 2.6824, + "step": 4941 + }, + { + "epoch": 0.39883786619320477, + "grad_norm": 0.6844691634178162, + "learning_rate": 0.00017217966497440668, + "loss": 2.6294, + "step": 4942 + }, + { + "epoch": 0.39891856992978775, + "grad_norm": 0.7245259881019592, + "learning_rate": 0.00017216873788110037, + "loss": 2.6815, + "step": 4943 + }, + { + "epoch": 0.3989992736663708, + "grad_norm": 0.7197226881980896, + "learning_rate": 0.00017215780898916045, + "loss": 2.725, + "step": 4944 + }, + { + "epoch": 0.39907997740295376, + "grad_norm": 0.8391285538673401, + "learning_rate": 0.00017214687829885934, + "loss": 2.6724, + "step": 4945 + }, + { + "epoch": 0.3991606811395368, + "grad_norm": 0.7357564568519592, + "learning_rate": 0.00017213594581046938, + "loss": 2.7052, + "step": 4946 + }, + { + "epoch": 0.39924138487611976, + "grad_norm": 0.7611483931541443, + "learning_rate": 0.00017212501152426312, + "loss": 2.7214, + "step": 4947 + }, + { + "epoch": 0.39932208861270274, + "grad_norm": 0.7314950227737427, + "learning_rate": 0.00017211407544051306, + "loss": 2.6594, + "step": 4948 
+ }, + { + "epoch": 0.3994027923492858, + "grad_norm": 0.774131178855896, + "learning_rate": 0.00017210313755949169, + "loss": 2.6812, + "step": 4949 + }, + { + "epoch": 0.39948349608586875, + "grad_norm": 0.707003116607666, + "learning_rate": 0.00017209219788147167, + "loss": 2.7334, + "step": 4950 + }, + { + "epoch": 0.3995641998224518, + "grad_norm": 0.8179643154144287, + "learning_rate": 0.0001720812564067256, + "loss": 2.6554, + "step": 4951 + }, + { + "epoch": 0.39964490355903476, + "grad_norm": 0.6572005152702332, + "learning_rate": 0.00017207031313552621, + "loss": 2.6423, + "step": 4952 + }, + { + "epoch": 0.3997256072956178, + "grad_norm": 0.7663072943687439, + "learning_rate": 0.00017205936806814623, + "loss": 2.689, + "step": 4953 + }, + { + "epoch": 0.39980631103220077, + "grad_norm": 0.7351107001304626, + "learning_rate": 0.00017204842120485846, + "loss": 2.631, + "step": 4954 + }, + { + "epoch": 0.3998870147687838, + "grad_norm": 0.7754253149032593, + "learning_rate": 0.00017203747254593564, + "loss": 2.6371, + "step": 4955 + }, + { + "epoch": 0.3999677185053668, + "grad_norm": 0.7471042275428772, + "learning_rate": 0.00017202652209165074, + "loss": 2.6542, + "step": 4956 + }, + { + "epoch": 0.4000484222419498, + "grad_norm": 0.7357343435287476, + "learning_rate": 0.00017201556984227664, + "loss": 2.6226, + "step": 4957 + }, + { + "epoch": 0.4001291259785328, + "grad_norm": 0.8096252679824829, + "learning_rate": 0.00017200461579808626, + "loss": 2.6458, + "step": 4958 + }, + { + "epoch": 0.4002098297151158, + "grad_norm": 0.7622970938682556, + "learning_rate": 0.0001719936599593526, + "loss": 2.7129, + "step": 4959 + }, + { + "epoch": 0.4002905334516988, + "grad_norm": 0.7374953627586365, + "learning_rate": 0.00017198270232634882, + "loss": 2.696, + "step": 4960 + }, + { + "epoch": 0.4003712371882818, + "grad_norm": 0.7897924184799194, + "learning_rate": 0.00017197174289934787, + "loss": 2.7508, + "step": 4961 + }, + { + "epoch": 0.4004519409248648, 
+ "grad_norm": 0.7047984004020691, + "learning_rate": 0.00017196078167862298, + "loss": 2.6733, + "step": 4962 + }, + { + "epoch": 0.40053264466144783, + "grad_norm": 0.7866294980049133, + "learning_rate": 0.0001719498186644473, + "loss": 2.694, + "step": 4963 + }, + { + "epoch": 0.4006133483980308, + "grad_norm": 0.739923894405365, + "learning_rate": 0.00017193885385709409, + "loss": 2.7125, + "step": 4964 + }, + { + "epoch": 0.40069405213461384, + "grad_norm": 0.7506374716758728, + "learning_rate": 0.00017192788725683652, + "loss": 2.627, + "step": 4965 + }, + { + "epoch": 0.4007747558711968, + "grad_norm": 0.6591607928276062, + "learning_rate": 0.00017191691886394802, + "loss": 2.6723, + "step": 4966 + }, + { + "epoch": 0.40085545960777985, + "grad_norm": 0.7748788595199585, + "learning_rate": 0.00017190594867870192, + "loss": 2.6486, + "step": 4967 + }, + { + "epoch": 0.40093616334436283, + "grad_norm": 0.7518232464790344, + "learning_rate": 0.0001718949767013716, + "loss": 2.6879, + "step": 4968 + }, + { + "epoch": 0.40101686708094586, + "grad_norm": 0.7360039949417114, + "learning_rate": 0.00017188400293223052, + "loss": 2.6506, + "step": 4969 + }, + { + "epoch": 0.40109757081752884, + "grad_norm": 0.7217130064964294, + "learning_rate": 0.0001718730273715522, + "loss": 2.6263, + "step": 4970 + }, + { + "epoch": 0.40117827455411187, + "grad_norm": 0.7246078252792358, + "learning_rate": 0.00017186205001961015, + "loss": 2.6222, + "step": 4971 + }, + { + "epoch": 0.40125897829069485, + "grad_norm": 0.7566879391670227, + "learning_rate": 0.00017185107087667794, + "loss": 2.7003, + "step": 4972 + }, + { + "epoch": 0.4013396820272779, + "grad_norm": 0.7881271243095398, + "learning_rate": 0.00017184008994302924, + "loss": 2.6463, + "step": 4973 + }, + { + "epoch": 0.40142038576386085, + "grad_norm": 0.7307420372962952, + "learning_rate": 0.00017182910721893775, + "loss": 2.667, + "step": 4974 + }, + { + "epoch": 0.4015010895004439, + "grad_norm": 0.7088132500648499, 
+ "learning_rate": 0.00017181812270467708, + "loss": 2.6073, + "step": 4975 + }, + { + "epoch": 0.40158179323702686, + "grad_norm": 0.7839647531509399, + "learning_rate": 0.0001718071364005211, + "loss": 2.6594, + "step": 4976 + }, + { + "epoch": 0.4016624969736099, + "grad_norm": 0.7472013235092163, + "learning_rate": 0.00017179614830674353, + "loss": 2.737, + "step": 4977 + }, + { + "epoch": 0.40174320071019287, + "grad_norm": 0.7241616249084473, + "learning_rate": 0.0001717851584236183, + "loss": 2.6615, + "step": 4978 + }, + { + "epoch": 0.4018239044467759, + "grad_norm": 0.7918941378593445, + "learning_rate": 0.00017177416675141929, + "loss": 2.6774, + "step": 4979 + }, + { + "epoch": 0.4019046081833589, + "grad_norm": 0.801003098487854, + "learning_rate": 0.00017176317329042039, + "loss": 2.6749, + "step": 4980 + }, + { + "epoch": 0.4019853119199419, + "grad_norm": 0.7556802034378052, + "learning_rate": 0.00017175217804089564, + "loss": 2.6197, + "step": 4981 + }, + { + "epoch": 0.4020660156565249, + "grad_norm": 0.7539604902267456, + "learning_rate": 0.00017174118100311904, + "loss": 2.6222, + "step": 4982 + }, + { + "epoch": 0.4021467193931079, + "grad_norm": 0.741436243057251, + "learning_rate": 0.0001717301821773647, + "loss": 2.6471, + "step": 4983 + }, + { + "epoch": 0.4022274231296909, + "grad_norm": 0.7449339628219604, + "learning_rate": 0.0001717191815639067, + "loss": 2.6448, + "step": 4984 + }, + { + "epoch": 0.40230812686627393, + "grad_norm": 0.7771497964859009, + "learning_rate": 0.0001717081791630192, + "loss": 2.673, + "step": 4985 + }, + { + "epoch": 0.4023888306028569, + "grad_norm": 0.6916669607162476, + "learning_rate": 0.00017169717497497646, + "loss": 2.6025, + "step": 4986 + }, + { + "epoch": 0.40246953433943994, + "grad_norm": 0.7373276948928833, + "learning_rate": 0.0001716861690000527, + "loss": 2.6783, + "step": 4987 + }, + { + "epoch": 0.4025502380760229, + "grad_norm": 0.7756158709526062, + "learning_rate": 0.0001716751612385222, 
+ "loss": 2.7296, + "step": 4988 + }, + { + "epoch": 0.40263094181260595, + "grad_norm": 0.7725681066513062, + "learning_rate": 0.00017166415169065933, + "loss": 2.7169, + "step": 4989 + }, + { + "epoch": 0.4027116455491889, + "grad_norm": 0.7165024280548096, + "learning_rate": 0.00017165314035673846, + "loss": 2.677, + "step": 4990 + }, + { + "epoch": 0.40279234928577196, + "grad_norm": 0.8888981938362122, + "learning_rate": 0.00017164212723703404, + "loss": 2.7694, + "step": 4991 + }, + { + "epoch": 0.40287305302235493, + "grad_norm": 0.7439224720001221, + "learning_rate": 0.00017163111233182052, + "loss": 2.674, + "step": 4992 + }, + { + "epoch": 0.40295375675893796, + "grad_norm": 0.6948431730270386, + "learning_rate": 0.00017162009564137244, + "loss": 2.6595, + "step": 4993 + }, + { + "epoch": 0.40303446049552094, + "grad_norm": 0.7274380922317505, + "learning_rate": 0.00017160907716596438, + "loss": 2.649, + "step": 4994 + }, + { + "epoch": 0.403115164232104, + "grad_norm": 0.7127148509025574, + "learning_rate": 0.0001715980569058709, + "loss": 2.6883, + "step": 4995 + }, + { + "epoch": 0.40319586796868695, + "grad_norm": 0.7129155993461609, + "learning_rate": 0.00017158703486136668, + "loss": 2.6516, + "step": 4996 + }, + { + "epoch": 0.40327657170527, + "grad_norm": 0.7848126292228699, + "learning_rate": 0.00017157601103272646, + "loss": 2.6778, + "step": 4997 + }, + { + "epoch": 0.40335727544185296, + "grad_norm": 0.752268373966217, + "learning_rate": 0.0001715649854202249, + "loss": 2.7228, + "step": 4998 + }, + { + "epoch": 0.40343797917843593, + "grad_norm": 0.7750338912010193, + "learning_rate": 0.00017155395802413684, + "loss": 2.6338, + "step": 4999 + }, + { + "epoch": 0.40351868291501897, + "grad_norm": 0.7165457010269165, + "learning_rate": 0.00017154292884473713, + "loss": 2.6195, + "step": 5000 + }, + { + "epoch": 0.40351868291501897, + "eval_loss": 2.585501194000244, + "eval_runtime": 901.8519, + "eval_samples_per_second": 2.905, + 
"eval_steps_per_second": 0.485, + "step": 5000 + }, + { + "epoch": 0.40359938665160194, + "grad_norm": 0.8118943572044373, + "learning_rate": 0.00017153189788230062, + "loss": 2.6649, + "step": 5001 + }, + { + "epoch": 0.403680090388185, + "grad_norm": 0.722984790802002, + "learning_rate": 0.00017152086513710221, + "loss": 2.6929, + "step": 5002 + }, + { + "epoch": 0.40376079412476795, + "grad_norm": 0.700690507888794, + "learning_rate": 0.00017150983060941686, + "loss": 2.6368, + "step": 5003 + }, + { + "epoch": 0.403841497861351, + "grad_norm": 0.7331504225730896, + "learning_rate": 0.00017149879429951965, + "loss": 2.6826, + "step": 5004 + }, + { + "epoch": 0.40392220159793396, + "grad_norm": 0.7312643527984619, + "learning_rate": 0.00017148775620768553, + "loss": 2.6279, + "step": 5005 + }, + { + "epoch": 0.404002905334517, + "grad_norm": 0.7488462924957275, + "learning_rate": 0.00017147671633418972, + "loss": 2.6711, + "step": 5006 + }, + { + "epoch": 0.40408360907109997, + "grad_norm": 0.8620340824127197, + "learning_rate": 0.00017146567467930725, + "loss": 2.6637, + "step": 5007 + }, + { + "epoch": 0.404164312807683, + "grad_norm": 0.683907151222229, + "learning_rate": 0.00017145463124331335, + "loss": 2.6331, + "step": 5008 + }, + { + "epoch": 0.404245016544266, + "grad_norm": 0.7389389276504517, + "learning_rate": 0.0001714435860264833, + "loss": 2.7232, + "step": 5009 + }, + { + "epoch": 0.404325720280849, + "grad_norm": 0.7456515431404114, + "learning_rate": 0.00017143253902909228, + "loss": 2.6363, + "step": 5010 + }, + { + "epoch": 0.404406424017432, + "grad_norm": 0.7044962644577026, + "learning_rate": 0.0001714214902514157, + "loss": 2.6672, + "step": 5011 + }, + { + "epoch": 0.404487127754015, + "grad_norm": 0.7410328984260559, + "learning_rate": 0.00017141043969372887, + "loss": 2.6059, + "step": 5012 + }, + { + "epoch": 0.404567831490598, + "grad_norm": 0.6697140336036682, + "learning_rate": 0.00017139938735630722, + "loss": 2.7151, + "step": 5013 
+ }, + { + "epoch": 0.404648535227181, + "grad_norm": 0.746675431728363, + "learning_rate": 0.00017138833323942617, + "loss": 2.6792, + "step": 5014 + }, + { + "epoch": 0.404729238963764, + "grad_norm": 0.7724997401237488, + "learning_rate": 0.00017137727734336129, + "loss": 2.6234, + "step": 5015 + }, + { + "epoch": 0.40480994270034704, + "grad_norm": 0.8014429211616516, + "learning_rate": 0.00017136621966838805, + "loss": 2.6795, + "step": 5016 + }, + { + "epoch": 0.40489064643693, + "grad_norm": 0.6900430917739868, + "learning_rate": 0.00017135516021478205, + "loss": 2.7127, + "step": 5017 + }, + { + "epoch": 0.40497135017351304, + "grad_norm": 0.6648666858673096, + "learning_rate": 0.00017134409898281896, + "loss": 2.6564, + "step": 5018 + }, + { + "epoch": 0.405052053910096, + "grad_norm": 0.7054181098937988, + "learning_rate": 0.00017133303597277442, + "loss": 2.6652, + "step": 5019 + }, + { + "epoch": 0.40513275764667905, + "grad_norm": 0.6847733855247498, + "learning_rate": 0.00017132197118492414, + "loss": 2.6997, + "step": 5020 + }, + { + "epoch": 0.40521346138326203, + "grad_norm": 0.7047749757766724, + "learning_rate": 0.00017131090461954392, + "loss": 2.6752, + "step": 5021 + }, + { + "epoch": 0.40529416511984506, + "grad_norm": 0.7549976706504822, + "learning_rate": 0.00017129983627690957, + "loss": 2.6736, + "step": 5022 + }, + { + "epoch": 0.40537486885642804, + "grad_norm": 0.7436367273330688, + "learning_rate": 0.00017128876615729686, + "loss": 2.7189, + "step": 5023 + }, + { + "epoch": 0.40545557259301107, + "grad_norm": 0.6515071988105774, + "learning_rate": 0.00017127769426098177, + "loss": 2.6422, + "step": 5024 + }, + { + "epoch": 0.40553627632959405, + "grad_norm": 0.6960858702659607, + "learning_rate": 0.00017126662058824024, + "loss": 2.6619, + "step": 5025 + }, + { + "epoch": 0.4056169800661771, + "grad_norm": 0.8075968623161316, + "learning_rate": 0.0001712555451393482, + "loss": 2.6678, + "step": 5026 + }, + { + "epoch": 
0.40569768380276006, + "grad_norm": 0.6864624619483948, + "learning_rate": 0.00017124446791458176, + "loss": 2.6331, + "step": 5027 + }, + { + "epoch": 0.4057783875393431, + "grad_norm": 0.7218763828277588, + "learning_rate": 0.0001712333889142169, + "loss": 2.6316, + "step": 5028 + }, + { + "epoch": 0.40585909127592606, + "grad_norm": 0.7024715542793274, + "learning_rate": 0.0001712223081385298, + "loss": 2.623, + "step": 5029 + }, + { + "epoch": 0.4059397950125091, + "grad_norm": 0.6681575775146484, + "learning_rate": 0.0001712112255877966, + "loss": 2.6786, + "step": 5030 + }, + { + "epoch": 0.4060204987490921, + "grad_norm": 0.7249817848205566, + "learning_rate": 0.0001712001412622935, + "loss": 2.6179, + "step": 5031 + }, + { + "epoch": 0.4061012024856751, + "grad_norm": 0.7178316116333008, + "learning_rate": 0.00017118905516229677, + "loss": 2.696, + "step": 5032 + }, + { + "epoch": 0.4061819062222581, + "grad_norm": 0.7838767766952515, + "learning_rate": 0.0001711779672880827, + "loss": 2.6881, + "step": 5033 + }, + { + "epoch": 0.4062626099588411, + "grad_norm": 0.799937903881073, + "learning_rate": 0.0001711668776399276, + "loss": 2.7587, + "step": 5034 + }, + { + "epoch": 0.4063433136954241, + "grad_norm": 0.7622246146202087, + "learning_rate": 0.0001711557862181079, + "loss": 2.6621, + "step": 5035 + }, + { + "epoch": 0.4064240174320071, + "grad_norm": 0.7158814072608948, + "learning_rate": 0.00017114469302290003, + "loss": 2.6421, + "step": 5036 + }, + { + "epoch": 0.4065047211685901, + "grad_norm": 0.7913404107093811, + "learning_rate": 0.0001711335980545804, + "loss": 2.6323, + "step": 5037 + }, + { + "epoch": 0.40658542490517313, + "grad_norm": 0.718325138092041, + "learning_rate": 0.00017112250131342556, + "loss": 2.6171, + "step": 5038 + }, + { + "epoch": 0.4066661286417561, + "grad_norm": 0.7793646454811096, + "learning_rate": 0.0001711114027997121, + "loss": 2.7494, + "step": 5039 + }, + { + "epoch": 0.40674683237833914, + "grad_norm": 
0.7774816155433655, + "learning_rate": 0.00017110030251371656, + "loss": 2.5534, + "step": 5040 + }, + { + "epoch": 0.4068275361149221, + "grad_norm": 0.8547549247741699, + "learning_rate": 0.00017108920045571564, + "loss": 2.7155, + "step": 5041 + }, + { + "epoch": 0.40690823985150515, + "grad_norm": 0.7685851454734802, + "learning_rate": 0.000171078096625986, + "loss": 2.6109, + "step": 5042 + }, + { + "epoch": 0.4069889435880881, + "grad_norm": 0.7953611016273499, + "learning_rate": 0.00017106699102480445, + "loss": 2.7034, + "step": 5043 + }, + { + "epoch": 0.40706964732467116, + "grad_norm": 0.7550730109214783, + "learning_rate": 0.00017105588365244764, + "loss": 2.7026, + "step": 5044 + }, + { + "epoch": 0.40715035106125413, + "grad_norm": 0.7036548256874084, + "learning_rate": 0.0001710447745091925, + "loss": 2.6246, + "step": 5045 + }, + { + "epoch": 0.40723105479783717, + "grad_norm": 0.7154512405395508, + "learning_rate": 0.00017103366359531586, + "loss": 2.6592, + "step": 5046 + }, + { + "epoch": 0.40731175853442014, + "grad_norm": 0.7773932218551636, + "learning_rate": 0.00017102255091109463, + "loss": 2.6458, + "step": 5047 + }, + { + "epoch": 0.4073924622710032, + "grad_norm": 0.7458996176719666, + "learning_rate": 0.0001710114364568058, + "loss": 2.643, + "step": 5048 + }, + { + "epoch": 0.40747316600758615, + "grad_norm": 0.7465376257896423, + "learning_rate": 0.00017100032023272633, + "loss": 2.6677, + "step": 5049 + }, + { + "epoch": 0.40755386974416913, + "grad_norm": 0.7340850830078125, + "learning_rate": 0.0001709892022391333, + "loss": 2.6372, + "step": 5050 + }, + { + "epoch": 0.40763457348075216, + "grad_norm": 0.7189164757728577, + "learning_rate": 0.00017097808247630377, + "loss": 2.6524, + "step": 5051 + }, + { + "epoch": 0.40771527721733514, + "grad_norm": 0.6954184174537659, + "learning_rate": 0.0001709669609445149, + "loss": 2.7383, + "step": 5052 + }, + { + "epoch": 0.40779598095391817, + "grad_norm": 0.736409604549408, + 
"learning_rate": 0.00017095583764404384, + "loss": 2.6424, + "step": 5053 + }, + { + "epoch": 0.40787668469050115, + "grad_norm": 0.6773545742034912, + "learning_rate": 0.0001709447125751678, + "loss": 2.6557, + "step": 5054 + }, + { + "epoch": 0.4079573884270842, + "grad_norm": 0.718748927116394, + "learning_rate": 0.00017093358573816412, + "loss": 2.6884, + "step": 5055 + }, + { + "epoch": 0.40803809216366715, + "grad_norm": 0.8276848793029785, + "learning_rate": 0.00017092245713331002, + "loss": 2.6642, + "step": 5056 + }, + { + "epoch": 0.4081187959002502, + "grad_norm": 0.7694761157035828, + "learning_rate": 0.00017091132676088294, + "loss": 2.644, + "step": 5057 + }, + { + "epoch": 0.40819949963683316, + "grad_norm": 0.766724705696106, + "learning_rate": 0.0001709001946211602, + "loss": 2.6918, + "step": 5058 + }, + { + "epoch": 0.4082802033734162, + "grad_norm": 0.7067074775695801, + "learning_rate": 0.00017088906071441927, + "loss": 2.7228, + "step": 5059 + }, + { + "epoch": 0.40836090710999917, + "grad_norm": 0.7216899991035461, + "learning_rate": 0.00017087792504093767, + "loss": 2.7068, + "step": 5060 + }, + { + "epoch": 0.4084416108465822, + "grad_norm": 0.6728984713554382, + "learning_rate": 0.00017086678760099287, + "loss": 2.686, + "step": 5061 + }, + { + "epoch": 0.4085223145831652, + "grad_norm": 0.7546882033348083, + "learning_rate": 0.0001708556483948625, + "loss": 2.6907, + "step": 5062 + }, + { + "epoch": 0.4086030183197482, + "grad_norm": 0.7471179962158203, + "learning_rate": 0.00017084450742282416, + "loss": 2.6857, + "step": 5063 + }, + { + "epoch": 0.4086837220563312, + "grad_norm": 0.7879743576049805, + "learning_rate": 0.00017083336468515548, + "loss": 2.7224, + "step": 5064 + }, + { + "epoch": 0.4087644257929142, + "grad_norm": 0.691343367099762, + "learning_rate": 0.00017082222018213422, + "loss": 2.6561, + "step": 5065 + }, + { + "epoch": 0.4088451295294972, + "grad_norm": 0.7497386336326599, + "learning_rate": 0.00017081107391403805, 
+ "loss": 2.6317, + "step": 5066 + }, + { + "epoch": 0.40892583326608023, + "grad_norm": 0.6846269965171814, + "learning_rate": 0.00017079992588114485, + "loss": 2.6522, + "step": 5067 + }, + { + "epoch": 0.4090065370026632, + "grad_norm": 0.7312905192375183, + "learning_rate": 0.0001707887760837324, + "loss": 2.588, + "step": 5068 + }, + { + "epoch": 0.40908724073924624, + "grad_norm": 0.6966867446899414, + "learning_rate": 0.00017077762452207866, + "loss": 2.6316, + "step": 5069 + }, + { + "epoch": 0.4091679444758292, + "grad_norm": 0.6882073283195496, + "learning_rate": 0.00017076647119646147, + "loss": 2.6977, + "step": 5070 + }, + { + "epoch": 0.40924864821241225, + "grad_norm": 0.7392483949661255, + "learning_rate": 0.00017075531610715884, + "loss": 2.6768, + "step": 5071 + }, + { + "epoch": 0.4093293519489952, + "grad_norm": 0.7311073541641235, + "learning_rate": 0.00017074415925444876, + "loss": 2.6628, + "step": 5072 + }, + { + "epoch": 0.40941005568557826, + "grad_norm": 0.6769934296607971, + "learning_rate": 0.00017073300063860934, + "loss": 2.6438, + "step": 5073 + }, + { + "epoch": 0.40949075942216123, + "grad_norm": 0.736456573009491, + "learning_rate": 0.00017072184025991862, + "loss": 2.6151, + "step": 5074 + }, + { + "epoch": 0.40957146315874426, + "grad_norm": 0.7026283740997314, + "learning_rate": 0.00017071067811865476, + "loss": 2.6726, + "step": 5075 + }, + { + "epoch": 0.40965216689532724, + "grad_norm": 0.6825234293937683, + "learning_rate": 0.00017069951421509597, + "loss": 2.6795, + "step": 5076 + }, + { + "epoch": 0.4097328706319103, + "grad_norm": 0.7243828773498535, + "learning_rate": 0.0001706883485495205, + "loss": 2.687, + "step": 5077 + }, + { + "epoch": 0.40981357436849325, + "grad_norm": 0.7300469875335693, + "learning_rate": 0.00017067718112220658, + "loss": 2.6268, + "step": 5078 + }, + { + "epoch": 0.4098942781050763, + "grad_norm": 0.698095440864563, + "learning_rate": 0.00017066601193343255, + "loss": 2.6461, + "step": 5079 + 
}, + { + "epoch": 0.40997498184165926, + "grad_norm": 0.7318777441978455, + "learning_rate": 0.00017065484098347677, + "loss": 2.6817, + "step": 5080 + }, + { + "epoch": 0.4100556855782423, + "grad_norm": 0.7681582570075989, + "learning_rate": 0.00017064366827261772, + "loss": 2.7309, + "step": 5081 + }, + { + "epoch": 0.41013638931482527, + "grad_norm": 0.7690179944038391, + "learning_rate": 0.0001706324938011337, + "loss": 2.6292, + "step": 5082 + }, + { + "epoch": 0.4102170930514083, + "grad_norm": 0.6745284199714661, + "learning_rate": 0.00017062131756930338, + "loss": 2.7133, + "step": 5083 + }, + { + "epoch": 0.4102977967879913, + "grad_norm": 0.7524279952049255, + "learning_rate": 0.00017061013957740518, + "loss": 2.6237, + "step": 5084 + }, + { + "epoch": 0.4103785005245743, + "grad_norm": 0.7813692092895508, + "learning_rate": 0.00017059895982571773, + "loss": 2.6953, + "step": 5085 + }, + { + "epoch": 0.4104592042611573, + "grad_norm": 0.7128829956054688, + "learning_rate": 0.00017058777831451967, + "loss": 2.6771, + "step": 5086 + }, + { + "epoch": 0.4105399079977403, + "grad_norm": 0.7249834537506104, + "learning_rate": 0.00017057659504408963, + "loss": 2.6376, + "step": 5087 + }, + { + "epoch": 0.4106206117343233, + "grad_norm": 0.7742593288421631, + "learning_rate": 0.00017056541001470637, + "loss": 2.6227, + "step": 5088 + }, + { + "epoch": 0.4107013154709063, + "grad_norm": 0.6994228959083557, + "learning_rate": 0.00017055422322664863, + "loss": 2.6573, + "step": 5089 + }, + { + "epoch": 0.4107820192074893, + "grad_norm": 0.7144249081611633, + "learning_rate": 0.00017054303468019518, + "loss": 2.6602, + "step": 5090 + }, + { + "epoch": 0.41086272294407233, + "grad_norm": 0.7695099711418152, + "learning_rate": 0.00017053184437562497, + "loss": 2.6516, + "step": 5091 + }, + { + "epoch": 0.4109434266806553, + "grad_norm": 0.7610031962394714, + "learning_rate": 0.00017052065231321678, + "loss": 2.6963, + "step": 5092 + }, + { + "epoch": 
0.41102413041723834, + "grad_norm": 0.7117859721183777, + "learning_rate": 0.0001705094584932496, + "loss": 2.6954, + "step": 5093 + }, + { + "epoch": 0.4111048341538213, + "grad_norm": 0.7891486287117004, + "learning_rate": 0.00017049826291600244, + "loss": 2.7265, + "step": 5094 + }, + { + "epoch": 0.41118553789040435, + "grad_norm": 0.7347370386123657, + "learning_rate": 0.00017048706558175423, + "loss": 2.658, + "step": 5095 + }, + { + "epoch": 0.41126624162698733, + "grad_norm": 0.7541289925575256, + "learning_rate": 0.00017047586649078414, + "loss": 2.6596, + "step": 5096 + }, + { + "epoch": 0.41134694536357036, + "grad_norm": 0.7471255660057068, + "learning_rate": 0.00017046466564337118, + "loss": 2.7008, + "step": 5097 + }, + { + "epoch": 0.41142764910015334, + "grad_norm": 0.7566937208175659, + "learning_rate": 0.00017045346303979457, + "loss": 2.7006, + "step": 5098 + }, + { + "epoch": 0.41150835283673637, + "grad_norm": 0.6991304159164429, + "learning_rate": 0.00017044225868033353, + "loss": 2.6846, + "step": 5099 + }, + { + "epoch": 0.41158905657331935, + "grad_norm": 0.7286314368247986, + "learning_rate": 0.00017043105256526724, + "loss": 2.6219, + "step": 5100 + }, + { + "epoch": 0.4116697603099023, + "grad_norm": 0.6953727006912231, + "learning_rate": 0.000170419844694875, + "loss": 2.6093, + "step": 5101 + }, + { + "epoch": 0.41175046404648535, + "grad_norm": 0.6942756772041321, + "learning_rate": 0.00017040863506943615, + "loss": 2.6399, + "step": 5102 + }, + { + "epoch": 0.41183116778306833, + "grad_norm": 0.7513531446456909, + "learning_rate": 0.00017039742368923005, + "loss": 2.6187, + "step": 5103 + }, + { + "epoch": 0.41191187151965136, + "grad_norm": 0.7530633211135864, + "learning_rate": 0.00017038621055453617, + "loss": 2.6124, + "step": 5104 + }, + { + "epoch": 0.41199257525623434, + "grad_norm": 0.7487555146217346, + "learning_rate": 0.00017037499566563392, + "loss": 2.6331, + "step": 5105 + }, + { + "epoch": 0.41207327899281737, + 
"grad_norm": 0.7641858458518982, + "learning_rate": 0.00017036377902280282, + "loss": 2.6875, + "step": 5106 + }, + { + "epoch": 0.41215398272940035, + "grad_norm": 0.6962767839431763, + "learning_rate": 0.0001703525606263224, + "loss": 2.6538, + "step": 5107 + }, + { + "epoch": 0.4122346864659834, + "grad_norm": 0.8183409571647644, + "learning_rate": 0.0001703413404764723, + "loss": 2.6204, + "step": 5108 + }, + { + "epoch": 0.41231539020256636, + "grad_norm": 0.7029808759689331, + "learning_rate": 0.00017033011857353207, + "loss": 2.6369, + "step": 5109 + }, + { + "epoch": 0.4123960939391494, + "grad_norm": 0.7171663045883179, + "learning_rate": 0.00017031889491778149, + "loss": 2.6211, + "step": 5110 + }, + { + "epoch": 0.41247679767573237, + "grad_norm": 0.7456090450286865, + "learning_rate": 0.0001703076695095002, + "loss": 2.6574, + "step": 5111 + }, + { + "epoch": 0.4125575014123154, + "grad_norm": 0.7468575239181519, + "learning_rate": 0.000170296442348968, + "loss": 2.598, + "step": 5112 + }, + { + "epoch": 0.4126382051488984, + "grad_norm": 0.7106603384017944, + "learning_rate": 0.0001702852134364647, + "loss": 2.6577, + "step": 5113 + }, + { + "epoch": 0.4127189088854814, + "grad_norm": 0.7788330912590027, + "learning_rate": 0.00017027398277227017, + "loss": 2.6797, + "step": 5114 + }, + { + "epoch": 0.4127996126220644, + "grad_norm": 0.7794120907783508, + "learning_rate": 0.00017026275035666427, + "loss": 2.5834, + "step": 5115 + }, + { + "epoch": 0.4128803163586474, + "grad_norm": 0.7270684838294983, + "learning_rate": 0.00017025151618992702, + "loss": 2.7153, + "step": 5116 + }, + { + "epoch": 0.4129610200952304, + "grad_norm": 0.8169006109237671, + "learning_rate": 0.00017024028027233827, + "loss": 2.6786, + "step": 5117 + }, + { + "epoch": 0.4130417238318134, + "grad_norm": 0.8053112626075745, + "learning_rate": 0.00017022904260417815, + "loss": 2.6456, + "step": 5118 + }, + { + "epoch": 0.4131224275683964, + "grad_norm": 0.7646365165710449, + 
"learning_rate": 0.0001702178031857267, + "loss": 2.6784, + "step": 5119 + }, + { + "epoch": 0.41320313130497943, + "grad_norm": 0.7878902554512024, + "learning_rate": 0.00017020656201726406, + "loss": 2.66, + "step": 5120 + }, + { + "epoch": 0.4132838350415624, + "grad_norm": 0.8602383732795715, + "learning_rate": 0.00017019531909907037, + "loss": 2.7018, + "step": 5121 + }, + { + "epoch": 0.41336453877814544, + "grad_norm": 0.801092267036438, + "learning_rate": 0.00017018407443142585, + "loss": 2.7728, + "step": 5122 + }, + { + "epoch": 0.4134452425147284, + "grad_norm": 0.7372604012489319, + "learning_rate": 0.00017017282801461074, + "loss": 2.6588, + "step": 5123 + }, + { + "epoch": 0.41352594625131145, + "grad_norm": 0.7553830146789551, + "learning_rate": 0.0001701615798489053, + "loss": 2.6844, + "step": 5124 + }, + { + "epoch": 0.4136066499878944, + "grad_norm": 0.7699872255325317, + "learning_rate": 0.0001701503299345899, + "loss": 2.6523, + "step": 5125 + }, + { + "epoch": 0.41368735372447746, + "grad_norm": 0.7087047696113586, + "learning_rate": 0.0001701390782719449, + "loss": 2.6785, + "step": 5126 + }, + { + "epoch": 0.41376805746106043, + "grad_norm": 0.7835792303085327, + "learning_rate": 0.0001701278248612507, + "loss": 2.7064, + "step": 5127 + }, + { + "epoch": 0.41384876119764347, + "grad_norm": 0.7833154201507568, + "learning_rate": 0.0001701165697027878, + "loss": 2.6552, + "step": 5128 + }, + { + "epoch": 0.41392946493422644, + "grad_norm": 0.8240615725517273, + "learning_rate": 0.0001701053127968367, + "loss": 2.7074, + "step": 5129 + }, + { + "epoch": 0.4140101686708095, + "grad_norm": 0.7612149119377136, + "learning_rate": 0.0001700940541436779, + "loss": 2.7484, + "step": 5130 + }, + { + "epoch": 0.41409087240739245, + "grad_norm": 0.7795391082763672, + "learning_rate": 0.00017008279374359212, + "loss": 2.6022, + "step": 5131 + }, + { + "epoch": 0.4141715761439755, + "grad_norm": 0.7714587450027466, + "learning_rate": 
0.00017007153159685992, + "loss": 2.6529, + "step": 5132 + }, + { + "epoch": 0.41425227988055846, + "grad_norm": 0.7821317911148071, + "learning_rate": 0.00017006026770376194, + "loss": 2.6356, + "step": 5133 + }, + { + "epoch": 0.4143329836171415, + "grad_norm": 0.7300596833229065, + "learning_rate": 0.00017004900206457897, + "loss": 2.6552, + "step": 5134 + }, + { + "epoch": 0.41441368735372447, + "grad_norm": 0.780505359172821, + "learning_rate": 0.00017003773467959174, + "loss": 2.675, + "step": 5135 + }, + { + "epoch": 0.4144943910903075, + "grad_norm": 0.7107391357421875, + "learning_rate": 0.00017002646554908107, + "loss": 2.7096, + "step": 5136 + }, + { + "epoch": 0.4145750948268905, + "grad_norm": 0.7358834743499756, + "learning_rate": 0.0001700151946733279, + "loss": 2.6619, + "step": 5137 + }, + { + "epoch": 0.4146557985634735, + "grad_norm": 0.7573859095573425, + "learning_rate": 0.00017000392205261298, + "loss": 2.6234, + "step": 5138 + }, + { + "epoch": 0.4147365023000565, + "grad_norm": 0.7032024264335632, + "learning_rate": 0.00016999264768721738, + "loss": 2.6096, + "step": 5139 + }, + { + "epoch": 0.4148172060366395, + "grad_norm": 0.743813693523407, + "learning_rate": 0.00016998137157742203, + "loss": 2.6782, + "step": 5140 + }, + { + "epoch": 0.4148979097732225, + "grad_norm": 0.8861347436904907, + "learning_rate": 0.00016997009372350793, + "loss": 2.6645, + "step": 5141 + }, + { + "epoch": 0.4149786135098055, + "grad_norm": 0.7598684430122375, + "learning_rate": 0.00016995881412575623, + "loss": 2.649, + "step": 5142 + }, + { + "epoch": 0.4150593172463885, + "grad_norm": 0.7535565495491028, + "learning_rate": 0.00016994753278444798, + "loss": 2.6449, + "step": 5143 + }, + { + "epoch": 0.41514002098297154, + "grad_norm": 0.7073138356208801, + "learning_rate": 0.0001699362496998644, + "loss": 2.6253, + "step": 5144 + }, + { + "epoch": 0.4152207247195545, + "grad_norm": 0.7161526679992676, + "learning_rate": 0.00016992496487228662, + "loss": 
2.6623, + "step": 5145 + }, + { + "epoch": 0.41530142845613754, + "grad_norm": 0.8284714818000793, + "learning_rate": 0.00016991367830199595, + "loss": 2.7363, + "step": 5146 + }, + { + "epoch": 0.4153821321927205, + "grad_norm": 0.7127673625946045, + "learning_rate": 0.0001699023899892737, + "loss": 2.6274, + "step": 5147 + }, + { + "epoch": 0.41546283592930355, + "grad_norm": 0.7496370673179626, + "learning_rate": 0.00016989109993440112, + "loss": 2.6364, + "step": 5148 + }, + { + "epoch": 0.41554353966588653, + "grad_norm": 0.7616143822669983, + "learning_rate": 0.00016987980813765963, + "loss": 2.7225, + "step": 5149 + }, + { + "epoch": 0.41562424340246956, + "grad_norm": 0.6935909986495972, + "learning_rate": 0.00016986851459933067, + "loss": 2.6109, + "step": 5150 + }, + { + "epoch": 0.41570494713905254, + "grad_norm": 0.721023678779602, + "learning_rate": 0.00016985721931969566, + "loss": 2.6993, + "step": 5151 + }, + { + "epoch": 0.4157856508756355, + "grad_norm": 0.8216699361801147, + "learning_rate": 0.00016984592229903617, + "loss": 2.6512, + "step": 5152 + }, + { + "epoch": 0.41586635461221855, + "grad_norm": 0.7425234913825989, + "learning_rate": 0.00016983462353763372, + "loss": 2.5903, + "step": 5153 + }, + { + "epoch": 0.4159470583488015, + "grad_norm": 0.7292542457580566, + "learning_rate": 0.00016982332303576986, + "loss": 2.692, + "step": 5154 + }, + { + "epoch": 0.41602776208538456, + "grad_norm": 0.7466831803321838, + "learning_rate": 0.0001698120207937263, + "loss": 2.7145, + "step": 5155 + }, + { + "epoch": 0.41610846582196753, + "grad_norm": 0.7271949648857117, + "learning_rate": 0.00016980071681178471, + "loss": 2.655, + "step": 5156 + }, + { + "epoch": 0.41618916955855056, + "grad_norm": 0.7505547404289246, + "learning_rate": 0.00016978941109022677, + "loss": 2.7167, + "step": 5157 + }, + { + "epoch": 0.41626987329513354, + "grad_norm": 0.7307172417640686, + "learning_rate": 0.00016977810362933427, + "loss": 2.6735, + "step": 5158 + }, + { 
+ "epoch": 0.4163505770317166, + "grad_norm": 0.7839170098304749, + "learning_rate": 0.00016976679442938904, + "loss": 2.6818, + "step": 5159 + }, + { + "epoch": 0.41643128076829955, + "grad_norm": 0.7131803631782532, + "learning_rate": 0.00016975548349067293, + "loss": 2.6921, + "step": 5160 + }, + { + "epoch": 0.4165119845048826, + "grad_norm": 0.8129798173904419, + "learning_rate": 0.0001697441708134678, + "loss": 2.6682, + "step": 5161 + }, + { + "epoch": 0.41659268824146556, + "grad_norm": 0.7634746432304382, + "learning_rate": 0.00016973285639805563, + "loss": 2.6684, + "step": 5162 + }, + { + "epoch": 0.4166733919780486, + "grad_norm": 0.7367348074913025, + "learning_rate": 0.0001697215402447184, + "loss": 2.6424, + "step": 5163 + }, + { + "epoch": 0.41675409571463157, + "grad_norm": 0.7235338687896729, + "learning_rate": 0.00016971022235373815, + "loss": 2.6817, + "step": 5164 + }, + { + "epoch": 0.4168347994512146, + "grad_norm": 0.7764291763305664, + "learning_rate": 0.0001696989027253969, + "loss": 2.6477, + "step": 5165 + }, + { + "epoch": 0.4169155031877976, + "grad_norm": 0.8207562565803528, + "learning_rate": 0.00016968758135997683, + "loss": 2.6408, + "step": 5166 + }, + { + "epoch": 0.4169962069243806, + "grad_norm": 0.7291484475135803, + "learning_rate": 0.00016967625825776005, + "loss": 2.6233, + "step": 5167 + }, + { + "epoch": 0.4170769106609636, + "grad_norm": 0.7060603499412537, + "learning_rate": 0.0001696649334190288, + "loss": 2.6204, + "step": 5168 + }, + { + "epoch": 0.4171576143975466, + "grad_norm": 0.7058241963386536, + "learning_rate": 0.00016965360684406528, + "loss": 2.6212, + "step": 5169 + }, + { + "epoch": 0.4172383181341296, + "grad_norm": 0.8248410224914551, + "learning_rate": 0.00016964227853315177, + "loss": 2.6688, + "step": 5170 + }, + { + "epoch": 0.4173190218707126, + "grad_norm": 0.7287606596946716, + "learning_rate": 0.0001696309484865707, + "loss": 2.6201, + "step": 5171 + }, + { + "epoch": 0.4173997256072956, + 
"grad_norm": 0.7214288115501404, + "learning_rate": 0.00016961961670460433, + "loss": 2.682, + "step": 5172 + }, + { + "epoch": 0.41748042934387863, + "grad_norm": 0.7133594155311584, + "learning_rate": 0.00016960828318753516, + "loss": 2.7167, + "step": 5173 + }, + { + "epoch": 0.4175611330804616, + "grad_norm": 0.6935842633247375, + "learning_rate": 0.00016959694793564558, + "loss": 2.6134, + "step": 5174 + }, + { + "epoch": 0.41764183681704464, + "grad_norm": 0.6863382458686829, + "learning_rate": 0.00016958561094921815, + "loss": 2.6396, + "step": 5175 + }, + { + "epoch": 0.4177225405536276, + "grad_norm": 0.7659433484077454, + "learning_rate": 0.0001695742722285354, + "loss": 2.6926, + "step": 5176 + }, + { + "epoch": 0.41780324429021065, + "grad_norm": 0.6997129917144775, + "learning_rate": 0.00016956293177387992, + "loss": 2.6983, + "step": 5177 + }, + { + "epoch": 0.41788394802679363, + "grad_norm": 0.6784526705741882, + "learning_rate": 0.00016955158958553433, + "loss": 2.6961, + "step": 5178 + }, + { + "epoch": 0.41796465176337666, + "grad_norm": 0.8227884769439697, + "learning_rate": 0.00016954024566378132, + "loss": 2.7008, + "step": 5179 + }, + { + "epoch": 0.41804535549995964, + "grad_norm": 0.7733054757118225, + "learning_rate": 0.0001695289000089036, + "loss": 2.6615, + "step": 5180 + }, + { + "epoch": 0.41812605923654267, + "grad_norm": 0.7077545523643494, + "learning_rate": 0.00016951755262118394, + "loss": 2.6388, + "step": 5181 + }, + { + "epoch": 0.41820676297312565, + "grad_norm": 0.7962050437927246, + "learning_rate": 0.00016950620350090513, + "loss": 2.7063, + "step": 5182 + }, + { + "epoch": 0.4182874667097087, + "grad_norm": 0.6950554847717285, + "learning_rate": 0.00016949485264835005, + "loss": 2.7076, + "step": 5183 + }, + { + "epoch": 0.41836817044629165, + "grad_norm": 0.8546960949897766, + "learning_rate": 0.00016948350006380162, + "loss": 2.6533, + "step": 5184 + }, + { + "epoch": 0.4184488741828747, + "grad_norm": 
0.7469324469566345, + "learning_rate": 0.00016947214574754272, + "loss": 2.5884, + "step": 5185 + }, + { + "epoch": 0.41852957791945766, + "grad_norm": 0.7125554084777832, + "learning_rate": 0.0001694607896998563, + "loss": 2.6448, + "step": 5186 + }, + { + "epoch": 0.4186102816560407, + "grad_norm": 0.6998329758644104, + "learning_rate": 0.00016944943192102549, + "loss": 2.5569, + "step": 5187 + }, + { + "epoch": 0.41869098539262367, + "grad_norm": 0.9046749472618103, + "learning_rate": 0.00016943807241133328, + "loss": 2.7701, + "step": 5188 + }, + { + "epoch": 0.4187716891292067, + "grad_norm": 0.7842074036598206, + "learning_rate": 0.00016942671117106274, + "loss": 2.7124, + "step": 5189 + }, + { + "epoch": 0.4188523928657897, + "grad_norm": 0.7625874280929565, + "learning_rate": 0.00016941534820049713, + "loss": 2.6626, + "step": 5190 + }, + { + "epoch": 0.4189330966023727, + "grad_norm": 0.7006461024284363, + "learning_rate": 0.00016940398349991957, + "loss": 2.6283, + "step": 5191 + }, + { + "epoch": 0.4190138003389557, + "grad_norm": 0.7081875205039978, + "learning_rate": 0.00016939261706961332, + "loss": 2.69, + "step": 5192 + }, + { + "epoch": 0.4190945040755387, + "grad_norm": 0.7554503083229065, + "learning_rate": 0.00016938124890986166, + "loss": 2.641, + "step": 5193 + }, + { + "epoch": 0.4191752078121217, + "grad_norm": 0.7478535175323486, + "learning_rate": 0.0001693698790209479, + "loss": 2.7035, + "step": 5194 + }, + { + "epoch": 0.41925591154870473, + "grad_norm": 0.7323064208030701, + "learning_rate": 0.00016935850740315545, + "loss": 2.6713, + "step": 5195 + }, + { + "epoch": 0.4193366152852877, + "grad_norm": 0.8011505007743835, + "learning_rate": 0.00016934713405676764, + "loss": 2.6413, + "step": 5196 + }, + { + "epoch": 0.41941731902187074, + "grad_norm": 0.768851637840271, + "learning_rate": 0.00016933575898206804, + "loss": 2.6147, + "step": 5197 + }, + { + "epoch": 0.4194980227584537, + "grad_norm": 0.7255160808563232, + "learning_rate": 
0.00016932438217934006, + "loss": 2.6093, + "step": 5198 + }, + { + "epoch": 0.41957872649503675, + "grad_norm": 0.7431769967079163, + "learning_rate": 0.00016931300364886722, + "loss": 2.6658, + "step": 5199 + }, + { + "epoch": 0.4196594302316197, + "grad_norm": 0.7532122731208801, + "learning_rate": 0.00016930162339093318, + "loss": 2.6371, + "step": 5200 + }, + { + "epoch": 0.41974013396820276, + "grad_norm": 0.7253943681716919, + "learning_rate": 0.00016929024140582152, + "loss": 2.6365, + "step": 5201 + }, + { + "epoch": 0.41982083770478573, + "grad_norm": 0.7323265075683594, + "learning_rate": 0.00016927885769381593, + "loss": 2.7096, + "step": 5202 + }, + { + "epoch": 0.4199015414413687, + "grad_norm": 0.7340009808540344, + "learning_rate": 0.00016926747225520008, + "loss": 2.6983, + "step": 5203 + }, + { + "epoch": 0.41998224517795174, + "grad_norm": 0.838706374168396, + "learning_rate": 0.00016925608509025776, + "loss": 2.7098, + "step": 5204 + }, + { + "epoch": 0.4200629489145347, + "grad_norm": 0.7320838570594788, + "learning_rate": 0.0001692446961992728, + "loss": 2.6767, + "step": 5205 + }, + { + "epoch": 0.42014365265111775, + "grad_norm": 0.7275335192680359, + "learning_rate": 0.00016923330558252898, + "loss": 2.6754, + "step": 5206 + }, + { + "epoch": 0.4202243563877007, + "grad_norm": 0.7572353482246399, + "learning_rate": 0.00016922191324031017, + "loss": 2.7076, + "step": 5207 + }, + { + "epoch": 0.42030506012428376, + "grad_norm": 0.7991098165512085, + "learning_rate": 0.0001692105191729004, + "loss": 2.7281, + "step": 5208 + }, + { + "epoch": 0.42038576386086673, + "grad_norm": 0.70769202709198, + "learning_rate": 0.00016919912338058356, + "loss": 2.684, + "step": 5209 + }, + { + "epoch": 0.42046646759744977, + "grad_norm": 0.6895349621772766, + "learning_rate": 0.0001691877258636436, + "loss": 2.6723, + "step": 5210 + }, + { + "epoch": 0.42054717133403274, + "grad_norm": 0.7368944883346558, + "learning_rate": 0.00016917632662236476, + "loss": 
2.601, + "step": 5211 + }, + { + "epoch": 0.4206278750706158, + "grad_norm": 0.7122060060501099, + "learning_rate": 0.00016916492565703097, + "loss": 2.703, + "step": 5212 + }, + { + "epoch": 0.42070857880719875, + "grad_norm": 0.735251784324646, + "learning_rate": 0.00016915352296792646, + "loss": 2.7715, + "step": 5213 + }, + { + "epoch": 0.4207892825437818, + "grad_norm": 0.7686039805412292, + "learning_rate": 0.00016914211855533536, + "loss": 2.6935, + "step": 5214 + }, + { + "epoch": 0.42086998628036476, + "grad_norm": 0.8457472920417786, + "learning_rate": 0.00016913071241954195, + "loss": 2.6535, + "step": 5215 + }, + { + "epoch": 0.4209506900169478, + "grad_norm": 0.6913465261459351, + "learning_rate": 0.00016911930456083046, + "loss": 2.6453, + "step": 5216 + }, + { + "epoch": 0.42103139375353077, + "grad_norm": 0.6939878463745117, + "learning_rate": 0.00016910789497948524, + "loss": 2.6483, + "step": 5217 + }, + { + "epoch": 0.4211120974901138, + "grad_norm": 0.7240888476371765, + "learning_rate": 0.00016909648367579062, + "loss": 2.6649, + "step": 5218 + }, + { + "epoch": 0.4211928012266968, + "grad_norm": 0.7570972442626953, + "learning_rate": 0.00016908507065003102, + "loss": 2.6633, + "step": 5219 + }, + { + "epoch": 0.4212735049632798, + "grad_norm": 0.72161465883255, + "learning_rate": 0.00016907365590249082, + "loss": 2.6999, + "step": 5220 + }, + { + "epoch": 0.4213542086998628, + "grad_norm": 0.7818038463592529, + "learning_rate": 0.00016906223943345458, + "loss": 2.6478, + "step": 5221 + }, + { + "epoch": 0.4214349124364458, + "grad_norm": 0.7292464971542358, + "learning_rate": 0.00016905082124320684, + "loss": 2.6725, + "step": 5222 + }, + { + "epoch": 0.4215156161730288, + "grad_norm": 0.7612937092781067, + "learning_rate": 0.0001690394013320321, + "loss": 2.6474, + "step": 5223 + }, + { + "epoch": 0.4215963199096118, + "grad_norm": 0.7325131297111511, + "learning_rate": 0.000169027979700215, + "loss": 2.6525, + "step": 5224 + }, + { + 
"epoch": 0.4216770236461948, + "grad_norm": 0.7736644148826599, + "learning_rate": 0.00016901655634804022, + "loss": 2.662, + "step": 5225 + }, + { + "epoch": 0.42175772738277784, + "grad_norm": 0.758522629737854, + "learning_rate": 0.00016900513127579244, + "loss": 2.6558, + "step": 5226 + }, + { + "epoch": 0.4218384311193608, + "grad_norm": 0.7559491991996765, + "learning_rate": 0.00016899370448375642, + "loss": 2.7361, + "step": 5227 + }, + { + "epoch": 0.42191913485594384, + "grad_norm": 0.7791146039962769, + "learning_rate": 0.00016898227597221692, + "loss": 2.6739, + "step": 5228 + }, + { + "epoch": 0.4219998385925268, + "grad_norm": 0.7280717492103577, + "learning_rate": 0.00016897084574145878, + "loss": 2.6316, + "step": 5229 + }, + { + "epoch": 0.42208054232910985, + "grad_norm": 0.7455596327781677, + "learning_rate": 0.0001689594137917669, + "loss": 2.7244, + "step": 5230 + }, + { + "epoch": 0.42216124606569283, + "grad_norm": 0.7965813875198364, + "learning_rate": 0.00016894798012342613, + "loss": 2.6757, + "step": 5231 + }, + { + "epoch": 0.42224194980227586, + "grad_norm": 0.6740596294403076, + "learning_rate": 0.00016893654473672148, + "loss": 2.631, + "step": 5232 + }, + { + "epoch": 0.42232265353885884, + "grad_norm": 0.695105254650116, + "learning_rate": 0.00016892510763193795, + "loss": 2.6563, + "step": 5233 + }, + { + "epoch": 0.42240335727544187, + "grad_norm": 0.7623865008354187, + "learning_rate": 0.00016891366880936051, + "loss": 2.6738, + "step": 5234 + }, + { + "epoch": 0.42248406101202485, + "grad_norm": 0.7545912265777588, + "learning_rate": 0.00016890222826927435, + "loss": 2.6949, + "step": 5235 + }, + { + "epoch": 0.4225647647486079, + "grad_norm": 0.7280749678611755, + "learning_rate": 0.00016889078601196452, + "loss": 2.6571, + "step": 5236 + }, + { + "epoch": 0.42264546848519086, + "grad_norm": 0.6624523401260376, + "learning_rate": 0.00016887934203771625, + "loss": 2.6854, + "step": 5237 + }, + { + "epoch": 0.4227261722217739, + 
"grad_norm": 0.7835487127304077, + "learning_rate": 0.0001688678963468147, + "loss": 2.6437, + "step": 5238 + }, + { + "epoch": 0.42280687595835686, + "grad_norm": 0.7384940981864929, + "learning_rate": 0.00016885644893954518, + "loss": 2.6584, + "step": 5239 + }, + { + "epoch": 0.4228875796949399, + "grad_norm": 0.8227531313896179, + "learning_rate": 0.00016884499981619292, + "loss": 2.673, + "step": 5240 + }, + { + "epoch": 0.4229682834315229, + "grad_norm": 0.7442220449447632, + "learning_rate": 0.00016883354897704334, + "loss": 2.6729, + "step": 5241 + }, + { + "epoch": 0.4230489871681059, + "grad_norm": 0.7182636857032776, + "learning_rate": 0.00016882209642238175, + "loss": 2.6833, + "step": 5242 + }, + { + "epoch": 0.4231296909046889, + "grad_norm": 0.7061870098114014, + "learning_rate": 0.00016881064215249362, + "loss": 2.6696, + "step": 5243 + }, + { + "epoch": 0.4232103946412719, + "grad_norm": 0.6792885065078735, + "learning_rate": 0.00016879918616766445, + "loss": 2.6805, + "step": 5244 + }, + { + "epoch": 0.4232910983778549, + "grad_norm": 0.7439807057380676, + "learning_rate": 0.00016878772846817968, + "loss": 2.6522, + "step": 5245 + }, + { + "epoch": 0.4233718021144379, + "grad_norm": 0.7078969478607178, + "learning_rate": 0.00016877626905432492, + "loss": 2.6549, + "step": 5246 + }, + { + "epoch": 0.4234525058510209, + "grad_norm": 0.7103868126869202, + "learning_rate": 0.00016876480792638577, + "loss": 2.6812, + "step": 5247 + }, + { + "epoch": 0.42353320958760393, + "grad_norm": 0.7224452495574951, + "learning_rate": 0.00016875334508464782, + "loss": 2.6657, + "step": 5248 + }, + { + "epoch": 0.4236139133241869, + "grad_norm": 0.6885106563568115, + "learning_rate": 0.00016874188052939682, + "loss": 2.6421, + "step": 5249 + }, + { + "epoch": 0.42369461706076994, + "grad_norm": 0.6736720204353333, + "learning_rate": 0.00016873041426091845, + "loss": 2.6717, + "step": 5250 + }, + { + "epoch": 0.4237753207973529, + "grad_norm": 0.7597963809967041, + 
"learning_rate": 0.00016871894627949846, + "loss": 2.6231, + "step": 5251 + }, + { + "epoch": 0.42385602453393595, + "grad_norm": 0.8295687437057495, + "learning_rate": 0.00016870747658542275, + "loss": 2.6631, + "step": 5252 + }, + { + "epoch": 0.4239367282705189, + "grad_norm": 0.6750548481941223, + "learning_rate": 0.0001686960051789771, + "loss": 2.6997, + "step": 5253 + }, + { + "epoch": 0.4240174320071019, + "grad_norm": 0.7229160666465759, + "learning_rate": 0.0001686845320604474, + "loss": 2.6525, + "step": 5254 + }, + { + "epoch": 0.42409813574368493, + "grad_norm": 0.8318623900413513, + "learning_rate": 0.00016867305723011967, + "loss": 2.7774, + "step": 5255 + }, + { + "epoch": 0.4241788394802679, + "grad_norm": 0.8391026854515076, + "learning_rate": 0.00016866158068827979, + "loss": 2.6712, + "step": 5256 + }, + { + "epoch": 0.42425954321685094, + "grad_norm": 0.691146969795227, + "learning_rate": 0.00016865010243521388, + "loss": 2.6459, + "step": 5257 + }, + { + "epoch": 0.4243402469534339, + "grad_norm": 0.7223602533340454, + "learning_rate": 0.00016863862247120794, + "loss": 2.6675, + "step": 5258 + }, + { + "epoch": 0.42442095069001695, + "grad_norm": 0.8400631546974182, + "learning_rate": 0.0001686271407965481, + "loss": 2.6978, + "step": 5259 + }, + { + "epoch": 0.42450165442659993, + "grad_norm": 0.737684965133667, + "learning_rate": 0.0001686156574115205, + "loss": 2.6992, + "step": 5260 + }, + { + "epoch": 0.42458235816318296, + "grad_norm": 0.7511717677116394, + "learning_rate": 0.0001686041723164114, + "loss": 2.6947, + "step": 5261 + }, + { + "epoch": 0.42466306189976594, + "grad_norm": 0.7434492707252502, + "learning_rate": 0.00016859268551150698, + "loss": 2.7353, + "step": 5262 + }, + { + "epoch": 0.42474376563634897, + "grad_norm": 0.746609628200531, + "learning_rate": 0.00016858119699709353, + "loss": 2.7519, + "step": 5263 + }, + { + "epoch": 0.42482446937293195, + "grad_norm": 0.7709949612617493, + "learning_rate": 
0.0001685697067734574, + "loss": 2.7018, + "step": 5264 + }, + { + "epoch": 0.424905173109515, + "grad_norm": 0.7496309876441956, + "learning_rate": 0.00016855821484088488, + "loss": 2.6761, + "step": 5265 + }, + { + "epoch": 0.42498587684609795, + "grad_norm": 0.7071252465248108, + "learning_rate": 0.00016854672119966243, + "loss": 2.6762, + "step": 5266 + }, + { + "epoch": 0.425066580582681, + "grad_norm": 0.7991356253623962, + "learning_rate": 0.00016853522585007658, + "loss": 2.6134, + "step": 5267 + }, + { + "epoch": 0.42514728431926396, + "grad_norm": 0.8194605708122253, + "learning_rate": 0.0001685237287924137, + "loss": 2.6601, + "step": 5268 + }, + { + "epoch": 0.425227988055847, + "grad_norm": 0.7451688051223755, + "learning_rate": 0.00016851223002696037, + "loss": 2.6631, + "step": 5269 + }, + { + "epoch": 0.42530869179242997, + "grad_norm": 0.7220263481140137, + "learning_rate": 0.0001685007295540032, + "loss": 2.6631, + "step": 5270 + }, + { + "epoch": 0.425389395529013, + "grad_norm": 0.7268854975700378, + "learning_rate": 0.00016848922737382874, + "loss": 2.6752, + "step": 5271 + }, + { + "epoch": 0.425470099265596, + "grad_norm": 0.8841642141342163, + "learning_rate": 0.00016847772348672378, + "loss": 2.7153, + "step": 5272 + }, + { + "epoch": 0.425550803002179, + "grad_norm": 0.7725942134857178, + "learning_rate": 0.00016846621789297489, + "loss": 2.6726, + "step": 5273 + }, + { + "epoch": 0.425631506738762, + "grad_norm": 0.7179448008537292, + "learning_rate": 0.00016845471059286887, + "loss": 2.6659, + "step": 5274 + }, + { + "epoch": 0.425712210475345, + "grad_norm": 0.7630325555801392, + "learning_rate": 0.00016844320158669257, + "loss": 2.7133, + "step": 5275 + }, + { + "epoch": 0.425792914211928, + "grad_norm": 0.7349739670753479, + "learning_rate": 0.00016843169087473272, + "loss": 2.6397, + "step": 5276 + }, + { + "epoch": 0.42587361794851103, + "grad_norm": 0.7670298218727112, + "learning_rate": 0.00016842017845727626, + "loss": 2.6485, + 
"step": 5277 + }, + { + "epoch": 0.425954321685094, + "grad_norm": 0.692095160484314, + "learning_rate": 0.00016840866433461013, + "loss": 2.6058, + "step": 5278 + }, + { + "epoch": 0.42603502542167704, + "grad_norm": 0.6888624429702759, + "learning_rate": 0.00016839714850702125, + "loss": 2.5757, + "step": 5279 + }, + { + "epoch": 0.42611572915826, + "grad_norm": 0.6816484332084656, + "learning_rate": 0.00016838563097479664, + "loss": 2.6656, + "step": 5280 + }, + { + "epoch": 0.42619643289484305, + "grad_norm": 0.7778486609458923, + "learning_rate": 0.00016837411173822333, + "loss": 2.6738, + "step": 5281 + }, + { + "epoch": 0.426277136631426, + "grad_norm": 0.73436439037323, + "learning_rate": 0.00016836259079758845, + "loss": 2.6346, + "step": 5282 + }, + { + "epoch": 0.42635784036800906, + "grad_norm": 0.673528254032135, + "learning_rate": 0.00016835106815317908, + "loss": 2.6636, + "step": 5283 + }, + { + "epoch": 0.42643854410459203, + "grad_norm": 0.6892737150192261, + "learning_rate": 0.00016833954380528242, + "loss": 2.6723, + "step": 5284 + }, + { + "epoch": 0.42651924784117506, + "grad_norm": 0.7404607534408569, + "learning_rate": 0.00016832801775418571, + "loss": 2.6751, + "step": 5285 + }, + { + "epoch": 0.42659995157775804, + "grad_norm": 0.7040587663650513, + "learning_rate": 0.00016831649000017618, + "loss": 2.6079, + "step": 5286 + }, + { + "epoch": 0.4266806553143411, + "grad_norm": 0.7295164465904236, + "learning_rate": 0.00016830496054354112, + "loss": 2.5928, + "step": 5287 + }, + { + "epoch": 0.42676135905092405, + "grad_norm": 0.7269962430000305, + "learning_rate": 0.00016829342938456788, + "loss": 2.6648, + "step": 5288 + }, + { + "epoch": 0.4268420627875071, + "grad_norm": 0.7296550273895264, + "learning_rate": 0.0001682818965235439, + "loss": 2.6814, + "step": 5289 + }, + { + "epoch": 0.42692276652409006, + "grad_norm": 0.8376085758209229, + "learning_rate": 0.00016827036196075655, + "loss": 2.702, + "step": 5290 + }, + { + "epoch": 
0.4270034702606731, + "grad_norm": 0.7461032271385193, + "learning_rate": 0.00016825882569649332, + "loss": 2.6959, + "step": 5291 + }, + { + "epoch": 0.42708417399725607, + "grad_norm": 0.7218661308288574, + "learning_rate": 0.00016824728773104171, + "loss": 2.7182, + "step": 5292 + }, + { + "epoch": 0.4271648777338391, + "grad_norm": 0.7012860774993896, + "learning_rate": 0.00016823574806468933, + "loss": 2.6989, + "step": 5293 + }, + { + "epoch": 0.4272455814704221, + "grad_norm": 0.7039482593536377, + "learning_rate": 0.0001682242066977237, + "loss": 2.6153, + "step": 5294 + }, + { + "epoch": 0.4273262852070051, + "grad_norm": 0.8783851861953735, + "learning_rate": 0.0001682126636304325, + "loss": 2.7174, + "step": 5295 + }, + { + "epoch": 0.4274069889435881, + "grad_norm": 0.7266566157341003, + "learning_rate": 0.00016820111886310343, + "loss": 2.6571, + "step": 5296 + }, + { + "epoch": 0.4274876926801711, + "grad_norm": 0.7512212991714478, + "learning_rate": 0.0001681895723960242, + "loss": 2.6802, + "step": 5297 + }, + { + "epoch": 0.4275683964167541, + "grad_norm": 0.7786974310874939, + "learning_rate": 0.00016817802422948254, + "loss": 2.6514, + "step": 5298 + }, + { + "epoch": 0.4276491001533371, + "grad_norm": 0.7454531788825989, + "learning_rate": 0.00016816647436376634, + "loss": 2.6508, + "step": 5299 + }, + { + "epoch": 0.4277298038899201, + "grad_norm": 0.7542992830276489, + "learning_rate": 0.0001681549227991634, + "loss": 2.6455, + "step": 5300 + }, + { + "epoch": 0.42781050762650313, + "grad_norm": 0.7405722141265869, + "learning_rate": 0.0001681433695359616, + "loss": 2.6505, + "step": 5301 + }, + { + "epoch": 0.4278912113630861, + "grad_norm": 0.7120002508163452, + "learning_rate": 0.00016813181457444896, + "loss": 2.6652, + "step": 5302 + }, + { + "epoch": 0.42797191509966914, + "grad_norm": 0.7645997405052185, + "learning_rate": 0.00016812025791491334, + "loss": 2.6456, + "step": 5303 + }, + { + "epoch": 0.4280526188362521, + "grad_norm": 
0.7214465141296387, + "learning_rate": 0.00016810869955764286, + "loss": 2.6261, + "step": 5304 + }, + { + "epoch": 0.4281333225728351, + "grad_norm": 0.7653367519378662, + "learning_rate": 0.00016809713950292551, + "loss": 2.7295, + "step": 5305 + }, + { + "epoch": 0.4282140263094181, + "grad_norm": 0.6798970103263855, + "learning_rate": 0.0001680855777510495, + "loss": 2.6549, + "step": 5306 + }, + { + "epoch": 0.4282947300460011, + "grad_norm": 0.7693684101104736, + "learning_rate": 0.00016807401430230288, + "loss": 2.7001, + "step": 5307 + }, + { + "epoch": 0.42837543378258414, + "grad_norm": 0.6962063312530518, + "learning_rate": 0.00016806244915697384, + "loss": 2.6582, + "step": 5308 + }, + { + "epoch": 0.4284561375191671, + "grad_norm": 0.7526959776878357, + "learning_rate": 0.00016805088231535068, + "loss": 2.7204, + "step": 5309 + }, + { + "epoch": 0.42853684125575014, + "grad_norm": 0.7403820753097534, + "learning_rate": 0.0001680393137777217, + "loss": 2.6505, + "step": 5310 + }, + { + "epoch": 0.4286175449923331, + "grad_norm": 0.7056909799575806, + "learning_rate": 0.00016802774354437506, + "loss": 2.5981, + "step": 5311 + }, + { + "epoch": 0.42869824872891615, + "grad_norm": 0.6756439805030823, + "learning_rate": 0.0001680161716155993, + "loss": 2.6845, + "step": 5312 + }, + { + "epoch": 0.42877895246549913, + "grad_norm": 0.7634297013282776, + "learning_rate": 0.0001680045979916827, + "loss": 2.6399, + "step": 5313 + }, + { + "epoch": 0.42885965620208216, + "grad_norm": 0.6793022751808167, + "learning_rate": 0.0001679930226729138, + "loss": 2.6808, + "step": 5314 + }, + { + "epoch": 0.42894035993866514, + "grad_norm": 0.7692369222640991, + "learning_rate": 0.00016798144565958103, + "loss": 2.673, + "step": 5315 + }, + { + "epoch": 0.42902106367524817, + "grad_norm": 0.668798565864563, + "learning_rate": 0.00016796986695197293, + "loss": 2.6465, + "step": 5316 + }, + { + "epoch": 0.42910176741183115, + "grad_norm": 0.719160795211792, + 
"learning_rate": 0.00016795828655037805, + "loss": 2.5876, + "step": 5317 + }, + { + "epoch": 0.4291824711484142, + "grad_norm": 0.7352864742279053, + "learning_rate": 0.000167946704455085, + "loss": 2.625, + "step": 5318 + }, + { + "epoch": 0.42926317488499716, + "grad_norm": 0.7103392481803894, + "learning_rate": 0.00016793512066638254, + "loss": 2.602, + "step": 5319 + }, + { + "epoch": 0.4293438786215802, + "grad_norm": 0.7005727291107178, + "learning_rate": 0.0001679235351845592, + "loss": 2.6723, + "step": 5320 + }, + { + "epoch": 0.42942458235816316, + "grad_norm": 0.7686243653297424, + "learning_rate": 0.00016791194800990387, + "loss": 2.693, + "step": 5321 + }, + { + "epoch": 0.4295052860947462, + "grad_norm": 0.7026933431625366, + "learning_rate": 0.00016790035914270526, + "loss": 2.6334, + "step": 5322 + }, + { + "epoch": 0.4295859898313292, + "grad_norm": 0.748938262462616, + "learning_rate": 0.0001678887685832522, + "loss": 2.6757, + "step": 5323 + }, + { + "epoch": 0.4296666935679122, + "grad_norm": 0.7753568887710571, + "learning_rate": 0.00016787717633183355, + "loss": 2.6782, + "step": 5324 + }, + { + "epoch": 0.4297473973044952, + "grad_norm": 0.7605767846107483, + "learning_rate": 0.00016786558238873823, + "loss": 2.6822, + "step": 5325 + }, + { + "epoch": 0.4298281010410782, + "grad_norm": 0.7516531348228455, + "learning_rate": 0.00016785398675425524, + "loss": 2.6802, + "step": 5326 + }, + { + "epoch": 0.4299088047776612, + "grad_norm": 0.7551677227020264, + "learning_rate": 0.0001678423894286735, + "loss": 2.6509, + "step": 5327 + }, + { + "epoch": 0.4299895085142442, + "grad_norm": 0.765364944934845, + "learning_rate": 0.00016783079041228206, + "loss": 2.6552, + "step": 5328 + }, + { + "epoch": 0.4300702122508272, + "grad_norm": 0.7016649842262268, + "learning_rate": 0.00016781918970537002, + "loss": 2.6861, + "step": 5329 + }, + { + "epoch": 0.43015091598741023, + "grad_norm": 0.7266311645507812, + "learning_rate": 0.0001678075873082265, + 
"loss": 2.7064, + "step": 5330 + }, + { + "epoch": 0.4302316197239932, + "grad_norm": 0.7414532899856567, + "learning_rate": 0.00016779598322114064, + "loss": 2.6273, + "step": 5331 + }, + { + "epoch": 0.43031232346057624, + "grad_norm": 0.7032443881034851, + "learning_rate": 0.00016778437744440167, + "loss": 2.6577, + "step": 5332 + }, + { + "epoch": 0.4303930271971592, + "grad_norm": 0.7150338888168335, + "learning_rate": 0.00016777276997829882, + "loss": 2.6586, + "step": 5333 + }, + { + "epoch": 0.43047373093374225, + "grad_norm": 0.6893971562385559, + "learning_rate": 0.0001677611608231214, + "loss": 2.6713, + "step": 5334 + }, + { + "epoch": 0.4305544346703252, + "grad_norm": 0.861935555934906, + "learning_rate": 0.00016774954997915867, + "loss": 2.7037, + "step": 5335 + }, + { + "epoch": 0.43063513840690826, + "grad_norm": 0.7140138745307922, + "learning_rate": 0.00016773793744670012, + "loss": 2.6684, + "step": 5336 + }, + { + "epoch": 0.43071584214349123, + "grad_norm": 0.7245929837226868, + "learning_rate": 0.00016772632322603506, + "loss": 2.6349, + "step": 5337 + }, + { + "epoch": 0.43079654588007427, + "grad_norm": 0.7216203808784485, + "learning_rate": 0.000167714707317453, + "loss": 2.6338, + "step": 5338 + }, + { + "epoch": 0.43087724961665724, + "grad_norm": 0.7076452374458313, + "learning_rate": 0.00016770308972124343, + "loss": 2.6614, + "step": 5339 + }, + { + "epoch": 0.4309579533532403, + "grad_norm": 0.7392035722732544, + "learning_rate": 0.00016769147043769586, + "loss": 2.6697, + "step": 5340 + }, + { + "epoch": 0.43103865708982325, + "grad_norm": 0.7235357761383057, + "learning_rate": 0.00016767984946709994, + "loss": 2.6664, + "step": 5341 + }, + { + "epoch": 0.4311193608264063, + "grad_norm": 0.6985526084899902, + "learning_rate": 0.00016766822680974524, + "loss": 2.6157, + "step": 5342 + }, + { + "epoch": 0.43120006456298926, + "grad_norm": 0.769963264465332, + "learning_rate": 0.0001676566024659214, + "loss": 2.6096, + "step": 5343 + 
}, + { + "epoch": 0.4312807682995723, + "grad_norm": 0.7504093050956726, + "learning_rate": 0.00016764497643591823, + "loss": 2.5795, + "step": 5344 + }, + { + "epoch": 0.43136147203615527, + "grad_norm": 0.7193379402160645, + "learning_rate": 0.0001676333487200254, + "loss": 2.6158, + "step": 5345 + }, + { + "epoch": 0.4314421757727383, + "grad_norm": 0.777357280254364, + "learning_rate": 0.00016762171931853273, + "loss": 2.6388, + "step": 5346 + }, + { + "epoch": 0.4315228795093213, + "grad_norm": 0.8590179085731506, + "learning_rate": 0.00016761008823173003, + "loss": 2.6597, + "step": 5347 + }, + { + "epoch": 0.4316035832459043, + "grad_norm": 0.7040170431137085, + "learning_rate": 0.0001675984554599072, + "loss": 2.6447, + "step": 5348 + }, + { + "epoch": 0.4316842869824873, + "grad_norm": 0.7682301998138428, + "learning_rate": 0.00016758682100335417, + "loss": 2.6738, + "step": 5349 + }, + { + "epoch": 0.4317649907190703, + "grad_norm": 0.8342414498329163, + "learning_rate": 0.00016757518486236087, + "loss": 2.7058, + "step": 5350 + }, + { + "epoch": 0.4318456944556533, + "grad_norm": 0.7410600781440735, + "learning_rate": 0.00016756354703721736, + "loss": 2.6597, + "step": 5351 + }, + { + "epoch": 0.4319263981922363, + "grad_norm": 0.7633174061775208, + "learning_rate": 0.00016755190752821363, + "loss": 2.6461, + "step": 5352 + }, + { + "epoch": 0.4320071019288193, + "grad_norm": 0.7855150103569031, + "learning_rate": 0.00016754026633563973, + "loss": 2.6556, + "step": 5353 + }, + { + "epoch": 0.43208780566540234, + "grad_norm": 0.7197602391242981, + "learning_rate": 0.00016752862345978587, + "loss": 2.6511, + "step": 5354 + }, + { + "epoch": 0.4321685094019853, + "grad_norm": 0.7748876810073853, + "learning_rate": 0.00016751697890094223, + "loss": 2.7, + "step": 5355 + }, + { + "epoch": 0.4322492131385683, + "grad_norm": 0.7457308173179626, + "learning_rate": 0.00016750533265939895, + "loss": 2.6934, + "step": 5356 + }, + { + "epoch": 0.4323299168751513, + 
"grad_norm": 0.8003394603729248, + "learning_rate": 0.00016749368473544633, + "loss": 2.6273, + "step": 5357 + }, + { + "epoch": 0.4324106206117343, + "grad_norm": 0.7163615822792053, + "learning_rate": 0.00016748203512937464, + "loss": 2.6605, + "step": 5358 + }, + { + "epoch": 0.43249132434831733, + "grad_norm": 0.6859120726585388, + "learning_rate": 0.00016747038384147422, + "loss": 2.6748, + "step": 5359 + }, + { + "epoch": 0.4325720280849003, + "grad_norm": 0.7169440984725952, + "learning_rate": 0.0001674587308720355, + "loss": 2.6674, + "step": 5360 + }, + { + "epoch": 0.43265273182148334, + "grad_norm": 0.7762351036071777, + "learning_rate": 0.00016744707622134888, + "loss": 2.6673, + "step": 5361 + }, + { + "epoch": 0.4327334355580663, + "grad_norm": 0.7169542908668518, + "learning_rate": 0.0001674354198897048, + "loss": 2.7341, + "step": 5362 + }, + { + "epoch": 0.43281413929464935, + "grad_norm": 0.7903403043746948, + "learning_rate": 0.00016742376187739376, + "loss": 2.6019, + "step": 5363 + }, + { + "epoch": 0.4328948430312323, + "grad_norm": 0.8395403027534485, + "learning_rate": 0.00016741210218470634, + "loss": 2.6519, + "step": 5364 + }, + { + "epoch": 0.43297554676781536, + "grad_norm": 0.7521546483039856, + "learning_rate": 0.0001674004408119331, + "loss": 2.6067, + "step": 5365 + }, + { + "epoch": 0.43305625050439833, + "grad_norm": 0.7186779975891113, + "learning_rate": 0.0001673887777593647, + "loss": 2.6435, + "step": 5366 + }, + { + "epoch": 0.43313695424098136, + "grad_norm": 0.7362968921661377, + "learning_rate": 0.0001673771130272918, + "loss": 2.6031, + "step": 5367 + }, + { + "epoch": 0.43321765797756434, + "grad_norm": 0.8033537864685059, + "learning_rate": 0.0001673654466160051, + "loss": 2.7234, + "step": 5368 + }, + { + "epoch": 0.4332983617141474, + "grad_norm": 0.7109711766242981, + "learning_rate": 0.0001673537785257954, + "loss": 2.6621, + "step": 5369 + }, + { + "epoch": 0.43337906545073035, + "grad_norm": 0.7499226927757263, + 
"learning_rate": 0.0001673421087569535, + "loss": 2.706, + "step": 5370 + }, + { + "epoch": 0.4334597691873134, + "grad_norm": 0.7192875146865845, + "learning_rate": 0.00016733043730977017, + "loss": 2.6053, + "step": 5371 + }, + { + "epoch": 0.43354047292389636, + "grad_norm": 0.6939374208450317, + "learning_rate": 0.00016731876418453636, + "loss": 2.6621, + "step": 5372 + }, + { + "epoch": 0.4336211766604794, + "grad_norm": 0.720741331577301, + "learning_rate": 0.00016730708938154297, + "loss": 2.6358, + "step": 5373 + }, + { + "epoch": 0.43370188039706237, + "grad_norm": 0.6979780793190002, + "learning_rate": 0.00016729541290108095, + "loss": 2.6162, + "step": 5374 + }, + { + "epoch": 0.4337825841336454, + "grad_norm": 0.8014200925827026, + "learning_rate": 0.00016728373474344136, + "loss": 2.6255, + "step": 5375 + }, + { + "epoch": 0.4338632878702284, + "grad_norm": 0.7780057787895203, + "learning_rate": 0.0001672720549089152, + "loss": 2.6257, + "step": 5376 + }, + { + "epoch": 0.4339439916068114, + "grad_norm": 0.7111102938652039, + "learning_rate": 0.00016726037339779358, + "loss": 2.6384, + "step": 5377 + }, + { + "epoch": 0.4340246953433944, + "grad_norm": 0.7077106833457947, + "learning_rate": 0.00016724869021036764, + "loss": 2.6293, + "step": 5378 + }, + { + "epoch": 0.4341053990799774, + "grad_norm": 0.8328250646591187, + "learning_rate": 0.00016723700534692853, + "loss": 2.6186, + "step": 5379 + }, + { + "epoch": 0.4341861028165604, + "grad_norm": 0.6942149996757507, + "learning_rate": 0.00016722531880776752, + "loss": 2.6032, + "step": 5380 + }, + { + "epoch": 0.4342668065531434, + "grad_norm": 0.7180305123329163, + "learning_rate": 0.00016721363059317583, + "loss": 2.6166, + "step": 5381 + }, + { + "epoch": 0.4343475102897264, + "grad_norm": 0.8093443512916565, + "learning_rate": 0.00016720194070344476, + "loss": 2.6596, + "step": 5382 + }, + { + "epoch": 0.43442821402630943, + "grad_norm": 0.7337743043899536, + "learning_rate": 
0.00016719024913886568, + "loss": 2.6137, + "step": 5383 + }, + { + "epoch": 0.4345089177628924, + "grad_norm": 0.7590384483337402, + "learning_rate": 0.00016717855589972993, + "loss": 2.6541, + "step": 5384 + }, + { + "epoch": 0.43458962149947544, + "grad_norm": 0.6945257186889648, + "learning_rate": 0.00016716686098632898, + "loss": 2.686, + "step": 5385 + }, + { + "epoch": 0.4346703252360584, + "grad_norm": 0.7175764441490173, + "learning_rate": 0.00016715516439895424, + "loss": 2.6081, + "step": 5386 + }, + { + "epoch": 0.43475102897264145, + "grad_norm": 0.7287259697914124, + "learning_rate": 0.00016714346613789732, + "loss": 2.6462, + "step": 5387 + }, + { + "epoch": 0.43483173270922443, + "grad_norm": 0.6864096522331238, + "learning_rate": 0.00016713176620344964, + "loss": 2.7104, + "step": 5388 + }, + { + "epoch": 0.43491243644580746, + "grad_norm": 0.6554383039474487, + "learning_rate": 0.00016712006459590289, + "loss": 2.6153, + "step": 5389 + }, + { + "epoch": 0.43499314018239044, + "grad_norm": 0.6415165662765503, + "learning_rate": 0.00016710836131554867, + "loss": 2.6198, + "step": 5390 + }, + { + "epoch": 0.43507384391897347, + "grad_norm": 0.6998475193977356, + "learning_rate": 0.00016709665636267869, + "loss": 2.6774, + "step": 5391 + }, + { + "epoch": 0.43515454765555645, + "grad_norm": 0.7437679171562195, + "learning_rate": 0.00016708494973758465, + "loss": 2.6176, + "step": 5392 + }, + { + "epoch": 0.4352352513921395, + "grad_norm": 0.6898311376571655, + "learning_rate": 0.00016707324144055825, + "loss": 2.6194, + "step": 5393 + }, + { + "epoch": 0.43531595512872245, + "grad_norm": 0.7536425590515137, + "learning_rate": 0.00016706153147189138, + "loss": 2.672, + "step": 5394 + }, + { + "epoch": 0.4353966588653055, + "grad_norm": 0.7576118111610413, + "learning_rate": 0.00016704981983187581, + "loss": 2.6473, + "step": 5395 + }, + { + "epoch": 0.43547736260188846, + "grad_norm": 0.7452495098114014, + "learning_rate": 0.00016703810652080349, + 
"loss": 2.6487, + "step": 5396 + }, + { + "epoch": 0.4355580663384715, + "grad_norm": 0.7817744612693787, + "learning_rate": 0.0001670263915389663, + "loss": 2.61, + "step": 5397 + }, + { + "epoch": 0.43563877007505447, + "grad_norm": 0.7195492386817932, + "learning_rate": 0.00016701467488665624, + "loss": 2.6745, + "step": 5398 + }, + { + "epoch": 0.4357194738116375, + "grad_norm": 0.7703930735588074, + "learning_rate": 0.0001670029565641653, + "loss": 2.7196, + "step": 5399 + }, + { + "epoch": 0.4358001775482205, + "grad_norm": 0.6859520673751831, + "learning_rate": 0.00016699123657178553, + "loss": 2.6317, + "step": 5400 + }, + { + "epoch": 0.4358808812848035, + "grad_norm": 0.7380268573760986, + "learning_rate": 0.00016697951490980903, + "loss": 2.6008, + "step": 5401 + }, + { + "epoch": 0.4359615850213865, + "grad_norm": 0.7903439402580261, + "learning_rate": 0.00016696779157852792, + "loss": 2.6411, + "step": 5402 + }, + { + "epoch": 0.4360422887579695, + "grad_norm": 0.7022606134414673, + "learning_rate": 0.0001669560665782344, + "loss": 2.6153, + "step": 5403 + }, + { + "epoch": 0.4361229924945525, + "grad_norm": 0.8196203112602234, + "learning_rate": 0.00016694433990922068, + "loss": 2.6128, + "step": 5404 + }, + { + "epoch": 0.43620369623113553, + "grad_norm": 0.7342696189880371, + "learning_rate": 0.000166932611571779, + "loss": 2.6802, + "step": 5405 + }, + { + "epoch": 0.4362843999677185, + "grad_norm": 0.7475131154060364, + "learning_rate": 0.0001669208815662017, + "loss": 2.6106, + "step": 5406 + }, + { + "epoch": 0.4363651037043015, + "grad_norm": 0.7067655324935913, + "learning_rate": 0.00016690914989278107, + "loss": 2.6362, + "step": 5407 + }, + { + "epoch": 0.4364458074408845, + "grad_norm": 0.7550163865089417, + "learning_rate": 0.00016689741655180956, + "loss": 2.6256, + "step": 5408 + }, + { + "epoch": 0.4365265111774675, + "grad_norm": 0.7341828346252441, + "learning_rate": 0.00016688568154357952, + "loss": 2.6912, + "step": 5409 + }, + { + 
"epoch": 0.4366072149140505, + "grad_norm": 0.7501869201660156, + "learning_rate": 0.00016687394486838349, + "loss": 2.7122, + "step": 5410 + }, + { + "epoch": 0.4366879186506335, + "grad_norm": 0.7041562795639038, + "learning_rate": 0.00016686220652651392, + "loss": 2.6755, + "step": 5411 + }, + { + "epoch": 0.43676862238721653, + "grad_norm": 0.7218217253684998, + "learning_rate": 0.00016685046651826338, + "loss": 2.693, + "step": 5412 + }, + { + "epoch": 0.4368493261237995, + "grad_norm": 0.6880577206611633, + "learning_rate": 0.00016683872484392448, + "loss": 2.638, + "step": 5413 + }, + { + "epoch": 0.43693002986038254, + "grad_norm": 0.6864475607872009, + "learning_rate": 0.0001668269815037898, + "loss": 2.6497, + "step": 5414 + }, + { + "epoch": 0.4370107335969655, + "grad_norm": 0.7326167821884155, + "learning_rate": 0.00016681523649815212, + "loss": 2.6858, + "step": 5415 + }, + { + "epoch": 0.43709143733354855, + "grad_norm": 0.6773428320884705, + "learning_rate": 0.00016680348982730405, + "loss": 2.6489, + "step": 5416 + }, + { + "epoch": 0.4371721410701315, + "grad_norm": 0.7117835283279419, + "learning_rate": 0.00016679174149153837, + "loss": 2.6607, + "step": 5417 + }, + { + "epoch": 0.43725284480671456, + "grad_norm": 0.7268334031105042, + "learning_rate": 0.00016677999149114793, + "loss": 2.703, + "step": 5418 + }, + { + "epoch": 0.43733354854329753, + "grad_norm": 0.7672972679138184, + "learning_rate": 0.00016676823982642554, + "loss": 2.5803, + "step": 5419 + }, + { + "epoch": 0.43741425227988057, + "grad_norm": 0.6966733932495117, + "learning_rate": 0.00016675648649766407, + "loss": 2.6149, + "step": 5420 + }, + { + "epoch": 0.43749495601646354, + "grad_norm": 0.752896249294281, + "learning_rate": 0.00016674473150515644, + "loss": 2.7108, + "step": 5421 + }, + { + "epoch": 0.4375756597530466, + "grad_norm": 0.7094796895980835, + "learning_rate": 0.00016673297484919565, + "loss": 2.6989, + "step": 5422 + }, + { + "epoch": 0.43765636348962955, + 
"grad_norm": 0.7631612420082092, + "learning_rate": 0.00016672121653007465, + "loss": 2.6673, + "step": 5423 + }, + { + "epoch": 0.4377370672262126, + "grad_norm": 0.7083843946456909, + "learning_rate": 0.00016670945654808655, + "loss": 2.6529, + "step": 5424 + }, + { + "epoch": 0.43781777096279556, + "grad_norm": 0.7291569709777832, + "learning_rate": 0.0001666976949035244, + "loss": 2.633, + "step": 5425 + }, + { + "epoch": 0.4378984746993786, + "grad_norm": 0.8351448774337769, + "learning_rate": 0.00016668593159668138, + "loss": 2.5993, + "step": 5426 + }, + { + "epoch": 0.43797917843596157, + "grad_norm": 0.7339642643928528, + "learning_rate": 0.00016667416662785058, + "loss": 2.6486, + "step": 5427 + }, + { + "epoch": 0.4380598821725446, + "grad_norm": 0.7257512211799622, + "learning_rate": 0.00016666239999732526, + "loss": 2.6453, + "step": 5428 + }, + { + "epoch": 0.4381405859091276, + "grad_norm": 0.7282476425170898, + "learning_rate": 0.00016665063170539872, + "loss": 2.6654, + "step": 5429 + }, + { + "epoch": 0.4382212896457106, + "grad_norm": 0.726685643196106, + "learning_rate": 0.00016663886175236417, + "loss": 2.65, + "step": 5430 + }, + { + "epoch": 0.4383019933822936, + "grad_norm": 0.7478880286216736, + "learning_rate": 0.000166627090138515, + "loss": 2.623, + "step": 5431 + }, + { + "epoch": 0.4383826971188766, + "grad_norm": 0.7624948024749756, + "learning_rate": 0.00016661531686414457, + "loss": 2.6438, + "step": 5432 + }, + { + "epoch": 0.4384634008554596, + "grad_norm": 0.8098936676979065, + "learning_rate": 0.00016660354192954633, + "loss": 2.6226, + "step": 5433 + }, + { + "epoch": 0.4385441045920426, + "grad_norm": 0.7305725812911987, + "learning_rate": 0.0001665917653350137, + "loss": 2.6425, + "step": 5434 + }, + { + "epoch": 0.4386248083286256, + "grad_norm": 0.7064421772956848, + "learning_rate": 0.00016657998708084027, + "loss": 2.6069, + "step": 5435 + }, + { + "epoch": 0.43870551206520864, + "grad_norm": 0.8279524445533752, + 
"learning_rate": 0.00016656820716731945, + "loss": 2.6609, + "step": 5436 + }, + { + "epoch": 0.4387862158017916, + "grad_norm": 0.742659866809845, + "learning_rate": 0.00016655642559474488, + "loss": 2.64, + "step": 5437 + }, + { + "epoch": 0.43886691953837464, + "grad_norm": 0.757780909538269, + "learning_rate": 0.00016654464236341026, + "loss": 2.6546, + "step": 5438 + }, + { + "epoch": 0.4389476232749576, + "grad_norm": 0.7439742684364319, + "learning_rate": 0.00016653285747360918, + "loss": 2.6717, + "step": 5439 + }, + { + "epoch": 0.43902832701154065, + "grad_norm": 0.7529581189155579, + "learning_rate": 0.0001665210709256354, + "loss": 2.6204, + "step": 5440 + }, + { + "epoch": 0.43910903074812363, + "grad_norm": 0.7224153876304626, + "learning_rate": 0.00016650928271978258, + "loss": 2.6417, + "step": 5441 + }, + { + "epoch": 0.43918973448470666, + "grad_norm": 0.6792185306549072, + "learning_rate": 0.00016649749285634462, + "loss": 2.6382, + "step": 5442 + }, + { + "epoch": 0.43927043822128964, + "grad_norm": 0.6887058019638062, + "learning_rate": 0.00016648570133561533, + "loss": 2.6302, + "step": 5443 + }, + { + "epoch": 0.43935114195787267, + "grad_norm": 0.7373671531677246, + "learning_rate": 0.00016647390815788853, + "loss": 2.625, + "step": 5444 + }, + { + "epoch": 0.43943184569445565, + "grad_norm": 0.7595719695091248, + "learning_rate": 0.0001664621133234582, + "loss": 2.6444, + "step": 5445 + }, + { + "epoch": 0.4395125494310387, + "grad_norm": 0.7331473231315613, + "learning_rate": 0.00016645031683261825, + "loss": 2.6308, + "step": 5446 + }, + { + "epoch": 0.43959325316762166, + "grad_norm": 0.7724922895431519, + "learning_rate": 0.0001664385186856627, + "loss": 2.6646, + "step": 5447 + }, + { + "epoch": 0.4396739569042047, + "grad_norm": 0.6960163712501526, + "learning_rate": 0.00016642671888288563, + "loss": 2.6196, + "step": 5448 + }, + { + "epoch": 0.43975466064078766, + "grad_norm": 0.6769189834594727, + "learning_rate": 
0.00016641491742458103, + "loss": 2.6558, + "step": 5449 + }, + { + "epoch": 0.4398353643773707, + "grad_norm": 0.7435783743858337, + "learning_rate": 0.0001664031143110431, + "loss": 2.6717, + "step": 5450 + }, + { + "epoch": 0.4399160681139537, + "grad_norm": 0.7234118580818176, + "learning_rate": 0.00016639130954256603, + "loss": 2.6549, + "step": 5451 + }, + { + "epoch": 0.4399967718505367, + "grad_norm": 0.720825731754303, + "learning_rate": 0.00016637950311944392, + "loss": 2.6098, + "step": 5452 + }, + { + "epoch": 0.4400774755871197, + "grad_norm": 0.6977505087852478, + "learning_rate": 0.0001663676950419711, + "loss": 2.6351, + "step": 5453 + }, + { + "epoch": 0.4401581793237027, + "grad_norm": 0.6959076523780823, + "learning_rate": 0.00016635588531044185, + "loss": 2.6918, + "step": 5454 + }, + { + "epoch": 0.4402388830602857, + "grad_norm": 0.7022189497947693, + "learning_rate": 0.00016634407392515044, + "loss": 2.6218, + "step": 5455 + }, + { + "epoch": 0.4403195867968687, + "grad_norm": 0.7147775292396545, + "learning_rate": 0.0001663322608863913, + "loss": 2.6966, + "step": 5456 + }, + { + "epoch": 0.4404002905334517, + "grad_norm": 0.7592755556106567, + "learning_rate": 0.00016632044619445882, + "loss": 2.6326, + "step": 5457 + }, + { + "epoch": 0.4404809942700347, + "grad_norm": 0.6914302110671997, + "learning_rate": 0.00016630862984964745, + "loss": 2.603, + "step": 5458 + }, + { + "epoch": 0.4405616980066177, + "grad_norm": 0.7735368609428406, + "learning_rate": 0.0001662968118522517, + "loss": 2.6666, + "step": 5459 + }, + { + "epoch": 0.4406424017432007, + "grad_norm": 0.7175899744033813, + "learning_rate": 0.00016628499220256612, + "loss": 2.666, + "step": 5460 + }, + { + "epoch": 0.4407231054797837, + "grad_norm": 0.6735796332359314, + "learning_rate": 0.00016627317090088523, + "loss": 2.6451, + "step": 5461 + }, + { + "epoch": 0.4408038092163667, + "grad_norm": 0.72022545337677, + "learning_rate": 0.0001662613479475037, + "loss": 2.6295, + 
"step": 5462 + }, + { + "epoch": 0.4408845129529497, + "grad_norm": 0.7084751725196838, + "learning_rate": 0.00016624952334271616, + "loss": 2.6633, + "step": 5463 + }, + { + "epoch": 0.4409652166895327, + "grad_norm": 0.7399250864982605, + "learning_rate": 0.00016623769708681735, + "loss": 2.6076, + "step": 5464 + }, + { + "epoch": 0.44104592042611573, + "grad_norm": 0.6904892325401306, + "learning_rate": 0.00016622586918010193, + "loss": 2.6799, + "step": 5465 + }, + { + "epoch": 0.4411266241626987, + "grad_norm": 0.7419006824493408, + "learning_rate": 0.00016621403962286478, + "loss": 2.65, + "step": 5466 + }, + { + "epoch": 0.44120732789928174, + "grad_norm": 0.7201282978057861, + "learning_rate": 0.00016620220841540064, + "loss": 2.6769, + "step": 5467 + }, + { + "epoch": 0.4412880316358647, + "grad_norm": 0.7223218679428101, + "learning_rate": 0.00016619037555800443, + "loss": 2.6342, + "step": 5468 + }, + { + "epoch": 0.44136873537244775, + "grad_norm": 0.7517585754394531, + "learning_rate": 0.00016617854105097104, + "loss": 2.6103, + "step": 5469 + }, + { + "epoch": 0.44144943910903073, + "grad_norm": 0.6765139698982239, + "learning_rate": 0.0001661667048945954, + "loss": 2.624, + "step": 5470 + }, + { + "epoch": 0.44153014284561376, + "grad_norm": 0.7197677493095398, + "learning_rate": 0.00016615486708917255, + "loss": 2.5786, + "step": 5471 + }, + { + "epoch": 0.44161084658219674, + "grad_norm": 0.7196774482727051, + "learning_rate": 0.00016614302763499742, + "loss": 2.6147, + "step": 5472 + }, + { + "epoch": 0.44169155031877977, + "grad_norm": 0.7210293412208557, + "learning_rate": 0.00016613118653236518, + "loss": 2.6526, + "step": 5473 + }, + { + "epoch": 0.44177225405536275, + "grad_norm": 0.6870129108428955, + "learning_rate": 0.00016611934378157092, + "loss": 2.665, + "step": 5474 + }, + { + "epoch": 0.4418529577919458, + "grad_norm": 0.6925365328788757, + "learning_rate": 0.00016610749938290975, + "loss": 2.5734, + "step": 5475 + }, + { + "epoch": 
0.44193366152852875, + "grad_norm": 0.7399131655693054, + "learning_rate": 0.0001660956533366769, + "loss": 2.6935, + "step": 5476 + }, + { + "epoch": 0.4420143652651118, + "grad_norm": 0.7348966002464294, + "learning_rate": 0.00016608380564316758, + "loss": 2.6788, + "step": 5477 + }, + { + "epoch": 0.44209506900169476, + "grad_norm": 0.7597334980964661, + "learning_rate": 0.00016607195630267708, + "loss": 2.6732, + "step": 5478 + }, + { + "epoch": 0.4421757727382778, + "grad_norm": 0.6847043037414551, + "learning_rate": 0.00016606010531550072, + "loss": 2.6475, + "step": 5479 + }, + { + "epoch": 0.44225647647486077, + "grad_norm": 0.7065151929855347, + "learning_rate": 0.00016604825268193388, + "loss": 2.6674, + "step": 5480 + }, + { + "epoch": 0.4423371802114438, + "grad_norm": 0.7102208137512207, + "learning_rate": 0.0001660363984022719, + "loss": 2.6723, + "step": 5481 + }, + { + "epoch": 0.4424178839480268, + "grad_norm": 0.6912767887115479, + "learning_rate": 0.00016602454247681024, + "loss": 2.628, + "step": 5482 + }, + { + "epoch": 0.4424985876846098, + "grad_norm": 0.7265123128890991, + "learning_rate": 0.0001660126849058444, + "loss": 2.5935, + "step": 5483 + }, + { + "epoch": 0.4425792914211928, + "grad_norm": 0.8177923560142517, + "learning_rate": 0.0001660008256896699, + "loss": 2.6402, + "step": 5484 + }, + { + "epoch": 0.4426599951577758, + "grad_norm": 0.7196556925773621, + "learning_rate": 0.00016598896482858231, + "loss": 2.6939, + "step": 5485 + }, + { + "epoch": 0.4427406988943588, + "grad_norm": 0.7459850907325745, + "learning_rate": 0.0001659771023228772, + "loss": 2.6343, + "step": 5486 + }, + { + "epoch": 0.44282140263094183, + "grad_norm": 0.7399095892906189, + "learning_rate": 0.00016596523817285024, + "loss": 2.6139, + "step": 5487 + }, + { + "epoch": 0.4429021063675248, + "grad_norm": 0.7517558336257935, + "learning_rate": 0.0001659533723787971, + "loss": 2.6609, + "step": 5488 + }, + { + "epoch": 0.44298281010410784, + "grad_norm": 
0.7073537707328796, + "learning_rate": 0.00016594150494101355, + "loss": 2.6326, + "step": 5489 + }, + { + "epoch": 0.4430635138406908, + "grad_norm": 0.7414752244949341, + "learning_rate": 0.0001659296358597953, + "loss": 2.6759, + "step": 5490 + }, + { + "epoch": 0.44314421757727385, + "grad_norm": 0.7636380195617676, + "learning_rate": 0.0001659177651354382, + "loss": 2.5743, + "step": 5491 + }, + { + "epoch": 0.4432249213138568, + "grad_norm": 0.6839539408683777, + "learning_rate": 0.00016590589276823804, + "loss": 2.631, + "step": 5492 + }, + { + "epoch": 0.44330562505043986, + "grad_norm": 0.8057516813278198, + "learning_rate": 0.0001658940187584908, + "loss": 2.6916, + "step": 5493 + }, + { + "epoch": 0.44338632878702283, + "grad_norm": 0.7479767799377441, + "learning_rate": 0.00016588214310649232, + "loss": 2.6811, + "step": 5494 + }, + { + "epoch": 0.44346703252360586, + "grad_norm": 0.7854729294776917, + "learning_rate": 0.00016587026581253866, + "loss": 2.6746, + "step": 5495 + }, + { + "epoch": 0.44354773626018884, + "grad_norm": 0.7782836556434631, + "learning_rate": 0.00016585838687692577, + "loss": 2.61, + "step": 5496 + }, + { + "epoch": 0.4436284399967719, + "grad_norm": 0.7047034502029419, + "learning_rate": 0.00016584650629994968, + "loss": 2.6573, + "step": 5497 + }, + { + "epoch": 0.44370914373335485, + "grad_norm": 0.7398735880851746, + "learning_rate": 0.0001658346240819066, + "loss": 2.6338, + "step": 5498 + }, + { + "epoch": 0.4437898474699379, + "grad_norm": 0.7243468165397644, + "learning_rate": 0.00016582274022309258, + "loss": 2.5898, + "step": 5499 + }, + { + "epoch": 0.44387055120652086, + "grad_norm": 0.7415906190872192, + "learning_rate": 0.00016581085472380376, + "loss": 2.5893, + "step": 5500 + }, + { + "epoch": 0.4439512549431039, + "grad_norm": 0.6935107707977295, + "learning_rate": 0.00016579896758433645, + "loss": 2.6704, + "step": 5501 + }, + { + "epoch": 0.44403195867968687, + "grad_norm": 0.7188034653663635, + 
"learning_rate": 0.00016578707880498685, + "loss": 2.643, + "step": 5502 + }, + { + "epoch": 0.4441126624162699, + "grad_norm": 0.6697022914886475, + "learning_rate": 0.0001657751883860513, + "loss": 2.6313, + "step": 5503 + }, + { + "epoch": 0.4441933661528529, + "grad_norm": 0.760154664516449, + "learning_rate": 0.00016576329632782613, + "loss": 2.6604, + "step": 5504 + }, + { + "epoch": 0.4442740698894359, + "grad_norm": 0.6883447170257568, + "learning_rate": 0.00016575140263060765, + "loss": 2.64, + "step": 5505 + }, + { + "epoch": 0.4443547736260189, + "grad_norm": 0.8628804683685303, + "learning_rate": 0.0001657395072946924, + "loss": 2.6651, + "step": 5506 + }, + { + "epoch": 0.4444354773626019, + "grad_norm": 0.7125170230865479, + "learning_rate": 0.0001657276103203768, + "loss": 2.7132, + "step": 5507 + }, + { + "epoch": 0.4445161810991849, + "grad_norm": 0.6965304613113403, + "learning_rate": 0.00016571571170795725, + "loss": 2.7109, + "step": 5508 + }, + { + "epoch": 0.44459688483576787, + "grad_norm": 0.720327615737915, + "learning_rate": 0.00016570381145773042, + "loss": 2.6323, + "step": 5509 + }, + { + "epoch": 0.4446775885723509, + "grad_norm": 0.7097898125648499, + "learning_rate": 0.00016569190956999287, + "loss": 2.6461, + "step": 5510 + }, + { + "epoch": 0.4447582923089339, + "grad_norm": 0.7142884731292725, + "learning_rate": 0.0001656800060450412, + "loss": 2.6894, + "step": 5511 + }, + { + "epoch": 0.4448389960455169, + "grad_norm": 0.6992002725601196, + "learning_rate": 0.0001656681008831721, + "loss": 2.6116, + "step": 5512 + }, + { + "epoch": 0.4449196997820999, + "grad_norm": 0.763841450214386, + "learning_rate": 0.00016565619408468227, + "loss": 2.6441, + "step": 5513 + }, + { + "epoch": 0.4450004035186829, + "grad_norm": 0.6958404183387756, + "learning_rate": 0.00016564428564986848, + "loss": 2.5751, + "step": 5514 + }, + { + "epoch": 0.4450811072552659, + "grad_norm": 0.8804046511650085, + "learning_rate": 0.00016563237557902744, + 
"loss": 2.6353, + "step": 5515 + }, + { + "epoch": 0.4451618109918489, + "grad_norm": 0.744864821434021, + "learning_rate": 0.00016562046387245608, + "loss": 2.6887, + "step": 5516 + }, + { + "epoch": 0.4452425147284319, + "grad_norm": 0.7627978920936584, + "learning_rate": 0.0001656085505304512, + "loss": 2.6347, + "step": 5517 + }, + { + "epoch": 0.44532321846501494, + "grad_norm": 0.7728918194770813, + "learning_rate": 0.00016559663555330975, + "loss": 2.6344, + "step": 5518 + }, + { + "epoch": 0.4454039222015979, + "grad_norm": 0.7853842377662659, + "learning_rate": 0.00016558471894132865, + "loss": 2.7239, + "step": 5519 + }, + { + "epoch": 0.44548462593818094, + "grad_norm": 0.7981860041618347, + "learning_rate": 0.00016557280069480495, + "loss": 2.66, + "step": 5520 + }, + { + "epoch": 0.4455653296747639, + "grad_norm": 0.7555295825004578, + "learning_rate": 0.0001655608808140356, + "loss": 2.6636, + "step": 5521 + }, + { + "epoch": 0.44564603341134695, + "grad_norm": 0.6893854141235352, + "learning_rate": 0.00016554895929931778, + "loss": 2.5999, + "step": 5522 + }, + { + "epoch": 0.44572673714792993, + "grad_norm": 0.7740506529808044, + "learning_rate": 0.0001655370361509485, + "loss": 2.6308, + "step": 5523 + }, + { + "epoch": 0.44580744088451296, + "grad_norm": 0.6956021785736084, + "learning_rate": 0.00016552511136922498, + "loss": 2.6376, + "step": 5524 + }, + { + "epoch": 0.44588814462109594, + "grad_norm": 0.7408841252326965, + "learning_rate": 0.00016551318495444445, + "loss": 2.6644, + "step": 5525 + }, + { + "epoch": 0.44596884835767897, + "grad_norm": 0.7715663313865662, + "learning_rate": 0.000165501256906904, + "loss": 2.6791, + "step": 5526 + }, + { + "epoch": 0.44604955209426195, + "grad_norm": 0.6880629062652588, + "learning_rate": 0.0001654893272269011, + "loss": 2.7209, + "step": 5527 + }, + { + "epoch": 0.446130255830845, + "grad_norm": 0.6765853762626648, + "learning_rate": 0.0001654773959147329, + "loss": 2.6548, + "step": 5528 + }, + { 
+ "epoch": 0.44621095956742796, + "grad_norm": 0.739248514175415, + "learning_rate": 0.00016546546297069688, + "loss": 2.69, + "step": 5529 + }, + { + "epoch": 0.446291663304011, + "grad_norm": 0.7655714750289917, + "learning_rate": 0.00016545352839509038, + "loss": 2.6238, + "step": 5530 + }, + { + "epoch": 0.44637236704059396, + "grad_norm": 0.706068217754364, + "learning_rate": 0.00016544159218821088, + "loss": 2.6528, + "step": 5531 + }, + { + "epoch": 0.446453070777177, + "grad_norm": 0.7411316633224487, + "learning_rate": 0.00016542965435035578, + "loss": 2.7034, + "step": 5532 + }, + { + "epoch": 0.44653377451376, + "grad_norm": 0.6550690531730652, + "learning_rate": 0.0001654177148818227, + "loss": 2.6388, + "step": 5533 + }, + { + "epoch": 0.446614478250343, + "grad_norm": 0.7151147127151489, + "learning_rate": 0.00016540577378290915, + "loss": 2.7382, + "step": 5534 + }, + { + "epoch": 0.446695181986926, + "grad_norm": 0.7343939542770386, + "learning_rate": 0.00016539383105391276, + "loss": 2.6316, + "step": 5535 + }, + { + "epoch": 0.446775885723509, + "grad_norm": 0.702036440372467, + "learning_rate": 0.00016538188669513115, + "loss": 2.6465, + "step": 5536 + }, + { + "epoch": 0.446856589460092, + "grad_norm": 0.7212840914726257, + "learning_rate": 0.00016536994070686197, + "loss": 2.6471, + "step": 5537 + }, + { + "epoch": 0.446937293196675, + "grad_norm": 0.7345479130744934, + "learning_rate": 0.00016535799308940304, + "loss": 2.6746, + "step": 5538 + }, + { + "epoch": 0.447017996933258, + "grad_norm": 0.7447341084480286, + "learning_rate": 0.00016534604384305207, + "loss": 2.6487, + "step": 5539 + }, + { + "epoch": 0.44709870066984103, + "grad_norm": 0.6865687370300293, + "learning_rate": 0.00016533409296810687, + "loss": 2.6202, + "step": 5540 + }, + { + "epoch": 0.447179404406424, + "grad_norm": 0.8210769891738892, + "learning_rate": 0.0001653221404648653, + "loss": 2.7155, + "step": 5541 + }, + { + "epoch": 0.44726010814300704, + "grad_norm": 
0.7768925428390503, + "learning_rate": 0.0001653101863336252, + "loss": 2.6011, + "step": 5542 + }, + { + "epoch": 0.44734081187959, + "grad_norm": 0.7160049080848694, + "learning_rate": 0.00016529823057468456, + "loss": 2.6541, + "step": 5543 + }, + { + "epoch": 0.44742151561617305, + "grad_norm": 0.7386900782585144, + "learning_rate": 0.00016528627318834134, + "loss": 2.6586, + "step": 5544 + }, + { + "epoch": 0.447502219352756, + "grad_norm": 0.7415460348129272, + "learning_rate": 0.0001652743141748935, + "loss": 2.7032, + "step": 5545 + }, + { + "epoch": 0.44758292308933906, + "grad_norm": 0.8483054637908936, + "learning_rate": 0.00016526235353463912, + "loss": 2.6145, + "step": 5546 + }, + { + "epoch": 0.44766362682592203, + "grad_norm": 0.7428778409957886, + "learning_rate": 0.00016525039126787629, + "loss": 2.7005, + "step": 5547 + }, + { + "epoch": 0.44774433056250507, + "grad_norm": 0.7214285731315613, + "learning_rate": 0.00016523842737490316, + "loss": 2.6267, + "step": 5548 + }, + { + "epoch": 0.44782503429908804, + "grad_norm": 0.6753950715065002, + "learning_rate": 0.0001652264618560179, + "loss": 2.6732, + "step": 5549 + }, + { + "epoch": 0.4479057380356711, + "grad_norm": 0.6969403028488159, + "learning_rate": 0.00016521449471151867, + "loss": 2.6218, + "step": 5550 + }, + { + "epoch": 0.44798644177225405, + "grad_norm": 0.7562664151191711, + "learning_rate": 0.00016520252594170377, + "loss": 2.69, + "step": 5551 + }, + { + "epoch": 0.4480671455088371, + "grad_norm": 0.6831937432289124, + "learning_rate": 0.0001651905555468715, + "loss": 2.709, + "step": 5552 + }, + { + "epoch": 0.44814784924542006, + "grad_norm": 0.6753427386283875, + "learning_rate": 0.00016517858352732017, + "loss": 2.5852, + "step": 5553 + }, + { + "epoch": 0.4482285529820031, + "grad_norm": 0.7573871612548828, + "learning_rate": 0.00016516660988334815, + "loss": 2.6187, + "step": 5554 + }, + { + "epoch": 0.44830925671858607, + "grad_norm": 0.6424254775047302, + "learning_rate": 
0.00016515463461525383, + "loss": 2.6411, + "step": 5555 + }, + { + "epoch": 0.4483899604551691, + "grad_norm": 0.7460073232650757, + "learning_rate": 0.0001651426577233358, + "loss": 2.6239, + "step": 5556 + }, + { + "epoch": 0.4484706641917521, + "grad_norm": 0.6980866193771362, + "learning_rate": 0.0001651306792078924, + "loss": 2.605, + "step": 5557 + }, + { + "epoch": 0.4485513679283351, + "grad_norm": 0.7376009225845337, + "learning_rate": 0.00016511869906922217, + "loss": 2.7114, + "step": 5558 + }, + { + "epoch": 0.4486320716649181, + "grad_norm": 0.7227364778518677, + "learning_rate": 0.0001651067173076238, + "loss": 2.6212, + "step": 5559 + }, + { + "epoch": 0.44871277540150106, + "grad_norm": 0.8989635705947876, + "learning_rate": 0.00016509473392339584, + "loss": 2.671, + "step": 5560 + }, + { + "epoch": 0.4487934791380841, + "grad_norm": 0.7273553609848022, + "learning_rate": 0.0001650827489168369, + "loss": 2.6556, + "step": 5561 + }, + { + "epoch": 0.44887418287466707, + "grad_norm": 0.839439868927002, + "learning_rate": 0.00016507076228824578, + "loss": 2.6959, + "step": 5562 + }, + { + "epoch": 0.4489548866112501, + "grad_norm": 0.6912770867347717, + "learning_rate": 0.00016505877403792115, + "loss": 2.6709, + "step": 5563 + }, + { + "epoch": 0.4490355903478331, + "grad_norm": 0.7850949168205261, + "learning_rate": 0.00016504678416616182, + "loss": 2.7257, + "step": 5564 + }, + { + "epoch": 0.4491162940844161, + "grad_norm": 0.7768355011940002, + "learning_rate": 0.0001650347926732666, + "loss": 2.5939, + "step": 5565 + }, + { + "epoch": 0.4491969978209991, + "grad_norm": 0.6518398523330688, + "learning_rate": 0.0001650227995595343, + "loss": 2.6589, + "step": 5566 + }, + { + "epoch": 0.4492777015575821, + "grad_norm": 0.6855975389480591, + "learning_rate": 0.0001650108048252639, + "loss": 2.6372, + "step": 5567 + }, + { + "epoch": 0.4493584052941651, + "grad_norm": 0.7176938056945801, + "learning_rate": 0.0001649988084707543, + "loss": 2.6506, + 
"step": 5568 + }, + { + "epoch": 0.44943910903074813, + "grad_norm": 0.735335648059845, + "learning_rate": 0.00016498681049630448, + "loss": 2.608, + "step": 5569 + }, + { + "epoch": 0.4495198127673311, + "grad_norm": 0.6862306594848633, + "learning_rate": 0.00016497481090221346, + "loss": 2.5982, + "step": 5570 + }, + { + "epoch": 0.44960051650391414, + "grad_norm": 0.7213380336761475, + "learning_rate": 0.0001649628096887803, + "loss": 2.6457, + "step": 5571 + }, + { + "epoch": 0.4496812202404971, + "grad_norm": 0.7118985652923584, + "learning_rate": 0.0001649508068563041, + "loss": 2.6321, + "step": 5572 + }, + { + "epoch": 0.44976192397708015, + "grad_norm": 0.7663396596908569, + "learning_rate": 0.00016493880240508405, + "loss": 2.5865, + "step": 5573 + }, + { + "epoch": 0.4498426277136631, + "grad_norm": 0.6854543089866638, + "learning_rate": 0.00016492679633541926, + "loss": 2.6536, + "step": 5574 + }, + { + "epoch": 0.44992333145024616, + "grad_norm": 0.7071701884269714, + "learning_rate": 0.000164914788647609, + "loss": 2.6149, + "step": 5575 + }, + { + "epoch": 0.45000403518682913, + "grad_norm": 0.7610478401184082, + "learning_rate": 0.00016490277934195252, + "loss": 2.6326, + "step": 5576 + }, + { + "epoch": 0.45008473892341216, + "grad_norm": 0.7117596864700317, + "learning_rate": 0.0001648907684187491, + "loss": 2.6938, + "step": 5577 + }, + { + "epoch": 0.45016544265999514, + "grad_norm": 0.6980494856834412, + "learning_rate": 0.00016487875587829813, + "loss": 2.6798, + "step": 5578 + }, + { + "epoch": 0.4502461463965782, + "grad_norm": 0.7957972288131714, + "learning_rate": 0.00016486674172089898, + "loss": 2.6029, + "step": 5579 + }, + { + "epoch": 0.45032685013316115, + "grad_norm": 0.7258082032203674, + "learning_rate": 0.00016485472594685103, + "loss": 2.6785, + "step": 5580 + }, + { + "epoch": 0.4504075538697442, + "grad_norm": 0.7402041554450989, + "learning_rate": 0.0001648427085564538, + "loss": 2.6263, + "step": 5581 + }, + { + "epoch": 
0.45048825760632716, + "grad_norm": 0.6943814158439636, + "learning_rate": 0.00016483068955000673, + "loss": 2.6761, + "step": 5582 + }, + { + "epoch": 0.4505689613429102, + "grad_norm": 0.8021644353866577, + "learning_rate": 0.00016481866892780947, + "loss": 2.6376, + "step": 5583 + }, + { + "epoch": 0.45064966507949317, + "grad_norm": 0.7748533487319946, + "learning_rate": 0.0001648066466901615, + "loss": 2.7465, + "step": 5584 + }, + { + "epoch": 0.4507303688160762, + "grad_norm": 0.7432222366333008, + "learning_rate": 0.00016479462283736248, + "loss": 2.6368, + "step": 5585 + }, + { + "epoch": 0.4508110725526592, + "grad_norm": 0.7835286259651184, + "learning_rate": 0.00016478259736971214, + "loss": 2.6449, + "step": 5586 + }, + { + "epoch": 0.4508917762892422, + "grad_norm": 0.7372995018959045, + "learning_rate": 0.00016477057028751007, + "loss": 2.6091, + "step": 5587 + }, + { + "epoch": 0.4509724800258252, + "grad_norm": 0.8230665326118469, + "learning_rate": 0.0001647585415910561, + "loss": 2.6345, + "step": 5588 + }, + { + "epoch": 0.4510531837624082, + "grad_norm": 0.7490825057029724, + "learning_rate": 0.00016474651128065002, + "loss": 2.5996, + "step": 5589 + }, + { + "epoch": 0.4511338874989912, + "grad_norm": 0.7950569987297058, + "learning_rate": 0.00016473447935659157, + "loss": 2.7109, + "step": 5590 + }, + { + "epoch": 0.4512145912355742, + "grad_norm": 0.7648342251777649, + "learning_rate": 0.00016472244581918074, + "loss": 2.6268, + "step": 5591 + }, + { + "epoch": 0.4512952949721572, + "grad_norm": 0.726828396320343, + "learning_rate": 0.00016471041066871733, + "loss": 2.5959, + "step": 5592 + }, + { + "epoch": 0.45137599870874023, + "grad_norm": 0.7855841517448425, + "learning_rate": 0.00016469837390550133, + "loss": 2.6671, + "step": 5593 + }, + { + "epoch": 0.4514567024453232, + "grad_norm": 0.6858882904052734, + "learning_rate": 0.00016468633552983275, + "loss": 2.6003, + "step": 5594 + }, + { + "epoch": 0.45153740618190624, + "grad_norm": 
0.710926353931427, + "learning_rate": 0.0001646742955420116, + "loss": 2.6049, + "step": 5595 + }, + { + "epoch": 0.4516181099184892, + "grad_norm": 0.8359978199005127, + "learning_rate": 0.0001646622539423379, + "loss": 2.6636, + "step": 5596 + }, + { + "epoch": 0.45169881365507225, + "grad_norm": 0.7628041505813599, + "learning_rate": 0.00016465021073111186, + "loss": 2.6586, + "step": 5597 + }, + { + "epoch": 0.4517795173916552, + "grad_norm": 0.7723419666290283, + "learning_rate": 0.00016463816590863356, + "loss": 2.6213, + "step": 5598 + }, + { + "epoch": 0.45186022112823826, + "grad_norm": 0.7210986018180847, + "learning_rate": 0.0001646261194752032, + "loss": 2.6674, + "step": 5599 + }, + { + "epoch": 0.45194092486482124, + "grad_norm": 0.7665949463844299, + "learning_rate": 0.00016461407143112097, + "loss": 2.68, + "step": 5600 + }, + { + "epoch": 0.45202162860140427, + "grad_norm": 0.7225117087364197, + "learning_rate": 0.00016460202177668722, + "loss": 2.6473, + "step": 5601 + }, + { + "epoch": 0.45210233233798724, + "grad_norm": 0.6831738948822021, + "learning_rate": 0.0001645899705122022, + "loss": 2.6863, + "step": 5602 + }, + { + "epoch": 0.4521830360745703, + "grad_norm": 0.7006321549415588, + "learning_rate": 0.00016457791763796627, + "loss": 2.6242, + "step": 5603 + }, + { + "epoch": 0.45226373981115325, + "grad_norm": 0.7245663404464722, + "learning_rate": 0.00016456586315427983, + "loss": 2.6201, + "step": 5604 + }, + { + "epoch": 0.4523444435477363, + "grad_norm": 0.7444287538528442, + "learning_rate": 0.00016455380706144332, + "loss": 2.6684, + "step": 5605 + }, + { + "epoch": 0.45242514728431926, + "grad_norm": 0.6562673449516296, + "learning_rate": 0.00016454174935975714, + "loss": 2.5912, + "step": 5606 + }, + { + "epoch": 0.4525058510209023, + "grad_norm": 0.6494336724281311, + "learning_rate": 0.0001645296900495219, + "loss": 2.6245, + "step": 5607 + }, + { + "epoch": 0.45258655475748527, + "grad_norm": 0.6968161463737488, + 
"learning_rate": 0.0001645176291310381, + "loss": 2.6494, + "step": 5608 + }, + { + "epoch": 0.4526672584940683, + "grad_norm": 0.7351142764091492, + "learning_rate": 0.00016450556660460632, + "loss": 2.574, + "step": 5609 + }, + { + "epoch": 0.4527479622306513, + "grad_norm": 0.7522323131561279, + "learning_rate": 0.0001644935024705272, + "loss": 2.6512, + "step": 5610 + }, + { + "epoch": 0.45282866596723426, + "grad_norm": 0.6744225025177002, + "learning_rate": 0.0001644814367291014, + "loss": 2.6288, + "step": 5611 + }, + { + "epoch": 0.4529093697038173, + "grad_norm": 0.6933234333992004, + "learning_rate": 0.00016446936938062967, + "loss": 2.6076, + "step": 5612 + }, + { + "epoch": 0.45299007344040026, + "grad_norm": 0.7101204991340637, + "learning_rate": 0.00016445730042541272, + "loss": 2.6322, + "step": 5613 + }, + { + "epoch": 0.4530707771769833, + "grad_norm": 0.7647581696510315, + "learning_rate": 0.00016444522986375134, + "loss": 2.7021, + "step": 5614 + }, + { + "epoch": 0.4531514809135663, + "grad_norm": 0.7028820514678955, + "learning_rate": 0.00016443315769594635, + "loss": 2.6171, + "step": 5615 + }, + { + "epoch": 0.4532321846501493, + "grad_norm": 0.6933851838111877, + "learning_rate": 0.00016442108392229868, + "loss": 2.6119, + "step": 5616 + }, + { + "epoch": 0.4533128883867323, + "grad_norm": 0.7218462824821472, + "learning_rate": 0.0001644090085431092, + "loss": 2.6661, + "step": 5617 + }, + { + "epoch": 0.4533935921233153, + "grad_norm": 0.7390525341033936, + "learning_rate": 0.00016439693155867883, + "loss": 2.7084, + "step": 5618 + }, + { + "epoch": 0.4534742958598983, + "grad_norm": 0.734136164188385, + "learning_rate": 0.0001643848529693086, + "loss": 2.6896, + "step": 5619 + }, + { + "epoch": 0.4535549995964813, + "grad_norm": 0.8082060813903809, + "learning_rate": 0.00016437277277529954, + "loss": 2.5828, + "step": 5620 + }, + { + "epoch": 0.4536357033330643, + "grad_norm": 0.695988655090332, + "learning_rate": 0.0001643606909769527, + 
"loss": 2.6383, + "step": 5621 + }, + { + "epoch": 0.45371640706964733, + "grad_norm": 0.7415786385536194, + "learning_rate": 0.00016434860757456922, + "loss": 2.6388, + "step": 5622 + }, + { + "epoch": 0.4537971108062303, + "grad_norm": 0.7378649115562439, + "learning_rate": 0.0001643365225684502, + "loss": 2.6534, + "step": 5623 + }, + { + "epoch": 0.45387781454281334, + "grad_norm": 0.7686129808425903, + "learning_rate": 0.0001643244359588969, + "loss": 2.6637, + "step": 5624 + }, + { + "epoch": 0.4539585182793963, + "grad_norm": 0.7305558323860168, + "learning_rate": 0.00016431234774621047, + "loss": 2.6525, + "step": 5625 + }, + { + "epoch": 0.45403922201597935, + "grad_norm": 0.7994235157966614, + "learning_rate": 0.00016430025793069225, + "loss": 2.6316, + "step": 5626 + }, + { + "epoch": 0.4541199257525623, + "grad_norm": 0.6945801377296448, + "learning_rate": 0.0001642881665126435, + "loss": 2.6367, + "step": 5627 + }, + { + "epoch": 0.45420062948914536, + "grad_norm": 0.6855447292327881, + "learning_rate": 0.00016427607349236558, + "loss": 2.6317, + "step": 5628 + }, + { + "epoch": 0.45428133322572833, + "grad_norm": 0.6961888670921326, + "learning_rate": 0.00016426397887015992, + "loss": 2.6477, + "step": 5629 + }, + { + "epoch": 0.45436203696231137, + "grad_norm": 0.7531994581222534, + "learning_rate": 0.0001642518826463279, + "loss": 2.7219, + "step": 5630 + }, + { + "epoch": 0.45444274069889434, + "grad_norm": 0.7442335486412048, + "learning_rate": 0.00016423978482117102, + "loss": 2.706, + "step": 5631 + }, + { + "epoch": 0.4545234444354774, + "grad_norm": 0.7075700759887695, + "learning_rate": 0.00016422768539499076, + "loss": 2.6481, + "step": 5632 + }, + { + "epoch": 0.45460414817206035, + "grad_norm": 0.7831876873970032, + "learning_rate": 0.0001642155843680887, + "loss": 2.616, + "step": 5633 + }, + { + "epoch": 0.4546848519086434, + "grad_norm": 0.7514604926109314, + "learning_rate": 0.00016420348174076642, + "loss": 2.6282, + "step": 5634 + }, 
+ { + "epoch": 0.45476555564522636, + "grad_norm": 0.7136685252189636, + "learning_rate": 0.0001641913775133255, + "loss": 2.6764, + "step": 5635 + }, + { + "epoch": 0.4548462593818094, + "grad_norm": 0.7406740784645081, + "learning_rate": 0.00016417927168606771, + "loss": 2.6126, + "step": 5636 + }, + { + "epoch": 0.45492696311839237, + "grad_norm": 0.7257869839668274, + "learning_rate": 0.0001641671642592947, + "loss": 2.6035, + "step": 5637 + }, + { + "epoch": 0.4550076668549754, + "grad_norm": 0.8378798961639404, + "learning_rate": 0.00016415505523330822, + "loss": 2.6657, + "step": 5638 + }, + { + "epoch": 0.4550883705915584, + "grad_norm": 0.7218836545944214, + "learning_rate": 0.00016414294460841003, + "loss": 2.6209, + "step": 5639 + }, + { + "epoch": 0.4551690743281414, + "grad_norm": 0.7792766690254211, + "learning_rate": 0.00016413083238490204, + "loss": 2.7208, + "step": 5640 + }, + { + "epoch": 0.4552497780647244, + "grad_norm": 0.7800823450088501, + "learning_rate": 0.000164118718563086, + "loss": 2.6351, + "step": 5641 + }, + { + "epoch": 0.4553304818013074, + "grad_norm": 0.7593275904655457, + "learning_rate": 0.00016410660314326395, + "loss": 2.7025, + "step": 5642 + }, + { + "epoch": 0.4554111855378904, + "grad_norm": 0.7561587691307068, + "learning_rate": 0.00016409448612573772, + "loss": 2.6188, + "step": 5643 + }, + { + "epoch": 0.4554918892744734, + "grad_norm": 0.7674516439437866, + "learning_rate": 0.00016408236751080937, + "loss": 2.629, + "step": 5644 + }, + { + "epoch": 0.4555725930110564, + "grad_norm": 0.7112495303153992, + "learning_rate": 0.00016407024729878095, + "loss": 2.6261, + "step": 5645 + }, + { + "epoch": 0.45565329674763944, + "grad_norm": 0.6861695647239685, + "learning_rate": 0.00016405812548995444, + "loss": 2.6984, + "step": 5646 + }, + { + "epoch": 0.4557340004842224, + "grad_norm": 0.7711648941040039, + "learning_rate": 0.000164046002084632, + "loss": 2.6839, + "step": 5647 + }, + { + "epoch": 0.45581470422080544, + 
"grad_norm": 0.6862967014312744, + "learning_rate": 0.00016403387708311578, + "loss": 2.5964, + "step": 5648 + }, + { + "epoch": 0.4558954079573884, + "grad_norm": 0.707374632358551, + "learning_rate": 0.00016402175048570793, + "loss": 2.6191, + "step": 5649 + }, + { + "epoch": 0.45597611169397145, + "grad_norm": 0.7980892658233643, + "learning_rate": 0.00016400962229271072, + "loss": 2.6288, + "step": 5650 + }, + { + "epoch": 0.45605681543055443, + "grad_norm": 0.686187744140625, + "learning_rate": 0.0001639974925044264, + "loss": 2.6277, + "step": 5651 + }, + { + "epoch": 0.45613751916713746, + "grad_norm": 0.6970425844192505, + "learning_rate": 0.0001639853611211573, + "loss": 2.5726, + "step": 5652 + }, + { + "epoch": 0.45621822290372044, + "grad_norm": 0.701500415802002, + "learning_rate": 0.00016397322814320573, + "loss": 2.6275, + "step": 5653 + }, + { + "epoch": 0.45629892664030347, + "grad_norm": 0.8432207107543945, + "learning_rate": 0.00016396109357087407, + "loss": 2.6185, + "step": 5654 + }, + { + "epoch": 0.45637963037688645, + "grad_norm": 0.7049770951271057, + "learning_rate": 0.00016394895740446476, + "loss": 2.674, + "step": 5655 + }, + { + "epoch": 0.4564603341134695, + "grad_norm": 0.7068646550178528, + "learning_rate": 0.00016393681964428026, + "loss": 2.6072, + "step": 5656 + }, + { + "epoch": 0.45654103785005246, + "grad_norm": 0.7698760032653809, + "learning_rate": 0.00016392468029062312, + "loss": 2.6547, + "step": 5657 + }, + { + "epoch": 0.4566217415866355, + "grad_norm": 0.7381031513214111, + "learning_rate": 0.00016391253934379583, + "loss": 2.6125, + "step": 5658 + }, + { + "epoch": 0.45670244532321846, + "grad_norm": 0.7367781400680542, + "learning_rate": 0.00016390039680410097, + "loss": 2.6763, + "step": 5659 + }, + { + "epoch": 0.4567831490598015, + "grad_norm": 0.7416272759437561, + "learning_rate": 0.00016388825267184121, + "loss": 2.7059, + "step": 5660 + }, + { + "epoch": 0.4568638527963845, + "grad_norm": 0.6933416724205017, + 
"learning_rate": 0.0001638761069473192, + "loss": 2.6028, + "step": 5661 + }, + { + "epoch": 0.45694455653296745, + "grad_norm": 0.7311314940452576, + "learning_rate": 0.00016386395963083756, + "loss": 2.6266, + "step": 5662 + }, + { + "epoch": 0.4570252602695505, + "grad_norm": 0.7172734141349792, + "learning_rate": 0.00016385181072269917, + "loss": 2.6754, + "step": 5663 + }, + { + "epoch": 0.45710596400613346, + "grad_norm": 0.7286428213119507, + "learning_rate": 0.00016383966022320671, + "loss": 2.6637, + "step": 5664 + }, + { + "epoch": 0.4571866677427165, + "grad_norm": 0.7296474575996399, + "learning_rate": 0.00016382750813266308, + "loss": 2.6655, + "step": 5665 + }, + { + "epoch": 0.45726737147929947, + "grad_norm": 0.6929224133491516, + "learning_rate": 0.00016381535445137105, + "loss": 2.6376, + "step": 5666 + }, + { + "epoch": 0.4573480752158825, + "grad_norm": 0.7012765407562256, + "learning_rate": 0.0001638031991796336, + "loss": 2.6222, + "step": 5667 + }, + { + "epoch": 0.4574287789524655, + "grad_norm": 0.7360745668411255, + "learning_rate": 0.00016379104231775368, + "loss": 2.6304, + "step": 5668 + }, + { + "epoch": 0.4575094826890485, + "grad_norm": 0.7276801466941833, + "learning_rate": 0.00016377888386603419, + "loss": 2.7046, + "step": 5669 + }, + { + "epoch": 0.4575901864256315, + "grad_norm": 0.688432514667511, + "learning_rate": 0.0001637667238247782, + "loss": 2.6598, + "step": 5670 + }, + { + "epoch": 0.4576708901622145, + "grad_norm": 0.6874414682388306, + "learning_rate": 0.00016375456219428877, + "loss": 2.7, + "step": 5671 + }, + { + "epoch": 0.4577515938987975, + "grad_norm": 0.711091160774231, + "learning_rate": 0.000163742398974869, + "loss": 2.6063, + "step": 5672 + }, + { + "epoch": 0.4578322976353805, + "grad_norm": 0.7131791710853577, + "learning_rate": 0.000163730234166822, + "loss": 2.5948, + "step": 5673 + }, + { + "epoch": 0.4579130013719635, + "grad_norm": 0.7166630625724792, + "learning_rate": 0.000163718067770451, + 
"loss": 2.6488, + "step": 5674 + }, + { + "epoch": 0.45799370510854653, + "grad_norm": 0.7285952568054199, + "learning_rate": 0.00016370589978605916, + "loss": 2.6445, + "step": 5675 + }, + { + "epoch": 0.4580744088451295, + "grad_norm": 0.728050172328949, + "learning_rate": 0.0001636937302139498, + "loss": 2.5425, + "step": 5676 + }, + { + "epoch": 0.45815511258171254, + "grad_norm": 0.7196047902107239, + "learning_rate": 0.00016368155905442615, + "loss": 2.7426, + "step": 5677 + }, + { + "epoch": 0.4582358163182955, + "grad_norm": 0.6844602823257446, + "learning_rate": 0.0001636693863077916, + "loss": 2.6157, + "step": 5678 + }, + { + "epoch": 0.45831652005487855, + "grad_norm": 0.7375781536102295, + "learning_rate": 0.0001636572119743495, + "loss": 2.7069, + "step": 5679 + }, + { + "epoch": 0.4583972237914615, + "grad_norm": 0.7667750120162964, + "learning_rate": 0.0001636450360544033, + "loss": 2.6589, + "step": 5680 + }, + { + "epoch": 0.45847792752804456, + "grad_norm": 0.6569861173629761, + "learning_rate": 0.00016363285854825642, + "loss": 2.6197, + "step": 5681 + }, + { + "epoch": 0.45855863126462754, + "grad_norm": 0.7177335023880005, + "learning_rate": 0.00016362067945621239, + "loss": 2.6104, + "step": 5682 + }, + { + "epoch": 0.45863933500121057, + "grad_norm": 0.7260481715202332, + "learning_rate": 0.00016360849877857469, + "loss": 2.6435, + "step": 5683 + }, + { + "epoch": 0.45872003873779355, + "grad_norm": 0.7083989381790161, + "learning_rate": 0.00016359631651564693, + "loss": 2.6366, + "step": 5684 + }, + { + "epoch": 0.4588007424743766, + "grad_norm": 0.6417020559310913, + "learning_rate": 0.00016358413266773271, + "loss": 2.6311, + "step": 5685 + }, + { + "epoch": 0.45888144621095955, + "grad_norm": 0.737856924533844, + "learning_rate": 0.0001635719472351357, + "loss": 2.6647, + "step": 5686 + }, + { + "epoch": 0.4589621499475426, + "grad_norm": 0.6774190068244934, + "learning_rate": 0.0001635597602181596, + "loss": 2.6366, + "step": 5687 + }, 
+ { + "epoch": 0.45904285368412556, + "grad_norm": 0.6480480432510376, + "learning_rate": 0.0001635475716171081, + "loss": 2.6501, + "step": 5688 + }, + { + "epoch": 0.4591235574207086, + "grad_norm": 0.7886860370635986, + "learning_rate": 0.0001635353814322851, + "loss": 2.7239, + "step": 5689 + }, + { + "epoch": 0.45920426115729157, + "grad_norm": 0.7579021453857422, + "learning_rate": 0.0001635231896639942, + "loss": 2.6155, + "step": 5690 + }, + { + "epoch": 0.4592849648938746, + "grad_norm": 0.6853809356689453, + "learning_rate": 0.0001635109963125394, + "loss": 2.5933, + "step": 5691 + }, + { + "epoch": 0.4593656686304576, + "grad_norm": 0.661342978477478, + "learning_rate": 0.00016349880137822456, + "loss": 2.6277, + "step": 5692 + }, + { + "epoch": 0.4594463723670406, + "grad_norm": 0.6795682311058044, + "learning_rate": 0.0001634866048613536, + "loss": 2.6221, + "step": 5693 + }, + { + "epoch": 0.4595270761036236, + "grad_norm": 0.7375383377075195, + "learning_rate": 0.00016347440676223047, + "loss": 2.6082, + "step": 5694 + }, + { + "epoch": 0.4596077798402066, + "grad_norm": 0.7565153241157532, + "learning_rate": 0.0001634622070811592, + "loss": 2.6615, + "step": 5695 + }, + { + "epoch": 0.4596884835767896, + "grad_norm": 0.6869745254516602, + "learning_rate": 0.00016345000581844386, + "loss": 2.6172, + "step": 5696 + }, + { + "epoch": 0.45976918731337263, + "grad_norm": 0.7192853689193726, + "learning_rate": 0.0001634378029743885, + "loss": 2.6324, + "step": 5697 + }, + { + "epoch": 0.4598498910499556, + "grad_norm": 0.6919218301773071, + "learning_rate": 0.00016342559854929726, + "loss": 2.5965, + "step": 5698 + }, + { + "epoch": 0.45993059478653864, + "grad_norm": 0.6715282797813416, + "learning_rate": 0.00016341339254347432, + "loss": 2.6225, + "step": 5699 + }, + { + "epoch": 0.4600112985231216, + "grad_norm": 0.6768380999565125, + "learning_rate": 0.00016340118495722388, + "loss": 2.6376, + "step": 5700 + }, + { + "epoch": 0.46009200225970465, + 
"grad_norm": 0.6898325681686401, + "learning_rate": 0.00016338897579085018, + "loss": 2.667, + "step": 5701 + }, + { + "epoch": 0.4601727059962876, + "grad_norm": 0.7171810865402222, + "learning_rate": 0.00016337676504465747, + "loss": 2.678, + "step": 5702 + }, + { + "epoch": 0.46025340973287066, + "grad_norm": 0.7050724029541016, + "learning_rate": 0.00016336455271895016, + "loss": 2.619, + "step": 5703 + }, + { + "epoch": 0.46033411346945363, + "grad_norm": 0.8287240862846375, + "learning_rate": 0.00016335233881403248, + "loss": 2.71, + "step": 5704 + }, + { + "epoch": 0.46041481720603666, + "grad_norm": 0.6880568861961365, + "learning_rate": 0.000163340123330209, + "loss": 2.6516, + "step": 5705 + }, + { + "epoch": 0.46049552094261964, + "grad_norm": 0.7222896218299866, + "learning_rate": 0.00016332790626778402, + "loss": 2.5899, + "step": 5706 + }, + { + "epoch": 0.4605762246792027, + "grad_norm": 0.7707448601722717, + "learning_rate": 0.00016331568762706207, + "loss": 2.6116, + "step": 5707 + }, + { + "epoch": 0.46065692841578565, + "grad_norm": 0.7780653834342957, + "learning_rate": 0.0001633034674083477, + "loss": 2.6072, + "step": 5708 + }, + { + "epoch": 0.4607376321523687, + "grad_norm": 0.7551524639129639, + "learning_rate": 0.00016329124561194545, + "loss": 2.548, + "step": 5709 + }, + { + "epoch": 0.46081833588895166, + "grad_norm": 0.9312284588813782, + "learning_rate": 0.0001632790222381599, + "loss": 2.6557, + "step": 5710 + }, + { + "epoch": 0.4608990396255347, + "grad_norm": 0.7404753565788269, + "learning_rate": 0.0001632667972872957, + "loss": 2.6889, + "step": 5711 + }, + { + "epoch": 0.46097974336211767, + "grad_norm": 0.7423726916313171, + "learning_rate": 0.00016325457075965752, + "loss": 2.6265, + "step": 5712 + }, + { + "epoch": 0.46106044709870064, + "grad_norm": 1.0683187246322632, + "learning_rate": 0.0001632423426555501, + "loss": 2.6827, + "step": 5713 + }, + { + "epoch": 0.4611411508352837, + "grad_norm": 0.7204160094261169, + 
"learning_rate": 0.0001632301129752782, + "loss": 2.702, + "step": 5714 + }, + { + "epoch": 0.46122185457186665, + "grad_norm": 0.7591153383255005, + "learning_rate": 0.0001632178817191466, + "loss": 2.6031, + "step": 5715 + }, + { + "epoch": 0.4613025583084497, + "grad_norm": 0.8147456645965576, + "learning_rate": 0.00016320564888746013, + "loss": 2.6117, + "step": 5716 + }, + { + "epoch": 0.46138326204503266, + "grad_norm": 0.7880246639251709, + "learning_rate": 0.00016319341448052364, + "loss": 2.5896, + "step": 5717 + }, + { + "epoch": 0.4614639657816157, + "grad_norm": 0.6875137686729431, + "learning_rate": 0.00016318117849864206, + "loss": 2.6258, + "step": 5718 + }, + { + "epoch": 0.46154466951819867, + "grad_norm": 0.7197960615158081, + "learning_rate": 0.00016316894094212044, + "loss": 2.6656, + "step": 5719 + }, + { + "epoch": 0.4616253732547817, + "grad_norm": 0.7049540281295776, + "learning_rate": 0.0001631567018112636, + "loss": 2.6698, + "step": 5720 + }, + { + "epoch": 0.4617060769913647, + "grad_norm": 0.7128825783729553, + "learning_rate": 0.00016314446110637668, + "loss": 2.6552, + "step": 5721 + }, + { + "epoch": 0.4617867807279477, + "grad_norm": 0.7956201434135437, + "learning_rate": 0.00016313221882776477, + "loss": 2.6747, + "step": 5722 + }, + { + "epoch": 0.4618674844645307, + "grad_norm": 0.7598347663879395, + "learning_rate": 0.0001631199749757329, + "loss": 2.6187, + "step": 5723 + }, + { + "epoch": 0.4619481882011137, + "grad_norm": 0.6587582230567932, + "learning_rate": 0.00016310772955058627, + "loss": 2.596, + "step": 5724 + }, + { + "epoch": 0.4620288919376967, + "grad_norm": 0.700136125087738, + "learning_rate": 0.00016309548255263003, + "loss": 2.6527, + "step": 5725 + }, + { + "epoch": 0.4621095956742797, + "grad_norm": 0.7246582508087158, + "learning_rate": 0.00016308323398216945, + "loss": 2.6577, + "step": 5726 + }, + { + "epoch": 0.4621902994108627, + "grad_norm": 0.6951557993888855, + "learning_rate": 0.00016307098383950977, 
+ "loss": 2.5816, + "step": 5727 + }, + { + "epoch": 0.46227100314744574, + "grad_norm": 0.7109191417694092, + "learning_rate": 0.0001630587321249563, + "loss": 2.6586, + "step": 5728 + }, + { + "epoch": 0.4623517068840287, + "grad_norm": 0.7357863783836365, + "learning_rate": 0.0001630464788388144, + "loss": 2.691, + "step": 5729 + }, + { + "epoch": 0.46243241062061174, + "grad_norm": 0.7916350960731506, + "learning_rate": 0.00016303422398138945, + "loss": 2.6584, + "step": 5730 + }, + { + "epoch": 0.4625131143571947, + "grad_norm": 0.6543231010437012, + "learning_rate": 0.00016302196755298685, + "loss": 2.6482, + "step": 5731 + }, + { + "epoch": 0.46259381809377775, + "grad_norm": 0.6978787183761597, + "learning_rate": 0.00016300970955391208, + "loss": 2.5956, + "step": 5732 + }, + { + "epoch": 0.46267452183036073, + "grad_norm": 0.7301886677742004, + "learning_rate": 0.00016299744998447065, + "loss": 2.6178, + "step": 5733 + }, + { + "epoch": 0.46275522556694376, + "grad_norm": 0.7381030321121216, + "learning_rate": 0.00016298518884496808, + "loss": 2.6712, + "step": 5734 + }, + { + "epoch": 0.46283592930352674, + "grad_norm": 0.7769027948379517, + "learning_rate": 0.00016297292613570995, + "loss": 2.6082, + "step": 5735 + }, + { + "epoch": 0.46291663304010977, + "grad_norm": 0.7698354721069336, + "learning_rate": 0.0001629606618570019, + "loss": 2.6543, + "step": 5736 + }, + { + "epoch": 0.46299733677669275, + "grad_norm": 0.7001554369926453, + "learning_rate": 0.00016294839600914957, + "loss": 2.6174, + "step": 5737 + }, + { + "epoch": 0.4630780405132758, + "grad_norm": 0.7589300274848938, + "learning_rate": 0.00016293612859245868, + "loss": 2.6338, + "step": 5738 + }, + { + "epoch": 0.46315874424985876, + "grad_norm": 0.7083945274353027, + "learning_rate": 0.00016292385960723493, + "loss": 2.6793, + "step": 5739 + }, + { + "epoch": 0.4632394479864418, + "grad_norm": 0.739439845085144, + "learning_rate": 0.00016291158905378412, + "loss": 2.7335, + "step": 5740 
+ }, + { + "epoch": 0.46332015172302476, + "grad_norm": 0.6868166923522949, + "learning_rate": 0.00016289931693241205, + "loss": 2.6139, + "step": 5741 + }, + { + "epoch": 0.4634008554596078, + "grad_norm": 0.7385871410369873, + "learning_rate": 0.0001628870432434246, + "loss": 2.6783, + "step": 5742 + }, + { + "epoch": 0.4634815591961908, + "grad_norm": 0.7227835655212402, + "learning_rate": 0.00016287476798712764, + "loss": 2.6732, + "step": 5743 + }, + { + "epoch": 0.4635622629327738, + "grad_norm": 0.6662411689758301, + "learning_rate": 0.00016286249116382709, + "loss": 2.6645, + "step": 5744 + }, + { + "epoch": 0.4636429666693568, + "grad_norm": 0.8110263347625732, + "learning_rate": 0.00016285021277382894, + "loss": 2.6448, + "step": 5745 + }, + { + "epoch": 0.4637236704059398, + "grad_norm": 0.7419269680976868, + "learning_rate": 0.0001628379328174392, + "loss": 2.7286, + "step": 5746 + }, + { + "epoch": 0.4638043741425228, + "grad_norm": 0.6518125534057617, + "learning_rate": 0.0001628256512949639, + "loss": 2.6545, + "step": 5747 + }, + { + "epoch": 0.4638850778791058, + "grad_norm": 0.6816060543060303, + "learning_rate": 0.00016281336820670917, + "loss": 2.6167, + "step": 5748 + }, + { + "epoch": 0.4639657816156888, + "grad_norm": 0.6537362337112427, + "learning_rate": 0.0001628010835529811, + "loss": 2.6522, + "step": 5749 + }, + { + "epoch": 0.46404648535227183, + "grad_norm": 0.6720992922782898, + "learning_rate": 0.00016278879733408585, + "loss": 2.6028, + "step": 5750 + }, + { + "epoch": 0.4641271890888548, + "grad_norm": 0.6778908371925354, + "learning_rate": 0.00016277650955032967, + "loss": 2.5591, + "step": 5751 + }, + { + "epoch": 0.46420789282543784, + "grad_norm": 0.6908471584320068, + "learning_rate": 0.0001627642202020187, + "loss": 2.6574, + "step": 5752 + }, + { + "epoch": 0.4642885965620208, + "grad_norm": 0.7034298181533813, + "learning_rate": 0.00016275192928945936, + "loss": 2.657, + "step": 5753 + }, + { + "epoch": 
0.46436930029860385, + "grad_norm": 0.7245952486991882, + "learning_rate": 0.0001627396368129579, + "loss": 2.6572, + "step": 5754 + }, + { + "epoch": 0.4644500040351868, + "grad_norm": 0.6764482855796814, + "learning_rate": 0.0001627273427728207, + "loss": 2.6576, + "step": 5755 + }, + { + "epoch": 0.46453070777176986, + "grad_norm": 0.7074379920959473, + "learning_rate": 0.0001627150471693541, + "loss": 2.614, + "step": 5756 + }, + { + "epoch": 0.46461141150835283, + "grad_norm": 0.7292052507400513, + "learning_rate": 0.0001627027500028646, + "loss": 2.673, + "step": 5757 + }, + { + "epoch": 0.46469211524493587, + "grad_norm": 0.7554025650024414, + "learning_rate": 0.0001626904512736587, + "loss": 2.5919, + "step": 5758 + }, + { + "epoch": 0.46477281898151884, + "grad_norm": 0.6829606890678406, + "learning_rate": 0.00016267815098204284, + "loss": 2.7206, + "step": 5759 + }, + { + "epoch": 0.4648535227181019, + "grad_norm": 0.7201548218727112, + "learning_rate": 0.00016266584912832363, + "loss": 2.6651, + "step": 5760 + }, + { + "epoch": 0.46493422645468485, + "grad_norm": 0.6889227628707886, + "learning_rate": 0.00016265354571280764, + "loss": 2.6776, + "step": 5761 + }, + { + "epoch": 0.4650149301912679, + "grad_norm": 0.7286190986633301, + "learning_rate": 0.00016264124073580156, + "loss": 2.591, + "step": 5762 + }, + { + "epoch": 0.46509563392785086, + "grad_norm": 0.7222036123275757, + "learning_rate": 0.00016262893419761196, + "loss": 2.6422, + "step": 5763 + }, + { + "epoch": 0.46517633766443384, + "grad_norm": 0.6822768449783325, + "learning_rate": 0.00016261662609854562, + "loss": 2.6126, + "step": 5764 + }, + { + "epoch": 0.46525704140101687, + "grad_norm": 0.7263356447219849, + "learning_rate": 0.00016260431643890929, + "loss": 2.6304, + "step": 5765 + }, + { + "epoch": 0.46533774513759985, + "grad_norm": 0.7152180075645447, + "learning_rate": 0.00016259200521900972, + "loss": 2.6489, + "step": 5766 + }, + { + "epoch": 0.4654184488741829, + "grad_norm": 
0.6988116502761841, + "learning_rate": 0.00016257969243915378, + "loss": 2.6151, + "step": 5767 + }, + { + "epoch": 0.46549915261076585, + "grad_norm": 0.7131790518760681, + "learning_rate": 0.00016256737809964831, + "loss": 2.6284, + "step": 5768 + }, + { + "epoch": 0.4655798563473489, + "grad_norm": 0.674196183681488, + "learning_rate": 0.00016255506220080025, + "loss": 2.5815, + "step": 5769 + }, + { + "epoch": 0.46566056008393186, + "grad_norm": 0.7166198492050171, + "learning_rate": 0.0001625427447429165, + "loss": 2.6594, + "step": 5770 + }, + { + "epoch": 0.4657412638205149, + "grad_norm": 0.6997127532958984, + "learning_rate": 0.00016253042572630407, + "loss": 2.6502, + "step": 5771 + }, + { + "epoch": 0.46582196755709787, + "grad_norm": 0.7761591076850891, + "learning_rate": 0.00016251810515126994, + "loss": 2.624, + "step": 5772 + }, + { + "epoch": 0.4659026712936809, + "grad_norm": 0.7038728594779968, + "learning_rate": 0.00016250578301812125, + "loss": 2.6096, + "step": 5773 + }, + { + "epoch": 0.4659833750302639, + "grad_norm": 0.7080080509185791, + "learning_rate": 0.00016249345932716505, + "loss": 2.6196, + "step": 5774 + }, + { + "epoch": 0.4660640787668469, + "grad_norm": 0.7461444735527039, + "learning_rate": 0.00016248113407870847, + "loss": 2.65, + "step": 5775 + }, + { + "epoch": 0.4661447825034299, + "grad_norm": 0.7914463877677917, + "learning_rate": 0.00016246880727305868, + "loss": 2.6539, + "step": 5776 + }, + { + "epoch": 0.4662254862400129, + "grad_norm": 0.7067776918411255, + "learning_rate": 0.00016245647891052295, + "loss": 2.72, + "step": 5777 + }, + { + "epoch": 0.4663061899765959, + "grad_norm": 0.7190818190574646, + "learning_rate": 0.00016244414899140852, + "loss": 2.7029, + "step": 5778 + }, + { + "epoch": 0.46638689371317893, + "grad_norm": 0.6740003824234009, + "learning_rate": 0.00016243181751602261, + "loss": 2.6404, + "step": 5779 + }, + { + "epoch": 0.4664675974497619, + "grad_norm": 0.7942661643028259, + "learning_rate": 
0.00016241948448467267, + "loss": 2.6333, + "step": 5780 + }, + { + "epoch": 0.46654830118634494, + "grad_norm": 0.6415690183639526, + "learning_rate": 0.00016240714989766597, + "loss": 2.6354, + "step": 5781 + }, + { + "epoch": 0.4666290049229279, + "grad_norm": 0.7287769913673401, + "learning_rate": 0.00016239481375530997, + "loss": 2.6721, + "step": 5782 + }, + { + "epoch": 0.46670970865951095, + "grad_norm": 0.8197699189186096, + "learning_rate": 0.00016238247605791212, + "loss": 2.7577, + "step": 5783 + }, + { + "epoch": 0.4667904123960939, + "grad_norm": 0.8182012438774109, + "learning_rate": 0.0001623701368057799, + "loss": 2.6475, + "step": 5784 + }, + { + "epoch": 0.46687111613267696, + "grad_norm": 0.6974665522575378, + "learning_rate": 0.00016235779599922082, + "loss": 2.5897, + "step": 5785 + }, + { + "epoch": 0.46695181986925993, + "grad_norm": 0.7156379222869873, + "learning_rate": 0.00016234545363854247, + "loss": 2.5981, + "step": 5786 + }, + { + "epoch": 0.46703252360584296, + "grad_norm": 0.6875364780426025, + "learning_rate": 0.0001623331097240524, + "loss": 2.6333, + "step": 5787 + }, + { + "epoch": 0.46711322734242594, + "grad_norm": 0.7222917675971985, + "learning_rate": 0.00016232076425605835, + "loss": 2.5865, + "step": 5788 + }, + { + "epoch": 0.467193931079009, + "grad_norm": 0.7224915027618408, + "learning_rate": 0.00016230841723486792, + "loss": 2.667, + "step": 5789 + }, + { + "epoch": 0.46727463481559195, + "grad_norm": 0.7125402688980103, + "learning_rate": 0.00016229606866078887, + "loss": 2.6548, + "step": 5790 + }, + { + "epoch": 0.467355338552175, + "grad_norm": 0.6866132616996765, + "learning_rate": 0.00016228371853412894, + "loss": 2.6381, + "step": 5791 + }, + { + "epoch": 0.46743604228875796, + "grad_norm": 0.7573552131652832, + "learning_rate": 0.00016227136685519593, + "loss": 2.6766, + "step": 5792 + }, + { + "epoch": 0.467516746025341, + "grad_norm": 0.7565932273864746, + "learning_rate": 0.00016225901362429767, + "loss": 
2.5965, + "step": 5793 + }, + { + "epoch": 0.46759744976192397, + "grad_norm": 0.7279250621795654, + "learning_rate": 0.00016224665884174207, + "loss": 2.6599, + "step": 5794 + }, + { + "epoch": 0.467678153498507, + "grad_norm": 0.7501276731491089, + "learning_rate": 0.000162234302507837, + "loss": 2.636, + "step": 5795 + }, + { + "epoch": 0.46775885723509, + "grad_norm": 0.7823930978775024, + "learning_rate": 0.00016222194462289042, + "loss": 2.6277, + "step": 5796 + }, + { + "epoch": 0.467839560971673, + "grad_norm": 0.7168415784835815, + "learning_rate": 0.00016220958518721034, + "loss": 2.6868, + "step": 5797 + }, + { + "epoch": 0.467920264708256, + "grad_norm": 0.7468454241752625, + "learning_rate": 0.00016219722420110478, + "loss": 2.7209, + "step": 5798 + }, + { + "epoch": 0.468000968444839, + "grad_norm": 0.6915228962898254, + "learning_rate": 0.0001621848616648818, + "loss": 2.6356, + "step": 5799 + }, + { + "epoch": 0.468081672181422, + "grad_norm": 0.7731573581695557, + "learning_rate": 0.00016217249757884955, + "loss": 2.6396, + "step": 5800 + }, + { + "epoch": 0.468162375918005, + "grad_norm": 0.6579388380050659, + "learning_rate": 0.0001621601319433161, + "loss": 2.6077, + "step": 5801 + }, + { + "epoch": 0.468243079654588, + "grad_norm": 0.7136246562004089, + "learning_rate": 0.00016214776475858967, + "loss": 2.6602, + "step": 5802 + }, + { + "epoch": 0.46832378339117103, + "grad_norm": 0.6929461359977722, + "learning_rate": 0.0001621353960249785, + "loss": 2.6851, + "step": 5803 + }, + { + "epoch": 0.468404487127754, + "grad_norm": 0.8001779913902283, + "learning_rate": 0.00016212302574279087, + "loss": 2.6577, + "step": 5804 + }, + { + "epoch": 0.46848519086433704, + "grad_norm": 0.7637671828269958, + "learning_rate": 0.00016211065391233498, + "loss": 2.6923, + "step": 5805 + }, + { + "epoch": 0.46856589460092, + "grad_norm": 0.6879906058311462, + "learning_rate": 0.0001620982805339193, + "loss": 2.6555, + "step": 5806 + }, + { + "epoch": 
0.46864659833750305, + "grad_norm": 0.7731223702430725, + "learning_rate": 0.0001620859056078521, + "loss": 2.6301, + "step": 5807 + }, + { + "epoch": 0.468727302074086, + "grad_norm": 0.7351491451263428, + "learning_rate": 0.00016207352913444185, + "loss": 2.6154, + "step": 5808 + }, + { + "epoch": 0.46880800581066906, + "grad_norm": 0.716314435005188, + "learning_rate": 0.000162061151113997, + "loss": 2.6294, + "step": 5809 + }, + { + "epoch": 0.46888870954725204, + "grad_norm": 0.6974702477455139, + "learning_rate": 0.00016204877154682605, + "loss": 2.6046, + "step": 5810 + }, + { + "epoch": 0.46896941328383507, + "grad_norm": 0.7456035614013672, + "learning_rate": 0.00016203639043323745, + "loss": 2.6308, + "step": 5811 + }, + { + "epoch": 0.46905011702041804, + "grad_norm": 0.7198047637939453, + "learning_rate": 0.0001620240077735399, + "loss": 2.6303, + "step": 5812 + }, + { + "epoch": 0.4691308207570011, + "grad_norm": 0.7098269462585449, + "learning_rate": 0.00016201162356804192, + "loss": 2.6352, + "step": 5813 + }, + { + "epoch": 0.46921152449358405, + "grad_norm": 0.7060410976409912, + "learning_rate": 0.0001619992378170522, + "loss": 2.6489, + "step": 5814 + }, + { + "epoch": 0.46929222823016703, + "grad_norm": 0.7126092314720154, + "learning_rate": 0.0001619868505208794, + "loss": 2.66, + "step": 5815 + }, + { + "epoch": 0.46937293196675006, + "grad_norm": 0.7391123175621033, + "learning_rate": 0.00016197446167983223, + "loss": 2.6066, + "step": 5816 + }, + { + "epoch": 0.46945363570333304, + "grad_norm": 0.7282211780548096, + "learning_rate": 0.0001619620712942195, + "loss": 2.6422, + "step": 5817 + }, + { + "epoch": 0.46953433943991607, + "grad_norm": 0.7581801414489746, + "learning_rate": 0.00016194967936434998, + "loss": 2.702, + "step": 5818 + }, + { + "epoch": 0.46961504317649905, + "grad_norm": 0.6649011373519897, + "learning_rate": 0.00016193728589053248, + "loss": 2.6235, + "step": 5819 + }, + { + "epoch": 0.4696957469130821, + "grad_norm": 
0.720312237739563, + "learning_rate": 0.00016192489087307592, + "loss": 2.5961, + "step": 5820 + }, + { + "epoch": 0.46977645064966506, + "grad_norm": 0.72076016664505, + "learning_rate": 0.0001619124943122892, + "loss": 2.6793, + "step": 5821 + }, + { + "epoch": 0.4698571543862481, + "grad_norm": 0.6695740818977356, + "learning_rate": 0.0001619000962084813, + "loss": 2.6325, + "step": 5822 + }, + { + "epoch": 0.46993785812283106, + "grad_norm": 0.7678804993629456, + "learning_rate": 0.0001618876965619612, + "loss": 2.7473, + "step": 5823 + }, + { + "epoch": 0.4700185618594141, + "grad_norm": 0.782349169254303, + "learning_rate": 0.00016187529537303792, + "loss": 2.6139, + "step": 5824 + }, + { + "epoch": 0.4700992655959971, + "grad_norm": 0.6906631588935852, + "learning_rate": 0.00016186289264202052, + "loss": 2.6529, + "step": 5825 + }, + { + "epoch": 0.4701799693325801, + "grad_norm": 0.732947051525116, + "learning_rate": 0.00016185048836921814, + "loss": 2.6416, + "step": 5826 + }, + { + "epoch": 0.4702606730691631, + "grad_norm": 0.8306718468666077, + "learning_rate": 0.0001618380825549399, + "loss": 2.6566, + "step": 5827 + }, + { + "epoch": 0.4703413768057461, + "grad_norm": 0.725764811038971, + "learning_rate": 0.00016182567519949502, + "loss": 2.6664, + "step": 5828 + }, + { + "epoch": 0.4704220805423291, + "grad_norm": 0.7301872372627258, + "learning_rate": 0.00016181326630319268, + "loss": 2.6666, + "step": 5829 + }, + { + "epoch": 0.4705027842789121, + "grad_norm": 0.7297122478485107, + "learning_rate": 0.00016180085586634216, + "loss": 2.6415, + "step": 5830 + }, + { + "epoch": 0.4705834880154951, + "grad_norm": 0.7445664405822754, + "learning_rate": 0.00016178844388925278, + "loss": 2.6112, + "step": 5831 + }, + { + "epoch": 0.47066419175207813, + "grad_norm": 0.7787267565727234, + "learning_rate": 0.00016177603037223384, + "loss": 2.6452, + "step": 5832 + }, + { + "epoch": 0.4707448954886611, + "grad_norm": 0.7386903762817383, + "learning_rate": 
0.00016176361531559474, + "loss": 2.6919, + "step": 5833 + }, + { + "epoch": 0.47082559922524414, + "grad_norm": 0.7991776466369629, + "learning_rate": 0.0001617511987196449, + "loss": 2.6728, + "step": 5834 + }, + { + "epoch": 0.4709063029618271, + "grad_norm": 0.7196263670921326, + "learning_rate": 0.00016173878058469375, + "loss": 2.6008, + "step": 5835 + }, + { + "epoch": 0.47098700669841015, + "grad_norm": 0.6773477792739868, + "learning_rate": 0.00016172636091105086, + "loss": 2.6184, + "step": 5836 + }, + { + "epoch": 0.4710677104349931, + "grad_norm": 0.7238345742225647, + "learning_rate": 0.00016171393969902567, + "loss": 2.6221, + "step": 5837 + }, + { + "epoch": 0.47114841417157616, + "grad_norm": 0.702104926109314, + "learning_rate": 0.00016170151694892777, + "loss": 2.5909, + "step": 5838 + }, + { + "epoch": 0.47122911790815913, + "grad_norm": 0.7571590542793274, + "learning_rate": 0.00016168909266106677, + "loss": 2.6044, + "step": 5839 + }, + { + "epoch": 0.47130982164474217, + "grad_norm": 0.7408227324485779, + "learning_rate": 0.00016167666683575234, + "loss": 2.5771, + "step": 5840 + }, + { + "epoch": 0.47139052538132514, + "grad_norm": 0.6760764122009277, + "learning_rate": 0.00016166423947329414, + "loss": 2.6202, + "step": 5841 + }, + { + "epoch": 0.4714712291179082, + "grad_norm": 0.7085632681846619, + "learning_rate": 0.00016165181057400192, + "loss": 2.5887, + "step": 5842 + }, + { + "epoch": 0.47155193285449115, + "grad_norm": 0.7298943400382996, + "learning_rate": 0.00016163938013818538, + "loss": 2.609, + "step": 5843 + }, + { + "epoch": 0.4716326365910742, + "grad_norm": 0.7591157555580139, + "learning_rate": 0.0001616269481661544, + "loss": 2.6582, + "step": 5844 + }, + { + "epoch": 0.47171334032765716, + "grad_norm": 0.6727088093757629, + "learning_rate": 0.00016161451465821877, + "loss": 2.6289, + "step": 5845 + }, + { + "epoch": 0.4717940440642402, + "grad_norm": 0.6782706379890442, + "learning_rate": 0.00016160207961468835, + 
"loss": 2.6875, + "step": 5846 + }, + { + "epoch": 0.47187474780082317, + "grad_norm": 0.6839444041252136, + "learning_rate": 0.00016158964303587313, + "loss": 2.5687, + "step": 5847 + }, + { + "epoch": 0.4719554515374062, + "grad_norm": 0.7565997838973999, + "learning_rate": 0.00016157720492208295, + "loss": 2.6855, + "step": 5848 + }, + { + "epoch": 0.4720361552739892, + "grad_norm": 0.7286611199378967, + "learning_rate": 0.0001615647652736279, + "loss": 2.5906, + "step": 5849 + }, + { + "epoch": 0.4721168590105722, + "grad_norm": 0.7503396272659302, + "learning_rate": 0.00016155232409081793, + "loss": 2.6419, + "step": 5850 + }, + { + "epoch": 0.4721975627471552, + "grad_norm": 0.6924198865890503, + "learning_rate": 0.00016153988137396317, + "loss": 2.661, + "step": 5851 + }, + { + "epoch": 0.4722782664837382, + "grad_norm": 0.7731672525405884, + "learning_rate": 0.0001615274371233737, + "loss": 2.6993, + "step": 5852 + }, + { + "epoch": 0.4723589702203212, + "grad_norm": 0.7422799468040466, + "learning_rate": 0.00016151499133935964, + "loss": 2.6134, + "step": 5853 + }, + { + "epoch": 0.4724396739569042, + "grad_norm": 0.6924546957015991, + "learning_rate": 0.0001615025440222312, + "loss": 2.672, + "step": 5854 + }, + { + "epoch": 0.4725203776934872, + "grad_norm": 0.7205976843833923, + "learning_rate": 0.00016149009517229862, + "loss": 2.6722, + "step": 5855 + }, + { + "epoch": 0.47260108143007024, + "grad_norm": 0.6898519992828369, + "learning_rate": 0.0001614776447898721, + "loss": 2.6474, + "step": 5856 + }, + { + "epoch": 0.4726817851666532, + "grad_norm": 0.7512481212615967, + "learning_rate": 0.00016146519287526197, + "loss": 2.7413, + "step": 5857 + }, + { + "epoch": 0.47276248890323624, + "grad_norm": 0.6734220385551453, + "learning_rate": 0.0001614527394287786, + "loss": 2.6114, + "step": 5858 + }, + { + "epoch": 0.4728431926398192, + "grad_norm": 0.6745339632034302, + "learning_rate": 0.00016144028445073228, + "loss": 2.6039, + "step": 5859 + }, + { 
+ "epoch": 0.47292389637640225, + "grad_norm": 0.7463086843490601, + "learning_rate": 0.0001614278279414335, + "loss": 2.6109, + "step": 5860 + }, + { + "epoch": 0.47300460011298523, + "grad_norm": 0.7203261256217957, + "learning_rate": 0.00016141536990119264, + "loss": 2.651, + "step": 5861 + }, + { + "epoch": 0.47308530384956826, + "grad_norm": 0.7718746066093445, + "learning_rate": 0.00016140291033032024, + "loss": 2.6953, + "step": 5862 + }, + { + "epoch": 0.47316600758615124, + "grad_norm": 0.7854858040809631, + "learning_rate": 0.0001613904492291268, + "loss": 2.5941, + "step": 5863 + }, + { + "epoch": 0.47324671132273427, + "grad_norm": 0.7218664288520813, + "learning_rate": 0.0001613779865979229, + "loss": 2.6447, + "step": 5864 + }, + { + "epoch": 0.47332741505931725, + "grad_norm": 0.7479045987129211, + "learning_rate": 0.0001613655224370191, + "loss": 2.6662, + "step": 5865 + }, + { + "epoch": 0.4734081187959002, + "grad_norm": 0.7335021495819092, + "learning_rate": 0.00016135305674672612, + "loss": 2.6283, + "step": 5866 + }, + { + "epoch": 0.47348882253248326, + "grad_norm": 0.7650331258773804, + "learning_rate": 0.00016134058952735453, + "loss": 2.7168, + "step": 5867 + }, + { + "epoch": 0.47356952626906623, + "grad_norm": 0.733383297920227, + "learning_rate": 0.00016132812077921513, + "loss": 2.6352, + "step": 5868 + }, + { + "epoch": 0.47365023000564926, + "grad_norm": 1.3944146633148193, + "learning_rate": 0.00016131565050261866, + "loss": 2.7518, + "step": 5869 + }, + { + "epoch": 0.47373093374223224, + "grad_norm": 0.746112585067749, + "learning_rate": 0.0001613031786978759, + "loss": 2.6253, + "step": 5870 + }, + { + "epoch": 0.4738116374788153, + "grad_norm": 0.9859737753868103, + "learning_rate": 0.00016129070536529766, + "loss": 2.6682, + "step": 5871 + }, + { + "epoch": 0.47389234121539825, + "grad_norm": 0.7358877062797546, + "learning_rate": 0.00016127823050519484, + "loss": 2.6712, + "step": 5872 + }, + { + "epoch": 0.4739730449519813, + 
"grad_norm": 0.7379923462867737, + "learning_rate": 0.0001612657541178783, + "loss": 2.6268, + "step": 5873 + }, + { + "epoch": 0.47405374868856426, + "grad_norm": 0.7671005725860596, + "learning_rate": 0.00016125327620365907, + "loss": 2.6127, + "step": 5874 + }, + { + "epoch": 0.4741344524251473, + "grad_norm": 0.8007156252861023, + "learning_rate": 0.00016124079676284805, + "loss": 2.6173, + "step": 5875 + }, + { + "epoch": 0.47421515616173027, + "grad_norm": 0.7930500507354736, + "learning_rate": 0.00016122831579575627, + "loss": 2.589, + "step": 5876 + }, + { + "epoch": 0.4742958598983133, + "grad_norm": 0.788006603717804, + "learning_rate": 0.00016121583330269484, + "loss": 2.6731, + "step": 5877 + }, + { + "epoch": 0.4743765636348963, + "grad_norm": 0.742148220539093, + "learning_rate": 0.00016120334928397483, + "loss": 2.674, + "step": 5878 + }, + { + "epoch": 0.4744572673714793, + "grad_norm": 0.6823038458824158, + "learning_rate": 0.00016119086373990736, + "loss": 2.6153, + "step": 5879 + }, + { + "epoch": 0.4745379711080623, + "grad_norm": 0.7542331218719482, + "learning_rate": 0.00016117837667080356, + "loss": 2.6739, + "step": 5880 + }, + { + "epoch": 0.4746186748446453, + "grad_norm": 0.8163543343544006, + "learning_rate": 0.00016116588807697476, + "loss": 2.6558, + "step": 5881 + }, + { + "epoch": 0.4746993785812283, + "grad_norm": 0.7528213858604431, + "learning_rate": 0.0001611533979587321, + "loss": 2.6243, + "step": 5882 + }, + { + "epoch": 0.4747800823178113, + "grad_norm": 0.7476626038551331, + "learning_rate": 0.00016114090631638695, + "loss": 2.5984, + "step": 5883 + }, + { + "epoch": 0.4748607860543943, + "grad_norm": 0.7436621785163879, + "learning_rate": 0.00016112841315025055, + "loss": 2.6118, + "step": 5884 + }, + { + "epoch": 0.47494148979097733, + "grad_norm": 0.8024004101753235, + "learning_rate": 0.0001611159184606343, + "loss": 2.6926, + "step": 5885 + }, + { + "epoch": 0.4750221935275603, + "grad_norm": 0.7475626468658447, + 
"learning_rate": 0.00016110342224784962, + "loss": 2.6175, + "step": 5886 + }, + { + "epoch": 0.47510289726414334, + "grad_norm": 0.7900637984275818, + "learning_rate": 0.00016109092451220796, + "loss": 2.6503, + "step": 5887 + }, + { + "epoch": 0.4751836010007263, + "grad_norm": 0.6988356113433838, + "learning_rate": 0.00016107842525402074, + "loss": 2.6494, + "step": 5888 + }, + { + "epoch": 0.47526430473730935, + "grad_norm": 1.0214186906814575, + "learning_rate": 0.00016106592447359948, + "loss": 2.6476, + "step": 5889 + }, + { + "epoch": 0.4753450084738923, + "grad_norm": 0.741527795791626, + "learning_rate": 0.00016105342217125578, + "loss": 2.6054, + "step": 5890 + }, + { + "epoch": 0.47542571221047536, + "grad_norm": 0.7196603417396545, + "learning_rate": 0.0001610409183473012, + "loss": 2.6146, + "step": 5891 + }, + { + "epoch": 0.47550641594705834, + "grad_norm": 0.8130923509597778, + "learning_rate": 0.00016102841300204737, + "loss": 2.6505, + "step": 5892 + }, + { + "epoch": 0.47558711968364137, + "grad_norm": 0.7929537892341614, + "learning_rate": 0.00016101590613580596, + "loss": 2.6725, + "step": 5893 + }, + { + "epoch": 0.47566782342022434, + "grad_norm": 0.7149303555488586, + "learning_rate": 0.00016100339774888865, + "loss": 2.6272, + "step": 5894 + }, + { + "epoch": 0.4757485271568074, + "grad_norm": 0.7242792248725891, + "learning_rate": 0.00016099088784160724, + "loss": 2.5948, + "step": 5895 + }, + { + "epoch": 0.47582923089339035, + "grad_norm": 0.7571540474891663, + "learning_rate": 0.00016097837641427346, + "loss": 2.689, + "step": 5896 + }, + { + "epoch": 0.4759099346299734, + "grad_norm": 0.7402021288871765, + "learning_rate": 0.00016096586346719916, + "loss": 2.7035, + "step": 5897 + }, + { + "epoch": 0.47599063836655636, + "grad_norm": 0.7195574045181274, + "learning_rate": 0.00016095334900069613, + "loss": 2.5862, + "step": 5898 + }, + { + "epoch": 0.4760713421031394, + "grad_norm": 0.7677412033081055, + "learning_rate": 
0.00016094083301507634, + "loss": 2.6715, + "step": 5899 + }, + { + "epoch": 0.47615204583972237, + "grad_norm": 0.7131708860397339, + "learning_rate": 0.0001609283155106517, + "loss": 2.6555, + "step": 5900 + }, + { + "epoch": 0.4762327495763054, + "grad_norm": 0.6774055361747742, + "learning_rate": 0.00016091579648773414, + "loss": 2.621, + "step": 5901 + }, + { + "epoch": 0.4763134533128884, + "grad_norm": 0.6873257160186768, + "learning_rate": 0.00016090327594663571, + "loss": 2.6719, + "step": 5902 + }, + { + "epoch": 0.4763941570494714, + "grad_norm": 0.8004229068756104, + "learning_rate": 0.00016089075388766845, + "loss": 2.6926, + "step": 5903 + }, + { + "epoch": 0.4764748607860544, + "grad_norm": 0.7196173667907715, + "learning_rate": 0.00016087823031114438, + "loss": 2.6032, + "step": 5904 + }, + { + "epoch": 0.4765555645226374, + "grad_norm": 0.7665518522262573, + "learning_rate": 0.00016086570521737573, + "loss": 2.6359, + "step": 5905 + }, + { + "epoch": 0.4766362682592204, + "grad_norm": 0.7240240573883057, + "learning_rate": 0.0001608531786066746, + "loss": 2.6489, + "step": 5906 + }, + { + "epoch": 0.47671697199580343, + "grad_norm": 0.7603839039802551, + "learning_rate": 0.00016084065047935317, + "loss": 2.6064, + "step": 5907 + }, + { + "epoch": 0.4767976757323864, + "grad_norm": 0.7394058704376221, + "learning_rate": 0.0001608281208357237, + "loss": 2.6643, + "step": 5908 + }, + { + "epoch": 0.47687837946896944, + "grad_norm": 0.7183148860931396, + "learning_rate": 0.00016081558967609845, + "loss": 2.56, + "step": 5909 + }, + { + "epoch": 0.4769590832055524, + "grad_norm": 0.7181926965713501, + "learning_rate": 0.00016080305700078972, + "loss": 2.6665, + "step": 5910 + }, + { + "epoch": 0.47703978694213545, + "grad_norm": 0.7634081840515137, + "learning_rate": 0.00016079052281010988, + "loss": 2.7076, + "step": 5911 + }, + { + "epoch": 0.4771204906787184, + "grad_norm": 0.7928739190101624, + "learning_rate": 0.0001607779871043713, + "loss": 
2.6512, + "step": 5912 + }, + { + "epoch": 0.47720119441530146, + "grad_norm": 0.7192893028259277, + "learning_rate": 0.00016076544988388643, + "loss": 2.6453, + "step": 5913 + }, + { + "epoch": 0.47728189815188443, + "grad_norm": 0.7171720862388611, + "learning_rate": 0.00016075291114896767, + "loss": 2.6501, + "step": 5914 + }, + { + "epoch": 0.47736260188846746, + "grad_norm": 0.6787160038948059, + "learning_rate": 0.00016074037089992756, + "loss": 2.6566, + "step": 5915 + }, + { + "epoch": 0.47744330562505044, + "grad_norm": 0.8118634819984436, + "learning_rate": 0.00016072782913707868, + "loss": 2.6635, + "step": 5916 + }, + { + "epoch": 0.4775240093616334, + "grad_norm": 0.7188509702682495, + "learning_rate": 0.0001607152858607335, + "loss": 2.6899, + "step": 5917 + }, + { + "epoch": 0.47760471309821645, + "grad_norm": 0.6742647290229797, + "learning_rate": 0.00016070274107120468, + "loss": 2.6221, + "step": 5918 + }, + { + "epoch": 0.4776854168347994, + "grad_norm": 0.7274083495140076, + "learning_rate": 0.00016069019476880488, + "loss": 2.6588, + "step": 5919 + }, + { + "epoch": 0.47776612057138246, + "grad_norm": 0.6984386444091797, + "learning_rate": 0.00016067764695384682, + "loss": 2.6376, + "step": 5920 + }, + { + "epoch": 0.47784682430796543, + "grad_norm": 0.7260883450508118, + "learning_rate": 0.00016066509762664315, + "loss": 2.6623, + "step": 5921 + }, + { + "epoch": 0.47792752804454847, + "grad_norm": 0.7540579438209534, + "learning_rate": 0.00016065254678750666, + "loss": 2.695, + "step": 5922 + }, + { + "epoch": 0.47800823178113144, + "grad_norm": 0.7032651305198669, + "learning_rate": 0.00016063999443675017, + "loss": 2.6791, + "step": 5923 + }, + { + "epoch": 0.4780889355177145, + "grad_norm": 0.682842493057251, + "learning_rate": 0.0001606274405746865, + "loss": 2.6198, + "step": 5924 + }, + { + "epoch": 0.47816963925429745, + "grad_norm": 0.6843859553337097, + "learning_rate": 0.00016061488520162853, + "loss": 2.6432, + "step": 5925 + }, + 
{ + "epoch": 0.4782503429908805, + "grad_norm": 0.652119517326355, + "learning_rate": 0.00016060232831788918, + "loss": 2.6461, + "step": 5926 + }, + { + "epoch": 0.47833104672746346, + "grad_norm": 0.6986887454986572, + "learning_rate": 0.0001605897699237814, + "loss": 2.5885, + "step": 5927 + }, + { + "epoch": 0.4784117504640465, + "grad_norm": 0.7156725525856018, + "learning_rate": 0.00016057721001961817, + "loss": 2.6526, + "step": 5928 + }, + { + "epoch": 0.47849245420062947, + "grad_norm": 0.7367579936981201, + "learning_rate": 0.0001605646486057125, + "loss": 2.5842, + "step": 5929 + }, + { + "epoch": 0.4785731579372125, + "grad_norm": 0.7059770822525024, + "learning_rate": 0.00016055208568237746, + "loss": 2.617, + "step": 5930 + }, + { + "epoch": 0.4786538616737955, + "grad_norm": 0.7225117087364197, + "learning_rate": 0.00016053952124992619, + "loss": 2.6499, + "step": 5931 + }, + { + "epoch": 0.4787345654103785, + "grad_norm": 0.7027475237846375, + "learning_rate": 0.00016052695530867177, + "loss": 2.5934, + "step": 5932 + }, + { + "epoch": 0.4788152691469615, + "grad_norm": 0.7031852602958679, + "learning_rate": 0.00016051438785892743, + "loss": 2.5947, + "step": 5933 + }, + { + "epoch": 0.4788959728835445, + "grad_norm": 0.6731768846511841, + "learning_rate": 0.00016050181890100635, + "loss": 2.6811, + "step": 5934 + }, + { + "epoch": 0.4789766766201275, + "grad_norm": 0.7120038866996765, + "learning_rate": 0.0001604892484352218, + "loss": 2.6625, + "step": 5935 + }, + { + "epoch": 0.4790573803567105, + "grad_norm": 0.6895150542259216, + "learning_rate": 0.00016047667646188702, + "loss": 2.6784, + "step": 5936 + }, + { + "epoch": 0.4791380840932935, + "grad_norm": 0.7080708742141724, + "learning_rate": 0.0001604641029813154, + "loss": 2.6491, + "step": 5937 + }, + { + "epoch": 0.47921878782987654, + "grad_norm": 0.6522819399833679, + "learning_rate": 0.00016045152799382025, + "loss": 2.6113, + "step": 5938 + }, + { + "epoch": 0.4792994915664595, + 
"grad_norm": 0.6988112926483154, + "learning_rate": 0.00016043895149971506, + "loss": 2.6892, + "step": 5939 + }, + { + "epoch": 0.47938019530304254, + "grad_norm": 0.7545368671417236, + "learning_rate": 0.00016042637349931318, + "loss": 2.6872, + "step": 5940 + }, + { + "epoch": 0.4794608990396255, + "grad_norm": 0.7083707451820374, + "learning_rate": 0.0001604137939929281, + "loss": 2.6726, + "step": 5941 + }, + { + "epoch": 0.47954160277620855, + "grad_norm": 0.8198027014732361, + "learning_rate": 0.00016040121298087337, + "loss": 2.647, + "step": 5942 + }, + { + "epoch": 0.47962230651279153, + "grad_norm": 0.7296201586723328, + "learning_rate": 0.00016038863046346252, + "loss": 2.7122, + "step": 5943 + }, + { + "epoch": 0.47970301024937456, + "grad_norm": 0.7262474298477173, + "learning_rate": 0.00016037604644100913, + "loss": 2.6903, + "step": 5944 + }, + { + "epoch": 0.47978371398595754, + "grad_norm": 0.8010182976722717, + "learning_rate": 0.00016036346091382686, + "loss": 2.6942, + "step": 5945 + }, + { + "epoch": 0.47986441772254057, + "grad_norm": 0.7227098345756531, + "learning_rate": 0.00016035087388222932, + "loss": 2.6661, + "step": 5946 + }, + { + "epoch": 0.47994512145912355, + "grad_norm": 0.7374662756919861, + "learning_rate": 0.00016033828534653028, + "loss": 2.6233, + "step": 5947 + }, + { + "epoch": 0.4800258251957066, + "grad_norm": 0.7139650583267212, + "learning_rate": 0.00016032569530704342, + "loss": 2.5859, + "step": 5948 + }, + { + "epoch": 0.48010652893228956, + "grad_norm": 0.7067660689353943, + "learning_rate": 0.00016031310376408254, + "loss": 2.6677, + "step": 5949 + }, + { + "epoch": 0.4801872326688726, + "grad_norm": 0.694715142250061, + "learning_rate": 0.00016030051071796146, + "loss": 2.6415, + "step": 5950 + }, + { + "epoch": 0.48026793640545556, + "grad_norm": 0.728918194770813, + "learning_rate": 0.00016028791616899403, + "loss": 2.6274, + "step": 5951 + }, + { + "epoch": 0.4803486401420386, + "grad_norm": 0.699846088886261, 
+ "learning_rate": 0.00016027532011749412, + "loss": 2.6613, + "step": 5952 + }, + { + "epoch": 0.4804293438786216, + "grad_norm": 0.7177432179450989, + "learning_rate": 0.0001602627225637757, + "loss": 2.6107, + "step": 5953 + }, + { + "epoch": 0.4805100476152046, + "grad_norm": 0.7502370476722717, + "learning_rate": 0.00016025012350815267, + "loss": 2.6534, + "step": 5954 + }, + { + "epoch": 0.4805907513517876, + "grad_norm": 0.7730218172073364, + "learning_rate": 0.0001602375229509391, + "loss": 2.7037, + "step": 5955 + }, + { + "epoch": 0.4806714550883706, + "grad_norm": 0.7046666145324707, + "learning_rate": 0.00016022492089244898, + "loss": 2.6336, + "step": 5956 + }, + { + "epoch": 0.4807521588249536, + "grad_norm": 0.7991104125976562, + "learning_rate": 0.0001602123173329964, + "loss": 2.7024, + "step": 5957 + }, + { + "epoch": 0.4808328625615366, + "grad_norm": 0.7056288123130798, + "learning_rate": 0.00016019971227289548, + "loss": 2.6088, + "step": 5958 + }, + { + "epoch": 0.4809135662981196, + "grad_norm": 0.7277925610542297, + "learning_rate": 0.00016018710571246038, + "loss": 2.6245, + "step": 5959 + }, + { + "epoch": 0.48099427003470263, + "grad_norm": 0.7545790672302246, + "learning_rate": 0.00016017449765200526, + "loss": 2.6076, + "step": 5960 + }, + { + "epoch": 0.4810749737712856, + "grad_norm": 0.7106321454048157, + "learning_rate": 0.00016016188809184434, + "loss": 2.5561, + "step": 5961 + }, + { + "epoch": 0.48115567750786864, + "grad_norm": 0.7464704513549805, + "learning_rate": 0.0001601492770322919, + "loss": 2.6336, + "step": 5962 + }, + { + "epoch": 0.4812363812444516, + "grad_norm": 0.7531768083572388, + "learning_rate": 0.00016013666447366228, + "loss": 2.6236, + "step": 5963 + }, + { + "epoch": 0.48131708498103465, + "grad_norm": 0.7412876486778259, + "learning_rate": 0.00016012405041626978, + "loss": 2.6309, + "step": 5964 + }, + { + "epoch": 0.4813977887176176, + "grad_norm": 0.7030940055847168, + "learning_rate": 
0.00016011143486042878, + "loss": 2.6252, + "step": 5965 + }, + { + "epoch": 0.48147849245420066, + "grad_norm": 0.7932302951812744, + "learning_rate": 0.00016009881780645367, + "loss": 2.6797, + "step": 5966 + }, + { + "epoch": 0.48155919619078363, + "grad_norm": 0.7366262078285217, + "learning_rate": 0.00016008619925465893, + "loss": 2.6616, + "step": 5967 + }, + { + "epoch": 0.4816398999273666, + "grad_norm": 0.6938421130180359, + "learning_rate": 0.00016007357920535902, + "loss": 2.6888, + "step": 5968 + }, + { + "epoch": 0.48172060366394964, + "grad_norm": 0.7560005784034729, + "learning_rate": 0.00016006095765886853, + "loss": 2.6044, + "step": 5969 + }, + { + "epoch": 0.4818013074005326, + "grad_norm": 0.7330430150032043, + "learning_rate": 0.0001600483346155019, + "loss": 2.7023, + "step": 5970 + }, + { + "epoch": 0.48188201113711565, + "grad_norm": 0.7257955074310303, + "learning_rate": 0.00016003571007557388, + "loss": 2.6763, + "step": 5971 + }, + { + "epoch": 0.4819627148736986, + "grad_norm": 0.704187273979187, + "learning_rate": 0.000160023084039399, + "loss": 2.6229, + "step": 5972 + }, + { + "epoch": 0.48204341861028166, + "grad_norm": 0.7014813423156738, + "learning_rate": 0.00016001045650729196, + "loss": 2.6207, + "step": 5973 + }, + { + "epoch": 0.48212412234686464, + "grad_norm": 0.8039405941963196, + "learning_rate": 0.00015999782747956747, + "loss": 2.6198, + "step": 5974 + }, + { + "epoch": 0.48220482608344767, + "grad_norm": 0.7114945650100708, + "learning_rate": 0.0001599851969565403, + "loss": 2.6154, + "step": 5975 + }, + { + "epoch": 0.48228552982003065, + "grad_norm": 0.7603329420089722, + "learning_rate": 0.00015997256493852517, + "loss": 2.6217, + "step": 5976 + }, + { + "epoch": 0.4823662335566137, + "grad_norm": 0.7773346900939941, + "learning_rate": 0.000159959931425837, + "loss": 2.7054, + "step": 5977 + }, + { + "epoch": 0.48244693729319665, + "grad_norm": 0.8022029399871826, + "learning_rate": 0.0001599472964187906, + "loss": 
2.6844, + "step": 5978 + }, + { + "epoch": 0.4825276410297797, + "grad_norm": 0.7384541630744934, + "learning_rate": 0.00015993465991770087, + "loss": 2.6516, + "step": 5979 + }, + { + "epoch": 0.48260834476636266, + "grad_norm": 0.6993509531021118, + "learning_rate": 0.00015992202192288273, + "loss": 2.6837, + "step": 5980 + }, + { + "epoch": 0.4826890485029457, + "grad_norm": 0.7430509328842163, + "learning_rate": 0.00015990938243465116, + "loss": 2.6717, + "step": 5981 + }, + { + "epoch": 0.48276975223952867, + "grad_norm": 0.7544847726821899, + "learning_rate": 0.0001598967414533212, + "loss": 2.6573, + "step": 5982 + }, + { + "epoch": 0.4828504559761117, + "grad_norm": 0.736955463886261, + "learning_rate": 0.00015988409897920786, + "loss": 2.6865, + "step": 5983 + }, + { + "epoch": 0.4829311597126947, + "grad_norm": 0.7771684527397156, + "learning_rate": 0.00015987145501262622, + "loss": 2.6173, + "step": 5984 + }, + { + "epoch": 0.4830118634492777, + "grad_norm": 0.7504391670227051, + "learning_rate": 0.00015985880955389143, + "loss": 2.6218, + "step": 5985 + }, + { + "epoch": 0.4830925671858607, + "grad_norm": 0.7025442123413086, + "learning_rate": 0.00015984616260331861, + "loss": 2.6107, + "step": 5986 + }, + { + "epoch": 0.4831732709224437, + "grad_norm": 0.6906485557556152, + "learning_rate": 0.000159833514161223, + "loss": 2.633, + "step": 5987 + }, + { + "epoch": 0.4832539746590267, + "grad_norm": 0.7771004438400269, + "learning_rate": 0.00015982086422791983, + "loss": 2.5956, + "step": 5988 + }, + { + "epoch": 0.48333467839560973, + "grad_norm": 0.6927372813224792, + "learning_rate": 0.00015980821280372432, + "loss": 2.5984, + "step": 5989 + }, + { + "epoch": 0.4834153821321927, + "grad_norm": 0.7196357846260071, + "learning_rate": 0.00015979555988895184, + "loss": 2.6386, + "step": 5990 + }, + { + "epoch": 0.48349608586877574, + "grad_norm": 0.7601087689399719, + "learning_rate": 0.0001597829054839177, + "loss": 2.6707, + "step": 5991 + }, + { + 
"epoch": 0.4835767896053587, + "grad_norm": 0.7783588767051697, + "learning_rate": 0.00015977024958893722, + "loss": 2.5815, + "step": 5992 + }, + { + "epoch": 0.48365749334194175, + "grad_norm": 0.7651833891868591, + "learning_rate": 0.00015975759220432592, + "loss": 2.6235, + "step": 5993 + }, + { + "epoch": 0.4837381970785247, + "grad_norm": 0.7158511877059937, + "learning_rate": 0.0001597449333303992, + "loss": 2.6813, + "step": 5994 + }, + { + "epoch": 0.48381890081510776, + "grad_norm": 0.7411341667175293, + "learning_rate": 0.0001597322729674726, + "loss": 2.7231, + "step": 5995 + }, + { + "epoch": 0.48389960455169073, + "grad_norm": 0.7168158292770386, + "learning_rate": 0.0001597196111158616, + "loss": 2.6408, + "step": 5996 + }, + { + "epoch": 0.48398030828827376, + "grad_norm": 0.7603393793106079, + "learning_rate": 0.00015970694777588175, + "loss": 2.7821, + "step": 5997 + }, + { + "epoch": 0.48406101202485674, + "grad_norm": 0.7298564910888672, + "learning_rate": 0.0001596942829478487, + "loss": 2.6828, + "step": 5998 + }, + { + "epoch": 0.4841417157614398, + "grad_norm": 0.7850572466850281, + "learning_rate": 0.0001596816166320781, + "loss": 2.6191, + "step": 5999 + }, + { + "epoch": 0.48422241949802275, + "grad_norm": 0.7697601914405823, + "learning_rate": 0.00015966894882888562, + "loss": 2.6768, + "step": 6000 + }, + { + "epoch": 0.48422241949802275, + "eval_loss": 2.5610127449035645, + "eval_runtime": 760.0481, + "eval_samples_per_second": 3.447, + "eval_steps_per_second": 0.575, + "step": 6000 + }, + { + "epoch": 0.4843031232346058, + "grad_norm": 0.7212432026863098, + "learning_rate": 0.00015965627953858693, + "loss": 2.5967, + "step": 6001 + }, + { + "epoch": 0.48438382697118876, + "grad_norm": 0.7629631757736206, + "learning_rate": 0.0001596436087614978, + "loss": 2.7005, + "step": 6002 + }, + { + "epoch": 0.4844645307077718, + "grad_norm": 0.7154754400253296, + "learning_rate": 0.00015963093649793404, + "loss": 2.6909, + "step": 6003 + }, + { 
+ "epoch": 0.48454523444435477, + "grad_norm": 0.7365279793739319, + "learning_rate": 0.00015961826274821147, + "loss": 2.6268, + "step": 6004 + }, + { + "epoch": 0.4846259381809378, + "grad_norm": 0.8114632964134216, + "learning_rate": 0.00015960558751264596, + "loss": 2.6647, + "step": 6005 + }, + { + "epoch": 0.4847066419175208, + "grad_norm": 0.7411556243896484, + "learning_rate": 0.00015959291079155338, + "loss": 2.6378, + "step": 6006 + }, + { + "epoch": 0.4847873456541038, + "grad_norm": 0.7137390375137329, + "learning_rate": 0.00015958023258524968, + "loss": 2.6454, + "step": 6007 + }, + { + "epoch": 0.4848680493906868, + "grad_norm": 0.7477054595947266, + "learning_rate": 0.00015956755289405088, + "loss": 2.6463, + "step": 6008 + }, + { + "epoch": 0.4849487531272698, + "grad_norm": 0.7198071479797363, + "learning_rate": 0.0001595548717182729, + "loss": 2.6537, + "step": 6009 + }, + { + "epoch": 0.4850294568638528, + "grad_norm": 0.6697781085968018, + "learning_rate": 0.00015954218905823186, + "loss": 2.7018, + "step": 6010 + }, + { + "epoch": 0.4851101606004358, + "grad_norm": 0.7577201724052429, + "learning_rate": 0.00015952950491424382, + "loss": 2.6531, + "step": 6011 + }, + { + "epoch": 0.4851908643370188, + "grad_norm": 0.6852774024009705, + "learning_rate": 0.0001595168192866249, + "loss": 2.5819, + "step": 6012 + }, + { + "epoch": 0.48527156807360183, + "grad_norm": 0.7116097807884216, + "learning_rate": 0.0001595041321756913, + "loss": 2.5691, + "step": 6013 + }, + { + "epoch": 0.4853522718101848, + "grad_norm": 0.7478477954864502, + "learning_rate": 0.00015949144358175916, + "loss": 2.6658, + "step": 6014 + }, + { + "epoch": 0.48543297554676784, + "grad_norm": 0.816969633102417, + "learning_rate": 0.0001594787535051447, + "loss": 2.6709, + "step": 6015 + }, + { + "epoch": 0.4855136792833508, + "grad_norm": 0.6953164339065552, + "learning_rate": 0.00015946606194616427, + "loss": 2.6139, + "step": 6016 + }, + { + "epoch": 0.48559438301993385, + 
"grad_norm": 0.6698834300041199, + "learning_rate": 0.0001594533689051341, + "loss": 2.574, + "step": 6017 + }, + { + "epoch": 0.4856750867565168, + "grad_norm": 0.7686784267425537, + "learning_rate": 0.0001594406743823706, + "loss": 2.6271, + "step": 6018 + }, + { + "epoch": 0.4857557904930998, + "grad_norm": 0.7713280916213989, + "learning_rate": 0.00015942797837819009, + "loss": 2.6682, + "step": 6019 + }, + { + "epoch": 0.48583649422968284, + "grad_norm": 0.8102596998214722, + "learning_rate": 0.00015941528089290902, + "loss": 2.6771, + "step": 6020 + }, + { + "epoch": 0.4859171979662658, + "grad_norm": 0.7140331864356995, + "learning_rate": 0.00015940258192684382, + "loss": 2.6267, + "step": 6021 + }, + { + "epoch": 0.48599790170284884, + "grad_norm": 0.7057615518569946, + "learning_rate": 0.000159389881480311, + "loss": 2.6011, + "step": 6022 + }, + { + "epoch": 0.4860786054394318, + "grad_norm": 0.7106850147247314, + "learning_rate": 0.0001593771795536271, + "loss": 2.6681, + "step": 6023 + }, + { + "epoch": 0.48615930917601485, + "grad_norm": 0.7618210315704346, + "learning_rate": 0.00015936447614710867, + "loss": 2.6545, + "step": 6024 + }, + { + "epoch": 0.48624001291259783, + "grad_norm": 0.7577608227729797, + "learning_rate": 0.00015935177126107233, + "loss": 2.6479, + "step": 6025 + }, + { + "epoch": 0.48632071664918086, + "grad_norm": 0.758745551109314, + "learning_rate": 0.00015933906489583468, + "loss": 2.7057, + "step": 6026 + }, + { + "epoch": 0.48640142038576384, + "grad_norm": 0.785906970500946, + "learning_rate": 0.00015932635705171241, + "loss": 2.7081, + "step": 6027 + }, + { + "epoch": 0.48648212412234687, + "grad_norm": 0.6744558215141296, + "learning_rate": 0.00015931364772902228, + "loss": 2.6438, + "step": 6028 + }, + { + "epoch": 0.48656282785892985, + "grad_norm": 0.7451377511024475, + "learning_rate": 0.00015930093692808099, + "loss": 2.6509, + "step": 6029 + }, + { + "epoch": 0.4866435315955129, + "grad_norm": 0.6590149402618408, + 
"learning_rate": 0.0001592882246492053, + "loss": 2.5683, + "step": 6030 + }, + { + "epoch": 0.48672423533209586, + "grad_norm": 0.7433840036392212, + "learning_rate": 0.0001592755108927121, + "loss": 2.6647, + "step": 6031 + }, + { + "epoch": 0.4868049390686789, + "grad_norm": 0.876806378364563, + "learning_rate": 0.00015926279565891822, + "loss": 2.6482, + "step": 6032 + }, + { + "epoch": 0.48688564280526186, + "grad_norm": 0.7495005130767822, + "learning_rate": 0.00015925007894814058, + "loss": 2.6346, + "step": 6033 + }, + { + "epoch": 0.4869663465418449, + "grad_norm": 0.7005730271339417, + "learning_rate": 0.00015923736076069604, + "loss": 2.6241, + "step": 6034 + }, + { + "epoch": 0.4870470502784279, + "grad_norm": 0.664098858833313, + "learning_rate": 0.00015922464109690166, + "loss": 2.6281, + "step": 6035 + }, + { + "epoch": 0.4871277540150109, + "grad_norm": 0.7482514977455139, + "learning_rate": 0.00015921191995707442, + "loss": 2.5764, + "step": 6036 + }, + { + "epoch": 0.4872084577515939, + "grad_norm": 0.7450351715087891, + "learning_rate": 0.0001591991973415313, + "loss": 2.6433, + "step": 6037 + }, + { + "epoch": 0.4872891614881769, + "grad_norm": 0.6738519072532654, + "learning_rate": 0.00015918647325058948, + "loss": 2.6688, + "step": 6038 + }, + { + "epoch": 0.4873698652247599, + "grad_norm": 0.7999960780143738, + "learning_rate": 0.000159173747684566, + "loss": 2.6309, + "step": 6039 + }, + { + "epoch": 0.4874505689613429, + "grad_norm": 0.7249687910079956, + "learning_rate": 0.00015916102064377806, + "loss": 2.5808, + "step": 6040 + }, + { + "epoch": 0.4875312726979259, + "grad_norm": 0.7014601826667786, + "learning_rate": 0.00015914829212854286, + "loss": 2.6646, + "step": 6041 + }, + { + "epoch": 0.48761197643450893, + "grad_norm": 0.7091174721717834, + "learning_rate": 0.00015913556213917757, + "loss": 2.6576, + "step": 6042 + }, + { + "epoch": 0.4876926801710919, + "grad_norm": 0.6949019432067871, + "learning_rate": 0.00015912283067599952, 
+ "loss": 2.5883, + "step": 6043 + }, + { + "epoch": 0.48777338390767494, + "grad_norm": 0.6990448236465454, + "learning_rate": 0.00015911009773932598, + "loss": 2.6413, + "step": 6044 + }, + { + "epoch": 0.4878540876442579, + "grad_norm": 0.7106831073760986, + "learning_rate": 0.00015909736332947425, + "loss": 2.6122, + "step": 6045 + }, + { + "epoch": 0.48793479138084095, + "grad_norm": 0.7052395343780518, + "learning_rate": 0.00015908462744676177, + "loss": 2.572, + "step": 6046 + }, + { + "epoch": 0.4880154951174239, + "grad_norm": 0.7250158190727234, + "learning_rate": 0.00015907189009150592, + "loss": 2.6582, + "step": 6047 + }, + { + "epoch": 0.48809619885400696, + "grad_norm": 0.7213590145111084, + "learning_rate": 0.00015905915126402414, + "loss": 2.7025, + "step": 6048 + }, + { + "epoch": 0.48817690259058993, + "grad_norm": 0.7136254906654358, + "learning_rate": 0.00015904641096463394, + "loss": 2.6823, + "step": 6049 + }, + { + "epoch": 0.48825760632717297, + "grad_norm": 0.7163361310958862, + "learning_rate": 0.00015903366919365282, + "loss": 2.6642, + "step": 6050 + }, + { + "epoch": 0.48833831006375594, + "grad_norm": 0.6842724680900574, + "learning_rate": 0.00015902092595139838, + "loss": 2.6599, + "step": 6051 + }, + { + "epoch": 0.488419013800339, + "grad_norm": 0.7426519393920898, + "learning_rate": 0.0001590081812381882, + "loss": 2.6271, + "step": 6052 + }, + { + "epoch": 0.48849971753692195, + "grad_norm": 0.7415586709976196, + "learning_rate": 0.00015899543505433985, + "loss": 2.6105, + "step": 6053 + }, + { + "epoch": 0.488580421273505, + "grad_norm": 0.7286739945411682, + "learning_rate": 0.00015898268740017105, + "loss": 2.6304, + "step": 6054 + }, + { + "epoch": 0.48866112501008796, + "grad_norm": 0.6898483633995056, + "learning_rate": 0.00015896993827599947, + "loss": 2.6237, + "step": 6055 + }, + { + "epoch": 0.488741828746671, + "grad_norm": 0.7020056247711182, + "learning_rate": 0.00015895718768214293, + "loss": 2.6166, + "step": 6056 
+ }, + { + "epoch": 0.48882253248325397, + "grad_norm": 0.7145286798477173, + "learning_rate": 0.00015894443561891914, + "loss": 2.6729, + "step": 6057 + }, + { + "epoch": 0.488903236219837, + "grad_norm": 0.6888289451599121, + "learning_rate": 0.00015893168208664594, + "loss": 2.6154, + "step": 6058 + }, + { + "epoch": 0.48898393995642, + "grad_norm": 0.6929970383644104, + "learning_rate": 0.00015891892708564116, + "loss": 2.6748, + "step": 6059 + }, + { + "epoch": 0.489064643693003, + "grad_norm": 0.679853618144989, + "learning_rate": 0.0001589061706162227, + "loss": 2.605, + "step": 6060 + }, + { + "epoch": 0.489145347429586, + "grad_norm": 0.71812504529953, + "learning_rate": 0.0001588934126787085, + "loss": 2.7249, + "step": 6061 + }, + { + "epoch": 0.489226051166169, + "grad_norm": 0.7083466053009033, + "learning_rate": 0.00015888065327341648, + "loss": 2.5986, + "step": 6062 + }, + { + "epoch": 0.489306754902752, + "grad_norm": 0.7476792931556702, + "learning_rate": 0.00015886789240066466, + "loss": 2.5942, + "step": 6063 + }, + { + "epoch": 0.489387458639335, + "grad_norm": 0.7197855114936829, + "learning_rate": 0.00015885513006077114, + "loss": 2.6198, + "step": 6064 + }, + { + "epoch": 0.489468162375918, + "grad_norm": 0.6678233742713928, + "learning_rate": 0.00015884236625405385, + "loss": 2.5793, + "step": 6065 + }, + { + "epoch": 0.48954886611250104, + "grad_norm": 0.7371037602424622, + "learning_rate": 0.00015882960098083105, + "loss": 2.6231, + "step": 6066 + }, + { + "epoch": 0.489629569849084, + "grad_norm": 0.7087417244911194, + "learning_rate": 0.00015881683424142078, + "loss": 2.6483, + "step": 6067 + }, + { + "epoch": 0.48971027358566704, + "grad_norm": 0.7300292253494263, + "learning_rate": 0.00015880406603614126, + "loss": 2.6778, + "step": 6068 + }, + { + "epoch": 0.48979097732225, + "grad_norm": 0.8347866535186768, + "learning_rate": 0.0001587912963653107, + "loss": 2.554, + "step": 6069 + }, + { + "epoch": 0.489871681058833, + "grad_norm": 
0.7717794179916382, + "learning_rate": 0.00015877852522924732, + "loss": 2.6904, + "step": 6070 + }, + { + "epoch": 0.48995238479541603, + "grad_norm": 0.6960952281951904, + "learning_rate": 0.00015876575262826944, + "loss": 2.6059, + "step": 6071 + }, + { + "epoch": 0.490033088531999, + "grad_norm": 0.7316592931747437, + "learning_rate": 0.00015875297856269543, + "loss": 2.6685, + "step": 6072 + }, + { + "epoch": 0.49011379226858204, + "grad_norm": 0.6775457859039307, + "learning_rate": 0.00015874020303284362, + "loss": 2.6232, + "step": 6073 + }, + { + "epoch": 0.490194496005165, + "grad_norm": 0.7741925120353699, + "learning_rate": 0.00015872742603903237, + "loss": 2.6767, + "step": 6074 + }, + { + "epoch": 0.49027519974174805, + "grad_norm": 0.857490599155426, + "learning_rate": 0.00015871464758158017, + "loss": 2.6649, + "step": 6075 + }, + { + "epoch": 0.490355903478331, + "grad_norm": 0.7474274039268494, + "learning_rate": 0.00015870186766080545, + "loss": 2.6926, + "step": 6076 + }, + { + "epoch": 0.49043660721491406, + "grad_norm": 0.7266567945480347, + "learning_rate": 0.00015868908627702675, + "loss": 2.5919, + "step": 6077 + }, + { + "epoch": 0.49051731095149703, + "grad_norm": 0.7247830629348755, + "learning_rate": 0.0001586763034305626, + "loss": 2.6158, + "step": 6078 + }, + { + "epoch": 0.49059801468808006, + "grad_norm": 0.7654951214790344, + "learning_rate": 0.00015866351912173157, + "loss": 2.7236, + "step": 6079 + }, + { + "epoch": 0.49067871842466304, + "grad_norm": 0.732431948184967, + "learning_rate": 0.00015865073335085236, + "loss": 2.6349, + "step": 6080 + }, + { + "epoch": 0.4907594221612461, + "grad_norm": 0.7240673303604126, + "learning_rate": 0.0001586379461182435, + "loss": 2.6282, + "step": 6081 + }, + { + "epoch": 0.49084012589782905, + "grad_norm": 0.767473042011261, + "learning_rate": 0.00015862515742422374, + "loss": 2.6939, + "step": 6082 + }, + { + "epoch": 0.4909208296344121, + "grad_norm": 0.6977359056472778, + 
"learning_rate": 0.00015861236726911183, + "loss": 2.6591, + "step": 6083 + }, + { + "epoch": 0.49100153337099506, + "grad_norm": 0.7676639556884766, + "learning_rate": 0.00015859957565322655, + "loss": 2.6189, + "step": 6084 + }, + { + "epoch": 0.4910822371075781, + "grad_norm": 0.7157976031303406, + "learning_rate": 0.0001585867825768866, + "loss": 2.644, + "step": 6085 + }, + { + "epoch": 0.49116294084416107, + "grad_norm": 0.7080803513526917, + "learning_rate": 0.0001585739880404109, + "loss": 2.6099, + "step": 6086 + }, + { + "epoch": 0.4912436445807441, + "grad_norm": 0.7109760046005249, + "learning_rate": 0.0001585611920441183, + "loss": 2.7087, + "step": 6087 + }, + { + "epoch": 0.4913243483173271, + "grad_norm": 0.7274255156517029, + "learning_rate": 0.00015854839458832772, + "loss": 2.6394, + "step": 6088 + }, + { + "epoch": 0.4914050520539101, + "grad_norm": 0.7407883405685425, + "learning_rate": 0.00015853559567335812, + "loss": 2.6729, + "step": 6089 + }, + { + "epoch": 0.4914857557904931, + "grad_norm": 0.6879885196685791, + "learning_rate": 0.00015852279529952843, + "loss": 2.5971, + "step": 6090 + }, + { + "epoch": 0.4915664595270761, + "grad_norm": 0.7678415179252625, + "learning_rate": 0.00015850999346715772, + "loss": 2.6606, + "step": 6091 + }, + { + "epoch": 0.4916471632636591, + "grad_norm": 0.7108608484268188, + "learning_rate": 0.00015849719017656504, + "loss": 2.6494, + "step": 6092 + }, + { + "epoch": 0.4917278670002421, + "grad_norm": 0.7238833904266357, + "learning_rate": 0.00015848438542806945, + "loss": 2.6742, + "step": 6093 + }, + { + "epoch": 0.4918085707368251, + "grad_norm": 0.7316902279853821, + "learning_rate": 0.0001584715792219901, + "loss": 2.6757, + "step": 6094 + }, + { + "epoch": 0.49188927447340813, + "grad_norm": 0.7339446544647217, + "learning_rate": 0.00015845877155864612, + "loss": 2.607, + "step": 6095 + }, + { + "epoch": 0.4919699782099911, + "grad_norm": 0.6931337714195251, + "learning_rate": 0.0001584459624383568, 
+ "loss": 2.6203, + "step": 6096 + }, + { + "epoch": 0.49205068194657414, + "grad_norm": 0.734229326248169, + "learning_rate": 0.00015843315186144126, + "loss": 2.646, + "step": 6097 + }, + { + "epoch": 0.4921313856831571, + "grad_norm": 0.7764919400215149, + "learning_rate": 0.00015842033982821883, + "loss": 2.6698, + "step": 6098 + }, + { + "epoch": 0.49221208941974015, + "grad_norm": 0.7707986235618591, + "learning_rate": 0.00015840752633900887, + "loss": 2.6995, + "step": 6099 + }, + { + "epoch": 0.4922927931563231, + "grad_norm": 0.7321949601173401, + "learning_rate": 0.00015839471139413066, + "loss": 2.6517, + "step": 6100 + }, + { + "epoch": 0.49237349689290616, + "grad_norm": 0.7087488770484924, + "learning_rate": 0.00015838189499390353, + "loss": 2.6153, + "step": 6101 + }, + { + "epoch": 0.49245420062948914, + "grad_norm": 0.7300730347633362, + "learning_rate": 0.00015836907713864706, + "loss": 2.5868, + "step": 6102 + }, + { + "epoch": 0.49253490436607217, + "grad_norm": 0.8476536273956299, + "learning_rate": 0.00015835625782868054, + "loss": 2.7158, + "step": 6103 + }, + { + "epoch": 0.49261560810265514, + "grad_norm": 0.8062012791633606, + "learning_rate": 0.0001583434370643236, + "loss": 2.6896, + "step": 6104 + }, + { + "epoch": 0.4926963118392382, + "grad_norm": 0.7336686849594116, + "learning_rate": 0.00015833061484589562, + "loss": 2.6416, + "step": 6105 + }, + { + "epoch": 0.49277701557582115, + "grad_norm": 0.6976929306983948, + "learning_rate": 0.00015831779117371627, + "loss": 2.6279, + "step": 6106 + }, + { + "epoch": 0.4928577193124042, + "grad_norm": 0.7262609601020813, + "learning_rate": 0.00015830496604810513, + "loss": 2.6144, + "step": 6107 + }, + { + "epoch": 0.49293842304898716, + "grad_norm": 0.7274572253227234, + "learning_rate": 0.00015829213946938183, + "loss": 2.7409, + "step": 6108 + }, + { + "epoch": 0.4930191267855702, + "grad_norm": 0.7438454031944275, + "learning_rate": 0.000158279311437866, + "loss": 2.5928, + "step": 6109 
+ }, + { + "epoch": 0.49309983052215317, + "grad_norm": 0.6885421872138977, + "learning_rate": 0.00015826648195387742, + "loss": 2.6659, + "step": 6110 + }, + { + "epoch": 0.4931805342587362, + "grad_norm": 0.6781450510025024, + "learning_rate": 0.0001582536510177358, + "loss": 2.6068, + "step": 6111 + }, + { + "epoch": 0.4932612379953192, + "grad_norm": 0.7618128657341003, + "learning_rate": 0.0001582408186297609, + "loss": 2.6705, + "step": 6112 + }, + { + "epoch": 0.4933419417319022, + "grad_norm": 0.7011203765869141, + "learning_rate": 0.00015822798479027256, + "loss": 2.596, + "step": 6113 + }, + { + "epoch": 0.4934226454684852, + "grad_norm": 0.7727806568145752, + "learning_rate": 0.00015821514949959065, + "loss": 2.6458, + "step": 6114 + }, + { + "epoch": 0.4935033492050682, + "grad_norm": 0.7318129539489746, + "learning_rate": 0.00015820231275803502, + "loss": 2.6009, + "step": 6115 + }, + { + "epoch": 0.4935840529416512, + "grad_norm": 0.6836227178573608, + "learning_rate": 0.00015818947456592563, + "loss": 2.6311, + "step": 6116 + }, + { + "epoch": 0.49366475667823423, + "grad_norm": 0.7657275199890137, + "learning_rate": 0.0001581766349235824, + "loss": 2.6079, + "step": 6117 + }, + { + "epoch": 0.4937454604148172, + "grad_norm": 0.74736487865448, + "learning_rate": 0.0001581637938313254, + "loss": 2.6752, + "step": 6118 + }, + { + "epoch": 0.49382616415140024, + "grad_norm": 0.716708242893219, + "learning_rate": 0.00015815095128947454, + "loss": 2.5896, + "step": 6119 + }, + { + "epoch": 0.4939068678879832, + "grad_norm": 0.740727424621582, + "learning_rate": 0.00015813810729835002, + "loss": 2.6528, + "step": 6120 + }, + { + "epoch": 0.4939875716245662, + "grad_norm": 0.6746687293052673, + "learning_rate": 0.0001581252618582719, + "loss": 2.6438, + "step": 6121 + }, + { + "epoch": 0.4940682753611492, + "grad_norm": 0.7547900080680847, + "learning_rate": 0.00015811241496956028, + "loss": 2.631, + "step": 6122 + }, + { + "epoch": 0.4941489790977322, + 
"grad_norm": 0.7500903606414795, + "learning_rate": 0.0001580995666325354, + "loss": 2.7039, + "step": 6123 + }, + { + "epoch": 0.49422968283431523, + "grad_norm": 0.7692849636077881, + "learning_rate": 0.00015808671684751743, + "loss": 2.5922, + "step": 6124 + }, + { + "epoch": 0.4943103865708982, + "grad_norm": 0.6964236497879028, + "learning_rate": 0.00015807386561482662, + "loss": 2.6239, + "step": 6125 + }, + { + "epoch": 0.49439109030748124, + "grad_norm": 0.7094165086746216, + "learning_rate": 0.0001580610129347833, + "loss": 2.6239, + "step": 6126 + }, + { + "epoch": 0.4944717940440642, + "grad_norm": 0.7579131126403809, + "learning_rate": 0.00015804815880770775, + "loss": 2.6654, + "step": 6127 + }, + { + "epoch": 0.49455249778064725, + "grad_norm": 0.7687693238258362, + "learning_rate": 0.00015803530323392034, + "loss": 2.6557, + "step": 6128 + }, + { + "epoch": 0.4946332015172302, + "grad_norm": 0.6913540363311768, + "learning_rate": 0.0001580224462137415, + "loss": 2.6299, + "step": 6129 + }, + { + "epoch": 0.49471390525381326, + "grad_norm": 0.7574129700660706, + "learning_rate": 0.0001580095877474916, + "loss": 2.6327, + "step": 6130 + }, + { + "epoch": 0.49479460899039623, + "grad_norm": 0.6834598183631897, + "learning_rate": 0.0001579967278354911, + "loss": 2.6402, + "step": 6131 + }, + { + "epoch": 0.49487531272697927, + "grad_norm": 0.7872750163078308, + "learning_rate": 0.00015798386647806057, + "loss": 2.6647, + "step": 6132 + }, + { + "epoch": 0.49495601646356224, + "grad_norm": 0.705211341381073, + "learning_rate": 0.00015797100367552055, + "loss": 2.6288, + "step": 6133 + }, + { + "epoch": 0.4950367202001453, + "grad_norm": 0.7302640080451965, + "learning_rate": 0.00015795813942819155, + "loss": 2.6683, + "step": 6134 + }, + { + "epoch": 0.49511742393672825, + "grad_norm": 0.7522360682487488, + "learning_rate": 0.0001579452737363942, + "loss": 2.5885, + "step": 6135 + }, + { + "epoch": 0.4951981276733113, + "grad_norm": 0.657376229763031, + 
"learning_rate": 0.0001579324066004492, + "loss": 2.5775, + "step": 6136 + }, + { + "epoch": 0.49527883140989426, + "grad_norm": 0.7539556622505188, + "learning_rate": 0.00015791953802067715, + "loss": 2.6236, + "step": 6137 + }, + { + "epoch": 0.4953595351464773, + "grad_norm": 0.7090374827384949, + "learning_rate": 0.00015790666799739883, + "loss": 2.5845, + "step": 6138 + }, + { + "epoch": 0.49544023888306027, + "grad_norm": 0.6883948445320129, + "learning_rate": 0.00015789379653093497, + "loss": 2.6621, + "step": 6139 + }, + { + "epoch": 0.4955209426196433, + "grad_norm": 0.7466424107551575, + "learning_rate": 0.00015788092362160633, + "loss": 2.6289, + "step": 6140 + }, + { + "epoch": 0.4956016463562263, + "grad_norm": 0.7424437403678894, + "learning_rate": 0.00015786804926973383, + "loss": 2.6405, + "step": 6141 + }, + { + "epoch": 0.4956823500928093, + "grad_norm": 0.7227851748466492, + "learning_rate": 0.00015785517347563822, + "loss": 2.6537, + "step": 6142 + }, + { + "epoch": 0.4957630538293923, + "grad_norm": 0.7548653483390808, + "learning_rate": 0.00015784229623964048, + "loss": 2.7377, + "step": 6143 + }, + { + "epoch": 0.4958437575659753, + "grad_norm": 0.7086976170539856, + "learning_rate": 0.00015782941756206152, + "loss": 2.6194, + "step": 6144 + }, + { + "epoch": 0.4959244613025583, + "grad_norm": 0.6605533957481384, + "learning_rate": 0.0001578165374432223, + "loss": 2.6265, + "step": 6145 + }, + { + "epoch": 0.4960051650391413, + "grad_norm": 0.7187899947166443, + "learning_rate": 0.00015780365588344384, + "loss": 2.5639, + "step": 6146 + }, + { + "epoch": 0.4960858687757243, + "grad_norm": 0.7014074921607971, + "learning_rate": 0.00015779077288304716, + "loss": 2.6011, + "step": 6147 + }, + { + "epoch": 0.49616657251230734, + "grad_norm": 0.7463840842247009, + "learning_rate": 0.00015777788844235335, + "loss": 2.6059, + "step": 6148 + }, + { + "epoch": 0.4962472762488903, + "grad_norm": 0.8022417426109314, + "learning_rate": 
0.00015776500256168356, + "loss": 2.6011, + "step": 6149 + }, + { + "epoch": 0.49632797998547334, + "grad_norm": 0.7140083909034729, + "learning_rate": 0.0001577521152413589, + "loss": 2.6891, + "step": 6150 + }, + { + "epoch": 0.4964086837220563, + "grad_norm": 0.7266198992729187, + "learning_rate": 0.00015773922648170053, + "loss": 2.6561, + "step": 6151 + }, + { + "epoch": 0.49648938745863935, + "grad_norm": 0.7241406440734863, + "learning_rate": 0.0001577263362830297, + "loss": 2.6835, + "step": 6152 + }, + { + "epoch": 0.49657009119522233, + "grad_norm": 0.7422344088554382, + "learning_rate": 0.0001577134446456677, + "loss": 2.6039, + "step": 6153 + }, + { + "epoch": 0.49665079493180536, + "grad_norm": 0.8764764666557312, + "learning_rate": 0.0001577005515699358, + "loss": 2.68, + "step": 6154 + }, + { + "epoch": 0.49673149866838834, + "grad_norm": 0.7224323749542236, + "learning_rate": 0.0001576876570561553, + "loss": 2.5824, + "step": 6155 + }, + { + "epoch": 0.49681220240497137, + "grad_norm": 0.7601075172424316, + "learning_rate": 0.00015767476110464758, + "loss": 2.7124, + "step": 6156 + }, + { + "epoch": 0.49689290614155435, + "grad_norm": 0.7425428628921509, + "learning_rate": 0.0001576618637157341, + "loss": 2.5913, + "step": 6157 + }, + { + "epoch": 0.4969736098781374, + "grad_norm": 0.721969723701477, + "learning_rate": 0.0001576489648897362, + "loss": 2.6482, + "step": 6158 + }, + { + "epoch": 0.49705431361472036, + "grad_norm": 0.8142126798629761, + "learning_rate": 0.00015763606462697544, + "loss": 2.6231, + "step": 6159 + }, + { + "epoch": 0.4971350173513034, + "grad_norm": 0.6636359691619873, + "learning_rate": 0.00015762316292777326, + "loss": 2.6388, + "step": 6160 + }, + { + "epoch": 0.49721572108788636, + "grad_norm": 0.7093132734298706, + "learning_rate": 0.00015761025979245123, + "loss": 2.6562, + "step": 6161 + }, + { + "epoch": 0.4972964248244694, + "grad_norm": 0.7130851745605469, + "learning_rate": 0.00015759735522133094, + "loss": 
2.6856, + "step": 6162 + }, + { + "epoch": 0.4973771285610524, + "grad_norm": 0.7303292155265808, + "learning_rate": 0.000157584449214734, + "loss": 2.6077, + "step": 6163 + }, + { + "epoch": 0.4974578322976354, + "grad_norm": 0.6742258071899414, + "learning_rate": 0.00015757154177298204, + "loss": 2.6644, + "step": 6164 + }, + { + "epoch": 0.4975385360342184, + "grad_norm": 0.6882894039154053, + "learning_rate": 0.00015755863289639677, + "loss": 2.6462, + "step": 6165 + }, + { + "epoch": 0.4976192397708014, + "grad_norm": 0.7882276773452759, + "learning_rate": 0.00015754572258529993, + "loss": 2.6509, + "step": 6166 + }, + { + "epoch": 0.4976999435073844, + "grad_norm": 0.7163859009742737, + "learning_rate": 0.00015753281084001324, + "loss": 2.627, + "step": 6167 + }, + { + "epoch": 0.4977806472439674, + "grad_norm": 0.7194411158561707, + "learning_rate": 0.0001575198976608585, + "loss": 2.6798, + "step": 6168 + }, + { + "epoch": 0.4978613509805504, + "grad_norm": 0.7233198881149292, + "learning_rate": 0.0001575069830481576, + "loss": 2.6616, + "step": 6169 + }, + { + "epoch": 0.49794205471713343, + "grad_norm": 0.7246997952461243, + "learning_rate": 0.00015749406700223231, + "loss": 2.6262, + "step": 6170 + }, + { + "epoch": 0.4980227584537164, + "grad_norm": 0.7509368658065796, + "learning_rate": 0.00015748114952340457, + "loss": 2.6148, + "step": 6171 + }, + { + "epoch": 0.4981034621902994, + "grad_norm": 0.7079075574874878, + "learning_rate": 0.00015746823061199637, + "loss": 2.6712, + "step": 6172 + }, + { + "epoch": 0.4981841659268824, + "grad_norm": 0.6821560859680176, + "learning_rate": 0.0001574553102683296, + "loss": 2.6253, + "step": 6173 + }, + { + "epoch": 0.4982648696634654, + "grad_norm": 0.7623000741004944, + "learning_rate": 0.00015744238849272634, + "loss": 2.6252, + "step": 6174 + }, + { + "epoch": 0.4983455734000484, + "grad_norm": 0.709434449672699, + "learning_rate": 0.00015742946528550858, + "loss": 2.555, + "step": 6175 + }, + { + "epoch": 
0.4984262771366314, + "grad_norm": 0.7277799844741821, + "learning_rate": 0.00015741654064699846, + "loss": 2.6551, + "step": 6176 + }, + { + "epoch": 0.49850698087321443, + "grad_norm": 0.7208690643310547, + "learning_rate": 0.00015740361457751802, + "loss": 2.6747, + "step": 6177 + }, + { + "epoch": 0.4985876846097974, + "grad_norm": 0.8458136916160583, + "learning_rate": 0.00015739068707738946, + "loss": 2.6551, + "step": 6178 + }, + { + "epoch": 0.49866838834638044, + "grad_norm": 0.7718539834022522, + "learning_rate": 0.00015737775814693498, + "loss": 2.6246, + "step": 6179 + }, + { + "epoch": 0.4987490920829634, + "grad_norm": 0.6982735395431519, + "learning_rate": 0.00015736482778647674, + "loss": 2.5726, + "step": 6180 + }, + { + "epoch": 0.49882979581954645, + "grad_norm": 0.6759411692619324, + "learning_rate": 0.00015735189599633707, + "loss": 2.6603, + "step": 6181 + }, + { + "epoch": 0.4989104995561294, + "grad_norm": 0.7016656994819641, + "learning_rate": 0.0001573389627768382, + "loss": 2.6045, + "step": 6182 + }, + { + "epoch": 0.49899120329271246, + "grad_norm": 0.7170618176460266, + "learning_rate": 0.00015732602812830253, + "loss": 2.6419, + "step": 6183 + }, + { + "epoch": 0.49907190702929544, + "grad_norm": 0.6963300704956055, + "learning_rate": 0.00015731309205105237, + "loss": 2.6377, + "step": 6184 + }, + { + "epoch": 0.49915261076587847, + "grad_norm": 0.7437995672225952, + "learning_rate": 0.00015730015454541014, + "loss": 2.7013, + "step": 6185 + }, + { + "epoch": 0.49923331450246144, + "grad_norm": 0.6846518516540527, + "learning_rate": 0.00015728721561169827, + "loss": 2.5526, + "step": 6186 + }, + { + "epoch": 0.4993140182390445, + "grad_norm": 0.7343618273735046, + "learning_rate": 0.00015727427525023924, + "loss": 2.6567, + "step": 6187 + }, + { + "epoch": 0.49939472197562745, + "grad_norm": 0.6947566270828247, + "learning_rate": 0.00015726133346135554, + "loss": 2.6642, + "step": 6188 + }, + { + "epoch": 0.4994754257122105, + 
"grad_norm": 0.7402610778808594, + "learning_rate": 0.00015724839024536976, + "loss": 2.6964, + "step": 6189 + }, + { + "epoch": 0.49955612944879346, + "grad_norm": 0.7318306565284729, + "learning_rate": 0.00015723544560260444, + "loss": 2.5864, + "step": 6190 + }, + { + "epoch": 0.4996368331853765, + "grad_norm": 0.752216100692749, + "learning_rate": 0.00015722249953338215, + "loss": 2.6357, + "step": 6191 + }, + { + "epoch": 0.49971753692195947, + "grad_norm": 0.70283442735672, + "learning_rate": 0.00015720955203802565, + "loss": 2.5892, + "step": 6192 + }, + { + "epoch": 0.4997982406585425, + "grad_norm": 0.7457823753356934, + "learning_rate": 0.00015719660311685755, + "loss": 2.6663, + "step": 6193 + }, + { + "epoch": 0.4998789443951255, + "grad_norm": 0.7296229600906372, + "learning_rate": 0.00015718365277020058, + "loss": 2.6238, + "step": 6194 + }, + { + "epoch": 0.4999596481317085, + "grad_norm": 0.6963346004486084, + "learning_rate": 0.0001571707009983775, + "loss": 2.6303, + "step": 6195 + }, + { + "epoch": 0.5000403518682915, + "grad_norm": 0.7074694633483887, + "learning_rate": 0.0001571577478017111, + "loss": 2.6077, + "step": 6196 + }, + { + "epoch": 0.5001210556048745, + "grad_norm": 0.7826260328292847, + "learning_rate": 0.00015714479318052423, + "loss": 2.6668, + "step": 6197 + }, + { + "epoch": 0.5002017593414575, + "grad_norm": 0.6908758282661438, + "learning_rate": 0.00015713183713513974, + "loss": 2.6195, + "step": 6198 + }, + { + "epoch": 0.5002824630780405, + "grad_norm": 0.7571602463722229, + "learning_rate": 0.0001571188796658805, + "loss": 2.6546, + "step": 6199 + }, + { + "epoch": 0.5003631668146236, + "grad_norm": 0.7359431385993958, + "learning_rate": 0.0001571059207730695, + "loss": 2.5792, + "step": 6200 + }, + { + "epoch": 0.5004438705512065, + "grad_norm": 0.6886340379714966, + "learning_rate": 0.00015709296045702967, + "loss": 2.6099, + "step": 6201 + }, + { + "epoch": 0.5005245742877895, + "grad_norm": 0.6900473833084106, + 
"learning_rate": 0.000157079998718084, + "loss": 2.6461, + "step": 6202 + }, + { + "epoch": 0.5006052780243725, + "grad_norm": 0.66212397813797, + "learning_rate": 0.00015706703555655555, + "loss": 2.6178, + "step": 6203 + }, + { + "epoch": 0.5006859817609556, + "grad_norm": 0.7666565179824829, + "learning_rate": 0.00015705407097276744, + "loss": 2.7097, + "step": 6204 + }, + { + "epoch": 0.5007666854975386, + "grad_norm": 0.7294591069221497, + "learning_rate": 0.0001570411049670427, + "loss": 2.5995, + "step": 6205 + }, + { + "epoch": 0.5008473892341215, + "grad_norm": 0.7279765009880066, + "learning_rate": 0.00015702813753970453, + "loss": 2.5554, + "step": 6206 + }, + { + "epoch": 0.5009280929707045, + "grad_norm": 0.7174742817878723, + "learning_rate": 0.0001570151686910761, + "loss": 2.6523, + "step": 6207 + }, + { + "epoch": 0.5010087967072876, + "grad_norm": 0.67017662525177, + "learning_rate": 0.00015700219842148063, + "loss": 2.5613, + "step": 6208 + }, + { + "epoch": 0.5010895004438706, + "grad_norm": 0.7000258564949036, + "learning_rate": 0.00015698922673124138, + "loss": 2.5658, + "step": 6209 + }, + { + "epoch": 0.5011702041804535, + "grad_norm": 0.6894544363021851, + "learning_rate": 0.00015697625362068164, + "loss": 2.6925, + "step": 6210 + }, + { + "epoch": 0.5012509079170365, + "grad_norm": 0.6742957234382629, + "learning_rate": 0.00015696327909012466, + "loss": 2.6429, + "step": 6211 + }, + { + "epoch": 0.5013316116536196, + "grad_norm": 0.7039656639099121, + "learning_rate": 0.0001569503031398939, + "loss": 2.6313, + "step": 6212 + }, + { + "epoch": 0.5014123153902026, + "grad_norm": 0.720003604888916, + "learning_rate": 0.00015693732577031272, + "loss": 2.6207, + "step": 6213 + }, + { + "epoch": 0.5014930191267856, + "grad_norm": 0.8611499071121216, + "learning_rate": 0.00015692434698170456, + "loss": 2.6855, + "step": 6214 + }, + { + "epoch": 0.5015737228633685, + "grad_norm": 0.6664702296257019, + "learning_rate": 0.00015691136677439284, + 
"loss": 2.6174, + "step": 6215 + }, + { + "epoch": 0.5016544265999516, + "grad_norm": 0.7258509993553162, + "learning_rate": 0.00015689838514870111, + "loss": 2.6558, + "step": 6216 + }, + { + "epoch": 0.5017351303365346, + "grad_norm": 0.6972211599349976, + "learning_rate": 0.0001568854021049529, + "loss": 2.5913, + "step": 6217 + }, + { + "epoch": 0.5018158340731176, + "grad_norm": 0.7927280068397522, + "learning_rate": 0.00015687241764347177, + "loss": 2.6466, + "step": 6218 + }, + { + "epoch": 0.5018965378097006, + "grad_norm": 0.7044646143913269, + "learning_rate": 0.00015685943176458128, + "loss": 2.6195, + "step": 6219 + }, + { + "epoch": 0.5019772415462836, + "grad_norm": 0.6935598254203796, + "learning_rate": 0.00015684644446860516, + "loss": 2.6486, + "step": 6220 + }, + { + "epoch": 0.5020579452828666, + "grad_norm": 0.7965792417526245, + "learning_rate": 0.00015683345575586704, + "loss": 2.6265, + "step": 6221 + }, + { + "epoch": 0.5021386490194496, + "grad_norm": 0.727053701877594, + "learning_rate": 0.00015682046562669064, + "loss": 2.6714, + "step": 6222 + }, + { + "epoch": 0.5022193527560326, + "grad_norm": 0.7919184565544128, + "learning_rate": 0.0001568074740813997, + "loss": 2.7115, + "step": 6223 + }, + { + "epoch": 0.5023000564926156, + "grad_norm": 0.7724714279174805, + "learning_rate": 0.00015679448112031801, + "loss": 2.6636, + "step": 6224 + }, + { + "epoch": 0.5023807602291986, + "grad_norm": 0.6893701553344727, + "learning_rate": 0.0001567814867437694, + "loss": 2.6562, + "step": 6225 + }, + { + "epoch": 0.5024614639657816, + "grad_norm": 0.7089633345603943, + "learning_rate": 0.00015676849095207769, + "loss": 2.6125, + "step": 6226 + }, + { + "epoch": 0.5025421677023646, + "grad_norm": 0.7620012760162354, + "learning_rate": 0.00015675549374556682, + "loss": 2.6935, + "step": 6227 + }, + { + "epoch": 0.5026228714389476, + "grad_norm": 0.7293741703033447, + "learning_rate": 0.00015674249512456065, + "loss": 2.66, + "step": 6228 + }, + { + 
"epoch": 0.5027035751755307, + "grad_norm": 0.7366519570350647, + "learning_rate": 0.00015672949508938318, + "loss": 2.5968, + "step": 6229 + }, + { + "epoch": 0.5027842789121136, + "grad_norm": 0.6646310091018677, + "learning_rate": 0.00015671649364035846, + "loss": 2.5751, + "step": 6230 + }, + { + "epoch": 0.5028649826486966, + "grad_norm": 0.6682632565498352, + "learning_rate": 0.00015670349077781038, + "loss": 2.5902, + "step": 6231 + }, + { + "epoch": 0.5029456863852796, + "grad_norm": 0.7327528595924377, + "learning_rate": 0.00015669048650206313, + "loss": 2.6487, + "step": 6232 + }, + { + "epoch": 0.5030263901218627, + "grad_norm": 0.7114281058311462, + "learning_rate": 0.00015667748081344074, + "loss": 2.5779, + "step": 6233 + }, + { + "epoch": 0.5031070938584457, + "grad_norm": 0.7908105850219727, + "learning_rate": 0.00015666447371226737, + "loss": 2.6099, + "step": 6234 + }, + { + "epoch": 0.5031877975950286, + "grad_norm": 0.7823575139045715, + "learning_rate": 0.00015665146519886725, + "loss": 2.6339, + "step": 6235 + }, + { + "epoch": 0.5032685013316116, + "grad_norm": 0.7404836416244507, + "learning_rate": 0.00015663845527356447, + "loss": 2.6035, + "step": 6236 + }, + { + "epoch": 0.5033492050681947, + "grad_norm": 0.7448995113372803, + "learning_rate": 0.00015662544393668334, + "loss": 2.6566, + "step": 6237 + }, + { + "epoch": 0.5034299088047777, + "grad_norm": 0.7209747433662415, + "learning_rate": 0.00015661243118854815, + "loss": 2.682, + "step": 6238 + }, + { + "epoch": 0.5035106125413606, + "grad_norm": 0.691759467124939, + "learning_rate": 0.00015659941702948315, + "loss": 2.6435, + "step": 6239 + }, + { + "epoch": 0.5035913162779436, + "grad_norm": 0.7646063566207886, + "learning_rate": 0.00015658640145981275, + "loss": 2.591, + "step": 6240 + }, + { + "epoch": 0.5036720200145267, + "grad_norm": 0.8319387435913086, + "learning_rate": 0.00015657338447986133, + "loss": 2.5937, + "step": 6241 + }, + { + "epoch": 0.5037527237511097, + 
"grad_norm": 0.729193389415741, + "learning_rate": 0.00015656036608995323, + "loss": 2.651, + "step": 6242 + }, + { + "epoch": 0.5038334274876927, + "grad_norm": 0.720098614692688, + "learning_rate": 0.000156547346290413, + "loss": 2.681, + "step": 6243 + }, + { + "epoch": 0.5039141312242756, + "grad_norm": 0.7172541618347168, + "learning_rate": 0.00015653432508156508, + "loss": 2.5906, + "step": 6244 + }, + { + "epoch": 0.5039948349608587, + "grad_norm": 0.7352481484413147, + "learning_rate": 0.00015652130246373398, + "loss": 2.6376, + "step": 6245 + }, + { + "epoch": 0.5040755386974417, + "grad_norm": 0.6664925813674927, + "learning_rate": 0.0001565082784372443, + "loss": 2.706, + "step": 6246 + }, + { + "epoch": 0.5041562424340247, + "grad_norm": 0.7292987704277039, + "learning_rate": 0.0001564952530024206, + "loss": 2.6149, + "step": 6247 + }, + { + "epoch": 0.5042369461706077, + "grad_norm": 0.6904531121253967, + "learning_rate": 0.00015648222615958747, + "loss": 2.579, + "step": 6248 + }, + { + "epoch": 0.5043176499071907, + "grad_norm": 0.7385311722755432, + "learning_rate": 0.00015646919790906965, + "loss": 2.6137, + "step": 6249 + }, + { + "epoch": 0.5043983536437737, + "grad_norm": 0.7869507074356079, + "learning_rate": 0.0001564561682511918, + "loss": 2.6831, + "step": 6250 + }, + { + "epoch": 0.5044790573803567, + "grad_norm": 0.723680317401886, + "learning_rate": 0.00015644313718627867, + "loss": 2.6083, + "step": 6251 + }, + { + "epoch": 0.5045597611169397, + "grad_norm": 0.7029969692230225, + "learning_rate": 0.00015643010471465502, + "loss": 2.6462, + "step": 6252 + }, + { + "epoch": 0.5046404648535228, + "grad_norm": 0.818975031375885, + "learning_rate": 0.00015641707083664566, + "loss": 2.6393, + "step": 6253 + }, + { + "epoch": 0.5047211685901057, + "grad_norm": 0.7237667441368103, + "learning_rate": 0.0001564040355525754, + "loss": 2.5995, + "step": 6254 + }, + { + "epoch": 0.5048018723266887, + "grad_norm": 0.8613824248313904, + 
"learning_rate": 0.00015639099886276912, + "loss": 2.748, + "step": 6255 + }, + { + "epoch": 0.5048825760632717, + "grad_norm": 0.6802194118499756, + "learning_rate": 0.00015637796076755178, + "loss": 2.6393, + "step": 6256 + }, + { + "epoch": 0.5049632797998548, + "grad_norm": 0.7816255688667297, + "learning_rate": 0.00015636492126724823, + "loss": 2.6218, + "step": 6257 + }, + { + "epoch": 0.5050439835364378, + "grad_norm": 0.7443990707397461, + "learning_rate": 0.00015635188036218356, + "loss": 2.6181, + "step": 6258 + }, + { + "epoch": 0.5051246872730207, + "grad_norm": 0.7869458794593811, + "learning_rate": 0.0001563388380526827, + "loss": 2.6641, + "step": 6259 + }, + { + "epoch": 0.5052053910096037, + "grad_norm": 0.7423158288002014, + "learning_rate": 0.00015632579433907072, + "loss": 2.5849, + "step": 6260 + }, + { + "epoch": 0.5052860947461868, + "grad_norm": 0.7888280153274536, + "learning_rate": 0.00015631274922167272, + "loss": 2.7095, + "step": 6261 + }, + { + "epoch": 0.5053667984827698, + "grad_norm": 0.7053405046463013, + "learning_rate": 0.0001562997027008138, + "loss": 2.5747, + "step": 6262 + }, + { + "epoch": 0.5054475022193528, + "grad_norm": 0.7930825352668762, + "learning_rate": 0.0001562866547768191, + "loss": 2.6359, + "step": 6263 + }, + { + "epoch": 0.5055282059559357, + "grad_norm": 0.7431469559669495, + "learning_rate": 0.0001562736054500139, + "loss": 2.6167, + "step": 6264 + }, + { + "epoch": 0.5056089096925188, + "grad_norm": 0.8395694494247437, + "learning_rate": 0.00015626055472072324, + "loss": 2.7217, + "step": 6265 + }, + { + "epoch": 0.5056896134291018, + "grad_norm": 0.7318898439407349, + "learning_rate": 0.0001562475025892726, + "loss": 2.6866, + "step": 6266 + }, + { + "epoch": 0.5057703171656848, + "grad_norm": 0.7487025856971741, + "learning_rate": 0.0001562344490559871, + "loss": 2.7206, + "step": 6267 + }, + { + "epoch": 0.5058510209022677, + "grad_norm": 0.8187269568443298, + "learning_rate": 0.00015622139412119212, + 
"loss": 2.658, + "step": 6268 + }, + { + "epoch": 0.5059317246388508, + "grad_norm": 0.6714495420455933, + "learning_rate": 0.00015620833778521307, + "loss": 2.6182, + "step": 6269 + }, + { + "epoch": 0.5060124283754338, + "grad_norm": 0.7556246519088745, + "learning_rate": 0.00015619528004837528, + "loss": 2.6502, + "step": 6270 + }, + { + "epoch": 0.5060931321120168, + "grad_norm": 0.6989960074424744, + "learning_rate": 0.00015618222091100424, + "loss": 2.6031, + "step": 6271 + }, + { + "epoch": 0.5061738358485998, + "grad_norm": 0.7002139091491699, + "learning_rate": 0.0001561691603734254, + "loss": 2.6563, + "step": 6272 + }, + { + "epoch": 0.5062545395851827, + "grad_norm": 0.7064816355705261, + "learning_rate": 0.00015615609843596423, + "loss": 2.6482, + "step": 6273 + }, + { + "epoch": 0.5063352433217658, + "grad_norm": 0.6971433162689209, + "learning_rate": 0.00015614303509894634, + "loss": 2.6522, + "step": 6274 + }, + { + "epoch": 0.5064159470583488, + "grad_norm": 0.6982942223548889, + "learning_rate": 0.0001561299703626972, + "loss": 2.6477, + "step": 6275 + }, + { + "epoch": 0.5064966507949318, + "grad_norm": 0.7219811081886292, + "learning_rate": 0.0001561169042275425, + "loss": 2.6514, + "step": 6276 + }, + { + "epoch": 0.5065773545315148, + "grad_norm": 0.7391932010650635, + "learning_rate": 0.00015610383669380787, + "loss": 2.698, + "step": 6277 + }, + { + "epoch": 0.5066580582680978, + "grad_norm": 0.7852853536605835, + "learning_rate": 0.00015609076776181894, + "loss": 2.6281, + "step": 6278 + }, + { + "epoch": 0.5067387620046808, + "grad_norm": 0.7435647249221802, + "learning_rate": 0.00015607769743190147, + "loss": 2.6403, + "step": 6279 + }, + { + "epoch": 0.5068194657412638, + "grad_norm": 0.7300949096679688, + "learning_rate": 0.00015606462570438119, + "loss": 2.6125, + "step": 6280 + }, + { + "epoch": 0.5069001694778468, + "grad_norm": 0.7081549167633057, + "learning_rate": 0.00015605155257958388, + "loss": 2.6192, + "step": 6281 + }, + { + 
"epoch": 0.5069808732144299, + "grad_norm": 0.709020733833313, + "learning_rate": 0.00015603847805783537, + "loss": 2.6745, + "step": 6282 + }, + { + "epoch": 0.5070615769510128, + "grad_norm": 0.691684901714325, + "learning_rate": 0.0001560254021394615, + "loss": 2.5638, + "step": 6283 + }, + { + "epoch": 0.5071422806875958, + "grad_norm": 0.8338537812232971, + "learning_rate": 0.00015601232482478813, + "loss": 2.5835, + "step": 6284 + }, + { + "epoch": 0.5072229844241788, + "grad_norm": 0.659436047077179, + "learning_rate": 0.00015599924611414126, + "loss": 2.601, + "step": 6285 + }, + { + "epoch": 0.5073036881607619, + "grad_norm": 0.72590172290802, + "learning_rate": 0.00015598616600784676, + "loss": 2.602, + "step": 6286 + }, + { + "epoch": 0.5073843918973449, + "grad_norm": 0.6704443693161011, + "learning_rate": 0.00015597308450623066, + "loss": 2.5703, + "step": 6287 + }, + { + "epoch": 0.5074650956339278, + "grad_norm": 0.7298632264137268, + "learning_rate": 0.00015596000160961898, + "loss": 2.6859, + "step": 6288 + }, + { + "epoch": 0.5075457993705108, + "grad_norm": 0.6900345087051392, + "learning_rate": 0.00015594691731833776, + "loss": 2.6264, + "step": 6289 + }, + { + "epoch": 0.5076265031070939, + "grad_norm": 0.6705992221832275, + "learning_rate": 0.0001559338316327131, + "loss": 2.6135, + "step": 6290 + }, + { + "epoch": 0.5077072068436769, + "grad_norm": 0.691545307636261, + "learning_rate": 0.0001559207445530712, + "loss": 2.6538, + "step": 6291 + }, + { + "epoch": 0.5077879105802598, + "grad_norm": 0.6579985618591309, + "learning_rate": 0.00015590765607973811, + "loss": 2.6224, + "step": 6292 + }, + { + "epoch": 0.5078686143168428, + "grad_norm": 0.6938790678977966, + "learning_rate": 0.00015589456621304014, + "loss": 2.5932, + "step": 6293 + }, + { + "epoch": 0.5079493180534259, + "grad_norm": 0.7421671748161316, + "learning_rate": 0.00015588147495330346, + "loss": 2.7098, + "step": 6294 + }, + { + "epoch": 0.5080300217900089, + "grad_norm": 
0.7076674699783325, + "learning_rate": 0.0001558683823008543, + "loss": 2.664, + "step": 6295 + }, + { + "epoch": 0.5081107255265919, + "grad_norm": 0.6829726696014404, + "learning_rate": 0.00015585528825601906, + "loss": 2.6029, + "step": 6296 + }, + { + "epoch": 0.5081914292631748, + "grad_norm": 0.6968080401420593, + "learning_rate": 0.000155842192819124, + "loss": 2.6256, + "step": 6297 + }, + { + "epoch": 0.5082721329997579, + "grad_norm": 0.7453410625457764, + "learning_rate": 0.00015582909599049554, + "loss": 2.6577, + "step": 6298 + }, + { + "epoch": 0.5083528367363409, + "grad_norm": 0.6603519916534424, + "learning_rate": 0.00015581599777046007, + "loss": 2.6066, + "step": 6299 + }, + { + "epoch": 0.5084335404729239, + "grad_norm": 0.7096173763275146, + "learning_rate": 0.00015580289815934401, + "loss": 2.5488, + "step": 6300 + }, + { + "epoch": 0.5085142442095069, + "grad_norm": 0.799298107624054, + "learning_rate": 0.0001557897971574739, + "loss": 2.6021, + "step": 6301 + }, + { + "epoch": 0.50859494794609, + "grad_norm": 0.6820314526557922, + "learning_rate": 0.00015577669476517618, + "loss": 2.6276, + "step": 6302 + }, + { + "epoch": 0.5086756516826729, + "grad_norm": 0.7119347453117371, + "learning_rate": 0.00015576359098277742, + "loss": 2.6627, + "step": 6303 + }, + { + "epoch": 0.5087563554192559, + "grad_norm": 0.7638720273971558, + "learning_rate": 0.00015575048581060422, + "loss": 2.6824, + "step": 6304 + }, + { + "epoch": 0.5088370591558389, + "grad_norm": 0.7360339164733887, + "learning_rate": 0.00015573737924898316, + "loss": 2.5805, + "step": 6305 + }, + { + "epoch": 0.508917762892422, + "grad_norm": 0.7220984697341919, + "learning_rate": 0.00015572427129824091, + "loss": 2.6374, + "step": 6306 + }, + { + "epoch": 0.5089984666290049, + "grad_norm": 0.670964777469635, + "learning_rate": 0.00015571116195870418, + "loss": 2.6371, + "step": 6307 + }, + { + "epoch": 0.5090791703655879, + "grad_norm": 0.7826075553894043, + "learning_rate": 
0.00015569805123069968, + "loss": 2.7666, + "step": 6308 + }, + { + "epoch": 0.5091598741021709, + "grad_norm": 0.7691593766212463, + "learning_rate": 0.00015568493911455412, + "loss": 2.6242, + "step": 6309 + }, + { + "epoch": 0.509240577838754, + "grad_norm": 0.714500367641449, + "learning_rate": 0.0001556718256105943, + "loss": 2.6551, + "step": 6310 + }, + { + "epoch": 0.509321281575337, + "grad_norm": 0.7634009718894958, + "learning_rate": 0.00015565871071914706, + "loss": 2.7069, + "step": 6311 + }, + { + "epoch": 0.5094019853119199, + "grad_norm": 0.7134168148040771, + "learning_rate": 0.00015564559444053926, + "loss": 2.5816, + "step": 6312 + }, + { + "epoch": 0.5094826890485029, + "grad_norm": 0.6548121571540833, + "learning_rate": 0.0001556324767750978, + "loss": 2.6192, + "step": 6313 + }, + { + "epoch": 0.509563392785086, + "grad_norm": 0.7244428992271423, + "learning_rate": 0.0001556193577231496, + "loss": 2.6072, + "step": 6314 + }, + { + "epoch": 0.509644096521669, + "grad_norm": 0.6976662278175354, + "learning_rate": 0.0001556062372850216, + "loss": 2.6148, + "step": 6315 + }, + { + "epoch": 0.509724800258252, + "grad_norm": 0.772726833820343, + "learning_rate": 0.00015559311546104083, + "loss": 2.6458, + "step": 6316 + }, + { + "epoch": 0.5098055039948349, + "grad_norm": 0.7976188659667969, + "learning_rate": 0.00015557999225153428, + "loss": 2.6772, + "step": 6317 + }, + { + "epoch": 0.509886207731418, + "grad_norm": 0.6458039283752441, + "learning_rate": 0.00015556686765682903, + "loss": 2.6143, + "step": 6318 + }, + { + "epoch": 0.509966911468001, + "grad_norm": 0.7295405268669128, + "learning_rate": 0.0001555537416772522, + "loss": 2.5919, + "step": 6319 + }, + { + "epoch": 0.510047615204584, + "grad_norm": 0.657978355884552, + "learning_rate": 0.00015554061431313093, + "loss": 2.6245, + "step": 6320 + }, + { + "epoch": 0.510128318941167, + "grad_norm": 0.6726922392845154, + "learning_rate": 0.00015552748556479232, + "loss": 2.6207, + "step": 
6321 + }, + { + "epoch": 0.51020902267775, + "grad_norm": 0.7954673767089844, + "learning_rate": 0.00015551435543256363, + "loss": 2.7177, + "step": 6322 + }, + { + "epoch": 0.510289726414333, + "grad_norm": 0.7186735272407532, + "learning_rate": 0.00015550122391677211, + "loss": 2.5953, + "step": 6323 + }, + { + "epoch": 0.510370430150916, + "grad_norm": 0.7835420966148376, + "learning_rate": 0.00015548809101774498, + "loss": 2.7039, + "step": 6324 + }, + { + "epoch": 0.510451133887499, + "grad_norm": 0.6966592073440552, + "learning_rate": 0.00015547495673580962, + "loss": 2.6287, + "step": 6325 + }, + { + "epoch": 0.5105318376240819, + "grad_norm": 0.6676180362701416, + "learning_rate": 0.00015546182107129328, + "loss": 2.638, + "step": 6326 + }, + { + "epoch": 0.510612541360665, + "grad_norm": 0.7285657525062561, + "learning_rate": 0.0001554486840245234, + "loss": 2.6661, + "step": 6327 + }, + { + "epoch": 0.510693245097248, + "grad_norm": 0.6453657150268555, + "learning_rate": 0.00015543554559582735, + "loss": 2.715, + "step": 6328 + }, + { + "epoch": 0.510773948833831, + "grad_norm": 0.7364684343338013, + "learning_rate": 0.0001554224057855326, + "loss": 2.6475, + "step": 6329 + }, + { + "epoch": 0.510854652570414, + "grad_norm": 0.670894980430603, + "learning_rate": 0.00015540926459396665, + "loss": 2.6091, + "step": 6330 + }, + { + "epoch": 0.510935356306997, + "grad_norm": 0.6750168204307556, + "learning_rate": 0.00015539612202145696, + "loss": 2.6473, + "step": 6331 + }, + { + "epoch": 0.51101606004358, + "grad_norm": 0.6552454233169556, + "learning_rate": 0.0001553829780683311, + "loss": 2.6158, + "step": 6332 + }, + { + "epoch": 0.511096763780163, + "grad_norm": 0.7387828230857849, + "learning_rate": 0.00015536983273491668, + "loss": 2.6219, + "step": 6333 + }, + { + "epoch": 0.511177467516746, + "grad_norm": 0.6993975639343262, + "learning_rate": 0.00015535668602154127, + "loss": 2.6446, + "step": 6334 + }, + { + "epoch": 0.5112581712533291, + 
"grad_norm": 0.6491217613220215, + "learning_rate": 0.00015534353792853254, + "loss": 2.6404, + "step": 6335 + }, + { + "epoch": 0.511338874989912, + "grad_norm": 0.7165521383285522, + "learning_rate": 0.0001553303884562182, + "loss": 2.6339, + "step": 6336 + }, + { + "epoch": 0.511419578726495, + "grad_norm": 0.7363756895065308, + "learning_rate": 0.0001553172376049259, + "loss": 2.6411, + "step": 6337 + }, + { + "epoch": 0.511500282463078, + "grad_norm": 0.7148438096046448, + "learning_rate": 0.00015530408537498347, + "loss": 2.5617, + "step": 6338 + }, + { + "epoch": 0.5115809861996611, + "grad_norm": 0.7140451669692993, + "learning_rate": 0.00015529093176671864, + "loss": 2.5898, + "step": 6339 + }, + { + "epoch": 0.5116616899362441, + "grad_norm": 0.7799252271652222, + "learning_rate": 0.00015527777678045926, + "loss": 2.6176, + "step": 6340 + }, + { + "epoch": 0.511742393672827, + "grad_norm": 0.7292928099632263, + "learning_rate": 0.00015526462041653323, + "loss": 2.6722, + "step": 6341 + }, + { + "epoch": 0.51182309740941, + "grad_norm": 0.6986904740333557, + "learning_rate": 0.00015525146267526837, + "loss": 2.6154, + "step": 6342 + }, + { + "epoch": 0.5119038011459931, + "grad_norm": 0.7239612936973572, + "learning_rate": 0.00015523830355699262, + "loss": 2.5664, + "step": 6343 + }, + { + "epoch": 0.5119845048825761, + "grad_norm": 0.6805121898651123, + "learning_rate": 0.00015522514306203395, + "loss": 2.6204, + "step": 6344 + }, + { + "epoch": 0.512065208619159, + "grad_norm": 0.7036689519882202, + "learning_rate": 0.00015521198119072035, + "loss": 2.6211, + "step": 6345 + }, + { + "epoch": 0.512145912355742, + "grad_norm": 0.7155849933624268, + "learning_rate": 0.00015519881794337988, + "loss": 2.6074, + "step": 6346 + }, + { + "epoch": 0.5122266160923251, + "grad_norm": 0.7183938026428223, + "learning_rate": 0.00015518565332034057, + "loss": 2.6148, + "step": 6347 + }, + { + "epoch": 0.5123073198289081, + "grad_norm": 0.7053570747375488, + 
"learning_rate": 0.0001551724873219305, + "loss": 2.6476, + "step": 6348 + }, + { + "epoch": 0.5123880235654911, + "grad_norm": 0.714846670627594, + "learning_rate": 0.00015515931994847785, + "loss": 2.5728, + "step": 6349 + }, + { + "epoch": 0.512468727302074, + "grad_norm": 0.7504729628562927, + "learning_rate": 0.00015514615120031076, + "loss": 2.6415, + "step": 6350 + }, + { + "epoch": 0.5125494310386571, + "grad_norm": 0.6940335035324097, + "learning_rate": 0.0001551329810777574, + "loss": 2.6115, + "step": 6351 + }, + { + "epoch": 0.5126301347752401, + "grad_norm": 0.7166119813919067, + "learning_rate": 0.00015511980958114608, + "loss": 2.6284, + "step": 6352 + }, + { + "epoch": 0.5127108385118231, + "grad_norm": 0.7787839770317078, + "learning_rate": 0.00015510663671080497, + "loss": 2.6385, + "step": 6353 + }, + { + "epoch": 0.5127915422484061, + "grad_norm": 0.7298412322998047, + "learning_rate": 0.00015509346246706245, + "loss": 2.629, + "step": 6354 + }, + { + "epoch": 0.5128722459849892, + "grad_norm": 0.7918897271156311, + "learning_rate": 0.00015508028685024683, + "loss": 2.6777, + "step": 6355 + }, + { + "epoch": 0.5129529497215721, + "grad_norm": 0.6867843866348267, + "learning_rate": 0.00015506710986068646, + "loss": 2.6101, + "step": 6356 + }, + { + "epoch": 0.5130336534581551, + "grad_norm": 0.716468870639801, + "learning_rate": 0.00015505393149870978, + "loss": 2.6558, + "step": 6357 + }, + { + "epoch": 0.5131143571947381, + "grad_norm": 0.6704092621803284, + "learning_rate": 0.0001550407517646452, + "loss": 2.6128, + "step": 6358 + }, + { + "epoch": 0.5131950609313212, + "grad_norm": 0.820716381072998, + "learning_rate": 0.00015502757065882124, + "loss": 2.6052, + "step": 6359 + }, + { + "epoch": 0.5132757646679041, + "grad_norm": 0.7328094840049744, + "learning_rate": 0.00015501438818156635, + "loss": 2.6399, + "step": 6360 + }, + { + "epoch": 0.5133564684044871, + "grad_norm": 0.6602808833122253, + "learning_rate": 0.00015500120433320911, + 
"loss": 2.5509, + "step": 6361 + }, + { + "epoch": 0.5134371721410701, + "grad_norm": 0.7013166546821594, + "learning_rate": 0.00015498801911407805, + "loss": 2.6439, + "step": 6362 + }, + { + "epoch": 0.5135178758776532, + "grad_norm": 0.7415499091148376, + "learning_rate": 0.00015497483252450186, + "loss": 2.575, + "step": 6363 + }, + { + "epoch": 0.5135985796142362, + "grad_norm": 0.7262336015701294, + "learning_rate": 0.00015496164456480912, + "loss": 2.6815, + "step": 6364 + }, + { + "epoch": 0.5136792833508191, + "grad_norm": 0.7353699803352356, + "learning_rate": 0.0001549484552353285, + "loss": 2.6172, + "step": 6365 + }, + { + "epoch": 0.5137599870874021, + "grad_norm": 0.7005086541175842, + "learning_rate": 0.00015493526453638879, + "loss": 2.5945, + "step": 6366 + }, + { + "epoch": 0.5138406908239852, + "grad_norm": 0.7469770908355713, + "learning_rate": 0.00015492207246831864, + "loss": 2.6797, + "step": 6367 + }, + { + "epoch": 0.5139213945605682, + "grad_norm": 0.6768934726715088, + "learning_rate": 0.00015490887903144693, + "loss": 2.6369, + "step": 6368 + }, + { + "epoch": 0.5140020982971512, + "grad_norm": 0.7625820636749268, + "learning_rate": 0.00015489568422610237, + "loss": 2.6182, + "step": 6369 + }, + { + "epoch": 0.5140828020337341, + "grad_norm": 0.749351978302002, + "learning_rate": 0.00015488248805261388, + "loss": 2.6066, + "step": 6370 + }, + { + "epoch": 0.5141635057703172, + "grad_norm": 0.8369480967521667, + "learning_rate": 0.00015486929051131032, + "loss": 2.7627, + "step": 6371 + }, + { + "epoch": 0.5142442095069002, + "grad_norm": 0.6482037305831909, + "learning_rate": 0.0001548560916025206, + "loss": 2.609, + "step": 6372 + }, + { + "epoch": 0.5143249132434832, + "grad_norm": 0.6801851391792297, + "learning_rate": 0.0001548428913265737, + "loss": 2.5878, + "step": 6373 + }, + { + "epoch": 0.5144056169800661, + "grad_norm": 0.744926929473877, + "learning_rate": 0.0001548296896837986, + "loss": 2.6569, + "step": 6374 + }, + { + 
"epoch": 0.5144863207166491, + "grad_norm": 0.6862614750862122, + "learning_rate": 0.00015481648667452425, + "loss": 2.5626, + "step": 6375 + }, + { + "epoch": 0.5145670244532322, + "grad_norm": 0.7186449766159058, + "learning_rate": 0.0001548032822990798, + "loss": 2.6783, + "step": 6376 + }, + { + "epoch": 0.5146477281898152, + "grad_norm": 0.699715256690979, + "learning_rate": 0.0001547900765577943, + "loss": 2.6709, + "step": 6377 + }, + { + "epoch": 0.5147284319263982, + "grad_norm": 0.7272205352783203, + "learning_rate": 0.00015477686945099687, + "loss": 2.6076, + "step": 6378 + }, + { + "epoch": 0.5148091356629811, + "grad_norm": 0.7667459845542908, + "learning_rate": 0.00015476366097901667, + "loss": 2.6541, + "step": 6379 + }, + { + "epoch": 0.5148898393995642, + "grad_norm": 0.6538121700286865, + "learning_rate": 0.00015475045114218285, + "loss": 2.5806, + "step": 6380 + }, + { + "epoch": 0.5149705431361472, + "grad_norm": 0.7388994097709656, + "learning_rate": 0.00015473723994082473, + "loss": 2.6293, + "step": 6381 + }, + { + "epoch": 0.5150512468727302, + "grad_norm": 0.7044215202331543, + "learning_rate": 0.00015472402737527142, + "loss": 2.5755, + "step": 6382 + }, + { + "epoch": 0.5151319506093132, + "grad_norm": 0.6807994246482849, + "learning_rate": 0.00015471081344585236, + "loss": 2.6493, + "step": 6383 + }, + { + "epoch": 0.5152126543458962, + "grad_norm": 0.676278293132782, + "learning_rate": 0.00015469759815289681, + "loss": 2.6319, + "step": 6384 + }, + { + "epoch": 0.5152933580824792, + "grad_norm": 0.7515453696250916, + "learning_rate": 0.00015468438149673412, + "loss": 2.6415, + "step": 6385 + }, + { + "epoch": 0.5153740618190622, + "grad_norm": 0.8694239854812622, + "learning_rate": 0.0001546711634776937, + "loss": 2.5818, + "step": 6386 + }, + { + "epoch": 0.5154547655556452, + "grad_norm": 0.717090368270874, + "learning_rate": 0.000154657944096105, + "loss": 2.7132, + "step": 6387 + }, + { + "epoch": 0.5155354692922283, + "grad_norm": 
0.7098804116249084, + "learning_rate": 0.00015464472335229742, + "loss": 2.564, + "step": 6388 + }, + { + "epoch": 0.5156161730288112, + "grad_norm": 0.6879690289497375, + "learning_rate": 0.0001546315012466005, + "loss": 2.6094, + "step": 6389 + }, + { + "epoch": 0.5156968767653942, + "grad_norm": 0.7110763788223267, + "learning_rate": 0.00015461827777934377, + "loss": 2.5982, + "step": 6390 + }, + { + "epoch": 0.5157775805019772, + "grad_norm": 0.7168039679527283, + "learning_rate": 0.00015460505295085677, + "loss": 2.5451, + "step": 6391 + }, + { + "epoch": 0.5158582842385603, + "grad_norm": 0.7059877514839172, + "learning_rate": 0.00015459182676146914, + "loss": 2.6655, + "step": 6392 + }, + { + "epoch": 0.5159389879751433, + "grad_norm": 0.7278143763542175, + "learning_rate": 0.00015457859921151043, + "loss": 2.6587, + "step": 6393 + }, + { + "epoch": 0.5160196917117262, + "grad_norm": 0.7301023602485657, + "learning_rate": 0.0001545653703013104, + "loss": 2.7672, + "step": 6394 + }, + { + "epoch": 0.5161003954483092, + "grad_norm": 0.6933302283287048, + "learning_rate": 0.0001545521400311987, + "loss": 2.5924, + "step": 6395 + }, + { + "epoch": 0.5161810991848923, + "grad_norm": 0.7074775099754333, + "learning_rate": 0.00015453890840150508, + "loss": 2.6663, + "step": 6396 + }, + { + "epoch": 0.5162618029214753, + "grad_norm": 0.7069801092147827, + "learning_rate": 0.00015452567541255924, + "loss": 2.6791, + "step": 6397 + }, + { + "epoch": 0.5163425066580583, + "grad_norm": 0.6586462259292603, + "learning_rate": 0.00015451244106469108, + "loss": 2.6368, + "step": 6398 + }, + { + "epoch": 0.5164232103946412, + "grad_norm": 0.6862531900405884, + "learning_rate": 0.00015449920535823042, + "loss": 2.7099, + "step": 6399 + }, + { + "epoch": 0.5165039141312243, + "grad_norm": 0.7177795767784119, + "learning_rate": 0.00015448596829350706, + "loss": 2.5921, + "step": 6400 + }, + { + "epoch": 0.5165846178678073, + "grad_norm": 0.6936569213867188, + "learning_rate": 
0.00015447272987085094, + "loss": 2.5739, + "step": 6401 + }, + { + "epoch": 0.5166653216043903, + "grad_norm": 0.7394363284111023, + "learning_rate": 0.00015445949009059202, + "loss": 2.5941, + "step": 6402 + }, + { + "epoch": 0.5167460253409732, + "grad_norm": 0.6713366508483887, + "learning_rate": 0.00015444624895306027, + "loss": 2.574, + "step": 6403 + }, + { + "epoch": 0.5168267290775563, + "grad_norm": 0.679128885269165, + "learning_rate": 0.0001544330064585856, + "loss": 2.6422, + "step": 6404 + }, + { + "epoch": 0.5169074328141393, + "grad_norm": 0.6803367137908936, + "learning_rate": 0.0001544197626074982, + "loss": 2.6503, + "step": 6405 + }, + { + "epoch": 0.5169881365507223, + "grad_norm": 0.8009794354438782, + "learning_rate": 0.000154406517400128, + "loss": 2.6434, + "step": 6406 + }, + { + "epoch": 0.5170688402873053, + "grad_norm": 0.7292529344558716, + "learning_rate": 0.00015439327083680517, + "loss": 2.6333, + "step": 6407 + }, + { + "epoch": 0.5171495440238884, + "grad_norm": 0.67046719789505, + "learning_rate": 0.00015438002291785988, + "loss": 2.5791, + "step": 6408 + }, + { + "epoch": 0.5172302477604713, + "grad_norm": 0.755501925945282, + "learning_rate": 0.00015436677364362225, + "loss": 2.5558, + "step": 6409 + }, + { + "epoch": 0.5173109514970543, + "grad_norm": 0.6957115530967712, + "learning_rate": 0.0001543535230144225, + "loss": 2.5839, + "step": 6410 + }, + { + "epoch": 0.5173916552336373, + "grad_norm": 0.6629074215888977, + "learning_rate": 0.0001543402710305909, + "loss": 2.6529, + "step": 6411 + }, + { + "epoch": 0.5174723589702204, + "grad_norm": 0.6647019386291504, + "learning_rate": 0.00015432701769245766, + "loss": 2.589, + "step": 6412 + }, + { + "epoch": 0.5175530627068033, + "grad_norm": 0.6472512483596802, + "learning_rate": 0.00015431376300035316, + "loss": 2.6184, + "step": 6413 + }, + { + "epoch": 0.5176337664433863, + "grad_norm": 0.6900136470794678, + "learning_rate": 0.0001543005069546077, + "loss": 2.7029, + 
"step": 6414 + }, + { + "epoch": 0.5177144701799693, + "grad_norm": 0.7702177166938782, + "learning_rate": 0.00015428724955555165, + "loss": 2.6189, + "step": 6415 + }, + { + "epoch": 0.5177951739165524, + "grad_norm": 0.641655445098877, + "learning_rate": 0.00015427399080351545, + "loss": 2.6486, + "step": 6416 + }, + { + "epoch": 0.5178758776531354, + "grad_norm": 0.6826485991477966, + "learning_rate": 0.00015426073069882952, + "loss": 2.6105, + "step": 6417 + }, + { + "epoch": 0.5179565813897183, + "grad_norm": 0.749812662601471, + "learning_rate": 0.00015424746924182434, + "loss": 2.5644, + "step": 6418 + }, + { + "epoch": 0.5180372851263013, + "grad_norm": 0.6737890243530273, + "learning_rate": 0.0001542342064328304, + "loss": 2.686, + "step": 6419 + }, + { + "epoch": 0.5181179888628844, + "grad_norm": 0.7131822109222412, + "learning_rate": 0.0001542209422721783, + "loss": 2.697, + "step": 6420 + }, + { + "epoch": 0.5181986925994674, + "grad_norm": 0.7543746829032898, + "learning_rate": 0.0001542076767601986, + "loss": 2.6349, + "step": 6421 + }, + { + "epoch": 0.5182793963360504, + "grad_norm": 0.7589309215545654, + "learning_rate": 0.00015419440989722184, + "loss": 2.63, + "step": 6422 + }, + { + "epoch": 0.5183601000726333, + "grad_norm": 0.7036365866661072, + "learning_rate": 0.00015418114168357872, + "loss": 2.605, + "step": 6423 + }, + { + "epoch": 0.5184408038092164, + "grad_norm": 0.733161985874176, + "learning_rate": 0.00015416787211959998, + "loss": 2.6708, + "step": 6424 + }, + { + "epoch": 0.5185215075457994, + "grad_norm": 0.6928101181983948, + "learning_rate": 0.00015415460120561623, + "loss": 2.6549, + "step": 6425 + }, + { + "epoch": 0.5186022112823824, + "grad_norm": 0.6557250022888184, + "learning_rate": 0.00015414132894195825, + "loss": 2.6185, + "step": 6426 + }, + { + "epoch": 0.5186829150189654, + "grad_norm": 0.7236297726631165, + "learning_rate": 0.00015412805532895684, + "loss": 2.6185, + "step": 6427 + }, + { + "epoch": 
0.5187636187555483, + "grad_norm": 0.7194060683250427, + "learning_rate": 0.0001541147803669428, + "loss": 2.6123, + "step": 6428 + }, + { + "epoch": 0.5188443224921314, + "grad_norm": 0.7077342867851257, + "learning_rate": 0.00015410150405624696, + "loss": 2.6628, + "step": 6429 + }, + { + "epoch": 0.5189250262287144, + "grad_norm": 0.7036150693893433, + "learning_rate": 0.00015408822639720023, + "loss": 2.5966, + "step": 6430 + }, + { + "epoch": 0.5190057299652974, + "grad_norm": 0.7047349810600281, + "learning_rate": 0.00015407494739013352, + "loss": 2.6626, + "step": 6431 + }, + { + "epoch": 0.5190864337018803, + "grad_norm": 0.7537584900856018, + "learning_rate": 0.00015406166703537777, + "loss": 2.6452, + "step": 6432 + }, + { + "epoch": 0.5191671374384634, + "grad_norm": 0.7944707870483398, + "learning_rate": 0.00015404838533326394, + "loss": 2.6834, + "step": 6433 + }, + { + "epoch": 0.5192478411750464, + "grad_norm": 0.8602458238601685, + "learning_rate": 0.00015403510228412305, + "loss": 2.6238, + "step": 6434 + }, + { + "epoch": 0.5193285449116294, + "grad_norm": 0.7181896567344666, + "learning_rate": 0.0001540218178882862, + "loss": 2.652, + "step": 6435 + }, + { + "epoch": 0.5194092486482124, + "grad_norm": 0.7470960021018982, + "learning_rate": 0.0001540085321460844, + "loss": 2.6703, + "step": 6436 + }, + { + "epoch": 0.5194899523847955, + "grad_norm": 0.8249944448471069, + "learning_rate": 0.00015399524505784883, + "loss": 2.5945, + "step": 6437 + }, + { + "epoch": 0.5195706561213784, + "grad_norm": 0.7332444190979004, + "learning_rate": 0.00015398195662391057, + "loss": 2.6472, + "step": 6438 + }, + { + "epoch": 0.5196513598579614, + "grad_norm": 0.7727739810943604, + "learning_rate": 0.0001539686668446009, + "loss": 2.6276, + "step": 6439 + }, + { + "epoch": 0.5197320635945444, + "grad_norm": 0.7161617279052734, + "learning_rate": 0.00015395537572025094, + "loss": 2.624, + "step": 6440 + }, + { + "epoch": 0.5198127673311275, + "grad_norm": 
0.7657529711723328, + "learning_rate": 0.00015394208325119198, + "loss": 2.6604, + "step": 6441 + }, + { + "epoch": 0.5198934710677104, + "grad_norm": 0.732904314994812, + "learning_rate": 0.00015392878943775527, + "loss": 2.6334, + "step": 6442 + }, + { + "epoch": 0.5199741748042934, + "grad_norm": 0.7058991193771362, + "learning_rate": 0.0001539154942802722, + "loss": 2.5936, + "step": 6443 + }, + { + "epoch": 0.5200548785408764, + "grad_norm": 0.7328821420669556, + "learning_rate": 0.00015390219777907405, + "loss": 2.5969, + "step": 6444 + }, + { + "epoch": 0.5201355822774595, + "grad_norm": 0.7899969220161438, + "learning_rate": 0.00015388889993449224, + "loss": 2.5856, + "step": 6445 + }, + { + "epoch": 0.5202162860140425, + "grad_norm": 0.6963860392570496, + "learning_rate": 0.00015387560074685817, + "loss": 2.6139, + "step": 6446 + }, + { + "epoch": 0.5202969897506254, + "grad_norm": 0.812053918838501, + "learning_rate": 0.00015386230021650327, + "loss": 2.716, + "step": 6447 + }, + { + "epoch": 0.5203776934872084, + "grad_norm": 0.766781210899353, + "learning_rate": 0.0001538489983437591, + "loss": 2.6509, + "step": 6448 + }, + { + "epoch": 0.5204583972237915, + "grad_norm": 0.6877299547195435, + "learning_rate": 0.00015383569512895712, + "loss": 2.6076, + "step": 6449 + }, + { + "epoch": 0.5205391009603745, + "grad_norm": 0.7009176015853882, + "learning_rate": 0.00015382239057242888, + "loss": 2.608, + "step": 6450 + }, + { + "epoch": 0.5206198046969575, + "grad_norm": 0.7187578678131104, + "learning_rate": 0.000153809084674506, + "loss": 2.5946, + "step": 6451 + }, + { + "epoch": 0.5207005084335404, + "grad_norm": 0.7242687344551086, + "learning_rate": 0.00015379577743552001, + "loss": 2.6752, + "step": 6452 + }, + { + "epoch": 0.5207812121701235, + "grad_norm": 0.7668174505233765, + "learning_rate": 0.00015378246885580266, + "loss": 2.6694, + "step": 6453 + }, + { + "epoch": 0.5208619159067065, + "grad_norm": 0.7676039338111877, + "learning_rate": 
0.00015376915893568557, + "loss": 2.6379, + "step": 6454 + }, + { + "epoch": 0.5209426196432895, + "grad_norm": 0.7394412159919739, + "learning_rate": 0.00015375584767550053, + "loss": 2.6046, + "step": 6455 + }, + { + "epoch": 0.5210233233798724, + "grad_norm": 0.7246636748313904, + "learning_rate": 0.00015374253507557923, + "loss": 2.592, + "step": 6456 + }, + { + "epoch": 0.5211040271164555, + "grad_norm": 0.7121255993843079, + "learning_rate": 0.00015372922113625345, + "loss": 2.634, + "step": 6457 + }, + { + "epoch": 0.5211847308530385, + "grad_norm": 0.7378345131874084, + "learning_rate": 0.00015371590585785505, + "loss": 2.5753, + "step": 6458 + }, + { + "epoch": 0.5212654345896215, + "grad_norm": 0.6682030558586121, + "learning_rate": 0.00015370258924071587, + "loss": 2.6305, + "step": 6459 + }, + { + "epoch": 0.5213461383262045, + "grad_norm": 0.7164177894592285, + "learning_rate": 0.00015368927128516776, + "loss": 2.7188, + "step": 6460 + }, + { + "epoch": 0.5214268420627876, + "grad_norm": 0.7341115474700928, + "learning_rate": 0.00015367595199154273, + "loss": 2.6204, + "step": 6461 + }, + { + "epoch": 0.5215075457993705, + "grad_norm": 0.6781840920448303, + "learning_rate": 0.00015366263136017258, + "loss": 2.6104, + "step": 6462 + }, + { + "epoch": 0.5215882495359535, + "grad_norm": 0.7029077410697937, + "learning_rate": 0.0001536493093913894, + "loss": 2.6055, + "step": 6463 + }, + { + "epoch": 0.5216689532725365, + "grad_norm": 0.6958553194999695, + "learning_rate": 0.00015363598608552522, + "loss": 2.5991, + "step": 6464 + }, + { + "epoch": 0.5217496570091196, + "grad_norm": 0.6919750571250916, + "learning_rate": 0.00015362266144291207, + "loss": 2.6022, + "step": 6465 + }, + { + "epoch": 0.5218303607457025, + "grad_norm": 0.6980622410774231, + "learning_rate": 0.000153609335463882, + "loss": 2.6289, + "step": 6466 + }, + { + "epoch": 0.5219110644822855, + "grad_norm": 0.7468248009681702, + "learning_rate": 0.00015359600814876715, + "loss": 2.6327, 
+ "step": 6467 + }, + { + "epoch": 0.5219917682188685, + "grad_norm": 0.7183729410171509, + "learning_rate": 0.00015358267949789966, + "loss": 2.6389, + "step": 6468 + }, + { + "epoch": 0.5220724719554516, + "grad_norm": 0.6558868885040283, + "learning_rate": 0.00015356934951161178, + "loss": 2.6261, + "step": 6469 + }, + { + "epoch": 0.5221531756920346, + "grad_norm": 0.8000216484069824, + "learning_rate": 0.00015355601819023562, + "loss": 2.6908, + "step": 6470 + }, + { + "epoch": 0.5222338794286175, + "grad_norm": 0.775056004524231, + "learning_rate": 0.00015354268553410355, + "loss": 2.6763, + "step": 6471 + }, + { + "epoch": 0.5223145831652005, + "grad_norm": 0.7345123291015625, + "learning_rate": 0.00015352935154354776, + "loss": 2.582, + "step": 6472 + }, + { + "epoch": 0.5223952869017836, + "grad_norm": 0.731311023235321, + "learning_rate": 0.0001535160162189006, + "loss": 2.6519, + "step": 6473 + }, + { + "epoch": 0.5224759906383666, + "grad_norm": 0.6481007933616638, + "learning_rate": 0.00015350267956049443, + "loss": 2.5695, + "step": 6474 + }, + { + "epoch": 0.5225566943749496, + "grad_norm": 0.7698814868927002, + "learning_rate": 0.00015348934156866163, + "loss": 2.5732, + "step": 6475 + }, + { + "epoch": 0.5226373981115325, + "grad_norm": 0.7404680848121643, + "learning_rate": 0.00015347600224373462, + "loss": 2.5826, + "step": 6476 + }, + { + "epoch": 0.5227181018481155, + "grad_norm": 0.6965613961219788, + "learning_rate": 0.00015346266158604584, + "loss": 2.6069, + "step": 6477 + }, + { + "epoch": 0.5227988055846986, + "grad_norm": 0.6611152291297913, + "learning_rate": 0.00015344931959592777, + "loss": 2.4937, + "step": 6478 + }, + { + "epoch": 0.5228795093212816, + "grad_norm": 0.7418150305747986, + "learning_rate": 0.00015343597627371296, + "loss": 2.5747, + "step": 6479 + }, + { + "epoch": 0.5229602130578646, + "grad_norm": 0.6847610473632812, + "learning_rate": 0.00015342263161973393, + "loss": 2.5906, + "step": 6480 + }, + { + "epoch": 
0.5230409167944475, + "grad_norm": 0.7054881453514099, + "learning_rate": 0.00015340928563432326, + "loss": 2.5914, + "step": 6481 + }, + { + "epoch": 0.5231216205310306, + "grad_norm": 0.6918888092041016, + "learning_rate": 0.0001533959383178136, + "loss": 2.6412, + "step": 6482 + }, + { + "epoch": 0.5232023242676136, + "grad_norm": 0.7232856154441833, + "learning_rate": 0.00015338258967053755, + "loss": 2.6364, + "step": 6483 + }, + { + "epoch": 0.5232830280041966, + "grad_norm": 0.7345031499862671, + "learning_rate": 0.00015336923969282786, + "loss": 2.6649, + "step": 6484 + }, + { + "epoch": 0.5233637317407795, + "grad_norm": 0.7644383907318115, + "learning_rate": 0.0001533558883850172, + "loss": 2.6949, + "step": 6485 + }, + { + "epoch": 0.5234444354773626, + "grad_norm": 0.6532372832298279, + "learning_rate": 0.0001533425357474383, + "loss": 2.5915, + "step": 6486 + }, + { + "epoch": 0.5235251392139456, + "grad_norm": 0.7089118361473083, + "learning_rate": 0.000153329181780424, + "loss": 2.6446, + "step": 6487 + }, + { + "epoch": 0.5236058429505286, + "grad_norm": 0.6966068148612976, + "learning_rate": 0.00015331582648430705, + "loss": 2.6764, + "step": 6488 + }, + { + "epoch": 0.5236865466871116, + "grad_norm": 0.7130835056304932, + "learning_rate": 0.00015330246985942035, + "loss": 2.6279, + "step": 6489 + }, + { + "epoch": 0.5237672504236947, + "grad_norm": 0.729727029800415, + "learning_rate": 0.00015328911190609678, + "loss": 2.612, + "step": 6490 + }, + { + "epoch": 0.5238479541602776, + "grad_norm": 0.6804213523864746, + "learning_rate": 0.0001532757526246692, + "loss": 2.6113, + "step": 6491 + }, + { + "epoch": 0.5239286578968606, + "grad_norm": 0.7324437499046326, + "learning_rate": 0.0001532623920154707, + "loss": 2.6054, + "step": 6492 + }, + { + "epoch": 0.5240093616334436, + "grad_norm": 0.6166699528694153, + "learning_rate": 0.00015324903007883406, + "loss": 2.5822, + "step": 6493 + }, + { + "epoch": 0.5240900653700267, + "grad_norm": 
0.7339944839477539, + "learning_rate": 0.00015323566681509242, + "loss": 2.6204, + "step": 6494 + }, + { + "epoch": 0.5241707691066096, + "grad_norm": 0.7267727255821228, + "learning_rate": 0.00015322230222457886, + "loss": 2.6094, + "step": 6495 + }, + { + "epoch": 0.5242514728431926, + "grad_norm": 0.6417120695114136, + "learning_rate": 0.00015320893630762635, + "loss": 2.6044, + "step": 6496 + }, + { + "epoch": 0.5243321765797756, + "grad_norm": 0.7092922329902649, + "learning_rate": 0.00015319556906456808, + "loss": 2.6428, + "step": 6497 + }, + { + "epoch": 0.5244128803163587, + "grad_norm": 0.7482922673225403, + "learning_rate": 0.00015318220049573714, + "loss": 2.6025, + "step": 6498 + }, + { + "epoch": 0.5244935840529417, + "grad_norm": 0.691925048828125, + "learning_rate": 0.00015316883060146675, + "loss": 2.6308, + "step": 6499 + }, + { + "epoch": 0.5245742877895246, + "grad_norm": 0.7084488272666931, + "learning_rate": 0.00015315545938209015, + "loss": 2.6535, + "step": 6500 + }, + { + "epoch": 0.5246549915261076, + "grad_norm": 0.7182802557945251, + "learning_rate": 0.00015314208683794056, + "loss": 2.6045, + "step": 6501 + }, + { + "epoch": 0.5247356952626907, + "grad_norm": 0.7043096423149109, + "learning_rate": 0.00015312871296935122, + "loss": 2.6465, + "step": 6502 + }, + { + "epoch": 0.5248163989992737, + "grad_norm": 0.7679466009140015, + "learning_rate": 0.00015311533777665547, + "loss": 2.6624, + "step": 6503 + }, + { + "epoch": 0.5248971027358567, + "grad_norm": 0.6825870275497437, + "learning_rate": 0.00015310196126018668, + "loss": 2.5548, + "step": 6504 + }, + { + "epoch": 0.5249778064724396, + "grad_norm": 0.7364058494567871, + "learning_rate": 0.00015308858342027816, + "loss": 2.6495, + "step": 6505 + }, + { + "epoch": 0.5250585102090227, + "grad_norm": 0.7333239316940308, + "learning_rate": 0.00015307520425726341, + "loss": 2.5835, + "step": 6506 + }, + { + "epoch": 0.5251392139456057, + "grad_norm": 0.7479620575904846, + 
"learning_rate": 0.00015306182377147583, + "loss": 2.6065, + "step": 6507 + }, + { + "epoch": 0.5252199176821887, + "grad_norm": 0.7347591519355774, + "learning_rate": 0.00015304844196324888, + "loss": 2.6624, + "step": 6508 + }, + { + "epoch": 0.5253006214187717, + "grad_norm": 0.6879193782806396, + "learning_rate": 0.0001530350588329161, + "loss": 2.6598, + "step": 6509 + }, + { + "epoch": 0.5253813251553547, + "grad_norm": 0.7841597199440002, + "learning_rate": 0.000153021674380811, + "loss": 2.53, + "step": 6510 + }, + { + "epoch": 0.5254620288919377, + "grad_norm": 0.7916845679283142, + "learning_rate": 0.0001530082886072672, + "loss": 2.6995, + "step": 6511 + }, + { + "epoch": 0.5255427326285207, + "grad_norm": 0.7066318988800049, + "learning_rate": 0.0001529949015126183, + "loss": 2.58, + "step": 6512 + }, + { + "epoch": 0.5256234363651037, + "grad_norm": 0.6871134638786316, + "learning_rate": 0.00015298151309719787, + "loss": 2.6095, + "step": 6513 + }, + { + "epoch": 0.5257041401016868, + "grad_norm": 0.7479702830314636, + "learning_rate": 0.00015296812336133963, + "loss": 2.608, + "step": 6514 + }, + { + "epoch": 0.5257848438382697, + "grad_norm": 0.6772119402885437, + "learning_rate": 0.00015295473230537735, + "loss": 2.5679, + "step": 6515 + }, + { + "epoch": 0.5258655475748527, + "grad_norm": 0.7365416884422302, + "learning_rate": 0.0001529413399296447, + "loss": 2.6722, + "step": 6516 + }, + { + "epoch": 0.5259462513114357, + "grad_norm": 0.7538040280342102, + "learning_rate": 0.00015292794623447545, + "loss": 2.5562, + "step": 6517 + }, + { + "epoch": 0.5260269550480188, + "grad_norm": 0.7471820712089539, + "learning_rate": 0.00015291455122020344, + "loss": 2.7079, + "step": 6518 + }, + { + "epoch": 0.5261076587846018, + "grad_norm": 0.7605932354927063, + "learning_rate": 0.00015290115488716247, + "loss": 2.6696, + "step": 6519 + }, + { + "epoch": 0.5261883625211847, + "grad_norm": 0.7081854939460754, + "learning_rate": 0.00015288775723568647, + 
"loss": 2.6502, + "step": 6520 + }, + { + "epoch": 0.5262690662577677, + "grad_norm": 0.7236372828483582, + "learning_rate": 0.0001528743582661093, + "loss": 2.662, + "step": 6521 + }, + { + "epoch": 0.5263497699943508, + "grad_norm": 0.6710047721862793, + "learning_rate": 0.0001528609579787649, + "loss": 2.5947, + "step": 6522 + }, + { + "epoch": 0.5264304737309338, + "grad_norm": 0.709381103515625, + "learning_rate": 0.00015284755637398726, + "loss": 2.5922, + "step": 6523 + }, + { + "epoch": 0.5265111774675167, + "grad_norm": 0.7029775381088257, + "learning_rate": 0.00015283415345211033, + "loss": 2.6777, + "step": 6524 + }, + { + "epoch": 0.5265918812040997, + "grad_norm": 0.7250857949256897, + "learning_rate": 0.00015282074921346825, + "loss": 2.6027, + "step": 6525 + }, + { + "epoch": 0.5266725849406828, + "grad_norm": 0.7192760705947876, + "learning_rate": 0.00015280734365839498, + "loss": 2.6544, + "step": 6526 + }, + { + "epoch": 0.5267532886772658, + "grad_norm": 0.693583071231842, + "learning_rate": 0.0001527939367872247, + "loss": 2.6302, + "step": 6527 + }, + { + "epoch": 0.5268339924138488, + "grad_norm": 0.7031428217887878, + "learning_rate": 0.00015278052860029145, + "loss": 2.6944, + "step": 6528 + }, + { + "epoch": 0.5269146961504317, + "grad_norm": 0.6986895799636841, + "learning_rate": 0.00015276711909792949, + "loss": 2.6595, + "step": 6529 + }, + { + "epoch": 0.5269953998870147, + "grad_norm": 0.7375979423522949, + "learning_rate": 0.000152753708280473, + "loss": 2.6839, + "step": 6530 + }, + { + "epoch": 0.5270761036235978, + "grad_norm": 0.7126755714416504, + "learning_rate": 0.0001527402961482562, + "loss": 2.5597, + "step": 6531 + }, + { + "epoch": 0.5271568073601808, + "grad_norm": 0.6631070971488953, + "learning_rate": 0.00015272688270161338, + "loss": 2.5566, + "step": 6532 + }, + { + "epoch": 0.5272375110967638, + "grad_norm": 0.6896609663963318, + "learning_rate": 0.00015271346794087874, + "loss": 2.5801, + "step": 6533 + }, + { + 
"epoch": 0.5273182148333467, + "grad_norm": 0.7437502145767212, + "learning_rate": 0.00015270005186638673, + "loss": 2.6572, + "step": 6534 + }, + { + "epoch": 0.5273989185699298, + "grad_norm": 0.7013052701950073, + "learning_rate": 0.00015268663447847166, + "loss": 2.621, + "step": 6535 + }, + { + "epoch": 0.5274796223065128, + "grad_norm": 0.7161773443222046, + "learning_rate": 0.00015267321577746795, + "loss": 2.5989, + "step": 6536 + }, + { + "epoch": 0.5275603260430958, + "grad_norm": 0.7654534578323364, + "learning_rate": 0.00015265979576371, + "loss": 2.6338, + "step": 6537 + }, + { + "epoch": 0.5276410297796787, + "grad_norm": 0.694646954536438, + "learning_rate": 0.0001526463744375323, + "loss": 2.6036, + "step": 6538 + }, + { + "epoch": 0.5277217335162618, + "grad_norm": 0.6594679355621338, + "learning_rate": 0.0001526329517992693, + "loss": 2.6256, + "step": 6539 + }, + { + "epoch": 0.5278024372528448, + "grad_norm": 0.6424389481544495, + "learning_rate": 0.00015261952784925557, + "loss": 2.6389, + "step": 6540 + }, + { + "epoch": 0.5278831409894278, + "grad_norm": 0.7465235590934753, + "learning_rate": 0.0001526061025878257, + "loss": 2.5449, + "step": 6541 + }, + { + "epoch": 0.5279638447260108, + "grad_norm": 0.6900132298469543, + "learning_rate": 0.0001525926760153142, + "loss": 2.5597, + "step": 6542 + }, + { + "epoch": 0.5280445484625939, + "grad_norm": 0.7505282163619995, + "learning_rate": 0.00015257924813205572, + "loss": 2.6526, + "step": 6543 + }, + { + "epoch": 0.5281252521991768, + "grad_norm": 0.72642582654953, + "learning_rate": 0.00015256581893838495, + "loss": 2.6593, + "step": 6544 + }, + { + "epoch": 0.5282059559357598, + "grad_norm": 0.6901132464408875, + "learning_rate": 0.00015255238843463656, + "loss": 2.6726, + "step": 6545 + }, + { + "epoch": 0.5282866596723428, + "grad_norm": 0.7741395831108093, + "learning_rate": 0.0001525389566211453, + "loss": 2.5929, + "step": 6546 + }, + { + "epoch": 0.5283673634089259, + "grad_norm": 
0.7282403111457825, + "learning_rate": 0.00015252552349824585, + "loss": 2.5696, + "step": 6547 + }, + { + "epoch": 0.5284480671455088, + "grad_norm": 0.7421764731407166, + "learning_rate": 0.0001525120890662731, + "loss": 2.5593, + "step": 6548 + }, + { + "epoch": 0.5285287708820918, + "grad_norm": 0.6830468773841858, + "learning_rate": 0.00015249865332556182, + "loss": 2.6396, + "step": 6549 + }, + { + "epoch": 0.5286094746186748, + "grad_norm": 0.6758440732955933, + "learning_rate": 0.00015248521627644684, + "loss": 2.5375, + "step": 6550 + }, + { + "epoch": 0.5286901783552579, + "grad_norm": 0.6897253394126892, + "learning_rate": 0.00015247177791926308, + "loss": 2.6148, + "step": 6551 + }, + { + "epoch": 0.5287708820918409, + "grad_norm": 0.6391426920890808, + "learning_rate": 0.00015245833825434547, + "loss": 2.5563, + "step": 6552 + }, + { + "epoch": 0.5288515858284238, + "grad_norm": 0.7213610410690308, + "learning_rate": 0.00015244489728202893, + "loss": 2.6158, + "step": 6553 + }, + { + "epoch": 0.5289322895650068, + "grad_norm": 0.6678160429000854, + "learning_rate": 0.00015243145500264845, + "loss": 2.6177, + "step": 6554 + }, + { + "epoch": 0.5290129933015899, + "grad_norm": 0.7041724324226379, + "learning_rate": 0.00015241801141653905, + "loss": 2.6504, + "step": 6555 + }, + { + "epoch": 0.5290936970381729, + "grad_norm": 0.6551648378372192, + "learning_rate": 0.0001524045665240358, + "loss": 2.577, + "step": 6556 + }, + { + "epoch": 0.5291744007747559, + "grad_norm": 0.7190412878990173, + "learning_rate": 0.00015239112032547377, + "loss": 2.596, + "step": 6557 + }, + { + "epoch": 0.5292551045113388, + "grad_norm": 0.6936302781105042, + "learning_rate": 0.00015237767282118807, + "loss": 2.6551, + "step": 6558 + }, + { + "epoch": 0.5293358082479219, + "grad_norm": 0.6901839971542358, + "learning_rate": 0.0001523642240115138, + "loss": 2.6263, + "step": 6559 + }, + { + "epoch": 0.5294165119845049, + "grad_norm": 0.6905068159103394, + "learning_rate": 
0.00015235077389678624, + "loss": 2.6323, + "step": 6560 + }, + { + "epoch": 0.5294972157210879, + "grad_norm": 0.7495188117027283, + "learning_rate": 0.00015233732247734057, + "loss": 2.6243, + "step": 6561 + }, + { + "epoch": 0.5295779194576709, + "grad_norm": 0.6758708357810974, + "learning_rate": 0.00015232386975351197, + "loss": 2.6184, + "step": 6562 + }, + { + "epoch": 0.5296586231942539, + "grad_norm": 0.6443266868591309, + "learning_rate": 0.00015231041572563573, + "loss": 2.6543, + "step": 6563 + }, + { + "epoch": 0.5297393269308369, + "grad_norm": 0.7384275794029236, + "learning_rate": 0.00015229696039404723, + "loss": 2.6117, + "step": 6564 + }, + { + "epoch": 0.5298200306674199, + "grad_norm": 0.6873897314071655, + "learning_rate": 0.00015228350375908178, + "loss": 2.5689, + "step": 6565 + }, + { + "epoch": 0.5299007344040029, + "grad_norm": 0.6715645790100098, + "learning_rate": 0.00015227004582107472, + "loss": 2.5943, + "step": 6566 + }, + { + "epoch": 0.529981438140586, + "grad_norm": 0.6814208030700684, + "learning_rate": 0.00015225658658036151, + "loss": 2.5562, + "step": 6567 + }, + { + "epoch": 0.5300621418771689, + "grad_norm": 0.6942310929298401, + "learning_rate": 0.00015224312603727755, + "loss": 2.5902, + "step": 6568 + }, + { + "epoch": 0.5301428456137519, + "grad_norm": 0.6856299042701721, + "learning_rate": 0.0001522296641921583, + "loss": 2.6115, + "step": 6569 + }, + { + "epoch": 0.5302235493503349, + "grad_norm": 0.870833694934845, + "learning_rate": 0.0001522162010453393, + "loss": 2.7492, + "step": 6570 + }, + { + "epoch": 0.530304253086918, + "grad_norm": 0.6796989440917969, + "learning_rate": 0.0001522027365971561, + "loss": 2.6957, + "step": 6571 + }, + { + "epoch": 0.530384956823501, + "grad_norm": 0.7043026685714722, + "learning_rate": 0.00015218927084794423, + "loss": 2.604, + "step": 6572 + }, + { + "epoch": 0.5304656605600839, + "grad_norm": 0.7533933520317078, + "learning_rate": 0.00015217580379803933, + "loss": 2.6271, + 
"step": 6573 + }, + { + "epoch": 0.5305463642966669, + "grad_norm": 0.7526697516441345, + "learning_rate": 0.000152162335447777, + "loss": 2.553, + "step": 6574 + }, + { + "epoch": 0.53062706803325, + "grad_norm": 0.6942071318626404, + "learning_rate": 0.00015214886579749284, + "loss": 2.7206, + "step": 6575 + }, + { + "epoch": 0.530707771769833, + "grad_norm": 0.7133236527442932, + "learning_rate": 0.00015213539484752273, + "loss": 2.6545, + "step": 6576 + }, + { + "epoch": 0.530788475506416, + "grad_norm": 0.7229849696159363, + "learning_rate": 0.00015212192259820222, + "loss": 2.6647, + "step": 6577 + }, + { + "epoch": 0.5308691792429989, + "grad_norm": 0.7142449617385864, + "learning_rate": 0.0001521084490498672, + "loss": 2.5777, + "step": 6578 + }, + { + "epoch": 0.5309498829795819, + "grad_norm": 0.6950247287750244, + "learning_rate": 0.00015209497420285342, + "loss": 2.6159, + "step": 6579 + }, + { + "epoch": 0.531030586716165, + "grad_norm": 0.7492622137069702, + "learning_rate": 0.00015208149805749668, + "loss": 2.6927, + "step": 6580 + }, + { + "epoch": 0.531111290452748, + "grad_norm": 0.7618215084075928, + "learning_rate": 0.00015206802061413287, + "loss": 2.5831, + "step": 6581 + }, + { + "epoch": 0.5311919941893309, + "grad_norm": 0.7448660731315613, + "learning_rate": 0.0001520545418730979, + "loss": 2.6123, + "step": 6582 + }, + { + "epoch": 0.5312726979259139, + "grad_norm": 0.7450618147850037, + "learning_rate": 0.00015204106183472766, + "loss": 2.5768, + "step": 6583 + }, + { + "epoch": 0.531353401662497, + "grad_norm": 0.7426019310951233, + "learning_rate": 0.0001520275804993581, + "loss": 2.603, + "step": 6584 + }, + { + "epoch": 0.53143410539908, + "grad_norm": 0.7503333687782288, + "learning_rate": 0.00015201409786732526, + "loss": 2.6159, + "step": 6585 + }, + { + "epoch": 0.531514809135663, + "grad_norm": 0.6944373846054077, + "learning_rate": 0.00015200061393896513, + "loss": 2.5201, + "step": 6586 + }, + { + "epoch": 0.5315955128722459, 
+ "grad_norm": 0.6958110332489014, + "learning_rate": 0.00015198712871461375, + "loss": 2.5592, + "step": 6587 + }, + { + "epoch": 0.531676216608829, + "grad_norm": 0.7838244438171387, + "learning_rate": 0.00015197364219460727, + "loss": 2.6663, + "step": 6588 + }, + { + "epoch": 0.531756920345412, + "grad_norm": 0.754338800907135, + "learning_rate": 0.00015196015437928174, + "loss": 2.6183, + "step": 6589 + }, + { + "epoch": 0.531837624081995, + "grad_norm": 0.7394337058067322, + "learning_rate": 0.00015194666526897332, + "loss": 2.5622, + "step": 6590 + }, + { + "epoch": 0.531918327818578, + "grad_norm": 0.7352069020271301, + "learning_rate": 0.00015193317486401824, + "loss": 2.6173, + "step": 6591 + }, + { + "epoch": 0.531999031555161, + "grad_norm": 0.6318944096565247, + "learning_rate": 0.00015191968316475267, + "loss": 2.6159, + "step": 6592 + }, + { + "epoch": 0.532079735291744, + "grad_norm": 0.7071281671524048, + "learning_rate": 0.00015190619017151291, + "loss": 2.633, + "step": 6593 + }, + { + "epoch": 0.532160439028327, + "grad_norm": 0.7762585282325745, + "learning_rate": 0.00015189269588463517, + "loss": 2.6445, + "step": 6594 + }, + { + "epoch": 0.53224114276491, + "grad_norm": 0.7979930639266968, + "learning_rate": 0.0001518792003044558, + "loss": 2.5825, + "step": 6595 + }, + { + "epoch": 0.5323218465014931, + "grad_norm": 0.7355580925941467, + "learning_rate": 0.00015186570343131114, + "loss": 2.6197, + "step": 6596 + }, + { + "epoch": 0.532402550238076, + "grad_norm": 0.7286938428878784, + "learning_rate": 0.0001518522052655376, + "loss": 2.6385, + "step": 6597 + }, + { + "epoch": 0.532483253974659, + "grad_norm": 0.689143180847168, + "learning_rate": 0.00015183870580747156, + "loss": 2.6593, + "step": 6598 + }, + { + "epoch": 0.532563957711242, + "grad_norm": 0.714746356010437, + "learning_rate": 0.00015182520505744945, + "loss": 2.6059, + "step": 6599 + }, + { + "epoch": 0.5326446614478251, + "grad_norm": 0.8055040240287781, + "learning_rate": 
0.00015181170301580777, + "loss": 2.6983, + "step": 6600 + }, + { + "epoch": 0.532725365184408, + "grad_norm": 0.7104170918464661, + "learning_rate": 0.00015179819968288297, + "loss": 2.6578, + "step": 6601 + }, + { + "epoch": 0.532806068920991, + "grad_norm": 0.7175524830818176, + "learning_rate": 0.0001517846950590117, + "loss": 2.6263, + "step": 6602 + }, + { + "epoch": 0.532886772657574, + "grad_norm": 0.6755492091178894, + "learning_rate": 0.00015177118914453042, + "loss": 2.5752, + "step": 6603 + }, + { + "epoch": 0.5329674763941571, + "grad_norm": 0.7020289897918701, + "learning_rate": 0.00015175768193977578, + "loss": 2.6186, + "step": 6604 + }, + { + "epoch": 0.5330481801307401, + "grad_norm": 0.7550958395004272, + "learning_rate": 0.0001517441734450844, + "loss": 2.628, + "step": 6605 + }, + { + "epoch": 0.533128883867323, + "grad_norm": 0.6697603464126587, + "learning_rate": 0.00015173066366079297, + "loss": 2.6433, + "step": 6606 + }, + { + "epoch": 0.533209587603906, + "grad_norm": 0.715372622013092, + "learning_rate": 0.0001517171525872382, + "loss": 2.6022, + "step": 6607 + }, + { + "epoch": 0.5332902913404891, + "grad_norm": 0.7081933617591858, + "learning_rate": 0.00015170364022475675, + "loss": 2.675, + "step": 6608 + }, + { + "epoch": 0.5333709950770721, + "grad_norm": 0.7074152231216431, + "learning_rate": 0.00015169012657368546, + "loss": 2.6637, + "step": 6609 + }, + { + "epoch": 0.5334516988136551, + "grad_norm": 0.6692848801612854, + "learning_rate": 0.00015167661163436108, + "loss": 2.5855, + "step": 6610 + }, + { + "epoch": 0.533532402550238, + "grad_norm": 0.7307556867599487, + "learning_rate": 0.00015166309540712048, + "loss": 2.6105, + "step": 6611 + }, + { + "epoch": 0.5336131062868211, + "grad_norm": 0.7026669383049011, + "learning_rate": 0.00015164957789230048, + "loss": 2.6656, + "step": 6612 + }, + { + "epoch": 0.5336938100234041, + "grad_norm": 0.6579706072807312, + "learning_rate": 0.000151636059090238, + "loss": 2.6456, + 
"step": 6613 + }, + { + "epoch": 0.5337745137599871, + "grad_norm": 0.6854498386383057, + "learning_rate": 0.00015162253900126993, + "loss": 2.5969, + "step": 6614 + }, + { + "epoch": 0.5338552174965701, + "grad_norm": 0.7542434334754944, + "learning_rate": 0.00015160901762573323, + "loss": 2.6333, + "step": 6615 + }, + { + "epoch": 0.5339359212331531, + "grad_norm": 0.6795105934143066, + "learning_rate": 0.0001515954949639649, + "loss": 2.6268, + "step": 6616 + }, + { + "epoch": 0.5340166249697361, + "grad_norm": 0.6395254135131836, + "learning_rate": 0.000151581971016302, + "loss": 2.5684, + "step": 6617 + }, + { + "epoch": 0.5340973287063191, + "grad_norm": 0.7069850564002991, + "learning_rate": 0.00015156844578308155, + "loss": 2.64, + "step": 6618 + }, + { + "epoch": 0.5341780324429021, + "grad_norm": 0.6779203414916992, + "learning_rate": 0.0001515549192646406, + "loss": 2.6255, + "step": 6619 + }, + { + "epoch": 0.5342587361794852, + "grad_norm": 0.6403560638427734, + "learning_rate": 0.00015154139146131632, + "loss": 2.611, + "step": 6620 + }, + { + "epoch": 0.5343394399160681, + "grad_norm": 0.7532669901847839, + "learning_rate": 0.00015152786237344583, + "loss": 2.5641, + "step": 6621 + }, + { + "epoch": 0.5344201436526511, + "grad_norm": 0.6827573776245117, + "learning_rate": 0.00015151433200136629, + "loss": 2.6096, + "step": 6622 + }, + { + "epoch": 0.5345008473892341, + "grad_norm": 0.6691904067993164, + "learning_rate": 0.000151500800345415, + "loss": 2.6602, + "step": 6623 + }, + { + "epoch": 0.5345815511258172, + "grad_norm": 0.7288634777069092, + "learning_rate": 0.00015148726740592906, + "loss": 2.6468, + "step": 6624 + }, + { + "epoch": 0.5346622548624002, + "grad_norm": 0.7087839245796204, + "learning_rate": 0.00015147373318324586, + "loss": 2.5795, + "step": 6625 + }, + { + "epoch": 0.5347429585989831, + "grad_norm": 0.6618373394012451, + "learning_rate": 0.00015146019767770267, + "loss": 2.638, + "step": 6626 + }, + { + "epoch": 
0.5348236623355661, + "grad_norm": 0.7384989857673645, + "learning_rate": 0.00015144666088963684, + "loss": 2.6104, + "step": 6627 + }, + { + "epoch": 0.5349043660721492, + "grad_norm": 0.6662275195121765, + "learning_rate": 0.00015143312281938576, + "loss": 2.6174, + "step": 6628 + }, + { + "epoch": 0.5349850698087322, + "grad_norm": 0.6617184281349182, + "learning_rate": 0.0001514195834672868, + "loss": 2.6154, + "step": 6629 + }, + { + "epoch": 0.5350657735453151, + "grad_norm": 0.7173622846603394, + "learning_rate": 0.0001514060428336774, + "loss": 2.5741, + "step": 6630 + }, + { + "epoch": 0.5351464772818981, + "grad_norm": 0.7773584127426147, + "learning_rate": 0.00015139250091889502, + "loss": 2.6333, + "step": 6631 + }, + { + "epoch": 0.5352271810184811, + "grad_norm": 0.7255204916000366, + "learning_rate": 0.0001513789577232772, + "loss": 2.5459, + "step": 6632 + }, + { + "epoch": 0.5353078847550642, + "grad_norm": 0.7308403849601746, + "learning_rate": 0.00015136541324716144, + "loss": 2.5934, + "step": 6633 + }, + { + "epoch": 0.5353885884916472, + "grad_norm": 0.699367880821228, + "learning_rate": 0.0001513518674908853, + "loss": 2.6797, + "step": 6634 + }, + { + "epoch": 0.5354692922282301, + "grad_norm": 0.7236449718475342, + "learning_rate": 0.0001513383204547864, + "loss": 2.6289, + "step": 6635 + }, + { + "epoch": 0.5355499959648131, + "grad_norm": 0.6860557794570923, + "learning_rate": 0.00015132477213920234, + "loss": 2.6736, + "step": 6636 + }, + { + "epoch": 0.5356306997013962, + "grad_norm": 0.6724153161048889, + "learning_rate": 0.00015131122254447084, + "loss": 2.5581, + "step": 6637 + }, + { + "epoch": 0.5357114034379792, + "grad_norm": 0.6818630695343018, + "learning_rate": 0.00015129767167092949, + "loss": 2.5979, + "step": 6638 + }, + { + "epoch": 0.5357921071745622, + "grad_norm": 0.6956631541252136, + "learning_rate": 0.00015128411951891607, + "loss": 2.6116, + "step": 6639 + }, + { + "epoch": 0.5358728109111451, + "grad_norm": 
0.6698076128959656, + "learning_rate": 0.00015127056608876837, + "loss": 2.65, + "step": 6640 + }, + { + "epoch": 0.5359535146477282, + "grad_norm": 0.7763264179229736, + "learning_rate": 0.00015125701138082415, + "loss": 2.6164, + "step": 6641 + }, + { + "epoch": 0.5360342183843112, + "grad_norm": 0.7148340940475464, + "learning_rate": 0.00015124345539542118, + "loss": 2.6467, + "step": 6642 + }, + { + "epoch": 0.5361149221208942, + "grad_norm": 0.7350041270256042, + "learning_rate": 0.00015122989813289733, + "loss": 2.6477, + "step": 6643 + }, + { + "epoch": 0.5361956258574772, + "grad_norm": 0.6993441581726074, + "learning_rate": 0.00015121633959359055, + "loss": 2.7526, + "step": 6644 + }, + { + "epoch": 0.5362763295940602, + "grad_norm": 0.6828470826148987, + "learning_rate": 0.00015120277977783873, + "loss": 2.6439, + "step": 6645 + }, + { + "epoch": 0.5363570333306432, + "grad_norm": 0.7076796889305115, + "learning_rate": 0.0001511892186859797, + "loss": 2.6375, + "step": 6646 + }, + { + "epoch": 0.5364377370672262, + "grad_norm": 0.6830769777297974, + "learning_rate": 0.0001511756563183516, + "loss": 2.6052, + "step": 6647 + }, + { + "epoch": 0.5365184408038092, + "grad_norm": 0.6482179760932922, + "learning_rate": 0.00015116209267529237, + "loss": 2.6251, + "step": 6648 + }, + { + "epoch": 0.5365991445403923, + "grad_norm": 0.6687620878219604, + "learning_rate": 0.00015114852775714, + "loss": 2.659, + "step": 6649 + }, + { + "epoch": 0.5366798482769752, + "grad_norm": 0.734108030796051, + "learning_rate": 0.0001511349615642327, + "loss": 2.6542, + "step": 6650 + }, + { + "epoch": 0.5367605520135582, + "grad_norm": 0.7092111706733704, + "learning_rate": 0.00015112139409690842, + "loss": 2.6228, + "step": 6651 + }, + { + "epoch": 0.5368412557501412, + "grad_norm": 0.6544996500015259, + "learning_rate": 0.0001511078253555054, + "loss": 2.5661, + "step": 6652 + }, + { + "epoch": 0.5369219594867243, + "grad_norm": 0.7012531161308289, + "learning_rate": 
0.00015109425534036176, + "loss": 2.6447, + "step": 6653 + }, + { + "epoch": 0.5370026632233073, + "grad_norm": 0.6813335418701172, + "learning_rate": 0.0001510806840518157, + "loss": 2.5723, + "step": 6654 + }, + { + "epoch": 0.5370833669598902, + "grad_norm": 0.6711288094520569, + "learning_rate": 0.0001510671114902055, + "loss": 2.6096, + "step": 6655 + }, + { + "epoch": 0.5371640706964732, + "grad_norm": 0.721866250038147, + "learning_rate": 0.00015105353765586935, + "loss": 2.6167, + "step": 6656 + }, + { + "epoch": 0.5372447744330563, + "grad_norm": 0.8140639066696167, + "learning_rate": 0.00015103996254914562, + "loss": 2.5768, + "step": 6657 + }, + { + "epoch": 0.5373254781696393, + "grad_norm": 0.6859177947044373, + "learning_rate": 0.0001510263861703726, + "loss": 2.5638, + "step": 6658 + }, + { + "epoch": 0.5374061819062222, + "grad_norm": 0.7254204154014587, + "learning_rate": 0.00015101280851988864, + "loss": 2.5855, + "step": 6659 + }, + { + "epoch": 0.5374868856428052, + "grad_norm": 0.7181829810142517, + "learning_rate": 0.00015099922959803218, + "loss": 2.5358, + "step": 6660 + }, + { + "epoch": 0.5375675893793883, + "grad_norm": 0.7092663645744324, + "learning_rate": 0.00015098564940514155, + "loss": 2.679, + "step": 6661 + }, + { + "epoch": 0.5376482931159713, + "grad_norm": 0.7126225233078003, + "learning_rate": 0.00015097206794155527, + "loss": 2.6167, + "step": 6662 + }, + { + "epoch": 0.5377289968525543, + "grad_norm": 0.7469925880432129, + "learning_rate": 0.00015095848520761186, + "loss": 2.5906, + "step": 6663 + }, + { + "epoch": 0.5378097005891372, + "grad_norm": 0.6911186575889587, + "learning_rate": 0.00015094490120364973, + "loss": 2.6488, + "step": 6664 + }, + { + "epoch": 0.5378904043257203, + "grad_norm": 0.6579635143280029, + "learning_rate": 0.00015093131593000753, + "loss": 2.5894, + "step": 6665 + }, + { + "epoch": 0.5379711080623033, + "grad_norm": 0.7107242345809937, + "learning_rate": 0.00015091772938702377, + "loss": 2.6568, 
+ "step": 6666 + }, + { + "epoch": 0.5380518117988863, + "grad_norm": 0.6845428943634033, + "learning_rate": 0.00015090414157503714, + "loss": 2.5697, + "step": 6667 + }, + { + "epoch": 0.5381325155354693, + "grad_norm": 0.6713212132453918, + "learning_rate": 0.00015089055249438622, + "loss": 2.5747, + "step": 6668 + }, + { + "epoch": 0.5382132192720523, + "grad_norm": 0.7091513276100159, + "learning_rate": 0.0001508769621454097, + "loss": 2.6765, + "step": 6669 + }, + { + "epoch": 0.5382939230086353, + "grad_norm": 0.7403436899185181, + "learning_rate": 0.00015086337052844627, + "loss": 2.6841, + "step": 6670 + }, + { + "epoch": 0.5383746267452183, + "grad_norm": 0.6745626330375671, + "learning_rate": 0.0001508497776438347, + "loss": 2.6436, + "step": 6671 + }, + { + "epoch": 0.5384553304818013, + "grad_norm": 0.7491294145584106, + "learning_rate": 0.00015083618349191372, + "loss": 2.6376, + "step": 6672 + }, + { + "epoch": 0.5385360342183844, + "grad_norm": 0.719761848449707, + "learning_rate": 0.00015082258807302222, + "loss": 2.5885, + "step": 6673 + }, + { + "epoch": 0.5386167379549673, + "grad_norm": 0.7302667498588562, + "learning_rate": 0.00015080899138749895, + "loss": 2.7019, + "step": 6674 + }, + { + "epoch": 0.5386974416915503, + "grad_norm": 0.7640584111213684, + "learning_rate": 0.0001507953934356828, + "loss": 2.6404, + "step": 6675 + }, + { + "epoch": 0.5387781454281333, + "grad_norm": 0.699515700340271, + "learning_rate": 0.0001507817942179127, + "loss": 2.6407, + "step": 6676 + }, + { + "epoch": 0.5388588491647164, + "grad_norm": 0.7305224537849426, + "learning_rate": 0.00015076819373452746, + "loss": 2.5994, + "step": 6677 + }, + { + "epoch": 0.5389395529012994, + "grad_norm": 0.7125952243804932, + "learning_rate": 0.00015075459198586616, + "loss": 2.6472, + "step": 6678 + }, + { + "epoch": 0.5390202566378823, + "grad_norm": 0.7077293395996094, + "learning_rate": 0.00015074098897226778, + "loss": 2.6168, + "step": 6679 + }, + { + "epoch": 
0.5391009603744653, + "grad_norm": 0.6713843941688538, + "learning_rate": 0.00015072738469407127, + "loss": 2.5736, + "step": 6680 + }, + { + "epoch": 0.5391816641110483, + "grad_norm": 0.7101294994354248, + "learning_rate": 0.00015071377915161578, + "loss": 2.6994, + "step": 6681 + }, + { + "epoch": 0.5392623678476314, + "grad_norm": 0.7132740020751953, + "learning_rate": 0.00015070017234524032, + "loss": 2.586, + "step": 6682 + }, + { + "epoch": 0.5393430715842144, + "grad_norm": 0.7043401598930359, + "learning_rate": 0.00015068656427528402, + "loss": 2.6025, + "step": 6683 + }, + { + "epoch": 0.5394237753207973, + "grad_norm": 0.6831551194190979, + "learning_rate": 0.00015067295494208607, + "loss": 2.6183, + "step": 6684 + }, + { + "epoch": 0.5395044790573803, + "grad_norm": 0.7066370844841003, + "learning_rate": 0.0001506593443459856, + "loss": 2.6467, + "step": 6685 + }, + { + "epoch": 0.5395851827939634, + "grad_norm": 0.7908033132553101, + "learning_rate": 0.0001506457324873219, + "loss": 2.6929, + "step": 6686 + }, + { + "epoch": 0.5396658865305464, + "grad_norm": 0.7186774611473083, + "learning_rate": 0.00015063211936643407, + "loss": 2.5841, + "step": 6687 + }, + { + "epoch": 0.5397465902671293, + "grad_norm": 0.6634512543678284, + "learning_rate": 0.0001506185049836615, + "loss": 2.5517, + "step": 6688 + }, + { + "epoch": 0.5398272940037123, + "grad_norm": 0.734406590461731, + "learning_rate": 0.00015060488933934353, + "loss": 2.6317, + "step": 6689 + }, + { + "epoch": 0.5399079977402954, + "grad_norm": 0.7754772305488586, + "learning_rate": 0.00015059127243381937, + "loss": 2.6885, + "step": 6690 + }, + { + "epoch": 0.5399887014768784, + "grad_norm": 0.7636603713035583, + "learning_rate": 0.00015057765426742848, + "loss": 2.5767, + "step": 6691 + }, + { + "epoch": 0.5400694052134614, + "grad_norm": 0.6621577143669128, + "learning_rate": 0.00015056403484051017, + "loss": 2.5905, + "step": 6692 + }, + { + "epoch": 0.5401501089500443, + "grad_norm": 
0.7605881094932556, + "learning_rate": 0.00015055041415340404, + "loss": 2.6166, + "step": 6693 + }, + { + "epoch": 0.5402308126866274, + "grad_norm": 0.7603485584259033, + "learning_rate": 0.0001505367922064494, + "loss": 2.6123, + "step": 6694 + }, + { + "epoch": 0.5403115164232104, + "grad_norm": 0.7021469473838806, + "learning_rate": 0.0001505231689999858, + "loss": 2.6754, + "step": 6695 + }, + { + "epoch": 0.5403922201597934, + "grad_norm": 0.7291955947875977, + "learning_rate": 0.00015050954453435273, + "loss": 2.6393, + "step": 6696 + }, + { + "epoch": 0.5404729238963764, + "grad_norm": 0.6658700704574585, + "learning_rate": 0.00015049591880988977, + "loss": 2.5888, + "step": 6697 + }, + { + "epoch": 0.5405536276329594, + "grad_norm": 0.7080146074295044, + "learning_rate": 0.00015048229182693657, + "loss": 2.6318, + "step": 6698 + }, + { + "epoch": 0.5406343313695424, + "grad_norm": 0.7440849542617798, + "learning_rate": 0.00015046866358583267, + "loss": 2.596, + "step": 6699 + }, + { + "epoch": 0.5407150351061254, + "grad_norm": 0.886578381061554, + "learning_rate": 0.00015045503408691775, + "loss": 2.6479, + "step": 6700 + }, + { + "epoch": 0.5407957388427084, + "grad_norm": 0.7221408486366272, + "learning_rate": 0.00015044140333053148, + "loss": 2.625, + "step": 6701 + }, + { + "epoch": 0.5408764425792915, + "grad_norm": 0.7193209528923035, + "learning_rate": 0.0001504277713170136, + "loss": 2.6044, + "step": 6702 + }, + { + "epoch": 0.5409571463158744, + "grad_norm": 0.7139819860458374, + "learning_rate": 0.00015041413804670384, + "loss": 2.5572, + "step": 6703 + }, + { + "epoch": 0.5410378500524574, + "grad_norm": 0.728875994682312, + "learning_rate": 0.00015040050351994196, + "loss": 2.6373, + "step": 6704 + }, + { + "epoch": 0.5411185537890404, + "grad_norm": 0.6794858574867249, + "learning_rate": 0.0001503868677370678, + "loss": 2.6265, + "step": 6705 + }, + { + "epoch": 0.5411992575256235, + "grad_norm": 0.6874774098396301, + "learning_rate": 
0.00015037323069842117, + "loss": 2.6146, + "step": 6706 + }, + { + "epoch": 0.5412799612622065, + "grad_norm": 0.7064409255981445, + "learning_rate": 0.00015035959240434197, + "loss": 2.6126, + "step": 6707 + }, + { + "epoch": 0.5413606649987894, + "grad_norm": 0.7212977409362793, + "learning_rate": 0.00015034595285517006, + "loss": 2.6836, + "step": 6708 + }, + { + "epoch": 0.5414413687353724, + "grad_norm": 0.7826492190361023, + "learning_rate": 0.0001503323120512454, + "loss": 2.6648, + "step": 6709 + }, + { + "epoch": 0.5415220724719555, + "grad_norm": 0.7228415608406067, + "learning_rate": 0.000150318669992908, + "loss": 2.5734, + "step": 6710 + }, + { + "epoch": 0.5416027762085385, + "grad_norm": 0.6929590702056885, + "learning_rate": 0.00015030502668049778, + "loss": 2.6023, + "step": 6711 + }, + { + "epoch": 0.5416834799451214, + "grad_norm": 0.679990291595459, + "learning_rate": 0.0001502913821143548, + "loss": 2.5867, + "step": 6712 + }, + { + "epoch": 0.5417641836817044, + "grad_norm": 0.7324180603027344, + "learning_rate": 0.00015027773629481907, + "loss": 2.5722, + "step": 6713 + }, + { + "epoch": 0.5418448874182875, + "grad_norm": 0.686826765537262, + "learning_rate": 0.00015026408922223078, + "loss": 2.6138, + "step": 6714 + }, + { + "epoch": 0.5419255911548705, + "grad_norm": 0.7045193314552307, + "learning_rate": 0.00015025044089693, + "loss": 2.619, + "step": 6715 + }, + { + "epoch": 0.5420062948914535, + "grad_norm": 0.6839936375617981, + "learning_rate": 0.00015023679131925683, + "loss": 2.5778, + "step": 6716 + }, + { + "epoch": 0.5420869986280364, + "grad_norm": 0.7613961696624756, + "learning_rate": 0.00015022314048955153, + "loss": 2.6262, + "step": 6717 + }, + { + "epoch": 0.5421677023646195, + "grad_norm": 0.7867478728294373, + "learning_rate": 0.00015020948840815428, + "loss": 2.6576, + "step": 6718 + }, + { + "epoch": 0.5422484061012025, + "grad_norm": 0.7371038794517517, + "learning_rate": 0.0001501958350754053, + "loss": 2.6495, + 
"step": 6719 + }, + { + "epoch": 0.5423291098377855, + "grad_norm": 0.7146512269973755, + "learning_rate": 0.00015018218049164494, + "loss": 2.6514, + "step": 6720 + }, + { + "epoch": 0.5424098135743685, + "grad_norm": 0.7507650256156921, + "learning_rate": 0.00015016852465721346, + "loss": 2.6509, + "step": 6721 + }, + { + "epoch": 0.5424905173109515, + "grad_norm": 0.6786547303199768, + "learning_rate": 0.0001501548675724512, + "loss": 2.5983, + "step": 6722 + }, + { + "epoch": 0.5425712210475345, + "grad_norm": 0.7077932357788086, + "learning_rate": 0.0001501412092376985, + "loss": 2.622, + "step": 6723 + }, + { + "epoch": 0.5426519247841175, + "grad_norm": 0.7191271781921387, + "learning_rate": 0.00015012754965329584, + "loss": 2.6632, + "step": 6724 + }, + { + "epoch": 0.5427326285207005, + "grad_norm": 0.6785906553268433, + "learning_rate": 0.00015011388881958356, + "loss": 2.6312, + "step": 6725 + }, + { + "epoch": 0.5428133322572836, + "grad_norm": 0.6880263090133667, + "learning_rate": 0.00015010022673690222, + "loss": 2.5951, + "step": 6726 + }, + { + "epoch": 0.5428940359938665, + "grad_norm": 0.7769095301628113, + "learning_rate": 0.0001500865634055923, + "loss": 2.5503, + "step": 6727 + }, + { + "epoch": 0.5429747397304495, + "grad_norm": 0.6847476959228516, + "learning_rate": 0.0001500728988259942, + "loss": 2.6824, + "step": 6728 + }, + { + "epoch": 0.5430554434670325, + "grad_norm": 0.6829310059547424, + "learning_rate": 0.00015005923299844863, + "loss": 2.5683, + "step": 6729 + }, + { + "epoch": 0.5431361472036156, + "grad_norm": 0.7436082363128662, + "learning_rate": 0.0001500455659232961, + "loss": 2.6165, + "step": 6730 + }, + { + "epoch": 0.5432168509401986, + "grad_norm": 0.7876375913619995, + "learning_rate": 0.00015003189760087724, + "loss": 2.6203, + "step": 6731 + }, + { + "epoch": 0.5432975546767815, + "grad_norm": 0.6869253516197205, + "learning_rate": 0.0001500182280315327, + "loss": 2.6136, + "step": 6732 + }, + { + "epoch": 
0.5433782584133645, + "grad_norm": 0.7179432511329651, + "learning_rate": 0.00015000455721560316, + "loss": 2.6049, + "step": 6733 + }, + { + "epoch": 0.5434589621499475, + "grad_norm": 0.7286917567253113, + "learning_rate": 0.00014999088515342939, + "loss": 2.5704, + "step": 6734 + }, + { + "epoch": 0.5435396658865306, + "grad_norm": 0.6841779351234436, + "learning_rate": 0.00014997721184535206, + "loss": 2.6095, + "step": 6735 + }, + { + "epoch": 0.5436203696231136, + "grad_norm": 0.7661791443824768, + "learning_rate": 0.00014996353729171196, + "loss": 2.6193, + "step": 6736 + }, + { + "epoch": 0.5437010733596965, + "grad_norm": 0.7365885376930237, + "learning_rate": 0.0001499498614928499, + "loss": 2.586, + "step": 6737 + }, + { + "epoch": 0.5437817770962795, + "grad_norm": 0.7423815131187439, + "learning_rate": 0.00014993618444910674, + "loss": 2.6199, + "step": 6738 + }, + { + "epoch": 0.5438624808328626, + "grad_norm": 0.7667781114578247, + "learning_rate": 0.0001499225061608233, + "loss": 2.6584, + "step": 6739 + }, + { + "epoch": 0.5439431845694456, + "grad_norm": 0.7148830890655518, + "learning_rate": 0.00014990882662834057, + "loss": 2.7172, + "step": 6740 + }, + { + "epoch": 0.5440238883060285, + "grad_norm": 0.7206205725669861, + "learning_rate": 0.00014989514585199936, + "loss": 2.5682, + "step": 6741 + }, + { + "epoch": 0.5441045920426115, + "grad_norm": 0.7306448221206665, + "learning_rate": 0.0001498814638321407, + "loss": 2.6724, + "step": 6742 + }, + { + "epoch": 0.5441852957791946, + "grad_norm": 0.7058824896812439, + "learning_rate": 0.00014986778056910556, + "loss": 2.6573, + "step": 6743 + }, + { + "epoch": 0.5442659995157776, + "grad_norm": 0.770588755607605, + "learning_rate": 0.000149854096063235, + "loss": 2.658, + "step": 6744 + }, + { + "epoch": 0.5443467032523606, + "grad_norm": 0.8283931612968445, + "learning_rate": 0.00014984041031487001, + "loss": 2.6624, + "step": 6745 + }, + { + "epoch": 0.5444274069889435, + "grad_norm": 
0.6814693808555603, + "learning_rate": 0.00014982672332435176, + "loss": 2.5835, + "step": 6746 + }, + { + "epoch": 0.5445081107255266, + "grad_norm": 0.7059363722801208, + "learning_rate": 0.00014981303509202127, + "loss": 2.5977, + "step": 6747 + }, + { + "epoch": 0.5445888144621096, + "grad_norm": 0.6678106188774109, + "learning_rate": 0.00014979934561821975, + "loss": 2.6479, + "step": 6748 + }, + { + "epoch": 0.5446695181986926, + "grad_norm": 0.8167592883110046, + "learning_rate": 0.00014978565490328835, + "loss": 2.6529, + "step": 6749 + }, + { + "epoch": 0.5447502219352756, + "grad_norm": 0.807209849357605, + "learning_rate": 0.00014977196294756832, + "loss": 2.6546, + "step": 6750 + }, + { + "epoch": 0.5448309256718586, + "grad_norm": 0.7099517583847046, + "learning_rate": 0.00014975826975140085, + "loss": 2.6178, + "step": 6751 + }, + { + "epoch": 0.5449116294084416, + "grad_norm": 0.7900758981704712, + "learning_rate": 0.0001497445753151272, + "loss": 2.586, + "step": 6752 + }, + { + "epoch": 0.5449923331450246, + "grad_norm": 0.6826134920120239, + "learning_rate": 0.00014973087963908875, + "loss": 2.5914, + "step": 6753 + }, + { + "epoch": 0.5450730368816076, + "grad_norm": 0.7383863925933838, + "learning_rate": 0.0001497171827236268, + "loss": 2.6357, + "step": 6754 + }, + { + "epoch": 0.5451537406181907, + "grad_norm": 0.7208051085472107, + "learning_rate": 0.0001497034845690826, + "loss": 2.5435, + "step": 6755 + }, + { + "epoch": 0.5452344443547736, + "grad_norm": 0.680794894695282, + "learning_rate": 0.00014968978517579772, + "loss": 2.5691, + "step": 6756 + }, + { + "epoch": 0.5453151480913566, + "grad_norm": 0.680759847164154, + "learning_rate": 0.00014967608454411347, + "loss": 2.5761, + "step": 6757 + }, + { + "epoch": 0.5453958518279396, + "grad_norm": 0.719634473323822, + "learning_rate": 0.00014966238267437134, + "loss": 2.637, + "step": 6758 + }, + { + "epoch": 0.5454765555645227, + "grad_norm": 0.777302086353302, + "learning_rate": 
0.0001496486795669128, + "loss": 2.6457, + "step": 6759 + }, + { + "epoch": 0.5455572593011057, + "grad_norm": 0.6875059604644775, + "learning_rate": 0.0001496349752220794, + "loss": 2.6116, + "step": 6760 + }, + { + "epoch": 0.5456379630376886, + "grad_norm": 0.6884258985519409, + "learning_rate": 0.0001496212696402127, + "loss": 2.5863, + "step": 6761 + }, + { + "epoch": 0.5457186667742716, + "grad_norm": 0.6667922139167786, + "learning_rate": 0.00014960756282165422, + "loss": 2.5892, + "step": 6762 + }, + { + "epoch": 0.5457993705108547, + "grad_norm": 0.6712725162506104, + "learning_rate": 0.00014959385476674559, + "loss": 2.5478, + "step": 6763 + }, + { + "epoch": 0.5458800742474377, + "grad_norm": 0.6803874969482422, + "learning_rate": 0.00014958014547582845, + "loss": 2.5785, + "step": 6764 + }, + { + "epoch": 0.5459607779840207, + "grad_norm": 0.6975811123847961, + "learning_rate": 0.0001495664349492445, + "loss": 2.5765, + "step": 6765 + }, + { + "epoch": 0.5460414817206036, + "grad_norm": 0.7676273584365845, + "learning_rate": 0.00014955272318733544, + "loss": 2.634, + "step": 6766 + }, + { + "epoch": 0.5461221854571867, + "grad_norm": 0.7044547200202942, + "learning_rate": 0.000149539010190443, + "loss": 2.646, + "step": 6767 + }, + { + "epoch": 0.5462028891937697, + "grad_norm": 0.7453166842460632, + "learning_rate": 0.00014952529595890887, + "loss": 2.6137, + "step": 6768 + }, + { + "epoch": 0.5462835929303527, + "grad_norm": 0.7281681299209595, + "learning_rate": 0.00014951158049307493, + "loss": 2.6558, + "step": 6769 + }, + { + "epoch": 0.5463642966669356, + "grad_norm": 0.7131047248840332, + "learning_rate": 0.00014949786379328298, + "loss": 2.6441, + "step": 6770 + }, + { + "epoch": 0.5464450004035187, + "grad_norm": 0.7072219848632812, + "learning_rate": 0.00014948414585987487, + "loss": 2.5861, + "step": 6771 + }, + { + "epoch": 0.5465257041401017, + "grad_norm": 0.7270335555076599, + "learning_rate": 0.00014947042669319252, + "loss": 2.6703, + 
"step": 6772 + }, + { + "epoch": 0.5466064078766847, + "grad_norm": 0.7314150929450989, + "learning_rate": 0.0001494567062935778, + "loss": 2.6101, + "step": 6773 + }, + { + "epoch": 0.5466871116132677, + "grad_norm": 0.8168460130691528, + "learning_rate": 0.00014944298466137266, + "loss": 2.662, + "step": 6774 + }, + { + "epoch": 0.5467678153498507, + "grad_norm": 0.7338390350341797, + "learning_rate": 0.00014942926179691913, + "loss": 2.6481, + "step": 6775 + }, + { + "epoch": 0.5468485190864337, + "grad_norm": 0.7065639495849609, + "learning_rate": 0.00014941553770055917, + "loss": 2.6192, + "step": 6776 + }, + { + "epoch": 0.5469292228230167, + "grad_norm": 0.7675396203994751, + "learning_rate": 0.00014940181237263483, + "loss": 2.5828, + "step": 6777 + }, + { + "epoch": 0.5470099265595997, + "grad_norm": 0.7085692286491394, + "learning_rate": 0.0001493880858134882, + "loss": 2.5815, + "step": 6778 + }, + { + "epoch": 0.5470906302961828, + "grad_norm": 0.757591187953949, + "learning_rate": 0.00014937435802346135, + "loss": 2.691, + "step": 6779 + }, + { + "epoch": 0.5471713340327657, + "grad_norm": 0.7299168705940247, + "learning_rate": 0.00014936062900289647, + "loss": 2.6246, + "step": 6780 + }, + { + "epoch": 0.5472520377693487, + "grad_norm": 0.693692684173584, + "learning_rate": 0.00014934689875213564, + "loss": 2.6149, + "step": 6781 + }, + { + "epoch": 0.5473327415059317, + "grad_norm": 0.733657956123352, + "learning_rate": 0.00014933316727152113, + "loss": 2.582, + "step": 6782 + }, + { + "epoch": 0.5474134452425147, + "grad_norm": 0.6881953477859497, + "learning_rate": 0.00014931943456139514, + "loss": 2.6023, + "step": 6783 + }, + { + "epoch": 0.5474941489790978, + "grad_norm": 0.7102411985397339, + "learning_rate": 0.00014930570062209988, + "loss": 2.6296, + "step": 6784 + }, + { + "epoch": 0.5475748527156807, + "grad_norm": 0.7263364791870117, + "learning_rate": 0.00014929196545397771, + "loss": 2.6414, + "step": 6785 + }, + { + "epoch": 
0.5476555564522637, + "grad_norm": 0.7239066958427429, + "learning_rate": 0.00014927822905737092, + "loss": 2.6174, + "step": 6786 + }, + { + "epoch": 0.5477362601888467, + "grad_norm": 0.6909911632537842, + "learning_rate": 0.0001492644914326218, + "loss": 2.6036, + "step": 6787 + }, + { + "epoch": 0.5478169639254298, + "grad_norm": 0.719693124294281, + "learning_rate": 0.00014925075258007283, + "loss": 2.6507, + "step": 6788 + }, + { + "epoch": 0.5478976676620128, + "grad_norm": 0.7722225785255432, + "learning_rate": 0.0001492370125000663, + "loss": 2.6268, + "step": 6789 + }, + { + "epoch": 0.5479783713985957, + "grad_norm": 0.7456568479537964, + "learning_rate": 0.00014922327119294476, + "loss": 2.6426, + "step": 6790 + }, + { + "epoch": 0.5480590751351787, + "grad_norm": 0.7430242300033569, + "learning_rate": 0.00014920952865905062, + "loss": 2.6632, + "step": 6791 + }, + { + "epoch": 0.5481397788717618, + "grad_norm": 0.7363260388374329, + "learning_rate": 0.0001491957848987264, + "loss": 2.6021, + "step": 6792 + }, + { + "epoch": 0.5482204826083448, + "grad_norm": 0.6903972029685974, + "learning_rate": 0.00014918203991231462, + "loss": 2.6086, + "step": 6793 + }, + { + "epoch": 0.5483011863449277, + "grad_norm": 0.6765161752700806, + "learning_rate": 0.00014916829370015781, + "loss": 2.5806, + "step": 6794 + }, + { + "epoch": 0.5483818900815107, + "grad_norm": 0.7533403635025024, + "learning_rate": 0.0001491545462625986, + "loss": 2.6351, + "step": 6795 + }, + { + "epoch": 0.5484625938180938, + "grad_norm": 0.6841829419136047, + "learning_rate": 0.00014914079759997963, + "loss": 2.606, + "step": 6796 + }, + { + "epoch": 0.5485432975546768, + "grad_norm": 0.7671411037445068, + "learning_rate": 0.00014912704771264353, + "loss": 2.6645, + "step": 6797 + }, + { + "epoch": 0.5486240012912598, + "grad_norm": 0.7218797206878662, + "learning_rate": 0.00014911329660093295, + "loss": 2.6302, + "step": 6798 + }, + { + "epoch": 0.5487047050278427, + "grad_norm": 
0.7269994020462036, + "learning_rate": 0.00014909954426519067, + "loss": 2.6261, + "step": 6799 + }, + { + "epoch": 0.5487854087644258, + "grad_norm": 0.765353262424469, + "learning_rate": 0.00014908579070575936, + "loss": 2.5787, + "step": 6800 + }, + { + "epoch": 0.5488661125010088, + "grad_norm": 0.6503065228462219, + "learning_rate": 0.00014907203592298189, + "loss": 2.6404, + "step": 6801 + }, + { + "epoch": 0.5489468162375918, + "grad_norm": 0.6869633197784424, + "learning_rate": 0.00014905827991720097, + "loss": 2.6463, + "step": 6802 + }, + { + "epoch": 0.5490275199741748, + "grad_norm": 0.7221426963806152, + "learning_rate": 0.00014904452268875947, + "loss": 2.6686, + "step": 6803 + }, + { + "epoch": 0.5491082237107578, + "grad_norm": 0.6781399250030518, + "learning_rate": 0.00014903076423800028, + "loss": 2.6274, + "step": 6804 + }, + { + "epoch": 0.5491889274473408, + "grad_norm": 0.7451084852218628, + "learning_rate": 0.00014901700456526626, + "loss": 2.6449, + "step": 6805 + }, + { + "epoch": 0.5492696311839238, + "grad_norm": 0.7159574627876282, + "learning_rate": 0.0001490032436709004, + "loss": 2.6664, + "step": 6806 + }, + { + "epoch": 0.5493503349205068, + "grad_norm": 0.724039614200592, + "learning_rate": 0.00014898948155524558, + "loss": 2.5816, + "step": 6807 + }, + { + "epoch": 0.5494310386570899, + "grad_norm": 0.7194633483886719, + "learning_rate": 0.0001489757182186448, + "loss": 2.5625, + "step": 6808 + }, + { + "epoch": 0.5495117423936728, + "grad_norm": 0.704133927822113, + "learning_rate": 0.0001489619536614411, + "loss": 2.6295, + "step": 6809 + }, + { + "epoch": 0.5495924461302558, + "grad_norm": 0.6717158555984497, + "learning_rate": 0.00014894818788397757, + "loss": 2.6168, + "step": 6810 + }, + { + "epoch": 0.5496731498668388, + "grad_norm": 0.7096573710441589, + "learning_rate": 0.0001489344208865972, + "loss": 2.6316, + "step": 6811 + }, + { + "epoch": 0.5497538536034219, + "grad_norm": 0.6383458375930786, + "learning_rate": 
0.00014892065266964316, + "loss": 2.5577, + "step": 6812 + }, + { + "epoch": 0.5498345573400049, + "grad_norm": 0.7606377601623535, + "learning_rate": 0.0001489068832334586, + "loss": 2.7078, + "step": 6813 + }, + { + "epoch": 0.5499152610765878, + "grad_norm": 0.649162232875824, + "learning_rate": 0.00014889311257838665, + "loss": 2.6023, + "step": 6814 + }, + { + "epoch": 0.5499959648131708, + "grad_norm": 0.6445025205612183, + "learning_rate": 0.00014887934070477053, + "loss": 2.6, + "step": 6815 + }, + { + "epoch": 0.5500766685497539, + "grad_norm": 0.6873729825019836, + "learning_rate": 0.00014886556761295342, + "loss": 2.6398, + "step": 6816 + }, + { + "epoch": 0.5501573722863369, + "grad_norm": 0.7814947366714478, + "learning_rate": 0.0001488517933032787, + "loss": 2.5803, + "step": 6817 + }, + { + "epoch": 0.5502380760229199, + "grad_norm": 0.7140909433364868, + "learning_rate": 0.00014883801777608953, + "loss": 2.6051, + "step": 6818 + }, + { + "epoch": 0.5503187797595028, + "grad_norm": 0.7326326370239258, + "learning_rate": 0.00014882424103172936, + "loss": 2.6123, + "step": 6819 + }, + { + "epoch": 0.5503994834960859, + "grad_norm": 0.7093667387962341, + "learning_rate": 0.00014881046307054142, + "loss": 2.6527, + "step": 6820 + }, + { + "epoch": 0.5504801872326689, + "grad_norm": 0.6877567768096924, + "learning_rate": 0.00014879668389286915, + "loss": 2.6086, + "step": 6821 + }, + { + "epoch": 0.5505608909692519, + "grad_norm": 0.7095615863800049, + "learning_rate": 0.000148782903499056, + "loss": 2.6469, + "step": 6822 + }, + { + "epoch": 0.5506415947058348, + "grad_norm": 0.6931191086769104, + "learning_rate": 0.00014876912188944535, + "loss": 2.6842, + "step": 6823 + }, + { + "epoch": 0.5507222984424179, + "grad_norm": 0.7016414403915405, + "learning_rate": 0.00014875533906438072, + "loss": 2.5753, + "step": 6824 + }, + { + "epoch": 0.5508030021790009, + "grad_norm": 0.6813814640045166, + "learning_rate": 0.00014874155502420558, + "loss": 2.5739, + 
"step": 6825 + }, + { + "epoch": 0.5508837059155839, + "grad_norm": 0.7068608403205872, + "learning_rate": 0.00014872776976926347, + "loss": 2.6325, + "step": 6826 + }, + { + "epoch": 0.5509644096521669, + "grad_norm": 0.6978127360343933, + "learning_rate": 0.00014871398329989796, + "loss": 2.5614, + "step": 6827 + }, + { + "epoch": 0.55104511338875, + "grad_norm": 0.6923051476478577, + "learning_rate": 0.00014870019561645265, + "loss": 2.6075, + "step": 6828 + }, + { + "epoch": 0.5511258171253329, + "grad_norm": 0.6708533763885498, + "learning_rate": 0.00014868640671927117, + "loss": 2.5883, + "step": 6829 + }, + { + "epoch": 0.5512065208619159, + "grad_norm": 0.7679650783538818, + "learning_rate": 0.00014867261660869713, + "loss": 2.6105, + "step": 6830 + }, + { + "epoch": 0.5512872245984989, + "grad_norm": 0.7080917358398438, + "learning_rate": 0.0001486588252850743, + "loss": 2.5855, + "step": 6831 + }, + { + "epoch": 0.551367928335082, + "grad_norm": 0.7218755483627319, + "learning_rate": 0.00014864503274874635, + "loss": 2.5872, + "step": 6832 + }, + { + "epoch": 0.551448632071665, + "grad_norm": 0.689038872718811, + "learning_rate": 0.000148631239000057, + "loss": 2.5902, + "step": 6833 + }, + { + "epoch": 0.5515293358082479, + "grad_norm": 0.6810954213142395, + "learning_rate": 0.00014861744403935005, + "loss": 2.5938, + "step": 6834 + }, + { + "epoch": 0.5516100395448309, + "grad_norm": 0.7509457468986511, + "learning_rate": 0.00014860364786696933, + "loss": 2.593, + "step": 6835 + }, + { + "epoch": 0.5516907432814139, + "grad_norm": 0.739536702632904, + "learning_rate": 0.00014858985048325863, + "loss": 2.6668, + "step": 6836 + }, + { + "epoch": 0.551771447017997, + "grad_norm": 0.661829948425293, + "learning_rate": 0.00014857605188856184, + "loss": 2.6407, + "step": 6837 + }, + { + "epoch": 0.5518521507545799, + "grad_norm": 0.6869735717773438, + "learning_rate": 0.00014856225208322287, + "loss": 2.535, + "step": 6838 + }, + { + "epoch": 
0.5519328544911629, + "grad_norm": 0.6724792122840881, + "learning_rate": 0.00014854845106758563, + "loss": 2.5629, + "step": 6839 + }, + { + "epoch": 0.5520135582277459, + "grad_norm": 0.7066503763198853, + "learning_rate": 0.00014853464884199407, + "loss": 2.6002, + "step": 6840 + }, + { + "epoch": 0.552094261964329, + "grad_norm": 0.7354215979576111, + "learning_rate": 0.0001485208454067922, + "loss": 2.6032, + "step": 6841 + }, + { + "epoch": 0.552174965700912, + "grad_norm": 0.8124571442604065, + "learning_rate": 0.00014850704076232405, + "loss": 2.5884, + "step": 6842 + }, + { + "epoch": 0.5522556694374949, + "grad_norm": 0.6941336393356323, + "learning_rate": 0.00014849323490893364, + "loss": 2.6461, + "step": 6843 + }, + { + "epoch": 0.5523363731740779, + "grad_norm": 0.6848790049552917, + "learning_rate": 0.00014847942784696505, + "loss": 2.6098, + "step": 6844 + }, + { + "epoch": 0.552417076910661, + "grad_norm": 0.6688000559806824, + "learning_rate": 0.00014846561957676237, + "loss": 2.6115, + "step": 6845 + }, + { + "epoch": 0.552497780647244, + "grad_norm": 0.6647306084632874, + "learning_rate": 0.00014845181009866975, + "loss": 2.597, + "step": 6846 + }, + { + "epoch": 0.552578484383827, + "grad_norm": 0.7277785539627075, + "learning_rate": 0.0001484379994130314, + "loss": 2.6223, + "step": 6847 + }, + { + "epoch": 0.5526591881204099, + "grad_norm": 0.6623761057853699, + "learning_rate": 0.00014842418752019146, + "loss": 2.5657, + "step": 6848 + }, + { + "epoch": 0.552739891856993, + "grad_norm": 0.7207754254341125, + "learning_rate": 0.00014841037442049423, + "loss": 2.5711, + "step": 6849 + }, + { + "epoch": 0.552820595593576, + "grad_norm": 0.6963560581207275, + "learning_rate": 0.00014839656011428389, + "loss": 2.6078, + "step": 6850 + }, + { + "epoch": 0.552901299330159, + "grad_norm": 0.6875078678131104, + "learning_rate": 0.00014838274460190475, + "loss": 2.6109, + "step": 6851 + }, + { + "epoch": 0.552982003066742, + "grad_norm": 
0.7049943804740906, + "learning_rate": 0.00014836892788370118, + "loss": 2.5755, + "step": 6852 + }, + { + "epoch": 0.553062706803325, + "grad_norm": 0.6941191554069519, + "learning_rate": 0.00014835510996001744, + "loss": 2.6694, + "step": 6853 + }, + { + "epoch": 0.553143410539908, + "grad_norm": 0.7589484453201294, + "learning_rate": 0.000148341290831198, + "loss": 2.5677, + "step": 6854 + }, + { + "epoch": 0.553224114276491, + "grad_norm": 0.6594784259796143, + "learning_rate": 0.00014832747049758723, + "loss": 2.6209, + "step": 6855 + }, + { + "epoch": 0.553304818013074, + "grad_norm": 0.726598858833313, + "learning_rate": 0.00014831364895952952, + "loss": 2.6492, + "step": 6856 + }, + { + "epoch": 0.553385521749657, + "grad_norm": 0.6668030023574829, + "learning_rate": 0.0001482998262173694, + "loss": 2.6057, + "step": 6857 + }, + { + "epoch": 0.55346622548624, + "grad_norm": 0.7698997855186462, + "learning_rate": 0.0001482860022714514, + "loss": 2.6215, + "step": 6858 + }, + { + "epoch": 0.553546929222823, + "grad_norm": 0.6805251836776733, + "learning_rate": 0.00014827217712211997, + "loss": 2.5855, + "step": 6859 + }, + { + "epoch": 0.553627632959406, + "grad_norm": 0.8481020331382751, + "learning_rate": 0.00014825835076971968, + "loss": 2.6218, + "step": 6860 + }, + { + "epoch": 0.5537083366959891, + "grad_norm": 0.6801722645759583, + "learning_rate": 0.00014824452321459517, + "loss": 2.5998, + "step": 6861 + }, + { + "epoch": 0.553789040432572, + "grad_norm": 0.7174597978591919, + "learning_rate": 0.00014823069445709104, + "loss": 2.5782, + "step": 6862 + }, + { + "epoch": 0.553869744169155, + "grad_norm": 0.7607117891311646, + "learning_rate": 0.0001482168644975519, + "loss": 2.6492, + "step": 6863 + }, + { + "epoch": 0.553950447905738, + "grad_norm": 0.7554265856742859, + "learning_rate": 0.00014820303333632246, + "loss": 2.6511, + "step": 6864 + }, + { + "epoch": 0.5540311516423211, + "grad_norm": 0.7520260214805603, + "learning_rate": 
0.00014818920097374745, + "loss": 2.6258, + "step": 6865 + }, + { + "epoch": 0.5541118553789041, + "grad_norm": 0.7897995114326477, + "learning_rate": 0.00014817536741017152, + "loss": 2.6153, + "step": 6866 + }, + { + "epoch": 0.554192559115487, + "grad_norm": 0.7444615960121155, + "learning_rate": 0.00014816153264593957, + "loss": 2.5892, + "step": 6867 + }, + { + "epoch": 0.55427326285207, + "grad_norm": 0.6593222618103027, + "learning_rate": 0.0001481476966813963, + "loss": 2.6048, + "step": 6868 + }, + { + "epoch": 0.5543539665886531, + "grad_norm": 0.7517102360725403, + "learning_rate": 0.0001481338595168866, + "loss": 2.6496, + "step": 6869 + }, + { + "epoch": 0.5544346703252361, + "grad_norm": 0.7314056754112244, + "learning_rate": 0.00014812002115275529, + "loss": 2.6009, + "step": 6870 + }, + { + "epoch": 0.554515374061819, + "grad_norm": 0.6718037724494934, + "learning_rate": 0.00014810618158934722, + "loss": 2.6279, + "step": 6871 + }, + { + "epoch": 0.554596077798402, + "grad_norm": 0.6853529810905457, + "learning_rate": 0.00014809234082700735, + "loss": 2.6562, + "step": 6872 + }, + { + "epoch": 0.5546767815349851, + "grad_norm": 0.713599443435669, + "learning_rate": 0.0001480784988660807, + "loss": 2.5783, + "step": 6873 + }, + { + "epoch": 0.5547574852715681, + "grad_norm": 0.6820243000984192, + "learning_rate": 0.00014806465570691213, + "loss": 2.5753, + "step": 6874 + }, + { + "epoch": 0.5548381890081511, + "grad_norm": 0.6999152302742004, + "learning_rate": 0.00014805081134984673, + "loss": 2.5839, + "step": 6875 + }, + { + "epoch": 0.554918892744734, + "grad_norm": 0.7145923376083374, + "learning_rate": 0.00014803696579522948, + "loss": 2.6153, + "step": 6876 + }, + { + "epoch": 0.5549995964813171, + "grad_norm": 0.7569223046302795, + "learning_rate": 0.00014802311904340548, + "loss": 2.5879, + "step": 6877 + }, + { + "epoch": 0.5550803002179001, + "grad_norm": 0.6977131962776184, + "learning_rate": 0.00014800927109471983, + "loss": 2.6587, + 
"step": 6878 + }, + { + "epoch": 0.5551610039544831, + "grad_norm": 0.6693562865257263, + "learning_rate": 0.00014799542194951764, + "loss": 2.6271, + "step": 6879 + }, + { + "epoch": 0.5552417076910661, + "grad_norm": 0.6937456130981445, + "learning_rate": 0.00014798157160814406, + "loss": 2.6213, + "step": 6880 + }, + { + "epoch": 0.5553224114276492, + "grad_norm": 0.761538565158844, + "learning_rate": 0.0001479677200709443, + "loss": 2.6053, + "step": 6881 + }, + { + "epoch": 0.5554031151642321, + "grad_norm": 0.707457959651947, + "learning_rate": 0.00014795386733826356, + "loss": 2.5763, + "step": 6882 + }, + { + "epoch": 0.5554838189008151, + "grad_norm": 0.7323198318481445, + "learning_rate": 0.0001479400134104471, + "loss": 2.6899, + "step": 6883 + }, + { + "epoch": 0.5555645226373981, + "grad_norm": 0.7181541323661804, + "learning_rate": 0.0001479261582878402, + "loss": 2.5743, + "step": 6884 + }, + { + "epoch": 0.5556452263739811, + "grad_norm": 0.7683241367340088, + "learning_rate": 0.00014791230197078813, + "loss": 2.5295, + "step": 6885 + }, + { + "epoch": 0.5557259301105641, + "grad_norm": 0.7248150706291199, + "learning_rate": 0.00014789844445963626, + "loss": 2.6131, + "step": 6886 + }, + { + "epoch": 0.5558066338471471, + "grad_norm": 0.6868402361869812, + "learning_rate": 0.00014788458575472997, + "loss": 2.6182, + "step": 6887 + }, + { + "epoch": 0.5558873375837301, + "grad_norm": 0.6995798945426941, + "learning_rate": 0.0001478707258564146, + "loss": 2.5969, + "step": 6888 + }, + { + "epoch": 0.5559680413203131, + "grad_norm": 0.6912558078765869, + "learning_rate": 0.00014785686476503565, + "loss": 2.6264, + "step": 6889 + }, + { + "epoch": 0.5560487450568962, + "grad_norm": 0.7485123872756958, + "learning_rate": 0.00014784300248093848, + "loss": 2.6036, + "step": 6890 + }, + { + "epoch": 0.5561294487934791, + "grad_norm": 0.7150819897651672, + "learning_rate": 0.00014782913900446864, + "loss": 2.5807, + "step": 6891 + }, + { + "epoch": 
0.5562101525300621, + "grad_norm": 0.6715224385261536, + "learning_rate": 0.00014781527433597167, + "loss": 2.6164, + "step": 6892 + }, + { + "epoch": 0.5562908562666451, + "grad_norm": 0.6951256394386292, + "learning_rate": 0.000147801408475793, + "loss": 2.6106, + "step": 6893 + }, + { + "epoch": 0.5563715600032282, + "grad_norm": 0.7296997904777527, + "learning_rate": 0.00014778754142427832, + "loss": 2.6182, + "step": 6894 + }, + { + "epoch": 0.5564522637398112, + "grad_norm": 0.7484713196754456, + "learning_rate": 0.0001477736731817732, + "loss": 2.6384, + "step": 6895 + }, + { + "epoch": 0.5565329674763941, + "grad_norm": 0.6967526078224182, + "learning_rate": 0.00014775980374862326, + "loss": 2.5889, + "step": 6896 + }, + { + "epoch": 0.5566136712129771, + "grad_norm": 0.7004885077476501, + "learning_rate": 0.00014774593312517415, + "loss": 2.6549, + "step": 6897 + }, + { + "epoch": 0.5566943749495602, + "grad_norm": 0.7069302201271057, + "learning_rate": 0.00014773206131177158, + "loss": 2.6408, + "step": 6898 + }, + { + "epoch": 0.5567750786861432, + "grad_norm": 0.7048566341400146, + "learning_rate": 0.00014771818830876127, + "loss": 2.5909, + "step": 6899 + }, + { + "epoch": 0.5568557824227262, + "grad_norm": 0.7386630773544312, + "learning_rate": 0.00014770431411648897, + "loss": 2.6402, + "step": 6900 + }, + { + "epoch": 0.5569364861593091, + "grad_norm": 0.7244876027107239, + "learning_rate": 0.00014769043873530047, + "loss": 2.5548, + "step": 6901 + }, + { + "epoch": 0.5570171898958922, + "grad_norm": 0.6820651888847351, + "learning_rate": 0.00014767656216554156, + "loss": 2.682, + "step": 6902 + }, + { + "epoch": 0.5570978936324752, + "grad_norm": 0.7281784415245056, + "learning_rate": 0.00014766268440755812, + "loss": 2.622, + "step": 6903 + }, + { + "epoch": 0.5571785973690582, + "grad_norm": 0.6525030136108398, + "learning_rate": 0.00014764880546169594, + "loss": 2.5809, + "step": 6904 + }, + { + "epoch": 0.5572593011056411, + "grad_norm": 
0.6735210418701172, + "learning_rate": 0.00014763492532830102, + "loss": 2.6645, + "step": 6905 + }, + { + "epoch": 0.5573400048422242, + "grad_norm": 0.674700140953064, + "learning_rate": 0.00014762104400771922, + "loss": 2.6466, + "step": 6906 + }, + { + "epoch": 0.5574207085788072, + "grad_norm": 0.7570134401321411, + "learning_rate": 0.00014760716150029652, + "loss": 2.57, + "step": 6907 + }, + { + "epoch": 0.5575014123153902, + "grad_norm": 0.6532449722290039, + "learning_rate": 0.00014759327780637893, + "loss": 2.6207, + "step": 6908 + }, + { + "epoch": 0.5575821160519732, + "grad_norm": 0.7697737812995911, + "learning_rate": 0.00014757939292631242, + "loss": 2.5846, + "step": 6909 + }, + { + "epoch": 0.5576628197885563, + "grad_norm": 0.6750194430351257, + "learning_rate": 0.00014756550686044308, + "loss": 2.6421, + "step": 6910 + }, + { + "epoch": 0.5577435235251392, + "grad_norm": 0.7357683777809143, + "learning_rate": 0.00014755161960911697, + "loss": 2.6173, + "step": 6911 + }, + { + "epoch": 0.5578242272617222, + "grad_norm": 0.6812090277671814, + "learning_rate": 0.0001475377311726802, + "loss": 2.5556, + "step": 6912 + }, + { + "epoch": 0.5579049309983052, + "grad_norm": 0.7633040547370911, + "learning_rate": 0.00014752384155147888, + "loss": 2.6505, + "step": 6913 + }, + { + "epoch": 0.5579856347348883, + "grad_norm": 0.7426417469978333, + "learning_rate": 0.00014750995074585922, + "loss": 2.5575, + "step": 6914 + }, + { + "epoch": 0.5580663384714712, + "grad_norm": 0.6926711201667786, + "learning_rate": 0.00014749605875616744, + "loss": 2.5751, + "step": 6915 + }, + { + "epoch": 0.5581470422080542, + "grad_norm": 0.70630943775177, + "learning_rate": 0.00014748216558274966, + "loss": 2.6228, + "step": 6916 + }, + { + "epoch": 0.5582277459446372, + "grad_norm": 0.7183346748352051, + "learning_rate": 0.0001474682712259522, + "loss": 2.5704, + "step": 6917 + }, + { + "epoch": 0.5583084496812203, + "grad_norm": 0.7622792720794678, + "learning_rate": 
0.00014745437568612136, + "loss": 2.6031, + "step": 6918 + }, + { + "epoch": 0.5583891534178033, + "grad_norm": 0.6967802047729492, + "learning_rate": 0.00014744047896360344, + "loss": 2.6031, + "step": 6919 + }, + { + "epoch": 0.5584698571543862, + "grad_norm": 0.7827191948890686, + "learning_rate": 0.00014742658105874475, + "loss": 2.5427, + "step": 6920 + }, + { + "epoch": 0.5585505608909692, + "grad_norm": 0.6865705847740173, + "learning_rate": 0.0001474126819718917, + "loss": 2.6514, + "step": 6921 + }, + { + "epoch": 0.5586312646275523, + "grad_norm": 0.7181665897369385, + "learning_rate": 0.0001473987817033906, + "loss": 2.613, + "step": 6922 + }, + { + "epoch": 0.5587119683641353, + "grad_norm": 0.7198463082313538, + "learning_rate": 0.00014738488025358806, + "loss": 2.6423, + "step": 6923 + }, + { + "epoch": 0.5587926721007183, + "grad_norm": 0.773078441619873, + "learning_rate": 0.00014737097762283042, + "loss": 2.5946, + "step": 6924 + }, + { + "epoch": 0.5588733758373012, + "grad_norm": 0.7732799649238586, + "learning_rate": 0.00014735707381146416, + "loss": 2.6778, + "step": 6925 + }, + { + "epoch": 0.5589540795738843, + "grad_norm": 0.7639997601509094, + "learning_rate": 0.00014734316881983585, + "loss": 2.6064, + "step": 6926 + }, + { + "epoch": 0.5590347833104673, + "grad_norm": 0.7912085652351379, + "learning_rate": 0.00014732926264829198, + "loss": 2.5765, + "step": 6927 + }, + { + "epoch": 0.5591154870470503, + "grad_norm": 0.7460121512413025, + "learning_rate": 0.0001473153552971792, + "loss": 2.6724, + "step": 6928 + }, + { + "epoch": 0.5591961907836333, + "grad_norm": 0.6853603720664978, + "learning_rate": 0.00014730144676684408, + "loss": 2.5846, + "step": 6929 + }, + { + "epoch": 0.5592768945202163, + "grad_norm": 0.7368159294128418, + "learning_rate": 0.00014728753705763324, + "loss": 2.6626, + "step": 6930 + }, + { + "epoch": 0.5593575982567993, + "grad_norm": 0.6888907551765442, + "learning_rate": 0.0001472736261698934, + "loss": 2.6169, 
+ "step": 6931 + }, + { + "epoch": 0.5594383019933823, + "grad_norm": 0.6978163719177246, + "learning_rate": 0.0001472597141039712, + "loss": 2.6367, + "step": 6932 + }, + { + "epoch": 0.5595190057299653, + "grad_norm": 0.7829774618148804, + "learning_rate": 0.00014724580086021335, + "loss": 2.5983, + "step": 6933 + }, + { + "epoch": 0.5595997094665484, + "grad_norm": 0.7872018218040466, + "learning_rate": 0.0001472318864389667, + "loss": 2.5418, + "step": 6934 + }, + { + "epoch": 0.5596804132031313, + "grad_norm": 0.6994973421096802, + "learning_rate": 0.00014721797084057793, + "loss": 2.6062, + "step": 6935 + }, + { + "epoch": 0.5597611169397143, + "grad_norm": 0.7281144857406616, + "learning_rate": 0.00014720405406539394, + "loss": 2.573, + "step": 6936 + }, + { + "epoch": 0.5598418206762973, + "grad_norm": 0.713513970375061, + "learning_rate": 0.0001471901361137615, + "loss": 2.6589, + "step": 6937 + }, + { + "epoch": 0.5599225244128803, + "grad_norm": 0.7752750515937805, + "learning_rate": 0.00014717621698602754, + "loss": 2.6478, + "step": 6938 + }, + { + "epoch": 0.5600032281494634, + "grad_norm": 0.6876000165939331, + "learning_rate": 0.00014716229668253889, + "loss": 2.6092, + "step": 6939 + }, + { + "epoch": 0.5600839318860463, + "grad_norm": 0.6371028423309326, + "learning_rate": 0.00014714837520364256, + "loss": 2.606, + "step": 6940 + }, + { + "epoch": 0.5601646356226293, + "grad_norm": 0.6488915085792542, + "learning_rate": 0.00014713445254968546, + "loss": 2.5769, + "step": 6941 + }, + { + "epoch": 0.5602453393592123, + "grad_norm": 0.7286413908004761, + "learning_rate": 0.00014712052872101458, + "loss": 2.6267, + "step": 6942 + }, + { + "epoch": 0.5603260430957954, + "grad_norm": 0.6863759160041809, + "learning_rate": 0.00014710660371797696, + "loss": 2.641, + "step": 6943 + }, + { + "epoch": 0.5604067468323783, + "grad_norm": 0.706900417804718, + "learning_rate": 0.00014709267754091964, + "loss": 2.6344, + "step": 6944 + }, + { + "epoch": 
0.5604874505689613, + "grad_norm": 0.6462892293930054, + "learning_rate": 0.0001470787501901897, + "loss": 2.5561, + "step": 6945 + }, + { + "epoch": 0.5605681543055443, + "grad_norm": 0.7342472076416016, + "learning_rate": 0.00014706482166613425, + "loss": 2.583, + "step": 6946 + }, + { + "epoch": 0.5606488580421274, + "grad_norm": 0.7132803797721863, + "learning_rate": 0.00014705089196910038, + "loss": 2.558, + "step": 6947 + }, + { + "epoch": 0.5607295617787104, + "grad_norm": 0.7709125876426697, + "learning_rate": 0.00014703696109943533, + "loss": 2.6165, + "step": 6948 + }, + { + "epoch": 0.5608102655152933, + "grad_norm": 0.7108885645866394, + "learning_rate": 0.00014702302905748619, + "loss": 2.5788, + "step": 6949 + }, + { + "epoch": 0.5608909692518763, + "grad_norm": 0.7295591235160828, + "learning_rate": 0.0001470090958436003, + "loss": 2.6526, + "step": 6950 + }, + { + "epoch": 0.5609716729884594, + "grad_norm": 0.7235364317893982, + "learning_rate": 0.00014699516145812486, + "loss": 2.604, + "step": 6951 + }, + { + "epoch": 0.5610523767250424, + "grad_norm": 0.6723269820213318, + "learning_rate": 0.00014698122590140714, + "loss": 2.5838, + "step": 6952 + }, + { + "epoch": 0.5611330804616254, + "grad_norm": 0.7022266983985901, + "learning_rate": 0.00014696728917379447, + "loss": 2.6086, + "step": 6953 + }, + { + "epoch": 0.5612137841982083, + "grad_norm": 0.6923824548721313, + "learning_rate": 0.00014695335127563414, + "loss": 2.6678, + "step": 6954 + }, + { + "epoch": 0.5612944879347914, + "grad_norm": 0.6909339427947998, + "learning_rate": 0.0001469394122072736, + "loss": 2.6397, + "step": 6955 + }, + { + "epoch": 0.5613751916713744, + "grad_norm": 0.710299015045166, + "learning_rate": 0.00014692547196906022, + "loss": 2.5973, + "step": 6956 + }, + { + "epoch": 0.5614558954079574, + "grad_norm": 0.7141178250312805, + "learning_rate": 0.00014691153056134136, + "loss": 2.6111, + "step": 6957 + }, + { + "epoch": 0.5615365991445403, + "grad_norm": 
0.6994750499725342, + "learning_rate": 0.00014689758798446456, + "loss": 2.6498, + "step": 6958 + }, + { + "epoch": 0.5616173028811234, + "grad_norm": 0.6951611638069153, + "learning_rate": 0.00014688364423877726, + "loss": 2.6208, + "step": 6959 + }, + { + "epoch": 0.5616980066177064, + "grad_norm": 0.6610642075538635, + "learning_rate": 0.000146869699324627, + "loss": 2.5725, + "step": 6960 + }, + { + "epoch": 0.5617787103542894, + "grad_norm": 0.6771267056465149, + "learning_rate": 0.00014685575324236135, + "loss": 2.6336, + "step": 6961 + }, + { + "epoch": 0.5618594140908724, + "grad_norm": 0.7431008815765381, + "learning_rate": 0.0001468418059923278, + "loss": 2.6782, + "step": 6962 + }, + { + "epoch": 0.5619401178274555, + "grad_norm": 0.7399705648422241, + "learning_rate": 0.000146827857574874, + "loss": 2.6212, + "step": 6963 + }, + { + "epoch": 0.5620208215640384, + "grad_norm": 0.7237067222595215, + "learning_rate": 0.00014681390799034763, + "loss": 2.6261, + "step": 6964 + }, + { + "epoch": 0.5621015253006214, + "grad_norm": 0.7033257484436035, + "learning_rate": 0.00014679995723909623, + "loss": 2.6912, + "step": 6965 + }, + { + "epoch": 0.5621822290372044, + "grad_norm": 0.6953759789466858, + "learning_rate": 0.00014678600532146762, + "loss": 2.6022, + "step": 6966 + }, + { + "epoch": 0.5622629327737875, + "grad_norm": 0.8338057994842529, + "learning_rate": 0.0001467720522378094, + "loss": 2.595, + "step": 6967 + }, + { + "epoch": 0.5623436365103704, + "grad_norm": 0.6506100296974182, + "learning_rate": 0.00014675809798846942, + "loss": 2.6033, + "step": 6968 + }, + { + "epoch": 0.5624243402469534, + "grad_norm": 0.7122468948364258, + "learning_rate": 0.0001467441425737954, + "loss": 2.56, + "step": 6969 + }, + { + "epoch": 0.5625050439835364, + "grad_norm": 0.7012680172920227, + "learning_rate": 0.00014673018599413516, + "loss": 2.6052, + "step": 6970 + }, + { + "epoch": 0.5625857477201195, + "grad_norm": 0.668187141418457, + "learning_rate": 
0.00014671622824983653, + "loss": 2.6675, + "step": 6971 + }, + { + "epoch": 0.5626664514567025, + "grad_norm": 0.7259203791618347, + "learning_rate": 0.00014670226934124738, + "loss": 2.5977, + "step": 6972 + }, + { + "epoch": 0.5627471551932854, + "grad_norm": 0.6705875396728516, + "learning_rate": 0.00014668830926871555, + "loss": 2.649, + "step": 6973 + }, + { + "epoch": 0.5628278589298684, + "grad_norm": 0.682731568813324, + "learning_rate": 0.00014667434803258906, + "loss": 2.6084, + "step": 6974 + }, + { + "epoch": 0.5629085626664515, + "grad_norm": 0.7061700224876404, + "learning_rate": 0.00014666038563321577, + "loss": 2.6256, + "step": 6975 + }, + { + "epoch": 0.5629892664030345, + "grad_norm": 0.6839977502822876, + "learning_rate": 0.00014664642207094374, + "loss": 2.6342, + "step": 6976 + }, + { + "epoch": 0.5630699701396175, + "grad_norm": 0.7376503348350525, + "learning_rate": 0.00014663245734612094, + "loss": 2.6001, + "step": 6977 + }, + { + "epoch": 0.5631506738762004, + "grad_norm": 0.6901546716690063, + "learning_rate": 0.0001466184914590954, + "loss": 2.6715, + "step": 6978 + }, + { + "epoch": 0.5632313776127835, + "grad_norm": 0.816223680973053, + "learning_rate": 0.00014660452441021512, + "loss": 2.6407, + "step": 6979 + }, + { + "epoch": 0.5633120813493665, + "grad_norm": 0.6904644966125488, + "learning_rate": 0.00014659055619982835, + "loss": 2.5543, + "step": 6980 + }, + { + "epoch": 0.5633927850859495, + "grad_norm": 0.6784235239028931, + "learning_rate": 0.0001465765868282831, + "loss": 2.6184, + "step": 6981 + }, + { + "epoch": 0.5634734888225325, + "grad_norm": 0.7689006328582764, + "learning_rate": 0.00014656261629592755, + "loss": 2.644, + "step": 6982 + }, + { + "epoch": 0.5635541925591155, + "grad_norm": 0.7608775496482849, + "learning_rate": 0.0001465486446031099, + "loss": 2.5952, + "step": 6983 + }, + { + "epoch": 0.5636348962956985, + "grad_norm": 0.7266525626182556, + "learning_rate": 0.00014653467175017833, + "loss": 2.6479, + 
"step": 6984 + }, + { + "epoch": 0.5637156000322815, + "grad_norm": 0.6907477974891663, + "learning_rate": 0.00014652069773748113, + "loss": 2.5825, + "step": 6985 + }, + { + "epoch": 0.5637963037688645, + "grad_norm": 0.7790403366088867, + "learning_rate": 0.00014650672256536648, + "loss": 2.5948, + "step": 6986 + }, + { + "epoch": 0.5638770075054474, + "grad_norm": 0.7072858214378357, + "learning_rate": 0.00014649274623418278, + "loss": 2.6017, + "step": 6987 + }, + { + "epoch": 0.5639577112420305, + "grad_norm": 0.7140414118766785, + "learning_rate": 0.0001464787687442783, + "loss": 2.5709, + "step": 6988 + }, + { + "epoch": 0.5640384149786135, + "grad_norm": 0.857783317565918, + "learning_rate": 0.00014646479009600139, + "loss": 2.7049, + "step": 6989 + }, + { + "epoch": 0.5641191187151965, + "grad_norm": 0.7599344253540039, + "learning_rate": 0.00014645081028970047, + "loss": 2.6369, + "step": 6990 + }, + { + "epoch": 0.5641998224517795, + "grad_norm": 0.7286150455474854, + "learning_rate": 0.00014643682932572393, + "loss": 2.6238, + "step": 6991 + }, + { + "epoch": 0.5642805261883626, + "grad_norm": 0.7095075249671936, + "learning_rate": 0.0001464228472044202, + "loss": 2.5924, + "step": 6992 + }, + { + "epoch": 0.5643612299249455, + "grad_norm": 0.7583668828010559, + "learning_rate": 0.0001464088639261378, + "loss": 2.6098, + "step": 6993 + }, + { + "epoch": 0.5644419336615285, + "grad_norm": 0.7393970489501953, + "learning_rate": 0.00014639487949122515, + "loss": 2.6036, + "step": 6994 + }, + { + "epoch": 0.5645226373981115, + "grad_norm": 0.6789388656616211, + "learning_rate": 0.00014638089390003086, + "loss": 2.642, + "step": 6995 + }, + { + "epoch": 0.5646033411346946, + "grad_norm": 0.8021289706230164, + "learning_rate": 0.00014636690715290346, + "loss": 2.6851, + "step": 6996 + }, + { + "epoch": 0.5646840448712775, + "grad_norm": 0.6931039094924927, + "learning_rate": 0.00014635291925019152, + "loss": 2.6358, + "step": 6997 + }, + { + "epoch": 
0.5647647486078605, + "grad_norm": 0.7356590032577515, + "learning_rate": 0.00014633893019224366, + "loss": 2.5661, + "step": 6998 + }, + { + "epoch": 0.5648454523444435, + "grad_norm": 0.6777941584587097, + "learning_rate": 0.0001463249399794085, + "loss": 2.5578, + "step": 6999 + }, + { + "epoch": 0.5649261560810266, + "grad_norm": 0.7163615822792053, + "learning_rate": 0.0001463109486120348, + "loss": 2.5582, + "step": 7000 + }, + { + "epoch": 0.5649261560810266, + "eval_loss": 2.5298855304718018, + "eval_runtime": 757.774, + "eval_samples_per_second": 3.457, + "eval_steps_per_second": 0.577, + "step": 7000 + }, + { + "epoch": 0.5650068598176096, + "grad_norm": 0.7175148129463196, + "learning_rate": 0.0001462969560904712, + "loss": 2.568, + "step": 7001 + }, + { + "epoch": 0.5650875635541925, + "grad_norm": 0.6998937129974365, + "learning_rate": 0.00014628296241506636, + "loss": 2.6347, + "step": 7002 + }, + { + "epoch": 0.5651682672907755, + "grad_norm": 0.8140312433242798, + "learning_rate": 0.00014626896758616916, + "loss": 2.6566, + "step": 7003 + }, + { + "epoch": 0.5652489710273586, + "grad_norm": 0.7218164205551147, + "learning_rate": 0.00014625497160412833, + "loss": 2.5693, + "step": 7004 + }, + { + "epoch": 0.5653296747639416, + "grad_norm": 0.6974074244499207, + "learning_rate": 0.0001462409744692927, + "loss": 2.6084, + "step": 7005 + }, + { + "epoch": 0.5654103785005246, + "grad_norm": 0.7475053071975708, + "learning_rate": 0.00014622697618201113, + "loss": 2.6534, + "step": 7006 + }, + { + "epoch": 0.5654910822371075, + "grad_norm": 0.6768492460250854, + "learning_rate": 0.00014621297674263247, + "loss": 2.585, + "step": 7007 + }, + { + "epoch": 0.5655717859736906, + "grad_norm": 0.7023029923439026, + "learning_rate": 0.0001461989761515056, + "loss": 2.6219, + "step": 7008 + }, + { + "epoch": 0.5656524897102736, + "grad_norm": 0.7248445749282837, + "learning_rate": 0.0001461849744089795, + "loss": 2.6382, + "step": 7009 + }, + { + "epoch": 
0.5657331934468566, + "grad_norm": 0.6961148381233215, + "learning_rate": 0.00014617097151540308, + "loss": 2.7184, + "step": 7010 + }, + { + "epoch": 0.5658138971834396, + "grad_norm": 0.6649057269096375, + "learning_rate": 0.0001461569674711254, + "loss": 2.6059, + "step": 7011 + }, + { + "epoch": 0.5658946009200226, + "grad_norm": 0.7451788783073425, + "learning_rate": 0.00014614296227649542, + "loss": 2.5697, + "step": 7012 + }, + { + "epoch": 0.5659753046566056, + "grad_norm": 0.6880216598510742, + "learning_rate": 0.0001461289559318622, + "loss": 2.5785, + "step": 7013 + }, + { + "epoch": 0.5660560083931886, + "grad_norm": 0.7505971789360046, + "learning_rate": 0.00014611494843757482, + "loss": 2.5479, + "step": 7014 + }, + { + "epoch": 0.5661367121297716, + "grad_norm": 0.745914876461029, + "learning_rate": 0.00014610093979398235, + "loss": 2.6367, + "step": 7015 + }, + { + "epoch": 0.5662174158663547, + "grad_norm": 0.6758660674095154, + "learning_rate": 0.000146086930001434, + "loss": 2.5673, + "step": 7016 + }, + { + "epoch": 0.5662981196029376, + "grad_norm": 0.7114273309707642, + "learning_rate": 0.00014607291906027886, + "loss": 2.6188, + "step": 7017 + }, + { + "epoch": 0.5663788233395206, + "grad_norm": 0.6791165471076965, + "learning_rate": 0.00014605890697086613, + "loss": 2.6197, + "step": 7018 + }, + { + "epoch": 0.5664595270761036, + "grad_norm": 0.6948217153549194, + "learning_rate": 0.00014604489373354503, + "loss": 2.5996, + "step": 7019 + }, + { + "epoch": 0.5665402308126867, + "grad_norm": 0.6993576884269714, + "learning_rate": 0.00014603087934866483, + "loss": 2.565, + "step": 7020 + }, + { + "epoch": 0.5666209345492697, + "grad_norm": 0.6936905384063721, + "learning_rate": 0.0001460168638165748, + "loss": 2.6524, + "step": 7021 + }, + { + "epoch": 0.5667016382858526, + "grad_norm": 0.6810741424560547, + "learning_rate": 0.00014600284713762424, + "loss": 2.6519, + "step": 7022 + }, + { + "epoch": 0.5667823420224356, + "grad_norm": 
0.7540227770805359, + "learning_rate": 0.00014598882931216245, + "loss": 2.659, + "step": 7023 + }, + { + "epoch": 0.5668630457590187, + "grad_norm": 0.6520613431930542, + "learning_rate": 0.0001459748103405388, + "loss": 2.5341, + "step": 7024 + }, + { + "epoch": 0.5669437494956017, + "grad_norm": 0.7159109711647034, + "learning_rate": 0.00014596079022310277, + "loss": 2.6548, + "step": 7025 + }, + { + "epoch": 0.5670244532321846, + "grad_norm": 0.803284227848053, + "learning_rate": 0.00014594676896020366, + "loss": 2.705, + "step": 7026 + }, + { + "epoch": 0.5671051569687676, + "grad_norm": 0.7069976925849915, + "learning_rate": 0.00014593274655219095, + "loss": 2.5733, + "step": 7027 + }, + { + "epoch": 0.5671858607053507, + "grad_norm": 0.7085167169570923, + "learning_rate": 0.00014591872299941417, + "loss": 2.6247, + "step": 7028 + }, + { + "epoch": 0.5672665644419337, + "grad_norm": 0.6748499274253845, + "learning_rate": 0.00014590469830222272, + "loss": 2.6446, + "step": 7029 + }, + { + "epoch": 0.5673472681785167, + "grad_norm": 0.6885821223258972, + "learning_rate": 0.00014589067246096623, + "loss": 2.5879, + "step": 7030 + }, + { + "epoch": 0.5674279719150996, + "grad_norm": 0.7220324277877808, + "learning_rate": 0.0001458766454759942, + "loss": 2.6249, + "step": 7031 + }, + { + "epoch": 0.5675086756516827, + "grad_norm": 0.6712783575057983, + "learning_rate": 0.00014586261734765628, + "loss": 2.5971, + "step": 7032 + }, + { + "epoch": 0.5675893793882657, + "grad_norm": 0.6582161784172058, + "learning_rate": 0.00014584858807630203, + "loss": 2.6224, + "step": 7033 + }, + { + "epoch": 0.5676700831248487, + "grad_norm": 0.6699219346046448, + "learning_rate": 0.0001458345576622811, + "loss": 2.5926, + "step": 7034 + }, + { + "epoch": 0.5677507868614317, + "grad_norm": 0.6508033871650696, + "learning_rate": 0.0001458205261059432, + "loss": 2.6311, + "step": 7035 + }, + { + "epoch": 0.5678314905980147, + "grad_norm": 0.7551338076591492, + "learning_rate": 
0.00014580649340763802, + "loss": 2.5729, + "step": 7036 + }, + { + "epoch": 0.5679121943345977, + "grad_norm": 0.6875829100608826, + "learning_rate": 0.00014579245956771527, + "loss": 2.6253, + "step": 7037 + }, + { + "epoch": 0.5679928980711807, + "grad_norm": 0.698204517364502, + "learning_rate": 0.00014577842458652474, + "loss": 2.6218, + "step": 7038 + }, + { + "epoch": 0.5680736018077637, + "grad_norm": 0.8258630037307739, + "learning_rate": 0.00014576438846441615, + "loss": 2.6307, + "step": 7039 + }, + { + "epoch": 0.5681543055443466, + "grad_norm": 0.753105878829956, + "learning_rate": 0.00014575035120173942, + "loss": 2.5664, + "step": 7040 + }, + { + "epoch": 0.5682350092809297, + "grad_norm": 0.6999726295471191, + "learning_rate": 0.00014573631279884435, + "loss": 2.6857, + "step": 7041 + }, + { + "epoch": 0.5683157130175127, + "grad_norm": 0.6484847068786621, + "learning_rate": 0.00014572227325608078, + "loss": 2.6068, + "step": 7042 + }, + { + "epoch": 0.5683964167540957, + "grad_norm": 0.7098011374473572, + "learning_rate": 0.00014570823257379866, + "loss": 2.6591, + "step": 7043 + }, + { + "epoch": 0.5684771204906787, + "grad_norm": 0.8304192423820496, + "learning_rate": 0.0001456941907523479, + "loss": 2.6582, + "step": 7044 + }, + { + "epoch": 0.5685578242272618, + "grad_norm": 0.763214111328125, + "learning_rate": 0.00014568014779207844, + "loss": 2.6605, + "step": 7045 + }, + { + "epoch": 0.5686385279638447, + "grad_norm": 0.6805880665779114, + "learning_rate": 0.00014566610369334032, + "loss": 2.6362, + "step": 7046 + }, + { + "epoch": 0.5687192317004277, + "grad_norm": 0.6753434538841248, + "learning_rate": 0.00014565205845648352, + "loss": 2.6352, + "step": 7047 + }, + { + "epoch": 0.5687999354370107, + "grad_norm": 0.7065438032150269, + "learning_rate": 0.00014563801208185807, + "loss": 2.5975, + "step": 7048 + }, + { + "epoch": 0.5688806391735938, + "grad_norm": 0.6863527894020081, + "learning_rate": 0.00014562396456981407, + "loss": 2.576, 
+ "step": 7049 + }, + { + "epoch": 0.5689613429101767, + "grad_norm": 0.7344440817832947, + "learning_rate": 0.00014560991592070158, + "loss": 2.5933, + "step": 7050 + }, + { + "epoch": 0.5690420466467597, + "grad_norm": 0.699992835521698, + "learning_rate": 0.00014559586613487082, + "loss": 2.6161, + "step": 7051 + }, + { + "epoch": 0.5691227503833427, + "grad_norm": 0.7287258505821228, + "learning_rate": 0.00014558181521267185, + "loss": 2.665, + "step": 7052 + }, + { + "epoch": 0.5692034541199258, + "grad_norm": 0.7304692268371582, + "learning_rate": 0.0001455677631544549, + "loss": 2.5696, + "step": 7053 + }, + { + "epoch": 0.5692841578565088, + "grad_norm": 0.6556086540222168, + "learning_rate": 0.00014555370996057016, + "loss": 2.6405, + "step": 7054 + }, + { + "epoch": 0.5693648615930917, + "grad_norm": 0.6796221137046814, + "learning_rate": 0.0001455396556313679, + "loss": 2.6475, + "step": 7055 + }, + { + "epoch": 0.5694455653296747, + "grad_norm": 0.7067505717277527, + "learning_rate": 0.00014552560016719838, + "loss": 2.6344, + "step": 7056 + }, + { + "epoch": 0.5695262690662578, + "grad_norm": 0.7108997106552124, + "learning_rate": 0.00014551154356841193, + "loss": 2.6543, + "step": 7057 + }, + { + "epoch": 0.5696069728028408, + "grad_norm": 0.7296212911605835, + "learning_rate": 0.0001454974858353588, + "loss": 2.6152, + "step": 7058 + }, + { + "epoch": 0.5696876765394238, + "grad_norm": 0.7329154014587402, + "learning_rate": 0.00014548342696838943, + "loss": 2.6338, + "step": 7059 + }, + { + "epoch": 0.5697683802760067, + "grad_norm": 0.6880258321762085, + "learning_rate": 0.00014546936696785412, + "loss": 2.5834, + "step": 7060 + }, + { + "epoch": 0.5698490840125898, + "grad_norm": 0.7140741348266602, + "learning_rate": 0.00014545530583410336, + "loss": 2.6361, + "step": 7061 + }, + { + "epoch": 0.5699297877491728, + "grad_norm": 0.6419476866722107, + "learning_rate": 0.00014544124356748755, + "loss": 2.4982, + "step": 7062 + }, + { + "epoch": 
0.5700104914857558, + "grad_norm": 0.6934036612510681, + "learning_rate": 0.00014542718016835718, + "loss": 2.5748, + "step": 7063 + }, + { + "epoch": 0.5700911952223388, + "grad_norm": 0.721663236618042, + "learning_rate": 0.0001454131156370627, + "loss": 2.5419, + "step": 7064 + }, + { + "epoch": 0.5701718989589218, + "grad_norm": 0.734062671661377, + "learning_rate": 0.00014539904997395468, + "loss": 2.6288, + "step": 7065 + }, + { + "epoch": 0.5702526026955048, + "grad_norm": 0.7927694320678711, + "learning_rate": 0.00014538498317938367, + "loss": 2.6331, + "step": 7066 + }, + { + "epoch": 0.5703333064320878, + "grad_norm": 0.715929388999939, + "learning_rate": 0.00014537091525370025, + "loss": 2.6333, + "step": 7067 + }, + { + "epoch": 0.5704140101686708, + "grad_norm": 0.772230327129364, + "learning_rate": 0.00014535684619725498, + "loss": 2.6019, + "step": 7068 + }, + { + "epoch": 0.5704947139052539, + "grad_norm": 0.7277318239212036, + "learning_rate": 0.0001453427760103986, + "loss": 2.6062, + "step": 7069 + }, + { + "epoch": 0.5705754176418368, + "grad_norm": 0.6708227396011353, + "learning_rate": 0.00014532870469348164, + "loss": 2.6613, + "step": 7070 + }, + { + "epoch": 0.5706561213784198, + "grad_norm": 0.7507323622703552, + "learning_rate": 0.0001453146322468549, + "loss": 2.6456, + "step": 7071 + }, + { + "epoch": 0.5707368251150028, + "grad_norm": 0.6864063739776611, + "learning_rate": 0.00014530055867086912, + "loss": 2.6361, + "step": 7072 + }, + { + "epoch": 0.5708175288515859, + "grad_norm": 0.6805310249328613, + "learning_rate": 0.00014528648396587498, + "loss": 2.6088, + "step": 7073 + }, + { + "epoch": 0.5708982325881689, + "grad_norm": 0.7946523427963257, + "learning_rate": 0.00014527240813222325, + "loss": 2.6533, + "step": 7074 + }, + { + "epoch": 0.5709789363247518, + "grad_norm": 0.6814306974411011, + "learning_rate": 0.00014525833117026474, + "loss": 2.6478, + "step": 7075 + }, + { + "epoch": 0.5710596400613348, + "grad_norm": 
0.749664843082428, + "learning_rate": 0.00014524425308035034, + "loss": 2.6296, + "step": 7076 + }, + { + "epoch": 0.5711403437979179, + "grad_norm": 0.6774656772613525, + "learning_rate": 0.00014523017386283091, + "loss": 2.5867, + "step": 7077 + }, + { + "epoch": 0.5712210475345009, + "grad_norm": 0.7331634163856506, + "learning_rate": 0.00014521609351805733, + "loss": 2.6484, + "step": 7078 + }, + { + "epoch": 0.5713017512710838, + "grad_norm": 0.7076910734176636, + "learning_rate": 0.00014520201204638045, + "loss": 2.6464, + "step": 7079 + }, + { + "epoch": 0.5713824550076668, + "grad_norm": 0.74099200963974, + "learning_rate": 0.00014518792944815127, + "loss": 2.6304, + "step": 7080 + }, + { + "epoch": 0.5714631587442499, + "grad_norm": 0.6673823595046997, + "learning_rate": 0.00014517384572372078, + "loss": 2.5903, + "step": 7081 + }, + { + "epoch": 0.5715438624808329, + "grad_norm": 0.6872609257698059, + "learning_rate": 0.00014515976087343997, + "loss": 2.6189, + "step": 7082 + }, + { + "epoch": 0.5716245662174159, + "grad_norm": 0.7363224625587463, + "learning_rate": 0.0001451456748976599, + "loss": 2.5845, + "step": 7083 + }, + { + "epoch": 0.5717052699539988, + "grad_norm": 0.7672157287597656, + "learning_rate": 0.00014513158779673157, + "loss": 2.6331, + "step": 7084 + }, + { + "epoch": 0.5717859736905819, + "grad_norm": 0.661195695400238, + "learning_rate": 0.00014511749957100612, + "loss": 2.5827, + "step": 7085 + }, + { + "epoch": 0.5718666774271649, + "grad_norm": 0.8034788370132446, + "learning_rate": 0.0001451034102208346, + "loss": 2.6209, + "step": 7086 + }, + { + "epoch": 0.5719473811637479, + "grad_norm": 0.7318302392959595, + "learning_rate": 0.00014508931974656822, + "loss": 2.5898, + "step": 7087 + }, + { + "epoch": 0.5720280849003309, + "grad_norm": 0.7334744930267334, + "learning_rate": 0.00014507522814855814, + "loss": 2.5893, + "step": 7088 + }, + { + "epoch": 0.5721087886369138, + "grad_norm": 0.783051609992981, + "learning_rate": 
0.00014506113542715553, + "loss": 2.6284, + "step": 7089 + }, + { + "epoch": 0.5721894923734969, + "grad_norm": 0.7319497466087341, + "learning_rate": 0.00014504704158271165, + "loss": 2.5705, + "step": 7090 + }, + { + "epoch": 0.5722701961100799, + "grad_norm": 0.7886925935745239, + "learning_rate": 0.00014503294661557772, + "loss": 2.641, + "step": 7091 + }, + { + "epoch": 0.5723508998466629, + "grad_norm": 0.6882795691490173, + "learning_rate": 0.00014501885052610502, + "loss": 2.5714, + "step": 7092 + }, + { + "epoch": 0.5724316035832459, + "grad_norm": 0.7089235186576843, + "learning_rate": 0.00014500475331464494, + "loss": 2.6073, + "step": 7093 + }, + { + "epoch": 0.5725123073198289, + "grad_norm": 0.7261029481887817, + "learning_rate": 0.00014499065498154874, + "loss": 2.5595, + "step": 7094 + }, + { + "epoch": 0.5725930110564119, + "grad_norm": 0.7625105977058411, + "learning_rate": 0.0001449765555271678, + "loss": 2.5978, + "step": 7095 + }, + { + "epoch": 0.5726737147929949, + "grad_norm": 0.7853986024856567, + "learning_rate": 0.00014496245495185353, + "loss": 2.6378, + "step": 7096 + }, + { + "epoch": 0.5727544185295779, + "grad_norm": 0.8070923686027527, + "learning_rate": 0.00014494835325595736, + "loss": 2.7062, + "step": 7097 + }, + { + "epoch": 0.572835122266161, + "grad_norm": 0.7074965834617615, + "learning_rate": 0.00014493425043983073, + "loss": 2.5177, + "step": 7098 + }, + { + "epoch": 0.5729158260027439, + "grad_norm": 0.6890520453453064, + "learning_rate": 0.00014492014650382512, + "loss": 2.6058, + "step": 7099 + }, + { + "epoch": 0.5729965297393269, + "grad_norm": 0.6979860067367554, + "learning_rate": 0.00014490604144829202, + "loss": 2.5274, + "step": 7100 + }, + { + "epoch": 0.5730772334759099, + "grad_norm": 0.7972229719161987, + "learning_rate": 0.000144891935273583, + "loss": 2.6369, + "step": 7101 + }, + { + "epoch": 0.573157937212493, + "grad_norm": 0.6994345188140869, + "learning_rate": 0.0001448778279800496, + "loss": 2.5975, + 
"step": 7102 + }, + { + "epoch": 0.573238640949076, + "grad_norm": 0.7943929433822632, + "learning_rate": 0.0001448637195680434, + "loss": 2.6317, + "step": 7103 + }, + { + "epoch": 0.5733193446856589, + "grad_norm": 0.6975306272506714, + "learning_rate": 0.00014484961003791605, + "loss": 2.6264, + "step": 7104 + }, + { + "epoch": 0.5734000484222419, + "grad_norm": 0.6889060735702515, + "learning_rate": 0.00014483549939001917, + "loss": 2.5974, + "step": 7105 + }, + { + "epoch": 0.573480752158825, + "grad_norm": 0.7372777462005615, + "learning_rate": 0.00014482138762470444, + "loss": 2.5851, + "step": 7106 + }, + { + "epoch": 0.573561455895408, + "grad_norm": 0.7045157551765442, + "learning_rate": 0.00014480727474232362, + "loss": 2.6451, + "step": 7107 + }, + { + "epoch": 0.5736421596319909, + "grad_norm": 0.6974517107009888, + "learning_rate": 0.00014479316074322832, + "loss": 2.6796, + "step": 7108 + }, + { + "epoch": 0.5737228633685739, + "grad_norm": 0.7328097224235535, + "learning_rate": 0.00014477904562777038, + "loss": 2.5923, + "step": 7109 + }, + { + "epoch": 0.573803567105157, + "grad_norm": 0.7288877964019775, + "learning_rate": 0.0001447649293963016, + "loss": 2.6012, + "step": 7110 + }, + { + "epoch": 0.57388427084174, + "grad_norm": 0.7054389119148254, + "learning_rate": 0.00014475081204917372, + "loss": 2.6666, + "step": 7111 + }, + { + "epoch": 0.573964974578323, + "grad_norm": 0.7447949647903442, + "learning_rate": 0.00014473669358673865, + "loss": 2.6093, + "step": 7112 + }, + { + "epoch": 0.5740456783149059, + "grad_norm": 0.6431592106819153, + "learning_rate": 0.0001447225740093482, + "loss": 2.6242, + "step": 7113 + }, + { + "epoch": 0.574126382051489, + "grad_norm": 0.7096747756004333, + "learning_rate": 0.00014470845331735434, + "loss": 2.6297, + "step": 7114 + }, + { + "epoch": 0.574207085788072, + "grad_norm": 0.6918880939483643, + "learning_rate": 0.00014469433151110894, + "loss": 2.5849, + "step": 7115 + }, + { + "epoch": 
0.574287789524655, + "grad_norm": 0.6617783308029175, + "learning_rate": 0.00014468020859096395, + "loss": 2.5972, + "step": 7116 + }, + { + "epoch": 0.574368493261238, + "grad_norm": 0.6525121927261353, + "learning_rate": 0.0001446660845572714, + "loss": 2.5888, + "step": 7117 + }, + { + "epoch": 0.574449196997821, + "grad_norm": 0.7024720907211304, + "learning_rate": 0.00014465195941038326, + "loss": 2.6135, + "step": 7118 + }, + { + "epoch": 0.574529900734404, + "grad_norm": 0.7660520672798157, + "learning_rate": 0.00014463783315065153, + "loss": 2.5837, + "step": 7119 + }, + { + "epoch": 0.574610604470987, + "grad_norm": 0.8206443190574646, + "learning_rate": 0.00014462370577842838, + "loss": 2.6749, + "step": 7120 + }, + { + "epoch": 0.57469130820757, + "grad_norm": 0.7176216840744019, + "learning_rate": 0.00014460957729406577, + "loss": 2.5814, + "step": 7121 + }, + { + "epoch": 0.5747720119441531, + "grad_norm": 0.7867588400840759, + "learning_rate": 0.0001445954476979159, + "loss": 2.5697, + "step": 7122 + }, + { + "epoch": 0.574852715680736, + "grad_norm": 0.7150471806526184, + "learning_rate": 0.0001445813169903309, + "loss": 2.5689, + "step": 7123 + }, + { + "epoch": 0.574933419417319, + "grad_norm": 0.7082479596138, + "learning_rate": 0.00014456718517166296, + "loss": 2.6081, + "step": 7124 + }, + { + "epoch": 0.575014123153902, + "grad_norm": 0.7207253575325012, + "learning_rate": 0.00014455305224226426, + "loss": 2.6573, + "step": 7125 + }, + { + "epoch": 0.5750948268904851, + "grad_norm": 0.7451751232147217, + "learning_rate": 0.00014453891820248704, + "loss": 2.6057, + "step": 7126 + }, + { + "epoch": 0.575175530627068, + "grad_norm": 0.7030230164527893, + "learning_rate": 0.0001445247830526835, + "loss": 2.6122, + "step": 7127 + }, + { + "epoch": 0.575256234363651, + "grad_norm": 0.7233754396438599, + "learning_rate": 0.00014451064679320605, + "loss": 2.5937, + "step": 7128 + }, + { + "epoch": 0.575336938100234, + "grad_norm": 0.6943942904472351, + 
"learning_rate": 0.0001444965094244069, + "loss": 2.6327, + "step": 7129 + }, + { + "epoch": 0.5754176418368171, + "grad_norm": 0.682056725025177, + "learning_rate": 0.00014448237094663843, + "loss": 2.6212, + "step": 7130 + }, + { + "epoch": 0.5754983455734001, + "grad_norm": 0.7424136400222778, + "learning_rate": 0.00014446823136025298, + "loss": 2.6031, + "step": 7131 + }, + { + "epoch": 0.575579049309983, + "grad_norm": 0.7464002370834351, + "learning_rate": 0.00014445409066560298, + "loss": 2.6363, + "step": 7132 + }, + { + "epoch": 0.575659753046566, + "grad_norm": 0.7137650847434998, + "learning_rate": 0.00014443994886304085, + "loss": 2.5343, + "step": 7133 + }, + { + "epoch": 0.5757404567831491, + "grad_norm": 0.6744158864021301, + "learning_rate": 0.00014442580595291901, + "loss": 2.6463, + "step": 7134 + }, + { + "epoch": 0.5758211605197321, + "grad_norm": 0.6947084069252014, + "learning_rate": 0.00014441166193558991, + "loss": 2.6074, + "step": 7135 + }, + { + "epoch": 0.5759018642563151, + "grad_norm": 0.6981585621833801, + "learning_rate": 0.00014439751681140616, + "loss": 2.6257, + "step": 7136 + }, + { + "epoch": 0.575982567992898, + "grad_norm": 0.6800102591514587, + "learning_rate": 0.00014438337058072023, + "loss": 2.6447, + "step": 7137 + }, + { + "epoch": 0.5760632717294811, + "grad_norm": 0.6952316164970398, + "learning_rate": 0.00014436922324388465, + "loss": 2.5739, + "step": 7138 + }, + { + "epoch": 0.5761439754660641, + "grad_norm": 0.709170937538147, + "learning_rate": 0.0001443550748012521, + "loss": 2.5918, + "step": 7139 + }, + { + "epoch": 0.5762246792026471, + "grad_norm": 0.7677363157272339, + "learning_rate": 0.00014434092525317512, + "loss": 2.6322, + "step": 7140 + }, + { + "epoch": 0.5763053829392301, + "grad_norm": 0.6730263233184814, + "learning_rate": 0.00014432677460000636, + "loss": 2.6764, + "step": 7141 + }, + { + "epoch": 0.576386086675813, + "grad_norm": 0.6782239675521851, + "learning_rate": 0.0001443126228420985, + 
"loss": 2.5208, + "step": 7142 + }, + { + "epoch": 0.5764667904123961, + "grad_norm": 0.7737600207328796, + "learning_rate": 0.00014429846997980424, + "loss": 2.6964, + "step": 7143 + }, + { + "epoch": 0.5765474941489791, + "grad_norm": 0.7456403374671936, + "learning_rate": 0.00014428431601347635, + "loss": 2.6163, + "step": 7144 + }, + { + "epoch": 0.5766281978855621, + "grad_norm": 0.7824606895446777, + "learning_rate": 0.00014427016094346754, + "loss": 2.6499, + "step": 7145 + }, + { + "epoch": 0.576708901622145, + "grad_norm": 0.7233635187149048, + "learning_rate": 0.00014425600477013055, + "loss": 2.6064, + "step": 7146 + }, + { + "epoch": 0.5767896053587281, + "grad_norm": 0.7008275389671326, + "learning_rate": 0.00014424184749381824, + "loss": 2.5585, + "step": 7147 + }, + { + "epoch": 0.5768703090953111, + "grad_norm": 0.6817710995674133, + "learning_rate": 0.00014422768911488346, + "loss": 2.6215, + "step": 7148 + }, + { + "epoch": 0.5769510128318941, + "grad_norm": 0.6860779523849487, + "learning_rate": 0.00014421352963367906, + "loss": 2.5877, + "step": 7149 + }, + { + "epoch": 0.5770317165684771, + "grad_norm": 0.732865035533905, + "learning_rate": 0.00014419936905055793, + "loss": 2.5704, + "step": 7150 + }, + { + "epoch": 0.5771124203050602, + "grad_norm": 0.6992458701133728, + "learning_rate": 0.00014418520736587297, + "loss": 2.6654, + "step": 7151 + }, + { + "epoch": 0.5771931240416431, + "grad_norm": 0.6865053176879883, + "learning_rate": 0.00014417104457997715, + "loss": 2.6389, + "step": 7152 + }, + { + "epoch": 0.5772738277782261, + "grad_norm": 0.7652727365493774, + "learning_rate": 0.00014415688069322345, + "loss": 2.6478, + "step": 7153 + }, + { + "epoch": 0.5773545315148091, + "grad_norm": 0.708692193031311, + "learning_rate": 0.0001441427157059648, + "loss": 2.6065, + "step": 7154 + }, + { + "epoch": 0.5774352352513922, + "grad_norm": 0.7549232244491577, + "learning_rate": 0.00014412854961855435, + "loss": 2.6484, + "step": 7155 + }, + { 
+ "epoch": 0.5775159389879752, + "grad_norm": 0.6410655975341797, + "learning_rate": 0.00014411438243134506, + "loss": 2.6061, + "step": 7156 + }, + { + "epoch": 0.5775966427245581, + "grad_norm": 0.7711724042892456, + "learning_rate": 0.00014410021414469005, + "loss": 2.628, + "step": 7157 + }, + { + "epoch": 0.5776773464611411, + "grad_norm": 0.6723695993423462, + "learning_rate": 0.0001440860447589424, + "loss": 2.6214, + "step": 7158 + }, + { + "epoch": 0.5777580501977242, + "grad_norm": 0.7359206676483154, + "learning_rate": 0.0001440718742744553, + "loss": 2.6157, + "step": 7159 + }, + { + "epoch": 0.5778387539343072, + "grad_norm": 0.7320525050163269, + "learning_rate": 0.0001440577026915819, + "loss": 2.6081, + "step": 7160 + }, + { + "epoch": 0.5779194576708901, + "grad_norm": 0.7728561162948608, + "learning_rate": 0.00014404353001067535, + "loss": 2.5989, + "step": 7161 + }, + { + "epoch": 0.5780001614074731, + "grad_norm": 0.7380329370498657, + "learning_rate": 0.0001440293562320889, + "loss": 2.6337, + "step": 7162 + }, + { + "epoch": 0.5780808651440562, + "grad_norm": 0.667789876461029, + "learning_rate": 0.00014401518135617581, + "loss": 2.6324, + "step": 7163 + }, + { + "epoch": 0.5781615688806392, + "grad_norm": 0.6907219886779785, + "learning_rate": 0.00014400100538328935, + "loss": 2.5897, + "step": 7164 + }, + { + "epoch": 0.5782422726172222, + "grad_norm": 0.9051530957221985, + "learning_rate": 0.00014398682831378283, + "loss": 2.6895, + "step": 7165 + }, + { + "epoch": 0.5783229763538051, + "grad_norm": 0.7189533114433289, + "learning_rate": 0.00014397265014800956, + "loss": 2.5948, + "step": 7166 + }, + { + "epoch": 0.5784036800903882, + "grad_norm": 0.7003059983253479, + "learning_rate": 0.00014395847088632285, + "loss": 2.5814, + "step": 7167 + }, + { + "epoch": 0.5784843838269712, + "grad_norm": 0.8083534240722656, + "learning_rate": 0.0001439442905290762, + "loss": 2.6131, + "step": 7168 + }, + { + "epoch": 0.5785650875635542, + 
"grad_norm": 0.7068585157394409, + "learning_rate": 0.0001439301090766229, + "loss": 2.6027, + "step": 7169 + }, + { + "epoch": 0.5786457913001372, + "grad_norm": 0.7010494470596313, + "learning_rate": 0.00014391592652931653, + "loss": 2.5296, + "step": 7170 + }, + { + "epoch": 0.5787264950367202, + "grad_norm": 0.7577467560768127, + "learning_rate": 0.00014390174288751045, + "loss": 2.6347, + "step": 7171 + }, + { + "epoch": 0.5788071987733032, + "grad_norm": 0.643799364566803, + "learning_rate": 0.00014388755815155813, + "loss": 2.6152, + "step": 7172 + }, + { + "epoch": 0.5788879025098862, + "grad_norm": 0.740352988243103, + "learning_rate": 0.00014387337232181315, + "loss": 2.6123, + "step": 7173 + }, + { + "epoch": 0.5789686062464692, + "grad_norm": 0.7309309840202332, + "learning_rate": 0.00014385918539862907, + "loss": 2.6072, + "step": 7174 + }, + { + "epoch": 0.5790493099830523, + "grad_norm": 0.7237016558647156, + "learning_rate": 0.00014384499738235941, + "loss": 2.6375, + "step": 7175 + }, + { + "epoch": 0.5791300137196352, + "grad_norm": 0.6600970029830933, + "learning_rate": 0.00014383080827335784, + "loss": 2.5285, + "step": 7176 + }, + { + "epoch": 0.5792107174562182, + "grad_norm": 0.6822233200073242, + "learning_rate": 0.00014381661807197794, + "loss": 2.5497, + "step": 7177 + }, + { + "epoch": 0.5792914211928012, + "grad_norm": 0.6990383863449097, + "learning_rate": 0.00014380242677857337, + "loss": 2.6283, + "step": 7178 + }, + { + "epoch": 0.5793721249293843, + "grad_norm": 0.64422208070755, + "learning_rate": 0.00014378823439349783, + "loss": 2.5762, + "step": 7179 + }, + { + "epoch": 0.5794528286659673, + "grad_norm": 0.63804692029953, + "learning_rate": 0.00014377404091710501, + "loss": 2.5523, + "step": 7180 + }, + { + "epoch": 0.5795335324025502, + "grad_norm": 0.6978863477706909, + "learning_rate": 0.0001437598463497487, + "loss": 2.5089, + "step": 7181 + }, + { + "epoch": 0.5796142361391332, + "grad_norm": 0.7091087698936462, + 
"learning_rate": 0.00014374565069178257, + "loss": 2.7005, + "step": 7182 + }, + { + "epoch": 0.5796949398757163, + "grad_norm": 0.683659553527832, + "learning_rate": 0.00014373145394356053, + "loss": 2.5988, + "step": 7183 + }, + { + "epoch": 0.5797756436122993, + "grad_norm": 0.7352960705757141, + "learning_rate": 0.00014371725610543633, + "loss": 2.5671, + "step": 7184 + }, + { + "epoch": 0.5798563473488823, + "grad_norm": 0.6951913237571716, + "learning_rate": 0.00014370305717776382, + "loss": 2.5917, + "step": 7185 + }, + { + "epoch": 0.5799370510854652, + "grad_norm": 0.6644465923309326, + "learning_rate": 0.0001436888571608969, + "loss": 2.5954, + "step": 7186 + }, + { + "epoch": 0.5800177548220483, + "grad_norm": 0.7406458258628845, + "learning_rate": 0.00014367465605518942, + "loss": 2.6369, + "step": 7187 + }, + { + "epoch": 0.5800984585586313, + "grad_norm": 0.6724697351455688, + "learning_rate": 0.00014366045386099535, + "loss": 2.6227, + "step": 7188 + }, + { + "epoch": 0.5801791622952143, + "grad_norm": 0.6804977059364319, + "learning_rate": 0.00014364625057866867, + "loss": 2.6445, + "step": 7189 + }, + { + "epoch": 0.5802598660317972, + "grad_norm": 0.7020019888877869, + "learning_rate": 0.00014363204620856335, + "loss": 2.6733, + "step": 7190 + }, + { + "epoch": 0.5803405697683802, + "grad_norm": 0.6458491086959839, + "learning_rate": 0.00014361784075103332, + "loss": 2.572, + "step": 7191 + }, + { + "epoch": 0.5804212735049633, + "grad_norm": 0.7078056335449219, + "learning_rate": 0.00014360363420643272, + "loss": 2.7032, + "step": 7192 + }, + { + "epoch": 0.5805019772415463, + "grad_norm": 0.6367471814155579, + "learning_rate": 0.00014358942657511557, + "loss": 2.5369, + "step": 7193 + }, + { + "epoch": 0.5805826809781293, + "grad_norm": 0.7311955094337463, + "learning_rate": 0.00014357521785743596, + "loss": 2.6513, + "step": 7194 + }, + { + "epoch": 0.5806633847147122, + "grad_norm": 0.6957442164421082, + "learning_rate": 
0.00014356100805374805, + "loss": 2.6512, + "step": 7195 + }, + { + "epoch": 0.5807440884512953, + "grad_norm": 0.7026693224906921, + "learning_rate": 0.0001435467971644059, + "loss": 2.6049, + "step": 7196 + }, + { + "epoch": 0.5808247921878783, + "grad_norm": 0.7337697744369507, + "learning_rate": 0.00014353258518976376, + "loss": 2.5516, + "step": 7197 + }, + { + "epoch": 0.5809054959244613, + "grad_norm": 0.6891856789588928, + "learning_rate": 0.00014351837213017577, + "loss": 2.5894, + "step": 7198 + }, + { + "epoch": 0.5809861996610443, + "grad_norm": 0.6710659265518188, + "learning_rate": 0.0001435041579859962, + "loss": 2.596, + "step": 7199 + }, + { + "epoch": 0.5810669033976273, + "grad_norm": 0.7637245059013367, + "learning_rate": 0.00014348994275757931, + "loss": 2.6278, + "step": 7200 + }, + { + "epoch": 0.5811476071342103, + "grad_norm": 0.7558664679527283, + "learning_rate": 0.00014347572644527934, + "loss": 2.6917, + "step": 7201 + }, + { + "epoch": 0.5812283108707933, + "grad_norm": 0.7254986763000488, + "learning_rate": 0.00014346150904945065, + "loss": 2.6161, + "step": 7202 + }, + { + "epoch": 0.5813090146073763, + "grad_norm": 0.7177211046218872, + "learning_rate": 0.00014344729057044753, + "loss": 2.555, + "step": 7203 + }, + { + "epoch": 0.5813897183439594, + "grad_norm": 0.6408729553222656, + "learning_rate": 0.00014343307100862432, + "loss": 2.6071, + "step": 7204 + }, + { + "epoch": 0.5814704220805423, + "grad_norm": 0.7399997711181641, + "learning_rate": 0.0001434188503643355, + "loss": 2.6013, + "step": 7205 + }, + { + "epoch": 0.5815511258171253, + "grad_norm": 0.7796236276626587, + "learning_rate": 0.00014340462863793543, + "loss": 2.603, + "step": 7206 + }, + { + "epoch": 0.5816318295537083, + "grad_norm": 0.7420137524604797, + "learning_rate": 0.00014339040582977855, + "loss": 2.5858, + "step": 7207 + }, + { + "epoch": 0.5817125332902914, + "grad_norm": 0.738042414188385, + "learning_rate": 0.00014337618194021928, + "loss": 2.592, + 
"step": 7208 + }, + { + "epoch": 0.5817932370268744, + "grad_norm": 0.6910614371299744, + "learning_rate": 0.00014336195696961222, + "loss": 2.6448, + "step": 7209 + }, + { + "epoch": 0.5818739407634573, + "grad_norm": 0.7838915586471558, + "learning_rate": 0.00014334773091831185, + "loss": 2.6257, + "step": 7210 + }, + { + "epoch": 0.5819546445000403, + "grad_norm": 0.7362141013145447, + "learning_rate": 0.0001433335037866727, + "loss": 2.6505, + "step": 7211 + }, + { + "epoch": 0.5820353482366234, + "grad_norm": 0.6892269253730774, + "learning_rate": 0.00014331927557504934, + "loss": 2.6518, + "step": 7212 + }, + { + "epoch": 0.5821160519732064, + "grad_norm": 0.7444556951522827, + "learning_rate": 0.0001433050462837964, + "loss": 2.6785, + "step": 7213 + }, + { + "epoch": 0.5821967557097893, + "grad_norm": 0.6948450207710266, + "learning_rate": 0.00014329081591326853, + "loss": 2.5753, + "step": 7214 + }, + { + "epoch": 0.5822774594463723, + "grad_norm": 0.713741660118103, + "learning_rate": 0.00014327658446382032, + "loss": 2.6425, + "step": 7215 + }, + { + "epoch": 0.5823581631829554, + "grad_norm": 0.7352245450019836, + "learning_rate": 0.00014326235193580657, + "loss": 2.6859, + "step": 7216 + }, + { + "epoch": 0.5824388669195384, + "grad_norm": 0.7151867151260376, + "learning_rate": 0.00014324811832958187, + "loss": 2.6106, + "step": 7217 + }, + { + "epoch": 0.5825195706561214, + "grad_norm": 0.7003469467163086, + "learning_rate": 0.000143233883645501, + "loss": 2.618, + "step": 7218 + }, + { + "epoch": 0.5826002743927043, + "grad_norm": 0.7139034867286682, + "learning_rate": 0.00014321964788391878, + "loss": 2.5772, + "step": 7219 + }, + { + "epoch": 0.5826809781292874, + "grad_norm": 0.6368305683135986, + "learning_rate": 0.00014320541104518992, + "loss": 2.5259, + "step": 7220 + }, + { + "epoch": 0.5827616818658704, + "grad_norm": 0.6921548247337341, + "learning_rate": 0.0001431911731296693, + "loss": 2.6403, + "step": 7221 + }, + { + "epoch": 
0.5828423856024534, + "grad_norm": 0.6995570659637451, + "learning_rate": 0.00014317693413771175, + "loss": 2.6172, + "step": 7222 + }, + { + "epoch": 0.5829230893390364, + "grad_norm": 0.7557246088981628, + "learning_rate": 0.0001431626940696721, + "loss": 2.6347, + "step": 7223 + }, + { + "epoch": 0.5830037930756194, + "grad_norm": 0.6912205219268799, + "learning_rate": 0.00014314845292590528, + "loss": 2.5958, + "step": 7224 + }, + { + "epoch": 0.5830844968122024, + "grad_norm": 0.6896184682846069, + "learning_rate": 0.00014313421070676625, + "loss": 2.569, + "step": 7225 + }, + { + "epoch": 0.5831652005487854, + "grad_norm": 0.6900814771652222, + "learning_rate": 0.00014311996741260994, + "loss": 2.5466, + "step": 7226 + }, + { + "epoch": 0.5832459042853684, + "grad_norm": 0.7319771647453308, + "learning_rate": 0.00014310572304379132, + "loss": 2.6181, + "step": 7227 + }, + { + "epoch": 0.5833266080219515, + "grad_norm": 0.728138267993927, + "learning_rate": 0.0001430914776006654, + "loss": 2.6644, + "step": 7228 + }, + { + "epoch": 0.5834073117585344, + "grad_norm": 0.7361802458763123, + "learning_rate": 0.0001430772310835872, + "loss": 2.6079, + "step": 7229 + }, + { + "epoch": 0.5834880154951174, + "grad_norm": 0.6893376708030701, + "learning_rate": 0.00014306298349291182, + "loss": 2.5615, + "step": 7230 + }, + { + "epoch": 0.5835687192317004, + "grad_norm": 0.6661401987075806, + "learning_rate": 0.00014304873482899431, + "loss": 2.6028, + "step": 7231 + }, + { + "epoch": 0.5836494229682835, + "grad_norm": 0.6571504473686218, + "learning_rate": 0.0001430344850921898, + "loss": 2.5553, + "step": 7232 + }, + { + "epoch": 0.5837301267048665, + "grad_norm": 0.6878423690795898, + "learning_rate": 0.00014302023428285342, + "loss": 2.5336, + "step": 7233 + }, + { + "epoch": 0.5838108304414494, + "grad_norm": 0.768117368221283, + "learning_rate": 0.00014300598240134035, + "loss": 2.6036, + "step": 7234 + }, + { + "epoch": 0.5838915341780324, + "grad_norm": 
0.6876625418663025, + "learning_rate": 0.0001429917294480058, + "loss": 2.6314, + "step": 7235 + }, + { + "epoch": 0.5839722379146155, + "grad_norm": 0.7146790027618408, + "learning_rate": 0.00014297747542320495, + "loss": 2.6029, + "step": 7236 + }, + { + "epoch": 0.5840529416511985, + "grad_norm": 0.7032392024993896, + "learning_rate": 0.00014296322032729308, + "loss": 2.6163, + "step": 7237 + }, + { + "epoch": 0.5841336453877815, + "grad_norm": 0.7323551177978516, + "learning_rate": 0.00014294896416062544, + "loss": 2.6706, + "step": 7238 + }, + { + "epoch": 0.5842143491243644, + "grad_norm": 0.7647258639335632, + "learning_rate": 0.00014293470692355734, + "loss": 2.6744, + "step": 7239 + }, + { + "epoch": 0.5842950528609475, + "grad_norm": 0.6824506521224976, + "learning_rate": 0.00014292044861644414, + "loss": 2.579, + "step": 7240 + }, + { + "epoch": 0.5843757565975305, + "grad_norm": 0.7553619742393494, + "learning_rate": 0.00014290618923964115, + "loss": 2.6196, + "step": 7241 + }, + { + "epoch": 0.5844564603341135, + "grad_norm": 0.6872109770774841, + "learning_rate": 0.00014289192879350375, + "loss": 2.555, + "step": 7242 + }, + { + "epoch": 0.5845371640706964, + "grad_norm": 0.664658784866333, + "learning_rate": 0.00014287766727838735, + "loss": 2.5781, + "step": 7243 + }, + { + "epoch": 0.5846178678072794, + "grad_norm": 0.6709543466567993, + "learning_rate": 0.00014286340469464744, + "loss": 2.6022, + "step": 7244 + }, + { + "epoch": 0.5846985715438625, + "grad_norm": 0.7236210107803345, + "learning_rate": 0.00014284914104263941, + "loss": 2.5609, + "step": 7245 + }, + { + "epoch": 0.5847792752804455, + "grad_norm": 0.6751740574836731, + "learning_rate": 0.0001428348763227188, + "loss": 2.5792, + "step": 7246 + }, + { + "epoch": 0.5848599790170285, + "grad_norm": 0.6684607267379761, + "learning_rate": 0.0001428206105352411, + "loss": 2.5705, + "step": 7247 + }, + { + "epoch": 0.5849406827536114, + "grad_norm": 0.6876732707023621, + "learning_rate": 
0.00014280634368056186, + "loss": 2.6576, + "step": 7248 + }, + { + "epoch": 0.5850213864901945, + "grad_norm": 0.758637547492981, + "learning_rate": 0.0001427920757590366, + "loss": 2.6215, + "step": 7249 + }, + { + "epoch": 0.5851020902267775, + "grad_norm": 0.6839025020599365, + "learning_rate": 0.00014277780677102097, + "loss": 2.5898, + "step": 7250 + }, + { + "epoch": 0.5851827939633605, + "grad_norm": 0.6912671327590942, + "learning_rate": 0.00014276353671687056, + "loss": 2.5879, + "step": 7251 + }, + { + "epoch": 0.5852634976999435, + "grad_norm": 0.6727048754692078, + "learning_rate": 0.00014274926559694107, + "loss": 2.5501, + "step": 7252 + }, + { + "epoch": 0.5853442014365265, + "grad_norm": 0.7031945586204529, + "learning_rate": 0.00014273499341158812, + "loss": 2.625, + "step": 7253 + }, + { + "epoch": 0.5854249051731095, + "grad_norm": 0.6886943578720093, + "learning_rate": 0.0001427207201611674, + "loss": 2.6141, + "step": 7254 + }, + { + "epoch": 0.5855056089096925, + "grad_norm": 0.7906915545463562, + "learning_rate": 0.00014270644584603466, + "loss": 2.7189, + "step": 7255 + }, + { + "epoch": 0.5855863126462755, + "grad_norm": 0.6873704195022583, + "learning_rate": 0.00014269217046654567, + "loss": 2.6031, + "step": 7256 + }, + { + "epoch": 0.5856670163828586, + "grad_norm": 0.6655381321907043, + "learning_rate": 0.00014267789402305618, + "loss": 2.5747, + "step": 7257 + }, + { + "epoch": 0.5857477201194415, + "grad_norm": 0.6655673384666443, + "learning_rate": 0.00014266361651592204, + "loss": 2.625, + "step": 7258 + }, + { + "epoch": 0.5858284238560245, + "grad_norm": 0.6752866506576538, + "learning_rate": 0.00014264933794549901, + "loss": 2.5914, + "step": 7259 + }, + { + "epoch": 0.5859091275926075, + "grad_norm": 0.6680975556373596, + "learning_rate": 0.00014263505831214302, + "loss": 2.5572, + "step": 7260 + }, + { + "epoch": 0.5859898313291906, + "grad_norm": 0.6873607039451599, + "learning_rate": 0.00014262077761620994, + "loss": 2.6696, 
+ "step": 7261 + }, + { + "epoch": 0.5860705350657736, + "grad_norm": 0.6745384335517883, + "learning_rate": 0.00014260649585805566, + "loss": 2.5738, + "step": 7262 + }, + { + "epoch": 0.5861512388023565, + "grad_norm": 0.6524637937545776, + "learning_rate": 0.0001425922130380361, + "loss": 2.6209, + "step": 7263 + }, + { + "epoch": 0.5862319425389395, + "grad_norm": 0.6729850172996521, + "learning_rate": 0.00014257792915650728, + "loss": 2.652, + "step": 7264 + }, + { + "epoch": 0.5863126462755226, + "grad_norm": 0.6713503003120422, + "learning_rate": 0.00014256364421382514, + "loss": 2.5658, + "step": 7265 + }, + { + "epoch": 0.5863933500121056, + "grad_norm": 0.6835616827011108, + "learning_rate": 0.00014254935821034575, + "loss": 2.5535, + "step": 7266 + }, + { + "epoch": 0.5864740537486886, + "grad_norm": 0.7425376176834106, + "learning_rate": 0.00014253507114642515, + "loss": 2.6369, + "step": 7267 + }, + { + "epoch": 0.5865547574852715, + "grad_norm": 0.6788069605827332, + "learning_rate": 0.00014252078302241932, + "loss": 2.601, + "step": 7268 + }, + { + "epoch": 0.5866354612218546, + "grad_norm": 0.6828538179397583, + "learning_rate": 0.0001425064938386845, + "loss": 2.5861, + "step": 7269 + }, + { + "epoch": 0.5867161649584376, + "grad_norm": 0.6763372421264648, + "learning_rate": 0.0001424922035955767, + "loss": 2.6035, + "step": 7270 + }, + { + "epoch": 0.5867968686950206, + "grad_norm": 0.6517930626869202, + "learning_rate": 0.0001424779122934521, + "loss": 2.5564, + "step": 7271 + }, + { + "epoch": 0.5868775724316035, + "grad_norm": 0.6633113622665405, + "learning_rate": 0.00014246361993266692, + "loss": 2.6163, + "step": 7272 + }, + { + "epoch": 0.5869582761681866, + "grad_norm": 0.684822678565979, + "learning_rate": 0.00014244932651357733, + "loss": 2.6057, + "step": 7273 + }, + { + "epoch": 0.5870389799047696, + "grad_norm": 0.7679704427719116, + "learning_rate": 0.00014243503203653952, + "loss": 2.6522, + "step": 7274 + }, + { + "epoch": 
0.5871196836413526, + "grad_norm": 0.6834188103675842, + "learning_rate": 0.00014242073650190984, + "loss": 2.652, + "step": 7275 + }, + { + "epoch": 0.5872003873779356, + "grad_norm": 0.6903846859931946, + "learning_rate": 0.00014240643991004449, + "loss": 2.5894, + "step": 7276 + }, + { + "epoch": 0.5872810911145186, + "grad_norm": 0.7060866951942444, + "learning_rate": 0.0001423921422612998, + "loss": 2.5994, + "step": 7277 + }, + { + "epoch": 0.5873617948511016, + "grad_norm": 0.6646741628646851, + "learning_rate": 0.0001423778435560321, + "loss": 2.6432, + "step": 7278 + }, + { + "epoch": 0.5874424985876846, + "grad_norm": 0.6930218935012817, + "learning_rate": 0.0001423635437945978, + "loss": 2.6233, + "step": 7279 + }, + { + "epoch": 0.5875232023242676, + "grad_norm": 0.6914143562316895, + "learning_rate": 0.00014234924297735322, + "loss": 2.6143, + "step": 7280 + }, + { + "epoch": 0.5876039060608507, + "grad_norm": 0.7351366281509399, + "learning_rate": 0.0001423349411046548, + "loss": 2.6323, + "step": 7281 + }, + { + "epoch": 0.5876846097974336, + "grad_norm": 0.6813770532608032, + "learning_rate": 0.000142320638176859, + "loss": 2.5964, + "step": 7282 + }, + { + "epoch": 0.5877653135340166, + "grad_norm": 0.7049702405929565, + "learning_rate": 0.00014230633419432226, + "loss": 2.6284, + "step": 7283 + }, + { + "epoch": 0.5878460172705996, + "grad_norm": 0.7140446901321411, + "learning_rate": 0.00014229202915740107, + "loss": 2.6113, + "step": 7284 + }, + { + "epoch": 0.5879267210071827, + "grad_norm": 0.696588933467865, + "learning_rate": 0.00014227772306645196, + "loss": 2.6384, + "step": 7285 + }, + { + "epoch": 0.5880074247437657, + "grad_norm": 0.6800615787506104, + "learning_rate": 0.0001422634159218315, + "loss": 2.5743, + "step": 7286 + }, + { + "epoch": 0.5880881284803486, + "grad_norm": 0.7586596608161926, + "learning_rate": 0.00014224910772389624, + "loss": 2.6504, + "step": 7287 + }, + { + "epoch": 0.5881688322169316, + "grad_norm": 
0.73286372423172, + "learning_rate": 0.00014223479847300278, + "loss": 2.6026, + "step": 7288 + }, + { + "epoch": 0.5882495359535147, + "grad_norm": 0.6808766722679138, + "learning_rate": 0.00014222048816950772, + "loss": 2.5822, + "step": 7289 + }, + { + "epoch": 0.5883302396900977, + "grad_norm": 0.7424919009208679, + "learning_rate": 0.0001422061768137677, + "loss": 2.6474, + "step": 7290 + }, + { + "epoch": 0.5884109434266807, + "grad_norm": 0.658183753490448, + "learning_rate": 0.00014219186440613948, + "loss": 2.6051, + "step": 7291 + }, + { + "epoch": 0.5884916471632636, + "grad_norm": 0.6693006157875061, + "learning_rate": 0.0001421775509469797, + "loss": 2.5774, + "step": 7292 + }, + { + "epoch": 0.5885723508998466, + "grad_norm": 0.7298646569252014, + "learning_rate": 0.00014216323643664508, + "loss": 2.5688, + "step": 7293 + }, + { + "epoch": 0.5886530546364297, + "grad_norm": 0.6665881276130676, + "learning_rate": 0.00014214892087549238, + "loss": 2.608, + "step": 7294 + }, + { + "epoch": 0.5887337583730127, + "grad_norm": 0.7220060229301453, + "learning_rate": 0.00014213460426387841, + "loss": 2.6078, + "step": 7295 + }, + { + "epoch": 0.5888144621095956, + "grad_norm": 0.6693970561027527, + "learning_rate": 0.00014212028660215997, + "loss": 2.597, + "step": 7296 + }, + { + "epoch": 0.5888951658461786, + "grad_norm": 0.682331919670105, + "learning_rate": 0.00014210596789069387, + "loss": 2.5752, + "step": 7297 + }, + { + "epoch": 0.5889758695827617, + "grad_norm": 0.7586890459060669, + "learning_rate": 0.000142091648129837, + "loss": 2.6878, + "step": 7298 + }, + { + "epoch": 0.5890565733193447, + "grad_norm": 0.6740901470184326, + "learning_rate": 0.00014207732731994624, + "loss": 2.6083, + "step": 7299 + }, + { + "epoch": 0.5891372770559277, + "grad_norm": 0.6959021091461182, + "learning_rate": 0.00014206300546137842, + "loss": 2.5765, + "step": 7300 + }, + { + "epoch": 0.5892179807925106, + "grad_norm": 0.7446078658103943, + "learning_rate": 
0.0001420486825544906, + "loss": 2.662, + "step": 7301 + }, + { + "epoch": 0.5892986845290937, + "grad_norm": 0.7418847680091858, + "learning_rate": 0.0001420343585996397, + "loss": 2.6606, + "step": 7302 + }, + { + "epoch": 0.5893793882656767, + "grad_norm": 0.7185709476470947, + "learning_rate": 0.00014202003359718273, + "loss": 2.563, + "step": 7303 + }, + { + "epoch": 0.5894600920022597, + "grad_norm": 0.6960515379905701, + "learning_rate": 0.00014200570754747664, + "loss": 2.6182, + "step": 7304 + }, + { + "epoch": 0.5895407957388427, + "grad_norm": 0.6589705348014832, + "learning_rate": 0.00014199138045087849, + "loss": 2.6714, + "step": 7305 + }, + { + "epoch": 0.5896214994754257, + "grad_norm": 0.7027507424354553, + "learning_rate": 0.00014197705230774543, + "loss": 2.6145, + "step": 7306 + }, + { + "epoch": 0.5897022032120087, + "grad_norm": 0.6761246919631958, + "learning_rate": 0.00014196272311843447, + "loss": 2.5688, + "step": 7307 + }, + { + "epoch": 0.5897829069485917, + "grad_norm": 0.6618059277534485, + "learning_rate": 0.00014194839288330277, + "loss": 2.6194, + "step": 7308 + }, + { + "epoch": 0.5898636106851747, + "grad_norm": 0.7182614803314209, + "learning_rate": 0.00014193406160270747, + "loss": 2.5452, + "step": 7309 + }, + { + "epoch": 0.5899443144217578, + "grad_norm": 0.6830565333366394, + "learning_rate": 0.0001419197292770057, + "loss": 2.5728, + "step": 7310 + }, + { + "epoch": 0.5900250181583407, + "grad_norm": 0.6744499802589417, + "learning_rate": 0.00014190539590655475, + "loss": 2.5736, + "step": 7311 + }, + { + "epoch": 0.5901057218949237, + "grad_norm": 0.7177874445915222, + "learning_rate": 0.00014189106149171176, + "loss": 2.6271, + "step": 7312 + }, + { + "epoch": 0.5901864256315067, + "grad_norm": 0.6770105361938477, + "learning_rate": 0.000141876726032834, + "loss": 2.5924, + "step": 7313 + }, + { + "epoch": 0.5902671293680898, + "grad_norm": 0.7295818328857422, + "learning_rate": 0.0001418623895302788, + "loss": 2.644, + 
"step": 7314 + }, + { + "epoch": 0.5903478331046728, + "grad_norm": 0.7244859933853149, + "learning_rate": 0.00014184805198440338, + "loss": 2.5892, + "step": 7315 + }, + { + "epoch": 0.5904285368412557, + "grad_norm": 0.7067728638648987, + "learning_rate": 0.00014183371339556512, + "loss": 2.5985, + "step": 7316 + }, + { + "epoch": 0.5905092405778387, + "grad_norm": 0.6732490062713623, + "learning_rate": 0.0001418193737641214, + "loss": 2.5771, + "step": 7317 + }, + { + "epoch": 0.5905899443144218, + "grad_norm": 0.7087544202804565, + "learning_rate": 0.00014180503309042957, + "loss": 2.6373, + "step": 7318 + }, + { + "epoch": 0.5906706480510048, + "grad_norm": 0.772174596786499, + "learning_rate": 0.00014179069137484703, + "loss": 2.6262, + "step": 7319 + }, + { + "epoch": 0.5907513517875878, + "grad_norm": 0.6855718493461609, + "learning_rate": 0.00014177634861773118, + "loss": 2.6268, + "step": 7320 + }, + { + "epoch": 0.5908320555241707, + "grad_norm": 0.7168720364570618, + "learning_rate": 0.00014176200481943953, + "loss": 2.5892, + "step": 7321 + }, + { + "epoch": 0.5909127592607538, + "grad_norm": 0.7126333713531494, + "learning_rate": 0.0001417476599803296, + "loss": 2.6079, + "step": 7322 + }, + { + "epoch": 0.5909934629973368, + "grad_norm": 0.7451913952827454, + "learning_rate": 0.0001417333141007588, + "loss": 2.635, + "step": 7323 + }, + { + "epoch": 0.5910741667339198, + "grad_norm": 0.7405436038970947, + "learning_rate": 0.00014171896718108475, + "loss": 2.6014, + "step": 7324 + }, + { + "epoch": 0.5911548704705027, + "grad_norm": 0.7583999037742615, + "learning_rate": 0.00014170461922166498, + "loss": 2.6815, + "step": 7325 + }, + { + "epoch": 0.5912355742070858, + "grad_norm": 0.6653509140014648, + "learning_rate": 0.00014169027022285706, + "loss": 2.6153, + "step": 7326 + }, + { + "epoch": 0.5913162779436688, + "grad_norm": 0.7145548462867737, + "learning_rate": 0.00014167592018501864, + "loss": 2.6022, + "step": 7327 + }, + { + "epoch": 
0.5913969816802518, + "grad_norm": 0.6996089816093445, + "learning_rate": 0.00014166156910850737, + "loss": 2.6586, + "step": 7328 + }, + { + "epoch": 0.5914776854168348, + "grad_norm": 0.735653281211853, + "learning_rate": 0.0001416472169936809, + "loss": 2.6084, + "step": 7329 + }, + { + "epoch": 0.5915583891534179, + "grad_norm": 0.695036768913269, + "learning_rate": 0.00014163286384089686, + "loss": 2.5058, + "step": 7330 + }, + { + "epoch": 0.5916390928900008, + "grad_norm": 0.9014756679534912, + "learning_rate": 0.00014161850965051307, + "loss": 2.5991, + "step": 7331 + }, + { + "epoch": 0.5917197966265838, + "grad_norm": 0.7079846858978271, + "learning_rate": 0.0001416041544228872, + "loss": 2.6067, + "step": 7332 + }, + { + "epoch": 0.5918005003631668, + "grad_norm": 0.7681204080581665, + "learning_rate": 0.00014158979815837705, + "loss": 2.5414, + "step": 7333 + }, + { + "epoch": 0.5918812040997499, + "grad_norm": 0.6501670479774475, + "learning_rate": 0.00014157544085734042, + "loss": 2.617, + "step": 7334 + }, + { + "epoch": 0.5919619078363328, + "grad_norm": 0.7573496103286743, + "learning_rate": 0.00014156108252013513, + "loss": 2.6341, + "step": 7335 + }, + { + "epoch": 0.5920426115729158, + "grad_norm": 0.6865558624267578, + "learning_rate": 0.00014154672314711903, + "loss": 2.6229, + "step": 7336 + }, + { + "epoch": 0.5921233153094988, + "grad_norm": 0.6859166622161865, + "learning_rate": 0.00014153236273864995, + "loss": 2.6149, + "step": 7337 + }, + { + "epoch": 0.5922040190460819, + "grad_norm": 0.7603647112846375, + "learning_rate": 0.00014151800129508585, + "loss": 2.5645, + "step": 7338 + }, + { + "epoch": 0.5922847227826649, + "grad_norm": 0.6740217208862305, + "learning_rate": 0.00014150363881678464, + "loss": 2.5883, + "step": 7339 + }, + { + "epoch": 0.5923654265192478, + "grad_norm": 0.6412263512611389, + "learning_rate": 0.00014148927530410426, + "loss": 2.576, + "step": 7340 + }, + { + "epoch": 0.5924461302558308, + "grad_norm": 
0.669834315776825, + "learning_rate": 0.00014147491075740265, + "loss": 2.542, + "step": 7341 + }, + { + "epoch": 0.5925268339924139, + "grad_norm": 0.720024049282074, + "learning_rate": 0.00014146054517703786, + "loss": 2.6491, + "step": 7342 + }, + { + "epoch": 0.5926075377289969, + "grad_norm": 0.7191612720489502, + "learning_rate": 0.00014144617856336794, + "loss": 2.5933, + "step": 7343 + }, + { + "epoch": 0.5926882414655799, + "grad_norm": 0.7012050747871399, + "learning_rate": 0.00014143181091675087, + "loss": 2.5253, + "step": 7344 + }, + { + "epoch": 0.5927689452021628, + "grad_norm": 0.7825081944465637, + "learning_rate": 0.00014141744223754478, + "loss": 2.6225, + "step": 7345 + }, + { + "epoch": 0.5928496489387458, + "grad_norm": 0.6699295043945312, + "learning_rate": 0.00014140307252610775, + "loss": 2.5893, + "step": 7346 + }, + { + "epoch": 0.5929303526753289, + "grad_norm": 0.6668846011161804, + "learning_rate": 0.00014138870178279794, + "loss": 2.5944, + "step": 7347 + }, + { + "epoch": 0.5930110564119119, + "grad_norm": 0.7681072950363159, + "learning_rate": 0.0001413743300079735, + "loss": 2.5715, + "step": 7348 + }, + { + "epoch": 0.5930917601484949, + "grad_norm": 0.653075635433197, + "learning_rate": 0.00014135995720199258, + "loss": 2.5924, + "step": 7349 + }, + { + "epoch": 0.5931724638850778, + "grad_norm": 0.6807504892349243, + "learning_rate": 0.00014134558336521342, + "loss": 2.5395, + "step": 7350 + }, + { + "epoch": 0.5932531676216609, + "grad_norm": 0.681175708770752, + "learning_rate": 0.00014133120849799423, + "loss": 2.5401, + "step": 7351 + }, + { + "epoch": 0.5933338713582439, + "grad_norm": 0.7159900665283203, + "learning_rate": 0.0001413168326006933, + "loss": 2.5684, + "step": 7352 + }, + { + "epoch": 0.5934145750948269, + "grad_norm": 0.6517181992530823, + "learning_rate": 0.00014130245567366888, + "loss": 2.5887, + "step": 7353 + }, + { + "epoch": 0.5934952788314098, + "grad_norm": 0.6982731223106384, + "learning_rate": 
0.00014128807771727936, + "loss": 2.5707, + "step": 7354 + }, + { + "epoch": 0.5935759825679929, + "grad_norm": 0.7003650069236755, + "learning_rate": 0.00014127369873188296, + "loss": 2.6415, + "step": 7355 + }, + { + "epoch": 0.5936566863045759, + "grad_norm": 0.7408339977264404, + "learning_rate": 0.0001412593187178381, + "loss": 2.5655, + "step": 7356 + }, + { + "epoch": 0.5937373900411589, + "grad_norm": 0.717218279838562, + "learning_rate": 0.00014124493767550317, + "loss": 2.586, + "step": 7357 + }, + { + "epoch": 0.5938180937777419, + "grad_norm": 0.6723458766937256, + "learning_rate": 0.00014123055560523657, + "loss": 2.593, + "step": 7358 + }, + { + "epoch": 0.593898797514325, + "grad_norm": 0.6861262321472168, + "learning_rate": 0.00014121617250739677, + "loss": 2.612, + "step": 7359 + }, + { + "epoch": 0.5939795012509079, + "grad_norm": 0.6811453104019165, + "learning_rate": 0.00014120178838234222, + "loss": 2.5708, + "step": 7360 + }, + { + "epoch": 0.5940602049874909, + "grad_norm": 0.6249656677246094, + "learning_rate": 0.00014118740323043136, + "loss": 2.5604, + "step": 7361 + }, + { + "epoch": 0.5941409087240739, + "grad_norm": 0.7671588659286499, + "learning_rate": 0.00014117301705202274, + "loss": 2.547, + "step": 7362 + }, + { + "epoch": 0.594221612460657, + "grad_norm": 0.6856057643890381, + "learning_rate": 0.00014115862984747496, + "loss": 2.6108, + "step": 7363 + }, + { + "epoch": 0.5943023161972399, + "grad_norm": 0.692331850528717, + "learning_rate": 0.0001411442416171465, + "loss": 2.6347, + "step": 7364 + }, + { + "epoch": 0.5943830199338229, + "grad_norm": 0.7256516814231873, + "learning_rate": 0.000141129852361396, + "loss": 2.6098, + "step": 7365 + }, + { + "epoch": 0.5944637236704059, + "grad_norm": 0.7522590160369873, + "learning_rate": 0.00014111546208058203, + "loss": 2.5688, + "step": 7366 + }, + { + "epoch": 0.594544427406989, + "grad_norm": 0.6915806531906128, + "learning_rate": 0.0001411010707750633, + "loss": 2.5899, + 
"step": 7367 + }, + { + "epoch": 0.594625131143572, + "grad_norm": 0.7355465292930603, + "learning_rate": 0.00014108667844519844, + "loss": 2.5212, + "step": 7368 + }, + { + "epoch": 0.5947058348801549, + "grad_norm": 0.731002926826477, + "learning_rate": 0.00014107228509134615, + "loss": 2.6369, + "step": 7369 + }, + { + "epoch": 0.5947865386167379, + "grad_norm": 0.6764423251152039, + "learning_rate": 0.0001410578907138652, + "loss": 2.6012, + "step": 7370 + }, + { + "epoch": 0.594867242353321, + "grad_norm": 0.7466071844100952, + "learning_rate": 0.0001410434953131142, + "loss": 2.5822, + "step": 7371 + }, + { + "epoch": 0.594947946089904, + "grad_norm": 0.7276137471199036, + "learning_rate": 0.00014102909888945205, + "loss": 2.6055, + "step": 7372 + }, + { + "epoch": 0.595028649826487, + "grad_norm": 0.7411746978759766, + "learning_rate": 0.00014101470144323752, + "loss": 2.6489, + "step": 7373 + }, + { + "epoch": 0.5951093535630699, + "grad_norm": 0.7511908411979675, + "learning_rate": 0.0001410003029748294, + "loss": 2.6268, + "step": 7374 + }, + { + "epoch": 0.595190057299653, + "grad_norm": 0.6623562574386597, + "learning_rate": 0.0001409859034845866, + "loss": 2.58, + "step": 7375 + }, + { + "epoch": 0.595270761036236, + "grad_norm": 0.6948572397232056, + "learning_rate": 0.00014097150297286785, + "loss": 2.5811, + "step": 7376 + }, + { + "epoch": 0.595351464772819, + "grad_norm": 0.6836786270141602, + "learning_rate": 0.0001409571014400322, + "loss": 2.5861, + "step": 7377 + }, + { + "epoch": 0.595432168509402, + "grad_norm": 0.6644341945648193, + "learning_rate": 0.00014094269888643854, + "loss": 2.6339, + "step": 7378 + }, + { + "epoch": 0.595512872245985, + "grad_norm": 0.6434289813041687, + "learning_rate": 0.0001409282953124458, + "loss": 2.4897, + "step": 7379 + }, + { + "epoch": 0.595593575982568, + "grad_norm": 0.6745082139968872, + "learning_rate": 0.0001409138907184129, + "loss": 2.522, + "step": 7380 + }, + { + "epoch": 0.595674279719151, + 
"grad_norm": 0.725321352481842, + "learning_rate": 0.0001408994851046989, + "loss": 2.5711, + "step": 7381 + }, + { + "epoch": 0.595754983455734, + "grad_norm": 0.7485500574111938, + "learning_rate": 0.00014088507847166283, + "loss": 2.6095, + "step": 7382 + }, + { + "epoch": 0.595835687192317, + "grad_norm": 0.721125602722168, + "learning_rate": 0.00014087067081966376, + "loss": 2.6762, + "step": 7383 + }, + { + "epoch": 0.5959163909289, + "grad_norm": 0.7099901437759399, + "learning_rate": 0.00014085626214906073, + "loss": 2.5667, + "step": 7384 + }, + { + "epoch": 0.595997094665483, + "grad_norm": 0.6889060139656067, + "learning_rate": 0.00014084185246021283, + "loss": 2.6723, + "step": 7385 + }, + { + "epoch": 0.596077798402066, + "grad_norm": 0.735698938369751, + "learning_rate": 0.00014082744175347923, + "loss": 2.6434, + "step": 7386 + }, + { + "epoch": 0.5961585021386491, + "grad_norm": 0.7603070735931396, + "learning_rate": 0.00014081303002921902, + "loss": 2.665, + "step": 7387 + }, + { + "epoch": 0.596239205875232, + "grad_norm": 0.6786355376243591, + "learning_rate": 0.00014079861728779141, + "loss": 2.5842, + "step": 7388 + }, + { + "epoch": 0.596319909611815, + "grad_norm": 0.6693331003189087, + "learning_rate": 0.00014078420352955565, + "loss": 2.6211, + "step": 7389 + }, + { + "epoch": 0.596400613348398, + "grad_norm": 0.74013751745224, + "learning_rate": 0.0001407697887548709, + "loss": 2.5886, + "step": 7390 + }, + { + "epoch": 0.5964813170849811, + "grad_norm": 0.739507257938385, + "learning_rate": 0.00014075537296409646, + "loss": 2.607, + "step": 7391 + }, + { + "epoch": 0.5965620208215641, + "grad_norm": 0.7121848464012146, + "learning_rate": 0.00014074095615759156, + "loss": 2.6052, + "step": 7392 + }, + { + "epoch": 0.596642724558147, + "grad_norm": 0.7526760697364807, + "learning_rate": 0.00014072653833571556, + "loss": 2.6051, + "step": 7393 + }, + { + "epoch": 0.59672342829473, + "grad_norm": 0.7867496609687805, + "learning_rate": 
0.00014071211949882777, + "loss": 2.6228, + "step": 7394 + }, + { + "epoch": 0.596804132031313, + "grad_norm": 0.7527757883071899, + "learning_rate": 0.00014069769964728752, + "loss": 2.6793, + "step": 7395 + }, + { + "epoch": 0.5968848357678961, + "grad_norm": 0.7096899747848511, + "learning_rate": 0.00014068327878145423, + "loss": 2.5207, + "step": 7396 + }, + { + "epoch": 0.5969655395044791, + "grad_norm": 0.6863983869552612, + "learning_rate": 0.00014066885690168726, + "loss": 2.7059, + "step": 7397 + }, + { + "epoch": 0.597046243241062, + "grad_norm": 0.7782251834869385, + "learning_rate": 0.0001406544340083461, + "loss": 2.6232, + "step": 7398 + }, + { + "epoch": 0.597126946977645, + "grad_norm": 0.6944136619567871, + "learning_rate": 0.00014064001010179013, + "loss": 2.6134, + "step": 7399 + }, + { + "epoch": 0.5972076507142281, + "grad_norm": 0.7629704475402832, + "learning_rate": 0.00014062558518237892, + "loss": 2.5358, + "step": 7400 + }, + { + "epoch": 0.5972883544508111, + "grad_norm": 0.6922330260276794, + "learning_rate": 0.0001406111592504719, + "loss": 2.5457, + "step": 7401 + }, + { + "epoch": 0.597369058187394, + "grad_norm": 0.6992952227592468, + "learning_rate": 0.00014059673230642865, + "loss": 2.6241, + "step": 7402 + }, + { + "epoch": 0.597449761923977, + "grad_norm": 0.6587642431259155, + "learning_rate": 0.0001405823043506087, + "loss": 2.5867, + "step": 7403 + }, + { + "epoch": 0.5975304656605601, + "grad_norm": 0.6993013024330139, + "learning_rate": 0.00014056787538337164, + "loss": 2.6194, + "step": 7404 + }, + { + "epoch": 0.5976111693971431, + "grad_norm": 0.7605414986610413, + "learning_rate": 0.0001405534454050771, + "loss": 2.607, + "step": 7405 + }, + { + "epoch": 0.5976918731337261, + "grad_norm": 0.6624562740325928, + "learning_rate": 0.00014053901441608466, + "loss": 2.5962, + "step": 7406 + }, + { + "epoch": 0.597772576870309, + "grad_norm": 0.7432621717453003, + "learning_rate": 0.000140524582416754, + "loss": 2.6434, + 
"step": 7407 + }, + { + "epoch": 0.5978532806068921, + "grad_norm": 0.7184053659439087, + "learning_rate": 0.00014051014940744488, + "loss": 2.6139, + "step": 7408 + }, + { + "epoch": 0.5979339843434751, + "grad_norm": 0.7567455768585205, + "learning_rate": 0.00014049571538851687, + "loss": 2.5788, + "step": 7409 + }, + { + "epoch": 0.5980146880800581, + "grad_norm": 0.6759883761405945, + "learning_rate": 0.00014048128036032984, + "loss": 2.5584, + "step": 7410 + }, + { + "epoch": 0.5980953918166411, + "grad_norm": 0.7607424855232239, + "learning_rate": 0.00014046684432324343, + "loss": 2.5675, + "step": 7411 + }, + { + "epoch": 0.5981760955532242, + "grad_norm": 0.7134036421775818, + "learning_rate": 0.00014045240727761748, + "loss": 2.6805, + "step": 7412 + }, + { + "epoch": 0.5982567992898071, + "grad_norm": 0.6996984481811523, + "learning_rate": 0.00014043796922381184, + "loss": 2.5874, + "step": 7413 + }, + { + "epoch": 0.5983375030263901, + "grad_norm": 0.7098252177238464, + "learning_rate": 0.00014042353016218627, + "loss": 2.5895, + "step": 7414 + }, + { + "epoch": 0.5984182067629731, + "grad_norm": 0.7160520553588867, + "learning_rate": 0.00014040909009310068, + "loss": 2.6042, + "step": 7415 + }, + { + "epoch": 0.5984989104995562, + "grad_norm": 0.6727281212806702, + "learning_rate": 0.00014039464901691493, + "loss": 2.5356, + "step": 7416 + }, + { + "epoch": 0.5985796142361391, + "grad_norm": 0.7052881717681885, + "learning_rate": 0.00014038020693398891, + "loss": 2.6093, + "step": 7417 + }, + { + "epoch": 0.5986603179727221, + "grad_norm": 0.7151781916618347, + "learning_rate": 0.00014036576384468262, + "loss": 2.5776, + "step": 7418 + }, + { + "epoch": 0.5987410217093051, + "grad_norm": 0.7376574873924255, + "learning_rate": 0.0001403513197493559, + "loss": 2.6246, + "step": 7419 + }, + { + "epoch": 0.5988217254458882, + "grad_norm": 0.6882135272026062, + "learning_rate": 0.00014033687464836892, + "loss": 2.6028, + "step": 7420 + }, + { + "epoch": 
0.5989024291824712, + "grad_norm": 0.6603999137878418, + "learning_rate": 0.00014032242854208153, + "loss": 2.5897, + "step": 7421 + }, + { + "epoch": 0.5989831329190541, + "grad_norm": 0.7001559734344482, + "learning_rate": 0.0001403079814308538, + "loss": 2.6033, + "step": 7422 + }, + { + "epoch": 0.5990638366556371, + "grad_norm": 0.7184363603591919, + "learning_rate": 0.00014029353331504582, + "loss": 2.7464, + "step": 7423 + }, + { + "epoch": 0.5991445403922202, + "grad_norm": 0.6794769167900085, + "learning_rate": 0.00014027908419501767, + "loss": 2.569, + "step": 7424 + }, + { + "epoch": 0.5992252441288032, + "grad_norm": 0.6846041083335876, + "learning_rate": 0.00014026463407112942, + "loss": 2.5995, + "step": 7425 + }, + { + "epoch": 0.5993059478653862, + "grad_norm": 0.6539658308029175, + "learning_rate": 0.00014025018294374129, + "loss": 2.5749, + "step": 7426 + }, + { + "epoch": 0.5993866516019691, + "grad_norm": 0.6572301983833313, + "learning_rate": 0.00014023573081321336, + "loss": 2.5312, + "step": 7427 + }, + { + "epoch": 0.5994673553385522, + "grad_norm": 0.7010765671730042, + "learning_rate": 0.00014022127767990581, + "loss": 2.5088, + "step": 7428 + }, + { + "epoch": 0.5995480590751352, + "grad_norm": 0.7193396091461182, + "learning_rate": 0.0001402068235441789, + "loss": 2.6193, + "step": 7429 + }, + { + "epoch": 0.5996287628117182, + "grad_norm": 0.6928533315658569, + "learning_rate": 0.00014019236840639288, + "loss": 2.6149, + "step": 7430 + }, + { + "epoch": 0.5997094665483012, + "grad_norm": 0.743658185005188, + "learning_rate": 0.00014017791226690794, + "loss": 2.5466, + "step": 7431 + }, + { + "epoch": 0.5997901702848842, + "grad_norm": 0.752082347869873, + "learning_rate": 0.0001401634551260844, + "loss": 2.6605, + "step": 7432 + }, + { + "epoch": 0.5998708740214672, + "grad_norm": 0.7280415296554565, + "learning_rate": 0.00014014899698428255, + "loss": 2.6128, + "step": 7433 + }, + { + "epoch": 0.5999515777580502, + "grad_norm": 
0.7037710547447205, + "learning_rate": 0.0001401345378418628, + "loss": 2.6157, + "step": 7434 + }, + { + "epoch": 0.6000322814946332, + "grad_norm": 0.6984395980834961, + "learning_rate": 0.00014012007769918542, + "loss": 2.5579, + "step": 7435 + }, + { + "epoch": 0.6001129852312163, + "grad_norm": 0.6853601336479187, + "learning_rate": 0.00014010561655661085, + "loss": 2.6316, + "step": 7436 + }, + { + "epoch": 0.6001936889677992, + "grad_norm": 0.7551750540733337, + "learning_rate": 0.00014009115441449948, + "loss": 2.6671, + "step": 7437 + }, + { + "epoch": 0.6002743927043822, + "grad_norm": 0.7680155038833618, + "learning_rate": 0.0001400766912732117, + "loss": 2.6301, + "step": 7438 + }, + { + "epoch": 0.6003550964409652, + "grad_norm": 0.6757175922393799, + "learning_rate": 0.00014006222713310807, + "loss": 2.5584, + "step": 7439 + }, + { + "epoch": 0.6004358001775483, + "grad_norm": 0.6636163592338562, + "learning_rate": 0.00014004776199454897, + "loss": 2.5437, + "step": 7440 + }, + { + "epoch": 0.6005165039141312, + "grad_norm": 0.7317774891853333, + "learning_rate": 0.00014003329585789498, + "loss": 2.594, + "step": 7441 + }, + { + "epoch": 0.6005972076507142, + "grad_norm": 0.6903451681137085, + "learning_rate": 0.0001400188287235066, + "loss": 2.6175, + "step": 7442 + }, + { + "epoch": 0.6006779113872972, + "grad_norm": 0.7137858867645264, + "learning_rate": 0.00014000436059174437, + "loss": 2.6411, + "step": 7443 + }, + { + "epoch": 0.6007586151238803, + "grad_norm": 0.7124149203300476, + "learning_rate": 0.00013998989146296893, + "loss": 2.6562, + "step": 7444 + }, + { + "epoch": 0.6008393188604633, + "grad_norm": 0.7518175840377808, + "learning_rate": 0.00013997542133754087, + "loss": 2.6213, + "step": 7445 + }, + { + "epoch": 0.6009200225970462, + "grad_norm": 0.6843053698539734, + "learning_rate": 0.0001399609502158208, + "loss": 2.6099, + "step": 7446 + }, + { + "epoch": 0.6010007263336292, + "grad_norm": 0.6668025255203247, + "learning_rate": 
0.0001399464780981694, + "loss": 2.609, + "step": 7447 + }, + { + "epoch": 0.6010814300702122, + "grad_norm": 0.6849119067192078, + "learning_rate": 0.00013993200498494735, + "loss": 2.6097, + "step": 7448 + }, + { + "epoch": 0.6011621338067953, + "grad_norm": 0.7767381072044373, + "learning_rate": 0.0001399175308765153, + "loss": 2.6351, + "step": 7449 + }, + { + "epoch": 0.6012428375433783, + "grad_norm": 0.6630256772041321, + "learning_rate": 0.0001399030557732341, + "loss": 2.5924, + "step": 7450 + }, + { + "epoch": 0.6013235412799612, + "grad_norm": 0.6918755769729614, + "learning_rate": 0.00013988857967546444, + "loss": 2.6205, + "step": 7451 + }, + { + "epoch": 0.6014042450165442, + "grad_norm": 0.7179181575775146, + "learning_rate": 0.00013987410258356708, + "loss": 2.5971, + "step": 7452 + }, + { + "epoch": 0.6014849487531273, + "grad_norm": 0.7233672738075256, + "learning_rate": 0.00013985962449790284, + "loss": 2.595, + "step": 7453 + }, + { + "epoch": 0.6015656524897103, + "grad_norm": 0.6861593127250671, + "learning_rate": 0.0001398451454188326, + "loss": 2.6127, + "step": 7454 + }, + { + "epoch": 0.6016463562262933, + "grad_norm": 0.6818981170654297, + "learning_rate": 0.00013983066534671714, + "loss": 2.5923, + "step": 7455 + }, + { + "epoch": 0.6017270599628762, + "grad_norm": 0.700036346912384, + "learning_rate": 0.0001398161842819174, + "loss": 2.5474, + "step": 7456 + }, + { + "epoch": 0.6018077636994593, + "grad_norm": 0.6884824633598328, + "learning_rate": 0.00013980170222479426, + "loss": 2.6041, + "step": 7457 + }, + { + "epoch": 0.6018884674360423, + "grad_norm": 0.6745120286941528, + "learning_rate": 0.00013978721917570866, + "loss": 2.6638, + "step": 7458 + }, + { + "epoch": 0.6019691711726253, + "grad_norm": 0.6886256337165833, + "learning_rate": 0.00013977273513502157, + "loss": 2.5733, + "step": 7459 + }, + { + "epoch": 0.6020498749092082, + "grad_norm": 0.7220930457115173, + "learning_rate": 0.00013975825010309394, + "loss": 2.5739, + 
"step": 7460 + }, + { + "epoch": 0.6021305786457913, + "grad_norm": 0.7281780242919922, + "learning_rate": 0.0001397437640802868, + "loss": 2.5646, + "step": 7461 + }, + { + "epoch": 0.6022112823823743, + "grad_norm": 0.7316896915435791, + "learning_rate": 0.00013972927706696115, + "loss": 2.6532, + "step": 7462 + }, + { + "epoch": 0.6022919861189573, + "grad_norm": 0.6288646459579468, + "learning_rate": 0.00013971478906347806, + "loss": 2.5753, + "step": 7463 + }, + { + "epoch": 0.6023726898555403, + "grad_norm": 0.7110145688056946, + "learning_rate": 0.00013970030007019862, + "loss": 2.6421, + "step": 7464 + }, + { + "epoch": 0.6024533935921234, + "grad_norm": 0.7437754273414612, + "learning_rate": 0.00013968581008748393, + "loss": 2.585, + "step": 7465 + }, + { + "epoch": 0.6025340973287063, + "grad_norm": 0.6839718222618103, + "learning_rate": 0.00013967131911569514, + "loss": 2.6249, + "step": 7466 + }, + { + "epoch": 0.6026148010652893, + "grad_norm": 0.7358397841453552, + "learning_rate": 0.00013965682715519332, + "loss": 2.597, + "step": 7467 + }, + { + "epoch": 0.6026955048018723, + "grad_norm": 0.673651397228241, + "learning_rate": 0.00013964233420633973, + "loss": 2.6111, + "step": 7468 + }, + { + "epoch": 0.6027762085384554, + "grad_norm": 0.7390083074569702, + "learning_rate": 0.00013962784026949553, + "loss": 2.6131, + "step": 7469 + }, + { + "epoch": 0.6028569122750383, + "grad_norm": 0.6902220249176025, + "learning_rate": 0.00013961334534502197, + "loss": 2.6116, + "step": 7470 + }, + { + "epoch": 0.6029376160116213, + "grad_norm": 0.6946651935577393, + "learning_rate": 0.00013959884943328033, + "loss": 2.6307, + "step": 7471 + }, + { + "epoch": 0.6030183197482043, + "grad_norm": 0.7277294993400574, + "learning_rate": 0.00013958435253463183, + "loss": 2.6065, + "step": 7472 + }, + { + "epoch": 0.6030990234847874, + "grad_norm": 0.743833601474762, + "learning_rate": 0.00013956985464943776, + "loss": 2.6644, + "step": 7473 + }, + { + "epoch": 
0.6031797272213704, + "grad_norm": 0.6480288505554199, + "learning_rate": 0.0001395553557780595, + "loss": 2.5386, + "step": 7474 + }, + { + "epoch": 0.6032604309579533, + "grad_norm": 0.799443781375885, + "learning_rate": 0.00013954085592085834, + "loss": 2.5653, + "step": 7475 + }, + { + "epoch": 0.6033411346945363, + "grad_norm": 0.6790705323219299, + "learning_rate": 0.00013952635507819575, + "loss": 2.6229, + "step": 7476 + }, + { + "epoch": 0.6034218384311194, + "grad_norm": 0.6871588826179504, + "learning_rate": 0.00013951185325043302, + "loss": 2.6514, + "step": 7477 + }, + { + "epoch": 0.6035025421677024, + "grad_norm": 0.7236921787261963, + "learning_rate": 0.00013949735043793164, + "loss": 2.5931, + "step": 7478 + }, + { + "epoch": 0.6035832459042854, + "grad_norm": 0.6888518929481506, + "learning_rate": 0.00013948284664105305, + "loss": 2.6408, + "step": 7479 + }, + { + "epoch": 0.6036639496408683, + "grad_norm": 0.7292625904083252, + "learning_rate": 0.00013946834186015868, + "loss": 2.5829, + "step": 7480 + }, + { + "epoch": 0.6037446533774514, + "grad_norm": 0.6755293607711792, + "learning_rate": 0.00013945383609561009, + "loss": 2.5917, + "step": 7481 + }, + { + "epoch": 0.6038253571140344, + "grad_norm": 0.6808032989501953, + "learning_rate": 0.00013943932934776877, + "loss": 2.6103, + "step": 7482 + }, + { + "epoch": 0.6039060608506174, + "grad_norm": 0.747173547744751, + "learning_rate": 0.00013942482161699625, + "loss": 2.624, + "step": 7483 + }, + { + "epoch": 0.6039867645872004, + "grad_norm": 0.7265594005584717, + "learning_rate": 0.00013941031290365413, + "loss": 2.5672, + "step": 7484 + }, + { + "epoch": 0.6040674683237834, + "grad_norm": 0.6434060335159302, + "learning_rate": 0.000139395803208104, + "loss": 2.5885, + "step": 7485 + }, + { + "epoch": 0.6041481720603664, + "grad_norm": 0.7148730754852295, + "learning_rate": 0.00013938129253070747, + "loss": 2.6466, + "step": 7486 + }, + { + "epoch": 0.6042288757969494, + "grad_norm": 
0.7724708318710327, + "learning_rate": 0.00013936678087182616, + "loss": 2.6364, + "step": 7487 + }, + { + "epoch": 0.6043095795335324, + "grad_norm": 0.6886702179908752, + "learning_rate": 0.0001393522682318218, + "loss": 2.5844, + "step": 7488 + }, + { + "epoch": 0.6043902832701155, + "grad_norm": 0.6501082181930542, + "learning_rate": 0.00013933775461105603, + "loss": 2.5767, + "step": 7489 + }, + { + "epoch": 0.6044709870066984, + "grad_norm": 0.7333959341049194, + "learning_rate": 0.00013932324000989058, + "loss": 2.5735, + "step": 7490 + }, + { + "epoch": 0.6045516907432814, + "grad_norm": 0.7057361602783203, + "learning_rate": 0.00013930872442868722, + "loss": 2.627, + "step": 7491 + }, + { + "epoch": 0.6046323944798644, + "grad_norm": 0.705078661441803, + "learning_rate": 0.00013929420786780767, + "loss": 2.6012, + "step": 7492 + }, + { + "epoch": 0.6047130982164475, + "grad_norm": 0.7192156314849854, + "learning_rate": 0.00013927969032761378, + "loss": 2.5594, + "step": 7493 + }, + { + "epoch": 0.6047938019530305, + "grad_norm": 0.703116774559021, + "learning_rate": 0.00013926517180846726, + "loss": 2.6099, + "step": 7494 + }, + { + "epoch": 0.6048745056896134, + "grad_norm": 0.6970264315605164, + "learning_rate": 0.00013925065231073006, + "loss": 2.5832, + "step": 7495 + }, + { + "epoch": 0.6049552094261964, + "grad_norm": 0.7308031320571899, + "learning_rate": 0.00013923613183476402, + "loss": 2.586, + "step": 7496 + }, + { + "epoch": 0.6050359131627794, + "grad_norm": 0.7212777137756348, + "learning_rate": 0.00013922161038093097, + "loss": 2.6374, + "step": 7497 + }, + { + "epoch": 0.6051166168993625, + "grad_norm": 0.6644641757011414, + "learning_rate": 0.0001392070879495929, + "loss": 2.5226, + "step": 7498 + }, + { + "epoch": 0.6051973206359454, + "grad_norm": 0.6683016419410706, + "learning_rate": 0.0001391925645411117, + "loss": 2.5279, + "step": 7499 + }, + { + "epoch": 0.6052780243725284, + "grad_norm": 0.7341439127922058, + "learning_rate": 
0.00013917804015584932, + "loss": 2.5995, + "step": 7500 + }, + { + "epoch": 0.6053587281091114, + "grad_norm": 0.753942608833313, + "learning_rate": 0.0001391635147941678, + "loss": 2.5706, + "step": 7501 + }, + { + "epoch": 0.6054394318456945, + "grad_norm": 0.7541958093643188, + "learning_rate": 0.00013914898845642908, + "loss": 2.6365, + "step": 7502 + }, + { + "epoch": 0.6055201355822775, + "grad_norm": 0.6583349108695984, + "learning_rate": 0.00013913446114299528, + "loss": 2.534, + "step": 7503 + }, + { + "epoch": 0.6056008393188604, + "grad_norm": 0.6545756459236145, + "learning_rate": 0.00013911993285422835, + "loss": 2.5443, + "step": 7504 + }, + { + "epoch": 0.6056815430554434, + "grad_norm": 0.8290210366249084, + "learning_rate": 0.00013910540359049045, + "loss": 2.6196, + "step": 7505 + }, + { + "epoch": 0.6057622467920265, + "grad_norm": 0.7032577395439148, + "learning_rate": 0.0001390908733521437, + "loss": 2.6575, + "step": 7506 + }, + { + "epoch": 0.6058429505286095, + "grad_norm": 0.7018071413040161, + "learning_rate": 0.0001390763421395502, + "loss": 2.6272, + "step": 7507 + }, + { + "epoch": 0.6059236542651925, + "grad_norm": 0.6288552284240723, + "learning_rate": 0.00013906180995307206, + "loss": 2.5295, + "step": 7508 + }, + { + "epoch": 0.6060043580017754, + "grad_norm": 0.7013774514198303, + "learning_rate": 0.00013904727679307153, + "loss": 2.5669, + "step": 7509 + }, + { + "epoch": 0.6060850617383585, + "grad_norm": 0.6811630129814148, + "learning_rate": 0.00013903274265991082, + "loss": 2.5827, + "step": 7510 + }, + { + "epoch": 0.6061657654749415, + "grad_norm": 0.6690269112586975, + "learning_rate": 0.0001390182075539521, + "loss": 2.5947, + "step": 7511 + }, + { + "epoch": 0.6062464692115245, + "grad_norm": 0.6946289539337158, + "learning_rate": 0.00013900367147555768, + "loss": 2.59, + "step": 7512 + }, + { + "epoch": 0.6063271729481075, + "grad_norm": 0.7302843332290649, + "learning_rate": 0.0001389891344250898, + "loss": 2.5994, + 
"step": 7513 + }, + { + "epoch": 0.6064078766846905, + "grad_norm": 0.7462306022644043, + "learning_rate": 0.00013897459640291074, + "loss": 2.5983, + "step": 7514 + }, + { + "epoch": 0.6064885804212735, + "grad_norm": 0.6948123574256897, + "learning_rate": 0.0001389600574093829, + "loss": 2.5737, + "step": 7515 + }, + { + "epoch": 0.6065692841578565, + "grad_norm": 0.6897372007369995, + "learning_rate": 0.00013894551744486857, + "loss": 2.607, + "step": 7516 + }, + { + "epoch": 0.6066499878944395, + "grad_norm": 0.6808069348335266, + "learning_rate": 0.00013893097650973015, + "loss": 2.5712, + "step": 7517 + }, + { + "epoch": 0.6067306916310226, + "grad_norm": 0.7000731229782104, + "learning_rate": 0.00013891643460433, + "loss": 2.5654, + "step": 7518 + }, + { + "epoch": 0.6068113953676055, + "grad_norm": 0.7197545766830444, + "learning_rate": 0.0001389018917290306, + "loss": 2.5705, + "step": 7519 + }, + { + "epoch": 0.6068920991041885, + "grad_norm": 0.7001069188117981, + "learning_rate": 0.00013888734788419433, + "loss": 2.5934, + "step": 7520 + }, + { + "epoch": 0.6069728028407715, + "grad_norm": 0.7480459213256836, + "learning_rate": 0.00013887280307018377, + "loss": 2.5211, + "step": 7521 + }, + { + "epoch": 0.6070535065773546, + "grad_norm": 0.6913945078849792, + "learning_rate": 0.00013885825728736132, + "loss": 2.6013, + "step": 7522 + }, + { + "epoch": 0.6071342103139376, + "grad_norm": 0.6527336239814758, + "learning_rate": 0.00013884371053608948, + "loss": 2.5901, + "step": 7523 + }, + { + "epoch": 0.6072149140505205, + "grad_norm": 0.6897335052490234, + "learning_rate": 0.00013882916281673086, + "loss": 2.5389, + "step": 7524 + }, + { + "epoch": 0.6072956177871035, + "grad_norm": 0.7159501910209656, + "learning_rate": 0.00013881461412964798, + "loss": 2.5399, + "step": 7525 + }, + { + "epoch": 0.6073763215236866, + "grad_norm": 0.6744364500045776, + "learning_rate": 0.00013880006447520346, + "loss": 2.5658, + "step": 7526 + }, + { + "epoch": 
0.6074570252602696, + "grad_norm": 0.819950520992279, + "learning_rate": 0.00013878551385375994, + "loss": 2.6143, + "step": 7527 + }, + { + "epoch": 0.6075377289968525, + "grad_norm": 0.744293212890625, + "learning_rate": 0.00013877096226568, + "loss": 2.6565, + "step": 7528 + }, + { + "epoch": 0.6076184327334355, + "grad_norm": 0.7121254205703735, + "learning_rate": 0.00013875640971132636, + "loss": 2.6151, + "step": 7529 + }, + { + "epoch": 0.6076991364700186, + "grad_norm": 0.7616204023361206, + "learning_rate": 0.00013874185619106163, + "loss": 2.6395, + "step": 7530 + }, + { + "epoch": 0.6077798402066016, + "grad_norm": 0.7481076121330261, + "learning_rate": 0.0001387273017052486, + "loss": 2.597, + "step": 7531 + }, + { + "epoch": 0.6078605439431846, + "grad_norm": 0.6660816073417664, + "learning_rate": 0.00013871274625425, + "loss": 2.5696, + "step": 7532 + }, + { + "epoch": 0.6079412476797675, + "grad_norm": 0.7491411566734314, + "learning_rate": 0.00013869818983842854, + "loss": 2.552, + "step": 7533 + }, + { + "epoch": 0.6080219514163506, + "grad_norm": 0.7130792140960693, + "learning_rate": 0.00013868363245814704, + "loss": 2.5959, + "step": 7534 + }, + { + "epoch": 0.6081026551529336, + "grad_norm": 0.7157341241836548, + "learning_rate": 0.00013866907411376827, + "loss": 2.5598, + "step": 7535 + }, + { + "epoch": 0.6081833588895166, + "grad_norm": 0.7750656008720398, + "learning_rate": 0.00013865451480565513, + "loss": 2.6217, + "step": 7536 + }, + { + "epoch": 0.6082640626260996, + "grad_norm": 0.6915080547332764, + "learning_rate": 0.00013863995453417043, + "loss": 2.6211, + "step": 7537 + }, + { + "epoch": 0.6083447663626826, + "grad_norm": 0.7245940566062927, + "learning_rate": 0.00013862539329967706, + "loss": 2.5619, + "step": 7538 + }, + { + "epoch": 0.6084254700992656, + "grad_norm": 0.8884119391441345, + "learning_rate": 0.0001386108311025379, + "loss": 2.6349, + "step": 7539 + }, + { + "epoch": 0.6085061738358486, + "grad_norm": 
0.7889477610588074, + "learning_rate": 0.0001385962679431159, + "loss": 2.6169, + "step": 7540 + }, + { + "epoch": 0.6085868775724316, + "grad_norm": 0.7187505960464478, + "learning_rate": 0.00013858170382177403, + "loss": 2.5582, + "step": 7541 + }, + { + "epoch": 0.6086675813090147, + "grad_norm": 0.7502198219299316, + "learning_rate": 0.00013856713873887526, + "loss": 2.5418, + "step": 7542 + }, + { + "epoch": 0.6087482850455976, + "grad_norm": 0.797704815864563, + "learning_rate": 0.00013855257269478256, + "loss": 2.5764, + "step": 7543 + }, + { + "epoch": 0.6088289887821806, + "grad_norm": 0.7651431560516357, + "learning_rate": 0.00013853800568985896, + "loss": 2.5995, + "step": 7544 + }, + { + "epoch": 0.6089096925187636, + "grad_norm": 0.7048482298851013, + "learning_rate": 0.00013852343772446753, + "loss": 2.5656, + "step": 7545 + }, + { + "epoch": 0.6089903962553467, + "grad_norm": 0.7252251505851746, + "learning_rate": 0.00013850886879897135, + "loss": 2.6509, + "step": 7546 + }, + { + "epoch": 0.6090710999919297, + "grad_norm": 0.7220067381858826, + "learning_rate": 0.00013849429891373344, + "loss": 2.5558, + "step": 7547 + }, + { + "epoch": 0.6091518037285126, + "grad_norm": 0.7672600746154785, + "learning_rate": 0.000138479728069117, + "loss": 2.5682, + "step": 7548 + }, + { + "epoch": 0.6092325074650956, + "grad_norm": 0.7753601670265198, + "learning_rate": 0.0001384651562654852, + "loss": 2.6459, + "step": 7549 + }, + { + "epoch": 0.6093132112016786, + "grad_norm": 0.7346559166908264, + "learning_rate": 0.00013845058350320108, + "loss": 2.5988, + "step": 7550 + }, + { + "epoch": 0.6093939149382617, + "grad_norm": 0.7386072874069214, + "learning_rate": 0.00013843600978262797, + "loss": 2.6366, + "step": 7551 + }, + { + "epoch": 0.6094746186748446, + "grad_norm": 0.7114188075065613, + "learning_rate": 0.00013842143510412898, + "loss": 2.5515, + "step": 7552 + }, + { + "epoch": 0.6095553224114276, + "grad_norm": 0.6836373209953308, + "learning_rate": 
0.00013840685946806742, + "loss": 2.6301, + "step": 7553 + }, + { + "epoch": 0.6096360261480106, + "grad_norm": 0.7548927068710327, + "learning_rate": 0.00013839228287480652, + "loss": 2.6508, + "step": 7554 + }, + { + "epoch": 0.6097167298845937, + "grad_norm": 0.6931679248809814, + "learning_rate": 0.00013837770532470957, + "loss": 2.5535, + "step": 7555 + }, + { + "epoch": 0.6097974336211767, + "grad_norm": 0.7621145248413086, + "learning_rate": 0.00013836312681813988, + "loss": 2.6831, + "step": 7556 + }, + { + "epoch": 0.6098781373577596, + "grad_norm": 0.6735427975654602, + "learning_rate": 0.00013834854735546079, + "loss": 2.5338, + "step": 7557 + }, + { + "epoch": 0.6099588410943426, + "grad_norm": 0.7157600522041321, + "learning_rate": 0.00013833396693703565, + "loss": 2.5713, + "step": 7558 + }, + { + "epoch": 0.6100395448309257, + "grad_norm": 0.718032956123352, + "learning_rate": 0.00013831938556322789, + "loss": 2.5625, + "step": 7559 + }, + { + "epoch": 0.6101202485675087, + "grad_norm": 0.7290309071540833, + "learning_rate": 0.0001383048032344008, + "loss": 2.5956, + "step": 7560 + }, + { + "epoch": 0.6102009523040917, + "grad_norm": 0.675470769405365, + "learning_rate": 0.00013829021995091792, + "loss": 2.6053, + "step": 7561 + }, + { + "epoch": 0.6102816560406746, + "grad_norm": 0.7348767518997192, + "learning_rate": 0.00013827563571314268, + "loss": 2.6174, + "step": 7562 + }, + { + "epoch": 0.6103623597772577, + "grad_norm": 0.64495849609375, + "learning_rate": 0.00013826105052143852, + "loss": 2.5923, + "step": 7563 + }, + { + "epoch": 0.6104430635138407, + "grad_norm": 0.7379264235496521, + "learning_rate": 0.000138246464376169, + "loss": 2.6438, + "step": 7564 + }, + { + "epoch": 0.6105237672504237, + "grad_norm": 0.7802134156227112, + "learning_rate": 0.00013823187727769756, + "loss": 2.5884, + "step": 7565 + }, + { + "epoch": 0.6106044709870067, + "grad_norm": 0.6907222867012024, + "learning_rate": 0.00013821728922638782, + "loss": 2.596, + 
"step": 7566 + }, + { + "epoch": 0.6106851747235897, + "grad_norm": 0.6924182176589966, + "learning_rate": 0.00013820270022260335, + "loss": 2.5631, + "step": 7567 + }, + { + "epoch": 0.6107658784601727, + "grad_norm": 0.729258120059967, + "learning_rate": 0.0001381881102667077, + "loss": 2.5761, + "step": 7568 + }, + { + "epoch": 0.6108465821967557, + "grad_norm": 0.7141425013542175, + "learning_rate": 0.00013817351935906455, + "loss": 2.6214, + "step": 7569 + }, + { + "epoch": 0.6109272859333387, + "grad_norm": 0.7564505338668823, + "learning_rate": 0.00013815892750003748, + "loss": 2.6338, + "step": 7570 + }, + { + "epoch": 0.6110079896699218, + "grad_norm": 0.674705982208252, + "learning_rate": 0.00013814433468999022, + "loss": 2.5604, + "step": 7571 + }, + { + "epoch": 0.6110886934065047, + "grad_norm": 0.6956657767295837, + "learning_rate": 0.00013812974092928642, + "loss": 2.5805, + "step": 7572 + }, + { + "epoch": 0.6111693971430877, + "grad_norm": 0.7393823862075806, + "learning_rate": 0.0001381151462182898, + "loss": 2.6312, + "step": 7573 + }, + { + "epoch": 0.6112501008796707, + "grad_norm": 0.7048184275627136, + "learning_rate": 0.00013810055055736407, + "loss": 2.5948, + "step": 7574 + }, + { + "epoch": 0.6113308046162538, + "grad_norm": 0.748798668384552, + "learning_rate": 0.0001380859539468731, + "loss": 2.5815, + "step": 7575 + }, + { + "epoch": 0.6114115083528368, + "grad_norm": 0.7146531343460083, + "learning_rate": 0.00013807135638718048, + "loss": 2.5803, + "step": 7576 + }, + { + "epoch": 0.6114922120894197, + "grad_norm": 0.6883770823478699, + "learning_rate": 0.00013805675787865025, + "loss": 2.6005, + "step": 7577 + }, + { + "epoch": 0.6115729158260027, + "grad_norm": 0.7808375358581543, + "learning_rate": 0.0001380421584216461, + "loss": 2.6539, + "step": 7578 + }, + { + "epoch": 0.6116536195625858, + "grad_norm": 0.6919417977333069, + "learning_rate": 0.00013802755801653192, + "loss": 2.5812, + "step": 7579 + }, + { + "epoch": 
0.6117343232991688, + "grad_norm": 0.6651085615158081, + "learning_rate": 0.0001380129566636716, + "loss": 2.5952, + "step": 7580 + }, + { + "epoch": 0.6118150270357517, + "grad_norm": 0.7806586623191833, + "learning_rate": 0.00013799835436342897, + "loss": 2.6509, + "step": 7581 + }, + { + "epoch": 0.6118957307723347, + "grad_norm": 0.6522969007492065, + "learning_rate": 0.0001379837511161681, + "loss": 2.606, + "step": 7582 + }, + { + "epoch": 0.6119764345089178, + "grad_norm": 0.7566540837287903, + "learning_rate": 0.0001379691469222528, + "loss": 2.6625, + "step": 7583 + }, + { + "epoch": 0.6120571382455008, + "grad_norm": 0.7126421928405762, + "learning_rate": 0.00013795454178204715, + "loss": 2.6396, + "step": 7584 + }, + { + "epoch": 0.6121378419820838, + "grad_norm": 0.6534276008605957, + "learning_rate": 0.0001379399356959151, + "loss": 2.5841, + "step": 7585 + }, + { + "epoch": 0.6122185457186667, + "grad_norm": 0.7663385272026062, + "learning_rate": 0.00013792532866422065, + "loss": 2.6685, + "step": 7586 + }, + { + "epoch": 0.6122992494552498, + "grad_norm": 0.6971656084060669, + "learning_rate": 0.0001379107206873279, + "loss": 2.6036, + "step": 7587 + }, + { + "epoch": 0.6123799531918328, + "grad_norm": 0.6807122230529785, + "learning_rate": 0.00013789611176560088, + "loss": 2.6499, + "step": 7588 + }, + { + "epoch": 0.6124606569284158, + "grad_norm": 0.6712431311607361, + "learning_rate": 0.0001378815018994037, + "loss": 2.6725, + "step": 7589 + }, + { + "epoch": 0.6125413606649988, + "grad_norm": 0.6986604928970337, + "learning_rate": 0.00013786689108910045, + "loss": 2.6159, + "step": 7590 + }, + { + "epoch": 0.6126220644015818, + "grad_norm": 0.7004108428955078, + "learning_rate": 0.0001378522793350553, + "loss": 2.5743, + "step": 7591 + }, + { + "epoch": 0.6127027681381648, + "grad_norm": 0.6782098412513733, + "learning_rate": 0.00013783766663763239, + "loss": 2.5776, + "step": 7592 + }, + { + "epoch": 0.6127834718747478, + "grad_norm": 
0.6697036027908325, + "learning_rate": 0.00013782305299719593, + "loss": 2.6195, + "step": 7593 + }, + { + "epoch": 0.6128641756113308, + "grad_norm": 0.6894395351409912, + "learning_rate": 0.00013780843841411014, + "loss": 2.662, + "step": 7594 + }, + { + "epoch": 0.6129448793479139, + "grad_norm": 0.6775636672973633, + "learning_rate": 0.00013779382288873918, + "loss": 2.6083, + "step": 7595 + }, + { + "epoch": 0.6130255830844968, + "grad_norm": 0.7143577337265015, + "learning_rate": 0.00013777920642144738, + "loss": 2.581, + "step": 7596 + }, + { + "epoch": 0.6131062868210798, + "grad_norm": 0.6143797636032104, + "learning_rate": 0.00013776458901259905, + "loss": 2.541, + "step": 7597 + }, + { + "epoch": 0.6131869905576628, + "grad_norm": 0.7003727555274963, + "learning_rate": 0.00013774997066255839, + "loss": 2.5748, + "step": 7598 + }, + { + "epoch": 0.6132676942942458, + "grad_norm": 0.6796504259109497, + "learning_rate": 0.0001377353513716898, + "loss": 2.596, + "step": 7599 + }, + { + "epoch": 0.6133483980308289, + "grad_norm": 0.7011274695396423, + "learning_rate": 0.00013772073114035762, + "loss": 2.5318, + "step": 7600 + }, + { + "epoch": 0.6134291017674118, + "grad_norm": 0.6584382057189941, + "learning_rate": 0.0001377061099689262, + "loss": 2.5793, + "step": 7601 + }, + { + "epoch": 0.6135098055039948, + "grad_norm": 0.6586211919784546, + "learning_rate": 0.00013769148785775995, + "loss": 2.5969, + "step": 7602 + }, + { + "epoch": 0.6135905092405778, + "grad_norm": 0.7187132835388184, + "learning_rate": 0.0001376768648072233, + "loss": 2.6407, + "step": 7603 + }, + { + "epoch": 0.6136712129771609, + "grad_norm": 0.7394679188728333, + "learning_rate": 0.00013766224081768072, + "loss": 2.5959, + "step": 7604 + }, + { + "epoch": 0.6137519167137439, + "grad_norm": 0.6802375912666321, + "learning_rate": 0.00013764761588949665, + "loss": 2.5956, + "step": 7605 + }, + { + "epoch": 0.6138326204503268, + "grad_norm": 0.6949049234390259, + "learning_rate": 
0.00013763299002303553, + "loss": 2.556, + "step": 7606 + }, + { + "epoch": 0.6139133241869098, + "grad_norm": 0.7406589388847351, + "learning_rate": 0.00013761836321866196, + "loss": 2.5495, + "step": 7607 + }, + { + "epoch": 0.6139940279234929, + "grad_norm": 0.742499053478241, + "learning_rate": 0.0001376037354767404, + "loss": 2.589, + "step": 7608 + }, + { + "epoch": 0.6140747316600759, + "grad_norm": 0.7669157385826111, + "learning_rate": 0.00013758910679763551, + "loss": 2.576, + "step": 7609 + }, + { + "epoch": 0.6141554353966588, + "grad_norm": 0.6506752967834473, + "learning_rate": 0.00013757447718171182, + "loss": 2.5792, + "step": 7610 + }, + { + "epoch": 0.6142361391332418, + "grad_norm": 0.698514461517334, + "learning_rate": 0.00013755984662933393, + "loss": 2.5809, + "step": 7611 + }, + { + "epoch": 0.6143168428698249, + "grad_norm": 0.6541082262992859, + "learning_rate": 0.00013754521514086645, + "loss": 2.5755, + "step": 7612 + }, + { + "epoch": 0.6143975466064079, + "grad_norm": 0.6619362235069275, + "learning_rate": 0.0001375305827166741, + "loss": 2.5886, + "step": 7613 + }, + { + "epoch": 0.6144782503429909, + "grad_norm": 0.7205569744110107, + "learning_rate": 0.00013751594935712148, + "loss": 2.6293, + "step": 7614 + }, + { + "epoch": 0.6145589540795738, + "grad_norm": 0.7382494211196899, + "learning_rate": 0.00013750131506257339, + "loss": 2.6977, + "step": 7615 + }, + { + "epoch": 0.6146396578161569, + "grad_norm": 0.7492627501487732, + "learning_rate": 0.00013748667983339444, + "loss": 2.6492, + "step": 7616 + }, + { + "epoch": 0.6147203615527399, + "grad_norm": 0.6627328991889954, + "learning_rate": 0.00013747204366994947, + "loss": 2.5458, + "step": 7617 + }, + { + "epoch": 0.6148010652893229, + "grad_norm": 0.7039626836776733, + "learning_rate": 0.00013745740657260323, + "loss": 2.6578, + "step": 7618 + }, + { + "epoch": 0.6148817690259059, + "grad_norm": 0.6999295353889465, + "learning_rate": 0.00013744276854172046, + "loss": 2.6189, + 
"step": 7619 + }, + { + "epoch": 0.6149624727624889, + "grad_norm": 0.7604365348815918, + "learning_rate": 0.00013742812957766607, + "loss": 2.5344, + "step": 7620 + }, + { + "epoch": 0.6150431764990719, + "grad_norm": 0.6860831379890442, + "learning_rate": 0.0001374134896808048, + "loss": 2.6309, + "step": 7621 + }, + { + "epoch": 0.6151238802356549, + "grad_norm": 0.6628854274749756, + "learning_rate": 0.0001373988488515016, + "loss": 2.6339, + "step": 7622 + }, + { + "epoch": 0.6152045839722379, + "grad_norm": 0.7112562656402588, + "learning_rate": 0.00013738420709012134, + "loss": 2.6064, + "step": 7623 + }, + { + "epoch": 0.615285287708821, + "grad_norm": 0.7068392634391785, + "learning_rate": 0.0001373695643970289, + "loss": 2.624, + "step": 7624 + }, + { + "epoch": 0.6153659914454039, + "grad_norm": 0.6534786224365234, + "learning_rate": 0.00013735492077258924, + "loss": 2.5582, + "step": 7625 + }, + { + "epoch": 0.6154466951819869, + "grad_norm": 0.7433418035507202, + "learning_rate": 0.00013734027621716729, + "loss": 2.5803, + "step": 7626 + }, + { + "epoch": 0.6155273989185699, + "grad_norm": 0.7172532081604004, + "learning_rate": 0.00013732563073112804, + "loss": 2.5906, + "step": 7627 + }, + { + "epoch": 0.615608102655153, + "grad_norm": 0.6712297201156616, + "learning_rate": 0.00013731098431483653, + "loss": 2.5597, + "step": 7628 + }, + { + "epoch": 0.615688806391736, + "grad_norm": 0.7079061269760132, + "learning_rate": 0.00013729633696865775, + "loss": 2.5538, + "step": 7629 + }, + { + "epoch": 0.6157695101283189, + "grad_norm": 0.6968971490859985, + "learning_rate": 0.00013728168869295678, + "loss": 2.6429, + "step": 7630 + }, + { + "epoch": 0.6158502138649019, + "grad_norm": 0.7123236060142517, + "learning_rate": 0.00013726703948809864, + "loss": 2.5607, + "step": 7631 + }, + { + "epoch": 0.615930917601485, + "grad_norm": 0.6441208124160767, + "learning_rate": 0.00013725238935444843, + "loss": 2.6176, + "step": 7632 + }, + { + "epoch": 
0.616011621338068, + "grad_norm": 0.7145917415618896, + "learning_rate": 0.00013723773829237137, + "loss": 2.5698, + "step": 7633 + }, + { + "epoch": 0.616092325074651, + "grad_norm": 0.6397334337234497, + "learning_rate": 0.00013722308630223252, + "loss": 2.596, + "step": 7634 + }, + { + "epoch": 0.6161730288112339, + "grad_norm": 0.6372843980789185, + "learning_rate": 0.00013720843338439702, + "loss": 2.5679, + "step": 7635 + }, + { + "epoch": 0.616253732547817, + "grad_norm": 0.707842230796814, + "learning_rate": 0.00013719377953923012, + "loss": 2.6296, + "step": 7636 + }, + { + "epoch": 0.6163344362844, + "grad_norm": 0.6629409193992615, + "learning_rate": 0.000137179124767097, + "loss": 2.542, + "step": 7637 + }, + { + "epoch": 0.616415140020983, + "grad_norm": 0.753646194934845, + "learning_rate": 0.00013716446906836288, + "loss": 2.5741, + "step": 7638 + }, + { + "epoch": 0.6164958437575659, + "grad_norm": 0.6409948468208313, + "learning_rate": 0.0001371498124433931, + "loss": 2.6723, + "step": 7639 + }, + { + "epoch": 0.616576547494149, + "grad_norm": 0.6489264965057373, + "learning_rate": 0.0001371351548925528, + "loss": 2.5806, + "step": 7640 + }, + { + "epoch": 0.616657251230732, + "grad_norm": 0.6857934594154358, + "learning_rate": 0.00013712049641620745, + "loss": 2.6406, + "step": 7641 + }, + { + "epoch": 0.616737954967315, + "grad_norm": 0.6754183769226074, + "learning_rate": 0.00013710583701472226, + "loss": 2.5576, + "step": 7642 + }, + { + "epoch": 0.616818658703898, + "grad_norm": 0.7083800435066223, + "learning_rate": 0.0001370911766884626, + "loss": 2.5747, + "step": 7643 + }, + { + "epoch": 0.616899362440481, + "grad_norm": 0.7281948924064636, + "learning_rate": 0.0001370765154377939, + "loss": 2.5627, + "step": 7644 + }, + { + "epoch": 0.616980066177064, + "grad_norm": 0.655414342880249, + "learning_rate": 0.00013706185326308148, + "loss": 2.5897, + "step": 7645 + }, + { + "epoch": 0.617060769913647, + "grad_norm": 0.6771859526634216, + 
"learning_rate": 0.0001370471901646908, + "loss": 2.5761, + "step": 7646 + }, + { + "epoch": 0.61714147365023, + "grad_norm": 0.6813557147979736, + "learning_rate": 0.00013703252614298732, + "loss": 2.5807, + "step": 7647 + }, + { + "epoch": 0.6172221773868131, + "grad_norm": 0.6948046684265137, + "learning_rate": 0.00013701786119833646, + "loss": 2.586, + "step": 7648 + }, + { + "epoch": 0.617302881123396, + "grad_norm": 0.643455982208252, + "learning_rate": 0.00013700319533110377, + "loss": 2.592, + "step": 7649 + }, + { + "epoch": 0.617383584859979, + "grad_norm": 0.7292457818984985, + "learning_rate": 0.0001369885285416547, + "loss": 2.6396, + "step": 7650 + }, + { + "epoch": 0.617464288596562, + "grad_norm": 0.642902672290802, + "learning_rate": 0.00013697386083035478, + "loss": 2.6115, + "step": 7651 + }, + { + "epoch": 0.617544992333145, + "grad_norm": 0.6536445021629333, + "learning_rate": 0.00013695919219756966, + "loss": 2.5406, + "step": 7652 + }, + { + "epoch": 0.6176256960697281, + "grad_norm": 0.6643723249435425, + "learning_rate": 0.0001369445226436648, + "loss": 2.6188, + "step": 7653 + }, + { + "epoch": 0.617706399806311, + "grad_norm": 0.6481621265411377, + "learning_rate": 0.00013692985216900592, + "loss": 2.5489, + "step": 7654 + }, + { + "epoch": 0.617787103542894, + "grad_norm": 0.6828036904335022, + "learning_rate": 0.00013691518077395856, + "loss": 2.5114, + "step": 7655 + }, + { + "epoch": 0.617867807279477, + "grad_norm": 0.6802895665168762, + "learning_rate": 0.00013690050845888838, + "loss": 2.5973, + "step": 7656 + }, + { + "epoch": 0.6179485110160601, + "grad_norm": 0.6980829238891602, + "learning_rate": 0.00013688583522416107, + "loss": 2.6032, + "step": 7657 + }, + { + "epoch": 0.618029214752643, + "grad_norm": 0.7157626748085022, + "learning_rate": 0.00013687116107014236, + "loss": 2.5552, + "step": 7658 + }, + { + "epoch": 0.618109918489226, + "grad_norm": 0.69700688123703, + "learning_rate": 0.00013685648599719792, + "loss": 
2.5988, + "step": 7659 + }, + { + "epoch": 0.618190622225809, + "grad_norm": 0.6859539151191711, + "learning_rate": 0.0001368418100056935, + "loss": 2.6268, + "step": 7660 + }, + { + "epoch": 0.6182713259623921, + "grad_norm": 0.6812828183174133, + "learning_rate": 0.00013682713309599487, + "loss": 2.6002, + "step": 7661 + }, + { + "epoch": 0.6183520296989751, + "grad_norm": 0.6461766362190247, + "learning_rate": 0.00013681245526846783, + "loss": 2.6064, + "step": 7662 + }, + { + "epoch": 0.618432733435558, + "grad_norm": 0.7198306322097778, + "learning_rate": 0.00013679777652347814, + "loss": 2.6012, + "step": 7663 + }, + { + "epoch": 0.618513437172141, + "grad_norm": 0.7367191910743713, + "learning_rate": 0.00013678309686139168, + "loss": 2.6661, + "step": 7664 + }, + { + "epoch": 0.6185941409087241, + "grad_norm": 0.6975768804550171, + "learning_rate": 0.0001367684162825743, + "loss": 2.6394, + "step": 7665 + }, + { + "epoch": 0.6186748446453071, + "grad_norm": 0.7545140385627747, + "learning_rate": 0.0001367537347873919, + "loss": 2.624, + "step": 7666 + }, + { + "epoch": 0.6187555483818901, + "grad_norm": 0.6683520674705505, + "learning_rate": 0.0001367390523762103, + "loss": 2.6345, + "step": 7667 + }, + { + "epoch": 0.618836252118473, + "grad_norm": 0.6964975595474243, + "learning_rate": 0.00013672436904939552, + "loss": 2.591, + "step": 7668 + }, + { + "epoch": 0.6189169558550561, + "grad_norm": 0.7033975124359131, + "learning_rate": 0.00013670968480731344, + "loss": 2.566, + "step": 7669 + }, + { + "epoch": 0.6189976595916391, + "grad_norm": 0.706136167049408, + "learning_rate": 0.00013669499965033007, + "loss": 2.6073, + "step": 7670 + }, + { + "epoch": 0.6190783633282221, + "grad_norm": 0.7146300673484802, + "learning_rate": 0.0001366803135788114, + "loss": 2.6602, + "step": 7671 + }, + { + "epoch": 0.6191590670648051, + "grad_norm": 0.7603063583374023, + "learning_rate": 0.00013666562659312342, + "loss": 2.5286, + "step": 7672 + }, + { + "epoch": 
0.6192397708013881, + "grad_norm": 0.744955837726593, + "learning_rate": 0.00013665093869363217, + "loss": 2.5678, + "step": 7673 + }, + { + "epoch": 0.6193204745379711, + "grad_norm": 0.7548620104789734, + "learning_rate": 0.00013663624988070373, + "loss": 2.6081, + "step": 7674 + }, + { + "epoch": 0.6194011782745541, + "grad_norm": 0.7367276549339294, + "learning_rate": 0.0001366215601547042, + "loss": 2.5559, + "step": 7675 + }, + { + "epoch": 0.6194818820111371, + "grad_norm": 0.7243839502334595, + "learning_rate": 0.00013660686951599962, + "loss": 2.5545, + "step": 7676 + }, + { + "epoch": 0.6195625857477202, + "grad_norm": 0.7595756649971008, + "learning_rate": 0.00013659217796495616, + "loss": 2.6547, + "step": 7677 + }, + { + "epoch": 0.6196432894843031, + "grad_norm": 0.7566717863082886, + "learning_rate": 0.00013657748550193998, + "loss": 2.6521, + "step": 7678 + }, + { + "epoch": 0.6197239932208861, + "grad_norm": 0.8441942930221558, + "learning_rate": 0.00013656279212731728, + "loss": 2.6325, + "step": 7679 + }, + { + "epoch": 0.6198046969574691, + "grad_norm": 0.7481170296669006, + "learning_rate": 0.00013654809784145418, + "loss": 2.6037, + "step": 7680 + }, + { + "epoch": 0.6198854006940522, + "grad_norm": 0.6626241207122803, + "learning_rate": 0.00013653340264471695, + "loss": 2.6028, + "step": 7681 + }, + { + "epoch": 0.6199661044306352, + "grad_norm": 0.7658020853996277, + "learning_rate": 0.00013651870653747186, + "loss": 2.5553, + "step": 7682 + }, + { + "epoch": 0.6200468081672181, + "grad_norm": 0.8218126893043518, + "learning_rate": 0.0001365040095200851, + "loss": 2.5661, + "step": 7683 + }, + { + "epoch": 0.6201275119038011, + "grad_norm": 0.6481068134307861, + "learning_rate": 0.00013648931159292304, + "loss": 2.5675, + "step": 7684 + }, + { + "epoch": 0.6202082156403842, + "grad_norm": 0.7529950141906738, + "learning_rate": 0.0001364746127563519, + "loss": 2.6137, + "step": 7685 + }, + { + "epoch": 0.6202889193769672, + "grad_norm": 
0.7133232355117798, + "learning_rate": 0.00013645991301073816, + "loss": 2.6004, + "step": 7686 + }, + { + "epoch": 0.6203696231135502, + "grad_norm": 0.7809340953826904, + "learning_rate": 0.000136445212356448, + "loss": 2.6317, + "step": 7687 + }, + { + "epoch": 0.6204503268501331, + "grad_norm": 0.7106895446777344, + "learning_rate": 0.00013643051079384789, + "loss": 2.6086, + "step": 7688 + }, + { + "epoch": 0.6205310305867162, + "grad_norm": 0.6960744261741638, + "learning_rate": 0.00013641580832330423, + "loss": 2.5554, + "step": 7689 + }, + { + "epoch": 0.6206117343232992, + "grad_norm": 0.7078820466995239, + "learning_rate": 0.00013640110494518343, + "loss": 2.5902, + "step": 7690 + }, + { + "epoch": 0.6206924380598822, + "grad_norm": 0.7150746583938599, + "learning_rate": 0.00013638640065985195, + "loss": 2.5947, + "step": 7691 + }, + { + "epoch": 0.6207731417964651, + "grad_norm": 0.7507869601249695, + "learning_rate": 0.00013637169546767625, + "loss": 2.559, + "step": 7692 + }, + { + "epoch": 0.6208538455330482, + "grad_norm": 0.7453179359436035, + "learning_rate": 0.00013635698936902282, + "loss": 2.5612, + "step": 7693 + }, + { + "epoch": 0.6209345492696312, + "grad_norm": 0.7174177765846252, + "learning_rate": 0.00013634228236425816, + "loss": 2.6221, + "step": 7694 + }, + { + "epoch": 0.6210152530062142, + "grad_norm": 0.7394092679023743, + "learning_rate": 0.00013632757445374884, + "loss": 2.6045, + "step": 7695 + }, + { + "epoch": 0.6210959567427972, + "grad_norm": 0.7346367239952087, + "learning_rate": 0.0001363128656378614, + "loss": 2.677, + "step": 7696 + }, + { + "epoch": 0.6211766604793802, + "grad_norm": 0.6697696447372437, + "learning_rate": 0.00013629815591696245, + "loss": 2.5741, + "step": 7697 + }, + { + "epoch": 0.6212573642159632, + "grad_norm": 0.6993793845176697, + "learning_rate": 0.00013628344529141852, + "loss": 2.5206, + "step": 7698 + }, + { + "epoch": 0.6213380679525462, + "grad_norm": 0.6946697235107422, + "learning_rate": 
0.00013626873376159631, + "loss": 2.6046, + "step": 7699 + }, + { + "epoch": 0.6214187716891292, + "grad_norm": 0.7641928195953369, + "learning_rate": 0.00013625402132786248, + "loss": 2.5459, + "step": 7700 + }, + { + "epoch": 0.6214994754257122, + "grad_norm": 0.6513504981994629, + "learning_rate": 0.00013623930799058363, + "loss": 2.6137, + "step": 7701 + }, + { + "epoch": 0.6215801791622952, + "grad_norm": 0.6745209097862244, + "learning_rate": 0.00013622459375012651, + "loss": 2.5285, + "step": 7702 + }, + { + "epoch": 0.6216608828988782, + "grad_norm": 0.7162348628044128, + "learning_rate": 0.0001362098786068578, + "loss": 2.6224, + "step": 7703 + }, + { + "epoch": 0.6217415866354612, + "grad_norm": 0.7387436032295227, + "learning_rate": 0.00013619516256114427, + "loss": 2.6216, + "step": 7704 + }, + { + "epoch": 0.6218222903720442, + "grad_norm": 0.764955461025238, + "learning_rate": 0.00013618044561335268, + "loss": 2.612, + "step": 7705 + }, + { + "epoch": 0.6219029941086273, + "grad_norm": 0.6492719054222107, + "learning_rate": 0.00013616572776384983, + "loss": 2.5532, + "step": 7706 + }, + { + "epoch": 0.6219836978452102, + "grad_norm": 0.6870293617248535, + "learning_rate": 0.0001361510090130025, + "loss": 2.5705, + "step": 7707 + }, + { + "epoch": 0.6220644015817932, + "grad_norm": 0.6899540424346924, + "learning_rate": 0.0001361362893611775, + "loss": 2.5768, + "step": 7708 + }, + { + "epoch": 0.6221451053183762, + "grad_norm": 0.658941924571991, + "learning_rate": 0.0001361215688087417, + "loss": 2.5664, + "step": 7709 + }, + { + "epoch": 0.6222258090549593, + "grad_norm": 0.6875531673431396, + "learning_rate": 0.000136106847356062, + "loss": 2.6128, + "step": 7710 + }, + { + "epoch": 0.6223065127915423, + "grad_norm": 0.657073974609375, + "learning_rate": 0.0001360921250035053, + "loss": 2.6449, + "step": 7711 + }, + { + "epoch": 0.6223872165281252, + "grad_norm": 0.7051201462745667, + "learning_rate": 0.00013607740175143848, + "loss": 2.5925, + 
"step": 7712 + }, + { + "epoch": 0.6224679202647082, + "grad_norm": 0.702877938747406, + "learning_rate": 0.0001360626776002285, + "loss": 2.5338, + "step": 7713 + }, + { + "epoch": 0.6225486240012913, + "grad_norm": 0.650935709476471, + "learning_rate": 0.00013604795255024233, + "loss": 2.5799, + "step": 7714 + }, + { + "epoch": 0.6226293277378743, + "grad_norm": 0.7035139203071594, + "learning_rate": 0.00013603322660184694, + "loss": 2.5476, + "step": 7715 + }, + { + "epoch": 0.6227100314744572, + "grad_norm": 0.6549977660179138, + "learning_rate": 0.0001360184997554094, + "loss": 2.6117, + "step": 7716 + }, + { + "epoch": 0.6227907352110402, + "grad_norm": 0.6882792115211487, + "learning_rate": 0.00013600377201129662, + "loss": 2.53, + "step": 7717 + }, + { + "epoch": 0.6228714389476233, + "grad_norm": 0.7390840649604797, + "learning_rate": 0.0001359890433698758, + "loss": 2.6345, + "step": 7718 + }, + { + "epoch": 0.6229521426842063, + "grad_norm": 0.7577612400054932, + "learning_rate": 0.00013597431383151386, + "loss": 2.6386, + "step": 7719 + }, + { + "epoch": 0.6230328464207893, + "grad_norm": 0.6818724870681763, + "learning_rate": 0.00013595958339657804, + "loss": 2.5806, + "step": 7720 + }, + { + "epoch": 0.6231135501573722, + "grad_norm": 0.6954349279403687, + "learning_rate": 0.0001359448520654354, + "loss": 2.5913, + "step": 7721 + }, + { + "epoch": 0.6231942538939553, + "grad_norm": 0.7976544499397278, + "learning_rate": 0.00013593011983845308, + "loss": 2.5686, + "step": 7722 + }, + { + "epoch": 0.6232749576305383, + "grad_norm": 0.7362754940986633, + "learning_rate": 0.00013591538671599824, + "loss": 2.5596, + "step": 7723 + }, + { + "epoch": 0.6233556613671213, + "grad_norm": 0.6842390298843384, + "learning_rate": 0.00013590065269843805, + "loss": 2.5793, + "step": 7724 + }, + { + "epoch": 0.6234363651037043, + "grad_norm": 0.6816275715827942, + "learning_rate": 0.0001358859177861398, + "loss": 2.5948, + "step": 7725 + }, + { + "epoch": 
0.6235170688402873, + "grad_norm": 0.6892915964126587, + "learning_rate": 0.00013587118197947066, + "loss": 2.6287, + "step": 7726 + }, + { + "epoch": 0.6235977725768703, + "grad_norm": 0.6851752996444702, + "learning_rate": 0.00013585644527879792, + "loss": 2.5781, + "step": 7727 + }, + { + "epoch": 0.6236784763134533, + "grad_norm": 0.7022164463996887, + "learning_rate": 0.00013584170768448877, + "loss": 2.5856, + "step": 7728 + }, + { + "epoch": 0.6237591800500363, + "grad_norm": 0.6752299070358276, + "learning_rate": 0.0001358269691969106, + "loss": 2.6042, + "step": 7729 + }, + { + "epoch": 0.6238398837866194, + "grad_norm": 0.6861466765403748, + "learning_rate": 0.00013581222981643074, + "loss": 2.5887, + "step": 7730 + }, + { + "epoch": 0.6239205875232023, + "grad_norm": 0.7147940397262573, + "learning_rate": 0.00013579748954341647, + "loss": 2.5796, + "step": 7731 + }, + { + "epoch": 0.6240012912597853, + "grad_norm": 0.6704726219177246, + "learning_rate": 0.0001357827483782352, + "loss": 2.6027, + "step": 7732 + }, + { + "epoch": 0.6240819949963683, + "grad_norm": 0.6984317898750305, + "learning_rate": 0.0001357680063212543, + "loss": 2.635, + "step": 7733 + }, + { + "epoch": 0.6241626987329514, + "grad_norm": 0.6205787658691406, + "learning_rate": 0.00013575326337284115, + "loss": 2.5715, + "step": 7734 + }, + { + "epoch": 0.6242434024695344, + "grad_norm": 0.7214726805686951, + "learning_rate": 0.00013573851953336326, + "loss": 2.5605, + "step": 7735 + }, + { + "epoch": 0.6243241062061173, + "grad_norm": 0.6716169714927673, + "learning_rate": 0.000135723774803188, + "loss": 2.6766, + "step": 7736 + }, + { + "epoch": 0.6244048099427003, + "grad_norm": 0.6446832418441772, + "learning_rate": 0.00013570902918268293, + "loss": 2.5629, + "step": 7737 + }, + { + "epoch": 0.6244855136792834, + "grad_norm": 0.6721374988555908, + "learning_rate": 0.0001356942826722155, + "loss": 2.6093, + "step": 7738 + }, + { + "epoch": 0.6245662174158664, + "grad_norm": 
0.7430365681648254, + "learning_rate": 0.0001356795352721532, + "loss": 2.5966, + "step": 7739 + }, + { + "epoch": 0.6246469211524494, + "grad_norm": 0.6787518858909607, + "learning_rate": 0.00013566478698286366, + "loss": 2.5519, + "step": 7740 + }, + { + "epoch": 0.6247276248890323, + "grad_norm": 0.6340047121047974, + "learning_rate": 0.0001356500378047144, + "loss": 2.5181, + "step": 7741 + }, + { + "epoch": 0.6248083286256154, + "grad_norm": 0.7559040188789368, + "learning_rate": 0.000135635287738073, + "loss": 2.6068, + "step": 7742 + }, + { + "epoch": 0.6248890323621984, + "grad_norm": 0.6819902062416077, + "learning_rate": 0.00013562053678330707, + "loss": 2.5754, + "step": 7743 + }, + { + "epoch": 0.6249697360987814, + "grad_norm": 0.6463500261306763, + "learning_rate": 0.00013560578494078423, + "loss": 2.5915, + "step": 7744 + }, + { + "epoch": 0.6250504398353643, + "grad_norm": 0.7510617971420288, + "learning_rate": 0.0001355910322108722, + "loss": 2.5738, + "step": 7745 + }, + { + "epoch": 0.6251311435719474, + "grad_norm": 0.75312739610672, + "learning_rate": 0.00013557627859393855, + "loss": 2.5938, + "step": 7746 + }, + { + "epoch": 0.6252118473085304, + "grad_norm": 0.7784396409988403, + "learning_rate": 0.0001355615240903511, + "loss": 2.6634, + "step": 7747 + }, + { + "epoch": 0.6252925510451134, + "grad_norm": 0.7174746990203857, + "learning_rate": 0.00013554676870047752, + "loss": 2.5973, + "step": 7748 + }, + { + "epoch": 0.6253732547816964, + "grad_norm": 0.6854952573776245, + "learning_rate": 0.0001355320124246855, + "loss": 2.5397, + "step": 7749 + }, + { + "epoch": 0.6254539585182795, + "grad_norm": 0.6584961414337158, + "learning_rate": 0.00013551725526334284, + "loss": 2.5574, + "step": 7750 + }, + { + "epoch": 0.6255346622548624, + "grad_norm": 0.7067389488220215, + "learning_rate": 0.00013550249721681738, + "loss": 2.5524, + "step": 7751 + }, + { + "epoch": 0.6256153659914454, + "grad_norm": 0.6923872232437134, + "learning_rate": 
0.00013548773828547686, + "loss": 2.5651, + "step": 7752 + }, + { + "epoch": 0.6256960697280284, + "grad_norm": 0.6612355709075928, + "learning_rate": 0.00013547297846968915, + "loss": 2.6075, + "step": 7753 + }, + { + "epoch": 0.6257767734646114, + "grad_norm": 0.6762828826904297, + "learning_rate": 0.00013545821776982206, + "loss": 2.6136, + "step": 7754 + }, + { + "epoch": 0.6258574772011944, + "grad_norm": 0.6940783858299255, + "learning_rate": 0.0001354434561862435, + "loss": 2.5566, + "step": 7755 + }, + { + "epoch": 0.6259381809377774, + "grad_norm": 0.7874250411987305, + "learning_rate": 0.0001354286937193214, + "loss": 2.6732, + "step": 7756 + }, + { + "epoch": 0.6260188846743604, + "grad_norm": 0.6974111795425415, + "learning_rate": 0.0001354139303694236, + "loss": 2.5455, + "step": 7757 + }, + { + "epoch": 0.6260995884109434, + "grad_norm": 0.6710802316665649, + "learning_rate": 0.0001353991661369181, + "loss": 2.5608, + "step": 7758 + }, + { + "epoch": 0.6261802921475265, + "grad_norm": 0.681635320186615, + "learning_rate": 0.00013538440102217286, + "loss": 2.6107, + "step": 7759 + }, + { + "epoch": 0.6262609958841094, + "grad_norm": 0.7229577898979187, + "learning_rate": 0.0001353696350255558, + "loss": 2.5936, + "step": 7760 + }, + { + "epoch": 0.6263416996206924, + "grad_norm": 0.6909681558609009, + "learning_rate": 0.00013535486814743504, + "loss": 2.5521, + "step": 7761 + }, + { + "epoch": 0.6264224033572754, + "grad_norm": 0.7003746032714844, + "learning_rate": 0.0001353401003881785, + "loss": 2.5606, + "step": 7762 + }, + { + "epoch": 0.6265031070938585, + "grad_norm": 0.6883233785629272, + "learning_rate": 0.0001353253317481543, + "loss": 2.5971, + "step": 7763 + }, + { + "epoch": 0.6265838108304415, + "grad_norm": 0.7382355332374573, + "learning_rate": 0.0001353105622277305, + "loss": 2.5449, + "step": 7764 + }, + { + "epoch": 0.6266645145670244, + "grad_norm": 0.7090556621551514, + "learning_rate": 0.00013529579182727515, + "loss": 2.5988, + 
"step": 7765 + }, + { + "epoch": 0.6267452183036074, + "grad_norm": 0.6842581629753113, + "learning_rate": 0.00013528102054715643, + "loss": 2.6214, + "step": 7766 + }, + { + "epoch": 0.6268259220401905, + "grad_norm": 0.6969670653343201, + "learning_rate": 0.00013526624838774246, + "loss": 2.5443, + "step": 7767 + }, + { + "epoch": 0.6269066257767735, + "grad_norm": 0.7244827151298523, + "learning_rate": 0.00013525147534940138, + "loss": 2.5967, + "step": 7768 + }, + { + "epoch": 0.6269873295133565, + "grad_norm": 0.7022162675857544, + "learning_rate": 0.0001352367014325014, + "loss": 2.599, + "step": 7769 + }, + { + "epoch": 0.6270680332499394, + "grad_norm": 0.7065250873565674, + "learning_rate": 0.00013522192663741067, + "loss": 2.6105, + "step": 7770 + }, + { + "epoch": 0.6271487369865225, + "grad_norm": 0.6690711975097656, + "learning_rate": 0.0001352071509644975, + "loss": 2.55, + "step": 7771 + }, + { + "epoch": 0.6272294407231055, + "grad_norm": 0.6405982971191406, + "learning_rate": 0.00013519237441413011, + "loss": 2.6078, + "step": 7772 + }, + { + "epoch": 0.6273101444596885, + "grad_norm": 0.7340127229690552, + "learning_rate": 0.00013517759698667672, + "loss": 2.6244, + "step": 7773 + }, + { + "epoch": 0.6273908481962714, + "grad_norm": 0.6609435677528381, + "learning_rate": 0.00013516281868250566, + "loss": 2.5746, + "step": 7774 + }, + { + "epoch": 0.6274715519328545, + "grad_norm": 0.6681997179985046, + "learning_rate": 0.00013514803950198523, + "loss": 2.6181, + "step": 7775 + }, + { + "epoch": 0.6275522556694375, + "grad_norm": 0.7120032906532288, + "learning_rate": 0.0001351332594454838, + "loss": 2.6018, + "step": 7776 + }, + { + "epoch": 0.6276329594060205, + "grad_norm": 0.6618601679801941, + "learning_rate": 0.0001351184785133697, + "loss": 2.5342, + "step": 7777 + }, + { + "epoch": 0.6277136631426035, + "grad_norm": 0.7250192165374756, + "learning_rate": 0.00013510369670601132, + "loss": 2.5795, + "step": 7778 + }, + { + "epoch": 
0.6277943668791865, + "grad_norm": 0.7918543219566345, + "learning_rate": 0.00013508891402377708, + "loss": 2.6544, + "step": 7779 + }, + { + "epoch": 0.6278750706157695, + "grad_norm": 0.678895890712738, + "learning_rate": 0.00013507413046703534, + "loss": 2.5937, + "step": 7780 + }, + { + "epoch": 0.6279557743523525, + "grad_norm": 0.7336576581001282, + "learning_rate": 0.00013505934603615457, + "loss": 2.598, + "step": 7781 + }, + { + "epoch": 0.6280364780889355, + "grad_norm": 0.6891419291496277, + "learning_rate": 0.00013504456073150332, + "loss": 2.5063, + "step": 7782 + }, + { + "epoch": 0.6281171818255186, + "grad_norm": 0.7949386835098267, + "learning_rate": 0.00013502977455344997, + "loss": 2.5703, + "step": 7783 + }, + { + "epoch": 0.6281978855621015, + "grad_norm": 0.7917985320091248, + "learning_rate": 0.00013501498750236306, + "loss": 2.639, + "step": 7784 + }, + { + "epoch": 0.6282785892986845, + "grad_norm": 0.7387086749076843, + "learning_rate": 0.00013500019957861113, + "loss": 2.5864, + "step": 7785 + }, + { + "epoch": 0.6283592930352675, + "grad_norm": 0.7189435958862305, + "learning_rate": 0.00013498541078256273, + "loss": 2.5627, + "step": 7786 + }, + { + "epoch": 0.6284399967718506, + "grad_norm": 0.6709900498390198, + "learning_rate": 0.00013497062111458646, + "loss": 2.5973, + "step": 7787 + }, + { + "epoch": 0.6285207005084336, + "grad_norm": 0.6925386190414429, + "learning_rate": 0.0001349558305750509, + "loss": 2.615, + "step": 7788 + }, + { + "epoch": 0.6286014042450165, + "grad_norm": 0.7191932201385498, + "learning_rate": 0.00013494103916432466, + "loss": 2.576, + "step": 7789 + }, + { + "epoch": 0.6286821079815995, + "grad_norm": 0.6798804402351379, + "learning_rate": 0.00013492624688277638, + "loss": 2.5661, + "step": 7790 + }, + { + "epoch": 0.6287628117181826, + "grad_norm": 0.6514562964439392, + "learning_rate": 0.00013491145373077475, + "loss": 2.6135, + "step": 7791 + }, + { + "epoch": 0.6288435154547656, + "grad_norm": 
0.7345223426818848, + "learning_rate": 0.00013489665970868838, + "loss": 2.6015, + "step": 7792 + }, + { + "epoch": 0.6289242191913486, + "grad_norm": 0.7102675437927246, + "learning_rate": 0.0001348818648168861, + "loss": 2.5545, + "step": 7793 + }, + { + "epoch": 0.6290049229279315, + "grad_norm": 0.7151654362678528, + "learning_rate": 0.0001348670690557365, + "loss": 2.6464, + "step": 7794 + }, + { + "epoch": 0.6290856266645146, + "grad_norm": 0.7344057559967041, + "learning_rate": 0.00013485227242560844, + "loss": 2.6777, + "step": 7795 + }, + { + "epoch": 0.6291663304010976, + "grad_norm": 0.6622766852378845, + "learning_rate": 0.00013483747492687065, + "loss": 2.5713, + "step": 7796 + }, + { + "epoch": 0.6292470341376806, + "grad_norm": 0.6899346709251404, + "learning_rate": 0.0001348226765598919, + "loss": 2.5188, + "step": 7797 + }, + { + "epoch": 0.6293277378742635, + "grad_norm": 0.6711421012878418, + "learning_rate": 0.000134807877325041, + "loss": 2.5603, + "step": 7798 + }, + { + "epoch": 0.6294084416108466, + "grad_norm": 0.6973204016685486, + "learning_rate": 0.00013479307722268687, + "loss": 2.6621, + "step": 7799 + }, + { + "epoch": 0.6294891453474296, + "grad_norm": 0.7782350778579712, + "learning_rate": 0.00013477827625319824, + "loss": 2.5929, + "step": 7800 + }, + { + "epoch": 0.6295698490840126, + "grad_norm": 0.8703733682632446, + "learning_rate": 0.0001347634744169441, + "loss": 2.6884, + "step": 7801 + }, + { + "epoch": 0.6296505528205956, + "grad_norm": 0.7196036577224731, + "learning_rate": 0.00013474867171429326, + "loss": 2.6002, + "step": 7802 + }, + { + "epoch": 0.6297312565571785, + "grad_norm": 0.7224054932594299, + "learning_rate": 0.00013473386814561475, + "loss": 2.6007, + "step": 7803 + }, + { + "epoch": 0.6298119602937616, + "grad_norm": 0.7615752816200256, + "learning_rate": 0.00013471906371127743, + "loss": 2.6459, + "step": 7804 + }, + { + "epoch": 0.6298926640303446, + "grad_norm": 0.7189914584159851, + "learning_rate": 
0.00013470425841165024, + "loss": 2.5692, + "step": 7805 + }, + { + "epoch": 0.6299733677669276, + "grad_norm": 0.7101845741271973, + "learning_rate": 0.00013468945224710225, + "loss": 2.5776, + "step": 7806 + }, + { + "epoch": 0.6300540715035106, + "grad_norm": 0.6860305666923523, + "learning_rate": 0.00013467464521800244, + "loss": 2.5567, + "step": 7807 + }, + { + "epoch": 0.6301347752400936, + "grad_norm": 0.7003797292709351, + "learning_rate": 0.0001346598373247198, + "loss": 2.6444, + "step": 7808 + }, + { + "epoch": 0.6302154789766766, + "grad_norm": 0.6341832876205444, + "learning_rate": 0.00013464502856762344, + "loss": 2.5475, + "step": 7809 + }, + { + "epoch": 0.6302961827132596, + "grad_norm": 0.6255922317504883, + "learning_rate": 0.00013463021894708242, + "loss": 2.5875, + "step": 7810 + }, + { + "epoch": 0.6303768864498426, + "grad_norm": 0.7136420607566833, + "learning_rate": 0.00013461540846346575, + "loss": 2.5708, + "step": 7811 + }, + { + "epoch": 0.6304575901864257, + "grad_norm": 0.7164542078971863, + "learning_rate": 0.00013460059711714267, + "loss": 2.4975, + "step": 7812 + }, + { + "epoch": 0.6305382939230086, + "grad_norm": 0.7667872905731201, + "learning_rate": 0.00013458578490848226, + "loss": 2.6124, + "step": 7813 + }, + { + "epoch": 0.6306189976595916, + "grad_norm": 0.6631812453269958, + "learning_rate": 0.0001345709718378537, + "loss": 2.5318, + "step": 7814 + }, + { + "epoch": 0.6306997013961746, + "grad_norm": 0.696864664554596, + "learning_rate": 0.0001345561579056261, + "loss": 2.6171, + "step": 7815 + }, + { + "epoch": 0.6307804051327577, + "grad_norm": 0.7368598580360413, + "learning_rate": 0.00013454134311216873, + "loss": 2.5734, + "step": 7816 + }, + { + "epoch": 0.6308611088693407, + "grad_norm": 0.7279712557792664, + "learning_rate": 0.00013452652745785083, + "loss": 2.6231, + "step": 7817 + }, + { + "epoch": 0.6309418126059236, + "grad_norm": 0.8070993423461914, + "learning_rate": 0.00013451171094304158, + "loss": 
2.5486, + "step": 7818 + }, + { + "epoch": 0.6310225163425066, + "grad_norm": 0.7522621750831604, + "learning_rate": 0.0001344968935681103, + "loss": 2.5576, + "step": 7819 + }, + { + "epoch": 0.6311032200790897, + "grad_norm": 0.8185423612594604, + "learning_rate": 0.00013448207533342624, + "loss": 2.6068, + "step": 7820 + }, + { + "epoch": 0.6311839238156727, + "grad_norm": 0.7542584538459778, + "learning_rate": 0.0001344672562393587, + "loss": 2.643, + "step": 7821 + }, + { + "epoch": 0.6312646275522557, + "grad_norm": 0.7892276644706726, + "learning_rate": 0.00013445243628627712, + "loss": 2.6211, + "step": 7822 + }, + { + "epoch": 0.6313453312888386, + "grad_norm": 0.7216602563858032, + "learning_rate": 0.00013443761547455072, + "loss": 2.5725, + "step": 7823 + }, + { + "epoch": 0.6314260350254217, + "grad_norm": 0.6750743985176086, + "learning_rate": 0.0001344227938045489, + "loss": 2.5319, + "step": 7824 + }, + { + "epoch": 0.6315067387620047, + "grad_norm": 0.6711540222167969, + "learning_rate": 0.0001344079712766411, + "loss": 2.5957, + "step": 7825 + }, + { + "epoch": 0.6315874424985877, + "grad_norm": 0.6923524737358093, + "learning_rate": 0.00013439314789119667, + "loss": 2.6084, + "step": 7826 + }, + { + "epoch": 0.6316681462351706, + "grad_norm": 0.6859166026115417, + "learning_rate": 0.00013437832364858517, + "loss": 2.5608, + "step": 7827 + }, + { + "epoch": 0.6317488499717537, + "grad_norm": 0.7340966463088989, + "learning_rate": 0.0001343634985491759, + "loss": 2.531, + "step": 7828 + }, + { + "epoch": 0.6318295537083367, + "grad_norm": 0.7374520301818848, + "learning_rate": 0.00013434867259333848, + "loss": 2.5972, + "step": 7829 + }, + { + "epoch": 0.6319102574449197, + "grad_norm": 0.7252814769744873, + "learning_rate": 0.00013433384578144232, + "loss": 2.5874, + "step": 7830 + }, + { + "epoch": 0.6319909611815027, + "grad_norm": 0.7000489830970764, + "learning_rate": 0.000134319018113857, + "loss": 2.6137, + "step": 7831 + }, + { + "epoch": 
0.6320716649180858, + "grad_norm": 0.805981457233429, + "learning_rate": 0.00013430418959095198, + "loss": 2.5581, + "step": 7832 + }, + { + "epoch": 0.6321523686546687, + "grad_norm": 0.7459721565246582, + "learning_rate": 0.00013428936021309693, + "loss": 2.5284, + "step": 7833 + }, + { + "epoch": 0.6322330723912517, + "grad_norm": 0.749794065952301, + "learning_rate": 0.00013427452998066136, + "loss": 2.5927, + "step": 7834 + }, + { + "epoch": 0.6323137761278347, + "grad_norm": 0.6925346255302429, + "learning_rate": 0.00013425969889401494, + "loss": 2.5703, + "step": 7835 + }, + { + "epoch": 0.6323944798644178, + "grad_norm": 0.6647117137908936, + "learning_rate": 0.00013424486695352728, + "loss": 2.5649, + "step": 7836 + }, + { + "epoch": 0.6324751836010007, + "grad_norm": 0.7358147501945496, + "learning_rate": 0.00013423003415956796, + "loss": 2.6122, + "step": 7837 + }, + { + "epoch": 0.6325558873375837, + "grad_norm": 0.7798088788986206, + "learning_rate": 0.00013421520051250675, + "loss": 2.5805, + "step": 7838 + }, + { + "epoch": 0.6326365910741667, + "grad_norm": 0.7108271718025208, + "learning_rate": 0.00013420036601271334, + "loss": 2.5457, + "step": 7839 + }, + { + "epoch": 0.6327172948107498, + "grad_norm": 0.7108528017997742, + "learning_rate": 0.00013418553066055734, + "loss": 2.6313, + "step": 7840 + }, + { + "epoch": 0.6327979985473328, + "grad_norm": 0.7325249910354614, + "learning_rate": 0.00013417069445640858, + "loss": 2.5598, + "step": 7841 + }, + { + "epoch": 0.6328787022839157, + "grad_norm": 0.6861844062805176, + "learning_rate": 0.0001341558574006368, + "loss": 2.5899, + "step": 7842 + }, + { + "epoch": 0.6329594060204987, + "grad_norm": 0.7576130628585815, + "learning_rate": 0.00013414101949361175, + "loss": 2.6077, + "step": 7843 + }, + { + "epoch": 0.6330401097570818, + "grad_norm": 0.7756128907203674, + "learning_rate": 0.0001341261807357033, + "loss": 2.6111, + "step": 7844 + }, + { + "epoch": 0.6331208134936648, + "grad_norm": 
0.7131127715110779, + "learning_rate": 0.00013411134112728114, + "loss": 2.5227, + "step": 7845 + }, + { + "epoch": 0.6332015172302478, + "grad_norm": 0.6517898440361023, + "learning_rate": 0.00013409650066871525, + "loss": 2.5825, + "step": 7846 + }, + { + "epoch": 0.6332822209668307, + "grad_norm": 0.8452722430229187, + "learning_rate": 0.0001340816593603754, + "loss": 2.6037, + "step": 7847 + }, + { + "epoch": 0.6333629247034138, + "grad_norm": 0.7421110272407532, + "learning_rate": 0.00013406681720263153, + "loss": 2.5684, + "step": 7848 + }, + { + "epoch": 0.6334436284399968, + "grad_norm": 0.695139467716217, + "learning_rate": 0.0001340519741958535, + "loss": 2.5648, + "step": 7849 + }, + { + "epoch": 0.6335243321765798, + "grad_norm": 0.7780016660690308, + "learning_rate": 0.0001340371303404113, + "loss": 2.6849, + "step": 7850 + }, + { + "epoch": 0.6336050359131628, + "grad_norm": 0.7276864051818848, + "learning_rate": 0.00013402228563667482, + "loss": 2.6198, + "step": 7851 + }, + { + "epoch": 0.6336857396497458, + "grad_norm": 0.7566827535629272, + "learning_rate": 0.00013400744008501404, + "loss": 2.5803, + "step": 7852 + }, + { + "epoch": 0.6337664433863288, + "grad_norm": 0.7933458089828491, + "learning_rate": 0.00013399259368579894, + "loss": 2.6029, + "step": 7853 + }, + { + "epoch": 0.6338471471229118, + "grad_norm": 0.6849822402000427, + "learning_rate": 0.00013397774643939957, + "loss": 2.5454, + "step": 7854 + }, + { + "epoch": 0.6339278508594948, + "grad_norm": 0.7054651379585266, + "learning_rate": 0.00013396289834618594, + "loss": 2.5905, + "step": 7855 + }, + { + "epoch": 0.6340085545960777, + "grad_norm": 0.7036863565444946, + "learning_rate": 0.00013394804940652813, + "loss": 2.6342, + "step": 7856 + }, + { + "epoch": 0.6340892583326608, + "grad_norm": 0.7101735472679138, + "learning_rate": 0.00013393319962079614, + "loss": 2.6402, + "step": 7857 + }, + { + "epoch": 0.6341699620692438, + "grad_norm": 0.7053956389427185, + "learning_rate": 
0.0001339183489893601, + "loss": 2.5841, + "step": 7858 + }, + { + "epoch": 0.6342506658058268, + "grad_norm": 0.7734887003898621, + "learning_rate": 0.0001339034975125902, + "loss": 2.652, + "step": 7859 + }, + { + "epoch": 0.6343313695424098, + "grad_norm": 0.6714119911193848, + "learning_rate": 0.0001338886451908565, + "loss": 2.5927, + "step": 7860 + }, + { + "epoch": 0.6344120732789928, + "grad_norm": 0.6580910682678223, + "learning_rate": 0.00013387379202452917, + "loss": 2.6114, + "step": 7861 + }, + { + "epoch": 0.6344927770155758, + "grad_norm": 0.6810200214385986, + "learning_rate": 0.00013385893801397836, + "loss": 2.5616, + "step": 7862 + }, + { + "epoch": 0.6345734807521588, + "grad_norm": 0.6989572048187256, + "learning_rate": 0.00013384408315957432, + "loss": 2.5954, + "step": 7863 + }, + { + "epoch": 0.6346541844887418, + "grad_norm": 0.7033671736717224, + "learning_rate": 0.00013382922746168728, + "loss": 2.6015, + "step": 7864 + }, + { + "epoch": 0.6347348882253249, + "grad_norm": 0.6873033046722412, + "learning_rate": 0.0001338143709206875, + "loss": 2.562, + "step": 7865 + }, + { + "epoch": 0.6348155919619078, + "grad_norm": 0.7361463904380798, + "learning_rate": 0.00013379951353694513, + "loss": 2.6175, + "step": 7866 + }, + { + "epoch": 0.6348962956984908, + "grad_norm": 0.7623226046562195, + "learning_rate": 0.00013378465531083055, + "loss": 2.7342, + "step": 7867 + }, + { + "epoch": 0.6349769994350738, + "grad_norm": 0.7427035570144653, + "learning_rate": 0.0001337697962427141, + "loss": 2.5468, + "step": 7868 + }, + { + "epoch": 0.6350577031716569, + "grad_norm": 0.6865772008895874, + "learning_rate": 0.00013375493633296598, + "loss": 2.6112, + "step": 7869 + }, + { + "epoch": 0.6351384069082399, + "grad_norm": 0.663567304611206, + "learning_rate": 0.00013374007558195666, + "loss": 2.5896, + "step": 7870 + }, + { + "epoch": 0.6352191106448228, + "grad_norm": 0.6804360151290894, + "learning_rate": 0.00013372521399005643, + "loss": 2.58, + 
"step": 7871 + }, + { + "epoch": 0.6352998143814058, + "grad_norm": 0.6755216121673584, + "learning_rate": 0.0001337103515576357, + "loss": 2.5593, + "step": 7872 + }, + { + "epoch": 0.6353805181179889, + "grad_norm": 0.8148807883262634, + "learning_rate": 0.00013369548828506491, + "loss": 2.6473, + "step": 7873 + }, + { + "epoch": 0.6354612218545719, + "grad_norm": 0.713009774684906, + "learning_rate": 0.00013368062417271447, + "loss": 2.6002, + "step": 7874 + }, + { + "epoch": 0.6355419255911549, + "grad_norm": 0.6390172839164734, + "learning_rate": 0.00013366575922095484, + "loss": 2.5794, + "step": 7875 + }, + { + "epoch": 0.6356226293277378, + "grad_norm": 0.7228195667266846, + "learning_rate": 0.00013365089343015649, + "loss": 2.6051, + "step": 7876 + }, + { + "epoch": 0.6357033330643209, + "grad_norm": 0.7563474178314209, + "learning_rate": 0.00013363602680068986, + "loss": 2.6308, + "step": 7877 + }, + { + "epoch": 0.6357840368009039, + "grad_norm": 0.7366798520088196, + "learning_rate": 0.00013362115933292557, + "loss": 2.5589, + "step": 7878 + }, + { + "epoch": 0.6358647405374869, + "grad_norm": 0.7137070894241333, + "learning_rate": 0.00013360629102723409, + "loss": 2.6428, + "step": 7879 + }, + { + "epoch": 0.6359454442740698, + "grad_norm": 0.6799132823944092, + "learning_rate": 0.000133591421883986, + "loss": 2.5549, + "step": 7880 + }, + { + "epoch": 0.6360261480106529, + "grad_norm": 0.7031344771385193, + "learning_rate": 0.00013357655190355188, + "loss": 2.6298, + "step": 7881 + }, + { + "epoch": 0.6361068517472359, + "grad_norm": 0.7441670298576355, + "learning_rate": 0.00013356168108630227, + "loss": 2.5844, + "step": 7882 + }, + { + "epoch": 0.6361875554838189, + "grad_norm": 0.7281978726387024, + "learning_rate": 0.00013354680943260784, + "loss": 2.5773, + "step": 7883 + }, + { + "epoch": 0.6362682592204019, + "grad_norm": 0.6969650983810425, + "learning_rate": 0.00013353193694283928, + "loss": 2.6156, + "step": 7884 + }, + { + "epoch": 
0.636348962956985, + "grad_norm": 0.6668435335159302, + "learning_rate": 0.00013351706361736714, + "loss": 2.6328, + "step": 7885 + }, + { + "epoch": 0.6364296666935679, + "grad_norm": 0.6909573078155518, + "learning_rate": 0.0001335021894565622, + "loss": 2.5772, + "step": 7886 + }, + { + "epoch": 0.6365103704301509, + "grad_norm": 0.6740022897720337, + "learning_rate": 0.0001334873144607951, + "loss": 2.6435, + "step": 7887 + }, + { + "epoch": 0.6365910741667339, + "grad_norm": 0.7203185558319092, + "learning_rate": 0.0001334724386304366, + "loss": 2.5401, + "step": 7888 + }, + { + "epoch": 0.636671777903317, + "grad_norm": 0.7343020439147949, + "learning_rate": 0.0001334575619658574, + "loss": 2.5811, + "step": 7889 + }, + { + "epoch": 0.6367524816399, + "grad_norm": 0.6941348314285278, + "learning_rate": 0.00013344268446742835, + "loss": 2.6267, + "step": 7890 + }, + { + "epoch": 0.6368331853764829, + "grad_norm": 0.6983792185783386, + "learning_rate": 0.00013342780613552016, + "loss": 2.533, + "step": 7891 + }, + { + "epoch": 0.6369138891130659, + "grad_norm": 0.7093533277511597, + "learning_rate": 0.00013341292697050365, + "loss": 2.6616, + "step": 7892 + }, + { + "epoch": 0.636994592849649, + "grad_norm": 0.7377648949623108, + "learning_rate": 0.00013339804697274965, + "loss": 2.6032, + "step": 7893 + }, + { + "epoch": 0.637075296586232, + "grad_norm": 0.6669821739196777, + "learning_rate": 0.00013338316614262903, + "loss": 2.6082, + "step": 7894 + }, + { + "epoch": 0.6371560003228149, + "grad_norm": 0.6665576100349426, + "learning_rate": 0.00013336828448051263, + "loss": 2.6114, + "step": 7895 + }, + { + "epoch": 0.6372367040593979, + "grad_norm": 0.6893584132194519, + "learning_rate": 0.0001333534019867714, + "loss": 2.5886, + "step": 7896 + }, + { + "epoch": 0.637317407795981, + "grad_norm": 0.7651494741439819, + "learning_rate": 0.00013333851866177617, + "loss": 2.5622, + "step": 7897 + }, + { + "epoch": 0.637398111532564, + "grad_norm": 
0.8124055862426758, + "learning_rate": 0.00013332363450589788, + "loss": 2.6036, + "step": 7898 + }, + { + "epoch": 0.637478815269147, + "grad_norm": 0.7394436597824097, + "learning_rate": 0.00013330874951950755, + "loss": 2.6214, + "step": 7899 + }, + { + "epoch": 0.6375595190057299, + "grad_norm": 0.6279659867286682, + "learning_rate": 0.00013329386370297615, + "loss": 2.5652, + "step": 7900 + }, + { + "epoch": 0.637640222742313, + "grad_norm": 0.7289649248123169, + "learning_rate": 0.00013327897705667455, + "loss": 2.5628, + "step": 7901 + }, + { + "epoch": 0.637720926478896, + "grad_norm": 0.7267701625823975, + "learning_rate": 0.0001332640895809739, + "loss": 2.5475, + "step": 7902 + }, + { + "epoch": 0.637801630215479, + "grad_norm": 0.7470490336418152, + "learning_rate": 0.00013324920127624515, + "loss": 2.5054, + "step": 7903 + }, + { + "epoch": 0.637882333952062, + "grad_norm": 0.6963294148445129, + "learning_rate": 0.00013323431214285944, + "loss": 2.5992, + "step": 7904 + }, + { + "epoch": 0.6379630376886449, + "grad_norm": 0.6993808746337891, + "learning_rate": 0.00013321942218118778, + "loss": 2.6044, + "step": 7905 + }, + { + "epoch": 0.638043741425228, + "grad_norm": 0.6620917916297913, + "learning_rate": 0.00013320453139160126, + "loss": 2.5278, + "step": 7906 + }, + { + "epoch": 0.638124445161811, + "grad_norm": 0.6535444855690002, + "learning_rate": 0.00013318963977447106, + "loss": 2.6069, + "step": 7907 + }, + { + "epoch": 0.638205148898394, + "grad_norm": 0.6913008689880371, + "learning_rate": 0.00013317474733016824, + "loss": 2.5271, + "step": 7908 + }, + { + "epoch": 0.638285852634977, + "grad_norm": 0.6760269403457642, + "learning_rate": 0.000133159854059064, + "loss": 2.7029, + "step": 7909 + }, + { + "epoch": 0.63836655637156, + "grad_norm": 0.7026536464691162, + "learning_rate": 0.0001331449599615295, + "loss": 2.592, + "step": 7910 + }, + { + "epoch": 0.638447260108143, + "grad_norm": 0.7935923933982849, + "learning_rate": 
0.000133130065037936, + "loss": 2.5674, + "step": 7911 + }, + { + "epoch": 0.638527963844726, + "grad_norm": 0.694675087928772, + "learning_rate": 0.00013311516928865466, + "loss": 2.6727, + "step": 7912 + }, + { + "epoch": 0.638608667581309, + "grad_norm": 0.7378186583518982, + "learning_rate": 0.00013310027271405672, + "loss": 2.5691, + "step": 7913 + }, + { + "epoch": 0.638689371317892, + "grad_norm": 0.7684193849563599, + "learning_rate": 0.00013308537531451345, + "loss": 2.5796, + "step": 7914 + }, + { + "epoch": 0.638770075054475, + "grad_norm": 0.6881510019302368, + "learning_rate": 0.00013307047709039619, + "loss": 2.6, + "step": 7915 + }, + { + "epoch": 0.638850778791058, + "grad_norm": 0.7341364026069641, + "learning_rate": 0.00013305557804207618, + "loss": 2.622, + "step": 7916 + }, + { + "epoch": 0.638931482527641, + "grad_norm": 0.7620663642883301, + "learning_rate": 0.00013304067816992474, + "loss": 2.5571, + "step": 7917 + }, + { + "epoch": 0.6390121862642241, + "grad_norm": 0.6929789781570435, + "learning_rate": 0.00013302577747431322, + "loss": 2.6204, + "step": 7918 + }, + { + "epoch": 0.639092890000807, + "grad_norm": 0.6942943334579468, + "learning_rate": 0.000133010875955613, + "loss": 2.6737, + "step": 7919 + }, + { + "epoch": 0.63917359373739, + "grad_norm": 0.69537752866745, + "learning_rate": 0.0001329959736141955, + "loss": 2.6105, + "step": 7920 + }, + { + "epoch": 0.639254297473973, + "grad_norm": 0.6690821051597595, + "learning_rate": 0.00013298107045043203, + "loss": 2.6279, + "step": 7921 + }, + { + "epoch": 0.6393350012105561, + "grad_norm": 0.7748103141784668, + "learning_rate": 0.00013296616646469412, + "loss": 2.6307, + "step": 7922 + }, + { + "epoch": 0.6394157049471391, + "grad_norm": 0.7509558200836182, + "learning_rate": 0.00013295126165735311, + "loss": 2.6388, + "step": 7923 + }, + { + "epoch": 0.639496408683722, + "grad_norm": 0.7641764283180237, + "learning_rate": 0.0001329363560287806, + "loss": 2.5819, + "step": 7924 + 
}, + { + "epoch": 0.639577112420305, + "grad_norm": 0.6912327408790588, + "learning_rate": 0.00013292144957934794, + "loss": 2.5588, + "step": 7925 + }, + { + "epoch": 0.6396578161568881, + "grad_norm": 0.7568803429603577, + "learning_rate": 0.0001329065423094267, + "loss": 2.5627, + "step": 7926 + }, + { + "epoch": 0.6397385198934711, + "grad_norm": 0.7272306084632874, + "learning_rate": 0.00013289163421938843, + "loss": 2.6101, + "step": 7927 + }, + { + "epoch": 0.6398192236300541, + "grad_norm": 0.6965963840484619, + "learning_rate": 0.00013287672530960465, + "loss": 2.5967, + "step": 7928 + }, + { + "epoch": 0.639899927366637, + "grad_norm": 0.7729843854904175, + "learning_rate": 0.00013286181558044694, + "loss": 2.6222, + "step": 7929 + }, + { + "epoch": 0.6399806311032201, + "grad_norm": 0.6876606941223145, + "learning_rate": 0.00013284690503228687, + "loss": 2.6162, + "step": 7930 + }, + { + "epoch": 0.6400613348398031, + "grad_norm": 0.7555204629898071, + "learning_rate": 0.0001328319936654961, + "loss": 2.588, + "step": 7931 + }, + { + "epoch": 0.6401420385763861, + "grad_norm": 0.7324720621109009, + "learning_rate": 0.0001328170814804462, + "loss": 2.6111, + "step": 7932 + }, + { + "epoch": 0.640222742312969, + "grad_norm": 0.6802392601966858, + "learning_rate": 0.0001328021684775088, + "loss": 2.5955, + "step": 7933 + }, + { + "epoch": 0.6403034460495521, + "grad_norm": 0.7564330697059631, + "learning_rate": 0.00013278725465705568, + "loss": 2.5355, + "step": 7934 + }, + { + "epoch": 0.6403841497861351, + "grad_norm": 0.6916235089302063, + "learning_rate": 0.00013277234001945844, + "loss": 2.6037, + "step": 7935 + }, + { + "epoch": 0.6404648535227181, + "grad_norm": 0.688819169998169, + "learning_rate": 0.00013275742456508885, + "loss": 2.5626, + "step": 7936 + }, + { + "epoch": 0.6405455572593011, + "grad_norm": 0.6647922992706299, + "learning_rate": 0.0001327425082943186, + "loss": 2.6166, + "step": 7937 + }, + { + "epoch": 0.6406262609958842, + 
"grad_norm": 0.6792626976966858, + "learning_rate": 0.00013272759120751943, + "loss": 2.6206, + "step": 7938 + }, + { + "epoch": 0.6407069647324671, + "grad_norm": 0.6482827663421631, + "learning_rate": 0.00013271267330506312, + "loss": 2.5558, + "step": 7939 + }, + { + "epoch": 0.6407876684690501, + "grad_norm": 0.6628372073173523, + "learning_rate": 0.0001326977545873215, + "loss": 2.5904, + "step": 7940 + }, + { + "epoch": 0.6408683722056331, + "grad_norm": 0.7168916463851929, + "learning_rate": 0.00013268283505466635, + "loss": 2.5189, + "step": 7941 + }, + { + "epoch": 0.6409490759422162, + "grad_norm": 0.6691678762435913, + "learning_rate": 0.00013266791470746957, + "loss": 2.608, + "step": 7942 + }, + { + "epoch": 0.6410297796787991, + "grad_norm": 0.6850359439849854, + "learning_rate": 0.00013265299354610292, + "loss": 2.5929, + "step": 7943 + }, + { + "epoch": 0.6411104834153821, + "grad_norm": 0.6807669401168823, + "learning_rate": 0.0001326380715709383, + "loss": 2.6016, + "step": 7944 + }, + { + "epoch": 0.6411911871519651, + "grad_norm": 0.6450446844100952, + "learning_rate": 0.00013262314878234767, + "loss": 2.6129, + "step": 7945 + }, + { + "epoch": 0.6412718908885482, + "grad_norm": 0.679115355014801, + "learning_rate": 0.00013260822518070285, + "loss": 2.6049, + "step": 7946 + }, + { + "epoch": 0.6413525946251312, + "grad_norm": 0.7082008123397827, + "learning_rate": 0.00013259330076637583, + "loss": 2.5673, + "step": 7947 + }, + { + "epoch": 0.6414332983617141, + "grad_norm": 0.7357851266860962, + "learning_rate": 0.00013257837553973855, + "loss": 2.6118, + "step": 7948 + }, + { + "epoch": 0.6415140020982971, + "grad_norm": 0.687035083770752, + "learning_rate": 0.000132563449501163, + "loss": 2.5359, + "step": 7949 + }, + { + "epoch": 0.6415947058348802, + "grad_norm": 0.6950698494911194, + "learning_rate": 0.00013254852265102117, + "loss": 2.5527, + "step": 7950 + }, + { + "epoch": 0.6416754095714632, + "grad_norm": 0.6878959536552429, + 
"learning_rate": 0.00013253359498968507, + "loss": 2.611, + "step": 7951 + }, + { + "epoch": 0.6417561133080462, + "grad_norm": 0.7224605083465576, + "learning_rate": 0.00013251866651752675, + "loss": 2.5459, + "step": 7952 + }, + { + "epoch": 0.6418368170446291, + "grad_norm": 0.7299731969833374, + "learning_rate": 0.00013250373723491826, + "loss": 2.5651, + "step": 7953 + }, + { + "epoch": 0.6419175207812122, + "grad_norm": 0.7663037776947021, + "learning_rate": 0.00013248880714223163, + "loss": 2.6073, + "step": 7954 + }, + { + "epoch": 0.6419982245177952, + "grad_norm": 0.6532007455825806, + "learning_rate": 0.00013247387623983902, + "loss": 2.6087, + "step": 7955 + }, + { + "epoch": 0.6420789282543782, + "grad_norm": 0.7520449757575989, + "learning_rate": 0.00013245894452811255, + "loss": 2.5998, + "step": 7956 + }, + { + "epoch": 0.6421596319909612, + "grad_norm": 0.7196050882339478, + "learning_rate": 0.0001324440120074243, + "loss": 2.6448, + "step": 7957 + }, + { + "epoch": 0.6422403357275441, + "grad_norm": 0.7093806862831116, + "learning_rate": 0.0001324290786781465, + "loss": 2.5935, + "step": 7958 + }, + { + "epoch": 0.6423210394641272, + "grad_norm": 0.695541501045227, + "learning_rate": 0.00013241414454065125, + "loss": 2.5872, + "step": 7959 + }, + { + "epoch": 0.6424017432007102, + "grad_norm": 0.6763006448745728, + "learning_rate": 0.0001323992095953108, + "loss": 2.572, + "step": 7960 + }, + { + "epoch": 0.6424824469372932, + "grad_norm": 0.6403522491455078, + "learning_rate": 0.00013238427384249738, + "loss": 2.6137, + "step": 7961 + }, + { + "epoch": 0.6425631506738761, + "grad_norm": 0.6647571325302124, + "learning_rate": 0.00013236933728258315, + "loss": 2.5904, + "step": 7962 + }, + { + "epoch": 0.6426438544104592, + "grad_norm": 0.6931071877479553, + "learning_rate": 0.0001323543999159405, + "loss": 2.6085, + "step": 7963 + }, + { + "epoch": 0.6427245581470422, + "grad_norm": 0.6899439096450806, + "learning_rate": 0.00013233946174294155, + 
"loss": 2.5555, + "step": 7964 + }, + { + "epoch": 0.6428052618836252, + "grad_norm": 0.6564984321594238, + "learning_rate": 0.0001323245227639587, + "loss": 2.576, + "step": 7965 + }, + { + "epoch": 0.6428859656202082, + "grad_norm": 0.7427607774734497, + "learning_rate": 0.00013230958297936427, + "loss": 2.6178, + "step": 7966 + }, + { + "epoch": 0.6429666693567913, + "grad_norm": 0.6884508728981018, + "learning_rate": 0.00013229464238953054, + "loss": 2.6519, + "step": 7967 + }, + { + "epoch": 0.6430473730933742, + "grad_norm": 0.692442774772644, + "learning_rate": 0.00013227970099482993, + "loss": 2.5784, + "step": 7968 + }, + { + "epoch": 0.6431280768299572, + "grad_norm": 0.6637876629829407, + "learning_rate": 0.00013226475879563477, + "loss": 2.5785, + "step": 7969 + }, + { + "epoch": 0.6432087805665402, + "grad_norm": 0.6844972372055054, + "learning_rate": 0.0001322498157923175, + "loss": 2.5745, + "step": 7970 + }, + { + "epoch": 0.6432894843031233, + "grad_norm": 0.7259756922721863, + "learning_rate": 0.0001322348719852505, + "loss": 2.5696, + "step": 7971 + }, + { + "epoch": 0.6433701880397062, + "grad_norm": 0.6719023585319519, + "learning_rate": 0.00013221992737480625, + "loss": 2.6049, + "step": 7972 + }, + { + "epoch": 0.6434508917762892, + "grad_norm": 0.7160155773162842, + "learning_rate": 0.00013220498196135717, + "loss": 2.572, + "step": 7973 + }, + { + "epoch": 0.6435315955128722, + "grad_norm": 0.6920225620269775, + "learning_rate": 0.00013219003574527576, + "loss": 2.6576, + "step": 7974 + }, + { + "epoch": 0.6436122992494553, + "grad_norm": 0.698518693447113, + "learning_rate": 0.0001321750887269345, + "loss": 2.6074, + "step": 7975 + }, + { + "epoch": 0.6436930029860383, + "grad_norm": 0.7607932090759277, + "learning_rate": 0.00013216014090670594, + "loss": 2.6173, + "step": 7976 + }, + { + "epoch": 0.6437737067226212, + "grad_norm": 0.8130847811698914, + "learning_rate": 0.0001321451922849626, + "loss": 2.6023, + "step": 7977 + }, + { + 
"epoch": 0.6438544104592042, + "grad_norm": 0.676675021648407, + "learning_rate": 0.00013213024286207702, + "loss": 2.6174, + "step": 7978 + }, + { + "epoch": 0.6439351141957873, + "grad_norm": 0.7018851041793823, + "learning_rate": 0.00013211529263842183, + "loss": 2.5713, + "step": 7979 + }, + { + "epoch": 0.6440158179323703, + "grad_norm": 0.796097457408905, + "learning_rate": 0.00013210034161436954, + "loss": 2.5937, + "step": 7980 + }, + { + "epoch": 0.6440965216689533, + "grad_norm": 0.7118527293205261, + "learning_rate": 0.0001320853897902929, + "loss": 2.5721, + "step": 7981 + }, + { + "epoch": 0.6441772254055362, + "grad_norm": 0.7282249331474304, + "learning_rate": 0.00013207043716656445, + "loss": 2.5975, + "step": 7982 + }, + { + "epoch": 0.6442579291421193, + "grad_norm": 0.6710900664329529, + "learning_rate": 0.00013205548374355686, + "loss": 2.5809, + "step": 7983 + }, + { + "epoch": 0.6443386328787023, + "grad_norm": 0.7045658230781555, + "learning_rate": 0.00013204052952164278, + "loss": 2.5715, + "step": 7984 + }, + { + "epoch": 0.6444193366152853, + "grad_norm": 0.719507098197937, + "learning_rate": 0.00013202557450119504, + "loss": 2.5948, + "step": 7985 + }, + { + "epoch": 0.6445000403518683, + "grad_norm": 0.7603922486305237, + "learning_rate": 0.0001320106186825862, + "loss": 2.6176, + "step": 7986 + }, + { + "epoch": 0.6445807440884513, + "grad_norm": 0.7057444453239441, + "learning_rate": 0.0001319956620661891, + "loss": 2.5905, + "step": 7987 + }, + { + "epoch": 0.6446614478250343, + "grad_norm": 0.7884874939918518, + "learning_rate": 0.00013198070465237645, + "loss": 2.5892, + "step": 7988 + }, + { + "epoch": 0.6447421515616173, + "grad_norm": 0.6932834386825562, + "learning_rate": 0.00013196574644152103, + "loss": 2.6032, + "step": 7989 + }, + { + "epoch": 0.6448228552982003, + "grad_norm": 0.7361180186271667, + "learning_rate": 0.00013195078743399568, + "loss": 2.5877, + "step": 7990 + }, + { + "epoch": 0.6449035590347834, + 
"grad_norm": 0.6843615174293518, + "learning_rate": 0.00013193582763017315, + "loss": 2.5804, + "step": 7991 + }, + { + "epoch": 0.6449842627713663, + "grad_norm": 0.7592078447341919, + "learning_rate": 0.00013192086703042635, + "loss": 2.6464, + "step": 7992 + }, + { + "epoch": 0.6450649665079493, + "grad_norm": 0.7362154126167297, + "learning_rate": 0.0001319059056351281, + "loss": 2.6154, + "step": 7993 + }, + { + "epoch": 0.6451456702445323, + "grad_norm": 0.6721758246421814, + "learning_rate": 0.00013189094344465125, + "loss": 2.5735, + "step": 7994 + }, + { + "epoch": 0.6452263739811154, + "grad_norm": 0.6221550107002258, + "learning_rate": 0.00013187598045936874, + "loss": 2.5612, + "step": 7995 + }, + { + "epoch": 0.6453070777176984, + "grad_norm": 0.7225528359413147, + "learning_rate": 0.00013186101667965344, + "loss": 2.6263, + "step": 7996 + }, + { + "epoch": 0.6453877814542813, + "grad_norm": 0.7599418759346008, + "learning_rate": 0.00013184605210587837, + "loss": 2.5814, + "step": 7997 + }, + { + "epoch": 0.6454684851908643, + "grad_norm": 0.6778777837753296, + "learning_rate": 0.00013183108673841642, + "loss": 2.6158, + "step": 7998 + }, + { + "epoch": 0.6455491889274474, + "grad_norm": 0.6860963106155396, + "learning_rate": 0.00013181612057764058, + "loss": 2.6207, + "step": 7999 + }, + { + "epoch": 0.6456298926640304, + "grad_norm": 0.6615182757377625, + "learning_rate": 0.00013180115362392382, + "loss": 2.5571, + "step": 8000 + }, + { + "epoch": 0.6456298926640304, + "eval_loss": 2.5128066539764404, + "eval_runtime": 754.3655, + "eval_samples_per_second": 3.473, + "eval_steps_per_second": 0.579, + "step": 8000 + }, + { + "epoch": 0.6457105964006133, + "grad_norm": 0.688169538974762, + "learning_rate": 0.0001317861858776392, + "loss": 2.6513, + "step": 8001 + }, + { + "epoch": 0.6457913001371963, + "grad_norm": 0.6726182103157043, + "learning_rate": 0.00013177121733915975, + "loss": 2.5909, + "step": 8002 + }, + { + "epoch": 0.6458720038737794, + 
"grad_norm": 0.7348085641860962, + "learning_rate": 0.00013175624800885853, + "loss": 2.577, + "step": 8003 + }, + { + "epoch": 0.6459527076103624, + "grad_norm": 0.677435040473938, + "learning_rate": 0.00013174127788710856, + "loss": 2.5056, + "step": 8004 + }, + { + "epoch": 0.6460334113469454, + "grad_norm": 0.6864951848983765, + "learning_rate": 0.000131726306974283, + "loss": 2.5733, + "step": 8005 + }, + { + "epoch": 0.6461141150835283, + "grad_norm": 0.7070075869560242, + "learning_rate": 0.0001317113352707549, + "loss": 2.5359, + "step": 8006 + }, + { + "epoch": 0.6461948188201113, + "grad_norm": 0.7065049409866333, + "learning_rate": 0.00013169636277689746, + "loss": 2.6261, + "step": 8007 + }, + { + "epoch": 0.6462755225566944, + "grad_norm": 0.6691577434539795, + "learning_rate": 0.0001316813894930838, + "loss": 2.6015, + "step": 8008 + }, + { + "epoch": 0.6463562262932774, + "grad_norm": 0.6754019260406494, + "learning_rate": 0.0001316664154196871, + "loss": 2.5954, + "step": 8009 + }, + { + "epoch": 0.6464369300298604, + "grad_norm": 0.6172776818275452, + "learning_rate": 0.00013165144055708055, + "loss": 2.5599, + "step": 8010 + }, + { + "epoch": 0.6465176337664433, + "grad_norm": 0.6778094172477722, + "learning_rate": 0.00013163646490563737, + "loss": 2.5407, + "step": 8011 + }, + { + "epoch": 0.6465983375030264, + "grad_norm": 0.7363924980163574, + "learning_rate": 0.00013162148846573076, + "loss": 2.6075, + "step": 8012 + }, + { + "epoch": 0.6466790412396094, + "grad_norm": 0.6662711501121521, + "learning_rate": 0.00013160651123773404, + "loss": 2.5611, + "step": 8013 + }, + { + "epoch": 0.6467597449761924, + "grad_norm": 0.699670135974884, + "learning_rate": 0.00013159153322202043, + "loss": 2.5612, + "step": 8014 + }, + { + "epoch": 0.6468404487127754, + "grad_norm": 0.7382899522781372, + "learning_rate": 0.0001315765544189632, + "loss": 2.6017, + "step": 8015 + }, + { + "epoch": 0.6469211524493584, + "grad_norm": 0.7624868154525757, + 
"learning_rate": 0.0001315615748289357, + "loss": 2.6174, + "step": 8016 + }, + { + "epoch": 0.6470018561859414, + "grad_norm": 0.704622745513916, + "learning_rate": 0.00013154659445231129, + "loss": 2.5367, + "step": 8017 + }, + { + "epoch": 0.6470825599225244, + "grad_norm": 0.7117413878440857, + "learning_rate": 0.00013153161328946324, + "loss": 2.5958, + "step": 8018 + }, + { + "epoch": 0.6471632636591074, + "grad_norm": 0.6825408339500427, + "learning_rate": 0.00013151663134076497, + "loss": 2.5118, + "step": 8019 + }, + { + "epoch": 0.6472439673956905, + "grad_norm": 0.6732384562492371, + "learning_rate": 0.00013150164860658986, + "loss": 2.6312, + "step": 8020 + }, + { + "epoch": 0.6473246711322734, + "grad_norm": 0.712812602519989, + "learning_rate": 0.00013148666508731134, + "loss": 2.576, + "step": 8021 + }, + { + "epoch": 0.6474053748688564, + "grad_norm": 0.8128857612609863, + "learning_rate": 0.0001314716807833028, + "loss": 2.5333, + "step": 8022 + }, + { + "epoch": 0.6474860786054394, + "grad_norm": 0.7817162275314331, + "learning_rate": 0.00013145669569493773, + "loss": 2.6835, + "step": 8023 + }, + { + "epoch": 0.6475667823420225, + "grad_norm": 0.7164301872253418, + "learning_rate": 0.00013144170982258956, + "loss": 2.5573, + "step": 8024 + }, + { + "epoch": 0.6476474860786054, + "grad_norm": 0.67625892162323, + "learning_rate": 0.00013142672316663177, + "loss": 2.5976, + "step": 8025 + }, + { + "epoch": 0.6477281898151884, + "grad_norm": 0.6919494867324829, + "learning_rate": 0.0001314117357274379, + "loss": 2.6179, + "step": 8026 + }, + { + "epoch": 0.6478088935517714, + "grad_norm": 0.6787464618682861, + "learning_rate": 0.0001313967475053815, + "loss": 2.5405, + "step": 8027 + }, + { + "epoch": 0.6478895972883545, + "grad_norm": 0.6305621862411499, + "learning_rate": 0.00013138175850083605, + "loss": 2.6016, + "step": 8028 + }, + { + "epoch": 0.6479703010249375, + "grad_norm": 0.7456182837486267, + "learning_rate": 0.00013136676871417516, + 
"loss": 2.6091, + "step": 8029 + }, + { + "epoch": 0.6480510047615204, + "grad_norm": 0.7047890424728394, + "learning_rate": 0.00013135177814577238, + "loss": 2.6108, + "step": 8030 + }, + { + "epoch": 0.6481317084981034, + "grad_norm": 0.7509389519691467, + "learning_rate": 0.00013133678679600133, + "loss": 2.6396, + "step": 8031 + }, + { + "epoch": 0.6482124122346865, + "grad_norm": 0.63836270570755, + "learning_rate": 0.00013132179466523566, + "loss": 2.5759, + "step": 8032 + }, + { + "epoch": 0.6482931159712695, + "grad_norm": 0.6994885206222534, + "learning_rate": 0.000131306801753849, + "loss": 2.61, + "step": 8033 + }, + { + "epoch": 0.6483738197078525, + "grad_norm": 0.6762083768844604, + "learning_rate": 0.00013129180806221497, + "loss": 2.5431, + "step": 8034 + }, + { + "epoch": 0.6484545234444354, + "grad_norm": 0.6890944242477417, + "learning_rate": 0.0001312768135907073, + "loss": 2.5922, + "step": 8035 + }, + { + "epoch": 0.6485352271810185, + "grad_norm": 0.7409473061561584, + "learning_rate": 0.0001312618183396997, + "loss": 2.6132, + "step": 8036 + }, + { + "epoch": 0.6486159309176015, + "grad_norm": 0.6660643815994263, + "learning_rate": 0.00013124682230956585, + "loss": 2.5816, + "step": 8037 + }, + { + "epoch": 0.6486966346541845, + "grad_norm": 0.714235246181488, + "learning_rate": 0.0001312318255006795, + "loss": 2.5613, + "step": 8038 + }, + { + "epoch": 0.6487773383907675, + "grad_norm": 0.6568472385406494, + "learning_rate": 0.00013121682791341442, + "loss": 2.6382, + "step": 8039 + }, + { + "epoch": 0.6488580421273505, + "grad_norm": 0.6874251961708069, + "learning_rate": 0.00013120182954814438, + "loss": 2.593, + "step": 8040 + }, + { + "epoch": 0.6489387458639335, + "grad_norm": 0.7620158791542053, + "learning_rate": 0.0001311868304052432, + "loss": 2.589, + "step": 8041 + }, + { + "epoch": 0.6490194496005165, + "grad_norm": 0.6755926609039307, + "learning_rate": 0.00013117183048508467, + "loss": 2.5876, + "step": 8042 + }, + { + 
"epoch": 0.6491001533370995, + "grad_norm": 0.6952808499336243, + "learning_rate": 0.00013115682978804264, + "loss": 2.5909, + "step": 8043 + }, + { + "epoch": 0.6491808570736826, + "grad_norm": 0.6599535346031189, + "learning_rate": 0.00013114182831449098, + "loss": 2.6031, + "step": 8044 + }, + { + "epoch": 0.6492615608102655, + "grad_norm": 0.7816598415374756, + "learning_rate": 0.00013112682606480355, + "loss": 2.5633, + "step": 8045 + }, + { + "epoch": 0.6493422645468485, + "grad_norm": 0.7188639640808105, + "learning_rate": 0.00013111182303935425, + "loss": 2.6292, + "step": 8046 + }, + { + "epoch": 0.6494229682834315, + "grad_norm": 0.7131505608558655, + "learning_rate": 0.00013109681923851698, + "loss": 2.5729, + "step": 8047 + }, + { + "epoch": 0.6495036720200146, + "grad_norm": 0.7466408014297485, + "learning_rate": 0.00013108181466266568, + "loss": 2.5742, + "step": 8048 + }, + { + "epoch": 0.6495843757565976, + "grad_norm": 0.6707943677902222, + "learning_rate": 0.00013106680931217437, + "loss": 2.5506, + "step": 8049 + }, + { + "epoch": 0.6496650794931805, + "grad_norm": 0.6913424730300903, + "learning_rate": 0.0001310518031874169, + "loss": 2.5639, + "step": 8050 + }, + { + "epoch": 0.6497457832297635, + "grad_norm": 0.8261755704879761, + "learning_rate": 0.00013103679628876733, + "loss": 2.601, + "step": 8051 + }, + { + "epoch": 0.6498264869663466, + "grad_norm": 0.7410566806793213, + "learning_rate": 0.0001310217886165997, + "loss": 2.5326, + "step": 8052 + }, + { + "epoch": 0.6499071907029296, + "grad_norm": 0.7032365202903748, + "learning_rate": 0.00013100678017128798, + "loss": 2.5907, + "step": 8053 + }, + { + "epoch": 0.6499878944395125, + "grad_norm": 0.7074568271636963, + "learning_rate": 0.00013099177095320626, + "loss": 2.6193, + "step": 8054 + }, + { + "epoch": 0.6500685981760955, + "grad_norm": 0.7754546999931335, + "learning_rate": 0.00013097676096272855, + "loss": 2.5832, + "step": 8055 + }, + { + "epoch": 0.6501493019126786, + 
"grad_norm": 0.7475717663764954, + "learning_rate": 0.00013096175020022903, + "loss": 2.6233, + "step": 8056 + }, + { + "epoch": 0.6502300056492616, + "grad_norm": 0.7863949537277222, + "learning_rate": 0.00013094673866608173, + "loss": 2.5745, + "step": 8057 + }, + { + "epoch": 0.6503107093858446, + "grad_norm": 0.69294673204422, + "learning_rate": 0.0001309317263606608, + "loss": 2.5982, + "step": 8058 + }, + { + "epoch": 0.6503914131224275, + "grad_norm": 0.7096135020256042, + "learning_rate": 0.00013091671328434046, + "loss": 2.5944, + "step": 8059 + }, + { + "epoch": 0.6504721168590105, + "grad_norm": 0.7001097202301025, + "learning_rate": 0.00013090169943749476, + "loss": 2.5435, + "step": 8060 + }, + { + "epoch": 0.6505528205955936, + "grad_norm": 0.7522539496421814, + "learning_rate": 0.00013088668482049792, + "loss": 2.5843, + "step": 8061 + }, + { + "epoch": 0.6506335243321766, + "grad_norm": 0.6675420999526978, + "learning_rate": 0.00013087166943372418, + "loss": 2.5623, + "step": 8062 + }, + { + "epoch": 0.6507142280687596, + "grad_norm": 0.7779181599617004, + "learning_rate": 0.00013085665327754772, + "loss": 2.6087, + "step": 8063 + }, + { + "epoch": 0.6507949318053425, + "grad_norm": 0.7385239005088806, + "learning_rate": 0.00013084163635234284, + "loss": 2.5725, + "step": 8064 + }, + { + "epoch": 0.6508756355419256, + "grad_norm": 0.6966612339019775, + "learning_rate": 0.00013082661865848375, + "loss": 2.5745, + "step": 8065 + }, + { + "epoch": 0.6509563392785086, + "grad_norm": 0.7098337411880493, + "learning_rate": 0.00013081160019634468, + "loss": 2.5461, + "step": 8066 + }, + { + "epoch": 0.6510370430150916, + "grad_norm": 0.6514503359794617, + "learning_rate": 0.00013079658096630002, + "loss": 2.5869, + "step": 8067 + }, + { + "epoch": 0.6511177467516746, + "grad_norm": 0.680422306060791, + "learning_rate": 0.0001307815609687241, + "loss": 2.6316, + "step": 8068 + }, + { + "epoch": 0.6511984504882576, + "grad_norm": 0.6892665028572083, + 
"learning_rate": 0.00013076654020399117, + "loss": 2.5862, + "step": 8069 + }, + { + "epoch": 0.6512791542248406, + "grad_norm": 0.7605568170547485, + "learning_rate": 0.00013075151867247568, + "loss": 2.5342, + "step": 8070 + }, + { + "epoch": 0.6513598579614236, + "grad_norm": 0.7571204900741577, + "learning_rate": 0.00013073649637455192, + "loss": 2.5762, + "step": 8071 + }, + { + "epoch": 0.6514405616980066, + "grad_norm": 0.6910812258720398, + "learning_rate": 0.00013072147331059431, + "loss": 2.6635, + "step": 8072 + }, + { + "epoch": 0.6515212654345897, + "grad_norm": 0.765559196472168, + "learning_rate": 0.00013070644948097733, + "loss": 2.5885, + "step": 8073 + }, + { + "epoch": 0.6516019691711726, + "grad_norm": 0.7533665299415588, + "learning_rate": 0.00013069142488607532, + "loss": 2.6545, + "step": 8074 + }, + { + "epoch": 0.6516826729077556, + "grad_norm": 0.685089647769928, + "learning_rate": 0.0001306763995262628, + "loss": 2.5955, + "step": 8075 + }, + { + "epoch": 0.6517633766443386, + "grad_norm": 0.7280653715133667, + "learning_rate": 0.00013066137340191422, + "loss": 2.5548, + "step": 8076 + }, + { + "epoch": 0.6518440803809217, + "grad_norm": 0.6881482601165771, + "learning_rate": 0.00013064634651340404, + "loss": 2.6143, + "step": 8077 + }, + { + "epoch": 0.6519247841175047, + "grad_norm": 0.6878265142440796, + "learning_rate": 0.0001306313188611068, + "loss": 2.5681, + "step": 8078 + }, + { + "epoch": 0.6520054878540876, + "grad_norm": 0.685238242149353, + "learning_rate": 0.00013061629044539702, + "loss": 2.5517, + "step": 8079 + }, + { + "epoch": 0.6520861915906706, + "grad_norm": 0.6689820885658264, + "learning_rate": 0.00013060126126664928, + "loss": 2.6201, + "step": 8080 + }, + { + "epoch": 0.6521668953272537, + "grad_norm": 0.7128999829292297, + "learning_rate": 0.00013058623132523807, + "loss": 2.5829, + "step": 8081 + }, + { + "epoch": 0.6522475990638367, + "grad_norm": 0.6835216879844666, + "learning_rate": 0.00013057120062153805, 
+ "loss": 2.6312, + "step": 8082 + }, + { + "epoch": 0.6523283028004196, + "grad_norm": 0.7140012383460999, + "learning_rate": 0.00013055616915592382, + "loss": 2.6148, + "step": 8083 + }, + { + "epoch": 0.6524090065370026, + "grad_norm": 0.7378252148628235, + "learning_rate": 0.00013054113692876994, + "loss": 2.5805, + "step": 8084 + }, + { + "epoch": 0.6524897102735857, + "grad_norm": 0.7569258213043213, + "learning_rate": 0.0001305261039404511, + "loss": 2.6088, + "step": 8085 + }, + { + "epoch": 0.6525704140101687, + "grad_norm": 0.6909007430076599, + "learning_rate": 0.00013051107019134195, + "loss": 2.5285, + "step": 8086 + }, + { + "epoch": 0.6526511177467517, + "grad_norm": 0.6785587072372437, + "learning_rate": 0.0001304960356818172, + "loss": 2.5527, + "step": 8087 + }, + { + "epoch": 0.6527318214833346, + "grad_norm": 0.7058801054954529, + "learning_rate": 0.0001304810004122515, + "loss": 2.6789, + "step": 8088 + }, + { + "epoch": 0.6528125252199177, + "grad_norm": 0.6920512318611145, + "learning_rate": 0.0001304659643830196, + "loss": 2.5748, + "step": 8089 + }, + { + "epoch": 0.6528932289565007, + "grad_norm": 0.6829244494438171, + "learning_rate": 0.00013045092759449625, + "loss": 2.5389, + "step": 8090 + }, + { + "epoch": 0.6529739326930837, + "grad_norm": 0.6942421793937683, + "learning_rate": 0.00013043589004705614, + "loss": 2.5851, + "step": 8091 + }, + { + "epoch": 0.6530546364296667, + "grad_norm": 0.6473072171211243, + "learning_rate": 0.0001304208517410741, + "loss": 2.56, + "step": 8092 + }, + { + "epoch": 0.6531353401662497, + "grad_norm": 0.6692056655883789, + "learning_rate": 0.00013040581267692494, + "loss": 2.5977, + "step": 8093 + }, + { + "epoch": 0.6532160439028327, + "grad_norm": 0.6918915510177612, + "learning_rate": 0.00013039077285498344, + "loss": 2.551, + "step": 8094 + }, + { + "epoch": 0.6532967476394157, + "grad_norm": 0.7432852387428284, + "learning_rate": 0.00013037573227562443, + "loss": 2.5537, + "step": 8095 + }, + { + 
"epoch": 0.6533774513759987, + "grad_norm": 0.6737081408500671, + "learning_rate": 0.0001303606909392228, + "loss": 2.5947, + "step": 8096 + }, + { + "epoch": 0.6534581551125818, + "grad_norm": 0.6810599565505981, + "learning_rate": 0.0001303456488461533, + "loss": 2.5704, + "step": 8097 + }, + { + "epoch": 0.6535388588491647, + "grad_norm": 0.675240159034729, + "learning_rate": 0.00013033060599679098, + "loss": 2.591, + "step": 8098 + }, + { + "epoch": 0.6536195625857477, + "grad_norm": 0.6888695359230042, + "learning_rate": 0.00013031556239151066, + "loss": 2.5403, + "step": 8099 + }, + { + "epoch": 0.6537002663223307, + "grad_norm": 0.7154796719551086, + "learning_rate": 0.00013030051803068727, + "loss": 2.5654, + "step": 8100 + }, + { + "epoch": 0.6537809700589138, + "grad_norm": 0.6655243635177612, + "learning_rate": 0.0001302854729146958, + "loss": 2.5867, + "step": 8101 + }, + { + "epoch": 0.6538616737954968, + "grad_norm": 0.7070788145065308, + "learning_rate": 0.00013027042704391115, + "loss": 2.5593, + "step": 8102 + }, + { + "epoch": 0.6539423775320797, + "grad_norm": 0.7071834206581116, + "learning_rate": 0.0001302553804187083, + "loss": 2.536, + "step": 8103 + }, + { + "epoch": 0.6540230812686627, + "grad_norm": 0.7086542248725891, + "learning_rate": 0.00013024033303946233, + "loss": 2.5644, + "step": 8104 + }, + { + "epoch": 0.6541037850052458, + "grad_norm": 0.6714556813240051, + "learning_rate": 0.00013022528490654818, + "loss": 2.5167, + "step": 8105 + }, + { + "epoch": 0.6541844887418288, + "grad_norm": 0.6905114054679871, + "learning_rate": 0.00013021023602034095, + "loss": 2.5227, + "step": 8106 + }, + { + "epoch": 0.6542651924784118, + "grad_norm": 0.7050586342811584, + "learning_rate": 0.00013019518638121563, + "loss": 2.5725, + "step": 8107 + }, + { + "epoch": 0.6543458962149947, + "grad_norm": 0.6940500736236572, + "learning_rate": 0.00013018013598954737, + "loss": 2.5912, + "step": 8108 + }, + { + "epoch": 0.6544265999515777, + "grad_norm": 
0.7136965990066528, + "learning_rate": 0.00013016508484571122, + "loss": 2.6101, + "step": 8109 + }, + { + "epoch": 0.6545073036881608, + "grad_norm": 0.7205774188041687, + "learning_rate": 0.0001301500329500823, + "loss": 2.5869, + "step": 8110 + }, + { + "epoch": 0.6545880074247438, + "grad_norm": 0.6831154823303223, + "learning_rate": 0.00013013498030303575, + "loss": 2.5309, + "step": 8111 + }, + { + "epoch": 0.6546687111613267, + "grad_norm": 0.6778538823127747, + "learning_rate": 0.0001301199269049467, + "loss": 2.6297, + "step": 8112 + }, + { + "epoch": 0.6547494148979097, + "grad_norm": 0.705055832862854, + "learning_rate": 0.00013010487275619034, + "loss": 2.6188, + "step": 8113 + }, + { + "epoch": 0.6548301186344928, + "grad_norm": 0.6927980780601501, + "learning_rate": 0.00013008981785714188, + "loss": 2.5744, + "step": 8114 + }, + { + "epoch": 0.6549108223710758, + "grad_norm": 0.7070884108543396, + "learning_rate": 0.0001300747622081765, + "loss": 2.618, + "step": 8115 + }, + { + "epoch": 0.6549915261076588, + "grad_norm": 0.723479688167572, + "learning_rate": 0.0001300597058096694, + "loss": 2.5928, + "step": 8116 + }, + { + "epoch": 0.6550722298442417, + "grad_norm": 0.6689562201499939, + "learning_rate": 0.00013004464866199587, + "loss": 2.5592, + "step": 8117 + }, + { + "epoch": 0.6551529335808248, + "grad_norm": 0.6685079336166382, + "learning_rate": 0.00013002959076553115, + "loss": 2.558, + "step": 8118 + }, + { + "epoch": 0.6552336373174078, + "grad_norm": 0.678105890750885, + "learning_rate": 0.00013001453212065057, + "loss": 2.6176, + "step": 8119 + }, + { + "epoch": 0.6553143410539908, + "grad_norm": 0.7355597019195557, + "learning_rate": 0.00012999947272772933, + "loss": 2.6293, + "step": 8120 + }, + { + "epoch": 0.6553950447905738, + "grad_norm": 0.735862672328949, + "learning_rate": 0.00012998441258714284, + "loss": 2.635, + "step": 8121 + }, + { + "epoch": 0.6554757485271568, + "grad_norm": 0.6766025424003601, + "learning_rate": 
0.0001299693516992664, + "loss": 2.5829, + "step": 8122 + }, + { + "epoch": 0.6555564522637398, + "grad_norm": 0.6701885461807251, + "learning_rate": 0.00012995429006447542, + "loss": 2.5996, + "step": 8123 + }, + { + "epoch": 0.6556371560003228, + "grad_norm": 0.6814082264900208, + "learning_rate": 0.00012993922768314518, + "loss": 2.5906, + "step": 8124 + }, + { + "epoch": 0.6557178597369058, + "grad_norm": 0.7104958295822144, + "learning_rate": 0.00012992416455565113, + "loss": 2.6708, + "step": 8125 + }, + { + "epoch": 0.6557985634734889, + "grad_norm": 0.6451221108436584, + "learning_rate": 0.0001299091006823687, + "loss": 2.5512, + "step": 8126 + }, + { + "epoch": 0.6558792672100718, + "grad_norm": 0.6736068725585938, + "learning_rate": 0.0001298940360636733, + "loss": 2.5839, + "step": 8127 + }, + { + "epoch": 0.6559599709466548, + "grad_norm": 0.6873149871826172, + "learning_rate": 0.00012987897069994031, + "loss": 2.5804, + "step": 8128 + }, + { + "epoch": 0.6560406746832378, + "grad_norm": 0.6937728524208069, + "learning_rate": 0.00012986390459154533, + "loss": 2.5648, + "step": 8129 + }, + { + "epoch": 0.6561213784198209, + "grad_norm": 0.7109464406967163, + "learning_rate": 0.00012984883773886377, + "loss": 2.6132, + "step": 8130 + }, + { + "epoch": 0.6562020821564039, + "grad_norm": 0.7134159803390503, + "learning_rate": 0.00012983377014227115, + "loss": 2.6029, + "step": 8131 + }, + { + "epoch": 0.6562827858929868, + "grad_norm": 0.6788110733032227, + "learning_rate": 0.000129818701802143, + "loss": 2.6344, + "step": 8132 + }, + { + "epoch": 0.6563634896295698, + "grad_norm": 0.6798231601715088, + "learning_rate": 0.00012980363271885483, + "loss": 2.5758, + "step": 8133 + }, + { + "epoch": 0.6564441933661529, + "grad_norm": 0.6586930155754089, + "learning_rate": 0.00012978856289278226, + "loss": 2.5918, + "step": 8134 + }, + { + "epoch": 0.6565248971027359, + "grad_norm": 0.6614218950271606, + "learning_rate": 0.0001297734923243008, + "loss": 2.5777, 
+ "step": 8135 + }, + { + "epoch": 0.6566056008393188, + "grad_norm": 0.6874340176582336, + "learning_rate": 0.0001297584210137861, + "loss": 2.5528, + "step": 8136 + }, + { + "epoch": 0.6566863045759018, + "grad_norm": 0.6972174048423767, + "learning_rate": 0.00012974334896161376, + "loss": 2.6551, + "step": 8137 + }, + { + "epoch": 0.6567670083124849, + "grad_norm": 0.7414106726646423, + "learning_rate": 0.0001297282761681594, + "loss": 2.5719, + "step": 8138 + }, + { + "epoch": 0.6568477120490679, + "grad_norm": 0.6678279042243958, + "learning_rate": 0.00012971320263379868, + "loss": 2.555, + "step": 8139 + }, + { + "epoch": 0.6569284157856509, + "grad_norm": 0.692149817943573, + "learning_rate": 0.0001296981283589073, + "loss": 2.5991, + "step": 8140 + }, + { + "epoch": 0.6570091195222338, + "grad_norm": 0.6937025189399719, + "learning_rate": 0.00012968305334386094, + "loss": 2.5635, + "step": 8141 + }, + { + "epoch": 0.6570898232588169, + "grad_norm": 0.6250358819961548, + "learning_rate": 0.00012966797758903528, + "loss": 2.55, + "step": 8142 + }, + { + "epoch": 0.6571705269953999, + "grad_norm": 0.7388221025466919, + "learning_rate": 0.00012965290109480607, + "loss": 2.5307, + "step": 8143 + }, + { + "epoch": 0.6572512307319829, + "grad_norm": 0.7165891528129578, + "learning_rate": 0.00012963782386154904, + "loss": 2.5482, + "step": 8144 + }, + { + "epoch": 0.6573319344685659, + "grad_norm": 0.7605282068252563, + "learning_rate": 0.00012962274588963996, + "loss": 2.5839, + "step": 8145 + }, + { + "epoch": 0.657412638205149, + "grad_norm": 0.7259613275527954, + "learning_rate": 0.00012960766717945465, + "loss": 2.5612, + "step": 8146 + }, + { + "epoch": 0.6574933419417319, + "grad_norm": 0.7301480770111084, + "learning_rate": 0.00012959258773136885, + "loss": 2.5365, + "step": 8147 + }, + { + "epoch": 0.6575740456783149, + "grad_norm": 0.6800966262817383, + "learning_rate": 0.0001295775075457584, + "loss": 2.5663, + "step": 8148 + }, + { + "epoch": 
0.6576547494148979, + "grad_norm": 0.6968960165977478, + "learning_rate": 0.0001295624266229992, + "loss": 2.5626, + "step": 8149 + }, + { + "epoch": 0.657735453151481, + "grad_norm": 0.9044952392578125, + "learning_rate": 0.00012954734496346704, + "loss": 2.6479, + "step": 8150 + }, + { + "epoch": 0.6578161568880639, + "grad_norm": 0.6955156922340393, + "learning_rate": 0.00012953226256753777, + "loss": 2.5879, + "step": 8151 + }, + { + "epoch": 0.6578968606246469, + "grad_norm": 0.6535033583641052, + "learning_rate": 0.00012951717943558735, + "loss": 2.5372, + "step": 8152 + }, + { + "epoch": 0.6579775643612299, + "grad_norm": 0.720730721950531, + "learning_rate": 0.0001295020955679916, + "loss": 2.5813, + "step": 8153 + }, + { + "epoch": 0.658058268097813, + "grad_norm": 0.7190384268760681, + "learning_rate": 0.00012948701096512655, + "loss": 2.5923, + "step": 8154 + }, + { + "epoch": 0.658138971834396, + "grad_norm": 0.6624464988708496, + "learning_rate": 0.0001294719256273681, + "loss": 2.5548, + "step": 8155 + }, + { + "epoch": 0.6582196755709789, + "grad_norm": 0.7839831709861755, + "learning_rate": 0.00012945683955509224, + "loss": 2.531, + "step": 8156 + }, + { + "epoch": 0.6583003793075619, + "grad_norm": 0.694970965385437, + "learning_rate": 0.00012944175274867497, + "loss": 2.4693, + "step": 8157 + }, + { + "epoch": 0.658381083044145, + "grad_norm": 0.7409366965293884, + "learning_rate": 0.0001294266652084922, + "loss": 2.5706, + "step": 8158 + }, + { + "epoch": 0.658461786780728, + "grad_norm": 0.7502163052558899, + "learning_rate": 0.00012941157693492002, + "loss": 2.6137, + "step": 8159 + }, + { + "epoch": 0.658542490517311, + "grad_norm": 0.6627129912376404, + "learning_rate": 0.00012939648792833447, + "loss": 2.5781, + "step": 8160 + }, + { + "epoch": 0.6586231942538939, + "grad_norm": 0.6775660514831543, + "learning_rate": 0.00012938139818911157, + "loss": 2.5441, + "step": 8161 + }, + { + "epoch": 0.6587038979904769, + "grad_norm": 
0.7150553464889526, + "learning_rate": 0.00012936630771762748, + "loss": 2.5763, + "step": 8162 + }, + { + "epoch": 0.65878460172706, + "grad_norm": 0.7461466193199158, + "learning_rate": 0.0001293512165142582, + "loss": 2.54, + "step": 8163 + }, + { + "epoch": 0.658865305463643, + "grad_norm": 0.7635199427604675, + "learning_rate": 0.00012933612457937988, + "loss": 2.5763, + "step": 8164 + }, + { + "epoch": 0.658946009200226, + "grad_norm": 0.7360543608665466, + "learning_rate": 0.00012932103191336865, + "loss": 2.5968, + "step": 8165 + }, + { + "epoch": 0.6590267129368089, + "grad_norm": 0.6482167840003967, + "learning_rate": 0.0001293059385166007, + "loss": 2.5704, + "step": 8166 + }, + { + "epoch": 0.659107416673392, + "grad_norm": 0.7024737596511841, + "learning_rate": 0.00012929084438945208, + "loss": 2.6221, + "step": 8167 + }, + { + "epoch": 0.659188120409975, + "grad_norm": 0.7192068696022034, + "learning_rate": 0.0001292757495322991, + "loss": 2.5574, + "step": 8168 + }, + { + "epoch": 0.659268824146558, + "grad_norm": 0.6900508403778076, + "learning_rate": 0.0001292606539455179, + "loss": 2.5969, + "step": 8169 + }, + { + "epoch": 0.6593495278831409, + "grad_norm": 0.7522475719451904, + "learning_rate": 0.00012924555762948474, + "loss": 2.592, + "step": 8170 + }, + { + "epoch": 0.659430231619724, + "grad_norm": 0.6610947251319885, + "learning_rate": 0.00012923046058457583, + "loss": 2.5404, + "step": 8171 + }, + { + "epoch": 0.659510935356307, + "grad_norm": 0.667628288269043, + "learning_rate": 0.00012921536281116738, + "loss": 2.5551, + "step": 8172 + }, + { + "epoch": 0.65959163909289, + "grad_norm": 0.7119980454444885, + "learning_rate": 0.00012920026430963578, + "loss": 2.6002, + "step": 8173 + }, + { + "epoch": 0.659672342829473, + "grad_norm": 0.712166428565979, + "learning_rate": 0.00012918516508035724, + "loss": 2.626, + "step": 8174 + }, + { + "epoch": 0.659753046566056, + "grad_norm": 0.6993290185928345, + "learning_rate": 
0.0001291700651237081, + "loss": 2.6311, + "step": 8175 + }, + { + "epoch": 0.659833750302639, + "grad_norm": 0.6889405250549316, + "learning_rate": 0.0001291549644400647, + "loss": 2.6483, + "step": 8176 + }, + { + "epoch": 0.659914454039222, + "grad_norm": 0.7120937705039978, + "learning_rate": 0.00012913986302980334, + "loss": 2.5489, + "step": 8177 + }, + { + "epoch": 0.659995157775805, + "grad_norm": 0.7112947106361389, + "learning_rate": 0.00012912476089330043, + "loss": 2.6393, + "step": 8178 + }, + { + "epoch": 0.6600758615123881, + "grad_norm": 0.710342526435852, + "learning_rate": 0.00012910965803093237, + "loss": 2.5897, + "step": 8179 + }, + { + "epoch": 0.660156565248971, + "grad_norm": 0.6506931185722351, + "learning_rate": 0.0001290945544430755, + "loss": 2.6429, + "step": 8180 + }, + { + "epoch": 0.660237268985554, + "grad_norm": 0.7147021293640137, + "learning_rate": 0.00012907945013010633, + "loss": 2.5521, + "step": 8181 + }, + { + "epoch": 0.660317972722137, + "grad_norm": 0.6802387833595276, + "learning_rate": 0.0001290643450924012, + "loss": 2.581, + "step": 8182 + }, + { + "epoch": 0.6603986764587201, + "grad_norm": 0.7599670886993408, + "learning_rate": 0.00012904923933033664, + "loss": 2.5532, + "step": 8183 + }, + { + "epoch": 0.6604793801953031, + "grad_norm": 0.7105657458305359, + "learning_rate": 0.0001290341328442891, + "loss": 2.5744, + "step": 8184 + }, + { + "epoch": 0.660560083931886, + "grad_norm": 0.6786425113677979, + "learning_rate": 0.00012901902563463506, + "loss": 2.5326, + "step": 8185 + }, + { + "epoch": 0.660640787668469, + "grad_norm": 0.7305583357810974, + "learning_rate": 0.00012900391770175106, + "loss": 2.6103, + "step": 8186 + }, + { + "epoch": 0.6607214914050521, + "grad_norm": 0.6578992605209351, + "learning_rate": 0.00012898880904601363, + "loss": 2.5833, + "step": 8187 + }, + { + "epoch": 0.6608021951416351, + "grad_norm": 0.6498856544494629, + "learning_rate": 0.00012897369966779926, + "loss": 2.6333, + "step": 
8188 + }, + { + "epoch": 0.660882898878218, + "grad_norm": 0.7065569162368774, + "learning_rate": 0.00012895858956748458, + "loss": 2.5326, + "step": 8189 + }, + { + "epoch": 0.660963602614801, + "grad_norm": 0.7676446437835693, + "learning_rate": 0.00012894347874544613, + "loss": 2.6233, + "step": 8190 + }, + { + "epoch": 0.6610443063513841, + "grad_norm": 0.6794395446777344, + "learning_rate": 0.00012892836720206056, + "loss": 2.5426, + "step": 8191 + }, + { + "epoch": 0.6611250100879671, + "grad_norm": 0.7448986768722534, + "learning_rate": 0.00012891325493770444, + "loss": 2.5832, + "step": 8192 + }, + { + "epoch": 0.6612057138245501, + "grad_norm": 0.7789760231971741, + "learning_rate": 0.0001288981419527544, + "loss": 2.6393, + "step": 8193 + }, + { + "epoch": 0.661286417561133, + "grad_norm": 0.7425827980041504, + "learning_rate": 0.00012888302824758718, + "loss": 2.6159, + "step": 8194 + }, + { + "epoch": 0.6613671212977161, + "grad_norm": 0.6677481532096863, + "learning_rate": 0.00012886791382257936, + "loss": 2.5399, + "step": 8195 + }, + { + "epoch": 0.6614478250342991, + "grad_norm": 0.698397159576416, + "learning_rate": 0.0001288527986781077, + "loss": 2.5443, + "step": 8196 + }, + { + "epoch": 0.6615285287708821, + "grad_norm": 0.6862680315971375, + "learning_rate": 0.00012883768281454885, + "loss": 2.5843, + "step": 8197 + }, + { + "epoch": 0.6616092325074651, + "grad_norm": 0.7421948313713074, + "learning_rate": 0.00012882256623227955, + "loss": 2.5885, + "step": 8198 + }, + { + "epoch": 0.6616899362440481, + "grad_norm": 0.7453073859214783, + "learning_rate": 0.00012880744893167654, + "loss": 2.5821, + "step": 8199 + }, + { + "epoch": 0.6617706399806311, + "grad_norm": 0.668218195438385, + "learning_rate": 0.00012879233091311667, + "loss": 2.5941, + "step": 8200 + }, + { + "epoch": 0.6618513437172141, + "grad_norm": 0.6864587664604187, + "learning_rate": 0.00012877721217697657, + "loss": 2.5321, + "step": 8201 + }, + { + "epoch": 
0.6619320474537971, + "grad_norm": 0.6521022319793701, + "learning_rate": 0.00012876209272363317, + "loss": 2.5945, + "step": 8202 + }, + { + "epoch": 0.6620127511903802, + "grad_norm": 0.7564631104469299, + "learning_rate": 0.00012874697255346325, + "loss": 2.5901, + "step": 8203 + }, + { + "epoch": 0.6620934549269631, + "grad_norm": 0.731991171836853, + "learning_rate": 0.00012873185166684356, + "loss": 2.649, + "step": 8204 + }, + { + "epoch": 0.6621741586635461, + "grad_norm": 0.6804815530776978, + "learning_rate": 0.00012871673006415108, + "loss": 2.5417, + "step": 8205 + }, + { + "epoch": 0.6622548624001291, + "grad_norm": 0.6862792372703552, + "learning_rate": 0.0001287016077457626, + "loss": 2.6118, + "step": 8206 + }, + { + "epoch": 0.6623355661367122, + "grad_norm": 0.7013735175132751, + "learning_rate": 0.00012868648471205503, + "loss": 2.6296, + "step": 8207 + }, + { + "epoch": 0.6624162698732952, + "grad_norm": 0.7284584045410156, + "learning_rate": 0.00012867136096340529, + "loss": 2.6547, + "step": 8208 + }, + { + "epoch": 0.6624969736098781, + "grad_norm": 0.714546799659729, + "learning_rate": 0.00012865623650019025, + "loss": 2.5955, + "step": 8209 + }, + { + "epoch": 0.6625776773464611, + "grad_norm": 0.7645453214645386, + "learning_rate": 0.0001286411113227869, + "loss": 2.6132, + "step": 8210 + }, + { + "epoch": 0.6626583810830441, + "grad_norm": 0.6615093946456909, + "learning_rate": 0.0001286259854315722, + "loss": 2.5701, + "step": 8211 + }, + { + "epoch": 0.6627390848196272, + "grad_norm": 0.6565523147583008, + "learning_rate": 0.0001286108588269231, + "loss": 2.57, + "step": 8212 + }, + { + "epoch": 0.6628197885562102, + "grad_norm": 0.7173478007316589, + "learning_rate": 0.00012859573150921666, + "loss": 2.589, + "step": 8213 + }, + { + "epoch": 0.6629004922927931, + "grad_norm": 0.7069580554962158, + "learning_rate": 0.00012858060347882975, + "loss": 2.6146, + "step": 8214 + }, + { + "epoch": 0.6629811960293761, + "grad_norm": 
0.7004678249359131, + "learning_rate": 0.00012856547473613953, + "loss": 2.5735, + "step": 8215 + }, + { + "epoch": 0.6630618997659592, + "grad_norm": 0.6589130163192749, + "learning_rate": 0.00012855034528152305, + "loss": 2.5731, + "step": 8216 + }, + { + "epoch": 0.6631426035025422, + "grad_norm": 0.7223117351531982, + "learning_rate": 0.0001285352151153573, + "loss": 2.5262, + "step": 8217 + }, + { + "epoch": 0.6632233072391251, + "grad_norm": 0.7045131325721741, + "learning_rate": 0.0001285200842380194, + "loss": 2.5789, + "step": 8218 + }, + { + "epoch": 0.6633040109757081, + "grad_norm": 0.7002174854278564, + "learning_rate": 0.00012850495264988645, + "loss": 2.6386, + "step": 8219 + }, + { + "epoch": 0.6633847147122912, + "grad_norm": 0.6844584941864014, + "learning_rate": 0.00012848982035133555, + "loss": 2.5394, + "step": 8220 + }, + { + "epoch": 0.6634654184488742, + "grad_norm": 0.7154871821403503, + "learning_rate": 0.00012847468734274387, + "loss": 2.5927, + "step": 8221 + }, + { + "epoch": 0.6635461221854572, + "grad_norm": 0.6856776475906372, + "learning_rate": 0.00012845955362448855, + "loss": 2.5694, + "step": 8222 + }, + { + "epoch": 0.6636268259220401, + "grad_norm": 0.7069089412689209, + "learning_rate": 0.00012844441919694676, + "loss": 2.5856, + "step": 8223 + }, + { + "epoch": 0.6637075296586232, + "grad_norm": 0.7084143161773682, + "learning_rate": 0.00012842928406049567, + "loss": 2.6301, + "step": 8224 + }, + { + "epoch": 0.6637882333952062, + "grad_norm": 0.6790862679481506, + "learning_rate": 0.00012841414821551252, + "loss": 2.5586, + "step": 8225 + }, + { + "epoch": 0.6638689371317892, + "grad_norm": 0.6537249684333801, + "learning_rate": 0.00012839901166237453, + "loss": 2.5652, + "step": 8226 + }, + { + "epoch": 0.6639496408683722, + "grad_norm": 0.6670125126838684, + "learning_rate": 0.00012838387440145893, + "loss": 2.5438, + "step": 8227 + }, + { + "epoch": 0.6640303446049552, + "grad_norm": 0.7202955484390259, + "learning_rate": 
0.00012836873643314297, + "loss": 2.5632, + "step": 8228 + }, + { + "epoch": 0.6641110483415382, + "grad_norm": 0.6844765543937683, + "learning_rate": 0.00012835359775780394, + "loss": 2.5595, + "step": 8229 + }, + { + "epoch": 0.6641917520781212, + "grad_norm": 0.6557698249816895, + "learning_rate": 0.00012833845837581916, + "loss": 2.5998, + "step": 8230 + }, + { + "epoch": 0.6642724558147042, + "grad_norm": 0.6741784811019897, + "learning_rate": 0.0001283233182875659, + "loss": 2.5591, + "step": 8231 + }, + { + "epoch": 0.6643531595512873, + "grad_norm": 0.6926484704017639, + "learning_rate": 0.00012830817749342154, + "loss": 2.5557, + "step": 8232 + }, + { + "epoch": 0.6644338632878702, + "grad_norm": 0.6866984367370605, + "learning_rate": 0.00012829303599376336, + "loss": 2.5646, + "step": 8233 + }, + { + "epoch": 0.6645145670244532, + "grad_norm": 0.6772707104682922, + "learning_rate": 0.0001282778937889688, + "loss": 2.6028, + "step": 8234 + }, + { + "epoch": 0.6645952707610362, + "grad_norm": 0.693236768245697, + "learning_rate": 0.00012826275087941518, + "loss": 2.611, + "step": 8235 + }, + { + "epoch": 0.6646759744976193, + "grad_norm": 0.7181996703147888, + "learning_rate": 0.00012824760726547993, + "loss": 2.6081, + "step": 8236 + }, + { + "epoch": 0.6647566782342023, + "grad_norm": 0.6845484375953674, + "learning_rate": 0.00012823246294754048, + "loss": 2.5544, + "step": 8237 + }, + { + "epoch": 0.6648373819707852, + "grad_norm": 0.7106444239616394, + "learning_rate": 0.00012821731792597425, + "loss": 2.552, + "step": 8238 + }, + { + "epoch": 0.6649180857073682, + "grad_norm": 0.6930601000785828, + "learning_rate": 0.0001282021722011587, + "loss": 2.5401, + "step": 8239 + }, + { + "epoch": 0.6649987894439513, + "grad_norm": 0.6658228039741516, + "learning_rate": 0.00012818702577347129, + "loss": 2.6287, + "step": 8240 + }, + { + "epoch": 0.6650794931805343, + "grad_norm": 0.6919803619384766, + "learning_rate": 0.0001281718786432895, + "loss": 2.6142, + 
"step": 8241 + }, + { + "epoch": 0.6651601969171173, + "grad_norm": 0.6675698757171631, + "learning_rate": 0.00012815673081099086, + "loss": 2.5325, + "step": 8242 + }, + { + "epoch": 0.6652409006537002, + "grad_norm": 0.6669798493385315, + "learning_rate": 0.0001281415822769529, + "loss": 2.5355, + "step": 8243 + }, + { + "epoch": 0.6653216043902833, + "grad_norm": 0.6449857950210571, + "learning_rate": 0.00012812643304155316, + "loss": 2.5968, + "step": 8244 + }, + { + "epoch": 0.6654023081268663, + "grad_norm": 0.6972789168357849, + "learning_rate": 0.00012811128310516914, + "loss": 2.6133, + "step": 8245 + }, + { + "epoch": 0.6654830118634493, + "grad_norm": 0.7179878354072571, + "learning_rate": 0.0001280961324681785, + "loss": 2.5793, + "step": 8246 + }, + { + "epoch": 0.6655637156000322, + "grad_norm": 0.6736378073692322, + "learning_rate": 0.0001280809811309588, + "loss": 2.5543, + "step": 8247 + }, + { + "epoch": 0.6656444193366153, + "grad_norm": 0.7376420497894287, + "learning_rate": 0.00012806582909388763, + "loss": 2.5501, + "step": 8248 + }, + { + "epoch": 0.6657251230731983, + "grad_norm": 0.7163094878196716, + "learning_rate": 0.00012805067635734263, + "loss": 2.5538, + "step": 8249 + }, + { + "epoch": 0.6658058268097813, + "grad_norm": 0.7699353694915771, + "learning_rate": 0.00012803552292170144, + "loss": 2.5925, + "step": 8250 + }, + { + "epoch": 0.6658865305463643, + "grad_norm": 0.6504995822906494, + "learning_rate": 0.00012802036878734177, + "loss": 2.5944, + "step": 8251 + }, + { + "epoch": 0.6659672342829474, + "grad_norm": 0.7150379419326782, + "learning_rate": 0.0001280052139546412, + "loss": 2.5959, + "step": 8252 + }, + { + "epoch": 0.6660479380195303, + "grad_norm": 0.7562555074691772, + "learning_rate": 0.00012799005842397757, + "loss": 2.6041, + "step": 8253 + }, + { + "epoch": 0.6661286417561133, + "grad_norm": 0.7242838740348816, + "learning_rate": 0.00012797490219572846, + "loss": 2.6152, + "step": 8254 + }, + { + "epoch": 
0.6662093454926963, + "grad_norm": 0.7062848210334778, + "learning_rate": 0.00012795974527027168, + "loss": 2.596, + "step": 8255 + }, + { + "epoch": 0.6662900492292794, + "grad_norm": 0.8179726004600525, + "learning_rate": 0.00012794458764798497, + "loss": 2.5792, + "step": 8256 + }, + { + "epoch": 0.6663707529658623, + "grad_norm": 0.692166268825531, + "learning_rate": 0.00012792942932924608, + "loss": 2.6025, + "step": 8257 + }, + { + "epoch": 0.6664514567024453, + "grad_norm": 0.6540334224700928, + "learning_rate": 0.0001279142703144328, + "loss": 2.5119, + "step": 8258 + }, + { + "epoch": 0.6665321604390283, + "grad_norm": 0.7087461352348328, + "learning_rate": 0.00012789911060392294, + "loss": 2.5808, + "step": 8259 + }, + { + "epoch": 0.6666128641756114, + "grad_norm": 0.6897622346878052, + "learning_rate": 0.0001278839501980943, + "loss": 2.5811, + "step": 8260 + }, + { + "epoch": 0.6666935679121944, + "grad_norm": 0.6653634905815125, + "learning_rate": 0.00012786878909732473, + "loss": 2.5498, + "step": 8261 + }, + { + "epoch": 0.6667742716487773, + "grad_norm": 0.6541483402252197, + "learning_rate": 0.0001278536273019921, + "loss": 2.605, + "step": 8262 + }, + { + "epoch": 0.6668549753853603, + "grad_norm": 0.6748146414756775, + "learning_rate": 0.00012783846481247428, + "loss": 2.5571, + "step": 8263 + }, + { + "epoch": 0.6669356791219433, + "grad_norm": 0.7258282899856567, + "learning_rate": 0.00012782330162914915, + "loss": 2.5562, + "step": 8264 + }, + { + "epoch": 0.6670163828585264, + "grad_norm": 0.6963080167770386, + "learning_rate": 0.00012780813775239457, + "loss": 2.6467, + "step": 8265 + }, + { + "epoch": 0.6670970865951094, + "grad_norm": 0.6627718806266785, + "learning_rate": 0.00012779297318258855, + "loss": 2.5369, + "step": 8266 + }, + { + "epoch": 0.6671777903316923, + "grad_norm": 0.7026168704032898, + "learning_rate": 0.00012777780792010897, + "loss": 2.5639, + "step": 8267 + }, + { + "epoch": 0.6672584940682753, + "grad_norm": 
0.6969077587127686, + "learning_rate": 0.0001277626419653338, + "loss": 2.517, + "step": 8268 + }, + { + "epoch": 0.6673391978048584, + "grad_norm": 0.6918485760688782, + "learning_rate": 0.00012774747531864102, + "loss": 2.6388, + "step": 8269 + }, + { + "epoch": 0.6674199015414414, + "grad_norm": 0.6661256551742554, + "learning_rate": 0.00012773230798040862, + "loss": 2.5477, + "step": 8270 + }, + { + "epoch": 0.6675006052780244, + "grad_norm": 0.6778402328491211, + "learning_rate": 0.0001277171399510146, + "loss": 2.6032, + "step": 8271 + }, + { + "epoch": 0.6675813090146073, + "grad_norm": 0.6464864611625671, + "learning_rate": 0.00012770197123083702, + "loss": 2.5396, + "step": 8272 + }, + { + "epoch": 0.6676620127511904, + "grad_norm": 0.7154508233070374, + "learning_rate": 0.0001276868018202539, + "loss": 2.6163, + "step": 8273 + }, + { + "epoch": 0.6677427164877734, + "grad_norm": 0.6849631071090698, + "learning_rate": 0.0001276716317196433, + "loss": 2.549, + "step": 8274 + }, + { + "epoch": 0.6678234202243564, + "grad_norm": 0.6696017980575562, + "learning_rate": 0.00012765646092938334, + "loss": 2.5046, + "step": 8275 + }, + { + "epoch": 0.6679041239609393, + "grad_norm": 0.668153703212738, + "learning_rate": 0.00012764128944985203, + "loss": 2.5422, + "step": 8276 + }, + { + "epoch": 0.6679848276975224, + "grad_norm": 0.6600282192230225, + "learning_rate": 0.00012762611728142756, + "loss": 2.6117, + "step": 8277 + }, + { + "epoch": 0.6680655314341054, + "grad_norm": 0.6691608428955078, + "learning_rate": 0.000127610944424488, + "loss": 2.5761, + "step": 8278 + }, + { + "epoch": 0.6681462351706884, + "grad_norm": 0.695142924785614, + "learning_rate": 0.00012759577087941156, + "loss": 2.6123, + "step": 8279 + }, + { + "epoch": 0.6682269389072714, + "grad_norm": 0.6846559643745422, + "learning_rate": 0.00012758059664657635, + "loss": 2.5882, + "step": 8280 + }, + { + "epoch": 0.6683076426438544, + "grad_norm": 0.7616459131240845, + "learning_rate": 
0.0001275654217263606, + "loss": 2.5559, + "step": 8281 + }, + { + "epoch": 0.6683883463804374, + "grad_norm": 0.6995570063591003, + "learning_rate": 0.00012755024611914246, + "loss": 2.5336, + "step": 8282 + }, + { + "epoch": 0.6684690501170204, + "grad_norm": 0.7199691534042358, + "learning_rate": 0.0001275350698253002, + "loss": 2.6618, + "step": 8283 + }, + { + "epoch": 0.6685497538536034, + "grad_norm": 0.6938748955726624, + "learning_rate": 0.000127519892845212, + "loss": 2.574, + "step": 8284 + }, + { + "epoch": 0.6686304575901865, + "grad_norm": 0.6827714443206787, + "learning_rate": 0.00012750471517925614, + "loss": 2.5647, + "step": 8285 + }, + { + "epoch": 0.6687111613267694, + "grad_norm": 0.6684606671333313, + "learning_rate": 0.00012748953682781083, + "loss": 2.528, + "step": 8286 + }, + { + "epoch": 0.6687918650633524, + "grad_norm": 0.6842156052589417, + "learning_rate": 0.00012747435779125448, + "loss": 2.5521, + "step": 8287 + }, + { + "epoch": 0.6688725687999354, + "grad_norm": 0.7440506219863892, + "learning_rate": 0.0001274591780699653, + "loss": 2.5646, + "step": 8288 + }, + { + "epoch": 0.6689532725365185, + "grad_norm": 0.769922137260437, + "learning_rate": 0.0001274439976643216, + "loss": 2.6104, + "step": 8289 + }, + { + "epoch": 0.6690339762731015, + "grad_norm": 0.7793089747428894, + "learning_rate": 0.00012742881657470175, + "loss": 2.6348, + "step": 8290 + }, + { + "epoch": 0.6691146800096844, + "grad_norm": 0.695060133934021, + "learning_rate": 0.0001274136348014841, + "loss": 2.5797, + "step": 8291 + }, + { + "epoch": 0.6691953837462674, + "grad_norm": 0.7089917659759521, + "learning_rate": 0.00012739845234504697, + "loss": 2.5431, + "step": 8292 + }, + { + "epoch": 0.6692760874828505, + "grad_norm": 0.7542717456817627, + "learning_rate": 0.00012738326920576885, + "loss": 2.6172, + "step": 8293 + }, + { + "epoch": 0.6693567912194335, + "grad_norm": 0.6947969794273376, + "learning_rate": 0.00012736808538402802, + "loss": 2.6026, + 
"step": 8294 + }, + { + "epoch": 0.6694374949560165, + "grad_norm": 0.6696321368217468, + "learning_rate": 0.00012735290088020302, + "loss": 2.5592, + "step": 8295 + }, + { + "epoch": 0.6695181986925994, + "grad_norm": 0.7001518607139587, + "learning_rate": 0.0001273377156946722, + "loss": 2.5994, + "step": 8296 + }, + { + "epoch": 0.6695989024291825, + "grad_norm": 0.6708101630210876, + "learning_rate": 0.000127322529827814, + "loss": 2.6392, + "step": 8297 + }, + { + "epoch": 0.6696796061657655, + "grad_norm": 0.6282601952552795, + "learning_rate": 0.000127307343280007, + "loss": 2.5762, + "step": 8298 + }, + { + "epoch": 0.6697603099023485, + "grad_norm": 0.6879595518112183, + "learning_rate": 0.0001272921560516296, + "loss": 2.5507, + "step": 8299 + }, + { + "epoch": 0.6698410136389314, + "grad_norm": 0.6108266115188599, + "learning_rate": 0.00012727696814306033, + "loss": 2.5865, + "step": 8300 + }, + { + "epoch": 0.6699217173755145, + "grad_norm": 0.6763970851898193, + "learning_rate": 0.0001272617795546777, + "loss": 2.6439, + "step": 8301 + }, + { + "epoch": 0.6700024211120975, + "grad_norm": 0.6997560858726501, + "learning_rate": 0.00012724659028686027, + "loss": 2.5291, + "step": 8302 + }, + { + "epoch": 0.6700831248486805, + "grad_norm": 0.675714910030365, + "learning_rate": 0.0001272314003399866, + "loss": 2.5452, + "step": 8303 + }, + { + "epoch": 0.6701638285852635, + "grad_norm": 0.6847789883613586, + "learning_rate": 0.00012721620971443525, + "loss": 2.6111, + "step": 8304 + }, + { + "epoch": 0.6702445323218466, + "grad_norm": 0.7283920645713806, + "learning_rate": 0.0001272010184105848, + "loss": 2.6322, + "step": 8305 + }, + { + "epoch": 0.6703252360584295, + "grad_norm": 0.7551796436309814, + "learning_rate": 0.00012718582642881382, + "loss": 2.5728, + "step": 8306 + }, + { + "epoch": 0.6704059397950125, + "grad_norm": 0.694526195526123, + "learning_rate": 0.00012717063376950104, + "loss": 2.6241, + "step": 8307 + }, + { + "epoch": 
0.6704866435315955, + "grad_norm": 0.6956443190574646, + "learning_rate": 0.00012715544043302504, + "loss": 2.5531, + "step": 8308 + }, + { + "epoch": 0.6705673472681786, + "grad_norm": 0.7649452686309814, + "learning_rate": 0.00012714024641976446, + "loss": 2.5462, + "step": 8309 + }, + { + "epoch": 0.6706480510047615, + "grad_norm": 0.7711065411567688, + "learning_rate": 0.00012712505173009797, + "loss": 2.5878, + "step": 8310 + }, + { + "epoch": 0.6707287547413445, + "grad_norm": 0.68077552318573, + "learning_rate": 0.00012710985636440434, + "loss": 2.5668, + "step": 8311 + }, + { + "epoch": 0.6708094584779275, + "grad_norm": 0.7181024551391602, + "learning_rate": 0.0001270946603230622, + "loss": 2.6104, + "step": 8312 + }, + { + "epoch": 0.6708901622145105, + "grad_norm": 0.7136553525924683, + "learning_rate": 0.0001270794636064503, + "loss": 2.5282, + "step": 8313 + }, + { + "epoch": 0.6709708659510936, + "grad_norm": 0.880094587802887, + "learning_rate": 0.00012706426621494736, + "loss": 2.5837, + "step": 8314 + }, + { + "epoch": 0.6710515696876765, + "grad_norm": 0.7438541054725647, + "learning_rate": 0.00012704906814893217, + "loss": 2.5577, + "step": 8315 + }, + { + "epoch": 0.6711322734242595, + "grad_norm": 0.8197470903396606, + "learning_rate": 0.00012703386940878352, + "loss": 2.569, + "step": 8316 + }, + { + "epoch": 0.6712129771608425, + "grad_norm": 0.7728317975997925, + "learning_rate": 0.00012701866999488014, + "loss": 2.6407, + "step": 8317 + }, + { + "epoch": 0.6712936808974256, + "grad_norm": 0.7594823837280273, + "learning_rate": 0.0001270034699076009, + "loss": 2.5789, + "step": 8318 + }, + { + "epoch": 0.6713743846340086, + "grad_norm": 0.7502284646034241, + "learning_rate": 0.0001269882691473246, + "loss": 2.6068, + "step": 8319 + }, + { + "epoch": 0.6714550883705915, + "grad_norm": 0.7355664372444153, + "learning_rate": 0.0001269730677144301, + "loss": 2.6055, + "step": 8320 + }, + { + "epoch": 0.6715357921071745, + "grad_norm": 
0.7218407392501831, + "learning_rate": 0.0001269578656092962, + "loss": 2.5953, + "step": 8321 + }, + { + "epoch": 0.6716164958437576, + "grad_norm": 0.6932538747787476, + "learning_rate": 0.00012694266283230185, + "loss": 2.5795, + "step": 8322 + }, + { + "epoch": 0.6716971995803406, + "grad_norm": 0.7337260246276855, + "learning_rate": 0.00012692745938382591, + "loss": 2.5606, + "step": 8323 + }, + { + "epoch": 0.6717779033169236, + "grad_norm": 0.6959026455879211, + "learning_rate": 0.00012691225526424731, + "loss": 2.5688, + "step": 8324 + }, + { + "epoch": 0.6718586070535065, + "grad_norm": 0.7352995872497559, + "learning_rate": 0.00012689705047394493, + "loss": 2.6308, + "step": 8325 + }, + { + "epoch": 0.6719393107900896, + "grad_norm": 0.7023616433143616, + "learning_rate": 0.00012688184501329777, + "loss": 2.6462, + "step": 8326 + }, + { + "epoch": 0.6720200145266726, + "grad_norm": 0.6581354737281799, + "learning_rate": 0.00012686663888268474, + "loss": 2.5997, + "step": 8327 + }, + { + "epoch": 0.6721007182632556, + "grad_norm": 0.6332606077194214, + "learning_rate": 0.00012685143208248484, + "loss": 2.6348, + "step": 8328 + }, + { + "epoch": 0.6721814219998385, + "grad_norm": 0.6826457977294922, + "learning_rate": 0.00012683622461307707, + "loss": 2.5092, + "step": 8329 + }, + { + "epoch": 0.6722621257364216, + "grad_norm": 0.7641614079475403, + "learning_rate": 0.00012682101647484042, + "loss": 2.7098, + "step": 8330 + }, + { + "epoch": 0.6723428294730046, + "grad_norm": 0.7153630256652832, + "learning_rate": 0.00012680580766815394, + "loss": 2.5647, + "step": 8331 + }, + { + "epoch": 0.6724235332095876, + "grad_norm": 0.6746379137039185, + "learning_rate": 0.00012679059819339664, + "loss": 2.6187, + "step": 8332 + }, + { + "epoch": 0.6725042369461706, + "grad_norm": 0.6748883128166199, + "learning_rate": 0.00012677538805094764, + "loss": 2.6045, + "step": 8333 + }, + { + "epoch": 0.6725849406827537, + "grad_norm": 0.7366370558738708, + 
"learning_rate": 0.00012676017724118596, + "loss": 2.5789, + "step": 8334 + }, + { + "epoch": 0.6726656444193366, + "grad_norm": 0.7381749153137207, + "learning_rate": 0.00012674496576449074, + "loss": 2.5958, + "step": 8335 + }, + { + "epoch": 0.6727463481559196, + "grad_norm": 0.7109243869781494, + "learning_rate": 0.00012672975362124103, + "loss": 2.5874, + "step": 8336 + }, + { + "epoch": 0.6728270518925026, + "grad_norm": 0.6904270052909851, + "learning_rate": 0.00012671454081181595, + "loss": 2.5891, + "step": 8337 + }, + { + "epoch": 0.6729077556290857, + "grad_norm": 0.6809365749359131, + "learning_rate": 0.00012669932733659476, + "loss": 2.5904, + "step": 8338 + }, + { + "epoch": 0.6729884593656686, + "grad_norm": 0.7527552843093872, + "learning_rate": 0.00012668411319595647, + "loss": 2.5602, + "step": 8339 + }, + { + "epoch": 0.6730691631022516, + "grad_norm": 0.6746577620506287, + "learning_rate": 0.00012666889839028038, + "loss": 2.5468, + "step": 8340 + }, + { + "epoch": 0.6731498668388346, + "grad_norm": 0.6904895305633545, + "learning_rate": 0.00012665368291994562, + "loss": 2.623, + "step": 8341 + }, + { + "epoch": 0.6732305705754177, + "grad_norm": 0.6495908498764038, + "learning_rate": 0.00012663846678533135, + "loss": 2.5843, + "step": 8342 + }, + { + "epoch": 0.6733112743120007, + "grad_norm": 0.6782342195510864, + "learning_rate": 0.00012662324998681692, + "loss": 2.6141, + "step": 8343 + }, + { + "epoch": 0.6733919780485836, + "grad_norm": 0.7090504765510559, + "learning_rate": 0.0001266080325247815, + "loss": 2.6654, + "step": 8344 + }, + { + "epoch": 0.6734726817851666, + "grad_norm": 0.7085515856742859, + "learning_rate": 0.00012659281439960434, + "loss": 2.5394, + "step": 8345 + }, + { + "epoch": 0.6735533855217497, + "grad_norm": 0.6813806295394897, + "learning_rate": 0.00012657759561166473, + "loss": 2.6522, + "step": 8346 + }, + { + "epoch": 0.6736340892583327, + "grad_norm": 0.726378858089447, + "learning_rate": 
0.00012656237616134197, + "loss": 2.5922, + "step": 8347 + }, + { + "epoch": 0.6737147929949157, + "grad_norm": 0.6323714256286621, + "learning_rate": 0.00012654715604901534, + "loss": 2.4938, + "step": 8348 + }, + { + "epoch": 0.6737954967314986, + "grad_norm": 0.6925889253616333, + "learning_rate": 0.0001265319352750642, + "loss": 2.635, + "step": 8349 + }, + { + "epoch": 0.6738762004680817, + "grad_norm": 0.6676003932952881, + "learning_rate": 0.00012651671383986788, + "loss": 2.558, + "step": 8350 + }, + { + "epoch": 0.6739569042046647, + "grad_norm": 0.7464616298675537, + "learning_rate": 0.00012650149174380575, + "loss": 2.5777, + "step": 8351 + }, + { + "epoch": 0.6740376079412477, + "grad_norm": 0.6611667275428772, + "learning_rate": 0.00012648626898725715, + "loss": 2.5779, + "step": 8352 + }, + { + "epoch": 0.6741183116778307, + "grad_norm": 0.7391866445541382, + "learning_rate": 0.00012647104557060148, + "loss": 2.5624, + "step": 8353 + }, + { + "epoch": 0.6741990154144137, + "grad_norm": 0.7107826471328735, + "learning_rate": 0.00012645582149421817, + "loss": 2.5744, + "step": 8354 + }, + { + "epoch": 0.6742797191509967, + "grad_norm": 0.7385339736938477, + "learning_rate": 0.00012644059675848666, + "loss": 2.5752, + "step": 8355 + }, + { + "epoch": 0.6743604228875797, + "grad_norm": 0.6887345314025879, + "learning_rate": 0.00012642537136378634, + "loss": 2.5794, + "step": 8356 + }, + { + "epoch": 0.6744411266241627, + "grad_norm": 0.6934933662414551, + "learning_rate": 0.00012641014531049666, + "loss": 2.5361, + "step": 8357 + }, + { + "epoch": 0.6745218303607458, + "grad_norm": 0.7437291741371155, + "learning_rate": 0.00012639491859899716, + "loss": 2.5741, + "step": 8358 + }, + { + "epoch": 0.6746025340973287, + "grad_norm": 0.7088494896888733, + "learning_rate": 0.00012637969122966729, + "loss": 2.6449, + "step": 8359 + }, + { + "epoch": 0.6746832378339117, + "grad_norm": 0.7496390342712402, + "learning_rate": 0.00012636446320288654, + "loss": 
2.6109, + "step": 8360 + }, + { + "epoch": 0.6747639415704947, + "grad_norm": 0.6949843764305115, + "learning_rate": 0.00012634923451903447, + "loss": 2.5769, + "step": 8361 + }, + { + "epoch": 0.6748446453070778, + "grad_norm": 0.7192673087120056, + "learning_rate": 0.00012633400517849056, + "loss": 2.6053, + "step": 8362 + }, + { + "epoch": 0.6749253490436607, + "grad_norm": 0.7003379464149475, + "learning_rate": 0.00012631877518163442, + "loss": 2.5745, + "step": 8363 + }, + { + "epoch": 0.6750060527802437, + "grad_norm": 0.7499879002571106, + "learning_rate": 0.00012630354452884563, + "loss": 2.6077, + "step": 8364 + }, + { + "epoch": 0.6750867565168267, + "grad_norm": 0.7047405242919922, + "learning_rate": 0.00012628831322050377, + "loss": 2.5955, + "step": 8365 + }, + { + "epoch": 0.6751674602534097, + "grad_norm": 0.7463203072547913, + "learning_rate": 0.00012627308125698838, + "loss": 2.5421, + "step": 8366 + }, + { + "epoch": 0.6752481639899928, + "grad_norm": 0.7377086877822876, + "learning_rate": 0.00012625784863867914, + "loss": 2.5804, + "step": 8367 + }, + { + "epoch": 0.6753288677265757, + "grad_norm": 0.7136400938034058, + "learning_rate": 0.00012624261536595566, + "loss": 2.5673, + "step": 8368 + }, + { + "epoch": 0.6754095714631587, + "grad_norm": 0.6923615336418152, + "learning_rate": 0.0001262273814391976, + "loss": 2.5832, + "step": 8369 + }, + { + "epoch": 0.6754902751997417, + "grad_norm": 0.7495028972625732, + "learning_rate": 0.00012621214685878469, + "loss": 2.5943, + "step": 8370 + }, + { + "epoch": 0.6755709789363248, + "grad_norm": 0.6751434206962585, + "learning_rate": 0.0001261969116250965, + "loss": 2.5495, + "step": 8371 + }, + { + "epoch": 0.6756516826729078, + "grad_norm": 0.7055973410606384, + "learning_rate": 0.00012618167573851284, + "loss": 2.5651, + "step": 8372 + }, + { + "epoch": 0.6757323864094907, + "grad_norm": 0.7479640245437622, + "learning_rate": 0.00012616643919941337, + "loss": 2.653, + "step": 8373 + }, + { + 
"epoch": 0.6758130901460737, + "grad_norm": 0.7075015902519226, + "learning_rate": 0.00012615120200817778, + "loss": 2.5787, + "step": 8374 + }, + { + "epoch": 0.6758937938826568, + "grad_norm": 0.7513934969902039, + "learning_rate": 0.00012613596416518593, + "loss": 2.6099, + "step": 8375 + }, + { + "epoch": 0.6759744976192398, + "grad_norm": 0.6742326021194458, + "learning_rate": 0.00012612072567081754, + "loss": 2.5335, + "step": 8376 + }, + { + "epoch": 0.6760552013558228, + "grad_norm": 0.7271459698677063, + "learning_rate": 0.00012610548652545239, + "loss": 2.6082, + "step": 8377 + }, + { + "epoch": 0.6761359050924057, + "grad_norm": 0.7481515407562256, + "learning_rate": 0.00012609024672947022, + "loss": 2.5805, + "step": 8378 + }, + { + "epoch": 0.6762166088289888, + "grad_norm": 0.7484803199768066, + "learning_rate": 0.00012607500628325093, + "loss": 2.6099, + "step": 8379 + }, + { + "epoch": 0.6762973125655718, + "grad_norm": 0.7462390661239624, + "learning_rate": 0.00012605976518717435, + "loss": 2.6054, + "step": 8380 + }, + { + "epoch": 0.6763780163021548, + "grad_norm": 0.7014410495758057, + "learning_rate": 0.00012604452344162028, + "loss": 2.5614, + "step": 8381 + }, + { + "epoch": 0.6764587200387377, + "grad_norm": 0.6902963519096375, + "learning_rate": 0.0001260292810469686, + "loss": 2.5813, + "step": 8382 + }, + { + "epoch": 0.6765394237753208, + "grad_norm": 0.6646186113357544, + "learning_rate": 0.00012601403800359919, + "loss": 2.545, + "step": 8383 + }, + { + "epoch": 0.6766201275119038, + "grad_norm": 0.7067462801933289, + "learning_rate": 0.00012599879431189197, + "loss": 2.6195, + "step": 8384 + }, + { + "epoch": 0.6767008312484868, + "grad_norm": 0.7263965010643005, + "learning_rate": 0.0001259835499722268, + "loss": 2.5929, + "step": 8385 + }, + { + "epoch": 0.6767815349850698, + "grad_norm": 0.6672000885009766, + "learning_rate": 0.0001259683049849837, + "loss": 2.5561, + "step": 8386 + }, + { + "epoch": 0.6768622387216529, + 
"grad_norm": 0.6543236374855042, + "learning_rate": 0.0001259530593505425, + "loss": 2.6256, + "step": 8387 + }, + { + "epoch": 0.6769429424582358, + "grad_norm": 0.6532339453697205, + "learning_rate": 0.00012593781306928324, + "loss": 2.5074, + "step": 8388 + }, + { + "epoch": 0.6770236461948188, + "grad_norm": 0.7442833185195923, + "learning_rate": 0.00012592256614158591, + "loss": 2.6124, + "step": 8389 + }, + { + "epoch": 0.6771043499314018, + "grad_norm": 0.786685585975647, + "learning_rate": 0.00012590731856783043, + "loss": 2.6077, + "step": 8390 + }, + { + "epoch": 0.6771850536679849, + "grad_norm": 0.7952337265014648, + "learning_rate": 0.00012589207034839687, + "loss": 2.5894, + "step": 8391 + }, + { + "epoch": 0.6772657574045678, + "grad_norm": 0.7847954034805298, + "learning_rate": 0.00012587682148366524, + "loss": 2.4934, + "step": 8392 + }, + { + "epoch": 0.6773464611411508, + "grad_norm": 0.6769007444381714, + "learning_rate": 0.00012586157197401552, + "loss": 2.5695, + "step": 8393 + }, + { + "epoch": 0.6774271648777338, + "grad_norm": 0.6583757996559143, + "learning_rate": 0.00012584632181982788, + "loss": 2.5866, + "step": 8394 + }, + { + "epoch": 0.6775078686143169, + "grad_norm": 0.7375823855400085, + "learning_rate": 0.0001258310710214823, + "loss": 2.5141, + "step": 8395 + }, + { + "epoch": 0.6775885723508999, + "grad_norm": 0.6901078224182129, + "learning_rate": 0.00012581581957935896, + "loss": 2.5732, + "step": 8396 + }, + { + "epoch": 0.6776692760874828, + "grad_norm": 0.687152624130249, + "learning_rate": 0.0001258005674938379, + "loss": 2.5916, + "step": 8397 + }, + { + "epoch": 0.6777499798240658, + "grad_norm": 0.7198586463928223, + "learning_rate": 0.00012578531476529917, + "loss": 2.5626, + "step": 8398 + }, + { + "epoch": 0.6778306835606489, + "grad_norm": 0.7417474985122681, + "learning_rate": 0.00012577006139412309, + "loss": 2.5486, + "step": 8399 + }, + { + "epoch": 0.6779113872972319, + "grad_norm": 0.6588087677955627, + 
"learning_rate": 0.0001257548073806897, + "loss": 2.6123, + "step": 8400 + }, + { + "epoch": 0.6779920910338149, + "grad_norm": 0.7211382389068604, + "learning_rate": 0.00012573955272537915, + "loss": 2.6402, + "step": 8401 + }, + { + "epoch": 0.6780727947703978, + "grad_norm": 0.7196084856987, + "learning_rate": 0.00012572429742857167, + "loss": 2.51, + "step": 8402 + }, + { + "epoch": 0.6781534985069809, + "grad_norm": 0.6399394273757935, + "learning_rate": 0.00012570904149064748, + "loss": 2.5309, + "step": 8403 + }, + { + "epoch": 0.6782342022435639, + "grad_norm": 0.6969572305679321, + "learning_rate": 0.00012569378491198674, + "loss": 2.5829, + "step": 8404 + }, + { + "epoch": 0.6783149059801469, + "grad_norm": 0.8005492091178894, + "learning_rate": 0.00012567852769296975, + "loss": 2.6277, + "step": 8405 + }, + { + "epoch": 0.6783956097167299, + "grad_norm": 0.6786207556724548, + "learning_rate": 0.0001256632698339767, + "loss": 2.5839, + "step": 8406 + }, + { + "epoch": 0.6784763134533129, + "grad_norm": 0.7047130465507507, + "learning_rate": 0.0001256480113353879, + "loss": 2.533, + "step": 8407 + }, + { + "epoch": 0.6785570171898959, + "grad_norm": 0.7640479803085327, + "learning_rate": 0.0001256327521975836, + "loss": 2.5855, + "step": 8408 + }, + { + "epoch": 0.6786377209264789, + "grad_norm": 0.728111207485199, + "learning_rate": 0.00012561749242094412, + "loss": 2.6184, + "step": 8409 + }, + { + "epoch": 0.6787184246630619, + "grad_norm": 0.7842772603034973, + "learning_rate": 0.00012560223200584975, + "loss": 2.5915, + "step": 8410 + }, + { + "epoch": 0.678799128399645, + "grad_norm": 0.7129092812538147, + "learning_rate": 0.00012558697095268085, + "loss": 2.6526, + "step": 8411 + }, + { + "epoch": 0.6788798321362279, + "grad_norm": 0.751103401184082, + "learning_rate": 0.00012557170926181773, + "loss": 2.605, + "step": 8412 + }, + { + "epoch": 0.6789605358728109, + "grad_norm": 0.6850594878196716, + "learning_rate": 0.0001255564469336408, + "loss": 
2.6047, + "step": 8413 + }, + { + "epoch": 0.6790412396093939, + "grad_norm": 0.703037679195404, + "learning_rate": 0.00012554118396853036, + "loss": 2.653, + "step": 8414 + }, + { + "epoch": 0.6791219433459769, + "grad_norm": 0.8097915053367615, + "learning_rate": 0.0001255259203668669, + "loss": 2.5937, + "step": 8415 + }, + { + "epoch": 0.67920264708256, + "grad_norm": 0.700351357460022, + "learning_rate": 0.00012551065612903076, + "loss": 2.6089, + "step": 8416 + }, + { + "epoch": 0.6792833508191429, + "grad_norm": 0.6760888695716858, + "learning_rate": 0.00012549539125540236, + "loss": 2.547, + "step": 8417 + }, + { + "epoch": 0.6793640545557259, + "grad_norm": 0.6751723289489746, + "learning_rate": 0.0001254801257463622, + "loss": 2.625, + "step": 8418 + }, + { + "epoch": 0.6794447582923089, + "grad_norm": 0.6928921937942505, + "learning_rate": 0.00012546485960229065, + "loss": 2.5671, + "step": 8419 + }, + { + "epoch": 0.679525462028892, + "grad_norm": 0.6541565656661987, + "learning_rate": 0.0001254495928235683, + "loss": 2.5837, + "step": 8420 + }, + { + "epoch": 0.679606165765475, + "grad_norm": 0.6228676438331604, + "learning_rate": 0.00012543432541057555, + "loss": 2.5798, + "step": 8421 + }, + { + "epoch": 0.6796868695020579, + "grad_norm": 0.7620853185653687, + "learning_rate": 0.0001254190573636929, + "loss": 2.5885, + "step": 8422 + }, + { + "epoch": 0.6797675732386409, + "grad_norm": 0.7425604462623596, + "learning_rate": 0.0001254037886833009, + "loss": 2.6124, + "step": 8423 + }, + { + "epoch": 0.679848276975224, + "grad_norm": 0.7150974273681641, + "learning_rate": 0.0001253885193697801, + "loss": 2.5423, + "step": 8424 + }, + { + "epoch": 0.679928980711807, + "grad_norm": 0.672649621963501, + "learning_rate": 0.000125373249423511, + "loss": 2.5563, + "step": 8425 + }, + { + "epoch": 0.6800096844483899, + "grad_norm": 0.6913620829582214, + "learning_rate": 0.00012535797884487425, + "loss": 2.5261, + "step": 8426 + }, + { + "epoch": 
0.6800903881849729, + "grad_norm": 0.712123692035675, + "learning_rate": 0.00012534270763425034, + "loss": 2.5958, + "step": 8427 + }, + { + "epoch": 0.680171091921556, + "grad_norm": 0.7593061327934265, + "learning_rate": 0.00012532743579201993, + "loss": 2.6036, + "step": 8428 + }, + { + "epoch": 0.680251795658139, + "grad_norm": 0.7108714580535889, + "learning_rate": 0.0001253121633185636, + "loss": 2.6004, + "step": 8429 + }, + { + "epoch": 0.680332499394722, + "grad_norm": 0.7142449021339417, + "learning_rate": 0.00012529689021426198, + "loss": 2.588, + "step": 8430 + }, + { + "epoch": 0.6804132031313049, + "grad_norm": 0.7579841017723083, + "learning_rate": 0.00012528161647949574, + "loss": 2.5927, + "step": 8431 + }, + { + "epoch": 0.680493906867888, + "grad_norm": 0.6522083878517151, + "learning_rate": 0.00012526634211464555, + "loss": 2.5619, + "step": 8432 + }, + { + "epoch": 0.680574610604471, + "grad_norm": 0.7681782245635986, + "learning_rate": 0.00012525106712009203, + "loss": 2.6065, + "step": 8433 + }, + { + "epoch": 0.680655314341054, + "grad_norm": 0.6900169253349304, + "learning_rate": 0.00012523579149621594, + "loss": 2.5507, + "step": 8434 + }, + { + "epoch": 0.680736018077637, + "grad_norm": 0.6907666325569153, + "learning_rate": 0.00012522051524339794, + "loss": 2.5213, + "step": 8435 + }, + { + "epoch": 0.68081672181422, + "grad_norm": 0.7202023267745972, + "learning_rate": 0.0001252052383620188, + "loss": 2.6367, + "step": 8436 + }, + { + "epoch": 0.680897425550803, + "grad_norm": 0.7893621325492859, + "learning_rate": 0.00012518996085245925, + "loss": 2.6066, + "step": 8437 + }, + { + "epoch": 0.680978129287386, + "grad_norm": 0.7693532109260559, + "learning_rate": 0.00012517468271509998, + "loss": 2.5346, + "step": 8438 + }, + { + "epoch": 0.681058833023969, + "grad_norm": 0.7976840734481812, + "learning_rate": 0.0001251594039503218, + "loss": 2.5991, + "step": 8439 + }, + { + "epoch": 0.6811395367605521, + "grad_norm": 
0.7671225666999817, + "learning_rate": 0.00012514412455850554, + "loss": 2.5959, + "step": 8440 + }, + { + "epoch": 0.681220240497135, + "grad_norm": 0.7143450975418091, + "learning_rate": 0.00012512884454003194, + "loss": 2.5828, + "step": 8441 + }, + { + "epoch": 0.681300944233718, + "grad_norm": 0.6821861863136292, + "learning_rate": 0.00012511356389528192, + "loss": 2.5908, + "step": 8442 + }, + { + "epoch": 0.681381647970301, + "grad_norm": 0.7279960513114929, + "learning_rate": 0.00012509828262463615, + "loss": 2.578, + "step": 8443 + }, + { + "epoch": 0.6814623517068841, + "grad_norm": 0.6503065824508667, + "learning_rate": 0.0001250830007284756, + "loss": 2.525, + "step": 8444 + }, + { + "epoch": 0.681543055443467, + "grad_norm": 0.7276029586791992, + "learning_rate": 0.00012506771820718112, + "loss": 2.584, + "step": 8445 + }, + { + "epoch": 0.68162375918005, + "grad_norm": 0.7635578513145447, + "learning_rate": 0.00012505243506113356, + "loss": 2.627, + "step": 8446 + }, + { + "epoch": 0.681704462916633, + "grad_norm": 0.7086981534957886, + "learning_rate": 0.00012503715129071386, + "loss": 2.6164, + "step": 8447 + }, + { + "epoch": 0.6817851666532161, + "grad_norm": 0.7144165635108948, + "learning_rate": 0.00012502186689630285, + "loss": 2.5642, + "step": 8448 + }, + { + "epoch": 0.6818658703897991, + "grad_norm": 0.8135093450546265, + "learning_rate": 0.00012500658187828155, + "loss": 2.6161, + "step": 8449 + }, + { + "epoch": 0.681946574126382, + "grad_norm": 0.7223377227783203, + "learning_rate": 0.00012499129623703086, + "loss": 2.6192, + "step": 8450 + }, + { + "epoch": 0.682027277862965, + "grad_norm": 0.7189127206802368, + "learning_rate": 0.00012497600997293172, + "loss": 2.6086, + "step": 8451 + }, + { + "epoch": 0.6821079815995481, + "grad_norm": 0.6742144823074341, + "learning_rate": 0.00012496072308636514, + "loss": 2.5747, + "step": 8452 + }, + { + "epoch": 0.6821886853361311, + "grad_norm": 0.7432419657707214, + "learning_rate": 
0.0001249454355777121, + "loss": 2.5687, + "step": 8453 + }, + { + "epoch": 0.6822693890727141, + "grad_norm": 0.6140317320823669, + "learning_rate": 0.00012493014744735357, + "loss": 2.5371, + "step": 8454 + }, + { + "epoch": 0.682350092809297, + "grad_norm": 0.7215768098831177, + "learning_rate": 0.0001249148586956706, + "loss": 2.6806, + "step": 8455 + }, + { + "epoch": 0.6824307965458801, + "grad_norm": 0.7485790252685547, + "learning_rate": 0.0001248995693230442, + "loss": 2.575, + "step": 8456 + }, + { + "epoch": 0.6825115002824631, + "grad_norm": 0.744349479675293, + "learning_rate": 0.00012488427932985552, + "loss": 2.5961, + "step": 8457 + }, + { + "epoch": 0.6825922040190461, + "grad_norm": 0.6784959435462952, + "learning_rate": 0.0001248689887164855, + "loss": 2.5501, + "step": 8458 + }, + { + "epoch": 0.682672907755629, + "grad_norm": 0.6664010286331177, + "learning_rate": 0.0001248536974833153, + "loss": 2.5741, + "step": 8459 + }, + { + "epoch": 0.6827536114922121, + "grad_norm": 0.7185953259468079, + "learning_rate": 0.00012483840563072592, + "loss": 2.5875, + "step": 8460 + }, + { + "epoch": 0.6828343152287951, + "grad_norm": 0.6553035378456116, + "learning_rate": 0.00012482311315909864, + "loss": 2.5321, + "step": 8461 + }, + { + "epoch": 0.6829150189653781, + "grad_norm": 0.6713398694992065, + "learning_rate": 0.00012480782006881442, + "loss": 2.6207, + "step": 8462 + }, + { + "epoch": 0.6829957227019611, + "grad_norm": 0.6733734607696533, + "learning_rate": 0.00012479252636025452, + "loss": 2.5746, + "step": 8463 + }, + { + "epoch": 0.6830764264385442, + "grad_norm": 0.7257994413375854, + "learning_rate": 0.00012477723203380004, + "loss": 2.5837, + "step": 8464 + }, + { + "epoch": 0.6831571301751271, + "grad_norm": 0.716242253780365, + "learning_rate": 0.00012476193708983214, + "loss": 2.5611, + "step": 8465 + }, + { + "epoch": 0.6832378339117101, + "grad_norm": 0.6797829866409302, + "learning_rate": 0.0001247466415287321, + "loss": 2.5763, + 
"step": 8466 + }, + { + "epoch": 0.6833185376482931, + "grad_norm": 0.679931640625, + "learning_rate": 0.000124731345350881, + "loss": 2.606, + "step": 8467 + }, + { + "epoch": 0.6833992413848761, + "grad_norm": 0.6767866611480713, + "learning_rate": 0.00012471604855666016, + "loss": 2.5682, + "step": 8468 + }, + { + "epoch": 0.6834799451214592, + "grad_norm": 0.7297048568725586, + "learning_rate": 0.00012470075114645078, + "loss": 2.5527, + "step": 8469 + }, + { + "epoch": 0.6835606488580421, + "grad_norm": 0.6882644295692444, + "learning_rate": 0.0001246854531206341, + "loss": 2.5712, + "step": 8470 + }, + { + "epoch": 0.6836413525946251, + "grad_norm": 0.7129159569740295, + "learning_rate": 0.00012467015447959143, + "loss": 2.5627, + "step": 8471 + }, + { + "epoch": 0.6837220563312081, + "grad_norm": 0.6671481728553772, + "learning_rate": 0.000124654855223704, + "loss": 2.6226, + "step": 8472 + }, + { + "epoch": 0.6838027600677912, + "grad_norm": 0.7096946835517883, + "learning_rate": 0.00012463955535335313, + "loss": 2.5373, + "step": 8473 + }, + { + "epoch": 0.6838834638043741, + "grad_norm": 0.6781395077705383, + "learning_rate": 0.00012462425486892012, + "loss": 2.5607, + "step": 8474 + }, + { + "epoch": 0.6839641675409571, + "grad_norm": 0.6777891516685486, + "learning_rate": 0.00012460895377078632, + "loss": 2.5991, + "step": 8475 + }, + { + "epoch": 0.6840448712775401, + "grad_norm": 0.7175275087356567, + "learning_rate": 0.00012459365205933306, + "loss": 2.6006, + "step": 8476 + }, + { + "epoch": 0.6841255750141232, + "grad_norm": 0.6832807660102844, + "learning_rate": 0.00012457834973494174, + "loss": 2.5757, + "step": 8477 + }, + { + "epoch": 0.6842062787507062, + "grad_norm": 0.7002938985824585, + "learning_rate": 0.00012456304679799366, + "loss": 2.554, + "step": 8478 + }, + { + "epoch": 0.6842869824872891, + "grad_norm": 0.7236241698265076, + "learning_rate": 0.00012454774324887027, + "loss": 2.6054, + "step": 8479 + }, + { + "epoch": 
0.6843676862238721, + "grad_norm": 0.7327216267585754, + "learning_rate": 0.00012453243908795288, + "loss": 2.6101, + "step": 8480 + }, + { + "epoch": 0.6844483899604552, + "grad_norm": 0.7414156794548035, + "learning_rate": 0.00012451713431562306, + "loss": 2.5505, + "step": 8481 + }, + { + "epoch": 0.6845290936970382, + "grad_norm": 0.697795569896698, + "learning_rate": 0.00012450182893226214, + "loss": 2.539, + "step": 8482 + }, + { + "epoch": 0.6846097974336212, + "grad_norm": 0.7053593397140503, + "learning_rate": 0.00012448652293825158, + "loss": 2.6045, + "step": 8483 + }, + { + "epoch": 0.6846905011702041, + "grad_norm": 0.6710856556892395, + "learning_rate": 0.00012447121633397287, + "loss": 2.554, + "step": 8484 + }, + { + "epoch": 0.6847712049067872, + "grad_norm": 0.754454493522644, + "learning_rate": 0.0001244559091198075, + "loss": 2.5523, + "step": 8485 + }, + { + "epoch": 0.6848519086433702, + "grad_norm": 0.6468656659126282, + "learning_rate": 0.0001244406012961369, + "loss": 2.5931, + "step": 8486 + }, + { + "epoch": 0.6849326123799532, + "grad_norm": 0.7169063091278076, + "learning_rate": 0.00012442529286334266, + "loss": 2.5743, + "step": 8487 + }, + { + "epoch": 0.6850133161165362, + "grad_norm": 0.6737040877342224, + "learning_rate": 0.00012440998382180627, + "loss": 2.5734, + "step": 8488 + }, + { + "epoch": 0.6850940198531192, + "grad_norm": 0.7026428580284119, + "learning_rate": 0.0001243946741719093, + "loss": 2.4994, + "step": 8489 + }, + { + "epoch": 0.6851747235897022, + "grad_norm": 0.7378512024879456, + "learning_rate": 0.00012437936391403322, + "loss": 2.5611, + "step": 8490 + }, + { + "epoch": 0.6852554273262852, + "grad_norm": 0.7379863262176514, + "learning_rate": 0.0001243640530485597, + "loss": 2.538, + "step": 8491 + }, + { + "epoch": 0.6853361310628682, + "grad_norm": 0.68398118019104, + "learning_rate": 0.00012434874157587027, + "loss": 2.5593, + "step": 8492 + }, + { + "epoch": 0.6854168347994513, + "grad_norm": 
0.6780444383621216, + "learning_rate": 0.0001243334294963466, + "loss": 2.5068, + "step": 8493 + }, + { + "epoch": 0.6854975385360342, + "grad_norm": 0.7425427436828613, + "learning_rate": 0.0001243181168103702, + "loss": 2.6607, + "step": 8494 + }, + { + "epoch": 0.6855782422726172, + "grad_norm": 0.7563300132751465, + "learning_rate": 0.0001243028035183228, + "loss": 2.5915, + "step": 8495 + }, + { + "epoch": 0.6856589460092002, + "grad_norm": 0.6746618151664734, + "learning_rate": 0.000124287489620586, + "loss": 2.5399, + "step": 8496 + }, + { + "epoch": 0.6857396497457833, + "grad_norm": 0.7100487947463989, + "learning_rate": 0.00012427217511754146, + "loss": 2.5927, + "step": 8497 + }, + { + "epoch": 0.6858203534823663, + "grad_norm": 0.6487080454826355, + "learning_rate": 0.00012425686000957088, + "loss": 2.5582, + "step": 8498 + }, + { + "epoch": 0.6859010572189492, + "grad_norm": 0.6577199697494507, + "learning_rate": 0.00012424154429705592, + "loss": 2.5589, + "step": 8499 + }, + { + "epoch": 0.6859817609555322, + "grad_norm": 0.6748726963996887, + "learning_rate": 0.00012422622798037832, + "loss": 2.5651, + "step": 8500 + }, + { + "epoch": 0.6860624646921153, + "grad_norm": 0.7159377336502075, + "learning_rate": 0.0001242109110599198, + "loss": 2.569, + "step": 8501 + }, + { + "epoch": 0.6861431684286983, + "grad_norm": 0.6772934198379517, + "learning_rate": 0.00012419559353606208, + "loss": 2.5533, + "step": 8502 + }, + { + "epoch": 0.6862238721652812, + "grad_norm": 0.6776062846183777, + "learning_rate": 0.00012418027540918693, + "loss": 2.5704, + "step": 8503 + }, + { + "epoch": 0.6863045759018642, + "grad_norm": 0.7009913921356201, + "learning_rate": 0.00012416495667967608, + "loss": 2.5928, + "step": 8504 + }, + { + "epoch": 0.6863852796384473, + "grad_norm": 0.607571005821228, + "learning_rate": 0.00012414963734791137, + "loss": 2.5459, + "step": 8505 + }, + { + "epoch": 0.6864659833750303, + "grad_norm": 0.6798292398452759, + "learning_rate": 
0.00012413431741427458, + "loss": 2.6585, + "step": 8506 + }, + { + "epoch": 0.6865466871116133, + "grad_norm": 0.7892771363258362, + "learning_rate": 0.00012411899687914747, + "loss": 2.5781, + "step": 8507 + }, + { + "epoch": 0.6866273908481962, + "grad_norm": 0.6683816909790039, + "learning_rate": 0.00012410367574291199, + "loss": 2.5598, + "step": 8508 + }, + { + "epoch": 0.6867080945847793, + "grad_norm": 0.7591805458068848, + "learning_rate": 0.00012408835400594983, + "loss": 2.6478, + "step": 8509 + }, + { + "epoch": 0.6867887983213623, + "grad_norm": 0.6896353960037231, + "learning_rate": 0.00012407303166864293, + "loss": 2.5418, + "step": 8510 + }, + { + "epoch": 0.6868695020579453, + "grad_norm": 0.6657233834266663, + "learning_rate": 0.00012405770873137316, + "loss": 2.5753, + "step": 8511 + }, + { + "epoch": 0.6869502057945283, + "grad_norm": 0.6775455474853516, + "learning_rate": 0.00012404238519452237, + "loss": 2.4902, + "step": 8512 + }, + { + "epoch": 0.6870309095311113, + "grad_norm": 0.6572847962379456, + "learning_rate": 0.00012402706105847254, + "loss": 2.6189, + "step": 8513 + }, + { + "epoch": 0.6871116132676943, + "grad_norm": 0.7159940004348755, + "learning_rate": 0.00012401173632360557, + "loss": 2.5928, + "step": 8514 + }, + { + "epoch": 0.6871923170042773, + "grad_norm": 0.7178850173950195, + "learning_rate": 0.0001239964109903033, + "loss": 2.5342, + "step": 8515 + }, + { + "epoch": 0.6872730207408603, + "grad_norm": 0.6761649250984192, + "learning_rate": 0.00012398108505894774, + "loss": 2.5716, + "step": 8516 + }, + { + "epoch": 0.6873537244774433, + "grad_norm": 0.6831200122833252, + "learning_rate": 0.0001239657585299209, + "loss": 2.5506, + "step": 8517 + }, + { + "epoch": 0.6874344282140263, + "grad_norm": 0.7064316868782043, + "learning_rate": 0.00012395043140360468, + "loss": 2.541, + "step": 8518 + }, + { + "epoch": 0.6875151319506093, + "grad_norm": 0.7269963026046753, + "learning_rate": 0.00012393510368038113, + "loss": 
2.541, + "step": 8519 + }, + { + "epoch": 0.6875958356871923, + "grad_norm": 0.6651471257209778, + "learning_rate": 0.00012391977536063218, + "loss": 2.5476, + "step": 8520 + }, + { + "epoch": 0.6876765394237753, + "grad_norm": 0.7649257779121399, + "learning_rate": 0.00012390444644473994, + "loss": 2.601, + "step": 8521 + }, + { + "epoch": 0.6877572431603584, + "grad_norm": 0.6637376546859741, + "learning_rate": 0.0001238891169330864, + "loss": 2.5582, + "step": 8522 + }, + { + "epoch": 0.6878379468969413, + "grad_norm": 0.6609189510345459, + "learning_rate": 0.0001238737868260536, + "loss": 2.5795, + "step": 8523 + }, + { + "epoch": 0.6879186506335243, + "grad_norm": 0.657494843006134, + "learning_rate": 0.00012385845612402363, + "loss": 2.6005, + "step": 8524 + }, + { + "epoch": 0.6879993543701073, + "grad_norm": 0.6780641674995422, + "learning_rate": 0.00012384312482737858, + "loss": 2.514, + "step": 8525 + }, + { + "epoch": 0.6880800581066904, + "grad_norm": 0.7310795187950134, + "learning_rate": 0.00012382779293650052, + "loss": 2.5707, + "step": 8526 + }, + { + "epoch": 0.6881607618432733, + "grad_norm": 0.6722557544708252, + "learning_rate": 0.0001238124604517716, + "loss": 2.5897, + "step": 8527 + }, + { + "epoch": 0.6882414655798563, + "grad_norm": 0.6502346992492676, + "learning_rate": 0.0001237971273735739, + "loss": 2.5554, + "step": 8528 + }, + { + "epoch": 0.6883221693164393, + "grad_norm": 0.6993897557258606, + "learning_rate": 0.0001237817937022896, + "loss": 2.6328, + "step": 8529 + }, + { + "epoch": 0.6884028730530224, + "grad_norm": 0.7069644331932068, + "learning_rate": 0.00012376645943830083, + "loss": 2.5957, + "step": 8530 + }, + { + "epoch": 0.6884835767896054, + "grad_norm": 0.7193333506584167, + "learning_rate": 0.00012375112458198973, + "loss": 2.6505, + "step": 8531 + }, + { + "epoch": 0.6885642805261883, + "grad_norm": 0.6821088194847107, + "learning_rate": 0.00012373578913373853, + "loss": 2.6129, + "step": 8532 + }, + { + "epoch": 
0.6886449842627713, + "grad_norm": 0.6499428749084473, + "learning_rate": 0.00012372045309392947, + "loss": 2.6053, + "step": 8533 + }, + { + "epoch": 0.6887256879993544, + "grad_norm": 0.7469449639320374, + "learning_rate": 0.00012370511646294464, + "loss": 2.6423, + "step": 8534 + }, + { + "epoch": 0.6888063917359374, + "grad_norm": 0.7326325178146362, + "learning_rate": 0.00012368977924116637, + "loss": 2.5708, + "step": 8535 + }, + { + "epoch": 0.6888870954725204, + "grad_norm": 0.7459580302238464, + "learning_rate": 0.00012367444142897686, + "loss": 2.544, + "step": 8536 + }, + { + "epoch": 0.6889677992091033, + "grad_norm": 0.7198929786682129, + "learning_rate": 0.00012365910302675843, + "loss": 2.6295, + "step": 8537 + }, + { + "epoch": 0.6890485029456864, + "grad_norm": 0.8139802813529968, + "learning_rate": 0.0001236437640348933, + "loss": 2.549, + "step": 8538 + }, + { + "epoch": 0.6891292066822694, + "grad_norm": 0.6497162580490112, + "learning_rate": 0.00012362842445376372, + "loss": 2.5849, + "step": 8539 + }, + { + "epoch": 0.6892099104188524, + "grad_norm": 0.7378165125846863, + "learning_rate": 0.00012361308428375208, + "loss": 2.606, + "step": 8540 + }, + { + "epoch": 0.6892906141554354, + "grad_norm": 0.6807567477226257, + "learning_rate": 0.00012359774352524062, + "loss": 2.5892, + "step": 8541 + }, + { + "epoch": 0.6893713178920184, + "grad_norm": 0.6639370918273926, + "learning_rate": 0.0001235824021786117, + "loss": 2.5249, + "step": 8542 + }, + { + "epoch": 0.6894520216286014, + "grad_norm": 0.7140880823135376, + "learning_rate": 0.00012356706024424773, + "loss": 2.5877, + "step": 8543 + }, + { + "epoch": 0.6895327253651844, + "grad_norm": 0.7079257965087891, + "learning_rate": 0.00012355171772253097, + "loss": 2.6011, + "step": 8544 + }, + { + "epoch": 0.6896134291017674, + "grad_norm": 0.7150856852531433, + "learning_rate": 0.00012353637461384387, + "loss": 2.549, + "step": 8545 + }, + { + "epoch": 0.6896941328383505, + "grad_norm": 
0.6896397471427917, + "learning_rate": 0.00012352103091856876, + "loss": 2.5452, + "step": 8546 + }, + { + "epoch": 0.6897748365749334, + "grad_norm": 0.696964681148529, + "learning_rate": 0.00012350568663708808, + "loss": 2.5075, + "step": 8547 + }, + { + "epoch": 0.6898555403115164, + "grad_norm": 0.6926069855690002, + "learning_rate": 0.00012349034176978427, + "loss": 2.5905, + "step": 8548 + }, + { + "epoch": 0.6899362440480994, + "grad_norm": 0.6949423551559448, + "learning_rate": 0.00012347499631703968, + "loss": 2.5284, + "step": 8549 + }, + { + "epoch": 0.6900169477846825, + "grad_norm": 0.6480536460876465, + "learning_rate": 0.0001234596502792369, + "loss": 2.5713, + "step": 8550 + }, + { + "epoch": 0.6900976515212655, + "grad_norm": 0.6990019679069519, + "learning_rate": 0.00012344430365675825, + "loss": 2.5826, + "step": 8551 + }, + { + "epoch": 0.6901783552578484, + "grad_norm": 0.7063903212547302, + "learning_rate": 0.00012342895644998627, + "loss": 2.5271, + "step": 8552 + }, + { + "epoch": 0.6902590589944314, + "grad_norm": 0.7037132978439331, + "learning_rate": 0.0001234136086593035, + "loss": 2.5855, + "step": 8553 + }, + { + "epoch": 0.6903397627310145, + "grad_norm": 0.679701030254364, + "learning_rate": 0.00012339826028509235, + "loss": 2.5577, + "step": 8554 + }, + { + "epoch": 0.6904204664675975, + "grad_norm": 0.7088965773582458, + "learning_rate": 0.0001233829113277354, + "loss": 2.5767, + "step": 8555 + }, + { + "epoch": 0.6905011702041804, + "grad_norm": 0.7115551829338074, + "learning_rate": 0.00012336756178761517, + "loss": 2.5651, + "step": 8556 + }, + { + "epoch": 0.6905818739407634, + "grad_norm": 0.6778836250305176, + "learning_rate": 0.00012335221166511425, + "loss": 2.6388, + "step": 8557 + }, + { + "epoch": 0.6906625776773465, + "grad_norm": 0.6358879804611206, + "learning_rate": 0.00012333686096061515, + "loss": 2.5493, + "step": 8558 + }, + { + "epoch": 0.6907432814139295, + "grad_norm": 0.688197135925293, + "learning_rate": 
0.00012332150967450046, + "loss": 2.5707, + "step": 8559 + }, + { + "epoch": 0.6908239851505125, + "grad_norm": 0.6931524872779846, + "learning_rate": 0.0001233061578071528, + "loss": 2.5561, + "step": 8560 + }, + { + "epoch": 0.6909046888870954, + "grad_norm": 0.6684975624084473, + "learning_rate": 0.00012329080535895478, + "loss": 2.6442, + "step": 8561 + }, + { + "epoch": 0.6909853926236785, + "grad_norm": 0.6865811347961426, + "learning_rate": 0.00012327545233028898, + "loss": 2.564, + "step": 8562 + }, + { + "epoch": 0.6910660963602615, + "grad_norm": 0.6999006867408752, + "learning_rate": 0.0001232600987215381, + "loss": 2.5607, + "step": 8563 + }, + { + "epoch": 0.6911468000968445, + "grad_norm": 0.6734526753425598, + "learning_rate": 0.0001232447445330847, + "loss": 2.5261, + "step": 8564 + }, + { + "epoch": 0.6912275038334275, + "grad_norm": 0.7447343468666077, + "learning_rate": 0.00012322938976531153, + "loss": 2.5359, + "step": 8565 + }, + { + "epoch": 0.6913082075700105, + "grad_norm": 0.6498517394065857, + "learning_rate": 0.00012321403441860126, + "loss": 2.5345, + "step": 8566 + }, + { + "epoch": 0.6913889113065935, + "grad_norm": 0.692933976650238, + "learning_rate": 0.00012319867849333658, + "loss": 2.6293, + "step": 8567 + }, + { + "epoch": 0.6914696150431765, + "grad_norm": 0.728430449962616, + "learning_rate": 0.00012318332198990015, + "loss": 2.618, + "step": 8568 + }, + { + "epoch": 0.6915503187797595, + "grad_norm": 0.7029061913490295, + "learning_rate": 0.00012316796490867478, + "loss": 2.6151, + "step": 8569 + }, + { + "epoch": 0.6916310225163425, + "grad_norm": 0.6692330241203308, + "learning_rate": 0.00012315260725004313, + "loss": 2.5511, + "step": 8570 + }, + { + "epoch": 0.6917117262529255, + "grad_norm": 0.6811983585357666, + "learning_rate": 0.000123137249014388, + "loss": 2.6337, + "step": 8571 + }, + { + "epoch": 0.6917924299895085, + "grad_norm": 0.7387441992759705, + "learning_rate": 0.00012312189020209212, + "loss": 2.5679, + 
"step": 8572 + }, + { + "epoch": 0.6918731337260915, + "grad_norm": 0.7180185914039612, + "learning_rate": 0.0001231065308135383, + "loss": 2.639, + "step": 8573 + }, + { + "epoch": 0.6919538374626745, + "grad_norm": 0.6997829079627991, + "learning_rate": 0.00012309117084910936, + "loss": 2.5392, + "step": 8574 + }, + { + "epoch": 0.6920345411992576, + "grad_norm": 0.7004552483558655, + "learning_rate": 0.00012307581030918807, + "loss": 2.6033, + "step": 8575 + }, + { + "epoch": 0.6921152449358405, + "grad_norm": 0.7183418273925781, + "learning_rate": 0.00012306044919415724, + "loss": 2.6302, + "step": 8576 + }, + { + "epoch": 0.6921959486724235, + "grad_norm": 0.6645712852478027, + "learning_rate": 0.00012304508750439976, + "loss": 2.5401, + "step": 8577 + }, + { + "epoch": 0.6922766524090065, + "grad_norm": 0.6455898284912109, + "learning_rate": 0.00012302972524029848, + "loss": 2.5084, + "step": 8578 + }, + { + "epoch": 0.6923573561455896, + "grad_norm": 0.6933849453926086, + "learning_rate": 0.00012301436240223622, + "loss": 2.5734, + "step": 8579 + }, + { + "epoch": 0.6924380598821726, + "grad_norm": 0.7967655658721924, + "learning_rate": 0.00012299899899059587, + "loss": 2.5721, + "step": 8580 + }, + { + "epoch": 0.6925187636187555, + "grad_norm": 0.706730306148529, + "learning_rate": 0.0001229836350057604, + "loss": 2.6216, + "step": 8581 + }, + { + "epoch": 0.6925994673553385, + "grad_norm": 0.7021105885505676, + "learning_rate": 0.0001229682704481126, + "loss": 2.4877, + "step": 8582 + }, + { + "epoch": 0.6926801710919216, + "grad_norm": 0.7197253108024597, + "learning_rate": 0.00012295290531803553, + "loss": 2.6124, + "step": 8583 + }, + { + "epoch": 0.6927608748285046, + "grad_norm": 0.7559605836868286, + "learning_rate": 0.00012293753961591198, + "loss": 2.6391, + "step": 8584 + }, + { + "epoch": 0.6928415785650875, + "grad_norm": 0.7074676752090454, + "learning_rate": 0.00012292217334212505, + "loss": 2.5949, + "step": 8585 + }, + { + "epoch": 
0.6929222823016705, + "grad_norm": 0.6843528747558594, + "learning_rate": 0.00012290680649705763, + "loss": 2.4981, + "step": 8586 + }, + { + "epoch": 0.6930029860382536, + "grad_norm": 0.6853117942810059, + "learning_rate": 0.00012289143908109266, + "loss": 2.6352, + "step": 8587 + }, + { + "epoch": 0.6930836897748366, + "grad_norm": 0.6545630097389221, + "learning_rate": 0.00012287607109461325, + "loss": 2.5344, + "step": 8588 + }, + { + "epoch": 0.6931643935114196, + "grad_norm": 0.7377945184707642, + "learning_rate": 0.00012286070253800233, + "loss": 2.5895, + "step": 8589 + }, + { + "epoch": 0.6932450972480025, + "grad_norm": 0.6919971108436584, + "learning_rate": 0.00012284533341164295, + "loss": 2.5825, + "step": 8590 + }, + { + "epoch": 0.6933258009845856, + "grad_norm": 0.6911910176277161, + "learning_rate": 0.00012282996371591816, + "loss": 2.6008, + "step": 8591 + }, + { + "epoch": 0.6934065047211686, + "grad_norm": 0.7486373782157898, + "learning_rate": 0.00012281459345121095, + "loss": 2.6056, + "step": 8592 + }, + { + "epoch": 0.6934872084577516, + "grad_norm": 0.6829040050506592, + "learning_rate": 0.00012279922261790443, + "loss": 2.5161, + "step": 8593 + }, + { + "epoch": 0.6935679121943346, + "grad_norm": 0.7410104870796204, + "learning_rate": 0.00012278385121638173, + "loss": 2.6114, + "step": 8594 + }, + { + "epoch": 0.6936486159309176, + "grad_norm": 0.7355940937995911, + "learning_rate": 0.00012276847924702587, + "loss": 2.6371, + "step": 8595 + }, + { + "epoch": 0.6937293196675006, + "grad_norm": 0.650641679763794, + "learning_rate": 0.00012275310671022003, + "loss": 2.5568, + "step": 8596 + }, + { + "epoch": 0.6938100234040836, + "grad_norm": 0.661573052406311, + "learning_rate": 0.00012273773360634726, + "loss": 2.5828, + "step": 8597 + }, + { + "epoch": 0.6938907271406666, + "grad_norm": 0.6848435401916504, + "learning_rate": 0.00012272235993579072, + "loss": 2.5226, + "step": 8598 + }, + { + "epoch": 0.6939714308772497, + "grad_norm": 
0.7015430927276611, + "learning_rate": 0.0001227069856989336, + "loss": 2.6156, + "step": 8599 + }, + { + "epoch": 0.6940521346138326, + "grad_norm": 0.7058628797531128, + "learning_rate": 0.000122691610896159, + "loss": 2.6007, + "step": 8600 + }, + { + "epoch": 0.6941328383504156, + "grad_norm": 0.6589432954788208, + "learning_rate": 0.0001226762355278502, + "loss": 2.5551, + "step": 8601 + }, + { + "epoch": 0.6942135420869986, + "grad_norm": 0.6875284910202026, + "learning_rate": 0.0001226608595943903, + "loss": 2.5537, + "step": 8602 + }, + { + "epoch": 0.6942942458235817, + "grad_norm": 0.7178356051445007, + "learning_rate": 0.00012264548309616252, + "loss": 2.655, + "step": 8603 + }, + { + "epoch": 0.6943749495601647, + "grad_norm": 0.7327077388763428, + "learning_rate": 0.00012263010603355017, + "loss": 2.5574, + "step": 8604 + }, + { + "epoch": 0.6944556532967476, + "grad_norm": 0.6318337917327881, + "learning_rate": 0.0001226147284069364, + "loss": 2.577, + "step": 8605 + }, + { + "epoch": 0.6945363570333306, + "grad_norm": 0.674872875213623, + "learning_rate": 0.00012259935021670444, + "loss": 2.6225, + "step": 8606 + }, + { + "epoch": 0.6946170607699137, + "grad_norm": 0.6554198861122131, + "learning_rate": 0.0001225839714632376, + "loss": 2.5951, + "step": 8607 + }, + { + "epoch": 0.6946977645064967, + "grad_norm": 0.7086453437805176, + "learning_rate": 0.00012256859214691918, + "loss": 2.622, + "step": 8608 + }, + { + "epoch": 0.6947784682430796, + "grad_norm": 0.6609488129615784, + "learning_rate": 0.00012255321226813245, + "loss": 2.5623, + "step": 8609 + }, + { + "epoch": 0.6948591719796626, + "grad_norm": 0.7504609823226929, + "learning_rate": 0.00012253783182726075, + "loss": 2.5264, + "step": 8610 + }, + { + "epoch": 0.6949398757162457, + "grad_norm": 0.6702934503555298, + "learning_rate": 0.00012252245082468733, + "loss": 2.5877, + "step": 8611 + }, + { + "epoch": 0.6950205794528287, + "grad_norm": 0.7116326689720154, + "learning_rate": 
0.00012250706926079553, + "loss": 2.5629, + "step": 8612 + }, + { + "epoch": 0.6951012831894117, + "grad_norm": 0.7495368719100952, + "learning_rate": 0.00012249168713596875, + "loss": 2.5731, + "step": 8613 + }, + { + "epoch": 0.6951819869259946, + "grad_norm": 0.7434844970703125, + "learning_rate": 0.0001224763044505904, + "loss": 2.6008, + "step": 8614 + }, + { + "epoch": 0.6952626906625777, + "grad_norm": 0.719667375087738, + "learning_rate": 0.00012246092120504371, + "loss": 2.6051, + "step": 8615 + }, + { + "epoch": 0.6953433943991607, + "grad_norm": 0.7189086079597473, + "learning_rate": 0.00012244553739971216, + "loss": 2.5662, + "step": 8616 + }, + { + "epoch": 0.6954240981357437, + "grad_norm": 0.7222673892974854, + "learning_rate": 0.00012243015303497917, + "loss": 2.609, + "step": 8617 + }, + { + "epoch": 0.6955048018723267, + "grad_norm": 0.7323142290115356, + "learning_rate": 0.00012241476811122813, + "loss": 2.5458, + "step": 8618 + }, + { + "epoch": 0.6955855056089096, + "grad_norm": 0.7374032735824585, + "learning_rate": 0.00012239938262884246, + "loss": 2.6147, + "step": 8619 + }, + { + "epoch": 0.6956662093454927, + "grad_norm": 0.6707843542098999, + "learning_rate": 0.00012238399658820562, + "loss": 2.6462, + "step": 8620 + }, + { + "epoch": 0.6957469130820757, + "grad_norm": 0.7603243589401245, + "learning_rate": 0.0001223686099897011, + "loss": 2.6295, + "step": 8621 + }, + { + "epoch": 0.6958276168186587, + "grad_norm": 0.6966906785964966, + "learning_rate": 0.00012235322283371232, + "loss": 2.545, + "step": 8622 + }, + { + "epoch": 0.6959083205552417, + "grad_norm": 0.6757891774177551, + "learning_rate": 0.0001223378351206228, + "loss": 2.5548, + "step": 8623 + }, + { + "epoch": 0.6959890242918247, + "grad_norm": 0.6901456713676453, + "learning_rate": 0.00012232244685081605, + "loss": 2.5734, + "step": 8624 + }, + { + "epoch": 0.6960697280284077, + "grad_norm": 0.6942903995513916, + "learning_rate": 0.00012230705802467558, + "loss": 2.5495, 
+ "step": 8625 + }, + { + "epoch": 0.6961504317649907, + "grad_norm": 0.6774815320968628, + "learning_rate": 0.0001222916686425849, + "loss": 2.5076, + "step": 8626 + }, + { + "epoch": 0.6962311355015737, + "grad_norm": 0.8037571310997009, + "learning_rate": 0.00012227627870492754, + "loss": 2.6737, + "step": 8627 + }, + { + "epoch": 0.6963118392381568, + "grad_norm": 0.7027560472488403, + "learning_rate": 0.0001222608882120871, + "loss": 2.5401, + "step": 8628 + }, + { + "epoch": 0.6963925429747397, + "grad_norm": 0.6651299595832825, + "learning_rate": 0.00012224549716444714, + "loss": 2.5835, + "step": 8629 + }, + { + "epoch": 0.6964732467113227, + "grad_norm": 0.7082433104515076, + "learning_rate": 0.00012223010556239124, + "loss": 2.5622, + "step": 8630 + }, + { + "epoch": 0.6965539504479057, + "grad_norm": 0.7993464469909668, + "learning_rate": 0.00012221471340630305, + "loss": 2.655, + "step": 8631 + }, + { + "epoch": 0.6966346541844888, + "grad_norm": 0.7375298142433167, + "learning_rate": 0.00012219932069656606, + "loss": 2.598, + "step": 8632 + }, + { + "epoch": 0.6967153579210718, + "grad_norm": 0.6915456652641296, + "learning_rate": 0.00012218392743356397, + "loss": 2.5649, + "step": 8633 + }, + { + "epoch": 0.6967960616576547, + "grad_norm": 0.679256021976471, + "learning_rate": 0.00012216853361768045, + "loss": 2.545, + "step": 8634 + }, + { + "epoch": 0.6968767653942377, + "grad_norm": 0.7234694361686707, + "learning_rate": 0.0001221531392492991, + "loss": 2.5863, + "step": 8635 + }, + { + "epoch": 0.6969574691308208, + "grad_norm": 0.7053319811820984, + "learning_rate": 0.00012213774432880364, + "loss": 2.5829, + "step": 8636 + }, + { + "epoch": 0.6970381728674038, + "grad_norm": 0.7584449648857117, + "learning_rate": 0.00012212234885657772, + "loss": 2.5855, + "step": 8637 + }, + { + "epoch": 0.6971188766039867, + "grad_norm": 0.7098579406738281, + "learning_rate": 0.00012210695283300501, + "loss": 2.6057, + "step": 8638 + }, + { + "epoch": 
0.6971995803405697, + "grad_norm": 0.7350205779075623, + "learning_rate": 0.00012209155625846928, + "loss": 2.546, + "step": 8639 + }, + { + "epoch": 0.6972802840771528, + "grad_norm": 0.6842331290245056, + "learning_rate": 0.0001220761591333542, + "loss": 2.5602, + "step": 8640 + }, + { + "epoch": 0.6973609878137358, + "grad_norm": 0.6731252074241638, + "learning_rate": 0.00012206076145804354, + "loss": 2.4676, + "step": 8641 + }, + { + "epoch": 0.6974416915503188, + "grad_norm": 0.7271167635917664, + "learning_rate": 0.00012204536323292104, + "loss": 2.5605, + "step": 8642 + }, + { + "epoch": 0.6975223952869017, + "grad_norm": 0.6860780715942383, + "learning_rate": 0.00012202996445837043, + "loss": 2.5041, + "step": 8643 + }, + { + "epoch": 0.6976030990234848, + "grad_norm": 0.7134578824043274, + "learning_rate": 0.00012201456513477554, + "loss": 2.614, + "step": 8644 + }, + { + "epoch": 0.6976838027600678, + "grad_norm": 0.6995248198509216, + "learning_rate": 0.00012199916526252014, + "loss": 2.5087, + "step": 8645 + }, + { + "epoch": 0.6977645064966508, + "grad_norm": 0.7280197143554688, + "learning_rate": 0.00012198376484198803, + "loss": 2.5723, + "step": 8646 + }, + { + "epoch": 0.6978452102332338, + "grad_norm": 0.6898967623710632, + "learning_rate": 0.00012196836387356306, + "loss": 2.6073, + "step": 8647 + }, + { + "epoch": 0.6979259139698168, + "grad_norm": 0.6670758128166199, + "learning_rate": 0.00012195296235762901, + "loss": 2.5276, + "step": 8648 + }, + { + "epoch": 0.6980066177063998, + "grad_norm": 0.6862780451774597, + "learning_rate": 0.00012193756029456973, + "loss": 2.5363, + "step": 8649 + }, + { + "epoch": 0.6980873214429828, + "grad_norm": 0.6568876504898071, + "learning_rate": 0.00012192215768476916, + "loss": 2.5828, + "step": 8650 + }, + { + "epoch": 0.6981680251795658, + "grad_norm": 0.7237746119499207, + "learning_rate": 0.00012190675452861107, + "loss": 2.6076, + "step": 8651 + }, + { + "epoch": 0.6982487289161489, + "grad_norm": 
0.6831536293029785, + "learning_rate": 0.00012189135082647943, + "loss": 2.5199, + "step": 8652 + }, + { + "epoch": 0.6983294326527318, + "grad_norm": 0.6767029166221619, + "learning_rate": 0.00012187594657875805, + "loss": 2.5859, + "step": 8653 + }, + { + "epoch": 0.6984101363893148, + "grad_norm": 0.6977167129516602, + "learning_rate": 0.00012186054178583092, + "loss": 2.5831, + "step": 8654 + }, + { + "epoch": 0.6984908401258978, + "grad_norm": 0.6369525194168091, + "learning_rate": 0.00012184513644808197, + "loss": 2.5839, + "step": 8655 + }, + { + "epoch": 0.6985715438624809, + "grad_norm": 0.6814634203910828, + "learning_rate": 0.00012182973056589508, + "loss": 2.5493, + "step": 8656 + }, + { + "epoch": 0.6986522475990639, + "grad_norm": 0.6895000338554382, + "learning_rate": 0.00012181432413965428, + "loss": 2.5616, + "step": 8657 + }, + { + "epoch": 0.6987329513356468, + "grad_norm": 0.6689717769622803, + "learning_rate": 0.00012179891716974345, + "loss": 2.5481, + "step": 8658 + }, + { + "epoch": 0.6988136550722298, + "grad_norm": 0.6945160031318665, + "learning_rate": 0.00012178350965654666, + "loss": 2.5781, + "step": 8659 + }, + { + "epoch": 0.6988943588088129, + "grad_norm": 0.7226110696792603, + "learning_rate": 0.00012176810160044785, + "loss": 2.5767, + "step": 8660 + }, + { + "epoch": 0.6989750625453959, + "grad_norm": 0.6810569167137146, + "learning_rate": 0.00012175269300183105, + "loss": 2.5184, + "step": 8661 + }, + { + "epoch": 0.6990557662819789, + "grad_norm": 0.727281928062439, + "learning_rate": 0.0001217372838610803, + "loss": 2.5972, + "step": 8662 + }, + { + "epoch": 0.6991364700185618, + "grad_norm": 0.7111573219299316, + "learning_rate": 0.00012172187417857959, + "loss": 2.6445, + "step": 8663 + }, + { + "epoch": 0.6992171737551449, + "grad_norm": 0.6808965802192688, + "learning_rate": 0.00012170646395471296, + "loss": 2.5191, + "step": 8664 + }, + { + "epoch": 0.6992978774917279, + "grad_norm": 0.7063688635826111, + "learning_rate": 
0.00012169105318986455, + "loss": 2.6021, + "step": 8665 + }, + { + "epoch": 0.6993785812283109, + "grad_norm": 0.6522886753082275, + "learning_rate": 0.0001216756418844184, + "loss": 2.5697, + "step": 8666 + }, + { + "epoch": 0.6994592849648938, + "grad_norm": 0.6706095337867737, + "learning_rate": 0.00012166023003875859, + "loss": 2.5706, + "step": 8667 + }, + { + "epoch": 0.6995399887014769, + "grad_norm": 0.6744416356086731, + "learning_rate": 0.00012164481765326923, + "loss": 2.5713, + "step": 8668 + }, + { + "epoch": 0.6996206924380599, + "grad_norm": 0.7385411858558655, + "learning_rate": 0.0001216294047283344, + "loss": 2.5543, + "step": 8669 + }, + { + "epoch": 0.6997013961746429, + "grad_norm": 0.7286678552627563, + "learning_rate": 0.0001216139912643383, + "loss": 2.588, + "step": 8670 + }, + { + "epoch": 0.6997820999112259, + "grad_norm": 0.7065937519073486, + "learning_rate": 0.00012159857726166503, + "loss": 2.5475, + "step": 8671 + }, + { + "epoch": 0.6998628036478088, + "grad_norm": 0.6609788537025452, + "learning_rate": 0.00012158316272069874, + "loss": 2.5664, + "step": 8672 + }, + { + "epoch": 0.6999435073843919, + "grad_norm": 0.7360579371452332, + "learning_rate": 0.00012156774764182364, + "loss": 2.5822, + "step": 8673 + }, + { + "epoch": 0.7000242111209749, + "grad_norm": 0.6265058517456055, + "learning_rate": 0.00012155233202542384, + "loss": 2.5849, + "step": 8674 + }, + { + "epoch": 0.7001049148575579, + "grad_norm": 0.646976888179779, + "learning_rate": 0.00012153691587188363, + "loss": 2.5839, + "step": 8675 + }, + { + "epoch": 0.7001856185941409, + "grad_norm": 0.6634985208511353, + "learning_rate": 0.0001215214991815872, + "loss": 2.5434, + "step": 8676 + }, + { + "epoch": 0.700266322330724, + "grad_norm": 0.6757560968399048, + "learning_rate": 0.00012150608195491871, + "loss": 2.6186, + "step": 8677 + }, + { + "epoch": 0.7003470260673069, + "grad_norm": 0.7077112197875977, + "learning_rate": 0.00012149066419226247, + "loss": 2.5757, + 
"step": 8678 + }, + { + "epoch": 0.7004277298038899, + "grad_norm": 0.698226273059845, + "learning_rate": 0.00012147524589400268, + "loss": 2.5307, + "step": 8679 + }, + { + "epoch": 0.7005084335404729, + "grad_norm": 0.6782405376434326, + "learning_rate": 0.00012145982706052361, + "loss": 2.5582, + "step": 8680 + }, + { + "epoch": 0.700589137277056, + "grad_norm": 0.6832882165908813, + "learning_rate": 0.0001214444076922096, + "loss": 2.574, + "step": 8681 + }, + { + "epoch": 0.7006698410136389, + "grad_norm": 0.7182612419128418, + "learning_rate": 0.00012142898778944485, + "loss": 2.6457, + "step": 8682 + }, + { + "epoch": 0.7007505447502219, + "grad_norm": 0.7043644785881042, + "learning_rate": 0.00012141356735261373, + "loss": 2.5244, + "step": 8683 + }, + { + "epoch": 0.7008312484868049, + "grad_norm": 0.6942669749259949, + "learning_rate": 0.00012139814638210054, + "loss": 2.5507, + "step": 8684 + }, + { + "epoch": 0.700911952223388, + "grad_norm": 0.8412066102027893, + "learning_rate": 0.00012138272487828959, + "loss": 2.6025, + "step": 8685 + }, + { + "epoch": 0.700992655959971, + "grad_norm": 0.6906788945198059, + "learning_rate": 0.00012136730284156525, + "loss": 2.5259, + "step": 8686 + }, + { + "epoch": 0.7010733596965539, + "grad_norm": 0.7258631587028503, + "learning_rate": 0.00012135188027231188, + "loss": 2.6311, + "step": 8687 + }, + { + "epoch": 0.7011540634331369, + "grad_norm": 0.6294744610786438, + "learning_rate": 0.00012133645717091382, + "loss": 2.5969, + "step": 8688 + }, + { + "epoch": 0.70123476716972, + "grad_norm": 0.6994131207466125, + "learning_rate": 0.00012132103353775548, + "loss": 2.5954, + "step": 8689 + }, + { + "epoch": 0.701315470906303, + "grad_norm": 0.671441912651062, + "learning_rate": 0.00012130560937322124, + "loss": 2.5628, + "step": 8690 + }, + { + "epoch": 0.701396174642886, + "grad_norm": 0.6915482878684998, + "learning_rate": 0.00012129018467769555, + "loss": 2.5173, + "step": 8691 + }, + { + "epoch": 
0.7014768783794689, + "grad_norm": 0.6810318231582642, + "learning_rate": 0.00012127475945156279, + "loss": 2.6186, + "step": 8692 + }, + { + "epoch": 0.701557582116052, + "grad_norm": 0.7931910157203674, + "learning_rate": 0.00012125933369520741, + "loss": 2.6243, + "step": 8693 + }, + { + "epoch": 0.701638285852635, + "grad_norm": 0.6843162178993225, + "learning_rate": 0.00012124390740901386, + "loss": 2.6072, + "step": 8694 + }, + { + "epoch": 0.701718989589218, + "grad_norm": 0.672115683555603, + "learning_rate": 0.0001212284805933666, + "loss": 2.6027, + "step": 8695 + }, + { + "epoch": 0.7017996933258009, + "grad_norm": 0.65242600440979, + "learning_rate": 0.00012121305324865014, + "loss": 2.5128, + "step": 8696 + }, + { + "epoch": 0.701880397062384, + "grad_norm": 0.7253173589706421, + "learning_rate": 0.00012119762537524893, + "loss": 2.5776, + "step": 8697 + }, + { + "epoch": 0.701961100798967, + "grad_norm": 0.6536431312561035, + "learning_rate": 0.00012118219697354745, + "loss": 2.5656, + "step": 8698 + }, + { + "epoch": 0.70204180453555, + "grad_norm": 0.7121500372886658, + "learning_rate": 0.00012116676804393028, + "loss": 2.5878, + "step": 8699 + }, + { + "epoch": 0.702122508272133, + "grad_norm": 0.676449716091156, + "learning_rate": 0.00012115133858678191, + "loss": 2.6624, + "step": 8700 + }, + { + "epoch": 0.702203212008716, + "grad_norm": 0.7230382561683655, + "learning_rate": 0.0001211359086024869, + "loss": 2.5461, + "step": 8701 + }, + { + "epoch": 0.702283915745299, + "grad_norm": 0.6679937839508057, + "learning_rate": 0.00012112047809142979, + "loss": 2.5568, + "step": 8702 + }, + { + "epoch": 0.702364619481882, + "grad_norm": 0.6627704501152039, + "learning_rate": 0.0001211050470539952, + "loss": 2.4819, + "step": 8703 + }, + { + "epoch": 0.702445323218465, + "grad_norm": 0.6680646538734436, + "learning_rate": 0.0001210896154905676, + "loss": 2.5722, + "step": 8704 + }, + { + "epoch": 0.7025260269550481, + "grad_norm": 0.7406336665153503, + 
"learning_rate": 0.00012107418340153167, + "loss": 2.5722, + "step": 8705 + }, + { + "epoch": 0.702606730691631, + "grad_norm": 0.6634557247161865, + "learning_rate": 0.00012105875078727203, + "loss": 2.5747, + "step": 8706 + }, + { + "epoch": 0.702687434428214, + "grad_norm": 0.6521568894386292, + "learning_rate": 0.00012104331764817325, + "loss": 2.555, + "step": 8707 + }, + { + "epoch": 0.702768138164797, + "grad_norm": 0.677606463432312, + "learning_rate": 0.00012102788398461999, + "loss": 2.5544, + "step": 8708 + }, + { + "epoch": 0.7028488419013801, + "grad_norm": 0.6593700051307678, + "learning_rate": 0.0001210124497969969, + "loss": 2.5252, + "step": 8709 + }, + { + "epoch": 0.7029295456379631, + "grad_norm": 0.686903715133667, + "learning_rate": 0.00012099701508568863, + "loss": 2.6513, + "step": 8710 + }, + { + "epoch": 0.703010249374546, + "grad_norm": 0.6395620107650757, + "learning_rate": 0.00012098157985107987, + "loss": 2.5169, + "step": 8711 + }, + { + "epoch": 0.703090953111129, + "grad_norm": 0.7387555837631226, + "learning_rate": 0.00012096614409355526, + "loss": 2.5741, + "step": 8712 + }, + { + "epoch": 0.7031716568477121, + "grad_norm": 0.665900707244873, + "learning_rate": 0.00012095070781349957, + "loss": 2.5068, + "step": 8713 + }, + { + "epoch": 0.7032523605842951, + "grad_norm": 0.6983458399772644, + "learning_rate": 0.00012093527101129745, + "loss": 2.5028, + "step": 8714 + }, + { + "epoch": 0.703333064320878, + "grad_norm": 0.6250826120376587, + "learning_rate": 0.00012091983368733366, + "loss": 2.5765, + "step": 8715 + }, + { + "epoch": 0.703413768057461, + "grad_norm": 0.7031501531600952, + "learning_rate": 0.00012090439584199294, + "loss": 2.5885, + "step": 8716 + }, + { + "epoch": 0.7034944717940441, + "grad_norm": 0.7140926122665405, + "learning_rate": 0.00012088895747566002, + "loss": 2.6278, + "step": 8717 + }, + { + "epoch": 0.7035751755306271, + "grad_norm": 0.6753602027893066, + "learning_rate": 0.00012087351858871969, + 
"loss": 2.5664, + "step": 8718 + }, + { + "epoch": 0.7036558792672101, + "grad_norm": 0.7150039076805115, + "learning_rate": 0.0001208580791815567, + "loss": 2.6739, + "step": 8719 + }, + { + "epoch": 0.703736583003793, + "grad_norm": 0.7120389342308044, + "learning_rate": 0.00012084263925455583, + "loss": 2.565, + "step": 8720 + }, + { + "epoch": 0.703817286740376, + "grad_norm": 0.7775784134864807, + "learning_rate": 0.00012082719880810194, + "loss": 2.5861, + "step": 8721 + }, + { + "epoch": 0.7038979904769591, + "grad_norm": 0.6704322695732117, + "learning_rate": 0.0001208117578425798, + "loss": 2.5957, + "step": 8722 + }, + { + "epoch": 0.7039786942135421, + "grad_norm": 0.6761276721954346, + "learning_rate": 0.00012079631635837426, + "loss": 2.5472, + "step": 8723 + }, + { + "epoch": 0.7040593979501251, + "grad_norm": 0.7639868855476379, + "learning_rate": 0.00012078087435587016, + "loss": 2.6053, + "step": 8724 + }, + { + "epoch": 0.704140101686708, + "grad_norm": 0.7490074038505554, + "learning_rate": 0.0001207654318354523, + "loss": 2.5517, + "step": 8725 + }, + { + "epoch": 0.7042208054232911, + "grad_norm": 0.7068852782249451, + "learning_rate": 0.00012074998879750566, + "loss": 2.5357, + "step": 8726 + }, + { + "epoch": 0.7043015091598741, + "grad_norm": 0.7273775935173035, + "learning_rate": 0.00012073454524241503, + "loss": 2.6028, + "step": 8727 + }, + { + "epoch": 0.7043822128964571, + "grad_norm": 0.7146363258361816, + "learning_rate": 0.00012071910117056533, + "loss": 2.5982, + "step": 8728 + }, + { + "epoch": 0.7044629166330401, + "grad_norm": 0.7631390690803528, + "learning_rate": 0.00012070365658234149, + "loss": 2.6021, + "step": 8729 + }, + { + "epoch": 0.7045436203696231, + "grad_norm": 0.7065283060073853, + "learning_rate": 0.00012068821147812839, + "loss": 2.5538, + "step": 8730 + }, + { + "epoch": 0.7046243241062061, + "grad_norm": 0.7914319634437561, + "learning_rate": 0.00012067276585831097, + "loss": 2.5617, + "step": 8731 + }, + { + 
"epoch": 0.7047050278427891, + "grad_norm": 0.7036565542221069, + "learning_rate": 0.0001206573197232742, + "loss": 2.5354, + "step": 8732 + }, + { + "epoch": 0.7047857315793721, + "grad_norm": 0.657116711139679, + "learning_rate": 0.00012064187307340303, + "loss": 2.5084, + "step": 8733 + }, + { + "epoch": 0.7048664353159552, + "grad_norm": 0.7246817946434021, + "learning_rate": 0.00012062642590908242, + "loss": 2.5737, + "step": 8734 + }, + { + "epoch": 0.7049471390525381, + "grad_norm": 0.6895857453346252, + "learning_rate": 0.00012061097823069736, + "loss": 2.5792, + "step": 8735 + }, + { + "epoch": 0.7050278427891211, + "grad_norm": 0.7654988169670105, + "learning_rate": 0.00012059553003863282, + "loss": 2.5302, + "step": 8736 + }, + { + "epoch": 0.7051085465257041, + "grad_norm": 0.7611668109893799, + "learning_rate": 0.00012058008133327387, + "loss": 2.6073, + "step": 8737 + }, + { + "epoch": 0.7051892502622872, + "grad_norm": 0.728729784488678, + "learning_rate": 0.00012056463211500546, + "loss": 2.5714, + "step": 8738 + }, + { + "epoch": 0.7052699539988702, + "grad_norm": 0.7251634001731873, + "learning_rate": 0.00012054918238421271, + "loss": 2.627, + "step": 8739 + }, + { + "epoch": 0.7053506577354531, + "grad_norm": 0.827745795249939, + "learning_rate": 0.00012053373214128056, + "loss": 2.6303, + "step": 8740 + }, + { + "epoch": 0.7054313614720361, + "grad_norm": 0.6837510466575623, + "learning_rate": 0.00012051828138659416, + "loss": 2.5837, + "step": 8741 + }, + { + "epoch": 0.7055120652086192, + "grad_norm": 0.6763553619384766, + "learning_rate": 0.00012050283012053856, + "loss": 2.575, + "step": 8742 + }, + { + "epoch": 0.7055927689452022, + "grad_norm": 0.6779605150222778, + "learning_rate": 0.00012048737834349886, + "loss": 2.588, + "step": 8743 + }, + { + "epoch": 0.7056734726817852, + "grad_norm": 0.7207251191139221, + "learning_rate": 0.00012047192605586008, + "loss": 2.6182, + "step": 8744 + }, + { + "epoch": 0.7057541764183681, + "grad_norm": 
0.6681165099143982, + "learning_rate": 0.00012045647325800742, + "loss": 2.5595, + "step": 8745 + }, + { + "epoch": 0.7058348801549512, + "grad_norm": 0.7520970702171326, + "learning_rate": 0.00012044101995032594, + "loss": 2.6306, + "step": 8746 + }, + { + "epoch": 0.7059155838915342, + "grad_norm": 0.7148429155349731, + "learning_rate": 0.00012042556613320087, + "loss": 2.5749, + "step": 8747 + }, + { + "epoch": 0.7059962876281172, + "grad_norm": 0.619369626045227, + "learning_rate": 0.00012041011180701729, + "loss": 2.5382, + "step": 8748 + }, + { + "epoch": 0.7060769913647001, + "grad_norm": 0.7450816035270691, + "learning_rate": 0.00012039465697216032, + "loss": 2.5547, + "step": 8749 + }, + { + "epoch": 0.7061576951012832, + "grad_norm": 0.7324537634849548, + "learning_rate": 0.00012037920162901521, + "loss": 2.5756, + "step": 8750 + }, + { + "epoch": 0.7062383988378662, + "grad_norm": 0.7881754636764526, + "learning_rate": 0.00012036374577796715, + "loss": 2.6376, + "step": 8751 + }, + { + "epoch": 0.7063191025744492, + "grad_norm": 0.7095965147018433, + "learning_rate": 0.00012034828941940128, + "loss": 2.5454, + "step": 8752 + }, + { + "epoch": 0.7063998063110322, + "grad_norm": 0.7142949104309082, + "learning_rate": 0.00012033283255370287, + "loss": 2.5738, + "step": 8753 + }, + { + "epoch": 0.7064805100476153, + "grad_norm": 0.6592378616333008, + "learning_rate": 0.0001203173751812571, + "loss": 2.5473, + "step": 8754 + }, + { + "epoch": 0.7065612137841982, + "grad_norm": 0.6964332461357117, + "learning_rate": 0.00012030191730244926, + "loss": 2.5829, + "step": 8755 + }, + { + "epoch": 0.7066419175207812, + "grad_norm": 0.707539975643158, + "learning_rate": 0.00012028645891766455, + "loss": 2.5652, + "step": 8756 + }, + { + "epoch": 0.7067226212573642, + "grad_norm": 0.6991387009620667, + "learning_rate": 0.00012027100002728824, + "loss": 2.5874, + "step": 8757 + }, + { + "epoch": 0.7068033249939473, + "grad_norm": 0.665746808052063, + "learning_rate": 
0.00012025554063170566, + "loss": 2.5163, + "step": 8758 + }, + { + "epoch": 0.7068840287305302, + "grad_norm": 0.696130096912384, + "learning_rate": 0.00012024008073130204, + "loss": 2.5748, + "step": 8759 + }, + { + "epoch": 0.7069647324671132, + "grad_norm": 0.698885440826416, + "learning_rate": 0.00012022462032646269, + "loss": 2.5561, + "step": 8760 + }, + { + "epoch": 0.7070454362036962, + "grad_norm": 0.7052211761474609, + "learning_rate": 0.00012020915941757292, + "loss": 2.5979, + "step": 8761 + }, + { + "epoch": 0.7071261399402793, + "grad_norm": 0.7370811104774475, + "learning_rate": 0.00012019369800501808, + "loss": 2.5623, + "step": 8762 + }, + { + "epoch": 0.7072068436768623, + "grad_norm": 0.6699148416519165, + "learning_rate": 0.00012017823608918352, + "loss": 2.5816, + "step": 8763 + }, + { + "epoch": 0.7072875474134452, + "grad_norm": 0.6712930798530579, + "learning_rate": 0.00012016277367045457, + "loss": 2.5495, + "step": 8764 + }, + { + "epoch": 0.7073682511500282, + "grad_norm": 0.7238204479217529, + "learning_rate": 0.00012014731074921659, + "loss": 2.5936, + "step": 8765 + }, + { + "epoch": 0.7074489548866113, + "grad_norm": 0.7303668856620789, + "learning_rate": 0.00012013184732585494, + "loss": 2.6366, + "step": 8766 + }, + { + "epoch": 0.7075296586231943, + "grad_norm": 0.6883132457733154, + "learning_rate": 0.00012011638340075505, + "loss": 2.534, + "step": 8767 + }, + { + "epoch": 0.7076103623597773, + "grad_norm": 0.7057133316993713, + "learning_rate": 0.00012010091897430229, + "loss": 2.6035, + "step": 8768 + }, + { + "epoch": 0.7076910660963602, + "grad_norm": 0.7069352269172668, + "learning_rate": 0.0001200854540468821, + "loss": 2.5047, + "step": 8769 + }, + { + "epoch": 0.7077717698329433, + "grad_norm": 0.7192478775978088, + "learning_rate": 0.00012006998861887985, + "loss": 2.5698, + "step": 8770 + }, + { + "epoch": 0.7078524735695263, + "grad_norm": 0.6992887854576111, + "learning_rate": 0.00012005452269068107, + "loss": 
2.5631, + "step": 8771 + }, + { + "epoch": 0.7079331773061093, + "grad_norm": 0.676154613494873, + "learning_rate": 0.00012003905626267114, + "loss": 2.5255, + "step": 8772 + }, + { + "epoch": 0.7080138810426923, + "grad_norm": 0.672269880771637, + "learning_rate": 0.00012002358933523555, + "loss": 2.5766, + "step": 8773 + }, + { + "epoch": 0.7080945847792752, + "grad_norm": 0.7334566712379456, + "learning_rate": 0.00012000812190875976, + "loss": 2.6068, + "step": 8774 + }, + { + "epoch": 0.7081752885158583, + "grad_norm": 0.6599388122558594, + "learning_rate": 0.00011999265398362931, + "loss": 2.6032, + "step": 8775 + }, + { + "epoch": 0.7082559922524413, + "grad_norm": 0.7158498167991638, + "learning_rate": 0.00011997718556022958, + "loss": 2.599, + "step": 8776 + }, + { + "epoch": 0.7083366959890243, + "grad_norm": 0.7470360994338989, + "learning_rate": 0.00011996171663894624, + "loss": 2.58, + "step": 8777 + }, + { + "epoch": 0.7084173997256072, + "grad_norm": 0.6251266002655029, + "learning_rate": 0.00011994624722016472, + "loss": 2.5996, + "step": 8778 + }, + { + "epoch": 0.7084981034621903, + "grad_norm": 0.6649689078330994, + "learning_rate": 0.00011993077730427058, + "loss": 2.6025, + "step": 8779 + }, + { + "epoch": 0.7085788071987733, + "grad_norm": 0.7554693818092346, + "learning_rate": 0.00011991530689164939, + "loss": 2.6207, + "step": 8780 + }, + { + "epoch": 0.7086595109353563, + "grad_norm": 0.7941430807113647, + "learning_rate": 0.00011989983598268661, + "loss": 2.584, + "step": 8781 + }, + { + "epoch": 0.7087402146719393, + "grad_norm": 0.7257998585700989, + "learning_rate": 0.00011988436457776799, + "loss": 2.6152, + "step": 8782 + }, + { + "epoch": 0.7088209184085223, + "grad_norm": 0.716354489326477, + "learning_rate": 0.00011986889267727899, + "loss": 2.585, + "step": 8783 + }, + { + "epoch": 0.7089016221451053, + "grad_norm": 0.7094400525093079, + "learning_rate": 0.00011985342028160525, + "loss": 2.5759, + "step": 8784 + }, + { + "epoch": 
0.7089823258816883, + "grad_norm": 0.7211421728134155, + "learning_rate": 0.0001198379473911324, + "loss": 2.5645, + "step": 8785 + }, + { + "epoch": 0.7090630296182713, + "grad_norm": 0.7166693806648254, + "learning_rate": 0.000119822474006246, + "loss": 2.5357, + "step": 8786 + }, + { + "epoch": 0.7091437333548544, + "grad_norm": 0.6702254414558411, + "learning_rate": 0.00011980700012733175, + "loss": 2.5353, + "step": 8787 + }, + { + "epoch": 0.7092244370914373, + "grad_norm": 0.6784049868583679, + "learning_rate": 0.0001197915257547753, + "loss": 2.4942, + "step": 8788 + }, + { + "epoch": 0.7093051408280203, + "grad_norm": 0.6914299726486206, + "learning_rate": 0.00011977605088896226, + "loss": 2.5682, + "step": 8789 + }, + { + "epoch": 0.7093858445646033, + "grad_norm": 0.7324358820915222, + "learning_rate": 0.00011976057553027837, + "loss": 2.564, + "step": 8790 + }, + { + "epoch": 0.7094665483011864, + "grad_norm": 0.6927928924560547, + "learning_rate": 0.00011974509967910927, + "loss": 2.5728, + "step": 8791 + }, + { + "epoch": 0.7095472520377694, + "grad_norm": 0.6795603036880493, + "learning_rate": 0.00011972962333584066, + "loss": 2.588, + "step": 8792 + }, + { + "epoch": 0.7096279557743523, + "grad_norm": 0.7132226228713989, + "learning_rate": 0.00011971414650085828, + "loss": 2.5759, + "step": 8793 + }, + { + "epoch": 0.7097086595109353, + "grad_norm": 0.737195611000061, + "learning_rate": 0.00011969866917454782, + "loss": 2.5721, + "step": 8794 + }, + { + "epoch": 0.7097893632475184, + "grad_norm": 0.6776021718978882, + "learning_rate": 0.00011968319135729507, + "loss": 2.5794, + "step": 8795 + }, + { + "epoch": 0.7098700669841014, + "grad_norm": 0.7113735675811768, + "learning_rate": 0.0001196677130494857, + "loss": 2.5595, + "step": 8796 + }, + { + "epoch": 0.7099507707206844, + "grad_norm": 0.6277747750282288, + "learning_rate": 0.0001196522342515055, + "loss": 2.5003, + "step": 8797 + }, + { + "epoch": 0.7100314744572673, + "grad_norm": 
0.6982879042625427, + "learning_rate": 0.00011963675496374028, + "loss": 2.542, + "step": 8798 + }, + { + "epoch": 0.7101121781938504, + "grad_norm": 0.7019705176353455, + "learning_rate": 0.00011962127518657578, + "loss": 2.5723, + "step": 8799 + }, + { + "epoch": 0.7101928819304334, + "grad_norm": 0.6831088662147522, + "learning_rate": 0.00011960579492039783, + "loss": 2.5676, + "step": 8800 + }, + { + "epoch": 0.7102735856670164, + "grad_norm": 0.6744031310081482, + "learning_rate": 0.0001195903141655922, + "loss": 2.58, + "step": 8801 + }, + { + "epoch": 0.7103542894035993, + "grad_norm": 0.6873177289962769, + "learning_rate": 0.00011957483292254473, + "loss": 2.6289, + "step": 8802 + }, + { + "epoch": 0.7104349931401824, + "grad_norm": 0.6340685486793518, + "learning_rate": 0.00011955935119164125, + "loss": 2.5688, + "step": 8803 + }, + { + "epoch": 0.7105156968767654, + "grad_norm": 0.7147708535194397, + "learning_rate": 0.00011954386897326764, + "loss": 2.5471, + "step": 8804 + }, + { + "epoch": 0.7105964006133484, + "grad_norm": 0.699605405330658, + "learning_rate": 0.00011952838626780971, + "loss": 2.6122, + "step": 8805 + }, + { + "epoch": 0.7106771043499314, + "grad_norm": 0.6685385704040527, + "learning_rate": 0.00011951290307565335, + "loss": 2.5423, + "step": 8806 + }, + { + "epoch": 0.7107578080865145, + "grad_norm": 0.6884726881980896, + "learning_rate": 0.00011949741939718439, + "loss": 2.5243, + "step": 8807 + }, + { + "epoch": 0.7108385118230974, + "grad_norm": 0.6991142630577087, + "learning_rate": 0.00011948193523278884, + "loss": 2.6271, + "step": 8808 + }, + { + "epoch": 0.7109192155596804, + "grad_norm": 0.6964353919029236, + "learning_rate": 0.00011946645058285253, + "loss": 2.6296, + "step": 8809 + }, + { + "epoch": 0.7109999192962634, + "grad_norm": 0.7592040300369263, + "learning_rate": 0.00011945096544776136, + "loss": 2.6601, + "step": 8810 + }, + { + "epoch": 0.7110806230328465, + "grad_norm": 0.7146934866905212, + "learning_rate": 
0.00011943547982790131, + "loss": 2.54, + "step": 8811 + }, + { + "epoch": 0.7111613267694294, + "grad_norm": 0.6991123557090759, + "learning_rate": 0.00011941999372365827, + "loss": 2.5978, + "step": 8812 + }, + { + "epoch": 0.7112420305060124, + "grad_norm": 0.6835920810699463, + "learning_rate": 0.00011940450713541822, + "loss": 2.6096, + "step": 8813 + }, + { + "epoch": 0.7113227342425954, + "grad_norm": 0.6913917660713196, + "learning_rate": 0.00011938902006356716, + "loss": 2.5624, + "step": 8814 + }, + { + "epoch": 0.7114034379791785, + "grad_norm": 0.6620622873306274, + "learning_rate": 0.00011937353250849102, + "loss": 2.6211, + "step": 8815 + }, + { + "epoch": 0.7114841417157615, + "grad_norm": 0.6738792061805725, + "learning_rate": 0.00011935804447057581, + "loss": 2.5889, + "step": 8816 + }, + { + "epoch": 0.7115648454523444, + "grad_norm": 0.7101936936378479, + "learning_rate": 0.00011934255595020751, + "loss": 2.5846, + "step": 8817 + }, + { + "epoch": 0.7116455491889274, + "grad_norm": 0.6843911409378052, + "learning_rate": 0.00011932706694777216, + "loss": 2.5757, + "step": 8818 + }, + { + "epoch": 0.7117262529255105, + "grad_norm": 0.7217971086502075, + "learning_rate": 0.0001193115774636558, + "loss": 2.6174, + "step": 8819 + }, + { + "epoch": 0.7118069566620935, + "grad_norm": 0.6706245541572571, + "learning_rate": 0.00011929608749824445, + "loss": 2.5893, + "step": 8820 + }, + { + "epoch": 0.7118876603986765, + "grad_norm": 0.7057672739028931, + "learning_rate": 0.00011928059705192413, + "loss": 2.5426, + "step": 8821 + }, + { + "epoch": 0.7119683641352594, + "grad_norm": 0.7354697585105896, + "learning_rate": 0.00011926510612508095, + "loss": 2.5741, + "step": 8822 + }, + { + "epoch": 0.7120490678718424, + "grad_norm": 0.6618186235427856, + "learning_rate": 0.00011924961471810096, + "loss": 2.6007, + "step": 8823 + }, + { + "epoch": 0.7121297716084255, + "grad_norm": 0.6733995676040649, + "learning_rate": 0.00011923412283137028, + "loss": 
2.5739, + "step": 8824 + }, + { + "epoch": 0.7122104753450085, + "grad_norm": 0.7324833869934082, + "learning_rate": 0.00011921863046527497, + "loss": 2.5461, + "step": 8825 + }, + { + "epoch": 0.7122911790815915, + "grad_norm": 0.6753048896789551, + "learning_rate": 0.00011920313762020113, + "loss": 2.5066, + "step": 8826 + }, + { + "epoch": 0.7123718828181744, + "grad_norm": 0.7861250638961792, + "learning_rate": 0.00011918764429653489, + "loss": 2.5229, + "step": 8827 + }, + { + "epoch": 0.7124525865547575, + "grad_norm": 0.7037342190742493, + "learning_rate": 0.00011917215049466244, + "loss": 2.5443, + "step": 8828 + }, + { + "epoch": 0.7125332902913405, + "grad_norm": 0.7112773060798645, + "learning_rate": 0.00011915665621496985, + "loss": 2.5656, + "step": 8829 + }, + { + "epoch": 0.7126139940279235, + "grad_norm": 0.6384316682815552, + "learning_rate": 0.00011914116145784333, + "loss": 2.5526, + "step": 8830 + }, + { + "epoch": 0.7126946977645064, + "grad_norm": 0.6673600077629089, + "learning_rate": 0.000119125666223669, + "loss": 2.5868, + "step": 8831 + }, + { + "epoch": 0.7127754015010895, + "grad_norm": 0.6927722692489624, + "learning_rate": 0.0001191101705128331, + "loss": 2.6237, + "step": 8832 + }, + { + "epoch": 0.7128561052376725, + "grad_norm": 0.7410106658935547, + "learning_rate": 0.00011909467432572182, + "loss": 2.5652, + "step": 8833 + }, + { + "epoch": 0.7129368089742555, + "grad_norm": 0.6780139803886414, + "learning_rate": 0.0001190791776627213, + "loss": 2.5343, + "step": 8834 + }, + { + "epoch": 0.7130175127108385, + "grad_norm": 0.7147949934005737, + "learning_rate": 0.00011906368052421781, + "loss": 2.5368, + "step": 8835 + }, + { + "epoch": 0.7130982164474216, + "grad_norm": 0.7092324495315552, + "learning_rate": 0.00011904818291059759, + "loss": 2.538, + "step": 8836 + }, + { + "epoch": 0.7131789201840045, + "grad_norm": 0.761763870716095, + "learning_rate": 0.00011903268482224684, + "loss": 2.5984, + "step": 8837 + }, + { + "epoch": 
0.7132596239205875, + "grad_norm": 0.7011365294456482, + "learning_rate": 0.00011901718625955182, + "loss": 2.5383, + "step": 8838 + }, + { + "epoch": 0.7133403276571705, + "grad_norm": 0.7982703447341919, + "learning_rate": 0.00011900168722289882, + "loss": 2.5714, + "step": 8839 + }, + { + "epoch": 0.7134210313937536, + "grad_norm": 0.6788253784179688, + "learning_rate": 0.00011898618771267412, + "loss": 2.5675, + "step": 8840 + }, + { + "epoch": 0.7135017351303365, + "grad_norm": 0.6245018243789673, + "learning_rate": 0.00011897068772926397, + "loss": 2.5497, + "step": 8841 + }, + { + "epoch": 0.7135824388669195, + "grad_norm": 0.732109785079956, + "learning_rate": 0.0001189551872730547, + "loss": 2.5043, + "step": 8842 + }, + { + "epoch": 0.7136631426035025, + "grad_norm": 0.7640885710716248, + "learning_rate": 0.0001189396863444326, + "loss": 2.5974, + "step": 8843 + }, + { + "epoch": 0.7137438463400856, + "grad_norm": 0.6806808710098267, + "learning_rate": 0.00011892418494378403, + "loss": 2.5911, + "step": 8844 + }, + { + "epoch": 0.7138245500766686, + "grad_norm": 0.6730000376701355, + "learning_rate": 0.00011890868307149528, + "loss": 2.5405, + "step": 8845 + }, + { + "epoch": 0.7139052538132515, + "grad_norm": 0.6881929636001587, + "learning_rate": 0.00011889318072795275, + "loss": 2.6083, + "step": 8846 + }, + { + "epoch": 0.7139859575498345, + "grad_norm": 0.7079598307609558, + "learning_rate": 0.00011887767791354275, + "loss": 2.5743, + "step": 8847 + }, + { + "epoch": 0.7140666612864176, + "grad_norm": 0.6760475635528564, + "learning_rate": 0.00011886217462865166, + "loss": 2.5925, + "step": 8848 + }, + { + "epoch": 0.7141473650230006, + "grad_norm": 0.6851043701171875, + "learning_rate": 0.00011884667087366587, + "loss": 2.5839, + "step": 8849 + }, + { + "epoch": 0.7142280687595836, + "grad_norm": 0.6805267930030823, + "learning_rate": 0.00011883116664897178, + "loss": 2.562, + "step": 8850 + }, + { + "epoch": 0.7143087724961665, + "grad_norm": 
0.6720704436302185, + "learning_rate": 0.00011881566195495581, + "loss": 2.5381, + "step": 8851 + }, + { + "epoch": 0.7143894762327496, + "grad_norm": 0.718166172504425, + "learning_rate": 0.00011880015679200436, + "loss": 2.5912, + "step": 8852 + }, + { + "epoch": 0.7144701799693326, + "grad_norm": 0.6643497943878174, + "learning_rate": 0.00011878465116050383, + "loss": 2.5122, + "step": 8853 + }, + { + "epoch": 0.7145508837059156, + "grad_norm": 0.705186665058136, + "learning_rate": 0.00011876914506084074, + "loss": 2.617, + "step": 8854 + }, + { + "epoch": 0.7146315874424986, + "grad_norm": 0.6417848467826843, + "learning_rate": 0.00011875363849340144, + "loss": 2.5552, + "step": 8855 + }, + { + "epoch": 0.7147122911790816, + "grad_norm": 0.6861358880996704, + "learning_rate": 0.00011873813145857249, + "loss": 2.6324, + "step": 8856 + }, + { + "epoch": 0.7147929949156646, + "grad_norm": 0.7134111523628235, + "learning_rate": 0.00011872262395674027, + "loss": 2.5892, + "step": 8857 + }, + { + "epoch": 0.7148736986522476, + "grad_norm": 0.7177506685256958, + "learning_rate": 0.00011870711598829135, + "loss": 2.5677, + "step": 8858 + }, + { + "epoch": 0.7149544023888306, + "grad_norm": 0.6435763835906982, + "learning_rate": 0.00011869160755361219, + "loss": 2.5452, + "step": 8859 + }, + { + "epoch": 0.7150351061254137, + "grad_norm": 0.6443132758140564, + "learning_rate": 0.00011867609865308935, + "loss": 2.5566, + "step": 8860 + }, + { + "epoch": 0.7151158098619966, + "grad_norm": 0.7132347822189331, + "learning_rate": 0.00011866058928710925, + "loss": 2.565, + "step": 8861 + }, + { + "epoch": 0.7151965135985796, + "grad_norm": 0.7803207039833069, + "learning_rate": 0.00011864507945605854, + "loss": 2.556, + "step": 8862 + }, + { + "epoch": 0.7152772173351626, + "grad_norm": 0.7277950644493103, + "learning_rate": 0.00011862956916032367, + "loss": 2.5623, + "step": 8863 + }, + { + "epoch": 0.7153579210717457, + "grad_norm": 0.6812277436256409, + "learning_rate": 
0.00011861405840029125, + "loss": 2.6146, + "step": 8864 + }, + { + "epoch": 0.7154386248083286, + "grad_norm": 0.7170509099960327, + "learning_rate": 0.00011859854717634786, + "loss": 2.52, + "step": 8865 + }, + { + "epoch": 0.7155193285449116, + "grad_norm": 0.7282906174659729, + "learning_rate": 0.00011858303548888004, + "loss": 2.5605, + "step": 8866 + }, + { + "epoch": 0.7156000322814946, + "grad_norm": 0.7290246486663818, + "learning_rate": 0.00011856752333827439, + "loss": 2.6292, + "step": 8867 + }, + { + "epoch": 0.7156807360180777, + "grad_norm": 0.6870024800300598, + "learning_rate": 0.00011855201072491752, + "loss": 2.6396, + "step": 8868 + }, + { + "epoch": 0.7157614397546607, + "grad_norm": 0.7336156964302063, + "learning_rate": 0.00011853649764919605, + "loss": 2.6356, + "step": 8869 + }, + { + "epoch": 0.7158421434912436, + "grad_norm": 0.7181294560432434, + "learning_rate": 0.00011852098411149661, + "loss": 2.5163, + "step": 8870 + }, + { + "epoch": 0.7159228472278266, + "grad_norm": 0.7355513572692871, + "learning_rate": 0.00011850547011220583, + "loss": 2.5485, + "step": 8871 + }, + { + "epoch": 0.7160035509644097, + "grad_norm": 0.7005351185798645, + "learning_rate": 0.00011848995565171038, + "loss": 2.5187, + "step": 8872 + }, + { + "epoch": 0.7160842547009927, + "grad_norm": 0.6550194025039673, + "learning_rate": 0.00011847444073039686, + "loss": 2.5174, + "step": 8873 + }, + { + "epoch": 0.7161649584375757, + "grad_norm": 0.6568251252174377, + "learning_rate": 0.00011845892534865202, + "loss": 2.5128, + "step": 8874 + }, + { + "epoch": 0.7162456621741586, + "grad_norm": 0.6359419226646423, + "learning_rate": 0.0001184434095068625, + "loss": 2.5967, + "step": 8875 + }, + { + "epoch": 0.7163263659107416, + "grad_norm": 0.6730023622512817, + "learning_rate": 0.00011842789320541504, + "loss": 2.5243, + "step": 8876 + }, + { + "epoch": 0.7164070696473247, + "grad_norm": 0.6750187277793884, + "learning_rate": 0.00011841237644469625, + "loss": 
2.602, + "step": 8877 + }, + { + "epoch": 0.7164877733839077, + "grad_norm": 0.7039143443107605, + "learning_rate": 0.00011839685922509291, + "loss": 2.5345, + "step": 8878 + }, + { + "epoch": 0.7165684771204907, + "grad_norm": 0.6602306962013245, + "learning_rate": 0.00011838134154699177, + "loss": 2.5995, + "step": 8879 + }, + { + "epoch": 0.7166491808570736, + "grad_norm": 0.6744598150253296, + "learning_rate": 0.00011836582341077955, + "loss": 2.6005, + "step": 8880 + }, + { + "epoch": 0.7167298845936567, + "grad_norm": 0.7136051058769226, + "learning_rate": 0.00011835030481684302, + "loss": 2.5424, + "step": 8881 + }, + { + "epoch": 0.7168105883302397, + "grad_norm": 0.7085986137390137, + "learning_rate": 0.00011833478576556889, + "loss": 2.5912, + "step": 8882 + }, + { + "epoch": 0.7168912920668227, + "grad_norm": 0.7635689377784729, + "learning_rate": 0.00011831926625734398, + "loss": 2.5836, + "step": 8883 + }, + { + "epoch": 0.7169719958034056, + "grad_norm": 0.6543256640434265, + "learning_rate": 0.00011830374629255508, + "loss": 2.5442, + "step": 8884 + }, + { + "epoch": 0.7170526995399887, + "grad_norm": 0.663840115070343, + "learning_rate": 0.00011828822587158896, + "loss": 2.5529, + "step": 8885 + }, + { + "epoch": 0.7171334032765717, + "grad_norm": 0.6868027448654175, + "learning_rate": 0.00011827270499483247, + "loss": 2.6678, + "step": 8886 + }, + { + "epoch": 0.7172141070131547, + "grad_norm": 0.649172842502594, + "learning_rate": 0.00011825718366267238, + "loss": 2.57, + "step": 8887 + }, + { + "epoch": 0.7172948107497377, + "grad_norm": 0.6818440556526184, + "learning_rate": 0.00011824166187549554, + "loss": 2.5602, + "step": 8888 + }, + { + "epoch": 0.7173755144863208, + "grad_norm": 0.7222314476966858, + "learning_rate": 0.00011822613963368885, + "loss": 2.5526, + "step": 8889 + }, + { + "epoch": 0.7174562182229037, + "grad_norm": 0.7309598922729492, + "learning_rate": 0.00011821061693763909, + "loss": 2.5515, + "step": 8890 + }, + { + 
"epoch": 0.7175369219594867, + "grad_norm": 0.6935746669769287, + "learning_rate": 0.00011819509378773314, + "loss": 2.5506, + "step": 8891 + }, + { + "epoch": 0.7176176256960697, + "grad_norm": 0.6754423975944519, + "learning_rate": 0.00011817957018435792, + "loss": 2.5621, + "step": 8892 + }, + { + "epoch": 0.7176983294326528, + "grad_norm": 0.7087355852127075, + "learning_rate": 0.00011816404612790026, + "loss": 2.5708, + "step": 8893 + }, + { + "epoch": 0.7177790331692357, + "grad_norm": 0.726820707321167, + "learning_rate": 0.0001181485216187471, + "loss": 2.5741, + "step": 8894 + }, + { + "epoch": 0.7178597369058187, + "grad_norm": 0.6539922952651978, + "learning_rate": 0.00011813299665728532, + "loss": 2.613, + "step": 8895 + }, + { + "epoch": 0.7179404406424017, + "grad_norm": 0.7008066773414612, + "learning_rate": 0.00011811747124390189, + "loss": 2.6029, + "step": 8896 + }, + { + "epoch": 0.7180211443789848, + "grad_norm": 0.6900522708892822, + "learning_rate": 0.00011810194537898374, + "loss": 2.5716, + "step": 8897 + }, + { + "epoch": 0.7181018481155678, + "grad_norm": 0.675345242023468, + "learning_rate": 0.00011808641906291776, + "loss": 2.5742, + "step": 8898 + }, + { + "epoch": 0.7181825518521507, + "grad_norm": 0.6697559356689453, + "learning_rate": 0.00011807089229609092, + "loss": 2.5717, + "step": 8899 + }, + { + "epoch": 0.7182632555887337, + "grad_norm": 0.6874344944953918, + "learning_rate": 0.00011805536507889021, + "loss": 2.5394, + "step": 8900 + }, + { + "epoch": 0.7183439593253168, + "grad_norm": 0.6675494313240051, + "learning_rate": 0.00011803983741170263, + "loss": 2.5655, + "step": 8901 + }, + { + "epoch": 0.7184246630618998, + "grad_norm": 0.6937244534492493, + "learning_rate": 0.00011802430929491517, + "loss": 2.5676, + "step": 8902 + }, + { + "epoch": 0.7185053667984828, + "grad_norm": 0.7591496109962463, + "learning_rate": 0.00011800878072891474, + "loss": 2.5849, + "step": 8903 + }, + { + "epoch": 0.7185860705350657, + 
"grad_norm": 0.6503129005432129, + "learning_rate": 0.00011799325171408846, + "loss": 2.5416, + "step": 8904 + }, + { + "epoch": 0.7186667742716488, + "grad_norm": 0.6450222134590149, + "learning_rate": 0.00011797772225082333, + "loss": 2.5395, + "step": 8905 + }, + { + "epoch": 0.7187474780082318, + "grad_norm": 0.7317619919776917, + "learning_rate": 0.00011796219233950632, + "loss": 2.609, + "step": 8906 + }, + { + "epoch": 0.7188281817448148, + "grad_norm": 0.7585787773132324, + "learning_rate": 0.00011794666198052455, + "loss": 2.5556, + "step": 8907 + }, + { + "epoch": 0.7189088854813978, + "grad_norm": 0.6718214750289917, + "learning_rate": 0.00011793113117426505, + "loss": 2.5914, + "step": 8908 + }, + { + "epoch": 0.7189895892179808, + "grad_norm": 0.6459314823150635, + "learning_rate": 0.00011791559992111487, + "loss": 2.5956, + "step": 8909 + }, + { + "epoch": 0.7190702929545638, + "grad_norm": 0.6592775583267212, + "learning_rate": 0.00011790006822146113, + "loss": 2.5568, + "step": 8910 + }, + { + "epoch": 0.7191509966911468, + "grad_norm": 0.7277452349662781, + "learning_rate": 0.0001178845360756909, + "loss": 2.5989, + "step": 8911 + }, + { + "epoch": 0.7192317004277298, + "grad_norm": 0.7020131945610046, + "learning_rate": 0.00011786900348419128, + "loss": 2.645, + "step": 8912 + }, + { + "epoch": 0.7193124041643129, + "grad_norm": 0.6746636629104614, + "learning_rate": 0.00011785347044734938, + "loss": 2.5173, + "step": 8913 + }, + { + "epoch": 0.7193931079008958, + "grad_norm": 0.6782798171043396, + "learning_rate": 0.0001178379369655523, + "loss": 2.6007, + "step": 8914 + }, + { + "epoch": 0.7194738116374788, + "grad_norm": 0.705498218536377, + "learning_rate": 0.00011782240303918724, + "loss": 2.5408, + "step": 8915 + }, + { + "epoch": 0.7195545153740618, + "grad_norm": 0.675532341003418, + "learning_rate": 0.00011780686866864128, + "loss": 2.5188, + "step": 8916 + }, + { + "epoch": 0.7196352191106449, + "grad_norm": 0.6552390456199646, + 
"learning_rate": 0.00011779133385430161, + "loss": 2.5409, + "step": 8917 + }, + { + "epoch": 0.7197159228472279, + "grad_norm": 0.6589654088020325, + "learning_rate": 0.00011777579859655544, + "loss": 2.5447, + "step": 8918 + }, + { + "epoch": 0.7197966265838108, + "grad_norm": 0.7548382878303528, + "learning_rate": 0.00011776026289578985, + "loss": 2.5239, + "step": 8919 + }, + { + "epoch": 0.7198773303203938, + "grad_norm": 0.697325587272644, + "learning_rate": 0.00011774472675239207, + "loss": 2.5887, + "step": 8920 + }, + { + "epoch": 0.7199580340569769, + "grad_norm": 0.734462320804596, + "learning_rate": 0.00011772919016674934, + "loss": 2.5847, + "step": 8921 + }, + { + "epoch": 0.7200387377935599, + "grad_norm": 0.6736955642700195, + "learning_rate": 0.00011771365313924886, + "loss": 2.558, + "step": 8922 + }, + { + "epoch": 0.7201194415301428, + "grad_norm": 0.7157856822013855, + "learning_rate": 0.00011769811567027784, + "loss": 2.6199, + "step": 8923 + }, + { + "epoch": 0.7202001452667258, + "grad_norm": 0.7045830488204956, + "learning_rate": 0.0001176825777602235, + "loss": 2.576, + "step": 8924 + }, + { + "epoch": 0.7202808490033088, + "grad_norm": 0.6875419020652771, + "learning_rate": 0.00011766703940947308, + "loss": 2.6045, + "step": 8925 + }, + { + "epoch": 0.7203615527398919, + "grad_norm": 0.7313494086265564, + "learning_rate": 0.00011765150061841387, + "loss": 2.5388, + "step": 8926 + }, + { + "epoch": 0.7204422564764749, + "grad_norm": 0.7223608493804932, + "learning_rate": 0.00011763596138743313, + "loss": 2.5466, + "step": 8927 + }, + { + "epoch": 0.7205229602130578, + "grad_norm": 0.7289614081382751, + "learning_rate": 0.00011762042171691816, + "loss": 2.5862, + "step": 8928 + }, + { + "epoch": 0.7206036639496408, + "grad_norm": 0.7098878026008606, + "learning_rate": 0.00011760488160725617, + "loss": 2.5497, + "step": 8929 + }, + { + "epoch": 0.7206843676862239, + "grad_norm": 0.7096838355064392, + "learning_rate": 0.00011758934105883452, 
+ "loss": 2.558, + "step": 8930 + }, + { + "epoch": 0.7207650714228069, + "grad_norm": 0.7334743738174438, + "learning_rate": 0.00011757380007204055, + "loss": 2.5966, + "step": 8931 + }, + { + "epoch": 0.7208457751593899, + "grad_norm": 0.7192476391792297, + "learning_rate": 0.00011755825864726149, + "loss": 2.5307, + "step": 8932 + }, + { + "epoch": 0.7209264788959728, + "grad_norm": 0.7329632043838501, + "learning_rate": 0.00011754271678488478, + "loss": 2.6453, + "step": 8933 + }, + { + "epoch": 0.7210071826325559, + "grad_norm": 0.6827974915504456, + "learning_rate": 0.00011752717448529766, + "loss": 2.5507, + "step": 8934 + }, + { + "epoch": 0.7210878863691389, + "grad_norm": 0.8292449116706848, + "learning_rate": 0.00011751163174888756, + "loss": 2.6178, + "step": 8935 + }, + { + "epoch": 0.7211685901057219, + "grad_norm": 0.6504058837890625, + "learning_rate": 0.00011749608857604183, + "loss": 2.574, + "step": 8936 + }, + { + "epoch": 0.7212492938423049, + "grad_norm": 0.6567742824554443, + "learning_rate": 0.00011748054496714785, + "loss": 2.45, + "step": 8937 + }, + { + "epoch": 0.7213299975788879, + "grad_norm": 0.6699101328849792, + "learning_rate": 0.00011746500092259296, + "loss": 2.5827, + "step": 8938 + }, + { + "epoch": 0.7214107013154709, + "grad_norm": 0.7664934992790222, + "learning_rate": 0.0001174494564427646, + "loss": 2.5246, + "step": 8939 + }, + { + "epoch": 0.7214914050520539, + "grad_norm": 0.7276309132575989, + "learning_rate": 0.00011743391152805017, + "loss": 2.6096, + "step": 8940 + }, + { + "epoch": 0.7215721087886369, + "grad_norm": 0.7248005867004395, + "learning_rate": 0.0001174183661788371, + "loss": 2.6362, + "step": 8941 + }, + { + "epoch": 0.72165281252522, + "grad_norm": 0.7773801684379578, + "learning_rate": 0.00011740282039551282, + "loss": 2.547, + "step": 8942 + }, + { + "epoch": 0.7217335162618029, + "grad_norm": 0.7346466779708862, + "learning_rate": 0.00011738727417846476, + "loss": 2.5635, + "step": 8943 + }, + { + 
"epoch": 0.7218142199983859, + "grad_norm": 0.7042707800865173, + "learning_rate": 0.0001173717275280804, + "loss": 2.5593, + "step": 8944 + }, + { + "epoch": 0.7218949237349689, + "grad_norm": 0.6894899010658264, + "learning_rate": 0.00011735618044474712, + "loss": 2.5272, + "step": 8945 + }, + { + "epoch": 0.721975627471552, + "grad_norm": 0.6643744111061096, + "learning_rate": 0.00011734063292885249, + "loss": 2.6001, + "step": 8946 + }, + { + "epoch": 0.722056331208135, + "grad_norm": 0.7543076276779175, + "learning_rate": 0.00011732508498078396, + "loss": 2.558, + "step": 8947 + }, + { + "epoch": 0.7221370349447179, + "grad_norm": 0.7065596580505371, + "learning_rate": 0.00011730953660092903, + "loss": 2.6255, + "step": 8948 + }, + { + "epoch": 0.7222177386813009, + "grad_norm": 0.6968158483505249, + "learning_rate": 0.0001172939877896752, + "loss": 2.5277, + "step": 8949 + }, + { + "epoch": 0.722298442417884, + "grad_norm": 0.6918557286262512, + "learning_rate": 0.00011727843854740996, + "loss": 2.5456, + "step": 8950 + }, + { + "epoch": 0.722379146154467, + "grad_norm": 0.7262142300605774, + "learning_rate": 0.00011726288887452088, + "loss": 2.5345, + "step": 8951 + }, + { + "epoch": 0.7224598498910499, + "grad_norm": 0.7423329949378967, + "learning_rate": 0.00011724733877139548, + "loss": 2.6335, + "step": 8952 + }, + { + "epoch": 0.7225405536276329, + "grad_norm": 0.7734495997428894, + "learning_rate": 0.00011723178823842136, + "loss": 2.5951, + "step": 8953 + }, + { + "epoch": 0.722621257364216, + "grad_norm": 0.6792804598808289, + "learning_rate": 0.00011721623727598597, + "loss": 2.5927, + "step": 8954 + }, + { + "epoch": 0.722701961100799, + "grad_norm": 0.7971853017807007, + "learning_rate": 0.00011720068588447697, + "loss": 2.5451, + "step": 8955 + }, + { + "epoch": 0.722782664837382, + "grad_norm": 0.7264395356178284, + "learning_rate": 0.00011718513406428189, + "loss": 2.5769, + "step": 8956 + }, + { + "epoch": 0.7228633685739649, + "grad_norm": 
0.6536725759506226, + "learning_rate": 0.0001171695818157884, + "loss": 2.6285, + "step": 8957 + }, + { + "epoch": 0.722944072310548, + "grad_norm": 0.6676235198974609, + "learning_rate": 0.000117154029139384, + "loss": 2.5896, + "step": 8958 + }, + { + "epoch": 0.723024776047131, + "grad_norm": 0.7104088664054871, + "learning_rate": 0.00011713847603545636, + "loss": 2.5606, + "step": 8959 + }, + { + "epoch": 0.723105479783714, + "grad_norm": 0.6646785140037537, + "learning_rate": 0.0001171229225043931, + "loss": 2.5617, + "step": 8960 + }, + { + "epoch": 0.723186183520297, + "grad_norm": 0.7148672342300415, + "learning_rate": 0.00011710736854658186, + "loss": 2.5855, + "step": 8961 + }, + { + "epoch": 0.72326688725688, + "grad_norm": 0.6864955425262451, + "learning_rate": 0.00011709181416241028, + "loss": 2.6098, + "step": 8962 + }, + { + "epoch": 0.723347590993463, + "grad_norm": 0.7049087285995483, + "learning_rate": 0.00011707625935226602, + "loss": 2.506, + "step": 8963 + }, + { + "epoch": 0.723428294730046, + "grad_norm": 0.6419759392738342, + "learning_rate": 0.00011706070411653672, + "loss": 2.5485, + "step": 8964 + }, + { + "epoch": 0.723508998466629, + "grad_norm": 0.6879174709320068, + "learning_rate": 0.00011704514845561007, + "loss": 2.5373, + "step": 8965 + }, + { + "epoch": 0.7235897022032121, + "grad_norm": 0.6473780274391174, + "learning_rate": 0.00011702959236987378, + "loss": 2.5479, + "step": 8966 + }, + { + "epoch": 0.723670405939795, + "grad_norm": 0.6924241185188293, + "learning_rate": 0.00011701403585971553, + "loss": 2.5679, + "step": 8967 + }, + { + "epoch": 0.723751109676378, + "grad_norm": 0.7452483773231506, + "learning_rate": 0.00011699847892552305, + "loss": 2.5043, + "step": 8968 + }, + { + "epoch": 0.723831813412961, + "grad_norm": 0.7517218589782715, + "learning_rate": 0.00011698292156768402, + "loss": 2.5554, + "step": 8969 + }, + { + "epoch": 0.7239125171495441, + "grad_norm": 0.6492432355880737, + "learning_rate": 
0.00011696736378658618, + "loss": 2.6091, + "step": 8970 + }, + { + "epoch": 0.723993220886127, + "grad_norm": 0.740093469619751, + "learning_rate": 0.0001169518055826173, + "loss": 2.5629, + "step": 8971 + }, + { + "epoch": 0.72407392462271, + "grad_norm": 0.7186923027038574, + "learning_rate": 0.00011693624695616509, + "loss": 2.5537, + "step": 8972 + }, + { + "epoch": 0.724154628359293, + "grad_norm": 0.7066059112548828, + "learning_rate": 0.00011692068790761737, + "loss": 2.5115, + "step": 8973 + }, + { + "epoch": 0.7242353320958761, + "grad_norm": 0.7031805515289307, + "learning_rate": 0.00011690512843736185, + "loss": 2.596, + "step": 8974 + }, + { + "epoch": 0.7243160358324591, + "grad_norm": 0.7308956384658813, + "learning_rate": 0.00011688956854578635, + "loss": 2.6311, + "step": 8975 + }, + { + "epoch": 0.724396739569042, + "grad_norm": 0.6926052570343018, + "learning_rate": 0.00011687400823327863, + "loss": 2.5659, + "step": 8976 + }, + { + "epoch": 0.724477443305625, + "grad_norm": 0.69638991355896, + "learning_rate": 0.00011685844750022654, + "loss": 2.4792, + "step": 8977 + }, + { + "epoch": 0.724558147042208, + "grad_norm": 0.6858355402946472, + "learning_rate": 0.00011684288634701785, + "loss": 2.5707, + "step": 8978 + }, + { + "epoch": 0.7246388507787911, + "grad_norm": 0.6673639416694641, + "learning_rate": 0.00011682732477404044, + "loss": 2.5627, + "step": 8979 + }, + { + "epoch": 0.7247195545153741, + "grad_norm": 0.7174322605133057, + "learning_rate": 0.00011681176278168206, + "loss": 2.5801, + "step": 8980 + }, + { + "epoch": 0.724800258251957, + "grad_norm": 0.6840930581092834, + "learning_rate": 0.00011679620037033064, + "loss": 2.4994, + "step": 8981 + }, + { + "epoch": 0.72488096198854, + "grad_norm": 0.7179884910583496, + "learning_rate": 0.00011678063754037399, + "loss": 2.6408, + "step": 8982 + }, + { + "epoch": 0.7249616657251231, + "grad_norm": 0.6564825773239136, + "learning_rate": 0.00011676507429219998, + "loss": 2.5412, + "step": 
8983 + }, + { + "epoch": 0.7250423694617061, + "grad_norm": 0.7020624876022339, + "learning_rate": 0.00011674951062619652, + "loss": 2.5778, + "step": 8984 + }, + { + "epoch": 0.7251230731982891, + "grad_norm": 0.8061255812644958, + "learning_rate": 0.00011673394654275145, + "loss": 2.5581, + "step": 8985 + }, + { + "epoch": 0.725203776934872, + "grad_norm": 0.7653982043266296, + "learning_rate": 0.00011671838204225267, + "loss": 2.5324, + "step": 8986 + }, + { + "epoch": 0.7252844806714551, + "grad_norm": 0.7168377041816711, + "learning_rate": 0.00011670281712508816, + "loss": 2.6357, + "step": 8987 + }, + { + "epoch": 0.7253651844080381, + "grad_norm": 0.6860470771789551, + "learning_rate": 0.00011668725179164575, + "loss": 2.5367, + "step": 8988 + }, + { + "epoch": 0.7254458881446211, + "grad_norm": 0.7175878286361694, + "learning_rate": 0.00011667168604231342, + "loss": 2.549, + "step": 8989 + }, + { + "epoch": 0.725526591881204, + "grad_norm": 0.7124783992767334, + "learning_rate": 0.00011665611987747907, + "loss": 2.5566, + "step": 8990 + }, + { + "epoch": 0.7256072956177871, + "grad_norm": 0.6575417518615723, + "learning_rate": 0.00011664055329753067, + "loss": 2.5455, + "step": 8991 + }, + { + "epoch": 0.7256879993543701, + "grad_norm": 0.6576877236366272, + "learning_rate": 0.00011662498630285623, + "loss": 2.5596, + "step": 8992 + }, + { + "epoch": 0.7257687030909531, + "grad_norm": 0.7235110402107239, + "learning_rate": 0.00011660941889384365, + "loss": 2.6199, + "step": 8993 + }, + { + "epoch": 0.7258494068275361, + "grad_norm": 0.6623982787132263, + "learning_rate": 0.00011659385107088092, + "loss": 2.5642, + "step": 8994 + }, + { + "epoch": 0.7259301105641192, + "grad_norm": 0.7113857865333557, + "learning_rate": 0.00011657828283435605, + "loss": 2.5631, + "step": 8995 + }, + { + "epoch": 0.7260108143007021, + "grad_norm": 0.7076124548912048, + "learning_rate": 0.00011656271418465702, + "loss": 2.5141, + "step": 8996 + }, + { + "epoch": 
0.7260915180372851, + "grad_norm": 0.7534562349319458, + "learning_rate": 0.00011654714512217188, + "loss": 2.5896, + "step": 8997 + }, + { + "epoch": 0.7261722217738681, + "grad_norm": 0.7393170595169067, + "learning_rate": 0.00011653157564728865, + "loss": 2.5848, + "step": 8998 + }, + { + "epoch": 0.7262529255104512, + "grad_norm": 0.6829591989517212, + "learning_rate": 0.0001165160057603953, + "loss": 2.5439, + "step": 8999 + }, + { + "epoch": 0.7263336292470342, + "grad_norm": 0.6527189016342163, + "learning_rate": 0.00011650043546187995, + "loss": 2.5655, + "step": 9000 + }, + { + "epoch": 0.7263336292470342, + "eval_loss": 2.487652063369751, + "eval_runtime": 845.9129, + "eval_samples_per_second": 3.097, + "eval_steps_per_second": 0.517, + "step": 9000 + }, + { + "epoch": 0.7264143329836171, + "grad_norm": 0.6545615196228027, + "learning_rate": 0.00011648486475213058, + "loss": 2.5366, + "step": 9001 + }, + { + "epoch": 0.7264950367202001, + "grad_norm": 0.6854971647262573, + "learning_rate": 0.00011646929363153529, + "loss": 2.5832, + "step": 9002 + }, + { + "epoch": 0.7265757404567832, + "grad_norm": 0.7745552062988281, + "learning_rate": 0.00011645372210048218, + "loss": 2.5854, + "step": 9003 + }, + { + "epoch": 0.7266564441933662, + "grad_norm": 0.7159156203269958, + "learning_rate": 0.00011643815015935928, + "loss": 2.614, + "step": 9004 + }, + { + "epoch": 0.7267371479299491, + "grad_norm": 0.700074315071106, + "learning_rate": 0.00011642257780855475, + "loss": 2.6124, + "step": 9005 + }, + { + "epoch": 0.7268178516665321, + "grad_norm": 0.7367869019508362, + "learning_rate": 0.0001164070050484566, + "loss": 2.5512, + "step": 9006 + }, + { + "epoch": 0.7268985554031152, + "grad_norm": 0.6623905897140503, + "learning_rate": 0.00011639143187945301, + "loss": 2.5724, + "step": 9007 + }, + { + "epoch": 0.7269792591396982, + "grad_norm": 0.7111610770225525, + "learning_rate": 0.0001163758583019321, + "loss": 2.547, + "step": 9008 + }, + { + "epoch": 
0.7270599628762812, + "grad_norm": 0.6860959529876709, + "learning_rate": 0.00011636028431628199, + "loss": 2.532, + "step": 9009 + }, + { + "epoch": 0.7271406666128641, + "grad_norm": 0.7606309056282043, + "learning_rate": 0.00011634470992289084, + "loss": 2.5214, + "step": 9010 + }, + { + "epoch": 0.7272213703494472, + "grad_norm": 0.6440508365631104, + "learning_rate": 0.00011632913512214677, + "loss": 2.5554, + "step": 9011 + }, + { + "epoch": 0.7273020740860302, + "grad_norm": 0.6770462393760681, + "learning_rate": 0.00011631355991443796, + "loss": 2.5877, + "step": 9012 + }, + { + "epoch": 0.7273827778226132, + "grad_norm": 0.6419155597686768, + "learning_rate": 0.00011629798430015262, + "loss": 2.5337, + "step": 9013 + }, + { + "epoch": 0.7274634815591962, + "grad_norm": 0.6782121658325195, + "learning_rate": 0.00011628240827967891, + "loss": 2.5152, + "step": 9014 + }, + { + "epoch": 0.7275441852957792, + "grad_norm": 0.6972285509109497, + "learning_rate": 0.00011626683185340501, + "loss": 2.5628, + "step": 9015 + }, + { + "epoch": 0.7276248890323622, + "grad_norm": 0.6823342442512512, + "learning_rate": 0.00011625125502171914, + "loss": 2.5977, + "step": 9016 + }, + { + "epoch": 0.7277055927689452, + "grad_norm": 0.723311722278595, + "learning_rate": 0.0001162356777850095, + "loss": 2.5772, + "step": 9017 + }, + { + "epoch": 0.7277862965055282, + "grad_norm": 0.7395427227020264, + "learning_rate": 0.00011622010014366435, + "loss": 2.6068, + "step": 9018 + }, + { + "epoch": 0.7278670002421113, + "grad_norm": 0.6970974206924438, + "learning_rate": 0.00011620452209807192, + "loss": 2.5577, + "step": 9019 + }, + { + "epoch": 0.7279477039786942, + "grad_norm": 0.6921418309211731, + "learning_rate": 0.0001161889436486204, + "loss": 2.5476, + "step": 9020 + }, + { + "epoch": 0.7280284077152772, + "grad_norm": 0.7243841886520386, + "learning_rate": 0.0001161733647956981, + "loss": 2.579, + "step": 9021 + }, + { + "epoch": 0.7281091114518602, + "grad_norm": 
0.7240262627601624, + "learning_rate": 0.0001161577855396933, + "loss": 2.5959, + "step": 9022 + }, + { + "epoch": 0.7281898151884433, + "grad_norm": 0.7215476632118225, + "learning_rate": 0.0001161422058809942, + "loss": 2.5979, + "step": 9023 + }, + { + "epoch": 0.7282705189250263, + "grad_norm": 0.7109708786010742, + "learning_rate": 0.00011612662581998917, + "loss": 2.5912, + "step": 9024 + }, + { + "epoch": 0.7283512226616092, + "grad_norm": 0.6814073920249939, + "learning_rate": 0.00011611104535706645, + "loss": 2.5742, + "step": 9025 + }, + { + "epoch": 0.7284319263981922, + "grad_norm": 0.6788144707679749, + "learning_rate": 0.0001160954644926144, + "loss": 2.5656, + "step": 9026 + }, + { + "epoch": 0.7285126301347752, + "grad_norm": 0.7312989830970764, + "learning_rate": 0.00011607988322702126, + "loss": 2.5877, + "step": 9027 + }, + { + "epoch": 0.7285933338713583, + "grad_norm": 0.6725338697433472, + "learning_rate": 0.0001160643015606754, + "loss": 2.5261, + "step": 9028 + }, + { + "epoch": 0.7286740376079412, + "grad_norm": 0.7439326047897339, + "learning_rate": 0.00011604871949396516, + "loss": 2.603, + "step": 9029 + }, + { + "epoch": 0.7287547413445242, + "grad_norm": 0.7091783285140991, + "learning_rate": 0.00011603313702727889, + "loss": 2.5227, + "step": 9030 + }, + { + "epoch": 0.7288354450811072, + "grad_norm": 0.7474398016929626, + "learning_rate": 0.00011601755416100492, + "loss": 2.616, + "step": 9031 + }, + { + "epoch": 0.7289161488176903, + "grad_norm": 0.6904098987579346, + "learning_rate": 0.00011600197089553162, + "loss": 2.556, + "step": 9032 + }, + { + "epoch": 0.7289968525542733, + "grad_norm": 0.7305783033370972, + "learning_rate": 0.00011598638723124739, + "loss": 2.5633, + "step": 9033 + }, + { + "epoch": 0.7290775562908562, + "grad_norm": 0.6626651883125305, + "learning_rate": 0.00011597080316854062, + "loss": 2.5862, + "step": 9034 + }, + { + "epoch": 0.7291582600274392, + "grad_norm": 0.683102548122406, + "learning_rate": 
0.00011595521870779968, + "loss": 2.5629, + "step": 9035 + }, + { + "epoch": 0.7292389637640223, + "grad_norm": 0.7486757636070251, + "learning_rate": 0.00011593963384941295, + "loss": 2.5831, + "step": 9036 + }, + { + "epoch": 0.7293196675006053, + "grad_norm": 0.8059591054916382, + "learning_rate": 0.00011592404859376888, + "loss": 2.6414, + "step": 9037 + }, + { + "epoch": 0.7294003712371883, + "grad_norm": 0.8371721506118774, + "learning_rate": 0.00011590846294125594, + "loss": 2.643, + "step": 9038 + }, + { + "epoch": 0.7294810749737712, + "grad_norm": 0.7216931581497192, + "learning_rate": 0.00011589287689226246, + "loss": 2.6, + "step": 9039 + }, + { + "epoch": 0.7295617787103543, + "grad_norm": 0.6940354704856873, + "learning_rate": 0.00011587729044717701, + "loss": 2.546, + "step": 9040 + }, + { + "epoch": 0.7296424824469373, + "grad_norm": 0.6888829469680786, + "learning_rate": 0.00011586170360638792, + "loss": 2.5878, + "step": 9041 + }, + { + "epoch": 0.7297231861835203, + "grad_norm": 0.6863886117935181, + "learning_rate": 0.00011584611637028373, + "loss": 2.5389, + "step": 9042 + }, + { + "epoch": 0.7298038899201033, + "grad_norm": 0.6670756936073303, + "learning_rate": 0.00011583052873925294, + "loss": 2.5465, + "step": 9043 + }, + { + "epoch": 0.7298845936566863, + "grad_norm": 0.7441220879554749, + "learning_rate": 0.00011581494071368392, + "loss": 2.5679, + "step": 9044 + }, + { + "epoch": 0.7299652973932693, + "grad_norm": 0.7135717272758484, + "learning_rate": 0.0001157993522939653, + "loss": 2.5341, + "step": 9045 + }, + { + "epoch": 0.7300460011298523, + "grad_norm": 0.6837992072105408, + "learning_rate": 0.00011578376348048547, + "loss": 2.5233, + "step": 9046 + }, + { + "epoch": 0.7301267048664353, + "grad_norm": 0.706666886806488, + "learning_rate": 0.00011576817427363302, + "loss": 2.6109, + "step": 9047 + }, + { + "epoch": 0.7302074086030184, + "grad_norm": 0.6856269240379333, + "learning_rate": 0.00011575258467379646, + "loss": 2.5651, + 
"step": 9048 + }, + { + "epoch": 0.7302881123396013, + "grad_norm": 0.6931480169296265, + "learning_rate": 0.00011573699468136427, + "loss": 2.6031, + "step": 9049 + }, + { + "epoch": 0.7303688160761843, + "grad_norm": 0.6558480858802795, + "learning_rate": 0.00011572140429672508, + "loss": 2.5661, + "step": 9050 + }, + { + "epoch": 0.7304495198127673, + "grad_norm": 0.6468425393104553, + "learning_rate": 0.00011570581352026742, + "loss": 2.5171, + "step": 9051 + }, + { + "epoch": 0.7305302235493504, + "grad_norm": 0.7204702496528625, + "learning_rate": 0.00011569022235237974, + "loss": 2.5861, + "step": 9052 + }, + { + "epoch": 0.7306109272859334, + "grad_norm": 0.7536416053771973, + "learning_rate": 0.00011567463079345078, + "loss": 2.633, + "step": 9053 + }, + { + "epoch": 0.7306916310225163, + "grad_norm": 0.6597960591316223, + "learning_rate": 0.00011565903884386904, + "loss": 2.5327, + "step": 9054 + }, + { + "epoch": 0.7307723347590993, + "grad_norm": 0.689153254032135, + "learning_rate": 0.0001156434465040231, + "loss": 2.5397, + "step": 9055 + }, + { + "epoch": 0.7308530384956824, + "grad_norm": 0.7664844393730164, + "learning_rate": 0.00011562785377430159, + "loss": 2.4852, + "step": 9056 + }, + { + "epoch": 0.7309337422322654, + "grad_norm": 0.7122881412506104, + "learning_rate": 0.0001156122606550931, + "loss": 2.5401, + "step": 9057 + }, + { + "epoch": 0.7310144459688483, + "grad_norm": 0.6937551498413086, + "learning_rate": 0.00011559666714678627, + "loss": 2.5705, + "step": 9058 + }, + { + "epoch": 0.7310951497054313, + "grad_norm": 0.6504047513008118, + "learning_rate": 0.00011558107324976974, + "loss": 2.5638, + "step": 9059 + }, + { + "epoch": 0.7311758534420144, + "grad_norm": 0.7759538888931274, + "learning_rate": 0.0001155654789644321, + "loss": 2.5864, + "step": 9060 + }, + { + "epoch": 0.7312565571785974, + "grad_norm": 0.719859778881073, + "learning_rate": 0.00011554988429116207, + "loss": 2.519, + "step": 9061 + }, + { + "epoch": 
0.7313372609151804, + "grad_norm": 0.7159178853034973, + "learning_rate": 0.00011553428923034826, + "loss": 2.5301, + "step": 9062 + }, + { + "epoch": 0.7314179646517633, + "grad_norm": 0.6584001183509827, + "learning_rate": 0.00011551869378237934, + "loss": 2.4716, + "step": 9063 + }, + { + "epoch": 0.7314986683883464, + "grad_norm": 0.6548463702201843, + "learning_rate": 0.00011550309794764405, + "loss": 2.5637, + "step": 9064 + }, + { + "epoch": 0.7315793721249294, + "grad_norm": 0.73887699842453, + "learning_rate": 0.000115487501726531, + "loss": 2.5813, + "step": 9065 + }, + { + "epoch": 0.7316600758615124, + "grad_norm": 0.7856181859970093, + "learning_rate": 0.00011547190511942893, + "loss": 2.592, + "step": 9066 + }, + { + "epoch": 0.7317407795980954, + "grad_norm": 0.7040740847587585, + "learning_rate": 0.00011545630812672654, + "loss": 2.5324, + "step": 9067 + }, + { + "epoch": 0.7318214833346784, + "grad_norm": 0.7316064238548279, + "learning_rate": 0.00011544071074881253, + "loss": 2.5487, + "step": 9068 + }, + { + "epoch": 0.7319021870712614, + "grad_norm": 0.7020413279533386, + "learning_rate": 0.00011542511298607568, + "loss": 2.5179, + "step": 9069 + }, + { + "epoch": 0.7319828908078444, + "grad_norm": 0.672605574131012, + "learning_rate": 0.00011540951483890468, + "loss": 2.5367, + "step": 9070 + }, + { + "epoch": 0.7320635945444274, + "grad_norm": 0.7668856382369995, + "learning_rate": 0.00011539391630768828, + "loss": 2.6089, + "step": 9071 + }, + { + "epoch": 0.7321442982810105, + "grad_norm": 0.6641809940338135, + "learning_rate": 0.00011537831739281524, + "loss": 2.5411, + "step": 9072 + }, + { + "epoch": 0.7322250020175934, + "grad_norm": 0.7142000198364258, + "learning_rate": 0.00011536271809467434, + "loss": 2.5469, + "step": 9073 + }, + { + "epoch": 0.7323057057541764, + "grad_norm": 0.7266140580177307, + "learning_rate": 0.00011534711841365435, + "loss": 2.5565, + "step": 9074 + }, + { + "epoch": 0.7323864094907594, + "grad_norm": 
0.6763899326324463, + "learning_rate": 0.00011533151835014407, + "loss": 2.551, + "step": 9075 + }, + { + "epoch": 0.7324671132273425, + "grad_norm": 0.6517418026924133, + "learning_rate": 0.00011531591790453224, + "loss": 2.5415, + "step": 9076 + }, + { + "epoch": 0.7325478169639255, + "grad_norm": 0.6602214574813843, + "learning_rate": 0.00011530031707720772, + "loss": 2.593, + "step": 9077 + }, + { + "epoch": 0.7326285207005084, + "grad_norm": 0.7448844313621521, + "learning_rate": 0.00011528471586855931, + "loss": 2.5598, + "step": 9078 + }, + { + "epoch": 0.7327092244370914, + "grad_norm": 0.7197073698043823, + "learning_rate": 0.00011526911427897579, + "loss": 2.5128, + "step": 9079 + }, + { + "epoch": 0.7327899281736744, + "grad_norm": 0.7245968580245972, + "learning_rate": 0.00011525351230884606, + "loss": 2.5016, + "step": 9080 + }, + { + "epoch": 0.7328706319102575, + "grad_norm": 0.6715837717056274, + "learning_rate": 0.00011523790995855892, + "loss": 2.5469, + "step": 9081 + }, + { + "epoch": 0.7329513356468405, + "grad_norm": 0.7143638730049133, + "learning_rate": 0.00011522230722850325, + "loss": 2.5164, + "step": 9082 + }, + { + "epoch": 0.7330320393834234, + "grad_norm": 0.6809647083282471, + "learning_rate": 0.00011520670411906787, + "loss": 2.6071, + "step": 9083 + }, + { + "epoch": 0.7331127431200064, + "grad_norm": 0.7160956859588623, + "learning_rate": 0.00011519110063064167, + "loss": 2.5346, + "step": 9084 + }, + { + "epoch": 0.7331934468565895, + "grad_norm": 0.6814724802970886, + "learning_rate": 0.00011517549676361357, + "loss": 2.5499, + "step": 9085 + }, + { + "epoch": 0.7332741505931725, + "grad_norm": 0.6914821267127991, + "learning_rate": 0.00011515989251837239, + "loss": 2.5386, + "step": 9086 + }, + { + "epoch": 0.7333548543297554, + "grad_norm": 0.7292554378509521, + "learning_rate": 0.00011514428789530705, + "loss": 2.5642, + "step": 9087 + }, + { + "epoch": 0.7334355580663384, + "grad_norm": 0.6894826292991638, + "learning_rate": 
0.00011512868289480647, + "loss": 2.6131, + "step": 9088 + }, + { + "epoch": 0.7335162618029215, + "grad_norm": 0.658770740032196, + "learning_rate": 0.00011511307751725957, + "loss": 2.5594, + "step": 9089 + }, + { + "epoch": 0.7335969655395045, + "grad_norm": 0.7508681416511536, + "learning_rate": 0.0001150974717630553, + "loss": 2.595, + "step": 9090 + }, + { + "epoch": 0.7336776692760875, + "grad_norm": 0.69661545753479, + "learning_rate": 0.00011508186563258256, + "loss": 2.5803, + "step": 9091 + }, + { + "epoch": 0.7337583730126704, + "grad_norm": 0.7277412414550781, + "learning_rate": 0.00011506625912623028, + "loss": 2.5456, + "step": 9092 + }, + { + "epoch": 0.7338390767492535, + "grad_norm": 0.658329963684082, + "learning_rate": 0.00011505065224438745, + "loss": 2.5177, + "step": 9093 + }, + { + "epoch": 0.7339197804858365, + "grad_norm": 0.7277211546897888, + "learning_rate": 0.00011503504498744302, + "loss": 2.553, + "step": 9094 + }, + { + "epoch": 0.7340004842224195, + "grad_norm": 0.7240201830863953, + "learning_rate": 0.00011501943735578598, + "loss": 2.5851, + "step": 9095 + }, + { + "epoch": 0.7340811879590025, + "grad_norm": 0.6565662026405334, + "learning_rate": 0.00011500382934980529, + "loss": 2.5865, + "step": 9096 + }, + { + "epoch": 0.7341618916955855, + "grad_norm": 0.658268392086029, + "learning_rate": 0.00011498822096988995, + "loss": 2.5402, + "step": 9097 + }, + { + "epoch": 0.7342425954321685, + "grad_norm": 0.7305087447166443, + "learning_rate": 0.00011497261221642894, + "loss": 2.5483, + "step": 9098 + }, + { + "epoch": 0.7343232991687515, + "grad_norm": 0.7271504402160645, + "learning_rate": 0.00011495700308981134, + "loss": 2.5303, + "step": 9099 + }, + { + "epoch": 0.7344040029053345, + "grad_norm": 0.70429527759552, + "learning_rate": 0.0001149413935904261, + "loss": 2.5878, + "step": 9100 + }, + { + "epoch": 0.7344847066419176, + "grad_norm": 0.7168769836425781, + "learning_rate": 0.00011492578371866229, + "loss": 2.6017, + 
"step": 9101 + }, + { + "epoch": 0.7345654103785005, + "grad_norm": 0.7131996154785156, + "learning_rate": 0.00011491017347490891, + "loss": 2.5439, + "step": 9102 + }, + { + "epoch": 0.7346461141150835, + "grad_norm": 0.660321056842804, + "learning_rate": 0.00011489456285955504, + "loss": 2.5236, + "step": 9103 + }, + { + "epoch": 0.7347268178516665, + "grad_norm": 0.6742995977401733, + "learning_rate": 0.00011487895187298977, + "loss": 2.5375, + "step": 9104 + }, + { + "epoch": 0.7348075215882496, + "grad_norm": 0.6380610466003418, + "learning_rate": 0.00011486334051560206, + "loss": 2.5173, + "step": 9105 + }, + { + "epoch": 0.7348882253248326, + "grad_norm": 0.6948198080062866, + "learning_rate": 0.0001148477287877811, + "loss": 2.5247, + "step": 9106 + }, + { + "epoch": 0.7349689290614155, + "grad_norm": 0.7088696360588074, + "learning_rate": 0.00011483211668991591, + "loss": 2.587, + "step": 9107 + }, + { + "epoch": 0.7350496327979985, + "grad_norm": 0.6278921961784363, + "learning_rate": 0.00011481650422239556, + "loss": 2.5652, + "step": 9108 + }, + { + "epoch": 0.7351303365345816, + "grad_norm": 0.6901956796646118, + "learning_rate": 0.00011480089138560926, + "loss": 2.5964, + "step": 9109 + }, + { + "epoch": 0.7352110402711646, + "grad_norm": 0.7264819145202637, + "learning_rate": 0.00011478527817994604, + "loss": 2.5437, + "step": 9110 + }, + { + "epoch": 0.7352917440077475, + "grad_norm": 0.6940708756446838, + "learning_rate": 0.00011476966460579501, + "loss": 2.5761, + "step": 9111 + }, + { + "epoch": 0.7353724477443305, + "grad_norm": 0.689588189125061, + "learning_rate": 0.00011475405066354536, + "loss": 2.5457, + "step": 9112 + }, + { + "epoch": 0.7354531514809136, + "grad_norm": 0.6938436031341553, + "learning_rate": 0.00011473843635358618, + "loss": 2.6026, + "step": 9113 + }, + { + "epoch": 0.7355338552174966, + "grad_norm": 0.7122177481651306, + "learning_rate": 0.00011472282167630663, + "loss": 2.5701, + "step": 9114 + }, + { + "epoch": 
0.7356145589540796, + "grad_norm": 0.6667213439941406, + "learning_rate": 0.00011470720663209591, + "loss": 2.5944, + "step": 9115 + }, + { + "epoch": 0.7356952626906625, + "grad_norm": 0.705910861492157, + "learning_rate": 0.00011469159122134314, + "loss": 2.6183, + "step": 9116 + }, + { + "epoch": 0.7357759664272456, + "grad_norm": 0.709937572479248, + "learning_rate": 0.00011467597544443751, + "loss": 2.5153, + "step": 9117 + }, + { + "epoch": 0.7358566701638286, + "grad_norm": 0.6870958805084229, + "learning_rate": 0.00011466035930176822, + "loss": 2.5334, + "step": 9118 + }, + { + "epoch": 0.7359373739004116, + "grad_norm": 0.7274392247200012, + "learning_rate": 0.00011464474279372443, + "loss": 2.5336, + "step": 9119 + }, + { + "epoch": 0.7360180776369946, + "grad_norm": 0.6360952258110046, + "learning_rate": 0.0001146291259206954, + "loss": 2.5604, + "step": 9120 + }, + { + "epoch": 0.7360987813735776, + "grad_norm": 0.7990559935569763, + "learning_rate": 0.00011461350868307028, + "loss": 2.624, + "step": 9121 + }, + { + "epoch": 0.7361794851101606, + "grad_norm": 0.6670079827308655, + "learning_rate": 0.00011459789108123835, + "loss": 2.5761, + "step": 9122 + }, + { + "epoch": 0.7362601888467436, + "grad_norm": 0.6994437575340271, + "learning_rate": 0.00011458227311558877, + "loss": 2.5679, + "step": 9123 + }, + { + "epoch": 0.7363408925833266, + "grad_norm": 0.7428358197212219, + "learning_rate": 0.00011456665478651087, + "loss": 2.5874, + "step": 9124 + }, + { + "epoch": 0.7364215963199097, + "grad_norm": 0.7079486846923828, + "learning_rate": 0.00011455103609439387, + "loss": 2.5999, + "step": 9125 + }, + { + "epoch": 0.7365023000564926, + "grad_norm": 0.646244466304779, + "learning_rate": 0.00011453541703962695, + "loss": 2.5053, + "step": 9126 + }, + { + "epoch": 0.7365830037930756, + "grad_norm": 0.6671318411827087, + "learning_rate": 0.0001145197976225995, + "loss": 2.5277, + "step": 9127 + }, + { + "epoch": 0.7366637075296586, + "grad_norm": 
0.7060399055480957, + "learning_rate": 0.00011450417784370072, + "loss": 2.6092, + "step": 9128 + }, + { + "epoch": 0.7367444112662416, + "grad_norm": 0.741547703742981, + "learning_rate": 0.00011448855770331989, + "loss": 2.6121, + "step": 9129 + }, + { + "epoch": 0.7368251150028247, + "grad_norm": 0.710267961025238, + "learning_rate": 0.00011447293720184636, + "loss": 2.5141, + "step": 9130 + }, + { + "epoch": 0.7369058187394076, + "grad_norm": 0.6914308071136475, + "learning_rate": 0.0001144573163396694, + "loss": 2.5489, + "step": 9131 + }, + { + "epoch": 0.7369865224759906, + "grad_norm": 0.7051414847373962, + "learning_rate": 0.0001144416951171783, + "loss": 2.5925, + "step": 9132 + }, + { + "epoch": 0.7370672262125736, + "grad_norm": 0.6765387058258057, + "learning_rate": 0.00011442607353476245, + "loss": 2.5864, + "step": 9133 + }, + { + "epoch": 0.7371479299491567, + "grad_norm": 0.706672191619873, + "learning_rate": 0.00011441045159281108, + "loss": 2.4823, + "step": 9134 + }, + { + "epoch": 0.7372286336857397, + "grad_norm": 0.7534066438674927, + "learning_rate": 0.00011439482929171362, + "loss": 2.5728, + "step": 9135 + }, + { + "epoch": 0.7373093374223226, + "grad_norm": 0.6628777384757996, + "learning_rate": 0.00011437920663185939, + "loss": 2.5538, + "step": 9136 + }, + { + "epoch": 0.7373900411589056, + "grad_norm": 0.6575733423233032, + "learning_rate": 0.00011436358361363773, + "loss": 2.4802, + "step": 9137 + }, + { + "epoch": 0.7374707448954887, + "grad_norm": 0.7629329562187195, + "learning_rate": 0.00011434796023743803, + "loss": 2.6169, + "step": 9138 + }, + { + "epoch": 0.7375514486320717, + "grad_norm": 0.7148225903511047, + "learning_rate": 0.00011433233650364965, + "loss": 2.6335, + "step": 9139 + }, + { + "epoch": 0.7376321523686546, + "grad_norm": 0.705210268497467, + "learning_rate": 0.00011431671241266198, + "loss": 2.6261, + "step": 9140 + }, + { + "epoch": 0.7377128561052376, + "grad_norm": 0.7137441635131836, + "learning_rate": 
0.00011430108796486441, + "loss": 2.5021, + "step": 9141 + }, + { + "epoch": 0.7377935598418207, + "grad_norm": 0.6979854702949524, + "learning_rate": 0.00011428546316064635, + "loss": 2.5436, + "step": 9142 + }, + { + "epoch": 0.7378742635784037, + "grad_norm": 0.6568784713745117, + "learning_rate": 0.00011426983800039721, + "loss": 2.5882, + "step": 9143 + }, + { + "epoch": 0.7379549673149867, + "grad_norm": 0.666606605052948, + "learning_rate": 0.00011425421248450638, + "loss": 2.5472, + "step": 9144 + }, + { + "epoch": 0.7380356710515696, + "grad_norm": 0.7240840792655945, + "learning_rate": 0.00011423858661336333, + "loss": 2.6057, + "step": 9145 + }, + { + "epoch": 0.7381163747881527, + "grad_norm": 0.7342149615287781, + "learning_rate": 0.0001142229603873575, + "loss": 2.508, + "step": 9146 + }, + { + "epoch": 0.7381970785247357, + "grad_norm": 0.7089941501617432, + "learning_rate": 0.0001142073338068783, + "loss": 2.6115, + "step": 9147 + }, + { + "epoch": 0.7382777822613187, + "grad_norm": 0.6883555054664612, + "learning_rate": 0.00011419170687231519, + "loss": 2.5254, + "step": 9148 + }, + { + "epoch": 0.7383584859979017, + "grad_norm": 0.6819528937339783, + "learning_rate": 0.00011417607958405765, + "loss": 2.5498, + "step": 9149 + }, + { + "epoch": 0.7384391897344847, + "grad_norm": 0.7348979711532593, + "learning_rate": 0.00011416045194249516, + "loss": 2.5547, + "step": 9150 + }, + { + "epoch": 0.7385198934710677, + "grad_norm": 0.6733320355415344, + "learning_rate": 0.00011414482394801719, + "loss": 2.5985, + "step": 9151 + }, + { + "epoch": 0.7386005972076507, + "grad_norm": 0.714771032333374, + "learning_rate": 0.00011412919560101327, + "loss": 2.571, + "step": 9152 + }, + { + "epoch": 0.7386813009442337, + "grad_norm": 0.7010024189949036, + "learning_rate": 0.0001141135669018728, + "loss": 2.5755, + "step": 9153 + }, + { + "epoch": 0.7387620046808168, + "grad_norm": 0.7014826536178589, + "learning_rate": 0.00011409793785098536, + "loss": 2.6033, + 
"step": 9154 + }, + { + "epoch": 0.7388427084173997, + "grad_norm": 0.7286051511764526, + "learning_rate": 0.0001140823084487405, + "loss": 2.515, + "step": 9155 + }, + { + "epoch": 0.7389234121539827, + "grad_norm": 0.669365406036377, + "learning_rate": 0.00011406667869552768, + "loss": 2.506, + "step": 9156 + }, + { + "epoch": 0.7390041158905657, + "grad_norm": 0.6886852979660034, + "learning_rate": 0.00011405104859173645, + "loss": 2.6123, + "step": 9157 + }, + { + "epoch": 0.7390848196271488, + "grad_norm": 0.6344162225723267, + "learning_rate": 0.00011403541813775635, + "loss": 2.5483, + "step": 9158 + }, + { + "epoch": 0.7391655233637318, + "grad_norm": 0.7043579816818237, + "learning_rate": 0.00011401978733397694, + "loss": 2.5545, + "step": 9159 + }, + { + "epoch": 0.7392462271003147, + "grad_norm": 0.7960262298583984, + "learning_rate": 0.00011400415618078781, + "loss": 2.5666, + "step": 9160 + }, + { + "epoch": 0.7393269308368977, + "grad_norm": 0.6771546006202698, + "learning_rate": 0.00011398852467857848, + "loss": 2.6016, + "step": 9161 + }, + { + "epoch": 0.7394076345734808, + "grad_norm": 0.6522069573402405, + "learning_rate": 0.00011397289282773855, + "loss": 2.5493, + "step": 9162 + }, + { + "epoch": 0.7394883383100638, + "grad_norm": 0.6804657578468323, + "learning_rate": 0.00011395726062865762, + "loss": 2.5856, + "step": 9163 + }, + { + "epoch": 0.7395690420466468, + "grad_norm": 0.7562841176986694, + "learning_rate": 0.00011394162808172526, + "loss": 2.557, + "step": 9164 + }, + { + "epoch": 0.7396497457832297, + "grad_norm": 0.6464113593101501, + "learning_rate": 0.00011392599518733107, + "loss": 2.5292, + "step": 9165 + }, + { + "epoch": 0.7397304495198128, + "grad_norm": 0.7469549775123596, + "learning_rate": 0.00011391036194586466, + "loss": 2.6168, + "step": 9166 + }, + { + "epoch": 0.7398111532563958, + "grad_norm": 0.7095946669578552, + "learning_rate": 0.00011389472835771572, + "loss": 2.5468, + "step": 9167 + }, + { + "epoch": 
0.7398918569929788, + "grad_norm": 0.7376375794410706, + "learning_rate": 0.00011387909442327382, + "loss": 2.5576, + "step": 9168 + }, + { + "epoch": 0.7399725607295617, + "grad_norm": 0.736727774143219, + "learning_rate": 0.00011386346014292859, + "loss": 2.6034, + "step": 9169 + }, + { + "epoch": 0.7400532644661448, + "grad_norm": 0.7026904821395874, + "learning_rate": 0.00011384782551706967, + "loss": 2.5848, + "step": 9170 + }, + { + "epoch": 0.7401339682027278, + "grad_norm": 0.6894888877868652, + "learning_rate": 0.00011383219054608678, + "loss": 2.5475, + "step": 9171 + }, + { + "epoch": 0.7402146719393108, + "grad_norm": 0.6754137277603149, + "learning_rate": 0.00011381655523036954, + "loss": 2.5124, + "step": 9172 + }, + { + "epoch": 0.7402953756758938, + "grad_norm": 0.7935643196105957, + "learning_rate": 0.00011380091957030762, + "loss": 2.5898, + "step": 9173 + }, + { + "epoch": 0.7403760794124769, + "grad_norm": 0.7017118334770203, + "learning_rate": 0.0001137852835662907, + "loss": 2.6139, + "step": 9174 + }, + { + "epoch": 0.7404567831490598, + "grad_norm": 0.7246189117431641, + "learning_rate": 0.00011376964721870847, + "loss": 2.4627, + "step": 9175 + }, + { + "epoch": 0.7405374868856428, + "grad_norm": 0.6835598349571228, + "learning_rate": 0.00011375401052795064, + "loss": 2.5707, + "step": 9176 + }, + { + "epoch": 0.7406181906222258, + "grad_norm": 0.6439787745475769, + "learning_rate": 0.00011373837349440693, + "loss": 2.5161, + "step": 9177 + }, + { + "epoch": 0.7406988943588089, + "grad_norm": 0.7249091267585754, + "learning_rate": 0.00011372273611846704, + "loss": 2.5054, + "step": 9178 + }, + { + "epoch": 0.7407795980953918, + "grad_norm": 0.7653267979621887, + "learning_rate": 0.0001137070984005207, + "loss": 2.6016, + "step": 9179 + }, + { + "epoch": 0.7408603018319748, + "grad_norm": 0.7195165157318115, + "learning_rate": 0.0001136914603409576, + "loss": 2.5931, + "step": 9180 + }, + { + "epoch": 0.7409410055685578, + "grad_norm": 
0.7093746662139893, + "learning_rate": 0.00011367582194016756, + "loss": 2.5567, + "step": 9181 + }, + { + "epoch": 0.7410217093051408, + "grad_norm": 0.6868107318878174, + "learning_rate": 0.00011366018319854026, + "loss": 2.5769, + "step": 9182 + }, + { + "epoch": 0.7411024130417239, + "grad_norm": 0.6870261430740356, + "learning_rate": 0.00011364454411646552, + "loss": 2.5418, + "step": 9183 + }, + { + "epoch": 0.7411831167783068, + "grad_norm": 0.7034662365913391, + "learning_rate": 0.00011362890469433306, + "loss": 2.5798, + "step": 9184 + }, + { + "epoch": 0.7412638205148898, + "grad_norm": 0.7200794816017151, + "learning_rate": 0.00011361326493253264, + "loss": 2.5523, + "step": 9185 + }, + { + "epoch": 0.7413445242514728, + "grad_norm": 0.7034540772438049, + "learning_rate": 0.0001135976248314541, + "loss": 2.5107, + "step": 9186 + }, + { + "epoch": 0.7414252279880559, + "grad_norm": 0.7155053019523621, + "learning_rate": 0.00011358198439148721, + "loss": 2.5804, + "step": 9187 + }, + { + "epoch": 0.7415059317246389, + "grad_norm": 0.6965398788452148, + "learning_rate": 0.00011356634361302175, + "loss": 2.5532, + "step": 9188 + }, + { + "epoch": 0.7415866354612218, + "grad_norm": 0.65416419506073, + "learning_rate": 0.00011355070249644755, + "loss": 2.5411, + "step": 9189 + }, + { + "epoch": 0.7416673391978048, + "grad_norm": 0.6798486709594727, + "learning_rate": 0.0001135350610421544, + "loss": 2.4957, + "step": 9190 + }, + { + "epoch": 0.7417480429343879, + "grad_norm": 0.6839874386787415, + "learning_rate": 0.00011351941925053218, + "loss": 2.5745, + "step": 9191 + }, + { + "epoch": 0.7418287466709709, + "grad_norm": 0.7374398708343506, + "learning_rate": 0.00011350377712197068, + "loss": 2.4923, + "step": 9192 + }, + { + "epoch": 0.7419094504075538, + "grad_norm": 0.7517396807670593, + "learning_rate": 0.00011348813465685974, + "loss": 2.538, + "step": 9193 + }, + { + "epoch": 0.7419901541441368, + "grad_norm": 0.6670863628387451, + "learning_rate": 
0.00011347249185558926, + "loss": 2.5442, + "step": 9194 + }, + { + "epoch": 0.7420708578807199, + "grad_norm": 0.6508080363273621, + "learning_rate": 0.00011345684871854905, + "loss": 2.6665, + "step": 9195 + }, + { + "epoch": 0.7421515616173029, + "grad_norm": 0.6935258507728577, + "learning_rate": 0.00011344120524612898, + "loss": 2.5388, + "step": 9196 + }, + { + "epoch": 0.7422322653538859, + "grad_norm": 0.696067750453949, + "learning_rate": 0.00011342556143871897, + "loss": 2.574, + "step": 9197 + }, + { + "epoch": 0.7423129690904688, + "grad_norm": 0.7486966252326965, + "learning_rate": 0.00011340991729670882, + "loss": 2.5924, + "step": 9198 + }, + { + "epoch": 0.7423936728270519, + "grad_norm": 0.676407516002655, + "learning_rate": 0.00011339427282048854, + "loss": 2.5907, + "step": 9199 + }, + { + "epoch": 0.7424743765636349, + "grad_norm": 0.7241318225860596, + "learning_rate": 0.00011337862801044792, + "loss": 2.5685, + "step": 9200 + }, + { + "epoch": 0.7425550803002179, + "grad_norm": 0.7012883424758911, + "learning_rate": 0.00011336298286697692, + "loss": 2.56, + "step": 9201 + }, + { + "epoch": 0.7426357840368009, + "grad_norm": 0.7313060164451599, + "learning_rate": 0.0001133473373904655, + "loss": 2.632, + "step": 9202 + }, + { + "epoch": 0.742716487773384, + "grad_norm": 0.6829206943511963, + "learning_rate": 0.00011333169158130353, + "loss": 2.5006, + "step": 9203 + }, + { + "epoch": 0.7427971915099669, + "grad_norm": 0.7324578166007996, + "learning_rate": 0.00011331604543988093, + "loss": 2.5004, + "step": 9204 + }, + { + "epoch": 0.7428778952465499, + "grad_norm": 0.6761097311973572, + "learning_rate": 0.00011330039896658766, + "loss": 2.5516, + "step": 9205 + }, + { + "epoch": 0.7429585989831329, + "grad_norm": 0.6909754276275635, + "learning_rate": 0.00011328475216181369, + "loss": 2.5273, + "step": 9206 + }, + { + "epoch": 0.743039302719716, + "grad_norm": 0.6420674324035645, + "learning_rate": 0.00011326910502594899, + "loss": 2.5507, + 
"step": 9207 + }, + { + "epoch": 0.7431200064562989, + "grad_norm": 0.6442455053329468, + "learning_rate": 0.0001132534575593835, + "loss": 2.542, + "step": 9208 + }, + { + "epoch": 0.7432007101928819, + "grad_norm": 0.7053101658821106, + "learning_rate": 0.0001132378097625072, + "loss": 2.5116, + "step": 9209 + }, + { + "epoch": 0.7432814139294649, + "grad_norm": 0.7570765614509583, + "learning_rate": 0.00011322216163571007, + "loss": 2.5576, + "step": 9210 + }, + { + "epoch": 0.743362117666048, + "grad_norm": 0.6937675476074219, + "learning_rate": 0.00011320651317938214, + "loss": 2.6212, + "step": 9211 + }, + { + "epoch": 0.743442821402631, + "grad_norm": 0.6741313934326172, + "learning_rate": 0.00011319086439391333, + "loss": 2.5723, + "step": 9212 + }, + { + "epoch": 0.7435235251392139, + "grad_norm": 0.711358904838562, + "learning_rate": 0.00011317521527969374, + "loss": 2.5713, + "step": 9213 + }, + { + "epoch": 0.7436042288757969, + "grad_norm": 0.7443268895149231, + "learning_rate": 0.00011315956583711331, + "loss": 2.5301, + "step": 9214 + }, + { + "epoch": 0.74368493261238, + "grad_norm": 0.7001742720603943, + "learning_rate": 0.00011314391606656212, + "loss": 2.5545, + "step": 9215 + }, + { + "epoch": 0.743765636348963, + "grad_norm": 0.7294990420341492, + "learning_rate": 0.00011312826596843019, + "loss": 2.5897, + "step": 9216 + }, + { + "epoch": 0.743846340085546, + "grad_norm": 0.706924319267273, + "learning_rate": 0.00011311261554310753, + "loss": 2.6477, + "step": 9217 + }, + { + "epoch": 0.7439270438221289, + "grad_norm": 0.7065039277076721, + "learning_rate": 0.00011309696479098423, + "loss": 2.5326, + "step": 9218 + }, + { + "epoch": 0.744007747558712, + "grad_norm": 0.6502599716186523, + "learning_rate": 0.00011308131371245037, + "loss": 2.5833, + "step": 9219 + }, + { + "epoch": 0.744088451295295, + "grad_norm": 0.7135158181190491, + "learning_rate": 0.00011306566230789592, + "loss": 2.5686, + "step": 9220 + }, + { + "epoch": 
0.744169155031878, + "grad_norm": 0.7239195108413696, + "learning_rate": 0.00011305001057771101, + "loss": 2.6303, + "step": 9221 + }, + { + "epoch": 0.744249858768461, + "grad_norm": 0.6442604660987854, + "learning_rate": 0.00011303435852228574, + "loss": 2.5495, + "step": 9222 + }, + { + "epoch": 0.744330562505044, + "grad_norm": 0.6700316071510315, + "learning_rate": 0.0001130187061420102, + "loss": 2.5575, + "step": 9223 + }, + { + "epoch": 0.744411266241627, + "grad_norm": 0.7532816529273987, + "learning_rate": 0.00011300305343727446, + "loss": 2.5174, + "step": 9224 + }, + { + "epoch": 0.74449196997821, + "grad_norm": 0.7614738941192627, + "learning_rate": 0.00011298740040846862, + "loss": 2.5995, + "step": 9225 + }, + { + "epoch": 0.744572673714793, + "grad_norm": 0.6781208515167236, + "learning_rate": 0.00011297174705598283, + "loss": 2.5225, + "step": 9226 + }, + { + "epoch": 0.744653377451376, + "grad_norm": 0.680525541305542, + "learning_rate": 0.0001129560933802072, + "loss": 2.5844, + "step": 9227 + }, + { + "epoch": 0.744734081187959, + "grad_norm": 0.7196657657623291, + "learning_rate": 0.00011294043938153185, + "loss": 2.564, + "step": 9228 + }, + { + "epoch": 0.744814784924542, + "grad_norm": 0.6997412443161011, + "learning_rate": 0.00011292478506034694, + "loss": 2.6486, + "step": 9229 + }, + { + "epoch": 0.744895488661125, + "grad_norm": 0.7438939809799194, + "learning_rate": 0.00011290913041704256, + "loss": 2.5667, + "step": 9230 + }, + { + "epoch": 0.744976192397708, + "grad_norm": 0.7391374707221985, + "learning_rate": 0.00011289347545200892, + "loss": 2.5974, + "step": 9231 + }, + { + "epoch": 0.745056896134291, + "grad_norm": 0.7845481634140015, + "learning_rate": 0.0001128778201656362, + "loss": 2.5168, + "step": 9232 + }, + { + "epoch": 0.745137599870874, + "grad_norm": 0.728712797164917, + "learning_rate": 0.00011286216455831449, + "loss": 2.5241, + "step": 9233 + }, + { + "epoch": 0.745218303607457, + "grad_norm": 0.7310191988945007, + 
"learning_rate": 0.00011284650863043407, + "loss": 2.5777, + "step": 9234 + }, + { + "epoch": 0.74529900734404, + "grad_norm": 0.6661474704742432, + "learning_rate": 0.00011283085238238503, + "loss": 2.5471, + "step": 9235 + }, + { + "epoch": 0.7453797110806231, + "grad_norm": 0.7697983384132385, + "learning_rate": 0.00011281519581455761, + "loss": 2.587, + "step": 9236 + }, + { + "epoch": 0.745460414817206, + "grad_norm": 0.7336567640304565, + "learning_rate": 0.00011279953892734203, + "loss": 2.5756, + "step": 9237 + }, + { + "epoch": 0.745541118553789, + "grad_norm": 0.6192059516906738, + "learning_rate": 0.00011278388172112848, + "loss": 2.5038, + "step": 9238 + }, + { + "epoch": 0.745621822290372, + "grad_norm": 0.7180300354957581, + "learning_rate": 0.00011276822419630719, + "loss": 2.5469, + "step": 9239 + }, + { + "epoch": 0.7457025260269551, + "grad_norm": 0.7583367824554443, + "learning_rate": 0.00011275256635326837, + "loss": 2.6274, + "step": 9240 + }, + { + "epoch": 0.7457832297635381, + "grad_norm": 0.6848096251487732, + "learning_rate": 0.00011273690819240221, + "loss": 2.5117, + "step": 9241 + }, + { + "epoch": 0.745863933500121, + "grad_norm": 0.6830503344535828, + "learning_rate": 0.00011272124971409907, + "loss": 2.5114, + "step": 9242 + }, + { + "epoch": 0.745944637236704, + "grad_norm": 0.780240535736084, + "learning_rate": 0.0001127055909187491, + "loss": 2.6432, + "step": 9243 + }, + { + "epoch": 0.7460253409732871, + "grad_norm": 0.7421274185180664, + "learning_rate": 0.00011268993180674261, + "loss": 2.5723, + "step": 9244 + }, + { + "epoch": 0.7461060447098701, + "grad_norm": 0.6695685386657715, + "learning_rate": 0.00011267427237846986, + "loss": 2.5335, + "step": 9245 + }, + { + "epoch": 0.746186748446453, + "grad_norm": 0.8390316963195801, + "learning_rate": 0.00011265861263432104, + "loss": 2.5125, + "step": 9246 + }, + { + "epoch": 0.746267452183036, + "grad_norm": 0.7030535936355591, + "learning_rate": 0.00011264295257468658, + 
"loss": 2.5986, + "step": 9247 + }, + { + "epoch": 0.7463481559196191, + "grad_norm": 0.6754253506660461, + "learning_rate": 0.00011262729219995669, + "loss": 2.5067, + "step": 9248 + }, + { + "epoch": 0.7464288596562021, + "grad_norm": 0.6809592843055725, + "learning_rate": 0.00011261163151052163, + "loss": 2.5359, + "step": 9249 + }, + { + "epoch": 0.7465095633927851, + "grad_norm": 0.6546878218650818, + "learning_rate": 0.00011259597050677178, + "loss": 2.5357, + "step": 9250 + }, + { + "epoch": 0.746590267129368, + "grad_norm": 0.6514731645584106, + "learning_rate": 0.00011258030918909739, + "loss": 2.5591, + "step": 9251 + }, + { + "epoch": 0.7466709708659511, + "grad_norm": 0.6981258392333984, + "learning_rate": 0.0001125646475578888, + "loss": 2.6171, + "step": 9252 + }, + { + "epoch": 0.7467516746025341, + "grad_norm": 0.6763784885406494, + "learning_rate": 0.00011254898561353639, + "loss": 2.5455, + "step": 9253 + }, + { + "epoch": 0.7468323783391171, + "grad_norm": 0.6241726279258728, + "learning_rate": 0.00011253332335643043, + "loss": 2.6073, + "step": 9254 + }, + { + "epoch": 0.7469130820757001, + "grad_norm": 0.6810312271118164, + "learning_rate": 0.00011251766078696132, + "loss": 2.5285, + "step": 9255 + }, + { + "epoch": 0.7469937858122832, + "grad_norm": 0.6603971123695374, + "learning_rate": 0.00011250199790551934, + "loss": 2.5985, + "step": 9256 + }, + { + "epoch": 0.7470744895488661, + "grad_norm": 0.69618159532547, + "learning_rate": 0.0001124863347124949, + "loss": 2.5728, + "step": 9257 + }, + { + "epoch": 0.7471551932854491, + "grad_norm": 0.6878889203071594, + "learning_rate": 0.00011247067120827837, + "loss": 2.5459, + "step": 9258 + }, + { + "epoch": 0.7472358970220321, + "grad_norm": 0.6613149046897888, + "learning_rate": 0.00011245500739326011, + "loss": 2.6559, + "step": 9259 + }, + { + "epoch": 0.7473166007586152, + "grad_norm": 0.6397448778152466, + "learning_rate": 0.00011243934326783053, + "loss": 2.5712, + "step": 9260 + }, + { + 
"epoch": 0.7473973044951981, + "grad_norm": 0.6804259419441223, + "learning_rate": 0.00011242367883237996, + "loss": 2.6143, + "step": 9261 + }, + { + "epoch": 0.7474780082317811, + "grad_norm": 0.8029066324234009, + "learning_rate": 0.00011240801408729884, + "loss": 2.5702, + "step": 9262 + }, + { + "epoch": 0.7475587119683641, + "grad_norm": 0.7086285948753357, + "learning_rate": 0.00011239234903297761, + "loss": 2.6113, + "step": 9263 + }, + { + "epoch": 0.7476394157049472, + "grad_norm": 0.6980452537536621, + "learning_rate": 0.00011237668366980665, + "loss": 2.6355, + "step": 9264 + }, + { + "epoch": 0.7477201194415302, + "grad_norm": 0.6906906962394714, + "learning_rate": 0.00011236101799817636, + "loss": 2.5605, + "step": 9265 + }, + { + "epoch": 0.7478008231781131, + "grad_norm": 0.7412894368171692, + "learning_rate": 0.00011234535201847716, + "loss": 2.6073, + "step": 9266 + }, + { + "epoch": 0.7478815269146961, + "grad_norm": 0.6949330568313599, + "learning_rate": 0.00011232968573109955, + "loss": 2.5623, + "step": 9267 + }, + { + "epoch": 0.7479622306512792, + "grad_norm": 0.6916515827178955, + "learning_rate": 0.00011231401913643393, + "loss": 2.5348, + "step": 9268 + }, + { + "epoch": 0.7480429343878622, + "grad_norm": 0.7576180696487427, + "learning_rate": 0.0001122983522348708, + "loss": 2.5968, + "step": 9269 + }, + { + "epoch": 0.7481236381244452, + "grad_norm": 0.6734197735786438, + "learning_rate": 0.00011228268502680052, + "loss": 2.5185, + "step": 9270 + }, + { + "epoch": 0.7482043418610281, + "grad_norm": 0.6952544450759888, + "learning_rate": 0.00011226701751261367, + "loss": 2.57, + "step": 9271 + }, + { + "epoch": 0.7482850455976112, + "grad_norm": 0.6504654884338379, + "learning_rate": 0.00011225134969270068, + "loss": 2.5677, + "step": 9272 + }, + { + "epoch": 0.7483657493341942, + "grad_norm": 0.6843643188476562, + "learning_rate": 0.00011223568156745198, + "loss": 2.5686, + "step": 9273 + }, + { + "epoch": 0.7484464530707772, + 
"grad_norm": 0.6786371469497681, + "learning_rate": 0.00011222001313725816, + "loss": 2.5024, + "step": 9274 + }, + { + "epoch": 0.7485271568073602, + "grad_norm": 0.6431117057800293, + "learning_rate": 0.00011220434440250967, + "loss": 2.5206, + "step": 9275 + }, + { + "epoch": 0.7486078605439432, + "grad_norm": 0.699547290802002, + "learning_rate": 0.000112188675363597, + "loss": 2.5974, + "step": 9276 + }, + { + "epoch": 0.7486885642805262, + "grad_norm": 0.6870436072349548, + "learning_rate": 0.00011217300602091067, + "loss": 2.5303, + "step": 9277 + }, + { + "epoch": 0.7487692680171092, + "grad_norm": 0.7032173871994019, + "learning_rate": 0.0001121573363748412, + "loss": 2.5045, + "step": 9278 + }, + { + "epoch": 0.7488499717536922, + "grad_norm": 0.6890417337417603, + "learning_rate": 0.00011214166642577917, + "loss": 2.5945, + "step": 9279 + }, + { + "epoch": 0.7489306754902753, + "grad_norm": 0.7257806062698364, + "learning_rate": 0.00011212599617411506, + "loss": 2.6013, + "step": 9280 + }, + { + "epoch": 0.7490113792268582, + "grad_norm": 0.722561240196228, + "learning_rate": 0.0001121103256202394, + "loss": 2.5809, + "step": 9281 + }, + { + "epoch": 0.7490920829634412, + "grad_norm": 0.7360994219779968, + "learning_rate": 0.00011209465476454277, + "loss": 2.5036, + "step": 9282 + }, + { + "epoch": 0.7491727867000242, + "grad_norm": 0.6561676263809204, + "learning_rate": 0.00011207898360741574, + "loss": 2.5302, + "step": 9283 + }, + { + "epoch": 0.7492534904366072, + "grad_norm": 0.7454147338867188, + "learning_rate": 0.00011206331214924887, + "loss": 2.5511, + "step": 9284 + }, + { + "epoch": 0.7493341941731902, + "grad_norm": 0.7085482478141785, + "learning_rate": 0.00011204764039043275, + "loss": 2.5743, + "step": 9285 + }, + { + "epoch": 0.7494148979097732, + "grad_norm": 0.691872775554657, + "learning_rate": 0.0001120319683313579, + "loss": 2.5414, + "step": 9286 + }, + { + "epoch": 0.7494956016463562, + "grad_norm": 0.6661050915718079, + 
"learning_rate": 0.00011201629597241496, + "loss": 2.5418, + "step": 9287 + }, + { + "epoch": 0.7495763053829392, + "grad_norm": 0.7440990805625916, + "learning_rate": 0.00011200062331399452, + "loss": 2.5543, + "step": 9288 + }, + { + "epoch": 0.7496570091195223, + "grad_norm": 0.6655303835868835, + "learning_rate": 0.00011198495035648715, + "loss": 2.5629, + "step": 9289 + }, + { + "epoch": 0.7497377128561052, + "grad_norm": 0.7550996541976929, + "learning_rate": 0.00011196927710028353, + "loss": 2.5376, + "step": 9290 + }, + { + "epoch": 0.7498184165926882, + "grad_norm": 0.692915678024292, + "learning_rate": 0.00011195360354577422, + "loss": 2.4661, + "step": 9291 + }, + { + "epoch": 0.7498991203292712, + "grad_norm": 0.7572253346443176, + "learning_rate": 0.00011193792969334985, + "loss": 2.5641, + "step": 9292 + }, + { + "epoch": 0.7499798240658543, + "grad_norm": 0.6550531387329102, + "learning_rate": 0.00011192225554340107, + "loss": 2.5591, + "step": 9293 + }, + { + "epoch": 0.7500605278024373, + "grad_norm": 0.677130401134491, + "learning_rate": 0.0001119065810963185, + "loss": 2.5859, + "step": 9294 + }, + { + "epoch": 0.7501412315390202, + "grad_norm": 0.680673360824585, + "learning_rate": 0.00011189090635249287, + "loss": 2.5343, + "step": 9295 + }, + { + "epoch": 0.7502219352756032, + "grad_norm": 0.7574957609176636, + "learning_rate": 0.00011187523131231472, + "loss": 2.5966, + "step": 9296 + }, + { + "epoch": 0.7503026390121863, + "grad_norm": 0.7099971175193787, + "learning_rate": 0.00011185955597617474, + "loss": 2.5547, + "step": 9297 + }, + { + "epoch": 0.7503833427487693, + "grad_norm": 0.7153162956237793, + "learning_rate": 0.00011184388034446367, + "loss": 2.5986, + "step": 9298 + }, + { + "epoch": 0.7504640464853523, + "grad_norm": 0.7154852747917175, + "learning_rate": 0.00011182820441757212, + "loss": 2.5214, + "step": 9299 + }, + { + "epoch": 0.7505447502219352, + "grad_norm": 0.6899208426475525, + "learning_rate": 0.00011181252819589081, 
+ "loss": 2.5026, + "step": 9300 + }, + { + "epoch": 0.7506254539585183, + "grad_norm": 0.6719048023223877, + "learning_rate": 0.00011179685167981041, + "loss": 2.5915, + "step": 9301 + }, + { + "epoch": 0.7507061576951013, + "grad_norm": 0.6664413213729858, + "learning_rate": 0.00011178117486972164, + "loss": 2.5479, + "step": 9302 + }, + { + "epoch": 0.7507868614316843, + "grad_norm": 0.7433286905288696, + "learning_rate": 0.00011176549776601517, + "loss": 2.5941, + "step": 9303 + }, + { + "epoch": 0.7508675651682672, + "grad_norm": 0.7868518233299255, + "learning_rate": 0.00011174982036908177, + "loss": 2.5537, + "step": 9304 + }, + { + "epoch": 0.7509482689048503, + "grad_norm": 0.7037336826324463, + "learning_rate": 0.0001117341426793121, + "loss": 2.568, + "step": 9305 + }, + { + "epoch": 0.7510289726414333, + "grad_norm": 0.6630405783653259, + "learning_rate": 0.00011171846469709697, + "loss": 2.4906, + "step": 9306 + }, + { + "epoch": 0.7511096763780163, + "grad_norm": 0.7398669719696045, + "learning_rate": 0.00011170278642282701, + "loss": 2.574, + "step": 9307 + }, + { + "epoch": 0.7511903801145993, + "grad_norm": 0.7557641267776489, + "learning_rate": 0.00011168710785689304, + "loss": 2.5237, + "step": 9308 + }, + { + "epoch": 0.7512710838511824, + "grad_norm": 0.6883708238601685, + "learning_rate": 0.00011167142899968581, + "loss": 2.5643, + "step": 9309 + }, + { + "epoch": 0.7513517875877653, + "grad_norm": 0.6623669862747192, + "learning_rate": 0.00011165574985159606, + "loss": 2.5319, + "step": 9310 + }, + { + "epoch": 0.7514324913243483, + "grad_norm": 0.6938778758049011, + "learning_rate": 0.00011164007041301454, + "loss": 2.5083, + "step": 9311 + }, + { + "epoch": 0.7515131950609313, + "grad_norm": 0.718534529209137, + "learning_rate": 0.00011162439068433204, + "loss": 2.4791, + "step": 9312 + }, + { + "epoch": 0.7515938987975144, + "grad_norm": 0.672113299369812, + "learning_rate": 0.00011160871066593934, + "loss": 2.5264, + "step": 9313 + }, + { 
+ "epoch": 0.7516746025340973, + "grad_norm": 0.6854343414306641, + "learning_rate": 0.00011159303035822723, + "loss": 2.5734, + "step": 9314 + }, + { + "epoch": 0.7517553062706803, + "grad_norm": 0.6494589447975159, + "learning_rate": 0.0001115773497615865, + "loss": 2.5564, + "step": 9315 + }, + { + "epoch": 0.7518360100072633, + "grad_norm": 0.7219608426094055, + "learning_rate": 0.00011156166887640793, + "loss": 2.6049, + "step": 9316 + }, + { + "epoch": 0.7519167137438464, + "grad_norm": 0.6892502903938293, + "learning_rate": 0.00011154598770308236, + "loss": 2.5333, + "step": 9317 + }, + { + "epoch": 0.7519974174804294, + "grad_norm": 0.6670175790786743, + "learning_rate": 0.0001115303062420006, + "loss": 2.5882, + "step": 9318 + }, + { + "epoch": 0.7520781212170123, + "grad_norm": 0.7367776036262512, + "learning_rate": 0.00011151462449355347, + "loss": 2.5634, + "step": 9319 + }, + { + "epoch": 0.7521588249535953, + "grad_norm": 0.6971952319145203, + "learning_rate": 0.00011149894245813182, + "loss": 2.5323, + "step": 9320 + }, + { + "epoch": 0.7522395286901784, + "grad_norm": 0.6555755734443665, + "learning_rate": 0.00011148326013612642, + "loss": 2.5597, + "step": 9321 + }, + { + "epoch": 0.7523202324267614, + "grad_norm": 0.7004384994506836, + "learning_rate": 0.00011146757752792819, + "loss": 2.4761, + "step": 9322 + }, + { + "epoch": 0.7524009361633444, + "grad_norm": 0.7151978015899658, + "learning_rate": 0.00011145189463392791, + "loss": 2.5825, + "step": 9323 + }, + { + "epoch": 0.7524816398999273, + "grad_norm": 0.7176918387413025, + "learning_rate": 0.00011143621145451653, + "loss": 2.6112, + "step": 9324 + }, + { + "epoch": 0.7525623436365104, + "grad_norm": 0.7156146168708801, + "learning_rate": 0.00011142052799008487, + "loss": 2.5293, + "step": 9325 + }, + { + "epoch": 0.7526430473730934, + "grad_norm": 0.7360113263130188, + "learning_rate": 0.00011140484424102375, + "loss": 2.5703, + "step": 9326 + }, + { + "epoch": 0.7527237511096764, + 
"grad_norm": 0.65630042552948, + "learning_rate": 0.00011138916020772414, + "loss": 2.5224, + "step": 9327 + }, + { + "epoch": 0.7528044548462594, + "grad_norm": 0.7088161110877991, + "learning_rate": 0.00011137347589057687, + "loss": 2.6673, + "step": 9328 + }, + { + "epoch": 0.7528851585828424, + "grad_norm": 0.7335243821144104, + "learning_rate": 0.00011135779128997283, + "loss": 2.5693, + "step": 9329 + }, + { + "epoch": 0.7529658623194254, + "grad_norm": 0.7166211605072021, + "learning_rate": 0.00011134210640630298, + "loss": 2.5612, + "step": 9330 + }, + { + "epoch": 0.7530465660560084, + "grad_norm": 0.7324960231781006, + "learning_rate": 0.00011132642123995816, + "loss": 2.5682, + "step": 9331 + }, + { + "epoch": 0.7531272697925914, + "grad_norm": 0.7133917808532715, + "learning_rate": 0.00011131073579132936, + "loss": 2.6131, + "step": 9332 + }, + { + "epoch": 0.7532079735291743, + "grad_norm": 0.678741455078125, + "learning_rate": 0.0001112950500608074, + "loss": 2.6109, + "step": 9333 + }, + { + "epoch": 0.7532886772657574, + "grad_norm": 0.7000784277915955, + "learning_rate": 0.0001112793640487833, + "loss": 2.5087, + "step": 9334 + }, + { + "epoch": 0.7533693810023404, + "grad_norm": 0.719976544380188, + "learning_rate": 0.00011126367775564795, + "loss": 2.4665, + "step": 9335 + }, + { + "epoch": 0.7534500847389234, + "grad_norm": 0.7127155065536499, + "learning_rate": 0.00011124799118179232, + "loss": 2.5254, + "step": 9336 + }, + { + "epoch": 0.7535307884755064, + "grad_norm": 0.6306474804878235, + "learning_rate": 0.00011123230432760734, + "loss": 2.5487, + "step": 9337 + }, + { + "epoch": 0.7536114922120895, + "grad_norm": 0.667019784450531, + "learning_rate": 0.00011121661719348397, + "loss": 2.5576, + "step": 9338 + }, + { + "epoch": 0.7536921959486724, + "grad_norm": 0.6869673132896423, + "learning_rate": 0.00011120092977981318, + "loss": 2.544, + "step": 9339 + }, + { + "epoch": 0.7537728996852554, + "grad_norm": 0.6688670516014099, + 
"learning_rate": 0.00011118524208698596, + "loss": 2.6017, + "step": 9340 + }, + { + "epoch": 0.7538536034218384, + "grad_norm": 0.6717860102653503, + "learning_rate": 0.00011116955411539325, + "loss": 2.5571, + "step": 9341 + }, + { + "epoch": 0.7539343071584215, + "grad_norm": 0.7113999724388123, + "learning_rate": 0.00011115386586542604, + "loss": 2.5684, + "step": 9342 + }, + { + "epoch": 0.7540150108950044, + "grad_norm": 0.6687907576560974, + "learning_rate": 0.00011113817733747536, + "loss": 2.548, + "step": 9343 + }, + { + "epoch": 0.7540957146315874, + "grad_norm": 0.6828920841217041, + "learning_rate": 0.00011112248853193219, + "loss": 2.5544, + "step": 9344 + }, + { + "epoch": 0.7541764183681704, + "grad_norm": 0.6793262362480164, + "learning_rate": 0.00011110679944918749, + "loss": 2.4655, + "step": 9345 + }, + { + "epoch": 0.7542571221047535, + "grad_norm": 0.6812230348587036, + "learning_rate": 0.00011109111008963235, + "loss": 2.5473, + "step": 9346 + }, + { + "epoch": 0.7543378258413365, + "grad_norm": 0.6838300824165344, + "learning_rate": 0.00011107542045365775, + "loss": 2.5248, + "step": 9347 + }, + { + "epoch": 0.7544185295779194, + "grad_norm": 0.7101932764053345, + "learning_rate": 0.0001110597305416547, + "loss": 2.5235, + "step": 9348 + }, + { + "epoch": 0.7544992333145024, + "grad_norm": 0.7136144042015076, + "learning_rate": 0.0001110440403540143, + "loss": 2.5592, + "step": 9349 + }, + { + "epoch": 0.7545799370510855, + "grad_norm": 0.6673154234886169, + "learning_rate": 0.00011102834989112751, + "loss": 2.4962, + "step": 9350 + }, + { + "epoch": 0.7546606407876685, + "grad_norm": 0.6849049925804138, + "learning_rate": 0.00011101265915338544, + "loss": 2.5793, + "step": 9351 + }, + { + "epoch": 0.7547413445242515, + "grad_norm": 0.7239733338356018, + "learning_rate": 0.0001109969681411791, + "loss": 2.5556, + "step": 9352 + }, + { + "epoch": 0.7548220482608344, + "grad_norm": 0.6738215684890747, + "learning_rate": 0.00011098127685489955, 
+ "loss": 2.6181, + "step": 9353 + }, + { + "epoch": 0.7549027519974175, + "grad_norm": 0.6212114095687866, + "learning_rate": 0.00011096558529493787, + "loss": 2.5509, + "step": 9354 + }, + { + "epoch": 0.7549834557340005, + "grad_norm": 0.6801952123641968, + "learning_rate": 0.00011094989346168517, + "loss": 2.6454, + "step": 9355 + }, + { + "epoch": 0.7550641594705835, + "grad_norm": 0.6605944037437439, + "learning_rate": 0.0001109342013555325, + "loss": 2.5218, + "step": 9356 + }, + { + "epoch": 0.7551448632071665, + "grad_norm": 0.6486438512802124, + "learning_rate": 0.00011091850897687096, + "loss": 2.5431, + "step": 9357 + }, + { + "epoch": 0.7552255669437495, + "grad_norm": 0.6701794266700745, + "learning_rate": 0.0001109028163260916, + "loss": 2.563, + "step": 9358 + }, + { + "epoch": 0.7553062706803325, + "grad_norm": 0.6486446261405945, + "learning_rate": 0.00011088712340358555, + "loss": 2.5147, + "step": 9359 + }, + { + "epoch": 0.7553869744169155, + "grad_norm": 0.695197582244873, + "learning_rate": 0.00011087143020974396, + "loss": 2.5707, + "step": 9360 + }, + { + "epoch": 0.7554676781534985, + "grad_norm": 0.6910821199417114, + "learning_rate": 0.00011085573674495791, + "loss": 2.5797, + "step": 9361 + }, + { + "epoch": 0.7555483818900816, + "grad_norm": 0.7084208726882935, + "learning_rate": 0.00011084004300961852, + "loss": 2.5362, + "step": 9362 + }, + { + "epoch": 0.7556290856266645, + "grad_norm": 0.6750916242599487, + "learning_rate": 0.00011082434900411691, + "loss": 2.5554, + "step": 9363 + }, + { + "epoch": 0.7557097893632475, + "grad_norm": 0.6711466908454895, + "learning_rate": 0.0001108086547288442, + "loss": 2.5577, + "step": 9364 + }, + { + "epoch": 0.7557904930998305, + "grad_norm": 0.7267118096351624, + "learning_rate": 0.00011079296018419163, + "loss": 2.5422, + "step": 9365 + }, + { + "epoch": 0.7558711968364136, + "grad_norm": 0.692730188369751, + "learning_rate": 0.00011077726537055021, + "loss": 2.5281, + "step": 9366 + }, + { 
+ "epoch": 0.7559519005729965, + "grad_norm": 0.7071926593780518, + "learning_rate": 0.00011076157028831122, + "loss": 2.5273, + "step": 9367 + }, + { + "epoch": 0.7560326043095795, + "grad_norm": 0.7662521600723267, + "learning_rate": 0.00011074587493786574, + "loss": 2.5433, + "step": 9368 + }, + { + "epoch": 0.7561133080461625, + "grad_norm": 0.7173436880111694, + "learning_rate": 0.00011073017931960496, + "loss": 2.579, + "step": 9369 + }, + { + "epoch": 0.7561940117827456, + "grad_norm": 0.6401154398918152, + "learning_rate": 0.00011071448343392008, + "loss": 2.5189, + "step": 9370 + }, + { + "epoch": 0.7562747155193286, + "grad_norm": 0.6510714292526245, + "learning_rate": 0.00011069878728120224, + "loss": 2.5682, + "step": 9371 + }, + { + "epoch": 0.7563554192559115, + "grad_norm": 0.7189988493919373, + "learning_rate": 0.00011068309086184269, + "loss": 2.5247, + "step": 9372 + }, + { + "epoch": 0.7564361229924945, + "grad_norm": 0.678753137588501, + "learning_rate": 0.00011066739417623258, + "loss": 2.5083, + "step": 9373 + }, + { + "epoch": 0.7565168267290776, + "grad_norm": 0.6903115510940552, + "learning_rate": 0.0001106516972247631, + "loss": 2.5658, + "step": 9374 + }, + { + "epoch": 0.7565975304656606, + "grad_norm": 0.6772382855415344, + "learning_rate": 0.0001106360000078255, + "loss": 2.5445, + "step": 9375 + }, + { + "epoch": 0.7566782342022436, + "grad_norm": 0.6655055284500122, + "learning_rate": 0.00011062030252581097, + "loss": 2.5186, + "step": 9376 + }, + { + "epoch": 0.7567589379388265, + "grad_norm": 0.7173851728439331, + "learning_rate": 0.00011060460477911074, + "loss": 2.5297, + "step": 9377 + }, + { + "epoch": 0.7568396416754096, + "grad_norm": 0.6891282200813293, + "learning_rate": 0.00011058890676811606, + "loss": 2.5706, + "step": 9378 + }, + { + "epoch": 0.7569203454119926, + "grad_norm": 0.7053082585334778, + "learning_rate": 0.0001105732084932181, + "loss": 2.5475, + "step": 9379 + }, + { + "epoch": 0.7570010491485756, + 
"grad_norm": 0.7503373622894287, + "learning_rate": 0.00011055750995480818, + "loss": 2.6438, + "step": 9380 + }, + { + "epoch": 0.7570817528851586, + "grad_norm": 0.6703453660011292, + "learning_rate": 0.0001105418111532775, + "loss": 2.5485, + "step": 9381 + }, + { + "epoch": 0.7571624566217416, + "grad_norm": 0.6651757955551147, + "learning_rate": 0.00011052611208901733, + "loss": 2.6079, + "step": 9382 + }, + { + "epoch": 0.7572431603583246, + "grad_norm": 0.6738902926445007, + "learning_rate": 0.00011051041276241895, + "loss": 2.5279, + "step": 9383 + }, + { + "epoch": 0.7573238640949076, + "grad_norm": 0.6803816556930542, + "learning_rate": 0.00011049471317387357, + "loss": 2.5972, + "step": 9384 + }, + { + "epoch": 0.7574045678314906, + "grad_norm": 0.7127584218978882, + "learning_rate": 0.00011047901332377253, + "loss": 2.5275, + "step": 9385 + }, + { + "epoch": 0.7574852715680735, + "grad_norm": 0.7655676007270813, + "learning_rate": 0.00011046331321250711, + "loss": 2.6491, + "step": 9386 + }, + { + "epoch": 0.7575659753046566, + "grad_norm": 0.7005762457847595, + "learning_rate": 0.00011044761284046854, + "loss": 2.5266, + "step": 9387 + }, + { + "epoch": 0.7576466790412396, + "grad_norm": 0.701931357383728, + "learning_rate": 0.00011043191220804817, + "loss": 2.5556, + "step": 9388 + }, + { + "epoch": 0.7577273827778226, + "grad_norm": 0.6888757944107056, + "learning_rate": 0.00011041621131563724, + "loss": 2.5654, + "step": 9389 + }, + { + "epoch": 0.7578080865144056, + "grad_norm": 0.7119149565696716, + "learning_rate": 0.00011040051016362711, + "loss": 2.5925, + "step": 9390 + }, + { + "epoch": 0.7578887902509887, + "grad_norm": 0.7378301024436951, + "learning_rate": 0.00011038480875240911, + "loss": 2.5604, + "step": 9391 + }, + { + "epoch": 0.7579694939875716, + "grad_norm": 0.7221272587776184, + "learning_rate": 0.00011036910708237449, + "loss": 2.5293, + "step": 9392 + }, + { + "epoch": 0.7580501977241546, + "grad_norm": 0.6895891427993774, + 
"learning_rate": 0.00011035340515391465, + "loss": 2.5177, + "step": 9393 + }, + { + "epoch": 0.7581309014607376, + "grad_norm": 0.6812298893928528, + "learning_rate": 0.00011033770296742086, + "loss": 2.6345, + "step": 9394 + }, + { + "epoch": 0.7582116051973207, + "grad_norm": 0.6733750700950623, + "learning_rate": 0.00011032200052328449, + "loss": 2.5548, + "step": 9395 + }, + { + "epoch": 0.7582923089339036, + "grad_norm": 0.7667728066444397, + "learning_rate": 0.00011030629782189692, + "loss": 2.5858, + "step": 9396 + }, + { + "epoch": 0.7583730126704866, + "grad_norm": 0.6809018850326538, + "learning_rate": 0.00011029059486364946, + "loss": 2.6028, + "step": 9397 + }, + { + "epoch": 0.7584537164070696, + "grad_norm": 0.6817305684089661, + "learning_rate": 0.00011027489164893345, + "loss": 2.5594, + "step": 9398 + }, + { + "epoch": 0.7585344201436527, + "grad_norm": 0.6936343908309937, + "learning_rate": 0.00011025918817814027, + "loss": 2.4997, + "step": 9399 + }, + { + "epoch": 0.7586151238802357, + "grad_norm": 0.7046801447868347, + "learning_rate": 0.00011024348445166133, + "loss": 2.5199, + "step": 9400 + }, + { + "epoch": 0.7586958276168186, + "grad_norm": 0.7247316241264343, + "learning_rate": 0.00011022778046988798, + "loss": 2.5233, + "step": 9401 + }, + { + "epoch": 0.7587765313534016, + "grad_norm": 0.675652265548706, + "learning_rate": 0.00011021207623321162, + "loss": 2.5213, + "step": 9402 + }, + { + "epoch": 0.7588572350899847, + "grad_norm": 0.6866120100021362, + "learning_rate": 0.0001101963717420236, + "loss": 2.6026, + "step": 9403 + }, + { + "epoch": 0.7589379388265677, + "grad_norm": 0.7168806791305542, + "learning_rate": 0.00011018066699671534, + "loss": 2.5707, + "step": 9404 + }, + { + "epoch": 0.7590186425631507, + "grad_norm": 0.6858265995979309, + "learning_rate": 0.00011016496199767825, + "loss": 2.5313, + "step": 9405 + }, + { + "epoch": 0.7590993462997336, + "grad_norm": 0.7064315676689148, + "learning_rate": 
0.00011014925674530375, + "loss": 2.5362, + "step": 9406 + }, + { + "epoch": 0.7591800500363167, + "grad_norm": 0.658385694026947, + "learning_rate": 0.00011013355123998324, + "loss": 2.5773, + "step": 9407 + }, + { + "epoch": 0.7592607537728997, + "grad_norm": 0.7112493515014648, + "learning_rate": 0.00011011784548210813, + "loss": 2.589, + "step": 9408 + }, + { + "epoch": 0.7593414575094827, + "grad_norm": 0.6835871934890747, + "learning_rate": 0.00011010213947206986, + "loss": 2.5952, + "step": 9409 + }, + { + "epoch": 0.7594221612460657, + "grad_norm": 0.6920506358146667, + "learning_rate": 0.00011008643321025989, + "loss": 2.5433, + "step": 9410 + }, + { + "epoch": 0.7595028649826487, + "grad_norm": 0.7239150404930115, + "learning_rate": 0.00011007072669706962, + "loss": 2.5291, + "step": 9411 + }, + { + "epoch": 0.7595835687192317, + "grad_norm": 0.644568145275116, + "learning_rate": 0.00011005501993289052, + "loss": 2.5324, + "step": 9412 + }, + { + "epoch": 0.7596642724558147, + "grad_norm": 0.6604863405227661, + "learning_rate": 0.00011003931291811405, + "loss": 2.561, + "step": 9413 + }, + { + "epoch": 0.7597449761923977, + "grad_norm": 0.7056753635406494, + "learning_rate": 0.00011002360565313164, + "loss": 2.6537, + "step": 9414 + }, + { + "epoch": 0.7598256799289808, + "grad_norm": 0.6712720394134521, + "learning_rate": 0.00011000789813833476, + "loss": 2.5222, + "step": 9415 + }, + { + "epoch": 0.7599063836655637, + "grad_norm": 0.6829253435134888, + "learning_rate": 0.00010999219037411492, + "loss": 2.5156, + "step": 9416 + }, + { + "epoch": 0.7599870874021467, + "grad_norm": 0.7386518120765686, + "learning_rate": 0.00010997648236086359, + "loss": 2.5378, + "step": 9417 + }, + { + "epoch": 0.7600677911387297, + "grad_norm": 0.6711105108261108, + "learning_rate": 0.00010996077409897223, + "loss": 2.4985, + "step": 9418 + }, + { + "epoch": 0.7601484948753128, + "grad_norm": 0.6936883926391602, + "learning_rate": 0.00010994506558883233, + "loss": 
2.4912, + "step": 9419 + }, + { + "epoch": 0.7602291986118958, + "grad_norm": 0.6927978992462158, + "learning_rate": 0.00010992935683083541, + "loss": 2.5526, + "step": 9420 + }, + { + "epoch": 0.7603099023484787, + "grad_norm": 0.7661495804786682, + "learning_rate": 0.00010991364782537297, + "loss": 2.5778, + "step": 9421 + }, + { + "epoch": 0.7603906060850617, + "grad_norm": 0.7092108726501465, + "learning_rate": 0.0001098979385728365, + "loss": 2.6557, + "step": 9422 + }, + { + "epoch": 0.7604713098216448, + "grad_norm": 0.696666419506073, + "learning_rate": 0.00010988222907361754, + "loss": 2.4897, + "step": 9423 + }, + { + "epoch": 0.7605520135582278, + "grad_norm": 0.6836280822753906, + "learning_rate": 0.00010986651932810756, + "loss": 2.5146, + "step": 9424 + }, + { + "epoch": 0.7606327172948107, + "grad_norm": 0.7269579768180847, + "learning_rate": 0.00010985080933669815, + "loss": 2.5314, + "step": 9425 + }, + { + "epoch": 0.7607134210313937, + "grad_norm": 0.6862092018127441, + "learning_rate": 0.00010983509909978085, + "loss": 2.5415, + "step": 9426 + }, + { + "epoch": 0.7607941247679768, + "grad_norm": 0.7068747878074646, + "learning_rate": 0.00010981938861774713, + "loss": 2.5919, + "step": 9427 + }, + { + "epoch": 0.7608748285045598, + "grad_norm": 0.699999213218689, + "learning_rate": 0.0001098036778909886, + "loss": 2.5175, + "step": 9428 + }, + { + "epoch": 0.7609555322411428, + "grad_norm": 0.6642772555351257, + "learning_rate": 0.0001097879669198968, + "loss": 2.5721, + "step": 9429 + }, + { + "epoch": 0.7610362359777257, + "grad_norm": 0.7100533843040466, + "learning_rate": 0.00010977225570486323, + "loss": 2.5189, + "step": 9430 + }, + { + "epoch": 0.7611169397143088, + "grad_norm": 0.7289063930511475, + "learning_rate": 0.00010975654424627955, + "loss": 2.6139, + "step": 9431 + }, + { + "epoch": 0.7611976434508918, + "grad_norm": 0.7289659380912781, + "learning_rate": 0.00010974083254453726, + "loss": 2.5201, + "step": 9432 + }, + { + 
"epoch": 0.7612783471874748, + "grad_norm": 0.7389557957649231, + "learning_rate": 0.000109725120600028, + "loss": 2.559, + "step": 9433 + }, + { + "epoch": 0.7613590509240578, + "grad_norm": 0.7021538615226746, + "learning_rate": 0.00010970940841314327, + "loss": 2.6353, + "step": 9434 + }, + { + "epoch": 0.7614397546606407, + "grad_norm": 0.6614113450050354, + "learning_rate": 0.0001096936959842747, + "loss": 2.54, + "step": 9435 + }, + { + "epoch": 0.7615204583972238, + "grad_norm": 0.6905426979064941, + "learning_rate": 0.00010967798331381392, + "loss": 2.5845, + "step": 9436 + }, + { + "epoch": 0.7616011621338068, + "grad_norm": 0.8183904886245728, + "learning_rate": 0.00010966227040215247, + "loss": 2.5255, + "step": 9437 + }, + { + "epoch": 0.7616818658703898, + "grad_norm": 0.7404630780220032, + "learning_rate": 0.00010964655724968199, + "loss": 2.5726, + "step": 9438 + }, + { + "epoch": 0.7617625696069728, + "grad_norm": 0.657127320766449, + "learning_rate": 0.0001096308438567941, + "loss": 2.6233, + "step": 9439 + }, + { + "epoch": 0.7618432733435558, + "grad_norm": 0.7417906522750854, + "learning_rate": 0.00010961513022388039, + "loss": 2.6361, + "step": 9440 + }, + { + "epoch": 0.7619239770801388, + "grad_norm": 0.6930029988288879, + "learning_rate": 0.00010959941635133249, + "loss": 2.5164, + "step": 9441 + }, + { + "epoch": 0.7620046808167218, + "grad_norm": 0.6897261738777161, + "learning_rate": 0.00010958370223954207, + "loss": 2.5626, + "step": 9442 + }, + { + "epoch": 0.7620853845533048, + "grad_norm": 0.6737398505210876, + "learning_rate": 0.00010956798788890072, + "loss": 2.5342, + "step": 9443 + }, + { + "epoch": 0.7621660882898879, + "grad_norm": 0.6550001502037048, + "learning_rate": 0.0001095522732998001, + "loss": 2.5604, + "step": 9444 + }, + { + "epoch": 0.7622467920264708, + "grad_norm": 0.7184637784957886, + "learning_rate": 0.00010953655847263187, + "loss": 2.6006, + "step": 9445 + }, + { + "epoch": 0.7623274957630538, + "grad_norm": 
0.6188609600067139, + "learning_rate": 0.00010952084340778766, + "loss": 2.4875, + "step": 9446 + }, + { + "epoch": 0.7624081994996368, + "grad_norm": 0.6550862789154053, + "learning_rate": 0.00010950512810565917, + "loss": 2.5794, + "step": 9447 + }, + { + "epoch": 0.7624889032362199, + "grad_norm": 0.6659231781959534, + "learning_rate": 0.000109489412566638, + "loss": 2.5137, + "step": 9448 + }, + { + "epoch": 0.7625696069728028, + "grad_norm": 0.749376118183136, + "learning_rate": 0.00010947369679111592, + "loss": 2.5923, + "step": 9449 + }, + { + "epoch": 0.7626503107093858, + "grad_norm": 0.6597894430160522, + "learning_rate": 0.0001094579807794845, + "loss": 2.5677, + "step": 9450 + }, + { + "epoch": 0.7627310144459688, + "grad_norm": 0.7194519639015198, + "learning_rate": 0.00010944226453213548, + "loss": 2.5754, + "step": 9451 + }, + { + "epoch": 0.7628117181825519, + "grad_norm": 0.6734583377838135, + "learning_rate": 0.00010942654804946057, + "loss": 2.535, + "step": 9452 + }, + { + "epoch": 0.7628924219191349, + "grad_norm": 0.7171904444694519, + "learning_rate": 0.00010941083133185146, + "loss": 2.5431, + "step": 9453 + }, + { + "epoch": 0.7629731256557178, + "grad_norm": 0.6760339736938477, + "learning_rate": 0.00010939511437969978, + "loss": 2.5163, + "step": 9454 + }, + { + "epoch": 0.7630538293923008, + "grad_norm": 0.6720966696739197, + "learning_rate": 0.00010937939719339731, + "loss": 2.5621, + "step": 9455 + }, + { + "epoch": 0.7631345331288839, + "grad_norm": 0.6374503970146179, + "learning_rate": 0.00010936367977333574, + "loss": 2.5007, + "step": 9456 + }, + { + "epoch": 0.7632152368654669, + "grad_norm": 0.6407146453857422, + "learning_rate": 0.00010934796211990684, + "loss": 2.5724, + "step": 9457 + }, + { + "epoch": 0.7632959406020499, + "grad_norm": 0.6685383319854736, + "learning_rate": 0.00010933224423350225, + "loss": 2.501, + "step": 9458 + }, + { + "epoch": 0.7633766443386328, + "grad_norm": 0.664806604385376, + "learning_rate": 
0.00010931652611451373, + "loss": 2.6174, + "step": 9459 + }, + { + "epoch": 0.7634573480752159, + "grad_norm": 0.6383369565010071, + "learning_rate": 0.00010930080776333303, + "loss": 2.557, + "step": 9460 + }, + { + "epoch": 0.7635380518117989, + "grad_norm": 0.6747864484786987, + "learning_rate": 0.0001092850891803519, + "loss": 2.5406, + "step": 9461 + }, + { + "epoch": 0.7636187555483819, + "grad_norm": 0.7312811613082886, + "learning_rate": 0.00010926937036596205, + "loss": 2.5903, + "step": 9462 + }, + { + "epoch": 0.7636994592849649, + "grad_norm": 0.645847737789154, + "learning_rate": 0.00010925365132055529, + "loss": 2.5254, + "step": 9463 + }, + { + "epoch": 0.7637801630215479, + "grad_norm": 0.6466063857078552, + "learning_rate": 0.00010923793204452335, + "loss": 2.5322, + "step": 9464 + }, + { + "epoch": 0.7638608667581309, + "grad_norm": 0.6450574994087219, + "learning_rate": 0.000109222212538258, + "loss": 2.522, + "step": 9465 + }, + { + "epoch": 0.7639415704947139, + "grad_norm": 0.6491848826408386, + "learning_rate": 0.00010920649280215096, + "loss": 2.5545, + "step": 9466 + }, + { + "epoch": 0.7640222742312969, + "grad_norm": 0.6888336539268494, + "learning_rate": 0.0001091907728365941, + "loss": 2.5217, + "step": 9467 + }, + { + "epoch": 0.76410297796788, + "grad_norm": 0.702557384967804, + "learning_rate": 0.00010917505264197914, + "loss": 2.5351, + "step": 9468 + }, + { + "epoch": 0.7641836817044629, + "grad_norm": 0.6552408933639526, + "learning_rate": 0.0001091593322186979, + "loss": 2.5115, + "step": 9469 + }, + { + "epoch": 0.7642643854410459, + "grad_norm": 0.7514002919197083, + "learning_rate": 0.00010914361156714212, + "loss": 2.5196, + "step": 9470 + }, + { + "epoch": 0.7643450891776289, + "grad_norm": 0.6692500710487366, + "learning_rate": 0.00010912789068770366, + "loss": 2.5639, + "step": 9471 + }, + { + "epoch": 0.764425792914212, + "grad_norm": 0.6567397117614746, + "learning_rate": 0.0001091121695807743, + "loss": 2.5027, + 
"step": 9472 + }, + { + "epoch": 0.764506496650795, + "grad_norm": 0.6876057982444763, + "learning_rate": 0.00010909644824674587, + "loss": 2.519, + "step": 9473 + }, + { + "epoch": 0.7645872003873779, + "grad_norm": 0.747949481010437, + "learning_rate": 0.00010908072668601017, + "loss": 2.5604, + "step": 9474 + }, + { + "epoch": 0.7646679041239609, + "grad_norm": 0.6371368169784546, + "learning_rate": 0.000109065004898959, + "loss": 2.5853, + "step": 9475 + }, + { + "epoch": 0.764748607860544, + "grad_norm": 0.6472185254096985, + "learning_rate": 0.00010904928288598422, + "loss": 2.5662, + "step": 9476 + }, + { + "epoch": 0.764829311597127, + "grad_norm": 0.7009313702583313, + "learning_rate": 0.00010903356064747765, + "loss": 2.5244, + "step": 9477 + }, + { + "epoch": 0.76491001533371, + "grad_norm": 0.7405661940574646, + "learning_rate": 0.00010901783818383116, + "loss": 2.4963, + "step": 9478 + }, + { + "epoch": 0.7649907190702929, + "grad_norm": 0.7693421840667725, + "learning_rate": 0.00010900211549543658, + "loss": 2.6018, + "step": 9479 + }, + { + "epoch": 0.765071422806876, + "grad_norm": 0.6965410709381104, + "learning_rate": 0.00010898639258268571, + "loss": 2.627, + "step": 9480 + }, + { + "epoch": 0.765152126543459, + "grad_norm": 0.7167130708694458, + "learning_rate": 0.00010897066944597046, + "loss": 2.5298, + "step": 9481 + }, + { + "epoch": 0.765232830280042, + "grad_norm": 0.7159689664840698, + "learning_rate": 0.00010895494608568268, + "loss": 2.5179, + "step": 9482 + }, + { + "epoch": 0.7653135340166249, + "grad_norm": 0.7329332232475281, + "learning_rate": 0.00010893922250221423, + "loss": 2.6498, + "step": 9483 + }, + { + "epoch": 0.765394237753208, + "grad_norm": 0.6912567019462585, + "learning_rate": 0.000108923498695957, + "loss": 2.5679, + "step": 9484 + }, + { + "epoch": 0.765474941489791, + "grad_norm": 0.7030324935913086, + "learning_rate": 0.00010890777466730285, + "loss": 2.5678, + "step": 9485 + }, + { + "epoch": 0.765555645226374, + 
"grad_norm": 0.7238864898681641, + "learning_rate": 0.00010889205041664365, + "loss": 2.5525, + "step": 9486 + }, + { + "epoch": 0.765636348962957, + "grad_norm": 0.6623672842979431, + "learning_rate": 0.00010887632594437134, + "loss": 2.4857, + "step": 9487 + }, + { + "epoch": 0.7657170526995399, + "grad_norm": 0.726645827293396, + "learning_rate": 0.00010886060125087776, + "loss": 2.5405, + "step": 9488 + }, + { + "epoch": 0.765797756436123, + "grad_norm": 0.6624459624290466, + "learning_rate": 0.00010884487633655487, + "loss": 2.5538, + "step": 9489 + }, + { + "epoch": 0.765878460172706, + "grad_norm": 0.7198002934455872, + "learning_rate": 0.00010882915120179453, + "loss": 2.5808, + "step": 9490 + }, + { + "epoch": 0.765959163909289, + "grad_norm": 0.7545582056045532, + "learning_rate": 0.00010881342584698862, + "loss": 2.6059, + "step": 9491 + }, + { + "epoch": 0.766039867645872, + "grad_norm": 0.6748257279396057, + "learning_rate": 0.00010879770027252915, + "loss": 2.5203, + "step": 9492 + }, + { + "epoch": 0.766120571382455, + "grad_norm": 0.7376208901405334, + "learning_rate": 0.00010878197447880796, + "loss": 2.5255, + "step": 9493 + }, + { + "epoch": 0.766201275119038, + "grad_norm": 0.7589401006698608, + "learning_rate": 0.00010876624846621704, + "loss": 2.6304, + "step": 9494 + }, + { + "epoch": 0.766281978855621, + "grad_norm": 0.6963146924972534, + "learning_rate": 0.00010875052223514827, + "loss": 2.5547, + "step": 9495 + }, + { + "epoch": 0.766362682592204, + "grad_norm": 0.6660788059234619, + "learning_rate": 0.00010873479578599361, + "loss": 2.5922, + "step": 9496 + }, + { + "epoch": 0.7664433863287871, + "grad_norm": 0.7506482005119324, + "learning_rate": 0.00010871906911914502, + "loss": 2.5383, + "step": 9497 + }, + { + "epoch": 0.76652409006537, + "grad_norm": 0.7514285445213318, + "learning_rate": 0.00010870334223499443, + "loss": 2.5551, + "step": 9498 + }, + { + "epoch": 0.766604793801953, + "grad_norm": 0.6461809873580933, + 
"learning_rate": 0.00010868761513393379, + "loss": 2.5367, + "step": 9499 + }, + { + "epoch": 0.766685497538536, + "grad_norm": 0.6328238844871521, + "learning_rate": 0.00010867188781635512, + "loss": 2.5505, + "step": 9500 + }, + { + "epoch": 0.7667662012751191, + "grad_norm": 0.7090224027633667, + "learning_rate": 0.00010865616028265027, + "loss": 2.5921, + "step": 9501 + }, + { + "epoch": 0.766846905011702, + "grad_norm": 0.6404605507850647, + "learning_rate": 0.0001086404325332113, + "loss": 2.5357, + "step": 9502 + }, + { + "epoch": 0.766927608748285, + "grad_norm": 0.652477502822876, + "learning_rate": 0.00010862470456843016, + "loss": 2.5277, + "step": 9503 + }, + { + "epoch": 0.767008312484868, + "grad_norm": 0.7045448422431946, + "learning_rate": 0.00010860897638869887, + "loss": 2.5712, + "step": 9504 + }, + { + "epoch": 0.7670890162214511, + "grad_norm": 0.7024295926094055, + "learning_rate": 0.00010859324799440936, + "loss": 2.5976, + "step": 9505 + }, + { + "epoch": 0.7671697199580341, + "grad_norm": 0.7165585160255432, + "learning_rate": 0.00010857751938595364, + "loss": 2.5378, + "step": 9506 + }, + { + "epoch": 0.767250423694617, + "grad_norm": 0.7037522196769714, + "learning_rate": 0.0001085617905637237, + "loss": 2.554, + "step": 9507 + }, + { + "epoch": 0.7673311274312, + "grad_norm": 0.738210916519165, + "learning_rate": 0.00010854606152811163, + "loss": 2.5102, + "step": 9508 + }, + { + "epoch": 0.7674118311677831, + "grad_norm": 0.7500020861625671, + "learning_rate": 0.0001085303322795093, + "loss": 2.5908, + "step": 9509 + }, + { + "epoch": 0.7674925349043661, + "grad_norm": 0.7669610977172852, + "learning_rate": 0.00010851460281830883, + "loss": 2.5119, + "step": 9510 + }, + { + "epoch": 0.7675732386409491, + "grad_norm": 0.6619212031364441, + "learning_rate": 0.00010849887314490217, + "loss": 2.5622, + "step": 9511 + }, + { + "epoch": 0.767653942377532, + "grad_norm": 0.7142546772956848, + "learning_rate": 0.00010848314325968136, + "loss": 
2.596, + "step": 9512 + }, + { + "epoch": 0.7677346461141151, + "grad_norm": 0.7365403175354004, + "learning_rate": 0.0001084674131630385, + "loss": 2.5695, + "step": 9513 + }, + { + "epoch": 0.7678153498506981, + "grad_norm": 0.7843711972236633, + "learning_rate": 0.00010845168285536555, + "loss": 2.5707, + "step": 9514 + }, + { + "epoch": 0.7678960535872811, + "grad_norm": 0.6391385197639465, + "learning_rate": 0.00010843595233705454, + "loss": 2.5523, + "step": 9515 + }, + { + "epoch": 0.7679767573238641, + "grad_norm": 0.6955631971359253, + "learning_rate": 0.00010842022160849758, + "loss": 2.5072, + "step": 9516 + }, + { + "epoch": 0.7680574610604471, + "grad_norm": 0.7291388511657715, + "learning_rate": 0.00010840449067008665, + "loss": 2.5786, + "step": 9517 + }, + { + "epoch": 0.7681381647970301, + "grad_norm": 0.7988889813423157, + "learning_rate": 0.00010838875952221387, + "loss": 2.5622, + "step": 9518 + }, + { + "epoch": 0.7682188685336131, + "grad_norm": 0.726271390914917, + "learning_rate": 0.00010837302816527129, + "loss": 2.5479, + "step": 9519 + }, + { + "epoch": 0.7682995722701961, + "grad_norm": 0.7305205464363098, + "learning_rate": 0.00010835729659965095, + "loss": 2.5946, + "step": 9520 + }, + { + "epoch": 0.7683802760067792, + "grad_norm": 0.7843366265296936, + "learning_rate": 0.00010834156482574493, + "loss": 2.5212, + "step": 9521 + }, + { + "epoch": 0.7684609797433621, + "grad_norm": 0.6988845467567444, + "learning_rate": 0.00010832583284394529, + "loss": 2.5174, + "step": 9522 + }, + { + "epoch": 0.7685416834799451, + "grad_norm": 0.7088077068328857, + "learning_rate": 0.00010831010065464414, + "loss": 2.5253, + "step": 9523 + }, + { + "epoch": 0.7686223872165281, + "grad_norm": 0.7447031140327454, + "learning_rate": 0.00010829436825823358, + "loss": 2.6045, + "step": 9524 + }, + { + "epoch": 0.7687030909531112, + "grad_norm": 0.6865237951278687, + "learning_rate": 0.00010827863565510566, + "loss": 2.558, + "step": 9525 + }, + { + 
"epoch": 0.7687837946896942, + "grad_norm": 0.7748900651931763, + "learning_rate": 0.0001082629028456525, + "loss": 2.5694, + "step": 9526 + }, + { + "epoch": 0.7688644984262771, + "grad_norm": 0.7031759023666382, + "learning_rate": 0.00010824716983026622, + "loss": 2.5171, + "step": 9527 + }, + { + "epoch": 0.7689452021628601, + "grad_norm": 0.7627702355384827, + "learning_rate": 0.00010823143660933888, + "loss": 2.5715, + "step": 9528 + }, + { + "epoch": 0.7690259058994432, + "grad_norm": 0.707815945148468, + "learning_rate": 0.00010821570318326264, + "loss": 2.5281, + "step": 9529 + }, + { + "epoch": 0.7691066096360262, + "grad_norm": 0.6833841800689697, + "learning_rate": 0.00010819996955242962, + "loss": 2.5702, + "step": 9530 + }, + { + "epoch": 0.7691873133726091, + "grad_norm": 0.7029415369033813, + "learning_rate": 0.00010818423571723189, + "loss": 2.5331, + "step": 9531 + }, + { + "epoch": 0.7692680171091921, + "grad_norm": 0.6442921161651611, + "learning_rate": 0.00010816850167806161, + "loss": 2.5423, + "step": 9532 + }, + { + "epoch": 0.7693487208457752, + "grad_norm": 0.7259004712104797, + "learning_rate": 0.00010815276743531093, + "loss": 2.6014, + "step": 9533 + }, + { + "epoch": 0.7694294245823582, + "grad_norm": 0.6483473777770996, + "learning_rate": 0.00010813703298937199, + "loss": 2.5268, + "step": 9534 + }, + { + "epoch": 0.7695101283189412, + "grad_norm": 0.6805520057678223, + "learning_rate": 0.00010812129834063691, + "loss": 2.5536, + "step": 9535 + }, + { + "epoch": 0.7695908320555241, + "grad_norm": 0.7120587825775146, + "learning_rate": 0.00010810556348949783, + "loss": 2.518, + "step": 9536 + }, + { + "epoch": 0.7696715357921071, + "grad_norm": 0.7280872464179993, + "learning_rate": 0.00010808982843634692, + "loss": 2.5525, + "step": 9537 + }, + { + "epoch": 0.7697522395286902, + "grad_norm": 0.68332439661026, + "learning_rate": 0.00010807409318157636, + "loss": 2.6318, + "step": 9538 + }, + { + "epoch": 0.7698329432652732, + 
"grad_norm": 0.655352771282196, + "learning_rate": 0.00010805835772557826, + "loss": 2.5781, + "step": 9539 + }, + { + "epoch": 0.7699136470018562, + "grad_norm": 0.7675400972366333, + "learning_rate": 0.00010804262206874484, + "loss": 2.5542, + "step": 9540 + }, + { + "epoch": 0.7699943507384391, + "grad_norm": 0.6676837205886841, + "learning_rate": 0.00010802688621146826, + "loss": 2.5411, + "step": 9541 + }, + { + "epoch": 0.7700750544750222, + "grad_norm": 0.7378436326980591, + "learning_rate": 0.00010801115015414067, + "loss": 2.5416, + "step": 9542 + }, + { + "epoch": 0.7701557582116052, + "grad_norm": 0.7330371141433716, + "learning_rate": 0.0001079954138971543, + "loss": 2.5154, + "step": 9543 + }, + { + "epoch": 0.7702364619481882, + "grad_norm": 0.6792974472045898, + "learning_rate": 0.00010797967744090131, + "loss": 2.5328, + "step": 9544 + }, + { + "epoch": 0.7703171656847712, + "grad_norm": 0.7129618525505066, + "learning_rate": 0.00010796394078577392, + "loss": 2.5688, + "step": 9545 + }, + { + "epoch": 0.7703978694213542, + "grad_norm": 0.6900608539581299, + "learning_rate": 0.00010794820393216429, + "loss": 2.5659, + "step": 9546 + }, + { + "epoch": 0.7704785731579372, + "grad_norm": 0.6798564195632935, + "learning_rate": 0.00010793246688046464, + "loss": 2.5746, + "step": 9547 + }, + { + "epoch": 0.7705592768945202, + "grad_norm": 0.7132395505905151, + "learning_rate": 0.00010791672963106715, + "loss": 2.6277, + "step": 9548 + }, + { + "epoch": 0.7706399806311032, + "grad_norm": 0.6762476563453674, + "learning_rate": 0.0001079009921843641, + "loss": 2.5265, + "step": 9549 + }, + { + "epoch": 0.7707206843676863, + "grad_norm": 0.7223351001739502, + "learning_rate": 0.00010788525454074765, + "loss": 2.6255, + "step": 9550 + }, + { + "epoch": 0.7708013881042692, + "grad_norm": 0.7383624315261841, + "learning_rate": 0.00010786951670061008, + "loss": 2.5744, + "step": 9551 + }, + { + "epoch": 0.7708820918408522, + "grad_norm": 0.6677328944206238, + 
"learning_rate": 0.00010785377866434355, + "loss": 2.5594, + "step": 9552 + }, + { + "epoch": 0.7709627955774352, + "grad_norm": 0.6572195887565613, + "learning_rate": 0.00010783804043234032, + "loss": 2.5582, + "step": 9553 + }, + { + "epoch": 0.7710434993140183, + "grad_norm": 0.6837800741195679, + "learning_rate": 0.00010782230200499265, + "loss": 2.5311, + "step": 9554 + }, + { + "epoch": 0.7711242030506013, + "grad_norm": 0.7232153415679932, + "learning_rate": 0.00010780656338269277, + "loss": 2.5074, + "step": 9555 + }, + { + "epoch": 0.7712049067871842, + "grad_norm": 0.6722296476364136, + "learning_rate": 0.00010779082456583291, + "loss": 2.551, + "step": 9556 + }, + { + "epoch": 0.7712856105237672, + "grad_norm": 0.6461100578308105, + "learning_rate": 0.00010777508555480535, + "loss": 2.5723, + "step": 9557 + }, + { + "epoch": 0.7713663142603503, + "grad_norm": 0.6573290824890137, + "learning_rate": 0.0001077593463500023, + "loss": 2.4967, + "step": 9558 + }, + { + "epoch": 0.7714470179969333, + "grad_norm": 0.7184738516807556, + "learning_rate": 0.0001077436069518161, + "loss": 2.6703, + "step": 9559 + }, + { + "epoch": 0.7715277217335162, + "grad_norm": 0.7226557731628418, + "learning_rate": 0.00010772786736063895, + "loss": 2.6118, + "step": 9560 + }, + { + "epoch": 0.7716084254700992, + "grad_norm": 0.6800956130027771, + "learning_rate": 0.00010771212757686318, + "loss": 2.578, + "step": 9561 + }, + { + "epoch": 0.7716891292066823, + "grad_norm": 0.6657535433769226, + "learning_rate": 0.00010769638760088099, + "loss": 2.5291, + "step": 9562 + }, + { + "epoch": 0.7717698329432653, + "grad_norm": 0.620527982711792, + "learning_rate": 0.00010768064743308471, + "loss": 2.5518, + "step": 9563 + }, + { + "epoch": 0.7718505366798483, + "grad_norm": 0.693760097026825, + "learning_rate": 0.00010766490707386663, + "loss": 2.52, + "step": 9564 + }, + { + "epoch": 0.7719312404164312, + "grad_norm": 0.6674148440361023, + "learning_rate": 0.000107649166523619, + 
"loss": 2.5197, + "step": 9565 + }, + { + "epoch": 0.7720119441530143, + "grad_norm": 0.6844033598899841, + "learning_rate": 0.00010763342578273419, + "loss": 2.5842, + "step": 9566 + }, + { + "epoch": 0.7720926478895973, + "grad_norm": 0.6891880035400391, + "learning_rate": 0.00010761768485160442, + "loss": 2.5349, + "step": 9567 + }, + { + "epoch": 0.7721733516261803, + "grad_norm": 0.7157394289970398, + "learning_rate": 0.00010760194373062204, + "loss": 2.5762, + "step": 9568 + }, + { + "epoch": 0.7722540553627633, + "grad_norm": 0.7522526383399963, + "learning_rate": 0.00010758620242017936, + "loss": 2.5348, + "step": 9569 + }, + { + "epoch": 0.7723347590993463, + "grad_norm": 0.6817746162414551, + "learning_rate": 0.00010757046092066869, + "loss": 2.5836, + "step": 9570 + }, + { + "epoch": 0.7724154628359293, + "grad_norm": 0.7274518013000488, + "learning_rate": 0.00010755471923248232, + "loss": 2.5276, + "step": 9571 + }, + { + "epoch": 0.7724961665725123, + "grad_norm": 0.6735557913780212, + "learning_rate": 0.00010753897735601264, + "loss": 2.6116, + "step": 9572 + }, + { + "epoch": 0.7725768703090953, + "grad_norm": 0.6626406908035278, + "learning_rate": 0.00010752323529165186, + "loss": 2.5778, + "step": 9573 + }, + { + "epoch": 0.7726575740456784, + "grad_norm": 0.6627367734909058, + "learning_rate": 0.00010750749303979246, + "loss": 2.5839, + "step": 9574 + }, + { + "epoch": 0.7727382777822613, + "grad_norm": 0.6658251881599426, + "learning_rate": 0.0001074917506008267, + "loss": 2.5233, + "step": 9575 + }, + { + "epoch": 0.7728189815188443, + "grad_norm": 0.6969848871231079, + "learning_rate": 0.00010747600797514692, + "loss": 2.5169, + "step": 9576 + }, + { + "epoch": 0.7728996852554273, + "grad_norm": 0.7313554883003235, + "learning_rate": 0.00010746026516314549, + "loss": 2.5528, + "step": 9577 + }, + { + "epoch": 0.7729803889920104, + "grad_norm": 0.6467077136039734, + "learning_rate": 0.00010744452216521472, + "loss": 2.5158, + "step": 9578 + }, + 
{ + "epoch": 0.7730610927285934, + "grad_norm": 0.6808056235313416, + "learning_rate": 0.00010742877898174702, + "loss": 2.5346, + "step": 9579 + }, + { + "epoch": 0.7731417964651763, + "grad_norm": 0.7537400722503662, + "learning_rate": 0.00010741303561313474, + "loss": 2.5621, + "step": 9580 + }, + { + "epoch": 0.7732225002017593, + "grad_norm": 0.6715610027313232, + "learning_rate": 0.00010739729205977021, + "loss": 2.5384, + "step": 9581 + }, + { + "epoch": 0.7733032039383424, + "grad_norm": 0.7129234075546265, + "learning_rate": 0.00010738154832204586, + "loss": 2.5639, + "step": 9582 + }, + { + "epoch": 0.7733839076749254, + "grad_norm": 0.7156025171279907, + "learning_rate": 0.00010736580440035397, + "loss": 2.5427, + "step": 9583 + }, + { + "epoch": 0.7734646114115084, + "grad_norm": 0.7394191026687622, + "learning_rate": 0.00010735006029508703, + "loss": 2.5809, + "step": 9584 + }, + { + "epoch": 0.7735453151480913, + "grad_norm": 0.7117684483528137, + "learning_rate": 0.00010733431600663737, + "loss": 2.5807, + "step": 9585 + }, + { + "epoch": 0.7736260188846744, + "grad_norm": 0.6622862219810486, + "learning_rate": 0.00010731857153539737, + "loss": 2.5277, + "step": 9586 + }, + { + "epoch": 0.7737067226212574, + "grad_norm": 0.7744547128677368, + "learning_rate": 0.00010730282688175943, + "loss": 2.6119, + "step": 9587 + }, + { + "epoch": 0.7737874263578404, + "grad_norm": 0.6804926991462708, + "learning_rate": 0.00010728708204611597, + "loss": 2.534, + "step": 9588 + }, + { + "epoch": 0.7738681300944233, + "grad_norm": 0.7115367650985718, + "learning_rate": 0.00010727133702885937, + "loss": 2.542, + "step": 9589 + }, + { + "epoch": 0.7739488338310063, + "grad_norm": 0.7623847723007202, + "learning_rate": 0.00010725559183038205, + "loss": 2.587, + "step": 9590 + }, + { + "epoch": 0.7740295375675894, + "grad_norm": 0.6612982153892517, + "learning_rate": 0.00010723984645107641, + "loss": 2.5257, + "step": 9591 + }, + { + "epoch": 0.7741102413041724, + 
"grad_norm": 0.7553900480270386, + "learning_rate": 0.00010722410089133488, + "loss": 2.6311, + "step": 9592 + }, + { + "epoch": 0.7741909450407554, + "grad_norm": 0.7541414499282837, + "learning_rate": 0.00010720835515154983, + "loss": 2.5978, + "step": 9593 + }, + { + "epoch": 0.7742716487773383, + "grad_norm": 0.6690947413444519, + "learning_rate": 0.00010719260923211376, + "loss": 2.568, + "step": 9594 + }, + { + "epoch": 0.7743523525139214, + "grad_norm": 0.7282151579856873, + "learning_rate": 0.00010717686313341909, + "loss": 2.5375, + "step": 9595 + }, + { + "epoch": 0.7744330562505044, + "grad_norm": 0.6862902045249939, + "learning_rate": 0.00010716111685585821, + "loss": 2.5503, + "step": 9596 + }, + { + "epoch": 0.7745137599870874, + "grad_norm": 0.7076265811920166, + "learning_rate": 0.00010714537039982357, + "loss": 2.4766, + "step": 9597 + }, + { + "epoch": 0.7745944637236704, + "grad_norm": 0.7063891887664795, + "learning_rate": 0.00010712962376570761, + "loss": 2.5822, + "step": 9598 + }, + { + "epoch": 0.7746751674602534, + "grad_norm": 0.6975609064102173, + "learning_rate": 0.00010711387695390282, + "loss": 2.597, + "step": 9599 + }, + { + "epoch": 0.7747558711968364, + "grad_norm": 0.6790002584457397, + "learning_rate": 0.0001070981299648016, + "loss": 2.5705, + "step": 9600 + }, + { + "epoch": 0.7748365749334194, + "grad_norm": 0.6493679881095886, + "learning_rate": 0.00010708238279879643, + "loss": 2.49, + "step": 9601 + }, + { + "epoch": 0.7749172786700024, + "grad_norm": 0.6741142868995667, + "learning_rate": 0.00010706663545627977, + "loss": 2.6008, + "step": 9602 + }, + { + "epoch": 0.7749979824065855, + "grad_norm": 0.6753309965133667, + "learning_rate": 0.00010705088793764408, + "loss": 2.536, + "step": 9603 + }, + { + "epoch": 0.7750786861431684, + "grad_norm": 0.6879377365112305, + "learning_rate": 0.00010703514024328183, + "loss": 2.5884, + "step": 9604 + }, + { + "epoch": 0.7751593898797514, + "grad_norm": 0.6535949110984802, + 
"learning_rate": 0.00010701939237358549, + "loss": 2.5489, + "step": 9605 + }, + { + "epoch": 0.7752400936163344, + "grad_norm": 0.7308230400085449, + "learning_rate": 0.00010700364432894756, + "loss": 2.5679, + "step": 9606 + }, + { + "epoch": 0.7753207973529175, + "grad_norm": 0.7016584277153015, + "learning_rate": 0.00010698789610976052, + "loss": 2.5678, + "step": 9607 + }, + { + "epoch": 0.7754015010895005, + "grad_norm": 0.7181541323661804, + "learning_rate": 0.00010697214771641682, + "loss": 2.5004, + "step": 9608 + }, + { + "epoch": 0.7754822048260834, + "grad_norm": 0.6414844989776611, + "learning_rate": 0.00010695639914930895, + "loss": 2.4896, + "step": 9609 + }, + { + "epoch": 0.7755629085626664, + "grad_norm": 0.7288017868995667, + "learning_rate": 0.00010694065040882943, + "loss": 2.5945, + "step": 9610 + }, + { + "epoch": 0.7756436122992495, + "grad_norm": 0.6808066368103027, + "learning_rate": 0.00010692490149537079, + "loss": 2.5973, + "step": 9611 + }, + { + "epoch": 0.7757243160358325, + "grad_norm": 0.7924454212188721, + "learning_rate": 0.00010690915240932553, + "loss": 2.5448, + "step": 9612 + }, + { + "epoch": 0.7758050197724154, + "grad_norm": 0.6466094851493835, + "learning_rate": 0.00010689340315108606, + "loss": 2.5065, + "step": 9613 + }, + { + "epoch": 0.7758857235089984, + "grad_norm": 0.6775460243225098, + "learning_rate": 0.00010687765372104502, + "loss": 2.5238, + "step": 9614 + }, + { + "epoch": 0.7759664272455815, + "grad_norm": 0.6901230812072754, + "learning_rate": 0.00010686190411959484, + "loss": 2.5109, + "step": 9615 + }, + { + "epoch": 0.7760471309821645, + "grad_norm": 0.7032039165496826, + "learning_rate": 0.00010684615434712808, + "loss": 2.6094, + "step": 9616 + }, + { + "epoch": 0.7761278347187475, + "grad_norm": 0.7008969187736511, + "learning_rate": 0.00010683040440403727, + "loss": 2.5758, + "step": 9617 + }, + { + "epoch": 0.7762085384553304, + "grad_norm": 0.6909677386283875, + "learning_rate": 
0.00010681465429071491, + "loss": 2.5373, + "step": 9618 + }, + { + "epoch": 0.7762892421919135, + "grad_norm": 0.699030339717865, + "learning_rate": 0.00010679890400755355, + "loss": 2.577, + "step": 9619 + }, + { + "epoch": 0.7763699459284965, + "grad_norm": 0.7012344598770142, + "learning_rate": 0.00010678315355494575, + "loss": 2.5205, + "step": 9620 + }, + { + "epoch": 0.7764506496650795, + "grad_norm": 0.7693915367126465, + "learning_rate": 0.000106767402933284, + "loss": 2.5947, + "step": 9621 + }, + { + "epoch": 0.7765313534016625, + "grad_norm": 0.7635772228240967, + "learning_rate": 0.00010675165214296093, + "loss": 2.6221, + "step": 9622 + }, + { + "epoch": 0.7766120571382455, + "grad_norm": 0.701411783695221, + "learning_rate": 0.000106735901184369, + "loss": 2.5236, + "step": 9623 + }, + { + "epoch": 0.7766927608748285, + "grad_norm": 0.7283998727798462, + "learning_rate": 0.00010672015005790079, + "loss": 2.5581, + "step": 9624 + }, + { + "epoch": 0.7767734646114115, + "grad_norm": 0.7069897055625916, + "learning_rate": 0.0001067043987639489, + "loss": 2.5541, + "step": 9625 + }, + { + "epoch": 0.7768541683479945, + "grad_norm": 0.7419753074645996, + "learning_rate": 0.00010668864730290586, + "loss": 2.5992, + "step": 9626 + }, + { + "epoch": 0.7769348720845776, + "grad_norm": 0.6651501059532166, + "learning_rate": 0.00010667289567516426, + "loss": 2.546, + "step": 9627 + }, + { + "epoch": 0.7770155758211605, + "grad_norm": 0.7265670895576477, + "learning_rate": 0.00010665714388111665, + "loss": 2.611, + "step": 9628 + }, + { + "epoch": 0.7770962795577435, + "grad_norm": 0.6520028114318848, + "learning_rate": 0.00010664139192115559, + "loss": 2.5433, + "step": 9629 + }, + { + "epoch": 0.7771769832943265, + "grad_norm": 0.6990057826042175, + "learning_rate": 0.0001066256397956737, + "loss": 2.5325, + "step": 9630 + }, + { + "epoch": 0.7772576870309096, + "grad_norm": 0.7353312373161316, + "learning_rate": 0.00010660988750506355, + "loss": 2.4707, + 
"step": 9631 + }, + { + "epoch": 0.7773383907674926, + "grad_norm": 0.6810272932052612, + "learning_rate": 0.00010659413504971774, + "loss": 2.5618, + "step": 9632 + }, + { + "epoch": 0.7774190945040755, + "grad_norm": 0.6480081081390381, + "learning_rate": 0.00010657838243002883, + "loss": 2.4543, + "step": 9633 + }, + { + "epoch": 0.7774997982406585, + "grad_norm": 0.6617380976676941, + "learning_rate": 0.00010656262964638942, + "loss": 2.5628, + "step": 9634 + }, + { + "epoch": 0.7775805019772416, + "grad_norm": 0.6761382222175598, + "learning_rate": 0.00010654687669919212, + "loss": 2.5433, + "step": 9635 + }, + { + "epoch": 0.7776612057138246, + "grad_norm": 0.6733867526054382, + "learning_rate": 0.00010653112358882957, + "loss": 2.5282, + "step": 9636 + }, + { + "epoch": 0.7777419094504076, + "grad_norm": 0.6854631304740906, + "learning_rate": 0.00010651537031569433, + "loss": 2.5997, + "step": 9637 + }, + { + "epoch": 0.7778226131869905, + "grad_norm": 0.7451226115226746, + "learning_rate": 0.00010649961688017904, + "loss": 2.5058, + "step": 9638 + }, + { + "epoch": 0.7779033169235735, + "grad_norm": 0.6744229197502136, + "learning_rate": 0.0001064838632826763, + "loss": 2.5962, + "step": 9639 + }, + { + "epoch": 0.7779840206601566, + "grad_norm": 0.7568119764328003, + "learning_rate": 0.00010646810952357873, + "loss": 2.5896, + "step": 9640 + }, + { + "epoch": 0.7780647243967396, + "grad_norm": 0.6860085725784302, + "learning_rate": 0.00010645235560327899, + "loss": 2.5675, + "step": 9641 + }, + { + "epoch": 0.7781454281333225, + "grad_norm": 0.6491742134094238, + "learning_rate": 0.00010643660152216965, + "loss": 2.5374, + "step": 9642 + }, + { + "epoch": 0.7782261318699055, + "grad_norm": 0.6664023399353027, + "learning_rate": 0.0001064208472806434, + "loss": 2.4679, + "step": 9643 + }, + { + "epoch": 0.7783068356064886, + "grad_norm": 0.6595140099525452, + "learning_rate": 0.00010640509287909284, + "loss": 2.5045, + "step": 9644 + }, + { + "epoch": 
0.7783875393430716, + "grad_norm": 0.6788576245307922, + "learning_rate": 0.0001063893383179106, + "loss": 2.5706, + "step": 9645 + }, + { + "epoch": 0.7784682430796546, + "grad_norm": 0.6741334199905396, + "learning_rate": 0.00010637358359748939, + "loss": 2.5763, + "step": 9646 + }, + { + "epoch": 0.7785489468162375, + "grad_norm": 0.6837517023086548, + "learning_rate": 0.0001063578287182218, + "loss": 2.5484, + "step": 9647 + }, + { + "epoch": 0.7786296505528206, + "grad_norm": 0.6604229211807251, + "learning_rate": 0.00010634207368050048, + "loss": 2.5465, + "step": 9648 + }, + { + "epoch": 0.7787103542894036, + "grad_norm": 0.6528951525688171, + "learning_rate": 0.00010632631848471813, + "loss": 2.5409, + "step": 9649 + }, + { + "epoch": 0.7787910580259866, + "grad_norm": 0.6615377068519592, + "learning_rate": 0.00010631056313126734, + "loss": 2.5545, + "step": 9650 + }, + { + "epoch": 0.7788717617625696, + "grad_norm": 0.666033923625946, + "learning_rate": 0.00010629480762054089, + "loss": 2.5341, + "step": 9651 + }, + { + "epoch": 0.7789524654991526, + "grad_norm": 0.7022622227668762, + "learning_rate": 0.00010627905195293135, + "loss": 2.5206, + "step": 9652 + }, + { + "epoch": 0.7790331692357356, + "grad_norm": 0.7175850868225098, + "learning_rate": 0.00010626329612883141, + "loss": 2.5912, + "step": 9653 + }, + { + "epoch": 0.7791138729723186, + "grad_norm": 0.6592069268226624, + "learning_rate": 0.00010624754014863379, + "loss": 2.5076, + "step": 9654 + }, + { + "epoch": 0.7791945767089016, + "grad_norm": 0.645893931388855, + "learning_rate": 0.0001062317840127311, + "loss": 2.5124, + "step": 9655 + }, + { + "epoch": 0.7792752804454847, + "grad_norm": 0.6638232469558716, + "learning_rate": 0.00010621602772151607, + "loss": 2.5182, + "step": 9656 + }, + { + "epoch": 0.7793559841820676, + "grad_norm": 0.6718387603759766, + "learning_rate": 0.0001062002712753814, + "loss": 2.4773, + "step": 9657 + }, + { + "epoch": 0.7794366879186506, + "grad_norm": 
0.6402876377105713, + "learning_rate": 0.00010618451467471972, + "loss": 2.5557, + "step": 9658 + }, + { + "epoch": 0.7795173916552336, + "grad_norm": 0.6898398399353027, + "learning_rate": 0.00010616875791992382, + "loss": 2.5557, + "step": 9659 + }, + { + "epoch": 0.7795980953918167, + "grad_norm": 0.6718475222587585, + "learning_rate": 0.00010615300101138633, + "loss": 2.5335, + "step": 9660 + }, + { + "epoch": 0.7796787991283997, + "grad_norm": 0.6436911225318909, + "learning_rate": 0.00010613724394949995, + "loss": 2.5214, + "step": 9661 + }, + { + "epoch": 0.7797595028649826, + "grad_norm": 0.7554156184196472, + "learning_rate": 0.00010612148673465743, + "loss": 2.5526, + "step": 9662 + }, + { + "epoch": 0.7798402066015656, + "grad_norm": 0.6728504300117493, + "learning_rate": 0.00010610572936725147, + "loss": 2.5935, + "step": 9663 + }, + { + "epoch": 0.7799209103381487, + "grad_norm": 0.6793323159217834, + "learning_rate": 0.00010608997184767476, + "loss": 2.5515, + "step": 9664 + }, + { + "epoch": 0.7800016140747317, + "grad_norm": 0.7242898941040039, + "learning_rate": 0.00010607421417631999, + "loss": 2.5332, + "step": 9665 + }, + { + "epoch": 0.7800823178113147, + "grad_norm": 0.6719244718551636, + "learning_rate": 0.00010605845635357996, + "loss": 2.5191, + "step": 9666 + }, + { + "epoch": 0.7801630215478976, + "grad_norm": 0.6836631894111633, + "learning_rate": 0.00010604269837984737, + "loss": 2.6489, + "step": 9667 + }, + { + "epoch": 0.7802437252844807, + "grad_norm": 0.6833824515342712, + "learning_rate": 0.00010602694025551496, + "loss": 2.4906, + "step": 9668 + }, + { + "epoch": 0.7803244290210637, + "grad_norm": 0.7449159026145935, + "learning_rate": 0.0001060111819809754, + "loss": 2.5301, + "step": 9669 + }, + { + "epoch": 0.7804051327576467, + "grad_norm": 0.7149158120155334, + "learning_rate": 0.00010599542355662149, + "loss": 2.5097, + "step": 9670 + }, + { + "epoch": 0.7804858364942296, + "grad_norm": 0.6616973876953125, + 
"learning_rate": 0.00010597966498284595, + "loss": 2.5928, + "step": 9671 + }, + { + "epoch": 0.7805665402308127, + "grad_norm": 0.6556531190872192, + "learning_rate": 0.00010596390626004154, + "loss": 2.5543, + "step": 9672 + }, + { + "epoch": 0.7806472439673957, + "grad_norm": 0.6585283875465393, + "learning_rate": 0.000105948147388601, + "loss": 2.5244, + "step": 9673 + }, + { + "epoch": 0.7807279477039787, + "grad_norm": 0.6484133005142212, + "learning_rate": 0.00010593238836891704, + "loss": 2.4996, + "step": 9674 + }, + { + "epoch": 0.7808086514405617, + "grad_norm": 0.6681119799613953, + "learning_rate": 0.00010591662920138248, + "loss": 2.5322, + "step": 9675 + }, + { + "epoch": 0.7808893551771448, + "grad_norm": 0.709403395652771, + "learning_rate": 0.00010590086988639005, + "loss": 2.5554, + "step": 9676 + }, + { + "epoch": 0.7809700589137277, + "grad_norm": 0.6734669804573059, + "learning_rate": 0.00010588511042433251, + "loss": 2.5452, + "step": 9677 + }, + { + "epoch": 0.7810507626503107, + "grad_norm": 0.6800141930580139, + "learning_rate": 0.00010586935081560268, + "loss": 2.5154, + "step": 9678 + }, + { + "epoch": 0.7811314663868937, + "grad_norm": 0.7757244110107422, + "learning_rate": 0.00010585359106059326, + "loss": 2.5935, + "step": 9679 + }, + { + "epoch": 0.7812121701234768, + "grad_norm": 0.7288491725921631, + "learning_rate": 0.00010583783115969699, + "loss": 2.5276, + "step": 9680 + }, + { + "epoch": 0.7812928738600597, + "grad_norm": 0.6785164475440979, + "learning_rate": 0.00010582207111330678, + "loss": 2.5907, + "step": 9681 + }, + { + "epoch": 0.7813735775966427, + "grad_norm": 0.6651367545127869, + "learning_rate": 0.0001058063109218153, + "loss": 2.545, + "step": 9682 + }, + { + "epoch": 0.7814542813332257, + "grad_norm": 0.6657043695449829, + "learning_rate": 0.0001057905505856154, + "loss": 2.5548, + "step": 9683 + }, + { + "epoch": 0.7815349850698088, + "grad_norm": 0.6486692428588867, + "learning_rate": 0.00010577479010509986, + 
"loss": 2.5589, + "step": 9684 + }, + { + "epoch": 0.7816156888063918, + "grad_norm": 0.700749397277832, + "learning_rate": 0.0001057590294806614, + "loss": 2.6008, + "step": 9685 + }, + { + "epoch": 0.7816963925429747, + "grad_norm": 0.647051215171814, + "learning_rate": 0.00010574326871269289, + "loss": 2.4894, + "step": 9686 + }, + { + "epoch": 0.7817770962795577, + "grad_norm": 0.6932066679000854, + "learning_rate": 0.00010572750780158713, + "loss": 2.5256, + "step": 9687 + }, + { + "epoch": 0.7818578000161408, + "grad_norm": 0.6330733895301819, + "learning_rate": 0.00010571174674773689, + "loss": 2.5242, + "step": 9688 + }, + { + "epoch": 0.7819385037527238, + "grad_norm": 0.6476379036903381, + "learning_rate": 0.00010569598555153499, + "loss": 2.552, + "step": 9689 + }, + { + "epoch": 0.7820192074893068, + "grad_norm": 0.661204993724823, + "learning_rate": 0.00010568022421337424, + "loss": 2.4869, + "step": 9690 + }, + { + "epoch": 0.7820999112258897, + "grad_norm": 0.6663263440132141, + "learning_rate": 0.00010566446273364746, + "loss": 2.5134, + "step": 9691 + }, + { + "epoch": 0.7821806149624727, + "grad_norm": 0.6982834339141846, + "learning_rate": 0.00010564870111274748, + "loss": 2.5755, + "step": 9692 + }, + { + "epoch": 0.7822613186990558, + "grad_norm": 0.6266167759895325, + "learning_rate": 0.00010563293935106706, + "loss": 2.5413, + "step": 9693 + }, + { + "epoch": 0.7823420224356388, + "grad_norm": 0.6484279632568359, + "learning_rate": 0.0001056171774489991, + "loss": 2.5579, + "step": 9694 + }, + { + "epoch": 0.7824227261722217, + "grad_norm": 0.674933910369873, + "learning_rate": 0.00010560141540693638, + "loss": 2.5364, + "step": 9695 + }, + { + "epoch": 0.7825034299088047, + "grad_norm": 0.7961840033531189, + "learning_rate": 0.00010558565322527174, + "loss": 2.5143, + "step": 9696 + }, + { + "epoch": 0.7825841336453878, + "grad_norm": 0.697158694267273, + "learning_rate": 0.00010556989090439804, + "loss": 2.5341, + "step": 9697 + }, + { + 
"epoch": 0.7826648373819708, + "grad_norm": 0.6912708282470703, + "learning_rate": 0.00010555412844470806, + "loss": 2.5331, + "step": 9698 + }, + { + "epoch": 0.7827455411185538, + "grad_norm": 0.7078350186347961, + "learning_rate": 0.00010553836584659474, + "loss": 2.5752, + "step": 9699 + }, + { + "epoch": 0.7828262448551367, + "grad_norm": 0.6421065926551819, + "learning_rate": 0.00010552260311045082, + "loss": 2.5393, + "step": 9700 + }, + { + "epoch": 0.7829069485917198, + "grad_norm": 0.644120454788208, + "learning_rate": 0.00010550684023666918, + "loss": 2.5062, + "step": 9701 + }, + { + "epoch": 0.7829876523283028, + "grad_norm": 0.7038589715957642, + "learning_rate": 0.00010549107722564275, + "loss": 2.6074, + "step": 9702 + }, + { + "epoch": 0.7830683560648858, + "grad_norm": 0.6692953109741211, + "learning_rate": 0.00010547531407776427, + "loss": 2.5801, + "step": 9703 + }, + { + "epoch": 0.7831490598014688, + "grad_norm": 0.7059200406074524, + "learning_rate": 0.00010545955079342669, + "loss": 2.5579, + "step": 9704 + }, + { + "epoch": 0.7832297635380518, + "grad_norm": 0.7126718759536743, + "learning_rate": 0.0001054437873730228, + "loss": 2.5764, + "step": 9705 + }, + { + "epoch": 0.7833104672746348, + "grad_norm": 0.696784257888794, + "learning_rate": 0.0001054280238169455, + "loss": 2.5256, + "step": 9706 + }, + { + "epoch": 0.7833911710112178, + "grad_norm": 0.7473082542419434, + "learning_rate": 0.00010541226012558767, + "loss": 2.5983, + "step": 9707 + }, + { + "epoch": 0.7834718747478008, + "grad_norm": 0.6598967909812927, + "learning_rate": 0.00010539649629934219, + "loss": 2.5267, + "step": 9708 + }, + { + "epoch": 0.7835525784843839, + "grad_norm": 0.7168934345245361, + "learning_rate": 0.00010538073233860188, + "loss": 2.5278, + "step": 9709 + }, + { + "epoch": 0.7836332822209668, + "grad_norm": 0.6848951578140259, + "learning_rate": 0.00010536496824375968, + "loss": 2.5267, + "step": 9710 + }, + { + "epoch": 0.7837139859575498, + 
"grad_norm": 0.7276272773742676, + "learning_rate": 0.0001053492040152084, + "loss": 2.5706, + "step": 9711 + }, + { + "epoch": 0.7837946896941328, + "grad_norm": 0.6929399371147156, + "learning_rate": 0.00010533343965334101, + "loss": 2.5184, + "step": 9712 + }, + { + "epoch": 0.7838753934307159, + "grad_norm": 0.7497181296348572, + "learning_rate": 0.00010531767515855037, + "loss": 2.5626, + "step": 9713 + }, + { + "epoch": 0.7839560971672989, + "grad_norm": 0.6536200046539307, + "learning_rate": 0.00010530191053122935, + "loss": 2.5909, + "step": 9714 + }, + { + "epoch": 0.7840368009038818, + "grad_norm": 0.6750395894050598, + "learning_rate": 0.00010528614577177087, + "loss": 2.5119, + "step": 9715 + }, + { + "epoch": 0.7841175046404648, + "grad_norm": 0.6284878849983215, + "learning_rate": 0.00010527038088056782, + "loss": 2.5417, + "step": 9716 + }, + { + "epoch": 0.7841982083770479, + "grad_norm": 0.6529444456100464, + "learning_rate": 0.00010525461585801308, + "loss": 2.5865, + "step": 9717 + }, + { + "epoch": 0.7842789121136309, + "grad_norm": 0.7332968711853027, + "learning_rate": 0.00010523885070449959, + "loss": 2.561, + "step": 9718 + }, + { + "epoch": 0.7843596158502139, + "grad_norm": 0.7054178714752197, + "learning_rate": 0.00010522308542042025, + "loss": 2.623, + "step": 9719 + }, + { + "epoch": 0.7844403195867968, + "grad_norm": 0.6837820410728455, + "learning_rate": 0.00010520732000616798, + "loss": 2.5586, + "step": 9720 + }, + { + "epoch": 0.7845210233233799, + "grad_norm": 0.7339439392089844, + "learning_rate": 0.00010519155446213565, + "loss": 2.5374, + "step": 9721 + }, + { + "epoch": 0.7846017270599629, + "grad_norm": 0.7625028491020203, + "learning_rate": 0.00010517578878871624, + "loss": 2.5663, + "step": 9722 + }, + { + "epoch": 0.7846824307965459, + "grad_norm": 0.6749752759933472, + "learning_rate": 0.00010516002298630263, + "loss": 2.5744, + "step": 9723 + }, + { + "epoch": 0.7847631345331288, + "grad_norm": 0.6702882647514343, + 
"learning_rate": 0.00010514425705528776, + "loss": 2.6247, + "step": 9724 + }, + { + "epoch": 0.7848438382697119, + "grad_norm": 0.6641737222671509, + "learning_rate": 0.00010512849099606457, + "loss": 2.5792, + "step": 9725 + }, + { + "epoch": 0.7849245420062949, + "grad_norm": 0.7522993683815002, + "learning_rate": 0.00010511272480902597, + "loss": 2.5941, + "step": 9726 + }, + { + "epoch": 0.7850052457428779, + "grad_norm": 0.7507709860801697, + "learning_rate": 0.00010509695849456487, + "loss": 2.5312, + "step": 9727 + }, + { + "epoch": 0.7850859494794609, + "grad_norm": 0.7101978063583374, + "learning_rate": 0.0001050811920530743, + "loss": 2.5833, + "step": 9728 + }, + { + "epoch": 0.785166653216044, + "grad_norm": 0.6814672946929932, + "learning_rate": 0.0001050654254849471, + "loss": 2.5466, + "step": 9729 + }, + { + "epoch": 0.7852473569526269, + "grad_norm": 0.7250106930732727, + "learning_rate": 0.0001050496587905763, + "loss": 2.5144, + "step": 9730 + }, + { + "epoch": 0.7853280606892099, + "grad_norm": 0.7125658392906189, + "learning_rate": 0.00010503389197035474, + "loss": 2.5384, + "step": 9731 + }, + { + "epoch": 0.7854087644257929, + "grad_norm": 0.7076827883720398, + "learning_rate": 0.00010501812502467547, + "loss": 2.4879, + "step": 9732 + }, + { + "epoch": 0.785489468162376, + "grad_norm": 0.632216215133667, + "learning_rate": 0.00010500235795393141, + "loss": 2.5678, + "step": 9733 + }, + { + "epoch": 0.785570171898959, + "grad_norm": 0.7376949191093445, + "learning_rate": 0.00010498659075851551, + "loss": 2.5024, + "step": 9734 + }, + { + "epoch": 0.7856508756355419, + "grad_norm": 0.6730546951293945, + "learning_rate": 0.00010497082343882072, + "loss": 2.5001, + "step": 9735 + }, + { + "epoch": 0.7857315793721249, + "grad_norm": 0.6958187818527222, + "learning_rate": 0.00010495505599524002, + "loss": 2.538, + "step": 9736 + }, + { + "epoch": 0.785812283108708, + "grad_norm": 0.6882508397102356, + "learning_rate": 0.00010493928842816638, + 
"loss": 2.5247, + "step": 9737 + }, + { + "epoch": 0.785892986845291, + "grad_norm": 0.711086630821228, + "learning_rate": 0.00010492352073799276, + "loss": 2.5721, + "step": 9738 + }, + { + "epoch": 0.7859736905818739, + "grad_norm": 0.7217094898223877, + "learning_rate": 0.00010490775292511214, + "loss": 2.5827, + "step": 9739 + }, + { + "epoch": 0.7860543943184569, + "grad_norm": 0.6812087893486023, + "learning_rate": 0.0001048919849899175, + "loss": 2.532, + "step": 9740 + }, + { + "epoch": 0.7861350980550399, + "grad_norm": 0.7449110150337219, + "learning_rate": 0.00010487621693280176, + "loss": 2.5611, + "step": 9741 + }, + { + "epoch": 0.786215801791623, + "grad_norm": 0.7297104001045227, + "learning_rate": 0.00010486044875415797, + "loss": 2.5173, + "step": 9742 + }, + { + "epoch": 0.786296505528206, + "grad_norm": 0.6741474270820618, + "learning_rate": 0.0001048446804543791, + "loss": 2.5451, + "step": 9743 + }, + { + "epoch": 0.7863772092647889, + "grad_norm": 0.6450859308242798, + "learning_rate": 0.00010482891203385812, + "loss": 2.551, + "step": 9744 + }, + { + "epoch": 0.7864579130013719, + "grad_norm": 0.6867123246192932, + "learning_rate": 0.00010481314349298805, + "loss": 2.4875, + "step": 9745 + }, + { + "epoch": 0.786538616737955, + "grad_norm": 0.6951552629470825, + "learning_rate": 0.00010479737483216183, + "loss": 2.6253, + "step": 9746 + }, + { + "epoch": 0.786619320474538, + "grad_norm": 0.6786869764328003, + "learning_rate": 0.0001047816060517725, + "loss": 2.5551, + "step": 9747 + }, + { + "epoch": 0.786700024211121, + "grad_norm": 0.698957622051239, + "learning_rate": 0.00010476583715221306, + "loss": 2.5554, + "step": 9748 + }, + { + "epoch": 0.7867807279477039, + "grad_norm": 0.6407502889633179, + "learning_rate": 0.00010475006813387648, + "loss": 2.5112, + "step": 9749 + }, + { + "epoch": 0.786861431684287, + "grad_norm": 0.660418689250946, + "learning_rate": 0.00010473429899715581, + "loss": 2.5557, + "step": 9750 + }, + { + "epoch": 
0.78694213542087, + "grad_norm": 0.71445631980896, + "learning_rate": 0.00010471852974244403, + "loss": 2.5169, + "step": 9751 + }, + { + "epoch": 0.787022839157453, + "grad_norm": 0.6620494723320007, + "learning_rate": 0.00010470276037013414, + "loss": 2.5517, + "step": 9752 + }, + { + "epoch": 0.787103542894036, + "grad_norm": 0.6921235918998718, + "learning_rate": 0.00010468699088061917, + "loss": 2.5246, + "step": 9753 + }, + { + "epoch": 0.787184246630619, + "grad_norm": 0.6617140769958496, + "learning_rate": 0.00010467122127429214, + "loss": 2.4941, + "step": 9754 + }, + { + "epoch": 0.787264950367202, + "grad_norm": 0.6549816727638245, + "learning_rate": 0.00010465545155154608, + "loss": 2.5189, + "step": 9755 + }, + { + "epoch": 0.787345654103785, + "grad_norm": 0.7030060887336731, + "learning_rate": 0.00010463968171277396, + "loss": 2.5058, + "step": 9756 + }, + { + "epoch": 0.787426357840368, + "grad_norm": 0.7294049859046936, + "learning_rate": 0.00010462391175836886, + "loss": 2.5166, + "step": 9757 + }, + { + "epoch": 0.787507061576951, + "grad_norm": 0.6407562494277954, + "learning_rate": 0.00010460814168872382, + "loss": 2.5391, + "step": 9758 + }, + { + "epoch": 0.787587765313534, + "grad_norm": 0.8024646639823914, + "learning_rate": 0.0001045923715042318, + "loss": 2.7034, + "step": 9759 + }, + { + "epoch": 0.787668469050117, + "grad_norm": 0.7160943150520325, + "learning_rate": 0.00010457660120528592, + "loss": 2.6016, + "step": 9760 + }, + { + "epoch": 0.7877491727867, + "grad_norm": 0.6987707018852234, + "learning_rate": 0.00010456083079227916, + "loss": 2.5428, + "step": 9761 + }, + { + "epoch": 0.7878298765232831, + "grad_norm": 0.7235369086265564, + "learning_rate": 0.00010454506026560453, + "loss": 2.517, + "step": 9762 + }, + { + "epoch": 0.787910580259866, + "grad_norm": 0.6827502846717834, + "learning_rate": 0.00010452928962565518, + "loss": 2.5777, + "step": 9763 + }, + { + "epoch": 0.787991283996449, + "grad_norm": 0.71755450963974, + 
"learning_rate": 0.00010451351887282408, + "loss": 2.6004, + "step": 9764 + }, + { + "epoch": 0.788071987733032, + "grad_norm": 0.6988046765327454, + "learning_rate": 0.00010449774800750427, + "loss": 2.6116, + "step": 9765 + }, + { + "epoch": 0.7881526914696151, + "grad_norm": 0.6959548592567444, + "learning_rate": 0.00010448197703008884, + "loss": 2.5856, + "step": 9766 + }, + { + "epoch": 0.7882333952061981, + "grad_norm": 0.687042772769928, + "learning_rate": 0.00010446620594097079, + "loss": 2.5167, + "step": 9767 + }, + { + "epoch": 0.788314098942781, + "grad_norm": 0.6950173377990723, + "learning_rate": 0.00010445043474054325, + "loss": 2.5157, + "step": 9768 + }, + { + "epoch": 0.788394802679364, + "grad_norm": 0.680768609046936, + "learning_rate": 0.00010443466342919926, + "loss": 2.6177, + "step": 9769 + }, + { + "epoch": 0.7884755064159471, + "grad_norm": 0.7790142893791199, + "learning_rate": 0.00010441889200733181, + "loss": 2.5761, + "step": 9770 + }, + { + "epoch": 0.7885562101525301, + "grad_norm": 0.6207798719406128, + "learning_rate": 0.00010440312047533406, + "loss": 2.5305, + "step": 9771 + }, + { + "epoch": 0.7886369138891131, + "grad_norm": 0.7143635749816895, + "learning_rate": 0.00010438734883359903, + "loss": 2.5922, + "step": 9772 + }, + { + "epoch": 0.788717617625696, + "grad_norm": 0.7234248518943787, + "learning_rate": 0.00010437157708251977, + "loss": 2.6051, + "step": 9773 + }, + { + "epoch": 0.7887983213622791, + "grad_norm": 0.6602753400802612, + "learning_rate": 0.00010435580522248942, + "loss": 2.6002, + "step": 9774 + }, + { + "epoch": 0.7888790250988621, + "grad_norm": 0.6929246783256531, + "learning_rate": 0.00010434003325390101, + "loss": 2.5798, + "step": 9775 + }, + { + "epoch": 0.7889597288354451, + "grad_norm": 0.7355811595916748, + "learning_rate": 0.00010432426117714762, + "loss": 2.5859, + "step": 9776 + }, + { + "epoch": 0.789040432572028, + "grad_norm": 0.7009611129760742, + "learning_rate": 0.00010430848899262233, + 
"loss": 2.5535, + "step": 9777 + }, + { + "epoch": 0.7891211363086111, + "grad_norm": 0.6699070930480957, + "learning_rate": 0.00010429271670071823, + "loss": 2.5687, + "step": 9778 + }, + { + "epoch": 0.7892018400451941, + "grad_norm": 0.6632630228996277, + "learning_rate": 0.00010427694430182844, + "loss": 2.5359, + "step": 9779 + }, + { + "epoch": 0.7892825437817771, + "grad_norm": 0.7256911993026733, + "learning_rate": 0.000104261171796346, + "loss": 2.5432, + "step": 9780 + }, + { + "epoch": 0.7893632475183601, + "grad_norm": 0.6654312610626221, + "learning_rate": 0.000104245399184664, + "loss": 2.5432, + "step": 9781 + }, + { + "epoch": 0.7894439512549432, + "grad_norm": 0.6808900237083435, + "learning_rate": 0.00010422962646717557, + "loss": 2.4951, + "step": 9782 + }, + { + "epoch": 0.7895246549915261, + "grad_norm": 0.6655945181846619, + "learning_rate": 0.00010421385364427378, + "loss": 2.5152, + "step": 9783 + }, + { + "epoch": 0.7896053587281091, + "grad_norm": 0.8399274349212646, + "learning_rate": 0.00010419808071635178, + "loss": 2.5688, + "step": 9784 + }, + { + "epoch": 0.7896860624646921, + "grad_norm": 0.6412226557731628, + "learning_rate": 0.00010418230768380262, + "loss": 2.5527, + "step": 9785 + }, + { + "epoch": 0.7897667662012752, + "grad_norm": 0.6505058407783508, + "learning_rate": 0.0001041665345470194, + "loss": 2.5768, + "step": 9786 + }, + { + "epoch": 0.7898474699378581, + "grad_norm": 0.6297653317451477, + "learning_rate": 0.00010415076130639526, + "loss": 2.5372, + "step": 9787 + }, + { + "epoch": 0.7899281736744411, + "grad_norm": 0.6524460315704346, + "learning_rate": 0.00010413498796232331, + "loss": 2.5047, + "step": 9788 + }, + { + "epoch": 0.7900088774110241, + "grad_norm": 0.6637924313545227, + "learning_rate": 0.00010411921451519662, + "loss": 2.508, + "step": 9789 + }, + { + "epoch": 0.7900895811476072, + "grad_norm": 0.6423435211181641, + "learning_rate": 0.00010410344096540836, + "loss": 2.4597, + "step": 9790 + }, + { + 
"epoch": 0.7901702848841902, + "grad_norm": 0.6361977458000183, + "learning_rate": 0.00010408766731335163, + "loss": 2.5921, + "step": 9791 + }, + { + "epoch": 0.7902509886207731, + "grad_norm": 0.6792182922363281, + "learning_rate": 0.00010407189355941953, + "loss": 2.5543, + "step": 9792 + }, + { + "epoch": 0.7903316923573561, + "grad_norm": 0.6998419761657715, + "learning_rate": 0.00010405611970400519, + "loss": 2.5333, + "step": 9793 + }, + { + "epoch": 0.7904123960939391, + "grad_norm": 0.6730015873908997, + "learning_rate": 0.00010404034574750174, + "loss": 2.596, + "step": 9794 + }, + { + "epoch": 0.7904930998305222, + "grad_norm": 0.7120258808135986, + "learning_rate": 0.00010402457169030235, + "loss": 2.5314, + "step": 9795 + }, + { + "epoch": 0.7905738035671052, + "grad_norm": 0.6553651690483093, + "learning_rate": 0.0001040087975328001, + "loss": 2.4973, + "step": 9796 + }, + { + "epoch": 0.7906545073036881, + "grad_norm": 0.6506681442260742, + "learning_rate": 0.00010399302327538812, + "loss": 2.588, + "step": 9797 + }, + { + "epoch": 0.7907352110402711, + "grad_norm": 0.6737257242202759, + "learning_rate": 0.00010397724891845957, + "loss": 2.5454, + "step": 9798 + }, + { + "epoch": 0.7908159147768542, + "grad_norm": 0.670120894908905, + "learning_rate": 0.00010396147446240756, + "loss": 2.4926, + "step": 9799 + }, + { + "epoch": 0.7908966185134372, + "grad_norm": 0.7028468251228333, + "learning_rate": 0.00010394569990762529, + "loss": 2.5727, + "step": 9800 + }, + { + "epoch": 0.7909773222500202, + "grad_norm": 0.7084455490112305, + "learning_rate": 0.00010392992525450584, + "loss": 2.547, + "step": 9801 + }, + { + "epoch": 0.7910580259866031, + "grad_norm": 0.732694685459137, + "learning_rate": 0.0001039141505034424, + "loss": 2.5871, + "step": 9802 + }, + { + "epoch": 0.7911387297231862, + "grad_norm": 0.7214515209197998, + "learning_rate": 0.00010389837565482807, + "loss": 2.5672, + "step": 9803 + }, + { + "epoch": 0.7912194334597692, + "grad_norm": 
0.6495330333709717, + "learning_rate": 0.00010388260070905604, + "loss": 2.5266, + "step": 9804 + }, + { + "epoch": 0.7913001371963522, + "grad_norm": 0.6930941343307495, + "learning_rate": 0.00010386682566651945, + "loss": 2.5734, + "step": 9805 + }, + { + "epoch": 0.7913808409329351, + "grad_norm": 0.714214563369751, + "learning_rate": 0.00010385105052761148, + "loss": 2.4987, + "step": 9806 + }, + { + "epoch": 0.7914615446695182, + "grad_norm": 0.7525388598442078, + "learning_rate": 0.00010383527529272523, + "loss": 2.5427, + "step": 9807 + }, + { + "epoch": 0.7915422484061012, + "grad_norm": 0.6088642477989197, + "learning_rate": 0.00010381949996225389, + "loss": 2.5018, + "step": 9808 + }, + { + "epoch": 0.7916229521426842, + "grad_norm": 0.6797540187835693, + "learning_rate": 0.00010380372453659066, + "loss": 2.5235, + "step": 9809 + }, + { + "epoch": 0.7917036558792672, + "grad_norm": 0.6754054427146912, + "learning_rate": 0.00010378794901612865, + "loss": 2.5343, + "step": 9810 + }, + { + "epoch": 0.7917843596158503, + "grad_norm": 0.7375015020370483, + "learning_rate": 0.00010377217340126106, + "loss": 2.6101, + "step": 9811 + }, + { + "epoch": 0.7918650633524332, + "grad_norm": 0.6487904191017151, + "learning_rate": 0.00010375639769238103, + "loss": 2.5408, + "step": 9812 + }, + { + "epoch": 0.7919457670890162, + "grad_norm": 0.7280275821685791, + "learning_rate": 0.00010374062188988176, + "loss": 2.5503, + "step": 9813 + }, + { + "epoch": 0.7920264708255992, + "grad_norm": 0.6944922208786011, + "learning_rate": 0.00010372484599415644, + "loss": 2.5815, + "step": 9814 + }, + { + "epoch": 0.7921071745621823, + "grad_norm": 0.6970139741897583, + "learning_rate": 0.00010370907000559818, + "loss": 2.546, + "step": 9815 + }, + { + "epoch": 0.7921878782987652, + "grad_norm": 0.7338151335716248, + "learning_rate": 0.00010369329392460023, + "loss": 2.5449, + "step": 9816 + }, + { + "epoch": 0.7922685820353482, + "grad_norm": 0.7763465642929077, + "learning_rate": 
0.00010367751775155574, + "loss": 2.5331, + "step": 9817 + }, + { + "epoch": 0.7923492857719312, + "grad_norm": 0.6892645955085754, + "learning_rate": 0.00010366174148685786, + "loss": 2.5617, + "step": 9818 + }, + { + "epoch": 0.7924299895085143, + "grad_norm": 0.7388250231742859, + "learning_rate": 0.00010364596513089984, + "loss": 2.5236, + "step": 9819 + }, + { + "epoch": 0.7925106932450973, + "grad_norm": 0.7035132646560669, + "learning_rate": 0.00010363018868407482, + "loss": 2.5711, + "step": 9820 + }, + { + "epoch": 0.7925913969816802, + "grad_norm": 0.7087043523788452, + "learning_rate": 0.00010361441214677603, + "loss": 2.5416, + "step": 9821 + }, + { + "epoch": 0.7926721007182632, + "grad_norm": 0.7173168063163757, + "learning_rate": 0.00010359863551939664, + "loss": 2.529, + "step": 9822 + }, + { + "epoch": 0.7927528044548463, + "grad_norm": 0.7007408738136292, + "learning_rate": 0.00010358285880232983, + "loss": 2.5287, + "step": 9823 + }, + { + "epoch": 0.7928335081914293, + "grad_norm": 0.7731965780258179, + "learning_rate": 0.0001035670819959688, + "loss": 2.5913, + "step": 9824 + }, + { + "epoch": 0.7929142119280123, + "grad_norm": 0.6625120639801025, + "learning_rate": 0.00010355130510070681, + "loss": 2.5815, + "step": 9825 + }, + { + "epoch": 0.7929949156645952, + "grad_norm": 0.6628395318984985, + "learning_rate": 0.00010353552811693699, + "loss": 2.512, + "step": 9826 + }, + { + "epoch": 0.7930756194011783, + "grad_norm": 0.6565915942192078, + "learning_rate": 0.00010351975104505256, + "loss": 2.54, + "step": 9827 + }, + { + "epoch": 0.7931563231377613, + "grad_norm": 0.6581636667251587, + "learning_rate": 0.00010350397388544672, + "loss": 2.5462, + "step": 9828 + }, + { + "epoch": 0.7932370268743443, + "grad_norm": 0.705668568611145, + "learning_rate": 0.0001034881966385127, + "loss": 2.5241, + "step": 9829 + }, + { + "epoch": 0.7933177306109273, + "grad_norm": 0.7047126293182373, + "learning_rate": 0.00010347241930464373, + "loss": 2.5275, + 
"step": 9830 + }, + { + "epoch": 0.7933984343475103, + "grad_norm": 0.6285849213600159, + "learning_rate": 0.00010345664188423296, + "loss": 2.518, + "step": 9831 + }, + { + "epoch": 0.7934791380840933, + "grad_norm": 0.697542130947113, + "learning_rate": 0.00010344086437767366, + "loss": 2.5219, + "step": 9832 + }, + { + "epoch": 0.7935598418206763, + "grad_norm": 0.6349283456802368, + "learning_rate": 0.00010342508678535903, + "loss": 2.5277, + "step": 9833 + }, + { + "epoch": 0.7936405455572593, + "grad_norm": 0.7084335088729858, + "learning_rate": 0.00010340930910768225, + "loss": 2.476, + "step": 9834 + }, + { + "epoch": 0.7937212492938424, + "grad_norm": 0.6714156866073608, + "learning_rate": 0.00010339353134503662, + "loss": 2.556, + "step": 9835 + }, + { + "epoch": 0.7938019530304253, + "grad_norm": 0.6687895059585571, + "learning_rate": 0.00010337775349781527, + "loss": 2.5756, + "step": 9836 + }, + { + "epoch": 0.7938826567670083, + "grad_norm": 0.669784665107727, + "learning_rate": 0.00010336197556641152, + "loss": 2.5545, + "step": 9837 + }, + { + "epoch": 0.7939633605035913, + "grad_norm": 0.6738600134849548, + "learning_rate": 0.0001033461975512185, + "loss": 2.5807, + "step": 9838 + }, + { + "epoch": 0.7940440642401744, + "grad_norm": 0.691443681716919, + "learning_rate": 0.00010333041945262953, + "loss": 2.5279, + "step": 9839 + }, + { + "epoch": 0.7941247679767574, + "grad_norm": 0.6283861398696899, + "learning_rate": 0.0001033146412710378, + "loss": 2.5355, + "step": 9840 + }, + { + "epoch": 0.7942054717133403, + "grad_norm": 0.6491204500198364, + "learning_rate": 0.00010329886300683655, + "loss": 2.5431, + "step": 9841 + }, + { + "epoch": 0.7942861754499233, + "grad_norm": 0.6673988103866577, + "learning_rate": 0.00010328308466041898, + "loss": 2.5845, + "step": 9842 + }, + { + "epoch": 0.7943668791865063, + "grad_norm": 0.6669130325317383, + "learning_rate": 0.00010326730623217837, + "loss": 2.5348, + "step": 9843 + }, + { + "epoch": 
0.7944475829230894, + "grad_norm": 0.7003189921379089, + "learning_rate": 0.00010325152772250795, + "loss": 2.5779, + "step": 9844 + }, + { + "epoch": 0.7945282866596723, + "grad_norm": 0.6602177619934082, + "learning_rate": 0.00010323574913180097, + "loss": 2.5527, + "step": 9845 + }, + { + "epoch": 0.7946089903962553, + "grad_norm": 0.7053726315498352, + "learning_rate": 0.00010321997046045066, + "loss": 2.566, + "step": 9846 + }, + { + "epoch": 0.7946896941328383, + "grad_norm": 0.7428076863288879, + "learning_rate": 0.00010320419170885025, + "loss": 2.5348, + "step": 9847 + }, + { + "epoch": 0.7947703978694214, + "grad_norm": 0.7029163837432861, + "learning_rate": 0.00010318841287739303, + "loss": 2.5387, + "step": 9848 + }, + { + "epoch": 0.7948511016060044, + "grad_norm": 0.6159133911132812, + "learning_rate": 0.00010317263396647221, + "loss": 2.5408, + "step": 9849 + }, + { + "epoch": 0.7949318053425873, + "grad_norm": 0.6748857498168945, + "learning_rate": 0.00010315685497648106, + "loss": 2.5299, + "step": 9850 + }, + { + "epoch": 0.7950125090791703, + "grad_norm": 0.6281898021697998, + "learning_rate": 0.00010314107590781284, + "loss": 2.5202, + "step": 9851 + }, + { + "epoch": 0.7950932128157534, + "grad_norm": 0.6602163910865784, + "learning_rate": 0.00010312529676086078, + "loss": 2.5119, + "step": 9852 + }, + { + "epoch": 0.7951739165523364, + "grad_norm": 0.6665403246879578, + "learning_rate": 0.00010310951753601818, + "loss": 2.5913, + "step": 9853 + }, + { + "epoch": 0.7952546202889194, + "grad_norm": 0.6705873012542725, + "learning_rate": 0.00010309373823367827, + "loss": 2.6039, + "step": 9854 + }, + { + "epoch": 0.7953353240255023, + "grad_norm": 0.6571313738822937, + "learning_rate": 0.0001030779588542343, + "loss": 2.5629, + "step": 9855 + }, + { + "epoch": 0.7954160277620854, + "grad_norm": 0.6597230434417725, + "learning_rate": 0.00010306217939807956, + "loss": 2.5569, + "step": 9856 + }, + { + "epoch": 0.7954967314986684, + "grad_norm": 
0.7098817229270935, + "learning_rate": 0.00010304639986560733, + "loss": 2.4736, + "step": 9857 + }, + { + "epoch": 0.7955774352352514, + "grad_norm": 0.628663957118988, + "learning_rate": 0.00010303062025721082, + "loss": 2.5241, + "step": 9858 + }, + { + "epoch": 0.7956581389718343, + "grad_norm": 0.630843460559845, + "learning_rate": 0.00010301484057328333, + "loss": 2.5604, + "step": 9859 + }, + { + "epoch": 0.7957388427084174, + "grad_norm": 0.7457596659660339, + "learning_rate": 0.00010299906081421813, + "loss": 2.5675, + "step": 9860 + }, + { + "epoch": 0.7958195464450004, + "grad_norm": 0.6566091775894165, + "learning_rate": 0.00010298328098040851, + "loss": 2.4918, + "step": 9861 + }, + { + "epoch": 0.7959002501815834, + "grad_norm": 0.657357931137085, + "learning_rate": 0.00010296750107224773, + "loss": 2.5268, + "step": 9862 + }, + { + "epoch": 0.7959809539181664, + "grad_norm": 0.7021927833557129, + "learning_rate": 0.00010295172109012905, + "loss": 2.528, + "step": 9863 + }, + { + "epoch": 0.7960616576547495, + "grad_norm": 0.662053108215332, + "learning_rate": 0.00010293594103444578, + "loss": 2.5483, + "step": 9864 + }, + { + "epoch": 0.7961423613913324, + "grad_norm": 0.776407778263092, + "learning_rate": 0.00010292016090559118, + "loss": 2.6089, + "step": 9865 + }, + { + "epoch": 0.7962230651279154, + "grad_norm": 0.6499512791633606, + "learning_rate": 0.00010290438070395854, + "loss": 2.5609, + "step": 9866 + }, + { + "epoch": 0.7963037688644984, + "grad_norm": 0.6802246570587158, + "learning_rate": 0.00010288860042994113, + "loss": 2.5217, + "step": 9867 + }, + { + "epoch": 0.7963844726010815, + "grad_norm": 0.6371235847473145, + "learning_rate": 0.00010287282008393224, + "loss": 2.4783, + "step": 9868 + }, + { + "epoch": 0.7964651763376644, + "grad_norm": 0.7070169448852539, + "learning_rate": 0.00010285703966632518, + "loss": 2.5006, + "step": 9869 + }, + { + "epoch": 0.7965458800742474, + "grad_norm": 0.657738208770752, + "learning_rate": 
0.00010284125917751323, + "loss": 2.551, + "step": 9870 + }, + { + "epoch": 0.7966265838108304, + "grad_norm": 0.7936853170394897, + "learning_rate": 0.00010282547861788964, + "loss": 2.574, + "step": 9871 + }, + { + "epoch": 0.7967072875474135, + "grad_norm": 0.675715982913971, + "learning_rate": 0.00010280969798784779, + "loss": 2.5288, + "step": 9872 + }, + { + "epoch": 0.7967879912839965, + "grad_norm": 0.6980394124984741, + "learning_rate": 0.00010279391728778092, + "loss": 2.5437, + "step": 9873 + }, + { + "epoch": 0.7968686950205794, + "grad_norm": 0.6580469608306885, + "learning_rate": 0.00010277813651808226, + "loss": 2.5574, + "step": 9874 + }, + { + "epoch": 0.7969493987571624, + "grad_norm": 0.6960238218307495, + "learning_rate": 0.00010276235567914522, + "loss": 2.5477, + "step": 9875 + }, + { + "epoch": 0.7970301024937455, + "grad_norm": 0.704140841960907, + "learning_rate": 0.00010274657477136304, + "loss": 2.5099, + "step": 9876 + }, + { + "epoch": 0.7971108062303285, + "grad_norm": 0.7238990068435669, + "learning_rate": 0.00010273079379512906, + "loss": 2.6182, + "step": 9877 + }, + { + "epoch": 0.7971915099669115, + "grad_norm": 0.6527700424194336, + "learning_rate": 0.00010271501275083657, + "loss": 2.5148, + "step": 9878 + }, + { + "epoch": 0.7972722137034944, + "grad_norm": 0.6665365695953369, + "learning_rate": 0.00010269923163887884, + "loss": 2.5624, + "step": 9879 + }, + { + "epoch": 0.7973529174400775, + "grad_norm": 0.7304019927978516, + "learning_rate": 0.0001026834504596492, + "loss": 2.5537, + "step": 9880 + }, + { + "epoch": 0.7974336211766605, + "grad_norm": 0.6645877957344055, + "learning_rate": 0.00010266766921354099, + "loss": 2.5381, + "step": 9881 + }, + { + "epoch": 0.7975143249132435, + "grad_norm": 0.6817314624786377, + "learning_rate": 0.00010265188790094744, + "loss": 2.5399, + "step": 9882 + }, + { + "epoch": 0.7975950286498265, + "grad_norm": 0.7477232217788696, + "learning_rate": 0.00010263610652226194, + "loss": 2.6461, 
+ "step": 9883 + }, + { + "epoch": 0.7976757323864095, + "grad_norm": 0.7087170481681824, + "learning_rate": 0.00010262032507787777, + "loss": 2.5469, + "step": 9884 + }, + { + "epoch": 0.7977564361229925, + "grad_norm": 0.7093435525894165, + "learning_rate": 0.00010260454356818825, + "loss": 2.5606, + "step": 9885 + }, + { + "epoch": 0.7978371398595755, + "grad_norm": 0.6662636399269104, + "learning_rate": 0.00010258876199358672, + "loss": 2.5415, + "step": 9886 + }, + { + "epoch": 0.7979178435961585, + "grad_norm": 0.6829736232757568, + "learning_rate": 0.00010257298035446644, + "loss": 2.5618, + "step": 9887 + }, + { + "epoch": 0.7979985473327416, + "grad_norm": 0.6872264742851257, + "learning_rate": 0.00010255719865122077, + "loss": 2.5629, + "step": 9888 + }, + { + "epoch": 0.7980792510693245, + "grad_norm": 0.6988633871078491, + "learning_rate": 0.00010254141688424303, + "loss": 2.5191, + "step": 9889 + }, + { + "epoch": 0.7981599548059075, + "grad_norm": 0.6787285804748535, + "learning_rate": 0.00010252563505392654, + "loss": 2.5003, + "step": 9890 + }, + { + "epoch": 0.7982406585424905, + "grad_norm": 0.6703466773033142, + "learning_rate": 0.00010250985316066461, + "loss": 2.5442, + "step": 9891 + }, + { + "epoch": 0.7983213622790736, + "grad_norm": 0.6463642120361328, + "learning_rate": 0.0001024940712048506, + "loss": 2.5236, + "step": 9892 + }, + { + "epoch": 0.7984020660156566, + "grad_norm": 0.6835207939147949, + "learning_rate": 0.0001024782891868778, + "loss": 2.5094, + "step": 9893 + }, + { + "epoch": 0.7984827697522395, + "grad_norm": 0.6621001958847046, + "learning_rate": 0.00010246250710713956, + "loss": 2.5456, + "step": 9894 + }, + { + "epoch": 0.7985634734888225, + "grad_norm": 0.6675469875335693, + "learning_rate": 0.0001024467249660292, + "loss": 2.5312, + "step": 9895 + }, + { + "epoch": 0.7986441772254055, + "grad_norm": 0.7357796430587769, + "learning_rate": 0.00010243094276394007, + "loss": 2.5374, + "step": 9896 + }, + { + "epoch": 
0.7987248809619886, + "grad_norm": 0.7005879878997803, + "learning_rate": 0.00010241516050126549, + "loss": 2.5667, + "step": 9897 + }, + { + "epoch": 0.7988055846985715, + "grad_norm": 0.669870913028717, + "learning_rate": 0.0001023993781783988, + "loss": 2.533, + "step": 9898 + }, + { + "epoch": 0.7988862884351545, + "grad_norm": 0.7584091424942017, + "learning_rate": 0.00010238359579573333, + "loss": 2.5995, + "step": 9899 + }, + { + "epoch": 0.7989669921717375, + "grad_norm": 0.6931570172309875, + "learning_rate": 0.00010236781335366239, + "loss": 2.5506, + "step": 9900 + }, + { + "epoch": 0.7990476959083206, + "grad_norm": 0.6810948848724365, + "learning_rate": 0.0001023520308525794, + "loss": 2.5048, + "step": 9901 + }, + { + "epoch": 0.7991283996449036, + "grad_norm": 0.6857194900512695, + "learning_rate": 0.00010233624829287765, + "loss": 2.5559, + "step": 9902 + }, + { + "epoch": 0.7992091033814865, + "grad_norm": 0.6685707569122314, + "learning_rate": 0.00010232046567495046, + "loss": 2.5661, + "step": 9903 + }, + { + "epoch": 0.7992898071180695, + "grad_norm": 0.6626694202423096, + "learning_rate": 0.00010230468299919121, + "loss": 2.6293, + "step": 9904 + }, + { + "epoch": 0.7993705108546526, + "grad_norm": 0.6407302021980286, + "learning_rate": 0.00010228890026599323, + "loss": 2.5552, + "step": 9905 + }, + { + "epoch": 0.7994512145912356, + "grad_norm": 0.762235701084137, + "learning_rate": 0.00010227311747574986, + "loss": 2.4904, + "step": 9906 + }, + { + "epoch": 0.7995319183278186, + "grad_norm": 0.703507661819458, + "learning_rate": 0.0001022573346288545, + "loss": 2.5684, + "step": 9907 + }, + { + "epoch": 0.7996126220644015, + "grad_norm": 0.82541823387146, + "learning_rate": 0.00010224155172570043, + "loss": 2.521, + "step": 9908 + }, + { + "epoch": 0.7996933258009846, + "grad_norm": 0.6836804747581482, + "learning_rate": 0.00010222576876668104, + "loss": 2.5364, + "step": 9909 + }, + { + "epoch": 0.7997740295375676, + "grad_norm": 
0.7388977408409119, + "learning_rate": 0.00010220998575218966, + "loss": 2.5724, + "step": 9910 + }, + { + "epoch": 0.7998547332741506, + "grad_norm": 0.7380896806716919, + "learning_rate": 0.00010219420268261966, + "loss": 2.5918, + "step": 9911 + }, + { + "epoch": 0.7999354370107336, + "grad_norm": 0.7303522825241089, + "learning_rate": 0.00010217841955836442, + "loss": 2.5432, + "step": 9912 + }, + { + "epoch": 0.8000161407473166, + "grad_norm": 0.6859301924705505, + "learning_rate": 0.00010216263637981727, + "loss": 2.5734, + "step": 9913 + }, + { + "epoch": 0.8000968444838996, + "grad_norm": 0.731910228729248, + "learning_rate": 0.00010214685314737154, + "loss": 2.5227, + "step": 9914 + }, + { + "epoch": 0.8001775482204826, + "grad_norm": 0.7105006575584412, + "learning_rate": 0.00010213106986142062, + "loss": 2.5335, + "step": 9915 + }, + { + "epoch": 0.8002582519570656, + "grad_norm": 0.7337056994438171, + "learning_rate": 0.00010211528652235786, + "loss": 2.6204, + "step": 9916 + }, + { + "epoch": 0.8003389556936487, + "grad_norm": 0.7350614666938782, + "learning_rate": 0.00010209950313057668, + "loss": 2.5264, + "step": 9917 + }, + { + "epoch": 0.8004196594302316, + "grad_norm": 0.6411921977996826, + "learning_rate": 0.00010208371968647036, + "loss": 2.4642, + "step": 9918 + }, + { + "epoch": 0.8005003631668146, + "grad_norm": 0.7601611018180847, + "learning_rate": 0.00010206793619043229, + "loss": 2.6249, + "step": 9919 + }, + { + "epoch": 0.8005810669033976, + "grad_norm": 0.7086012363433838, + "learning_rate": 0.00010205215264285585, + "loss": 2.5508, + "step": 9920 + }, + { + "epoch": 0.8006617706399807, + "grad_norm": 0.7267128825187683, + "learning_rate": 0.00010203636904413443, + "loss": 2.5109, + "step": 9921 + }, + { + "epoch": 0.8007424743765637, + "grad_norm": 0.7606067657470703, + "learning_rate": 0.00010202058539466132, + "loss": 2.5172, + "step": 9922 + }, + { + "epoch": 0.8008231781131466, + "grad_norm": 0.7610498666763306, + 
"learning_rate": 0.00010200480169483, + "loss": 2.5085, + "step": 9923 + }, + { + "epoch": 0.8009038818497296, + "grad_norm": 0.7604225873947144, + "learning_rate": 0.00010198901794503373, + "loss": 2.5615, + "step": 9924 + }, + { + "epoch": 0.8009845855863127, + "grad_norm": 0.739532470703125, + "learning_rate": 0.00010197323414566596, + "loss": 2.5574, + "step": 9925 + }, + { + "epoch": 0.8010652893228957, + "grad_norm": 0.6913303136825562, + "learning_rate": 0.00010195745029712003, + "loss": 2.5403, + "step": 9926 + }, + { + "epoch": 0.8011459930594786, + "grad_norm": 0.6963592767715454, + "learning_rate": 0.0001019416663997893, + "loss": 2.5615, + "step": 9927 + }, + { + "epoch": 0.8012266967960616, + "grad_norm": 0.681481122970581, + "learning_rate": 0.0001019258824540672, + "loss": 2.5125, + "step": 9928 + }, + { + "epoch": 0.8013074005326447, + "grad_norm": 0.7192744016647339, + "learning_rate": 0.00010191009846034709, + "loss": 2.5952, + "step": 9929 + }, + { + "epoch": 0.8013881042692277, + "grad_norm": 0.7030046582221985, + "learning_rate": 0.00010189431441902228, + "loss": 2.5445, + "step": 9930 + }, + { + "epoch": 0.8014688080058107, + "grad_norm": 0.6180598139762878, + "learning_rate": 0.00010187853033048622, + "loss": 2.4902, + "step": 9931 + }, + { + "epoch": 0.8015495117423936, + "grad_norm": 0.7479971051216125, + "learning_rate": 0.0001018627461951323, + "loss": 2.5703, + "step": 9932 + }, + { + "epoch": 0.8016302154789767, + "grad_norm": 0.7339857220649719, + "learning_rate": 0.00010184696201335387, + "loss": 2.5744, + "step": 9933 + }, + { + "epoch": 0.8017109192155597, + "grad_norm": 0.6741397380828857, + "learning_rate": 0.00010183117778554432, + "loss": 2.5777, + "step": 9934 + }, + { + "epoch": 0.8017916229521427, + "grad_norm": 0.6731706857681274, + "learning_rate": 0.00010181539351209699, + "loss": 2.5438, + "step": 9935 + }, + { + "epoch": 0.8018723266887257, + "grad_norm": 0.6929418444633484, + "learning_rate": 0.00010179960919340535, + 
"loss": 2.5308, + "step": 9936 + }, + { + "epoch": 0.8019530304253087, + "grad_norm": 0.7383175492286682, + "learning_rate": 0.00010178382482986271, + "loss": 2.5623, + "step": 9937 + }, + { + "epoch": 0.8020337341618917, + "grad_norm": 0.6872193217277527, + "learning_rate": 0.00010176804042186252, + "loss": 2.5271, + "step": 9938 + }, + { + "epoch": 0.8021144378984747, + "grad_norm": 0.7354295253753662, + "learning_rate": 0.00010175225596979816, + "loss": 2.5122, + "step": 9939 + }, + { + "epoch": 0.8021951416350577, + "grad_norm": 0.7589237689971924, + "learning_rate": 0.00010173647147406297, + "loss": 2.5529, + "step": 9940 + }, + { + "epoch": 0.8022758453716408, + "grad_norm": 0.6998353004455566, + "learning_rate": 0.00010172068693505037, + "loss": 2.4683, + "step": 9941 + }, + { + "epoch": 0.8023565491082237, + "grad_norm": 0.6816055178642273, + "learning_rate": 0.00010170490235315377, + "loss": 2.567, + "step": 9942 + }, + { + "epoch": 0.8024372528448067, + "grad_norm": 0.7188318371772766, + "learning_rate": 0.00010168911772876652, + "loss": 2.5631, + "step": 9943 + }, + { + "epoch": 0.8025179565813897, + "grad_norm": 0.6925922632217407, + "learning_rate": 0.00010167333306228209, + "loss": 2.4872, + "step": 9944 + }, + { + "epoch": 0.8025986603179727, + "grad_norm": 0.7081493735313416, + "learning_rate": 0.00010165754835409377, + "loss": 2.5482, + "step": 9945 + }, + { + "epoch": 0.8026793640545558, + "grad_norm": 0.6838935613632202, + "learning_rate": 0.00010164176360459505, + "loss": 2.541, + "step": 9946 + }, + { + "epoch": 0.8027600677911387, + "grad_norm": 0.6959214210510254, + "learning_rate": 0.00010162597881417928, + "loss": 2.4574, + "step": 9947 + }, + { + "epoch": 0.8028407715277217, + "grad_norm": 0.693004310131073, + "learning_rate": 0.00010161019398323986, + "loss": 2.5553, + "step": 9948 + }, + { + "epoch": 0.8029214752643047, + "grad_norm": 0.6683690547943115, + "learning_rate": 0.00010159440911217022, + "loss": 2.5501, + "step": 9949 + }, + { 
+ "epoch": 0.8030021790008878, + "grad_norm": 0.6797001361846924, + "learning_rate": 0.0001015786242013637, + "loss": 2.5731, + "step": 9950 + }, + { + "epoch": 0.8030828827374707, + "grad_norm": 0.6621012091636658, + "learning_rate": 0.00010156283925121375, + "loss": 2.5278, + "step": 9951 + }, + { + "epoch": 0.8031635864740537, + "grad_norm": 0.7024650573730469, + "learning_rate": 0.00010154705426211377, + "loss": 2.5939, + "step": 9952 + }, + { + "epoch": 0.8032442902106367, + "grad_norm": 0.6756548285484314, + "learning_rate": 0.00010153126923445714, + "loss": 2.5797, + "step": 9953 + }, + { + "epoch": 0.8033249939472198, + "grad_norm": 0.6560662984848022, + "learning_rate": 0.00010151548416863732, + "loss": 2.5358, + "step": 9954 + }, + { + "epoch": 0.8034056976838028, + "grad_norm": 0.7172456979751587, + "learning_rate": 0.00010149969906504766, + "loss": 2.5054, + "step": 9955 + }, + { + "epoch": 0.8034864014203857, + "grad_norm": 0.6379461288452148, + "learning_rate": 0.00010148391392408152, + "loss": 2.5341, + "step": 9956 + }, + { + "epoch": 0.8035671051569687, + "grad_norm": 0.6553892493247986, + "learning_rate": 0.00010146812874613243, + "loss": 2.5618, + "step": 9957 + }, + { + "epoch": 0.8036478088935518, + "grad_norm": 0.6940072178840637, + "learning_rate": 0.00010145234353159372, + "loss": 2.5686, + "step": 9958 + }, + { + "epoch": 0.8037285126301348, + "grad_norm": 0.6641896963119507, + "learning_rate": 0.00010143655828085878, + "loss": 2.5188, + "step": 9959 + }, + { + "epoch": 0.8038092163667178, + "grad_norm": 0.6622887253761292, + "learning_rate": 0.00010142077299432111, + "loss": 2.54, + "step": 9960 + }, + { + "epoch": 0.8038899201033007, + "grad_norm": 0.7216808795928955, + "learning_rate": 0.000101404987672374, + "loss": 2.5775, + "step": 9961 + }, + { + "epoch": 0.8039706238398838, + "grad_norm": 0.6544952988624573, + "learning_rate": 0.00010138920231541095, + "loss": 2.6066, + "step": 9962 + }, + { + "epoch": 0.8040513275764668, + 
"grad_norm": 0.6869354248046875, + "learning_rate": 0.00010137341692382539, + "loss": 2.5157, + "step": 9963 + }, + { + "epoch": 0.8041320313130498, + "grad_norm": 0.6731898784637451, + "learning_rate": 0.00010135763149801063, + "loss": 2.4369, + "step": 9964 + }, + { + "epoch": 0.8042127350496328, + "grad_norm": 0.6943373084068298, + "learning_rate": 0.00010134184603836017, + "loss": 2.5529, + "step": 9965 + }, + { + "epoch": 0.8042934387862158, + "grad_norm": 0.729928195476532, + "learning_rate": 0.00010132606054526739, + "loss": 2.5814, + "step": 9966 + }, + { + "epoch": 0.8043741425227988, + "grad_norm": 0.6491130590438843, + "learning_rate": 0.00010131027501912571, + "loss": 2.5246, + "step": 9967 + }, + { + "epoch": 0.8044548462593818, + "grad_norm": 0.747756838798523, + "learning_rate": 0.00010129448946032857, + "loss": 2.513, + "step": 9968 + }, + { + "epoch": 0.8045355499959648, + "grad_norm": 0.6449645757675171, + "learning_rate": 0.00010127870386926935, + "loss": 2.5232, + "step": 9969 + }, + { + "epoch": 0.8046162537325479, + "grad_norm": 0.6425037980079651, + "learning_rate": 0.0001012629182463415, + "loss": 2.5065, + "step": 9970 + }, + { + "epoch": 0.8046969574691308, + "grad_norm": 0.7340624332427979, + "learning_rate": 0.00010124713259193843, + "loss": 2.5325, + "step": 9971 + }, + { + "epoch": 0.8047776612057138, + "grad_norm": 0.7308940291404724, + "learning_rate": 0.00010123134690645352, + "loss": 2.5717, + "step": 9972 + }, + { + "epoch": 0.8048583649422968, + "grad_norm": 0.7128338813781738, + "learning_rate": 0.00010121556119028028, + "loss": 2.5548, + "step": 9973 + }, + { + "epoch": 0.8049390686788799, + "grad_norm": 0.7027677893638611, + "learning_rate": 0.00010119977544381207, + "loss": 2.5311, + "step": 9974 + }, + { + "epoch": 0.8050197724154629, + "grad_norm": 0.7022054195404053, + "learning_rate": 0.00010118398966744229, + "loss": 2.5177, + "step": 9975 + }, + { + "epoch": 0.8051004761520458, + "grad_norm": 0.7382696270942688, + 
"learning_rate": 0.00010116820386156441, + "loss": 2.532, + "step": 9976 + }, + { + "epoch": 0.8051811798886288, + "grad_norm": 0.6968613862991333, + "learning_rate": 0.00010115241802657181, + "loss": 2.536, + "step": 9977 + }, + { + "epoch": 0.8052618836252119, + "grad_norm": 0.8277899026870728, + "learning_rate": 0.00010113663216285798, + "loss": 2.5963, + "step": 9978 + }, + { + "epoch": 0.8053425873617949, + "grad_norm": 0.677707314491272, + "learning_rate": 0.00010112084627081629, + "loss": 2.5041, + "step": 9979 + }, + { + "epoch": 0.8054232910983778, + "grad_norm": 0.6943314075469971, + "learning_rate": 0.00010110506035084017, + "loss": 2.4776, + "step": 9980 + }, + { + "epoch": 0.8055039948349608, + "grad_norm": 0.6948177218437195, + "learning_rate": 0.00010108927440332306, + "loss": 2.5306, + "step": 9981 + }, + { + "epoch": 0.8055846985715439, + "grad_norm": 0.6873918771743774, + "learning_rate": 0.0001010734884286584, + "loss": 2.5783, + "step": 9982 + }, + { + "epoch": 0.8056654023081269, + "grad_norm": 0.6370649933815002, + "learning_rate": 0.00010105770242723958, + "loss": 2.5584, + "step": 9983 + }, + { + "epoch": 0.8057461060447099, + "grad_norm": 0.7594422698020935, + "learning_rate": 0.00010104191639946008, + "loss": 2.543, + "step": 9984 + }, + { + "epoch": 0.8058268097812928, + "grad_norm": 0.697380542755127, + "learning_rate": 0.00010102613034571327, + "loss": 2.5295, + "step": 9985 + }, + { + "epoch": 0.8059075135178759, + "grad_norm": 0.6597251892089844, + "learning_rate": 0.00010101034426639264, + "loss": 2.5917, + "step": 9986 + }, + { + "epoch": 0.8059882172544589, + "grad_norm": 0.6583479046821594, + "learning_rate": 0.00010099455816189156, + "loss": 2.6206, + "step": 9987 + }, + { + "epoch": 0.8060689209910419, + "grad_norm": 0.6603943705558777, + "learning_rate": 0.00010097877203260349, + "loss": 2.5223, + "step": 9988 + }, + { + "epoch": 0.8061496247276249, + "grad_norm": 0.716454267501831, + "learning_rate": 0.00010096298587892188, + 
"loss": 2.5572, + "step": 9989 + }, + { + "epoch": 0.806230328464208, + "grad_norm": 0.6511488556861877, + "learning_rate": 0.00010094719970124016, + "loss": 2.5815, + "step": 9990 + }, + { + "epoch": 0.8063110322007909, + "grad_norm": 0.6969261169433594, + "learning_rate": 0.00010093141349995173, + "loss": 2.5902, + "step": 9991 + }, + { + "epoch": 0.8063917359373739, + "grad_norm": 0.7012695074081421, + "learning_rate": 0.00010091562727545001, + "loss": 2.5134, + "step": 9992 + }, + { + "epoch": 0.8064724396739569, + "grad_norm": 0.6368406414985657, + "learning_rate": 0.00010089984102812848, + "loss": 2.568, + "step": 9993 + }, + { + "epoch": 0.80655314341054, + "grad_norm": 0.6552153825759888, + "learning_rate": 0.00010088405475838059, + "loss": 2.5101, + "step": 9994 + }, + { + "epoch": 0.8066338471471229, + "grad_norm": 0.6949633359909058, + "learning_rate": 0.00010086826846659974, + "loss": 2.5427, + "step": 9995 + }, + { + "epoch": 0.8067145508837059, + "grad_norm": 0.6593093872070312, + "learning_rate": 0.00010085248215317935, + "loss": 2.5551, + "step": 9996 + }, + { + "epoch": 0.8067952546202889, + "grad_norm": 0.6963745355606079, + "learning_rate": 0.00010083669581851287, + "loss": 2.4956, + "step": 9997 + }, + { + "epoch": 0.8068759583568719, + "grad_norm": 0.7093523144721985, + "learning_rate": 0.00010082090946299377, + "loss": 2.5876, + "step": 9998 + }, + { + "epoch": 0.806956662093455, + "grad_norm": 0.6796671152114868, + "learning_rate": 0.00010080512308701544, + "loss": 2.5302, + "step": 9999 + }, + { + "epoch": 0.8070373658300379, + "grad_norm": 0.7170542478561401, + "learning_rate": 0.00010078933669097135, + "loss": 2.5886, + "step": 10000 + }, + { + "epoch": 0.8070373658300379, + "eval_loss": 2.4734926223754883, + "eval_runtime": 788.2594, + "eval_samples_per_second": 3.324, + "eval_steps_per_second": 0.554, + "step": 10000 + }, + { + "epoch": 0.8071180695666209, + "grad_norm": 0.6566126346588135, + "learning_rate": 0.0001007735502752549, + 
"loss": 2.4441, + "step": 10001 + }, + { + "epoch": 0.8071987733032039, + "grad_norm": 0.6739515662193298, + "learning_rate": 0.00010075776384025957, + "loss": 2.5767, + "step": 10002 + }, + { + "epoch": 0.807279477039787, + "grad_norm": 0.6334208846092224, + "learning_rate": 0.00010074197738637881, + "loss": 2.5321, + "step": 10003 + }, + { + "epoch": 0.80736018077637, + "grad_norm": 0.6764520406723022, + "learning_rate": 0.000100726190914006, + "loss": 2.5144, + "step": 10004 + }, + { + "epoch": 0.8074408845129529, + "grad_norm": 0.7090082764625549, + "learning_rate": 0.00010071040442353464, + "loss": 2.5626, + "step": 10005 + }, + { + "epoch": 0.8075215882495359, + "grad_norm": 0.6915304064750671, + "learning_rate": 0.00010069461791535814, + "loss": 2.5261, + "step": 10006 + }, + { + "epoch": 0.807602291986119, + "grad_norm": 0.6685747504234314, + "learning_rate": 0.00010067883138986991, + "loss": 2.492, + "step": 10007 + }, + { + "epoch": 0.807682995722702, + "grad_norm": 0.7179074883460999, + "learning_rate": 0.00010066304484746347, + "loss": 2.4601, + "step": 10008 + }, + { + "epoch": 0.807763699459285, + "grad_norm": 0.7032761573791504, + "learning_rate": 0.00010064725828853219, + "loss": 2.578, + "step": 10009 + }, + { + "epoch": 0.8078444031958679, + "grad_norm": 0.710322916507721, + "learning_rate": 0.00010063147171346959, + "loss": 2.5514, + "step": 10010 + }, + { + "epoch": 0.807925106932451, + "grad_norm": 0.6552841067314148, + "learning_rate": 0.00010061568512266903, + "loss": 2.5474, + "step": 10011 + }, + { + "epoch": 0.808005810669034, + "grad_norm": 0.6862452626228333, + "learning_rate": 0.00010059989851652398, + "loss": 2.5772, + "step": 10012 + }, + { + "epoch": 0.808086514405617, + "grad_norm": 0.7123851180076599, + "learning_rate": 0.00010058411189542788, + "loss": 2.4936, + "step": 10013 + }, + { + "epoch": 0.8081672181421999, + "grad_norm": 0.6889944672584534, + "learning_rate": 0.00010056832525977422, + "loss": 2.5041, + "step": 10014 + }, 
+ { + "epoch": 0.808247921878783, + "grad_norm": 0.6986924409866333, + "learning_rate": 0.0001005525386099564, + "loss": 2.5591, + "step": 10015 + }, + { + "epoch": 0.808328625615366, + "grad_norm": 0.6935306787490845, + "learning_rate": 0.00010053675194636787, + "loss": 2.5423, + "step": 10016 + }, + { + "epoch": 0.808409329351949, + "grad_norm": 0.6751969456672668, + "learning_rate": 0.00010052096526940207, + "loss": 2.5666, + "step": 10017 + }, + { + "epoch": 0.808490033088532, + "grad_norm": 0.676909327507019, + "learning_rate": 0.00010050517857945243, + "loss": 2.5394, + "step": 10018 + }, + { + "epoch": 0.808570736825115, + "grad_norm": 0.7439377307891846, + "learning_rate": 0.00010048939187691246, + "loss": 2.5011, + "step": 10019 + }, + { + "epoch": 0.808651440561698, + "grad_norm": 0.6594791412353516, + "learning_rate": 0.00010047360516217554, + "loss": 2.5159, + "step": 10020 + }, + { + "epoch": 0.808732144298281, + "grad_norm": 0.7013304233551025, + "learning_rate": 0.00010045781843563517, + "loss": 2.5439, + "step": 10021 + }, + { + "epoch": 0.808812848034864, + "grad_norm": 0.7537491917610168, + "learning_rate": 0.00010044203169768476, + "loss": 2.5837, + "step": 10022 + }, + { + "epoch": 0.8088935517714471, + "grad_norm": 0.7273866534233093, + "learning_rate": 0.00010042624494871773, + "loss": 2.5546, + "step": 10023 + }, + { + "epoch": 0.80897425550803, + "grad_norm": 0.6716369986534119, + "learning_rate": 0.0001004104581891276, + "loss": 2.5264, + "step": 10024 + }, + { + "epoch": 0.809054959244613, + "grad_norm": 0.7544769644737244, + "learning_rate": 0.00010039467141930777, + "loss": 2.5502, + "step": 10025 + }, + { + "epoch": 0.809135662981196, + "grad_norm": 0.8713179230690002, + "learning_rate": 0.0001003788846396517, + "loss": 2.5178, + "step": 10026 + }, + { + "epoch": 0.8092163667177791, + "grad_norm": 0.6704887747764587, + "learning_rate": 0.00010036309785055283, + "loss": 2.5136, + "step": 10027 + }, + { + "epoch": 0.809297070454362, + 
"grad_norm": 0.7308552861213684, + "learning_rate": 0.00010034731105240458, + "loss": 2.4781, + "step": 10028 + }, + { + "epoch": 0.809377774190945, + "grad_norm": 0.7214144468307495, + "learning_rate": 0.00010033152424560049, + "loss": 2.5946, + "step": 10029 + }, + { + "epoch": 0.809458477927528, + "grad_norm": 0.6946821808815002, + "learning_rate": 0.00010031573743053393, + "loss": 2.4937, + "step": 10030 + }, + { + "epoch": 0.8095391816641111, + "grad_norm": 0.7348416447639465, + "learning_rate": 0.00010029995060759833, + "loss": 2.5959, + "step": 10031 + }, + { + "epoch": 0.8096198854006941, + "grad_norm": 0.7482579350471497, + "learning_rate": 0.00010028416377718721, + "loss": 2.6, + "step": 10032 + }, + { + "epoch": 0.809700589137277, + "grad_norm": 0.7114939093589783, + "learning_rate": 0.00010026837693969397, + "loss": 2.5376, + "step": 10033 + }, + { + "epoch": 0.80978129287386, + "grad_norm": 0.6559228897094727, + "learning_rate": 0.00010025259009551209, + "loss": 2.4961, + "step": 10034 + }, + { + "epoch": 0.8098619966104431, + "grad_norm": 0.7494906187057495, + "learning_rate": 0.00010023680324503501, + "loss": 2.5723, + "step": 10035 + }, + { + "epoch": 0.8099427003470261, + "grad_norm": 0.7207093834877014, + "learning_rate": 0.00010022101638865618, + "loss": 2.5523, + "step": 10036 + }, + { + "epoch": 0.8100234040836091, + "grad_norm": 0.6730504035949707, + "learning_rate": 0.00010020522952676903, + "loss": 2.5135, + "step": 10037 + }, + { + "epoch": 0.810104107820192, + "grad_norm": 0.6805168390274048, + "learning_rate": 0.000100189442659767, + "loss": 2.5598, + "step": 10038 + }, + { + "epoch": 0.8101848115567751, + "grad_norm": 0.6639137268066406, + "learning_rate": 0.00010017365578804358, + "loss": 2.5152, + "step": 10039 + }, + { + "epoch": 0.8102655152933581, + "grad_norm": 0.6604194641113281, + "learning_rate": 0.00010015786891199221, + "loss": 2.5302, + "step": 10040 + }, + { + "epoch": 0.8103462190299411, + "grad_norm": 0.7664934992790222, + 
"learning_rate": 0.00010014208203200634, + "loss": 2.5437, + "step": 10041 + }, + { + "epoch": 0.8104269227665241, + "grad_norm": 0.7404079437255859, + "learning_rate": 0.00010012629514847942, + "loss": 2.6559, + "step": 10042 + }, + { + "epoch": 0.8105076265031071, + "grad_norm": 0.694006085395813, + "learning_rate": 0.00010011050826180488, + "loss": 2.5571, + "step": 10043 + }, + { + "epoch": 0.8105883302396901, + "grad_norm": 0.7007058262825012, + "learning_rate": 0.00010009472137237616, + "loss": 2.5639, + "step": 10044 + }, + { + "epoch": 0.8106690339762731, + "grad_norm": 0.7331913113594055, + "learning_rate": 0.00010007893448058678, + "loss": 2.5499, + "step": 10045 + }, + { + "epoch": 0.8107497377128561, + "grad_norm": 0.7636487483978271, + "learning_rate": 0.00010006314758683015, + "loss": 2.6068, + "step": 10046 + }, + { + "epoch": 0.810830441449439, + "grad_norm": 0.6505223512649536, + "learning_rate": 0.0001000473606914997, + "loss": 2.5313, + "step": 10047 + }, + { + "epoch": 0.8109111451860221, + "grad_norm": 0.6425966620445251, + "learning_rate": 0.00010003157379498886, + "loss": 2.5998, + "step": 10048 + }, + { + "epoch": 0.8109918489226051, + "grad_norm": 0.7163281440734863, + "learning_rate": 0.00010001578689769116, + "loss": 2.5493, + "step": 10049 + }, + { + "epoch": 0.8110725526591881, + "grad_norm": 0.7345306873321533, + "learning_rate": 0.0001, + "loss": 2.5609, + "step": 10050 + }, + { + "epoch": 0.8111532563957711, + "grad_norm": 0.6808427572250366, + "learning_rate": 9.998421310230884e-05, + "loss": 2.4823, + "step": 10051 + }, + { + "epoch": 0.8112339601323542, + "grad_norm": 0.7456082105636597, + "learning_rate": 9.996842620501115e-05, + "loss": 2.4782, + "step": 10052 + }, + { + "epoch": 0.8113146638689371, + "grad_norm": 0.7061728239059448, + "learning_rate": 9.995263930850034e-05, + "loss": 2.4906, + "step": 10053 + }, + { + "epoch": 0.8113953676055201, + "grad_norm": 0.691663920879364, + "learning_rate": 9.993685241316986e-05, + 
"loss": 2.5842, + "step": 10054 + }, + { + "epoch": 0.8114760713421031, + "grad_norm": 0.6899400353431702, + "learning_rate": 9.992106551941325e-05, + "loss": 2.5628, + "step": 10055 + }, + { + "epoch": 0.8115567750786862, + "grad_norm": 0.6909289360046387, + "learning_rate": 9.990527862762385e-05, + "loss": 2.5173, + "step": 10056 + }, + { + "epoch": 0.8116374788152692, + "grad_norm": 0.6507968306541443, + "learning_rate": 9.988949173819514e-05, + "loss": 2.5763, + "step": 10057 + }, + { + "epoch": 0.8117181825518521, + "grad_norm": 0.6972371339797974, + "learning_rate": 9.98737048515206e-05, + "loss": 2.604, + "step": 10058 + }, + { + "epoch": 0.8117988862884351, + "grad_norm": 0.6500107049942017, + "learning_rate": 9.985791796799368e-05, + "loss": 2.509, + "step": 10059 + }, + { + "epoch": 0.8118795900250182, + "grad_norm": 0.704501211643219, + "learning_rate": 9.98421310880078e-05, + "loss": 2.5773, + "step": 10060 + }, + { + "epoch": 0.8119602937616012, + "grad_norm": 0.7037203311920166, + "learning_rate": 9.982634421195641e-05, + "loss": 2.5968, + "step": 10061 + }, + { + "epoch": 0.8120409974981841, + "grad_norm": 0.7161232829093933, + "learning_rate": 9.981055734023304e-05, + "loss": 2.5373, + "step": 10062 + }, + { + "epoch": 0.8121217012347671, + "grad_norm": 0.6602928638458252, + "learning_rate": 9.979477047323099e-05, + "loss": 2.5851, + "step": 10063 + }, + { + "epoch": 0.8122024049713502, + "grad_norm": 0.6685947775840759, + "learning_rate": 9.977898361134383e-05, + "loss": 2.5543, + "step": 10064 + }, + { + "epoch": 0.8122831087079332, + "grad_norm": 0.6772760152816772, + "learning_rate": 9.976319675496502e-05, + "loss": 2.5355, + "step": 10065 + }, + { + "epoch": 0.8123638124445162, + "grad_norm": 0.6140885949134827, + "learning_rate": 9.974740990448792e-05, + "loss": 2.489, + "step": 10066 + }, + { + "epoch": 0.8124445161810991, + "grad_norm": 0.6597142219543457, + "learning_rate": 9.973162306030604e-05, + "loss": 2.5619, + "step": 10067 + }, + { + 
"epoch": 0.8125252199176822, + "grad_norm": 0.6768592000007629, + "learning_rate": 9.971583622281281e-05, + "loss": 2.5107, + "step": 10068 + }, + { + "epoch": 0.8126059236542652, + "grad_norm": 0.682296633720398, + "learning_rate": 9.970004939240168e-05, + "loss": 2.5003, + "step": 10069 + }, + { + "epoch": 0.8126866273908482, + "grad_norm": 0.7356325387954712, + "learning_rate": 9.96842625694661e-05, + "loss": 2.5864, + "step": 10070 + }, + { + "epoch": 0.8127673311274312, + "grad_norm": 0.6818091869354248, + "learning_rate": 9.966847575439956e-05, + "loss": 2.5375, + "step": 10071 + }, + { + "epoch": 0.8128480348640142, + "grad_norm": 0.6954368352890015, + "learning_rate": 9.965268894759543e-05, + "loss": 2.5314, + "step": 10072 + }, + { + "epoch": 0.8129287386005972, + "grad_norm": 0.6759306192398071, + "learning_rate": 9.963690214944721e-05, + "loss": 2.5881, + "step": 10073 + }, + { + "epoch": 0.8130094423371802, + "grad_norm": 0.6546545624732971, + "learning_rate": 9.962111536034832e-05, + "loss": 2.5264, + "step": 10074 + }, + { + "epoch": 0.8130901460737632, + "grad_norm": 0.6709586977958679, + "learning_rate": 9.960532858069226e-05, + "loss": 2.5906, + "step": 10075 + }, + { + "epoch": 0.8131708498103463, + "grad_norm": 0.7310851812362671, + "learning_rate": 9.958954181087241e-05, + "loss": 2.5134, + "step": 10076 + }, + { + "epoch": 0.8132515535469292, + "grad_norm": 0.6793027520179749, + "learning_rate": 9.957375505128227e-05, + "loss": 2.5387, + "step": 10077 + }, + { + "epoch": 0.8133322572835122, + "grad_norm": 0.6965875029563904, + "learning_rate": 9.955796830231528e-05, + "loss": 2.5649, + "step": 10078 + }, + { + "epoch": 0.8134129610200952, + "grad_norm": 0.6597574353218079, + "learning_rate": 9.954218156436485e-05, + "loss": 2.5281, + "step": 10079 + }, + { + "epoch": 0.8134936647566783, + "grad_norm": 0.7911555171012878, + "learning_rate": 9.952639483782445e-05, + "loss": 2.535, + "step": 10080 + }, + { + "epoch": 0.8135743684932613, + 
"grad_norm": 0.7405688762664795, + "learning_rate": 9.951060812308757e-05, + "loss": 2.5303, + "step": 10081 + }, + { + "epoch": 0.8136550722298442, + "grad_norm": 0.6961480379104614, + "learning_rate": 9.949482142054758e-05, + "loss": 2.4959, + "step": 10082 + }, + { + "epoch": 0.8137357759664272, + "grad_norm": 0.6761718392372131, + "learning_rate": 9.947903473059797e-05, + "loss": 2.5591, + "step": 10083 + }, + { + "epoch": 0.8138164797030103, + "grad_norm": 0.7383104562759399, + "learning_rate": 9.946324805363218e-05, + "loss": 2.5848, + "step": 10084 + }, + { + "epoch": 0.8138971834395933, + "grad_norm": 0.6495873928070068, + "learning_rate": 9.944746139004364e-05, + "loss": 2.4972, + "step": 10085 + }, + { + "epoch": 0.8139778871761763, + "grad_norm": 0.7247152328491211, + "learning_rate": 9.94316747402258e-05, + "loss": 2.5361, + "step": 10086 + }, + { + "epoch": 0.8140585909127592, + "grad_norm": 0.6965751051902771, + "learning_rate": 9.941588810457215e-05, + "loss": 2.4997, + "step": 10087 + }, + { + "epoch": 0.8141392946493423, + "grad_norm": 0.7138223648071289, + "learning_rate": 9.940010148347603e-05, + "loss": 2.5226, + "step": 10088 + }, + { + "epoch": 0.8142199983859253, + "grad_norm": 0.6571210622787476, + "learning_rate": 9.938431487733099e-05, + "loss": 2.5388, + "step": 10089 + }, + { + "epoch": 0.8143007021225083, + "grad_norm": 0.6721277832984924, + "learning_rate": 9.936852828653042e-05, + "loss": 2.5219, + "step": 10090 + }, + { + "epoch": 0.8143814058590912, + "grad_norm": 0.647520124912262, + "learning_rate": 9.935274171146782e-05, + "loss": 2.6199, + "step": 10091 + }, + { + "epoch": 0.8144621095956743, + "grad_norm": 0.6892204284667969, + "learning_rate": 9.933695515253654e-05, + "loss": 2.5132, + "step": 10092 + }, + { + "epoch": 0.8145428133322573, + "grad_norm": 0.6979050636291504, + "learning_rate": 9.932116861013008e-05, + "loss": 2.5148, + "step": 10093 + }, + { + "epoch": 0.8146235170688403, + "grad_norm": 0.6682664752006531, + 
"learning_rate": 9.930538208464189e-05, + "loss": 2.5795, + "step": 10094 + }, + { + "epoch": 0.8147042208054233, + "grad_norm": 0.734121561050415, + "learning_rate": 9.928959557646537e-05, + "loss": 2.5469, + "step": 10095 + }, + { + "epoch": 0.8147849245420064, + "grad_norm": 0.6669620275497437, + "learning_rate": 9.9273809085994e-05, + "loss": 2.5277, + "step": 10096 + }, + { + "epoch": 0.8148656282785893, + "grad_norm": 0.6750600934028625, + "learning_rate": 9.925802261362124e-05, + "loss": 2.5869, + "step": 10097 + }, + { + "epoch": 0.8149463320151723, + "grad_norm": 0.6813061237335205, + "learning_rate": 9.924223615974044e-05, + "loss": 2.585, + "step": 10098 + }, + { + "epoch": 0.8150270357517553, + "grad_norm": 0.6775497794151306, + "learning_rate": 9.92264497247451e-05, + "loss": 2.5353, + "step": 10099 + }, + { + "epoch": 0.8151077394883383, + "grad_norm": 0.6877530813217163, + "learning_rate": 9.92106633090287e-05, + "loss": 2.5349, + "step": 10100 + }, + { + "epoch": 0.8151884432249213, + "grad_norm": 0.6984169483184814, + "learning_rate": 9.91948769129846e-05, + "loss": 2.5986, + "step": 10101 + }, + { + "epoch": 0.8152691469615043, + "grad_norm": 0.7144806981086731, + "learning_rate": 9.917909053700626e-05, + "loss": 2.5797, + "step": 10102 + }, + { + "epoch": 0.8153498506980873, + "grad_norm": 0.6494203209877014, + "learning_rate": 9.916330418148715e-05, + "loss": 2.5035, + "step": 10103 + }, + { + "epoch": 0.8154305544346703, + "grad_norm": 0.6669752597808838, + "learning_rate": 9.914751784682069e-05, + "loss": 2.5489, + "step": 10104 + }, + { + "epoch": 0.8155112581712534, + "grad_norm": 0.6557981371879578, + "learning_rate": 9.913173153340029e-05, + "loss": 2.5266, + "step": 10105 + }, + { + "epoch": 0.8155919619078363, + "grad_norm": 0.6633948087692261, + "learning_rate": 9.911594524161941e-05, + "loss": 2.5263, + "step": 10106 + }, + { + "epoch": 0.8156726656444193, + "grad_norm": 0.7191522717475891, + "learning_rate": 9.910015897187154e-05, + 
"loss": 2.5625, + "step": 10107 + }, + { + "epoch": 0.8157533693810023, + "grad_norm": 0.7089062929153442, + "learning_rate": 9.908437272455001e-05, + "loss": 2.5644, + "step": 10108 + }, + { + "epoch": 0.8158340731175854, + "grad_norm": 0.7662761211395264, + "learning_rate": 9.906858650004831e-05, + "loss": 2.5875, + "step": 10109 + }, + { + "epoch": 0.8159147768541684, + "grad_norm": 0.6658861041069031, + "learning_rate": 9.905280029875988e-05, + "loss": 2.5818, + "step": 10110 + }, + { + "epoch": 0.8159954805907513, + "grad_norm": 0.7229514718055725, + "learning_rate": 9.903701412107815e-05, + "loss": 2.5421, + "step": 10111 + }, + { + "epoch": 0.8160761843273343, + "grad_norm": 0.7295149564743042, + "learning_rate": 9.902122796739652e-05, + "loss": 2.5298, + "step": 10112 + }, + { + "epoch": 0.8161568880639174, + "grad_norm": 0.6805420517921448, + "learning_rate": 9.900544183810849e-05, + "loss": 2.6693, + "step": 10113 + }, + { + "epoch": 0.8162375918005004, + "grad_norm": 0.6560602188110352, + "learning_rate": 9.898965573360738e-05, + "loss": 2.5445, + "step": 10114 + }, + { + "epoch": 0.8163182955370833, + "grad_norm": 0.690396785736084, + "learning_rate": 9.897386965428674e-05, + "loss": 2.5281, + "step": 10115 + }, + { + "epoch": 0.8163989992736663, + "grad_norm": 0.6905054450035095, + "learning_rate": 9.895808360053998e-05, + "loss": 2.5406, + "step": 10116 + }, + { + "epoch": 0.8164797030102494, + "grad_norm": 0.6905301213264465, + "learning_rate": 9.894229757276045e-05, + "loss": 2.5458, + "step": 10117 + }, + { + "epoch": 0.8165604067468324, + "grad_norm": 0.6827620267868042, + "learning_rate": 9.892651157134162e-05, + "loss": 2.4403, + "step": 10118 + }, + { + "epoch": 0.8166411104834154, + "grad_norm": 0.7614343166351318, + "learning_rate": 9.891072559667697e-05, + "loss": 2.6369, + "step": 10119 + }, + { + "epoch": 0.8167218142199983, + "grad_norm": 0.6913704872131348, + "learning_rate": 9.889493964915985e-05, + "loss": 2.5914, + "step": 10120 + }, 
+ { + "epoch": 0.8168025179565814, + "grad_norm": 0.7026088237762451, + "learning_rate": 9.887915372918372e-05, + "loss": 2.5139, + "step": 10121 + }, + { + "epoch": 0.8168832216931644, + "grad_norm": 0.7064465284347534, + "learning_rate": 9.886336783714203e-05, + "loss": 2.549, + "step": 10122 + }, + { + "epoch": 0.8169639254297474, + "grad_norm": 0.7345553040504456, + "learning_rate": 9.884758197342821e-05, + "loss": 2.5887, + "step": 10123 + }, + { + "epoch": 0.8170446291663304, + "grad_norm": 0.6916251182556152, + "learning_rate": 9.883179613843563e-05, + "loss": 2.5659, + "step": 10124 + }, + { + "epoch": 0.8171253329029134, + "grad_norm": 0.6428200602531433, + "learning_rate": 9.881601033255771e-05, + "loss": 2.5379, + "step": 10125 + }, + { + "epoch": 0.8172060366394964, + "grad_norm": 0.7433571815490723, + "learning_rate": 9.880022455618796e-05, + "loss": 2.5751, + "step": 10126 + }, + { + "epoch": 0.8172867403760794, + "grad_norm": 0.733256995677948, + "learning_rate": 9.878443880971974e-05, + "loss": 2.4971, + "step": 10127 + }, + { + "epoch": 0.8173674441126624, + "grad_norm": 0.708289384841919, + "learning_rate": 9.876865309354646e-05, + "loss": 2.635, + "step": 10128 + }, + { + "epoch": 0.8174481478492455, + "grad_norm": 0.6877188682556152, + "learning_rate": 9.87528674080616e-05, + "loss": 2.5827, + "step": 10129 + }, + { + "epoch": 0.8175288515858284, + "grad_norm": 0.7108712792396545, + "learning_rate": 9.873708175365852e-05, + "loss": 2.5643, + "step": 10130 + }, + { + "epoch": 0.8176095553224114, + "grad_norm": 0.7435629367828369, + "learning_rate": 9.872129613073065e-05, + "loss": 2.5267, + "step": 10131 + }, + { + "epoch": 0.8176902590589944, + "grad_norm": 0.669913113117218, + "learning_rate": 9.870551053967148e-05, + "loss": 2.5684, + "step": 10132 + }, + { + "epoch": 0.8177709627955775, + "grad_norm": 0.6981424689292908, + "learning_rate": 9.868972498087431e-05, + "loss": 2.592, + "step": 10133 + }, + { + "epoch": 0.8178516665321605, + 
"grad_norm": 0.6661834716796875, + "learning_rate": 9.867393945473263e-05, + "loss": 2.5082, + "step": 10134 + }, + { + "epoch": 0.8179323702687434, + "grad_norm": 0.6611261367797852, + "learning_rate": 9.865815396163987e-05, + "loss": 2.556, + "step": 10135 + }, + { + "epoch": 0.8180130740053264, + "grad_norm": 0.6732283234596252, + "learning_rate": 9.86423685019894e-05, + "loss": 2.5668, + "step": 10136 + }, + { + "epoch": 0.8180937777419095, + "grad_norm": 0.6768637299537659, + "learning_rate": 9.862658307617465e-05, + "loss": 2.5467, + "step": 10137 + }, + { + "epoch": 0.8181744814784925, + "grad_norm": 0.6943596601486206, + "learning_rate": 9.861079768458904e-05, + "loss": 2.5989, + "step": 10138 + }, + { + "epoch": 0.8182551852150755, + "grad_norm": 0.7369638681411743, + "learning_rate": 9.859501232762601e-05, + "loss": 2.5189, + "step": 10139 + }, + { + "epoch": 0.8183358889516584, + "grad_norm": 0.7443112730979919, + "learning_rate": 9.857922700567892e-05, + "loss": 2.5979, + "step": 10140 + }, + { + "epoch": 0.8184165926882415, + "grad_norm": 0.6726163029670715, + "learning_rate": 9.85634417191412e-05, + "loss": 2.5451, + "step": 10141 + }, + { + "epoch": 0.8184972964248245, + "grad_norm": 0.720492422580719, + "learning_rate": 9.854765646840632e-05, + "loss": 2.6116, + "step": 10142 + }, + { + "epoch": 0.8185780001614075, + "grad_norm": 0.6998233795166016, + "learning_rate": 9.85318712538676e-05, + "loss": 2.556, + "step": 10143 + }, + { + "epoch": 0.8186587038979904, + "grad_norm": 0.7580110430717468, + "learning_rate": 9.851608607591848e-05, + "loss": 2.5222, + "step": 10144 + }, + { + "epoch": 0.8187394076345735, + "grad_norm": 0.6893007755279541, + "learning_rate": 9.85003009349524e-05, + "loss": 2.4639, + "step": 10145 + }, + { + "epoch": 0.8188201113711565, + "grad_norm": 0.6448441743850708, + "learning_rate": 9.84845158313627e-05, + "loss": 2.5249, + "step": 10146 + }, + { + "epoch": 0.8189008151077395, + "grad_norm": 0.7591872215270996, + 
"learning_rate": 9.846873076554285e-05, + "loss": 2.5173, + "step": 10147 + }, + { + "epoch": 0.8189815188443225, + "grad_norm": 0.6994685530662537, + "learning_rate": 9.845294573788626e-05, + "loss": 2.5181, + "step": 10148 + }, + { + "epoch": 0.8190622225809054, + "grad_norm": 0.6822378635406494, + "learning_rate": 9.843716074878628e-05, + "loss": 2.5109, + "step": 10149 + }, + { + "epoch": 0.8191429263174885, + "grad_norm": 0.6730359792709351, + "learning_rate": 9.842137579863632e-05, + "loss": 2.5402, + "step": 10150 + }, + { + "epoch": 0.8192236300540715, + "grad_norm": 0.6280627846717834, + "learning_rate": 9.840559088782984e-05, + "loss": 2.4806, + "step": 10151 + }, + { + "epoch": 0.8193043337906545, + "grad_norm": 0.6887876391410828, + "learning_rate": 9.838980601676017e-05, + "loss": 2.5498, + "step": 10152 + }, + { + "epoch": 0.8193850375272375, + "grad_norm": 0.7823790907859802, + "learning_rate": 9.837402118582075e-05, + "loss": 2.467, + "step": 10153 + }, + { + "epoch": 0.8194657412638205, + "grad_norm": 0.8109384179115295, + "learning_rate": 9.835823639540496e-05, + "loss": 2.5898, + "step": 10154 + }, + { + "epoch": 0.8195464450004035, + "grad_norm": 0.6883066892623901, + "learning_rate": 9.834245164590624e-05, + "loss": 2.5589, + "step": 10155 + }, + { + "epoch": 0.8196271487369865, + "grad_norm": 0.7291175723075867, + "learning_rate": 9.832666693771794e-05, + "loss": 2.5317, + "step": 10156 + }, + { + "epoch": 0.8197078524735695, + "grad_norm": 0.6819449663162231, + "learning_rate": 9.831088227123346e-05, + "loss": 2.5513, + "step": 10157 + }, + { + "epoch": 0.8197885562101526, + "grad_norm": 0.7038870453834534, + "learning_rate": 9.829509764684626e-05, + "loss": 2.5301, + "step": 10158 + }, + { + "epoch": 0.8198692599467355, + "grad_norm": 0.7483033537864685, + "learning_rate": 9.827931306494965e-05, + "loss": 2.5273, + "step": 10159 + }, + { + "epoch": 0.8199499636833185, + "grad_norm": 0.6998303532600403, + "learning_rate": 
9.826352852593705e-05, + "loss": 2.5083, + "step": 10160 + }, + { + "epoch": 0.8200306674199015, + "grad_norm": 0.6865512728691101, + "learning_rate": 9.824774403020188e-05, + "loss": 2.5693, + "step": 10161 + }, + { + "epoch": 0.8201113711564846, + "grad_norm": 0.8144257068634033, + "learning_rate": 9.823195957813749e-05, + "loss": 2.6052, + "step": 10162 + }, + { + "epoch": 0.8201920748930676, + "grad_norm": 0.6920810341835022, + "learning_rate": 9.821617517013729e-05, + "loss": 2.5467, + "step": 10163 + }, + { + "epoch": 0.8202727786296505, + "grad_norm": 0.7538061141967773, + "learning_rate": 9.820039080659469e-05, + "loss": 2.5933, + "step": 10164 + }, + { + "epoch": 0.8203534823662335, + "grad_norm": 0.6744310259819031, + "learning_rate": 9.818460648790302e-05, + "loss": 2.5633, + "step": 10165 + }, + { + "epoch": 0.8204341861028166, + "grad_norm": 0.6943854689598083, + "learning_rate": 9.816882221445571e-05, + "loss": 2.5868, + "step": 10166 + }, + { + "epoch": 0.8205148898393996, + "grad_norm": 0.6486902832984924, + "learning_rate": 9.815303798664614e-05, + "loss": 2.4983, + "step": 10167 + }, + { + "epoch": 0.8205955935759826, + "grad_norm": 0.6699065566062927, + "learning_rate": 9.813725380486773e-05, + "loss": 2.563, + "step": 10168 + }, + { + "epoch": 0.8206762973125655, + "grad_norm": 0.6547110080718994, + "learning_rate": 9.812146966951379e-05, + "loss": 2.5404, + "step": 10169 + }, + { + "epoch": 0.8207570010491486, + "grad_norm": 0.692592203617096, + "learning_rate": 9.810568558097774e-05, + "loss": 2.5625, + "step": 10170 + }, + { + "epoch": 0.8208377047857316, + "grad_norm": 0.6696702837944031, + "learning_rate": 9.808990153965296e-05, + "loss": 2.5866, + "step": 10171 + }, + { + "epoch": 0.8209184085223146, + "grad_norm": 0.6425998210906982, + "learning_rate": 9.807411754593282e-05, + "loss": 2.5487, + "step": 10172 + }, + { + "epoch": 0.8209991122588975, + "grad_norm": 0.6849769949913025, + "learning_rate": 9.805833360021069e-05, + "loss": 
2.5772, + "step": 10173 + }, + { + "epoch": 0.8210798159954806, + "grad_norm": 0.7451414465904236, + "learning_rate": 9.804254970288001e-05, + "loss": 2.5089, + "step": 10174 + }, + { + "epoch": 0.8211605197320636, + "grad_norm": 0.7134390473365784, + "learning_rate": 9.802676585433408e-05, + "loss": 2.541, + "step": 10175 + }, + { + "epoch": 0.8212412234686466, + "grad_norm": 0.7490564584732056, + "learning_rate": 9.801098205496627e-05, + "loss": 2.5299, + "step": 10176 + }, + { + "epoch": 0.8213219272052296, + "grad_norm": 0.6614408493041992, + "learning_rate": 9.799519830517005e-05, + "loss": 2.5252, + "step": 10177 + }, + { + "epoch": 0.8214026309418127, + "grad_norm": 0.761049211025238, + "learning_rate": 9.797941460533869e-05, + "loss": 2.5153, + "step": 10178 + }, + { + "epoch": 0.8214833346783956, + "grad_norm": 0.6352702379226685, + "learning_rate": 9.796363095586561e-05, + "loss": 2.5407, + "step": 10179 + }, + { + "epoch": 0.8215640384149786, + "grad_norm": 0.684212863445282, + "learning_rate": 9.794784735714417e-05, + "loss": 2.5425, + "step": 10180 + }, + { + "epoch": 0.8216447421515616, + "grad_norm": 0.652987539768219, + "learning_rate": 9.793206380956772e-05, + "loss": 2.5542, + "step": 10181 + }, + { + "epoch": 0.8217254458881447, + "grad_norm": 0.6912897229194641, + "learning_rate": 9.791628031352966e-05, + "loss": 2.5041, + "step": 10182 + }, + { + "epoch": 0.8218061496247276, + "grad_norm": 0.7025408744812012, + "learning_rate": 9.790049686942333e-05, + "loss": 2.5296, + "step": 10183 + }, + { + "epoch": 0.8218868533613106, + "grad_norm": 0.7580777406692505, + "learning_rate": 9.788471347764215e-05, + "loss": 2.578, + "step": 10184 + }, + { + "epoch": 0.8219675570978936, + "grad_norm": 0.7044378519058228, + "learning_rate": 9.78689301385794e-05, + "loss": 2.5093, + "step": 10185 + }, + { + "epoch": 0.8220482608344767, + "grad_norm": 0.7339754700660706, + "learning_rate": 9.785314685262849e-05, + "loss": 2.5202, + "step": 10186 + }, + { + 
"epoch": 0.8221289645710597, + "grad_norm": 0.6872244477272034, + "learning_rate": 9.783736362018277e-05, + "loss": 2.541, + "step": 10187 + }, + { + "epoch": 0.8222096683076426, + "grad_norm": 0.7052434682846069, + "learning_rate": 9.78215804416356e-05, + "loss": 2.4968, + "step": 10188 + }, + { + "epoch": 0.8222903720442256, + "grad_norm": 0.6739610433578491, + "learning_rate": 9.780579731738033e-05, + "loss": 2.5137, + "step": 10189 + }, + { + "epoch": 0.8223710757808087, + "grad_norm": 0.6842939853668213, + "learning_rate": 9.779001424781035e-05, + "loss": 2.5329, + "step": 10190 + }, + { + "epoch": 0.8224517795173917, + "grad_norm": 0.7057977914810181, + "learning_rate": 9.777423123331898e-05, + "loss": 2.5657, + "step": 10191 + }, + { + "epoch": 0.8225324832539747, + "grad_norm": 0.6748424172401428, + "learning_rate": 9.775844827429958e-05, + "loss": 2.6104, + "step": 10192 + }, + { + "epoch": 0.8226131869905576, + "grad_norm": 0.6492514610290527, + "learning_rate": 9.774266537114555e-05, + "loss": 2.58, + "step": 10193 + }, + { + "epoch": 0.8226938907271407, + "grad_norm": 0.6987641453742981, + "learning_rate": 9.772688252425016e-05, + "loss": 2.5301, + "step": 10194 + }, + { + "epoch": 0.8227745944637237, + "grad_norm": 0.710921585559845, + "learning_rate": 9.771109973400679e-05, + "loss": 2.6245, + "step": 10195 + }, + { + "epoch": 0.8228552982003067, + "grad_norm": 0.6673738360404968, + "learning_rate": 9.769531700080883e-05, + "loss": 2.5205, + "step": 10196 + }, + { + "epoch": 0.8229360019368896, + "grad_norm": 0.6705252528190613, + "learning_rate": 9.767953432504958e-05, + "loss": 2.4932, + "step": 10197 + }, + { + "epoch": 0.8230167056734727, + "grad_norm": 0.6587076783180237, + "learning_rate": 9.766375170712237e-05, + "loss": 2.5085, + "step": 10198 + }, + { + "epoch": 0.8230974094100557, + "grad_norm": 0.7285338640213013, + "learning_rate": 9.764796914742061e-05, + "loss": 2.5481, + "step": 10199 + }, + { + "epoch": 0.8231781131466387, + 
"grad_norm": 0.6971831321716309, + "learning_rate": 9.763218664633763e-05, + "loss": 2.6092, + "step": 10200 + }, + { + "epoch": 0.8232588168832217, + "grad_norm": 0.6940265893936157, + "learning_rate": 9.761640420426669e-05, + "loss": 2.5325, + "step": 10201 + }, + { + "epoch": 0.8233395206198046, + "grad_norm": 0.6612978577613831, + "learning_rate": 9.76006218216012e-05, + "loss": 2.5532, + "step": 10202 + }, + { + "epoch": 0.8234202243563877, + "grad_norm": 0.6707638502120972, + "learning_rate": 9.758483949873453e-05, + "loss": 2.512, + "step": 10203 + }, + { + "epoch": 0.8235009280929707, + "grad_norm": 0.6636764407157898, + "learning_rate": 9.756905723605994e-05, + "loss": 2.5446, + "step": 10204 + }, + { + "epoch": 0.8235816318295537, + "grad_norm": 0.6996643543243408, + "learning_rate": 9.755327503397081e-05, + "loss": 2.5504, + "step": 10205 + }, + { + "epoch": 0.8236623355661367, + "grad_norm": 0.604487955570221, + "learning_rate": 9.753749289286046e-05, + "loss": 2.4767, + "step": 10206 + }, + { + "epoch": 0.8237430393027197, + "grad_norm": 0.6484553217887878, + "learning_rate": 9.752171081312222e-05, + "loss": 2.5522, + "step": 10207 + }, + { + "epoch": 0.8238237430393027, + "grad_norm": 0.6890987753868103, + "learning_rate": 9.75059287951494e-05, + "loss": 2.5545, + "step": 10208 + }, + { + "epoch": 0.8239044467758857, + "grad_norm": 0.6786034107208252, + "learning_rate": 9.749014683933541e-05, + "loss": 2.591, + "step": 10209 + }, + { + "epoch": 0.8239851505124687, + "grad_norm": 0.751192033290863, + "learning_rate": 9.747436494607349e-05, + "loss": 2.5335, + "step": 10210 + }, + { + "epoch": 0.8240658542490518, + "grad_norm": 0.6611589789390564, + "learning_rate": 9.7458583115757e-05, + "loss": 2.5104, + "step": 10211 + }, + { + "epoch": 0.8241465579856347, + "grad_norm": 0.6602892875671387, + "learning_rate": 9.744280134877926e-05, + "loss": 2.5319, + "step": 10212 + }, + { + "epoch": 0.8242272617222177, + "grad_norm": 0.6856467127799988, + 
"learning_rate": 9.742701964553359e-05, + "loss": 2.5418, + "step": 10213 + }, + { + "epoch": 0.8243079654588007, + "grad_norm": 0.6810153126716614, + "learning_rate": 9.741123800641332e-05, + "loss": 2.5691, + "step": 10214 + }, + { + "epoch": 0.8243886691953838, + "grad_norm": 0.7044229507446289, + "learning_rate": 9.739545643181175e-05, + "loss": 2.5911, + "step": 10215 + }, + { + "epoch": 0.8244693729319668, + "grad_norm": 0.6689271330833435, + "learning_rate": 9.737967492212225e-05, + "loss": 2.5374, + "step": 10216 + }, + { + "epoch": 0.8245500766685497, + "grad_norm": 0.6558904051780701, + "learning_rate": 9.736389347773807e-05, + "loss": 2.5118, + "step": 10217 + }, + { + "epoch": 0.8246307804051327, + "grad_norm": 0.6900291442871094, + "learning_rate": 9.734811209905255e-05, + "loss": 2.515, + "step": 10218 + }, + { + "epoch": 0.8247114841417158, + "grad_norm": 0.7129492163658142, + "learning_rate": 9.733233078645907e-05, + "loss": 2.5191, + "step": 10219 + }, + { + "epoch": 0.8247921878782988, + "grad_norm": 0.7031866908073425, + "learning_rate": 9.731654954035082e-05, + "loss": 2.5616, + "step": 10220 + }, + { + "epoch": 0.8248728916148818, + "grad_norm": 0.6418820023536682, + "learning_rate": 9.730076836112118e-05, + "loss": 2.537, + "step": 10221 + }, + { + "epoch": 0.8249535953514647, + "grad_norm": 0.6731035113334656, + "learning_rate": 9.728498724916347e-05, + "loss": 2.5483, + "step": 10222 + }, + { + "epoch": 0.8250342990880478, + "grad_norm": 0.6941342353820801, + "learning_rate": 9.726920620487096e-05, + "loss": 2.5314, + "step": 10223 + }, + { + "epoch": 0.8251150028246308, + "grad_norm": 0.6808927059173584, + "learning_rate": 9.725342522863696e-05, + "loss": 2.5521, + "step": 10224 + }, + { + "epoch": 0.8251957065612138, + "grad_norm": 0.6873155832290649, + "learning_rate": 9.723764432085481e-05, + "loss": 2.5205, + "step": 10225 + }, + { + "epoch": 0.8252764102977967, + "grad_norm": 0.8590287566184998, + "learning_rate": 
9.722186348191776e-05, + "loss": 2.5378, + "step": 10226 + }, + { + "epoch": 0.8253571140343798, + "grad_norm": 0.691523015499115, + "learning_rate": 9.720608271221912e-05, + "loss": 2.5062, + "step": 10227 + }, + { + "epoch": 0.8254378177709628, + "grad_norm": 0.6695523262023926, + "learning_rate": 9.719030201215226e-05, + "loss": 2.5164, + "step": 10228 + }, + { + "epoch": 0.8255185215075458, + "grad_norm": 0.745516300201416, + "learning_rate": 9.717452138211037e-05, + "loss": 2.5207, + "step": 10229 + }, + { + "epoch": 0.8255992252441288, + "grad_norm": 0.6628115773200989, + "learning_rate": 9.715874082248679e-05, + "loss": 2.5293, + "step": 10230 + }, + { + "epoch": 0.8256799289807119, + "grad_norm": 0.6531884074211121, + "learning_rate": 9.714296033367482e-05, + "loss": 2.4812, + "step": 10231 + }, + { + "epoch": 0.8257606327172948, + "grad_norm": 0.7444833517074585, + "learning_rate": 9.712717991606777e-05, + "loss": 2.5422, + "step": 10232 + }, + { + "epoch": 0.8258413364538778, + "grad_norm": 0.7013139128684998, + "learning_rate": 9.711139957005888e-05, + "loss": 2.5117, + "step": 10233 + }, + { + "epoch": 0.8259220401904608, + "grad_norm": 0.6588132977485657, + "learning_rate": 9.709561929604147e-05, + "loss": 2.5257, + "step": 10234 + }, + { + "epoch": 0.8260027439270439, + "grad_norm": 0.7538537383079529, + "learning_rate": 9.707983909440886e-05, + "loss": 2.5225, + "step": 10235 + }, + { + "epoch": 0.8260834476636268, + "grad_norm": null, + "learning_rate": 9.707983909440886e-05, + "loss": 2.5532, + "step": 10236 + }, + { + "epoch": 0.8261641514002098, + "grad_norm": 0.7414929270744324, + "learning_rate": 9.706405896555425e-05, + "loss": 2.5653, + "step": 10237 + }, + { + "epoch": 0.8262448551367928, + "grad_norm": 0.757057785987854, + "learning_rate": 9.704827890987097e-05, + "loss": 2.5732, + "step": 10238 + }, + { + "epoch": 0.8263255588733759, + "grad_norm": 0.730721652507782, + "learning_rate": 9.703249892775232e-05, + "loss": 2.5317, + "step": 
10239 + }, + { + "epoch": 0.8264062626099589, + "grad_norm": 0.6943208575248718, + "learning_rate": 9.701671901959151e-05, + "loss": 2.5849, + "step": 10240 + }, + { + "epoch": 0.8264869663465418, + "grad_norm": 0.7111102938652039, + "learning_rate": 9.700093918578188e-05, + "loss": 2.5007, + "step": 10241 + }, + { + "epoch": 0.8265676700831248, + "grad_norm": 0.7240251302719116, + "learning_rate": 9.69851594267167e-05, + "loss": 2.5002, + "step": 10242 + }, + { + "epoch": 0.8266483738197079, + "grad_norm": 0.6624411344528198, + "learning_rate": 9.696937974278922e-05, + "loss": 2.5175, + "step": 10243 + }, + { + "epoch": 0.8267290775562909, + "grad_norm": 0.6972576975822449, + "learning_rate": 9.695360013439269e-05, + "loss": 2.5285, + "step": 10244 + }, + { + "epoch": 0.8268097812928739, + "grad_norm": 0.684446394443512, + "learning_rate": 9.693782060192046e-05, + "loss": 2.57, + "step": 10245 + }, + { + "epoch": 0.8268904850294568, + "grad_norm": 0.6920011639595032, + "learning_rate": 9.692204114576573e-05, + "loss": 2.5042, + "step": 10246 + }, + { + "epoch": 0.8269711887660399, + "grad_norm": 0.7526013851165771, + "learning_rate": 9.690626176632176e-05, + "loss": 2.5878, + "step": 10247 + }, + { + "epoch": 0.8270518925026229, + "grad_norm": 0.6936177611351013, + "learning_rate": 9.689048246398184e-05, + "loss": 2.5572, + "step": 10248 + }, + { + "epoch": 0.8271325962392059, + "grad_norm": 0.672168493270874, + "learning_rate": 9.687470323913922e-05, + "loss": 2.5127, + "step": 10249 + }, + { + "epoch": 0.8272132999757889, + "grad_norm": 0.6847899556159973, + "learning_rate": 9.685892409218717e-05, + "loss": 2.5443, + "step": 10250 + }, + { + "epoch": 0.8272940037123718, + "grad_norm": 0.6877103447914124, + "learning_rate": 9.684314502351894e-05, + "loss": 2.4924, + "step": 10251 + }, + { + "epoch": 0.8273747074489549, + "grad_norm": 0.6894243359565735, + "learning_rate": 9.682736603352783e-05, + "loss": 2.5107, + "step": 10252 + }, + { + "epoch": 
0.8274554111855379, + "grad_norm": 0.7318278551101685, + "learning_rate": 9.681158712260698e-05, + "loss": 2.5276, + "step": 10253 + }, + { + "epoch": 0.8275361149221209, + "grad_norm": 0.6949039101600647, + "learning_rate": 9.679580829114975e-05, + "loss": 2.5128, + "step": 10254 + }, + { + "epoch": 0.8276168186587038, + "grad_norm": 0.6523800492286682, + "learning_rate": 9.678002953954939e-05, + "loss": 2.5584, + "step": 10255 + }, + { + "epoch": 0.8276975223952869, + "grad_norm": 0.6914480328559875, + "learning_rate": 9.676425086819905e-05, + "loss": 2.5597, + "step": 10256 + }, + { + "epoch": 0.8277782261318699, + "grad_norm": 0.7107869982719421, + "learning_rate": 9.674847227749206e-05, + "loss": 2.5009, + "step": 10257 + }, + { + "epoch": 0.8278589298684529, + "grad_norm": 0.7066758275032043, + "learning_rate": 9.673269376782166e-05, + "loss": 2.4599, + "step": 10258 + }, + { + "epoch": 0.8279396336050359, + "grad_norm": 0.7147037982940674, + "learning_rate": 9.671691533958104e-05, + "loss": 2.4478, + "step": 10259 + }, + { + "epoch": 0.828020337341619, + "grad_norm": 0.666265606880188, + "learning_rate": 9.670113699316347e-05, + "loss": 2.5652, + "step": 10260 + }, + { + "epoch": 0.8281010410782019, + "grad_norm": 0.7026315927505493, + "learning_rate": 9.668535872896225e-05, + "loss": 2.5397, + "step": 10261 + }, + { + "epoch": 0.8281817448147849, + "grad_norm": 0.6611438393592834, + "learning_rate": 9.66695805473705e-05, + "loss": 2.5628, + "step": 10262 + }, + { + "epoch": 0.8282624485513679, + "grad_norm": 0.7211201190948486, + "learning_rate": 9.66538024487815e-05, + "loss": 2.5551, + "step": 10263 + }, + { + "epoch": 0.828343152287951, + "grad_norm": 0.7224553227424622, + "learning_rate": 9.663802443358849e-05, + "loss": 2.5329, + "step": 10264 + }, + { + "epoch": 0.8284238560245339, + "grad_norm": 0.6805843710899353, + "learning_rate": 9.662224650218474e-05, + "loss": 2.5744, + "step": 10265 + }, + { + "epoch": 0.8285045597611169, + "grad_norm": 
0.7101335525512695, + "learning_rate": 9.66064686549634e-05, + "loss": 2.5281, + "step": 10266 + }, + { + "epoch": 0.8285852634976999, + "grad_norm": 0.7208443284034729, + "learning_rate": 9.659069089231774e-05, + "loss": 2.5326, + "step": 10267 + }, + { + "epoch": 0.828665967234283, + "grad_norm": 0.747894287109375, + "learning_rate": 9.6574913214641e-05, + "loss": 2.4909, + "step": 10268 + }, + { + "epoch": 0.828746670970866, + "grad_norm": 0.6618027091026306, + "learning_rate": 9.655913562232635e-05, + "loss": 2.6091, + "step": 10269 + }, + { + "epoch": 0.8288273747074489, + "grad_norm": 0.7101535201072693, + "learning_rate": 9.654335811576704e-05, + "loss": 2.5194, + "step": 10270 + }, + { + "epoch": 0.8289080784440319, + "grad_norm": 0.727763831615448, + "learning_rate": 9.652758069535631e-05, + "loss": 2.5767, + "step": 10271 + }, + { + "epoch": 0.828988782180615, + "grad_norm": 0.6936737895011902, + "learning_rate": 9.65118033614873e-05, + "loss": 2.498, + "step": 10272 + }, + { + "epoch": 0.829069485917198, + "grad_norm": 0.699462354183197, + "learning_rate": 9.64960261145533e-05, + "loss": 2.5033, + "step": 10273 + }, + { + "epoch": 0.829150189653781, + "grad_norm": 0.7024868726730347, + "learning_rate": 9.648024895494749e-05, + "loss": 2.5937, + "step": 10274 + }, + { + "epoch": 0.8292308933903639, + "grad_norm": 0.7028421759605408, + "learning_rate": 9.646447188306305e-05, + "loss": 2.5528, + "step": 10275 + }, + { + "epoch": 0.829311597126947, + "grad_norm": 0.7216476202011108, + "learning_rate": 9.644869489929321e-05, + "loss": 2.5298, + "step": 10276 + }, + { + "epoch": 0.82939230086353, + "grad_norm": 0.6815251111984253, + "learning_rate": 9.643291800403123e-05, + "loss": 2.5138, + "step": 10277 + }, + { + "epoch": 0.829473004600113, + "grad_norm": 0.6961970925331116, + "learning_rate": 9.64171411976702e-05, + "loss": 2.5441, + "step": 10278 + }, + { + "epoch": 0.829553708336696, + "grad_norm": 0.7317311763763428, + "learning_rate": 
9.640136448060337e-05, + "loss": 2.5885, + "step": 10279 + }, + { + "epoch": 0.829634412073279, + "grad_norm": 0.729086697101593, + "learning_rate": 9.638558785322396e-05, + "loss": 2.475, + "step": 10280 + }, + { + "epoch": 0.829715115809862, + "grad_norm": 0.7790165543556213, + "learning_rate": 9.636981131592521e-05, + "loss": 2.5538, + "step": 10281 + }, + { + "epoch": 0.829795819546445, + "grad_norm": 0.7066864967346191, + "learning_rate": 9.635403486910018e-05, + "loss": 2.5916, + "step": 10282 + }, + { + "epoch": 0.829876523283028, + "grad_norm": 0.7070252299308777, + "learning_rate": 9.633825851314215e-05, + "loss": 2.5879, + "step": 10283 + }, + { + "epoch": 0.829957227019611, + "grad_norm": 0.7604004740715027, + "learning_rate": 9.63224822484443e-05, + "loss": 2.5298, + "step": 10284 + }, + { + "epoch": 0.830037930756194, + "grad_norm": 0.7548386454582214, + "learning_rate": 9.63067060753998e-05, + "loss": 2.5313, + "step": 10285 + }, + { + "epoch": 0.830118634492777, + "grad_norm": 0.7241540551185608, + "learning_rate": 9.629092999440183e-05, + "loss": 2.5498, + "step": 10286 + }, + { + "epoch": 0.83019933822936, + "grad_norm": 0.6748291850090027, + "learning_rate": 9.627515400584361e-05, + "loss": 2.523, + "step": 10287 + }, + { + "epoch": 0.8302800419659431, + "grad_norm": 0.6624683141708374, + "learning_rate": 9.625937811011826e-05, + "loss": 2.568, + "step": 10288 + }, + { + "epoch": 0.830360745702526, + "grad_norm": 0.6681114435195923, + "learning_rate": 9.624360230761899e-05, + "loss": 2.5255, + "step": 10289 + }, + { + "epoch": 0.830441449439109, + "grad_norm": 0.6895325183868408, + "learning_rate": 9.622782659873899e-05, + "loss": 2.5275, + "step": 10290 + }, + { + "epoch": 0.830522153175692, + "grad_norm": 0.7257826924324036, + "learning_rate": 9.621205098387137e-05, + "loss": 2.5102, + "step": 10291 + }, + { + "epoch": 0.8306028569122751, + "grad_norm": 0.6567066311836243, + "learning_rate": 9.619627546340935e-05, + "loss": 2.5721, + "step": 
10292 + }, + { + "epoch": 0.8306835606488581, + "grad_norm": 0.6571428179740906, + "learning_rate": 9.61805000377461e-05, + "loss": 2.5014, + "step": 10293 + }, + { + "epoch": 0.830764264385441, + "grad_norm": 0.7807042598724365, + "learning_rate": 9.61647247072748e-05, + "loss": 2.632, + "step": 10294 + }, + { + "epoch": 0.830844968122024, + "grad_norm": 0.6688913702964783, + "learning_rate": 9.614894947238854e-05, + "loss": 2.5457, + "step": 10295 + }, + { + "epoch": 0.8309256718586071, + "grad_norm": 0.7769338488578796, + "learning_rate": 9.613317433348055e-05, + "loss": 2.4775, + "step": 10296 + }, + { + "epoch": 0.8310063755951901, + "grad_norm": 0.7089162468910217, + "learning_rate": 9.611739929094399e-05, + "loss": 2.4887, + "step": 10297 + }, + { + "epoch": 0.8310870793317731, + "grad_norm": 0.6901174783706665, + "learning_rate": 9.610162434517196e-05, + "loss": 2.6127, + "step": 10298 + }, + { + "epoch": 0.831167783068356, + "grad_norm": 0.6862173676490784, + "learning_rate": 9.608584949655764e-05, + "loss": 2.5432, + "step": 10299 + }, + { + "epoch": 0.8312484868049391, + "grad_norm": 0.6789367198944092, + "learning_rate": 9.607007474549418e-05, + "loss": 2.5135, + "step": 10300 + }, + { + "epoch": 0.8313291905415221, + "grad_norm": 0.6548805832862854, + "learning_rate": 9.605430009237474e-05, + "loss": 2.5466, + "step": 10301 + }, + { + "epoch": 0.8314098942781051, + "grad_norm": 0.6873800158500671, + "learning_rate": 9.603852553759244e-05, + "loss": 2.4954, + "step": 10302 + }, + { + "epoch": 0.831490598014688, + "grad_norm": 0.6816138029098511, + "learning_rate": 9.602275108154046e-05, + "loss": 2.5556, + "step": 10303 + }, + { + "epoch": 0.831571301751271, + "grad_norm": 0.6890314221382141, + "learning_rate": 9.600697672461189e-05, + "loss": 2.5253, + "step": 10304 + }, + { + "epoch": 0.8316520054878541, + "grad_norm": 0.6217427849769592, + "learning_rate": 9.599120246719992e-05, + "loss": 2.53, + "step": 10305 + }, + { + "epoch": 0.8317327092244371, 
+ "grad_norm": 0.6638299226760864, + "learning_rate": 9.59754283096977e-05, + "loss": 2.5323, + "step": 10306 + }, + { + "epoch": 0.8318134129610201, + "grad_norm": 0.6834245920181274, + "learning_rate": 9.595965425249828e-05, + "loss": 2.5339, + "step": 10307 + }, + { + "epoch": 0.831894116697603, + "grad_norm": 0.8013476729393005, + "learning_rate": 9.594388029599484e-05, + "loss": 2.4925, + "step": 10308 + }, + { + "epoch": 0.8319748204341861, + "grad_norm": 0.7677187323570251, + "learning_rate": 9.592810644058049e-05, + "loss": 2.5717, + "step": 10309 + }, + { + "epoch": 0.8320555241707691, + "grad_norm": 0.6558046340942383, + "learning_rate": 9.591233268664841e-05, + "loss": 2.5631, + "step": 10310 + }, + { + "epoch": 0.8321362279073521, + "grad_norm": 0.6648481488227844, + "learning_rate": 9.589655903459165e-05, + "loss": 2.5232, + "step": 10311 + }, + { + "epoch": 0.8322169316439351, + "grad_norm": 0.6907756328582764, + "learning_rate": 9.588078548480338e-05, + "loss": 2.4804, + "step": 10312 + }, + { + "epoch": 0.8322976353805182, + "grad_norm": 0.6924928426742554, + "learning_rate": 9.586501203767675e-05, + "loss": 2.4648, + "step": 10313 + }, + { + "epoch": 0.8323783391171011, + "grad_norm": 0.7654799222946167, + "learning_rate": 9.584923869360477e-05, + "loss": 2.6184, + "step": 10314 + }, + { + "epoch": 0.8324590428536841, + "grad_norm": 0.7056179046630859, + "learning_rate": 9.58334654529806e-05, + "loss": 2.5862, + "step": 10315 + }, + { + "epoch": 0.8325397465902671, + "grad_norm": 0.7245064973831177, + "learning_rate": 9.581769231619743e-05, + "loss": 2.4866, + "step": 10316 + }, + { + "epoch": 0.8326204503268502, + "grad_norm": 0.6782355308532715, + "learning_rate": 9.580191928364824e-05, + "loss": 2.5519, + "step": 10317 + }, + { + "epoch": 0.8327011540634331, + "grad_norm": 0.6910805106163025, + "learning_rate": 9.578614635572621e-05, + "loss": 2.542, + "step": 10318 + }, + { + "epoch": 0.8327818578000161, + "grad_norm": 0.6858026385307312, + 
"learning_rate": 9.577037353282444e-05, + "loss": 2.5601, + "step": 10319 + }, + { + "epoch": 0.8328625615365991, + "grad_norm": 0.6886423230171204, + "learning_rate": 9.5754600815336e-05, + "loss": 2.5817, + "step": 10320 + }, + { + "epoch": 0.8329432652731822, + "grad_norm": 0.7585750818252563, + "learning_rate": 9.573882820365402e-05, + "loss": 2.5153, + "step": 10321 + }, + { + "epoch": 0.8330239690097652, + "grad_norm": 0.7004472613334656, + "learning_rate": 9.57230556981716e-05, + "loss": 2.5456, + "step": 10322 + }, + { + "epoch": 0.8331046727463481, + "grad_norm": 0.6530508399009705, + "learning_rate": 9.570728329928179e-05, + "loss": 2.5453, + "step": 10323 + }, + { + "epoch": 0.8331853764829311, + "grad_norm": 0.6767956614494324, + "learning_rate": 9.569151100737769e-05, + "loss": 2.5311, + "step": 10324 + }, + { + "epoch": 0.8332660802195142, + "grad_norm": 0.6835905909538269, + "learning_rate": 9.56757388228524e-05, + "loss": 2.5417, + "step": 10325 + }, + { + "epoch": 0.8333467839560972, + "grad_norm": 0.6582748889923096, + "learning_rate": 9.565996674609901e-05, + "loss": 2.5144, + "step": 10326 + }, + { + "epoch": 0.8334274876926802, + "grad_norm": 0.6815205216407776, + "learning_rate": 9.56441947775106e-05, + "loss": 2.5272, + "step": 10327 + }, + { + "epoch": 0.8335081914292631, + "grad_norm": 0.6810150146484375, + "learning_rate": 9.562842291748022e-05, + "loss": 2.5475, + "step": 10328 + }, + { + "epoch": 0.8335888951658462, + "grad_norm": 0.7220990657806396, + "learning_rate": 9.5612651166401e-05, + "loss": 2.54, + "step": 10329 + }, + { + "epoch": 0.8336695989024292, + "grad_norm": 0.6840164065361023, + "learning_rate": 9.559687952466596e-05, + "loss": 2.5987, + "step": 10330 + }, + { + "epoch": 0.8337503026390122, + "grad_norm": 0.7085031867027283, + "learning_rate": 9.558110799266819e-05, + "loss": 2.5674, + "step": 10331 + }, + { + "epoch": 0.8338310063755952, + "grad_norm": 0.6658117175102234, + "learning_rate": 9.55653365708008e-05, + 
"loss": 2.5793, + "step": 10332 + }, + { + "epoch": 0.8339117101121782, + "grad_norm": 0.782648503780365, + "learning_rate": 9.554956525945677e-05, + "loss": 2.5463, + "step": 10333 + }, + { + "epoch": 0.8339924138487612, + "grad_norm": 0.6999937891960144, + "learning_rate": 9.553379405902922e-05, + "loss": 2.5961, + "step": 10334 + }, + { + "epoch": 0.8340731175853442, + "grad_norm": 0.6681220531463623, + "learning_rate": 9.55180229699112e-05, + "loss": 2.6055, + "step": 10335 + }, + { + "epoch": 0.8341538213219272, + "grad_norm": 0.7127133011817932, + "learning_rate": 9.550225199249577e-05, + "loss": 2.5571, + "step": 10336 + }, + { + "epoch": 0.8342345250585103, + "grad_norm": 0.6939001679420471, + "learning_rate": 9.548648112717596e-05, + "loss": 2.5653, + "step": 10337 + }, + { + "epoch": 0.8343152287950932, + "grad_norm": 0.7483924031257629, + "learning_rate": 9.547071037434487e-05, + "loss": 2.5316, + "step": 10338 + }, + { + "epoch": 0.8343959325316762, + "grad_norm": 0.7975850105285645, + "learning_rate": 9.545493973439548e-05, + "loss": 2.6039, + "step": 10339 + }, + { + "epoch": 0.8344766362682592, + "grad_norm": 0.6893026232719421, + "learning_rate": 9.543916920772087e-05, + "loss": 2.5797, + "step": 10340 + }, + { + "epoch": 0.8345573400048423, + "grad_norm": 0.752869188785553, + "learning_rate": 9.542339879471409e-05, + "loss": 2.5677, + "step": 10341 + }, + { + "epoch": 0.8346380437414253, + "grad_norm": 0.7336339354515076, + "learning_rate": 9.540762849576822e-05, + "loss": 2.5212, + "step": 10342 + }, + { + "epoch": 0.8347187474780082, + "grad_norm": 0.7742713689804077, + "learning_rate": 9.539185831127621e-05, + "loss": 2.5599, + "step": 10343 + }, + { + "epoch": 0.8347994512145912, + "grad_norm": 0.7205352783203125, + "learning_rate": 9.537608824163114e-05, + "loss": 2.5591, + "step": 10344 + }, + { + "epoch": 0.8348801549511743, + "grad_norm": 0.7794787287712097, + "learning_rate": 9.536031828722605e-05, + "loss": 2.5858, + "step": 10345 + }, + 
{ + "epoch": 0.8349608586877573, + "grad_norm": 0.7129528522491455, + "learning_rate": 9.534454844845396e-05, + "loss": 2.5591, + "step": 10346 + }, + { + "epoch": 0.8350415624243402, + "grad_norm": 0.731038510799408, + "learning_rate": 9.532877872570787e-05, + "loss": 2.5774, + "step": 10347 + }, + { + "epoch": 0.8351222661609232, + "grad_norm": 0.7706510424613953, + "learning_rate": 9.531300911938087e-05, + "loss": 2.6102, + "step": 10348 + }, + { + "epoch": 0.8352029698975063, + "grad_norm": 0.6890363097190857, + "learning_rate": 9.52972396298659e-05, + "loss": 2.5393, + "step": 10349 + }, + { + "epoch": 0.8352836736340893, + "grad_norm": 0.6792402863502502, + "learning_rate": 9.528147025755601e-05, + "loss": 2.5607, + "step": 10350 + }, + { + "epoch": 0.8353643773706723, + "grad_norm": 0.7097377777099609, + "learning_rate": 9.526570100284422e-05, + "loss": 2.5681, + "step": 10351 + }, + { + "epoch": 0.8354450811072552, + "grad_norm": 0.7530940771102905, + "learning_rate": 9.524993186612353e-05, + "loss": 2.5405, + "step": 10352 + }, + { + "epoch": 0.8355257848438382, + "grad_norm": 0.714080810546875, + "learning_rate": 9.523416284778696e-05, + "loss": 2.5365, + "step": 10353 + }, + { + "epoch": 0.8356064885804213, + "grad_norm": 0.6745832562446594, + "learning_rate": 9.521839394822752e-05, + "loss": 2.5553, + "step": 10354 + }, + { + "epoch": 0.8356871923170043, + "grad_norm": 0.7163450121879578, + "learning_rate": 9.52026251678382e-05, + "loss": 2.5074, + "step": 10355 + }, + { + "epoch": 0.8357678960535873, + "grad_norm": 0.6876534223556519, + "learning_rate": 9.518685650701197e-05, + "loss": 2.5652, + "step": 10356 + }, + { + "epoch": 0.8358485997901702, + "grad_norm": 0.6424533128738403, + "learning_rate": 9.517108796614187e-05, + "loss": 2.4823, + "step": 10357 + }, + { + "epoch": 0.8359293035267533, + "grad_norm": 0.646802544593811, + "learning_rate": 9.515531954562094e-05, + "loss": 2.5602, + "step": 10358 + }, + { + "epoch": 0.8360100072633363, + 
"grad_norm": 0.7266993522644043, + "learning_rate": 9.513955124584205e-05, + "loss": 2.5384, + "step": 10359 + }, + { + "epoch": 0.8360907109999193, + "grad_norm": 0.7358742356300354, + "learning_rate": 9.512378306719826e-05, + "loss": 2.5798, + "step": 10360 + }, + { + "epoch": 0.8361714147365022, + "grad_norm": 0.7191498279571533, + "learning_rate": 9.510801501008256e-05, + "loss": 2.5229, + "step": 10361 + }, + { + "epoch": 0.8362521184730853, + "grad_norm": 0.7058876156806946, + "learning_rate": 9.509224707488788e-05, + "loss": 2.5146, + "step": 10362 + }, + { + "epoch": 0.8363328222096683, + "grad_norm": 0.7348346710205078, + "learning_rate": 9.507647926200725e-05, + "loss": 2.5878, + "step": 10363 + }, + { + "epoch": 0.8364135259462513, + "grad_norm": 0.7464115619659424, + "learning_rate": 9.506071157183366e-05, + "loss": 2.6056, + "step": 10364 + }, + { + "epoch": 0.8364942296828343, + "grad_norm": 0.7077332139015198, + "learning_rate": 9.504494400476e-05, + "loss": 2.5161, + "step": 10365 + }, + { + "epoch": 0.8365749334194174, + "grad_norm": 0.7381827235221863, + "learning_rate": 9.502917656117928e-05, + "loss": 2.519, + "step": 10366 + }, + { + "epoch": 0.8366556371560003, + "grad_norm": 0.743180513381958, + "learning_rate": 9.501340924148452e-05, + "loss": 2.6149, + "step": 10367 + }, + { + "epoch": 0.8367363408925833, + "grad_norm": 0.6496078372001648, + "learning_rate": 9.499764204606863e-05, + "loss": 2.4969, + "step": 10368 + }, + { + "epoch": 0.8368170446291663, + "grad_norm": 0.6796541810035706, + "learning_rate": 9.498187497532454e-05, + "loss": 2.5304, + "step": 10369 + }, + { + "epoch": 0.8368977483657494, + "grad_norm": 0.6555948853492737, + "learning_rate": 9.496610802964529e-05, + "loss": 2.6029, + "step": 10370 + }, + { + "epoch": 0.8369784521023323, + "grad_norm": 0.6990405321121216, + "learning_rate": 9.495034120942374e-05, + "loss": 2.5286, + "step": 10371 + }, + { + "epoch": 0.8370591558389153, + "grad_norm": 0.7417613863945007, + 
"learning_rate": 9.49345745150529e-05, + "loss": 2.5301, + "step": 10372 + }, + { + "epoch": 0.8371398595754983, + "grad_norm": 0.6809872388839722, + "learning_rate": 9.49188079469257e-05, + "loss": 2.5075, + "step": 10373 + }, + { + "epoch": 0.8372205633120814, + "grad_norm": 0.6537099480628967, + "learning_rate": 9.490304150543514e-05, + "loss": 2.5515, + "step": 10374 + }, + { + "epoch": 0.8373012670486644, + "grad_norm": 0.6660431027412415, + "learning_rate": 9.488727519097407e-05, + "loss": 2.549, + "step": 10375 + }, + { + "epoch": 0.8373819707852473, + "grad_norm": 0.7257838249206543, + "learning_rate": 9.487150900393546e-05, + "loss": 2.546, + "step": 10376 + }, + { + "epoch": 0.8374626745218303, + "grad_norm": 0.742085874080658, + "learning_rate": 9.485574294471226e-05, + "loss": 2.5302, + "step": 10377 + }, + { + "epoch": 0.8375433782584134, + "grad_norm": 0.659934401512146, + "learning_rate": 9.48399770136974e-05, + "loss": 2.5553, + "step": 10378 + }, + { + "epoch": 0.8376240819949964, + "grad_norm": 0.7219613790512085, + "learning_rate": 9.482421121128377e-05, + "loss": 2.6186, + "step": 10379 + }, + { + "epoch": 0.8377047857315794, + "grad_norm": 0.706444263458252, + "learning_rate": 9.480844553786436e-05, + "loss": 2.5082, + "step": 10380 + }, + { + "epoch": 0.8377854894681623, + "grad_norm": 0.7527014017105103, + "learning_rate": 9.479267999383204e-05, + "loss": 2.5625, + "step": 10381 + }, + { + "epoch": 0.8378661932047454, + "grad_norm": 0.7488746643066406, + "learning_rate": 9.477691457957976e-05, + "loss": 2.528, + "step": 10382 + }, + { + "epoch": 0.8379468969413284, + "grad_norm": 0.7394229173660278, + "learning_rate": 9.476114929550045e-05, + "loss": 2.5387, + "step": 10383 + }, + { + "epoch": 0.8380276006779114, + "grad_norm": 0.7490981817245483, + "learning_rate": 9.474538414198695e-05, + "loss": 2.548, + "step": 10384 + }, + { + "epoch": 0.8381083044144944, + "grad_norm": 0.7203173041343689, + "learning_rate": 9.472961911943222e-05, + 
"loss": 2.5547, + "step": 10385 + }, + { + "epoch": 0.8381890081510774, + "grad_norm": 0.6929850578308105, + "learning_rate": 9.471385422822917e-05, + "loss": 2.4831, + "step": 10386 + }, + { + "epoch": 0.8382697118876604, + "grad_norm": 0.6303263902664185, + "learning_rate": 9.469808946877067e-05, + "loss": 2.4569, + "step": 10387 + }, + { + "epoch": 0.8383504156242434, + "grad_norm": 0.6986981630325317, + "learning_rate": 9.468232484144964e-05, + "loss": 2.5278, + "step": 10388 + }, + { + "epoch": 0.8384311193608264, + "grad_norm": 0.6910964846611023, + "learning_rate": 9.466656034665898e-05, + "loss": 2.5657, + "step": 10389 + }, + { + "epoch": 0.8385118230974095, + "grad_norm": 0.6571134924888611, + "learning_rate": 9.465079598479163e-05, + "loss": 2.6017, + "step": 10390 + }, + { + "epoch": 0.8385925268339924, + "grad_norm": 0.7117733359336853, + "learning_rate": 9.463503175624034e-05, + "loss": 2.56, + "step": 10391 + }, + { + "epoch": 0.8386732305705754, + "grad_norm": 0.7052998542785645, + "learning_rate": 9.461926766139813e-05, + "loss": 2.4998, + "step": 10392 + }, + { + "epoch": 0.8387539343071584, + "grad_norm": 0.7306597232818604, + "learning_rate": 9.460350370065786e-05, + "loss": 2.5292, + "step": 10393 + }, + { + "epoch": 0.8388346380437415, + "grad_norm": 0.681069552898407, + "learning_rate": 9.458773987441235e-05, + "loss": 2.5469, + "step": 10394 + }, + { + "epoch": 0.8389153417803245, + "grad_norm": 0.6681767702102661, + "learning_rate": 9.45719761830545e-05, + "loss": 2.5476, + "step": 10395 + }, + { + "epoch": 0.8389960455169074, + "grad_norm": 0.6759339570999146, + "learning_rate": 9.455621262697723e-05, + "loss": 2.4806, + "step": 10396 + }, + { + "epoch": 0.8390767492534904, + "grad_norm": 0.695829451084137, + "learning_rate": 9.454044920657333e-05, + "loss": 2.5255, + "step": 10397 + }, + { + "epoch": 0.8391574529900735, + "grad_norm": 0.686568558216095, + "learning_rate": 9.452468592223572e-05, + "loss": 2.5655, + "step": 10398 + }, + { + 
"epoch": 0.8392381567266565, + "grad_norm": 0.6529035568237305, + "learning_rate": 9.45089227743573e-05, + "loss": 2.5026, + "step": 10399 + }, + { + "epoch": 0.8393188604632394, + "grad_norm": 0.6809061765670776, + "learning_rate": 9.449315976333082e-05, + "loss": 2.5549, + "step": 10400 + }, + { + "epoch": 0.8393995641998224, + "grad_norm": 0.6920269727706909, + "learning_rate": 9.447739688954919e-05, + "loss": 2.517, + "step": 10401 + }, + { + "epoch": 0.8394802679364055, + "grad_norm": 0.6626712083816528, + "learning_rate": 9.446163415340526e-05, + "loss": 2.605, + "step": 10402 + }, + { + "epoch": 0.8395609716729885, + "grad_norm": 0.6912916898727417, + "learning_rate": 9.444587155529195e-05, + "loss": 2.588, + "step": 10403 + }, + { + "epoch": 0.8396416754095715, + "grad_norm": 0.6771352291107178, + "learning_rate": 9.443010909560198e-05, + "loss": 2.5148, + "step": 10404 + }, + { + "epoch": 0.8397223791461544, + "grad_norm": 0.7015509009361267, + "learning_rate": 9.441434677472827e-05, + "loss": 2.5425, + "step": 10405 + }, + { + "epoch": 0.8398030828827374, + "grad_norm": 0.6789976358413696, + "learning_rate": 9.439858459306364e-05, + "loss": 2.598, + "step": 10406 + }, + { + "epoch": 0.8398837866193205, + "grad_norm": 0.674391508102417, + "learning_rate": 9.438282255100091e-05, + "loss": 2.5581, + "step": 10407 + }, + { + "epoch": 0.8399644903559035, + "grad_norm": 0.6944772005081177, + "learning_rate": 9.436706064893294e-05, + "loss": 2.5591, + "step": 10408 + }, + { + "epoch": 0.8400451940924865, + "grad_norm": 0.6750832200050354, + "learning_rate": 9.435129888725259e-05, + "loss": 2.533, + "step": 10409 + }, + { + "epoch": 0.8401258978290694, + "grad_norm": 0.6927465200424194, + "learning_rate": 9.433553726635257e-05, + "loss": 2.536, + "step": 10410 + }, + { + "epoch": 0.8402066015656525, + "grad_norm": 0.6399651765823364, + "learning_rate": 9.431977578662578e-05, + "loss": 2.5123, + "step": 10411 + }, + { + "epoch": 0.8402873053022355, + "grad_norm": 
0.7588143944740295, + "learning_rate": 9.430401444846505e-05, + "loss": 2.6133, + "step": 10412 + }, + { + "epoch": 0.8403680090388185, + "grad_norm": 0.8010972738265991, + "learning_rate": 9.428825325226313e-05, + "loss": 2.5407, + "step": 10413 + }, + { + "epoch": 0.8404487127754015, + "grad_norm": 0.6847307085990906, + "learning_rate": 9.427249219841288e-05, + "loss": 2.5912, + "step": 10414 + }, + { + "epoch": 0.8405294165119845, + "grad_norm": 0.7005963325500488, + "learning_rate": 9.425673128730716e-05, + "loss": 2.5059, + "step": 10415 + }, + { + "epoch": 0.8406101202485675, + "grad_norm": 0.7383962273597717, + "learning_rate": 9.424097051933862e-05, + "loss": 2.5157, + "step": 10416 + }, + { + "epoch": 0.8406908239851505, + "grad_norm": 0.7078843712806702, + "learning_rate": 9.422520989490018e-05, + "loss": 2.6093, + "step": 10417 + }, + { + "epoch": 0.8407715277217335, + "grad_norm": 0.7449501752853394, + "learning_rate": 9.42094494143846e-05, + "loss": 2.594, + "step": 10418 + }, + { + "epoch": 0.8408522314583166, + "grad_norm": 0.6823872923851013, + "learning_rate": 9.419368907818473e-05, + "loss": 2.5653, + "step": 10419 + }, + { + "epoch": 0.8409329351948995, + "grad_norm": 0.7403056025505066, + "learning_rate": 9.417792888669325e-05, + "loss": 2.5296, + "step": 10420 + }, + { + "epoch": 0.8410136389314825, + "grad_norm": 0.6858980655670166, + "learning_rate": 9.4162168840303e-05, + "loss": 2.5401, + "step": 10421 + }, + { + "epoch": 0.8410943426680655, + "grad_norm": 0.692348837852478, + "learning_rate": 9.41464089394068e-05, + "loss": 2.4797, + "step": 10422 + }, + { + "epoch": 0.8411750464046486, + "grad_norm": 0.6939836144447327, + "learning_rate": 9.413064918439736e-05, + "loss": 2.505, + "step": 10423 + }, + { + "epoch": 0.8412557501412316, + "grad_norm": 0.7334314584732056, + "learning_rate": 9.411488957566748e-05, + "loss": 2.5792, + "step": 10424 + }, + { + "epoch": 0.8413364538778145, + "grad_norm": 0.6977920532226562, + "learning_rate": 
9.409913011360999e-05, + "loss": 2.5204, + "step": 10425 + }, + { + "epoch": 0.8414171576143975, + "grad_norm": 0.7121822834014893, + "learning_rate": 9.408337079861756e-05, + "loss": 2.571, + "step": 10426 + }, + { + "epoch": 0.8414978613509806, + "grad_norm": 0.761476993560791, + "learning_rate": 9.406761163108297e-05, + "loss": 2.5845, + "step": 10427 + }, + { + "epoch": 0.8415785650875636, + "grad_norm": 0.7160221934318542, + "learning_rate": 9.405185261139906e-05, + "loss": 2.5331, + "step": 10428 + }, + { + "epoch": 0.8416592688241465, + "grad_norm": 0.6828827857971191, + "learning_rate": 9.40360937399585e-05, + "loss": 2.5596, + "step": 10429 + }, + { + "epoch": 0.8417399725607295, + "grad_norm": 0.756473183631897, + "learning_rate": 9.402033501715406e-05, + "loss": 2.6107, + "step": 10430 + }, + { + "epoch": 0.8418206762973126, + "grad_norm": 0.7486895322799683, + "learning_rate": 9.400457644337853e-05, + "loss": 2.5388, + "step": 10431 + }, + { + "epoch": 0.8419013800338956, + "grad_norm": 0.7759146690368652, + "learning_rate": 9.398881801902461e-05, + "loss": 2.5559, + "step": 10432 + }, + { + "epoch": 0.8419820837704786, + "grad_norm": 0.71756911277771, + "learning_rate": 9.397305974448506e-05, + "loss": 2.6109, + "step": 10433 + }, + { + "epoch": 0.8420627875070615, + "grad_norm": 0.7741644382476807, + "learning_rate": 9.395730162015261e-05, + "loss": 2.5664, + "step": 10434 + }, + { + "epoch": 0.8421434912436446, + "grad_norm": 0.7155938744544983, + "learning_rate": 9.394154364642006e-05, + "loss": 2.5693, + "step": 10435 + }, + { + "epoch": 0.8422241949802276, + "grad_norm": 0.6862725019454956, + "learning_rate": 9.392578582368002e-05, + "loss": 2.4942, + "step": 10436 + }, + { + "epoch": 0.8423048987168106, + "grad_norm": 0.6698417067527771, + "learning_rate": 9.391002815232528e-05, + "loss": 2.5258, + "step": 10437 + }, + { + "epoch": 0.8423856024533936, + "grad_norm": 0.7756468057632446, + "learning_rate": 9.389427063274858e-05, + "loss": 2.5008, + 
"step": 10438 + }, + { + "epoch": 0.8424663061899766, + "grad_norm": 0.6579857468605042, + "learning_rate": 9.387851326534259e-05, + "loss": 2.5335, + "step": 10439 + }, + { + "epoch": 0.8425470099265596, + "grad_norm": 0.7673436403274536, + "learning_rate": 9.386275605050006e-05, + "loss": 2.5646, + "step": 10440 + }, + { + "epoch": 0.8426277136631426, + "grad_norm": 0.7377188205718994, + "learning_rate": 9.384699898861372e-05, + "loss": 2.568, + "step": 10441 + }, + { + "epoch": 0.8427084173997256, + "grad_norm": 0.6502123475074768, + "learning_rate": 9.38312420800762e-05, + "loss": 2.6091, + "step": 10442 + }, + { + "epoch": 0.8427891211363087, + "grad_norm": 0.729852020740509, + "learning_rate": 9.381548532528026e-05, + "loss": 2.4873, + "step": 10443 + }, + { + "epoch": 0.8428698248728916, + "grad_norm": 0.7419102191925049, + "learning_rate": 9.379972872461865e-05, + "loss": 2.4966, + "step": 10444 + }, + { + "epoch": 0.8429505286094746, + "grad_norm": 0.6921093463897705, + "learning_rate": 9.378397227848395e-05, + "loss": 2.4895, + "step": 10445 + }, + { + "epoch": 0.8430312323460576, + "grad_norm": 0.7697325944900513, + "learning_rate": 9.376821598726892e-05, + "loss": 2.5779, + "step": 10446 + }, + { + "epoch": 0.8431119360826407, + "grad_norm": 0.6441029906272888, + "learning_rate": 9.375245985136626e-05, + "loss": 2.4909, + "step": 10447 + }, + { + "epoch": 0.8431926398192237, + "grad_norm": 0.6962057948112488, + "learning_rate": 9.373670387116861e-05, + "loss": 2.5602, + "step": 10448 + }, + { + "epoch": 0.8432733435558066, + "grad_norm": 0.7030641436576843, + "learning_rate": 9.372094804706867e-05, + "loss": 2.5641, + "step": 10449 + }, + { + "epoch": 0.8433540472923896, + "grad_norm": 0.6969063878059387, + "learning_rate": 9.370519237945912e-05, + "loss": 2.5555, + "step": 10450 + }, + { + "epoch": 0.8434347510289727, + "grad_norm": 0.7169879674911499, + "learning_rate": 9.368943686873267e-05, + "loss": 2.5258, + "step": 10451 + }, + { + "epoch": 
0.8435154547655557, + "grad_norm": 0.7198735475540161, + "learning_rate": 9.36736815152819e-05, + "loss": 2.5192, + "step": 10452 + }, + { + "epoch": 0.8435961585021386, + "grad_norm": 0.6613535284996033, + "learning_rate": 9.365792631949951e-05, + "loss": 2.5596, + "step": 10453 + }, + { + "epoch": 0.8436768622387216, + "grad_norm": 0.6377065777778625, + "learning_rate": 9.364217128177824e-05, + "loss": 2.5518, + "step": 10454 + }, + { + "epoch": 0.8437575659753046, + "grad_norm": 0.6670635938644409, + "learning_rate": 9.362641640251063e-05, + "loss": 2.4793, + "step": 10455 + }, + { + "epoch": 0.8438382697118877, + "grad_norm": 0.6556122899055481, + "learning_rate": 9.361066168208939e-05, + "loss": 2.5492, + "step": 10456 + }, + { + "epoch": 0.8439189734484707, + "grad_norm": 0.7262280583381653, + "learning_rate": 9.35949071209072e-05, + "loss": 2.6059, + "step": 10457 + }, + { + "epoch": 0.8439996771850536, + "grad_norm": 0.702953040599823, + "learning_rate": 9.357915271935662e-05, + "loss": 2.5445, + "step": 10458 + }, + { + "epoch": 0.8440803809216366, + "grad_norm": 0.6619930267333984, + "learning_rate": 9.356339847783036e-05, + "loss": 2.5688, + "step": 10459 + }, + { + "epoch": 0.8441610846582197, + "grad_norm": 0.7038032412528992, + "learning_rate": 9.354764439672106e-05, + "loss": 2.5195, + "step": 10460 + }, + { + "epoch": 0.8442417883948027, + "grad_norm": 0.6615132689476013, + "learning_rate": 9.353189047642129e-05, + "loss": 2.5176, + "step": 10461 + }, + { + "epoch": 0.8443224921313857, + "grad_norm": 0.6524826288223267, + "learning_rate": 9.351613671732372e-05, + "loss": 2.4294, + "step": 10462 + }, + { + "epoch": 0.8444031958679686, + "grad_norm": 0.6526279449462891, + "learning_rate": 9.350038311982099e-05, + "loss": 2.595, + "step": 10463 + }, + { + "epoch": 0.8444838996045517, + "grad_norm": 0.6610859632492065, + "learning_rate": 9.348462968430569e-05, + "loss": 2.5311, + "step": 10464 + }, + { + "epoch": 0.8445646033411347, + "grad_norm": 
0.6835470795631409, + "learning_rate": 9.346887641117045e-05, + "loss": 2.5694, + "step": 10465 + }, + { + "epoch": 0.8446453070777177, + "grad_norm": 0.6768551468849182, + "learning_rate": 9.345312330080787e-05, + "loss": 2.6082, + "step": 10466 + }, + { + "epoch": 0.8447260108143007, + "grad_norm": 0.6368672847747803, + "learning_rate": 9.343737035361059e-05, + "loss": 2.5221, + "step": 10467 + }, + { + "epoch": 0.8448067145508837, + "grad_norm": 0.6952844858169556, + "learning_rate": 9.34216175699712e-05, + "loss": 2.5003, + "step": 10468 + }, + { + "epoch": 0.8448874182874667, + "grad_norm": 0.6663931012153625, + "learning_rate": 9.340586495028227e-05, + "loss": 2.5469, + "step": 10469 + }, + { + "epoch": 0.8449681220240497, + "grad_norm": 0.6840688586235046, + "learning_rate": 9.339011249493647e-05, + "loss": 2.5499, + "step": 10470 + }, + { + "epoch": 0.8450488257606327, + "grad_norm": 0.6832869052886963, + "learning_rate": 9.337436020432632e-05, + "loss": 2.5492, + "step": 10471 + }, + { + "epoch": 0.8451295294972158, + "grad_norm": 0.7444044947624207, + "learning_rate": 9.335860807884442e-05, + "loss": 2.5791, + "step": 10472 + }, + { + "epoch": 0.8452102332337987, + "grad_norm": 0.6821839809417725, + "learning_rate": 9.334285611888339e-05, + "loss": 2.4772, + "step": 10473 + }, + { + "epoch": 0.8452909369703817, + "grad_norm": 0.6209141612052917, + "learning_rate": 9.332710432483577e-05, + "loss": 2.5656, + "step": 10474 + }, + { + "epoch": 0.8453716407069647, + "grad_norm": 0.6531212329864502, + "learning_rate": 9.331135269709415e-05, + "loss": 2.5285, + "step": 10475 + }, + { + "epoch": 0.8454523444435478, + "grad_norm": 0.6418079137802124, + "learning_rate": 9.329560123605115e-05, + "loss": 2.5503, + "step": 10476 + }, + { + "epoch": 0.8455330481801308, + "grad_norm": 0.6636360287666321, + "learning_rate": 9.327984994209924e-05, + "loss": 2.528, + "step": 10477 + }, + { + "epoch": 0.8456137519167137, + "grad_norm": 0.6196488738059998, + "learning_rate": 
9.326409881563102e-05, + "loss": 2.4907, + "step": 10478 + }, + { + "epoch": 0.8456944556532967, + "grad_norm": 0.6339137554168701, + "learning_rate": 9.324834785703913e-05, + "loss": 2.4672, + "step": 10479 + }, + { + "epoch": 0.8457751593898798, + "grad_norm": 0.6803932189941406, + "learning_rate": 9.323259706671602e-05, + "loss": 2.5538, + "step": 10480 + }, + { + "epoch": 0.8458558631264628, + "grad_norm": 0.6815275549888611, + "learning_rate": 9.321684644505429e-05, + "loss": 2.5291, + "step": 10481 + }, + { + "epoch": 0.8459365668630457, + "grad_norm": 0.6497374773025513, + "learning_rate": 9.320109599244646e-05, + "loss": 2.5499, + "step": 10482 + }, + { + "epoch": 0.8460172705996287, + "grad_norm": 0.7966926097869873, + "learning_rate": 9.318534570928512e-05, + "loss": 2.523, + "step": 10483 + }, + { + "epoch": 0.8460979743362118, + "grad_norm": 0.6532156467437744, + "learning_rate": 9.316959559596276e-05, + "loss": 2.5138, + "step": 10484 + }, + { + "epoch": 0.8461786780727948, + "grad_norm": 0.7292522192001343, + "learning_rate": 9.315384565287193e-05, + "loss": 2.5413, + "step": 10485 + }, + { + "epoch": 0.8462593818093778, + "grad_norm": 0.7610795497894287, + "learning_rate": 9.313809588040519e-05, + "loss": 2.5071, + "step": 10486 + }, + { + "epoch": 0.8463400855459607, + "grad_norm": 0.7038258910179138, + "learning_rate": 9.312234627895502e-05, + "loss": 2.5568, + "step": 10487 + }, + { + "epoch": 0.8464207892825438, + "grad_norm": 0.7136046290397644, + "learning_rate": 9.310659684891395e-05, + "loss": 2.5372, + "step": 10488 + }, + { + "epoch": 0.8465014930191268, + "grad_norm": 0.7512896060943604, + "learning_rate": 9.309084759067452e-05, + "loss": 2.5821, + "step": 10489 + }, + { + "epoch": 0.8465821967557098, + "grad_norm": 0.7436400651931763, + "learning_rate": 9.307509850462922e-05, + "loss": 2.5489, + "step": 10490 + }, + { + "epoch": 0.8466629004922928, + "grad_norm": 0.6858603954315186, + "learning_rate": 9.305934959117056e-05, + "loss": 
2.5622, + "step": 10491 + }, + { + "epoch": 0.8467436042288758, + "grad_norm": 0.707185685634613, + "learning_rate": 9.304360085069107e-05, + "loss": 2.5275, + "step": 10492 + }, + { + "epoch": 0.8468243079654588, + "grad_norm": 0.7207933068275452, + "learning_rate": 9.302785228358322e-05, + "loss": 2.5877, + "step": 10493 + }, + { + "epoch": 0.8469050117020418, + "grad_norm": 0.6470080614089966, + "learning_rate": 9.30121038902395e-05, + "loss": 2.5117, + "step": 10494 + }, + { + "epoch": 0.8469857154386248, + "grad_norm": 0.75248783826828, + "learning_rate": 9.299635567105247e-05, + "loss": 2.5259, + "step": 10495 + }, + { + "epoch": 0.8470664191752079, + "grad_norm": 0.7150708436965942, + "learning_rate": 9.298060762641452e-05, + "loss": 2.551, + "step": 10496 + }, + { + "epoch": 0.8471471229117908, + "grad_norm": 0.6865069270133972, + "learning_rate": 9.296485975671818e-05, + "loss": 2.5184, + "step": 10497 + }, + { + "epoch": 0.8472278266483738, + "grad_norm": 0.7188237309455872, + "learning_rate": 9.294911206235593e-05, + "loss": 2.5207, + "step": 10498 + }, + { + "epoch": 0.8473085303849568, + "grad_norm": 0.6907880902290344, + "learning_rate": 9.293336454372026e-05, + "loss": 2.5544, + "step": 10499 + }, + { + "epoch": 0.8473892341215399, + "grad_norm": 0.7626079320907593, + "learning_rate": 9.291761720120358e-05, + "loss": 2.5741, + "step": 10500 + }, + { + "epoch": 0.8474699378581229, + "grad_norm": 0.6731963753700256, + "learning_rate": 9.29018700351984e-05, + "loss": 2.5433, + "step": 10501 + }, + { + "epoch": 0.8475506415947058, + "grad_norm": 0.7256288528442383, + "learning_rate": 9.288612304609723e-05, + "loss": 2.5131, + "step": 10502 + }, + { + "epoch": 0.8476313453312888, + "grad_norm": 0.7129119634628296, + "learning_rate": 9.287037623429242e-05, + "loss": 2.5054, + "step": 10503 + }, + { + "epoch": 0.8477120490678719, + "grad_norm": 0.6711156964302063, + "learning_rate": 9.285462960017644e-05, + "loss": 2.5671, + "step": 10504 + }, + { + 
"epoch": 0.8477927528044549, + "grad_norm": 0.7268081903457642, + "learning_rate": 9.283888314414184e-05, + "loss": 2.5627, + "step": 10505 + }, + { + "epoch": 0.8478734565410379, + "grad_norm": 0.8635050058364868, + "learning_rate": 9.282313686658094e-05, + "loss": 2.517, + "step": 10506 + }, + { + "epoch": 0.8479541602776208, + "grad_norm": 0.7077138423919678, + "learning_rate": 9.280739076788624e-05, + "loss": 2.5551, + "step": 10507 + }, + { + "epoch": 0.8480348640142038, + "grad_norm": 0.6312204599380493, + "learning_rate": 9.279164484845018e-05, + "loss": 2.5329, + "step": 10508 + }, + { + "epoch": 0.8481155677507869, + "grad_norm": 0.6749829649925232, + "learning_rate": 9.277589910866516e-05, + "loss": 2.5092, + "step": 10509 + }, + { + "epoch": 0.8481962714873699, + "grad_norm": 0.753391683101654, + "learning_rate": 9.27601535489236e-05, + "loss": 2.6244, + "step": 10510 + }, + { + "epoch": 0.8482769752239528, + "grad_norm": 0.7230119109153748, + "learning_rate": 9.2744408169618e-05, + "loss": 2.5021, + "step": 10511 + }, + { + "epoch": 0.8483576789605358, + "grad_norm": 0.6759157776832581, + "learning_rate": 9.272866297114067e-05, + "loss": 2.5399, + "step": 10512 + }, + { + "epoch": 0.8484383826971189, + "grad_norm": 0.7049473524093628, + "learning_rate": 9.271291795388406e-05, + "loss": 2.5024, + "step": 10513 + }, + { + "epoch": 0.8485190864337019, + "grad_norm": 0.6579850912094116, + "learning_rate": 9.269717311824058e-05, + "loss": 2.5019, + "step": 10514 + }, + { + "epoch": 0.8485997901702849, + "grad_norm": 0.7091391086578369, + "learning_rate": 9.268142846460265e-05, + "loss": 2.5785, + "step": 10515 + }, + { + "epoch": 0.8486804939068678, + "grad_norm": 0.6612898707389832, + "learning_rate": 9.266568399336266e-05, + "loss": 2.5046, + "step": 10516 + }, + { + "epoch": 0.8487611976434509, + "grad_norm": 0.6348623633384705, + "learning_rate": 9.264993970491298e-05, + "loss": 2.543, + "step": 10517 + }, + { + "epoch": 0.8488419013800339, + 
"grad_norm": 0.688360869884491, + "learning_rate": 9.263419559964604e-05, + "loss": 2.5294, + "step": 10518 + }, + { + "epoch": 0.8489226051166169, + "grad_norm": 0.6483190059661865, + "learning_rate": 9.261845167795418e-05, + "loss": 2.5623, + "step": 10519 + }, + { + "epoch": 0.8490033088531999, + "grad_norm": 0.689379096031189, + "learning_rate": 9.26027079402298e-05, + "loss": 2.4871, + "step": 10520 + }, + { + "epoch": 0.8490840125897829, + "grad_norm": 0.6627655625343323, + "learning_rate": 9.25869643868653e-05, + "loss": 2.5353, + "step": 10521 + }, + { + "epoch": 0.8491647163263659, + "grad_norm": 0.6701192259788513, + "learning_rate": 9.2571221018253e-05, + "loss": 2.5003, + "step": 10522 + }, + { + "epoch": 0.8492454200629489, + "grad_norm": 0.7413944005966187, + "learning_rate": 9.255547783478529e-05, + "loss": 2.5473, + "step": 10523 + }, + { + "epoch": 0.8493261237995319, + "grad_norm": 0.6490365266799927, + "learning_rate": 9.253973483685455e-05, + "loss": 2.5168, + "step": 10524 + }, + { + "epoch": 0.849406827536115, + "grad_norm": 0.7303688526153564, + "learning_rate": 9.25239920248531e-05, + "loss": 2.5953, + "step": 10525 + }, + { + "epoch": 0.8494875312726979, + "grad_norm": 0.7132991552352905, + "learning_rate": 9.250824939917331e-05, + "loss": 2.475, + "step": 10526 + }, + { + "epoch": 0.8495682350092809, + "grad_norm": 0.6935676336288452, + "learning_rate": 9.249250696020753e-05, + "loss": 2.5212, + "step": 10527 + }, + { + "epoch": 0.8496489387458639, + "grad_norm": 0.732961118221283, + "learning_rate": 9.247676470834814e-05, + "loss": 2.5848, + "step": 10528 + }, + { + "epoch": 0.849729642482447, + "grad_norm": 0.6899160146713257, + "learning_rate": 9.246102264398739e-05, + "loss": 2.4551, + "step": 10529 + }, + { + "epoch": 0.84981034621903, + "grad_norm": 0.6941123604774475, + "learning_rate": 9.244528076751766e-05, + "loss": 2.5441, + "step": 10530 + }, + { + "epoch": 0.8498910499556129, + "grad_norm": 0.7351016998291016, + 
"learning_rate": 9.242953907933134e-05, + "loss": 2.6519, + "step": 10531 + }, + { + "epoch": 0.8499717536921959, + "grad_norm": 0.7156691551208496, + "learning_rate": 9.241379757982065e-05, + "loss": 2.573, + "step": 10532 + }, + { + "epoch": 0.850052457428779, + "grad_norm": 0.7137688994407654, + "learning_rate": 9.239805626937797e-05, + "loss": 2.5688, + "step": 10533 + }, + { + "epoch": 0.850133161165362, + "grad_norm": 0.7018687129020691, + "learning_rate": 9.238231514839559e-05, + "loss": 2.5725, + "step": 10534 + }, + { + "epoch": 0.850213864901945, + "grad_norm": 0.6723659634590149, + "learning_rate": 9.236657421726583e-05, + "loss": 2.5661, + "step": 10535 + }, + { + "epoch": 0.8502945686385279, + "grad_norm": 0.7105850577354431, + "learning_rate": 9.235083347638098e-05, + "loss": 2.5676, + "step": 10536 + }, + { + "epoch": 0.850375272375111, + "grad_norm": 0.682601809501648, + "learning_rate": 9.233509292613341e-05, + "loss": 2.5489, + "step": 10537 + }, + { + "epoch": 0.850455976111694, + "grad_norm": 0.6703988313674927, + "learning_rate": 9.231935256691531e-05, + "loss": 2.5349, + "step": 10538 + }, + { + "epoch": 0.850536679848277, + "grad_norm": 0.6430882215499878, + "learning_rate": 9.230361239911903e-05, + "loss": 2.4959, + "step": 10539 + }, + { + "epoch": 0.8506173835848599, + "grad_norm": 0.7164519429206848, + "learning_rate": 9.228787242313687e-05, + "loss": 2.4999, + "step": 10540 + }, + { + "epoch": 0.850698087321443, + "grad_norm": 0.7463028430938721, + "learning_rate": 9.227213263936107e-05, + "loss": 2.545, + "step": 10541 + }, + { + "epoch": 0.850778791058026, + "grad_norm": 0.650577187538147, + "learning_rate": 9.22563930481839e-05, + "loss": 2.5707, + "step": 10542 + }, + { + "epoch": 0.850859494794609, + "grad_norm": 0.6808211207389832, + "learning_rate": 9.224065364999768e-05, + "loss": 2.5236, + "step": 10543 + }, + { + "epoch": 0.850940198531192, + "grad_norm": 0.6947758793830872, + "learning_rate": 9.222491444519467e-05, + "loss": 
2.555, + "step": 10544 + }, + { + "epoch": 0.851020902267775, + "grad_norm": 0.6805624961853027, + "learning_rate": 9.22091754341671e-05, + "loss": 2.517, + "step": 10545 + }, + { + "epoch": 0.851101606004358, + "grad_norm": 0.6645655035972595, + "learning_rate": 9.219343661730724e-05, + "loss": 2.5237, + "step": 10546 + }, + { + "epoch": 0.851182309740941, + "grad_norm": 0.6912586092948914, + "learning_rate": 9.217769799500738e-05, + "loss": 2.5345, + "step": 10547 + }, + { + "epoch": 0.851263013477524, + "grad_norm": 0.6713781356811523, + "learning_rate": 9.21619595676597e-05, + "loss": 2.56, + "step": 10548 + }, + { + "epoch": 0.8513437172141071, + "grad_norm": 0.7031502723693848, + "learning_rate": 9.214622133565648e-05, + "loss": 2.4885, + "step": 10549 + }, + { + "epoch": 0.85142442095069, + "grad_norm": 0.6616455316543579, + "learning_rate": 9.213048329938997e-05, + "loss": 2.5101, + "step": 10550 + }, + { + "epoch": 0.851505124687273, + "grad_norm": 0.711077094078064, + "learning_rate": 9.211474545925236e-05, + "loss": 2.6264, + "step": 10551 + }, + { + "epoch": 0.851585828423856, + "grad_norm": 0.7534502744674683, + "learning_rate": 9.209900781563592e-05, + "loss": 2.5417, + "step": 10552 + }, + { + "epoch": 0.8516665321604391, + "grad_norm": 0.7405222058296204, + "learning_rate": 9.208327036893288e-05, + "loss": 2.546, + "step": 10553 + }, + { + "epoch": 0.8517472358970221, + "grad_norm": 0.7014057040214539, + "learning_rate": 9.20675331195354e-05, + "loss": 2.5211, + "step": 10554 + }, + { + "epoch": 0.851827939633605, + "grad_norm": 0.6984074115753174, + "learning_rate": 9.205179606783573e-05, + "loss": 2.5181, + "step": 10555 + }, + { + "epoch": 0.851908643370188, + "grad_norm": 0.7312670350074768, + "learning_rate": 9.203605921422613e-05, + "loss": 2.5345, + "step": 10556 + }, + { + "epoch": 0.851989347106771, + "grad_norm": 0.6861104369163513, + "learning_rate": 9.202032255909871e-05, + "loss": 2.5426, + "step": 10557 + }, + { + "epoch": 
0.8520700508433541, + "grad_norm": 0.6989030838012695, + "learning_rate": 9.200458610284571e-05, + "loss": 2.5221, + "step": 10558 + }, + { + "epoch": 0.852150754579937, + "grad_norm": 0.6645115613937378, + "learning_rate": 9.198884984585932e-05, + "loss": 2.4755, + "step": 10559 + }, + { + "epoch": 0.85223145831652, + "grad_norm": 0.6577785015106201, + "learning_rate": 9.197311378853176e-05, + "loss": 2.5491, + "step": 10560 + }, + { + "epoch": 0.852312162053103, + "grad_norm": 0.7311568856239319, + "learning_rate": 9.195737793125517e-05, + "loss": 2.5653, + "step": 10561 + }, + { + "epoch": 0.8523928657896861, + "grad_norm": 0.6469970345497131, + "learning_rate": 9.194164227442174e-05, + "loss": 2.5384, + "step": 10562 + }, + { + "epoch": 0.8524735695262691, + "grad_norm": 0.6562933325767517, + "learning_rate": 9.19259068184237e-05, + "loss": 2.5644, + "step": 10563 + }, + { + "epoch": 0.852554273262852, + "grad_norm": 0.7740273475646973, + "learning_rate": 9.19101715636531e-05, + "loss": 2.5868, + "step": 10564 + }, + { + "epoch": 0.852634976999435, + "grad_norm": 0.6461195349693298, + "learning_rate": 9.18944365105022e-05, + "loss": 2.4862, + "step": 10565 + }, + { + "epoch": 0.8527156807360181, + "grad_norm": 0.7230537533760071, + "learning_rate": 9.187870165936313e-05, + "loss": 2.5125, + "step": 10566 + }, + { + "epoch": 0.8527963844726011, + "grad_norm": 0.6858233213424683, + "learning_rate": 9.186296701062805e-05, + "loss": 2.5463, + "step": 10567 + }, + { + "epoch": 0.8528770882091841, + "grad_norm": 0.717407763004303, + "learning_rate": 9.184723256468908e-05, + "loss": 2.5399, + "step": 10568 + }, + { + "epoch": 0.852957791945767, + "grad_norm": 0.7537745237350464, + "learning_rate": 9.18314983219384e-05, + "loss": 2.5164, + "step": 10569 + }, + { + "epoch": 0.8530384956823501, + "grad_norm": 0.7068665027618408, + "learning_rate": 9.181576428276814e-05, + "loss": 2.5747, + "step": 10570 + }, + { + "epoch": 0.8531191994189331, + "grad_norm": 
0.8013456463813782, + "learning_rate": 9.18000304475704e-05, + "loss": 2.5401, + "step": 10571 + }, + { + "epoch": 0.8531999031555161, + "grad_norm": 0.6458969712257385, + "learning_rate": 9.178429681673741e-05, + "loss": 2.4781, + "step": 10572 + }, + { + "epoch": 0.8532806068920991, + "grad_norm": 0.7235112190246582, + "learning_rate": 9.176856339066114e-05, + "loss": 2.5753, + "step": 10573 + }, + { + "epoch": 0.8533613106286821, + "grad_norm": 0.6815706491470337, + "learning_rate": 9.175283016973382e-05, + "loss": 2.5526, + "step": 10574 + }, + { + "epoch": 0.8534420143652651, + "grad_norm": 0.739747166633606, + "learning_rate": 9.173709715434751e-05, + "loss": 2.5631, + "step": 10575 + }, + { + "epoch": 0.8535227181018481, + "grad_norm": 0.7325060963630676, + "learning_rate": 9.172136434489437e-05, + "loss": 2.4925, + "step": 10576 + }, + { + "epoch": 0.8536034218384311, + "grad_norm": 0.6505454182624817, + "learning_rate": 9.170563174176645e-05, + "loss": 2.5423, + "step": 10577 + }, + { + "epoch": 0.8536841255750142, + "grad_norm": 0.7267098426818848, + "learning_rate": 9.168989934535586e-05, + "loss": 2.5687, + "step": 10578 + }, + { + "epoch": 0.8537648293115971, + "grad_norm": 0.7264497876167297, + "learning_rate": 9.167416715605476e-05, + "loss": 2.5165, + "step": 10579 + }, + { + "epoch": 0.8538455330481801, + "grad_norm": 0.7473852634429932, + "learning_rate": 9.165843517425509e-05, + "loss": 2.5837, + "step": 10580 + }, + { + "epoch": 0.8539262367847631, + "grad_norm": 0.7249133586883545, + "learning_rate": 9.164270340034906e-05, + "loss": 2.5805, + "step": 10581 + }, + { + "epoch": 0.8540069405213462, + "grad_norm": 0.7463760375976562, + "learning_rate": 9.162697183472875e-05, + "loss": 2.5067, + "step": 10582 + }, + { + "epoch": 0.8540876442579292, + "grad_norm": 0.7125511169433594, + "learning_rate": 9.161124047778614e-05, + "loss": 2.5093, + "step": 10583 + }, + { + "epoch": 0.8541683479945121, + "grad_norm": 0.7247455716133118, + "learning_rate": 
9.159550932991335e-05, + "loss": 2.5356, + "step": 10584 + }, + { + "epoch": 0.8542490517310951, + "grad_norm": 0.7593860030174255, + "learning_rate": 9.157977839150246e-05, + "loss": 2.5477, + "step": 10585 + }, + { + "epoch": 0.8543297554676782, + "grad_norm": 0.6758295297622681, + "learning_rate": 9.156404766294547e-05, + "loss": 2.4748, + "step": 10586 + }, + { + "epoch": 0.8544104592042612, + "grad_norm": 0.7114073634147644, + "learning_rate": 9.154831714463447e-05, + "loss": 2.5479, + "step": 10587 + }, + { + "epoch": 0.8544911629408442, + "grad_norm": 0.6881263256072998, + "learning_rate": 9.153258683696156e-05, + "loss": 2.5471, + "step": 10588 + }, + { + "epoch": 0.8545718666774271, + "grad_norm": 0.6509317755699158, + "learning_rate": 9.151685674031866e-05, + "loss": 2.5239, + "step": 10589 + }, + { + "epoch": 0.8546525704140102, + "grad_norm": 0.7754644751548767, + "learning_rate": 9.150112685509787e-05, + "loss": 2.5572, + "step": 10590 + }, + { + "epoch": 0.8547332741505932, + "grad_norm": 0.707080602645874, + "learning_rate": 9.148539718169118e-05, + "loss": 2.5572, + "step": 10591 + }, + { + "epoch": 0.8548139778871762, + "grad_norm": 0.6996685266494751, + "learning_rate": 9.146966772049073e-05, + "loss": 2.4968, + "step": 10592 + }, + { + "epoch": 0.8548946816237591, + "grad_norm": 0.6830589771270752, + "learning_rate": 9.145393847188841e-05, + "loss": 2.5795, + "step": 10593 + }, + { + "epoch": 0.8549753853603422, + "grad_norm": 0.7507784366607666, + "learning_rate": 9.143820943627628e-05, + "loss": 2.6135, + "step": 10594 + }, + { + "epoch": 0.8550560890969252, + "grad_norm": 0.673218309879303, + "learning_rate": 9.142248061404638e-05, + "loss": 2.5875, + "step": 10595 + }, + { + "epoch": 0.8551367928335082, + "grad_norm": 0.6861804723739624, + "learning_rate": 9.140675200559065e-05, + "loss": 2.5892, + "step": 10596 + }, + { + "epoch": 0.8552174965700912, + "grad_norm": 0.6928709149360657, + "learning_rate": 9.139102361130114e-05, + "loss": 
2.5303, + "step": 10597 + }, + { + "epoch": 0.8552982003066743, + "grad_norm": 0.6958343386650085, + "learning_rate": 9.137529543156986e-05, + "loss": 2.5567, + "step": 10598 + }, + { + "epoch": 0.8553789040432572, + "grad_norm": 0.703845739364624, + "learning_rate": 9.135956746678873e-05, + "loss": 2.5215, + "step": 10599 + }, + { + "epoch": 0.8554596077798402, + "grad_norm": 0.7108649015426636, + "learning_rate": 9.134383971734975e-05, + "loss": 2.5687, + "step": 10600 + }, + { + "epoch": 0.8555403115164232, + "grad_norm": 0.7249850034713745, + "learning_rate": 9.132811218364495e-05, + "loss": 2.565, + "step": 10601 + }, + { + "epoch": 0.8556210152530063, + "grad_norm": 0.7060014009475708, + "learning_rate": 9.131238486606623e-05, + "loss": 2.5366, + "step": 10602 + }, + { + "epoch": 0.8557017189895892, + "grad_norm": 0.6915088891983032, + "learning_rate": 9.129665776500559e-05, + "loss": 2.527, + "step": 10603 + }, + { + "epoch": 0.8557824227261722, + "grad_norm": 0.7226938605308533, + "learning_rate": 9.128093088085503e-05, + "loss": 2.5999, + "step": 10604 + }, + { + "epoch": 0.8558631264627552, + "grad_norm": 0.6802428364753723, + "learning_rate": 9.126520421400641e-05, + "loss": 2.4788, + "step": 10605 + }, + { + "epoch": 0.8559438301993383, + "grad_norm": 0.7855350375175476, + "learning_rate": 9.124947776485175e-05, + "loss": 2.5349, + "step": 10606 + }, + { + "epoch": 0.8560245339359213, + "grad_norm": 0.6758337020874023, + "learning_rate": 9.123375153378296e-05, + "loss": 2.5874, + "step": 10607 + }, + { + "epoch": 0.8561052376725042, + "grad_norm": 0.675061821937561, + "learning_rate": 9.121802552119206e-05, + "loss": 2.5343, + "step": 10608 + }, + { + "epoch": 0.8561859414090872, + "grad_norm": 0.7044726014137268, + "learning_rate": 9.120229972747087e-05, + "loss": 2.5361, + "step": 10609 + }, + { + "epoch": 0.8562666451456702, + "grad_norm": 0.6324402689933777, + "learning_rate": 9.118657415301137e-05, + "loss": 2.5039, + "step": 10610 + }, + { + 
"epoch": 0.8563473488822533, + "grad_norm": 0.6621509790420532, + "learning_rate": 9.11708487982055e-05, + "loss": 2.5346, + "step": 10611 + }, + { + "epoch": 0.8564280526188363, + "grad_norm": 0.6709887981414795, + "learning_rate": 9.115512366344516e-05, + "loss": 2.5409, + "step": 10612 + }, + { + "epoch": 0.8565087563554192, + "grad_norm": 0.7237712740898132, + "learning_rate": 9.113939874912223e-05, + "loss": 2.5051, + "step": 10613 + }, + { + "epoch": 0.8565894600920022, + "grad_norm": 0.6646109223365784, + "learning_rate": 9.11236740556287e-05, + "loss": 2.5866, + "step": 10614 + }, + { + "epoch": 0.8566701638285853, + "grad_norm": 0.7131930589675903, + "learning_rate": 9.110794958335637e-05, + "loss": 2.5472, + "step": 10615 + }, + { + "epoch": 0.8567508675651683, + "grad_norm": 0.6662428975105286, + "learning_rate": 9.109222533269715e-05, + "loss": 2.4863, + "step": 10616 + }, + { + "epoch": 0.8568315713017512, + "grad_norm": 0.6527226567268372, + "learning_rate": 9.107650130404304e-05, + "loss": 2.5594, + "step": 10617 + }, + { + "epoch": 0.8569122750383342, + "grad_norm": 0.6639060378074646, + "learning_rate": 9.106077749778578e-05, + "loss": 2.5519, + "step": 10618 + }, + { + "epoch": 0.8569929787749173, + "grad_norm": 0.7088096737861633, + "learning_rate": 9.104505391431734e-05, + "loss": 2.5404, + "step": 10619 + }, + { + "epoch": 0.8570736825115003, + "grad_norm": 0.7155873775482178, + "learning_rate": 9.102933055402957e-05, + "loss": 2.5636, + "step": 10620 + }, + { + "epoch": 0.8571543862480833, + "grad_norm": 0.6522316932678223, + "learning_rate": 9.101360741731431e-05, + "loss": 2.5216, + "step": 10621 + }, + { + "epoch": 0.8572350899846662, + "grad_norm": 0.6515649557113647, + "learning_rate": 9.099788450456345e-05, + "loss": 2.5804, + "step": 10622 + }, + { + "epoch": 0.8573157937212493, + "grad_norm": 0.6791853904724121, + "learning_rate": 9.098216181616883e-05, + "loss": 2.5353, + "step": 10623 + }, + { + "epoch": 0.8573964974578323, + 
"grad_norm": 0.6946877241134644, + "learning_rate": 9.096643935252236e-05, + "loss": 2.5492, + "step": 10624 + }, + { + "epoch": 0.8574772011944153, + "grad_norm": 0.7235898375511169, + "learning_rate": 9.095071711401581e-05, + "loss": 2.5178, + "step": 10625 + }, + { + "epoch": 0.8575579049309983, + "grad_norm": 0.6740610003471375, + "learning_rate": 9.093499510104102e-05, + "loss": 2.5699, + "step": 10626 + }, + { + "epoch": 0.8576386086675813, + "grad_norm": 0.7441792488098145, + "learning_rate": 9.091927331398988e-05, + "loss": 2.579, + "step": 10627 + }, + { + "epoch": 0.8577193124041643, + "grad_norm": 0.6986937522888184, + "learning_rate": 9.090355175325416e-05, + "loss": 2.5556, + "step": 10628 + }, + { + "epoch": 0.8578000161407473, + "grad_norm": 0.6960151791572571, + "learning_rate": 9.08878304192257e-05, + "loss": 2.5448, + "step": 10629 + }, + { + "epoch": 0.8578807198773303, + "grad_norm": 0.6376819014549255, + "learning_rate": 9.087210931229636e-05, + "loss": 2.4636, + "step": 10630 + }, + { + "epoch": 0.8579614236139134, + "grad_norm": 0.752473771572113, + "learning_rate": 9.08563884328579e-05, + "loss": 2.5451, + "step": 10631 + }, + { + "epoch": 0.8580421273504963, + "grad_norm": 0.6879361867904663, + "learning_rate": 9.084066778130213e-05, + "loss": 2.5365, + "step": 10632 + }, + { + "epoch": 0.8581228310870793, + "grad_norm": 0.6630483865737915, + "learning_rate": 9.082494735802091e-05, + "loss": 2.5085, + "step": 10633 + }, + { + "epoch": 0.8582035348236623, + "grad_norm": 0.689602792263031, + "learning_rate": 9.080922716340594e-05, + "loss": 2.5087, + "step": 10634 + }, + { + "epoch": 0.8582842385602454, + "grad_norm": 0.7333599925041199, + "learning_rate": 9.079350719784905e-05, + "loss": 2.5476, + "step": 10635 + }, + { + "epoch": 0.8583649422968284, + "grad_norm": 0.6895802021026611, + "learning_rate": 9.077778746174204e-05, + "loss": 2.5099, + "step": 10636 + }, + { + "epoch": 0.8584456460334113, + "grad_norm": 0.7202162146568298, + 
"learning_rate": 9.076206795547668e-05, + "loss": 2.5197, + "step": 10637 + }, + { + "epoch": 0.8585263497699943, + "grad_norm": 0.6454200148582458, + "learning_rate": 9.074634867944472e-05, + "loss": 2.5303, + "step": 10638 + }, + { + "epoch": 0.8586070535065774, + "grad_norm": 0.6842506527900696, + "learning_rate": 9.073062963403795e-05, + "loss": 2.5051, + "step": 10639 + }, + { + "epoch": 0.8586877572431604, + "grad_norm": 0.6979129314422607, + "learning_rate": 9.071491081964815e-05, + "loss": 2.5209, + "step": 10640 + }, + { + "epoch": 0.8587684609797434, + "grad_norm": 0.6851540803909302, + "learning_rate": 9.0699192236667e-05, + "loss": 2.5003, + "step": 10641 + }, + { + "epoch": 0.8588491647163263, + "grad_norm": 0.7528585195541382, + "learning_rate": 9.068347388548627e-05, + "loss": 2.5524, + "step": 10642 + }, + { + "epoch": 0.8589298684529094, + "grad_norm": 0.6297397613525391, + "learning_rate": 9.06677557664978e-05, + "loss": 2.5412, + "step": 10643 + }, + { + "epoch": 0.8590105721894924, + "grad_norm": 0.7034026980400085, + "learning_rate": 9.06520378800932e-05, + "loss": 2.4958, + "step": 10644 + }, + { + "epoch": 0.8590912759260754, + "grad_norm": 0.690258800983429, + "learning_rate": 9.063632022666425e-05, + "loss": 2.4894, + "step": 10645 + }, + { + "epoch": 0.8591719796626583, + "grad_norm": 0.6449949145317078, + "learning_rate": 9.06206028066027e-05, + "loss": 2.507, + "step": 10646 + }, + { + "epoch": 0.8592526833992414, + "grad_norm": 0.6328588724136353, + "learning_rate": 9.060488562030023e-05, + "loss": 2.5503, + "step": 10647 + }, + { + "epoch": 0.8593333871358244, + "grad_norm": 0.6570547819137573, + "learning_rate": 9.058916866814858e-05, + "loss": 2.4993, + "step": 10648 + }, + { + "epoch": 0.8594140908724074, + "grad_norm": 0.7689602375030518, + "learning_rate": 9.057345195053945e-05, + "loss": 2.5498, + "step": 10649 + }, + { + "epoch": 0.8594947946089904, + "grad_norm": 0.6727081537246704, + "learning_rate": 9.055773546786454e-05, + 
"loss": 2.5172, + "step": 10650 + }, + { + "epoch": 0.8595754983455735, + "grad_norm": 0.694722056388855, + "learning_rate": 9.054201922051552e-05, + "loss": 2.5485, + "step": 10651 + }, + { + "epoch": 0.8596562020821564, + "grad_norm": 0.6638815999031067, + "learning_rate": 9.052630320888411e-05, + "loss": 2.5134, + "step": 10652 + }, + { + "epoch": 0.8597369058187394, + "grad_norm": 0.6600833535194397, + "learning_rate": 9.0510587433362e-05, + "loss": 2.5206, + "step": 10653 + }, + { + "epoch": 0.8598176095553224, + "grad_norm": 0.7193894386291504, + "learning_rate": 9.049487189434084e-05, + "loss": 2.5485, + "step": 10654 + }, + { + "epoch": 0.8598983132919055, + "grad_norm": 0.6651753187179565, + "learning_rate": 9.047915659221233e-05, + "loss": 2.5703, + "step": 10655 + }, + { + "epoch": 0.8599790170284884, + "grad_norm": 0.7346364855766296, + "learning_rate": 9.046344152736815e-05, + "loss": 2.5301, + "step": 10656 + }, + { + "epoch": 0.8600597207650714, + "grad_norm": 0.6681811809539795, + "learning_rate": 9.04477267001999e-05, + "loss": 2.5124, + "step": 10657 + }, + { + "epoch": 0.8601404245016544, + "grad_norm": 0.6928461790084839, + "learning_rate": 9.043201211109929e-05, + "loss": 2.5153, + "step": 10658 + }, + { + "epoch": 0.8602211282382374, + "grad_norm": 0.6957700252532959, + "learning_rate": 9.041629776045797e-05, + "loss": 2.4697, + "step": 10659 + }, + { + "epoch": 0.8603018319748205, + "grad_norm": 0.6361939311027527, + "learning_rate": 9.040058364866752e-05, + "loss": 2.5162, + "step": 10660 + }, + { + "epoch": 0.8603825357114034, + "grad_norm": 0.6827390193939209, + "learning_rate": 9.038486977611964e-05, + "loss": 2.4856, + "step": 10661 + }, + { + "epoch": 0.8604632394479864, + "grad_norm": 0.6638801097869873, + "learning_rate": 9.036915614320595e-05, + "loss": 2.5224, + "step": 10662 + }, + { + "epoch": 0.8605439431845694, + "grad_norm": 0.7249652743339539, + "learning_rate": 9.035344275031802e-05, + "loss": 2.5461, + "step": 10663 + }, + { 
+ "epoch": 0.8606246469211525, + "grad_norm": 0.6693316102027893, + "learning_rate": 9.033772959784754e-05, + "loss": 2.5676, + "step": 10664 + }, + { + "epoch": 0.8607053506577355, + "grad_norm": 0.6787340641021729, + "learning_rate": 9.032201668618614e-05, + "loss": 2.5374, + "step": 10665 + }, + { + "epoch": 0.8607860543943184, + "grad_norm": 0.6581670641899109, + "learning_rate": 9.030630401572533e-05, + "loss": 2.5052, + "step": 10666 + }, + { + "epoch": 0.8608667581309014, + "grad_norm": 0.6975873112678528, + "learning_rate": 9.029059158685675e-05, + "loss": 2.4823, + "step": 10667 + }, + { + "epoch": 0.8609474618674845, + "grad_norm": 0.6632521748542786, + "learning_rate": 9.027487939997201e-05, + "loss": 2.5992, + "step": 10668 + }, + { + "epoch": 0.8610281656040675, + "grad_norm": 0.6793977618217468, + "learning_rate": 9.025916745546276e-05, + "loss": 2.5308, + "step": 10669 + }, + { + "epoch": 0.8611088693406505, + "grad_norm": 0.6499481797218323, + "learning_rate": 9.024345575372046e-05, + "loss": 2.4964, + "step": 10670 + }, + { + "epoch": 0.8611895730772334, + "grad_norm": 0.6858868598937988, + "learning_rate": 9.022774429513677e-05, + "loss": 2.5388, + "step": 10671 + }, + { + "epoch": 0.8612702768138165, + "grad_norm": 0.7586160898208618, + "learning_rate": 9.021203308010324e-05, + "loss": 2.5166, + "step": 10672 + }, + { + "epoch": 0.8613509805503995, + "grad_norm": 0.7179701328277588, + "learning_rate": 9.019632210901141e-05, + "loss": 2.5501, + "step": 10673 + }, + { + "epoch": 0.8614316842869825, + "grad_norm": 0.6830369830131531, + "learning_rate": 9.018061138225287e-05, + "loss": 2.4956, + "step": 10674 + }, + { + "epoch": 0.8615123880235654, + "grad_norm": 0.6710512042045593, + "learning_rate": 9.01649009002192e-05, + "loss": 2.5722, + "step": 10675 + }, + { + "epoch": 0.8615930917601485, + "grad_norm": 0.640011727809906, + "learning_rate": 9.014919066330186e-05, + "loss": 2.5197, + "step": 10676 + }, + { + "epoch": 0.8616737954967315, + 
"grad_norm": 0.6803860664367676, + "learning_rate": 9.013348067189245e-05, + "loss": 2.4794, + "step": 10677 + }, + { + "epoch": 0.8617544992333145, + "grad_norm": 0.6734865307807922, + "learning_rate": 9.011777092638251e-05, + "loss": 2.5831, + "step": 10678 + }, + { + "epoch": 0.8618352029698975, + "grad_norm": 0.6525718569755554, + "learning_rate": 9.010206142716353e-05, + "loss": 2.4925, + "step": 10679 + }, + { + "epoch": 0.8619159067064806, + "grad_norm": 0.6886672377586365, + "learning_rate": 9.008635217462706e-05, + "loss": 2.491, + "step": 10680 + }, + { + "epoch": 0.8619966104430635, + "grad_norm": 0.6397131085395813, + "learning_rate": 9.007064316916461e-05, + "loss": 2.4684, + "step": 10681 + }, + { + "epoch": 0.8620773141796465, + "grad_norm": 0.6308462023735046, + "learning_rate": 9.005493441116768e-05, + "loss": 2.504, + "step": 10682 + }, + { + "epoch": 0.8621580179162295, + "grad_norm": 0.7223808169364929, + "learning_rate": 9.003922590102778e-05, + "loss": 2.5342, + "step": 10683 + }, + { + "epoch": 0.8622387216528126, + "grad_norm": 0.687515914440155, + "learning_rate": 9.002351763913642e-05, + "loss": 2.4822, + "step": 10684 + }, + { + "epoch": 0.8623194253893955, + "grad_norm": 0.6888468265533447, + "learning_rate": 9.00078096258851e-05, + "loss": 2.5497, + "step": 10685 + }, + { + "epoch": 0.8624001291259785, + "grad_norm": 0.7429301738739014, + "learning_rate": 8.999210186166525e-05, + "loss": 2.624, + "step": 10686 + }, + { + "epoch": 0.8624808328625615, + "grad_norm": 0.6901945471763611, + "learning_rate": 8.997639434686839e-05, + "loss": 2.5268, + "step": 10687 + }, + { + "epoch": 0.8625615365991446, + "grad_norm": 0.7396681308746338, + "learning_rate": 8.9960687081886e-05, + "loss": 2.5427, + "step": 10688 + }, + { + "epoch": 0.8626422403357276, + "grad_norm": 0.6825531125068665, + "learning_rate": 8.99449800671095e-05, + "loss": 2.5722, + "step": 10689 + }, + { + "epoch": 0.8627229440723105, + "grad_norm": 0.6719860434532166, + 
"learning_rate": 8.992927330293039e-05, + "loss": 2.4939, + "step": 10690 + }, + { + "epoch": 0.8628036478088935, + "grad_norm": 0.644567608833313, + "learning_rate": 8.991356678974017e-05, + "loss": 2.5495, + "step": 10691 + }, + { + "epoch": 0.8628843515454766, + "grad_norm": 0.7066643834114075, + "learning_rate": 8.989786052793015e-05, + "loss": 2.5508, + "step": 10692 + }, + { + "epoch": 0.8629650552820596, + "grad_norm": 0.6697196364402771, + "learning_rate": 8.988215451789187e-05, + "loss": 2.5231, + "step": 10693 + }, + { + "epoch": 0.8630457590186426, + "grad_norm": 0.7143658399581909, + "learning_rate": 8.986644876001681e-05, + "loss": 2.5368, + "step": 10694 + }, + { + "epoch": 0.8631264627552255, + "grad_norm": 0.7597684264183044, + "learning_rate": 8.985074325469628e-05, + "loss": 2.5983, + "step": 10695 + }, + { + "epoch": 0.8632071664918086, + "grad_norm": 0.7418014407157898, + "learning_rate": 8.983503800232176e-05, + "loss": 2.5736, + "step": 10696 + }, + { + "epoch": 0.8632878702283916, + "grad_norm": 0.654435932636261, + "learning_rate": 8.981933300328468e-05, + "loss": 2.5389, + "step": 10697 + }, + { + "epoch": 0.8633685739649746, + "grad_norm": 0.658203661441803, + "learning_rate": 8.980362825797643e-05, + "loss": 2.5204, + "step": 10698 + }, + { + "epoch": 0.8634492777015575, + "grad_norm": 0.7132784724235535, + "learning_rate": 8.97879237667884e-05, + "loss": 2.4982, + "step": 10699 + }, + { + "epoch": 0.8635299814381406, + "grad_norm": 0.6901868581771851, + "learning_rate": 8.9772219530112e-05, + "loss": 2.5599, + "step": 10700 + }, + { + "epoch": 0.8636106851747236, + "grad_norm": 0.6241179704666138, + "learning_rate": 8.975651554833869e-05, + "loss": 2.5185, + "step": 10701 + }, + { + "epoch": 0.8636913889113066, + "grad_norm": 0.693692147731781, + "learning_rate": 8.974081182185974e-05, + "loss": 2.506, + "step": 10702 + }, + { + "epoch": 0.8637720926478896, + "grad_norm": 0.6699246168136597, + "learning_rate": 8.972510835106658e-05, + 
"loss": 2.557, + "step": 10703 + }, + { + "epoch": 0.8638527963844727, + "grad_norm": 0.7339062094688416, + "learning_rate": 8.970940513635059e-05, + "loss": 2.5614, + "step": 10704 + }, + { + "epoch": 0.8639335001210556, + "grad_norm": 0.7558815479278564, + "learning_rate": 8.969370217810311e-05, + "loss": 2.5949, + "step": 10705 + }, + { + "epoch": 0.8640142038576386, + "grad_norm": 0.6992602348327637, + "learning_rate": 8.96779994767155e-05, + "loss": 2.4755, + "step": 10706 + }, + { + "epoch": 0.8640949075942216, + "grad_norm": 0.6836397647857666, + "learning_rate": 8.966229703257915e-05, + "loss": 2.5172, + "step": 10707 + }, + { + "epoch": 0.8641756113308047, + "grad_norm": 0.7054563760757446, + "learning_rate": 8.964659484608537e-05, + "loss": 2.5186, + "step": 10708 + }, + { + "epoch": 0.8642563150673876, + "grad_norm": 0.7096611261367798, + "learning_rate": 8.963089291762551e-05, + "loss": 2.5157, + "step": 10709 + }, + { + "epoch": 0.8643370188039706, + "grad_norm": 0.657465934753418, + "learning_rate": 8.961519124759094e-05, + "loss": 2.5332, + "step": 10710 + }, + { + "epoch": 0.8644177225405536, + "grad_norm": 0.7490121126174927, + "learning_rate": 8.959948983637291e-05, + "loss": 2.512, + "step": 10711 + }, + { + "epoch": 0.8644984262771366, + "grad_norm": 0.7074166536331177, + "learning_rate": 8.958378868436279e-05, + "loss": 2.4745, + "step": 10712 + }, + { + "epoch": 0.8645791300137197, + "grad_norm": 0.7496227025985718, + "learning_rate": 8.956808779195188e-05, + "loss": 2.5533, + "step": 10713 + }, + { + "epoch": 0.8646598337503026, + "grad_norm": 0.6624657511711121, + "learning_rate": 8.95523871595315e-05, + "loss": 2.5346, + "step": 10714 + }, + { + "epoch": 0.8647405374868856, + "grad_norm": 0.6829125881195068, + "learning_rate": 8.953668678749292e-05, + "loss": 2.558, + "step": 10715 + }, + { + "epoch": 0.8648212412234686, + "grad_norm": 0.6954498887062073, + "learning_rate": 8.952098667622745e-05, + "loss": 2.5617, + "step": 10716 + }, + { + 
"epoch": 0.8649019449600517, + "grad_norm": 0.6722636818885803, + "learning_rate": 8.950528682612645e-05, + "loss": 2.5565, + "step": 10717 + }, + { + "epoch": 0.8649826486966347, + "grad_norm": 0.6793767213821411, + "learning_rate": 8.948958723758107e-05, + "loss": 2.5803, + "step": 10718 + }, + { + "epoch": 0.8650633524332176, + "grad_norm": 0.7159373760223389, + "learning_rate": 8.947388791098266e-05, + "loss": 2.5465, + "step": 10719 + }, + { + "epoch": 0.8651440561698006, + "grad_norm": 0.6823835372924805, + "learning_rate": 8.945818884672253e-05, + "loss": 2.5079, + "step": 10720 + }, + { + "epoch": 0.8652247599063837, + "grad_norm": 0.7521452903747559, + "learning_rate": 8.944249004519185e-05, + "loss": 2.5628, + "step": 10721 + }, + { + "epoch": 0.8653054636429667, + "grad_norm": 0.6774886846542358, + "learning_rate": 8.94267915067819e-05, + "loss": 2.6042, + "step": 10722 + }, + { + "epoch": 0.8653861673795497, + "grad_norm": 0.6915935277938843, + "learning_rate": 8.941109323188398e-05, + "loss": 2.5563, + "step": 10723 + }, + { + "epoch": 0.8654668711161326, + "grad_norm": 0.6609061360359192, + "learning_rate": 8.939539522088927e-05, + "loss": 2.5083, + "step": 10724 + }, + { + "epoch": 0.8655475748527157, + "grad_norm": 0.6457223892211914, + "learning_rate": 8.937969747418903e-05, + "loss": 2.573, + "step": 10725 + }, + { + "epoch": 0.8656282785892987, + "grad_norm": 0.6960360407829285, + "learning_rate": 8.936399999217455e-05, + "loss": 2.516, + "step": 10726 + }, + { + "epoch": 0.8657089823258817, + "grad_norm": 0.7269721627235413, + "learning_rate": 8.934830277523693e-05, + "loss": 2.5932, + "step": 10727 + }, + { + "epoch": 0.8657896860624646, + "grad_norm": 0.7057532668113708, + "learning_rate": 8.933260582376745e-05, + "loss": 2.5022, + "step": 10728 + }, + { + "epoch": 0.8658703897990477, + "grad_norm": 0.6698749661445618, + "learning_rate": 8.931690913815735e-05, + "loss": 2.5357, + "step": 10729 + }, + { + "epoch": 0.8659510935356307, + 
"grad_norm": 0.6616599559783936, + "learning_rate": 8.930121271879777e-05, + "loss": 2.4776, + "step": 10730 + }, + { + "epoch": 0.8660317972722137, + "grad_norm": 0.7457093000411987, + "learning_rate": 8.928551656607993e-05, + "loss": 2.5799, + "step": 10731 + }, + { + "epoch": 0.8661125010087967, + "grad_norm": 0.7199469804763794, + "learning_rate": 8.926982068039505e-05, + "loss": 2.5278, + "step": 10732 + }, + { + "epoch": 0.8661932047453798, + "grad_norm": 0.7579182386398315, + "learning_rate": 8.925412506213428e-05, + "loss": 2.5227, + "step": 10733 + }, + { + "epoch": 0.8662739084819627, + "grad_norm": 0.687455952167511, + "learning_rate": 8.92384297116888e-05, + "loss": 2.5099, + "step": 10734 + }, + { + "epoch": 0.8663546122185457, + "grad_norm": 0.7616521120071411, + "learning_rate": 8.922273462944978e-05, + "loss": 2.598, + "step": 10735 + }, + { + "epoch": 0.8664353159551287, + "grad_norm": 0.6730697751045227, + "learning_rate": 8.920703981580842e-05, + "loss": 2.5517, + "step": 10736 + }, + { + "epoch": 0.8665160196917118, + "grad_norm": 0.6769895553588867, + "learning_rate": 8.91913452711558e-05, + "loss": 2.5535, + "step": 10737 + }, + { + "epoch": 0.8665967234282947, + "grad_norm": 0.6284549832344055, + "learning_rate": 8.917565099588312e-05, + "loss": 2.4597, + "step": 10738 + }, + { + "epoch": 0.8666774271648777, + "grad_norm": 0.6900805830955505, + "learning_rate": 8.915995699038152e-05, + "loss": 2.5236, + "step": 10739 + }, + { + "epoch": 0.8667581309014607, + "grad_norm": 0.6842896938323975, + "learning_rate": 8.914426325504211e-05, + "loss": 2.5199, + "step": 10740 + }, + { + "epoch": 0.8668388346380438, + "grad_norm": 0.6637243628501892, + "learning_rate": 8.912856979025604e-05, + "loss": 2.5368, + "step": 10741 + }, + { + "epoch": 0.8669195383746268, + "grad_norm": 0.7474464178085327, + "learning_rate": 8.911287659641449e-05, + "loss": 2.4902, + "step": 10742 + }, + { + "epoch": 0.8670002421112097, + "grad_norm": 0.6977849006652832, + 
"learning_rate": 8.909718367390843e-05, + "loss": 2.5034, + "step": 10743 + }, + { + "epoch": 0.8670809458477927, + "grad_norm": 0.6968807578086853, + "learning_rate": 8.908149102312907e-05, + "loss": 2.5396, + "step": 10744 + }, + { + "epoch": 0.8671616495843758, + "grad_norm": 0.6656209230422974, + "learning_rate": 8.906579864446755e-05, + "loss": 2.5702, + "step": 10745 + }, + { + "epoch": 0.8672423533209588, + "grad_norm": 0.7079079151153564, + "learning_rate": 8.905010653831486e-05, + "loss": 2.5344, + "step": 10746 + }, + { + "epoch": 0.8673230570575418, + "grad_norm": 0.7423387765884399, + "learning_rate": 8.903441470506214e-05, + "loss": 2.5635, + "step": 10747 + }, + { + "epoch": 0.8674037607941247, + "grad_norm": 0.6607224941253662, + "learning_rate": 8.901872314510046e-05, + "loss": 2.54, + "step": 10748 + }, + { + "epoch": 0.8674844645307078, + "grad_norm": 0.6646947860717773, + "learning_rate": 8.900303185882095e-05, + "loss": 2.4661, + "step": 10749 + }, + { + "epoch": 0.8675651682672908, + "grad_norm": 0.6943496465682983, + "learning_rate": 8.89873408466146e-05, + "loss": 2.5213, + "step": 10750 + }, + { + "epoch": 0.8676458720038738, + "grad_norm": 0.7048123478889465, + "learning_rate": 8.89716501088725e-05, + "loss": 2.5529, + "step": 10751 + }, + { + "epoch": 0.8677265757404568, + "grad_norm": 0.654617428779602, + "learning_rate": 8.895595964598574e-05, + "loss": 2.5535, + "step": 10752 + }, + { + "epoch": 0.8678072794770398, + "grad_norm": 0.672063410282135, + "learning_rate": 8.894026945834531e-05, + "loss": 2.5279, + "step": 10753 + }, + { + "epoch": 0.8678879832136228, + "grad_norm": 0.7134148478507996, + "learning_rate": 8.892457954634225e-05, + "loss": 2.5403, + "step": 10754 + }, + { + "epoch": 0.8679686869502058, + "grad_norm": 0.6457598805427551, + "learning_rate": 8.890888991036768e-05, + "loss": 2.515, + "step": 10755 + }, + { + "epoch": 0.8680493906867888, + "grad_norm": 0.6725220084190369, + "learning_rate": 8.889320055081252e-05, + 
"loss": 2.4829, + "step": 10756 + }, + { + "epoch": 0.8681300944233719, + "grad_norm": 0.6425862312316895, + "learning_rate": 8.887751146806785e-05, + "loss": 2.4965, + "step": 10757 + }, + { + "epoch": 0.8682107981599548, + "grad_norm": 0.6654682755470276, + "learning_rate": 8.886182266252468e-05, + "loss": 2.48, + "step": 10758 + }, + { + "epoch": 0.8682915018965378, + "grad_norm": 0.7102493643760681, + "learning_rate": 8.884613413457398e-05, + "loss": 2.5415, + "step": 10759 + }, + { + "epoch": 0.8683722056331208, + "grad_norm": 0.6996567249298096, + "learning_rate": 8.883044588460677e-05, + "loss": 2.542, + "step": 10760 + }, + { + "epoch": 0.8684529093697038, + "grad_norm": 0.7011905312538147, + "learning_rate": 8.881475791301405e-05, + "loss": 2.5391, + "step": 10761 + }, + { + "epoch": 0.8685336131062869, + "grad_norm": 0.6508356928825378, + "learning_rate": 8.879907022018686e-05, + "loss": 2.4892, + "step": 10762 + }, + { + "epoch": 0.8686143168428698, + "grad_norm": 0.7104009985923767, + "learning_rate": 8.878338280651605e-05, + "loss": 2.5152, + "step": 10763 + }, + { + "epoch": 0.8686950205794528, + "grad_norm": 0.6501138210296631, + "learning_rate": 8.876769567239268e-05, + "loss": 2.5767, + "step": 10764 + }, + { + "epoch": 0.8687757243160358, + "grad_norm": 0.6463173031806946, + "learning_rate": 8.875200881820771e-05, + "loss": 2.4758, + "step": 10765 + }, + { + "epoch": 0.8688564280526189, + "grad_norm": 0.6494991779327393, + "learning_rate": 8.873632224435206e-05, + "loss": 2.5364, + "step": 10766 + }, + { + "epoch": 0.8689371317892018, + "grad_norm": 0.6926043033599854, + "learning_rate": 8.872063595121671e-05, + "loss": 2.5288, + "step": 10767 + }, + { + "epoch": 0.8690178355257848, + "grad_norm": 0.7076035737991333, + "learning_rate": 8.870494993919261e-05, + "loss": 2.5118, + "step": 10768 + }, + { + "epoch": 0.8690985392623678, + "grad_norm": 0.6456892490386963, + "learning_rate": 8.868926420867068e-05, + "loss": 2.4957, + "step": 10769 + }, + 
{ + "epoch": 0.8691792429989509, + "grad_norm": 0.6585200428962708, + "learning_rate": 8.867357876004183e-05, + "loss": 2.5049, + "step": 10770 + }, + { + "epoch": 0.8692599467355339, + "grad_norm": 0.6893252730369568, + "learning_rate": 8.865789359369706e-05, + "loss": 2.4808, + "step": 10771 + }, + { + "epoch": 0.8693406504721168, + "grad_norm": 0.6700639724731445, + "learning_rate": 8.864220871002719e-05, + "loss": 2.5475, + "step": 10772 + }, + { + "epoch": 0.8694213542086998, + "grad_norm": 0.6551913619041443, + "learning_rate": 8.862652410942315e-05, + "loss": 2.5063, + "step": 10773 + }, + { + "epoch": 0.8695020579452829, + "grad_norm": 0.6870427131652832, + "learning_rate": 8.86108397922759e-05, + "loss": 2.5785, + "step": 10774 + }, + { + "epoch": 0.8695827616818659, + "grad_norm": 0.6489934325218201, + "learning_rate": 8.859515575897626e-05, + "loss": 2.5584, + "step": 10775 + }, + { + "epoch": 0.8696634654184489, + "grad_norm": 0.6726663112640381, + "learning_rate": 8.857947200991517e-05, + "loss": 2.5707, + "step": 10776 + }, + { + "epoch": 0.8697441691550318, + "grad_norm": 0.7696183323860168, + "learning_rate": 8.856378854548347e-05, + "loss": 2.501, + "step": 10777 + }, + { + "epoch": 0.8698248728916149, + "grad_norm": 0.7002642154693604, + "learning_rate": 8.854810536607212e-05, + "loss": 2.5792, + "step": 10778 + }, + { + "epoch": 0.8699055766281979, + "grad_norm": 0.6429435610771179, + "learning_rate": 8.853242247207185e-05, + "loss": 2.5463, + "step": 10779 + }, + { + "epoch": 0.8699862803647809, + "grad_norm": 0.7006216645240784, + "learning_rate": 8.851673986387358e-05, + "loss": 2.5698, + "step": 10780 + }, + { + "epoch": 0.8700669841013638, + "grad_norm": 0.7053292989730835, + "learning_rate": 8.850105754186824e-05, + "loss": 2.5468, + "step": 10781 + }, + { + "epoch": 0.8701476878379469, + "grad_norm": 0.6592122912406921, + "learning_rate": 8.848537550644654e-05, + "loss": 2.5271, + "step": 10782 + }, + { + "epoch": 0.8702283915745299, + 
"grad_norm": 0.679132342338562, + "learning_rate": 8.846969375799941e-05, + "loss": 2.5281, + "step": 10783 + }, + { + "epoch": 0.8703090953111129, + "grad_norm": 0.6868568062782288, + "learning_rate": 8.845401229691765e-05, + "loss": 2.5415, + "step": 10784 + }, + { + "epoch": 0.8703897990476959, + "grad_norm": 0.7060674428939819, + "learning_rate": 8.843833112359208e-05, + "loss": 2.5649, + "step": 10785 + }, + { + "epoch": 0.870470502784279, + "grad_norm": 0.6663981676101685, + "learning_rate": 8.842265023841352e-05, + "loss": 2.5055, + "step": 10786 + }, + { + "epoch": 0.8705512065208619, + "grad_norm": 0.7095218896865845, + "learning_rate": 8.840696964177282e-05, + "loss": 2.5442, + "step": 10787 + }, + { + "epoch": 0.8706319102574449, + "grad_norm": 0.6884104013442993, + "learning_rate": 8.839128933406069e-05, + "loss": 2.5285, + "step": 10788 + }, + { + "epoch": 0.8707126139940279, + "grad_norm": 0.6427462697029114, + "learning_rate": 8.837560931566798e-05, + "loss": 2.5197, + "step": 10789 + }, + { + "epoch": 0.870793317730611, + "grad_norm": 0.6870493292808533, + "learning_rate": 8.835992958698548e-05, + "loss": 2.4937, + "step": 10790 + }, + { + "epoch": 0.870874021467194, + "grad_norm": 0.7006319761276245, + "learning_rate": 8.834425014840398e-05, + "loss": 2.5148, + "step": 10791 + }, + { + "epoch": 0.8709547252037769, + "grad_norm": 0.690601646900177, + "learning_rate": 8.83285710003142e-05, + "loss": 2.5454, + "step": 10792 + }, + { + "epoch": 0.8710354289403599, + "grad_norm": 0.7205955982208252, + "learning_rate": 8.831289214310695e-05, + "loss": 2.5221, + "step": 10793 + }, + { + "epoch": 0.871116132676943, + "grad_norm": 0.7134295105934143, + "learning_rate": 8.8297213577173e-05, + "loss": 2.5626, + "step": 10794 + }, + { + "epoch": 0.871196836413526, + "grad_norm": 0.6560496091842651, + "learning_rate": 8.828153530290307e-05, + "loss": 2.5408, + "step": 10795 + }, + { + "epoch": 0.8712775401501089, + "grad_norm": 0.7055882215499878, + 
"learning_rate": 8.82658573206879e-05, + "loss": 2.5173, + "step": 10796 + }, + { + "epoch": 0.8713582438866919, + "grad_norm": 0.6751883029937744, + "learning_rate": 8.825017963091827e-05, + "loss": 2.5378, + "step": 10797 + }, + { + "epoch": 0.871438947623275, + "grad_norm": 0.6794824600219727, + "learning_rate": 8.823450223398485e-05, + "loss": 2.592, + "step": 10798 + }, + { + "epoch": 0.871519651359858, + "grad_norm": 0.675729513168335, + "learning_rate": 8.821882513027838e-05, + "loss": 2.5253, + "step": 10799 + }, + { + "epoch": 0.871600355096441, + "grad_norm": 0.7185894250869751, + "learning_rate": 8.820314832018962e-05, + "loss": 2.5073, + "step": 10800 + }, + { + "epoch": 0.8716810588330239, + "grad_norm": 0.6605187654495239, + "learning_rate": 8.818747180410921e-05, + "loss": 2.5141, + "step": 10801 + }, + { + "epoch": 0.871761762569607, + "grad_norm": 0.6955205798149109, + "learning_rate": 8.817179558242788e-05, + "loss": 2.5313, + "step": 10802 + }, + { + "epoch": 0.87184246630619, + "grad_norm": 0.6307928562164307, + "learning_rate": 8.815611965553638e-05, + "loss": 2.4975, + "step": 10803 + }, + { + "epoch": 0.871923170042773, + "grad_norm": 0.7283728122711182, + "learning_rate": 8.814044402382527e-05, + "loss": 2.4623, + "step": 10804 + }, + { + "epoch": 0.872003873779356, + "grad_norm": 0.7019702792167664, + "learning_rate": 8.81247686876853e-05, + "loss": 2.4755, + "step": 10805 + }, + { + "epoch": 0.872084577515939, + "grad_norm": 0.6769137382507324, + "learning_rate": 8.81090936475072e-05, + "loss": 2.59, + "step": 10806 + }, + { + "epoch": 0.872165281252522, + "grad_norm": 0.6185588836669922, + "learning_rate": 8.80934189036815e-05, + "loss": 2.5308, + "step": 10807 + }, + { + "epoch": 0.872245984989105, + "grad_norm": 0.7127000689506531, + "learning_rate": 8.807774445659894e-05, + "loss": 2.5301, + "step": 10808 + }, + { + "epoch": 0.872326688725688, + "grad_norm": 0.7039114236831665, + "learning_rate": 8.806207030665016e-05, + "loss": 
2.5176, + "step": 10809 + }, + { + "epoch": 0.8724073924622711, + "grad_norm": 0.6763370633125305, + "learning_rate": 8.804639645422582e-05, + "loss": 2.5324, + "step": 10810 + }, + { + "epoch": 0.872488096198854, + "grad_norm": 0.7546409368515015, + "learning_rate": 8.803072289971648e-05, + "loss": 2.5446, + "step": 10811 + }, + { + "epoch": 0.872568799935437, + "grad_norm": 0.6916004419326782, + "learning_rate": 8.801504964351284e-05, + "loss": 2.5056, + "step": 10812 + }, + { + "epoch": 0.87264950367202, + "grad_norm": 0.7108416557312012, + "learning_rate": 8.799937668600552e-05, + "loss": 2.5966, + "step": 10813 + }, + { + "epoch": 0.872730207408603, + "grad_norm": 0.7146576046943665, + "learning_rate": 8.798370402758506e-05, + "loss": 2.5152, + "step": 10814 + }, + { + "epoch": 0.872810911145186, + "grad_norm": 0.6708142757415771, + "learning_rate": 8.796803166864211e-05, + "loss": 2.5248, + "step": 10815 + }, + { + "epoch": 0.872891614881769, + "grad_norm": 0.6687600612640381, + "learning_rate": 8.795235960956729e-05, + "loss": 2.4451, + "step": 10816 + }, + { + "epoch": 0.872972318618352, + "grad_norm": 0.724012553691864, + "learning_rate": 8.793668785075114e-05, + "loss": 2.4816, + "step": 10817 + }, + { + "epoch": 0.873053022354935, + "grad_norm": 0.6938769221305847, + "learning_rate": 8.792101639258426e-05, + "loss": 2.5435, + "step": 10818 + }, + { + "epoch": 0.8731337260915181, + "grad_norm": 0.7066235542297363, + "learning_rate": 8.790534523545724e-05, + "loss": 2.5167, + "step": 10819 + }, + { + "epoch": 0.873214429828101, + "grad_norm": 0.7129037380218506, + "learning_rate": 8.788967437976062e-05, + "loss": 2.5079, + "step": 10820 + }, + { + "epoch": 0.873295133564684, + "grad_norm": 0.6949728727340698, + "learning_rate": 8.787400382588497e-05, + "loss": 2.5564, + "step": 10821 + }, + { + "epoch": 0.873375837301267, + "grad_norm": 0.7924233675003052, + "learning_rate": 8.785833357422088e-05, + "loss": 2.5748, + "step": 10822 + }, + { + "epoch": 
0.8734565410378501, + "grad_norm": 0.7486331462860107, + "learning_rate": 8.784266362515882e-05, + "loss": 2.565, + "step": 10823 + }, + { + "epoch": 0.8735372447744331, + "grad_norm": 0.7036460638046265, + "learning_rate": 8.782699397908935e-05, + "loss": 2.5101, + "step": 10824 + }, + { + "epoch": 0.873617948511016, + "grad_norm": 0.6691471338272095, + "learning_rate": 8.781132463640302e-05, + "loss": 2.5262, + "step": 10825 + }, + { + "epoch": 0.873698652247599, + "grad_norm": 0.6836682558059692, + "learning_rate": 8.779565559749037e-05, + "loss": 2.5651, + "step": 10826 + }, + { + "epoch": 0.8737793559841821, + "grad_norm": 0.6634507775306702, + "learning_rate": 8.777998686274185e-05, + "loss": 2.5383, + "step": 10827 + }, + { + "epoch": 0.8738600597207651, + "grad_norm": 0.6903105974197388, + "learning_rate": 8.7764318432548e-05, + "loss": 2.5659, + "step": 10828 + }, + { + "epoch": 0.8739407634573481, + "grad_norm": 0.737859308719635, + "learning_rate": 8.774865030729937e-05, + "loss": 2.5859, + "step": 10829 + }, + { + "epoch": 0.874021467193931, + "grad_norm": 0.696843683719635, + "learning_rate": 8.773298248738633e-05, + "loss": 2.5244, + "step": 10830 + }, + { + "epoch": 0.8741021709305141, + "grad_norm": 0.7342235445976257, + "learning_rate": 8.771731497319946e-05, + "loss": 2.5073, + "step": 10831 + }, + { + "epoch": 0.8741828746670971, + "grad_norm": 0.6676939725875854, + "learning_rate": 8.770164776512926e-05, + "loss": 2.5408, + "step": 10832 + }, + { + "epoch": 0.8742635784036801, + "grad_norm": 0.6957886219024658, + "learning_rate": 8.768598086356608e-05, + "loss": 2.5566, + "step": 10833 + }, + { + "epoch": 0.874344282140263, + "grad_norm": 0.6938990950584412, + "learning_rate": 8.767031426890046e-05, + "loss": 2.517, + "step": 10834 + }, + { + "epoch": 0.8744249858768461, + "grad_norm": 0.8387169241905212, + "learning_rate": 8.765464798152286e-05, + "loss": 2.5507, + "step": 10835 + }, + { + "epoch": 0.8745056896134291, + "grad_norm": 
0.6396276354789734, + "learning_rate": 8.763898200182368e-05, + "loss": 2.5063, + "step": 10836 + }, + { + "epoch": 0.8745863933500121, + "grad_norm": 0.7122719883918762, + "learning_rate": 8.762331633019339e-05, + "loss": 2.5816, + "step": 10837 + }, + { + "epoch": 0.8746670970865951, + "grad_norm": 0.6807141304016113, + "learning_rate": 8.760765096702244e-05, + "loss": 2.6004, + "step": 10838 + }, + { + "epoch": 0.8747478008231782, + "grad_norm": 0.6764848232269287, + "learning_rate": 8.759198591270117e-05, + "loss": 2.5303, + "step": 10839 + }, + { + "epoch": 0.8748285045597611, + "grad_norm": 0.718515932559967, + "learning_rate": 8.757632116762006e-05, + "loss": 2.5088, + "step": 10840 + }, + { + "epoch": 0.8749092082963441, + "grad_norm": 0.7084362506866455, + "learning_rate": 8.75606567321695e-05, + "loss": 2.5496, + "step": 10841 + }, + { + "epoch": 0.8749899120329271, + "grad_norm": 0.7191734910011292, + "learning_rate": 8.754499260673991e-05, + "loss": 2.5525, + "step": 10842 + }, + { + "epoch": 0.8750706157695102, + "grad_norm": 0.7167977094650269, + "learning_rate": 8.752932879172164e-05, + "loss": 2.5479, + "step": 10843 + }, + { + "epoch": 0.8751513195060932, + "grad_norm": 0.6994979381561279, + "learning_rate": 8.751366528750511e-05, + "loss": 2.4942, + "step": 10844 + }, + { + "epoch": 0.8752320232426761, + "grad_norm": 0.7192725539207458, + "learning_rate": 8.749800209448068e-05, + "loss": 2.5233, + "step": 10845 + }, + { + "epoch": 0.8753127269792591, + "grad_norm": 0.7728807330131531, + "learning_rate": 8.748233921303871e-05, + "loss": 2.5698, + "step": 10846 + }, + { + "epoch": 0.8753934307158422, + "grad_norm": 0.7305434942245483, + "learning_rate": 8.746667664356956e-05, + "loss": 2.5096, + "step": 10847 + }, + { + "epoch": 0.8754741344524252, + "grad_norm": 0.7117629051208496, + "learning_rate": 8.745101438646365e-05, + "loss": 2.5272, + "step": 10848 + }, + { + "epoch": 0.8755548381890081, + "grad_norm": 0.7180361151695251, + "learning_rate": 
8.743535244211121e-05, + "loss": 2.4718, + "step": 10849 + }, + { + "epoch": 0.8756355419255911, + "grad_norm": 0.6419457793235779, + "learning_rate": 8.741969081090263e-05, + "loss": 2.5407, + "step": 10850 + }, + { + "epoch": 0.8757162456621742, + "grad_norm": 0.7928328514099121, + "learning_rate": 8.740402949322827e-05, + "loss": 2.488, + "step": 10851 + }, + { + "epoch": 0.8757969493987572, + "grad_norm": 0.7449139952659607, + "learning_rate": 8.738836848947839e-05, + "loss": 2.5943, + "step": 10852 + }, + { + "epoch": 0.8758776531353402, + "grad_norm": 0.7919576168060303, + "learning_rate": 8.737270780004334e-05, + "loss": 2.5556, + "step": 10853 + }, + { + "epoch": 0.8759583568719231, + "grad_norm": 0.6867526769638062, + "learning_rate": 8.735704742531346e-05, + "loss": 2.5395, + "step": 10854 + }, + { + "epoch": 0.8760390606085062, + "grad_norm": 0.7195394039154053, + "learning_rate": 8.734138736567896e-05, + "loss": 2.4404, + "step": 10855 + }, + { + "epoch": 0.8761197643450892, + "grad_norm": 0.68385910987854, + "learning_rate": 8.732572762153016e-05, + "loss": 2.502, + "step": 10856 + }, + { + "epoch": 0.8762004680816722, + "grad_norm": 0.6957393884658813, + "learning_rate": 8.731006819325739e-05, + "loss": 2.5788, + "step": 10857 + }, + { + "epoch": 0.8762811718182552, + "grad_norm": 0.6973037123680115, + "learning_rate": 8.729440908125092e-05, + "loss": 2.4927, + "step": 10858 + }, + { + "epoch": 0.8763618755548382, + "grad_norm": 0.6535985469818115, + "learning_rate": 8.727875028590095e-05, + "loss": 2.596, + "step": 10859 + }, + { + "epoch": 0.8764425792914212, + "grad_norm": 0.7447848320007324, + "learning_rate": 8.726309180759777e-05, + "loss": 2.5825, + "step": 10860 + }, + { + "epoch": 0.8765232830280042, + "grad_norm": 0.7155942320823669, + "learning_rate": 8.724743364673168e-05, + "loss": 2.5105, + "step": 10861 + }, + { + "epoch": 0.8766039867645872, + "grad_norm": 0.6664694547653198, + "learning_rate": 8.723177580369285e-05, + "loss": 2.5244, 
+ "step": 10862 + }, + { + "epoch": 0.8766846905011701, + "grad_norm": 0.7437852025032043, + "learning_rate": 8.721611827887153e-05, + "loss": 2.534, + "step": 10863 + }, + { + "epoch": 0.8767653942377532, + "grad_norm": 0.6752577424049377, + "learning_rate": 8.7200461072658e-05, + "loss": 2.5025, + "step": 10864 + }, + { + "epoch": 0.8768460979743362, + "grad_norm": 0.7420764565467834, + "learning_rate": 8.718480418544241e-05, + "loss": 2.5261, + "step": 10865 + }, + { + "epoch": 0.8769268017109192, + "grad_norm": 0.669384777545929, + "learning_rate": 8.7169147617615e-05, + "loss": 2.5258, + "step": 10866 + }, + { + "epoch": 0.8770075054475022, + "grad_norm": 0.6649587750434875, + "learning_rate": 8.715349136956599e-05, + "loss": 2.5308, + "step": 10867 + }, + { + "epoch": 0.8770882091840853, + "grad_norm": 0.728922426700592, + "learning_rate": 8.713783544168552e-05, + "loss": 2.5251, + "step": 10868 + }, + { + "epoch": 0.8771689129206682, + "grad_norm": 0.6957671642303467, + "learning_rate": 8.712217983436384e-05, + "loss": 2.5818, + "step": 10869 + }, + { + "epoch": 0.8772496166572512, + "grad_norm": 0.6796830892562866, + "learning_rate": 8.710652454799108e-05, + "loss": 2.5122, + "step": 10870 + }, + { + "epoch": 0.8773303203938342, + "grad_norm": 0.7230980396270752, + "learning_rate": 8.709086958295746e-05, + "loss": 2.5836, + "step": 10871 + }, + { + "epoch": 0.8774110241304173, + "grad_norm": 0.6992264986038208, + "learning_rate": 8.707521493965309e-05, + "loss": 2.5907, + "step": 10872 + }, + { + "epoch": 0.8774917278670002, + "grad_norm": 0.7066535353660583, + "learning_rate": 8.705956061846816e-05, + "loss": 2.5508, + "step": 10873 + }, + { + "epoch": 0.8775724316035832, + "grad_norm": 0.6559327244758606, + "learning_rate": 8.704390661979283e-05, + "loss": 2.611, + "step": 10874 + }, + { + "epoch": 0.8776531353401662, + "grad_norm": 0.6673287749290466, + "learning_rate": 8.70282529440172e-05, + "loss": 2.5778, + "step": 10875 + }, + { + "epoch": 
0.8777338390767493, + "grad_norm": 0.6715971231460571, + "learning_rate": 8.701259959153139e-05, + "loss": 2.5342, + "step": 10876 + }, + { + "epoch": 0.8778145428133323, + "grad_norm": 0.7456488609313965, + "learning_rate": 8.699694656272557e-05, + "loss": 2.5365, + "step": 10877 + }, + { + "epoch": 0.8778952465499152, + "grad_norm": 0.6658159494400024, + "learning_rate": 8.698129385798983e-05, + "loss": 2.4387, + "step": 10878 + }, + { + "epoch": 0.8779759502864982, + "grad_norm": 0.6653816103935242, + "learning_rate": 8.696564147771427e-05, + "loss": 2.5791, + "step": 10879 + }, + { + "epoch": 0.8780566540230813, + "grad_norm": 0.6763200163841248, + "learning_rate": 8.694998942228902e-05, + "loss": 2.5356, + "step": 10880 + }, + { + "epoch": 0.8781373577596643, + "grad_norm": 0.6534504890441895, + "learning_rate": 8.69343376921041e-05, + "loss": 2.5358, + "step": 10881 + }, + { + "epoch": 0.8782180614962473, + "grad_norm": 0.6341667771339417, + "learning_rate": 8.691868628754967e-05, + "loss": 2.4927, + "step": 10882 + }, + { + "epoch": 0.8782987652328302, + "grad_norm": 0.6215559244155884, + "learning_rate": 8.690303520901579e-05, + "loss": 2.4312, + "step": 10883 + }, + { + "epoch": 0.8783794689694133, + "grad_norm": 0.6705841422080994, + "learning_rate": 8.688738445689248e-05, + "loss": 2.4778, + "step": 10884 + }, + { + "epoch": 0.8784601727059963, + "grad_norm": 0.680275559425354, + "learning_rate": 8.687173403156982e-05, + "loss": 2.5577, + "step": 10885 + }, + { + "epoch": 0.8785408764425793, + "grad_norm": 0.6918728351593018, + "learning_rate": 8.685608393343789e-05, + "loss": 2.5212, + "step": 10886 + }, + { + "epoch": 0.8786215801791623, + "grad_norm": 0.623636782169342, + "learning_rate": 8.68404341628867e-05, + "loss": 2.5131, + "step": 10887 + }, + { + "epoch": 0.8787022839157453, + "grad_norm": 0.7200562357902527, + "learning_rate": 8.682478472030628e-05, + "loss": 2.5517, + "step": 10888 + }, + { + "epoch": 0.8787829876523283, + "grad_norm": 
0.6902644634246826, + "learning_rate": 8.680913560608666e-05, + "loss": 2.511, + "step": 10889 + }, + { + "epoch": 0.8788636913889113, + "grad_norm": 0.6855802536010742, + "learning_rate": 8.679348682061792e-05, + "loss": 2.5169, + "step": 10890 + }, + { + "epoch": 0.8789443951254943, + "grad_norm": 0.7229284048080444, + "learning_rate": 8.677783836428995e-05, + "loss": 2.5634, + "step": 10891 + }, + { + "epoch": 0.8790250988620774, + "grad_norm": 0.6350376605987549, + "learning_rate": 8.676219023749281e-05, + "loss": 2.443, + "step": 10892 + }, + { + "epoch": 0.8791058025986603, + "grad_norm": 0.6884307265281677, + "learning_rate": 8.674654244061653e-05, + "loss": 2.524, + "step": 10893 + }, + { + "epoch": 0.8791865063352433, + "grad_norm": 0.6571067571640015, + "learning_rate": 8.673089497405102e-05, + "loss": 2.5322, + "step": 10894 + }, + { + "epoch": 0.8792672100718263, + "grad_norm": 0.7078021764755249, + "learning_rate": 8.67152478381863e-05, + "loss": 2.5317, + "step": 10895 + }, + { + "epoch": 0.8793479138084094, + "grad_norm": 0.6809059381484985, + "learning_rate": 8.669960103341236e-05, + "loss": 2.5767, + "step": 10896 + }, + { + "epoch": 0.8794286175449924, + "grad_norm": 0.7399441003799438, + "learning_rate": 8.66839545601191e-05, + "loss": 2.5194, + "step": 10897 + }, + { + "epoch": 0.8795093212815753, + "grad_norm": 0.6762270927429199, + "learning_rate": 8.66683084186965e-05, + "loss": 2.5306, + "step": 10898 + }, + { + "epoch": 0.8795900250181583, + "grad_norm": 0.7394620776176453, + "learning_rate": 8.665266260953455e-05, + "loss": 2.4516, + "step": 10899 + }, + { + "epoch": 0.8796707287547414, + "grad_norm": 0.6775416135787964, + "learning_rate": 8.663701713302309e-05, + "loss": 2.5574, + "step": 10900 + }, + { + "epoch": 0.8797514324913244, + "grad_norm": 0.7630520462989807, + "learning_rate": 8.66213719895521e-05, + "loss": 2.5516, + "step": 10901 + }, + { + "epoch": 0.8798321362279073, + "grad_norm": 0.6555768847465515, + "learning_rate": 
8.660572717951149e-05, + "loss": 2.5267, + "step": 10902 + }, + { + "epoch": 0.8799128399644903, + "grad_norm": 0.6899500489234924, + "learning_rate": 8.659008270329119e-05, + "loss": 2.4938, + "step": 10903 + }, + { + "epoch": 0.8799935437010734, + "grad_norm": 0.6939221024513245, + "learning_rate": 8.657443856128107e-05, + "loss": 2.5358, + "step": 10904 + }, + { + "epoch": 0.8800742474376564, + "grad_norm": 0.6454630494117737, + "learning_rate": 8.655879475387102e-05, + "loss": 2.5528, + "step": 10905 + }, + { + "epoch": 0.8801549511742394, + "grad_norm": 0.7142425775527954, + "learning_rate": 8.654315128145099e-05, + "loss": 2.5668, + "step": 10906 + }, + { + "epoch": 0.8802356549108223, + "grad_norm": 0.7512764930725098, + "learning_rate": 8.652750814441075e-05, + "loss": 2.5224, + "step": 10907 + }, + { + "epoch": 0.8803163586474054, + "grad_norm": 0.6599575877189636, + "learning_rate": 8.651186534314026e-05, + "loss": 2.5363, + "step": 10908 + }, + { + "epoch": 0.8803970623839884, + "grad_norm": 0.6787410974502563, + "learning_rate": 8.649622287802935e-05, + "loss": 2.4587, + "step": 10909 + }, + { + "epoch": 0.8804777661205714, + "grad_norm": 0.7124783396720886, + "learning_rate": 8.648058074946786e-05, + "loss": 2.5842, + "step": 10910 + }, + { + "epoch": 0.8805584698571544, + "grad_norm": 0.6698839664459229, + "learning_rate": 8.646493895784562e-05, + "loss": 2.513, + "step": 10911 + }, + { + "epoch": 0.8806391735937374, + "grad_norm": 0.6660044193267822, + "learning_rate": 8.644929750355249e-05, + "loss": 2.4996, + "step": 10912 + }, + { + "epoch": 0.8807198773303204, + "grad_norm": 0.7060455083847046, + "learning_rate": 8.643365638697828e-05, + "loss": 2.5497, + "step": 10913 + }, + { + "epoch": 0.8808005810669034, + "grad_norm": 0.6835277676582336, + "learning_rate": 8.641801560851281e-05, + "loss": 2.5198, + "step": 10914 + }, + { + "epoch": 0.8808812848034864, + "grad_norm": 0.6994042992591858, + "learning_rate": 8.640237516854595e-05, + "loss": 
2.5692, + "step": 10915 + }, + { + "epoch": 0.8809619885400694, + "grad_norm": 0.6583377718925476, + "learning_rate": 8.63867350674674e-05, + "loss": 2.5025, + "step": 10916 + }, + { + "epoch": 0.8810426922766524, + "grad_norm": 0.6882332563400269, + "learning_rate": 8.637109530566698e-05, + "loss": 2.5343, + "step": 10917 + }, + { + "epoch": 0.8811233960132354, + "grad_norm": 0.6329876184463501, + "learning_rate": 8.635545588353449e-05, + "loss": 2.5335, + "step": 10918 + }, + { + "epoch": 0.8812040997498184, + "grad_norm": 0.713196337223053, + "learning_rate": 8.633981680145975e-05, + "loss": 2.4814, + "step": 10919 + }, + { + "epoch": 0.8812848034864014, + "grad_norm": 0.7388820648193359, + "learning_rate": 8.632417805983246e-05, + "loss": 2.4927, + "step": 10920 + }, + { + "epoch": 0.8813655072229845, + "grad_norm": 0.7316160798072815, + "learning_rate": 8.63085396590424e-05, + "loss": 2.508, + "step": 10921 + }, + { + "epoch": 0.8814462109595674, + "grad_norm": 0.6690139174461365, + "learning_rate": 8.629290159947934e-05, + "loss": 2.5719, + "step": 10922 + }, + { + "epoch": 0.8815269146961504, + "grad_norm": 0.6369553208351135, + "learning_rate": 8.627726388153297e-05, + "loss": 2.5277, + "step": 10923 + }, + { + "epoch": 0.8816076184327334, + "grad_norm": 0.6870365738868713, + "learning_rate": 8.626162650559306e-05, + "loss": 2.4731, + "step": 10924 + }, + { + "epoch": 0.8816883221693165, + "grad_norm": 0.6890872716903687, + "learning_rate": 8.624598947204938e-05, + "loss": 2.5417, + "step": 10925 + }, + { + "epoch": 0.8817690259058995, + "grad_norm": 0.6548230051994324, + "learning_rate": 8.623035278129156e-05, + "loss": 2.4888, + "step": 10926 + }, + { + "epoch": 0.8818497296424824, + "grad_norm": 0.6835262775421143, + "learning_rate": 8.621471643370933e-05, + "loss": 2.531, + "step": 10927 + }, + { + "epoch": 0.8819304333790654, + "grad_norm": 0.6910626292228699, + "learning_rate": 8.619908042969243e-05, + "loss": 2.4864, + "step": 10928 + }, + { + 
"epoch": 0.8820111371156485, + "grad_norm": 0.6727725267410278, + "learning_rate": 8.618344476963049e-05, + "loss": 2.5063, + "step": 10929 + }, + { + "epoch": 0.8820918408522315, + "grad_norm": 0.7285245656967163, + "learning_rate": 8.616780945391323e-05, + "loss": 2.5036, + "step": 10930 + }, + { + "epoch": 0.8821725445888144, + "grad_norm": 0.6561840176582336, + "learning_rate": 8.615217448293035e-05, + "loss": 2.5152, + "step": 10931 + }, + { + "epoch": 0.8822532483253974, + "grad_norm": 0.6524627208709717, + "learning_rate": 8.613653985707144e-05, + "loss": 2.4827, + "step": 10932 + }, + { + "epoch": 0.8823339520619805, + "grad_norm": 0.6815671920776367, + "learning_rate": 8.612090557672619e-05, + "loss": 2.5385, + "step": 10933 + }, + { + "epoch": 0.8824146557985635, + "grad_norm": 0.7479865550994873, + "learning_rate": 8.610527164228429e-05, + "loss": 2.5311, + "step": 10934 + }, + { + "epoch": 0.8824953595351465, + "grad_norm": 0.699504554271698, + "learning_rate": 8.608963805413535e-05, + "loss": 2.5332, + "step": 10935 + }, + { + "epoch": 0.8825760632717294, + "grad_norm": 0.7081198692321777, + "learning_rate": 8.607400481266896e-05, + "loss": 2.5636, + "step": 10936 + }, + { + "epoch": 0.8826567670083125, + "grad_norm": 0.7020730972290039, + "learning_rate": 8.605837191827478e-05, + "loss": 2.498, + "step": 10937 + }, + { + "epoch": 0.8827374707448955, + "grad_norm": 0.8004096150398254, + "learning_rate": 8.604273937134242e-05, + "loss": 2.5352, + "step": 10938 + }, + { + "epoch": 0.8828181744814785, + "grad_norm": 0.6399645209312439, + "learning_rate": 8.602710717226147e-05, + "loss": 2.5673, + "step": 10939 + }, + { + "epoch": 0.8828988782180615, + "grad_norm": 0.683195173740387, + "learning_rate": 8.601147532142153e-05, + "loss": 2.4812, + "step": 10940 + }, + { + "epoch": 0.8829795819546445, + "grad_norm": 0.7783642411231995, + "learning_rate": 8.599584381921224e-05, + "loss": 2.4812, + "step": 10941 + }, + { + "epoch": 0.8830602856912275, + 
"grad_norm": 0.7107423543930054, + "learning_rate": 8.598021266602308e-05, + "loss": 2.5527, + "step": 10942 + }, + { + "epoch": 0.8831409894278105, + "grad_norm": 0.6419345140457153, + "learning_rate": 8.596458186224365e-05, + "loss": 2.5642, + "step": 10943 + }, + { + "epoch": 0.8832216931643935, + "grad_norm": 0.6897309422492981, + "learning_rate": 8.59489514082636e-05, + "loss": 2.5743, + "step": 10944 + }, + { + "epoch": 0.8833023969009766, + "grad_norm": 0.6901495456695557, + "learning_rate": 8.593332130447236e-05, + "loss": 2.5139, + "step": 10945 + }, + { + "epoch": 0.8833831006375595, + "grad_norm": 0.6865388751029968, + "learning_rate": 8.591769155125953e-05, + "loss": 2.5281, + "step": 10946 + }, + { + "epoch": 0.8834638043741425, + "grad_norm": 0.7070403099060059, + "learning_rate": 8.590206214901465e-05, + "loss": 2.4648, + "step": 10947 + }, + { + "epoch": 0.8835445081107255, + "grad_norm": 0.6846395134925842, + "learning_rate": 8.588643309812721e-05, + "loss": 2.4792, + "step": 10948 + }, + { + "epoch": 0.8836252118473086, + "grad_norm": 0.6875495314598083, + "learning_rate": 8.587080439898675e-05, + "loss": 2.5126, + "step": 10949 + }, + { + "epoch": 0.8837059155838916, + "grad_norm": 0.670098066329956, + "learning_rate": 8.58551760519828e-05, + "loss": 2.4922, + "step": 10950 + }, + { + "epoch": 0.8837866193204745, + "grad_norm": 0.6675527691841125, + "learning_rate": 8.583954805750487e-05, + "loss": 2.499, + "step": 10951 + }, + { + "epoch": 0.8838673230570575, + "grad_norm": 0.6694127321243286, + "learning_rate": 8.582392041594236e-05, + "loss": 2.5286, + "step": 10952 + }, + { + "epoch": 0.8839480267936406, + "grad_norm": 0.7291092872619629, + "learning_rate": 8.580829312768482e-05, + "loss": 2.5705, + "step": 10953 + }, + { + "epoch": 0.8840287305302236, + "grad_norm": 0.709904670715332, + "learning_rate": 8.579266619312174e-05, + "loss": 2.5238, + "step": 10954 + }, + { + "epoch": 0.8841094342668065, + "grad_norm": 0.7037622332572937, + 
"learning_rate": 8.577703961264254e-05, + "loss": 2.5491, + "step": 10955 + }, + { + "epoch": 0.8841901380033895, + "grad_norm": 0.7553049325942993, + "learning_rate": 8.576141338663668e-05, + "loss": 2.5643, + "step": 10956 + }, + { + "epoch": 0.8842708417399726, + "grad_norm": 0.7177377343177795, + "learning_rate": 8.574578751549364e-05, + "loss": 2.49, + "step": 10957 + }, + { + "epoch": 0.8843515454765556, + "grad_norm": 0.682668149471283, + "learning_rate": 8.573016199960283e-05, + "loss": 2.5221, + "step": 10958 + }, + { + "epoch": 0.8844322492131386, + "grad_norm": 0.7508956789970398, + "learning_rate": 8.571453683935366e-05, + "loss": 2.5766, + "step": 10959 + }, + { + "epoch": 0.8845129529497215, + "grad_norm": 0.6495946645736694, + "learning_rate": 8.569891203513562e-05, + "loss": 2.534, + "step": 10960 + }, + { + "epoch": 0.8845936566863046, + "grad_norm": 0.7362824082374573, + "learning_rate": 8.568328758733806e-05, + "loss": 2.4614, + "step": 10961 + }, + { + "epoch": 0.8846743604228876, + "grad_norm": 0.6571496725082397, + "learning_rate": 8.566766349635037e-05, + "loss": 2.4393, + "step": 10962 + }, + { + "epoch": 0.8847550641594706, + "grad_norm": 0.7088329195976257, + "learning_rate": 8.5652039762562e-05, + "loss": 2.5476, + "step": 10963 + }, + { + "epoch": 0.8848357678960536, + "grad_norm": 0.6414440274238586, + "learning_rate": 8.56364163863623e-05, + "loss": 2.4668, + "step": 10964 + }, + { + "epoch": 0.8849164716326365, + "grad_norm": 0.7333478331565857, + "learning_rate": 8.562079336814063e-05, + "loss": 2.5151, + "step": 10965 + }, + { + "epoch": 0.8849971753692196, + "grad_norm": 0.638038694858551, + "learning_rate": 8.560517070828638e-05, + "loss": 2.5063, + "step": 10966 + }, + { + "epoch": 0.8850778791058026, + "grad_norm": 0.638921320438385, + "learning_rate": 8.558954840718896e-05, + "loss": 2.4769, + "step": 10967 + }, + { + "epoch": 0.8851585828423856, + "grad_norm": 0.6923465728759766, + "learning_rate": 8.557392646523759e-05, + 
"loss": 2.5388, + "step": 10968 + }, + { + "epoch": 0.8852392865789686, + "grad_norm": 0.7095212936401367, + "learning_rate": 8.555830488282169e-05, + "loss": 2.4955, + "step": 10969 + }, + { + "epoch": 0.8853199903155516, + "grad_norm": 0.689908504486084, + "learning_rate": 8.554268366033065e-05, + "loss": 2.4998, + "step": 10970 + }, + { + "epoch": 0.8854006940521346, + "grad_norm": 0.6551975011825562, + "learning_rate": 8.552706279815366e-05, + "loss": 2.4965, + "step": 10971 + }, + { + "epoch": 0.8854813977887176, + "grad_norm": 0.7239118218421936, + "learning_rate": 8.551144229668012e-05, + "loss": 2.5785, + "step": 10972 + }, + { + "epoch": 0.8855621015253006, + "grad_norm": 0.6743230819702148, + "learning_rate": 8.549582215629932e-05, + "loss": 2.5146, + "step": 10973 + }, + { + "epoch": 0.8856428052618837, + "grad_norm": 0.6991584300994873, + "learning_rate": 8.548020237740052e-05, + "loss": 2.5524, + "step": 10974 + }, + { + "epoch": 0.8857235089984666, + "grad_norm": 0.6605305075645447, + "learning_rate": 8.546458296037304e-05, + "loss": 2.5505, + "step": 10975 + }, + { + "epoch": 0.8858042127350496, + "grad_norm": 0.7011568546295166, + "learning_rate": 8.54489639056062e-05, + "loss": 2.4381, + "step": 10976 + }, + { + "epoch": 0.8858849164716326, + "grad_norm": 0.7015339136123657, + "learning_rate": 8.543334521348916e-05, + "loss": 2.5432, + "step": 10977 + }, + { + "epoch": 0.8859656202082157, + "grad_norm": 0.6892278790473938, + "learning_rate": 8.541772688441124e-05, + "loss": 2.5286, + "step": 10978 + }, + { + "epoch": 0.8860463239447987, + "grad_norm": 0.6680187582969666, + "learning_rate": 8.540210891876168e-05, + "loss": 2.439, + "step": 10979 + }, + { + "epoch": 0.8861270276813816, + "grad_norm": 0.7043240666389465, + "learning_rate": 8.538649131692975e-05, + "loss": 2.5558, + "step": 10980 + }, + { + "epoch": 0.8862077314179646, + "grad_norm": 0.6940229535102844, + "learning_rate": 8.537087407930463e-05, + "loss": 2.5219, + "step": 10981 + }, + 
{ + "epoch": 0.8862884351545477, + "grad_norm": 0.6571553945541382, + "learning_rate": 8.535525720627558e-05, + "loss": 2.5054, + "step": 10982 + }, + { + "epoch": 0.8863691388911307, + "grad_norm": 0.6846656203269958, + "learning_rate": 8.533964069823182e-05, + "loss": 2.497, + "step": 10983 + }, + { + "epoch": 0.8864498426277136, + "grad_norm": 0.6838627457618713, + "learning_rate": 8.53240245555625e-05, + "loss": 2.5495, + "step": 10984 + }, + { + "epoch": 0.8865305463642966, + "grad_norm": 0.6825091242790222, + "learning_rate": 8.530840877865687e-05, + "loss": 2.5656, + "step": 10985 + }, + { + "epoch": 0.8866112501008797, + "grad_norm": 0.7368674278259277, + "learning_rate": 8.529279336790414e-05, + "loss": 2.5378, + "step": 10986 + }, + { + "epoch": 0.8866919538374627, + "grad_norm": 0.7333693504333496, + "learning_rate": 8.527717832369338e-05, + "loss": 2.506, + "step": 10987 + }, + { + "epoch": 0.8867726575740457, + "grad_norm": 0.6623306274414062, + "learning_rate": 8.526156364641384e-05, + "loss": 2.4824, + "step": 10988 + }, + { + "epoch": 0.8868533613106286, + "grad_norm": 0.6863973140716553, + "learning_rate": 8.524594933645468e-05, + "loss": 2.536, + "step": 10989 + }, + { + "epoch": 0.8869340650472117, + "grad_norm": 0.6805100440979004, + "learning_rate": 8.523033539420501e-05, + "loss": 2.4954, + "step": 10990 + }, + { + "epoch": 0.8870147687837947, + "grad_norm": 0.6672216653823853, + "learning_rate": 8.521472182005399e-05, + "loss": 2.4893, + "step": 10991 + }, + { + "epoch": 0.8870954725203777, + "grad_norm": 0.7310158610343933, + "learning_rate": 8.519910861439079e-05, + "loss": 2.5317, + "step": 10992 + }, + { + "epoch": 0.8871761762569607, + "grad_norm": 0.6820743083953857, + "learning_rate": 8.518349577760445e-05, + "loss": 2.4482, + "step": 10993 + }, + { + "epoch": 0.8872568799935437, + "grad_norm": 0.6660269498825073, + "learning_rate": 8.516788331008411e-05, + "loss": 2.5353, + "step": 10994 + }, + { + "epoch": 0.8873375837301267, + 
"grad_norm": 0.676243007183075, + "learning_rate": 8.51522712122189e-05, + "loss": 2.531, + "step": 10995 + }, + { + "epoch": 0.8874182874667097, + "grad_norm": 0.6677152514457703, + "learning_rate": 8.513665948439796e-05, + "loss": 2.4732, + "step": 10996 + }, + { + "epoch": 0.8874989912032927, + "grad_norm": 0.7341045141220093, + "learning_rate": 8.512104812701027e-05, + "loss": 2.5668, + "step": 10997 + }, + { + "epoch": 0.8875796949398758, + "grad_norm": 0.6475326418876648, + "learning_rate": 8.510543714044496e-05, + "loss": 2.5026, + "step": 10998 + }, + { + "epoch": 0.8876603986764587, + "grad_norm": 0.7335529923439026, + "learning_rate": 8.50898265250911e-05, + "loss": 2.4946, + "step": 10999 + }, + { + "epoch": 0.8877411024130417, + "grad_norm": 0.760108232498169, + "learning_rate": 8.507421628133772e-05, + "loss": 2.5697, + "step": 11000 + }, + { + "epoch": 0.8877411024130417, + "eval_loss": 2.450413465499878, + "eval_runtime": 975.281, + "eval_samples_per_second": 2.686, + "eval_steps_per_second": 0.448, + "step": 11000 + }, + { + "epoch": 0.8878218061496247, + "grad_norm": 0.6420160531997681, + "learning_rate": 8.505860640957391e-05, + "loss": 2.5842, + "step": 11001 + }, + { + "epoch": 0.8879025098862078, + "grad_norm": 0.6625204086303711, + "learning_rate": 8.50429969101887e-05, + "loss": 2.4771, + "step": 11002 + }, + { + "epoch": 0.8879832136227908, + "grad_norm": 0.7430149912834167, + "learning_rate": 8.502738778357107e-05, + "loss": 2.5509, + "step": 11003 + }, + { + "epoch": 0.8880639173593737, + "grad_norm": 0.663624107837677, + "learning_rate": 8.501177903011008e-05, + "loss": 2.504, + "step": 11004 + }, + { + "epoch": 0.8881446210959567, + "grad_norm": 0.6638087630271912, + "learning_rate": 8.499617065019476e-05, + "loss": 2.492, + "step": 11005 + }, + { + "epoch": 0.8882253248325398, + "grad_norm": 0.7321780323982239, + "learning_rate": 8.498056264421406e-05, + "loss": 2.5808, + "step": 11006 + }, + { + "epoch": 0.8883060285691228, + 
"grad_norm": 0.7108619809150696, + "learning_rate": 8.4964955012557e-05, + "loss": 2.6185, + "step": 11007 + }, + { + "epoch": 0.8883867323057058, + "grad_norm": 0.6745856404304504, + "learning_rate": 8.494934775561258e-05, + "loss": 2.576, + "step": 11008 + }, + { + "epoch": 0.8884674360422887, + "grad_norm": 0.8002225756645203, + "learning_rate": 8.493374087376976e-05, + "loss": 2.5598, + "step": 11009 + }, + { + "epoch": 0.8885481397788718, + "grad_norm": 0.6848840713500977, + "learning_rate": 8.491813436741746e-05, + "loss": 2.5218, + "step": 11010 + }, + { + "epoch": 0.8886288435154548, + "grad_norm": 0.6464105248451233, + "learning_rate": 8.490252823694471e-05, + "loss": 2.5503, + "step": 11011 + }, + { + "epoch": 0.8887095472520378, + "grad_norm": 0.7165790796279907, + "learning_rate": 8.488692248274045e-05, + "loss": 2.5104, + "step": 11012 + }, + { + "epoch": 0.8887902509886207, + "grad_norm": 0.6832898259162903, + "learning_rate": 8.487131710519355e-05, + "loss": 2.5379, + "step": 11013 + }, + { + "epoch": 0.8888709547252038, + "grad_norm": 0.6992432475090027, + "learning_rate": 8.485571210469296e-05, + "loss": 2.5388, + "step": 11014 + }, + { + "epoch": 0.8889516584617868, + "grad_norm": 0.6410119533538818, + "learning_rate": 8.484010748162765e-05, + "loss": 2.5237, + "step": 11015 + }, + { + "epoch": 0.8890323621983698, + "grad_norm": 0.716248095035553, + "learning_rate": 8.482450323638647e-05, + "loss": 2.4977, + "step": 11016 + }, + { + "epoch": 0.8891130659349528, + "grad_norm": 0.6620567440986633, + "learning_rate": 8.480889936935833e-05, + "loss": 2.5088, + "step": 11017 + }, + { + "epoch": 0.8891937696715357, + "grad_norm": 0.7311015129089355, + "learning_rate": 8.479329588093217e-05, + "loss": 2.5547, + "step": 11018 + }, + { + "epoch": 0.8892744734081188, + "grad_norm": 0.757203996181488, + "learning_rate": 8.477769277149676e-05, + "loss": 2.5681, + "step": 11019 + }, + { + "epoch": 0.8893551771447018, + "grad_norm": 0.6941282153129578, + 
"learning_rate": 8.476209004144107e-05, + "loss": 2.5078, + "step": 11020 + }, + { + "epoch": 0.8894358808812848, + "grad_norm": 0.6381667256355286, + "learning_rate": 8.474648769115396e-05, + "loss": 2.5371, + "step": 11021 + }, + { + "epoch": 0.8895165846178678, + "grad_norm": 0.7978621125221252, + "learning_rate": 8.473088572102422e-05, + "loss": 2.5384, + "step": 11022 + }, + { + "epoch": 0.8895972883544508, + "grad_norm": 0.7229189872741699, + "learning_rate": 8.471528413144072e-05, + "loss": 2.5469, + "step": 11023 + }, + { + "epoch": 0.8896779920910338, + "grad_norm": 0.705545961856842, + "learning_rate": 8.469968292279231e-05, + "loss": 2.5281, + "step": 11024 + }, + { + "epoch": 0.8897586958276168, + "grad_norm": 0.7259972095489502, + "learning_rate": 8.468408209546777e-05, + "loss": 2.5485, + "step": 11025 + }, + { + "epoch": 0.8898393995641998, + "grad_norm": 0.6859608888626099, + "learning_rate": 8.466848164985594e-05, + "loss": 2.5548, + "step": 11026 + }, + { + "epoch": 0.8899201033007829, + "grad_norm": 0.7036644816398621, + "learning_rate": 8.465288158634565e-05, + "loss": 2.5159, + "step": 11027 + }, + { + "epoch": 0.8900008070373658, + "grad_norm": 0.6899380087852478, + "learning_rate": 8.463728190532569e-05, + "loss": 2.5037, + "step": 11028 + }, + { + "epoch": 0.8900815107739488, + "grad_norm": 0.7428410649299622, + "learning_rate": 8.462168260718477e-05, + "loss": 2.5074, + "step": 11029 + }, + { + "epoch": 0.8901622145105318, + "grad_norm": 0.6724158525466919, + "learning_rate": 8.460608369231173e-05, + "loss": 2.5544, + "step": 11030 + }, + { + "epoch": 0.8902429182471149, + "grad_norm": 0.6516450643539429, + "learning_rate": 8.459048516109535e-05, + "loss": 2.5152, + "step": 11031 + }, + { + "epoch": 0.8903236219836979, + "grad_norm": 0.7013405561447144, + "learning_rate": 8.457488701392434e-05, + "loss": 2.5116, + "step": 11032 + }, + { + "epoch": 0.8904043257202808, + "grad_norm": 0.7207479476928711, + "learning_rate": 
8.455928925118747e-05, + "loss": 2.6041, + "step": 11033 + }, + { + "epoch": 0.8904850294568638, + "grad_norm": 0.69600510597229, + "learning_rate": 8.454369187327348e-05, + "loss": 2.5794, + "step": 11034 + }, + { + "epoch": 0.8905657331934469, + "grad_norm": 0.6831288933753967, + "learning_rate": 8.452809488057108e-05, + "loss": 2.4682, + "step": 11035 + }, + { + "epoch": 0.8906464369300299, + "grad_norm": 0.6978991627693176, + "learning_rate": 8.451249827346901e-05, + "loss": 2.4862, + "step": 11036 + }, + { + "epoch": 0.8907271406666128, + "grad_norm": 0.6772337555885315, + "learning_rate": 8.4496902052356e-05, + "loss": 2.5357, + "step": 11037 + }, + { + "epoch": 0.8908078444031958, + "grad_norm": 0.6735778450965881, + "learning_rate": 8.448130621762067e-05, + "loss": 2.5115, + "step": 11038 + }, + { + "epoch": 0.8908885481397789, + "grad_norm": 0.6695345044136047, + "learning_rate": 8.446571076965177e-05, + "loss": 2.5083, + "step": 11039 + }, + { + "epoch": 0.8909692518763619, + "grad_norm": 0.685343325138092, + "learning_rate": 8.445011570883796e-05, + "loss": 2.5221, + "step": 11040 + }, + { + "epoch": 0.8910499556129449, + "grad_norm": 0.7030319571495056, + "learning_rate": 8.443452103556792e-05, + "loss": 2.5708, + "step": 11041 + }, + { + "epoch": 0.8911306593495278, + "grad_norm": 0.6910343766212463, + "learning_rate": 8.441892675023029e-05, + "loss": 2.5373, + "step": 11042 + }, + { + "epoch": 0.8912113630861109, + "grad_norm": 0.7207868099212646, + "learning_rate": 8.440333285321374e-05, + "loss": 2.5862, + "step": 11043 + }, + { + "epoch": 0.8912920668226939, + "grad_norm": 0.6780788898468018, + "learning_rate": 8.438773934490692e-05, + "loss": 2.562, + "step": 11044 + }, + { + "epoch": 0.8913727705592769, + "grad_norm": 0.7010074257850647, + "learning_rate": 8.437214622569842e-05, + "loss": 2.4556, + "step": 11045 + }, + { + "epoch": 0.8914534742958599, + "grad_norm": 0.6763667464256287, + "learning_rate": 8.435655349597689e-05, + "loss": 2.5402, + 
"step": 11046 + }, + { + "epoch": 0.891534178032443, + "grad_norm": 0.6870944499969482, + "learning_rate": 8.4340961156131e-05, + "loss": 2.5307, + "step": 11047 + }, + { + "epoch": 0.8916148817690259, + "grad_norm": 0.7835623025894165, + "learning_rate": 8.432536920654923e-05, + "loss": 2.4974, + "step": 11048 + }, + { + "epoch": 0.8916955855056089, + "grad_norm": 0.7551318407058716, + "learning_rate": 8.430977764762024e-05, + "loss": 2.5206, + "step": 11049 + }, + { + "epoch": 0.8917762892421919, + "grad_norm": 0.6486842632293701, + "learning_rate": 8.429418647973265e-05, + "loss": 2.4909, + "step": 11050 + }, + { + "epoch": 0.891856992978775, + "grad_norm": 0.6894064545631409, + "learning_rate": 8.427859570327494e-05, + "loss": 2.5846, + "step": 11051 + }, + { + "epoch": 0.8919376967153579, + "grad_norm": 0.7597395181655884, + "learning_rate": 8.426300531863571e-05, + "loss": 2.5259, + "step": 11052 + }, + { + "epoch": 0.8920184004519409, + "grad_norm": 0.6784652471542358, + "learning_rate": 8.42474153262036e-05, + "loss": 2.5048, + "step": 11053 + }, + { + "epoch": 0.8920991041885239, + "grad_norm": 0.7703847885131836, + "learning_rate": 8.4231825726367e-05, + "loss": 2.4962, + "step": 11054 + }, + { + "epoch": 0.892179807925107, + "grad_norm": 0.6646561026573181, + "learning_rate": 8.421623651951454e-05, + "loss": 2.491, + "step": 11055 + }, + { + "epoch": 0.89226051166169, + "grad_norm": 0.6901054978370667, + "learning_rate": 8.420064770603475e-05, + "loss": 2.515, + "step": 11056 + }, + { + "epoch": 0.8923412153982729, + "grad_norm": 0.6789328455924988, + "learning_rate": 8.41850592863161e-05, + "loss": 2.5481, + "step": 11057 + }, + { + "epoch": 0.8924219191348559, + "grad_norm": 0.6211017370223999, + "learning_rate": 8.41694712607471e-05, + "loss": 2.51, + "step": 11058 + }, + { + "epoch": 0.892502622871439, + "grad_norm": 0.6482260823249817, + "learning_rate": 8.415388362971626e-05, + "loss": 2.5418, + "step": 11059 + }, + { + "epoch": 0.892583326608022, 
+ "grad_norm": 0.7627651691436768, + "learning_rate": 8.413829639361209e-05, + "loss": 2.5033, + "step": 11060 + }, + { + "epoch": 0.892664030344605, + "grad_norm": 0.6560852527618408, + "learning_rate": 8.412270955282302e-05, + "loss": 2.5442, + "step": 11061 + }, + { + "epoch": 0.8927447340811879, + "grad_norm": 0.7479087114334106, + "learning_rate": 8.410712310773752e-05, + "loss": 2.5189, + "step": 11062 + }, + { + "epoch": 0.892825437817771, + "grad_norm": 0.6970879435539246, + "learning_rate": 8.409153705874411e-05, + "loss": 2.5418, + "step": 11063 + }, + { + "epoch": 0.892906141554354, + "grad_norm": 0.6514548659324646, + "learning_rate": 8.407595140623113e-05, + "loss": 2.5277, + "step": 11064 + }, + { + "epoch": 0.892986845290937, + "grad_norm": 0.6745554804801941, + "learning_rate": 8.406036615058707e-05, + "loss": 2.5085, + "step": 11065 + }, + { + "epoch": 0.89306754902752, + "grad_norm": 0.7510363459587097, + "learning_rate": 8.404478129220037e-05, + "loss": 2.4941, + "step": 11066 + }, + { + "epoch": 0.8931482527641029, + "grad_norm": 0.6531470417976379, + "learning_rate": 8.402919683145941e-05, + "loss": 2.5363, + "step": 11067 + }, + { + "epoch": 0.893228956500686, + "grad_norm": 0.6861493587493896, + "learning_rate": 8.401361276875262e-05, + "loss": 2.6369, + "step": 11068 + }, + { + "epoch": 0.893309660237269, + "grad_norm": 0.6029497981071472, + "learning_rate": 8.39980291044684e-05, + "loss": 2.4953, + "step": 11069 + }, + { + "epoch": 0.893390363973852, + "grad_norm": 0.6831715106964111, + "learning_rate": 8.39824458389951e-05, + "loss": 2.5074, + "step": 11070 + }, + { + "epoch": 0.8934710677104349, + "grad_norm": 0.7076299786567688, + "learning_rate": 8.396686297272112e-05, + "loss": 2.5934, + "step": 11071 + }, + { + "epoch": 0.893551771447018, + "grad_norm": 0.6941438913345337, + "learning_rate": 8.395128050603487e-05, + "loss": 2.5338, + "step": 11072 + }, + { + "epoch": 0.893632475183601, + "grad_norm": 0.6867249011993408, + 
"learning_rate": 8.393569843932463e-05, + "loss": 2.5311, + "step": 11073 + }, + { + "epoch": 0.893713178920184, + "grad_norm": 0.623991847038269, + "learning_rate": 8.392011677297877e-05, + "loss": 2.5133, + "step": 11074 + }, + { + "epoch": 0.893793882656767, + "grad_norm": 0.6808422803878784, + "learning_rate": 8.390453550738564e-05, + "loss": 2.5398, + "step": 11075 + }, + { + "epoch": 0.89387458639335, + "grad_norm": 0.7136701345443726, + "learning_rate": 8.388895464293357e-05, + "loss": 2.5415, + "step": 11076 + }, + { + "epoch": 0.893955290129933, + "grad_norm": 0.6814287304878235, + "learning_rate": 8.387337418001084e-05, + "loss": 2.4782, + "step": 11077 + }, + { + "epoch": 0.894035993866516, + "grad_norm": 0.8101940155029297, + "learning_rate": 8.385779411900579e-05, + "loss": 2.5292, + "step": 11078 + }, + { + "epoch": 0.894116697603099, + "grad_norm": 0.7106796503067017, + "learning_rate": 8.384221446030676e-05, + "loss": 2.5819, + "step": 11079 + }, + { + "epoch": 0.8941974013396821, + "grad_norm": 0.7840015292167664, + "learning_rate": 8.382663520430191e-05, + "loss": 2.5243, + "step": 11080 + }, + { + "epoch": 0.894278105076265, + "grad_norm": 0.7037288546562195, + "learning_rate": 8.381105635137959e-05, + "loss": 2.5606, + "step": 11081 + }, + { + "epoch": 0.894358808812848, + "grad_norm": 0.671558678150177, + "learning_rate": 8.379547790192812e-05, + "loss": 2.4923, + "step": 11082 + }, + { + "epoch": 0.894439512549431, + "grad_norm": 0.6789675951004028, + "learning_rate": 8.377989985633567e-05, + "loss": 2.5281, + "step": 11083 + }, + { + "epoch": 0.8945202162860141, + "grad_norm": 0.6777840852737427, + "learning_rate": 8.37643222149905e-05, + "loss": 2.5159, + "step": 11084 + }, + { + "epoch": 0.8946009200225971, + "grad_norm": 0.6920693516731262, + "learning_rate": 8.374874497828089e-05, + "loss": 2.4952, + "step": 11085 + }, + { + "epoch": 0.89468162375918, + "grad_norm": 0.7394022941589355, + "learning_rate": 8.373316814659502e-05, + "loss": 
2.5035, + "step": 11086 + }, + { + "epoch": 0.894762327495763, + "grad_norm": 0.625960648059845, + "learning_rate": 8.37175917203211e-05, + "loss": 2.5324, + "step": 11087 + }, + { + "epoch": 0.8948430312323461, + "grad_norm": 0.6848758459091187, + "learning_rate": 8.370201569984742e-05, + "loss": 2.5312, + "step": 11088 + }, + { + "epoch": 0.8949237349689291, + "grad_norm": 0.7207037210464478, + "learning_rate": 8.368644008556205e-05, + "loss": 2.5807, + "step": 11089 + }, + { + "epoch": 0.895004438705512, + "grad_norm": 0.7582261562347412, + "learning_rate": 8.367086487785326e-05, + "loss": 2.532, + "step": 11090 + }, + { + "epoch": 0.895085142442095, + "grad_norm": 0.6916806101799011, + "learning_rate": 8.36552900771092e-05, + "loss": 2.4772, + "step": 11091 + }, + { + "epoch": 0.8951658461786781, + "grad_norm": 0.6457386016845703, + "learning_rate": 8.363971568371805e-05, + "loss": 2.4952, + "step": 11092 + }, + { + "epoch": 0.8952465499152611, + "grad_norm": 0.7006754279136658, + "learning_rate": 8.362414169806792e-05, + "loss": 2.5818, + "step": 11093 + }, + { + "epoch": 0.8953272536518441, + "grad_norm": 0.6939932703971863, + "learning_rate": 8.3608568120547e-05, + "loss": 2.5411, + "step": 11094 + }, + { + "epoch": 0.895407957388427, + "grad_norm": 0.6314546465873718, + "learning_rate": 8.359299495154343e-05, + "loss": 2.5408, + "step": 11095 + }, + { + "epoch": 0.8954886611250101, + "grad_norm": 0.7202826738357544, + "learning_rate": 8.357742219144529e-05, + "loss": 2.4925, + "step": 11096 + }, + { + "epoch": 0.8955693648615931, + "grad_norm": 0.6475295424461365, + "learning_rate": 8.356184984064071e-05, + "loss": 2.5023, + "step": 11097 + }, + { + "epoch": 0.8956500685981761, + "grad_norm": 0.6161238551139832, + "learning_rate": 8.354627789951785e-05, + "loss": 2.5053, + "step": 11098 + }, + { + "epoch": 0.8957307723347591, + "grad_norm": 0.6919825077056885, + "learning_rate": 8.353070636846472e-05, + "loss": 2.5387, + "step": 11099 + }, + { + "epoch": 
0.8958114760713421, + "grad_norm": 0.6374878883361816, + "learning_rate": 8.351513524786944e-05, + "loss": 2.5526, + "step": 11100 + }, + { + "epoch": 0.8958921798079251, + "grad_norm": 0.7041093707084656, + "learning_rate": 8.349956453812009e-05, + "loss": 2.5282, + "step": 11101 + }, + { + "epoch": 0.8959728835445081, + "grad_norm": 0.7252324819564819, + "learning_rate": 8.348399423960471e-05, + "loss": 2.5723, + "step": 11102 + }, + { + "epoch": 0.8960535872810911, + "grad_norm": 0.681682825088501, + "learning_rate": 8.346842435271137e-05, + "loss": 2.5284, + "step": 11103 + }, + { + "epoch": 0.8961342910176742, + "grad_norm": 0.7293850183486938, + "learning_rate": 8.34528548778281e-05, + "loss": 2.5014, + "step": 11104 + }, + { + "epoch": 0.8962149947542571, + "grad_norm": 0.7057846188545227, + "learning_rate": 8.343728581534299e-05, + "loss": 2.5502, + "step": 11105 + }, + { + "epoch": 0.8962956984908401, + "grad_norm": 0.6740830540657043, + "learning_rate": 8.342171716564398e-05, + "loss": 2.5205, + "step": 11106 + }, + { + "epoch": 0.8963764022274231, + "grad_norm": 0.6917470097541809, + "learning_rate": 8.340614892911907e-05, + "loss": 2.5216, + "step": 11107 + }, + { + "epoch": 0.8964571059640062, + "grad_norm": 0.7495635151863098, + "learning_rate": 8.339058110615638e-05, + "loss": 2.5509, + "step": 11108 + }, + { + "epoch": 0.8965378097005892, + "grad_norm": 0.6687765717506409, + "learning_rate": 8.33750136971438e-05, + "loss": 2.5286, + "step": 11109 + }, + { + "epoch": 0.8966185134371721, + "grad_norm": 0.6901381015777588, + "learning_rate": 8.335944670246931e-05, + "loss": 2.5545, + "step": 11110 + }, + { + "epoch": 0.8966992171737551, + "grad_norm": 0.6645506024360657, + "learning_rate": 8.334388012252094e-05, + "loss": 2.4883, + "step": 11111 + }, + { + "epoch": 0.8967799209103382, + "grad_norm": 0.6427997350692749, + "learning_rate": 8.332831395768662e-05, + "loss": 2.5103, + "step": 11112 + }, + { + "epoch": 0.8968606246469212, + "grad_norm": 
0.7224035263061523, + "learning_rate": 8.331274820835425e-05, + "loss": 2.5086, + "step": 11113 + }, + { + "epoch": 0.8969413283835042, + "grad_norm": 0.6918233036994934, + "learning_rate": 8.329718287491188e-05, + "loss": 2.5222, + "step": 11114 + }, + { + "epoch": 0.8970220321200871, + "grad_norm": 0.735583484172821, + "learning_rate": 8.328161795774734e-05, + "loss": 2.5277, + "step": 11115 + }, + { + "epoch": 0.8971027358566702, + "grad_norm": 0.6624864339828491, + "learning_rate": 8.326605345724857e-05, + "loss": 2.532, + "step": 11116 + }, + { + "epoch": 0.8971834395932532, + "grad_norm": 0.6227770447731018, + "learning_rate": 8.325048937380352e-05, + "loss": 2.5386, + "step": 11117 + }, + { + "epoch": 0.8972641433298362, + "grad_norm": 0.6483022570610046, + "learning_rate": 8.323492570780004e-05, + "loss": 2.4958, + "step": 11118 + }, + { + "epoch": 0.8973448470664191, + "grad_norm": 0.7072618007659912, + "learning_rate": 8.321936245962602e-05, + "loss": 2.4931, + "step": 11119 + }, + { + "epoch": 0.8974255508030021, + "grad_norm": 0.6848764419555664, + "learning_rate": 8.320379962966937e-05, + "loss": 2.4549, + "step": 11120 + }, + { + "epoch": 0.8975062545395852, + "grad_norm": 0.6819620132446289, + "learning_rate": 8.318823721831795e-05, + "loss": 2.5156, + "step": 11121 + }, + { + "epoch": 0.8975869582761682, + "grad_norm": 0.6834476590156555, + "learning_rate": 8.31726752259596e-05, + "loss": 2.507, + "step": 11122 + }, + { + "epoch": 0.8976676620127512, + "grad_norm": 0.6785772442817688, + "learning_rate": 8.315711365298214e-05, + "loss": 2.5086, + "step": 11123 + }, + { + "epoch": 0.8977483657493341, + "grad_norm": 0.6303566098213196, + "learning_rate": 8.314155249977351e-05, + "loss": 2.5087, + "step": 11124 + }, + { + "epoch": 0.8978290694859172, + "grad_norm": 0.6544361710548401, + "learning_rate": 8.31259917667214e-05, + "loss": 2.505, + "step": 11125 + }, + { + "epoch": 0.8979097732225002, + "grad_norm": 0.8135818243026733, + "learning_rate": 
8.311043145421369e-05, + "loss": 2.5139, + "step": 11126 + }, + { + "epoch": 0.8979904769590832, + "grad_norm": 0.6744341254234314, + "learning_rate": 8.309487156263818e-05, + "loss": 2.4797, + "step": 11127 + }, + { + "epoch": 0.8980711806956662, + "grad_norm": 0.6138790845870972, + "learning_rate": 8.307931209238267e-05, + "loss": 2.5334, + "step": 11128 + }, + { + "epoch": 0.8981518844322492, + "grad_norm": 0.702434241771698, + "learning_rate": 8.306375304383492e-05, + "loss": 2.5343, + "step": 11129 + }, + { + "epoch": 0.8982325881688322, + "grad_norm": 0.6787155270576477, + "learning_rate": 8.304819441738275e-05, + "loss": 2.507, + "step": 11130 + }, + { + "epoch": 0.8983132919054152, + "grad_norm": 0.6963719129562378, + "learning_rate": 8.303263621341386e-05, + "loss": 2.5238, + "step": 11131 + }, + { + "epoch": 0.8983939956419982, + "grad_norm": 0.6623271107673645, + "learning_rate": 8.3017078432316e-05, + "loss": 2.5206, + "step": 11132 + }, + { + "epoch": 0.8984746993785813, + "grad_norm": 0.777222752571106, + "learning_rate": 8.300152107447701e-05, + "loss": 2.5004, + "step": 11133 + }, + { + "epoch": 0.8985554031151642, + "grad_norm": 0.6788455247879028, + "learning_rate": 8.29859641402845e-05, + "loss": 2.5735, + "step": 11134 + }, + { + "epoch": 0.8986361068517472, + "grad_norm": 0.6595063209533691, + "learning_rate": 8.297040763012624e-05, + "loss": 2.4988, + "step": 11135 + }, + { + "epoch": 0.8987168105883302, + "grad_norm": 0.7105697989463806, + "learning_rate": 8.295485154438994e-05, + "loss": 2.5531, + "step": 11136 + }, + { + "epoch": 0.8987975143249133, + "grad_norm": 0.6884949803352356, + "learning_rate": 8.29392958834633e-05, + "loss": 2.5158, + "step": 11137 + }, + { + "epoch": 0.8988782180614963, + "grad_norm": 0.7178345322608948, + "learning_rate": 8.2923740647734e-05, + "loss": 2.5836, + "step": 11138 + }, + { + "epoch": 0.8989589217980792, + "grad_norm": 0.7000541687011719, + "learning_rate": 8.290818583758973e-05, + "loss": 2.5345, + 
"step": 11139 + }, + { + "epoch": 0.8990396255346622, + "grad_norm": 0.6808128952980042, + "learning_rate": 8.289263145341816e-05, + "loss": 2.5227, + "step": 11140 + }, + { + "epoch": 0.8991203292712453, + "grad_norm": 0.7047473788261414, + "learning_rate": 8.287707749560691e-05, + "loss": 2.477, + "step": 11141 + }, + { + "epoch": 0.8992010330078283, + "grad_norm": 0.6654812693595886, + "learning_rate": 8.286152396454365e-05, + "loss": 2.4575, + "step": 11142 + }, + { + "epoch": 0.8992817367444113, + "grad_norm": 0.6690360307693481, + "learning_rate": 8.284597086061603e-05, + "loss": 2.4755, + "step": 11143 + }, + { + "epoch": 0.8993624404809942, + "grad_norm": 0.7270147204399109, + "learning_rate": 8.283041818421164e-05, + "loss": 2.5893, + "step": 11144 + }, + { + "epoch": 0.8994431442175773, + "grad_norm": 0.5977498888969421, + "learning_rate": 8.28148659357181e-05, + "loss": 2.5108, + "step": 11145 + }, + { + "epoch": 0.8995238479541603, + "grad_norm": 0.694593071937561, + "learning_rate": 8.279931411552307e-05, + "loss": 2.5036, + "step": 11146 + }, + { + "epoch": 0.8996045516907433, + "grad_norm": 0.7395440936088562, + "learning_rate": 8.278376272401404e-05, + "loss": 2.5244, + "step": 11147 + }, + { + "epoch": 0.8996852554273262, + "grad_norm": 0.6483517289161682, + "learning_rate": 8.276821176157867e-05, + "loss": 2.5619, + "step": 11148 + }, + { + "epoch": 0.8997659591639093, + "grad_norm": 0.6996768116950989, + "learning_rate": 8.275266122860454e-05, + "loss": 2.5275, + "step": 11149 + }, + { + "epoch": 0.8998466629004923, + "grad_norm": 0.661122739315033, + "learning_rate": 8.273711112547914e-05, + "loss": 2.5053, + "step": 11150 + }, + { + "epoch": 0.8999273666370753, + "grad_norm": 0.6919111609458923, + "learning_rate": 8.272156145259006e-05, + "loss": 2.578, + "step": 11151 + }, + { + "epoch": 0.9000080703736583, + "grad_norm": 0.6680958867073059, + "learning_rate": 8.270601221032482e-05, + "loss": 2.4942, + "step": 11152 + }, + { + "epoch": 
0.9000887741102414, + "grad_norm": 0.6782989501953125, + "learning_rate": 8.269046339907101e-05, + "loss": 2.5461, + "step": 11153 + }, + { + "epoch": 0.9001694778468243, + "grad_norm": 0.743468165397644, + "learning_rate": 8.267491501921605e-05, + "loss": 2.629, + "step": 11154 + }, + { + "epoch": 0.9002501815834073, + "grad_norm": 0.709562361240387, + "learning_rate": 8.265936707114751e-05, + "loss": 2.566, + "step": 11155 + }, + { + "epoch": 0.9003308853199903, + "grad_norm": 0.7075676918029785, + "learning_rate": 8.264381955525291e-05, + "loss": 2.5409, + "step": 11156 + }, + { + "epoch": 0.9004115890565734, + "grad_norm": 0.7021335959434509, + "learning_rate": 8.262827247191963e-05, + "loss": 2.5606, + "step": 11157 + }, + { + "epoch": 0.9004922927931563, + "grad_norm": 0.6507331132888794, + "learning_rate": 8.261272582153524e-05, + "loss": 2.5557, + "step": 11158 + }, + { + "epoch": 0.9005729965297393, + "grad_norm": 0.7182760238647461, + "learning_rate": 8.25971796044872e-05, + "loss": 2.5567, + "step": 11159 + }, + { + "epoch": 0.9006537002663223, + "grad_norm": 0.6632338762283325, + "learning_rate": 8.258163382116291e-05, + "loss": 2.5081, + "step": 11160 + }, + { + "epoch": 0.9007344040029054, + "grad_norm": 0.6889928579330444, + "learning_rate": 8.256608847194983e-05, + "loss": 2.5034, + "step": 11161 + }, + { + "epoch": 0.9008151077394884, + "grad_norm": 0.6374824047088623, + "learning_rate": 8.255054355723542e-05, + "loss": 2.4826, + "step": 11162 + }, + { + "epoch": 0.9008958114760713, + "grad_norm": 0.7100771069526672, + "learning_rate": 8.253499907740706e-05, + "loss": 2.4666, + "step": 11163 + }, + { + "epoch": 0.9009765152126543, + "grad_norm": 0.8141123652458191, + "learning_rate": 8.251945503285218e-05, + "loss": 2.5339, + "step": 11164 + }, + { + "epoch": 0.9010572189492374, + "grad_norm": 0.6621670722961426, + "learning_rate": 8.250391142395822e-05, + "loss": 2.4805, + "step": 11165 + }, + { + "epoch": 0.9011379226858204, + "grad_norm": 
0.6624772548675537, + "learning_rate": 8.248836825111245e-05, + "loss": 2.5148, + "step": 11166 + }, + { + "epoch": 0.9012186264224034, + "grad_norm": 0.6783565282821655, + "learning_rate": 8.247282551470235e-05, + "loss": 2.4481, + "step": 11167 + }, + { + "epoch": 0.9012993301589863, + "grad_norm": 0.700089156627655, + "learning_rate": 8.245728321511525e-05, + "loss": 2.5649, + "step": 11168 + }, + { + "epoch": 0.9013800338955693, + "grad_norm": 0.6765339970588684, + "learning_rate": 8.244174135273852e-05, + "loss": 2.5221, + "step": 11169 + }, + { + "epoch": 0.9014607376321524, + "grad_norm": 0.6896056532859802, + "learning_rate": 8.242619992795948e-05, + "loss": 2.4742, + "step": 11170 + }, + { + "epoch": 0.9015414413687354, + "grad_norm": 0.7134374976158142, + "learning_rate": 8.241065894116547e-05, + "loss": 2.5231, + "step": 11171 + }, + { + "epoch": 0.9016221451053184, + "grad_norm": 0.6939442753791809, + "learning_rate": 8.239511839274385e-05, + "loss": 2.5159, + "step": 11172 + }, + { + "epoch": 0.9017028488419013, + "grad_norm": 0.6780345439910889, + "learning_rate": 8.237957828308187e-05, + "loss": 2.5474, + "step": 11173 + }, + { + "epoch": 0.9017835525784844, + "grad_norm": 0.6532382965087891, + "learning_rate": 8.236403861256687e-05, + "loss": 2.4982, + "step": 11174 + }, + { + "epoch": 0.9018642563150674, + "grad_norm": 0.6918137073516846, + "learning_rate": 8.234849938158615e-05, + "loss": 2.4657, + "step": 11175 + }, + { + "epoch": 0.9019449600516504, + "grad_norm": 0.6838762164115906, + "learning_rate": 8.233296059052695e-05, + "loss": 2.5405, + "step": 11176 + }, + { + "epoch": 0.9020256637882333, + "grad_norm": 0.7560290098190308, + "learning_rate": 8.231742223977653e-05, + "loss": 2.5379, + "step": 11177 + }, + { + "epoch": 0.9021063675248164, + "grad_norm": 0.6673319339752197, + "learning_rate": 8.230188432972221e-05, + "loss": 2.4669, + "step": 11178 + }, + { + "epoch": 0.9021870712613994, + "grad_norm": 0.7486294507980347, + 
"learning_rate": 8.228634686075116e-05, + "loss": 2.526, + "step": 11179 + }, + { + "epoch": 0.9022677749979824, + "grad_norm": 0.7012811303138733, + "learning_rate": 8.227080983325067e-05, + "loss": 2.5544, + "step": 11180 + }, + { + "epoch": 0.9023484787345654, + "grad_norm": 0.6807447075843811, + "learning_rate": 8.225527324760796e-05, + "loss": 2.5139, + "step": 11181 + }, + { + "epoch": 0.9024291824711484, + "grad_norm": 0.7594932317733765, + "learning_rate": 8.223973710421018e-05, + "loss": 2.539, + "step": 11182 + }, + { + "epoch": 0.9025098862077314, + "grad_norm": 0.6764204502105713, + "learning_rate": 8.22242014034446e-05, + "loss": 2.6128, + "step": 11183 + }, + { + "epoch": 0.9025905899443144, + "grad_norm": 0.6499967575073242, + "learning_rate": 8.220866614569837e-05, + "loss": 2.5459, + "step": 11184 + }, + { + "epoch": 0.9026712936808974, + "grad_norm": 0.673076331615448, + "learning_rate": 8.219313133135876e-05, + "loss": 2.5852, + "step": 11185 + }, + { + "epoch": 0.9027519974174805, + "grad_norm": 0.784854531288147, + "learning_rate": 8.21775969608128e-05, + "loss": 2.5586, + "step": 11186 + }, + { + "epoch": 0.9028327011540634, + "grad_norm": 0.658963680267334, + "learning_rate": 8.216206303444771e-05, + "loss": 2.4376, + "step": 11187 + }, + { + "epoch": 0.9029134048906464, + "grad_norm": 0.6456249356269836, + "learning_rate": 8.214652955265067e-05, + "loss": 2.5166, + "step": 11188 + }, + { + "epoch": 0.9029941086272294, + "grad_norm": 0.6940007209777832, + "learning_rate": 8.213099651580874e-05, + "loss": 2.4992, + "step": 11189 + }, + { + "epoch": 0.9030748123638125, + "grad_norm": 0.6661425828933716, + "learning_rate": 8.211546392430911e-05, + "loss": 2.5177, + "step": 11190 + }, + { + "epoch": 0.9031555161003955, + "grad_norm": 0.647834300994873, + "learning_rate": 8.20999317785389e-05, + "loss": 2.4666, + "step": 11191 + }, + { + "epoch": 0.9032362198369784, + "grad_norm": 0.7673383355140686, + "learning_rate": 8.208440007888515e-05, + 
"loss": 2.4852, + "step": 11192 + }, + { + "epoch": 0.9033169235735614, + "grad_norm": 0.7033390998840332, + "learning_rate": 8.206886882573498e-05, + "loss": 2.5549, + "step": 11193 + }, + { + "epoch": 0.9033976273101445, + "grad_norm": 0.6871141195297241, + "learning_rate": 8.205333801947548e-05, + "loss": 2.4585, + "step": 11194 + }, + { + "epoch": 0.9034783310467275, + "grad_norm": 0.7201984524726868, + "learning_rate": 8.20378076604937e-05, + "loss": 2.5271, + "step": 11195 + }, + { + "epoch": 0.9035590347833105, + "grad_norm": 0.704060971736908, + "learning_rate": 8.202227774917671e-05, + "loss": 2.4915, + "step": 11196 + }, + { + "epoch": 0.9036397385198934, + "grad_norm": 0.6833879947662354, + "learning_rate": 8.200674828591156e-05, + "loss": 2.4496, + "step": 11197 + }, + { + "epoch": 0.9037204422564765, + "grad_norm": 0.6564866304397583, + "learning_rate": 8.199121927108527e-05, + "loss": 2.4818, + "step": 11198 + }, + { + "epoch": 0.9038011459930595, + "grad_norm": 0.6970151662826538, + "learning_rate": 8.197569070508486e-05, + "loss": 2.5812, + "step": 11199 + }, + { + "epoch": 0.9038818497296425, + "grad_norm": 0.7147194743156433, + "learning_rate": 8.196016258829737e-05, + "loss": 2.5543, + "step": 11200 + }, + { + "epoch": 0.9039625534662254, + "grad_norm": 0.6357648968696594, + "learning_rate": 8.194463492110981e-05, + "loss": 2.5254, + "step": 11201 + }, + { + "epoch": 0.9040432572028085, + "grad_norm": 0.7113756537437439, + "learning_rate": 8.19291077039091e-05, + "loss": 2.5179, + "step": 11202 + }, + { + "epoch": 0.9041239609393915, + "grad_norm": 0.7252987623214722, + "learning_rate": 8.191358093708228e-05, + "loss": 2.5658, + "step": 11203 + }, + { + "epoch": 0.9042046646759745, + "grad_norm": 0.7095803618431091, + "learning_rate": 8.189805462101631e-05, + "loss": 2.583, + "step": 11204 + }, + { + "epoch": 0.9042853684125575, + "grad_norm": 0.7447760105133057, + "learning_rate": 8.188252875609812e-05, + "loss": 2.5608, + "step": 11205 + }, + { 
+ "epoch": 0.9043660721491406, + "grad_norm": 0.6578439474105835, + "learning_rate": 8.186700334271468e-05, + "loss": 2.508, + "step": 11206 + }, + { + "epoch": 0.9044467758857235, + "grad_norm": 0.6776832938194275, + "learning_rate": 8.185147838125296e-05, + "loss": 2.6188, + "step": 11207 + }, + { + "epoch": 0.9045274796223065, + "grad_norm": 0.6559253931045532, + "learning_rate": 8.183595387209976e-05, + "loss": 2.5307, + "step": 11208 + }, + { + "epoch": 0.9046081833588895, + "grad_norm": 0.7078405022621155, + "learning_rate": 8.18204298156421e-05, + "loss": 2.5545, + "step": 11209 + }, + { + "epoch": 0.9046888870954726, + "grad_norm": 0.6790273189544678, + "learning_rate": 8.18049062122669e-05, + "loss": 2.4963, + "step": 11210 + }, + { + "epoch": 0.9047695908320555, + "grad_norm": 0.6888250708580017, + "learning_rate": 8.178938306236095e-05, + "loss": 2.5108, + "step": 11211 + }, + { + "epoch": 0.9048502945686385, + "grad_norm": 0.6438474059104919, + "learning_rate": 8.177386036631119e-05, + "loss": 2.4976, + "step": 11212 + }, + { + "epoch": 0.9049309983052215, + "grad_norm": 0.6786646842956543, + "learning_rate": 8.175833812450445e-05, + "loss": 2.4584, + "step": 11213 + }, + { + "epoch": 0.9050117020418046, + "grad_norm": 0.6480324268341064, + "learning_rate": 8.174281633732764e-05, + "loss": 2.5021, + "step": 11214 + }, + { + "epoch": 0.9050924057783876, + "grad_norm": 0.7232171893119812, + "learning_rate": 8.172729500516756e-05, + "loss": 2.4742, + "step": 11215 + }, + { + "epoch": 0.9051731095149705, + "grad_norm": 0.7048845291137695, + "learning_rate": 8.171177412841105e-05, + "loss": 2.518, + "step": 11216 + }, + { + "epoch": 0.9052538132515535, + "grad_norm": 0.6363180875778198, + "learning_rate": 8.169625370744496e-05, + "loss": 2.5154, + "step": 11217 + }, + { + "epoch": 0.9053345169881366, + "grad_norm": 0.7176045179367065, + "learning_rate": 8.168073374265605e-05, + "loss": 2.5182, + "step": 11218 + }, + { + "epoch": 0.9054152207247196, + 
"grad_norm": 0.7011643052101135, + "learning_rate": 8.166521423443112e-05, + "loss": 2.5615, + "step": 11219 + }, + { + "epoch": 0.9054959244613026, + "grad_norm": 0.6853327751159668, + "learning_rate": 8.164969518315704e-05, + "loss": 2.5057, + "step": 11220 + }, + { + "epoch": 0.9055766281978855, + "grad_norm": 0.6972528696060181, + "learning_rate": 8.163417658922049e-05, + "loss": 2.4949, + "step": 11221 + }, + { + "epoch": 0.9056573319344685, + "grad_norm": 0.6780978441238403, + "learning_rate": 8.161865845300824e-05, + "loss": 2.5601, + "step": 11222 + }, + { + "epoch": 0.9057380356710516, + "grad_norm": 0.6454098224639893, + "learning_rate": 8.160314077490711e-05, + "loss": 2.4203, + "step": 11223 + }, + { + "epoch": 0.9058187394076346, + "grad_norm": 0.7300907969474792, + "learning_rate": 8.158762355530378e-05, + "loss": 2.4818, + "step": 11224 + }, + { + "epoch": 0.9058994431442176, + "grad_norm": 0.682475745677948, + "learning_rate": 8.1572106794585e-05, + "loss": 2.4852, + "step": 11225 + }, + { + "epoch": 0.9059801468808005, + "grad_norm": 0.6666192412376404, + "learning_rate": 8.155659049313754e-05, + "loss": 2.5642, + "step": 11226 + }, + { + "epoch": 0.9060608506173836, + "grad_norm": 0.6873177886009216, + "learning_rate": 8.154107465134801e-05, + "loss": 2.5163, + "step": 11227 + }, + { + "epoch": 0.9061415543539666, + "grad_norm": 0.6704845428466797, + "learning_rate": 8.152555926960315e-05, + "loss": 2.5481, + "step": 11228 + }, + { + "epoch": 0.9062222580905496, + "grad_norm": 0.6340618133544922, + "learning_rate": 8.151004434828963e-05, + "loss": 2.4701, + "step": 11229 + }, + { + "epoch": 0.9063029618271325, + "grad_norm": 0.7886226177215576, + "learning_rate": 8.14945298877942e-05, + "loss": 2.5322, + "step": 11230 + }, + { + "epoch": 0.9063836655637156, + "grad_norm": 0.7086018919944763, + "learning_rate": 8.14790158885034e-05, + "loss": 2.4909, + "step": 11231 + }, + { + "epoch": 0.9064643693002986, + "grad_norm": 0.6791329979896545, + 
"learning_rate": 8.146350235080396e-05, + "loss": 2.4438, + "step": 11232 + }, + { + "epoch": 0.9065450730368816, + "grad_norm": 0.7070720791816711, + "learning_rate": 8.14479892750825e-05, + "loss": 2.528, + "step": 11233 + }, + { + "epoch": 0.9066257767734646, + "grad_norm": 0.6551348567008972, + "learning_rate": 8.143247666172564e-05, + "loss": 2.4747, + "step": 11234 + }, + { + "epoch": 0.9067064805100477, + "grad_norm": 0.6691645979881287, + "learning_rate": 8.141696451111997e-05, + "loss": 2.5038, + "step": 11235 + }, + { + "epoch": 0.9067871842466306, + "grad_norm": 0.6814864277839661, + "learning_rate": 8.14014528236522e-05, + "loss": 2.5737, + "step": 11236 + }, + { + "epoch": 0.9068678879832136, + "grad_norm": 0.7442377209663391, + "learning_rate": 8.138594159970877e-05, + "loss": 2.5839, + "step": 11237 + }, + { + "epoch": 0.9069485917197966, + "grad_norm": 0.6861338019371033, + "learning_rate": 8.137043083967634e-05, + "loss": 2.567, + "step": 11238 + }, + { + "epoch": 0.9070292954563797, + "grad_norm": 0.7056479454040527, + "learning_rate": 8.135492054394151e-05, + "loss": 2.5297, + "step": 11239 + }, + { + "epoch": 0.9071099991929626, + "grad_norm": 0.7166962623596191, + "learning_rate": 8.133941071289076e-05, + "loss": 2.4834, + "step": 11240 + }, + { + "epoch": 0.9071907029295456, + "grad_norm": 0.6285616159439087, + "learning_rate": 8.132390134691068e-05, + "loss": 2.5066, + "step": 11241 + }, + { + "epoch": 0.9072714066661286, + "grad_norm": 0.681915283203125, + "learning_rate": 8.130839244638783e-05, + "loss": 2.5387, + "step": 11242 + }, + { + "epoch": 0.9073521104027117, + "grad_norm": 0.6876898407936096, + "learning_rate": 8.129288401170866e-05, + "loss": 2.4465, + "step": 11243 + }, + { + "epoch": 0.9074328141392947, + "grad_norm": 0.657132625579834, + "learning_rate": 8.127737604325975e-05, + "loss": 2.499, + "step": 11244 + }, + { + "epoch": 0.9075135178758776, + "grad_norm": 0.6678825616836548, + "learning_rate": 8.126186854142752e-05, + 
"loss": 2.4872, + "step": 11245 + }, + { + "epoch": 0.9075942216124606, + "grad_norm": 0.7296879291534424, + "learning_rate": 8.124636150659858e-05, + "loss": 2.4783, + "step": 11246 + }, + { + "epoch": 0.9076749253490437, + "grad_norm": 0.7087056040763855, + "learning_rate": 8.12308549391593e-05, + "loss": 2.507, + "step": 11247 + }, + { + "epoch": 0.9077556290856267, + "grad_norm": 0.7099738121032715, + "learning_rate": 8.121534883949616e-05, + "loss": 2.5317, + "step": 11248 + }, + { + "epoch": 0.9078363328222097, + "grad_norm": 0.6421170830726624, + "learning_rate": 8.119984320799566e-05, + "loss": 2.5291, + "step": 11249 + }, + { + "epoch": 0.9079170365587926, + "grad_norm": 0.6835018396377563, + "learning_rate": 8.11843380450442e-05, + "loss": 2.5523, + "step": 11250 + }, + { + "epoch": 0.9079977402953757, + "grad_norm": 0.6638229489326477, + "learning_rate": 8.11688333510282e-05, + "loss": 2.5128, + "step": 11251 + }, + { + "epoch": 0.9080784440319587, + "grad_norm": 0.6783459186553955, + "learning_rate": 8.115332912633415e-05, + "loss": 2.5485, + "step": 11252 + }, + { + "epoch": 0.9081591477685417, + "grad_norm": 0.65911865234375, + "learning_rate": 8.113782537134838e-05, + "loss": 2.5408, + "step": 11253 + }, + { + "epoch": 0.9082398515051247, + "grad_norm": 0.6844244003295898, + "learning_rate": 8.112232208645729e-05, + "loss": 2.6067, + "step": 11254 + }, + { + "epoch": 0.9083205552417077, + "grad_norm": 0.6896870136260986, + "learning_rate": 8.110681927204729e-05, + "loss": 2.5444, + "step": 11255 + }, + { + "epoch": 0.9084012589782907, + "grad_norm": 0.6693820953369141, + "learning_rate": 8.109131692850473e-05, + "loss": 2.5118, + "step": 11256 + }, + { + "epoch": 0.9084819627148737, + "grad_norm": 0.6401854753494263, + "learning_rate": 8.107581505621599e-05, + "loss": 2.4811, + "step": 11257 + }, + { + "epoch": 0.9085626664514567, + "grad_norm": 0.6861663460731506, + "learning_rate": 8.106031365556743e-05, + "loss": 2.4633, + "step": 11258 + }, + { + 
"epoch": 0.9086433701880398, + "grad_norm": 0.6631655097007751, + "learning_rate": 8.104481272694533e-05, + "loss": 2.5748, + "step": 11259 + }, + { + "epoch": 0.9087240739246227, + "grad_norm": 0.6499454975128174, + "learning_rate": 8.102931227073604e-05, + "loss": 2.5573, + "step": 11260 + }, + { + "epoch": 0.9088047776612057, + "grad_norm": 0.7214524149894714, + "learning_rate": 8.10138122873259e-05, + "loss": 2.4905, + "step": 11261 + }, + { + "epoch": 0.9088854813977887, + "grad_norm": 0.6481152176856995, + "learning_rate": 8.099831277710122e-05, + "loss": 2.5073, + "step": 11262 + }, + { + "epoch": 0.9089661851343718, + "grad_norm": 0.6666486859321594, + "learning_rate": 8.09828137404482e-05, + "loss": 2.5379, + "step": 11263 + }, + { + "epoch": 0.9090468888709548, + "grad_norm": 0.7186474800109863, + "learning_rate": 8.096731517775319e-05, + "loss": 2.5164, + "step": 11264 + }, + { + "epoch": 0.9091275926075377, + "grad_norm": 0.6838653087615967, + "learning_rate": 8.095181708940245e-05, + "loss": 2.49, + "step": 11265 + }, + { + "epoch": 0.9092082963441207, + "grad_norm": 0.7740866541862488, + "learning_rate": 8.093631947578221e-05, + "loss": 2.5487, + "step": 11266 + }, + { + "epoch": 0.9092890000807038, + "grad_norm": 0.7198607325553894, + "learning_rate": 8.092082233727871e-05, + "loss": 2.4477, + "step": 11267 + }, + { + "epoch": 0.9093697038172868, + "grad_norm": 0.6454673409461975, + "learning_rate": 8.090532567427825e-05, + "loss": 2.523, + "step": 11268 + }, + { + "epoch": 0.9094504075538697, + "grad_norm": 0.6169581413269043, + "learning_rate": 8.088982948716692e-05, + "loss": 2.4924, + "step": 11269 + }, + { + "epoch": 0.9095311112904527, + "grad_norm": 0.7034861445426941, + "learning_rate": 8.0874333776331e-05, + "loss": 2.4756, + "step": 11270 + }, + { + "epoch": 0.9096118150270357, + "grad_norm": 0.7231355309486389, + "learning_rate": 8.085883854215671e-05, + "loss": 2.4963, + "step": 11271 + }, + { + "epoch": 0.9096925187636188, + "grad_norm": 
0.6597892045974731, + "learning_rate": 8.084334378503017e-05, + "loss": 2.5617, + "step": 11272 + }, + { + "epoch": 0.9097732225002018, + "grad_norm": 0.7257365584373474, + "learning_rate": 8.082784950533759e-05, + "loss": 2.5293, + "step": 11273 + }, + { + "epoch": 0.9098539262367847, + "grad_norm": 0.7305313944816589, + "learning_rate": 8.081235570346512e-05, + "loss": 2.5355, + "step": 11274 + }, + { + "epoch": 0.9099346299733677, + "grad_norm": 0.6814435720443726, + "learning_rate": 8.07968623797989e-05, + "loss": 2.4842, + "step": 11275 + }, + { + "epoch": 0.9100153337099508, + "grad_norm": 0.7342902421951294, + "learning_rate": 8.078136953472506e-05, + "loss": 2.4817, + "step": 11276 + }, + { + "epoch": 0.9100960374465338, + "grad_norm": 0.6456516981124878, + "learning_rate": 8.076587716862973e-05, + "loss": 2.5119, + "step": 11277 + }, + { + "epoch": 0.9101767411831168, + "grad_norm": 0.7268881797790527, + "learning_rate": 8.075038528189906e-05, + "loss": 2.4614, + "step": 11278 + }, + { + "epoch": 0.9102574449196997, + "grad_norm": 0.6901549696922302, + "learning_rate": 8.073489387491906e-05, + "loss": 2.5411, + "step": 11279 + }, + { + "epoch": 0.9103381486562828, + "grad_norm": 0.6850160956382751, + "learning_rate": 8.071940294807588e-05, + "loss": 2.5078, + "step": 11280 + }, + { + "epoch": 0.9104188523928658, + "grad_norm": 0.6550731658935547, + "learning_rate": 8.070391250175558e-05, + "loss": 2.5502, + "step": 11281 + }, + { + "epoch": 0.9104995561294488, + "grad_norm": 0.7524412274360657, + "learning_rate": 8.068842253634421e-05, + "loss": 2.4699, + "step": 11282 + }, + { + "epoch": 0.9105802598660317, + "grad_norm": 0.6659243702888489, + "learning_rate": 8.067293305222784e-05, + "loss": 2.557, + "step": 11283 + }, + { + "epoch": 0.9106609636026148, + "grad_norm": 0.67015540599823, + "learning_rate": 8.065744404979251e-05, + "loss": 2.5929, + "step": 11284 + }, + { + "epoch": 0.9107416673391978, + "grad_norm": 0.7139000296592712, + "learning_rate": 
8.064195552942422e-05, + "loss": 2.5262, + "step": 11285 + }, + { + "epoch": 0.9108223710757808, + "grad_norm": 0.6918016672134399, + "learning_rate": 8.062646749150899e-05, + "loss": 2.5161, + "step": 11286 + }, + { + "epoch": 0.9109030748123638, + "grad_norm": 0.7395541667938232, + "learning_rate": 8.061097993643289e-05, + "loss": 2.5351, + "step": 11287 + }, + { + "epoch": 0.9109837785489469, + "grad_norm": 0.6794499158859253, + "learning_rate": 8.05954928645818e-05, + "loss": 2.4617, + "step": 11288 + }, + { + "epoch": 0.9110644822855298, + "grad_norm": 0.6906577348709106, + "learning_rate": 8.058000627634176e-05, + "loss": 2.5701, + "step": 11289 + }, + { + "epoch": 0.9111451860221128, + "grad_norm": 0.6954079866409302, + "learning_rate": 8.056452017209874e-05, + "loss": 2.5137, + "step": 11290 + }, + { + "epoch": 0.9112258897586958, + "grad_norm": 0.7381381988525391, + "learning_rate": 8.054903455223866e-05, + "loss": 2.6666, + "step": 11291 + }, + { + "epoch": 0.9113065934952789, + "grad_norm": 0.6731518507003784, + "learning_rate": 8.053354941714749e-05, + "loss": 2.5173, + "step": 11292 + }, + { + "epoch": 0.9113872972318618, + "grad_norm": 0.6976885795593262, + "learning_rate": 8.051806476721116e-05, + "loss": 2.5089, + "step": 11293 + }, + { + "epoch": 0.9114680009684448, + "grad_norm": 0.6401965618133545, + "learning_rate": 8.050258060281562e-05, + "loss": 2.5295, + "step": 11294 + }, + { + "epoch": 0.9115487047050278, + "grad_norm": 0.7409671545028687, + "learning_rate": 8.048709692434667e-05, + "loss": 2.5074, + "step": 11295 + }, + { + "epoch": 0.9116294084416109, + "grad_norm": 0.6028234958648682, + "learning_rate": 8.04716137321903e-05, + "loss": 2.5437, + "step": 11296 + }, + { + "epoch": 0.9117101121781939, + "grad_norm": 0.727643609046936, + "learning_rate": 8.04561310267324e-05, + "loss": 2.5272, + "step": 11297 + }, + { + "epoch": 0.9117908159147768, + "grad_norm": 0.6912926435470581, + "learning_rate": 8.044064880835876e-05, + "loss": 2.5166, 
+ "step": 11298 + }, + { + "epoch": 0.9118715196513598, + "grad_norm": 0.6971367001533508, + "learning_rate": 8.042516707745528e-05, + "loss": 2.5421, + "step": 11299 + }, + { + "epoch": 0.9119522233879429, + "grad_norm": 0.6722451448440552, + "learning_rate": 8.040968583440783e-05, + "loss": 2.5088, + "step": 11300 + }, + { + "epoch": 0.9120329271245259, + "grad_norm": 0.6469144225120544, + "learning_rate": 8.03942050796022e-05, + "loss": 2.4921, + "step": 11301 + }, + { + "epoch": 0.9121136308611089, + "grad_norm": 0.6709008812904358, + "learning_rate": 8.037872481342423e-05, + "loss": 2.4553, + "step": 11302 + }, + { + "epoch": 0.9121943345976918, + "grad_norm": 0.6540920734405518, + "learning_rate": 8.036324503625977e-05, + "loss": 2.489, + "step": 11303 + }, + { + "epoch": 0.9122750383342749, + "grad_norm": 0.6589755415916443, + "learning_rate": 8.034776574849453e-05, + "loss": 2.5195, + "step": 11304 + }, + { + "epoch": 0.9123557420708579, + "grad_norm": 0.676943838596344, + "learning_rate": 8.033228695051434e-05, + "loss": 2.4877, + "step": 11305 + }, + { + "epoch": 0.9124364458074409, + "grad_norm": 0.6509177088737488, + "learning_rate": 8.031680864270498e-05, + "loss": 2.5229, + "step": 11306 + }, + { + "epoch": 0.9125171495440239, + "grad_norm": 0.7480820417404175, + "learning_rate": 8.030133082545219e-05, + "loss": 2.5016, + "step": 11307 + }, + { + "epoch": 0.9125978532806069, + "grad_norm": 0.7130550742149353, + "learning_rate": 8.028585349914174e-05, + "loss": 2.5251, + "step": 11308 + }, + { + "epoch": 0.9126785570171899, + "grad_norm": 0.6959688067436218, + "learning_rate": 8.027037666415934e-05, + "loss": 2.4776, + "step": 11309 + }, + { + "epoch": 0.9127592607537729, + "grad_norm": 0.7540854215621948, + "learning_rate": 8.025490032089076e-05, + "loss": 2.5097, + "step": 11310 + }, + { + "epoch": 0.9128399644903559, + "grad_norm": 0.6921199560165405, + "learning_rate": 8.023942446972165e-05, + "loss": 2.5354, + "step": 11311 + }, + { + "epoch": 
0.912920668226939, + "grad_norm": 0.649824857711792, + "learning_rate": 8.022394911103774e-05, + "loss": 2.5398, + "step": 11312 + }, + { + "epoch": 0.9130013719635219, + "grad_norm": 0.6951068639755249, + "learning_rate": 8.020847424522474e-05, + "loss": 2.5302, + "step": 11313 + }, + { + "epoch": 0.9130820757001049, + "grad_norm": 0.6906851530075073, + "learning_rate": 8.019299987266827e-05, + "loss": 2.581, + "step": 11314 + }, + { + "epoch": 0.9131627794366879, + "grad_norm": 0.6758459210395813, + "learning_rate": 8.0177525993754e-05, + "loss": 2.5208, + "step": 11315 + }, + { + "epoch": 0.913243483173271, + "grad_norm": 0.6915175318717957, + "learning_rate": 8.016205260886766e-05, + "loss": 2.5386, + "step": 11316 + }, + { + "epoch": 0.913324186909854, + "grad_norm": 0.7083550691604614, + "learning_rate": 8.014657971839476e-05, + "loss": 2.4895, + "step": 11317 + }, + { + "epoch": 0.9134048906464369, + "grad_norm": 0.7052562832832336, + "learning_rate": 8.013110732272102e-05, + "loss": 2.4896, + "step": 11318 + }, + { + "epoch": 0.9134855943830199, + "grad_norm": 0.7811834216117859, + "learning_rate": 8.011563542223206e-05, + "loss": 2.5082, + "step": 11319 + }, + { + "epoch": 0.913566298119603, + "grad_norm": 0.6207153797149658, + "learning_rate": 8.01001640173134e-05, + "loss": 2.4967, + "step": 11320 + }, + { + "epoch": 0.913647001856186, + "grad_norm": 0.7637950778007507, + "learning_rate": 8.008469310835065e-05, + "loss": 2.4907, + "step": 11321 + }, + { + "epoch": 0.913727705592769, + "grad_norm": 0.7263950705528259, + "learning_rate": 8.006922269572947e-05, + "loss": 2.5259, + "step": 11322 + }, + { + "epoch": 0.9138084093293519, + "grad_norm": 0.6965721845626831, + "learning_rate": 8.005375277983531e-05, + "loss": 2.5648, + "step": 11323 + }, + { + "epoch": 0.9138891130659349, + "grad_norm": 0.7146127223968506, + "learning_rate": 8.003828336105377e-05, + "loss": 2.53, + "step": 11324 + }, + { + "epoch": 0.913969816802518, + "grad_norm": 
0.7083697319030762, + "learning_rate": 8.00228144397704e-05, + "loss": 2.4923, + "step": 11325 + }, + { + "epoch": 0.914050520539101, + "grad_norm": 0.7259312868118286, + "learning_rate": 8.000734601637074e-05, + "loss": 2.5303, + "step": 11326 + }, + { + "epoch": 0.9141312242756839, + "grad_norm": 0.7072086930274963, + "learning_rate": 7.999187809124025e-05, + "loss": 2.4662, + "step": 11327 + }, + { + "epoch": 0.9142119280122669, + "grad_norm": 0.7216035723686218, + "learning_rate": 7.997641066476445e-05, + "loss": 2.5069, + "step": 11328 + }, + { + "epoch": 0.91429263174885, + "grad_norm": 0.6925712823867798, + "learning_rate": 7.99609437373289e-05, + "loss": 2.5107, + "step": 11329 + }, + { + "epoch": 0.914373335485433, + "grad_norm": 0.6672701835632324, + "learning_rate": 7.994547730931896e-05, + "loss": 2.5248, + "step": 11330 + }, + { + "epoch": 0.914454039222016, + "grad_norm": 0.8058515787124634, + "learning_rate": 7.993001138112016e-05, + "loss": 2.4427, + "step": 11331 + }, + { + "epoch": 0.9145347429585989, + "grad_norm": 0.6942592859268188, + "learning_rate": 7.991454595311795e-05, + "loss": 2.6163, + "step": 11332 + }, + { + "epoch": 0.914615446695182, + "grad_norm": 0.7051894068717957, + "learning_rate": 7.989908102569774e-05, + "loss": 2.5327, + "step": 11333 + }, + { + "epoch": 0.914696150431765, + "grad_norm": 0.6824771761894226, + "learning_rate": 7.988361659924496e-05, + "loss": 2.4843, + "step": 11334 + }, + { + "epoch": 0.914776854168348, + "grad_norm": 0.6756488084793091, + "learning_rate": 7.98681526741451e-05, + "loss": 2.5215, + "step": 11335 + }, + { + "epoch": 0.914857557904931, + "grad_norm": 0.6988239288330078, + "learning_rate": 7.985268925078344e-05, + "loss": 2.5153, + "step": 11336 + }, + { + "epoch": 0.914938261641514, + "grad_norm": 0.6446006298065186, + "learning_rate": 7.983722632954544e-05, + "loss": 2.5081, + "step": 11337 + }, + { + "epoch": 0.915018965378097, + "grad_norm": 0.6828100681304932, + "learning_rate": 
7.982176391081649e-05, + "loss": 2.5607, + "step": 11338 + }, + { + "epoch": 0.91509966911468, + "grad_norm": 0.659721851348877, + "learning_rate": 7.980630199498193e-05, + "loss": 2.531, + "step": 11339 + }, + { + "epoch": 0.915180372851263, + "grad_norm": 0.6298564076423645, + "learning_rate": 7.979084058242709e-05, + "loss": 2.513, + "step": 11340 + }, + { + "epoch": 0.9152610765878461, + "grad_norm": 0.664299726486206, + "learning_rate": 7.977537967353735e-05, + "loss": 2.5533, + "step": 11341 + }, + { + "epoch": 0.915341780324429, + "grad_norm": 0.7035108804702759, + "learning_rate": 7.975991926869801e-05, + "loss": 2.4868, + "step": 11342 + }, + { + "epoch": 0.915422484061012, + "grad_norm": 0.7428407073020935, + "learning_rate": 7.974445936829438e-05, + "loss": 2.5694, + "step": 11343 + }, + { + "epoch": 0.915503187797595, + "grad_norm": 0.6845505237579346, + "learning_rate": 7.972899997271176e-05, + "loss": 2.5092, + "step": 11344 + }, + { + "epoch": 0.9155838915341781, + "grad_norm": 0.7135340571403503, + "learning_rate": 7.971354108233551e-05, + "loss": 2.5157, + "step": 11345 + }, + { + "epoch": 0.915664595270761, + "grad_norm": 0.7032433152198792, + "learning_rate": 7.969808269755077e-05, + "loss": 2.5292, + "step": 11346 + }, + { + "epoch": 0.915745299007344, + "grad_norm": 0.6874690651893616, + "learning_rate": 7.96826248187429e-05, + "loss": 2.5312, + "step": 11347 + }, + { + "epoch": 0.915826002743927, + "grad_norm": 0.6497030258178711, + "learning_rate": 7.966716744629718e-05, + "loss": 2.505, + "step": 11348 + }, + { + "epoch": 0.9159067064805101, + "grad_norm": 0.6618520021438599, + "learning_rate": 7.965171058059874e-05, + "loss": 2.5287, + "step": 11349 + }, + { + "epoch": 0.9159874102170931, + "grad_norm": 0.6737041473388672, + "learning_rate": 7.963625422203288e-05, + "loss": 2.5494, + "step": 11350 + }, + { + "epoch": 0.916068113953676, + "grad_norm": 0.705646276473999, + "learning_rate": 7.96207983709848e-05, + "loss": 2.5402, + "step": 
11351 + }, + { + "epoch": 0.916148817690259, + "grad_norm": 0.6852068901062012, + "learning_rate": 7.96053430278397e-05, + "loss": 2.51, + "step": 11352 + }, + { + "epoch": 0.9162295214268421, + "grad_norm": 0.7166822552680969, + "learning_rate": 7.958988819298274e-05, + "loss": 2.576, + "step": 11353 + }, + { + "epoch": 0.9163102251634251, + "grad_norm": 0.6349207162857056, + "learning_rate": 7.957443386679913e-05, + "loss": 2.5219, + "step": 11354 + }, + { + "epoch": 0.9163909289000081, + "grad_norm": 0.6504647135734558, + "learning_rate": 7.955898004967406e-05, + "loss": 2.4593, + "step": 11355 + }, + { + "epoch": 0.916471632636591, + "grad_norm": 0.7313871383666992, + "learning_rate": 7.95435267419926e-05, + "loss": 2.5616, + "step": 11356 + }, + { + "epoch": 0.9165523363731741, + "grad_norm": 0.6948587894439697, + "learning_rate": 7.95280739441399e-05, + "loss": 2.4608, + "step": 11357 + }, + { + "epoch": 0.9166330401097571, + "grad_norm": 0.6130328178405762, + "learning_rate": 7.95126216565012e-05, + "loss": 2.5563, + "step": 11358 + }, + { + "epoch": 0.9167137438463401, + "grad_norm": 0.7149228453636169, + "learning_rate": 7.949716987946145e-05, + "loss": 2.5664, + "step": 11359 + }, + { + "epoch": 0.916794447582923, + "grad_norm": 0.7452285289764404, + "learning_rate": 7.948171861340584e-05, + "loss": 2.525, + "step": 11360 + }, + { + "epoch": 0.9168751513195061, + "grad_norm": 0.6840611100196838, + "learning_rate": 7.946626785871945e-05, + "loss": 2.537, + "step": 11361 + }, + { + "epoch": 0.9169558550560891, + "grad_norm": 0.7269708514213562, + "learning_rate": 7.945081761578732e-05, + "loss": 2.5227, + "step": 11362 + }, + { + "epoch": 0.9170365587926721, + "grad_norm": 0.6521697044372559, + "learning_rate": 7.943536788499452e-05, + "loss": 2.54, + "step": 11363 + }, + { + "epoch": 0.9171172625292551, + "grad_norm": 0.6516863107681274, + "learning_rate": 7.941991866672618e-05, + "loss": 2.4788, + "step": 11364 + }, + { + "epoch": 0.9171979662658382, + 
"grad_norm": 0.7673580050468445, + "learning_rate": 7.94044699613672e-05, + "loss": 2.4678, + "step": 11365 + }, + { + "epoch": 0.9172786700024211, + "grad_norm": 0.6666994690895081, + "learning_rate": 7.938902176930268e-05, + "loss": 2.5251, + "step": 11366 + }, + { + "epoch": 0.9173593737390041, + "grad_norm": 0.7261863946914673, + "learning_rate": 7.937357409091761e-05, + "loss": 2.4977, + "step": 11367 + }, + { + "epoch": 0.9174400774755871, + "grad_norm": 0.6920679807662964, + "learning_rate": 7.9358126926597e-05, + "loss": 2.5367, + "step": 11368 + }, + { + "epoch": 0.9175207812121702, + "grad_norm": 0.6715712547302246, + "learning_rate": 7.93426802767258e-05, + "loss": 2.4898, + "step": 11369 + }, + { + "epoch": 0.9176014849487532, + "grad_norm": 0.7014333605766296, + "learning_rate": 7.932723414168904e-05, + "loss": 2.4507, + "step": 11370 + }, + { + "epoch": 0.9176821886853361, + "grad_norm": 0.6755761504173279, + "learning_rate": 7.931178852187163e-05, + "loss": 2.5895, + "step": 11371 + }, + { + "epoch": 0.9177628924219191, + "grad_norm": 0.6846731305122375, + "learning_rate": 7.929634341765852e-05, + "loss": 2.5002, + "step": 11372 + }, + { + "epoch": 0.9178435961585021, + "grad_norm": 0.6422831416130066, + "learning_rate": 7.928089882943466e-05, + "loss": 2.5326, + "step": 11373 + }, + { + "epoch": 0.9179242998950852, + "grad_norm": 0.7256442308425903, + "learning_rate": 7.9265454757585e-05, + "loss": 2.5706, + "step": 11374 + }, + { + "epoch": 0.9180050036316681, + "grad_norm": 0.6514387130737305, + "learning_rate": 7.925001120249436e-05, + "loss": 2.5349, + "step": 11375 + }, + { + "epoch": 0.9180857073682511, + "grad_norm": 0.7596457600593567, + "learning_rate": 7.923456816454768e-05, + "loss": 2.4767, + "step": 11376 + }, + { + "epoch": 0.9181664111048341, + "grad_norm": 0.673283040523529, + "learning_rate": 7.921912564412988e-05, + "loss": 2.5156, + "step": 11377 + }, + { + "epoch": 0.9182471148414172, + "grad_norm": 0.6964103579521179, + 
"learning_rate": 7.920368364162575e-05, + "loss": 2.5293, + "step": 11378 + }, + { + "epoch": 0.9183278185780002, + "grad_norm": 0.6765062212944031, + "learning_rate": 7.91882421574202e-05, + "loss": 2.5757, + "step": 11379 + }, + { + "epoch": 0.9184085223145831, + "grad_norm": 0.7039035558700562, + "learning_rate": 7.917280119189811e-05, + "loss": 2.513, + "step": 11380 + }, + { + "epoch": 0.9184892260511661, + "grad_norm": 0.6523976922035217, + "learning_rate": 7.915736074544419e-05, + "loss": 2.4712, + "step": 11381 + }, + { + "epoch": 0.9185699297877492, + "grad_norm": 0.7159552574157715, + "learning_rate": 7.914192081844334e-05, + "loss": 2.4713, + "step": 11382 + }, + { + "epoch": 0.9186506335243322, + "grad_norm": 0.7071694731712341, + "learning_rate": 7.912648141128036e-05, + "loss": 2.5367, + "step": 11383 + }, + { + "epoch": 0.9187313372609152, + "grad_norm": 0.6675183773040771, + "learning_rate": 7.911104252434e-05, + "loss": 2.5372, + "step": 11384 + }, + { + "epoch": 0.9188120409974981, + "grad_norm": 0.7293995022773743, + "learning_rate": 7.909560415800707e-05, + "loss": 2.5469, + "step": 11385 + }, + { + "epoch": 0.9188927447340812, + "grad_norm": 0.6774035096168518, + "learning_rate": 7.908016631266635e-05, + "loss": 2.5655, + "step": 11386 + }, + { + "epoch": 0.9189734484706642, + "grad_norm": 0.7068144083023071, + "learning_rate": 7.906472898870256e-05, + "loss": 2.5265, + "step": 11387 + }, + { + "epoch": 0.9190541522072472, + "grad_norm": 0.6756324172019958, + "learning_rate": 7.904929218650044e-05, + "loss": 2.4966, + "step": 11388 + }, + { + "epoch": 0.9191348559438302, + "grad_norm": 0.6964625120162964, + "learning_rate": 7.903385590644473e-05, + "loss": 2.5646, + "step": 11389 + }, + { + "epoch": 0.9192155596804132, + "grad_norm": 0.6760976314544678, + "learning_rate": 7.901842014892018e-05, + "loss": 2.5159, + "step": 11390 + }, + { + "epoch": 0.9192962634169962, + "grad_norm": 0.6648714542388916, + "learning_rate": 7.900298491431139e-05, + 
"loss": 2.5715, + "step": 11391 + }, + { + "epoch": 0.9193769671535792, + "grad_norm": 0.7492914199829102, + "learning_rate": 7.898755020300312e-05, + "loss": 2.5226, + "step": 11392 + }, + { + "epoch": 0.9194576708901622, + "grad_norm": 0.7041164040565491, + "learning_rate": 7.897211601538004e-05, + "loss": 2.5809, + "step": 11393 + }, + { + "epoch": 0.9195383746267453, + "grad_norm": 0.6746383309364319, + "learning_rate": 7.895668235182677e-05, + "loss": 2.5369, + "step": 11394 + }, + { + "epoch": 0.9196190783633282, + "grad_norm": 0.6486156582832336, + "learning_rate": 7.894124921272798e-05, + "loss": 2.5406, + "step": 11395 + }, + { + "epoch": 0.9196997820999112, + "grad_norm": 0.6828807592391968, + "learning_rate": 7.892581659846834e-05, + "loss": 2.5241, + "step": 11396 + }, + { + "epoch": 0.9197804858364942, + "grad_norm": 0.694970428943634, + "learning_rate": 7.891038450943242e-05, + "loss": 2.4402, + "step": 11397 + }, + { + "epoch": 0.9198611895730773, + "grad_norm": 0.7187039852142334, + "learning_rate": 7.889495294600484e-05, + "loss": 2.5052, + "step": 11398 + }, + { + "epoch": 0.9199418933096603, + "grad_norm": 0.6919832825660706, + "learning_rate": 7.887952190857024e-05, + "loss": 2.5078, + "step": 11399 + }, + { + "epoch": 0.9200225970462432, + "grad_norm": 0.7129504084587097, + "learning_rate": 7.886409139751313e-05, + "loss": 2.5047, + "step": 11400 + }, + { + "epoch": 0.9201033007828262, + "grad_norm": 0.6755272746086121, + "learning_rate": 7.88486614132181e-05, + "loss": 2.4821, + "step": 11401 + }, + { + "epoch": 0.9201840045194093, + "grad_norm": 0.7253937125205994, + "learning_rate": 7.883323195606973e-05, + "loss": 2.5062, + "step": 11402 + }, + { + "epoch": 0.9202647082559923, + "grad_norm": 0.7057155966758728, + "learning_rate": 7.881780302645257e-05, + "loss": 2.5475, + "step": 11403 + }, + { + "epoch": 0.9203454119925752, + "grad_norm": 0.713869571685791, + "learning_rate": 7.880237462475111e-05, + "loss": 2.5335, + "step": 11404 + }, + 
{ + "epoch": 0.9204261157291582, + "grad_norm": 0.769648551940918, + "learning_rate": 7.878694675134987e-05, + "loss": 2.4944, + "step": 11405 + }, + { + "epoch": 0.9205068194657413, + "grad_norm": 0.6444964408874512, + "learning_rate": 7.877151940663343e-05, + "loss": 2.5755, + "step": 11406 + }, + { + "epoch": 0.9205875232023243, + "grad_norm": 0.6811819672584534, + "learning_rate": 7.875609259098618e-05, + "loss": 2.5475, + "step": 11407 + }, + { + "epoch": 0.9206682269389073, + "grad_norm": 0.6959417462348938, + "learning_rate": 7.874066630479259e-05, + "loss": 2.5095, + "step": 11408 + }, + { + "epoch": 0.9207489306754902, + "grad_norm": 0.6721363067626953, + "learning_rate": 7.872524054843724e-05, + "loss": 2.5166, + "step": 11409 + }, + { + "epoch": 0.9208296344120733, + "grad_norm": 0.713122546672821, + "learning_rate": 7.870981532230447e-05, + "loss": 2.5084, + "step": 11410 + }, + { + "epoch": 0.9209103381486563, + "grad_norm": 0.7059469819068909, + "learning_rate": 7.869439062677876e-05, + "loss": 2.437, + "step": 11411 + }, + { + "epoch": 0.9209910418852393, + "grad_norm": 0.6808314323425293, + "learning_rate": 7.867896646224454e-05, + "loss": 2.5658, + "step": 11412 + }, + { + "epoch": 0.9210717456218223, + "grad_norm": 0.7060894966125488, + "learning_rate": 7.86635428290862e-05, + "loss": 2.515, + "step": 11413 + }, + { + "epoch": 0.9211524493584053, + "grad_norm": 0.7538465857505798, + "learning_rate": 7.864811972768813e-05, + "loss": 2.4448, + "step": 11414 + }, + { + "epoch": 0.9212331530949883, + "grad_norm": 0.6824522018432617, + "learning_rate": 7.863269715843478e-05, + "loss": 2.503, + "step": 11415 + }, + { + "epoch": 0.9213138568315713, + "grad_norm": 0.7068174481391907, + "learning_rate": 7.861727512171044e-05, + "loss": 2.5198, + "step": 11416 + }, + { + "epoch": 0.9213945605681543, + "grad_norm": 0.6742961406707764, + "learning_rate": 7.860185361789948e-05, + "loss": 2.5167, + "step": 11417 + }, + { + "epoch": 0.9214752643047374, + 
"grad_norm": 0.7643383741378784, + "learning_rate": 7.858643264738628e-05, + "loss": 2.5508, + "step": 11418 + }, + { + "epoch": 0.9215559680413203, + "grad_norm": 0.6737802028656006, + "learning_rate": 7.857101221055518e-05, + "loss": 2.589, + "step": 11419 + }, + { + "epoch": 0.9216366717779033, + "grad_norm": 0.668214738368988, + "learning_rate": 7.855559230779043e-05, + "loss": 2.4747, + "step": 11420 + }, + { + "epoch": 0.9217173755144863, + "grad_norm": 0.6933084726333618, + "learning_rate": 7.854017293947638e-05, + "loss": 2.5171, + "step": 11421 + }, + { + "epoch": 0.9217980792510694, + "grad_norm": 0.6320228576660156, + "learning_rate": 7.852475410599736e-05, + "loss": 2.5213, + "step": 11422 + }, + { + "epoch": 0.9218787829876524, + "grad_norm": 0.6578245759010315, + "learning_rate": 7.850933580773756e-05, + "loss": 2.5085, + "step": 11423 + }, + { + "epoch": 0.9219594867242353, + "grad_norm": 0.6741796135902405, + "learning_rate": 7.849391804508129e-05, + "loss": 2.5294, + "step": 11424 + }, + { + "epoch": 0.9220401904608183, + "grad_norm": 0.6875781416893005, + "learning_rate": 7.847850081841285e-05, + "loss": 2.5034, + "step": 11425 + }, + { + "epoch": 0.9221208941974013, + "grad_norm": 0.6515244245529175, + "learning_rate": 7.846308412811638e-05, + "loss": 2.4707, + "step": 11426 + }, + { + "epoch": 0.9222015979339844, + "grad_norm": 0.7326812148094177, + "learning_rate": 7.844766797457615e-05, + "loss": 2.5049, + "step": 11427 + }, + { + "epoch": 0.9222823016705674, + "grad_norm": 0.7539918422698975, + "learning_rate": 7.84322523581764e-05, + "loss": 2.4726, + "step": 11428 + }, + { + "epoch": 0.9223630054071503, + "grad_norm": 0.745468020439148, + "learning_rate": 7.841683727930129e-05, + "loss": 2.5003, + "step": 11429 + }, + { + "epoch": 0.9224437091437333, + "grad_norm": 0.726362943649292, + "learning_rate": 7.840142273833499e-05, + "loss": 2.5056, + "step": 11430 + }, + { + "epoch": 0.9225244128803164, + "grad_norm": 0.7275403738021851, + 
"learning_rate": 7.838600873566175e-05, + "loss": 2.5188, + "step": 11431 + }, + { + "epoch": 0.9226051166168994, + "grad_norm": 0.6908789873123169, + "learning_rate": 7.837059527166563e-05, + "loss": 2.5349, + "step": 11432 + }, + { + "epoch": 0.9226858203534823, + "grad_norm": 0.7220396399497986, + "learning_rate": 7.835518234673079e-05, + "loss": 2.4863, + "step": 11433 + }, + { + "epoch": 0.9227665240900653, + "grad_norm": 0.6516178846359253, + "learning_rate": 7.833976996124142e-05, + "loss": 2.556, + "step": 11434 + }, + { + "epoch": 0.9228472278266484, + "grad_norm": 0.6958726644515991, + "learning_rate": 7.832435811558163e-05, + "loss": 2.5286, + "step": 11435 + }, + { + "epoch": 0.9229279315632314, + "grad_norm": 0.7734121680259705, + "learning_rate": 7.830894681013546e-05, + "loss": 2.5087, + "step": 11436 + }, + { + "epoch": 0.9230086352998144, + "grad_norm": 0.709064245223999, + "learning_rate": 7.829353604528703e-05, + "loss": 2.4817, + "step": 11437 + }, + { + "epoch": 0.9230893390363973, + "grad_norm": 0.7224971652030945, + "learning_rate": 7.827812582142045e-05, + "loss": 2.5179, + "step": 11438 + }, + { + "epoch": 0.9231700427729804, + "grad_norm": 0.7139936685562134, + "learning_rate": 7.826271613891973e-05, + "loss": 2.537, + "step": 11439 + }, + { + "epoch": 0.9232507465095634, + "grad_norm": 0.671138346195221, + "learning_rate": 7.824730699816896e-05, + "loss": 2.4865, + "step": 11440 + }, + { + "epoch": 0.9233314502461464, + "grad_norm": 0.6547425389289856, + "learning_rate": 7.823189839955218e-05, + "loss": 2.509, + "step": 11441 + }, + { + "epoch": 0.9234121539827294, + "grad_norm": 0.719765305519104, + "learning_rate": 7.821649034345338e-05, + "loss": 2.591, + "step": 11442 + }, + { + "epoch": 0.9234928577193124, + "grad_norm": 0.7128504514694214, + "learning_rate": 7.820108283025656e-05, + "loss": 2.541, + "step": 11443 + }, + { + "epoch": 0.9235735614558954, + "grad_norm": 0.7711538672447205, + "learning_rate": 7.818567586034577e-05, + 
"loss": 2.5388, + "step": 11444 + }, + { + "epoch": 0.9236542651924784, + "grad_norm": 0.7151121497154236, + "learning_rate": 7.817026943410494e-05, + "loss": 2.5539, + "step": 11445 + }, + { + "epoch": 0.9237349689290614, + "grad_norm": 0.7009569406509399, + "learning_rate": 7.815486355191805e-05, + "loss": 2.4793, + "step": 11446 + }, + { + "epoch": 0.9238156726656445, + "grad_norm": 0.7251109480857849, + "learning_rate": 7.813945821416909e-05, + "loss": 2.5406, + "step": 11447 + }, + { + "epoch": 0.9238963764022274, + "grad_norm": 0.6907934546470642, + "learning_rate": 7.812405342124196e-05, + "loss": 2.5069, + "step": 11448 + }, + { + "epoch": 0.9239770801388104, + "grad_norm": 0.699207067489624, + "learning_rate": 7.810864917352061e-05, + "loss": 2.4844, + "step": 11449 + }, + { + "epoch": 0.9240577838753934, + "grad_norm": 0.718386173248291, + "learning_rate": 7.809324547138893e-05, + "loss": 2.5666, + "step": 11450 + }, + { + "epoch": 0.9241384876119765, + "grad_norm": 0.6420444846153259, + "learning_rate": 7.807784231523089e-05, + "loss": 2.506, + "step": 11451 + }, + { + "epoch": 0.9242191913485595, + "grad_norm": 0.6777252554893494, + "learning_rate": 7.806243970543028e-05, + "loss": 2.487, + "step": 11452 + }, + { + "epoch": 0.9242998950851424, + "grad_norm": 0.6907702684402466, + "learning_rate": 7.804703764237102e-05, + "loss": 2.5284, + "step": 11453 + }, + { + "epoch": 0.9243805988217254, + "grad_norm": 0.6383422613143921, + "learning_rate": 7.803163612643698e-05, + "loss": 2.4704, + "step": 11454 + }, + { + "epoch": 0.9244613025583085, + "grad_norm": 0.6879577040672302, + "learning_rate": 7.801623515801198e-05, + "loss": 2.5103, + "step": 11455 + }, + { + "epoch": 0.9245420062948915, + "grad_norm": 0.6856719851493835, + "learning_rate": 7.800083473747986e-05, + "loss": 2.5086, + "step": 11456 + }, + { + "epoch": 0.9246227100314744, + "grad_norm": 0.7463707327842712, + "learning_rate": 7.79854348652245e-05, + "loss": 2.5456, + "step": 11457 + }, + { 
+ "epoch": 0.9247034137680574, + "grad_norm": 0.7352643013000488, + "learning_rate": 7.79700355416296e-05, + "loss": 2.5335, + "step": 11458 + }, + { + "epoch": 0.9247841175046405, + "grad_norm": 0.7525908350944519, + "learning_rate": 7.795463676707897e-05, + "loss": 2.5855, + "step": 11459 + }, + { + "epoch": 0.9248648212412235, + "grad_norm": 0.7323870658874512, + "learning_rate": 7.79392385419565e-05, + "loss": 2.5471, + "step": 11460 + }, + { + "epoch": 0.9249455249778065, + "grad_norm": 0.7443860769271851, + "learning_rate": 7.792384086664582e-05, + "loss": 2.5449, + "step": 11461 + }, + { + "epoch": 0.9250262287143894, + "grad_norm": 0.6928641200065613, + "learning_rate": 7.790844374153073e-05, + "loss": 2.505, + "step": 11462 + }, + { + "epoch": 0.9251069324509725, + "grad_norm": 0.6491222381591797, + "learning_rate": 7.789304716699498e-05, + "loss": 2.5447, + "step": 11463 + }, + { + "epoch": 0.9251876361875555, + "grad_norm": 0.7351166009902954, + "learning_rate": 7.78776511434223e-05, + "loss": 2.524, + "step": 11464 + }, + { + "epoch": 0.9252683399241385, + "grad_norm": 0.6680036783218384, + "learning_rate": 7.786225567119637e-05, + "loss": 2.5019, + "step": 11465 + }, + { + "epoch": 0.9253490436607215, + "grad_norm": 0.7070801258087158, + "learning_rate": 7.784686075070089e-05, + "loss": 2.5052, + "step": 11466 + }, + { + "epoch": 0.9254297473973045, + "grad_norm": 0.7095211148262024, + "learning_rate": 7.783146638231957e-05, + "loss": 2.4998, + "step": 11467 + }, + { + "epoch": 0.9255104511338875, + "grad_norm": 0.6725812554359436, + "learning_rate": 7.781607256643604e-05, + "loss": 2.4909, + "step": 11468 + }, + { + "epoch": 0.9255911548704705, + "grad_norm": 0.684177577495575, + "learning_rate": 7.780067930343396e-05, + "loss": 2.5636, + "step": 11469 + }, + { + "epoch": 0.9256718586070535, + "grad_norm": 0.703419029712677, + "learning_rate": 7.778528659369702e-05, + "loss": 2.4295, + "step": 11470 + }, + { + "epoch": 0.9257525623436366, + 
"grad_norm": 0.6850195527076721, + "learning_rate": 7.776989443760877e-05, + "loss": 2.5143, + "step": 11471 + }, + { + "epoch": 0.9258332660802195, + "grad_norm": 0.7322348952293396, + "learning_rate": 7.775450283555286e-05, + "loss": 2.5616, + "step": 11472 + }, + { + "epoch": 0.9259139698168025, + "grad_norm": 0.6924510598182678, + "learning_rate": 7.77391117879129e-05, + "loss": 2.4796, + "step": 11473 + }, + { + "epoch": 0.9259946735533855, + "grad_norm": 0.7006441354751587, + "learning_rate": 7.772372129507249e-05, + "loss": 2.5142, + "step": 11474 + }, + { + "epoch": 0.9260753772899685, + "grad_norm": 0.6379218697547913, + "learning_rate": 7.770833135741513e-05, + "loss": 2.5366, + "step": 11475 + }, + { + "epoch": 0.9261560810265516, + "grad_norm": 0.676163375377655, + "learning_rate": 7.769294197532448e-05, + "loss": 2.4936, + "step": 11476 + }, + { + "epoch": 0.9262367847631345, + "grad_norm": 0.6964210271835327, + "learning_rate": 7.767755314918399e-05, + "loss": 2.429, + "step": 11477 + }, + { + "epoch": 0.9263174884997175, + "grad_norm": 0.7017048597335815, + "learning_rate": 7.766216487937722e-05, + "loss": 2.5488, + "step": 11478 + }, + { + "epoch": 0.9263981922363005, + "grad_norm": 0.6742509603500366, + "learning_rate": 7.76467771662877e-05, + "loss": 2.5121, + "step": 11479 + }, + { + "epoch": 0.9264788959728836, + "grad_norm": 0.6751403212547302, + "learning_rate": 7.763139001029893e-05, + "loss": 2.5897, + "step": 11480 + }, + { + "epoch": 0.9265595997094666, + "grad_norm": 0.6639657616615295, + "learning_rate": 7.761600341179439e-05, + "loss": 2.5015, + "step": 11481 + }, + { + "epoch": 0.9266403034460495, + "grad_norm": 0.6332827210426331, + "learning_rate": 7.760061737115756e-05, + "loss": 2.5518, + "step": 11482 + }, + { + "epoch": 0.9267210071826325, + "grad_norm": 0.6751062870025635, + "learning_rate": 7.758523188877192e-05, + "loss": 2.4252, + "step": 11483 + }, + { + "epoch": 0.9268017109192156, + "grad_norm": 0.6763231754302979, + 
"learning_rate": 7.756984696502084e-05, + "loss": 2.5683, + "step": 11484 + }, + { + "epoch": 0.9268824146557986, + "grad_norm": 0.6480380296707153, + "learning_rate": 7.755446260028784e-05, + "loss": 2.558, + "step": 11485 + }, + { + "epoch": 0.9269631183923815, + "grad_norm": 0.6925072073936462, + "learning_rate": 7.753907879495634e-05, + "loss": 2.5374, + "step": 11486 + }, + { + "epoch": 0.9270438221289645, + "grad_norm": 0.6771834492683411, + "learning_rate": 7.752369554940966e-05, + "loss": 2.5652, + "step": 11487 + }, + { + "epoch": 0.9271245258655476, + "grad_norm": 0.6747026443481445, + "learning_rate": 7.750831286403124e-05, + "loss": 2.5076, + "step": 11488 + }, + { + "epoch": 0.9272052296021306, + "grad_norm": 0.6727211475372314, + "learning_rate": 7.749293073920448e-05, + "loss": 2.4774, + "step": 11489 + }, + { + "epoch": 0.9272859333387136, + "grad_norm": 0.6334055066108704, + "learning_rate": 7.747754917531272e-05, + "loss": 2.5245, + "step": 11490 + }, + { + "epoch": 0.9273666370752965, + "grad_norm": 0.740700900554657, + "learning_rate": 7.746216817273928e-05, + "loss": 2.5485, + "step": 11491 + }, + { + "epoch": 0.9274473408118796, + "grad_norm": 0.6500691771507263, + "learning_rate": 7.744678773186757e-05, + "loss": 2.5277, + "step": 11492 + }, + { + "epoch": 0.9275280445484626, + "grad_norm": 0.6592985987663269, + "learning_rate": 7.743140785308084e-05, + "loss": 2.5304, + "step": 11493 + }, + { + "epoch": 0.9276087482850456, + "grad_norm": 0.6980452537536621, + "learning_rate": 7.741602853676241e-05, + "loss": 2.544, + "step": 11494 + }, + { + "epoch": 0.9276894520216286, + "grad_norm": 0.643190860748291, + "learning_rate": 7.740064978329555e-05, + "loss": 2.5167, + "step": 11495 + }, + { + "epoch": 0.9277701557582116, + "grad_norm": 0.6789804100990295, + "learning_rate": 7.738527159306366e-05, + "loss": 2.5117, + "step": 11496 + }, + { + "epoch": 0.9278508594947946, + "grad_norm": 0.7109663486480713, + "learning_rate": 7.736989396644987e-05, 
+ "loss": 2.5294, + "step": 11497 + }, + { + "epoch": 0.9279315632313776, + "grad_norm": 0.6752706170082092, + "learning_rate": 7.735451690383746e-05, + "loss": 2.4851, + "step": 11498 + }, + { + "epoch": 0.9280122669679606, + "grad_norm": 0.6947829723358154, + "learning_rate": 7.733914040560972e-05, + "loss": 2.5792, + "step": 11499 + }, + { + "epoch": 0.9280929707045437, + "grad_norm": 0.6701157689094543, + "learning_rate": 7.732376447214981e-05, + "loss": 2.4884, + "step": 11500 + }, + { + "epoch": 0.9281736744411266, + "grad_norm": 0.64533531665802, + "learning_rate": 7.730838910384097e-05, + "loss": 2.4644, + "step": 11501 + }, + { + "epoch": 0.9282543781777096, + "grad_norm": 0.6664395332336426, + "learning_rate": 7.729301430106644e-05, + "loss": 2.5286, + "step": 11502 + }, + { + "epoch": 0.9283350819142926, + "grad_norm": 0.6982395648956299, + "learning_rate": 7.72776400642093e-05, + "loss": 2.5092, + "step": 11503 + }, + { + "epoch": 0.9284157856508757, + "grad_norm": 0.6656171679496765, + "learning_rate": 7.726226639365278e-05, + "loss": 2.4945, + "step": 11504 + }, + { + "epoch": 0.9284964893874587, + "grad_norm": 0.6213308572769165, + "learning_rate": 7.724689328978001e-05, + "loss": 2.5042, + "step": 11505 + }, + { + "epoch": 0.9285771931240416, + "grad_norm": 0.6855599880218506, + "learning_rate": 7.723152075297414e-05, + "loss": 2.5207, + "step": 11506 + }, + { + "epoch": 0.9286578968606246, + "grad_norm": 0.7724171280860901, + "learning_rate": 7.721614878361828e-05, + "loss": 2.4842, + "step": 11507 + }, + { + "epoch": 0.9287386005972077, + "grad_norm": 0.708634614944458, + "learning_rate": 7.720077738209559e-05, + "loss": 2.58, + "step": 11508 + }, + { + "epoch": 0.9288193043337907, + "grad_norm": 0.6766082644462585, + "learning_rate": 7.718540654878907e-05, + "loss": 2.492, + "step": 11509 + }, + { + "epoch": 0.9289000080703737, + "grad_norm": 0.6856982707977295, + "learning_rate": 7.717003628408187e-05, + "loss": 2.5186, + "step": 11510 + }, + { 
+ "epoch": 0.9289807118069566, + "grad_norm": 0.680647611618042, + "learning_rate": 7.715466658835705e-05, + "loss": 2.5305, + "step": 11511 + }, + { + "epoch": 0.9290614155435397, + "grad_norm": 0.7174721360206604, + "learning_rate": 7.713929746199771e-05, + "loss": 2.4498, + "step": 11512 + }, + { + "epoch": 0.9291421192801227, + "grad_norm": 0.6507031321525574, + "learning_rate": 7.712392890538676e-05, + "loss": 2.5334, + "step": 11513 + }, + { + "epoch": 0.9292228230167057, + "grad_norm": 0.7545748353004456, + "learning_rate": 7.710856091890732e-05, + "loss": 2.505, + "step": 11514 + }, + { + "epoch": 0.9293035267532886, + "grad_norm": 0.6978560090065002, + "learning_rate": 7.709319350294242e-05, + "loss": 2.5243, + "step": 11515 + }, + { + "epoch": 0.9293842304898717, + "grad_norm": 0.6620199084281921, + "learning_rate": 7.707782665787497e-05, + "loss": 2.5114, + "step": 11516 + }, + { + "epoch": 0.9294649342264547, + "grad_norm": 0.7160476446151733, + "learning_rate": 7.7062460384088e-05, + "loss": 2.5322, + "step": 11517 + }, + { + "epoch": 0.9295456379630377, + "grad_norm": 0.6637005805969238, + "learning_rate": 7.704709468196454e-05, + "loss": 2.456, + "step": 11518 + }, + { + "epoch": 0.9296263416996207, + "grad_norm": 0.6668851375579834, + "learning_rate": 7.703172955188742e-05, + "loss": 2.5251, + "step": 11519 + }, + { + "epoch": 0.9297070454362037, + "grad_norm": 0.6840329170227051, + "learning_rate": 7.701636499423965e-05, + "loss": 2.5068, + "step": 11520 + }, + { + "epoch": 0.9297877491727867, + "grad_norm": 0.695122241973877, + "learning_rate": 7.700100100940415e-05, + "loss": 2.4822, + "step": 11521 + }, + { + "epoch": 0.9298684529093697, + "grad_norm": 0.6784923672676086, + "learning_rate": 7.698563759776382e-05, + "loss": 2.4978, + "step": 11522 + }, + { + "epoch": 0.9299491566459527, + "grad_norm": 0.6949357986450195, + "learning_rate": 7.697027475970154e-05, + "loss": 2.5392, + "step": 11523 + }, + { + "epoch": 0.9300298603825358, + 
"grad_norm": 0.7128093242645264, + "learning_rate": 7.695491249560025e-05, + "loss": 2.455, + "step": 11524 + }, + { + "epoch": 0.9301105641191187, + "grad_norm": 0.6534962058067322, + "learning_rate": 7.693955080584277e-05, + "loss": 2.5272, + "step": 11525 + }, + { + "epoch": 0.9301912678557017, + "grad_norm": 0.6893511414527893, + "learning_rate": 7.692418969081194e-05, + "loss": 2.5366, + "step": 11526 + }, + { + "epoch": 0.9302719715922847, + "grad_norm": 0.6335335373878479, + "learning_rate": 7.690882915089064e-05, + "loss": 2.5781, + "step": 11527 + }, + { + "epoch": 0.9303526753288677, + "grad_norm": 0.7264769077301025, + "learning_rate": 7.689346918646172e-05, + "loss": 2.5322, + "step": 11528 + }, + { + "epoch": 0.9304333790654508, + "grad_norm": 0.7156329154968262, + "learning_rate": 7.68781097979079e-05, + "loss": 2.5558, + "step": 11529 + }, + { + "epoch": 0.9305140828020337, + "grad_norm": 0.6914563775062561, + "learning_rate": 7.686275098561203e-05, + "loss": 2.5058, + "step": 11530 + }, + { + "epoch": 0.9305947865386167, + "grad_norm": 0.6939939260482788, + "learning_rate": 7.684739274995691e-05, + "loss": 2.4764, + "step": 11531 + }, + { + "epoch": 0.9306754902751997, + "grad_norm": 0.7103014588356018, + "learning_rate": 7.683203509132526e-05, + "loss": 2.5062, + "step": 11532 + }, + { + "epoch": 0.9307561940117828, + "grad_norm": 0.6558870077133179, + "learning_rate": 7.681667801009985e-05, + "loss": 2.4869, + "step": 11533 + }, + { + "epoch": 0.9308368977483658, + "grad_norm": 0.7280104160308838, + "learning_rate": 7.680132150666348e-05, + "loss": 2.566, + "step": 11534 + }, + { + "epoch": 0.9309176014849487, + "grad_norm": 0.6814180612564087, + "learning_rate": 7.678596558139875e-05, + "loss": 2.4926, + "step": 11535 + }, + { + "epoch": 0.9309983052215317, + "grad_norm": 0.6916589736938477, + "learning_rate": 7.677061023468846e-05, + "loss": 2.5189, + "step": 11536 + }, + { + "epoch": 0.9310790089581148, + "grad_norm": 0.6527554988861084, + 
"learning_rate": 7.675525546691533e-05, + "loss": 2.4969, + "step": 11537 + }, + { + "epoch": 0.9311597126946978, + "grad_norm": 0.6458954811096191, + "learning_rate": 7.673990127846196e-05, + "loss": 2.5159, + "step": 11538 + }, + { + "epoch": 0.9312404164312807, + "grad_norm": 0.6704902052879333, + "learning_rate": 7.672454766971105e-05, + "loss": 2.49, + "step": 11539 + }, + { + "epoch": 0.9313211201678637, + "grad_norm": 0.6599698066711426, + "learning_rate": 7.670919464104527e-05, + "loss": 2.4872, + "step": 11540 + }, + { + "epoch": 0.9314018239044468, + "grad_norm": 0.7638888955116272, + "learning_rate": 7.669384219284722e-05, + "loss": 2.5228, + "step": 11541 + }, + { + "epoch": 0.9314825276410298, + "grad_norm": 0.6911981105804443, + "learning_rate": 7.667849032549954e-05, + "loss": 2.4675, + "step": 11542 + }, + { + "epoch": 0.9315632313776128, + "grad_norm": 0.6414669156074524, + "learning_rate": 7.666313903938486e-05, + "loss": 2.5137, + "step": 11543 + }, + { + "epoch": 0.9316439351141957, + "grad_norm": 0.7552139759063721, + "learning_rate": 7.66477883348858e-05, + "loss": 2.5778, + "step": 11544 + }, + { + "epoch": 0.9317246388507788, + "grad_norm": 0.6738760471343994, + "learning_rate": 7.663243821238484e-05, + "loss": 2.5326, + "step": 11545 + }, + { + "epoch": 0.9318053425873618, + "grad_norm": 0.7406899333000183, + "learning_rate": 7.661708867226459e-05, + "loss": 2.4608, + "step": 11546 + }, + { + "epoch": 0.9318860463239448, + "grad_norm": 0.7261415719985962, + "learning_rate": 7.660173971490769e-05, + "loss": 2.5684, + "step": 11547 + }, + { + "epoch": 0.9319667500605278, + "grad_norm": 0.636542797088623, + "learning_rate": 7.658639134069654e-05, + "loss": 2.5159, + "step": 11548 + }, + { + "epoch": 0.9320474537971108, + "grad_norm": 0.7730209231376648, + "learning_rate": 7.657104355001373e-05, + "loss": 2.487, + "step": 11549 + }, + { + "epoch": 0.9321281575336938, + "grad_norm": 0.6553641557693481, + "learning_rate": 7.655569634324178e-05, + 
"loss": 2.5105, + "step": 11550 + }, + { + "epoch": 0.9322088612702768, + "grad_norm": 0.7008326649665833, + "learning_rate": 7.654034972076314e-05, + "loss": 2.492, + "step": 11551 + }, + { + "epoch": 0.9322895650068598, + "grad_norm": 0.7074279189109802, + "learning_rate": 7.65250036829603e-05, + "loss": 2.5221, + "step": 11552 + }, + { + "epoch": 0.9323702687434429, + "grad_norm": 0.7235530018806458, + "learning_rate": 7.650965823021578e-05, + "loss": 2.5285, + "step": 11553 + }, + { + "epoch": 0.9324509724800258, + "grad_norm": 0.7601436376571655, + "learning_rate": 7.649431336291194e-05, + "loss": 2.5071, + "step": 11554 + }, + { + "epoch": 0.9325316762166088, + "grad_norm": 0.6446424126625061, + "learning_rate": 7.647896908143127e-05, + "loss": 2.5032, + "step": 11555 + }, + { + "epoch": 0.9326123799531918, + "grad_norm": 0.7032139897346497, + "learning_rate": 7.646362538615614e-05, + "loss": 2.6096, + "step": 11556 + }, + { + "epoch": 0.9326930836897749, + "grad_norm": 0.6727899312973022, + "learning_rate": 7.644828227746904e-05, + "loss": 2.5041, + "step": 11557 + }, + { + "epoch": 0.9327737874263579, + "grad_norm": 0.6817529201507568, + "learning_rate": 7.643293975575229e-05, + "loss": 2.4474, + "step": 11558 + }, + { + "epoch": 0.9328544911629408, + "grad_norm": 0.6374444365501404, + "learning_rate": 7.641759782138827e-05, + "loss": 2.5204, + "step": 11559 + }, + { + "epoch": 0.9329351948995238, + "grad_norm": 0.6889457702636719, + "learning_rate": 7.640225647475939e-05, + "loss": 2.6344, + "step": 11560 + }, + { + "epoch": 0.9330158986361069, + "grad_norm": 0.6657958626747131, + "learning_rate": 7.638691571624794e-05, + "loss": 2.4672, + "step": 11561 + }, + { + "epoch": 0.9330966023726899, + "grad_norm": 0.6425464749336243, + "learning_rate": 7.637157554623627e-05, + "loss": 2.4756, + "step": 11562 + }, + { + "epoch": 0.9331773061092729, + "grad_norm": 0.7193450927734375, + "learning_rate": 7.635623596510675e-05, + "loss": 2.4969, + "step": 11563 + }, + 
{ + "epoch": 0.9332580098458558, + "grad_norm": 0.6595252156257629, + "learning_rate": 7.634089697324159e-05, + "loss": 2.4647, + "step": 11564 + }, + { + "epoch": 0.9333387135824389, + "grad_norm": 0.6505268812179565, + "learning_rate": 7.632555857102312e-05, + "loss": 2.5059, + "step": 11565 + }, + { + "epoch": 0.9334194173190219, + "grad_norm": 0.6877838969230652, + "learning_rate": 7.631022075883365e-05, + "loss": 2.4855, + "step": 11566 + }, + { + "epoch": 0.9335001210556049, + "grad_norm": 0.6376198530197144, + "learning_rate": 7.629488353705538e-05, + "loss": 2.5024, + "step": 11567 + }, + { + "epoch": 0.9335808247921878, + "grad_norm": 0.6807642579078674, + "learning_rate": 7.627954690607058e-05, + "loss": 2.4954, + "step": 11568 + }, + { + "epoch": 0.9336615285287709, + "grad_norm": 0.6785219311714172, + "learning_rate": 7.62642108662615e-05, + "loss": 2.4854, + "step": 11569 + }, + { + "epoch": 0.9337422322653539, + "grad_norm": 0.8159591555595398, + "learning_rate": 7.624887541801032e-05, + "loss": 2.524, + "step": 11570 + }, + { + "epoch": 0.9338229360019369, + "grad_norm": 0.6912592053413391, + "learning_rate": 7.62335405616992e-05, + "loss": 2.5111, + "step": 11571 + }, + { + "epoch": 0.9339036397385199, + "grad_norm": 0.6772454977035522, + "learning_rate": 7.621820629771041e-05, + "loss": 2.5603, + "step": 11572 + }, + { + "epoch": 0.933984343475103, + "grad_norm": 0.6720221638679504, + "learning_rate": 7.620287262642613e-05, + "loss": 2.5016, + "step": 11573 + }, + { + "epoch": 0.9340650472116859, + "grad_norm": 0.651935338973999, + "learning_rate": 7.618753954822841e-05, + "loss": 2.445, + "step": 11574 + }, + { + "epoch": 0.9341457509482689, + "grad_norm": 0.6731166839599609, + "learning_rate": 7.617220706349947e-05, + "loss": 2.4703, + "step": 11575 + }, + { + "epoch": 0.9342264546848519, + "grad_norm": 0.6283879280090332, + "learning_rate": 7.615687517262143e-05, + "loss": 2.5232, + "step": 11576 + }, + { + "epoch": 0.9343071584214349, + 
"grad_norm": 0.7193455696105957, + "learning_rate": 7.614154387597638e-05, + "loss": 2.5268, + "step": 11577 + }, + { + "epoch": 0.934387862158018, + "grad_norm": 0.6992828845977783, + "learning_rate": 7.61262131739464e-05, + "loss": 2.5834, + "step": 11578 + }, + { + "epoch": 0.9344685658946009, + "grad_norm": 0.6501220464706421, + "learning_rate": 7.611088306691365e-05, + "loss": 2.5146, + "step": 11579 + }, + { + "epoch": 0.9345492696311839, + "grad_norm": 0.7246220111846924, + "learning_rate": 7.60955535552601e-05, + "loss": 2.5665, + "step": 11580 + }, + { + "epoch": 0.9346299733677669, + "grad_norm": 0.7190428376197815, + "learning_rate": 7.608022463936783e-05, + "loss": 2.5061, + "step": 11581 + }, + { + "epoch": 0.93471067710435, + "grad_norm": 0.7144324779510498, + "learning_rate": 7.606489631961893e-05, + "loss": 2.4982, + "step": 11582 + }, + { + "epoch": 0.9347913808409329, + "grad_norm": 0.7144657373428345, + "learning_rate": 7.604956859639535e-05, + "loss": 2.5506, + "step": 11583 + }, + { + "epoch": 0.9348720845775159, + "grad_norm": 0.6596626043319702, + "learning_rate": 7.603424147007913e-05, + "loss": 2.4911, + "step": 11584 + }, + { + "epoch": 0.9349527883140989, + "grad_norm": 0.7090883851051331, + "learning_rate": 7.601891494105227e-05, + "loss": 2.5087, + "step": 11585 + }, + { + "epoch": 0.935033492050682, + "grad_norm": 0.6679760217666626, + "learning_rate": 7.600358900969671e-05, + "loss": 2.497, + "step": 11586 + }, + { + "epoch": 0.935114195787265, + "grad_norm": 0.6795344948768616, + "learning_rate": 7.598826367639447e-05, + "loss": 2.4839, + "step": 11587 + }, + { + "epoch": 0.9351948995238479, + "grad_norm": 0.6378790736198425, + "learning_rate": 7.597293894152744e-05, + "loss": 2.4656, + "step": 11588 + }, + { + "epoch": 0.9352756032604309, + "grad_norm": 0.6646658182144165, + "learning_rate": 7.595761480547762e-05, + "loss": 2.4739, + "step": 11589 + }, + { + "epoch": 0.935356306997014, + "grad_norm": 0.6662073731422424, + 
"learning_rate": 7.594229126862687e-05, + "loss": 2.4872, + "step": 11590 + }, + { + "epoch": 0.935437010733597, + "grad_norm": 0.6698113679885864, + "learning_rate": 7.592696833135708e-05, + "loss": 2.4964, + "step": 11591 + }, + { + "epoch": 0.93551771447018, + "grad_norm": 0.6520004272460938, + "learning_rate": 7.59116459940502e-05, + "loss": 2.5616, + "step": 11592 + }, + { + "epoch": 0.9355984182067629, + "grad_norm": 0.6675869226455688, + "learning_rate": 7.589632425708806e-05, + "loss": 2.4854, + "step": 11593 + }, + { + "epoch": 0.935679121943346, + "grad_norm": 0.6914103031158447, + "learning_rate": 7.588100312085251e-05, + "loss": 2.5252, + "step": 11594 + }, + { + "epoch": 0.935759825679929, + "grad_norm": 0.7283286452293396, + "learning_rate": 7.586568258572546e-05, + "loss": 2.543, + "step": 11595 + }, + { + "epoch": 0.935840529416512, + "grad_norm": 0.6881958246231079, + "learning_rate": 7.585036265208864e-05, + "loss": 2.4499, + "step": 11596 + }, + { + "epoch": 0.935921233153095, + "grad_norm": 0.7733677625656128, + "learning_rate": 7.58350433203239e-05, + "loss": 2.5595, + "step": 11597 + }, + { + "epoch": 0.936001936889678, + "grad_norm": 0.672711968421936, + "learning_rate": 7.58197245908131e-05, + "loss": 2.4757, + "step": 11598 + }, + { + "epoch": 0.936082640626261, + "grad_norm": 0.691780686378479, + "learning_rate": 7.580440646393794e-05, + "loss": 2.5134, + "step": 11599 + }, + { + "epoch": 0.936163344362844, + "grad_norm": 0.6935102343559265, + "learning_rate": 7.578908894008021e-05, + "loss": 2.5128, + "step": 11600 + }, + { + "epoch": 0.936244048099427, + "grad_norm": 0.7005696892738342, + "learning_rate": 7.57737720196217e-05, + "loss": 2.5338, + "step": 11601 + }, + { + "epoch": 0.93632475183601, + "grad_norm": 0.6729815602302551, + "learning_rate": 7.575845570294409e-05, + "loss": 2.5373, + "step": 11602 + }, + { + "epoch": 0.936405455572593, + "grad_norm": 0.6694760918617249, + "learning_rate": 7.574313999042913e-05, + "loss": 2.5165, 
+ "step": 11603 + }, + { + "epoch": 0.936486159309176, + "grad_norm": 0.6425337791442871, + "learning_rate": 7.572782488245854e-05, + "loss": 2.5102, + "step": 11604 + }, + { + "epoch": 0.936566863045759, + "grad_norm": 0.6613046526908875, + "learning_rate": 7.571251037941405e-05, + "loss": 2.5108, + "step": 11605 + }, + { + "epoch": 0.9366475667823421, + "grad_norm": 0.7396309971809387, + "learning_rate": 7.569719648167723e-05, + "loss": 2.5261, + "step": 11606 + }, + { + "epoch": 0.936728270518925, + "grad_norm": 0.6783239245414734, + "learning_rate": 7.568188318962981e-05, + "loss": 2.5725, + "step": 11607 + }, + { + "epoch": 0.936808974255508, + "grad_norm": 0.7591684460639954, + "learning_rate": 7.566657050365345e-05, + "loss": 2.5085, + "step": 11608 + }, + { + "epoch": 0.936889677992091, + "grad_norm": 0.6805615425109863, + "learning_rate": 7.565125842412974e-05, + "loss": 2.5598, + "step": 11609 + }, + { + "epoch": 0.9369703817286741, + "grad_norm": 0.680203378200531, + "learning_rate": 7.563594695144032e-05, + "loss": 2.5072, + "step": 11610 + }, + { + "epoch": 0.9370510854652571, + "grad_norm": 0.7035777568817139, + "learning_rate": 7.56206360859668e-05, + "loss": 2.4882, + "step": 11611 + }, + { + "epoch": 0.93713178920184, + "grad_norm": 0.7457048892974854, + "learning_rate": 7.560532582809075e-05, + "loss": 2.4975, + "step": 11612 + }, + { + "epoch": 0.937212492938423, + "grad_norm": 0.702055037021637, + "learning_rate": 7.559001617819374e-05, + "loss": 2.5522, + "step": 11613 + }, + { + "epoch": 0.9372931966750061, + "grad_norm": 0.7618527412414551, + "learning_rate": 7.557470713665738e-05, + "loss": 2.5503, + "step": 11614 + }, + { + "epoch": 0.9373739004115891, + "grad_norm": 0.8611559867858887, + "learning_rate": 7.555939870386312e-05, + "loss": 2.4866, + "step": 11615 + }, + { + "epoch": 0.937454604148172, + "grad_norm": 0.7285227179527283, + "learning_rate": 7.554409088019254e-05, + "loss": 2.4855, + "step": 11616 + }, + { + "epoch": 
0.937535307884755, + "grad_norm": 0.7512121796607971, + "learning_rate": 7.552878366602716e-05, + "loss": 2.5496, + "step": 11617 + }, + { + "epoch": 0.9376160116213381, + "grad_norm": 0.7353625297546387, + "learning_rate": 7.551347706174844e-05, + "loss": 2.5754, + "step": 11618 + }, + { + "epoch": 0.9376967153579211, + "grad_norm": 0.7131205797195435, + "learning_rate": 7.549817106773788e-05, + "loss": 2.4927, + "step": 11619 + }, + { + "epoch": 0.9377774190945041, + "grad_norm": 0.6562477946281433, + "learning_rate": 7.548286568437695e-05, + "loss": 2.5247, + "step": 11620 + }, + { + "epoch": 0.937858122831087, + "grad_norm": 0.7094948887825012, + "learning_rate": 7.546756091204713e-05, + "loss": 2.5084, + "step": 11621 + }, + { + "epoch": 0.9379388265676701, + "grad_norm": 0.6890475153923035, + "learning_rate": 7.545225675112977e-05, + "loss": 2.5178, + "step": 11622 + }, + { + "epoch": 0.9380195303042531, + "grad_norm": 0.6801474094390869, + "learning_rate": 7.543695320200634e-05, + "loss": 2.5457, + "step": 11623 + }, + { + "epoch": 0.9381002340408361, + "grad_norm": 0.7093712687492371, + "learning_rate": 7.54216502650583e-05, + "loss": 2.6122, + "step": 11624 + }, + { + "epoch": 0.9381809377774191, + "grad_norm": 0.7246927618980408, + "learning_rate": 7.540634794066695e-05, + "loss": 2.5251, + "step": 11625 + }, + { + "epoch": 0.9382616415140022, + "grad_norm": 0.7358111143112183, + "learning_rate": 7.539104622921368e-05, + "loss": 2.5444, + "step": 11626 + }, + { + "epoch": 0.9383423452505851, + "grad_norm": 0.6915993690490723, + "learning_rate": 7.53757451310799e-05, + "loss": 2.448, + "step": 11627 + }, + { + "epoch": 0.9384230489871681, + "grad_norm": 0.6864039301872253, + "learning_rate": 7.536044464664689e-05, + "loss": 2.5267, + "step": 11628 + }, + { + "epoch": 0.9385037527237511, + "grad_norm": 0.664799690246582, + "learning_rate": 7.534514477629602e-05, + "loss": 2.5602, + "step": 11629 + }, + { + "epoch": 0.9385844564603341, + "grad_norm": 
0.6770062446594238, + "learning_rate": 7.532984552040862e-05, + "loss": 2.5034, + "step": 11630 + }, + { + "epoch": 0.9386651601969171, + "grad_norm": 0.6961095929145813, + "learning_rate": 7.531454687936592e-05, + "loss": 2.4523, + "step": 11631 + }, + { + "epoch": 0.9387458639335001, + "grad_norm": 0.6776804327964783, + "learning_rate": 7.529924885354924e-05, + "loss": 2.5526, + "step": 11632 + }, + { + "epoch": 0.9388265676700831, + "grad_norm": 0.785796582698822, + "learning_rate": 7.528395144333988e-05, + "loss": 2.5256, + "step": 11633 + }, + { + "epoch": 0.9389072714066661, + "grad_norm": 0.7016655206680298, + "learning_rate": 7.526865464911902e-05, + "loss": 2.4781, + "step": 11634 + }, + { + "epoch": 0.9389879751432492, + "grad_norm": 0.7027767300605774, + "learning_rate": 7.525335847126795e-05, + "loss": 2.5287, + "step": 11635 + }, + { + "epoch": 0.9390686788798321, + "grad_norm": 0.710624098777771, + "learning_rate": 7.523806291016787e-05, + "loss": 2.5486, + "step": 11636 + }, + { + "epoch": 0.9391493826164151, + "grad_norm": 0.7029656767845154, + "learning_rate": 7.52227679662e-05, + "loss": 2.5244, + "step": 11637 + }, + { + "epoch": 0.9392300863529981, + "grad_norm": 0.7417333722114563, + "learning_rate": 7.520747363974551e-05, + "loss": 2.5561, + "step": 11638 + }, + { + "epoch": 0.9393107900895812, + "grad_norm": 0.6595067381858826, + "learning_rate": 7.519217993118559e-05, + "loss": 2.617, + "step": 11639 + }, + { + "epoch": 0.9393914938261642, + "grad_norm": 0.6808187365531921, + "learning_rate": 7.517688684090141e-05, + "loss": 2.5279, + "step": 11640 + }, + { + "epoch": 0.9394721975627471, + "grad_norm": 0.6618706583976746, + "learning_rate": 7.516159436927408e-05, + "loss": 2.4976, + "step": 11641 + }, + { + "epoch": 0.9395529012993301, + "grad_norm": 0.6979385018348694, + "learning_rate": 7.514630251668475e-05, + "loss": 2.4542, + "step": 11642 + }, + { + "epoch": 0.9396336050359132, + "grad_norm": 0.6380844116210938, + "learning_rate": 
7.513101128351454e-05, + "loss": 2.48, + "step": 11643 + }, + { + "epoch": 0.9397143087724962, + "grad_norm": 0.6390014290809631, + "learning_rate": 7.511572067014452e-05, + "loss": 2.5111, + "step": 11644 + }, + { + "epoch": 0.9397950125090792, + "grad_norm": 0.7592498064041138, + "learning_rate": 7.510043067695578e-05, + "loss": 2.5161, + "step": 11645 + }, + { + "epoch": 0.9398757162456621, + "grad_norm": 0.6269322037696838, + "learning_rate": 7.508514130432945e-05, + "loss": 2.491, + "step": 11646 + }, + { + "epoch": 0.9399564199822452, + "grad_norm": 0.6372053623199463, + "learning_rate": 7.506985255264646e-05, + "loss": 2.4826, + "step": 11647 + }, + { + "epoch": 0.9400371237188282, + "grad_norm": 0.6962460875511169, + "learning_rate": 7.505456442228794e-05, + "loss": 2.5605, + "step": 11648 + }, + { + "epoch": 0.9401178274554112, + "grad_norm": 0.7931656241416931, + "learning_rate": 7.503927691363491e-05, + "loss": 2.4909, + "step": 11649 + }, + { + "epoch": 0.9401985311919941, + "grad_norm": 0.688792884349823, + "learning_rate": 7.502399002706832e-05, + "loss": 2.4888, + "step": 11650 + }, + { + "epoch": 0.9402792349285772, + "grad_norm": 0.6683691143989563, + "learning_rate": 7.500870376296918e-05, + "loss": 2.5233, + "step": 11651 + }, + { + "epoch": 0.9403599386651602, + "grad_norm": 0.6537527441978455, + "learning_rate": 7.499341812171846e-05, + "loss": 2.5061, + "step": 11652 + }, + { + "epoch": 0.9404406424017432, + "grad_norm": 0.6657658219337463, + "learning_rate": 7.497813310369717e-05, + "loss": 2.4844, + "step": 11653 + }, + { + "epoch": 0.9405213461383262, + "grad_norm": 0.6865110993385315, + "learning_rate": 7.496284870928618e-05, + "loss": 2.4986, + "step": 11654 + }, + { + "epoch": 0.9406020498749093, + "grad_norm": 0.6724923849105835, + "learning_rate": 7.494756493886644e-05, + "loss": 2.4818, + "step": 11655 + }, + { + "epoch": 0.9406827536114922, + "grad_norm": 0.6478626728057861, + "learning_rate": 7.493228179281892e-05, + "loss": 2.5321, 
+ "step": 11656 + }, + { + "epoch": 0.9407634573480752, + "grad_norm": 0.6474425792694092, + "learning_rate": 7.491699927152443e-05, + "loss": 2.5276, + "step": 11657 + }, + { + "epoch": 0.9408441610846582, + "grad_norm": 0.6736220717430115, + "learning_rate": 7.490171737536387e-05, + "loss": 2.4734, + "step": 11658 + }, + { + "epoch": 0.9409248648212413, + "grad_norm": 0.6714746952056885, + "learning_rate": 7.488643610471815e-05, + "loss": 2.5754, + "step": 11659 + }, + { + "epoch": 0.9410055685578242, + "grad_norm": 0.6714532375335693, + "learning_rate": 7.487115545996805e-05, + "loss": 2.4855, + "step": 11660 + }, + { + "epoch": 0.9410862722944072, + "grad_norm": 0.7601683139801025, + "learning_rate": 7.485587544149447e-05, + "loss": 2.4887, + "step": 11661 + }, + { + "epoch": 0.9411669760309902, + "grad_norm": 0.7655646204948425, + "learning_rate": 7.484059604967821e-05, + "loss": 2.4904, + "step": 11662 + }, + { + "epoch": 0.9412476797675733, + "grad_norm": 0.6841822862625122, + "learning_rate": 7.482531728490006e-05, + "loss": 2.5272, + "step": 11663 + }, + { + "epoch": 0.9413283835041563, + "grad_norm": 0.7683621048927307, + "learning_rate": 7.481003914754078e-05, + "loss": 2.5218, + "step": 11664 + }, + { + "epoch": 0.9414090872407392, + "grad_norm": 0.6597647070884705, + "learning_rate": 7.479476163798124e-05, + "loss": 2.4925, + "step": 11665 + }, + { + "epoch": 0.9414897909773222, + "grad_norm": 0.6573941111564636, + "learning_rate": 7.477948475660208e-05, + "loss": 2.4854, + "step": 11666 + }, + { + "epoch": 0.9415704947139053, + "grad_norm": 0.6639125943183899, + "learning_rate": 7.476420850378407e-05, + "loss": 2.5207, + "step": 11667 + }, + { + "epoch": 0.9416511984504883, + "grad_norm": 0.6770366430282593, + "learning_rate": 7.474893287990796e-05, + "loss": 2.5167, + "step": 11668 + }, + { + "epoch": 0.9417319021870713, + "grad_norm": 0.6908389925956726, + "learning_rate": 7.473365788535447e-05, + "loss": 2.4606, + "step": 11669 + }, + { + "epoch": 
0.9418126059236542, + "grad_norm": 0.6625069975852966, + "learning_rate": 7.471838352050427e-05, + "loss": 2.5344, + "step": 11670 + }, + { + "epoch": 0.9418933096602373, + "grad_norm": 0.6690869331359863, + "learning_rate": 7.470310978573803e-05, + "loss": 2.4507, + "step": 11671 + }, + { + "epoch": 0.9419740133968203, + "grad_norm": 0.6741886734962463, + "learning_rate": 7.468783668143645e-05, + "loss": 2.5755, + "step": 11672 + }, + { + "epoch": 0.9420547171334033, + "grad_norm": 0.6876424551010132, + "learning_rate": 7.467256420798009e-05, + "loss": 2.483, + "step": 11673 + }, + { + "epoch": 0.9421354208699863, + "grad_norm": 0.7044318318367004, + "learning_rate": 7.465729236574965e-05, + "loss": 2.5025, + "step": 11674 + }, + { + "epoch": 0.9422161246065693, + "grad_norm": 0.6608660817146301, + "learning_rate": 7.46420211551258e-05, + "loss": 2.5253, + "step": 11675 + }, + { + "epoch": 0.9422968283431523, + "grad_norm": 0.6944260001182556, + "learning_rate": 7.4626750576489e-05, + "loss": 2.5002, + "step": 11676 + }, + { + "epoch": 0.9423775320797353, + "grad_norm": 0.7304964065551758, + "learning_rate": 7.46114806302199e-05, + "loss": 2.5501, + "step": 11677 + }, + { + "epoch": 0.9424582358163183, + "grad_norm": 0.688525378704071, + "learning_rate": 7.459621131669911e-05, + "loss": 2.5291, + "step": 11678 + }, + { + "epoch": 0.9425389395529012, + "grad_norm": 0.7388432025909424, + "learning_rate": 7.45809426363071e-05, + "loss": 2.5391, + "step": 11679 + }, + { + "epoch": 0.9426196432894843, + "grad_norm": 0.6777819991111755, + "learning_rate": 7.456567458942447e-05, + "loss": 2.5425, + "step": 11680 + }, + { + "epoch": 0.9427003470260673, + "grad_norm": 0.7208845615386963, + "learning_rate": 7.455040717643169e-05, + "loss": 2.5306, + "step": 11681 + }, + { + "epoch": 0.9427810507626503, + "grad_norm": 0.745384693145752, + "learning_rate": 7.453514039770934e-05, + "loss": 2.4695, + "step": 11682 + }, + { + "epoch": 0.9428617544992333, + "grad_norm": 
0.7088115215301514, + "learning_rate": 7.451987425363782e-05, + "loss": 2.5413, + "step": 11683 + }, + { + "epoch": 0.9429424582358163, + "grad_norm": 0.7287998795509338, + "learning_rate": 7.450460874459762e-05, + "loss": 2.5773, + "step": 11684 + }, + { + "epoch": 0.9430231619723993, + "grad_norm": 0.6897092461585999, + "learning_rate": 7.448934387096928e-05, + "loss": 2.5255, + "step": 11685 + }, + { + "epoch": 0.9431038657089823, + "grad_norm": 0.6227227449417114, + "learning_rate": 7.447407963313313e-05, + "loss": 2.5027, + "step": 11686 + }, + { + "epoch": 0.9431845694455653, + "grad_norm": 0.6954305768013, + "learning_rate": 7.445881603146964e-05, + "loss": 2.5477, + "step": 11687 + }, + { + "epoch": 0.9432652731821484, + "grad_norm": 0.7860052585601807, + "learning_rate": 7.444355306635924e-05, + "loss": 2.469, + "step": 11688 + }, + { + "epoch": 0.9433459769187313, + "grad_norm": 0.6851965188980103, + "learning_rate": 7.442829073818227e-05, + "loss": 2.4997, + "step": 11689 + }, + { + "epoch": 0.9434266806553143, + "grad_norm": 0.7011744379997253, + "learning_rate": 7.441302904731916e-05, + "loss": 2.5399, + "step": 11690 + }, + { + "epoch": 0.9435073843918973, + "grad_norm": 0.703167200088501, + "learning_rate": 7.439776799415028e-05, + "loss": 2.5323, + "step": 11691 + }, + { + "epoch": 0.9435880881284804, + "grad_norm": 0.6747310161590576, + "learning_rate": 7.438250757905591e-05, + "loss": 2.5406, + "step": 11692 + }, + { + "epoch": 0.9436687918650634, + "grad_norm": 0.8631153106689453, + "learning_rate": 7.436724780241642e-05, + "loss": 2.5215, + "step": 11693 + }, + { + "epoch": 0.9437494956016463, + "grad_norm": 0.6919798254966736, + "learning_rate": 7.435198866461214e-05, + "loss": 2.4654, + "step": 11694 + }, + { + "epoch": 0.9438301993382293, + "grad_norm": 0.6747070550918579, + "learning_rate": 7.433673016602332e-05, + "loss": 2.5186, + "step": 11695 + }, + { + "epoch": 0.9439109030748124, + "grad_norm": 0.7368776798248291, + "learning_rate": 
7.432147230703026e-05, + "loss": 2.5365, + "step": 11696 + }, + { + "epoch": 0.9439916068113954, + "grad_norm": 0.7443639636039734, + "learning_rate": 7.430621508801325e-05, + "loss": 2.4966, + "step": 11697 + }, + { + "epoch": 0.9440723105479784, + "grad_norm": 0.7371395230293274, + "learning_rate": 7.429095850935255e-05, + "loss": 2.4638, + "step": 11698 + }, + { + "epoch": 0.9441530142845613, + "grad_norm": 0.6917321681976318, + "learning_rate": 7.427570257142832e-05, + "loss": 2.5341, + "step": 11699 + }, + { + "epoch": 0.9442337180211444, + "grad_norm": 0.7704101800918579, + "learning_rate": 7.426044727462085e-05, + "loss": 2.5144, + "step": 11700 + }, + { + "epoch": 0.9443144217577274, + "grad_norm": 0.692197859287262, + "learning_rate": 7.424519261931036e-05, + "loss": 2.5293, + "step": 11701 + }, + { + "epoch": 0.9443951254943104, + "grad_norm": 0.7305885553359985, + "learning_rate": 7.422993860587695e-05, + "loss": 2.5236, + "step": 11702 + }, + { + "epoch": 0.9444758292308933, + "grad_norm": 0.6955052018165588, + "learning_rate": 7.421468523470081e-05, + "loss": 2.4765, + "step": 11703 + }, + { + "epoch": 0.9445565329674764, + "grad_norm": 0.7394432425498962, + "learning_rate": 7.419943250616216e-05, + "loss": 2.5053, + "step": 11704 + }, + { + "epoch": 0.9446372367040594, + "grad_norm": 0.679044246673584, + "learning_rate": 7.418418042064108e-05, + "loss": 2.5413, + "step": 11705 + }, + { + "epoch": 0.9447179404406424, + "grad_norm": 0.7153440117835999, + "learning_rate": 7.41689289785177e-05, + "loss": 2.4938, + "step": 11706 + }, + { + "epoch": 0.9447986441772254, + "grad_norm": 0.697068452835083, + "learning_rate": 7.415367818017217e-05, + "loss": 2.5157, + "step": 11707 + }, + { + "epoch": 0.9448793479138085, + "grad_norm": 0.664616048336029, + "learning_rate": 7.41384280259845e-05, + "loss": 2.4859, + "step": 11708 + }, + { + "epoch": 0.9449600516503914, + "grad_norm": 0.7275365591049194, + "learning_rate": 7.412317851633479e-05, + "loss": 2.523, + 
"step": 11709 + }, + { + "epoch": 0.9450407553869744, + "grad_norm": 0.7408944368362427, + "learning_rate": 7.410792965160318e-05, + "loss": 2.4994, + "step": 11710 + }, + { + "epoch": 0.9451214591235574, + "grad_norm": 0.7222678065299988, + "learning_rate": 7.40926814321696e-05, + "loss": 2.5084, + "step": 11711 + }, + { + "epoch": 0.9452021628601405, + "grad_norm": 0.7242292761802673, + "learning_rate": 7.407743385841412e-05, + "loss": 2.5165, + "step": 11712 + }, + { + "epoch": 0.9452828665967234, + "grad_norm": 0.6634014844894409, + "learning_rate": 7.406218693071677e-05, + "loss": 2.4947, + "step": 11713 + }, + { + "epoch": 0.9453635703333064, + "grad_norm": 0.8126605153083801, + "learning_rate": 7.404694064945751e-05, + "loss": 2.5553, + "step": 11714 + }, + { + "epoch": 0.9454442740698894, + "grad_norm": 0.679344654083252, + "learning_rate": 7.403169501501632e-05, + "loss": 2.5475, + "step": 11715 + }, + { + "epoch": 0.9455249778064725, + "grad_norm": 0.7584314346313477, + "learning_rate": 7.401645002777318e-05, + "loss": 2.5498, + "step": 11716 + }, + { + "epoch": 0.9456056815430555, + "grad_norm": 0.7191590666770935, + "learning_rate": 7.400120568810806e-05, + "loss": 2.5161, + "step": 11717 + }, + { + "epoch": 0.9456863852796384, + "grad_norm": 0.6738762855529785, + "learning_rate": 7.398596199640084e-05, + "loss": 2.4819, + "step": 11718 + }, + { + "epoch": 0.9457670890162214, + "grad_norm": 0.7305885553359985, + "learning_rate": 7.397071895303143e-05, + "loss": 2.4842, + "step": 11719 + }, + { + "epoch": 0.9458477927528045, + "grad_norm": 0.6885291337966919, + "learning_rate": 7.395547655837976e-05, + "loss": 2.5016, + "step": 11720 + }, + { + "epoch": 0.9459284964893875, + "grad_norm": 0.6807307600975037, + "learning_rate": 7.394023481282568e-05, + "loss": 2.4949, + "step": 11721 + }, + { + "epoch": 0.9460092002259705, + "grad_norm": 0.6683849096298218, + "learning_rate": 7.392499371674907e-05, + "loss": 2.4974, + "step": 11722 + }, + { + "epoch": 
0.9460899039625534, + "grad_norm": 0.6615697741508484, + "learning_rate": 7.39097532705298e-05, + "loss": 2.4744, + "step": 11723 + }, + { + "epoch": 0.9461706076991365, + "grad_norm": 0.6463690996170044, + "learning_rate": 7.389451347454765e-05, + "loss": 2.478, + "step": 11724 + }, + { + "epoch": 0.9462513114357195, + "grad_norm": 0.6848269701004028, + "learning_rate": 7.387927432918247e-05, + "loss": 2.5491, + "step": 11725 + }, + { + "epoch": 0.9463320151723025, + "grad_norm": 0.7251551747322083, + "learning_rate": 7.386403583481409e-05, + "loss": 2.4936, + "step": 11726 + }, + { + "epoch": 0.9464127189088855, + "grad_norm": 0.6562095284461975, + "learning_rate": 7.384879799182223e-05, + "loss": 2.4895, + "step": 11727 + }, + { + "epoch": 0.9464934226454685, + "grad_norm": 0.6891352534294128, + "learning_rate": 7.383356080058668e-05, + "loss": 2.508, + "step": 11728 + }, + { + "epoch": 0.9465741263820515, + "grad_norm": 0.7220255136489868, + "learning_rate": 7.381832426148719e-05, + "loss": 2.5181, + "step": 11729 + }, + { + "epoch": 0.9466548301186345, + "grad_norm": 0.7213689088821411, + "learning_rate": 7.38030883749035e-05, + "loss": 2.5136, + "step": 11730 + }, + { + "epoch": 0.9467355338552175, + "grad_norm": 0.6711129546165466, + "learning_rate": 7.378785314121535e-05, + "loss": 2.5463, + "step": 11731 + }, + { + "epoch": 0.9468162375918004, + "grad_norm": 0.6380139589309692, + "learning_rate": 7.377261856080239e-05, + "loss": 2.5092, + "step": 11732 + }, + { + "epoch": 0.9468969413283835, + "grad_norm": 0.66046142578125, + "learning_rate": 7.375738463404437e-05, + "loss": 2.5561, + "step": 11733 + }, + { + "epoch": 0.9469776450649665, + "grad_norm": 0.6857354044914246, + "learning_rate": 7.37421513613209e-05, + "loss": 2.5774, + "step": 11734 + }, + { + "epoch": 0.9470583488015495, + "grad_norm": 0.6811589598655701, + "learning_rate": 7.372691874301163e-05, + "loss": 2.4918, + "step": 11735 + }, + { + "epoch": 0.9471390525381325, + "grad_norm": 
0.6401017308235168, + "learning_rate": 7.37116867794963e-05, + "loss": 2.4994, + "step": 11736 + }, + { + "epoch": 0.9472197562747156, + "grad_norm": 0.6967078447341919, + "learning_rate": 7.369645547115438e-05, + "loss": 2.5809, + "step": 11737 + }, + { + "epoch": 0.9473004600112985, + "grad_norm": 0.6695219278335571, + "learning_rate": 7.368122481836557e-05, + "loss": 2.4735, + "step": 11738 + }, + { + "epoch": 0.9473811637478815, + "grad_norm": 0.6540528535842896, + "learning_rate": 7.366599482150944e-05, + "loss": 2.4998, + "step": 11739 + }, + { + "epoch": 0.9474618674844645, + "grad_norm": 0.700683057308197, + "learning_rate": 7.365076548096556e-05, + "loss": 2.5258, + "step": 11740 + }, + { + "epoch": 0.9475425712210476, + "grad_norm": 0.7125419974327087, + "learning_rate": 7.363553679711347e-05, + "loss": 2.4653, + "step": 11741 + }, + { + "epoch": 0.9476232749576305, + "grad_norm": 0.7285346984863281, + "learning_rate": 7.362030877033275e-05, + "loss": 2.5523, + "step": 11742 + }, + { + "epoch": 0.9477039786942135, + "grad_norm": 0.7310814261436462, + "learning_rate": 7.360508140100288e-05, + "loss": 2.5027, + "step": 11743 + }, + { + "epoch": 0.9477846824307965, + "grad_norm": 0.746961772441864, + "learning_rate": 7.358985468950335e-05, + "loss": 2.5485, + "step": 11744 + }, + { + "epoch": 0.9478653861673796, + "grad_norm": 0.6880186796188354, + "learning_rate": 7.357462863621369e-05, + "loss": 2.5243, + "step": 11745 + }, + { + "epoch": 0.9479460899039626, + "grad_norm": 0.6406471133232117, + "learning_rate": 7.355940324151339e-05, + "loss": 2.512, + "step": 11746 + }, + { + "epoch": 0.9480267936405455, + "grad_norm": 0.6503005027770996, + "learning_rate": 7.354417850578184e-05, + "loss": 2.5318, + "step": 11747 + }, + { + "epoch": 0.9481074973771285, + "grad_norm": 0.6458879113197327, + "learning_rate": 7.352895442939852e-05, + "loss": 2.5451, + "step": 11748 + }, + { + "epoch": 0.9481882011137116, + "grad_norm": 0.7382936477661133, + "learning_rate": 
7.351373101274288e-05, + "loss": 2.5393, + "step": 11749 + }, + { + "epoch": 0.9482689048502946, + "grad_norm": 0.7366087436676025, + "learning_rate": 7.349850825619429e-05, + "loss": 2.5591, + "step": 11750 + }, + { + "epoch": 0.9483496085868776, + "grad_norm": 0.6652588248252869, + "learning_rate": 7.348328616013213e-05, + "loss": 2.5348, + "step": 11751 + }, + { + "epoch": 0.9484303123234605, + "grad_norm": 0.7515435814857483, + "learning_rate": 7.346806472493584e-05, + "loss": 2.5208, + "step": 11752 + }, + { + "epoch": 0.9485110160600436, + "grad_norm": 0.7161263227462769, + "learning_rate": 7.345284395098469e-05, + "loss": 2.5518, + "step": 11753 + }, + { + "epoch": 0.9485917197966266, + "grad_norm": 0.7433953285217285, + "learning_rate": 7.343762383865807e-05, + "loss": 2.5914, + "step": 11754 + }, + { + "epoch": 0.9486724235332096, + "grad_norm": 0.674991250038147, + "learning_rate": 7.342240438833532e-05, + "loss": 2.5566, + "step": 11755 + }, + { + "epoch": 0.9487531272697926, + "grad_norm": 0.7511670589447021, + "learning_rate": 7.34071856003957e-05, + "loss": 2.5253, + "step": 11756 + }, + { + "epoch": 0.9488338310063756, + "grad_norm": 0.6672492623329163, + "learning_rate": 7.339196747521853e-05, + "loss": 2.4887, + "step": 11757 + }, + { + "epoch": 0.9489145347429586, + "grad_norm": 0.6826158761978149, + "learning_rate": 7.337675001318312e-05, + "loss": 2.5072, + "step": 11758 + }, + { + "epoch": 0.9489952384795416, + "grad_norm": 0.7189450860023499, + "learning_rate": 7.336153321466867e-05, + "loss": 2.5583, + "step": 11759 + }, + { + "epoch": 0.9490759422161246, + "grad_norm": 0.6923015117645264, + "learning_rate": 7.33463170800544e-05, + "loss": 2.5416, + "step": 11760 + }, + { + "epoch": 0.9491566459527077, + "grad_norm": 0.690060555934906, + "learning_rate": 7.333110160971963e-05, + "loss": 2.4931, + "step": 11761 + }, + { + "epoch": 0.9492373496892906, + "grad_norm": 0.6887977719306946, + "learning_rate": 7.331588680404354e-05, + "loss": 2.4676, 
+ "step": 11762 + }, + { + "epoch": 0.9493180534258736, + "grad_norm": 0.8573753237724304, + "learning_rate": 7.330067266340528e-05, + "loss": 2.5074, + "step": 11763 + }, + { + "epoch": 0.9493987571624566, + "grad_norm": 0.6760974526405334, + "learning_rate": 7.328545918818403e-05, + "loss": 2.5395, + "step": 11764 + }, + { + "epoch": 0.9494794608990397, + "grad_norm": 0.6946160197257996, + "learning_rate": 7.327024637875901e-05, + "loss": 2.535, + "step": 11765 + }, + { + "epoch": 0.9495601646356226, + "grad_norm": 0.6851378679275513, + "learning_rate": 7.32550342355093e-05, + "loss": 2.487, + "step": 11766 + }, + { + "epoch": 0.9496408683722056, + "grad_norm": 0.6480168104171753, + "learning_rate": 7.323982275881404e-05, + "loss": 2.513, + "step": 11767 + }, + { + "epoch": 0.9497215721087886, + "grad_norm": 0.6492218971252441, + "learning_rate": 7.322461194905239e-05, + "loss": 2.4532, + "step": 11768 + }, + { + "epoch": 0.9498022758453717, + "grad_norm": 0.6670051217079163, + "learning_rate": 7.320940180660337e-05, + "loss": 2.5258, + "step": 11769 + }, + { + "epoch": 0.9498829795819547, + "grad_norm": 0.6678066253662109, + "learning_rate": 7.319419233184608e-05, + "loss": 2.5388, + "step": 11770 + }, + { + "epoch": 0.9499636833185376, + "grad_norm": 0.693545937538147, + "learning_rate": 7.31789835251596e-05, + "loss": 2.5304, + "step": 11771 + }, + { + "epoch": 0.9500443870551206, + "grad_norm": 0.680486798286438, + "learning_rate": 7.316377538692297e-05, + "loss": 2.5024, + "step": 11772 + }, + { + "epoch": 0.9501250907917037, + "grad_norm": 0.7271847128868103, + "learning_rate": 7.314856791751518e-05, + "loss": 2.5947, + "step": 11773 + }, + { + "epoch": 0.9502057945282867, + "grad_norm": 0.6889839172363281, + "learning_rate": 7.31333611173153e-05, + "loss": 2.5135, + "step": 11774 + }, + { + "epoch": 0.9502864982648697, + "grad_norm": 0.7431777119636536, + "learning_rate": 7.311815498670226e-05, + "loss": 2.5856, + "step": 11775 + }, + { + "epoch": 
0.9503672020014526, + "grad_norm": 0.7168101072311401, + "learning_rate": 7.310294952605508e-05, + "loss": 2.4383, + "step": 11776 + }, + { + "epoch": 0.9504479057380357, + "grad_norm": 0.654803454875946, + "learning_rate": 7.308774473575271e-05, + "loss": 2.4908, + "step": 11777 + }, + { + "epoch": 0.9505286094746187, + "grad_norm": 0.6810718774795532, + "learning_rate": 7.307254061617412e-05, + "loss": 2.5073, + "step": 11778 + }, + { + "epoch": 0.9506093132112017, + "grad_norm": 0.637980043888092, + "learning_rate": 7.305733716769817e-05, + "loss": 2.5686, + "step": 11779 + }, + { + "epoch": 0.9506900169477847, + "grad_norm": 0.6549471020698547, + "learning_rate": 7.30421343907038e-05, + "loss": 2.5502, + "step": 11780 + }, + { + "epoch": 0.9507707206843676, + "grad_norm": 0.7087163329124451, + "learning_rate": 7.302693228556994e-05, + "loss": 2.4773, + "step": 11781 + }, + { + "epoch": 0.9508514244209507, + "grad_norm": 0.6230717897415161, + "learning_rate": 7.301173085267541e-05, + "loss": 2.4806, + "step": 11782 + }, + { + "epoch": 0.9509321281575337, + "grad_norm": 0.7145688533782959, + "learning_rate": 7.299653009239911e-05, + "loss": 2.5259, + "step": 11783 + }, + { + "epoch": 0.9510128318941167, + "grad_norm": 0.679100513458252, + "learning_rate": 7.298133000511988e-05, + "loss": 2.5012, + "step": 11784 + }, + { + "epoch": 0.9510935356306996, + "grad_norm": 0.7057691216468811, + "learning_rate": 7.29661305912165e-05, + "loss": 2.4826, + "step": 11785 + }, + { + "epoch": 0.9511742393672827, + "grad_norm": 0.65343177318573, + "learning_rate": 7.295093185106782e-05, + "loss": 2.4553, + "step": 11786 + }, + { + "epoch": 0.9512549431038657, + "grad_norm": 0.7948461174964905, + "learning_rate": 7.293573378505268e-05, + "loss": 2.478, + "step": 11787 + }, + { + "epoch": 0.9513356468404487, + "grad_norm": 0.6511468887329102, + "learning_rate": 7.292053639354975e-05, + "loss": 2.4862, + "step": 11788 + }, + { + "epoch": 0.9514163505770317, + "grad_norm": 
0.7293919324874878, + "learning_rate": 7.290533967693782e-05, + "loss": 2.5956, + "step": 11789 + }, + { + "epoch": 0.9514970543136148, + "grad_norm": 0.6691277623176575, + "learning_rate": 7.289014363559567e-05, + "loss": 2.5659, + "step": 11790 + }, + { + "epoch": 0.9515777580501977, + "grad_norm": 0.7054625749588013, + "learning_rate": 7.287494826990203e-05, + "loss": 2.5875, + "step": 11791 + }, + { + "epoch": 0.9516584617867807, + "grad_norm": 0.6597220301628113, + "learning_rate": 7.285975358023555e-05, + "loss": 2.5215, + "step": 11792 + }, + { + "epoch": 0.9517391655233637, + "grad_norm": 0.6719489097595215, + "learning_rate": 7.284455956697497e-05, + "loss": 2.4752, + "step": 11793 + }, + { + "epoch": 0.9518198692599468, + "grad_norm": 0.7325637340545654, + "learning_rate": 7.2829366230499e-05, + "loss": 2.5504, + "step": 11794 + }, + { + "epoch": 0.9519005729965297, + "grad_norm": 0.637668788433075, + "learning_rate": 7.281417357118619e-05, + "loss": 2.5105, + "step": 11795 + }, + { + "epoch": 0.9519812767331127, + "grad_norm": 0.7815340161323547, + "learning_rate": 7.279898158941525e-05, + "loss": 2.4998, + "step": 11796 + }, + { + "epoch": 0.9520619804696957, + "grad_norm": 0.6555821299552917, + "learning_rate": 7.278379028556481e-05, + "loss": 2.4326, + "step": 11797 + }, + { + "epoch": 0.9521426842062788, + "grad_norm": 0.7298933863639832, + "learning_rate": 7.276859966001344e-05, + "loss": 2.4779, + "step": 11798 + }, + { + "epoch": 0.9522233879428618, + "grad_norm": 0.683455765247345, + "learning_rate": 7.275340971313974e-05, + "loss": 2.4416, + "step": 11799 + }, + { + "epoch": 0.9523040916794447, + "grad_norm": 0.6353151798248291, + "learning_rate": 7.273822044532232e-05, + "loss": 2.4777, + "step": 11800 + }, + { + "epoch": 0.9523847954160277, + "grad_norm": 0.6898894309997559, + "learning_rate": 7.27230318569397e-05, + "loss": 2.5351, + "step": 11801 + }, + { + "epoch": 0.9524654991526108, + "grad_norm": 0.6528690457344055, + "learning_rate": 
7.270784394837041e-05, + "loss": 2.5145, + "step": 11802 + }, + { + "epoch": 0.9525462028891938, + "grad_norm": 0.6432619094848633, + "learning_rate": 7.269265671999304e-05, + "loss": 2.5002, + "step": 11803 + }, + { + "epoch": 0.9526269066257768, + "grad_norm": 0.7317861318588257, + "learning_rate": 7.267747017218601e-05, + "loss": 2.5318, + "step": 11804 + }, + { + "epoch": 0.9527076103623597, + "grad_norm": 0.7581185698509216, + "learning_rate": 7.266228430532785e-05, + "loss": 2.5313, + "step": 11805 + }, + { + "epoch": 0.9527883140989428, + "grad_norm": 0.7316486239433289, + "learning_rate": 7.264709911979702e-05, + "loss": 2.5147, + "step": 11806 + }, + { + "epoch": 0.9528690178355258, + "grad_norm": 0.7378978729248047, + "learning_rate": 7.263191461597199e-05, + "loss": 2.5149, + "step": 11807 + }, + { + "epoch": 0.9529497215721088, + "grad_norm": 0.6603738069534302, + "learning_rate": 7.26167307942312e-05, + "loss": 2.4684, + "step": 11808 + }, + { + "epoch": 0.9530304253086918, + "grad_norm": 0.7566502690315247, + "learning_rate": 7.260154765495302e-05, + "loss": 2.5535, + "step": 11809 + }, + { + "epoch": 0.9531111290452748, + "grad_norm": 0.693067729473114, + "learning_rate": 7.258636519851596e-05, + "loss": 2.5103, + "step": 11810 + }, + { + "epoch": 0.9531918327818578, + "grad_norm": 0.7049208283424377, + "learning_rate": 7.257118342529826e-05, + "loss": 2.5482, + "step": 11811 + }, + { + "epoch": 0.9532725365184408, + "grad_norm": 0.6986998319625854, + "learning_rate": 7.25560023356784e-05, + "loss": 2.4921, + "step": 11812 + }, + { + "epoch": 0.9533532402550238, + "grad_norm": 0.7079482674598694, + "learning_rate": 7.254082193003476e-05, + "loss": 2.5339, + "step": 11813 + }, + { + "epoch": 0.9534339439916069, + "grad_norm": 0.7283922433853149, + "learning_rate": 7.252564220874553e-05, + "loss": 2.5056, + "step": 11814 + }, + { + "epoch": 0.9535146477281898, + "grad_norm": 0.6965533494949341, + "learning_rate": 7.251046317218914e-05, + "loss": 
2.5512, + "step": 11815 + }, + { + "epoch": 0.9535953514647728, + "grad_norm": 0.7367159128189087, + "learning_rate": 7.24952848207439e-05, + "loss": 2.5015, + "step": 11816 + }, + { + "epoch": 0.9536760552013558, + "grad_norm": 0.6959818601608276, + "learning_rate": 7.248010715478802e-05, + "loss": 2.4969, + "step": 11817 + }, + { + "epoch": 0.9537567589379389, + "grad_norm": 0.69304358959198, + "learning_rate": 7.246493017469981e-05, + "loss": 2.5098, + "step": 11818 + }, + { + "epoch": 0.9538374626745219, + "grad_norm": 0.6830596327781677, + "learning_rate": 7.244975388085757e-05, + "loss": 2.5206, + "step": 11819 + }, + { + "epoch": 0.9539181664111048, + "grad_norm": 0.7354303598403931, + "learning_rate": 7.243457827363944e-05, + "loss": 2.5223, + "step": 11820 + }, + { + "epoch": 0.9539988701476878, + "grad_norm": 0.7046182751655579, + "learning_rate": 7.241940335342366e-05, + "loss": 2.4931, + "step": 11821 + }, + { + "epoch": 0.9540795738842709, + "grad_norm": 0.6990540623664856, + "learning_rate": 7.240422912058843e-05, + "loss": 2.4302, + "step": 11822 + }, + { + "epoch": 0.9541602776208539, + "grad_norm": 0.7562115788459778, + "learning_rate": 7.238905557551202e-05, + "loss": 2.5118, + "step": 11823 + }, + { + "epoch": 0.9542409813574368, + "grad_norm": 0.8212862014770508, + "learning_rate": 7.237388271857248e-05, + "loss": 2.5476, + "step": 11824 + }, + { + "epoch": 0.9543216850940198, + "grad_norm": 0.7095397710800171, + "learning_rate": 7.235871055014798e-05, + "loss": 2.5073, + "step": 11825 + }, + { + "epoch": 0.9544023888306029, + "grad_norm": 0.7174660563468933, + "learning_rate": 7.23435390706167e-05, + "loss": 2.4553, + "step": 11826 + }, + { + "epoch": 0.9544830925671859, + "grad_norm": 0.7121314406394958, + "learning_rate": 7.23283682803567e-05, + "loss": 2.5164, + "step": 11827 + }, + { + "epoch": 0.9545637963037689, + "grad_norm": 0.7354126572608948, + "learning_rate": 7.231319817974609e-05, + "loss": 2.5413, + "step": 11828 + }, + { + 
"epoch": 0.9546445000403518, + "grad_norm": 0.7770543694496155, + "learning_rate": 7.2298028769163e-05, + "loss": 2.5244, + "step": 11829 + }, + { + "epoch": 0.9547252037769349, + "grad_norm": 0.6770393252372742, + "learning_rate": 7.228286004898541e-05, + "loss": 2.4707, + "step": 11830 + }, + { + "epoch": 0.9548059075135179, + "grad_norm": 0.6916880011558533, + "learning_rate": 7.22676920195914e-05, + "loss": 2.506, + "step": 11831 + }, + { + "epoch": 0.9548866112501009, + "grad_norm": 0.6299161314964294, + "learning_rate": 7.225252468135901e-05, + "loss": 2.5042, + "step": 11832 + }, + { + "epoch": 0.9549673149866839, + "grad_norm": 0.7081227898597717, + "learning_rate": 7.223735803466623e-05, + "loss": 2.5537, + "step": 11833 + }, + { + "epoch": 0.9550480187232668, + "grad_norm": 0.6600900888442993, + "learning_rate": 7.222219207989104e-05, + "loss": 2.5329, + "step": 11834 + }, + { + "epoch": 0.9551287224598499, + "grad_norm": 0.6715366244316101, + "learning_rate": 7.22070268174115e-05, + "loss": 2.5273, + "step": 11835 + }, + { + "epoch": 0.9552094261964329, + "grad_norm": 0.6655930280685425, + "learning_rate": 7.219186224760543e-05, + "loss": 2.4254, + "step": 11836 + }, + { + "epoch": 0.9552901299330159, + "grad_norm": 0.6925715208053589, + "learning_rate": 7.217669837085088e-05, + "loss": 2.5104, + "step": 11837 + }, + { + "epoch": 0.9553708336695989, + "grad_norm": 0.7132978439331055, + "learning_rate": 7.216153518752571e-05, + "loss": 2.5238, + "step": 11838 + }, + { + "epoch": 0.9554515374061819, + "grad_norm": 0.661651611328125, + "learning_rate": 7.214637269800791e-05, + "loss": 2.445, + "step": 11839 + }, + { + "epoch": 0.9555322411427649, + "grad_norm": 0.6635430455207825, + "learning_rate": 7.213121090267528e-05, + "loss": 2.4707, + "step": 11840 + }, + { + "epoch": 0.9556129448793479, + "grad_norm": 0.6303616166114807, + "learning_rate": 7.211604980190571e-05, + "loss": 2.4923, + "step": 11841 + }, + { + "epoch": 0.9556936486159309, + "grad_norm": 
0.7027459144592285, + "learning_rate": 7.210088939607708e-05, + "loss": 2.5592, + "step": 11842 + }, + { + "epoch": 0.955774352352514, + "grad_norm": 0.6539996862411499, + "learning_rate": 7.208572968556722e-05, + "loss": 2.5256, + "step": 11843 + }, + { + "epoch": 0.9558550560890969, + "grad_norm": 0.7019872069358826, + "learning_rate": 7.207057067075393e-05, + "loss": 2.488, + "step": 11844 + }, + { + "epoch": 0.9559357598256799, + "grad_norm": 0.6848211288452148, + "learning_rate": 7.205541235201507e-05, + "loss": 2.4883, + "step": 11845 + }, + { + "epoch": 0.9560164635622629, + "grad_norm": 0.7806351184844971, + "learning_rate": 7.204025472972834e-05, + "loss": 2.5563, + "step": 11846 + }, + { + "epoch": 0.956097167298846, + "grad_norm": 0.7327724695205688, + "learning_rate": 7.202509780427156e-05, + "loss": 2.5275, + "step": 11847 + }, + { + "epoch": 0.956177871035429, + "grad_norm": 0.6805681586265564, + "learning_rate": 7.200994157602248e-05, + "loss": 2.4723, + "step": 11848 + }, + { + "epoch": 0.9562585747720119, + "grad_norm": 0.7053409814834595, + "learning_rate": 7.19947860453588e-05, + "loss": 2.4471, + "step": 11849 + }, + { + "epoch": 0.9563392785085949, + "grad_norm": 0.6783127188682556, + "learning_rate": 7.197963121265826e-05, + "loss": 2.4586, + "step": 11850 + }, + { + "epoch": 0.956419982245178, + "grad_norm": 0.6639916300773621, + "learning_rate": 7.196447707829857e-05, + "loss": 2.4966, + "step": 11851 + }, + { + "epoch": 0.956500685981761, + "grad_norm": 0.684066891670227, + "learning_rate": 7.194932364265739e-05, + "loss": 2.5676, + "step": 11852 + }, + { + "epoch": 0.9565813897183439, + "grad_norm": 0.7872990965843201, + "learning_rate": 7.193417090611239e-05, + "loss": 2.5101, + "step": 11853 + }, + { + "epoch": 0.9566620934549269, + "grad_norm": 0.7543401122093201, + "learning_rate": 7.19190188690412e-05, + "loss": 2.5503, + "step": 11854 + }, + { + "epoch": 0.95674279719151, + "grad_norm": 0.6514382362365723, + "learning_rate": 
7.190386753182152e-05, + "loss": 2.4902, + "step": 11855 + }, + { + "epoch": 0.956823500928093, + "grad_norm": 0.6867108345031738, + "learning_rate": 7.188871689483087e-05, + "loss": 2.5054, + "step": 11856 + }, + { + "epoch": 0.956904204664676, + "grad_norm": 0.6536040306091309, + "learning_rate": 7.187356695844687e-05, + "loss": 2.5462, + "step": 11857 + }, + { + "epoch": 0.9569849084012589, + "grad_norm": 0.690237820148468, + "learning_rate": 7.185841772304711e-05, + "loss": 2.5673, + "step": 11858 + }, + { + "epoch": 0.957065612137842, + "grad_norm": 0.6699091196060181, + "learning_rate": 7.184326918900915e-05, + "loss": 2.4733, + "step": 11859 + }, + { + "epoch": 0.957146315874425, + "grad_norm": 0.6482241153717041, + "learning_rate": 7.18281213567105e-05, + "loss": 2.4897, + "step": 11860 + }, + { + "epoch": 0.957227019611008, + "grad_norm": 0.686130166053772, + "learning_rate": 7.181297422652874e-05, + "loss": 2.4596, + "step": 11861 + }, + { + "epoch": 0.957307723347591, + "grad_norm": 0.6507205367088318, + "learning_rate": 7.179782779884132e-05, + "loss": 2.5527, + "step": 11862 + }, + { + "epoch": 0.957388427084174, + "grad_norm": 0.6578813195228577, + "learning_rate": 7.178268207402577e-05, + "loss": 2.4975, + "step": 11863 + }, + { + "epoch": 0.957469130820757, + "grad_norm": 0.6931977272033691, + "learning_rate": 7.176753705245956e-05, + "loss": 2.5533, + "step": 11864 + }, + { + "epoch": 0.95754983455734, + "grad_norm": 0.7306256890296936, + "learning_rate": 7.17523927345201e-05, + "loss": 2.534, + "step": 11865 + }, + { + "epoch": 0.957630538293923, + "grad_norm": 0.6337448358535767, + "learning_rate": 7.173724912058483e-05, + "loss": 2.5015, + "step": 11866 + }, + { + "epoch": 0.9577112420305061, + "grad_norm": 0.6561456322669983, + "learning_rate": 7.172210621103124e-05, + "loss": 2.4946, + "step": 11867 + }, + { + "epoch": 0.957791945767089, + "grad_norm": 0.6341130137443542, + "learning_rate": 7.170696400623666e-05, + "loss": 2.5611, + "step": 
11868 + }, + { + "epoch": 0.957872649503672, + "grad_norm": 0.7202804088592529, + "learning_rate": 7.169182250657849e-05, + "loss": 2.5209, + "step": 11869 + }, + { + "epoch": 0.957953353240255, + "grad_norm": 0.6620556712150574, + "learning_rate": 7.167668171243408e-05, + "loss": 2.4895, + "step": 11870 + }, + { + "epoch": 0.9580340569768381, + "grad_norm": 0.6842508912086487, + "learning_rate": 7.166154162418087e-05, + "loss": 2.4417, + "step": 11871 + }, + { + "epoch": 0.958114760713421, + "grad_norm": 0.7539907693862915, + "learning_rate": 7.164640224219608e-05, + "loss": 2.5153, + "step": 11872 + }, + { + "epoch": 0.958195464450004, + "grad_norm": 0.6524286270141602, + "learning_rate": 7.163126356685703e-05, + "loss": 2.509, + "step": 11873 + }, + { + "epoch": 0.958276168186587, + "grad_norm": 0.7022691965103149, + "learning_rate": 7.16161255985411e-05, + "loss": 2.5223, + "step": 11874 + }, + { + "epoch": 0.9583568719231701, + "grad_norm": 0.6659076809883118, + "learning_rate": 7.160098833762549e-05, + "loss": 2.5231, + "step": 11875 + }, + { + "epoch": 0.9584375756597531, + "grad_norm": 0.6756494641304016, + "learning_rate": 7.15858517844875e-05, + "loss": 2.5017, + "step": 11876 + }, + { + "epoch": 0.958518279396336, + "grad_norm": 0.729850709438324, + "learning_rate": 7.157071593950436e-05, + "loss": 2.4583, + "step": 11877 + }, + { + "epoch": 0.958598983132919, + "grad_norm": 0.7155230641365051, + "learning_rate": 7.155558080305326e-05, + "loss": 2.4753, + "step": 11878 + }, + { + "epoch": 0.9586796868695021, + "grad_norm": 0.6553284525871277, + "learning_rate": 7.154044637551147e-05, + "loss": 2.5093, + "step": 11879 + }, + { + "epoch": 0.9587603906060851, + "grad_norm": 0.6516379117965698, + "learning_rate": 7.152531265725617e-05, + "loss": 2.4996, + "step": 11880 + }, + { + "epoch": 0.9588410943426681, + "grad_norm": 0.6871184706687927, + "learning_rate": 7.151017964866449e-05, + "loss": 2.5322, + "step": 11881 + }, + { + "epoch": 0.958921798079251, + 
"grad_norm": 0.6998933553695679, + "learning_rate": 7.149504735011358e-05, + "loss": 2.5328, + "step": 11882 + }, + { + "epoch": 0.959002501815834, + "grad_norm": 0.7065120935440063, + "learning_rate": 7.147991576198065e-05, + "loss": 2.5251, + "step": 11883 + }, + { + "epoch": 0.9590832055524171, + "grad_norm": 0.6718337535858154, + "learning_rate": 7.146478488464275e-05, + "loss": 2.5596, + "step": 11884 + }, + { + "epoch": 0.9591639092890001, + "grad_norm": 0.6394883990287781, + "learning_rate": 7.144965471847698e-05, + "loss": 2.5022, + "step": 11885 + }, + { + "epoch": 0.9592446130255831, + "grad_norm": 0.6867207288742065, + "learning_rate": 7.143452526386045e-05, + "loss": 2.4927, + "step": 11886 + }, + { + "epoch": 0.959325316762166, + "grad_norm": 0.6710157990455627, + "learning_rate": 7.141939652117026e-05, + "loss": 2.5127, + "step": 11887 + }, + { + "epoch": 0.9594060204987491, + "grad_norm": 0.6286540627479553, + "learning_rate": 7.14042684907834e-05, + "loss": 2.4966, + "step": 11888 + }, + { + "epoch": 0.9594867242353321, + "grad_norm": 0.7295787334442139, + "learning_rate": 7.13891411730769e-05, + "loss": 2.5127, + "step": 11889 + }, + { + "epoch": 0.9595674279719151, + "grad_norm": 0.646084189414978, + "learning_rate": 7.137401456842784e-05, + "loss": 2.5575, + "step": 11890 + }, + { + "epoch": 0.959648131708498, + "grad_norm": 0.7884495258331299, + "learning_rate": 7.135888867721312e-05, + "loss": 2.4807, + "step": 11891 + }, + { + "epoch": 0.9597288354450811, + "grad_norm": 0.638469934463501, + "learning_rate": 7.134376349980977e-05, + "loss": 2.4989, + "step": 11892 + }, + { + "epoch": 0.9598095391816641, + "grad_norm": 0.6802849769592285, + "learning_rate": 7.132863903659476e-05, + "loss": 2.5139, + "step": 11893 + }, + { + "epoch": 0.9598902429182471, + "grad_norm": 0.6657521724700928, + "learning_rate": 7.131351528794499e-05, + "loss": 2.4488, + "step": 11894 + }, + { + "epoch": 0.9599709466548301, + "grad_norm": 0.6537562012672424, + 
"learning_rate": 7.129839225423741e-05, + "loss": 2.4664, + "step": 11895 + }, + { + "epoch": 0.9600516503914132, + "grad_norm": 0.689637303352356, + "learning_rate": 7.128326993584897e-05, + "loss": 2.582, + "step": 11896 + }, + { + "epoch": 0.9601323541279961, + "grad_norm": 0.6701640486717224, + "learning_rate": 7.126814833315646e-05, + "loss": 2.4963, + "step": 11897 + }, + { + "epoch": 0.9602130578645791, + "grad_norm": 0.7466658353805542, + "learning_rate": 7.125302744653677e-05, + "loss": 2.5015, + "step": 11898 + }, + { + "epoch": 0.9602937616011621, + "grad_norm": 0.6487225294113159, + "learning_rate": 7.123790727636685e-05, + "loss": 2.5393, + "step": 11899 + }, + { + "epoch": 0.9603744653377452, + "grad_norm": 0.7204654216766357, + "learning_rate": 7.122278782302343e-05, + "loss": 2.4668, + "step": 11900 + }, + { + "epoch": 0.9604551690743282, + "grad_norm": 0.6852861046791077, + "learning_rate": 7.120766908688336e-05, + "loss": 2.5893, + "step": 11901 + }, + { + "epoch": 0.9605358728109111, + "grad_norm": 0.6483901739120483, + "learning_rate": 7.119255106832344e-05, + "loss": 2.48, + "step": 11902 + }, + { + "epoch": 0.9606165765474941, + "grad_norm": 0.6670375466346741, + "learning_rate": 7.117743376772049e-05, + "loss": 2.5225, + "step": 11903 + }, + { + "epoch": 0.9606972802840772, + "grad_norm": 0.6805974841117859, + "learning_rate": 7.116231718545118e-05, + "loss": 2.4652, + "step": 11904 + }, + { + "epoch": 0.9607779840206602, + "grad_norm": 0.6700397729873657, + "learning_rate": 7.114720132189232e-05, + "loss": 2.5115, + "step": 11905 + }, + { + "epoch": 0.9608586877572431, + "grad_norm": 0.7167409062385559, + "learning_rate": 7.113208617742066e-05, + "loss": 2.5062, + "step": 11906 + }, + { + "epoch": 0.9609393914938261, + "grad_norm": 0.7337077856063843, + "learning_rate": 7.111697175241286e-05, + "loss": 2.5768, + "step": 11907 + }, + { + "epoch": 0.9610200952304092, + "grad_norm": 0.6681819558143616, + "learning_rate": 7.110185804724558e-05, 
+ "loss": 2.5058, + "step": 11908 + }, + { + "epoch": 0.9611007989669922, + "grad_norm": 0.7235603332519531, + "learning_rate": 7.10867450622956e-05, + "loss": 2.4606, + "step": 11909 + }, + { + "epoch": 0.9611815027035752, + "grad_norm": 0.6931360363960266, + "learning_rate": 7.107163279793947e-05, + "loss": 2.5129, + "step": 11910 + }, + { + "epoch": 0.9612622064401581, + "grad_norm": 0.7331648468971252, + "learning_rate": 7.105652125455388e-05, + "loss": 2.4916, + "step": 11911 + }, + { + "epoch": 0.9613429101767412, + "grad_norm": 0.6538143754005432, + "learning_rate": 7.104141043251545e-05, + "loss": 2.5184, + "step": 11912 + }, + { + "epoch": 0.9614236139133242, + "grad_norm": 0.7018921375274658, + "learning_rate": 7.102630033220077e-05, + "loss": 2.5446, + "step": 11913 + }, + { + "epoch": 0.9615043176499072, + "grad_norm": 0.7528507709503174, + "learning_rate": 7.10111909539864e-05, + "loss": 2.4404, + "step": 11914 + }, + { + "epoch": 0.9615850213864902, + "grad_norm": 0.7258831858634949, + "learning_rate": 7.099608229824894e-05, + "loss": 2.4758, + "step": 11915 + }, + { + "epoch": 0.9616657251230732, + "grad_norm": 0.6954349875450134, + "learning_rate": 7.098097436536498e-05, + "loss": 2.4894, + "step": 11916 + }, + { + "epoch": 0.9617464288596562, + "grad_norm": 0.691584050655365, + "learning_rate": 7.096586715571092e-05, + "loss": 2.544, + "step": 11917 + }, + { + "epoch": 0.9618271325962392, + "grad_norm": 0.7107009291648865, + "learning_rate": 7.095076066966337e-05, + "loss": 2.4994, + "step": 11918 + }, + { + "epoch": 0.9619078363328222, + "grad_norm": 0.6492058634757996, + "learning_rate": 7.093565490759881e-05, + "loss": 2.5751, + "step": 11919 + }, + { + "epoch": 0.9619885400694053, + "grad_norm": 0.6817753314971924, + "learning_rate": 7.092054986989371e-05, + "loss": 2.5129, + "step": 11920 + }, + { + "epoch": 0.9620692438059882, + "grad_norm": 0.6991822123527527, + "learning_rate": 7.090544555692448e-05, + "loss": 2.5728, + "step": 11921 + }, + 
{ + "epoch": 0.9621499475425712, + "grad_norm": 0.6627625226974487, + "learning_rate": 7.089034196906768e-05, + "loss": 2.4479, + "step": 11922 + }, + { + "epoch": 0.9622306512791542, + "grad_norm": 0.6889652013778687, + "learning_rate": 7.087523910669957e-05, + "loss": 2.5323, + "step": 11923 + }, + { + "epoch": 0.9623113550157373, + "grad_norm": 0.7863786816596985, + "learning_rate": 7.086013697019667e-05, + "loss": 2.5146, + "step": 11924 + }, + { + "epoch": 0.9623920587523203, + "grad_norm": 0.6885324716567993, + "learning_rate": 7.084503555993536e-05, + "loss": 2.5072, + "step": 11925 + }, + { + "epoch": 0.9624727624889032, + "grad_norm": 0.619239091873169, + "learning_rate": 7.082993487629192e-05, + "loss": 2.4622, + "step": 11926 + }, + { + "epoch": 0.9625534662254862, + "grad_norm": 0.6762447953224182, + "learning_rate": 7.081483491964278e-05, + "loss": 2.5155, + "step": 11927 + }, + { + "epoch": 0.9626341699620693, + "grad_norm": 0.6559715867042542, + "learning_rate": 7.079973569036424e-05, + "loss": 2.4729, + "step": 11928 + }, + { + "epoch": 0.9627148736986523, + "grad_norm": 0.633280873298645, + "learning_rate": 7.078463718883261e-05, + "loss": 2.4715, + "step": 11929 + }, + { + "epoch": 0.9627955774352353, + "grad_norm": 0.7740094065666199, + "learning_rate": 7.07695394154242e-05, + "loss": 2.4871, + "step": 11930 + }, + { + "epoch": 0.9628762811718182, + "grad_norm": 0.7103284597396851, + "learning_rate": 7.075444237051527e-05, + "loss": 2.5299, + "step": 11931 + }, + { + "epoch": 0.9629569849084013, + "grad_norm": 0.6800934076309204, + "learning_rate": 7.073934605448212e-05, + "loss": 2.5919, + "step": 11932 + }, + { + "epoch": 0.9630376886449843, + "grad_norm": 0.6680917143821716, + "learning_rate": 7.072425046770092e-05, + "loss": 2.4942, + "step": 11933 + }, + { + "epoch": 0.9631183923815673, + "grad_norm": 0.7248062491416931, + "learning_rate": 7.070915561054792e-05, + "loss": 2.4956, + "step": 11934 + }, + { + "epoch": 0.9631990961181502, + 
"grad_norm": 0.6635782122612, + "learning_rate": 7.069406148339936e-05, + "loss": 2.4658, + "step": 11935 + }, + { + "epoch": 0.9632797998547332, + "grad_norm": 0.6751061081886292, + "learning_rate": 7.067896808663137e-05, + "loss": 2.4912, + "step": 11936 + }, + { + "epoch": 0.9633605035913163, + "grad_norm": 0.7476027607917786, + "learning_rate": 7.066387542062013e-05, + "loss": 2.4858, + "step": 11937 + }, + { + "epoch": 0.9634412073278993, + "grad_norm": 0.6770931482315063, + "learning_rate": 7.064878348574183e-05, + "loss": 2.4574, + "step": 11938 + }, + { + "epoch": 0.9635219110644823, + "grad_norm": 0.7105392813682556, + "learning_rate": 7.063369228237255e-05, + "loss": 2.5523, + "step": 11939 + }, + { + "epoch": 0.9636026148010652, + "grad_norm": 0.6806207299232483, + "learning_rate": 7.061860181088842e-05, + "loss": 2.4992, + "step": 11940 + }, + { + "epoch": 0.9636833185376483, + "grad_norm": 0.7059600353240967, + "learning_rate": 7.060351207166558e-05, + "loss": 2.5778, + "step": 11941 + }, + { + "epoch": 0.9637640222742313, + "grad_norm": 0.6306884288787842, + "learning_rate": 7.058842306508002e-05, + "loss": 2.5389, + "step": 11942 + }, + { + "epoch": 0.9638447260108143, + "grad_norm": 0.6997150778770447, + "learning_rate": 7.057333479150783e-05, + "loss": 2.5077, + "step": 11943 + }, + { + "epoch": 0.9639254297473973, + "grad_norm": 0.7073743343353271, + "learning_rate": 7.05582472513251e-05, + "loss": 2.5274, + "step": 11944 + }, + { + "epoch": 0.9640061334839803, + "grad_norm": 0.6768803596496582, + "learning_rate": 7.054316044490777e-05, + "loss": 2.5155, + "step": 11945 + }, + { + "epoch": 0.9640868372205633, + "grad_norm": 0.6792057752609253, + "learning_rate": 7.052807437263189e-05, + "loss": 2.5509, + "step": 11946 + }, + { + "epoch": 0.9641675409571463, + "grad_norm": 0.6883981823921204, + "learning_rate": 7.051298903487344e-05, + "loss": 2.5176, + "step": 11947 + }, + { + "epoch": 0.9642482446937293, + "grad_norm": 0.6934401392936707, + 
"learning_rate": 7.049790443200844e-05, + "loss": 2.502, + "step": 11948 + }, + { + "epoch": 0.9643289484303124, + "grad_norm": 0.6882597804069519, + "learning_rate": 7.048282056441269e-05, + "loss": 2.487, + "step": 11949 + }, + { + "epoch": 0.9644096521668953, + "grad_norm": 0.6972896456718445, + "learning_rate": 7.046773743246225e-05, + "loss": 2.5304, + "step": 11950 + }, + { + "epoch": 0.9644903559034783, + "grad_norm": 0.6591988205909729, + "learning_rate": 7.045265503653303e-05, + "loss": 2.4734, + "step": 11951 + }, + { + "epoch": 0.9645710596400613, + "grad_norm": 0.6890063285827637, + "learning_rate": 7.043757337700082e-05, + "loss": 2.5289, + "step": 11952 + }, + { + "epoch": 0.9646517633766444, + "grad_norm": 0.6931065917015076, + "learning_rate": 7.042249245424157e-05, + "loss": 2.484, + "step": 11953 + }, + { + "epoch": 0.9647324671132274, + "grad_norm": 0.6943762898445129, + "learning_rate": 7.040741226863117e-05, + "loss": 2.501, + "step": 11954 + }, + { + "epoch": 0.9648131708498103, + "grad_norm": 0.677154004573822, + "learning_rate": 7.039233282054536e-05, + "loss": 2.4976, + "step": 11955 + }, + { + "epoch": 0.9648938745863933, + "grad_norm": 0.6662883758544922, + "learning_rate": 7.037725411036003e-05, + "loss": 2.4928, + "step": 11956 + }, + { + "epoch": 0.9649745783229764, + "grad_norm": 0.6854663491249084, + "learning_rate": 7.0362176138451e-05, + "loss": 2.4657, + "step": 11957 + }, + { + "epoch": 0.9650552820595594, + "grad_norm": 0.6703238487243652, + "learning_rate": 7.034709890519397e-05, + "loss": 2.4879, + "step": 11958 + }, + { + "epoch": 0.9651359857961423, + "grad_norm": 0.7023652791976929, + "learning_rate": 7.033202241096474e-05, + "loss": 2.4619, + "step": 11959 + }, + { + "epoch": 0.9652166895327253, + "grad_norm": 0.6950454711914062, + "learning_rate": 7.031694665613911e-05, + "loss": 2.5125, + "step": 11960 + }, + { + "epoch": 0.9652973932693084, + "grad_norm": 0.6740411520004272, + "learning_rate": 7.030187164109272e-05, + 
"loss": 2.436, + "step": 11961 + }, + { + "epoch": 0.9653780970058914, + "grad_norm": 0.6697152256965637, + "learning_rate": 7.028679736620132e-05, + "loss": 2.5513, + "step": 11962 + }, + { + "epoch": 0.9654588007424744, + "grad_norm": 0.6920599937438965, + "learning_rate": 7.027172383184061e-05, + "loss": 2.5264, + "step": 11963 + }, + { + "epoch": 0.9655395044790573, + "grad_norm": 0.6493465304374695, + "learning_rate": 7.025665103838627e-05, + "loss": 2.4834, + "step": 11964 + }, + { + "epoch": 0.9656202082156404, + "grad_norm": 0.684092104434967, + "learning_rate": 7.02415789862139e-05, + "loss": 2.4662, + "step": 11965 + }, + { + "epoch": 0.9657009119522234, + "grad_norm": 0.7161515355110168, + "learning_rate": 7.022650767569921e-05, + "loss": 2.4648, + "step": 11966 + }, + { + "epoch": 0.9657816156888064, + "grad_norm": 0.6994524002075195, + "learning_rate": 7.021143710721778e-05, + "loss": 2.5186, + "step": 11967 + }, + { + "epoch": 0.9658623194253894, + "grad_norm": 0.7105295062065125, + "learning_rate": 7.019636728114518e-05, + "loss": 2.5132, + "step": 11968 + }, + { + "epoch": 0.9659430231619724, + "grad_norm": 0.7182292938232422, + "learning_rate": 7.018129819785702e-05, + "loss": 2.5469, + "step": 11969 + }, + { + "epoch": 0.9660237268985554, + "grad_norm": 0.7021759152412415, + "learning_rate": 7.016622985772887e-05, + "loss": 2.5477, + "step": 11970 + }, + { + "epoch": 0.9661044306351384, + "grad_norm": 0.6751413941383362, + "learning_rate": 7.015116226113624e-05, + "loss": 2.5174, + "step": 11971 + }, + { + "epoch": 0.9661851343717214, + "grad_norm": 0.6341918110847473, + "learning_rate": 7.013609540845468e-05, + "loss": 2.4778, + "step": 11972 + }, + { + "epoch": 0.9662658381083045, + "grad_norm": 0.7080956697463989, + "learning_rate": 7.012102930005971e-05, + "loss": 2.5304, + "step": 11973 + }, + { + "epoch": 0.9663465418448874, + "grad_norm": 0.6367003321647644, + "learning_rate": 7.010596393632674e-05, + "loss": 2.4857, + "step": 11974 + }, + 
{ + "epoch": 0.9664272455814704, + "grad_norm": 0.6841328740119934, + "learning_rate": 7.009089931763131e-05, + "loss": 2.5365, + "step": 11975 + }, + { + "epoch": 0.9665079493180534, + "grad_norm": 0.6568236351013184, + "learning_rate": 7.00758354443489e-05, + "loss": 2.5286, + "step": 11976 + }, + { + "epoch": 0.9665886530546365, + "grad_norm": 0.7071812152862549, + "learning_rate": 7.006077231685485e-05, + "loss": 2.458, + "step": 11977 + }, + { + "epoch": 0.9666693567912195, + "grad_norm": 0.6997712850570679, + "learning_rate": 7.004570993552462e-05, + "loss": 2.4571, + "step": 11978 + }, + { + "epoch": 0.9667500605278024, + "grad_norm": 0.6920793056488037, + "learning_rate": 7.003064830073359e-05, + "loss": 2.4172, + "step": 11979 + }, + { + "epoch": 0.9668307642643854, + "grad_norm": 0.6823387742042542, + "learning_rate": 7.001558741285718e-05, + "loss": 2.4895, + "step": 11980 + }, + { + "epoch": 0.9669114680009685, + "grad_norm": 0.7309569716453552, + "learning_rate": 7.000052727227068e-05, + "loss": 2.502, + "step": 11981 + }, + { + "epoch": 0.9669921717375515, + "grad_norm": 0.734708845615387, + "learning_rate": 6.998546787934946e-05, + "loss": 2.4918, + "step": 11982 + }, + { + "epoch": 0.9670728754741345, + "grad_norm": 0.690406084060669, + "learning_rate": 6.997040923446889e-05, + "loss": 2.4994, + "step": 11983 + }, + { + "epoch": 0.9671535792107174, + "grad_norm": 0.7126687169075012, + "learning_rate": 6.995535133800416e-05, + "loss": 2.4824, + "step": 11984 + }, + { + "epoch": 0.9672342829473004, + "grad_norm": 0.7020599246025085, + "learning_rate": 6.994029419033062e-05, + "loss": 2.4889, + "step": 11985 + }, + { + "epoch": 0.9673149866838835, + "grad_norm": 0.7690796852111816, + "learning_rate": 6.992523779182356e-05, + "loss": 2.4997, + "step": 11986 + }, + { + "epoch": 0.9673956904204665, + "grad_norm": 0.6635778546333313, + "learning_rate": 6.991018214285816e-05, + "loss": 2.4989, + "step": 11987 + }, + { + "epoch": 0.9674763941570494, + 
"grad_norm": 0.7088577747344971, + "learning_rate": 6.989512724380967e-05, + "loss": 2.549, + "step": 11988 + }, + { + "epoch": 0.9675570978936324, + "grad_norm": 0.6420924663543701, + "learning_rate": 6.988007309505333e-05, + "loss": 2.4585, + "step": 11989 + }, + { + "epoch": 0.9676378016302155, + "grad_norm": 0.7902400493621826, + "learning_rate": 6.986501969696428e-05, + "loss": 2.5009, + "step": 11990 + }, + { + "epoch": 0.9677185053667985, + "grad_norm": 0.700907289981842, + "learning_rate": 6.984996704991773e-05, + "loss": 2.4778, + "step": 11991 + }, + { + "epoch": 0.9677992091033815, + "grad_norm": 0.664378821849823, + "learning_rate": 6.983491515428883e-05, + "loss": 2.5116, + "step": 11992 + }, + { + "epoch": 0.9678799128399644, + "grad_norm": 0.6314663887023926, + "learning_rate": 6.981986401045266e-05, + "loss": 2.4588, + "step": 11993 + }, + { + "epoch": 0.9679606165765475, + "grad_norm": 0.6521078944206238, + "learning_rate": 6.980481361878438e-05, + "loss": 2.5224, + "step": 11994 + }, + { + "epoch": 0.9680413203131305, + "grad_norm": 0.6336014270782471, + "learning_rate": 6.978976397965907e-05, + "loss": 2.4297, + "step": 11995 + }, + { + "epoch": 0.9681220240497135, + "grad_norm": 0.7321500778198242, + "learning_rate": 6.977471509345183e-05, + "loss": 2.5252, + "step": 11996 + }, + { + "epoch": 0.9682027277862965, + "grad_norm": 0.686950147151947, + "learning_rate": 6.97596669605377e-05, + "loss": 2.5188, + "step": 11997 + }, + { + "epoch": 0.9682834315228795, + "grad_norm": 0.729343056678772, + "learning_rate": 6.97446195812917e-05, + "loss": 2.5157, + "step": 11998 + }, + { + "epoch": 0.9683641352594625, + "grad_norm": 0.6447068452835083, + "learning_rate": 6.972957295608889e-05, + "loss": 2.5041, + "step": 11999 + }, + { + "epoch": 0.9684448389960455, + "grad_norm": 0.6847280859947205, + "learning_rate": 6.971452708530423e-05, + "loss": 2.443, + "step": 12000 + }, + { + "epoch": 0.9684448389960455, + "eval_loss": 2.431878089904785, + 
"eval_runtime": 758.167, + "eval_samples_per_second": 3.456, + "eval_steps_per_second": 0.576, + "step": 12000 + }, + { + "epoch": 0.9685255427326285, + "grad_norm": 0.6440466046333313, + "learning_rate": 6.969948196931272e-05, + "loss": 2.5091, + "step": 12001 + }, + { + "epoch": 0.9686062464692116, + "grad_norm": 0.6570029258728027, + "learning_rate": 6.968443760848937e-05, + "loss": 2.491, + "step": 12002 + }, + { + "epoch": 0.9686869502057945, + "grad_norm": 0.7610877752304077, + "learning_rate": 6.966939400320905e-05, + "loss": 2.4713, + "step": 12003 + }, + { + "epoch": 0.9687676539423775, + "grad_norm": 0.7187781929969788, + "learning_rate": 6.965435115384669e-05, + "loss": 2.4303, + "step": 12004 + }, + { + "epoch": 0.9688483576789605, + "grad_norm": 0.7668420672416687, + "learning_rate": 6.963930906077727e-05, + "loss": 2.5513, + "step": 12005 + }, + { + "epoch": 0.9689290614155436, + "grad_norm": 0.7025619745254517, + "learning_rate": 6.96242677243756e-05, + "loss": 2.4349, + "step": 12006 + }, + { + "epoch": 0.9690097651521266, + "grad_norm": 0.7066935896873474, + "learning_rate": 6.960922714501657e-05, + "loss": 2.5465, + "step": 12007 + }, + { + "epoch": 0.9690904688887095, + "grad_norm": 0.6758970618247986, + "learning_rate": 6.95941873230751e-05, + "loss": 2.4827, + "step": 12008 + }, + { + "epoch": 0.9691711726252925, + "grad_norm": 0.7108862996101379, + "learning_rate": 6.957914825892591e-05, + "loss": 2.5412, + "step": 12009 + }, + { + "epoch": 0.9692518763618756, + "grad_norm": 0.660784125328064, + "learning_rate": 6.956410995294389e-05, + "loss": 2.5173, + "step": 12010 + }, + { + "epoch": 0.9693325800984586, + "grad_norm": 0.6966561079025269, + "learning_rate": 6.954907240550377e-05, + "loss": 2.5196, + "step": 12011 + }, + { + "epoch": 0.9694132838350416, + "grad_norm": 0.6889416575431824, + "learning_rate": 6.953403561698042e-05, + "loss": 2.5351, + "step": 12012 + }, + { + "epoch": 0.9694939875716245, + "grad_norm": 0.7578341960906982, + 
"learning_rate": 6.951899958774852e-05, + "loss": 2.5184, + "step": 12013 + }, + { + "epoch": 0.9695746913082076, + "grad_norm": 0.6735317707061768, + "learning_rate": 6.950396431818282e-05, + "loss": 2.4592, + "step": 12014 + }, + { + "epoch": 0.9696553950447906, + "grad_norm": 0.6903232932090759, + "learning_rate": 6.948892980865806e-05, + "loss": 2.5212, + "step": 12015 + }, + { + "epoch": 0.9697360987813736, + "grad_norm": 0.6477165818214417, + "learning_rate": 6.94738960595489e-05, + "loss": 2.4423, + "step": 12016 + }, + { + "epoch": 0.9698168025179565, + "grad_norm": 0.6778751015663147, + "learning_rate": 6.945886307123007e-05, + "loss": 2.547, + "step": 12017 + }, + { + "epoch": 0.9698975062545396, + "grad_norm": 0.690558135509491, + "learning_rate": 6.944383084407623e-05, + "loss": 2.5081, + "step": 12018 + }, + { + "epoch": 0.9699782099911226, + "grad_norm": 0.7210639119148254, + "learning_rate": 6.942879937846196e-05, + "loss": 2.496, + "step": 12019 + }, + { + "epoch": 0.9700589137277056, + "grad_norm": 0.7182444930076599, + "learning_rate": 6.941376867476194e-05, + "loss": 2.6138, + "step": 12020 + }, + { + "epoch": 0.9701396174642886, + "grad_norm": 0.6929295063018799, + "learning_rate": 6.939873873335077e-05, + "loss": 2.4828, + "step": 12021 + }, + { + "epoch": 0.9702203212008716, + "grad_norm": 0.6919693350791931, + "learning_rate": 6.938370955460298e-05, + "loss": 2.5123, + "step": 12022 + }, + { + "epoch": 0.9703010249374546, + "grad_norm": 0.6475244164466858, + "learning_rate": 6.93686811388932e-05, + "loss": 2.4992, + "step": 12023 + }, + { + "epoch": 0.9703817286740376, + "grad_norm": 0.6728265881538391, + "learning_rate": 6.935365348659597e-05, + "loss": 2.4486, + "step": 12024 + }, + { + "epoch": 0.9704624324106206, + "grad_norm": 0.6791470646858215, + "learning_rate": 6.933862659808582e-05, + "loss": 2.4657, + "step": 12025 + }, + { + "epoch": 0.9705431361472037, + "grad_norm": 0.7611662745475769, + "learning_rate": 6.932360047373721e-05, + 
"loss": 2.5243, + "step": 12026 + }, + { + "epoch": 0.9706238398837866, + "grad_norm": 0.6642355918884277, + "learning_rate": 6.930857511392467e-05, + "loss": 2.5308, + "step": 12027 + }, + { + "epoch": 0.9707045436203696, + "grad_norm": 0.7270805239677429, + "learning_rate": 6.92935505190227e-05, + "loss": 2.4708, + "step": 12028 + }, + { + "epoch": 0.9707852473569526, + "grad_norm": 0.6706295013427734, + "learning_rate": 6.927852668940568e-05, + "loss": 2.5136, + "step": 12029 + }, + { + "epoch": 0.9708659510935357, + "grad_norm": 0.6923376321792603, + "learning_rate": 6.92635036254481e-05, + "loss": 2.5238, + "step": 12030 + }, + { + "epoch": 0.9709466548301187, + "grad_norm": 0.7154483199119568, + "learning_rate": 6.924848132752436e-05, + "loss": 2.488, + "step": 12031 + }, + { + "epoch": 0.9710273585667016, + "grad_norm": 0.6675701141357422, + "learning_rate": 6.923345979600884e-05, + "loss": 2.5066, + "step": 12032 + }, + { + "epoch": 0.9711080623032846, + "grad_norm": 0.7282043695449829, + "learning_rate": 6.921843903127592e-05, + "loss": 2.5096, + "step": 12033 + }, + { + "epoch": 0.9711887660398677, + "grad_norm": 0.663526177406311, + "learning_rate": 6.92034190337e-05, + "loss": 2.5276, + "step": 12034 + }, + { + "epoch": 0.9712694697764507, + "grad_norm": 0.7491087913513184, + "learning_rate": 6.918839980365534e-05, + "loss": 2.5044, + "step": 12035 + }, + { + "epoch": 0.9713501735130337, + "grad_norm": 0.6977766156196594, + "learning_rate": 6.917338134151629e-05, + "loss": 2.6102, + "step": 12036 + }, + { + "epoch": 0.9714308772496166, + "grad_norm": 0.6447446346282959, + "learning_rate": 6.915836364765722e-05, + "loss": 2.5137, + "step": 12037 + }, + { + "epoch": 0.9715115809861996, + "grad_norm": 0.6801442503929138, + "learning_rate": 6.91433467224523e-05, + "loss": 2.5145, + "step": 12038 + }, + { + "epoch": 0.9715922847227827, + "grad_norm": 0.6843627691268921, + "learning_rate": 6.912833056627583e-05, + "loss": 2.6099, + "step": 12039 + }, + { + 
"epoch": 0.9716729884593657, + "grad_norm": 0.6862856149673462, + "learning_rate": 6.911331517950209e-05, + "loss": 2.5358, + "step": 12040 + }, + { + "epoch": 0.9717536921959486, + "grad_norm": 0.6835047602653503, + "learning_rate": 6.909830056250527e-05, + "loss": 2.5257, + "step": 12041 + }, + { + "epoch": 0.9718343959325316, + "grad_norm": 0.6958080530166626, + "learning_rate": 6.908328671565956e-05, + "loss": 2.5008, + "step": 12042 + }, + { + "epoch": 0.9719150996691147, + "grad_norm": 0.7556219100952148, + "learning_rate": 6.906827363933917e-05, + "loss": 2.5283, + "step": 12043 + }, + { + "epoch": 0.9719958034056977, + "grad_norm": 0.7074917554855347, + "learning_rate": 6.90532613339183e-05, + "loss": 2.4898, + "step": 12044 + }, + { + "epoch": 0.9720765071422807, + "grad_norm": 0.6456350684165955, + "learning_rate": 6.903824979977101e-05, + "loss": 2.4989, + "step": 12045 + }, + { + "epoch": 0.9721572108788636, + "grad_norm": 0.6609941720962524, + "learning_rate": 6.902323903727146e-05, + "loss": 2.4883, + "step": 12046 + }, + { + "epoch": 0.9722379146154467, + "grad_norm": 0.7132936716079712, + "learning_rate": 6.90082290467938e-05, + "loss": 2.4983, + "step": 12047 + }, + { + "epoch": 0.9723186183520297, + "grad_norm": 0.6686434745788574, + "learning_rate": 6.899321982871206e-05, + "loss": 2.4862, + "step": 12048 + }, + { + "epoch": 0.9723993220886127, + "grad_norm": 0.6792194247245789, + "learning_rate": 6.897821138340033e-05, + "loss": 2.5368, + "step": 12049 + }, + { + "epoch": 0.9724800258251957, + "grad_norm": 0.6829379796981812, + "learning_rate": 6.896320371123268e-05, + "loss": 2.4842, + "step": 12050 + }, + { + "epoch": 0.9725607295617787, + "grad_norm": 0.7459573745727539, + "learning_rate": 6.894819681258312e-05, + "loss": 2.5023, + "step": 12051 + }, + { + "epoch": 0.9726414332983617, + "grad_norm": 0.6700068712234497, + "learning_rate": 6.893319068782566e-05, + "loss": 2.552, + "step": 12052 + }, + { + "epoch": 0.9727221370349447, + 
"grad_norm": 0.7093638777732849, + "learning_rate": 6.891818533733434e-05, + "loss": 2.445, + "step": 12053 + }, + { + "epoch": 0.9728028407715277, + "grad_norm": 0.703599214553833, + "learning_rate": 6.890318076148304e-05, + "loss": 2.5536, + "step": 12054 + }, + { + "epoch": 0.9728835445081108, + "grad_norm": 0.6214482188224792, + "learning_rate": 6.888817696064578e-05, + "loss": 2.5188, + "step": 12055 + }, + { + "epoch": 0.9729642482446937, + "grad_norm": 0.6893547773361206, + "learning_rate": 6.887317393519645e-05, + "loss": 2.5596, + "step": 12056 + }, + { + "epoch": 0.9730449519812767, + "grad_norm": 0.6282656788825989, + "learning_rate": 6.885817168550903e-05, + "loss": 2.4873, + "step": 12057 + }, + { + "epoch": 0.9731256557178597, + "grad_norm": 0.6979188323020935, + "learning_rate": 6.884317021195737e-05, + "loss": 2.5358, + "step": 12058 + }, + { + "epoch": 0.9732063594544428, + "grad_norm": 0.7925785183906555, + "learning_rate": 6.882816951491533e-05, + "loss": 2.5358, + "step": 12059 + }, + { + "epoch": 0.9732870631910258, + "grad_norm": 0.6449821591377258, + "learning_rate": 6.881316959475684e-05, + "loss": 2.4784, + "step": 12060 + }, + { + "epoch": 0.9733677669276087, + "grad_norm": 0.7013393044471741, + "learning_rate": 6.879817045185565e-05, + "loss": 2.4804, + "step": 12061 + }, + { + "epoch": 0.9734484706641917, + "grad_norm": 0.8338057398796082, + "learning_rate": 6.878317208658559e-05, + "loss": 2.512, + "step": 12062 + }, + { + "epoch": 0.9735291744007748, + "grad_norm": 0.6815133094787598, + "learning_rate": 6.876817449932054e-05, + "loss": 2.467, + "step": 12063 + }, + { + "epoch": 0.9736098781373578, + "grad_norm": 0.659156858921051, + "learning_rate": 6.87531776904342e-05, + "loss": 2.503, + "step": 12064 + }, + { + "epoch": 0.9736905818739408, + "grad_norm": 0.7149603962898254, + "learning_rate": 6.873818166030033e-05, + "loss": 2.5135, + "step": 12065 + }, + { + "epoch": 0.9737712856105237, + "grad_norm": 0.7010510563850403, + 
"learning_rate": 6.872318640929272e-05, + "loss": 2.5133, + "step": 12066 + }, + { + "epoch": 0.9738519893471068, + "grad_norm": 0.6247616410255432, + "learning_rate": 6.870819193778504e-05, + "loss": 2.5189, + "step": 12067 + }, + { + "epoch": 0.9739326930836898, + "grad_norm": 0.6938940286636353, + "learning_rate": 6.869319824615101e-05, + "loss": 2.5053, + "step": 12068 + }, + { + "epoch": 0.9740133968202728, + "grad_norm": 0.7636895179748535, + "learning_rate": 6.867820533476436e-05, + "loss": 2.4989, + "step": 12069 + }, + { + "epoch": 0.9740941005568557, + "grad_norm": 0.6489234566688538, + "learning_rate": 6.866321320399869e-05, + "loss": 2.4935, + "step": 12070 + }, + { + "epoch": 0.9741748042934388, + "grad_norm": 0.6752095818519592, + "learning_rate": 6.864822185422764e-05, + "loss": 2.4835, + "step": 12071 + }, + { + "epoch": 0.9742555080300218, + "grad_norm": 0.6947118639945984, + "learning_rate": 6.863323128582486e-05, + "loss": 2.504, + "step": 12072 + }, + { + "epoch": 0.9743362117666048, + "grad_norm": 0.6815536618232727, + "learning_rate": 6.861824149916398e-05, + "loss": 2.5369, + "step": 12073 + }, + { + "epoch": 0.9744169155031878, + "grad_norm": 0.6550236344337463, + "learning_rate": 6.860325249461852e-05, + "loss": 2.4753, + "step": 12074 + }, + { + "epoch": 0.9744976192397709, + "grad_norm": 0.6833250522613525, + "learning_rate": 6.858826427256209e-05, + "loss": 2.4687, + "step": 12075 + }, + { + "epoch": 0.9745783229763538, + "grad_norm": 0.6925075650215149, + "learning_rate": 6.857327683336824e-05, + "loss": 2.5363, + "step": 12076 + }, + { + "epoch": 0.9746590267129368, + "grad_norm": 0.6754821538925171, + "learning_rate": 6.855829017741046e-05, + "loss": 2.4696, + "step": 12077 + }, + { + "epoch": 0.9747397304495198, + "grad_norm": 0.7360671162605286, + "learning_rate": 6.854330430506228e-05, + "loss": 2.5144, + "step": 12078 + }, + { + "epoch": 0.9748204341861029, + "grad_norm": 0.6814733743667603, + "learning_rate": 
6.852831921669723e-05, + "loss": 2.5059, + "step": 12079 + }, + { + "epoch": 0.9749011379226858, + "grad_norm": 0.7106744647026062, + "learning_rate": 6.851333491268869e-05, + "loss": 2.453, + "step": 12080 + }, + { + "epoch": 0.9749818416592688, + "grad_norm": 0.6623831987380981, + "learning_rate": 6.849835139341015e-05, + "loss": 2.5244, + "step": 12081 + }, + { + "epoch": 0.9750625453958518, + "grad_norm": 0.6723372936248779, + "learning_rate": 6.848336865923506e-05, + "loss": 2.5159, + "step": 12082 + }, + { + "epoch": 0.9751432491324349, + "grad_norm": 0.7256618142127991, + "learning_rate": 6.84683867105368e-05, + "loss": 2.494, + "step": 12083 + }, + { + "epoch": 0.9752239528690179, + "grad_norm": 0.6881731152534485, + "learning_rate": 6.845340554768874e-05, + "loss": 2.4374, + "step": 12084 + }, + { + "epoch": 0.9753046566056008, + "grad_norm": 0.6759666204452515, + "learning_rate": 6.843842517106434e-05, + "loss": 2.5082, + "step": 12085 + }, + { + "epoch": 0.9753853603421838, + "grad_norm": 0.6983315348625183, + "learning_rate": 6.842344558103684e-05, + "loss": 2.5191, + "step": 12086 + }, + { + "epoch": 0.9754660640787668, + "grad_norm": 0.6805596351623535, + "learning_rate": 6.840846677797959e-05, + "loss": 2.5289, + "step": 12087 + }, + { + "epoch": 0.9755467678153499, + "grad_norm": 0.712942361831665, + "learning_rate": 6.839348876226595e-05, + "loss": 2.5544, + "step": 12088 + }, + { + "epoch": 0.9756274715519329, + "grad_norm": 0.6931124329566956, + "learning_rate": 6.837851153426924e-05, + "loss": 2.5407, + "step": 12089 + }, + { + "epoch": 0.9757081752885158, + "grad_norm": 0.6939486265182495, + "learning_rate": 6.836353509436264e-05, + "loss": 2.5236, + "step": 12090 + }, + { + "epoch": 0.9757888790250988, + "grad_norm": 0.7434083223342896, + "learning_rate": 6.834855944291944e-05, + "loss": 2.4903, + "step": 12091 + }, + { + "epoch": 0.9758695827616819, + "grad_norm": 0.672177255153656, + "learning_rate": 6.833358458031292e-05, + "loss": 2.4995, 
+ "step": 12092 + }, + { + "epoch": 0.9759502864982649, + "grad_norm": 0.6631280779838562, + "learning_rate": 6.831861050691619e-05, + "loss": 2.4689, + "step": 12093 + }, + { + "epoch": 0.9760309902348479, + "grad_norm": 0.7485793232917786, + "learning_rate": 6.830363722310253e-05, + "loss": 2.5526, + "step": 12094 + }, + { + "epoch": 0.9761116939714308, + "grad_norm": 0.6592193245887756, + "learning_rate": 6.828866472924511e-05, + "loss": 2.4425, + "step": 12095 + }, + { + "epoch": 0.9761923977080139, + "grad_norm": 0.6479860544204712, + "learning_rate": 6.827369302571703e-05, + "loss": 2.4637, + "step": 12096 + }, + { + "epoch": 0.9762731014445969, + "grad_norm": 0.6694966554641724, + "learning_rate": 6.825872211289146e-05, + "loss": 2.5256, + "step": 12097 + }, + { + "epoch": 0.9763538051811799, + "grad_norm": 0.675751805305481, + "learning_rate": 6.82437519911415e-05, + "loss": 2.5021, + "step": 12098 + }, + { + "epoch": 0.9764345089177628, + "grad_norm": 0.7255450487136841, + "learning_rate": 6.822878266084026e-05, + "loss": 2.5275, + "step": 12099 + }, + { + "epoch": 0.9765152126543459, + "grad_norm": 0.7034213542938232, + "learning_rate": 6.821381412236079e-05, + "loss": 2.5432, + "step": 12100 + }, + { + "epoch": 0.9765959163909289, + "grad_norm": 0.6808038949966431, + "learning_rate": 6.819884637607619e-05, + "loss": 2.5044, + "step": 12101 + }, + { + "epoch": 0.9766766201275119, + "grad_norm": 0.6601580381393433, + "learning_rate": 6.818387942235945e-05, + "loss": 2.4602, + "step": 12102 + }, + { + "epoch": 0.9767573238640949, + "grad_norm": 0.7163928151130676, + "learning_rate": 6.816891326158359e-05, + "loss": 2.4785, + "step": 12103 + }, + { + "epoch": 0.976838027600678, + "grad_norm": 0.6616904735565186, + "learning_rate": 6.815394789412164e-05, + "loss": 2.5081, + "step": 12104 + }, + { + "epoch": 0.9769187313372609, + "grad_norm": 0.6476422548294067, + "learning_rate": 6.813898332034657e-05, + "loss": 2.4624, + "step": 12105 + }, + { + "epoch": 
0.9769994350738439, + "grad_norm": 0.6468440890312195, + "learning_rate": 6.812401954063131e-05, + "loss": 2.4948, + "step": 12106 + }, + { + "epoch": 0.9770801388104269, + "grad_norm": 0.6988391876220703, + "learning_rate": 6.810905655534878e-05, + "loss": 2.4958, + "step": 12107 + }, + { + "epoch": 0.97716084254701, + "grad_norm": 0.6777953505516052, + "learning_rate": 6.809409436487196e-05, + "loss": 2.5304, + "step": 12108 + }, + { + "epoch": 0.9772415462835929, + "grad_norm": 0.7115550637245178, + "learning_rate": 6.807913296957368e-05, + "loss": 2.5321, + "step": 12109 + }, + { + "epoch": 0.9773222500201759, + "grad_norm": 0.737823486328125, + "learning_rate": 6.806417236982684e-05, + "loss": 2.5121, + "step": 12110 + }, + { + "epoch": 0.9774029537567589, + "grad_norm": 0.6797437071800232, + "learning_rate": 6.804921256600439e-05, + "loss": 2.4783, + "step": 12111 + }, + { + "epoch": 0.977483657493342, + "grad_norm": 0.7240802645683289, + "learning_rate": 6.803425355847897e-05, + "loss": 2.4949, + "step": 12112 + }, + { + "epoch": 0.977564361229925, + "grad_norm": 0.6433781981468201, + "learning_rate": 6.801929534762357e-05, + "loss": 2.4937, + "step": 12113 + }, + { + "epoch": 0.9776450649665079, + "grad_norm": 0.6935293078422546, + "learning_rate": 6.800433793381095e-05, + "loss": 2.5025, + "step": 12114 + }, + { + "epoch": 0.9777257687030909, + "grad_norm": 0.699780285358429, + "learning_rate": 6.798938131741383e-05, + "loss": 2.5231, + "step": 12115 + }, + { + "epoch": 0.977806472439674, + "grad_norm": 0.6414729952812195, + "learning_rate": 6.7974425498805e-05, + "loss": 2.4422, + "step": 12116 + }, + { + "epoch": 0.977887176176257, + "grad_norm": 0.6733608841896057, + "learning_rate": 6.795947047835722e-05, + "loss": 2.4873, + "step": 12117 + }, + { + "epoch": 0.97796787991284, + "grad_norm": 0.6985765099525452, + "learning_rate": 6.794451625644318e-05, + "loss": 2.4994, + "step": 12118 + }, + { + "epoch": 0.9780485836494229, + "grad_norm": 
0.6429893374443054, + "learning_rate": 6.792956283343559e-05, + "loss": 2.4968, + "step": 12119 + }, + { + "epoch": 0.978129287386006, + "grad_norm": 0.7129024267196655, + "learning_rate": 6.79146102097071e-05, + "loss": 2.5457, + "step": 12120 + }, + { + "epoch": 0.978209991122589, + "grad_norm": 0.6811943650245667, + "learning_rate": 6.789965838563047e-05, + "loss": 2.5012, + "step": 12121 + }, + { + "epoch": 0.978290694859172, + "grad_norm": 0.7269948720932007, + "learning_rate": 6.788470736157821e-05, + "loss": 2.5124, + "step": 12122 + }, + { + "epoch": 0.978371398595755, + "grad_norm": 0.7396084666252136, + "learning_rate": 6.786975713792299e-05, + "loss": 2.5631, + "step": 12123 + }, + { + "epoch": 0.978452102332338, + "grad_norm": 0.6880094408988953, + "learning_rate": 6.785480771503745e-05, + "loss": 2.5103, + "step": 12124 + }, + { + "epoch": 0.978532806068921, + "grad_norm": 0.737095057964325, + "learning_rate": 6.783985909329409e-05, + "loss": 2.5062, + "step": 12125 + }, + { + "epoch": 0.978613509805504, + "grad_norm": 0.6540948152542114, + "learning_rate": 6.782491127306552e-05, + "loss": 2.5568, + "step": 12126 + }, + { + "epoch": 0.978694213542087, + "grad_norm": 0.669706404209137, + "learning_rate": 6.780996425472427e-05, + "loss": 2.5156, + "step": 12127 + }, + { + "epoch": 0.97877491727867, + "grad_norm": 0.6722843647003174, + "learning_rate": 6.779501803864286e-05, + "loss": 2.4784, + "step": 12128 + }, + { + "epoch": 0.978855621015253, + "grad_norm": 0.6545475125312805, + "learning_rate": 6.778007262519377e-05, + "loss": 2.5159, + "step": 12129 + }, + { + "epoch": 0.978936324751836, + "grad_norm": 0.7010136246681213, + "learning_rate": 6.776512801474953e-05, + "loss": 2.5244, + "step": 12130 + }, + { + "epoch": 0.979017028488419, + "grad_norm": 0.6912714242935181, + "learning_rate": 6.775018420768253e-05, + "loss": 2.5223, + "step": 12131 + }, + { + "epoch": 0.9790977322250021, + "grad_norm": 0.6864827275276184, + "learning_rate": 
6.773524120436525e-05, + "loss": 2.5027, + "step": 12132 + }, + { + "epoch": 0.979178435961585, + "grad_norm": 0.7586981058120728, + "learning_rate": 6.77202990051701e-05, + "loss": 2.4554, + "step": 12133 + }, + { + "epoch": 0.979259139698168, + "grad_norm": 0.6487839818000793, + "learning_rate": 6.770535761046948e-05, + "loss": 2.5035, + "step": 12134 + }, + { + "epoch": 0.979339843434751, + "grad_norm": 0.7193071246147156, + "learning_rate": 6.769041702063575e-05, + "loss": 2.4669, + "step": 12135 + }, + { + "epoch": 0.9794205471713341, + "grad_norm": 0.7118960618972778, + "learning_rate": 6.76754772360413e-05, + "loss": 2.493, + "step": 12136 + }, + { + "epoch": 0.9795012509079171, + "grad_norm": 0.6617394685745239, + "learning_rate": 6.766053825705847e-05, + "loss": 2.4771, + "step": 12137 + }, + { + "epoch": 0.9795819546445, + "grad_norm": 0.7664859294891357, + "learning_rate": 6.764560008405953e-05, + "loss": 2.5191, + "step": 12138 + }, + { + "epoch": 0.979662658381083, + "grad_norm": 0.708063542842865, + "learning_rate": 6.763066271741682e-05, + "loss": 2.5521, + "step": 12139 + }, + { + "epoch": 0.979743362117666, + "grad_norm": 0.6951049566268921, + "learning_rate": 6.761572615750267e-05, + "loss": 2.4708, + "step": 12140 + }, + { + "epoch": 0.9798240658542491, + "grad_norm": 0.6914932727813721, + "learning_rate": 6.760079040468921e-05, + "loss": 2.5101, + "step": 12141 + }, + { + "epoch": 0.9799047695908321, + "grad_norm": 0.6843075752258301, + "learning_rate": 6.758585545934876e-05, + "loss": 2.4932, + "step": 12142 + }, + { + "epoch": 0.979985473327415, + "grad_norm": 0.6567733883857727, + "learning_rate": 6.757092132185354e-05, + "loss": 2.4577, + "step": 12143 + }, + { + "epoch": 0.980066177063998, + "grad_norm": 0.6874415874481201, + "learning_rate": 6.75559879925757e-05, + "loss": 2.4818, + "step": 12144 + }, + { + "epoch": 0.9801468808005811, + "grad_norm": 0.7274627685546875, + "learning_rate": 6.754105547188746e-05, + "loss": 2.523, + "step": 
12145 + }, + { + "epoch": 0.9802275845371641, + "grad_norm": 0.6991173028945923, + "learning_rate": 6.7526123760161e-05, + "loss": 2.4864, + "step": 12146 + }, + { + "epoch": 0.980308288273747, + "grad_norm": 0.670078456401825, + "learning_rate": 6.75111928577684e-05, + "loss": 2.4889, + "step": 12147 + }, + { + "epoch": 0.98038899201033, + "grad_norm": 0.6653482913970947, + "learning_rate": 6.749626276508178e-05, + "loss": 2.4652, + "step": 12148 + }, + { + "epoch": 0.9804696957469131, + "grad_norm": 0.7329251766204834, + "learning_rate": 6.748133348247326e-05, + "loss": 2.518, + "step": 12149 + }, + { + "epoch": 0.9805503994834961, + "grad_norm": 0.7792871594429016, + "learning_rate": 6.746640501031495e-05, + "loss": 2.5018, + "step": 12150 + }, + { + "epoch": 0.9806311032200791, + "grad_norm": 0.6962797045707703, + "learning_rate": 6.745147734897883e-05, + "loss": 2.4388, + "step": 12151 + }, + { + "epoch": 0.980711806956662, + "grad_norm": 0.6981272101402283, + "learning_rate": 6.7436550498837e-05, + "loss": 2.4886, + "step": 12152 + }, + { + "epoch": 0.9807925106932451, + "grad_norm": 0.6696565747261047, + "learning_rate": 6.742162446026146e-05, + "loss": 2.5258, + "step": 12153 + }, + { + "epoch": 0.9808732144298281, + "grad_norm": 0.6922139525413513, + "learning_rate": 6.740669923362417e-05, + "loss": 2.493, + "step": 12154 + }, + { + "epoch": 0.9809539181664111, + "grad_norm": 0.6745694875717163, + "learning_rate": 6.739177481929715e-05, + "loss": 2.5209, + "step": 12155 + }, + { + "epoch": 0.9810346219029941, + "grad_norm": 0.7023215889930725, + "learning_rate": 6.737685121765238e-05, + "loss": 2.4987, + "step": 12156 + }, + { + "epoch": 0.9811153256395772, + "grad_norm": 0.6337805390357971, + "learning_rate": 6.73619284290617e-05, + "loss": 2.4838, + "step": 12157 + }, + { + "epoch": 0.9811960293761601, + "grad_norm": 0.6747817397117615, + "learning_rate": 6.73470064538971e-05, + "loss": 2.4834, + "step": 12158 + }, + { + "epoch": 0.9812767331127431, + 
"grad_norm": 0.6714580655097961, + "learning_rate": 6.733208529253047e-05, + "loss": 2.4724, + "step": 12159 + }, + { + "epoch": 0.9813574368493261, + "grad_norm": 0.6927861571311951, + "learning_rate": 6.731716494533364e-05, + "loss": 2.495, + "step": 12160 + }, + { + "epoch": 0.9814381405859092, + "grad_norm": 0.6576036214828491, + "learning_rate": 6.73022454126785e-05, + "loss": 2.5415, + "step": 12161 + }, + { + "epoch": 0.9815188443224921, + "grad_norm": 0.6495294570922852, + "learning_rate": 6.728732669493691e-05, + "loss": 2.4889, + "step": 12162 + }, + { + "epoch": 0.9815995480590751, + "grad_norm": 0.6680364012718201, + "learning_rate": 6.72724087924806e-05, + "loss": 2.4733, + "step": 12163 + }, + { + "epoch": 0.9816802517956581, + "grad_norm": 0.6816582083702087, + "learning_rate": 6.725749170568143e-05, + "loss": 2.4688, + "step": 12164 + }, + { + "epoch": 0.9817609555322412, + "grad_norm": 0.6995956897735596, + "learning_rate": 6.724257543491116e-05, + "loss": 2.4962, + "step": 12165 + }, + { + "epoch": 0.9818416592688242, + "grad_norm": 0.6728340983390808, + "learning_rate": 6.722765998054157e-05, + "loss": 2.5218, + "step": 12166 + }, + { + "epoch": 0.9819223630054071, + "grad_norm": 0.6835319995880127, + "learning_rate": 6.721274534294433e-05, + "loss": 2.4845, + "step": 12167 + }, + { + "epoch": 0.9820030667419901, + "grad_norm": 0.6969910264015198, + "learning_rate": 6.719783152249119e-05, + "loss": 2.4983, + "step": 12168 + }, + { + "epoch": 0.9820837704785732, + "grad_norm": 0.7327036261558533, + "learning_rate": 6.718291851955383e-05, + "loss": 2.5893, + "step": 12169 + }, + { + "epoch": 0.9821644742151562, + "grad_norm": 0.7092839479446411, + "learning_rate": 6.716800633450393e-05, + "loss": 2.5104, + "step": 12170 + }, + { + "epoch": 0.9822451779517392, + "grad_norm": 0.7384308576583862, + "learning_rate": 6.715309496771311e-05, + "loss": 2.5066, + "step": 12171 + }, + { + "epoch": 0.9823258816883221, + "grad_norm": 0.6744845509529114, + 
"learning_rate": 6.713818441955308e-05, + "loss": 2.469, + "step": 12172 + }, + { + "epoch": 0.9824065854249052, + "grad_norm": 0.6497980952262878, + "learning_rate": 6.712327469039536e-05, + "loss": 2.4943, + "step": 12173 + }, + { + "epoch": 0.9824872891614882, + "grad_norm": 0.6550357937812805, + "learning_rate": 6.710836578061156e-05, + "loss": 2.5019, + "step": 12174 + }, + { + "epoch": 0.9825679928980712, + "grad_norm": 0.6813549995422363, + "learning_rate": 6.709345769057331e-05, + "loss": 2.4314, + "step": 12175 + }, + { + "epoch": 0.9826486966346542, + "grad_norm": 0.6636531352996826, + "learning_rate": 6.707855042065209e-05, + "loss": 2.5202, + "step": 12176 + }, + { + "epoch": 0.9827294003712372, + "grad_norm": 0.6684894561767578, + "learning_rate": 6.706364397121944e-05, + "loss": 2.4353, + "step": 12177 + }, + { + "epoch": 0.9828101041078202, + "grad_norm": 0.6813677549362183, + "learning_rate": 6.704873834264688e-05, + "loss": 2.4254, + "step": 12178 + }, + { + "epoch": 0.9828908078444032, + "grad_norm": 0.6584975719451904, + "learning_rate": 6.70338335353059e-05, + "loss": 2.5647, + "step": 12179 + }, + { + "epoch": 0.9829715115809862, + "grad_norm": 0.6959114074707031, + "learning_rate": 6.701892954956796e-05, + "loss": 2.5203, + "step": 12180 + }, + { + "epoch": 0.9830522153175693, + "grad_norm": 0.6399044990539551, + "learning_rate": 6.700402638580452e-05, + "loss": 2.4697, + "step": 12181 + }, + { + "epoch": 0.9831329190541522, + "grad_norm": 0.6838750839233398, + "learning_rate": 6.698912404438702e-05, + "loss": 2.5261, + "step": 12182 + }, + { + "epoch": 0.9832136227907352, + "grad_norm": 0.6286367177963257, + "learning_rate": 6.697422252568679e-05, + "loss": 2.4264, + "step": 12183 + }, + { + "epoch": 0.9832943265273182, + "grad_norm": 0.901637852191925, + "learning_rate": 6.695932183007528e-05, + "loss": 2.4908, + "step": 12184 + }, + { + "epoch": 0.9833750302639013, + "grad_norm": 0.8361458778381348, + "learning_rate": 6.694442195792386e-05, 
+ "loss": 2.5183, + "step": 12185 + }, + { + "epoch": 0.9834557340004842, + "grad_norm": 0.7033401727676392, + "learning_rate": 6.692952290960384e-05, + "loss": 2.5702, + "step": 12186 + }, + { + "epoch": 0.9835364377370672, + "grad_norm": 0.669486939907074, + "learning_rate": 6.691462468548653e-05, + "loss": 2.5143, + "step": 12187 + }, + { + "epoch": 0.9836171414736502, + "grad_norm": 0.7043797969818115, + "learning_rate": 6.689972728594329e-05, + "loss": 2.5638, + "step": 12188 + }, + { + "epoch": 0.9836978452102332, + "grad_norm": 0.6532511115074158, + "learning_rate": 6.688483071134537e-05, + "loss": 2.5227, + "step": 12189 + }, + { + "epoch": 0.9837785489468163, + "grad_norm": 0.7363922595977783, + "learning_rate": 6.6869934962064e-05, + "loss": 2.4953, + "step": 12190 + }, + { + "epoch": 0.9838592526833992, + "grad_norm": 0.6746651530265808, + "learning_rate": 6.685504003847051e-05, + "loss": 2.5021, + "step": 12191 + }, + { + "epoch": 0.9839399564199822, + "grad_norm": 0.665459930896759, + "learning_rate": 6.684014594093604e-05, + "loss": 2.5126, + "step": 12192 + }, + { + "epoch": 0.9840206601565652, + "grad_norm": 0.6618975400924683, + "learning_rate": 6.682525266983179e-05, + "loss": 2.5046, + "step": 12193 + }, + { + "epoch": 0.9841013638931483, + "grad_norm": 0.6536173224449158, + "learning_rate": 6.6810360225529e-05, + "loss": 2.4222, + "step": 12194 + }, + { + "epoch": 0.9841820676297313, + "grad_norm": 0.6882187724113464, + "learning_rate": 6.679546860839876e-05, + "loss": 2.475, + "step": 12195 + }, + { + "epoch": 0.9842627713663142, + "grad_norm": 0.6941187977790833, + "learning_rate": 6.678057781881224e-05, + "loss": 2.5642, + "step": 12196 + }, + { + "epoch": 0.9843434751028972, + "grad_norm": 0.7057064175605774, + "learning_rate": 6.676568785714057e-05, + "loss": 2.4817, + "step": 12197 + }, + { + "epoch": 0.9844241788394803, + "grad_norm": 0.6455948352813721, + "learning_rate": 6.675079872375487e-05, + "loss": 2.5206, + "step": 12198 + }, + { 
+ "epoch": 0.9845048825760633, + "grad_norm": 0.6559014320373535, + "learning_rate": 6.673591041902613e-05, + "loss": 2.4082, + "step": 12199 + }, + { + "epoch": 0.9845855863126463, + "grad_norm": 0.6732046008110046, + "learning_rate": 6.672102294332542e-05, + "loss": 2.5472, + "step": 12200 + }, + { + "epoch": 0.9846662900492292, + "grad_norm": 0.7074914574623108, + "learning_rate": 6.670613629702391e-05, + "loss": 2.5243, + "step": 12201 + }, + { + "epoch": 0.9847469937858123, + "grad_norm": 0.6780694127082825, + "learning_rate": 6.669125048049246e-05, + "loss": 2.494, + "step": 12202 + }, + { + "epoch": 0.9848276975223953, + "grad_norm": 0.6361132264137268, + "learning_rate": 6.66763654941021e-05, + "loss": 2.4764, + "step": 12203 + }, + { + "epoch": 0.9849084012589783, + "grad_norm": 0.752727210521698, + "learning_rate": 6.666148133822387e-05, + "loss": 2.4942, + "step": 12204 + }, + { + "epoch": 0.9849891049955612, + "grad_norm": 0.7282724976539612, + "learning_rate": 6.664659801322863e-05, + "loss": 2.471, + "step": 12205 + }, + { + "epoch": 0.9850698087321443, + "grad_norm": 0.6977601051330566, + "learning_rate": 6.663171551948736e-05, + "loss": 2.4695, + "step": 12206 + }, + { + "epoch": 0.9851505124687273, + "grad_norm": 0.6957824230194092, + "learning_rate": 6.661683385737101e-05, + "loss": 2.5096, + "step": 12207 + }, + { + "epoch": 0.9852312162053103, + "grad_norm": 0.6197221279144287, + "learning_rate": 6.660195302725037e-05, + "loss": 2.4199, + "step": 12208 + }, + { + "epoch": 0.9853119199418933, + "grad_norm": 0.747558057308197, + "learning_rate": 6.658707302949638e-05, + "loss": 2.5988, + "step": 12209 + }, + { + "epoch": 0.9853926236784764, + "grad_norm": 0.6593184471130371, + "learning_rate": 6.657219386447989e-05, + "loss": 2.4837, + "step": 12210 + }, + { + "epoch": 0.9854733274150593, + "grad_norm": 0.6795992255210876, + "learning_rate": 6.655731553257169e-05, + "loss": 2.498, + "step": 12211 + }, + { + "epoch": 0.9855540311516423, + 
"grad_norm": 0.7588422298431396, + "learning_rate": 6.65424380341426e-05, + "loss": 2.444, + "step": 12212 + }, + { + "epoch": 0.9856347348882253, + "grad_norm": 0.7791433930397034, + "learning_rate": 6.652756136956342e-05, + "loss": 2.4893, + "step": 12213 + }, + { + "epoch": 0.9857154386248084, + "grad_norm": 0.6320767998695374, + "learning_rate": 6.651268553920493e-05, + "loss": 2.4831, + "step": 12214 + }, + { + "epoch": 0.9857961423613913, + "grad_norm": 0.6818140745162964, + "learning_rate": 6.649781054343783e-05, + "loss": 2.4316, + "step": 12215 + }, + { + "epoch": 0.9858768460979743, + "grad_norm": 0.7460113763809204, + "learning_rate": 6.648293638263285e-05, + "loss": 2.5335, + "step": 12216 + }, + { + "epoch": 0.9859575498345573, + "grad_norm": 0.714074432849884, + "learning_rate": 6.646806305716079e-05, + "loss": 2.4573, + "step": 12217 + }, + { + "epoch": 0.9860382535711404, + "grad_norm": 0.6815951466560364, + "learning_rate": 6.645319056739217e-05, + "loss": 2.4758, + "step": 12218 + }, + { + "epoch": 0.9861189573077234, + "grad_norm": 0.6842799782752991, + "learning_rate": 6.643831891369775e-05, + "loss": 2.4998, + "step": 12219 + }, + { + "epoch": 0.9861996610443063, + "grad_norm": 0.6725212335586548, + "learning_rate": 6.642344809644818e-05, + "loss": 2.5179, + "step": 12220 + }, + { + "epoch": 0.9862803647808893, + "grad_norm": 0.7859417796134949, + "learning_rate": 6.640857811601402e-05, + "loss": 2.5801, + "step": 12221 + }, + { + "epoch": 0.9863610685174724, + "grad_norm": 0.6438577771186829, + "learning_rate": 6.639370897276591e-05, + "loss": 2.4659, + "step": 12222 + }, + { + "epoch": 0.9864417722540554, + "grad_norm": 0.7036609053611755, + "learning_rate": 6.637884066707447e-05, + "loss": 2.5637, + "step": 12223 + }, + { + "epoch": 0.9865224759906384, + "grad_norm": 0.6756969094276428, + "learning_rate": 6.636397319931016e-05, + "loss": 2.5381, + "step": 12224 + }, + { + "epoch": 0.9866031797272213, + "grad_norm": 0.6907589435577393, + 
"learning_rate": 6.634910656984354e-05, + "loss": 2.4927, + "step": 12225 + }, + { + "epoch": 0.9866838834638044, + "grad_norm": 0.7347010374069214, + "learning_rate": 6.63342407790452e-05, + "loss": 2.5131, + "step": 12226 + }, + { + "epoch": 0.9867645872003874, + "grad_norm": 0.6835876107215881, + "learning_rate": 6.631937582728555e-05, + "loss": 2.4611, + "step": 12227 + }, + { + "epoch": 0.9868452909369704, + "grad_norm": 0.8199172616004944, + "learning_rate": 6.630451171493511e-05, + "loss": 2.5341, + "step": 12228 + }, + { + "epoch": 0.9869259946735534, + "grad_norm": 0.7537188529968262, + "learning_rate": 6.62896484423643e-05, + "loss": 2.5218, + "step": 12229 + }, + { + "epoch": 0.9870066984101364, + "grad_norm": 0.7254310250282288, + "learning_rate": 6.62747860099436e-05, + "loss": 2.4766, + "step": 12230 + }, + { + "epoch": 0.9870874021467194, + "grad_norm": 0.6852995157241821, + "learning_rate": 6.625992441804338e-05, + "loss": 2.548, + "step": 12231 + }, + { + "epoch": 0.9871681058833024, + "grad_norm": 0.7089388966560364, + "learning_rate": 6.624506366703402e-05, + "loss": 2.5125, + "step": 12232 + }, + { + "epoch": 0.9872488096198854, + "grad_norm": 0.7114216685295105, + "learning_rate": 6.623020375728597e-05, + "loss": 2.5408, + "step": 12233 + }, + { + "epoch": 0.9873295133564685, + "grad_norm": 0.7891978025436401, + "learning_rate": 6.621534468916946e-05, + "loss": 2.5946, + "step": 12234 + }, + { + "epoch": 0.9874102170930514, + "grad_norm": 0.671399712562561, + "learning_rate": 6.620048646305488e-05, + "loss": 2.4732, + "step": 12235 + }, + { + "epoch": 0.9874909208296344, + "grad_norm": 0.6712855696678162, + "learning_rate": 6.618562907931256e-05, + "loss": 2.4376, + "step": 12236 + }, + { + "epoch": 0.9875716245662174, + "grad_norm": 0.7183727025985718, + "learning_rate": 6.617077253831272e-05, + "loss": 2.5406, + "step": 12237 + }, + { + "epoch": 0.9876523283028005, + "grad_norm": 0.6857761144638062, + "learning_rate": 6.615591684042568e-05, + 
"loss": 2.5279, + "step": 12238 + }, + { + "epoch": 0.9877330320393835, + "grad_norm": 0.7268103957176208, + "learning_rate": 6.614106198602165e-05, + "loss": 2.5283, + "step": 12239 + }, + { + "epoch": 0.9878137357759664, + "grad_norm": 0.6703717708587646, + "learning_rate": 6.612620797547087e-05, + "loss": 2.4254, + "step": 12240 + }, + { + "epoch": 0.9878944395125494, + "grad_norm": 0.7110719680786133, + "learning_rate": 6.611135480914352e-05, + "loss": 2.496, + "step": 12241 + }, + { + "epoch": 0.9879751432491324, + "grad_norm": 0.7268263697624207, + "learning_rate": 6.609650248740983e-05, + "loss": 2.5489, + "step": 12242 + }, + { + "epoch": 0.9880558469857155, + "grad_norm": 0.7413432598114014, + "learning_rate": 6.60816510106399e-05, + "loss": 2.4998, + "step": 12243 + }, + { + "epoch": 0.9881365507222984, + "grad_norm": 0.7443360090255737, + "learning_rate": 6.606680037920389e-05, + "loss": 2.5282, + "step": 12244 + }, + { + "epoch": 0.9882172544588814, + "grad_norm": 0.7787832021713257, + "learning_rate": 6.605195059347191e-05, + "loss": 2.5221, + "step": 12245 + }, + { + "epoch": 0.9882979581954644, + "grad_norm": 0.6921473741531372, + "learning_rate": 6.603710165381409e-05, + "loss": 2.5434, + "step": 12246 + }, + { + "epoch": 0.9883786619320475, + "grad_norm": 0.737328827381134, + "learning_rate": 6.602225356060044e-05, + "loss": 2.5222, + "step": 12247 + }, + { + "epoch": 0.9884593656686305, + "grad_norm": 0.698823094367981, + "learning_rate": 6.600740631420106e-05, + "loss": 2.528, + "step": 12248 + }, + { + "epoch": 0.9885400694052134, + "grad_norm": 0.6735067963600159, + "learning_rate": 6.599255991498601e-05, + "loss": 2.4942, + "step": 12249 + }, + { + "epoch": 0.9886207731417964, + "grad_norm": 0.659622311592102, + "learning_rate": 6.59777143633252e-05, + "loss": 2.4822, + "step": 12250 + }, + { + "epoch": 0.9887014768783795, + "grad_norm": 0.6973726153373718, + "learning_rate": 6.596286965958872e-05, + "loss": 2.5499, + "step": 12251 + }, + { + 
"epoch": 0.9887821806149625, + "grad_norm": 0.6771909594535828, + "learning_rate": 6.594802580414651e-05, + "loss": 2.4968, + "step": 12252 + }, + { + "epoch": 0.9888628843515455, + "grad_norm": 0.68080073595047, + "learning_rate": 6.593318279736849e-05, + "loss": 2.5142, + "step": 12253 + }, + { + "epoch": 0.9889435880881284, + "grad_norm": NaN, + "learning_rate": 6.593318279736849e-05, + "loss": 2.466, + "step": 12254 + }, + { + "epoch": 0.9890242918247115, + "grad_norm": 0.6865221858024597, + "learning_rate": 6.591834063962461e-05, + "loss": 2.4894, + "step": 12255 + }, + { + "epoch": 0.9891049955612945, + "grad_norm": 0.7050445079803467, + "learning_rate": 6.590349933128478e-05, + "loss": 2.5733, + "step": 12256 + }, + { + "epoch": 0.9891856992978775, + "grad_norm": 0.6971526741981506, + "learning_rate": 6.588865887271887e-05, + "loss": 2.4997, + "step": 12257 + }, + { + "epoch": 0.9892664030344605, + "grad_norm": 0.6465088725090027, + "learning_rate": 6.587381926429674e-05, + "loss": 2.5155, + "step": 12258 + }, + { + "epoch": 0.9893471067710435, + "grad_norm": 0.6521422266960144, + "learning_rate": 6.585898050638823e-05, + "loss": 2.4803, + "step": 12259 + }, + { + "epoch": 0.9894278105076265, + "grad_norm": 0.6798849105834961, + "learning_rate": 6.584414259936324e-05, + "loss": 2.5301, + "step": 12260 + }, + { + "epoch": 0.9895085142442095, + "grad_norm": 0.6903446912765503, + "learning_rate": 6.582930554359144e-05, + "loss": 2.4662, + "step": 12261 + }, + { + "epoch": 0.9895892179807925, + "grad_norm": 0.7183516621589661, + "learning_rate": 6.581446933944267e-05, + "loss": 2.4711, + "step": 12262 + }, + { + "epoch": 0.9896699217173756, + "grad_norm": 0.702738344669342, + "learning_rate": 6.579963398728671e-05, + "loss": 2.531, + "step": 12263 + }, + { + "epoch": 0.9897506254539585, + "grad_norm": 0.7187048196792603, + "learning_rate": 6.578479948749325e-05, + "loss": 2.4933, + "step": 12264 + }, + { + "epoch": 0.9898313291905415, + "grad_norm": 
0.6988784670829773, + "learning_rate": 6.576996584043202e-05, + "loss": 2.5179, + "step": 12265 + }, + { + "epoch": 0.9899120329271245, + "grad_norm": 0.7434641122817993, + "learning_rate": 6.575513304647276e-05, + "loss": 2.5157, + "step": 12266 + }, + { + "epoch": 0.9899927366637076, + "grad_norm": 0.667881429195404, + "learning_rate": 6.574030110598505e-05, + "loss": 2.5152, + "step": 12267 + }, + { + "epoch": 0.9900734404002905, + "grad_norm": 0.6766676902770996, + "learning_rate": 6.572547001933862e-05, + "loss": 2.5041, + "step": 12268 + }, + { + "epoch": 0.9901541441368735, + "grad_norm": 0.6531797051429749, + "learning_rate": 6.571063978690311e-05, + "loss": 2.5457, + "step": 12269 + }, + { + "epoch": 0.9902348478734565, + "grad_norm": 0.6557255983352661, + "learning_rate": 6.569581040904804e-05, + "loss": 2.5253, + "step": 12270 + }, + { + "epoch": 0.9903155516100396, + "grad_norm": 0.6818893551826477, + "learning_rate": 6.568098188614304e-05, + "loss": 2.5031, + "step": 12271 + }, + { + "epoch": 0.9903962553466226, + "grad_norm": 0.6644853949546814, + "learning_rate": 6.56661542185577e-05, + "loss": 2.5285, + "step": 12272 + }, + { + "epoch": 0.9904769590832055, + "grad_norm": 0.6035603284835815, + "learning_rate": 6.565132740666155e-05, + "loss": 2.46, + "step": 12273 + }, + { + "epoch": 0.9905576628197885, + "grad_norm": 0.7061343193054199, + "learning_rate": 6.56365014508241e-05, + "loss": 2.4731, + "step": 12274 + }, + { + "epoch": 0.9906383665563716, + "grad_norm": 0.6981248259544373, + "learning_rate": 6.562167635141486e-05, + "loss": 2.4518, + "step": 12275 + }, + { + "epoch": 0.9907190702929546, + "grad_norm": 0.6718073487281799, + "learning_rate": 6.560685210880334e-05, + "loss": 2.4919, + "step": 12276 + }, + { + "epoch": 0.9907997740295376, + "grad_norm": 0.7095392942428589, + "learning_rate": 6.559202872335893e-05, + "loss": 2.5284, + "step": 12277 + }, + { + "epoch": 0.9908804777661205, + "grad_norm": 0.7052092552185059, + "learning_rate": 
6.557720619545111e-05, + "loss": 2.4781, + "step": 12278 + }, + { + "epoch": 0.9909611815027036, + "grad_norm": 0.653570830821991, + "learning_rate": 6.556238452544934e-05, + "loss": 2.5293, + "step": 12279 + }, + { + "epoch": 0.9910418852392866, + "grad_norm": 0.6705330610275269, + "learning_rate": 6.554756371372293e-05, + "loss": 2.4437, + "step": 12280 + }, + { + "epoch": 0.9911225889758696, + "grad_norm": 0.6494189500808716, + "learning_rate": 6.553274376064127e-05, + "loss": 2.4833, + "step": 12281 + }, + { + "epoch": 0.9912032927124526, + "grad_norm": 0.6497724652290344, + "learning_rate": 6.551792466657378e-05, + "loss": 2.4803, + "step": 12282 + }, + { + "epoch": 0.9912839964490356, + "grad_norm": 0.7740494608879089, + "learning_rate": 6.550310643188972e-05, + "loss": 2.4907, + "step": 12283 + }, + { + "epoch": 0.9913647001856186, + "grad_norm": 0.699562668800354, + "learning_rate": 6.548828905695843e-05, + "loss": 2.4576, + "step": 12284 + }, + { + "epoch": 0.9914454039222016, + "grad_norm": 0.8123162984848022, + "learning_rate": 6.547347254214921e-05, + "loss": 2.5118, + "step": 12285 + }, + { + "epoch": 0.9915261076587846, + "grad_norm": 0.7227715253829956, + "learning_rate": 6.545865688783129e-05, + "loss": 2.4688, + "step": 12286 + }, + { + "epoch": 0.9916068113953677, + "grad_norm": 0.6498493552207947, + "learning_rate": 6.544384209437392e-05, + "loss": 2.477, + "step": 12287 + }, + { + "epoch": 0.9916875151319506, + "grad_norm": 0.6427823901176453, + "learning_rate": 6.542902816214636e-05, + "loss": 2.4388, + "step": 12288 + }, + { + "epoch": 0.9917682188685336, + "grad_norm": 0.6803679466247559, + "learning_rate": 6.541421509151778e-05, + "loss": 2.5095, + "step": 12289 + }, + { + "epoch": 0.9918489226051166, + "grad_norm": 0.7025790810585022, + "learning_rate": 6.539940288285734e-05, + "loss": 2.4881, + "step": 12290 + }, + { + "epoch": 0.9919296263416996, + "grad_norm": 0.6899270415306091, + "learning_rate": 6.538459153653424e-05, + "loss": 2.486, 
+ "step": 12291 + }, + { + "epoch": 0.9920103300782827, + "grad_norm": 0.7379609942436218, + "learning_rate": 6.536978105291762e-05, + "loss": 2.5368, + "step": 12292 + }, + { + "epoch": 0.9920910338148656, + "grad_norm": 0.7279202342033386, + "learning_rate": 6.535497143237657e-05, + "loss": 2.5275, + "step": 12293 + }, + { + "epoch": 0.9921717375514486, + "grad_norm": 0.6810527443885803, + "learning_rate": 6.53401626752802e-05, + "loss": 2.5053, + "step": 12294 + }, + { + "epoch": 0.9922524412880316, + "grad_norm": 0.6578424572944641, + "learning_rate": 6.532535478199759e-05, + "loss": 2.5334, + "step": 12295 + }, + { + "epoch": 0.9923331450246147, + "grad_norm": 0.6819284558296204, + "learning_rate": 6.531054775289778e-05, + "loss": 2.4879, + "step": 12296 + }, + { + "epoch": 0.9924138487611976, + "grad_norm": 0.6524500846862793, + "learning_rate": 6.529574158834977e-05, + "loss": 2.5349, + "step": 12297 + }, + { + "epoch": 0.9924945524977806, + "grad_norm": 0.6853352785110474, + "learning_rate": 6.528093628872263e-05, + "loss": 2.4217, + "step": 12298 + }, + { + "epoch": 0.9925752562343636, + "grad_norm": 0.6731893420219421, + "learning_rate": 6.526613185438529e-05, + "loss": 2.4739, + "step": 12299 + }, + { + "epoch": 0.9926559599709467, + "grad_norm": 0.6515606641769409, + "learning_rate": 6.525132828570673e-05, + "loss": 2.5348, + "step": 12300 + }, + { + "epoch": 0.9927366637075297, + "grad_norm": 0.6819963455200195, + "learning_rate": 6.523652558305596e-05, + "loss": 2.5052, + "step": 12301 + }, + { + "epoch": 0.9928173674441126, + "grad_norm": 0.6521475911140442, + "learning_rate": 6.522172374680177e-05, + "loss": 2.5283, + "step": 12302 + }, + { + "epoch": 0.9928980711806956, + "grad_norm": 0.6488186717033386, + "learning_rate": 6.520692277731315e-05, + "loss": 2.4779, + "step": 12303 + }, + { + "epoch": 0.9929787749172787, + "grad_norm": 0.6509760022163391, + "learning_rate": 6.519212267495903e-05, + "loss": 2.5426, + "step": 12304 + }, + { + "epoch": 
0.9930594786538617, + "grad_norm": 0.621366560459137, + "learning_rate": 6.517732344010814e-05, + "loss": 2.4804, + "step": 12305 + }, + { + "epoch": 0.9931401823904447, + "grad_norm": 0.6907268166542053, + "learning_rate": 6.516252507312938e-05, + "loss": 2.4883, + "step": 12306 + }, + { + "epoch": 0.9932208861270276, + "grad_norm": 0.7739343643188477, + "learning_rate": 6.514772757439157e-05, + "loss": 2.481, + "step": 12307 + }, + { + "epoch": 0.9933015898636107, + "grad_norm": 0.6794601082801819, + "learning_rate": 6.513293094426352e-05, + "loss": 2.5244, + "step": 12308 + }, + { + "epoch": 0.9933822936001937, + "grad_norm": 0.7189902663230896, + "learning_rate": 6.511813518311394e-05, + "loss": 2.5221, + "step": 12309 + }, + { + "epoch": 0.9934629973367767, + "grad_norm": 0.733318030834198, + "learning_rate": 6.510334029131163e-05, + "loss": 2.521, + "step": 12310 + }, + { + "epoch": 0.9935437010733597, + "grad_norm": 0.7584299445152283, + "learning_rate": 6.508854626922531e-05, + "loss": 2.4962, + "step": 12311 + }, + { + "epoch": 0.9936244048099427, + "grad_norm": 0.6442410349845886, + "learning_rate": 6.507375311722366e-05, + "loss": 2.4775, + "step": 12312 + }, + { + "epoch": 0.9937051085465257, + "grad_norm": 0.6609243154525757, + "learning_rate": 6.505896083567536e-05, + "loss": 2.4706, + "step": 12313 + }, + { + "epoch": 0.9937858122831087, + "grad_norm": 0.6527631878852844, + "learning_rate": 6.504416942494914e-05, + "loss": 2.4612, + "step": 12314 + }, + { + "epoch": 0.9938665160196917, + "grad_norm": 0.6798218488693237, + "learning_rate": 6.502937888541357e-05, + "loss": 2.5502, + "step": 12315 + }, + { + "epoch": 0.9939472197562748, + "grad_norm": 0.6573790907859802, + "learning_rate": 6.501458921743728e-05, + "loss": 2.5598, + "step": 12316 + }, + { + "epoch": 0.9940279234928577, + "grad_norm": 0.6945913434028625, + "learning_rate": 6.49998004213889e-05, + "loss": 2.5323, + "step": 12317 + }, + { + "epoch": 0.9941086272294407, + "grad_norm": 
0.7609078288078308, + "learning_rate": 6.498501249763697e-05, + "loss": 2.5211, + "step": 12318 + }, + { + "epoch": 0.9941893309660237, + "grad_norm": 0.6878666281700134, + "learning_rate": 6.497022544655006e-05, + "loss": 2.5366, + "step": 12319 + }, + { + "epoch": 0.9942700347026068, + "grad_norm": 0.6675810813903809, + "learning_rate": 6.495543926849674e-05, + "loss": 2.512, + "step": 12320 + }, + { + "epoch": 0.9943507384391898, + "grad_norm": 0.7285950779914856, + "learning_rate": 6.494065396384544e-05, + "loss": 2.4741, + "step": 12321 + }, + { + "epoch": 0.9944314421757727, + "grad_norm": 0.6287158131599426, + "learning_rate": 6.49258695329647e-05, + "loss": 2.4824, + "step": 12322 + }, + { + "epoch": 0.9945121459123557, + "grad_norm": 0.6506727337837219, + "learning_rate": 6.491108597622296e-05, + "loss": 2.5126, + "step": 12323 + }, + { + "epoch": 0.9945928496489388, + "grad_norm": 0.7679052352905273, + "learning_rate": 6.489630329398869e-05, + "loss": 2.5503, + "step": 12324 + }, + { + "epoch": 0.9946735533855218, + "grad_norm": 0.637184202671051, + "learning_rate": 6.488152148663029e-05, + "loss": 2.5098, + "step": 12325 + }, + { + "epoch": 0.9947542571221047, + "grad_norm": 0.6747186779975891, + "learning_rate": 6.486674055451619e-05, + "loss": 2.5154, + "step": 12326 + }, + { + "epoch": 0.9948349608586877, + "grad_norm": 0.7288245558738708, + "learning_rate": 6.485196049801476e-05, + "loss": 2.5077, + "step": 12327 + }, + { + "epoch": 0.9949156645952708, + "grad_norm": 0.6914251446723938, + "learning_rate": 6.483718131749435e-05, + "loss": 2.4877, + "step": 12328 + }, + { + "epoch": 0.9949963683318538, + "grad_norm": 0.7224392294883728, + "learning_rate": 6.48224030133233e-05, + "loss": 2.4862, + "step": 12329 + }, + { + "epoch": 0.9950770720684368, + "grad_norm": 0.7365561723709106, + "learning_rate": 6.480762558586995e-05, + "loss": 2.477, + "step": 12330 + }, + { + "epoch": 0.9951577758050197, + "grad_norm": 0.7673236131668091, + "learning_rate": 
6.47928490355025e-05, + "loss": 2.5423, + "step": 12331 + }, + { + "epoch": 0.9952384795416028, + "grad_norm": 0.6638002395629883, + "learning_rate": 6.477807336258931e-05, + "loss": 2.5007, + "step": 12332 + }, + { + "epoch": 0.9953191832781858, + "grad_norm": 0.6415974497795105, + "learning_rate": 6.476329856749864e-05, + "loss": 2.4924, + "step": 12333 + }, + { + "epoch": 0.9953998870147688, + "grad_norm": 0.7129398584365845, + "learning_rate": 6.474852465059864e-05, + "loss": 2.5313, + "step": 12334 + }, + { + "epoch": 0.9954805907513518, + "grad_norm": 0.6896344423294067, + "learning_rate": 6.473375161225756e-05, + "loss": 2.5073, + "step": 12335 + }, + { + "epoch": 0.9955612944879348, + "grad_norm": 0.7009317874908447, + "learning_rate": 6.47189794528436e-05, + "loss": 2.574, + "step": 12336 + }, + { + "epoch": 0.9956419982245178, + "grad_norm": 0.6555172801017761, + "learning_rate": 6.470420817272488e-05, + "loss": 2.4769, + "step": 12337 + }, + { + "epoch": 0.9957227019611008, + "grad_norm": 0.7569532990455627, + "learning_rate": 6.468943777226954e-05, + "loss": 2.4691, + "step": 12338 + }, + { + "epoch": 0.9958034056976838, + "grad_norm": 0.68092280626297, + "learning_rate": 6.467466825184569e-05, + "loss": 2.4793, + "step": 12339 + }, + { + "epoch": 0.9958841094342669, + "grad_norm": 0.6977378726005554, + "learning_rate": 6.465989961182152e-05, + "loss": 2.4678, + "step": 12340 + }, + { + "epoch": 0.9959648131708498, + "grad_norm": 0.6702281832695007, + "learning_rate": 6.4645131852565e-05, + "loss": 2.5398, + "step": 12341 + }, + { + "epoch": 0.9960455169074328, + "grad_norm": 0.7584038972854614, + "learning_rate": 6.46303649744442e-05, + "loss": 2.5355, + "step": 12342 + }, + { + "epoch": 0.9961262206440158, + "grad_norm": 0.6779505610466003, + "learning_rate": 6.461559897782718e-05, + "loss": 2.4828, + "step": 12343 + }, + { + "epoch": 0.9962069243805988, + "grad_norm": 0.6968233585357666, + "learning_rate": 6.460083386308192e-05, + "loss": 2.5108, + 
"step": 12344 + }, + { + "epoch": 0.9962876281171819, + "grad_norm": 0.7114594578742981, + "learning_rate": 6.45860696305764e-05, + "loss": 2.5236, + "step": 12345 + }, + { + "epoch": 0.9963683318537648, + "grad_norm": 0.6850530505180359, + "learning_rate": 6.457130628067865e-05, + "loss": 2.458, + "step": 12346 + }, + { + "epoch": 0.9964490355903478, + "grad_norm": 0.7135400772094727, + "learning_rate": 6.455654381375651e-05, + "loss": 2.539, + "step": 12347 + }, + { + "epoch": 0.9965297393269308, + "grad_norm": 0.6736366748809814, + "learning_rate": 6.454178223017797e-05, + "loss": 2.4721, + "step": 12348 + }, + { + "epoch": 0.9966104430635139, + "grad_norm": 0.6806206107139587, + "learning_rate": 6.45270215303109e-05, + "loss": 2.5035, + "step": 12349 + }, + { + "epoch": 0.9966911468000968, + "grad_norm": 0.7120711803436279, + "learning_rate": 6.451226171452318e-05, + "loss": 2.5344, + "step": 12350 + }, + { + "epoch": 0.9967718505366798, + "grad_norm": 0.6865986585617065, + "learning_rate": 6.449750278318264e-05, + "loss": 2.4807, + "step": 12351 + }, + { + "epoch": 0.9968525542732628, + "grad_norm": 0.6461294889450073, + "learning_rate": 6.448274473665717e-05, + "loss": 2.4878, + "step": 12352 + }, + { + "epoch": 0.9969332580098459, + "grad_norm": 0.7090638279914856, + "learning_rate": 6.446798757531454e-05, + "loss": 2.4599, + "step": 12353 + }, + { + "epoch": 0.9970139617464289, + "grad_norm": 0.6933324337005615, + "learning_rate": 6.445323129952252e-05, + "loss": 2.5398, + "step": 12354 + }, + { + "epoch": 0.9970946654830118, + "grad_norm": 0.7018197774887085, + "learning_rate": 6.443847590964888e-05, + "loss": 2.5159, + "step": 12355 + }, + { + "epoch": 0.9971753692195948, + "grad_norm": 0.7292604446411133, + "learning_rate": 6.442372140606145e-05, + "loss": 2.4934, + "step": 12356 + }, + { + "epoch": 0.9972560729561779, + "grad_norm": 0.6686378121376038, + "learning_rate": 6.440896778912783e-05, + "loss": 2.5076, + "step": 12357 + }, + { + "epoch": 
0.9973367766927609, + "grad_norm": 0.7194764018058777, + "learning_rate": 6.439421505921576e-05, + "loss": 2.4958, + "step": 12358 + }, + { + "epoch": 0.9974174804293439, + "grad_norm": 0.662467360496521, + "learning_rate": 6.437946321669296e-05, + "loss": 2.5202, + "step": 12359 + }, + { + "epoch": 0.9974981841659268, + "grad_norm": 0.7222515940666199, + "learning_rate": 6.436471226192703e-05, + "loss": 2.5058, + "step": 12360 + }, + { + "epoch": 0.9975788879025099, + "grad_norm": 0.6354855895042419, + "learning_rate": 6.434996219528562e-05, + "loss": 2.4849, + "step": 12361 + }, + { + "epoch": 0.9976595916390929, + "grad_norm": 0.7689539790153503, + "learning_rate": 6.433521301713636e-05, + "loss": 2.4959, + "step": 12362 + }, + { + "epoch": 0.9977402953756759, + "grad_norm": 0.6894338130950928, + "learning_rate": 6.43204647278468e-05, + "loss": 2.5098, + "step": 12363 + }, + { + "epoch": 0.9978209991122589, + "grad_norm": 0.7694165110588074, + "learning_rate": 6.430571732778451e-05, + "loss": 2.513, + "step": 12364 + }, + { + "epoch": 0.9979017028488419, + "grad_norm": 0.6512044668197632, + "learning_rate": 6.42909708173171e-05, + "loss": 2.4785, + "step": 12365 + }, + { + "epoch": 0.9979824065854249, + "grad_norm": 0.6605672836303711, + "learning_rate": 6.427622519681201e-05, + "loss": 2.4804, + "step": 12366 + }, + { + "epoch": 0.9980631103220079, + "grad_norm": 0.7123624086380005, + "learning_rate": 6.426148046663677e-05, + "loss": 2.4854, + "step": 12367 + }, + { + "epoch": 0.9981438140585909, + "grad_norm": 0.662645697593689, + "learning_rate": 6.424673662715886e-05, + "loss": 2.5314, + "step": 12368 + }, + { + "epoch": 0.998224517795174, + "grad_norm": 0.6482149362564087, + "learning_rate": 6.423199367874573e-05, + "loss": 2.4492, + "step": 12369 + }, + { + "epoch": 0.9983052215317569, + "grad_norm": 0.6545752286911011, + "learning_rate": 6.421725162176482e-05, + "loss": 2.5042, + "step": 12370 + }, + { + "epoch": 0.9983859252683399, + "grad_norm": 
0.6698874235153198, + "learning_rate": 6.420251045658353e-05, + "loss": 2.4523, + "step": 12371 + }, + { + "epoch": 0.9984666290049229, + "grad_norm": 0.6961477398872375, + "learning_rate": 6.418777018356929e-05, + "loss": 2.556, + "step": 12372 + }, + { + "epoch": 0.998547332741506, + "grad_norm": 0.67090904712677, + "learning_rate": 6.41730308030894e-05, + "loss": 2.5237, + "step": 12373 + }, + { + "epoch": 0.998628036478089, + "grad_norm": 0.6828685402870178, + "learning_rate": 6.415829231551124e-05, + "loss": 2.453, + "step": 12374 + }, + { + "epoch": 0.9987087402146719, + "grad_norm": 0.6699565649032593, + "learning_rate": 6.414355472120213e-05, + "loss": 2.4632, + "step": 12375 + }, + { + "epoch": 0.9987894439512549, + "grad_norm": 0.6918730735778809, + "learning_rate": 6.412881802052936e-05, + "loss": 2.4532, + "step": 12376 + }, + { + "epoch": 0.998870147687838, + "grad_norm": 0.7222442030906677, + "learning_rate": 6.411408221386021e-05, + "loss": 2.5113, + "step": 12377 + }, + { + "epoch": 0.998950851424421, + "grad_norm": 0.7479627132415771, + "learning_rate": 6.409934730156195e-05, + "loss": 2.4857, + "step": 12378 + }, + { + "epoch": 0.999031555161004, + "grad_norm": 0.6552882194519043, + "learning_rate": 6.40846132840018e-05, + "loss": 2.4816, + "step": 12379 + }, + { + "epoch": 0.9991122588975869, + "grad_norm": 0.5990073084831238, + "learning_rate": 6.406988016154694e-05, + "loss": 2.4753, + "step": 12380 + }, + { + "epoch": 0.99919296263417, + "grad_norm": 0.6671901941299438, + "learning_rate": 6.405514793456465e-05, + "loss": 2.5298, + "step": 12381 + }, + { + "epoch": 0.999273666370753, + "grad_norm": 0.6630427241325378, + "learning_rate": 6.4040416603422e-05, + "loss": 2.485, + "step": 12382 + }, + { + "epoch": 0.999354370107336, + "grad_norm": 0.6873636841773987, + "learning_rate": 6.402568616848614e-05, + "loss": 2.4902, + "step": 12383 + }, + { + "epoch": 0.9994350738439189, + "grad_norm": 0.6912413239479065, + "learning_rate": 
6.401095663012424e-05, + "loss": 2.5339, + "step": 12384 + }, + { + "epoch": 0.999515777580502, + "grad_norm": 0.6491912603378296, + "learning_rate": 6.39962279887034e-05, + "loss": 2.5367, + "step": 12385 + }, + { + "epoch": 0.999596481317085, + "grad_norm": 0.6668288111686707, + "learning_rate": 6.398150024459065e-05, + "loss": 2.5294, + "step": 12386 + }, + { + "epoch": 0.999677185053668, + "grad_norm": 0.6603856086730957, + "learning_rate": 6.396677339815306e-05, + "loss": 2.4378, + "step": 12387 + }, + { + "epoch": 0.999757888790251, + "grad_norm": 0.6461218595504761, + "learning_rate": 6.395204744975772e-05, + "loss": 2.4835, + "step": 12388 + }, + { + "epoch": 0.999838592526834, + "grad_norm": 0.6621688604354858, + "learning_rate": 6.39373223997715e-05, + "loss": 2.4834, + "step": 12389 + }, + { + "epoch": 0.999919296263417, + "grad_norm": 0.6758724451065063, + "learning_rate": 6.392259824856153e-05, + "loss": 2.4549, + "step": 12390 + }, + { + "epoch": 1.0, + "grad_norm": 1.1304112672805786, + "learning_rate": 6.390787499649473e-05, + "loss": 2.5547, + "step": 12391 + }, + { + "epoch": 1.000080703736583, + "grad_norm": 0.6919478178024292, + "learning_rate": 6.389315264393801e-05, + "loss": 2.47, + "step": 12392 + }, + { + "epoch": 1.000161407473166, + "grad_norm": 0.6916815638542175, + "learning_rate": 6.38784311912583e-05, + "loss": 2.4636, + "step": 12393 + }, + { + "epoch": 1.000242111209749, + "grad_norm": 0.6627040505409241, + "learning_rate": 6.386371063882252e-05, + "loss": 2.5094, + "step": 12394 + }, + { + "epoch": 1.000322814946332, + "grad_norm": 0.6408648490905762, + "learning_rate": 6.384899098699754e-05, + "loss": 2.426, + "step": 12395 + }, + { + "epoch": 1.000403518682915, + "grad_norm": 0.70432448387146, + "learning_rate": 6.38342722361502e-05, + "loss": 2.4861, + "step": 12396 + }, + { + "epoch": 1.000484222419498, + "grad_norm": 0.7115964889526367, + "learning_rate": 6.381955438664735e-05, + "loss": 2.4824, + "step": 12397 + }, + { + 
"epoch": 1.000564926156081, + "grad_norm": 0.6547040939331055, + "learning_rate": 6.380483743885574e-05, + "loss": 2.488, + "step": 12398 + }, + { + "epoch": 1.000645629892664, + "grad_norm": 0.6916625499725342, + "learning_rate": 6.379012139314223e-05, + "loss": 2.4864, + "step": 12399 + }, + { + "epoch": 1.0007263336292471, + "grad_norm": 0.6311133503913879, + "learning_rate": 6.377540624987352e-05, + "loss": 2.4672, + "step": 12400 + }, + { + "epoch": 1.00080703736583, + "grad_norm": 0.7115580439567566, + "learning_rate": 6.376069200941642e-05, + "loss": 2.4359, + "step": 12401 + }, + { + "epoch": 1.000887741102413, + "grad_norm": 0.6734051704406738, + "learning_rate": 6.374597867213756e-05, + "loss": 2.4896, + "step": 12402 + }, + { + "epoch": 1.000968444838996, + "grad_norm": 0.6910715699195862, + "learning_rate": 6.373126623840368e-05, + "loss": 2.4502, + "step": 12403 + }, + { + "epoch": 1.001049148575579, + "grad_norm": 0.6807514429092407, + "learning_rate": 6.37165547085815e-05, + "loss": 2.4791, + "step": 12404 + }, + { + "epoch": 1.0011298523121621, + "grad_norm": 0.679350733757019, + "learning_rate": 6.370184408303759e-05, + "loss": 2.4758, + "step": 12405 + }, + { + "epoch": 1.001210556048745, + "grad_norm": 0.6516300439834595, + "learning_rate": 6.36871343621386e-05, + "loss": 2.4338, + "step": 12406 + }, + { + "epoch": 1.001291259785328, + "grad_norm": 0.7033620476722717, + "learning_rate": 6.367242554625119e-05, + "loss": 2.429, + "step": 12407 + }, + { + "epoch": 1.0013719635219112, + "grad_norm": 0.6750274896621704, + "learning_rate": 6.365771763574186e-05, + "loss": 2.4283, + "step": 12408 + }, + { + "epoch": 1.001452667258494, + "grad_norm": 0.7188721895217896, + "learning_rate": 6.364301063097722e-05, + "loss": 2.4509, + "step": 12409 + }, + { + "epoch": 1.001533370995077, + "grad_norm": 0.6936308741569519, + "learning_rate": 6.362830453232379e-05, + "loss": 2.4469, + "step": 12410 + }, + { + "epoch": 1.00161407473166, + "grad_norm": 
0.673060953617096, + "learning_rate": 6.361359934014808e-05, + "loss": 2.4444, + "step": 12411 + }, + { + "epoch": 1.001694778468243, + "grad_norm": 0.7465113997459412, + "learning_rate": 6.359889505481658e-05, + "loss": 2.4376, + "step": 12412 + }, + { + "epoch": 1.0017754822048262, + "grad_norm": 0.7180366516113281, + "learning_rate": 6.358419167669582e-05, + "loss": 2.4223, + "step": 12413 + }, + { + "epoch": 1.001856185941409, + "grad_norm": 0.6582302451133728, + "learning_rate": 6.356948920615214e-05, + "loss": 2.4723, + "step": 12414 + }, + { + "epoch": 1.001936889677992, + "grad_norm": 0.6452654600143433, + "learning_rate": 6.3554787643552e-05, + "loss": 2.4609, + "step": 12415 + }, + { + "epoch": 1.0020175934145752, + "grad_norm": 0.7170321345329285, + "learning_rate": 6.354008698926185e-05, + "loss": 2.5377, + "step": 12416 + }, + { + "epoch": 1.002098297151158, + "grad_norm": 0.6483680605888367, + "learning_rate": 6.352538724364809e-05, + "loss": 2.4349, + "step": 12417 + }, + { + "epoch": 1.0021790008877411, + "grad_norm": 0.6567494869232178, + "learning_rate": 6.351068840707697e-05, + "loss": 2.4421, + "step": 12418 + }, + { + "epoch": 1.002259704624324, + "grad_norm": 0.7498565912246704, + "learning_rate": 6.349599047991488e-05, + "loss": 2.4212, + "step": 12419 + }, + { + "epoch": 1.002340408360907, + "grad_norm": 0.6894906759262085, + "learning_rate": 6.348129346252816e-05, + "loss": 2.4356, + "step": 12420 + }, + { + "epoch": 1.0024211120974902, + "grad_norm": 0.657361626625061, + "learning_rate": 6.346659735528304e-05, + "loss": 2.4164, + "step": 12421 + }, + { + "epoch": 1.002501815834073, + "grad_norm": 0.6369211673736572, + "learning_rate": 6.345190215854581e-05, + "loss": 2.4229, + "step": 12422 + }, + { + "epoch": 1.0025825195706561, + "grad_norm": 0.7033721208572388, + "learning_rate": 6.343720787268277e-05, + "loss": 2.5052, + "step": 12423 + }, + { + "epoch": 1.0026632233072392, + "grad_norm": 0.7125518918037415, + "learning_rate": 
6.342251449806003e-05, + "loss": 2.514, + "step": 12424 + }, + { + "epoch": 1.002743927043822, + "grad_norm": 0.7355595827102661, + "learning_rate": 6.340782203504385e-05, + "loss": 2.4459, + "step": 12425 + }, + { + "epoch": 1.0028246307804052, + "grad_norm": 0.7244594693183899, + "learning_rate": 6.339313048400042e-05, + "loss": 2.452, + "step": 12426 + }, + { + "epoch": 1.002905334516988, + "grad_norm": 0.7112728357315063, + "learning_rate": 6.337843984529585e-05, + "loss": 2.4951, + "step": 12427 + }, + { + "epoch": 1.0029860382535711, + "grad_norm": 0.7235615849494934, + "learning_rate": 6.336375011929628e-05, + "loss": 2.4697, + "step": 12428 + }, + { + "epoch": 1.0030667419901542, + "grad_norm": 0.653865396976471, + "learning_rate": 6.334906130636784e-05, + "loss": 2.4804, + "step": 12429 + }, + { + "epoch": 1.003147445726737, + "grad_norm": 0.7845149636268616, + "learning_rate": 6.33343734068766e-05, + "loss": 2.5415, + "step": 12430 + }, + { + "epoch": 1.0032281494633202, + "grad_norm": 0.7356342077255249, + "learning_rate": 6.33196864211886e-05, + "loss": 2.5321, + "step": 12431 + }, + { + "epoch": 1.0033088531999033, + "grad_norm": 0.6828265190124512, + "learning_rate": 6.330500034966991e-05, + "loss": 2.3849, + "step": 12432 + }, + { + "epoch": 1.0033895569364861, + "grad_norm": 0.7226579189300537, + "learning_rate": 6.329031519268658e-05, + "loss": 2.512, + "step": 12433 + }, + { + "epoch": 1.0034702606730692, + "grad_norm": 0.6490235924720764, + "learning_rate": 6.327563095060449e-05, + "loss": 2.487, + "step": 12434 + }, + { + "epoch": 1.003550964409652, + "grad_norm": 0.6889309883117676, + "learning_rate": 6.326094762378969e-05, + "loss": 2.4677, + "step": 12435 + }, + { + "epoch": 1.0036316681462352, + "grad_norm": 0.695854127407074, + "learning_rate": 6.324626521260815e-05, + "loss": 2.4362, + "step": 12436 + }, + { + "epoch": 1.0037123718828183, + "grad_norm": 0.7045256495475769, + "learning_rate": 6.32315837174257e-05, + "loss": 2.4307, + 
"step": 12437 + }, + { + "epoch": 1.0037930756194011, + "grad_norm": 0.662604570388794, + "learning_rate": 6.321690313860833e-05, + "loss": 2.4271, + "step": 12438 + }, + { + "epoch": 1.0038737793559842, + "grad_norm": 0.7682240009307861, + "learning_rate": 6.320222347652191e-05, + "loss": 2.4617, + "step": 12439 + }, + { + "epoch": 1.0039544830925673, + "grad_norm": 0.6599584817886353, + "learning_rate": 6.318754473153221e-05, + "loss": 2.405, + "step": 12440 + }, + { + "epoch": 1.0040351868291502, + "grad_norm": 0.7423116564750671, + "learning_rate": 6.317286690400515e-05, + "loss": 2.5496, + "step": 12441 + }, + { + "epoch": 1.0041158905657332, + "grad_norm": 0.6928953528404236, + "learning_rate": 6.315818999430654e-05, + "loss": 2.4265, + "step": 12442 + }, + { + "epoch": 1.0041965943023161, + "grad_norm": 0.699990451335907, + "learning_rate": 6.314351400280211e-05, + "loss": 2.4747, + "step": 12443 + }, + { + "epoch": 1.0042772980388992, + "grad_norm": 0.673384964466095, + "learning_rate": 6.312883892985765e-05, + "loss": 2.4891, + "step": 12444 + }, + { + "epoch": 1.0043580017754823, + "grad_norm": 0.6668596863746643, + "learning_rate": 6.311416477583893e-05, + "loss": 2.4312, + "step": 12445 + }, + { + "epoch": 1.0044387055120652, + "grad_norm": 0.6931218504905701, + "learning_rate": 6.309949154111163e-05, + "loss": 2.4907, + "step": 12446 + }, + { + "epoch": 1.0045194092486482, + "grad_norm": 0.687683641910553, + "learning_rate": 6.308481922604146e-05, + "loss": 2.4302, + "step": 12447 + }, + { + "epoch": 1.004600112985231, + "grad_norm": 0.6887302398681641, + "learning_rate": 6.30701478309941e-05, + "loss": 2.4749, + "step": 12448 + }, + { + "epoch": 1.0046808167218142, + "grad_norm": 0.6713404655456543, + "learning_rate": 6.305547735633522e-05, + "loss": 2.5046, + "step": 12449 + }, + { + "epoch": 1.0047615204583973, + "grad_norm": 0.7147336006164551, + "learning_rate": 6.304080780243038e-05, + "loss": 2.4578, + "step": 12450 + }, + { + "epoch": 
1.0048422241949801, + "grad_norm": 0.87425297498703, + "learning_rate": 6.30261391696452e-05, + "loss": 2.4487, + "step": 12451 + }, + { + "epoch": 1.0049229279315632, + "grad_norm": 0.6641440987586975, + "learning_rate": 6.301147145834534e-05, + "loss": 2.4657, + "step": 12452 + }, + { + "epoch": 1.0050036316681463, + "grad_norm": 0.7311998009681702, + "learning_rate": 6.299680466889626e-05, + "loss": 2.4784, + "step": 12453 + }, + { + "epoch": 1.0050843354047292, + "grad_norm": 0.6722697615623474, + "learning_rate": 6.298213880166354e-05, + "loss": 2.4653, + "step": 12454 + }, + { + "epoch": 1.0051650391413123, + "grad_norm": 0.6886328458786011, + "learning_rate": 6.29674738570127e-05, + "loss": 2.3949, + "step": 12455 + }, + { + "epoch": 1.0052457428778951, + "grad_norm": 0.684688925743103, + "learning_rate": 6.295280983530921e-05, + "loss": 2.4334, + "step": 12456 + }, + { + "epoch": 1.0053264466144782, + "grad_norm": 0.7436798214912415, + "learning_rate": 6.293814673691853e-05, + "loss": 2.5316, + "step": 12457 + }, + { + "epoch": 1.0054071503510613, + "grad_norm": 0.7401304244995117, + "learning_rate": 6.292348456220615e-05, + "loss": 2.4556, + "step": 12458 + }, + { + "epoch": 1.0054878540876442, + "grad_norm": 0.7330329418182373, + "learning_rate": 6.290882331153742e-05, + "loss": 2.4321, + "step": 12459 + }, + { + "epoch": 1.0055685578242273, + "grad_norm": 0.8005052208900452, + "learning_rate": 6.289416298527776e-05, + "loss": 2.415, + "step": 12460 + }, + { + "epoch": 1.0056492615608104, + "grad_norm": 0.8047310709953308, + "learning_rate": 6.28795035837926e-05, + "loss": 2.4144, + "step": 12461 + }, + { + "epoch": 1.0057299652973932, + "grad_norm": 0.7384032011032104, + "learning_rate": 6.28648451074472e-05, + "loss": 2.5237, + "step": 12462 + }, + { + "epoch": 1.0058106690339763, + "grad_norm": 0.7240314483642578, + "learning_rate": 6.285018755660695e-05, + "loss": 2.4894, + "step": 12463 + }, + { + "epoch": 1.0058913727705592, + "grad_norm": 
0.6901080012321472, + "learning_rate": 6.283553093163712e-05, + "loss": 2.4244, + "step": 12464 + }, + { + "epoch": 1.0059720765071423, + "grad_norm": 0.6572268605232239, + "learning_rate": 6.282087523290304e-05, + "loss": 2.456, + "step": 12465 + }, + { + "epoch": 1.0060527802437254, + "grad_norm": 0.7207481861114502, + "learning_rate": 6.28062204607699e-05, + "loss": 2.4153, + "step": 12466 + }, + { + "epoch": 1.0061334839803082, + "grad_norm": 0.6901980042457581, + "learning_rate": 6.279156661560299e-05, + "loss": 2.4776, + "step": 12467 + }, + { + "epoch": 1.0062141877168913, + "grad_norm": 0.7003545761108398, + "learning_rate": 6.277691369776752e-05, + "loss": 2.4206, + "step": 12468 + }, + { + "epoch": 1.0062948914534744, + "grad_norm": 0.6978366374969482, + "learning_rate": 6.276226170762865e-05, + "loss": 2.3866, + "step": 12469 + }, + { + "epoch": 1.0063755951900573, + "grad_norm": 0.6763097643852234, + "learning_rate": 6.274761064555154e-05, + "loss": 2.5439, + "step": 12470 + }, + { + "epoch": 1.0064562989266403, + "grad_norm": 0.7146836519241333, + "learning_rate": 6.273296051190139e-05, + "loss": 2.5486, + "step": 12471 + }, + { + "epoch": 1.0065370026632232, + "grad_norm": 0.7448136806488037, + "learning_rate": 6.271831130704326e-05, + "loss": 2.4539, + "step": 12472 + }, + { + "epoch": 1.0066177063998063, + "grad_norm": 0.6918472051620483, + "learning_rate": 6.270366303134226e-05, + "loss": 2.4756, + "step": 12473 + }, + { + "epoch": 1.0066984101363894, + "grad_norm": 0.7067514657974243, + "learning_rate": 6.26890156851635e-05, + "loss": 2.4925, + "step": 12474 + }, + { + "epoch": 1.0067791138729723, + "grad_norm": 0.6517517566680908, + "learning_rate": 6.267436926887197e-05, + "loss": 2.4339, + "step": 12475 + }, + { + "epoch": 1.0068598176095553, + "grad_norm": 0.673367977142334, + "learning_rate": 6.265972378283274e-05, + "loss": 2.416, + "step": 12476 + }, + { + "epoch": 1.0069405213461384, + "grad_norm": 0.7190212607383728, + "learning_rate": 
6.26450792274108e-05, + "loss": 2.4822, + "step": 12477 + }, + { + "epoch": 1.0070212250827213, + "grad_norm": 0.7568029165267944, + "learning_rate": 6.263043560297112e-05, + "loss": 2.4607, + "step": 12478 + }, + { + "epoch": 1.0071019288193044, + "grad_norm": 0.6860609650611877, + "learning_rate": 6.261579290987866e-05, + "loss": 2.4429, + "step": 12479 + }, + { + "epoch": 1.0071826325558872, + "grad_norm": 0.7066059112548828, + "learning_rate": 6.260115114849839e-05, + "loss": 2.5504, + "step": 12480 + }, + { + "epoch": 1.0072633362924703, + "grad_norm": 0.6857946515083313, + "learning_rate": 6.25865103191952e-05, + "loss": 2.4776, + "step": 12481 + }, + { + "epoch": 1.0073440400290534, + "grad_norm": 0.6879859566688538, + "learning_rate": 6.257187042233396e-05, + "loss": 2.3651, + "step": 12482 + }, + { + "epoch": 1.0074247437656363, + "grad_norm": 0.6900867223739624, + "learning_rate": 6.255723145827954e-05, + "loss": 2.4644, + "step": 12483 + }, + { + "epoch": 1.0075054475022194, + "grad_norm": 0.7144716382026672, + "learning_rate": 6.254259342739683e-05, + "loss": 2.4219, + "step": 12484 + }, + { + "epoch": 1.0075861512388025, + "grad_norm": 0.674619197845459, + "learning_rate": 6.252795633005056e-05, + "loss": 2.5038, + "step": 12485 + }, + { + "epoch": 1.0076668549753853, + "grad_norm": 0.7036965489387512, + "learning_rate": 6.251332016660558e-05, + "loss": 2.4784, + "step": 12486 + }, + { + "epoch": 1.0077475587119684, + "grad_norm": 0.7046369910240173, + "learning_rate": 6.249868493742668e-05, + "loss": 2.514, + "step": 12487 + }, + { + "epoch": 1.0078282624485513, + "grad_norm": 0.6933087110519409, + "learning_rate": 6.248405064287854e-05, + "loss": 2.4855, + "step": 12488 + }, + { + "epoch": 1.0079089661851344, + "grad_norm": 0.7210546731948853, + "learning_rate": 6.246941728332594e-05, + "loss": 2.5101, + "step": 12489 + }, + { + "epoch": 1.0079896699217175, + "grad_norm": 0.6738288402557373, + "learning_rate": 6.245478485913361e-05, + "loss": 2.4891, 
+ "step": 12490 + }, + { + "epoch": 1.0080703736583003, + "grad_norm": 0.7023273706436157, + "learning_rate": 6.244015337066611e-05, + "loss": 2.4977, + "step": 12491 + }, + { + "epoch": 1.0081510773948834, + "grad_norm": 0.6761355996131897, + "learning_rate": 6.24255228182882e-05, + "loss": 2.4948, + "step": 12492 + }, + { + "epoch": 1.0082317811314665, + "grad_norm": 0.6427976489067078, + "learning_rate": 6.241089320236448e-05, + "loss": 2.466, + "step": 12493 + }, + { + "epoch": 1.0083124848680494, + "grad_norm": 0.6907719969749451, + "learning_rate": 6.23962645232596e-05, + "loss": 2.437, + "step": 12494 + }, + { + "epoch": 1.0083931886046325, + "grad_norm": 0.709032416343689, + "learning_rate": 6.238163678133807e-05, + "loss": 2.4298, + "step": 12495 + }, + { + "epoch": 1.0084738923412153, + "grad_norm": 0.7395734786987305, + "learning_rate": 6.236700997696448e-05, + "loss": 2.4502, + "step": 12496 + }, + { + "epoch": 1.0085545960777984, + "grad_norm": 0.6535435914993286, + "learning_rate": 6.23523841105034e-05, + "loss": 2.4494, + "step": 12497 + }, + { + "epoch": 1.0086352998143815, + "grad_norm": 0.6597761511802673, + "learning_rate": 6.23377591823193e-05, + "loss": 2.4377, + "step": 12498 + }, + { + "epoch": 1.0087160035509644, + "grad_norm": 0.6610515713691711, + "learning_rate": 6.232313519277668e-05, + "loss": 2.4328, + "step": 12499 + }, + { + "epoch": 1.0087967072875474, + "grad_norm": 0.6785424947738647, + "learning_rate": 6.230851214224009e-05, + "loss": 2.457, + "step": 12500 + }, + { + "epoch": 1.0088774110241303, + "grad_norm": 0.6939748525619507, + "learning_rate": 6.229389003107383e-05, + "loss": 2.383, + "step": 12501 + }, + { + "epoch": 1.0089581147607134, + "grad_norm": 0.7592256665229797, + "learning_rate": 6.22792688596424e-05, + "loss": 2.4665, + "step": 12502 + }, + { + "epoch": 1.0090388184972965, + "grad_norm": 0.6751298308372498, + "learning_rate": 6.226464862831023e-05, + "loss": 2.491, + "step": 12503 + }, + { + "epoch": 
1.0091195222338794, + "grad_norm": 0.682771623134613, + "learning_rate": 6.225002933744164e-05, + "loss": 2.4275, + "step": 12504 + }, + { + "epoch": 1.0092002259704624, + "grad_norm": 0.7314651608467102, + "learning_rate": 6.223541098740098e-05, + "loss": 2.4489, + "step": 12505 + }, + { + "epoch": 1.0092809297070455, + "grad_norm": 0.7132120132446289, + "learning_rate": 6.222079357855261e-05, + "loss": 2.4819, + "step": 12506 + }, + { + "epoch": 1.0093616334436284, + "grad_norm": 0.6571424007415771, + "learning_rate": 6.220617711126082e-05, + "loss": 2.455, + "step": 12507 + }, + { + "epoch": 1.0094423371802115, + "grad_norm": 0.7675301432609558, + "learning_rate": 6.21915615858899e-05, + "loss": 2.5282, + "step": 12508 + }, + { + "epoch": 1.0095230409167943, + "grad_norm": 0.6907868385314941, + "learning_rate": 6.217694700280408e-05, + "loss": 2.4639, + "step": 12509 + }, + { + "epoch": 1.0096037446533774, + "grad_norm": 0.7223815321922302, + "learning_rate": 6.216233336236764e-05, + "loss": 2.4682, + "step": 12510 + }, + { + "epoch": 1.0096844483899605, + "grad_norm": 0.7325109839439392, + "learning_rate": 6.214772066494474e-05, + "loss": 2.4591, + "step": 12511 + }, + { + "epoch": 1.0097651521265434, + "grad_norm": 0.6589400768280029, + "learning_rate": 6.213310891089957e-05, + "loss": 2.4883, + "step": 12512 + }, + { + "epoch": 1.0098458558631265, + "grad_norm": 0.6692262291908264, + "learning_rate": 6.211849810059635e-05, + "loss": 2.4635, + "step": 12513 + }, + { + "epoch": 1.0099265595997096, + "grad_norm": 0.7352520823478699, + "learning_rate": 6.210388823439914e-05, + "loss": 2.4743, + "step": 12514 + }, + { + "epoch": 1.0100072633362924, + "grad_norm": 0.6631996035575867, + "learning_rate": 6.208927931267212e-05, + "loss": 2.4848, + "step": 12515 + }, + { + "epoch": 1.0100879670728755, + "grad_norm": 0.6985767483711243, + "learning_rate": 6.207467133577937e-05, + "loss": 2.5044, + "step": 12516 + }, + { + "epoch": 1.0101686708094584, + "grad_norm": 
0.665635347366333, + "learning_rate": 6.206006430408494e-05, + "loss": 2.4718, + "step": 12517 + }, + { + "epoch": 1.0102493745460415, + "grad_norm": 0.6859133243560791, + "learning_rate": 6.204545821795286e-05, + "loss": 2.4702, + "step": 12518 + }, + { + "epoch": 1.0103300782826246, + "grad_norm": 0.6578841805458069, + "learning_rate": 6.203085307774722e-05, + "loss": 2.4614, + "step": 12519 + }, + { + "epoch": 1.0104107820192074, + "grad_norm": 0.717523455619812, + "learning_rate": 6.201624888383194e-05, + "loss": 2.4412, + "step": 12520 + }, + { + "epoch": 1.0104914857557905, + "grad_norm": 0.7333831787109375, + "learning_rate": 6.200164563657103e-05, + "loss": 2.4157, + "step": 12521 + }, + { + "epoch": 1.0105721894923736, + "grad_norm": 0.6968720555305481, + "learning_rate": 6.198704333632845e-05, + "loss": 2.4556, + "step": 12522 + }, + { + "epoch": 1.0106528932289565, + "grad_norm": 0.6533070802688599, + "learning_rate": 6.19724419834681e-05, + "loss": 2.43, + "step": 12523 + }, + { + "epoch": 1.0107335969655395, + "grad_norm": 0.7341824769973755, + "learning_rate": 6.195784157835391e-05, + "loss": 2.5326, + "step": 12524 + }, + { + "epoch": 1.0108143007021224, + "grad_norm": 0.752912163734436, + "learning_rate": 6.194324212134974e-05, + "loss": 2.4282, + "step": 12525 + }, + { + "epoch": 1.0108950044387055, + "grad_norm": 0.6538611650466919, + "learning_rate": 6.192864361281951e-05, + "loss": 2.4135, + "step": 12526 + }, + { + "epoch": 1.0109757081752886, + "grad_norm": 0.6931454539299011, + "learning_rate": 6.191404605312695e-05, + "loss": 2.5097, + "step": 12527 + }, + { + "epoch": 1.0110564119118715, + "grad_norm": 0.6317688822746277, + "learning_rate": 6.18994494426359e-05, + "loss": 2.4977, + "step": 12528 + }, + { + "epoch": 1.0111371156484545, + "grad_norm": 0.6793715953826904, + "learning_rate": 6.188485378171024e-05, + "loss": 2.4619, + "step": 12529 + }, + { + "epoch": 1.0112178193850376, + "grad_norm": 0.6696654558181763, + "learning_rate": 
6.187025907071361e-05, + "loss": 2.4658, + "step": 12530 + }, + { + "epoch": 1.0112985231216205, + "grad_norm": 0.6788807511329651, + "learning_rate": 6.185566531000979e-05, + "loss": 2.4793, + "step": 12531 + }, + { + "epoch": 1.0113792268582036, + "grad_norm": 0.6933971643447876, + "learning_rate": 6.184107249996253e-05, + "loss": 2.4772, + "step": 12532 + }, + { + "epoch": 1.0114599305947864, + "grad_norm": 0.6866000294685364, + "learning_rate": 6.182648064093546e-05, + "loss": 2.428, + "step": 12533 + }, + { + "epoch": 1.0115406343313695, + "grad_norm": 0.7013841271400452, + "learning_rate": 6.181188973329229e-05, + "loss": 2.5273, + "step": 12534 + }, + { + "epoch": 1.0116213380679526, + "grad_norm": 0.6569108963012695, + "learning_rate": 6.179729977739669e-05, + "loss": 2.4125, + "step": 12535 + }, + { + "epoch": 1.0117020418045355, + "grad_norm": 0.7503486275672913, + "learning_rate": 6.17827107736122e-05, + "loss": 2.4385, + "step": 12536 + }, + { + "epoch": 1.0117827455411186, + "grad_norm": 0.6757314205169678, + "learning_rate": 6.176812272230246e-05, + "loss": 2.4364, + "step": 12537 + }, + { + "epoch": 1.0118634492777017, + "grad_norm": 0.6567254662513733, + "learning_rate": 6.175353562383106e-05, + "loss": 2.4992, + "step": 12538 + }, + { + "epoch": 1.0119441530142845, + "grad_norm": 0.7564988732337952, + "learning_rate": 6.17389494785615e-05, + "loss": 2.4777, + "step": 12539 + }, + { + "epoch": 1.0120248567508676, + "grad_norm": 0.6972391605377197, + "learning_rate": 6.172436428685735e-05, + "loss": 2.5041, + "step": 12540 + }, + { + "epoch": 1.0121055604874505, + "grad_norm": 0.6861580610275269, + "learning_rate": 6.170978004908209e-05, + "loss": 2.4684, + "step": 12541 + }, + { + "epoch": 1.0121862642240336, + "grad_norm": 0.6621903777122498, + "learning_rate": 6.169519676559921e-05, + "loss": 2.4614, + "step": 12542 + }, + { + "epoch": 1.0122669679606167, + "grad_norm": 0.6879795789718628, + "learning_rate": 6.168061443677215e-05, + "loss": 
2.4765, + "step": 12543 + }, + { + "epoch": 1.0123476716971995, + "grad_norm": 0.6361081004142761, + "learning_rate": 6.166603306296434e-05, + "loss": 2.4792, + "step": 12544 + }, + { + "epoch": 1.0124283754337826, + "grad_norm": 0.6660729050636292, + "learning_rate": 6.165145264453924e-05, + "loss": 2.489, + "step": 12545 + }, + { + "epoch": 1.0125090791703655, + "grad_norm": 0.6900594234466553, + "learning_rate": 6.163687318186015e-05, + "loss": 2.4543, + "step": 12546 + }, + { + "epoch": 1.0125897829069486, + "grad_norm": 0.7195869088172913, + "learning_rate": 6.162229467529046e-05, + "loss": 2.4137, + "step": 12547 + }, + { + "epoch": 1.0126704866435317, + "grad_norm": 0.7030326128005981, + "learning_rate": 6.16077171251935e-05, + "loss": 2.4657, + "step": 12548 + }, + { + "epoch": 1.0127511903801145, + "grad_norm": 0.6712052822113037, + "learning_rate": 6.15931405319326e-05, + "loss": 2.4718, + "step": 12549 + }, + { + "epoch": 1.0128318941166976, + "grad_norm": 0.7471029162406921, + "learning_rate": 6.157856489587102e-05, + "loss": 2.4705, + "step": 12550 + }, + { + "epoch": 1.0129125978532807, + "grad_norm": 0.6813762187957764, + "learning_rate": 6.15639902173721e-05, + "loss": 2.4479, + "step": 12551 + }, + { + "epoch": 1.0129933015898636, + "grad_norm": 0.6657249927520752, + "learning_rate": 6.154941649679894e-05, + "loss": 2.4911, + "step": 12552 + }, + { + "epoch": 1.0130740053264466, + "grad_norm": 0.6700132489204407, + "learning_rate": 6.153484373451483e-05, + "loss": 2.4962, + "step": 12553 + }, + { + "epoch": 1.0131547090630295, + "grad_norm": 0.7058695554733276, + "learning_rate": 6.152027193088302e-05, + "loss": 2.3935, + "step": 12554 + }, + { + "epoch": 1.0132354127996126, + "grad_norm": 0.7390396595001221, + "learning_rate": 6.150570108626658e-05, + "loss": 2.4454, + "step": 12555 + }, + { + "epoch": 1.0133161165361957, + "grad_norm": 0.7251414060592651, + "learning_rate": 6.149113120102869e-05, + "loss": 2.4146, + "step": 12556 + }, + { + 
"epoch": 1.0133968202727786, + "grad_norm": 0.8262537717819214, + "learning_rate": 6.14765622755325e-05, + "loss": 2.4638, + "step": 12557 + }, + { + "epoch": 1.0134775240093616, + "grad_norm": 0.7184064984321594, + "learning_rate": 6.146199431014106e-05, + "loss": 2.3958, + "step": 12558 + }, + { + "epoch": 1.0135582277459447, + "grad_norm": 0.7544865012168884, + "learning_rate": 6.144742730521746e-05, + "loss": 2.4662, + "step": 12559 + }, + { + "epoch": 1.0136389314825276, + "grad_norm": 0.6866207718849182, + "learning_rate": 6.143286126112475e-05, + "loss": 2.4951, + "step": 12560 + }, + { + "epoch": 1.0137196352191107, + "grad_norm": 0.6566087603569031, + "learning_rate": 6.1418296178226e-05, + "loss": 2.4002, + "step": 12561 + }, + { + "epoch": 1.0138003389556935, + "grad_norm": 0.6999008059501648, + "learning_rate": 6.140373205688411e-05, + "loss": 2.5306, + "step": 12562 + }, + { + "epoch": 1.0138810426922766, + "grad_norm": 0.6682353615760803, + "learning_rate": 6.138916889746212e-05, + "loss": 2.5565, + "step": 12563 + }, + { + "epoch": 1.0139617464288597, + "grad_norm": 0.7443362474441528, + "learning_rate": 6.137460670032298e-05, + "loss": 2.3958, + "step": 12564 + }, + { + "epoch": 1.0140424501654426, + "grad_norm": 0.6542403697967529, + "learning_rate": 6.136004546582958e-05, + "loss": 2.4394, + "step": 12565 + }, + { + "epoch": 1.0141231539020257, + "grad_norm": 0.6524317264556885, + "learning_rate": 6.134548519434488e-05, + "loss": 2.4979, + "step": 12566 + }, + { + "epoch": 1.0142038576386088, + "grad_norm": 0.6605600118637085, + "learning_rate": 6.133092588623174e-05, + "loss": 2.4827, + "step": 12567 + }, + { + "epoch": 1.0142845613751916, + "grad_norm": 0.7114397883415222, + "learning_rate": 6.1316367541853e-05, + "loss": 2.4799, + "step": 12568 + }, + { + "epoch": 1.0143652651117747, + "grad_norm": 0.6607296466827393, + "learning_rate": 6.130181016157148e-05, + "loss": 2.4991, + "step": 12569 + }, + { + "epoch": 1.0144459688483576, + 
"grad_norm": 0.6750844717025757, + "learning_rate": 6.128725374575005e-05, + "loss": 2.4451, + "step": 12570 + }, + { + "epoch": 1.0145266725849407, + "grad_norm": 0.6978901624679565, + "learning_rate": 6.127269829475141e-05, + "loss": 2.4608, + "step": 12571 + }, + { + "epoch": 1.0146073763215238, + "grad_norm": 0.676343560218811, + "learning_rate": 6.125814380893838e-05, + "loss": 2.4536, + "step": 12572 + }, + { + "epoch": 1.0146880800581066, + "grad_norm": 0.7082604765892029, + "learning_rate": 6.124359028867368e-05, + "loss": 2.45, + "step": 12573 + }, + { + "epoch": 1.0147687837946897, + "grad_norm": 0.7049853205680847, + "learning_rate": 6.122903773432003e-05, + "loss": 2.4378, + "step": 12574 + }, + { + "epoch": 1.0148494875312728, + "grad_norm": 0.6329593062400818, + "learning_rate": 6.121448614624009e-05, + "loss": 2.4386, + "step": 12575 + }, + { + "epoch": 1.0149301912678557, + "grad_norm": 0.7249468564987183, + "learning_rate": 6.119993552479655e-05, + "loss": 2.5191, + "step": 12576 + }, + { + "epoch": 1.0150108950044388, + "grad_norm": 0.7028193473815918, + "learning_rate": 6.118538587035206e-05, + "loss": 2.4376, + "step": 12577 + }, + { + "epoch": 1.0150915987410216, + "grad_norm": 0.697382926940918, + "learning_rate": 6.117083718326917e-05, + "loss": 2.4797, + "step": 12578 + }, + { + "epoch": 1.0151723024776047, + "grad_norm": 0.7386965155601501, + "learning_rate": 6.115628946391055e-05, + "loss": 2.4512, + "step": 12579 + }, + { + "epoch": 1.0152530062141878, + "grad_norm": 0.6614577174186707, + "learning_rate": 6.114174271263875e-05, + "loss": 2.4404, + "step": 12580 + }, + { + "epoch": 1.0153337099507707, + "grad_norm": 0.6927464604377747, + "learning_rate": 6.112719692981627e-05, + "loss": 2.47, + "step": 12581 + }, + { + "epoch": 1.0154144136873537, + "grad_norm": 0.7004262208938599, + "learning_rate": 6.111265211580566e-05, + "loss": 2.4212, + "step": 12582 + }, + { + "epoch": 1.0154951174239368, + "grad_norm": 0.71146559715271, + 
"learning_rate": 6.109810827096942e-05, + "loss": 2.4431, + "step": 12583 + }, + { + "epoch": 1.0155758211605197, + "grad_norm": 0.6857032775878906, + "learning_rate": 6.108356539567e-05, + "loss": 2.453, + "step": 12584 + }, + { + "epoch": 1.0156565248971028, + "grad_norm": 0.6976168155670166, + "learning_rate": 6.106902349026986e-05, + "loss": 2.4718, + "step": 12585 + }, + { + "epoch": 1.0157372286336857, + "grad_norm": 0.7158414125442505, + "learning_rate": 6.105448255513146e-05, + "loss": 2.425, + "step": 12586 + }, + { + "epoch": 1.0158179323702687, + "grad_norm": 0.6611737608909607, + "learning_rate": 6.103994259061714e-05, + "loss": 2.4563, + "step": 12587 + }, + { + "epoch": 1.0158986361068518, + "grad_norm": 0.7262980937957764, + "learning_rate": 6.102540359708926e-05, + "loss": 2.4538, + "step": 12588 + }, + { + "epoch": 1.0159793398434347, + "grad_norm": 0.7123451828956604, + "learning_rate": 6.10108655749102e-05, + "loss": 2.4677, + "step": 12589 + }, + { + "epoch": 1.0160600435800178, + "grad_norm": 0.7135589122772217, + "learning_rate": 6.099632852444235e-05, + "loss": 2.4312, + "step": 12590 + }, + { + "epoch": 1.0161407473166009, + "grad_norm": 0.6509461998939514, + "learning_rate": 6.09817924460479e-05, + "loss": 2.4716, + "step": 12591 + }, + { + "epoch": 1.0162214510531837, + "grad_norm": 0.8835915923118591, + "learning_rate": 6.096725734008919e-05, + "loss": 2.4817, + "step": 12592 + }, + { + "epoch": 1.0163021547897668, + "grad_norm": 0.7084136605262756, + "learning_rate": 6.095272320692846e-05, + "loss": 2.483, + "step": 12593 + }, + { + "epoch": 1.0163828585263497, + "grad_norm": 0.6866818070411682, + "learning_rate": 6.0938190046927934e-05, + "loss": 2.4838, + "step": 12594 + }, + { + "epoch": 1.0164635622629328, + "grad_norm": 0.7297510504722595, + "learning_rate": 6.0923657860449824e-05, + "loss": 2.4675, + "step": 12595 + }, + { + "epoch": 1.0165442659995159, + "grad_norm": 0.6735619306564331, + "learning_rate": 6.090912664785633e-05, + 
"loss": 2.444, + "step": 12596 + }, + { + "epoch": 1.0166249697360987, + "grad_norm": 0.7046451568603516, + "learning_rate": 6.0894596409509565e-05, + "loss": 2.4757, + "step": 12597 + }, + { + "epoch": 1.0167056734726818, + "grad_norm": 0.6646085977554321, + "learning_rate": 6.0880067145771656e-05, + "loss": 2.4772, + "step": 12598 + }, + { + "epoch": 1.0167863772092647, + "grad_norm": 0.7217094302177429, + "learning_rate": 6.086553885700478e-05, + "loss": 2.4589, + "step": 12599 + }, + { + "epoch": 1.0168670809458478, + "grad_norm": 0.647378146648407, + "learning_rate": 6.085101154357093e-05, + "loss": 2.4327, + "step": 12600 + }, + { + "epoch": 1.0169477846824309, + "grad_norm": 0.6907125115394592, + "learning_rate": 6.083648520583223e-05, + "loss": 2.467, + "step": 12601 + }, + { + "epoch": 1.0170284884190137, + "grad_norm": 0.690433919429779, + "learning_rate": 6.0821959844150687e-05, + "loss": 2.488, + "step": 12602 + }, + { + "epoch": 1.0171091921555968, + "grad_norm": 0.6528738737106323, + "learning_rate": 6.080743545888833e-05, + "loss": 2.5028, + "step": 12603 + }, + { + "epoch": 1.01718989589218, + "grad_norm": 0.6962323784828186, + "learning_rate": 6.079291205040711e-05, + "loss": 2.5381, + "step": 12604 + }, + { + "epoch": 1.0172705996287628, + "grad_norm": 0.7386075854301453, + "learning_rate": 6.077838961906902e-05, + "loss": 2.4445, + "step": 12605 + }, + { + "epoch": 1.0173513033653458, + "grad_norm": 0.7382189631462097, + "learning_rate": 6.0763868165236025e-05, + "loss": 2.4926, + "step": 12606 + }, + { + "epoch": 1.0174320071019287, + "grad_norm": 0.7291865944862366, + "learning_rate": 6.074934768926995e-05, + "loss": 2.4624, + "step": 12607 + }, + { + "epoch": 1.0175127108385118, + "grad_norm": 0.754843533039093, + "learning_rate": 6.073482819153275e-05, + "loss": 2.4291, + "step": 12608 + }, + { + "epoch": 1.017593414575095, + "grad_norm": 0.6827771663665771, + "learning_rate": 6.072030967238628e-05, + "loss": 2.453, + "step": 12609 + }, + { + 
"epoch": 1.0176741183116778, + "grad_norm": 0.7138541340827942, + "learning_rate": 6.0705792132192355e-05, + "loss": 2.5172, + "step": 12610 + }, + { + "epoch": 1.0177548220482608, + "grad_norm": 0.6539924740791321, + "learning_rate": 6.06912755713128e-05, + "loss": 2.4393, + "step": 12611 + }, + { + "epoch": 1.017835525784844, + "grad_norm": 0.7021273970603943, + "learning_rate": 6.067675999010945e-05, + "loss": 2.4519, + "step": 12612 + }, + { + "epoch": 1.0179162295214268, + "grad_norm": 0.7124225497245789, + "learning_rate": 6.0662245388944004e-05, + "loss": 2.4417, + "step": 12613 + }, + { + "epoch": 1.0179969332580099, + "grad_norm": 0.7214948534965515, + "learning_rate": 6.064773176817823e-05, + "loss": 2.4708, + "step": 12614 + }, + { + "epoch": 1.0180776369945927, + "grad_norm": 0.6738584041595459, + "learning_rate": 6.063321912817386e-05, + "loss": 2.4574, + "step": 12615 + }, + { + "epoch": 1.0181583407311758, + "grad_norm": 0.7215890884399414, + "learning_rate": 6.061870746929257e-05, + "loss": 2.4903, + "step": 12616 + }, + { + "epoch": 1.018239044467759, + "grad_norm": 0.6720155477523804, + "learning_rate": 6.0604196791896016e-05, + "loss": 2.4251, + "step": 12617 + }, + { + "epoch": 1.0183197482043418, + "grad_norm": 0.7046420574188232, + "learning_rate": 6.058968709634587e-05, + "loss": 2.446, + "step": 12618 + }, + { + "epoch": 1.0184004519409249, + "grad_norm": 0.6419540047645569, + "learning_rate": 6.0575178383003764e-05, + "loss": 2.4052, + "step": 12619 + }, + { + "epoch": 1.018481155677508, + "grad_norm": 0.6948695182800293, + "learning_rate": 6.0560670652231235e-05, + "loss": 2.5068, + "step": 12620 + }, + { + "epoch": 1.0185618594140908, + "grad_norm": 0.7274870276451111, + "learning_rate": 6.05461639043899e-05, + "loss": 2.4705, + "step": 12621 + }, + { + "epoch": 1.018642563150674, + "grad_norm": 0.6809766292572021, + "learning_rate": 6.053165813984134e-05, + "loss": 2.3767, + "step": 12622 + }, + { + "epoch": 1.0187232668872568, + 
"grad_norm": 0.6197625994682312, + "learning_rate": 6.0517153358946985e-05, + "loss": 2.4639, + "step": 12623 + }, + { + "epoch": 1.0188039706238399, + "grad_norm": 0.6613010764122009, + "learning_rate": 6.050264956206837e-05, + "loss": 2.5155, + "step": 12624 + }, + { + "epoch": 1.018884674360423, + "grad_norm": 0.7335553765296936, + "learning_rate": 6.0488146749567e-05, + "loss": 2.5344, + "step": 12625 + }, + { + "epoch": 1.0189653780970058, + "grad_norm": 0.7175146341323853, + "learning_rate": 6.047364492180428e-05, + "loss": 2.4972, + "step": 12626 + }, + { + "epoch": 1.019046081833589, + "grad_norm": 0.6825357675552368, + "learning_rate": 6.045914407914166e-05, + "loss": 2.4356, + "step": 12627 + }, + { + "epoch": 1.019126785570172, + "grad_norm": 0.6369633078575134, + "learning_rate": 6.044464422194056e-05, + "loss": 2.4692, + "step": 12628 + }, + { + "epoch": 1.0192074893067549, + "grad_norm": 0.7407073378562927, + "learning_rate": 6.0430145350562264e-05, + "loss": 2.4565, + "step": 12629 + }, + { + "epoch": 1.019288193043338, + "grad_norm": 0.6836552619934082, + "learning_rate": 6.041564746536821e-05, + "loss": 2.4357, + "step": 12630 + }, + { + "epoch": 1.0193688967799208, + "grad_norm": 0.6778741478919983, + "learning_rate": 6.040115056671972e-05, + "loss": 2.424, + "step": 12631 + }, + { + "epoch": 1.019449600516504, + "grad_norm": 0.6440724730491638, + "learning_rate": 6.0386654654978035e-05, + "loss": 2.4455, + "step": 12632 + }, + { + "epoch": 1.019530304253087, + "grad_norm": 0.681376039981842, + "learning_rate": 6.0372159730504476e-05, + "loss": 2.4562, + "step": 12633 + }, + { + "epoch": 1.0196110079896699, + "grad_norm": 0.657462477684021, + "learning_rate": 6.035766579366029e-05, + "loss": 2.4315, + "step": 12634 + }, + { + "epoch": 1.019691711726253, + "grad_norm": 0.6540380716323853, + "learning_rate": 6.0343172844806706e-05, + "loss": 2.4789, + "step": 12635 + }, + { + "epoch": 1.019772415462836, + "grad_norm": 0.711883008480072, + 
"learning_rate": 6.03286808843049e-05, + "loss": 2.4178, + "step": 12636 + }, + { + "epoch": 1.019853119199419, + "grad_norm": 0.6746736168861389, + "learning_rate": 6.031418991251607e-05, + "loss": 2.4351, + "step": 12637 + }, + { + "epoch": 1.019933822936002, + "grad_norm": 0.677237331867218, + "learning_rate": 6.02996999298014e-05, + "loss": 2.4335, + "step": 12638 + }, + { + "epoch": 1.0200145266725849, + "grad_norm": 0.6950497627258301, + "learning_rate": 6.0285210936521955e-05, + "loss": 2.5178, + "step": 12639 + }, + { + "epoch": 1.020095230409168, + "grad_norm": 0.6349243521690369, + "learning_rate": 6.027072293303885e-05, + "loss": 2.4405, + "step": 12640 + }, + { + "epoch": 1.020175934145751, + "grad_norm": 0.744276762008667, + "learning_rate": 6.0256235919713236e-05, + "loss": 2.5156, + "step": 12641 + }, + { + "epoch": 1.020256637882334, + "grad_norm": 0.7697997689247131, + "learning_rate": 6.0241749896906075e-05, + "loss": 2.4393, + "step": 12642 + }, + { + "epoch": 1.020337341618917, + "grad_norm": 0.7784204483032227, + "learning_rate": 6.022726486497844e-05, + "loss": 2.4565, + "step": 12643 + }, + { + "epoch": 1.0204180453555, + "grad_norm": 0.7434312701225281, + "learning_rate": 6.021278082429136e-05, + "loss": 2.4637, + "step": 12644 + }, + { + "epoch": 1.020498749092083, + "grad_norm": 0.7770118117332458, + "learning_rate": 6.019829777520575e-05, + "loss": 2.4998, + "step": 12645 + }, + { + "epoch": 1.020579452828666, + "grad_norm": 0.7021752595901489, + "learning_rate": 6.01838157180826e-05, + "loss": 2.4661, + "step": 12646 + }, + { + "epoch": 1.0206601565652489, + "grad_norm": 0.6812437176704407, + "learning_rate": 6.0169334653282895e-05, + "loss": 2.4611, + "step": 12647 + }, + { + "epoch": 1.020740860301832, + "grad_norm": 0.757724940776825, + "learning_rate": 6.0154854581167455e-05, + "loss": 2.4427, + "step": 12648 + }, + { + "epoch": 1.020821564038415, + "grad_norm": 0.7386252880096436, + "learning_rate": 6.014037550209718e-05, + "loss": 
2.424, + "step": 12649 + }, + { + "epoch": 1.020902267774998, + "grad_norm": 0.7138059735298157, + "learning_rate": 6.012589741643295e-05, + "loss": 2.4951, + "step": 12650 + }, + { + "epoch": 1.020982971511581, + "grad_norm": 0.714022159576416, + "learning_rate": 6.011142032453561e-05, + "loss": 2.4398, + "step": 12651 + }, + { + "epoch": 1.0210636752481639, + "grad_norm": 0.6961550712585449, + "learning_rate": 6.00969442267659e-05, + "loss": 2.4495, + "step": 12652 + }, + { + "epoch": 1.021144378984747, + "grad_norm": 0.7196643948554993, + "learning_rate": 6.008246912348467e-05, + "loss": 2.4449, + "step": 12653 + }, + { + "epoch": 1.02122508272133, + "grad_norm": 0.6163341999053955, + "learning_rate": 6.006799501505268e-05, + "loss": 2.4108, + "step": 12654 + }, + { + "epoch": 1.021305786457913, + "grad_norm": 0.6657030582427979, + "learning_rate": 6.005352190183061e-05, + "loss": 2.4328, + "step": 12655 + }, + { + "epoch": 1.021386490194496, + "grad_norm": 0.7183353900909424, + "learning_rate": 6.00390497841792e-05, + "loss": 2.4912, + "step": 12656 + }, + { + "epoch": 1.021467193931079, + "grad_norm": 0.6912575364112854, + "learning_rate": 6.002457866245916e-05, + "loss": 2.4597, + "step": 12657 + }, + { + "epoch": 1.021547897667662, + "grad_norm": 0.7395210266113281, + "learning_rate": 6.0010108537031084e-05, + "loss": 2.4823, + "step": 12658 + }, + { + "epoch": 1.021628601404245, + "grad_norm": 0.722618043422699, + "learning_rate": 5.9995639408255636e-05, + "loss": 2.4924, + "step": 12659 + }, + { + "epoch": 1.021709305140828, + "grad_norm": 0.739009439945221, + "learning_rate": 5.998117127649344e-05, + "loss": 2.4454, + "step": 12660 + }, + { + "epoch": 1.021790008877411, + "grad_norm": 0.7017633318901062, + "learning_rate": 5.996670414210506e-05, + "loss": 2.5058, + "step": 12661 + }, + { + "epoch": 1.021870712613994, + "grad_norm": 0.742664635181427, + "learning_rate": 5.9952238005451046e-05, + "loss": 2.436, + "step": 12662 + }, + { + "epoch": 
1.021951416350577, + "grad_norm": 0.6865660548210144, + "learning_rate": 5.9937772866892e-05, + "loss": 2.4364, + "step": 12663 + }, + { + "epoch": 1.02203212008716, + "grad_norm": 0.7376219034194946, + "learning_rate": 5.992330872678833e-05, + "loss": 2.4975, + "step": 12664 + }, + { + "epoch": 1.0221128238237431, + "grad_norm": 0.6496078372001648, + "learning_rate": 5.990884558550054e-05, + "loss": 2.4651, + "step": 12665 + }, + { + "epoch": 1.022193527560326, + "grad_norm": 0.7178322076797485, + "learning_rate": 5.989438344338915e-05, + "loss": 2.5015, + "step": 12666 + }, + { + "epoch": 1.022274231296909, + "grad_norm": 0.7084102034568787, + "learning_rate": 5.987992230081459e-05, + "loss": 2.4741, + "step": 12667 + }, + { + "epoch": 1.022354935033492, + "grad_norm": 0.6634935736656189, + "learning_rate": 5.986546215813722e-05, + "loss": 2.4255, + "step": 12668 + }, + { + "epoch": 1.022435638770075, + "grad_norm": 0.6897543668746948, + "learning_rate": 5.985100301571742e-05, + "loss": 2.4682, + "step": 12669 + }, + { + "epoch": 1.0225163425066581, + "grad_norm": 0.6643948554992676, + "learning_rate": 5.9836544873915614e-05, + "loss": 2.4009, + "step": 12670 + }, + { + "epoch": 1.022597046243241, + "grad_norm": 0.681252658367157, + "learning_rate": 5.982208773309208e-05, + "loss": 2.4542, + "step": 12671 + }, + { + "epoch": 1.022677749979824, + "grad_norm": 0.7608681917190552, + "learning_rate": 5.980763159360714e-05, + "loss": 2.5614, + "step": 12672 + }, + { + "epoch": 1.0227584537164072, + "grad_norm": 0.6855095028877258, + "learning_rate": 5.979317645582112e-05, + "loss": 2.4505, + "step": 12673 + }, + { + "epoch": 1.02283915745299, + "grad_norm": 0.6846089363098145, + "learning_rate": 5.97787223200942e-05, + "loss": 2.4438, + "step": 12674 + }, + { + "epoch": 1.0229198611895731, + "grad_norm": 0.7198090553283691, + "learning_rate": 5.9764269186786684e-05, + "loss": 2.4469, + "step": 12675 + }, + { + "epoch": 1.023000564926156, + "grad_norm": 
0.7120245099067688, + "learning_rate": 5.9749817056258764e-05, + "loss": 2.4626, + "step": 12676 + }, + { + "epoch": 1.023081268662739, + "grad_norm": 0.6839897036552429, + "learning_rate": 5.973536592887059e-05, + "loss": 2.4384, + "step": 12677 + }, + { + "epoch": 1.0231619723993222, + "grad_norm": 0.7053773999214172, + "learning_rate": 5.9720915804982356e-05, + "loss": 2.4554, + "step": 12678 + }, + { + "epoch": 1.023242676135905, + "grad_norm": 0.7114294767379761, + "learning_rate": 5.970646668495421e-05, + "loss": 2.3964, + "step": 12679 + }, + { + "epoch": 1.0233233798724881, + "grad_norm": 0.7001516819000244, + "learning_rate": 5.9692018569146224e-05, + "loss": 2.5216, + "step": 12680 + }, + { + "epoch": 1.0234040836090712, + "grad_norm": 0.6715773940086365, + "learning_rate": 5.96775714579185e-05, + "loss": 2.4595, + "step": 12681 + }, + { + "epoch": 1.023484787345654, + "grad_norm": 0.6856278777122498, + "learning_rate": 5.96631253516311e-05, + "loss": 2.4637, + "step": 12682 + }, + { + "epoch": 1.0235654910822372, + "grad_norm": 0.6785625219345093, + "learning_rate": 5.96486802506441e-05, + "loss": 2.4615, + "step": 12683 + }, + { + "epoch": 1.02364619481882, + "grad_norm": 0.6834213137626648, + "learning_rate": 5.963423615531743e-05, + "loss": 2.4729, + "step": 12684 + }, + { + "epoch": 1.023726898555403, + "grad_norm": 0.6729516386985779, + "learning_rate": 5.961979306601109e-05, + "loss": 2.4013, + "step": 12685 + }, + { + "epoch": 1.0238076022919862, + "grad_norm": 0.6785775423049927, + "learning_rate": 5.960535098308511e-05, + "loss": 2.4825, + "step": 12686 + }, + { + "epoch": 1.023888306028569, + "grad_norm": 0.67277991771698, + "learning_rate": 5.959090990689934e-05, + "loss": 2.4606, + "step": 12687 + }, + { + "epoch": 1.0239690097651521, + "grad_norm": 0.7679588198661804, + "learning_rate": 5.957646983781373e-05, + "loss": 2.5234, + "step": 12688 + }, + { + "epoch": 1.0240497135017352, + "grad_norm": 0.6597407460212708, + "learning_rate": 
5.956203077618821e-05, + "loss": 2.4699, + "step": 12689 + }, + { + "epoch": 1.024130417238318, + "grad_norm": 0.6743008494377136, + "learning_rate": 5.9547592722382525e-05, + "loss": 2.4266, + "step": 12690 + }, + { + "epoch": 1.0242111209749012, + "grad_norm": 0.7223396897315979, + "learning_rate": 5.953315567675657e-05, + "loss": 2.5117, + "step": 12691 + }, + { + "epoch": 1.024291824711484, + "grad_norm": 0.6729528307914734, + "learning_rate": 5.951871963967022e-05, + "loss": 2.4586, + "step": 12692 + }, + { + "epoch": 1.0243725284480671, + "grad_norm": 0.6523739695549011, + "learning_rate": 5.950428461148314e-05, + "loss": 2.4408, + "step": 12693 + }, + { + "epoch": 1.0244532321846502, + "grad_norm": 0.6830984950065613, + "learning_rate": 5.9489850592555164e-05, + "loss": 2.4094, + "step": 12694 + }, + { + "epoch": 1.024533935921233, + "grad_norm": 0.6223493814468384, + "learning_rate": 5.9475417583246006e-05, + "loss": 2.4105, + "step": 12695 + }, + { + "epoch": 1.0246146396578162, + "grad_norm": 0.6506635546684265, + "learning_rate": 5.9460985583915374e-05, + "loss": 2.4451, + "step": 12696 + }, + { + "epoch": 1.024695343394399, + "grad_norm": 0.7626760005950928, + "learning_rate": 5.944655459492293e-05, + "loss": 2.4643, + "step": 12697 + }, + { + "epoch": 1.0247760471309821, + "grad_norm": 0.7074631452560425, + "learning_rate": 5.943212461662837e-05, + "loss": 2.4662, + "step": 12698 + }, + { + "epoch": 1.0248567508675652, + "grad_norm": 0.718083918094635, + "learning_rate": 5.9417695649391346e-05, + "loss": 2.4686, + "step": 12699 + }, + { + "epoch": 1.024937454604148, + "grad_norm": 0.6850628852844238, + "learning_rate": 5.9403267693571384e-05, + "loss": 2.4542, + "step": 12700 + }, + { + "epoch": 1.0250181583407312, + "grad_norm": 0.6662585735321045, + "learning_rate": 5.938884074952812e-05, + "loss": 2.4676, + "step": 12701 + }, + { + "epoch": 1.0250988620773143, + "grad_norm": 0.6806240677833557, + "learning_rate": 5.9374414817621114e-05, + "loss": 
2.4243, + "step": 12702 + }, + { + "epoch": 1.0251795658138971, + "grad_norm": 0.6763548851013184, + "learning_rate": 5.9359989898209876e-05, + "loss": 2.4389, + "step": 12703 + }, + { + "epoch": 1.0252602695504802, + "grad_norm": 0.7390143275260925, + "learning_rate": 5.934556599165393e-05, + "loss": 2.4667, + "step": 12704 + }, + { + "epoch": 1.025340973287063, + "grad_norm": 0.6159299612045288, + "learning_rate": 5.933114309831276e-05, + "loss": 2.3832, + "step": 12705 + }, + { + "epoch": 1.0254216770236462, + "grad_norm": 0.6779586672782898, + "learning_rate": 5.931672121854579e-05, + "loss": 2.4615, + "step": 12706 + }, + { + "epoch": 1.0255023807602293, + "grad_norm": 0.643800675868988, + "learning_rate": 5.930230035271247e-05, + "loss": 2.4725, + "step": 12707 + }, + { + "epoch": 1.0255830844968121, + "grad_norm": 0.6605903506278992, + "learning_rate": 5.928788050117227e-05, + "loss": 2.4332, + "step": 12708 + }, + { + "epoch": 1.0256637882333952, + "grad_norm": 0.7046334743499756, + "learning_rate": 5.927346166428446e-05, + "loss": 2.4445, + "step": 12709 + }, + { + "epoch": 1.0257444919699783, + "grad_norm": 0.6536325216293335, + "learning_rate": 5.925904384240843e-05, + "loss": 2.4168, + "step": 12710 + }, + { + "epoch": 1.0258251957065612, + "grad_norm": 0.6861097812652588, + "learning_rate": 5.9244627035903564e-05, + "loss": 2.512, + "step": 12711 + }, + { + "epoch": 1.0259058994431443, + "grad_norm": 0.6782278418540955, + "learning_rate": 5.923021124512911e-05, + "loss": 2.4667, + "step": 12712 + }, + { + "epoch": 1.0259866031797271, + "grad_norm": 0.724435031414032, + "learning_rate": 5.921579647044436e-05, + "loss": 2.4828, + "step": 12713 + }, + { + "epoch": 1.0260673069163102, + "grad_norm": 0.6690630316734314, + "learning_rate": 5.9201382712208575e-05, + "loss": 2.4832, + "step": 12714 + }, + { + "epoch": 1.0261480106528933, + "grad_norm": 0.7045348286628723, + "learning_rate": 5.9186969970781015e-05, + "loss": 2.4576, + "step": 12715 + }, + { + 
"epoch": 1.0262287143894762, + "grad_norm": 0.673321008682251, + "learning_rate": 5.9172558246520796e-05, + "loss": 2.3986, + "step": 12716 + }, + { + "epoch": 1.0263094181260592, + "grad_norm": 0.7184785008430481, + "learning_rate": 5.915814753978717e-05, + "loss": 2.4008, + "step": 12717 + }, + { + "epoch": 1.0263901218626423, + "grad_norm": 0.6971293091773987, + "learning_rate": 5.914373785093931e-05, + "loss": 2.4559, + "step": 12718 + }, + { + "epoch": 1.0264708255992252, + "grad_norm": 0.6941563487052917, + "learning_rate": 5.912932918033626e-05, + "loss": 2.4787, + "step": 12719 + }, + { + "epoch": 1.0265515293358083, + "grad_norm": 0.6276142001152039, + "learning_rate": 5.911492152833715e-05, + "loss": 2.4275, + "step": 12720 + }, + { + "epoch": 1.0266322330723912, + "grad_norm": 0.715928316116333, + "learning_rate": 5.9100514895301106e-05, + "loss": 2.4127, + "step": 12721 + }, + { + "epoch": 1.0267129368089742, + "grad_norm": 0.7004076838493347, + "learning_rate": 5.908610928158713e-05, + "loss": 2.4651, + "step": 12722 + }, + { + "epoch": 1.0267936405455573, + "grad_norm": 0.6761921048164368, + "learning_rate": 5.907170468755425e-05, + "loss": 2.4245, + "step": 12723 + }, + { + "epoch": 1.0268743442821402, + "grad_norm": 0.7246574759483337, + "learning_rate": 5.9057301113561515e-05, + "loss": 2.4489, + "step": 12724 + }, + { + "epoch": 1.0269550480187233, + "grad_norm": 0.7196606397628784, + "learning_rate": 5.904289855996783e-05, + "loss": 2.4357, + "step": 12725 + }, + { + "epoch": 1.0270357517553064, + "grad_norm": 0.7142692804336548, + "learning_rate": 5.902849702713216e-05, + "loss": 2.4821, + "step": 12726 + }, + { + "epoch": 1.0271164554918892, + "grad_norm": 0.7207832336425781, + "learning_rate": 5.9014096515413454e-05, + "loss": 2.4337, + "step": 12727 + }, + { + "epoch": 1.0271971592284723, + "grad_norm": 0.6865695714950562, + "learning_rate": 5.899969702517063e-05, + "loss": 2.4549, + "step": 12728 + }, + { + "epoch": 1.0272778629650552, + 
"grad_norm": 0.7136662006378174, + "learning_rate": 5.898529855676249e-05, + "loss": 2.4606, + "step": 12729 + }, + { + "epoch": 1.0273585667016383, + "grad_norm": 0.701885998249054, + "learning_rate": 5.897090111054795e-05, + "loss": 2.4913, + "step": 12730 + }, + { + "epoch": 1.0274392704382214, + "grad_norm": 0.6671354174613953, + "learning_rate": 5.8956504686885805e-05, + "loss": 2.4064, + "step": 12731 + }, + { + "epoch": 1.0275199741748042, + "grad_norm": 0.6720621585845947, + "learning_rate": 5.894210928613484e-05, + "loss": 2.4908, + "step": 12732 + }, + { + "epoch": 1.0276006779113873, + "grad_norm": 0.7530980706214905, + "learning_rate": 5.892771490865383e-05, + "loss": 2.4486, + "step": 12733 + }, + { + "epoch": 1.0276813816479704, + "grad_norm": 0.6771122813224792, + "learning_rate": 5.891332155480158e-05, + "loss": 2.3954, + "step": 12734 + }, + { + "epoch": 1.0277620853845533, + "grad_norm": 0.6779236793518066, + "learning_rate": 5.889892922493671e-05, + "loss": 2.4404, + "step": 12735 + }, + { + "epoch": 1.0278427891211364, + "grad_norm": 0.7593358755111694, + "learning_rate": 5.8884537919417974e-05, + "loss": 2.4997, + "step": 12736 + }, + { + "epoch": 1.0279234928577192, + "grad_norm": 0.672686755657196, + "learning_rate": 5.8870147638604044e-05, + "loss": 2.5394, + "step": 12737 + }, + { + "epoch": 1.0280041965943023, + "grad_norm": 0.6727546453475952, + "learning_rate": 5.885575838285353e-05, + "loss": 2.4554, + "step": 12738 + }, + { + "epoch": 1.0280849003308854, + "grad_norm": 0.7092764377593994, + "learning_rate": 5.884137015252507e-05, + "loss": 2.4568, + "step": 12739 + }, + { + "epoch": 1.0281656040674683, + "grad_norm": 0.6988070011138916, + "learning_rate": 5.882698294797728e-05, + "loss": 2.4453, + "step": 12740 + }, + { + "epoch": 1.0282463078040514, + "grad_norm": 0.7578697204589844, + "learning_rate": 5.8812596769568676e-05, + "loss": 2.5648, + "step": 12741 + }, + { + "epoch": 1.0283270115406344, + "grad_norm": 0.6523683667182922, + 
"learning_rate": 5.879821161765782e-05, + "loss": 2.4088, + "step": 12742 + }, + { + "epoch": 1.0284077152772173, + "grad_norm": 0.6797270178794861, + "learning_rate": 5.878382749260323e-05, + "loss": 2.4465, + "step": 12743 + }, + { + "epoch": 1.0284884190138004, + "grad_norm": 0.6823786497116089, + "learning_rate": 5.876944439476345e-05, + "loss": 2.5053, + "step": 12744 + }, + { + "epoch": 1.0285691227503833, + "grad_norm": 0.6840088367462158, + "learning_rate": 5.875506232449686e-05, + "loss": 2.3771, + "step": 12745 + }, + { + "epoch": 1.0286498264869663, + "grad_norm": 0.6985318064689636, + "learning_rate": 5.8740681282161914e-05, + "loss": 2.4456, + "step": 12746 + }, + { + "epoch": 1.0287305302235494, + "grad_norm": 0.7102388739585876, + "learning_rate": 5.872630126811707e-05, + "loss": 2.4802, + "step": 12747 + }, + { + "epoch": 1.0288112339601323, + "grad_norm": 0.7917937636375427, + "learning_rate": 5.871192228272067e-05, + "loss": 2.4606, + "step": 12748 + }, + { + "epoch": 1.0288919376967154, + "grad_norm": 0.683397114276886, + "learning_rate": 5.86975443263311e-05, + "loss": 2.5011, + "step": 12749 + }, + { + "epoch": 1.0289726414332985, + "grad_norm": 0.7543408870697021, + "learning_rate": 5.8683167399306724e-05, + "loss": 2.4705, + "step": 12750 + }, + { + "epoch": 1.0290533451698813, + "grad_norm": 0.6946283578872681, + "learning_rate": 5.866879150200579e-05, + "loss": 2.4986, + "step": 12751 + }, + { + "epoch": 1.0291340489064644, + "grad_norm": 0.6535125374794006, + "learning_rate": 5.8654416634786605e-05, + "loss": 2.4203, + "step": 12752 + }, + { + "epoch": 1.0292147526430473, + "grad_norm": 0.7470195889472961, + "learning_rate": 5.8640042798007455e-05, + "loss": 2.5103, + "step": 12753 + }, + { + "epoch": 1.0292954563796304, + "grad_norm": 0.6782363653182983, + "learning_rate": 5.8625669992026535e-05, + "loss": 2.4087, + "step": 12754 + }, + { + "epoch": 1.0293761601162135, + "grad_norm": 0.7601497173309326, + "learning_rate": 
5.861129821720207e-05, + "loss": 2.4752, + "step": 12755 + }, + { + "epoch": 1.0294568638527963, + "grad_norm": 0.6875388026237488, + "learning_rate": 5.859692747389227e-05, + "loss": 2.448, + "step": 12756 + }, + { + "epoch": 1.0295375675893794, + "grad_norm": 0.7153629064559937, + "learning_rate": 5.858255776245525e-05, + "loss": 2.4641, + "step": 12757 + }, + { + "epoch": 1.0296182713259623, + "grad_norm": 0.682954728603363, + "learning_rate": 5.8568189083249145e-05, + "loss": 2.441, + "step": 12758 + }, + { + "epoch": 1.0296989750625454, + "grad_norm": 0.6959100961685181, + "learning_rate": 5.855382143663209e-05, + "loss": 2.4316, + "step": 12759 + }, + { + "epoch": 1.0297796787991285, + "grad_norm": 0.7062023878097534, + "learning_rate": 5.8539454822962167e-05, + "loss": 2.4287, + "step": 12760 + }, + { + "epoch": 1.0298603825357113, + "grad_norm": 0.706523597240448, + "learning_rate": 5.852508924259736e-05, + "loss": 2.4596, + "step": 12761 + }, + { + "epoch": 1.0299410862722944, + "grad_norm": 0.6908385753631592, + "learning_rate": 5.851072469589578e-05, + "loss": 2.4428, + "step": 12762 + }, + { + "epoch": 1.0300217900088775, + "grad_norm": 0.6810726523399353, + "learning_rate": 5.8496361183215386e-05, + "loss": 2.4902, + "step": 12763 + }, + { + "epoch": 1.0301024937454604, + "grad_norm": 0.661613941192627, + "learning_rate": 5.8481998704914156e-05, + "loss": 2.4256, + "step": 12764 + }, + { + "epoch": 1.0301831974820435, + "grad_norm": 0.6633132100105286, + "learning_rate": 5.846763726135005e-05, + "loss": 2.4512, + "step": 12765 + }, + { + "epoch": 1.0302639012186263, + "grad_norm": 0.6991820335388184, + "learning_rate": 5.8453276852881025e-05, + "loss": 2.3747, + "step": 12766 + }, + { + "epoch": 1.0303446049552094, + "grad_norm": 0.7392076253890991, + "learning_rate": 5.843891747986487e-05, + "loss": 2.438, + "step": 12767 + }, + { + "epoch": 1.0304253086917925, + "grad_norm": 0.6371724605560303, + "learning_rate": 5.842455914265958e-05, + "loss": 
2.4627, + "step": 12768 + }, + { + "epoch": 1.0305060124283754, + "grad_norm": 0.6475048661231995, + "learning_rate": 5.841020184162298e-05, + "loss": 2.4883, + "step": 12769 + }, + { + "epoch": 1.0305867161649584, + "grad_norm": 0.6848995685577393, + "learning_rate": 5.839584557711283e-05, + "loss": 2.4452, + "step": 12770 + }, + { + "epoch": 1.0306674199015415, + "grad_norm": 0.7345505952835083, + "learning_rate": 5.838149034948697e-05, + "loss": 2.5121, + "step": 12771 + }, + { + "epoch": 1.0307481236381244, + "grad_norm": 0.715373158454895, + "learning_rate": 5.836713615910318e-05, + "loss": 2.4549, + "step": 12772 + }, + { + "epoch": 1.0308288273747075, + "grad_norm": 0.7371035814285278, + "learning_rate": 5.8352783006319166e-05, + "loss": 2.4633, + "step": 12773 + }, + { + "epoch": 1.0309095311112904, + "grad_norm": 0.6843077540397644, + "learning_rate": 5.833843089149267e-05, + "loss": 2.4067, + "step": 12774 + }, + { + "epoch": 1.0309902348478734, + "grad_norm": 0.7398965954780579, + "learning_rate": 5.832407981498136e-05, + "loss": 2.5199, + "step": 12775 + }, + { + "epoch": 1.0310709385844565, + "grad_norm": 0.6860283017158508, + "learning_rate": 5.830972977714294e-05, + "loss": 2.4564, + "step": 12776 + }, + { + "epoch": 1.0311516423210394, + "grad_norm": 0.683893084526062, + "learning_rate": 5.829538077833503e-05, + "loss": 2.4635, + "step": 12777 + }, + { + "epoch": 1.0312323460576225, + "grad_norm": 0.6412089467048645, + "learning_rate": 5.828103281891525e-05, + "loss": 2.4806, + "step": 12778 + }, + { + "epoch": 1.0313130497942056, + "grad_norm": 0.646393895149231, + "learning_rate": 5.826668589924123e-05, + "loss": 2.4674, + "step": 12779 + }, + { + "epoch": 1.0313937535307884, + "grad_norm": 0.6805605292320251, + "learning_rate": 5.825234001967044e-05, + "loss": 2.5145, + "step": 12780 + }, + { + "epoch": 1.0314744572673715, + "grad_norm": 0.681532084941864, + "learning_rate": 5.8237995180560455e-05, + "loss": 2.5041, + "step": 12781 + }, + { + 
"epoch": 1.0315551610039544, + "grad_norm": 0.6971312165260315, + "learning_rate": 5.8223651382268865e-05, + "loss": 2.5324, + "step": 12782 + }, + { + "epoch": 1.0316358647405375, + "grad_norm": 0.6634463667869568, + "learning_rate": 5.8209308625153026e-05, + "loss": 2.5086, + "step": 12783 + }, + { + "epoch": 1.0317165684771206, + "grad_norm": 0.6752117276191711, + "learning_rate": 5.819496690957047e-05, + "loss": 2.4805, + "step": 12784 + }, + { + "epoch": 1.0317972722137034, + "grad_norm": 0.7242109775543213, + "learning_rate": 5.818062623587861e-05, + "loss": 2.4205, + "step": 12785 + }, + { + "epoch": 1.0318779759502865, + "grad_norm": 0.7338563203811646, + "learning_rate": 5.816628660443486e-05, + "loss": 2.4277, + "step": 12786 + }, + { + "epoch": 1.0319586796868696, + "grad_norm": 0.6764293313026428, + "learning_rate": 5.81519480155966e-05, + "loss": 2.5096, + "step": 12787 + }, + { + "epoch": 1.0320393834234525, + "grad_norm": 0.6757099032402039, + "learning_rate": 5.813761046972124e-05, + "loss": 2.468, + "step": 12788 + }, + { + "epoch": 1.0321200871600356, + "grad_norm": 0.7072502374649048, + "learning_rate": 5.8123273967166017e-05, + "loss": 2.4642, + "step": 12789 + }, + { + "epoch": 1.0322007908966184, + "grad_norm": 0.6470256447792053, + "learning_rate": 5.810893850828827e-05, + "loss": 2.4146, + "step": 12790 + }, + { + "epoch": 1.0322814946332015, + "grad_norm": 0.7403351068496704, + "learning_rate": 5.809460409344527e-05, + "loss": 2.512, + "step": 12791 + }, + { + "epoch": 1.0323621983697846, + "grad_norm": 0.6711490154266357, + "learning_rate": 5.808027072299432e-05, + "loss": 2.4602, + "step": 12792 + }, + { + "epoch": 1.0324429021063675, + "grad_norm": 0.7920248508453369, + "learning_rate": 5.806593839729258e-05, + "loss": 2.4512, + "step": 12793 + }, + { + "epoch": 1.0325236058429506, + "grad_norm": 0.6442045569419861, + "learning_rate": 5.805160711669725e-05, + "loss": 2.4165, + "step": 12794 + }, + { + "epoch": 1.0326043095795336, + 
"grad_norm": 0.6681340932846069, + "learning_rate": 5.803727688156553e-05, + "loss": 2.4296, + "step": 12795 + }, + { + "epoch": 1.0326850133161165, + "grad_norm": 0.6653337478637695, + "learning_rate": 5.802294769225457e-05, + "loss": 2.5165, + "step": 12796 + }, + { + "epoch": 1.0327657170526996, + "grad_norm": 0.6444782018661499, + "learning_rate": 5.8008619549121476e-05, + "loss": 2.4266, + "step": 12797 + }, + { + "epoch": 1.0328464207892825, + "grad_norm": 0.6741451621055603, + "learning_rate": 5.7994292452523394e-05, + "loss": 2.4837, + "step": 12798 + }, + { + "epoch": 1.0329271245258655, + "grad_norm": 0.6629341840744019, + "learning_rate": 5.797996640281731e-05, + "loss": 2.4368, + "step": 12799 + }, + { + "epoch": 1.0330078282624486, + "grad_norm": 0.6755850315093994, + "learning_rate": 5.796564140036029e-05, + "loss": 2.4834, + "step": 12800 + }, + { + "epoch": 1.0330885319990315, + "grad_norm": 0.7271782755851746, + "learning_rate": 5.795131744550942e-05, + "loss": 2.5025, + "step": 12801 + }, + { + "epoch": 1.0331692357356146, + "grad_norm": 0.6870545744895935, + "learning_rate": 5.7936994538621605e-05, + "loss": 2.4443, + "step": 12802 + }, + { + "epoch": 1.0332499394721975, + "grad_norm": 0.7231935858726501, + "learning_rate": 5.792267268005382e-05, + "loss": 2.4917, + "step": 12803 + }, + { + "epoch": 1.0333306432087805, + "grad_norm": 0.6905832290649414, + "learning_rate": 5.790835187016307e-05, + "loss": 2.4902, + "step": 12804 + }, + { + "epoch": 1.0334113469453636, + "grad_norm": 0.711814284324646, + "learning_rate": 5.789403210930613e-05, + "loss": 2.4579, + "step": 12805 + }, + { + "epoch": 1.0334920506819465, + "grad_norm": 0.6982280015945435, + "learning_rate": 5.787971339784004e-05, + "loss": 2.5275, + "step": 12806 + }, + { + "epoch": 1.0335727544185296, + "grad_norm": 0.6871493458747864, + "learning_rate": 5.7865395736121575e-05, + "loss": 2.4401, + "step": 12807 + }, + { + "epoch": 1.0336534581551127, + "grad_norm": 0.6898353099822998, 
+ "learning_rate": 5.785107912450763e-05, + "loss": 2.4005, + "step": 12808 + }, + { + "epoch": 1.0337341618916955, + "grad_norm": 0.6264411807060242, + "learning_rate": 5.7836763563354946e-05, + "loss": 2.4497, + "step": 12809 + }, + { + "epoch": 1.0338148656282786, + "grad_norm": 0.6997092962265015, + "learning_rate": 5.782244905302032e-05, + "loss": 2.4388, + "step": 12810 + }, + { + "epoch": 1.0338955693648615, + "grad_norm": 0.6834601759910583, + "learning_rate": 5.7808135593860555e-05, + "loss": 2.4298, + "step": 12811 + }, + { + "epoch": 1.0339762731014446, + "grad_norm": 0.664315402507782, + "learning_rate": 5.77938231862323e-05, + "loss": 2.4289, + "step": 12812 + }, + { + "epoch": 1.0340569768380277, + "grad_norm": 0.6660603284835815, + "learning_rate": 5.7779511830492306e-05, + "loss": 2.4772, + "step": 12813 + }, + { + "epoch": 1.0341376805746105, + "grad_norm": 0.6457028388977051, + "learning_rate": 5.776520152699728e-05, + "loss": 2.4408, + "step": 12814 + }, + { + "epoch": 1.0342183843111936, + "grad_norm": 0.7132207155227661, + "learning_rate": 5.7750892276103794e-05, + "loss": 2.4953, + "step": 12815 + }, + { + "epoch": 1.0342990880477767, + "grad_norm": 0.7397382259368896, + "learning_rate": 5.773658407816848e-05, + "loss": 2.4396, + "step": 12816 + }, + { + "epoch": 1.0343797917843596, + "grad_norm": 0.6951746344566345, + "learning_rate": 5.7722276933548034e-05, + "loss": 2.5021, + "step": 12817 + }, + { + "epoch": 1.0344604955209427, + "grad_norm": 0.6789736151695251, + "learning_rate": 5.7707970842598935e-05, + "loss": 2.4883, + "step": 12818 + }, + { + "epoch": 1.0345411992575255, + "grad_norm": 0.7231541872024536, + "learning_rate": 5.7693665805677747e-05, + "loss": 2.4761, + "step": 12819 + }, + { + "epoch": 1.0346219029941086, + "grad_norm": 0.685943603515625, + "learning_rate": 5.767936182314104e-05, + "loss": 2.4489, + "step": 12820 + }, + { + "epoch": 1.0347026067306917, + "grad_norm": 0.7081817984580994, + "learning_rate": 
5.7665058895345236e-05, + "loss": 2.4329, + "step": 12821 + }, + { + "epoch": 1.0347833104672746, + "grad_norm": 0.6700818538665771, + "learning_rate": 5.7650757022646804e-05, + "loss": 2.4252, + "step": 12822 + }, + { + "epoch": 1.0348640142038577, + "grad_norm": 0.6712214946746826, + "learning_rate": 5.763645620540223e-05, + "loss": 2.419, + "step": 12823 + }, + { + "epoch": 1.0349447179404407, + "grad_norm": 0.6732817888259888, + "learning_rate": 5.762215644396793e-05, + "loss": 2.3928, + "step": 12824 + }, + { + "epoch": 1.0350254216770236, + "grad_norm": 0.6689301133155823, + "learning_rate": 5.760785773870024e-05, + "loss": 2.3981, + "step": 12825 + }, + { + "epoch": 1.0351061254136067, + "grad_norm": 0.6822957992553711, + "learning_rate": 5.759356008995556e-05, + "loss": 2.5265, + "step": 12826 + }, + { + "epoch": 1.0351868291501896, + "grad_norm": 0.7316287755966187, + "learning_rate": 5.7579263498090194e-05, + "loss": 2.4132, + "step": 12827 + }, + { + "epoch": 1.0352675328867726, + "grad_norm": 0.6688703894615173, + "learning_rate": 5.756496796346047e-05, + "loss": 2.4195, + "step": 12828 + }, + { + "epoch": 1.0353482366233557, + "grad_norm": 0.6894570589065552, + "learning_rate": 5.755067348642268e-05, + "loss": 2.4897, + "step": 12829 + }, + { + "epoch": 1.0354289403599386, + "grad_norm": 0.7635753750801086, + "learning_rate": 5.753638006733311e-05, + "loss": 2.4643, + "step": 12830 + }, + { + "epoch": 1.0355096440965217, + "grad_norm": 0.6353672742843628, + "learning_rate": 5.75220877065479e-05, + "loss": 2.4533, + "step": 12831 + }, + { + "epoch": 1.0355903478331048, + "grad_norm": 0.6725208759307861, + "learning_rate": 5.750779640442332e-05, + "loss": 2.4958, + "step": 12832 + }, + { + "epoch": 1.0356710515696876, + "grad_norm": 0.7350767254829407, + "learning_rate": 5.749350616131556e-05, + "loss": 2.4192, + "step": 12833 + }, + { + "epoch": 1.0357517553062707, + "grad_norm": 0.7322222590446472, + "learning_rate": 5.7479216977580695e-05, + "loss": 
2.4719, + "step": 12834 + }, + { + "epoch": 1.0358324590428536, + "grad_norm": 0.7233425974845886, + "learning_rate": 5.7464928853574904e-05, + "loss": 2.4707, + "step": 12835 + }, + { + "epoch": 1.0359131627794367, + "grad_norm": 0.7117420434951782, + "learning_rate": 5.745064178965427e-05, + "loss": 2.4463, + "step": 12836 + }, + { + "epoch": 1.0359938665160198, + "grad_norm": 0.7615050077438354, + "learning_rate": 5.743635578617486e-05, + "loss": 2.4256, + "step": 12837 + }, + { + "epoch": 1.0360745702526026, + "grad_norm": 0.7056093215942383, + "learning_rate": 5.7422070843492734e-05, + "loss": 2.4628, + "step": 12838 + }, + { + "epoch": 1.0361552739891857, + "grad_norm": 0.685989499092102, + "learning_rate": 5.740778696196389e-05, + "loss": 2.4271, + "step": 12839 + }, + { + "epoch": 1.0362359777257688, + "grad_norm": 0.7286686301231384, + "learning_rate": 5.739350414194439e-05, + "loss": 2.4984, + "step": 12840 + }, + { + "epoch": 1.0363166814623517, + "grad_norm": 0.6939802765846252, + "learning_rate": 5.737922238379009e-05, + "loss": 2.4601, + "step": 12841 + }, + { + "epoch": 1.0363973851989348, + "grad_norm": 0.7077060341835022, + "learning_rate": 5.736494168785698e-05, + "loss": 2.4264, + "step": 12842 + }, + { + "epoch": 1.0364780889355176, + "grad_norm": 0.667086124420166, + "learning_rate": 5.7350662054501016e-05, + "loss": 2.4733, + "step": 12843 + }, + { + "epoch": 1.0365587926721007, + "grad_norm": 0.6531338691711426, + "learning_rate": 5.7336383484078004e-05, + "loss": 2.4709, + "step": 12844 + }, + { + "epoch": 1.0366394964086838, + "grad_norm": 0.7141630053520203, + "learning_rate": 5.732210597694383e-05, + "loss": 2.4747, + "step": 12845 + }, + { + "epoch": 1.0367202001452667, + "grad_norm": 0.7186396718025208, + "learning_rate": 5.730782953345435e-05, + "loss": 2.4401, + "step": 12846 + }, + { + "epoch": 1.0368009038818498, + "grad_norm": 0.6709686517715454, + "learning_rate": 5.7293554153965345e-05, + "loss": 2.456, + "step": 12847 + }, + { + 
"epoch": 1.0368816076184326, + "grad_norm": 0.6867267489433289, + "learning_rate": 5.727927983883261e-05, + "loss": 2.4522, + "step": 12848 + }, + { + "epoch": 1.0369623113550157, + "grad_norm": 0.7016724348068237, + "learning_rate": 5.7265006588411926e-05, + "loss": 2.4348, + "step": 12849 + }, + { + "epoch": 1.0370430150915988, + "grad_norm": 0.6764764785766602, + "learning_rate": 5.725073440305896e-05, + "loss": 2.4241, + "step": 12850 + }, + { + "epoch": 1.0371237188281817, + "grad_norm": 0.6965062618255615, + "learning_rate": 5.7236463283129435e-05, + "loss": 2.4559, + "step": 12851 + }, + { + "epoch": 1.0372044225647647, + "grad_norm": 0.6878135800361633, + "learning_rate": 5.7222193228979037e-05, + "loss": 2.4874, + "step": 12852 + }, + { + "epoch": 1.0372851263013478, + "grad_norm": 0.6576557755470276, + "learning_rate": 5.720792424096344e-05, + "loss": 2.4273, + "step": 12853 + }, + { + "epoch": 1.0373658300379307, + "grad_norm": 0.7463123798370361, + "learning_rate": 5.719365631943818e-05, + "loss": 2.4933, + "step": 12854 + }, + { + "epoch": 1.0374465337745138, + "grad_norm": 0.6920896768569946, + "learning_rate": 5.7179389464758914e-05, + "loss": 2.4799, + "step": 12855 + }, + { + "epoch": 1.0375272375110969, + "grad_norm": 0.7330591082572937, + "learning_rate": 5.71651236772812e-05, + "loss": 2.469, + "step": 12856 + }, + { + "epoch": 1.0376079412476797, + "grad_norm": 0.6766076683998108, + "learning_rate": 5.715085895736057e-05, + "loss": 2.4787, + "step": 12857 + }, + { + "epoch": 1.0376886449842628, + "grad_norm": 0.724278450012207, + "learning_rate": 5.713659530535255e-05, + "loss": 2.4524, + "step": 12858 + }, + { + "epoch": 1.0377693487208457, + "grad_norm": 0.6816281676292419, + "learning_rate": 5.712233272161265e-05, + "loss": 2.4993, + "step": 12859 + }, + { + "epoch": 1.0378500524574288, + "grad_norm": 0.7186439633369446, + "learning_rate": 5.710807120649626e-05, + "loss": 2.4108, + "step": 12860 + }, + { + "epoch": 1.0379307561940119, + 
"grad_norm": 0.6616777181625366, + "learning_rate": 5.709381076035887e-05, + "loss": 2.4797, + "step": 12861 + }, + { + "epoch": 1.0380114599305947, + "grad_norm": 0.6956895589828491, + "learning_rate": 5.7079551383555906e-05, + "loss": 2.4017, + "step": 12862 + }, + { + "epoch": 1.0380921636671778, + "grad_norm": 0.6650584936141968, + "learning_rate": 5.706529307644268e-05, + "loss": 2.4808, + "step": 12863 + }, + { + "epoch": 1.0381728674037607, + "grad_norm": 0.6362698674201965, + "learning_rate": 5.705103583937458e-05, + "loss": 2.4077, + "step": 12864 + }, + { + "epoch": 1.0382535711403438, + "grad_norm": 0.6962565183639526, + "learning_rate": 5.703677967270697e-05, + "loss": 2.4715, + "step": 12865 + }, + { + "epoch": 1.0383342748769269, + "grad_norm": 0.6927294135093689, + "learning_rate": 5.702252457679509e-05, + "loss": 2.4983, + "step": 12866 + }, + { + "epoch": 1.0384149786135097, + "grad_norm": 0.7107497453689575, + "learning_rate": 5.70082705519942e-05, + "loss": 2.4198, + "step": 12867 + }, + { + "epoch": 1.0384956823500928, + "grad_norm": 0.6459221243858337, + "learning_rate": 5.6994017598659634e-05, + "loss": 2.4423, + "step": 12868 + }, + { + "epoch": 1.038576386086676, + "grad_norm": 0.705563485622406, + "learning_rate": 5.697976571714658e-05, + "loss": 2.5346, + "step": 12869 + }, + { + "epoch": 1.0386570898232588, + "grad_norm": 0.7424784898757935, + "learning_rate": 5.696551490781021e-05, + "loss": 2.4824, + "step": 12870 + }, + { + "epoch": 1.0387377935598419, + "grad_norm": 0.6820988059043884, + "learning_rate": 5.695126517100569e-05, + "loss": 2.4965, + "step": 12871 + }, + { + "epoch": 1.0388184972964247, + "grad_norm": 0.8209595680236816, + "learning_rate": 5.6937016507088225e-05, + "loss": 2.475, + "step": 12872 + }, + { + "epoch": 1.0388992010330078, + "grad_norm": 0.7407695055007935, + "learning_rate": 5.6922768916412815e-05, + "loss": 2.4683, + "step": 12873 + }, + { + "epoch": 1.038979904769591, + "grad_norm": 0.7335677742958069, + 
"learning_rate": 5.690852239933462e-05, + "loss": 2.4621, + "step": 12874 + }, + { + "epoch": 1.0390606085061738, + "grad_norm": 0.6731325387954712, + "learning_rate": 5.689427695620873e-05, + "loss": 2.4882, + "step": 12875 + }, + { + "epoch": 1.0391413122427569, + "grad_norm": 0.7256175875663757, + "learning_rate": 5.68800325873901e-05, + "loss": 2.4827, + "step": 12876 + }, + { + "epoch": 1.03922201597934, + "grad_norm": 0.711928129196167, + "learning_rate": 5.686578929323377e-05, + "loss": 2.4447, + "step": 12877 + }, + { + "epoch": 1.0393027197159228, + "grad_norm": 0.6445996165275574, + "learning_rate": 5.685154707409473e-05, + "loss": 2.453, + "step": 12878 + }, + { + "epoch": 1.039383423452506, + "grad_norm": 0.6656066179275513, + "learning_rate": 5.6837305930327923e-05, + "loss": 2.4863, + "step": 12879 + }, + { + "epoch": 1.0394641271890888, + "grad_norm": 0.6844663619995117, + "learning_rate": 5.682306586228828e-05, + "loss": 2.4524, + "step": 12880 + }, + { + "epoch": 1.0395448309256718, + "grad_norm": 0.6436383724212646, + "learning_rate": 5.6808826870330746e-05, + "loss": 2.4137, + "step": 12881 + }, + { + "epoch": 1.039625534662255, + "grad_norm": 0.6731196641921997, + "learning_rate": 5.6794588954810104e-05, + "loss": 2.4176, + "step": 12882 + }, + { + "epoch": 1.0397062383988378, + "grad_norm": 0.6994587779045105, + "learning_rate": 5.678035211608125e-05, + "loss": 2.4651, + "step": 12883 + }, + { + "epoch": 1.0397869421354209, + "grad_norm": 0.6912599205970764, + "learning_rate": 5.6766116354499e-05, + "loss": 2.3918, + "step": 12884 + }, + { + "epoch": 1.039867645872004, + "grad_norm": 0.7627033591270447, + "learning_rate": 5.6751881670418185e-05, + "loss": 2.4278, + "step": 12885 + }, + { + "epoch": 1.0399483496085868, + "grad_norm": 0.7107213139533997, + "learning_rate": 5.6737648064193485e-05, + "loss": 2.5249, + "step": 12886 + }, + { + "epoch": 1.04002905334517, + "grad_norm": 0.7254211902618408, + "learning_rate": 5.672341553617968e-05, + 
"loss": 2.4454, + "step": 12887 + }, + { + "epoch": 1.0401097570817528, + "grad_norm": 0.6776205897331238, + "learning_rate": 5.670918408673149e-05, + "loss": 2.4333, + "step": 12888 + }, + { + "epoch": 1.0401904608183359, + "grad_norm": 0.6824465394020081, + "learning_rate": 5.669495371620359e-05, + "loss": 2.427, + "step": 12889 + }, + { + "epoch": 1.040271164554919, + "grad_norm": 0.6633001565933228, + "learning_rate": 5.668072442495066e-05, + "loss": 2.4874, + "step": 12890 + }, + { + "epoch": 1.0403518682915018, + "grad_norm": 0.6655289530754089, + "learning_rate": 5.666649621332735e-05, + "loss": 2.5023, + "step": 12891 + }, + { + "epoch": 1.040432572028085, + "grad_norm": 0.6892853379249573, + "learning_rate": 5.665226908168818e-05, + "loss": 2.4505, + "step": 12892 + }, + { + "epoch": 1.040513275764668, + "grad_norm": 0.7154649496078491, + "learning_rate": 5.6638043030387774e-05, + "loss": 2.4916, + "step": 12893 + }, + { + "epoch": 1.0405939795012509, + "grad_norm": 0.6780592799186707, + "learning_rate": 5.662381805978074e-05, + "loss": 2.4116, + "step": 12894 + }, + { + "epoch": 1.040674683237834, + "grad_norm": 0.6737352013587952, + "learning_rate": 5.66095941702215e-05, + "loss": 2.3903, + "step": 12895 + }, + { + "epoch": 1.0407553869744168, + "grad_norm": 0.7623820304870605, + "learning_rate": 5.659537136206461e-05, + "loss": 2.4334, + "step": 12896 + }, + { + "epoch": 1.040836090711, + "grad_norm": 0.7043081521987915, + "learning_rate": 5.65811496356645e-05, + "loss": 2.4403, + "step": 12897 + }, + { + "epoch": 1.040916794447583, + "grad_norm": 0.6704873442649841, + "learning_rate": 5.6566928991375654e-05, + "loss": 2.4416, + "step": 12898 + }, + { + "epoch": 1.0409974981841659, + "grad_norm": 0.6556837558746338, + "learning_rate": 5.6552709429552474e-05, + "loss": 2.4904, + "step": 12899 + }, + { + "epoch": 1.041078201920749, + "grad_norm": 0.6926451325416565, + "learning_rate": 5.653849095054935e-05, + "loss": 2.4889, + "step": 12900 + }, + { + 
"epoch": 1.041158905657332, + "grad_norm": 0.6407613158226013, + "learning_rate": 5.6524273554720674e-05, + "loss": 2.3951, + "step": 12901 + }, + { + "epoch": 1.041239609393915, + "grad_norm": 0.7812615633010864, + "learning_rate": 5.651005724242071e-05, + "loss": 2.4535, + "step": 12902 + }, + { + "epoch": 1.041320313130498, + "grad_norm": 0.6868990659713745, + "learning_rate": 5.6495842014003796e-05, + "loss": 2.4373, + "step": 12903 + }, + { + "epoch": 1.0414010168670809, + "grad_norm": 0.6467776894569397, + "learning_rate": 5.648162786982427e-05, + "loss": 2.4929, + "step": 12904 + }, + { + "epoch": 1.041481720603664, + "grad_norm": 0.6588063836097717, + "learning_rate": 5.64674148102363e-05, + "loss": 2.4445, + "step": 12905 + }, + { + "epoch": 1.041562424340247, + "grad_norm": 0.6880654096603394, + "learning_rate": 5.6453202835594136e-05, + "loss": 2.4298, + "step": 12906 + }, + { + "epoch": 1.04164312807683, + "grad_norm": 0.7471407055854797, + "learning_rate": 5.6438991946251996e-05, + "loss": 2.4669, + "step": 12907 + }, + { + "epoch": 1.041723831813413, + "grad_norm": 0.7069533467292786, + "learning_rate": 5.6424782142564034e-05, + "loss": 2.4498, + "step": 12908 + }, + { + "epoch": 1.0418045355499959, + "grad_norm": 0.7013602256774902, + "learning_rate": 5.641057342488443e-05, + "loss": 2.4993, + "step": 12909 + }, + { + "epoch": 1.041885239286579, + "grad_norm": 0.6870697736740112, + "learning_rate": 5.6396365793567305e-05, + "loss": 2.5338, + "step": 12910 + }, + { + "epoch": 1.041965943023162, + "grad_norm": 0.6569130420684814, + "learning_rate": 5.638215924896669e-05, + "loss": 2.4538, + "step": 12911 + }, + { + "epoch": 1.042046646759745, + "grad_norm": 0.6900331377983093, + "learning_rate": 5.636795379143669e-05, + "loss": 2.4013, + "step": 12912 + }, + { + "epoch": 1.042127350496328, + "grad_norm": 0.6800071001052856, + "learning_rate": 5.635374942133136e-05, + "loss": 2.4733, + "step": 12913 + }, + { + "epoch": 1.042208054232911, + "grad_norm": 
0.703601598739624, + "learning_rate": 5.6339546139004663e-05, + "loss": 2.432, + "step": 12914 + }, + { + "epoch": 1.042288757969494, + "grad_norm": 0.6781988739967346, + "learning_rate": 5.6325343944810594e-05, + "loss": 2.4418, + "step": 12915 + }, + { + "epoch": 1.042369461706077, + "grad_norm": 0.7247167825698853, + "learning_rate": 5.6311142839103125e-05, + "loss": 2.5133, + "step": 12916 + }, + { + "epoch": 1.04245016544266, + "grad_norm": 0.7738155126571655, + "learning_rate": 5.629694282223619e-05, + "loss": 2.5137, + "step": 12917 + }, + { + "epoch": 1.042530869179243, + "grad_norm": 0.74723219871521, + "learning_rate": 5.628274389456367e-05, + "loss": 2.3996, + "step": 12918 + }, + { + "epoch": 1.042611572915826, + "grad_norm": 0.7245466709136963, + "learning_rate": 5.6268546056439456e-05, + "loss": 2.4213, + "step": 12919 + }, + { + "epoch": 1.042692276652409, + "grad_norm": 0.6307608485221863, + "learning_rate": 5.625434930821742e-05, + "loss": 2.4195, + "step": 12920 + }, + { + "epoch": 1.042772980388992, + "grad_norm": 0.7138007879257202, + "learning_rate": 5.6240153650251326e-05, + "loss": 2.463, + "step": 12921 + }, + { + "epoch": 1.042853684125575, + "grad_norm": 0.779659628868103, + "learning_rate": 5.622595908289498e-05, + "loss": 2.4898, + "step": 12922 + }, + { + "epoch": 1.042934387862158, + "grad_norm": 0.7144278287887573, + "learning_rate": 5.621176560650221e-05, + "loss": 2.4083, + "step": 12923 + }, + { + "epoch": 1.043015091598741, + "grad_norm": 0.7724754214286804, + "learning_rate": 5.619757322142667e-05, + "loss": 2.3917, + "step": 12924 + }, + { + "epoch": 1.043095795335324, + "grad_norm": 0.7667245268821716, + "learning_rate": 5.618338192802208e-05, + "loss": 2.4943, + "step": 12925 + }, + { + "epoch": 1.043176499071907, + "grad_norm": 0.6528030037879944, + "learning_rate": 5.616919172664221e-05, + "loss": 2.4323, + "step": 12926 + }, + { + "epoch": 1.04325720280849, + "grad_norm": 0.6790263652801514, + "learning_rate": 
5.6155002617640615e-05, + "loss": 2.4304, + "step": 12927 + }, + { + "epoch": 1.043337906545073, + "grad_norm": 0.7554369568824768, + "learning_rate": 5.614081460137097e-05, + "loss": 2.4637, + "step": 12928 + }, + { + "epoch": 1.043418610281656, + "grad_norm": 0.7126293182373047, + "learning_rate": 5.612662767818686e-05, + "loss": 2.4765, + "step": 12929 + }, + { + "epoch": 1.0434993140182391, + "grad_norm": 0.6705749034881592, + "learning_rate": 5.611244184844189e-05, + "loss": 2.4746, + "step": 12930 + }, + { + "epoch": 1.043580017754822, + "grad_norm": 0.6595145463943481, + "learning_rate": 5.609825711248958e-05, + "loss": 2.463, + "step": 12931 + }, + { + "epoch": 1.043660721491405, + "grad_norm": 0.6942049860954285, + "learning_rate": 5.6084073470683476e-05, + "loss": 2.5101, + "step": 12932 + }, + { + "epoch": 1.043741425227988, + "grad_norm": 0.7285810708999634, + "learning_rate": 5.6069890923377087e-05, + "loss": 2.467, + "step": 12933 + }, + { + "epoch": 1.043822128964571, + "grad_norm": 0.7702928185462952, + "learning_rate": 5.605570947092382e-05, + "loss": 2.4998, + "step": 12934 + }, + { + "epoch": 1.0439028327011541, + "grad_norm": 0.6631895899772644, + "learning_rate": 5.604152911367713e-05, + "loss": 2.4277, + "step": 12935 + }, + { + "epoch": 1.043983536437737, + "grad_norm": 0.6447882652282715, + "learning_rate": 5.6027349851990494e-05, + "loss": 2.4868, + "step": 12936 + }, + { + "epoch": 1.04406424017432, + "grad_norm": 0.695160448551178, + "learning_rate": 5.6013171686217205e-05, + "loss": 2.3917, + "step": 12937 + }, + { + "epoch": 1.0441449439109032, + "grad_norm": 0.6579271554946899, + "learning_rate": 5.5998994616710656e-05, + "loss": 2.4245, + "step": 12938 + }, + { + "epoch": 1.044225647647486, + "grad_norm": 0.7053574323654175, + "learning_rate": 5.598481864382419e-05, + "loss": 2.4809, + "step": 12939 + }, + { + "epoch": 1.0443063513840691, + "grad_norm": 0.7008736729621887, + "learning_rate": 5.5970643767911105e-05, + "loss": 2.4481, + 
"step": 12940 + }, + { + "epoch": 1.044387055120652, + "grad_norm": 0.6577918529510498, + "learning_rate": 5.5956469989324644e-05, + "loss": 2.4211, + "step": 12941 + }, + { + "epoch": 1.044467758857235, + "grad_norm": 0.6662739515304565, + "learning_rate": 5.594229730841815e-05, + "loss": 2.4607, + "step": 12942 + }, + { + "epoch": 1.0445484625938182, + "grad_norm": 0.6637060046195984, + "learning_rate": 5.592812572554471e-05, + "loss": 2.4388, + "step": 12943 + }, + { + "epoch": 1.044629166330401, + "grad_norm": 0.7282097935676575, + "learning_rate": 5.5913955241057605e-05, + "loss": 2.4536, + "step": 12944 + }, + { + "epoch": 1.0447098700669841, + "grad_norm": 0.6470810174942017, + "learning_rate": 5.589978585530997e-05, + "loss": 2.4032, + "step": 12945 + }, + { + "epoch": 1.0447905738035672, + "grad_norm": 0.6958881616592407, + "learning_rate": 5.588561756865498e-05, + "loss": 2.4577, + "step": 12946 + }, + { + "epoch": 1.04487127754015, + "grad_norm": 0.6999812722206116, + "learning_rate": 5.587145038144569e-05, + "loss": 2.454, + "step": 12947 + }, + { + "epoch": 1.0449519812767332, + "grad_norm": 0.6919988989830017, + "learning_rate": 5.58572842940352e-05, + "loss": 2.4505, + "step": 12948 + }, + { + "epoch": 1.045032685013316, + "grad_norm": 0.6813084483146667, + "learning_rate": 5.584311930677659e-05, + "loss": 2.4873, + "step": 12949 + }, + { + "epoch": 1.0451133887498991, + "grad_norm": 0.6587427854537964, + "learning_rate": 5.582895542002286e-05, + "loss": 2.4658, + "step": 12950 + }, + { + "epoch": 1.0451940924864822, + "grad_norm": 0.6942041516304016, + "learning_rate": 5.581479263412703e-05, + "loss": 2.47, + "step": 12951 + }, + { + "epoch": 1.045274796223065, + "grad_norm": 0.7330117225646973, + "learning_rate": 5.58006309494421e-05, + "loss": 2.4826, + "step": 12952 + }, + { + "epoch": 1.0453554999596482, + "grad_norm": 0.7197144031524658, + "learning_rate": 5.578647036632096e-05, + "loss": 2.4425, + "step": 12953 + }, + { + "epoch": 
1.045436203696231, + "grad_norm": 0.7442573308944702, + "learning_rate": 5.577231088511654e-05, + "loss": 2.4946, + "step": 12954 + }, + { + "epoch": 1.0455169074328141, + "grad_norm": 0.7039753198623657, + "learning_rate": 5.575815250618179e-05, + "loss": 2.4188, + "step": 12955 + }, + { + "epoch": 1.0455976111693972, + "grad_norm": 0.7374606728553772, + "learning_rate": 5.574399522986951e-05, + "loss": 2.3916, + "step": 12956 + }, + { + "epoch": 1.04567831490598, + "grad_norm": 0.6358140707015991, + "learning_rate": 5.572983905653253e-05, + "loss": 2.4502, + "step": 12957 + }, + { + "epoch": 1.0457590186425632, + "grad_norm": 0.712858259677887, + "learning_rate": 5.5715683986523694e-05, + "loss": 2.4746, + "step": 12958 + }, + { + "epoch": 1.0458397223791462, + "grad_norm": 0.6757933497428894, + "learning_rate": 5.5701530020195756e-05, + "loss": 2.4836, + "step": 12959 + }, + { + "epoch": 1.045920426115729, + "grad_norm": 0.7509831786155701, + "learning_rate": 5.568737715790151e-05, + "loss": 2.4061, + "step": 12960 + }, + { + "epoch": 1.0460011298523122, + "grad_norm": 0.7120335102081299, + "learning_rate": 5.5673225399993646e-05, + "loss": 2.4772, + "step": 12961 + }, + { + "epoch": 1.046081833588895, + "grad_norm": 0.7213751673698425, + "learning_rate": 5.5659074746824924e-05, + "loss": 2.4637, + "step": 12962 + }, + { + "epoch": 1.0461625373254781, + "grad_norm": 0.7161290645599365, + "learning_rate": 5.5644925198747934e-05, + "loss": 2.4552, + "step": 12963 + }, + { + "epoch": 1.0462432410620612, + "grad_norm": 0.7303922772407532, + "learning_rate": 5.563077675611534e-05, + "loss": 2.5091, + "step": 12964 + }, + { + "epoch": 1.046323944798644, + "grad_norm": 0.7051636576652527, + "learning_rate": 5.561662941927981e-05, + "loss": 2.3717, + "step": 12965 + }, + { + "epoch": 1.0464046485352272, + "grad_norm": 0.6880733370780945, + "learning_rate": 5.5602483188593866e-05, + "loss": 2.4205, + "step": 12966 + }, + { + "epoch": 1.0464853522718103, + "grad_norm": 
0.6942360401153564, + "learning_rate": 5.558833806441008e-05, + "loss": 2.4601, + "step": 12967 + }, + { + "epoch": 1.0465660560083931, + "grad_norm": 0.7264992594718933, + "learning_rate": 5.5574194047081016e-05, + "loss": 2.4612, + "step": 12968 + }, + { + "epoch": 1.0466467597449762, + "grad_norm": 0.7502472996711731, + "learning_rate": 5.5560051136959166e-05, + "loss": 2.4099, + "step": 12969 + }, + { + "epoch": 1.046727463481559, + "grad_norm": 0.691694438457489, + "learning_rate": 5.5545909334397004e-05, + "loss": 2.5071, + "step": 12970 + }, + { + "epoch": 1.0468081672181422, + "grad_norm": 0.7120653986930847, + "learning_rate": 5.5531768639747026e-05, + "loss": 2.4066, + "step": 12971 + }, + { + "epoch": 1.0468888709547253, + "grad_norm": 0.6501363515853882, + "learning_rate": 5.551762905336159e-05, + "loss": 2.4186, + "step": 12972 + }, + { + "epoch": 1.0469695746913081, + "grad_norm": 0.6924965977668762, + "learning_rate": 5.5503490575593095e-05, + "loss": 2.4864, + "step": 12973 + }, + { + "epoch": 1.0470502784278912, + "grad_norm": 0.6772900819778442, + "learning_rate": 5.548935320679398e-05, + "loss": 2.4101, + "step": 12974 + }, + { + "epoch": 1.0471309821644743, + "grad_norm": 0.6950967311859131, + "learning_rate": 5.54752169473165e-05, + "loss": 2.4893, + "step": 12975 + }, + { + "epoch": 1.0472116859010572, + "grad_norm": 0.6663516163825989, + "learning_rate": 5.5461081797512994e-05, + "loss": 2.4136, + "step": 12976 + }, + { + "epoch": 1.0472923896376403, + "grad_norm": 0.7337449789047241, + "learning_rate": 5.5446947757735754e-05, + "loss": 2.473, + "step": 12977 + }, + { + "epoch": 1.0473730933742231, + "grad_norm": 0.6808840036392212, + "learning_rate": 5.543281482833709e-05, + "loss": 2.4473, + "step": 12978 + }, + { + "epoch": 1.0474537971108062, + "grad_norm": 0.6472508907318115, + "learning_rate": 5.5418683009669124e-05, + "loss": 2.4077, + "step": 12979 + }, + { + "epoch": 1.0475345008473893, + "grad_norm": 0.6904192566871643, + 
"learning_rate": 5.540455230208409e-05, + "loss": 2.482, + "step": 12980 + }, + { + "epoch": 1.0476152045839722, + "grad_norm": 0.6781610250473022, + "learning_rate": 5.5390422705934264e-05, + "loss": 2.4458, + "step": 12981 + }, + { + "epoch": 1.0476959083205553, + "grad_norm": 0.7130050659179688, + "learning_rate": 5.5376294221571666e-05, + "loss": 2.5136, + "step": 12982 + }, + { + "epoch": 1.0477766120571383, + "grad_norm": 0.7727184891700745, + "learning_rate": 5.536216684934846e-05, + "loss": 2.5346, + "step": 12983 + }, + { + "epoch": 1.0478573157937212, + "grad_norm": 0.7177208662033081, + "learning_rate": 5.534804058961679e-05, + "loss": 2.4153, + "step": 12984 + }, + { + "epoch": 1.0479380195303043, + "grad_norm": 0.7333023548126221, + "learning_rate": 5.5333915442728634e-05, + "loss": 2.4171, + "step": 12985 + }, + { + "epoch": 1.0480187232668872, + "grad_norm": 0.658423125743866, + "learning_rate": 5.5319791409036046e-05, + "loss": 2.446, + "step": 12986 + }, + { + "epoch": 1.0480994270034703, + "grad_norm": 0.8305184841156006, + "learning_rate": 5.5305668488891114e-05, + "loss": 2.5026, + "step": 12987 + }, + { + "epoch": 1.0481801307400533, + "grad_norm": 0.7083305716514587, + "learning_rate": 5.52915466826457e-05, + "loss": 2.5366, + "step": 12988 + }, + { + "epoch": 1.0482608344766362, + "grad_norm": 0.7924454212188721, + "learning_rate": 5.5277425990651824e-05, + "loss": 2.528, + "step": 12989 + }, + { + "epoch": 1.0483415382132193, + "grad_norm": 0.633376955986023, + "learning_rate": 5.5263306413261384e-05, + "loss": 2.4442, + "step": 12990 + }, + { + "epoch": 1.0484222419498024, + "grad_norm": 0.7387240529060364, + "learning_rate": 5.5249187950826295e-05, + "loss": 2.4761, + "step": 12991 + }, + { + "epoch": 1.0485029456863852, + "grad_norm": 0.6796224117279053, + "learning_rate": 5.523507060369843e-05, + "loss": 2.4828, + "step": 12992 + }, + { + "epoch": 1.0485836494229683, + "grad_norm": 0.6925581097602844, + "learning_rate": 
5.5220954372229604e-05, + "loss": 2.4861, + "step": 12993 + }, + { + "epoch": 1.0486643531595512, + "grad_norm": 0.6854318380355835, + "learning_rate": 5.5206839256771704e-05, + "loss": 2.473, + "step": 12994 + }, + { + "epoch": 1.0487450568961343, + "grad_norm": 0.706375241279602, + "learning_rate": 5.519272525767643e-05, + "loss": 2.4284, + "step": 12995 + }, + { + "epoch": 1.0488257606327174, + "grad_norm": 0.6917428374290466, + "learning_rate": 5.517861237529556e-05, + "loss": 2.4702, + "step": 12996 + }, + { + "epoch": 1.0489064643693002, + "grad_norm": 0.6903818845748901, + "learning_rate": 5.516450060998086e-05, + "loss": 2.4679, + "step": 12997 + }, + { + "epoch": 1.0489871681058833, + "grad_norm": 0.6403356194496155, + "learning_rate": 5.515038996208398e-05, + "loss": 2.396, + "step": 12998 + }, + { + "epoch": 1.0490678718424662, + "grad_norm": 0.6491792798042297, + "learning_rate": 5.513628043195662e-05, + "loss": 2.4543, + "step": 12999 + }, + { + "epoch": 1.0491485755790493, + "grad_norm": 0.687303900718689, + "learning_rate": 5.512217201995043e-05, + "loss": 2.4716, + "step": 13000 + }, + { + "epoch": 1.0491485755790493, + "eval_loss": 2.4177169799804688, + "eval_runtime": 763.9215, + "eval_samples_per_second": 3.43, + "eval_steps_per_second": 0.572, + "step": 13000 + }, + { + "epoch": 1.0492292793156324, + "grad_norm": 0.7020761370658875, + "learning_rate": 5.510806472641701e-05, + "loss": 2.3591, + "step": 13001 + }, + { + "epoch": 1.0493099830522152, + "grad_norm": 0.6978075504302979, + "learning_rate": 5.509395855170798e-05, + "loss": 2.4585, + "step": 13002 + }, + { + "epoch": 1.0493906867887983, + "grad_norm": 0.7327752113342285, + "learning_rate": 5.5079853496174925e-05, + "loss": 2.5265, + "step": 13003 + }, + { + "epoch": 1.0494713905253814, + "grad_norm": 0.7552505135536194, + "learning_rate": 5.50657495601693e-05, + "loss": 2.4821, + "step": 13004 + }, + { + "epoch": 1.0495520942619643, + "grad_norm": 0.7100770473480225, + "learning_rate": 
5.5051646744042664e-05, + "loss": 2.4566, + "step": 13005 + }, + { + "epoch": 1.0496327979985474, + "grad_norm": 0.7008209824562073, + "learning_rate": 5.503754504814651e-05, + "loss": 2.4476, + "step": 13006 + }, + { + "epoch": 1.0497135017351304, + "grad_norm": 0.640724241733551, + "learning_rate": 5.502344447283223e-05, + "loss": 2.437, + "step": 13007 + }, + { + "epoch": 1.0497942054717133, + "grad_norm": 0.7064981460571289, + "learning_rate": 5.5009345018451297e-05, + "loss": 2.5129, + "step": 13008 + }, + { + "epoch": 1.0498749092082964, + "grad_norm": 0.6729782223701477, + "learning_rate": 5.49952466853551e-05, + "loss": 2.4867, + "step": 13009 + }, + { + "epoch": 1.0499556129448793, + "grad_norm": 0.7245302200317383, + "learning_rate": 5.4981149473894966e-05, + "loss": 2.4485, + "step": 13010 + }, + { + "epoch": 1.0500363166814624, + "grad_norm": 0.6686248779296875, + "learning_rate": 5.4967053384422294e-05, + "loss": 2.4314, + "step": 13011 + }, + { + "epoch": 1.0501170204180454, + "grad_norm": 0.6790863871574402, + "learning_rate": 5.495295841728836e-05, + "loss": 2.4847, + "step": 13012 + }, + { + "epoch": 1.0501977241546283, + "grad_norm": 0.6516931653022766, + "learning_rate": 5.49388645728445e-05, + "loss": 2.4306, + "step": 13013 + }, + { + "epoch": 1.0502784278912114, + "grad_norm": 0.6967600584030151, + "learning_rate": 5.492477185144189e-05, + "loss": 2.4942, + "step": 13014 + }, + { + "epoch": 1.0503591316277943, + "grad_norm": 0.696246325969696, + "learning_rate": 5.491068025343178e-05, + "loss": 2.4647, + "step": 13015 + }, + { + "epoch": 1.0504398353643774, + "grad_norm": 0.6962751150131226, + "learning_rate": 5.489658977916543e-05, + "loss": 2.5095, + "step": 13016 + }, + { + "epoch": 1.0505205391009604, + "grad_norm": 0.6982631087303162, + "learning_rate": 5.488250042899392e-05, + "loss": 2.4327, + "step": 13017 + }, + { + "epoch": 1.0506012428375433, + "grad_norm": 0.6932644844055176, + "learning_rate": 5.486841220326845e-05, + "loss": 
2.4777, + "step": 13018 + }, + { + "epoch": 1.0506819465741264, + "grad_norm": 0.6923339366912842, + "learning_rate": 5.485432510234012e-05, + "loss": 2.4321, + "step": 13019 + }, + { + "epoch": 1.0507626503107095, + "grad_norm": 0.7445859313011169, + "learning_rate": 5.4840239126560015e-05, + "loss": 2.4425, + "step": 13020 + }, + { + "epoch": 1.0508433540472923, + "grad_norm": 0.7122324705123901, + "learning_rate": 5.48261542762792e-05, + "loss": 2.4545, + "step": 13021 + }, + { + "epoch": 1.0509240577838754, + "grad_norm": 0.734779417514801, + "learning_rate": 5.4812070551848736e-05, + "loss": 2.4764, + "step": 13022 + }, + { + "epoch": 1.0510047615204583, + "grad_norm": 0.6544109582901001, + "learning_rate": 5.4797987953619566e-05, + "loss": 2.4492, + "step": 13023 + }, + { + "epoch": 1.0510854652570414, + "grad_norm": 0.6366097331047058, + "learning_rate": 5.4783906481942704e-05, + "loss": 2.4695, + "step": 13024 + }, + { + "epoch": 1.0511661689936245, + "grad_norm": 0.6966270804405212, + "learning_rate": 5.476982613716908e-05, + "loss": 2.4505, + "step": 13025 + }, + { + "epoch": 1.0512468727302073, + "grad_norm": 0.7010120153427124, + "learning_rate": 5.4755746919649665e-05, + "loss": 2.4545, + "step": 13026 + }, + { + "epoch": 1.0513275764667904, + "grad_norm": 0.6704719662666321, + "learning_rate": 5.474166882973526e-05, + "loss": 2.3899, + "step": 13027 + }, + { + "epoch": 1.0514082802033735, + "grad_norm": 0.757152259349823, + "learning_rate": 5.472759186777679e-05, + "loss": 2.5112, + "step": 13028 + }, + { + "epoch": 1.0514889839399564, + "grad_norm": 0.6668868660926819, + "learning_rate": 5.471351603412509e-05, + "loss": 2.4797, + "step": 13029 + }, + { + "epoch": 1.0515696876765395, + "grad_norm": 0.7919496893882751, + "learning_rate": 5.4699441329130887e-05, + "loss": 2.4874, + "step": 13030 + }, + { + "epoch": 1.0516503914131223, + "grad_norm": 0.7595484852790833, + "learning_rate": 5.468536775314506e-05, + "loss": 2.4621, + "step": 13031 + }, + { 
+ "epoch": 1.0517310951497054, + "grad_norm": 0.6575995683670044, + "learning_rate": 5.467129530651835e-05, + "loss": 2.4474, + "step": 13032 + }, + { + "epoch": 1.0518117988862885, + "grad_norm": 0.6817733645439148, + "learning_rate": 5.4657223989601425e-05, + "loss": 2.4329, + "step": 13033 + }, + { + "epoch": 1.0518925026228714, + "grad_norm": 0.722882091999054, + "learning_rate": 5.464315380274501e-05, + "loss": 2.4544, + "step": 13034 + }, + { + "epoch": 1.0519732063594545, + "grad_norm": 0.6957377791404724, + "learning_rate": 5.4629084746299796e-05, + "loss": 2.5669, + "step": 13035 + }, + { + "epoch": 1.0520539100960375, + "grad_norm": 0.6749420166015625, + "learning_rate": 5.461501682061636e-05, + "loss": 2.5053, + "step": 13036 + }, + { + "epoch": 1.0521346138326204, + "grad_norm": 0.8158369064331055, + "learning_rate": 5.4600950026045326e-05, + "loss": 2.429, + "step": 13037 + }, + { + "epoch": 1.0522153175692035, + "grad_norm": 0.6960736513137817, + "learning_rate": 5.458688436293735e-05, + "loss": 2.4731, + "step": 13038 + }, + { + "epoch": 1.0522960213057864, + "grad_norm": 0.6686301231384277, + "learning_rate": 5.457281983164287e-05, + "loss": 2.4495, + "step": 13039 + }, + { + "epoch": 1.0523767250423695, + "grad_norm": 0.6691476106643677, + "learning_rate": 5.455875643251248e-05, + "loss": 2.4329, + "step": 13040 + }, + { + "epoch": 1.0524574287789525, + "grad_norm": 0.7737297415733337, + "learning_rate": 5.454469416589666e-05, + "loss": 2.4664, + "step": 13041 + }, + { + "epoch": 1.0525381325155354, + "grad_norm": 0.7848188281059265, + "learning_rate": 5.453063303214588e-05, + "loss": 2.4799, + "step": 13042 + }, + { + "epoch": 1.0526188362521185, + "grad_norm": 0.7831119894981384, + "learning_rate": 5.45165730316106e-05, + "loss": 2.5076, + "step": 13043 + }, + { + "epoch": 1.0526995399887016, + "grad_norm": 0.691635012626648, + "learning_rate": 5.4502514164641196e-05, + "loss": 2.4866, + "step": 13044 + }, + { + "epoch": 1.0527802437252844, + 
"grad_norm": 0.6667110919952393, + "learning_rate": 5.4488456431588106e-05, + "loss": 2.4162, + "step": 13045 + }, + { + "epoch": 1.0528609474618675, + "grad_norm": 0.7201905846595764, + "learning_rate": 5.447439983280163e-05, + "loss": 2.498, + "step": 13046 + }, + { + "epoch": 1.0529416511984504, + "grad_norm": 0.8538106083869934, + "learning_rate": 5.44603443686321e-05, + "loss": 2.4477, + "step": 13047 + }, + { + "epoch": 1.0530223549350335, + "grad_norm": 0.6661962270736694, + "learning_rate": 5.444629003942987e-05, + "loss": 2.5253, + "step": 13048 + }, + { + "epoch": 1.0531030586716166, + "grad_norm": 0.7239834666252136, + "learning_rate": 5.4432236845545146e-05, + "loss": 2.4786, + "step": 13049 + }, + { + "epoch": 1.0531837624081994, + "grad_norm": 0.7328412532806396, + "learning_rate": 5.4418184787328186e-05, + "loss": 2.4841, + "step": 13050 + }, + { + "epoch": 1.0532644661447825, + "grad_norm": 0.6395559310913086, + "learning_rate": 5.440413386512922e-05, + "loss": 2.3544, + "step": 13051 + }, + { + "epoch": 1.0533451698813656, + "grad_norm": 0.6632471084594727, + "learning_rate": 5.43900840792984e-05, + "loss": 2.4753, + "step": 13052 + }, + { + "epoch": 1.0534258736179485, + "grad_norm": 0.7262828350067139, + "learning_rate": 5.4376035430185935e-05, + "loss": 2.4162, + "step": 13053 + }, + { + "epoch": 1.0535065773545316, + "grad_norm": 0.7897952198982239, + "learning_rate": 5.436198791814196e-05, + "loss": 2.4571, + "step": 13054 + }, + { + "epoch": 1.0535872810911144, + "grad_norm": 0.7281489372253418, + "learning_rate": 5.434794154351651e-05, + "loss": 2.4531, + "step": 13055 + }, + { + "epoch": 1.0536679848276975, + "grad_norm": 0.7322356700897217, + "learning_rate": 5.4333896306659694e-05, + "loss": 2.4102, + "step": 13056 + }, + { + "epoch": 1.0537486885642806, + "grad_norm": 0.7657945156097412, + "learning_rate": 5.4319852207921554e-05, + "loss": 2.4526, + "step": 13057 + }, + { + "epoch": 1.0538293923008635, + "grad_norm": 0.6732973456382751, 
+ "learning_rate": 5.430580924765214e-05, + "loss": 2.4516, + "step": 13058 + }, + { + "epoch": 1.0539100960374466, + "grad_norm": 0.663398027420044, + "learning_rate": 5.429176742620137e-05, + "loss": 2.4437, + "step": 13059 + }, + { + "epoch": 1.0539907997740294, + "grad_norm": 0.6363258957862854, + "learning_rate": 5.4277726743919244e-05, + "loss": 2.414, + "step": 13060 + }, + { + "epoch": 1.0540715035106125, + "grad_norm": 0.6600647568702698, + "learning_rate": 5.426368720115568e-05, + "loss": 2.4319, + "step": 13061 + }, + { + "epoch": 1.0541522072471956, + "grad_norm": 0.6941983699798584, + "learning_rate": 5.4249648798260574e-05, + "loss": 2.5247, + "step": 13062 + }, + { + "epoch": 1.0542329109837785, + "grad_norm": 0.7419719099998474, + "learning_rate": 5.423561153558383e-05, + "loss": 2.5088, + "step": 13063 + }, + { + "epoch": 1.0543136147203616, + "grad_norm": 0.708073079586029, + "learning_rate": 5.4221575413475326e-05, + "loss": 2.4037, + "step": 13064 + }, + { + "epoch": 1.0543943184569446, + "grad_norm": 0.7081628441810608, + "learning_rate": 5.4207540432284764e-05, + "loss": 2.4556, + "step": 13065 + }, + { + "epoch": 1.0544750221935275, + "grad_norm": 0.7058689594268799, + "learning_rate": 5.419350659236201e-05, + "loss": 2.4244, + "step": 13066 + }, + { + "epoch": 1.0545557259301106, + "grad_norm": 0.6858707070350647, + "learning_rate": 5.417947389405684e-05, + "loss": 2.4431, + "step": 13067 + }, + { + "epoch": 1.0546364296666935, + "grad_norm": 0.6769983768463135, + "learning_rate": 5.416544233771893e-05, + "loss": 2.4257, + "step": 13068 + }, + { + "epoch": 1.0547171334032766, + "grad_norm": 0.7128089070320129, + "learning_rate": 5.4151411923698e-05, + "loss": 2.4558, + "step": 13069 + }, + { + "epoch": 1.0547978371398596, + "grad_norm": 0.6419198513031006, + "learning_rate": 5.413738265234374e-05, + "loss": 2.4421, + "step": 13070 + }, + { + "epoch": 1.0548785408764425, + "grad_norm": 0.760848879814148, + "learning_rate": 
5.4123354524005784e-05, + "loss": 2.4427, + "step": 13071 + }, + { + "epoch": 1.0549592446130256, + "grad_norm": 0.6749173998832703, + "learning_rate": 5.410932753903377e-05, + "loss": 2.4902, + "step": 13072 + }, + { + "epoch": 1.0550399483496087, + "grad_norm": 0.6908800601959229, + "learning_rate": 5.4095301697777265e-05, + "loss": 2.4219, + "step": 13073 + }, + { + "epoch": 1.0551206520861915, + "grad_norm": 0.6779965758323669, + "learning_rate": 5.408127700058587e-05, + "loss": 2.4533, + "step": 13074 + }, + { + "epoch": 1.0552013558227746, + "grad_norm": 0.6832355260848999, + "learning_rate": 5.406725344780906e-05, + "loss": 2.418, + "step": 13075 + }, + { + "epoch": 1.0552820595593575, + "grad_norm": 0.6766698956489563, + "learning_rate": 5.4053231039796357e-05, + "loss": 2.4493, + "step": 13076 + }, + { + "epoch": 1.0553627632959406, + "grad_norm": 0.7256276607513428, + "learning_rate": 5.4039209776897285e-05, + "loss": 2.4126, + "step": 13077 + }, + { + "epoch": 1.0554434670325237, + "grad_norm": 0.6687275171279907, + "learning_rate": 5.4025189659461196e-05, + "loss": 2.435, + "step": 13078 + }, + { + "epoch": 1.0555241707691065, + "grad_norm": 0.6800444722175598, + "learning_rate": 5.401117068783758e-05, + "loss": 2.4608, + "step": 13079 + }, + { + "epoch": 1.0556048745056896, + "grad_norm": 0.6947116851806641, + "learning_rate": 5.399715286237583e-05, + "loss": 2.4908, + "step": 13080 + }, + { + "epoch": 1.0556855782422727, + "grad_norm": 0.6907915472984314, + "learning_rate": 5.398313618342521e-05, + "loss": 2.4805, + "step": 13081 + }, + { + "epoch": 1.0557662819788556, + "grad_norm": 0.7429100275039673, + "learning_rate": 5.396912065133516e-05, + "loss": 2.458, + "step": 13082 + }, + { + "epoch": 1.0558469857154387, + "grad_norm": 0.7186924815177917, + "learning_rate": 5.3955106266454994e-05, + "loss": 2.4924, + "step": 13083 + }, + { + "epoch": 1.0559276894520215, + "grad_norm": 0.7017999887466431, + "learning_rate": 5.394109302913391e-05, + "loss": 
2.4103, + "step": 13084 + }, + { + "epoch": 1.0560083931886046, + "grad_norm": 0.7318955659866333, + "learning_rate": 5.392708093972117e-05, + "loss": 2.4424, + "step": 13085 + }, + { + "epoch": 1.0560890969251877, + "grad_norm": 0.6278600692749023, + "learning_rate": 5.391306999856602e-05, + "loss": 2.4433, + "step": 13086 + }, + { + "epoch": 1.0561698006617706, + "grad_norm": 0.6895800232887268, + "learning_rate": 5.389906020601767e-05, + "loss": 2.4275, + "step": 13087 + }, + { + "epoch": 1.0562505043983537, + "grad_norm": 0.7197345495223999, + "learning_rate": 5.388505156242522e-05, + "loss": 2.4309, + "step": 13088 + }, + { + "epoch": 1.0563312081349367, + "grad_norm": 0.636433482170105, + "learning_rate": 5.3871044068137824e-05, + "loss": 2.4258, + "step": 13089 + }, + { + "epoch": 1.0564119118715196, + "grad_norm": 0.6884748339653015, + "learning_rate": 5.3857037723504634e-05, + "loss": 2.4543, + "step": 13090 + }, + { + "epoch": 1.0564926156081027, + "grad_norm": 0.7277036309242249, + "learning_rate": 5.384303252887464e-05, + "loss": 2.4911, + "step": 13091 + }, + { + "epoch": 1.0565733193446856, + "grad_norm": 0.6940809488296509, + "learning_rate": 5.38290284845969e-05, + "loss": 2.4112, + "step": 13092 + }, + { + "epoch": 1.0566540230812687, + "grad_norm": 0.6729177236557007, + "learning_rate": 5.3815025591020526e-05, + "loss": 2.4394, + "step": 13093 + }, + { + "epoch": 1.0567347268178517, + "grad_norm": 0.6941854357719421, + "learning_rate": 5.3801023848494416e-05, + "loss": 2.4263, + "step": 13094 + }, + { + "epoch": 1.0568154305544346, + "grad_norm": 0.7046812772750854, + "learning_rate": 5.3787023257367554e-05, + "loss": 2.5196, + "step": 13095 + }, + { + "epoch": 1.0568961342910177, + "grad_norm": 0.6896177530288696, + "learning_rate": 5.377302381798891e-05, + "loss": 2.4178, + "step": 13096 + }, + { + "epoch": 1.0569768380276008, + "grad_norm": 0.6693699955940247, + "learning_rate": 5.375902553070731e-05, + "loss": 2.4908, + "step": 13097 + }, + { 
+ "epoch": 1.0570575417641837, + "grad_norm": 0.6751677989959717, + "learning_rate": 5.3745028395871674e-05, + "loss": 2.4222, + "step": 13098 + }, + { + "epoch": 1.0571382455007667, + "grad_norm": 0.7666265368461609, + "learning_rate": 5.373103241383088e-05, + "loss": 2.4965, + "step": 13099 + }, + { + "epoch": 1.0572189492373496, + "grad_norm": 0.8069329857826233, + "learning_rate": 5.3717037584933674e-05, + "loss": 2.4988, + "step": 13100 + }, + { + "epoch": 1.0572996529739327, + "grad_norm": 0.7160749435424805, + "learning_rate": 5.370304390952887e-05, + "loss": 2.4311, + "step": 13101 + }, + { + "epoch": 1.0573803567105158, + "grad_norm": 0.6936448812484741, + "learning_rate": 5.368905138796523e-05, + "loss": 2.4877, + "step": 13102 + }, + { + "epoch": 1.0574610604470986, + "grad_norm": 0.7202793955802917, + "learning_rate": 5.3675060020591494e-05, + "loss": 2.4841, + "step": 13103 + }, + { + "epoch": 1.0575417641836817, + "grad_norm": 0.7750168442726135, + "learning_rate": 5.366106980775636e-05, + "loss": 2.4828, + "step": 13104 + }, + { + "epoch": 1.0576224679202646, + "grad_norm": 0.7079972624778748, + "learning_rate": 5.364708074980849e-05, + "loss": 2.4912, + "step": 13105 + }, + { + "epoch": 1.0577031716568477, + "grad_norm": 0.704066276550293, + "learning_rate": 5.363309284709657e-05, + "loss": 2.4731, + "step": 13106 + }, + { + "epoch": 1.0577838753934308, + "grad_norm": 0.7040490508079529, + "learning_rate": 5.361910609996915e-05, + "loss": 2.3811, + "step": 13107 + }, + { + "epoch": 1.0578645791300136, + "grad_norm": 0.6669453978538513, + "learning_rate": 5.360512050877484e-05, + "loss": 2.5372, + "step": 13108 + }, + { + "epoch": 1.0579452828665967, + "grad_norm": 0.7197996973991394, + "learning_rate": 5.359113607386226e-05, + "loss": 2.4612, + "step": 13109 + }, + { + "epoch": 1.0580259866031798, + "grad_norm": 0.7192320823669434, + "learning_rate": 5.3577152795579824e-05, + "loss": 2.4636, + "step": 13110 + }, + { + "epoch": 1.0581066903397627, + 
"grad_norm": 0.6907937526702881, + "learning_rate": 5.35631706742761e-05, + "loss": 2.4791, + "step": 13111 + }, + { + "epoch": 1.0581873940763458, + "grad_norm": 0.687035083770752, + "learning_rate": 5.354918971029954e-05, + "loss": 2.4706, + "step": 13112 + }, + { + "epoch": 1.0582680978129286, + "grad_norm": 0.6666533350944519, + "learning_rate": 5.353520990399861e-05, + "loss": 2.4789, + "step": 13113 + }, + { + "epoch": 1.0583488015495117, + "grad_norm": 0.6261809468269348, + "learning_rate": 5.35212312557217e-05, + "loss": 2.4485, + "step": 13114 + }, + { + "epoch": 1.0584295052860948, + "grad_norm": 0.6740814447402954, + "learning_rate": 5.350725376581725e-05, + "loss": 2.47, + "step": 13115 + }, + { + "epoch": 1.0585102090226777, + "grad_norm": 0.7634154558181763, + "learning_rate": 5.3493277434633526e-05, + "loss": 2.4685, + "step": 13116 + }, + { + "epoch": 1.0585909127592608, + "grad_norm": 0.6674611568450928, + "learning_rate": 5.34793022625189e-05, + "loss": 2.4362, + "step": 13117 + }, + { + "epoch": 1.0586716164958438, + "grad_norm": 0.7584757804870605, + "learning_rate": 5.346532824982167e-05, + "loss": 2.499, + "step": 13118 + }, + { + "epoch": 1.0587523202324267, + "grad_norm": 0.6453456282615662, + "learning_rate": 5.345135539689015e-05, + "loss": 2.4341, + "step": 13119 + }, + { + "epoch": 1.0588330239690098, + "grad_norm": 0.70013427734375, + "learning_rate": 5.343738370407247e-05, + "loss": 2.3448, + "step": 13120 + }, + { + "epoch": 1.0589137277055927, + "grad_norm": 0.6763362884521484, + "learning_rate": 5.342341317171693e-05, + "loss": 2.4234, + "step": 13121 + }, + { + "epoch": 1.0589944314421758, + "grad_norm": 0.6896576881408691, + "learning_rate": 5.3409443800171664e-05, + "loss": 2.4753, + "step": 13122 + }, + { + "epoch": 1.0590751351787588, + "grad_norm": 0.6984997987747192, + "learning_rate": 5.339547558978486e-05, + "loss": 2.4581, + "step": 13123 + }, + { + "epoch": 1.0591558389153417, + "grad_norm": 0.7276118993759155, + 
"learning_rate": 5.338150854090462e-05, + "loss": 2.4765, + "step": 13124 + }, + { + "epoch": 1.0592365426519248, + "grad_norm": 0.6943252086639404, + "learning_rate": 5.336754265387911e-05, + "loss": 2.4514, + "step": 13125 + }, + { + "epoch": 1.0593172463885079, + "grad_norm": 0.7070014476776123, + "learning_rate": 5.335357792905628e-05, + "loss": 2.4365, + "step": 13126 + }, + { + "epoch": 1.0593979501250907, + "grad_norm": 0.6887189149856567, + "learning_rate": 5.333961436678422e-05, + "loss": 2.4834, + "step": 13127 + }, + { + "epoch": 1.0594786538616738, + "grad_norm": 0.8150162696838379, + "learning_rate": 5.332565196741098e-05, + "loss": 2.4474, + "step": 13128 + }, + { + "epoch": 1.0595593575982567, + "grad_norm": 0.6681316494941711, + "learning_rate": 5.331169073128447e-05, + "loss": 2.4888, + "step": 13129 + }, + { + "epoch": 1.0596400613348398, + "grad_norm": 0.6696690320968628, + "learning_rate": 5.329773065875267e-05, + "loss": 2.3874, + "step": 13130 + }, + { + "epoch": 1.0597207650714229, + "grad_norm": 0.729807436466217, + "learning_rate": 5.32837717501635e-05, + "loss": 2.4442, + "step": 13131 + }, + { + "epoch": 1.0598014688080057, + "grad_norm": 0.6959047913551331, + "learning_rate": 5.326981400586486e-05, + "loss": 2.4697, + "step": 13132 + }, + { + "epoch": 1.0598821725445888, + "grad_norm": 0.667294442653656, + "learning_rate": 5.3255857426204606e-05, + "loss": 2.3986, + "step": 13133 + }, + { + "epoch": 1.059962876281172, + "grad_norm": 0.6953842639923096, + "learning_rate": 5.3241902011530566e-05, + "loss": 2.396, + "step": 13134 + }, + { + "epoch": 1.0600435800177548, + "grad_norm": 0.6544597148895264, + "learning_rate": 5.32279477621906e-05, + "loss": 2.426, + "step": 13135 + }, + { + "epoch": 1.0601242837543379, + "grad_norm": 0.708017885684967, + "learning_rate": 5.321399467853241e-05, + "loss": 2.4931, + "step": 13136 + }, + { + "epoch": 1.0602049874909207, + "grad_norm": 0.6669809818267822, + "learning_rate": 5.3200042760903764e-05, + 
"loss": 2.4354, + "step": 13137 + }, + { + "epoch": 1.0602856912275038, + "grad_norm": 1.0144098997116089, + "learning_rate": 5.3186092009652435e-05, + "loss": 2.4803, + "step": 13138 + }, + { + "epoch": 1.060366394964087, + "grad_norm": 0.7213768362998962, + "learning_rate": 5.317214242512601e-05, + "loss": 2.4318, + "step": 13139 + }, + { + "epoch": 1.0604470987006698, + "grad_norm": 0.6429069638252258, + "learning_rate": 5.315819400767223e-05, + "loss": 2.458, + "step": 13140 + }, + { + "epoch": 1.0605278024372529, + "grad_norm": 0.6480485796928406, + "learning_rate": 5.3144246757638714e-05, + "loss": 2.4586, + "step": 13141 + }, + { + "epoch": 1.060608506173836, + "grad_norm": 0.7037697434425354, + "learning_rate": 5.3130300675373035e-05, + "loss": 2.4698, + "step": 13142 + }, + { + "epoch": 1.0606892099104188, + "grad_norm": 0.7307559251785278, + "learning_rate": 5.3116355761222725e-05, + "loss": 2.4027, + "step": 13143 + }, + { + "epoch": 1.060769913647002, + "grad_norm": 0.6684615612030029, + "learning_rate": 5.310241201553547e-05, + "loss": 2.478, + "step": 13144 + }, + { + "epoch": 1.0608506173835848, + "grad_norm": 0.7018016576766968, + "learning_rate": 5.308846943865866e-05, + "loss": 2.4229, + "step": 13145 + }, + { + "epoch": 1.0609313211201679, + "grad_norm": 0.7538621425628662, + "learning_rate": 5.307452803093982e-05, + "loss": 2.5201, + "step": 13146 + }, + { + "epoch": 1.061012024856751, + "grad_norm": 0.6957963109016418, + "learning_rate": 5.306058779272645e-05, + "loss": 2.4233, + "step": 13147 + }, + { + "epoch": 1.0610927285933338, + "grad_norm": 0.6280590295791626, + "learning_rate": 5.304664872436588e-05, + "loss": 2.5117, + "step": 13148 + }, + { + "epoch": 1.061173432329917, + "grad_norm": 0.6937280297279358, + "learning_rate": 5.3032710826205564e-05, + "loss": 2.4889, + "step": 13149 + }, + { + "epoch": 1.0612541360664998, + "grad_norm": 0.6750391125679016, + "learning_rate": 5.3018774098592884e-05, + "loss": 2.4472, + "step": 13150 + }, 
+ { + "epoch": 1.0613348398030829, + "grad_norm": 0.6931902766227722, + "learning_rate": 5.300483854187519e-05, + "loss": 2.3883, + "step": 13151 + }, + { + "epoch": 1.061415543539666, + "grad_norm": 0.6982774138450623, + "learning_rate": 5.2990904156399726e-05, + "loss": 2.4688, + "step": 13152 + }, + { + "epoch": 1.0614962472762488, + "grad_norm": 0.6873522996902466, + "learning_rate": 5.297697094251382e-05, + "loss": 2.4818, + "step": 13153 + }, + { + "epoch": 1.061576951012832, + "grad_norm": 0.635377049446106, + "learning_rate": 5.296303890056471e-05, + "loss": 2.3906, + "step": 13154 + }, + { + "epoch": 1.061657654749415, + "grad_norm": 0.6368159651756287, + "learning_rate": 5.294910803089963e-05, + "loss": 2.4714, + "step": 13155 + }, + { + "epoch": 1.0617383584859978, + "grad_norm": 0.7147238254547119, + "learning_rate": 5.293517833386576e-05, + "loss": 2.4746, + "step": 13156 + }, + { + "epoch": 1.061819062222581, + "grad_norm": 0.742189884185791, + "learning_rate": 5.2921249809810326e-05, + "loss": 2.3913, + "step": 13157 + }, + { + "epoch": 1.061899765959164, + "grad_norm": 0.6665734648704529, + "learning_rate": 5.290732245908038e-05, + "loss": 2.4263, + "step": 13158 + }, + { + "epoch": 1.0619804696957469, + "grad_norm": 0.6894757747650146, + "learning_rate": 5.2893396282023055e-05, + "loss": 2.4204, + "step": 13159 + }, + { + "epoch": 1.06206117343233, + "grad_norm": 0.6394561529159546, + "learning_rate": 5.287947127898546e-05, + "loss": 2.4183, + "step": 13160 + }, + { + "epoch": 1.0621418771689128, + "grad_norm": 0.7422548532485962, + "learning_rate": 5.2865547450314576e-05, + "loss": 2.4454, + "step": 13161 + }, + { + "epoch": 1.062222580905496, + "grad_norm": 0.7486133575439453, + "learning_rate": 5.285162479635748e-05, + "loss": 2.4856, + "step": 13162 + }, + { + "epoch": 1.062303284642079, + "grad_norm": 0.6743031144142151, + "learning_rate": 5.283770331746112e-05, + "loss": 2.4318, + "step": 13163 + }, + { + "epoch": 1.0623839883786619, + 
"grad_norm": 0.6461686491966248, + "learning_rate": 5.282378301397248e-05, + "loss": 2.4133, + "step": 13164 + }, + { + "epoch": 1.062464692115245, + "grad_norm": 0.6745431423187256, + "learning_rate": 5.28098638862385e-05, + "loss": 2.4463, + "step": 13165 + }, + { + "epoch": 1.0625453958518278, + "grad_norm": 0.6646310687065125, + "learning_rate": 5.279594593460606e-05, + "loss": 2.4211, + "step": 13166 + }, + { + "epoch": 1.062626099588411, + "grad_norm": 0.6789249777793884, + "learning_rate": 5.278202915942207e-05, + "loss": 2.4832, + "step": 13167 + }, + { + "epoch": 1.062706803324994, + "grad_norm": 0.7082679867744446, + "learning_rate": 5.2768113561033326e-05, + "loss": 2.4303, + "step": 13168 + }, + { + "epoch": 1.0627875070615769, + "grad_norm": 0.6875587701797485, + "learning_rate": 5.275419913978664e-05, + "loss": 2.4601, + "step": 13169 + }, + { + "epoch": 1.06286821079816, + "grad_norm": 0.6556203961372375, + "learning_rate": 5.274028589602886e-05, + "loss": 2.4359, + "step": 13170 + }, + { + "epoch": 1.062948914534743, + "grad_norm": 0.7280015349388123, + "learning_rate": 5.272637383010666e-05, + "loss": 2.4999, + "step": 13171 + }, + { + "epoch": 1.063029618271326, + "grad_norm": 0.664654016494751, + "learning_rate": 5.271246294236678e-05, + "loss": 2.3951, + "step": 13172 + }, + { + "epoch": 1.063110322007909, + "grad_norm": 0.6941719055175781, + "learning_rate": 5.2698553233155945e-05, + "loss": 2.45, + "step": 13173 + }, + { + "epoch": 1.0631910257444919, + "grad_norm": 0.7212931513786316, + "learning_rate": 5.268464470282082e-05, + "loss": 2.4615, + "step": 13174 + }, + { + "epoch": 1.063271729481075, + "grad_norm": 0.6877106428146362, + "learning_rate": 5.2670737351708014e-05, + "loss": 2.4495, + "step": 13175 + }, + { + "epoch": 1.063352433217658, + "grad_norm": 0.737718939781189, + "learning_rate": 5.26568311801642e-05, + "loss": 2.4971, + "step": 13176 + }, + { + "epoch": 1.063433136954241, + "grad_norm": 0.6909129619598389, + 
"learning_rate": 5.264292618853587e-05, + "loss": 2.4889, + "step": 13177 + }, + { + "epoch": 1.063513840690824, + "grad_norm": 0.6750304102897644, + "learning_rate": 5.262902237716961e-05, + "loss": 2.4779, + "step": 13178 + }, + { + "epoch": 1.063594544427407, + "grad_norm": 0.7256019115447998, + "learning_rate": 5.2615119746411954e-05, + "loss": 2.4904, + "step": 13179 + }, + { + "epoch": 1.06367524816399, + "grad_norm": 0.7335983514785767, + "learning_rate": 5.26012182966094e-05, + "loss": 2.4357, + "step": 13180 + }, + { + "epoch": 1.063755951900573, + "grad_norm": 0.6534200310707092, + "learning_rate": 5.258731802810837e-05, + "loss": 2.4213, + "step": 13181 + }, + { + "epoch": 1.063836655637156, + "grad_norm": 0.6899768114089966, + "learning_rate": 5.257341894125529e-05, + "loss": 2.4963, + "step": 13182 + }, + { + "epoch": 1.063917359373739, + "grad_norm": 0.7016159892082214, + "learning_rate": 5.25595210363966e-05, + "loss": 2.4583, + "step": 13183 + }, + { + "epoch": 1.063998063110322, + "grad_norm": 0.6868152022361755, + "learning_rate": 5.2545624313878636e-05, + "loss": 2.4523, + "step": 13184 + }, + { + "epoch": 1.064078766846905, + "grad_norm": 0.7442622184753418, + "learning_rate": 5.2531728774047785e-05, + "loss": 2.425, + "step": 13185 + }, + { + "epoch": 1.064159470583488, + "grad_norm": 0.6900869011878967, + "learning_rate": 5.251783441725037e-05, + "loss": 2.459, + "step": 13186 + }, + { + "epoch": 1.0642401743200711, + "grad_norm": 0.6910288333892822, + "learning_rate": 5.25039412438326e-05, + "loss": 2.4882, + "step": 13187 + }, + { + "epoch": 1.064320878056654, + "grad_norm": 0.7644359469413757, + "learning_rate": 5.249004925414076e-05, + "loss": 2.4663, + "step": 13188 + }, + { + "epoch": 1.064401581793237, + "grad_norm": 0.6703082919120789, + "learning_rate": 5.247615844852114e-05, + "loss": 2.4309, + "step": 13189 + }, + { + "epoch": 1.06448228552982, + "grad_norm": 0.6449835896492004, + "learning_rate": 5.246226882731983e-05, + "loss": 
2.4307, + "step": 13190 + }, + { + "epoch": 1.064562989266403, + "grad_norm": 0.7332713603973389, + "learning_rate": 5.244838039088305e-05, + "loss": 2.3763, + "step": 13191 + }, + { + "epoch": 1.0646436930029861, + "grad_norm": 0.7626641988754272, + "learning_rate": 5.2434493139556974e-05, + "loss": 2.4167, + "step": 13192 + }, + { + "epoch": 1.064724396739569, + "grad_norm": 0.6924002170562744, + "learning_rate": 5.2420607073687614e-05, + "loss": 2.4751, + "step": 13193 + }, + { + "epoch": 1.064805100476152, + "grad_norm": 0.6815003156661987, + "learning_rate": 5.2406722193621074e-05, + "loss": 2.4731, + "step": 13194 + }, + { + "epoch": 1.064885804212735, + "grad_norm": 0.7632609009742737, + "learning_rate": 5.239283849970347e-05, + "loss": 2.4562, + "step": 13195 + }, + { + "epoch": 1.064966507949318, + "grad_norm": 0.7157592177391052, + "learning_rate": 5.23789559922808e-05, + "loss": 2.4507, + "step": 13196 + }, + { + "epoch": 1.065047211685901, + "grad_norm": 0.7035543918609619, + "learning_rate": 5.2365074671699e-05, + "loss": 2.4616, + "step": 13197 + }, + { + "epoch": 1.065127915422484, + "grad_norm": 0.7566644549369812, + "learning_rate": 5.235119453830406e-05, + "loss": 2.4751, + "step": 13198 + }, + { + "epoch": 1.065208619159067, + "grad_norm": 0.7030916213989258, + "learning_rate": 5.233731559244194e-05, + "loss": 2.381, + "step": 13199 + }, + { + "epoch": 1.0652893228956501, + "grad_norm": 0.7663755416870117, + "learning_rate": 5.232343783445847e-05, + "loss": 2.4822, + "step": 13200 + }, + { + "epoch": 1.065370026632233, + "grad_norm": 0.717767596244812, + "learning_rate": 5.230956126469955e-05, + "loss": 2.4807, + "step": 13201 + }, + { + "epoch": 1.065450730368816, + "grad_norm": 0.6920818090438843, + "learning_rate": 5.229568588351108e-05, + "loss": 2.4643, + "step": 13202 + }, + { + "epoch": 1.0655314341053992, + "grad_norm": 0.6812553405761719, + "learning_rate": 5.228181169123877e-05, + "loss": 2.4443, + "step": 13203 + }, + { + "epoch": 
1.065612137841982, + "grad_norm": 0.7241889834403992, + "learning_rate": 5.226793868822846e-05, + "loss": 2.4581, + "step": 13204 + }, + { + "epoch": 1.0656928415785651, + "grad_norm": 0.7254642248153687, + "learning_rate": 5.225406687482588e-05, + "loss": 2.4999, + "step": 13205 + }, + { + "epoch": 1.065773545315148, + "grad_norm": 0.7316950559616089, + "learning_rate": 5.2240196251376764e-05, + "loss": 2.4493, + "step": 13206 + }, + { + "epoch": 1.065854249051731, + "grad_norm": 0.7208307385444641, + "learning_rate": 5.22263268182268e-05, + "loss": 2.5083, + "step": 13207 + }, + { + "epoch": 1.0659349527883142, + "grad_norm": 0.6552214622497559, + "learning_rate": 5.22124585757217e-05, + "loss": 2.4662, + "step": 13208 + }, + { + "epoch": 1.066015656524897, + "grad_norm": 0.7949681878089905, + "learning_rate": 5.219859152420701e-05, + "loss": 2.4584, + "step": 13209 + }, + { + "epoch": 1.0660963602614801, + "grad_norm": 0.7012154459953308, + "learning_rate": 5.2184725664028366e-05, + "loss": 2.4702, + "step": 13210 + }, + { + "epoch": 1.066177063998063, + "grad_norm": 0.7431927919387817, + "learning_rate": 5.217086099553136e-05, + "loss": 2.4422, + "step": 13211 + }, + { + "epoch": 1.066257767734646, + "grad_norm": 0.7235366702079773, + "learning_rate": 5.2156997519061554e-05, + "loss": 2.4173, + "step": 13212 + }, + { + "epoch": 1.0663384714712292, + "grad_norm": 0.7475029826164246, + "learning_rate": 5.214313523496439e-05, + "loss": 2.4924, + "step": 13213 + }, + { + "epoch": 1.066419175207812, + "grad_norm": 0.6326786875724792, + "learning_rate": 5.212927414358542e-05, + "loss": 2.4154, + "step": 13214 + }, + { + "epoch": 1.0664998789443951, + "grad_norm": 0.6755837798118591, + "learning_rate": 5.211541424527004e-05, + "loss": 2.4248, + "step": 13215 + }, + { + "epoch": 1.0665805826809782, + "grad_norm": 0.645395040512085, + "learning_rate": 5.210155554036373e-05, + "loss": 2.4078, + "step": 13216 + }, + { + "epoch": 1.066661286417561, + "grad_norm": 
0.799913763999939, + "learning_rate": 5.208769802921185e-05, + "loss": 2.5067, + "step": 13217 + }, + { + "epoch": 1.0667419901541442, + "grad_norm": 0.7056344747543335, + "learning_rate": 5.207384171215983e-05, + "loss": 2.4817, + "step": 13218 + }, + { + "epoch": 1.0668226938907273, + "grad_norm": 0.7082187533378601, + "learning_rate": 5.205998658955291e-05, + "loss": 2.4495, + "step": 13219 + }, + { + "epoch": 1.0669033976273101, + "grad_norm": 0.6948464512825012, + "learning_rate": 5.204613266173646e-05, + "loss": 2.4584, + "step": 13220 + }, + { + "epoch": 1.0669841013638932, + "grad_norm": 0.7812542915344238, + "learning_rate": 5.203227992905575e-05, + "loss": 2.4803, + "step": 13221 + }, + { + "epoch": 1.067064805100476, + "grad_norm": 0.6892200708389282, + "learning_rate": 5.201842839185598e-05, + "loss": 2.4424, + "step": 13222 + }, + { + "epoch": 1.0671455088370592, + "grad_norm": 0.6982070803642273, + "learning_rate": 5.20045780504824e-05, + "loss": 2.4654, + "step": 13223 + }, + { + "epoch": 1.0672262125736423, + "grad_norm": 0.6799101233482361, + "learning_rate": 5.1990728905280205e-05, + "loss": 2.4748, + "step": 13224 + }, + { + "epoch": 1.0673069163102251, + "grad_norm": 0.6703687906265259, + "learning_rate": 5.1976880956594544e-05, + "loss": 2.4459, + "step": 13225 + }, + { + "epoch": 1.0673876200468082, + "grad_norm": 0.6821435689926147, + "learning_rate": 5.196303420477053e-05, + "loss": 2.4517, + "step": 13226 + }, + { + "epoch": 1.067468323783391, + "grad_norm": 0.6369695067405701, + "learning_rate": 5.194918865015328e-05, + "loss": 2.4388, + "step": 13227 + }, + { + "epoch": 1.0675490275199742, + "grad_norm": 0.6465736627578735, + "learning_rate": 5.1935344293087885e-05, + "loss": 2.3839, + "step": 13228 + }, + { + "epoch": 1.0676297312565572, + "grad_norm": 0.6745415329933167, + "learning_rate": 5.192150113391933e-05, + "loss": 2.4676, + "step": 13229 + }, + { + "epoch": 1.0677104349931401, + "grad_norm": 0.7605211138725281, + 
"learning_rate": 5.190765917299263e-05, + "loss": 2.4764, + "step": 13230 + }, + { + "epoch": 1.0677911387297232, + "grad_norm": 0.7040959596633911, + "learning_rate": 5.1893818410652825e-05, + "loss": 2.4727, + "step": 13231 + }, + { + "epoch": 1.0678718424663063, + "grad_norm": 0.6718928813934326, + "learning_rate": 5.1879978847244785e-05, + "loss": 2.4308, + "step": 13232 + }, + { + "epoch": 1.0679525462028892, + "grad_norm": 0.6788188219070435, + "learning_rate": 5.1866140483113445e-05, + "loss": 2.4278, + "step": 13233 + }, + { + "epoch": 1.0680332499394722, + "grad_norm": 0.7310218811035156, + "learning_rate": 5.185230331860371e-05, + "loss": 2.4585, + "step": 13234 + }, + { + "epoch": 1.068113953676055, + "grad_norm": 0.8092277646064758, + "learning_rate": 5.183846735406044e-05, + "loss": 2.4128, + "step": 13235 + }, + { + "epoch": 1.0681946574126382, + "grad_norm": 0.6469862461090088, + "learning_rate": 5.182463258982846e-05, + "loss": 2.4315, + "step": 13236 + }, + { + "epoch": 1.0682753611492213, + "grad_norm": 0.7948115468025208, + "learning_rate": 5.181079902625261e-05, + "loss": 2.5127, + "step": 13237 + }, + { + "epoch": 1.0683560648858041, + "grad_norm": 0.6988852620124817, + "learning_rate": 5.179696666367757e-05, + "loss": 2.432, + "step": 13238 + }, + { + "epoch": 1.0684367686223872, + "grad_norm": 0.6914555430412292, + "learning_rate": 5.1783135502448124e-05, + "loss": 2.4748, + "step": 13239 + }, + { + "epoch": 1.0685174723589703, + "grad_norm": 0.7586313486099243, + "learning_rate": 5.176930554290902e-05, + "loss": 2.4522, + "step": 13240 + }, + { + "epoch": 1.0685981760955532, + "grad_norm": 0.6763948798179626, + "learning_rate": 5.175547678540487e-05, + "loss": 2.4477, + "step": 13241 + }, + { + "epoch": 1.0686788798321363, + "grad_norm": 0.7625983357429504, + "learning_rate": 5.1741649230280334e-05, + "loss": 2.4725, + "step": 13242 + }, + { + "epoch": 1.0687595835687191, + "grad_norm": 0.6574710011482239, + "learning_rate": 
5.172782287788005e-05, + "loss": 2.4212, + "step": 13243 + }, + { + "epoch": 1.0688402873053022, + "grad_norm": 0.770062267780304, + "learning_rate": 5.1713997728548615e-05, + "loss": 2.5065, + "step": 13244 + }, + { + "epoch": 1.0689209910418853, + "grad_norm": 0.7719037532806396, + "learning_rate": 5.170017378263057e-05, + "loss": 2.5082, + "step": 13245 + }, + { + "epoch": 1.0690016947784682, + "grad_norm": 0.7106119394302368, + "learning_rate": 5.168635104047046e-05, + "loss": 2.4922, + "step": 13246 + }, + { + "epoch": 1.0690823985150513, + "grad_norm": 0.711815595626831, + "learning_rate": 5.167252950241281e-05, + "loss": 2.498, + "step": 13247 + }, + { + "epoch": 1.0691631022516344, + "grad_norm": 0.6926038265228271, + "learning_rate": 5.165870916880201e-05, + "loss": 2.4464, + "step": 13248 + }, + { + "epoch": 1.0692438059882172, + "grad_norm": 0.6959360241889954, + "learning_rate": 5.164489003998254e-05, + "loss": 2.4668, + "step": 13249 + }, + { + "epoch": 1.0693245097248003, + "grad_norm": 0.7165184617042542, + "learning_rate": 5.1631072116298875e-05, + "loss": 2.4198, + "step": 13250 + }, + { + "epoch": 1.0694052134613832, + "grad_norm": 0.7133236527442932, + "learning_rate": 5.161725539809527e-05, + "loss": 2.4691, + "step": 13251 + }, + { + "epoch": 1.0694859171979663, + "grad_norm": 0.7057758569717407, + "learning_rate": 5.160343988571613e-05, + "loss": 2.466, + "step": 13252 + }, + { + "epoch": 1.0695666209345494, + "grad_norm": 0.6808326244354248, + "learning_rate": 5.158962557950583e-05, + "loss": 2.4248, + "step": 13253 + }, + { + "epoch": 1.0696473246711322, + "grad_norm": 0.7166025638580322, + "learning_rate": 5.1575812479808563e-05, + "loss": 2.4753, + "step": 13254 + }, + { + "epoch": 1.0697280284077153, + "grad_norm": 0.7395358085632324, + "learning_rate": 5.156200058696863e-05, + "loss": 2.485, + "step": 13255 + }, + { + "epoch": 1.0698087321442982, + "grad_norm": 0.681106686592102, + "learning_rate": 5.154818990133026e-05, + "loss": 
2.5077, + "step": 13256 + }, + { + "epoch": 1.0698894358808813, + "grad_norm": 0.7517002820968628, + "learning_rate": 5.153438042323766e-05, + "loss": 2.5093, + "step": 13257 + }, + { + "epoch": 1.0699701396174643, + "grad_norm": 0.6516926288604736, + "learning_rate": 5.152057215303499e-05, + "loss": 2.4416, + "step": 13258 + }, + { + "epoch": 1.0700508433540472, + "grad_norm": 0.6930893063545227, + "learning_rate": 5.150676509106638e-05, + "loss": 2.506, + "step": 13259 + }, + { + "epoch": 1.0701315470906303, + "grad_norm": 0.7737041115760803, + "learning_rate": 5.1492959237675986e-05, + "loss": 2.4355, + "step": 13260 + }, + { + "epoch": 1.0702122508272134, + "grad_norm": 0.7274872660636902, + "learning_rate": 5.14791545932078e-05, + "loss": 2.5552, + "step": 13261 + }, + { + "epoch": 1.0702929545637963, + "grad_norm": 0.7112408876419067, + "learning_rate": 5.146535115800593e-05, + "loss": 2.4041, + "step": 13262 + }, + { + "epoch": 1.0703736583003793, + "grad_norm": 0.6822024583816528, + "learning_rate": 5.1451548932414415e-05, + "loss": 2.4346, + "step": 13263 + }, + { + "epoch": 1.0704543620369624, + "grad_norm": 0.6590598225593567, + "learning_rate": 5.1437747916777165e-05, + "loss": 2.3946, + "step": 13264 + }, + { + "epoch": 1.0705350657735453, + "grad_norm": 0.643014132976532, + "learning_rate": 5.142394811143818e-05, + "loss": 2.4455, + "step": 13265 + }, + { + "epoch": 1.0706157695101284, + "grad_norm": 0.6480194926261902, + "learning_rate": 5.141014951674139e-05, + "loss": 2.4304, + "step": 13266 + }, + { + "epoch": 1.0706964732467112, + "grad_norm": 0.6933526992797852, + "learning_rate": 5.139635213303069e-05, + "loss": 2.4627, + "step": 13267 + }, + { + "epoch": 1.0707771769832943, + "grad_norm": 0.6832638382911682, + "learning_rate": 5.138255596064995e-05, + "loss": 2.4645, + "step": 13268 + }, + { + "epoch": 1.0708578807198774, + "grad_norm": 0.6579757928848267, + "learning_rate": 5.1368760999943034e-05, + "loss": 2.3928, + "step": 13269 + }, + { + 
"epoch": 1.0709385844564603, + "grad_norm": 0.6658132672309875, + "learning_rate": 5.1354967251253684e-05, + "loss": 2.4732, + "step": 13270 + }, + { + "epoch": 1.0710192881930434, + "grad_norm": 0.7610828876495361, + "learning_rate": 5.13411747149257e-05, + "loss": 2.4781, + "step": 13271 + }, + { + "epoch": 1.0710999919296262, + "grad_norm": 0.682858943939209, + "learning_rate": 5.1327383391302895e-05, + "loss": 2.4545, + "step": 13272 + }, + { + "epoch": 1.0711806956662093, + "grad_norm": 0.7461360692977905, + "learning_rate": 5.131359328072887e-05, + "loss": 2.4647, + "step": 13273 + }, + { + "epoch": 1.0712613994027924, + "grad_norm": 0.6767961382865906, + "learning_rate": 5.129980438354738e-05, + "loss": 2.4562, + "step": 13274 + }, + { + "epoch": 1.0713421031393753, + "grad_norm": 0.6768184304237366, + "learning_rate": 5.1286016700102066e-05, + "loss": 2.4662, + "step": 13275 + }, + { + "epoch": 1.0714228068759584, + "grad_norm": 0.7022743225097656, + "learning_rate": 5.1272230230736554e-05, + "loss": 2.4321, + "step": 13276 + }, + { + "epoch": 1.0715035106125415, + "grad_norm": 0.725488007068634, + "learning_rate": 5.125844497579444e-05, + "loss": 2.457, + "step": 13277 + }, + { + "epoch": 1.0715842143491243, + "grad_norm": 0.7542931437492371, + "learning_rate": 5.124466093561928e-05, + "loss": 2.4302, + "step": 13278 + }, + { + "epoch": 1.0716649180857074, + "grad_norm": 0.6598316431045532, + "learning_rate": 5.123087811055467e-05, + "loss": 2.4552, + "step": 13279 + }, + { + "epoch": 1.0717456218222903, + "grad_norm": 0.7533490061759949, + "learning_rate": 5.1217096500944017e-05, + "loss": 2.4778, + "step": 13280 + }, + { + "epoch": 1.0718263255588734, + "grad_norm": 0.6890795826911926, + "learning_rate": 5.1203316107130825e-05, + "loss": 2.4349, + "step": 13281 + }, + { + "epoch": 1.0719070292954564, + "grad_norm": 0.7004082202911377, + "learning_rate": 5.118953692945862e-05, + "loss": 2.4645, + "step": 13282 + }, + { + "epoch": 1.0719877330320393, + 
"grad_norm": 0.7409259676933289, + "learning_rate": 5.117575896827068e-05, + "loss": 2.4734, + "step": 13283 + }, + { + "epoch": 1.0720684367686224, + "grad_norm": 0.7035481929779053, + "learning_rate": 5.116198222391046e-05, + "loss": 2.5027, + "step": 13284 + }, + { + "epoch": 1.0721491405052055, + "grad_norm": 0.7146698236465454, + "learning_rate": 5.114820669672132e-05, + "loss": 2.4623, + "step": 13285 + }, + { + "epoch": 1.0722298442417884, + "grad_norm": 0.7813882231712341, + "learning_rate": 5.113443238704656e-05, + "loss": 2.4644, + "step": 13286 + }, + { + "epoch": 1.0723105479783714, + "grad_norm": 0.6592430472373962, + "learning_rate": 5.1120659295229486e-05, + "loss": 2.4682, + "step": 13287 + }, + { + "epoch": 1.0723912517149543, + "grad_norm": 0.7047967910766602, + "learning_rate": 5.1106887421613395e-05, + "loss": 2.4368, + "step": 13288 + }, + { + "epoch": 1.0724719554515374, + "grad_norm": 0.700977087020874, + "learning_rate": 5.109311676654143e-05, + "loss": 2.4471, + "step": 13289 + }, + { + "epoch": 1.0725526591881205, + "grad_norm": 0.6821093559265137, + "learning_rate": 5.107934733035684e-05, + "loss": 2.433, + "step": 13290 + }, + { + "epoch": 1.0726333629247033, + "grad_norm": 0.6579930186271667, + "learning_rate": 5.1065579113402794e-05, + "loss": 2.4527, + "step": 13291 + }, + { + "epoch": 1.0727140666612864, + "grad_norm": 0.658514678478241, + "learning_rate": 5.105181211602248e-05, + "loss": 2.4443, + "step": 13292 + }, + { + "epoch": 1.0727947703978695, + "grad_norm": 0.6963977217674255, + "learning_rate": 5.103804633855891e-05, + "loss": 2.4699, + "step": 13293 + }, + { + "epoch": 1.0728754741344524, + "grad_norm": 0.6670787334442139, + "learning_rate": 5.102428178135522e-05, + "loss": 2.4672, + "step": 13294 + }, + { + "epoch": 1.0729561778710355, + "grad_norm": 0.6959822773933411, + "learning_rate": 5.1010518444754454e-05, + "loss": 2.4338, + "step": 13295 + }, + { + "epoch": 1.0730368816076183, + "grad_norm": 0.6534817218780518, + 
"learning_rate": 5.0996756329099614e-05, + "loss": 2.4491, + "step": 13296 + }, + { + "epoch": 1.0731175853442014, + "grad_norm": 0.7265146970748901, + "learning_rate": 5.098299543473371e-05, + "loss": 2.4718, + "step": 13297 + }, + { + "epoch": 1.0731982890807845, + "grad_norm": 0.6554745435714722, + "learning_rate": 5.0969235761999746e-05, + "loss": 2.4286, + "step": 13298 + }, + { + "epoch": 1.0732789928173674, + "grad_norm": 0.7003172039985657, + "learning_rate": 5.095547731124053e-05, + "loss": 2.4182, + "step": 13299 + }, + { + "epoch": 1.0733596965539505, + "grad_norm": 0.6700341105461121, + "learning_rate": 5.094172008279904e-05, + "loss": 2.428, + "step": 13300 + }, + { + "epoch": 1.0734404002905333, + "grad_norm": 0.7290289402008057, + "learning_rate": 5.0927964077018164e-05, + "loss": 2.4324, + "step": 13301 + }, + { + "epoch": 1.0735211040271164, + "grad_norm": 0.6999204158782959, + "learning_rate": 5.0914209294240644e-05, + "loss": 2.5386, + "step": 13302 + }, + { + "epoch": 1.0736018077636995, + "grad_norm": 0.7008000612258911, + "learning_rate": 5.090045573480935e-05, + "loss": 2.5295, + "step": 13303 + }, + { + "epoch": 1.0736825115002824, + "grad_norm": 0.7023071646690369, + "learning_rate": 5.088670339906705e-05, + "loss": 2.4418, + "step": 13304 + }, + { + "epoch": 1.0737632152368655, + "grad_norm": 0.627174437046051, + "learning_rate": 5.0872952287356525e-05, + "loss": 2.3782, + "step": 13305 + }, + { + "epoch": 1.0738439189734486, + "grad_norm": 0.6992766857147217, + "learning_rate": 5.0859202400020364e-05, + "loss": 2.4698, + "step": 13306 + }, + { + "epoch": 1.0739246227100314, + "grad_norm": 0.7189817428588867, + "learning_rate": 5.084545373740138e-05, + "loss": 2.5248, + "step": 13307 + }, + { + "epoch": 1.0740053264466145, + "grad_norm": 0.6849164962768555, + "learning_rate": 5.0831706299842216e-05, + "loss": 2.4084, + "step": 13308 + }, + { + "epoch": 1.0740860301831976, + "grad_norm": 0.6985825300216675, + "learning_rate": 
5.0817960087685424e-05, + "loss": 2.4893, + "step": 13309 + }, + { + "epoch": 1.0741667339197805, + "grad_norm": 0.6519783139228821, + "learning_rate": 5.080421510127362e-05, + "loss": 2.5144, + "step": 13310 + }, + { + "epoch": 1.0742474376563635, + "grad_norm": 0.6605731248855591, + "learning_rate": 5.079047134094941e-05, + "loss": 2.4487, + "step": 13311 + }, + { + "epoch": 1.0743281413929464, + "grad_norm": 0.7236705422401428, + "learning_rate": 5.077672880705526e-05, + "loss": 2.4578, + "step": 13312 + }, + { + "epoch": 1.0744088451295295, + "grad_norm": 0.7126381397247314, + "learning_rate": 5.07629874999337e-05, + "loss": 2.4528, + "step": 13313 + }, + { + "epoch": 1.0744895488661126, + "grad_norm": 0.7247878313064575, + "learning_rate": 5.0749247419927236e-05, + "loss": 2.563, + "step": 13314 + }, + { + "epoch": 1.0745702526026955, + "grad_norm": 0.728349506855011, + "learning_rate": 5.0735508567378234e-05, + "loss": 2.4229, + "step": 13315 + }, + { + "epoch": 1.0746509563392785, + "grad_norm": 0.6593719124794006, + "learning_rate": 5.072177094262913e-05, + "loss": 2.4853, + "step": 13316 + }, + { + "epoch": 1.0747316600758614, + "grad_norm": 0.6519735455513, + "learning_rate": 5.070803454602231e-05, + "loss": 2.4507, + "step": 13317 + }, + { + "epoch": 1.0748123638124445, + "grad_norm": 0.6660017371177673, + "learning_rate": 5.0694299377900115e-05, + "loss": 2.4286, + "step": 13318 + }, + { + "epoch": 1.0748930675490276, + "grad_norm": 0.7506695985794067, + "learning_rate": 5.0680565438604876e-05, + "loss": 2.4841, + "step": 13319 + }, + { + "epoch": 1.0749737712856104, + "grad_norm": 0.6855955719947815, + "learning_rate": 5.0666832728478863e-05, + "loss": 2.3817, + "step": 13320 + }, + { + "epoch": 1.0750544750221935, + "grad_norm": 0.7151634693145752, + "learning_rate": 5.065310124786438e-05, + "loss": 2.3984, + "step": 13321 + }, + { + "epoch": 1.0751351787587766, + "grad_norm": 0.6551649570465088, + "learning_rate": 5.063937099710356e-05, + "loss": 
2.4574, + "step": 13322 + }, + { + "epoch": 1.0752158824953595, + "grad_norm": 0.7443479895591736, + "learning_rate": 5.062564197653865e-05, + "loss": 2.52, + "step": 13323 + }, + { + "epoch": 1.0752965862319426, + "grad_norm": 0.7554972767829895, + "learning_rate": 5.061191418651186e-05, + "loss": 2.483, + "step": 13324 + }, + { + "epoch": 1.0753772899685254, + "grad_norm": 0.7661007642745972, + "learning_rate": 5.059818762736521e-05, + "loss": 2.566, + "step": 13325 + }, + { + "epoch": 1.0754579937051085, + "grad_norm": 0.7416480183601379, + "learning_rate": 5.058446229944087e-05, + "loss": 2.465, + "step": 13326 + }, + { + "epoch": 1.0755386974416916, + "grad_norm": 0.6997848749160767, + "learning_rate": 5.057073820308089e-05, + "loss": 2.4936, + "step": 13327 + }, + { + "epoch": 1.0756194011782745, + "grad_norm": 0.7570235133171082, + "learning_rate": 5.0557015338627345e-05, + "loss": 2.519, + "step": 13328 + }, + { + "epoch": 1.0757001049148576, + "grad_norm": 0.7910803556442261, + "learning_rate": 5.0543293706422214e-05, + "loss": 2.4932, + "step": 13329 + }, + { + "epoch": 1.0757808086514407, + "grad_norm": 0.7068312168121338, + "learning_rate": 5.052957330680752e-05, + "loss": 2.4489, + "step": 13330 + }, + { + "epoch": 1.0758615123880235, + "grad_norm": 0.7818215489387512, + "learning_rate": 5.051585414012514e-05, + "loss": 2.4467, + "step": 13331 + }, + { + "epoch": 1.0759422161246066, + "grad_norm": 0.7359446287155151, + "learning_rate": 5.0502136206717046e-05, + "loss": 2.4348, + "step": 13332 + }, + { + "epoch": 1.0760229198611895, + "grad_norm": 0.694726824760437, + "learning_rate": 5.0488419506925124e-05, + "loss": 2.4554, + "step": 13333 + }, + { + "epoch": 1.0761036235977726, + "grad_norm": 0.6776530742645264, + "learning_rate": 5.047470404109118e-05, + "loss": 2.4206, + "step": 13334 + }, + { + "epoch": 1.0761843273343557, + "grad_norm": 0.6977556943893433, + "learning_rate": 5.0460989809557066e-05, + "loss": 2.4748, + "step": 13335 + }, + { + 
"epoch": 1.0762650310709385, + "grad_norm": 0.6888061761856079, + "learning_rate": 5.044727681266459e-05, + "loss": 2.4129, + "step": 13336 + }, + { + "epoch": 1.0763457348075216, + "grad_norm": 0.744110643863678, + "learning_rate": 5.043356505075549e-05, + "loss": 2.4815, + "step": 13337 + }, + { + "epoch": 1.0764264385441047, + "grad_norm": 0.6726455688476562, + "learning_rate": 5.041985452417154e-05, + "loss": 2.4299, + "step": 13338 + }, + { + "epoch": 1.0765071422806876, + "grad_norm": 0.6755545735359192, + "learning_rate": 5.040614523325441e-05, + "loss": 2.4188, + "step": 13339 + }, + { + "epoch": 1.0765878460172706, + "grad_norm": 0.7152739763259888, + "learning_rate": 5.039243717834582e-05, + "loss": 2.4366, + "step": 13340 + }, + { + "epoch": 1.0766685497538535, + "grad_norm": 0.7253085374832153, + "learning_rate": 5.037873035978733e-05, + "loss": 2.4681, + "step": 13341 + }, + { + "epoch": 1.0767492534904366, + "grad_norm": 0.6780266165733337, + "learning_rate": 5.03650247779206e-05, + "loss": 2.5163, + "step": 13342 + }, + { + "epoch": 1.0768299572270197, + "grad_norm": 0.7440996170043945, + "learning_rate": 5.035132043308722e-05, + "loss": 2.4831, + "step": 13343 + }, + { + "epoch": 1.0769106609636026, + "grad_norm": 0.6619833111763, + "learning_rate": 5.0337617325628695e-05, + "loss": 2.433, + "step": 13344 + }, + { + "epoch": 1.0769913647001856, + "grad_norm": 0.7518059015274048, + "learning_rate": 5.032391545588656e-05, + "loss": 2.4241, + "step": 13345 + }, + { + "epoch": 1.0770720684367687, + "grad_norm": 0.6592784523963928, + "learning_rate": 5.031021482420231e-05, + "loss": 2.4902, + "step": 13346 + }, + { + "epoch": 1.0771527721733516, + "grad_norm": 0.7192299365997314, + "learning_rate": 5.029651543091739e-05, + "loss": 2.4445, + "step": 13347 + }, + { + "epoch": 1.0772334759099347, + "grad_norm": 0.7376793622970581, + "learning_rate": 5.028281727637323e-05, + "loss": 2.4532, + "step": 13348 + }, + { + "epoch": 1.0773141796465175, + 
"grad_norm": 0.7344524264335632, + "learning_rate": 5.026912036091127e-05, + "loss": 2.4193, + "step": 13349 + }, + { + "epoch": 1.0773948833831006, + "grad_norm": 0.7343986630439758, + "learning_rate": 5.0255424684872785e-05, + "loss": 2.4912, + "step": 13350 + }, + { + "epoch": 1.0774755871196837, + "grad_norm": 0.7103631496429443, + "learning_rate": 5.024173024859916e-05, + "loss": 2.4611, + "step": 13351 + }, + { + "epoch": 1.0775562908562666, + "grad_norm": 0.7554094791412354, + "learning_rate": 5.022803705243169e-05, + "loss": 2.4875, + "step": 13352 + }, + { + "epoch": 1.0776369945928497, + "grad_norm": 0.6754978895187378, + "learning_rate": 5.0214345096711655e-05, + "loss": 2.4585, + "step": 13353 + }, + { + "epoch": 1.0777176983294328, + "grad_norm": 0.690747857093811, + "learning_rate": 5.020065438178026e-05, + "loss": 2.4751, + "step": 13354 + }, + { + "epoch": 1.0777984020660156, + "grad_norm": 0.7012028694152832, + "learning_rate": 5.018696490797874e-05, + "loss": 2.4443, + "step": 13355 + }, + { + "epoch": 1.0778791058025987, + "grad_norm": 0.6788459420204163, + "learning_rate": 5.017327667564831e-05, + "loss": 2.4135, + "step": 13356 + }, + { + "epoch": 1.0779598095391816, + "grad_norm": 0.6662794351577759, + "learning_rate": 5.015958968512997e-05, + "loss": 2.3801, + "step": 13357 + }, + { + "epoch": 1.0780405132757647, + "grad_norm": 0.7873939275741577, + "learning_rate": 5.0145903936764994e-05, + "loss": 2.4629, + "step": 13358 + }, + { + "epoch": 1.0781212170123478, + "grad_norm": 0.7484980225563049, + "learning_rate": 5.0132219430894455e-05, + "loss": 2.4307, + "step": 13359 + }, + { + "epoch": 1.0782019207489306, + "grad_norm": 0.7559076547622681, + "learning_rate": 5.011853616785932e-05, + "loss": 2.4846, + "step": 13360 + }, + { + "epoch": 1.0782826244855137, + "grad_norm": 0.6822710633277893, + "learning_rate": 5.010485414800066e-05, + "loss": 2.4448, + "step": 13361 + }, + { + "epoch": 1.0783633282220966, + "grad_norm": 0.6665955185890198, 
+ "learning_rate": 5.0091173371659496e-05, + "loss": 2.4562, + "step": 13362 + }, + { + "epoch": 1.0784440319586797, + "grad_norm": 0.6645659804344177, + "learning_rate": 5.0077493839176714e-05, + "loss": 2.4545, + "step": 13363 + }, + { + "epoch": 1.0785247356952627, + "grad_norm": 0.6648181080818176, + "learning_rate": 5.0063815550893276e-05, + "loss": 2.4565, + "step": 13364 + }, + { + "epoch": 1.0786054394318456, + "grad_norm": 0.6679299473762512, + "learning_rate": 5.005013850715014e-05, + "loss": 2.4301, + "step": 13365 + }, + { + "epoch": 1.0786861431684287, + "grad_norm": 0.7116484642028809, + "learning_rate": 5.003646270828808e-05, + "loss": 2.4174, + "step": 13366 + }, + { + "epoch": 1.0787668469050118, + "grad_norm": 0.6850735545158386, + "learning_rate": 5.002278815464798e-05, + "loss": 2.4386, + "step": 13367 + }, + { + "epoch": 1.0788475506415947, + "grad_norm": 0.6613513827323914, + "learning_rate": 5.00091148465706e-05, + "loss": 2.4038, + "step": 13368 + }, + { + "epoch": 1.0789282543781777, + "grad_norm": 0.659635603427887, + "learning_rate": 4.9995442784396827e-05, + "loss": 2.4346, + "step": 13369 + }, + { + "epoch": 1.0790089581147608, + "grad_norm": 0.6775132417678833, + "learning_rate": 4.998177196846731e-05, + "loss": 2.4853, + "step": 13370 + }, + { + "epoch": 1.0790896618513437, + "grad_norm": 0.719860851764679, + "learning_rate": 4.996810239912277e-05, + "loss": 2.4018, + "step": 13371 + }, + { + "epoch": 1.0791703655879268, + "grad_norm": 0.7316389083862305, + "learning_rate": 4.9954434076703946e-05, + "loss": 2.424, + "step": 13372 + }, + { + "epoch": 1.0792510693245096, + "grad_norm": 0.6779622435569763, + "learning_rate": 4.99407670015514e-05, + "loss": 2.4743, + "step": 13373 + }, + { + "epoch": 1.0793317730610927, + "grad_norm": 0.7357139587402344, + "learning_rate": 4.992710117400581e-05, + "loss": 2.4385, + "step": 13374 + }, + { + "epoch": 1.0794124767976758, + "grad_norm": 0.671441912651062, + "learning_rate": 
4.9913436594407784e-05, + "loss": 2.3988, + "step": 13375 + }, + { + "epoch": 1.0794931805342587, + "grad_norm": 0.7205149531364441, + "learning_rate": 4.9899773263097804e-05, + "loss": 2.4594, + "step": 13376 + }, + { + "epoch": 1.0795738842708418, + "grad_norm": 0.702910840511322, + "learning_rate": 4.988611118041644e-05, + "loss": 2.4831, + "step": 13377 + }, + { + "epoch": 1.0796545880074246, + "grad_norm": 0.6977962255477905, + "learning_rate": 4.987245034670418e-05, + "loss": 2.422, + "step": 13378 + }, + { + "epoch": 1.0797352917440077, + "grad_norm": 0.7106757760047913, + "learning_rate": 4.985879076230149e-05, + "loss": 2.4073, + "step": 13379 + }, + { + "epoch": 1.0798159954805908, + "grad_norm": 0.7046806812286377, + "learning_rate": 4.9845132427548814e-05, + "loss": 2.4065, + "step": 13380 + }, + { + "epoch": 1.0798966992171737, + "grad_norm": 0.7476605772972107, + "learning_rate": 4.9831475342786574e-05, + "loss": 2.4886, + "step": 13381 + }, + { + "epoch": 1.0799774029537568, + "grad_norm": 0.696977972984314, + "learning_rate": 4.981781950835508e-05, + "loss": 2.4732, + "step": 13382 + }, + { + "epoch": 1.0800581066903399, + "grad_norm": 0.6596804857254028, + "learning_rate": 4.98041649245947e-05, + "loss": 2.4497, + "step": 13383 + }, + { + "epoch": 1.0801388104269227, + "grad_norm": 0.7216050028800964, + "learning_rate": 4.979051159184573e-05, + "loss": 2.4745, + "step": 13384 + }, + { + "epoch": 1.0802195141635058, + "grad_norm": 0.6636630296707153, + "learning_rate": 4.977685951044852e-05, + "loss": 2.4904, + "step": 13385 + }, + { + "epoch": 1.0803002179000887, + "grad_norm": 0.7030208110809326, + "learning_rate": 4.97632086807432e-05, + "loss": 2.4302, + "step": 13386 + }, + { + "epoch": 1.0803809216366718, + "grad_norm": 0.7158327102661133, + "learning_rate": 4.974955910307004e-05, + "loss": 2.4735, + "step": 13387 + }, + { + "epoch": 1.0804616253732549, + "grad_norm": 0.6736464500427246, + "learning_rate": 4.9735910777769234e-05, + "loss": 
2.4334, + "step": 13388 + }, + { + "epoch": 1.0805423291098377, + "grad_norm": 0.6913403272628784, + "learning_rate": 4.972226370518092e-05, + "loss": 2.468, + "step": 13389 + }, + { + "epoch": 1.0806230328464208, + "grad_norm": 0.7006524205207825, + "learning_rate": 4.970861788564522e-05, + "loss": 2.4598, + "step": 13390 + }, + { + "epoch": 1.080703736583004, + "grad_norm": 0.6892947554588318, + "learning_rate": 4.969497331950227e-05, + "loss": 2.4297, + "step": 13391 + }, + { + "epoch": 1.0807844403195868, + "grad_norm": 0.7270283699035645, + "learning_rate": 4.968133000709203e-05, + "loss": 2.5344, + "step": 13392 + }, + { + "epoch": 1.0808651440561698, + "grad_norm": 0.735342264175415, + "learning_rate": 4.9667687948754594e-05, + "loss": 2.4431, + "step": 13393 + }, + { + "epoch": 1.0809458477927527, + "grad_norm": 0.6869279146194458, + "learning_rate": 4.9654047144829974e-05, + "loss": 2.5581, + "step": 13394 + }, + { + "epoch": 1.0810265515293358, + "grad_norm": 0.6975715160369873, + "learning_rate": 4.964040759565808e-05, + "loss": 2.4328, + "step": 13395 + }, + { + "epoch": 1.0811072552659189, + "grad_norm": 0.7312532067298889, + "learning_rate": 4.9626769301578856e-05, + "loss": 2.4686, + "step": 13396 + }, + { + "epoch": 1.0811879590025018, + "grad_norm": 0.7824496626853943, + "learning_rate": 4.9613132262932215e-05, + "loss": 2.4564, + "step": 13397 + }, + { + "epoch": 1.0812686627390848, + "grad_norm": 0.7337941527366638, + "learning_rate": 4.959949648005805e-05, + "loss": 2.4752, + "step": 13398 + }, + { + "epoch": 1.081349366475668, + "grad_norm": 0.7450836300849915, + "learning_rate": 4.958586195329617e-05, + "loss": 2.4457, + "step": 13399 + }, + { + "epoch": 1.0814300702122508, + "grad_norm": 0.6990504860877991, + "learning_rate": 4.9572228682986385e-05, + "loss": 2.4172, + "step": 13400 + }, + { + "epoch": 1.0815107739488339, + "grad_norm": 0.7293999791145325, + "learning_rate": 4.955859666946853e-05, + "loss": 2.5295, + "step": 13401 + }, + { + 
"epoch": 1.0815914776854167, + "grad_norm": 0.6872537136077881, + "learning_rate": 4.9544965913082264e-05, + "loss": 2.5029, + "step": 13402 + }, + { + "epoch": 1.0816721814219998, + "grad_norm": 0.6821706891059875, + "learning_rate": 4.953133641416733e-05, + "loss": 2.4738, + "step": 13403 + }, + { + "epoch": 1.081752885158583, + "grad_norm": 0.6811527609825134, + "learning_rate": 4.951770817306346e-05, + "loss": 2.4323, + "step": 13404 + }, + { + "epoch": 1.0818335888951658, + "grad_norm": 0.7138943076133728, + "learning_rate": 4.950408119011023e-05, + "loss": 2.5155, + "step": 13405 + }, + { + "epoch": 1.0819142926317489, + "grad_norm": 0.6777952909469604, + "learning_rate": 4.949045546564729e-05, + "loss": 2.4414, + "step": 13406 + }, + { + "epoch": 1.0819949963683317, + "grad_norm": 0.7065548896789551, + "learning_rate": 4.9476831000014276e-05, + "loss": 2.4913, + "step": 13407 + }, + { + "epoch": 1.0820757001049148, + "grad_norm": 0.7286355495452881, + "learning_rate": 4.9463207793550626e-05, + "loss": 2.4171, + "step": 13408 + }, + { + "epoch": 1.082156403841498, + "grad_norm": 0.6703049540519714, + "learning_rate": 4.944958584659597e-05, + "loss": 2.4387, + "step": 13409 + }, + { + "epoch": 1.0822371075780808, + "grad_norm": 0.6572019457817078, + "learning_rate": 4.943596515948983e-05, + "loss": 2.4324, + "step": 13410 + }, + { + "epoch": 1.0823178113146639, + "grad_norm": 0.6722360849380493, + "learning_rate": 4.942234573257156e-05, + "loss": 2.4802, + "step": 13411 + }, + { + "epoch": 1.082398515051247, + "grad_norm": 0.7122535109519958, + "learning_rate": 4.9408727566180655e-05, + "loss": 2.4531, + "step": 13412 + }, + { + "epoch": 1.0824792187878298, + "grad_norm": 0.6769903898239136, + "learning_rate": 4.9395110660656505e-05, + "loss": 2.4549, + "step": 13413 + }, + { + "epoch": 1.082559922524413, + "grad_norm": 0.766251266002655, + "learning_rate": 4.938149501633852e-05, + "loss": 2.4416, + "step": 13414 + }, + { + "epoch": 1.082640626260996, + 
"grad_norm": 0.6677987575531006, + "learning_rate": 4.936788063356596e-05, + "loss": 2.4578, + "step": 13415 + }, + { + "epoch": 1.0827213299975789, + "grad_norm": 0.7461380362510681, + "learning_rate": 4.9354267512678156e-05, + "loss": 2.4776, + "step": 13416 + }, + { + "epoch": 1.082802033734162, + "grad_norm": 0.6681976914405823, + "learning_rate": 4.934065565401443e-05, + "loss": 2.5044, + "step": 13417 + }, + { + "epoch": 1.0828827374707448, + "grad_norm": 0.6809324622154236, + "learning_rate": 4.932704505791397e-05, + "loss": 2.4651, + "step": 13418 + }, + { + "epoch": 1.082963441207328, + "grad_norm": 0.6926563382148743, + "learning_rate": 4.931343572471596e-05, + "loss": 2.4633, + "step": 13419 + }, + { + "epoch": 1.083044144943911, + "grad_norm": 0.6451820135116577, + "learning_rate": 4.929982765475971e-05, + "loss": 2.474, + "step": 13420 + }, + { + "epoch": 1.0831248486804939, + "grad_norm": 0.7088493704795837, + "learning_rate": 4.9286220848384247e-05, + "loss": 2.462, + "step": 13421 + }, + { + "epoch": 1.083205552417077, + "grad_norm": 0.7819172739982605, + "learning_rate": 4.9272615305928725e-05, + "loss": 2.4534, + "step": 13422 + }, + { + "epoch": 1.0832862561536598, + "grad_norm": 0.6579666137695312, + "learning_rate": 4.925901102773227e-05, + "loss": 2.4101, + "step": 13423 + }, + { + "epoch": 1.083366959890243, + "grad_norm": 0.6999555230140686, + "learning_rate": 4.924540801413385e-05, + "loss": 2.4534, + "step": 13424 + }, + { + "epoch": 1.083447663626826, + "grad_norm": 0.7034400105476379, + "learning_rate": 4.9231806265472555e-05, + "loss": 2.4741, + "step": 13425 + }, + { + "epoch": 1.0835283673634089, + "grad_norm": 0.6595034599304199, + "learning_rate": 4.921820578208739e-05, + "loss": 2.4011, + "step": 13426 + }, + { + "epoch": 1.083609071099992, + "grad_norm": 0.666419267654419, + "learning_rate": 4.920460656431723e-05, + "loss": 2.4399, + "step": 13427 + }, + { + "epoch": 1.083689774836575, + "grad_norm": 0.7058294415473938, + 
"learning_rate": 4.919100861250108e-05, + "loss": 2.434, + "step": 13428 + }, + { + "epoch": 1.083770478573158, + "grad_norm": 0.7045806050300598, + "learning_rate": 4.917741192697779e-05, + "loss": 2.4616, + "step": 13429 + }, + { + "epoch": 1.083851182309741, + "grad_norm": 0.6565639972686768, + "learning_rate": 4.916381650808626e-05, + "loss": 2.3864, + "step": 13430 + }, + { + "epoch": 1.0839318860463238, + "grad_norm": 0.6939674615859985, + "learning_rate": 4.9150222356165295e-05, + "loss": 2.4217, + "step": 13431 + }, + { + "epoch": 1.084012589782907, + "grad_norm": 0.7240599989891052, + "learning_rate": 4.913662947155373e-05, + "loss": 2.447, + "step": 13432 + }, + { + "epoch": 1.08409329351949, + "grad_norm": 0.7369012832641602, + "learning_rate": 4.9123037854590336e-05, + "loss": 2.4588, + "step": 13433 + }, + { + "epoch": 1.0841739972560729, + "grad_norm": 0.714269757270813, + "learning_rate": 4.9109447505613803e-05, + "loss": 2.4921, + "step": 13434 + }, + { + "epoch": 1.084254700992656, + "grad_norm": 0.7541659474372864, + "learning_rate": 4.909585842496287e-05, + "loss": 2.4191, + "step": 13435 + }, + { + "epoch": 1.084335404729239, + "grad_norm": 0.7245596051216125, + "learning_rate": 4.9082270612976243e-05, + "loss": 2.4904, + "step": 13436 + }, + { + "epoch": 1.084416108465822, + "grad_norm": 0.7301090359687805, + "learning_rate": 4.90686840699925e-05, + "loss": 2.4461, + "step": 13437 + }, + { + "epoch": 1.084496812202405, + "grad_norm": 0.7404102683067322, + "learning_rate": 4.905509879635028e-05, + "loss": 2.4826, + "step": 13438 + }, + { + "epoch": 1.0845775159389879, + "grad_norm": 0.7053710222244263, + "learning_rate": 4.9041514792388175e-05, + "loss": 2.4231, + "step": 13439 + }, + { + "epoch": 1.084658219675571, + "grad_norm": 0.6171362400054932, + "learning_rate": 4.9027932058444724e-05, + "loss": 2.4472, + "step": 13440 + }, + { + "epoch": 1.084738923412154, + "grad_norm": 0.7367038130760193, + "learning_rate": 4.901435059485845e-05, + 
"loss": 2.4847, + "step": 13441 + }, + { + "epoch": 1.084819627148737, + "grad_norm": 0.754828691482544, + "learning_rate": 4.900077040196788e-05, + "loss": 2.4731, + "step": 13442 + }, + { + "epoch": 1.08490033088532, + "grad_norm": 0.7380684018135071, + "learning_rate": 4.8987191480111386e-05, + "loss": 2.4227, + "step": 13443 + }, + { + "epoch": 1.084981034621903, + "grad_norm": 0.6711444854736328, + "learning_rate": 4.897361382962742e-05, + "loss": 2.4744, + "step": 13444 + }, + { + "epoch": 1.085061738358486, + "grad_norm": 0.7709227204322815, + "learning_rate": 4.896003745085438e-05, + "loss": 2.5422, + "step": 13445 + }, + { + "epoch": 1.085142442095069, + "grad_norm": 0.6778519153594971, + "learning_rate": 4.8946462344130675e-05, + "loss": 2.4757, + "step": 13446 + }, + { + "epoch": 1.085223145831652, + "grad_norm": 0.7390698194503784, + "learning_rate": 4.893288850979454e-05, + "loss": 2.4214, + "step": 13447 + }, + { + "epoch": 1.085303849568235, + "grad_norm": 0.6632684469223022, + "learning_rate": 4.891931594818432e-05, + "loss": 2.4689, + "step": 13448 + }, + { + "epoch": 1.085384553304818, + "grad_norm": 0.68693608045578, + "learning_rate": 4.890574465963827e-05, + "loss": 2.4788, + "step": 13449 + }, + { + "epoch": 1.085465257041401, + "grad_norm": 0.6910344362258911, + "learning_rate": 4.8892174644494625e-05, + "loss": 2.4611, + "step": 13450 + }, + { + "epoch": 1.085545960777984, + "grad_norm": 0.6935380101203918, + "learning_rate": 4.887860590309158e-05, + "loss": 2.4481, + "step": 13451 + }, + { + "epoch": 1.085626664514567, + "grad_norm": 0.7086954712867737, + "learning_rate": 4.886503843576735e-05, + "loss": 2.4583, + "step": 13452 + }, + { + "epoch": 1.08570736825115, + "grad_norm": 0.7447777986526489, + "learning_rate": 4.8851472242859994e-05, + "loss": 2.5035, + "step": 13453 + }, + { + "epoch": 1.085788071987733, + "grad_norm": 0.6896036267280579, + "learning_rate": 4.8837907324707656e-05, + "loss": 2.4622, + "step": 13454 + }, + { + 
"epoch": 1.085868775724316, + "grad_norm": 0.7261155247688293, + "learning_rate": 4.882434368164843e-05, + "loss": 2.4958, + "step": 13455 + }, + { + "epoch": 1.085949479460899, + "grad_norm": 0.6868197321891785, + "learning_rate": 4.881078131402031e-05, + "loss": 2.4952, + "step": 13456 + }, + { + "epoch": 1.0860301831974821, + "grad_norm": 0.6338867545127869, + "learning_rate": 4.879722022216132e-05, + "loss": 2.4553, + "step": 13457 + }, + { + "epoch": 1.086110886934065, + "grad_norm": 0.7214454412460327, + "learning_rate": 4.878366040640946e-05, + "loss": 2.4433, + "step": 13458 + }, + { + "epoch": 1.086191590670648, + "grad_norm": 0.6871301531791687, + "learning_rate": 4.877010186710266e-05, + "loss": 2.4118, + "step": 13459 + }, + { + "epoch": 1.0862722944072312, + "grad_norm": 0.6845650672912598, + "learning_rate": 4.875654460457883e-05, + "loss": 2.4684, + "step": 13460 + }, + { + "epoch": 1.086352998143814, + "grad_norm": 0.7027513980865479, + "learning_rate": 4.8742988619175865e-05, + "loss": 2.4569, + "step": 13461 + }, + { + "epoch": 1.0864337018803971, + "grad_norm": 0.6428621411323547, + "learning_rate": 4.8729433911231646e-05, + "loss": 2.4211, + "step": 13462 + }, + { + "epoch": 1.08651440561698, + "grad_norm": 0.6921488046646118, + "learning_rate": 4.8715880481083934e-05, + "loss": 2.4668, + "step": 13463 + }, + { + "epoch": 1.086595109353563, + "grad_norm": 0.7001025676727295, + "learning_rate": 4.870232832907051e-05, + "loss": 2.4685, + "step": 13464 + }, + { + "epoch": 1.0866758130901462, + "grad_norm": 0.7460644245147705, + "learning_rate": 4.868877745552922e-05, + "loss": 2.3922, + "step": 13465 + }, + { + "epoch": 1.086756516826729, + "grad_norm": 0.7418891191482544, + "learning_rate": 4.867522786079768e-05, + "loss": 2.3777, + "step": 13466 + }, + { + "epoch": 1.0868372205633121, + "grad_norm": 0.6430083513259888, + "learning_rate": 4.8661679545213625e-05, + "loss": 2.4385, + "step": 13467 + }, + { + "epoch": 1.086917924299895, + 
"grad_norm": 0.6963593363761902, + "learning_rate": 4.864813250911475e-05, + "loss": 2.4083, + "step": 13468 + }, + { + "epoch": 1.086998628036478, + "grad_norm": 0.6796097159385681, + "learning_rate": 4.8634586752838606e-05, + "loss": 2.4984, + "step": 13469 + }, + { + "epoch": 1.0870793317730612, + "grad_norm": 0.6845307946205139, + "learning_rate": 4.862104227672281e-05, + "loss": 2.4168, + "step": 13470 + }, + { + "epoch": 1.087160035509644, + "grad_norm": 0.705348014831543, + "learning_rate": 4.8607499081105e-05, + "loss": 2.4216, + "step": 13471 + }, + { + "epoch": 1.087240739246227, + "grad_norm": 0.6906474828720093, + "learning_rate": 4.8593957166322636e-05, + "loss": 2.4955, + "step": 13472 + }, + { + "epoch": 1.0873214429828102, + "grad_norm": 0.696489691734314, + "learning_rate": 4.858041653271323e-05, + "loss": 2.4186, + "step": 13473 + }, + { + "epoch": 1.087402146719393, + "grad_norm": 0.6997761726379395, + "learning_rate": 4.856687718061429e-05, + "loss": 2.441, + "step": 13474 + }, + { + "epoch": 1.0874828504559761, + "grad_norm": 0.6515649557113647, + "learning_rate": 4.8553339110363184e-05, + "loss": 2.3997, + "step": 13475 + }, + { + "epoch": 1.087563554192559, + "grad_norm": 0.6902725696563721, + "learning_rate": 4.853980232229734e-05, + "loss": 2.4765, + "step": 13476 + }, + { + "epoch": 1.087644257929142, + "grad_norm": 0.6832055449485779, + "learning_rate": 4.852626681675415e-05, + "loss": 2.411, + "step": 13477 + }, + { + "epoch": 1.0877249616657252, + "grad_norm": 0.668520987033844, + "learning_rate": 4.8512732594070984e-05, + "loss": 2.4742, + "step": 13478 + }, + { + "epoch": 1.087805665402308, + "grad_norm": 0.7019832134246826, + "learning_rate": 4.849919965458507e-05, + "loss": 2.4638, + "step": 13479 + }, + { + "epoch": 1.0878863691388911, + "grad_norm": 0.6986027359962463, + "learning_rate": 4.8485667998633724e-05, + "loss": 2.4866, + "step": 13480 + }, + { + "epoch": 1.0879670728754742, + "grad_norm": 0.659037709236145, + 
"learning_rate": 4.8472137626554195e-05, + "loss": 2.4821, + "step": 13481 + }, + { + "epoch": 1.088047776612057, + "grad_norm": 0.6506801247596741, + "learning_rate": 4.8458608538683694e-05, + "loss": 2.4686, + "step": 13482 + }, + { + "epoch": 1.0881284803486402, + "grad_norm": 0.7136878967285156, + "learning_rate": 4.844508073535939e-05, + "loss": 2.4523, + "step": 13483 + }, + { + "epoch": 1.088209184085223, + "grad_norm": 0.6663414239883423, + "learning_rate": 4.843155421691848e-05, + "loss": 2.4287, + "step": 13484 + }, + { + "epoch": 1.0882898878218061, + "grad_norm": 0.7192783355712891, + "learning_rate": 4.8418028983698006e-05, + "loss": 2.4433, + "step": 13485 + }, + { + "epoch": 1.0883705915583892, + "grad_norm": 0.6620980501174927, + "learning_rate": 4.8404505036035086e-05, + "loss": 2.4823, + "step": 13486 + }, + { + "epoch": 1.088451295294972, + "grad_norm": 0.6282123327255249, + "learning_rate": 4.83909823742668e-05, + "loss": 2.4641, + "step": 13487 + }, + { + "epoch": 1.0885319990315552, + "grad_norm": 0.6384354829788208, + "learning_rate": 4.837746099873012e-05, + "loss": 2.4234, + "step": 13488 + }, + { + "epoch": 1.0886127027681383, + "grad_norm": 0.6550076603889465, + "learning_rate": 4.836394090976204e-05, + "loss": 2.4743, + "step": 13489 + }, + { + "epoch": 1.0886934065047211, + "grad_norm": 0.6987888216972351, + "learning_rate": 4.8350422107699545e-05, + "loss": 2.4263, + "step": 13490 + }, + { + "epoch": 1.0887741102413042, + "grad_norm": 0.7012613415718079, + "learning_rate": 4.833690459287953e-05, + "loss": 2.4801, + "step": 13491 + }, + { + "epoch": 1.088854813977887, + "grad_norm": 0.6986923217773438, + "learning_rate": 4.832338836563891e-05, + "loss": 2.426, + "step": 13492 + }, + { + "epoch": 1.0889355177144702, + "grad_norm": 0.6936241984367371, + "learning_rate": 4.830987342631453e-05, + "loss": 2.4361, + "step": 13493 + }, + { + "epoch": 1.0890162214510533, + "grad_norm": 0.6612359881401062, + "learning_rate": 
4.8296359775243275e-05, + "loss": 2.4385, + "step": 13494 + }, + { + "epoch": 1.0890969251876361, + "grad_norm": 0.6927692294120789, + "learning_rate": 4.828284741276183e-05, + "loss": 2.4692, + "step": 13495 + }, + { + "epoch": 1.0891776289242192, + "grad_norm": 0.6710225343704224, + "learning_rate": 4.8269336339207036e-05, + "loss": 2.4078, + "step": 13496 + }, + { + "epoch": 1.0892583326608023, + "grad_norm": 0.639076828956604, + "learning_rate": 4.825582655491564e-05, + "loss": 2.4368, + "step": 13497 + }, + { + "epoch": 1.0893390363973852, + "grad_norm": 0.7050483226776123, + "learning_rate": 4.824231806022426e-05, + "loss": 2.4308, + "step": 13498 + }, + { + "epoch": 1.0894197401339683, + "grad_norm": 0.7097769975662231, + "learning_rate": 4.822881085546962e-05, + "loss": 2.4378, + "step": 13499 + }, + { + "epoch": 1.0895004438705511, + "grad_norm": 0.6939458847045898, + "learning_rate": 4.821530494098834e-05, + "loss": 2.4678, + "step": 13500 + }, + { + "epoch": 1.0895811476071342, + "grad_norm": 0.6797441840171814, + "learning_rate": 4.8201800317117016e-05, + "loss": 2.4837, + "step": 13501 + }, + { + "epoch": 1.0896618513437173, + "grad_norm": 0.7451521158218384, + "learning_rate": 4.818829698419225e-05, + "loss": 2.4651, + "step": 13502 + }, + { + "epoch": 1.0897425550803002, + "grad_norm": 0.6749109625816345, + "learning_rate": 4.8174794942550585e-05, + "loss": 2.4569, + "step": 13503 + }, + { + "epoch": 1.0898232588168832, + "grad_norm": 0.6321636438369751, + "learning_rate": 4.8161294192528474e-05, + "loss": 2.4049, + "step": 13504 + }, + { + "epoch": 1.0899039625534663, + "grad_norm": 0.7002367377281189, + "learning_rate": 4.8147794734462415e-05, + "loss": 2.4489, + "step": 13505 + }, + { + "epoch": 1.0899846662900492, + "grad_norm": 0.758057713508606, + "learning_rate": 4.813429656868889e-05, + "loss": 2.436, + "step": 13506 + }, + { + "epoch": 1.0900653700266323, + "grad_norm": 0.6665529012680054, + "learning_rate": 4.812079969554424e-05, + "loss": 
2.3805, + "step": 13507 + }, + { + "epoch": 1.0901460737632152, + "grad_norm": 0.6962547898292542, + "learning_rate": 4.810730411536487e-05, + "loss": 2.4203, + "step": 13508 + }, + { + "epoch": 1.0902267774997982, + "grad_norm": 0.6860647201538086, + "learning_rate": 4.809380982848712e-05, + "loss": 2.4482, + "step": 13509 + }, + { + "epoch": 1.0903074812363813, + "grad_norm": 0.7045090198516846, + "learning_rate": 4.808031683524733e-05, + "loss": 2.4155, + "step": 13510 + }, + { + "epoch": 1.0903881849729642, + "grad_norm": 0.6609304547309875, + "learning_rate": 4.806682513598176e-05, + "loss": 2.4295, + "step": 13511 + }, + { + "epoch": 1.0904688887095473, + "grad_norm": 0.7647323608398438, + "learning_rate": 4.8053334731026665e-05, + "loss": 2.4704, + "step": 13512 + }, + { + "epoch": 1.0905495924461301, + "grad_norm": 0.677449643611908, + "learning_rate": 4.803984562071829e-05, + "loss": 2.4501, + "step": 13513 + }, + { + "epoch": 1.0906302961827132, + "grad_norm": 0.645866334438324, + "learning_rate": 4.8026357805392754e-05, + "loss": 2.427, + "step": 13514 + }, + { + "epoch": 1.0907109999192963, + "grad_norm": 0.6968488097190857, + "learning_rate": 4.801287128538624e-05, + "loss": 2.3933, + "step": 13515 + }, + { + "epoch": 1.0907917036558792, + "grad_norm": 0.7137444615364075, + "learning_rate": 4.799938606103491e-05, + "loss": 2.4611, + "step": 13516 + }, + { + "epoch": 1.0908724073924623, + "grad_norm": 0.6860007047653198, + "learning_rate": 4.7985902132674765e-05, + "loss": 2.4252, + "step": 13517 + }, + { + "epoch": 1.0909531111290454, + "grad_norm": 0.726290762424469, + "learning_rate": 4.797241950064192e-05, + "loss": 2.44, + "step": 13518 + }, + { + "epoch": 1.0910338148656282, + "grad_norm": 0.6833362579345703, + "learning_rate": 4.795893816527241e-05, + "loss": 2.4199, + "step": 13519 + }, + { + "epoch": 1.0911145186022113, + "grad_norm": 0.7412242293357849, + "learning_rate": 4.794545812690212e-05, + "loss": 2.5412, + "step": 13520 + }, + { + 
"epoch": 1.0911952223387944, + "grad_norm": 0.6882274150848389, + "learning_rate": 4.793197938586712e-05, + "loss": 2.473, + "step": 13521 + }, + { + "epoch": 1.0912759260753773, + "grad_norm": 0.7334007024765015, + "learning_rate": 4.791850194250335e-05, + "loss": 2.4357, + "step": 13522 + }, + { + "epoch": 1.0913566298119604, + "grad_norm": 0.6564081311225891, + "learning_rate": 4.790502579714661e-05, + "loss": 2.4425, + "step": 13523 + }, + { + "epoch": 1.0914373335485432, + "grad_norm": 0.7045762538909912, + "learning_rate": 4.78915509501328e-05, + "loss": 2.4929, + "step": 13524 + }, + { + "epoch": 1.0915180372851263, + "grad_norm": 0.7512505650520325, + "learning_rate": 4.787807740179776e-05, + "loss": 2.4187, + "step": 13525 + }, + { + "epoch": 1.0915987410217094, + "grad_norm": 0.6592997908592224, + "learning_rate": 4.786460515247732e-05, + "loss": 2.4344, + "step": 13526 + }, + { + "epoch": 1.0916794447582923, + "grad_norm": 0.6721770763397217, + "learning_rate": 4.785113420250715e-05, + "loss": 2.4415, + "step": 13527 + }, + { + "epoch": 1.0917601484948753, + "grad_norm": 0.7544431686401367, + "learning_rate": 4.783766455222305e-05, + "loss": 2.4831, + "step": 13528 + }, + { + "epoch": 1.0918408522314582, + "grad_norm": 0.7226355671882629, + "learning_rate": 4.782419620196073e-05, + "loss": 2.4807, + "step": 13529 + }, + { + "epoch": 1.0919215559680413, + "grad_norm": 0.6386340260505676, + "learning_rate": 4.78107291520558e-05, + "loss": 2.4062, + "step": 13530 + }, + { + "epoch": 1.0920022597046244, + "grad_norm": 0.6670595407485962, + "learning_rate": 4.7797263402843926e-05, + "loss": 2.4009, + "step": 13531 + }, + { + "epoch": 1.0920829634412073, + "grad_norm": 0.6600756049156189, + "learning_rate": 4.778379895466071e-05, + "loss": 2.4321, + "step": 13532 + }, + { + "epoch": 1.0921636671777903, + "grad_norm": 0.7190701961517334, + "learning_rate": 4.77703358078417e-05, + "loss": 2.4229, + "step": 13533 + }, + { + "epoch": 1.0922443709143734, + 
"grad_norm": 0.6554828882217407, + "learning_rate": 4.775687396272247e-05, + "loss": 2.442, + "step": 13534 + }, + { + "epoch": 1.0923250746509563, + "grad_norm": 0.6720205545425415, + "learning_rate": 4.774341341963853e-05, + "loss": 2.4994, + "step": 13535 + }, + { + "epoch": 1.0924057783875394, + "grad_norm": 0.7161003947257996, + "learning_rate": 4.7729954178925295e-05, + "loss": 2.4666, + "step": 13536 + }, + { + "epoch": 1.0924864821241222, + "grad_norm": 0.6817156672477722, + "learning_rate": 4.771649624091824e-05, + "loss": 2.4203, + "step": 13537 + }, + { + "epoch": 1.0925671858607053, + "grad_norm": 0.7167035937309265, + "learning_rate": 4.770303960595277e-05, + "loss": 2.4214, + "step": 13538 + }, + { + "epoch": 1.0926478895972884, + "grad_norm": 0.6373945474624634, + "learning_rate": 4.768958427436429e-05, + "loss": 2.485, + "step": 13539 + }, + { + "epoch": 1.0927285933338713, + "grad_norm": 0.7361387014389038, + "learning_rate": 4.767613024648808e-05, + "loss": 2.5192, + "step": 13540 + }, + { + "epoch": 1.0928092970704544, + "grad_norm": 0.7034375667572021, + "learning_rate": 4.766267752265947e-05, + "loss": 2.4324, + "step": 13541 + }, + { + "epoch": 1.0928900008070375, + "grad_norm": 0.7355689406394958, + "learning_rate": 4.7649226103213765e-05, + "loss": 2.5048, + "step": 13542 + }, + { + "epoch": 1.0929707045436203, + "grad_norm": 0.7120445966720581, + "learning_rate": 4.7635775988486176e-05, + "loss": 2.449, + "step": 13543 + }, + { + "epoch": 1.0930514082802034, + "grad_norm": 0.695888876914978, + "learning_rate": 4.7622327178811935e-05, + "loss": 2.4974, + "step": 13544 + }, + { + "epoch": 1.0931321120167863, + "grad_norm": 0.6953639984130859, + "learning_rate": 4.760887967452625e-05, + "loss": 2.3927, + "step": 13545 + }, + { + "epoch": 1.0932128157533694, + "grad_norm": 0.6457183957099915, + "learning_rate": 4.759543347596421e-05, + "loss": 2.4501, + "step": 13546 + }, + { + "epoch": 1.0932935194899525, + "grad_norm": 0.7259296774864197, + 
"learning_rate": 4.7581988583460946e-05, + "loss": 2.4896, + "step": 13547 + }, + { + "epoch": 1.0933742232265353, + "grad_norm": 0.6897724270820618, + "learning_rate": 4.7568544997351586e-05, + "loss": 2.4181, + "step": 13548 + }, + { + "epoch": 1.0934549269631184, + "grad_norm": 0.6723688840866089, + "learning_rate": 4.755510271797111e-05, + "loss": 2.5097, + "step": 13549 + }, + { + "epoch": 1.0935356306997015, + "grad_norm": 0.7353307604789734, + "learning_rate": 4.754166174565456e-05, + "loss": 2.4548, + "step": 13550 + }, + { + "epoch": 1.0936163344362844, + "grad_norm": 0.7334069013595581, + "learning_rate": 4.752822208073693e-05, + "loss": 2.5113, + "step": 13551 + }, + { + "epoch": 1.0936970381728675, + "grad_norm": 0.6581420302391052, + "learning_rate": 4.751478372355317e-05, + "loss": 2.4546, + "step": 13552 + }, + { + "epoch": 1.0937777419094503, + "grad_norm": 0.7890802621841431, + "learning_rate": 4.75013466744382e-05, + "loss": 2.4092, + "step": 13553 + }, + { + "epoch": 1.0938584456460334, + "grad_norm": 0.7226595282554626, + "learning_rate": 4.7487910933726895e-05, + "loss": 2.457, + "step": 13554 + }, + { + "epoch": 1.0939391493826165, + "grad_norm": 0.7108014225959778, + "learning_rate": 4.7474476501754165e-05, + "loss": 2.471, + "step": 13555 + }, + { + "epoch": 1.0940198531191994, + "grad_norm": 0.6864863038063049, + "learning_rate": 4.746104337885473e-05, + "loss": 2.4778, + "step": 13556 + }, + { + "epoch": 1.0941005568557824, + "grad_norm": 0.6890624165534973, + "learning_rate": 4.744761156536345e-05, + "loss": 2.456, + "step": 13557 + }, + { + "epoch": 1.0941812605923653, + "grad_norm": 0.7052781581878662, + "learning_rate": 4.743418106161509e-05, + "loss": 2.4796, + "step": 13558 + }, + { + "epoch": 1.0942619643289484, + "grad_norm": 0.6569164991378784, + "learning_rate": 4.742075186794431e-05, + "loss": 2.469, + "step": 13559 + }, + { + "epoch": 1.0943426680655315, + "grad_norm": 0.7302874326705933, + "learning_rate": 
4.7407323984685836e-05, + "loss": 2.4543, + "step": 13560 + }, + { + "epoch": 1.0944233718021144, + "grad_norm": 0.6499345898628235, + "learning_rate": 4.7393897412174335e-05, + "loss": 2.4037, + "step": 13561 + }, + { + "epoch": 1.0945040755386974, + "grad_norm": 0.6643944382667542, + "learning_rate": 4.7380472150744416e-05, + "loss": 2.4067, + "step": 13562 + }, + { + "epoch": 1.0945847792752805, + "grad_norm": 0.7491872906684875, + "learning_rate": 4.736704820073069e-05, + "loss": 2.4277, + "step": 13563 + }, + { + "epoch": 1.0946654830118634, + "grad_norm": 0.7319512367248535, + "learning_rate": 4.735362556246773e-05, + "loss": 2.4588, + "step": 13564 + }, + { + "epoch": 1.0947461867484465, + "grad_norm": 0.7404350638389587, + "learning_rate": 4.734020423629001e-05, + "loss": 2.432, + "step": 13565 + }, + { + "epoch": 1.0948268904850296, + "grad_norm": 0.6462193727493286, + "learning_rate": 4.732678422253206e-05, + "loss": 2.4417, + "step": 13566 + }, + { + "epoch": 1.0949075942216124, + "grad_norm": 0.6711323857307434, + "learning_rate": 4.731336552152836e-05, + "loss": 2.4023, + "step": 13567 + }, + { + "epoch": 1.0949882979581955, + "grad_norm": 0.658261239528656, + "learning_rate": 4.729994813361329e-05, + "loss": 2.4132, + "step": 13568 + }, + { + "epoch": 1.0950690016947784, + "grad_norm": 0.8081904053688049, + "learning_rate": 4.728653205912127e-05, + "loss": 2.4412, + "step": 13569 + }, + { + "epoch": 1.0951497054313615, + "grad_norm": 0.6620786786079407, + "learning_rate": 4.727311729838666e-05, + "loss": 2.4357, + "step": 13570 + }, + { + "epoch": 1.0952304091679446, + "grad_norm": 0.7026848793029785, + "learning_rate": 4.725970385174381e-05, + "loss": 2.4159, + "step": 13571 + }, + { + "epoch": 1.0953111129045274, + "grad_norm": 0.7017392516136169, + "learning_rate": 4.7246291719526995e-05, + "loss": 2.4253, + "step": 13572 + }, + { + "epoch": 1.0953918166411105, + "grad_norm": 0.710172712802887, + "learning_rate": 4.7232880902070483e-05, + "loss": 
2.4057, + "step": 13573 + }, + { + "epoch": 1.0954725203776934, + "grad_norm": 0.7208876013755798, + "learning_rate": 4.721947139970856e-05, + "loss": 2.4803, + "step": 13574 + }, + { + "epoch": 1.0955532241142765, + "grad_norm": 0.693219006061554, + "learning_rate": 4.720606321277534e-05, + "loss": 2.3611, + "step": 13575 + }, + { + "epoch": 1.0956339278508596, + "grad_norm": 0.737206757068634, + "learning_rate": 4.7192656341605026e-05, + "loss": 2.3873, + "step": 13576 + }, + { + "epoch": 1.0957146315874424, + "grad_norm": 0.6605268120765686, + "learning_rate": 4.717925078653179e-05, + "loss": 2.4155, + "step": 13577 + }, + { + "epoch": 1.0957953353240255, + "grad_norm": 0.7143047451972961, + "learning_rate": 4.716584654788967e-05, + "loss": 2.4526, + "step": 13578 + }, + { + "epoch": 1.0958760390606086, + "grad_norm": 0.6980953216552734, + "learning_rate": 4.715244362601277e-05, + "loss": 2.4422, + "step": 13579 + }, + { + "epoch": 1.0959567427971915, + "grad_norm": 0.6852009892463684, + "learning_rate": 4.713904202123515e-05, + "loss": 2.4599, + "step": 13580 + }, + { + "epoch": 1.0960374465337746, + "grad_norm": 0.7436656355857849, + "learning_rate": 4.712564173389074e-05, + "loss": 2.4441, + "step": 13581 + }, + { + "epoch": 1.0961181502703574, + "grad_norm": 0.7090624570846558, + "learning_rate": 4.711224276431352e-05, + "loss": 2.4741, + "step": 13582 + }, + { + "epoch": 1.0961988540069405, + "grad_norm": 0.6611043810844421, + "learning_rate": 4.709884511283753e-05, + "loss": 2.4589, + "step": 13583 + }, + { + "epoch": 1.0962795577435236, + "grad_norm": 0.6932426691055298, + "learning_rate": 4.708544877979658e-05, + "loss": 2.4199, + "step": 13584 + }, + { + "epoch": 1.0963602614801065, + "grad_norm": 0.7629422545433044, + "learning_rate": 4.707205376552456e-05, + "loss": 2.4588, + "step": 13585 + }, + { + "epoch": 1.0964409652166895, + "grad_norm": 0.8116739392280579, + "learning_rate": 4.705866007035531e-05, + "loss": 2.472, + "step": 13586 + }, + { + 
"epoch": 1.0965216689532726, + "grad_norm": 0.6711297631263733, + "learning_rate": 4.704526769462269e-05, + "loss": 2.4086, + "step": 13587 + }, + { + "epoch": 1.0966023726898555, + "grad_norm": 0.716015636920929, + "learning_rate": 4.703187663866037e-05, + "loss": 2.4411, + "step": 13588 + }, + { + "epoch": 1.0966830764264386, + "grad_norm": 0.6982430219650269, + "learning_rate": 4.701848690280215e-05, + "loss": 2.4438, + "step": 13589 + }, + { + "epoch": 1.0967637801630215, + "grad_norm": 0.7183159589767456, + "learning_rate": 4.7005098487381785e-05, + "loss": 2.4464, + "step": 13590 + }, + { + "epoch": 1.0968444838996045, + "grad_norm": 0.6983399391174316, + "learning_rate": 4.699171139273284e-05, + "loss": 2.4354, + "step": 13591 + }, + { + "epoch": 1.0969251876361876, + "grad_norm": 0.7157938480377197, + "learning_rate": 4.697832561918901e-05, + "loss": 2.4393, + "step": 13592 + }, + { + "epoch": 1.0970058913727705, + "grad_norm": 0.6991363763809204, + "learning_rate": 4.696494116708392e-05, + "loss": 2.4723, + "step": 13593 + }, + { + "epoch": 1.0970865951093536, + "grad_norm": 0.6722309589385986, + "learning_rate": 4.695155803675112e-05, + "loss": 2.447, + "step": 13594 + }, + { + "epoch": 1.0971672988459367, + "grad_norm": 0.6492688655853271, + "learning_rate": 4.6938176228524175e-05, + "loss": 2.4213, + "step": 13595 + }, + { + "epoch": 1.0972480025825195, + "grad_norm": 0.6941642165184021, + "learning_rate": 4.6924795742736616e-05, + "loss": 2.4714, + "step": 13596 + }, + { + "epoch": 1.0973287063191026, + "grad_norm": 0.7506042122840881, + "learning_rate": 4.691141657972185e-05, + "loss": 2.4563, + "step": 13597 + }, + { + "epoch": 1.0974094100556855, + "grad_norm": 0.7032836675643921, + "learning_rate": 4.6898038739813356e-05, + "loss": 2.4824, + "step": 13598 + }, + { + "epoch": 1.0974901137922686, + "grad_norm": 0.6908734440803528, + "learning_rate": 4.6884662223344575e-05, + "loss": 2.4486, + "step": 13599 + }, + { + "epoch": 1.0975708175288517, + 
"grad_norm": 0.714971661567688, + "learning_rate": 4.687128703064883e-05, + "loss": 2.4372, + "step": 13600 + }, + { + "epoch": 1.0976515212654345, + "grad_norm": 0.6989198327064514, + "learning_rate": 4.6857913162059486e-05, + "loss": 2.395, + "step": 13601 + }, + { + "epoch": 1.0977322250020176, + "grad_norm": 0.7163406014442444, + "learning_rate": 4.684454061790987e-05, + "loss": 2.4868, + "step": 13602 + }, + { + "epoch": 1.0978129287386005, + "grad_norm": 0.6600626707077026, + "learning_rate": 4.6831169398533245e-05, + "loss": 2.5134, + "step": 13603 + }, + { + "epoch": 1.0978936324751836, + "grad_norm": 0.6657080054283142, + "learning_rate": 4.681779950426286e-05, + "loss": 2.4701, + "step": 13604 + }, + { + "epoch": 1.0979743362117667, + "grad_norm": 0.665860116481781, + "learning_rate": 4.680443093543194e-05, + "loss": 2.4593, + "step": 13605 + }, + { + "epoch": 1.0980550399483495, + "grad_norm": 0.7000327110290527, + "learning_rate": 4.679106369237368e-05, + "loss": 2.4523, + "step": 13606 + }, + { + "epoch": 1.0981357436849326, + "grad_norm": 0.6969157457351685, + "learning_rate": 4.677769777542118e-05, + "loss": 2.4935, + "step": 13607 + }, + { + "epoch": 1.0982164474215157, + "grad_norm": 0.6864836812019348, + "learning_rate": 4.676433318490757e-05, + "loss": 2.457, + "step": 13608 + }, + { + "epoch": 1.0982971511580986, + "grad_norm": 0.7331364750862122, + "learning_rate": 4.675096992116598e-05, + "loss": 2.4253, + "step": 13609 + }, + { + "epoch": 1.0983778548946816, + "grad_norm": 0.75, + "learning_rate": 4.673760798452936e-05, + "loss": 2.4147, + "step": 13610 + }, + { + "epoch": 1.0984585586312647, + "grad_norm": 0.6589440703392029, + "learning_rate": 4.6724247375330786e-05, + "loss": 2.4718, + "step": 13611 + }, + { + "epoch": 1.0985392623678476, + "grad_norm": 0.7032667994499207, + "learning_rate": 4.671088809390324e-05, + "loss": 2.4724, + "step": 13612 + }, + { + "epoch": 1.0986199661044307, + "grad_norm": 0.7544135451316833, + "learning_rate": 
4.6697530140579646e-05, + "loss": 2.4804, + "step": 13613 + }, + { + "epoch": 1.0987006698410136, + "grad_norm": 0.6503081917762756, + "learning_rate": 4.668417351569295e-05, + "loss": 2.3829, + "step": 13614 + }, + { + "epoch": 1.0987813735775966, + "grad_norm": 0.6928786039352417, + "learning_rate": 4.667081821957605e-05, + "loss": 2.5678, + "step": 13615 + }, + { + "epoch": 1.0988620773141797, + "grad_norm": 0.6652864217758179, + "learning_rate": 4.665746425256173e-05, + "loss": 2.4585, + "step": 13616 + }, + { + "epoch": 1.0989427810507626, + "grad_norm": 0.700265109539032, + "learning_rate": 4.664411161498283e-05, + "loss": 2.4785, + "step": 13617 + }, + { + "epoch": 1.0990234847873457, + "grad_norm": 0.7443608045578003, + "learning_rate": 4.663076030717216e-05, + "loss": 2.4869, + "step": 13618 + }, + { + "epoch": 1.0991041885239285, + "grad_norm": 0.7037705779075623, + "learning_rate": 4.6617410329462477e-05, + "loss": 2.4518, + "step": 13619 + }, + { + "epoch": 1.0991848922605116, + "grad_norm": 0.7528365850448608, + "learning_rate": 4.660406168218643e-05, + "loss": 2.4616, + "step": 13620 + }, + { + "epoch": 1.0992655959970947, + "grad_norm": 0.7149221301078796, + "learning_rate": 4.659071436567676e-05, + "loss": 2.4661, + "step": 13621 + }, + { + "epoch": 1.0993462997336776, + "grad_norm": 0.7212862968444824, + "learning_rate": 4.657736838026608e-05, + "loss": 2.4424, + "step": 13622 + }, + { + "epoch": 1.0994270034702607, + "grad_norm": 0.6934216022491455, + "learning_rate": 4.6564023726287045e-05, + "loss": 2.4633, + "step": 13623 + }, + { + "epoch": 1.0995077072068438, + "grad_norm": 0.7244036793708801, + "learning_rate": 4.655068040407221e-05, + "loss": 2.409, + "step": 13624 + }, + { + "epoch": 1.0995884109434266, + "grad_norm": 0.6911318898200989, + "learning_rate": 4.653733841395419e-05, + "loss": 2.5117, + "step": 13625 + }, + { + "epoch": 1.0996691146800097, + "grad_norm": 0.7579816579818726, + "learning_rate": 4.65239977562654e-05, + "loss": 
2.4927, + "step": 13626 + }, + { + "epoch": 1.0997498184165928, + "grad_norm": 0.7699651122093201, + "learning_rate": 4.651065843133837e-05, + "loss": 2.4083, + "step": 13627 + }, + { + "epoch": 1.0998305221531757, + "grad_norm": 0.6669431328773499, + "learning_rate": 4.649732043950561e-05, + "loss": 2.4402, + "step": 13628 + }, + { + "epoch": 1.0999112258897588, + "grad_norm": 0.7134940028190613, + "learning_rate": 4.6483983781099426e-05, + "loss": 2.4275, + "step": 13629 + }, + { + "epoch": 1.0999919296263416, + "grad_norm": 0.7107651233673096, + "learning_rate": 4.647064845645227e-05, + "loss": 2.4654, + "step": 13630 + }, + { + "epoch": 1.1000726333629247, + "grad_norm": 0.7101391553878784, + "learning_rate": 4.645731446589652e-05, + "loss": 2.4357, + "step": 13631 + }, + { + "epoch": 1.1001533370995078, + "grad_norm": 0.7511606216430664, + "learning_rate": 4.6443981809764405e-05, + "loss": 2.5016, + "step": 13632 + }, + { + "epoch": 1.1002340408360907, + "grad_norm": 0.7315953373908997, + "learning_rate": 4.6430650488388226e-05, + "loss": 2.4541, + "step": 13633 + }, + { + "epoch": 1.1003147445726738, + "grad_norm": 0.6701769232749939, + "learning_rate": 4.6417320502100316e-05, + "loss": 2.4071, + "step": 13634 + }, + { + "epoch": 1.1003954483092566, + "grad_norm": 0.7164294123649597, + "learning_rate": 4.6403991851232876e-05, + "loss": 2.478, + "step": 13635 + }, + { + "epoch": 1.1004761520458397, + "grad_norm": 0.7003894448280334, + "learning_rate": 4.639066453611802e-05, + "loss": 2.4686, + "step": 13636 + }, + { + "epoch": 1.1005568557824228, + "grad_norm": 0.6855250000953674, + "learning_rate": 4.6377338557087957e-05, + "loss": 2.4531, + "step": 13637 + }, + { + "epoch": 1.1006375595190057, + "grad_norm": 0.6581299901008606, + "learning_rate": 4.6364013914474816e-05, + "loss": 2.4511, + "step": 13638 + }, + { + "epoch": 1.1007182632555887, + "grad_norm": 0.7599080204963684, + "learning_rate": 4.6350690608610604e-05, + "loss": 2.5143, + "step": 13639 + }, 
+ { + "epoch": 1.1007989669921718, + "grad_norm": 0.7029981017112732, + "learning_rate": 4.633736863982744e-05, + "loss": 2.4541, + "step": 13640 + }, + { + "epoch": 1.1008796707287547, + "grad_norm": 0.7378708720207214, + "learning_rate": 4.6324048008457357e-05, + "loss": 2.4319, + "step": 13641 + }, + { + "epoch": 1.1009603744653378, + "grad_norm": 0.7087826728820801, + "learning_rate": 4.631072871483226e-05, + "loss": 2.4148, + "step": 13642 + }, + { + "epoch": 1.1010410782019207, + "grad_norm": 0.7000819444656372, + "learning_rate": 4.629741075928415e-05, + "loss": 2.4692, + "step": 13643 + }, + { + "epoch": 1.1011217819385037, + "grad_norm": 0.7363965511322021, + "learning_rate": 4.628409414214496e-05, + "loss": 2.4584, + "step": 13644 + }, + { + "epoch": 1.1012024856750868, + "grad_norm": 0.6691753268241882, + "learning_rate": 4.627077886374656e-05, + "loss": 2.4356, + "step": 13645 + }, + { + "epoch": 1.1012831894116697, + "grad_norm": 0.6864185929298401, + "learning_rate": 4.625746492442078e-05, + "loss": 2.4713, + "step": 13646 + }, + { + "epoch": 1.1013638931482528, + "grad_norm": 0.714318573474884, + "learning_rate": 4.624415232449947e-05, + "loss": 2.4482, + "step": 13647 + }, + { + "epoch": 1.1014445968848359, + "grad_norm": 0.6383495330810547, + "learning_rate": 4.623084106431444e-05, + "loss": 2.4248, + "step": 13648 + }, + { + "epoch": 1.1015253006214187, + "grad_norm": 0.7014495730400085, + "learning_rate": 4.6217531144197365e-05, + "loss": 2.4393, + "step": 13649 + }, + { + "epoch": 1.1016060043580018, + "grad_norm": 0.8128634095191956, + "learning_rate": 4.620422256448e-05, + "loss": 2.4741, + "step": 13650 + }, + { + "epoch": 1.1016867080945847, + "grad_norm": 0.7333208322525024, + "learning_rate": 4.619091532549408e-05, + "loss": 2.4288, + "step": 13651 + }, + { + "epoch": 1.1017674118311678, + "grad_norm": 0.7023218274116516, + "learning_rate": 4.617760942757117e-05, + "loss": 2.5025, + "step": 13652 + }, + { + "epoch": 1.1018481155677509, + 
"grad_norm": 0.6420873403549194, + "learning_rate": 4.616430487104292e-05, + "loss": 2.4165, + "step": 13653 + }, + { + "epoch": 1.1019288193043337, + "grad_norm": 0.6767684817314148, + "learning_rate": 4.615100165624092e-05, + "loss": 2.4642, + "step": 13654 + }, + { + "epoch": 1.1020095230409168, + "grad_norm": 0.7361159920692444, + "learning_rate": 4.613769978349672e-05, + "loss": 2.5343, + "step": 13655 + }, + { + "epoch": 1.1020902267775, + "grad_norm": 0.6642624735832214, + "learning_rate": 4.6124399253141846e-05, + "loss": 2.3769, + "step": 13656 + }, + { + "epoch": 1.1021709305140828, + "grad_norm": 0.6912256479263306, + "learning_rate": 4.611110006550781e-05, + "loss": 2.455, + "step": 13657 + }, + { + "epoch": 1.1022516342506659, + "grad_norm": 0.7419310212135315, + "learning_rate": 4.609780222092599e-05, + "loss": 2.4171, + "step": 13658 + }, + { + "epoch": 1.1023323379872487, + "grad_norm": 0.718953549861908, + "learning_rate": 4.6084505719727835e-05, + "loss": 2.4791, + "step": 13659 + }, + { + "epoch": 1.1024130417238318, + "grad_norm": 0.7904248237609863, + "learning_rate": 4.607121056224477e-05, + "loss": 2.4429, + "step": 13660 + }, + { + "epoch": 1.102493745460415, + "grad_norm": 0.6743534803390503, + "learning_rate": 4.605791674880808e-05, + "loss": 2.4481, + "step": 13661 + }, + { + "epoch": 1.1025744491969978, + "grad_norm": 0.6829143166542053, + "learning_rate": 4.6044624279749106e-05, + "loss": 2.4078, + "step": 13662 + }, + { + "epoch": 1.1026551529335809, + "grad_norm": 0.6803167462348938, + "learning_rate": 4.6031333155399136e-05, + "loss": 2.4509, + "step": 13663 + }, + { + "epoch": 1.1027358566701637, + "grad_norm": 0.7474592328071594, + "learning_rate": 4.601804337608943e-05, + "loss": 2.4563, + "step": 13664 + }, + { + "epoch": 1.1028165604067468, + "grad_norm": 0.6753630042076111, + "learning_rate": 4.6004754942151174e-05, + "loss": 2.4285, + "step": 13665 + }, + { + "epoch": 1.10289726414333, + "grad_norm": 0.7990161180496216, + 
"learning_rate": 4.599146785391558e-05, + "loss": 2.4907, + "step": 13666 + }, + { + "epoch": 1.1029779678799128, + "grad_norm": 0.8161290287971497, + "learning_rate": 4.597818211171383e-05, + "loss": 2.4599, + "step": 13667 + }, + { + "epoch": 1.1030586716164958, + "grad_norm": 0.6813610792160034, + "learning_rate": 4.596489771587695e-05, + "loss": 2.4484, + "step": 13668 + }, + { + "epoch": 1.103139375353079, + "grad_norm": 0.6598966121673584, + "learning_rate": 4.5951614666736076e-05, + "loss": 2.4326, + "step": 13669 + }, + { + "epoch": 1.1032200790896618, + "grad_norm": 0.7084827423095703, + "learning_rate": 4.593833296462228e-05, + "loss": 2.4188, + "step": 13670 + }, + { + "epoch": 1.1033007828262449, + "grad_norm": 0.6876685619354248, + "learning_rate": 4.59250526098665e-05, + "loss": 2.4482, + "step": 13671 + }, + { + "epoch": 1.103381486562828, + "grad_norm": 0.7292699813842773, + "learning_rate": 4.591177360279978e-05, + "loss": 2.4452, + "step": 13672 + }, + { + "epoch": 1.1034621902994108, + "grad_norm": 0.7057675123214722, + "learning_rate": 4.589849594375304e-05, + "loss": 2.4336, + "step": 13673 + }, + { + "epoch": 1.103542894035994, + "grad_norm": 0.7684180736541748, + "learning_rate": 4.5885219633057196e-05, + "loss": 2.4453, + "step": 13674 + }, + { + "epoch": 1.1036235977725768, + "grad_norm": 0.7107112407684326, + "learning_rate": 4.5871944671043154e-05, + "loss": 2.4116, + "step": 13675 + }, + { + "epoch": 1.1037043015091599, + "grad_norm": 0.659501314163208, + "learning_rate": 4.585867105804177e-05, + "loss": 2.4907, + "step": 13676 + }, + { + "epoch": 1.103785005245743, + "grad_norm": 0.7553967833518982, + "learning_rate": 4.5845398794383786e-05, + "loss": 2.3982, + "step": 13677 + }, + { + "epoch": 1.1038657089823258, + "grad_norm": 0.6861104965209961, + "learning_rate": 4.583212788040003e-05, + "loss": 2.416, + "step": 13678 + }, + { + "epoch": 1.103946412718909, + "grad_norm": 0.6546811461448669, + "learning_rate": 4.5818858316421254e-05, 
+ "loss": 2.4506, + "step": 13679 + }, + { + "epoch": 1.1040271164554918, + "grad_norm": 0.7012909650802612, + "learning_rate": 4.58055901027782e-05, + "loss": 2.439, + "step": 13680 + }, + { + "epoch": 1.1041078201920749, + "grad_norm": 0.7594780325889587, + "learning_rate": 4.5792323239801446e-05, + "loss": 2.4437, + "step": 13681 + }, + { + "epoch": 1.104188523928658, + "grad_norm": 0.6576492190361023, + "learning_rate": 4.577905772782172e-05, + "loss": 2.443, + "step": 13682 + }, + { + "epoch": 1.1042692276652408, + "grad_norm": 0.6751925349235535, + "learning_rate": 4.576579356716963e-05, + "loss": 2.507, + "step": 13683 + }, + { + "epoch": 1.104349931401824, + "grad_norm": 0.7206710577011108, + "learning_rate": 4.575253075817567e-05, + "loss": 2.4236, + "step": 13684 + }, + { + "epoch": 1.104430635138407, + "grad_norm": 0.7736170291900635, + "learning_rate": 4.5739269301170485e-05, + "loss": 2.4095, + "step": 13685 + }, + { + "epoch": 1.1045113388749899, + "grad_norm": 0.6901736855506897, + "learning_rate": 4.572600919648457e-05, + "loss": 2.4519, + "step": 13686 + }, + { + "epoch": 1.104592042611573, + "grad_norm": 0.7762539982795715, + "learning_rate": 4.571275044444836e-05, + "loss": 2.5018, + "step": 13687 + }, + { + "epoch": 1.1046727463481558, + "grad_norm": 0.7231423854827881, + "learning_rate": 4.569949304539232e-05, + "loss": 2.4553, + "step": 13688 + }, + { + "epoch": 1.104753450084739, + "grad_norm": 0.7713531255722046, + "learning_rate": 4.568623699964688e-05, + "loss": 2.49, + "step": 13689 + }, + { + "epoch": 1.104834153821322, + "grad_norm": 0.7355079650878906, + "learning_rate": 4.5672982307542354e-05, + "loss": 2.5191, + "step": 13690 + }, + { + "epoch": 1.1049148575579049, + "grad_norm": 0.6916452050209045, + "learning_rate": 4.565972896940913e-05, + "loss": 2.3867, + "step": 13691 + }, + { + "epoch": 1.104995561294488, + "grad_norm": 0.6622549295425415, + "learning_rate": 4.5646476985577544e-05, + "loss": 2.4364, + "step": 13692 + }, + { + 
"epoch": 1.105076265031071, + "grad_norm": 0.6683297157287598, + "learning_rate": 4.563322635637779e-05, + "loss": 2.43, + "step": 13693 + }, + { + "epoch": 1.105156968767654, + "grad_norm": 0.6857880353927612, + "learning_rate": 4.561997708214015e-05, + "loss": 2.4515, + "step": 13694 + }, + { + "epoch": 1.105237672504237, + "grad_norm": 0.7473817467689514, + "learning_rate": 4.5606729163194807e-05, + "loss": 2.442, + "step": 13695 + }, + { + "epoch": 1.1053183762408199, + "grad_norm": 0.6988846063613892, + "learning_rate": 4.559348259987203e-05, + "loss": 2.3886, + "step": 13696 + }, + { + "epoch": 1.105399079977403, + "grad_norm": 0.6450650691986084, + "learning_rate": 4.5580237392501836e-05, + "loss": 2.4647, + "step": 13697 + }, + { + "epoch": 1.105479783713986, + "grad_norm": 0.7669623494148254, + "learning_rate": 4.556699354141439e-05, + "loss": 2.4362, + "step": 13698 + }, + { + "epoch": 1.105560487450569, + "grad_norm": 0.7019730806350708, + "learning_rate": 4.55537510469398e-05, + "loss": 2.49, + "step": 13699 + }, + { + "epoch": 1.105641191187152, + "grad_norm": 0.6736636757850647, + "learning_rate": 4.5540509909408e-05, + "loss": 2.43, + "step": 13700 + }, + { + "epoch": 1.105721894923735, + "grad_norm": 0.6872034668922424, + "learning_rate": 4.552727012914907e-05, + "loss": 2.4507, + "step": 13701 + }, + { + "epoch": 1.105802598660318, + "grad_norm": 0.6726621985435486, + "learning_rate": 4.5514031706492986e-05, + "loss": 2.4193, + "step": 13702 + }, + { + "epoch": 1.105883302396901, + "grad_norm": 0.7345453500747681, + "learning_rate": 4.550079464176963e-05, + "loss": 2.4257, + "step": 13703 + }, + { + "epoch": 1.105964006133484, + "grad_norm": 0.6764804124832153, + "learning_rate": 4.548755893530894e-05, + "loss": 2.4656, + "step": 13704 + }, + { + "epoch": 1.106044709870067, + "grad_norm": 0.6915058493614197, + "learning_rate": 4.5474324587440766e-05, + "loss": 2.4148, + "step": 13705 + }, + { + "epoch": 1.10612541360665, + "grad_norm": 
0.7960236668586731, + "learning_rate": 4.5461091598494954e-05, + "loss": 2.4148, + "step": 13706 + }, + { + "epoch": 1.106206117343233, + "grad_norm": 0.7058970928192139, + "learning_rate": 4.544785996880131e-05, + "loss": 2.4795, + "step": 13707 + }, + { + "epoch": 1.106286821079816, + "grad_norm": 0.6979549527168274, + "learning_rate": 4.5434629698689634e-05, + "loss": 2.4329, + "step": 13708 + }, + { + "epoch": 1.1063675248163989, + "grad_norm": 0.6805241107940674, + "learning_rate": 4.5421400788489586e-05, + "loss": 2.4303, + "step": 13709 + }, + { + "epoch": 1.106448228552982, + "grad_norm": 0.7566354274749756, + "learning_rate": 4.5408173238530905e-05, + "loss": 2.4769, + "step": 13710 + }, + { + "epoch": 1.106528932289565, + "grad_norm": 0.647773802280426, + "learning_rate": 4.539494704914324e-05, + "loss": 2.4037, + "step": 13711 + }, + { + "epoch": 1.106609636026148, + "grad_norm": 0.7248135209083557, + "learning_rate": 4.538172222065628e-05, + "loss": 2.4366, + "step": 13712 + }, + { + "epoch": 1.106690339762731, + "grad_norm": 0.6861057281494141, + "learning_rate": 4.536849875339953e-05, + "loss": 2.456, + "step": 13713 + }, + { + "epoch": 1.106771043499314, + "grad_norm": 0.7386166453361511, + "learning_rate": 4.5355276647702605e-05, + "loss": 2.4806, + "step": 13714 + }, + { + "epoch": 1.106851747235897, + "grad_norm": 0.664402961730957, + "learning_rate": 4.534205590389503e-05, + "loss": 2.4846, + "step": 13715 + }, + { + "epoch": 1.10693245097248, + "grad_norm": 0.8123969435691833, + "learning_rate": 4.5328836522306296e-05, + "loss": 2.4945, + "step": 13716 + }, + { + "epoch": 1.1070131547090631, + "grad_norm": 0.7375624775886536, + "learning_rate": 4.5315618503265865e-05, + "loss": 2.4533, + "step": 13717 + }, + { + "epoch": 1.107093858445646, + "grad_norm": 0.70960932970047, + "learning_rate": 4.53024018471032e-05, + "loss": 2.4351, + "step": 13718 + }, + { + "epoch": 1.107174562182229, + "grad_norm": 0.7170885801315308, + "learning_rate": 
4.5289186554147645e-05, + "loss": 2.4654, + "step": 13719 + }, + { + "epoch": 1.107255265918812, + "grad_norm": 0.6986895203590393, + "learning_rate": 4.5275972624728556e-05, + "loss": 2.4079, + "step": 13720 + }, + { + "epoch": 1.107335969655395, + "grad_norm": 0.6948813796043396, + "learning_rate": 4.526276005917532e-05, + "loss": 2.4981, + "step": 13721 + }, + { + "epoch": 1.1074166733919781, + "grad_norm": 0.7719457149505615, + "learning_rate": 4.524954885781717e-05, + "loss": 2.4853, + "step": 13722 + }, + { + "epoch": 1.107497377128561, + "grad_norm": 0.652686357498169, + "learning_rate": 4.5236339020983363e-05, + "loss": 2.3672, + "step": 13723 + }, + { + "epoch": 1.107578080865144, + "grad_norm": 0.7517427802085876, + "learning_rate": 4.5223130549003144e-05, + "loss": 2.3947, + "step": 13724 + }, + { + "epoch": 1.107658784601727, + "grad_norm": 0.6755498647689819, + "learning_rate": 4.5209923442205705e-05, + "loss": 2.4173, + "step": 13725 + }, + { + "epoch": 1.10773948833831, + "grad_norm": 0.6801806688308716, + "learning_rate": 4.519671770092019e-05, + "loss": 2.4366, + "step": 13726 + }, + { + "epoch": 1.1078201920748931, + "grad_norm": 0.6665045619010925, + "learning_rate": 4.5183513325475724e-05, + "loss": 2.4797, + "step": 13727 + }, + { + "epoch": 1.107900895811476, + "grad_norm": 0.7303451299667358, + "learning_rate": 4.517031031620145e-05, + "loss": 2.4487, + "step": 13728 + }, + { + "epoch": 1.107981599548059, + "grad_norm": 0.7241206765174866, + "learning_rate": 4.515710867342632e-05, + "loss": 2.4632, + "step": 13729 + }, + { + "epoch": 1.1080623032846422, + "grad_norm": 0.738835334777832, + "learning_rate": 4.514390839747941e-05, + "loss": 2.3937, + "step": 13730 + }, + { + "epoch": 1.108143007021225, + "grad_norm": 0.7062843441963196, + "learning_rate": 4.5130709488689726e-05, + "loss": 2.4576, + "step": 13731 + }, + { + "epoch": 1.1082237107578081, + "grad_norm": 0.7074100971221924, + "learning_rate": 4.511751194738616e-05, + "loss": 2.4843, 
+ "step": 13732 + }, + { + "epoch": 1.108304414494391, + "grad_norm": 0.751742959022522, + "learning_rate": 4.510431577389765e-05, + "loss": 2.4607, + "step": 13733 + }, + { + "epoch": 1.108385118230974, + "grad_norm": 0.7370054125785828, + "learning_rate": 4.50911209685531e-05, + "loss": 2.4877, + "step": 13734 + }, + { + "epoch": 1.1084658219675572, + "grad_norm": 0.6410251259803772, + "learning_rate": 4.507792753168135e-05, + "loss": 2.4254, + "step": 13735 + }, + { + "epoch": 1.10854652570414, + "grad_norm": 0.7141317129135132, + "learning_rate": 4.506473546361121e-05, + "loss": 2.4962, + "step": 13736 + }, + { + "epoch": 1.1086272294407231, + "grad_norm": 0.6903412342071533, + "learning_rate": 4.50515447646715e-05, + "loss": 2.4315, + "step": 13737 + }, + { + "epoch": 1.1087079331773062, + "grad_norm": 0.7068564891815186, + "learning_rate": 4.50383554351909e-05, + "loss": 2.5795, + "step": 13738 + }, + { + "epoch": 1.108788636913889, + "grad_norm": 0.6880627274513245, + "learning_rate": 4.5025167475498154e-05, + "loss": 2.4399, + "step": 13739 + }, + { + "epoch": 1.1088693406504722, + "grad_norm": 0.6721192598342896, + "learning_rate": 4.5011980885921965e-05, + "loss": 2.4651, + "step": 13740 + }, + { + "epoch": 1.108950044387055, + "grad_norm": 0.7084259986877441, + "learning_rate": 4.499879566679093e-05, + "loss": 2.4121, + "step": 13741 + }, + { + "epoch": 1.109030748123638, + "grad_norm": 0.6809335947036743, + "learning_rate": 4.498561181843368e-05, + "loss": 2.4714, + "step": 13742 + }, + { + "epoch": 1.1091114518602212, + "grad_norm": 0.690416693687439, + "learning_rate": 4.497242934117879e-05, + "loss": 2.4744, + "step": 13743 + }, + { + "epoch": 1.109192155596804, + "grad_norm": 0.728522002696991, + "learning_rate": 4.495924823535483e-05, + "loss": 2.4374, + "step": 13744 + }, + { + "epoch": 1.1092728593333872, + "grad_norm": 0.7000796794891357, + "learning_rate": 4.494606850129026e-05, + "loss": 2.4635, + "step": 13745 + }, + { + "epoch": 
1.1093535630699702, + "grad_norm": 0.824645459651947, + "learning_rate": 4.493289013931353e-05, + "loss": 2.3724, + "step": 13746 + }, + { + "epoch": 1.109434266806553, + "grad_norm": 0.6561198830604553, + "learning_rate": 4.491971314975321e-05, + "loss": 2.3726, + "step": 13747 + }, + { + "epoch": 1.1095149705431362, + "grad_norm": 0.7067599892616272, + "learning_rate": 4.490653753293757e-05, + "loss": 2.4285, + "step": 13748 + }, + { + "epoch": 1.109595674279719, + "grad_norm": 0.6954898834228516, + "learning_rate": 4.489336328919503e-05, + "loss": 2.4252, + "step": 13749 + }, + { + "epoch": 1.1096763780163021, + "grad_norm": 0.6683667302131653, + "learning_rate": 4.4880190418853974e-05, + "loss": 2.4815, + "step": 13750 + }, + { + "epoch": 1.1097570817528852, + "grad_norm": 0.7554971575737, + "learning_rate": 4.486701892224261e-05, + "loss": 2.5036, + "step": 13751 + }, + { + "epoch": 1.109837785489468, + "grad_norm": 0.7043242454528809, + "learning_rate": 4.485384879968926e-05, + "loss": 2.3757, + "step": 13752 + }, + { + "epoch": 1.1099184892260512, + "grad_norm": 0.8016893863677979, + "learning_rate": 4.4840680051522186e-05, + "loss": 2.4655, + "step": 13753 + }, + { + "epoch": 1.1099991929626343, + "grad_norm": 0.7022131085395813, + "learning_rate": 4.4827512678069515e-05, + "loss": 2.475, + "step": 13754 + }, + { + "epoch": 1.1100798966992171, + "grad_norm": 0.6963247656822205, + "learning_rate": 4.4814346679659455e-05, + "loss": 2.4866, + "step": 13755 + }, + { + "epoch": 1.1101606004358002, + "grad_norm": 0.6980907917022705, + "learning_rate": 4.4801182056620125e-05, + "loss": 2.4322, + "step": 13756 + }, + { + "epoch": 1.110241304172383, + "grad_norm": 0.68063884973526, + "learning_rate": 4.478801880927964e-05, + "loss": 2.426, + "step": 13757 + }, + { + "epoch": 1.1103220079089662, + "grad_norm": 0.7454195618629456, + "learning_rate": 4.477485693796605e-05, + "loss": 2.5042, + "step": 13758 + }, + { + "epoch": 1.1104027116455493, + "grad_norm": 
0.685975193977356, + "learning_rate": 4.476169644300737e-05, + "loss": 2.4874, + "step": 13759 + }, + { + "epoch": 1.1104834153821321, + "grad_norm": 0.7060961723327637, + "learning_rate": 4.4748537324731664e-05, + "loss": 2.4126, + "step": 13760 + }, + { + "epoch": 1.1105641191187152, + "grad_norm": 0.6794416904449463, + "learning_rate": 4.4735379583466795e-05, + "loss": 2.4112, + "step": 13761 + }, + { + "epoch": 1.1106448228552983, + "grad_norm": 0.6854961514472961, + "learning_rate": 4.472222321954073e-05, + "loss": 2.4909, + "step": 13762 + }, + { + "epoch": 1.1107255265918812, + "grad_norm": 0.7660776972770691, + "learning_rate": 4.470906823328139e-05, + "loss": 2.5021, + "step": 13763 + }, + { + "epoch": 1.1108062303284643, + "grad_norm": 0.7027743458747864, + "learning_rate": 4.4695914625016564e-05, + "loss": 2.4375, + "step": 13764 + }, + { + "epoch": 1.1108869340650471, + "grad_norm": 0.6896719336509705, + "learning_rate": 4.468276239507413e-05, + "loss": 2.4574, + "step": 13765 + }, + { + "epoch": 1.1109676378016302, + "grad_norm": 0.685141384601593, + "learning_rate": 4.4669611543781844e-05, + "loss": 2.4311, + "step": 13766 + }, + { + "epoch": 1.1110483415382133, + "grad_norm": 0.7108263373374939, + "learning_rate": 4.465646207146746e-05, + "loss": 2.4565, + "step": 13767 + }, + { + "epoch": 1.1111290452747962, + "grad_norm": 0.63578861951828, + "learning_rate": 4.464331397845873e-05, + "loss": 2.449, + "step": 13768 + }, + { + "epoch": 1.1112097490113793, + "grad_norm": 0.6917306780815125, + "learning_rate": 4.463016726508335e-05, + "loss": 2.4681, + "step": 13769 + }, + { + "epoch": 1.1112904527479621, + "grad_norm": 0.7328054308891296, + "learning_rate": 4.4617021931668914e-05, + "loss": 2.404, + "step": 13770 + }, + { + "epoch": 1.1113711564845452, + "grad_norm": 0.6501660943031311, + "learning_rate": 4.460387797854305e-05, + "loss": 2.4228, + "step": 13771 + }, + { + "epoch": 1.1114518602211283, + "grad_norm": 0.6656771302223206, + 
"learning_rate": 4.459073540603336e-05, + "loss": 2.4814, + "step": 13772 + }, + { + "epoch": 1.1115325639577112, + "grad_norm": 0.671017587184906, + "learning_rate": 4.457759421446742e-05, + "loss": 2.4605, + "step": 13773 + }, + { + "epoch": 1.1116132676942942, + "grad_norm": 0.6715343594551086, + "learning_rate": 4.456445440417267e-05, + "loss": 2.424, + "step": 13774 + }, + { + "epoch": 1.1116939714308773, + "grad_norm": 0.7051515579223633, + "learning_rate": 4.4551315975476626e-05, + "loss": 2.4358, + "step": 13775 + }, + { + "epoch": 1.1117746751674602, + "grad_norm": 0.7810437679290771, + "learning_rate": 4.453817892870673e-05, + "loss": 2.4718, + "step": 13776 + }, + { + "epoch": 1.1118553789040433, + "grad_norm": 0.7072561383247375, + "learning_rate": 4.4525043264190405e-05, + "loss": 2.4429, + "step": 13777 + }, + { + "epoch": 1.1119360826406264, + "grad_norm": 0.7949702143669128, + "learning_rate": 4.4511908982255e-05, + "loss": 2.4413, + "step": 13778 + }, + { + "epoch": 1.1120167863772092, + "grad_norm": 0.6716235876083374, + "learning_rate": 4.449877608322792e-05, + "loss": 2.427, + "step": 13779 + }, + { + "epoch": 1.1120974901137923, + "grad_norm": 0.7332563996315002, + "learning_rate": 4.448564456743638e-05, + "loss": 2.4567, + "step": 13780 + }, + { + "epoch": 1.1121781938503752, + "grad_norm": 0.7264607548713684, + "learning_rate": 4.447251443520769e-05, + "loss": 2.4844, + "step": 13781 + }, + { + "epoch": 1.1122588975869583, + "grad_norm": 0.7819967865943909, + "learning_rate": 4.4459385686869136e-05, + "loss": 2.5129, + "step": 13782 + }, + { + "epoch": 1.1123396013235414, + "grad_norm": 0.7587651610374451, + "learning_rate": 4.4446258322747824e-05, + "loss": 2.4714, + "step": 13783 + }, + { + "epoch": 1.1124203050601242, + "grad_norm": 0.6392871141433716, + "learning_rate": 4.443313234317099e-05, + "loss": 2.462, + "step": 13784 + }, + { + "epoch": 1.1125010087967073, + "grad_norm": 0.6609585881233215, + "learning_rate": 
4.442000774846574e-05, + "loss": 2.4566, + "step": 13785 + }, + { + "epoch": 1.1125817125332902, + "grad_norm": 0.762924075126648, + "learning_rate": 4.440688453895919e-05, + "loss": 2.4613, + "step": 13786 + }, + { + "epoch": 1.1126624162698733, + "grad_norm": 0.7096089124679565, + "learning_rate": 4.4393762714978394e-05, + "loss": 2.4195, + "step": 13787 + }, + { + "epoch": 1.1127431200064564, + "grad_norm": 0.6663284301757812, + "learning_rate": 4.438064227685039e-05, + "loss": 2.422, + "step": 13788 + }, + { + "epoch": 1.1128238237430392, + "grad_norm": 0.6653628945350647, + "learning_rate": 4.436752322490221e-05, + "loss": 2.4477, + "step": 13789 + }, + { + "epoch": 1.1129045274796223, + "grad_norm": 0.6527605056762695, + "learning_rate": 4.435440555946073e-05, + "loss": 2.3874, + "step": 13790 + }, + { + "epoch": 1.1129852312162054, + "grad_norm": 0.6801275014877319, + "learning_rate": 4.4341289280852935e-05, + "loss": 2.4474, + "step": 13791 + }, + { + "epoch": 1.1130659349527883, + "grad_norm": 0.729905366897583, + "learning_rate": 4.432817438940574e-05, + "loss": 2.4711, + "step": 13792 + }, + { + "epoch": 1.1131466386893714, + "grad_norm": 0.7074751853942871, + "learning_rate": 4.431506088544593e-05, + "loss": 2.451, + "step": 13793 + }, + { + "epoch": 1.1132273424259542, + "grad_norm": 0.7241154313087463, + "learning_rate": 4.430194876930035e-05, + "loss": 2.4883, + "step": 13794 + }, + { + "epoch": 1.1133080461625373, + "grad_norm": 0.6549142003059387, + "learning_rate": 4.428883804129586e-05, + "loss": 2.4243, + "step": 13795 + }, + { + "epoch": 1.1133887498991204, + "grad_norm": 0.7046780586242676, + "learning_rate": 4.427572870175907e-05, + "loss": 2.4143, + "step": 13796 + }, + { + "epoch": 1.1134694536357033, + "grad_norm": 0.6563952565193176, + "learning_rate": 4.426262075101682e-05, + "loss": 2.416, + "step": 13797 + }, + { + "epoch": 1.1135501573722864, + "grad_norm": 0.7002081871032715, + "learning_rate": 4.4249514189395803e-05, + "loss": 
2.3673, + "step": 13798 + }, + { + "epoch": 1.1136308611088694, + "grad_norm": 0.6766571998596191, + "learning_rate": 4.423640901722259e-05, + "loss": 2.4941, + "step": 13799 + }, + { + "epoch": 1.1137115648454523, + "grad_norm": 0.7404381632804871, + "learning_rate": 4.422330523482383e-05, + "loss": 2.4794, + "step": 13800 + }, + { + "epoch": 1.1137922685820354, + "grad_norm": 0.6670998930931091, + "learning_rate": 4.421020284252614e-05, + "loss": 2.5131, + "step": 13801 + }, + { + "epoch": 1.1138729723186183, + "grad_norm": 0.803720235824585, + "learning_rate": 4.4197101840655995e-05, + "loss": 2.4751, + "step": 13802 + }, + { + "epoch": 1.1139536760552013, + "grad_norm": 0.6532074809074402, + "learning_rate": 4.4184002229539947e-05, + "loss": 2.4147, + "step": 13803 + }, + { + "epoch": 1.1140343797917844, + "grad_norm": 0.6548035144805908, + "learning_rate": 4.417090400950447e-05, + "loss": 2.4601, + "step": 13804 + }, + { + "epoch": 1.1141150835283673, + "grad_norm": 0.6971763968467712, + "learning_rate": 4.415780718087603e-05, + "loss": 2.4752, + "step": 13805 + }, + { + "epoch": 1.1141957872649504, + "grad_norm": 0.6624024510383606, + "learning_rate": 4.414471174398098e-05, + "loss": 2.4183, + "step": 13806 + }, + { + "epoch": 1.1142764910015335, + "grad_norm": 0.6571507453918457, + "learning_rate": 4.4131617699145714e-05, + "loss": 2.4747, + "step": 13807 + }, + { + "epoch": 1.1143571947381163, + "grad_norm": 0.7165808081626892, + "learning_rate": 4.411852504669658e-05, + "loss": 2.453, + "step": 13808 + }, + { + "epoch": 1.1144378984746994, + "grad_norm": 0.6708057522773743, + "learning_rate": 4.410543378695988e-05, + "loss": 2.4858, + "step": 13809 + }, + { + "epoch": 1.1145186022112823, + "grad_norm": 0.889302134513855, + "learning_rate": 4.409234392026187e-05, + "loss": 2.4333, + "step": 13810 + }, + { + "epoch": 1.1145993059478654, + "grad_norm": 0.7440677881240845, + "learning_rate": 4.407925544692884e-05, + "loss": 2.49, + "step": 13811 + }, + { + 
"epoch": 1.1146800096844485, + "grad_norm": 0.6688372492790222, + "learning_rate": 4.406616836728691e-05, + "loss": 2.4663, + "step": 13812 + }, + { + "epoch": 1.1147607134210313, + "grad_norm": 0.7108204364776611, + "learning_rate": 4.4053082681662264e-05, + "loss": 2.4843, + "step": 13813 + }, + { + "epoch": 1.1148414171576144, + "grad_norm": 0.7270475029945374, + "learning_rate": 4.4039998390381087e-05, + "loss": 2.4158, + "step": 13814 + }, + { + "epoch": 1.1149221208941973, + "grad_norm": 0.7243396639823914, + "learning_rate": 4.402691549376939e-05, + "loss": 2.3969, + "step": 13815 + }, + { + "epoch": 1.1150028246307804, + "grad_norm": 0.6687803268432617, + "learning_rate": 4.4013833992153285e-05, + "loss": 2.42, + "step": 13816 + }, + { + "epoch": 1.1150835283673635, + "grad_norm": 0.6892626285552979, + "learning_rate": 4.400075388585877e-05, + "loss": 2.4086, + "step": 13817 + }, + { + "epoch": 1.1151642321039463, + "grad_norm": 0.7556231021881104, + "learning_rate": 4.398767517521186e-05, + "loss": 2.4201, + "step": 13818 + }, + { + "epoch": 1.1152449358405294, + "grad_norm": 0.6872838735580444, + "learning_rate": 4.397459786053851e-05, + "loss": 2.4143, + "step": 13819 + }, + { + "epoch": 1.1153256395771125, + "grad_norm": 0.6681817770004272, + "learning_rate": 4.396152194216463e-05, + "loss": 2.4404, + "step": 13820 + }, + { + "epoch": 1.1154063433136954, + "grad_norm": 0.7107201218605042, + "learning_rate": 4.394844742041614e-05, + "loss": 2.4503, + "step": 13821 + }, + { + "epoch": 1.1154870470502785, + "grad_norm": 0.706541121006012, + "learning_rate": 4.3935374295618824e-05, + "loss": 2.5106, + "step": 13822 + }, + { + "epoch": 1.1155677507868615, + "grad_norm": 0.6659905910491943, + "learning_rate": 4.392230256809854e-05, + "loss": 2.3839, + "step": 13823 + }, + { + "epoch": 1.1156484545234444, + "grad_norm": 0.7125810980796814, + "learning_rate": 4.3909232238181095e-05, + "loss": 2.4463, + "step": 13824 + }, + { + "epoch": 1.1157291582600275, + 
"grad_norm": 0.6581901907920837, + "learning_rate": 4.389616330619217e-05, + "loss": 2.4004, + "step": 13825 + }, + { + "epoch": 1.1158098619966104, + "grad_norm": 0.7660872340202332, + "learning_rate": 4.388309577245752e-05, + "loss": 2.4685, + "step": 13826 + }, + { + "epoch": 1.1158905657331935, + "grad_norm": 0.699526846408844, + "learning_rate": 4.387002963730281e-05, + "loss": 2.4131, + "step": 13827 + }, + { + "epoch": 1.1159712694697765, + "grad_norm": 0.7031015753746033, + "learning_rate": 4.3856964901053685e-05, + "loss": 2.4476, + "step": 13828 + }, + { + "epoch": 1.1160519732063594, + "grad_norm": 0.6876828074455261, + "learning_rate": 4.384390156403575e-05, + "loss": 2.4402, + "step": 13829 + }, + { + "epoch": 1.1161326769429425, + "grad_norm": 0.7188935279846191, + "learning_rate": 4.3830839626574626e-05, + "loss": 2.4473, + "step": 13830 + }, + { + "epoch": 1.1162133806795254, + "grad_norm": 0.6825287938117981, + "learning_rate": 4.381777908899577e-05, + "loss": 2.4757, + "step": 13831 + }, + { + "epoch": 1.1162940844161084, + "grad_norm": 0.718267560005188, + "learning_rate": 4.380471995162472e-05, + "loss": 2.483, + "step": 13832 + }, + { + "epoch": 1.1163747881526915, + "grad_norm": 0.6526767611503601, + "learning_rate": 4.379166221478697e-05, + "loss": 2.4161, + "step": 13833 + }, + { + "epoch": 1.1164554918892744, + "grad_norm": 0.7541480660438538, + "learning_rate": 4.37786058788079e-05, + "loss": 2.4876, + "step": 13834 + }, + { + "epoch": 1.1165361956258575, + "grad_norm": 0.7144232988357544, + "learning_rate": 4.376555094401294e-05, + "loss": 2.4153, + "step": 13835 + }, + { + "epoch": 1.1166168993624406, + "grad_norm": 0.7544882297515869, + "learning_rate": 4.3752497410727445e-05, + "loss": 2.4634, + "step": 13836 + }, + { + "epoch": 1.1166976030990234, + "grad_norm": 0.7263267040252686, + "learning_rate": 4.373944527927674e-05, + "loss": 2.5189, + "step": 13837 + }, + { + "epoch": 1.1167783068356065, + "grad_norm": 0.7709252834320068, + 
"learning_rate": 4.3726394549986135e-05, + "loss": 2.5036, + "step": 13838 + }, + { + "epoch": 1.1168590105721894, + "grad_norm": 0.6849128007888794, + "learning_rate": 4.3713345223180866e-05, + "loss": 2.414, + "step": 13839 + }, + { + "epoch": 1.1169397143087725, + "grad_norm": 0.6807512044906616, + "learning_rate": 4.3700297299186224e-05, + "loss": 2.4924, + "step": 13840 + }, + { + "epoch": 1.1170204180453556, + "grad_norm": 0.6894977688789368, + "learning_rate": 4.3687250778327294e-05, + "loss": 2.4183, + "step": 13841 + }, + { + "epoch": 1.1171011217819384, + "grad_norm": 0.6657617092132568, + "learning_rate": 4.367420566092928e-05, + "loss": 2.448, + "step": 13842 + }, + { + "epoch": 1.1171818255185215, + "grad_norm": 0.7104446291923523, + "learning_rate": 4.366116194731733e-05, + "loss": 2.4862, + "step": 13843 + }, + { + "epoch": 1.1172625292551046, + "grad_norm": 0.7485257387161255, + "learning_rate": 4.3648119637816465e-05, + "loss": 2.4253, + "step": 13844 + }, + { + "epoch": 1.1173432329916875, + "grad_norm": 0.7079899907112122, + "learning_rate": 4.363507873275177e-05, + "loss": 2.4235, + "step": 13845 + }, + { + "epoch": 1.1174239367282706, + "grad_norm": 0.6891573667526245, + "learning_rate": 4.3622039232448274e-05, + "loss": 2.4382, + "step": 13846 + }, + { + "epoch": 1.1175046404648534, + "grad_norm": 0.6886103749275208, + "learning_rate": 4.360900113723086e-05, + "loss": 2.5115, + "step": 13847 + }, + { + "epoch": 1.1175853442014365, + "grad_norm": 0.7511457800865173, + "learning_rate": 4.35959644474246e-05, + "loss": 2.4071, + "step": 13848 + }, + { + "epoch": 1.1176660479380196, + "grad_norm": 0.6526182293891907, + "learning_rate": 4.358292916335437e-05, + "loss": 2.4242, + "step": 13849 + }, + { + "epoch": 1.1177467516746025, + "grad_norm": 0.7385138273239136, + "learning_rate": 4.356989528534499e-05, + "loss": 2.4459, + "step": 13850 + }, + { + "epoch": 1.1178274554111856, + "grad_norm": 0.6668610572814941, + "learning_rate": 
4.355686281372132e-05, + "loss": 2.4188, + "step": 13851 + }, + { + "epoch": 1.1179081591477686, + "grad_norm": 0.6950691342353821, + "learning_rate": 4.354383174880818e-05, + "loss": 2.4339, + "step": 13852 + }, + { + "epoch": 1.1179888628843515, + "grad_norm": 0.7017496824264526, + "learning_rate": 4.3530802090930375e-05, + "loss": 2.4733, + "step": 13853 + }, + { + "epoch": 1.1180695666209346, + "grad_norm": 0.8118221759796143, + "learning_rate": 4.351777384041254e-05, + "loss": 2.4826, + "step": 13854 + }, + { + "epoch": 1.1181502703575175, + "grad_norm": 0.7233164310455322, + "learning_rate": 4.350474699757945e-05, + "loss": 2.4637, + "step": 13855 + }, + { + "epoch": 1.1182309740941005, + "grad_norm": 0.6354575157165527, + "learning_rate": 4.349172156275576e-05, + "loss": 2.4487, + "step": 13856 + }, + { + "epoch": 1.1183116778306836, + "grad_norm": 0.6776937246322632, + "learning_rate": 4.347869753626606e-05, + "loss": 2.4292, + "step": 13857 + }, + { + "epoch": 1.1183923815672665, + "grad_norm": 0.6656864881515503, + "learning_rate": 4.3465674918434953e-05, + "loss": 2.484, + "step": 13858 + }, + { + "epoch": 1.1184730853038496, + "grad_norm": 0.7659650444984436, + "learning_rate": 4.345265370958702e-05, + "loss": 2.4181, + "step": 13859 + }, + { + "epoch": 1.1185537890404325, + "grad_norm": 0.6546063423156738, + "learning_rate": 4.3439633910046764e-05, + "loss": 2.4657, + "step": 13860 + }, + { + "epoch": 1.1186344927770155, + "grad_norm": 0.6869762539863586, + "learning_rate": 4.342661552013869e-05, + "loss": 2.513, + "step": 13861 + }, + { + "epoch": 1.1187151965135986, + "grad_norm": 0.6633490324020386, + "learning_rate": 4.3413598540187275e-05, + "loss": 2.4716, + "step": 13862 + }, + { + "epoch": 1.1187959002501815, + "grad_norm": 0.7238267660140991, + "learning_rate": 4.340058297051687e-05, + "loss": 2.4353, + "step": 13863 + }, + { + "epoch": 1.1188766039867646, + "grad_norm": 0.67429119348526, + "learning_rate": 4.3387568811451875e-05, + "loss": 
2.4808, + "step": 13864 + }, + { + "epoch": 1.1189573077233477, + "grad_norm": 0.6901153326034546, + "learning_rate": 4.33745560633167e-05, + "loss": 2.4785, + "step": 13865 + }, + { + "epoch": 1.1190380114599305, + "grad_norm": 0.7227689027786255, + "learning_rate": 4.336154472643556e-05, + "loss": 2.4414, + "step": 13866 + }, + { + "epoch": 1.1191187151965136, + "grad_norm": 0.713793933391571, + "learning_rate": 4.33485348011328e-05, + "loss": 2.5136, + "step": 13867 + }, + { + "epoch": 1.1191994189330967, + "grad_norm": 0.6495655179023743, + "learning_rate": 4.333552628773263e-05, + "loss": 2.4267, + "step": 13868 + }, + { + "epoch": 1.1192801226696796, + "grad_norm": 0.7265790104866028, + "learning_rate": 4.3322519186559274e-05, + "loss": 2.4406, + "step": 13869 + }, + { + "epoch": 1.1193608264062627, + "grad_norm": 0.6700571179389954, + "learning_rate": 4.330951349793688e-05, + "loss": 2.4457, + "step": 13870 + }, + { + "epoch": 1.1194415301428455, + "grad_norm": 0.7112334966659546, + "learning_rate": 4.3296509222189616e-05, + "loss": 2.4788, + "step": 13871 + }, + { + "epoch": 1.1195222338794286, + "grad_norm": 0.7056662440299988, + "learning_rate": 4.32835063596416e-05, + "loss": 2.5195, + "step": 13872 + }, + { + "epoch": 1.1196029376160117, + "grad_norm": 0.7198836207389832, + "learning_rate": 4.327050491061683e-05, + "loss": 2.4827, + "step": 13873 + }, + { + "epoch": 1.1196836413525946, + "grad_norm": 0.7384079694747925, + "learning_rate": 4.325750487543936e-05, + "loss": 2.4556, + "step": 13874 + }, + { + "epoch": 1.1197643450891777, + "grad_norm": 0.7315430641174316, + "learning_rate": 4.324450625443324e-05, + "loss": 2.4302, + "step": 13875 + }, + { + "epoch": 1.1198450488257605, + "grad_norm": 0.6692587733268738, + "learning_rate": 4.323150904792234e-05, + "loss": 2.5283, + "step": 13876 + }, + { + "epoch": 1.1199257525623436, + "grad_norm": 0.7407168745994568, + "learning_rate": 4.321851325623063e-05, + "loss": 2.4757, + "step": 13877 + }, + { + 
"epoch": 1.1200064562989267, + "grad_norm": 0.7387246489524841, + "learning_rate": 4.3205518879682e-05, + "loss": 2.5025, + "step": 13878 + }, + { + "epoch": 1.1200871600355096, + "grad_norm": 0.8058405518531799, + "learning_rate": 4.319252591860031e-05, + "loss": 2.4951, + "step": 13879 + }, + { + "epoch": 1.1201678637720927, + "grad_norm": 0.6964818835258484, + "learning_rate": 4.317953437330936e-05, + "loss": 2.4462, + "step": 13880 + }, + { + "epoch": 1.1202485675086757, + "grad_norm": 0.6904557347297668, + "learning_rate": 4.316654424413294e-05, + "loss": 2.3981, + "step": 13881 + }, + { + "epoch": 1.1203292712452586, + "grad_norm": 0.6555196046829224, + "learning_rate": 4.315355553139485e-05, + "loss": 2.418, + "step": 13882 + }, + { + "epoch": 1.1204099749818417, + "grad_norm": 0.7745094299316406, + "learning_rate": 4.3140568235418724e-05, + "loss": 2.4635, + "step": 13883 + }, + { + "epoch": 1.1204906787184246, + "grad_norm": 0.686676025390625, + "learning_rate": 4.312758235652825e-05, + "loss": 2.4847, + "step": 13884 + }, + { + "epoch": 1.1205713824550076, + "grad_norm": 0.6937002539634705, + "learning_rate": 4.311459789504714e-05, + "loss": 2.4632, + "step": 13885 + }, + { + "epoch": 1.1206520861915907, + "grad_norm": 0.7024590373039246, + "learning_rate": 4.310161485129891e-05, + "loss": 2.4268, + "step": 13886 + }, + { + "epoch": 1.1207327899281736, + "grad_norm": 0.6848484873771667, + "learning_rate": 4.308863322560717e-05, + "loss": 2.4895, + "step": 13887 + }, + { + "epoch": 1.1208134936647567, + "grad_norm": 0.7071602940559387, + "learning_rate": 4.307565301829546e-05, + "loss": 2.4348, + "step": 13888 + }, + { + "epoch": 1.1208941974013398, + "grad_norm": 0.6868199706077576, + "learning_rate": 4.3062674229687274e-05, + "loss": 2.4613, + "step": 13889 + }, + { + "epoch": 1.1209749011379226, + "grad_norm": 0.7283496260643005, + "learning_rate": 4.304969686010608e-05, + "loss": 2.478, + "step": 13890 + }, + { + "epoch": 1.1210556048745057, + 
"grad_norm": 0.6907255053520203, + "learning_rate": 4.303672090987535e-05, + "loss": 2.4431, + "step": 13891 + }, + { + "epoch": 1.1211363086110886, + "grad_norm": 0.675089418888092, + "learning_rate": 4.302374637931841e-05, + "loss": 2.4398, + "step": 13892 + }, + { + "epoch": 1.1212170123476717, + "grad_norm": 0.6929863095283508, + "learning_rate": 4.301077326875863e-05, + "loss": 2.3909, + "step": 13893 + }, + { + "epoch": 1.1212977160842548, + "grad_norm": 0.6746132969856262, + "learning_rate": 4.29978015785194e-05, + "loss": 2.4726, + "step": 13894 + }, + { + "epoch": 1.1213784198208376, + "grad_norm": 0.720781147480011, + "learning_rate": 4.298483130892392e-05, + "loss": 2.4445, + "step": 13895 + }, + { + "epoch": 1.1214591235574207, + "grad_norm": 0.6624416708946228, + "learning_rate": 4.297186246029549e-05, + "loss": 2.3868, + "step": 13896 + }, + { + "epoch": 1.1215398272940038, + "grad_norm": 0.7849127054214478, + "learning_rate": 4.295889503295731e-05, + "loss": 2.4479, + "step": 13897 + }, + { + "epoch": 1.1216205310305867, + "grad_norm": 0.6655337810516357, + "learning_rate": 4.294592902723259e-05, + "loss": 2.5093, + "step": 13898 + }, + { + "epoch": 1.1217012347671698, + "grad_norm": 0.7055402398109436, + "learning_rate": 4.293296444344445e-05, + "loss": 2.4385, + "step": 13899 + }, + { + "epoch": 1.1217819385037526, + "grad_norm": 0.7388767600059509, + "learning_rate": 4.2920001281916e-05, + "loss": 2.4863, + "step": 13900 + }, + { + "epoch": 1.1218626422403357, + "grad_norm": 0.6915223002433777, + "learning_rate": 4.2907039542970373e-05, + "loss": 2.4218, + "step": 13901 + }, + { + "epoch": 1.1219433459769188, + "grad_norm": 0.7124893665313721, + "learning_rate": 4.289407922693053e-05, + "loss": 2.4514, + "step": 13902 + }, + { + "epoch": 1.1220240497135017, + "grad_norm": 0.6552406549453735, + "learning_rate": 4.28811203341195e-05, + "loss": 2.4558, + "step": 13903 + }, + { + "epoch": 1.1221047534500848, + "grad_norm": 0.6641791462898254, + 
"learning_rate": 4.286816286486031e-05, + "loss": 2.4277, + "step": 13904 + }, + { + "epoch": 1.1221854571866678, + "grad_norm": 0.677733838558197, + "learning_rate": 4.285520681947579e-05, + "loss": 2.4861, + "step": 13905 + }, + { + "epoch": 1.1222661609232507, + "grad_norm": 0.6572888493537903, + "learning_rate": 4.284225219828891e-05, + "loss": 2.4657, + "step": 13906 + }, + { + "epoch": 1.1223468646598338, + "grad_norm": 0.6923860907554626, + "learning_rate": 4.2829299001622546e-05, + "loss": 2.4857, + "step": 13907 + }, + { + "epoch": 1.1224275683964167, + "grad_norm": 0.6971977949142456, + "learning_rate": 4.281634722979947e-05, + "loss": 2.4434, + "step": 13908 + }, + { + "epoch": 1.1225082721329998, + "grad_norm": 0.6828060746192932, + "learning_rate": 4.2803396883142456e-05, + "loss": 2.4342, + "step": 13909 + }, + { + "epoch": 1.1225889758695828, + "grad_norm": 0.7001270651817322, + "learning_rate": 4.279044796197438e-05, + "loss": 2.5222, + "step": 13910 + }, + { + "epoch": 1.1226696796061657, + "grad_norm": 0.6425578594207764, + "learning_rate": 4.277750046661785e-05, + "loss": 2.42, + "step": 13911 + }, + { + "epoch": 1.1227503833427488, + "grad_norm": 0.6498209834098816, + "learning_rate": 4.2764554397395585e-05, + "loss": 2.4448, + "step": 13912 + }, + { + "epoch": 1.1228310870793319, + "grad_norm": 0.6894031763076782, + "learning_rate": 4.275160975463025e-05, + "loss": 2.4508, + "step": 13913 + }, + { + "epoch": 1.1229117908159147, + "grad_norm": 0.7286608219146729, + "learning_rate": 4.273866653864448e-05, + "loss": 2.4557, + "step": 13914 + }, + { + "epoch": 1.1229924945524978, + "grad_norm": 0.753826379776001, + "learning_rate": 4.272572474976079e-05, + "loss": 2.4635, + "step": 13915 + }, + { + "epoch": 1.1230731982890807, + "grad_norm": 0.6715937256813049, + "learning_rate": 4.271278438830174e-05, + "loss": 2.5107, + "step": 13916 + }, + { + "epoch": 1.1231539020256638, + "grad_norm": 0.6833200454711914, + "learning_rate": 
4.26998454545899e-05, + "loss": 2.4883, + "step": 13917 + }, + { + "epoch": 1.1232346057622469, + "grad_norm": 0.6763597130775452, + "learning_rate": 4.2686907948947666e-05, + "loss": 2.4178, + "step": 13918 + }, + { + "epoch": 1.1233153094988297, + "grad_norm": 0.7336227297782898, + "learning_rate": 4.26739718716975e-05, + "loss": 2.4542, + "step": 13919 + }, + { + "epoch": 1.1233960132354128, + "grad_norm": 0.6583260297775269, + "learning_rate": 4.2661037223161806e-05, + "loss": 2.3998, + "step": 13920 + }, + { + "epoch": 1.1234767169719957, + "grad_norm": 0.6444356441497803, + "learning_rate": 4.264810400366295e-05, + "loss": 2.4354, + "step": 13921 + }, + { + "epoch": 1.1235574207085788, + "grad_norm": 0.6786002516746521, + "learning_rate": 4.2635172213523255e-05, + "loss": 2.3989, + "step": 13922 + }, + { + "epoch": 1.1236381244451619, + "grad_norm": 0.6838372349739075, + "learning_rate": 4.262224185306507e-05, + "loss": 2.4431, + "step": 13923 + }, + { + "epoch": 1.1237188281817447, + "grad_norm": 0.7516793012619019, + "learning_rate": 4.260931292261056e-05, + "loss": 2.4373, + "step": 13924 + }, + { + "epoch": 1.1237995319183278, + "grad_norm": 0.6860260367393494, + "learning_rate": 4.2596385422481985e-05, + "loss": 2.4457, + "step": 13925 + }, + { + "epoch": 1.123880235654911, + "grad_norm": 0.6556448936462402, + "learning_rate": 4.2583459353001595e-05, + "loss": 2.4165, + "step": 13926 + }, + { + "epoch": 1.1239609393914938, + "grad_norm": 0.729131281375885, + "learning_rate": 4.257053471449144e-05, + "loss": 2.4124, + "step": 13927 + }, + { + "epoch": 1.1240416431280769, + "grad_norm": 0.6941910982131958, + "learning_rate": 4.2557611507273684e-05, + "loss": 2.4095, + "step": 13928 + }, + { + "epoch": 1.12412234686466, + "grad_norm": 0.6390536427497864, + "learning_rate": 4.25446897316704e-05, + "loss": 2.4221, + "step": 13929 + }, + { + "epoch": 1.1242030506012428, + "grad_norm": 0.7034881114959717, + "learning_rate": 4.253176938800365e-05, + "loss": 
2.4685, + "step": 13930 + }, + { + "epoch": 1.124283754337826, + "grad_norm": 0.6975526809692383, + "learning_rate": 4.251885047659542e-05, + "loss": 2.4771, + "step": 13931 + }, + { + "epoch": 1.1243644580744088, + "grad_norm": 0.7020023465156555, + "learning_rate": 4.2505932997767695e-05, + "loss": 2.4746, + "step": 13932 + }, + { + "epoch": 1.1244451618109919, + "grad_norm": 0.7207093238830566, + "learning_rate": 4.2493016951842444e-05, + "loss": 2.4707, + "step": 13933 + }, + { + "epoch": 1.124525865547575, + "grad_norm": 0.7711251974105835, + "learning_rate": 4.24801023391415e-05, + "loss": 2.5104, + "step": 13934 + }, + { + "epoch": 1.1246065692841578, + "grad_norm": 0.7324040532112122, + "learning_rate": 4.246718915998677e-05, + "loss": 2.4257, + "step": 13935 + }, + { + "epoch": 1.124687273020741, + "grad_norm": 0.6532757878303528, + "learning_rate": 4.2454277414700116e-05, + "loss": 2.3708, + "step": 13936 + }, + { + "epoch": 1.1247679767573238, + "grad_norm": 0.6933012008666992, + "learning_rate": 4.244136710360325e-05, + "loss": 2.4985, + "step": 13937 + }, + { + "epoch": 1.1248486804939068, + "grad_norm": 0.6787589192390442, + "learning_rate": 4.242845822701798e-05, + "loss": 2.402, + "step": 13938 + }, + { + "epoch": 1.12492938423049, + "grad_norm": 0.6567786931991577, + "learning_rate": 4.241555078526602e-05, + "loss": 2.4295, + "step": 13939 + }, + { + "epoch": 1.1250100879670728, + "grad_norm": 0.6962547302246094, + "learning_rate": 4.2402644778669074e-05, + "loss": 2.4006, + "step": 13940 + }, + { + "epoch": 1.125090791703656, + "grad_norm": 0.7152721285820007, + "learning_rate": 4.238974020754877e-05, + "loss": 2.4757, + "step": 13941 + }, + { + "epoch": 1.125171495440239, + "grad_norm": 0.6869861483573914, + "learning_rate": 4.237683707222677e-05, + "loss": 2.3877, + "step": 13942 + }, + { + "epoch": 1.1252521991768218, + "grad_norm": 0.6951470971107483, + "learning_rate": 4.236393537302459e-05, + "loss": 2.3755, + "step": 13943 + }, + { + 
"epoch": 1.125332902913405, + "grad_norm": 0.6997567415237427, + "learning_rate": 4.2351035110263805e-05, + "loss": 2.4731, + "step": 13944 + }, + { + "epoch": 1.125413606649988, + "grad_norm": 0.6765854358673096, + "learning_rate": 4.23381362842659e-05, + "loss": 2.4004, + "step": 13945 + }, + { + "epoch": 1.1254943103865709, + "grad_norm": 0.7046722173690796, + "learning_rate": 4.2325238895352426e-05, + "loss": 2.4379, + "step": 13946 + }, + { + "epoch": 1.125575014123154, + "grad_norm": 0.6862985491752625, + "learning_rate": 4.231234294384472e-05, + "loss": 2.4614, + "step": 13947 + }, + { + "epoch": 1.1256557178597368, + "grad_norm": 0.6637778282165527, + "learning_rate": 4.229944843006422e-05, + "loss": 2.4412, + "step": 13948 + }, + { + "epoch": 1.12573642159632, + "grad_norm": 0.7042228579521179, + "learning_rate": 4.228655535433231e-05, + "loss": 2.4296, + "step": 13949 + }, + { + "epoch": 1.1258171253329028, + "grad_norm": 0.6767764687538147, + "learning_rate": 4.227366371697029e-05, + "loss": 2.409, + "step": 13950 + }, + { + "epoch": 1.1258978290694859, + "grad_norm": 0.6886798143386841, + "learning_rate": 4.226077351829948e-05, + "loss": 2.4786, + "step": 13951 + }, + { + "epoch": 1.125978532806069, + "grad_norm": 0.7723653316497803, + "learning_rate": 4.224788475864115e-05, + "loss": 2.4111, + "step": 13952 + }, + { + "epoch": 1.1260592365426518, + "grad_norm": 0.7614055275917053, + "learning_rate": 4.2234997438316473e-05, + "loss": 2.5055, + "step": 13953 + }, + { + "epoch": 1.126139940279235, + "grad_norm": 0.7195241451263428, + "learning_rate": 4.222211155764665e-05, + "loss": 2.411, + "step": 13954 + }, + { + "epoch": 1.126220644015818, + "grad_norm": 0.7130021452903748, + "learning_rate": 4.220922711695288e-05, + "loss": 2.4819, + "step": 13955 + }, + { + "epoch": 1.1263013477524009, + "grad_norm": 0.6972241401672363, + "learning_rate": 4.2196344116556194e-05, + "loss": 2.4611, + "step": 13956 + }, + { + "epoch": 1.126382051488984, + "grad_norm": 
0.7023231387138367, + "learning_rate": 4.218346255677772e-05, + "loss": 2.4509, + "step": 13957 + }, + { + "epoch": 1.126462755225567, + "grad_norm": 0.6959301829338074, + "learning_rate": 4.2170582437938534e-05, + "loss": 2.4441, + "step": 13958 + }, + { + "epoch": 1.12654345896215, + "grad_norm": 0.7423149347305298, + "learning_rate": 4.2157703760359555e-05, + "loss": 2.4452, + "step": 13959 + }, + { + "epoch": 1.126624162698733, + "grad_norm": 0.6587820053100586, + "learning_rate": 4.214482652436177e-05, + "loss": 2.3936, + "step": 13960 + }, + { + "epoch": 1.1267048664353159, + "grad_norm": 0.6601768136024475, + "learning_rate": 4.213195073026618e-05, + "loss": 2.453, + "step": 13961 + }, + { + "epoch": 1.126785570171899, + "grad_norm": 0.6986891031265259, + "learning_rate": 4.2119076378393676e-05, + "loss": 2.452, + "step": 13962 + }, + { + "epoch": 1.126866273908482, + "grad_norm": 0.7207025289535522, + "learning_rate": 4.2106203469065055e-05, + "loss": 2.4048, + "step": 13963 + }, + { + "epoch": 1.126946977645065, + "grad_norm": 0.6731177568435669, + "learning_rate": 4.2093332002601184e-05, + "loss": 2.4573, + "step": 13964 + }, + { + "epoch": 1.127027681381648, + "grad_norm": 0.7330070734024048, + "learning_rate": 4.208046197932288e-05, + "loss": 2.4274, + "step": 13965 + }, + { + "epoch": 1.1271083851182309, + "grad_norm": 0.7008770704269409, + "learning_rate": 4.206759339955084e-05, + "loss": 2.4933, + "step": 13966 + }, + { + "epoch": 1.127189088854814, + "grad_norm": 0.8309584259986877, + "learning_rate": 4.20547262636058e-05, + "loss": 2.3857, + "step": 13967 + }, + { + "epoch": 1.127269792591397, + "grad_norm": 0.6705843210220337, + "learning_rate": 4.204186057180849e-05, + "loss": 2.4303, + "step": 13968 + }, + { + "epoch": 1.12735049632798, + "grad_norm": 0.7526851296424866, + "learning_rate": 4.202899632447949e-05, + "loss": 2.455, + "step": 13969 + }, + { + "epoch": 1.127431200064563, + "grad_norm": 0.6690995097160339, + "learning_rate": 
4.201613352193943e-05, + "loss": 2.4398, + "step": 13970 + }, + { + "epoch": 1.127511903801146, + "grad_norm": 0.6946840286254883, + "learning_rate": 4.20032721645089e-05, + "loss": 2.4032, + "step": 13971 + }, + { + "epoch": 1.127592607537729, + "grad_norm": 0.7438863515853882, + "learning_rate": 4.1990412252508426e-05, + "loss": 2.4644, + "step": 13972 + }, + { + "epoch": 1.127673311274312, + "grad_norm": 0.6975359916687012, + "learning_rate": 4.197755378625852e-05, + "loss": 2.3991, + "step": 13973 + }, + { + "epoch": 1.1277540150108951, + "grad_norm": 0.6799279451370239, + "learning_rate": 4.196469676607968e-05, + "loss": 2.4328, + "step": 13974 + }, + { + "epoch": 1.127834718747478, + "grad_norm": 0.7014481425285339, + "learning_rate": 4.1951841192292274e-05, + "loss": 2.5045, + "step": 13975 + }, + { + "epoch": 1.127915422484061, + "grad_norm": 0.7074011564254761, + "learning_rate": 4.1938987065216716e-05, + "loss": 2.4583, + "step": 13976 + }, + { + "epoch": 1.127996126220644, + "grad_norm": 0.7246339917182922, + "learning_rate": 4.192613438517338e-05, + "loss": 2.447, + "step": 13977 + }, + { + "epoch": 1.128076829957227, + "grad_norm": 0.6757462620735168, + "learning_rate": 4.191328315248262e-05, + "loss": 2.4181, + "step": 13978 + }, + { + "epoch": 1.12815753369381, + "grad_norm": 0.6758493185043335, + "learning_rate": 4.1900433367464644e-05, + "loss": 2.4837, + "step": 13979 + }, + { + "epoch": 1.128238237430393, + "grad_norm": 0.6782165765762329, + "learning_rate": 4.1887585030439736e-05, + "loss": 2.3946, + "step": 13980 + }, + { + "epoch": 1.128318941166976, + "grad_norm": 0.7176415324211121, + "learning_rate": 4.187473814172812e-05, + "loss": 2.4538, + "step": 13981 + }, + { + "epoch": 1.128399644903559, + "grad_norm": 0.6636224985122681, + "learning_rate": 4.186189270164997e-05, + "loss": 2.4493, + "step": 13982 + }, + { + "epoch": 1.128480348640142, + "grad_norm": 0.6613143086433411, + "learning_rate": 4.184904871052544e-05, + "loss": 2.4994, + 
"step": 13983 + }, + { + "epoch": 1.128561052376725, + "grad_norm": 0.7148364186286926, + "learning_rate": 4.183620616867465e-05, + "loss": 2.4673, + "step": 13984 + }, + { + "epoch": 1.128641756113308, + "grad_norm": 0.6657952070236206, + "learning_rate": 4.1823365076417606e-05, + "loss": 2.3915, + "step": 13985 + }, + { + "epoch": 1.128722459849891, + "grad_norm": 0.7135687470436096, + "learning_rate": 4.181052543407439e-05, + "loss": 2.4961, + "step": 13986 + }, + { + "epoch": 1.1288031635864741, + "grad_norm": 0.7245377898216248, + "learning_rate": 4.179768724196501e-05, + "loss": 2.4519, + "step": 13987 + }, + { + "epoch": 1.128883867323057, + "grad_norm": 0.6832938194274902, + "learning_rate": 4.1784850500409376e-05, + "loss": 2.4471, + "step": 13988 + }, + { + "epoch": 1.12896457105964, + "grad_norm": 0.7303032279014587, + "learning_rate": 4.177201520972746e-05, + "loss": 2.3906, + "step": 13989 + }, + { + "epoch": 1.1290452747962232, + "grad_norm": 0.698581874370575, + "learning_rate": 4.175918137023911e-05, + "loss": 2.4667, + "step": 13990 + }, + { + "epoch": 1.129125978532806, + "grad_norm": 0.69133061170578, + "learning_rate": 4.174634898226422e-05, + "loss": 2.4285, + "step": 13991 + }, + { + "epoch": 1.1292066822693891, + "grad_norm": 0.7029501795768738, + "learning_rate": 4.1733518046122576e-05, + "loss": 2.4839, + "step": 13992 + }, + { + "epoch": 1.129287386005972, + "grad_norm": 0.7566521167755127, + "learning_rate": 4.172068856213398e-05, + "loss": 2.5019, + "step": 13993 + }, + { + "epoch": 1.129368089742555, + "grad_norm": 0.697998046875, + "learning_rate": 4.1707860530618204e-05, + "loss": 2.4305, + "step": 13994 + }, + { + "epoch": 1.1294487934791382, + "grad_norm": 0.674194872379303, + "learning_rate": 4.169503395189489e-05, + "loss": 2.4361, + "step": 13995 + }, + { + "epoch": 1.129529497215721, + "grad_norm": 0.6936436891555786, + "learning_rate": 4.168220882628373e-05, + "loss": 2.518, + "step": 13996 + }, + { + "epoch": 
1.1296102009523041, + "grad_norm": 0.6831670999526978, + "learning_rate": 4.166938515410442e-05, + "loss": 2.4197, + "step": 13997 + }, + { + "epoch": 1.129690904688887, + "grad_norm": 0.7323662638664246, + "learning_rate": 4.165656293567647e-05, + "loss": 2.4555, + "step": 13998 + }, + { + "epoch": 1.12977160842547, + "grad_norm": 0.7699782848358154, + "learning_rate": 4.164374217131948e-05, + "loss": 2.4456, + "step": 13999 + }, + { + "epoch": 1.1298523121620532, + "grad_norm": 0.7009051442146301, + "learning_rate": 4.163092286135297e-05, + "loss": 2.4429, + "step": 14000 + }, + { + "epoch": 1.1298523121620532, + "eval_loss": 2.4034411907196045, + "eval_runtime": 771.1158, + "eval_samples_per_second": 3.398, + "eval_steps_per_second": 0.567, + "step": 14000 + }, + { + "epoch": 1.129933015898636, + "grad_norm": 0.674665093421936, + "learning_rate": 4.1618105006096456e-05, + "loss": 2.4127, + "step": 14001 + }, + { + "epoch": 1.1300137196352191, + "grad_norm": 0.7332403659820557, + "learning_rate": 4.1605288605869365e-05, + "loss": 2.4854, + "step": 14002 + }, + { + "epoch": 1.1300944233718022, + "grad_norm": 0.70233553647995, + "learning_rate": 4.159247366099117e-05, + "loss": 2.4433, + "step": 14003 + }, + { + "epoch": 1.130175127108385, + "grad_norm": 0.6259445548057556, + "learning_rate": 4.157966017178118e-05, + "loss": 2.3605, + "step": 14004 + }, + { + "epoch": 1.1302558308449682, + "grad_norm": 0.717408299446106, + "learning_rate": 4.1566848138558755e-05, + "loss": 2.4378, + "step": 14005 + }, + { + "epoch": 1.130336534581551, + "grad_norm": 0.6973297595977783, + "learning_rate": 4.155403756164323e-05, + "loss": 2.4363, + "step": 14006 + }, + { + "epoch": 1.1304172383181341, + "grad_norm": 0.7204940915107727, + "learning_rate": 4.154122844135391e-05, + "loss": 2.4814, + "step": 14007 + }, + { + "epoch": 1.1304979420547172, + "grad_norm": 0.8976696133613586, + "learning_rate": 4.1528420778009935e-05, + "loss": 2.4654, + "step": 14008 + }, + { + "epoch": 
1.1305786457913, + "grad_norm": 0.7270354628562927, + "learning_rate": 4.151561457193057e-05, + "loss": 2.4088, + "step": 14009 + }, + { + "epoch": 1.1306593495278832, + "grad_norm": 0.7200367450714111, + "learning_rate": 4.1502809823434985e-05, + "loss": 2.4412, + "step": 14010 + }, + { + "epoch": 1.130740053264466, + "grad_norm": 0.7593986392021179, + "learning_rate": 4.149000653284227e-05, + "loss": 2.5058, + "step": 14011 + }, + { + "epoch": 1.1308207570010491, + "grad_norm": 0.7322795987129211, + "learning_rate": 4.147720470047155e-05, + "loss": 2.4899, + "step": 14012 + }, + { + "epoch": 1.1309014607376322, + "grad_norm": 0.6649030447006226, + "learning_rate": 4.1464404326641905e-05, + "loss": 2.4358, + "step": 14013 + }, + { + "epoch": 1.130982164474215, + "grad_norm": 0.7258814573287964, + "learning_rate": 4.145160541167228e-05, + "loss": 2.4732, + "step": 14014 + }, + { + "epoch": 1.1310628682107982, + "grad_norm": 0.7414976358413696, + "learning_rate": 4.1438807955881695e-05, + "loss": 2.4157, + "step": 14015 + }, + { + "epoch": 1.1311435719473812, + "grad_norm": 0.6813236474990845, + "learning_rate": 4.142601195958914e-05, + "loss": 2.3966, + "step": 14016 + }, + { + "epoch": 1.131224275683964, + "grad_norm": 0.6715923547744751, + "learning_rate": 4.141321742311344e-05, + "loss": 2.4358, + "step": 14017 + }, + { + "epoch": 1.1313049794205472, + "grad_norm": 0.7174912691116333, + "learning_rate": 4.14004243467735e-05, + "loss": 2.4838, + "step": 14018 + }, + { + "epoch": 1.1313856831571303, + "grad_norm": 0.6945109963417053, + "learning_rate": 4.138763273088821e-05, + "loss": 2.4674, + "step": 14019 + }, + { + "epoch": 1.1314663868937131, + "grad_norm": 0.6759494543075562, + "learning_rate": 4.137484257577629e-05, + "loss": 2.4659, + "step": 14020 + }, + { + "epoch": 1.1315470906302962, + "grad_norm": 0.7077876925468445, + "learning_rate": 4.1362053881756534e-05, + "loss": 2.4731, + "step": 14021 + }, + { + "epoch": 1.131627794366879, + "grad_norm": 
0.6769500970840454, + "learning_rate": 4.1349266649147654e-05, + "loss": 2.3606, + "step": 14022 + }, + { + "epoch": 1.1317084981034622, + "grad_norm": 0.7104208469390869, + "learning_rate": 4.1336480878268424e-05, + "loss": 2.4626, + "step": 14023 + }, + { + "epoch": 1.1317892018400453, + "grad_norm": 0.7102686762809753, + "learning_rate": 4.132369656943741e-05, + "loss": 2.4545, + "step": 14024 + }, + { + "epoch": 1.1318699055766281, + "grad_norm": 0.7773897647857666, + "learning_rate": 4.1310913722973256e-05, + "loss": 2.5107, + "step": 14025 + }, + { + "epoch": 1.1319506093132112, + "grad_norm": 0.6427130103111267, + "learning_rate": 4.1298132339194585e-05, + "loss": 2.4349, + "step": 14026 + }, + { + "epoch": 1.132031313049794, + "grad_norm": 0.6725162863731384, + "learning_rate": 4.128535241841987e-05, + "loss": 2.4566, + "step": 14027 + }, + { + "epoch": 1.1321120167863772, + "grad_norm": 0.7182251214981079, + "learning_rate": 4.127257396096764e-05, + "loss": 2.4472, + "step": 14028 + }, + { + "epoch": 1.1321927205229603, + "grad_norm": 0.6712302565574646, + "learning_rate": 4.1259796967156426e-05, + "loss": 2.4326, + "step": 14029 + }, + { + "epoch": 1.1322734242595431, + "grad_norm": 0.7726041078567505, + "learning_rate": 4.124702143730459e-05, + "loss": 2.4994, + "step": 14030 + }, + { + "epoch": 1.1323541279961262, + "grad_norm": 0.651899516582489, + "learning_rate": 4.123424737173056e-05, + "loss": 2.4244, + "step": 14031 + }, + { + "epoch": 1.1324348317327093, + "grad_norm": 0.6646261215209961, + "learning_rate": 4.12214747707527e-05, + "loss": 2.5027, + "step": 14032 + }, + { + "epoch": 1.1325155354692922, + "grad_norm": 0.729098916053772, + "learning_rate": 4.120870363468933e-05, + "loss": 2.5117, + "step": 14033 + }, + { + "epoch": 1.1325962392058753, + "grad_norm": 0.7056638598442078, + "learning_rate": 4.119593396385876e-05, + "loss": 2.4279, + "step": 14034 + }, + { + "epoch": 1.1326769429424584, + "grad_norm": 0.7051844000816345, + 
"learning_rate": 4.1183165758579255e-05, + "loss": 2.3844, + "step": 14035 + }, + { + "epoch": 1.1327576466790412, + "grad_norm": 0.6954311728477478, + "learning_rate": 4.1170399019168984e-05, + "loss": 2.4041, + "step": 14036 + }, + { + "epoch": 1.1328383504156243, + "grad_norm": 0.650044858455658, + "learning_rate": 4.1157633745946135e-05, + "loss": 2.4397, + "step": 14037 + }, + { + "epoch": 1.1329190541522072, + "grad_norm": 0.6974380016326904, + "learning_rate": 4.114486993922888e-05, + "loss": 2.4391, + "step": 14038 + }, + { + "epoch": 1.1329997578887903, + "grad_norm": 0.7252807021141052, + "learning_rate": 4.113210759933536e-05, + "loss": 2.4471, + "step": 14039 + }, + { + "epoch": 1.1330804616253733, + "grad_norm": 0.7001414895057678, + "learning_rate": 4.111934672658354e-05, + "loss": 2.402, + "step": 14040 + }, + { + "epoch": 1.1331611653619562, + "grad_norm": 0.7420533895492554, + "learning_rate": 4.110658732129153e-05, + "loss": 2.4987, + "step": 14041 + }, + { + "epoch": 1.1332418690985393, + "grad_norm": 0.6850644946098328, + "learning_rate": 4.1093829383777315e-05, + "loss": 2.4355, + "step": 14042 + }, + { + "epoch": 1.1333225728351222, + "grad_norm": 0.6905977725982666, + "learning_rate": 4.108107291435885e-05, + "loss": 2.4818, + "step": 14043 + }, + { + "epoch": 1.1334032765717053, + "grad_norm": 0.6555112600326538, + "learning_rate": 4.106831791335407e-05, + "loss": 2.425, + "step": 14044 + }, + { + "epoch": 1.1334839803082883, + "grad_norm": 0.6570355892181396, + "learning_rate": 4.105556438108089e-05, + "loss": 2.4232, + "step": 14045 + }, + { + "epoch": 1.1335646840448712, + "grad_norm": 0.7910747528076172, + "learning_rate": 4.104281231785708e-05, + "loss": 2.484, + "step": 14046 + }, + { + "epoch": 1.1336453877814543, + "grad_norm": 0.6581952571868896, + "learning_rate": 4.103006172400052e-05, + "loss": 2.4102, + "step": 14047 + }, + { + "epoch": 1.1337260915180374, + "grad_norm": 0.6834773421287537, + "learning_rate": 
4.1017312599828994e-05, + "loss": 2.4602, + "step": 14048 + }, + { + "epoch": 1.1338067952546202, + "grad_norm": 0.7588350772857666, + "learning_rate": 4.1004564945660195e-05, + "loss": 2.5059, + "step": 14049 + }, + { + "epoch": 1.1338874989912033, + "grad_norm": 0.6604699492454529, + "learning_rate": 4.099181876181185e-05, + "loss": 2.4403, + "step": 14050 + }, + { + "epoch": 1.1339682027277862, + "grad_norm": 0.6957669258117676, + "learning_rate": 4.097907404860163e-05, + "loss": 2.4218, + "step": 14051 + }, + { + "epoch": 1.1340489064643693, + "grad_norm": 0.7091849446296692, + "learning_rate": 4.0966330806347166e-05, + "loss": 2.4396, + "step": 14052 + }, + { + "epoch": 1.1341296102009524, + "grad_norm": 0.6637482047080994, + "learning_rate": 4.095358903536605e-05, + "loss": 2.4514, + "step": 14053 + }, + { + "epoch": 1.1342103139375352, + "grad_norm": 0.7485960125923157, + "learning_rate": 4.0940848735975846e-05, + "loss": 2.4401, + "step": 14054 + }, + { + "epoch": 1.1342910176741183, + "grad_norm": 0.6509774327278137, + "learning_rate": 4.092810990849411e-05, + "loss": 2.4575, + "step": 14055 + }, + { + "epoch": 1.1343717214107012, + "grad_norm": 0.7151626348495483, + "learning_rate": 4.091537255323825e-05, + "loss": 2.45, + "step": 14056 + }, + { + "epoch": 1.1344524251472843, + "grad_norm": 0.7536267042160034, + "learning_rate": 4.0902636670525764e-05, + "loss": 2.497, + "step": 14057 + }, + { + "epoch": 1.1345331288838674, + "grad_norm": 0.7779545783996582, + "learning_rate": 4.0889902260674086e-05, + "loss": 2.412, + "step": 14058 + }, + { + "epoch": 1.1346138326204502, + "grad_norm": 0.7211748957633972, + "learning_rate": 4.087716932400052e-05, + "loss": 2.4727, + "step": 14059 + }, + { + "epoch": 1.1346945363570333, + "grad_norm": 0.6710701584815979, + "learning_rate": 4.086443786082245e-05, + "loss": 2.4318, + "step": 14060 + }, + { + "epoch": 1.1347752400936164, + "grad_norm": 0.7072857022285461, + "learning_rate": 4.085170787145717e-05, + "loss": 
2.4672, + "step": 14061 + }, + { + "epoch": 1.1348559438301993, + "grad_norm": 0.6475152969360352, + "learning_rate": 4.083897935622194e-05, + "loss": 2.4104, + "step": 14062 + }, + { + "epoch": 1.1349366475667824, + "grad_norm": 0.7408067584037781, + "learning_rate": 4.0826252315433986e-05, + "loss": 2.4129, + "step": 14063 + }, + { + "epoch": 1.1350173513033655, + "grad_norm": 0.732540488243103, + "learning_rate": 4.081352674941056e-05, + "loss": 2.4209, + "step": 14064 + }, + { + "epoch": 1.1350980550399483, + "grad_norm": 0.6933332681655884, + "learning_rate": 4.080080265846872e-05, + "loss": 2.3797, + "step": 14065 + }, + { + "epoch": 1.1351787587765314, + "grad_norm": 0.6507896780967712, + "learning_rate": 4.078808004292561e-05, + "loss": 2.4372, + "step": 14066 + }, + { + "epoch": 1.1352594625131143, + "grad_norm": 0.729292094707489, + "learning_rate": 4.0775358903098384e-05, + "loss": 2.5513, + "step": 14067 + }, + { + "epoch": 1.1353401662496974, + "grad_norm": 0.692757248878479, + "learning_rate": 4.076263923930398e-05, + "loss": 2.4228, + "step": 14068 + }, + { + "epoch": 1.1354208699862804, + "grad_norm": 0.7028260231018066, + "learning_rate": 4.074992105185946e-05, + "loss": 2.4478, + "step": 14069 + }, + { + "epoch": 1.1355015737228633, + "grad_norm": 0.65067058801651, + "learning_rate": 4.073720434108179e-05, + "loss": 2.3729, + "step": 14070 + }, + { + "epoch": 1.1355822774594464, + "grad_norm": 0.6884061098098755, + "learning_rate": 4.0724489107287933e-05, + "loss": 2.3693, + "step": 14071 + }, + { + "epoch": 1.1356629811960293, + "grad_norm": 0.70686936378479, + "learning_rate": 4.071177535079472e-05, + "loss": 2.4989, + "step": 14072 + }, + { + "epoch": 1.1357436849326124, + "grad_norm": 0.6792482733726501, + "learning_rate": 4.0699063071919016e-05, + "loss": 2.393, + "step": 14073 + }, + { + "epoch": 1.1358243886691954, + "grad_norm": 0.7231085896492004, + "learning_rate": 4.0686352270977745e-05, + "loss": 2.4597, + "step": 14074 + }, + { + 
"epoch": 1.1359050924057783, + "grad_norm": 0.8024532198905945, + "learning_rate": 4.067364294828758e-05, + "loss": 2.4409, + "step": 14075 + }, + { + "epoch": 1.1359857961423614, + "grad_norm": 0.6761424541473389, + "learning_rate": 4.066093510416532e-05, + "loss": 2.4598, + "step": 14076 + }, + { + "epoch": 1.1360664998789445, + "grad_norm": 0.7075559496879578, + "learning_rate": 4.064822873892771e-05, + "loss": 2.4649, + "step": 14077 + }, + { + "epoch": 1.1361472036155273, + "grad_norm": 0.6292272806167603, + "learning_rate": 4.063552385289134e-05, + "loss": 2.445, + "step": 14078 + }, + { + "epoch": 1.1362279073521104, + "grad_norm": 0.6435273885726929, + "learning_rate": 4.06228204463729e-05, + "loss": 2.4105, + "step": 14079 + }, + { + "epoch": 1.1363086110886935, + "grad_norm": 0.7135637402534485, + "learning_rate": 4.061011851968903e-05, + "loss": 2.3907, + "step": 14080 + }, + { + "epoch": 1.1363893148252764, + "grad_norm": 0.7424013614654541, + "learning_rate": 4.059741807315621e-05, + "loss": 2.4405, + "step": 14081 + }, + { + "epoch": 1.1364700185618595, + "grad_norm": 0.6649916768074036, + "learning_rate": 4.0584719107091016e-05, + "loss": 2.4314, + "step": 14082 + }, + { + "epoch": 1.1365507222984423, + "grad_norm": 0.6700563430786133, + "learning_rate": 4.0572021621809944e-05, + "loss": 2.4093, + "step": 14083 + }, + { + "epoch": 1.1366314260350254, + "grad_norm": 0.6740709543228149, + "learning_rate": 4.055932561762942e-05, + "loss": 2.4301, + "step": 14084 + }, + { + "epoch": 1.1367121297716085, + "grad_norm": 0.7039555907249451, + "learning_rate": 4.0546631094865895e-05, + "loss": 2.4427, + "step": 14085 + }, + { + "epoch": 1.1367928335081914, + "grad_norm": 0.7461164593696594, + "learning_rate": 4.053393805383573e-05, + "loss": 2.3865, + "step": 14086 + }, + { + "epoch": 1.1368735372447745, + "grad_norm": 0.6808290481567383, + "learning_rate": 4.0521246494855316e-05, + "loss": 2.3738, + "step": 14087 + }, + { + "epoch": 1.1369542409813573, + 
"grad_norm": 0.6942760944366455, + "learning_rate": 4.0508556418240875e-05, + "loss": 2.4351, + "step": 14088 + }, + { + "epoch": 1.1370349447179404, + "grad_norm": 0.7615510821342468, + "learning_rate": 4.049586782430872e-05, + "loss": 2.3968, + "step": 14089 + }, + { + "epoch": 1.1371156484545235, + "grad_norm": 0.7240662574768066, + "learning_rate": 4.048318071337512e-05, + "loss": 2.4046, + "step": 14090 + }, + { + "epoch": 1.1371963521911064, + "grad_norm": 0.7286471128463745, + "learning_rate": 4.047049508575621e-05, + "loss": 2.4039, + "step": 14091 + }, + { + "epoch": 1.1372770559276895, + "grad_norm": 0.7031459212303162, + "learning_rate": 4.045781094176816e-05, + "loss": 2.4494, + "step": 14092 + }, + { + "epoch": 1.1373577596642725, + "grad_norm": 0.7116301655769348, + "learning_rate": 4.0445128281727116e-05, + "loss": 2.3991, + "step": 14093 + }, + { + "epoch": 1.1374384634008554, + "grad_norm": 0.6719788312911987, + "learning_rate": 4.043244710594914e-05, + "loss": 2.4823, + "step": 14094 + }, + { + "epoch": 1.1375191671374385, + "grad_norm": 0.6770508885383606, + "learning_rate": 4.041976741475031e-05, + "loss": 2.4362, + "step": 14095 + }, + { + "epoch": 1.1375998708740216, + "grad_norm": 0.6808609962463379, + "learning_rate": 4.040708920844666e-05, + "loss": 2.435, + "step": 14096 + }, + { + "epoch": 1.1376805746106045, + "grad_norm": 0.7445514798164368, + "learning_rate": 4.0394412487354074e-05, + "loss": 2.4749, + "step": 14097 + }, + { + "epoch": 1.1377612783471875, + "grad_norm": 0.7024775743484497, + "learning_rate": 4.038173725178854e-05, + "loss": 2.4354, + "step": 14098 + }, + { + "epoch": 1.1378419820837704, + "grad_norm": 0.6925685405731201, + "learning_rate": 4.0369063502066e-05, + "loss": 2.4462, + "step": 14099 + }, + { + "epoch": 1.1379226858203535, + "grad_norm": 0.6970539689064026, + "learning_rate": 4.035639123850223e-05, + "loss": 2.3842, + "step": 14100 + }, + { + "epoch": 1.1380033895569364, + "grad_norm": 0.6571836471557617, + 
"learning_rate": 4.0343720461413107e-05, + "loss": 2.4213, + "step": 14101 + }, + { + "epoch": 1.1380840932935194, + "grad_norm": 0.7264918684959412, + "learning_rate": 4.033105117111441e-05, + "loss": 2.4697, + "step": 14102 + }, + { + "epoch": 1.1381647970301025, + "grad_norm": 0.6929560899734497, + "learning_rate": 4.03183833679219e-05, + "loss": 2.461, + "step": 14103 + }, + { + "epoch": 1.1382455007666854, + "grad_norm": 0.6533559560775757, + "learning_rate": 4.030571705215128e-05, + "loss": 2.4336, + "step": 14104 + }, + { + "epoch": 1.1383262045032685, + "grad_norm": 0.7372364401817322, + "learning_rate": 4.0293052224118234e-05, + "loss": 2.4396, + "step": 14105 + }, + { + "epoch": 1.1384069082398516, + "grad_norm": 0.6736310720443726, + "learning_rate": 4.028038888413844e-05, + "loss": 2.4123, + "step": 14106 + }, + { + "epoch": 1.1384876119764344, + "grad_norm": 0.6898338794708252, + "learning_rate": 4.026772703252742e-05, + "loss": 2.431, + "step": 14107 + }, + { + "epoch": 1.1385683157130175, + "grad_norm": 0.7933369278907776, + "learning_rate": 4.02550666696008e-05, + "loss": 2.4669, + "step": 14108 + }, + { + "epoch": 1.1386490194496006, + "grad_norm": 0.7218122482299805, + "learning_rate": 4.024240779567412e-05, + "loss": 2.3761, + "step": 14109 + }, + { + "epoch": 1.1387297231861835, + "grad_norm": 0.7018248438835144, + "learning_rate": 4.022975041106281e-05, + "loss": 2.4011, + "step": 14110 + }, + { + "epoch": 1.1388104269227666, + "grad_norm": 0.6709668040275574, + "learning_rate": 4.0217094516082364e-05, + "loss": 2.426, + "step": 14111 + }, + { + "epoch": 1.1388911306593494, + "grad_norm": 0.7241504192352295, + "learning_rate": 4.0204440111048195e-05, + "loss": 2.4085, + "step": 14112 + }, + { + "epoch": 1.1389718343959325, + "grad_norm": 0.731347382068634, + "learning_rate": 4.0191787196275675e-05, + "loss": 2.502, + "step": 14113 + }, + { + "epoch": 1.1390525381325156, + "grad_norm": 0.6630167365074158, + "learning_rate": 
4.0179135772080166e-05, + "loss": 2.3999, + "step": 14114 + }, + { + "epoch": 1.1391332418690985, + "grad_norm": 0.7094748616218567, + "learning_rate": 4.016648583877698e-05, + "loss": 2.4666, + "step": 14115 + }, + { + "epoch": 1.1392139456056816, + "grad_norm": 0.7262436151504517, + "learning_rate": 4.0153837396681395e-05, + "loss": 2.4369, + "step": 14116 + }, + { + "epoch": 1.1392946493422644, + "grad_norm": 0.6796039938926697, + "learning_rate": 4.014119044610859e-05, + "loss": 2.4607, + "step": 14117 + }, + { + "epoch": 1.1393753530788475, + "grad_norm": 0.6690036058425903, + "learning_rate": 4.0128544987373785e-05, + "loss": 2.4145, + "step": 14118 + }, + { + "epoch": 1.1394560568154306, + "grad_norm": 0.6987181305885315, + "learning_rate": 4.011590102079219e-05, + "loss": 2.4294, + "step": 14119 + }, + { + "epoch": 1.1395367605520135, + "grad_norm": 0.6756789684295654, + "learning_rate": 4.0103258546678836e-05, + "loss": 2.396, + "step": 14120 + }, + { + "epoch": 1.1396174642885966, + "grad_norm": 0.7027772068977356, + "learning_rate": 4.009061756534885e-05, + "loss": 2.3971, + "step": 14121 + }, + { + "epoch": 1.1396981680251796, + "grad_norm": 0.6872174143791199, + "learning_rate": 4.007797807711732e-05, + "loss": 2.4297, + "step": 14122 + }, + { + "epoch": 1.1397788717617625, + "grad_norm": 0.7213007211685181, + "learning_rate": 4.006534008229914e-05, + "loss": 2.4792, + "step": 14123 + }, + { + "epoch": 1.1398595754983456, + "grad_norm": 0.6771649122238159, + "learning_rate": 4.0052703581209395e-05, + "loss": 2.4397, + "step": 14124 + }, + { + "epoch": 1.1399402792349287, + "grad_norm": 0.6577184796333313, + "learning_rate": 4.0040068574163013e-05, + "loss": 2.4113, + "step": 14125 + }, + { + "epoch": 1.1400209829715116, + "grad_norm": 0.7493160367012024, + "learning_rate": 4.002743506147483e-05, + "loss": 2.4454, + "step": 14126 + }, + { + "epoch": 1.1401016867080946, + "grad_norm": 0.6820357441902161, + "learning_rate": 4.0014803043459726e-05, + 
"loss": 2.4126, + "step": 14127 + }, + { + "epoch": 1.1401823904446775, + "grad_norm": 0.7177188992500305, + "learning_rate": 4.000217252043258e-05, + "loss": 2.4355, + "step": 14128 + }, + { + "epoch": 1.1402630941812606, + "grad_norm": 0.654371440410614, + "learning_rate": 3.998954349270808e-05, + "loss": 2.4932, + "step": 14129 + }, + { + "epoch": 1.1403437979178437, + "grad_norm": 0.7029837965965271, + "learning_rate": 3.997691596060104e-05, + "loss": 2.4341, + "step": 14130 + }, + { + "epoch": 1.1404245016544265, + "grad_norm": 0.7971171140670776, + "learning_rate": 3.996428992442615e-05, + "loss": 2.4466, + "step": 14131 + }, + { + "epoch": 1.1405052053910096, + "grad_norm": 0.6941849589347839, + "learning_rate": 3.9951665384498114e-05, + "loss": 2.4861, + "step": 14132 + }, + { + "epoch": 1.1405859091275925, + "grad_norm": 0.6657733917236328, + "learning_rate": 3.993904234113153e-05, + "loss": 2.4266, + "step": 14133 + }, + { + "epoch": 1.1406666128641756, + "grad_norm": 0.6780329346656799, + "learning_rate": 3.9926420794641e-05, + "loss": 2.458, + "step": 14134 + }, + { + "epoch": 1.1407473166007587, + "grad_norm": 0.7070702910423279, + "learning_rate": 3.991380074534109e-05, + "loss": 2.368, + "step": 14135 + }, + { + "epoch": 1.1408280203373415, + "grad_norm": 0.7186575531959534, + "learning_rate": 3.990118219354635e-05, + "loss": 2.4611, + "step": 14136 + }, + { + "epoch": 1.1409087240739246, + "grad_norm": 0.7171763777732849, + "learning_rate": 3.988856513957123e-05, + "loss": 2.4315, + "step": 14137 + }, + { + "epoch": 1.1409894278105077, + "grad_norm": 0.7090228796005249, + "learning_rate": 3.987594958373025e-05, + "loss": 2.4668, + "step": 14138 + }, + { + "epoch": 1.1410701315470906, + "grad_norm": 0.6523951888084412, + "learning_rate": 3.986333552633773e-05, + "loss": 2.4392, + "step": 14139 + }, + { + "epoch": 1.1411508352836737, + "grad_norm": 0.706000804901123, + "learning_rate": 3.98507229677081e-05, + "loss": 2.4382, + "step": 14140 + }, + { + 
"epoch": 1.1412315390202568, + "grad_norm": 0.6537537574768066, + "learning_rate": 3.983811190815571e-05, + "loss": 2.456, + "step": 14141 + }, + { + "epoch": 1.1413122427568396, + "grad_norm": 0.7509549856185913, + "learning_rate": 3.982550234799479e-05, + "loss": 2.4744, + "step": 14142 + }, + { + "epoch": 1.1413929464934227, + "grad_norm": 0.7188650965690613, + "learning_rate": 3.981289428753967e-05, + "loss": 2.4632, + "step": 14143 + }, + { + "epoch": 1.1414736502300056, + "grad_norm": 0.7563674449920654, + "learning_rate": 3.9800287727104544e-05, + "loss": 2.5063, + "step": 14144 + }, + { + "epoch": 1.1415543539665887, + "grad_norm": 0.8374128341674805, + "learning_rate": 3.978768266700361e-05, + "loss": 2.4942, + "step": 14145 + }, + { + "epoch": 1.1416350577031718, + "grad_norm": 0.7020177841186523, + "learning_rate": 3.9775079107551027e-05, + "loss": 2.4404, + "step": 14146 + }, + { + "epoch": 1.1417157614397546, + "grad_norm": 0.7326170802116394, + "learning_rate": 3.9762477049060895e-05, + "loss": 2.4127, + "step": 14147 + }, + { + "epoch": 1.1417964651763377, + "grad_norm": 0.6661173105239868, + "learning_rate": 3.974987649184734e-05, + "loss": 2.4649, + "step": 14148 + }, + { + "epoch": 1.1418771689129206, + "grad_norm": 0.7186033129692078, + "learning_rate": 3.973727743622432e-05, + "loss": 2.4275, + "step": 14149 + }, + { + "epoch": 1.1419578726495037, + "grad_norm": 0.7193881869316101, + "learning_rate": 3.972467988250588e-05, + "loss": 2.4997, + "step": 14150 + }, + { + "epoch": 1.1420385763860867, + "grad_norm": 0.7139542102813721, + "learning_rate": 3.971208383100601e-05, + "loss": 2.4211, + "step": 14151 + }, + { + "epoch": 1.1421192801226696, + "grad_norm": 0.6840166449546814, + "learning_rate": 3.969948928203856e-05, + "loss": 2.4504, + "step": 14152 + }, + { + "epoch": 1.1421999838592527, + "grad_norm": 0.8261072039604187, + "learning_rate": 3.968689623591747e-05, + "loss": 2.4901, + "step": 14153 + }, + { + "epoch": 1.1422806875958358, + 
"grad_norm": 0.7636086940765381, + "learning_rate": 3.96743046929566e-05, + "loss": 2.4202, + "step": 14154 + }, + { + "epoch": 1.1423613913324187, + "grad_norm": 0.7477976679801941, + "learning_rate": 3.966171465346973e-05, + "loss": 2.492, + "step": 14155 + }, + { + "epoch": 1.1424420950690017, + "grad_norm": 0.7516389489173889, + "learning_rate": 3.9649126117770665e-05, + "loss": 2.4512, + "step": 14156 + }, + { + "epoch": 1.1425227988055846, + "grad_norm": 0.6987521648406982, + "learning_rate": 3.9636539086173174e-05, + "loss": 2.4005, + "step": 14157 + }, + { + "epoch": 1.1426035025421677, + "grad_norm": 0.7242532968521118, + "learning_rate": 3.962395355899088e-05, + "loss": 2.4414, + "step": 14158 + }, + { + "epoch": 1.1426842062787508, + "grad_norm": 0.6616180539131165, + "learning_rate": 3.961136953653749e-05, + "loss": 2.4442, + "step": 14159 + }, + { + "epoch": 1.1427649100153336, + "grad_norm": 0.7165415287017822, + "learning_rate": 3.959878701912667e-05, + "loss": 2.4658, + "step": 14160 + }, + { + "epoch": 1.1428456137519167, + "grad_norm": 0.6619318127632141, + "learning_rate": 3.9586206007071926e-05, + "loss": 2.3803, + "step": 14161 + }, + { + "epoch": 1.1429263174884996, + "grad_norm": 0.6654838919639587, + "learning_rate": 3.957362650068684e-05, + "loss": 2.4584, + "step": 14162 + }, + { + "epoch": 1.1430070212250827, + "grad_norm": 0.6947140097618103, + "learning_rate": 3.956104850028496e-05, + "loss": 2.4236, + "step": 14163 + }, + { + "epoch": 1.1430877249616658, + "grad_norm": 0.6510412096977234, + "learning_rate": 3.954847200617973e-05, + "loss": 2.3589, + "step": 14164 + }, + { + "epoch": 1.1431684286982486, + "grad_norm": 0.7550667524337769, + "learning_rate": 3.95358970186846e-05, + "loss": 2.419, + "step": 14165 + }, + { + "epoch": 1.1432491324348317, + "grad_norm": 0.7898361682891846, + "learning_rate": 3.9523323538112975e-05, + "loss": 2.4549, + "step": 14166 + }, + { + "epoch": 1.1433298361714148, + "grad_norm": 0.7162390947341919, + 
"learning_rate": 3.9510751564778246e-05, + "loss": 2.4493, + "step": 14167 + }, + { + "epoch": 1.1434105399079977, + "grad_norm": 0.8251990079879761, + "learning_rate": 3.949818109899367e-05, + "loss": 2.4474, + "step": 14168 + }, + { + "epoch": 1.1434912436445808, + "grad_norm": 0.6739209890365601, + "learning_rate": 3.948561214107258e-05, + "loss": 2.4564, + "step": 14169 + }, + { + "epoch": 1.1435719473811639, + "grad_norm": 0.6606340408325195, + "learning_rate": 3.9473044691328254e-05, + "loss": 2.3838, + "step": 14170 + }, + { + "epoch": 1.1436526511177467, + "grad_norm": 0.7297452092170715, + "learning_rate": 3.946047875007384e-05, + "loss": 2.4673, + "step": 14171 + }, + { + "epoch": 1.1437333548543298, + "grad_norm": 0.7382420301437378, + "learning_rate": 3.9447914317622546e-05, + "loss": 2.4279, + "step": 14172 + }, + { + "epoch": 1.1438140585909127, + "grad_norm": 0.6947354674339294, + "learning_rate": 3.9435351394287546e-05, + "loss": 2.4553, + "step": 14173 + }, + { + "epoch": 1.1438947623274958, + "grad_norm": 0.670369565486908, + "learning_rate": 3.942278998038183e-05, + "loss": 2.4285, + "step": 14174 + }, + { + "epoch": 1.1439754660640788, + "grad_norm": 0.7097954154014587, + "learning_rate": 3.941023007621859e-05, + "loss": 2.477, + "step": 14175 + }, + { + "epoch": 1.1440561698006617, + "grad_norm": 0.6490213871002197, + "learning_rate": 3.9397671682110826e-05, + "loss": 2.3943, + "step": 14176 + }, + { + "epoch": 1.1441368735372448, + "grad_norm": 0.6505936980247498, + "learning_rate": 3.938511479837147e-05, + "loss": 2.4188, + "step": 14177 + }, + { + "epoch": 1.1442175772738277, + "grad_norm": 0.6696773767471313, + "learning_rate": 3.9372559425313496e-05, + "loss": 2.4377, + "step": 14178 + }, + { + "epoch": 1.1442982810104108, + "grad_norm": 0.6747034192085266, + "learning_rate": 3.936000556324982e-05, + "loss": 2.4111, + "step": 14179 + }, + { + "epoch": 1.1443789847469938, + "grad_norm": 0.7766546607017517, + "learning_rate": 
3.934745321249336e-05, + "loss": 2.3873, + "step": 14180 + }, + { + "epoch": 1.1444596884835767, + "grad_norm": 0.7608100175857544, + "learning_rate": 3.933490237335688e-05, + "loss": 2.4567, + "step": 14181 + }, + { + "epoch": 1.1445403922201598, + "grad_norm": 0.7724356055259705, + "learning_rate": 3.9322353046153205e-05, + "loss": 2.4729, + "step": 14182 + }, + { + "epoch": 1.1446210959567429, + "grad_norm": 0.6908414363861084, + "learning_rate": 3.930980523119515e-05, + "loss": 2.41, + "step": 14183 + }, + { + "epoch": 1.1447017996933257, + "grad_norm": 0.7209733128547668, + "learning_rate": 3.9297258928795356e-05, + "loss": 2.4629, + "step": 14184 + }, + { + "epoch": 1.1447825034299088, + "grad_norm": 0.7116519212722778, + "learning_rate": 3.928471413926651e-05, + "loss": 2.5081, + "step": 14185 + }, + { + "epoch": 1.144863207166492, + "grad_norm": 0.6704578995704651, + "learning_rate": 3.9272170862921365e-05, + "loss": 2.494, + "step": 14186 + }, + { + "epoch": 1.1449439109030748, + "grad_norm": 0.6914607882499695, + "learning_rate": 3.9259629100072435e-05, + "loss": 2.3979, + "step": 14187 + }, + { + "epoch": 1.1450246146396579, + "grad_norm": 0.7413245439529419, + "learning_rate": 3.924708885103233e-05, + "loss": 2.4534, + "step": 14188 + }, + { + "epoch": 1.1451053183762407, + "grad_norm": 0.7411661744117737, + "learning_rate": 3.923455011611362e-05, + "loss": 2.4191, + "step": 14189 + }, + { + "epoch": 1.1451860221128238, + "grad_norm": 0.6581972241401672, + "learning_rate": 3.9222012895628716e-05, + "loss": 2.4494, + "step": 14190 + }, + { + "epoch": 1.145266725849407, + "grad_norm": 0.6628647446632385, + "learning_rate": 3.920947718989013e-05, + "loss": 2.4483, + "step": 14191 + }, + { + "epoch": 1.1453474295859898, + "grad_norm": 0.7068151831626892, + "learning_rate": 3.9196942999210316e-05, + "loss": 2.4549, + "step": 14192 + }, + { + "epoch": 1.1454281333225729, + "grad_norm": 0.6727713942527771, + "learning_rate": 3.918441032390159e-05, + "loss": 
2.4261, + "step": 14193 + }, + { + "epoch": 1.1455088370591557, + "grad_norm": 0.6680718660354614, + "learning_rate": 3.9171879164276334e-05, + "loss": 2.4705, + "step": 14194 + }, + { + "epoch": 1.1455895407957388, + "grad_norm": 0.710096538066864, + "learning_rate": 3.915934952064685e-05, + "loss": 2.474, + "step": 14195 + }, + { + "epoch": 1.145670244532322, + "grad_norm": 0.6927496790885925, + "learning_rate": 3.9146821393325414e-05, + "loss": 2.3979, + "step": 14196 + }, + { + "epoch": 1.1457509482689048, + "grad_norm": 0.6887550354003906, + "learning_rate": 3.913429478262427e-05, + "loss": 2.4588, + "step": 14197 + }, + { + "epoch": 1.1458316520054879, + "grad_norm": 0.6847062706947327, + "learning_rate": 3.912176968885559e-05, + "loss": 2.4602, + "step": 14198 + }, + { + "epoch": 1.145912355742071, + "grad_norm": 0.6832349300384521, + "learning_rate": 3.91092461123316e-05, + "loss": 2.4672, + "step": 14199 + }, + { + "epoch": 1.1459930594786538, + "grad_norm": 0.6789066791534424, + "learning_rate": 3.909672405336432e-05, + "loss": 2.5029, + "step": 14200 + }, + { + "epoch": 1.146073763215237, + "grad_norm": 0.6953951120376587, + "learning_rate": 3.9084203512265885e-05, + "loss": 2.4223, + "step": 14201 + }, + { + "epoch": 1.1461544669518198, + "grad_norm": 0.6629688739776611, + "learning_rate": 3.907168448934836e-05, + "loss": 2.4028, + "step": 14202 + }, + { + "epoch": 1.1462351706884029, + "grad_norm": 0.6661216020584106, + "learning_rate": 3.90591669849237e-05, + "loss": 2.4668, + "step": 14203 + }, + { + "epoch": 1.146315874424986, + "grad_norm": 0.6814442276954651, + "learning_rate": 3.9046650999303894e-05, + "loss": 2.4273, + "step": 14204 + }, + { + "epoch": 1.1463965781615688, + "grad_norm": 0.6678626537322998, + "learning_rate": 3.903413653280088e-05, + "loss": 2.444, + "step": 14205 + }, + { + "epoch": 1.146477281898152, + "grad_norm": 0.6703703999519348, + "learning_rate": 3.902162358572655e-05, + "loss": 2.4273, + "step": 14206 + }, + { + 
"epoch": 1.1465579856347348, + "grad_norm": 0.7052578926086426, + "learning_rate": 3.900911215839276e-05, + "loss": 2.4397, + "step": 14207 + }, + { + "epoch": 1.1466386893713179, + "grad_norm": 0.6792036294937134, + "learning_rate": 3.899660225111136e-05, + "loss": 2.439, + "step": 14208 + }, + { + "epoch": 1.146719393107901, + "grad_norm": 0.6995401978492737, + "learning_rate": 3.898409386419407e-05, + "loss": 2.5002, + "step": 14209 + }, + { + "epoch": 1.1468000968444838, + "grad_norm": 0.6527338027954102, + "learning_rate": 3.897158699795265e-05, + "loss": 2.4523, + "step": 14210 + }, + { + "epoch": 1.146880800581067, + "grad_norm": 0.7509400248527527, + "learning_rate": 3.8959081652698814e-05, + "loss": 2.4193, + "step": 14211 + }, + { + "epoch": 1.14696150431765, + "grad_norm": 0.6985350251197815, + "learning_rate": 3.894657782874426e-05, + "loss": 2.4251, + "step": 14212 + }, + { + "epoch": 1.1470422080542328, + "grad_norm": 0.6831483840942383, + "learning_rate": 3.893407552640055e-05, + "loss": 2.4172, + "step": 14213 + }, + { + "epoch": 1.147122911790816, + "grad_norm": 0.7281469702720642, + "learning_rate": 3.892157474597929e-05, + "loss": 2.4451, + "step": 14214 + }, + { + "epoch": 1.147203615527399, + "grad_norm": 0.7326027750968933, + "learning_rate": 3.8909075487792066e-05, + "loss": 2.3926, + "step": 14215 + }, + { + "epoch": 1.1472843192639819, + "grad_norm": 0.7030496597290039, + "learning_rate": 3.889657775215036e-05, + "loss": 2.435, + "step": 14216 + }, + { + "epoch": 1.147365023000565, + "grad_norm": 0.6915596127510071, + "learning_rate": 3.888408153936568e-05, + "loss": 2.4622, + "step": 14217 + }, + { + "epoch": 1.1474457267371478, + "grad_norm": 0.678600013256073, + "learning_rate": 3.8871586849749474e-05, + "loss": 2.4264, + "step": 14218 + }, + { + "epoch": 1.147526430473731, + "grad_norm": 0.7487786412239075, + "learning_rate": 3.885909368361308e-05, + "loss": 2.4038, + "step": 14219 + }, + { + "epoch": 1.147607134210314, + "grad_norm": 
0.6658064723014832, + "learning_rate": 3.8846602041267886e-05, + "loss": 2.4079, + "step": 14220 + }, + { + "epoch": 1.1476878379468969, + "grad_norm": 0.6985111832618713, + "learning_rate": 3.883411192302527e-05, + "loss": 2.481, + "step": 14221 + }, + { + "epoch": 1.14776854168348, + "grad_norm": 0.7056208848953247, + "learning_rate": 3.8821623329196445e-05, + "loss": 2.4409, + "step": 14222 + }, + { + "epoch": 1.1478492454200628, + "grad_norm": 0.7107830047607422, + "learning_rate": 3.880913626009268e-05, + "loss": 2.4578, + "step": 14223 + }, + { + "epoch": 1.147929949156646, + "grad_norm": 0.6678555607795715, + "learning_rate": 3.87966507160252e-05, + "loss": 2.4548, + "step": 14224 + }, + { + "epoch": 1.148010652893229, + "grad_norm": 0.6699830293655396, + "learning_rate": 3.8784166697305157e-05, + "loss": 2.3763, + "step": 14225 + }, + { + "epoch": 1.1480913566298119, + "grad_norm": 0.7695464491844177, + "learning_rate": 3.8771684204243716e-05, + "loss": 2.4774, + "step": 14226 + }, + { + "epoch": 1.148172060366395, + "grad_norm": 0.7801330089569092, + "learning_rate": 3.8759203237151954e-05, + "loss": 2.4598, + "step": 14227 + }, + { + "epoch": 1.148252764102978, + "grad_norm": 0.7029622793197632, + "learning_rate": 3.8746723796340955e-05, + "loss": 2.3901, + "step": 14228 + }, + { + "epoch": 1.148333467839561, + "grad_norm": 0.7472359538078308, + "learning_rate": 3.873424588212169e-05, + "loss": 2.4724, + "step": 14229 + }, + { + "epoch": 1.148414171576144, + "grad_norm": 0.6621725559234619, + "learning_rate": 3.872176949480517e-05, + "loss": 2.4523, + "step": 14230 + }, + { + "epoch": 1.148494875312727, + "grad_norm": 0.722658634185791, + "learning_rate": 3.8709294634702376e-05, + "loss": 2.4032, + "step": 14231 + }, + { + "epoch": 1.14857557904931, + "grad_norm": 0.7743202447891235, + "learning_rate": 3.869682130212413e-05, + "loss": 2.4373, + "step": 14232 + }, + { + "epoch": 1.148656282785893, + "grad_norm": 0.6906178593635559, + "learning_rate": 
3.868434949738136e-05, + "loss": 2.4765, + "step": 14233 + }, + { + "epoch": 1.148736986522476, + "grad_norm": 0.6708275675773621, + "learning_rate": 3.86718792207849e-05, + "loss": 2.4263, + "step": 14234 + }, + { + "epoch": 1.148817690259059, + "grad_norm": 0.6992776989936829, + "learning_rate": 3.8659410472645494e-05, + "loss": 2.378, + "step": 14235 + }, + { + "epoch": 1.148898393995642, + "grad_norm": 0.7229011058807373, + "learning_rate": 3.864694325327389e-05, + "loss": 2.4075, + "step": 14236 + }, + { + "epoch": 1.148979097732225, + "grad_norm": 0.6622509956359863, + "learning_rate": 3.863447756298091e-05, + "loss": 2.3954, + "step": 14237 + }, + { + "epoch": 1.149059801468808, + "grad_norm": 0.7233534455299377, + "learning_rate": 3.862201340207712e-05, + "loss": 2.4506, + "step": 14238 + }, + { + "epoch": 1.149140505205391, + "grad_norm": 0.716869056224823, + "learning_rate": 3.860955077087321e-05, + "loss": 2.4304, + "step": 14239 + }, + { + "epoch": 1.149221208941974, + "grad_norm": 0.6550257205963135, + "learning_rate": 3.8597089669679766e-05, + "loss": 2.4261, + "step": 14240 + }, + { + "epoch": 1.149301912678557, + "grad_norm": 0.6981741786003113, + "learning_rate": 3.858463009880738e-05, + "loss": 2.4115, + "step": 14241 + }, + { + "epoch": 1.14938261641514, + "grad_norm": 0.6792196035385132, + "learning_rate": 3.8572172058566534e-05, + "loss": 2.4195, + "step": 14242 + }, + { + "epoch": 1.149463320151723, + "grad_norm": 0.7278807163238525, + "learning_rate": 3.855971554926773e-05, + "loss": 2.418, + "step": 14243 + }, + { + "epoch": 1.1495440238883061, + "grad_norm": 0.6451076865196228, + "learning_rate": 3.8547260571221456e-05, + "loss": 2.4591, + "step": 14244 + }, + { + "epoch": 1.149624727624889, + "grad_norm": 0.7052451968193054, + "learning_rate": 3.853480712473805e-05, + "loss": 2.4023, + "step": 14245 + }, + { + "epoch": 1.149705431361472, + "grad_norm": 0.7016182541847229, + "learning_rate": 3.852235521012793e-05, + "loss": 2.4959, + 
"step": 14246 + }, + { + "epoch": 1.1497861350980552, + "grad_norm": 0.7287492156028748, + "learning_rate": 3.850990482770141e-05, + "loss": 2.3884, + "step": 14247 + }, + { + "epoch": 1.149866838834638, + "grad_norm": 0.6648508310317993, + "learning_rate": 3.84974559777688e-05, + "loss": 2.4632, + "step": 14248 + }, + { + "epoch": 1.1499475425712211, + "grad_norm": 0.7387828230857849, + "learning_rate": 3.848500866064036e-05, + "loss": 2.4053, + "step": 14249 + }, + { + "epoch": 1.150028246307804, + "grad_norm": 0.7230356931686401, + "learning_rate": 3.847256287662635e-05, + "loss": 2.5128, + "step": 14250 + }, + { + "epoch": 1.150108950044387, + "grad_norm": 0.7209547162055969, + "learning_rate": 3.846011862603686e-05, + "loss": 2.4626, + "step": 14251 + }, + { + "epoch": 1.1501896537809702, + "grad_norm": 0.7177916765213013, + "learning_rate": 3.844767590918209e-05, + "loss": 2.4469, + "step": 14252 + }, + { + "epoch": 1.150270357517553, + "grad_norm": 0.7850151658058167, + "learning_rate": 3.843523472637216e-05, + "loss": 2.4731, + "step": 14253 + }, + { + "epoch": 1.150351061254136, + "grad_norm": 0.7051519155502319, + "learning_rate": 3.8422795077917084e-05, + "loss": 2.3696, + "step": 14254 + }, + { + "epoch": 1.150431764990719, + "grad_norm": 0.7434025406837463, + "learning_rate": 3.841035696412692e-05, + "loss": 2.444, + "step": 14255 + }, + { + "epoch": 1.150512468727302, + "grad_norm": 0.7404719591140747, + "learning_rate": 3.839792038531166e-05, + "loss": 2.4415, + "step": 14256 + }, + { + "epoch": 1.1505931724638851, + "grad_norm": 0.6883764266967773, + "learning_rate": 3.838548534178125e-05, + "loss": 2.4887, + "step": 14257 + }, + { + "epoch": 1.150673876200468, + "grad_norm": 0.6697155237197876, + "learning_rate": 3.83730518338456e-05, + "loss": 2.3721, + "step": 14258 + }, + { + "epoch": 1.150754579937051, + "grad_norm": 0.68825763463974, + "learning_rate": 3.836061986181459e-05, + "loss": 2.4712, + "step": 14259 + }, + { + "epoch": 
1.1508352836736342, + "grad_norm": 0.6810611486434937, + "learning_rate": 3.8348189425998114e-05, + "loss": 2.3995, + "step": 14260 + }, + { + "epoch": 1.150915987410217, + "grad_norm": 0.6718329787254333, + "learning_rate": 3.8335760526705866e-05, + "loss": 2.4068, + "step": 14261 + }, + { + "epoch": 1.1509966911468001, + "grad_norm": 0.694618284702301, + "learning_rate": 3.832333316424767e-05, + "loss": 2.458, + "step": 14262 + }, + { + "epoch": 1.151077394883383, + "grad_norm": 0.6824250817298889, + "learning_rate": 3.8310907338933266e-05, + "loss": 2.4623, + "step": 14263 + }, + { + "epoch": 1.151158098619966, + "grad_norm": 0.6875178217887878, + "learning_rate": 3.8298483051072264e-05, + "loss": 2.4827, + "step": 14264 + }, + { + "epoch": 1.1512388023565492, + "grad_norm": 0.7868281602859497, + "learning_rate": 3.828606030097437e-05, + "loss": 2.4638, + "step": 14265 + }, + { + "epoch": 1.151319506093132, + "grad_norm": 0.7003639936447144, + "learning_rate": 3.8273639088949165e-05, + "loss": 2.4885, + "step": 14266 + }, + { + "epoch": 1.1514002098297151, + "grad_norm": 0.6965197920799255, + "learning_rate": 3.826121941530623e-05, + "loss": 2.3983, + "step": 14267 + }, + { + "epoch": 1.151480913566298, + "grad_norm": 0.7241101264953613, + "learning_rate": 3.824880128035509e-05, + "loss": 2.4598, + "step": 14268 + }, + { + "epoch": 1.151561617302881, + "grad_norm": 0.700764536857605, + "learning_rate": 3.823638468440528e-05, + "loss": 2.3627, + "step": 14269 + }, + { + "epoch": 1.1516423210394642, + "grad_norm": 0.6889846324920654, + "learning_rate": 3.822396962776619e-05, + "loss": 2.4442, + "step": 14270 + }, + { + "epoch": 1.151723024776047, + "grad_norm": 0.6660009026527405, + "learning_rate": 3.8211556110747245e-05, + "loss": 2.403, + "step": 14271 + }, + { + "epoch": 1.1518037285126301, + "grad_norm": 0.6537240743637085, + "learning_rate": 3.819914413365785e-05, + "loss": 2.4358, + "step": 14272 + }, + { + "epoch": 1.1518844322492132, + "grad_norm": 
0.6852741837501526, + "learning_rate": 3.818673369680735e-05, + "loss": 2.4272, + "step": 14273 + }, + { + "epoch": 1.151965135985796, + "grad_norm": 0.701874852180481, + "learning_rate": 3.817432480050501e-05, + "loss": 2.4419, + "step": 14274 + }, + { + "epoch": 1.1520458397223792, + "grad_norm": 0.7089500427246094, + "learning_rate": 3.816191744506011e-05, + "loss": 2.4537, + "step": 14275 + }, + { + "epoch": 1.1521265434589623, + "grad_norm": 0.698564887046814, + "learning_rate": 3.8149511630781866e-05, + "loss": 2.3991, + "step": 14276 + }, + { + "epoch": 1.1522072471955451, + "grad_norm": 0.6940335035324097, + "learning_rate": 3.813710735797947e-05, + "loss": 2.5022, + "step": 14277 + }, + { + "epoch": 1.1522879509321282, + "grad_norm": 0.6916826367378235, + "learning_rate": 3.812470462696208e-05, + "loss": 2.4449, + "step": 14278 + }, + { + "epoch": 1.152368654668711, + "grad_norm": 0.7115256190299988, + "learning_rate": 3.811230343803882e-05, + "loss": 2.4371, + "step": 14279 + }, + { + "epoch": 1.1524493584052942, + "grad_norm": 0.6857369542121887, + "learning_rate": 3.80999037915187e-05, + "loss": 2.4426, + "step": 14280 + }, + { + "epoch": 1.1525300621418773, + "grad_norm": 0.7605363130569458, + "learning_rate": 3.808750568771079e-05, + "loss": 2.4999, + "step": 14281 + }, + { + "epoch": 1.1526107658784601, + "grad_norm": 0.6604358553886414, + "learning_rate": 3.8075109126924115e-05, + "loss": 2.419, + "step": 14282 + }, + { + "epoch": 1.1526914696150432, + "grad_norm": 0.6945412755012512, + "learning_rate": 3.806271410946756e-05, + "loss": 2.4555, + "step": 14283 + }, + { + "epoch": 1.152772173351626, + "grad_norm": 0.7205908894538879, + "learning_rate": 3.805032063565007e-05, + "loss": 2.4745, + "step": 14284 + }, + { + "epoch": 1.1528528770882092, + "grad_norm": 0.7198025584220886, + "learning_rate": 3.8037928705780554e-05, + "loss": 2.4358, + "step": 14285 + }, + { + "epoch": 1.1529335808247922, + "grad_norm": 0.7231044769287109, + "learning_rate": 
3.802553832016781e-05, + "loss": 2.4713, + "step": 14286 + }, + { + "epoch": 1.1530142845613751, + "grad_norm": 0.6878815293312073, + "learning_rate": 3.80131494791206e-05, + "loss": 2.4479, + "step": 14287 + }, + { + "epoch": 1.1530949882979582, + "grad_norm": 0.6930533647537231, + "learning_rate": 3.800076218294779e-05, + "loss": 2.3912, + "step": 14288 + }, + { + "epoch": 1.1531756920345413, + "grad_norm": 0.703521192073822, + "learning_rate": 3.798837643195808e-05, + "loss": 2.451, + "step": 14289 + }, + { + "epoch": 1.1532563957711242, + "grad_norm": 0.7099746465682983, + "learning_rate": 3.79759922264601e-05, + "loss": 2.4957, + "step": 14290 + }, + { + "epoch": 1.1533370995077072, + "grad_norm": 0.7268218398094177, + "learning_rate": 3.7963609566762527e-05, + "loss": 2.4242, + "step": 14291 + }, + { + "epoch": 1.1534178032442903, + "grad_norm": 0.7465239763259888, + "learning_rate": 3.7951228453174004e-05, + "loss": 2.3867, + "step": 14292 + }, + { + "epoch": 1.1534985069808732, + "grad_norm": 0.704584002494812, + "learning_rate": 3.793884888600302e-05, + "loss": 2.5009, + "step": 14293 + }, + { + "epoch": 1.1535792107174563, + "grad_norm": 0.7057262063026428, + "learning_rate": 3.792647086555816e-05, + "loss": 2.4381, + "step": 14294 + }, + { + "epoch": 1.1536599144540391, + "grad_norm": 0.7045955061912537, + "learning_rate": 3.791409439214794e-05, + "loss": 2.4456, + "step": 14295 + }, + { + "epoch": 1.1537406181906222, + "grad_norm": 0.705476701259613, + "learning_rate": 3.790171946608074e-05, + "loss": 2.466, + "step": 14296 + }, + { + "epoch": 1.1538213219272053, + "grad_norm": 0.7128286957740784, + "learning_rate": 3.788934608766503e-05, + "loss": 2.4891, + "step": 14297 + }, + { + "epoch": 1.1539020256637882, + "grad_norm": 0.678144633769989, + "learning_rate": 3.787697425720918e-05, + "loss": 2.4453, + "step": 14298 + }, + { + "epoch": 1.1539827294003713, + "grad_norm": 0.754216730594635, + "learning_rate": 3.786460397502151e-05, + "loss": 2.4331, + 
"step": 14299 + }, + { + "epoch": 1.1540634331369541, + "grad_norm": 0.6881092190742493, + "learning_rate": 3.7852235241410325e-05, + "loss": 2.3692, + "step": 14300 + }, + { + "epoch": 1.1541441368735372, + "grad_norm": 0.7498507499694824, + "learning_rate": 3.783986805668395e-05, + "loss": 2.4556, + "step": 14301 + }, + { + "epoch": 1.1542248406101203, + "grad_norm": 0.6312216520309448, + "learning_rate": 3.7827502421150496e-05, + "loss": 2.4727, + "step": 14302 + }, + { + "epoch": 1.1543055443467032, + "grad_norm": 0.7156404256820679, + "learning_rate": 3.781513833511822e-05, + "loss": 2.4003, + "step": 14303 + }, + { + "epoch": 1.1543862480832863, + "grad_norm": 0.6589376926422119, + "learning_rate": 3.7802775798895226e-05, + "loss": 2.4461, + "step": 14304 + }, + { + "epoch": 1.1544669518198694, + "grad_norm": 0.7259865999221802, + "learning_rate": 3.77904148127897e-05, + "loss": 2.4021, + "step": 14305 + }, + { + "epoch": 1.1545476555564522, + "grad_norm": 0.7248456478118896, + "learning_rate": 3.777805537710961e-05, + "loss": 2.4784, + "step": 14306 + }, + { + "epoch": 1.1546283592930353, + "grad_norm": 0.7085593342781067, + "learning_rate": 3.7765697492163034e-05, + "loss": 2.4394, + "step": 14307 + }, + { + "epoch": 1.1547090630296182, + "grad_norm": 0.7394313216209412, + "learning_rate": 3.775334115825796e-05, + "loss": 2.5055, + "step": 14308 + }, + { + "epoch": 1.1547897667662013, + "grad_norm": 0.7231999039649963, + "learning_rate": 3.7740986375702336e-05, + "loss": 2.4551, + "step": 14309 + }, + { + "epoch": 1.1548704705027844, + "grad_norm": 0.6875953078269958, + "learning_rate": 3.7728633144804084e-05, + "loss": 2.4641, + "step": 14310 + }, + { + "epoch": 1.1549511742393672, + "grad_norm": 0.7477203607559204, + "learning_rate": 3.7716281465871094e-05, + "loss": 2.4929, + "step": 14311 + }, + { + "epoch": 1.1550318779759503, + "grad_norm": 0.6653971076011658, + "learning_rate": 3.770393133921115e-05, + "loss": 2.4819, + "step": 14312 + }, + { + 
"epoch": 1.1551125817125332, + "grad_norm": 0.7267318964004517, + "learning_rate": 3.769158276513209e-05, + "loss": 2.4568, + "step": 14313 + }, + { + "epoch": 1.1551932854491163, + "grad_norm": 0.6675654053688049, + "learning_rate": 3.76792357439417e-05, + "loss": 2.4789, + "step": 14314 + }, + { + "epoch": 1.1552739891856993, + "grad_norm": 0.6847487688064575, + "learning_rate": 3.7666890275947616e-05, + "loss": 2.4034, + "step": 14315 + }, + { + "epoch": 1.1553546929222822, + "grad_norm": 0.811553418636322, + "learning_rate": 3.765454636145758e-05, + "loss": 2.5051, + "step": 14316 + }, + { + "epoch": 1.1554353966588653, + "grad_norm": 0.690026581287384, + "learning_rate": 3.7642204000779204e-05, + "loss": 2.4477, + "step": 14317 + }, + { + "epoch": 1.1555161003954484, + "grad_norm": 0.695810079574585, + "learning_rate": 3.762986319422013e-05, + "loss": 2.4516, + "step": 14318 + }, + { + "epoch": 1.1555968041320313, + "grad_norm": 0.6869217753410339, + "learning_rate": 3.7617523942087886e-05, + "loss": 2.3802, + "step": 14319 + }, + { + "epoch": 1.1556775078686143, + "grad_norm": 0.7109078764915466, + "learning_rate": 3.7605186244690016e-05, + "loss": 2.4306, + "step": 14320 + }, + { + "epoch": 1.1557582116051974, + "grad_norm": 0.7385044693946838, + "learning_rate": 3.759285010233404e-05, + "loss": 2.4288, + "step": 14321 + }, + { + "epoch": 1.1558389153417803, + "grad_norm": 0.6775605082511902, + "learning_rate": 3.7580515515327355e-05, + "loss": 2.4155, + "step": 14322 + }, + { + "epoch": 1.1559196190783634, + "grad_norm": 0.7325694561004639, + "learning_rate": 3.7568182483977375e-05, + "loss": 2.5035, + "step": 14323 + }, + { + "epoch": 1.1560003228149462, + "grad_norm": 0.6896799206733704, + "learning_rate": 3.7555851008591526e-05, + "loss": 2.4739, + "step": 14324 + }, + { + "epoch": 1.1560810265515293, + "grad_norm": 0.7086506485939026, + "learning_rate": 3.7543521089477065e-05, + "loss": 2.4815, + "step": 14325 + }, + { + "epoch": 1.1561617302881124, + 
"grad_norm": 0.6886687874794006, + "learning_rate": 3.753119272694132e-05, + "loss": 2.4261, + "step": 14326 + }, + { + "epoch": 1.1562424340246953, + "grad_norm": 0.675136148929596, + "learning_rate": 3.751886592129155e-05, + "loss": 2.3946, + "step": 14327 + }, + { + "epoch": 1.1563231377612784, + "grad_norm": 0.706729531288147, + "learning_rate": 3.7506540672834964e-05, + "loss": 2.4199, + "step": 14328 + }, + { + "epoch": 1.1564038414978612, + "grad_norm": 0.6790904998779297, + "learning_rate": 3.749421698187875e-05, + "loss": 2.4419, + "step": 14329 + }, + { + "epoch": 1.1564845452344443, + "grad_norm": 0.6688171029090881, + "learning_rate": 3.748189484873007e-05, + "loss": 2.4516, + "step": 14330 + }, + { + "epoch": 1.1565652489710274, + "grad_norm": 0.6782420873641968, + "learning_rate": 3.746957427369596e-05, + "loss": 2.4586, + "step": 14331 + }, + { + "epoch": 1.1566459527076103, + "grad_norm": 0.7633399367332458, + "learning_rate": 3.7457255257083514e-05, + "loss": 2.3776, + "step": 14332 + }, + { + "epoch": 1.1567266564441934, + "grad_norm": 0.680000364780426, + "learning_rate": 3.744493779919976e-05, + "loss": 2.4978, + "step": 14333 + }, + { + "epoch": 1.1568073601807765, + "grad_norm": 0.6993350386619568, + "learning_rate": 3.743262190035171e-05, + "loss": 2.3974, + "step": 14334 + }, + { + "epoch": 1.1568880639173593, + "grad_norm": 0.7316375374794006, + "learning_rate": 3.7420307560846234e-05, + "loss": 2.4423, + "step": 14335 + }, + { + "epoch": 1.1569687676539424, + "grad_norm": 0.7384842038154602, + "learning_rate": 3.7407994780990285e-05, + "loss": 2.4604, + "step": 14336 + }, + { + "epoch": 1.1570494713905255, + "grad_norm": 0.6980708837509155, + "learning_rate": 3.739568356109072e-05, + "loss": 2.4408, + "step": 14337 + }, + { + "epoch": 1.1571301751271084, + "grad_norm": 0.6510182619094849, + "learning_rate": 3.738337390145438e-05, + "loss": 2.4076, + "step": 14338 + }, + { + "epoch": 1.1572108788636915, + "grad_norm": 0.7458614706993103, + 
"learning_rate": 3.737106580238804e-05, + "loss": 2.4976, + "step": 14339 + }, + { + "epoch": 1.1572915826002743, + "grad_norm": 0.6663469672203064, + "learning_rate": 3.735875926419849e-05, + "loss": 2.4414, + "step": 14340 + }, + { + "epoch": 1.1573722863368574, + "grad_norm": 0.6611858606338501, + "learning_rate": 3.7346454287192355e-05, + "loss": 2.3783, + "step": 14341 + }, + { + "epoch": 1.1574529900734405, + "grad_norm": 0.6605291366577148, + "learning_rate": 3.7334150871676364e-05, + "loss": 2.4291, + "step": 14342 + }, + { + "epoch": 1.1575336938100234, + "grad_norm": 0.6879985928535461, + "learning_rate": 3.7321849017957186e-05, + "loss": 2.4229, + "step": 14343 + }, + { + "epoch": 1.1576143975466064, + "grad_norm": 0.7466493844985962, + "learning_rate": 3.7309548726341334e-05, + "loss": 2.4278, + "step": 14344 + }, + { + "epoch": 1.1576951012831893, + "grad_norm": 0.7476457357406616, + "learning_rate": 3.72972499971354e-05, + "loss": 2.4944, + "step": 14345 + }, + { + "epoch": 1.1577758050197724, + "grad_norm": 0.6339364647865295, + "learning_rate": 3.728495283064594e-05, + "loss": 2.3753, + "step": 14346 + }, + { + "epoch": 1.1578565087563555, + "grad_norm": 0.6885230541229248, + "learning_rate": 3.7272657227179355e-05, + "loss": 2.4519, + "step": 14347 + }, + { + "epoch": 1.1579372124929384, + "grad_norm": 0.7561741471290588, + "learning_rate": 3.7260363187042126e-05, + "loss": 2.4808, + "step": 14348 + }, + { + "epoch": 1.1580179162295214, + "grad_norm": 0.8007705211639404, + "learning_rate": 3.724807071054062e-05, + "loss": 2.4649, + "step": 14349 + }, + { + "epoch": 1.1580986199661045, + "grad_norm": 0.6920937895774841, + "learning_rate": 3.72357797979813e-05, + "loss": 2.4145, + "step": 14350 + }, + { + "epoch": 1.1581793237026874, + "grad_norm": 0.7310675978660583, + "learning_rate": 3.7223490449670364e-05, + "loss": 2.4475, + "step": 14351 + }, + { + "epoch": 1.1582600274392705, + "grad_norm": 0.6600463390350342, + "learning_rate": 
3.7211202665914155e-05, + "loss": 2.3938, + "step": 14352 + }, + { + "epoch": 1.1583407311758536, + "grad_norm": 0.690258800983429, + "learning_rate": 3.719891644701894e-05, + "loss": 2.3944, + "step": 14353 + }, + { + "epoch": 1.1584214349124364, + "grad_norm": 0.7075135111808777, + "learning_rate": 3.718663179329085e-05, + "loss": 2.3931, + "step": 14354 + }, + { + "epoch": 1.1585021386490195, + "grad_norm": 0.7416332960128784, + "learning_rate": 3.71743487050361e-05, + "loss": 2.4566, + "step": 14355 + }, + { + "epoch": 1.1585828423856024, + "grad_norm": 0.7459710836410522, + "learning_rate": 3.7162067182560846e-05, + "loss": 2.4232, + "step": 14356 + }, + { + "epoch": 1.1586635461221855, + "grad_norm": 0.7265400886535645, + "learning_rate": 3.71497872261711e-05, + "loss": 2.4798, + "step": 14357 + }, + { + "epoch": 1.1587442498587683, + "grad_norm": 0.7142636775970459, + "learning_rate": 3.713750883617294e-05, + "loss": 2.4576, + "step": 14358 + }, + { + "epoch": 1.1588249535953514, + "grad_norm": 0.7279871702194214, + "learning_rate": 3.712523201287239e-05, + "loss": 2.439, + "step": 14359 + }, + { + "epoch": 1.1589056573319345, + "grad_norm": 0.7151274681091309, + "learning_rate": 3.7112956756575414e-05, + "loss": 2.4684, + "step": 14360 + }, + { + "epoch": 1.1589863610685174, + "grad_norm": 0.7142657041549683, + "learning_rate": 3.7100683067587946e-05, + "loss": 2.4582, + "step": 14361 + }, + { + "epoch": 1.1590670648051005, + "grad_norm": 0.7716035842895508, + "learning_rate": 3.7088410946215914e-05, + "loss": 2.5038, + "step": 14362 + }, + { + "epoch": 1.1591477685416836, + "grad_norm": 0.7232338190078735, + "learning_rate": 3.707614039276509e-05, + "loss": 2.4558, + "step": 14363 + }, + { + "epoch": 1.1592284722782664, + "grad_norm": 0.7388719916343689, + "learning_rate": 3.706387140754134e-05, + "loss": 2.4535, + "step": 14364 + }, + { + "epoch": 1.1593091760148495, + "grad_norm": 0.7022652626037598, + "learning_rate": 3.7051603990850425e-05, + "loss": 
2.4479, + "step": 14365 + }, + { + "epoch": 1.1593898797514326, + "grad_norm": 0.7861798405647278, + "learning_rate": 3.703933814299813e-05, + "loss": 2.4219, + "step": 14366 + }, + { + "epoch": 1.1594705834880155, + "grad_norm": 0.6928723454475403, + "learning_rate": 3.7027073864290074e-05, + "loss": 2.4401, + "step": 14367 + }, + { + "epoch": 1.1595512872245985, + "grad_norm": 0.6312821507453918, + "learning_rate": 3.701481115503194e-05, + "loss": 2.3975, + "step": 14368 + }, + { + "epoch": 1.1596319909611814, + "grad_norm": 0.7008257508277893, + "learning_rate": 3.700255001552937e-05, + "loss": 2.4988, + "step": 14369 + }, + { + "epoch": 1.1597126946977645, + "grad_norm": 0.6664693355560303, + "learning_rate": 3.699029044608792e-05, + "loss": 2.4123, + "step": 14370 + }, + { + "epoch": 1.1597933984343476, + "grad_norm": 0.6613842844963074, + "learning_rate": 3.6978032447013145e-05, + "loss": 2.4802, + "step": 14371 + }, + { + "epoch": 1.1598741021709305, + "grad_norm": 0.707788348197937, + "learning_rate": 3.696577601861057e-05, + "loss": 2.4432, + "step": 14372 + }, + { + "epoch": 1.1599548059075135, + "grad_norm": 0.6547604203224182, + "learning_rate": 3.695352116118561e-05, + "loss": 2.412, + "step": 14373 + }, + { + "epoch": 1.1600355096440964, + "grad_norm": 0.7238109707832336, + "learning_rate": 3.69412678750437e-05, + "loss": 2.4858, + "step": 14374 + }, + { + "epoch": 1.1601162133806795, + "grad_norm": 0.8156580328941345, + "learning_rate": 3.692901616049026e-05, + "loss": 2.4063, + "step": 14375 + }, + { + "epoch": 1.1601969171172626, + "grad_norm": 0.7035481333732605, + "learning_rate": 3.6916766017830585e-05, + "loss": 2.4586, + "step": 14376 + }, + { + "epoch": 1.1602776208538454, + "grad_norm": 0.7523401379585266, + "learning_rate": 3.690451744736999e-05, + "loss": 2.4262, + "step": 14377 + }, + { + "epoch": 1.1603583245904285, + "grad_norm": 0.6740732192993164, + "learning_rate": 3.689227044941376e-05, + "loss": 2.5215, + "step": 14378 + }, + { + 
"epoch": 1.1604390283270116, + "grad_norm": 0.6502695083618164, + "learning_rate": 3.6880025024267115e-05, + "loss": 2.4292, + "step": 14379 + }, + { + "epoch": 1.1605197320635945, + "grad_norm": 0.7000409364700317, + "learning_rate": 3.686778117223524e-05, + "loss": 2.4323, + "step": 14380 + }, + { + "epoch": 1.1606004358001776, + "grad_norm": 0.7415478229522705, + "learning_rate": 3.68555388936233e-05, + "loss": 2.4515, + "step": 14381 + }, + { + "epoch": 1.1606811395367607, + "grad_norm": 0.6890547871589661, + "learning_rate": 3.684329818873641e-05, + "loss": 2.4115, + "step": 14382 + }, + { + "epoch": 1.1607618432733435, + "grad_norm": 0.8238685727119446, + "learning_rate": 3.68310590578796e-05, + "loss": 2.4666, + "step": 14383 + }, + { + "epoch": 1.1608425470099266, + "grad_norm": 0.8098889589309692, + "learning_rate": 3.681882150135791e-05, + "loss": 2.4667, + "step": 14384 + }, + { + "epoch": 1.1609232507465095, + "grad_norm": 0.6932713985443115, + "learning_rate": 3.680658551947639e-05, + "loss": 2.4574, + "step": 14385 + }, + { + "epoch": 1.1610039544830926, + "grad_norm": 0.7062943577766418, + "learning_rate": 3.6794351112539915e-05, + "loss": 2.4408, + "step": 14386 + }, + { + "epoch": 1.1610846582196757, + "grad_norm": 0.7859255075454712, + "learning_rate": 3.678211828085343e-05, + "loss": 2.3946, + "step": 14387 + }, + { + "epoch": 1.1611653619562585, + "grad_norm": 0.674609899520874, + "learning_rate": 3.676988702472181e-05, + "loss": 2.4456, + "step": 14388 + }, + { + "epoch": 1.1612460656928416, + "grad_norm": 0.7068402171134949, + "learning_rate": 3.675765734444989e-05, + "loss": 2.4393, + "step": 14389 + }, + { + "epoch": 1.1613267694294245, + "grad_norm": 0.7276526689529419, + "learning_rate": 3.674542924034246e-05, + "loss": 2.456, + "step": 14390 + }, + { + "epoch": 1.1614074731660076, + "grad_norm": 0.7670585513114929, + "learning_rate": 3.673320271270433e-05, + "loss": 2.3774, + "step": 14391 + }, + { + "epoch": 1.1614881769025907, + 
"grad_norm": 0.702173113822937, + "learning_rate": 3.672097776184013e-05, + "loss": 2.3974, + "step": 14392 + }, + { + "epoch": 1.1615688806391735, + "grad_norm": 0.6922066807746887, + "learning_rate": 3.670875438805457e-05, + "loss": 2.4035, + "step": 14393 + }, + { + "epoch": 1.1616495843757566, + "grad_norm": 0.6675707697868347, + "learning_rate": 3.6696532591652335e-05, + "loss": 2.4369, + "step": 14394 + }, + { + "epoch": 1.1617302881123397, + "grad_norm": 0.6939712762832642, + "learning_rate": 3.668431237293796e-05, + "loss": 2.4265, + "step": 14395 + }, + { + "epoch": 1.1618109918489226, + "grad_norm": 0.719510018825531, + "learning_rate": 3.667209373221602e-05, + "loss": 2.4686, + "step": 14396 + }, + { + "epoch": 1.1618916955855056, + "grad_norm": 0.7167489528656006, + "learning_rate": 3.665987666979104e-05, + "loss": 2.5077, + "step": 14397 + }, + { + "epoch": 1.1619723993220887, + "grad_norm": 0.6539514064788818, + "learning_rate": 3.664766118596754e-05, + "loss": 2.4476, + "step": 14398 + }, + { + "epoch": 1.1620531030586716, + "grad_norm": 0.6926440596580505, + "learning_rate": 3.6635447281049876e-05, + "loss": 2.4336, + "step": 14399 + }, + { + "epoch": 1.1621338067952547, + "grad_norm": 0.7124993205070496, + "learning_rate": 3.662323495534252e-05, + "loss": 2.3938, + "step": 14400 + }, + { + "epoch": 1.1622145105318376, + "grad_norm": 0.7073954939842224, + "learning_rate": 3.661102420914986e-05, + "loss": 2.4232, + "step": 14401 + }, + { + "epoch": 1.1622952142684206, + "grad_norm": 0.7491076588630676, + "learning_rate": 3.659881504277613e-05, + "loss": 2.5047, + "step": 14402 + }, + { + "epoch": 1.1623759180050037, + "grad_norm": 0.6698675155639648, + "learning_rate": 3.658660745652568e-05, + "loss": 2.4164, + "step": 14403 + }, + { + "epoch": 1.1624566217415866, + "grad_norm": 0.6576815843582153, + "learning_rate": 3.657440145070276e-05, + "loss": 2.4368, + "step": 14404 + }, + { + "epoch": 1.1625373254781697, + "grad_norm": 0.8236953020095825, + 
"learning_rate": 3.6562197025611524e-05, + "loss": 2.5041, + "step": 14405 + }, + { + "epoch": 1.1626180292147525, + "grad_norm": 0.7391532063484192, + "learning_rate": 3.6549994181556157e-05, + "loss": 2.4556, + "step": 14406 + }, + { + "epoch": 1.1626987329513356, + "grad_norm": 0.6529936790466309, + "learning_rate": 3.653779291884084e-05, + "loss": 2.4559, + "step": 14407 + }, + { + "epoch": 1.1627794366879187, + "grad_norm": 0.7101796269416809, + "learning_rate": 3.652559323776957e-05, + "loss": 2.3937, + "step": 14408 + }, + { + "epoch": 1.1628601404245016, + "grad_norm": 0.6890308260917664, + "learning_rate": 3.651339513864645e-05, + "loss": 2.4694, + "step": 14409 + }, + { + "epoch": 1.1629408441610847, + "grad_norm": 0.6919918060302734, + "learning_rate": 3.650119862177548e-05, + "loss": 2.4793, + "step": 14410 + }, + { + "epoch": 1.1630215478976678, + "grad_norm": 0.6553575992584229, + "learning_rate": 3.6489003687460624e-05, + "loss": 2.454, + "step": 14411 + }, + { + "epoch": 1.1631022516342506, + "grad_norm": 0.7095460891723633, + "learning_rate": 3.6476810336005804e-05, + "loss": 2.4672, + "step": 14412 + }, + { + "epoch": 1.1631829553708337, + "grad_norm": 0.738301694393158, + "learning_rate": 3.6464618567714935e-05, + "loss": 2.4369, + "step": 14413 + }, + { + "epoch": 1.1632636591074166, + "grad_norm": 0.7574542760848999, + "learning_rate": 3.645242838289189e-05, + "loss": 2.4981, + "step": 14414 + }, + { + "epoch": 1.1633443628439997, + "grad_norm": 0.6780585646629333, + "learning_rate": 3.64402397818404e-05, + "loss": 2.4811, + "step": 14415 + }, + { + "epoch": 1.1634250665805828, + "grad_norm": 0.7050060629844666, + "learning_rate": 3.6428052764864287e-05, + "loss": 2.4607, + "step": 14416 + }, + { + "epoch": 1.1635057703171656, + "grad_norm": 0.6946923136711121, + "learning_rate": 3.6415867332267316e-05, + "loss": 2.4482, + "step": 14417 + }, + { + "epoch": 1.1635864740537487, + "grad_norm": 0.7202015519142151, + "learning_rate": 
3.64036834843531e-05, + "loss": 2.4764, + "step": 14418 + }, + { + "epoch": 1.1636671777903316, + "grad_norm": 0.7845996618270874, + "learning_rate": 3.639150122142534e-05, + "loss": 2.4926, + "step": 14419 + }, + { + "epoch": 1.1637478815269147, + "grad_norm": 0.6924630403518677, + "learning_rate": 3.6379320543787645e-05, + "loss": 2.4664, + "step": 14420 + }, + { + "epoch": 1.1638285852634978, + "grad_norm": 0.7225920557975769, + "learning_rate": 3.636714145174358e-05, + "loss": 2.4638, + "step": 14421 + }, + { + "epoch": 1.1639092890000806, + "grad_norm": 0.6587103605270386, + "learning_rate": 3.63549639455967e-05, + "loss": 2.3629, + "step": 14422 + }, + { + "epoch": 1.1639899927366637, + "grad_norm": 0.7537658214569092, + "learning_rate": 3.634278802565051e-05, + "loss": 2.4971, + "step": 14423 + }, + { + "epoch": 1.1640706964732468, + "grad_norm": 0.6881381273269653, + "learning_rate": 3.633061369220841e-05, + "loss": 2.3737, + "step": 14424 + }, + { + "epoch": 1.1641514002098297, + "grad_norm": 0.693779468536377, + "learning_rate": 3.6318440945573864e-05, + "loss": 2.4346, + "step": 14425 + }, + { + "epoch": 1.1642321039464127, + "grad_norm": 0.777563750743866, + "learning_rate": 3.6306269786050265e-05, + "loss": 2.4288, + "step": 14426 + }, + { + "epoch": 1.1643128076829958, + "grad_norm": 0.6786738634109497, + "learning_rate": 3.629410021394087e-05, + "loss": 2.4094, + "step": 14427 + }, + { + "epoch": 1.1643935114195787, + "grad_norm": 0.7478442788124084, + "learning_rate": 3.628193222954904e-05, + "loss": 2.4163, + "step": 14428 + }, + { + "epoch": 1.1644742151561618, + "grad_norm": 0.6530766487121582, + "learning_rate": 3.626976583317803e-05, + "loss": 2.4328, + "step": 14429 + }, + { + "epoch": 1.1645549188927447, + "grad_norm": 0.6665371060371399, + "learning_rate": 3.6257601025131026e-05, + "loss": 2.4006, + "step": 14430 + }, + { + "epoch": 1.1646356226293277, + "grad_norm": 0.7184741497039795, + "learning_rate": 3.624543780571125e-05, + "loss": 
2.462, + "step": 14431 + }, + { + "epoch": 1.1647163263659108, + "grad_norm": 0.7039462327957153, + "learning_rate": 3.6233276175221794e-05, + "loss": 2.4321, + "step": 14432 + }, + { + "epoch": 1.1647970301024937, + "grad_norm": 0.7039144039154053, + "learning_rate": 3.622111613396584e-05, + "loss": 2.4399, + "step": 14433 + }, + { + "epoch": 1.1648777338390768, + "grad_norm": 0.6690253615379333, + "learning_rate": 3.620895768224635e-05, + "loss": 2.3976, + "step": 14434 + }, + { + "epoch": 1.1649584375756596, + "grad_norm": 0.7048032879829407, + "learning_rate": 3.6196800820366384e-05, + "loss": 2.4848, + "step": 14435 + }, + { + "epoch": 1.1650391413122427, + "grad_norm": 0.668971836566925, + "learning_rate": 3.618464554862896e-05, + "loss": 2.4614, + "step": 14436 + }, + { + "epoch": 1.1651198450488258, + "grad_norm": 0.704858660697937, + "learning_rate": 3.617249186733695e-05, + "loss": 2.3962, + "step": 14437 + }, + { + "epoch": 1.1652005487854087, + "grad_norm": 0.692435085773468, + "learning_rate": 3.6160339776793296e-05, + "loss": 2.4059, + "step": 14438 + }, + { + "epoch": 1.1652812525219918, + "grad_norm": 0.6774182319641113, + "learning_rate": 3.614818927730085e-05, + "loss": 2.4975, + "step": 14439 + }, + { + "epoch": 1.1653619562585749, + "grad_norm": 0.6507411003112793, + "learning_rate": 3.613604036916243e-05, + "loss": 2.5029, + "step": 14440 + }, + { + "epoch": 1.1654426599951577, + "grad_norm": 0.7223206162452698, + "learning_rate": 3.612389305268084e-05, + "loss": 2.4599, + "step": 14441 + }, + { + "epoch": 1.1655233637317408, + "grad_norm": 0.6523364186286926, + "learning_rate": 3.611174732815883e-05, + "loss": 2.4521, + "step": 14442 + }, + { + "epoch": 1.165604067468324, + "grad_norm": 0.6668452024459839, + "learning_rate": 3.6099603195899046e-05, + "loss": 2.4082, + "step": 14443 + }, + { + "epoch": 1.1656847712049068, + "grad_norm": 0.6878299117088318, + "learning_rate": 3.60874606562042e-05, + "loss": 2.4144, + "step": 14444 + }, + { + 
"epoch": 1.1657654749414899, + "grad_norm": 0.6662277579307556, + "learning_rate": 3.6075319709376895e-05, + "loss": 2.438, + "step": 14445 + }, + { + "epoch": 1.1658461786780727, + "grad_norm": 0.721422553062439, + "learning_rate": 3.606318035571976e-05, + "loss": 2.4414, + "step": 14446 + }, + { + "epoch": 1.1659268824146558, + "grad_norm": 0.6739782691001892, + "learning_rate": 3.6051042595535264e-05, + "loss": 2.4093, + "step": 14447 + }, + { + "epoch": 1.166007586151239, + "grad_norm": 0.6890884637832642, + "learning_rate": 3.603890642912596e-05, + "loss": 2.4385, + "step": 14448 + }, + { + "epoch": 1.1660882898878218, + "grad_norm": 0.6503998637199402, + "learning_rate": 3.602677185679433e-05, + "loss": 2.4498, + "step": 14449 + }, + { + "epoch": 1.1661689936244048, + "grad_norm": 0.6748046875, + "learning_rate": 3.601463887884271e-05, + "loss": 2.3739, + "step": 14450 + }, + { + "epoch": 1.1662496973609877, + "grad_norm": 0.6843422651290894, + "learning_rate": 3.600250749557358e-05, + "loss": 2.4323, + "step": 14451 + }, + { + "epoch": 1.1663304010975708, + "grad_norm": 0.7061208486557007, + "learning_rate": 3.599037770728929e-05, + "loss": 2.4611, + "step": 14452 + }, + { + "epoch": 1.166411104834154, + "grad_norm": 0.6614537239074707, + "learning_rate": 3.597824951429208e-05, + "loss": 2.4656, + "step": 14453 + }, + { + "epoch": 1.1664918085707368, + "grad_norm": 0.6620328426361084, + "learning_rate": 3.596612291688424e-05, + "loss": 2.415, + "step": 14454 + }, + { + "epoch": 1.1665725123073198, + "grad_norm": 0.6936565041542053, + "learning_rate": 3.595399791536804e-05, + "loss": 2.4655, + "step": 14455 + }, + { + "epoch": 1.166653216043903, + "grad_norm": 0.6766063570976257, + "learning_rate": 3.594187451004559e-05, + "loss": 2.4628, + "step": 14456 + }, + { + "epoch": 1.1667339197804858, + "grad_norm": 0.6588734984397888, + "learning_rate": 3.592975270121909e-05, + "loss": 2.4503, + "step": 14457 + }, + { + "epoch": 1.1668146235170689, + "grad_norm": 
0.7290894985198975, + "learning_rate": 3.591763248919062e-05, + "loss": 2.5075, + "step": 14458 + }, + { + "epoch": 1.1668953272536517, + "grad_norm": 0.6952784657478333, + "learning_rate": 3.590551387426231e-05, + "loss": 2.4258, + "step": 14459 + }, + { + "epoch": 1.1669760309902348, + "grad_norm": 0.6737042665481567, + "learning_rate": 3.5893396856736096e-05, + "loss": 2.4459, + "step": 14460 + }, + { + "epoch": 1.167056734726818, + "grad_norm": 0.6616976857185364, + "learning_rate": 3.588128143691397e-05, + "loss": 2.4726, + "step": 14461 + }, + { + "epoch": 1.1671374384634008, + "grad_norm": 0.7017171382904053, + "learning_rate": 3.5869167615098e-05, + "loss": 2.375, + "step": 14462 + }, + { + "epoch": 1.1672181421999839, + "grad_norm": 0.7153809666633606, + "learning_rate": 3.585705539158997e-05, + "loss": 2.4271, + "step": 14463 + }, + { + "epoch": 1.1672988459365667, + "grad_norm": 0.749196469783783, + "learning_rate": 3.584494476669179e-05, + "loss": 2.4713, + "step": 14464 + }, + { + "epoch": 1.1673795496731498, + "grad_norm": 0.6593676209449768, + "learning_rate": 3.583283574070533e-05, + "loss": 2.4276, + "step": 14465 + }, + { + "epoch": 1.167460253409733, + "grad_norm": 0.6949084401130676, + "learning_rate": 3.5820728313932295e-05, + "loss": 2.4128, + "step": 14466 + }, + { + "epoch": 1.1675409571463158, + "grad_norm": 0.6795482039451599, + "learning_rate": 3.5808622486674484e-05, + "loss": 2.485, + "step": 14467 + }, + { + "epoch": 1.1676216608828989, + "grad_norm": 0.6763483881950378, + "learning_rate": 3.5796518259233625e-05, + "loss": 2.4063, + "step": 14468 + }, + { + "epoch": 1.167702364619482, + "grad_norm": 0.665687620639801, + "learning_rate": 3.578441563191133e-05, + "loss": 2.437, + "step": 14469 + }, + { + "epoch": 1.1677830683560648, + "grad_norm": 0.6338435411453247, + "learning_rate": 3.577231460500926e-05, + "loss": 2.3747, + "step": 14470 + }, + { + "epoch": 1.167863772092648, + "grad_norm": 0.7031865119934082, + "learning_rate": 
3.5760215178829e-05, + "loss": 2.3952, + "step": 14471 + }, + { + "epoch": 1.167944475829231, + "grad_norm": 0.7544599771499634, + "learning_rate": 3.5748117353672106e-05, + "loss": 2.3941, + "step": 14472 + }, + { + "epoch": 1.1680251795658139, + "grad_norm": 0.7271532416343689, + "learning_rate": 3.5736021129840083e-05, + "loss": 2.4371, + "step": 14473 + }, + { + "epoch": 1.168105883302397, + "grad_norm": 0.709048867225647, + "learning_rate": 3.572392650763441e-05, + "loss": 2.482, + "step": 14474 + }, + { + "epoch": 1.1681865870389798, + "grad_norm": 0.6894589066505432, + "learning_rate": 3.571183348735653e-05, + "loss": 2.4347, + "step": 14475 + }, + { + "epoch": 1.168267290775563, + "grad_norm": 0.6680620908737183, + "learning_rate": 3.5699742069307774e-05, + "loss": 2.3995, + "step": 14476 + }, + { + "epoch": 1.168347994512146, + "grad_norm": 0.701669454574585, + "learning_rate": 3.568765225378954e-05, + "loss": 2.4045, + "step": 14477 + }, + { + "epoch": 1.1684286982487289, + "grad_norm": 0.7102392911911011, + "learning_rate": 3.567556404110315e-05, + "loss": 2.4695, + "step": 14478 + }, + { + "epoch": 1.168509401985312, + "grad_norm": 0.6820430755615234, + "learning_rate": 3.566347743154982e-05, + "loss": 2.4155, + "step": 14479 + }, + { + "epoch": 1.1685901057218948, + "grad_norm": 0.6611022353172302, + "learning_rate": 3.565139242543081e-05, + "loss": 2.3992, + "step": 14480 + }, + { + "epoch": 1.168670809458478, + "grad_norm": 0.6844382882118225, + "learning_rate": 3.5639309023047306e-05, + "loss": 2.4345, + "step": 14481 + }, + { + "epoch": 1.168751513195061, + "grad_norm": 0.7557988166809082, + "learning_rate": 3.5627227224700464e-05, + "loss": 2.4454, + "step": 14482 + }, + { + "epoch": 1.1688322169316439, + "grad_norm": 0.6652555465698242, + "learning_rate": 3.5615147030691384e-05, + "loss": 2.3749, + "step": 14483 + }, + { + "epoch": 1.168912920668227, + "grad_norm": 0.6912989020347595, + "learning_rate": 3.56030684413212e-05, + "loss": 2.4737, + 
"step": 14484 + }, + { + "epoch": 1.16899362440481, + "grad_norm": 0.735103964805603, + "learning_rate": 3.559099145689083e-05, + "loss": 2.4098, + "step": 14485 + }, + { + "epoch": 1.169074328141393, + "grad_norm": 0.6873028874397278, + "learning_rate": 3.557891607770133e-05, + "loss": 2.4247, + "step": 14486 + }, + { + "epoch": 1.169155031877976, + "grad_norm": 0.7364680171012878, + "learning_rate": 3.556684230405367e-05, + "loss": 2.4314, + "step": 14487 + }, + { + "epoch": 1.169235735614559, + "grad_norm": 0.679122269153595, + "learning_rate": 3.55547701362487e-05, + "loss": 2.4196, + "step": 14488 + }, + { + "epoch": 1.169316439351142, + "grad_norm": 0.6783872246742249, + "learning_rate": 3.554269957458731e-05, + "loss": 2.4212, + "step": 14489 + }, + { + "epoch": 1.169397143087725, + "grad_norm": 0.7434942126274109, + "learning_rate": 3.553063061937034e-05, + "loss": 2.4139, + "step": 14490 + }, + { + "epoch": 1.1694778468243079, + "grad_norm": 0.6799852252006531, + "learning_rate": 3.55185632708986e-05, + "loss": 2.4252, + "step": 14491 + }, + { + "epoch": 1.169558550560891, + "grad_norm": 0.7040107250213623, + "learning_rate": 3.5506497529472795e-05, + "loss": 2.3937, + "step": 14492 + }, + { + "epoch": 1.169639254297474, + "grad_norm": 0.7350315451622009, + "learning_rate": 3.549443339539368e-05, + "loss": 2.4063, + "step": 14493 + }, + { + "epoch": 1.169719958034057, + "grad_norm": 0.694521963596344, + "learning_rate": 3.548237086896192e-05, + "loss": 2.4715, + "step": 14494 + }, + { + "epoch": 1.16980066177064, + "grad_norm": 0.6648221015930176, + "learning_rate": 3.5470309950478096e-05, + "loss": 2.4365, + "step": 14495 + }, + { + "epoch": 1.1698813655072229, + "grad_norm": 0.688024640083313, + "learning_rate": 3.545825064024284e-05, + "loss": 2.449, + "step": 14496 + }, + { + "epoch": 1.169962069243806, + "grad_norm": 0.6743311882019043, + "learning_rate": 3.544619293855672e-05, + "loss": 2.4283, + "step": 14497 + }, + { + "epoch": 1.170042772980389, + 
"grad_norm": 0.669119119644165, + "learning_rate": 3.543413684572019e-05, + "loss": 2.4363, + "step": 14498 + }, + { + "epoch": 1.170123476716972, + "grad_norm": 0.6998667120933533, + "learning_rate": 3.5422082362033745e-05, + "loss": 2.425, + "step": 14499 + }, + { + "epoch": 1.170204180453555, + "grad_norm": 0.7681630253791809, + "learning_rate": 3.5410029487797845e-05, + "loss": 2.4382, + "step": 14500 + }, + { + "epoch": 1.170284884190138, + "grad_norm": 0.6925049424171448, + "learning_rate": 3.539797822331279e-05, + "loss": 2.4261, + "step": 14501 + }, + { + "epoch": 1.170365587926721, + "grad_norm": 0.7145542502403259, + "learning_rate": 3.538592856887901e-05, + "loss": 2.4681, + "step": 14502 + }, + { + "epoch": 1.170446291663304, + "grad_norm": 0.6441611647605896, + "learning_rate": 3.537388052479684e-05, + "loss": 2.4187, + "step": 14503 + }, + { + "epoch": 1.1705269953998871, + "grad_norm": 0.6622560620307922, + "learning_rate": 3.5361834091366466e-05, + "loss": 2.4615, + "step": 14504 + }, + { + "epoch": 1.17060769913647, + "grad_norm": 0.6987677812576294, + "learning_rate": 3.5349789268888144e-05, + "loss": 2.413, + "step": 14505 + }, + { + "epoch": 1.170688402873053, + "grad_norm": 0.668358325958252, + "learning_rate": 3.533774605766207e-05, + "loss": 2.5146, + "step": 14506 + }, + { + "epoch": 1.170769106609636, + "grad_norm": 0.7514958381652832, + "learning_rate": 3.532570445798844e-05, + "loss": 2.4474, + "step": 14507 + }, + { + "epoch": 1.170849810346219, + "grad_norm": 0.6454465389251709, + "learning_rate": 3.5313664470167276e-05, + "loss": 2.3911, + "step": 14508 + }, + { + "epoch": 1.170930514082802, + "grad_norm": 0.6653602719306946, + "learning_rate": 3.5301626094498674e-05, + "loss": 2.4223, + "step": 14509 + }, + { + "epoch": 1.171011217819385, + "grad_norm": 0.6782815456390381, + "learning_rate": 3.5289589331282715e-05, + "loss": 2.457, + "step": 14510 + }, + { + "epoch": 1.171091921555968, + "grad_norm": 0.720973014831543, + 
"learning_rate": 3.527755418081932e-05, + "loss": 2.4541, + "step": 14511 + }, + { + "epoch": 1.171172625292551, + "grad_norm": 0.6300156712532043, + "learning_rate": 3.526552064340841e-05, + "loss": 2.4451, + "step": 14512 + }, + { + "epoch": 1.171253329029134, + "grad_norm": 0.7660964727401733, + "learning_rate": 3.5253488719350026e-05, + "loss": 2.5031, + "step": 14513 + }, + { + "epoch": 1.1713340327657171, + "grad_norm": 0.6931602358818054, + "learning_rate": 3.5241458408943905e-05, + "loss": 2.4249, + "step": 14514 + }, + { + "epoch": 1.1714147365023, + "grad_norm": 0.6863045692443848, + "learning_rate": 3.522942971248993e-05, + "loss": 2.4429, + "step": 14515 + }, + { + "epoch": 1.171495440238883, + "grad_norm": 0.6993531584739685, + "learning_rate": 3.521740263028791e-05, + "loss": 2.3864, + "step": 14516 + }, + { + "epoch": 1.1715761439754662, + "grad_norm": 0.807991087436676, + "learning_rate": 3.520537716263753e-05, + "loss": 2.459, + "step": 14517 + }, + { + "epoch": 1.171656847712049, + "grad_norm": 0.6722908020019531, + "learning_rate": 3.519335330983852e-05, + "loss": 2.4426, + "step": 14518 + }, + { + "epoch": 1.1717375514486321, + "grad_norm": 0.6934377551078796, + "learning_rate": 3.5181331072190585e-05, + "loss": 2.4326, + "step": 14519 + }, + { + "epoch": 1.171818255185215, + "grad_norm": 0.6532938480377197, + "learning_rate": 3.516931044999329e-05, + "loss": 2.3778, + "step": 14520 + }, + { + "epoch": 1.171898958921798, + "grad_norm": 0.6779183745384216, + "learning_rate": 3.5157291443546247e-05, + "loss": 2.4089, + "step": 14521 + }, + { + "epoch": 1.1719796626583812, + "grad_norm": 0.687005877494812, + "learning_rate": 3.514527405314899e-05, + "loss": 2.4669, + "step": 14522 + }, + { + "epoch": 1.172060366394964, + "grad_norm": 0.6804830431938171, + "learning_rate": 3.5133258279101045e-05, + "loss": 2.4789, + "step": 14523 + }, + { + "epoch": 1.1721410701315471, + "grad_norm": 0.8345538973808289, + "learning_rate": 3.512124412170187e-05, + 
"loss": 2.4506, + "step": 14524 + }, + { + "epoch": 1.17222177386813, + "grad_norm": 0.6571901440620422, + "learning_rate": 3.510923158125088e-05, + "loss": 2.4911, + "step": 14525 + }, + { + "epoch": 1.172302477604713, + "grad_norm": 0.6607047915458679, + "learning_rate": 3.5097220658047504e-05, + "loss": 2.4882, + "step": 14526 + }, + { + "epoch": 1.1723831813412962, + "grad_norm": 0.6883669495582581, + "learning_rate": 3.508521135239101e-05, + "loss": 2.4083, + "step": 14527 + }, + { + "epoch": 1.172463885077879, + "grad_norm": 0.6792941689491272, + "learning_rate": 3.5073203664580746e-05, + "loss": 2.368, + "step": 14528 + }, + { + "epoch": 1.172544588814462, + "grad_norm": 0.6675198674201965, + "learning_rate": 3.506119759491598e-05, + "loss": 2.4193, + "step": 14529 + }, + { + "epoch": 1.1726252925510452, + "grad_norm": 0.7267464399337769, + "learning_rate": 3.504919314369591e-05, + "loss": 2.3906, + "step": 14530 + }, + { + "epoch": 1.172705996287628, + "grad_norm": 0.6927710175514221, + "learning_rate": 3.503719031121973e-05, + "loss": 2.4082, + "step": 14531 + }, + { + "epoch": 1.1727867000242111, + "grad_norm": 0.7231000065803528, + "learning_rate": 3.502518909778656e-05, + "loss": 2.4845, + "step": 14532 + }, + { + "epoch": 1.1728674037607942, + "grad_norm": 0.7087520360946655, + "learning_rate": 3.5013189503695544e-05, + "loss": 2.4622, + "step": 14533 + }, + { + "epoch": 1.172948107497377, + "grad_norm": 0.6669846177101135, + "learning_rate": 3.5001191529245716e-05, + "loss": 2.4151, + "step": 14534 + }, + { + "epoch": 1.1730288112339602, + "grad_norm": 0.7338447570800781, + "learning_rate": 3.4989195174736134e-05, + "loss": 2.4274, + "step": 14535 + }, + { + "epoch": 1.173109514970543, + "grad_norm": 0.7032054662704468, + "learning_rate": 3.497720044046572e-05, + "loss": 2.4066, + "step": 14536 + }, + { + "epoch": 1.1731902187071261, + "grad_norm": 0.6571083068847656, + "learning_rate": 3.496520732673344e-05, + "loss": 2.4581, + "step": 14537 + }, + { 
+ "epoch": 1.1732709224437092, + "grad_norm": 0.6618444919586182, + "learning_rate": 3.495321583383819e-05, + "loss": 2.3675, + "step": 14538 + }, + { + "epoch": 1.173351626180292, + "grad_norm": 0.6597652435302734, + "learning_rate": 3.4941225962078885e-05, + "loss": 2.416, + "step": 14539 + }, + { + "epoch": 1.1734323299168752, + "grad_norm": 0.682634711265564, + "learning_rate": 3.492923771175425e-05, + "loss": 2.5081, + "step": 14540 + }, + { + "epoch": 1.173513033653458, + "grad_norm": 0.7046132683753967, + "learning_rate": 3.49172510831631e-05, + "loss": 2.4439, + "step": 14541 + }, + { + "epoch": 1.1735937373900411, + "grad_norm": 0.6734833717346191, + "learning_rate": 3.4905266076604196e-05, + "loss": 2.4348, + "step": 14542 + }, + { + "epoch": 1.1736744411266242, + "grad_norm": 0.6624744534492493, + "learning_rate": 3.4893282692376214e-05, + "loss": 2.4364, + "step": 14543 + }, + { + "epoch": 1.173755144863207, + "grad_norm": 0.8425754308700562, + "learning_rate": 3.4881300930777815e-05, + "loss": 2.4803, + "step": 14544 + }, + { + "epoch": 1.1738358485997902, + "grad_norm": 0.6438888311386108, + "learning_rate": 3.486932079210766e-05, + "loss": 2.3973, + "step": 14545 + }, + { + "epoch": 1.1739165523363733, + "grad_norm": 0.650399923324585, + "learning_rate": 3.485734227666424e-05, + "loss": 2.4183, + "step": 14546 + }, + { + "epoch": 1.1739972560729561, + "grad_norm": 0.6857002973556519, + "learning_rate": 3.4845365384746144e-05, + "loss": 2.4061, + "step": 14547 + }, + { + "epoch": 1.1740779598095392, + "grad_norm": 0.6680994629859924, + "learning_rate": 3.483339011665189e-05, + "loss": 2.421, + "step": 14548 + }, + { + "epoch": 1.1741586635461223, + "grad_norm": 0.6440950632095337, + "learning_rate": 3.482141647267987e-05, + "loss": 2.3914, + "step": 14549 + }, + { + "epoch": 1.1742393672827052, + "grad_norm": 0.7329740524291992, + "learning_rate": 3.480944445312853e-05, + "loss": 2.4805, + "step": 14550 + }, + { + "epoch": 1.1743200710192883, + 
"grad_norm": 0.6848189234733582, + "learning_rate": 3.4797474058296245e-05, + "loss": 2.3611, + "step": 14551 + }, + { + "epoch": 1.1744007747558711, + "grad_norm": 0.6994072794914246, + "learning_rate": 3.478550528848134e-05, + "loss": 2.5106, + "step": 14552 + }, + { + "epoch": 1.1744814784924542, + "grad_norm": 0.6826444268226624, + "learning_rate": 3.477353814398212e-05, + "loss": 2.467, + "step": 14553 + }, + { + "epoch": 1.1745621822290373, + "grad_norm": 0.6658408045768738, + "learning_rate": 3.476157262509683e-05, + "loss": 2.423, + "step": 14554 + }, + { + "epoch": 1.1746428859656202, + "grad_norm": 0.6963697075843811, + "learning_rate": 3.474960873212372e-05, + "loss": 2.457, + "step": 14555 + }, + { + "epoch": 1.1747235897022033, + "grad_norm": 0.7574479579925537, + "learning_rate": 3.4737646465360894e-05, + "loss": 2.4292, + "step": 14556 + }, + { + "epoch": 1.1748042934387861, + "grad_norm": 0.7494931817054749, + "learning_rate": 3.472568582510652e-05, + "loss": 2.4395, + "step": 14557 + }, + { + "epoch": 1.1748849971753692, + "grad_norm": 0.7062687873840332, + "learning_rate": 3.471372681165872e-05, + "loss": 2.4561, + "step": 14558 + }, + { + "epoch": 1.1749657009119523, + "grad_norm": 0.6875349879264832, + "learning_rate": 3.4701769425315465e-05, + "loss": 2.4728, + "step": 14559 + }, + { + "epoch": 1.1750464046485352, + "grad_norm": 0.7009960412979126, + "learning_rate": 3.46898136663748e-05, + "loss": 2.5364, + "step": 14560 + }, + { + "epoch": 1.1751271083851182, + "grad_norm": 0.673791766166687, + "learning_rate": 3.467785953513475e-05, + "loss": 2.4611, + "step": 14561 + }, + { + "epoch": 1.1752078121217013, + "grad_norm": 0.7166882753372192, + "learning_rate": 3.4665907031893164e-05, + "loss": 2.4451, + "step": 14562 + }, + { + "epoch": 1.1752885158582842, + "grad_norm": 0.6868429780006409, + "learning_rate": 3.465395615694791e-05, + "loss": 2.4282, + "step": 14563 + }, + { + "epoch": 1.1753692195948673, + "grad_norm": 0.7212893962860107, + 
"learning_rate": 3.464200691059697e-05, + "loss": 2.4239, + "step": 14564 + }, + { + "epoch": 1.1754499233314502, + "grad_norm": 0.7213432192802429, + "learning_rate": 3.463005929313802e-05, + "loss": 2.4872, + "step": 14565 + }, + { + "epoch": 1.1755306270680332, + "grad_norm": 0.6805179119110107, + "learning_rate": 3.461811330486887e-05, + "loss": 2.4192, + "step": 14566 + }, + { + "epoch": 1.1756113308046163, + "grad_norm": 0.6746333241462708, + "learning_rate": 3.460616894608725e-05, + "loss": 2.3911, + "step": 14567 + }, + { + "epoch": 1.1756920345411992, + "grad_norm": 0.7388630509376526, + "learning_rate": 3.459422621709088e-05, + "loss": 2.4758, + "step": 14568 + }, + { + "epoch": 1.1757727382777823, + "grad_norm": 0.7730274200439453, + "learning_rate": 3.458228511817731e-05, + "loss": 2.4159, + "step": 14569 + }, + { + "epoch": 1.1758534420143651, + "grad_norm": 0.721075177192688, + "learning_rate": 3.457034564964422e-05, + "loss": 2.4673, + "step": 14570 + }, + { + "epoch": 1.1759341457509482, + "grad_norm": 0.6647645235061646, + "learning_rate": 3.4558407811789184e-05, + "loss": 2.395, + "step": 14571 + }, + { + "epoch": 1.1760148494875313, + "grad_norm": 0.7155466675758362, + "learning_rate": 3.454647160490965e-05, + "loss": 2.503, + "step": 14572 + }, + { + "epoch": 1.1760955532241142, + "grad_norm": 0.6789268851280212, + "learning_rate": 3.453453702930314e-05, + "loss": 2.401, + "step": 14573 + }, + { + "epoch": 1.1761762569606973, + "grad_norm": 0.7488093376159668, + "learning_rate": 3.4522604085267105e-05, + "loss": 2.4434, + "step": 14574 + }, + { + "epoch": 1.1762569606972804, + "grad_norm": 0.7954889535903931, + "learning_rate": 3.451067277309893e-05, + "loss": 2.5302, + "step": 14575 + }, + { + "epoch": 1.1763376644338632, + "grad_norm": 0.7008484601974487, + "learning_rate": 3.4498743093095975e-05, + "loss": 2.3935, + "step": 14576 + }, + { + "epoch": 1.1764183681704463, + "grad_norm": 0.6725437641143799, + "learning_rate": 
3.448681504555561e-05, + "loss": 2.399, + "step": 14577 + }, + { + "epoch": 1.1764990719070294, + "grad_norm": 0.6778931617736816, + "learning_rate": 3.4474888630775026e-05, + "loss": 2.4178, + "step": 14578 + }, + { + "epoch": 1.1765797756436123, + "grad_norm": 0.7043762803077698, + "learning_rate": 3.44629638490515e-05, + "loss": 2.5581, + "step": 14579 + }, + { + "epoch": 1.1766604793801954, + "grad_norm": 0.6848085522651672, + "learning_rate": 3.445104070068227e-05, + "loss": 2.436, + "step": 14580 + }, + { + "epoch": 1.1767411831167782, + "grad_norm": 0.7504082322120667, + "learning_rate": 3.443911918596441e-05, + "loss": 2.4138, + "step": 14581 + }, + { + "epoch": 1.1768218868533613, + "grad_norm": 0.7441161870956421, + "learning_rate": 3.442719930519508e-05, + "loss": 2.4333, + "step": 14582 + }, + { + "epoch": 1.1769025905899444, + "grad_norm": 0.663894772529602, + "learning_rate": 3.4415281058671354e-05, + "loss": 2.4672, + "step": 14583 + }, + { + "epoch": 1.1769832943265273, + "grad_norm": 0.6814345121383667, + "learning_rate": 3.440336444669027e-05, + "loss": 2.4196, + "step": 14584 + }, + { + "epoch": 1.1770639980631104, + "grad_norm": 0.7566598057746887, + "learning_rate": 3.439144946954881e-05, + "loss": 2.4586, + "step": 14585 + }, + { + "epoch": 1.1771447017996932, + "grad_norm": 0.7324996590614319, + "learning_rate": 3.4379536127543934e-05, + "loss": 2.4286, + "step": 14586 + }, + { + "epoch": 1.1772254055362763, + "grad_norm": 0.6632608771324158, + "learning_rate": 3.436762442097259e-05, + "loss": 2.4713, + "step": 14587 + }, + { + "epoch": 1.1773061092728594, + "grad_norm": 0.7246156930923462, + "learning_rate": 3.4355714350131564e-05, + "loss": 2.4374, + "step": 14588 + }, + { + "epoch": 1.1773868130094423, + "grad_norm": 0.7096351981163025, + "learning_rate": 3.4343805915317737e-05, + "loss": 2.4649, + "step": 14589 + }, + { + "epoch": 1.1774675167460253, + "grad_norm": 0.7090620398521423, + "learning_rate": 3.433189911682793e-05, + "loss": 
2.396, + "step": 14590 + }, + { + "epoch": 1.1775482204826084, + "grad_norm": 0.7782440185546875, + "learning_rate": 3.431999395495882e-05, + "loss": 2.4506, + "step": 14591 + }, + { + "epoch": 1.1776289242191913, + "grad_norm": 0.6933457851409912, + "learning_rate": 3.4308090430007155e-05, + "loss": 2.3985, + "step": 14592 + }, + { + "epoch": 1.1777096279557744, + "grad_norm": 0.6935414671897888, + "learning_rate": 3.429618854226959e-05, + "loss": 2.4372, + "step": 14593 + }, + { + "epoch": 1.1777903316923575, + "grad_norm": 0.6971156597137451, + "learning_rate": 3.428428829204276e-05, + "loss": 2.4837, + "step": 14594 + }, + { + "epoch": 1.1778710354289403, + "grad_norm": 0.6460022926330566, + "learning_rate": 3.427238967962325e-05, + "loss": 2.3742, + "step": 14595 + }, + { + "epoch": 1.1779517391655234, + "grad_norm": 0.6941941976547241, + "learning_rate": 3.426049270530763e-05, + "loss": 2.4706, + "step": 14596 + }, + { + "epoch": 1.1780324429021063, + "grad_norm": 0.7062166333198547, + "learning_rate": 3.424859736939236e-05, + "loss": 2.3893, + "step": 14597 + }, + { + "epoch": 1.1781131466386894, + "grad_norm": 0.6586433053016663, + "learning_rate": 3.42367036721739e-05, + "loss": 2.4385, + "step": 14598 + }, + { + "epoch": 1.1781938503752725, + "grad_norm": 0.6781242489814758, + "learning_rate": 3.422481161394869e-05, + "loss": 2.3876, + "step": 14599 + }, + { + "epoch": 1.1782745541118553, + "grad_norm": 0.710127592086792, + "learning_rate": 3.421292119501316e-05, + "loss": 2.4067, + "step": 14600 + }, + { + "epoch": 1.1783552578484384, + "grad_norm": 0.6856096982955933, + "learning_rate": 3.420103241566357e-05, + "loss": 2.4855, + "step": 14601 + }, + { + "epoch": 1.1784359615850213, + "grad_norm": 0.7173380851745605, + "learning_rate": 3.4189145276196245e-05, + "loss": 2.4871, + "step": 14602 + }, + { + "epoch": 1.1785166653216044, + "grad_norm": 0.6895382404327393, + "learning_rate": 3.417725977690745e-05, + "loss": 2.4066, + "step": 14603 + }, + { + 
"epoch": 1.1785973690581875, + "grad_norm": 0.7417690753936768, + "learning_rate": 3.416537591809341e-05, + "loss": 2.3779, + "step": 14604 + }, + { + "epoch": 1.1786780727947703, + "grad_norm": 0.7258411049842834, + "learning_rate": 3.4153493700050286e-05, + "loss": 2.4334, + "step": 14605 + }, + { + "epoch": 1.1787587765313534, + "grad_norm": 0.65704345703125, + "learning_rate": 3.414161312307427e-05, + "loss": 2.4531, + "step": 14606 + }, + { + "epoch": 1.1788394802679365, + "grad_norm": 0.6937118172645569, + "learning_rate": 3.4129734187461374e-05, + "loss": 2.4562, + "step": 14607 + }, + { + "epoch": 1.1789201840045194, + "grad_norm": 0.7331998348236084, + "learning_rate": 3.411785689350768e-05, + "loss": 2.4418, + "step": 14608 + }, + { + "epoch": 1.1790008877411025, + "grad_norm": 0.666582465171814, + "learning_rate": 3.410598124150924e-05, + "loss": 2.4154, + "step": 14609 + }, + { + "epoch": 1.1790815914776853, + "grad_norm": 0.6684321165084839, + "learning_rate": 3.409410723176197e-05, + "loss": 2.4155, + "step": 14610 + }, + { + "epoch": 1.1791622952142684, + "grad_norm": 0.6413382291793823, + "learning_rate": 3.408223486456184e-05, + "loss": 2.3924, + "step": 14611 + }, + { + "epoch": 1.1792429989508515, + "grad_norm": 0.7081305384635925, + "learning_rate": 3.407036414020475e-05, + "loss": 2.3811, + "step": 14612 + }, + { + "epoch": 1.1793237026874344, + "grad_norm": 0.7550063133239746, + "learning_rate": 3.405849505898645e-05, + "loss": 2.4425, + "step": 14613 + }, + { + "epoch": 1.1794044064240174, + "grad_norm": 0.677200198173523, + "learning_rate": 3.404662762120288e-05, + "loss": 2.5182, + "step": 14614 + }, + { + "epoch": 1.1794851101606003, + "grad_norm": 0.6829770803451538, + "learning_rate": 3.4034761827149745e-05, + "loss": 2.5068, + "step": 14615 + }, + { + "epoch": 1.1795658138971834, + "grad_norm": 0.7069409489631653, + "learning_rate": 3.4022897677122815e-05, + "loss": 2.4449, + "step": 14616 + }, + { + "epoch": 1.1796465176337665, + 
"grad_norm": 0.6604448556900024, + "learning_rate": 3.4011035171417696e-05, + "loss": 2.3996, + "step": 14617 + }, + { + "epoch": 1.1797272213703494, + "grad_norm": 0.6577324271202087, + "learning_rate": 3.3999174310330084e-05, + "loss": 2.4723, + "step": 14618 + }, + { + "epoch": 1.1798079251069324, + "grad_norm": 0.8159187436103821, + "learning_rate": 3.398731509415561e-05, + "loss": 2.4655, + "step": 14619 + }, + { + "epoch": 1.1798886288435155, + "grad_norm": 0.7170652747154236, + "learning_rate": 3.397545752318977e-05, + "loss": 2.5095, + "step": 14620 + }, + { + "epoch": 1.1799693325800984, + "grad_norm": 0.6865009665489197, + "learning_rate": 3.396360159772812e-05, + "loss": 2.4358, + "step": 14621 + }, + { + "epoch": 1.1800500363166815, + "grad_norm": 0.6485020518302917, + "learning_rate": 3.3951747318066175e-05, + "loss": 2.4576, + "step": 14622 + }, + { + "epoch": 1.1801307400532646, + "grad_norm": 0.6626582145690918, + "learning_rate": 3.39398946844993e-05, + "loss": 2.4824, + "step": 14623 + }, + { + "epoch": 1.1802114437898474, + "grad_norm": 0.718588650226593, + "learning_rate": 3.392804369732293e-05, + "loss": 2.4211, + "step": 14624 + }, + { + "epoch": 1.1802921475264305, + "grad_norm": 0.7449582815170288, + "learning_rate": 3.391619435683243e-05, + "loss": 2.444, + "step": 14625 + }, + { + "epoch": 1.1803728512630134, + "grad_norm": 0.6988492012023926, + "learning_rate": 3.3904346663323115e-05, + "loss": 2.4262, + "step": 14626 + }, + { + "epoch": 1.1804535549995965, + "grad_norm": 0.6779490113258362, + "learning_rate": 3.389250061709025e-05, + "loss": 2.4751, + "step": 14627 + }, + { + "epoch": 1.1805342587361796, + "grad_norm": 0.6883673667907715, + "learning_rate": 3.388065621842912e-05, + "loss": 2.4995, + "step": 14628 + }, + { + "epoch": 1.1806149624727624, + "grad_norm": 0.7112017273902893, + "learning_rate": 3.386881346763483e-05, + "loss": 2.4181, + "step": 14629 + }, + { + "epoch": 1.1806956662093455, + "grad_norm": 0.6960459351539612, + 
"learning_rate": 3.385697236500258e-05, + "loss": 2.4888, + "step": 14630 + }, + { + "epoch": 1.1807763699459284, + "grad_norm": 0.6874156594276428, + "learning_rate": 3.3845132910827484e-05, + "loss": 2.4175, + "step": 14631 + }, + { + "epoch": 1.1808570736825115, + "grad_norm": 0.7075642347335815, + "learning_rate": 3.383329510540463e-05, + "loss": 2.4315, + "step": 14632 + }, + { + "epoch": 1.1809377774190946, + "grad_norm": 0.674907386302948, + "learning_rate": 3.3821458949028995e-05, + "loss": 2.4216, + "step": 14633 + }, + { + "epoch": 1.1810184811556774, + "grad_norm": 0.7008463740348816, + "learning_rate": 3.380962444199559e-05, + "loss": 2.4114, + "step": 14634 + }, + { + "epoch": 1.1810991848922605, + "grad_norm": 0.6784217953681946, + "learning_rate": 3.379779158459937e-05, + "loss": 2.3663, + "step": 14635 + }, + { + "epoch": 1.1811798886288436, + "grad_norm": 0.7174829244613647, + "learning_rate": 3.378596037713525e-05, + "loss": 2.4582, + "step": 14636 + }, + { + "epoch": 1.1812605923654265, + "grad_norm": 0.7106035947799683, + "learning_rate": 3.3774130819898065e-05, + "loss": 2.5095, + "step": 14637 + }, + { + "epoch": 1.1813412961020096, + "grad_norm": 0.809107780456543, + "learning_rate": 3.3762302913182696e-05, + "loss": 2.4942, + "step": 14638 + }, + { + "epoch": 1.1814219998385926, + "grad_norm": 0.7150272727012634, + "learning_rate": 3.375047665728386e-05, + "loss": 2.378, + "step": 14639 + }, + { + "epoch": 1.1815027035751755, + "grad_norm": 0.7016271352767944, + "learning_rate": 3.373865205249632e-05, + "loss": 2.4393, + "step": 14640 + }, + { + "epoch": 1.1815834073117586, + "grad_norm": 0.6387282013893127, + "learning_rate": 3.372682909911481e-05, + "loss": 2.4399, + "step": 14641 + }, + { + "epoch": 1.1816641110483415, + "grad_norm": 0.834181010723114, + "learning_rate": 3.371500779743393e-05, + "loss": 2.4312, + "step": 14642 + }, + { + "epoch": 1.1817448147849245, + "grad_norm": 0.6690472960472107, + "learning_rate": 
3.370318814774832e-05, + "loss": 2.407, + "step": 14643 + }, + { + "epoch": 1.1818255185215076, + "grad_norm": 0.6594302654266357, + "learning_rate": 3.369137015035256e-05, + "loss": 2.4275, + "step": 14644 + }, + { + "epoch": 1.1819062222580905, + "grad_norm": 0.7284699082374573, + "learning_rate": 3.3679553805541194e-05, + "loss": 2.3981, + "step": 14645 + }, + { + "epoch": 1.1819869259946736, + "grad_norm": 0.7109572291374207, + "learning_rate": 3.366773911360871e-05, + "loss": 2.4345, + "step": 14646 + }, + { + "epoch": 1.1820676297312565, + "grad_norm": 0.6874241828918457, + "learning_rate": 3.3655926074849566e-05, + "loss": 2.4488, + "step": 14647 + }, + { + "epoch": 1.1821483334678395, + "grad_norm": 0.6698973178863525, + "learning_rate": 3.364411468955819e-05, + "loss": 2.42, + "step": 14648 + }, + { + "epoch": 1.1822290372044226, + "grad_norm": 0.7816089391708374, + "learning_rate": 3.3632304958028915e-05, + "loss": 2.4638, + "step": 14649 + }, + { + "epoch": 1.1823097409410055, + "grad_norm": 0.6718220710754395, + "learning_rate": 3.3620496880556075e-05, + "loss": 2.413, + "step": 14650 + }, + { + "epoch": 1.1823904446775886, + "grad_norm": 0.753463089466095, + "learning_rate": 3.360869045743401e-05, + "loss": 2.3772, + "step": 14651 + }, + { + "epoch": 1.1824711484141717, + "grad_norm": 0.7031456828117371, + "learning_rate": 3.359688568895689e-05, + "loss": 2.4198, + "step": 14652 + }, + { + "epoch": 1.1825518521507545, + "grad_norm": 0.7857323288917542, + "learning_rate": 3.358508257541897e-05, + "loss": 2.4223, + "step": 14653 + }, + { + "epoch": 1.1826325558873376, + "grad_norm": 0.7779297828674316, + "learning_rate": 3.357328111711439e-05, + "loss": 2.5266, + "step": 14654 + }, + { + "epoch": 1.1827132596239207, + "grad_norm": 0.7382386326789856, + "learning_rate": 3.356148131433728e-05, + "loss": 2.4673, + "step": 14655 + }, + { + "epoch": 1.1827939633605036, + "grad_norm": 0.7868054509162903, + "learning_rate": 3.354968316738174e-05, + "loss": 
2.4285, + "step": 14656 + }, + { + "epoch": 1.1828746670970867, + "grad_norm": 0.7007591724395752, + "learning_rate": 3.353788667654183e-05, + "loss": 2.4054, + "step": 14657 + }, + { + "epoch": 1.1829553708336695, + "grad_norm": 0.6627741456031799, + "learning_rate": 3.352609184211148e-05, + "loss": 2.4224, + "step": 14658 + }, + { + "epoch": 1.1830360745702526, + "grad_norm": 0.6865360736846924, + "learning_rate": 3.351429866438469e-05, + "loss": 2.4084, + "step": 14659 + }, + { + "epoch": 1.1831167783068357, + "grad_norm": 0.7572095990180969, + "learning_rate": 3.3502507143655404e-05, + "loss": 2.4339, + "step": 14660 + }, + { + "epoch": 1.1831974820434186, + "grad_norm": 0.6907969117164612, + "learning_rate": 3.349071728021743e-05, + "loss": 2.4578, + "step": 14661 + }, + { + "epoch": 1.1832781857800017, + "grad_norm": 0.6618743538856506, + "learning_rate": 3.347892907436465e-05, + "loss": 2.4131, + "step": 14662 + }, + { + "epoch": 1.1833588895165845, + "grad_norm": 0.777159571647644, + "learning_rate": 3.346714252639084e-05, + "loss": 2.419, + "step": 14663 + }, + { + "epoch": 1.1834395932531676, + "grad_norm": 0.666344165802002, + "learning_rate": 3.345535763658975e-05, + "loss": 2.4155, + "step": 14664 + }, + { + "epoch": 1.1835202969897507, + "grad_norm": 0.708848774433136, + "learning_rate": 3.3443574405255095e-05, + "loss": 2.4794, + "step": 14665 + }, + { + "epoch": 1.1836010007263336, + "grad_norm": 0.7247438430786133, + "learning_rate": 3.3431792832680555e-05, + "loss": 2.4445, + "step": 14666 + }, + { + "epoch": 1.1836817044629167, + "grad_norm": 0.6870034337043762, + "learning_rate": 3.342001291915978e-05, + "loss": 2.4309, + "step": 14667 + }, + { + "epoch": 1.1837624081994997, + "grad_norm": 0.7088049650192261, + "learning_rate": 3.340823466498629e-05, + "loss": 2.4456, + "step": 14668 + }, + { + "epoch": 1.1838431119360826, + "grad_norm": 0.695148229598999, + "learning_rate": 3.3396458070453676e-05, + "loss": 2.4018, + "step": 14669 + }, + { + 
"epoch": 1.1839238156726657, + "grad_norm": 0.7947117686271667, + "learning_rate": 3.3384683135855444e-05, + "loss": 2.4099, + "step": 14670 + }, + { + "epoch": 1.1840045194092486, + "grad_norm": 0.7268195748329163, + "learning_rate": 3.337290986148502e-05, + "loss": 2.3955, + "step": 14671 + }, + { + "epoch": 1.1840852231458316, + "grad_norm": 0.6932024955749512, + "learning_rate": 3.336113824763585e-05, + "loss": 2.4046, + "step": 14672 + }, + { + "epoch": 1.1841659268824147, + "grad_norm": 0.7408114671707153, + "learning_rate": 3.3349368294601334e-05, + "loss": 2.4186, + "step": 14673 + }, + { + "epoch": 1.1842466306189976, + "grad_norm": 0.6678428053855896, + "learning_rate": 3.3337600002674765e-05, + "loss": 2.4324, + "step": 14674 + }, + { + "epoch": 1.1843273343555807, + "grad_norm": 0.7221381664276123, + "learning_rate": 3.3325833372149416e-05, + "loss": 2.4474, + "step": 14675 + }, + { + "epoch": 1.1844080380921636, + "grad_norm": 0.6971224546432495, + "learning_rate": 3.3314068403318654e-05, + "loss": 2.4197, + "step": 14676 + }, + { + "epoch": 1.1844887418287466, + "grad_norm": 0.65053391456604, + "learning_rate": 3.3302305096475604e-05, + "loss": 2.4169, + "step": 14677 + }, + { + "epoch": 1.1845694455653297, + "grad_norm": 0.7231155633926392, + "learning_rate": 3.3290543451913457e-05, + "loss": 2.4222, + "step": 14678 + }, + { + "epoch": 1.1846501493019126, + "grad_norm": 0.6458824872970581, + "learning_rate": 3.3278783469925345e-05, + "loss": 2.422, + "step": 14679 + }, + { + "epoch": 1.1847308530384957, + "grad_norm": 0.6783488392829895, + "learning_rate": 3.32670251508044e-05, + "loss": 2.4231, + "step": 14680 + }, + { + "epoch": 1.1848115567750788, + "grad_norm": 0.6742293238639832, + "learning_rate": 3.3255268494843586e-05, + "loss": 2.409, + "step": 14681 + }, + { + "epoch": 1.1848922605116616, + "grad_norm": 0.7455186247825623, + "learning_rate": 3.3243513502335956e-05, + "loss": 2.4121, + "step": 14682 + }, + { + "epoch": 1.1849729642482447, + 
"grad_norm": 0.7042234539985657, + "learning_rate": 3.323176017357451e-05, + "loss": 2.4574, + "step": 14683 + }, + { + "epoch": 1.1850536679848278, + "grad_norm": 0.7897992134094238, + "learning_rate": 3.3220008508852094e-05, + "loss": 2.4796, + "step": 14684 + }, + { + "epoch": 1.1851343717214107, + "grad_norm": 0.6894058585166931, + "learning_rate": 3.3208258508461644e-05, + "loss": 2.4125, + "step": 14685 + }, + { + "epoch": 1.1852150754579938, + "grad_norm": 0.7574072480201721, + "learning_rate": 3.319651017269597e-05, + "loss": 2.4714, + "step": 14686 + }, + { + "epoch": 1.1852957791945766, + "grad_norm": 0.7457531094551086, + "learning_rate": 3.3184763501847905e-05, + "loss": 2.4793, + "step": 14687 + }, + { + "epoch": 1.1853764829311597, + "grad_norm": 0.6819709539413452, + "learning_rate": 3.317301849621018e-05, + "loss": 2.4563, + "step": 14688 + }, + { + "epoch": 1.1854571866677428, + "grad_norm": 0.6998026371002197, + "learning_rate": 3.316127515607555e-05, + "loss": 2.4548, + "step": 14689 + }, + { + "epoch": 1.1855378904043257, + "grad_norm": 0.7148768305778503, + "learning_rate": 3.314953348173664e-05, + "loss": 2.4897, + "step": 14690 + }, + { + "epoch": 1.1856185941409088, + "grad_norm": 0.6581987738609314, + "learning_rate": 3.31377934734861e-05, + "loss": 2.4683, + "step": 14691 + }, + { + "epoch": 1.1856992978774916, + "grad_norm": 0.7493093609809875, + "learning_rate": 3.312605513161653e-05, + "loss": 2.4564, + "step": 14692 + }, + { + "epoch": 1.1857800016140747, + "grad_norm": 0.7095562219619751, + "learning_rate": 3.311431845642051e-05, + "loss": 2.4595, + "step": 14693 + }, + { + "epoch": 1.1858607053506578, + "grad_norm": 0.8045323491096497, + "learning_rate": 3.310258344819047e-05, + "loss": 2.5044, + "step": 14694 + }, + { + "epoch": 1.1859414090872407, + "grad_norm": 0.7381219267845154, + "learning_rate": 3.3090850107218943e-05, + "loss": 2.415, + "step": 14695 + }, + { + "epoch": 1.1860221128238237, + "grad_norm": 0.6859883069992065, + 
"learning_rate": 3.307911843379832e-05, + "loss": 2.4314, + "step": 14696 + }, + { + "epoch": 1.1861028165604068, + "grad_norm": 0.7084196209907532, + "learning_rate": 3.306738842822099e-05, + "loss": 2.4404, + "step": 14697 + }, + { + "epoch": 1.1861835202969897, + "grad_norm": 0.6964806318283081, + "learning_rate": 3.305566009077932e-05, + "loss": 2.4391, + "step": 14698 + }, + { + "epoch": 1.1862642240335728, + "grad_norm": 0.7272049188613892, + "learning_rate": 3.304393342176562e-05, + "loss": 2.4395, + "step": 14699 + }, + { + "epoch": 1.1863449277701559, + "grad_norm": 0.6651458144187927, + "learning_rate": 3.303220842147209e-05, + "loss": 2.4059, + "step": 14700 + }, + { + "epoch": 1.1864256315067387, + "grad_norm": 0.7599130868911743, + "learning_rate": 3.302048509019099e-05, + "loss": 2.5044, + "step": 14701 + }, + { + "epoch": 1.1865063352433218, + "grad_norm": 0.6694391965866089, + "learning_rate": 3.3008763428214505e-05, + "loss": 2.4817, + "step": 14702 + }, + { + "epoch": 1.1865870389799047, + "grad_norm": 0.7176856398582458, + "learning_rate": 3.299704343583473e-05, + "loss": 2.4702, + "step": 14703 + }, + { + "epoch": 1.1866677427164878, + "grad_norm": 0.7133145332336426, + "learning_rate": 3.298532511334378e-05, + "loss": 2.4685, + "step": 14704 + }, + { + "epoch": 1.1867484464530709, + "grad_norm": 0.7170277833938599, + "learning_rate": 3.297360846103371e-05, + "loss": 2.4203, + "step": 14705 + }, + { + "epoch": 1.1868291501896537, + "grad_norm": 0.6853376626968384, + "learning_rate": 3.296189347919652e-05, + "loss": 2.4067, + "step": 14706 + }, + { + "epoch": 1.1869098539262368, + "grad_norm": 0.7269156575202942, + "learning_rate": 3.2950180168124175e-05, + "loss": 2.4211, + "step": 14707 + }, + { + "epoch": 1.1869905576628197, + "grad_norm": 0.8649005889892578, + "learning_rate": 3.2938468528108626e-05, + "loss": 2.4611, + "step": 14708 + }, + { + "epoch": 1.1870712613994028, + "grad_norm": 0.7256221771240234, + "learning_rate": 
3.292675855944177e-05, + "loss": 2.4618, + "step": 14709 + }, + { + "epoch": 1.1871519651359859, + "grad_norm": 0.6854279637336731, + "learning_rate": 3.291505026241539e-05, + "loss": 2.4466, + "step": 14710 + }, + { + "epoch": 1.1872326688725687, + "grad_norm": 0.7182712554931641, + "learning_rate": 3.2903343637321316e-05, + "loss": 2.4847, + "step": 14711 + }, + { + "epoch": 1.1873133726091518, + "grad_norm": 0.6795300841331482, + "learning_rate": 3.289163868445134e-05, + "loss": 2.4407, + "step": 14712 + }, + { + "epoch": 1.187394076345735, + "grad_norm": 0.685146689414978, + "learning_rate": 3.287993540409713e-05, + "loss": 2.4537, + "step": 14713 + }, + { + "epoch": 1.1874747800823178, + "grad_norm": 0.7891005873680115, + "learning_rate": 3.2868233796550375e-05, + "loss": 2.4085, + "step": 14714 + }, + { + "epoch": 1.1875554838189009, + "grad_norm": 0.6521769762039185, + "learning_rate": 3.2856533862102724e-05, + "loss": 2.4174, + "step": 14715 + }, + { + "epoch": 1.1876361875554837, + "grad_norm": 0.7486612200737, + "learning_rate": 3.284483560104575e-05, + "loss": 2.4072, + "step": 14716 + }, + { + "epoch": 1.1877168912920668, + "grad_norm": 0.6895913481712341, + "learning_rate": 3.283313901367103e-05, + "loss": 2.4398, + "step": 14717 + }, + { + "epoch": 1.18779759502865, + "grad_norm": 0.6595678329467773, + "learning_rate": 3.282144410027009e-05, + "loss": 2.4407, + "step": 14718 + }, + { + "epoch": 1.1878782987652328, + "grad_norm": 0.7724249958992004, + "learning_rate": 3.280975086113435e-05, + "loss": 2.464, + "step": 14719 + }, + { + "epoch": 1.1879590025018159, + "grad_norm": 0.659472644329071, + "learning_rate": 3.279805929655524e-05, + "loss": 2.4774, + "step": 14720 + }, + { + "epoch": 1.1880397062383987, + "grad_norm": 0.7187919020652771, + "learning_rate": 3.27863694068242e-05, + "loss": 2.4767, + "step": 14721 + }, + { + "epoch": 1.1881204099749818, + "grad_norm": 0.7740198373794556, + "learning_rate": 3.2774681192232506e-05, + "loss": 2.4762, + 
"step": 14722 + }, + { + "epoch": 1.188201113711565, + "grad_norm": 0.700591504573822, + "learning_rate": 3.2762994653071464e-05, + "loss": 2.448, + "step": 14723 + }, + { + "epoch": 1.1882818174481478, + "grad_norm": 0.7168558239936829, + "learning_rate": 3.275130978963237e-05, + "loss": 2.4084, + "step": 14724 + }, + { + "epoch": 1.1883625211847308, + "grad_norm": 0.8039551973342896, + "learning_rate": 3.273962660220646e-05, + "loss": 2.3849, + "step": 14725 + }, + { + "epoch": 1.188443224921314, + "grad_norm": 0.6453016400337219, + "learning_rate": 3.27279450910848e-05, + "loss": 2.3856, + "step": 14726 + }, + { + "epoch": 1.1885239286578968, + "grad_norm": 0.7194651365280151, + "learning_rate": 3.2716265256558644e-05, + "loss": 2.4337, + "step": 14727 + }, + { + "epoch": 1.1886046323944799, + "grad_norm": 0.7298597097396851, + "learning_rate": 3.270458709891906e-05, + "loss": 2.4491, + "step": 14728 + }, + { + "epoch": 1.188685336131063, + "grad_norm": 0.7127524614334106, + "learning_rate": 3.269291061845705e-05, + "loss": 2.4319, + "step": 14729 + }, + { + "epoch": 1.1887660398676458, + "grad_norm": 0.6782705783843994, + "learning_rate": 3.2681235815463654e-05, + "loss": 2.4375, + "step": 14730 + }, + { + "epoch": 1.188846743604229, + "grad_norm": 0.7418326735496521, + "learning_rate": 3.266956269022987e-05, + "loss": 2.4149, + "step": 14731 + }, + { + "epoch": 1.1889274473408118, + "grad_norm": 0.7442455291748047, + "learning_rate": 3.265789124304654e-05, + "loss": 2.3935, + "step": 14732 + }, + { + "epoch": 1.1890081510773949, + "grad_norm": 0.7238253951072693, + "learning_rate": 3.264622147420461e-05, + "loss": 2.4592, + "step": 14733 + }, + { + "epoch": 1.189088854813978, + "grad_norm": 0.6488127708435059, + "learning_rate": 3.2634553383994925e-05, + "loss": 2.3468, + "step": 14734 + }, + { + "epoch": 1.1891695585505608, + "grad_norm": 0.7182446718215942, + "learning_rate": 3.2622886972708246e-05, + "loss": 2.4457, + "step": 14735 + }, + { + "epoch": 
1.189250262287144, + "grad_norm": 0.6885523796081543, + "learning_rate": 3.261122224063534e-05, + "loss": 2.3943, + "step": 14736 + }, + { + "epoch": 1.1893309660237268, + "grad_norm": 0.653367817401886, + "learning_rate": 3.259955918806693e-05, + "loss": 2.4188, + "step": 14737 + }, + { + "epoch": 1.1894116697603099, + "grad_norm": 0.6968675851821899, + "learning_rate": 3.2587897815293686e-05, + "loss": 2.4276, + "step": 14738 + }, + { + "epoch": 1.189492373496893, + "grad_norm": 0.6827409267425537, + "learning_rate": 3.257623812260626e-05, + "loss": 2.4417, + "step": 14739 + }, + { + "epoch": 1.1895730772334758, + "grad_norm": 0.6807438731193542, + "learning_rate": 3.256458011029523e-05, + "loss": 2.4495, + "step": 14740 + }, + { + "epoch": 1.189653780970059, + "grad_norm": 0.6692882180213928, + "learning_rate": 3.255292377865116e-05, + "loss": 2.3789, + "step": 14741 + }, + { + "epoch": 1.189734484706642, + "grad_norm": 0.6581685543060303, + "learning_rate": 3.2541269127964515e-05, + "loss": 2.4073, + "step": 14742 + }, + { + "epoch": 1.1898151884432249, + "grad_norm": 0.6458544731140137, + "learning_rate": 3.252961615852578e-05, + "loss": 2.4657, + "step": 14743 + }, + { + "epoch": 1.189895892179808, + "grad_norm": 0.6971322298049927, + "learning_rate": 3.251796487062541e-05, + "loss": 2.4404, + "step": 14744 + }, + { + "epoch": 1.189976595916391, + "grad_norm": 0.6770374178886414, + "learning_rate": 3.2506315264553724e-05, + "loss": 2.4329, + "step": 14745 + }, + { + "epoch": 1.190057299652974, + "grad_norm": 0.7634715437889099, + "learning_rate": 3.2494667340601085e-05, + "loss": 2.4234, + "step": 14746 + }, + { + "epoch": 1.190138003389557, + "grad_norm": 0.7717967629432678, + "learning_rate": 3.24830210990578e-05, + "loss": 2.5009, + "step": 14747 + }, + { + "epoch": 1.1902187071261399, + "grad_norm": 0.7133559584617615, + "learning_rate": 3.2471376540214124e-05, + "loss": 2.4272, + "step": 14748 + }, + { + "epoch": 1.190299410862723, + "grad_norm": 
0.7273291349411011, + "learning_rate": 3.245973366436027e-05, + "loss": 2.4174, + "step": 14749 + }, + { + "epoch": 1.190380114599306, + "grad_norm": 0.6955052614212036, + "learning_rate": 3.244809247178643e-05, + "loss": 2.3605, + "step": 14750 + }, + { + "epoch": 1.190460818335889, + "grad_norm": 0.7072615027427673, + "learning_rate": 3.2436452962782685e-05, + "loss": 2.4897, + "step": 14751 + }, + { + "epoch": 1.190541522072472, + "grad_norm": 0.7095344662666321, + "learning_rate": 3.242481513763913e-05, + "loss": 2.4172, + "step": 14752 + }, + { + "epoch": 1.1906222258090549, + "grad_norm": 0.7260944247245789, + "learning_rate": 3.2413178996645864e-05, + "loss": 2.4272, + "step": 14753 + }, + { + "epoch": 1.190702929545638, + "grad_norm": 0.6601141691207886, + "learning_rate": 3.2401544540092824e-05, + "loss": 2.4072, + "step": 14754 + }, + { + "epoch": 1.190783633282221, + "grad_norm": 0.6684936881065369, + "learning_rate": 3.238991176827e-05, + "loss": 2.3968, + "step": 14755 + }, + { + "epoch": 1.190864337018804, + "grad_norm": 0.7264483571052551, + "learning_rate": 3.23782806814673e-05, + "loss": 2.4263, + "step": 14756 + }, + { + "epoch": 1.190945040755387, + "grad_norm": 0.6927621960639954, + "learning_rate": 3.2366651279974614e-05, + "loss": 2.4495, + "step": 14757 + }, + { + "epoch": 1.19102574449197, + "grad_norm": 0.7007272243499756, + "learning_rate": 3.2355023564081775e-05, + "loss": 2.4373, + "step": 14758 + }, + { + "epoch": 1.191106448228553, + "grad_norm": 0.6756663918495178, + "learning_rate": 3.234339753407857e-05, + "loss": 2.4148, + "step": 14759 + }, + { + "epoch": 1.191187151965136, + "grad_norm": 0.6741094589233398, + "learning_rate": 3.233177319025479e-05, + "loss": 2.3976, + "step": 14760 + }, + { + "epoch": 1.1912678557017191, + "grad_norm": 0.7098578810691833, + "learning_rate": 3.2320150532900085e-05, + "loss": 2.4326, + "step": 14761 + }, + { + "epoch": 1.191348559438302, + "grad_norm": 0.750271737575531, + "learning_rate": 
3.230852956230413e-05, + "loss": 2.4766, + "step": 14762 + }, + { + "epoch": 1.191429263174885, + "grad_norm": 0.68764728307724, + "learning_rate": 3.229691027875661e-05, + "loss": 2.4128, + "step": 14763 + }, + { + "epoch": 1.191509966911468, + "grad_norm": 0.656295657157898, + "learning_rate": 3.228529268254702e-05, + "loss": 2.3928, + "step": 14764 + }, + { + "epoch": 1.191590670648051, + "grad_norm": 0.6690353155136108, + "learning_rate": 3.2273676773964955e-05, + "loss": 2.408, + "step": 14765 + }, + { + "epoch": 1.1916713743846339, + "grad_norm": 0.8111640214920044, + "learning_rate": 3.22620625532999e-05, + "loss": 2.4644, + "step": 14766 + }, + { + "epoch": 1.191752078121217, + "grad_norm": 0.7329768538475037, + "learning_rate": 3.2250450020841316e-05, + "loss": 2.4235, + "step": 14767 + }, + { + "epoch": 1.1918327818578, + "grad_norm": 0.6902688145637512, + "learning_rate": 3.223883917687861e-05, + "loss": 2.3883, + "step": 14768 + }, + { + "epoch": 1.191913485594383, + "grad_norm": 0.797249972820282, + "learning_rate": 3.2227230021701205e-05, + "loss": 2.523, + "step": 14769 + }, + { + "epoch": 1.191994189330966, + "grad_norm": 0.6294408440589905, + "learning_rate": 3.221562255559834e-05, + "loss": 2.4156, + "step": 14770 + }, + { + "epoch": 1.192074893067549, + "grad_norm": 0.7326164245605469, + "learning_rate": 3.220401677885936e-05, + "loss": 2.3828, + "step": 14771 + }, + { + "epoch": 1.192155596804132, + "grad_norm": 0.783747673034668, + "learning_rate": 3.219241269177351e-05, + "loss": 2.4321, + "step": 14772 + }, + { + "epoch": 1.192236300540715, + "grad_norm": 0.7415335178375244, + "learning_rate": 3.2180810294630005e-05, + "loss": 2.4446, + "step": 14773 + }, + { + "epoch": 1.1923170042772981, + "grad_norm": 0.7125591039657593, + "learning_rate": 3.2169209587717966e-05, + "loss": 2.3914, + "step": 14774 + }, + { + "epoch": 1.192397708013881, + "grad_norm": 0.6714075207710266, + "learning_rate": 3.215761057132652e-05, + "loss": 2.3918, + "step": 
14775 + }, + { + "epoch": 1.192478411750464, + "grad_norm": 0.7147830724716187, + "learning_rate": 3.214601324574481e-05, + "loss": 2.4389, + "step": 14776 + }, + { + "epoch": 1.192559115487047, + "grad_norm": 0.6780480146408081, + "learning_rate": 3.2134417611261755e-05, + "loss": 2.4119, + "step": 14777 + }, + { + "epoch": 1.19263981922363, + "grad_norm": 0.7473881840705872, + "learning_rate": 3.212282366816645e-05, + "loss": 2.4547, + "step": 14778 + }, + { + "epoch": 1.1927205229602131, + "grad_norm": 0.7418377995491028, + "learning_rate": 3.211123141674784e-05, + "loss": 2.4156, + "step": 14779 + }, + { + "epoch": 1.192801226696796, + "grad_norm": 0.687524139881134, + "learning_rate": 3.209964085729477e-05, + "loss": 2.4309, + "step": 14780 + }, + { + "epoch": 1.192881930433379, + "grad_norm": 0.6965883374214172, + "learning_rate": 3.208805199009615e-05, + "loss": 2.4028, + "step": 14781 + }, + { + "epoch": 1.192962634169962, + "grad_norm": 0.7024682760238647, + "learning_rate": 3.207646481544082e-05, + "loss": 2.4482, + "step": 14782 + }, + { + "epoch": 1.193043337906545, + "grad_norm": 0.6835834383964539, + "learning_rate": 3.2064879333617514e-05, + "loss": 2.3898, + "step": 14783 + }, + { + "epoch": 1.1931240416431281, + "grad_norm": 0.7002003788948059, + "learning_rate": 3.2053295544915e-05, + "loss": 2.487, + "step": 14784 + }, + { + "epoch": 1.193204745379711, + "grad_norm": 0.7128168940544128, + "learning_rate": 3.2041713449622e-05, + "loss": 2.4591, + "step": 14785 + }, + { + "epoch": 1.193285449116294, + "grad_norm": 0.6897242665290833, + "learning_rate": 3.203013304802712e-05, + "loss": 2.4458, + "step": 14786 + }, + { + "epoch": 1.1933661528528772, + "grad_norm": 0.7281817197799683, + "learning_rate": 3.2018554340419004e-05, + "loss": 2.3772, + "step": 14787 + }, + { + "epoch": 1.19344685658946, + "grad_norm": 0.6956086754798889, + "learning_rate": 3.200697732708619e-05, + "loss": 2.4316, + "step": 14788 + }, + { + "epoch": 1.1935275603260431, + 
"grad_norm": 0.7679805159568787, + "learning_rate": 3.199540200831729e-05, + "loss": 2.4464, + "step": 14789 + }, + { + "epoch": 1.1936082640626262, + "grad_norm": 0.6993041634559631, + "learning_rate": 3.19838283844007e-05, + "loss": 2.3881, + "step": 14790 + }, + { + "epoch": 1.193688967799209, + "grad_norm": 0.689618706703186, + "learning_rate": 3.197225645562493e-05, + "loss": 2.4184, + "step": 14791 + }, + { + "epoch": 1.1937696715357922, + "grad_norm": 0.6896520853042603, + "learning_rate": 3.1960686222278354e-05, + "loss": 2.4484, + "step": 14792 + }, + { + "epoch": 1.193850375272375, + "grad_norm": 0.6743811368942261, + "learning_rate": 3.1949117684649334e-05, + "loss": 2.4636, + "step": 14793 + }, + { + "epoch": 1.1939310790089581, + "grad_norm": 0.7028046250343323, + "learning_rate": 3.1937550843026163e-05, + "loss": 2.4576, + "step": 14794 + }, + { + "epoch": 1.1940117827455412, + "grad_norm": 0.7219679951667786, + "learning_rate": 3.192598569769718e-05, + "loss": 2.4495, + "step": 14795 + }, + { + "epoch": 1.194092486482124, + "grad_norm": 0.731438159942627, + "learning_rate": 3.191442224895056e-05, + "loss": 2.4699, + "step": 14796 + }, + { + "epoch": 1.1941731902187072, + "grad_norm": 0.6731431484222412, + "learning_rate": 3.19028604970745e-05, + "loss": 2.4292, + "step": 14797 + }, + { + "epoch": 1.19425389395529, + "grad_norm": 0.6720147728919983, + "learning_rate": 3.1891300442357174e-05, + "loss": 2.4482, + "step": 14798 + }, + { + "epoch": 1.1943345976918731, + "grad_norm": 0.7504273653030396, + "learning_rate": 3.187974208508667e-05, + "loss": 2.4233, + "step": 14799 + }, + { + "epoch": 1.1944153014284562, + "grad_norm": 0.6882641315460205, + "learning_rate": 3.186818542555108e-05, + "loss": 2.4633, + "step": 14800 + }, + { + "epoch": 1.194496005165039, + "grad_norm": 0.7337899208068848, + "learning_rate": 3.1856630464038385e-05, + "loss": 2.4257, + "step": 14801 + }, + { + "epoch": 1.1945767089016222, + "grad_norm": 0.7026493549346924, + 
"learning_rate": 3.1845077200836636e-05, + "loss": 2.482, + "step": 14802 + }, + { + "epoch": 1.1946574126382052, + "grad_norm": 0.763351321220398, + "learning_rate": 3.1833525636233675e-05, + "loss": 2.4428, + "step": 14803 + }, + { + "epoch": 1.194738116374788, + "grad_norm": 0.6568076610565186, + "learning_rate": 3.182197577051745e-05, + "loss": 2.4373, + "step": 14804 + }, + { + "epoch": 1.1948188201113712, + "grad_norm": 0.6954717040061951, + "learning_rate": 3.1810427603975844e-05, + "loss": 2.4582, + "step": 14805 + }, + { + "epoch": 1.1948995238479543, + "grad_norm": 0.7130215167999268, + "learning_rate": 3.179888113689661e-05, + "loss": 2.443, + "step": 14806 + }, + { + "epoch": 1.1949802275845371, + "grad_norm": 0.6789865493774414, + "learning_rate": 3.178733636956752e-05, + "loss": 2.4138, + "step": 14807 + }, + { + "epoch": 1.1950609313211202, + "grad_norm": 0.7725361585617065, + "learning_rate": 3.177579330227633e-05, + "loss": 2.4783, + "step": 14808 + }, + { + "epoch": 1.195141635057703, + "grad_norm": 0.6952371001243591, + "learning_rate": 3.17642519353107e-05, + "loss": 2.4571, + "step": 14809 + }, + { + "epoch": 1.1952223387942862, + "grad_norm": 0.7541885375976562, + "learning_rate": 3.1752712268958275e-05, + "loss": 2.4075, + "step": 14810 + }, + { + "epoch": 1.1953030425308693, + "grad_norm": 0.6974624395370483, + "learning_rate": 3.174117430350671e-05, + "loss": 2.4525, + "step": 14811 + }, + { + "epoch": 1.1953837462674521, + "grad_norm": 0.7293709516525269, + "learning_rate": 3.172963803924347e-05, + "loss": 2.4646, + "step": 14812 + }, + { + "epoch": 1.1954644500040352, + "grad_norm": 0.6944144368171692, + "learning_rate": 3.1718103476456106e-05, + "loss": 2.462, + "step": 14813 + }, + { + "epoch": 1.195545153740618, + "grad_norm": 0.6415363550186157, + "learning_rate": 3.170657061543214e-05, + "loss": 2.4086, + "step": 14814 + }, + { + "epoch": 1.1956258574772012, + "grad_norm": 0.6511349081993103, + "learning_rate": 3.169503945645892e-05, 
+ "loss": 2.4376, + "step": 14815 + }, + { + "epoch": 1.1957065612137843, + "grad_norm": 0.7420210242271423, + "learning_rate": 3.1683509999823854e-05, + "loss": 2.4317, + "step": 14816 + }, + { + "epoch": 1.1957872649503671, + "grad_norm": 0.7291967272758484, + "learning_rate": 3.1671982245814316e-05, + "loss": 2.4369, + "step": 14817 + }, + { + "epoch": 1.1958679686869502, + "grad_norm": 0.685743510723114, + "learning_rate": 3.166045619471758e-05, + "loss": 2.465, + "step": 14818 + }, + { + "epoch": 1.1959486724235333, + "grad_norm": 0.7130060195922852, + "learning_rate": 3.164893184682093e-05, + "loss": 2.4305, + "step": 14819 + }, + { + "epoch": 1.1960293761601162, + "grad_norm": 0.694508969783783, + "learning_rate": 3.163740920241156e-05, + "loss": 2.4278, + "step": 14820 + }, + { + "epoch": 1.1961100798966993, + "grad_norm": 0.6478514075279236, + "learning_rate": 3.162588826177669e-05, + "loss": 2.4721, + "step": 14821 + }, + { + "epoch": 1.1961907836332821, + "grad_norm": 0.6586465835571289, + "learning_rate": 3.1614369025203386e-05, + "loss": 2.4716, + "step": 14822 + }, + { + "epoch": 1.1962714873698652, + "grad_norm": 0.7558106184005737, + "learning_rate": 3.160285149297876e-05, + "loss": 2.4656, + "step": 14823 + }, + { + "epoch": 1.1963521911064483, + "grad_norm": 0.7208340764045715, + "learning_rate": 3.1591335665389896e-05, + "loss": 2.4374, + "step": 14824 + }, + { + "epoch": 1.1964328948430312, + "grad_norm": 0.70301353931427, + "learning_rate": 3.157982154272375e-05, + "loss": 2.397, + "step": 14825 + }, + { + "epoch": 1.1965135985796143, + "grad_norm": 0.6857609152793884, + "learning_rate": 3.15683091252673e-05, + "loss": 2.4258, + "step": 14826 + }, + { + "epoch": 1.1965943023161971, + "grad_norm": 0.6954602003097534, + "learning_rate": 3.155679841330747e-05, + "loss": 2.4566, + "step": 14827 + }, + { + "epoch": 1.1966750060527802, + "grad_norm": 0.6923913955688477, + "learning_rate": 3.154528940713113e-05, + "loss": 2.4, + "step": 14828 + }, + { 
+ "epoch": 1.1967557097893633, + "grad_norm": 0.6641134023666382, + "learning_rate": 3.1533782107025124e-05, + "loss": 2.4721, + "step": 14829 + }, + { + "epoch": 1.1968364135259462, + "grad_norm": 0.7470134496688843, + "learning_rate": 3.152227651327627e-05, + "loss": 2.4253, + "step": 14830 + }, + { + "epoch": 1.1969171172625293, + "grad_norm": 0.7234545350074768, + "learning_rate": 3.151077262617126e-05, + "loss": 2.4109, + "step": 14831 + }, + { + "epoch": 1.1969978209991123, + "grad_norm": 0.7814013957977295, + "learning_rate": 3.149927044599682e-05, + "loss": 2.4522, + "step": 14832 + }, + { + "epoch": 1.1970785247356952, + "grad_norm": 0.6825435161590576, + "learning_rate": 3.1487769973039624e-05, + "loss": 2.4728, + "step": 14833 + }, + { + "epoch": 1.1971592284722783, + "grad_norm": 0.7091361880302429, + "learning_rate": 3.147627120758634e-05, + "loss": 2.4615, + "step": 14834 + }, + { + "epoch": 1.1972399322088614, + "grad_norm": 0.7271433472633362, + "learning_rate": 3.146477414992346e-05, + "loss": 2.4154, + "step": 14835 + }, + { + "epoch": 1.1973206359454442, + "grad_norm": 0.6557306051254272, + "learning_rate": 3.145327880033756e-05, + "loss": 2.4348, + "step": 14836 + }, + { + "epoch": 1.1974013396820273, + "grad_norm": 0.6667891144752502, + "learning_rate": 3.1441785159115166e-05, + "loss": 2.4123, + "step": 14837 + }, + { + "epoch": 1.1974820434186102, + "grad_norm": 0.6755266189575195, + "learning_rate": 3.143029322654266e-05, + "loss": 2.4287, + "step": 14838 + }, + { + "epoch": 1.1975627471551933, + "grad_norm": 0.7647396922111511, + "learning_rate": 3.1418803002906475e-05, + "loss": 2.4343, + "step": 14839 + }, + { + "epoch": 1.1976434508917764, + "grad_norm": 0.7288243174552917, + "learning_rate": 3.140731448849305e-05, + "loss": 2.4536, + "step": 14840 + }, + { + "epoch": 1.1977241546283592, + "grad_norm": 0.6126244068145752, + "learning_rate": 3.1395827683588605e-05, + "loss": 2.4187, + "step": 14841 + }, + { + "epoch": 1.1978048583649423, 
+ "grad_norm": 0.6773896217346191, + "learning_rate": 3.138434258847948e-05, + "loss": 2.3916, + "step": 14842 + }, + { + "epoch": 1.1978855621015252, + "grad_norm": 0.724413275718689, + "learning_rate": 3.1372859203451934e-05, + "loss": 2.4614, + "step": 14843 + }, + { + "epoch": 1.1979662658381083, + "grad_norm": 0.7043039798736572, + "learning_rate": 3.136137752879209e-05, + "loss": 2.4343, + "step": 14844 + }, + { + "epoch": 1.1980469695746914, + "grad_norm": 0.7543383240699768, + "learning_rate": 3.134989756478615e-05, + "loss": 2.4345, + "step": 14845 + }, + { + "epoch": 1.1981276733112742, + "grad_norm": 0.7193408608436584, + "learning_rate": 3.1338419311720244e-05, + "loss": 2.4728, + "step": 14846 + }, + { + "epoch": 1.1982083770478573, + "grad_norm": 0.8090186715126038, + "learning_rate": 3.132694276988038e-05, + "loss": 2.4246, + "step": 14847 + }, + { + "epoch": 1.1982890807844404, + "grad_norm": 0.7154600620269775, + "learning_rate": 3.131546793955261e-05, + "loss": 2.4061, + "step": 14848 + }, + { + "epoch": 1.1983697845210233, + "grad_norm": 0.6987032890319824, + "learning_rate": 3.130399482102293e-05, + "loss": 2.4525, + "step": 14849 + }, + { + "epoch": 1.1984504882576064, + "grad_norm": 0.7123507261276245, + "learning_rate": 3.129252341457727e-05, + "loss": 2.4017, + "step": 14850 + }, + { + "epoch": 1.1985311919941894, + "grad_norm": 0.6475987434387207, + "learning_rate": 3.128105372050153e-05, + "loss": 2.4617, + "step": 14851 + }, + { + "epoch": 1.1986118957307723, + "grad_norm": 0.6799046993255615, + "learning_rate": 3.126958573908156e-05, + "loss": 2.4337, + "step": 14852 + }, + { + "epoch": 1.1986925994673554, + "grad_norm": 0.6910607218742371, + "learning_rate": 3.125811947060322e-05, + "loss": 2.415, + "step": 14853 + }, + { + "epoch": 1.1987733032039383, + "grad_norm": 0.6879963278770447, + "learning_rate": 3.124665491535219e-05, + "loss": 2.4912, + "step": 14854 + }, + { + "epoch": 1.1988540069405214, + "grad_norm": 0.7038810849189758, + 
"learning_rate": 3.123519207361425e-05, + "loss": 2.4528, + "step": 14855 + }, + { + "epoch": 1.1989347106771044, + "grad_norm": 0.6771957278251648, + "learning_rate": 3.1223730945675104e-05, + "loss": 2.4524, + "step": 14856 + }, + { + "epoch": 1.1990154144136873, + "grad_norm": 0.7529320120811462, + "learning_rate": 3.1212271531820336e-05, + "loss": 2.4667, + "step": 14857 + }, + { + "epoch": 1.1990961181502704, + "grad_norm": 0.6498474478721619, + "learning_rate": 3.1200813832335574e-05, + "loss": 2.3863, + "step": 14858 + }, + { + "epoch": 1.1991768218868533, + "grad_norm": 0.7587705850601196, + "learning_rate": 3.1189357847506383e-05, + "loss": 2.4962, + "step": 14859 + }, + { + "epoch": 1.1992575256234363, + "grad_norm": 0.674013078212738, + "learning_rate": 3.117790357761825e-05, + "loss": 2.3939, + "step": 14860 + }, + { + "epoch": 1.1993382293600194, + "grad_norm": 0.6546844840049744, + "learning_rate": 3.116645102295668e-05, + "loss": 2.4775, + "step": 14861 + }, + { + "epoch": 1.1994189330966023, + "grad_norm": 0.7558320760726929, + "learning_rate": 3.11550001838071e-05, + "loss": 2.3918, + "step": 14862 + }, + { + "epoch": 1.1994996368331854, + "grad_norm": 0.7074883580207825, + "learning_rate": 3.114355106045486e-05, + "loss": 2.3969, + "step": 14863 + }, + { + "epoch": 1.1995803405697685, + "grad_norm": 0.706078290939331, + "learning_rate": 3.1132103653185305e-05, + "loss": 2.5028, + "step": 14864 + }, + { + "epoch": 1.1996610443063513, + "grad_norm": 0.6883544921875, + "learning_rate": 3.1120657962283764e-05, + "loss": 2.4407, + "step": 14865 + }, + { + "epoch": 1.1997417480429344, + "grad_norm": 0.6905466914176941, + "learning_rate": 3.110921398803551e-05, + "loss": 2.3893, + "step": 14866 + }, + { + "epoch": 1.1998224517795173, + "grad_norm": 0.6584910154342651, + "learning_rate": 3.109777173072569e-05, + "loss": 2.4515, + "step": 14867 + }, + { + "epoch": 1.1999031555161004, + "grad_norm": 0.6957471370697021, + "learning_rate": 
3.108633119063951e-05, + "loss": 2.4483, + "step": 14868 + }, + { + "epoch": 1.1999838592526835, + "grad_norm": 0.6716276407241821, + "learning_rate": 3.1074892368062095e-05, + "loss": 2.4298, + "step": 14869 + }, + { + "epoch": 1.2000645629892663, + "grad_norm": 0.7350820302963257, + "learning_rate": 3.1063455263278543e-05, + "loss": 2.4088, + "step": 14870 + }, + { + "epoch": 1.2001452667258494, + "grad_norm": 0.7409771680831909, + "learning_rate": 3.105201987657388e-05, + "loss": 2.4089, + "step": 14871 + }, + { + "epoch": 1.2002259704624323, + "grad_norm": 0.7273266911506653, + "learning_rate": 3.104058620823315e-05, + "loss": 2.5149, + "step": 14872 + }, + { + "epoch": 1.2003066741990154, + "grad_norm": 0.6793962717056274, + "learning_rate": 3.102915425854124e-05, + "loss": 2.4422, + "step": 14873 + }, + { + "epoch": 1.2003873779355985, + "grad_norm": 0.72386234998703, + "learning_rate": 3.101772402778309e-05, + "loss": 2.4756, + "step": 14874 + }, + { + "epoch": 1.2004680816721813, + "grad_norm": 0.6530055999755859, + "learning_rate": 3.1006295516243625e-05, + "loss": 2.4145, + "step": 14875 + }, + { + "epoch": 1.2005487854087644, + "grad_norm": 0.7288365960121155, + "learning_rate": 3.099486872420758e-05, + "loss": 2.4565, + "step": 14876 + }, + { + "epoch": 1.2006294891453475, + "grad_norm": 0.6982102394104004, + "learning_rate": 3.09834436519598e-05, + "loss": 2.4788, + "step": 14877 + }, + { + "epoch": 1.2007101928819304, + "grad_norm": 0.7208256125450134, + "learning_rate": 3.0972020299785007e-05, + "loss": 2.4186, + "step": 14878 + }, + { + "epoch": 1.2007908966185135, + "grad_norm": 0.6928278803825378, + "learning_rate": 3.096059866796791e-05, + "loss": 2.4177, + "step": 14879 + }, + { + "epoch": 1.2008716003550965, + "grad_norm": 0.7145438194274902, + "learning_rate": 3.094917875679317e-05, + "loss": 2.4796, + "step": 14880 + }, + { + "epoch": 1.2009523040916794, + "grad_norm": 0.7126322388648987, + "learning_rate": 3.093776056654539e-05, + "loss": 
2.4926, + "step": 14881 + }, + { + "epoch": 1.2010330078282625, + "grad_norm": 0.7775046825408936, + "learning_rate": 3.092634409750919e-05, + "loss": 2.4386, + "step": 14882 + }, + { + "epoch": 1.2011137115648454, + "grad_norm": 0.6387330889701843, + "learning_rate": 3.091492934996901e-05, + "loss": 2.4302, + "step": 14883 + }, + { + "epoch": 1.2011944153014285, + "grad_norm": 0.6883525252342224, + "learning_rate": 3.090351632420939e-05, + "loss": 2.4644, + "step": 14884 + }, + { + "epoch": 1.2012751190380115, + "grad_norm": 0.6698900461196899, + "learning_rate": 3.0892105020514795e-05, + "loss": 2.414, + "step": 14885 + }, + { + "epoch": 1.2013558227745944, + "grad_norm": 0.7124409079551697, + "learning_rate": 3.088069543916956e-05, + "loss": 2.4275, + "step": 14886 + }, + { + "epoch": 1.2014365265111775, + "grad_norm": 0.6996601223945618, + "learning_rate": 3.0869287580458076e-05, + "loss": 2.4725, + "step": 14887 + }, + { + "epoch": 1.2015172302477604, + "grad_norm": 0.653087317943573, + "learning_rate": 3.085788144466468e-05, + "loss": 2.383, + "step": 14888 + }, + { + "epoch": 1.2015979339843434, + "grad_norm": 0.7426899671554565, + "learning_rate": 3.0846477032073554e-05, + "loss": 2.4064, + "step": 14889 + }, + { + "epoch": 1.2016786377209265, + "grad_norm": 0.6417646408081055, + "learning_rate": 3.083507434296903e-05, + "loss": 2.3964, + "step": 14890 + }, + { + "epoch": 1.2017593414575094, + "grad_norm": 0.6301923394203186, + "learning_rate": 3.0823673377635274e-05, + "loss": 2.4285, + "step": 14891 + }, + { + "epoch": 1.2018400451940925, + "grad_norm": 0.7621259093284607, + "learning_rate": 3.081227413635638e-05, + "loss": 2.4731, + "step": 14892 + }, + { + "epoch": 1.2019207489306756, + "grad_norm": 0.6637598872184753, + "learning_rate": 3.080087661941648e-05, + "loss": 2.4126, + "step": 14893 + }, + { + "epoch": 1.2020014526672584, + "grad_norm": 0.6820287108421326, + "learning_rate": 3.078948082709964e-05, + "loss": 2.4108, + "step": 14894 + }, + { + 
"epoch": 1.2020821564038415, + "grad_norm": 0.7090989351272583, + "learning_rate": 3.077808675968983e-05, + "loss": 2.4678, + "step": 14895 + }, + { + "epoch": 1.2021628601404246, + "grad_norm": 0.7242181897163391, + "learning_rate": 3.076669441747105e-05, + "loss": 2.5346, + "step": 14896 + }, + { + "epoch": 1.2022435638770075, + "grad_norm": 0.7790088653564453, + "learning_rate": 3.075530380072722e-05, + "loss": 2.4436, + "step": 14897 + }, + { + "epoch": 1.2023242676135906, + "grad_norm": 0.6828821301460266, + "learning_rate": 3.074391490974225e-05, + "loss": 2.3767, + "step": 14898 + }, + { + "epoch": 1.2024049713501734, + "grad_norm": 0.709815502166748, + "learning_rate": 3.0732527744799945e-05, + "loss": 2.4139, + "step": 14899 + }, + { + "epoch": 1.2024856750867565, + "grad_norm": 0.6561180353164673, + "learning_rate": 3.07211423061841e-05, + "loss": 2.399, + "step": 14900 + }, + { + "epoch": 1.2025663788233396, + "grad_norm": 0.7122004628181458, + "learning_rate": 3.0709758594178495e-05, + "loss": 2.4314, + "step": 14901 + }, + { + "epoch": 1.2026470825599225, + "grad_norm": 0.6817516684532166, + "learning_rate": 3.0698376609066825e-05, + "loss": 2.4241, + "step": 14902 + }, + { + "epoch": 1.2027277862965056, + "grad_norm": 0.6848475337028503, + "learning_rate": 3.068699635113277e-05, + "loss": 2.4583, + "step": 14903 + }, + { + "epoch": 1.2028084900330884, + "grad_norm": 0.6567823886871338, + "learning_rate": 3.067561782065999e-05, + "loss": 2.3818, + "step": 14904 + }, + { + "epoch": 1.2028891937696715, + "grad_norm": 0.7373961806297302, + "learning_rate": 3.066424101793198e-05, + "loss": 2.4075, + "step": 14905 + }, + { + "epoch": 1.2029698975062546, + "grad_norm": 0.6968079209327698, + "learning_rate": 3.0652865943232346e-05, + "loss": 2.4701, + "step": 14906 + }, + { + "epoch": 1.2030506012428375, + "grad_norm": 0.7356292009353638, + "learning_rate": 3.064149259684459e-05, + "loss": 2.4188, + "step": 14907 + }, + { + "epoch": 1.2031313049794206, + 
"grad_norm": 0.7144857048988342, + "learning_rate": 3.063012097905211e-05, + "loss": 2.4411, + "step": 14908 + }, + { + "epoch": 1.2032120087160036, + "grad_norm": 0.734531044960022, + "learning_rate": 3.0618751090138365e-05, + "loss": 2.4595, + "step": 14909 + }, + { + "epoch": 1.2032927124525865, + "grad_norm": 0.6658234000205994, + "learning_rate": 3.060738293038669e-05, + "loss": 2.4206, + "step": 14910 + }, + { + "epoch": 1.2033734161891696, + "grad_norm": 0.678424596786499, + "learning_rate": 3.059601650008044e-05, + "loss": 2.4704, + "step": 14911 + }, + { + "epoch": 1.2034541199257527, + "grad_norm": 0.6852440237998962, + "learning_rate": 3.058465179950287e-05, + "loss": 2.46, + "step": 14912 + }, + { + "epoch": 1.2035348236623356, + "grad_norm": 0.702881395816803, + "learning_rate": 3.057328882893724e-05, + "loss": 2.4372, + "step": 14913 + }, + { + "epoch": 1.2036155273989186, + "grad_norm": 0.6978999972343445, + "learning_rate": 3.056192758866676e-05, + "loss": 2.401, + "step": 14914 + }, + { + "epoch": 1.2036962311355015, + "grad_norm": 0.7070993185043335, + "learning_rate": 3.055056807897454e-05, + "loss": 2.3967, + "step": 14915 + }, + { + "epoch": 1.2037769348720846, + "grad_norm": 0.7159305810928345, + "learning_rate": 3.0539210300143693e-05, + "loss": 2.4388, + "step": 14916 + }, + { + "epoch": 1.2038576386086675, + "grad_norm": 0.6920869946479797, + "learning_rate": 3.0527854252457333e-05, + "loss": 2.441, + "step": 14917 + }, + { + "epoch": 1.2039383423452505, + "grad_norm": 0.7014884352684021, + "learning_rate": 3.0516499936198417e-05, + "loss": 2.4115, + "step": 14918 + }, + { + "epoch": 1.2040190460818336, + "grad_norm": 0.6754150986671448, + "learning_rate": 3.0505147351649955e-05, + "loss": 2.3722, + "step": 14919 + }, + { + "epoch": 1.2040997498184165, + "grad_norm": 0.7681791186332703, + "learning_rate": 3.0493796499094874e-05, + "loss": 2.4331, + "step": 14920 + }, + { + "epoch": 1.2041804535549996, + "grad_norm": 0.7265221476554871, + 
"learning_rate": 3.0482447378816082e-05, + "loss": 2.4806, + "step": 14921 + }, + { + "epoch": 1.2042611572915827, + "grad_norm": 0.6841520667076111, + "learning_rate": 3.047109999109642e-05, + "loss": 2.3896, + "step": 14922 + }, + { + "epoch": 1.2043418610281655, + "grad_norm": 0.746347963809967, + "learning_rate": 3.0459754336218737e-05, + "loss": 2.4081, + "step": 14923 + }, + { + "epoch": 1.2044225647647486, + "grad_norm": 0.6679818034172058, + "learning_rate": 3.0448410414465712e-05, + "loss": 2.4206, + "step": 14924 + }, + { + "epoch": 1.2045032685013317, + "grad_norm": 0.7122265100479126, + "learning_rate": 3.0437068226120114e-05, + "loss": 2.4217, + "step": 14925 + }, + { + "epoch": 1.2045839722379146, + "grad_norm": 0.7023499011993408, + "learning_rate": 3.0425727771464618e-05, + "loss": 2.4597, + "step": 14926 + }, + { + "epoch": 1.2046646759744977, + "grad_norm": 0.7304259538650513, + "learning_rate": 3.0414389050781876e-05, + "loss": 2.4915, + "step": 14927 + }, + { + "epoch": 1.2047453797110805, + "grad_norm": 0.7209908962249756, + "learning_rate": 3.0403052064354442e-05, + "loss": 2.4163, + "step": 14928 + }, + { + "epoch": 1.2048260834476636, + "grad_norm": 0.7367275953292847, + "learning_rate": 3.0391716812464865e-05, + "loss": 2.4192, + "step": 14929 + }, + { + "epoch": 1.2049067871842467, + "grad_norm": 0.6576591730117798, + "learning_rate": 3.0380383295395674e-05, + "loss": 2.4606, + "step": 14930 + }, + { + "epoch": 1.2049874909208296, + "grad_norm": 0.7082500457763672, + "learning_rate": 3.0369051513429315e-05, + "loss": 2.4079, + "step": 14931 + }, + { + "epoch": 1.2050681946574127, + "grad_norm": 0.6770346760749817, + "learning_rate": 3.03577214668482e-05, + "loss": 2.45, + "step": 14932 + }, + { + "epoch": 1.2051488983939955, + "grad_norm": 0.6979790925979614, + "learning_rate": 3.034639315593476e-05, + "loss": 2.3966, + "step": 14933 + }, + { + "epoch": 1.2052296021305786, + "grad_norm": 0.6863394975662231, + "learning_rate": 
3.033506658097124e-05, + "loss": 2.4637, + "step": 14934 + }, + { + "epoch": 1.2053103058671617, + "grad_norm": 0.7522799372673035, + "learning_rate": 3.0323741742239963e-05, + "loss": 2.4585, + "step": 14935 + }, + { + "epoch": 1.2053910096037446, + "grad_norm": 0.7119878530502319, + "learning_rate": 3.031241864002321e-05, + "loss": 2.4473, + "step": 14936 + }, + { + "epoch": 1.2054717133403277, + "grad_norm": 0.690861701965332, + "learning_rate": 3.030109727460312e-05, + "loss": 2.4564, + "step": 14937 + }, + { + "epoch": 1.2055524170769107, + "grad_norm": 0.6825447082519531, + "learning_rate": 3.0289777646261886e-05, + "loss": 2.4511, + "step": 14938 + }, + { + "epoch": 1.2056331208134936, + "grad_norm": 0.7404600977897644, + "learning_rate": 3.027845975528164e-05, + "loss": 2.4461, + "step": 14939 + }, + { + "epoch": 1.2057138245500767, + "grad_norm": 0.6871766448020935, + "learning_rate": 3.026714360194437e-05, + "loss": 2.4486, + "step": 14940 + }, + { + "epoch": 1.2057945282866598, + "grad_norm": 0.6646476984024048, + "learning_rate": 3.02558291865322e-05, + "loss": 2.378, + "step": 14941 + }, + { + "epoch": 1.2058752320232426, + "grad_norm": 0.6998385787010193, + "learning_rate": 3.024451650932707e-05, + "loss": 2.4646, + "step": 14942 + }, + { + "epoch": 1.2059559357598257, + "grad_norm": 0.6763097047805786, + "learning_rate": 3.023320557061098e-05, + "loss": 2.3971, + "step": 14943 + }, + { + "epoch": 1.2060366394964086, + "grad_norm": 0.7409633994102478, + "learning_rate": 3.0221896370665736e-05, + "loss": 2.4405, + "step": 14944 + }, + { + "epoch": 1.2061173432329917, + "grad_norm": 0.6972076892852783, + "learning_rate": 3.0210588909773242e-05, + "loss": 2.3935, + "step": 14945 + }, + { + "epoch": 1.2061980469695748, + "grad_norm": 0.6898512840270996, + "learning_rate": 3.0199283188215333e-05, + "loss": 2.4173, + "step": 14946 + }, + { + "epoch": 1.2062787507061576, + "grad_norm": 0.6878097057342529, + "learning_rate": 3.0187979206273707e-05, + "loss": 
2.44, + "step": 14947 + }, + { + "epoch": 1.2063594544427407, + "grad_norm": 0.6629695296287537, + "learning_rate": 3.0176676964230143e-05, + "loss": 2.3836, + "step": 14948 + }, + { + "epoch": 1.2064401581793236, + "grad_norm": 0.717654824256897, + "learning_rate": 3.0165376462366336e-05, + "loss": 2.415, + "step": 14949 + }, + { + "epoch": 1.2065208619159067, + "grad_norm": 0.7526129484176636, + "learning_rate": 3.0154077700963867e-05, + "loss": 2.4985, + "step": 14950 + }, + { + "epoch": 1.2066015656524898, + "grad_norm": 0.6867300271987915, + "learning_rate": 3.014278068030435e-05, + "loss": 2.395, + "step": 14951 + }, + { + "epoch": 1.2066822693890726, + "grad_norm": 0.7321466207504272, + "learning_rate": 3.0131485400669356e-05, + "loss": 2.4503, + "step": 14952 + }, + { + "epoch": 1.2067629731256557, + "grad_norm": 0.6915534734725952, + "learning_rate": 3.0120191862340387e-05, + "loss": 2.398, + "step": 14953 + }, + { + "epoch": 1.2068436768622388, + "grad_norm": 0.7017377018928528, + "learning_rate": 3.01089000655989e-05, + "loss": 2.4367, + "step": 14954 + }, + { + "epoch": 1.2069243805988217, + "grad_norm": 0.7032245397567749, + "learning_rate": 3.0097610010726353e-05, + "loss": 2.4078, + "step": 14955 + }, + { + "epoch": 1.2070050843354048, + "grad_norm": 0.6795478463172913, + "learning_rate": 3.008632169800406e-05, + "loss": 2.3508, + "step": 14956 + }, + { + "epoch": 1.2070857880719879, + "grad_norm": 0.7149559855461121, + "learning_rate": 3.007503512771339e-05, + "loss": 2.4023, + "step": 14957 + }, + { + "epoch": 1.2071664918085707, + "grad_norm": 0.724756121635437, + "learning_rate": 3.006375030013563e-05, + "loss": 2.4439, + "step": 14958 + }, + { + "epoch": 1.2072471955451538, + "grad_norm": 0.7233348488807678, + "learning_rate": 3.005246721555205e-05, + "loss": 2.3819, + "step": 14959 + }, + { + "epoch": 1.2073278992817367, + "grad_norm": 0.700322151184082, + "learning_rate": 3.0041185874243815e-05, + "loss": 2.4222, + "step": 14960 + }, + { + 
"epoch": 1.2074086030183198, + "grad_norm": 0.7268145680427551, + "learning_rate": 3.002990627649209e-05, + "loss": 2.4698, + "step": 14961 + }, + { + "epoch": 1.2074893067549028, + "grad_norm": 0.6885111331939697, + "learning_rate": 3.001862842257801e-05, + "loss": 2.4505, + "step": 14962 + }, + { + "epoch": 1.2075700104914857, + "grad_norm": 0.7237974405288696, + "learning_rate": 3.0007352312782632e-05, + "loss": 2.422, + "step": 14963 + }, + { + "epoch": 1.2076507142280688, + "grad_norm": 0.7214741110801697, + "learning_rate": 2.9996077947387015e-05, + "loss": 2.4428, + "step": 14964 + }, + { + "epoch": 1.2077314179646517, + "grad_norm": 0.7264460921287537, + "learning_rate": 2.998480532667215e-05, + "loss": 2.4669, + "step": 14965 + }, + { + "epoch": 1.2078121217012348, + "grad_norm": 0.7055517435073853, + "learning_rate": 2.9973534450918928e-05, + "loss": 2.5082, + "step": 14966 + }, + { + "epoch": 1.2078928254378178, + "grad_norm": 0.6886781454086304, + "learning_rate": 2.9962265320408268e-05, + "loss": 2.4697, + "step": 14967 + }, + { + "epoch": 1.2079735291744007, + "grad_norm": 0.6875878572463989, + "learning_rate": 2.9950997935421076e-05, + "loss": 2.4384, + "step": 14968 + }, + { + "epoch": 1.2080542329109838, + "grad_norm": 0.7586886882781982, + "learning_rate": 2.99397322962381e-05, + "loss": 2.4088, + "step": 14969 + }, + { + "epoch": 1.2081349366475669, + "grad_norm": 0.6744365096092224, + "learning_rate": 2.992846840314013e-05, + "loss": 2.4109, + "step": 14970 + }, + { + "epoch": 1.2082156403841497, + "grad_norm": 0.6589661240577698, + "learning_rate": 2.9917206256407893e-05, + "loss": 2.4386, + "step": 14971 + }, + { + "epoch": 1.2082963441207328, + "grad_norm": 0.6787264943122864, + "learning_rate": 2.990594585632208e-05, + "loss": 2.401, + "step": 14972 + }, + { + "epoch": 1.2083770478573157, + "grad_norm": 0.710517406463623, + "learning_rate": 2.9894687203163317e-05, + "loss": 2.4813, + "step": 14973 + }, + { + "epoch": 1.2084577515938988, + 
"grad_norm": 0.676110029220581, + "learning_rate": 2.988343029721221e-05, + "loss": 2.4654, + "step": 14974 + }, + { + "epoch": 1.2085384553304819, + "grad_norm": 0.6940518617630005, + "learning_rate": 2.9872175138749336e-05, + "loss": 2.4188, + "step": 14975 + }, + { + "epoch": 1.2086191590670647, + "grad_norm": 0.6849910020828247, + "learning_rate": 2.9860921728055147e-05, + "loss": 2.384, + "step": 14976 + }, + { + "epoch": 1.2086998628036478, + "grad_norm": 0.6902467608451843, + "learning_rate": 2.9849670065410128e-05, + "loss": 2.4364, + "step": 14977 + }, + { + "epoch": 1.2087805665402307, + "grad_norm": 0.6742224097251892, + "learning_rate": 2.9838420151094747e-05, + "loss": 2.5085, + "step": 14978 + }, + { + "epoch": 1.2088612702768138, + "grad_norm": 0.6635094285011292, + "learning_rate": 2.9827171985389303e-05, + "loss": 2.3635, + "step": 14979 + }, + { + "epoch": 1.2089419740133969, + "grad_norm": 0.7189158201217651, + "learning_rate": 2.9815925568574165e-05, + "loss": 2.458, + "step": 14980 + }, + { + "epoch": 1.2090226777499797, + "grad_norm": 0.7370143532752991, + "learning_rate": 2.9804680900929628e-05, + "loss": 2.4543, + "step": 14981 + }, + { + "epoch": 1.2091033814865628, + "grad_norm": 0.7410217523574829, + "learning_rate": 2.979343798273593e-05, + "loss": 2.4537, + "step": 14982 + }, + { + "epoch": 1.209184085223146, + "grad_norm": 0.7525770664215088, + "learning_rate": 2.9782196814273277e-05, + "loss": 2.5147, + "step": 14983 + }, + { + "epoch": 1.2092647889597288, + "grad_norm": 0.7302291393280029, + "learning_rate": 2.9770957395821863e-05, + "loss": 2.4711, + "step": 14984 + }, + { + "epoch": 1.2093454926963119, + "grad_norm": 0.7154920101165771, + "learning_rate": 2.975971972766175e-05, + "loss": 2.5224, + "step": 14985 + }, + { + "epoch": 1.209426196432895, + "grad_norm": 0.6827684640884399, + "learning_rate": 2.9748483810073025e-05, + "loss": 2.4477, + "step": 14986 + }, + { + "epoch": 1.2095069001694778, + "grad_norm": 
0.7753484845161438, + "learning_rate": 2.973724964333575e-05, + "loss": 2.4257, + "step": 14987 + }, + { + "epoch": 1.209587603906061, + "grad_norm": 0.7146809101104736, + "learning_rate": 2.9726017227729862e-05, + "loss": 2.3953, + "step": 14988 + }, + { + "epoch": 1.2096683076426438, + "grad_norm": 0.7360730767250061, + "learning_rate": 2.9714786563535313e-05, + "loss": 2.3774, + "step": 14989 + }, + { + "epoch": 1.2097490113792269, + "grad_norm": 0.7159923911094666, + "learning_rate": 2.970355765103201e-05, + "loss": 2.4068, + "step": 14990 + }, + { + "epoch": 1.20982971511581, + "grad_norm": 0.6732171773910522, + "learning_rate": 2.969233049049982e-05, + "loss": 2.4215, + "step": 14991 + }, + { + "epoch": 1.2099104188523928, + "grad_norm": 0.749812126159668, + "learning_rate": 2.968110508221853e-05, + "loss": 2.4415, + "step": 14992 + }, + { + "epoch": 1.209991122588976, + "grad_norm": 0.7185530662536621, + "learning_rate": 2.9669881426467916e-05, + "loss": 2.4536, + "step": 14993 + }, + { + "epoch": 1.2100718263255588, + "grad_norm": 0.6757143139839172, + "learning_rate": 2.9658659523527733e-05, + "loss": 2.3892, + "step": 14994 + }, + { + "epoch": 1.2101525300621419, + "grad_norm": 0.7187495231628418, + "learning_rate": 2.96474393736776e-05, + "loss": 2.434, + "step": 14995 + }, + { + "epoch": 1.210233233798725, + "grad_norm": 0.7016372680664062, + "learning_rate": 2.9636220977197182e-05, + "loss": 2.4903, + "step": 14996 + }, + { + "epoch": 1.2103139375353078, + "grad_norm": 0.7528983950614929, + "learning_rate": 2.9625004334366103e-05, + "loss": 2.3829, + "step": 14997 + }, + { + "epoch": 1.210394641271891, + "grad_norm": 0.6735692024230957, + "learning_rate": 2.9613789445463837e-05, + "loss": 2.3844, + "step": 14998 + }, + { + "epoch": 1.210475345008474, + "grad_norm": 0.6825322508811951, + "learning_rate": 2.9602576310769935e-05, + "loss": 2.4691, + "step": 14999 + }, + { + "epoch": 1.2105560487450568, + "grad_norm": 0.7507675290107727, + "learning_rate": 
2.959136493056389e-05, + "loss": 2.4605, + "step": 15000 + }, + { + "epoch": 1.2105560487450568, + "eval_loss": 2.3882925510406494, + "eval_runtime": 1014.0781, + "eval_samples_per_second": 2.584, + "eval_steps_per_second": 0.431, + "step": 15000 + }, + { + "epoch": 1.21063675248164, + "grad_norm": 0.6937146782875061, + "learning_rate": 2.9580155305125044e-05, + "loss": 2.4444, + "step": 15001 + }, + { + "epoch": 1.210717456218223, + "grad_norm": 0.6572179794311523, + "learning_rate": 2.9568947434732775e-05, + "loss": 2.4373, + "step": 15002 + }, + { + "epoch": 1.2107981599548059, + "grad_norm": 0.7420738935470581, + "learning_rate": 2.955774131966651e-05, + "loss": 2.4046, + "step": 15003 + }, + { + "epoch": 1.210878863691389, + "grad_norm": 0.7952237129211426, + "learning_rate": 2.954653696020543e-05, + "loss": 2.4082, + "step": 15004 + }, + { + "epoch": 1.2109595674279718, + "grad_norm": 0.6640750765800476, + "learning_rate": 2.9535334356628817e-05, + "loss": 2.4109, + "step": 15005 + }, + { + "epoch": 1.211040271164555, + "grad_norm": 0.6968019008636475, + "learning_rate": 2.952413350921588e-05, + "loss": 2.3991, + "step": 15006 + }, + { + "epoch": 1.211120974901138, + "grad_norm": 0.7174221277236938, + "learning_rate": 2.9512934418245787e-05, + "loss": 2.3909, + "step": 15007 + }, + { + "epoch": 1.2112016786377209, + "grad_norm": 0.6854268908500671, + "learning_rate": 2.9501737083997595e-05, + "loss": 2.4321, + "step": 15008 + }, + { + "epoch": 1.211282382374304, + "grad_norm": 0.6705672740936279, + "learning_rate": 2.949054150675039e-05, + "loss": 2.4749, + "step": 15009 + }, + { + "epoch": 1.2113630861108868, + "grad_norm": 0.7871068716049194, + "learning_rate": 2.9479347686783244e-05, + "loss": 2.424, + "step": 15010 + }, + { + "epoch": 1.21144378984747, + "grad_norm": 0.8194620609283447, + "learning_rate": 2.946815562437506e-05, + "loss": 2.461, + "step": 15011 + }, + { + "epoch": 1.211524493584053, + "grad_norm": 0.673367977142334, + "learning_rate": 
2.9456965319804818e-05, + "loss": 2.4212, + "step": 15012 + }, + { + "epoch": 1.2116051973206359, + "grad_norm": 0.6630001068115234, + "learning_rate": 2.9445776773351397e-05, + "loss": 2.4393, + "step": 15013 + }, + { + "epoch": 1.211685901057219, + "grad_norm": 0.676170825958252, + "learning_rate": 2.943458998529365e-05, + "loss": 2.3889, + "step": 15014 + }, + { + "epoch": 1.211766604793802, + "grad_norm": 0.6951417326927185, + "learning_rate": 2.942340495591037e-05, + "loss": 2.4088, + "step": 15015 + }, + { + "epoch": 1.211847308530385, + "grad_norm": 0.6909857988357544, + "learning_rate": 2.941222168548037e-05, + "loss": 2.4282, + "step": 15016 + }, + { + "epoch": 1.211928012266968, + "grad_norm": 0.653264045715332, + "learning_rate": 2.9401040174282292e-05, + "loss": 2.4369, + "step": 15017 + }, + { + "epoch": 1.2120087160035509, + "grad_norm": 0.6994543075561523, + "learning_rate": 2.938986042259484e-05, + "loss": 2.419, + "step": 15018 + }, + { + "epoch": 1.212089419740134, + "grad_norm": 0.709015965461731, + "learning_rate": 2.9378682430696668e-05, + "loss": 2.4747, + "step": 15019 + }, + { + "epoch": 1.212170123476717, + "grad_norm": 0.6899579167366028, + "learning_rate": 2.9367506198866313e-05, + "loss": 2.4134, + "step": 15020 + }, + { + "epoch": 1.2122508272133, + "grad_norm": 0.6811912059783936, + "learning_rate": 2.9356331727382337e-05, + "loss": 2.449, + "step": 15021 + }, + { + "epoch": 1.212331530949883, + "grad_norm": 0.8119748830795288, + "learning_rate": 2.9345159016523237e-05, + "loss": 2.4463, + "step": 15022 + }, + { + "epoch": 1.2124122346864659, + "grad_norm": 0.7323578000068665, + "learning_rate": 2.9333988066567463e-05, + "loss": 2.4305, + "step": 15023 + }, + { + "epoch": 1.212492938423049, + "grad_norm": 0.6639837622642517, + "learning_rate": 2.9322818877793436e-05, + "loss": 2.4237, + "step": 15024 + }, + { + "epoch": 1.212573642159632, + "grad_norm": 0.669623076915741, + "learning_rate": 2.9311651450479516e-05, + "loss": 2.4436, + 
"step": 15025 + }, + { + "epoch": 1.212654345896215, + "grad_norm": 0.7200437784194946, + "learning_rate": 2.9300485784904054e-05, + "loss": 2.4399, + "step": 15026 + }, + { + "epoch": 1.212735049632798, + "grad_norm": 0.7015525102615356, + "learning_rate": 2.9289321881345254e-05, + "loss": 2.4696, + "step": 15027 + }, + { + "epoch": 1.212815753369381, + "grad_norm": 0.74539715051651, + "learning_rate": 2.9278159740081402e-05, + "loss": 2.4204, + "step": 15028 + }, + { + "epoch": 1.212896457105964, + "grad_norm": 0.6373662352561951, + "learning_rate": 2.9266999361390713e-05, + "loss": 2.4273, + "step": 15029 + }, + { + "epoch": 1.212977160842547, + "grad_norm": 0.8213370442390442, + "learning_rate": 2.9255840745551256e-05, + "loss": 2.4166, + "step": 15030 + }, + { + "epoch": 1.2130578645791301, + "grad_norm": 0.7386181354522705, + "learning_rate": 2.9244683892841185e-05, + "loss": 2.3973, + "step": 15031 + }, + { + "epoch": 1.213138568315713, + "grad_norm": 0.7939273118972778, + "learning_rate": 2.9233528803538534e-05, + "loss": 2.5593, + "step": 15032 + }, + { + "epoch": 1.213219272052296, + "grad_norm": 0.7580689191818237, + "learning_rate": 2.9222375477921347e-05, + "loss": 2.4255, + "step": 15033 + }, + { + "epoch": 1.213299975788879, + "grad_norm": 0.7680409550666809, + "learning_rate": 2.9211223916267573e-05, + "loss": 2.4447, + "step": 15034 + }, + { + "epoch": 1.213380679525462, + "grad_norm": 0.6998565196990967, + "learning_rate": 2.9200074118855135e-05, + "loss": 2.4061, + "step": 15035 + }, + { + "epoch": 1.2134613832620451, + "grad_norm": 0.6673001050949097, + "learning_rate": 2.9188926085961954e-05, + "loss": 2.3989, + "step": 15036 + }, + { + "epoch": 1.213542086998628, + "grad_norm": 0.683215320110321, + "learning_rate": 2.9177779817865815e-05, + "loss": 2.4078, + "step": 15037 + }, + { + "epoch": 1.213622790735211, + "grad_norm": 0.696967363357544, + "learning_rate": 2.9166635314844527e-05, + "loss": 2.4224, + "step": 15038 + }, + { + "epoch": 
1.213703494471794, + "grad_norm": 0.6930364370346069, + "learning_rate": 2.915549257717588e-05, + "loss": 2.4112, + "step": 15039 + }, + { + "epoch": 1.213784198208377, + "grad_norm": 0.7387405633926392, + "learning_rate": 2.914435160513752e-05, + "loss": 2.4458, + "step": 15040 + }, + { + "epoch": 1.21386490194496, + "grad_norm": 0.6615941524505615, + "learning_rate": 2.913321239900714e-05, + "loss": 2.4406, + "step": 15041 + }, + { + "epoch": 1.213945605681543, + "grad_norm": 0.7520569562911987, + "learning_rate": 2.912207495906235e-05, + "loss": 2.3991, + "step": 15042 + }, + { + "epoch": 1.214026309418126, + "grad_norm": 0.6952454447746277, + "learning_rate": 2.911093928558072e-05, + "loss": 2.4404, + "step": 15043 + }, + { + "epoch": 1.2141070131547091, + "grad_norm": 0.7595344185829163, + "learning_rate": 2.9099805378839794e-05, + "loss": 2.551, + "step": 15044 + }, + { + "epoch": 1.214187716891292, + "grad_norm": 0.6645220518112183, + "learning_rate": 2.9088673239117094e-05, + "loss": 2.4167, + "step": 15045 + }, + { + "epoch": 1.214268420627875, + "grad_norm": 0.6433377861976624, + "learning_rate": 2.907754286668998e-05, + "loss": 2.3873, + "step": 15046 + }, + { + "epoch": 1.2143491243644582, + "grad_norm": 0.6806936860084534, + "learning_rate": 2.9066414261835894e-05, + "loss": 2.3868, + "step": 15047 + }, + { + "epoch": 1.214429828101041, + "grad_norm": 0.7261343598365784, + "learning_rate": 2.905528742483222e-05, + "loss": 2.4785, + "step": 15048 + }, + { + "epoch": 1.2145105318376241, + "grad_norm": 0.6495440602302551, + "learning_rate": 2.9044162355956196e-05, + "loss": 2.4167, + "step": 15049 + }, + { + "epoch": 1.214591235574207, + "grad_norm": 0.6816607117652893, + "learning_rate": 2.9033039055485135e-05, + "loss": 2.459, + "step": 15050 + }, + { + "epoch": 1.21467193931079, + "grad_norm": 0.6624214053153992, + "learning_rate": 2.902191752369624e-05, + "loss": 2.4498, + "step": 15051 + }, + { + "epoch": 1.2147526430473732, + "grad_norm": 
0.6800024509429932, + "learning_rate": 2.9010797760866737e-05, + "loss": 2.4442, + "step": 15052 + }, + { + "epoch": 1.214833346783956, + "grad_norm": 0.711705207824707, + "learning_rate": 2.8999679767273667e-05, + "loss": 2.422, + "step": 15053 + }, + { + "epoch": 1.2149140505205391, + "grad_norm": 0.6854784488677979, + "learning_rate": 2.898856354319419e-05, + "loss": 2.4567, + "step": 15054 + }, + { + "epoch": 1.214994754257122, + "grad_norm": 0.6676114797592163, + "learning_rate": 2.8977449088905373e-05, + "loss": 2.3913, + "step": 15055 + }, + { + "epoch": 1.215075457993705, + "grad_norm": 0.6893348693847656, + "learning_rate": 2.8966336404684145e-05, + "loss": 2.4407, + "step": 15056 + }, + { + "epoch": 1.2151561617302882, + "grad_norm": 0.6749289035797119, + "learning_rate": 2.8955225490807514e-05, + "loss": 2.409, + "step": 15057 + }, + { + "epoch": 1.215236865466871, + "grad_norm": 0.6998956203460693, + "learning_rate": 2.8944116347552387e-05, + "loss": 2.4297, + "step": 15058 + }, + { + "epoch": 1.2153175692034541, + "grad_norm": 0.7040024399757385, + "learning_rate": 2.8933008975195596e-05, + "loss": 2.4262, + "step": 15059 + }, + { + "epoch": 1.2153982729400372, + "grad_norm": 0.6638362407684326, + "learning_rate": 2.8921903374014005e-05, + "loss": 2.4355, + "step": 15060 + }, + { + "epoch": 1.21547897667662, + "grad_norm": 0.6864547729492188, + "learning_rate": 2.8910799544284407e-05, + "loss": 2.4493, + "step": 15061 + }, + { + "epoch": 1.2155596804132032, + "grad_norm": 0.707383394241333, + "learning_rate": 2.8899697486283474e-05, + "loss": 2.4604, + "step": 15062 + }, + { + "epoch": 1.2156403841497863, + "grad_norm": 0.7121397852897644, + "learning_rate": 2.888859720028795e-05, + "loss": 2.4272, + "step": 15063 + }, + { + "epoch": 1.2157210878863691, + "grad_norm": 0.7600439786911011, + "learning_rate": 2.8877498686574455e-05, + "loss": 2.4499, + "step": 15064 + }, + { + "epoch": 1.2158017916229522, + "grad_norm": 0.6654962301254272, + 
"learning_rate": 2.886640194541962e-05, + "loss": 2.4632, + "step": 15065 + }, + { + "epoch": 1.215882495359535, + "grad_norm": 0.7138063311576843, + "learning_rate": 2.8855306977099994e-05, + "loss": 2.4321, + "step": 15066 + }, + { + "epoch": 1.2159631990961182, + "grad_norm": 0.672604501247406, + "learning_rate": 2.884421378189208e-05, + "loss": 2.4026, + "step": 15067 + }, + { + "epoch": 1.2160439028327013, + "grad_norm": 0.6894693970680237, + "learning_rate": 2.8833122360072405e-05, + "loss": 2.4213, + "step": 15068 + }, + { + "epoch": 1.2161246065692841, + "grad_norm": 0.6784985065460205, + "learning_rate": 2.8822032711917325e-05, + "loss": 2.4207, + "step": 15069 + }, + { + "epoch": 1.2162053103058672, + "grad_norm": 0.6569294929504395, + "learning_rate": 2.8810944837703248e-05, + "loss": 2.4142, + "step": 15070 + }, + { + "epoch": 1.21628601404245, + "grad_norm": 0.7240702509880066, + "learning_rate": 2.879985873770654e-05, + "loss": 2.4173, + "step": 15071 + }, + { + "epoch": 1.2163667177790332, + "grad_norm": 0.6935575604438782, + "learning_rate": 2.8788774412203444e-05, + "loss": 2.4487, + "step": 15072 + }, + { + "epoch": 1.2164474215156162, + "grad_norm": 0.6903246641159058, + "learning_rate": 2.8777691861470234e-05, + "loss": 2.4193, + "step": 15073 + }, + { + "epoch": 1.216528125252199, + "grad_norm": 0.7982182502746582, + "learning_rate": 2.8766611085783123e-05, + "loss": 2.492, + "step": 15074 + }, + { + "epoch": 1.2166088289887822, + "grad_norm": 0.6958058476448059, + "learning_rate": 2.875553208541827e-05, + "loss": 2.4198, + "step": 15075 + }, + { + "epoch": 1.2166895327253653, + "grad_norm": 0.6869969964027405, + "learning_rate": 2.8744454860651794e-05, + "loss": 2.3768, + "step": 15076 + }, + { + "epoch": 1.2167702364619482, + "grad_norm": 0.7263007760047913, + "learning_rate": 2.8733379411759796e-05, + "loss": 2.386, + "step": 15077 + }, + { + "epoch": 1.2168509401985312, + "grad_norm": 0.7010302543640137, + "learning_rate": 
2.872230573901825e-05, + "loss": 2.4417, + "step": 15078 + }, + { + "epoch": 1.216931643935114, + "grad_norm": 0.818980872631073, + "learning_rate": 2.8711233842703156e-05, + "loss": 2.433, + "step": 15079 + }, + { + "epoch": 1.2170123476716972, + "grad_norm": 0.6937929391860962, + "learning_rate": 2.87001637230905e-05, + "loss": 2.379, + "step": 15080 + }, + { + "epoch": 1.2170930514082803, + "grad_norm": 0.6954175233840942, + "learning_rate": 2.868909538045612e-05, + "loss": 2.4296, + "step": 15081 + }, + { + "epoch": 1.2171737551448631, + "grad_norm": 0.7177354097366333, + "learning_rate": 2.8678028815075887e-05, + "loss": 2.3978, + "step": 15082 + }, + { + "epoch": 1.2172544588814462, + "grad_norm": 0.7100846171379089, + "learning_rate": 2.8666964027225607e-05, + "loss": 2.4566, + "step": 15083 + }, + { + "epoch": 1.217335162618029, + "grad_norm": 0.6909635066986084, + "learning_rate": 2.8655901017181064e-05, + "loss": 2.4772, + "step": 15084 + }, + { + "epoch": 1.2174158663546122, + "grad_norm": 0.7319501638412476, + "learning_rate": 2.8644839785217947e-05, + "loss": 2.4402, + "step": 15085 + }, + { + "epoch": 1.2174965700911953, + "grad_norm": 0.6691421270370483, + "learning_rate": 2.8633780331611958e-05, + "loss": 2.4465, + "step": 15086 + }, + { + "epoch": 1.2175772738277781, + "grad_norm": 0.7028824687004089, + "learning_rate": 2.8622722656638745e-05, + "loss": 2.4765, + "step": 15087 + }, + { + "epoch": 1.2176579775643612, + "grad_norm": 0.7428398728370667, + "learning_rate": 2.861166676057383e-05, + "loss": 2.441, + "step": 15088 + }, + { + "epoch": 1.2177386813009443, + "grad_norm": 0.6715269684791565, + "learning_rate": 2.8600612643692803e-05, + "loss": 2.4621, + "step": 15089 + }, + { + "epoch": 1.2178193850375272, + "grad_norm": 0.6768512725830078, + "learning_rate": 2.8589560306271168e-05, + "loss": 2.4257, + "step": 15090 + }, + { + "epoch": 1.2179000887741103, + "grad_norm": 0.7442535758018494, + "learning_rate": 2.8578509748584326e-05, + "loss": 
2.424, + "step": 15091 + }, + { + "epoch": 1.2179807925106934, + "grad_norm": 0.7275974154472351, + "learning_rate": 2.8567460970907722e-05, + "loss": 2.4698, + "step": 15092 + }, + { + "epoch": 1.2180614962472762, + "grad_norm": 0.7050346732139587, + "learning_rate": 2.8556413973516727e-05, + "loss": 2.4734, + "step": 15093 + }, + { + "epoch": 1.2181421999838593, + "grad_norm": 0.7325939536094666, + "learning_rate": 2.854536875668664e-05, + "loss": 2.4166, + "step": 15094 + }, + { + "epoch": 1.2182229037204422, + "grad_norm": 0.6764184236526489, + "learning_rate": 2.8534325320692746e-05, + "loss": 2.4742, + "step": 15095 + }, + { + "epoch": 1.2183036074570253, + "grad_norm": 0.7405500411987305, + "learning_rate": 2.8523283665810318e-05, + "loss": 2.3959, + "step": 15096 + }, + { + "epoch": 1.2183843111936083, + "grad_norm": 0.6714199185371399, + "learning_rate": 2.8512243792314465e-05, + "loss": 2.4571, + "step": 15097 + }, + { + "epoch": 1.2184650149301912, + "grad_norm": 0.6779391169548035, + "learning_rate": 2.8501205700480372e-05, + "loss": 2.3745, + "step": 15098 + }, + { + "epoch": 1.2185457186667743, + "grad_norm": 0.6876079440116882, + "learning_rate": 2.8490169390583134e-05, + "loss": 2.4432, + "step": 15099 + }, + { + "epoch": 1.2186264224033572, + "grad_norm": 0.7092362642288208, + "learning_rate": 2.8479134862897826e-05, + "loss": 2.4716, + "step": 15100 + }, + { + "epoch": 1.2187071261399403, + "grad_norm": 0.6901989579200745, + "learning_rate": 2.8468102117699414e-05, + "loss": 2.417, + "step": 15101 + }, + { + "epoch": 1.2187878298765233, + "grad_norm": 0.7011592984199524, + "learning_rate": 2.8457071155262884e-05, + "loss": 2.4439, + "step": 15102 + }, + { + "epoch": 1.2188685336131062, + "grad_norm": 0.6923472285270691, + "learning_rate": 2.8446041975863146e-05, + "loss": 2.4247, + "step": 15103 + }, + { + "epoch": 1.2189492373496893, + "grad_norm": 0.6948748230934143, + "learning_rate": 2.843501457977509e-05, + "loss": 2.3902, + "step": 15104 + 
}, + { + "epoch": 1.2190299410862724, + "grad_norm": 0.7034386396408081, + "learning_rate": 2.842398896727354e-05, + "loss": 2.4277, + "step": 15105 + }, + { + "epoch": 1.2191106448228552, + "grad_norm": 0.7965617775917053, + "learning_rate": 2.8412965138633318e-05, + "loss": 2.435, + "step": 15106 + }, + { + "epoch": 1.2191913485594383, + "grad_norm": 0.7371121644973755, + "learning_rate": 2.8401943094129112e-05, + "loss": 2.3928, + "step": 15107 + }, + { + "epoch": 1.2192720522960214, + "grad_norm": 0.7079561352729797, + "learning_rate": 2.839092283403564e-05, + "loss": 2.4706, + "step": 15108 + }, + { + "epoch": 1.2193527560326043, + "grad_norm": 0.6711337566375732, + "learning_rate": 2.8379904358627584e-05, + "loss": 2.4272, + "step": 15109 + }, + { + "epoch": 1.2194334597691874, + "grad_norm": 0.6840410828590393, + "learning_rate": 2.836888766817951e-05, + "loss": 2.4174, + "step": 15110 + }, + { + "epoch": 1.2195141635057702, + "grad_norm": 0.700366199016571, + "learning_rate": 2.8357872762965986e-05, + "loss": 2.4667, + "step": 15111 + }, + { + "epoch": 1.2195948672423533, + "grad_norm": 0.7090682983398438, + "learning_rate": 2.8346859643261593e-05, + "loss": 2.3748, + "step": 15112 + }, + { + "epoch": 1.2196755709789364, + "grad_norm": 0.7965148687362671, + "learning_rate": 2.8335848309340717e-05, + "loss": 2.5138, + "step": 15113 + }, + { + "epoch": 1.2197562747155193, + "grad_norm": 0.7845773696899414, + "learning_rate": 2.8324838761477833e-05, + "loss": 2.4274, + "step": 15114 + }, + { + "epoch": 1.2198369784521024, + "grad_norm": 0.6545087099075317, + "learning_rate": 2.831383099994731e-05, + "loss": 2.4311, + "step": 15115 + }, + { + "epoch": 1.2199176821886852, + "grad_norm": 0.6846331357955933, + "learning_rate": 2.830282502502356e-05, + "loss": 2.4239, + "step": 15116 + }, + { + "epoch": 1.2199983859252683, + "grad_norm": 0.7062236070632935, + "learning_rate": 2.8291820836980798e-05, + "loss": 2.4429, + "step": 15117 + }, + { + "epoch": 
1.2200790896618514, + "grad_norm": 0.7526285648345947, + "learning_rate": 2.8280818436093315e-05, + "loss": 2.4882, + "step": 15118 + }, + { + "epoch": 1.2201597933984343, + "grad_norm": 0.6853364109992981, + "learning_rate": 2.8269817822635337e-05, + "loss": 2.3803, + "step": 15119 + }, + { + "epoch": 1.2202404971350174, + "grad_norm": 0.7796143293380737, + "learning_rate": 2.8258818996880964e-05, + "loss": 2.4157, + "step": 15120 + }, + { + "epoch": 1.2203212008716005, + "grad_norm": 0.7202157378196716, + "learning_rate": 2.824782195910437e-05, + "loss": 2.5101, + "step": 15121 + }, + { + "epoch": 1.2204019046081833, + "grad_norm": 0.6730707287788391, + "learning_rate": 2.8236826709579644e-05, + "loss": 2.4397, + "step": 15122 + }, + { + "epoch": 1.2204826083447664, + "grad_norm": 0.7840865850448608, + "learning_rate": 2.8225833248580745e-05, + "loss": 2.4452, + "step": 15123 + }, + { + "epoch": 1.2205633120813493, + "grad_norm": 0.8323497772216797, + "learning_rate": 2.821484157638171e-05, + "loss": 2.4775, + "step": 15124 + }, + { + "epoch": 1.2206440158179324, + "grad_norm": 0.6699438691139221, + "learning_rate": 2.8203851693256466e-05, + "loss": 2.3958, + "step": 15125 + }, + { + "epoch": 1.2207247195545154, + "grad_norm": 0.6711557507514954, + "learning_rate": 2.8192863599478923e-05, + "loss": 2.477, + "step": 15126 + }, + { + "epoch": 1.2208054232910983, + "grad_norm": 0.6255797743797302, + "learning_rate": 2.8181877295322922e-05, + "loss": 2.4222, + "step": 15127 + }, + { + "epoch": 1.2208861270276814, + "grad_norm": 0.7313731908798218, + "learning_rate": 2.8170892781062297e-05, + "loss": 2.4343, + "step": 15128 + }, + { + "epoch": 1.2209668307642643, + "grad_norm": 0.6611476540565491, + "learning_rate": 2.815991005697076e-05, + "loss": 2.3844, + "step": 15129 + }, + { + "epoch": 1.2210475345008474, + "grad_norm": 0.7293661236763, + "learning_rate": 2.8148929123322065e-05, + "loss": 2.3912, + "step": 15130 + }, + { + "epoch": 1.2211282382374304, + 
"grad_norm": 0.7150777578353882, + "learning_rate": 2.8137949980389866e-05, + "loss": 2.4227, + "step": 15131 + }, + { + "epoch": 1.2212089419740133, + "grad_norm": 0.7001000642776489, + "learning_rate": 2.8126972628447845e-05, + "loss": 2.4751, + "step": 15132 + }, + { + "epoch": 1.2212896457105964, + "grad_norm": 0.7106043100357056, + "learning_rate": 2.8115997067769505e-05, + "loss": 2.4127, + "step": 15133 + }, + { + "epoch": 1.2213703494471795, + "grad_norm": 0.6969115138053894, + "learning_rate": 2.810502329862842e-05, + "loss": 2.4073, + "step": 15134 + }, + { + "epoch": 1.2214510531837623, + "grad_norm": 0.7493317127227783, + "learning_rate": 2.8094051321298098e-05, + "loss": 2.4541, + "step": 15135 + }, + { + "epoch": 1.2215317569203454, + "grad_norm": 0.6499322652816772, + "learning_rate": 2.808308113605198e-05, + "loss": 2.4057, + "step": 15136 + }, + { + "epoch": 1.2216124606569285, + "grad_norm": 0.6716788411140442, + "learning_rate": 2.807211274316347e-05, + "loss": 2.3856, + "step": 15137 + }, + { + "epoch": 1.2216931643935114, + "grad_norm": 0.7724741101264954, + "learning_rate": 2.8061146142905958e-05, + "loss": 2.4652, + "step": 15138 + }, + { + "epoch": 1.2217738681300945, + "grad_norm": 0.7014325261116028, + "learning_rate": 2.8050181335552718e-05, + "loss": 2.4506, + "step": 15139 + }, + { + "epoch": 1.2218545718666773, + "grad_norm": 0.6705317497253418, + "learning_rate": 2.8039218321377026e-05, + "loss": 2.4581, + "step": 15140 + }, + { + "epoch": 1.2219352756032604, + "grad_norm": 0.709973931312561, + "learning_rate": 2.8028257100652156e-05, + "loss": 2.427, + "step": 15141 + }, + { + "epoch": 1.2220159793398435, + "grad_norm": 0.7021297812461853, + "learning_rate": 2.801729767365122e-05, + "loss": 2.3784, + "step": 15142 + }, + { + "epoch": 1.2220966830764264, + "grad_norm": 0.7431899905204773, + "learning_rate": 2.8006340040647393e-05, + "loss": 2.4135, + "step": 15143 + }, + { + "epoch": 1.2221773868130095, + "grad_norm": 
0.6724472045898438, + "learning_rate": 2.7995384201913765e-05, + "loss": 2.3966, + "step": 15144 + }, + { + "epoch": 1.2222580905495923, + "grad_norm": 0.7381375432014465, + "learning_rate": 2.7984430157723384e-05, + "loss": 2.4853, + "step": 15145 + }, + { + "epoch": 1.2223387942861754, + "grad_norm": 0.6809988617897034, + "learning_rate": 2.7973477908349255e-05, + "loss": 2.408, + "step": 15146 + }, + { + "epoch": 1.2224194980227585, + "grad_norm": 0.7042898535728455, + "learning_rate": 2.7962527454064337e-05, + "loss": 2.3981, + "step": 15147 + }, + { + "epoch": 1.2225002017593414, + "grad_norm": 0.7096118330955505, + "learning_rate": 2.7951578795141576e-05, + "loss": 2.4175, + "step": 15148 + }, + { + "epoch": 1.2225809054959245, + "grad_norm": 0.7271720767021179, + "learning_rate": 2.794063193185378e-05, + "loss": 2.4193, + "step": 15149 + }, + { + "epoch": 1.2226616092325076, + "grad_norm": 0.7000352740287781, + "learning_rate": 2.7929686864473792e-05, + "loss": 2.422, + "step": 15150 + }, + { + "epoch": 1.2227423129690904, + "grad_norm": 0.6983076333999634, + "learning_rate": 2.791874359327443e-05, + "loss": 2.4613, + "step": 15151 + }, + { + "epoch": 1.2228230167056735, + "grad_norm": 0.7520100474357605, + "learning_rate": 2.7907802118528383e-05, + "loss": 2.4147, + "step": 15152 + }, + { + "epoch": 1.2229037204422566, + "grad_norm": 0.7056650519371033, + "learning_rate": 2.789686244050834e-05, + "loss": 2.4568, + "step": 15153 + }, + { + "epoch": 1.2229844241788395, + "grad_norm": 0.7092614769935608, + "learning_rate": 2.7885924559486975e-05, + "loss": 2.4758, + "step": 15154 + }, + { + "epoch": 1.2230651279154225, + "grad_norm": 0.702521562576294, + "learning_rate": 2.7874988475736885e-05, + "loss": 2.4893, + "step": 15155 + }, + { + "epoch": 1.2231458316520054, + "grad_norm": 0.7454921007156372, + "learning_rate": 2.786405418953061e-05, + "loss": 2.4277, + "step": 15156 + }, + { + "epoch": 1.2232265353885885, + "grad_norm": 0.659503161907196, + 
"learning_rate": 2.7853121701140694e-05, + "loss": 2.4664, + "step": 15157 + }, + { + "epoch": 1.2233072391251716, + "grad_norm": 0.6368914842605591, + "learning_rate": 2.7842191010839556e-05, + "loss": 2.3728, + "step": 15158 + }, + { + "epoch": 1.2233879428617545, + "grad_norm": 0.7076737880706787, + "learning_rate": 2.783126211889965e-05, + "loss": 2.4204, + "step": 15159 + }, + { + "epoch": 1.2234686465983375, + "grad_norm": 0.718100905418396, + "learning_rate": 2.7820335025593325e-05, + "loss": 2.478, + "step": 15160 + }, + { + "epoch": 1.2235493503349204, + "grad_norm": 0.6804678440093994, + "learning_rate": 2.7809409731192972e-05, + "loss": 2.3755, + "step": 15161 + }, + { + "epoch": 1.2236300540715035, + "grad_norm": 0.7068643569946289, + "learning_rate": 2.77984862359708e-05, + "loss": 2.3713, + "step": 15162 + }, + { + "epoch": 1.2237107578080866, + "grad_norm": 0.7047072052955627, + "learning_rate": 2.7787564540199097e-05, + "loss": 2.4264, + "step": 15163 + }, + { + "epoch": 1.2237914615446694, + "grad_norm": 0.6985021829605103, + "learning_rate": 2.7776644644150076e-05, + "loss": 2.4101, + "step": 15164 + }, + { + "epoch": 1.2238721652812525, + "grad_norm": 0.7543687224388123, + "learning_rate": 2.776572654809583e-05, + "loss": 2.3722, + "step": 15165 + }, + { + "epoch": 1.2239528690178356, + "grad_norm": 0.7199926972389221, + "learning_rate": 2.7754810252308473e-05, + "loss": 2.3819, + "step": 15166 + }, + { + "epoch": 1.2240335727544185, + "grad_norm": 0.696756899356842, + "learning_rate": 2.7743895757060156e-05, + "loss": 2.4245, + "step": 15167 + }, + { + "epoch": 1.2241142764910016, + "grad_norm": 0.7848933339118958, + "learning_rate": 2.773298306262281e-05, + "loss": 2.4725, + "step": 15168 + }, + { + "epoch": 1.2241949802275847, + "grad_norm": 0.6819389462471008, + "learning_rate": 2.7722072169268432e-05, + "loss": 2.4338, + "step": 15169 + }, + { + "epoch": 1.2242756839641675, + "grad_norm": 0.7185801267623901, + "learning_rate": 
2.7711163077268977e-05, + "loss": 2.4745, + "step": 15170 + }, + { + "epoch": 1.2243563877007506, + "grad_norm": 0.7645030617713928, + "learning_rate": 2.7700255786896278e-05, + "loss": 2.4677, + "step": 15171 + }, + { + "epoch": 1.2244370914373335, + "grad_norm": 0.6559275388717651, + "learning_rate": 2.7689350298422202e-05, + "loss": 2.386, + "step": 15172 + }, + { + "epoch": 1.2245177951739166, + "grad_norm": 0.6965066194534302, + "learning_rate": 2.767844661211856e-05, + "loss": 2.4022, + "step": 15173 + }, + { + "epoch": 1.2245984989104994, + "grad_norm": 0.6618858575820923, + "learning_rate": 2.7667544728257057e-05, + "loss": 2.3541, + "step": 15174 + }, + { + "epoch": 1.2246792026470825, + "grad_norm": 0.6635501980781555, + "learning_rate": 2.765664464710941e-05, + "loss": 2.3984, + "step": 15175 + }, + { + "epoch": 1.2247599063836656, + "grad_norm": 0.6987191438674927, + "learning_rate": 2.764574636894729e-05, + "loss": 2.4637, + "step": 15176 + }, + { + "epoch": 1.2248406101202485, + "grad_norm": 0.7289232611656189, + "learning_rate": 2.7634849894042303e-05, + "loss": 2.4033, + "step": 15177 + }, + { + "epoch": 1.2249213138568316, + "grad_norm": 0.7245565056800842, + "learning_rate": 2.762395522266602e-05, + "loss": 2.4281, + "step": 15178 + }, + { + "epoch": 1.2250020175934146, + "grad_norm": 0.6946065425872803, + "learning_rate": 2.761306235508997e-05, + "loss": 2.3869, + "step": 15179 + }, + { + "epoch": 1.2250827213299975, + "grad_norm": 0.6381784677505493, + "learning_rate": 2.7602171291585666e-05, + "loss": 2.404, + "step": 15180 + }, + { + "epoch": 1.2251634250665806, + "grad_norm": 0.6893685460090637, + "learning_rate": 2.759128203242446e-05, + "loss": 2.4807, + "step": 15181 + }, + { + "epoch": 1.2252441288031637, + "grad_norm": 0.6640260815620422, + "learning_rate": 2.7580394577877787e-05, + "loss": 2.4036, + "step": 15182 + }, + { + "epoch": 1.2253248325397466, + "grad_norm": 0.7125177979469299, + "learning_rate": 2.7569508928217026e-05, + 
"loss": 2.3869, + "step": 15183 + }, + { + "epoch": 1.2254055362763296, + "grad_norm": 0.657865583896637, + "learning_rate": 2.7558625083713397e-05, + "loss": 2.3869, + "step": 15184 + }, + { + "epoch": 1.2254862400129125, + "grad_norm": 0.6776065230369568, + "learning_rate": 2.7547743044638197e-05, + "loss": 2.4128, + "step": 15185 + }, + { + "epoch": 1.2255669437494956, + "grad_norm": 0.7126299738883972, + "learning_rate": 2.753686281126263e-05, + "loss": 2.4465, + "step": 15186 + }, + { + "epoch": 1.2256476474860787, + "grad_norm": 0.6918273568153381, + "learning_rate": 2.7525984383857873e-05, + "loss": 2.428, + "step": 15187 + }, + { + "epoch": 1.2257283512226615, + "grad_norm": 0.7742759585380554, + "learning_rate": 2.7515107762695025e-05, + "loss": 2.4299, + "step": 15188 + }, + { + "epoch": 1.2258090549592446, + "grad_norm": 0.7194607853889465, + "learning_rate": 2.7504232948045205e-05, + "loss": 2.4315, + "step": 15189 + }, + { + "epoch": 1.2258897586958275, + "grad_norm": 0.6962646245956421, + "learning_rate": 2.7493359940179363e-05, + "loss": 2.4494, + "step": 15190 + }, + { + "epoch": 1.2259704624324106, + "grad_norm": 0.6681686639785767, + "learning_rate": 2.7482488739368538e-05, + "loss": 2.427, + "step": 15191 + }, + { + "epoch": 1.2260511661689937, + "grad_norm": 0.6589877009391785, + "learning_rate": 2.747161934588366e-05, + "loss": 2.4333, + "step": 15192 + }, + { + "epoch": 1.2261318699055765, + "grad_norm": 0.7415218949317932, + "learning_rate": 2.746075175999564e-05, + "loss": 2.4203, + "step": 15193 + }, + { + "epoch": 1.2262125736421596, + "grad_norm": 0.7371910214424133, + "learning_rate": 2.7449885981975276e-05, + "loss": 2.4684, + "step": 15194 + }, + { + "epoch": 1.2262932773787427, + "grad_norm": 0.7010802626609802, + "learning_rate": 2.7439022012093407e-05, + "loss": 2.4625, + "step": 15195 + }, + { + "epoch": 1.2263739811153256, + "grad_norm": 0.7125125527381897, + "learning_rate": 2.7428159850620773e-05, + "loss": 2.4075, + "step": 
15196 + }, + { + "epoch": 1.2264546848519087, + "grad_norm": 0.701133668422699, + "learning_rate": 2.7417299497828107e-05, + "loss": 2.4525, + "step": 15197 + }, + { + "epoch": 1.2265353885884918, + "grad_norm": 0.7543410658836365, + "learning_rate": 2.7406440953986078e-05, + "loss": 2.474, + "step": 15198 + }, + { + "epoch": 1.2266160923250746, + "grad_norm": 0.69012051820755, + "learning_rate": 2.7395584219365323e-05, + "loss": 2.4853, + "step": 15199 + }, + { + "epoch": 1.2266967960616577, + "grad_norm": 0.6559048295021057, + "learning_rate": 2.7384729294236378e-05, + "loss": 2.4252, + "step": 15200 + }, + { + "epoch": 1.2267774997982406, + "grad_norm": 0.6603518128395081, + "learning_rate": 2.7373876178869794e-05, + "loss": 2.4047, + "step": 15201 + }, + { + "epoch": 1.2268582035348237, + "grad_norm": 0.7159265279769897, + "learning_rate": 2.736302487353609e-05, + "loss": 2.4352, + "step": 15202 + }, + { + "epoch": 1.2269389072714068, + "grad_norm": 0.6784560084342957, + "learning_rate": 2.735217537850565e-05, + "loss": 2.3933, + "step": 15203 + }, + { + "epoch": 1.2270196110079896, + "grad_norm": 0.7341950535774231, + "learning_rate": 2.7341327694048903e-05, + "loss": 2.4514, + "step": 15204 + }, + { + "epoch": 1.2271003147445727, + "grad_norm": 0.726046621799469, + "learning_rate": 2.7330481820436204e-05, + "loss": 2.4427, + "step": 15205 + }, + { + "epoch": 1.2271810184811556, + "grad_norm": 0.6897192001342773, + "learning_rate": 2.7319637757937854e-05, + "loss": 2.4587, + "step": 15206 + }, + { + "epoch": 1.2272617222177387, + "grad_norm": 0.6981058716773987, + "learning_rate": 2.7308795506824124e-05, + "loss": 2.4297, + "step": 15207 + }, + { + "epoch": 1.2273424259543217, + "grad_norm": 0.694583535194397, + "learning_rate": 2.729795506736522e-05, + "loss": 2.3608, + "step": 15208 + }, + { + "epoch": 1.2274231296909046, + "grad_norm": 0.710192084312439, + "learning_rate": 2.728711643983136e-05, + "loss": 2.3733, + "step": 15209 + }, + { + "epoch": 
1.2275038334274877, + "grad_norm": 0.7203633785247803, + "learning_rate": 2.7276279624492595e-05, + "loss": 2.389, + "step": 15210 + }, + { + "epoch": 1.2275845371640708, + "grad_norm": 0.7298668622970581, + "learning_rate": 2.726544462161905e-05, + "loss": 2.3981, + "step": 15211 + }, + { + "epoch": 1.2276652409006537, + "grad_norm": 0.6640039682388306, + "learning_rate": 2.725461143148078e-05, + "loss": 2.4073, + "step": 15212 + }, + { + "epoch": 1.2277459446372367, + "grad_norm": 0.7203015685081482, + "learning_rate": 2.724378005434772e-05, + "loss": 2.4901, + "step": 15213 + }, + { + "epoch": 1.2278266483738198, + "grad_norm": 0.6668895483016968, + "learning_rate": 2.723295049048985e-05, + "loss": 2.4482, + "step": 15214 + }, + { + "epoch": 1.2279073521104027, + "grad_norm": 0.7551584839820862, + "learning_rate": 2.7222122740177103e-05, + "loss": 2.4877, + "step": 15215 + }, + { + "epoch": 1.2279880558469858, + "grad_norm": 0.707202672958374, + "learning_rate": 2.721129680367923e-05, + "loss": 2.4577, + "step": 15216 + }, + { + "epoch": 1.2280687595835686, + "grad_norm": 0.685153603553772, + "learning_rate": 2.7200472681266155e-05, + "loss": 2.476, + "step": 15217 + }, + { + "epoch": 1.2281494633201517, + "grad_norm": 0.6843041181564331, + "learning_rate": 2.718965037320762e-05, + "loss": 2.4164, + "step": 15218 + }, + { + "epoch": 1.2282301670567348, + "grad_norm": 0.6548978686332703, + "learning_rate": 2.7178829879773306e-05, + "loss": 2.4187, + "step": 15219 + }, + { + "epoch": 1.2283108707933177, + "grad_norm": 0.7037245035171509, + "learning_rate": 2.7168011201232902e-05, + "loss": 2.3621, + "step": 15220 + }, + { + "epoch": 1.2283915745299008, + "grad_norm": 0.6540676951408386, + "learning_rate": 2.7157194337856074e-05, + "loss": 2.4542, + "step": 15221 + }, + { + "epoch": 1.2284722782664836, + "grad_norm": 0.7699899673461914, + "learning_rate": 2.7146379289912338e-05, + "loss": 2.4639, + "step": 15222 + }, + { + "epoch": 1.2285529820030667, + 
"grad_norm": 0.7178743481636047, + "learning_rate": 2.713556605767128e-05, + "loss": 2.4222, + "step": 15223 + }, + { + "epoch": 1.2286336857396498, + "grad_norm": 0.6749793887138367, + "learning_rate": 2.7124754641402383e-05, + "loss": 2.4323, + "step": 15224 + }, + { + "epoch": 1.2287143894762327, + "grad_norm": 0.7035594582557678, + "learning_rate": 2.711394504137513e-05, + "loss": 2.4466, + "step": 15225 + }, + { + "epoch": 1.2287950932128158, + "grad_norm": 0.6518487930297852, + "learning_rate": 2.7103137257858868e-05, + "loss": 2.4969, + "step": 15226 + }, + { + "epoch": 1.2288757969493989, + "grad_norm": 0.6739057898521423, + "learning_rate": 2.7092331291122974e-05, + "loss": 2.406, + "step": 15227 + }, + { + "epoch": 1.2289565006859817, + "grad_norm": 0.6584770083427429, + "learning_rate": 2.7081527141436767e-05, + "loss": 2.4304, + "step": 15228 + }, + { + "epoch": 1.2290372044225648, + "grad_norm": 0.6846301555633545, + "learning_rate": 2.7070724809069514e-05, + "loss": 2.3995, + "step": 15229 + }, + { + "epoch": 1.2291179081591477, + "grad_norm": 0.6778364777565002, + "learning_rate": 2.705992429429044e-05, + "loss": 2.38, + "step": 15230 + }, + { + "epoch": 1.2291986118957308, + "grad_norm": 0.6957302689552307, + "learning_rate": 2.7049125597368753e-05, + "loss": 2.3973, + "step": 15231 + }, + { + "epoch": 1.2292793156323139, + "grad_norm": 0.730269193649292, + "learning_rate": 2.7038328718573514e-05, + "loss": 2.4829, + "step": 15232 + }, + { + "epoch": 1.2293600193688967, + "grad_norm": 0.7114049196243286, + "learning_rate": 2.702753365817384e-05, + "loss": 2.3902, + "step": 15233 + }, + { + "epoch": 1.2294407231054798, + "grad_norm": 0.7137531638145447, + "learning_rate": 2.7016740416438823e-05, + "loss": 2.3957, + "step": 15234 + }, + { + "epoch": 1.2295214268420627, + "grad_norm": 0.7178330421447754, + "learning_rate": 2.7005948993637386e-05, + "loss": 2.4429, + "step": 15235 + }, + { + "epoch": 1.2296021305786458, + "grad_norm": 
0.6767767071723938, + "learning_rate": 2.6995159390038506e-05, + "loss": 2.4009, + "step": 15236 + }, + { + "epoch": 1.2296828343152288, + "grad_norm": 0.7713541984558105, + "learning_rate": 2.6984371605911086e-05, + "loss": 2.4326, + "step": 15237 + }, + { + "epoch": 1.2297635380518117, + "grad_norm": 0.7218228578567505, + "learning_rate": 2.6973585641523992e-05, + "loss": 2.4358, + "step": 15238 + }, + { + "epoch": 1.2298442417883948, + "grad_norm": 0.6782575249671936, + "learning_rate": 2.696280149714604e-05, + "loss": 2.3844, + "step": 15239 + }, + { + "epoch": 1.2299249455249779, + "grad_norm": 0.6825734972953796, + "learning_rate": 2.6952019173045982e-05, + "loss": 2.4621, + "step": 15240 + }, + { + "epoch": 1.2300056492615608, + "grad_norm": 0.6587522625923157, + "learning_rate": 2.6941238669492608e-05, + "loss": 2.4465, + "step": 15241 + }, + { + "epoch": 1.2300863529981438, + "grad_norm": 0.6898796558380127, + "learning_rate": 2.6930459986754498e-05, + "loss": 2.4469, + "step": 15242 + }, + { + "epoch": 1.230167056734727, + "grad_norm": 0.6764062643051147, + "learning_rate": 2.6919683125100338e-05, + "loss": 2.4476, + "step": 15243 + }, + { + "epoch": 1.2302477604713098, + "grad_norm": 0.6647047400474548, + "learning_rate": 2.6908908084798733e-05, + "loss": 2.3677, + "step": 15244 + }, + { + "epoch": 1.2303284642078929, + "grad_norm": 0.7091608047485352, + "learning_rate": 2.6898134866118174e-05, + "loss": 2.4605, + "step": 15245 + }, + { + "epoch": 1.2304091679444757, + "grad_norm": 0.691007137298584, + "learning_rate": 2.6887363469327188e-05, + "loss": 2.4397, + "step": 15246 + }, + { + "epoch": 1.2304898716810588, + "grad_norm": 0.6685532927513123, + "learning_rate": 2.6876593894694214e-05, + "loss": 2.4279, + "step": 15247 + }, + { + "epoch": 1.230570575417642, + "grad_norm": 0.684474766254425, + "learning_rate": 2.686582614248767e-05, + "loss": 2.4162, + "step": 15248 + }, + { + "epoch": 1.2306512791542248, + "grad_norm": 0.657293975353241, + 
"learning_rate": 2.6855060212975915e-05, + "loss": 2.4337, + "step": 15249 + }, + { + "epoch": 1.2307319828908079, + "grad_norm": 0.7136504650115967, + "learning_rate": 2.684429610642729e-05, + "loss": 2.4156, + "step": 15250 + }, + { + "epoch": 1.2308126866273907, + "grad_norm": 0.6564410924911499, + "learning_rate": 2.6833533823110013e-05, + "loss": 2.5101, + "step": 15251 + }, + { + "epoch": 1.2308933903639738, + "grad_norm": 0.6628747582435608, + "learning_rate": 2.682277336329233e-05, + "loss": 2.3933, + "step": 15252 + }, + { + "epoch": 1.230974094100557, + "grad_norm": 0.7362595796585083, + "learning_rate": 2.681201472724244e-05, + "loss": 2.4541, + "step": 15253 + }, + { + "epoch": 1.2310547978371398, + "grad_norm": 0.7604697346687317, + "learning_rate": 2.680125791522844e-05, + "loss": 2.4383, + "step": 15254 + }, + { + "epoch": 1.2311355015737229, + "grad_norm": 0.7128429412841797, + "learning_rate": 2.6790502927518434e-05, + "loss": 2.4492, + "step": 15255 + }, + { + "epoch": 1.231216205310306, + "grad_norm": 0.6761955618858337, + "learning_rate": 2.677974976438047e-05, + "loss": 2.4355, + "step": 15256 + }, + { + "epoch": 1.2312969090468888, + "grad_norm": 0.6687077879905701, + "learning_rate": 2.6768998426082538e-05, + "loss": 2.4317, + "step": 15257 + }, + { + "epoch": 1.231377612783472, + "grad_norm": 0.7423825860023499, + "learning_rate": 2.675824891289259e-05, + "loss": 2.4216, + "step": 15258 + }, + { + "epoch": 1.231458316520055, + "grad_norm": 0.671130359172821, + "learning_rate": 2.6747501225078542e-05, + "loss": 2.4775, + "step": 15259 + }, + { + "epoch": 1.2315390202566379, + "grad_norm": 0.7421461939811707, + "learning_rate": 2.6736755362908273e-05, + "loss": 2.4042, + "step": 15260 + }, + { + "epoch": 1.231619723993221, + "grad_norm": 0.7084131240844727, + "learning_rate": 2.6726011326649547e-05, + "loss": 2.4506, + "step": 15261 + }, + { + "epoch": 1.2317004277298038, + "grad_norm": 0.641852855682373, + "learning_rate": 
2.671526911657015e-05, + "loss": 2.4261, + "step": 15262 + }, + { + "epoch": 1.231781131466387, + "grad_norm": 0.7627724409103394, + "learning_rate": 2.670452873293785e-05, + "loss": 2.4647, + "step": 15263 + }, + { + "epoch": 1.23186183520297, + "grad_norm": 0.6638163924217224, + "learning_rate": 2.669379017602026e-05, + "loss": 2.4208, + "step": 15264 + }, + { + "epoch": 1.2319425389395529, + "grad_norm": 0.6815361380577087, + "learning_rate": 2.668305344608505e-05, + "loss": 2.4404, + "step": 15265 + }, + { + "epoch": 1.232023242676136, + "grad_norm": 0.6466485857963562, + "learning_rate": 2.6672318543399823e-05, + "loss": 2.4327, + "step": 15266 + }, + { + "epoch": 1.2321039464127188, + "grad_norm": 0.7119305729866028, + "learning_rate": 2.6661585468232042e-05, + "loss": 2.4266, + "step": 15267 + }, + { + "epoch": 1.232184650149302, + "grad_norm": 0.7245718836784363, + "learning_rate": 2.6650854220849286e-05, + "loss": 2.4484, + "step": 15268 + }, + { + "epoch": 1.232265353885885, + "grad_norm": 0.7050287127494812, + "learning_rate": 2.6640124801518972e-05, + "loss": 2.4441, + "step": 15269 + }, + { + "epoch": 1.2323460576224678, + "grad_norm": 0.6906494498252869, + "learning_rate": 2.6629397210508556e-05, + "loss": 2.4297, + "step": 15270 + }, + { + "epoch": 1.232426761359051, + "grad_norm": 0.7224171757698059, + "learning_rate": 2.661867144808532e-05, + "loss": 2.4279, + "step": 15271 + }, + { + "epoch": 1.232507465095634, + "grad_norm": 0.688804030418396, + "learning_rate": 2.6607947514516606e-05, + "loss": 2.4741, + "step": 15272 + }, + { + "epoch": 1.232588168832217, + "grad_norm": 0.6462350487709045, + "learning_rate": 2.6597225410069726e-05, + "loss": 2.4499, + "step": 15273 + }, + { + "epoch": 1.2326688725688, + "grad_norm": 0.6860110759735107, + "learning_rate": 2.658650513501184e-05, + "loss": 2.4488, + "step": 15274 + }, + { + "epoch": 1.2327495763053828, + "grad_norm": 0.7158305644989014, + "learning_rate": 2.6575786689610138e-05, + "loss": 2.4318, 
+ "step": 15275 + }, + { + "epoch": 1.232830280041966, + "grad_norm": 0.7740959525108337, + "learning_rate": 2.6565070074131804e-05, + "loss": 2.4824, + "step": 15276 + }, + { + "epoch": 1.232910983778549, + "grad_norm": 0.7573856711387634, + "learning_rate": 2.6554355288843847e-05, + "loss": 2.4034, + "step": 15277 + }, + { + "epoch": 1.2329916875151319, + "grad_norm": 0.6809369921684265, + "learning_rate": 2.654364233401332e-05, + "loss": 2.5085, + "step": 15278 + }, + { + "epoch": 1.233072391251715, + "grad_norm": 0.6695643067359924, + "learning_rate": 2.6532931209907307e-05, + "loss": 2.4697, + "step": 15279 + }, + { + "epoch": 1.2331530949882978, + "grad_norm": 0.7218750715255737, + "learning_rate": 2.6522221916792655e-05, + "loss": 2.4753, + "step": 15280 + }, + { + "epoch": 1.233233798724881, + "grad_norm": 0.8171822428703308, + "learning_rate": 2.6511514454936314e-05, + "loss": 2.45, + "step": 15281 + }, + { + "epoch": 1.233314502461464, + "grad_norm": 0.7234573364257812, + "learning_rate": 2.6500808824605162e-05, + "loss": 2.3963, + "step": 15282 + }, + { + "epoch": 1.2333952061980469, + "grad_norm": 0.6993409395217896, + "learning_rate": 2.6490105026065948e-05, + "loss": 2.4449, + "step": 15283 + }, + { + "epoch": 1.23347590993463, + "grad_norm": 0.7984449863433838, + "learning_rate": 2.6479403059585472e-05, + "loss": 2.4322, + "step": 15284 + }, + { + "epoch": 1.233556613671213, + "grad_norm": 0.683971107006073, + "learning_rate": 2.6468702925430466e-05, + "loss": 2.4125, + "step": 15285 + }, + { + "epoch": 1.233637317407796, + "grad_norm": 0.6739822626113892, + "learning_rate": 2.6458004623867617e-05, + "loss": 2.4487, + "step": 15286 + }, + { + "epoch": 1.233718021144379, + "grad_norm": 0.7003912925720215, + "learning_rate": 2.644730815516351e-05, + "loss": 2.4437, + "step": 15287 + }, + { + "epoch": 1.233798724880962, + "grad_norm": 0.7011744379997253, + "learning_rate": 2.643661351958474e-05, + "loss": 2.4798, + "step": 15288 + }, + { + "epoch": 
1.233879428617545, + "grad_norm": 0.7003397941589355, + "learning_rate": 2.6425920717397867e-05, + "loss": 2.4554, + "step": 15289 + }, + { + "epoch": 1.233960132354128, + "grad_norm": 0.6682165265083313, + "learning_rate": 2.6415229748869374e-05, + "loss": 2.4252, + "step": 15290 + }, + { + "epoch": 1.234040836090711, + "grad_norm": 0.6712457537651062, + "learning_rate": 2.6404540614265715e-05, + "loss": 2.4225, + "step": 15291 + }, + { + "epoch": 1.234121539827294, + "grad_norm": 0.654464602470398, + "learning_rate": 2.63938533138533e-05, + "loss": 2.4462, + "step": 15292 + }, + { + "epoch": 1.234202243563877, + "grad_norm": 0.7311797738075256, + "learning_rate": 2.638316784789845e-05, + "loss": 2.502, + "step": 15293 + }, + { + "epoch": 1.23428294730046, + "grad_norm": 0.6836559176445007, + "learning_rate": 2.6372484216667492e-05, + "loss": 2.5134, + "step": 15294 + }, + { + "epoch": 1.234363651037043, + "grad_norm": 0.6961826086044312, + "learning_rate": 2.636180242042672e-05, + "loss": 2.4479, + "step": 15295 + }, + { + "epoch": 1.234444354773626, + "grad_norm": 0.6824259161949158, + "learning_rate": 2.635112245944229e-05, + "loss": 2.4299, + "step": 15296 + }, + { + "epoch": 1.234525058510209, + "grad_norm": 0.7594609260559082, + "learning_rate": 2.634044433398042e-05, + "loss": 2.4469, + "step": 15297 + }, + { + "epoch": 1.234605762246792, + "grad_norm": 0.7044653296470642, + "learning_rate": 2.632976804430721e-05, + "loss": 2.447, + "step": 15298 + }, + { + "epoch": 1.234686465983375, + "grad_norm": 0.6986916065216064, + "learning_rate": 2.631909359068876e-05, + "loss": 2.4705, + "step": 15299 + }, + { + "epoch": 1.234767169719958, + "grad_norm": 0.7025431990623474, + "learning_rate": 2.630842097339111e-05, + "loss": 2.3951, + "step": 15300 + }, + { + "epoch": 1.2348478734565411, + "grad_norm": 0.6533786058425903, + "learning_rate": 2.6297750192680237e-05, + "loss": 2.3769, + "step": 15301 + }, + { + "epoch": 1.234928577193124, + "grad_norm": 
0.6575472354888916, + "learning_rate": 2.628708124882212e-05, + "loss": 2.4293, + "step": 15302 + }, + { + "epoch": 1.235009280929707, + "grad_norm": 0.6712046265602112, + "learning_rate": 2.6276414142082584e-05, + "loss": 2.4819, + "step": 15303 + }, + { + "epoch": 1.2350899846662902, + "grad_norm": 0.6947652101516724, + "learning_rate": 2.6265748872727535e-05, + "loss": 2.449, + "step": 15304 + }, + { + "epoch": 1.235170688402873, + "grad_norm": 0.6881443858146667, + "learning_rate": 2.62550854410228e-05, + "loss": 2.3991, + "step": 15305 + }, + { + "epoch": 1.2352513921394561, + "grad_norm": 0.6681519746780396, + "learning_rate": 2.624442384723407e-05, + "loss": 2.4005, + "step": 15306 + }, + { + "epoch": 1.235332095876039, + "grad_norm": 0.6728120446205139, + "learning_rate": 2.62337640916271e-05, + "loss": 2.4242, + "step": 15307 + }, + { + "epoch": 1.235412799612622, + "grad_norm": 0.707360029220581, + "learning_rate": 2.622310617446755e-05, + "loss": 2.4385, + "step": 15308 + }, + { + "epoch": 1.2354935033492052, + "grad_norm": 0.6890079975128174, + "learning_rate": 2.6212450096021058e-05, + "loss": 2.443, + "step": 15309 + }, + { + "epoch": 1.235574207085788, + "grad_norm": 0.7022379636764526, + "learning_rate": 2.620179585655318e-05, + "loss": 2.3982, + "step": 15310 + }, + { + "epoch": 1.235654910822371, + "grad_norm": 0.7283182740211487, + "learning_rate": 2.61911434563295e-05, + "loss": 2.4197, + "step": 15311 + }, + { + "epoch": 1.235735614558954, + "grad_norm": 0.6721852421760559, + "learning_rate": 2.6180492895615426e-05, + "loss": 2.4356, + "step": 15312 + }, + { + "epoch": 1.235816318295537, + "grad_norm": 0.6817916631698608, + "learning_rate": 2.616984417467645e-05, + "loss": 2.4325, + "step": 15313 + }, + { + "epoch": 1.2358970220321202, + "grad_norm": 0.6826596260070801, + "learning_rate": 2.6159197293777972e-05, + "loss": 2.4043, + "step": 15314 + }, + { + "epoch": 1.235977725768703, + "grad_norm": 0.7135530114173889, + "learning_rate": 
2.6148552253185288e-05, + "loss": 2.4269, + "step": 15315 + }, + { + "epoch": 1.236058429505286, + "grad_norm": 0.7027753591537476, + "learning_rate": 2.6137909053163722e-05, + "loss": 2.4266, + "step": 15316 + }, + { + "epoch": 1.2361391332418692, + "grad_norm": 0.6597041487693787, + "learning_rate": 2.6127267693978552e-05, + "loss": 2.4073, + "step": 15317 + }, + { + "epoch": 1.236219836978452, + "grad_norm": 0.6450026631355286, + "learning_rate": 2.6116628175894974e-05, + "loss": 2.4299, + "step": 15318 + }, + { + "epoch": 1.2363005407150351, + "grad_norm": 0.7740476727485657, + "learning_rate": 2.6105990499178156e-05, + "loss": 2.4088, + "step": 15319 + }, + { + "epoch": 1.2363812444516182, + "grad_norm": 0.6460183262825012, + "learning_rate": 2.609535466409322e-05, + "loss": 2.4311, + "step": 15320 + }, + { + "epoch": 1.236461948188201, + "grad_norm": 0.6514838337898254, + "learning_rate": 2.608472067090525e-05, + "loss": 2.4069, + "step": 15321 + }, + { + "epoch": 1.2365426519247842, + "grad_norm": 0.7281234860420227, + "learning_rate": 2.6074088519879237e-05, + "loss": 2.4245, + "step": 15322 + }, + { + "epoch": 1.236623355661367, + "grad_norm": 0.752983570098877, + "learning_rate": 2.606345821128018e-05, + "loss": 2.4149, + "step": 15323 + }, + { + "epoch": 1.2367040593979501, + "grad_norm": 0.6912856101989746, + "learning_rate": 2.6052829745373054e-05, + "loss": 2.4489, + "step": 15324 + }, + { + "epoch": 1.236784763134533, + "grad_norm": 0.6719293594360352, + "learning_rate": 2.604220312242267e-05, + "loss": 2.457, + "step": 15325 + }, + { + "epoch": 1.236865466871116, + "grad_norm": 0.7440586090087891, + "learning_rate": 2.6031578342693918e-05, + "loss": 2.4657, + "step": 15326 + }, + { + "epoch": 1.2369461706076992, + "grad_norm": 0.694442629814148, + "learning_rate": 2.602095540645162e-05, + "loss": 2.4422, + "step": 15327 + }, + { + "epoch": 1.237026874344282, + "grad_norm": 0.7186843752861023, + "learning_rate": 2.601033431396046e-05, + "loss": 
2.4229, + "step": 15328 + }, + { + "epoch": 1.2371075780808651, + "grad_norm": 0.7401825785636902, + "learning_rate": 2.5999715065485153e-05, + "loss": 2.45, + "step": 15329 + }, + { + "epoch": 1.2371882818174482, + "grad_norm": 0.6710138916969299, + "learning_rate": 2.598909766129045e-05, + "loss": 2.4074, + "step": 15330 + }, + { + "epoch": 1.237268985554031, + "grad_norm": 0.7867769598960876, + "learning_rate": 2.5978482101640867e-05, + "loss": 2.4709, + "step": 15331 + }, + { + "epoch": 1.2373496892906142, + "grad_norm": 0.7076219916343689, + "learning_rate": 2.5967868386801e-05, + "loss": 2.4887, + "step": 15332 + }, + { + "epoch": 1.2374303930271973, + "grad_norm": 0.7277626991271973, + "learning_rate": 2.5957256517035378e-05, + "loss": 2.4295, + "step": 15333 + }, + { + "epoch": 1.2375110967637801, + "grad_norm": 0.7339804768562317, + "learning_rate": 2.5946646492608506e-05, + "loss": 2.4624, + "step": 15334 + }, + { + "epoch": 1.2375918005003632, + "grad_norm": 0.6707656383514404, + "learning_rate": 2.593603831378475e-05, + "loss": 2.4159, + "step": 15335 + }, + { + "epoch": 1.237672504236946, + "grad_norm": 0.7118813991546631, + "learning_rate": 2.592543198082852e-05, + "loss": 2.4496, + "step": 15336 + }, + { + "epoch": 1.2377532079735292, + "grad_norm": 0.675167977809906, + "learning_rate": 2.591482749400419e-05, + "loss": 2.4519, + "step": 15337 + }, + { + "epoch": 1.2378339117101123, + "grad_norm": 0.8245306611061096, + "learning_rate": 2.5904224853575986e-05, + "loss": 2.4732, + "step": 15338 + }, + { + "epoch": 1.2379146154466951, + "grad_norm": 0.7411863207817078, + "learning_rate": 2.5893624059808184e-05, + "loss": 2.4458, + "step": 15339 + }, + { + "epoch": 1.2379953191832782, + "grad_norm": 0.6864522695541382, + "learning_rate": 2.5883025112964997e-05, + "loss": 2.4264, + "step": 15340 + }, + { + "epoch": 1.238076022919861, + "grad_norm": 0.6585919260978699, + "learning_rate": 2.5872428013310567e-05, + "loss": 2.3904, + "step": 15341 + }, + { + 
"epoch": 1.2381567266564442, + "grad_norm": 0.6605508327484131, + "learning_rate": 2.5861832761108995e-05, + "loss": 2.4828, + "step": 15342 + }, + { + "epoch": 1.2382374303930272, + "grad_norm": 0.7353223562240601, + "learning_rate": 2.5851239356624392e-05, + "loss": 2.4335, + "step": 15343 + }, + { + "epoch": 1.2383181341296101, + "grad_norm": 0.6907783150672913, + "learning_rate": 2.5840647800120688e-05, + "loss": 2.4394, + "step": 15344 + }, + { + "epoch": 1.2383988378661932, + "grad_norm": 0.7239590287208557, + "learning_rate": 2.5830058091861896e-05, + "loss": 2.4221, + "step": 15345 + }, + { + "epoch": 1.2384795416027763, + "grad_norm": 0.7001412510871887, + "learning_rate": 2.5819470232111975e-05, + "loss": 2.4521, + "step": 15346 + }, + { + "epoch": 1.2385602453393592, + "grad_norm": 0.6983658671379089, + "learning_rate": 2.580888422113473e-05, + "loss": 2.4839, + "step": 15347 + }, + { + "epoch": 1.2386409490759422, + "grad_norm": 0.7829005718231201, + "learning_rate": 2.5798300059194037e-05, + "loss": 2.4546, + "step": 15348 + }, + { + "epoch": 1.2387216528125253, + "grad_norm": 0.7248061299324036, + "learning_rate": 2.5787717746553664e-05, + "loss": 2.4341, + "step": 15349 + }, + { + "epoch": 1.2388023565491082, + "grad_norm": 0.7921163439750671, + "learning_rate": 2.577713728347736e-05, + "loss": 2.475, + "step": 15350 + }, + { + "epoch": 1.2388830602856913, + "grad_norm": 0.6571238040924072, + "learning_rate": 2.5766558670228813e-05, + "loss": 2.4636, + "step": 15351 + }, + { + "epoch": 1.2389637640222741, + "grad_norm": 0.7436683177947998, + "learning_rate": 2.575598190707168e-05, + "loss": 2.4868, + "step": 15352 + }, + { + "epoch": 1.2390444677588572, + "grad_norm": 0.6471900939941406, + "learning_rate": 2.5745406994269573e-05, + "loss": 2.4349, + "step": 15353 + }, + { + "epoch": 1.2391251714954403, + "grad_norm": 0.6612011194229126, + "learning_rate": 2.5734833932086012e-05, + "loss": 2.4088, + "step": 15354 + }, + { + "epoch": 
1.2392058752320232, + "grad_norm": 0.6882977485656738, + "learning_rate": 2.572426272078451e-05, + "loss": 2.4344, + "step": 15355 + }, + { + "epoch": 1.2392865789686063, + "grad_norm": 0.6836830973625183, + "learning_rate": 2.5713693360628565e-05, + "loss": 2.4325, + "step": 15356 + }, + { + "epoch": 1.2393672827051891, + "grad_norm": 0.712127149105072, + "learning_rate": 2.5703125851881536e-05, + "loss": 2.4505, + "step": 15357 + }, + { + "epoch": 1.2394479864417722, + "grad_norm": 0.7162468433380127, + "learning_rate": 2.5692560194806837e-05, + "loss": 2.4167, + "step": 15358 + }, + { + "epoch": 1.2395286901783553, + "grad_norm": 0.7770177125930786, + "learning_rate": 2.568199638966777e-05, + "loss": 2.4072, + "step": 15359 + }, + { + "epoch": 1.2396093939149382, + "grad_norm": 0.7049651741981506, + "learning_rate": 2.5671434436727636e-05, + "loss": 2.434, + "step": 15360 + }, + { + "epoch": 1.2396900976515213, + "grad_norm": 0.7793349027633667, + "learning_rate": 2.566087433624964e-05, + "loss": 2.4762, + "step": 15361 + }, + { + "epoch": 1.2397708013881044, + "grad_norm": 0.6776690483093262, + "learning_rate": 2.5650316088497018e-05, + "loss": 2.402, + "step": 15362 + }, + { + "epoch": 1.2398515051246872, + "grad_norm": 0.7207701802253723, + "learning_rate": 2.5639759693732834e-05, + "loss": 2.4398, + "step": 15363 + }, + { + "epoch": 1.2399322088612703, + "grad_norm": 0.759787917137146, + "learning_rate": 2.5629205152220215e-05, + "loss": 2.4268, + "step": 15364 + }, + { + "epoch": 1.2400129125978534, + "grad_norm": 0.6906142830848694, + "learning_rate": 2.5618652464222215e-05, + "loss": 2.4075, + "step": 15365 + }, + { + "epoch": 1.2400936163344363, + "grad_norm": 0.7002954483032227, + "learning_rate": 2.560810163000187e-05, + "loss": 2.4516, + "step": 15366 + }, + { + "epoch": 1.2401743200710194, + "grad_norm": 0.7287559509277344, + "learning_rate": 2.5597552649822053e-05, + "loss": 2.4975, + "step": 15367 + }, + { + "epoch": 1.2402550238076022, + 
"grad_norm": 0.6523926854133606, + "learning_rate": 2.558700552394572e-05, + "loss": 2.4085, + "step": 15368 + }, + { + "epoch": 1.2403357275441853, + "grad_norm": 0.7289387583732605, + "learning_rate": 2.5576460252635727e-05, + "loss": 2.4789, + "step": 15369 + }, + { + "epoch": 1.2404164312807684, + "grad_norm": 0.6613432765007019, + "learning_rate": 2.5565916836154878e-05, + "loss": 2.4263, + "step": 15370 + }, + { + "epoch": 1.2404971350173513, + "grad_norm": 0.7275245785713196, + "learning_rate": 2.555537527476597e-05, + "loss": 2.4652, + "step": 15371 + }, + { + "epoch": 1.2405778387539343, + "grad_norm": 0.6726976037025452, + "learning_rate": 2.554483556873173e-05, + "loss": 2.4092, + "step": 15372 + }, + { + "epoch": 1.2406585424905172, + "grad_norm": 0.6908233761787415, + "learning_rate": 2.5534297718314794e-05, + "loss": 2.3678, + "step": 15373 + }, + { + "epoch": 1.2407392462271003, + "grad_norm": 0.6893147826194763, + "learning_rate": 2.5523761723777806e-05, + "loss": 2.4625, + "step": 15374 + }, + { + "epoch": 1.2408199499636834, + "grad_norm": 0.7640267014503479, + "learning_rate": 2.551322758538339e-05, + "loss": 2.446, + "step": 15375 + }, + { + "epoch": 1.2409006537002663, + "grad_norm": 0.7187458276748657, + "learning_rate": 2.550269530339402e-05, + "loss": 2.4215, + "step": 15376 + }, + { + "epoch": 1.2409813574368493, + "grad_norm": 0.8041789531707764, + "learning_rate": 2.5492164878072234e-05, + "loss": 2.5085, + "step": 15377 + }, + { + "epoch": 1.2410620611734324, + "grad_norm": 0.6582188010215759, + "learning_rate": 2.5481636309680445e-05, + "loss": 2.467, + "step": 15378 + }, + { + "epoch": 1.2411427649100153, + "grad_norm": 0.705731213092804, + "learning_rate": 2.5471109598481112e-05, + "loss": 2.3764, + "step": 15379 + }, + { + "epoch": 1.2412234686465984, + "grad_norm": 0.6918940544128418, + "learning_rate": 2.5460584744736495e-05, + "loss": 2.4513, + "step": 15380 + }, + { + "epoch": 1.2413041723831812, + "grad_norm": 
0.7402673959732056, + "learning_rate": 2.5450061748708975e-05, + "loss": 2.5133, + "step": 15381 + }, + { + "epoch": 1.2413848761197643, + "grad_norm": 0.6740667223930359, + "learning_rate": 2.543954061066083e-05, + "loss": 2.4649, + "step": 15382 + }, + { + "epoch": 1.2414655798563474, + "grad_norm": 0.6665407419204712, + "learning_rate": 2.5429021330854197e-05, + "loss": 2.4321, + "step": 15383 + }, + { + "epoch": 1.2415462835929303, + "grad_norm": 0.7324530482292175, + "learning_rate": 2.5418503909551296e-05, + "loss": 2.3574, + "step": 15384 + }, + { + "epoch": 1.2416269873295134, + "grad_norm": 0.7117868661880493, + "learning_rate": 2.5407988347014255e-05, + "loss": 2.4552, + "step": 15385 + }, + { + "epoch": 1.2417076910660962, + "grad_norm": 0.7162930965423584, + "learning_rate": 2.5397474643505103e-05, + "loss": 2.4135, + "step": 15386 + }, + { + "epoch": 1.2417883948026793, + "grad_norm": 0.7301257848739624, + "learning_rate": 2.5386962799285895e-05, + "loss": 2.4277, + "step": 15387 + }, + { + "epoch": 1.2418690985392624, + "grad_norm": 0.7404977679252625, + "learning_rate": 2.5376452814618645e-05, + "loss": 2.478, + "step": 15388 + }, + { + "epoch": 1.2419498022758453, + "grad_norm": 0.6546272039413452, + "learning_rate": 2.536594468976522e-05, + "loss": 2.4879, + "step": 15389 + }, + { + "epoch": 1.2420305060124284, + "grad_norm": 0.6501599550247192, + "learning_rate": 2.5355438424987565e-05, + "loss": 2.3964, + "step": 15390 + }, + { + "epoch": 1.2421112097490115, + "grad_norm": 0.6711748242378235, + "learning_rate": 2.5344934020547496e-05, + "loss": 2.4123, + "step": 15391 + }, + { + "epoch": 1.2421919134855943, + "grad_norm": 0.6803534030914307, + "learning_rate": 2.5334431476706823e-05, + "loss": 2.4271, + "step": 15392 + }, + { + "epoch": 1.2422726172221774, + "grad_norm": 0.7407296299934387, + "learning_rate": 2.5323930793727302e-05, + "loss": 2.49, + "step": 15393 + }, + { + "epoch": 1.2423533209587605, + "grad_norm": 0.701870858669281, + 
"learning_rate": 2.5313431971870617e-05, + "loss": 2.4534, + "step": 15394 + }, + { + "epoch": 1.2424340246953434, + "grad_norm": 0.6658090353012085, + "learning_rate": 2.5302935011398475e-05, + "loss": 2.4581, + "step": 15395 + }, + { + "epoch": 1.2425147284319265, + "grad_norm": 0.6616473197937012, + "learning_rate": 2.529243991257243e-05, + "loss": 2.4169, + "step": 15396 + }, + { + "epoch": 1.2425954321685093, + "grad_norm": 0.6714773178100586, + "learning_rate": 2.5281946675654067e-05, + "loss": 2.4159, + "step": 15397 + }, + { + "epoch": 1.2426761359050924, + "grad_norm": 0.6789337396621704, + "learning_rate": 2.5271455300904935e-05, + "loss": 2.4211, + "step": 15398 + }, + { + "epoch": 1.2427568396416755, + "grad_norm": 0.6793739795684814, + "learning_rate": 2.5260965788586456e-05, + "loss": 2.4337, + "step": 15399 + }, + { + "epoch": 1.2428375433782584, + "grad_norm": 0.6432294249534607, + "learning_rate": 2.5250478138960076e-05, + "loss": 2.4268, + "step": 15400 + }, + { + "epoch": 1.2429182471148414, + "grad_norm": 0.6960669159889221, + "learning_rate": 2.523999235228718e-05, + "loss": 2.3535, + "step": 15401 + }, + { + "epoch": 1.2429989508514243, + "grad_norm": 0.6724488735198975, + "learning_rate": 2.5229508428829096e-05, + "loss": 2.4294, + "step": 15402 + }, + { + "epoch": 1.2430796545880074, + "grad_norm": 0.636105477809906, + "learning_rate": 2.521902636884711e-05, + "loss": 2.4438, + "step": 15403 + }, + { + "epoch": 1.2431603583245905, + "grad_norm": 0.6865580677986145, + "learning_rate": 2.52085461726025e-05, + "loss": 2.4473, + "step": 15404 + }, + { + "epoch": 1.2432410620611734, + "grad_norm": 0.6740261316299438, + "learning_rate": 2.5198067840356398e-05, + "loss": 2.4642, + "step": 15405 + }, + { + "epoch": 1.2433217657977564, + "grad_norm": 0.7241789698600769, + "learning_rate": 2.518759137236998e-05, + "loss": 2.4294, + "step": 15406 + }, + { + "epoch": 1.2434024695343395, + "grad_norm": 0.6839794516563416, + "learning_rate": 
2.5177116768904373e-05, + "loss": 2.4697, + "step": 15407 + }, + { + "epoch": 1.2434831732709224, + "grad_norm": 0.677390992641449, + "learning_rate": 2.5166644030220578e-05, + "loss": 2.4411, + "step": 15408 + }, + { + "epoch": 1.2435638770075055, + "grad_norm": 0.709065854549408, + "learning_rate": 2.515617315657962e-05, + "loss": 2.4392, + "step": 15409 + }, + { + "epoch": 1.2436445807440886, + "grad_norm": 0.6735498905181885, + "learning_rate": 2.514570414824249e-05, + "loss": 2.3924, + "step": 15410 + }, + { + "epoch": 1.2437252844806714, + "grad_norm": 0.6729374527931213, + "learning_rate": 2.513523700547007e-05, + "loss": 2.4464, + "step": 15411 + }, + { + "epoch": 1.2438059882172545, + "grad_norm": 0.7232720851898193, + "learning_rate": 2.5124771728523244e-05, + "loss": 2.3975, + "step": 15412 + }, + { + "epoch": 1.2438866919538374, + "grad_norm": 0.7467584609985352, + "learning_rate": 2.5114308317662837e-05, + "loss": 2.4191, + "step": 15413 + }, + { + "epoch": 1.2439673956904205, + "grad_norm": 0.6951141953468323, + "learning_rate": 2.5103846773149642e-05, + "loss": 2.4207, + "step": 15414 + }, + { + "epoch": 1.2440480994270036, + "grad_norm": 0.6427489519119263, + "learning_rate": 2.5093387095244336e-05, + "loss": 2.3539, + "step": 15415 + }, + { + "epoch": 1.2441288031635864, + "grad_norm": 0.729580283164978, + "learning_rate": 2.5082929284207644e-05, + "loss": 2.4464, + "step": 15416 + }, + { + "epoch": 1.2442095069001695, + "grad_norm": 0.7247009873390198, + "learning_rate": 2.5072473340300207e-05, + "loss": 2.4294, + "step": 15417 + }, + { + "epoch": 1.2442902106367524, + "grad_norm": 0.7037674784660339, + "learning_rate": 2.5062019263782577e-05, + "loss": 2.4294, + "step": 15418 + }, + { + "epoch": 1.2443709143733355, + "grad_norm": 0.6997841596603394, + "learning_rate": 2.5051567054915303e-05, + "loss": 2.4976, + "step": 15419 + }, + { + "epoch": 1.2444516181099186, + "grad_norm": 0.7001172304153442, + "learning_rate": 2.504111671395891e-05, + 
"loss": 2.371, + "step": 15420 + }, + { + "epoch": 1.2445323218465014, + "grad_norm": 0.6781473159790039, + "learning_rate": 2.5030668241173827e-05, + "loss": 2.4124, + "step": 15421 + }, + { + "epoch": 1.2446130255830845, + "grad_norm": 0.7053182125091553, + "learning_rate": 2.5020221636820463e-05, + "loss": 2.4109, + "step": 15422 + }, + { + "epoch": 1.2446937293196676, + "grad_norm": 0.68635493516922, + "learning_rate": 2.50097769011592e-05, + "loss": 2.4548, + "step": 15423 + }, + { + "epoch": 1.2447744330562505, + "grad_norm": 0.7015564441680908, + "learning_rate": 2.4999334034450293e-05, + "loss": 2.4537, + "step": 15424 + }, + { + "epoch": 1.2448551367928335, + "grad_norm": 0.694054901599884, + "learning_rate": 2.4988893036954043e-05, + "loss": 2.4396, + "step": 15425 + }, + { + "epoch": 1.2449358405294164, + "grad_norm": 0.702518880367279, + "learning_rate": 2.4978453908930665e-05, + "loss": 2.4015, + "step": 15426 + }, + { + "epoch": 1.2450165442659995, + "grad_norm": 0.7237387895584106, + "learning_rate": 2.4968016650640348e-05, + "loss": 2.4257, + "step": 15427 + }, + { + "epoch": 1.2450972480025826, + "grad_norm": 0.7133163809776306, + "learning_rate": 2.4957581262343154e-05, + "loss": 2.4532, + "step": 15428 + }, + { + "epoch": 1.2451779517391655, + "grad_norm": 0.8339287042617798, + "learning_rate": 2.4947147744299203e-05, + "loss": 2.4621, + "step": 15429 + }, + { + "epoch": 1.2452586554757485, + "grad_norm": 0.7620034217834473, + "learning_rate": 2.493671609676852e-05, + "loss": 2.365, + "step": 15430 + }, + { + "epoch": 1.2453393592123314, + "grad_norm": 0.7445465922355652, + "learning_rate": 2.4926286320011094e-05, + "loss": 2.4764, + "step": 15431 + }, + { + "epoch": 1.2454200629489145, + "grad_norm": 0.7366160154342651, + "learning_rate": 2.4915858414286852e-05, + "loss": 2.4597, + "step": 15432 + }, + { + "epoch": 1.2455007666854976, + "grad_norm": 0.7098437547683716, + "learning_rate": 2.490543237985572e-05, + "loss": 2.4202, + "step": 15433 + 
}, + { + "epoch": 1.2455814704220805, + "grad_norm": 0.6483333706855774, + "learning_rate": 2.4895008216977478e-05, + "loss": 2.4108, + "step": 15434 + }, + { + "epoch": 1.2456621741586635, + "grad_norm": 0.6797904968261719, + "learning_rate": 2.4884585925911963e-05, + "loss": 2.4414, + "step": 15435 + }, + { + "epoch": 1.2457428778952466, + "grad_norm": 0.6853424310684204, + "learning_rate": 2.4874165506918957e-05, + "loss": 2.4226, + "step": 15436 + }, + { + "epoch": 1.2458235816318295, + "grad_norm": 0.6861590147018433, + "learning_rate": 2.4863746960258094e-05, + "loss": 2.3748, + "step": 15437 + }, + { + "epoch": 1.2459042853684126, + "grad_norm": 0.7360263466835022, + "learning_rate": 2.4853330286189058e-05, + "loss": 2.4441, + "step": 15438 + }, + { + "epoch": 1.2459849891049957, + "grad_norm": 0.6894183158874512, + "learning_rate": 2.4842915484971496e-05, + "loss": 2.3495, + "step": 15439 + }, + { + "epoch": 1.2460656928415785, + "grad_norm": 0.7570669651031494, + "learning_rate": 2.4832502556864923e-05, + "loss": 2.4622, + "step": 15440 + }, + { + "epoch": 1.2461463965781616, + "grad_norm": 0.6986069083213806, + "learning_rate": 2.4822091502128876e-05, + "loss": 2.3647, + "step": 15441 + }, + { + "epoch": 1.2462271003147445, + "grad_norm": 0.681450366973877, + "learning_rate": 2.481168232102279e-05, + "loss": 2.3872, + "step": 15442 + }, + { + "epoch": 1.2463078040513276, + "grad_norm": 0.7241837978363037, + "learning_rate": 2.480127501380618e-05, + "loss": 2.4692, + "step": 15443 + }, + { + "epoch": 1.2463885077879107, + "grad_norm": 0.6575295329093933, + "learning_rate": 2.479086958073834e-05, + "loss": 2.5057, + "step": 15444 + }, + { + "epoch": 1.2464692115244935, + "grad_norm": 0.7289770841598511, + "learning_rate": 2.478046602207864e-05, + "loss": 2.4164, + "step": 15445 + }, + { + "epoch": 1.2465499152610766, + "grad_norm": 0.6682024598121643, + "learning_rate": 2.4770064338086374e-05, + "loss": 2.4466, + "step": 15446 + }, + { + "epoch": 
1.2466306189976595, + "grad_norm": 0.7238918542861938, + "learning_rate": 2.475966452902072e-05, + "loss": 2.4367, + "step": 15447 + }, + { + "epoch": 1.2467113227342426, + "grad_norm": 0.6825705170631409, + "learning_rate": 2.4749266595140918e-05, + "loss": 2.4337, + "step": 15448 + }, + { + "epoch": 1.2467920264708257, + "grad_norm": 0.7352269887924194, + "learning_rate": 2.4738870536706126e-05, + "loss": 2.4103, + "step": 15449 + }, + { + "epoch": 1.2468727302074085, + "grad_norm": 0.658930778503418, + "learning_rate": 2.4728476353975394e-05, + "loss": 2.4281, + "step": 15450 + }, + { + "epoch": 1.2469534339439916, + "grad_norm": 0.6933601498603821, + "learning_rate": 2.4718084047207778e-05, + "loss": 2.4502, + "step": 15451 + }, + { + "epoch": 1.2470341376805747, + "grad_norm": 0.6901879906654358, + "learning_rate": 2.4707693616662308e-05, + "loss": 2.4057, + "step": 15452 + }, + { + "epoch": 1.2471148414171576, + "grad_norm": 0.7648913860321045, + "learning_rate": 2.469730506259792e-05, + "loss": 2.4163, + "step": 15453 + }, + { + "epoch": 1.2471955451537406, + "grad_norm": 0.6496175527572632, + "learning_rate": 2.4686918385273537e-05, + "loss": 2.4373, + "step": 15454 + }, + { + "epoch": 1.2472762488903237, + "grad_norm": 0.6949105858802795, + "learning_rate": 2.4676533584948048e-05, + "loss": 2.4108, + "step": 15455 + }, + { + "epoch": 1.2473569526269066, + "grad_norm": 0.7018688321113586, + "learning_rate": 2.4666150661880206e-05, + "loss": 2.4589, + "step": 15456 + }, + { + "epoch": 1.2474376563634897, + "grad_norm": 0.7141219973564148, + "learning_rate": 2.4655769616328827e-05, + "loss": 2.4022, + "step": 15457 + }, + { + "epoch": 1.2475183601000726, + "grad_norm": 0.7276743054389954, + "learning_rate": 2.4645390448552608e-05, + "loss": 2.4443, + "step": 15458 + }, + { + "epoch": 1.2475990638366556, + "grad_norm": 0.6861153244972229, + "learning_rate": 2.463501315881027e-05, + "loss": 2.4478, + "step": 15459 + }, + { + "epoch": 1.2476797675732387, + 
"grad_norm": 0.7252256274223328, + "learning_rate": 2.462463774736038e-05, + "loss": 2.446, + "step": 15460 + }, + { + "epoch": 1.2477604713098216, + "grad_norm": 0.6914857625961304, + "learning_rate": 2.4614264214461557e-05, + "loss": 2.4294, + "step": 15461 + }, + { + "epoch": 1.2478411750464047, + "grad_norm": 0.6815036535263062, + "learning_rate": 2.460389256037232e-05, + "loss": 2.4389, + "step": 15462 + }, + { + "epoch": 1.2479218787829875, + "grad_norm": 0.7420194745063782, + "learning_rate": 2.4593522785351176e-05, + "loss": 2.4932, + "step": 15463 + }, + { + "epoch": 1.2480025825195706, + "grad_norm": 0.6622182130813599, + "learning_rate": 2.4583154889656556e-05, + "loss": 2.4327, + "step": 15464 + }, + { + "epoch": 1.2480832862561537, + "grad_norm": 0.6527934074401855, + "learning_rate": 2.457278887354689e-05, + "loss": 2.3857, + "step": 15465 + }, + { + "epoch": 1.2481639899927366, + "grad_norm": 0.6942344903945923, + "learning_rate": 2.4562424737280465e-05, + "loss": 2.4181, + "step": 15466 + }, + { + "epoch": 1.2482446937293197, + "grad_norm": 0.7449823021888733, + "learning_rate": 2.45520624811156e-05, + "loss": 2.4575, + "step": 15467 + }, + { + "epoch": 1.2483253974659028, + "grad_norm": 0.6905208826065063, + "learning_rate": 2.4541702105310605e-05, + "loss": 2.3858, + "step": 15468 + }, + { + "epoch": 1.2484061012024856, + "grad_norm": 0.6928502917289734, + "learning_rate": 2.4531343610123603e-05, + "loss": 2.4212, + "step": 15469 + }, + { + "epoch": 1.2484868049390687, + "grad_norm": 0.7182145118713379, + "learning_rate": 2.45209869958128e-05, + "loss": 2.4063, + "step": 15470 + }, + { + "epoch": 1.2485675086756518, + "grad_norm": 0.7379452586174011, + "learning_rate": 2.4510632262636314e-05, + "loss": 2.4612, + "step": 15471 + }, + { + "epoch": 1.2486482124122347, + "grad_norm": 0.6663349270820618, + "learning_rate": 2.450027941085219e-05, + "loss": 2.4583, + "step": 15472 + }, + { + "epoch": 1.2487289161488178, + "grad_norm": 0.7266560792922974, 
+ "learning_rate": 2.4489928440718467e-05, + "loss": 2.4483, + "step": 15473 + }, + { + "epoch": 1.2488096198854006, + "grad_norm": 0.7046550512313843, + "learning_rate": 2.447957935249311e-05, + "loss": 2.4087, + "step": 15474 + }, + { + "epoch": 1.2488903236219837, + "grad_norm": 0.684248685836792, + "learning_rate": 2.4469232146434084e-05, + "loss": 2.4352, + "step": 15475 + }, + { + "epoch": 1.2489710273585668, + "grad_norm": 0.6864973902702332, + "learning_rate": 2.4458886822799198e-05, + "loss": 2.3872, + "step": 15476 + }, + { + "epoch": 1.2490517310951497, + "grad_norm": 0.6964752674102783, + "learning_rate": 2.444854338184631e-05, + "loss": 2.437, + "step": 15477 + }, + { + "epoch": 1.2491324348317328, + "grad_norm": 0.6755973100662231, + "learning_rate": 2.4438201823833252e-05, + "loss": 2.4302, + "step": 15478 + }, + { + "epoch": 1.2492131385683156, + "grad_norm": 0.6434857249259949, + "learning_rate": 2.44278621490177e-05, + "loss": 2.406, + "step": 15479 + }, + { + "epoch": 1.2492938423048987, + "grad_norm": 0.7342328429222107, + "learning_rate": 2.441752435765736e-05, + "loss": 2.451, + "step": 15480 + }, + { + "epoch": 1.2493745460414818, + "grad_norm": 0.7486860752105713, + "learning_rate": 2.44071884500099e-05, + "loss": 2.4536, + "step": 15481 + }, + { + "epoch": 1.2494552497780647, + "grad_norm": 0.7274537086486816, + "learning_rate": 2.4396854426332903e-05, + "loss": 2.4599, + "step": 15482 + }, + { + "epoch": 1.2495359535146477, + "grad_norm": 0.7580124735832214, + "learning_rate": 2.4386522286883918e-05, + "loss": 2.4038, + "step": 15483 + }, + { + "epoch": 1.2496166572512308, + "grad_norm": 0.6776975393295288, + "learning_rate": 2.4376192031920488e-05, + "loss": 2.4246, + "step": 15484 + }, + { + "epoch": 1.2496973609878137, + "grad_norm": 0.6899511814117432, + "learning_rate": 2.4365863661699996e-05, + "loss": 2.3922, + "step": 15485 + }, + { + "epoch": 1.2497780647243968, + "grad_norm": 0.7487930059432983, + "learning_rate": 
2.4355537176479903e-05, + "loss": 2.4573, + "step": 15486 + }, + { + "epoch": 1.2498587684609797, + "grad_norm": 0.7306599617004395, + "learning_rate": 2.4345212576517575e-05, + "loss": 2.4745, + "step": 15487 + }, + { + "epoch": 1.2499394721975627, + "grad_norm": 0.7152543067932129, + "learning_rate": 2.43348898620703e-05, + "loss": 2.4768, + "step": 15488 + }, + { + "epoch": 1.2500201759341458, + "grad_norm": 0.6576277017593384, + "learning_rate": 2.432456903339535e-05, + "loss": 2.4289, + "step": 15489 + }, + { + "epoch": 1.2501008796707287, + "grad_norm": 0.6974572539329529, + "learning_rate": 2.4314250090749956e-05, + "loss": 2.4218, + "step": 15490 + }, + { + "epoch": 1.2501815834073118, + "grad_norm": 0.7869577407836914, + "learning_rate": 2.4303933034391323e-05, + "loss": 2.3899, + "step": 15491 + }, + { + "epoch": 1.2502622871438946, + "grad_norm": 0.6723129749298096, + "learning_rate": 2.42936178645765e-05, + "loss": 2.4238, + "step": 15492 + }, + { + "epoch": 1.2503429908804777, + "grad_norm": 0.6839526891708374, + "learning_rate": 2.428330458156265e-05, + "loss": 2.4037, + "step": 15493 + }, + { + "epoch": 1.2504236946170608, + "grad_norm": 0.6866093277931213, + "learning_rate": 2.4272993185606796e-05, + "loss": 2.4228, + "step": 15494 + }, + { + "epoch": 1.2505043983536437, + "grad_norm": 0.6992947459220886, + "learning_rate": 2.426268367696588e-05, + "loss": 2.4248, + "step": 15495 + }, + { + "epoch": 1.2505851020902268, + "grad_norm": 0.6836698651313782, + "learning_rate": 2.4252376055896862e-05, + "loss": 2.5387, + "step": 15496 + }, + { + "epoch": 1.2506658058268099, + "grad_norm": 0.6990752816200256, + "learning_rate": 2.4242070322656663e-05, + "loss": 2.4438, + "step": 15497 + }, + { + "epoch": 1.2507465095633927, + "grad_norm": 0.7143029570579529, + "learning_rate": 2.4231766477502082e-05, + "loss": 2.4, + "step": 15498 + }, + { + "epoch": 1.2508272132999758, + "grad_norm": 0.6585043668746948, + "learning_rate": 2.422146452068994e-05, + "loss": 
2.4256, + "step": 15499 + }, + { + "epoch": 1.250907917036559, + "grad_norm": 0.739107072353363, + "learning_rate": 2.421116445247702e-05, + "loss": 2.428, + "step": 15500 + }, + { + "epoch": 1.2509886207731418, + "grad_norm": 0.6675287485122681, + "learning_rate": 2.420086627311997e-05, + "loss": 2.5095, + "step": 15501 + }, + { + "epoch": 1.2510693245097249, + "grad_norm": 0.7133405804634094, + "learning_rate": 2.4190569982875467e-05, + "loss": 2.4719, + "step": 15502 + }, + { + "epoch": 1.2511500282463077, + "grad_norm": 0.710904061794281, + "learning_rate": 2.4180275582000134e-05, + "loss": 2.4449, + "step": 15503 + }, + { + "epoch": 1.2512307319828908, + "grad_norm": 0.7088729739189148, + "learning_rate": 2.4169983070750525e-05, + "loss": 2.4059, + "step": 15504 + }, + { + "epoch": 1.2513114357194737, + "grad_norm": 0.7187358736991882, + "learning_rate": 2.4159692449383152e-05, + "loss": 2.4577, + "step": 15505 + }, + { + "epoch": 1.2513921394560568, + "grad_norm": 0.7531955242156982, + "learning_rate": 2.4149403718154497e-05, + "loss": 2.4101, + "step": 15506 + }, + { + "epoch": 1.2514728431926398, + "grad_norm": 0.7565199136734009, + "learning_rate": 2.413911687732101e-05, + "loss": 2.4805, + "step": 15507 + }, + { + "epoch": 1.2515535469292227, + "grad_norm": 0.706471860408783, + "learning_rate": 2.4128831927139008e-05, + "loss": 2.4494, + "step": 15508 + }, + { + "epoch": 1.2516342506658058, + "grad_norm": 0.7022314667701721, + "learning_rate": 2.4118548867864832e-05, + "loss": 2.4442, + "step": 15509 + }, + { + "epoch": 1.251714954402389, + "grad_norm": 0.6885591745376587, + "learning_rate": 2.4108267699754806e-05, + "loss": 2.4186, + "step": 15510 + }, + { + "epoch": 1.2517956581389718, + "grad_norm": 0.6963610649108887, + "learning_rate": 2.409798842306511e-05, + "loss": 2.4209, + "step": 15511 + }, + { + "epoch": 1.2518763618755548, + "grad_norm": 0.7117185592651367, + "learning_rate": 2.4087711038051942e-05, + "loss": 2.4106, + "step": 15512 + }, + { 
+ "epoch": 1.251957065612138, + "grad_norm": 0.6944519281387329, + "learning_rate": 2.407743554497146e-05, + "loss": 2.4493, + "step": 15513 + }, + { + "epoch": 1.2520377693487208, + "grad_norm": 0.689818263053894, + "learning_rate": 2.406716194407974e-05, + "loss": 2.4358, + "step": 15514 + }, + { + "epoch": 1.2521184730853039, + "grad_norm": 0.8132768273353577, + "learning_rate": 2.4056890235632846e-05, + "loss": 2.4574, + "step": 15515 + }, + { + "epoch": 1.252199176821887, + "grad_norm": 0.6855002045631409, + "learning_rate": 2.4046620419886777e-05, + "loss": 2.4118, + "step": 15516 + }, + { + "epoch": 1.2522798805584698, + "grad_norm": 0.6616373658180237, + "learning_rate": 2.4036352497097458e-05, + "loss": 2.4332, + "step": 15517 + }, + { + "epoch": 1.252360584295053, + "grad_norm": 0.6657225489616394, + "learning_rate": 2.4026086467520803e-05, + "loss": 2.3989, + "step": 15518 + }, + { + "epoch": 1.2524412880316358, + "grad_norm": 0.6796447038650513, + "learning_rate": 2.4015822331412664e-05, + "loss": 2.4269, + "step": 15519 + }, + { + "epoch": 1.2525219917682189, + "grad_norm": 0.7168079614639282, + "learning_rate": 2.400556008902889e-05, + "loss": 2.4263, + "step": 15520 + }, + { + "epoch": 1.2526026955048017, + "grad_norm": 0.6985058188438416, + "learning_rate": 2.3995299740625186e-05, + "loss": 2.437, + "step": 15521 + }, + { + "epoch": 1.2526833992413848, + "grad_norm": 0.7078086137771606, + "learning_rate": 2.3985041286457287e-05, + "loss": 2.3996, + "step": 15522 + }, + { + "epoch": 1.252764102977968, + "grad_norm": 0.6989054083824158, + "learning_rate": 2.3974784726780865e-05, + "loss": 2.4717, + "step": 15523 + }, + { + "epoch": 1.2528448067145508, + "grad_norm": 0.747606098651886, + "learning_rate": 2.396453006185153e-05, + "loss": 2.4228, + "step": 15524 + }, + { + "epoch": 1.2529255104511339, + "grad_norm": 0.7500887513160706, + "learning_rate": 2.3954277291924876e-05, + "loss": 2.4636, + "step": 15525 + }, + { + "epoch": 1.253006214187717, + 
"grad_norm": 0.7710712552070618, + "learning_rate": 2.3944026417256437e-05, + "loss": 2.4405, + "step": 15526 + }, + { + "epoch": 1.2530869179242998, + "grad_norm": 0.7278285622596741, + "learning_rate": 2.3933777438101657e-05, + "loss": 2.4279, + "step": 15527 + }, + { + "epoch": 1.253167621660883, + "grad_norm": 0.6979010701179504, + "learning_rate": 2.3923530354715973e-05, + "loss": 2.4272, + "step": 15528 + }, + { + "epoch": 1.253248325397466, + "grad_norm": 0.7330336570739746, + "learning_rate": 2.3913285167354804e-05, + "loss": 2.3861, + "step": 15529 + }, + { + "epoch": 1.2533290291340489, + "grad_norm": 0.675499677658081, + "learning_rate": 2.3903041876273436e-05, + "loss": 2.3987, + "step": 15530 + }, + { + "epoch": 1.253409732870632, + "grad_norm": 0.6854682564735413, + "learning_rate": 2.3892800481727186e-05, + "loss": 2.4085, + "step": 15531 + }, + { + "epoch": 1.253490436607215, + "grad_norm": 0.713810384273529, + "learning_rate": 2.388256098397129e-05, + "loss": 2.3897, + "step": 15532 + }, + { + "epoch": 1.253571140343798, + "grad_norm": 0.683214545249939, + "learning_rate": 2.3872323383260953e-05, + "loss": 2.4526, + "step": 15533 + }, + { + "epoch": 1.253651844080381, + "grad_norm": 0.6718357801437378, + "learning_rate": 2.3862087679851318e-05, + "loss": 2.4612, + "step": 15534 + }, + { + "epoch": 1.2537325478169639, + "grad_norm": 0.722283124923706, + "learning_rate": 2.3851853873997488e-05, + "loss": 2.4163, + "step": 15535 + }, + { + "epoch": 1.253813251553547, + "grad_norm": 0.689393162727356, + "learning_rate": 2.384162196595453e-05, + "loss": 2.3984, + "step": 15536 + }, + { + "epoch": 1.2538939552901298, + "grad_norm": 0.7146410346031189, + "learning_rate": 2.3831391955977412e-05, + "loss": 2.4442, + "step": 15537 + }, + { + "epoch": 1.253974659026713, + "grad_norm": 0.6651021838188171, + "learning_rate": 2.3821163844321104e-05, + "loss": 2.4064, + "step": 15538 + }, + { + "epoch": 1.254055362763296, + "grad_norm": 0.7088985443115234, + 
"learning_rate": 2.381093763124056e-05, + "loss": 2.4831, + "step": 15539 + }, + { + "epoch": 1.2541360664998789, + "grad_norm": 0.661375105381012, + "learning_rate": 2.3800713316990588e-05, + "loss": 2.3657, + "step": 15540 + }, + { + "epoch": 1.254216770236462, + "grad_norm": 0.6870979070663452, + "learning_rate": 2.3790490901826012e-05, + "loss": 2.4208, + "step": 15541 + }, + { + "epoch": 1.254297473973045, + "grad_norm": 0.6256219148635864, + "learning_rate": 2.3780270386001657e-05, + "loss": 2.4182, + "step": 15542 + }, + { + "epoch": 1.254378177709628, + "grad_norm": 0.7070638537406921, + "learning_rate": 2.377005176977215e-05, + "loss": 2.3758, + "step": 15543 + }, + { + "epoch": 1.254458881446211, + "grad_norm": 0.6571370363235474, + "learning_rate": 2.3759835053392242e-05, + "loss": 2.3927, + "step": 15544 + }, + { + "epoch": 1.254539585182794, + "grad_norm": 0.644263744354248, + "learning_rate": 2.3749620237116565e-05, + "loss": 2.3992, + "step": 15545 + }, + { + "epoch": 1.254620288919377, + "grad_norm": 0.7127394676208496, + "learning_rate": 2.3739407321199648e-05, + "loss": 2.3942, + "step": 15546 + }, + { + "epoch": 1.25470099265596, + "grad_norm": 0.7274866104125977, + "learning_rate": 2.372919630589605e-05, + "loss": 2.5232, + "step": 15547 + }, + { + "epoch": 1.2547816963925431, + "grad_norm": 0.690138041973114, + "learning_rate": 2.3718987191460274e-05, + "loss": 2.4371, + "step": 15548 + }, + { + "epoch": 1.254862400129126, + "grad_norm": 0.6990681886672974, + "learning_rate": 2.3708779978146724e-05, + "loss": 2.4568, + "step": 15549 + }, + { + "epoch": 1.254943103865709, + "grad_norm": 0.7430790662765503, + "learning_rate": 2.3698574666209793e-05, + "loss": 2.423, + "step": 15550 + }, + { + "epoch": 1.255023807602292, + "grad_norm": 0.6991416215896606, + "learning_rate": 2.3688371255903828e-05, + "loss": 2.4529, + "step": 15551 + }, + { + "epoch": 1.255104511338875, + "grad_norm": 0.6733322739601135, + "learning_rate": 2.367816974748317e-05, + 
"loss": 2.4531, + "step": 15552 + }, + { + "epoch": 1.2551852150754579, + "grad_norm": 0.7460463047027588, + "learning_rate": 2.3667970141202e-05, + "loss": 2.4267, + "step": 15553 + }, + { + "epoch": 1.255265918812041, + "grad_norm": 0.6784021854400635, + "learning_rate": 2.3657772437314517e-05, + "loss": 2.4996, + "step": 15554 + }, + { + "epoch": 1.255346622548624, + "grad_norm": 0.7499529719352722, + "learning_rate": 2.3647576636074975e-05, + "loss": 2.4749, + "step": 15555 + }, + { + "epoch": 1.255427326285207, + "grad_norm": 0.6698335409164429, + "learning_rate": 2.3637382737737368e-05, + "loss": 2.4499, + "step": 15556 + }, + { + "epoch": 1.25550803002179, + "grad_norm": 0.6644846200942993, + "learning_rate": 2.3627190742555806e-05, + "loss": 2.397, + "step": 15557 + }, + { + "epoch": 1.255588733758373, + "grad_norm": 0.7041488289833069, + "learning_rate": 2.3617000650784315e-05, + "loss": 2.4012, + "step": 15558 + }, + { + "epoch": 1.255669437494956, + "grad_norm": 0.72523033618927, + "learning_rate": 2.3606812462676798e-05, + "loss": 2.4151, + "step": 15559 + }, + { + "epoch": 1.255750141231539, + "grad_norm": 0.77669757604599, + "learning_rate": 2.3596626178487225e-05, + "loss": 2.4478, + "step": 15560 + }, + { + "epoch": 1.2558308449681221, + "grad_norm": 0.6919559836387634, + "learning_rate": 2.3586441798469462e-05, + "loss": 2.4548, + "step": 15561 + }, + { + "epoch": 1.255911548704705, + "grad_norm": 0.7613349556922913, + "learning_rate": 2.3576259322877292e-05, + "loss": 2.4475, + "step": 15562 + }, + { + "epoch": 1.255992252441288, + "grad_norm": 0.6738333106040955, + "learning_rate": 2.3566078751964515e-05, + "loss": 2.4242, + "step": 15563 + }, + { + "epoch": 1.256072956177871, + "grad_norm": 0.7242118716239929, + "learning_rate": 2.355590008598486e-05, + "loss": 2.4047, + "step": 15564 + }, + { + "epoch": 1.256153659914454, + "grad_norm": 0.7117685675621033, + "learning_rate": 2.354572332519199e-05, + "loss": 2.4473, + "step": 15565 + }, + { + 
"epoch": 1.256234363651037, + "grad_norm": 0.7466531991958618, + "learning_rate": 2.3535548469839564e-05, + "loss": 2.453, + "step": 15566 + }, + { + "epoch": 1.25631506738762, + "grad_norm": 0.6750668883323669, + "learning_rate": 2.3525375520181136e-05, + "loss": 2.4367, + "step": 15567 + }, + { + "epoch": 1.256395771124203, + "grad_norm": 0.7640851736068726, + "learning_rate": 2.35152044764703e-05, + "loss": 2.5014, + "step": 15568 + }, + { + "epoch": 1.256476474860786, + "grad_norm": 0.7198928594589233, + "learning_rate": 2.3505035338960456e-05, + "loss": 2.5138, + "step": 15569 + }, + { + "epoch": 1.256557178597369, + "grad_norm": 0.7079946398735046, + "learning_rate": 2.349486810790511e-05, + "loss": 2.4172, + "step": 15570 + }, + { + "epoch": 1.2566378823339521, + "grad_norm": 0.7477186918258667, + "learning_rate": 2.3484702783557655e-05, + "loss": 2.4224, + "step": 15571 + }, + { + "epoch": 1.256718586070535, + "grad_norm": 0.6875394582748413, + "learning_rate": 2.3474539366171388e-05, + "loss": 2.4621, + "step": 15572 + }, + { + "epoch": 1.256799289807118, + "grad_norm": 0.7164824604988098, + "learning_rate": 2.346437785599964e-05, + "loss": 2.4416, + "step": 15573 + }, + { + "epoch": 1.2568799935437012, + "grad_norm": 0.7031935453414917, + "learning_rate": 2.3454218253295668e-05, + "loss": 2.3943, + "step": 15574 + }, + { + "epoch": 1.256960697280284, + "grad_norm": 0.6739614009857178, + "learning_rate": 2.3444060558312665e-05, + "loss": 2.4114, + "step": 15575 + }, + { + "epoch": 1.2570414010168671, + "grad_norm": 0.6710866689682007, + "learning_rate": 2.3433904771303794e-05, + "loss": 2.4077, + "step": 15576 + }, + { + "epoch": 1.2571221047534502, + "grad_norm": 0.6589750051498413, + "learning_rate": 2.342375089252219e-05, + "loss": 2.3494, + "step": 15577 + }, + { + "epoch": 1.257202808490033, + "grad_norm": 0.7018333077430725, + "learning_rate": 2.3413598922220857e-05, + "loss": 2.459, + "step": 15578 + }, + { + "epoch": 1.2572835122266162, + 
"grad_norm": 0.7735301852226257, + "learning_rate": 2.3403448860652842e-05, + "loss": 2.4524, + "step": 15579 + }, + { + "epoch": 1.257364215963199, + "grad_norm": 0.7009726762771606, + "learning_rate": 2.339330070807113e-05, + "loss": 2.4244, + "step": 15580 + }, + { + "epoch": 1.2574449196997821, + "grad_norm": 0.671521008014679, + "learning_rate": 2.3383154464728595e-05, + "loss": 2.3808, + "step": 15581 + }, + { + "epoch": 1.257525623436365, + "grad_norm": 0.7736711502075195, + "learning_rate": 2.3373010130878126e-05, + "loss": 2.4936, + "step": 15582 + }, + { + "epoch": 1.257606327172948, + "grad_norm": 0.6987056136131287, + "learning_rate": 2.336286770677255e-05, + "loss": 2.4484, + "step": 15583 + }, + { + "epoch": 1.2576870309095312, + "grad_norm": 0.6337067484855652, + "learning_rate": 2.3352727192664635e-05, + "loss": 2.4196, + "step": 15584 + }, + { + "epoch": 1.257767734646114, + "grad_norm": 0.6832795143127441, + "learning_rate": 2.3342588588807123e-05, + "loss": 2.3681, + "step": 15585 + }, + { + "epoch": 1.257848438382697, + "grad_norm": 0.7208079695701599, + "learning_rate": 2.3332451895452688e-05, + "loss": 2.4436, + "step": 15586 + }, + { + "epoch": 1.2579291421192802, + "grad_norm": 0.6607621312141418, + "learning_rate": 2.3322317112853986e-05, + "loss": 2.4088, + "step": 15587 + }, + { + "epoch": 1.258009845855863, + "grad_norm": 0.7261247038841248, + "learning_rate": 2.331218424126356e-05, + "loss": 2.4389, + "step": 15588 + }, + { + "epoch": 1.2580905495924462, + "grad_norm": 0.6187729239463806, + "learning_rate": 2.3302053280933954e-05, + "loss": 2.3568, + "step": 15589 + }, + { + "epoch": 1.2581712533290292, + "grad_norm": 0.6196430921554565, + "learning_rate": 2.3291924232117713e-05, + "loss": 2.4285, + "step": 15590 + }, + { + "epoch": 1.258251957065612, + "grad_norm": 0.7271853685379028, + "learning_rate": 2.3281797095067193e-05, + "loss": 2.4058, + "step": 15591 + }, + { + "epoch": 1.2583326608021952, + "grad_norm": 0.7141130566596985, + 
"learning_rate": 2.327167187003484e-05, + "loss": 2.3971, + "step": 15592 + }, + { + "epoch": 1.2584133645387783, + "grad_norm": 0.680743932723999, + "learning_rate": 2.3261548557273027e-05, + "loss": 2.4387, + "step": 15593 + }, + { + "epoch": 1.2584940682753611, + "grad_norm": 0.718173086643219, + "learning_rate": 2.3251427157033955e-05, + "loss": 2.43, + "step": 15594 + }, + { + "epoch": 1.2585747720119442, + "grad_norm": 0.7600045800209045, + "learning_rate": 2.324130766956998e-05, + "loss": 2.4584, + "step": 15595 + }, + { + "epoch": 1.258655475748527, + "grad_norm": 0.7432500123977661, + "learning_rate": 2.3231190095133294e-05, + "loss": 2.4717, + "step": 15596 + }, + { + "epoch": 1.2587361794851102, + "grad_norm": 0.6603000164031982, + "learning_rate": 2.3221074433975988e-05, + "loss": 2.3952, + "step": 15597 + }, + { + "epoch": 1.258816883221693, + "grad_norm": 0.7020140290260315, + "learning_rate": 2.3210960686350213e-05, + "loss": 2.4064, + "step": 15598 + }, + { + "epoch": 1.2588975869582761, + "grad_norm": 0.7434887290000916, + "learning_rate": 2.320084885250804e-05, + "loss": 2.4708, + "step": 15599 + }, + { + "epoch": 1.2589782906948592, + "grad_norm": 0.6626797318458557, + "learning_rate": 2.3190738932701482e-05, + "loss": 2.4503, + "step": 15600 + }, + { + "epoch": 1.259058994431442, + "grad_norm": 0.7880598902702332, + "learning_rate": 2.3180630927182466e-05, + "loss": 2.384, + "step": 15601 + }, + { + "epoch": 1.2591396981680252, + "grad_norm": 0.7766147255897522, + "learning_rate": 2.3170524836202933e-05, + "loss": 2.4019, + "step": 15602 + }, + { + "epoch": 1.2592204019046083, + "grad_norm": 0.7817980051040649, + "learning_rate": 2.3160420660014792e-05, + "loss": 2.4729, + "step": 15603 + }, + { + "epoch": 1.2593011056411911, + "grad_norm": 0.6915614604949951, + "learning_rate": 2.3150318398869787e-05, + "loss": 2.4028, + "step": 15604 + }, + { + "epoch": 1.2593818093777742, + "grad_norm": 0.690882682800293, + "learning_rate": 
2.3140218053019714e-05, + "loss": 2.4386, + "step": 15605 + }, + { + "epoch": 1.2594625131143573, + "grad_norm": 0.6670350432395935, + "learning_rate": 2.3130119622716382e-05, + "loss": 2.4224, + "step": 15606 + }, + { + "epoch": 1.2595432168509402, + "grad_norm": 0.6680006980895996, + "learning_rate": 2.3120023108211375e-05, + "loss": 2.3475, + "step": 15607 + }, + { + "epoch": 1.2596239205875233, + "grad_norm": 0.7003577947616577, + "learning_rate": 2.310992850975636e-05, + "loss": 2.4198, + "step": 15608 + }, + { + "epoch": 1.2597046243241061, + "grad_norm": 0.7444167733192444, + "learning_rate": 2.3099835827602944e-05, + "loss": 2.3756, + "step": 15609 + }, + { + "epoch": 1.2597853280606892, + "grad_norm": 0.6757989525794983, + "learning_rate": 2.3089745062002612e-05, + "loss": 2.3955, + "step": 15610 + }, + { + "epoch": 1.259866031797272, + "grad_norm": 0.6955820322036743, + "learning_rate": 2.3079656213206878e-05, + "loss": 2.4031, + "step": 15611 + }, + { + "epoch": 1.2599467355338552, + "grad_norm": 0.6646408438682556, + "learning_rate": 2.3069569281467184e-05, + "loss": 2.4246, + "step": 15612 + }, + { + "epoch": 1.2600274392704383, + "grad_norm": 0.6922882199287415, + "learning_rate": 2.3059484267034958e-05, + "loss": 2.4157, + "step": 15613 + }, + { + "epoch": 1.2601081430070211, + "grad_norm": 0.8092310428619385, + "learning_rate": 2.3049401170161468e-05, + "loss": 2.4137, + "step": 15614 + }, + { + "epoch": 1.2601888467436042, + "grad_norm": 0.7024559378623962, + "learning_rate": 2.3039319991098063e-05, + "loss": 2.4497, + "step": 15615 + }, + { + "epoch": 1.2602695504801873, + "grad_norm": 0.7096099853515625, + "learning_rate": 2.302924073009597e-05, + "loss": 2.4045, + "step": 15616 + }, + { + "epoch": 1.2603502542167702, + "grad_norm": 0.6777564287185669, + "learning_rate": 2.3019163387406406e-05, + "loss": 2.4607, + "step": 15617 + }, + { + "epoch": 1.2604309579533532, + "grad_norm": 0.7564159035682678, + "learning_rate": 2.300908796328052e-05, + 
"loss": 2.4985, + "step": 15618 + }, + { + "epoch": 1.2605116616899363, + "grad_norm": 0.7432986497879028, + "learning_rate": 2.2999014457969447e-05, + "loss": 2.4326, + "step": 15619 + }, + { + "epoch": 1.2605923654265192, + "grad_norm": 0.7178141474723816, + "learning_rate": 2.2988942871724182e-05, + "loss": 2.4118, + "step": 15620 + }, + { + "epoch": 1.2606730691631023, + "grad_norm": 0.7074497938156128, + "learning_rate": 2.2978873204795782e-05, + "loss": 2.4163, + "step": 15621 + }, + { + "epoch": 1.2607537728996854, + "grad_norm": 0.670200765132904, + "learning_rate": 2.2968805457435217e-05, + "loss": 2.4081, + "step": 15622 + }, + { + "epoch": 1.2608344766362682, + "grad_norm": 0.7258187532424927, + "learning_rate": 2.2958739629893355e-05, + "loss": 2.4889, + "step": 15623 + }, + { + "epoch": 1.2609151803728513, + "grad_norm": 0.6999781727790833, + "learning_rate": 2.2948675722421086e-05, + "loss": 2.3945, + "step": 15624 + }, + { + "epoch": 1.2609958841094342, + "grad_norm": 0.7030084133148193, + "learning_rate": 2.2938613735269243e-05, + "loss": 2.4509, + "step": 15625 + }, + { + "epoch": 1.2610765878460173, + "grad_norm": 0.6875420212745667, + "learning_rate": 2.292855366868858e-05, + "loss": 2.3658, + "step": 15626 + }, + { + "epoch": 1.2611572915826001, + "grad_norm": 0.7375235557556152, + "learning_rate": 2.2918495522929817e-05, + "loss": 2.4308, + "step": 15627 + }, + { + "epoch": 1.2612379953191832, + "grad_norm": 0.7021106481552124, + "learning_rate": 2.2908439298243644e-05, + "loss": 2.4046, + "step": 15628 + }, + { + "epoch": 1.2613186990557663, + "grad_norm": 0.76661616563797, + "learning_rate": 2.2898384994880716e-05, + "loss": 2.5156, + "step": 15629 + }, + { + "epoch": 1.2613994027923492, + "grad_norm": 0.6684869527816772, + "learning_rate": 2.2888332613091558e-05, + "loss": 2.4342, + "step": 15630 + }, + { + "epoch": 1.2614801065289323, + "grad_norm": 0.6878669261932373, + "learning_rate": 2.2878282153126706e-05, + "loss": 2.4544, + "step": 
15631 + }, + { + "epoch": 1.2615608102655154, + "grad_norm": 0.6659132838249207, + "learning_rate": 2.2868233615236702e-05, + "loss": 2.4341, + "step": 15632 + }, + { + "epoch": 1.2616415140020982, + "grad_norm": 0.657474160194397, + "learning_rate": 2.2858186999671905e-05, + "loss": 2.3515, + "step": 15633 + }, + { + "epoch": 1.2617222177386813, + "grad_norm": 0.7245650291442871, + "learning_rate": 2.284814230668274e-05, + "loss": 2.3983, + "step": 15634 + }, + { + "epoch": 1.2618029214752644, + "grad_norm": 0.6400195360183716, + "learning_rate": 2.2838099536519554e-05, + "loss": 2.3535, + "step": 15635 + }, + { + "epoch": 1.2618836252118473, + "grad_norm": 0.6719450950622559, + "learning_rate": 2.282805868943262e-05, + "loss": 2.3906, + "step": 15636 + }, + { + "epoch": 1.2619643289484304, + "grad_norm": 0.682746946811676, + "learning_rate": 2.2818019765672207e-05, + "loss": 2.4045, + "step": 15637 + }, + { + "epoch": 1.2620450326850134, + "grad_norm": 0.6631760597229004, + "learning_rate": 2.2807982765488513e-05, + "loss": 2.4896, + "step": 15638 + }, + { + "epoch": 1.2621257364215963, + "grad_norm": 0.782202422618866, + "learning_rate": 2.279794768913164e-05, + "loss": 2.4628, + "step": 15639 + }, + { + "epoch": 1.2622064401581794, + "grad_norm": 0.7579823732376099, + "learning_rate": 2.278791453685173e-05, + "loss": 2.4635, + "step": 15640 + }, + { + "epoch": 1.2622871438947623, + "grad_norm": 0.665096640586853, + "learning_rate": 2.277788330889884e-05, + "loss": 2.4899, + "step": 15641 + }, + { + "epoch": 1.2623678476313454, + "grad_norm": 0.7635685205459595, + "learning_rate": 2.2767854005522936e-05, + "loss": 2.4146, + "step": 15642 + }, + { + "epoch": 1.2624485513679282, + "grad_norm": 0.7579118609428406, + "learning_rate": 2.2757826626974e-05, + "loss": 2.3692, + "step": 15643 + }, + { + "epoch": 1.2625292551045113, + "grad_norm": 0.6772074699401855, + "learning_rate": 2.2747801173501938e-05, + "loss": 2.3954, + "step": 15644 + }, + { + "epoch": 
1.2626099588410944, + "grad_norm": 0.7028382420539856, + "learning_rate": 2.2737777645356606e-05, + "loss": 2.4799, + "step": 15645 + }, + { + "epoch": 1.2626906625776773, + "grad_norm": 0.7152617573738098, + "learning_rate": 2.2727756042787818e-05, + "loss": 2.4095, + "step": 15646 + }, + { + "epoch": 1.2627713663142603, + "grad_norm": 0.7286608219146729, + "learning_rate": 2.271773636604535e-05, + "loss": 2.4496, + "step": 15647 + }, + { + "epoch": 1.2628520700508434, + "grad_norm": 0.7006896734237671, + "learning_rate": 2.2707718615378935e-05, + "loss": 2.4128, + "step": 15648 + }, + { + "epoch": 1.2629327737874263, + "grad_norm": 0.6856697797775269, + "learning_rate": 2.2697702791038177e-05, + "loss": 2.4169, + "step": 15649 + }, + { + "epoch": 1.2630134775240094, + "grad_norm": 0.7582918405532837, + "learning_rate": 2.268768889327275e-05, + "loss": 2.4007, + "step": 15650 + }, + { + "epoch": 1.2630941812605925, + "grad_norm": 0.664633572101593, + "learning_rate": 2.2677676922332237e-05, + "loss": 2.3876, + "step": 15651 + }, + { + "epoch": 1.2631748849971753, + "grad_norm": 0.7283070087432861, + "learning_rate": 2.266766687846611e-05, + "loss": 2.4175, + "step": 15652 + }, + { + "epoch": 1.2632555887337584, + "grad_norm": 0.7309537529945374, + "learning_rate": 2.2657658761923863e-05, + "loss": 2.3998, + "step": 15653 + }, + { + "epoch": 1.2633362924703415, + "grad_norm": 0.6386510133743286, + "learning_rate": 2.2647652572954968e-05, + "loss": 2.3723, + "step": 15654 + }, + { + "epoch": 1.2634169962069244, + "grad_norm": 0.6805689930915833, + "learning_rate": 2.263764831180876e-05, + "loss": 2.3989, + "step": 15655 + }, + { + "epoch": 1.2634976999435072, + "grad_norm": 0.7147208452224731, + "learning_rate": 2.2627645978734536e-05, + "loss": 2.4748, + "step": 15656 + }, + { + "epoch": 1.2635784036800903, + "grad_norm": 0.6835155487060547, + "learning_rate": 2.2617645573981683e-05, + "loss": 2.4266, + "step": 15657 + }, + { + "epoch": 1.2636591074166734, + 
"grad_norm": 0.7631552219390869, + "learning_rate": 2.2607647097799368e-05, + "loss": 2.4152, + "step": 15658 + }, + { + "epoch": 1.2637398111532563, + "grad_norm": 0.6793624758720398, + "learning_rate": 2.2597650550436777e-05, + "loss": 2.3491, + "step": 15659 + }, + { + "epoch": 1.2638205148898394, + "grad_norm": 0.6465637683868408, + "learning_rate": 2.2587655932143083e-05, + "loss": 2.3774, + "step": 15660 + }, + { + "epoch": 1.2639012186264225, + "grad_norm": 0.6920284628868103, + "learning_rate": 2.2577663243167368e-05, + "loss": 2.4321, + "step": 15661 + }, + { + "epoch": 1.2639819223630053, + "grad_norm": 0.6922522783279419, + "learning_rate": 2.256767248375866e-05, + "loss": 2.4242, + "step": 15662 + }, + { + "epoch": 1.2640626260995884, + "grad_norm": 0.6811214089393616, + "learning_rate": 2.255768365416595e-05, + "loss": 2.4101, + "step": 15663 + }, + { + "epoch": 1.2641433298361715, + "grad_norm": 0.6704947352409363, + "learning_rate": 2.2547696754638238e-05, + "loss": 2.4792, + "step": 15664 + }, + { + "epoch": 1.2642240335727544, + "grad_norm": 0.6814701557159424, + "learning_rate": 2.2537711785424354e-05, + "loss": 2.4429, + "step": 15665 + }, + { + "epoch": 1.2643047373093375, + "grad_norm": 0.6778244972229004, + "learning_rate": 2.252772874677318e-05, + "loss": 2.3882, + "step": 15666 + }, + { + "epoch": 1.2643854410459205, + "grad_norm": 0.6570093035697937, + "learning_rate": 2.2517747638933518e-05, + "loss": 2.4162, + "step": 15667 + }, + { + "epoch": 1.2644661447825034, + "grad_norm": 0.6973466873168945, + "learning_rate": 2.2507768462154133e-05, + "loss": 2.3646, + "step": 15668 + }, + { + "epoch": 1.2645468485190865, + "grad_norm": 0.7258623242378235, + "learning_rate": 2.2497791216683715e-05, + "loss": 2.404, + "step": 15669 + }, + { + "epoch": 1.2646275522556694, + "grad_norm": 0.7462170124053955, + "learning_rate": 2.248781590277097e-05, + "loss": 2.5076, + "step": 15670 + }, + { + "epoch": 1.2647082559922525, + "grad_norm": 
0.7070441246032715, + "learning_rate": 2.247784252066444e-05, + "loss": 2.3817, + "step": 15671 + }, + { + "epoch": 1.2647889597288353, + "grad_norm": 0.7150183916091919, + "learning_rate": 2.246787107061272e-05, + "loss": 2.461, + "step": 15672 + }, + { + "epoch": 1.2648696634654184, + "grad_norm": 0.668436586856842, + "learning_rate": 2.2457901552864347e-05, + "loss": 2.466, + "step": 15673 + }, + { + "epoch": 1.2649503672020015, + "grad_norm": 0.7011097073554993, + "learning_rate": 2.2447933967667745e-05, + "loss": 2.4582, + "step": 15674 + }, + { + "epoch": 1.2650310709385844, + "grad_norm": 0.7149096727371216, + "learning_rate": 2.243796831527134e-05, + "loss": 2.4461, + "step": 15675 + }, + { + "epoch": 1.2651117746751674, + "grad_norm": 0.6810914278030396, + "learning_rate": 2.2428004595923525e-05, + "loss": 2.4043, + "step": 15676 + }, + { + "epoch": 1.2651924784117505, + "grad_norm": 0.7700765132904053, + "learning_rate": 2.241804280987261e-05, + "loss": 2.4197, + "step": 15677 + }, + { + "epoch": 1.2652731821483334, + "grad_norm": 0.6897448897361755, + "learning_rate": 2.240808295736686e-05, + "loss": 2.4052, + "step": 15678 + }, + { + "epoch": 1.2653538858849165, + "grad_norm": 0.7092932462692261, + "learning_rate": 2.2398125038654515e-05, + "loss": 2.4088, + "step": 15679 + }, + { + "epoch": 1.2654345896214996, + "grad_norm": 0.6930294632911682, + "learning_rate": 2.2388169053983777e-05, + "loss": 2.4504, + "step": 15680 + }, + { + "epoch": 1.2655152933580824, + "grad_norm": 0.7056782245635986, + "learning_rate": 2.237821500360271e-05, + "loss": 2.3975, + "step": 15681 + }, + { + "epoch": 1.2655959970946655, + "grad_norm": 0.651772141456604, + "learning_rate": 2.236826288775944e-05, + "loss": 2.3941, + "step": 15682 + }, + { + "epoch": 1.2656767008312486, + "grad_norm": 0.7254980206489563, + "learning_rate": 2.2358312706702012e-05, + "loss": 2.4149, + "step": 15683 + }, + { + "epoch": 1.2657574045678315, + "grad_norm": 0.6553635597229004, + 
"learning_rate": 2.2348364460678373e-05, + "loss": 2.4099, + "step": 15684 + }, + { + "epoch": 1.2658381083044146, + "grad_norm": 0.6952616572380066, + "learning_rate": 2.233841814993646e-05, + "loss": 2.384, + "step": 15685 + }, + { + "epoch": 1.2659188120409974, + "grad_norm": 0.72947096824646, + "learning_rate": 2.2328473774724178e-05, + "loss": 2.5033, + "step": 15686 + }, + { + "epoch": 1.2659995157775805, + "grad_norm": 0.7419683933258057, + "learning_rate": 2.231853133528937e-05, + "loss": 2.4881, + "step": 15687 + }, + { + "epoch": 1.2660802195141634, + "grad_norm": 0.7125211358070374, + "learning_rate": 2.2308590831879827e-05, + "loss": 2.4334, + "step": 15688 + }, + { + "epoch": 1.2661609232507465, + "grad_norm": 0.6668617129325867, + "learning_rate": 2.2298652264743315e-05, + "loss": 2.4144, + "step": 15689 + }, + { + "epoch": 1.2662416269873296, + "grad_norm": 0.8075512051582336, + "learning_rate": 2.2288715634127465e-05, + "loss": 2.421, + "step": 15690 + }, + { + "epoch": 1.2663223307239124, + "grad_norm": 0.6894629001617432, + "learning_rate": 2.2278780940279965e-05, + "loss": 2.4142, + "step": 15691 + }, + { + "epoch": 1.2664030344604955, + "grad_norm": 0.7418074011802673, + "learning_rate": 2.226884818344841e-05, + "loss": 2.4214, + "step": 15692 + }, + { + "epoch": 1.2664837381970786, + "grad_norm": 0.6724219918251038, + "learning_rate": 2.225891736388037e-05, + "loss": 2.4455, + "step": 15693 + }, + { + "epoch": 1.2665644419336615, + "grad_norm": 0.7202882766723633, + "learning_rate": 2.224898848182331e-05, + "loss": 2.4017, + "step": 15694 + }, + { + "epoch": 1.2666451456702446, + "grad_norm": 0.7671259641647339, + "learning_rate": 2.2239061537524698e-05, + "loss": 2.4386, + "step": 15695 + }, + { + "epoch": 1.2667258494068276, + "grad_norm": 0.7154317498207092, + "learning_rate": 2.222913653123194e-05, + "loss": 2.3754, + "step": 15696 + }, + { + "epoch": 1.2668065531434105, + "grad_norm": 0.7203264236450195, + "learning_rate": 
2.221921346319239e-05, + "loss": 2.3926, + "step": 15697 + }, + { + "epoch": 1.2668872568799936, + "grad_norm": 0.7104187607765198, + "learning_rate": 2.2209292333653365e-05, + "loss": 2.4528, + "step": 15698 + }, + { + "epoch": 1.2669679606165767, + "grad_norm": 0.7650138139724731, + "learning_rate": 2.2199373142862158e-05, + "loss": 2.4372, + "step": 15699 + }, + { + "epoch": 1.2670486643531595, + "grad_norm": 0.6796044111251831, + "learning_rate": 2.2189455891065903e-05, + "loss": 2.415, + "step": 15700 + }, + { + "epoch": 1.2671293680897426, + "grad_norm": 0.6749297380447388, + "learning_rate": 2.2179540578511813e-05, + "loss": 2.4337, + "step": 15701 + }, + { + "epoch": 1.2672100718263255, + "grad_norm": 0.7330272793769836, + "learning_rate": 2.216962720544703e-05, + "loss": 2.4322, + "step": 15702 + }, + { + "epoch": 1.2672907755629086, + "grad_norm": 0.6793510913848877, + "learning_rate": 2.215971577211855e-05, + "loss": 2.4473, + "step": 15703 + }, + { + "epoch": 1.2673714792994915, + "grad_norm": 0.7477267384529114, + "learning_rate": 2.2149806278773433e-05, + "loss": 2.4699, + "step": 15704 + }, + { + "epoch": 1.2674521830360745, + "grad_norm": 0.7048643827438354, + "learning_rate": 2.213989872565867e-05, + "loss": 2.4341, + "step": 15705 + }, + { + "epoch": 1.2675328867726576, + "grad_norm": 0.647433340549469, + "learning_rate": 2.2129993113021108e-05, + "loss": 2.423, + "step": 15706 + }, + { + "epoch": 1.2676135905092405, + "grad_norm": 0.6886507272720337, + "learning_rate": 2.2120089441107706e-05, + "loss": 2.4185, + "step": 15707 + }, + { + "epoch": 1.2676942942458236, + "grad_norm": 0.6720516085624695, + "learning_rate": 2.2110187710165242e-05, + "loss": 2.4587, + "step": 15708 + }, + { + "epoch": 1.2677749979824067, + "grad_norm": 0.676665723323822, + "learning_rate": 2.2100287920440543e-05, + "loss": 2.4241, + "step": 15709 + }, + { + "epoch": 1.2678557017189895, + "grad_norm": 0.6939559578895569, + "learning_rate": 2.209039007218028e-05, + 
"loss": 2.3974, + "step": 15710 + }, + { + "epoch": 1.2679364054555726, + "grad_norm": 0.6485786437988281, + "learning_rate": 2.2080494165631137e-05, + "loss": 2.4041, + "step": 15711 + }, + { + "epoch": 1.2680171091921557, + "grad_norm": 0.668319582939148, + "learning_rate": 2.2070600201039802e-05, + "loss": 2.4705, + "step": 15712 + }, + { + "epoch": 1.2680978129287386, + "grad_norm": 0.6837478280067444, + "learning_rate": 2.206070817865279e-05, + "loss": 2.4474, + "step": 15713 + }, + { + "epoch": 1.2681785166653217, + "grad_norm": 0.7000131011009216, + "learning_rate": 2.2050818098716664e-05, + "loss": 2.4463, + "step": 15714 + }, + { + "epoch": 1.2682592204019045, + "grad_norm": 0.7063068151473999, + "learning_rate": 2.204092996147794e-05, + "loss": 2.4226, + "step": 15715 + }, + { + "epoch": 1.2683399241384876, + "grad_norm": 0.6497172117233276, + "learning_rate": 2.2031043767183003e-05, + "loss": 2.3678, + "step": 15716 + }, + { + "epoch": 1.2684206278750705, + "grad_norm": 0.6558645963668823, + "learning_rate": 2.2021159516078262e-05, + "loss": 2.4021, + "step": 15717 + }, + { + "epoch": 1.2685013316116536, + "grad_norm": 0.7411713600158691, + "learning_rate": 2.2011277208410062e-05, + "loss": 2.4346, + "step": 15718 + }, + { + "epoch": 1.2685820353482367, + "grad_norm": 0.7275578379631042, + "learning_rate": 2.2001396844424714e-05, + "loss": 2.4262, + "step": 15719 + }, + { + "epoch": 1.2686627390848195, + "grad_norm": 0.7010936141014099, + "learning_rate": 2.199151842436844e-05, + "loss": 2.4774, + "step": 15720 + }, + { + "epoch": 1.2687434428214026, + "grad_norm": 0.7551137208938599, + "learning_rate": 2.1981641948487462e-05, + "loss": 2.5286, + "step": 15721 + }, + { + "epoch": 1.2688241465579857, + "grad_norm": 0.6510799527168274, + "learning_rate": 2.1971767417027888e-05, + "loss": 2.3813, + "step": 15722 + }, + { + "epoch": 1.2689048502945686, + "grad_norm": 0.636050283908844, + "learning_rate": 2.196189483023584e-05, + "loss": 2.4226, + "step": 
15723 + }, + { + "epoch": 1.2689855540311517, + "grad_norm": 0.6939265131950378, + "learning_rate": 2.1952024188357368e-05, + "loss": 2.4516, + "step": 15724 + }, + { + "epoch": 1.2690662577677347, + "grad_norm": 0.6715239882469177, + "learning_rate": 2.1942155491638494e-05, + "loss": 2.4358, + "step": 15725 + }, + { + "epoch": 1.2691469615043176, + "grad_norm": 0.740680456161499, + "learning_rate": 2.1932288740325123e-05, + "loss": 2.4135, + "step": 15726 + }, + { + "epoch": 1.2692276652409007, + "grad_norm": 0.6969335079193115, + "learning_rate": 2.1922423934663193e-05, + "loss": 2.43, + "step": 15727 + }, + { + "epoch": 1.2693083689774838, + "grad_norm": 0.6390758156776428, + "learning_rate": 2.1912561074898554e-05, + "loss": 2.4492, + "step": 15728 + }, + { + "epoch": 1.2693890727140666, + "grad_norm": 0.7129701375961304, + "learning_rate": 2.190270016127701e-05, + "loss": 2.3799, + "step": 15729 + }, + { + "epoch": 1.2694697764506497, + "grad_norm": 0.7309553027153015, + "learning_rate": 2.1892841194044332e-05, + "loss": 2.4955, + "step": 15730 + }, + { + "epoch": 1.2695504801872326, + "grad_norm": 0.7257225513458252, + "learning_rate": 2.1882984173446252e-05, + "loss": 2.4184, + "step": 15731 + }, + { + "epoch": 1.2696311839238157, + "grad_norm": 0.7434510588645935, + "learning_rate": 2.1873129099728384e-05, + "loss": 2.453, + "step": 15732 + }, + { + "epoch": 1.2697118876603986, + "grad_norm": 0.6643160581588745, + "learning_rate": 2.1863275973136356e-05, + "loss": 2.3619, + "step": 15733 + }, + { + "epoch": 1.2697925913969816, + "grad_norm": 0.6677344441413879, + "learning_rate": 2.1853424793915778e-05, + "loss": 2.406, + "step": 15734 + }, + { + "epoch": 1.2698732951335647, + "grad_norm": 0.760028064250946, + "learning_rate": 2.1843575562312092e-05, + "loss": 2.5479, + "step": 15735 + }, + { + "epoch": 1.2699539988701476, + "grad_norm": 0.6668389439582825, + "learning_rate": 2.183372827857082e-05, + "loss": 2.4104, + "step": 15736 + }, + { + "epoch": 
1.2700347026067307, + "grad_norm": 0.651155412197113, + "learning_rate": 2.182388294293736e-05, + "loss": 2.3738, + "step": 15737 + }, + { + "epoch": 1.2701154063433138, + "grad_norm": 0.736907958984375, + "learning_rate": 2.1814039555657084e-05, + "loss": 2.4179, + "step": 15738 + }, + { + "epoch": 1.2701961100798966, + "grad_norm": 0.7068225741386414, + "learning_rate": 2.180419811697534e-05, + "loss": 2.3911, + "step": 15739 + }, + { + "epoch": 1.2702768138164797, + "grad_norm": 0.6959261894226074, + "learning_rate": 2.1794358627137368e-05, + "loss": 2.452, + "step": 15740 + }, + { + "epoch": 1.2703575175530628, + "grad_norm": 0.6886181235313416, + "learning_rate": 2.1784521086388442e-05, + "loss": 2.4166, + "step": 15741 + }, + { + "epoch": 1.2704382212896457, + "grad_norm": 0.6494541168212891, + "learning_rate": 2.177468549497369e-05, + "loss": 2.3589, + "step": 15742 + }, + { + "epoch": 1.2705189250262288, + "grad_norm": 0.7008326649665833, + "learning_rate": 2.1764851853138247e-05, + "loss": 2.3697, + "step": 15743 + }, + { + "epoch": 1.2705996287628119, + "grad_norm": 0.6800456643104553, + "learning_rate": 2.1755020161127238e-05, + "loss": 2.4162, + "step": 15744 + }, + { + "epoch": 1.2706803324993947, + "grad_norm": 0.6836018562316895, + "learning_rate": 2.1745190419185634e-05, + "loss": 2.3977, + "step": 15745 + }, + { + "epoch": 1.2707610362359778, + "grad_norm": 0.6489691138267517, + "learning_rate": 2.173536262755844e-05, + "loss": 2.464, + "step": 15746 + }, + { + "epoch": 1.2708417399725607, + "grad_norm": 0.7309786677360535, + "learning_rate": 2.172553678649061e-05, + "loss": 2.4065, + "step": 15747 + }, + { + "epoch": 1.2709224437091438, + "grad_norm": 0.6752686500549316, + "learning_rate": 2.1715712896227004e-05, + "loss": 2.3935, + "step": 15748 + }, + { + "epoch": 1.2710031474457266, + "grad_norm": 0.7039850354194641, + "learning_rate": 2.1705890957012465e-05, + "loss": 2.4605, + "step": 15749 + }, + { + "epoch": 1.2710838511823097, + 
"grad_norm": 0.6904652714729309, + "learning_rate": 2.169607096909182e-05, + "loss": 2.4264, + "step": 15750 + }, + { + "epoch": 1.2711645549188928, + "grad_norm": 0.7104331254959106, + "learning_rate": 2.168625293270974e-05, + "loss": 2.378, + "step": 15751 + }, + { + "epoch": 1.2712452586554757, + "grad_norm": 0.6732800602912903, + "learning_rate": 2.167643684811096e-05, + "loss": 2.4216, + "step": 15752 + }, + { + "epoch": 1.2713259623920588, + "grad_norm": 0.7207335829734802, + "learning_rate": 2.166662271554011e-05, + "loss": 2.3861, + "step": 15753 + }, + { + "epoch": 1.2714066661286418, + "grad_norm": 0.7561055421829224, + "learning_rate": 2.1656810535241813e-05, + "loss": 2.4753, + "step": 15754 + }, + { + "epoch": 1.2714873698652247, + "grad_norm": 0.7018210887908936, + "learning_rate": 2.1647000307460564e-05, + "loss": 2.401, + "step": 15755 + }, + { + "epoch": 1.2715680736018078, + "grad_norm": 0.6908013224601746, + "learning_rate": 2.163719203244089e-05, + "loss": 2.4451, + "step": 15756 + }, + { + "epoch": 1.2716487773383909, + "grad_norm": 0.734909176826477, + "learning_rate": 2.162738571042723e-05, + "loss": 2.4221, + "step": 15757 + }, + { + "epoch": 1.2717294810749737, + "grad_norm": 0.7047279477119446, + "learning_rate": 2.1617581341663973e-05, + "loss": 2.4149, + "step": 15758 + }, + { + "epoch": 1.2718101848115568, + "grad_norm": 0.6875640749931335, + "learning_rate": 2.1607778926395496e-05, + "loss": 2.3874, + "step": 15759 + }, + { + "epoch": 1.2718908885481397, + "grad_norm": 0.7300851345062256, + "learning_rate": 2.159797846486611e-05, + "loss": 2.4706, + "step": 15760 + }, + { + "epoch": 1.2719715922847228, + "grad_norm": 0.733775794506073, + "learning_rate": 2.1588179957320022e-05, + "loss": 2.4208, + "step": 15761 + }, + { + "epoch": 1.2720522960213057, + "grad_norm": 0.8375213742256165, + "learning_rate": 2.1578383404001458e-05, + "loss": 2.4672, + "step": 15762 + }, + { + "epoch": 1.2721329997578887, + "grad_norm": 0.7276780009269714, + 
"learning_rate": 2.15685888051546e-05, + "loss": 2.4536, + "step": 15763 + }, + { + "epoch": 1.2722137034944718, + "grad_norm": 0.7765224575996399, + "learning_rate": 2.1558796161023508e-05, + "loss": 2.3671, + "step": 15764 + }, + { + "epoch": 1.2722944072310547, + "grad_norm": 0.7225642204284668, + "learning_rate": 2.1549005471852256e-05, + "loss": 2.4316, + "step": 15765 + }, + { + "epoch": 1.2723751109676378, + "grad_norm": 0.6959484219551086, + "learning_rate": 2.1539216737884904e-05, + "loss": 2.4581, + "step": 15766 + }, + { + "epoch": 1.2724558147042209, + "grad_norm": 0.6943621039390564, + "learning_rate": 2.1529429959365332e-05, + "loss": 2.4372, + "step": 15767 + }, + { + "epoch": 1.2725365184408037, + "grad_norm": 0.7067148089408875, + "learning_rate": 2.151964513653746e-05, + "loss": 2.431, + "step": 15768 + }, + { + "epoch": 1.2726172221773868, + "grad_norm": 0.8317076563835144, + "learning_rate": 2.150986226964521e-05, + "loss": 2.4177, + "step": 15769 + }, + { + "epoch": 1.27269792591397, + "grad_norm": 0.7390087246894836, + "learning_rate": 2.150008135893239e-05, + "loss": 2.4711, + "step": 15770 + }, + { + "epoch": 1.2727786296505528, + "grad_norm": 0.6829150915145874, + "learning_rate": 2.1490302404642725e-05, + "loss": 2.4477, + "step": 15771 + }, + { + "epoch": 1.2728593333871359, + "grad_norm": 0.7355613708496094, + "learning_rate": 2.148052540701995e-05, + "loss": 2.493, + "step": 15772 + }, + { + "epoch": 1.272940037123719, + "grad_norm": 0.6872289776802063, + "learning_rate": 2.1470750366307747e-05, + "loss": 2.4363, + "step": 15773 + }, + { + "epoch": 1.2730207408603018, + "grad_norm": 0.7753220796585083, + "learning_rate": 2.1460977282749705e-05, + "loss": 2.4376, + "step": 15774 + }, + { + "epoch": 1.273101444596885, + "grad_norm": 0.6717056632041931, + "learning_rate": 2.145120615658942e-05, + "loss": 2.4383, + "step": 15775 + }, + { + "epoch": 1.2731821483334678, + "grad_norm": 0.7441569566726685, + "learning_rate": 
2.1441436988070428e-05, + "loss": 2.462, + "step": 15776 + }, + { + "epoch": 1.2732628520700509, + "grad_norm": 0.6824371814727783, + "learning_rate": 2.143166977743615e-05, + "loss": 2.4173, + "step": 15777 + }, + { + "epoch": 1.2733435558066337, + "grad_norm": 0.7310225963592529, + "learning_rate": 2.1421904524930038e-05, + "loss": 2.4222, + "step": 15778 + }, + { + "epoch": 1.2734242595432168, + "grad_norm": 0.7198066115379333, + "learning_rate": 2.141214123079548e-05, + "loss": 2.4262, + "step": 15779 + }, + { + "epoch": 1.2735049632798, + "grad_norm": 0.7081776857376099, + "learning_rate": 2.1402379895275783e-05, + "loss": 2.4473, + "step": 15780 + }, + { + "epoch": 1.2735856670163828, + "grad_norm": 0.6909368634223938, + "learning_rate": 2.1392620518614235e-05, + "loss": 2.4528, + "step": 15781 + }, + { + "epoch": 1.2736663707529658, + "grad_norm": 0.7170675992965698, + "learning_rate": 2.1382863101054107e-05, + "loss": 2.4214, + "step": 15782 + }, + { + "epoch": 1.273747074489549, + "grad_norm": 0.6992846727371216, + "learning_rate": 2.1373107642838497e-05, + "loss": 2.4397, + "step": 15783 + }, + { + "epoch": 1.2738277782261318, + "grad_norm": 0.7245237231254578, + "learning_rate": 2.1363354144210578e-05, + "loss": 2.373, + "step": 15784 + }, + { + "epoch": 1.273908481962715, + "grad_norm": 0.6929232478141785, + "learning_rate": 2.1353602605413435e-05, + "loss": 2.4297, + "step": 15785 + }, + { + "epoch": 1.273989185699298, + "grad_norm": 0.7243950366973877, + "learning_rate": 2.134385302669013e-05, + "loss": 2.3856, + "step": 15786 + }, + { + "epoch": 1.2740698894358808, + "grad_norm": 0.6712679266929626, + "learning_rate": 2.133410540828359e-05, + "loss": 2.3818, + "step": 15787 + }, + { + "epoch": 1.274150593172464, + "grad_norm": 0.7433474063873291, + "learning_rate": 2.1324359750436774e-05, + "loss": 2.4148, + "step": 15788 + }, + { + "epoch": 1.274231296909047, + "grad_norm": 0.7225894927978516, + "learning_rate": 2.1314616053392577e-05, + "loss": 
2.395, + "step": 15789 + }, + { + "epoch": 1.2743120006456299, + "grad_norm": 0.7026889324188232, + "learning_rate": 2.130487431739383e-05, + "loss": 2.4693, + "step": 15790 + }, + { + "epoch": 1.274392704382213, + "grad_norm": 0.6898565292358398, + "learning_rate": 2.1295134542683325e-05, + "loss": 2.3643, + "step": 15791 + }, + { + "epoch": 1.2744734081187958, + "grad_norm": 0.7212820649147034, + "learning_rate": 2.1285396729503826e-05, + "loss": 2.4178, + "step": 15792 + }, + { + "epoch": 1.274554111855379, + "grad_norm": 0.7149149179458618, + "learning_rate": 2.127566087809798e-05, + "loss": 2.4023, + "step": 15793 + }, + { + "epoch": 1.2746348155919618, + "grad_norm": 0.7039671540260315, + "learning_rate": 2.126592698870846e-05, + "loss": 2.4667, + "step": 15794 + }, + { + "epoch": 1.2747155193285449, + "grad_norm": 0.806849479675293, + "learning_rate": 2.1256195061577877e-05, + "loss": 2.4741, + "step": 15795 + }, + { + "epoch": 1.274796223065128, + "grad_norm": 0.7544776797294617, + "learning_rate": 2.124646509694872e-05, + "loss": 2.4258, + "step": 15796 + }, + { + "epoch": 1.2748769268017108, + "grad_norm": 0.6946810483932495, + "learning_rate": 2.1236737095063518e-05, + "loss": 2.4088, + "step": 15797 + }, + { + "epoch": 1.274957630538294, + "grad_norm": 0.7714219093322754, + "learning_rate": 2.1227011056164714e-05, + "loss": 2.4705, + "step": 15798 + }, + { + "epoch": 1.275038334274877, + "grad_norm": 0.6789658665657043, + "learning_rate": 2.121728698049471e-05, + "loss": 2.4692, + "step": 15799 + }, + { + "epoch": 1.2751190380114599, + "grad_norm": 0.7003477215766907, + "learning_rate": 2.120756486829586e-05, + "loss": 2.4437, + "step": 15800 + }, + { + "epoch": 1.275199741748043, + "grad_norm": 0.6802948117256165, + "learning_rate": 2.1197844719810455e-05, + "loss": 2.4002, + "step": 15801 + }, + { + "epoch": 1.275280445484626, + "grad_norm": 0.67823326587677, + "learning_rate": 2.1188126535280773e-05, + "loss": 2.5119, + "step": 15802 + }, + { + 
"epoch": 1.275361149221209, + "grad_norm": 0.6580843925476074, + "learning_rate": 2.1178410314948972e-05, + "loss": 2.3814, + "step": 15803 + }, + { + "epoch": 1.275441852957792, + "grad_norm": 0.681642472743988, + "learning_rate": 2.1168696059057226e-05, + "loss": 2.4206, + "step": 15804 + }, + { + "epoch": 1.275522556694375, + "grad_norm": 0.7483543753623962, + "learning_rate": 2.1158983767847674e-05, + "loss": 2.4633, + "step": 15805 + }, + { + "epoch": 1.275603260430958, + "grad_norm": 0.6565235257148743, + "learning_rate": 2.11492734415623e-05, + "loss": 2.4145, + "step": 15806 + }, + { + "epoch": 1.275683964167541, + "grad_norm": 0.6606764793395996, + "learning_rate": 2.1139565080443157e-05, + "loss": 2.3935, + "step": 15807 + }, + { + "epoch": 1.275764667904124, + "grad_norm": 0.7915800213813782, + "learning_rate": 2.1129858684732206e-05, + "loss": 2.4288, + "step": 15808 + }, + { + "epoch": 1.275845371640707, + "grad_norm": 0.6763594746589661, + "learning_rate": 2.112015425467133e-05, + "loss": 2.4147, + "step": 15809 + }, + { + "epoch": 1.2759260753772899, + "grad_norm": 0.6886053085327148, + "learning_rate": 2.1110451790502405e-05, + "loss": 2.3798, + "step": 15810 + }, + { + "epoch": 1.276006779113873, + "grad_norm": 0.686122715473175, + "learning_rate": 2.110075129246728e-05, + "loss": 2.3896, + "step": 15811 + }, + { + "epoch": 1.276087482850456, + "grad_norm": 0.6989614367485046, + "learning_rate": 2.109105276080764e-05, + "loss": 2.4533, + "step": 15812 + }, + { + "epoch": 1.276168186587039, + "grad_norm": 0.6818450689315796, + "learning_rate": 2.1081356195765232e-05, + "loss": 2.4012, + "step": 15813 + }, + { + "epoch": 1.276248890323622, + "grad_norm": 0.7492663860321045, + "learning_rate": 2.107166159758176e-05, + "loss": 2.4269, + "step": 15814 + }, + { + "epoch": 1.276329594060205, + "grad_norm": 0.6752359867095947, + "learning_rate": 2.1061968966498767e-05, + "loss": 2.4478, + "step": 15815 + }, + { + "epoch": 1.276410297796788, + "grad_norm": 
0.6784162521362305, + "learning_rate": 2.1052278302757854e-05, + "loss": 2.4853, + "step": 15816 + }, + { + "epoch": 1.276491001533371, + "grad_norm": 0.7273215651512146, + "learning_rate": 2.104258960660055e-05, + "loss": 2.4365, + "step": 15817 + }, + { + "epoch": 1.2765717052699541, + "grad_norm": 0.7021621465682983, + "learning_rate": 2.1032902878268323e-05, + "loss": 2.4665, + "step": 15818 + }, + { + "epoch": 1.276652409006537, + "grad_norm": 0.666828989982605, + "learning_rate": 2.102321811800253e-05, + "loss": 2.3922, + "step": 15819 + }, + { + "epoch": 1.27673311274312, + "grad_norm": 0.6780487298965454, + "learning_rate": 2.1013535326044608e-05, + "loss": 2.4072, + "step": 15820 + }, + { + "epoch": 1.276813816479703, + "grad_norm": 0.6474688053131104, + "learning_rate": 2.1003854502635888e-05, + "loss": 2.4145, + "step": 15821 + }, + { + "epoch": 1.276894520216286, + "grad_norm": 0.6712753772735596, + "learning_rate": 2.0994175648017587e-05, + "loss": 2.4349, + "step": 15822 + }, + { + "epoch": 1.2769752239528689, + "grad_norm": 0.6705189943313599, + "learning_rate": 2.098449876243096e-05, + "loss": 2.4376, + "step": 15823 + }, + { + "epoch": 1.277055927689452, + "grad_norm": 0.6794685125350952, + "learning_rate": 2.0974823846117197e-05, + "loss": 2.3717, + "step": 15824 + }, + { + "epoch": 1.277136631426035, + "grad_norm": 0.7145677804946899, + "learning_rate": 2.0965150899317364e-05, + "loss": 2.3829, + "step": 15825 + }, + { + "epoch": 1.277217335162618, + "grad_norm": 0.7043245434761047, + "learning_rate": 2.095547992227257e-05, + "loss": 2.405, + "step": 15826 + }, + { + "epoch": 1.277298038899201, + "grad_norm": 0.7969205379486084, + "learning_rate": 2.0945810915223873e-05, + "loss": 2.4115, + "step": 15827 + }, + { + "epoch": 1.277378742635784, + "grad_norm": 0.657482385635376, + "learning_rate": 2.0936143878412186e-05, + "loss": 2.372, + "step": 15828 + }, + { + "epoch": 1.277459446372367, + "grad_norm": 0.7315167784690857, + "learning_rate": 
2.0926478812078466e-05, + "loss": 2.4372, + "step": 15829 + }, + { + "epoch": 1.27754015010895, + "grad_norm": 0.6985061764717102, + "learning_rate": 2.09168157164636e-05, + "loss": 2.3901, + "step": 15830 + }, + { + "epoch": 1.2776208538455331, + "grad_norm": 0.6906184554100037, + "learning_rate": 2.0907154591808408e-05, + "loss": 2.4562, + "step": 15831 + }, + { + "epoch": 1.277701557582116, + "grad_norm": 0.655094563961029, + "learning_rate": 2.0897495438353676e-05, + "loss": 2.451, + "step": 15832 + }, + { + "epoch": 1.277782261318699, + "grad_norm": 0.7663134932518005, + "learning_rate": 2.0887838256340143e-05, + "loss": 2.4634, + "step": 15833 + }, + { + "epoch": 1.2778629650552822, + "grad_norm": 0.7164491415023804, + "learning_rate": 2.087818304600849e-05, + "loss": 2.4624, + "step": 15834 + }, + { + "epoch": 1.277943668791865, + "grad_norm": 0.6962822079658508, + "learning_rate": 2.0868529807599336e-05, + "loss": 2.4325, + "step": 15835 + }, + { + "epoch": 1.2780243725284481, + "grad_norm": 0.702985405921936, + "learning_rate": 2.0858878541353255e-05, + "loss": 2.4219, + "step": 15836 + }, + { + "epoch": 1.278105076265031, + "grad_norm": 0.7605595588684082, + "learning_rate": 2.0849229247510826e-05, + "loss": 2.4201, + "step": 15837 + }, + { + "epoch": 1.278185780001614, + "grad_norm": 0.8479344248771667, + "learning_rate": 2.083958192631249e-05, + "loss": 2.4689, + "step": 15838 + }, + { + "epoch": 1.278266483738197, + "grad_norm": 0.7241235375404358, + "learning_rate": 2.082993657799869e-05, + "loss": 2.4861, + "step": 15839 + }, + { + "epoch": 1.27834718747478, + "grad_norm": 0.7069835066795349, + "learning_rate": 2.0820293202809827e-05, + "loss": 2.3759, + "step": 15840 + }, + { + "epoch": 1.2784278912113631, + "grad_norm": 0.6606370210647583, + "learning_rate": 2.0810651800986237e-05, + "loss": 2.4444, + "step": 15841 + }, + { + "epoch": 1.278508594947946, + "grad_norm": 0.6608174443244934, + "learning_rate": 2.08010123727682e-05, + "loss": 2.4339, + 
"step": 15842 + }, + { + "epoch": 1.278589298684529, + "grad_norm": 0.751000702381134, + "learning_rate": 2.0791374918396e-05, + "loss": 2.4327, + "step": 15843 + }, + { + "epoch": 1.2786700024211122, + "grad_norm": 0.7223808765411377, + "learning_rate": 2.0781739438109748e-05, + "loss": 2.3573, + "step": 15844 + }, + { + "epoch": 1.278750706157695, + "grad_norm": 0.6872109770774841, + "learning_rate": 2.0772105932149642e-05, + "loss": 2.3973, + "step": 15845 + }, + { + "epoch": 1.2788314098942781, + "grad_norm": 0.6967385411262512, + "learning_rate": 2.0762474400755762e-05, + "loss": 2.4622, + "step": 15846 + }, + { + "epoch": 1.2789121136308612, + "grad_norm": 0.7289159893989563, + "learning_rate": 2.0752844844168163e-05, + "loss": 2.4507, + "step": 15847 + }, + { + "epoch": 1.278992817367444, + "grad_norm": 0.7735978364944458, + "learning_rate": 2.0743217262626802e-05, + "loss": 2.4341, + "step": 15848 + }, + { + "epoch": 1.2790735211040272, + "grad_norm": 0.7209177017211914, + "learning_rate": 2.0733591656371655e-05, + "loss": 2.4024, + "step": 15849 + }, + { + "epoch": 1.2791542248406103, + "grad_norm": 0.6789259314537048, + "learning_rate": 2.0723968025642604e-05, + "loss": 2.3809, + "step": 15850 + }, + { + "epoch": 1.2792349285771931, + "grad_norm": 0.6972812414169312, + "learning_rate": 2.0714346370679495e-05, + "loss": 2.3986, + "step": 15851 + }, + { + "epoch": 1.2793156323137762, + "grad_norm": 0.7144166827201843, + "learning_rate": 2.070472669172213e-05, + "loss": 2.4241, + "step": 15852 + }, + { + "epoch": 1.279396336050359, + "grad_norm": 0.7325223088264465, + "learning_rate": 2.0695108989010282e-05, + "loss": 2.452, + "step": 15853 + }, + { + "epoch": 1.2794770397869422, + "grad_norm": 0.6900116205215454, + "learning_rate": 2.0685493262783608e-05, + "loss": 2.4091, + "step": 15854 + }, + { + "epoch": 1.279557743523525, + "grad_norm": 0.6846197843551636, + "learning_rate": 2.0675879513281758e-05, + "loss": 2.4337, + "step": 15855 + }, + { + "epoch": 
1.2796384472601081, + "grad_norm": 0.6901541352272034, + "learning_rate": 2.0666267740744372e-05, + "loss": 2.4586, + "step": 15856 + }, + { + "epoch": 1.2797191509966912, + "grad_norm": 0.6842665672302246, + "learning_rate": 2.0656657945410953e-05, + "loss": 2.4383, + "step": 15857 + }, + { + "epoch": 1.279799854733274, + "grad_norm": 0.7450493574142456, + "learning_rate": 2.0647050127521028e-05, + "loss": 2.4308, + "step": 15858 + }, + { + "epoch": 1.2798805584698572, + "grad_norm": 0.6928436160087585, + "learning_rate": 2.0637444287314033e-05, + "loss": 2.4726, + "step": 15859 + }, + { + "epoch": 1.2799612622064402, + "grad_norm": 0.6539968252182007, + "learning_rate": 2.06278404250294e-05, + "loss": 2.3983, + "step": 15860 + }, + { + "epoch": 1.280041965943023, + "grad_norm": 0.7183163166046143, + "learning_rate": 2.0618238540906444e-05, + "loss": 2.4172, + "step": 15861 + }, + { + "epoch": 1.2801226696796062, + "grad_norm": 0.7070814371109009, + "learning_rate": 2.0608638635184507e-05, + "loss": 2.4018, + "step": 15862 + }, + { + "epoch": 1.2802033734161893, + "grad_norm": 0.7589142918586731, + "learning_rate": 2.0599040708102847e-05, + "loss": 2.4175, + "step": 15863 + }, + { + "epoch": 1.2802840771527721, + "grad_norm": 0.6945414543151855, + "learning_rate": 2.0589444759900613e-05, + "loss": 2.4093, + "step": 15864 + }, + { + "epoch": 1.2803647808893552, + "grad_norm": 0.685482919216156, + "learning_rate": 2.0579850790817003e-05, + "loss": 2.4388, + "step": 15865 + }, + { + "epoch": 1.280445484625938, + "grad_norm": 0.7089706063270569, + "learning_rate": 2.0570258801091148e-05, + "loss": 2.3779, + "step": 15866 + }, + { + "epoch": 1.2805261883625212, + "grad_norm": 0.6994217038154602, + "learning_rate": 2.0560668790962046e-05, + "loss": 2.3757, + "step": 15867 + }, + { + "epoch": 1.280606892099104, + "grad_norm": 0.7170232534408569, + "learning_rate": 2.055108076066874e-05, + "loss": 2.4087, + "step": 15868 + }, + { + "epoch": 1.2806875958356871, + 
"grad_norm": 0.7008751034736633, + "learning_rate": 2.0541494710450206e-05, + "loss": 2.4384, + "step": 15869 + }, + { + "epoch": 1.2807682995722702, + "grad_norm": 0.6795800924301147, + "learning_rate": 2.053191064054527e-05, + "loss": 2.415, + "step": 15870 + }, + { + "epoch": 1.280849003308853, + "grad_norm": 0.6650210022926331, + "learning_rate": 2.0522328551192882e-05, + "loss": 2.4421, + "step": 15871 + }, + { + "epoch": 1.2809297070454362, + "grad_norm": 0.7045374512672424, + "learning_rate": 2.0512748442631858e-05, + "loss": 2.4285, + "step": 15872 + }, + { + "epoch": 1.2810104107820193, + "grad_norm": 0.6585350632667542, + "learning_rate": 2.0503170315100883e-05, + "loss": 2.3806, + "step": 15873 + }, + { + "epoch": 1.2810911145186021, + "grad_norm": 0.7833496332168579, + "learning_rate": 2.0493594168838725e-05, + "loss": 2.4557, + "step": 15874 + }, + { + "epoch": 1.2811718182551852, + "grad_norm": 0.7237457036972046, + "learning_rate": 2.0484020004084048e-05, + "loss": 2.3966, + "step": 15875 + }, + { + "epoch": 1.2812525219917683, + "grad_norm": 0.7416609525680542, + "learning_rate": 2.0474447821075426e-05, + "loss": 2.3729, + "step": 15876 + }, + { + "epoch": 1.2813332257283512, + "grad_norm": 0.7148095369338989, + "learning_rate": 2.046487762005146e-05, + "loss": 2.4163, + "step": 15877 + }, + { + "epoch": 1.2814139294649343, + "grad_norm": 0.670281171798706, + "learning_rate": 2.0455309401250632e-05, + "loss": 2.383, + "step": 15878 + }, + { + "epoch": 1.2814946332015174, + "grad_norm": 0.6968950629234314, + "learning_rate": 2.0445743164911457e-05, + "loss": 2.3967, + "step": 15879 + }, + { + "epoch": 1.2815753369381002, + "grad_norm": 0.783441960811615, + "learning_rate": 2.0436178911272298e-05, + "loss": 2.455, + "step": 15880 + }, + { + "epoch": 1.2816560406746833, + "grad_norm": 0.709032416343689, + "learning_rate": 2.0426616640571518e-05, + "loss": 2.4207, + "step": 15881 + }, + { + "epoch": 1.2817367444112662, + "grad_norm": 0.6727990508079529, 
+ "learning_rate": 2.0417056353047504e-05, + "loss": 2.4115, + "step": 15882 + }, + { + "epoch": 1.2818174481478493, + "grad_norm": 0.7336034774780273, + "learning_rate": 2.0407498048938445e-05, + "loss": 2.43, + "step": 15883 + }, + { + "epoch": 1.2818981518844321, + "grad_norm": 0.7649042010307312, + "learning_rate": 2.0397941728482604e-05, + "loss": 2.4655, + "step": 15884 + }, + { + "epoch": 1.2819788556210152, + "grad_norm": 0.7218052744865417, + "learning_rate": 2.038838739191816e-05, + "loss": 2.4872, + "step": 15885 + }, + { + "epoch": 1.2820595593575983, + "grad_norm": 0.7192350625991821, + "learning_rate": 2.0378835039483178e-05, + "loss": 2.4751, + "step": 15886 + }, + { + "epoch": 1.2821402630941812, + "grad_norm": 0.7059212923049927, + "learning_rate": 2.0369284671415768e-05, + "loss": 2.43, + "step": 15887 + }, + { + "epoch": 1.2822209668307643, + "grad_norm": 0.7387098073959351, + "learning_rate": 2.0359736287953956e-05, + "loss": 2.4281, + "step": 15888 + }, + { + "epoch": 1.2823016705673473, + "grad_norm": 0.7454321980476379, + "learning_rate": 2.035018988933568e-05, + "loss": 2.4372, + "step": 15889 + }, + { + "epoch": 1.2823823743039302, + "grad_norm": 0.6822765469551086, + "learning_rate": 2.034064547579888e-05, + "loss": 2.3728, + "step": 15890 + }, + { + "epoch": 1.2824630780405133, + "grad_norm": 0.6917527914047241, + "learning_rate": 2.0331103047581412e-05, + "loss": 2.3997, + "step": 15891 + }, + { + "epoch": 1.2825437817770964, + "grad_norm": 0.6734376549720764, + "learning_rate": 2.032156260492113e-05, + "loss": 2.4495, + "step": 15892 + }, + { + "epoch": 1.2826244855136792, + "grad_norm": 0.7222443222999573, + "learning_rate": 2.0312024148055776e-05, + "loss": 2.3466, + "step": 15893 + }, + { + "epoch": 1.2827051892502623, + "grad_norm": 0.703714907169342, + "learning_rate": 2.030248767722309e-05, + "loss": 2.4599, + "step": 15894 + }, + { + "epoch": 1.2827858929868454, + "grad_norm": 0.655161440372467, + "learning_rate": 
2.029295319266078e-05, + "loss": 2.3896, + "step": 15895 + }, + { + "epoch": 1.2828665967234283, + "grad_norm": 0.6449242234230042, + "learning_rate": 2.028342069460639e-05, + "loss": 2.3511, + "step": 15896 + }, + { + "epoch": 1.2829473004600114, + "grad_norm": 0.6578382849693298, + "learning_rate": 2.027389018329755e-05, + "loss": 2.3678, + "step": 15897 + }, + { + "epoch": 1.2830280041965942, + "grad_norm": 0.7047572731971741, + "learning_rate": 2.0264361658971797e-05, + "loss": 2.4522, + "step": 15898 + }, + { + "epoch": 1.2831087079331773, + "grad_norm": 0.7310267090797424, + "learning_rate": 2.0254835121866554e-05, + "loss": 2.4117, + "step": 15899 + }, + { + "epoch": 1.2831894116697602, + "grad_norm": 0.7020776867866516, + "learning_rate": 2.024531057221927e-05, + "loss": 2.4033, + "step": 15900 + }, + { + "epoch": 1.2832701154063433, + "grad_norm": 0.6967746615409851, + "learning_rate": 2.023578801026733e-05, + "loss": 2.3491, + "step": 15901 + }, + { + "epoch": 1.2833508191429264, + "grad_norm": 0.7062339782714844, + "learning_rate": 2.022626743624807e-05, + "loss": 2.4598, + "step": 15902 + }, + { + "epoch": 1.2834315228795092, + "grad_norm": 0.730625331401825, + "learning_rate": 2.0216748850398748e-05, + "loss": 2.4995, + "step": 15903 + }, + { + "epoch": 1.2835122266160923, + "grad_norm": 0.6634403467178345, + "learning_rate": 2.020723225295662e-05, + "loss": 2.3843, + "step": 15904 + }, + { + "epoch": 1.2835929303526754, + "grad_norm": 0.6924816966056824, + "learning_rate": 2.019771764415883e-05, + "loss": 2.4258, + "step": 15905 + }, + { + "epoch": 1.2836736340892583, + "grad_norm": 0.7127227187156677, + "learning_rate": 2.018820502424251e-05, + "loss": 2.4038, + "step": 15906 + }, + { + "epoch": 1.2837543378258414, + "grad_norm": 0.7108431458473206, + "learning_rate": 2.0178694393444785e-05, + "loss": 2.4571, + "step": 15907 + }, + { + "epoch": 1.2838350415624245, + "grad_norm": 0.7478229999542236, + "learning_rate": 2.016918575200262e-05, + "loss": 
2.4526, + "step": 15908 + }, + { + "epoch": 1.2839157452990073, + "grad_norm": 0.65651935338974, + "learning_rate": 2.015967910015303e-05, + "loss": 2.434, + "step": 15909 + }, + { + "epoch": 1.2839964490355904, + "grad_norm": 0.7285312414169312, + "learning_rate": 2.015017443813294e-05, + "loss": 2.3857, + "step": 15910 + }, + { + "epoch": 1.2840771527721733, + "grad_norm": 0.6947231292724609, + "learning_rate": 2.014067176617923e-05, + "loss": 2.4294, + "step": 15911 + }, + { + "epoch": 1.2841578565087564, + "grad_norm": 0.6965867877006531, + "learning_rate": 2.0131171084528744e-05, + "loss": 2.4514, + "step": 15912 + }, + { + "epoch": 1.2842385602453392, + "grad_norm": 0.6962311863899231, + "learning_rate": 2.0121672393418246e-05, + "loss": 2.4391, + "step": 15913 + }, + { + "epoch": 1.2843192639819223, + "grad_norm": 0.6687992215156555, + "learning_rate": 2.01121756930845e-05, + "loss": 2.4266, + "step": 15914 + }, + { + "epoch": 1.2843999677185054, + "grad_norm": 0.7118954658508301, + "learning_rate": 2.0102680983764145e-05, + "loss": 2.3436, + "step": 15915 + }, + { + "epoch": 1.2844806714550883, + "grad_norm": 0.6866199970245361, + "learning_rate": 2.009318826569382e-05, + "loss": 2.3719, + "step": 15916 + }, + { + "epoch": 1.2845613751916714, + "grad_norm": 0.6701404452323914, + "learning_rate": 2.008369753911016e-05, + "loss": 2.4875, + "step": 15917 + }, + { + "epoch": 1.2846420789282544, + "grad_norm": 0.7020917534828186, + "learning_rate": 2.007420880424963e-05, + "loss": 2.3871, + "step": 15918 + }, + { + "epoch": 1.2847227826648373, + "grad_norm": 0.6865704655647278, + "learning_rate": 2.006472206134875e-05, + "loss": 2.3815, + "step": 15919 + }, + { + "epoch": 1.2848034864014204, + "grad_norm": 0.7106871008872986, + "learning_rate": 2.0055237310643948e-05, + "loss": 2.4276, + "step": 15920 + }, + { + "epoch": 1.2848841901380035, + "grad_norm": 0.6891976594924927, + "learning_rate": 2.004575455237161e-05, + "loss": 2.3641, + "step": 15921 + }, + { + 
"epoch": 1.2849648938745863, + "grad_norm": 0.6385056972503662, + "learning_rate": 2.0036273786768067e-05, + "loss": 2.3898, + "step": 15922 + }, + { + "epoch": 1.2850455976111694, + "grad_norm": 0.7038321495056152, + "learning_rate": 2.0026795014069633e-05, + "loss": 2.4688, + "step": 15923 + }, + { + "epoch": 1.2851263013477525, + "grad_norm": 0.6310208439826965, + "learning_rate": 2.0017318234512494e-05, + "loss": 2.3821, + "step": 15924 + }, + { + "epoch": 1.2852070050843354, + "grad_norm": 0.6989426016807556, + "learning_rate": 2.0007843448332865e-05, + "loss": 2.434, + "step": 15925 + }, + { + "epoch": 1.2852877088209185, + "grad_norm": 0.6666426658630371, + "learning_rate": 1.9998370655766886e-05, + "loss": 2.4687, + "step": 15926 + }, + { + "epoch": 1.2853684125575013, + "grad_norm": 0.6421633958816528, + "learning_rate": 1.9988899857050648e-05, + "loss": 2.4269, + "step": 15927 + }, + { + "epoch": 1.2854491162940844, + "grad_norm": 0.7229343056678772, + "learning_rate": 1.997943105242016e-05, + "loss": 2.4139, + "step": 15928 + }, + { + "epoch": 1.2855298200306673, + "grad_norm": 0.7168964743614197, + "learning_rate": 1.9969964242111427e-05, + "loss": 2.405, + "step": 15929 + }, + { + "epoch": 1.2856105237672504, + "grad_norm": 0.6824480891227722, + "learning_rate": 1.99604994263604e-05, + "loss": 2.3955, + "step": 15930 + }, + { + "epoch": 1.2856912275038335, + "grad_norm": 0.670956552028656, + "learning_rate": 1.995103660540294e-05, + "loss": 2.3743, + "step": 15931 + }, + { + "epoch": 1.2857719312404163, + "grad_norm": 0.7057971954345703, + "learning_rate": 1.9941575779474864e-05, + "loss": 2.4496, + "step": 15932 + }, + { + "epoch": 1.2858526349769994, + "grad_norm": 0.7802264094352722, + "learning_rate": 1.9932116948812052e-05, + "loss": 2.4231, + "step": 15933 + }, + { + "epoch": 1.2859333387135825, + "grad_norm": 0.7151160836219788, + "learning_rate": 1.992266011365016e-05, + "loss": 2.4319, + "step": 15934 + }, + { + "epoch": 1.2860140424501654, + 
"grad_norm": 0.7078769207000732, + "learning_rate": 1.991320527422489e-05, + "loss": 2.4037, + "step": 15935 + }, + { + "epoch": 1.2860947461867485, + "grad_norm": 0.7483938336372375, + "learning_rate": 1.9903752430771927e-05, + "loss": 2.4946, + "step": 15936 + }, + { + "epoch": 1.2861754499233315, + "grad_norm": 0.7774620056152344, + "learning_rate": 1.9894301583526808e-05, + "loss": 2.4536, + "step": 15937 + }, + { + "epoch": 1.2862561536599144, + "grad_norm": 0.7311348915100098, + "learning_rate": 1.988485273272509e-05, + "loss": 2.4178, + "step": 15938 + }, + { + "epoch": 1.2863368573964975, + "grad_norm": 0.6821309328079224, + "learning_rate": 1.9875405878602282e-05, + "loss": 2.4851, + "step": 15939 + }, + { + "epoch": 1.2864175611330806, + "grad_norm": 0.7081651091575623, + "learning_rate": 1.9865961021393785e-05, + "loss": 2.4377, + "step": 15940 + }, + { + "epoch": 1.2864982648696635, + "grad_norm": 0.8093439340591431, + "learning_rate": 1.9856518161335014e-05, + "loss": 2.4681, + "step": 15941 + }, + { + "epoch": 1.2865789686062465, + "grad_norm": 0.6769521832466125, + "learning_rate": 1.984707729866131e-05, + "loss": 2.4231, + "step": 15942 + }, + { + "epoch": 1.2866596723428294, + "grad_norm": 0.6973356604576111, + "learning_rate": 1.983763843360795e-05, + "loss": 2.4144, + "step": 15943 + }, + { + "epoch": 1.2867403760794125, + "grad_norm": 0.7814682722091675, + "learning_rate": 1.9828201566410197e-05, + "loss": 2.3935, + "step": 15944 + }, + { + "epoch": 1.2868210798159954, + "grad_norm": 0.7545498609542847, + "learning_rate": 1.9818766697303236e-05, + "loss": 2.4136, + "step": 15945 + }, + { + "epoch": 1.2869017835525784, + "grad_norm": 0.7165581583976746, + "learning_rate": 1.9809333826522225e-05, + "loss": 2.3757, + "step": 15946 + }, + { + "epoch": 1.2869824872891615, + "grad_norm": 0.6812456846237183, + "learning_rate": 1.9799902954302208e-05, + "loss": 2.4143, + "step": 15947 + }, + { + "epoch": 1.2870631910257444, + "grad_norm": 
0.7231366634368896, + "learning_rate": 1.9790474080878262e-05, + "loss": 2.4837, + "step": 15948 + }, + { + "epoch": 1.2871438947623275, + "grad_norm": 0.690916121006012, + "learning_rate": 1.9781047206485393e-05, + "loss": 2.4513, + "step": 15949 + }, + { + "epoch": 1.2872245984989106, + "grad_norm": 0.6608129143714905, + "learning_rate": 1.9771622331358485e-05, + "loss": 2.3908, + "step": 15950 + }, + { + "epoch": 1.2873053022354934, + "grad_norm": 0.7194501161575317, + "learning_rate": 1.976219945573249e-05, + "loss": 2.38, + "step": 15951 + }, + { + "epoch": 1.2873860059720765, + "grad_norm": 0.7315083146095276, + "learning_rate": 1.9752778579842213e-05, + "loss": 2.4351, + "step": 15952 + }, + { + "epoch": 1.2874667097086596, + "grad_norm": 0.7313492298126221, + "learning_rate": 1.974335970392246e-05, + "loss": 2.3531, + "step": 15953 + }, + { + "epoch": 1.2875474134452425, + "grad_norm": 0.6982418894767761, + "learning_rate": 1.9733942828207985e-05, + "loss": 2.4319, + "step": 15954 + }, + { + "epoch": 1.2876281171818256, + "grad_norm": 0.6664792895317078, + "learning_rate": 1.972452795293347e-05, + "loss": 2.3981, + "step": 15955 + }, + { + "epoch": 1.2877088209184087, + "grad_norm": 0.6849696040153503, + "learning_rate": 1.9715115078333578e-05, + "loss": 2.3952, + "step": 15956 + }, + { + "epoch": 1.2877895246549915, + "grad_norm": 0.7355225086212158, + "learning_rate": 1.9705704204642873e-05, + "loss": 2.4556, + "step": 15957 + }, + { + "epoch": 1.2878702283915746, + "grad_norm": 0.6850876808166504, + "learning_rate": 1.9696295332095906e-05, + "loss": 2.3873, + "step": 15958 + }, + { + "epoch": 1.2879509321281575, + "grad_norm": 0.6449069976806641, + "learning_rate": 1.9686888460927198e-05, + "loss": 2.4226, + "step": 15959 + }, + { + "epoch": 1.2880316358647406, + "grad_norm": 0.7517794966697693, + "learning_rate": 1.967748359137114e-05, + "loss": 2.377, + "step": 15960 + }, + { + "epoch": 1.2881123396013234, + "grad_norm": 0.6861303448677063, + 
"learning_rate": 1.9668080723662162e-05, + "loss": 2.4451, + "step": 15961 + }, + { + "epoch": 1.2881930433379065, + "grad_norm": 0.7025154829025269, + "learning_rate": 1.9658679858034602e-05, + "loss": 2.3856, + "step": 15962 + }, + { + "epoch": 1.2882737470744896, + "grad_norm": 0.6775577068328857, + "learning_rate": 1.964928099472275e-05, + "loss": 2.4383, + "step": 15963 + }, + { + "epoch": 1.2883544508110725, + "grad_norm": 0.6889605522155762, + "learning_rate": 1.963988413396086e-05, + "loss": 2.3766, + "step": 15964 + }, + { + "epoch": 1.2884351545476556, + "grad_norm": 0.6697166562080383, + "learning_rate": 1.9630489275983156e-05, + "loss": 2.44, + "step": 15965 + }, + { + "epoch": 1.2885158582842386, + "grad_norm": 0.6895437836647034, + "learning_rate": 1.96210964210237e-05, + "loss": 2.4242, + "step": 15966 + }, + { + "epoch": 1.2885965620208215, + "grad_norm": 0.6955164670944214, + "learning_rate": 1.9611705569316652e-05, + "loss": 2.3915, + "step": 15967 + }, + { + "epoch": 1.2886772657574046, + "grad_norm": 0.7133461236953735, + "learning_rate": 1.960231672109605e-05, + "loss": 2.4307, + "step": 15968 + }, + { + "epoch": 1.2887579694939877, + "grad_norm": 0.6874761581420898, + "learning_rate": 1.9592929876595857e-05, + "loss": 2.4371, + "step": 15969 + }, + { + "epoch": 1.2888386732305706, + "grad_norm": 0.7168406248092651, + "learning_rate": 1.9583545036050044e-05, + "loss": 2.4681, + "step": 15970 + }, + { + "epoch": 1.2889193769671536, + "grad_norm": 0.701874852180481, + "learning_rate": 1.9574162199692492e-05, + "loss": 2.4746, + "step": 15971 + }, + { + "epoch": 1.2890000807037365, + "grad_norm": 0.7118390202522278, + "learning_rate": 1.9564781367757058e-05, + "loss": 2.4139, + "step": 15972 + }, + { + "epoch": 1.2890807844403196, + "grad_norm": 0.6597239971160889, + "learning_rate": 1.955540254047753e-05, + "loss": 2.4346, + "step": 15973 + }, + { + "epoch": 1.2891614881769025, + "grad_norm": 0.7461068630218506, + "learning_rate": 
1.9546025718087645e-05, + "loss": 2.4331, + "step": 15974 + }, + { + "epoch": 1.2892421919134855, + "grad_norm": 0.6992977857589722, + "learning_rate": 1.953665090082115e-05, + "loss": 2.424, + "step": 15975 + }, + { + "epoch": 1.2893228956500686, + "grad_norm": 0.6674031615257263, + "learning_rate": 1.9527278088911617e-05, + "loss": 2.4545, + "step": 15976 + }, + { + "epoch": 1.2894035993866515, + "grad_norm": 0.7377402782440186, + "learning_rate": 1.9517907282592662e-05, + "loss": 2.4625, + "step": 15977 + }, + { + "epoch": 1.2894843031232346, + "grad_norm": 0.720579206943512, + "learning_rate": 1.950853848209788e-05, + "loss": 2.4073, + "step": 15978 + }, + { + "epoch": 1.2895650068598177, + "grad_norm": 0.7221893668174744, + "learning_rate": 1.9499171687660688e-05, + "loss": 2.4056, + "step": 15979 + }, + { + "epoch": 1.2896457105964005, + "grad_norm": 0.7409725189208984, + "learning_rate": 1.9489806899514574e-05, + "loss": 2.3899, + "step": 15980 + }, + { + "epoch": 1.2897264143329836, + "grad_norm": 0.6946583986282349, + "learning_rate": 1.948044411789296e-05, + "loss": 2.4832, + "step": 15981 + }, + { + "epoch": 1.2898071180695667, + "grad_norm": 0.7031306028366089, + "learning_rate": 1.9471083343029096e-05, + "loss": 2.4265, + "step": 15982 + }, + { + "epoch": 1.2898878218061496, + "grad_norm": 0.660093367099762, + "learning_rate": 1.946172457515637e-05, + "loss": 2.4883, + "step": 15983 + }, + { + "epoch": 1.2899685255427327, + "grad_norm": 0.700641930103302, + "learning_rate": 1.945236781450802e-05, + "loss": 2.4096, + "step": 15984 + }, + { + "epoch": 1.2900492292793158, + "grad_norm": 0.7350760698318481, + "learning_rate": 1.9443013061317205e-05, + "loss": 2.4161, + "step": 15985 + }, + { + "epoch": 1.2901299330158986, + "grad_norm": 0.7567386031150818, + "learning_rate": 1.9433660315817072e-05, + "loss": 2.3978, + "step": 15986 + }, + { + "epoch": 1.2902106367524817, + "grad_norm": 0.7471369504928589, + "learning_rate": 1.9424309578240717e-05, + 
"loss": 2.4079, + "step": 15987 + }, + { + "epoch": 1.2902913404890646, + "grad_norm": 0.6630815267562866, + "learning_rate": 1.941496084882124e-05, + "loss": 2.4223, + "step": 15988 + }, + { + "epoch": 1.2903720442256477, + "grad_norm": 0.687224268913269, + "learning_rate": 1.940561412779155e-05, + "loss": 2.4413, + "step": 15989 + }, + { + "epoch": 1.2904527479622305, + "grad_norm": 0.6989685297012329, + "learning_rate": 1.9396269415384637e-05, + "loss": 2.3651, + "step": 15990 + }, + { + "epoch": 1.2905334516988136, + "grad_norm": 0.7256720066070557, + "learning_rate": 1.938692671183342e-05, + "loss": 2.4526, + "step": 15991 + }, + { + "epoch": 1.2906141554353967, + "grad_norm": 0.692032516002655, + "learning_rate": 1.9377586017370685e-05, + "loss": 2.3936, + "step": 15992 + }, + { + "epoch": 1.2906948591719796, + "grad_norm": 0.6733511686325073, + "learning_rate": 1.936824733222925e-05, + "loss": 2.4691, + "step": 15993 + }, + { + "epoch": 1.2907755629085627, + "grad_norm": 0.6698563098907471, + "learning_rate": 1.935891065664187e-05, + "loss": 2.3904, + "step": 15994 + }, + { + "epoch": 1.2908562666451457, + "grad_norm": 0.660521388053894, + "learning_rate": 1.934957599084123e-05, + "loss": 2.4647, + "step": 15995 + }, + { + "epoch": 1.2909369703817286, + "grad_norm": 0.6714615821838379, + "learning_rate": 1.9340243335059982e-05, + "loss": 2.403, + "step": 15996 + }, + { + "epoch": 1.2910176741183117, + "grad_norm": 0.726099967956543, + "learning_rate": 1.9330912689530746e-05, + "loss": 2.4101, + "step": 15997 + }, + { + "epoch": 1.2910983778548948, + "grad_norm": 0.6585896015167236, + "learning_rate": 1.932158405448601e-05, + "loss": 2.3813, + "step": 15998 + }, + { + "epoch": 1.2911790815914777, + "grad_norm": 0.7967908382415771, + "learning_rate": 1.9312257430158286e-05, + "loss": 2.4188, + "step": 15999 + }, + { + "epoch": 1.2912597853280607, + "grad_norm": 0.7340367436408997, + "learning_rate": 1.9302932816780063e-05, + "loss": 2.4642, + "step": 16000 + 
}, + { + "epoch": 1.2912597853280607, + "eval_loss": 2.3791537284851074, + "eval_runtime": 780.6124, + "eval_samples_per_second": 3.356, + "eval_steps_per_second": 0.56, + "step": 16000 + }, + { + "epoch": 1.2913404890646438, + "grad_norm": 0.6778663992881775, + "learning_rate": 1.929361021458367e-05, + "loss": 2.4057, + "step": 16001 + }, + { + "epoch": 1.2914211928012267, + "grad_norm": 0.6982381343841553, + "learning_rate": 1.9284289623801477e-05, + "loss": 2.4376, + "step": 16002 + }, + { + "epoch": 1.2915018965378098, + "grad_norm": 0.6956612467765808, + "learning_rate": 1.927497104466578e-05, + "loss": 2.4485, + "step": 16003 + }, + { + "epoch": 1.2915826002743926, + "grad_norm": 0.6780211925506592, + "learning_rate": 1.9265654477408825e-05, + "loss": 2.4233, + "step": 16004 + }, + { + "epoch": 1.2916633040109757, + "grad_norm": 0.6869028806686401, + "learning_rate": 1.92563399222628e-05, + "loss": 2.4156, + "step": 16005 + }, + { + "epoch": 1.2917440077475586, + "grad_norm": 0.6402696967124939, + "learning_rate": 1.9247027379459848e-05, + "loss": 2.4208, + "step": 16006 + }, + { + "epoch": 1.2918247114841417, + "grad_norm": 0.6868177652359009, + "learning_rate": 1.92377168492321e-05, + "loss": 2.4067, + "step": 16007 + }, + { + "epoch": 1.2919054152207248, + "grad_norm": 0.7152438759803772, + "learning_rate": 1.922840833181152e-05, + "loss": 2.3944, + "step": 16008 + }, + { + "epoch": 1.2919861189573076, + "grad_norm": 0.6467335820198059, + "learning_rate": 1.921910182743015e-05, + "loss": 2.4064, + "step": 16009 + }, + { + "epoch": 1.2920668226938907, + "grad_norm": 0.6918551325798035, + "learning_rate": 1.9209797336319956e-05, + "loss": 2.4457, + "step": 16010 + }, + { + "epoch": 1.2921475264304738, + "grad_norm": 0.7308588027954102, + "learning_rate": 1.920049485871278e-05, + "loss": 2.3785, + "step": 16011 + }, + { + "epoch": 1.2922282301670567, + "grad_norm": 0.6918718814849854, + "learning_rate": 1.9191194394840472e-05, + "loss": 2.4645, + "step": 
16012 + }, + { + "epoch": 1.2923089339036398, + "grad_norm": 0.7048078775405884, + "learning_rate": 1.9181895944934848e-05, + "loss": 2.4082, + "step": 16013 + }, + { + "epoch": 1.2923896376402229, + "grad_norm": 0.7175794839859009, + "learning_rate": 1.917259950922763e-05, + "loss": 2.4521, + "step": 16014 + }, + { + "epoch": 1.2924703413768057, + "grad_norm": 0.6895543932914734, + "learning_rate": 1.916330508795051e-05, + "loss": 2.4058, + "step": 16015 + }, + { + "epoch": 1.2925510451133888, + "grad_norm": 0.6951895952224731, + "learning_rate": 1.9154012681335176e-05, + "loss": 2.4274, + "step": 16016 + }, + { + "epoch": 1.2926317488499717, + "grad_norm": 0.6807428598403931, + "learning_rate": 1.9144722289613148e-05, + "loss": 2.4008, + "step": 16017 + }, + { + "epoch": 1.2927124525865548, + "grad_norm": 0.6643410325050354, + "learning_rate": 1.9135433913015997e-05, + "loss": 2.4036, + "step": 16018 + }, + { + "epoch": 1.2927931563231376, + "grad_norm": 0.7283294796943665, + "learning_rate": 1.912614755177522e-05, + "loss": 2.4118, + "step": 16019 + }, + { + "epoch": 1.2928738600597207, + "grad_norm": 0.7516021132469177, + "learning_rate": 1.911686320612227e-05, + "loss": 2.3983, + "step": 16020 + }, + { + "epoch": 1.2929545637963038, + "grad_norm": 0.7314203381538391, + "learning_rate": 1.91075808762885e-05, + "loss": 2.4352, + "step": 16021 + }, + { + "epoch": 1.2930352675328867, + "grad_norm": 0.6904106736183167, + "learning_rate": 1.9098300562505266e-05, + "loss": 2.3734, + "step": 16022 + }, + { + "epoch": 1.2931159712694698, + "grad_norm": 0.6936709880828857, + "learning_rate": 1.9089022265003863e-05, + "loss": 2.4356, + "step": 16023 + }, + { + "epoch": 1.2931966750060528, + "grad_norm": 0.6753442883491516, + "learning_rate": 1.9079745984015528e-05, + "loss": 2.4713, + "step": 16024 + }, + { + "epoch": 1.2932773787426357, + "grad_norm": 0.7185340523719788, + "learning_rate": 1.9070471719771445e-05, + "loss": 2.4021, + "step": 16025 + }, + { + "epoch": 
1.2933580824792188, + "grad_norm": 0.7486871480941772, + "learning_rate": 1.9061199472502798e-05, + "loss": 2.4144, + "step": 16026 + }, + { + "epoch": 1.2934387862158019, + "grad_norm": 0.6790735721588135, + "learning_rate": 1.90519292424406e-05, + "loss": 2.413, + "step": 16027 + }, + { + "epoch": 1.2935194899523847, + "grad_norm": 0.7104402780532837, + "learning_rate": 1.9042661029815922e-05, + "loss": 2.452, + "step": 16028 + }, + { + "epoch": 1.2936001936889678, + "grad_norm": 0.6975364685058594, + "learning_rate": 1.9033394834859796e-05, + "loss": 2.4169, + "step": 16029 + }, + { + "epoch": 1.293680897425551, + "grad_norm": 0.7619667649269104, + "learning_rate": 1.9024130657803085e-05, + "loss": 2.4106, + "step": 16030 + }, + { + "epoch": 1.2937616011621338, + "grad_norm": 0.6600254774093628, + "learning_rate": 1.9014868498876716e-05, + "loss": 2.3955, + "step": 16031 + }, + { + "epoch": 1.2938423048987169, + "grad_norm": 0.6790784597396851, + "learning_rate": 1.9005608358311533e-05, + "loss": 2.437, + "step": 16032 + }, + { + "epoch": 1.2939230086352997, + "grad_norm": 0.7085568308830261, + "learning_rate": 1.899635023633828e-05, + "loss": 2.4729, + "step": 16033 + }, + { + "epoch": 1.2940037123718828, + "grad_norm": 0.6940603256225586, + "learning_rate": 1.8987094133187732e-05, + "loss": 2.4099, + "step": 16034 + }, + { + "epoch": 1.2940844161084657, + "grad_norm": 0.7387171387672424, + "learning_rate": 1.897784004909058e-05, + "loss": 2.4509, + "step": 16035 + }, + { + "epoch": 1.2941651198450488, + "grad_norm": 0.8263981938362122, + "learning_rate": 1.8968587984277463e-05, + "loss": 2.4208, + "step": 16036 + }, + { + "epoch": 1.2942458235816319, + "grad_norm": 0.7393552660942078, + "learning_rate": 1.8959337938978937e-05, + "loss": 2.4458, + "step": 16037 + }, + { + "epoch": 1.2943265273182147, + "grad_norm": 0.652787983417511, + "learning_rate": 1.895008991342555e-05, + "loss": 2.3593, + "step": 16038 + }, + { + "epoch": 1.2944072310547978, + 
"grad_norm": 0.6533015370368958, + "learning_rate": 1.8940843907847817e-05, + "loss": 2.4538, + "step": 16039 + }, + { + "epoch": 1.294487934791381, + "grad_norm": 0.6723785400390625, + "learning_rate": 1.8931599922476106e-05, + "loss": 2.4528, + "step": 16040 + }, + { + "epoch": 1.2945686385279638, + "grad_norm": 0.693242073059082, + "learning_rate": 1.892235795754085e-05, + "loss": 2.4006, + "step": 16041 + }, + { + "epoch": 1.2946493422645469, + "grad_norm": 0.6849604845046997, + "learning_rate": 1.8913118013272403e-05, + "loss": 2.3758, + "step": 16042 + }, + { + "epoch": 1.29473004600113, + "grad_norm": 0.7252739667892456, + "learning_rate": 1.8903880089900983e-05, + "loss": 2.4101, + "step": 16043 + }, + { + "epoch": 1.2948107497377128, + "grad_norm": 0.720431923866272, + "learning_rate": 1.8894644187656864e-05, + "loss": 2.4241, + "step": 16044 + }, + { + "epoch": 1.294891453474296, + "grad_norm": 0.6936169862747192, + "learning_rate": 1.8885410306770225e-05, + "loss": 2.4225, + "step": 16045 + }, + { + "epoch": 1.294972157210879, + "grad_norm": 0.7698646187782288, + "learning_rate": 1.8876178447471193e-05, + "loss": 2.4031, + "step": 16046 + }, + { + "epoch": 1.2950528609474619, + "grad_norm": 0.6800495982170105, + "learning_rate": 1.8866948609989854e-05, + "loss": 2.3679, + "step": 16047 + }, + { + "epoch": 1.295133564684045, + "grad_norm": 0.7348111867904663, + "learning_rate": 1.8857720794556267e-05, + "loss": 2.4263, + "step": 16048 + }, + { + "epoch": 1.2952142684206278, + "grad_norm": 0.6614782214164734, + "learning_rate": 1.8848495001400356e-05, + "loss": 2.4396, + "step": 16049 + }, + { + "epoch": 1.295294972157211, + "grad_norm": 0.6683650612831116, + "learning_rate": 1.8839271230752075e-05, + "loss": 2.4189, + "step": 16050 + }, + { + "epoch": 1.2953756758937938, + "grad_norm": 0.711040198802948, + "learning_rate": 1.8830049482841328e-05, + "loss": 2.3974, + "step": 16051 + }, + { + "epoch": 1.2954563796303769, + "grad_norm": 0.6663193702697754, + 
"learning_rate": 1.882082975789795e-05, + "loss": 2.4196, + "step": 16052 + }, + { + "epoch": 1.29553708336696, + "grad_norm": 0.6551210284233093, + "learning_rate": 1.881161205615166e-05, + "loss": 2.3793, + "step": 16053 + }, + { + "epoch": 1.2956177871035428, + "grad_norm": 0.6849039793014526, + "learning_rate": 1.8802396377832243e-05, + "loss": 2.3941, + "step": 16054 + }, + { + "epoch": 1.295698490840126, + "grad_norm": 0.7642949223518372, + "learning_rate": 1.8793182723169357e-05, + "loss": 2.4296, + "step": 16055 + }, + { + "epoch": 1.295779194576709, + "grad_norm": 0.7104716897010803, + "learning_rate": 1.878397109239263e-05, + "loss": 2.4124, + "step": 16056 + }, + { + "epoch": 1.2958598983132918, + "grad_norm": 0.6822344064712524, + "learning_rate": 1.877476148573164e-05, + "loss": 2.4072, + "step": 16057 + }, + { + "epoch": 1.295940602049875, + "grad_norm": 0.6824066042900085, + "learning_rate": 1.8765553903415956e-05, + "loss": 2.4137, + "step": 16058 + }, + { + "epoch": 1.296021305786458, + "grad_norm": 0.7083307504653931, + "learning_rate": 1.875634834567498e-05, + "loss": 2.4423, + "step": 16059 + }, + { + "epoch": 1.2961020095230409, + "grad_norm": 0.7301077246665955, + "learning_rate": 1.874714481273818e-05, + "loss": 2.3926, + "step": 16060 + }, + { + "epoch": 1.296182713259624, + "grad_norm": 0.685656726360321, + "learning_rate": 1.873794330483496e-05, + "loss": 2.4409, + "step": 16061 + }, + { + "epoch": 1.296263416996207, + "grad_norm": 0.6916719675064087, + "learning_rate": 1.8728743822194584e-05, + "loss": 2.4141, + "step": 16062 + }, + { + "epoch": 1.29634412073279, + "grad_norm": 0.7188845276832581, + "learning_rate": 1.871954636504636e-05, + "loss": 2.4186, + "step": 16063 + }, + { + "epoch": 1.2964248244693728, + "grad_norm": 0.6637440919876099, + "learning_rate": 1.8710350933619504e-05, + "loss": 2.4526, + "step": 16064 + }, + { + "epoch": 1.2965055282059559, + "grad_norm": 0.7000349760055542, + "learning_rate": 1.87011575281432e-05, + 
"loss": 2.4096, + "step": 16065 + }, + { + "epoch": 1.296586231942539, + "grad_norm": 0.693513810634613, + "learning_rate": 1.8691966148846573e-05, + "loss": 2.3931, + "step": 16066 + }, + { + "epoch": 1.2966669356791218, + "grad_norm": 0.6928985118865967, + "learning_rate": 1.8682776795958678e-05, + "loss": 2.4384, + "step": 16067 + }, + { + "epoch": 1.296747639415705, + "grad_norm": 0.6474096179008484, + "learning_rate": 1.8673589469708585e-05, + "loss": 2.3985, + "step": 16068 + }, + { + "epoch": 1.296828343152288, + "grad_norm": 0.6827313899993896, + "learning_rate": 1.866440417032521e-05, + "loss": 2.4607, + "step": 16069 + }, + { + "epoch": 1.2969090468888709, + "grad_norm": 0.7183445692062378, + "learning_rate": 1.8655220898037485e-05, + "loss": 2.4396, + "step": 16070 + }, + { + "epoch": 1.296989750625454, + "grad_norm": 0.6997376680374146, + "learning_rate": 1.8646039653074333e-05, + "loss": 2.4627, + "step": 16071 + }, + { + "epoch": 1.297070454362037, + "grad_norm": 0.7358444333076477, + "learning_rate": 1.8636860435664493e-05, + "loss": 2.4165, + "step": 16072 + }, + { + "epoch": 1.29715115809862, + "grad_norm": 0.8126270771026611, + "learning_rate": 1.8627683246036787e-05, + "loss": 2.4681, + "step": 16073 + }, + { + "epoch": 1.297231861835203, + "grad_norm": 0.7364177107810974, + "learning_rate": 1.8618508084419918e-05, + "loss": 2.44, + "step": 16074 + }, + { + "epoch": 1.297312565571786, + "grad_norm": 0.7480010390281677, + "learning_rate": 1.8609334951042567e-05, + "loss": 2.4759, + "step": 16075 + }, + { + "epoch": 1.297393269308369, + "grad_norm": 0.6563693284988403, + "learning_rate": 1.8600163846133335e-05, + "loss": 2.3865, + "step": 16076 + }, + { + "epoch": 1.297473973044952, + "grad_norm": 0.6961230039596558, + "learning_rate": 1.8590994769920832e-05, + "loss": 2.3851, + "step": 16077 + }, + { + "epoch": 1.297554676781535, + "grad_norm": 0.7137415409088135, + "learning_rate": 1.8581827722633527e-05, + "loss": 2.4115, + "step": 16078 + }, + 
{ + "epoch": 1.297635380518118, + "grad_norm": 0.6579335331916809, + "learning_rate": 1.85726627044999e-05, + "loss": 2.4464, + "step": 16079 + }, + { + "epoch": 1.2977160842547009, + "grad_norm": 0.7069905400276184, + "learning_rate": 1.8563499715748366e-05, + "loss": 2.4057, + "step": 16080 + }, + { + "epoch": 1.297796787991284, + "grad_norm": 0.771925687789917, + "learning_rate": 1.8554338756607325e-05, + "loss": 2.4696, + "step": 16081 + }, + { + "epoch": 1.297877491727867, + "grad_norm": 0.7268456816673279, + "learning_rate": 1.8545179827305048e-05, + "loss": 2.3949, + "step": 16082 + }, + { + "epoch": 1.29795819546445, + "grad_norm": 0.7049130797386169, + "learning_rate": 1.8536022928069796e-05, + "loss": 2.4448, + "step": 16083 + }, + { + "epoch": 1.298038899201033, + "grad_norm": 0.6716888546943665, + "learning_rate": 1.852686805912982e-05, + "loss": 2.3356, + "step": 16084 + }, + { + "epoch": 1.298119602937616, + "grad_norm": 0.666386604309082, + "learning_rate": 1.851771522071325e-05, + "loss": 2.4226, + "step": 16085 + }, + { + "epoch": 1.298200306674199, + "grad_norm": 0.7084901332855225, + "learning_rate": 1.8508564413048223e-05, + "loss": 2.4452, + "step": 16086 + }, + { + "epoch": 1.298281010410782, + "grad_norm": 0.6615412831306458, + "learning_rate": 1.8499415636362815e-05, + "loss": 2.4193, + "step": 16087 + }, + { + "epoch": 1.2983617141473651, + "grad_norm": 0.7143606543540955, + "learning_rate": 1.849026889088499e-05, + "loss": 2.4513, + "step": 16088 + }, + { + "epoch": 1.298442417883948, + "grad_norm": 0.7241482734680176, + "learning_rate": 1.8481124176842723e-05, + "loss": 2.458, + "step": 16089 + }, + { + "epoch": 1.298523121620531, + "grad_norm": 0.6762149930000305, + "learning_rate": 1.8471981494463963e-05, + "loss": 2.4386, + "step": 16090 + }, + { + "epoch": 1.2986038253571142, + "grad_norm": 0.6672768592834473, + "learning_rate": 1.8462840843976525e-05, + "loss": 2.375, + "step": 16091 + }, + { + "epoch": 1.298684529093697, + 
"grad_norm": 0.6871693134307861, + "learning_rate": 1.8453702225608226e-05, + "loss": 2.4342, + "step": 16092 + }, + { + "epoch": 1.2987652328302801, + "grad_norm": 0.6771275401115417, + "learning_rate": 1.8444565639586864e-05, + "loss": 2.402, + "step": 16093 + }, + { + "epoch": 1.298845936566863, + "grad_norm": 0.6627403497695923, + "learning_rate": 1.8435431086140077e-05, + "loss": 2.4667, + "step": 16094 + }, + { + "epoch": 1.298926640303446, + "grad_norm": 0.7001610398292542, + "learning_rate": 1.8426298565495538e-05, + "loss": 2.4396, + "step": 16095 + }, + { + "epoch": 1.299007344040029, + "grad_norm": 0.7574489712715149, + "learning_rate": 1.8417168077880908e-05, + "loss": 2.4601, + "step": 16096 + }, + { + "epoch": 1.299088047776612, + "grad_norm": 0.7771055698394775, + "learning_rate": 1.840803962352372e-05, + "loss": 2.4371, + "step": 16097 + }, + { + "epoch": 1.299168751513195, + "grad_norm": 0.6738649606704712, + "learning_rate": 1.8398913202651457e-05, + "loss": 2.3921, + "step": 16098 + }, + { + "epoch": 1.299249455249778, + "grad_norm": 0.7014862895011902, + "learning_rate": 1.8389788815491583e-05, + "loss": 2.451, + "step": 16099 + }, + { + "epoch": 1.299330158986361, + "grad_norm": 0.7026070952415466, + "learning_rate": 1.8380666462271523e-05, + "loss": 2.4583, + "step": 16100 + }, + { + "epoch": 1.2994108627229441, + "grad_norm": 0.6904535293579102, + "learning_rate": 1.8371546143218588e-05, + "loss": 2.4453, + "step": 16101 + }, + { + "epoch": 1.299491566459527, + "grad_norm": 0.6974804997444153, + "learning_rate": 1.8362427858560093e-05, + "loss": 2.4291, + "step": 16102 + }, + { + "epoch": 1.29957227019611, + "grad_norm": 0.6826989650726318, + "learning_rate": 1.8353311608523326e-05, + "loss": 2.4183, + "step": 16103 + }, + { + "epoch": 1.2996529739326932, + "grad_norm": 0.6804787516593933, + "learning_rate": 1.8344197393335448e-05, + "loss": 2.434, + "step": 16104 + }, + { + "epoch": 1.299733677669276, + "grad_norm": 0.7144587635993958, + 
"learning_rate": 1.8335085213223613e-05, + "loss": 2.4296, + "step": 16105 + }, + { + "epoch": 1.2998143814058591, + "grad_norm": 0.7228755354881287, + "learning_rate": 1.8325975068414924e-05, + "loss": 2.3987, + "step": 16106 + }, + { + "epoch": 1.2998950851424422, + "grad_norm": 0.7417716383934021, + "learning_rate": 1.8316866959136438e-05, + "loss": 2.4076, + "step": 16107 + }, + { + "epoch": 1.299975788879025, + "grad_norm": 0.6737387776374817, + "learning_rate": 1.8307760885615154e-05, + "loss": 2.4175, + "step": 16108 + }, + { + "epoch": 1.3000564926156082, + "grad_norm": 0.7294918298721313, + "learning_rate": 1.8298656848078035e-05, + "loss": 2.4022, + "step": 16109 + }, + { + "epoch": 1.300137196352191, + "grad_norm": 0.7200861573219299, + "learning_rate": 1.828955484675193e-05, + "loss": 2.4018, + "step": 16110 + }, + { + "epoch": 1.3002179000887741, + "grad_norm": 0.7704176306724548, + "learning_rate": 1.8280454881863718e-05, + "loss": 2.4539, + "step": 16111 + }, + { + "epoch": 1.300298603825357, + "grad_norm": 0.6790730953216553, + "learning_rate": 1.8271356953640184e-05, + "loss": 2.4196, + "step": 16112 + }, + { + "epoch": 1.30037930756194, + "grad_norm": 0.7165740132331848, + "learning_rate": 1.8262261062308096e-05, + "loss": 2.4234, + "step": 16113 + }, + { + "epoch": 1.3004600112985232, + "grad_norm": 0.7716830372810364, + "learning_rate": 1.82531672080941e-05, + "loss": 2.4255, + "step": 16114 + }, + { + "epoch": 1.300540715035106, + "grad_norm": 0.6525317430496216, + "learning_rate": 1.824407539122488e-05, + "loss": 2.4482, + "step": 16115 + }, + { + "epoch": 1.3006214187716891, + "grad_norm": 0.7397769093513489, + "learning_rate": 1.8234985611927003e-05, + "loss": 2.33, + "step": 16116 + }, + { + "epoch": 1.3007021225082722, + "grad_norm": 0.7106032967567444, + "learning_rate": 1.822589787042702e-05, + "loss": 2.485, + "step": 16117 + }, + { + "epoch": 1.300782826244855, + "grad_norm": 0.7030045390129089, + "learning_rate": 
1.8216812166951425e-05, + "loss": 2.454, + "step": 16118 + }, + { + "epoch": 1.3008635299814382, + "grad_norm": 0.7075662612915039, + "learning_rate": 1.8207728501726683e-05, + "loss": 2.4589, + "step": 16119 + }, + { + "epoch": 1.3009442337180213, + "grad_norm": 0.6700533032417297, + "learning_rate": 1.819864687497912e-05, + "loss": 2.4398, + "step": 16120 + }, + { + "epoch": 1.3010249374546041, + "grad_norm": 0.6951712369918823, + "learning_rate": 1.8189567286935117e-05, + "loss": 2.3998, + "step": 16121 + }, + { + "epoch": 1.3011056411911872, + "grad_norm": 0.708344578742981, + "learning_rate": 1.818048973782097e-05, + "loss": 2.4142, + "step": 16122 + }, + { + "epoch": 1.30118634492777, + "grad_norm": 0.7078592777252197, + "learning_rate": 1.817141422786287e-05, + "loss": 2.451, + "step": 16123 + }, + { + "epoch": 1.3012670486643532, + "grad_norm": 0.7111849784851074, + "learning_rate": 1.816234075728703e-05, + "loss": 2.4762, + "step": 16124 + }, + { + "epoch": 1.301347752400936, + "grad_norm": 0.6716348528862, + "learning_rate": 1.8153269326319588e-05, + "loss": 2.4373, + "step": 16125 + }, + { + "epoch": 1.3014284561375191, + "grad_norm": 0.6592512130737305, + "learning_rate": 1.8144199935186623e-05, + "loss": 2.412, + "step": 16126 + }, + { + "epoch": 1.3015091598741022, + "grad_norm": 0.6958334445953369, + "learning_rate": 1.8135132584114167e-05, + "loss": 2.4077, + "step": 16127 + }, + { + "epoch": 1.301589863610685, + "grad_norm": 0.6911341547966003, + "learning_rate": 1.8126067273328207e-05, + "loss": 2.409, + "step": 16128 + }, + { + "epoch": 1.3016705673472682, + "grad_norm": 0.676114022731781, + "learning_rate": 1.8117004003054693e-05, + "loss": 2.4463, + "step": 16129 + }, + { + "epoch": 1.3017512710838512, + "grad_norm": 0.6493322849273682, + "learning_rate": 1.810794277351947e-05, + "loss": 2.4377, + "step": 16130 + }, + { + "epoch": 1.3018319748204341, + "grad_norm": 0.6938454508781433, + "learning_rate": 1.8098883584948367e-05, + "loss": 2.4298, 
+ "step": 16131 + }, + { + "epoch": 1.3019126785570172, + "grad_norm": 0.69407719373703, + "learning_rate": 1.8089826437567214e-05, + "loss": 2.4107, + "step": 16132 + }, + { + "epoch": 1.3019933822936003, + "grad_norm": 0.6898862719535828, + "learning_rate": 1.8080771331601664e-05, + "loss": 2.4182, + "step": 16133 + }, + { + "epoch": 1.3020740860301832, + "grad_norm": 0.7377758026123047, + "learning_rate": 1.807171826727744e-05, + "loss": 2.4112, + "step": 16134 + }, + { + "epoch": 1.3021547897667662, + "grad_norm": 0.674057126045227, + "learning_rate": 1.8062667244820154e-05, + "loss": 2.4276, + "step": 16135 + }, + { + "epoch": 1.3022354935033493, + "grad_norm": 0.7087522745132446, + "learning_rate": 1.8053618264455384e-05, + "loss": 2.4338, + "step": 16136 + }, + { + "epoch": 1.3023161972399322, + "grad_norm": 0.70958411693573, + "learning_rate": 1.8044571326408667e-05, + "loss": 2.4369, + "step": 16137 + }, + { + "epoch": 1.3023969009765153, + "grad_norm": 0.7023837566375732, + "learning_rate": 1.803552643090548e-05, + "loss": 2.4185, + "step": 16138 + }, + { + "epoch": 1.3024776047130981, + "grad_norm": 0.708543598651886, + "learning_rate": 1.8026483578171216e-05, + "loss": 2.4053, + "step": 16139 + }, + { + "epoch": 1.3025583084496812, + "grad_norm": 0.748601496219635, + "learning_rate": 1.8017442768431257e-05, + "loss": 2.3948, + "step": 16140 + }, + { + "epoch": 1.302639012186264, + "grad_norm": 0.6626949310302734, + "learning_rate": 1.800840400191096e-05, + "loss": 2.4636, + "step": 16141 + }, + { + "epoch": 1.3027197159228472, + "grad_norm": 0.7079617977142334, + "learning_rate": 1.7999367278835534e-05, + "loss": 2.4091, + "step": 16142 + }, + { + "epoch": 1.3028004196594303, + "grad_norm": 0.7025624513626099, + "learning_rate": 1.7990332599430225e-05, + "loss": 2.3732, + "step": 16143 + }, + { + "epoch": 1.3028811233960131, + "grad_norm": 0.7365758419036865, + "learning_rate": 1.7981299963920205e-05, + "loss": 2.4725, + "step": 16144 + }, + { + 
"epoch": 1.3029618271325962, + "grad_norm": 0.7511963248252869, + "learning_rate": 1.7972269372530615e-05, + "loss": 2.4304, + "step": 16145 + }, + { + "epoch": 1.3030425308691793, + "grad_norm": 0.7055985331535339, + "learning_rate": 1.796324082548644e-05, + "loss": 2.4259, + "step": 16146 + }, + { + "epoch": 1.3031232346057622, + "grad_norm": 0.691162645816803, + "learning_rate": 1.7954214323012775e-05, + "loss": 2.4262, + "step": 16147 + }, + { + "epoch": 1.3032039383423453, + "grad_norm": 0.7179710268974304, + "learning_rate": 1.7945189865334587e-05, + "loss": 2.4301, + "step": 16148 + }, + { + "epoch": 1.3032846420789284, + "grad_norm": 0.7391623258590698, + "learning_rate": 1.7936167452676744e-05, + "loss": 2.4302, + "step": 16149 + }, + { + "epoch": 1.3033653458155112, + "grad_norm": 0.7297981381416321, + "learning_rate": 1.7927147085264117e-05, + "loss": 2.3911, + "step": 16150 + }, + { + "epoch": 1.3034460495520943, + "grad_norm": 0.7571932673454285, + "learning_rate": 1.7918128763321552e-05, + "loss": 2.4348, + "step": 16151 + }, + { + "epoch": 1.3035267532886774, + "grad_norm": 0.7074765563011169, + "learning_rate": 1.7909112487073754e-05, + "loss": 2.4164, + "step": 16152 + }, + { + "epoch": 1.3036074570252603, + "grad_norm": 0.7534131407737732, + "learning_rate": 1.7900098256745467e-05, + "loss": 2.3784, + "step": 16153 + }, + { + "epoch": 1.3036881607618434, + "grad_norm": 0.675398588180542, + "learning_rate": 1.789108607256136e-05, + "loss": 2.4305, + "step": 16154 + }, + { + "epoch": 1.3037688644984262, + "grad_norm": 0.7099249362945557, + "learning_rate": 1.7882075934746002e-05, + "loss": 2.4053, + "step": 16155 + }, + { + "epoch": 1.3038495682350093, + "grad_norm": 0.6914681196212769, + "learning_rate": 1.787306784352397e-05, + "loss": 2.3902, + "step": 16156 + }, + { + "epoch": 1.3039302719715922, + "grad_norm": 0.6956958770751953, + "learning_rate": 1.786406179911977e-05, + "loss": 2.4026, + "step": 16157 + }, + { + "epoch": 1.3040109757081753, 
+ "grad_norm": 0.6873000860214233, + "learning_rate": 1.7855057801757857e-05, + "loss": 2.4082, + "step": 16158 + }, + { + "epoch": 1.3040916794447583, + "grad_norm": 0.7340587377548218, + "learning_rate": 1.7846055851662625e-05, + "loss": 2.4894, + "step": 16159 + }, + { + "epoch": 1.3041723831813412, + "grad_norm": 0.6956963539123535, + "learning_rate": 1.7837055949058444e-05, + "loss": 2.3976, + "step": 16160 + }, + { + "epoch": 1.3042530869179243, + "grad_norm": 0.7654300332069397, + "learning_rate": 1.782805809416962e-05, + "loss": 2.4272, + "step": 16161 + }, + { + "epoch": 1.3043337906545074, + "grad_norm": 0.7735971212387085, + "learning_rate": 1.7819062287220368e-05, + "loss": 2.4513, + "step": 16162 + }, + { + "epoch": 1.3044144943910903, + "grad_norm": 0.6897203326225281, + "learning_rate": 1.7810068528434908e-05, + "loss": 2.3974, + "step": 16163 + }, + { + "epoch": 1.3044951981276733, + "grad_norm": 0.7328432202339172, + "learning_rate": 1.780107681803741e-05, + "loss": 2.4455, + "step": 16164 + }, + { + "epoch": 1.3045759018642564, + "grad_norm": 0.7098489999771118, + "learning_rate": 1.7792087156251924e-05, + "loss": 2.4173, + "step": 16165 + }, + { + "epoch": 1.3046566056008393, + "grad_norm": 0.6593194007873535, + "learning_rate": 1.7783099543302518e-05, + "loss": 2.4102, + "step": 16166 + }, + { + "epoch": 1.3047373093374224, + "grad_norm": 0.7329291105270386, + "learning_rate": 1.7774113979413188e-05, + "loss": 2.4856, + "step": 16167 + }, + { + "epoch": 1.3048180130740052, + "grad_norm": 0.7033355236053467, + "learning_rate": 1.776513046480788e-05, + "loss": 2.4503, + "step": 16168 + }, + { + "epoch": 1.3048987168105883, + "grad_norm": 0.7063608765602112, + "learning_rate": 1.7756148999710486e-05, + "loss": 2.4523, + "step": 16169 + }, + { + "epoch": 1.3049794205471712, + "grad_norm": 0.6905883550643921, + "learning_rate": 1.774716958434487e-05, + "loss": 2.4149, + "step": 16170 + }, + { + "epoch": 1.3050601242837543, + "grad_norm": 
0.694551408290863, + "learning_rate": 1.7738192218934778e-05, + "loss": 2.437, + "step": 16171 + }, + { + "epoch": 1.3051408280203374, + "grad_norm": 0.7173176407814026, + "learning_rate": 1.772921690370396e-05, + "loss": 2.4817, + "step": 16172 + }, + { + "epoch": 1.3052215317569202, + "grad_norm": 0.7197130918502808, + "learning_rate": 1.7720243638876153e-05, + "loss": 2.4481, + "step": 16173 + }, + { + "epoch": 1.3053022354935033, + "grad_norm": 0.710811197757721, + "learning_rate": 1.771127242467493e-05, + "loss": 2.397, + "step": 16174 + }, + { + "epoch": 1.3053829392300864, + "grad_norm": 0.9194550514221191, + "learning_rate": 1.7702303261323894e-05, + "loss": 2.5206, + "step": 16175 + }, + { + "epoch": 1.3054636429666693, + "grad_norm": 0.7003832459449768, + "learning_rate": 1.769333614904659e-05, + "loss": 2.4175, + "step": 16176 + }, + { + "epoch": 1.3055443467032524, + "grad_norm": 0.7161554098129272, + "learning_rate": 1.768437108806651e-05, + "loss": 2.3892, + "step": 16177 + }, + { + "epoch": 1.3056250504398355, + "grad_norm": 0.6516181826591492, + "learning_rate": 1.767540807860707e-05, + "loss": 2.4361, + "step": 16178 + }, + { + "epoch": 1.3057057541764183, + "grad_norm": 0.7518061399459839, + "learning_rate": 1.7666447120891662e-05, + "loss": 2.4572, + "step": 16179 + }, + { + "epoch": 1.3057864579130014, + "grad_norm": 0.735388994216919, + "learning_rate": 1.7657488215143637e-05, + "loss": 2.3965, + "step": 16180 + }, + { + "epoch": 1.3058671616495845, + "grad_norm": 0.6994282007217407, + "learning_rate": 1.764853136158622e-05, + "loss": 2.4052, + "step": 16181 + }, + { + "epoch": 1.3059478653861674, + "grad_norm": 0.7095311880111694, + "learning_rate": 1.7639576560442684e-05, + "loss": 2.4818, + "step": 16182 + }, + { + "epoch": 1.3060285691227504, + "grad_norm": 0.6527207493782043, + "learning_rate": 1.7630623811936208e-05, + "loss": 2.3962, + "step": 16183 + }, + { + "epoch": 1.3061092728593333, + "grad_norm": 0.6668451428413391, + 
"learning_rate": 1.7621673116289882e-05, + "loss": 2.4514, + "step": 16184 + }, + { + "epoch": 1.3061899765959164, + "grad_norm": 0.7119911909103394, + "learning_rate": 1.7612724473726795e-05, + "loss": 2.4313, + "step": 16185 + }, + { + "epoch": 1.3062706803324993, + "grad_norm": 0.706249475479126, + "learning_rate": 1.7603777884469984e-05, + "loss": 2.4131, + "step": 16186 + }, + { + "epoch": 1.3063513840690824, + "grad_norm": 0.6634086966514587, + "learning_rate": 1.759483334874241e-05, + "loss": 2.3532, + "step": 16187 + }, + { + "epoch": 1.3064320878056654, + "grad_norm": 0.8096393942832947, + "learning_rate": 1.7585890866766995e-05, + "loss": 2.4485, + "step": 16188 + }, + { + "epoch": 1.3065127915422483, + "grad_norm": 0.675308883190155, + "learning_rate": 1.7576950438766615e-05, + "loss": 2.388, + "step": 16189 + }, + { + "epoch": 1.3065934952788314, + "grad_norm": 0.738275408744812, + "learning_rate": 1.756801206496411e-05, + "loss": 2.4485, + "step": 16190 + }, + { + "epoch": 1.3066741990154145, + "grad_norm": 0.7045620083808899, + "learning_rate": 1.755907574558221e-05, + "loss": 2.3985, + "step": 16191 + }, + { + "epoch": 1.3067549027519973, + "grad_norm": 0.6499879360198975, + "learning_rate": 1.755014148084363e-05, + "loss": 2.3992, + "step": 16192 + }, + { + "epoch": 1.3068356064885804, + "grad_norm": 0.7101179361343384, + "learning_rate": 1.7541209270971083e-05, + "loss": 2.4217, + "step": 16193 + }, + { + "epoch": 1.3069163102251635, + "grad_norm": 0.6865181922912598, + "learning_rate": 1.7532279116187124e-05, + "loss": 2.4805, + "step": 16194 + }, + { + "epoch": 1.3069970139617464, + "grad_norm": 0.7710141539573669, + "learning_rate": 1.752335101671434e-05, + "loss": 2.3654, + "step": 16195 + }, + { + "epoch": 1.3070777176983295, + "grad_norm": 0.695936381816864, + "learning_rate": 1.7514424972775244e-05, + "loss": 2.4315, + "step": 16196 + }, + { + "epoch": 1.3071584214349126, + "grad_norm": 0.6781535148620605, + "learning_rate": 
1.7505500984592304e-05, + "loss": 2.4238, + "step": 16197 + }, + { + "epoch": 1.3072391251714954, + "grad_norm": 0.6549252271652222, + "learning_rate": 1.7496579052387918e-05, + "loss": 2.3766, + "step": 16198 + }, + { + "epoch": 1.3073198289080785, + "grad_norm": 0.6599059700965881, + "learning_rate": 1.7487659176384474e-05, + "loss": 2.4613, + "step": 16199 + }, + { + "epoch": 1.3074005326446614, + "grad_norm": 0.6742514967918396, + "learning_rate": 1.7478741356804228e-05, + "loss": 2.3917, + "step": 16200 + }, + { + "epoch": 1.3074812363812445, + "grad_norm": 0.6542397141456604, + "learning_rate": 1.746982559386946e-05, + "loss": 2.44, + "step": 16201 + }, + { + "epoch": 1.3075619401178273, + "grad_norm": 0.7200478315353394, + "learning_rate": 1.74609118878024e-05, + "loss": 2.4324, + "step": 16202 + }, + { + "epoch": 1.3076426438544104, + "grad_norm": 0.717628002166748, + "learning_rate": 1.745200023882515e-05, + "loss": 2.3996, + "step": 16203 + }, + { + "epoch": 1.3077233475909935, + "grad_norm": 0.7350025177001953, + "learning_rate": 1.744309064715983e-05, + "loss": 2.4812, + "step": 16204 + }, + { + "epoch": 1.3078040513275764, + "grad_norm": 0.7253599762916565, + "learning_rate": 1.74341831130285e-05, + "loss": 2.4454, + "step": 16205 + }, + { + "epoch": 1.3078847550641595, + "grad_norm": 0.7537909746170044, + "learning_rate": 1.7425277636653193e-05, + "loss": 2.4247, + "step": 16206 + }, + { + "epoch": 1.3079654588007426, + "grad_norm": 0.7563284039497375, + "learning_rate": 1.7416374218255783e-05, + "loss": 2.3893, + "step": 16207 + }, + { + "epoch": 1.3080461625373254, + "grad_norm": 0.7118926048278809, + "learning_rate": 1.740747285805818e-05, + "loss": 2.4146, + "step": 16208 + }, + { + "epoch": 1.3081268662739085, + "grad_norm": 0.7805569171905518, + "learning_rate": 1.7398573556282304e-05, + "loss": 2.396, + "step": 16209 + }, + { + "epoch": 1.3082075700104916, + "grad_norm": 0.7357630133628845, + "learning_rate": 1.738967631314987e-05, + "loss": 
2.5405, + "step": 16210 + }, + { + "epoch": 1.3082882737470745, + "grad_norm": 0.6670438647270203, + "learning_rate": 1.7380781128882652e-05, + "loss": 2.4452, + "step": 16211 + }, + { + "epoch": 1.3083689774836575, + "grad_norm": 0.7374427318572998, + "learning_rate": 1.7371888003702353e-05, + "loss": 2.5143, + "step": 16212 + }, + { + "epoch": 1.3084496812202406, + "grad_norm": 0.672207236289978, + "learning_rate": 1.736299693783058e-05, + "loss": 2.4178, + "step": 16213 + }, + { + "epoch": 1.3085303849568235, + "grad_norm": 0.6926576495170593, + "learning_rate": 1.735410793148894e-05, + "loss": 2.3466, + "step": 16214 + }, + { + "epoch": 1.3086110886934066, + "grad_norm": 0.6928917169570923, + "learning_rate": 1.734522098489899e-05, + "loss": 2.4654, + "step": 16215 + }, + { + "epoch": 1.3086917924299895, + "grad_norm": 0.6536242961883545, + "learning_rate": 1.733633609828217e-05, + "loss": 2.3761, + "step": 16216 + }, + { + "epoch": 1.3087724961665725, + "grad_norm": 0.6993953585624695, + "learning_rate": 1.732745327185994e-05, + "loss": 2.3963, + "step": 16217 + }, + { + "epoch": 1.3088531999031554, + "grad_norm": 0.6851957440376282, + "learning_rate": 1.731857250585368e-05, + "loss": 2.4253, + "step": 16218 + }, + { + "epoch": 1.3089339036397385, + "grad_norm": 0.6620005965232849, + "learning_rate": 1.7309693800484728e-05, + "loss": 2.4302, + "step": 16219 + }, + { + "epoch": 1.3090146073763216, + "grad_norm": 0.6704410314559937, + "learning_rate": 1.7300817155974356e-05, + "loss": 2.4065, + "step": 16220 + }, + { + "epoch": 1.3090953111129044, + "grad_norm": 0.6882327198982239, + "learning_rate": 1.7291942572543807e-05, + "loss": 2.4526, + "step": 16221 + }, + { + "epoch": 1.3091760148494875, + "grad_norm": 0.6971533298492432, + "learning_rate": 1.7283070050414275e-05, + "loss": 2.4076, + "step": 16222 + }, + { + "epoch": 1.3092567185860706, + "grad_norm": 0.6662544012069702, + "learning_rate": 1.7274199589806827e-05, + "loss": 2.3678, + "step": 16223 + }, + 
{ + "epoch": 1.3093374223226535, + "grad_norm": 0.6342894434928894, + "learning_rate": 1.726533119094258e-05, + "loss": 2.3424, + "step": 16224 + }, + { + "epoch": 1.3094181260592366, + "grad_norm": 0.6808488965034485, + "learning_rate": 1.7256464854042577e-05, + "loss": 2.4286, + "step": 16225 + }, + { + "epoch": 1.3094988297958197, + "grad_norm": 0.6417922973632812, + "learning_rate": 1.7247600579327738e-05, + "loss": 2.3677, + "step": 16226 + }, + { + "epoch": 1.3095795335324025, + "grad_norm": 0.7267102599143982, + "learning_rate": 1.7238738367019002e-05, + "loss": 2.3974, + "step": 16227 + }, + { + "epoch": 1.3096602372689856, + "grad_norm": 0.6915002465248108, + "learning_rate": 1.722987821733725e-05, + "loss": 2.4429, + "step": 16228 + }, + { + "epoch": 1.3097409410055685, + "grad_norm": 0.6930112242698669, + "learning_rate": 1.7221020130503296e-05, + "loss": 2.4272, + "step": 16229 + }, + { + "epoch": 1.3098216447421516, + "grad_norm": 0.7049465179443359, + "learning_rate": 1.7212164106737904e-05, + "loss": 2.4089, + "step": 16230 + }, + { + "epoch": 1.3099023484787344, + "grad_norm": 0.7230044603347778, + "learning_rate": 1.720331014626182e-05, + "loss": 2.4313, + "step": 16231 + }, + { + "epoch": 1.3099830522153175, + "grad_norm": 0.6513530015945435, + "learning_rate": 1.7194458249295665e-05, + "loss": 2.3293, + "step": 16232 + }, + { + "epoch": 1.3100637559519006, + "grad_norm": 0.6880534291267395, + "learning_rate": 1.718560841606005e-05, + "loss": 2.4556, + "step": 16233 + }, + { + "epoch": 1.3101444596884835, + "grad_norm": 0.7075292468070984, + "learning_rate": 1.717676064677559e-05, + "loss": 2.4747, + "step": 16234 + }, + { + "epoch": 1.3102251634250666, + "grad_norm": 0.7713594436645508, + "learning_rate": 1.7167914941662723e-05, + "loss": 2.4135, + "step": 16235 + }, + { + "epoch": 1.3103058671616497, + "grad_norm": 0.7883979082107544, + "learning_rate": 1.7159071300941943e-05, + "loss": 2.418, + "step": 16236 + }, + { + "epoch": 
1.3103865708982325, + "grad_norm": 0.6588975787162781, + "learning_rate": 1.7150229724833655e-05, + "loss": 2.3295, + "step": 16237 + }, + { + "epoch": 1.3104672746348156, + "grad_norm": 0.679086446762085, + "learning_rate": 1.7141390213558217e-05, + "loss": 2.413, + "step": 16238 + }, + { + "epoch": 1.3105479783713987, + "grad_norm": 0.6803067326545715, + "learning_rate": 1.713255276733592e-05, + "loss": 2.4338, + "step": 16239 + }, + { + "epoch": 1.3106286821079816, + "grad_norm": 0.7041650414466858, + "learning_rate": 1.712371738638704e-05, + "loss": 2.469, + "step": 16240 + }, + { + "epoch": 1.3107093858445646, + "grad_norm": 0.6560962796211243, + "learning_rate": 1.711488407093178e-05, + "loss": 2.4353, + "step": 16241 + }, + { + "epoch": 1.3107900895811477, + "grad_norm": 0.6637921333312988, + "learning_rate": 1.7106052821190244e-05, + "loss": 2.3996, + "step": 16242 + }, + { + "epoch": 1.3108707933177306, + "grad_norm": 0.8131709098815918, + "learning_rate": 1.7097223637382565e-05, + "loss": 2.466, + "step": 16243 + }, + { + "epoch": 1.3109514970543137, + "grad_norm": 0.6637253165245056, + "learning_rate": 1.708839651972881e-05, + "loss": 2.3811, + "step": 16244 + }, + { + "epoch": 1.3110322007908966, + "grad_norm": 0.71912682056427, + "learning_rate": 1.7079571468448917e-05, + "loss": 2.4175, + "step": 16245 + }, + { + "epoch": 1.3111129045274796, + "grad_norm": 0.7028010487556458, + "learning_rate": 1.7070748483762854e-05, + "loss": 2.41, + "step": 16246 + }, + { + "epoch": 1.3111936082640625, + "grad_norm": 0.7241945862770081, + "learning_rate": 1.7061927565890522e-05, + "loss": 2.4171, + "step": 16247 + }, + { + "epoch": 1.3112743120006456, + "grad_norm": 0.7039221525192261, + "learning_rate": 1.705310871505177e-05, + "loss": 2.4154, + "step": 16248 + }, + { + "epoch": 1.3113550157372287, + "grad_norm": 0.672444760799408, + "learning_rate": 1.704429193146636e-05, + "loss": 2.4025, + "step": 16249 + }, + { + "epoch": 1.3114357194738115, + "grad_norm": 
0.7240859866142273, + "learning_rate": 1.7035477215354068e-05, + "loss": 2.3864, + "step": 16250 + }, + { + "epoch": 1.3115164232103946, + "grad_norm": 0.7379294633865356, + "learning_rate": 1.7026664566934536e-05, + "loss": 2.4663, + "step": 16251 + }, + { + "epoch": 1.3115971269469777, + "grad_norm": 0.6928708553314209, + "learning_rate": 1.7017853986427425e-05, + "loss": 2.4407, + "step": 16252 + }, + { + "epoch": 1.3116778306835606, + "grad_norm": 0.6304093599319458, + "learning_rate": 1.7009045474052298e-05, + "loss": 2.4755, + "step": 16253 + }, + { + "epoch": 1.3117585344201437, + "grad_norm": 0.6945829391479492, + "learning_rate": 1.700023903002872e-05, + "loss": 2.3817, + "step": 16254 + }, + { + "epoch": 1.3118392381567268, + "grad_norm": 0.6899009346961975, + "learning_rate": 1.6991434654576133e-05, + "loss": 2.3989, + "step": 16255 + }, + { + "epoch": 1.3119199418933096, + "grad_norm": 0.7359157204627991, + "learning_rate": 1.6982632347913985e-05, + "loss": 2.3788, + "step": 16256 + }, + { + "epoch": 1.3120006456298927, + "grad_norm": 0.6562486886978149, + "learning_rate": 1.6973832110261658e-05, + "loss": 2.3955, + "step": 16257 + }, + { + "epoch": 1.3120813493664758, + "grad_norm": 0.6772989630699158, + "learning_rate": 1.696503394183846e-05, + "loss": 2.4788, + "step": 16258 + }, + { + "epoch": 1.3121620531030587, + "grad_norm": 0.7214391231536865, + "learning_rate": 1.695623784286363e-05, + "loss": 2.3836, + "step": 16259 + }, + { + "epoch": 1.3122427568396418, + "grad_norm": 0.7041679620742798, + "learning_rate": 1.6947443813556495e-05, + "loss": 2.4547, + "step": 16260 + }, + { + "epoch": 1.3123234605762246, + "grad_norm": 0.6819555163383484, + "learning_rate": 1.6938651854136135e-05, + "loss": 2.468, + "step": 16261 + }, + { + "epoch": 1.3124041643128077, + "grad_norm": 0.6466858983039856, + "learning_rate": 1.6929861964821693e-05, + "loss": 2.4572, + "step": 16262 + }, + { + "epoch": 1.3124848680493906, + "grad_norm": 0.688709557056427, + 
"learning_rate": 1.6921074145832248e-05, + "loss": 2.3891, + "step": 16263 + }, + { + "epoch": 1.3125655717859737, + "grad_norm": 0.6896470785140991, + "learning_rate": 1.69122883973868e-05, + "loss": 2.3825, + "step": 16264 + }, + { + "epoch": 1.3126462755225567, + "grad_norm": 0.8242524266242981, + "learning_rate": 1.690350471970431e-05, + "loss": 2.4804, + "step": 16265 + }, + { + "epoch": 1.3127269792591396, + "grad_norm": 0.7506044507026672, + "learning_rate": 1.689472311300373e-05, + "loss": 2.4671, + "step": 16266 + }, + { + "epoch": 1.3128076829957227, + "grad_norm": 0.6776263117790222, + "learning_rate": 1.688594357750386e-05, + "loss": 2.4646, + "step": 16267 + }, + { + "epoch": 1.3128883867323058, + "grad_norm": 0.6843759417533875, + "learning_rate": 1.6877166113423548e-05, + "loss": 2.4147, + "step": 16268 + }, + { + "epoch": 1.3129690904688887, + "grad_norm": 0.6650474667549133, + "learning_rate": 1.686839072098153e-05, + "loss": 2.4379, + "step": 16269 + }, + { + "epoch": 1.3130497942054717, + "grad_norm": 0.6636466383934021, + "learning_rate": 1.6859617400396533e-05, + "loss": 2.4334, + "step": 16270 + }, + { + "epoch": 1.3131304979420548, + "grad_norm": 0.649217963218689, + "learning_rate": 1.685084615188719e-05, + "loss": 2.319, + "step": 16271 + }, + { + "epoch": 1.3132112016786377, + "grad_norm": 0.7343039512634277, + "learning_rate": 1.6842076975672126e-05, + "loss": 2.3844, + "step": 16272 + }, + { + "epoch": 1.3132919054152208, + "grad_norm": 0.6916847825050354, + "learning_rate": 1.6833309871969894e-05, + "loss": 2.4544, + "step": 16273 + }, + { + "epoch": 1.3133726091518036, + "grad_norm": 0.6762102842330933, + "learning_rate": 1.6824544840998967e-05, + "loss": 2.3912, + "step": 16274 + }, + { + "epoch": 1.3134533128883867, + "grad_norm": 0.7327221035957336, + "learning_rate": 1.68157818829778e-05, + "loss": 2.4403, + "step": 16275 + }, + { + "epoch": 1.3135340166249696, + "grad_norm": 0.7362363338470459, + "learning_rate": 
1.6807020998124812e-05, + "loss": 2.5169, + "step": 16276 + }, + { + "epoch": 1.3136147203615527, + "grad_norm": 0.6882300972938538, + "learning_rate": 1.679826218665832e-05, + "loss": 2.4139, + "step": 16277 + }, + { + "epoch": 1.3136954240981358, + "grad_norm": 0.7146984934806824, + "learning_rate": 1.6789505448796615e-05, + "loss": 2.4738, + "step": 16278 + }, + { + "epoch": 1.3137761278347186, + "grad_norm": 0.6581223607063293, + "learning_rate": 1.6780750784757947e-05, + "loss": 2.4617, + "step": 16279 + }, + { + "epoch": 1.3138568315713017, + "grad_norm": 0.7729318141937256, + "learning_rate": 1.6771998194760518e-05, + "loss": 2.4541, + "step": 16280 + }, + { + "epoch": 1.3139375353078848, + "grad_norm": 0.7617159485816956, + "learning_rate": 1.6763247679022442e-05, + "loss": 2.4727, + "step": 16281 + }, + { + "epoch": 1.3140182390444677, + "grad_norm": 0.6640555262565613, + "learning_rate": 1.6754499237761844e-05, + "loss": 2.4717, + "step": 16282 + }, + { + "epoch": 1.3140989427810508, + "grad_norm": 0.7289882898330688, + "learning_rate": 1.6745752871196707e-05, + "loss": 2.4515, + "step": 16283 + }, + { + "epoch": 1.3141796465176339, + "grad_norm": 0.7075887322425842, + "learning_rate": 1.6737008579545043e-05, + "loss": 2.4586, + "step": 16284 + }, + { + "epoch": 1.3142603502542167, + "grad_norm": 0.7152252197265625, + "learning_rate": 1.672826636302477e-05, + "loss": 2.512, + "step": 16285 + }, + { + "epoch": 1.3143410539907998, + "grad_norm": 0.6875295639038086, + "learning_rate": 1.6719526221853808e-05, + "loss": 2.4049, + "step": 16286 + }, + { + "epoch": 1.314421757727383, + "grad_norm": 0.6812484860420227, + "learning_rate": 1.671078815624991e-05, + "loss": 2.3705, + "step": 16287 + }, + { + "epoch": 1.3145024614639658, + "grad_norm": 0.664282500743866, + "learning_rate": 1.6702052166430904e-05, + "loss": 2.3776, + "step": 16288 + }, + { + "epoch": 1.3145831652005489, + "grad_norm": 0.7460842728614807, + "learning_rate": 1.66933182526145e-05, + 
"loss": 2.4525, + "step": 16289 + }, + { + "epoch": 1.3146638689371317, + "grad_norm": 0.6555477380752563, + "learning_rate": 1.6684586415018366e-05, + "loss": 2.3902, + "step": 16290 + }, + { + "epoch": 1.3147445726737148, + "grad_norm": 0.7191921472549438, + "learning_rate": 1.6675856653860135e-05, + "loss": 2.4957, + "step": 16291 + }, + { + "epoch": 1.3148252764102977, + "grad_norm": 0.738667368888855, + "learning_rate": 1.666712896935738e-05, + "loss": 2.4182, + "step": 16292 + }, + { + "epoch": 1.3149059801468808, + "grad_norm": 0.6764421463012695, + "learning_rate": 1.6658403361727593e-05, + "loss": 2.4179, + "step": 16293 + }, + { + "epoch": 1.3149866838834638, + "grad_norm": 0.6981594562530518, + "learning_rate": 1.6649679831188247e-05, + "loss": 2.4288, + "step": 16294 + }, + { + "epoch": 1.3150673876200467, + "grad_norm": 0.6657801866531372, + "learning_rate": 1.6640958377956784e-05, + "loss": 2.3716, + "step": 16295 + }, + { + "epoch": 1.3151480913566298, + "grad_norm": 0.7238973379135132, + "learning_rate": 1.6632239002250505e-05, + "loss": 2.438, + "step": 16296 + }, + { + "epoch": 1.3152287950932129, + "grad_norm": 0.6727766990661621, + "learning_rate": 1.6623521704286772e-05, + "loss": 2.4406, + "step": 16297 + }, + { + "epoch": 1.3153094988297958, + "grad_norm": 0.6741603016853333, + "learning_rate": 1.661480648428282e-05, + "loss": 2.4379, + "step": 16298 + }, + { + "epoch": 1.3153902025663788, + "grad_norm": 0.7174610495567322, + "learning_rate": 1.6606093342455865e-05, + "loss": 2.4368, + "step": 16299 + }, + { + "epoch": 1.315470906302962, + "grad_norm": 0.6604920029640198, + "learning_rate": 1.6597382279023057e-05, + "loss": 2.4431, + "step": 16300 + }, + { + "epoch": 1.3155516100395448, + "grad_norm": 0.6930821537971497, + "learning_rate": 1.6588673294201494e-05, + "loss": 2.4064, + "step": 16301 + }, + { + "epoch": 1.3156323137761279, + "grad_norm": 0.6489799618721008, + "learning_rate": 1.657996638820826e-05, + "loss": 2.4256, + "step": 
16302 + }, + { + "epoch": 1.315713017512711, + "grad_norm": 0.6781083345413208, + "learning_rate": 1.65712615612603e-05, + "loss": 2.4731, + "step": 16303 + }, + { + "epoch": 1.3157937212492938, + "grad_norm": 0.6710748076438904, + "learning_rate": 1.656255881357458e-05, + "loss": 2.4065, + "step": 16304 + }, + { + "epoch": 1.315874424985877, + "grad_norm": 0.7099822163581848, + "learning_rate": 1.655385814536804e-05, + "loss": 2.3978, + "step": 16305 + }, + { + "epoch": 1.3159551287224598, + "grad_norm": 0.7215133905410767, + "learning_rate": 1.6545159556857447e-05, + "loss": 2.4655, + "step": 16306 + }, + { + "epoch": 1.3160358324590429, + "grad_norm": 0.7705253958702087, + "learning_rate": 1.6536463048259643e-05, + "loss": 2.4576, + "step": 16307 + }, + { + "epoch": 1.3161165361956257, + "grad_norm": 0.6232311725616455, + "learning_rate": 1.6527768619791372e-05, + "loss": 2.3923, + "step": 16308 + }, + { + "epoch": 1.3161972399322088, + "grad_norm": 0.6599528789520264, + "learning_rate": 1.6519076271669264e-05, + "loss": 2.4236, + "step": 16309 + }, + { + "epoch": 1.316277943668792, + "grad_norm": 0.6598034501075745, + "learning_rate": 1.6510386004110023e-05, + "loss": 2.368, + "step": 16310 + }, + { + "epoch": 1.3163586474053748, + "grad_norm": 0.6949655413627625, + "learning_rate": 1.650169781733022e-05, + "loss": 2.4277, + "step": 16311 + }, + { + "epoch": 1.3164393511419579, + "grad_norm": 0.6838186383247375, + "learning_rate": 1.6493011711546358e-05, + "loss": 2.4413, + "step": 16312 + }, + { + "epoch": 1.316520054878541, + "grad_norm": 0.7026765942573547, + "learning_rate": 1.6484327686974933e-05, + "loss": 2.4628, + "step": 16313 + }, + { + "epoch": 1.3166007586151238, + "grad_norm": 0.745360791683197, + "learning_rate": 1.647564574383237e-05, + "loss": 2.4358, + "step": 16314 + }, + { + "epoch": 1.316681462351707, + "grad_norm": 0.676225483417511, + "learning_rate": 1.6466965882335083e-05, + "loss": 2.4119, + "step": 16315 + }, + { + "epoch": 
1.31676216608829, + "grad_norm": 0.6767755150794983, + "learning_rate": 1.6458288102699325e-05, + "loss": 2.4322, + "step": 16316 + }, + { + "epoch": 1.3168428698248729, + "grad_norm": 0.6957309246063232, + "learning_rate": 1.6449612405141424e-05, + "loss": 2.4327, + "step": 16317 + }, + { + "epoch": 1.316923573561456, + "grad_norm": 0.6773050427436829, + "learning_rate": 1.64409387898776e-05, + "loss": 2.4207, + "step": 16318 + }, + { + "epoch": 1.3170042772980388, + "grad_norm": 0.7319278717041016, + "learning_rate": 1.6432267257123978e-05, + "loss": 2.445, + "step": 16319 + }, + { + "epoch": 1.317084981034622, + "grad_norm": 0.7531326413154602, + "learning_rate": 1.6423597807096714e-05, + "loss": 2.3948, + "step": 16320 + }, + { + "epoch": 1.3171656847712048, + "grad_norm": 0.6741669178009033, + "learning_rate": 1.6414930440011854e-05, + "loss": 2.4177, + "step": 16321 + }, + { + "epoch": 1.3172463885077879, + "grad_norm": 0.6814963221549988, + "learning_rate": 1.640626515608543e-05, + "loss": 2.4419, + "step": 16322 + }, + { + "epoch": 1.317327092244371, + "grad_norm": 0.6740893721580505, + "learning_rate": 1.6397601955533392e-05, + "loss": 2.3516, + "step": 16323 + }, + { + "epoch": 1.3174077959809538, + "grad_norm": 0.7172163724899292, + "learning_rate": 1.6388940838571675e-05, + "loss": 2.4665, + "step": 16324 + }, + { + "epoch": 1.317488499717537, + "grad_norm": 0.6690489053726196, + "learning_rate": 1.6380281805416085e-05, + "loss": 2.3957, + "step": 16325 + }, + { + "epoch": 1.31756920345412, + "grad_norm": 0.7182994484901428, + "learning_rate": 1.6371624856282462e-05, + "loss": 2.4456, + "step": 16326 + }, + { + "epoch": 1.3176499071907029, + "grad_norm": 0.6324366927146912, + "learning_rate": 1.636296999138659e-05, + "loss": 2.4111, + "step": 16327 + }, + { + "epoch": 1.317730610927286, + "grad_norm": 0.6740162372589111, + "learning_rate": 1.6354317210944093e-05, + "loss": 2.451, + "step": 16328 + }, + { + "epoch": 1.317811314663869, + "grad_norm": 
0.6964122653007507, + "learning_rate": 1.6345666515170665e-05, + "loss": 2.4269, + "step": 16329 + }, + { + "epoch": 1.317892018400452, + "grad_norm": 0.7093058824539185, + "learning_rate": 1.6337017904281915e-05, + "loss": 2.4686, + "step": 16330 + }, + { + "epoch": 1.317972722137035, + "grad_norm": 0.693233072757721, + "learning_rate": 1.6328371378493367e-05, + "loss": 2.4149, + "step": 16331 + }, + { + "epoch": 1.318053425873618, + "grad_norm": 0.6418019533157349, + "learning_rate": 1.631972693802052e-05, + "loss": 2.4268, + "step": 16332 + }, + { + "epoch": 1.318134129610201, + "grad_norm": 0.6815310120582581, + "learning_rate": 1.631108458307883e-05, + "loss": 2.4274, + "step": 16333 + }, + { + "epoch": 1.318214833346784, + "grad_norm": 0.6774280071258545, + "learning_rate": 1.630244431388369e-05, + "loss": 2.3927, + "step": 16334 + }, + { + "epoch": 1.3182955370833669, + "grad_norm": 0.688090443611145, + "learning_rate": 1.6293806130650413e-05, + "loss": 2.4013, + "step": 16335 + }, + { + "epoch": 1.31837624081995, + "grad_norm": 0.7300553321838379, + "learning_rate": 1.6285170033594288e-05, + "loss": 2.4716, + "step": 16336 + }, + { + "epoch": 1.3184569445565328, + "grad_norm": 0.6798286437988281, + "learning_rate": 1.627653602293059e-05, + "loss": 2.3893, + "step": 16337 + }, + { + "epoch": 1.318537648293116, + "grad_norm": 0.6699275970458984, + "learning_rate": 1.6267904098874442e-05, + "loss": 2.4446, + "step": 16338 + }, + { + "epoch": 1.318618352029699, + "grad_norm": 0.7632322311401367, + "learning_rate": 1.6259274261641e-05, + "loss": 2.4434, + "step": 16339 + }, + { + "epoch": 1.3186990557662819, + "grad_norm": 0.7156099677085876, + "learning_rate": 1.6250646511445343e-05, + "loss": 2.4142, + "step": 16340 + }, + { + "epoch": 1.318779759502865, + "grad_norm": 0.7525599598884583, + "learning_rate": 1.6242020848502505e-05, + "loss": 2.3543, + "step": 16341 + }, + { + "epoch": 1.318860463239448, + "grad_norm": 0.7063113451004028, + "learning_rate": 
1.623339727302745e-05, + "loss": 2.4754, + "step": 16342 + }, + { + "epoch": 1.318941166976031, + "grad_norm": 0.7138137221336365, + "learning_rate": 1.6224775785235123e-05, + "loss": 2.4223, + "step": 16343 + }, + { + "epoch": 1.319021870712614, + "grad_norm": 0.6976706981658936, + "learning_rate": 1.6216156385340352e-05, + "loss": 2.4878, + "step": 16344 + }, + { + "epoch": 1.319102574449197, + "grad_norm": 0.6931003332138062, + "learning_rate": 1.6207539073557974e-05, + "loss": 2.39, + "step": 16345 + }, + { + "epoch": 1.31918327818578, + "grad_norm": 0.6919357180595398, + "learning_rate": 1.6198923850102765e-05, + "loss": 2.4197, + "step": 16346 + }, + { + "epoch": 1.319263981922363, + "grad_norm": 0.7453805804252625, + "learning_rate": 1.619031071518945e-05, + "loss": 2.4226, + "step": 16347 + }, + { + "epoch": 1.3193446856589461, + "grad_norm": 0.6990562677383423, + "learning_rate": 1.6181699669032658e-05, + "loss": 2.3925, + "step": 16348 + }, + { + "epoch": 1.319425389395529, + "grad_norm": 0.6974303126335144, + "learning_rate": 1.6173090711847006e-05, + "loss": 2.445, + "step": 16349 + }, + { + "epoch": 1.319506093132112, + "grad_norm": 0.7278286814689636, + "learning_rate": 1.6164483843847057e-05, + "loss": 2.3869, + "step": 16350 + }, + { + "epoch": 1.319586796868695, + "grad_norm": 0.7282646298408508, + "learning_rate": 1.6155879065247326e-05, + "loss": 2.3694, + "step": 16351 + }, + { + "epoch": 1.319667500605278, + "grad_norm": 0.7329844832420349, + "learning_rate": 1.6147276376262255e-05, + "loss": 2.4369, + "step": 16352 + }, + { + "epoch": 1.319748204341861, + "grad_norm": 0.6499385833740234, + "learning_rate": 1.613867577710627e-05, + "loss": 2.441, + "step": 16353 + }, + { + "epoch": 1.319828908078444, + "grad_norm": 0.7026061415672302, + "learning_rate": 1.6130077267993683e-05, + "loss": 2.4117, + "step": 16354 + }, + { + "epoch": 1.319909611815027, + "grad_norm": 0.7007814049720764, + "learning_rate": 1.6121480849138803e-05, + "loss": 2.4287, + 
"step": 16355 + }, + { + "epoch": 1.31999031555161, + "grad_norm": 0.6525697708129883, + "learning_rate": 1.611288652075591e-05, + "loss": 2.3969, + "step": 16356 + }, + { + "epoch": 1.320071019288193, + "grad_norm": 0.7268216609954834, + "learning_rate": 1.610429428305914e-05, + "loss": 2.4227, + "step": 16357 + }, + { + "epoch": 1.3201517230247761, + "grad_norm": 0.6665107011795044, + "learning_rate": 1.6095704136262668e-05, + "loss": 2.3694, + "step": 16358 + }, + { + "epoch": 1.320232426761359, + "grad_norm": 0.6832399368286133, + "learning_rate": 1.60871160805806e-05, + "loss": 2.4001, + "step": 16359 + }, + { + "epoch": 1.320313130497942, + "grad_norm": 0.6788592338562012, + "learning_rate": 1.6078530116226897e-05, + "loss": 2.4294, + "step": 16360 + }, + { + "epoch": 1.3203938342345252, + "grad_norm": 0.7147449254989624, + "learning_rate": 1.6069946243415625e-05, + "loss": 2.3904, + "step": 16361 + }, + { + "epoch": 1.320474537971108, + "grad_norm": 0.7014418840408325, + "learning_rate": 1.6061364462360683e-05, + "loss": 2.4026, + "step": 16362 + }, + { + "epoch": 1.3205552417076911, + "grad_norm": 0.6867612600326538, + "learning_rate": 1.6052784773275987e-05, + "loss": 2.4092, + "step": 16363 + }, + { + "epoch": 1.3206359454442742, + "grad_norm": 0.6588961482048035, + "learning_rate": 1.6044207176375303e-05, + "loss": 2.4588, + "step": 16364 + }, + { + "epoch": 1.320716649180857, + "grad_norm": 0.688671350479126, + "learning_rate": 1.6035631671872444e-05, + "loss": 2.3957, + "step": 16365 + }, + { + "epoch": 1.3207973529174402, + "grad_norm": 0.7548064589500427, + "learning_rate": 1.6027058259981154e-05, + "loss": 2.4168, + "step": 16366 + }, + { + "epoch": 1.320878056654023, + "grad_norm": 0.7251972556114197, + "learning_rate": 1.6018486940915044e-05, + "loss": 2.4704, + "step": 16367 + }, + { + "epoch": 1.3209587603906061, + "grad_norm": 0.73149174451828, + "learning_rate": 1.6009917714887778e-05, + "loss": 2.4597, + "step": 16368 + }, + { + "epoch": 
1.321039464127189, + "grad_norm": 0.6741003394126892, + "learning_rate": 1.600135058211294e-05, + "loss": 2.3876, + "step": 16369 + }, + { + "epoch": 1.321120167863772, + "grad_norm": 0.6891310214996338, + "learning_rate": 1.5992785542804e-05, + "loss": 2.4229, + "step": 16370 + }, + { + "epoch": 1.3212008716003552, + "grad_norm": 0.7529458403587341, + "learning_rate": 1.5984222597174415e-05, + "loss": 2.45, + "step": 16371 + }, + { + "epoch": 1.321281575336938, + "grad_norm": 0.708134651184082, + "learning_rate": 1.5975661745437664e-05, + "loss": 2.454, + "step": 16372 + }, + { + "epoch": 1.321362279073521, + "grad_norm": 0.7511130571365356, + "learning_rate": 1.596710298780705e-05, + "loss": 2.4201, + "step": 16373 + }, + { + "epoch": 1.3214429828101042, + "grad_norm": 0.6599537134170532, + "learning_rate": 1.595854632449588e-05, + "loss": 2.3982, + "step": 16374 + }, + { + "epoch": 1.321523686546687, + "grad_norm": 0.6821228861808777, + "learning_rate": 1.5949991755717453e-05, + "loss": 2.4525, + "step": 16375 + }, + { + "epoch": 1.3216043902832701, + "grad_norm": 0.6872302293777466, + "learning_rate": 1.5941439281684923e-05, + "loss": 2.3631, + "step": 16376 + }, + { + "epoch": 1.3216850940198532, + "grad_norm": 0.6650066375732422, + "learning_rate": 1.5932888902611453e-05, + "loss": 2.3718, + "step": 16377 + }, + { + "epoch": 1.321765797756436, + "grad_norm": 0.6620016694068909, + "learning_rate": 1.5924340618710143e-05, + "loss": 2.4076, + "step": 16378 + }, + { + "epoch": 1.3218465014930192, + "grad_norm": 0.694807231426239, + "learning_rate": 1.5915794430194066e-05, + "loss": 2.4369, + "step": 16379 + }, + { + "epoch": 1.321927205229602, + "grad_norm": 0.6810131669044495, + "learning_rate": 1.590725033727616e-05, + "loss": 2.4151, + "step": 16380 + }, + { + "epoch": 1.3220079089661851, + "grad_norm": 0.768846333026886, + "learning_rate": 1.58987083401694e-05, + "loss": 2.4991, + "step": 16381 + }, + { + "epoch": 1.322088612702768, + "grad_norm": 
0.6581698656082153, + "learning_rate": 1.5890168439086672e-05, + "loss": 2.4263, + "step": 16382 + }, + { + "epoch": 1.322169316439351, + "grad_norm": 0.7267034649848938, + "learning_rate": 1.5881630634240818e-05, + "loss": 2.4219, + "step": 16383 + }, + { + "epoch": 1.3222500201759342, + "grad_norm": 0.7391555905342102, + "learning_rate": 1.5873094925844612e-05, + "loss": 2.427, + "step": 16384 + }, + { + "epoch": 1.322330723912517, + "grad_norm": 0.6612021923065186, + "learning_rate": 1.5864561314110815e-05, + "loss": 2.4108, + "step": 16385 + }, + { + "epoch": 1.3224114276491001, + "grad_norm": 0.7118437886238098, + "learning_rate": 1.585602979925206e-05, + "loss": 2.3839, + "step": 16386 + }, + { + "epoch": 1.3224921313856832, + "grad_norm": 0.6663616299629211, + "learning_rate": 1.5847500381480997e-05, + "loss": 2.4302, + "step": 16387 + }, + { + "epoch": 1.322572835122266, + "grad_norm": 0.6848715543746948, + "learning_rate": 1.583897306101022e-05, + "loss": 2.4228, + "step": 16388 + }, + { + "epoch": 1.3226535388588492, + "grad_norm": 0.680895209312439, + "learning_rate": 1.5830447838052208e-05, + "loss": 2.4457, + "step": 16389 + }, + { + "epoch": 1.3227342425954323, + "grad_norm": 0.683276891708374, + "learning_rate": 1.582192471281946e-05, + "loss": 2.4412, + "step": 16390 + }, + { + "epoch": 1.3228149463320151, + "grad_norm": 0.7311880588531494, + "learning_rate": 1.5813403685524396e-05, + "loss": 2.4604, + "step": 16391 + }, + { + "epoch": 1.3228956500685982, + "grad_norm": 0.6769095659255981, + "learning_rate": 1.580488475637937e-05, + "loss": 2.4311, + "step": 16392 + }, + { + "epoch": 1.3229763538051813, + "grad_norm": 0.6683096289634705, + "learning_rate": 1.579636792559671e-05, + "loss": 2.445, + "step": 16393 + }, + { + "epoch": 1.3230570575417642, + "grad_norm": 0.7268782258033752, + "learning_rate": 1.5787853193388667e-05, + "loss": 2.4176, + "step": 16394 + }, + { + "epoch": 1.3231377612783473, + "grad_norm": 0.6878541707992554, + 
"learning_rate": 1.5779340559967494e-05, + "loss": 2.4615, + "step": 16395 + }, + { + "epoch": 1.3232184650149301, + "grad_norm": 0.7031291127204895, + "learning_rate": 1.577083002554527e-05, + "loss": 2.3726, + "step": 16396 + }, + { + "epoch": 1.3232991687515132, + "grad_norm": 0.7738708853721619, + "learning_rate": 1.5762321590334138e-05, + "loss": 2.5046, + "step": 16397 + }, + { + "epoch": 1.323379872488096, + "grad_norm": 0.6660913228988647, + "learning_rate": 1.575381525454619e-05, + "loss": 2.3759, + "step": 16398 + }, + { + "epoch": 1.3234605762246792, + "grad_norm": 0.6534021496772766, + "learning_rate": 1.574531101839335e-05, + "loss": 2.3983, + "step": 16399 + }, + { + "epoch": 1.3235412799612623, + "grad_norm": 0.6645511388778687, + "learning_rate": 1.5736808882087606e-05, + "loss": 2.3958, + "step": 16400 + }, + { + "epoch": 1.3236219836978451, + "grad_norm": 0.6723225712776184, + "learning_rate": 1.5728308845840855e-05, + "loss": 2.4248, + "step": 16401 + }, + { + "epoch": 1.3237026874344282, + "grad_norm": 0.6609976887702942, + "learning_rate": 1.5719810909864942e-05, + "loss": 2.3888, + "step": 16402 + }, + { + "epoch": 1.3237833911710113, + "grad_norm": 0.6713845729827881, + "learning_rate": 1.5711315074371635e-05, + "loss": 2.4474, + "step": 16403 + }, + { + "epoch": 1.3238640949075942, + "grad_norm": 0.701438307762146, + "learning_rate": 1.5702821339572726e-05, + "loss": 2.4673, + "step": 16404 + }, + { + "epoch": 1.3239447986441772, + "grad_norm": 0.7235428094863892, + "learning_rate": 1.5694329705679834e-05, + "loss": 2.3825, + "step": 16405 + }, + { + "epoch": 1.3240255023807603, + "grad_norm": 0.6785053610801697, + "learning_rate": 1.568584017290462e-05, + "loss": 2.4668, + "step": 16406 + }, + { + "epoch": 1.3241062061173432, + "grad_norm": 0.6918929815292358, + "learning_rate": 1.5677352741458705e-05, + "loss": 2.4329, + "step": 16407 + }, + { + "epoch": 1.3241869098539263, + "grad_norm": 0.7194826006889343, + "learning_rate": 
1.5668867411553544e-05, + "loss": 2.3717, + "step": 16408 + }, + { + "epoch": 1.3242676135905094, + "grad_norm": 0.7299134731292725, + "learning_rate": 1.5660384183400658e-05, + "loss": 2.4695, + "step": 16409 + }, + { + "epoch": 1.3243483173270922, + "grad_norm": 0.7047600746154785, + "learning_rate": 1.565190305721147e-05, + "loss": 2.4525, + "step": 16410 + }, + { + "epoch": 1.3244290210636753, + "grad_norm": 0.685001015663147, + "learning_rate": 1.5643424033197328e-05, + "loss": 2.322, + "step": 16411 + }, + { + "epoch": 1.3245097248002582, + "grad_norm": 0.7696635127067566, + "learning_rate": 1.5634947111569588e-05, + "loss": 2.4464, + "step": 16412 + }, + { + "epoch": 1.3245904285368413, + "grad_norm": 0.7066066265106201, + "learning_rate": 1.5626472292539485e-05, + "loss": 2.4315, + "step": 16413 + }, + { + "epoch": 1.3246711322734241, + "grad_norm": 0.6553033590316772, + "learning_rate": 1.5617999576318276e-05, + "loss": 2.4296, + "step": 16414 + }, + { + "epoch": 1.3247518360100072, + "grad_norm": 0.7031354308128357, + "learning_rate": 1.560952896311707e-05, + "loss": 2.4565, + "step": 16415 + }, + { + "epoch": 1.3248325397465903, + "grad_norm": 0.7826353311538696, + "learning_rate": 1.560106045314701e-05, + "loss": 2.4275, + "step": 16416 + }, + { + "epoch": 1.3249132434831732, + "grad_norm": 0.6408981084823608, + "learning_rate": 1.559259404661916e-05, + "loss": 2.3869, + "step": 16417 + }, + { + "epoch": 1.3249939472197563, + "grad_norm": 0.7487547993659973, + "learning_rate": 1.558412974374448e-05, + "loss": 2.3678, + "step": 16418 + }, + { + "epoch": 1.3250746509563394, + "grad_norm": 0.7163991332054138, + "learning_rate": 1.5575667544733963e-05, + "loss": 2.397, + "step": 16419 + }, + { + "epoch": 1.3251553546929222, + "grad_norm": 0.6933553814888, + "learning_rate": 1.5567207449798515e-05, + "loss": 2.424, + "step": 16420 + }, + { + "epoch": 1.3252360584295053, + "grad_norm": 0.687406063079834, + "learning_rate": 1.5558749459148945e-05, + "loss": 
2.4346, + "step": 16421 + }, + { + "epoch": 1.3253167621660884, + "grad_norm": 0.6781243681907654, + "learning_rate": 1.5550293572996054e-05, + "loss": 2.4526, + "step": 16422 + }, + { + "epoch": 1.3253974659026713, + "grad_norm": 0.6632506847381592, + "learning_rate": 1.5541839791550616e-05, + "loss": 2.4559, + "step": 16423 + }, + { + "epoch": 1.3254781696392544, + "grad_norm": 0.668396532535553, + "learning_rate": 1.5533388115023327e-05, + "loss": 2.4463, + "step": 16424 + }, + { + "epoch": 1.3255588733758372, + "grad_norm": 0.6853309869766235, + "learning_rate": 1.552493854362479e-05, + "loss": 2.429, + "step": 16425 + }, + { + "epoch": 1.3256395771124203, + "grad_norm": 0.7443413138389587, + "learning_rate": 1.5516491077565597e-05, + "loss": 2.4091, + "step": 16426 + }, + { + "epoch": 1.3257202808490032, + "grad_norm": 0.690170168876648, + "learning_rate": 1.550804571705632e-05, + "loss": 2.3942, + "step": 16427 + }, + { + "epoch": 1.3258009845855863, + "grad_norm": NaN, + "learning_rate": 1.550804571705632e-05, + "loss": 2.3788, + "step": 16428 + }, + { + "epoch": 1.3258816883221693, + "grad_norm": 0.6901132464408875, + "learning_rate": 1.5499602462307373e-05, + "loss": 2.3859, + "step": 16429 + }, + { + "epoch": 1.3259623920587522, + "grad_norm": 0.6639334559440613, + "learning_rate": 1.5491161313529223e-05, + "loss": 2.4271, + "step": 16430 + }, + { + "epoch": 1.3260430957953353, + "grad_norm": 0.7121936678886414, + "learning_rate": 1.548272227093227e-05, + "loss": 2.3818, + "step": 16431 + }, + { + "epoch": 1.3261237995319184, + "grad_norm": 0.6863218545913696, + "learning_rate": 1.5474285334726778e-05, + "loss": 2.3744, + "step": 16432 + }, + { + "epoch": 1.3262045032685013, + "grad_norm": 0.6697081327438354, + "learning_rate": 1.5465850505123057e-05, + "loss": 2.4001, + "step": 16433 + }, + { + "epoch": 1.3262852070050843, + "grad_norm": 0.7258912324905396, + "learning_rate": 1.5457417782331308e-05, + "loss": 2.4556, + "step": 16434 + }, + { + "epoch": 
1.3263659107416674, + "grad_norm": 0.6930057406425476, + "learning_rate": 1.5448987166561712e-05, + "loss": 2.4979, + "step": 16435 + }, + { + "epoch": 1.3264466144782503, + "grad_norm": 0.6475574970245361, + "learning_rate": 1.5440558658024363e-05, + "loss": 2.3821, + "step": 16436 + }, + { + "epoch": 1.3265273182148334, + "grad_norm": 0.7489237785339355, + "learning_rate": 1.5432132256929367e-05, + "loss": 2.465, + "step": 16437 + }, + { + "epoch": 1.3266080219514165, + "grad_norm": 0.704391360282898, + "learning_rate": 1.5423707963486667e-05, + "loss": 2.433, + "step": 16438 + }, + { + "epoch": 1.3266887256879993, + "grad_norm": 0.669452965259552, + "learning_rate": 1.5415285777906253e-05, + "loss": 2.3981, + "step": 16439 + }, + { + "epoch": 1.3267694294245824, + "grad_norm": 0.6961604356765747, + "learning_rate": 1.540686570039802e-05, + "loss": 2.4684, + "step": 16440 + }, + { + "epoch": 1.3268501331611653, + "grad_norm": 0.6613924503326416, + "learning_rate": 1.539844773117185e-05, + "loss": 2.3711, + "step": 16441 + }, + { + "epoch": 1.3269308368977484, + "grad_norm": 0.7019763588905334, + "learning_rate": 1.5390031870437492e-05, + "loss": 2.3716, + "step": 16442 + }, + { + "epoch": 1.3270115406343312, + "grad_norm": 0.700176477432251, + "learning_rate": 1.5381618118404707e-05, + "loss": 2.4305, + "step": 16443 + }, + { + "epoch": 1.3270922443709143, + "grad_norm": 0.6716598272323608, + "learning_rate": 1.5373206475283197e-05, + "loss": 2.3835, + "step": 16444 + }, + { + "epoch": 1.3271729481074974, + "grad_norm": 0.6449697017669678, + "learning_rate": 1.53647969412826e-05, + "loss": 2.3707, + "step": 16445 + }, + { + "epoch": 1.3272536518440803, + "grad_norm": 0.7276685237884521, + "learning_rate": 1.535638951661249e-05, + "loss": 2.4313, + "step": 16446 + }, + { + "epoch": 1.3273343555806634, + "grad_norm": 0.7144705057144165, + "learning_rate": 1.5347984201482456e-05, + "loss": 2.4122, + "step": 16447 + }, + { + "epoch": 1.3274150593172465, + 
"grad_norm": 0.660225510597229, + "learning_rate": 1.53395809961019e-05, + "loss": 2.4282, + "step": 16448 + }, + { + "epoch": 1.3274957630538293, + "grad_norm": 0.7431676983833313, + "learning_rate": 1.5331179900680293e-05, + "loss": 2.3863, + "step": 16449 + }, + { + "epoch": 1.3275764667904124, + "grad_norm": 0.6670290231704712, + "learning_rate": 1.5322780915427036e-05, + "loss": 2.4266, + "step": 16450 + }, + { + "epoch": 1.3276571705269955, + "grad_norm": 0.711098313331604, + "learning_rate": 1.531438404055141e-05, + "loss": 2.4431, + "step": 16451 + }, + { + "epoch": 1.3277378742635784, + "grad_norm": 0.6908091902732849, + "learning_rate": 1.5305989276262688e-05, + "loss": 2.4153, + "step": 16452 + }, + { + "epoch": 1.3278185780001615, + "grad_norm": 0.7458107471466064, + "learning_rate": 1.5297596622770115e-05, + "loss": 2.4076, + "step": 16453 + }, + { + "epoch": 1.3278992817367445, + "grad_norm": 0.7406951189041138, + "learning_rate": 1.528920608028285e-05, + "loss": 2.3585, + "step": 16454 + }, + { + "epoch": 1.3279799854733274, + "grad_norm": 0.718824565410614, + "learning_rate": 1.5280817649010005e-05, + "loss": 2.4092, + "step": 16455 + }, + { + "epoch": 1.3280606892099105, + "grad_norm": 0.7163959741592407, + "learning_rate": 1.527243132916064e-05, + "loss": 2.4344, + "step": 16456 + }, + { + "epoch": 1.3281413929464934, + "grad_norm": 0.6695916652679443, + "learning_rate": 1.5264047120943793e-05, + "loss": 2.4144, + "step": 16457 + }, + { + "epoch": 1.3282220966830764, + "grad_norm": 0.6858509182929993, + "learning_rate": 1.5255665024568366e-05, + "loss": 2.4345, + "step": 16458 + }, + { + "epoch": 1.3283028004196593, + "grad_norm": 0.7277235388755798, + "learning_rate": 1.5247285040243297e-05, + "loss": 2.4219, + "step": 16459 + }, + { + "epoch": 1.3283835041562424, + "grad_norm": 0.6481949090957642, + "learning_rate": 1.5238907168177441e-05, + "loss": 2.4483, + "step": 16460 + }, + { + "epoch": 1.3284642078928255, + "grad_norm": 
0.6956833600997925, + "learning_rate": 1.5230531408579574e-05, + "loss": 2.4241, + "step": 16461 + }, + { + "epoch": 1.3285449116294084, + "grad_norm": 0.7266185879707336, + "learning_rate": 1.522215776165845e-05, + "loss": 2.4577, + "step": 16462 + }, + { + "epoch": 1.3286256153659914, + "grad_norm": 0.725574254989624, + "learning_rate": 1.5213786227622773e-05, + "loss": 2.4451, + "step": 16463 + }, + { + "epoch": 1.3287063191025745, + "grad_norm": 0.7550850510597229, + "learning_rate": 1.5205416806681172e-05, + "loss": 2.4262, + "step": 16464 + }, + { + "epoch": 1.3287870228391574, + "grad_norm": 0.6391028761863708, + "learning_rate": 1.5197049499042237e-05, + "loss": 2.4116, + "step": 16465 + }, + { + "epoch": 1.3288677265757405, + "grad_norm": 0.6899027824401855, + "learning_rate": 1.5188684304914524e-05, + "loss": 2.3754, + "step": 16466 + }, + { + "epoch": 1.3289484303123236, + "grad_norm": 0.696681022644043, + "learning_rate": 1.518032122450649e-05, + "loss": 2.471, + "step": 16467 + }, + { + "epoch": 1.3290291340489064, + "grad_norm": 0.7090939283370972, + "learning_rate": 1.5171960258026551e-05, + "loss": 2.4153, + "step": 16468 + }, + { + "epoch": 1.3291098377854895, + "grad_norm": 0.7125746607780457, + "learning_rate": 1.5163601405683148e-05, + "loss": 2.4102, + "step": 16469 + }, + { + "epoch": 1.3291905415220726, + "grad_norm": 0.7407518029212952, + "learning_rate": 1.5155244667684531e-05, + "loss": 2.429, + "step": 16470 + }, + { + "epoch": 1.3292712452586555, + "grad_norm": 0.7401885390281677, + "learning_rate": 1.5146890044239004e-05, + "loss": 2.4577, + "step": 16471 + }, + { + "epoch": 1.3293519489952383, + "grad_norm": 0.7625757455825806, + "learning_rate": 1.5138537535554786e-05, + "loss": 2.3813, + "step": 16472 + }, + { + "epoch": 1.3294326527318214, + "grad_norm": 0.7423396706581116, + "learning_rate": 1.5130187141840057e-05, + "loss": 2.3797, + "step": 16473 + }, + { + "epoch": 1.3295133564684045, + "grad_norm": 0.7029228806495667, + 
"learning_rate": 1.5121838863302884e-05, + "loss": 2.4203, + "step": 16474 + }, + { + "epoch": 1.3295940602049874, + "grad_norm": 0.8062863349914551, + "learning_rate": 1.5113492700151378e-05, + "loss": 2.3743, + "step": 16475 + }, + { + "epoch": 1.3296747639415705, + "grad_norm": 0.7113343477249146, + "learning_rate": 1.5105148652593548e-05, + "loss": 2.3837, + "step": 16476 + }, + { + "epoch": 1.3297554676781536, + "grad_norm": 0.6733126044273376, + "learning_rate": 1.5096806720837309e-05, + "loss": 2.4677, + "step": 16477 + }, + { + "epoch": 1.3298361714147364, + "grad_norm": 0.6936657428741455, + "learning_rate": 1.5088466905090593e-05, + "loss": 2.3677, + "step": 16478 + }, + { + "epoch": 1.3299168751513195, + "grad_norm": 0.746746301651001, + "learning_rate": 1.5080129205561255e-05, + "loss": 2.423, + "step": 16479 + }, + { + "epoch": 1.3299975788879026, + "grad_norm": 0.6879116296768188, + "learning_rate": 1.5071793622457065e-05, + "loss": 2.4867, + "step": 16480 + }, + { + "epoch": 1.3300782826244855, + "grad_norm": 0.6841214299201965, + "learning_rate": 1.5063460155985776e-05, + "loss": 2.5015, + "step": 16481 + }, + { + "epoch": 1.3301589863610686, + "grad_norm": 0.6955111622810364, + "learning_rate": 1.5055128806355123e-05, + "loss": 2.3975, + "step": 16482 + }, + { + "epoch": 1.3302396900976516, + "grad_norm": 0.7084987163543701, + "learning_rate": 1.5046799573772673e-05, + "loss": 2.4511, + "step": 16483 + }, + { + "epoch": 1.3303203938342345, + "grad_norm": 0.6905840039253235, + "learning_rate": 1.5038472458446051e-05, + "loss": 2.3542, + "step": 16484 + }, + { + "epoch": 1.3304010975708176, + "grad_norm": 0.7182672023773193, + "learning_rate": 1.5030147460582788e-05, + "loss": 2.3673, + "step": 16485 + }, + { + "epoch": 1.3304818013074005, + "grad_norm": 0.6805183291435242, + "learning_rate": 1.5021824580390353e-05, + "loss": 2.3751, + "step": 16486 + }, + { + "epoch": 1.3305625050439835, + "grad_norm": 0.6278836727142334, + "learning_rate": 
1.5013503818076202e-05, + "loss": 2.3508, + "step": 16487 + }, + { + "epoch": 1.3306432087805664, + "grad_norm": 0.664000391960144, + "learning_rate": 1.500518517384768e-05, + "loss": 2.4039, + "step": 16488 + }, + { + "epoch": 1.3307239125171495, + "grad_norm": 0.6906681060791016, + "learning_rate": 1.4996868647912155e-05, + "loss": 2.4068, + "step": 16489 + }, + { + "epoch": 1.3308046162537326, + "grad_norm": 0.6756102442741394, + "learning_rate": 1.4988554240476826e-05, + "loss": 2.4423, + "step": 16490 + }, + { + "epoch": 1.3308853199903155, + "grad_norm": 0.7013095021247864, + "learning_rate": 1.4980241951748964e-05, + "loss": 2.3536, + "step": 16491 + }, + { + "epoch": 1.3309660237268985, + "grad_norm": 0.6689851880073547, + "learning_rate": 1.4971931781935732e-05, + "loss": 2.4192, + "step": 16492 + }, + { + "epoch": 1.3310467274634816, + "grad_norm": 0.6411572694778442, + "learning_rate": 1.4963623731244202e-05, + "loss": 2.4012, + "step": 16493 + }, + { + "epoch": 1.3311274312000645, + "grad_norm": 0.7209812998771667, + "learning_rate": 1.4955317799881453e-05, + "loss": 2.378, + "step": 16494 + }, + { + "epoch": 1.3312081349366476, + "grad_norm": 0.7041119933128357, + "learning_rate": 1.4947013988054504e-05, + "loss": 2.4047, + "step": 16495 + }, + { + "epoch": 1.3312888386732307, + "grad_norm": 0.6928852796554565, + "learning_rate": 1.4938712295970292e-05, + "loss": 2.4489, + "step": 16496 + }, + { + "epoch": 1.3313695424098135, + "grad_norm": 0.6923524141311646, + "learning_rate": 1.4930412723835718e-05, + "loss": 2.3752, + "step": 16497 + }, + { + "epoch": 1.3314502461463966, + "grad_norm": 0.7034686803817749, + "learning_rate": 1.4922115271857662e-05, + "loss": 2.3898, + "step": 16498 + }, + { + "epoch": 1.3315309498829797, + "grad_norm": 0.6717320084571838, + "learning_rate": 1.4913819940242856e-05, + "loss": 2.3629, + "step": 16499 + }, + { + "epoch": 1.3316116536195626, + "grad_norm": 0.6885079741477966, + "learning_rate": 1.4905526729198083e-05, + 
"loss": 2.4321, + "step": 16500 + }, + { + "epoch": 1.3316923573561457, + "grad_norm": 0.662452757358551, + "learning_rate": 1.489723563893004e-05, + "loss": 2.4532, + "step": 16501 + }, + { + "epoch": 1.3317730610927285, + "grad_norm": 0.6650903224945068, + "learning_rate": 1.4888946669645332e-05, + "loss": 2.4347, + "step": 16502 + }, + { + "epoch": 1.3318537648293116, + "grad_norm": 0.7217590808868408, + "learning_rate": 1.4880659821550546e-05, + "loss": 2.4641, + "step": 16503 + }, + { + "epoch": 1.3319344685658945, + "grad_norm": 0.7063763737678528, + "learning_rate": 1.4872375094852232e-05, + "loss": 2.4365, + "step": 16504 + }, + { + "epoch": 1.3320151723024776, + "grad_norm": 0.7366454005241394, + "learning_rate": 1.4864092489756853e-05, + "loss": 2.4223, + "step": 16505 + }, + { + "epoch": 1.3320958760390607, + "grad_norm": 0.7132206559181213, + "learning_rate": 1.4855812006470838e-05, + "loss": 2.4404, + "step": 16506 + }, + { + "epoch": 1.3321765797756435, + "grad_norm": 0.665553867816925, + "learning_rate": 1.484753364520055e-05, + "loss": 2.3818, + "step": 16507 + }, + { + "epoch": 1.3322572835122266, + "grad_norm": 0.7854028344154358, + "learning_rate": 1.483925740615234e-05, + "loss": 2.4111, + "step": 16508 + }, + { + "epoch": 1.3323379872488097, + "grad_norm": 0.7331317663192749, + "learning_rate": 1.4830983289532418e-05, + "loss": 2.4446, + "step": 16509 + }, + { + "epoch": 1.3324186909853926, + "grad_norm": 0.670315146446228, + "learning_rate": 1.4822711295547042e-05, + "loss": 2.4017, + "step": 16510 + }, + { + "epoch": 1.3324993947219756, + "grad_norm": 0.7242144346237183, + "learning_rate": 1.481444142440237e-05, + "loss": 2.4281, + "step": 16511 + }, + { + "epoch": 1.3325800984585587, + "grad_norm": 0.7108538746833801, + "learning_rate": 1.4806173676304468e-05, + "loss": 2.4331, + "step": 16512 + }, + { + "epoch": 1.3326608021951416, + "grad_norm": 0.658989667892456, + "learning_rate": 1.479790805145943e-05, + "loss": 2.4321, + "step": 16513 
+ }, + { + "epoch": 1.3327415059317247, + "grad_norm": 0.6596404314041138, + "learning_rate": 1.4789644550073233e-05, + "loss": 2.3817, + "step": 16514 + }, + { + "epoch": 1.3328222096683078, + "grad_norm": 0.6922028064727783, + "learning_rate": 1.4781383172351837e-05, + "loss": 2.399, + "step": 16515 + }, + { + "epoch": 1.3329029134048906, + "grad_norm": 0.750747799873352, + "learning_rate": 1.4773123918501141e-05, + "loss": 2.4502, + "step": 16516 + }, + { + "epoch": 1.3329836171414737, + "grad_norm": 0.6887632608413696, + "learning_rate": 1.4764866788727006e-05, + "loss": 2.3636, + "step": 16517 + }, + { + "epoch": 1.3330643208780566, + "grad_norm": 0.6751166582107544, + "learning_rate": 1.4756611783235163e-05, + "loss": 2.3956, + "step": 16518 + }, + { + "epoch": 1.3331450246146397, + "grad_norm": 0.679040253162384, + "learning_rate": 1.4748358902231395e-05, + "loss": 2.4044, + "step": 16519 + }, + { + "epoch": 1.3332257283512225, + "grad_norm": 0.6396780610084534, + "learning_rate": 1.4740108145921373e-05, + "loss": 2.4114, + "step": 16520 + }, + { + "epoch": 1.3333064320878056, + "grad_norm": 0.6686230301856995, + "learning_rate": 1.4731859514510738e-05, + "loss": 2.4535, + "step": 16521 + }, + { + "epoch": 1.3333871358243887, + "grad_norm": 0.6693681478500366, + "learning_rate": 1.472361300820505e-05, + "loss": 2.3885, + "step": 16522 + }, + { + "epoch": 1.3334678395609716, + "grad_norm": 0.7700718641281128, + "learning_rate": 1.4715368627209836e-05, + "loss": 2.3939, + "step": 16523 + }, + { + "epoch": 1.3335485432975547, + "grad_norm": 0.7203121781349182, + "learning_rate": 1.4707126371730561e-05, + "loss": 2.4644, + "step": 16524 + }, + { + "epoch": 1.3336292470341378, + "grad_norm": 0.7798308730125427, + "learning_rate": 1.4698886241972665e-05, + "loss": 2.4293, + "step": 16525 + }, + { + "epoch": 1.3337099507707206, + "grad_norm": 0.7017160654067993, + "learning_rate": 1.4690648238141503e-05, + "loss": 2.4327, + "step": 16526 + }, + { + "epoch": 
1.3337906545073037, + "grad_norm": 0.6522603631019592, + "learning_rate": 1.468241236044241e-05, + "loss": 2.3955, + "step": 16527 + }, + { + "epoch": 1.3338713582438868, + "grad_norm": 0.766222357749939, + "learning_rate": 1.4674178609080602e-05, + "loss": 2.4652, + "step": 16528 + }, + { + "epoch": 1.3339520619804697, + "grad_norm": 0.7351565361022949, + "learning_rate": 1.4665946984261303e-05, + "loss": 2.4607, + "step": 16529 + }, + { + "epoch": 1.3340327657170528, + "grad_norm": 0.6817728281021118, + "learning_rate": 1.4657717486189693e-05, + "loss": 2.3687, + "step": 16530 + }, + { + "epoch": 1.3341134694536356, + "grad_norm": 0.7401643395423889, + "learning_rate": 1.464949011507083e-05, + "loss": 2.4179, + "step": 16531 + }, + { + "epoch": 1.3341941731902187, + "grad_norm": 0.7783530354499817, + "learning_rate": 1.4641264871109784e-05, + "loss": 2.4088, + "step": 16532 + }, + { + "epoch": 1.3342748769268016, + "grad_norm": 0.6761943697929382, + "learning_rate": 1.4633041754511534e-05, + "loss": 2.4141, + "step": 16533 + }, + { + "epoch": 1.3343555806633847, + "grad_norm": 0.6842260360717773, + "learning_rate": 1.4624820765481073e-05, + "loss": 2.4918, + "step": 16534 + }, + { + "epoch": 1.3344362843999678, + "grad_norm": 0.6906094551086426, + "learning_rate": 1.4616601904223225e-05, + "loss": 2.4576, + "step": 16535 + }, + { + "epoch": 1.3345169881365506, + "grad_norm": 0.6549125909805298, + "learning_rate": 1.4608385170942829e-05, + "loss": 2.3748, + "step": 16536 + }, + { + "epoch": 1.3345976918731337, + "grad_norm": 0.6603896617889404, + "learning_rate": 1.4600170565844728e-05, + "loss": 2.3739, + "step": 16537 + }, + { + "epoch": 1.3346783956097168, + "grad_norm": 0.6413096189498901, + "learning_rate": 1.4591958089133606e-05, + "loss": 2.3979, + "step": 16538 + }, + { + "epoch": 1.3347590993462997, + "grad_norm": 0.7085204720497131, + "learning_rate": 1.4583747741014142e-05, + "loss": 2.4185, + "step": 16539 + }, + { + "epoch": 1.3348398030828827, + 
"grad_norm": 0.6517937183380127, + "learning_rate": 1.4575539521690983e-05, + "loss": 2.3938, + "step": 16540 + }, + { + "epoch": 1.3349205068194658, + "grad_norm": 0.6326449513435364, + "learning_rate": 1.4567333431368658e-05, + "loss": 2.4613, + "step": 16541 + }, + { + "epoch": 1.3350012105560487, + "grad_norm": 0.8046317100524902, + "learning_rate": 1.4559129470251708e-05, + "loss": 2.4547, + "step": 16542 + }, + { + "epoch": 1.3350819142926318, + "grad_norm": 0.6661570072174072, + "learning_rate": 1.455092763854462e-05, + "loss": 2.3636, + "step": 16543 + }, + { + "epoch": 1.3351626180292149, + "grad_norm": 0.6806541085243225, + "learning_rate": 1.454272793645176e-05, + "loss": 2.4309, + "step": 16544 + }, + { + "epoch": 1.3352433217657977, + "grad_norm": 0.651836097240448, + "learning_rate": 1.45345303641775e-05, + "loss": 2.3862, + "step": 16545 + }, + { + "epoch": 1.3353240255023808, + "grad_norm": 0.7448983192443848, + "learning_rate": 1.4526334921926165e-05, + "loss": 2.4654, + "step": 16546 + }, + { + "epoch": 1.3354047292389637, + "grad_norm": 0.6885285973548889, + "learning_rate": 1.4518141609901992e-05, + "loss": 2.3943, + "step": 16547 + }, + { + "epoch": 1.3354854329755468, + "grad_norm": 0.7204004526138306, + "learning_rate": 1.450995042830917e-05, + "loss": 2.4117, + "step": 16548 + }, + { + "epoch": 1.3355661367121296, + "grad_norm": 0.6551961898803711, + "learning_rate": 1.4501761377351864e-05, + "loss": 2.4269, + "step": 16549 + }, + { + "epoch": 1.3356468404487127, + "grad_norm": 0.7191253304481506, + "learning_rate": 1.4493574457234182e-05, + "loss": 2.3472, + "step": 16550 + }, + { + "epoch": 1.3357275441852958, + "grad_norm": 0.6793580651283264, + "learning_rate": 1.4485389668160121e-05, + "loss": 2.4264, + "step": 16551 + }, + { + "epoch": 1.3358082479218787, + "grad_norm": 0.704250693321228, + "learning_rate": 1.4477207010333682e-05, + "loss": 2.5236, + "step": 16552 + }, + { + "epoch": 1.3358889516584618, + "grad_norm": 
0.6826470494270325, + "learning_rate": 1.4469026483958837e-05, + "loss": 2.4473, + "step": 16553 + }, + { + "epoch": 1.3359696553950449, + "grad_norm": 0.6646167039871216, + "learning_rate": 1.4460848089239399e-05, + "loss": 2.4232, + "step": 16554 + }, + { + "epoch": 1.3360503591316277, + "grad_norm": 0.7604451179504395, + "learning_rate": 1.4452671826379227e-05, + "loss": 2.4208, + "step": 16555 + }, + { + "epoch": 1.3361310628682108, + "grad_norm": 0.7129300236701965, + "learning_rate": 1.4444497695582093e-05, + "loss": 2.4304, + "step": 16556 + }, + { + "epoch": 1.336211766604794, + "grad_norm": 0.6769927740097046, + "learning_rate": 1.4436325697051733e-05, + "loss": 2.3467, + "step": 16557 + }, + { + "epoch": 1.3362924703413768, + "grad_norm": 0.6568608283996582, + "learning_rate": 1.4428155830991797e-05, + "loss": 2.4285, + "step": 16558 + }, + { + "epoch": 1.3363731740779599, + "grad_norm": 0.7687276005744934, + "learning_rate": 1.4419988097605919e-05, + "loss": 2.4815, + "step": 16559 + }, + { + "epoch": 1.336453877814543, + "grad_norm": 0.7001463770866394, + "learning_rate": 1.4411822497097638e-05, + "loss": 2.4629, + "step": 16560 + }, + { + "epoch": 1.3365345815511258, + "grad_norm": 0.7211995720863342, + "learning_rate": 1.4403659029670458e-05, + "loss": 2.4323, + "step": 16561 + }, + { + "epoch": 1.336615285287709, + "grad_norm": 0.7371769547462463, + "learning_rate": 1.439549769552787e-05, + "loss": 2.3962, + "step": 16562 + }, + { + "epoch": 1.3366959890242918, + "grad_norm": 0.7475463151931763, + "learning_rate": 1.4387338494873237e-05, + "loss": 2.3593, + "step": 16563 + }, + { + "epoch": 1.3367766927608749, + "grad_norm": 0.7215834856033325, + "learning_rate": 1.4379181427909916e-05, + "loss": 2.3687, + "step": 16564 + }, + { + "epoch": 1.3368573964974577, + "grad_norm": 0.7160200476646423, + "learning_rate": 1.4371026494841211e-05, + "loss": 2.3652, + "step": 16565 + }, + { + "epoch": 1.3369381002340408, + "grad_norm": 0.6636231541633606, + 
"learning_rate": 1.436287369587036e-05, + "loss": 2.4628, + "step": 16566 + }, + { + "epoch": 1.337018803970624, + "grad_norm": 0.657774806022644, + "learning_rate": 1.4354723031200556e-05, + "loss": 2.4082, + "step": 16567 + }, + { + "epoch": 1.3370995077072068, + "grad_norm": 0.7020300626754761, + "learning_rate": 1.4346574501034936e-05, + "loss": 2.3821, + "step": 16568 + }, + { + "epoch": 1.3371802114437898, + "grad_norm": 0.6800786256790161, + "learning_rate": 1.4338428105576595e-05, + "loss": 2.3839, + "step": 16569 + }, + { + "epoch": 1.337260915180373, + "grad_norm": 0.7176932692527771, + "learning_rate": 1.4330283845028536e-05, + "loss": 2.4614, + "step": 16570 + }, + { + "epoch": 1.3373416189169558, + "grad_norm": 0.7233355641365051, + "learning_rate": 1.432214171959374e-05, + "loss": 2.4048, + "step": 16571 + }, + { + "epoch": 1.3374223226535389, + "grad_norm": 0.7721874117851257, + "learning_rate": 1.4314001729475157e-05, + "loss": 2.4169, + "step": 16572 + }, + { + "epoch": 1.337503026390122, + "grad_norm": 0.7123380303382874, + "learning_rate": 1.4305863874875613e-05, + "loss": 2.3799, + "step": 16573 + }, + { + "epoch": 1.3375837301267048, + "grad_norm": 0.7297765016555786, + "learning_rate": 1.4297728155997958e-05, + "loss": 2.4655, + "step": 16574 + }, + { + "epoch": 1.337664433863288, + "grad_norm": 0.6806401610374451, + "learning_rate": 1.428959457304493e-05, + "loss": 2.4102, + "step": 16575 + }, + { + "epoch": 1.3377451375998708, + "grad_norm": 0.6811275482177734, + "learning_rate": 1.4281463126219264e-05, + "loss": 2.4298, + "step": 16576 + }, + { + "epoch": 1.3378258413364539, + "grad_norm": 0.6900678277015686, + "learning_rate": 1.427333381572361e-05, + "loss": 2.4745, + "step": 16577 + }, + { + "epoch": 1.3379065450730367, + "grad_norm": 0.7815307974815369, + "learning_rate": 1.4265206641760587e-05, + "loss": 2.3624, + "step": 16578 + }, + { + "epoch": 1.3379872488096198, + "grad_norm": 0.6948800683021545, + "learning_rate": 
1.4257081604532708e-05, + "loss": 2.4142, + "step": 16579 + }, + { + "epoch": 1.338067952546203, + "grad_norm": 0.7387657165527344, + "learning_rate": 1.4248958704242488e-05, + "loss": 2.4241, + "step": 16580 + }, + { + "epoch": 1.3381486562827858, + "grad_norm": 0.7158597111701965, + "learning_rate": 1.4240837941092367e-05, + "loss": 2.4473, + "step": 16581 + }, + { + "epoch": 1.3382293600193689, + "grad_norm": 0.758674144744873, + "learning_rate": 1.423271931528477e-05, + "loss": 2.4504, + "step": 16582 + }, + { + "epoch": 1.338310063755952, + "grad_norm": 0.6904417872428894, + "learning_rate": 1.4224602827021982e-05, + "loss": 2.4288, + "step": 16583 + }, + { + "epoch": 1.3383907674925348, + "grad_norm": 0.6988760828971863, + "learning_rate": 1.4216488476506307e-05, + "loss": 2.3874, + "step": 16584 + }, + { + "epoch": 1.338471471229118, + "grad_norm": 0.6969872117042542, + "learning_rate": 1.4208376263940003e-05, + "loss": 2.3388, + "step": 16585 + }, + { + "epoch": 1.338552174965701, + "grad_norm": 0.687179684638977, + "learning_rate": 1.420026618952518e-05, + "loss": 2.431, + "step": 16586 + }, + { + "epoch": 1.3386328787022839, + "grad_norm": 0.6319810152053833, + "learning_rate": 1.4192158253464038e-05, + "loss": 2.4415, + "step": 16587 + }, + { + "epoch": 1.338713582438867, + "grad_norm": 0.7554977536201477, + "learning_rate": 1.4184052455958629e-05, + "loss": 2.3863, + "step": 16588 + }, + { + "epoch": 1.33879428617545, + "grad_norm": 0.7025974988937378, + "learning_rate": 1.4175948797210936e-05, + "loss": 2.3957, + "step": 16589 + }, + { + "epoch": 1.338874989912033, + "grad_norm": 0.7270370721817017, + "learning_rate": 1.4167847277422952e-05, + "loss": 2.4309, + "step": 16590 + }, + { + "epoch": 1.338955693648616, + "grad_norm": 0.7017608284950256, + "learning_rate": 1.4159747896796593e-05, + "loss": 2.4142, + "step": 16591 + }, + { + "epoch": 1.3390363973851989, + "grad_norm": 0.7114055156707764, + "learning_rate": 1.4151650655533687e-05, + "loss": 
2.473, + "step": 16592 + }, + { + "epoch": 1.339117101121782, + "grad_norm": 0.6420357823371887, + "learning_rate": 1.4143555553836063e-05, + "loss": 2.3671, + "step": 16593 + }, + { + "epoch": 1.3391978048583648, + "grad_norm": 0.7067350745201111, + "learning_rate": 1.413546259190548e-05, + "loss": 2.4422, + "step": 16594 + }, + { + "epoch": 1.339278508594948, + "grad_norm": 0.7376763224601746, + "learning_rate": 1.4127371769943598e-05, + "loss": 2.4443, + "step": 16595 + }, + { + "epoch": 1.339359212331531, + "grad_norm": 0.646515965461731, + "learning_rate": 1.4119283088152092e-05, + "loss": 2.3949, + "step": 16596 + }, + { + "epoch": 1.3394399160681139, + "grad_norm": 0.6896061301231384, + "learning_rate": 1.411119654673254e-05, + "loss": 2.4535, + "step": 16597 + }, + { + "epoch": 1.339520619804697, + "grad_norm": 0.6992611289024353, + "learning_rate": 1.4103112145886489e-05, + "loss": 2.3983, + "step": 16598 + }, + { + "epoch": 1.33960132354128, + "grad_norm": 0.7176348567008972, + "learning_rate": 1.4095029885815426e-05, + "loss": 2.4671, + "step": 16599 + }, + { + "epoch": 1.339682027277863, + "grad_norm": 0.6635856628417969, + "learning_rate": 1.4086949766720759e-05, + "loss": 2.4235, + "step": 16600 + }, + { + "epoch": 1.339762731014446, + "grad_norm": 0.673332154750824, + "learning_rate": 1.4078871788803915e-05, + "loss": 2.4328, + "step": 16601 + }, + { + "epoch": 1.339843434751029, + "grad_norm": 0.6738821864128113, + "learning_rate": 1.407079595226617e-05, + "loss": 2.4786, + "step": 16602 + }, + { + "epoch": 1.339924138487612, + "grad_norm": 0.690605103969574, + "learning_rate": 1.4062722257308803e-05, + "loss": 2.4025, + "step": 16603 + }, + { + "epoch": 1.340004842224195, + "grad_norm": 0.7186758518218994, + "learning_rate": 1.4054650704133066e-05, + "loss": 2.4793, + "step": 16604 + }, + { + "epoch": 1.3400855459607781, + "grad_norm": 0.6484951376914978, + "learning_rate": 1.4046581292940075e-05, + "loss": 2.3855, + "step": 16605 + }, + { + 
"epoch": 1.340166249697361, + "grad_norm": 0.6993771195411682, + "learning_rate": 1.403851402393096e-05, + "loss": 2.3872, + "step": 16606 + }, + { + "epoch": 1.340246953433944, + "grad_norm": 0.7446531653404236, + "learning_rate": 1.403044889730678e-05, + "loss": 2.4253, + "step": 16607 + }, + { + "epoch": 1.340327657170527, + "grad_norm": 0.6873160004615784, + "learning_rate": 1.4022385913268542e-05, + "loss": 2.464, + "step": 16608 + }, + { + "epoch": 1.34040836090711, + "grad_norm": 0.6570948362350464, + "learning_rate": 1.4014325072017198e-05, + "loss": 2.4063, + "step": 16609 + }, + { + "epoch": 1.3404890646436929, + "grad_norm": 0.7209224104881287, + "learning_rate": 1.4006266373753651e-05, + "loss": 2.4827, + "step": 16610 + }, + { + "epoch": 1.340569768380276, + "grad_norm": 0.7283413410186768, + "learning_rate": 1.3998209818678732e-05, + "loss": 2.4009, + "step": 16611 + }, + { + "epoch": 1.340650472116859, + "grad_norm": 0.6650960445404053, + "learning_rate": 1.3990155406993221e-05, + "loss": 2.3576, + "step": 16612 + }, + { + "epoch": 1.340731175853442, + "grad_norm": 0.6857860088348389, + "learning_rate": 1.3982103138897873e-05, + "loss": 2.4686, + "step": 16613 + }, + { + "epoch": 1.340811879590025, + "grad_norm": 0.7065873146057129, + "learning_rate": 1.3974053014593402e-05, + "loss": 2.3999, + "step": 16614 + }, + { + "epoch": 1.340892583326608, + "grad_norm": 0.8093010783195496, + "learning_rate": 1.3966005034280372e-05, + "loss": 2.4273, + "step": 16615 + }, + { + "epoch": 1.340973287063191, + "grad_norm": 0.649132251739502, + "learning_rate": 1.3957959198159387e-05, + "loss": 2.3418, + "step": 16616 + }, + { + "epoch": 1.341053990799774, + "grad_norm": 0.7114978432655334, + "learning_rate": 1.3949915506430976e-05, + "loss": 2.4393, + "step": 16617 + }, + { + "epoch": 1.3411346945363571, + "grad_norm": 0.7989282608032227, + "learning_rate": 1.3941873959295615e-05, + "loss": 2.4044, + "step": 16618 + }, + { + "epoch": 1.34121539827294, + 
"grad_norm": 0.7373676896095276, + "learning_rate": 1.3933834556953707e-05, + "loss": 2.4758, + "step": 16619 + }, + { + "epoch": 1.341296102009523, + "grad_norm": 0.7076435089111328, + "learning_rate": 1.3925797299605647e-05, + "loss": 2.4429, + "step": 16620 + }, + { + "epoch": 1.3413768057461062, + "grad_norm": 0.6739028692245483, + "learning_rate": 1.39177621874517e-05, + "loss": 2.4275, + "step": 16621 + }, + { + "epoch": 1.341457509482689, + "grad_norm": 0.7134198546409607, + "learning_rate": 1.3909729220692125e-05, + "loss": 2.4541, + "step": 16622 + }, + { + "epoch": 1.3415382132192721, + "grad_norm": 0.6770301461219788, + "learning_rate": 1.3901698399527175e-05, + "loss": 2.4143, + "step": 16623 + }, + { + "epoch": 1.341618916955855, + "grad_norm": 0.7146373987197876, + "learning_rate": 1.3893669724156943e-05, + "loss": 2.4886, + "step": 16624 + }, + { + "epoch": 1.341699620692438, + "grad_norm": 0.6801536083221436, + "learning_rate": 1.3885643194781539e-05, + "loss": 2.4154, + "step": 16625 + }, + { + "epoch": 1.341780324429021, + "grad_norm": 0.7350363731384277, + "learning_rate": 1.3877618811601024e-05, + "loss": 2.3918, + "step": 16626 + }, + { + "epoch": 1.341861028165604, + "grad_norm": 0.7088882327079773, + "learning_rate": 1.3869596574815358e-05, + "loss": 2.412, + "step": 16627 + }, + { + "epoch": 1.3419417319021871, + "grad_norm": 0.7199791669845581, + "learning_rate": 1.3861576484624506e-05, + "loss": 2.3912, + "step": 16628 + }, + { + "epoch": 1.34202243563877, + "grad_norm": 0.692971408367157, + "learning_rate": 1.3853558541228328e-05, + "loss": 2.3826, + "step": 16629 + }, + { + "epoch": 1.342103139375353, + "grad_norm": 0.7524722814559937, + "learning_rate": 1.3845542744826679e-05, + "loss": 2.4227, + "step": 16630 + }, + { + "epoch": 1.3421838431119362, + "grad_norm": 0.6624585390090942, + "learning_rate": 1.3837529095619307e-05, + "loss": 2.3649, + "step": 16631 + }, + { + "epoch": 1.342264546848519, + "grad_norm": 0.6884489059448242, + 
"learning_rate": 1.3829517593805929e-05, + "loss": 2.3687, + "step": 16632 + }, + { + "epoch": 1.3423452505851021, + "grad_norm": 0.6766197085380554, + "learning_rate": 1.3821508239586246e-05, + "loss": 2.4191, + "step": 16633 + }, + { + "epoch": 1.3424259543216852, + "grad_norm": 0.6744453310966492, + "learning_rate": 1.3813501033159837e-05, + "loss": 2.4254, + "step": 16634 + }, + { + "epoch": 1.342506658058268, + "grad_norm": 0.6906216144561768, + "learning_rate": 1.3805495974726267e-05, + "loss": 2.4763, + "step": 16635 + }, + { + "epoch": 1.3425873617948512, + "grad_norm": 0.7052608132362366, + "learning_rate": 1.3797493064485078e-05, + "loss": 2.4307, + "step": 16636 + }, + { + "epoch": 1.342668065531434, + "grad_norm": 0.6701127290725708, + "learning_rate": 1.3789492302635653e-05, + "loss": 2.4529, + "step": 16637 + }, + { + "epoch": 1.3427487692680171, + "grad_norm": 0.7440397143363953, + "learning_rate": 1.3781493689377455e-05, + "loss": 2.4471, + "step": 16638 + }, + { + "epoch": 1.3428294730046, + "grad_norm": 0.7340207695960999, + "learning_rate": 1.3773497224909848e-05, + "loss": 2.4434, + "step": 16639 + }, + { + "epoch": 1.342910176741183, + "grad_norm": 0.6836793422698975, + "learning_rate": 1.376550290943205e-05, + "loss": 2.4072, + "step": 16640 + }, + { + "epoch": 1.3429908804777662, + "grad_norm": 0.6820472478866577, + "learning_rate": 1.3757510743143342e-05, + "loss": 2.4078, + "step": 16641 + }, + { + "epoch": 1.343071584214349, + "grad_norm": 0.6608061194419861, + "learning_rate": 1.3749520726242938e-05, + "loss": 2.3995, + "step": 16642 + }, + { + "epoch": 1.3431522879509321, + "grad_norm": 0.6582421064376831, + "learning_rate": 1.3741532858929906e-05, + "loss": 2.3768, + "step": 16643 + }, + { + "epoch": 1.3432329916875152, + "grad_norm": 0.7032744288444519, + "learning_rate": 1.3733547141403358e-05, + "loss": 2.4367, + "step": 16644 + }, + { + "epoch": 1.343313695424098, + "grad_norm": 0.7149307727813721, + "learning_rate": 
1.3725563573862321e-05, + "loss": 2.4425, + "step": 16645 + }, + { + "epoch": 1.3433943991606812, + "grad_norm": 0.7375392913818359, + "learning_rate": 1.3717582156505793e-05, + "loss": 2.409, + "step": 16646 + }, + { + "epoch": 1.3434751028972642, + "grad_norm": 0.8422170877456665, + "learning_rate": 1.3709602889532624e-05, + "loss": 2.4758, + "step": 16647 + }, + { + "epoch": 1.343555806633847, + "grad_norm": 0.6542177796363831, + "learning_rate": 1.3701625773141712e-05, + "loss": 2.4199, + "step": 16648 + }, + { + "epoch": 1.3436365103704302, + "grad_norm": 0.6639342904090881, + "learning_rate": 1.3693650807531898e-05, + "loss": 2.4366, + "step": 16649 + }, + { + "epoch": 1.3437172141070133, + "grad_norm": 0.7270925045013428, + "learning_rate": 1.3685677992901901e-05, + "loss": 2.3745, + "step": 16650 + }, + { + "epoch": 1.3437979178435961, + "grad_norm": 0.7325547337532043, + "learning_rate": 1.367770732945044e-05, + "loss": 2.5053, + "step": 16651 + }, + { + "epoch": 1.3438786215801792, + "grad_norm": 0.7752320766448975, + "learning_rate": 1.3669738817376177e-05, + "loss": 2.4505, + "step": 16652 + }, + { + "epoch": 1.343959325316762, + "grad_norm": 0.6538182497024536, + "learning_rate": 1.3661772456877675e-05, + "loss": 2.4164, + "step": 16653 + }, + { + "epoch": 1.3440400290533452, + "grad_norm": 0.6886051297187805, + "learning_rate": 1.3653808248153487e-05, + "loss": 2.4156, + "step": 16654 + }, + { + "epoch": 1.344120732789928, + "grad_norm": 0.6990679502487183, + "learning_rate": 1.3645846191402134e-05, + "loss": 2.418, + "step": 16655 + }, + { + "epoch": 1.3442014365265111, + "grad_norm": 0.7006608247756958, + "learning_rate": 1.3637886286821999e-05, + "loss": 2.3987, + "step": 16656 + }, + { + "epoch": 1.3442821402630942, + "grad_norm": 0.6858758926391602, + "learning_rate": 1.3629928534611502e-05, + "loss": 2.3571, + "step": 16657 + }, + { + "epoch": 1.344362843999677, + "grad_norm": 0.7273774147033691, + "learning_rate": 1.3621972934968951e-05, + 
"loss": 2.4141, + "step": 16658 + }, + { + "epoch": 1.3444435477362602, + "grad_norm": 0.6770352721214294, + "learning_rate": 1.3614019488092633e-05, + "loss": 2.4602, + "step": 16659 + }, + { + "epoch": 1.3445242514728433, + "grad_norm": 0.7473095655441284, + "learning_rate": 1.3606068194180766e-05, + "loss": 2.3884, + "step": 16660 + }, + { + "epoch": 1.3446049552094261, + "grad_norm": 0.7271387577056885, + "learning_rate": 1.3598119053431512e-05, + "loss": 2.4705, + "step": 16661 + }, + { + "epoch": 1.3446856589460092, + "grad_norm": 0.658349335193634, + "learning_rate": 1.3590172066043006e-05, + "loss": 2.4271, + "step": 16662 + }, + { + "epoch": 1.3447663626825923, + "grad_norm": 0.6479319930076599, + "learning_rate": 1.3582227232213273e-05, + "loss": 2.3428, + "step": 16663 + }, + { + "epoch": 1.3448470664191752, + "grad_norm": 0.700951874256134, + "learning_rate": 1.3574284552140337e-05, + "loss": 2.4926, + "step": 16664 + }, + { + "epoch": 1.3449277701557583, + "grad_norm": 0.6699960231781006, + "learning_rate": 1.3566344026022171e-05, + "loss": 2.4372, + "step": 16665 + }, + { + "epoch": 1.3450084738923413, + "grad_norm": 0.6743033528327942, + "learning_rate": 1.3558405654056617e-05, + "loss": 2.4142, + "step": 16666 + }, + { + "epoch": 1.3450891776289242, + "grad_norm": 0.6619464755058289, + "learning_rate": 1.355046943644157e-05, + "loss": 2.4099, + "step": 16667 + }, + { + "epoch": 1.3451698813655073, + "grad_norm": 0.668084442615509, + "learning_rate": 1.3542535373374798e-05, + "loss": 2.3895, + "step": 16668 + }, + { + "epoch": 1.3452505851020902, + "grad_norm": 0.7954626679420471, + "learning_rate": 1.3534603465054052e-05, + "loss": 2.479, + "step": 16669 + }, + { + "epoch": 1.3453312888386733, + "grad_norm": 0.6742919683456421, + "learning_rate": 1.3526673711677008e-05, + "loss": 2.4289, + "step": 16670 + }, + { + "epoch": 1.3454119925752561, + "grad_norm": 0.6564723253250122, + "learning_rate": 1.3518746113441316e-05, + "loss": 2.404, + "step": 
16671 + }, + { + "epoch": 1.3454926963118392, + "grad_norm": 0.6955705881118774, + "learning_rate": 1.3510820670544521e-05, + "loss": 2.4274, + "step": 16672 + }, + { + "epoch": 1.3455734000484223, + "grad_norm": 0.6687749028205872, + "learning_rate": 1.3502897383184154e-05, + "loss": 2.4564, + "step": 16673 + }, + { + "epoch": 1.3456541037850052, + "grad_norm": 0.7984250783920288, + "learning_rate": 1.34949762515577e-05, + "loss": 2.3426, + "step": 16674 + }, + { + "epoch": 1.3457348075215882, + "grad_norm": 0.7334223389625549, + "learning_rate": 1.348705727586258e-05, + "loss": 2.4712, + "step": 16675 + }, + { + "epoch": 1.3458155112581713, + "grad_norm": 0.6732765436172485, + "learning_rate": 1.3479140456296114e-05, + "loss": 2.424, + "step": 16676 + }, + { + "epoch": 1.3458962149947542, + "grad_norm": 0.7944334149360657, + "learning_rate": 1.3471225793055641e-05, + "loss": 2.3951, + "step": 16677 + }, + { + "epoch": 1.3459769187313373, + "grad_norm": 0.6829007863998413, + "learning_rate": 1.3463313286338408e-05, + "loss": 2.4158, + "step": 16678 + }, + { + "epoch": 1.3460576224679204, + "grad_norm": 0.7019640207290649, + "learning_rate": 1.345540293634161e-05, + "loss": 2.4093, + "step": 16679 + }, + { + "epoch": 1.3461383262045032, + "grad_norm": 0.6839374303817749, + "learning_rate": 1.3447494743262412e-05, + "loss": 2.3959, + "step": 16680 + }, + { + "epoch": 1.3462190299410863, + "grad_norm": 0.7211155295372009, + "learning_rate": 1.3439588707297911e-05, + "loss": 2.4052, + "step": 16681 + }, + { + "epoch": 1.3462997336776692, + "grad_norm": 0.73811274766922, + "learning_rate": 1.3431684828645109e-05, + "loss": 2.4179, + "step": 16682 + }, + { + "epoch": 1.3463804374142523, + "grad_norm": 0.6634721159934998, + "learning_rate": 1.3423783107501009e-05, + "loss": 2.379, + "step": 16683 + }, + { + "epoch": 1.3464611411508352, + "grad_norm": 0.6884057521820068, + "learning_rate": 1.3415883544062579e-05, + "loss": 2.4144, + "step": 16684 + }, + { + "epoch": 
1.3465418448874182, + "grad_norm": 0.7239587306976318, + "learning_rate": 1.340798613852664e-05, + "loss": 2.3856, + "step": 16685 + }, + { + "epoch": 1.3466225486240013, + "grad_norm": 0.7201077342033386, + "learning_rate": 1.3400090891090033e-05, + "loss": 2.4552, + "step": 16686 + }, + { + "epoch": 1.3467032523605842, + "grad_norm": 0.7049584984779358, + "learning_rate": 1.3392197801949558e-05, + "loss": 2.4424, + "step": 16687 + }, + { + "epoch": 1.3467839560971673, + "grad_norm": 0.7240790128707886, + "learning_rate": 1.3384306871301877e-05, + "loss": 2.4156, + "step": 16688 + }, + { + "epoch": 1.3468646598337504, + "grad_norm": 0.7276458740234375, + "learning_rate": 1.337641809934369e-05, + "loss": 2.3882, + "step": 16689 + }, + { + "epoch": 1.3469453635703332, + "grad_norm": 0.6650896072387695, + "learning_rate": 1.3368531486271607e-05, + "loss": 2.396, + "step": 16690 + }, + { + "epoch": 1.3470260673069163, + "grad_norm": 0.6946447491645813, + "learning_rate": 1.3360647032282203e-05, + "loss": 2.3779, + "step": 16691 + }, + { + "epoch": 1.3471067710434994, + "grad_norm": 0.7507699728012085, + "learning_rate": 1.3352764737571932e-05, + "loss": 2.4378, + "step": 16692 + }, + { + "epoch": 1.3471874747800823, + "grad_norm": 0.6548876762390137, + "learning_rate": 1.334488460233725e-05, + "loss": 2.4181, + "step": 16693 + }, + { + "epoch": 1.3472681785166654, + "grad_norm": 0.7000874280929565, + "learning_rate": 1.3337006626774595e-05, + "loss": 2.4463, + "step": 16694 + }, + { + "epoch": 1.3473488822532484, + "grad_norm": 0.6487517356872559, + "learning_rate": 1.3329130811080249e-05, + "loss": 2.3703, + "step": 16695 + }, + { + "epoch": 1.3474295859898313, + "grad_norm": 0.6447827219963074, + "learning_rate": 1.3321257155450517e-05, + "loss": 2.3779, + "step": 16696 + }, + { + "epoch": 1.3475102897264144, + "grad_norm": 0.6309572458267212, + "learning_rate": 1.3313385660081667e-05, + "loss": 2.4443, + "step": 16697 + }, + { + "epoch": 1.3475909934629973, + 
"grad_norm": 0.6366227865219116, + "learning_rate": 1.330551632516982e-05, + "loss": 2.3418, + "step": 16698 + }, + { + "epoch": 1.3476716971995804, + "grad_norm": 0.6864019632339478, + "learning_rate": 1.3297649150911117e-05, + "loss": 2.4416, + "step": 16699 + }, + { + "epoch": 1.3477524009361632, + "grad_norm": 0.6807940006256104, + "learning_rate": 1.3289784137501671e-05, + "loss": 2.4465, + "step": 16700 + }, + { + "epoch": 1.3478331046727463, + "grad_norm": 0.6991185545921326, + "learning_rate": 1.3281921285137455e-05, + "loss": 2.3929, + "step": 16701 + }, + { + "epoch": 1.3479138084093294, + "grad_norm": 0.691908061504364, + "learning_rate": 1.3274060594014437e-05, + "loss": 2.4237, + "step": 16702 + }, + { + "epoch": 1.3479945121459123, + "grad_norm": 0.6909685730934143, + "learning_rate": 1.3266202064328548e-05, + "loss": 2.3695, + "step": 16703 + }, + { + "epoch": 1.3480752158824953, + "grad_norm": 0.6473715901374817, + "learning_rate": 1.325834569627562e-05, + "loss": 2.384, + "step": 16704 + }, + { + "epoch": 1.3481559196190784, + "grad_norm": 0.7433453798294067, + "learning_rate": 1.3250491490051454e-05, + "loss": 2.4546, + "step": 16705 + }, + { + "epoch": 1.3482366233556613, + "grad_norm": 0.7432501316070557, + "learning_rate": 1.3242639445851812e-05, + "loss": 2.4204, + "step": 16706 + }, + { + "epoch": 1.3483173270922444, + "grad_norm": 0.6661228537559509, + "learning_rate": 1.3234789563872397e-05, + "loss": 2.4454, + "step": 16707 + }, + { + "epoch": 1.3483980308288275, + "grad_norm": 0.7481260895729065, + "learning_rate": 1.3226941844308816e-05, + "loss": 2.4348, + "step": 16708 + }, + { + "epoch": 1.3484787345654103, + "grad_norm": 0.6986531019210815, + "learning_rate": 1.3219096287356669e-05, + "loss": 2.3622, + "step": 16709 + }, + { + "epoch": 1.3485594383019934, + "grad_norm": 0.7457645535469055, + "learning_rate": 1.321125289321149e-05, + "loss": 2.4399, + "step": 16710 + }, + { + "epoch": 1.3486401420385765, + "grad_norm": 
0.6710307598114014, + "learning_rate": 1.3203411662068754e-05, + "loss": 2.3857, + "step": 16711 + }, + { + "epoch": 1.3487208457751594, + "grad_norm": 0.767304539680481, + "learning_rate": 1.3195572594123884e-05, + "loss": 2.4666, + "step": 16712 + }, + { + "epoch": 1.3488015495117425, + "grad_norm": 0.6720963716506958, + "learning_rate": 1.3187735689572289e-05, + "loss": 2.3952, + "step": 16713 + }, + { + "epoch": 1.3488822532483253, + "grad_norm": 0.6381734609603882, + "learning_rate": 1.3179900948609213e-05, + "loss": 2.3632, + "step": 16714 + }, + { + "epoch": 1.3489629569849084, + "grad_norm": 0.6697315573692322, + "learning_rate": 1.317206837142997e-05, + "loss": 2.4117, + "step": 16715 + }, + { + "epoch": 1.3490436607214913, + "grad_norm": 0.723676323890686, + "learning_rate": 1.3164237958229764e-05, + "loss": 2.3772, + "step": 16716 + }, + { + "epoch": 1.3491243644580744, + "grad_norm": 0.7021055817604065, + "learning_rate": 1.3156409709203732e-05, + "loss": 2.3808, + "step": 16717 + }, + { + "epoch": 1.3492050681946575, + "grad_norm": 0.7128920555114746, + "learning_rate": 1.3148583624546962e-05, + "loss": 2.3854, + "step": 16718 + }, + { + "epoch": 1.3492857719312403, + "grad_norm": 0.6684797406196594, + "learning_rate": 1.314075970445453e-05, + "loss": 2.3722, + "step": 16719 + }, + { + "epoch": 1.3493664756678234, + "grad_norm": 0.6710386276245117, + "learning_rate": 1.3132937949121426e-05, + "loss": 2.412, + "step": 16720 + }, + { + "epoch": 1.3494471794044065, + "grad_norm": 0.7207252979278564, + "learning_rate": 1.3125118358742572e-05, + "loss": 2.4506, + "step": 16721 + }, + { + "epoch": 1.3495278831409894, + "grad_norm": 0.685516893863678, + "learning_rate": 1.3117300933512865e-05, + "loss": 2.435, + "step": 16722 + }, + { + "epoch": 1.3496085868775725, + "grad_norm": 0.71708744764328, + "learning_rate": 1.3109485673627154e-05, + "loss": 2.4735, + "step": 16723 + }, + { + "epoch": 1.3496892906141555, + "grad_norm": 0.7293861508369446, + 
"learning_rate": 1.3101672579280166e-05, + "loss": 2.4545, + "step": 16724 + }, + { + "epoch": 1.3497699943507384, + "grad_norm": 0.6448976993560791, + "learning_rate": 1.3093861650666661e-05, + "loss": 2.386, + "step": 16725 + }, + { + "epoch": 1.3498506980873215, + "grad_norm": 0.8111226558685303, + "learning_rate": 1.3086052887981315e-05, + "loss": 2.4733, + "step": 16726 + }, + { + "epoch": 1.3499314018239044, + "grad_norm": 0.7673875093460083, + "learning_rate": 1.3078246291418706e-05, + "loss": 2.4119, + "step": 16727 + }, + { + "epoch": 1.3500121055604875, + "grad_norm": 0.7296731472015381, + "learning_rate": 1.307044186117341e-05, + "loss": 2.3724, + "step": 16728 + }, + { + "epoch": 1.3500928092970703, + "grad_norm": 0.6947155594825745, + "learning_rate": 1.306263959743994e-05, + "loss": 2.3989, + "step": 16729 + }, + { + "epoch": 1.3501735130336534, + "grad_norm": 0.6781659722328186, + "learning_rate": 1.3054839500412753e-05, + "loss": 2.429, + "step": 16730 + }, + { + "epoch": 1.3502542167702365, + "grad_norm": 0.7498819231987, + "learning_rate": 1.3047041570286244e-05, + "loss": 2.459, + "step": 16731 + }, + { + "epoch": 1.3503349205068194, + "grad_norm": 0.6651057004928589, + "learning_rate": 1.3039245807254774e-05, + "loss": 2.4049, + "step": 16732 + }, + { + "epoch": 1.3504156242434024, + "grad_norm": 0.6998507380485535, + "learning_rate": 1.3031452211512596e-05, + "loss": 2.4083, + "step": 16733 + }, + { + "epoch": 1.3504963279799855, + "grad_norm": 0.6522402167320251, + "learning_rate": 1.3023660783253966e-05, + "loss": 2.3987, + "step": 16734 + }, + { + "epoch": 1.3505770317165684, + "grad_norm": 0.6618130207061768, + "learning_rate": 1.3015871522673096e-05, + "loss": 2.4514, + "step": 16735 + }, + { + "epoch": 1.3506577354531515, + "grad_norm": 0.7139489650726318, + "learning_rate": 1.300808442996405e-05, + "loss": 2.484, + "step": 16736 + }, + { + "epoch": 1.3507384391897346, + "grad_norm": 0.6582522988319397, + "learning_rate": 
1.3000299505320956e-05, + "loss": 2.4463, + "step": 16737 + }, + { + "epoch": 1.3508191429263174, + "grad_norm": 0.7115446329116821, + "learning_rate": 1.2992516748937811e-05, + "loss": 2.4795, + "step": 16738 + }, + { + "epoch": 1.3508998466629005, + "grad_norm": 0.7243752479553223, + "learning_rate": 1.2984736161008581e-05, + "loss": 2.4151, + "step": 16739 + }, + { + "epoch": 1.3509805503994836, + "grad_norm": 0.758084774017334, + "learning_rate": 1.297695774172719e-05, + "loss": 2.4028, + "step": 16740 + }, + { + "epoch": 1.3510612541360665, + "grad_norm": 0.6555618643760681, + "learning_rate": 1.2969181491287496e-05, + "loss": 2.4184, + "step": 16741 + }, + { + "epoch": 1.3511419578726496, + "grad_norm": 0.6657842993736267, + "learning_rate": 1.2961407409883331e-05, + "loss": 2.375, + "step": 16742 + }, + { + "epoch": 1.3512226616092324, + "grad_norm": 0.6355723142623901, + "learning_rate": 1.2953635497708382e-05, + "loss": 2.4202, + "step": 16743 + }, + { + "epoch": 1.3513033653458155, + "grad_norm": 0.7384408116340637, + "learning_rate": 1.2945865754956377e-05, + "loss": 2.4298, + "step": 16744 + }, + { + "epoch": 1.3513840690823984, + "grad_norm": 0.7300455570220947, + "learning_rate": 1.2938098181820979e-05, + "loss": 2.3842, + "step": 16745 + }, + { + "epoch": 1.3514647728189815, + "grad_norm": 0.7378895282745361, + "learning_rate": 1.2930332778495735e-05, + "loss": 2.4025, + "step": 16746 + }, + { + "epoch": 1.3515454765555646, + "grad_norm": 0.6542565822601318, + "learning_rate": 1.2922569545174212e-05, + "loss": 2.3995, + "step": 16747 + }, + { + "epoch": 1.3516261802921474, + "grad_norm": 0.669829249382019, + "learning_rate": 1.291480848204989e-05, + "loss": 2.3843, + "step": 16748 + }, + { + "epoch": 1.3517068840287305, + "grad_norm": 0.6747604608535767, + "learning_rate": 1.2907049589316167e-05, + "loss": 2.4108, + "step": 16749 + }, + { + "epoch": 1.3517875877653136, + "grad_norm": 0.7003559470176697, + "learning_rate": 1.2899292867166402e-05, + 
"loss": 2.4233, + "step": 16750 + }, + { + "epoch": 1.3518682915018965, + "grad_norm": 0.7365099191665649, + "learning_rate": 1.2891538315793994e-05, + "loss": 2.3592, + "step": 16751 + }, + { + "epoch": 1.3519489952384796, + "grad_norm": 0.6849377751350403, + "learning_rate": 1.2883785935392123e-05, + "loss": 2.3943, + "step": 16752 + }, + { + "epoch": 1.3520296989750626, + "grad_norm": 0.7263002395629883, + "learning_rate": 1.2876035726154045e-05, + "loss": 2.4078, + "step": 16753 + }, + { + "epoch": 1.3521104027116455, + "grad_norm": 0.7341182827949524, + "learning_rate": 1.2868287688272884e-05, + "loss": 2.3568, + "step": 16754 + }, + { + "epoch": 1.3521911064482286, + "grad_norm": 0.7281078100204468, + "learning_rate": 1.2860541821941796e-05, + "loss": 2.4073, + "step": 16755 + }, + { + "epoch": 1.3522718101848117, + "grad_norm": 0.6302868127822876, + "learning_rate": 1.285279812735376e-05, + "loss": 2.3946, + "step": 16756 + }, + { + "epoch": 1.3523525139213946, + "grad_norm": 0.7333062887191772, + "learning_rate": 1.28450566047018e-05, + "loss": 2.3892, + "step": 16757 + }, + { + "epoch": 1.3524332176579776, + "grad_norm": 0.74838787317276, + "learning_rate": 1.2837317254178882e-05, + "loss": 2.4844, + "step": 16758 + }, + { + "epoch": 1.3525139213945605, + "grad_norm": 0.7085757255554199, + "learning_rate": 1.2829580075977843e-05, + "loss": 2.3583, + "step": 16759 + }, + { + "epoch": 1.3525946251311436, + "grad_norm": 0.7182579040527344, + "learning_rate": 1.2821845070291527e-05, + "loss": 2.4326, + "step": 16760 + }, + { + "epoch": 1.3526753288677265, + "grad_norm": 0.6857885718345642, + "learning_rate": 1.2814112237312714e-05, + "loss": 2.4406, + "step": 16761 + }, + { + "epoch": 1.3527560326043095, + "grad_norm": 0.7629652619361877, + "learning_rate": 1.2806381577234139e-05, + "loss": 2.4839, + "step": 16762 + }, + { + "epoch": 1.3528367363408926, + "grad_norm": 0.6940319538116455, + "learning_rate": 1.2798653090248458e-05, + "loss": 2.3918, + "step": 
16763 + }, + { + "epoch": 1.3529174400774755, + "grad_norm": 0.6825633645057678, + "learning_rate": 1.2790926776548318e-05, + "loss": 2.3828, + "step": 16764 + }, + { + "epoch": 1.3529981438140586, + "grad_norm": 0.6830280423164368, + "learning_rate": 1.278320263632622e-05, + "loss": 2.3727, + "step": 16765 + }, + { + "epoch": 1.3530788475506417, + "grad_norm": 0.6782984733581543, + "learning_rate": 1.2775480669774698e-05, + "loss": 2.3984, + "step": 16766 + }, + { + "epoch": 1.3531595512872245, + "grad_norm": 0.6939808130264282, + "learning_rate": 1.276776087708621e-05, + "loss": 2.3724, + "step": 16767 + }, + { + "epoch": 1.3532402550238076, + "grad_norm": 0.7562546133995056, + "learning_rate": 1.276004325845317e-05, + "loss": 2.4178, + "step": 16768 + }, + { + "epoch": 1.3533209587603907, + "grad_norm": 0.6692922115325928, + "learning_rate": 1.2752327814067877e-05, + "loss": 2.4072, + "step": 16769 + }, + { + "epoch": 1.3534016624969736, + "grad_norm": 0.6783415079116821, + "learning_rate": 1.2744614544122635e-05, + "loss": 2.3993, + "step": 16770 + }, + { + "epoch": 1.3534823662335567, + "grad_norm": 0.6608997583389282, + "learning_rate": 1.27369034488097e-05, + "loss": 2.3883, + "step": 16771 + }, + { + "epoch": 1.3535630699701398, + "grad_norm": 0.6849228739738464, + "learning_rate": 1.2729194528321231e-05, + "loss": 2.4009, + "step": 16772 + }, + { + "epoch": 1.3536437737067226, + "grad_norm": 0.7059305906295776, + "learning_rate": 1.2721487782849362e-05, + "loss": 2.508, + "step": 16773 + }, + { + "epoch": 1.3537244774433057, + "grad_norm": 0.6471492052078247, + "learning_rate": 1.2713783212586183e-05, + "loss": 2.3813, + "step": 16774 + }, + { + "epoch": 1.3538051811798886, + "grad_norm": 0.7108949422836304, + "learning_rate": 1.2706080817723687e-05, + "loss": 2.4189, + "step": 16775 + }, + { + "epoch": 1.3538858849164717, + "grad_norm": 0.6623945236206055, + "learning_rate": 1.269838059845383e-05, + "loss": 2.4128, + "step": 16776 + }, + { + "epoch": 
1.3539665886530545, + "grad_norm": 0.6595518589019775, + "learning_rate": 1.269068255496857e-05, + "loss": 2.3984, + "step": 16777 + }, + { + "epoch": 1.3540472923896376, + "grad_norm": 0.6932248473167419, + "learning_rate": 1.2682986687459708e-05, + "loss": 2.3951, + "step": 16778 + }, + { + "epoch": 1.3541279961262207, + "grad_norm": 0.6914867162704468, + "learning_rate": 1.2675292996119059e-05, + "loss": 2.4602, + "step": 16779 + }, + { + "epoch": 1.3542086998628036, + "grad_norm": 0.6633034348487854, + "learning_rate": 1.266760148113838e-05, + "loss": 2.43, + "step": 16780 + }, + { + "epoch": 1.3542894035993867, + "grad_norm": 0.6987594366073608, + "learning_rate": 1.2659912142709363e-05, + "loss": 2.3962, + "step": 16781 + }, + { + "epoch": 1.3543701073359697, + "grad_norm": 0.7429597973823547, + "learning_rate": 1.2652224981023652e-05, + "loss": 2.4838, + "step": 16782 + }, + { + "epoch": 1.3544508110725526, + "grad_norm": 0.6402504444122314, + "learning_rate": 1.2644539996272808e-05, + "loss": 2.43, + "step": 16783 + }, + { + "epoch": 1.3545315148091357, + "grad_norm": 0.6763156652450562, + "learning_rate": 1.263685718864841e-05, + "loss": 2.4911, + "step": 16784 + }, + { + "epoch": 1.3546122185457188, + "grad_norm": 0.8133900165557861, + "learning_rate": 1.2629176558341881e-05, + "loss": 2.45, + "step": 16785 + }, + { + "epoch": 1.3546929222823016, + "grad_norm": 0.6946277022361755, + "learning_rate": 1.262149810554465e-05, + "loss": 2.43, + "step": 16786 + }, + { + "epoch": 1.3547736260188847, + "grad_norm": 0.7667170166969299, + "learning_rate": 1.2613821830448125e-05, + "loss": 2.4464, + "step": 16787 + }, + { + "epoch": 1.3548543297554676, + "grad_norm": 0.672662615776062, + "learning_rate": 1.2606147733243567e-05, + "loss": 2.3653, + "step": 16788 + }, + { + "epoch": 1.3549350334920507, + "grad_norm": 0.6856412291526794, + "learning_rate": 1.2598475814122258e-05, + "loss": 2.3924, + "step": 16789 + }, + { + "epoch": 1.3550157372286336, + "grad_norm": 
0.6966650485992432, + "learning_rate": 1.2590806073275407e-05, + "loss": 2.4039, + "step": 16790 + }, + { + "epoch": 1.3550964409652166, + "grad_norm": 0.7397874593734741, + "learning_rate": 1.2583138510894143e-05, + "loss": 2.4769, + "step": 16791 + }, + { + "epoch": 1.3551771447017997, + "grad_norm": 0.6960996985435486, + "learning_rate": 1.2575473127169591e-05, + "loss": 2.4342, + "step": 16792 + }, + { + "epoch": 1.3552578484383826, + "grad_norm": 0.7324376702308655, + "learning_rate": 1.2567809922292795e-05, + "loss": 2.4779, + "step": 16793 + }, + { + "epoch": 1.3553385521749657, + "grad_norm": 0.6891930103302002, + "learning_rate": 1.2560148896454704e-05, + "loss": 2.4228, + "step": 16794 + }, + { + "epoch": 1.3554192559115488, + "grad_norm": 0.6919474601745605, + "learning_rate": 1.2552490049846278e-05, + "loss": 2.4178, + "step": 16795 + }, + { + "epoch": 1.3554999596481316, + "grad_norm": 0.7067604660987854, + "learning_rate": 1.2544833382658405e-05, + "loss": 2.457, + "step": 16796 + }, + { + "epoch": 1.3555806633847147, + "grad_norm": 0.7667992115020752, + "learning_rate": 1.253717889508188e-05, + "loss": 2.3951, + "step": 16797 + }, + { + "epoch": 1.3556613671212978, + "grad_norm": 0.6337998509407043, + "learning_rate": 1.2529526587307482e-05, + "loss": 2.3788, + "step": 16798 + }, + { + "epoch": 1.3557420708578807, + "grad_norm": 0.6591900587081909, + "learning_rate": 1.2521876459525927e-05, + "loss": 2.4101, + "step": 16799 + }, + { + "epoch": 1.3558227745944638, + "grad_norm": 0.7115298509597778, + "learning_rate": 1.2514228511927895e-05, + "loss": 2.4417, + "step": 16800 + }, + { + "epoch": 1.3559034783310469, + "grad_norm": 0.6851321458816528, + "learning_rate": 1.2506582744703965e-05, + "loss": 2.4081, + "step": 16801 + }, + { + "epoch": 1.3559841820676297, + "grad_norm": 0.7469603419303894, + "learning_rate": 1.249893915804471e-05, + "loss": 2.3703, + "step": 16802 + }, + { + "epoch": 1.3560648858042128, + "grad_norm": 0.6972614526748657, + 
"learning_rate": 1.2491297752140641e-05, + "loss": 2.3549, + "step": 16803 + }, + { + "epoch": 1.3561455895407957, + "grad_norm": 0.6669485569000244, + "learning_rate": 1.2483658527182151e-05, + "loss": 2.4261, + "step": 16804 + }, + { + "epoch": 1.3562262932773788, + "grad_norm": 0.7516919374465942, + "learning_rate": 1.247602148335968e-05, + "loss": 2.4323, + "step": 16805 + }, + { + "epoch": 1.3563069970139616, + "grad_norm": 0.7191836833953857, + "learning_rate": 1.2468386620863548e-05, + "loss": 2.4242, + "step": 16806 + }, + { + "epoch": 1.3563877007505447, + "grad_norm": 0.660237729549408, + "learning_rate": 1.2460753939884017e-05, + "loss": 2.4154, + "step": 16807 + }, + { + "epoch": 1.3564684044871278, + "grad_norm": 0.749531626701355, + "learning_rate": 1.2453123440611325e-05, + "loss": 2.4138, + "step": 16808 + }, + { + "epoch": 1.3565491082237107, + "grad_norm": 0.6808986067771912, + "learning_rate": 1.2445495123235673e-05, + "loss": 2.3918, + "step": 16809 + }, + { + "epoch": 1.3566298119602938, + "grad_norm": 0.686183750629425, + "learning_rate": 1.2437868987947133e-05, + "loss": 2.4172, + "step": 16810 + }, + { + "epoch": 1.3567105156968768, + "grad_norm": 0.6487868428230286, + "learning_rate": 1.2430245034935784e-05, + "loss": 2.4199, + "step": 16811 + }, + { + "epoch": 1.3567912194334597, + "grad_norm": 0.7352244257926941, + "learning_rate": 1.242262326439163e-05, + "loss": 2.3779, + "step": 16812 + }, + { + "epoch": 1.3568719231700428, + "grad_norm": 0.7250565886497498, + "learning_rate": 1.2415003676504644e-05, + "loss": 2.4106, + "step": 16813 + }, + { + "epoch": 1.3569526269066259, + "grad_norm": 0.6843926906585693, + "learning_rate": 1.2407386271464716e-05, + "loss": 2.3725, + "step": 16814 + }, + { + "epoch": 1.3570333306432087, + "grad_norm": 0.686326801776886, + "learning_rate": 1.2399771049461684e-05, + "loss": 2.3709, + "step": 16815 + }, + { + "epoch": 1.3571140343797918, + "grad_norm": 0.6796969771385193, + "learning_rate": 
1.2392158010685373e-05, + "loss": 2.4545, + "step": 16816 + }, + { + "epoch": 1.357194738116375, + "grad_norm": 0.6469466090202332, + "learning_rate": 1.2384547155325466e-05, + "loss": 2.4263, + "step": 16817 + }, + { + "epoch": 1.3572754418529578, + "grad_norm": 0.7089909911155701, + "learning_rate": 1.2376938483571688e-05, + "loss": 2.378, + "step": 16818 + }, + { + "epoch": 1.3573561455895409, + "grad_norm": 0.7313235402107239, + "learning_rate": 1.2369331995613665e-05, + "loss": 2.46, + "step": 16819 + }, + { + "epoch": 1.3574368493261237, + "grad_norm": 0.7555651664733887, + "learning_rate": 1.2361727691640934e-05, + "loss": 2.531, + "step": 16820 + }, + { + "epoch": 1.3575175530627068, + "grad_norm": 0.7563485503196716, + "learning_rate": 1.2354125571843033e-05, + "loss": 2.4205, + "step": 16821 + }, + { + "epoch": 1.3575982567992897, + "grad_norm": 0.7996519804000854, + "learning_rate": 1.2346525636409434e-05, + "loss": 2.4223, + "step": 16822 + }, + { + "epoch": 1.3576789605358728, + "grad_norm": 0.7141731977462769, + "learning_rate": 1.233892788552955e-05, + "loss": 2.4554, + "step": 16823 + }, + { + "epoch": 1.3577596642724559, + "grad_norm": 0.6715070605278015, + "learning_rate": 1.233133231939273e-05, + "loss": 2.4386, + "step": 16824 + }, + { + "epoch": 1.3578403680090387, + "grad_norm": 0.6893020272254944, + "learning_rate": 1.2323738938188301e-05, + "loss": 2.4065, + "step": 16825 + }, + { + "epoch": 1.3579210717456218, + "grad_norm": 0.7542821764945984, + "learning_rate": 1.2316147742105454e-05, + "loss": 2.3974, + "step": 16826 + }, + { + "epoch": 1.358001775482205, + "grad_norm": 0.7177664041519165, + "learning_rate": 1.230855873133343e-05, + "loss": 2.4306, + "step": 16827 + }, + { + "epoch": 1.3580824792187878, + "grad_norm": 0.7056576013565063, + "learning_rate": 1.2300971906061354e-05, + "loss": 2.4238, + "step": 16828 + }, + { + "epoch": 1.3581631829553709, + "grad_norm": 0.686903715133667, + "learning_rate": 1.2293387266478296e-05, + "loss": 
2.3902, + "step": 16829 + }, + { + "epoch": 1.358243886691954, + "grad_norm": 0.7377725839614868, + "learning_rate": 1.2285804812773293e-05, + "loss": 2.4294, + "step": 16830 + }, + { + "epoch": 1.3583245904285368, + "grad_norm": 0.6537891030311584, + "learning_rate": 1.227822454513532e-05, + "loss": 2.374, + "step": 16831 + }, + { + "epoch": 1.35840529416512, + "grad_norm": 0.684699296951294, + "learning_rate": 1.2270646463753288e-05, + "loss": 2.4105, + "step": 16832 + }, + { + "epoch": 1.3584859979017028, + "grad_norm": 0.7042316794395447, + "learning_rate": 1.2263070568816081e-05, + "loss": 2.4246, + "step": 16833 + }, + { + "epoch": 1.3585667016382859, + "grad_norm": 0.7610476613044739, + "learning_rate": 1.2255496860512505e-05, + "loss": 2.4581, + "step": 16834 + }, + { + "epoch": 1.3586474053748687, + "grad_norm": 0.6620839834213257, + "learning_rate": 1.224792533903134e-05, + "loss": 2.4138, + "step": 16835 + }, + { + "epoch": 1.3587281091114518, + "grad_norm": 0.6861035823822021, + "learning_rate": 1.2240356004561227e-05, + "loss": 2.4195, + "step": 16836 + }, + { + "epoch": 1.358808812848035, + "grad_norm": 0.7186882495880127, + "learning_rate": 1.2232788857290855e-05, + "loss": 2.404, + "step": 16837 + }, + { + "epoch": 1.3588895165846178, + "grad_norm": 0.7219386696815491, + "learning_rate": 1.2225223897408833e-05, + "loss": 2.3778, + "step": 16838 + }, + { + "epoch": 1.3589702203212009, + "grad_norm": 0.6935911774635315, + "learning_rate": 1.2217661125103663e-05, + "loss": 2.4617, + "step": 16839 + }, + { + "epoch": 1.359050924057784, + "grad_norm": 0.7885910272598267, + "learning_rate": 1.2210100540563828e-05, + "loss": 2.4467, + "step": 16840 + }, + { + "epoch": 1.3591316277943668, + "grad_norm": 0.6690255403518677, + "learning_rate": 1.220254214397778e-05, + "loss": 2.381, + "step": 16841 + }, + { + "epoch": 1.35921233153095, + "grad_norm": 0.7592741847038269, + "learning_rate": 1.2194985935533887e-05, + "loss": 2.4459, + "step": 16842 + }, + { + 
"epoch": 1.359293035267533, + "grad_norm": 0.827460527420044, + "learning_rate": 1.2187431915420466e-05, + "loss": 2.3842, + "step": 16843 + }, + { + "epoch": 1.3593737390041158, + "grad_norm": 0.7313764691352844, + "learning_rate": 1.2179880083825811e-05, + "loss": 2.3938, + "step": 16844 + }, + { + "epoch": 1.359454442740699, + "grad_norm": 0.7093486189842224, + "learning_rate": 1.2172330440938084e-05, + "loss": 2.4316, + "step": 16845 + }, + { + "epoch": 1.359535146477282, + "grad_norm": 0.6805742383003235, + "learning_rate": 1.2164782986945467e-05, + "loss": 2.4372, + "step": 16846 + }, + { + "epoch": 1.3596158502138649, + "grad_norm": 0.7525961399078369, + "learning_rate": 1.2157237722036064e-05, + "loss": 2.3867, + "step": 16847 + }, + { + "epoch": 1.359696553950448, + "grad_norm": 0.723896861076355, + "learning_rate": 1.2149694646397947e-05, + "loss": 2.4685, + "step": 16848 + }, + { + "epoch": 1.3597772576870308, + "grad_norm": 0.704448938369751, + "learning_rate": 1.2142153760219055e-05, + "loss": 2.4463, + "step": 16849 + }, + { + "epoch": 1.359857961423614, + "grad_norm": 0.7207927703857422, + "learning_rate": 1.2134615063687349e-05, + "loss": 2.3549, + "step": 16850 + }, + { + "epoch": 1.3599386651601968, + "grad_norm": 0.7106234431266785, + "learning_rate": 1.2127078556990724e-05, + "loss": 2.4145, + "step": 16851 + }, + { + "epoch": 1.3600193688967799, + "grad_norm": 0.7740694284439087, + "learning_rate": 1.2119544240316993e-05, + "loss": 2.3999, + "step": 16852 + }, + { + "epoch": 1.360100072633363, + "grad_norm": 0.6696181297302246, + "learning_rate": 1.2112012113853954e-05, + "loss": 2.4046, + "step": 16853 + }, + { + "epoch": 1.3601807763699458, + "grad_norm": 0.6758043169975281, + "learning_rate": 1.2104482177789334e-05, + "loss": 2.4021, + "step": 16854 + }, + { + "epoch": 1.360261480106529, + "grad_norm": 0.6659380793571472, + "learning_rate": 1.2096954432310758e-05, + "loss": 2.4145, + "step": 16855 + }, + { + "epoch": 1.360342183843112, + 
"grad_norm": 0.6889290809631348, + "learning_rate": 1.2089428877605858e-05, + "loss": 2.3486, + "step": 16856 + }, + { + "epoch": 1.3604228875796949, + "grad_norm": 0.6755563020706177, + "learning_rate": 1.2081905513862201e-05, + "loss": 2.4294, + "step": 16857 + }, + { + "epoch": 1.360503591316278, + "grad_norm": 0.7662243843078613, + "learning_rate": 1.2074384341267276e-05, + "loss": 2.414, + "step": 16858 + }, + { + "epoch": 1.360584295052861, + "grad_norm": 0.7432721853256226, + "learning_rate": 1.2066865360008517e-05, + "loss": 2.4314, + "step": 16859 + }, + { + "epoch": 1.360664998789444, + "grad_norm": 0.6465074419975281, + "learning_rate": 1.2059348570273366e-05, + "loss": 2.3349, + "step": 16860 + }, + { + "epoch": 1.360745702526027, + "grad_norm": 0.6940968632698059, + "learning_rate": 1.2051833972249105e-05, + "loss": 2.4539, + "step": 16861 + }, + { + "epoch": 1.36082640626261, + "grad_norm": 0.7211138010025024, + "learning_rate": 1.2044321566123019e-05, + "loss": 2.4041, + "step": 16862 + }, + { + "epoch": 1.360907109999193, + "grad_norm": 0.6746649146080017, + "learning_rate": 1.2036811352082367e-05, + "loss": 2.4329, + "step": 16863 + }, + { + "epoch": 1.360987813735776, + "grad_norm": 0.7502184510231018, + "learning_rate": 1.2029303330314345e-05, + "loss": 2.407, + "step": 16864 + }, + { + "epoch": 1.361068517472359, + "grad_norm": 0.7192596793174744, + "learning_rate": 1.2021797501006027e-05, + "loss": 2.3907, + "step": 16865 + }, + { + "epoch": 1.361149221208942, + "grad_norm": 0.6682254672050476, + "learning_rate": 1.2014293864344483e-05, + "loss": 2.391, + "step": 16866 + }, + { + "epoch": 1.3612299249455249, + "grad_norm": 0.680969774723053, + "learning_rate": 1.2006792420516755e-05, + "loss": 2.3479, + "step": 16867 + }, + { + "epoch": 1.361310628682108, + "grad_norm": 0.682671308517456, + "learning_rate": 1.1999293169709757e-05, + "loss": 2.4097, + "step": 16868 + }, + { + "epoch": 1.361391332418691, + "grad_norm": 0.7030573487281799, + 
"learning_rate": 1.199179611211041e-05, + "loss": 2.4514, + "step": 16869 + }, + { + "epoch": 1.361472036155274, + "grad_norm": 0.670630693435669, + "learning_rate": 1.1984301247905582e-05, + "loss": 2.3982, + "step": 16870 + }, + { + "epoch": 1.361552739891857, + "grad_norm": 0.6993644833564758, + "learning_rate": 1.1976808577282017e-05, + "loss": 2.4297, + "step": 16871 + }, + { + "epoch": 1.36163344362844, + "grad_norm": 0.7448122501373291, + "learning_rate": 1.1969318100426486e-05, + "loss": 2.3612, + "step": 16872 + }, + { + "epoch": 1.361714147365023, + "grad_norm": 0.7014498114585876, + "learning_rate": 1.1961829817525649e-05, + "loss": 2.3451, + "step": 16873 + }, + { + "epoch": 1.361794851101606, + "grad_norm": 0.7140750885009766, + "learning_rate": 1.195434372876616e-05, + "loss": 2.4231, + "step": 16874 + }, + { + "epoch": 1.3618755548381891, + "grad_norm": 0.7377427816390991, + "learning_rate": 1.1946859834334567e-05, + "loss": 2.4055, + "step": 16875 + }, + { + "epoch": 1.361956258574772, + "grad_norm": 0.7969191670417786, + "learning_rate": 1.1939378134417433e-05, + "loss": 2.3503, + "step": 16876 + }, + { + "epoch": 1.362036962311355, + "grad_norm": 0.6821554899215698, + "learning_rate": 1.1931898629201155e-05, + "loss": 2.4259, + "step": 16877 + }, + { + "epoch": 1.3621176660479382, + "grad_norm": 0.6598221659660339, + "learning_rate": 1.1924421318872182e-05, + "loss": 2.3833, + "step": 16878 + }, + { + "epoch": 1.362198369784521, + "grad_norm": 0.8031432628631592, + "learning_rate": 1.1916946203616863e-05, + "loss": 2.5077, + "step": 16879 + }, + { + "epoch": 1.362279073521104, + "grad_norm": 0.7247405648231506, + "learning_rate": 1.190947328362152e-05, + "loss": 2.426, + "step": 16880 + }, + { + "epoch": 1.362359777257687, + "grad_norm": 0.7256691455841064, + "learning_rate": 1.1902002559072344e-05, + "loss": 2.474, + "step": 16881 + }, + { + "epoch": 1.36244048099427, + "grad_norm": 0.7382180094718933, + "learning_rate": 1.1894534030155558e-05, + 
"loss": 2.4487, + "step": 16882 + }, + { + "epoch": 1.362521184730853, + "grad_norm": 0.700179398059845, + "learning_rate": 1.1887067697057297e-05, + "loss": 2.3836, + "step": 16883 + }, + { + "epoch": 1.362601888467436, + "grad_norm": 0.706106424331665, + "learning_rate": 1.1879603559963638e-05, + "loss": 2.4304, + "step": 16884 + }, + { + "epoch": 1.362682592204019, + "grad_norm": 0.7514815926551819, + "learning_rate": 1.1872141619060606e-05, + "loss": 2.4895, + "step": 16885 + }, + { + "epoch": 1.362763295940602, + "grad_norm": 0.6605612635612488, + "learning_rate": 1.1864681874534201e-05, + "loss": 2.3569, + "step": 16886 + }, + { + "epoch": 1.362843999677185, + "grad_norm": 0.6366496682167053, + "learning_rate": 1.1857224326570283e-05, + "loss": 2.3919, + "step": 16887 + }, + { + "epoch": 1.3629247034137681, + "grad_norm": 0.8100820183753967, + "learning_rate": 1.1849768975354736e-05, + "loss": 2.5063, + "step": 16888 + }, + { + "epoch": 1.363005407150351, + "grad_norm": 0.685127854347229, + "learning_rate": 1.1842315821073403e-05, + "loss": 2.4647, + "step": 16889 + }, + { + "epoch": 1.363086110886934, + "grad_norm": 0.696172833442688, + "learning_rate": 1.1834864863911987e-05, + "loss": 2.4224, + "step": 16890 + }, + { + "epoch": 1.3631668146235172, + "grad_norm": 0.6558032035827637, + "learning_rate": 1.1827416104056199e-05, + "loss": 2.3619, + "step": 16891 + }, + { + "epoch": 1.3632475183601, + "grad_norm": 0.744687020778656, + "learning_rate": 1.1819969541691689e-05, + "loss": 2.4669, + "step": 16892 + }, + { + "epoch": 1.3633282220966831, + "grad_norm": 0.6925212740898132, + "learning_rate": 1.1812525177004052e-05, + "loss": 2.3967, + "step": 16893 + }, + { + "epoch": 1.363408925833266, + "grad_norm": 0.6861244440078735, + "learning_rate": 1.1805083010178797e-05, + "loss": 2.3979, + "step": 16894 + }, + { + "epoch": 1.363489629569849, + "grad_norm": 0.6987108588218689, + "learning_rate": 1.179764304140143e-05, + "loss": 2.4263, + "step": 16895 + }, + { 
+ "epoch": 1.363570333306432, + "grad_norm": 0.6940091848373413, + "learning_rate": 1.179020527085738e-05, + "loss": 2.4328, + "step": 16896 + }, + { + "epoch": 1.363651037043015, + "grad_norm": 0.6831968426704407, + "learning_rate": 1.1782769698731966e-05, + "loss": 2.427, + "step": 16897 + }, + { + "epoch": 1.3637317407795981, + "grad_norm": 0.7370985746383667, + "learning_rate": 1.177533632521054e-05, + "loss": 2.3711, + "step": 16898 + }, + { + "epoch": 1.363812444516181, + "grad_norm": 0.8176774978637695, + "learning_rate": 1.1767905150478376e-05, + "loss": 2.4337, + "step": 16899 + }, + { + "epoch": 1.363893148252764, + "grad_norm": 0.786318302154541, + "learning_rate": 1.1760476174720637e-05, + "loss": 2.5099, + "step": 16900 + }, + { + "epoch": 1.3639738519893472, + "grad_norm": 0.7309854626655579, + "learning_rate": 1.1753049398122495e-05, + "loss": 2.46, + "step": 16901 + }, + { + "epoch": 1.36405455572593, + "grad_norm": 0.7410863637924194, + "learning_rate": 1.1745624820869039e-05, + "loss": 2.4249, + "step": 16902 + }, + { + "epoch": 1.3641352594625131, + "grad_norm": 0.7059988379478455, + "learning_rate": 1.1738202443145308e-05, + "loss": 2.4964, + "step": 16903 + }, + { + "epoch": 1.3642159631990962, + "grad_norm": 0.7351845502853394, + "learning_rate": 1.1730782265136287e-05, + "loss": 2.4694, + "step": 16904 + }, + { + "epoch": 1.364296666935679, + "grad_norm": 0.6928153038024902, + "learning_rate": 1.1723364287026938e-05, + "loss": 2.426, + "step": 16905 + }, + { + "epoch": 1.3643773706722622, + "grad_norm": 0.759920060634613, + "learning_rate": 1.1715948509002083e-05, + "loss": 2.4359, + "step": 16906 + }, + { + "epoch": 1.3644580744088453, + "grad_norm": 0.6655696630477905, + "learning_rate": 1.1708534931246573e-05, + "loss": 2.4118, + "step": 16907 + }, + { + "epoch": 1.3645387781454281, + "grad_norm": 0.6912528872489929, + "learning_rate": 1.170112355394517e-05, + "loss": 2.4257, + "step": 16908 + }, + { + "epoch": 1.3646194818820112, + 
"grad_norm": 0.6612871289253235, + "learning_rate": 1.1693714377282604e-05, + "loss": 2.4192, + "step": 16909 + }, + { + "epoch": 1.364700185618594, + "grad_norm": 0.6548018455505371, + "learning_rate": 1.1686307401443486e-05, + "loss": 2.4054, + "step": 16910 + }, + { + "epoch": 1.3647808893551772, + "grad_norm": 0.7749961018562317, + "learning_rate": 1.1678902626612443e-05, + "loss": 2.44, + "step": 16911 + }, + { + "epoch": 1.36486159309176, + "grad_norm": 0.7187496423721313, + "learning_rate": 1.1671500052974039e-05, + "loss": 2.4033, + "step": 16912 + }, + { + "epoch": 1.3649422968283431, + "grad_norm": 0.7002814412117004, + "learning_rate": 1.1664099680712715e-05, + "loss": 2.4442, + "step": 16913 + }, + { + "epoch": 1.3650230005649262, + "grad_norm": 0.6852529644966125, + "learning_rate": 1.1656701510012946e-05, + "loss": 2.4253, + "step": 16914 + }, + { + "epoch": 1.365103704301509, + "grad_norm": 0.6922035813331604, + "learning_rate": 1.1649305541059142e-05, + "loss": 2.4406, + "step": 16915 + }, + { + "epoch": 1.3651844080380922, + "grad_norm": 0.6883397698402405, + "learning_rate": 1.1641911774035563e-05, + "loss": 2.4064, + "step": 16916 + }, + { + "epoch": 1.3652651117746752, + "grad_norm": 0.7101531624794006, + "learning_rate": 1.163452020912652e-05, + "loss": 2.4068, + "step": 16917 + }, + { + "epoch": 1.365345815511258, + "grad_norm": 0.728369951248169, + "learning_rate": 1.1627130846516231e-05, + "loss": 2.4319, + "step": 16918 + }, + { + "epoch": 1.3654265192478412, + "grad_norm": 0.6765053272247314, + "learning_rate": 1.161974368638884e-05, + "loss": 2.3922, + "step": 16919 + }, + { + "epoch": 1.3655072229844243, + "grad_norm": 0.6909242868423462, + "learning_rate": 1.1612358728928475e-05, + "loss": 2.4124, + "step": 16920 + }, + { + "epoch": 1.3655879267210072, + "grad_norm": 0.735650897026062, + "learning_rate": 1.1604975974319177e-05, + "loss": 2.5137, + "step": 16921 + }, + { + "epoch": 1.3656686304575902, + "grad_norm": 0.6587653756141663, + 
"learning_rate": 1.1597595422744934e-05, + "loss": 2.4163, + "step": 16922 + }, + { + "epoch": 1.3657493341941733, + "grad_norm": 0.700282096862793, + "learning_rate": 1.159021707438971e-05, + "loss": 2.4272, + "step": 16923 + }, + { + "epoch": 1.3658300379307562, + "grad_norm": 0.7175682783126831, + "learning_rate": 1.1582840929437365e-05, + "loss": 2.4598, + "step": 16924 + }, + { + "epoch": 1.3659107416673393, + "grad_norm": 0.6725881695747375, + "learning_rate": 1.157546698807176e-05, + "loss": 2.4064, + "step": 16925 + }, + { + "epoch": 1.3659914454039221, + "grad_norm": 0.7130467295646667, + "learning_rate": 1.1568095250476651e-05, + "loss": 2.3851, + "step": 16926 + }, + { + "epoch": 1.3660721491405052, + "grad_norm": 0.6859269142150879, + "learning_rate": 1.1560725716835785e-05, + "loss": 2.3577, + "step": 16927 + }, + { + "epoch": 1.366152852877088, + "grad_norm": 0.7037541270256042, + "learning_rate": 1.1553358387332824e-05, + "loss": 2.4402, + "step": 16928 + }, + { + "epoch": 1.3662335566136712, + "grad_norm": 0.7094031572341919, + "learning_rate": 1.1545993262151366e-05, + "loss": 2.4036, + "step": 16929 + }, + { + "epoch": 1.3663142603502543, + "grad_norm": 0.6953302025794983, + "learning_rate": 1.1538630341474965e-05, + "loss": 2.4192, + "step": 16930 + }, + { + "epoch": 1.3663949640868371, + "grad_norm": 0.7012252807617188, + "learning_rate": 1.1531269625487163e-05, + "loss": 2.4207, + "step": 16931 + }, + { + "epoch": 1.3664756678234202, + "grad_norm": 0.6616495847702026, + "learning_rate": 1.1523911114371366e-05, + "loss": 2.4187, + "step": 16932 + }, + { + "epoch": 1.3665563715600033, + "grad_norm": 0.6819868087768555, + "learning_rate": 1.1516554808310975e-05, + "loss": 2.448, + "step": 16933 + }, + { + "epoch": 1.3666370752965862, + "grad_norm": 0.6869969964027405, + "learning_rate": 1.1509200707489343e-05, + "loss": 2.4134, + "step": 16934 + }, + { + "epoch": 1.3667177790331693, + "grad_norm": 0.6600778698921204, + "learning_rate": 
1.1501848812089733e-05, + "loss": 2.4159, + "step": 16935 + }, + { + "epoch": 1.3667984827697524, + "grad_norm": 0.668712317943573, + "learning_rate": 1.1494499122295398e-05, + "loss": 2.41, + "step": 16936 + }, + { + "epoch": 1.3668791865063352, + "grad_norm": 0.767365574836731, + "learning_rate": 1.1487151638289518e-05, + "loss": 2.3856, + "step": 16937 + }, + { + "epoch": 1.3669598902429183, + "grad_norm": 0.721546471118927, + "learning_rate": 1.1479806360255174e-05, + "loss": 2.4038, + "step": 16938 + }, + { + "epoch": 1.3670405939795012, + "grad_norm": 0.6796963810920715, + "learning_rate": 1.1472463288375456e-05, + "loss": 2.3698, + "step": 16939 + }, + { + "epoch": 1.3671212977160843, + "grad_norm": 0.7340671420097351, + "learning_rate": 1.1465122422833363e-05, + "loss": 2.4296, + "step": 16940 + }, + { + "epoch": 1.3672020014526671, + "grad_norm": 0.7173369526863098, + "learning_rate": 1.145778376381187e-05, + "loss": 2.3923, + "step": 16941 + }, + { + "epoch": 1.3672827051892502, + "grad_norm": 0.6683956980705261, + "learning_rate": 1.1450447311493839e-05, + "loss": 2.4092, + "step": 16942 + }, + { + "epoch": 1.3673634089258333, + "grad_norm": 0.6457851529121399, + "learning_rate": 1.1443113066062129e-05, + "loss": 2.3467, + "step": 16943 + }, + { + "epoch": 1.3674441126624162, + "grad_norm": 0.6870608925819397, + "learning_rate": 1.1435781027699532e-05, + "loss": 2.3766, + "step": 16944 + }, + { + "epoch": 1.3675248163989993, + "grad_norm": 0.6496049165725708, + "learning_rate": 1.1428451196588775e-05, + "loss": 2.4464, + "step": 16945 + }, + { + "epoch": 1.3676055201355823, + "grad_norm": 0.7554739117622375, + "learning_rate": 1.1421123572912551e-05, + "loss": 2.4243, + "step": 16946 + }, + { + "epoch": 1.3676862238721652, + "grad_norm": 0.7208122611045837, + "learning_rate": 1.1413798156853495e-05, + "loss": 2.3699, + "step": 16947 + }, + { + "epoch": 1.3677669276087483, + "grad_norm": 0.7072176337242126, + "learning_rate": 1.1406474948594126e-05, + 
"loss": 2.4011, + "step": 16948 + }, + { + "epoch": 1.3678476313453314, + "grad_norm": 0.7316476106643677, + "learning_rate": 1.1399153948316999e-05, + "loss": 2.4508, + "step": 16949 + }, + { + "epoch": 1.3679283350819142, + "grad_norm": 0.8518069386482239, + "learning_rate": 1.1391835156204577e-05, + "loss": 2.4197, + "step": 16950 + }, + { + "epoch": 1.3680090388184973, + "grad_norm": 0.6700364947319031, + "learning_rate": 1.1384518572439228e-05, + "loss": 2.4272, + "step": 16951 + }, + { + "epoch": 1.3680897425550804, + "grad_norm": 0.7007749676704407, + "learning_rate": 1.1377204197203317e-05, + "loss": 2.3777, + "step": 16952 + }, + { + "epoch": 1.3681704462916633, + "grad_norm": 0.6792053580284119, + "learning_rate": 1.1369892030679141e-05, + "loss": 2.4487, + "step": 16953 + }, + { + "epoch": 1.3682511500282464, + "grad_norm": 0.6913022398948669, + "learning_rate": 1.1362582073048932e-05, + "loss": 2.3757, + "step": 16954 + }, + { + "epoch": 1.3683318537648292, + "grad_norm": 0.648248016834259, + "learning_rate": 1.135527432449488e-05, + "loss": 2.3482, + "step": 16955 + }, + { + "epoch": 1.3684125575014123, + "grad_norm": 0.6711798906326294, + "learning_rate": 1.1347968785199115e-05, + "loss": 2.4096, + "step": 16956 + }, + { + "epoch": 1.3684932612379952, + "grad_norm": 0.6932381987571716, + "learning_rate": 1.1340665455343724e-05, + "loss": 2.3834, + "step": 16957 + }, + { + "epoch": 1.3685739649745783, + "grad_norm": 0.6890178918838501, + "learning_rate": 1.1333364335110697e-05, + "loss": 2.4182, + "step": 16958 + }, + { + "epoch": 1.3686546687111614, + "grad_norm": 0.6612519025802612, + "learning_rate": 1.1326065424681997e-05, + "loss": 2.3691, + "step": 16959 + }, + { + "epoch": 1.3687353724477442, + "grad_norm": 0.7123190760612488, + "learning_rate": 1.131876872423957e-05, + "loss": 2.3919, + "step": 16960 + }, + { + "epoch": 1.3688160761843273, + "grad_norm": 0.6615463495254517, + "learning_rate": 1.1311474233965214e-05, + "loss": 2.4266, + "step": 
16961 + }, + { + "epoch": 1.3688967799209104, + "grad_norm": 0.7320190668106079, + "learning_rate": 1.130418195404076e-05, + "loss": 2.4268, + "step": 16962 + }, + { + "epoch": 1.3689774836574933, + "grad_norm": 0.6845116019248962, + "learning_rate": 1.1296891884647965e-05, + "loss": 2.3972, + "step": 16963 + }, + { + "epoch": 1.3690581873940764, + "grad_norm": 0.70455002784729, + "learning_rate": 1.1289604025968448e-05, + "loss": 2.4183, + "step": 16964 + }, + { + "epoch": 1.3691388911306595, + "grad_norm": 0.6952407956123352, + "learning_rate": 1.128231837818392e-05, + "loss": 2.4276, + "step": 16965 + }, + { + "epoch": 1.3692195948672423, + "grad_norm": 0.7939464449882507, + "learning_rate": 1.1275034941475938e-05, + "loss": 2.4072, + "step": 16966 + }, + { + "epoch": 1.3693002986038254, + "grad_norm": 0.6974930763244629, + "learning_rate": 1.1267753716026007e-05, + "loss": 2.4133, + "step": 16967 + }, + { + "epoch": 1.3693810023404085, + "grad_norm": 0.7187508344650269, + "learning_rate": 1.126047470201559e-05, + "loss": 2.3588, + "step": 16968 + }, + { + "epoch": 1.3694617060769914, + "grad_norm": 0.6887609958648682, + "learning_rate": 1.1253197899626134e-05, + "loss": 2.4322, + "step": 16969 + }, + { + "epoch": 1.3695424098135744, + "grad_norm": 0.679957389831543, + "learning_rate": 1.1245923309038964e-05, + "loss": 2.3907, + "step": 16970 + }, + { + "epoch": 1.3696231135501573, + "grad_norm": 0.7540870308876038, + "learning_rate": 1.1238650930435378e-05, + "loss": 2.4752, + "step": 16971 + }, + { + "epoch": 1.3697038172867404, + "grad_norm": 0.7697634100914001, + "learning_rate": 1.1231380763996635e-05, + "loss": 2.4366, + "step": 16972 + }, + { + "epoch": 1.3697845210233233, + "grad_norm": 0.6836850643157959, + "learning_rate": 1.1224112809903954e-05, + "loss": 2.3511, + "step": 16973 + }, + { + "epoch": 1.3698652247599064, + "grad_norm": 0.6904506683349609, + "learning_rate": 1.1216847068338421e-05, + "loss": 2.4109, + "step": 16974 + }, + { + "epoch": 
1.3699459284964894, + "grad_norm": 0.6579318046569824, + "learning_rate": 1.1209583539481127e-05, + "loss": 2.4391, + "step": 16975 + }, + { + "epoch": 1.3700266322330723, + "grad_norm": 0.7107192277908325, + "learning_rate": 1.120232222351314e-05, + "loss": 2.399, + "step": 16976 + }, + { + "epoch": 1.3701073359696554, + "grad_norm": 0.7581583261489868, + "learning_rate": 1.119506312061539e-05, + "loss": 2.4817, + "step": 16977 + }, + { + "epoch": 1.3701880397062385, + "grad_norm": 0.6836642622947693, + "learning_rate": 1.11878062309688e-05, + "loss": 2.4415, + "step": 16978 + }, + { + "epoch": 1.3702687434428213, + "grad_norm": 0.6842699646949768, + "learning_rate": 1.118055155475426e-05, + "loss": 2.4045, + "step": 16979 + }, + { + "epoch": 1.3703494471794044, + "grad_norm": 0.7630519270896912, + "learning_rate": 1.1173299092152534e-05, + "loss": 2.4314, + "step": 16980 + }, + { + "epoch": 1.3704301509159875, + "grad_norm": 0.7334303259849548, + "learning_rate": 1.116604884334439e-05, + "loss": 2.3564, + "step": 16981 + }, + { + "epoch": 1.3705108546525704, + "grad_norm": 0.6929439306259155, + "learning_rate": 1.1158800808510538e-05, + "loss": 2.4258, + "step": 16982 + }, + { + "epoch": 1.3705915583891535, + "grad_norm": 0.6387187838554382, + "learning_rate": 1.1151554987831591e-05, + "loss": 2.3263, + "step": 16983 + }, + { + "epoch": 1.3706722621257363, + "grad_norm": 0.7279032468795776, + "learning_rate": 1.1144311381488136e-05, + "loss": 2.4074, + "step": 16984 + }, + { + "epoch": 1.3707529658623194, + "grad_norm": 0.7066916227340698, + "learning_rate": 1.113706998966072e-05, + "loss": 2.4358, + "step": 16985 + }, + { + "epoch": 1.3708336695989023, + "grad_norm": 0.6753098964691162, + "learning_rate": 1.1129830812529807e-05, + "loss": 2.4195, + "step": 16986 + }, + { + "epoch": 1.3709143733354854, + "grad_norm": 0.6728894114494324, + "learning_rate": 1.112259385027582e-05, + "loss": 2.3712, + "step": 16987 + }, + { + "epoch": 1.3709950770720685, + 
"grad_norm": 0.7251775860786438, + "learning_rate": 1.1115359103079115e-05, + "loss": 2.4063, + "step": 16988 + }, + { + "epoch": 1.3710757808086513, + "grad_norm": 0.6797254085540771, + "learning_rate": 1.1108126571120036e-05, + "loss": 2.395, + "step": 16989 + }, + { + "epoch": 1.3711564845452344, + "grad_norm": 0.7505605220794678, + "learning_rate": 1.1100896254578786e-05, + "loss": 2.4044, + "step": 16990 + }, + { + "epoch": 1.3712371882818175, + "grad_norm": 0.7126416563987732, + "learning_rate": 1.1093668153635594e-05, + "loss": 2.4043, + "step": 16991 + }, + { + "epoch": 1.3713178920184004, + "grad_norm": 0.6550771594047546, + "learning_rate": 1.1086442268470609e-05, + "loss": 2.3515, + "step": 16992 + }, + { + "epoch": 1.3713985957549835, + "grad_norm": 0.7253621816635132, + "learning_rate": 1.1079218599263874e-05, + "loss": 2.4109, + "step": 16993 + }, + { + "epoch": 1.3714792994915666, + "grad_norm": 0.7272186875343323, + "learning_rate": 1.1071997146195468e-05, + "loss": 2.3531, + "step": 16994 + }, + { + "epoch": 1.3715600032281494, + "grad_norm": 0.6841129660606384, + "learning_rate": 1.1064777909445345e-05, + "loss": 2.4031, + "step": 16995 + }, + { + "epoch": 1.3716407069647325, + "grad_norm": 0.692945659160614, + "learning_rate": 1.1057560889193441e-05, + "loss": 2.3858, + "step": 16996 + }, + { + "epoch": 1.3717214107013156, + "grad_norm": 0.721182644367218, + "learning_rate": 1.1050346085619612e-05, + "loss": 2.3871, + "step": 16997 + }, + { + "epoch": 1.3718021144378985, + "grad_norm": 0.722960889339447, + "learning_rate": 1.1043133498903702e-05, + "loss": 2.3452, + "step": 16998 + }, + { + "epoch": 1.3718828181744815, + "grad_norm": 0.7148451805114746, + "learning_rate": 1.1035923129225412e-05, + "loss": 2.3905, + "step": 16999 + }, + { + "epoch": 1.3719635219110644, + "grad_norm": 0.7118532061576843, + "learning_rate": 1.1028714976764486e-05, + "loss": 2.3894, + "step": 17000 + }, + { + "epoch": 1.3719635219110644, + "eval_loss": 
2.3730249404907227, + "eval_runtime": 769.4165, + "eval_samples_per_second": 3.405, + "eval_steps_per_second": 0.568, + "step": 17000 + } + ], + "logging_steps": 1, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.95407905485312e+17, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}