protgpt2-distilled-small / training_logs.json
littleworth's picture
Upload folder using huggingface_hub
994723e verified
raw
history blame
169 kB
[
{
"loss": 29.9707,
"grad_norm": 0.4777052700519562,
"learning_rate": 0.0009991248796709547,
"epoch": 0.0
},
{
"loss": 22.6857,
"grad_norm": 0.7528864741325378,
"learning_rate": 0.0009982497593419095,
"epoch": 0.01
},
{
"loss": 23.3032,
"grad_norm": 0.2558889389038086,
"learning_rate": 0.0009973746390128642,
"epoch": 0.01
},
{
"loss": 22.7608,
"grad_norm": 0.16549238562583923,
"learning_rate": 0.000996499518683819,
"epoch": 0.01
},
{
"loss": 21.8524,
"grad_norm": 0.969261109828949,
"learning_rate": 0.0009956243983547737,
"epoch": 0.01
},
{
"loss": 20.1216,
"grad_norm": 1.4401915073394775,
"learning_rate": 0.0009947492780257286,
"epoch": 0.02
},
{
"loss": 18.8698,
"grad_norm": 1.2519457340240479,
"learning_rate": 0.0009938741576966832,
"epoch": 0.02
},
{
"loss": 17.8051,
"grad_norm": 0.6829971075057983,
"learning_rate": 0.0009929990373676381,
"epoch": 0.02
},
{
"loss": 16.7436,
"grad_norm": 0.8918408155441284,
"learning_rate": 0.0009921239170385928,
"epoch": 0.02
},
{
"loss": 16.3997,
"grad_norm": 1.1997641324996948,
"learning_rate": 0.0009912487967095476,
"epoch": 0.03
},
{
"loss": 16.6555,
"grad_norm": 1.0908863544464111,
"learning_rate": 0.0009903736763805023,
"epoch": 0.03
},
{
"loss": 16.168,
"grad_norm": 0.8117638230323792,
"learning_rate": 0.0009894985560514572,
"epoch": 0.03
},
{
"loss": 16.4056,
"grad_norm": 0.7367461919784546,
"learning_rate": 0.0009886234357224118,
"epoch": 0.03
},
{
"loss": 16.0576,
"grad_norm": 0.827192485332489,
"learning_rate": 0.0009877483153933667,
"epoch": 0.04
},
{
"loss": 14.8864,
"grad_norm": 0.6644204258918762,
"learning_rate": 0.0009868731950643213,
"epoch": 0.04
},
{
"loss": 15.1702,
"grad_norm": 0.6314308047294617,
"learning_rate": 0.0009859980747352762,
"epoch": 0.04
},
{
"loss": 14.8873,
"grad_norm": 0.4996398091316223,
"learning_rate": 0.0009851229544062309,
"epoch": 0.04
},
{
"loss": 14.704,
"grad_norm": 0.6396967768669128,
"learning_rate": 0.0009842478340771857,
"epoch": 0.05
},
{
"loss": 14.8636,
"grad_norm": 0.5319499373435974,
"learning_rate": 0.0009833727137481404,
"epoch": 0.05
},
{
"loss": 14.7236,
"grad_norm": 1.1328645944595337,
"learning_rate": 0.0009824975934190953,
"epoch": 0.05
},
{
"loss": 14.2966,
"grad_norm": 0.5435690879821777,
"learning_rate": 0.00098162247309005,
"epoch": 0.06
},
{
"loss": 14.8865,
"grad_norm": 0.5260070562362671,
"learning_rate": 0.0009807473527610048,
"epoch": 0.06
},
{
"loss": 14.1635,
"grad_norm": 0.5490550994873047,
"learning_rate": 0.0009798722324319594,
"epoch": 0.06
},
{
"loss": 14.1756,
"grad_norm": 0.5580148696899414,
"learning_rate": 0.0009789971121029143,
"epoch": 0.06
},
{
"loss": 13.3026,
"grad_norm": 0.4862927198410034,
"learning_rate": 0.000978121991773869,
"epoch": 0.07
},
{
"loss": 13.9938,
"grad_norm": 0.4365651607513428,
"learning_rate": 0.0009772468714448236,
"epoch": 0.07
},
{
"loss": 13.7628,
"grad_norm": 0.5206578373908997,
"learning_rate": 0.0009763717511157785,
"epoch": 0.07
},
{
"loss": 13.2932,
"grad_norm": 0.4493275582790375,
"learning_rate": 0.0009754966307867332,
"epoch": 0.07
},
{
"loss": 13.4192,
"grad_norm": 0.5717960596084595,
"learning_rate": 0.000974621510457688,
"epoch": 0.08
},
{
"loss": 13.2883,
"grad_norm": 0.48513928055763245,
"learning_rate": 0.0009737463901286428,
"epoch": 0.08
},
{
"loss": 13.8283,
"grad_norm": 0.7734763622283936,
"learning_rate": 0.0009728712697995975,
"epoch": 0.08
},
{
"loss": 12.4766,
"grad_norm": 0.45278435945510864,
"learning_rate": 0.0009719961494705523,
"epoch": 0.08
},
{
"loss": 13.41,
"grad_norm": 0.5911663174629211,
"learning_rate": 0.000971121029141507,
"epoch": 0.09
},
{
"loss": 12.4475,
"grad_norm": 0.5840547680854797,
"learning_rate": 0.0009702459088124618,
"epoch": 0.09
},
{
"loss": 13.9417,
"grad_norm": 0.7008219957351685,
"learning_rate": 0.0009693707884834166,
"epoch": 0.09
},
{
"loss": 12.9164,
"grad_norm": 0.7021568417549133,
"learning_rate": 0.0009684956681543713,
"epoch": 0.09
},
{
"loss": 12.9824,
"grad_norm": 0.5466001629829407,
"learning_rate": 0.0009676205478253261,
"epoch": 0.1
},
{
"loss": 12.5413,
"grad_norm": 0.6215840578079224,
"learning_rate": 0.0009667454274962808,
"epoch": 0.1
},
{
"loss": 12.7753,
"grad_norm": 1.5948784351348877,
"learning_rate": 0.0009658703071672355,
"epoch": 0.1
},
{
"loss": 12.5837,
"grad_norm": 1.1063404083251953,
"learning_rate": 0.0009649951868381903,
"epoch": 0.1
},
{
"loss": 12.7079,
"grad_norm": 0.7521733045578003,
"learning_rate": 0.000964120066509145,
"epoch": 0.11
},
{
"loss": 12.1584,
"grad_norm": 0.7596040964126587,
"learning_rate": 0.0009632449461800998,
"epoch": 0.11
},
{
"loss": 12.6058,
"grad_norm": 1.1221098899841309,
"learning_rate": 0.0009623698258510546,
"epoch": 0.11
},
{
"loss": 12.7003,
"grad_norm": 0.795098602771759,
"learning_rate": 0.0009614947055220093,
"epoch": 0.12
},
{
"loss": 12.4519,
"grad_norm": 0.481406569480896,
"learning_rate": 0.0009606195851929641,
"epoch": 0.12
},
{
"loss": 12.8483,
"grad_norm": 0.6707068681716919,
"learning_rate": 0.0009597444648639187,
"epoch": 0.12
},
{
"loss": 12.593,
"grad_norm": 0.6381434798240662,
"learning_rate": 0.0009588693445348735,
"epoch": 0.12
},
{
"loss": 12.1654,
"grad_norm": 0.7791229486465454,
"learning_rate": 0.0009579942242058283,
"epoch": 0.13
},
{
"loss": 11.8089,
"grad_norm": 0.8445360660552979,
"learning_rate": 0.000957119103876783,
"epoch": 0.13
},
{
"loss": 12.232,
"grad_norm": 0.6427455544471741,
"learning_rate": 0.0009562439835477378,
"epoch": 0.13
},
{
"loss": 12.9625,
"grad_norm": 0.5700855255126953,
"learning_rate": 0.0009553688632186925,
"epoch": 0.13
},
{
"loss": 12.2628,
"grad_norm": 0.8731588125228882,
"learning_rate": 0.0009544937428896473,
"epoch": 0.14
},
{
"loss": 12.0435,
"grad_norm": 0.869883120059967,
"learning_rate": 0.0009536186225606021,
"epoch": 0.14
},
{
"loss": 12.2867,
"grad_norm": 0.8802808523178101,
"learning_rate": 0.0009527435022315568,
"epoch": 0.14
},
{
"loss": 11.9397,
"grad_norm": 1.0076773166656494,
"learning_rate": 0.0009518683819025116,
"epoch": 0.14
},
{
"loss": 11.8392,
"grad_norm": 0.5855250954627991,
"learning_rate": 0.0009509932615734664,
"epoch": 0.15
},
{
"loss": 11.3847,
"grad_norm": 0.5606763958930969,
"learning_rate": 0.0009501181412444211,
"epoch": 0.15
},
{
"loss": 12.2154,
"grad_norm": 1.1014057397842407,
"learning_rate": 0.0009492430209153759,
"epoch": 0.15
},
{
"loss": 11.6247,
"grad_norm": 0.6524838805198669,
"learning_rate": 0.0009483679005863306,
"epoch": 0.15
},
{
"loss": 11.5115,
"grad_norm": 1.0140221118927002,
"learning_rate": 0.0009474927802572854,
"epoch": 0.16
},
{
"loss": 12.1707,
"grad_norm": 1.4689868688583374,
"learning_rate": 0.0009466176599282402,
"epoch": 0.16
},
{
"loss": 11.6165,
"grad_norm": 0.8136260509490967,
"learning_rate": 0.0009457425395991948,
"epoch": 0.16
},
{
"loss": 11.8841,
"grad_norm": 2.0376949310302734,
"learning_rate": 0.0009448674192701496,
"epoch": 0.17
},
{
"loss": 11.2108,
"grad_norm": 1.1647133827209473,
"learning_rate": 0.0009439922989411043,
"epoch": 0.17
},
{
"loss": 11.9281,
"grad_norm": 0.8479063510894775,
"learning_rate": 0.0009431171786120591,
"epoch": 0.17
},
{
"loss": 11.0593,
"grad_norm": 0.8340569138526917,
"learning_rate": 0.0009422420582830139,
"epoch": 0.17
},
{
"loss": 11.5591,
"grad_norm": 0.9813485145568848,
"learning_rate": 0.0009413669379539686,
"epoch": 0.18
},
{
"loss": 11.1773,
"grad_norm": 0.9088229537010193,
"learning_rate": 0.0009404918176249234,
"epoch": 0.18
},
{
"loss": 11.6913,
"grad_norm": 0.860917866230011,
"learning_rate": 0.0009396166972958782,
"epoch": 0.18
},
{
"loss": 12.3707,
"grad_norm": 0.7795988321304321,
"learning_rate": 0.0009387415769668329,
"epoch": 0.18
},
{
"loss": 11.6669,
"grad_norm": 0.914884626865387,
"learning_rate": 0.0009378664566377877,
"epoch": 0.19
},
{
"loss": 11.6139,
"grad_norm": 1.7863789796829224,
"learning_rate": 0.0009369913363087424,
"epoch": 0.19
},
{
"loss": 11.1885,
"grad_norm": 0.7225568294525146,
"learning_rate": 0.0009361162159796972,
"epoch": 0.19
},
{
"loss": 11.7488,
"grad_norm": 0.9028294682502747,
"learning_rate": 0.000935241095650652,
"epoch": 0.19
},
{
"loss": 11.227,
"grad_norm": 1.0842101573944092,
"learning_rate": 0.0009343659753216067,
"epoch": 0.2
},
{
"loss": 11.4022,
"grad_norm": 0.7042496800422668,
"learning_rate": 0.0009334908549925615,
"epoch": 0.2
},
{
"loss": 11.006,
"grad_norm": 0.8355586528778076,
"learning_rate": 0.0009326157346635162,
"epoch": 0.2
},
{
"loss": 11.0561,
"grad_norm": 0.9001519083976746,
"learning_rate": 0.000931740614334471,
"epoch": 0.2
},
{
"loss": 11.357,
"grad_norm": 0.8695396184921265,
"learning_rate": 0.0009308654940054258,
"epoch": 0.21
},
{
"loss": 10.7003,
"grad_norm": 0.8076105117797852,
"learning_rate": 0.0009299903736763805,
"epoch": 0.21
},
{
"loss": 11.2661,
"grad_norm": 0.9677106142044067,
"learning_rate": 0.0009291152533473353,
"epoch": 0.21
},
{
"loss": 10.8957,
"grad_norm": 0.8753145337104797,
"learning_rate": 0.0009282401330182901,
"epoch": 0.22
},
{
"loss": 11.2854,
"grad_norm": 0.7343422770500183,
"learning_rate": 0.0009273650126892448,
"epoch": 0.22
},
{
"loss": 10.8205,
"grad_norm": 0.9795741438865662,
"learning_rate": 0.0009264898923601996,
"epoch": 0.22
},
{
"loss": 10.6805,
"grad_norm": 0.9723809957504272,
"learning_rate": 0.0009256147720311543,
"epoch": 0.22
},
{
"loss": 10.7639,
"grad_norm": 0.6675435900688171,
"learning_rate": 0.0009247396517021091,
"epoch": 0.23
},
{
"loss": 11.1119,
"grad_norm": 0.9673445224761963,
"learning_rate": 0.0009238645313730638,
"epoch": 0.23
},
{
"loss": 11.293,
"grad_norm": 0.9545767307281494,
"learning_rate": 0.0009229894110440185,
"epoch": 0.23
},
{
"loss": 11.4529,
"grad_norm": 0.8443020582199097,
"learning_rate": 0.0009221142907149733,
"epoch": 0.23
},
{
"loss": 10.402,
"grad_norm": 0.9980494976043701,
"learning_rate": 0.000921239170385928,
"epoch": 0.24
},
{
"loss": 10.8417,
"grad_norm": 1.2651828527450562,
"learning_rate": 0.0009203640500568828,
"epoch": 0.24
},
{
"loss": 10.9627,
"grad_norm": 0.7320075035095215,
"learning_rate": 0.0009194889297278376,
"epoch": 0.24
},
{
"loss": 10.1427,
"grad_norm": 1.5249311923980713,
"learning_rate": 0.0009186138093987923,
"epoch": 0.24
},
{
"loss": 11.0647,
"grad_norm": 0.8371347188949585,
"learning_rate": 0.0009177386890697471,
"epoch": 0.25
},
{
"loss": 10.7984,
"grad_norm": 1.0522745847702026,
"learning_rate": 0.0009168635687407019,
"epoch": 0.25
},
{
"loss": 10.0289,
"grad_norm": 0.9992939829826355,
"learning_rate": 0.0009159884484116566,
"epoch": 0.25
},
{
"loss": 10.6594,
"grad_norm": 1.6465744972229004,
"learning_rate": 0.0009151133280826114,
"epoch": 0.25
},
{
"loss": 10.7898,
"grad_norm": 0.8755474090576172,
"learning_rate": 0.0009142382077535661,
"epoch": 0.26
},
{
"loss": 10.8566,
"grad_norm": 0.9154648780822754,
"learning_rate": 0.0009133630874245209,
"epoch": 0.26
},
{
"loss": 10.3388,
"grad_norm": 0.9557958245277405,
"learning_rate": 0.0009124879670954757,
"epoch": 0.26
},
{
"loss": 11.0761,
"grad_norm": 0.9756875038146973,
"learning_rate": 0.0009116128467664304,
"epoch": 0.27
},
{
"loss": 10.6927,
"grad_norm": 0.9137876033782959,
"learning_rate": 0.0009107377264373852,
"epoch": 0.27
},
{
"loss": 10.4956,
"grad_norm": 1.2811295986175537,
"learning_rate": 0.00090986260610834,
"epoch": 0.27
},
{
"loss": 11.13,
"grad_norm": 1.574196696281433,
"learning_rate": 0.0009090749978121991,
"epoch": 0.27
},
{
"loss": 10.4299,
"grad_norm": 1.120239019393921,
"learning_rate": 0.0009082873895160585,
"epoch": 0.28
},
{
"loss": 10.9432,
"grad_norm": 4.42399263381958,
"learning_rate": 0.0009074122691870133,
"epoch": 0.28
},
{
"loss": 10.6758,
"grad_norm": 1.1292444467544556,
"learning_rate": 0.000906537148857968,
"epoch": 0.28
},
{
"loss": 9.9808,
"grad_norm": 1.36553156375885,
"learning_rate": 0.0009056620285289227,
"epoch": 0.28
},
{
"loss": 10.4376,
"grad_norm": 1.4920979738235474,
"learning_rate": 0.0009047869081998775,
"epoch": 0.29
},
{
"loss": 11.5319,
"grad_norm": 1.142583966255188,
"learning_rate": 0.0009039117878708322,
"epoch": 0.29
},
{
"loss": 10.8741,
"grad_norm": 1.7269898653030396,
"learning_rate": 0.000903036667541787,
"epoch": 0.29
},
{
"loss": 10.6609,
"grad_norm": 1.0620924234390259,
"learning_rate": 0.0009021615472127418,
"epoch": 0.29
},
{
"loss": 10.8716,
"grad_norm": 1.0225517749786377,
"learning_rate": 0.0009012864268836965,
"epoch": 0.3
},
{
"loss": 10.8629,
"grad_norm": 0.8201847672462463,
"learning_rate": 0.0009004113065546513,
"epoch": 0.3
},
{
"loss": 10.2614,
"grad_norm": 0.7885268926620483,
"learning_rate": 0.000899536186225606,
"epoch": 0.3
},
{
"loss": 10.1758,
"grad_norm": 0.8671897053718567,
"learning_rate": 0.0008986610658965608,
"epoch": 0.3
},
{
"loss": 10.2796,
"grad_norm": 0.8501631617546082,
"learning_rate": 0.0008977859455675156,
"epoch": 0.31
},
{
"loss": 10.4376,
"grad_norm": 1.3847661018371582,
"learning_rate": 0.0008969108252384703,
"epoch": 0.31
},
{
"loss": 10.6258,
"grad_norm": 1.1267868280410767,
"learning_rate": 0.0008960357049094251,
"epoch": 0.31
},
{
"loss": 10.3214,
"grad_norm": 0.9492388963699341,
"learning_rate": 0.0008951605845803799,
"epoch": 0.31
},
{
"loss": 10.3126,
"grad_norm": 2.884838819503784,
"learning_rate": 0.0008942854642513346,
"epoch": 0.32
},
{
"loss": 9.8104,
"grad_norm": 1.007505178451538,
"learning_rate": 0.0008934103439222894,
"epoch": 0.32
},
{
"loss": 10.7341,
"grad_norm": 0.9504636526107788,
"learning_rate": 0.0008925352235932441,
"epoch": 0.32
},
{
"loss": 10.3923,
"grad_norm": 1.1075007915496826,
"learning_rate": 0.0008916601032641989,
"epoch": 0.33
},
{
"loss": 10.323,
"grad_norm": 1.137343406677246,
"learning_rate": 0.0008907849829351537,
"epoch": 0.33
},
{
"loss": 10.2794,
"grad_norm": 0.797771155834198,
"learning_rate": 0.0008899098626061084,
"epoch": 0.33
},
{
"loss": 10.6656,
"grad_norm": 1.018343448638916,
"learning_rate": 0.0008890347422770632,
"epoch": 0.33
},
{
"loss": 10.2778,
"grad_norm": 1.0548039674758911,
"learning_rate": 0.000888159621948018,
"epoch": 0.34
},
{
"loss": 10.1114,
"grad_norm": 3.0174038410186768,
"learning_rate": 0.0008872845016189727,
"epoch": 0.34
},
{
"loss": 10.8685,
"grad_norm": 2.50591778755188,
"learning_rate": 0.0008864093812899275,
"epoch": 0.34
},
{
"loss": 10.0677,
"grad_norm": 1.2851207256317139,
"learning_rate": 0.0008855342609608822,
"epoch": 0.34
},
{
"loss": 10.0311,
"grad_norm": 0.7987344264984131,
"learning_rate": 0.0008846591406318369,
"epoch": 0.35
},
{
"loss": 9.7713,
"grad_norm": 1.114479899406433,
"learning_rate": 0.0008837840203027917,
"epoch": 0.35
},
{
"loss": 9.9371,
"grad_norm": 1.2233116626739502,
"learning_rate": 0.0008829088999737464,
"epoch": 0.35
},
{
"loss": 10.5333,
"grad_norm": 2.0412189960479736,
"learning_rate": 0.0008820337796447012,
"epoch": 0.35
},
{
"loss": 10.2577,
"grad_norm": 1.2635306119918823,
"learning_rate": 0.0008811586593156559,
"epoch": 0.36
},
{
"loss": 9.8937,
"grad_norm": 12.760872840881348,
"learning_rate": 0.0008802835389866107,
"epoch": 0.36
},
{
"loss": 10.6092,
"grad_norm": 1.3580334186553955,
"learning_rate": 0.0008794084186575654,
"epoch": 0.36
},
{
"loss": 10.2467,
"grad_norm": 1.250632643699646,
"learning_rate": 0.0008785332983285201,
"epoch": 0.36
},
{
"loss": 10.5076,
"grad_norm": 1.458349585533142,
"learning_rate": 0.0008776581779994749,
"epoch": 0.37
},
{
"loss": 10.2769,
"grad_norm": 1.9139622449874878,
"learning_rate": 0.0008767830576704296,
"epoch": 0.37
},
{
"loss": 10.4452,
"grad_norm": 1.2400761842727661,
"learning_rate": 0.0008759079373413844,
"epoch": 0.37
},
{
"loss": 10.01,
"grad_norm": 1.5482594966888428,
"learning_rate": 0.0008750328170123392,
"epoch": 0.38
},
{
"loss": 10.2997,
"grad_norm": 1.68232262134552,
"learning_rate": 0.0008741576966832939,
"epoch": 0.38
},
{
"loss": 10.0902,
"grad_norm": 1.206350564956665,
"learning_rate": 0.0008732825763542487,
"epoch": 0.38
},
{
"loss": 9.6499,
"grad_norm": 1.2805421352386475,
"learning_rate": 0.0008724074560252034,
"epoch": 0.38
},
{
"loss": 10.3858,
"grad_norm": 1.1297776699066162,
"learning_rate": 0.0008715323356961582,
"epoch": 0.39
},
{
"loss": 10.4059,
"grad_norm": 1.382300853729248,
"learning_rate": 0.000870657215367113,
"epoch": 0.39
},
{
"loss": 9.8993,
"grad_norm": 1.1831278800964355,
"learning_rate": 0.0008697820950380677,
"epoch": 0.39
},
{
"loss": 10.2277,
"grad_norm": 1.5924201011657715,
"learning_rate": 0.0008689069747090225,
"epoch": 0.39
},
{
"loss": 10.2644,
"grad_norm": 0.9275569319725037,
"learning_rate": 0.0008680318543799773,
"epoch": 0.4
},
{
"loss": 10.2756,
"grad_norm": 1.220247745513916,
"learning_rate": 0.000867156734050932,
"epoch": 0.4
},
{
"loss": 9.873,
"grad_norm": 1.2408357858657837,
"learning_rate": 0.0008662816137218868,
"epoch": 0.4
},
{
"loss": 10.4232,
"grad_norm": 2.236565351486206,
"learning_rate": 0.0008654064933928415,
"epoch": 0.4
},
{
"loss": 9.8613,
"grad_norm": 1.3093738555908203,
"learning_rate": 0.0008645313730637963,
"epoch": 0.41
},
{
"loss": 10.1708,
"grad_norm": 2.232199192047119,
"learning_rate": 0.000863656252734751,
"epoch": 0.41
},
{
"loss": 9.9729,
"grad_norm": 1.4281343221664429,
"learning_rate": 0.0008627811324057057,
"epoch": 0.41
},
{
"loss": 10.3467,
"grad_norm": 1.7682894468307495,
"learning_rate": 0.0008619060120766605,
"epoch": 0.41
},
{
"loss": 9.7119,
"grad_norm": 1.7619984149932861,
"learning_rate": 0.0008610308917476152,
"epoch": 0.42
},
{
"loss": 10.4769,
"grad_norm": 1.5372920036315918,
"learning_rate": 0.00086015577141857,
"epoch": 0.42
},
{
"loss": 10.691,
"grad_norm": 2.3789474964141846,
"learning_rate": 0.0008592806510895248,
"epoch": 0.42
},
{
"loss": 9.8791,
"grad_norm": 2.496776819229126,
"learning_rate": 0.0008584055307604795,
"epoch": 0.43
},
{
"loss": 9.7356,
"grad_norm": 4.118072032928467,
"learning_rate": 0.0008575304104314343,
"epoch": 0.43
},
{
"loss": 10.3761,
"grad_norm": 1.7359448671340942,
"learning_rate": 0.0008566552901023891,
"epoch": 0.43
},
{
"loss": 10.1403,
"grad_norm": 1.8283412456512451,
"learning_rate": 0.0008557801697733438,
"epoch": 0.43
},
{
"loss": 10.306,
"grad_norm": 1.9979033470153809,
"learning_rate": 0.0008549050494442986,
"epoch": 0.44
},
{
"loss": 9.5832,
"grad_norm": 3.1794967651367188,
"learning_rate": 0.0008540299291152533,
"epoch": 0.44
},
{
"loss": 10.1963,
"grad_norm": 3.1991539001464844,
"learning_rate": 0.0008531548087862081,
"epoch": 0.44
},
{
"loss": 10.7828,
"grad_norm": 2.5145182609558105,
"learning_rate": 0.0008522796884571629,
"epoch": 0.44
},
{
"loss": 10.1017,
"grad_norm": 1.0783337354660034,
"learning_rate": 0.0008514045681281176,
"epoch": 0.45
},
{
"loss": 9.4955,
"grad_norm": 6.040937423706055,
"learning_rate": 0.0008505294477990724,
"epoch": 0.45
},
{
"loss": 10.3679,
"grad_norm": 1.5212355852127075,
"learning_rate": 0.0008496543274700271,
"epoch": 0.45
},
{
"loss": 9.7236,
"grad_norm": 4.30284309387207,
"learning_rate": 0.0008487792071409819,
"epoch": 0.45
},
{
"loss": 9.7635,
"grad_norm": 2.9821696281433105,
"learning_rate": 0.0008479040868119367,
"epoch": 0.46
},
{
"loss": 9.8438,
"grad_norm": 1.676759958267212,
"learning_rate": 0.0008470289664828914,
"epoch": 0.46
},
{
"loss": 9.6693,
"grad_norm": 1.8075122833251953,
"learning_rate": 0.0008461538461538462,
"epoch": 0.46
},
{
"loss": 10.4572,
"grad_norm": 2.4182658195495605,
"learning_rate": 0.000845278725824801,
"epoch": 0.46
},
{
"loss": 10.3901,
"grad_norm": 1.7208518981933594,
"learning_rate": 0.0008444036054957557,
"epoch": 0.47
},
{
"loss": 9.7696,
"grad_norm": 2.4831340312957764,
"learning_rate": 0.0008435284851667105,
"epoch": 0.47
},
{
"loss": 10.409,
"grad_norm": 1.3335094451904297,
"learning_rate": 0.0008426533648376652,
"epoch": 0.47
},
{
"loss": 10.526,
"grad_norm": 0.9441933035850525,
"learning_rate": 0.0008417782445086199,
"epoch": 0.48
},
{
"loss": 10.14,
"grad_norm": 1.1018340587615967,
"learning_rate": 0.0008409031241795747,
"epoch": 0.48
},
{
"loss": 10.0298,
"grad_norm": 1.2077239751815796,
"learning_rate": 0.0008400280038505294,
"epoch": 0.48
},
{
"loss": 9.7303,
"grad_norm": 2.0401172637939453,
"learning_rate": 0.0008391528835214842,
"epoch": 0.48
},
{
"loss": 10.1229,
"grad_norm": 1.9456411600112915,
"learning_rate": 0.0008382777631924389,
"epoch": 0.49
},
{
"loss": 9.9805,
"grad_norm": 1.830814003944397,
"learning_rate": 0.0008374026428633937,
"epoch": 0.49
},
{
"loss": 10.1328,
"grad_norm": 2.1729185581207275,
"learning_rate": 0.0008365275225343485,
"epoch": 0.49
},
{
"loss": 10.4834,
"grad_norm": 1.324315071105957,
"learning_rate": 0.0008356524022053032,
"epoch": 0.49
},
{
"loss": 10.349,
"grad_norm": 2.837768077850342,
"learning_rate": 0.000834777281876258,
"epoch": 0.5
},
{
"loss": 9.8015,
"grad_norm": 1.1361275911331177,
"learning_rate": 0.0008339021615472128,
"epoch": 0.5
},
{
"loss": 9.5739,
"grad_norm": 1.4033498764038086,
"learning_rate": 0.0008330270412181675,
"epoch": 0.5
},
{
"loss": 9.5204,
"grad_norm": 1.1027082204818726,
"learning_rate": 0.0008321519208891223,
"epoch": 0.5
},
{
"loss": 9.4961,
"grad_norm": 2.2432548999786377,
"learning_rate": 0.000831276800560077,
"epoch": 0.51
},
{
"loss": 10.4562,
"grad_norm": 1.3807300329208374,
"learning_rate": 0.0008304016802310318,
"epoch": 0.51
},
{
"loss": 9.9888,
"grad_norm": 2.594301462173462,
"learning_rate": 0.0008295265599019866,
"epoch": 0.51
},
{
"loss": 9.4501,
"grad_norm": 1.4775426387786865,
"learning_rate": 0.0008286514395729413,
"epoch": 0.51
},
{
"loss": 9.9432,
"grad_norm": 1.463850736618042,
"learning_rate": 0.0008277763192438961,
"epoch": 0.52
},
{
"loss": 9.7867,
"grad_norm": 1.5370949506759644,
"learning_rate": 0.0008269011989148508,
"epoch": 0.52
},
{
"loss": 9.674,
"grad_norm": 1.2858608961105347,
"learning_rate": 0.0008260260785858056,
"epoch": 0.52
},
{
"loss": 10.4663,
"grad_norm": 1.2758288383483887,
"learning_rate": 0.0008251509582567604,
"epoch": 0.52
},
{
"loss": 9.552,
"grad_norm": 1.181013822555542,
"learning_rate": 0.0008242758379277151,
"epoch": 0.53
},
{
"loss": 9.9999,
"grad_norm": 0.9388832449913025,
"learning_rate": 0.0008234007175986699,
"epoch": 0.53
},
{
"loss": 10.1529,
"grad_norm": 1.3157830238342285,
"learning_rate": 0.0008225255972696247,
"epoch": 0.53
},
{
"loss": 10.3224,
"grad_norm": 1.603309154510498,
"learning_rate": 0.0008216504769405794,
"epoch": 0.54
},
{
"loss": 9.2725,
"grad_norm": 1.2987728118896484,
"learning_rate": 0.0008207753566115342,
"epoch": 0.54
},
{
"loss": 10.2593,
"grad_norm": 1.398086428642273,
"learning_rate": 0.0008199002362824888,
"epoch": 0.54
},
{
"loss": 9.8407,
"grad_norm": 1.3308155536651611,
"learning_rate": 0.0008190251159534436,
"epoch": 0.54
},
{
"loss": 10.7467,
"grad_norm": 1.3167645931243896,
"learning_rate": 0.0008181499956243984,
"epoch": 0.55
},
{
"loss": 10.1278,
"grad_norm": 1.935791254043579,
"learning_rate": 0.0008172748752953531,
"epoch": 0.55
},
{
"loss": 9.9477,
"grad_norm": 1.7790919542312622,
"learning_rate": 0.0008163997549663079,
"epoch": 0.55
},
{
"loss": 9.2234,
"grad_norm": 0.8335697650909424,
"learning_rate": 0.0008155246346372626,
"epoch": 0.55
},
{
"loss": 9.8562,
"grad_norm": 2.750474691390991,
"learning_rate": 0.0008146495143082174,
"epoch": 0.56
},
{
"loss": 10.3218,
"grad_norm": 1.4811447858810425,
"learning_rate": 0.0008137743939791722,
"epoch": 0.56
},
{
"loss": 9.6582,
"grad_norm": 1.9921342134475708,
"learning_rate": 0.0008128992736501269,
"epoch": 0.56
},
{
"loss": 9.8513,
"grad_norm": 2.635014295578003,
"learning_rate": 0.0008120241533210817,
"epoch": 0.56
},
{
"loss": 9.8862,
"grad_norm": 1.5898804664611816,
"learning_rate": 0.0008111490329920365,
"epoch": 0.57
},
{
"loss": 9.4721,
"grad_norm": 4.158829689025879,
"learning_rate": 0.0008102739126629912,
"epoch": 0.57
},
{
"loss": 10.1474,
"grad_norm": 1.8269054889678955,
"learning_rate": 0.000809398792333946,
"epoch": 0.57
},
{
"loss": 9.4288,
"grad_norm": 3.384010076522827,
"learning_rate": 0.0008085236720049007,
"epoch": 0.57
},
{
"loss": 10.0144,
"grad_norm": 1.6854453086853027,
"learning_rate": 0.0008076485516758555,
"epoch": 0.58
},
{
"loss": 10.229,
"grad_norm": 2.0812976360321045,
"learning_rate": 0.0008067734313468103,
"epoch": 0.58
},
{
"loss": 9.7204,
"grad_norm": 1.7673369646072388,
"learning_rate": 0.000805898311017765,
"epoch": 0.58
},
{
"loss": 9.6859,
"grad_norm": 2.155963897705078,
"learning_rate": 0.0008050231906887198,
"epoch": 0.59
},
{
"loss": 10.029,
"grad_norm": 1.482950210571289,
"learning_rate": 0.0008041480703596745,
"epoch": 0.59
},
{
"loss": 9.25,
"grad_norm": 2.6473171710968018,
"learning_rate": 0.0008032729500306293,
"epoch": 0.59
},
{
"loss": 10.028,
"grad_norm": 1.3584322929382324,
"learning_rate": 0.0008023978297015841,
"epoch": 0.59
},
{
"loss": 9.6924,
"grad_norm": 1.74970543384552,
"learning_rate": 0.0008015227093725388,
"epoch": 0.6
},
{
"loss": 10.0445,
"grad_norm": 2.0750019550323486,
"learning_rate": 0.0008006475890434936,
"epoch": 0.6
},
{
"loss": 9.7962,
"grad_norm": 7.219356060028076,
"learning_rate": 0.0007997724687144482,
"epoch": 0.6
},
{
"loss": 10.215,
"grad_norm": 1.2369924783706665,
"learning_rate": 0.0007988973483854029,
"epoch": 0.6
},
{
"loss": 9.538,
"grad_norm": 1.9686328172683716,
"learning_rate": 0.0007980222280563577,
"epoch": 0.61
},
{
"loss": 10.2107,
"grad_norm": 1.2081037759780884,
"learning_rate": 0.0007971471077273124,
"epoch": 0.61
},
{
"loss": 9.6709,
"grad_norm": 1.7755659818649292,
"learning_rate": 0.0007962719873982672,
"epoch": 0.61
},
{
"loss": 9.7973,
"grad_norm": 2.226400375366211,
"learning_rate": 0.000795396867069222,
"epoch": 0.61
},
{
"loss": 9.5564,
"grad_norm": 1.2814253568649292,
"learning_rate": 0.0007945217467401767,
"epoch": 0.62
},
{
"loss": 9.7987,
"grad_norm": 2.0225868225097656,
"learning_rate": 0.0007936466264111315,
"epoch": 0.62
},
{
"loss": 10.0866,
"grad_norm": 2.059910774230957,
"learning_rate": 0.0007927715060820862,
"epoch": 0.62
},
{
"loss": 10.031,
"grad_norm": 3.564408779144287,
"learning_rate": 0.000791896385753041,
"epoch": 0.62
},
{
"loss": 9.5562,
"grad_norm": 1.6237695217132568,
"learning_rate": 0.0007910212654239958,
"epoch": 0.63
},
{
"loss": 10.032,
"grad_norm": 1.8051832914352417,
"learning_rate": 0.0007901461450949505,
"epoch": 0.63
},
{
"loss": 9.5223,
"grad_norm": 1.807507872581482,
"learning_rate": 0.0007892710247659053,
"epoch": 0.63
},
{
"loss": 9.4476,
"grad_norm": 1.3200876712799072,
"learning_rate": 0.00078839590443686,
"epoch": 0.64
},
{
"loss": 9.4836,
"grad_norm": 3.295555353164673,
"learning_rate": 0.0007875207841078148,
"epoch": 0.64
},
{
"loss": 9.9695,
"grad_norm": 2.036158561706543,
"learning_rate": 0.0007867331758116741,
"epoch": 0.64
},
{
"loss": 9.414,
"grad_norm": 1.6501291990280151,
"learning_rate": 0.0007858580554826289,
"epoch": 0.64
},
{
"loss": 10.3832,
"grad_norm": 1.3873107433319092,
"learning_rate": 0.0007849829351535836,
"epoch": 0.65
},
{
"loss": 9.6308,
"grad_norm": 1.0633749961853027,
"learning_rate": 0.0007841078148245384,
"epoch": 0.65
},
{
"loss": 9.8861,
"grad_norm": 2.238201141357422,
"learning_rate": 0.0007832326944954931,
"epoch": 0.65
},
{
"loss": 9.9682,
"grad_norm": 1.2320759296417236,
"learning_rate": 0.0007823575741664479,
"epoch": 0.65
},
{
"loss": 9.496,
"grad_norm": 1.8895844221115112,
"learning_rate": 0.0007814824538374027,
"epoch": 0.66
},
{
"loss": 9.9117,
"grad_norm": 1.7297803163528442,
"learning_rate": 0.0007806073335083574,
"epoch": 0.66
},
{
"loss": 10.0705,
"grad_norm": 1.8089996576309204,
"learning_rate": 0.0007797322131793122,
"epoch": 0.66
},
{
"loss": 9.8684,
"grad_norm": 2.4221599102020264,
"learning_rate": 0.000778857092850267,
"epoch": 0.66
},
{
"loss": 9.343,
"grad_norm": 1.869035243988037,
"learning_rate": 0.0007779819725212217,
"epoch": 0.67
},
{
"loss": 9.395,
"grad_norm": 1.5427782535552979,
"learning_rate": 0.0007771068521921765,
"epoch": 0.67
},
{
"loss": 9.3372,
"grad_norm": 1.2343759536743164,
"learning_rate": 0.0007762317318631312,
"epoch": 0.67
},
{
"loss": 10.0514,
"grad_norm": 1.057860016822815,
"learning_rate": 0.000775356611534086,
"epoch": 0.67
},
{
"loss": 9.8897,
"grad_norm": 4.536896228790283,
"learning_rate": 0.0007744814912050408,
"epoch": 0.68
},
{
"loss": 9.7529,
"grad_norm": 2.2841501235961914,
"learning_rate": 0.0007736063708759955,
"epoch": 0.68
},
{
"loss": 9.7393,
"grad_norm": 1.4836674928665161,
"learning_rate": 0.0007727312505469503,
"epoch": 0.68
},
{
"loss": 9.4403,
"grad_norm": 1.9073762893676758,
"learning_rate": 0.000771856130217905,
"epoch": 0.69
},
{
"loss": 9.8424,
"grad_norm": 2.367785930633545,
"learning_rate": 0.0007709810098888598,
"epoch": 0.69
},
{
"loss": 9.5098,
"grad_norm": 0.824318528175354,
"learning_rate": 0.0007701058895598146,
"epoch": 0.69
},
{
"loss": 9.4785,
"grad_norm": 1.2716361284255981,
"learning_rate": 0.0007692307692307693,
"epoch": 0.69
},
{
"loss": 9.8112,
"grad_norm": 2.1307737827301025,
"learning_rate": 0.0007683556489017241,
"epoch": 0.7
},
{
"loss": 9.5932,
"grad_norm": 2.0558087825775146,
"learning_rate": 0.0007674805285726788,
"epoch": 0.7
},
{
"loss": 9.5525,
"grad_norm": 1.582262396812439,
"learning_rate": 0.0007666054082436335,
"epoch": 0.7
},
{
"loss": 9.8359,
"grad_norm": 7.788843154907227,
"learning_rate": 0.0007657302879145883,
"epoch": 0.7
},
{
"loss": 10.3724,
"grad_norm": 1.328479528427124,
"learning_rate": 0.000764855167585543,
"epoch": 0.71
},
{
"loss": 8.8465,
"grad_norm": 1.6026923656463623,
"learning_rate": 0.0007639800472564978,
"epoch": 0.71
},
{
"loss": 9.4257,
"grad_norm": 4.00112247467041,
"learning_rate": 0.0007631049269274526,
"epoch": 0.71
},
{
"loss": 9.4006,
"grad_norm": 1.2519035339355469,
"learning_rate": 0.0007622298065984073,
"epoch": 0.71
},
{
"loss": 9.2469,
"grad_norm": 1.0302975177764893,
"learning_rate": 0.0007613546862693621,
"epoch": 0.72
},
{
"loss": 9.6992,
"grad_norm": 1.066437840461731,
"learning_rate": 0.0007604795659403168,
"epoch": 0.72
},
{
"loss": 8.9602,
"grad_norm": 1.232923984527588,
"learning_rate": 0.0007596044456112715,
"epoch": 0.72
},
{
"loss": 10.1371,
"grad_norm": 2.129009962081909,
"learning_rate": 0.0007587293252822263,
"epoch": 0.72
},
{
"loss": 9.3879,
"grad_norm": 1.385560154914856,
"learning_rate": 0.000757854204953181,
"epoch": 0.73
},
{
"loss": 9.898,
"grad_norm": 15.102237701416016,
"learning_rate": 0.0007569790846241358,
"epoch": 0.73
},
{
"loss": 9.723,
"grad_norm": 1.5371789932250977,
"learning_rate": 0.0007561039642950905,
"epoch": 0.73
},
{
"loss": 9.5436,
"grad_norm": 1.3847825527191162,
"learning_rate": 0.0007552288439660453,
"epoch": 0.73
},
{
"loss": 9.4084,
"grad_norm": 2.662229299545288,
"learning_rate": 0.0007543537236370001,
"epoch": 0.74
},
{
"loss": 9.6916,
"grad_norm": 1.3952440023422241,
"learning_rate": 0.0007534786033079548,
"epoch": 0.74
},
{
"loss": 9.2971,
"grad_norm": 2.79449725151062,
"learning_rate": 0.0007526034829789096,
"epoch": 0.74
},
{
"loss": 9.6677,
"grad_norm": 0.959707498550415,
"learning_rate": 0.0007517283626498644,
"epoch": 0.75
},
{
"loss": 9.5952,
"grad_norm": 1.7505630254745483,
"learning_rate": 0.0007508532423208191,
"epoch": 0.75
},
{
"loss": 9.901,
"grad_norm": 4.176792621612549,
"learning_rate": 0.0007499781219917739,
"epoch": 0.75
},
{
"loss": 9.5036,
"grad_norm": 2.338407516479492,
"learning_rate": 0.0007491030016627286,
"epoch": 0.75
},
{
"loss": 10.0173,
"grad_norm": 1.4003384113311768,
"learning_rate": 0.0007482278813336834,
"epoch": 0.76
},
{
"loss": 9.7204,
"grad_norm": 2.0305333137512207,
"learning_rate": 0.0007473527610046382,
"epoch": 0.76
},
{
"loss": 9.7901,
"grad_norm": 2.2396442890167236,
"learning_rate": 0.0007464776406755928,
"epoch": 0.76
},
{
"loss": 9.5465,
"grad_norm": 3.230546474456787,
"learning_rate": 0.0007456025203465476,
"epoch": 0.76
},
{
"loss": 8.9817,
"grad_norm": 3.14975643157959,
"learning_rate": 0.0007447274000175023,
"epoch": 0.77
},
{
"loss": 10.0403,
"grad_norm": 2.1714890003204346,
"learning_rate": 0.0007438522796884571,
"epoch": 0.77
},
{
"loss": 9.054,
"grad_norm": 1.8472590446472168,
"learning_rate": 0.0007429771593594119,
"epoch": 0.77
},
{
"loss": 9.4847,
"grad_norm": 1.0868862867355347,
"learning_rate": 0.0007421020390303666,
"epoch": 0.77
},
{
"loss": 9.5688,
"grad_norm": 0.9088165760040283,
"learning_rate": 0.0007412269187013214,
"epoch": 0.78
},
{
"loss": 9.2655,
"grad_norm": 1.2336516380310059,
"learning_rate": 0.0007403517983722762,
"epoch": 0.78
},
{
"loss": 9.6194,
"grad_norm": 1.2794588804244995,
"learning_rate": 0.0007394766780432309,
"epoch": 0.78
},
{
"loss": 9.4072,
"grad_norm": 1.5056113004684448,
"learning_rate": 0.0007386015577141857,
"epoch": 0.78
},
{
"loss": 8.781,
"grad_norm": 1.809520959854126,
"learning_rate": 0.0007377264373851404,
"epoch": 0.79
},
{
"loss": 9.3203,
"grad_norm": 3.1000723838806152,
"learning_rate": 0.0007368513170560952,
"epoch": 0.79
},
{
"loss": 9.3199,
"grad_norm": 4.879993915557861,
"learning_rate": 0.00073597619672705,
"epoch": 0.79
},
{
"loss": 10.2243,
"grad_norm": 1.508380651473999,
"learning_rate": 0.0007351010763980047,
"epoch": 0.8
},
{
"loss": 9.3476,
"grad_norm": 1.2379094362258911,
"learning_rate": 0.0007342259560689595,
"epoch": 0.8
},
{
"loss": 9.3482,
"grad_norm": 1.3472929000854492,
"learning_rate": 0.0007333508357399142,
"epoch": 0.8
},
{
"loss": 9.1645,
"grad_norm": 1.2490941286087036,
"learning_rate": 0.000732475715410869,
"epoch": 0.8
},
{
"loss": 9.8443,
"grad_norm": 1.3615162372589111,
"learning_rate": 0.0007316005950818238,
"epoch": 0.81
},
{
"loss": 9.1608,
"grad_norm": 1.608033299446106,
"learning_rate": 0.0007307254747527785,
"epoch": 0.81
},
{
"loss": 9.5366,
"grad_norm": 1.819758415222168,
"learning_rate": 0.0007298503544237333,
"epoch": 0.81
},
{
"loss": 9.3414,
"grad_norm": 1.190049409866333,
"learning_rate": 0.0007289752340946881,
"epoch": 0.81
},
{
"loss": 9.3362,
"grad_norm": 1.136693000793457,
"learning_rate": 0.0007281001137656428,
"epoch": 0.82
},
{
"loss": 9.4184,
"grad_norm": 1.3066457509994507,
"learning_rate": 0.0007272249934365976,
"epoch": 0.82
},
{
"loss": 9.3295,
"grad_norm": 2.193195343017578,
"learning_rate": 0.0007263498731075523,
"epoch": 0.82
},
{
"loss": 9.0824,
"grad_norm": 1.2458583116531372,
"learning_rate": 0.0007254747527785071,
"epoch": 0.82
},
{
"loss": 9.4671,
"grad_norm": 1.4734137058258057,
"learning_rate": 0.0007245996324494618,
"epoch": 0.83
},
{
"loss": 8.8882,
"grad_norm": 1.8609868288040161,
"learning_rate": 0.0007237245121204165,
"epoch": 0.83
},
{
"loss": 9.8334,
"grad_norm": 1.2084137201309204,
"learning_rate": 0.0007228493917913713,
"epoch": 0.83
},
{
"loss": 9.301,
"grad_norm": 1.3520543575286865,
"learning_rate": 0.000721974271462326,
"epoch": 0.83
},
{
"loss": 9.4308,
"grad_norm": 1.7796053886413574,
"learning_rate": 0.0007210991511332808,
"epoch": 0.84
},
{
"loss": 9.2915,
"grad_norm": 1.583756685256958,
"learning_rate": 0.0007202240308042356,
"epoch": 0.84
},
{
"loss": 9.543,
"grad_norm": 1.3439078330993652,
"learning_rate": 0.0007193489104751903,
"epoch": 0.84
},
{
"loss": 9.4767,
"grad_norm": 1.0626850128173828,
"learning_rate": 0.0007184737901461451,
"epoch": 0.85
},
{
"loss": 9.6831,
"grad_norm": 1.559846043586731,
"learning_rate": 0.0007175986698170999,
"epoch": 0.85
},
{
"loss": 9.3683,
"grad_norm": 1.3399856090545654,
"learning_rate": 0.0007167235494880546,
"epoch": 0.85
},
{
"loss": 9.4018,
"grad_norm": 2.0115649700164795,
"learning_rate": 0.0007158484291590094,
"epoch": 0.85
},
{
"loss": 9.6007,
"grad_norm": 1.9016413688659668,
"learning_rate": 0.0007149733088299641,
"epoch": 0.86
},
{
"loss": 9.7843,
"grad_norm": 9.662792205810547,
"learning_rate": 0.0007140981885009189,
"epoch": 0.86
},
{
"loss": 9.4248,
"grad_norm": 0.9219140410423279,
"learning_rate": 0.0007132230681718737,
"epoch": 0.86
},
{
"loss": 9.8659,
"grad_norm": 1.0851889848709106,
"learning_rate": 0.0007123479478428284,
"epoch": 0.86
},
{
"loss": 9.1677,
"grad_norm": 1.0349225997924805,
"learning_rate": 0.0007114728275137832,
"epoch": 0.87
},
{
"loss": 9.1666,
"grad_norm": 1.286309003829956,
"learning_rate": 0.000710597707184738,
"epoch": 0.87
},
{
"loss": 9.5514,
"grad_norm": 1.0325031280517578,
"learning_rate": 0.0007097225868556927,
"epoch": 0.87
},
{
"loss": 9.2542,
"grad_norm": 1.2344691753387451,
"learning_rate": 0.0007088474665266475,
"epoch": 0.87
},
{
"loss": 9.1687,
"grad_norm": 0.9820197224617004,
"learning_rate": 0.0007079723461976022,
"epoch": 0.88
},
{
"loss": 8.9295,
"grad_norm": 2.573585033416748,
"learning_rate": 0.000707097225868557,
"epoch": 0.88
},
{
"loss": 9.6702,
"grad_norm": 0.8707136511802673,
"learning_rate": 0.0007062221055395118,
"epoch": 0.88
},
{
"loss": 8.5564,
"grad_norm": 0.9832028150558472,
"learning_rate": 0.0007053469852104665,
"epoch": 0.88
},
{
"loss": 9.426,
"grad_norm": 2.1577107906341553,
"learning_rate": 0.0007044718648814213,
"epoch": 0.89
},
{
"loss": 9.2118,
"grad_norm": 1.6314407587051392,
"learning_rate": 0.000703596744552376,
"epoch": 0.89
},
{
"loss": 9.4482,
"grad_norm": 1.6563376188278198,
"learning_rate": 0.0007027216242233307,
"epoch": 0.89
},
{
"loss": 10.1221,
"grad_norm": 1.3398720026016235,
"learning_rate": 0.0007018465038942855,
"epoch": 0.9
},
{
"loss": 9.2569,
"grad_norm": 1.2780015468597412,
"learning_rate": 0.0007009713835652402,
"epoch": 0.9
},
{
"loss": 9.1485,
"grad_norm": 1.3434102535247803,
"learning_rate": 0.000700096263236195,
"epoch": 0.9
},
{
"loss": 9.3431,
"grad_norm": 2.2103283405303955,
"learning_rate": 0.0006992211429071497,
"epoch": 0.9
},
{
"loss": 9.5529,
"grad_norm": 2.479997158050537,
"learning_rate": 0.0006983460225781045,
"epoch": 0.91
},
{
"loss": 8.5835,
"grad_norm": 1.3891953229904175,
"learning_rate": 0.0006974709022490593,
"epoch": 0.91
},
{
"loss": 8.835,
"grad_norm": 0.9400926828384399,
"learning_rate": 0.000696595781920014,
"epoch": 0.91
},
{
"loss": 9.1069,
"grad_norm": 1.2385962009429932,
"learning_rate": 0.0006957206615909688,
"epoch": 0.91
},
{
"loss": 9.2235,
"grad_norm": 1.7397691011428833,
"learning_rate": 0.0006948455412619236,
"epoch": 0.92
},
{
"loss": 9.2386,
"grad_norm": 1.7163151502609253,
"learning_rate": 0.0006939704209328783,
"epoch": 0.92
},
{
"loss": 8.7562,
"grad_norm": 1.5626498460769653,
"learning_rate": 0.0006930953006038331,
"epoch": 0.92
},
{
"loss": 8.8432,
"grad_norm": 1.9265193939208984,
"learning_rate": 0.0006922201802747878,
"epoch": 0.92
},
{
"loss": 8.8117,
"grad_norm": 1.4459571838378906,
"learning_rate": 0.0006913450599457426,
"epoch": 0.93
},
{
"loss": 9.439,
"grad_norm": 0.9559070467948914,
"learning_rate": 0.0006904699396166974,
"epoch": 0.93
},
{
"loss": 9.1912,
"grad_norm": 1.9344050884246826,
"learning_rate": 0.0006895948192876521,
"epoch": 0.93
},
{
"loss": 9.5571,
"grad_norm": 1.52436101436615,
"learning_rate": 0.0006887196989586069,
"epoch": 0.93
},
{
"loss": 8.9898,
"grad_norm": 1.4828134775161743,
"learning_rate": 0.0006878445786295616,
"epoch": 0.94
},
{
"loss": 9.1776,
"grad_norm": 1.4312185049057007,
"learning_rate": 0.0006869694583005164,
"epoch": 0.94
},
{
"loss": 10.1621,
"grad_norm": 1.2089942693710327,
"learning_rate": 0.0006860943379714712,
"epoch": 0.94
},
{
"loss": 8.8634,
"grad_norm": 5.034254550933838,
"learning_rate": 0.0006852192176424259,
"epoch": 0.94
},
{
"loss": 9.1892,
"grad_norm": 2.494285821914673,
"learning_rate": 0.0006843440973133807,
"epoch": 0.95
},
{
"loss": 8.6028,
"grad_norm": 1.5366199016571045,
"learning_rate": 0.0006834689769843355,
"epoch": 0.95
},
{
"loss": 9.0938,
"grad_norm": 1.1272014379501343,
"learning_rate": 0.0006825938566552902,
"epoch": 0.95
},
{
"loss": 9.607,
"grad_norm": 3.852747917175293,
"learning_rate": 0.000681718736326245,
"epoch": 0.96
},
{
"loss": 9.6214,
"grad_norm": 1.9155749082565308,
"learning_rate": 0.0006808436159971996,
"epoch": 0.96
},
{
"loss": 8.6868,
"grad_norm": 1.9045560359954834,
"learning_rate": 0.0006799684956681543,
"epoch": 0.96
},
{
"loss": 9.8133,
"grad_norm": 1.4083536863327026,
"learning_rate": 0.000679093375339109,
"epoch": 0.96
},
{
"loss": 9.2029,
"grad_norm": 4.824470043182373,
"learning_rate": 0.0006782182550100638,
"epoch": 0.97
},
{
"loss": 9.3758,
"grad_norm": 1.2905750274658203,
"learning_rate": 0.0006773431346810186,
"epoch": 0.97
},
{
"loss": 9.2105,
"grad_norm": 1.4681618213653564,
"learning_rate": 0.0006764680143519733,
"epoch": 0.97
},
{
"loss": 9.1096,
"grad_norm": 1.5041123628616333,
"learning_rate": 0.0006755928940229281,
"epoch": 0.97
},
{
"loss": 9.1485,
"grad_norm": 1.7930779457092285,
"learning_rate": 0.0006747177736938829,
"epoch": 0.98
},
{
"loss": 9.2587,
"grad_norm": 1.1871591806411743,
"learning_rate": 0.0006738426533648376,
"epoch": 0.98
},
{
"loss": 9.2174,
"grad_norm": 1.550445556640625,
"learning_rate": 0.0006729675330357924,
"epoch": 0.98
},
{
"loss": 8.8521,
"grad_norm": 1.361382007598877,
"learning_rate": 0.0006720924127067471,
"epoch": 0.98
},
{
"loss": 9.0098,
"grad_norm": 1.350142002105713,
"learning_rate": 0.0006712172923777019,
"epoch": 0.99
},
{
"loss": 8.6736,
"grad_norm": 1.2662369012832642,
"learning_rate": 0.0006703421720486567,
"epoch": 0.99
},
{
"loss": 8.9752,
"grad_norm": 1.474623441696167,
"learning_rate": 0.0006694670517196114,
"epoch": 0.99
},
{
"loss": 8.7473,
"grad_norm": 2.676971912384033,
"learning_rate": 0.0006685919313905662,
"epoch": 0.99
},
{
"loss": 8.8512,
"grad_norm": 1.114418625831604,
"learning_rate": 0.000667716811061521,
"epoch": 1.0
},
{
"loss": 8.1921,
"grad_norm": 2.0294203758239746,
"learning_rate": 0.0006668416907324757,
"epoch": 1.0
},
{
"loss": 8.8171,
"grad_norm": 0.9778627157211304,
"learning_rate": 0.0006659665704034305,
"epoch": 1.0
},
{
"loss": 8.8809,
"grad_norm": 1.621929407119751,
"learning_rate": 0.0006650914500743852,
"epoch": 1.01
},
{
"loss": 8.9527,
"grad_norm": 1.0340059995651245,
"learning_rate": 0.00066421632974534,
"epoch": 1.01
},
{
"loss": 8.6295,
"grad_norm": 1.4925633668899536,
"learning_rate": 0.0006633412094162948,
"epoch": 1.01
},
{
"loss": 8.7158,
"grad_norm": 1.3651670217514038,
"learning_rate": 0.0006624660890872495,
"epoch": 1.01
},
{
"loss": 9.0606,
"grad_norm": 1.1281485557556152,
"learning_rate": 0.0006615909687582043,
"epoch": 1.02
},
{
"loss": 8.8925,
"grad_norm": 1.0784941911697388,
"learning_rate": 0.000660715848429159,
"epoch": 1.02
},
{
"loss": 9.1237,
"grad_norm": 1.49080228805542,
"learning_rate": 0.0006598407281001137,
"epoch": 1.02
},
{
"loss": 8.9093,
"grad_norm": 1.080828309059143,
"learning_rate": 0.0006589656077710685,
"epoch": 1.02
},
{
"loss": 8.9275,
"grad_norm": 1.0867069959640503,
"learning_rate": 0.0006580904874420232,
"epoch": 1.03
},
{
"loss": 8.5924,
"grad_norm": 1.0178778171539307,
"learning_rate": 0.000657215367112978,
"epoch": 1.03
},
{
"loss": 8.8768,
"grad_norm": 0.978421688079834,
"learning_rate": 0.0006563402467839327,
"epoch": 1.03
},
{
"loss": 8.8812,
"grad_norm": 1.6234030723571777,
"learning_rate": 0.0006554651264548875,
"epoch": 1.03
},
{
"loss": 9.5212,
"grad_norm": 5.744367599487305,
"learning_rate": 0.0006545900061258423,
"epoch": 1.04
},
{
"loss": 8.066,
"grad_norm": 3.1010031700134277,
"learning_rate": 0.000653714885796797,
"epoch": 1.04
},
{
"loss": 8.8401,
"grad_norm": 1.4084874391555786,
"learning_rate": 0.0006528397654677518,
"epoch": 1.04
},
{
"loss": 9.1554,
"grad_norm": 1.4125443696975708,
"learning_rate": 0.0006519646451387066,
"epoch": 1.04
},
{
"loss": 8.5098,
"grad_norm": 1.0087417364120483,
"learning_rate": 0.0006510895248096613,
"epoch": 1.05
},
{
"loss": 8.6227,
"grad_norm": 1.404480218887329,
"learning_rate": 0.0006502144044806161,
"epoch": 1.05
},
{
"loss": 8.7843,
"grad_norm": 1.1295698881149292,
"learning_rate": 0.0006493392841515708,
"epoch": 1.05
},
{
"loss": 8.6766,
"grad_norm": 1.0821887254714966,
"learning_rate": 0.0006484641638225256,
"epoch": 1.06
},
{
"loss": 8.6414,
"grad_norm": 1.1444706916809082,
"learning_rate": 0.0006475890434934804,
"epoch": 1.06
},
{
"loss": 8.457,
"grad_norm": 1.277224063873291,
"learning_rate": 0.0006467139231644351,
"epoch": 1.06
},
{
"loss": 9.058,
"grad_norm": 1.4391515254974365,
"learning_rate": 0.0006458388028353899,
"epoch": 1.06
},
{
"loss": 9.0137,
"grad_norm": 1.1909124851226807,
"learning_rate": 0.0006449636825063447,
"epoch": 1.07
},
{
"loss": 8.95,
"grad_norm": 1.1959373950958252,
"learning_rate": 0.0006440885621772994,
"epoch": 1.07
},
{
"loss": 8.7242,
"grad_norm": 1.0742520093917847,
"learning_rate": 0.0006432134418482542,
"epoch": 1.07
},
{
"loss": 8.6848,
"grad_norm": 1.1215168237686157,
"learning_rate": 0.0006423383215192089,
"epoch": 1.07
},
{
"loss": 8.2356,
"grad_norm": 1.329377293586731,
"learning_rate": 0.0006414632011901637,
"epoch": 1.08
},
{
"loss": 9.357,
"grad_norm": 1.2252676486968994,
"learning_rate": 0.0006405880808611185,
"epoch": 1.08
},
{
"loss": 8.9564,
"grad_norm": 1.4522862434387207,
"learning_rate": 0.0006397129605320732,
"epoch": 1.08
},
{
"loss": 9.315,
"grad_norm": 1.3707520961761475,
"learning_rate": 0.000638837840203028,
"epoch": 1.08
},
{
"loss": 8.5879,
"grad_norm": 1.6546357870101929,
"learning_rate": 0.0006379627198739826,
"epoch": 1.09
},
{
"loss": 9.4063,
"grad_norm": 0.9310407638549805,
"learning_rate": 0.0006370875995449374,
"epoch": 1.09
},
{
"loss": 8.8435,
"grad_norm": 0.9878571629524231,
"learning_rate": 0.0006362124792158922,
"epoch": 1.09
},
{
"loss": 9.0975,
"grad_norm": 0.9288727045059204,
"learning_rate": 0.0006353373588868469,
"epoch": 1.09
},
{
"loss": 9.219,
"grad_norm": 0.9407894015312195,
"learning_rate": 0.0006344622385578017,
"epoch": 1.1
},
{
"loss": 8.6555,
"grad_norm": 0.9899985790252686,
"learning_rate": 0.0006335871182287564,
"epoch": 1.1
},
{
"loss": 8.1403,
"grad_norm": 0.8422369360923767,
"learning_rate": 0.0006327119978997112,
"epoch": 1.1
},
{
"loss": 8.5879,
"grad_norm": 1.1602038145065308,
"learning_rate": 0.000631836877570666,
"epoch": 1.11
},
{
"loss": 8.8147,
"grad_norm": 1.0149036645889282,
"learning_rate": 0.0006309617572416207,
"epoch": 1.11
},
{
"loss": 8.6708,
"grad_norm": 1.3015429973602295,
"learning_rate": 0.0006300866369125755,
"epoch": 1.11
},
{
"loss": 8.213,
"grad_norm": 1.0710703134536743,
"learning_rate": 0.0006292115165835303,
"epoch": 1.11
},
{
"loss": 8.7651,
"grad_norm": 0.9002228379249573,
"learning_rate": 0.000628336396254485,
"epoch": 1.12
},
{
"loss": 9.2161,
"grad_norm": 1.2090556621551514,
"learning_rate": 0.0006274612759254398,
"epoch": 1.12
},
{
"loss": 8.4087,
"grad_norm": 1.2179570198059082,
"learning_rate": 0.0006265861555963945,
"epoch": 1.12
},
{
"loss": 8.5906,
"grad_norm": 1.7626177072525024,
"learning_rate": 0.0006257110352673493,
"epoch": 1.12
},
{
"loss": 8.7996,
"grad_norm": 1.2657760381698608,
"learning_rate": 0.0006248359149383041,
"epoch": 1.13
},
{
"loss": 8.7193,
"grad_norm": 0.8737196326255798,
"learning_rate": 0.0006239607946092588,
"epoch": 1.13
},
{
"loss": 8.5347,
"grad_norm": 1.1074841022491455,
"learning_rate": 0.0006230856742802136,
"epoch": 1.13
},
{
"loss": 8.8374,
"grad_norm": 1.264391303062439,
"learning_rate": 0.0006222105539511684,
"epoch": 1.13
},
{
"loss": 7.9866,
"grad_norm": 1.0013505220413208,
"learning_rate": 0.0006213354336221231,
"epoch": 1.14
},
{
"loss": 8.1635,
"grad_norm": 1.0330276489257812,
"learning_rate": 0.0006204603132930779,
"epoch": 1.14
},
{
"loss": 8.1751,
"grad_norm": 1.125343918800354,
"learning_rate": 0.0006195851929640326,
"epoch": 1.14
},
{
"loss": 9.082,
"grad_norm": 1.0461503267288208,
"learning_rate": 0.0006187100726349874,
"epoch": 1.14
},
{
"loss": 8.4013,
"grad_norm": 1.2671931982040405,
"learning_rate": 0.0006178349523059422,
"epoch": 1.15
},
{
"loss": 8.735,
"grad_norm": 1.315640926361084,
"learning_rate": 0.0006169598319768969,
"epoch": 1.15
},
{
"loss": 8.3872,
"grad_norm": 1.0746458768844604,
"learning_rate": 0.0006160847116478516,
"epoch": 1.15
},
{
"loss": 8.4791,
"grad_norm": 0.9568318724632263,
"learning_rate": 0.0006152095913188063,
"epoch": 1.15
},
{
"loss": 8.4284,
"grad_norm": 1.0956138372421265,
"learning_rate": 0.0006143344709897611,
"epoch": 1.16
},
{
"loss": 9.1513,
"grad_norm": 1.2635217905044556,
"learning_rate": 0.0006134593506607159,
"epoch": 1.16
},
{
"loss": 8.7084,
"grad_norm": 1.242577075958252,
"learning_rate": 0.0006125842303316706,
"epoch": 1.16
},
{
"loss": 8.9941,
"grad_norm": 1.0156121253967285,
"learning_rate": 0.0006117091100026254,
"epoch": 1.17
},
{
"loss": 8.731,
"grad_norm": 1.3975499868392944,
"learning_rate": 0.0006108339896735801,
"epoch": 1.17
},
{
"loss": 8.5287,
"grad_norm": 1.0764504671096802,
"learning_rate": 0.0006099588693445349,
"epoch": 1.17
},
{
"loss": 8.2368,
"grad_norm": 1.0151234865188599,
"learning_rate": 0.0006090837490154897,
"epoch": 1.17
},
{
"loss": 9.1091,
"grad_norm": 6.751773834228516,
"learning_rate": 0.0006082086286864444,
"epoch": 1.18
},
{
"loss": 8.7919,
"grad_norm": 0.95284503698349,
"learning_rate": 0.0006073335083573992,
"epoch": 1.18
},
{
"loss": 8.0937,
"grad_norm": 1.131046175956726,
"learning_rate": 0.000606458388028354,
"epoch": 1.18
},
{
"loss": 8.4255,
"grad_norm": 0.8307482600212097,
"learning_rate": 0.0006055832676993087,
"epoch": 1.18
},
{
"loss": 8.3428,
"grad_norm": 1.1681163311004639,
"learning_rate": 0.0006047081473702635,
"epoch": 1.19
},
{
"loss": 8.1699,
"grad_norm": 1.6491031646728516,
"learning_rate": 0.0006038330270412182,
"epoch": 1.19
},
{
"loss": 8.3981,
"grad_norm": 0.9328737258911133,
"learning_rate": 0.000602957906712173,
"epoch": 1.19
},
{
"loss": 8.5749,
"grad_norm": 1.3434003591537476,
"learning_rate": 0.0006020827863831278,
"epoch": 1.19
},
{
"loss": 8.6492,
"grad_norm": 1.1651496887207031,
"learning_rate": 0.0006012076660540825,
"epoch": 1.2
},
{
"loss": 8.9343,
"grad_norm": 1.1224288940429688,
"learning_rate": 0.0006003325457250373,
"epoch": 1.2
},
{
"loss": 8.4265,
"grad_norm": 1.1075445413589478,
"learning_rate": 0.0005994574253959919,
"epoch": 1.2
},
{
"loss": 8.3367,
"grad_norm": 1.0349383354187012,
"learning_rate": 0.0005985823050669467,
"epoch": 1.2
},
{
"loss": 8.6752,
"grad_norm": 0.9915909767150879,
"learning_rate": 0.0005977071847379015,
"epoch": 1.21
},
{
"loss": 8.2193,
"grad_norm": 1.172624111175537,
"learning_rate": 0.0005968320644088562,
"epoch": 1.21
},
{
"loss": 7.7701,
"grad_norm": 1.0810112953186035,
"learning_rate": 0.000595956944079811,
"epoch": 1.21
},
{
"loss": 8.9113,
"grad_norm": 1.1411935091018677,
"learning_rate": 0.0005950818237507656,
"epoch": 1.22
},
{
"loss": 8.3426,
"grad_norm": 0.9251805543899536,
"learning_rate": 0.0005942067034217204,
"epoch": 1.22
},
{
"loss": 8.1973,
"grad_norm": 0.9023226499557495,
"learning_rate": 0.0005933315830926752,
"epoch": 1.22
},
{
"loss": 8.8777,
"grad_norm": 0.9467354416847229,
"learning_rate": 0.0005924564627636299,
"epoch": 1.22
},
{
"loss": 8.758,
"grad_norm": 0.9941525459289551,
"learning_rate": 0.0005915813424345847,
"epoch": 1.23
},
{
"loss": 8.6786,
"grad_norm": 0.7721539735794067,
"learning_rate": 0.0005907062221055395,
"epoch": 1.23
},
{
"loss": 8.7063,
"grad_norm": 0.9968111515045166,
"learning_rate": 0.0005898311017764942,
"epoch": 1.23
},
{
"loss": 8.4121,
"grad_norm": 0.8019425272941589,
"learning_rate": 0.000588955981447449,
"epoch": 1.23
},
{
"loss": 8.8181,
"grad_norm": 1.1664308309555054,
"learning_rate": 0.0005880808611184037,
"epoch": 1.24
},
{
"loss": 8.1548,
"grad_norm": 1.008786678314209,
"learning_rate": 0.0005872057407893585,
"epoch": 1.24
},
{
"loss": 8.5725,
"grad_norm": 1.2349562644958496,
"learning_rate": 0.0005863306204603133,
"epoch": 1.24
},
{
"loss": 8.8339,
"grad_norm": 1.2367397546768188,
"learning_rate": 0.000585455500131268,
"epoch": 1.24
},
{
"loss": 8.3184,
"grad_norm": 0.9427123665809631,
"learning_rate": 0.0005845803798022228,
"epoch": 1.25
},
{
"loss": 8.2814,
"grad_norm": 0.951808512210846,
"learning_rate": 0.0005837052594731775,
"epoch": 1.25
},
{
"loss": 8.1453,
"grad_norm": 1.076816439628601,
"learning_rate": 0.0005828301391441323,
"epoch": 1.25
},
{
"loss": 8.5114,
"grad_norm": 1.248741865158081,
"learning_rate": 0.0005819550188150871,
"epoch": 1.25
},
{
"loss": 8.7265,
"grad_norm": 1.0166980028152466,
"learning_rate": 0.0005810798984860418,
"epoch": 1.26
},
{
"loss": 9.0454,
"grad_norm": 1.273942232131958,
"learning_rate": 0.0005802047781569966,
"epoch": 1.26
},
{
"loss": 8.6499,
"grad_norm": 0.8551316857337952,
"learning_rate": 0.0005793296578279514,
"epoch": 1.26
},
{
"loss": 8.0282,
"grad_norm": 1.0231510400772095,
"learning_rate": 0.0005784545374989061,
"epoch": 1.27
},
{
"loss": 8.5694,
"grad_norm": 0.8138982653617859,
"learning_rate": 0.0005775794171698609,
"epoch": 1.27
},
{
"loss": 8.9449,
"grad_norm": 1.151458978652954,
"learning_rate": 0.0005767042968408156,
"epoch": 1.27
},
{
"loss": 8.5309,
"grad_norm": 1.311020851135254,
"learning_rate": 0.0005758291765117704,
"epoch": 1.27
},
{
"loss": 8.3937,
"grad_norm": 1.0431928634643555,
"learning_rate": 0.0005749540561827252,
"epoch": 1.28
},
{
"loss": 8.0121,
"grad_norm": 0.9487342238426208,
"learning_rate": 0.0005740789358536799,
"epoch": 1.28
},
{
"loss": 8.9756,
"grad_norm": 0.7705584764480591,
"learning_rate": 0.0005732038155246346,
"epoch": 1.28
},
{
"loss": 8.9679,
"grad_norm": 0.9359903335571289,
"learning_rate": 0.0005723286951955893,
"epoch": 1.28
},
{
"loss": 8.0724,
"grad_norm": 1.031725525856018,
"learning_rate": 0.0005714535748665441,
"epoch": 1.29
},
{
"loss": 8.7014,
"grad_norm": 1.0501611232757568,
"learning_rate": 0.0005705784545374989,
"epoch": 1.29
},
{
"loss": 8.2284,
"grad_norm": 0.8158836960792542,
"learning_rate": 0.0005697033342084536,
"epoch": 1.29
},
{
"loss": 8.8206,
"grad_norm": 0.8827638030052185,
"learning_rate": 0.0005688282138794084,
"epoch": 1.29
},
{
"loss": 8.4189,
"grad_norm": 0.9118880033493042,
"learning_rate": 0.0005679530935503632,
"epoch": 1.3
},
{
"loss": 8.5532,
"grad_norm": 1.2081084251403809,
"learning_rate": 0.0005670779732213179,
"epoch": 1.3
},
{
"loss": 8.5477,
"grad_norm": 1.3465925455093384,
"learning_rate": 0.0005662028528922727,
"epoch": 1.3
},
{
"loss": 9.2068,
"grad_norm": 0.8770077228546143,
"learning_rate": 0.0005653277325632274,
"epoch": 1.3
},
{
"loss": 8.6147,
"grad_norm": 1.1257092952728271,
"learning_rate": 0.0005644526122341822,
"epoch": 1.31
},
{
"loss": 8.4279,
"grad_norm": 1.0482877492904663,
"learning_rate": 0.000563577491905137,
"epoch": 1.31
},
{
"loss": 9.1236,
"grad_norm": 1.0635833740234375,
"learning_rate": 0.0005627023715760917,
"epoch": 1.31
},
{
"loss": 8.7325,
"grad_norm": 0.866674542427063,
"learning_rate": 0.0005618272512470465,
"epoch": 1.32
},
{
"loss": 8.3691,
"grad_norm": 0.9562137126922607,
"learning_rate": 0.0005609521309180012,
"epoch": 1.32
},
{
"loss": 8.3844,
"grad_norm": 1.2593939304351807,
"learning_rate": 0.000560077010588956,
"epoch": 1.32
},
{
"loss": 8.7797,
"grad_norm": 0.8865370154380798,
"learning_rate": 0.0005592018902599108,
"epoch": 1.32
},
{
"loss": 8.7078,
"grad_norm": 1.0417253971099854,
"learning_rate": 0.0005583267699308655,
"epoch": 1.33
},
{
"loss": 8.6024,
"grad_norm": 1.1513303518295288,
"learning_rate": 0.0005574516496018203,
"epoch": 1.33
},
{
"loss": 8.4373,
"grad_norm": 0.8727751970291138,
"learning_rate": 0.000556576529272775,
"epoch": 1.33
},
{
"loss": 8.2888,
"grad_norm": 1.0075277090072632,
"learning_rate": 0.0005557014089437298,
"epoch": 1.33
},
{
"loss": 8.465,
"grad_norm": 0.9511576294898987,
"learning_rate": 0.0005548262886146846,
"epoch": 1.34
},
{
"loss": 7.7129,
"grad_norm": 0.9443394541740417,
"learning_rate": 0.0005539511682856393,
"epoch": 1.34
},
{
"loss": 8.4521,
"grad_norm": 0.9932364225387573,
"learning_rate": 0.0005530760479565941,
"epoch": 1.34
},
{
"loss": 8.2593,
"grad_norm": 0.8069454431533813,
"learning_rate": 0.0005522009276275489,
"epoch": 1.34
},
{
"loss": 8.4721,
"grad_norm": 1.1227058172225952,
"learning_rate": 0.0005513258072985035,
"epoch": 1.35
},
{
"loss": 8.9954,
"grad_norm": 0.8359375596046448,
"learning_rate": 0.0005504506869694583,
"epoch": 1.35
},
{
"loss": 8.6039,
"grad_norm": 1.1721514463424683,
"learning_rate": 0.000549575566640413,
"epoch": 1.35
},
{
"loss": 7.8393,
"grad_norm": 1.031473994255066,
"learning_rate": 0.0005487004463113678,
"epoch": 1.35
},
{
"loss": 7.8643,
"grad_norm": 0.935614287853241,
"learning_rate": 0.0005478253259823226,
"epoch": 1.36
},
{
"loss": 8.4271,
"grad_norm": 0.9366902709007263,
"learning_rate": 0.0005469502056532773,
"epoch": 1.36
},
{
"loss": 8.3338,
"grad_norm": 0.9616496562957764,
"learning_rate": 0.0005460750853242321,
"epoch": 1.36
},
{
"loss": 8.1388,
"grad_norm": 2.2672061920166016,
"learning_rate": 0.0005451999649951868,
"epoch": 1.36
},
{
"loss": 8.879,
"grad_norm": 1.948036789894104,
"learning_rate": 0.0005443248446661416,
"epoch": 1.37
},
{
"loss": 8.8816,
"grad_norm": 1.0832654237747192,
"learning_rate": 0.0005434497243370964,
"epoch": 1.37
},
{
"loss": 8.5489,
"grad_norm": 0.9174715876579285,
"learning_rate": 0.0005425746040080511,
"epoch": 1.37
},
{
"loss": 8.8525,
"grad_norm": 0.8547096252441406,
"learning_rate": 0.0005416994836790059,
"epoch": 1.38
},
{
"loss": 8.6111,
"grad_norm": 0.7524705529212952,
"learning_rate": 0.0005408243633499607,
"epoch": 1.38
},
{
"loss": 8.0862,
"grad_norm": 0.8433651328086853,
"learning_rate": 0.0005399492430209154,
"epoch": 1.38
},
{
"loss": 8.2379,
"grad_norm": 0.8744563460350037,
"learning_rate": 0.0005390741226918702,
"epoch": 1.38
},
{
"loss": 8.2883,
"grad_norm": 0.8806482553482056,
"learning_rate": 0.0005381990023628249,
"epoch": 1.39
},
{
"loss": 8.6411,
"grad_norm": 0.9276745319366455,
"learning_rate": 0.0005373238820337797,
"epoch": 1.39
},
{
"loss": 8.7561,
"grad_norm": 0.9556492567062378,
"learning_rate": 0.0005364487617047345,
"epoch": 1.39
},
{
"loss": 9.305,
"grad_norm": 0.8606293797492981,
"learning_rate": 0.0005355736413756892,
"epoch": 1.39
},
{
"loss": 8.3839,
"grad_norm": 1.108547329902649,
"learning_rate": 0.000534698521046644,
"epoch": 1.4
},
{
"loss": 8.2164,
"grad_norm": 0.9102107882499695,
"learning_rate": 0.0005338234007175988,
"epoch": 1.4
},
{
"loss": 8.606,
"grad_norm": 1.0984998941421509,
"learning_rate": 0.0005329482803885535,
"epoch": 1.4
},
{
"loss": 8.0491,
"grad_norm": 1.1762152910232544,
"learning_rate": 0.0005320731600595083,
"epoch": 1.4
},
{
"loss": 8.7257,
"grad_norm": 0.9669533371925354,
"learning_rate": 0.000531198039730463,
"epoch": 1.41
},
{
"loss": 8.4473,
"grad_norm": 1.0668437480926514,
"learning_rate": 0.0005303229194014178,
"epoch": 1.41
},
{
"loss": 8.1594,
"grad_norm": 0.8289794921875,
"learning_rate": 0.0005294477990723725,
"epoch": 1.41
},
{
"loss": 8.9208,
"grad_norm": 1.0676897764205933,
"learning_rate": 0.0005285726787433272,
"epoch": 1.41
},
{
"loss": 8.0344,
"grad_norm": 0.9914399981498718,
"learning_rate": 0.000527697558414282,
"epoch": 1.42
},
{
"loss": 7.9721,
"grad_norm": 0.7524304986000061,
"learning_rate": 0.0005268224380852367,
"epoch": 1.42
},
{
"loss": 8.5322,
"grad_norm": 0.9521943926811218,
"learning_rate": 0.0005259473177561915,
"epoch": 1.42
},
{
"loss": 8.1986,
"grad_norm": 0.9657976627349854,
"learning_rate": 0.0005250721974271463,
"epoch": 1.43
},
{
"loss": 8.476,
"grad_norm": 0.9338609576225281,
"learning_rate": 0.000524197077098101,
"epoch": 1.43
},
{
"loss": 8.0189,
"grad_norm": 0.8801831007003784,
"learning_rate": 0.0005233219567690558,
"epoch": 1.43
},
{
"loss": 8.0839,
"grad_norm": 0.8173283934593201,
"learning_rate": 0.0005224468364400105,
"epoch": 1.43
},
{
"loss": 8.3716,
"grad_norm": 0.8624017238616943,
"learning_rate": 0.0005215717161109653,
"epoch": 1.44
},
{
"loss": 8.2837,
"grad_norm": 0.8650451302528381,
"learning_rate": 0.0005206965957819201,
"epoch": 1.44
},
{
"loss": 7.889,
"grad_norm": 0.8268963098526001,
"learning_rate": 0.0005198214754528747,
"epoch": 1.44
},
{
"loss": 8.7807,
"grad_norm": 0.9244619607925415,
"learning_rate": 0.0005189463551238295,
"epoch": 1.44
},
{
"loss": 8.5503,
"grad_norm": 0.8533423542976379,
"learning_rate": 0.0005180712347947842,
"epoch": 1.45
},
{
"loss": 7.7895,
"grad_norm": 0.885784924030304,
"learning_rate": 0.000517196114465739,
"epoch": 1.45
},
{
"loss": 8.9325,
"grad_norm": 1.252569556236267,
"learning_rate": 0.0005163209941366938,
"epoch": 1.45
},
{
"loss": 7.6823,
"grad_norm": 0.9340423941612244,
"learning_rate": 0.0005154458738076485,
"epoch": 1.45
},
{
"loss": 8.5812,
"grad_norm": 1.1366244554519653,
"learning_rate": 0.0005145707534786033,
"epoch": 1.46
},
{
"loss": 8.1907,
"grad_norm": 0.6764490604400635,
"learning_rate": 0.0005136956331495581,
"epoch": 1.46
},
{
"loss": 8.7694,
"grad_norm": 0.7598670721054077,
"learning_rate": 0.0005128205128205128,
"epoch": 1.46
},
{
"loss": 8.4732,
"grad_norm": 1.1497093439102173,
"learning_rate": 0.0005119453924914676,
"epoch": 1.46
},
{
"loss": 7.9224,
"grad_norm": 0.8351478576660156,
"learning_rate": 0.0005110702721624223,
"epoch": 1.47
},
{
"loss": 8.253,
"grad_norm": 0.8981735706329346,
"learning_rate": 0.0005101951518333771,
"epoch": 1.47
},
{
"loss": 8.442,
"grad_norm": 0.910393238067627,
"learning_rate": 0.0005093200315043319,
"epoch": 1.47
},
{
"loss": 8.4128,
"grad_norm": 1.0419617891311646,
"learning_rate": 0.0005084449111752865,
"epoch": 1.48
},
{
"loss": 8.5377,
"grad_norm": 1.1774574518203735,
"learning_rate": 0.0005075697908462413,
"epoch": 1.48
},
{
"loss": 8.1727,
"grad_norm": 0.8679039478302002,
"learning_rate": 0.000506694670517196,
"epoch": 1.48
},
{
"loss": 8.2085,
"grad_norm": 0.8273195028305054,
"learning_rate": 0.0005058195501881508,
"epoch": 1.48
},
{
"loss": 9.0157,
"grad_norm": 1.0897700786590576,
"learning_rate": 0.0005049444298591056,
"epoch": 1.49
},
{
"loss": 8.5794,
"grad_norm": 1.19176185131073,
"learning_rate": 0.0005040693095300603,
"epoch": 1.49
},
{
"loss": 8.4796,
"grad_norm": 0.7944311499595642,
"learning_rate": 0.0005031941892010151,
"epoch": 1.49
},
{
"loss": 8.2379,
"grad_norm": 1.1032432317733765,
"learning_rate": 0.0005023190688719699,
"epoch": 1.49
},
{
"loss": 7.8506,
"grad_norm": 0.9756267070770264,
"learning_rate": 0.0005014439485429246,
"epoch": 1.5
},
{
"loss": 8.4113,
"grad_norm": 0.8557083010673523,
"learning_rate": 0.0005005688282138794,
"epoch": 1.5
},
{
"loss": 8.3315,
"grad_norm": 0.9195913672447205,
"learning_rate": 0.0004996937078848341,
"epoch": 1.5
},
{
"loss": 8.3911,
"grad_norm": 0.7430265545845032,
"learning_rate": 0.0004988185875557889,
"epoch": 1.5
},
{
"loss": 8.3471,
"grad_norm": 0.7685049176216125,
"learning_rate": 0.0004979434672267437,
"epoch": 1.51
},
{
"loss": 8.252,
"grad_norm": 0.9667441844940186,
"learning_rate": 0.0004970683468976984,
"epoch": 1.51
},
{
"loss": 7.9134,
"grad_norm": 0.878400981426239,
"learning_rate": 0.0004961932265686532,
"epoch": 1.51
},
{
"loss": 8.337,
"grad_norm": 0.8655962944030762,
"learning_rate": 0.000495318106239608,
"epoch": 1.51
},
{
"loss": 8.2066,
"grad_norm": 0.8063825964927673,
"learning_rate": 0.0004944429859105627,
"epoch": 1.52
},
{
"loss": 8.4102,
"grad_norm": 0.7918370962142944,
"learning_rate": 0.0004935678655815175,
"epoch": 1.52
},
{
"loss": 8.1297,
"grad_norm": 1.03073251247406,
"learning_rate": 0.0004926927452524722,
"epoch": 1.52
},
{
"loss": 8.296,
"grad_norm": 0.9369198679924011,
"learning_rate": 0.000491817624923427,
"epoch": 1.53
},
{
"loss": 7.8051,
"grad_norm": 0.9166183471679688,
"learning_rate": 0.0004909425045943818,
"epoch": 1.53
},
{
"loss": 8.0258,
"grad_norm": 0.8817450404167175,
"learning_rate": 0.0004900673842653365,
"epoch": 1.53
},
{
"loss": 7.9202,
"grad_norm": 1.0320311784744263,
"learning_rate": 0.0004891922639362913,
"epoch": 1.53
},
{
"loss": 8.6314,
"grad_norm": 0.9652658700942993,
"learning_rate": 0.000488317143607246,
"epoch": 1.54
},
{
"loss": 8.5648,
"grad_norm": 1.0785067081451416,
"learning_rate": 0.00048744202327820075,
"epoch": 1.54
},
{
"loss": 7.528,
"grad_norm": 1.0575002431869507,
"learning_rate": 0.0004865669029491555,
"epoch": 1.54
},
{
"loss": 7.9019,
"grad_norm": 0.8822360634803772,
"learning_rate": 0.0004856917826201103,
"epoch": 1.54
},
{
"loss": 8.2544,
"grad_norm": 0.7296998500823975,
"learning_rate": 0.00048481666229106504,
"epoch": 1.55
},
{
"loss": 8.5853,
"grad_norm": 0.925472617149353,
"learning_rate": 0.0004839415419620198,
"epoch": 1.55
},
{
"loss": 8.3512,
"grad_norm": 0.8641199469566345,
"learning_rate": 0.00048306642163297456,
"epoch": 1.55
},
{
"loss": 8.0277,
"grad_norm": 1.0501607656478882,
"learning_rate": 0.0004821913013039293,
"epoch": 1.55
},
{
"loss": 8.0559,
"grad_norm": 0.7827814221382141,
"learning_rate": 0.00048131618097488403,
"epoch": 1.56
},
{
"loss": 8.0869,
"grad_norm": 0.929253339767456,
"learning_rate": 0.0004804410606458388,
"epoch": 1.56
},
{
"loss": 8.2206,
"grad_norm": 0.9882745742797852,
"learning_rate": 0.00047956594031679355,
"epoch": 1.56
},
{
"loss": 8.8141,
"grad_norm": 0.874455988407135,
"learning_rate": 0.0004786908199877483,
"epoch": 1.56
},
{
"loss": 8.403,
"grad_norm": 1.1270105838775635,
"learning_rate": 0.0004778156996587031,
"epoch": 1.57
},
{
"loss": 8.7545,
"grad_norm": 0.7236598134040833,
"learning_rate": 0.00047694057932965784,
"epoch": 1.57
},
{
"loss": 8.3653,
"grad_norm": 0.8243849873542786,
"learning_rate": 0.0004760654590006126,
"epoch": 1.57
},
{
"loss": 8.0057,
"grad_norm": 0.9829972386360168,
"learning_rate": 0.00047519033867156736,
"epoch": 1.57
},
{
"loss": 7.7738,
"grad_norm": 1.1444923877716064,
"learning_rate": 0.0004743152183425221,
"epoch": 1.58
},
{
"loss": 7.9619,
"grad_norm": 1.1846139430999756,
"learning_rate": 0.0004734400980134769,
"epoch": 1.58
},
{
"loss": 8.8667,
"grad_norm": 0.9437428712844849,
"learning_rate": 0.00047256497768443165,
"epoch": 1.58
},
{
"loss": 8.2367,
"grad_norm": 0.8670662641525269,
"learning_rate": 0.0004716898573553864,
"epoch": 1.59
},
{
"loss": 7.5306,
"grad_norm": 0.823538064956665,
"learning_rate": 0.00047081473702634117,
"epoch": 1.59
},
{
"loss": 8.0832,
"grad_norm": 0.8938249349594116,
"learning_rate": 0.0004699396166972959,
"epoch": 1.59
},
{
"loss": 7.7995,
"grad_norm": 0.8147523999214172,
"learning_rate": 0.00046906449636825064,
"epoch": 1.59
},
{
"loss": 8.2207,
"grad_norm": 0.7885489463806152,
"learning_rate": 0.0004681893760392054,
"epoch": 1.6
},
{
"loss": 8.3315,
"grad_norm": 0.9256998300552368,
"learning_rate": 0.00046731425571016016,
"epoch": 1.6
},
{
"loss": 7.8139,
"grad_norm": 0.7331977486610413,
"learning_rate": 0.0004664391353811149,
"epoch": 1.6
},
{
"loss": 8.2015,
"grad_norm": 0.7677296996116638,
"learning_rate": 0.0004655640150520697,
"epoch": 1.6
},
{
"loss": 8.114,
"grad_norm": 1.066036343574524,
"learning_rate": 0.00046468889472302445,
"epoch": 1.61
},
{
"loss": 8.3314,
"grad_norm": 0.7969563603401184,
"learning_rate": 0.0004638137743939792,
"epoch": 1.61
},
{
"loss": 8.4266,
"grad_norm": 0.8454012274742126,
"learning_rate": 0.000462938654064934,
"epoch": 1.61
},
{
"loss": 8.0451,
"grad_norm": 1.049949288368225,
"learning_rate": 0.00046206353373588874,
"epoch": 1.61
},
{
"loss": 7.8993,
"grad_norm": 0.8960159420967102,
"learning_rate": 0.0004611884134068435,
"epoch": 1.62
},
{
"loss": 8.4117,
"grad_norm": 1.0029221773147583,
"learning_rate": 0.00046031329307779826,
"epoch": 1.62
},
{
"loss": 7.9899,
"grad_norm": 1.0616450309753418,
"learning_rate": 0.0004594381727487529,
"epoch": 1.62
},
{
"loss": 7.9134,
"grad_norm": 0.8082576990127563,
"learning_rate": 0.0004585630524197077,
"epoch": 1.62
},
{
"loss": 8.1685,
"grad_norm": 0.9529896974563599,
"learning_rate": 0.00045768793209066244,
"epoch": 1.63
},
{
"loss": 8.7919,
"grad_norm": 0.7967125773429871,
"learning_rate": 0.0004568128117616172,
"epoch": 1.63
},
{
"loss": 8.4375,
"grad_norm": 0.8775154948234558,
"learning_rate": 0.00045593769143257196,
"epoch": 1.63
},
{
"loss": 8.559,
"grad_norm": 0.782707929611206,
"learning_rate": 0.0004550625711035267,
"epoch": 1.64
},
{
"loss": 8.4288,
"grad_norm": 0.7907795310020447,
"learning_rate": 0.0004541874507744815,
"epoch": 1.64
},
{
"loss": 8.5237,
"grad_norm": 1.0685423612594604,
"learning_rate": 0.00045331233044543625,
"epoch": 1.64
},
{
"loss": 8.4464,
"grad_norm": 1.1534669399261475,
"learning_rate": 0.000452437210116391,
"epoch": 1.64
},
{
"loss": 7.8577,
"grad_norm": 0.7411785125732422,
"learning_rate": 0.00045156208978734577,
"epoch": 1.65
},
{
"loss": 7.8189,
"grad_norm": 0.87079256772995,
"learning_rate": 0.00045068696945830053,
"epoch": 1.65
},
{
"loss": 8.1193,
"grad_norm": 0.9850463271141052,
"learning_rate": 0.0004498118491292553,
"epoch": 1.65
},
{
"loss": 7.9457,
"grad_norm": 0.8739660978317261,
"learning_rate": 0.00044893672880021,
"epoch": 1.65
},
{
"loss": 7.728,
"grad_norm": 0.8551336526870728,
"learning_rate": 0.00044806160847116476,
"epoch": 1.66
},
{
"loss": 8.8456,
"grad_norm": 0.8609566688537598,
"learning_rate": 0.0004471864881421195,
"epoch": 1.66
},
{
"loss": 8.0812,
"grad_norm": 0.7449157238006592,
"learning_rate": 0.0004463113678130743,
"epoch": 1.66
},
{
"loss": 8.0729,
"grad_norm": 0.8253002762794495,
"learning_rate": 0.00044543624748402905,
"epoch": 1.66
},
{
"loss": 8.4942,
"grad_norm": 0.8349846601486206,
"learning_rate": 0.0004445611271549838,
"epoch": 1.67
},
{
"loss": 8.3446,
"grad_norm": 0.9881287813186646,
"learning_rate": 0.00044368600682593857,
"epoch": 1.67
},
{
"loss": 7.3313,
"grad_norm": 0.863059401512146,
"learning_rate": 0.00044281088649689333,
"epoch": 1.67
},
{
"loss": 8.4412,
"grad_norm": 0.9246751666069031,
"learning_rate": 0.0004419357661678481,
"epoch": 1.67
},
{
"loss": 8.4511,
"grad_norm": 0.7963143587112427,
"learning_rate": 0.00044106064583880286,
"epoch": 1.68
},
{
"loss": 7.8743,
"grad_norm": 1.0088573694229126,
"learning_rate": 0.0004401855255097576,
"epoch": 1.68
},
{
"loss": 8.0994,
"grad_norm": 0.7680083513259888,
"learning_rate": 0.0004393104051807124,
"epoch": 1.68
},
{
"loss": 7.8712,
"grad_norm": 0.8324389457702637,
"learning_rate": 0.00043843528485166714,
"epoch": 1.69
},
{
"loss": 7.8454,
"grad_norm": 0.9649554491043091,
"learning_rate": 0.00043756016452262185,
"epoch": 1.69
},
{
"loss": 7.925,
"grad_norm": 0.7881239652633667,
"learning_rate": 0.0004366850441935766,
"epoch": 1.69
},
{
"loss": 7.9826,
"grad_norm": 1.2129865884780884,
"learning_rate": 0.0004358099238645314,
"epoch": 1.69
},
{
"loss": 8.3911,
"grad_norm": 0.7000688910484314,
"learning_rate": 0.00043493480353548614,
"epoch": 1.7
},
{
"loss": 7.9635,
"grad_norm": 0.7449495196342468,
"learning_rate": 0.0004340596832064409,
"epoch": 1.7
},
{
"loss": 7.8492,
"grad_norm": 0.7399414777755737,
"learning_rate": 0.00043318456287739566,
"epoch": 1.7
},
{
"loss": 8.5288,
"grad_norm": 1.0965951681137085,
"learning_rate": 0.0004323094425483504,
"epoch": 1.7
},
{
"loss": 8.0104,
"grad_norm": 0.8990981578826904,
"learning_rate": 0.0004314343222193052,
"epoch": 1.71
},
{
"loss": 7.8636,
"grad_norm": 0.8695485591888428,
"learning_rate": 0.00043055920189025994,
"epoch": 1.71
},
{
"loss": 7.9194,
"grad_norm": 0.7813265919685364,
"learning_rate": 0.0004296840815612147,
"epoch": 1.71
},
{
"loss": 8.4535,
"grad_norm": 0.7645956873893738,
"learning_rate": 0.00042880896123216947,
"epoch": 1.71
},
{
"loss": 7.8434,
"grad_norm": 1.0397326946258545,
"learning_rate": 0.00042793384090312423,
"epoch": 1.72
},
{
"loss": 7.8072,
"grad_norm": 0.9630481004714966,
"learning_rate": 0.00042705872057407894,
"epoch": 1.72
},
{
"loss": 8.327,
"grad_norm": 0.7939698696136475,
"learning_rate": 0.0004261836002450337,
"epoch": 1.72
},
{
"loss": 8.2467,
"grad_norm": 1.0103453397750854,
"learning_rate": 0.00042530847991598846,
"epoch": 1.72
},
{
"loss": 7.63,
"grad_norm": 0.9281976819038391,
"learning_rate": 0.0004244333595869432,
"epoch": 1.73
},
{
"loss": 7.7603,
"grad_norm": 0.7895064949989319,
"learning_rate": 0.000423558239257898,
"epoch": 1.73
},
{
"loss": 7.6725,
"grad_norm": 0.7491249442100525,
"learning_rate": 0.00042268311892885275,
"epoch": 1.73
},
{
"loss": 8.0813,
"grad_norm": 0.7357456088066101,
"learning_rate": 0.0004218079985998075,
"epoch": 1.74
},
{
"loss": 8.1603,
"grad_norm": 0.8232001066207886,
"learning_rate": 0.00042093287827076227,
"epoch": 1.74
},
{
"loss": 8.172,
"grad_norm": 0.7846309542655945,
"learning_rate": 0.00042005775794171703,
"epoch": 1.74
},
{
"loss": 8.2372,
"grad_norm": 0.9100042581558228,
"learning_rate": 0.00041918263761267174,
"epoch": 1.74
},
{
"loss": 7.8489,
"grad_norm": 0.9496660828590393,
"learning_rate": 0.0004183075172836265,
"epoch": 1.75
},
{
"loss": 7.7246,
"grad_norm": 0.7061757445335388,
"learning_rate": 0.00041743239695458126,
"epoch": 1.75
},
{
"loss": 7.988,
"grad_norm": 0.9927607774734497,
"learning_rate": 0.00041655727662553597,
"epoch": 1.75
},
{
"loss": 7.9562,
"grad_norm": 0.8585007190704346,
"learning_rate": 0.00041568215629649073,
"epoch": 1.75
},
{
"loss": 8.1105,
"grad_norm": 1.0176628828048706,
"learning_rate": 0.0004148070359674455,
"epoch": 1.76
},
{
"loss": 7.7869,
"grad_norm": 0.8576889038085938,
"learning_rate": 0.00041393191563840026,
"epoch": 1.76
},
{
"loss": 7.7945,
"grad_norm": 0.8359828591346741,
"learning_rate": 0.000413056795309355,
"epoch": 1.76
},
{
"loss": 7.9683,
"grad_norm": 0.8636084794998169,
"learning_rate": 0.0004121816749803098,
"epoch": 1.76
},
{
"loss": 8.3303,
"grad_norm": 0.9006314873695374,
"learning_rate": 0.00041130655465126454,
"epoch": 1.77
},
{
"loss": 8.1457,
"grad_norm": 1.217007040977478,
"learning_rate": 0.0004104314343222193,
"epoch": 1.77
},
{
"loss": 8.6171,
"grad_norm": 1.0577572584152222,
"learning_rate": 0.00040955631399317407,
"epoch": 1.77
},
{
"loss": 7.9349,
"grad_norm": 0.9530831575393677,
"learning_rate": 0.00040868119366412883,
"epoch": 1.77
},
{
"loss": 8.2722,
"grad_norm": 0.9652631282806396,
"learning_rate": 0.0004078060733350836,
"epoch": 1.78
},
{
"loss": 8.185,
"grad_norm": 0.7349383234977722,
"learning_rate": 0.00040693095300603835,
"epoch": 1.78
},
{
"loss": 7.3944,
"grad_norm": 1.122018814086914,
"learning_rate": 0.0004060558326769931,
"epoch": 1.78
},
{
"loss": 7.8828,
"grad_norm": 0.96207195520401,
"learning_rate": 0.0004051807123479478,
"epoch": 1.78
},
{
"loss": 8.1287,
"grad_norm": 0.833884060382843,
"learning_rate": 0.0004043055920189026,
"epoch": 1.79
},
{
"loss": 8.0382,
"grad_norm": 0.9089711904525757,
"learning_rate": 0.00040343047168985734,
"epoch": 1.79
},
{
"loss": 8.1137,
"grad_norm": 0.6977031230926514,
"learning_rate": 0.0004025553513608121,
"epoch": 1.79
},
{
"loss": 7.9215,
"grad_norm": 0.9814949631690979,
"learning_rate": 0.00040168023103176687,
"epoch": 1.8
},
{
"loss": 8.2266,
"grad_norm": 0.9767114520072937,
"learning_rate": 0.00040080511070272163,
"epoch": 1.8
},
{
"loss": 8.3445,
"grad_norm": 1.1093454360961914,
"learning_rate": 0.0003999299903736764,
"epoch": 1.8
},
{
"loss": 8.4239,
"grad_norm": 0.93362957239151,
"learning_rate": 0.00039905487004463115,
"epoch": 1.8
},
{
"loss": 8.2468,
"grad_norm": 0.9497604370117188,
"learning_rate": 0.0003981797497155859,
"epoch": 1.81
},
{
"loss": 7.8793,
"grad_norm": 0.8992236852645874,
"learning_rate": 0.0003973046293865407,
"epoch": 1.81
},
{
"loss": 7.8246,
"grad_norm": 0.9486469030380249,
"learning_rate": 0.00039642950905749544,
"epoch": 1.81
},
{
"loss": 8.6243,
"grad_norm": 0.970136284828186,
"learning_rate": 0.0003955543887284502,
"epoch": 1.81
},
{
"loss": 7.8859,
"grad_norm": 1.0090283155441284,
"learning_rate": 0.0003946792683994049,
"epoch": 1.82
},
{
"loss": 8.156,
"grad_norm": 0.9662021994590759,
"learning_rate": 0.00039380414807035967,
"epoch": 1.82
},
{
"loss": 7.7991,
"grad_norm": 0.8005274534225464,
"learning_rate": 0.00039292902774131443,
"epoch": 1.82
},
{
"loss": 7.8432,
"grad_norm": 0.8537503480911255,
"learning_rate": 0.0003920539074122692,
"epoch": 1.82
},
{
"loss": 7.7118,
"grad_norm": 0.8975428342819214,
"learning_rate": 0.00039117878708322396,
"epoch": 1.83
},
{
"loss": 8.0563,
"grad_norm": 0.9040714502334595,
"learning_rate": 0.0003903036667541787,
"epoch": 1.83
},
{
"loss": 8.005,
"grad_norm": 0.882514476776123,
"learning_rate": 0.0003894285464251335,
"epoch": 1.83
},
{
"loss": 7.99,
"grad_norm": 0.9527498483657837,
"learning_rate": 0.00038855342609608824,
"epoch": 1.83
},
{
"loss": 7.9497,
"grad_norm": 0.7327905893325806,
"learning_rate": 0.000387678305767043,
"epoch": 1.84
},
{
"loss": 8.1346,
"grad_norm": 0.9137473106384277,
"learning_rate": 0.00038680318543799776,
"epoch": 1.84
},
{
"loss": 7.266,
"grad_norm": 0.8273423910140991,
"learning_rate": 0.0003859280651089525,
"epoch": 1.84
},
{
"loss": 7.525,
"grad_norm": 1.2288787364959717,
"learning_rate": 0.0003850529447799073,
"epoch": 1.85
},
{
"loss": 8.5105,
"grad_norm": 0.7940724492073059,
"learning_rate": 0.00038417782445086205,
"epoch": 1.85
},
{
"loss": 8.0599,
"grad_norm": 0.9253759384155273,
"learning_rate": 0.00038330270412181676,
"epoch": 1.85
},
{
"loss": 7.1757,
"grad_norm": 0.8145419359207153,
"learning_rate": 0.0003824275837927715,
"epoch": 1.85
},
{
"loss": 7.6177,
"grad_norm": 1.1738182306289673,
"learning_rate": 0.0003815524634637263,
"epoch": 1.86
},
{
"loss": 7.6901,
"grad_norm": 0.9141517877578735,
"learning_rate": 0.00038067734313468104,
"epoch": 1.86
},
{
"loss": 7.7036,
"grad_norm": 1.0994611978530884,
"learning_rate": 0.00037980222280563575,
"epoch": 1.86
},
{
"loss": 7.9458,
"grad_norm": 0.8445936441421509,
"learning_rate": 0.0003789271024765905,
"epoch": 1.86
},
{
"loss": 7.6019,
"grad_norm": 0.8796238899230957,
"learning_rate": 0.0003780519821475453,
"epoch": 1.87
},
{
"loss": 7.7582,
"grad_norm": 0.7801417112350464,
"learning_rate": 0.00037717686181850004,
"epoch": 1.87
},
{
"loss": 7.8483,
"grad_norm": 1.008893609046936,
"learning_rate": 0.0003763017414894548,
"epoch": 1.87
},
{
"loss": 8.047,
"grad_norm": 0.8021620512008667,
"learning_rate": 0.00037542662116040956,
"epoch": 1.87
},
{
"loss": 8.2537,
"grad_norm": 0.919774055480957,
"learning_rate": 0.0003745515008313643,
"epoch": 1.88
},
{
"loss": 8.1101,
"grad_norm": 1.094642996788025,
"learning_rate": 0.0003736763805023191,
"epoch": 1.88
},
{
"loss": 7.9119,
"grad_norm": 1.0133185386657715,
"learning_rate": 0.0003728012601732738,
"epoch": 1.88
},
{
"loss": 7.9624,
"grad_norm": 0.7546307444572449,
"learning_rate": 0.00037192613984422855,
"epoch": 1.88
},
{
"loss": 7.9547,
"grad_norm": 0.7390889525413513,
"learning_rate": 0.0003710510195151833,
"epoch": 1.89
},
{
"loss": 7.7794,
"grad_norm": 0.9140797257423401,
"learning_rate": 0.0003701758991861381,
"epoch": 1.89
},
{
"loss": 8.0254,
"grad_norm": 0.8325345516204834,
"learning_rate": 0.00036930077885709284,
"epoch": 1.89
},
{
"loss": 7.7692,
"grad_norm": 1.228366732597351,
"learning_rate": 0.0003684256585280476,
"epoch": 1.9
},
{
"loss": 7.2768,
"grad_norm": 1.0541235208511353,
"learning_rate": 0.00036755053819900236,
"epoch": 1.9
},
{
"loss": 8.1104,
"grad_norm": 1.0765891075134277,
"learning_rate": 0.0003666754178699571,
"epoch": 1.9
},
{
"loss": 7.5317,
"grad_norm": 0.9508135914802551,
"learning_rate": 0.0003658002975409119,
"epoch": 1.9
},
{
"loss": 7.1908,
"grad_norm": 0.7984021306037903,
"learning_rate": 0.00036492517721186665,
"epoch": 1.91
},
{
"loss": 7.8423,
"grad_norm": 1.0381263494491577,
"learning_rate": 0.0003640500568828214,
"epoch": 1.91
},
{
"loss": 8.297,
"grad_norm": 0.9509484171867371,
"learning_rate": 0.00036317493655377617,
"epoch": 1.91
},
{
"loss": 7.7339,
"grad_norm": 0.8926167488098145,
"learning_rate": 0.0003622998162247309,
"epoch": 1.91
},
{
"loss": 7.9,
"grad_norm": 1.0550678968429565,
"learning_rate": 0.00036142469589568564,
"epoch": 1.92
},
{
"loss": 7.6175,
"grad_norm": 0.9359092712402344,
"learning_rate": 0.0003605495755666404,
"epoch": 1.92
},
{
"loss": 8.0818,
"grad_norm": 0.735281765460968,
"learning_rate": 0.00035967445523759516,
"epoch": 1.92
},
{
"loss": 8.1061,
"grad_norm": 0.8289329409599304,
"learning_rate": 0.0003587993349085499,
"epoch": 1.92
},
{
"loss": 7.3778,
"grad_norm": 0.7723102569580078,
"learning_rate": 0.0003579242145795047,
"epoch": 1.93
},
{
"loss": 7.853,
"grad_norm": 0.7856701612472534,
"learning_rate": 0.00035704909425045945,
"epoch": 1.93
},
{
"loss": 8.5133,
"grad_norm": 0.7649736404418945,
"learning_rate": 0.0003561739739214142,
"epoch": 1.93
},
{
"loss": 8.4676,
"grad_norm": 0.6755172610282898,
"learning_rate": 0.000355298853592369,
"epoch": 1.93
},
{
"loss": 8.2074,
"grad_norm": 0.8537729382514954,
"learning_rate": 0.00035442373326332374,
"epoch": 1.94
},
{
"loss": 8.249,
"grad_norm": 0.9827852845191956,
"learning_rate": 0.0003535486129342785,
"epoch": 1.94
},
{
"loss": 8.4107,
"grad_norm": 1.2670233249664307,
"learning_rate": 0.00035267349260523326,
"epoch": 1.94
},
{
"loss": 8.1578,
"grad_norm": 0.8494543433189392,
"learning_rate": 0.000351798372276188,
"epoch": 1.95
},
{
"loss": 7.9296,
"grad_norm": 0.8582159876823425,
"learning_rate": 0.00035092325194714273,
"epoch": 1.95
},
{
"loss": 7.4592,
"grad_norm": 0.8539626598358154,
"learning_rate": 0.0003500481316180975,
"epoch": 1.95
},
{
"loss": 8.1603,
"grad_norm": 0.9004923701286316,
"learning_rate": 0.00034917301128905225,
"epoch": 1.95
},
{
"loss": 8.1319,
"grad_norm": 0.722870945930481,
"learning_rate": 0.000348297890960007,
"epoch": 1.96
},
{
"loss": 7.791,
"grad_norm": 0.9422692656517029,
"learning_rate": 0.0003474227706309618,
"epoch": 1.96
},
{
"loss": 8.0631,
"grad_norm": 1.2248715162277222,
"learning_rate": 0.00034654765030191654,
"epoch": 1.96
},
{
"loss": 8.3269,
"grad_norm": 1.370082974433899,
"learning_rate": 0.0003456725299728713,
"epoch": 1.96
},
{
"loss": 7.7562,
"grad_norm": 1.0009835958480835,
"learning_rate": 0.00034479740964382606,
"epoch": 1.97
},
{
"loss": 7.4909,
"grad_norm": 0.9207608103752136,
"learning_rate": 0.0003439222893147808,
"epoch": 1.97
},
{
"loss": 7.2907,
"grad_norm": 1.0351985692977905,
"learning_rate": 0.0003430471689857356,
"epoch": 1.97
},
{
"loss": 7.9972,
"grad_norm": 0.9398946762084961,
"learning_rate": 0.00034217204865669035,
"epoch": 1.97
},
{
"loss": 7.6034,
"grad_norm": 0.8558303713798523,
"learning_rate": 0.0003412969283276451,
"epoch": 1.98
},
{
"loss": 8.3452,
"grad_norm": 0.8279830813407898,
"learning_rate": 0.0003404218079985998,
"epoch": 1.98
},
{
"loss": 8.3979,
"grad_norm": 0.7496762275695801,
"learning_rate": 0.0003395466876695545,
"epoch": 1.98
},
{
"loss": 7.5979,
"grad_norm": 0.865039587020874,
"learning_rate": 0.0003386715673405093,
"epoch": 1.98
},
{
"loss": 7.7027,
"grad_norm": 0.7518277764320374,
"learning_rate": 0.00033779644701146405,
"epoch": 1.99
},
{
"loss": 7.8756,
"grad_norm": 0.8984577059745789,
"learning_rate": 0.0003369213266824188,
"epoch": 1.99
},
{
"loss": 7.4597,
"grad_norm": 0.7312489151954651,
"learning_rate": 0.00033604620635337357,
"epoch": 1.99
},
{
"loss": 7.8173,
"grad_norm": 0.8688482046127319,
"learning_rate": 0.00033517108602432833,
"epoch": 1.99
},
{
"loss": 7.6772,
"grad_norm": 0.9117947816848755,
"learning_rate": 0.0003342959656952831,
"epoch": 2.0
},
{
"loss": 7.65,
"grad_norm": 1.044518232345581,
"learning_rate": 0.00033342084536623786,
"epoch": 2.0
},
{
"loss": 7.6424,
"grad_norm": 0.8763852119445801,
"learning_rate": 0.0003325457250371926,
"epoch": 2.0
},
{
"loss": 8.1303,
"grad_norm": 1.2922908067703247,
"learning_rate": 0.0003316706047081474,
"epoch": 2.01
},
{
"loss": 8.3256,
"grad_norm": 0.7980864644050598,
"learning_rate": 0.00033079548437910214,
"epoch": 2.01
},
{
"loss": 7.7353,
"grad_norm": 0.8062283396720886,
"learning_rate": 0.00032992036405005685,
"epoch": 2.01
},
{
"loss": 8.2314,
"grad_norm": 0.9204174280166626,
"learning_rate": 0.0003290452437210116,
"epoch": 2.01
},
{
"loss": 7.5946,
"grad_norm": 0.7235244512557983,
"learning_rate": 0.0003281701233919664,
"epoch": 2.02
},
{
"loss": 7.4673,
"grad_norm": 0.8126214146614075,
"learning_rate": 0.00032729500306292114,
"epoch": 2.02
},
{
"loss": 7.6391,
"grad_norm": 0.7648585438728333,
"learning_rate": 0.0003264198827338759,
"epoch": 2.02
},
{
"loss": 8.005,
"grad_norm": 0.7453392148017883,
"learning_rate": 0.00032554476240483066,
"epoch": 2.02
},
{
"loss": 7.8703,
"grad_norm": 0.8830775022506714,
"learning_rate": 0.0003246696420757854,
"epoch": 2.03
},
{
"loss": 7.8639,
"grad_norm": 1.2337687015533447,
"learning_rate": 0.0003237945217467402,
"epoch": 2.03
},
{
"loss": 7.8224,
"grad_norm": 1.0393247604370117,
"learning_rate": 0.00032291940141769494,
"epoch": 2.03
},
{
"loss": 7.7573,
"grad_norm": 0.7463309168815613,
"learning_rate": 0.0003220442810886497,
"epoch": 2.03
},
{
"loss": 8.3318,
"grad_norm": 0.8722276091575623,
"learning_rate": 0.00032116916075960447,
"epoch": 2.04
},
{
"loss": 8.0517,
"grad_norm": 0.9069348573684692,
"learning_rate": 0.00032029404043055923,
"epoch": 2.04
},
{
"loss": 7.9696,
"grad_norm": 0.7715663909912109,
"learning_rate": 0.000319418920101514,
"epoch": 2.04
},
{
"loss": 7.7113,
"grad_norm": 0.8788508176803589,
"learning_rate": 0.0003185437997724687,
"epoch": 2.04
},
{
"loss": 7.5771,
"grad_norm": 1.057786226272583,
"learning_rate": 0.00031766867944342346,
"epoch": 2.05
},
{
"loss": 7.985,
"grad_norm": 1.2888935804367065,
"learning_rate": 0.0003167935591143782,
"epoch": 2.05
},
{
"loss": 7.5748,
"grad_norm": 0.8100298047065735,
"learning_rate": 0.000315918438785333,
"epoch": 2.05
},
{
"loss": 7.7785,
"grad_norm": 0.9130757451057434,
"learning_rate": 0.00031504331845628775,
"epoch": 2.06
},
{
"loss": 7.3718,
"grad_norm": 0.895447313785553,
"learning_rate": 0.0003141681981272425,
"epoch": 2.06
},
{
"loss": 8.0138,
"grad_norm": 0.8260514736175537,
"learning_rate": 0.00031329307779819727,
"epoch": 2.06
},
{
"loss": 7.6438,
"grad_norm": 0.9353188276290894,
"learning_rate": 0.00031241795746915203,
"epoch": 2.06
},
{
"loss": 7.9212,
"grad_norm": 0.8095923066139221,
"learning_rate": 0.0003115428371401068,
"epoch": 2.07
},
{
"loss": 8.2193,
"grad_norm": 0.8156134486198425,
"learning_rate": 0.00031066771681106156,
"epoch": 2.07
},
{
"loss": 7.6264,
"grad_norm": 0.9613614082336426,
"learning_rate": 0.0003097925964820163,
"epoch": 2.07
},
{
"loss": 7.6684,
"grad_norm": 0.8426281809806824,
"learning_rate": 0.0003089174761529711,
"epoch": 2.07
},
{
"loss": 7.7356,
"grad_norm": 0.8271446824073792,
"learning_rate": 0.0003080423558239258,
"epoch": 2.08
},
{
"loss": 7.8816,
"grad_norm": 0.9108027219772339,
"learning_rate": 0.00030716723549488055,
"epoch": 2.08
},
{
"loss": 8.5754,
"grad_norm": 0.8285607099533081,
"learning_rate": 0.0003062921151658353,
"epoch": 2.08
},
{
"loss": 7.8875,
"grad_norm": 0.79032963514328,
"learning_rate": 0.00030541699483679007,
"epoch": 2.08
},
{
"loss": 7.4168,
"grad_norm": 0.8623600602149963,
"learning_rate": 0.00030454187450774483,
"epoch": 2.09
},
{
"loss": 7.546,
"grad_norm": 0.8102550506591797,
"learning_rate": 0.0003036667541786996,
"epoch": 2.09
},
{
"loss": 7.9269,
"grad_norm": 1.0298386812210083,
"learning_rate": 0.00030279163384965436,
"epoch": 2.09
},
{
"loss": 7.6682,
"grad_norm": 0.8902001976966858,
"learning_rate": 0.0003019165135206091,
"epoch": 2.09
},
{
"loss": 8.0309,
"grad_norm": 0.831743597984314,
"learning_rate": 0.0003010413931915639,
"epoch": 2.1
},
{
"loss": 8.157,
"grad_norm": 0.8056457042694092,
"learning_rate": 0.00030016627286251864,
"epoch": 2.1
},
{
"loss": 7.6514,
"grad_norm": 1.071753978729248,
"learning_rate": 0.00029929115253347335,
"epoch": 2.1
},
{
"loss": 7.8337,
"grad_norm": 0.8061104416847229,
"learning_rate": 0.0002984160322044281,
"epoch": 2.11
},
{
"loss": 7.8925,
"grad_norm": 1.1958301067352295,
"learning_rate": 0.0002975409118753828,
"epoch": 2.11
},
{
"loss": 6.9557,
"grad_norm": 0.7460314631462097,
"learning_rate": 0.0002966657915463376,
"epoch": 2.11
},
{
"loss": 7.724,
"grad_norm": 0.8949922323226929,
"learning_rate": 0.00029579067121729234,
"epoch": 2.11
},
{
"loss": 8.1209,
"grad_norm": 0.7350090146064758,
"learning_rate": 0.0002949155508882471,
"epoch": 2.12
},
{
"loss": 7.7897,
"grad_norm": 0.9530614018440247,
"learning_rate": 0.00029404043055920187,
"epoch": 2.12
},
{
"loss": 7.7916,
"grad_norm": 0.7030171155929565,
"learning_rate": 0.00029316531023015663,
"epoch": 2.12
},
{
"loss": 7.53,
"grad_norm": 0.8843898177146912,
"learning_rate": 0.0002922901899011114,
"epoch": 2.12
},
{
"loss": 7.5228,
"grad_norm": 0.9127951860427856,
"learning_rate": 0.00029141506957206615,
"epoch": 2.13
},
{
"loss": 7.423,
"grad_norm": 0.7194523811340332,
"learning_rate": 0.0002905399492430209,
"epoch": 2.13
},
{
"loss": 8.3464,
"grad_norm": 0.8251200318336487,
"learning_rate": 0.0002896648289139757,
"epoch": 2.13
},
{
"loss": 7.8906,
"grad_norm": 0.9383019804954529,
"learning_rate": 0.00028878970858493044,
"epoch": 2.13
},
{
"loss": 6.9917,
"grad_norm": 1.1721993684768677,
"learning_rate": 0.0002879145882558852,
"epoch": 2.14
},
{
"loss": 7.7154,
"grad_norm": 0.7905781865119934,
"learning_rate": 0.00028703946792683996,
"epoch": 2.14
},
{
"loss": 7.9272,
"grad_norm": 0.9261153936386108,
"learning_rate": 0.00028616434759779467,
"epoch": 2.14
},
{
"loss": 7.9141,
"grad_norm": 1.206111192703247,
"learning_rate": 0.00028528922726874943,
"epoch": 2.14
},
{
"loss": 7.9561,
"grad_norm": 0.8015759587287903,
"learning_rate": 0.0002844141069397042,
"epoch": 2.15
},
{
"loss": 7.6844,
"grad_norm": 0.970389723777771,
"learning_rate": 0.00028353898661065896,
"epoch": 2.15
},
{
"loss": 7.7312,
"grad_norm": 1.3079341650009155,
"learning_rate": 0.0002826638662816137,
"epoch": 2.15
},
{
"loss": 7.506,
"grad_norm": 0.8393199443817139,
"learning_rate": 0.0002817887459525685,
"epoch": 2.16
},
{
"loss": 7.3006,
"grad_norm": 0.9169728755950928,
"learning_rate": 0.00028091362562352324,
"epoch": 2.16
},
{
"loss": 7.5924,
"grad_norm": 0.8766190409660339,
"learning_rate": 0.000280038505294478,
"epoch": 2.16
},
{
"loss": 8.2074,
"grad_norm": 0.8473224639892578,
"learning_rate": 0.00027916338496543277,
"epoch": 2.16
},
{
"loss": 7.2028,
"grad_norm": 0.9415881037712097,
"learning_rate": 0.0002782882646363875,
"epoch": 2.17
},
{
"loss": 7.87,
"grad_norm": 0.8043491840362549,
"learning_rate": 0.0002774131443073423,
"epoch": 2.17
},
{
"loss": 8.5354,
"grad_norm": 0.9696796536445618,
"learning_rate": 0.00027653802397829705,
"epoch": 2.17
},
{
"loss": 8.1185,
"grad_norm": 0.9294397830963135,
"learning_rate": 0.00027566290364925176,
"epoch": 2.17
},
{
"loss": 7.8844,
"grad_norm": 1.0350419282913208,
"learning_rate": 0.0002747877833202065,
"epoch": 2.18
},
{
"loss": 7.9054,
"grad_norm": 1.086616039276123,
"learning_rate": 0.0002739126629911613,
"epoch": 2.18
},
{
"loss": 7.4362,
"grad_norm": 0.865028440952301,
"learning_rate": 0.00027303754266211604,
"epoch": 2.18
},
{
"loss": 7.4039,
"grad_norm": 0.8574273586273193,
"learning_rate": 0.0002721624223330708,
"epoch": 2.18
},
{
"loss": 8.0095,
"grad_norm": 1.0509589910507202,
"learning_rate": 0.00027128730200402557,
"epoch": 2.19
},
{
"loss": 7.6467,
"grad_norm": 0.7813432812690735,
"learning_rate": 0.00027041218167498033,
"epoch": 2.19
},
{
"loss": 7.4786,
"grad_norm": 0.855741560459137,
"learning_rate": 0.0002695370613459351,
"epoch": 2.19
},
{
"loss": 7.7862,
"grad_norm": 0.8451842069625854,
"learning_rate": 0.00026866194101688985,
"epoch": 2.19
},
{
"loss": 7.7616,
"grad_norm": 0.882211446762085,
"learning_rate": 0.0002677868206878446,
"epoch": 2.2
},
{
"loss": 8.1508,
"grad_norm": 0.7093100547790527,
"learning_rate": 0.0002669117003587994,
"epoch": 2.2
},
{
"loss": 7.8715,
"grad_norm": 0.9282416701316833,
"learning_rate": 0.00026603658002975414,
"epoch": 2.2
},
{
"loss": 7.6333,
"grad_norm": 0.8849425911903381,
"learning_rate": 0.0002651614597007089,
"epoch": 2.2
},
{
"loss": 7.624,
"grad_norm": 0.8789107203483582,
"learning_rate": 0.0002642863393716636,
"epoch": 2.21
},
{
"loss": 7.5042,
"grad_norm": 0.9759025573730469,
"learning_rate": 0.00026341121904261837,
"epoch": 2.21
},
{
"loss": 7.7317,
"grad_norm": 0.794627845287323,
"learning_rate": 0.00026253609871357313,
"epoch": 2.21
},
{
"loss": 7.4743,
"grad_norm": 1.3992342948913574,
"learning_rate": 0.0002616609783845279,
"epoch": 2.22
},
{
"loss": 7.5986,
"grad_norm": 0.8934722542762756,
"learning_rate": 0.00026078585805548266,
"epoch": 2.22
},
{
"loss": 7.7515,
"grad_norm": 1.0474205017089844,
"learning_rate": 0.00025991073772643736,
"epoch": 2.22
},
{
"loss": 7.0749,
"grad_norm": 0.7677063345909119,
"learning_rate": 0.0002590356173973921,
"epoch": 2.22
},
{
"loss": 7.7033,
"grad_norm": 0.8318948149681091,
"learning_rate": 0.0002581604970683469,
"epoch": 2.23
},
{
"loss": 7.775,
"grad_norm": 0.7674381136894226,
"learning_rate": 0.00025728537673930165,
"epoch": 2.23
},
{
"loss": 7.5289,
"grad_norm": 1.0669969320297241,
"learning_rate": 0.0002564102564102564,
"epoch": 2.23
},
{
"loss": 7.3784,
"grad_norm": 1.0004348754882812,
"learning_rate": 0.00025553513608121117,
"epoch": 2.23
},
{
"loss": 7.4305,
"grad_norm": 0.7937709093093872,
"learning_rate": 0.00025466001575216593,
"epoch": 2.24
},
{
"loss": 7.1845,
"grad_norm": 0.9088554382324219,
"learning_rate": 0.00025378489542312064,
"epoch": 2.24
},
{
"loss": 7.9313,
"grad_norm": 1.0221823453903198,
"learning_rate": 0.0002529097750940754,
"epoch": 2.24
},
{
"loss": 7.4251,
"grad_norm": 0.7980064153671265,
"learning_rate": 0.00025203465476503016,
"epoch": 2.24
},
{
"loss": 8.0494,
"grad_norm": 0.8470319509506226,
"learning_rate": 0.0002511595344359849,
"epoch": 2.25
},
{
"loss": 7.7765,
"grad_norm": 1.101785659790039,
"learning_rate": 0.0002502844141069397,
"epoch": 2.25
},
{
"loss": 7.8624,
"grad_norm": 0.8655755519866943,
"learning_rate": 0.00024940929377789445,
"epoch": 2.25
},
{
"loss": 7.6855,
"grad_norm": 1.0447689294815063,
"learning_rate": 0.0002485341734488492,
"epoch": 2.25
},
{
"loss": 7.7653,
"grad_norm": 0.9611648917198181,
"learning_rate": 0.000247659053119804,
"epoch": 2.26
},
{
"loss": 8.0705,
"grad_norm": 1.410849928855896,
"learning_rate": 0.00024678393279075874,
"epoch": 2.26
},
{
"loss": 7.8147,
"grad_norm": 0.9252009987831116,
"learning_rate": 0.0002459088124617135,
"epoch": 2.26
},
{
"loss": 7.9366,
"grad_norm": 0.899348258972168,
"learning_rate": 0.00024503369213266826,
"epoch": 2.27
},
{
"loss": 8.089,
"grad_norm": 0.7920341491699219,
"learning_rate": 0.000244158571803623,
"epoch": 2.27
},
{
"loss": 7.5066,
"grad_norm": 0.8289885520935059,
"learning_rate": 0.00024328345147457776,
"epoch": 2.27
},
{
"loss": 7.4402,
"grad_norm": 0.9304541349411011,
"learning_rate": 0.00024240833114553252,
"epoch": 2.27
},
{
"loss": 8.1004,
"grad_norm": 0.8798967003822327,
"learning_rate": 0.00024153321081648728,
"epoch": 2.28
},
{
"loss": 7.8528,
"grad_norm": 0.9733609557151794,
"learning_rate": 0.00024065809048744201,
"epoch": 2.28
},
{
"loss": 7.1178,
"grad_norm": 1.1248620748519897,
"learning_rate": 0.00023978297015839678,
"epoch": 2.28
},
{
"loss": 7.7862,
"grad_norm": 1.2658095359802246,
"learning_rate": 0.00023890784982935154,
"epoch": 2.28
},
{
"loss": 7.9395,
"grad_norm": 1.0820565223693848,
"learning_rate": 0.0002380327295003063,
"epoch": 2.29
},
{
"loss": 7.4596,
"grad_norm": 0.9462448954582214,
"learning_rate": 0.00023715760917126106,
"epoch": 2.29
},
{
"loss": 7.8461,
"grad_norm": 0.8025732636451721,
"learning_rate": 0.00023628248884221582,
"epoch": 2.29
},
{
"loss": 7.7102,
"grad_norm": 0.7947144508361816,
"learning_rate": 0.00023540736851317059,
"epoch": 2.29
},
{
"loss": 7.8149,
"grad_norm": 0.8819990158081055,
"learning_rate": 0.00023453224818412532,
"epoch": 2.3
},
{
"loss": 7.5168,
"grad_norm": 0.9773268103599548,
"learning_rate": 0.00023365712785508008,
"epoch": 2.3
},
{
"loss": 7.7338,
"grad_norm": 1.384716510772705,
"learning_rate": 0.00023278200752603484,
"epoch": 2.3
},
{
"loss": 6.9549,
"grad_norm": 1.1293810606002808,
"learning_rate": 0.0002319068871969896,
"epoch": 2.3
},
{
"loss": 7.8655,
"grad_norm": 0.7238449454307556,
"learning_rate": 0.00023103176686794437,
"epoch": 2.31
},
{
"loss": 7.7399,
"grad_norm": 0.8876301646232605,
"learning_rate": 0.00023015664653889913,
"epoch": 2.31
},
{
"loss": 7.5196,
"grad_norm": 0.7352742552757263,
"learning_rate": 0.00022928152620985384,
"epoch": 2.31
},
{
"loss": 8.0545,
"grad_norm": 1.0614981651306152,
"learning_rate": 0.0002284064058808086,
"epoch": 2.32
},
{
"loss": 7.8036,
"grad_norm": 0.999052882194519,
"learning_rate": 0.00022753128555176336,
"epoch": 2.32
},
{
"loss": 7.5506,
"grad_norm": 1.084981918334961,
"learning_rate": 0.00022665616522271812,
"epoch": 2.32
},
{
"loss": 7.7953,
"grad_norm": 1.110907793045044,
"learning_rate": 0.00022578104489367288,
"epoch": 2.32
},
{
"loss": 7.6064,
"grad_norm": 1.29153311252594,
"learning_rate": 0.00022490592456462765,
"epoch": 2.33
},
{
"loss": 7.9157,
"grad_norm": 1.5039303302764893,
"learning_rate": 0.00022403080423558238,
"epoch": 2.33
},
{
"loss": 7.5924,
"grad_norm": 0.850940465927124,
"learning_rate": 0.00022315568390653714,
"epoch": 2.33
},
{
"loss": 7.9425,
"grad_norm": 0.79768967628479,
"learning_rate": 0.0002222805635774919,
"epoch": 2.33
},
{
"loss": 8.0374,
"grad_norm": 0.771493673324585,
"learning_rate": 0.00022140544324844667,
"epoch": 2.34
},
{
"loss": 7.1645,
"grad_norm": 0.7525059580802917,
"learning_rate": 0.00022053032291940143,
"epoch": 2.34
},
{
"loss": 7.5769,
"grad_norm": 0.9684802293777466,
"learning_rate": 0.0002196552025903562,
"epoch": 2.34
},
{
"loss": 7.781,
"grad_norm": 1.1203564405441284,
"learning_rate": 0.00021878008226131092,
"epoch": 2.34
},
{
"loss": 7.4585,
"grad_norm": 1.0650273561477661,
"learning_rate": 0.0002179049619322657,
"epoch": 2.35
},
{
"loss": 7.7015,
"grad_norm": 0.9924284219741821,
"learning_rate": 0.00021702984160322045,
"epoch": 2.35
},
{
"loss": 7.572,
"grad_norm": 0.8644096255302429,
"learning_rate": 0.0002161547212741752,
"epoch": 2.35
},
{
"loss": 7.8879,
"grad_norm": 0.854030966758728,
"learning_rate": 0.00021527960094512997,
"epoch": 2.35
},
{
"loss": 7.842,
"grad_norm": 0.7271285653114319,
"learning_rate": 0.00021440448061608473,
"epoch": 2.36
},
{
"loss": 7.652,
"grad_norm": 0.6921567320823669,
"learning_rate": 0.00021352936028703947,
"epoch": 2.36
},
{
"loss": 7.8335,
"grad_norm": 1.2016472816467285,
"learning_rate": 0.00021265423995799423,
"epoch": 2.36
},
{
"loss": 7.5109,
"grad_norm": 0.79868084192276,
"learning_rate": 0.000211779119628949,
"epoch": 2.37
},
{
"loss": 7.3853,
"grad_norm": 0.8064858913421631,
"learning_rate": 0.00021090399929990375,
"epoch": 2.37
},
{
"loss": 7.6334,
"grad_norm": 0.9092600345611572,
"learning_rate": 0.00021002887897085852,
"epoch": 2.37
},
{
"loss": 7.9536,
"grad_norm": 1.0683679580688477,
"learning_rate": 0.00020915375864181325,
"epoch": 2.37
},
{
"loss": 7.7399,
"grad_norm": 1.1141338348388672,
"learning_rate": 0.00020827863831276799,
"epoch": 2.38
},
{
"loss": 7.9567,
"grad_norm": 0.9624096751213074,
"learning_rate": 0.00020740351798372275,
"epoch": 2.38
},
{
"loss": 8.1788,
"grad_norm": 0.7703258991241455,
"learning_rate": 0.0002065283976546775,
"epoch": 2.38
},
{
"loss": 7.8642,
"grad_norm": 0.9297539591789246,
"learning_rate": 0.00020565327732563227,
"epoch": 2.38
},
{
"loss": 7.4837,
"grad_norm": 0.7845075130462646,
"learning_rate": 0.00020477815699658703,
"epoch": 2.39
},
{
"loss": 7.5431,
"grad_norm": 0.8620021343231201,
"learning_rate": 0.0002039030366675418,
"epoch": 2.39
},
{
"loss": 7.7398,
"grad_norm": 0.8532699942588806,
"learning_rate": 0.00020302791633849656,
"epoch": 2.39
},
{
"loss": 7.7079,
"grad_norm": 1.1266266107559204,
"learning_rate": 0.0002021527960094513,
"epoch": 2.39
},
{
"loss": 7.8789,
"grad_norm": 1.003790020942688,
"learning_rate": 0.00020127767568040605,
"epoch": 2.4
},
{
"loss": 7.5108,
"grad_norm": 1.1769237518310547,
"learning_rate": 0.00020040255535136081,
"epoch": 2.4
},
{
"loss": 7.8151,
"grad_norm": 0.9078934192657471,
"learning_rate": 0.00019952743502231558,
"epoch": 2.4
},
{
"loss": 7.4092,
"grad_norm": 0.8376544713973999,
"learning_rate": 0.00019865231469327034,
"epoch": 2.4
},
{
"loss": 7.4658,
"grad_norm": 0.9094048738479614,
"learning_rate": 0.0001977771943642251,
"epoch": 2.41
},
{
"loss": 7.6113,
"grad_norm": 1.1345362663269043,
"learning_rate": 0.00019690207403517984,
"epoch": 2.41
},
{
"loss": 7.4598,
"grad_norm": 0.8164626955986023,
"learning_rate": 0.0001960269537061346,
"epoch": 2.41
},
{
"loss": 7.707,
"grad_norm": 1.125823736190796,
"learning_rate": 0.00019515183337708936,
"epoch": 2.41
},
{
"loss": 8.0873,
"grad_norm": 0.8651579022407532,
"learning_rate": 0.00019427671304804412,
"epoch": 2.42
},
{
"loss": 7.5421,
"grad_norm": 0.9041004776954651,
"learning_rate": 0.00019340159271899888,
"epoch": 2.42
},
{
"loss": 7.9615,
"grad_norm": 0.8012003302574158,
"learning_rate": 0.00019252647238995364,
"epoch": 2.42
},
{
"loss": 7.6728,
"grad_norm": 0.8691316246986389,
"learning_rate": 0.00019165135206090838,
"epoch": 2.43
},
{
"loss": 7.4882,
"grad_norm": 0.8700850605964661,
"learning_rate": 0.00019077623173186314,
"epoch": 2.43
},
{
"loss": 7.4824,
"grad_norm": 1.0540724992752075,
"learning_rate": 0.00018990111140281788,
"epoch": 2.43
},
{
"loss": 7.3133,
"grad_norm": 0.9065701365470886,
"learning_rate": 0.00018902599107377264,
"epoch": 2.43
},
{
"loss": 8.1036,
"grad_norm": 0.8794527649879456,
"learning_rate": 0.0001881508707447274,
"epoch": 2.44
},
{
"loss": 7.3707,
"grad_norm": 0.9155571460723877,
"learning_rate": 0.00018727575041568216,
"epoch": 2.44
},
{
"loss": 7.0801,
"grad_norm": 0.7177339792251587,
"learning_rate": 0.0001864006300866369,
"epoch": 2.44
},
{
"loss": 7.4368,
"grad_norm": 0.8027993440628052,
"learning_rate": 0.00018552550975759166,
"epoch": 2.44
},
{
"loss": 8.2545,
"grad_norm": 0.9770577549934387,
"learning_rate": 0.00018465038942854642,
"epoch": 2.45
},
{
"loss": 7.4767,
"grad_norm": 1.0428367853164673,
"learning_rate": 0.00018377526909950118,
"epoch": 2.45
},
{
"loss": 8.4641,
"grad_norm": 0.8214976787567139,
"learning_rate": 0.00018290014877045594,
"epoch": 2.45
},
{
"loss": 7.054,
"grad_norm": 1.1258653402328491,
"learning_rate": 0.0001820250284414107,
"epoch": 2.45
},
{
"loss": 7.5935,
"grad_norm": 1.07210373878479,
"learning_rate": 0.00018114990811236544,
"epoch": 2.46
},
{
"loss": 7.8104,
"grad_norm": 1.0441612005233765,
"learning_rate": 0.0001802747877833202,
"epoch": 2.46
},
{
"loss": 7.6147,
"grad_norm": 0.9820619821548462,
"learning_rate": 0.00017939966745427496,
"epoch": 2.46
},
{
"loss": 8.1347,
"grad_norm": 0.8725702166557312,
"learning_rate": 0.00017852454712522973,
"epoch": 2.46
},
{
"loss": 8.227,
"grad_norm": 0.8640567660331726,
"learning_rate": 0.0001776494267961845,
"epoch": 2.47
},
{
"loss": 7.392,
"grad_norm": 1.0909335613250732,
"learning_rate": 0.00017677430646713925,
"epoch": 2.47
},
{
"loss": 6.7634,
"grad_norm": 0.8133190274238586,
"learning_rate": 0.000175899186138094,
"epoch": 2.47
},
{
"loss": 7.802,
"grad_norm": 0.9833294749259949,
"learning_rate": 0.00017502406580904875,
"epoch": 2.48
},
{
"loss": 7.2764,
"grad_norm": 0.9594758152961731,
"learning_rate": 0.0001741489454800035,
"epoch": 2.48
},
{
"loss": 7.1931,
"grad_norm": 0.9970749616622925,
"learning_rate": 0.00017327382515095827,
"epoch": 2.48
},
{
"loss": 7.6539,
"grad_norm": 0.8486274480819702,
"learning_rate": 0.00017239870482191303,
"epoch": 2.48
},
{
"loss": 7.3367,
"grad_norm": 0.9591713547706604,
"learning_rate": 0.0001715235844928678,
"epoch": 2.49
},
{
"loss": 7.545,
"grad_norm": 1.1163291931152344,
"learning_rate": 0.00017064846416382255,
"epoch": 2.49
},
{
"loss": 8.3214,
"grad_norm": 0.8581505417823792,
"learning_rate": 0.00016977334383477726,
"epoch": 2.49
},
{
"loss": 7.1871,
"grad_norm": 0.8021834492683411,
"learning_rate": 0.00016889822350573202,
"epoch": 2.49
},
{
"loss": 7.8969,
"grad_norm": 0.9090090990066528,
"learning_rate": 0.00016802310317668679,
"epoch": 2.5
},
{
"loss": 7.37,
"grad_norm": 0.8283194303512573,
"learning_rate": 0.00016714798284764155,
"epoch": 2.5
},
{
"loss": 7.4669,
"grad_norm": 0.8183834552764893,
"learning_rate": 0.0001662728625185963,
"epoch": 2.5
},
{
"loss": 7.4362,
"grad_norm": 0.9701572060585022,
"learning_rate": 0.00016539774218955107,
"epoch": 2.5
},
{
"loss": 7.2859,
"grad_norm": 1.079610824584961,
"learning_rate": 0.0001645226218605058,
"epoch": 2.51
},
{
"loss": 8.0835,
"grad_norm": 0.8598064184188843,
"learning_rate": 0.00016364750153146057,
"epoch": 2.51
},
{
"loss": 7.6696,
"grad_norm": 0.8653038740158081,
"learning_rate": 0.00016277238120241533,
"epoch": 2.51
},
{
"loss": 7.6096,
"grad_norm": 1.0018919706344604,
"learning_rate": 0.0001618972608733701,
"epoch": 2.51
},
{
"loss": 7.7412,
"grad_norm": 0.8919802308082581,
"learning_rate": 0.00016102214054432485,
"epoch": 2.52
},
{
"loss": 7.4504,
"grad_norm": 0.8712960481643677,
"learning_rate": 0.00016014702021527962,
"epoch": 2.52
},
{
"loss": 8.1,
"grad_norm": 0.8894332647323608,
"learning_rate": 0.00015927189988623435,
"epoch": 2.52
},
{
"loss": 8.1017,
"grad_norm": 1.024781584739685,
"learning_rate": 0.0001583967795571891,
"epoch": 2.53
},
{
"loss": 7.6484,
"grad_norm": 0.9175984859466553,
"learning_rate": 0.00015752165922814387,
"epoch": 2.53
},
{
"loss": 7.3766,
"grad_norm": 0.9064013361930847,
"learning_rate": 0.00015664653889909864,
"epoch": 2.53
},
{
"loss": 7.6414,
"grad_norm": 0.9600405097007751,
"learning_rate": 0.0001557714185700534,
"epoch": 2.53
},
{
"loss": 7.2811,
"grad_norm": 0.9788243174552917,
"learning_rate": 0.00015489629824100816,
"epoch": 2.54
},
{
"loss": 7.3704,
"grad_norm": 0.8740330338478088,
"learning_rate": 0.0001540211779119629,
"epoch": 2.54
},
{
"loss": 7.5645,
"grad_norm": 0.8021050095558167,
"learning_rate": 0.00015314605758291766,
"epoch": 2.54
},
{
"loss": 7.9078,
"grad_norm": 1.0614405870437622,
"learning_rate": 0.00015227093725387242,
"epoch": 2.54
},
{
"loss": 7.3365,
"grad_norm": 0.8063251376152039,
"learning_rate": 0.00015139581692482718,
"epoch": 2.55
},
{
"loss": 7.8801,
"grad_norm": 0.8937615752220154,
"learning_rate": 0.00015052069659578194,
"epoch": 2.55
},
{
"loss": 8.0013,
"grad_norm": 0.9128641486167908,
"learning_rate": 0.00014964557626673668,
"epoch": 2.55
},
{
"loss": 8.1354,
"grad_norm": 0.8519286513328552,
"learning_rate": 0.0001487704559376914,
"epoch": 2.55
},
{
"loss": 7.6918,
"grad_norm": 0.9265363812446594,
"learning_rate": 0.00014789533560864617,
"epoch": 2.56
},
{
"loss": 7.7237,
"grad_norm": 1.113276720046997,
"learning_rate": 0.00014702021527960093,
"epoch": 2.56
},
{
"loss": 7.3281,
"grad_norm": 0.9011558890342712,
"learning_rate": 0.0001461450949505557,
"epoch": 2.56
},
{
"loss": 6.577,
"grad_norm": 0.990836501121521,
"learning_rate": 0.00014526997462151046,
"epoch": 2.56
},
{
"loss": 7.4641,
"grad_norm": 1.1346269845962524,
"learning_rate": 0.00014439485429246522,
"epoch": 2.57
},
{
"loss": 7.6071,
"grad_norm": 1.0057759284973145,
"learning_rate": 0.00014351973396341998,
"epoch": 2.57
},
{
"loss": 8.0022,
"grad_norm": 0.8524260520935059,
"learning_rate": 0.00014264461363437472,
"epoch": 2.57
},
{
"loss": 7.4056,
"grad_norm": 0.7590330839157104,
"learning_rate": 0.00014176949330532948,
"epoch": 2.58
},
{
"loss": 7.9487,
"grad_norm": 1.2074108123779297,
"learning_rate": 0.00014089437297628424,
"epoch": 2.58
},
{
"loss": 7.7237,
"grad_norm": 0.9621999263763428,
"learning_rate": 0.000140019252647239,
"epoch": 2.58
},
{
"loss": 7.2588,
"grad_norm": 0.843911349773407,
"learning_rate": 0.00013914413231819376,
"epoch": 2.58
},
{
"loss": 7.1697,
"grad_norm": 0.7619708180427551,
"learning_rate": 0.00013826901198914853,
"epoch": 2.59
},
{
"loss": 7.2903,
"grad_norm": 1.0736790895462036,
"learning_rate": 0.00013739389166010326,
"epoch": 2.59
},
{
"loss": 7.9193,
"grad_norm": 1.03206467628479,
"learning_rate": 0.00013651877133105802,
"epoch": 2.59
},
{
"loss": 8.182,
"grad_norm": 0.9106431603431702,
"learning_rate": 0.00013564365100201278,
"epoch": 2.59
},
{
"loss": 7.5139,
"grad_norm": 0.9506519436836243,
"learning_rate": 0.00013476853067296755,
"epoch": 2.6
},
{
"loss": 7.9055,
"grad_norm": 0.859704852104187,
"learning_rate": 0.0001338934103439223,
"epoch": 2.6
},
{
"loss": 8.0011,
"grad_norm": 0.9628238677978516,
"learning_rate": 0.00013301829001487707,
"epoch": 2.6
},
{
"loss": 7.4412,
"grad_norm": 0.8472156524658203,
"learning_rate": 0.0001321431696858318,
"epoch": 2.6
},
{
"loss": 7.6981,
"grad_norm": 0.9454402327537537,
"learning_rate": 0.00013126804935678657,
"epoch": 2.61
},
{
"loss": 7.4101,
"grad_norm": 0.8925793766975403,
"learning_rate": 0.00013039292902774133,
"epoch": 2.61
},
{
"loss": 7.1784,
"grad_norm": 0.8468560576438904,
"learning_rate": 0.00012951780869869606,
"epoch": 2.61
},
{
"loss": 7.6655,
"grad_norm": 0.8432177901268005,
"learning_rate": 0.00012864268836965082,
"epoch": 2.61
},
{
"loss": 7.4518,
"grad_norm": 0.813543438911438,
"learning_rate": 0.00012776756804060559,
"epoch": 2.62
},
{
"loss": 7.5661,
"grad_norm": 1.134985327720642,
"learning_rate": 0.00012689244771156032,
"epoch": 2.62
},
{
"loss": 7.3611,
"grad_norm": 1.05497407913208,
"learning_rate": 0.00012601732738251508,
"epoch": 2.62
},
{
"loss": 7.8965,
"grad_norm": 1.0532019138336182,
"learning_rate": 0.00012514220705346984,
"epoch": 2.62
},
{
"loss": 8.113,
"grad_norm": 1.0708712339401245,
"learning_rate": 0.0001242670867244246,
"epoch": 2.63
},
{
"loss": 7.4782,
"grad_norm": 1.1848175525665283,
"learning_rate": 0.00012339196639537937,
"epoch": 2.63
},
{
"loss": 7.761,
"grad_norm": 0.9672744870185852,
"learning_rate": 0.00012251684606633413,
"epoch": 2.63
},
{
"loss": 7.8212,
"grad_norm": 1.2713532447814941,
"learning_rate": 0.00012164172573728888,
"epoch": 2.64
},
{
"loss": 7.5313,
"grad_norm": 1.026662826538086,
"learning_rate": 0.00012076660540824364,
"epoch": 2.64
},
{
"loss": 7.9599,
"grad_norm": 0.8448575139045715,
"learning_rate": 0.00011989148507919839,
"epoch": 2.64
},
{
"loss": 7.7506,
"grad_norm": 1.041380524635315,
"learning_rate": 0.00011901636475015315,
"epoch": 2.64
},
{
"loss": 7.9023,
"grad_norm": 0.8197987675666809,
"learning_rate": 0.00011814124442110791,
"epoch": 2.65
},
{
"loss": 7.4913,
"grad_norm": 0.918388307094574,
"learning_rate": 0.00011726612409206266,
"epoch": 2.65
},
{
"loss": 7.8685,
"grad_norm": 0.9161803722381592,
"learning_rate": 0.00011639100376301742,
"epoch": 2.65
},
{
"loss": 7.5855,
"grad_norm": 0.8994104266166687,
"learning_rate": 0.00011551588343397218,
"epoch": 2.65
},
{
"loss": 7.6488,
"grad_norm": 0.8985808491706848,
"learning_rate": 0.00011464076310492692,
"epoch": 2.66
},
{
"loss": 7.5261,
"grad_norm": 0.9975460767745972,
"learning_rate": 0.00011376564277588168,
"epoch": 2.66
},
{
"loss": 7.9051,
"grad_norm": 1.051378607749939,
"learning_rate": 0.00011289052244683644,
"epoch": 2.66
},
{
"loss": 7.2571,
"grad_norm": 1.017866611480713,
"learning_rate": 0.00011201540211779119,
"epoch": 2.66
},
{
"loss": 7.6068,
"grad_norm": 1.1010361909866333,
"learning_rate": 0.00011114028178874595,
"epoch": 2.67
},
{
"loss": 7.6306,
"grad_norm": 0.9585467576980591,
"learning_rate": 0.00011026516145970071,
"epoch": 2.67
},
{
"loss": 7.3702,
"grad_norm": 0.9484645128250122,
"learning_rate": 0.00010939004113065546,
"epoch": 2.67
},
{
"loss": 7.4482,
"grad_norm": 1.0726372003555298,
"learning_rate": 0.00010851492080161022,
"epoch": 2.67
},
{
"loss": 7.7554,
"grad_norm": 0.8078585863113403,
"learning_rate": 0.00010763980047256499,
"epoch": 2.68
},
{
"loss": 7.3881,
"grad_norm": 0.9488946199417114,
"learning_rate": 0.00010676468014351973,
"epoch": 2.68
},
{
"loss": 7.7557,
"grad_norm": 0.8590677976608276,
"learning_rate": 0.0001058895598144745,
"epoch": 2.68
},
{
"loss": 7.2128,
"grad_norm": 0.8768866062164307,
"learning_rate": 0.00010501443948542926,
"epoch": 2.69
},
{
"loss": 7.6447,
"grad_norm": 1.1127121448516846,
"learning_rate": 0.00010413931915638399,
"epoch": 2.69
},
{
"loss": 7.7283,
"grad_norm": 0.7706397771835327,
"learning_rate": 0.00010326419882733875,
"epoch": 2.69
},
{
"loss": 7.8187,
"grad_norm": 0.910484254360199,
"learning_rate": 0.00010238907849829352,
"epoch": 2.69
},
{
"loss": 6.9677,
"grad_norm": 0.8292771577835083,
"learning_rate": 0.00010151395816924828,
"epoch": 2.7
},
{
"loss": 7.7939,
"grad_norm": 1.2936872243881226,
"learning_rate": 0.00010063883784020303,
"epoch": 2.7
},
{
"loss": 7.2773,
"grad_norm": 1.050876259803772,
"learning_rate": 9.976371751115779e-05,
"epoch": 2.7
},
{
"loss": 7.6461,
"grad_norm": 1.0275306701660156,
"learning_rate": 9.888859718211255e-05,
"epoch": 2.7
},
{
"loss": 7.4058,
"grad_norm": 0.9414623379707336,
"learning_rate": 9.80134768530673e-05,
"epoch": 2.71
},
{
"loss": 7.4938,
"grad_norm": 0.8367570042610168,
"learning_rate": 9.713835652402206e-05,
"epoch": 2.71
},
{
"loss": 7.4702,
"grad_norm": 0.9100292325019836,
"learning_rate": 9.626323619497682e-05,
"epoch": 2.71
},
{
"loss": 7.4209,
"grad_norm": 0.881262481212616,
"learning_rate": 9.538811586593157e-05,
"epoch": 2.71
},
{
"loss": 7.568,
"grad_norm": 1.0841021537780762,
"learning_rate": 9.451299553688632e-05,
"epoch": 2.72
},
{
"loss": 7.4385,
"grad_norm": 0.8553777933120728,
"learning_rate": 9.363787520784108e-05,
"epoch": 2.72
},
{
"loss": 7.7745,
"grad_norm": 0.8244187235832214,
"learning_rate": 9.276275487879583e-05,
"epoch": 2.72
},
{
"loss": 7.3427,
"grad_norm": 1.0330350399017334,
"learning_rate": 9.188763454975059e-05,
"epoch": 2.72
},
{
"loss": 7.4313,
"grad_norm": 0.86846524477005,
"learning_rate": 9.101251422070535e-05,
"epoch": 2.73
},
{
"loss": 7.6994,
"grad_norm": 1.0151475667953491,
"learning_rate": 9.01373938916601e-05,
"epoch": 2.73
},
{
"loss": 7.6994,
"grad_norm": 0.8053341507911682,
"learning_rate": 8.926227356261486e-05,
"epoch": 2.73
},
{
"loss": 7.8782,
"grad_norm": 0.917957067489624,
"learning_rate": 8.838715323356962e-05,
"epoch": 2.74
},
{
"loss": 7.5889,
"grad_norm": 1.556181788444519,
"learning_rate": 8.751203290452437e-05,
"epoch": 2.74
},
{
"loss": 7.6279,
"grad_norm": 1.043771743774414,
"learning_rate": 8.663691257547913e-05,
"epoch": 2.74
},
{
"loss": 7.8682,
"grad_norm": 1.1640032529830933,
"learning_rate": 8.57617922464339e-05,
"epoch": 2.74
},
{
"loss": 7.8918,
"grad_norm": 0.8830235600471497,
"learning_rate": 8.488667191738863e-05,
"epoch": 2.75
},
{
"loss": 7.5466,
"grad_norm": 0.958690345287323,
"learning_rate": 8.401155158834339e-05,
"epoch": 2.75
},
{
"loss": 7.5439,
"grad_norm": 1.1970360279083252,
"learning_rate": 8.313643125929815e-05,
"epoch": 2.75
},
{
"loss": 8.1002,
"grad_norm": 0.9388788938522339,
"learning_rate": 8.22613109302529e-05,
"epoch": 2.75
},
{
"loss": 7.6892,
"grad_norm": 1.0798841714859009,
"learning_rate": 8.138619060120766e-05,
"epoch": 2.76
},
{
"loss": 8.0534,
"grad_norm": 1.2909208536148071,
"learning_rate": 8.051107027216243e-05,
"epoch": 2.76
},
{
"loss": 7.3369,
"grad_norm": 1.272641658782959,
"learning_rate": 7.963594994311717e-05,
"epoch": 2.76
},
{
"loss": 7.5785,
"grad_norm": 0.9654033780097961,
"learning_rate": 7.876082961407194e-05,
"epoch": 2.76
},
{
"loss": 7.8078,
"grad_norm": 0.8423277139663696,
"learning_rate": 7.78857092850267e-05,
"epoch": 2.77
},
{
"loss": 7.8086,
"grad_norm": 0.9509181380271912,
"learning_rate": 7.701058895598145e-05,
"epoch": 2.77
},
{
"loss": 8.1405,
"grad_norm": 0.9167718291282654,
"learning_rate": 7.613546862693621e-05,
"epoch": 2.77
},
{
"loss": 7.7728,
"grad_norm": 0.9845168590545654,
"learning_rate": 7.526034829789097e-05,
"epoch": 2.77
},
{
"loss": 7.4146,
"grad_norm": 0.9597529768943787,
"learning_rate": 7.43852279688457e-05,
"epoch": 2.78
},
{
"loss": 7.0711,
"grad_norm": 1.0068391561508179,
"learning_rate": 7.351010763980047e-05,
"epoch": 2.78
},
{
"loss": 7.1173,
"grad_norm": 0.8510629534721375,
"learning_rate": 7.263498731075523e-05,
"epoch": 2.78
},
{
"loss": 7.1843,
"grad_norm": 0.8737899661064148,
"learning_rate": 7.175986698170999e-05,
"epoch": 2.79
},
{
"loss": 7.376,
"grad_norm": 0.9045628905296326,
"learning_rate": 7.088474665266474e-05,
"epoch": 2.79
},
{
"loss": 7.4447,
"grad_norm": 0.8932380080223083,
"learning_rate": 7.00096263236195e-05,
"epoch": 2.79
},
{
"loss": 7.362,
"grad_norm": 0.8961164951324463,
"learning_rate": 6.913450599457426e-05,
"epoch": 2.79
},
{
"loss": 7.4237,
"grad_norm": 1.0015422105789185,
"learning_rate": 6.825938566552901e-05,
"epoch": 2.8
},
{
"loss": 7.2541,
"grad_norm": 0.9842544198036194,
"learning_rate": 6.738426533648377e-05,
"epoch": 2.8
},
{
"loss": 8.0427,
"grad_norm": 1.0375638008117676,
"learning_rate": 6.650914500743853e-05,
"epoch": 2.8
},
{
"loss": 7.4801,
"grad_norm": 0.9552834630012512,
"learning_rate": 6.563402467839328e-05,
"epoch": 2.8
},
{
"loss": 7.8596,
"grad_norm": 0.8038078546524048,
"learning_rate": 6.475890434934803e-05,
"epoch": 2.81
},
{
"loss": 7.2653,
"grad_norm": 0.8008092045783997,
"learning_rate": 6.388378402030279e-05,
"epoch": 2.81
},
{
"loss": 7.159,
"grad_norm": 1.087442398071289,
"learning_rate": 6.300866369125754e-05,
"epoch": 2.81
},
{
"loss": 7.0556,
"grad_norm": 1.0442233085632324,
"learning_rate": 6.21335433622123e-05,
"epoch": 2.81
},
{
"loss": 7.162,
"grad_norm": 1.0271589756011963,
"learning_rate": 6.125842303316706e-05,
"epoch": 2.82
},
{
"loss": 7.5864,
"grad_norm": 0.9957409501075745,
"learning_rate": 6.038330270412182e-05,
"epoch": 2.82
},
{
"loss": 8.4511,
"grad_norm": 0.870765745639801,
"learning_rate": 5.9508182375076575e-05,
"epoch": 2.82
},
{
"loss": 7.9488,
"grad_norm": 0.8632308840751648,
"learning_rate": 5.863306204603133e-05,
"epoch": 2.82
},
{
"loss": 8.1216,
"grad_norm": 1.1113914251327515,
"learning_rate": 5.775794171698609e-05,
"epoch": 2.83
},
{
"loss": 7.7049,
"grad_norm": 0.9410499334335327,
"learning_rate": 5.688282138794084e-05,
"epoch": 2.83
},
{
"loss": 7.6916,
"grad_norm": 0.8908835053443909,
"learning_rate": 5.6007701058895595e-05,
"epoch": 2.83
},
{
"loss": 7.5659,
"grad_norm": 0.7924339175224304,
"learning_rate": 5.513258072985036e-05,
"epoch": 2.83
},
{
"loss": 7.4359,
"grad_norm": 0.8098507523536682,
"learning_rate": 5.425746040080511e-05,
"epoch": 2.84
},
{
"loss": 7.4043,
"grad_norm": 0.8541660904884338,
"learning_rate": 5.338234007175987e-05,
"epoch": 2.84
},
{
"loss": 7.5664,
"grad_norm": 0.9474323987960815,
"learning_rate": 5.250721974271463e-05,
"epoch": 2.84
},
{
"loss": 7.7903,
"grad_norm": 1.0568387508392334,
"learning_rate": 5.163209941366938e-05,
"epoch": 2.85
},
{
"loss": 7.4216,
"grad_norm": 0.9031184315681458,
"learning_rate": 5.075697908462414e-05,
"epoch": 2.85
},
{
"loss": 7.5944,
"grad_norm": 0.8136922121047974,
"learning_rate": 4.9881858755578894e-05,
"epoch": 2.85
},
{
"loss": 7.6272,
"grad_norm": 1.1002339124679565,
"learning_rate": 4.900673842653365e-05,
"epoch": 2.85
},
{
"loss": 7.5846,
"grad_norm": 1.2232916355133057,
"learning_rate": 4.813161809748841e-05,
"epoch": 2.86
},
{
"loss": 7.8478,
"grad_norm": 0.8891430497169495,
"learning_rate": 4.725649776844316e-05,
"epoch": 2.86
},
{
"loss": 7.3,
"grad_norm": 0.9129414558410645,
"learning_rate": 4.6381377439397914e-05,
"epoch": 2.86
},
{
"loss": 7.4529,
"grad_norm": 0.7938532829284668,
"learning_rate": 4.5506257110352676e-05,
"epoch": 2.86
},
{
"loss": 7.2803,
"grad_norm": 0.9501358270645142,
"learning_rate": 4.463113678130743e-05,
"epoch": 2.87
},
{
"loss": 7.5943,
"grad_norm": 1.0423897504806519,
"learning_rate": 4.3756016452262186e-05,
"epoch": 2.87
},
{
"loss": 7.2376,
"grad_norm": 0.9883305430412292,
"learning_rate": 4.288089612321695e-05,
"epoch": 2.87
},
{
"loss": 7.6255,
"grad_norm": 0.9974358677864075,
"learning_rate": 4.2005775794171696e-05,
"epoch": 2.87
},
{
"loss": 7.2739,
"grad_norm": 0.9481905102729797,
"learning_rate": 4.113065546512645e-05,
"epoch": 2.88
},
{
"loss": 8.0077,
"grad_norm": 1.067797064781189,
"learning_rate": 4.025553513608121e-05,
"epoch": 2.88
},
{
"loss": 7.5522,
"grad_norm": 0.8410007953643799,
"learning_rate": 3.938041480703597e-05,
"epoch": 2.88
},
{
"loss": 7.5854,
"grad_norm": 0.847583532333374,
"learning_rate": 3.8505294477990723e-05,
"epoch": 2.88
},
{
"loss": 7.2142,
"grad_norm": 1.0279533863067627,
"learning_rate": 3.7630174148945485e-05,
"epoch": 2.89
},
{
"loss": 7.4712,
"grad_norm": 1.1256965398788452,
"learning_rate": 3.6755053819900234e-05,
"epoch": 2.89
},
{
"loss": 7.7947,
"grad_norm": 1.0278571844100952,
"learning_rate": 3.5879933490854995e-05,
"epoch": 2.89
},
{
"loss": 7.3523,
"grad_norm": 0.9609654545783997,
"learning_rate": 3.500481316180975e-05,
"epoch": 2.9
},
{
"loss": 7.3334,
"grad_norm": 0.8453736901283264,
"learning_rate": 3.4129692832764505e-05,
"epoch": 2.9
},
{
"loss": 7.177,
"grad_norm": 0.8161653280258179,
"learning_rate": 3.325457250371927e-05,
"epoch": 2.9
},
{
"loss": 7.9061,
"grad_norm": 0.9861032366752625,
"learning_rate": 3.2379452174674016e-05,
"epoch": 2.9
},
{
"loss": 7.3155,
"grad_norm": 1.1409838199615479,
"learning_rate": 3.150433184562877e-05,
"epoch": 2.91
},
{
"loss": 7.7667,
"grad_norm": 0.8848074078559875,
"learning_rate": 3.062921151658353e-05,
"epoch": 2.91
},
{
"loss": 7.2722,
"grad_norm": 0.8996227979660034,
"learning_rate": 2.9754091187538288e-05,
"epoch": 2.91
},
{
"loss": 7.4819,
"grad_norm": 0.9429714679718018,
"learning_rate": 2.8878970858493046e-05,
"epoch": 2.91
},
{
"loss": 7.4154,
"grad_norm": 1.1169899702072144,
"learning_rate": 2.8003850529447798e-05,
"epoch": 2.92
},
{
"loss": 7.7691,
"grad_norm": 0.8326570987701416,
"learning_rate": 2.7128730200402556e-05,
"epoch": 2.92
},
{
"loss": 7.6196,
"grad_norm": 0.9243487119674683,
"learning_rate": 2.6253609871357314e-05,
"epoch": 2.92
},
{
"loss": 7.6529,
"grad_norm": 0.8465039730072021,
"learning_rate": 2.537848954231207e-05,
"epoch": 2.92
},
{
"loss": 7.2623,
"grad_norm": 1.0216766595840454,
"learning_rate": 2.4503369213266825e-05,
"epoch": 2.93
},
{
"loss": 7.5628,
"grad_norm": 0.9314711689949036,
"learning_rate": 2.362824888422158e-05,
"epoch": 2.93
},
{
"loss": 7.9252,
"grad_norm": 0.8769168853759766,
"learning_rate": 2.2753128555176338e-05,
"epoch": 2.93
},
{
"loss": 7.2971,
"grad_norm": 0.8925982713699341,
"learning_rate": 2.1878008226131093e-05,
"epoch": 2.93
},
{
"loss": 7.1022,
"grad_norm": 0.937786340713501,
"learning_rate": 2.1002887897085848e-05,
"epoch": 2.94
},
{
"loss": 7.5253,
"grad_norm": 0.900693416595459,
"learning_rate": 2.0127767568040607e-05,
"epoch": 2.94
},
{
"loss": 7.5837,
"grad_norm": 0.9113482236862183,
"learning_rate": 1.9252647238995362e-05,
"epoch": 2.94
},
{
"loss": 7.7925,
"grad_norm": 0.8734735250473022,
"learning_rate": 1.8377526909950117e-05,
"epoch": 2.95
},
{
"loss": 7.5821,
"grad_norm": 0.8616068959236145,
"learning_rate": 1.7502406580904875e-05,
"epoch": 2.95
},
{
"loss": 6.7659,
"grad_norm": 0.8509213328361511,
"learning_rate": 1.6627286251859634e-05,
"epoch": 2.95
},
{
"loss": 7.9045,
"grad_norm": 0.8518444895744324,
"learning_rate": 1.5752165922814385e-05,
"epoch": 2.95
},
{
"loss": 7.2314,
"grad_norm": 1.1429413557052612,
"learning_rate": 1.4877045593769144e-05,
"epoch": 2.96
},
{
"loss": 7.5707,
"grad_norm": 0.825677752494812,
"learning_rate": 1.4001925264723899e-05,
"epoch": 2.96
},
{
"loss": 7.2231,
"grad_norm": 0.9227612018585205,
"learning_rate": 1.3126804935678657e-05,
"epoch": 2.96
},
{
"loss": 7.3015,
"grad_norm": 0.9745140671730042,
"learning_rate": 1.2251684606633412e-05,
"epoch": 2.96
},
{
"loss": 7.5931,
"grad_norm": 0.8096091151237488,
"learning_rate": 1.1376564277588169e-05,
"epoch": 2.97
},
{
"loss": 7.2393,
"grad_norm": 0.9233807921409607,
"learning_rate": 1.0501443948542924e-05,
"epoch": 2.97
},
{
"loss": 7.2871,
"grad_norm": 0.7690852880477905,
"learning_rate": 9.626323619497681e-06,
"epoch": 2.97
},
{
"loss": 7.5845,
"grad_norm": 0.882102370262146,
"learning_rate": 8.751203290452438e-06,
"epoch": 2.97
},
{
"loss": 7.2335,
"grad_norm": 0.887958288192749,
"learning_rate": 7.876082961407193e-06,
"epoch": 2.98
},
{
"loss": 7.5324,
"grad_norm": 0.8895597457885742,
"learning_rate": 7.000962632361949e-06,
"epoch": 2.98
},
{
"loss": 7.1145,
"grad_norm": 0.8137519955635071,
"learning_rate": 6.125842303316706e-06,
"epoch": 2.98
},
{
"loss": 7.411,
"grad_norm": 0.9460362195968628,
"learning_rate": 5.250721974271462e-06,
"epoch": 2.98
},
{
"loss": 7.6058,
"grad_norm": 0.9842742681503296,
"learning_rate": 4.375601645226219e-06,
"epoch": 2.99
},
{
"loss": 7.6927,
"grad_norm": 0.938562273979187,
"learning_rate": 3.5004813161809747e-06,
"epoch": 2.99
},
{
"loss": 7.6995,
"grad_norm": 0.9931243658065796,
"learning_rate": 2.625360987135731e-06,
"epoch": 2.99
},
{
"loss": 7.5572,
"grad_norm": 0.8916573524475098,
"learning_rate": 1.7502406580904874e-06,
"epoch": 3.0
},
{
"loss": 7.2556,
"grad_norm": 0.780832052230835,
"learning_rate": 8.751203290452437e-07,
"epoch": 3.0
},
{
"train_runtime": 112786.1501,
"train_samples_per_second": 3.243,
"train_steps_per_second": 0.101,
"train_loss": 8.874524852365107,
"epoch": 3.0
}
]