v3c_mistral_lora / last-checkpoint / trainer_state.json
Commit 36134be (verified): Training in progress, step 808, checkpoint
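
The JSON below is the raw Trainer state for this checkpoint. As a quick way to inspect it, here is a minimal Python sketch (not part of the checkpoint itself) that loads the file and prints the periodic evaluation metrics recorded in log_history every eval_steps (20) training steps; the local file path is an assumption about where the checkpoint was downloaded.

# Minimal sketch: tabulate the evaluation entries from trainer_state.json.
# Assumes the checkpoint folder was downloaded locally as "last-checkpoint/".
import json

with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

# log_history mixes per-step training entries (loss, grad_norm, learning_rate)
# with evaluation entries (keys prefixed with "eval_"); keep only the latter.
eval_entries = [e for e in state["log_history"] if "eval_loss" in e]

for e in eval_entries:
    print(f'step {e["step"]:>4}  '
          f'loss {e["eval_loss"]:.4f}  '
          f'acc {e["eval_accuracy"]:.4f}  '
          f'f1 {e["eval_f1"]:.4f}')
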
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 20,
"global_step": 808,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0,
"eval_accuracy": 0.7339246119733924,
"eval_f1": 0.24528301886792453,
"eval_loss": 0.6025775074958801,
"eval_precision": 0.6,
"eval_recall": 0.1541501976284585,
"eval_runtime": 47.5679,
"eval_samples_per_second": 5.802,
"eval_steps_per_second": 0.189,
"step": 0
},
{
"epoch": 0.0012376237623762376,
"grad_norm": 2.056412935256958,
"learning_rate": 2.469135802469136e-07,
"loss": 0.6505,
"step": 1
},
{
"epoch": 0.0024752475247524753,
"grad_norm": 2.1361210346221924,
"learning_rate": 4.938271604938272e-07,
"loss": 0.7395,
"step": 2
},
{
"epoch": 0.0037128712871287127,
"grad_norm": 2.2638471126556396,
"learning_rate": 7.407407407407407e-07,
"loss": 0.6948,
"step": 3
},
{
"epoch": 0.0049504950495049506,
"grad_norm": 1.881201148033142,
"learning_rate": 9.876543209876544e-07,
"loss": 0.6427,
"step": 4
},
{
"epoch": 0.006188118811881188,
"grad_norm": 2.1328437328338623,
"learning_rate": 1.234567901234568e-06,
"loss": 0.6554,
"step": 5
},
{
"epoch": 0.007425742574257425,
"grad_norm": 2.2691922187805176,
"learning_rate": 1.4814814814814815e-06,
"loss": 0.7034,
"step": 6
},
{
"epoch": 0.008663366336633664,
"grad_norm": 2.424414873123169,
"learning_rate": 1.7283950617283952e-06,
"loss": 0.6598,
"step": 7
},
{
"epoch": 0.009900990099009901,
"grad_norm": 2.1118245124816895,
"learning_rate": 1.9753086419753087e-06,
"loss": 0.668,
"step": 8
},
{
"epoch": 0.011138613861386138,
"grad_norm": 1.8890514373779297,
"learning_rate": 2.222222222222222e-06,
"loss": 0.6658,
"step": 9
},
{
"epoch": 0.012376237623762377,
"grad_norm": 2.2101762294769287,
"learning_rate": 2.469135802469136e-06,
"loss": 0.6984,
"step": 10
},
{
"epoch": 0.013613861386138614,
"grad_norm": 2.1789631843566895,
"learning_rate": 2.7160493827160496e-06,
"loss": 0.6483,
"step": 11
},
{
"epoch": 0.01485148514851485,
"grad_norm": 2.1754183769226074,
"learning_rate": 2.962962962962963e-06,
"loss": 0.6328,
"step": 12
},
{
"epoch": 0.01608910891089109,
"grad_norm": 1.9709060192108154,
"learning_rate": 3.2098765432098767e-06,
"loss": 0.6425,
"step": 13
},
{
"epoch": 0.017326732673267328,
"grad_norm": 2.338000535964966,
"learning_rate": 3.4567901234567904e-06,
"loss": 0.7665,
"step": 14
},
{
"epoch": 0.018564356435643563,
"grad_norm": 1.9738425016403198,
"learning_rate": 3.7037037037037037e-06,
"loss": 0.6994,
"step": 15
},
{
"epoch": 0.019801980198019802,
"grad_norm": 1.9872663021087646,
"learning_rate": 3.9506172839506175e-06,
"loss": 0.6101,
"step": 16
},
{
"epoch": 0.02103960396039604,
"grad_norm": 1.9945553541183472,
"learning_rate": 4.197530864197531e-06,
"loss": 0.641,
"step": 17
},
{
"epoch": 0.022277227722772276,
"grad_norm": 2.1487791538238525,
"learning_rate": 4.444444444444444e-06,
"loss": 0.6871,
"step": 18
},
{
"epoch": 0.023514851485148515,
"grad_norm": 2.6171352863311768,
"learning_rate": 4.691358024691358e-06,
"loss": 0.6863,
"step": 19
},
{
"epoch": 0.024752475247524754,
"grad_norm": 1.7834933996200562,
"learning_rate": 4.938271604938272e-06,
"loss": 0.6391,
"step": 20
},
{
"epoch": 0.024752475247524754,
"eval_accuracy": 0.7361419068736141,
"eval_f1": 0.25625,
"eval_loss": 0.5953530669212341,
"eval_precision": 0.6119402985074627,
"eval_recall": 0.16205533596837945,
"eval_runtime": 50.5471,
"eval_samples_per_second": 5.46,
"eval_steps_per_second": 0.178,
"step": 20
},
{
"epoch": 0.02599009900990099,
"grad_norm": 2.140673875808716,
"learning_rate": 5.185185185185185e-06,
"loss": 0.6099,
"step": 21
},
{
"epoch": 0.027227722772277228,
"grad_norm": 1.9627602100372314,
"learning_rate": 5.432098765432099e-06,
"loss": 0.6677,
"step": 22
},
{
"epoch": 0.028465346534653466,
"grad_norm": 1.9993869066238403,
"learning_rate": 5.6790123456790125e-06,
"loss": 0.6015,
"step": 23
},
{
"epoch": 0.0297029702970297,
"grad_norm": 1.7692540884017944,
"learning_rate": 5.925925925925926e-06,
"loss": 0.5969,
"step": 24
},
{
"epoch": 0.03094059405940594,
"grad_norm": 2.137422561645508,
"learning_rate": 6.17283950617284e-06,
"loss": 0.6501,
"step": 25
},
{
"epoch": 0.03217821782178218,
"grad_norm": 1.9657728672027588,
"learning_rate": 6.419753086419753e-06,
"loss": 0.6085,
"step": 26
},
{
"epoch": 0.03341584158415842,
"grad_norm": 1.7881442308425903,
"learning_rate": 6.666666666666667e-06,
"loss": 0.635,
"step": 27
},
{
"epoch": 0.034653465346534656,
"grad_norm": 2.832048177719116,
"learning_rate": 6.913580246913581e-06,
"loss": 0.7251,
"step": 28
},
{
"epoch": 0.03589108910891089,
"grad_norm": 1.9947174787521362,
"learning_rate": 7.160493827160494e-06,
"loss": 0.6394,
"step": 29
},
{
"epoch": 0.03712871287128713,
"grad_norm": 2.0211126804351807,
"learning_rate": 7.4074074074074075e-06,
"loss": 0.6082,
"step": 30
},
{
"epoch": 0.038366336633663366,
"grad_norm": 1.9397317171096802,
"learning_rate": 7.654320987654322e-06,
"loss": 0.6465,
"step": 31
},
{
"epoch": 0.039603960396039604,
"grad_norm": 2.2408998012542725,
"learning_rate": 7.901234567901235e-06,
"loss": 0.643,
"step": 32
},
{
"epoch": 0.04084158415841584,
"grad_norm": 1.9772993326187134,
"learning_rate": 8.148148148148148e-06,
"loss": 0.618,
"step": 33
},
{
"epoch": 0.04207920792079208,
"grad_norm": 1.6278493404388428,
"learning_rate": 8.395061728395062e-06,
"loss": 0.6425,
"step": 34
},
{
"epoch": 0.043316831683168314,
"grad_norm": 1.9789159297943115,
"learning_rate": 8.641975308641975e-06,
"loss": 0.6046,
"step": 35
},
{
"epoch": 0.04455445544554455,
"grad_norm": 1.801087498664856,
"learning_rate": 8.888888888888888e-06,
"loss": 0.6561,
"step": 36
},
{
"epoch": 0.04579207920792079,
"grad_norm": 1.5089136362075806,
"learning_rate": 9.135802469135803e-06,
"loss": 0.5883,
"step": 37
},
{
"epoch": 0.04702970297029703,
"grad_norm": 1.676107406616211,
"learning_rate": 9.382716049382717e-06,
"loss": 0.5684,
"step": 38
},
{
"epoch": 0.04826732673267327,
"grad_norm": 1.8138374090194702,
"learning_rate": 9.62962962962963e-06,
"loss": 0.6034,
"step": 39
},
{
"epoch": 0.04950495049504951,
"grad_norm": 1.7539325952529907,
"learning_rate": 9.876543209876543e-06,
"loss": 0.5891,
"step": 40
},
{
"epoch": 0.04950495049504951,
"eval_accuracy": 0.7549889135254989,
"eval_f1": 0.4318766066838046,
"eval_loss": 0.556958794593811,
"eval_precision": 0.6176470588235294,
"eval_recall": 0.33201581027667987,
"eval_runtime": 48.6708,
"eval_samples_per_second": 5.671,
"eval_steps_per_second": 0.185,
"step": 40
},
{
"epoch": 0.050742574257425746,
"grad_norm": 1.4187287092208862,
"learning_rate": 1.0123456790123458e-05,
"loss": 0.5636,
"step": 41
},
{
"epoch": 0.05198019801980198,
"grad_norm": 1.9447287321090698,
"learning_rate": 1.037037037037037e-05,
"loss": 0.5496,
"step": 42
},
{
"epoch": 0.053217821782178217,
"grad_norm": 1.6454174518585205,
"learning_rate": 1.0617283950617285e-05,
"loss": 0.5807,
"step": 43
},
{
"epoch": 0.054455445544554455,
"grad_norm": 1.7853933572769165,
"learning_rate": 1.0864197530864198e-05,
"loss": 0.6028,
"step": 44
},
{
"epoch": 0.055693069306930694,
"grad_norm": 1.6090970039367676,
"learning_rate": 1.1111111111111113e-05,
"loss": 0.5838,
"step": 45
},
{
"epoch": 0.05693069306930693,
"grad_norm": 2.3328471183776855,
"learning_rate": 1.1358024691358025e-05,
"loss": 0.5993,
"step": 46
},
{
"epoch": 0.05816831683168317,
"grad_norm": 2.4744842052459717,
"learning_rate": 1.160493827160494e-05,
"loss": 0.6092,
"step": 47
},
{
"epoch": 0.0594059405940594,
"grad_norm": 1.7244300842285156,
"learning_rate": 1.1851851851851852e-05,
"loss": 0.5969,
"step": 48
},
{
"epoch": 0.06064356435643564,
"grad_norm": 1.6698678731918335,
"learning_rate": 1.2098765432098767e-05,
"loss": 0.5254,
"step": 49
},
{
"epoch": 0.06188118811881188,
"grad_norm": 1.591994285583496,
"learning_rate": 1.234567901234568e-05,
"loss": 0.5509,
"step": 50
},
{
"epoch": 0.06311881188118812,
"grad_norm": 1.9688084125518799,
"learning_rate": 1.2592592592592593e-05,
"loss": 0.5232,
"step": 51
},
{
"epoch": 0.06435643564356436,
"grad_norm": 2.0831687450408936,
"learning_rate": 1.2839506172839507e-05,
"loss": 0.5141,
"step": 52
},
{
"epoch": 0.0655940594059406,
"grad_norm": 2.0480973720550537,
"learning_rate": 1.3086419753086422e-05,
"loss": 0.5669,
"step": 53
},
{
"epoch": 0.06683168316831684,
"grad_norm": 1.5781453847885132,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.5065,
"step": 54
},
{
"epoch": 0.06806930693069307,
"grad_norm": 2.123061180114746,
"learning_rate": 1.3580246913580248e-05,
"loss": 0.4856,
"step": 55
},
{
"epoch": 0.06930693069306931,
"grad_norm": 2.2889890670776367,
"learning_rate": 1.3827160493827162e-05,
"loss": 0.4936,
"step": 56
},
{
"epoch": 0.07054455445544554,
"grad_norm": 2.201887607574463,
"learning_rate": 1.4074074074074075e-05,
"loss": 0.538,
"step": 57
},
{
"epoch": 0.07178217821782178,
"grad_norm": 1.8556184768676758,
"learning_rate": 1.4320987654320988e-05,
"loss": 0.5091,
"step": 58
},
{
"epoch": 0.07301980198019802,
"grad_norm": 1.5986840724945068,
"learning_rate": 1.4567901234567903e-05,
"loss": 0.4939,
"step": 59
},
{
"epoch": 0.07425742574257425,
"grad_norm": 2.35420560836792,
"learning_rate": 1.4814814814814815e-05,
"loss": 0.4606,
"step": 60
},
{
"epoch": 0.07425742574257425,
"eval_accuracy": 0.779379157427938,
"eval_f1": 0.5204819277108433,
"eval_loss": 0.4962254464626312,
"eval_precision": 0.6666666666666666,
"eval_recall": 0.4268774703557312,
"eval_runtime": 47.7725,
"eval_samples_per_second": 5.777,
"eval_steps_per_second": 0.188,
"step": 60
},
{
"epoch": 0.07549504950495049,
"grad_norm": 2.571995496749878,
"learning_rate": 1.506172839506173e-05,
"loss": 0.538,
"step": 61
},
{
"epoch": 0.07673267326732673,
"grad_norm": 2.467172622680664,
"learning_rate": 1.5308641975308643e-05,
"loss": 0.5176,
"step": 62
},
{
"epoch": 0.07797029702970297,
"grad_norm": 1.9836307764053345,
"learning_rate": 1.555555555555556e-05,
"loss": 0.544,
"step": 63
},
{
"epoch": 0.07920792079207921,
"grad_norm": 1.576439380645752,
"learning_rate": 1.580246913580247e-05,
"loss": 0.4453,
"step": 64
},
{
"epoch": 0.08044554455445545,
"grad_norm": 1.6136027574539185,
"learning_rate": 1.6049382716049385e-05,
"loss": 0.46,
"step": 65
},
{
"epoch": 0.08168316831683169,
"grad_norm": 2.130403518676758,
"learning_rate": 1.6296296296296297e-05,
"loss": 0.4797,
"step": 66
},
{
"epoch": 0.08292079207920793,
"grad_norm": 2.6445112228393555,
"learning_rate": 1.654320987654321e-05,
"loss": 0.5095,
"step": 67
},
{
"epoch": 0.08415841584158416,
"grad_norm": 2.384965658187866,
"learning_rate": 1.6790123456790123e-05,
"loss": 0.478,
"step": 68
},
{
"epoch": 0.0853960396039604,
"grad_norm": 1.9021402597427368,
"learning_rate": 1.7037037037037038e-05,
"loss": 0.4508,
"step": 69
},
{
"epoch": 0.08663366336633663,
"grad_norm": 2.2608911991119385,
"learning_rate": 1.728395061728395e-05,
"loss": 0.4828,
"step": 70
},
{
"epoch": 0.08787128712871287,
"grad_norm": 2.5560309886932373,
"learning_rate": 1.7530864197530865e-05,
"loss": 0.4429,
"step": 71
},
{
"epoch": 0.0891089108910891,
"grad_norm": 3.586392879486084,
"learning_rate": 1.7777777777777777e-05,
"loss": 0.393,
"step": 72
},
{
"epoch": 0.09034653465346534,
"grad_norm": 2.5128958225250244,
"learning_rate": 1.802469135802469e-05,
"loss": 0.4795,
"step": 73
},
{
"epoch": 0.09158415841584158,
"grad_norm": 2.255323886871338,
"learning_rate": 1.8271604938271607e-05,
"loss": 0.3733,
"step": 74
},
{
"epoch": 0.09282178217821782,
"grad_norm": 1.9865373373031616,
"learning_rate": 1.851851851851852e-05,
"loss": 0.3899,
"step": 75
},
{
"epoch": 0.09405940594059406,
"grad_norm": 2.985546588897705,
"learning_rate": 1.8765432098765433e-05,
"loss": 0.3784,
"step": 76
},
{
"epoch": 0.0952970297029703,
"grad_norm": 3.0742247104644775,
"learning_rate": 1.901234567901235e-05,
"loss": 0.4457,
"step": 77
},
{
"epoch": 0.09653465346534654,
"grad_norm": 2.365544319152832,
"learning_rate": 1.925925925925926e-05,
"loss": 0.3507,
"step": 78
},
{
"epoch": 0.09777227722772278,
"grad_norm": 3.4621968269348145,
"learning_rate": 1.9506172839506175e-05,
"loss": 0.405,
"step": 79
},
{
"epoch": 0.09900990099009901,
"grad_norm": 3.251645088195801,
"learning_rate": 1.9753086419753087e-05,
"loss": 0.4229,
"step": 80
},
{
"epoch": 0.09900990099009901,
"eval_accuracy": 0.7904656319290465,
"eval_f1": 0.5771812080536913,
"eval_loss": 0.4432809352874756,
"eval_precision": 0.6649484536082474,
"eval_recall": 0.5098814229249012,
"eval_runtime": 48.2096,
"eval_samples_per_second": 5.725,
"eval_steps_per_second": 0.187,
"step": 80
},
{
"epoch": 0.10024752475247525,
"grad_norm": 3.5432498455047607,
"learning_rate": 2e-05,
"loss": 0.3498,
"step": 81
},
{
"epoch": 0.10148514851485149,
"grad_norm": 4.109142303466797,
"learning_rate": 1.9999906631527858e-05,
"loss": 0.3289,
"step": 82
},
{
"epoch": 0.10272277227722772,
"grad_norm": 3.4147417545318604,
"learning_rate": 1.9999626527854966e-05,
"loss": 0.2813,
"step": 83
},
{
"epoch": 0.10396039603960396,
"grad_norm": 5.5374436378479,
"learning_rate": 1.9999159694211894e-05,
"loss": 0.3393,
"step": 84
},
{
"epoch": 0.1051980198019802,
"grad_norm": 4.537343502044678,
"learning_rate": 1.999850613931615e-05,
"loss": 0.4392,
"step": 85
},
{
"epoch": 0.10643564356435643,
"grad_norm": 3.075702428817749,
"learning_rate": 1.999766587537202e-05,
"loss": 0.3329,
"step": 86
},
{
"epoch": 0.10767326732673267,
"grad_norm": 6.164308071136475,
"learning_rate": 1.9996638918070336e-05,
"loss": 0.3292,
"step": 87
},
{
"epoch": 0.10891089108910891,
"grad_norm": 3.1993377208709717,
"learning_rate": 1.9995425286588187e-05,
"loss": 0.318,
"step": 88
},
{
"epoch": 0.11014851485148515,
"grad_norm": 3.789552927017212,
"learning_rate": 1.9994025003588547e-05,
"loss": 0.3504,
"step": 89
},
{
"epoch": 0.11138613861386139,
"grad_norm": 4.15277624130249,
"learning_rate": 1.9992438095219886e-05,
"loss": 0.2838,
"step": 90
},
{
"epoch": 0.11262376237623763,
"grad_norm": 3.4878060817718506,
"learning_rate": 1.9990664591115637e-05,
"loss": 0.3165,
"step": 91
},
{
"epoch": 0.11386138613861387,
"grad_norm": 5.2607035636901855,
"learning_rate": 1.9988704524393678e-05,
"loss": 0.3229,
"step": 92
},
{
"epoch": 0.1150990099009901,
"grad_norm": 6.290886878967285,
"learning_rate": 1.9986557931655688e-05,
"loss": 0.3629,
"step": 93
},
{
"epoch": 0.11633663366336634,
"grad_norm": 7.600953102111816,
"learning_rate": 1.9984224852986494e-05,
"loss": 0.3405,
"step": 94
},
{
"epoch": 0.11757425742574257,
"grad_norm": 4.730844974517822,
"learning_rate": 1.9981705331953295e-05,
"loss": 0.3718,
"step": 95
},
{
"epoch": 0.1188118811881188,
"grad_norm": 5.086641788482666,
"learning_rate": 1.9978999415604847e-05,
"loss": 0.2757,
"step": 96
},
{
"epoch": 0.12004950495049505,
"grad_norm": 6.739199161529541,
"learning_rate": 1.9976107154470613e-05,
"loss": 0.2859,
"step": 97
},
{
"epoch": 0.12128712871287128,
"grad_norm": 4.352366924285889,
"learning_rate": 1.9973028602559787e-05,
"loss": 0.3398,
"step": 98
},
{
"epoch": 0.12252475247524752,
"grad_norm": 7.858609199523926,
"learning_rate": 1.9969763817360314e-05,
"loss": 0.471,
"step": 99
},
{
"epoch": 0.12376237623762376,
"grad_norm": 5.571165561676025,
"learning_rate": 1.996631285983779e-05,
"loss": 0.3836,
"step": 100
},
{
"epoch": 0.12376237623762376,
"eval_accuracy": 0.8159645232815964,
"eval_f1": 0.6047619047619047,
"eval_loss": 0.42972368001937866,
"eval_precision": 0.7604790419161677,
"eval_recall": 0.5019762845849802,
"eval_runtime": 48.4236,
"eval_samples_per_second": 5.7,
"eval_steps_per_second": 0.186,
"step": 100
},
{
"epoch": 0.125,
"grad_norm": 4.134688854217529,
"learning_rate": 1.9962675794434342e-05,
"loss": 0.2516,
"step": 101
},
{
"epoch": 0.12623762376237624,
"grad_norm": 3.988821506500244,
"learning_rate": 1.9958852689067423e-05,
"loss": 0.2509,
"step": 102
},
{
"epoch": 0.12747524752475248,
"grad_norm": 5.836869716644287,
"learning_rate": 1.9954843615128528e-05,
"loss": 0.3183,
"step": 103
},
{
"epoch": 0.12871287128712872,
"grad_norm": 9.7975492477417,
"learning_rate": 1.995064864748188e-05,
"loss": 0.3471,
"step": 104
},
{
"epoch": 0.12995049504950495,
"grad_norm": 5.1211066246032715,
"learning_rate": 1.9946267864463027e-05,
"loss": 0.3466,
"step": 105
},
{
"epoch": 0.1311881188118812,
"grad_norm": 5.172476291656494,
"learning_rate": 1.994170134787737e-05,
"loss": 0.3442,
"step": 106
},
{
"epoch": 0.13242574257425743,
"grad_norm": 4.703874111175537,
"learning_rate": 1.993694918299864e-05,
"loss": 0.3027,
"step": 107
},
{
"epoch": 0.13366336633663367,
"grad_norm": 3.981438398361206,
"learning_rate": 1.9932011458567315e-05,
"loss": 0.2803,
"step": 108
},
{
"epoch": 0.1349009900990099,
"grad_norm": 3.627497911453247,
"learning_rate": 1.9926888266788955e-05,
"loss": 0.3011,
"step": 109
},
{
"epoch": 0.13613861386138615,
"grad_norm": 5.726022720336914,
"learning_rate": 1.9921579703332475e-05,
"loss": 0.3463,
"step": 110
},
{
"epoch": 0.1373762376237624,
"grad_norm": 3.9661319255828857,
"learning_rate": 1.991608586732837e-05,
"loss": 0.3455,
"step": 111
},
{
"epoch": 0.13861386138613863,
"grad_norm": 4.330716133117676,
"learning_rate": 1.991040686136685e-05,
"loss": 0.2888,
"step": 112
},
{
"epoch": 0.13985148514851486,
"grad_norm": 2.6466479301452637,
"learning_rate": 1.9904542791495938e-05,
"loss": 0.2423,
"step": 113
},
{
"epoch": 0.14108910891089108,
"grad_norm": 3.5607573986053467,
"learning_rate": 1.9898493767219486e-05,
"loss": 0.2481,
"step": 114
},
{
"epoch": 0.14232673267326731,
"grad_norm": 3.259629011154175,
"learning_rate": 1.989225990149512e-05,
"loss": 0.2707,
"step": 115
},
{
"epoch": 0.14356435643564355,
"grad_norm": 3.952185869216919,
"learning_rate": 1.988584131073215e-05,
"loss": 0.2607,
"step": 116
},
{
"epoch": 0.1448019801980198,
"grad_norm": 2.9898970127105713,
"learning_rate": 1.9879238114789375e-05,
"loss": 0.2234,
"step": 117
},
{
"epoch": 0.14603960396039603,
"grad_norm": 3.857395648956299,
"learning_rate": 1.9872450436972856e-05,
"loss": 0.2691,
"step": 118
},
{
"epoch": 0.14727722772277227,
"grad_norm": 4.034820079803467,
"learning_rate": 1.986547840403362e-05,
"loss": 0.3632,
"step": 119
},
{
"epoch": 0.1485148514851485,
"grad_norm": 3.5433619022369385,
"learning_rate": 1.9858322146165272e-05,
"loss": 0.3363,
"step": 120
},
{
"epoch": 0.1485148514851485,
"eval_accuracy": 0.8381374722838137,
"eval_f1": 0.6666666666666666,
"eval_loss": 0.36761781573295593,
"eval_precision": 0.7891891891891892,
"eval_recall": 0.5770750988142292,
"eval_runtime": 48.4565,
"eval_samples_per_second": 5.696,
"eval_steps_per_second": 0.186,
"step": 120
},
{
"epoch": 0.14975247524752475,
"grad_norm": 4.58292818069458,
"learning_rate": 1.9850981797001593e-05,
"loss": 0.2657,
"step": 121
},
{
"epoch": 0.15099009900990099,
"grad_norm": 4.649030685424805,
"learning_rate": 1.9843457493614016e-05,
"loss": 0.2851,
"step": 122
},
{
"epoch": 0.15222772277227722,
"grad_norm": 4.370965957641602,
"learning_rate": 1.9835749376509084e-05,
"loss": 0.2917,
"step": 123
},
{
"epoch": 0.15346534653465346,
"grad_norm": 5.558561325073242,
"learning_rate": 1.9827857589625817e-05,
"loss": 0.2922,
"step": 124
},
{
"epoch": 0.1547029702970297,
"grad_norm": 3.4896552562713623,
"learning_rate": 1.981978228033304e-05,
"loss": 0.2478,
"step": 125
},
{
"epoch": 0.15594059405940594,
"grad_norm": 5.457974910736084,
"learning_rate": 1.9811523599426604e-05,
"loss": 0.3341,
"step": 126
},
{
"epoch": 0.15717821782178218,
"grad_norm": 3.6488845348358154,
"learning_rate": 1.980308170112659e-05,
"loss": 0.2577,
"step": 127
},
{
"epoch": 0.15841584158415842,
"grad_norm": 3.6894092559814453,
"learning_rate": 1.979445674307444e-05,
"loss": 0.2544,
"step": 128
},
{
"epoch": 0.15965346534653466,
"grad_norm": 5.288538455963135,
"learning_rate": 1.9785648886329974e-05,
"loss": 0.2452,
"step": 129
},
{
"epoch": 0.1608910891089109,
"grad_norm": 6.3318305015563965,
"learning_rate": 1.977665829536842e-05,
"loss": 0.2628,
"step": 130
},
{
"epoch": 0.16212871287128713,
"grad_norm": 5.06384801864624,
"learning_rate": 1.9767485138077327e-05,
"loss": 0.337,
"step": 131
},
{
"epoch": 0.16336633663366337,
"grad_norm": 3.954658269882202,
"learning_rate": 1.9758129585753433e-05,
"loss": 0.2729,
"step": 132
},
{
"epoch": 0.1646039603960396,
"grad_norm": 3.3781790733337402,
"learning_rate": 1.9748591813099457e-05,
"loss": 0.2204,
"step": 133
},
{
"epoch": 0.16584158415841585,
"grad_norm": 5.148495674133301,
"learning_rate": 1.9738871998220857e-05,
"loss": 0.2585,
"step": 134
},
{
"epoch": 0.1670792079207921,
"grad_norm": 4.203769207000732,
"learning_rate": 1.9728970322622485e-05,
"loss": 0.3102,
"step": 135
},
{
"epoch": 0.16831683168316833,
"grad_norm": 3.7691049575805664,
"learning_rate": 1.9718886971205206e-05,
"loss": 0.2592,
"step": 136
},
{
"epoch": 0.16955445544554457,
"grad_norm": 5.7634711265563965,
"learning_rate": 1.970862213226244e-05,
"loss": 0.2607,
"step": 137
},
{
"epoch": 0.1707920792079208,
"grad_norm": 4.632352828979492,
"learning_rate": 1.9698175997476657e-05,
"loss": 0.2914,
"step": 138
},
{
"epoch": 0.17202970297029702,
"grad_norm": 5.2901434898376465,
"learning_rate": 1.968754876191578e-05,
"loss": 0.2874,
"step": 139
},
{
"epoch": 0.17326732673267325,
"grad_norm": 3.2094457149505615,
"learning_rate": 1.9676740624029566e-05,
"loss": 0.2483,
"step": 140
},
{
"epoch": 0.17326732673267325,
"eval_accuracy": 0.8403547671840355,
"eval_f1": 0.6587677725118484,
"eval_loss": 0.35367104411125183,
"eval_precision": 0.8224852071005917,
"eval_recall": 0.549407114624506,
"eval_runtime": 49.1165,
"eval_samples_per_second": 5.619,
"eval_steps_per_second": 0.183,
"step": 140
},
{
"epoch": 0.1745049504950495,
"grad_norm": 3.4511711597442627,
"learning_rate": 1.9665751785645874e-05,
"loss": 0.2277,
"step": 141
},
{
"epoch": 0.17574257425742573,
"grad_norm": 3.3621718883514404,
"learning_rate": 1.9654582451966915e-05,
"loss": 0.2893,
"step": 142
},
{
"epoch": 0.17698019801980197,
"grad_norm": 4.829539775848389,
"learning_rate": 1.9643232831565417e-05,
"loss": 0.2127,
"step": 143
},
{
"epoch": 0.1782178217821782,
"grad_norm": 4.233989715576172,
"learning_rate": 1.9631703136380716e-05,
"loss": 0.2133,
"step": 144
},
{
"epoch": 0.17945544554455445,
"grad_norm": 9.943169593811035,
"learning_rate": 1.961999358171482e-05,
"loss": 0.442,
"step": 145
},
{
"epoch": 0.1806930693069307,
"grad_norm": 4.362405300140381,
"learning_rate": 1.960810438622838e-05,
"loss": 0.2677,
"step": 146
},
{
"epoch": 0.18193069306930693,
"grad_norm": 4.714008808135986,
"learning_rate": 1.959603577193659e-05,
"loss": 0.3213,
"step": 147
},
{
"epoch": 0.18316831683168316,
"grad_norm": 3.655679702758789,
"learning_rate": 1.9583787964205073e-05,
"loss": 0.199,
"step": 148
},
{
"epoch": 0.1844059405940594,
"grad_norm": 4.397619247436523,
"learning_rate": 1.9571361191745647e-05,
"loss": 0.2728,
"step": 149
},
{
"epoch": 0.18564356435643564,
"grad_norm": 4.055555820465088,
"learning_rate": 1.955875568661206e-05,
"loss": 0.2461,
"step": 150
},
{
"epoch": 0.18688118811881188,
"grad_norm": 4.366605281829834,
"learning_rate": 1.9545971684195664e-05,
"loss": 0.2026,
"step": 151
},
{
"epoch": 0.18811881188118812,
"grad_norm": 3.7074687480926514,
"learning_rate": 1.9533009423221014e-05,
"loss": 0.2817,
"step": 152
},
{
"epoch": 0.18935643564356436,
"grad_norm": 4.276401996612549,
"learning_rate": 1.951986914574141e-05,
"loss": 0.2661,
"step": 153
},
{
"epoch": 0.1905940594059406,
"grad_norm": 3.917130708694458,
"learning_rate": 1.9506551097134384e-05,
"loss": 0.3005,
"step": 154
},
{
"epoch": 0.19183168316831684,
"grad_norm": 6.731651306152344,
"learning_rate": 1.94930555260971e-05,
"loss": 0.2892,
"step": 155
},
{
"epoch": 0.19306930693069307,
"grad_norm": 4.87600564956665,
"learning_rate": 1.947938268464173e-05,
"loss": 0.1983,
"step": 156
},
{
"epoch": 0.1943069306930693,
"grad_norm": 4.437981605529785,
"learning_rate": 1.9465532828090735e-05,
"loss": 0.2479,
"step": 157
},
{
"epoch": 0.19554455445544555,
"grad_norm": 3.6721622943878174,
"learning_rate": 1.9451506215072106e-05,
"loss": 0.243,
"step": 158
},
{
"epoch": 0.1967821782178218,
"grad_norm": 3.8687756061553955,
"learning_rate": 1.943730310751453e-05,
"loss": 0.2619,
"step": 159
},
{
"epoch": 0.19801980198019803,
"grad_norm": 4.864063739776611,
"learning_rate": 1.9422923770642494e-05,
"loss": 0.2803,
"step": 160
},
{
"epoch": 0.19801980198019803,
"eval_accuracy": 0.8414634146341463,
"eval_f1": 0.6520681265206812,
"eval_loss": 0.34682103991508484,
"eval_precision": 0.8481012658227848,
"eval_recall": 0.5296442687747036,
"eval_runtime": 49.8936,
"eval_samples_per_second": 5.532,
"eval_steps_per_second": 0.18,
"step": 160
},
{
"epoch": 0.19925742574257427,
"grad_norm": 3.036126136779785,
"learning_rate": 1.9408368472971344e-05,
"loss": 0.2777,
"step": 161
},
{
"epoch": 0.2004950495049505,
"grad_norm": 3.19771409034729,
"learning_rate": 1.9393637486302257e-05,
"loss": 0.2741,
"step": 162
},
{
"epoch": 0.20173267326732675,
"grad_norm": 4.557991027832031,
"learning_rate": 1.937873108571718e-05,
"loss": 0.2677,
"step": 163
},
{
"epoch": 0.20297029702970298,
"grad_norm": 4.806491374969482,
"learning_rate": 1.936364954957368e-05,
"loss": 0.2728,
"step": 164
},
{
"epoch": 0.2042079207920792,
"grad_norm": 5.901110649108887,
"learning_rate": 1.934839315949976e-05,
"loss": 0.2406,
"step": 165
},
{
"epoch": 0.20544554455445543,
"grad_norm": 3.7812883853912354,
"learning_rate": 1.933296220038858e-05,
"loss": 0.2857,
"step": 166
},
{
"epoch": 0.20668316831683167,
"grad_norm": 4.161533832550049,
"learning_rate": 1.9317356960393158e-05,
"loss": 0.2132,
"step": 167
},
{
"epoch": 0.2079207920792079,
"grad_norm": 3.8676390647888184,
"learning_rate": 1.9301577730920975e-05,
"loss": 0.2486,
"step": 168
},
{
"epoch": 0.20915841584158415,
"grad_norm": 4.488946437835693,
"learning_rate": 1.9285624806628543e-05,
"loss": 0.2859,
"step": 169
},
{
"epoch": 0.2103960396039604,
"grad_norm": 3.541072130203247,
"learning_rate": 1.9269498485415897e-05,
"loss": 0.2522,
"step": 170
},
{
"epoch": 0.21163366336633663,
"grad_norm": 3.683732509613037,
"learning_rate": 1.925319906842103e-05,
"loss": 0.223,
"step": 171
},
{
"epoch": 0.21287128712871287,
"grad_norm": 3.875123977661133,
"learning_rate": 1.923672686001427e-05,
"loss": 0.2906,
"step": 172
},
{
"epoch": 0.2141089108910891,
"grad_norm": 4.992143630981445,
"learning_rate": 1.922008216779261e-05,
"loss": 0.2183,
"step": 173
},
{
"epoch": 0.21534653465346534,
"grad_norm": 5.165887355804443,
"learning_rate": 1.920326530257394e-05,
"loss": 0.2291,
"step": 174
},
{
"epoch": 0.21658415841584158,
"grad_norm": 3.6516168117523193,
"learning_rate": 1.9186276578391268e-05,
"loss": 0.2092,
"step": 175
},
{
"epoch": 0.21782178217821782,
"grad_norm": 3.7098777294158936,
"learning_rate": 1.9169116312486835e-05,
"loss": 0.2635,
"step": 176
},
{
"epoch": 0.21905940594059406,
"grad_norm": 6.8240180015563965,
"learning_rate": 1.9151784825306205e-05,
"loss": 0.2545,
"step": 177
},
{
"epoch": 0.2202970297029703,
"grad_norm": 4.409351348876953,
"learning_rate": 1.9134282440492272e-05,
"loss": 0.2505,
"step": 178
},
{
"epoch": 0.22153465346534654,
"grad_norm": 3.2560315132141113,
"learning_rate": 1.911660948487922e-05,
"loss": 0.2857,
"step": 179
},
{
"epoch": 0.22277227722772278,
"grad_norm": 5.461050987243652,
"learning_rate": 1.9098766288486426e-05,
"loss": 0.2782,
"step": 180
},
{
"epoch": 0.22277227722772278,
"eval_accuracy": 0.8237250554323725,
"eval_f1": 0.5974683544303797,
"eval_loss": 0.34932276606559753,
"eval_precision": 0.8309859154929577,
"eval_recall": 0.466403162055336,
"eval_runtime": 49.2509,
"eval_samples_per_second": 5.604,
"eval_steps_per_second": 0.183,
"step": 180
},
{
"epoch": 0.22400990099009901,
"grad_norm": 3.929197072982788,
"learning_rate": 1.9080753184512284e-05,
"loss": 0.2682,
"step": 181
},
{
"epoch": 0.22524752475247525,
"grad_norm": 4.4159393310546875,
"learning_rate": 1.9062570509327993e-05,
"loss": 0.2503,
"step": 182
},
{
"epoch": 0.2264851485148515,
"grad_norm": 5.622183799743652,
"learning_rate": 1.9044218602471275e-05,
"loss": 0.3253,
"step": 183
},
{
"epoch": 0.22772277227722773,
"grad_norm": 3.281792402267456,
"learning_rate": 1.9025697806640035e-05,
"loss": 0.2018,
"step": 184
},
{
"epoch": 0.22896039603960397,
"grad_norm": 3.431208372116089,
"learning_rate": 1.9007008467685947e-05,
"loss": 0.2012,
"step": 185
},
{
"epoch": 0.2301980198019802,
"grad_norm": 5.277952671051025,
"learning_rate": 1.8988150934608014e-05,
"loss": 0.2031,
"step": 186
},
{
"epoch": 0.23143564356435645,
"grad_norm": 4.322801113128662,
"learning_rate": 1.8969125559546054e-05,
"loss": 0.2626,
"step": 187
},
{
"epoch": 0.23267326732673269,
"grad_norm": 4.021146297454834,
"learning_rate": 1.894993269777411e-05,
"loss": 0.2343,
"step": 188
},
{
"epoch": 0.23391089108910892,
"grad_norm": 3.045038938522339,
"learning_rate": 1.893057270769381e-05,
"loss": 0.1718,
"step": 189
},
{
"epoch": 0.23514851485148514,
"grad_norm": 4.587369441986084,
"learning_rate": 1.8911045950827693e-05,
"loss": 0.2377,
"step": 190
},
{
"epoch": 0.23638613861386137,
"grad_norm": 5.442078590393066,
"learning_rate": 1.8891352791812452e-05,
"loss": 0.2796,
"step": 191
},
{
"epoch": 0.2376237623762376,
"grad_norm": 6.258726596832275,
"learning_rate": 1.8871493598392122e-05,
"loss": 0.2856,
"step": 192
},
{
"epoch": 0.23886138613861385,
"grad_norm": 6.618675231933594,
"learning_rate": 1.885146874141121e-05,
"loss": 0.256,
"step": 193
},
{
"epoch": 0.2400990099009901,
"grad_norm": 4.947834491729736,
"learning_rate": 1.8831278594807783e-05,
"loss": 0.2452,
"step": 194
},
{
"epoch": 0.24133663366336633,
"grad_norm": 3.6348724365234375,
"learning_rate": 1.881092353560646e-05,
"loss": 0.2141,
"step": 195
},
{
"epoch": 0.24257425742574257,
"grad_norm": 7.256039619445801,
"learning_rate": 1.8790403943911403e-05,
"loss": 0.2617,
"step": 196
},
{
"epoch": 0.2438118811881188,
"grad_norm": 4.058467864990234,
"learning_rate": 1.8769720202899196e-05,
"loss": 0.2119,
"step": 197
},
{
"epoch": 0.24504950495049505,
"grad_norm": 8.09382438659668,
"learning_rate": 1.8748872698811695e-05,
"loss": 0.2156,
"step": 198
},
{
"epoch": 0.24628712871287128,
"grad_norm": 5.703820705413818,
"learning_rate": 1.872786182094882e-05,
"loss": 0.1883,
"step": 199
},
{
"epoch": 0.24752475247524752,
"grad_norm": 6.104684352874756,
"learning_rate": 1.870668796166129e-05,
"loss": 0.2174,
"step": 200
},
{
"epoch": 0.24752475247524752,
"eval_accuracy": 0.8492239467849224,
"eval_f1": 0.6866359447004609,
"eval_loss": 0.33290114998817444,
"eval_precision": 0.8232044198895028,
"eval_recall": 0.5889328063241107,
"eval_runtime": 48.1855,
"eval_samples_per_second": 5.728,
"eval_steps_per_second": 0.187,
"step": 200
},
{
"epoch": 0.24876237623762376,
"grad_norm": 8.231904983520508,
"learning_rate": 1.8685351516343277e-05,
"loss": 0.2536,
"step": 201
},
{
"epoch": 0.25,
"grad_norm": 4.1948041915893555,
"learning_rate": 1.8663852883425045e-05,
"loss": 0.295,
"step": 202
},
{
"epoch": 0.25123762376237624,
"grad_norm": 5.275994777679443,
"learning_rate": 1.86421924643655e-05,
"loss": 0.2739,
"step": 203
},
{
"epoch": 0.2524752475247525,
"grad_norm": 4.482490062713623,
"learning_rate": 1.8620370663644676e-05,
"loss": 0.1973,
"step": 204
},
{
"epoch": 0.2537128712871287,
"grad_norm": 3.5862739086151123,
"learning_rate": 1.8598387888756224e-05,
"loss": 0.2146,
"step": 205
},
{
"epoch": 0.25495049504950495,
"grad_norm": 3.1812057495117188,
"learning_rate": 1.857624455019976e-05,
"loss": 0.2104,
"step": 206
},
{
"epoch": 0.2561881188118812,
"grad_norm": 6.783030986785889,
"learning_rate": 1.855394106147322e-05,
"loss": 0.2979,
"step": 207
},
{
"epoch": 0.25742574257425743,
"grad_norm": 6.715686321258545,
"learning_rate": 1.853147783906514e-05,
"loss": 0.2952,
"step": 208
},
{
"epoch": 0.25866336633663367,
"grad_norm": 5.6060028076171875,
"learning_rate": 1.8508855302446868e-05,
"loss": 0.2323,
"step": 209
},
{
"epoch": 0.2599009900990099,
"grad_norm": 4.204987049102783,
"learning_rate": 1.8486073874064745e-05,
"loss": 0.216,
"step": 210
},
{
"epoch": 0.26113861386138615,
"grad_norm": 5.449676036834717,
"learning_rate": 1.84631339793322e-05,
"loss": 0.1925,
"step": 211
},
{
"epoch": 0.2623762376237624,
"grad_norm": 4.839028835296631,
"learning_rate": 1.8440036046621816e-05,
"loss": 0.2531,
"step": 212
},
{
"epoch": 0.2636138613861386,
"grad_norm": 5.816053867340088,
"learning_rate": 1.8416780507257334e-05,
"loss": 0.2866,
"step": 213
},
{
"epoch": 0.26485148514851486,
"grad_norm": 4.106687545776367,
"learning_rate": 1.8393367795505587e-05,
"loss": 0.1797,
"step": 214
},
{
"epoch": 0.2660891089108911,
"grad_norm": 3.8408498764038086,
"learning_rate": 1.8369798348568403e-05,
"loss": 0.2328,
"step": 215
},
{
"epoch": 0.26732673267326734,
"grad_norm": 3.9387855529785156,
"learning_rate": 1.834607260657443e-05,
"loss": 0.2067,
"step": 216
},
{
"epoch": 0.2685643564356436,
"grad_norm": 3.855027198791504,
"learning_rate": 1.832219101257092e-05,
"loss": 0.2408,
"step": 217
},
{
"epoch": 0.2698019801980198,
"grad_norm": 5.5736494064331055,
"learning_rate": 1.829815401251547e-05,
"loss": 0.2225,
"step": 218
},
{
"epoch": 0.27103960396039606,
"grad_norm": 5.179149150848389,
"learning_rate": 1.8273962055267667e-05,
"loss": 0.2575,
"step": 219
},
{
"epoch": 0.2722772277227723,
"grad_norm": 8.503008842468262,
"learning_rate": 1.8249615592580733e-05,
"loss": 0.2965,
"step": 220
},
{
"epoch": 0.2722772277227723,
"eval_accuracy": 0.844789356984479,
"eval_f1": 0.6682464454976303,
"eval_loss": 0.3314219117164612,
"eval_precision": 0.834319526627219,
"eval_recall": 0.5573122529644269,
"eval_runtime": 48.2502,
"eval_samples_per_second": 5.72,
"eval_steps_per_second": 0.187,
"step": 220
},
{
"epoch": 0.27351485148514854,
"grad_norm": 4.444825172424316,
"learning_rate": 1.822511507909307e-05,
"loss": 0.1907,
"step": 221
},
{
"epoch": 0.2747524752475248,
"grad_norm": 6.425011157989502,
"learning_rate": 1.8200460972319786e-05,
"loss": 0.2938,
"step": 222
},
{
"epoch": 0.275990099009901,
"grad_norm": 3.5462961196899414,
"learning_rate": 1.817565373264413e-05,
"loss": 0.2045,
"step": 223
},
{
"epoch": 0.27722772277227725,
"grad_norm": 5.254908084869385,
"learning_rate": 1.8150693823308913e-05,
"loss": 0.1644,
"step": 224
},
{
"epoch": 0.2784653465346535,
"grad_norm": 4.101227283477783,
"learning_rate": 1.8125581710407864e-05,
"loss": 0.1875,
"step": 225
},
{
"epoch": 0.27970297029702973,
"grad_norm": 3.370792865753174,
"learning_rate": 1.8100317862876902e-05,
"loss": 0.1715,
"step": 226
},
{
"epoch": 0.28094059405940597,
"grad_norm": 4.758403778076172,
"learning_rate": 1.8074902752485392e-05,
"loss": 0.2956,
"step": 227
},
{
"epoch": 0.28217821782178215,
"grad_norm": 5.75641393661499,
"learning_rate": 1.8049336853827343e-05,
"loss": 0.2601,
"step": 228
},
{
"epoch": 0.2834158415841584,
"grad_norm": 3.241687059402466,
"learning_rate": 1.8023620644312538e-05,
"loss": 0.2022,
"step": 229
},
{
"epoch": 0.28465346534653463,
"grad_norm": 4.085322856903076,
"learning_rate": 1.7997754604157607e-05,
"loss": 0.2132,
"step": 230
},
{
"epoch": 0.28589108910891087,
"grad_norm": 5.415487766265869,
"learning_rate": 1.797173921637709e-05,
"loss": 0.1825,
"step": 231
},
{
"epoch": 0.2871287128712871,
"grad_norm": 4.1402907371521,
"learning_rate": 1.794557496677438e-05,
"loss": 0.2029,
"step": 232
},
{
"epoch": 0.28836633663366334,
"grad_norm": 4.597172737121582,
"learning_rate": 1.791926234393268e-05,
"loss": 0.1929,
"step": 233
},
{
"epoch": 0.2896039603960396,
"grad_norm": 6.450316905975342,
"learning_rate": 1.7892801839205867e-05,
"loss": 0.2061,
"step": 234
},
{
"epoch": 0.2908415841584158,
"grad_norm": 4.549274444580078,
"learning_rate": 1.786619394670933e-05,
"loss": 0.2288,
"step": 235
},
{
"epoch": 0.29207920792079206,
"grad_norm": 8.562817573547363,
"learning_rate": 1.7839439163310714e-05,
"loss": 0.2791,
"step": 236
},
{
"epoch": 0.2933168316831683,
"grad_norm": 4.906472206115723,
"learning_rate": 1.7812537988620678e-05,
"loss": 0.2505,
"step": 237
},
{
"epoch": 0.29455445544554454,
"grad_norm": 4.514908790588379,
"learning_rate": 1.7785490924983526e-05,
"loss": 0.2033,
"step": 238
},
{
"epoch": 0.2957920792079208,
"grad_norm": 5.586214065551758,
"learning_rate": 1.7758298477467865e-05,
"loss": 0.1828,
"step": 239
},
{
"epoch": 0.297029702970297,
"grad_norm": 11.2735595703125,
"learning_rate": 1.7730961153857155e-05,
"loss": 0.2379,
"step": 240
},
{
"epoch": 0.297029702970297,
"eval_accuracy": 0.8148558758314856,
"eval_f1": 0.5570291777188329,
"eval_loss": 0.3735515773296356,
"eval_precision": 0.8467741935483871,
"eval_recall": 0.4150197628458498,
"eval_runtime": 48.8987,
"eval_samples_per_second": 5.644,
"eval_steps_per_second": 0.184,
"step": 240
},
{
"epoch": 0.29826732673267325,
"grad_norm": 6.920717239379883,
"learning_rate": 1.7703479464640218e-05,
"loss": 0.2756,
"step": 241
},
{
"epoch": 0.2995049504950495,
"grad_norm": 5.730903625488281,
"learning_rate": 1.767585392300172e-05,
"loss": 0.1745,
"step": 242
},
{
"epoch": 0.30074257425742573,
"grad_norm": 4.035462856292725,
"learning_rate": 1.764808504481259e-05,
"loss": 0.1666,
"step": 243
},
{
"epoch": 0.30198019801980197,
"grad_norm": 4.878346920013428,
"learning_rate": 1.7620173348620368e-05,
"loss": 0.2491,
"step": 244
},
{
"epoch": 0.3032178217821782,
"grad_norm": 3.8003768920898438,
"learning_rate": 1.7592119355639545e-05,
"loss": 0.2041,
"step": 245
},
{
"epoch": 0.30445544554455445,
"grad_norm": 6.53809118270874,
"learning_rate": 1.7563923589741806e-05,
"loss": 0.2415,
"step": 246
},
{
"epoch": 0.3056930693069307,
"grad_norm": 3.5466408729553223,
"learning_rate": 1.7535586577446274e-05,
"loss": 0.1963,
"step": 247
},
{
"epoch": 0.3069306930693069,
"grad_norm": 4.167630195617676,
"learning_rate": 1.7507108847909656e-05,
"loss": 0.2261,
"step": 248
},
{
"epoch": 0.30816831683168316,
"grad_norm": 4.359383583068848,
"learning_rate": 1.7478490932916374e-05,
"loss": 0.1888,
"step": 249
},
{
"epoch": 0.3094059405940594,
"grad_norm": 8.298726081848145,
"learning_rate": 1.744973336686862e-05,
"loss": 0.2532,
"step": 250
},
{
"epoch": 0.31064356435643564,
"grad_norm": 5.459946632385254,
"learning_rate": 1.74208366867764e-05,
"loss": 0.2579,
"step": 251
},
{
"epoch": 0.3118811881188119,
"grad_norm": 4.424745559692383,
"learning_rate": 1.7391801432247487e-05,
"loss": 0.2071,
"step": 252
},
{
"epoch": 0.3131188118811881,
"grad_norm": 4.382404804229736,
"learning_rate": 1.7362628145477355e-05,
"loss": 0.2773,
"step": 253
},
{
"epoch": 0.31435643564356436,
"grad_norm": 3.338047742843628,
"learning_rate": 1.7333317371239046e-05,
"loss": 0.2231,
"step": 254
},
{
"epoch": 0.3155940594059406,
"grad_norm": 3.33626389503479,
"learning_rate": 1.7303869656872994e-05,
"loss": 0.2046,
"step": 255
},
{
"epoch": 0.31683168316831684,
"grad_norm": 3.3837637901306152,
"learning_rate": 1.727428555227683e-05,
"loss": 0.1503,
"step": 256
},
{
"epoch": 0.3180693069306931,
"grad_norm": 3.3898661136627197,
"learning_rate": 1.7244565609895074e-05,
"loss": 0.1641,
"step": 257
},
{
"epoch": 0.3193069306930693,
"grad_norm": 5.7499918937683105,
"learning_rate": 1.721471038470885e-05,
"loss": 0.268,
"step": 258
},
{
"epoch": 0.32054455445544555,
"grad_norm": 4.935744762420654,
"learning_rate": 1.7184720434225518e-05,
"loss": 0.2289,
"step": 259
},
{
"epoch": 0.3217821782178218,
"grad_norm": 3.377199411392212,
"learning_rate": 1.715459631846824e-05,
"loss": 0.1587,
"step": 260
},
{
"epoch": 0.3217821782178218,
"eval_accuracy": 0.8403547671840355,
"eval_f1": 0.6435643564356436,
"eval_loss": 0.33147069811820984,
"eval_precision": 0.8609271523178808,
"eval_recall": 0.5138339920948617,
"eval_runtime": 47.9408,
"eval_samples_per_second": 5.757,
"eval_steps_per_second": 0.188,
"step": 260
},
{
"epoch": 0.32301980198019803,
"grad_norm": 3.6289126873016357,
"learning_rate": 1.712433859996555e-05,
"loss": 0.2245,
"step": 261
},
{
"epoch": 0.32425742574257427,
"grad_norm": 3.2000958919525146,
"learning_rate": 1.7093947843740843e-05,
"loss": 0.2251,
"step": 262
},
{
"epoch": 0.3254950495049505,
"grad_norm": 4.913848400115967,
"learning_rate": 1.706342461730181e-05,
"loss": 0.1782,
"step": 263
},
{
"epoch": 0.32673267326732675,
"grad_norm": 5.196519374847412,
"learning_rate": 1.703276949062985e-05,
"loss": 0.2259,
"step": 264
},
{
"epoch": 0.327970297029703,
"grad_norm": 5.136421203613281,
"learning_rate": 1.700198303616944e-05,
"loss": 0.2132,
"step": 265
},
{
"epoch": 0.3292079207920792,
"grad_norm": 4.810065746307373,
"learning_rate": 1.6971065828817424e-05,
"loss": 0.1883,
"step": 266
},
{
"epoch": 0.33044554455445546,
"grad_norm": 4.666658878326416,
"learning_rate": 1.6940018445912275e-05,
"loss": 0.2087,
"step": 267
},
{
"epoch": 0.3316831683168317,
"grad_norm": 4.813300132751465,
"learning_rate": 1.690884146722334e-05,
"loss": 0.2631,
"step": 268
},
{
"epoch": 0.33292079207920794,
"grad_norm": 9.478407859802246,
"learning_rate": 1.687753547493999e-05,
"loss": 0.2618,
"step": 269
},
{
"epoch": 0.3341584158415842,
"grad_norm": 4.668523788452148,
"learning_rate": 1.684610105366076e-05,
"loss": 0.2025,
"step": 270
},
{
"epoch": 0.3353960396039604,
"grad_norm": 10.10991096496582,
"learning_rate": 1.6814538790382432e-05,
"loss": 0.2893,
"step": 271
},
{
"epoch": 0.33663366336633666,
"grad_norm": 6.124392509460449,
"learning_rate": 1.6782849274489055e-05,
"loss": 0.2382,
"step": 272
},
{
"epoch": 0.3378712871287129,
"grad_norm": 4.633864402770996,
"learning_rate": 1.6751033097740978e-05,
"loss": 0.1991,
"step": 273
},
{
"epoch": 0.33910891089108913,
"grad_norm": 4.003640174865723,
"learning_rate": 1.6719090854263752e-05,
"loss": 0.1811,
"step": 274
},
{
"epoch": 0.34034653465346537,
"grad_norm": 5.303317070007324,
"learning_rate": 1.6687023140537082e-05,
"loss": 0.3266,
"step": 275
},
{
"epoch": 0.3415841584158416,
"grad_norm": 4.467435836791992,
"learning_rate": 1.6654830555383648e-05,
"loss": 0.2174,
"step": 276
},
{
"epoch": 0.34282178217821785,
"grad_norm": 4.210566520690918,
"learning_rate": 1.662251369995795e-05,
"loss": 0.1746,
"step": 277
},
{
"epoch": 0.34405940594059403,
"grad_norm": 3.8887202739715576,
"learning_rate": 1.6590073177735066e-05,
"loss": 0.19,
"step": 278
},
{
"epoch": 0.34529702970297027,
"grad_norm": 4.348226547241211,
"learning_rate": 1.6557509594499405e-05,
"loss": 0.229,
"step": 279
},
{
"epoch": 0.3465346534653465,
"grad_norm": 3.498028039932251,
"learning_rate": 1.6524823558333362e-05,
"loss": 0.1769,
"step": 280
},
{
"epoch": 0.3465346534653465,
"eval_accuracy": 0.8370288248337029,
"eval_f1": 0.6440677966101694,
"eval_loss": 0.33291730284690857,
"eval_precision": 0.83125,
"eval_recall": 0.525691699604743,
"eval_runtime": 48.169,
"eval_samples_per_second": 5.73,
"eval_steps_per_second": 0.187,
"step": 280
},
{
"epoch": 0.34777227722772275,
"grad_norm": 5.36956262588501,
"learning_rate": 1.6492015679605994e-05,
"loss": 0.2361,
"step": 281
},
{
"epoch": 0.349009900990099,
"grad_norm": 5.6981401443481445,
"learning_rate": 1.6459086570961594e-05,
"loss": 0.1696,
"step": 282
},
{
"epoch": 0.3502475247524752,
"grad_norm": 5.104677677154541,
"learning_rate": 1.6426036847308287e-05,
"loss": 0.2587,
"step": 283
},
{
"epoch": 0.35148514851485146,
"grad_norm": 4.432884216308594,
"learning_rate": 1.6392867125806504e-05,
"loss": 0.2231,
"step": 284
},
{
"epoch": 0.3527227722772277,
"grad_norm": 8.529413223266602,
"learning_rate": 1.6359578025857495e-05,
"loss": 0.3018,
"step": 285
},
{
"epoch": 0.35396039603960394,
"grad_norm": 3.8591082096099854,
"learning_rate": 1.6326170169091735e-05,
"loss": 0.2339,
"step": 286
},
{
"epoch": 0.3551980198019802,
"grad_norm": 2.877532482147217,
"learning_rate": 1.6292644179357337e-05,
"loss": 0.168,
"step": 287
},
{
"epoch": 0.3564356435643564,
"grad_norm": 4.591522693634033,
"learning_rate": 1.6259000682708384e-05,
"loss": 0.1687,
"step": 288
},
{
"epoch": 0.35767326732673266,
"grad_norm": 4.433895111083984,
"learning_rate": 1.622524030739326e-05,
"loss": 0.2028,
"step": 289
},
{
"epoch": 0.3589108910891089,
"grad_norm": 5.059347629547119,
"learning_rate": 1.6191363683842883e-05,
"loss": 0.2286,
"step": 290
},
{
"epoch": 0.36014851485148514,
"grad_norm": 3.7166552543640137,
"learning_rate": 1.615737144465898e-05,
"loss": 0.1848,
"step": 291
},
{
"epoch": 0.3613861386138614,
"grad_norm": 4.245189189910889,
"learning_rate": 1.6123264224602245e-05,
"loss": 0.2474,
"step": 292
},
{
"epoch": 0.3626237623762376,
"grad_norm": 6.487268447875977,
"learning_rate": 1.608904266058047e-05,
"loss": 0.2036,
"step": 293
},
{
"epoch": 0.36386138613861385,
"grad_norm": 3.333557605743408,
"learning_rate": 1.605470739163669e-05,
"loss": 0.1859,
"step": 294
},
{
"epoch": 0.3650990099009901,
"grad_norm": 3.7823169231414795,
"learning_rate": 1.6020259058937228e-05,
"loss": 0.1713,
"step": 295
},
{
"epoch": 0.36633663366336633,
"grad_norm": 3.356194496154785,
"learning_rate": 1.5985698305759713e-05,
"loss": 0.1774,
"step": 296
},
{
"epoch": 0.36757425742574257,
"grad_norm": 4.501846790313721,
"learning_rate": 1.59510257774811e-05,
"loss": 0.1836,
"step": 297
},
{
"epoch": 0.3688118811881188,
"grad_norm": 9.250550270080566,
"learning_rate": 1.591624212156558e-05,
"loss": 0.3101,
"step": 298
},
{
"epoch": 0.37004950495049505,
"grad_norm": 3.5429160594940186,
"learning_rate": 1.5881347987552517e-05,
"loss": 0.1918,
"step": 299
},
{
"epoch": 0.3712871287128713,
"grad_norm": 6.185944080352783,
"learning_rate": 1.5846344027044307e-05,
"loss": 0.1984,
"step": 300
},
{
"epoch": 0.3712871287128713,
"eval_accuracy": 0.8536585365853658,
"eval_f1": 0.6826923076923077,
"eval_loss": 0.3210514485836029,
"eval_precision": 0.8711656441717791,
"eval_recall": 0.5612648221343873,
"eval_runtime": 48.5015,
"eval_samples_per_second": 5.691,
"eval_steps_per_second": 0.186,
"step": 300
},
{
"epoch": 0.3725247524752475,
"grad_norm": 6.07203483581543,
"learning_rate": 1.5811230893694214e-05,
"loss": 0.2375,
"step": 301
},
{
"epoch": 0.37376237623762376,
"grad_norm": 5.992534160614014,
"learning_rate": 1.5776009243194158e-05,
"loss": 0.2989,
"step": 302
},
{
"epoch": 0.375,
"grad_norm": 7.230125427246094,
"learning_rate": 1.574067973326248e-05,
"loss": 0.2541,
"step": 303
},
{
"epoch": 0.37623762376237624,
"grad_norm": 5.320266246795654,
"learning_rate": 1.570524302363165e-05,
"loss": 0.2273,
"step": 304
},
{
"epoch": 0.3774752475247525,
"grad_norm": 8.311070442199707,
"learning_rate": 1.5669699776035958e-05,
"loss": 0.1916,
"step": 305
},
{
"epoch": 0.3787128712871287,
"grad_norm": 7.01260232925415,
"learning_rate": 1.5634050654199147e-05,
"loss": 0.2083,
"step": 306
},
{
"epoch": 0.37995049504950495,
"grad_norm": 6.13078498840332,
"learning_rate": 1.5598296323822026e-05,
"loss": 0.1955,
"step": 307
},
{
"epoch": 0.3811881188118812,
"grad_norm": 5.786803722381592,
"learning_rate": 1.556243745257003e-05,
"loss": 0.2323,
"step": 308
},
{
"epoch": 0.38242574257425743,
"grad_norm": 8.768049240112305,
"learning_rate": 1.5526474710060767e-05,
"loss": 0.2716,
"step": 309
},
{
"epoch": 0.38366336633663367,
"grad_norm": 5.660022735595703,
"learning_rate": 1.5490408767851506e-05,
"loss": 0.2747,
"step": 310
},
{
"epoch": 0.3849009900990099,
"grad_norm": 3.7780046463012695,
"learning_rate": 1.5454240299426626e-05,
"loss": 0.1875,
"step": 311
},
{
"epoch": 0.38613861386138615,
"grad_norm": 3.4342143535614014,
"learning_rate": 1.5417969980185055e-05,
"loss": 0.1794,
"step": 312
},
{
"epoch": 0.3873762376237624,
"grad_norm": 5.58084774017334,
"learning_rate": 1.538159848742765e-05,
"loss": 0.2996,
"step": 313
},
{
"epoch": 0.3886138613861386,
"grad_norm": 4.530074119567871,
"learning_rate": 1.5345126500344555e-05,
"loss": 0.1944,
"step": 314
},
{
"epoch": 0.38985148514851486,
"grad_norm": 6.155841827392578,
"learning_rate": 1.530855470000251e-05,
"loss": 0.2508,
"step": 315
},
{
"epoch": 0.3910891089108911,
"grad_norm": 7.240228652954102,
"learning_rate": 1.5271883769332144e-05,
"loss": 0.2912,
"step": 316
},
{
"epoch": 0.39232673267326734,
"grad_norm": 4.011997699737549,
"learning_rate": 1.5235114393115202e-05,
"loss": 0.215,
"step": 317
},
{
"epoch": 0.3935643564356436,
"grad_norm": 2.886672258377075,
"learning_rate": 1.5198247257971788e-05,
"loss": 0.1769,
"step": 318
},
{
"epoch": 0.3948019801980198,
"grad_norm": 3.5629324913024902,
"learning_rate": 1.5161283052347516e-05,
"loss": 0.2009,
"step": 319
},
{
"epoch": 0.39603960396039606,
"grad_norm": 3.8855140209198,
"learning_rate": 1.5124222466500665e-05,
"loss": 0.2109,
"step": 320
},
{
"epoch": 0.39603960396039606,
"eval_accuracy": 0.8569844789356984,
"eval_f1": 0.7061503416856492,
"eval_loss": 0.3063695430755615,
"eval_precision": 0.8333333333333334,
"eval_recall": 0.6126482213438735,
"eval_runtime": 50.0323,
"eval_samples_per_second": 5.516,
"eval_steps_per_second": 0.18,
"step": 320
},
{
"epoch": 0.3972772277227723,
"grad_norm": 3.4308321475982666,
"learning_rate": 1.5087066192489288e-05,
"loss": 0.1939,
"step": 321
},
{
"epoch": 0.39851485148514854,
"grad_norm": 3.8520755767822266,
"learning_rate": 1.5049814924158298e-05,
"loss": 0.1707,
"step": 322
},
{
"epoch": 0.3997524752475248,
"grad_norm": 5.031585693359375,
"learning_rate": 1.5012469357126496e-05,
"loss": 0.2002,
"step": 323
},
{
"epoch": 0.400990099009901,
"grad_norm": 3.010558843612671,
"learning_rate": 1.4975030188773585e-05,
"loss": 0.1784,
"step": 324
},
{
"epoch": 0.40222772277227725,
"grad_norm": 6.272927761077881,
"learning_rate": 1.4937498118227156e-05,
"loss": 0.2514,
"step": 325
},
{
"epoch": 0.4034653465346535,
"grad_norm": 3.995131254196167,
"learning_rate": 1.4899873846349626e-05,
"loss": 0.1942,
"step": 326
},
{
"epoch": 0.40470297029702973,
"grad_norm": 4.409802436828613,
"learning_rate": 1.486215807572515e-05,
"loss": 0.1932,
"step": 327
},
{
"epoch": 0.40594059405940597,
"grad_norm": 3.927633285522461,
"learning_rate": 1.4824351510646508e-05,
"loss": 0.2129,
"step": 328
},
{
"epoch": 0.40717821782178215,
"grad_norm": 4.012439250946045,
"learning_rate": 1.478645485710194e-05,
"loss": 0.2854,
"step": 329
},
{
"epoch": 0.4084158415841584,
"grad_norm": 4.1401848793029785,
"learning_rate": 1.4748468822761974e-05,
"loss": 0.2454,
"step": 330
},
{
"epoch": 0.40965346534653463,
"grad_norm": 4.657931804656982,
"learning_rate": 1.4710394116966206e-05,
"loss": 0.2288,
"step": 331
},
{
"epoch": 0.41089108910891087,
"grad_norm": 6.472193717956543,
"learning_rate": 1.4672231450710066e-05,
"loss": 0.26,
"step": 332
},
{
"epoch": 0.4121287128712871,
"grad_norm": 4.930477142333984,
"learning_rate": 1.4633981536631514e-05,
"loss": 0.25,
"step": 333
},
{
"epoch": 0.41336633663366334,
"grad_norm": 6.003932476043701,
"learning_rate": 1.4595645088997758e-05,
"loss": 0.2209,
"step": 334
},
{
"epoch": 0.4146039603960396,
"grad_norm": 4.356766700744629,
"learning_rate": 1.4557222823691913e-05,
"loss": 0.2051,
"step": 335
},
{
"epoch": 0.4158415841584158,
"grad_norm": 4.463465213775635,
"learning_rate": 1.451871545819961e-05,
"loss": 0.206,
"step": 336
},
{
"epoch": 0.41707920792079206,
"grad_norm": 4.003593444824219,
"learning_rate": 1.4480123711595637e-05,
"loss": 0.232,
"step": 337
},
{
"epoch": 0.4183168316831683,
"grad_norm": 4.910566806793213,
"learning_rate": 1.4441448304530467e-05,
"loss": 0.2214,
"step": 338
},
{
"epoch": 0.41955445544554454,
"grad_norm": 5.432886123657227,
"learning_rate": 1.4402689959216845e-05,
"loss": 0.2621,
"step": 339
},
{
"epoch": 0.4207920792079208,
"grad_norm": 3.290602922439575,
"learning_rate": 1.4363849399416254e-05,
"loss": 0.1961,
"step": 340
},
{
"epoch": 0.4207920792079208,
"eval_accuracy": 0.8625277161862528,
"eval_f1": 0.7194570135746606,
"eval_loss": 0.30351048707962036,
"eval_precision": 0.8412698412698413,
"eval_recall": 0.6284584980237155,
"eval_runtime": 47.8239,
"eval_samples_per_second": 5.771,
"eval_steps_per_second": 0.188,
"step": 340
},
{
"epoch": 0.422029702970297,
"grad_norm": 4.072077751159668,
"learning_rate": 1.4324927350425451e-05,
"loss": 0.2061,
"step": 341
},
{
"epoch": 0.42326732673267325,
"grad_norm": 5.433284282684326,
"learning_rate": 1.4285924539062878e-05,
"loss": 0.2536,
"step": 342
},
{
"epoch": 0.4245049504950495,
"grad_norm": 4.794254779815674,
"learning_rate": 1.424684169365512e-05,
"loss": 0.2207,
"step": 343
},
{
"epoch": 0.42574257425742573,
"grad_norm": 4.268115043640137,
"learning_rate": 1.4207679544023289e-05,
"loss": 0.172,
"step": 344
},
{
"epoch": 0.42698019801980197,
"grad_norm": 3.946532964706421,
"learning_rate": 1.4168438821469402e-05,
"loss": 0.2167,
"step": 345
},
{
"epoch": 0.4282178217821782,
"grad_norm": 4.425963401794434,
"learning_rate": 1.4129120258762719e-05,
"loss": 0.1608,
"step": 346
},
{
"epoch": 0.42945544554455445,
"grad_norm": 3.5604379177093506,
"learning_rate": 1.4089724590126061e-05,
"loss": 0.1506,
"step": 347
},
{
"epoch": 0.4306930693069307,
"grad_norm": 3.63785719871521,
"learning_rate": 1.4050252551222115e-05,
"loss": 0.2117,
"step": 348
},
{
"epoch": 0.4319306930693069,
"grad_norm": 5.928184509277344,
"learning_rate": 1.4010704879139669e-05,
"loss": 0.2693,
"step": 349
},
{
"epoch": 0.43316831683168316,
"grad_norm": 6.965827941894531,
"learning_rate": 1.3971082312379864e-05,
"loss": 0.2764,
"step": 350
},
{
"epoch": 0.4344059405940594,
"grad_norm": 6.468440532684326,
"learning_rate": 1.3931385590842412e-05,
"loss": 0.256,
"step": 351
},
{
"epoch": 0.43564356435643564,
"grad_norm": 3.584207773208618,
"learning_rate": 1.3891615455811751e-05,
"loss": 0.2238,
"step": 352
},
{
"epoch": 0.4368811881188119,
"grad_norm": 5.467838764190674,
"learning_rate": 1.3851772649943238e-05,
"loss": 0.227,
"step": 353
},
{
"epoch": 0.4381188118811881,
"grad_norm": 3.78888201713562,
"learning_rate": 1.3811857917249254e-05,
"loss": 0.1581,
"step": 354
},
{
"epoch": 0.43935643564356436,
"grad_norm": 3.6381869316101074,
"learning_rate": 1.3771872003085315e-05,
"loss": 0.1825,
"step": 355
},
{
"epoch": 0.4405940594059406,
"grad_norm": 7.276025772094727,
"learning_rate": 1.373181565413617e-05,
"loss": 0.1976,
"step": 356
},
{
"epoch": 0.44183168316831684,
"grad_norm": 3.857329845428467,
"learning_rate": 1.3691689618401836e-05,
"loss": 0.2113,
"step": 357
},
{
"epoch": 0.4430693069306931,
"grad_norm": 7.177792072296143,
"learning_rate": 1.365149464518364e-05,
"loss": 0.216,
"step": 358
},
{
"epoch": 0.4443069306930693,
"grad_norm": 4.157836437225342,
"learning_rate": 1.3611231485070233e-05,
"loss": 0.2549,
"step": 359
},
{
"epoch": 0.44554455445544555,
"grad_norm": 4.768564701080322,
"learning_rate": 1.3570900889923566e-05,
"loss": 0.2369,
"step": 360
},
{
"epoch": 0.44554455445544555,
"eval_accuracy": 0.8747228381374723,
"eval_f1": 0.754880694143167,
"eval_loss": 0.2958848178386688,
"eval_precision": 0.8365384615384616,
"eval_recall": 0.6877470355731226,
"eval_runtime": 48.4168,
"eval_samples_per_second": 5.7,
"eval_steps_per_second": 0.186,
"step": 360
},
{
"epoch": 0.4467821782178218,
"grad_norm": 4.88621187210083,
"learning_rate": 1.3530503612864846e-05,
"loss": 0.2358,
"step": 361
},
{
"epoch": 0.44801980198019803,
"grad_norm": 6.0111188888549805,
"learning_rate": 1.3490040408260481e-05,
"loss": 0.2217,
"step": 362
},
{
"epoch": 0.44925742574257427,
"grad_norm": 4.757837295532227,
"learning_rate": 1.3449512031707987e-05,
"loss": 0.2563,
"step": 363
},
{
"epoch": 0.4504950495049505,
"grad_norm": 4.659700870513916,
"learning_rate": 1.340891924002189e-05,
"loss": 0.2332,
"step": 364
},
{
"epoch": 0.45173267326732675,
"grad_norm": 4.033824920654297,
"learning_rate": 1.3368262791219568e-05,
"loss": 0.1607,
"step": 365
},
{
"epoch": 0.452970297029703,
"grad_norm": 3.7575106620788574,
"learning_rate": 1.3327543444507134e-05,
"loss": 0.1768,
"step": 366
},
{
"epoch": 0.4542079207920792,
"grad_norm": 3.7037270069122314,
"learning_rate": 1.3286761960265216e-05,
"loss": 0.1891,
"step": 367
},
{
"epoch": 0.45544554455445546,
"grad_norm": 4.199578762054443,
"learning_rate": 1.3245919100034794e-05,
"loss": 0.2017,
"step": 368
},
{
"epoch": 0.4566831683168317,
"grad_norm": 5.948062419891357,
"learning_rate": 1.3205015626502957e-05,
"loss": 0.2974,
"step": 369
},
{
"epoch": 0.45792079207920794,
"grad_norm": 7.015734672546387,
"learning_rate": 1.3164052303488673e-05,
"loss": 0.2077,
"step": 370
},
{
"epoch": 0.4591584158415842,
"grad_norm": 4.3216657638549805,
"learning_rate": 1.3123029895928516e-05,
"loss": 0.1838,
"step": 371
},
{
"epoch": 0.4603960396039604,
"grad_norm": 3.7913501262664795,
"learning_rate": 1.3081949169862398e-05,
"loss": 0.2199,
"step": 372
},
{
"epoch": 0.46163366336633666,
"grad_norm": 4.902266979217529,
"learning_rate": 1.304081089241923e-05,
"loss": 0.166,
"step": 373
},
{
"epoch": 0.4628712871287129,
"grad_norm": 4.847469806671143,
"learning_rate": 1.2999615831802647e-05,
"loss": 0.2465,
"step": 374
},
{
"epoch": 0.46410891089108913,
"grad_norm": 5.700747013092041,
"learning_rate": 1.2958364757276616e-05,
"loss": 0.2151,
"step": 375
},
{
"epoch": 0.46534653465346537,
"grad_norm": 4.389228343963623,
"learning_rate": 1.2917058439151101e-05,
"loss": 0.206,
"step": 376
},
{
"epoch": 0.4665841584158416,
"grad_norm": 3.7354536056518555,
"learning_rate": 1.2875697648767664e-05,
"loss": 0.1675,
"step": 377
},
{
"epoch": 0.46782178217821785,
"grad_norm": 4.175409317016602,
"learning_rate": 1.2834283158485064e-05,
"loss": 0.1649,
"step": 378
},
{
"epoch": 0.46905940594059403,
"grad_norm": 4.25518274307251,
"learning_rate": 1.2792815741664839e-05,
"loss": 0.2133,
"step": 379
},
{
"epoch": 0.47029702970297027,
"grad_norm": 5.524796009063721,
"learning_rate": 1.2751296172656862e-05,
"loss": 0.2355,
"step": 380
},
{
"epoch": 0.47029702970297027,
"eval_accuracy": 0.8536585365853658,
"eval_f1": 0.6944444444444444,
"eval_loss": 0.3176264464855194,
"eval_precision": 0.8379888268156425,
"eval_recall": 0.5928853754940712,
"eval_runtime": 49.1355,
"eval_samples_per_second": 5.617,
"eval_steps_per_second": 0.183,
"step": 380
},
{
"epoch": 0.4715346534653465,
"grad_norm": 4.528620719909668,
"learning_rate": 1.2709725226784872e-05,
"loss": 0.2361,
"step": 381
},
{
"epoch": 0.47277227722772275,
"grad_norm": 4.263962745666504,
"learning_rate": 1.2668103680332011e-05,
"loss": 0.1655,
"step": 382
},
{
"epoch": 0.474009900990099,
"grad_norm": 4.377805709838867,
"learning_rate": 1.2626432310526321e-05,
"loss": 0.1856,
"step": 383
},
{
"epoch": 0.4752475247524752,
"grad_norm": 3.614685535430908,
"learning_rate": 1.2584711895526227e-05,
"loss": 0.1409,
"step": 384
},
{
"epoch": 0.47648514851485146,
"grad_norm": 4.1616644859313965,
"learning_rate": 1.2542943214406012e-05,
"loss": 0.2266,
"step": 385
},
{
"epoch": 0.4777227722772277,
"grad_norm": 3.9394242763519287,
"learning_rate": 1.250112704714126e-05,
"loss": 0.2002,
"step": 386
},
{
"epoch": 0.47896039603960394,
"grad_norm": 4.4327497482299805,
"learning_rate": 1.2459264174594303e-05,
"loss": 0.2154,
"step": 387
},
{
"epoch": 0.4801980198019802,
"grad_norm": 5.662235260009766,
"learning_rate": 1.2417355378499631e-05,
"loss": 0.1944,
"step": 388
},
{
"epoch": 0.4814356435643564,
"grad_norm": 5.926950454711914,
"learning_rate": 1.2375401441449296e-05,
"loss": 0.2783,
"step": 389
},
{
"epoch": 0.48267326732673266,
"grad_norm": 5.828728199005127,
"learning_rate": 1.23334031468783e-05,
"loss": 0.191,
"step": 390
},
{
"epoch": 0.4839108910891089,
"grad_norm": 4.729196071624756,
"learning_rate": 1.229136127904996e-05,
"loss": 0.2154,
"step": 391
},
{
"epoch": 0.48514851485148514,
"grad_norm": 6.837848663330078,
"learning_rate": 1.2249276623041268e-05,
"loss": 0.2192,
"step": 392
},
{
"epoch": 0.4863861386138614,
"grad_norm": 5.411125659942627,
"learning_rate": 1.2207149964728236e-05,
"loss": 0.196,
"step": 393
},
{
"epoch": 0.4876237623762376,
"grad_norm": 3.0349247455596924,
"learning_rate": 1.2164982090771202e-05,
"loss": 0.1437,
"step": 394
},
{
"epoch": 0.48886138613861385,
"grad_norm": 8.826495170593262,
"learning_rate": 1.2122773788600164e-05,
"loss": 0.2106,
"step": 395
},
{
"epoch": 0.4900990099009901,
"grad_norm": 4.7112555503845215,
"learning_rate": 1.2080525846400055e-05,
"loss": 0.2362,
"step": 396
},
{
"epoch": 0.49133663366336633,
"grad_norm": 4.847238063812256,
"learning_rate": 1.203823905309604e-05,
"loss": 0.2131,
"step": 397
},
{
"epoch": 0.49257425742574257,
"grad_norm": 4.841707229614258,
"learning_rate": 1.1995914198338777e-05,
"loss": 0.2312,
"step": 398
},
{
"epoch": 0.4938118811881188,
"grad_norm": 6.640840530395508,
"learning_rate": 1.1953552072489666e-05,
"loss": 0.272,
"step": 399
},
{
"epoch": 0.49504950495049505,
"grad_norm": 3.752530097961426,
"learning_rate": 1.1911153466606105e-05,
"loss": 0.1538,
"step": 400
},
{
"epoch": 0.49504950495049505,
"eval_accuracy": 0.8503325942350333,
"eval_f1": 0.6778042959427207,
"eval_loss": 0.3098345696926117,
"eval_precision": 0.8554216867469879,
"eval_recall": 0.5612648221343873,
"eval_runtime": 47.9792,
"eval_samples_per_second": 5.752,
"eval_steps_per_second": 0.188,
"step": 400
},
{
"epoch": 0.4962871287128713,
"grad_norm": 3.4826607704162598,
"learning_rate": 1.1868719172426703e-05,
"loss": 0.1486,
"step": 401
},
{
"epoch": 0.4975247524752475,
"grad_norm": 8.029196739196777,
"learning_rate": 1.18262499823565e-05,
"loss": 0.2789,
"step": 402
},
{
"epoch": 0.49876237623762376,
"grad_norm": 3.5876622200012207,
"learning_rate": 1.1783746689452177e-05,
"loss": 0.1425,
"step": 403
},
{
"epoch": 0.5,
"grad_norm": 3.4825289249420166,
"learning_rate": 1.174121008740724e-05,
"loss": 0.1784,
"step": 404
},
{
"epoch": 0.5012376237623762,
"grad_norm": 4.722503185272217,
"learning_rate": 1.1698640970537195e-05,
"loss": 0.2572,
"step": 405
},
{
"epoch": 0.5024752475247525,
"grad_norm": 4.3678178787231445,
"learning_rate": 1.1656040133764721e-05,
"loss": 0.2721,
"step": 406
},
{
"epoch": 0.5037128712871287,
"grad_norm": 5.2886481285095215,
"learning_rate": 1.1613408372604826e-05,
"loss": 0.2189,
"step": 407
},
{
"epoch": 0.504950495049505,
"grad_norm": 3.8910672664642334,
"learning_rate": 1.1570746483149997e-05,
"loss": 0.1907,
"step": 408
},
{
"epoch": 0.5061881188118812,
"grad_norm": 4.010484218597412,
"learning_rate": 1.1528055262055318e-05,
"loss": 0.2616,
"step": 409
},
{
"epoch": 0.5074257425742574,
"grad_norm": 4.501477241516113,
"learning_rate": 1.14853355065236e-05,
"loss": 0.256,
"step": 410
},
{
"epoch": 0.5086633663366337,
"grad_norm": 3.6644740104675293,
"learning_rate": 1.1442588014290511e-05,
"loss": 0.2089,
"step": 411
},
{
"epoch": 0.5099009900990099,
"grad_norm": 3.0626332759857178,
"learning_rate": 1.139981358360966e-05,
"loss": 0.1694,
"step": 412
},
{
"epoch": 0.5111386138613861,
"grad_norm": 3.9855141639709473,
"learning_rate": 1.135701301323769e-05,
"loss": 0.2654,
"step": 413
},
{
"epoch": 0.5123762376237624,
"grad_norm": 4.269925594329834,
"learning_rate": 1.1314187102419374e-05,
"loss": 0.2045,
"step": 414
},
{
"epoch": 0.5136138613861386,
"grad_norm": 5.409115314483643,
"learning_rate": 1.1271336650872687e-05,
"loss": 0.2785,
"step": 415
},
{
"epoch": 0.5148514851485149,
"grad_norm": 5.027437210083008,
"learning_rate": 1.1228462458773866e-05,
"loss": 0.2592,
"step": 416
},
{
"epoch": 0.5160891089108911,
"grad_norm": 3.8614933490753174,
"learning_rate": 1.1185565326742474e-05,
"loss": 0.175,
"step": 417
},
{
"epoch": 0.5173267326732673,
"grad_norm": 4.525482177734375,
"learning_rate": 1.1142646055826442e-05,
"loss": 0.2285,
"step": 418
},
{
"epoch": 0.5185643564356436,
"grad_norm": 5.450479984283447,
"learning_rate": 1.1099705447487128e-05,
"loss": 0.2976,
"step": 419
},
{
"epoch": 0.5198019801980198,
"grad_norm": 5.157322406768799,
"learning_rate": 1.1056744303584322e-05,
"loss": 0.2261,
"step": 420
},
{
"epoch": 0.5198019801980198,
"eval_accuracy": 0.8658536585365854,
"eval_f1": 0.7352297592997812,
"eval_loss": 0.2963501214981079,
"eval_precision": 0.8235294117647058,
"eval_recall": 0.6640316205533597,
"eval_runtime": 48.7884,
"eval_samples_per_second": 5.657,
"eval_steps_per_second": 0.184,
"step": 420
},
{
"epoch": 0.5210396039603961,
"grad_norm": 6.793178558349609,
"learning_rate": 1.1013763426361303e-05,
"loss": 0.2282,
"step": 421
},
{
"epoch": 0.5222772277227723,
"grad_norm": 3.814647912979126,
"learning_rate": 1.0970763618429841e-05,
"loss": 0.206,
"step": 422
},
{
"epoch": 0.5235148514851485,
"grad_norm": 6.057635307312012,
"learning_rate": 1.0927745682755202e-05,
"loss": 0.2281,
"step": 423
},
{
"epoch": 0.5247524752475248,
"grad_norm": 4.867273330688477,
"learning_rate": 1.088471042264118e-05,
"loss": 0.1996,
"step": 424
},
{
"epoch": 0.525990099009901,
"grad_norm": 4.475593090057373,
"learning_rate": 1.0841658641715064e-05,
"loss": 0.2007,
"step": 425
},
{
"epoch": 0.5272277227722773,
"grad_norm": 4.282394886016846,
"learning_rate": 1.079859114391266e-05,
"loss": 0.2534,
"step": 426
},
{
"epoch": 0.5284653465346535,
"grad_norm": 5.078785419464111,
"learning_rate": 1.0755508733463265e-05,
"loss": 0.2951,
"step": 427
},
{
"epoch": 0.5297029702970297,
"grad_norm": 5.91337776184082,
"learning_rate": 1.071241221487464e-05,
"loss": 0.2615,
"step": 428
},
{
"epoch": 0.530940594059406,
"grad_norm": 3.2098143100738525,
"learning_rate": 1.0669302392918007e-05,
"loss": 0.1601,
"step": 429
},
{
"epoch": 0.5321782178217822,
"grad_norm": 3.9253528118133545,
"learning_rate": 1.0626180072613011e-05,
"loss": 0.1894,
"step": 430
},
{
"epoch": 0.5334158415841584,
"grad_norm": 6.6494059562683105,
"learning_rate": 1.0583046059212678e-05,
"loss": 0.2357,
"step": 431
},
{
"epoch": 0.5346534653465347,
"grad_norm": 4.98652982711792,
"learning_rate": 1.0539901158188399e-05,
"loss": 0.2233,
"step": 432
},
{
"epoch": 0.5358910891089109,
"grad_norm": 3.734840154647827,
"learning_rate": 1.0496746175214869e-05,
"loss": 0.1968,
"step": 433
},
{
"epoch": 0.5371287128712872,
"grad_norm": 3.251173734664917,
"learning_rate": 1.045358191615506e-05,
"loss": 0.1523,
"step": 434
},
{
"epoch": 0.5383663366336634,
"grad_norm": 4.58748722076416,
"learning_rate": 1.0410409187045145e-05,
"loss": 0.2063,
"step": 435
},
{
"epoch": 0.5396039603960396,
"grad_norm": 3.0365309715270996,
"learning_rate": 1.0367228794079483e-05,
"loss": 0.1661,
"step": 436
},
{
"epoch": 0.5408415841584159,
"grad_norm": 4.199584484100342,
"learning_rate": 1.0324041543595536e-05,
"loss": 0.1809,
"step": 437
},
{
"epoch": 0.5420792079207921,
"grad_norm": 3.332609176635742,
"learning_rate": 1.0280848242058819e-05,
"loss": 0.1976,
"step": 438
},
{
"epoch": 0.5433168316831684,
"grad_norm": 3.00707745552063,
"learning_rate": 1.0237649696047851e-05,
"loss": 0.1098,
"step": 439
},
{
"epoch": 0.5445544554455446,
"grad_norm": 5.463438034057617,
"learning_rate": 1.0194446712239076e-05,
"loss": 0.1894,
"step": 440
},
{
"epoch": 0.5445544554455446,
"eval_accuracy": 0.8625277161862528,
"eval_f1": 0.7075471698113207,
"eval_loss": 0.30853110551834106,
"eval_precision": 0.8771929824561403,
"eval_recall": 0.5928853754940712,
"eval_runtime": 48.179,
"eval_samples_per_second": 5.729,
"eval_steps_per_second": 0.187,
"step": 440
},
{
"epoch": 0.5457920792079208,
"grad_norm": 4.850437641143799,
"learning_rate": 1.015124009739182e-05,
"loss": 0.2444,
"step": 441
},
{
"epoch": 0.5470297029702971,
"grad_norm": 4.6562018394470215,
"learning_rate": 1.0108030658333193e-05,
"loss": 0.1926,
"step": 442
},
{
"epoch": 0.5482673267326733,
"grad_norm": 6.2014641761779785,
"learning_rate": 1.0064819201943066e-05,
"loss": 0.2206,
"step": 443
},
{
"epoch": 0.5495049504950495,
"grad_norm": 4.451841354370117,
"learning_rate": 1.0021606535138965e-05,
"loss": 0.1638,
"step": 444
},
{
"epoch": 0.5507425742574258,
"grad_norm": 5.371818542480469,
"learning_rate": 9.978393464861036e-06,
"loss": 0.2092,
"step": 445
},
{
"epoch": 0.551980198019802,
"grad_norm": 4.511114120483398,
"learning_rate": 9.935180798056936e-06,
"loss": 0.1662,
"step": 446
},
{
"epoch": 0.5532178217821783,
"grad_norm": 4.505270957946777,
"learning_rate": 9.891969341666809e-06,
"loss": 0.1585,
"step": 447
},
{
"epoch": 0.5544554455445545,
"grad_norm": 8.295400619506836,
"learning_rate": 9.848759902608188e-06,
"loss": 0.1938,
"step": 448
},
{
"epoch": 0.5556930693069307,
"grad_norm": 3.9564101696014404,
"learning_rate": 9.805553287760922e-06,
"loss": 0.1661,
"step": 449
},
{
"epoch": 0.556930693069307,
"grad_norm": 4.960870265960693,
"learning_rate": 9.76235030395215e-06,
"loss": 0.2025,
"step": 450
},
{
"epoch": 0.5581683168316832,
"grad_norm": 5.388489246368408,
"learning_rate": 9.719151757941184e-06,
"loss": 0.1984,
"step": 451
},
{
"epoch": 0.5594059405940595,
"grad_norm": 5.2945356369018555,
"learning_rate": 9.675958456404468e-06,
"loss": 0.1538,
"step": 452
},
{
"epoch": 0.5606435643564357,
"grad_norm": 3.8537747859954834,
"learning_rate": 9.63277120592052e-06,
"loss": 0.2006,
"step": 453
},
{
"epoch": 0.5618811881188119,
"grad_norm": 9.160053253173828,
"learning_rate": 9.589590812954858e-06,
"loss": 0.2389,
"step": 454
},
{
"epoch": 0.5631188118811881,
"grad_norm": 5.641171932220459,
"learning_rate": 9.546418083844944e-06,
"loss": 0.2629,
"step": 455
},
{
"epoch": 0.5643564356435643,
"grad_norm": 5.4803266525268555,
"learning_rate": 9.503253824785133e-06,
"loss": 0.2714,
"step": 456
},
{
"epoch": 0.5655940594059405,
"grad_norm": 5.162708759307861,
"learning_rate": 9.460098841811601e-06,
"loss": 0.2355,
"step": 457
},
{
"epoch": 0.5668316831683168,
"grad_norm": 3.652041435241699,
"learning_rate": 9.416953940787324e-06,
"loss": 0.2158,
"step": 458
},
{
"epoch": 0.568069306930693,
"grad_norm": 4.453556060791016,
"learning_rate": 9.373819927386996e-06,
"loss": 0.2006,
"step": 459
},
{
"epoch": 0.5693069306930693,
"grad_norm": 7.275057792663574,
"learning_rate": 9.330697607081995e-06,
"loss": 0.2089,
"step": 460
},
{
"epoch": 0.5693069306930693,
"eval_accuracy": 0.8592017738359202,
"eval_f1": 0.702576112412178,
"eval_loss": 0.3102664649486542,
"eval_precision": 0.8620689655172413,
"eval_recall": 0.5928853754940712,
"eval_runtime": 47.9599,
"eval_samples_per_second": 5.755,
"eval_steps_per_second": 0.188,
"step": 460
},
{
"epoch": 0.5705445544554455,
"grad_norm": 4.3111371994018555,
"learning_rate": 9.287587785125364e-06,
"loss": 0.1911,
"step": 461
},
{
"epoch": 0.5717821782178217,
"grad_norm": 3.9077603816986084,
"learning_rate": 9.244491266536742e-06,
"loss": 0.1729,
"step": 462
},
{
"epoch": 0.573019801980198,
"grad_norm": 4.95902156829834,
"learning_rate": 9.20140885608734e-06,
"loss": 0.1808,
"step": 463
},
{
"epoch": 0.5742574257425742,
"grad_norm": 3.4273617267608643,
"learning_rate": 9.158341358284939e-06,
"loss": 0.1742,
"step": 464
},
{
"epoch": 0.5754950495049505,
"grad_norm": 5.972158432006836,
"learning_rate": 9.115289577358826e-06,
"loss": 0.2702,
"step": 465
},
{
"epoch": 0.5767326732673267,
"grad_norm": 4.50413703918457,
"learning_rate": 9.072254317244802e-06,
"loss": 0.2357,
"step": 466
},
{
"epoch": 0.5779702970297029,
"grad_norm": 4.178338527679443,
"learning_rate": 9.029236381570162e-06,
"loss": 0.2067,
"step": 467
},
{
"epoch": 0.5792079207920792,
"grad_norm": 3.6516830921173096,
"learning_rate": 8.986236573638697e-06,
"loss": 0.1838,
"step": 468
},
{
"epoch": 0.5804455445544554,
"grad_norm": 4.150967121124268,
"learning_rate": 8.94325569641568e-06,
"loss": 0.1873,
"step": 469
},
{
"epoch": 0.5816831683168316,
"grad_norm": 3.2436413764953613,
"learning_rate": 8.900294552512878e-06,
"loss": 0.1666,
"step": 470
},
{
"epoch": 0.5829207920792079,
"grad_norm": 4.382279396057129,
"learning_rate": 8.85735394417356e-06,
"loss": 0.2198,
"step": 471
},
{
"epoch": 0.5841584158415841,
"grad_norm": 5.853991508483887,
"learning_rate": 8.81443467325753e-06,
"loss": 0.2089,
"step": 472
},
{
"epoch": 0.5853960396039604,
"grad_norm": 6.203914642333984,
"learning_rate": 8.771537541226139e-06,
"loss": 0.2313,
"step": 473
},
{
"epoch": 0.5866336633663366,
"grad_norm": 4.027249813079834,
"learning_rate": 8.728663349127315e-06,
"loss": 0.2215,
"step": 474
},
{
"epoch": 0.5878712871287128,
"grad_norm": 3.39310359954834,
"learning_rate": 8.68581289758063e-06,
"loss": 0.1716,
"step": 475
},
{
"epoch": 0.5891089108910891,
"grad_norm": 8.093832015991211,
"learning_rate": 8.642986986762315e-06,
"loss": 0.2452,
"step": 476
},
{
"epoch": 0.5903465346534653,
"grad_norm": 6.332000255584717,
"learning_rate": 8.600186416390343e-06,
"loss": 0.1736,
"step": 477
},
{
"epoch": 0.5915841584158416,
"grad_norm": 4.7539167404174805,
"learning_rate": 8.55741198570949e-06,
"loss": 0.2205,
"step": 478
},
{
"epoch": 0.5928217821782178,
"grad_norm": 5.075088024139404,
"learning_rate": 8.514664493476402e-06,
"loss": 0.1496,
"step": 479
},
{
"epoch": 0.594059405940594,
"grad_norm": 3.6095449924468994,
"learning_rate": 8.471944737944687e-06,
"loss": 0.225,
"step": 480
},
{
"epoch": 0.594059405940594,
"eval_accuracy": 0.8669623059866962,
"eval_f1": 0.7285067873303167,
"eval_loss": 0.2933300733566284,
"eval_precision": 0.8518518518518519,
"eval_recall": 0.6363636363636364,
"eval_runtime": 47.6843,
"eval_samples_per_second": 5.788,
"eval_steps_per_second": 0.189,
"step": 480
},
{
"epoch": 0.5952970297029703,
"grad_norm": 3.744133949279785,
"learning_rate": 8.429253516850006e-06,
"loss": 0.2748,
"step": 481
},
{
"epoch": 0.5965346534653465,
"grad_norm": 3.6019346714019775,
"learning_rate": 8.386591627395174e-06,
"loss": 0.1726,
"step": 482
},
{
"epoch": 0.5977722772277227,
"grad_norm": 4.779697418212891,
"learning_rate": 8.343959866235282e-06,
"loss": 0.1887,
"step": 483
},
{
"epoch": 0.599009900990099,
"grad_norm": 5.494039058685303,
"learning_rate": 8.30135902946281e-06,
"loss": 0.2056,
"step": 484
},
{
"epoch": 0.6002475247524752,
"grad_norm": 4.281253337860107,
"learning_rate": 8.25878991259276e-06,
"loss": 0.1711,
"step": 485
},
{
"epoch": 0.6014851485148515,
"grad_norm": 7.502985000610352,
"learning_rate": 8.216253310547824e-06,
"loss": 0.2466,
"step": 486
},
{
"epoch": 0.6027227722772277,
"grad_norm": 6.201837062835693,
"learning_rate": 8.173750017643504e-06,
"loss": 0.188,
"step": 487
},
{
"epoch": 0.6039603960396039,
"grad_norm": 3.864279270172119,
"learning_rate": 8.1312808275733e-06,
"loss": 0.1973,
"step": 488
},
{
"epoch": 0.6051980198019802,
"grad_norm": 5.216121673583984,
"learning_rate": 8.0888465333939e-06,
"loss": 0.2352,
"step": 489
},
{
"epoch": 0.6064356435643564,
"grad_norm": 5.146021842956543,
"learning_rate": 8.046447927510335e-06,
"loss": 0.2447,
"step": 490
},
{
"epoch": 0.6076732673267327,
"grad_norm": 3.6283152103424072,
"learning_rate": 8.004085801661227e-06,
"loss": 0.1614,
"step": 491
},
{
"epoch": 0.6089108910891089,
"grad_norm": 3.2507400512695312,
"learning_rate": 7.961760946903964e-06,
"loss": 0.1683,
"step": 492
},
{
"epoch": 0.6101485148514851,
"grad_norm": 3.735715627670288,
"learning_rate": 7.919474153599948e-06,
"loss": 0.185,
"step": 493
},
{
"epoch": 0.6113861386138614,
"grad_norm": 3.8470706939697266,
"learning_rate": 7.87722621139984e-06,
"loss": 0.201,
"step": 494
},
{
"epoch": 0.6126237623762376,
"grad_norm": 5.094181060791016,
"learning_rate": 7.835017909228801e-06,
"loss": 0.2129,
"step": 495
},
{
"epoch": 0.6138613861386139,
"grad_norm": 3.934907913208008,
"learning_rate": 7.792850035271768e-06,
"loss": 0.1731,
"step": 496
},
{
"epoch": 0.6150990099009901,
"grad_norm": 3.6140084266662598,
"learning_rate": 7.750723376958735e-06,
"loss": 0.1805,
"step": 497
},
{
"epoch": 0.6163366336633663,
"grad_norm": 4.225896835327148,
"learning_rate": 7.708638720950043e-06,
"loss": 0.1787,
"step": 498
},
{
"epoch": 0.6175742574257426,
"grad_norm": 2.794840097427368,
"learning_rate": 7.666596853121702e-06,
"loss": 0.108,
"step": 499
},
{
"epoch": 0.6188118811881188,
"grad_norm": 4.840591907501221,
"learning_rate": 7.624598558550707e-06,
"loss": 0.2837,
"step": 500
},
{
"epoch": 0.6188118811881188,
"eval_accuracy": 0.8636363636363636,
"eval_f1": 0.7272727272727273,
"eval_loss": 0.2955215275287628,
"eval_precision": 0.8282828282828283,
"eval_recall": 0.6482213438735178,
"eval_runtime": 48.4116,
"eval_samples_per_second": 5.701,
"eval_steps_per_second": 0.186,
"step": 500
},
{
"epoch": 0.620049504950495,
"grad_norm": 3.656219244003296,
"learning_rate": 7.5826446215003695e-06,
"loss": 0.1701,
"step": 501
},
{
"epoch": 0.6212871287128713,
"grad_norm": 3.9823882579803467,
"learning_rate": 7.5407358254056995e-06,
"loss": 0.1759,
"step": 502
},
{
"epoch": 0.6225247524752475,
"grad_norm": 5.238447666168213,
"learning_rate": 7.4988729528587445e-06,
"loss": 0.2084,
"step": 503
},
{
"epoch": 0.6237623762376238,
"grad_norm": 5.870246887207031,
"learning_rate": 7.45705678559399e-06,
"loss": 0.2592,
"step": 504
},
{
"epoch": 0.625,
"grad_norm": 6.469812393188477,
"learning_rate": 7.415288104473774e-06,
"loss": 0.2664,
"step": 505
},
{
"epoch": 0.6262376237623762,
"grad_norm": 4.086902141571045,
"learning_rate": 7.373567689473683e-06,
"loss": 0.123,
"step": 506
},
{
"epoch": 0.6274752475247525,
"grad_norm": 4.265713691711426,
"learning_rate": 7.3318963196679904e-06,
"loss": 0.1567,
"step": 507
},
{
"epoch": 0.6287128712871287,
"grad_norm": 5.820674419403076,
"learning_rate": 7.290274773215131e-06,
"loss": 0.2661,
"step": 508
},
{
"epoch": 0.629950495049505,
"grad_norm": 3.3316519260406494,
"learning_rate": 7.248703827343142e-06,
"loss": 0.1827,
"step": 509
},
{
"epoch": 0.6311881188118812,
"grad_norm": 6.632443428039551,
"learning_rate": 7.207184258335163e-06,
"loss": 0.2486,
"step": 510
},
{
"epoch": 0.6324257425742574,
"grad_norm": 5.278284072875977,
"learning_rate": 7.1657168415149396e-06,
"loss": 0.205,
"step": 511
},
{
"epoch": 0.6336633663366337,
"grad_norm": 7.767988204956055,
"learning_rate": 7.124302351232337e-06,
"loss": 0.1912,
"step": 512
},
{
"epoch": 0.6349009900990099,
"grad_norm": 4.862364292144775,
"learning_rate": 7.0829415608489e-06,
"loss": 0.2028,
"step": 513
},
{
"epoch": 0.6361386138613861,
"grad_norm": 4.547641754150391,
"learning_rate": 7.041635242723386e-06,
"loss": 0.182,
"step": 514
},
{
"epoch": 0.6373762376237624,
"grad_norm": 4.835113048553467,
"learning_rate": 7.000384168197354e-06,
"loss": 0.2437,
"step": 515
},
{
"epoch": 0.6386138613861386,
"grad_norm": 4.105000019073486,
"learning_rate": 6.9591891075807705e-06,
"loss": 0.1998,
"step": 516
},
{
"epoch": 0.6398514851485149,
"grad_norm": 4.884759902954102,
"learning_rate": 6.918050830137608e-06,
"loss": 0.2597,
"step": 517
},
{
"epoch": 0.6410891089108911,
"grad_norm": 5.189384460449219,
"learning_rate": 6.876970104071483e-06,
"loss": 0.2485,
"step": 518
},
{
"epoch": 0.6423267326732673,
"grad_norm": 4.154090881347656,
"learning_rate": 6.8359476965113295e-06,
"loss": 0.179,
"step": 519
},
{
"epoch": 0.6435643564356436,
"grad_norm": 5.2503437995910645,
"learning_rate": 6.7949843734970475e-06,
"loss": 0.2046,
"step": 520
},
{
"epoch": 0.6435643564356436,
"eval_accuracy": 0.8647450110864745,
"eval_f1": 0.7252252252252253,
"eval_loss": 0.29431188106536865,
"eval_precision": 0.8429319371727748,
"eval_recall": 0.6363636363636364,
"eval_runtime": 49.3715,
"eval_samples_per_second": 5.59,
"eval_steps_per_second": 0.182,
"step": 520
},
{
"epoch": 0.6448019801980198,
"grad_norm": 3.8733835220336914,
"learning_rate": 6.754080899965208e-06,
"loss": 0.2034,
"step": 521
},
{
"epoch": 0.6460396039603961,
"grad_norm": 3.805725574493408,
"learning_rate": 6.713238039734788e-06,
"loss": 0.151,
"step": 522
},
{
"epoch": 0.6472772277227723,
"grad_norm": 3.7677907943725586,
"learning_rate": 6.67245655549287e-06,
"loss": 0.1466,
"step": 523
},
{
"epoch": 0.6485148514851485,
"grad_norm": 3.6880476474761963,
"learning_rate": 6.631737208780433e-06,
"loss": 0.1986,
"step": 524
},
{
"epoch": 0.6497524752475248,
"grad_norm": 4.416601657867432,
"learning_rate": 6.5910807599781135e-06,
"loss": 0.2105,
"step": 525
},
{
"epoch": 0.650990099009901,
"grad_norm": 3.7478973865509033,
"learning_rate": 6.550487968292013e-06,
"loss": 0.1534,
"step": 526
},
{
"epoch": 0.6522277227722773,
"grad_norm": 5.128391742706299,
"learning_rate": 6.509959591739522e-06,
"loss": 0.2103,
"step": 527
},
{
"epoch": 0.6534653465346535,
"grad_norm": 5.070952415466309,
"learning_rate": 6.469496387135158e-06,
"loss": 0.1674,
"step": 528
},
{
"epoch": 0.6547029702970297,
"grad_norm": 4.714488983154297,
"learning_rate": 6.429099110076436e-06,
"loss": 0.221,
"step": 529
},
{
"epoch": 0.655940594059406,
"grad_norm": 5.199388027191162,
"learning_rate": 6.388768514929768e-06,
"loss": 0.2027,
"step": 530
},
{
"epoch": 0.6571782178217822,
"grad_norm": 5.243039608001709,
"learning_rate": 6.3485053548163644e-06,
"loss": 0.2347,
"step": 531
},
{
"epoch": 0.6584158415841584,
"grad_norm": 6.204155445098877,
"learning_rate": 6.308310381598168e-06,
"loss": 0.1924,
"step": 532
},
{
"epoch": 0.6596534653465347,
"grad_norm": 4.144034385681152,
"learning_rate": 6.2681843458638345e-06,
"loss": 0.2338,
"step": 533
},
{
"epoch": 0.6608910891089109,
"grad_norm": 3.8708503246307373,
"learning_rate": 6.2281279969146855e-06,
"loss": 0.1827,
"step": 534
},
{
"epoch": 0.6621287128712872,
"grad_norm": 4.874747276306152,
"learning_rate": 6.18814208275075e-06,
"loss": 0.2345,
"step": 535
},
{
"epoch": 0.6633663366336634,
"grad_norm": 3.9638774394989014,
"learning_rate": 6.148227350056763e-06,
"loss": 0.173,
"step": 536
},
{
"epoch": 0.6646039603960396,
"grad_norm": 3.154224395751953,
"learning_rate": 6.10838454418825e-06,
"loss": 0.1253,
"step": 537
},
{
"epoch": 0.6658415841584159,
"grad_norm": 3.55877947807312,
"learning_rate": 6.068614409157591e-06,
"loss": 0.1708,
"step": 538
},
{
"epoch": 0.6670792079207921,
"grad_norm": 3.995196580886841,
"learning_rate": 6.0289176876201385e-06,
"loss": 0.1969,
"step": 539
},
{
"epoch": 0.6683168316831684,
"grad_norm": 3.824521064758301,
"learning_rate": 5.989295120860334e-06,
"loss": 0.1548,
"step": 540
},
{
"epoch": 0.6683168316831684,
"eval_accuracy": 0.8636363636363636,
"eval_f1": 0.7223476297968398,
"eval_loss": 0.3003367483615875,
"eval_precision": 0.8421052631578947,
"eval_recall": 0.6324110671936759,
"eval_runtime": 49.5296,
"eval_samples_per_second": 5.572,
"eval_steps_per_second": 0.182,
"step": 540
},
{
"epoch": 0.6695544554455446,
"grad_norm": 5.372792720794678,
"learning_rate": 5.94974744877789e-06,
"loss": 0.179,
"step": 541
},
{
"epoch": 0.6707920792079208,
"grad_norm": 4.743022441864014,
"learning_rate": 5.910275409873942e-06,
"loss": 0.1794,
"step": 542
},
{
"epoch": 0.6720297029702971,
"grad_norm": 3.996967315673828,
"learning_rate": 5.870879741237285e-06,
"loss": 0.1525,
"step": 543
},
{
"epoch": 0.6732673267326733,
"grad_norm": 7.0855207443237305,
"learning_rate": 5.831561178530602e-06,
"loss": 0.2256,
"step": 544
},
{
"epoch": 0.6745049504950495,
"grad_norm": 7.199305057525635,
"learning_rate": 5.792320455976714e-06,
"loss": 0.2125,
"step": 545
},
{
"epoch": 0.6757425742574258,
"grad_norm": 5.01775598526001,
"learning_rate": 5.753158306344882e-06,
"loss": 0.1781,
"step": 546
},
{
"epoch": 0.676980198019802,
"grad_norm": 3.7600646018981934,
"learning_rate": 5.7140754609371255e-06,
"loss": 0.2278,
"step": 547
},
{
"epoch": 0.6782178217821783,
"grad_norm": 5.047920227050781,
"learning_rate": 5.675072649574551e-06,
"loss": 0.2191,
"step": 548
},
{
"epoch": 0.6794554455445545,
"grad_norm": 5.662668228149414,
"learning_rate": 5.636150600583747e-06,
"loss": 0.1901,
"step": 549
},
{
"epoch": 0.6806930693069307,
"grad_norm": 4.518259525299072,
"learning_rate": 5.597310040783161e-06,
"loss": 0.2264,
"step": 550
},
{
"epoch": 0.681930693069307,
"grad_norm": 4.768115043640137,
"learning_rate": 5.558551695469532e-06,
"loss": 0.2532,
"step": 551
},
{
"epoch": 0.6831683168316832,
"grad_norm": 4.239420413970947,
"learning_rate": 5.519876288404367e-06,
"loss": 0.2236,
"step": 552
},
{
"epoch": 0.6844059405940595,
"grad_norm": 4.318198204040527,
"learning_rate": 5.481284541800391e-06,
"loss": 0.2504,
"step": 553
},
{
"epoch": 0.6856435643564357,
"grad_norm": 4.004321575164795,
"learning_rate": 5.44277717630809e-06,
"loss": 0.1704,
"step": 554
},
{
"epoch": 0.6868811881188119,
"grad_norm": 6.967254161834717,
"learning_rate": 5.404354911002243e-06,
"loss": 0.2201,
"step": 555
},
{
"epoch": 0.6881188118811881,
"grad_norm": 3.623018980026245,
"learning_rate": 5.3660184633684895e-06,
"loss": 0.1477,
"step": 556
},
{
"epoch": 0.6893564356435643,
"grad_norm": 4.345696926116943,
"learning_rate": 5.3277685492899345e-06,
"loss": 0.2465,
"step": 557
},
{
"epoch": 0.6905940594059405,
"grad_norm": 4.71245813369751,
"learning_rate": 5.289605883033793e-06,
"loss": 0.1864,
"step": 558
},
{
"epoch": 0.6918316831683168,
"grad_norm": 5.07157039642334,
"learning_rate": 5.251531177238029e-06,
"loss": 0.1596,
"step": 559
},
{
"epoch": 0.693069306930693,
"grad_norm": 3.7650375366210938,
"learning_rate": 5.213545142898061e-06,
"loss": 0.1626,
"step": 560
},
{
"epoch": 0.693069306930693,
"eval_accuracy": 0.8625277161862528,
"eval_f1": 0.7129629629629629,
"eval_loss": 0.2982478439807892,
"eval_precision": 0.8603351955307262,
"eval_recall": 0.6086956521739131,
"eval_runtime": 48.9768,
"eval_samples_per_second": 5.635,
"eval_steps_per_second": 0.184,
"step": 560
},
{
"epoch": 0.6943069306930693,
"grad_norm": 5.199242115020752,
"learning_rate": 5.175648489353493e-06,
"loss": 0.1277,
"step": 561
},
{
"epoch": 0.6955445544554455,
"grad_norm": 4.108044624328613,
"learning_rate": 5.137841924274851e-06,
"loss": 0.2117,
"step": 562
},
{
"epoch": 0.6967821782178217,
"grad_norm": 5.149396896362305,
"learning_rate": 5.100126153650379e-06,
"loss": 0.1769,
"step": 563
},
{
"epoch": 0.698019801980198,
"grad_norm": 3.721707820892334,
"learning_rate": 5.0625018817728496e-06,
"loss": 0.1764,
"step": 564
},
{
"epoch": 0.6992574257425742,
"grad_norm": 5.771122932434082,
"learning_rate": 5.024969811226419e-06,
"loss": 0.2841,
"step": 565
},
{
"epoch": 0.7004950495049505,
"grad_norm": 6.165885925292969,
"learning_rate": 4.98753064287351e-06,
"loss": 0.2048,
"step": 566
},
{
"epoch": 0.7017326732673267,
"grad_norm": 3.664384126663208,
"learning_rate": 4.950185075841706e-06,
"loss": 0.14,
"step": 567
},
{
"epoch": 0.7029702970297029,
"grad_norm": 6.110241889953613,
"learning_rate": 4.912933807510714e-06,
"loss": 0.2553,
"step": 568
},
{
"epoch": 0.7042079207920792,
"grad_norm": 4.46115255355835,
"learning_rate": 4.875777533499339e-06,
"loss": 0.1871,
"step": 569
},
{
"epoch": 0.7054455445544554,
"grad_norm": 5.189129829406738,
"learning_rate": 4.838716947652485e-06,
"loss": 0.1922,
"step": 570
},
{
"epoch": 0.7066831683168316,
"grad_norm": 3.854255437850952,
"learning_rate": 4.801752742028214e-06,
"loss": 0.1823,
"step": 571
},
{
"epoch": 0.7079207920792079,
"grad_norm": 4.2072319984436035,
"learning_rate": 4.7648856068848e-06,
"loss": 0.1776,
"step": 572
},
{
"epoch": 0.7091584158415841,
"grad_norm": 3.298652172088623,
"learning_rate": 4.728116230667859e-06,
"loss": 0.2089,
"step": 573
},
{
"epoch": 0.7103960396039604,
"grad_norm": 4.39929723739624,
"learning_rate": 4.691445299997491e-06,
"loss": 0.19,
"step": 574
},
{
"epoch": 0.7116336633663366,
"grad_norm": 4.1644110679626465,
"learning_rate": 4.654873499655449e-06,
"loss": 0.1932,
"step": 575
},
{
"epoch": 0.7128712871287128,
"grad_norm": 6.846812725067139,
"learning_rate": 4.618401512572351e-06,
"loss": 0.2762,
"step": 576
},
{
"epoch": 0.7141089108910891,
"grad_norm": 6.345206260681152,
"learning_rate": 4.582030019814948e-06,
"loss": 0.249,
"step": 577
},
{
"epoch": 0.7153465346534653,
"grad_norm": 4.541729927062988,
"learning_rate": 4.5457597005733774e-06,
"loss": 0.199,
"step": 578
},
{
"epoch": 0.7165841584158416,
"grad_norm": 5.228466510772705,
"learning_rate": 4.5095912321484946e-06,
"loss": 0.1622,
"step": 579
},
{
"epoch": 0.7178217821782178,
"grad_norm": 4.646934509277344,
"learning_rate": 4.4735252899392335e-06,
"loss": 0.2065,
"step": 580
},
{
"epoch": 0.7178217821782178,
"eval_accuracy": 0.8636363636363636,
"eval_f1": 0.7308533916849015,
"eval_loss": 0.28770458698272705,
"eval_precision": 0.8186274509803921,
"eval_recall": 0.6600790513833992,
"eval_runtime": 48.9708,
"eval_samples_per_second": 5.636,
"eval_steps_per_second": 0.184,
"step": 580
},
{
"epoch": 0.719059405940594,
"grad_norm": 5.637617588043213,
"learning_rate": 4.437562547429971e-06,
"loss": 0.2249,
"step": 581
},
{
"epoch": 0.7202970297029703,
"grad_norm": 3.387362480163574,
"learning_rate": 4.4017036761779785e-06,
"loss": 0.1965,
"step": 582
},
{
"epoch": 0.7215346534653465,
"grad_norm": 4.963772773742676,
"learning_rate": 4.365949345800856e-06,
"loss": 0.1589,
"step": 583
},
{
"epoch": 0.7227722772277227,
"grad_norm": 8.057557106018066,
"learning_rate": 4.3303002239640424e-06,
"loss": 0.2567,
"step": 584
},
{
"epoch": 0.724009900990099,
"grad_norm": 4.3113789558410645,
"learning_rate": 4.294756976368351e-06,
"loss": 0.2492,
"step": 585
},
{
"epoch": 0.7252475247524752,
"grad_norm": 6.69741153717041,
"learning_rate": 4.259320266737522e-06,
"loss": 0.2378,
"step": 586
},
{
"epoch": 0.7264851485148515,
"grad_norm": 5.994034290313721,
"learning_rate": 4.223990756805841e-06,
"loss": 0.2038,
"step": 587
},
{
"epoch": 0.7277227722772277,
"grad_norm": 6.540597915649414,
"learning_rate": 4.1887691063057865e-06,
"loss": 0.1929,
"step": 588
},
{
"epoch": 0.7289603960396039,
"grad_norm": 4.549102783203125,
"learning_rate": 4.153655972955695e-06,
"loss": 0.2153,
"step": 589
},
{
"epoch": 0.7301980198019802,
"grad_norm": 5.070977210998535,
"learning_rate": 4.118652012447486e-06,
"loss": 0.1908,
"step": 590
},
{
"epoch": 0.7314356435643564,
"grad_norm": 3.0591437816619873,
"learning_rate": 4.0837578784344225e-06,
"loss": 0.1806,
"step": 591
},
{
"epoch": 0.7326732673267327,
"grad_norm": 3.303514003753662,
"learning_rate": 4.048974222518905e-06,
"loss": 0.1859,
"step": 592
},
{
"epoch": 0.7339108910891089,
"grad_norm": 3.909907817840576,
"learning_rate": 4.01430169424029e-06,
"loss": 0.2238,
"step": 593
},
{
"epoch": 0.7351485148514851,
"grad_norm": 5.40861701965332,
"learning_rate": 3.97974094106278e-06,
"loss": 0.1768,
"step": 594
},
{
"epoch": 0.7363861386138614,
"grad_norm": 4.427615165710449,
"learning_rate": 3.945292608363312e-06,
"loss": 0.2324,
"step": 595
},
{
"epoch": 0.7376237623762376,
"grad_norm": 3.793356418609619,
"learning_rate": 3.9109573394195336e-06,
"loss": 0.1758,
"step": 596
},
{
"epoch": 0.7388613861386139,
"grad_norm": 3.278257369995117,
"learning_rate": 3.876735775397759e-06,
"loss": 0.1133,
"step": 597
},
{
"epoch": 0.7400990099009901,
"grad_norm": 3.4571950435638428,
"learning_rate": 3.842628555341018e-06,
"loss": 0.1381,
"step": 598
},
{
"epoch": 0.7413366336633663,
"grad_norm": 7.060393810272217,
"learning_rate": 3.8086363161571194e-06,
"loss": 0.2736,
"step": 599
},
{
"epoch": 0.7425742574257426,
"grad_norm": 3.590026617050171,
"learning_rate": 3.7747596926067485e-06,
"loss": 0.1423,
"step": 600
},
{
"epoch": 0.7425742574257426,
"eval_accuracy": 0.8603104212860311,
"eval_f1": 0.7014218009478673,
"eval_loss": 0.30313166975975037,
"eval_precision": 0.8757396449704142,
"eval_recall": 0.5849802371541502,
"eval_runtime": 50.2279,
"eval_samples_per_second": 5.495,
"eval_steps_per_second": 0.179,
"step": 600
},
{
"epoch": 0.7438118811881188,
"grad_norm": 4.045815944671631,
"learning_rate": 3.740999317291618e-06,
"loss": 0.2221,
"step": 601
},
{
"epoch": 0.745049504950495,
"grad_norm": 5.232572078704834,
"learning_rate": 3.7073558206426673e-06,
"loss": 0.2071,
"step": 602
},
{
"epoch": 0.7462871287128713,
"grad_norm": 4.61077356338501,
"learning_rate": 3.6738298309082665e-06,
"loss": 0.1197,
"step": 603
},
{
"epoch": 0.7475247524752475,
"grad_norm": 4.389346122741699,
"learning_rate": 3.6404219741425084e-06,
"loss": 0.153,
"step": 604
},
{
"epoch": 0.7487623762376238,
"grad_norm": 6.2665791511535645,
"learning_rate": 3.6071328741934985e-06,
"loss": 0.2109,
"step": 605
},
{
"epoch": 0.75,
"grad_norm": 2.8577771186828613,
"learning_rate": 3.5739631526917152e-06,
"loss": 0.1445,
"step": 606
},
{
"epoch": 0.7512376237623762,
"grad_norm": 5.7536187171936035,
"learning_rate": 3.540913429038407e-06,
"loss": 0.1783,
"step": 607
},
{
"epoch": 0.7524752475247525,
"grad_norm": 5.14769172668457,
"learning_rate": 3.507984320394012e-06,
"loss": 0.2198,
"step": 608
},
{
"epoch": 0.7537128712871287,
"grad_norm": 6.156418323516846,
"learning_rate": 3.47517644166664e-06,
"loss": 0.1891,
"step": 609
},
{
"epoch": 0.754950495049505,
"grad_norm": 3.9831583499908447,
"learning_rate": 3.442490405500598e-06,
"loss": 0.2457,
"step": 610
},
{
"epoch": 0.7561881188118812,
"grad_norm": 5.4462056159973145,
"learning_rate": 3.4099268222649373e-06,
"loss": 0.2008,
"step": 611
},
{
"epoch": 0.7574257425742574,
"grad_norm": 3.8568873405456543,
"learning_rate": 3.3774863000420545e-06,
"loss": 0.1695,
"step": 612
},
{
"epoch": 0.7586633663366337,
"grad_norm": 5.273173809051514,
"learning_rate": 3.3451694446163553e-06,
"loss": 0.2127,
"step": 613
},
{
"epoch": 0.7599009900990099,
"grad_norm": 5.92859411239624,
"learning_rate": 3.3129768594629186e-06,
"loss": 0.1385,
"step": 614
},
{
"epoch": 0.7611386138613861,
"grad_norm": 5.471877574920654,
"learning_rate": 3.2809091457362464e-06,
"loss": 0.2105,
"step": 615
},
{
"epoch": 0.7623762376237624,
"grad_norm": 3.4092907905578613,
"learning_rate": 3.248966902259024e-06,
"loss": 0.156,
"step": 616
},
{
"epoch": 0.7636138613861386,
"grad_norm": 4.115101337432861,
"learning_rate": 3.2171507255109457e-06,
"loss": 0.2304,
"step": 617
},
{
"epoch": 0.7648514851485149,
"grad_norm": 4.784317493438721,
"learning_rate": 3.185461209617571e-06,
"loss": 0.2025,
"step": 618
},
{
"epoch": 0.7660891089108911,
"grad_norm": 4.379182815551758,
"learning_rate": 3.153898946339241e-06,
"loss": 0.1843,
"step": 619
},
{
"epoch": 0.7673267326732673,
"grad_norm": 4.599033355712891,
"learning_rate": 3.122464525060013e-06,
"loss": 0.1743,
"step": 620
},
{
"epoch": 0.7673267326732673,
"eval_accuracy": 0.8658536585365854,
"eval_f1": 0.7256235827664399,
"eval_loss": 0.29202917218208313,
"eval_precision": 0.851063829787234,
"eval_recall": 0.6324110671936759,
"eval_runtime": 47.9364,
"eval_samples_per_second": 5.758,
"eval_steps_per_second": 0.188,
"step": 620
},
{
"epoch": 0.7685643564356436,
"grad_norm": 4.76499605178833,
"learning_rate": 3.0911585327766658e-06,
"loss": 0.2115,
"step": 621
},
{
"epoch": 0.7698019801980198,
"grad_norm": 3.3205549716949463,
"learning_rate": 3.059981554087732e-06,
"loss": 0.1956,
"step": 622
},
{
"epoch": 0.7710396039603961,
"grad_norm": 5.28842306137085,
"learning_rate": 3.0289341711825817e-06,
"loss": 0.2971,
"step": 623
},
{
"epoch": 0.7722772277227723,
"grad_norm": 5.2406182289123535,
"learning_rate": 2.998016963830562e-06,
"loss": 0.188,
"step": 624
},
{
"epoch": 0.7735148514851485,
"grad_norm": 5.418379783630371,
"learning_rate": 2.96723050937015e-06,
"loss": 0.1859,
"step": 625
},
{
"epoch": 0.7747524752475248,
"grad_norm": 3.3900177478790283,
"learning_rate": 2.9365753826981947e-06,
"loss": 0.1113,
"step": 626
},
{
"epoch": 0.775990099009901,
"grad_norm": 7.410489082336426,
"learning_rate": 2.9060521562591625e-06,
"loss": 0.2426,
"step": 627
},
{
"epoch": 0.7772277227722773,
"grad_norm": 4.490034580230713,
"learning_rate": 2.875661400034452e-06,
"loss": 0.1891,
"step": 628
},
{
"epoch": 0.7784653465346535,
"grad_norm": 3.7824716567993164,
"learning_rate": 2.8454036815317643e-06,
"loss": 0.1712,
"step": 629
},
{
"epoch": 0.7797029702970297,
"grad_norm": 6.608094215393066,
"learning_rate": 2.8152795657744882e-06,
"loss": 0.1994,
"step": 630
},
{
"epoch": 0.780940594059406,
"grad_norm": 4.14816951751709,
"learning_rate": 2.78528961529115e-06,
"loss": 0.2238,
"step": 631
},
{
"epoch": 0.7821782178217822,
"grad_norm": 4.3334760665893555,
"learning_rate": 2.7554343901049295e-06,
"loss": 0.256,
"step": 632
},
{
"epoch": 0.7834158415841584,
"grad_norm": 4.515098571777344,
"learning_rate": 2.7257144477231756e-06,
"loss": 0.2338,
"step": 633
},
{
"epoch": 0.7846534653465347,
"grad_norm": 4.144233703613281,
"learning_rate": 2.696130343127007e-06,
"loss": 0.1322,
"step": 634
},
{
"epoch": 0.7858910891089109,
"grad_norm": 3.5474421977996826,
"learning_rate": 2.666682628760958e-06,
"loss": 0.1656,
"step": 635
},
{
"epoch": 0.7871287128712872,
"grad_norm": 4.109697341918945,
"learning_rate": 2.6373718545226444e-06,
"loss": 0.2305,
"step": 636
},
{
"epoch": 0.7883663366336634,
"grad_norm": 6.762195587158203,
"learning_rate": 2.6081985677525124e-06,
"loss": 0.2165,
"step": 637
},
{
"epoch": 0.7896039603960396,
"grad_norm": 4.237494945526123,
"learning_rate": 2.5791633132236027e-06,
"loss": 0.2376,
"step": 638
},
{
"epoch": 0.7908415841584159,
"grad_norm": 4.219669818878174,
"learning_rate": 2.550266633131382e-06,
"loss": 0.1932,
"step": 639
},
{
"epoch": 0.7920792079207921,
"grad_norm": 3.1918282508850098,
"learning_rate": 2.521509067083631e-06,
"loss": 0.1281,
"step": 640
},
{
"epoch": 0.7920792079207921,
"eval_accuracy": 0.8658536585365854,
"eval_f1": 0.7268623024830699,
"eval_loss": 0.29117584228515625,
"eval_precision": 0.8473684210526315,
"eval_recall": 0.6363636363636364,
"eval_runtime": 48.65,
"eval_samples_per_second": 5.673,
"eval_steps_per_second": 0.185,
"step": 640
},
{
"epoch": 0.7933168316831684,
"grad_norm": 3.6139471530914307,
"learning_rate": 2.4928911520903466e-06,
"loss": 0.1664,
"step": 641
},
{
"epoch": 0.7945544554455446,
"grad_norm": 3.5790998935699463,
"learning_rate": 2.4644134225537265e-06,
"loss": 0.1926,
"step": 642
},
{
"epoch": 0.7957920792079208,
"grad_norm": 6.101653575897217,
"learning_rate": 2.4360764102581947e-06,
"loss": 0.1896,
"step": 643
},
{
"epoch": 0.7970297029702971,
"grad_norm": 4.154812335968018,
"learning_rate": 2.4078806443604595e-06,
"loss": 0.2341,
"step": 644
},
{
"epoch": 0.7982673267326733,
"grad_norm": 4.5489630699157715,
"learning_rate": 2.379826651379632e-06,
"loss": 0.2022,
"step": 645
},
{
"epoch": 0.7995049504950495,
"grad_norm": 4.680227756500244,
"learning_rate": 2.351914955187412e-06,
"loss": 0.2343,
"step": 646
},
{
"epoch": 0.8007425742574258,
"grad_norm": 4.6964898109436035,
"learning_rate": 2.3241460769982814e-06,
"loss": 0.2174,
"step": 647
},
{
"epoch": 0.801980198019802,
"grad_norm": 5.496919631958008,
"learning_rate": 2.2965205353597866e-06,
"loss": 0.2319,
"step": 648
},
{
"epoch": 0.8032178217821783,
"grad_norm": 4.648072242736816,
"learning_rate": 2.269038846142847e-06,
"loss": 0.1666,
"step": 649
},
{
"epoch": 0.8044554455445545,
"grad_norm": 3.861116409301758,
"learning_rate": 2.241701522532136e-06,
"loss": 0.1696,
"step": 650
},
{
"epoch": 0.8056930693069307,
"grad_norm": 3.7325706481933594,
"learning_rate": 2.214509075016478e-06,
"loss": 0.168,
"step": 651
},
{
"epoch": 0.806930693069307,
"grad_norm": 3.7055068016052246,
"learning_rate": 2.1874620113793286e-06,
"loss": 0.2214,
"step": 652
},
{
"epoch": 0.8081683168316832,
"grad_norm": 3.3289315700531006,
"learning_rate": 2.160560836689286e-06,
"loss": 0.1643,
"step": 653
},
{
"epoch": 0.8094059405940595,
"grad_norm": 3.24979567527771,
"learning_rate": 2.1338060532906734e-06,
"loss": 0.1478,
"step": 654
},
{
"epoch": 0.8106435643564357,
"grad_norm": 6.499382972717285,
"learning_rate": 2.107198160794136e-06,
"loss": 0.1785,
"step": 655
},
{
"epoch": 0.8118811881188119,
"grad_norm": 5.324498176574707,
"learning_rate": 2.080737656067325e-06,
"loss": 0.2121,
"step": 656
},
{
"epoch": 0.8131188118811881,
"grad_norm": 3.2809088230133057,
"learning_rate": 2.054425033225628e-06,
"loss": 0.1683,
"step": 657
},
{
"epoch": 0.8143564356435643,
"grad_norm": 4.163755416870117,
"learning_rate": 2.028260783622914e-06,
"loss": 0.1624,
"step": 658
},
{
"epoch": 0.8155940594059405,
"grad_norm": 5.601637363433838,
"learning_rate": 2.002245395842394e-06,
"loss": 0.2025,
"step": 659
},
{
"epoch": 0.8168316831683168,
"grad_norm": 6.822818756103516,
"learning_rate": 1.9763793556874655e-06,
"loss": 0.1879,
"step": 660
},
{
"epoch": 0.8168316831683168,
"eval_accuracy": 0.8625277161862528,
"eval_f1": 0.7181818181818181,
"eval_loss": 0.293789803981781,
"eval_precision": 0.8449197860962567,
"eval_recall": 0.6245059288537549,
"eval_runtime": 48.9901,
"eval_samples_per_second": 5.634,
"eval_steps_per_second": 0.184,
"step": 660
},
{
"epoch": 0.818069306930693,
"grad_norm": 5.174654483795166,
"learning_rate": 1.950663146172657e-06,
"loss": 0.2161,
"step": 661
},
{
"epoch": 0.8193069306930693,
"grad_norm": 7.8807692527771,
"learning_rate": 1.9250972475146092e-06,
"loss": 0.2391,
"step": 662
},
{
"epoch": 0.8205445544554455,
"grad_norm": 4.177780628204346,
"learning_rate": 1.8996821371231022e-06,
"loss": 0.142,
"step": 663
},
{
"epoch": 0.8217821782178217,
"grad_norm": 4.918459892272949,
"learning_rate": 1.874418289592137e-06,
"loss": 0.2504,
"step": 664
},
{
"epoch": 0.823019801980198,
"grad_norm": 7.126182556152344,
"learning_rate": 1.849306176691088e-06,
"loss": 0.1987,
"step": 665
},
{
"epoch": 0.8242574257425742,
"grad_norm": 5.940370082855225,
"learning_rate": 1.8243462673558755e-06,
"loss": 0.2805,
"step": 666
},
{
"epoch": 0.8254950495049505,
"grad_norm": 3.46482515335083,
"learning_rate": 1.799539027680216e-06,
"loss": 0.1328,
"step": 667
},
{
"epoch": 0.8267326732673267,
"grad_norm": 8.600014686584473,
"learning_rate": 1.7748849209069286e-06,
"loss": 0.2778,
"step": 668
},
{
"epoch": 0.8279702970297029,
"grad_norm": 3.1640994548797607,
"learning_rate": 1.7503844074192655e-06,
"loss": 0.1474,
"step": 669
},
{
"epoch": 0.8292079207920792,
"grad_norm": 5.749293327331543,
"learning_rate": 1.7260379447323327e-06,
"loss": 0.2154,
"step": 670
},
{
"epoch": 0.8304455445544554,
"grad_norm": 6.068516731262207,
"learning_rate": 1.7018459874845327e-06,
"loss": 0.226,
"step": 671
},
{
"epoch": 0.8316831683168316,
"grad_norm": 8.348623275756836,
"learning_rate": 1.6778089874290793e-06,
"loss": 0.29,
"step": 672
},
{
"epoch": 0.8329207920792079,
"grad_norm": 6.485348701477051,
"learning_rate": 1.6539273934255728e-06,
"loss": 0.2041,
"step": 673
},
{
"epoch": 0.8341584158415841,
"grad_norm": 4.267493724822998,
"learning_rate": 1.6302016514316e-06,
"loss": 0.1943,
"step": 674
},
{
"epoch": 0.8353960396039604,
"grad_norm": 3.1872661113739014,
"learning_rate": 1.6066322044944126e-06,
"loss": 0.1807,
"step": 675
},
{
"epoch": 0.8366336633663366,
"grad_norm": 5.531446933746338,
"learning_rate": 1.583219492742667e-06,
"loss": 0.244,
"step": 676
},
{
"epoch": 0.8378712871287128,
"grad_norm": 4.929156303405762,
"learning_rate": 1.5599639533781853e-06,
"loss": 0.1353,
"step": 677
},
{
"epoch": 0.8391089108910891,
"grad_norm": 4.086613655090332,
"learning_rate": 1.5368660206678031e-06,
"loss": 0.2356,
"step": 678
},
{
"epoch": 0.8403465346534653,
"grad_norm": 4.462868690490723,
"learning_rate": 1.5139261259352589e-06,
"loss": 0.1682,
"step": 679
},
{
"epoch": 0.8415841584158416,
"grad_norm": 3.685532808303833,
"learning_rate": 1.4911446975531329e-06,
"loss": 0.1741,
"step": 680
},
{
"epoch": 0.8415841584158416,
"eval_accuracy": 0.8625277161862528,
"eval_f1": 0.7168949771689498,
"eval_loss": 0.2965390384197235,
"eval_precision": 0.8486486486486486,
"eval_recall": 0.6205533596837944,
"eval_runtime": 49.6397,
"eval_samples_per_second": 5.56,
"eval_steps_per_second": 0.181,
"step": 680
},
{
"epoch": 0.8428217821782178,
"grad_norm": 3.4863808155059814,
"learning_rate": 1.4685221609348632e-06,
"loss": 0.2203,
"step": 681
},
{
"epoch": 0.844059405940594,
"grad_norm": 3.9294660091400146,
"learning_rate": 1.4460589385267843e-06,
"loss": 0.1609,
"step": 682
},
{
"epoch": 0.8452970297029703,
"grad_norm": 3.837347984313965,
"learning_rate": 1.4237554498002425e-06,
"loss": 0.1866,
"step": 683
},
{
"epoch": 0.8465346534653465,
"grad_norm": 5.066103458404541,
"learning_rate": 1.4016121112437787e-06,
"loss": 0.2101,
"step": 684
},
{
"epoch": 0.8477722772277227,
"grad_norm": 4.614877700805664,
"learning_rate": 1.3796293363553259e-06,
"loss": 0.2043,
"step": 685
},
{
"epoch": 0.849009900990099,
"grad_norm": 3.7771222591400146,
"learning_rate": 1.3578075356345044e-06,
"loss": 0.1599,
"step": 686
},
{
"epoch": 0.8502475247524752,
"grad_norm": 2.7725830078125,
"learning_rate": 1.3361471165749563e-06,
"loss": 0.1179,
"step": 687
},
{
"epoch": 0.8514851485148515,
"grad_norm": 4.316575527191162,
"learning_rate": 1.3146484836567263e-06,
"loss": 0.1996,
"step": 688
},
{
"epoch": 0.8527227722772277,
"grad_norm": 3.446519374847412,
"learning_rate": 1.2933120383387132e-06,
"loss": 0.2208,
"step": 689
},
{
"epoch": 0.8539603960396039,
"grad_norm": 4.64270544052124,
"learning_rate": 1.2721381790511832e-06,
"loss": 0.1745,
"step": 690
},
{
"epoch": 0.8551980198019802,
"grad_norm": 5.920294761657715,
"learning_rate": 1.2511273011883096e-06,
"loss": 0.2257,
"step": 691
},
{
"epoch": 0.8564356435643564,
"grad_norm": 5.84797477722168,
"learning_rate": 1.2302797971008085e-06,
"loss": 0.1741,
"step": 692
},
{
"epoch": 0.8576732673267327,
"grad_norm": 4.7784223556518555,
"learning_rate": 1.2095960560886e-06,
"loss": 0.1844,
"step": 693
},
{
"epoch": 0.8589108910891089,
"grad_norm": 4.6134161949157715,
"learning_rate": 1.1890764643935393e-06,
"loss": 0.26,
"step": 694
},
{
"epoch": 0.8601485148514851,
"grad_norm": 3.6760635375976562,
"learning_rate": 1.168721405192218e-06,
"loss": 0.1946,
"step": 695
},
{
"epoch": 0.8613861386138614,
"grad_norm": 5.395166873931885,
"learning_rate": 1.1485312585887887e-06,
"loss": 0.3086,
"step": 696
},
{
"epoch": 0.8626237623762376,
"grad_norm": 3.1336684226989746,
"learning_rate": 1.1285064016078784e-06,
"loss": 0.1662,
"step": 697
},
{
"epoch": 0.8638613861386139,
"grad_norm": 5.393222332000732,
"learning_rate": 1.1086472081875488e-06,
"loss": 0.2479,
"step": 698
},
{
"epoch": 0.8650990099009901,
"grad_norm": 5.272143363952637,
"learning_rate": 1.0889540491723106e-06,
"loss": 0.1916,
"step": 699
},
{
"epoch": 0.8663366336633663,
"grad_norm": 5.7492547035217285,
"learning_rate": 1.0694272923061933e-06,
"loss": 0.1429,
"step": 700
},
{
"epoch": 0.8663366336633663,
"eval_accuracy": 0.8647450110864745,
"eval_f1": 0.7276785714285714,
"eval_loss": 0.2911153733730316,
"eval_precision": 0.8358974358974359,
"eval_recall": 0.6442687747035574,
"eval_runtime": 48.2583,
"eval_samples_per_second": 5.719,
"eval_steps_per_second": 0.186,
"step": 700
},
{
"epoch": 0.8675742574257426,
"grad_norm": 4.760046005249023,
"learning_rate": 1.0500673022258923e-06,
"loss": 0.1807,
"step": 701
},
{
"epoch": 0.8688118811881188,
"grad_norm": 3.5298335552215576,
"learning_rate": 1.030874440453944e-06,
"loss": 0.1995,
"step": 702
},
{
"epoch": 0.870049504950495,
"grad_norm": 5.2245049476623535,
"learning_rate": 1.0118490653919855e-06,
"loss": 0.2404,
"step": 703
},
{
"epoch": 0.8712871287128713,
"grad_norm": 3.6648762226104736,
"learning_rate": 9.92991532314057e-07,
"loss": 0.1154,
"step": 704
},
{
"epoch": 0.8725247524752475,
"grad_norm": 6.1605424880981445,
"learning_rate": 9.743021933599695e-07,
"loss": 0.2483,
"step": 705
},
{
"epoch": 0.8737623762376238,
"grad_norm": 4.078556060791016,
"learning_rate": 9.557813975287266e-07,
"loss": 0.2285,
"step": 706
},
{
"epoch": 0.875,
"grad_norm": 4.299773216247559,
"learning_rate": 9.374294906720083e-07,
"loss": 0.1815,
"step": 707
},
{
"epoch": 0.8762376237623762,
"grad_norm": 4.190319061279297,
"learning_rate": 9.192468154877187e-07,
"loss": 0.2049,
"step": 708
},
{
"epoch": 0.8774752475247525,
"grad_norm": 6.507383823394775,
"learning_rate": 9.012337115135772e-07,
"loss": 0.2033,
"step": 709
},
{
"epoch": 0.8787128712871287,
"grad_norm": 4.297817707061768,
"learning_rate": 8.833905151207833e-07,
"loss": 0.1332,
"step": 710
},
{
"epoch": 0.879950495049505,
"grad_norm": 4.102233409881592,
"learning_rate": 8.657175595077317e-07,
"loss": 0.1851,
"step": 711
},
{
"epoch": 0.8811881188118812,
"grad_norm": 7.134310722351074,
"learning_rate": 8.482151746937983e-07,
"loss": 0.1696,
"step": 712
},
{
"epoch": 0.8824257425742574,
"grad_norm": 4.428358554840088,
"learning_rate": 8.308836875131665e-07,
"loss": 0.2061,
"step": 713
},
{
"epoch": 0.8836633663366337,
"grad_norm": 3.750676393508911,
"learning_rate": 8.137234216087353e-07,
"loss": 0.1409,
"step": 714
},
{
"epoch": 0.8849009900990099,
"grad_norm": 4.440266132354736,
"learning_rate": 7.967346974260626e-07,
"loss": 0.1741,
"step": 715
},
{
"epoch": 0.8861386138613861,
"grad_norm": 4.9036736488342285,
"learning_rate": 7.799178322073941e-07,
"loss": 0.1559,
"step": 716
},
{
"epoch": 0.8873762376237624,
"grad_norm": 4.0620856285095215,
"learning_rate": 7.632731399857329e-07,
"loss": 0.1934,
"step": 717
},
{
"epoch": 0.8886138613861386,
"grad_norm": 4.675508975982666,
"learning_rate": 7.468009315789748e-07,
"loss": 0.176,
"step": 718
},
{
"epoch": 0.8898514851485149,
"grad_norm": 4.654139041900635,
"learning_rate": 7.305015145841055e-07,
"loss": 0.1992,
"step": 719
},
{
"epoch": 0.8910891089108911,
"grad_norm": 5.478711128234863,
"learning_rate": 7.143751933714583e-07,
"loss": 0.2218,
"step": 720
},
{
"epoch": 0.8910891089108911,
"eval_accuracy": 0.8625277161862528,
"eval_f1": 0.7181818181818181,
"eval_loss": 0.29504844546318054,
"eval_precision": 0.8449197860962567,
"eval_recall": 0.6245059288537549,
"eval_runtime": 48.4336,
"eval_samples_per_second": 5.699,
"eval_steps_per_second": 0.186,
"step": 720
},
{
"epoch": 0.8923267326732673,
"grad_norm": 4.685728073120117,
"learning_rate": 6.984222690790277e-07,
"loss": 0.1942,
"step": 721
},
{
"epoch": 0.8935643564356436,
"grad_norm": 4.345710754394531,
"learning_rate": 6.826430396068451e-07,
"loss": 0.1774,
"step": 722
},
{
"epoch": 0.8948019801980198,
"grad_norm": 4.9782328605651855,
"learning_rate": 6.67037799611423e-07,
"loss": 0.1819,
"step": 723
},
{
"epoch": 0.8960396039603961,
"grad_norm": 6.226649284362793,
"learning_rate": 6.516068405002441e-07,
"loss": 0.1995,
"step": 724
},
{
"epoch": 0.8972772277227723,
"grad_norm": 3.8088414669036865,
"learning_rate": 6.363504504263207e-07,
"loss": 0.1909,
"step": 725
},
{
"epoch": 0.8985148514851485,
"grad_norm": 3.8853421211242676,
"learning_rate": 6.212689142828232e-07,
"loss": 0.1821,
"step": 726
},
{
"epoch": 0.8997524752475248,
"grad_norm": 3.680631160736084,
"learning_rate": 6.063625136977447e-07,
"loss": 0.1419,
"step": 727
},
{
"epoch": 0.900990099009901,
"grad_norm": 4.0216217041015625,
"learning_rate": 5.916315270286588e-07,
"loss": 0.1715,
"step": 728
},
{
"epoch": 0.9022277227722773,
"grad_norm": 5.727372646331787,
"learning_rate": 5.770762293575083e-07,
"loss": 0.3295,
"step": 729
},
{
"epoch": 0.9034653465346535,
"grad_norm": 4.530044078826904,
"learning_rate": 5.626968924854714e-07,
"loss": 0.1704,
"step": 730
},
{
"epoch": 0.9047029702970297,
"grad_norm": 7.712387561798096,
"learning_rate": 5.484937849278937e-07,
"loss": 0.205,
"step": 731
},
{
"epoch": 0.905940594059406,
"grad_norm": 5.130886554718018,
"learning_rate": 5.344671719092664e-07,
"loss": 0.1432,
"step": 732
},
{
"epoch": 0.9071782178217822,
"grad_norm": 4.739104270935059,
"learning_rate": 5.206173153582705e-07,
"loss": 0.2583,
"step": 733
},
{
"epoch": 0.9084158415841584,
"grad_norm": 3.7045702934265137,
"learning_rate": 5.069444739029006e-07,
"loss": 0.1193,
"step": 734
},
{
"epoch": 0.9096534653465347,
"grad_norm": 3.3528637886047363,
"learning_rate": 4.934489028656164e-07,
"loss": 0.1153,
"step": 735
},
{
"epoch": 0.9108910891089109,
"grad_norm": 3.305149793624878,
"learning_rate": 4.801308542585892e-07,
"loss": 0.144,
"step": 736
},
{
"epoch": 0.9121287128712872,
"grad_norm": 4.803959369659424,
"learning_rate": 4.669905767789884e-07,
"loss": 0.1267,
"step": 737
},
{
"epoch": 0.9133663366336634,
"grad_norm": 5.374501705169678,
"learning_rate": 4.54028315804339e-07,
"loss": 0.1991,
"step": 738
},
{
"epoch": 0.9146039603960396,
"grad_norm": 5.532665729522705,
"learning_rate": 4.4124431338794403e-07,
"loss": 0.2528,
"step": 739
},
{
"epoch": 0.9158415841584159,
"grad_norm": 3.2962238788604736,
"learning_rate": 4.2863880825435687e-07,
"loss": 0.1608,
"step": 740
},
{
"epoch": 0.9158415841584159,
"eval_accuracy": 0.8603104212860311,
"eval_f1": 0.7096774193548387,
"eval_loss": 0.2995334267616272,
"eval_precision": 0.850828729281768,
"eval_recall": 0.6086956521739131,
"eval_runtime": 50.016,
"eval_samples_per_second": 5.518,
"eval_steps_per_second": 0.18,
"step": 740
},
{
"epoch": 0.9170792079207921,
"grad_norm": 3.7142422199249268,
"learning_rate": 4.162120357949284e-07,
"loss": 0.1256,
"step": 741
},
{
"epoch": 0.9183168316831684,
"grad_norm": 6.413259029388428,
"learning_rate": 4.039642280634104e-07,
"loss": 0.2582,
"step": 742
},
{
"epoch": 0.9195544554455446,
"grad_norm": 8.874350547790527,
"learning_rate": 3.9189561377162343e-07,
"loss": 0.3466,
"step": 743
},
{
"epoch": 0.9207920792079208,
"grad_norm": 3.888648271560669,
"learning_rate": 3.800064182851804e-07,
"loss": 0.1399,
"step": 744
},
{
"epoch": 0.9220297029702971,
"grad_norm": 3.599470615386963,
"learning_rate": 3.682968636192863e-07,
"loss": 0.1305,
"step": 745
},
{
"epoch": 0.9232673267326733,
"grad_norm": 5.102394104003906,
"learning_rate": 3.567671684345875e-07,
"loss": 0.2276,
"step": 746
},
{
"epoch": 0.9245049504950495,
"grad_norm": 3.962132453918457,
"learning_rate": 3.4541754803308567e-07,
"loss": 0.1582,
"step": 747
},
{
"epoch": 0.9257425742574258,
"grad_norm": 3.0672736167907715,
"learning_rate": 3.342482143541281e-07,
"loss": 0.1273,
"step": 748
},
{
"epoch": 0.926980198019802,
"grad_norm": 3.4861226081848145,
"learning_rate": 3.23259375970435e-07,
"loss": 0.1595,
"step": 749
},
{
"epoch": 0.9282178217821783,
"grad_norm": 4.98872184753418,
"learning_rate": 3.124512380842204e-07,
"loss": 0.1265,
"step": 750
},
{
"epoch": 0.9294554455445545,
"grad_norm": 3.516669988632202,
"learning_rate": 3.0182400252334656e-07,
"loss": 0.1916,
"step": 751
},
{
"epoch": 0.9306930693069307,
"grad_norm": 3.6074628829956055,
"learning_rate": 2.9137786773756114e-07,
"loss": 0.1395,
"step": 752
},
{
"epoch": 0.931930693069307,
"grad_norm": 4.940650463104248,
"learning_rate": 2.811130287947972e-07,
"loss": 0.2394,
"step": 753
},
{
"epoch": 0.9331683168316832,
"grad_norm": 6.3607072830200195,
"learning_rate": 2.710296773775167e-07,
"loss": 0.213,
"step": 754
},
{
"epoch": 0.9344059405940595,
"grad_norm": 4.352235317230225,
"learning_rate": 2.61128001779144e-07,
"loss": 0.1511,
"step": 755
},
{
"epoch": 0.9356435643564357,
"grad_norm": 4.3637495040893555,
"learning_rate": 2.514081869005458e-07,
"loss": 0.224,
"step": 756
},
{
"epoch": 0.9368811881188119,
"grad_norm": 3.1667957305908203,
"learning_rate": 2.418704142465722e-07,
"loss": 0.1993,
"step": 757
},
{
"epoch": 0.9381188118811881,
"grad_norm": 4.96242094039917,
"learning_rate": 2.325148619226758e-07,
"loss": 0.1221,
"step": 758
},
{
"epoch": 0.9393564356435643,
"grad_norm": 5.428174018859863,
"learning_rate": 2.2334170463158223e-07,
"loss": 0.2239,
"step": 759
},
{
"epoch": 0.9405940594059405,
"grad_norm": 5.158010005950928,
"learning_rate": 2.1435111367002826e-07,
"loss": 0.2056,
"step": 760
},
{
"epoch": 0.9405940594059405,
"eval_accuracy": 0.8592017738359202,
"eval_f1": 0.7093821510297483,
"eval_loss": 0.296718031167984,
"eval_precision": 0.842391304347826,
"eval_recall": 0.6126482213438735,
"eval_runtime": 50.2838,
"eval_samples_per_second": 5.489,
"eval_steps_per_second": 0.179,
"step": 760
},
{
"epoch": 0.9418316831683168,
"grad_norm": 3.6793015003204346,
"learning_rate": 2.055432569255622e-07,
"loss": 0.1978,
"step": 761
},
{
"epoch": 0.943069306930693,
"grad_norm": 4.697923183441162,
"learning_rate": 1.9691829887340864e-07,
"loss": 0.2068,
"step": 762
},
{
"epoch": 0.9443069306930693,
"grad_norm": 3.3791000843048096,
"learning_rate": 1.8847640057339966e-07,
"loss": 0.1953,
"step": 763
},
{
"epoch": 0.9455445544554455,
"grad_norm": 5.8897528648376465,
"learning_rate": 1.802177196669619e-07,
"loss": 0.2229,
"step": 764
},
{
"epoch": 0.9467821782178217,
"grad_norm": 5.087184906005859,
"learning_rate": 1.7214241037418312e-07,
"loss": 0.1672,
"step": 765
},
{
"epoch": 0.948019801980198,
"grad_norm": 7.06437873840332,
"learning_rate": 1.6425062349091914e-07,
"loss": 0.2658,
"step": 766
},
{
"epoch": 0.9492574257425742,
"grad_norm": 6.635834693908691,
"learning_rate": 1.5654250638598601e-07,
"loss": 0.1855,
"step": 767
},
{
"epoch": 0.9504950495049505,
"grad_norm": 4.292440414428711,
"learning_rate": 1.4901820299840997e-07,
"loss": 0.1832,
"step": 768
},
{
"epoch": 0.9517326732673267,
"grad_norm": 6.999221324920654,
"learning_rate": 1.4167785383472855e-07,
"loss": 0.2044,
"step": 769
},
{
"epoch": 0.9529702970297029,
"grad_norm": 3.0472869873046875,
"learning_rate": 1.345215959663837e-07,
"loss": 0.1321,
"step": 770
},
{
"epoch": 0.9542079207920792,
"grad_norm": 3.8832876682281494,
"learning_rate": 1.2754956302714617e-07,
"loss": 0.116,
"step": 771
},
{
"epoch": 0.9554455445544554,
"grad_norm": 2.6631460189819336,
"learning_rate": 1.207618852106285e-07,
"loss": 0.1369,
"step": 772
},
{
"epoch": 0.9566831683168316,
"grad_norm": 5.974620819091797,
"learning_rate": 1.1415868926785256e-07,
"loss": 0.2214,
"step": 773
},
{
"epoch": 0.9579207920792079,
"grad_norm": 4.9748616218566895,
"learning_rate": 1.0774009850488154e-07,
"loss": 0.1924,
"step": 774
},
{
"epoch": 0.9591584158415841,
"grad_norm": 3.96227765083313,
"learning_rate": 1.0150623278051719e-07,
"loss": 0.1976,
"step": 775
},
{
"epoch": 0.9603960396039604,
"grad_norm": 3.1740334033966064,
"learning_rate": 9.545720850406504e-08,
"loss": 0.1266,
"step": 776
},
{
"epoch": 0.9616336633663366,
"grad_norm": 7.497445583343506,
"learning_rate": 8.959313863315388e-08,
"loss": 0.231,
"step": 777
},
{
"epoch": 0.9628712871287128,
"grad_norm": 4.704877853393555,
"learning_rate": 8.391413267163418e-08,
"loss": 0.163,
"step": 778
},
{
"epoch": 0.9641089108910891,
"grad_norm": 4.273810386657715,
"learning_rate": 7.842029666752627e-08,
"loss": 0.1865,
"step": 779
},
{
"epoch": 0.9653465346534653,
"grad_norm": 4.104014873504639,
"learning_rate": 7.311173321104648e-08,
"loss": 0.2127,
"step": 780
},
{
"epoch": 0.9653465346534653,
"eval_accuracy": 0.8625277161862528,
"eval_f1": 0.7194570135746606,
"eval_loss": 0.2944088578224182,
"eval_precision": 0.8412698412698413,
"eval_recall": 0.6284584980237155,
"eval_runtime": 50.7955,
"eval_samples_per_second": 5.434,
"eval_steps_per_second": 0.177,
"step": 780
},
{
"epoch": 0.9665841584158416,
"grad_norm": 6.6444902420043945,
"learning_rate": 6.79885414326864e-08,
"loss": 0.251,
"step": 781
},
{
"epoch": 0.9678217821782178,
"grad_norm": 5.593561172485352,
"learning_rate": 6.305081700136328e-08,
"loss": 0.2616,
"step": 782
},
{
"epoch": 0.969059405940594,
"grad_norm": 3.74603009223938,
"learning_rate": 5.8298652122634747e-08,
"loss": 0.1636,
"step": 783
},
{
"epoch": 0.9702970297029703,
"grad_norm": 4.336615562438965,
"learning_rate": 5.373213553697576e-08,
"loss": 0.2196,
"step": 784
},
{
"epoch": 0.9715346534653465,
"grad_norm": 3.183893918991089,
"learning_rate": 4.935135251811995e-08,
"loss": 0.141,
"step": 785
},
{
"epoch": 0.9727722772277227,
"grad_norm": 5.894475936889648,
"learning_rate": 4.515638487147311e-08,
"loss": 0.2093,
"step": 786
},
{
"epoch": 0.974009900990099,
"grad_norm": 5.151168346405029,
"learning_rate": 4.1147310932578845e-08,
"loss": 0.2318,
"step": 787
},
{
"epoch": 0.9752475247524752,
"grad_norm": 3.7331173419952393,
"learning_rate": 3.732420556565752e-08,
"loss": 0.1696,
"step": 788
},
{
"epoch": 0.9764851485148515,
"grad_norm": 5.081837177276611,
"learning_rate": 3.368714016221186e-08,
"loss": 0.1864,
"step": 789
},
{
"epoch": 0.9777227722772277,
"grad_norm": 5.835230827331543,
"learning_rate": 3.023618263968797e-08,
"loss": 0.266,
"step": 790
},
{
"epoch": 0.9789603960396039,
"grad_norm": 4.534897327423096,
"learning_rate": 2.6971397440214154e-08,
"loss": 0.1775,
"step": 791
},
{
"epoch": 0.9801980198019802,
"grad_norm": 4.335183620452881,
"learning_rate": 2.3892845529390753e-08,
"loss": 0.2054,
"step": 792
},
{
"epoch": 0.9814356435643564,
"grad_norm": 4.6105451583862305,
"learning_rate": 2.100058439515551e-08,
"loss": 0.1936,
"step": 793
},
{
"epoch": 0.9826732673267327,
"grad_norm": 3.7368929386138916,
"learning_rate": 1.8294668046708874e-08,
"loss": 0.2007,
"step": 794
},
{
"epoch": 0.9839108910891089,
"grad_norm": 4.431072235107422,
"learning_rate": 1.577514701350591e-08,
"loss": 0.2273,
"step": 795
},
{
"epoch": 0.9851485148514851,
"grad_norm": 6.609121322631836,
"learning_rate": 1.3442068344312609e-08,
"loss": 0.238,
"step": 796
},
{
"epoch": 0.9863861386138614,
"grad_norm": 4.512663841247559,
"learning_rate": 1.129547560632771e-08,
"loss": 0.2116,
"step": 797
},
{
"epoch": 0.9876237623762376,
"grad_norm": 4.200558662414551,
"learning_rate": 9.335408884366682e-09,
"loss": 0.198,
"step": 798
},
{
"epoch": 0.9888613861386139,
"grad_norm": 6.579875469207764,
"learning_rate": 7.561904780116758e-09,
"loss": 0.212,
"step": 799
},
{
"epoch": 0.9900990099009901,
"grad_norm": 5.1703667640686035,
"learning_rate": 5.97499641145416e-09,
"loss": 0.2252,
"step": 800
},
{
"epoch": 0.9900990099009901,
"eval_accuracy": 0.8636363636363636,
"eval_f1": 0.7223476297968398,
"eval_loss": 0.2939181923866272,
"eval_precision": 0.8421052631578947,
"eval_recall": 0.6324110671936759,
"eval_runtime": 48.1373,
"eval_samples_per_second": 5.734,
"eval_steps_per_second": 0.187,
"step": 800
},
{
"epoch": 0.9913366336633663,
"grad_norm": 5.27916955947876,
"learning_rate": 4.574713411816811e-09,
"loss": 0.1858,
"step": 801
},
{
"epoch": 0.9925742574257426,
"grad_norm": 5.716527462005615,
"learning_rate": 3.361081929664778e-09,
"loss": 0.1604,
"step": 802
},
{
"epoch": 0.9938118811881188,
"grad_norm": 5.259273529052734,
"learning_rate": 2.3341246279806606e-09,
"loss": 0.1598,
"step": 803
},
{
"epoch": 0.995049504950495,
"grad_norm": 4.770205974578857,
"learning_rate": 1.493860683851045e-09,
"loss": 0.1981,
"step": 804
},
{
"epoch": 0.9962871287128713,
"grad_norm": 3.5331854820251465,
"learning_rate": 8.403057881067877e-10,
"loss": 0.1671,
"step": 805
},
{
"epoch": 0.9975247524752475,
"grad_norm": 4.290172576904297,
"learning_rate": 3.7347214503435927e-10,
"loss": 0.2307,
"step": 806
},
{
"epoch": 0.9987623762376238,
"grad_norm": 4.1493964195251465,
"learning_rate": 9.33684721426964e-11,
"loss": 0.1106,
"step": 807
},
{
"epoch": 1.0,
"grad_norm": 3.5634615421295166,
"learning_rate": 0.0,
"loss": 0.1396,
"step": 808
}
],
"logging_steps": 1,
"max_steps": 808,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.4786605370559693e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}