{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 112,
"global_step": 1344,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002232142857142857,
"grad_norm": 4.520495891571045,
"learning_rate": 2.0000000000000002e-07,
"loss": 0.8829,
"step": 1
},
{
"epoch": 0.002232142857142857,
"eval_loss": 0.8741821050643921,
"eval_runtime": 23.3187,
"eval_samples_per_second": 3.131,
"eval_steps_per_second": 0.429,
"step": 1
},
{
"epoch": 0.004464285714285714,
"grad_norm": 4.709418773651123,
"learning_rate": 4.0000000000000003e-07,
"loss": 0.9726,
"step": 2
},
{
"epoch": 0.006696428571428571,
"grad_norm": 4.854740619659424,
"learning_rate": 6.000000000000001e-07,
"loss": 0.94,
"step": 3
},
{
"epoch": 0.008928571428571428,
"grad_norm": 4.648777008056641,
"learning_rate": 8.000000000000001e-07,
"loss": 0.8458,
"step": 4
},
{
"epoch": 0.011160714285714286,
"grad_norm": 5.138184547424316,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.0834,
"step": 5
},
{
"epoch": 0.013392857142857142,
"grad_norm": 4.406048774719238,
"learning_rate": 1.2000000000000002e-06,
"loss": 0.9389,
"step": 6
},
{
"epoch": 0.015625,
"grad_norm": 4.439329147338867,
"learning_rate": 1.4000000000000001e-06,
"loss": 0.949,
"step": 7
},
{
"epoch": 0.017857142857142856,
"grad_norm": 4.817677021026611,
"learning_rate": 1.6000000000000001e-06,
"loss": 0.9092,
"step": 8
},
{
"epoch": 0.020089285714285716,
"grad_norm": 4.216228485107422,
"learning_rate": 1.8000000000000001e-06,
"loss": 0.9105,
"step": 9
},
{
"epoch": 0.022321428571428572,
"grad_norm": 4.030458927154541,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.9263,
"step": 10
},
{
"epoch": 0.024553571428571428,
"grad_norm": 4.106152057647705,
"learning_rate": 2.2e-06,
"loss": 0.991,
"step": 11
},
{
"epoch": 0.026785714285714284,
"grad_norm": 3.9047749042510986,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.8173,
"step": 12
},
{
"epoch": 0.029017857142857144,
"grad_norm": 3.7009527683258057,
"learning_rate": 2.6e-06,
"loss": 1.0595,
"step": 13
},
{
"epoch": 0.03125,
"grad_norm": 3.7975056171417236,
"learning_rate": 2.8000000000000003e-06,
"loss": 0.9012,
"step": 14
},
{
"epoch": 0.033482142857142856,
"grad_norm": 2.969536066055298,
"learning_rate": 3e-06,
"loss": 0.8177,
"step": 15
},
{
"epoch": 0.03571428571428571,
"grad_norm": 3.660879373550415,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.915,
"step": 16
},
{
"epoch": 0.03794642857142857,
"grad_norm": 3.0569710731506348,
"learning_rate": 3.4000000000000005e-06,
"loss": 0.8795,
"step": 17
},
{
"epoch": 0.04017857142857143,
"grad_norm": 2.7441296577453613,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.8276,
"step": 18
},
{
"epoch": 0.04241071428571429,
"grad_norm": 2.8655402660369873,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.7529,
"step": 19
},
{
"epoch": 0.044642857142857144,
"grad_norm": 2.769359827041626,
"learning_rate": 4.000000000000001e-06,
"loss": 0.817,
"step": 20
},
{
"epoch": 0.046875,
"grad_norm": 3.5258753299713135,
"learning_rate": 4.2000000000000004e-06,
"loss": 0.9505,
"step": 21
},
{
"epoch": 0.049107142857142856,
"grad_norm": 3.55863356590271,
"learning_rate": 4.4e-06,
"loss": 0.8813,
"step": 22
},
{
"epoch": 0.05133928571428571,
"grad_norm": 3.1872193813323975,
"learning_rate": 4.600000000000001e-06,
"loss": 0.8472,
"step": 23
},
{
"epoch": 0.05357142857142857,
"grad_norm": 3.643343687057495,
"learning_rate": 4.800000000000001e-06,
"loss": 0.8751,
"step": 24
},
{
"epoch": 0.05580357142857143,
"grad_norm": 3.154827356338501,
"learning_rate": 5e-06,
"loss": 0.827,
"step": 25
},
{
"epoch": 0.05803571428571429,
"grad_norm": 2.529634714126587,
"learning_rate": 5.2e-06,
"loss": 0.8818,
"step": 26
},
{
"epoch": 0.060267857142857144,
"grad_norm": 2.5746371746063232,
"learning_rate": 5.400000000000001e-06,
"loss": 0.885,
"step": 27
},
{
"epoch": 0.0625,
"grad_norm": 2.452150583267212,
"learning_rate": 5.600000000000001e-06,
"loss": 0.975,
"step": 28
},
{
"epoch": 0.06473214285714286,
"grad_norm": 2.5640347003936768,
"learning_rate": 5.8e-06,
"loss": 0.7795,
"step": 29
},
{
"epoch": 0.06696428571428571,
"grad_norm": 2.178790330886841,
"learning_rate": 6e-06,
"loss": 0.7853,
"step": 30
},
{
"epoch": 0.06919642857142858,
"grad_norm": 2.054187297821045,
"learning_rate": 6.200000000000001e-06,
"loss": 0.7556,
"step": 31
},
{
"epoch": 0.07142857142857142,
"grad_norm": 2.3759331703186035,
"learning_rate": 6.4000000000000006e-06,
"loss": 0.8584,
"step": 32
},
{
"epoch": 0.07366071428571429,
"grad_norm": 2.5890913009643555,
"learning_rate": 6.600000000000001e-06,
"loss": 0.8545,
"step": 33
},
{
"epoch": 0.07589285714285714,
"grad_norm": 2.1318633556365967,
"learning_rate": 6.800000000000001e-06,
"loss": 0.8014,
"step": 34
},
{
"epoch": 0.078125,
"grad_norm": 2.1878416538238525,
"learning_rate": 7e-06,
"loss": 0.8423,
"step": 35
},
{
"epoch": 0.08035714285714286,
"grad_norm": 1.959555745124817,
"learning_rate": 7.2000000000000005e-06,
"loss": 0.7856,
"step": 36
},
{
"epoch": 0.08258928571428571,
"grad_norm": 1.9672911167144775,
"learning_rate": 7.4e-06,
"loss": 0.7517,
"step": 37
},
{
"epoch": 0.08482142857142858,
"grad_norm": 2.133237600326538,
"learning_rate": 7.600000000000001e-06,
"loss": 0.8081,
"step": 38
},
{
"epoch": 0.08705357142857142,
"grad_norm": 2.6118452548980713,
"learning_rate": 7.800000000000002e-06,
"loss": 0.9733,
"step": 39
},
{
"epoch": 0.08928571428571429,
"grad_norm": 1.9084440469741821,
"learning_rate": 8.000000000000001e-06,
"loss": 0.6862,
"step": 40
},
{
"epoch": 0.09151785714285714,
"grad_norm": 2.092421531677246,
"learning_rate": 8.2e-06,
"loss": 0.7759,
"step": 41
},
{
"epoch": 0.09375,
"grad_norm": 2.126476764678955,
"learning_rate": 8.400000000000001e-06,
"loss": 0.8206,
"step": 42
},
{
"epoch": 0.09598214285714286,
"grad_norm": 1.9326355457305908,
"learning_rate": 8.6e-06,
"loss": 0.8061,
"step": 43
},
{
"epoch": 0.09821428571428571,
"grad_norm": 2.1919474601745605,
"learning_rate": 8.8e-06,
"loss": 0.8693,
"step": 44
},
{
"epoch": 0.10044642857142858,
"grad_norm": 2.066986322402954,
"learning_rate": 9e-06,
"loss": 0.8146,
"step": 45
},
{
"epoch": 0.10267857142857142,
"grad_norm": 2.196195602416992,
"learning_rate": 9.200000000000002e-06,
"loss": 0.8237,
"step": 46
},
{
"epoch": 0.10491071428571429,
"grad_norm": 2.36797833442688,
"learning_rate": 9.4e-06,
"loss": 0.8609,
"step": 47
},
{
"epoch": 0.10714285714285714,
"grad_norm": 2.007786512374878,
"learning_rate": 9.600000000000001e-06,
"loss": 0.8415,
"step": 48
},
{
"epoch": 0.109375,
"grad_norm": 2.127784013748169,
"learning_rate": 9.800000000000001e-06,
"loss": 0.8149,
"step": 49
},
{
"epoch": 0.11160714285714286,
"grad_norm": 1.842410922050476,
"learning_rate": 1e-05,
"loss": 0.6706,
"step": 50
},
{
"epoch": 0.11383928571428571,
"grad_norm": 2.025834321975708,
"learning_rate": 1.02e-05,
"loss": 0.7797,
"step": 51
},
{
"epoch": 0.11607142857142858,
"grad_norm": 2.0152997970581055,
"learning_rate": 1.04e-05,
"loss": 0.7977,
"step": 52
},
{
"epoch": 0.11830357142857142,
"grad_norm": 1.8089625835418701,
"learning_rate": 1.0600000000000002e-05,
"loss": 0.7222,
"step": 53
},
{
"epoch": 0.12053571428571429,
"grad_norm": 1.9475045204162598,
"learning_rate": 1.0800000000000002e-05,
"loss": 0.7971,
"step": 54
},
{
"epoch": 0.12276785714285714,
"grad_norm": 1.9405206441879272,
"learning_rate": 1.1000000000000001e-05,
"loss": 0.77,
"step": 55
},
{
"epoch": 0.125,
"grad_norm": 1.7220442295074463,
"learning_rate": 1.1200000000000001e-05,
"loss": 0.6592,
"step": 56
},
{
"epoch": 0.12723214285714285,
"grad_norm": 2.070206880569458,
"learning_rate": 1.14e-05,
"loss": 0.8843,
"step": 57
},
{
"epoch": 0.12946428571428573,
"grad_norm": 2.2304985523223877,
"learning_rate": 1.16e-05,
"loss": 0.7968,
"step": 58
},
{
"epoch": 0.13169642857142858,
"grad_norm": 2.300931215286255,
"learning_rate": 1.18e-05,
"loss": 0.7917,
"step": 59
},
{
"epoch": 0.13392857142857142,
"grad_norm": 2.126228094100952,
"learning_rate": 1.2e-05,
"loss": 0.7965,
"step": 60
},
{
"epoch": 0.13616071428571427,
"grad_norm": 2.0050771236419678,
"learning_rate": 1.22e-05,
"loss": 0.7334,
"step": 61
},
{
"epoch": 0.13839285714285715,
"grad_norm": 2.097790241241455,
"learning_rate": 1.2400000000000002e-05,
"loss": 0.7254,
"step": 62
},
{
"epoch": 0.140625,
"grad_norm": 2.2999649047851562,
"learning_rate": 1.2600000000000001e-05,
"loss": 0.7892,
"step": 63
},
{
"epoch": 0.14285714285714285,
"grad_norm": 2.2662696838378906,
"learning_rate": 1.2800000000000001e-05,
"loss": 0.7692,
"step": 64
},
{
"epoch": 0.14508928571428573,
"grad_norm": 1.8592685461044312,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.7562,
"step": 65
},
{
"epoch": 0.14732142857142858,
"grad_norm": 2.0617785453796387,
"learning_rate": 1.3200000000000002e-05,
"loss": 0.8464,
"step": 66
},
{
"epoch": 0.14955357142857142,
"grad_norm": 1.990391492843628,
"learning_rate": 1.3400000000000002e-05,
"loss": 0.7656,
"step": 67
},
{
"epoch": 0.15178571428571427,
"grad_norm": 2.021301031112671,
"learning_rate": 1.3600000000000002e-05,
"loss": 0.7506,
"step": 68
},
{
"epoch": 0.15401785714285715,
"grad_norm": 2.1052801609039307,
"learning_rate": 1.38e-05,
"loss": 0.8233,
"step": 69
},
{
"epoch": 0.15625,
"grad_norm": 2.0981056690216064,
"learning_rate": 1.4e-05,
"loss": 0.7702,
"step": 70
},
{
"epoch": 0.15848214285714285,
"grad_norm": 1.9719496965408325,
"learning_rate": 1.4200000000000001e-05,
"loss": 0.7602,
"step": 71
},
{
"epoch": 0.16071428571428573,
"grad_norm": 1.983307957649231,
"learning_rate": 1.4400000000000001e-05,
"loss": 0.7432,
"step": 72
},
{
"epoch": 0.16294642857142858,
"grad_norm": 2.3522326946258545,
"learning_rate": 1.46e-05,
"loss": 0.8172,
"step": 73
},
{
"epoch": 0.16517857142857142,
"grad_norm": 2.026918888092041,
"learning_rate": 1.48e-05,
"loss": 0.6841,
"step": 74
},
{
"epoch": 0.16741071428571427,
"grad_norm": 2.1341769695281982,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.852,
"step": 75
},
{
"epoch": 0.16964285714285715,
"grad_norm": 1.8743571043014526,
"learning_rate": 1.5200000000000002e-05,
"loss": 0.7731,
"step": 76
},
{
"epoch": 0.171875,
"grad_norm": 2.333038330078125,
"learning_rate": 1.54e-05,
"loss": 0.8161,
"step": 77
},
{
"epoch": 0.17410714285714285,
"grad_norm": 2.553131103515625,
"learning_rate": 1.5600000000000003e-05,
"loss": 0.9338,
"step": 78
},
{
"epoch": 0.17633928571428573,
"grad_norm": 1.8976587057113647,
"learning_rate": 1.58e-05,
"loss": 0.772,
"step": 79
},
{
"epoch": 0.17857142857142858,
"grad_norm": 2.028928279876709,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.8268,
"step": 80
},
{
"epoch": 0.18080357142857142,
"grad_norm": 2.094634771347046,
"learning_rate": 1.62e-05,
"loss": 0.8385,
"step": 81
},
{
"epoch": 0.18303571428571427,
"grad_norm": 2.3168070316314697,
"learning_rate": 1.64e-05,
"loss": 0.9017,
"step": 82
},
{
"epoch": 0.18526785714285715,
"grad_norm": 2.350069522857666,
"learning_rate": 1.66e-05,
"loss": 0.8375,
"step": 83
},
{
"epoch": 0.1875,
"grad_norm": 1.871971607208252,
"learning_rate": 1.6800000000000002e-05,
"loss": 0.7232,
"step": 84
},
{
"epoch": 0.18973214285714285,
"grad_norm": 2.1683645248413086,
"learning_rate": 1.7e-05,
"loss": 0.7449,
"step": 85
},
{
"epoch": 0.19196428571428573,
"grad_norm": 1.8138465881347656,
"learning_rate": 1.72e-05,
"loss": 0.6897,
"step": 86
},
{
"epoch": 0.19419642857142858,
"grad_norm": 2.2803397178649902,
"learning_rate": 1.7400000000000003e-05,
"loss": 0.8541,
"step": 87
},
{
"epoch": 0.19642857142857142,
"grad_norm": 1.8534305095672607,
"learning_rate": 1.76e-05,
"loss": 0.7187,
"step": 88
},
{
"epoch": 0.19866071428571427,
"grad_norm": 2.2822651863098145,
"learning_rate": 1.7800000000000002e-05,
"loss": 0.7458,
"step": 89
},
{
"epoch": 0.20089285714285715,
"grad_norm": 2.2075366973876953,
"learning_rate": 1.8e-05,
"loss": 0.8119,
"step": 90
},
{
"epoch": 0.203125,
"grad_norm": 2.797893762588501,
"learning_rate": 1.8200000000000002e-05,
"loss": 0.9198,
"step": 91
},
{
"epoch": 0.20535714285714285,
"grad_norm": 2.2375845909118652,
"learning_rate": 1.8400000000000003e-05,
"loss": 0.6985,
"step": 92
},
{
"epoch": 0.20758928571428573,
"grad_norm": 2.1225900650024414,
"learning_rate": 1.86e-05,
"loss": 0.8483,
"step": 93
},
{
"epoch": 0.20982142857142858,
"grad_norm": 1.8341416120529175,
"learning_rate": 1.88e-05,
"loss": 0.7703,
"step": 94
},
{
"epoch": 0.21205357142857142,
"grad_norm": 2.27540922164917,
"learning_rate": 1.9e-05,
"loss": 0.8437,
"step": 95
},
{
"epoch": 0.21428571428571427,
"grad_norm": 2.091398000717163,
"learning_rate": 1.9200000000000003e-05,
"loss": 0.7553,
"step": 96
},
{
"epoch": 0.21651785714285715,
"grad_norm": 1.8585134744644165,
"learning_rate": 1.94e-05,
"loss": 0.6444,
"step": 97
},
{
"epoch": 0.21875,
"grad_norm": 2.1151020526885986,
"learning_rate": 1.9600000000000002e-05,
"loss": 0.7303,
"step": 98
},
{
"epoch": 0.22098214285714285,
"grad_norm": 1.9768584966659546,
"learning_rate": 1.98e-05,
"loss": 0.74,
"step": 99
},
{
"epoch": 0.22321428571428573,
"grad_norm": 2.13527250289917,
"learning_rate": 2e-05,
"loss": 0.7134,
"step": 100
},
{
"epoch": 0.22544642857142858,
"grad_norm": 2.309387683868408,
"learning_rate": 1.9999968111891562e-05,
"loss": 0.8742,
"step": 101
},
{
"epoch": 0.22767857142857142,
"grad_norm": 1.8575270175933838,
"learning_rate": 1.9999872447769624e-05,
"loss": 0.7745,
"step": 102
},
{
"epoch": 0.22991071428571427,
"grad_norm": 1.9398894309997559,
"learning_rate": 1.9999713008244287e-05,
"loss": 0.7618,
"step": 103
},
{
"epoch": 0.23214285714285715,
"grad_norm": 2.5453739166259766,
"learning_rate": 1.9999489794332404e-05,
"loss": 0.931,
"step": 104
},
{
"epoch": 0.234375,
"grad_norm": 2.1891441345214844,
"learning_rate": 1.9999202807457537e-05,
"loss": 0.8582,
"step": 105
},
{
"epoch": 0.23660714285714285,
"grad_norm": 1.949729084968567,
"learning_rate": 1.9998852049449998e-05,
"loss": 0.9173,
"step": 106
},
{
"epoch": 0.23883928571428573,
"grad_norm": 1.8219000101089478,
"learning_rate": 1.999843752254677e-05,
"loss": 0.7732,
"step": 107
},
{
"epoch": 0.24107142857142858,
"grad_norm": 1.942179799079895,
"learning_rate": 1.9997959229391567e-05,
"loss": 0.7376,
"step": 108
},
{
"epoch": 0.24330357142857142,
"grad_norm": 1.6319869756698608,
"learning_rate": 1.9997417173034746e-05,
"loss": 0.7755,
"step": 109
},
{
"epoch": 0.24553571428571427,
"grad_norm": 1.9693115949630737,
"learning_rate": 1.9996811356933346e-05,
"loss": 0.7828,
"step": 110
},
{
"epoch": 0.24776785714285715,
"grad_norm": 2.1049964427948,
"learning_rate": 1.999614178495103e-05,
"loss": 0.7936,
"step": 111
},
{
"epoch": 0.25,
"grad_norm": 2.169593572616577,
"learning_rate": 1.9995408461358074e-05,
"loss": 0.7894,
"step": 112
},
{
"epoch": 0.25,
"eval_loss": 0.7251861691474915,
"eval_runtime": 27.5365,
"eval_samples_per_second": 2.651,
"eval_steps_per_second": 0.363,
"step": 112
},
{
"epoch": 0.25223214285714285,
"grad_norm": 1.8239336013793945,
"learning_rate": 1.9994611390831342e-05,
"loss": 0.7608,
"step": 113
},
{
"epoch": 0.2544642857142857,
"grad_norm": 2.3521170616149902,
"learning_rate": 1.9993750578454248e-05,
"loss": 0.9461,
"step": 114
},
{
"epoch": 0.25669642857142855,
"grad_norm": 2.146218776702881,
"learning_rate": 1.9992826029716722e-05,
"loss": 0.8203,
"step": 115
},
{
"epoch": 0.25892857142857145,
"grad_norm": 2.1784703731536865,
"learning_rate": 1.999183775051519e-05,
"loss": 0.749,
"step": 116
},
{
"epoch": 0.2611607142857143,
"grad_norm": 1.7962055206298828,
"learning_rate": 1.9990785747152527e-05,
"loss": 0.7431,
"step": 117
},
{
"epoch": 0.26339285714285715,
"grad_norm": 2.184608221054077,
"learning_rate": 1.9989670026338002e-05,
"loss": 0.8456,
"step": 118
},
{
"epoch": 0.265625,
"grad_norm": 2.270358085632324,
"learning_rate": 1.9988490595187273e-05,
"loss": 0.8213,
"step": 119
},
{
"epoch": 0.26785714285714285,
"grad_norm": 2.243161678314209,
"learning_rate": 1.9987247461222297e-05,
"loss": 0.7454,
"step": 120
},
{
"epoch": 0.2700892857142857,
"grad_norm": 2.3505022525787354,
"learning_rate": 1.9985940632371316e-05,
"loss": 0.853,
"step": 121
},
{
"epoch": 0.27232142857142855,
"grad_norm": 2.321498394012451,
"learning_rate": 1.9984570116968785e-05,
"loss": 0.8958,
"step": 122
},
{
"epoch": 0.27455357142857145,
"grad_norm": 2.576880693435669,
"learning_rate": 1.9983135923755336e-05,
"loss": 0.9688,
"step": 123
},
{
"epoch": 0.2767857142857143,
"grad_norm": 2.2674782276153564,
"learning_rate": 1.9981638061877714e-05,
"loss": 0.8822,
"step": 124
},
{
"epoch": 0.27901785714285715,
"grad_norm": 1.8983664512634277,
"learning_rate": 1.998007654088871e-05,
"loss": 0.6536,
"step": 125
},
{
"epoch": 0.28125,
"grad_norm": 2.1132736206054688,
"learning_rate": 1.9978451370747122e-05,
"loss": 0.8452,
"step": 126
},
{
"epoch": 0.28348214285714285,
"grad_norm": 2.033719778060913,
"learning_rate": 1.9976762561817656e-05,
"loss": 0.763,
"step": 127
},
{
"epoch": 0.2857142857142857,
"grad_norm": 2.284616231918335,
"learning_rate": 1.997501012487091e-05,
"loss": 0.8082,
"step": 128
},
{
"epoch": 0.28794642857142855,
"grad_norm": 1.9635744094848633,
"learning_rate": 1.997319407108326e-05,
"loss": 0.8587,
"step": 129
},
{
"epoch": 0.29017857142857145,
"grad_norm": 2.307817220687866,
"learning_rate": 1.9971314412036807e-05,
"loss": 0.7933,
"step": 130
},
{
"epoch": 0.2924107142857143,
"grad_norm": 2.1261589527130127,
"learning_rate": 1.9969371159719307e-05,
"loss": 0.8069,
"step": 131
},
{
"epoch": 0.29464285714285715,
"grad_norm": 2.0330147743225098,
"learning_rate": 1.996736432652409e-05,
"loss": 0.7368,
"step": 132
},
{
"epoch": 0.296875,
"grad_norm": 2.067072629928589,
"learning_rate": 1.9965293925249976e-05,
"loss": 0.7402,
"step": 133
},
{
"epoch": 0.29910714285714285,
"grad_norm": 2.2394609451293945,
"learning_rate": 1.9963159969101207e-05,
"loss": 0.8081,
"step": 134
},
{
"epoch": 0.3013392857142857,
"grad_norm": 1.8908040523529053,
"learning_rate": 1.996096247168734e-05,
"loss": 0.6806,
"step": 135
},
{
"epoch": 0.30357142857142855,
"grad_norm": 2.1276235580444336,
"learning_rate": 1.9958701447023188e-05,
"loss": 0.8402,
"step": 136
},
{
"epoch": 0.30580357142857145,
"grad_norm": 1.948089361190796,
"learning_rate": 1.9956376909528704e-05,
"loss": 0.8141,
"step": 137
},
{
"epoch": 0.3080357142857143,
"grad_norm": 2.3023507595062256,
"learning_rate": 1.9953988874028917e-05,
"loss": 0.8263,
"step": 138
},
{
"epoch": 0.31026785714285715,
"grad_norm": 2.078064441680908,
"learning_rate": 1.995153735575381e-05,
"loss": 0.8128,
"step": 139
},
{
"epoch": 0.3125,
"grad_norm": 2.271723985671997,
"learning_rate": 1.994902237033824e-05,
"loss": 0.7636,
"step": 140
},
{
"epoch": 0.31473214285714285,
"grad_norm": 1.9039952754974365,
"learning_rate": 1.994644393382183e-05,
"loss": 0.7801,
"step": 141
},
{
"epoch": 0.3169642857142857,
"grad_norm": 2.113295078277588,
"learning_rate": 1.9943802062648877e-05,
"loss": 0.7634,
"step": 142
},
{
"epoch": 0.31919642857142855,
"grad_norm": 1.9675801992416382,
"learning_rate": 1.9941096773668232e-05,
"loss": 0.7411,
"step": 143
},
{
"epoch": 0.32142857142857145,
"grad_norm": 2.325932741165161,
"learning_rate": 1.9938328084133206e-05,
"loss": 0.8638,
"step": 144
},
{
"epoch": 0.3236607142857143,
"grad_norm": 1.9418251514434814,
"learning_rate": 1.9935496011701453e-05,
"loss": 0.7443,
"step": 145
},
{
"epoch": 0.32589285714285715,
"grad_norm": 1.611464500427246,
"learning_rate": 1.9932600574434864e-05,
"loss": 0.7198,
"step": 146
},
{
"epoch": 0.328125,
"grad_norm": 2.157644748687744,
"learning_rate": 1.9929641790799438e-05,
"loss": 0.8276,
"step": 147
},
{
"epoch": 0.33035714285714285,
"grad_norm": 2.295194625854492,
"learning_rate": 1.9926619679665175e-05,
"loss": 0.8713,
"step": 148
},
{
"epoch": 0.3325892857142857,
"grad_norm": 2.154426097869873,
"learning_rate": 1.992353426030596e-05,
"loss": 0.7274,
"step": 149
},
{
"epoch": 0.33482142857142855,
"grad_norm": 1.6973615884780884,
"learning_rate": 1.9920385552399434e-05,
"loss": 0.6846,
"step": 150
},
{
"epoch": 0.33705357142857145,
"grad_norm": 1.7057573795318604,
"learning_rate": 1.991717357602686e-05,
"loss": 0.7335,
"step": 151
},
{
"epoch": 0.3392857142857143,
"grad_norm": 1.9547100067138672,
"learning_rate": 1.9913898351673006e-05,
"loss": 0.6845,
"step": 152
},
{
"epoch": 0.34151785714285715,
"grad_norm": 2.0757429599761963,
"learning_rate": 1.991055990022602e-05,
"loss": 0.7628,
"step": 153
},
{
"epoch": 0.34375,
"grad_norm": 2.1312568187713623,
"learning_rate": 1.990715824297728e-05,
"loss": 0.7328,
"step": 154
},
{
"epoch": 0.34598214285714285,
"grad_norm": 1.9267735481262207,
"learning_rate": 1.990369340162127e-05,
"loss": 0.8076,
"step": 155
},
{
"epoch": 0.3482142857142857,
"grad_norm": 1.9615391492843628,
"learning_rate": 1.9900165398255434e-05,
"loss": 0.7789,
"step": 156
},
{
"epoch": 0.35044642857142855,
"grad_norm": 1.7132021188735962,
"learning_rate": 1.9896574255380045e-05,
"loss": 0.7017,
"step": 157
},
{
"epoch": 0.35267857142857145,
"grad_norm": 2.122762680053711,
"learning_rate": 1.9892919995898052e-05,
"loss": 0.7483,
"step": 158
},
{
"epoch": 0.3549107142857143,
"grad_norm": 2.0038235187530518,
"learning_rate": 1.988920264311494e-05,
"loss": 0.6985,
"step": 159
},
{
"epoch": 0.35714285714285715,
"grad_norm": 2.013420820236206,
"learning_rate": 1.9885422220738583e-05,
"loss": 0.6655,
"step": 160
},
{
"epoch": 0.359375,
"grad_norm": 2.1159446239471436,
"learning_rate": 1.988157875287908e-05,
"loss": 0.8129,
"step": 161
},
{
"epoch": 0.36160714285714285,
"grad_norm": 1.8331681489944458,
"learning_rate": 1.9877672264048618e-05,
"loss": 0.667,
"step": 162
},
{
"epoch": 0.3638392857142857,
"grad_norm": 1.9691740274429321,
"learning_rate": 1.98737027791613e-05,
"loss": 0.6916,
"step": 163
},
{
"epoch": 0.36607142857142855,
"grad_norm": 2.064512252807617,
"learning_rate": 1.9869670323533005e-05,
"loss": 0.8727,
"step": 164
},
{
"epoch": 0.36830357142857145,
"grad_norm": 2.3000264167785645,
"learning_rate": 1.9865574922881204e-05,
"loss": 0.7485,
"step": 165
},
{
"epoch": 0.3705357142857143,
"grad_norm": 2.070896625518799,
"learning_rate": 1.986141660332482e-05,
"loss": 0.8254,
"step": 166
},
{
"epoch": 0.37276785714285715,
"grad_norm": 2.051863431930542,
"learning_rate": 1.9857195391384038e-05,
"loss": 0.801,
"step": 167
},
{
"epoch": 0.375,
"grad_norm": 2.0693776607513428,
"learning_rate": 1.9852911313980146e-05,
"loss": 0.6922,
"step": 168
},
{
"epoch": 0.37723214285714285,
"grad_norm": 1.5244134664535522,
"learning_rate": 1.9848564398435374e-05,
"loss": 0.7052,
"step": 169
},
{
"epoch": 0.3794642857142857,
"grad_norm": 1.927579641342163,
"learning_rate": 1.9844154672472707e-05,
"loss": 0.7238,
"step": 170
},
{
"epoch": 0.38169642857142855,
"grad_norm": 1.7581312656402588,
"learning_rate": 1.9839682164215707e-05,
"loss": 0.7498,
"step": 171
},
{
"epoch": 0.38392857142857145,
"grad_norm": 2.004220485687256,
"learning_rate": 1.9835146902188336e-05,
"loss": 0.8368,
"step": 172
},
{
"epoch": 0.3861607142857143,
"grad_norm": 1.9040734767913818,
"learning_rate": 1.983054891531478e-05,
"loss": 0.7625,
"step": 173
},
{
"epoch": 0.38839285714285715,
"grad_norm": 1.787835955619812,
"learning_rate": 1.9825888232919268e-05,
"loss": 0.7894,
"step": 174
},
{
"epoch": 0.390625,
"grad_norm": 1.8486220836639404,
"learning_rate": 1.982116488472586e-05,
"loss": 0.7794,
"step": 175
},
{
"epoch": 0.39285714285714285,
"grad_norm": 2.068049907684326,
"learning_rate": 1.9816378900858288e-05,
"loss": 0.7192,
"step": 176
},
{
"epoch": 0.3950892857142857,
"grad_norm": 1.8563698530197144,
"learning_rate": 1.9811530311839747e-05,
"loss": 0.8747,
"step": 177
},
{
"epoch": 0.39732142857142855,
"grad_norm": 2.2349257469177246,
"learning_rate": 1.98066191485927e-05,
"loss": 0.9516,
"step": 178
},
{
"epoch": 0.39955357142857145,
"grad_norm": 1.9404733180999756,
"learning_rate": 1.980164544243869e-05,
"loss": 0.7122,
"step": 179
},
{
"epoch": 0.4017857142857143,
"grad_norm": 2.0351598262786865,
"learning_rate": 1.9796609225098136e-05,
"loss": 0.8076,
"step": 180
},
{
"epoch": 0.40401785714285715,
"grad_norm": 1.7152974605560303,
"learning_rate": 1.9791510528690125e-05,
"loss": 0.7297,
"step": 181
},
{
"epoch": 0.40625,
"grad_norm": 1.8740495443344116,
"learning_rate": 1.9786349385732212e-05,
"loss": 0.7284,
"step": 182
},
{
"epoch": 0.40848214285714285,
"grad_norm": 1.9318393468856812,
"learning_rate": 1.9781125829140214e-05,
"loss": 0.6855,
"step": 183
},
{
"epoch": 0.4107142857142857,
"grad_norm": 1.8202929496765137,
"learning_rate": 1.9775839892228004e-05,
"loss": 0.7345,
"step": 184
},
{
"epoch": 0.41294642857142855,
"grad_norm": 1.782867670059204,
"learning_rate": 1.977049160870728e-05,
"loss": 0.744,
"step": 185
},
{
"epoch": 0.41517857142857145,
"grad_norm": 1.9989078044891357,
"learning_rate": 1.976508101268738e-05,
"loss": 0.8473,
"step": 186
},
{
"epoch": 0.4174107142857143,
"grad_norm": 2.07568359375,
"learning_rate": 1.975960813867503e-05,
"loss": 0.8046,
"step": 187
},
{
"epoch": 0.41964285714285715,
"grad_norm": 1.87251615524292,
"learning_rate": 1.9754073021574153e-05,
"loss": 0.7159,
"step": 188
},
{
"epoch": 0.421875,
"grad_norm": 2.1433751583099365,
"learning_rate": 1.9748475696685637e-05,
"loss": 0.8732,
"step": 189
},
{
"epoch": 0.42410714285714285,
"grad_norm": 1.7815970182418823,
"learning_rate": 1.9742816199707096e-05,
"loss": 0.7325,
"step": 190
},
{
"epoch": 0.4263392857142857,
"grad_norm": 2.1016180515289307,
"learning_rate": 1.9737094566732663e-05,
"loss": 0.8413,
"step": 191
},
{
"epoch": 0.42857142857142855,
"grad_norm": 2.0545904636383057,
"learning_rate": 1.9731310834252747e-05,
"loss": 0.7327,
"step": 192
},
{
"epoch": 0.43080357142857145,
"grad_norm": 1.8373966217041016,
"learning_rate": 1.972546503915381e-05,
"loss": 0.6376,
"step": 193
},
{
"epoch": 0.4330357142857143,
"grad_norm": 1.8860907554626465,
"learning_rate": 1.9719557218718116e-05,
"loss": 0.6071,
"step": 194
},
{
"epoch": 0.43526785714285715,
"grad_norm": 1.9010783433914185,
"learning_rate": 1.9713587410623516e-05,
"loss": 0.6556,
"step": 195
},
{
"epoch": 0.4375,
"grad_norm": 2.155518054962158,
"learning_rate": 1.970755565294318e-05,
"loss": 0.8064,
"step": 196
},
{
"epoch": 0.43973214285714285,
"grad_norm": 1.8603652715682983,
"learning_rate": 1.970146198414538e-05,
"loss": 0.7676,
"step": 197
},
{
"epoch": 0.4419642857142857,
"grad_norm": 1.6968109607696533,
"learning_rate": 1.969530644309323e-05,
"loss": 0.6538,
"step": 198
},
{
"epoch": 0.44419642857142855,
"grad_norm": 1.864494800567627,
"learning_rate": 1.968908906904444e-05,
"loss": 0.655,
"step": 199
},
{
"epoch": 0.44642857142857145,
"grad_norm": 1.8527575731277466,
"learning_rate": 1.9682809901651074e-05,
"loss": 0.7734,
"step": 200
},
{
"epoch": 0.4486607142857143,
"grad_norm": 1.9814064502716064,
"learning_rate": 1.9676468980959284e-05,
"loss": 0.6819,
"step": 201
},
{
"epoch": 0.45089285714285715,
"grad_norm": 2.267021894454956,
"learning_rate": 1.9670066347409063e-05,
"loss": 0.8216,
"step": 202
},
{
"epoch": 0.453125,
"grad_norm": 2.0436460971832275,
"learning_rate": 1.9663602041833983e-05,
"loss": 0.8168,
"step": 203
},
{
"epoch": 0.45535714285714285,
"grad_norm": 1.9789938926696777,
"learning_rate": 1.9657076105460945e-05,
"loss": 0.7879,
"step": 204
},
{
"epoch": 0.4575892857142857,
"grad_norm": 1.8295159339904785,
"learning_rate": 1.9650488579909898e-05,
"loss": 0.7912,
"step": 205
},
{
"epoch": 0.45982142857142855,
"grad_norm": 2.1058108806610107,
"learning_rate": 1.964383950719359e-05,
"loss": 0.8244,
"step": 206
},
{
"epoch": 0.46205357142857145,
"grad_norm": 1.6311708688735962,
"learning_rate": 1.9637128929717294e-05,
"loss": 0.7164,
"step": 207
},
{
"epoch": 0.4642857142857143,
"grad_norm": 1.8252456188201904,
"learning_rate": 1.9630356890278527e-05,
"loss": 0.7296,
"step": 208
},
{
"epoch": 0.46651785714285715,
"grad_norm": 2.008681297302246,
"learning_rate": 1.96235234320668e-05,
"loss": 0.7393,
"step": 209
},
{
"epoch": 0.46875,
"grad_norm": 2.0544400215148926,
"learning_rate": 1.9616628598663322e-05,
"loss": 0.8566,
"step": 210
},
{
"epoch": 0.47098214285714285,
"grad_norm": 1.8580057621002197,
"learning_rate": 1.9609672434040736e-05,
"loss": 0.7186,
"step": 211
},
{
"epoch": 0.4732142857142857,
"grad_norm": 1.7987284660339355,
"learning_rate": 1.9602654982562822e-05,
"loss": 0.8183,
"step": 212
},
{
"epoch": 0.47544642857142855,
"grad_norm": 1.8287429809570312,
"learning_rate": 1.9595576288984233e-05,
"loss": 0.6638,
"step": 213
},
{
"epoch": 0.47767857142857145,
"grad_norm": 1.8274677991867065,
"learning_rate": 1.9588436398450206e-05,
"loss": 0.777,
"step": 214
},
{
"epoch": 0.4799107142857143,
"grad_norm": 1.702154517173767,
"learning_rate": 1.958123535649625e-05,
"loss": 0.7325,
"step": 215
},
{
"epoch": 0.48214285714285715,
"grad_norm": 1.7472929954528809,
"learning_rate": 1.9573973209047893e-05,
"loss": 0.7387,
"step": 216
},
{
"epoch": 0.484375,
"grad_norm": 2.046131134033203,
"learning_rate": 1.9566650002420363e-05,
"loss": 0.8264,
"step": 217
},
{
"epoch": 0.48660714285714285,
"grad_norm": 1.8448314666748047,
"learning_rate": 1.9559265783318304e-05,
"loss": 0.7476,
"step": 218
},
{
"epoch": 0.4888392857142857,
"grad_norm": 1.8311007022857666,
"learning_rate": 1.9551820598835464e-05,
"loss": 0.7377,
"step": 219
},
{
"epoch": 0.49107142857142855,
"grad_norm": 1.852664589881897,
"learning_rate": 1.9544314496454423e-05,
"loss": 0.7963,
"step": 220
},
{
"epoch": 0.49330357142857145,
"grad_norm": 1.744728446006775,
"learning_rate": 1.9536747524046254e-05,
"loss": 0.8079,
"step": 221
},
{
"epoch": 0.4955357142857143,
"grad_norm": 1.957882285118103,
"learning_rate": 1.9529119729870253e-05,
"loss": 0.7432,
"step": 222
},
{
"epoch": 0.49776785714285715,
"grad_norm": 1.8669383525848389,
"learning_rate": 1.9521431162573596e-05,
"loss": 0.7875,
"step": 223
},
{
"epoch": 0.5,
"grad_norm": 1.79106605052948,
"learning_rate": 1.9513681871191063e-05,
"loss": 0.7095,
"step": 224
},
{
"epoch": 0.5,
"eval_loss": 0.7226072549819946,
"eval_runtime": 37.3023,
"eval_samples_per_second": 1.957,
"eval_steps_per_second": 0.268,
"step": 224
},
{
"epoch": 0.5022321428571429,
"grad_norm": 1.7508628368377686,
"learning_rate": 1.95058719051447e-05,
"loss": 0.6829,
"step": 225
},
{
"epoch": 0.5044642857142857,
"grad_norm": 1.7533146142959595,
"learning_rate": 1.949800131424352e-05,
"loss": 0.6786,
"step": 226
},
{
"epoch": 0.5066964285714286,
"grad_norm": 1.9132986068725586,
"learning_rate": 1.9490070148683166e-05,
"loss": 0.7689,
"step": 227
},
{
"epoch": 0.5089285714285714,
"grad_norm": 1.6711753606796265,
"learning_rate": 1.9482078459045617e-05,
"loss": 0.7087,
"step": 228
},
{
"epoch": 0.5111607142857143,
"grad_norm": 2.01895809173584,
"learning_rate": 1.947402629629885e-05,
"loss": 0.7217,
"step": 229
},
{
"epoch": 0.5133928571428571,
"grad_norm": 2.0448741912841797,
"learning_rate": 1.9465913711796502e-05,
"loss": 0.7922,
"step": 230
},
{
"epoch": 0.515625,
"grad_norm": 2.043036460876465,
"learning_rate": 1.9457740757277577e-05,
"loss": 0.7573,
"step": 231
},
{
"epoch": 0.5178571428571429,
"grad_norm": 2.070568561553955,
"learning_rate": 1.9449507484866084e-05,
"loss": 0.8412,
"step": 232
},
{
"epoch": 0.5200892857142857,
"grad_norm": 1.930888295173645,
"learning_rate": 1.944121394707072e-05,
"loss": 0.8104,
"step": 233
},
{
"epoch": 0.5223214285714286,
"grad_norm": 2.184985876083374,
"learning_rate": 1.9432860196784533e-05,
"loss": 0.9096,
"step": 234
},
{
"epoch": 0.5245535714285714,
"grad_norm": 1.9199402332305908,
"learning_rate": 1.9424446287284576e-05,
"loss": 0.7141,
"step": 235
},
{
"epoch": 0.5267857142857143,
"grad_norm": 1.6737233400344849,
"learning_rate": 1.941597227223159e-05,
"loss": 0.712,
"step": 236
},
{
"epoch": 0.5290178571428571,
"grad_norm": 1.6949608325958252,
"learning_rate": 1.940743820566963e-05,
"loss": 0.7317,
"step": 237
},
{
"epoch": 0.53125,
"grad_norm": 1.8811354637145996,
"learning_rate": 1.9398844142025746e-05,
"loss": 0.7427,
"step": 238
},
{
"epoch": 0.5334821428571429,
"grad_norm": 2.093593120574951,
"learning_rate": 1.9390190136109625e-05,
"loss": 0.7851,
"step": 239
},
{
"epoch": 0.5357142857142857,
"grad_norm": 1.7398838996887207,
"learning_rate": 1.9381476243113243e-05,
"loss": 0.7885,
"step": 240
},
{
"epoch": 0.5379464285714286,
"grad_norm": 1.7277969121932983,
"learning_rate": 1.9372702518610512e-05,
"loss": 0.8121,
"step": 241
},
{
"epoch": 0.5401785714285714,
"grad_norm": 2.2008254528045654,
"learning_rate": 1.9363869018556928e-05,
"loss": 0.8755,
"step": 242
},
{
"epoch": 0.5424107142857143,
"grad_norm": 2.0191445350646973,
"learning_rate": 1.9354975799289215e-05,
"loss": 0.8049,
"step": 243
},
{
"epoch": 0.5446428571428571,
"grad_norm": 1.8473167419433594,
"learning_rate": 1.9346022917524958e-05,
"loss": 0.7737,
"step": 244
},
{
"epoch": 0.546875,
"grad_norm": 2.2000248432159424,
"learning_rate": 1.933701043036225e-05,
"loss": 0.79,
"step": 245
},
{
"epoch": 0.5491071428571429,
"grad_norm": 1.9332422018051147,
"learning_rate": 1.9327938395279325e-05,
"loss": 0.8249,
"step": 246
},
{
"epoch": 0.5513392857142857,
"grad_norm": 1.9443155527114868,
"learning_rate": 1.9318806870134194e-05,
"loss": 0.7453,
"step": 247
},
{
"epoch": 0.5535714285714286,
"grad_norm": 1.7897255420684814,
"learning_rate": 1.9309615913164262e-05,
"loss": 0.7778,
"step": 248
},
{
"epoch": 0.5558035714285714,
"grad_norm": 1.7514328956604004,
"learning_rate": 1.9300365582985984e-05,
"loss": 0.7577,
"step": 249
},
{
"epoch": 0.5580357142857143,
"grad_norm": 1.7380211353302002,
"learning_rate": 1.9291055938594464e-05,
"loss": 0.7522,
"step": 250
},
{
"epoch": 0.5602678571428571,
"grad_norm": 2.0598490238189697,
"learning_rate": 1.9281687039363088e-05,
"loss": 0.799,
"step": 251
},
{
"epoch": 0.5625,
"grad_norm": 2.061765670776367,
"learning_rate": 1.9272258945043154e-05,
"loss": 0.7477,
"step": 252
},
{
"epoch": 0.5647321428571429,
"grad_norm": 1.8268564939498901,
"learning_rate": 1.9262771715763483e-05,
"loss": 0.7743,
"step": 253
},
{
"epoch": 0.5669642857142857,
"grad_norm": 2.0371830463409424,
"learning_rate": 1.9253225412030028e-05,
"loss": 0.883,
"step": 254
},
{
"epoch": 0.5691964285714286,
"grad_norm": 1.8290431499481201,
"learning_rate": 1.924362009472551e-05,
"loss": 0.7619,
"step": 255
},
{
"epoch": 0.5714285714285714,
"grad_norm": 1.886815071105957,
"learning_rate": 1.9233955825109e-05,
"loss": 0.7959,
"step": 256
},
{
"epoch": 0.5736607142857143,
"grad_norm": 2.0654468536376953,
"learning_rate": 1.9224232664815563e-05,
"loss": 0.7899,
"step": 257
},
{
"epoch": 0.5758928571428571,
"grad_norm": 1.9492045640945435,
"learning_rate": 1.9214450675855832e-05,
"loss": 0.809,
"step": 258
},
{
"epoch": 0.578125,
"grad_norm": 1.9223597049713135,
"learning_rate": 1.9204609920615635e-05,
"loss": 0.7791,
"step": 259
},
{
"epoch": 0.5803571428571429,
"grad_norm": 1.9948698282241821,
"learning_rate": 1.919471046185558e-05,
"loss": 0.8161,
"step": 260
},
{
"epoch": 0.5825892857142857,
"grad_norm": 2.0385048389434814,
"learning_rate": 1.9184752362710674e-05,
"loss": 0.736,
"step": 261
},
{
"epoch": 0.5848214285714286,
"grad_norm": 2.2720816135406494,
"learning_rate": 1.917473568668991e-05,
"loss": 0.7706,
"step": 262
},
{
"epoch": 0.5870535714285714,
"grad_norm": 1.8173810243606567,
"learning_rate": 1.9164660497675848e-05,
"loss": 0.6735,
"step": 263
},
{
"epoch": 0.5892857142857143,
"grad_norm": 1.7594642639160156,
"learning_rate": 1.9154526859924242e-05,
"loss": 0.8137,
"step": 264
},
{
"epoch": 0.5915178571428571,
"grad_norm": 1.6718664169311523,
"learning_rate": 1.9144334838063595e-05,
"loss": 0.6624,
"step": 265
},
{
"epoch": 0.59375,
"grad_norm": 1.8529527187347412,
"learning_rate": 1.9134084497094766e-05,
"loss": 0.789,
"step": 266
},
{
"epoch": 0.5959821428571429,
"grad_norm": 2.0226452350616455,
"learning_rate": 1.9123775902390555e-05,
"loss": 0.8884,
"step": 267
},
{
"epoch": 0.5982142857142857,
"grad_norm": 1.9775701761245728,
"learning_rate": 1.9113409119695276e-05,
"loss": 0.6447,
"step": 268
},
{
"epoch": 0.6004464285714286,
"grad_norm": 1.7548224925994873,
"learning_rate": 1.9102984215124352e-05,
"loss": 0.6737,
"step": 269
},
{
"epoch": 0.6026785714285714,
"grad_norm": 1.7850803136825562,
"learning_rate": 1.9092501255163874e-05,
"loss": 0.6363,
"step": 270
},
{
"epoch": 0.6049107142857143,
"grad_norm": 1.8443948030471802,
"learning_rate": 1.9081960306670198e-05,
"loss": 0.7323,
"step": 271
},
{
"epoch": 0.6071428571428571,
"grad_norm": 1.8395159244537354,
"learning_rate": 1.907136143686951e-05,
"loss": 0.788,
"step": 272
},
{
"epoch": 0.609375,
"grad_norm": 1.7697207927703857,
"learning_rate": 1.9060704713357382e-05,
"loss": 0.7168,
"step": 273
},
{
"epoch": 0.6116071428571429,
"grad_norm": 1.7455432415008545,
"learning_rate": 1.904999020409837e-05,
"loss": 0.7696,
"step": 274
},
{
"epoch": 0.6138392857142857,
"grad_norm": 1.7991318702697754,
"learning_rate": 1.9039217977425567e-05,
"loss": 0.7197,
"step": 275
},
{
"epoch": 0.6160714285714286,
"grad_norm": 1.7043858766555786,
"learning_rate": 1.902838810204015e-05,
"loss": 0.7258,
"step": 276
},
{
"epoch": 0.6183035714285714,
"grad_norm": 1.7921115159988403,
"learning_rate": 1.901750064701097e-05,
"loss": 0.6635,
"step": 277
},
{
"epoch": 0.6205357142857143,
"grad_norm": 1.8393748998641968,
"learning_rate": 1.90065556817741e-05,
"loss": 0.7361,
"step": 278
},
{
"epoch": 0.6227678571428571,
"grad_norm": 1.4951876401901245,
"learning_rate": 1.8995553276132385e-05,
"loss": 0.6451,
"step": 279
},
{
"epoch": 0.625,
"grad_norm": 2.0124995708465576,
"learning_rate": 1.8984493500255e-05,
"loss": 0.9129,
"step": 280
},
{
"epoch": 0.6272321428571429,
"grad_norm": 1.8670498132705688,
"learning_rate": 1.8973376424677022e-05,
"loss": 0.7747,
"step": 281
},
{
"epoch": 0.6294642857142857,
"grad_norm": 1.8474571704864502,
"learning_rate": 1.8962202120298948e-05,
"loss": 0.7649,
"step": 282
},
{
"epoch": 0.6316964285714286,
"grad_norm": 2.081151247024536,
"learning_rate": 1.8950970658386262e-05,
"loss": 0.7737,
"step": 283
},
{
"epoch": 0.6339285714285714,
"grad_norm": 1.9373351335525513,
"learning_rate": 1.8939682110568982e-05,
"loss": 0.7365,
"step": 284
},
{
"epoch": 0.6361607142857143,
"grad_norm": 1.9412529468536377,
"learning_rate": 1.8928336548841197e-05,
"loss": 0.6813,
"step": 285
},
{
"epoch": 0.6383928571428571,
"grad_norm": 1.8421021699905396,
"learning_rate": 1.8916934045560603e-05,
"loss": 0.7973,
"step": 286
},
{
"epoch": 0.640625,
"grad_norm": 1.9929094314575195,
"learning_rate": 1.8905474673448055e-05,
"loss": 0.6829,
"step": 287
},
{
"epoch": 0.6428571428571429,
"grad_norm": 1.6566269397735596,
"learning_rate": 1.8893958505587093e-05,
"loss": 0.6942,
"step": 288
},
{
"epoch": 0.6450892857142857,
"grad_norm": 2.019409656524658,
"learning_rate": 1.8882385615423477e-05,
"loss": 0.767,
"step": 289
},
{
"epoch": 0.6473214285714286,
"grad_norm": 1.7549775838851929,
"learning_rate": 1.8870756076764728e-05,
"loss": 0.7294,
"step": 290
},
{
"epoch": 0.6495535714285714,
"grad_norm": 1.7515791654586792,
"learning_rate": 1.8859069963779636e-05,
"loss": 0.7496,
"step": 291
},
{
"epoch": 0.6517857142857143,
"grad_norm": 1.8197311162948608,
"learning_rate": 1.8847327350997814e-05,
"loss": 0.6977,
"step": 292
},
{
"epoch": 0.6540178571428571,
"grad_norm": 1.6706933975219727,
"learning_rate": 1.88355283133092e-05,
"loss": 0.7149,
"step": 293
},
{
"epoch": 0.65625,
"grad_norm": 1.9751033782958984,
"learning_rate": 1.8823672925963598e-05,
"loss": 0.7942,
"step": 294
},
{
"epoch": 0.6584821428571429,
"grad_norm": 1.8076329231262207,
"learning_rate": 1.8811761264570177e-05,
"loss": 0.7787,
"step": 295
},
{
"epoch": 0.6607142857142857,
"grad_norm": 1.9289543628692627,
"learning_rate": 1.879979340509701e-05,
"loss": 0.7987,
"step": 296
},
{
"epoch": 0.6629464285714286,
"grad_norm": 1.650168538093567,
"learning_rate": 1.8787769423870583e-05,
"loss": 0.6981,
"step": 297
},
{
"epoch": 0.6651785714285714,
"grad_norm": 2.0561366081237793,
"learning_rate": 1.877568939757529e-05,
"loss": 0.7365,
"step": 298
},
{
"epoch": 0.6674107142857143,
"grad_norm": 1.8385493755340576,
"learning_rate": 1.8763553403252975e-05,
"loss": 0.5859,
"step": 299
},
{
"epoch": 0.6696428571428571,
"grad_norm": 1.7286657094955444,
"learning_rate": 1.8751361518302413e-05,
"loss": 0.7328,
"step": 300
},
{
"epoch": 0.671875,
"grad_norm": 1.8077149391174316,
"learning_rate": 1.873911382047884e-05,
"loss": 0.8031,
"step": 301
},
{
"epoch": 0.6741071428571429,
"grad_norm": 1.7272533178329468,
"learning_rate": 1.8726810387893438e-05,
"loss": 0.6546,
"step": 302
},
{
"epoch": 0.6763392857142857,
"grad_norm": 1.9572628736495972,
"learning_rate": 1.871445129901284e-05,
"loss": 0.8659,
"step": 303
},
{
"epoch": 0.6785714285714286,
"grad_norm": 1.978677749633789,
"learning_rate": 1.8702036632658646e-05,
"loss": 0.7295,
"step": 304
},
{
"epoch": 0.6808035714285714,
"grad_norm": 1.81928288936615,
"learning_rate": 1.8689566468006898e-05,
"loss": 0.7568,
"step": 305
},
{
"epoch": 0.6830357142857143,
"grad_norm": 1.6536098718643188,
"learning_rate": 1.867704088458759e-05,
"loss": 0.7303,
"step": 306
},
{
"epoch": 0.6852678571428571,
"grad_norm": 1.989862084388733,
"learning_rate": 1.866445996228415e-05,
"loss": 0.7545,
"step": 307
},
{
"epoch": 0.6875,
"grad_norm": 1.7992223501205444,
"learning_rate": 1.8651823781332948e-05,
"loss": 0.7724,
"step": 308
},
{
"epoch": 0.6897321428571429,
"grad_norm": 1.92568039894104,
"learning_rate": 1.863913242232276e-05,
"loss": 0.7042,
"step": 309
},
{
"epoch": 0.6919642857142857,
"grad_norm": 1.8733493089675903,
"learning_rate": 1.8626385966194275e-05,
"loss": 0.7978,
"step": 310
},
{
"epoch": 0.6941964285714286,
"grad_norm": 1.7693002223968506,
"learning_rate": 1.8613584494239568e-05,
"loss": 0.7821,
"step": 311
},
{
"epoch": 0.6964285714285714,
"grad_norm": 1.6573666334152222,
"learning_rate": 1.8600728088101587e-05,
"loss": 0.7033,
"step": 312
},
{
"epoch": 0.6986607142857143,
"grad_norm": 2.043008327484131,
"learning_rate": 1.858781682977362e-05,
"loss": 0.6474,
"step": 313
},
{
"epoch": 0.7008928571428571,
"grad_norm": 1.755703330039978,
"learning_rate": 1.857485080159879e-05,
"loss": 0.8343,
"step": 314
},
{
"epoch": 0.703125,
"grad_norm": 2.0336127281188965,
"learning_rate": 1.8561830086269524e-05,
"loss": 0.8187,
"step": 315
},
{
"epoch": 0.7053571428571429,
"grad_norm": 1.6667089462280273,
"learning_rate": 1.8548754766827016e-05,
"loss": 0.6551,
"step": 316
},
{
"epoch": 0.7075892857142857,
"grad_norm": 1.6601287126541138,
"learning_rate": 1.8535624926660707e-05,
"loss": 0.762,
"step": 317
},
{
"epoch": 0.7098214285714286,
"grad_norm": 1.8507710695266724,
"learning_rate": 1.852244064950775e-05,
"loss": 0.7247,
"step": 318
},
{
"epoch": 0.7120535714285714,
"grad_norm": 1.7145278453826904,
"learning_rate": 1.8509202019452472e-05,
"loss": 0.6654,
"step": 319
},
{
"epoch": 0.7142857142857143,
"grad_norm": 2.0679047107696533,
"learning_rate": 1.8495909120925857e-05,
"loss": 0.7885,
"step": 320
},
{
"epoch": 0.7165178571428571,
"grad_norm": 1.8965860605239868,
"learning_rate": 1.8482562038704975e-05,
"loss": 0.6821,
"step": 321
},
{
"epoch": 0.71875,
"grad_norm": 2.1363813877105713,
"learning_rate": 1.846916085791247e-05,
"loss": 0.8906,
"step": 322
},
{
"epoch": 0.7209821428571429,
"grad_norm": 1.9793356657028198,
"learning_rate": 1.8455705664016003e-05,
"loss": 0.7592,
"step": 323
},
{
"epoch": 0.7232142857142857,
"grad_norm": 1.793078899383545,
"learning_rate": 1.8442196542827712e-05,
"loss": 0.7786,
"step": 324
},
{
"epoch": 0.7254464285714286,
"grad_norm": 2.359466314315796,
"learning_rate": 1.8428633580503658e-05,
"loss": 0.9417,
"step": 325
},
{
"epoch": 0.7276785714285714,
"grad_norm": 2.090745210647583,
"learning_rate": 1.8415016863543286e-05,
"loss": 0.8483,
"step": 326
},
{
"epoch": 0.7299107142857143,
"grad_norm": 2.008620500564575,
"learning_rate": 1.8401346478788865e-05,
"loss": 0.8119,
"step": 327
},
{
"epoch": 0.7321428571428571,
"grad_norm": 2.027491569519043,
"learning_rate": 1.8387622513424942e-05,
"loss": 0.7877,
"step": 328
},
{
"epoch": 0.734375,
"grad_norm": 1.982364296913147,
"learning_rate": 1.8373845054977764e-05,
"loss": 0.7336,
"step": 329
},
{
"epoch": 0.7366071428571429,
"grad_norm": 1.8951386213302612,
"learning_rate": 1.836001419131476e-05,
"loss": 0.7059,
"step": 330
},
{
"epoch": 0.7388392857142857,
"grad_norm": 1.8096810579299927,
"learning_rate": 1.834613001064394e-05,
"loss": 0.6819,
"step": 331
},
{
"epoch": 0.7410714285714286,
"grad_norm": 1.936662197113037,
"learning_rate": 1.8332192601513358e-05,
"loss": 0.8011,
"step": 332
},
{
"epoch": 0.7433035714285714,
"grad_norm": 1.7710180282592773,
"learning_rate": 1.8318202052810538e-05,
"loss": 0.7537,
"step": 333
},
{
"epoch": 0.7455357142857143,
"grad_norm": 1.7253098487854004,
"learning_rate": 1.8304158453761904e-05,
"loss": 0.6547,
"step": 334
},
{
"epoch": 0.7477678571428571,
"grad_norm": 1.9151325225830078,
"learning_rate": 1.829006189393222e-05,
"loss": 0.7737,
"step": 335
},
{
"epoch": 0.75,
"grad_norm": 1.715453028678894,
"learning_rate": 1.827591246322401e-05,
"loss": 0.7343,
"step": 336
},
{
"epoch": 0.75,
"eval_loss": 0.7154207229614258,
"eval_runtime": 43.0546,
"eval_samples_per_second": 1.696,
"eval_steps_per_second": 0.232,
"step": 336
},
{
"epoch": 0.7522321428571429,
"grad_norm": 1.6917095184326172,
"learning_rate": 1.8261710251876993e-05,
"loss": 0.6706,
"step": 337
},
{
"epoch": 0.7544642857142857,
"grad_norm": 1.8422549962997437,
"learning_rate": 1.8247455350467496e-05,
"loss": 0.7681,
"step": 338
},
{
"epoch": 0.7566964285714286,
"grad_norm": 1.6443406343460083,
"learning_rate": 1.8233147849907894e-05,
"loss": 0.6611,
"step": 339
},
{
"epoch": 0.7589285714285714,
"grad_norm": 2.035898447036743,
"learning_rate": 1.8218787841446003e-05,
"loss": 0.7388,
"step": 340
},
{
"epoch": 0.7611607142857143,
"grad_norm": 1.7699940204620361,
"learning_rate": 1.8204375416664536e-05,
"loss": 0.7281,
"step": 341
},
{
"epoch": 0.7633928571428571,
"grad_norm": 1.7037780284881592,
"learning_rate": 1.8189910667480476e-05,
"loss": 0.6242,
"step": 342
},
{
"epoch": 0.765625,
"grad_norm": 1.887932538986206,
"learning_rate": 1.8175393686144524e-05,
"loss": 0.7796,
"step": 343
},
{
"epoch": 0.7678571428571429,
"grad_norm": 1.8445992469787598,
"learning_rate": 1.8160824565240495e-05,
"loss": 0.7814,
"step": 344
},
{
"epoch": 0.7700892857142857,
"grad_norm": 1.56995689868927,
"learning_rate": 1.8146203397684734e-05,
"loss": 0.65,
"step": 345
},
{
"epoch": 0.7723214285714286,
"grad_norm": 1.7116246223449707,
"learning_rate": 1.8131530276725514e-05,
"loss": 0.7689,
"step": 346
},
{
"epoch": 0.7745535714285714,
"grad_norm": 1.7758585214614868,
"learning_rate": 1.811680529594245e-05,
"loss": 0.7553,
"step": 347
},
{
"epoch": 0.7767857142857143,
"grad_norm": 1.7060250043869019,
"learning_rate": 1.8102028549245894e-05,
"loss": 0.7016,
"step": 348
},
{
"epoch": 0.7790178571428571,
"grad_norm": 1.8875247240066528,
"learning_rate": 1.808720013087635e-05,
"loss": 0.7161,
"step": 349
},
{
"epoch": 0.78125,
"grad_norm": 1.7203177213668823,
"learning_rate": 1.8072320135403862e-05,
"loss": 0.6379,
"step": 350
},
{
"epoch": 0.7834821428571429,
"grad_norm": 1.9935482740402222,
"learning_rate": 1.805738865772741e-05,
"loss": 0.7931,
"step": 351
},
{
"epoch": 0.7857142857142857,
"grad_norm": 1.6885210275650024,
"learning_rate": 1.804240579307431e-05,
"loss": 0.7497,
"step": 352
},
{
"epoch": 0.7879464285714286,
"grad_norm": 1.717721939086914,
"learning_rate": 1.8027371636999605e-05,
"loss": 0.6567,
"step": 353
},
{
"epoch": 0.7901785714285714,
"grad_norm": 2.2966055870056152,
"learning_rate": 1.8012286285385456e-05,
"loss": 0.9328,
"step": 354
},
{
"epoch": 0.7924107142857143,
"grad_norm": 2.057353973388672,
"learning_rate": 1.7997149834440527e-05,
"loss": 0.7644,
"step": 355
},
{
"epoch": 0.7946428571428571,
"grad_norm": 1.7968207597732544,
"learning_rate": 1.7981962380699376e-05,
"loss": 0.758,
"step": 356
},
{
"epoch": 0.796875,
"grad_norm": 1.8473401069641113,
"learning_rate": 1.7966724021021837e-05,
"loss": 0.6907,
"step": 357
},
{
"epoch": 0.7991071428571429,
"grad_norm": 1.7238281965255737,
"learning_rate": 1.7951434852592406e-05,
"loss": 0.7409,
"step": 358
},
{
"epoch": 0.8013392857142857,
"grad_norm": 1.6485793590545654,
"learning_rate": 1.793609497291961e-05,
"loss": 0.6849,
"step": 359
},
{
"epoch": 0.8035714285714286,
"grad_norm": 1.7897621393203735,
"learning_rate": 1.79207044798354e-05,
"loss": 0.9022,
"step": 360
},
{
"epoch": 0.8058035714285714,
"grad_norm": 2.24227237701416,
"learning_rate": 1.7905263471494522e-05,
"loss": 0.906,
"step": 361
},
{
"epoch": 0.8080357142857143,
"grad_norm": 1.8071457147598267,
"learning_rate": 1.788977204637388e-05,
"loss": 0.6459,
"step": 362
},
{
"epoch": 0.8102678571428571,
"grad_norm": 1.894426941871643,
"learning_rate": 1.7874230303271932e-05,
"loss": 0.9378,
"step": 363
},
{
"epoch": 0.8125,
"grad_norm": 1.6523385047912598,
"learning_rate": 1.7858638341308026e-05,
"loss": 0.7221,
"step": 364
},
{
"epoch": 0.8147321428571429,
"grad_norm": 1.9723589420318604,
"learning_rate": 1.78429962599218e-05,
"loss": 0.8942,
"step": 365
},
{
"epoch": 0.8169642857142857,
"grad_norm": 1.8695416450500488,
"learning_rate": 1.7827304158872538e-05,
"loss": 0.6494,
"step": 366
},
{
"epoch": 0.8191964285714286,
"grad_norm": 1.7047300338745117,
"learning_rate": 1.7811562138238508e-05,
"loss": 0.6725,
"step": 367
},
{
"epoch": 0.8214285714285714,
"grad_norm": 1.707351803779602,
"learning_rate": 1.779577029841638e-05,
"loss": 0.7854,
"step": 368
},
{
"epoch": 0.8236607142857143,
"grad_norm": 1.9581531286239624,
"learning_rate": 1.7779928740120525e-05,
"loss": 0.8307,
"step": 369
},
{
"epoch": 0.8258928571428571,
"grad_norm": 1.608521580696106,
"learning_rate": 1.776403756438241e-05,
"loss": 0.6982,
"step": 370
},
{
"epoch": 0.828125,
"grad_norm": 2.188683032989502,
"learning_rate": 1.774809687254994e-05,
"loss": 0.8912,
"step": 371
},
{
"epoch": 0.8303571428571429,
"grad_norm": 2.287449598312378,
"learning_rate": 1.773210676628682e-05,
"loss": 0.9133,
"step": 372
},
{
"epoch": 0.8325892857142857,
"grad_norm": 1.6350460052490234,
"learning_rate": 1.77160673475719e-05,
"loss": 0.6556,
"step": 373
},
{
"epoch": 0.8348214285714286,
"grad_norm": 1.6731687784194946,
"learning_rate": 1.769997871869852e-05,
"loss": 0.6751,
"step": 374
},
{
"epoch": 0.8370535714285714,
"grad_norm": 2.1170239448547363,
"learning_rate": 1.768384098227387e-05,
"loss": 0.797,
"step": 375
},
{
"epoch": 0.8392857142857143,
"grad_norm": 1.8839830160140991,
"learning_rate": 1.7667654241218332e-05,
"loss": 0.7856,
"step": 376
},
{
"epoch": 0.8415178571428571,
"grad_norm": 1.9298778772354126,
"learning_rate": 1.765141859876481e-05,
"loss": 0.7929,
"step": 377
},
{
"epoch": 0.84375,
"grad_norm": 2.2410686016082764,
"learning_rate": 1.7635134158458095e-05,
"loss": 0.8097,
"step": 378
},
{
"epoch": 0.8459821428571429,
"grad_norm": 1.8766001462936401,
"learning_rate": 1.7618801024154186e-05,
"loss": 0.7552,
"step": 379
},
{
"epoch": 0.8482142857142857,
"grad_norm": 1.960311770439148,
"learning_rate": 1.7602419300019627e-05,
"loss": 0.7243,
"step": 380
},
{
"epoch": 0.8504464285714286,
"grad_norm": 1.9935823678970337,
"learning_rate": 1.758598909053087e-05,
"loss": 0.7236,
"step": 381
},
{
"epoch": 0.8526785714285714,
"grad_norm": 1.8538720607757568,
"learning_rate": 1.7569510500473566e-05,
"loss": 0.742,
"step": 382
},
{
"epoch": 0.8549107142857143,
"grad_norm": 1.5118043422698975,
"learning_rate": 1.7552983634941928e-05,
"loss": 0.5574,
"step": 383
},
{
"epoch": 0.8571428571428571,
"grad_norm": 1.9784835577011108,
"learning_rate": 1.753640859933806e-05,
"loss": 0.7678,
"step": 384
},
{
"epoch": 0.859375,
"grad_norm": 2.0069949626922607,
"learning_rate": 1.751978549937126e-05,
"loss": 0.8437,
"step": 385
},
{
"epoch": 0.8616071428571429,
"grad_norm": 1.858752965927124,
"learning_rate": 1.7503114441057374e-05,
"loss": 0.7793,
"step": 386
},
{
"epoch": 0.8638392857142857,
"grad_norm": 1.7572165727615356,
"learning_rate": 1.7486395530718104e-05,
"loss": 0.8084,
"step": 387
},
{
"epoch": 0.8660714285714286,
"grad_norm": 1.585492491722107,
"learning_rate": 1.746962887498034e-05,
"loss": 0.6839,
"step": 388
},
{
"epoch": 0.8683035714285714,
"grad_norm": 1.8265163898468018,
"learning_rate": 1.7452814580775467e-05,
"loss": 0.7074,
"step": 389
},
{
"epoch": 0.8705357142857143,
"grad_norm": 1.6596579551696777,
"learning_rate": 1.743595275533869e-05,
"loss": 0.6543,
"step": 390
},
{
"epoch": 0.8727678571428571,
"grad_norm": 1.6813689470291138,
"learning_rate": 1.7419043506208348e-05,
"loss": 0.804,
"step": 391
},
{
"epoch": 0.875,
"grad_norm": 1.6294770240783691,
"learning_rate": 1.7402086941225246e-05,
"loss": 0.6623,
"step": 392
},
{
"epoch": 0.8772321428571429,
"grad_norm": 1.7459831237792969,
"learning_rate": 1.7385083168531934e-05,
"loss": 0.7403,
"step": 393
},
{
"epoch": 0.8794642857142857,
"grad_norm": 1.845110297203064,
"learning_rate": 1.736803229657204e-05,
"loss": 0.7605,
"step": 394
},
{
"epoch": 0.8816964285714286,
"grad_norm": 1.690212368965149,
"learning_rate": 1.7350934434089583e-05,
"loss": 0.6052,
"step": 395
},
{
"epoch": 0.8839285714285714,
"grad_norm": 1.7885936498641968,
"learning_rate": 1.7333789690128252e-05,
"loss": 0.844,
"step": 396
},
{
"epoch": 0.8861607142857143,
"grad_norm": 1.8944475650787354,
"learning_rate": 1.7316598174030746e-05,
"loss": 0.7689,
"step": 397
},
{
"epoch": 0.8883928571428571,
"grad_norm": 2.1152751445770264,
"learning_rate": 1.7299359995438046e-05,
"loss": 0.8567,
"step": 398
},
{
"epoch": 0.890625,
"grad_norm": 1.7011525630950928,
"learning_rate": 1.728207526428873e-05,
"loss": 0.7467,
"step": 399
},
{
"epoch": 0.8928571428571429,
"grad_norm": 1.963889479637146,
"learning_rate": 1.7264744090818284e-05,
"loss": 0.8,
"step": 400
},
{
"epoch": 0.8950892857142857,
"grad_norm": 1.824885606765747,
"learning_rate": 1.7247366585558366e-05,
"loss": 0.7644,
"step": 401
},
{
"epoch": 0.8973214285714286,
"grad_norm": 1.8513473272323608,
"learning_rate": 1.7229942859336142e-05,
"loss": 0.8467,
"step": 402
},
{
"epoch": 0.8995535714285714,
"grad_norm": 1.688294529914856,
"learning_rate": 1.7212473023273532e-05,
"loss": 0.6652,
"step": 403
},
{
"epoch": 0.9017857142857143,
"grad_norm": 1.7810540199279785,
"learning_rate": 1.719495718878655e-05,
"loss": 0.7861,
"step": 404
},
{
"epoch": 0.9040178571428571,
"grad_norm": 1.7689653635025024,
"learning_rate": 1.7177395467584564e-05,
"loss": 0.7411,
"step": 405
},
{
"epoch": 0.90625,
"grad_norm": 2.2160940170288086,
"learning_rate": 1.7159787971669586e-05,
"loss": 0.777,
"step": 406
},
{
"epoch": 0.9084821428571429,
"grad_norm": 2.250462770462036,
"learning_rate": 1.7142134813335557e-05,
"loss": 0.8158,
"step": 407
},
{
"epoch": 0.9107142857142857,
"grad_norm": 1.9748404026031494,
"learning_rate": 1.712443610516765e-05,
"loss": 0.7991,
"step": 408
},
{
"epoch": 0.9129464285714286,
"grad_norm": 2.171666145324707,
"learning_rate": 1.7106691960041527e-05,
"loss": 0.8593,
"step": 409
},
{
"epoch": 0.9151785714285714,
"grad_norm": 1.7081716060638428,
"learning_rate": 1.7088902491122636e-05,
"loss": 0.7543,
"step": 410
},
{
"epoch": 0.9174107142857143,
"grad_norm": 1.9155141115188599,
"learning_rate": 1.7071067811865477e-05,
"loss": 0.8376,
"step": 411
},
{
"epoch": 0.9196428571428571,
"grad_norm": 2.0130817890167236,
"learning_rate": 1.7053188036012885e-05,
"loss": 0.66,
"step": 412
},
{
"epoch": 0.921875,
"grad_norm": 1.888626217842102,
"learning_rate": 1.7035263277595314e-05,
"loss": 0.6913,
"step": 413
},
{
"epoch": 0.9241071428571429,
"grad_norm": 1.5536619424819946,
"learning_rate": 1.7017293650930083e-05,
"loss": 0.7703,
"step": 414
},
{
"epoch": 0.9263392857142857,
"grad_norm": 1.720703363418579,
"learning_rate": 1.6999279270620675e-05,
"loss": 0.7862,
"step": 415
},
{
"epoch": 0.9285714285714286,
"grad_norm": 1.6617650985717773,
"learning_rate": 1.6981220251555996e-05,
"loss": 0.7429,
"step": 416
},
{
"epoch": 0.9308035714285714,
"grad_norm": 2.0875959396362305,
"learning_rate": 1.6963116708909637e-05,
"loss": 0.7905,
"step": 417
},
{
"epoch": 0.9330357142857143,
"grad_norm": 1.4865297079086304,
"learning_rate": 1.6944968758139144e-05,
"loss": 0.7061,
"step": 418
},
{
"epoch": 0.9352678571428571,
"grad_norm": 1.7242523431777954,
"learning_rate": 1.6926776514985278e-05,
"loss": 0.7275,
"step": 419
},
{
"epoch": 0.9375,
"grad_norm": 1.7873682975769043,
"learning_rate": 1.6908540095471288e-05,
"loss": 0.7436,
"step": 420
},
{
"epoch": 0.9397321428571429,
"grad_norm": 1.887552261352539,
"learning_rate": 1.6890259615902153e-05,
"loss": 0.8187,
"step": 421
},
{
"epoch": 0.9419642857142857,
"grad_norm": 1.7543212175369263,
"learning_rate": 1.6871935192863862e-05,
"loss": 0.6981,
"step": 422
},
{
"epoch": 0.9441964285714286,
"grad_norm": 1.7617319822311401,
"learning_rate": 1.6853566943222647e-05,
"loss": 0.801,
"step": 423
},
{
"epoch": 0.9464285714285714,
"grad_norm": 1.8142614364624023,
"learning_rate": 1.6835154984124266e-05,
"loss": 0.7469,
"step": 424
},
{
"epoch": 0.9486607142857143,
"grad_norm": 1.6476774215698242,
"learning_rate": 1.6816699432993212e-05,
"loss": 0.8173,
"step": 425
},
{
"epoch": 0.9508928571428571,
"grad_norm": 2.0289416313171387,
"learning_rate": 1.6798200407532025e-05,
"loss": 0.9145,
"step": 426
},
{
"epoch": 0.953125,
"grad_norm": 1.8370226621627808,
"learning_rate": 1.677965802572048e-05,
"loss": 0.748,
"step": 427
},
{
"epoch": 0.9553571428571429,
"grad_norm": 1.7355858087539673,
"learning_rate": 1.676107240581488e-05,
"loss": 0.7599,
"step": 428
},
{
"epoch": 0.9575892857142857,
"grad_norm": 1.7881946563720703,
"learning_rate": 1.674244366634727e-05,
"loss": 0.7741,
"step": 429
},
{
"epoch": 0.9598214285714286,
"grad_norm": 2.2131218910217285,
"learning_rate": 1.6723771926124704e-05,
"loss": 0.9298,
"step": 430
},
{
"epoch": 0.9620535714285714,
"grad_norm": 1.5364829301834106,
"learning_rate": 1.6705057304228488e-05,
"loss": 0.6597,
"step": 431
},
{
"epoch": 0.9642857142857143,
"grad_norm": 1.8031491041183472,
"learning_rate": 1.6686299920013388e-05,
"loss": 0.7577,
"step": 432
},
{
"epoch": 0.9665178571428571,
"grad_norm": 1.596772313117981,
"learning_rate": 1.666749989310691e-05,
"loss": 0.728,
"step": 433
},
{
"epoch": 0.96875,
"grad_norm": 1.614382028579712,
"learning_rate": 1.6648657343408517e-05,
"loss": 0.7294,
"step": 434
},
{
"epoch": 0.9709821428571429,
"grad_norm": 1.7294843196868896,
"learning_rate": 1.6629772391088855e-05,
"loss": 0.7244,
"step": 435
},
{
"epoch": 0.9732142857142857,
"grad_norm": 1.9231197834014893,
"learning_rate": 1.661084515658901e-05,
"loss": 0.81,
"step": 436
},
{
"epoch": 0.9754464285714286,
"grad_norm": 1.6754019260406494,
"learning_rate": 1.6591875760619718e-05,
"loss": 0.6205,
"step": 437
},
{
"epoch": 0.9776785714285714,
"grad_norm": 2.166166067123413,
"learning_rate": 1.6572864324160617e-05,
"loss": 0.8231,
"step": 438
},
{
"epoch": 0.9799107142857143,
"grad_norm": 1.67359459400177,
"learning_rate": 1.6553810968459455e-05,
"loss": 0.6678,
"step": 439
},
{
"epoch": 0.9821428571428571,
"grad_norm": 1.8316197395324707,
"learning_rate": 1.6534715815031325e-05,
"loss": 0.7779,
"step": 440
},
{
"epoch": 0.984375,
"grad_norm": 1.8748825788497925,
"learning_rate": 1.651557898565789e-05,
"loss": 0.7911,
"step": 441
},
{
"epoch": 0.9866071428571429,
"grad_norm": 1.6567225456237793,
"learning_rate": 1.649640060238661e-05,
"loss": 0.7287,
"step": 442
},
{
"epoch": 0.9888392857142857,
"grad_norm": 1.8699995279312134,
"learning_rate": 1.6477180787529957e-05,
"loss": 0.7821,
"step": 443
},
{
"epoch": 0.9910714285714286,
"grad_norm": 1.7357499599456787,
"learning_rate": 1.645791966366464e-05,
"loss": 0.6641,
"step": 444
},
{
"epoch": 0.9933035714285714,
"grad_norm": 1.7094175815582275,
"learning_rate": 1.6438617353630823e-05,
"loss": 0.7252,
"step": 445
},
{
"epoch": 0.9955357142857143,
"grad_norm": 1.7703126668930054,
"learning_rate": 1.6419273980531333e-05,
"loss": 0.8532,
"step": 446
},
{
"epoch": 0.9977678571428571,
"grad_norm": 1.5840541124343872,
"learning_rate": 1.6399889667730887e-05,
"loss": 0.6525,
"step": 447
},
{
"epoch": 1.0,
"grad_norm": 1.72904634475708,
"learning_rate": 1.63804645388553e-05,
"loss": 0.6731,
"step": 448
},
{
"epoch": 1.0,
"eval_loss": 0.7064380645751953,
"eval_runtime": 45.5564,
"eval_samples_per_second": 1.602,
"eval_steps_per_second": 0.22,
"step": 448
},
{
"epoch": 1.0022321428571428,
"grad_norm": 1.7170028686523438,
"learning_rate": 1.6360998717790694e-05,
"loss": 0.532,
"step": 449
},
{
"epoch": 1.0044642857142858,
"grad_norm": 2.1551966667175293,
"learning_rate": 1.6341492328682703e-05,
"loss": 0.6989,
"step": 450
},
{
"epoch": 1.0066964285714286,
"grad_norm": 1.7585958242416382,
"learning_rate": 1.6321945495935717e-05,
"loss": 0.6666,
"step": 451
},
{
"epoch": 1.0089285714285714,
"grad_norm": 1.7315536737442017,
"learning_rate": 1.6302358344212025e-05,
"loss": 0.5661,
"step": 452
},
{
"epoch": 1.0111607142857142,
"grad_norm": 1.7567814588546753,
"learning_rate": 1.6282730998431072e-05,
"loss": 0.6854,
"step": 453
},
{
"epoch": 1.0133928571428572,
"grad_norm": 1.8460400104522705,
"learning_rate": 1.6263063583768652e-05,
"loss": 0.5733,
"step": 454
},
{
"epoch": 1.015625,
"grad_norm": 1.7143034934997559,
"learning_rate": 1.624335622565609e-05,
"loss": 0.5911,
"step": 455
},
{
"epoch": 1.0178571428571428,
"grad_norm": 1.7223514318466187,
"learning_rate": 1.622360904977946e-05,
"loss": 0.5492,
"step": 456
},
{
"epoch": 1.0200892857142858,
"grad_norm": 2.3188822269439697,
"learning_rate": 1.6203822182078777e-05,
"loss": 0.7361,
"step": 457
},
{
"epoch": 1.0223214285714286,
"grad_norm": 1.8782273530960083,
"learning_rate": 1.6183995748747204e-05,
"loss": 0.65,
"step": 458
},
{
"epoch": 1.0245535714285714,
"grad_norm": 1.8567129373550415,
"learning_rate": 1.6164129876230226e-05,
"loss": 0.5537,
"step": 459
},
{
"epoch": 1.0267857142857142,
"grad_norm": 1.7850383520126343,
"learning_rate": 1.6144224691224868e-05,
"loss": 0.6298,
"step": 460
},
{
"epoch": 1.0290178571428572,
"grad_norm": 1.7569608688354492,
"learning_rate": 1.6124280320678864e-05,
"loss": 0.649,
"step": 461
},
{
"epoch": 1.03125,
"grad_norm": 2.0803897380828857,
"learning_rate": 1.6104296891789867e-05,
"loss": 0.744,
"step": 462
},
{
"epoch": 1.0334821428571428,
"grad_norm": 1.786569595336914,
"learning_rate": 1.608427453200463e-05,
"loss": 0.5691,
"step": 463
},
{
"epoch": 1.0357142857142858,
"grad_norm": 1.8641101121902466,
"learning_rate": 1.606421336901818e-05,
"loss": 0.6681,
"step": 464
},
{
"epoch": 1.0379464285714286,
"grad_norm": 2.0264892578125,
"learning_rate": 1.6044113530773034e-05,
"loss": 0.7279,
"step": 465
},
{
"epoch": 1.0401785714285714,
"grad_norm": 1.8726037740707397,
"learning_rate": 1.6023975145458352e-05,
"loss": 0.5828,
"step": 466
},
{
"epoch": 1.0424107142857142,
"grad_norm": 1.6226308345794678,
"learning_rate": 1.600379834150914e-05,
"loss": 0.4913,
"step": 467
},
{
"epoch": 1.0446428571428572,
"grad_norm": 1.677682638168335,
"learning_rate": 1.5983583247605414e-05,
"loss": 0.5904,
"step": 468
},
{
"epoch": 1.046875,
"grad_norm": 2.0546329021453857,
"learning_rate": 1.5963329992671402e-05,
"loss": 0.7253,
"step": 469
},
{
"epoch": 1.0491071428571428,
"grad_norm": 2.0313217639923096,
"learning_rate": 1.5943038705874697e-05,
"loss": 0.6989,
"step": 470
},
{
"epoch": 1.0513392857142858,
"grad_norm": 1.7999160289764404,
"learning_rate": 1.5922709516625453e-05,
"loss": 0.6103,
"step": 471
},
{
"epoch": 1.0535714285714286,
"grad_norm": 1.782199501991272,
"learning_rate": 1.590234255457555e-05,
"loss": 0.6372,
"step": 472
},
{
"epoch": 1.0558035714285714,
"grad_norm": 2.0932509899139404,
"learning_rate": 1.588193794961776e-05,
"loss": 0.6486,
"step": 473
},
{
"epoch": 1.0580357142857142,
"grad_norm": 1.6826951503753662,
"learning_rate": 1.5861495831884942e-05,
"loss": 0.5463,
"step": 474
},
{
"epoch": 1.0602678571428572,
"grad_norm": 1.6843758821487427,
"learning_rate": 1.5841016331749185e-05,
"loss": 0.5937,
"step": 475
},
{
"epoch": 1.0625,
"grad_norm": 1.8449771404266357,
"learning_rate": 1.582049957982099e-05,
"loss": 0.5093,
"step": 476
},
{
"epoch": 1.0647321428571428,
"grad_norm": 1.8449821472167969,
"learning_rate": 1.5799945706948447e-05,
"loss": 0.5731,
"step": 477
},
{
"epoch": 1.0669642857142858,
"grad_norm": 1.7647595405578613,
"learning_rate": 1.5779354844216377e-05,
"loss": 0.5962,
"step": 478
},
{
"epoch": 1.0691964285714286,
"grad_norm": 1.7663452625274658,
"learning_rate": 1.5758727122945514e-05,
"loss": 0.6871,
"step": 479
},
{
"epoch": 1.0714285714285714,
"grad_norm": 1.7700406312942505,
"learning_rate": 1.5738062674691657e-05,
"loss": 0.6388,
"step": 480
},
{
"epoch": 1.0736607142857142,
"grad_norm": 1.9175291061401367,
"learning_rate": 1.5717361631244842e-05,
"loss": 0.6238,
"step": 481
},
{
"epoch": 1.0758928571428572,
"grad_norm": 1.8494954109191895,
"learning_rate": 1.5696624124628495e-05,
"loss": 0.6641,
"step": 482
},
{
"epoch": 1.078125,
"grad_norm": 2.086261034011841,
"learning_rate": 1.5675850287098585e-05,
"loss": 0.7263,
"step": 483
},
{
"epoch": 1.0803571428571428,
"grad_norm": 1.7281228303909302,
"learning_rate": 1.5655040251142787e-05,
"loss": 0.6142,
"step": 484
},
{
"epoch": 1.0825892857142858,
"grad_norm": 1.798956036567688,
"learning_rate": 1.5634194149479642e-05,
"loss": 0.5619,
"step": 485
},
{
"epoch": 1.0848214285714286,
"grad_norm": 2.007969379425049,
"learning_rate": 1.5613312115057697e-05,
"loss": 0.7375,
"step": 486
},
{
"epoch": 1.0870535714285714,
"grad_norm": 1.869939923286438,
"learning_rate": 1.559239428105467e-05,
"loss": 0.6256,
"step": 487
},
{
"epoch": 1.0892857142857142,
"grad_norm": 2.1612086296081543,
"learning_rate": 1.5571440780876588e-05,
"loss": 0.6326,
"step": 488
},
{
"epoch": 1.0915178571428572,
"grad_norm": 1.9341946840286255,
"learning_rate": 1.5550451748156957e-05,
"loss": 0.619,
"step": 489
},
{
"epoch": 1.09375,
"grad_norm": 1.7737098932266235,
"learning_rate": 1.5529427316755876e-05,
"loss": 0.6141,
"step": 490
},
{
"epoch": 1.0959821428571428,
"grad_norm": 1.672850251197815,
"learning_rate": 1.5508367620759224e-05,
"loss": 0.5472,
"step": 491
},
{
"epoch": 1.0982142857142858,
"grad_norm": 1.985202431678772,
"learning_rate": 1.548727279447777e-05,
"loss": 0.6157,
"step": 492
},
{
"epoch": 1.1004464285714286,
"grad_norm": 1.824111819267273,
"learning_rate": 1.546614297244634e-05,
"loss": 0.5532,
"step": 493
},
{
"epoch": 1.1026785714285714,
"grad_norm": 2.178053617477417,
"learning_rate": 1.5444978289422937e-05,
"loss": 0.628,
"step": 494
},
{
"epoch": 1.1049107142857142,
"grad_norm": 1.8749269247055054,
"learning_rate": 1.542377888038791e-05,
"loss": 0.6572,
"step": 495
},
{
"epoch": 1.1071428571428572,
"grad_norm": 1.7741912603378296,
"learning_rate": 1.540254488054307e-05,
"loss": 0.5662,
"step": 496
},
{
"epoch": 1.109375,
"grad_norm": 1.8596246242523193,
"learning_rate": 1.538127642531083e-05,
"loss": 0.6233,
"step": 497
},
{
"epoch": 1.1116071428571428,
"grad_norm": 1.9491173028945923,
"learning_rate": 1.5359973650333352e-05,
"loss": 0.6861,
"step": 498
},
{
"epoch": 1.1138392857142858,
"grad_norm": 1.7947884798049927,
"learning_rate": 1.533863669147168e-05,
"loss": 0.6094,
"step": 499
},
{
"epoch": 1.1160714285714286,
"grad_norm": 1.9403222799301147,
"learning_rate": 1.5317265684804865e-05,
"loss": 0.6399,
"step": 500
},
{
"epoch": 1.1183035714285714,
"grad_norm": 1.8400847911834717,
"learning_rate": 1.5295860766629098e-05,
"loss": 0.6124,
"step": 501
},
{
"epoch": 1.1205357142857142,
"grad_norm": 1.8174333572387695,
"learning_rate": 1.5274422073456853e-05,
"loss": 0.6962,
"step": 502
},
{
"epoch": 1.1227678571428572,
"grad_norm": 1.8172571659088135,
"learning_rate": 1.5252949742016005e-05,
"loss": 0.5751,
"step": 503
},
{
"epoch": 1.125,
"grad_norm": 2.1520352363586426,
"learning_rate": 1.5231443909248956e-05,
"loss": 0.7679,
"step": 504
},
{
"epoch": 1.1272321428571428,
"grad_norm": 1.8504657745361328,
"learning_rate": 1.5209904712311777e-05,
"loss": 0.5691,
"step": 505
},
{
"epoch": 1.1294642857142858,
"grad_norm": 1.9086402654647827,
"learning_rate": 1.5188332288573313e-05,
"loss": 0.621,
"step": 506
},
{
"epoch": 1.1316964285714286,
"grad_norm": 1.8591104745864868,
"learning_rate": 1.5166726775614327e-05,
"loss": 0.6644,
"step": 507
},
{
"epoch": 1.1339285714285714,
"grad_norm": 2.2467782497406006,
"learning_rate": 1.5145088311226599e-05,
"loss": 0.7193,
"step": 508
},
{
"epoch": 1.1361607142857142,
"grad_norm": 2.0161256790161133,
"learning_rate": 1.5123417033412078e-05,
"loss": 0.5497,
"step": 509
},
{
"epoch": 1.1383928571428572,
"grad_norm": 1.9359264373779297,
"learning_rate": 1.510171308038197e-05,
"loss": 0.5944,
"step": 510
},
{
"epoch": 1.140625,
"grad_norm": 1.893852710723877,
"learning_rate": 1.5079976590555876e-05,
"loss": 0.6466,
"step": 511
},
{
"epoch": 1.1428571428571428,
"grad_norm": 1.742958426475525,
"learning_rate": 1.5058207702560907e-05,
"loss": 0.6072,
"step": 512
},
{
"epoch": 1.1450892857142858,
"grad_norm": 1.8722553253173828,
"learning_rate": 1.5036406555230794e-05,
"loss": 0.6441,
"step": 513
},
{
"epoch": 1.1473214285714286,
"grad_norm": 1.7931900024414062,
"learning_rate": 1.501457328760501e-05,
"loss": 0.5519,
"step": 514
},
{
"epoch": 1.1495535714285714,
"grad_norm": 1.7850182056427002,
"learning_rate": 1.499270803892787e-05,
"loss": 0.5238,
"step": 515
},
{
"epoch": 1.1517857142857142,
"grad_norm": 2.0565764904022217,
"learning_rate": 1.4970810948647664e-05,
"loss": 0.6809,
"step": 516
},
{
"epoch": 1.1540178571428572,
"grad_norm": 2.0107569694519043,
"learning_rate": 1.4948882156415748e-05,
"loss": 0.6119,
"step": 517
},
{
"epoch": 1.15625,
"grad_norm": 1.9191194772720337,
"learning_rate": 1.4926921802085662e-05,
"loss": 0.7001,
"step": 518
},
{
"epoch": 1.1584821428571428,
"grad_norm": 1.797013282775879,
"learning_rate": 1.4904930025712236e-05,
"loss": 0.5865,
"step": 519
},
{
"epoch": 1.1607142857142858,
"grad_norm": 1.8777704238891602,
"learning_rate": 1.4882906967550708e-05,
"loss": 0.6189,
"step": 520
},
{
"epoch": 1.1629464285714286,
"grad_norm": 1.7872058153152466,
"learning_rate": 1.4860852768055804e-05,
"loss": 0.6804,
"step": 521
},
{
"epoch": 1.1651785714285714,
"grad_norm": 1.9415029287338257,
"learning_rate": 1.4838767567880865e-05,
"loss": 0.5962,
"step": 522
},
{
"epoch": 1.1674107142857142,
"grad_norm": 1.8708178997039795,
"learning_rate": 1.4816651507876946e-05,
"loss": 0.6953,
"step": 523
},
{
"epoch": 1.1696428571428572,
"grad_norm": 1.8373233079910278,
"learning_rate": 1.479450472909191e-05,
"loss": 0.5464,
"step": 524
},
{
"epoch": 1.171875,
"grad_norm": 1.692324161529541,
"learning_rate": 1.4772327372769533e-05,
"loss": 0.6527,
"step": 525
},
{
"epoch": 1.1741071428571428,
"grad_norm": 2.331209421157837,
"learning_rate": 1.4750119580348601e-05,
"loss": 0.7633,
"step": 526
},
{
"epoch": 1.1763392857142858,
"grad_norm": 1.8351179361343384,
"learning_rate": 1.4727881493462018e-05,
"loss": 0.5657,
"step": 527
},
{
"epoch": 1.1785714285714286,
"grad_norm": 1.9545246362686157,
"learning_rate": 1.4705613253935886e-05,
"loss": 0.6594,
"step": 528
},
{
"epoch": 1.1808035714285714,
"grad_norm": 1.8347816467285156,
"learning_rate": 1.4683315003788614e-05,
"loss": 0.6406,
"step": 529
},
{
"epoch": 1.1830357142857142,
"grad_norm": 2.1158814430236816,
"learning_rate": 1.4660986885230002e-05,
"loss": 0.5972,
"step": 530
},
{
"epoch": 1.1852678571428572,
"grad_norm": 1.7203682661056519,
"learning_rate": 1.463862904066035e-05,
"loss": 0.6403,
"step": 531
},
{
"epoch": 1.1875,
"grad_norm": 1.9957817792892456,
"learning_rate": 1.4616241612669523e-05,
"loss": 0.5948,
"step": 532
},
{
"epoch": 1.1897321428571428,
"grad_norm": 1.9116469621658325,
"learning_rate": 1.4593824744036078e-05,
"loss": 0.6249,
"step": 533
},
{
"epoch": 1.1919642857142858,
"grad_norm": 1.8352584838867188,
"learning_rate": 1.4571378577726317e-05,
"loss": 0.5559,
"step": 534
},
{
"epoch": 1.1941964285714286,
"grad_norm": 1.9600341320037842,
"learning_rate": 1.4548903256893392e-05,
"loss": 0.6037,
"step": 535
},
{
"epoch": 1.1964285714285714,
"grad_norm": 2.105661153793335,
"learning_rate": 1.4526398924876407e-05,
"loss": 0.654,
"step": 536
},
{
"epoch": 1.1986607142857142,
"grad_norm": 1.9616891145706177,
"learning_rate": 1.4503865725199468e-05,
"loss": 0.6022,
"step": 537
},
{
"epoch": 1.2008928571428572,
"grad_norm": 1.8042315244674683,
"learning_rate": 1.4481303801570805e-05,
"loss": 0.6358,
"step": 538
},
{
"epoch": 1.203125,
"grad_norm": 1.7036793231964111,
"learning_rate": 1.4458713297881828e-05,
"loss": 0.5692,
"step": 539
},
{
"epoch": 1.2053571428571428,
"grad_norm": 1.8314141035079956,
"learning_rate": 1.4436094358206224e-05,
"loss": 0.6103,
"step": 540
},
{
"epoch": 1.2075892857142858,
"grad_norm": 1.6869179010391235,
"learning_rate": 1.4413447126799038e-05,
"loss": 0.5561,
"step": 541
},
{
"epoch": 1.2098214285714286,
"grad_norm": 1.5481685400009155,
"learning_rate": 1.4390771748095735e-05,
"loss": 0.4697,
"step": 542
},
{
"epoch": 1.2120535714285714,
"grad_norm": 1.904807448387146,
"learning_rate": 1.436806836671131e-05,
"loss": 0.5764,
"step": 543
},
{
"epoch": 1.2142857142857142,
"grad_norm": 1.9439524412155151,
"learning_rate": 1.4345337127439333e-05,
"loss": 0.6585,
"step": 544
},
{
"epoch": 1.2165178571428572,
"grad_norm": 1.8721519708633423,
"learning_rate": 1.4322578175251058e-05,
"loss": 0.685,
"step": 545
},
{
"epoch": 1.21875,
"grad_norm": 1.9289742708206177,
"learning_rate": 1.4299791655294461e-05,
"loss": 0.6364,
"step": 546
},
{
"epoch": 1.2209821428571428,
"grad_norm": 1.6985549926757812,
"learning_rate": 1.4276977712893357e-05,
"loss": 0.6419,
"step": 547
},
{
"epoch": 1.2232142857142858,
"grad_norm": 1.9946244955062866,
"learning_rate": 1.4254136493546432e-05,
"loss": 0.7154,
"step": 548
},
{
"epoch": 1.2254464285714286,
"grad_norm": 1.9798763990402222,
"learning_rate": 1.4231268142926345e-05,
"loss": 0.6713,
"step": 549
},
{
"epoch": 1.2276785714285714,
"grad_norm": 2.0185835361480713,
"learning_rate": 1.4208372806878782e-05,
"loss": 0.7014,
"step": 550
},
{
"epoch": 1.2299107142857142,
"grad_norm": 2.082404851913452,
"learning_rate": 1.4185450631421542e-05,
"loss": 0.7588,
"step": 551
},
{
"epoch": 1.2321428571428572,
"grad_norm": 1.8249948024749756,
"learning_rate": 1.4162501762743579e-05,
"loss": 0.6651,
"step": 552
},
{
"epoch": 1.234375,
"grad_norm": 1.8358428478240967,
"learning_rate": 1.41395263472041e-05,
"loss": 0.6694,
"step": 553
},
{
"epoch": 1.2366071428571428,
"grad_norm": 1.7472898960113525,
"learning_rate": 1.4116524531331616e-05,
"loss": 0.5877,
"step": 554
},
{
"epoch": 1.2388392857142858,
"grad_norm": 1.6928068399429321,
"learning_rate": 1.4093496461823002e-05,
"loss": 0.6702,
"step": 555
},
{
"epoch": 1.2410714285714286,
"grad_norm": 1.7535682916641235,
"learning_rate": 1.4070442285542579e-05,
"loss": 0.6101,
"step": 556
},
{
"epoch": 1.2433035714285714,
"grad_norm": 1.7997139692306519,
"learning_rate": 1.4047362149521152e-05,
"loss": 0.6645,
"step": 557
},
{
"epoch": 1.2455357142857142,
"grad_norm": 1.8536964654922485,
"learning_rate": 1.402425620095511e-05,
"loss": 0.5933,
"step": 558
},
{
"epoch": 1.2477678571428572,
"grad_norm": 1.7359586954116821,
"learning_rate": 1.400112458720544e-05,
"loss": 0.6334,
"step": 559
},
{
"epoch": 1.25,
"grad_norm": 1.6964356899261475,
"learning_rate": 1.3977967455796828e-05,
"loss": 0.6578,
"step": 560
},
{
"epoch": 1.25,
"eval_loss": 0.7179591655731201,
"eval_runtime": 45.9423,
"eval_samples_per_second": 1.589,
"eval_steps_per_second": 0.218,
"step": 560
},
{
"epoch": 1.2522321428571428,
"grad_norm": 1.778124451637268,
"learning_rate": 1.3954784954416703e-05,
"loss": 0.6381,
"step": 561
},
{
"epoch": 1.2544642857142856,
"grad_norm": 1.8484911918640137,
"learning_rate": 1.393157723091428e-05,
"loss": 0.6636,
"step": 562
},
{
"epoch": 1.2566964285714286,
"grad_norm": 2.0035502910614014,
"learning_rate": 1.3908344433299644e-05,
"loss": 0.717,
"step": 563
},
{
"epoch": 1.2589285714285714,
"grad_norm": 1.7717210054397583,
"learning_rate": 1.3885086709742788e-05,
"loss": 0.5578,
"step": 564
},
{
"epoch": 1.2611607142857144,
"grad_norm": 1.9656518697738647,
"learning_rate": 1.3861804208572674e-05,
"loss": 0.6255,
"step": 565
},
{
"epoch": 1.2633928571428572,
"grad_norm": 1.973463773727417,
"learning_rate": 1.3838497078276288e-05,
"loss": 0.691,
"step": 566
},
{
"epoch": 1.265625,
"grad_norm": 1.6152547597885132,
"learning_rate": 1.3815165467497686e-05,
"loss": 0.6327,
"step": 567
},
{
"epoch": 1.2678571428571428,
"grad_norm": 1.9555470943450928,
"learning_rate": 1.3791809525037057e-05,
"loss": 0.6078,
"step": 568
},
{
"epoch": 1.2700892857142856,
"grad_norm": 1.9085325002670288,
"learning_rate": 1.376842939984977e-05,
"loss": 0.5348,
"step": 569
},
{
"epoch": 1.2723214285714286,
"grad_norm": 1.535760521888733,
"learning_rate": 1.3745025241045414e-05,
"loss": 0.5663,
"step": 570
},
{
"epoch": 1.2745535714285714,
"grad_norm": 1.951587200164795,
"learning_rate": 1.372159719788686e-05,
"loss": 0.6476,
"step": 571
},
{
"epoch": 1.2767857142857144,
"grad_norm": 2.169016122817993,
"learning_rate": 1.3698145419789302e-05,
"loss": 0.6918,
"step": 572
},
{
"epoch": 1.2790178571428572,
"grad_norm": 1.7241047620773315,
"learning_rate": 1.3674670056319315e-05,
"loss": 0.6644,
"step": 573
},
{
"epoch": 1.28125,
"grad_norm": 1.6963378190994263,
"learning_rate": 1.3651171257193883e-05,
"loss": 0.6071,
"step": 574
},
{
"epoch": 1.2834821428571428,
"grad_norm": 1.891162395477295,
"learning_rate": 1.3627649172279453e-05,
"loss": 0.6225,
"step": 575
},
{
"epoch": 1.2857142857142856,
"grad_norm": 1.9643828868865967,
"learning_rate": 1.3604103951590993e-05,
"loss": 0.6411,
"step": 576
},
{
"epoch": 1.2879464285714286,
"grad_norm": 1.9789677858352661,
"learning_rate": 1.3580535745291001e-05,
"loss": 0.6765,
"step": 577
},
{
"epoch": 1.2901785714285714,
"grad_norm": 1.958038568496704,
"learning_rate": 1.3556944703688592e-05,
"loss": 0.6554,
"step": 578
},
{
"epoch": 1.2924107142857144,
"grad_norm": 1.7940925359725952,
"learning_rate": 1.3533330977238496e-05,
"loss": 0.5874,
"step": 579
},
{
"epoch": 1.2946428571428572,
"grad_norm": 1.920786738395691,
"learning_rate": 1.3509694716540135e-05,
"loss": 0.5498,
"step": 580
},
{
"epoch": 1.296875,
"grad_norm": 1.6368259191513062,
"learning_rate": 1.348603607233663e-05,
"loss": 0.5471,
"step": 581
},
{
"epoch": 1.2991071428571428,
"grad_norm": 1.859761118888855,
"learning_rate": 1.3462355195513868e-05,
"loss": 0.7125,
"step": 582
},
{
"epoch": 1.3013392857142856,
"grad_norm": 1.9652460813522339,
"learning_rate": 1.343865223709952e-05,
"loss": 0.6611,
"step": 583
},
{
"epoch": 1.3035714285714286,
"grad_norm": 1.8966349363327026,
"learning_rate": 1.341492734826209e-05,
"loss": 0.6874,
"step": 584
},
{
"epoch": 1.3058035714285714,
"grad_norm": 1.8784470558166504,
"learning_rate": 1.3391180680309945e-05,
"loss": 0.5934,
"step": 585
},
{
"epoch": 1.3080357142857144,
"grad_norm": 2.3371737003326416,
"learning_rate": 1.3367412384690346e-05,
"loss": 0.7451,
"step": 586
},
{
"epoch": 1.3102678571428572,
"grad_norm": 1.7194281816482544,
"learning_rate": 1.3343622612988492e-05,
"loss": 0.6767,
"step": 587
},
{
"epoch": 1.3125,
"grad_norm": 1.9259989261627197,
"learning_rate": 1.3319811516926541e-05,
"loss": 0.6433,
"step": 588
},
{
"epoch": 1.3147321428571428,
"grad_norm": 1.7472665309906006,
"learning_rate": 1.329597924836267e-05,
"loss": 0.6128,
"step": 589
},
{
"epoch": 1.3169642857142856,
"grad_norm": 2.028818130493164,
"learning_rate": 1.3272125959290059e-05,
"loss": 0.7026,
"step": 590
},
{
"epoch": 1.3191964285714286,
"grad_norm": 1.8237242698669434,
"learning_rate": 1.3248251801835968e-05,
"loss": 0.6255,
"step": 591
},
{
"epoch": 1.3214285714285714,
"grad_norm": 2.014730215072632,
"learning_rate": 1.3224356928260735e-05,
"loss": 0.5823,
"step": 592
},
{
"epoch": 1.3236607142857144,
"grad_norm": 1.8209702968597412,
"learning_rate": 1.3200441490956832e-05,
"loss": 0.6455,
"step": 593
},
{
"epoch": 1.3258928571428572,
"grad_norm": 1.8454806804656982,
"learning_rate": 1.317650564244787e-05,
"loss": 0.6597,
"step": 594
},
{
"epoch": 1.328125,
"grad_norm": 1.621583104133606,
"learning_rate": 1.3152549535387624e-05,
"loss": 0.6227,
"step": 595
},
{
"epoch": 1.3303571428571428,
"grad_norm": 1.9547210931777954,
"learning_rate": 1.3128573322559097e-05,
"loss": 0.6325,
"step": 596
},
{
"epoch": 1.3325892857142856,
"grad_norm": 1.9106806516647339,
"learning_rate": 1.3104577156873496e-05,
"loss": 0.6069,
"step": 597
},
{
"epoch": 1.3348214285714286,
"grad_norm": 1.8474856615066528,
"learning_rate": 1.3080561191369286e-05,
"loss": 0.6753,
"step": 598
},
{
"epoch": 1.3370535714285714,
"grad_norm": 1.5305962562561035,
"learning_rate": 1.3056525579211215e-05,
"loss": 0.5475,
"step": 599
},
{
"epoch": 1.3392857142857144,
"grad_norm": 2.134941577911377,
"learning_rate": 1.3032470473689322e-05,
"loss": 0.6793,
"step": 600
},
{
"epoch": 1.3415178571428572,
"grad_norm": 1.8868045806884766,
"learning_rate": 1.3008396028217969e-05,
"loss": 0.6653,
"step": 601
},
{
"epoch": 1.34375,
"grad_norm": 1.9452149868011475,
"learning_rate": 1.298430239633486e-05,
"loss": 0.6529,
"step": 602
},
{
"epoch": 1.3459821428571428,
"grad_norm": 1.7577698230743408,
"learning_rate": 1.296018973170007e-05,
"loss": 0.6174,
"step": 603
},
{
"epoch": 1.3482142857142856,
"grad_norm": 1.6762840747833252,
"learning_rate": 1.2936058188095045e-05,
"loss": 0.5839,
"step": 604
},
{
"epoch": 1.3504464285714286,
"grad_norm": 1.8176460266113281,
"learning_rate": 1.2911907919421647e-05,
"loss": 0.6965,
"step": 605
},
{
"epoch": 1.3526785714285714,
"grad_norm": 1.8090909719467163,
"learning_rate": 1.2887739079701147e-05,
"loss": 0.6409,
"step": 606
},
{
"epoch": 1.3549107142857144,
"grad_norm": 1.937070369720459,
"learning_rate": 1.2863551823073266e-05,
"loss": 0.6219,
"step": 607
},
{
"epoch": 1.3571428571428572,
"grad_norm": 1.9617418050765991,
"learning_rate": 1.2839346303795173e-05,
"loss": 0.5834,
"step": 608
},
{
"epoch": 1.359375,
"grad_norm": 1.7203189134597778,
"learning_rate": 1.2815122676240518e-05,
"loss": 0.5439,
"step": 609
},
{
"epoch": 1.3616071428571428,
"grad_norm": 1.7873952388763428,
"learning_rate": 1.2790881094898428e-05,
"loss": 0.6598,
"step": 610
},
{
"epoch": 1.3638392857142856,
"grad_norm": 1.7811925411224365,
"learning_rate": 1.2766621714372543e-05,
"loss": 0.5395,
"step": 611
},
{
"epoch": 1.3660714285714286,
"grad_norm": 1.8524154424667358,
"learning_rate": 1.274234468938001e-05,
"loss": 0.6735,
"step": 612
},
{
"epoch": 1.3683035714285714,
"grad_norm": 1.8554515838623047,
"learning_rate": 1.271805017475051e-05,
"loss": 0.6132,
"step": 613
},
{
"epoch": 1.3705357142857144,
"grad_norm": 1.815579891204834,
"learning_rate": 1.2693738325425272e-05,
"loss": 0.6722,
"step": 614
},
{
"epoch": 1.3727678571428572,
"grad_norm": 2.338247299194336,
"learning_rate": 1.266940929645606e-05,
"loss": 0.7669,
"step": 615
},
{
"epoch": 1.375,
"grad_norm": 1.8510740995407104,
"learning_rate": 1.2645063243004236e-05,
"loss": 0.5504,
"step": 616
},
{
"epoch": 1.3772321428571428,
"grad_norm": 1.798982858657837,
"learning_rate": 1.2620700320339705e-05,
"loss": 0.604,
"step": 617
},
{
"epoch": 1.3794642857142856,
"grad_norm": 1.7797491550445557,
"learning_rate": 1.2596320683839976e-05,
"loss": 0.6598,
"step": 618
},
{
"epoch": 1.3816964285714286,
"grad_norm": 1.8819565773010254,
"learning_rate": 1.2571924488989145e-05,
"loss": 0.6177,
"step": 619
},
{
"epoch": 1.3839285714285714,
"grad_norm": 1.7809234857559204,
"learning_rate": 1.2547511891376916e-05,
"loss": 0.5357,
"step": 620
},
{
"epoch": 1.3861607142857144,
"grad_norm": 1.736649513244629,
"learning_rate": 1.2523083046697598e-05,
"loss": 0.6372,
"step": 621
},
{
"epoch": 1.3883928571428572,
"grad_norm": 1.9298821687698364,
"learning_rate": 1.2498638110749122e-05,
"loss": 0.686,
"step": 622
},
{
"epoch": 1.390625,
"grad_norm": 1.8866440057754517,
"learning_rate": 1.2474177239432042e-05,
"loss": 0.6319,
"step": 623
},
{
"epoch": 1.3928571428571428,
"grad_norm": 1.7113823890686035,
"learning_rate": 1.2449700588748541e-05,
"loss": 0.6942,
"step": 624
},
{
"epoch": 1.3950892857142856,
"grad_norm": 1.9925826787948608,
"learning_rate": 1.2425208314801441e-05,
"loss": 0.5561,
"step": 625
},
{
"epoch": 1.3973214285714286,
"grad_norm": 2.008894443511963,
"learning_rate": 1.2400700573793191e-05,
"loss": 0.6861,
"step": 626
},
{
"epoch": 1.3995535714285714,
"grad_norm": 1.9158005714416504,
"learning_rate": 1.23761775220249e-05,
"loss": 0.6071,
"step": 627
},
{
"epoch": 1.4017857142857144,
"grad_norm": 2.0181586742401123,
"learning_rate": 1.2351639315895309e-05,
"loss": 0.7582,
"step": 628
},
{
"epoch": 1.4040178571428572,
"grad_norm": 2.089715003967285,
"learning_rate": 1.2327086111899816e-05,
"loss": 0.7459,
"step": 629
},
{
"epoch": 1.40625,
"grad_norm": 2.0146496295928955,
"learning_rate": 1.2302518066629467e-05,
"loss": 0.6312,
"step": 630
},
{
"epoch": 1.4084821428571428,
"grad_norm": 1.7847448587417603,
"learning_rate": 1.2277935336769961e-05,
"loss": 0.5838,
"step": 631
},
{
"epoch": 1.4107142857142856,
"grad_norm": 1.8573119640350342,
"learning_rate": 1.2253338079100652e-05,
"loss": 0.6427,
"step": 632
},
{
"epoch": 1.4129464285714286,
"grad_norm": 1.8742104768753052,
"learning_rate": 1.2228726450493538e-05,
"loss": 0.6697,
"step": 633
},
{
"epoch": 1.4151785714285714,
"grad_norm": 2.0059330463409424,
"learning_rate": 1.2204100607912277e-05,
"loss": 0.6663,
"step": 634
},
{
"epoch": 1.4174107142857144,
"grad_norm": 1.8245443105697632,
"learning_rate": 1.2179460708411177e-05,
"loss": 0.6052,
"step": 635
},
{
"epoch": 1.4196428571428572,
"grad_norm": 1.759437084197998,
"learning_rate": 1.2154806909134198e-05,
"loss": 0.59,
"step": 636
},
{
"epoch": 1.421875,
"grad_norm": 1.7457926273345947,
"learning_rate": 1.213013936731394e-05,
"loss": 0.6548,
"step": 637
},
{
"epoch": 1.4241071428571428,
"grad_norm": 1.7185189723968506,
"learning_rate": 1.210545824027066e-05,
"loss": 0.6277,
"step": 638
},
{
"epoch": 1.4263392857142856,
"grad_norm": 2.0099501609802246,
"learning_rate": 1.2080763685411243e-05,
"loss": 0.6866,
"step": 639
},
{
"epoch": 1.4285714285714286,
"grad_norm": 1.6704769134521484,
"learning_rate": 1.205605586022822e-05,
"loss": 0.6058,
"step": 640
},
{
"epoch": 1.4308035714285714,
"grad_norm": 1.7891658544540405,
"learning_rate": 1.2031334922298749e-05,
"loss": 0.6382,
"step": 641
},
{
"epoch": 1.4330357142857144,
"grad_norm": 1.7908434867858887,
"learning_rate": 1.2006601029283629e-05,
"loss": 0.6063,
"step": 642
},
{
"epoch": 1.4352678571428572,
"grad_norm": 1.9139795303344727,
"learning_rate": 1.1981854338926262e-05,
"loss": 0.6399,
"step": 643
},
{
"epoch": 1.4375,
"grad_norm": 1.7342250347137451,
"learning_rate": 1.1957095009051683e-05,
"loss": 0.63,
"step": 644
},
{
"epoch": 1.4397321428571428,
"grad_norm": 1.8219507932662964,
"learning_rate": 1.193232319756553e-05,
"loss": 0.5838,
"step": 645
},
{
"epoch": 1.4419642857142856,
"grad_norm": 1.791318416595459,
"learning_rate": 1.1907539062453044e-05,
"loss": 0.6082,
"step": 646
},
{
"epoch": 1.4441964285714286,
"grad_norm": 1.9500919580459595,
"learning_rate": 1.1882742761778069e-05,
"loss": 0.6217,
"step": 647
},
{
"epoch": 1.4464285714285714,
"grad_norm": 1.8394417762756348,
"learning_rate": 1.1857934453682016e-05,
"loss": 0.62,
"step": 648
},
{
"epoch": 1.4486607142857144,
"grad_norm": 1.7951915264129639,
"learning_rate": 1.1833114296382903e-05,
"loss": 0.6073,
"step": 649
},
{
"epoch": 1.4508928571428572,
"grad_norm": 1.609543800354004,
"learning_rate": 1.1808282448174295e-05,
"loss": 0.5101,
"step": 650
},
{
"epoch": 1.453125,
"grad_norm": 1.9620007276535034,
"learning_rate": 1.1783439067424329e-05,
"loss": 0.6477,
"step": 651
},
{
"epoch": 1.4553571428571428,
"grad_norm": 1.7118951082229614,
"learning_rate": 1.1758584312574693e-05,
"loss": 0.5712,
"step": 652
},
{
"epoch": 1.4575892857142856,
"grad_norm": 1.7285962104797363,
"learning_rate": 1.17337183421396e-05,
"loss": 0.6041,
"step": 653
},
{
"epoch": 1.4598214285714286,
"grad_norm": 1.6281301975250244,
"learning_rate": 1.1708841314704811e-05,
"loss": 0.6508,
"step": 654
},
{
"epoch": 1.4620535714285714,
"grad_norm": 1.5081804990768433,
"learning_rate": 1.1683953388926592e-05,
"loss": 0.5559,
"step": 655
},
{
"epoch": 1.4642857142857144,
"grad_norm": 2.0005712509155273,
"learning_rate": 1.1659054723530721e-05,
"loss": 0.5777,
"step": 656
},
{
"epoch": 1.4665178571428572,
"grad_norm": 2.1517813205718994,
"learning_rate": 1.163414547731146e-05,
"loss": 0.7034,
"step": 657
},
{
"epoch": 1.46875,
"grad_norm": 2.2742764949798584,
"learning_rate": 1.1609225809130566e-05,
"loss": 0.6747,
"step": 658
},
{
"epoch": 1.4709821428571428,
"grad_norm": 1.6455707550048828,
"learning_rate": 1.1584295877916251e-05,
"loss": 0.5293,
"step": 659
},
{
"epoch": 1.4732142857142856,
"grad_norm": 1.7664430141448975,
"learning_rate": 1.1559355842662188e-05,
"loss": 0.6505,
"step": 660
},
{
"epoch": 1.4754464285714286,
"grad_norm": 1.776328682899475,
"learning_rate": 1.1534405862426481e-05,
"loss": 0.6094,
"step": 661
},
{
"epoch": 1.4776785714285714,
"grad_norm": 1.7001668214797974,
"learning_rate": 1.150944609633067e-05,
"loss": 0.6687,
"step": 662
},
{
"epoch": 1.4799107142857144,
"grad_norm": 1.7800358533859253,
"learning_rate": 1.1484476703558698e-05,
"loss": 0.656,
"step": 663
},
{
"epoch": 1.4821428571428572,
"grad_norm": 1.980301856994629,
"learning_rate": 1.1459497843355907e-05,
"loss": 0.6866,
"step": 664
},
{
"epoch": 1.484375,
"grad_norm": 1.8124170303344727,
"learning_rate": 1.1434509675028018e-05,
"loss": 0.5867,
"step": 665
},
{
"epoch": 1.4866071428571428,
"grad_norm": 1.5809247493743896,
"learning_rate": 1.1409512357940114e-05,
"loss": 0.562,
"step": 666
},
{
"epoch": 1.4888392857142856,
"grad_norm": 2.0136334896087646,
"learning_rate": 1.138450605151563e-05,
"loss": 0.628,
"step": 667
},
{
"epoch": 1.4910714285714286,
"grad_norm": 1.7345236539840698,
"learning_rate": 1.1359490915235323e-05,
"loss": 0.6533,
"step": 668
},
{
"epoch": 1.4933035714285714,
"grad_norm": 1.6967463493347168,
"learning_rate": 1.1334467108636273e-05,
"loss": 0.6514,
"step": 669
},
{
"epoch": 1.4955357142857144,
"grad_norm": 1.7118983268737793,
"learning_rate": 1.1309434791310848e-05,
"loss": 0.7126,
"step": 670
},
{
"epoch": 1.4977678571428572,
"grad_norm": 1.6460996866226196,
"learning_rate": 1.1284394122905697e-05,
"loss": 0.6425,
"step": 671
},
{
"epoch": 1.5,
"grad_norm": 1.8571183681488037,
"learning_rate": 1.1259345263120738e-05,
"loss": 0.5949,
"step": 672
},
{
"epoch": 1.5,
"eval_loss": 0.7134630680084229,
"eval_runtime": 45.5605,
"eval_samples_per_second": 1.602,
"eval_steps_per_second": 0.219,
"step": 672
},
{
"epoch": 1.5022321428571428,
"grad_norm": 1.7399746179580688,
"learning_rate": 1.1234288371708112e-05,
"loss": 0.6018,
"step": 673
},
{
"epoch": 1.5044642857142856,
"grad_norm": 1.7217991352081299,
"learning_rate": 1.1209223608471202e-05,
"loss": 0.5705,
"step": 674
},
{
"epoch": 1.5066964285714286,
"grad_norm": 1.7552762031555176,
"learning_rate": 1.1184151133263578e-05,
"loss": 0.6119,
"step": 675
},
{
"epoch": 1.5089285714285714,
"grad_norm": 1.923822045326233,
"learning_rate": 1.1159071105988012e-05,
"loss": 0.569,
"step": 676
},
{
"epoch": 1.5111607142857144,
"grad_norm": 1.7491592168807983,
"learning_rate": 1.1133983686595416e-05,
"loss": 0.6234,
"step": 677
},
{
"epoch": 1.5133928571428572,
"grad_norm": 1.9240578413009644,
"learning_rate": 1.110888903508387e-05,
"loss": 0.7482,
"step": 678
},
{
"epoch": 1.515625,
"grad_norm": 1.783152461051941,
"learning_rate": 1.1083787311497562e-05,
"loss": 0.6756,
"step": 679
},
{
"epoch": 1.5178571428571428,
"grad_norm": 2.130394220352173,
"learning_rate": 1.1058678675925796e-05,
"loss": 0.5983,
"step": 680
},
{
"epoch": 1.5200892857142856,
"grad_norm": 2.051589012145996,
"learning_rate": 1.1033563288501944e-05,
"loss": 0.6002,
"step": 681
},
{
"epoch": 1.5223214285714286,
"grad_norm": 1.8057156801223755,
"learning_rate": 1.1008441309402448e-05,
"loss": 0.6485,
"step": 682
},
{
"epoch": 1.5245535714285714,
"grad_norm": 1.9219319820404053,
"learning_rate": 1.0983312898845788e-05,
"loss": 0.6524,
"step": 683
},
{
"epoch": 1.5267857142857144,
"grad_norm": 1.7352206707000732,
"learning_rate": 1.0958178217091455e-05,
"loss": 0.5449,
"step": 684
},
{
"epoch": 1.5290178571428572,
"grad_norm": 1.9508435726165771,
"learning_rate": 1.093303742443895e-05,
"loss": 0.704,
"step": 685
},
{
"epoch": 1.53125,
"grad_norm": 1.6700478792190552,
"learning_rate": 1.0907890681226728e-05,
"loss": 0.611,
"step": 686
},
{
"epoch": 1.5334821428571428,
"grad_norm": 1.7663685083389282,
"learning_rate": 1.0882738147831209e-05,
"loss": 0.5739,
"step": 687
},
{
"epoch": 1.5357142857142856,
"grad_norm": 1.8615878820419312,
"learning_rate": 1.0857579984665733e-05,
"loss": 0.6008,
"step": 688
},
{
"epoch": 1.5379464285714286,
"grad_norm": 1.6442456245422363,
"learning_rate": 1.0832416352179549e-05,
"loss": 0.638,
"step": 689
},
{
"epoch": 1.5401785714285714,
"grad_norm": 1.9190770387649536,
"learning_rate": 1.0807247410856783e-05,
"loss": 0.6204,
"step": 690
},
{
"epoch": 1.5424107142857144,
"grad_norm": 1.8047891855239868,
"learning_rate": 1.0782073321215423e-05,
"loss": 0.6699,
"step": 691
},
{
"epoch": 1.5446428571428572,
"grad_norm": 1.5776137113571167,
"learning_rate": 1.0756894243806291e-05,
"loss": 0.563,
"step": 692
},
{
"epoch": 1.546875,
"grad_norm": 1.6929550170898438,
"learning_rate": 1.073171033921201e-05,
"loss": 0.6255,
"step": 693
},
{
"epoch": 1.5491071428571428,
"grad_norm": 1.6478757858276367,
"learning_rate": 1.0706521768046006e-05,
"loss": 0.5839,
"step": 694
},
{
"epoch": 1.5513392857142856,
"grad_norm": 1.6278104782104492,
"learning_rate": 1.0681328690951447e-05,
"loss": 0.5533,
"step": 695
},
{
"epoch": 1.5535714285714286,
"grad_norm": 1.8348695039749146,
"learning_rate": 1.0656131268600254e-05,
"loss": 0.5925,
"step": 696
},
{
"epoch": 1.5558035714285714,
"grad_norm": 1.8064024448394775,
"learning_rate": 1.0630929661692051e-05,
"loss": 0.6057,
"step": 697
},
{
"epoch": 1.5580357142857144,
"grad_norm": 1.8319587707519531,
"learning_rate": 1.0605724030953155e-05,
"loss": 0.6066,
"step": 698
},
{
"epoch": 1.5602678571428572,
"grad_norm": 2.0297727584838867,
"learning_rate": 1.0580514537135542e-05,
"loss": 0.7303,
"step": 699
},
{
"epoch": 1.5625,
"grad_norm": 1.909903883934021,
"learning_rate": 1.0555301341015832e-05,
"loss": 0.7264,
"step": 700
},
{
"epoch": 1.5647321428571428,
"grad_norm": 1.9629833698272705,
"learning_rate": 1.0530084603394239e-05,
"loss": 0.6276,
"step": 701
},
{
"epoch": 1.5669642857142856,
"grad_norm": 1.8616989850997925,
"learning_rate": 1.0504864485093588e-05,
"loss": 0.6027,
"step": 702
},
{
"epoch": 1.5691964285714286,
"grad_norm": 1.8771177530288696,
"learning_rate": 1.0479641146958249e-05,
"loss": 0.5703,
"step": 703
},
{
"epoch": 1.5714285714285714,
"grad_norm": 1.9348293542861938,
"learning_rate": 1.0454414749853126e-05,
"loss": 0.5615,
"step": 704
},
{
"epoch": 1.5736607142857144,
"grad_norm": 2.0445797443389893,
"learning_rate": 1.0429185454662638e-05,
"loss": 0.7221,
"step": 705
},
{
"epoch": 1.5758928571428572,
"grad_norm": 1.6325020790100098,
"learning_rate": 1.0403953422289687e-05,
"loss": 0.6198,
"step": 706
},
{
"epoch": 1.578125,
"grad_norm": 1.6755043268203735,
"learning_rate": 1.0378718813654633e-05,
"loss": 0.6068,
"step": 707
},
{
"epoch": 1.5803571428571428,
"grad_norm": 2.0556771755218506,
"learning_rate": 1.0353481789694258e-05,
"loss": 0.6963,
"step": 708
},
{
"epoch": 1.5825892857142856,
"grad_norm": 2.051053524017334,
"learning_rate": 1.0328242511360753e-05,
"loss": 0.6156,
"step": 709
},
{
"epoch": 1.5848214285714286,
"grad_norm": 1.755422830581665,
"learning_rate": 1.030300113962069e-05,
"loss": 0.5775,
"step": 710
},
{
"epoch": 1.5870535714285714,
"grad_norm": 1.5062118768692017,
"learning_rate": 1.0277757835453989e-05,
"loss": 0.5894,
"step": 711
},
{
"epoch": 1.5892857142857144,
"grad_norm": 2.010497808456421,
"learning_rate": 1.0252512759852891e-05,
"loss": 0.5907,
"step": 712
},
{
"epoch": 1.5915178571428572,
"grad_norm": 1.8485591411590576,
"learning_rate": 1.0227266073820939e-05,
"loss": 0.6699,
"step": 713
},
{
"epoch": 1.59375,
"grad_norm": 1.8647955656051636,
"learning_rate": 1.0202017938371947e-05,
"loss": 0.7198,
"step": 714
},
{
"epoch": 1.5959821428571428,
"grad_norm": 1.8782999515533447,
"learning_rate": 1.0176768514528967e-05,
"loss": 0.5807,
"step": 715
},
{
"epoch": 1.5982142857142856,
"grad_norm": 1.9086512327194214,
"learning_rate": 1.015151796332328e-05,
"loss": 0.6106,
"step": 716
},
{
"epoch": 1.6004464285714286,
"grad_norm": 1.9945212602615356,
"learning_rate": 1.012626644579334e-05,
"loss": 0.7394,
"step": 717
},
{
"epoch": 1.6026785714285714,
"grad_norm": 1.789683222770691,
"learning_rate": 1.010101412298378e-05,
"loss": 0.579,
"step": 718
},
{
"epoch": 1.6049107142857144,
"grad_norm": 1.959991216659546,
"learning_rate": 1.0075761155944355e-05,
"loss": 0.6562,
"step": 719
},
{
"epoch": 1.6071428571428572,
"grad_norm": 1.911706566810608,
"learning_rate": 1.0050507705728943e-05,
"loss": 0.5403,
"step": 720
},
{
"epoch": 1.609375,
"grad_norm": 1.9638135433197021,
"learning_rate": 1.0025253933394487e-05,
"loss": 0.6123,
"step": 721
},
{
"epoch": 1.6116071428571428,
"grad_norm": 1.973676085472107,
"learning_rate": 1e-05,
"loss": 0.5727,
"step": 722
},
{
"epoch": 1.6138392857142856,
"grad_norm": 2.101343870162964,
"learning_rate": 9.974746066605515e-06,
"loss": 0.5999,
"step": 723
},
{
"epoch": 1.6160714285714286,
"grad_norm": 1.8011658191680908,
"learning_rate": 9.949492294271062e-06,
"loss": 0.5714,
"step": 724
},
{
"epoch": 1.6183035714285714,
"grad_norm": 1.8372050523757935,
"learning_rate": 9.924238844055646e-06,
"loss": 0.6376,
"step": 725
},
{
"epoch": 1.6205357142857144,
"grad_norm": 1.8423105478286743,
"learning_rate": 9.898985877016225e-06,
"loss": 0.6067,
"step": 726
},
{
"epoch": 1.6227678571428572,
"grad_norm": 1.605955719947815,
"learning_rate": 9.873733554206663e-06,
"loss": 0.5868,
"step": 727
},
{
"epoch": 1.625,
"grad_norm": 1.84028160572052,
"learning_rate": 9.848482036676725e-06,
"loss": 0.6101,
"step": 728
},
{
"epoch": 1.6272321428571428,
"grad_norm": 1.82899808883667,
"learning_rate": 9.823231485471034e-06,
"loss": 0.6578,
"step": 729
},
{
"epoch": 1.6294642857142856,
"grad_norm": 1.6356295347213745,
"learning_rate": 9.797982061628056e-06,
"loss": 0.6306,
"step": 730
},
{
"epoch": 1.6316964285714286,
"grad_norm": 1.9734749794006348,
"learning_rate": 9.772733926179066e-06,
"loss": 0.6826,
"step": 731
},
{
"epoch": 1.6339285714285714,
"grad_norm": 1.6971672773361206,
"learning_rate": 9.747487240147112e-06,
"loss": 0.6536,
"step": 732
},
{
"epoch": 1.6361607142857144,
"grad_norm": 2.0280985832214355,
"learning_rate": 9.722242164546016e-06,
"loss": 0.6014,
"step": 733
},
{
"epoch": 1.6383928571428572,
"grad_norm": 2.0396358966827393,
"learning_rate": 9.696998860379313e-06,
"loss": 0.6495,
"step": 734
},
{
"epoch": 1.640625,
"grad_norm": 1.9824975728988647,
"learning_rate": 9.67175748863925e-06,
"loss": 0.6704,
"step": 735
},
{
"epoch": 1.6428571428571428,
"grad_norm": 1.9405537843704224,
"learning_rate": 9.646518210305747e-06,
"loss": 0.6041,
"step": 736
},
{
"epoch": 1.6450892857142856,
"grad_norm": 1.7607909440994263,
"learning_rate": 9.621281186345367e-06,
"loss": 0.6549,
"step": 737
},
{
"epoch": 1.6473214285714286,
"grad_norm": 1.8185755014419556,
"learning_rate": 9.596046577710314e-06,
"loss": 0.6028,
"step": 738
},
{
"epoch": 1.6495535714285714,
"grad_norm": 1.7550791501998901,
"learning_rate": 9.570814545337362e-06,
"loss": 0.6871,
"step": 739
},
{
"epoch": 1.6517857142857144,
"grad_norm": 2.181464195251465,
"learning_rate": 9.545585250146879e-06,
"loss": 0.789,
"step": 740
},
{
"epoch": 1.6540178571428572,
"grad_norm": 1.8093764781951904,
"learning_rate": 9.520358853041756e-06,
"loss": 0.547,
"step": 741
},
{
"epoch": 1.65625,
"grad_norm": 1.6014175415039062,
"learning_rate": 9.495135514906415e-06,
"loss": 0.5635,
"step": 742
},
{
"epoch": 1.6584821428571428,
"grad_norm": 1.8001309633255005,
"learning_rate": 9.469915396605763e-06,
"loss": 0.6576,
"step": 743
},
{
"epoch": 1.6607142857142856,
"grad_norm": 2.0399434566497803,
"learning_rate": 9.44469865898417e-06,
"loss": 0.6821,
"step": 744
},
{
"epoch": 1.6629464285714286,
"grad_norm": 1.7246460914611816,
"learning_rate": 9.41948546286446e-06,
"loss": 0.586,
"step": 745
},
{
"epoch": 1.6651785714285714,
"grad_norm": 1.621623158454895,
"learning_rate": 9.394275969046845e-06,
"loss": 0.5024,
"step": 746
},
{
"epoch": 1.6674107142857144,
"grad_norm": 1.9162286520004272,
"learning_rate": 9.369070338307954e-06,
"loss": 0.6224,
"step": 747
},
{
"epoch": 1.6696428571428572,
"grad_norm": 2.026129961013794,
"learning_rate": 9.34386873139975e-06,
"loss": 0.57,
"step": 748
},
{
"epoch": 1.671875,
"grad_norm": 1.8409785032272339,
"learning_rate": 9.31867130904856e-06,
"loss": 0.5404,
"step": 749
},
{
"epoch": 1.6741071428571428,
"grad_norm": 2.1627585887908936,
"learning_rate": 9.293478231954e-06,
"loss": 0.782,
"step": 750
},
{
"epoch": 1.6763392857142856,
"grad_norm": 1.648873209953308,
"learning_rate": 9.26828966078799e-06,
"loss": 0.4748,
"step": 751
},
{
"epoch": 1.6785714285714286,
"grad_norm": 2.045337200164795,
"learning_rate": 9.243105756193714e-06,
"loss": 0.6399,
"step": 752
},
{
"epoch": 1.6808035714285714,
"grad_norm": 1.631089448928833,
"learning_rate": 9.217926678784579e-06,
"loss": 0.5552,
"step": 753
},
{
"epoch": 1.6830357142857144,
"grad_norm": 1.683990478515625,
"learning_rate": 9.192752589143219e-06,
"loss": 0.5701,
"step": 754
},
{
"epoch": 1.6852678571428572,
"grad_norm": 1.9549455642700195,
"learning_rate": 9.167583647820453e-06,
"loss": 0.7436,
"step": 755
},
{
"epoch": 1.6875,
"grad_norm": 1.6943837404251099,
"learning_rate": 9.14242001533427e-06,
"loss": 0.5931,
"step": 756
},
{
"epoch": 1.6897321428571428,
"grad_norm": 1.7470611333847046,
"learning_rate": 9.117261852168794e-06,
"loss": 0.6501,
"step": 757
},
{
"epoch": 1.6919642857142856,
"grad_norm": 1.8046934604644775,
"learning_rate": 9.092109318773274e-06,
"loss": 0.7076,
"step": 758
},
{
"epoch": 1.6941964285714286,
"grad_norm": 1.9041763544082642,
"learning_rate": 9.066962575561054e-06,
"loss": 0.6717,
"step": 759
},
{
"epoch": 1.6964285714285714,
"grad_norm": 1.9243342876434326,
"learning_rate": 9.041821782908544e-06,
"loss": 0.6456,
"step": 760
},
{
"epoch": 1.6986607142857144,
"grad_norm": 1.4959568977355957,
"learning_rate": 9.016687101154215e-06,
"loss": 0.5338,
"step": 761
},
{
"epoch": 1.7008928571428572,
"grad_norm": 1.8079724311828613,
"learning_rate": 8.991558690597553e-06,
"loss": 0.6154,
"step": 762
},
{
"epoch": 1.703125,
"grad_norm": 1.9148213863372803,
"learning_rate": 8.966436711498058e-06,
"loss": 0.6811,
"step": 763
},
{
"epoch": 1.7053571428571428,
"grad_norm": 1.9065920114517212,
"learning_rate": 8.941321324074207e-06,
"loss": 0.6276,
"step": 764
},
{
"epoch": 1.7075892857142856,
"grad_norm": 1.983621597290039,
"learning_rate": 8.916212688502438e-06,
"loss": 0.5618,
"step": 765
},
{
"epoch": 1.7098214285714286,
"grad_norm": 1.6854325532913208,
"learning_rate": 8.891110964916135e-06,
"loss": 0.564,
"step": 766
},
{
"epoch": 1.7120535714285714,
"grad_norm": 1.6306523084640503,
"learning_rate": 8.866016313404586e-06,
"loss": 0.5719,
"step": 767
},
{
"epoch": 1.7142857142857144,
"grad_norm": 1.9582360982894897,
"learning_rate": 8.840928894011995e-06,
"loss": 0.6923,
"step": 768
},
{
"epoch": 1.7165178571428572,
"grad_norm": 1.624234914779663,
"learning_rate": 8.815848866736424e-06,
"loss": 0.5244,
"step": 769
},
{
"epoch": 1.71875,
"grad_norm": 2.052605628967285,
"learning_rate": 8.790776391528803e-06,
"loss": 0.6598,
"step": 770
},
{
"epoch": 1.7209821428571428,
"grad_norm": 2.070335626602173,
"learning_rate": 8.76571162829189e-06,
"loss": 0.7077,
"step": 771
},
{
"epoch": 1.7232142857142856,
"grad_norm": 1.8281620740890503,
"learning_rate": 8.740654736879265e-06,
"loss": 0.7506,
"step": 772
},
{
"epoch": 1.7254464285714286,
"grad_norm": 1.6640903949737549,
"learning_rate": 8.715605877094304e-06,
"loss": 0.5309,
"step": 773
},
{
"epoch": 1.7276785714285714,
"grad_norm": 1.8417410850524902,
"learning_rate": 8.690565208689157e-06,
"loss": 0.6216,
"step": 774
},
{
"epoch": 1.7299107142857144,
"grad_norm": 1.7450134754180908,
"learning_rate": 8.665532891363732e-06,
"loss": 0.6304,
"step": 775
},
{
"epoch": 1.7321428571428572,
"grad_norm": 1.8155803680419922,
"learning_rate": 8.640509084764682e-06,
"loss": 0.6779,
"step": 776
},
{
"epoch": 1.734375,
"grad_norm": 1.9008722305297852,
"learning_rate": 8.615493948484375e-06,
"loss": 0.7313,
"step": 777
},
{
"epoch": 1.7366071428571428,
"grad_norm": 1.5549745559692383,
"learning_rate": 8.590487642059888e-06,
"loss": 0.538,
"step": 778
},
{
"epoch": 1.7388392857142856,
"grad_norm": 1.857479214668274,
"learning_rate": 8.565490324971983e-06,
"loss": 0.5986,
"step": 779
},
{
"epoch": 1.7410714285714286,
"grad_norm": 2.1765432357788086,
"learning_rate": 8.540502156644096e-06,
"loss": 0.6422,
"step": 780
},
{
"epoch": 1.7433035714285714,
"grad_norm": 1.8333587646484375,
"learning_rate": 8.515523296441304e-06,
"loss": 0.6261,
"step": 781
},
{
"epoch": 1.7455357142857144,
"grad_norm": 1.5466289520263672,
"learning_rate": 8.490553903669335e-06,
"loss": 0.541,
"step": 782
},
{
"epoch": 1.7477678571428572,
"grad_norm": 1.7777531147003174,
"learning_rate": 8.465594137573524e-06,
"loss": 0.6545,
"step": 783
},
{
"epoch": 1.75,
"grad_norm": 1.5732624530792236,
"learning_rate": 8.440644157337819e-06,
"loss": 0.5988,
"step": 784
},
{
"epoch": 1.75,
"eval_loss": 0.709916353225708,
"eval_runtime": 48.2251,
"eval_samples_per_second": 1.514,
"eval_steps_per_second": 0.207,
"step": 784
},
{
"epoch": 1.7522321428571428,
"grad_norm": 1.6632795333862305,
"learning_rate": 8.415704122083752e-06,
"loss": 0.5257,
"step": 785
},
{
"epoch": 1.7544642857142856,
"grad_norm": 1.7194244861602783,
"learning_rate": 8.390774190869434e-06,
"loss": 0.6742,
"step": 786
},
{
"epoch": 1.7566964285714286,
"grad_norm": 1.8653508424758911,
"learning_rate": 8.365854522688543e-06,
"loss": 0.7084,
"step": 787
},
{
"epoch": 1.7589285714285714,
"grad_norm": 1.814305067062378,
"learning_rate": 8.340945276469282e-06,
"loss": 0.6174,
"step": 788
},
{
"epoch": 1.7611607142857144,
"grad_norm": 1.7196688652038574,
"learning_rate": 8.316046611073413e-06,
"loss": 0.6082,
"step": 789
},
{
"epoch": 1.7633928571428572,
"grad_norm": 1.91426420211792,
"learning_rate": 8.29115868529519e-06,
"loss": 0.69,
"step": 790
},
{
"epoch": 1.765625,
"grad_norm": 1.8663746118545532,
"learning_rate": 8.266281657860406e-06,
"loss": 0.5621,
"step": 791
},
{
"epoch": 1.7678571428571428,
"grad_norm": 1.9205740690231323,
"learning_rate": 8.24141568742531e-06,
"loss": 0.6699,
"step": 792
},
{
"epoch": 1.7700892857142856,
"grad_norm": 1.7767086029052734,
"learning_rate": 8.21656093257567e-06,
"loss": 0.5818,
"step": 793
},
{
"epoch": 1.7723214285714286,
"grad_norm": 1.8975656032562256,
"learning_rate": 8.191717551825707e-06,
"loss": 0.6507,
"step": 794
},
{
"epoch": 1.7745535714285714,
"grad_norm": 1.672675371170044,
"learning_rate": 8.166885703617098e-06,
"loss": 0.5891,
"step": 795
},
{
"epoch": 1.7767857142857144,
"grad_norm": 1.7817902565002441,
"learning_rate": 8.142065546317988e-06,
"loss": 0.6545,
"step": 796
},
{
"epoch": 1.7790178571428572,
"grad_norm": 2.006059408187866,
"learning_rate": 8.117257238221936e-06,
"loss": 0.727,
"step": 797
},
{
"epoch": 1.78125,
"grad_norm": 1.5490721464157104,
"learning_rate": 8.09246093754696e-06,
"loss": 0.6469,
"step": 798
},
{
"epoch": 1.7834821428571428,
"grad_norm": 1.5985546112060547,
"learning_rate": 8.067676802434472e-06,
"loss": 0.5924,
"step": 799
},
{
"epoch": 1.7857142857142856,
"grad_norm": 1.8553714752197266,
"learning_rate": 8.042904990948319e-06,
"loss": 0.6217,
"step": 800
},
{
"epoch": 1.7879464285714286,
"grad_norm": 1.9970471858978271,
"learning_rate": 8.01814566107374e-06,
"loss": 0.6261,
"step": 801
},
{
"epoch": 1.7901785714285714,
"grad_norm": 1.6423388719558716,
"learning_rate": 7.993398970716375e-06,
"loss": 0.5292,
"step": 802
},
{
"epoch": 1.7924107142857144,
"grad_norm": 1.7289307117462158,
"learning_rate": 7.968665077701253e-06,
"loss": 0.6432,
"step": 803
},
{
"epoch": 1.7946428571428572,
"grad_norm": 1.7097069025039673,
"learning_rate": 7.943944139771784e-06,
"loss": 0.6175,
"step": 804
},
{
"epoch": 1.796875,
"grad_norm": 1.684017539024353,
"learning_rate": 7.919236314588759e-06,
"loss": 0.6137,
"step": 805
},
{
"epoch": 1.7991071428571428,
"grad_norm": 2.1642661094665527,
"learning_rate": 7.894541759729344e-06,
"loss": 0.6404,
"step": 806
},
{
"epoch": 1.8013392857142856,
"grad_norm": 1.771759271621704,
"learning_rate": 7.869860632686059e-06,
"loss": 0.6232,
"step": 807
},
{
"epoch": 1.8035714285714286,
"grad_norm": 2.106753349304199,
"learning_rate": 7.845193090865807e-06,
"loss": 0.6691,
"step": 808
},
{
"epoch": 1.8058035714285714,
"grad_norm": 1.966404676437378,
"learning_rate": 7.820539291588825e-06,
"loss": 0.6703,
"step": 809
},
{
"epoch": 1.8080357142857144,
"grad_norm": 1.7450644969940186,
"learning_rate": 7.795899392087728e-06,
"loss": 0.6583,
"step": 810
},
{
"epoch": 1.8102678571428572,
"grad_norm": 1.991557002067566,
"learning_rate": 7.771273549506466e-06,
"loss": 0.6434,
"step": 811
},
{
"epoch": 1.8125,
"grad_norm": 1.7553882598876953,
"learning_rate": 7.746661920899351e-06,
"loss": 0.5625,
"step": 812
},
{
"epoch": 1.8147321428571428,
"grad_norm": 2.1503360271453857,
"learning_rate": 7.72206466323004e-06,
"loss": 0.7151,
"step": 813
},
{
"epoch": 1.8169642857142856,
"grad_norm": 1.7755491733551025,
"learning_rate": 7.697481933370535e-06,
"loss": 0.7162,
"step": 814
},
{
"epoch": 1.8191964285714286,
"grad_norm": 1.7988295555114746,
"learning_rate": 7.672913888100187e-06,
"loss": 0.5866,
"step": 815
},
{
"epoch": 1.8214285714285714,
"grad_norm": 1.6645320653915405,
"learning_rate": 7.648360684104695e-06,
"loss": 0.6317,
"step": 816
},
{
"epoch": 1.8236607142857144,
"grad_norm": 1.955287218093872,
"learning_rate": 7.623822477975105e-06,
"loss": 0.6164,
"step": 817
},
{
"epoch": 1.8258928571428572,
"grad_norm": 1.6788568496704102,
"learning_rate": 7.599299426206812e-06,
"loss": 0.4848,
"step": 818
},
{
"epoch": 1.828125,
"grad_norm": 1.9233473539352417,
"learning_rate": 7.574791685198563e-06,
"loss": 0.6127,
"step": 819
},
{
"epoch": 1.8303571428571428,
"grad_norm": 1.9900723695755005,
"learning_rate": 7.550299411251461e-06,
"loss": 0.6527,
"step": 820
},
{
"epoch": 1.8325892857142856,
"grad_norm": 1.84470534324646,
"learning_rate": 7.52582276056796e-06,
"loss": 0.6146,
"step": 821
},
{
"epoch": 1.8348214285714286,
"grad_norm": 1.8588616847991943,
"learning_rate": 7.501361889250882e-06,
"loss": 0.672,
"step": 822
},
{
"epoch": 1.8370535714285714,
"grad_norm": 1.5921292304992676,
"learning_rate": 7.4769169533024055e-06,
"loss": 0.6028,
"step": 823
},
{
"epoch": 1.8392857142857144,
"grad_norm": 1.7890609502792358,
"learning_rate": 7.452488108623089e-06,
"loss": 0.5925,
"step": 824
},
{
"epoch": 1.8415178571428572,
"grad_norm": 1.5813894271850586,
"learning_rate": 7.428075511010858e-06,
"loss": 0.5878,
"step": 825
},
{
"epoch": 1.84375,
"grad_norm": 1.9074643850326538,
"learning_rate": 7.403679316160024e-06,
"loss": 0.7553,
"step": 826
},
{
"epoch": 1.8459821428571428,
"grad_norm": 1.387987494468689,
"learning_rate": 7.379299679660299e-06,
"loss": 0.494,
"step": 827
},
{
"epoch": 1.8482142857142856,
"grad_norm": 1.7119059562683105,
"learning_rate": 7.354936756995766e-06,
"loss": 0.6305,
"step": 828
},
{
"epoch": 1.8504464285714286,
"grad_norm": 1.9815932512283325,
"learning_rate": 7.3305907035439404e-06,
"loss": 0.6832,
"step": 829
},
{
"epoch": 1.8526785714285714,
"grad_norm": 1.9235873222351074,
"learning_rate": 7.3062616745747325e-06,
"loss": 0.6207,
"step": 830
},
{
"epoch": 1.8549107142857144,
"grad_norm": 1.837198257446289,
"learning_rate": 7.281949825249495e-06,
"loss": 0.6578,
"step": 831
},
{
"epoch": 1.8571428571428572,
"grad_norm": 1.723789930343628,
"learning_rate": 7.257655310619996e-06,
"loss": 0.6194,
"step": 832
},
{
"epoch": 1.859375,
"grad_norm": 1.7569659948349,
"learning_rate": 7.233378285627459e-06,
"loss": 0.5448,
"step": 833
},
{
"epoch": 1.8616071428571428,
"grad_norm": 1.8790802955627441,
"learning_rate": 7.209118905101575e-06,
"loss": 0.7309,
"step": 834
},
{
"epoch": 1.8638392857142856,
"grad_norm": 1.8847455978393555,
"learning_rate": 7.184877323759482e-06,
"loss": 0.6599,
"step": 835
},
{
"epoch": 1.8660714285714286,
"grad_norm": 1.8182835578918457,
"learning_rate": 7.16065369620483e-06,
"loss": 0.683,
"step": 836
},
{
"epoch": 1.8683035714285714,
"grad_norm": 1.7820547819137573,
"learning_rate": 7.136448176926736e-06,
"loss": 0.6397,
"step": 837
},
{
"epoch": 1.8705357142857144,
"grad_norm": 1.9385348558425903,
"learning_rate": 7.112260920298859e-06,
"loss": 0.7061,
"step": 838
},
{
"epoch": 1.8727678571428572,
"grad_norm": 1.8107960224151611,
"learning_rate": 7.088092080578357e-06,
"loss": 0.7015,
"step": 839
},
{
"epoch": 1.875,
"grad_norm": 1.7129448652267456,
"learning_rate": 7.063941811904956e-06,
"loss": 0.6202,
"step": 840
},
{
"epoch": 1.8772321428571428,
"grad_norm": 1.7366535663604736,
"learning_rate": 7.039810268299934e-06,
"loss": 0.5723,
"step": 841
},
{
"epoch": 1.8794642857142856,
"grad_norm": 2.027513027191162,
"learning_rate": 7.015697603665141e-06,
"loss": 0.6308,
"step": 842
},
{
"epoch": 1.8816964285714286,
"grad_norm": 1.9543936252593994,
"learning_rate": 6.991603971782035e-06,
"loss": 0.7181,
"step": 843
},
{
"epoch": 1.8839285714285714,
"grad_norm": 1.7029448747634888,
"learning_rate": 6.967529526310681e-06,
"loss": 0.5145,
"step": 844
},
{
"epoch": 1.8861607142857144,
"grad_norm": 1.8236180543899536,
"learning_rate": 6.943474420788788e-06,
"loss": 0.6421,
"step": 845
},
{
"epoch": 1.8883928571428572,
"grad_norm": 1.779215931892395,
"learning_rate": 6.919438808630716e-06,
"loss": 0.5887,
"step": 846
},
{
"epoch": 1.890625,
"grad_norm": 1.6859050989151,
"learning_rate": 6.895422843126507e-06,
"loss": 0.5996,
"step": 847
},
{
"epoch": 1.8928571428571428,
"grad_norm": 1.9645501375198364,
"learning_rate": 6.871426677440907e-06,
"loss": 0.6966,
"step": 848
},
{
"epoch": 1.8950892857142856,
"grad_norm": 1.8417348861694336,
"learning_rate": 6.847450464612378e-06,
"loss": 0.6475,
"step": 849
},
{
"epoch": 1.8973214285714286,
"grad_norm": 1.9112606048583984,
"learning_rate": 6.8234943575521365e-06,
"loss": 0.6719,
"step": 850
},
{
"epoch": 1.8995535714285714,
"grad_norm": 1.7694634199142456,
"learning_rate": 6.799558509043169e-06,
"loss": 0.6202,
"step": 851
},
{
"epoch": 1.9017857142857144,
"grad_norm": 1.9113940000534058,
"learning_rate": 6.775643071739267e-06,
"loss": 0.5926,
"step": 852
},
{
"epoch": 1.9040178571428572,
"grad_norm": 1.9161518812179565,
"learning_rate": 6.751748198164036e-06,
"loss": 0.8355,
"step": 853
},
{
"epoch": 1.90625,
"grad_norm": 1.9429389238357544,
"learning_rate": 6.727874040709943e-06,
"loss": 0.6021,
"step": 854
},
{
"epoch": 1.9084821428571428,
"grad_norm": 1.9069395065307617,
"learning_rate": 6.704020751637333e-06,
"loss": 0.5882,
"step": 855
},
{
"epoch": 1.9107142857142856,
"grad_norm": 1.7388685941696167,
"learning_rate": 6.680188483073458e-06,
"loss": 0.5105,
"step": 856
},
{
"epoch": 1.9129464285714286,
"grad_norm": 1.8594225645065308,
"learning_rate": 6.6563773870115135e-06,
"loss": 0.6032,
"step": 857
},
{
"epoch": 1.9151785714285714,
"grad_norm": 1.6092168092727661,
"learning_rate": 6.632587615309658e-06,
"loss": 0.5947,
"step": 858
},
{
"epoch": 1.9174107142857144,
"grad_norm": 1.7915990352630615,
"learning_rate": 6.608819319690059e-06,
"loss": 0.6142,
"step": 859
},
{
"epoch": 1.9196428571428572,
"grad_norm": 1.5304429531097412,
"learning_rate": 6.585072651737911e-06,
"loss": 0.5969,
"step": 860
},
{
"epoch": 1.921875,
"grad_norm": 1.6925547122955322,
"learning_rate": 6.56134776290048e-06,
"loss": 0.5597,
"step": 861
},
{
"epoch": 1.9241071428571428,
"grad_norm": 1.6392486095428467,
"learning_rate": 6.537644804486136e-06,
"loss": 0.6305,
"step": 862
},
{
"epoch": 1.9263392857142856,
"grad_norm": 1.886560320854187,
"learning_rate": 6.513963927663372e-06,
"loss": 0.6076,
"step": 863
},
{
"epoch": 1.9285714285714286,
"grad_norm": 1.7735902070999146,
"learning_rate": 6.49030528345987e-06,
"loss": 0.5961,
"step": 864
},
{
"epoch": 1.9308035714285714,
"grad_norm": 1.7739813327789307,
"learning_rate": 6.466669022761506e-06,
"loss": 0.6179,
"step": 865
},
{
"epoch": 1.9330357142857144,
"grad_norm": 1.692681908607483,
"learning_rate": 6.443055296311413e-06,
"loss": 0.6816,
"step": 866
},
{
"epoch": 1.9352678571428572,
"grad_norm": 1.7736715078353882,
"learning_rate": 6.4194642547090016e-06,
"loss": 0.6573,
"step": 867
},
{
"epoch": 1.9375,
"grad_norm": 1.8248765468597412,
"learning_rate": 6.3958960484090094e-06,
"loss": 0.565,
"step": 868
},
{
"epoch": 1.9397321428571428,
"grad_norm": 1.636215329170227,
"learning_rate": 6.37235082772055e-06,
"loss": 0.6068,
"step": 869
},
{
"epoch": 1.9419642857142856,
"grad_norm": 1.8558686971664429,
"learning_rate": 6.348828742806122e-06,
"loss": 0.6067,
"step": 870
},
{
"epoch": 1.9441964285714286,
"grad_norm": 1.7735726833343506,
"learning_rate": 6.325329943680689e-06,
"loss": 0.6364,
"step": 871
},
{
"epoch": 1.9464285714285714,
"grad_norm": 1.7822948694229126,
"learning_rate": 6.3018545802107e-06,
"loss": 0.6346,
"step": 872
},
{
"epoch": 1.9486607142857144,
"grad_norm": 1.7865424156188965,
"learning_rate": 6.278402802113146e-06,
"loss": 0.6141,
"step": 873
},
{
"epoch": 1.9508928571428572,
"grad_norm": 1.8328912258148193,
"learning_rate": 6.25497475895459e-06,
"loss": 0.6986,
"step": 874
},
{
"epoch": 1.953125,
"grad_norm": 1.8505418300628662,
"learning_rate": 6.2315706001502305e-06,
"loss": 0.6397,
"step": 875
},
{
"epoch": 1.9553571428571428,
"grad_norm": 1.664512276649475,
"learning_rate": 6.208190474962945e-06,
"loss": 0.5629,
"step": 876
},
{
"epoch": 1.9575892857142856,
"grad_norm": 1.8029053211212158,
"learning_rate": 6.184834532502315e-06,
"loss": 0.6978,
"step": 877
},
{
"epoch": 1.9598214285714286,
"grad_norm": 1.6065319776535034,
"learning_rate": 6.161502921723719e-06,
"loss": 0.5763,
"step": 878
},
{
"epoch": 1.9620535714285714,
"grad_norm": 1.9059717655181885,
"learning_rate": 6.138195791427329e-06,
"loss": 0.5928,
"step": 879
},
{
"epoch": 1.9642857142857144,
"grad_norm": 2.0226573944091797,
"learning_rate": 6.114913290257219e-06,
"loss": 0.6109,
"step": 880
},
{
"epoch": 1.9665178571428572,
"grad_norm": 1.741227626800537,
"learning_rate": 6.091655566700359e-06,
"loss": 0.6165,
"step": 881
},
{
"epoch": 1.96875,
"grad_norm": 1.747584581375122,
"learning_rate": 6.068422769085722e-06,
"loss": 0.5608,
"step": 882
},
{
"epoch": 1.9709821428571428,
"grad_norm": 1.6240609884262085,
"learning_rate": 6.045215045583301e-06,
"loss": 0.61,
"step": 883
},
{
"epoch": 1.9732142857142856,
"grad_norm": 1.6965066194534302,
"learning_rate": 6.0220325442031714e-06,
"loss": 0.5608,
"step": 884
},
{
"epoch": 1.9754464285714286,
"grad_norm": 1.6870455741882324,
"learning_rate": 5.998875412794562e-06,
"loss": 0.5619,
"step": 885
},
{
"epoch": 1.9776785714285714,
"grad_norm": 2.0198004245758057,
"learning_rate": 5.975743799044894e-06,
"loss": 0.6374,
"step": 886
},
{
"epoch": 1.9799107142857144,
"grad_norm": 1.584223985671997,
"learning_rate": 5.952637850478852e-06,
"loss": 0.6131,
"step": 887
},
{
"epoch": 1.9821428571428572,
"grad_norm": 2.044126272201538,
"learning_rate": 5.929557714457425e-06,
"loss": 0.7384,
"step": 888
},
{
"epoch": 1.984375,
"grad_norm": 1.8715416193008423,
"learning_rate": 5.906503538176999e-06,
"loss": 0.6409,
"step": 889
},
{
"epoch": 1.9866071428571428,
"grad_norm": 1.9569389820098877,
"learning_rate": 5.883475468668387e-06,
"loss": 0.6796,
"step": 890
},
{
"epoch": 1.9888392857142856,
"grad_norm": 1.5443964004516602,
"learning_rate": 5.860473652795901e-06,
"loss": 0.584,
"step": 891
},
{
"epoch": 1.9910714285714286,
"grad_norm": 1.6899211406707764,
"learning_rate": 5.8374982372564255e-06,
"loss": 0.5313,
"step": 892
},
{
"epoch": 1.9933035714285714,
"grad_norm": 1.6658509969711304,
"learning_rate": 5.814549368578464e-06,
"loss": 0.658,
"step": 893
},
{
"epoch": 1.9955357142857144,
"grad_norm": 1.86408531665802,
"learning_rate": 5.7916271931212185e-06,
"loss": 0.7255,
"step": 894
},
{
"epoch": 1.9977678571428572,
"grad_norm": 1.8427174091339111,
"learning_rate": 5.768731857073657e-06,
"loss": 0.6449,
"step": 895
},
{
"epoch": 2.0,
"grad_norm": 1.6087369918823242,
"learning_rate": 5.745863506453569e-06,
"loss": 0.4955,
"step": 896
},
{
"epoch": 2.0,
"eval_loss": 0.7043077349662781,
"eval_runtime": 39.1667,
"eval_samples_per_second": 1.864,
"eval_steps_per_second": 0.255,
"step": 896
},
{
"epoch": 2.002232142857143,
"grad_norm": 1.738125205039978,
"learning_rate": 5.7230222871066475e-06,
"loss": 0.5085,
"step": 897
},
{
"epoch": 2.0044642857142856,
"grad_norm": 1.8423463106155396,
"learning_rate": 5.700208344705537e-06,
"loss": 0.5366,
"step": 898
},
{
"epoch": 2.0066964285714284,
"grad_norm": 1.9999382495880127,
"learning_rate": 5.677421824748946e-06,
"loss": 0.5329,
"step": 899
},
{
"epoch": 2.0089285714285716,
"grad_norm": 1.9602954387664795,
"learning_rate": 5.6546628725606675e-06,
"loss": 0.5518,
"step": 900
},
{
"epoch": 2.0111607142857144,
"grad_norm": 1.907751441001892,
"learning_rate": 5.631931633288696e-06,
"loss": 0.5051,
"step": 901
},
{
"epoch": 2.013392857142857,
"grad_norm": 1.5521221160888672,
"learning_rate": 5.609228251904265e-06,
"loss": 0.4874,
"step": 902
},
{
"epoch": 2.015625,
"grad_norm": 1.9081358909606934,
"learning_rate": 5.586552873200963e-06,
"loss": 0.5361,
"step": 903
},
{
"epoch": 2.017857142857143,
"grad_norm": 1.8409134149551392,
"learning_rate": 5.563905641793776e-06,
"loss": 0.5204,
"step": 904
},
{
"epoch": 2.0200892857142856,
"grad_norm": 1.6720854043960571,
"learning_rate": 5.541286702118174e-06,
"loss": 0.5107,
"step": 905
},
{
"epoch": 2.0223214285714284,
"grad_norm": 1.8406893014907837,
"learning_rate": 5.518696198429201e-06,
"loss": 0.5427,
"step": 906
},
{
"epoch": 2.0245535714285716,
"grad_norm": 1.728305697441101,
"learning_rate": 5.496134274800533e-06,
"loss": 0.4973,
"step": 907
},
{
"epoch": 2.0267857142857144,
"grad_norm": 1.7480419874191284,
"learning_rate": 5.473601075123599e-06,
"loss": 0.5065,
"step": 908
},
{
"epoch": 2.029017857142857,
"grad_norm": 1.8376224040985107,
"learning_rate": 5.451096743106611e-06,
"loss": 0.5953,
"step": 909
},
{
"epoch": 2.03125,
"grad_norm": 1.8109792470932007,
"learning_rate": 5.428621422273687e-06,
"loss": 0.5203,
"step": 910
},
{
"epoch": 2.033482142857143,
"grad_norm": 1.8943278789520264,
"learning_rate": 5.406175255963923e-06,
"loss": 0.5111,
"step": 911
},
{
"epoch": 2.0357142857142856,
"grad_norm": 1.9139760732650757,
"learning_rate": 5.383758387330476e-06,
"loss": 0.4957,
"step": 912
},
{
"epoch": 2.0379464285714284,
"grad_norm": 1.9172364473342896,
"learning_rate": 5.3613709593396545e-06,
"loss": 0.5062,
"step": 913
},
{
"epoch": 2.0401785714285716,
"grad_norm": 1.674847960472107,
"learning_rate": 5.3390131147699995e-06,
"loss": 0.4564,
"step": 914
},
{
"epoch": 2.0424107142857144,
"grad_norm": 1.8302925825119019,
"learning_rate": 5.3166849962113886e-06,
"loss": 0.4828,
"step": 915
},
{
"epoch": 2.044642857142857,
"grad_norm": 1.9897193908691406,
"learning_rate": 5.294386746064115e-06,
"loss": 0.4754,
"step": 916
},
{
"epoch": 2.046875,
"grad_norm": 2.0378408432006836,
"learning_rate": 5.272118506537982e-06,
"loss": 0.5603,
"step": 917
},
{
"epoch": 2.049107142857143,
"grad_norm": 1.819687843322754,
"learning_rate": 5.249880419651403e-06,
"loss": 0.5372,
"step": 918
},
{
"epoch": 2.0513392857142856,
"grad_norm": 1.8932809829711914,
"learning_rate": 5.2276726272304724e-06,
"loss": 0.533,
"step": 919
},
{
"epoch": 2.0535714285714284,
"grad_norm": 1.5039235353469849,
"learning_rate": 5.205495270908094e-06,
"loss": 0.4317,
"step": 920
},
{
"epoch": 2.0558035714285716,
"grad_norm": 1.7907001972198486,
"learning_rate": 5.183348492123056e-06,
"loss": 0.4999,
"step": 921
},
{
"epoch": 2.0580357142857144,
"grad_norm": 1.8389475345611572,
"learning_rate": 5.16123243211914e-06,
"loss": 0.506,
"step": 922
},
{
"epoch": 2.060267857142857,
"grad_norm": 1.907362699508667,
"learning_rate": 5.1391472319442016e-06,
"loss": 0.4987,
"step": 923
},
{
"epoch": 2.0625,
"grad_norm": 1.7584993839263916,
"learning_rate": 5.117093032449297e-06,
"loss": 0.5286,
"step": 924
},
{
"epoch": 2.064732142857143,
"grad_norm": 1.9483177661895752,
"learning_rate": 5.0950699742877645e-06,
"loss": 0.5469,
"step": 925
},
{
"epoch": 2.0669642857142856,
"grad_norm": 1.7706096172332764,
"learning_rate": 5.073078197914341e-06,
"loss": 0.5409,
"step": 926
},
{
"epoch": 2.0691964285714284,
"grad_norm": 1.806550145149231,
"learning_rate": 5.0511178435842565e-06,
"loss": 0.558,
"step": 927
},
{
"epoch": 2.0714285714285716,
"grad_norm": 1.5738914012908936,
"learning_rate": 5.029189051352339e-06,
"loss": 0.4856,
"step": 928
},
{
"epoch": 2.0736607142857144,
"grad_norm": 1.8106647729873657,
"learning_rate": 5.007291961072133e-06,
"loss": 0.4639,
"step": 929
},
{
"epoch": 2.075892857142857,
"grad_norm": 1.849599838256836,
"learning_rate": 4.985426712394994e-06,
"loss": 0.5315,
"step": 930
},
{
"epoch": 2.078125,
"grad_norm": 1.3779913187026978,
"learning_rate": 4.963593444769207e-06,
"loss": 0.4128,
"step": 931
},
{
"epoch": 2.080357142857143,
"grad_norm": 1.9234905242919922,
"learning_rate": 4.941792297439098e-06,
"loss": 0.4776,
"step": 932
},
{
"epoch": 2.0825892857142856,
"grad_norm": 1.9030768871307373,
"learning_rate": 4.920023409444128e-06,
"loss": 0.626,
"step": 933
},
{
"epoch": 2.0848214285714284,
"grad_norm": 2.104311227798462,
"learning_rate": 4.898286919618034e-06,
"loss": 0.5971,
"step": 934
},
{
"epoch": 2.0870535714285716,
"grad_norm": 1.6451133489608765,
"learning_rate": 4.876582966587924e-06,
"loss": 0.4981,
"step": 935
},
{
"epoch": 2.0892857142857144,
"grad_norm": 1.9193094968795776,
"learning_rate": 4.8549116887734045e-06,
"loss": 0.5093,
"step": 936
},
{
"epoch": 2.091517857142857,
"grad_norm": 2.0272116661071777,
"learning_rate": 4.833273224385678e-06,
"loss": 0.5768,
"step": 937
},
{
"epoch": 2.09375,
"grad_norm": 1.6840240955352783,
"learning_rate": 4.811667711426686e-06,
"loss": 0.4768,
"step": 938
},
{
"epoch": 2.095982142857143,
"grad_norm": 1.901715636253357,
"learning_rate": 4.790095287688227e-06,
"loss": 0.6362,
"step": 939
},
{
"epoch": 2.0982142857142856,
"grad_norm": 1.6791905164718628,
"learning_rate": 4.7685560907510465e-06,
"loss": 0.4853,
"step": 940
},
{
"epoch": 2.1004464285714284,
"grad_norm": 1.718680739402771,
"learning_rate": 4.747050257984002e-06,
"loss": 0.4572,
"step": 941
},
{
"epoch": 2.1026785714285716,
"grad_norm": 1.6572511196136475,
"learning_rate": 4.725577926543151e-06,
"loss": 0.4536,
"step": 942
},
{
"epoch": 2.1049107142857144,
"grad_norm": 1.936851143836975,
"learning_rate": 4.704139233370905e-06,
"loss": 0.6019,
"step": 943
},
{
"epoch": 2.107142857142857,
"grad_norm": 1.6410084962844849,
"learning_rate": 4.682734315195138e-06,
"loss": 0.5204,
"step": 944
},
{
"epoch": 2.109375,
"grad_norm": 1.6365997791290283,
"learning_rate": 4.661363308528319e-06,
"loss": 0.4437,
"step": 945
},
{
"epoch": 2.111607142857143,
"grad_norm": 1.9370850324630737,
"learning_rate": 4.640026349666651e-06,
"loss": 0.554,
"step": 946
},
{
"epoch": 2.1138392857142856,
"grad_norm": 1.7888548374176025,
"learning_rate": 4.61872357468917e-06,
"loss": 0.4938,
"step": 947
},
{
"epoch": 2.1160714285714284,
"grad_norm": 1.5971767902374268,
"learning_rate": 4.5974551194569336e-06,
"loss": 0.4829,
"step": 948
},
{
"epoch": 2.1183035714285716,
"grad_norm": 1.7885452508926392,
"learning_rate": 4.576221119612091e-06,
"loss": 0.5542,
"step": 949
},
{
"epoch": 2.1205357142857144,
"grad_norm": 1.5876049995422363,
"learning_rate": 4.555021710577068e-06,
"loss": 0.5482,
"step": 950
},
{
"epoch": 2.122767857142857,
"grad_norm": 1.786490797996521,
"learning_rate": 4.533857027553663e-06,
"loss": 0.5871,
"step": 951
},
{
"epoch": 2.125,
"grad_norm": 1.8872352838516235,
"learning_rate": 4.51272720552223e-06,
"loss": 0.4679,
"step": 952
},
{
"epoch": 2.127232142857143,
"grad_norm": 1.8590319156646729,
"learning_rate": 4.49163237924078e-06,
"loss": 0.5014,
"step": 953
},
{
"epoch": 2.1294642857142856,
"grad_norm": 1.5889908075332642,
"learning_rate": 4.470572683244127e-06,
"loss": 0.4523,
"step": 954
},
{
"epoch": 2.1316964285714284,
"grad_norm": 1.8147304058074951,
"learning_rate": 4.449548251843048e-06,
"loss": 0.4648,
"step": 955
},
{
"epoch": 2.1339285714285716,
"grad_norm": 2.0382473468780518,
"learning_rate": 4.4285592191234125e-06,
"loss": 0.5273,
"step": 956
},
{
"epoch": 2.1361607142857144,
"grad_norm": 1.8126921653747559,
"learning_rate": 4.4076057189453325e-06,
"loss": 0.5059,
"step": 957
},
{
"epoch": 2.138392857142857,
"grad_norm": 1.8667762279510498,
"learning_rate": 4.386687884942307e-06,
"loss": 0.5113,
"step": 958
},
{
"epoch": 2.140625,
"grad_norm": 2.065971851348877,
"learning_rate": 4.365805850520362e-06,
"loss": 0.5056,
"step": 959
},
{
"epoch": 2.142857142857143,
"grad_norm": 1.8023256063461304,
"learning_rate": 4.344959748857215e-06,
"loss": 0.5219,
"step": 960
},
{
"epoch": 2.1450892857142856,
"grad_norm": 2.643050193786621,
"learning_rate": 4.324149712901417e-06,
"loss": 0.5836,
"step": 961
},
{
"epoch": 2.1473214285714284,
"grad_norm": 1.5510233640670776,
"learning_rate": 4.3033758753715095e-06,
"loss": 0.4494,
"step": 962
},
{
"epoch": 2.1495535714285716,
"grad_norm": 1.5919033288955688,
"learning_rate": 4.282638368755161e-06,
"loss": 0.425,
"step": 963
},
{
"epoch": 2.1517857142857144,
"grad_norm": 1.7790838479995728,
"learning_rate": 4.261937325308347e-06,
"loss": 0.5493,
"step": 964
},
{
"epoch": 2.154017857142857,
"grad_norm": 1.7823125123977661,
"learning_rate": 4.241272877054489e-06,
"loss": 0.5211,
"step": 965
},
{
"epoch": 2.15625,
"grad_norm": 1.9036273956298828,
"learning_rate": 4.2206451557836235e-06,
"loss": 0.5364,
"step": 966
},
{
"epoch": 2.158482142857143,
"grad_norm": 1.658921718597412,
"learning_rate": 4.200054293051556e-06,
"loss": 0.5241,
"step": 967
},
{
"epoch": 2.1607142857142856,
"grad_norm": 1.6842666864395142,
"learning_rate": 4.179500420179011e-06,
"loss": 0.5911,
"step": 968
},
{
"epoch": 2.1629464285714284,
"grad_norm": 1.8204643726348877,
"learning_rate": 4.158983668250819e-06,
"loss": 0.5023,
"step": 969
},
{
"epoch": 2.1651785714285716,
"grad_norm": 1.692069172859192,
"learning_rate": 4.138504168115059e-06,
"loss": 0.4712,
"step": 970
},
{
"epoch": 2.1674107142857144,
"grad_norm": 1.6010735034942627,
"learning_rate": 4.11806205038224e-06,
"loss": 0.48,
"step": 971
},
{
"epoch": 2.169642857142857,
"grad_norm": 2.0540120601654053,
"learning_rate": 4.097657445424454e-06,
"loss": 0.565,
"step": 972
},
{
"epoch": 2.171875,
"grad_norm": 1.8458772897720337,
"learning_rate": 4.077290483374549e-06,
"loss": 0.4537,
"step": 973
},
{
"epoch": 2.174107142857143,
"grad_norm": 1.8870779275894165,
"learning_rate": 4.056961294125305e-06,
"loss": 0.5089,
"step": 974
},
{
"epoch": 2.1763392857142856,
"grad_norm": 1.9770042896270752,
"learning_rate": 4.0366700073286005e-06,
"loss": 0.548,
"step": 975
},
{
"epoch": 2.1785714285714284,
"grad_norm": 1.8909940719604492,
"learning_rate": 4.016416752394591e-06,
"loss": 0.6115,
"step": 976
},
{
"epoch": 2.1808035714285716,
"grad_norm": 1.887465000152588,
"learning_rate": 3.996201658490866e-06,
"loss": 0.5199,
"step": 977
},
{
"epoch": 2.1830357142857144,
"grad_norm": 1.808553695678711,
"learning_rate": 3.9760248545416465e-06,
"loss": 0.4737,
"step": 978
},
{
"epoch": 2.185267857142857,
"grad_norm": 1.8134347200393677,
"learning_rate": 3.955886469226967e-06,
"loss": 0.4988,
"step": 979
},
{
"epoch": 2.1875,
"grad_norm": 1.8288437128067017,
"learning_rate": 3.935786630981819e-06,
"loss": 0.4873,
"step": 980
},
{
"epoch": 2.189732142857143,
"grad_norm": 1.7920408248901367,
"learning_rate": 3.915725467995375e-06,
"loss": 0.5163,
"step": 981
},
{
"epoch": 2.1919642857142856,
"grad_norm": 1.6849050521850586,
"learning_rate": 3.895703108210135e-06,
"loss": 0.5308,
"step": 982
},
{
"epoch": 2.1941964285714284,
"grad_norm": 1.5930182933807373,
"learning_rate": 3.875719679321138e-06,
"loss": 0.4919,
"step": 983
},
{
"epoch": 2.1964285714285716,
"grad_norm": 1.9762846231460571,
"learning_rate": 3.8557753087751345e-06,
"loss": 0.5924,
"step": 984
},
{
"epoch": 2.1986607142857144,
"grad_norm": 1.726677417755127,
"learning_rate": 3.835870123769775e-06,
"loss": 0.5425,
"step": 985
},
{
"epoch": 2.200892857142857,
"grad_norm": 1.9238899946212769,
"learning_rate": 3.8160042512528e-06,
"loss": 0.5421,
"step": 986
},
{
"epoch": 2.203125,
"grad_norm": 2.012009620666504,
"learning_rate": 3.796177817921223e-06,
"loss": 0.5438,
"step": 987
},
{
"epoch": 2.205357142857143,
"grad_norm": 1.6114519834518433,
"learning_rate": 3.776390950220544e-06,
"loss": 0.5355,
"step": 988
},
{
"epoch": 2.2075892857142856,
"grad_norm": 1.7870115041732788,
"learning_rate": 3.756643774343913e-06,
"loss": 0.638,
"step": 989
},
{
"epoch": 2.2098214285714284,
"grad_norm": 1.8089219331741333,
"learning_rate": 3.7369364162313528e-06,
"loss": 0.5771,
"step": 990
},
{
"epoch": 2.2120535714285716,
"grad_norm": 1.7549517154693604,
"learning_rate": 3.7172690015689263e-06,
"loss": 0.5726,
"step": 991
},
{
"epoch": 2.2142857142857144,
"grad_norm": 1.7794311046600342,
"learning_rate": 3.6976416557879757e-06,
"loss": 0.5378,
"step": 992
},
{
"epoch": 2.216517857142857,
"grad_norm": 1.7363981008529663,
"learning_rate": 3.678054504064287e-06,
"loss": 0.4822,
"step": 993
},
{
"epoch": 2.21875,
"grad_norm": 1.8827056884765625,
"learning_rate": 3.658507671317296e-06,
"loss": 0.5609,
"step": 994
},
{
"epoch": 2.220982142857143,
"grad_norm": 1.858846664428711,
"learning_rate": 3.639001282209311e-06,
"loss": 0.5211,
"step": 995
},
{
"epoch": 2.2232142857142856,
"grad_norm": 1.877846360206604,
"learning_rate": 3.6195354611447033e-06,
"loss": 0.5076,
"step": 996
},
{
"epoch": 2.2254464285714284,
"grad_norm": 1.9064738750457764,
"learning_rate": 3.600110332269118e-06,
"loss": 0.4971,
"step": 997
},
{
"epoch": 2.2276785714285716,
"grad_norm": 1.8775546550750732,
"learning_rate": 3.580726019468671e-06,
"loss": 0.5283,
"step": 998
},
{
"epoch": 2.2299107142857144,
"grad_norm": 1.8626521825790405,
"learning_rate": 3.561382646369179e-06,
"loss": 0.5797,
"step": 999
},
{
"epoch": 2.232142857142857,
"grad_norm": 1.8749574422836304,
"learning_rate": 3.5420803363353604e-06,
"loss": 0.6058,
"step": 1000
},
{
"epoch": 2.234375,
"grad_norm": 1.8958619832992554,
"learning_rate": 3.5228192124700433e-06,
"loss": 0.5324,
"step": 1001
},
{
"epoch": 2.236607142857143,
"grad_norm": 2.2370924949645996,
"learning_rate": 3.503599397613394e-06,
"loss": 0.5212,
"step": 1002
},
{
"epoch": 2.2388392857142856,
"grad_norm": 1.7963013648986816,
"learning_rate": 3.4844210143421143e-06,
"loss": 0.5309,
"step": 1003
},
{
"epoch": 2.2410714285714284,
"grad_norm": 1.7290846109390259,
"learning_rate": 3.465284184968679e-06,
"loss": 0.5216,
"step": 1004
},
{
"epoch": 2.2433035714285716,
"grad_norm": 1.756559133529663,
"learning_rate": 3.4461890315405466e-06,
"loss": 0.5042,
"step": 1005
},
{
"epoch": 2.2455357142857144,
"grad_norm": 1.7828274965286255,
"learning_rate": 3.4271356758393827e-06,
"loss": 0.5008,
"step": 1006
},
{
"epoch": 2.247767857142857,
"grad_norm": 1.5559848546981812,
"learning_rate": 3.4081242393802847e-06,
"loss": 0.4937,
"step": 1007
},
{
"epoch": 2.25,
"grad_norm": 1.6797314882278442,
"learning_rate": 3.3891548434109942e-06,
"loss": 0.5747,
"step": 1008
},
{
"epoch": 2.25,
"eval_loss": 0.7400864362716675,
"eval_runtime": 64.4492,
"eval_samples_per_second": 1.133,
"eval_steps_per_second": 0.155,
"step": 1008
},
{
"epoch": 2.252232142857143,
"grad_norm": 1.8574589490890503,
"learning_rate": 3.3702276089111484e-06,
"loss": 0.5077,
"step": 1009
},
{
"epoch": 2.2544642857142856,
"grad_norm": 1.894984245300293,
"learning_rate": 3.3513426565914854e-06,
"loss": 0.57,
"step": 1010
},
{
"epoch": 2.2566964285714284,
"grad_norm": 1.81510591506958,
"learning_rate": 3.3325001068930917e-06,
"loss": 0.4895,
"step": 1011
},
{
"epoch": 2.2589285714285716,
"grad_norm": 1.8635365962982178,
"learning_rate": 3.3137000799866148e-06,
"loss": 0.4275,
"step": 1012
},
{
"epoch": 2.2611607142857144,
"grad_norm": 1.8582934141159058,
"learning_rate": 3.2949426957715157e-06,
"loss": 0.6211,
"step": 1013
},
{
"epoch": 2.263392857142857,
"grad_norm": 1.6376415491104126,
"learning_rate": 3.276228073875296e-06,
"loss": 0.5382,
"step": 1014
},
{
"epoch": 2.265625,
"grad_norm": 2.0684192180633545,
"learning_rate": 3.257556333652734e-06,
"loss": 0.4896,
"step": 1015
},
{
"epoch": 2.267857142857143,
"grad_norm": 2.0028772354125977,
"learning_rate": 3.238927594185127e-06,
"loss": 0.5456,
"step": 1016
},
{
"epoch": 2.2700892857142856,
"grad_norm": 1.8271785974502563,
"learning_rate": 3.2203419742795237e-06,
"loss": 0.5187,
"step": 1017
},
{
"epoch": 2.2723214285714284,
"grad_norm": 1.8357112407684326,
"learning_rate": 3.201799592467978e-06,
"loss": 0.5304,
"step": 1018
},
{
"epoch": 2.2745535714285716,
"grad_norm": 2.119241237640381,
"learning_rate": 3.1833005670067874e-06,
"loss": 0.55,
"step": 1019
},
{
"epoch": 2.2767857142857144,
"grad_norm": 1.683974027633667,
"learning_rate": 3.1648450158757373e-06,
"loss": 0.5021,
"step": 1020
},
{
"epoch": 2.279017857142857,
"grad_norm": 1.9696073532104492,
"learning_rate": 3.146433056777355e-06,
"loss": 0.53,
"step": 1021
},
{
"epoch": 2.28125,
"grad_norm": 1.8277369737625122,
"learning_rate": 3.128064807136142e-06,
"loss": 0.4263,
"step": 1022
},
{
"epoch": 2.283482142857143,
"grad_norm": 1.6111699342727661,
"learning_rate": 3.10974038409785e-06,
"loss": 0.4166,
"step": 1023
},
{
"epoch": 2.2857142857142856,
"grad_norm": 1.9450246095657349,
"learning_rate": 3.0914599045287165e-06,
"loss": 0.5023,
"step": 1024
},
{
"epoch": 2.2879464285714284,
"grad_norm": 2.1073365211486816,
"learning_rate": 3.073223485014727e-06,
"loss": 0.5267,
"step": 1025
},
{
"epoch": 2.2901785714285716,
"grad_norm": 1.8855870962142944,
"learning_rate": 3.0550312418608617e-06,
"loss": 0.557,
"step": 1026
},
{
"epoch": 2.2924107142857144,
"grad_norm": 1.6264270544052124,
"learning_rate": 3.0368832910903625e-06,
"loss": 0.4561,
"step": 1027
},
{
"epoch": 2.294642857142857,
"grad_norm": 2.0912892818450928,
"learning_rate": 3.018779748444005e-06,
"loss": 0.6132,
"step": 1028
},
{
"epoch": 2.296875,
"grad_norm": 1.741155982017517,
"learning_rate": 3.000720729379326e-06,
"loss": 0.4242,
"step": 1029
},
{
"epoch": 2.299107142857143,
"grad_norm": 1.8275063037872314,
"learning_rate": 2.9827063490699225e-06,
"loss": 0.4824,
"step": 1030
},
{
"epoch": 2.3013392857142856,
"grad_norm": 1.7157033681869507,
"learning_rate": 2.9647367224046884e-06,
"loss": 0.5407,
"step": 1031
},
{
"epoch": 2.3035714285714284,
"grad_norm": 1.880632996559143,
"learning_rate": 2.9468119639871163e-06,
"loss": 0.4596,
"step": 1032
},
{
"epoch": 2.3058035714285716,
"grad_norm": 1.8524107933044434,
"learning_rate": 2.9289321881345257e-06,
"loss": 0.5019,
"step": 1033
},
{
"epoch": 2.3080357142857144,
"grad_norm": 1.9338825941085815,
"learning_rate": 2.911097508877365e-06,
"loss": 0.5025,
"step": 1034
},
{
"epoch": 2.310267857142857,
"grad_norm": 1.640730857849121,
"learning_rate": 2.8933080399584757e-06,
"loss": 0.5657,
"step": 1035
},
{
"epoch": 2.3125,
"grad_norm": 2.100184917449951,
"learning_rate": 2.8755638948323494e-06,
"loss": 0.5444,
"step": 1036
},
{
"epoch": 2.314732142857143,
"grad_norm": 1.7842200994491577,
"learning_rate": 2.8578651866644447e-06,
"loss": 0.5173,
"step": 1037
},
{
"epoch": 2.3169642857142856,
"grad_norm": 1.8273719549179077,
"learning_rate": 2.840212028330418e-06,
"loss": 0.4315,
"step": 1038
},
{
"epoch": 2.3191964285714284,
"grad_norm": 1.823228120803833,
"learning_rate": 2.8226045324154394e-06,
"loss": 0.4966,
"step": 1039
},
{
"epoch": 2.3214285714285716,
"grad_norm": 1.7970882654190063,
"learning_rate": 2.8050428112134474e-06,
"loss": 0.5744,
"step": 1040
},
{
"epoch": 2.3236607142857144,
"grad_norm": 1.9154415130615234,
"learning_rate": 2.7875269767264667e-06,
"loss": 0.498,
"step": 1041
},
{
"epoch": 2.325892857142857,
"grad_norm": 1.6169114112854004,
"learning_rate": 2.7700571406638633e-06,
"loss": 0.452,
"step": 1042
},
{
"epoch": 2.328125,
"grad_norm": 1.7965830564498901,
"learning_rate": 2.7526334144416345e-06,
"loss": 0.4517,
"step": 1043
},
{
"epoch": 2.330357142857143,
"grad_norm": 1.9453188180923462,
"learning_rate": 2.735255909181719e-06,
"loss": 0.4696,
"step": 1044
},
{
"epoch": 2.3325892857142856,
"grad_norm": 1.6845883131027222,
"learning_rate": 2.7179247357112704e-06,
"loss": 0.543,
"step": 1045
},
{
"epoch": 2.3348214285714284,
"grad_norm": 1.776888132095337,
"learning_rate": 2.7006400045619597e-06,
"loss": 0.4481,
"step": 1046
},
{
"epoch": 2.3370535714285716,
"grad_norm": 2.1295015811920166,
"learning_rate": 2.6834018259692574e-06,
"loss": 0.5429,
"step": 1047
},
{
"epoch": 2.3392857142857144,
"grad_norm": 1.7422338724136353,
"learning_rate": 2.6662103098717485e-06,
"loss": 0.5375,
"step": 1048
},
{
"epoch": 2.341517857142857,
"grad_norm": 1.8716145753860474,
"learning_rate": 2.649065565910419e-06,
"loss": 0.5372,
"step": 1049
},
{
"epoch": 2.34375,
"grad_norm": 1.8629887104034424,
"learning_rate": 2.631967703427959e-06,
"loss": 0.4969,
"step": 1050
},
{
"epoch": 2.345982142857143,
"grad_norm": 1.9818273782730103,
"learning_rate": 2.6149168314680707e-06,
"loss": 0.4883,
"step": 1051
},
{
"epoch": 2.3482142857142856,
"grad_norm": 1.8726003170013428,
"learning_rate": 2.597913058774758e-06,
"loss": 0.5668,
"step": 1052
},
{
"epoch": 2.3504464285714284,
"grad_norm": 1.701008677482605,
"learning_rate": 2.5809564937916543e-06,
"loss": 0.5212,
"step": 1053
},
{
"epoch": 2.3526785714285716,
"grad_norm": 1.8910822868347168,
"learning_rate": 2.564047244661316e-06,
"loss": 0.5219,
"step": 1054
},
{
"epoch": 2.3549107142857144,
"grad_norm": 1.9475998878479004,
"learning_rate": 2.547185419224537e-06,
"loss": 0.5165,
"step": 1055
},
{
"epoch": 2.357142857142857,
"grad_norm": 1.6685254573822021,
"learning_rate": 2.530371125019664e-06,
"loss": 0.4789,
"step": 1056
},
{
"epoch": 2.359375,
"grad_norm": 2.028895139694214,
"learning_rate": 2.513604469281897e-06,
"loss": 0.531,
"step": 1057
},
{
"epoch": 2.361607142857143,
"grad_norm": 1.9576796293258667,
"learning_rate": 2.4968855589426288e-06,
"loss": 0.548,
"step": 1058
},
{
"epoch": 2.3638392857142856,
"grad_norm": 1.756537675857544,
"learning_rate": 2.4802145006287425e-06,
"loss": 0.4646,
"step": 1059
},
{
"epoch": 2.3660714285714284,
"grad_norm": 1.7996472120285034,
"learning_rate": 2.4635914006619454e-06,
"loss": 0.4925,
"step": 1060
},
{
"epoch": 2.3683035714285716,
"grad_norm": 2.11858868598938,
"learning_rate": 2.4470163650580747e-06,
"loss": 0.6414,
"step": 1061
},
{
"epoch": 2.3705357142857144,
"grad_norm": 1.947381854057312,
"learning_rate": 2.430489499526438e-06,
"loss": 0.5795,
"step": 1062
},
{
"epoch": 2.372767857142857,
"grad_norm": 1.8046473264694214,
"learning_rate": 2.414010909469133e-06,
"loss": 0.4544,
"step": 1063
},
{
"epoch": 2.375,
"grad_norm": 1.7068886756896973,
"learning_rate": 2.3975806999803717e-06,
"loss": 0.5324,
"step": 1064
},
{
"epoch": 2.377232142857143,
"grad_norm": 1.5860605239868164,
"learning_rate": 2.38119897584582e-06,
"loss": 0.4809,
"step": 1065
},
{
"epoch": 2.3794642857142856,
"grad_norm": 1.935868501663208,
"learning_rate": 2.364865841541908e-06,
"loss": 0.5075,
"step": 1066
},
{
"epoch": 2.3816964285714284,
"grad_norm": 1.793942928314209,
"learning_rate": 2.3485814012351914e-06,
"loss": 0.5324,
"step": 1067
},
{
"epoch": 2.3839285714285716,
"grad_norm": 1.7716658115386963,
"learning_rate": 2.33234575878167e-06,
"loss": 0.5702,
"step": 1068
},
{
"epoch": 2.3861607142857144,
"grad_norm": 1.8575302362442017,
"learning_rate": 2.3161590177261294e-06,
"loss": 0.4541,
"step": 1069
},
{
"epoch": 2.388392857142857,
"grad_norm": 1.6426512002944946,
"learning_rate": 2.300021281301483e-06,
"loss": 0.4476,
"step": 1070
},
{
"epoch": 2.390625,
"grad_norm": 1.7291110754013062,
"learning_rate": 2.2839326524281037e-06,
"loss": 0.508,
"step": 1071
},
{
"epoch": 2.392857142857143,
"grad_norm": 1.7146512269973755,
"learning_rate": 2.267893233713182e-06,
"loss": 0.5586,
"step": 1072
},
{
"epoch": 2.3950892857142856,
"grad_norm": 1.4405806064605713,
"learning_rate": 2.2519031274500625e-06,
"loss": 0.4264,
"step": 1073
},
{
"epoch": 2.3973214285714284,
"grad_norm": 1.8408970832824707,
"learning_rate": 2.235962435617596e-06,
"loss": 0.4864,
"step": 1074
},
{
"epoch": 2.3995535714285716,
"grad_norm": 1.5550113916397095,
"learning_rate": 2.2200712598794804e-06,
"loss": 0.4476,
"step": 1075
},
{
"epoch": 2.4017857142857144,
"grad_norm": 1.8365901708602905,
"learning_rate": 2.204229701583621e-06,
"loss": 0.5003,
"step": 1076
},
{
"epoch": 2.404017857142857,
"grad_norm": 1.6341036558151245,
"learning_rate": 2.1884378617614933e-06,
"loss": 0.3824,
"step": 1077
},
{
"epoch": 2.40625,
"grad_norm": 1.920911431312561,
"learning_rate": 2.172695841127468e-06,
"loss": 0.5231,
"step": 1078
},
{
"epoch": 2.408482142857143,
"grad_norm": 2.252639055252075,
"learning_rate": 2.157003740078203e-06,
"loss": 0.5236,
"step": 1079
},
{
"epoch": 2.4107142857142856,
"grad_norm": 1.8135790824890137,
"learning_rate": 2.141361658691975e-06,
"loss": 0.5173,
"step": 1080
},
{
"epoch": 2.4129464285714284,
"grad_norm": 2.1770637035369873,
"learning_rate": 2.1257696967280716e-06,
"loss": 0.5397,
"step": 1081
},
{
"epoch": 2.4151785714285716,
"grad_norm": 1.7043235301971436,
"learning_rate": 2.1102279536261193e-06,
"loss": 0.5,
"step": 1082
},
{
"epoch": 2.4174107142857144,
"grad_norm": 1.6420832872390747,
"learning_rate": 2.09473652850548e-06,
"loss": 0.4963,
"step": 1083
},
{
"epoch": 2.419642857142857,
"grad_norm": 1.8134890794754028,
"learning_rate": 2.0792955201646005e-06,
"loss": 0.4707,
"step": 1084
},
{
"epoch": 2.421875,
"grad_norm": 1.9724851846694946,
"learning_rate": 2.063905027080392e-06,
"loss": 0.5633,
"step": 1085
},
{
"epoch": 2.424107142857143,
"grad_norm": 1.650792121887207,
"learning_rate": 2.0485651474075987e-06,
"loss": 0.5001,
"step": 1086
},
{
"epoch": 2.4263392857142856,
"grad_norm": 1.668756127357483,
"learning_rate": 2.033275978978164e-06,
"loss": 0.513,
"step": 1087
},
{
"epoch": 2.4285714285714284,
"grad_norm": 1.710158348083496,
"learning_rate": 2.018037619300628e-06,
"loss": 0.4764,
"step": 1088
},
{
"epoch": 2.4308035714285716,
"grad_norm": 1.7297494411468506,
"learning_rate": 2.0028501655594736e-06,
"loss": 0.5412,
"step": 1089
},
{
"epoch": 2.4330357142857144,
"grad_norm": 1.5992000102996826,
"learning_rate": 1.987713714614543e-06,
"loss": 0.4284,
"step": 1090
},
{
"epoch": 2.435267857142857,
"grad_norm": 1.7302836179733276,
"learning_rate": 1.972628363000396e-06,
"loss": 0.4839,
"step": 1091
},
{
"epoch": 2.4375,
"grad_norm": 1.7639312744140625,
"learning_rate": 1.9575942069256914e-06,
"loss": 0.4923,
"step": 1092
},
{
"epoch": 2.439732142857143,
"grad_norm": 2.4808318614959717,
"learning_rate": 1.942611342272591e-06,
"loss": 0.561,
"step": 1093
},
{
"epoch": 2.4419642857142856,
"grad_norm": 1.8226264715194702,
"learning_rate": 1.9276798645961392e-06,
"loss": 0.5164,
"step": 1094
},
{
"epoch": 2.4441964285714284,
"grad_norm": 1.6818956136703491,
"learning_rate": 1.9127998691236537e-06,
"loss": 0.4326,
"step": 1095
},
{
"epoch": 2.4464285714285716,
"grad_norm": 1.865804672241211,
"learning_rate": 1.8979714507541103e-06,
"loss": 0.5218,
"step": 1096
},
{
"epoch": 2.4486607142857144,
"grad_norm": 1.5961390733718872,
"learning_rate": 1.883194704057556e-06,
"loss": 0.4788,
"step": 1097
},
{
"epoch": 2.450892857142857,
"grad_norm": 1.5959690809249878,
"learning_rate": 1.8684697232744886e-06,
"loss": 0.5347,
"step": 1098
},
{
"epoch": 2.453125,
"grad_norm": 2.1117734909057617,
"learning_rate": 1.8537966023152664e-06,
"loss": 0.4431,
"step": 1099
},
{
"epoch": 2.455357142857143,
"grad_norm": 2.0048511028289795,
"learning_rate": 1.839175434759507e-06,
"loss": 0.5143,
"step": 1100
},
{
"epoch": 2.4575892857142856,
"grad_norm": 1.9185304641723633,
"learning_rate": 1.8246063138554793e-06,
"loss": 0.5521,
"step": 1101
},
{
"epoch": 2.4598214285714284,
"grad_norm": 1.8508011102676392,
"learning_rate": 1.810089332519528e-06,
"loss": 0.5806,
"step": 1102
},
{
"epoch": 2.4620535714285716,
"grad_norm": 1.7387700080871582,
"learning_rate": 1.795624583335467e-06,
"loss": 0.4702,
"step": 1103
},
{
"epoch": 2.4642857142857144,
"grad_norm": 1.8409698009490967,
"learning_rate": 1.7812121585539964e-06,
"loss": 0.5616,
"step": 1104
},
{
"epoch": 2.466517857142857,
"grad_norm": 1.5656404495239258,
"learning_rate": 1.7668521500921098e-06,
"loss": 0.4918,
"step": 1105
},
{
"epoch": 2.46875,
"grad_norm": 2.4327805042266846,
"learning_rate": 1.7525446495325038e-06,
"loss": 0.6054,
"step": 1106
},
{
"epoch": 2.470982142857143,
"grad_norm": 1.7695860862731934,
"learning_rate": 1.7382897481230076e-06,
"loss": 0.4232,
"step": 1107
},
{
"epoch": 2.4732142857142856,
"grad_norm": 1.8793418407440186,
"learning_rate": 1.7240875367759902e-06,
"loss": 0.5547,
"step": 1108
},
{
"epoch": 2.4754464285714284,
"grad_norm": 1.9762465953826904,
"learning_rate": 1.7099381060677833e-06,
"loss": 0.5693,
"step": 1109
},
{
"epoch": 2.4776785714285716,
"grad_norm": 1.4983291625976562,
"learning_rate": 1.6958415462380983e-06,
"loss": 0.4565,
"step": 1110
},
{
"epoch": 2.4799107142857144,
"grad_norm": 1.9047200679779053,
"learning_rate": 1.6817979471894641e-06,
"loss": 0.4563,
"step": 1111
},
{
"epoch": 2.482142857142857,
"grad_norm": 1.9837721586227417,
"learning_rate": 1.6678073984866438e-06,
"loss": 0.5958,
"step": 1112
},
{
"epoch": 2.484375,
"grad_norm": 2.0123696327209473,
"learning_rate": 1.6538699893560618e-06,
"loss": 0.5114,
"step": 1113
},
{
"epoch": 2.486607142857143,
"grad_norm": 1.916115164756775,
"learning_rate": 1.639985808685245e-06,
"loss": 0.5507,
"step": 1114
},
{
"epoch": 2.4888392857142856,
"grad_norm": 2.219719648361206,
"learning_rate": 1.6261549450222392e-06,
"loss": 0.57,
"step": 1115
},
{
"epoch": 2.4910714285714284,
"grad_norm": 1.7933136224746704,
"learning_rate": 1.6123774865750607e-06,
"loss": 0.4511,
"step": 1116
},
{
"epoch": 2.4933035714285716,
"grad_norm": 1.8515256643295288,
"learning_rate": 1.5986535212111353e-06,
"loss": 0.487,
"step": 1117
},
{
"epoch": 2.4955357142857144,
"grad_norm": 1.9687520265579224,
"learning_rate": 1.5849831364567137e-06,
"loss": 0.4977,
"step": 1118
},
{
"epoch": 2.497767857142857,
"grad_norm": 2.062831401824951,
"learning_rate": 1.571366419496344e-06,
"loss": 0.5189,
"step": 1119
},
{
"epoch": 2.5,
"grad_norm": 1.8966569900512695,
"learning_rate": 1.5578034571722879e-06,
"loss": 0.5549,
"step": 1120
},
{
"epoch": 2.5,
"eval_loss": 0.7405146360397339,
"eval_runtime": 48.6921,
"eval_samples_per_second": 1.499,
"eval_steps_per_second": 0.205,
"step": 1120
},
{
"epoch": 2.502232142857143,
"grad_norm": 1.9809284210205078,
"learning_rate": 1.5442943359839978e-06,
"loss": 0.5187,
"step": 1121
},
{
"epoch": 2.5044642857142856,
"grad_norm": 2.1152169704437256,
"learning_rate": 1.5308391420875312e-06,
"loss": 0.5828,
"step": 1122
},
{
"epoch": 2.506696428571429,
"grad_norm": 1.8229962587356567,
"learning_rate": 1.5174379612950273e-06,
"loss": 0.492,
"step": 1123
},
{
"epoch": 2.508928571428571,
"grad_norm": 1.861548900604248,
"learning_rate": 1.5040908790741448e-06,
"loss": 0.523,
"step": 1124
},
{
"epoch": 2.5111607142857144,
"grad_norm": 1.6094368696212769,
"learning_rate": 1.490797980547528e-06,
"loss": 0.4725,
"step": 1125
},
{
"epoch": 2.513392857142857,
"grad_norm": 2.0476646423339844,
"learning_rate": 1.4775593504922547e-06,
"loss": 0.4939,
"step": 1126
},
{
"epoch": 2.515625,
"grad_norm": 1.6590877771377563,
"learning_rate": 1.4643750733392958e-06,
"loss": 0.4955,
"step": 1127
},
{
"epoch": 2.517857142857143,
"grad_norm": 1.984636664390564,
"learning_rate": 1.4512452331729864e-06,
"loss": 0.5275,
"step": 1128
},
{
"epoch": 2.5200892857142856,
"grad_norm": 1.7271418571472168,
"learning_rate": 1.438169913730475e-06,
"loss": 0.5375,
"step": 1129
},
{
"epoch": 2.522321428571429,
"grad_norm": 1.6943548917770386,
"learning_rate": 1.4251491984012089e-06,
"loss": 0.4983,
"step": 1130
},
{
"epoch": 2.524553571428571,
"grad_norm": 1.9767521619796753,
"learning_rate": 1.4121831702263833e-06,
"loss": 0.5568,
"step": 1131
},
{
"epoch": 2.5267857142857144,
"grad_norm": 1.9684196710586548,
"learning_rate": 1.3992719118984167e-06,
"loss": 0.5328,
"step": 1132
},
{
"epoch": 2.529017857142857,
"grad_norm": 1.8626347780227661,
"learning_rate": 1.3864155057604323e-06,
"loss": 0.5693,
"step": 1133
},
{
"epoch": 2.53125,
"grad_norm": 1.8821558952331543,
"learning_rate": 1.3736140338057247e-06,
"loss": 0.6132,
"step": 1134
},
{
"epoch": 2.533482142857143,
"grad_norm": 1.952671766281128,
"learning_rate": 1.3608675776772428e-06,
"loss": 0.6163,
"step": 1135
},
{
"epoch": 2.5357142857142856,
"grad_norm": 1.8021756410598755,
"learning_rate": 1.3481762186670556e-06,
"loss": 0.5465,
"step": 1136
},
{
"epoch": 2.537946428571429,
"grad_norm": 1.8865597248077393,
"learning_rate": 1.335540037715851e-06,
"loss": 0.5889,
"step": 1137
},
{
"epoch": 2.540178571428571,
"grad_norm": 1.9427474737167358,
"learning_rate": 1.3229591154124132e-06,
"loss": 0.5255,
"step": 1138
},
{
"epoch": 2.5424107142857144,
"grad_norm": 1.6922953128814697,
"learning_rate": 1.310433531993104e-06,
"loss": 0.5588,
"step": 1139
},
{
"epoch": 2.544642857142857,
"grad_norm": 1.7958623170852661,
"learning_rate": 1.2979633673413571e-06,
"loss": 0.461,
"step": 1140
},
{
"epoch": 2.546875,
"grad_norm": 1.800321340560913,
"learning_rate": 1.2855487009871615e-06,
"loss": 0.5071,
"step": 1141
},
{
"epoch": 2.549107142857143,
"grad_norm": 1.9416985511779785,
"learning_rate": 1.2731896121065645e-06,
"loss": 0.5693,
"step": 1142
},
{
"epoch": 2.5513392857142856,
"grad_norm": 1.8170133829116821,
"learning_rate": 1.2608861795211601e-06,
"loss": 0.5963,
"step": 1143
},
{
"epoch": 2.553571428571429,
"grad_norm": 1.5669772624969482,
"learning_rate": 1.248638481697586e-06,
"loss": 0.4923,
"step": 1144
},
{
"epoch": 2.555803571428571,
"grad_norm": 2.1065783500671387,
"learning_rate": 1.2364465967470284e-06,
"loss": 0.5503,
"step": 1145
},
{
"epoch": 2.5580357142857144,
"grad_norm": 1.905808925628662,
"learning_rate": 1.224310602424712e-06,
"loss": 0.4643,
"step": 1146
},
{
"epoch": 2.560267857142857,
"grad_norm": 1.917167067527771,
"learning_rate": 1.2122305761294196e-06,
"loss": 0.5424,
"step": 1147
},
{
"epoch": 2.5625,
"grad_norm": 1.7322640419006348,
"learning_rate": 1.2002065949029896e-06,
"loss": 0.4311,
"step": 1148
},
{
"epoch": 2.564732142857143,
"grad_norm": 1.6713980436325073,
"learning_rate": 1.1882387354298264e-06,
"loss": 0.4733,
"step": 1149
},
{
"epoch": 2.5669642857142856,
"grad_norm": 1.913543701171875,
"learning_rate": 1.1763270740364074e-06,
"loss": 0.4587,
"step": 1150
},
{
"epoch": 2.569196428571429,
"grad_norm": 1.911083698272705,
"learning_rate": 1.1644716866908035e-06,
"loss": 0.6257,
"step": 1151
},
{
"epoch": 2.571428571428571,
"grad_norm": 1.8229786157608032,
"learning_rate": 1.15267264900219e-06,
"loss": 0.4281,
"step": 1152
},
{
"epoch": 2.5736607142857144,
"grad_norm": 1.7414361238479614,
"learning_rate": 1.1409300362203667e-06,
"loss": 0.5365,
"step": 1153
},
{
"epoch": 2.575892857142857,
"grad_norm": 1.843278169631958,
"learning_rate": 1.1292439232352781e-06,
"loss": 0.5109,
"step": 1154
},
{
"epoch": 2.578125,
"grad_norm": 1.76763117313385,
"learning_rate": 1.1176143845765253e-06,
"loss": 0.4557,
"step": 1155
},
{
"epoch": 2.580357142857143,
"grad_norm": 1.6767909526824951,
"learning_rate": 1.1060414944129106e-06,
"loss": 0.4849,
"step": 1156
},
{
"epoch": 2.5825892857142856,
"grad_norm": 1.6554889678955078,
"learning_rate": 1.0945253265519472e-06,
"loss": 0.4961,
"step": 1157
},
{
"epoch": 2.584821428571429,
"grad_norm": 2.0062315464019775,
"learning_rate": 1.0830659544393996e-06,
"loss": 0.5364,
"step": 1158
},
{
"epoch": 2.587053571428571,
"grad_norm": 1.7251535654067993,
"learning_rate": 1.0716634511588076e-06,
"loss": 0.47,
"step": 1159
},
{
"epoch": 2.5892857142857144,
"grad_norm": 1.71232008934021,
"learning_rate": 1.0603178894310185e-06,
"loss": 0.5485,
"step": 1160
},
{
"epoch": 2.591517857142857,
"grad_norm": 1.9494444131851196,
"learning_rate": 1.0490293416137409e-06,
"loss": 0.5122,
"step": 1161
},
{
"epoch": 2.59375,
"grad_norm": 1.5578092336654663,
"learning_rate": 1.0377978797010558e-06,
"loss": 0.4307,
"step": 1162
},
{
"epoch": 2.595982142857143,
"grad_norm": 1.9168074131011963,
"learning_rate": 1.0266235753229825e-06,
"loss": 0.6464,
"step": 1163
},
{
"epoch": 2.5982142857142856,
"grad_norm": 1.903611660003662,
"learning_rate": 1.0155064997450026e-06,
"loss": 0.5234,
"step": 1164
},
{
"epoch": 2.600446428571429,
"grad_norm": 1.7924622297286987,
"learning_rate": 1.004446723867618e-06,
"loss": 0.5628,
"step": 1165
},
{
"epoch": 2.602678571428571,
"grad_norm": 2.0109128952026367,
"learning_rate": 9.934443182259023e-07,
"loss": 0.5824,
"step": 1166
},
{
"epoch": 2.6049107142857144,
"grad_norm": 1.718166470527649,
"learning_rate": 9.824993529890303e-07,
"loss": 0.499,
"step": 1167
},
{
"epoch": 2.607142857142857,
"grad_norm": 1.6572740077972412,
"learning_rate": 9.716118979598533e-07,
"loss": 0.5359,
"step": 1168
},
{
"epoch": 2.609375,
"grad_norm": 1.667127251625061,
"learning_rate": 9.607820225744346e-07,
"loss": 0.482,
"step": 1169
},
{
"epoch": 2.611607142857143,
"grad_norm": 1.948585867881775,
"learning_rate": 9.500097959016297e-07,
"loss": 0.5495,
"step": 1170
},
{
"epoch": 2.6138392857142856,
"grad_norm": 1.6463160514831543,
"learning_rate": 9.392952866426198e-07,
"loss": 0.5428,
"step": 1171
},
{
"epoch": 2.616071428571429,
"grad_norm": 1.8283066749572754,
"learning_rate": 9.286385631304939e-07,
"loss": 0.5095,
"step": 1172
},
{
"epoch": 2.618303571428571,
"grad_norm": 1.8753612041473389,
"learning_rate": 9.180396933298019e-07,
"loss": 0.5784,
"step": 1173
},
{
"epoch": 2.6205357142857144,
"grad_norm": 1.831152319908142,
"learning_rate": 9.074987448361261e-07,
"loss": 0.6219,
"step": 1174
},
{
"epoch": 2.622767857142857,
"grad_norm": 1.857530951499939,
"learning_rate": 8.970157848756511e-07,
"loss": 0.4694,
"step": 1175
},
{
"epoch": 2.625,
"grad_norm": 1.9244414567947388,
"learning_rate": 8.865908803047241e-07,
"loss": 0.6059,
"step": 1176
},
{
"epoch": 2.627232142857143,
"grad_norm": 2.004091262817383,
"learning_rate": 8.762240976094461e-07,
"loss": 0.539,
"step": 1177
},
{
"epoch": 2.6294642857142856,
"grad_norm": 1.735284686088562,
"learning_rate": 8.659155029052346e-07,
"loss": 0.4928,
"step": 1178
},
{
"epoch": 2.631696428571429,
"grad_norm": 1.8576728105545044,
"learning_rate": 8.556651619364065e-07,
"loss": 0.4546,
"step": 1179
},
{
"epoch": 2.633928571428571,
"grad_norm": 1.9069209098815918,
"learning_rate": 8.454731400757599e-07,
"loss": 0.4937,
"step": 1180
},
{
"epoch": 2.6361607142857144,
"grad_norm": 1.8208372592926025,
"learning_rate": 8.353395023241528e-07,
"loss": 0.4555,
"step": 1181
},
{
"epoch": 2.638392857142857,
"grad_norm": 1.4398638010025024,
"learning_rate": 8.252643133100935e-07,
"loss": 0.4341,
"step": 1182
},
{
"epoch": 2.640625,
"grad_norm": 1.7023489475250244,
"learning_rate": 8.152476372893259e-07,
"loss": 0.5228,
"step": 1183
},
{
"epoch": 2.642857142857143,
"grad_norm": 1.8599404096603394,
"learning_rate": 8.052895381444226e-07,
"loss": 0.4926,
"step": 1184
},
{
"epoch": 2.6450892857142856,
"grad_norm": 1.6925806999206543,
"learning_rate": 7.953900793843694e-07,
"loss": 0.4984,
"step": 1185
},
{
"epoch": 2.647321428571429,
"grad_norm": 1.822161316871643,
"learning_rate": 7.855493241441692e-07,
"loss": 0.4369,
"step": 1186
},
{
"epoch": 2.649553571428571,
"grad_norm": 1.9494961500167847,
"learning_rate": 7.757673351844386e-07,
"loss": 0.5383,
"step": 1187
},
{
"epoch": 2.6517857142857144,
"grad_norm": 1.6641318798065186,
"learning_rate": 7.660441748909997e-07,
"loss": 0.52,
"step": 1188
},
{
"epoch": 2.654017857142857,
"grad_norm": 1.6094475984573364,
"learning_rate": 7.563799052744947e-07,
"loss": 0.4805,
"step": 1189
},
{
"epoch": 2.65625,
"grad_norm": 1.6435906887054443,
"learning_rate": 7.46774587969975e-07,
"loss": 0.5334,
"step": 1190
},
{
"epoch": 2.658482142857143,
"grad_norm": 1.7535940408706665,
"learning_rate": 7.372282842365208e-07,
"loss": 0.442,
"step": 1191
},
{
"epoch": 2.6607142857142856,
"grad_norm": 2.087963104248047,
"learning_rate": 7.277410549568476e-07,
"loss": 0.6131,
"step": 1192
},
{
"epoch": 2.662946428571429,
"grad_norm": 1.6514884233474731,
"learning_rate": 7.183129606369133e-07,
"loss": 0.5286,
"step": 1193
},
{
"epoch": 2.665178571428571,
"grad_norm": 1.7809820175170898,
"learning_rate": 7.089440614055398e-07,
"loss": 0.4577,
"step": 1194
},
{
"epoch": 2.6674107142857144,
"grad_norm": 1.8051931858062744,
"learning_rate": 6.996344170140168e-07,
"loss": 0.5563,
"step": 1195
},
{
"epoch": 2.669642857142857,
"grad_norm": 1.717929482460022,
"learning_rate": 6.903840868357382e-07,
"loss": 0.4968,
"step": 1196
},
{
"epoch": 2.671875,
"grad_norm": 1.920330286026001,
"learning_rate": 6.811931298658092e-07,
"loss": 0.4726,
"step": 1197
},
{
"epoch": 2.674107142857143,
"grad_norm": 1.5852843523025513,
"learning_rate": 6.720616047206774e-07,
"loss": 0.5563,
"step": 1198
},
{
"epoch": 2.6763392857142856,
"grad_norm": 2.0212185382843018,
"learning_rate": 6.62989569637752e-07,
"loss": 0.5112,
"step": 1199
},
{
"epoch": 2.678571428571429,
"grad_norm": 1.5889393091201782,
"learning_rate": 6.539770824750447e-07,
"loss": 0.4565,
"step": 1200
},
{
"epoch": 2.680803571428571,
"grad_norm": 1.6706169843673706,
"learning_rate": 6.450242007107865e-07,
"loss": 0.4681,
"step": 1201
},
{
"epoch": 2.6830357142857144,
"grad_norm": 2.0316851139068604,
"learning_rate": 6.361309814430727e-07,
"loss": 0.5109,
"step": 1202
},
{
"epoch": 2.685267857142857,
"grad_norm": 1.3804383277893066,
"learning_rate": 6.272974813894905e-07,
"loss": 0.3889,
"step": 1203
},
{
"epoch": 2.6875,
"grad_norm": 1.4953737258911133,
"learning_rate": 6.185237568867597e-07,
"loss": 0.4755,
"step": 1204
},
{
"epoch": 2.689732142857143,
"grad_norm": 1.5238755941390991,
"learning_rate": 6.098098638903771e-07,
"loss": 0.4694,
"step": 1205
},
{
"epoch": 2.6919642857142856,
"grad_norm": 1.6073217391967773,
"learning_rate": 6.01155857974256e-07,
"loss": 0.4265,
"step": 1206
},
{
"epoch": 2.694196428571429,
"grad_norm": 1.841497778892517,
"learning_rate": 5.925617943303719e-07,
"loss": 0.5209,
"step": 1207
},
{
"epoch": 2.696428571428571,
"grad_norm": 1.7118959426879883,
"learning_rate": 5.840277277684136e-07,
"loss": 0.4698,
"step": 1208
},
{
"epoch": 2.6986607142857144,
"grad_norm": 1.6239268779754639,
"learning_rate": 5.755537127154231e-07,
"loss": 0.5341,
"step": 1209
},
{
"epoch": 2.700892857142857,
"grad_norm": 1.7772884368896484,
"learning_rate": 5.671398032154707e-07,
"loss": 0.4857,
"step": 1210
},
{
"epoch": 2.703125,
"grad_norm": 1.6045022010803223,
"learning_rate": 5.58786052929281e-07,
"loss": 0.5097,
"step": 1211
},
{
"epoch": 2.705357142857143,
"grad_norm": 1.5222342014312744,
"learning_rate": 5.504925151339191e-07,
"loss": 0.4765,
"step": 1212
},
{
"epoch": 2.7075892857142856,
"grad_norm": 2.022216558456421,
"learning_rate": 5.422592427224239e-07,
"loss": 0.5601,
"step": 1213
},
{
"epoch": 2.709821428571429,
"grad_norm": 1.724923014640808,
"learning_rate": 5.340862882034992e-07,
"loss": 0.4478,
"step": 1214
},
{
"epoch": 2.712053571428571,
"grad_norm": 1.814286470413208,
"learning_rate": 5.259737037011547e-07,
"loss": 0.5301,
"step": 1215
},
{
"epoch": 2.7142857142857144,
"grad_norm": 1.8270453214645386,
"learning_rate": 5.179215409543848e-07,
"loss": 0.5095,
"step": 1216
},
{
"epoch": 2.716517857142857,
"grad_norm": 2.0552830696105957,
"learning_rate": 5.099298513168382e-07,
"loss": 0.5872,
"step": 1217
},
{
"epoch": 2.71875,
"grad_norm": 1.6828259229660034,
"learning_rate": 5.01998685756484e-07,
"loss": 0.4899,
"step": 1218
},
{
"epoch": 2.720982142857143,
"grad_norm": 1.8285223245620728,
"learning_rate": 4.941280948553018e-07,
"loss": 0.5651,
"step": 1219
},
{
"epoch": 2.7232142857142856,
"grad_norm": 1.8519248962402344,
"learning_rate": 4.863181288089391e-07,
"loss": 0.5844,
"step": 1220
},
{
"epoch": 2.725446428571429,
"grad_norm": 1.867247223854065,
"learning_rate": 4.785688374264053e-07,
"loss": 0.5344,
"step": 1221
},
{
"epoch": 2.727678571428571,
"grad_norm": 1.8264105319976807,
"learning_rate": 4.708802701297499e-07,
"loss": 0.5835,
"step": 1222
},
{
"epoch": 2.7299107142857144,
"grad_norm": 1.8832281827926636,
"learning_rate": 4.632524759537449e-07,
"loss": 0.5418,
"step": 1223
},
{
"epoch": 2.732142857142857,
"grad_norm": 1.7114546298980713,
"learning_rate": 4.556855035455787e-07,
"loss": 0.5168,
"step": 1224
},
{
"epoch": 2.734375,
"grad_norm": 1.7664337158203125,
"learning_rate": 4.481794011645368e-07,
"loss": 0.5501,
"step": 1225
},
{
"epoch": 2.736607142857143,
"grad_norm": 1.6939427852630615,
"learning_rate": 4.407342166816997e-07,
"loss": 0.4614,
"step": 1226
},
{
"epoch": 2.7388392857142856,
"grad_norm": 1.8312894105911255,
"learning_rate": 4.3334999757963734e-07,
"loss": 0.6143,
"step": 1227
},
{
"epoch": 2.741071428571429,
"grad_norm": 1.9868108034133911,
"learning_rate": 4.2602679095210766e-07,
"loss": 0.5636,
"step": 1228
},
{
"epoch": 2.743303571428571,
"grad_norm": 1.5780657529830933,
"learning_rate": 4.187646435037529e-07,
"loss": 0.4947,
"step": 1229
},
{
"epoch": 2.7455357142857144,
"grad_norm": 1.9387363195419312,
"learning_rate": 4.1156360154979813e-07,
"loss": 0.5237,
"step": 1230
},
{
"epoch": 2.747767857142857,
"grad_norm": 1.5292437076568604,
"learning_rate": 4.044237110157667e-07,
"loss": 0.4427,
"step": 1231
},
{
"epoch": 2.75,
"grad_norm": 1.718108892440796,
"learning_rate": 3.9734501743717956e-07,
"loss": 0.5295,
"step": 1232
},
{
"epoch": 2.75,
"eval_loss": 0.7388916015625,
"eval_runtime": 48.1103,
"eval_samples_per_second": 1.517,
"eval_steps_per_second": 0.208,
"step": 1232
},
{
"epoch": 2.752232142857143,
"grad_norm": 1.6977214813232422,
"learning_rate": 3.9032756595926755e-07,
"loss": 0.4584,
"step": 1233
},
{
"epoch": 2.7544642857142856,
"grad_norm": 1.8142741918563843,
"learning_rate": 3.833714013366796e-07,
"loss": 0.4569,
"step": 1234
},
{
"epoch": 2.756696428571429,
"grad_norm": 1.7793419361114502,
"learning_rate": 3.7647656793320164e-07,
"loss": 0.4911,
"step": 1235
},
{
"epoch": 2.758928571428571,
"grad_norm": 1.7444989681243896,
"learning_rate": 3.696431097214748e-07,
"loss": 0.5189,
"step": 1236
},
{
"epoch": 2.7611607142857144,
"grad_norm": 1.9122920036315918,
"learning_rate": 3.628710702827076e-07,
"loss": 0.4846,
"step": 1237
},
{
"epoch": 2.763392857142857,
"grad_norm": 1.8623292446136475,
"learning_rate": 3.5616049280640995e-07,
"loss": 0.4874,
"step": 1238
},
{
"epoch": 2.765625,
"grad_norm": 2.041966199874878,
"learning_rate": 3.4951142009010173e-07,
"loss": 0.5527,
"step": 1239
},
{
"epoch": 2.767857142857143,
"grad_norm": 1.7278554439544678,
"learning_rate": 3.429238945390556e-07,
"loss": 0.5564,
"step": 1240
},
{
"epoch": 2.7700892857142856,
"grad_norm": 1.6937023401260376,
"learning_rate": 3.3639795816601705e-07,
"loss": 0.5349,
"step": 1241
},
{
"epoch": 2.772321428571429,
"grad_norm": 1.569589614868164,
"learning_rate": 3.299336525909391e-07,
"loss": 0.4183,
"step": 1242
},
{
"epoch": 2.774553571428571,
"grad_norm": 1.8380076885223389,
"learning_rate": 3.235310190407182e-07,
"loss": 0.4572,
"step": 1243
},
{
"epoch": 2.7767857142857144,
"grad_norm": 1.6024210453033447,
"learning_rate": 3.171900983489273e-07,
"loss": 0.4429,
"step": 1244
},
{
"epoch": 2.779017857142857,
"grad_norm": 1.658486008644104,
"learning_rate": 3.109109309555602e-07,
"loss": 0.5431,
"step": 1245
},
{
"epoch": 2.78125,
"grad_norm": 1.8605934381484985,
"learning_rate": 3.0469355690677216e-07,
"loss": 0.5497,
"step": 1246
},
{
"epoch": 2.783482142857143,
"grad_norm": 1.8473893404006958,
"learning_rate": 2.985380158546236e-07,
"loss": 0.4607,
"step": 1247
},
{
"epoch": 2.7857142857142856,
"grad_norm": 2.1259796619415283,
"learning_rate": 2.9244434705682276e-07,
"loss": 0.542,
"step": 1248
},
{
"epoch": 2.787946428571429,
"grad_norm": 1.9905544519424438,
"learning_rate": 2.8641258937648577e-07,
"loss": 0.5211,
"step": 1249
},
{
"epoch": 2.790178571428571,
"grad_norm": 1.8052101135253906,
"learning_rate": 2.8044278128188327e-07,
"loss": 0.516,
"step": 1250
},
{
"epoch": 2.7924107142857144,
"grad_norm": 1.7659034729003906,
"learning_rate": 2.7453496084619116e-07,
"loss": 0.5368,
"step": 1251
},
{
"epoch": 2.794642857142857,
"grad_norm": 1.8553686141967773,
"learning_rate": 2.6868916574725347e-07,
"loss": 0.5216,
"step": 1252
},
{
"epoch": 2.796875,
"grad_norm": 1.7524783611297607,
"learning_rate": 2.6290543326733865e-07,
"loss": 0.4936,
"step": 1253
},
{
"epoch": 2.799107142857143,
"grad_norm": 1.8771616220474243,
"learning_rate": 2.571838002929061e-07,
"loss": 0.604,
"step": 1254
},
{
"epoch": 2.8013392857142856,
"grad_norm": 1.833112120628357,
"learning_rate": 2.515243033143644e-07,
"loss": 0.4917,
"step": 1255
},
{
"epoch": 2.803571428571429,
"grad_norm": 1.7152862548828125,
"learning_rate": 2.459269784258467e-07,
"loss": 0.5593,
"step": 1256
},
{
"epoch": 2.805803571428571,
"grad_norm": 1.824363350868225,
"learning_rate": 2.4039186132497226e-07,
"loss": 0.5888,
"step": 1257
},
{
"epoch": 2.8080357142857144,
"grad_norm": 1.7618277072906494,
"learning_rate": 2.349189873126223e-07,
"loss": 0.5356,
"step": 1258
},
{
"epoch": 2.810267857142857,
"grad_norm": 2.1107370853424072,
"learning_rate": 2.2950839129272096e-07,
"loss": 0.569,
"step": 1259
},
{
"epoch": 2.8125,
"grad_norm": 1.6434730291366577,
"learning_rate": 2.2416010777199904e-07,
"loss": 0.5319,
"step": 1260
},
{
"epoch": 2.814732142857143,
"grad_norm": 1.9061239957809448,
"learning_rate": 2.1887417085978745e-07,
"loss": 0.5174,
"step": 1261
},
{
"epoch": 2.8169642857142856,
"grad_norm": 1.8371518850326538,
"learning_rate": 2.1365061426778967e-07,
"loss": 0.583,
"step": 1262
},
{
"epoch": 2.819196428571429,
"grad_norm": 1.754087209701538,
"learning_rate": 2.0848947130987617e-07,
"loss": 0.6134,
"step": 1263
},
{
"epoch": 2.821428571428571,
"grad_norm": 1.772619366645813,
"learning_rate": 2.0339077490186488e-07,
"loss": 0.4524,
"step": 1264
},
{
"epoch": 2.8236607142857144,
"grad_norm": 1.8808914422988892,
"learning_rate": 1.9835455756130995e-07,
"loss": 0.5474,
"step": 1265
},
{
"epoch": 2.825892857142857,
"grad_norm": 1.8659659624099731,
"learning_rate": 1.93380851407301e-07,
"loss": 0.539,
"step": 1266
},
{
"epoch": 2.828125,
"grad_norm": 1.6725388765335083,
"learning_rate": 1.8846968816025434e-07,
"loss": 0.4956,
"step": 1267
},
{
"epoch": 2.830357142857143,
"grad_norm": 2.0685112476348877,
"learning_rate": 1.83621099141712e-07,
"loss": 0.568,
"step": 1268
},
{
"epoch": 2.8325892857142856,
"grad_norm": 2.1546757221221924,
"learning_rate": 1.7883511527414078e-07,
"loss": 0.543,
"step": 1269
},
{
"epoch": 2.834821428571429,
"grad_norm": 1.8702294826507568,
"learning_rate": 1.741117670807335e-07,
"loss": 0.5957,
"step": 1270
},
{
"epoch": 2.837053571428571,
"grad_norm": 1.8077850341796875,
"learning_rate": 1.694510846852193e-07,
"loss": 0.5135,
"step": 1271
},
{
"epoch": 2.8392857142857144,
"grad_norm": 1.8730038404464722,
"learning_rate": 1.648530978116658e-07,
"loss": 0.5495,
"step": 1272
},
{
"epoch": 2.841517857142857,
"grad_norm": 2.0048348903656006,
"learning_rate": 1.6031783578429605e-07,
"loss": 0.5356,
"step": 1273
},
{
"epoch": 2.84375,
"grad_norm": 1.7988780736923218,
"learning_rate": 1.558453275272942e-07,
"loss": 0.5383,
"step": 1274
},
{
"epoch": 2.845982142857143,
"grad_norm": 1.999723196029663,
"learning_rate": 1.5143560156462567e-07,
"loss": 0.4961,
"step": 1275
},
{
"epoch": 2.8482142857142856,
"grad_norm": 1.982683539390564,
"learning_rate": 1.4708868601985503e-07,
"loss": 0.4886,
"step": 1276
},
{
"epoch": 2.850446428571429,
"grad_norm": 1.733881950378418,
"learning_rate": 1.4280460861596513e-07,
"loss": 0.4615,
"step": 1277
},
{
"epoch": 2.852678571428571,
"grad_norm": 1.7100967168807983,
"learning_rate": 1.385833966751815e-07,
"loss": 0.4739,
"step": 1278
},
{
"epoch": 2.8549107142857144,
"grad_norm": 1.951353669166565,
"learning_rate": 1.3442507711879494e-07,
"loss": 0.5577,
"step": 1279
},
{
"epoch": 2.857142857142857,
"grad_norm": 1.7354167699813843,
"learning_rate": 1.303296764669959e-07,
"loss": 0.6184,
"step": 1280
},
{
"epoch": 2.859375,
"grad_norm": 1.764654517173767,
"learning_rate": 1.2629722083870033e-07,
"loss": 0.4979,
"step": 1281
},
{
"epoch": 2.861607142857143,
"grad_norm": 1.8083339929580688,
"learning_rate": 1.2232773595138415e-07,
"loss": 0.4016,
"step": 1282
},
{
"epoch": 2.8638392857142856,
"grad_norm": 1.7972429990768433,
"learning_rate": 1.1842124712092117e-07,
"loss": 0.4883,
"step": 1283
},
{
"epoch": 2.866071428571429,
"grad_norm": 1.6948587894439697,
"learning_rate": 1.1457777926141889e-07,
"loss": 0.4177,
"step": 1284
},
{
"epoch": 2.868303571428571,
"grad_norm": 1.7051078081130981,
"learning_rate": 1.1079735688506065e-07,
"loss": 0.5008,
"step": 1285
},
{
"epoch": 2.8705357142857144,
"grad_norm": 1.6710540056228638,
"learning_rate": 1.0708000410195041e-07,
"loss": 0.4351,
"step": 1286
},
{
"epoch": 2.872767857142857,
"grad_norm": 1.5446432828903198,
"learning_rate": 1.0342574461995936e-07,
"loss": 0.4409,
"step": 1287
},
{
"epoch": 2.875,
"grad_norm": 1.8338514566421509,
"learning_rate": 9.98346017445706e-08,
"loss": 0.5283,
"step": 1288
},
{
"epoch": 2.877232142857143,
"grad_norm": 1.672766923904419,
"learning_rate": 9.630659837873368e-08,
"loss": 0.5112,
"step": 1289
},
{
"epoch": 2.8794642857142856,
"grad_norm": 1.7518768310546875,
"learning_rate": 9.284175702272246e-08,
"loss": 0.4747,
"step": 1290
},
{
"epoch": 2.881696428571429,
"grad_norm": 2.162524938583374,
"learning_rate": 8.944009977398083e-08,
"loss": 0.5968,
"step": 1291
},
{
"epoch": 2.883928571428571,
"grad_norm": 1.8234249353408813,
"learning_rate": 8.610164832699608e-08,
"loss": 0.5249,
"step": 1292
},
{
"epoch": 2.8861607142857144,
"grad_norm": 1.7678987979888916,
"learning_rate": 8.282642397314356e-08,
"loss": 0.5712,
"step": 1293
},
{
"epoch": 2.888392857142857,
"grad_norm": 1.769107460975647,
"learning_rate": 7.96144476005689e-08,
"loss": 0.4407,
"step": 1294
},
{
"epoch": 2.890625,
"grad_norm": 1.5588442087173462,
"learning_rate": 7.646573969404159e-08,
"loss": 0.444,
"step": 1295
},
{
"epoch": 2.892857142857143,
"grad_norm": 1.6710723638534546,
"learning_rate": 7.338032033482712e-08,
"loss": 0.4247,
"step": 1296
},
{
"epoch": 2.8950892857142856,
"grad_norm": 1.6855120658874512,
"learning_rate": 7.035820920056724e-08,
"loss": 0.4729,
"step": 1297
},
{
"epoch": 2.897321428571429,
"grad_norm": 2.1896936893463135,
"learning_rate": 6.73994255651389e-08,
"loss": 0.5533,
"step": 1298
},
{
"epoch": 2.899553571428571,
"grad_norm": 1.658887267112732,
"learning_rate": 6.450398829854764e-08,
"loss": 0.4726,
"step": 1299
},
{
"epoch": 2.9017857142857144,
"grad_norm": 1.7160634994506836,
"learning_rate": 6.167191586679556e-08,
"loss": 0.4999,
"step": 1300
},
{
"epoch": 2.904017857142857,
"grad_norm": 1.9953809976577759,
"learning_rate": 5.890322633177126e-08,
"loss": 0.5515,
"step": 1301
},
{
"epoch": 2.90625,
"grad_norm": 1.998899221420288,
"learning_rate": 5.6197937351125664e-08,
"loss": 0.4844,
"step": 1302
},
{
"epoch": 2.908482142857143,
"grad_norm": 1.6095709800720215,
"learning_rate": 5.355606617817089e-08,
"loss": 0.4459,
"step": 1303
},
{
"epoch": 2.9107142857142856,
"grad_norm": 1.6279726028442383,
"learning_rate": 5.097762966176256e-08,
"loss": 0.4293,
"step": 1304
},
{
"epoch": 2.912946428571429,
"grad_norm": 1.60764741897583,
"learning_rate": 4.846264424619218e-08,
"loss": 0.4048,
"step": 1305
},
{
"epoch": 2.915178571428571,
"grad_norm": 1.5504363775253296,
"learning_rate": 4.6011125971084924e-08,
"loss": 0.4506,
"step": 1306
},
{
"epoch": 2.9174107142857144,
"grad_norm": 1.8371487855911255,
"learning_rate": 4.3623090471296426e-08,
"loss": 0.431,
"step": 1307
},
{
"epoch": 2.919642857142857,
"grad_norm": 1.6488250494003296,
"learning_rate": 4.129855297681618e-08,
"loss": 0.4522,
"step": 1308
},
{
"epoch": 2.921875,
"grad_norm": 1.6050375699996948,
"learning_rate": 3.903752831266205e-08,
"loss": 0.4436,
"step": 1309
},
{
"epoch": 2.924107142857143,
"grad_norm": 1.6481090784072876,
"learning_rate": 3.684003089879484e-08,
"loss": 0.4896,
"step": 1310
},
{
"epoch": 2.9263392857142856,
"grad_norm": 1.685328722000122,
"learning_rate": 3.4706074750022744e-08,
"loss": 0.5347,
"step": 1311
},
{
"epoch": 2.928571428571429,
"grad_norm": 1.8341789245605469,
"learning_rate": 3.2635673475910345e-08,
"loss": 0.515,
"step": 1312
},
{
"epoch": 2.930803571428571,
"grad_norm": 1.7492722272872925,
"learning_rate": 3.062884028069313e-08,
"loss": 0.5591,
"step": 1313
},
{
"epoch": 2.9330357142857144,
"grad_norm": 1.7350465059280396,
"learning_rate": 2.8685587963194206e-08,
"loss": 0.547,
"step": 1314
},
{
"epoch": 2.935267857142857,
"grad_norm": 1.6732760667800903,
"learning_rate": 2.6805928916742163e-08,
"loss": 0.4303,
"step": 1315
},
{
"epoch": 2.9375,
"grad_norm": 1.7712833881378174,
"learning_rate": 2.4989875129091124e-08,
"loss": 0.4711,
"step": 1316
},
{
"epoch": 2.939732142857143,
"grad_norm": 1.616205096244812,
"learning_rate": 2.323743818234414e-08,
"loss": 0.4163,
"step": 1317
},
{
"epoch": 2.9419642857142856,
"grad_norm": 1.7002397775650024,
"learning_rate": 2.154862925288326e-08,
"loss": 0.5499,
"step": 1318
},
{
"epoch": 2.944196428571429,
"grad_norm": 1.6891026496887207,
"learning_rate": 1.9923459111290676e-08,
"loss": 0.5318,
"step": 1319
},
{
"epoch": 2.946428571428571,
"grad_norm": 1.8011424541473389,
"learning_rate": 1.8361938122287704e-08,
"loss": 0.5235,
"step": 1320
},
{
"epoch": 2.9486607142857144,
"grad_norm": 2.001744508743286,
"learning_rate": 1.6864076244663686e-08,
"loss": 0.6503,
"step": 1321
},
{
"epoch": 2.950892857142857,
"grad_norm": 1.6341923475265503,
"learning_rate": 1.5429883031217173e-08,
"loss": 0.5342,
"step": 1322
},
{
"epoch": 2.953125,
"grad_norm": 1.8023895025253296,
"learning_rate": 1.4059367628687094e-08,
"loss": 0.5666,
"step": 1323
},
{
"epoch": 2.955357142857143,
"grad_norm": 1.7343465089797974,
"learning_rate": 1.2752538777704993e-08,
"loss": 0.4884,
"step": 1324
},
{
"epoch": 2.9575892857142856,
"grad_norm": 1.7615457773208618,
"learning_rate": 1.1509404812728443e-08,
"loss": 0.5254,
"step": 1325
},
{
"epoch": 2.959821428571429,
"grad_norm": 1.7637721300125122,
"learning_rate": 1.0329973661996617e-08,
"loss": 0.4997,
"step": 1326
},
{
"epoch": 2.962053571428571,
"grad_norm": 1.8085095882415771,
"learning_rate": 9.214252847475902e-09,
"loss": 0.4738,
"step": 1327
},
{
"epoch": 2.9642857142857144,
"grad_norm": 2.0455617904663086,
"learning_rate": 8.162249484809926e-09,
"loss": 0.5687,
"step": 1328
},
{
"epoch": 2.966517857142857,
"grad_norm": 1.6833295822143555,
"learning_rate": 7.173970283279597e-09,
"loss": 0.4429,
"step": 1329
},
{
"epoch": 2.96875,
"grad_norm": 1.766695499420166,
"learning_rate": 6.249421545755363e-09,
"loss": 0.5295,
"step": 1330
},
{
"epoch": 2.970982142857143,
"grad_norm": 1.7241015434265137,
"learning_rate": 5.388609168659465e-09,
"loss": 0.5362,
"step": 1331
},
{
"epoch": 2.9732142857142856,
"grad_norm": 1.826074242591858,
"learning_rate": 4.591538641927074e-09,
"loss": 0.5459,
"step": 1332
},
{
"epoch": 2.975446428571429,
"grad_norm": 1.6138020753860474,
"learning_rate": 3.858215048972991e-09,
"loss": 0.515,
"step": 1333
},
{
"epoch": 2.977678571428571,
"grad_norm": 1.8125077486038208,
"learning_rate": 3.1886430666561163e-09,
"loss": 0.5262,
"step": 1334
},
{
"epoch": 2.9799107142857144,
"grad_norm": 1.748060703277588,
"learning_rate": 2.5828269652561355e-09,
"loss": 0.4633,
"step": 1335
},
{
"epoch": 2.982142857142857,
"grad_norm": 1.6022776365280151,
"learning_rate": 2.0407706084368816e-09,
"loss": 0.4314,
"step": 1336
},
{
"epoch": 2.984375,
"grad_norm": 1.672903060913086,
"learning_rate": 1.5624774532285726e-09,
"loss": 0.4904,
"step": 1337
},
{
"epoch": 2.986607142857143,
"grad_norm": 1.8359776735305786,
"learning_rate": 1.1479505500044952e-09,
"loss": 0.4427,
"step": 1338
},
{
"epoch": 2.9888392857142856,
"grad_norm": 1.6358612775802612,
"learning_rate": 7.971925424621329e-10,
"loss": 0.4399,
"step": 1339
},
{
"epoch": 2.991071428571429,
"grad_norm": 1.8373823165893555,
"learning_rate": 5.102056675998501e-10,
"loss": 0.5499,
"step": 1340
},
{
"epoch": 2.993303571428571,
"grad_norm": 1.7908254861831665,
"learning_rate": 2.8699175571467177e-10,
"loss": 0.4939,
"step": 1341
},
{
"epoch": 2.9955357142857144,
"grad_norm": 1.9384500980377197,
"learning_rate": 1.2755223037896892e-10,
"loss": 0.5783,
"step": 1342
},
{
"epoch": 2.997767857142857,
"grad_norm": 1.8946892023086548,
"learning_rate": 3.1888108437128085e-11,
"loss": 0.5923,
"step": 1343
},
{
"epoch": 3.0,
"grad_norm": 1.533162236213684,
"learning_rate": 0.0,
"loss": 0.4701,
"step": 1344
},
{
"epoch": 3.0,
"eval_loss": 0.739547848701477,
"eval_runtime": 48.8136,
"eval_samples_per_second": 1.495,
"eval_steps_per_second": 0.205,
"step": 1344
}
],
"logging_steps": 1,
"max_steps": 1344,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 224,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.304796257625047e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}