{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 373,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 5.512355255212158,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.3837,
"step": 1
},
{
"epoch": 0.01,
"grad_norm": 5.601068071501158,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.4305,
"step": 2
},
{
"epoch": 0.01,
"grad_norm": 3.2300010534534525,
"learning_rate": 5e-05,
"loss": 1.266,
"step": 3
},
{
"epoch": 0.01,
"grad_norm": 2.1453814538418174,
"learning_rate": 6.666666666666667e-05,
"loss": 1.1698,
"step": 4
},
{
"epoch": 0.01,
"grad_norm": 2.5014459760080596,
"learning_rate": 8.333333333333334e-05,
"loss": 1.1328,
"step": 5
},
{
"epoch": 0.02,
"grad_norm": 1.8330435913332193,
"learning_rate": 0.0001,
"loss": 1.1005,
"step": 6
},
{
"epoch": 0.02,
"grad_norm": 1.4286351070611591,
"learning_rate": 0.00011666666666666668,
"loss": 0.978,
"step": 7
},
{
"epoch": 0.02,
"grad_norm": 1.310573576099708,
"learning_rate": 0.00013333333333333334,
"loss": 0.9391,
"step": 8
},
{
"epoch": 0.02,
"grad_norm": 1.1949914966282076,
"learning_rate": 0.00015000000000000001,
"loss": 0.9192,
"step": 9
},
{
"epoch": 0.03,
"grad_norm": 1.2833854369614637,
"learning_rate": 0.0001666666666666667,
"loss": 0.8458,
"step": 10
},
{
"epoch": 0.03,
"grad_norm": 1.3466532516140282,
"learning_rate": 0.00018333333333333334,
"loss": 0.9498,
"step": 11
},
{
"epoch": 0.03,
"grad_norm": 1.0265129854654136,
"learning_rate": 0.0002,
"loss": 0.8738,
"step": 12
},
{
"epoch": 0.03,
"grad_norm": 0.9754563625922208,
"learning_rate": 0.0001999962133723217,
"loss": 0.8873,
"step": 13
},
{
"epoch": 0.04,
"grad_norm": 0.9666096955411021,
"learning_rate": 0.00019998485377605772,
"loss": 0.8805,
"step": 14
},
{
"epoch": 0.04,
"grad_norm": 1.0373117093470032,
"learning_rate": 0.00019996592207149934,
"loss": 0.8797,
"step": 15
},
{
"epoch": 0.04,
"grad_norm": 0.9925813153709726,
"learning_rate": 0.00019993941969239282,
"loss": 0.8048,
"step": 16
},
{
"epoch": 0.05,
"grad_norm": 0.986495951671587,
"learning_rate": 0.0001999053486458311,
"loss": 0.7769,
"step": 17
},
{
"epoch": 0.05,
"grad_norm": 1.4142116862108192,
"learning_rate": 0.00019986371151210145,
"loss": 0.7946,
"step": 18
},
{
"epoch": 0.05,
"grad_norm": 1.0439568009598768,
"learning_rate": 0.00019981451144449042,
"loss": 0.8041,
"step": 19
},
{
"epoch": 0.05,
"grad_norm": 8.131551306091136,
"learning_rate": 0.0001997577521690447,
"loss": 0.8122,
"step": 20
},
{
"epoch": 0.06,
"grad_norm": 0.8360699607020335,
"learning_rate": 0.00019969343798428914,
"loss": 0.8185,
"step": 21
},
{
"epoch": 0.06,
"grad_norm": 0.8466183671401063,
"learning_rate": 0.00019962157376090124,
"loss": 0.7552,
"step": 22
},
{
"epoch": 0.06,
"grad_norm": 0.9001204614115436,
"learning_rate": 0.00019954216494134217,
"loss": 0.7959,
"step": 23
},
{
"epoch": 0.06,
"grad_norm": 0.843763156462474,
"learning_rate": 0.00019945521753944451,
"loss": 0.8043,
"step": 24
},
{
"epoch": 0.07,
"grad_norm": 0.7862883144042041,
"learning_rate": 0.0001993607381399571,
"loss": 0.792,
"step": 25
},
{
"epoch": 0.07,
"grad_norm": 0.8743527199399872,
"learning_rate": 0.00019925873389804613,
"loss": 0.7685,
"step": 26
},
{
"epoch": 0.07,
"grad_norm": 0.8981661887195656,
"learning_rate": 0.00019914921253875328,
"loss": 0.7858,
"step": 27
},
{
"epoch": 0.08,
"grad_norm": 0.938979054115086,
"learning_rate": 0.00019903218235641076,
"loss": 0.7859,
"step": 28
},
{
"epoch": 0.08,
"grad_norm": 0.8854235003056622,
"learning_rate": 0.00019890765221401314,
"loss": 0.7487,
"step": 29
},
{
"epoch": 0.08,
"grad_norm": 0.8115643107641081,
"learning_rate": 0.00019877563154254612,
"loss": 0.7344,
"step": 30
},
{
"epoch": 0.08,
"grad_norm": 0.9184079760749247,
"learning_rate": 0.00019863613034027224,
"loss": 0.8187,
"step": 31
},
{
"epoch": 0.09,
"grad_norm": 0.8142820954054314,
"learning_rate": 0.0001984891591719738,
"loss": 0.7795,
"step": 32
},
{
"epoch": 0.09,
"grad_norm": 0.8203580213935835,
"learning_rate": 0.00019833472916815263,
"loss": 0.8192,
"step": 33
},
{
"epoch": 0.09,
"grad_norm": 0.7571847385129802,
"learning_rate": 0.00019817285202418733,
"loss": 0.7864,
"step": 34
},
{
"epoch": 0.09,
"grad_norm": 0.8085847745812597,
"learning_rate": 0.00019800353999944732,
"loss": 0.7653,
"step": 35
},
{
"epoch": 0.1,
"grad_norm": 0.7450297134757211,
"learning_rate": 0.0001978268059163646,
"loss": 0.7016,
"step": 36
},
{
"epoch": 0.1,
"grad_norm": 0.7648175358121383,
"learning_rate": 0.0001976426631594626,
"loss": 0.7651,
"step": 37
},
{
"epoch": 0.1,
"grad_norm": 0.9073006141723517,
"learning_rate": 0.0001974511256743425,
"loss": 0.7839,
"step": 38
},
{
"epoch": 0.1,
"grad_norm": 0.8632251284659535,
"learning_rate": 0.0001972522079666272,
"loss": 0.8221,
"step": 39
},
{
"epoch": 0.11,
"grad_norm": 0.8464617036052766,
"learning_rate": 0.00019704592510086258,
"loss": 0.7659,
"step": 40
},
{
"epoch": 0.11,
"grad_norm": 7.869410992708508,
"learning_rate": 0.00019683229269937686,
"loss": 0.793,
"step": 41
},
{
"epoch": 0.11,
"grad_norm": 0.6910263819695035,
"learning_rate": 0.00019661132694109737,
"loss": 0.7476,
"step": 42
},
{
"epoch": 0.12,
"grad_norm": 0.7823617079640587,
"learning_rate": 0.00019638304456032514,
"loss": 0.7543,
"step": 43
},
{
"epoch": 0.12,
"grad_norm": 0.8986022109125568,
"learning_rate": 0.0001961474628454679,
"loss": 0.7501,
"step": 44
},
{
"epoch": 0.12,
"grad_norm": 0.8605053572792184,
"learning_rate": 0.00019590459963773042,
"loss": 0.8088,
"step": 45
},
{
"epoch": 0.12,
"grad_norm": 0.7869120724203714,
"learning_rate": 0.00019565447332976362,
"loss": 0.7774,
"step": 46
},
{
"epoch": 0.13,
"grad_norm": 0.8300328955398425,
"learning_rate": 0.0001953971028642715,
"loss": 0.7881,
"step": 47
},
{
"epoch": 0.13,
"grad_norm": 4.701180165130191,
"learning_rate": 0.00019513250773257665,
"loss": 0.7878,
"step": 48
},
{
"epoch": 0.13,
"grad_norm": 0.9378185605974173,
"learning_rate": 0.00019486070797314402,
"loss": 0.7515,
"step": 49
},
{
"epoch": 0.13,
"grad_norm": 1.9816239126099178,
"learning_rate": 0.00019458172417006347,
"loss": 0.7393,
"step": 50
},
{
"epoch": 0.14,
"grad_norm": 1.0146355146158235,
"learning_rate": 0.00019429557745149082,
"loss": 0.7174,
"step": 51
},
{
"epoch": 0.14,
"grad_norm": 0.9489373557296571,
"learning_rate": 0.00019400228948804774,
"loss": 0.8367,
"step": 52
},
{
"epoch": 0.14,
"grad_norm": 3.4938666982243114,
"learning_rate": 0.00019370188249118067,
"loss": 0.8304,
"step": 53
},
{
"epoch": 0.14,
"grad_norm": 0.7895857808322408,
"learning_rate": 0.00019339437921147854,
"loss": 0.7571,
"step": 54
},
{
"epoch": 0.15,
"grad_norm": 1.0350953755957781,
"learning_rate": 0.00019307980293694997,
"loss": 0.7984,
"step": 55
},
{
"epoch": 0.15,
"grad_norm": 1.9217385632598274,
"learning_rate": 0.00019275817749125955,
"loss": 0.7453,
"step": 56
},
{
"epoch": 0.15,
"grad_norm": 0.7391980939802598,
"learning_rate": 0.00019242952723192355,
"loss": 0.6975,
"step": 57
},
{
"epoch": 0.16,
"grad_norm": 0.7501773078010896,
"learning_rate": 0.00019209387704846535,
"loss": 0.7518,
"step": 58
},
{
"epoch": 0.16,
"grad_norm": 0.7573007042638589,
"learning_rate": 0.0001917512523605304,
"loss": 0.7379,
"step": 59
},
{
"epoch": 0.16,
"grad_norm": 0.8140744826191785,
"learning_rate": 0.0001914016791159613,
"loss": 0.7044,
"step": 60
},
{
"epoch": 0.16,
"grad_norm": 0.8450442031858358,
"learning_rate": 0.00019104518378883253,
"loss": 0.763,
"step": 61
},
{
"epoch": 0.17,
"grad_norm": 0.7175341911600421,
"learning_rate": 0.00019068179337744547,
"loss": 0.688,
"step": 62
},
{
"epoch": 0.17,
"grad_norm": 0.7728305894940893,
"learning_rate": 0.00019031153540228398,
"loss": 0.743,
"step": 63
},
{
"epoch": 0.17,
"grad_norm": 0.8901123426372263,
"learning_rate": 0.00018993443790392994,
"loss": 0.7268,
"step": 64
},
{
"epoch": 0.17,
"grad_norm": 0.8493239266624959,
"learning_rate": 0.0001895505294409399,
"loss": 0.761,
"step": 65
},
{
"epoch": 0.18,
"grad_norm": 0.7812710594507711,
"learning_rate": 0.0001891598390876821,
"loss": 0.7707,
"step": 66
},
{
"epoch": 0.18,
"grad_norm": 1.0596730633980673,
"learning_rate": 0.00018876239643213455,
"loss": 0.7288,
"step": 67
},
{
"epoch": 0.18,
"grad_norm": 1.7573454798105217,
"learning_rate": 0.00018835823157364458,
"loss": 0.7391,
"step": 68
},
{
"epoch": 0.18,
"grad_norm": 1.0031748089998647,
"learning_rate": 0.0001879473751206489,
"loss": 0.7195,
"step": 69
},
{
"epoch": 0.19,
"grad_norm": 7.439237965595793,
"learning_rate": 0.0001875298581883559,
"loss": 0.7838,
"step": 70
},
{
"epoch": 0.19,
"grad_norm": 0.844581374152523,
"learning_rate": 0.000187105712396389,
"loss": 0.6861,
"step": 71
},
{
"epoch": 0.19,
"grad_norm": 3.678850258225293,
"learning_rate": 0.00018667496986639207,
"loss": 0.7423,
"step": 72
},
{
"epoch": 0.2,
"grad_norm": 0.7753613912910128,
"learning_rate": 0.00018623766321959688,
"loss": 0.7545,
"step": 73
},
{
"epoch": 0.2,
"grad_norm": 0.8548883098140322,
"learning_rate": 0.00018579382557435247,
"loss": 0.7697,
"step": 74
},
{
"epoch": 0.2,
"grad_norm": 0.8184215312663371,
"learning_rate": 0.00018534349054361707,
"loss": 0.725,
"step": 75
},
{
"epoch": 0.2,
"grad_norm": 0.7700953831367962,
"learning_rate": 0.00018488669223241258,
"loss": 0.7443,
"step": 76
},
{
"epoch": 0.21,
"grad_norm": 0.8152820586755163,
"learning_rate": 0.00018442346523524146,
"loss": 0.7442,
"step": 77
},
{
"epoch": 0.21,
"grad_norm": 0.7745035443923122,
"learning_rate": 0.00018395384463346722,
"loss": 0.7462,
"step": 78
},
{
"epoch": 0.21,
"grad_norm": 0.7277596367063084,
"learning_rate": 0.00018347786599265712,
"loss": 0.7176,
"step": 79
},
{
"epoch": 0.21,
"grad_norm": 0.7876787713429247,
"learning_rate": 0.00018299556535988915,
"loss": 0.7013,
"step": 80
},
{
"epoch": 0.22,
"grad_norm": 0.8438646538243175,
"learning_rate": 0.0001825069792610218,
"loss": 0.7209,
"step": 81
},
{
"epoch": 0.22,
"grad_norm": 0.6802198137748939,
"learning_rate": 0.00018201214469792793,
"loss": 0.7562,
"step": 82
},
{
"epoch": 0.22,
"grad_norm": 0.835891816407763,
"learning_rate": 0.00018151109914569266,
"loss": 0.7503,
"step": 83
},
{
"epoch": 0.23,
"grad_norm": 0.744844585452191,
"learning_rate": 0.00018100388054977508,
"loss": 0.7329,
"step": 84
},
{
"epoch": 0.23,
"grad_norm": 0.7364974652166336,
"learning_rate": 0.00018049052732313465,
"loss": 0.7198,
"step": 85
},
{
"epoch": 0.23,
"grad_norm": 0.8292878106466043,
"learning_rate": 0.00017997107834332216,
"loss": 0.6571,
"step": 86
},
{
"epoch": 0.23,
"grad_norm": 0.7148804485605235,
"learning_rate": 0.00017944557294953528,
"loss": 0.6983,
"step": 87
},
{
"epoch": 0.24,
"grad_norm": 0.7140828522762399,
"learning_rate": 0.00017891405093963938,
"loss": 0.7508,
"step": 88
},
{
"epoch": 0.24,
"grad_norm": 0.7717708896492551,
"learning_rate": 0.00017837655256715355,
"loss": 0.7264,
"step": 89
},
{
"epoch": 0.24,
"grad_norm": 0.7427007083892259,
"learning_rate": 0.00017783311853820206,
"loss": 0.6694,
"step": 90
},
{
"epoch": 0.24,
"grad_norm": 0.7560632004948113,
"learning_rate": 0.00017728379000843164,
"loss": 0.6721,
"step": 91
},
{
"epoch": 0.25,
"grad_norm": 0.6487067048836795,
"learning_rate": 0.00017672860857989464,
"loss": 0.6811,
"step": 92
},
{
"epoch": 0.25,
"grad_norm": 0.638380563962605,
"learning_rate": 0.00017616761629789824,
"loss": 0.7018,
"step": 93
},
{
"epoch": 0.25,
"grad_norm": 0.7305532339795056,
"learning_rate": 0.00017560085564782057,
"loss": 0.7208,
"step": 94
},
{
"epoch": 0.25,
"grad_norm": 0.7121274962251023,
"learning_rate": 0.0001750283695518929,
"loss": 0.7132,
"step": 95
},
{
"epoch": 0.26,
"grad_norm": 0.7819223698571628,
"learning_rate": 0.00017445020136594907,
"loss": 0.6793,
"step": 96
},
{
"epoch": 0.26,
"grad_norm": 0.731314756139953,
"learning_rate": 0.00017386639487614232,
"loss": 0.6422,
"step": 97
},
{
"epoch": 0.26,
"grad_norm": 0.7414278250964547,
"learning_rate": 0.00017327699429562884,
"loss": 0.7249,
"step": 98
},
{
"epoch": 0.27,
"grad_norm": 0.6557522740071186,
"learning_rate": 0.00017268204426121967,
"loss": 0.7039,
"step": 99
},
{
"epoch": 0.27,
"grad_norm": 0.7140545115742526,
"learning_rate": 0.0001720815898300002,
"loss": 0.7219,
"step": 100
},
{
"epoch": 0.27,
"grad_norm": 0.809113184222831,
"learning_rate": 0.00017147567647591777,
"loss": 0.7726,
"step": 101
},
{
"epoch": 0.27,
"grad_norm": 0.7041999961741995,
"learning_rate": 0.0001708643500863379,
"loss": 0.7389,
"step": 102
},
{
"epoch": 0.28,
"grad_norm": 1.018002266507155,
"learning_rate": 0.00017024765695856922,
"loss": 0.6771,
"step": 103
},
{
"epoch": 0.28,
"grad_norm": 0.7382080435909644,
"learning_rate": 0.000169625643796357,
"loss": 0.7513,
"step": 104
},
{
"epoch": 0.28,
"grad_norm": 0.7763816161023751,
"learning_rate": 0.0001689983577063464,
"loss": 0.7229,
"step": 105
},
{
"epoch": 0.28,
"grad_norm": 0.6472383525714234,
"learning_rate": 0.00016836584619451476,
"loss": 0.6477,
"step": 106
},
{
"epoch": 0.29,
"grad_norm": 0.7937093362264419,
"learning_rate": 0.00016772815716257412,
"loss": 0.7871,
"step": 107
},
{
"epoch": 0.29,
"grad_norm": 0.7152698878521233,
"learning_rate": 0.0001670853389043432,
"loss": 0.6848,
"step": 108
},
{
"epoch": 0.29,
"grad_norm": 0.693763900764871,
"learning_rate": 0.00016643744010209018,
"loss": 0.7481,
"step": 109
},
{
"epoch": 0.29,
"grad_norm": 0.7760662028837724,
"learning_rate": 0.00016578450982284584,
"loss": 0.7181,
"step": 110
},
{
"epoch": 0.3,
"grad_norm": 0.9732534367753803,
"learning_rate": 0.0001651265975146875,
"loss": 0.7059,
"step": 111
},
{
"epoch": 0.3,
"grad_norm": 0.721390970519964,
"learning_rate": 0.00016446375300299424,
"loss": 0.6929,
"step": 112
},
{
"epoch": 0.3,
"grad_norm": 0.7892594293960745,
"learning_rate": 0.00016379602648667363,
"loss": 0.6969,
"step": 113
},
{
"epoch": 0.31,
"grad_norm": 0.7144379882173059,
"learning_rate": 0.00016312346853435976,
"loss": 0.71,
"step": 114
},
{
"epoch": 0.31,
"grad_norm": 0.692091441113428,
"learning_rate": 0.00016244613008058387,
"loss": 0.7081,
"step": 115
},
{
"epoch": 0.31,
"grad_norm": 0.7073830863522493,
"learning_rate": 0.0001617640624219166,
"loss": 0.6998,
"step": 116
},
{
"epoch": 0.31,
"grad_norm": 0.6939706731832529,
"learning_rate": 0.0001610773172130835,
"loss": 0.7275,
"step": 117
},
{
"epoch": 0.32,
"grad_norm": 0.6515914750845688,
"learning_rate": 0.00016038594646305285,
"loss": 0.7305,
"step": 118
},
{
"epoch": 0.32,
"grad_norm": 0.6867489914120396,
"learning_rate": 0.00015969000253109706,
"loss": 0.6677,
"step": 119
},
{
"epoch": 0.32,
"grad_norm": 0.7789785545676167,
"learning_rate": 0.0001589895381228272,
"loss": 0.6983,
"step": 120
},
{
"epoch": 0.32,
"grad_norm": 0.6722067334974933,
"learning_rate": 0.00015828460628620157,
"loss": 0.6615,
"step": 121
},
{
"epoch": 0.33,
"grad_norm": 0.65540008670773,
"learning_rate": 0.0001575752604075083,
"loss": 0.6925,
"step": 122
},
{
"epoch": 0.33,
"grad_norm": 0.7493774732322246,
"learning_rate": 0.0001568615542073221,
"loss": 0.7029,
"step": 123
},
{
"epoch": 0.33,
"grad_norm": 0.742091567316259,
"learning_rate": 0.00015614354173643604,
"loss": 0.6703,
"step": 124
},
{
"epoch": 0.34,
"grad_norm": 0.6675199697978297,
"learning_rate": 0.00015542127737176798,
"loss": 0.6877,
"step": 125
},
{
"epoch": 0.34,
"grad_norm": 0.8397591916647437,
"learning_rate": 0.00015469481581224272,
"loss": 0.7347,
"step": 126
},
{
"epoch": 0.34,
"grad_norm": 0.704256604633287,
"learning_rate": 0.00015396421207464908,
"loss": 0.7225,
"step": 127
},
{
"epoch": 0.34,
"grad_norm": 0.6307506132251888,
"learning_rate": 0.0001532295214894739,
"loss": 0.6939,
"step": 128
},
{
"epoch": 0.35,
"grad_norm": 0.650289475715352,
"learning_rate": 0.00015249079969671114,
"loss": 0.6572,
"step": 129
},
{
"epoch": 0.35,
"grad_norm": 0.6508582023204799,
"learning_rate": 0.00015174810264164865,
"loss": 0.7119,
"step": 130
},
{
"epoch": 0.35,
"grad_norm": 0.6690206745714716,
"learning_rate": 0.0001510014865706309,
"loss": 0.6948,
"step": 131
},
{
"epoch": 0.35,
"grad_norm": 0.6408558123355553,
"learning_rate": 0.00015025100802679942,
"loss": 0.6977,
"step": 132
},
{
"epoch": 0.36,
"grad_norm": 0.7091883584562263,
"learning_rate": 0.0001494967238458108,
"loss": 0.746,
"step": 133
},
{
"epoch": 0.36,
"grad_norm": 0.7284758613046809,
"learning_rate": 0.00014873869115153223,
"loss": 0.7454,
"step": 134
},
{
"epoch": 0.36,
"grad_norm": 0.7318871271330087,
"learning_rate": 0.0001479769673517152,
"loss": 0.7204,
"step": 135
},
{
"epoch": 0.36,
"grad_norm": 0.6220632369248427,
"learning_rate": 0.00014721161013364829,
"loss": 0.6871,
"step": 136
},
{
"epoch": 0.37,
"grad_norm": 0.6312761822751318,
"learning_rate": 0.00014644267745978797,
"loss": 0.7352,
"step": 137
},
{
"epoch": 0.37,
"grad_norm": 0.629060072944782,
"learning_rate": 0.00014567022756336917,
"loss": 0.6922,
"step": 138
},
{
"epoch": 0.37,
"grad_norm": 0.6482601288258285,
"learning_rate": 0.00014489431894399498,
"loss": 0.6718,
"step": 139
},
{
"epoch": 0.38,
"grad_norm": 0.7021326867479869,
"learning_rate": 0.0001441150103632066,
"loss": 0.7384,
"step": 140
},
{
"epoch": 0.38,
"grad_norm": 0.6275332412079758,
"learning_rate": 0.0001433323608400328,
"loss": 0.6894,
"step": 141
},
{
"epoch": 0.38,
"grad_norm": 0.6213116953061292,
"learning_rate": 0.00014254642964652052,
"loss": 0.6642,
"step": 142
},
{
"epoch": 0.38,
"grad_norm": 0.6265488468753385,
"learning_rate": 0.00014175727630324597,
"loss": 0.6722,
"step": 143
},
{
"epoch": 0.39,
"grad_norm": 0.6354231838763917,
"learning_rate": 0.000140964960574807,
"loss": 0.7121,
"step": 144
},
{
"epoch": 0.39,
"grad_norm": 0.719702694761498,
"learning_rate": 0.00014016954246529696,
"loss": 0.6799,
"step": 145
},
{
"epoch": 0.39,
"grad_norm": 0.6940261755213799,
"learning_rate": 0.0001393710822137604,
"loss": 0.6515,
"step": 146
},
{
"epoch": 0.39,
"grad_norm": 0.6940905781567512,
"learning_rate": 0.00013856964028963116,
"loss": 0.7139,
"step": 147
},
{
"epoch": 0.4,
"grad_norm": 0.6185000019239019,
"learning_rate": 0.00013776527738815263,
"loss": 0.6843,
"step": 148
},
{
"epoch": 0.4,
"grad_norm": 0.6294907306228986,
"learning_rate": 0.00013695805442578136,
"loss": 0.7154,
"step": 149
},
{
"epoch": 0.4,
"grad_norm": 0.602922685838362,
"learning_rate": 0.00013614803253557357,
"loss": 0.685,
"step": 150
},
{
"epoch": 0.4,
"grad_norm": 0.676522392009219,
"learning_rate": 0.00013533527306255547,
"loss": 0.6851,
"step": 151
},
{
"epoch": 0.41,
"grad_norm": 0.611104819984954,
"learning_rate": 0.00013451983755907737,
"loss": 0.6992,
"step": 152
},
{
"epoch": 0.41,
"grad_norm": 0.6796636946794475,
"learning_rate": 0.00013370178778015224,
"loss": 0.7065,
"step": 153
},
{
"epoch": 0.41,
"grad_norm": 0.6046664764536965,
"learning_rate": 0.00013288118567877875,
"loss": 0.6627,
"step": 154
},
{
"epoch": 0.42,
"grad_norm": 0.7122396015442166,
"learning_rate": 0.00013205809340124952,
"loss": 0.736,
"step": 155
},
{
"epoch": 0.42,
"grad_norm": 0.644051018087198,
"learning_rate": 0.00013123257328244453,
"loss": 0.7115,
"step": 156
},
{
"epoch": 0.42,
"grad_norm": 0.6560336305912403,
"learning_rate": 0.00013040468784111044,
"loss": 0.6887,
"step": 157
},
{
"epoch": 0.42,
"grad_norm": 0.611680866224159,
"learning_rate": 0.0001295744997751257,
"loss": 0.6667,
"step": 158
},
{
"epoch": 0.43,
"grad_norm": 0.7406926056500003,
"learning_rate": 0.00012874207195675262,
"loss": 0.7699,
"step": 159
},
{
"epoch": 0.43,
"grad_norm": 0.6977955799983129,
"learning_rate": 0.0001279074674278754,
"loss": 0.7077,
"step": 160
},
{
"epoch": 0.43,
"grad_norm": 0.6372845332477916,
"learning_rate": 0.0001270707493952263,
"loss": 0.6686,
"step": 161
},
{
"epoch": 0.43,
"grad_norm": 0.6492145705292489,
"learning_rate": 0.00012623198122559863,
"loss": 0.7046,
"step": 162
},
{
"epoch": 0.44,
"grad_norm": 0.6681970186049779,
"learning_rate": 0.00012539122644104756,
"loss": 0.7536,
"step": 163
},
{
"epoch": 0.44,
"grad_norm": 0.6311121728953317,
"learning_rate": 0.00012454854871407994,
"loss": 0.7268,
"step": 164
},
{
"epoch": 0.44,
"grad_norm": 0.6171650489076799,
"learning_rate": 0.00012370401186283185,
"loss": 0.6731,
"step": 165
},
{
"epoch": 0.45,
"grad_norm": 0.6139846134340758,
"learning_rate": 0.00012285767984623563,
"loss": 0.6648,
"step": 166
},
{
"epoch": 0.45,
"grad_norm": 0.6727015510207016,
"learning_rate": 0.00012200961675917604,
"loss": 0.7182,
"step": 167
},
{
"epoch": 0.45,
"grad_norm": 0.6093536573435098,
"learning_rate": 0.00012115988682763627,
"loss": 0.7114,
"step": 168
},
{
"epoch": 0.45,
"grad_norm": 0.6831844462735769,
"learning_rate": 0.00012030855440383386,
"loss": 0.6701,
"step": 169
},
{
"epoch": 0.46,
"grad_norm": 0.6158465960782087,
"learning_rate": 0.00011945568396134721,
"loss": 0.646,
"step": 170
},
{
"epoch": 0.46,
"grad_norm": 0.609767308811365,
"learning_rate": 0.0001186013400902328,
"loss": 0.6274,
"step": 171
},
{
"epoch": 0.46,
"grad_norm": 0.5469825613650058,
"learning_rate": 0.00011774558749213357,
"loss": 0.6817,
"step": 172
},
{
"epoch": 0.46,
"grad_norm": 0.6747015764561682,
"learning_rate": 0.00011688849097537904,
"loss": 0.691,
"step": 173
},
{
"epoch": 0.47,
"grad_norm": 0.6382223494379158,
"learning_rate": 0.00011603011545007707,
"loss": 0.7,
"step": 174
},
{
"epoch": 0.47,
"grad_norm": 0.6124637055623531,
"learning_rate": 0.00011517052592319811,
"loss": 0.6687,
"step": 175
},
{
"epoch": 0.47,
"grad_norm": 0.6350222892172688,
"learning_rate": 0.00011430978749365203,
"loss": 0.6679,
"step": 176
},
{
"epoch": 0.47,
"grad_norm": 0.6030636477769307,
"learning_rate": 0.00011344796534735804,
"loss": 0.642,
"step": 177
},
{
"epoch": 0.48,
"grad_norm": 0.6399897114215611,
"learning_rate": 0.00011258512475230807,
"loss": 0.6743,
"step": 178
},
{
"epoch": 0.48,
"grad_norm": 0.6493391217471788,
"learning_rate": 0.00011172133105362358,
"loss": 0.6282,
"step": 179
},
{
"epoch": 0.48,
"grad_norm": 0.6279394364552422,
"learning_rate": 0.00011085664966860727,
"loss": 0.7069,
"step": 180
},
{
"epoch": 0.49,
"grad_norm": 0.6238259325357338,
"learning_rate": 0.00010999114608178837,
"loss": 0.6586,
"step": 181
},
{
"epoch": 0.49,
"grad_norm": 0.7124124134964612,
"learning_rate": 0.00010912488583996363,
"loss": 0.6666,
"step": 182
},
{
"epoch": 0.49,
"grad_norm": 0.5628912814858279,
"learning_rate": 0.00010825793454723325,
"loss": 0.661,
"step": 183
},
{
"epoch": 0.49,
"grad_norm": 0.6905577214693723,
"learning_rate": 0.00010739035786003239,
"loss": 0.7154,
"step": 184
},
{
"epoch": 0.5,
"grad_norm": 0.6530459282353105,
"learning_rate": 0.00010652222148215905,
"loss": 0.681,
"step": 185
},
{
"epoch": 0.5,
"grad_norm": 0.569548383478722,
"learning_rate": 0.00010565359115979791,
"loss": 0.6746,
"step": 186
},
{
"epoch": 0.5,
"grad_norm": 0.5714999167791848,
"learning_rate": 0.00010478453267654147,
"loss": 0.6769,
"step": 187
},
{
"epoch": 0.5,
"grad_norm": 0.7076381300646487,
"learning_rate": 0.00010391511184840774,
"loss": 0.6724,
"step": 188
},
{
"epoch": 0.51,
"grad_norm": 0.5540842570197104,
"learning_rate": 0.00010304539451885629,
"loss": 0.6209,
"step": 189
},
{
"epoch": 0.51,
"grad_norm": 0.6415952358813346,
"learning_rate": 0.00010217544655380129,
"loss": 0.6615,
"step": 190
},
{
"epoch": 0.51,
"grad_norm": 0.6393147605518732,
"learning_rate": 0.00010130533383662362,
"loss": 0.6385,
"step": 191
},
{
"epoch": 0.51,
"grad_norm": 0.6289522798672027,
"learning_rate": 0.00010043512226318124,
"loss": 0.6857,
"step": 192
},
{
"epoch": 0.52,
"grad_norm": 0.6734225585184999,
"learning_rate": 9.956487773681879e-05,
"loss": 0.6244,
"step": 193
},
{
"epoch": 0.52,
"grad_norm": 0.6164997133604642,
"learning_rate": 9.869466616337642e-05,
"loss": 0.6462,
"step": 194
},
{
"epoch": 0.52,
"grad_norm": 0.6511338335869312,
"learning_rate": 9.78245534461987e-05,
"loss": 0.7006,
"step": 195
},
{
"epoch": 0.53,
"grad_norm": 0.6377607002835902,
"learning_rate": 9.695460548114373e-05,
"loss": 0.6989,
"step": 196
},
{
"epoch": 0.53,
"grad_norm": 0.5936010824613069,
"learning_rate": 9.608488815159227e-05,
"loss": 0.6407,
"step": 197
},
{
"epoch": 0.53,
"grad_norm": 0.643239848981791,
"learning_rate": 9.521546732345858e-05,
"loss": 0.693,
"step": 198
},
{
"epoch": 0.53,
"grad_norm": 0.5653914084405384,
"learning_rate": 9.43464088402021e-05,
"loss": 0.6677,
"step": 199
},
{
"epoch": 0.54,
"grad_norm": 0.5724255909192363,
"learning_rate": 9.347777851784096e-05,
"loss": 0.6372,
"step": 200
},
{
"epoch": 0.54,
"grad_norm": 0.5659784186045361,
"learning_rate": 9.260964213996762e-05,
"loss": 0.7082,
"step": 201
},
{
"epoch": 0.54,
"grad_norm": 0.6531108684279694,
"learning_rate": 9.174206545276677e-05,
"loss": 0.6176,
"step": 202
},
{
"epoch": 0.54,
"grad_norm": 0.6422142052940009,
"learning_rate": 9.087511416003635e-05,
"loss": 0.6754,
"step": 203
},
{
"epoch": 0.55,
"grad_norm": 0.6537245766903678,
"learning_rate": 9.000885391821164e-05,
"loss": 0.6484,
"step": 204
},
{
"epoch": 0.55,
"grad_norm": 0.6396400269252728,
"learning_rate": 8.914335033139274e-05,
"loss": 0.6652,
"step": 205
},
{
"epoch": 0.55,
"grad_norm": 0.5559545083417303,
"learning_rate": 8.827866894637643e-05,
"loss": 0.6791,
"step": 206
},
{
"epoch": 0.55,
"grad_norm": 0.5995648170220891,
"learning_rate": 8.741487524769199e-05,
"loss": 0.6461,
"step": 207
},
{
"epoch": 0.56,
"grad_norm": 0.5928913948160235,
"learning_rate": 8.655203465264197e-05,
"loss": 0.6631,
"step": 208
},
{
"epoch": 0.56,
"grad_norm": 0.5570723055144339,
"learning_rate": 8.5690212506348e-05,
"loss": 0.664,
"step": 209
},
{
"epoch": 0.56,
"grad_norm": 0.5932902194668669,
"learning_rate": 8.482947407680193e-05,
"loss": 0.6523,
"step": 210
},
{
"epoch": 0.57,
"grad_norm": 0.637557903417406,
"learning_rate": 8.396988454992295e-05,
"loss": 0.5927,
"step": 211
},
{
"epoch": 0.57,
"grad_norm": 0.588469166277496,
"learning_rate": 8.311150902462095e-05,
"loss": 0.6708,
"step": 212
},
{
"epoch": 0.57,
"grad_norm": 0.7093924699893847,
"learning_rate": 8.225441250786644e-05,
"loss": 0.6862,
"step": 213
},
{
"epoch": 0.57,
"grad_norm": 0.6644277543653607,
"learning_rate": 8.139865990976722e-05,
"loss": 0.6397,
"step": 214
},
{
"epoch": 0.58,
"grad_norm": 0.5692386084383936,
"learning_rate": 8.054431603865283e-05,
"loss": 0.6501,
"step": 215
},
{
"epoch": 0.58,
"grad_norm": 0.6346519607295912,
"learning_rate": 7.969144559616613e-05,
"loss": 0.6625,
"step": 216
},
{
"epoch": 0.58,
"grad_norm": 0.6264859907511606,
"learning_rate": 7.884011317236376e-05,
"loss": 0.643,
"step": 217
},
{
"epoch": 0.58,
"grad_norm": 0.623282323523227,
"learning_rate": 7.7990383240824e-05,
"loss": 0.6844,
"step": 218
},
{
"epoch": 0.59,
"grad_norm": 0.592564998468818,
"learning_rate": 7.714232015376441e-05,
"loss": 0.6411,
"step": 219
},
{
"epoch": 0.59,
"grad_norm": 0.613168941035148,
"learning_rate": 7.629598813716817e-05,
"loss": 0.6556,
"step": 220
},
{
"epoch": 0.59,
"grad_norm": 0.6112236475732046,
"learning_rate": 7.54514512859201e-05,
"loss": 0.6714,
"step": 221
},
{
"epoch": 0.6,
"grad_norm": 0.6020471154492149,
"learning_rate": 7.460877355895248e-05,
"loss": 0.6621,
"step": 222
},
{
"epoch": 0.6,
"grad_norm": 0.6620676419938459,
"learning_rate": 7.376801877440143e-05,
"loss": 0.6259,
"step": 223
},
{
"epoch": 0.6,
"grad_norm": 0.6141172748711491,
"learning_rate": 7.292925060477367e-05,
"loss": 0.6335,
"step": 224
},
{
"epoch": 0.6,
"grad_norm": 0.5491409778708467,
"learning_rate": 7.20925325721246e-05,
"loss": 0.6264,
"step": 225
},
{
"epoch": 0.61,
"grad_norm": 0.6462891221310414,
"learning_rate": 7.12579280432474e-05,
"loss": 0.6721,
"step": 226
},
{
"epoch": 0.61,
"grad_norm": 0.6064044918933936,
"learning_rate": 7.04255002248743e-05,
"loss": 0.6356,
"step": 227
},
{
"epoch": 0.61,
"grad_norm": 0.5553383645621945,
"learning_rate": 6.959531215888961e-05,
"loss": 0.6569,
"step": 228
},
{
"epoch": 0.61,
"grad_norm": 0.5637312706044894,
"learning_rate": 6.876742671755547e-05,
"loss": 0.676,
"step": 229
},
{
"epoch": 0.62,
"grad_norm": 0.597407585296278,
"learning_rate": 6.794190659875052e-05,
"loss": 0.6223,
"step": 230
},
{
"epoch": 0.62,
"grad_norm": 0.6117957902349217,
"learning_rate": 6.711881432122128e-05,
"loss": 0.5797,
"step": 231
},
{
"epoch": 0.62,
"grad_norm": 0.5636620516655729,
"learning_rate": 6.62982122198478e-05,
"loss": 0.6347,
"step": 232
},
{
"epoch": 0.62,
"grad_norm": 0.6093740585052143,
"learning_rate": 6.548016244092264e-05,
"loss": 0.6364,
"step": 233
},
{
"epoch": 0.63,
"grad_norm": 0.5832448160710062,
"learning_rate": 6.466472693744454e-05,
"loss": 0.6249,
"step": 234
},
{
"epoch": 0.63,
"grad_norm": 0.6212348267309529,
"learning_rate": 6.385196746442644e-05,
"loss": 0.6373,
"step": 235
},
{
"epoch": 0.63,
"grad_norm": 0.6041857720598327,
"learning_rate": 6.304194557421866e-05,
"loss": 0.6367,
"step": 236
},
{
"epoch": 0.64,
"grad_norm": 0.6228926839065734,
"learning_rate": 6.223472261184737e-05,
"loss": 0.6272,
"step": 237
},
{
"epoch": 0.64,
"grad_norm": 0.592678196931043,
"learning_rate": 6.143035971036885e-05,
"loss": 0.6405,
"step": 238
},
{
"epoch": 0.64,
"grad_norm": 0.6188393302850655,
"learning_rate": 6.0628917786239615e-05,
"loss": 0.6562,
"step": 239
},
{
"epoch": 0.64,
"grad_norm": 0.6521252480679709,
"learning_rate": 5.983045753470308e-05,
"loss": 0.6177,
"step": 240
},
{
"epoch": 0.65,
"grad_norm": 0.6015226341824674,
"learning_rate": 5.9035039425192996e-05,
"loss": 0.6543,
"step": 241
},
{
"epoch": 0.65,
"grad_norm": 0.5522056983476296,
"learning_rate": 5.824272369675403e-05,
"loss": 0.6412,
"step": 242
},
{
"epoch": 0.65,
"grad_norm": 0.6565986603056718,
"learning_rate": 5.74535703534795e-05,
"loss": 0.608,
"step": 243
},
{
"epoch": 0.65,
"grad_norm": 0.6243504360364475,
"learning_rate": 5.666763915996725e-05,
"loss": 0.6187,
"step": 244
},
{
"epoch": 0.66,
"grad_norm": 0.5710246396677061,
"learning_rate": 5.588498963679338e-05,
"loss": 0.6566,
"step": 245
},
{
"epoch": 0.66,
"grad_norm": 0.6357843375059528,
"learning_rate": 5.5105681056004996e-05,
"loss": 0.6552,
"step": 246
},
{
"epoch": 0.66,
"grad_norm": 0.6476235313255283,
"learning_rate": 5.432977243663089e-05,
"loss": 0.6433,
"step": 247
},
{
"epoch": 0.66,
"grad_norm": 0.6420823226435337,
"learning_rate": 5.355732254021205e-05,
"loss": 0.6555,
"step": 248
},
{
"epoch": 0.67,
"grad_norm": 0.5834281412304384,
"learning_rate": 5.278838986635175e-05,
"loss": 0.6457,
"step": 249
},
{
"epoch": 0.67,
"grad_norm": 0.603427697793282,
"learning_rate": 5.202303264828482e-05,
"loss": 0.6338,
"step": 250
},
{
"epoch": 0.67,
"grad_norm": 0.5503026867657649,
"learning_rate": 5.1261308848467806e-05,
"loss": 0.6211,
"step": 251
},
{
"epoch": 0.68,
"grad_norm": 0.6140538857825613,
"learning_rate": 5.0503276154189205e-05,
"loss": 0.6133,
"step": 252
},
{
"epoch": 0.68,
"grad_norm": 0.5651106677229555,
"learning_rate": 4.974899197320059e-05,
"loss": 0.6085,
"step": 253
},
{
"epoch": 0.68,
"grad_norm": 0.5806960834836444,
"learning_rate": 4.899851342936913e-05,
"loss": 0.6326,
"step": 254
},
{
"epoch": 0.68,
"grad_norm": 0.6248344723288919,
"learning_rate": 4.825189735835137e-05,
"loss": 0.5868,
"step": 255
},
{
"epoch": 0.69,
"grad_norm": 0.5547428746495593,
"learning_rate": 4.750920030328889e-05,
"loss": 0.6694,
"step": 256
},
{
"epoch": 0.69,
"grad_norm": 0.6481730854953198,
"learning_rate": 4.677047851052615e-05,
"loss": 0.5942,
"step": 257
},
{
"epoch": 0.69,
"grad_norm": 0.6287265723350564,
"learning_rate": 4.6035787925350916e-05,
"loss": 0.6555,
"step": 258
},
{
"epoch": 0.69,
"grad_norm": 0.5975999751699451,
"learning_rate": 4.530518418775733e-05,
"loss": 0.5761,
"step": 259
},
{
"epoch": 0.7,
"grad_norm": 0.6370150437871046,
"learning_rate": 4.457872262823202e-05,
"loss": 0.68,
"step": 260
},
{
"epoch": 0.7,
"grad_norm": 0.5854134285960169,
"learning_rate": 4.385645826356401e-05,
"loss": 0.6401,
"step": 261
},
{
"epoch": 0.7,
"grad_norm": 0.5939120027050839,
"learning_rate": 4.313844579267793e-05,
"loss": 0.6449,
"step": 262
},
{
"epoch": 0.71,
"grad_norm": 0.6171451891512717,
"learning_rate": 4.242473959249173e-05,
"loss": 0.6585,
"step": 263
},
{
"epoch": 0.71,
"grad_norm": 0.5375088192193765,
"learning_rate": 4.1715393713798464e-05,
"loss": 0.6159,
"step": 264
},
{
"epoch": 0.71,
"grad_norm": 0.5570000723429187,
"learning_rate": 4.1010461877172836e-05,
"loss": 0.6302,
"step": 265
},
{
"epoch": 0.71,
"grad_norm": 0.6991127608006316,
"learning_rate": 4.030999746890295e-05,
"loss": 0.633,
"step": 266
},
{
"epoch": 0.72,
"grad_norm": 0.6208588931148632,
"learning_rate": 3.961405353694716e-05,
"loss": 0.6392,
"step": 267
},
{
"epoch": 0.72,
"grad_norm": 0.5168283033651194,
"learning_rate": 3.892268278691651e-05,
"loss": 0.6427,
"step": 268
},
{
"epoch": 0.72,
"grad_norm": 0.583521526288499,
"learning_rate": 3.8235937578083424e-05,
"loss": 0.5509,
"step": 269
},
{
"epoch": 0.72,
"grad_norm": 0.5431854753646991,
"learning_rate": 3.755386991941618e-05,
"loss": 0.6798,
"step": 270
},
{
"epoch": 0.73,
"grad_norm": 0.5501260666199879,
"learning_rate": 3.687653146564025e-05,
"loss": 0.5858,
"step": 271
},
{
"epoch": 0.73,
"grad_norm": 0.5343821946856416,
"learning_rate": 3.6203973513326395e-05,
"loss": 0.6327,
"step": 272
},
{
"epoch": 0.73,
"grad_norm": 0.5536890084608536,
"learning_rate": 3.553624699700578e-05,
"loss": 0.5837,
"step": 273
},
{
"epoch": 0.73,
"grad_norm": 0.5340609690647654,
"learning_rate": 3.4873402485312544e-05,
"loss": 0.6238,
"step": 274
},
{
"epoch": 0.74,
"grad_norm": 0.684928593101013,
"learning_rate": 3.4215490177154173e-05,
"loss": 0.6159,
"step": 275
},
{
"epoch": 0.74,
"grad_norm": 0.6111300960968657,
"learning_rate": 3.356255989790984e-05,
"loss": 0.6057,
"step": 276
},
{
"epoch": 0.74,
"grad_norm": 0.5583521629303174,
"learning_rate": 3.2914661095656805e-05,
"loss": 0.6339,
"step": 277
},
{
"epoch": 0.75,
"grad_norm": 0.5796480790048567,
"learning_rate": 3.227184283742591e-05,
"loss": 0.6285,
"step": 278
},
{
"epoch": 0.75,
"grad_norm": 0.6052550118253326,
"learning_rate": 3.1634153805485243e-05,
"loss": 0.6373,
"step": 279
},
{
"epoch": 0.75,
"grad_norm": 0.5993042127057324,
"learning_rate": 3.100164229365361e-05,
"loss": 0.5693,
"step": 280
},
{
"epoch": 0.75,
"grad_norm": 0.6482643299850712,
"learning_rate": 3.0374356203643005e-05,
"loss": 0.6157,
"step": 281
},
{
"epoch": 0.76,
"grad_norm": 0.5279593828065655,
"learning_rate": 2.9752343041430798e-05,
"loss": 0.5828,
"step": 282
},
{
"epoch": 0.76,
"grad_norm": 0.578989064972009,
"learning_rate": 2.9135649913662087e-05,
"loss": 0.6142,
"step": 283
},
{
"epoch": 0.76,
"grad_norm": 0.6613465262466375,
"learning_rate": 2.8524323524082243e-05,
"loss": 0.5862,
"step": 284
},
{
"epoch": 0.76,
"grad_norm": 0.5593586714719698,
"learning_rate": 2.7918410169999822e-05,
"loss": 0.6186,
"step": 285
},
{
"epoch": 0.77,
"grad_norm": 0.6009401050358155,
"learning_rate": 2.7317955738780333e-05,
"loss": 0.5771,
"step": 286
},
{
"epoch": 0.77,
"grad_norm": 0.6239256426233234,
"learning_rate": 2.672300570437116e-05,
"loss": 0.5703,
"step": 287
},
{
"epoch": 0.77,
"grad_norm": 0.5565646944070531,
"learning_rate": 2.6133605123857708e-05,
"loss": 0.6224,
"step": 288
},
{
"epoch": 0.77,
"grad_norm": 0.6682481662923421,
"learning_rate": 2.5549798634050936e-05,
"loss": 0.63,
"step": 289
},
{
"epoch": 0.78,
"grad_norm": 0.5811684620171189,
"learning_rate": 2.4971630448107163e-05,
"loss": 0.6351,
"step": 290
},
{
"epoch": 0.78,
"grad_norm": 0.6113567840839303,
"learning_rate": 2.4399144352179483e-05,
"loss": 0.5717,
"step": 291
},
{
"epoch": 0.78,
"grad_norm": 0.5287785841858424,
"learning_rate": 2.3832383702101747e-05,
"loss": 0.5764,
"step": 292
},
{
"epoch": 0.79,
"grad_norm": 0.581644293338218,
"learning_rate": 2.327139142010538e-05,
"loss": 0.6452,
"step": 293
},
{
"epoch": 0.79,
"grad_norm": 0.5719089122205556,
"learning_rate": 2.271620999156837e-05,
"loss": 0.5994,
"step": 294
},
{
"epoch": 0.79,
"grad_norm": 0.6003439148215275,
"learning_rate": 2.216688146179795e-05,
"loss": 0.6685,
"step": 295
},
{
"epoch": 0.79,
"grad_norm": 0.5523434831489719,
"learning_rate": 2.1623447432846466e-05,
"loss": 0.5729,
"step": 296
},
{
"epoch": 0.8,
"grad_norm": 0.5800800946514322,
"learning_rate": 2.1085949060360654e-05,
"loss": 0.598,
"step": 297
},
{
"epoch": 0.8,
"grad_norm": 0.5987900895932757,
"learning_rate": 2.055442705046474e-05,
"loss": 0.529,
"step": 298
},
{
"epoch": 0.8,
"grad_norm": 0.5380314863841026,
"learning_rate": 2.0028921656677855e-05,
"loss": 0.5847,
"step": 299
},
{
"epoch": 0.8,
"grad_norm": 0.6154729554684041,
"learning_rate": 1.950947267686536e-05,
"loss": 0.5655,
"step": 300
},
{
"epoch": 0.81,
"grad_norm": 0.5653169274060361,
"learning_rate": 1.8996119450224935e-05,
"loss": 0.6026,
"step": 301
},
{
"epoch": 0.81,
"grad_norm": 0.6096171765238086,
"learning_rate": 1.8488900854307366e-05,
"loss": 0.6392,
"step": 302
},
{
"epoch": 0.81,
"grad_norm": 0.5821164151718903,
"learning_rate": 1.79878553020721e-05,
"loss": 0.6021,
"step": 303
},
{
"epoch": 0.82,
"grad_norm": 0.6002726973121734,
"learning_rate": 1.7493020738978206e-05,
"loss": 0.6432,
"step": 304
},
{
"epoch": 0.82,
"grad_norm": 0.5407410721681246,
"learning_rate": 1.7004434640110854e-05,
"loss": 0.5675,
"step": 305
},
{
"epoch": 0.82,
"grad_norm": 0.5951204193957552,
"learning_rate": 1.6522134007342893e-05,
"loss": 0.6271,
"step": 306
},
{
"epoch": 0.82,
"grad_norm": 0.6003726456718518,
"learning_rate": 1.60461553665328e-05,
"loss": 0.6051,
"step": 307
},
{
"epoch": 0.83,
"grad_norm": 0.6020973087534124,
"learning_rate": 1.557653476475852e-05,
"loss": 0.5899,
"step": 308
},
{
"epoch": 0.83,
"grad_norm": 0.5500181352884663,
"learning_rate": 1.5113307767587448e-05,
"loss": 0.6425,
"step": 309
},
{
"epoch": 0.83,
"grad_norm": 0.6418488919418228,
"learning_rate": 1.4656509456382928e-05,
"loss": 0.6296,
"step": 310
},
{
"epoch": 0.83,
"grad_norm": 0.54497743056703,
"learning_rate": 1.4206174425647556e-05,
"loss": 0.5507,
"step": 311
},
{
"epoch": 0.84,
"grad_norm": 0.6216914003629271,
"learning_rate": 1.3762336780403163e-05,
"loss": 0.6547,
"step": 312
},
{
"epoch": 0.84,
"grad_norm": 0.6114537267371907,
"learning_rate": 1.332503013360794e-05,
"loss": 0.5881,
"step": 313
},
{
"epoch": 0.84,
"grad_norm": 0.5787707524225494,
"learning_rate": 1.2894287603611032e-05,
"loss": 0.6015,
"step": 314
},
{
"epoch": 0.84,
"grad_norm": 0.5727228874213498,
"learning_rate": 1.247014181164412e-05,
"loss": 0.6315,
"step": 315
},
{
"epoch": 0.85,
"grad_norm": 0.5365520051065367,
"learning_rate": 1.2052624879351104e-05,
"loss": 0.5849,
"step": 316
},
{
"epoch": 0.85,
"grad_norm": 0.5882130601500214,
"learning_rate": 1.1641768426355427e-05,
"loss": 0.5638,
"step": 317
},
{
"epoch": 0.85,
"grad_norm": 0.5605719123191992,
"learning_rate": 1.123760356786545e-05,
"loss": 0.5813,
"step": 318
},
{
"epoch": 0.86,
"grad_norm": 0.540475552941427,
"learning_rate": 1.0840160912317943e-05,
"loss": 0.5988,
"step": 319
},
{
"epoch": 0.86,
"grad_norm": 0.5836793137801853,
"learning_rate": 1.0449470559060126e-05,
"loss": 0.6359,
"step": 320
},
{
"epoch": 0.86,
"grad_norm": 0.5415961274854277,
"learning_rate": 1.0065562096070069e-05,
"loss": 0.6206,
"step": 321
},
{
"epoch": 0.86,
"grad_norm": 0.5586612112915813,
"learning_rate": 9.68846459771604e-06,
"loss": 0.5986,
"step": 322
},
{
"epoch": 0.87,
"grad_norm": 0.6076513228987033,
"learning_rate": 9.318206622554549e-06,
"loss": 0.5981,
"step": 323
},
{
"epoch": 0.87,
"grad_norm": 0.5929614655401766,
"learning_rate": 8.954816211167482e-06,
"loss": 0.5746,
"step": 324
},
{
"epoch": 0.87,
"grad_norm": 0.5680926959791349,
"learning_rate": 8.59832088403868e-06,
"loss": 0.6537,
"step": 325
},
{
"epoch": 0.87,
"grad_norm": 0.6220839217304087,
"learning_rate": 8.24874763946959e-06,
"loss": 0.5998,
"step": 326
},
{
"epoch": 0.88,
"grad_norm": 0.554602037671773,
"learning_rate": 7.906122951534677e-06,
"loss": 0.5864,
"step": 327
},
{
"epoch": 0.88,
"grad_norm": 0.5935221816502435,
"learning_rate": 7.570472768076464e-06,
"loss": 0.556,
"step": 328
},
{
"epoch": 0.88,
"grad_norm": 0.5727608314538538,
"learning_rate": 7.241822508740448e-06,
"loss": 0.6276,
"step": 329
},
{
"epoch": 0.88,
"grad_norm": 0.6164522880456009,
"learning_rate": 6.920197063050038e-06,
"loss": 0.5819,
"step": 330
},
{
"epoch": 0.89,
"grad_norm": 0.5718452829860532,
"learning_rate": 6.605620788521472e-06,
"loss": 0.6139,
"step": 331
},
{
"epoch": 0.89,
"grad_norm": 0.569531129249351,
"learning_rate": 6.2981175088193564e-06,
"loss": 0.5936,
"step": 332
},
{
"epoch": 0.89,
"grad_norm": 0.5844212361264539,
"learning_rate": 5.997710511952259e-06,
"loss": 0.6065,
"step": 333
},
{
"epoch": 0.9,
"grad_norm": 0.6109564839739653,
"learning_rate": 5.70442254850918e-06,
"loss": 0.648,
"step": 334
},
{
"epoch": 0.9,
"grad_norm": 0.6491048595264175,
"learning_rate": 5.418275829936537e-06,
"loss": 0.555,
"step": 335
},
{
"epoch": 0.9,
"grad_norm": 0.6204717817131216,
"learning_rate": 5.139292026855991e-06,
"loss": 0.6479,
"step": 336
},
{
"epoch": 0.9,
"grad_norm": 0.6360339795207006,
"learning_rate": 4.867492267423379e-06,
"loss": 0.5502,
"step": 337
},
{
"epoch": 0.91,
"grad_norm": 0.5500198353842222,
"learning_rate": 4.602897135728513e-06,
"loss": 0.6044,
"step": 338
},
{
"epoch": 0.91,
"grad_norm": 0.5577787929024242,
"learning_rate": 4.3455266702364e-06,
"loss": 0.6021,
"step": 339
},
{
"epoch": 0.91,
"grad_norm": 0.5620959438033024,
"learning_rate": 4.095400362269597e-06,
"loss": 0.6364,
"step": 340
},
{
"epoch": 0.91,
"grad_norm": 0.6455883188430549,
"learning_rate": 3.852537154532121e-06,
"loss": 0.5935,
"step": 341
},
{
"epoch": 0.92,
"grad_norm": 0.5463076296085635,
"learning_rate": 3.616955439674863e-06,
"loss": 0.5898,
"step": 342
},
{
"epoch": 0.92,
"grad_norm": 0.5664602810118163,
"learning_rate": 3.388673058902647e-06,
"loss": 0.5727,
"step": 343
},
{
"epoch": 0.92,
"grad_norm": 0.5844823066126078,
"learning_rate": 3.167707300623135e-06,
"loss": 0.5734,
"step": 344
},
{
"epoch": 0.92,
"grad_norm": 0.5688042679406337,
"learning_rate": 2.9540748991374268e-06,
"loss": 0.5911,
"step": 345
},
{
"epoch": 0.93,
"grad_norm": 0.5817718407371287,
"learning_rate": 2.7477920333728203e-06,
"loss": 0.583,
"step": 346
},
{
"epoch": 0.93,
"grad_norm": 0.6549728437210761,
"learning_rate": 2.548874325657502e-06,
"loss": 0.6076,
"step": 347
},
{
"epoch": 0.93,
"grad_norm": 0.5825225050052231,
"learning_rate": 2.3573368405374052e-06,
"loss": 0.5811,
"step": 348
},
{
"epoch": 0.94,
"grad_norm": 0.5577099309283072,
"learning_rate": 2.1731940836354103e-06,
"loss": 0.6496,
"step": 349
},
{
"epoch": 0.94,
"grad_norm": 0.6215824384460507,
"learning_rate": 1.996460000552702e-06,
"loss": 0.5724,
"step": 350
},
{
"epoch": 0.94,
"grad_norm": 0.5177119689387795,
"learning_rate": 1.827147975812693e-06,
"loss": 0.6236,
"step": 351
},
{
"epoch": 0.94,
"grad_norm": 0.6230359593868082,
"learning_rate": 1.6652708318473763e-06,
"loss": 0.6383,
"step": 352
},
{
"epoch": 0.95,
"grad_norm": 0.5622331543379322,
"learning_rate": 1.5108408280262276e-06,
"loss": 0.594,
"step": 353
},
{
"epoch": 0.95,
"grad_norm": 0.6107181340780342,
"learning_rate": 1.3638696597277679e-06,
"loss": 0.6463,
"step": 354
},
{
"epoch": 0.95,
"grad_norm": 0.6031539978263786,
"learning_rate": 1.2243684574538838e-06,
"loss": 0.6587,
"step": 355
},
{
"epoch": 0.95,
"grad_norm": 0.5761269685398007,
"learning_rate": 1.092347785986858e-06,
"loss": 0.6137,
"step": 356
},
{
"epoch": 0.96,
"grad_norm": 0.5520617664080628,
"learning_rate": 9.678176435892417e-07,
"loss": 0.5566,
"step": 357
},
{
"epoch": 0.96,
"grad_norm": 0.6171032667293939,
"learning_rate": 8.507874612467382e-07,
"loss": 0.6151,
"step": 358
},
{
"epoch": 0.96,
"grad_norm": 0.6143485738921709,
"learning_rate": 7.412661019538858e-07,
"loss": 0.5945,
"step": 359
},
{
"epoch": 0.97,
"grad_norm": 0.5659045654192223,
"learning_rate": 6.392618600429057e-07,
"loss": 0.5939,
"step": 360
},
{
"epoch": 0.97,
"grad_norm": 0.6008704187809593,
"learning_rate": 5.447824605555041e-07,
"loss": 0.593,
"step": 361
},
{
"epoch": 0.97,
"grad_norm": 0.6456664206486963,
"learning_rate": 4.578350586578628e-07,
"loss": 0.6243,
"step": 362
},
{
"epoch": 0.97,
"grad_norm": 0.6134762242372662,
"learning_rate": 3.7842623909875033e-07,
"loss": 0.5784,
"step": 363
},
{
"epoch": 0.98,
"grad_norm": 0.5477747018780752,
"learning_rate": 3.0656201571085394e-07,
"loss": 0.593,
"step": 364
},
{
"epoch": 0.98,
"grad_norm": 0.5075895950713205,
"learning_rate": 2.422478309553222e-07,
"loss": 0.5505,
"step": 365
},
{
"epoch": 0.98,
"grad_norm": 0.587289160097588,
"learning_rate": 1.854885555095942e-07,
"loss": 0.6621,
"step": 366
},
{
"epoch": 0.98,
"grad_norm": 0.6392065048372381,
"learning_rate": 1.3628848789853933e-07,
"loss": 0.6049,
"step": 367
},
{
"epoch": 0.99,
"grad_norm": 0.5961350765030653,
"learning_rate": 9.465135416891757e-08,
"loss": 0.6227,
"step": 368
},
{
"epoch": 0.99,
"grad_norm": 0.5699523340921686,
"learning_rate": 6.058030760718315e-08,
"loss": 0.5973,
"step": 369
},
{
"epoch": 0.99,
"grad_norm": 0.5616341300318727,
"learning_rate": 3.4077928500686475e-08,
"loss": 0.6024,
"step": 370
},
{
"epoch": 0.99,
"grad_norm": 0.5919424339524653,
"learning_rate": 1.5146223942297254e-08,
"loss": 0.6247,
"step": 371
},
{
"epoch": 1.0,
"grad_norm": 0.5459942448953277,
"learning_rate": 3.7866276783149464e-09,
"loss": 0.6021,
"step": 372
},
{
"epoch": 1.0,
"grad_norm": 0.5739335118144996,
"learning_rate": 0.0,
"loss": 0.5901,
"step": 373
},
{
"epoch": 1.0,
"step": 373,
"total_flos": 52213203664896.0,
"train_loss": 0.6854520184424863,
"train_runtime": 4288.0633,
"train_samples_per_second": 5.561,
"train_steps_per_second": 0.087
}
],
"logging_steps": 1.0,
"max_steps": 373,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"total_flos": 52213203664896.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}