danube-fwdclm-tuned3 / trainer_state.json
semran1's picture
Upload folder using huggingface_hub
4ca5b93 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 2000,
"global_step": 80,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"crossentropy": 2.487246513366699,
"epoch": 0.125,
"grad_norm": 0.03208793327212334,
"learning_rate": 0.001,
"loss": 2.4872,
"step": 1
},
{
"crossentropy": 2.379896402359009,
"epoch": 0.25,
"grad_norm": 0.03239322826266289,
"learning_rate": 0.002,
"loss": 2.3799,
"step": 2
},
{
"crossentropy": 2.5010085105895996,
"epoch": 0.375,
"grad_norm": 0.03320762887597084,
"learning_rate": 0.003,
"loss": 2.501,
"step": 3
},
{
"crossentropy": 2.688127040863037,
"epoch": 0.5,
"grad_norm": 0.03224330395460129,
"learning_rate": 0.004,
"loss": 2.6881,
"step": 4
},
{
"crossentropy": 2.4009199142456055,
"epoch": 0.625,
"grad_norm": 0.029966533184051514,
"learning_rate": 0.005,
"loss": 2.4009,
"step": 5
},
{
"crossentropy": 2.474385976791382,
"epoch": 0.75,
"grad_norm": 0.030283687636256218,
"learning_rate": 0.006,
"loss": 2.4744,
"step": 6
},
{
"crossentropy": 2.5291025638580322,
"epoch": 0.875,
"grad_norm": 0.03127186745405197,
"learning_rate": 0.006999999999999999,
"loss": 2.5291,
"step": 7
},
{
"crossentropy": 2.3309171199798584,
"epoch": 1.0,
"grad_norm": 0.030661335214972496,
"learning_rate": 0.008,
"loss": 2.3309,
"step": 8
},
{
"crossentropy": 2.4654123783111572,
"epoch": 1.125,
"grad_norm": 0.029608242213726044,
"learning_rate": 0.009000000000000001,
"loss": 2.4654,
"step": 9
},
{
"crossentropy": 2.468688488006592,
"epoch": 1.25,
"grad_norm": 0.03335599973797798,
"learning_rate": 0.01,
"loss": 2.4687,
"step": 10
},
{
"crossentropy": 2.3773512840270996,
"epoch": 1.375,
"grad_norm": 0.031801655888557434,
"learning_rate": 0.009994965332706574,
"loss": 2.3774,
"step": 11
},
{
"crossentropy": 2.5056920051574707,
"epoch": 1.5,
"grad_norm": 0.03860826417803764,
"learning_rate": 0.009979871469976196,
"loss": 2.5057,
"step": 12
},
{
"crossentropy": 2.4569733142852783,
"epoch": 1.625,
"grad_norm": 0.03568004071712494,
"learning_rate": 0.009954748808839673,
"loss": 2.457,
"step": 13
},
{
"crossentropy": 2.4127414226531982,
"epoch": 1.75,
"grad_norm": 0.03557576611638069,
"learning_rate": 0.009919647942993149,
"loss": 2.4127,
"step": 14
},
{
"crossentropy": 2.4089269638061523,
"epoch": 1.875,
"grad_norm": 0.0321757011115551,
"grad_norm_var": 5.793923908080815e-06,
"learning_rate": 0.009874639560909117,
"loss": 2.4089,
"step": 15
},
{
"crossentropy": 2.385178804397583,
"epoch": 2.0,
"grad_norm": 0.033555347472429276,
"grad_norm_var": 5.852987523136916e-06,
"learning_rate": 0.009819814303479266,
"loss": 2.3852,
"step": 16
},
{
"crossentropy": 2.4342517852783203,
"epoch": 2.125,
"grad_norm": 0.034557584673166275,
"grad_norm_var": 6.127065953599033e-06,
"learning_rate": 0.009755282581475769,
"loss": 2.4343,
"step": 17
},
{
"crossentropy": 2.4237754344940186,
"epoch": 2.25,
"grad_norm": 0.03371824324131012,
"grad_norm_var": 6.168768579761655e-06,
"learning_rate": 0.009681174353198686,
"loss": 2.4238,
"step": 18
},
{
"crossentropy": 2.4231510162353516,
"epoch": 2.375,
"grad_norm": 0.02907504141330719,
"grad_norm_var": 6.557002911025779e-06,
"learning_rate": 0.009597638862757255,
"loss": 2.4232,
"step": 19
},
{
"crossentropy": 2.3850157260894775,
"epoch": 2.5,
"grad_norm": 0.03278205543756485,
"grad_norm_var": 6.122522196004565e-06,
"learning_rate": 0.009504844339512096,
"loss": 2.385,
"step": 20
},
{
"crossentropy": 2.3287932872772217,
"epoch": 2.625,
"grad_norm": 0.03301858901977539,
"grad_norm_var": 5.930476905840337e-06,
"learning_rate": 0.00940297765928369,
"loss": 2.3288,
"step": 21
},
{
"crossentropy": 2.3058865070343018,
"epoch": 2.75,
"grad_norm": 0.03293571248650551,
"grad_norm_var": 5.5371734660507875e-06,
"learning_rate": 0.009292243968009331,
"loss": 2.3059,
"step": 22
},
{
"crossentropy": 2.2683637142181396,
"epoch": 2.875,
"grad_norm": 0.03541896864771843,
"grad_norm_var": 5.362674881745298e-06,
"learning_rate": 0.009172866268606514,
"loss": 2.2684,
"step": 23
},
{
"crossentropy": 2.411990165710449,
"epoch": 3.0,
"grad_norm": 0.03502194955945015,
"grad_norm_var": 4.411311488799798e-06,
"learning_rate": 0.009045084971874737,
"loss": 2.412,
"step": 24
},
{
"crossentropy": 2.2515363693237305,
"epoch": 3.125,
"grad_norm": 0.03379856050014496,
"grad_norm_var": 4.1744788750110175e-06,
"learning_rate": 0.008909157412340149,
"loss": 2.2515,
"step": 25
},
{
"crossentropy": 2.3486626148223877,
"epoch": 3.25,
"grad_norm": 0.034475792199373245,
"grad_norm_var": 2.7198637048790144e-06,
"learning_rate": 0.008765357330018056,
"loss": 2.3487,
"step": 26
},
{
"crossentropy": 2.379669666290283,
"epoch": 3.375,
"grad_norm": 0.03319563344120979,
"grad_norm_var": 2.4738877727155267e-06,
"learning_rate": 0.008613974319136958,
"loss": 2.3797,
"step": 27
},
{
"crossentropy": 2.3640384674072266,
"epoch": 3.5,
"grad_norm": 0.03426358476281166,
"grad_norm_var": 2.2389126654258237e-06,
"learning_rate": 0.008455313244934324,
"loss": 2.364,
"step": 28
},
{
"crossentropy": 2.324143409729004,
"epoch": 3.625,
"grad_norm": 0.0358288437128067,
"grad_norm_var": 2.410602035355508e-06,
"learning_rate": 0.008289693629698563,
"loss": 2.3241,
"step": 29
},
{
"crossentropy": 2.159423828125,
"epoch": 3.75,
"grad_norm": 0.031966786831617355,
"grad_norm_var": 2.612506091834166e-06,
"learning_rate": 0.008117449009293669,
"loss": 2.1594,
"step": 30
},
{
"crossentropy": 2.2669661045074463,
"epoch": 3.875,
"grad_norm": 0.0339609794318676,
"grad_norm_var": 2.61687730451927e-06,
"learning_rate": 0.007938926261462366,
"loss": 2.267,
"step": 31
},
{
"crossentropy": 2.482945680618286,
"epoch": 4.0,
"grad_norm": 0.03357086703181267,
"grad_norm_var": 2.5635888162410063e-06,
"learning_rate": 0.007754484907260513,
"loss": 2.4829,
"step": 32
},
{
"crossentropy": 2.1757168769836426,
"epoch": 4.125,
"grad_norm": 0.032582979649305344,
"grad_norm_var": 1.2068945133852464e-06,
"learning_rate": 0.007564496387029531,
"loss": 2.1757,
"step": 33
},
{
"crossentropy": 2.4300379753112793,
"epoch": 4.25,
"grad_norm": 0.03353552147746086,
"grad_norm_var": 1.1362555840309259e-06,
"learning_rate": 0.007369343312364994,
"loss": 2.43,
"step": 34
},
{
"crossentropy": 2.1233766078948975,
"epoch": 4.375,
"grad_norm": 0.034830041229724884,
"grad_norm_var": 1.1319644129707703e-06,
"learning_rate": 0.007169418695587791,
"loss": 2.1234,
"step": 35
},
{
"crossentropy": 2.2619526386260986,
"epoch": 4.5,
"grad_norm": 0.033126723021268845,
"grad_norm_var": 1.1071727437842251e-06,
"learning_rate": 0.006965125158269619,
"loss": 2.262,
"step": 36
},
{
"crossentropy": 2.3065433502197266,
"epoch": 4.625,
"grad_norm": 0.0348593033850193,
"grad_norm_var": 1.0216560744425243e-06,
"learning_rate": 0.0067568741204067145,
"loss": 2.3065,
"step": 37
},
{
"crossentropy": 2.2902674674987793,
"epoch": 4.75,
"grad_norm": 0.0360955074429512,
"grad_norm_var": 1.2434575549111667e-06,
"learning_rate": 0.006545084971874737,
"loss": 2.2903,
"step": 38
},
{
"crossentropy": 2.2587778568267822,
"epoch": 4.875,
"grad_norm": 0.03467360511422157,
"grad_norm_var": 1.2055615432713293e-06,
"learning_rate": 0.006330184227833375,
"loss": 2.2588,
"step": 39
},
{
"crossentropy": 2.3978285789489746,
"epoch": 5.0,
"grad_norm": 0.035668086260557175,
"grad_norm_var": 1.3685657271088221e-06,
"learning_rate": 0.006112604669781572,
"loss": 2.3978,
"step": 40
},
{
"crossentropy": 2.3564870357513428,
"epoch": 5.125,
"grad_norm": 0.0373300276696682,
"grad_norm_var": 2.0190065310690096e-06,
"learning_rate": 0.005892784473993183,
"loss": 2.3565,
"step": 41
},
{
"crossentropy": 2.202275514602661,
"epoch": 5.25,
"grad_norm": 0.03768543526530266,
"grad_norm_var": 2.6563097811015932e-06,
"learning_rate": 0.0056711663290882775,
"loss": 2.2023,
"step": 42
},
{
"crossentropy": 2.192146062850952,
"epoch": 5.375,
"grad_norm": 0.03134535253047943,
"grad_norm_var": 3.228640330968311e-06,
"learning_rate": 0.005448196544517168,
"loss": 2.1921,
"step": 43
},
{
"crossentropy": 2.2519092559814453,
"epoch": 5.5,
"grad_norm": 0.032612044364213943,
"grad_norm_var": 3.044945465432008e-06,
"learning_rate": 0.005224324151752576,
"loss": 2.2519,
"step": 44
},
{
"crossentropy": 2.3610141277313232,
"epoch": 5.625,
"grad_norm": 0.03532750904560089,
"grad_norm_var": 3.073519780112897e-06,
"learning_rate": 0.005,
"loss": 2.361,
"step": 45
},
{
"crossentropy": 2.2574493885040283,
"epoch": 5.75,
"grad_norm": 0.03538508713245392,
"grad_norm_var": 3.047191916038562e-06,
"learning_rate": 0.004775675848247427,
"loss": 2.2574,
"step": 46
},
{
"crossentropy": 2.170254707336426,
"epoch": 5.875,
"grad_norm": 0.03475534915924072,
"grad_norm_var": 2.9654755954788835e-06,
"learning_rate": 0.004551803455482833,
"loss": 2.1703,
"step": 47
},
{
"crossentropy": 2.2004218101501465,
"epoch": 6.0,
"grad_norm": 0.0375908762216568,
"grad_norm_var": 3.107626395358089e-06,
"learning_rate": 0.004328833670911724,
"loss": 2.2004,
"step": 48
},
{
"crossentropy": 2.162071943283081,
"epoch": 6.125,
"grad_norm": 0.031826432794332504,
"grad_norm_var": 4.072774386597716e-06,
"learning_rate": 0.004107215526006817,
"loss": 2.1621,
"step": 49
},
{
"crossentropy": 2.144589424133301,
"epoch": 6.25,
"grad_norm": 0.03268062323331833,
"grad_norm_var": 4.202360731680779e-06,
"learning_rate": 0.003887395330218428,
"loss": 2.1446,
"step": 50
},
{
"crossentropy": 2.202075481414795,
"epoch": 6.375,
"grad_norm": 0.03384561836719513,
"grad_norm_var": 4.294842472950198e-06,
"learning_rate": 0.003669815772166625,
"loss": 2.2021,
"step": 51
},
{
"crossentropy": 2.3169946670532227,
"epoch": 6.5,
"grad_norm": 0.03661832585930824,
"grad_norm_var": 4.387942230220839e-06,
"learning_rate": 0.003454915028125263,
"loss": 2.317,
"step": 52
},
{
"crossentropy": 2.191814422607422,
"epoch": 6.625,
"grad_norm": 0.03522748872637749,
"grad_norm_var": 4.380226970383122e-06,
"learning_rate": 0.003243125879593286,
"loss": 2.1918,
"step": 53
},
{
"crossentropy": 2.196298360824585,
"epoch": 6.75,
"grad_norm": 0.03524046018719673,
"grad_norm_var": 4.3576876356847075e-06,
"learning_rate": 0.0030348748417303823,
"loss": 2.1963,
"step": 54
},
{
"crossentropy": 2.2684872150421143,
"epoch": 6.875,
"grad_norm": 0.03640512377023697,
"grad_norm_var": 4.452811959714381e-06,
"learning_rate": 0.00283058130441221,
"loss": 2.2685,
"step": 55
},
{
"crossentropy": 2.3180689811706543,
"epoch": 7.0,
"grad_norm": 0.03479791432619095,
"grad_norm_var": 4.097831569832335e-06,
"learning_rate": 0.002630656687635007,
"loss": 2.3181,
"step": 56
},
{
"crossentropy": 2.201280117034912,
"epoch": 7.125,
"grad_norm": 0.03417252376675606,
"grad_norm_var": 2.7744502044233416e-06,
"learning_rate": 0.00243550361297047,
"loss": 2.2013,
"step": 57
},
{
"crossentropy": 2.3339715003967285,
"epoch": 7.25,
"grad_norm": 0.033313509076833725,
"grad_norm_var": 2.5884423837709467e-06,
"learning_rate": 0.002245515092739488,
"loss": 2.334,
"step": 58
},
{
"crossentropy": 2.1603586673736572,
"epoch": 7.375,
"grad_norm": 0.03365428000688553,
"grad_norm_var": 2.684439790351147e-06,
"learning_rate": 0.0020610737385376348,
"loss": 2.1604,
"step": 59
},
{
"crossentropy": 2.2999627590179443,
"epoch": 7.5,
"grad_norm": 0.034641556441783905,
"grad_norm_var": 2.6678187146791006e-06,
"learning_rate": 0.0018825509907063327,
"loss": 2.3,
"step": 60
},
{
"crossentropy": 2.1747264862060547,
"epoch": 7.625,
"grad_norm": 0.03396380692720413,
"grad_norm_var": 2.7140570016346197e-06,
"learning_rate": 0.001710306370301437,
"loss": 2.1747,
"step": 61
},
{
"crossentropy": 2.120774269104004,
"epoch": 7.75,
"grad_norm": 0.03624221310019493,
"grad_norm_var": 2.3210148057607623e-06,
"learning_rate": 0.0015446867550656768,
"loss": 2.1208,
"step": 62
},
{
"crossentropy": 2.2385036945343018,
"epoch": 7.875,
"grad_norm": 0.034913428127765656,
"grad_norm_var": 1.732991609125894e-06,
"learning_rate": 0.0013860256808630427,
"loss": 2.2385,
"step": 63
},
{
"crossentropy": 2.1571521759033203,
"epoch": 8.0,
"grad_norm": 0.03277244418859482,
"grad_norm_var": 1.4490052253732697e-06,
"learning_rate": 0.0012346426699819458,
"loss": 2.1572,
"step": 64
},
{
"crossentropy": 2.2847859859466553,
"epoch": 8.125,
"grad_norm": 0.033221788704395294,
"grad_norm_var": 1.512194472239048e-06,
"learning_rate": 0.001090842587659851,
"loss": 2.2848,
"step": 65
},
{
"crossentropy": 2.253898859024048,
"epoch": 8.25,
"grad_norm": 0.03283696994185448,
"grad_norm_var": 1.361639072858939e-06,
"learning_rate": 0.0009549150281252633,
"loss": 2.2539,
"step": 66
},
{
"crossentropy": 2.1402039527893066,
"epoch": 8.375,
"grad_norm": 0.03309963271021843,
"grad_norm_var": 1.3845661239702636e-06,
"learning_rate": 0.0008271337313934868,
"loss": 2.1402,
"step": 67
},
{
"crossentropy": 2.1381993293762207,
"epoch": 8.5,
"grad_norm": 0.03431132435798645,
"grad_norm_var": 1.3068839073976725e-06,
"learning_rate": 0.0007077560319906695,
"loss": 2.1382,
"step": 68
},
{
"crossentropy": 2.1610183715820312,
"epoch": 8.625,
"grad_norm": 0.03382508084177971,
"grad_norm_var": 9.367598844583584e-07,
"learning_rate": 0.00059702234071631,
"loss": 2.161,
"step": 69
},
{
"crossentropy": 2.2194838523864746,
"epoch": 8.75,
"grad_norm": 0.03652471676468849,
"grad_norm_var": 1.3163803696068672e-06,
"learning_rate": 0.0004951556604879049,
"loss": 2.2195,
"step": 70
},
{
"crossentropy": 2.179126262664795,
"epoch": 8.875,
"grad_norm": 0.03386425971984863,
"grad_norm_var": 1.2798076699575718e-06,
"learning_rate": 0.0004023611372427471,
"loss": 2.1791,
"step": 71
},
{
"crossentropy": 2.2535359859466553,
"epoch": 9.0,
"grad_norm": 0.032655857503414154,
"grad_norm_var": 1.3903296123838708e-06,
"learning_rate": 0.00031882564680131396,
"loss": 2.2535,
"step": 72
},
{
"crossentropy": 2.083571434020996,
"epoch": 9.125,
"grad_norm": 0.03222977742552757,
"grad_norm_var": 1.6381791075122573e-06,
"learning_rate": 0.00024471741852423234,
"loss": 2.0836,
"step": 73
},
{
"crossentropy": 2.1447954177856445,
"epoch": 9.25,
"grad_norm": 0.03370295464992523,
"grad_norm_var": 1.585818962655111e-06,
"learning_rate": 0.0001801856965207338,
"loss": 2.1448,
"step": 74
},
{
"crossentropy": 2.3297011852264404,
"epoch": 9.375,
"grad_norm": 0.03342713788151741,
"grad_norm_var": 1.5866984901570397e-06,
"learning_rate": 0.0001253604390908819,
"loss": 2.3297,
"step": 75
},
{
"crossentropy": 2.184936761856079,
"epoch": 9.5,
"grad_norm": 0.03379293903708458,
"grad_norm_var": 1.128480817529492e-06,
"learning_rate": 8.035205700685166e-05,
"loss": 2.1849,
"step": 76
},
{
"crossentropy": 2.244959592819214,
"epoch": 9.625,
"grad_norm": 0.03535815700888634,
"grad_norm_var": 1.2224064569112125e-06,
"learning_rate": 4.52511911603265e-05,
"loss": 2.245,
"step": 77
},
{
"crossentropy": 2.145529270172119,
"epoch": 9.75,
"grad_norm": 0.03338780626654625,
"grad_norm_var": 1.180987359381859e-06,
"learning_rate": 2.012853002380466e-05,
"loss": 2.1455,
"step": 78
},
{
"crossentropy": 2.282687187194824,
"epoch": 9.875,
"grad_norm": 0.03313542157411575,
"grad_norm_var": 1.1489689212500093e-06,
"learning_rate": 5.034667293427053e-06,
"loss": 2.2827,
"step": 79
},
{
"crossentropy": 2.1934750080108643,
"epoch": 10.0,
"grad_norm": 0.03277941048145294,
"grad_norm_var": 1.18509241838338e-06,
"learning_rate": 0.0,
"loss": 2.1935,
"step": 80
}
],
"logging_steps": 1,
"max_steps": 80,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.384645494177792e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}