{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 2000,
  "global_step": 80,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "crossentropy": 2.487246513366699,
      "epoch": 0.125,
      "grad_norm": 0.03208793327212334,
      "learning_rate": 0.001,
      "loss": 2.4872,
      "step": 1
    },
    {
      "crossentropy": 2.379896402359009,
      "epoch": 0.25,
      "grad_norm": 0.03239322826266289,
      "learning_rate": 0.002,
      "loss": 2.3799,
      "step": 2
    },
    {
      "crossentropy": 2.5010085105895996,
      "epoch": 0.375,
      "grad_norm": 0.03320762887597084,
      "learning_rate": 0.003,
      "loss": 2.501,
      "step": 3
    },
    {
      "crossentropy": 2.688127040863037,
      "epoch": 0.5,
      "grad_norm": 0.03224330395460129,
      "learning_rate": 0.004,
      "loss": 2.6881,
      "step": 4
    },
    {
      "crossentropy": 2.4009199142456055,
      "epoch": 0.625,
      "grad_norm": 0.029966533184051514,
      "learning_rate": 0.005,
      "loss": 2.4009,
      "step": 5
    },
    {
      "crossentropy": 2.474385976791382,
      "epoch": 0.75,
      "grad_norm": 0.030283687636256218,
      "learning_rate": 0.006,
      "loss": 2.4744,
      "step": 6
    },
    {
      "crossentropy": 2.5291025638580322,
      "epoch": 0.875,
      "grad_norm": 0.03127186745405197,
      "learning_rate": 0.006999999999999999,
      "loss": 2.5291,
      "step": 7
    },
    {
      "crossentropy": 2.3309171199798584,
      "epoch": 1.0,
      "grad_norm": 0.030661335214972496,
      "learning_rate": 0.008,
      "loss": 2.3309,
      "step": 8
    },
    {
      "crossentropy": 2.4654123783111572,
      "epoch": 1.125,
      "grad_norm": 0.029608242213726044,
      "learning_rate": 0.009000000000000001,
      "loss": 2.4654,
      "step": 9
    },
    {
      "crossentropy": 2.468688488006592,
      "epoch": 1.25,
      "grad_norm": 0.03335599973797798,
      "learning_rate": 0.01,
      "loss": 2.4687,
      "step": 10
    },
    {
      "crossentropy": 2.3773512840270996,
      "epoch": 1.375,
      "grad_norm": 0.031801655888557434,
      "learning_rate": 0.009994965332706574,
      "loss": 2.3774,
      "step": 11
    },
    {
      "crossentropy": 2.5056920051574707,
      "epoch": 1.5,
      "grad_norm": 0.03860826417803764,
      "learning_rate": 0.009979871469976196,
      "loss": 2.5057,
      "step": 12
    },
    {
      "crossentropy": 2.4569733142852783,
      "epoch": 1.625,
      "grad_norm": 0.03568004071712494,
      "learning_rate": 0.009954748808839673,
      "loss": 2.457,
      "step": 13
    },
    {
      "crossentropy": 2.4127414226531982,
      "epoch": 1.75,
      "grad_norm": 0.03557576611638069,
      "learning_rate": 0.009919647942993149,
      "loss": 2.4127,
      "step": 14
    },
    {
      "crossentropy": 2.4089269638061523,
      "epoch": 1.875,
      "grad_norm": 0.0321757011115551,
      "grad_norm_var": 5.793923908080815e-06,
      "learning_rate": 0.009874639560909117,
      "loss": 2.4089,
      "step": 15
    },
    {
      "crossentropy": 2.385178804397583,
      "epoch": 2.0,
      "grad_norm": 0.033555347472429276,
      "grad_norm_var": 5.852987523136916e-06,
      "learning_rate": 0.009819814303479266,
      "loss": 2.3852,
      "step": 16
    },
    {
      "crossentropy": 2.4342517852783203,
      "epoch": 2.125,
      "grad_norm": 0.034557584673166275,
      "grad_norm_var": 6.127065953599033e-06,
      "learning_rate": 0.009755282581475769,
      "loss": 2.4343,
      "step": 17
    },
    {
      "crossentropy": 2.4237754344940186,
      "epoch": 2.25,
      "grad_norm": 0.03371824324131012,
      "grad_norm_var": 6.168768579761655e-06,
      "learning_rate": 0.009681174353198686,
      "loss": 2.4238,
      "step": 18
    },
    {
      "crossentropy": 2.4231510162353516,
      "epoch": 2.375,
      "grad_norm": 0.02907504141330719,
      "grad_norm_var": 6.557002911025779e-06,
      "learning_rate": 0.009597638862757255,
      "loss": 2.4232,
      "step": 19
    },
    {
      "crossentropy": 2.3850157260894775,
      "epoch": 2.5,
      "grad_norm": 0.03278205543756485,
      "grad_norm_var": 6.122522196004565e-06,
      "learning_rate": 0.009504844339512096,
      "loss": 2.385,
      "step": 20
    },
    {
      "crossentropy": 2.3287932872772217,
      "epoch": 2.625,
      "grad_norm": 0.03301858901977539,
      "grad_norm_var": 5.930476905840337e-06,
      "learning_rate": 0.00940297765928369,
      "loss": 2.3288,
      "step": 21
    },
    {
      "crossentropy": 2.3058865070343018,
      "epoch": 2.75,
      "grad_norm": 0.03293571248650551,
      "grad_norm_var": 5.5371734660507875e-06,
      "learning_rate": 0.009292243968009331,
      "loss": 2.3059,
      "step": 22
    },
    {
      "crossentropy": 2.2683637142181396,
      "epoch": 2.875,
      "grad_norm": 0.03541896864771843,
      "grad_norm_var": 5.362674881745298e-06,
      "learning_rate": 0.009172866268606514,
      "loss": 2.2684,
      "step": 23
    },
    {
      "crossentropy": 2.411990165710449,
      "epoch": 3.0,
      "grad_norm": 0.03502194955945015,
      "grad_norm_var": 4.411311488799798e-06,
      "learning_rate": 0.009045084971874737,
      "loss": 2.412,
      "step": 24
    },
    {
      "crossentropy": 2.2515363693237305,
      "epoch": 3.125,
      "grad_norm": 0.03379856050014496,
      "grad_norm_var": 4.1744788750110175e-06,
      "learning_rate": 0.008909157412340149,
      "loss": 2.2515,
      "step": 25
    },
    {
      "crossentropy": 2.3486626148223877,
      "epoch": 3.25,
      "grad_norm": 0.034475792199373245,
      "grad_norm_var": 2.7198637048790144e-06,
      "learning_rate": 0.008765357330018056,
      "loss": 2.3487,
      "step": 26
    },
    {
      "crossentropy": 2.379669666290283,
      "epoch": 3.375,
      "grad_norm": 0.03319563344120979,
      "grad_norm_var": 2.4738877727155267e-06,
      "learning_rate": 0.008613974319136958,
      "loss": 2.3797,
      "step": 27
    },
    {
      "crossentropy": 2.3640384674072266,
      "epoch": 3.5,
      "grad_norm": 0.03426358476281166,
      "grad_norm_var": 2.2389126654258237e-06,
      "learning_rate": 0.008455313244934324,
      "loss": 2.364,
      "step": 28
    },
    {
      "crossentropy": 2.324143409729004,
      "epoch": 3.625,
      "grad_norm": 0.0358288437128067,
      "grad_norm_var": 2.410602035355508e-06,
      "learning_rate": 0.008289693629698563,
      "loss": 2.3241,
      "step": 29
    },
    {
      "crossentropy": 2.159423828125,
      "epoch": 3.75,
      "grad_norm": 0.031966786831617355,
      "grad_norm_var": 2.612506091834166e-06,
      "learning_rate": 0.008117449009293669,
      "loss": 2.1594,
      "step": 30
    },
    {
      "crossentropy": 2.2669661045074463,
      "epoch": 3.875,
      "grad_norm": 0.0339609794318676,
      "grad_norm_var": 2.61687730451927e-06,
      "learning_rate": 0.007938926261462366,
      "loss": 2.267,
      "step": 31
    },
    {
      "crossentropy": 2.482945680618286,
      "epoch": 4.0,
      "grad_norm": 0.03357086703181267,
      "grad_norm_var": 2.5635888162410063e-06,
      "learning_rate": 0.007754484907260513,
      "loss": 2.4829,
      "step": 32
    },
    {
      "crossentropy": 2.1757168769836426,
      "epoch": 4.125,
      "grad_norm": 0.032582979649305344,
      "grad_norm_var": 1.2068945133852464e-06,
      "learning_rate": 0.007564496387029531,
      "loss": 2.1757,
      "step": 33
    },
    {
      "crossentropy": 2.4300379753112793,
      "epoch": 4.25,
      "grad_norm": 0.03353552147746086,
      "grad_norm_var": 1.1362555840309259e-06,
      "learning_rate": 0.007369343312364994,
      "loss": 2.43,
      "step": 34
    },
    {
      "crossentropy": 2.1233766078948975,
      "epoch": 4.375,
      "grad_norm": 0.034830041229724884,
      "grad_norm_var": 1.1319644129707703e-06,
      "learning_rate": 0.007169418695587791,
      "loss": 2.1234,
      "step": 35
    },
    {
      "crossentropy": 2.2619526386260986,
      "epoch": 4.5,
      "grad_norm": 0.033126723021268845,
      "grad_norm_var": 1.1071727437842251e-06,
      "learning_rate": 0.006965125158269619,
      "loss": 2.262,
      "step": 36
    },
    {
      "crossentropy": 2.3065433502197266,
      "epoch": 4.625,
      "grad_norm": 0.0348593033850193,
      "grad_norm_var": 1.0216560744425243e-06,
      "learning_rate": 0.0067568741204067145,
      "loss": 2.3065,
      "step": 37
    },
    {
      "crossentropy": 2.2902674674987793,
      "epoch": 4.75,
      "grad_norm": 0.0360955074429512,
      "grad_norm_var": 1.2434575549111667e-06,
      "learning_rate": 0.006545084971874737,
      "loss": 2.2903,
      "step": 38
    },
    {
      "crossentropy": 2.2587778568267822,
      "epoch": 4.875,
      "grad_norm": 0.03467360511422157,
      "grad_norm_var": 1.2055615432713293e-06,
      "learning_rate": 0.006330184227833375,
      "loss": 2.2588,
      "step": 39
    },
    {
      "crossentropy": 2.3978285789489746,
      "epoch": 5.0,
      "grad_norm": 0.035668086260557175,
      "grad_norm_var": 1.3685657271088221e-06,
      "learning_rate": 0.006112604669781572,
      "loss": 2.3978,
      "step": 40
    },
    {
      "crossentropy": 2.3564870357513428,
      "epoch": 5.125,
      "grad_norm": 0.0373300276696682,
      "grad_norm_var": 2.0190065310690096e-06,
      "learning_rate": 0.005892784473993183,
      "loss": 2.3565,
      "step": 41
    },
    {
      "crossentropy": 2.202275514602661,
      "epoch": 5.25,
      "grad_norm": 0.03768543526530266,
      "grad_norm_var": 2.6563097811015932e-06,
      "learning_rate": 0.0056711663290882775,
      "loss": 2.2023,
      "step": 42
    },
    {
      "crossentropy": 2.192146062850952,
      "epoch": 5.375,
      "grad_norm": 0.03134535253047943,
      "grad_norm_var": 3.228640330968311e-06,
      "learning_rate": 0.005448196544517168,
      "loss": 2.1921,
      "step": 43
    },
    {
      "crossentropy": 2.2519092559814453,
      "epoch": 5.5,
      "grad_norm": 0.032612044364213943,
      "grad_norm_var": 3.044945465432008e-06,
      "learning_rate": 0.005224324151752576,
      "loss": 2.2519,
      "step": 44
    },
    {
      "crossentropy": 2.3610141277313232,
      "epoch": 5.625,
      "grad_norm": 0.03532750904560089,
      "grad_norm_var": 3.073519780112897e-06,
      "learning_rate": 0.005,
      "loss": 2.361,
      "step": 45
    },
    {
      "crossentropy": 2.2574493885040283,
      "epoch": 5.75,
      "grad_norm": 0.03538508713245392,
      "grad_norm_var": 3.047191916038562e-06,
      "learning_rate": 0.004775675848247427,
      "loss": 2.2574,
      "step": 46
    },
    {
      "crossentropy": 2.170254707336426,
      "epoch": 5.875,
      "grad_norm": 0.03475534915924072,
      "grad_norm_var": 2.9654755954788835e-06,
      "learning_rate": 0.004551803455482833,
      "loss": 2.1703,
      "step": 47
    },
    {
      "crossentropy": 2.2004218101501465,
      "epoch": 6.0,
      "grad_norm": 0.0375908762216568,
      "grad_norm_var": 3.107626395358089e-06,
      "learning_rate": 0.004328833670911724,
      "loss": 2.2004,
      "step": 48
    },
    {
      "crossentropy": 2.162071943283081,
      "epoch": 6.125,
      "grad_norm": 0.031826432794332504,
      "grad_norm_var": 4.072774386597716e-06,
      "learning_rate": 0.004107215526006817,
      "loss": 2.1621,
      "step": 49
    },
    {
      "crossentropy": 2.144589424133301,
      "epoch": 6.25,
      "grad_norm": 0.03268062323331833,
      "grad_norm_var": 4.202360731680779e-06,
      "learning_rate": 0.003887395330218428,
      "loss": 2.1446,
      "step": 50
    },
    {
      "crossentropy": 2.202075481414795,
      "epoch": 6.375,
      "grad_norm": 0.03384561836719513,
      "grad_norm_var": 4.294842472950198e-06,
      "learning_rate": 0.003669815772166625,
      "loss": 2.2021,
      "step": 51
    },
    {
      "crossentropy": 2.3169946670532227,
      "epoch": 6.5,
      "grad_norm": 0.03661832585930824,
      "grad_norm_var": 4.387942230220839e-06,
      "learning_rate": 0.003454915028125263,
      "loss": 2.317,
      "step": 52
    },
    {
      "crossentropy": 2.191814422607422,
      "epoch": 6.625,
      "grad_norm": 0.03522748872637749,
      "grad_norm_var": 4.380226970383122e-06,
      "learning_rate": 0.003243125879593286,
      "loss": 2.1918,
      "step": 53
    },
    {
      "crossentropy": 2.196298360824585,
      "epoch": 6.75,
      "grad_norm": 0.03524046018719673,
      "grad_norm_var": 4.3576876356847075e-06,
      "learning_rate": 0.0030348748417303823,
      "loss": 2.1963,
      "step": 54
    },
    {
      "crossentropy": 2.2684872150421143,
      "epoch": 6.875,
      "grad_norm": 0.03640512377023697,
      "grad_norm_var": 4.452811959714381e-06,
      "learning_rate": 0.00283058130441221,
      "loss": 2.2685,
      "step": 55
    },
    {
      "crossentropy": 2.3180689811706543,
      "epoch": 7.0,
      "grad_norm": 0.03479791432619095,
      "grad_norm_var": 4.097831569832335e-06,
      "learning_rate": 0.002630656687635007,
      "loss": 2.3181,
      "step": 56
    },
    {
      "crossentropy": 2.201280117034912,
      "epoch": 7.125,
      "grad_norm": 0.03417252376675606,
      "grad_norm_var": 2.7744502044233416e-06,
      "learning_rate": 0.00243550361297047,
      "loss": 2.2013,
      "step": 57
    },
    {
      "crossentropy": 2.3339715003967285,
      "epoch": 7.25,
      "grad_norm": 0.033313509076833725,
      "grad_norm_var": 2.5884423837709467e-06,
      "learning_rate": 0.002245515092739488,
      "loss": 2.334,
      "step": 58
    },
    {
      "crossentropy": 2.1603586673736572,
      "epoch": 7.375,
      "grad_norm": 0.03365428000688553,
      "grad_norm_var": 2.684439790351147e-06,
      "learning_rate": 0.0020610737385376348,
      "loss": 2.1604,
      "step": 59
    },
    {
      "crossentropy": 2.2999627590179443,
      "epoch": 7.5,
      "grad_norm": 0.034641556441783905,
      "grad_norm_var": 2.6678187146791006e-06,
      "learning_rate": 0.0018825509907063327,
      "loss": 2.3,
      "step": 60
    },
    {
      "crossentropy": 2.1747264862060547,
      "epoch": 7.625,
      "grad_norm": 0.03396380692720413,
      "grad_norm_var": 2.7140570016346197e-06,
      "learning_rate": 0.001710306370301437,
      "loss": 2.1747,
      "step": 61
    },
    {
      "crossentropy": 2.120774269104004,
      "epoch": 7.75,
      "grad_norm": 0.03624221310019493,
      "grad_norm_var": 2.3210148057607623e-06,
      "learning_rate": 0.0015446867550656768,
      "loss": 2.1208,
      "step": 62
    },
    {
      "crossentropy": 2.2385036945343018,
      "epoch": 7.875,
      "grad_norm": 0.034913428127765656,
      "grad_norm_var": 1.732991609125894e-06,
      "learning_rate": 0.0013860256808630427,
      "loss": 2.2385,
      "step": 63
    },
    {
      "crossentropy": 2.1571521759033203,
      "epoch": 8.0,
      "grad_norm": 0.03277244418859482,
      "grad_norm_var": 1.4490052253732697e-06,
      "learning_rate": 0.0012346426699819458,
      "loss": 2.1572,
      "step": 64
    },
    {
      "crossentropy": 2.2847859859466553,
      "epoch": 8.125,
      "grad_norm": 0.033221788704395294,
      "grad_norm_var": 1.512194472239048e-06,
      "learning_rate": 0.001090842587659851,
      "loss": 2.2848,
      "step": 65
    },
    {
      "crossentropy": 2.253898859024048,
      "epoch": 8.25,
      "grad_norm": 0.03283696994185448,
      "grad_norm_var": 1.361639072858939e-06,
      "learning_rate": 0.0009549150281252633,
      "loss": 2.2539,
      "step": 66
    },
    {
      "crossentropy": 2.1402039527893066,
      "epoch": 8.375,
      "grad_norm": 0.03309963271021843,
      "grad_norm_var": 1.3845661239702636e-06,
      "learning_rate": 0.0008271337313934868,
      "loss": 2.1402,
      "step": 67
    },
    {
      "crossentropy": 2.1381993293762207,
      "epoch": 8.5,
      "grad_norm": 0.03431132435798645,
      "grad_norm_var": 1.3068839073976725e-06,
      "learning_rate": 0.0007077560319906695,
      "loss": 2.1382,
      "step": 68
    },
    {
      "crossentropy": 2.1610183715820312,
      "epoch": 8.625,
      "grad_norm": 0.03382508084177971,
      "grad_norm_var": 9.367598844583584e-07,
      "learning_rate": 0.00059702234071631,
      "loss": 2.161,
      "step": 69
    },
    {
      "crossentropy": 2.2194838523864746,
      "epoch": 8.75,
      "grad_norm": 0.03652471676468849,
      "grad_norm_var": 1.3163803696068672e-06,
      "learning_rate": 0.0004951556604879049,
      "loss": 2.2195,
      "step": 70
    },
    {
      "crossentropy": 2.179126262664795,
      "epoch": 8.875,
      "grad_norm": 0.03386425971984863,
      "grad_norm_var": 1.2798076699575718e-06,
      "learning_rate": 0.0004023611372427471,
      "loss": 2.1791,
      "step": 71
    },
    {
      "crossentropy": 2.2535359859466553,
      "epoch": 9.0,
      "grad_norm": 0.032655857503414154,
      "grad_norm_var": 1.3903296123838708e-06,
      "learning_rate": 0.00031882564680131396,
      "loss": 2.2535,
      "step": 72
    },
    {
      "crossentropy": 2.083571434020996,
      "epoch": 9.125,
      "grad_norm": 0.03222977742552757,
      "grad_norm_var": 1.6381791075122573e-06,
      "learning_rate": 0.00024471741852423234,
      "loss": 2.0836,
      "step": 73
    },
    {
      "crossentropy": 2.1447954177856445,
      "epoch": 9.25,
      "grad_norm": 0.03370295464992523,
      "grad_norm_var": 1.585818962655111e-06,
      "learning_rate": 0.0001801856965207338,
      "loss": 2.1448,
      "step": 74
    },
    {
      "crossentropy": 2.3297011852264404,
      "epoch": 9.375,
      "grad_norm": 0.03342713788151741,
      "grad_norm_var": 1.5866984901570397e-06,
      "learning_rate": 0.0001253604390908819,
      "loss": 2.3297,
      "step": 75
    },
    {
      "crossentropy": 2.184936761856079,
      "epoch": 9.5,
      "grad_norm": 0.03379293903708458,
      "grad_norm_var": 1.128480817529492e-06,
      "learning_rate": 8.035205700685166e-05,
      "loss": 2.1849,
      "step": 76
    },
    {
      "crossentropy": 2.244959592819214,
      "epoch": 9.625,
      "grad_norm": 0.03535815700888634,
      "grad_norm_var": 1.2224064569112125e-06,
      "learning_rate": 4.52511911603265e-05,
      "loss": 2.245,
      "step": 77
    },
    {
      "crossentropy": 2.145529270172119,
      "epoch": 9.75,
      "grad_norm": 0.03338780626654625,
      "grad_norm_var": 1.180987359381859e-06,
      "learning_rate": 2.012853002380466e-05,
      "loss": 2.1455,
      "step": 78
    },
    {
      "crossentropy": 2.282687187194824,
      "epoch": 9.875,
      "grad_norm": 0.03313542157411575,
      "grad_norm_var": 1.1489689212500093e-06,
      "learning_rate": 5.034667293427053e-06,
      "loss": 2.2827,
      "step": 79
    },
    {
      "crossentropy": 2.1934750080108643,
      "epoch": 10.0,
      "grad_norm": 0.03277941048145294,
      "grad_norm_var": 1.18509241838338e-06,
      "learning_rate": 0.0,
      "loss": 2.1935,
      "step": 80
    }
  ],
  "logging_steps": 1,
  "max_steps": 80,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.384645494177792e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}