|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 8.957854406130268, |
|
"eval_steps": 17, |
|
"global_step": 585, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01532567049808429, |
|
"grad_norm": 3.475003242492676, |
|
"learning_rate": 2e-05, |
|
"loss": 1.9507, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.01532567049808429, |
|
"eval_loss": 1.9943002462387085, |
|
"eval_runtime": 10.4694, |
|
"eval_samples_per_second": 9.552, |
|
"eval_steps_per_second": 4.776, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.03065134099616858, |
|
"grad_norm": 3.6678824424743652, |
|
"learning_rate": 4e-05, |
|
"loss": 2.0639, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.04597701149425287, |
|
"grad_norm": 3.1201210021972656, |
|
"learning_rate": 6e-05, |
|
"loss": 1.8136, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.06130268199233716, |
|
"grad_norm": 3.606743574142456, |
|
"learning_rate": 8e-05, |
|
"loss": 1.9302, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.07662835249042145, |
|
"grad_norm": 3.096000909805298, |
|
"learning_rate": 0.0001, |
|
"loss": 1.9869, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.09195402298850575, |
|
"grad_norm": 2.841855049133301, |
|
"learning_rate": 0.00012, |
|
"loss": 1.7556, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.10727969348659004, |
|
"grad_norm": 2.7530441284179688, |
|
"learning_rate": 0.00014, |
|
"loss": 1.8622, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.12260536398467432, |
|
"grad_norm": 2.9382359981536865, |
|
"learning_rate": 0.00016, |
|
"loss": 1.7264, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.13793103448275862, |
|
"grad_norm": 2.9906227588653564, |
|
"learning_rate": 0.00018, |
|
"loss": 1.8225, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.1532567049808429, |
|
"grad_norm": 2.951603889465332, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8434, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.1685823754789272, |
|
"grad_norm": 2.783867120742798, |
|
"learning_rate": 0.00019999916768504724, |
|
"loss": 1.6941, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.1839080459770115, |
|
"grad_norm": 2.7186167240142822, |
|
"learning_rate": 0.00019999667075404383, |
|
"loss": 1.8163, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.19923371647509577, |
|
"grad_norm": 2.33475661277771, |
|
"learning_rate": 0.00019999250924855456, |
|
"loss": 1.6088, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.21455938697318008, |
|
"grad_norm": 2.289853811264038, |
|
"learning_rate": 0.00019998668323785296, |
|
"loss": 1.6944, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.22988505747126436, |
|
"grad_norm": 2.4338462352752686, |
|
"learning_rate": 0.00019997919281892067, |
|
"loss": 1.7205, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.24521072796934865, |
|
"grad_norm": 2.6904211044311523, |
|
"learning_rate": 0.00019997003811644533, |
|
"loss": 1.8309, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.26053639846743293, |
|
"grad_norm": 2.0868079662323, |
|
"learning_rate": 0.00019995921928281894, |
|
"loss": 1.714, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.26053639846743293, |
|
"eval_loss": 1.71925687789917, |
|
"eval_runtime": 10.4582, |
|
"eval_samples_per_second": 9.562, |
|
"eval_steps_per_second": 4.781, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.27586206896551724, |
|
"grad_norm": 2.312363862991333, |
|
"learning_rate": 0.00019994673649813497, |
|
"loss": 1.7437, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.29118773946360155, |
|
"grad_norm": 2.1838905811309814, |
|
"learning_rate": 0.00019993258997018566, |
|
"loss": 1.6337, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.3065134099616858, |
|
"grad_norm": 2.2951676845550537, |
|
"learning_rate": 0.0001999167799344583, |
|
"loss": 1.6456, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.3218390804597701, |
|
"grad_norm": 2.147050380706787, |
|
"learning_rate": 0.00019989930665413147, |
|
"loss": 1.5753, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.3371647509578544, |
|
"grad_norm": 2.214049816131592, |
|
"learning_rate": 0.00019988017042007065, |
|
"loss": 1.8861, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.3524904214559387, |
|
"grad_norm": 2.1761178970336914, |
|
"learning_rate": 0.00019985937155082327, |
|
"loss": 1.5181, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.367816091954023, |
|
"grad_norm": 2.7011399269104004, |
|
"learning_rate": 0.00019983691039261357, |
|
"loss": 1.6559, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.3831417624521073, |
|
"grad_norm": 2.0692250728607178, |
|
"learning_rate": 0.0001998127873193367, |
|
"loss": 1.6602, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.39846743295019155, |
|
"grad_norm": 2.190605640411377, |
|
"learning_rate": 0.00019978700273255254, |
|
"loss": 1.6678, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.41379310344827586, |
|
"grad_norm": 2.303030252456665, |
|
"learning_rate": 0.000199759557061479, |
|
"loss": 1.7287, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.42911877394636017, |
|
"grad_norm": 2.3805620670318604, |
|
"learning_rate": 0.000199730450762985, |
|
"loss": 1.6801, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 1.9173905849456787, |
|
"learning_rate": 0.00019969968432158265, |
|
"loss": 1.6536, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.45977011494252873, |
|
"grad_norm": 1.9623961448669434, |
|
"learning_rate": 0.00019966725824941932, |
|
"loss": 1.5311, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.47509578544061304, |
|
"grad_norm": 2.2046408653259277, |
|
"learning_rate": 0.00019963317308626914, |
|
"loss": 1.7119, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.4904214559386973, |
|
"grad_norm": 2.034040927886963, |
|
"learning_rate": 0.00019959742939952392, |
|
"loss": 1.6249, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.5057471264367817, |
|
"grad_norm": 2.274533271789551, |
|
"learning_rate": 0.00019956002778418372, |
|
"loss": 1.6809, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.5210727969348659, |
|
"grad_norm": 1.9758435487747192, |
|
"learning_rate": 0.0001995209688628471, |
|
"loss": 1.5507, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.5210727969348659, |
|
"eval_loss": 1.7039636373519897, |
|
"eval_runtime": 10.4847, |
|
"eval_samples_per_second": 9.538, |
|
"eval_steps_per_second": 4.769, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.5363984674329502, |
|
"grad_norm": 1.908996820449829, |
|
"learning_rate": 0.00019948025328570042, |
|
"loss": 1.668, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.5517241379310345, |
|
"grad_norm": 2.0340089797973633, |
|
"learning_rate": 0.00019943788173050744, |
|
"loss": 1.6788, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.5670498084291188, |
|
"grad_norm": 2.1147003173828125, |
|
"learning_rate": 0.0001993938549025977, |
|
"loss": 1.5346, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.5823754789272031, |
|
"grad_norm": 2.2234580516815186, |
|
"learning_rate": 0.00019934817353485501, |
|
"loss": 1.6118, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.5977011494252874, |
|
"grad_norm": 1.8898108005523682, |
|
"learning_rate": 0.00019930083838770504, |
|
"loss": 1.542, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.6130268199233716, |
|
"grad_norm": 1.947200894355774, |
|
"learning_rate": 0.00019925185024910277, |
|
"loss": 1.6701, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.6283524904214559, |
|
"grad_norm": 1.9336851835250854, |
|
"learning_rate": 0.00019920120993451948, |
|
"loss": 1.6159, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.6436781609195402, |
|
"grad_norm": 2.044646978378296, |
|
"learning_rate": 0.00019914891828692888, |
|
"loss": 1.6761, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.6590038314176245, |
|
"grad_norm": 1.9677635431289673, |
|
"learning_rate": 0.00019909497617679348, |
|
"loss": 1.7505, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.6743295019157088, |
|
"grad_norm": 1.887392282485962, |
|
"learning_rate": 0.00019903938450204972, |
|
"loss": 1.6804, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.6896551724137931, |
|
"grad_norm": 2.1503148078918457, |
|
"learning_rate": 0.0001989821441880933, |
|
"loss": 1.5835, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.7049808429118773, |
|
"grad_norm": 1.8051438331604004, |
|
"learning_rate": 0.00019892325618776351, |
|
"loss": 1.721, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.7203065134099617, |
|
"grad_norm": 1.8534125089645386, |
|
"learning_rate": 0.0001988627214813277, |
|
"loss": 1.6925, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.735632183908046, |
|
"grad_norm": 1.6843996047973633, |
|
"learning_rate": 0.00019880054107646467, |
|
"loss": 1.7291, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.7509578544061303, |
|
"grad_norm": 2.0053601264953613, |
|
"learning_rate": 0.000198736716008248, |
|
"loss": 1.6344, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.7662835249042146, |
|
"grad_norm": 1.9978563785552979, |
|
"learning_rate": 0.0001986712473391289, |
|
"loss": 1.5687, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.7816091954022989, |
|
"grad_norm": 1.6498862504959106, |
|
"learning_rate": 0.0001986041361589184, |
|
"loss": 1.6354, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.7816091954022989, |
|
"eval_loss": 1.6665664911270142, |
|
"eval_runtime": 10.4646, |
|
"eval_samples_per_second": 9.556, |
|
"eval_steps_per_second": 4.778, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.7969348659003831, |
|
"grad_norm": 2.0754377841949463, |
|
"learning_rate": 0.00019853538358476932, |
|
"loss": 1.7128, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.8122605363984674, |
|
"grad_norm": 1.8503700494766235, |
|
"learning_rate": 0.0001984649907611575, |
|
"loss": 1.6028, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.8275862068965517, |
|
"grad_norm": 1.9877614974975586, |
|
"learning_rate": 0.00019839295885986296, |
|
"loss": 1.7578, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.842911877394636, |
|
"grad_norm": 1.9744536876678467, |
|
"learning_rate": 0.0001983192890799503, |
|
"loss": 1.6639, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.8582375478927203, |
|
"grad_norm": 1.9516663551330566, |
|
"learning_rate": 0.00019824398264774867, |
|
"loss": 1.6724, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.8735632183908046, |
|
"grad_norm": 1.8794466257095337, |
|
"learning_rate": 0.0001981670408168315, |
|
"loss": 1.5008, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 1.7897112369537354, |
|
"learning_rate": 0.0001980884648679955, |
|
"loss": 1.5942, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.9042145593869731, |
|
"grad_norm": 1.776986002922058, |
|
"learning_rate": 0.00019800825610923934, |
|
"loss": 1.5893, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.9195402298850575, |
|
"grad_norm": 1.9505722522735596, |
|
"learning_rate": 0.00019792641587574212, |
|
"loss": 1.6273, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.9348659003831418, |
|
"grad_norm": 1.9335532188415527, |
|
"learning_rate": 0.00019784294552984078, |
|
"loss": 1.5953, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.9501915708812261, |
|
"grad_norm": 2.057013750076294, |
|
"learning_rate": 0.0001977578464610077, |
|
"loss": 1.6479, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.9655172413793104, |
|
"grad_norm": 1.838173508644104, |
|
"learning_rate": 0.00019767112008582736, |
|
"loss": 1.6264, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.9808429118773946, |
|
"grad_norm": 1.8121559619903564, |
|
"learning_rate": 0.000197582767847973, |
|
"loss": 1.5673, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.9961685823754789, |
|
"grad_norm": 1.8894027471542358, |
|
"learning_rate": 0.00019749279121818235, |
|
"loss": 1.6727, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 1.0076628352490422, |
|
"grad_norm": 3.277520179748535, |
|
"learning_rate": 0.00019740119169423337, |
|
"loss": 2.0471, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 1.0229885057471264, |
|
"grad_norm": 1.553820013999939, |
|
"learning_rate": 0.00019730797080091904, |
|
"loss": 0.9425, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 1.0383141762452108, |
|
"grad_norm": 1.5284228324890137, |
|
"learning_rate": 0.00019721313009002226, |
|
"loss": 0.9188, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 1.0383141762452108, |
|
"eval_loss": 1.6558603048324585, |
|
"eval_runtime": 10.461, |
|
"eval_samples_per_second": 9.559, |
|
"eval_steps_per_second": 4.78, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 1.053639846743295, |
|
"grad_norm": 1.4431841373443604, |
|
"learning_rate": 0.0001971166711402899, |
|
"loss": 0.8091, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 1.0689655172413792, |
|
"grad_norm": 1.6087971925735474, |
|
"learning_rate": 0.00019701859555740648, |
|
"loss": 0.9413, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.0842911877394636, |
|
"grad_norm": 1.6617636680603027, |
|
"learning_rate": 0.0001969189049739674, |
|
"loss": 0.895, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 1.0996168582375478, |
|
"grad_norm": 1.606227159500122, |
|
"learning_rate": 0.00019681760104945203, |
|
"loss": 0.8442, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 1.1149425287356323, |
|
"grad_norm": 1.4187818765640259, |
|
"learning_rate": 0.00019671468547019573, |
|
"loss": 0.8078, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 1.1302681992337165, |
|
"grad_norm": 1.5401397943496704, |
|
"learning_rate": 0.00019661015994936203, |
|
"loss": 0.9093, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 1.1455938697318007, |
|
"grad_norm": 1.633941888809204, |
|
"learning_rate": 0.000196504026226914, |
|
"loss": 0.8941, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.160919540229885, |
|
"grad_norm": 1.551140308380127, |
|
"learning_rate": 0.00019639628606958533, |
|
"loss": 0.8318, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 1.1762452107279693, |
|
"grad_norm": 1.920763373374939, |
|
"learning_rate": 0.00019628694127085092, |
|
"loss": 0.8781, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 1.1915708812260537, |
|
"grad_norm": 1.802857518196106, |
|
"learning_rate": 0.00019617599365089693, |
|
"loss": 0.9417, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 1.206896551724138, |
|
"grad_norm": 1.5704469680786133, |
|
"learning_rate": 0.0001960634450565907, |
|
"loss": 0.8462, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 1.2222222222222223, |
|
"grad_norm": 1.67445969581604, |
|
"learning_rate": 0.00019594929736144976, |
|
"loss": 0.9293, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.2375478927203065, |
|
"grad_norm": 1.6255979537963867, |
|
"learning_rate": 0.00019583355246561074, |
|
"loss": 0.8358, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 1.2528735632183907, |
|
"grad_norm": 1.6431758403778076, |
|
"learning_rate": 0.00019571621229579782, |
|
"loss": 0.9362, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 1.2681992337164751, |
|
"grad_norm": 1.6321423053741455, |
|
"learning_rate": 0.00019559727880529059, |
|
"loss": 0.9574, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 1.2835249042145593, |
|
"grad_norm": 1.4820754528045654, |
|
"learning_rate": 0.00019547675397389141, |
|
"loss": 0.7697, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.2988505747126438, |
|
"grad_norm": 1.6704702377319336, |
|
"learning_rate": 0.00019535463980789277, |
|
"loss": 0.8897, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.2988505747126438, |
|
"eval_loss": 1.6953216791152954, |
|
"eval_runtime": 10.5357, |
|
"eval_samples_per_second": 9.492, |
|
"eval_steps_per_second": 4.746, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.314176245210728, |
|
"grad_norm": 1.5606012344360352, |
|
"learning_rate": 0.00019523093834004356, |
|
"loss": 0.8687, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 1.3295019157088124, |
|
"grad_norm": 1.69247567653656, |
|
"learning_rate": 0.00019510565162951537, |
|
"loss": 0.962, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 1.3448275862068966, |
|
"grad_norm": 1.77336847782135, |
|
"learning_rate": 0.00019497878176186827, |
|
"loss": 0.8073, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 1.3601532567049808, |
|
"grad_norm": 1.6945431232452393, |
|
"learning_rate": 0.00019485033084901606, |
|
"loss": 0.9388, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 1.3754789272030652, |
|
"grad_norm": 1.8969769477844238, |
|
"learning_rate": 0.000194720301029191, |
|
"loss": 0.9693, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.3908045977011494, |
|
"grad_norm": 1.6189223527908325, |
|
"learning_rate": 0.0001945886944669084, |
|
"loss": 0.8052, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 1.4061302681992336, |
|
"grad_norm": 1.652786135673523, |
|
"learning_rate": 0.0001944555133529304, |
|
"loss": 0.9079, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 1.421455938697318, |
|
"grad_norm": 1.5484676361083984, |
|
"learning_rate": 0.00019432075990422968, |
|
"loss": 0.8395, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 1.4367816091954024, |
|
"grad_norm": 1.625877022743225, |
|
"learning_rate": 0.00019418443636395248, |
|
"loss": 0.876, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 1.4521072796934866, |
|
"grad_norm": 1.922146201133728, |
|
"learning_rate": 0.00019404654500138117, |
|
"loss": 0.8344, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.4674329501915708, |
|
"grad_norm": 1.6981974840164185, |
|
"learning_rate": 0.0001939070881118966, |
|
"loss": 0.8232, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.4827586206896552, |
|
"grad_norm": 1.7996752262115479, |
|
"learning_rate": 0.0001937660680169399, |
|
"loss": 0.9207, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 1.4980842911877394, |
|
"grad_norm": 1.784002423286438, |
|
"learning_rate": 0.00019362348706397373, |
|
"loss": 0.8402, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.5134099616858236, |
|
"grad_norm": 1.436486005783081, |
|
"learning_rate": 0.00019347934762644326, |
|
"loss": 0.7129, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.528735632183908, |
|
"grad_norm": 1.5737037658691406, |
|
"learning_rate": 0.0001933336521037367, |
|
"loss": 0.9158, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.5440613026819925, |
|
"grad_norm": 1.516647219657898, |
|
"learning_rate": 0.00019318640292114524, |
|
"loss": 0.8451, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 1.5593869731800765, |
|
"grad_norm": 1.6449085474014282, |
|
"learning_rate": 0.00019303760252982287, |
|
"loss": 0.9014, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.5593869731800765, |
|
"eval_loss": 1.7118545770645142, |
|
"eval_runtime": 10.4529, |
|
"eval_samples_per_second": 9.567, |
|
"eval_steps_per_second": 4.783, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.5747126436781609, |
|
"grad_norm": 1.578679084777832, |
|
"learning_rate": 0.00019288725340674536, |
|
"loss": 0.8788, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 1.5900383141762453, |
|
"grad_norm": 1.635235071182251, |
|
"learning_rate": 0.00019273535805466917, |
|
"loss": 0.8992, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.6053639846743295, |
|
"grad_norm": 1.637152075767517, |
|
"learning_rate": 0.0001925819190020898, |
|
"loss": 0.8922, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.6206896551724137, |
|
"grad_norm": 1.5802862644195557, |
|
"learning_rate": 0.0001924269388031996, |
|
"loss": 0.822, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.6360153256704981, |
|
"grad_norm": 1.5077544450759888, |
|
"learning_rate": 0.00019227042003784527, |
|
"loss": 0.7743, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.6513409961685823, |
|
"grad_norm": 1.7062519788742065, |
|
"learning_rate": 0.000192112365311485, |
|
"loss": 0.8473, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 1.676834225654602, |
|
"learning_rate": 0.0001919527772551451, |
|
"loss": 0.96, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.681992337164751, |
|
"grad_norm": 1.775424838066101, |
|
"learning_rate": 0.00019179165852537596, |
|
"loss": 0.8855, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.6973180076628354, |
|
"grad_norm": 1.5298705101013184, |
|
"learning_rate": 0.0001916290118042082, |
|
"loss": 0.7232, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.7126436781609196, |
|
"grad_norm": 1.5757646560668945, |
|
"learning_rate": 0.0001914648397991078, |
|
"loss": 0.9097, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.7279693486590038, |
|
"grad_norm": 1.5786842107772827, |
|
"learning_rate": 0.00019129914524293102, |
|
"loss": 0.8836, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.7432950191570882, |
|
"grad_norm": 1.8097132444381714, |
|
"learning_rate": 0.00019113193089387903, |
|
"loss": 0.938, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.7586206896551724, |
|
"grad_norm": 1.771764874458313, |
|
"learning_rate": 0.00019096319953545185, |
|
"loss": 0.8042, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.7739463601532566, |
|
"grad_norm": 1.8478142023086548, |
|
"learning_rate": 0.00019079295397640215, |
|
"loss": 0.9323, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.789272030651341, |
|
"grad_norm": 1.5792856216430664, |
|
"learning_rate": 0.00019062119705068843, |
|
"loss": 0.8917, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.8045977011494254, |
|
"grad_norm": 1.6793948411941528, |
|
"learning_rate": 0.00019044793161742782, |
|
"loss": 0.8495, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.8199233716475096, |
|
"grad_norm": 1.6884868144989014, |
|
"learning_rate": 0.00019027316056084858, |
|
"loss": 0.8517, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.8199233716475096, |
|
"eval_loss": 1.7208638191223145, |
|
"eval_runtime": 10.4697, |
|
"eval_samples_per_second": 9.551, |
|
"eval_steps_per_second": 4.776, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.8352490421455938, |
|
"grad_norm": 1.740159511566162, |
|
"learning_rate": 0.0001900968867902419, |
|
"loss": 0.96, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.8505747126436782, |
|
"grad_norm": 1.6979262828826904, |
|
"learning_rate": 0.0001899191132399138, |
|
"loss": 0.8892, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.8659003831417624, |
|
"grad_norm": 1.7245821952819824, |
|
"learning_rate": 0.00018973984286913584, |
|
"loss": 0.8417, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.8812260536398466, |
|
"grad_norm": 1.8138068914413452, |
|
"learning_rate": 0.0001895590786620963, |
|
"loss": 0.9722, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.896551724137931, |
|
"grad_norm": 1.4977965354919434, |
|
"learning_rate": 0.00018937682362785022, |
|
"loss": 0.8512, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.9118773946360155, |
|
"grad_norm": 1.5849545001983643, |
|
"learning_rate": 0.0001891930808002694, |
|
"loss": 0.7628, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.9272030651340997, |
|
"grad_norm": 1.8099451065063477, |
|
"learning_rate": 0.00018900785323799189, |
|
"loss": 0.9171, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.9425287356321839, |
|
"grad_norm": 1.5819072723388672, |
|
"learning_rate": 0.00018882114402437106, |
|
"loss": 0.7413, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.9578544061302683, |
|
"grad_norm": 1.8191732168197632, |
|
"learning_rate": 0.00018863295626742437, |
|
"loss": 1.0208, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.9731800766283525, |
|
"grad_norm": 1.7665985822677612, |
|
"learning_rate": 0.00018844329309978145, |
|
"loss": 0.8426, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.9885057471264367, |
|
"grad_norm": 1.9029268026351929, |
|
"learning_rate": 0.00018825215767863214, |
|
"loss": 0.983, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.007662835249042, |
|
"grad_norm": 1.5204992294311523, |
|
"learning_rate": 0.0001880595531856738, |
|
"loss": 0.6558, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 2.0229885057471266, |
|
"grad_norm": 1.225983738899231, |
|
"learning_rate": 0.00018786548282705848, |
|
"loss": 0.3984, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 2.0383141762452106, |
|
"grad_norm": 1.2345383167266846, |
|
"learning_rate": 0.0001876699498333393, |
|
"loss": 0.4303, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 2.053639846743295, |
|
"grad_norm": 1.2123405933380127, |
|
"learning_rate": 0.00018747295745941703, |
|
"loss": 0.4609, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 2.0689655172413794, |
|
"grad_norm": 1.2038960456848145, |
|
"learning_rate": 0.00018727450898448563, |
|
"loss": 0.3909, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 2.0842911877394634, |
|
"grad_norm": 1.2191224098205566, |
|
"learning_rate": 0.00018707460771197774, |
|
"loss": 0.4448, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 2.0842911877394634, |
|
"eval_loss": 1.796938419342041, |
|
"eval_runtime": 10.4571, |
|
"eval_samples_per_second": 9.563, |
|
"eval_steps_per_second": 4.781, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 2.099616858237548, |
|
"grad_norm": 1.3134615421295166, |
|
"learning_rate": 0.00018687325696950972, |
|
"loss": 0.5176, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 2.1149425287356323, |
|
"grad_norm": 1.39946448802948, |
|
"learning_rate": 0.00018667046010882626, |
|
"loss": 0.4207, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 2.1302681992337167, |
|
"grad_norm": 1.20857834815979, |
|
"learning_rate": 0.00018646622050574454, |
|
"loss": 0.3165, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 2.1455938697318007, |
|
"grad_norm": 1.4676852226257324, |
|
"learning_rate": 0.00018626054156009806, |
|
"loss": 0.4934, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.160919540229885, |
|
"grad_norm": 1.2490851879119873, |
|
"learning_rate": 0.0001860534266956801, |
|
"loss": 0.4454, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 2.1762452107279695, |
|
"grad_norm": 1.5670422315597534, |
|
"learning_rate": 0.00018584487936018661, |
|
"loss": 0.4259, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 2.1915708812260535, |
|
"grad_norm": 1.5839508771896362, |
|
"learning_rate": 0.0001856349030251589, |
|
"loss": 0.4459, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 2.206896551724138, |
|
"grad_norm": 1.4877279996871948, |
|
"learning_rate": 0.00018542350118592584, |
|
"loss": 0.4585, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 2.2222222222222223, |
|
"grad_norm": 1.292151927947998, |
|
"learning_rate": 0.00018521067736154568, |
|
"loss": 0.3635, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 2.2375478927203067, |
|
"grad_norm": 1.3014862537384033, |
|
"learning_rate": 0.00018499643509474738, |
|
"loss": 0.4268, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 2.2528735632183907, |
|
"grad_norm": 1.3445168733596802, |
|
"learning_rate": 0.00018478077795187187, |
|
"loss": 0.4178, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 2.268199233716475, |
|
"grad_norm": 1.2323206663131714, |
|
"learning_rate": 0.0001845637095228124, |
|
"loss": 0.3389, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 2.2835249042145596, |
|
"grad_norm": 1.321321725845337, |
|
"learning_rate": 0.000184345233420955, |
|
"loss": 0.394, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 2.2988505747126435, |
|
"grad_norm": 1.3308717012405396, |
|
"learning_rate": 0.00018412535328311814, |
|
"loss": 0.3768, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.314176245210728, |
|
"grad_norm": 1.4169113636016846, |
|
"learning_rate": 0.00018390407276949234, |
|
"loss": 0.4106, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 2.3295019157088124, |
|
"grad_norm": 1.4107593297958374, |
|
"learning_rate": 0.00018368139556357928, |
|
"loss": 0.3955, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 2.344827586206897, |
|
"grad_norm": 1.2308950424194336, |
|
"learning_rate": 0.00018345732537213027, |
|
"loss": 0.4053, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 2.344827586206897, |
|
"eval_loss": 1.8346749544143677, |
|
"eval_runtime": 10.5405, |
|
"eval_samples_per_second": 9.487, |
|
"eval_steps_per_second": 4.744, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 2.3601532567049808, |
|
"grad_norm": 1.2049033641815186, |
|
"learning_rate": 0.0001832318659250847, |
|
"loss": 0.3675, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 2.375478927203065, |
|
"grad_norm": 1.35014009475708, |
|
"learning_rate": 0.00018300502097550806, |
|
"loss": 0.4565, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 2.3908045977011496, |
|
"grad_norm": 1.2926514148712158, |
|
"learning_rate": 0.00018277679429952912, |
|
"loss": 0.3887, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 2.4061302681992336, |
|
"grad_norm": 1.1395353078842163, |
|
"learning_rate": 0.0001825471896962774, |
|
"loss": 0.3469, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 2.421455938697318, |
|
"grad_norm": 1.2925468683242798, |
|
"learning_rate": 0.00018231621098781982, |
|
"loss": 0.3811, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 2.4367816091954024, |
|
"grad_norm": 1.2556133270263672, |
|
"learning_rate": 0.00018208386201909698, |
|
"loss": 0.3961, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 2.4521072796934864, |
|
"grad_norm": 3.042213201522827, |
|
"learning_rate": 0.00018185014665785936, |
|
"loss": 0.4634, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.467432950191571, |
|
"grad_norm": 7.5744099617004395, |
|
"learning_rate": 0.00018161506879460273, |
|
"loss": 0.5113, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 2.4827586206896552, |
|
"grad_norm": 1.288672685623169, |
|
"learning_rate": 0.00018137863234250347, |
|
"loss": 0.3684, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 2.4980842911877392, |
|
"grad_norm": 1.3630754947662354, |
|
"learning_rate": 0.00018114084123735356, |
|
"loss": 0.4277, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 2.5134099616858236, |
|
"grad_norm": 1.344976544380188, |
|
"learning_rate": 0.00018090169943749476, |
|
"loss": 0.3682, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 2.528735632183908, |
|
"grad_norm": 1.5814900398254395, |
|
"learning_rate": 0.000180661210923753, |
|
"loss": 0.4435, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 2.5440613026819925, |
|
"grad_norm": 1.3256701231002808, |
|
"learning_rate": 0.00018041937969937206, |
|
"loss": 0.3651, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 2.5593869731800765, |
|
"grad_norm": 1.1954660415649414, |
|
"learning_rate": 0.00018017620978994677, |
|
"loss": 0.3662, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 2.574712643678161, |
|
"grad_norm": 1.2444689273834229, |
|
"learning_rate": 0.00017993170524335615, |
|
"loss": 0.4181, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 2.5900383141762453, |
|
"grad_norm": 1.3350296020507812, |
|
"learning_rate": 0.00017968587012969604, |
|
"loss": 0.4437, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 2.6053639846743293, |
|
"grad_norm": 1.1780810356140137, |
|
"learning_rate": 0.00017943870854121124, |
|
"loss": 0.3723, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.6053639846743293, |
|
"eval_loss": 1.8776559829711914, |
|
"eval_runtime": 10.4883, |
|
"eval_samples_per_second": 9.534, |
|
"eval_steps_per_second": 4.767, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.6206896551724137, |
|
"grad_norm": 1.3304461240768433, |
|
"learning_rate": 0.00017919022459222752, |
|
"loss": 0.4096, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 2.636015325670498, |
|
"grad_norm": 1.429721474647522, |
|
"learning_rate": 0.00017894042241908294, |
|
"loss": 0.4662, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 2.6513409961685825, |
|
"grad_norm": 1.160591959953308, |
|
"learning_rate": 0.0001786893061800592, |
|
"loss": 0.3493, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 1.2618906497955322, |
|
"learning_rate": 0.00017843688005531226, |
|
"loss": 0.3734, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 2.681992337164751, |
|
"grad_norm": 1.3741453886032104, |
|
"learning_rate": 0.000178183148246803, |
|
"loss": 0.4422, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 2.6973180076628354, |
|
"grad_norm": 1.336128830909729, |
|
"learning_rate": 0.0001779281149782269, |
|
"loss": 0.4071, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 2.7126436781609193, |
|
"grad_norm": 1.5618481636047363, |
|
"learning_rate": 0.000177671784494944, |
|
"loss": 0.3985, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 2.7279693486590038, |
|
"grad_norm": 1.4244683980941772, |
|
"learning_rate": 0.00017741416106390826, |
|
"loss": 0.4876, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 2.743295019157088, |
|
"grad_norm": 1.4463664293289185, |
|
"learning_rate": 0.0001771552489735963, |
|
"loss": 0.4698, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 2.7586206896551726, |
|
"grad_norm": 1.3060929775238037, |
|
"learning_rate": 0.0001768950525339362, |
|
"loss": 0.376, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.7739463601532566, |
|
"grad_norm": 1.5133682489395142, |
|
"learning_rate": 0.00017663357607623577, |
|
"loss": 0.4139, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 2.789272030651341, |
|
"grad_norm": 1.4014631509780884, |
|
"learning_rate": 0.00017637082395311024, |
|
"loss": 0.4094, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 2.8045977011494254, |
|
"grad_norm": 1.4687765836715698, |
|
"learning_rate": 0.00017610680053841007, |
|
"loss": 0.4123, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 2.8199233716475094, |
|
"grad_norm": 1.336650013923645, |
|
"learning_rate": 0.000175841510227148, |
|
"loss": 0.3737, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 2.835249042145594, |
|
"grad_norm": 1.5005886554718018, |
|
"learning_rate": 0.00017557495743542585, |
|
"loss": 0.4835, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 2.8505747126436782, |
|
"grad_norm": 1.3977274894714355, |
|
"learning_rate": 0.00017530714660036112, |
|
"loss": 0.4989, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 2.8659003831417627, |
|
"grad_norm": 1.1647838354110718, |
|
"learning_rate": 0.00017503808218001304, |
|
"loss": 0.339, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 2.8659003831417627, |
|
"eval_loss": 1.875050663948059, |
|
"eval_runtime": 10.5813, |
|
"eval_samples_per_second": 9.451, |
|
"eval_steps_per_second": 4.725, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 2.8812260536398466, |
|
"grad_norm": 1.4600085020065308, |
|
"learning_rate": 0.00017476776865330847, |
|
"loss": 0.4327, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 2.896551724137931, |
|
"grad_norm": 1.3009713888168335, |
|
"learning_rate": 0.00017449621051996713, |
|
"loss": 0.3969, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 2.9118773946360155, |
|
"grad_norm": 1.5662423372268677, |
|
"learning_rate": 0.000174223412300427, |
|
"loss": 0.4866, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.9272030651340994, |
|
"grad_norm": 1.1687737703323364, |
|
"learning_rate": 0.00017394937853576877, |
|
"loss": 0.3411, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 2.942528735632184, |
|
"grad_norm": 1.3152905702590942, |
|
"learning_rate": 0.0001736741137876405, |
|
"loss": 0.4294, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 2.9578544061302683, |
|
"grad_norm": 1.5262017250061035, |
|
"learning_rate": 0.00017339762263818146, |
|
"loss": 0.433, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 2.9731800766283527, |
|
"grad_norm": 1.2779839038848877, |
|
"learning_rate": 0.000173119909689946, |
|
"loss": 0.4334, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 2.9885057471264367, |
|
"grad_norm": 1.2895079851150513, |
|
"learning_rate": 0.00017284097956582692, |
|
"loss": 0.4393, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 3.003831417624521, |
|
"grad_norm": 5.897226810455322, |
|
"learning_rate": 0.0001725608369089785, |
|
"loss": 0.5205, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 3.0191570881226055, |
|
"grad_norm": 1.2967376708984375, |
|
"learning_rate": 0.00017227948638273916, |
|
"loss": 0.202, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 3.0344827586206895, |
|
"grad_norm": 1.050823450088501, |
|
"learning_rate": 0.00017199693267055393, |
|
"loss": 0.2219, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 3.049808429118774, |
|
"grad_norm": 0.8004248738288879, |
|
"learning_rate": 0.00017171318047589637, |
|
"loss": 0.1918, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 3.0651340996168583, |
|
"grad_norm": 0.9603090286254883, |
|
"learning_rate": 0.00017142823452219038, |
|
"loss": 0.1627, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.0804597701149423, |
|
"grad_norm": 1.0117729902267456, |
|
"learning_rate": 0.00017114209955273153, |
|
"loss": 0.1734, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 3.0957854406130267, |
|
"grad_norm": 1.150023102760315, |
|
"learning_rate": 0.00017085478033060806, |
|
"loss": 0.2105, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 3.111111111111111, |
|
"grad_norm": 1.2649832963943481, |
|
"learning_rate": 0.00017056628163862172, |
|
"loss": 0.1996, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 3.1264367816091956, |
|
"grad_norm": 1.1088045835494995, |
|
"learning_rate": 0.00017027660827920798, |
|
"loss": 0.1614, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 3.1264367816091956, |
|
"eval_loss": 2.065758466720581, |
|
"eval_runtime": 10.4748, |
|
"eval_samples_per_second": 9.547, |
|
"eval_steps_per_second": 4.773, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 3.1417624521072796, |
|
"grad_norm": 1.1436564922332764, |
|
"learning_rate": 0.00016998576507435618, |
|
"loss": 0.1886, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 3.157088122605364, |
|
"grad_norm": 1.2624493837356567, |
|
"learning_rate": 0.00016969375686552937, |
|
"loss": 0.1792, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 3.1724137931034484, |
|
"grad_norm": 1.0960315465927124, |
|
"learning_rate": 0.00016940058851358343, |
|
"loss": 0.196, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 3.1877394636015324, |
|
"grad_norm": 1.062483549118042, |
|
"learning_rate": 0.00016910626489868649, |
|
"loss": 0.1577, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 3.203065134099617, |
|
"grad_norm": 1.0054856538772583, |
|
"learning_rate": 0.0001688107909202374, |
|
"loss": 0.1893, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 3.218390804597701, |
|
"grad_norm": 1.111485481262207, |
|
"learning_rate": 0.00016851417149678444, |
|
"loss": 0.1796, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 3.2337164750957856, |
|
"grad_norm": 1.009745478630066, |
|
"learning_rate": 0.00016821641156594317, |
|
"loss": 0.1523, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 3.2490421455938696, |
|
"grad_norm": 1.213293433189392, |
|
"learning_rate": 0.0001679175160843145, |
|
"loss": 0.1619, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 3.264367816091954, |
|
"grad_norm": 1.5143858194351196, |
|
"learning_rate": 0.00016761749002740193, |
|
"loss": 0.1609, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 3.2796934865900385, |
|
"grad_norm": 1.3771694898605347, |
|
"learning_rate": 0.00016731633838952905, |
|
"loss": 0.1671, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 3.2950191570881224, |
|
"grad_norm": 1.1563445329666138, |
|
"learning_rate": 0.00016701406618375596, |
|
"loss": 0.1885, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 3.310344827586207, |
|
"grad_norm": 1.0585676431655884, |
|
"learning_rate": 0.00016671067844179627, |
|
"loss": 0.1634, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 3.3256704980842913, |
|
"grad_norm": 1.1020563840866089, |
|
"learning_rate": 0.00016640618021393304, |
|
"loss": 0.1838, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 3.3409961685823752, |
|
"grad_norm": 0.9592476487159729, |
|
"learning_rate": 0.00016610057656893482, |
|
"loss": 0.179, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 3.3563218390804597, |
|
"grad_norm": 0.9426510334014893, |
|
"learning_rate": 0.00016579387259397127, |
|
"loss": 0.1581, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 3.371647509578544, |
|
"grad_norm": 1.2259931564331055, |
|
"learning_rate": 0.00016548607339452853, |
|
"loss": 0.2017, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 3.3869731800766285, |
|
"grad_norm": 1.2636795043945312, |
|
"learning_rate": 0.00016517718409432406, |
|
"loss": 0.1804, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 3.3869731800766285, |
|
"eval_loss": 2.0642523765563965, |
|
"eval_runtime": 10.4896, |
|
"eval_samples_per_second": 9.533, |
|
"eval_steps_per_second": 4.767, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 3.4022988505747125, |
|
"grad_norm": 0.9591987729072571, |
|
"learning_rate": 0.00016486720983522156, |
|
"loss": 0.1653, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 3.417624521072797, |
|
"grad_norm": 0.9433954954147339, |
|
"learning_rate": 0.00016455615577714528, |
|
"loss": 0.1843, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 3.4329501915708813, |
|
"grad_norm": 1.0256028175354004, |
|
"learning_rate": 0.00016424402709799404, |
|
"loss": 0.1596, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 3.4482758620689653, |
|
"grad_norm": 1.0997707843780518, |
|
"learning_rate": 0.00016393082899355516, |
|
"loss": 0.1897, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 3.4636015325670497, |
|
"grad_norm": 1.6630239486694336, |
|
"learning_rate": 0.00016361656667741802, |
|
"loss": 0.2045, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 3.478927203065134, |
|
"grad_norm": 0.9956857562065125, |
|
"learning_rate": 0.00016330124538088705, |
|
"loss": 0.1653, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 3.4942528735632186, |
|
"grad_norm": 1.3272435665130615, |
|
"learning_rate": 0.0001629848703528949, |
|
"loss": 0.198, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 3.5095785440613025, |
|
"grad_norm": 8.141691207885742, |
|
"learning_rate": 0.0001626674468599149, |
|
"loss": 0.2591, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 3.524904214559387, |
|
"grad_norm": 0.9597133994102478, |
|
"learning_rate": 0.00016234898018587337, |
|
"loss": 0.1818, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.5402298850574714, |
|
"grad_norm": 0.949269711971283, |
|
"learning_rate": 0.00016202947563206187, |
|
"loss": 0.1675, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 3.5555555555555554, |
|
"grad_norm": 1.0063790082931519, |
|
"learning_rate": 0.00016170893851704876, |
|
"loss": 0.1875, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 3.57088122605364, |
|
"grad_norm": 1.2696994543075562, |
|
"learning_rate": 0.00016138737417659068, |
|
"loss": 0.1746, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 3.586206896551724, |
|
"grad_norm": 1.055250644683838, |
|
"learning_rate": 0.00016106478796354382, |
|
"loss": 0.1919, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 3.6015325670498086, |
|
"grad_norm": 0.9498022794723511, |
|
"learning_rate": 0.00016074118524777477, |
|
"loss": 0.1441, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 3.6168582375478926, |
|
"grad_norm": 1.0420253276824951, |
|
"learning_rate": 0.00016041657141607107, |
|
"loss": 0.1634, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 3.632183908045977, |
|
"grad_norm": 1.2098767757415771, |
|
"learning_rate": 0.0001600909518720517, |
|
"loss": 0.187, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 3.6475095785440614, |
|
"grad_norm": 1.2031207084655762, |
|
"learning_rate": 0.0001597643320360769, |
|
"loss": 0.1881, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 3.6475095785440614, |
|
"eval_loss": 2.092371940612793, |
|
"eval_runtime": 10.4707, |
|
"eval_samples_per_second": 9.551, |
|
"eval_steps_per_second": 4.775, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 3.6628352490421454, |
|
"grad_norm": 1.0068916082382202, |
|
"learning_rate": 0.0001594367173451582, |
|
"loss": 0.1499, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 3.67816091954023, |
|
"grad_norm": 1.188425898551941, |
|
"learning_rate": 0.00015910811325286768, |
|
"loss": 0.1928, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.6934865900383143, |
|
"grad_norm": 1.054997205734253, |
|
"learning_rate": 0.00015877852522924732, |
|
"loss": 0.1726, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 3.7088122605363987, |
|
"grad_norm": 1.0925296545028687, |
|
"learning_rate": 0.000158447958760718, |
|
"loss": 0.2032, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 3.7241379310344827, |
|
"grad_norm": 1.2014827728271484, |
|
"learning_rate": 0.0001581164193499879, |
|
"loss": 0.1907, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 3.739463601532567, |
|
"grad_norm": 1.1900111436843872, |
|
"learning_rate": 0.0001577839125159613, |
|
"loss": 0.1977, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 3.7547892720306515, |
|
"grad_norm": 1.049250602722168, |
|
"learning_rate": 0.00015745044379364634, |
|
"loss": 0.1734, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 3.7701149425287355, |
|
"grad_norm": 1.1495704650878906, |
|
"learning_rate": 0.00015711601873406313, |
|
"loss": 0.2184, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 3.78544061302682, |
|
"grad_norm": 0.9893819689750671, |
|
"learning_rate": 0.00015678064290415122, |
|
"loss": 0.1594, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 3.8007662835249043, |
|
"grad_norm": 1.0403058528900146, |
|
"learning_rate": 0.00015644432188667695, |
|
"loss": 0.165, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 3.8160919540229887, |
|
"grad_norm": 1.1845136880874634, |
|
"learning_rate": 0.00015610706128014055, |
|
"loss": 0.204, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 3.8314176245210727, |
|
"grad_norm": 1.1242119073867798, |
|
"learning_rate": 0.00015576886669868296, |
|
"loss": 0.1861, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.846743295019157, |
|
"grad_norm": 1.0183254480361938, |
|
"learning_rate": 0.0001554297437719923, |
|
"loss": 0.18, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 3.862068965517241, |
|
"grad_norm": 1.0303974151611328, |
|
"learning_rate": 0.00015508969814521025, |
|
"loss": 0.1951, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 3.8773946360153255, |
|
"grad_norm": 1.1616798639297485, |
|
"learning_rate": 0.000154748735478838, |
|
"loss": 0.2126, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 3.89272030651341, |
|
"grad_norm": 1.1582714319229126, |
|
"learning_rate": 0.00015440686144864207, |
|
"loss": 0.1696, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 3.9080459770114944, |
|
"grad_norm": 1.0691121816635132, |
|
"learning_rate": 0.00015406408174555976, |
|
"loss": 0.1762, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 3.9080459770114944, |
|
"eval_loss": 2.062448501586914, |
|
"eval_runtime": 10.503, |
|
"eval_samples_per_second": 9.521, |
|
"eval_steps_per_second": 4.761, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 3.923371647509579, |
|
"grad_norm": 1.0353065729141235, |
|
"learning_rate": 0.00015372040207560457, |
|
"loss": 0.1894, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 3.9386973180076628, |
|
"grad_norm": 1.1007777452468872, |
|
"learning_rate": 0.00015337582815977104, |
|
"loss": 0.1864, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 3.954022988505747, |
|
"grad_norm": 0.9735039472579956, |
|
"learning_rate": 0.00015303036573393962, |
|
"loss": 0.1716, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 3.969348659003831, |
|
"grad_norm": 1.0294030904769897, |
|
"learning_rate": 0.00015268402054878117, |
|
"loss": 0.1842, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 3.9846743295019156, |
|
"grad_norm": 1.0041604042053223, |
|
"learning_rate": 0.00015233679836966122, |
|
"loss": 0.1904, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 2.519958734512329, |
|
"learning_rate": 0.00015198870497654395, |
|
"loss": 0.4303, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 4.015325670498084, |
|
"grad_norm": 0.9649507999420166, |
|
"learning_rate": 0.0001516397461638962, |
|
"loss": 0.1039, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 4.030651340996169, |
|
"grad_norm": 0.6340312361717224, |
|
"learning_rate": 0.00015128992774059063, |
|
"loss": 0.0831, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 4.045977011494253, |
|
"grad_norm": 2.8160183429718018, |
|
"learning_rate": 0.00015093925552980933, |
|
"loss": 0.0998, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 4.061302681992337, |
|
"grad_norm": 0.9386498332023621, |
|
"learning_rate": 0.00015058773536894685, |
|
"loss": 0.0737, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 4.076628352490421, |
|
"grad_norm": 0.6389781832695007, |
|
"learning_rate": 0.00015023537310951282, |
|
"loss": 0.0714, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 4.091954022988506, |
|
"grad_norm": 0.6236942410469055, |
|
"learning_rate": 0.0001498821746170349, |
|
"loss": 0.0713, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 4.10727969348659, |
|
"grad_norm": 0.7775859236717224, |
|
"learning_rate": 0.00014952814577096071, |
|
"loss": 0.0723, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 4.1226053639846745, |
|
"grad_norm": 0.8838902711868286, |
|
"learning_rate": 0.0001491732924645604, |
|
"loss": 0.0806, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 4.137931034482759, |
|
"grad_norm": 0.8139066696166992, |
|
"learning_rate": 0.00014881762060482814, |
|
"loss": 0.0681, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 4.153256704980843, |
|
"grad_norm": 0.7435247302055359, |
|
"learning_rate": 0.00014846113611238413, |
|
"loss": 0.0727, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 4.168582375478927, |
|
"grad_norm": 8.997066497802734, |
|
"learning_rate": 0.0001481038449213758, |
|
"loss": 0.195, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 4.168582375478927, |
|
"eval_loss": 2.326845169067383, |
|
"eval_runtime": 10.5534, |
|
"eval_samples_per_second": 9.476, |
|
"eval_steps_per_second": 4.738, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 4.183908045977011, |
|
"grad_norm": 0.7295827269554138, |
|
"learning_rate": 0.0001477457529793792, |
|
"loss": 0.0834, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 4.199233716475096, |
|
"grad_norm": 0.9554088711738586, |
|
"learning_rate": 0.00014738686624729986, |
|
"loss": 0.0966, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 4.21455938697318, |
|
"grad_norm": 0.709963858127594, |
|
"learning_rate": 0.0001470271906992737, |
|
"loss": 0.0573, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 4.2298850574712645, |
|
"grad_norm": 0.8901592493057251, |
|
"learning_rate": 0.00014666673232256738, |
|
"loss": 0.076, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 4.245210727969349, |
|
"grad_norm": 0.706717848777771, |
|
"learning_rate": 0.00014630549711747888, |
|
"loss": 0.0746, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 4.260536398467433, |
|
"grad_norm": 3.1939444541931152, |
|
"learning_rate": 0.00014594349109723744, |
|
"loss": 0.122, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 4.275862068965517, |
|
"grad_norm": 0.8928236961364746, |
|
"learning_rate": 0.00014558072028790354, |
|
"loss": 0.1025, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 4.291187739463601, |
|
"grad_norm": 0.7875874638557434, |
|
"learning_rate": 0.00014521719072826858, |
|
"loss": 0.0856, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 4.306513409961686, |
|
"grad_norm": 1.0411407947540283, |
|
"learning_rate": 0.00014485290846975431, |
|
"loss": 0.0819, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 4.32183908045977, |
|
"grad_norm": 0.8319458365440369, |
|
"learning_rate": 0.0001444878795763121, |
|
"loss": 0.0625, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 4.337164750957855, |
|
"grad_norm": 0.7555274963378906, |
|
"learning_rate": 0.00014412211012432212, |
|
"loss": 0.0831, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 4.352490421455939, |
|
"grad_norm": 0.7779274582862854, |
|
"learning_rate": 0.0001437556062024921, |
|
"loss": 0.0991, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 4.3678160919540225, |
|
"grad_norm": 1.9860173463821411, |
|
"learning_rate": 0.00014338837391175582, |
|
"loss": 0.0907, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 4.383141762452107, |
|
"grad_norm": 0.9153367280960083, |
|
"learning_rate": 0.0001430204193651719, |
|
"loss": 0.0957, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 4.398467432950191, |
|
"grad_norm": 1.0085121393203735, |
|
"learning_rate": 0.0001426517486878217, |
|
"loss": 0.1071, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 4.413793103448276, |
|
"grad_norm": 0.7043394446372986, |
|
"learning_rate": 0.00014228236801670763, |
|
"loss": 0.077, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 4.42911877394636, |
|
"grad_norm": 0.7112743854522705, |
|
"learning_rate": 0.00014191228350065078, |
|
"loss": 0.0649, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 4.42911877394636, |
|
"eval_loss": 2.271777868270874, |
|
"eval_runtime": 10.4648, |
|
"eval_samples_per_second": 9.556, |
|
"eval_steps_per_second": 4.778, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 4.444444444444445, |
|
"grad_norm": 0.7803434729576111, |
|
"learning_rate": 0.00014154150130018866, |
|
"loss": 0.0704, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 4.459770114942529, |
|
"grad_norm": 0.7092854380607605, |
|
"learning_rate": 0.00014117002758747268, |
|
"loss": 0.0745, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 4.4750957854406135, |
|
"grad_norm": 0.7031986117362976, |
|
"learning_rate": 0.00014079786854616537, |
|
"loss": 0.0649, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 4.490421455938697, |
|
"grad_norm": 0.7902014255523682, |
|
"learning_rate": 0.00014042503037133737, |
|
"loss": 0.0908, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 4.505747126436781, |
|
"grad_norm": 1.1959948539733887, |
|
"learning_rate": 0.00014005151926936452, |
|
"loss": 0.0868, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 4.521072796934866, |
|
"grad_norm": 1.7838146686553955, |
|
"learning_rate": 0.00013967734145782425, |
|
"loss": 0.0785, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 4.53639846743295, |
|
"grad_norm": 1.0136120319366455, |
|
"learning_rate": 0.00013930250316539238, |
|
"loss": 0.1004, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 4.551724137931035, |
|
"grad_norm": 0.9047825932502747, |
|
"learning_rate": 0.00013892701063173918, |
|
"loss": 0.0902, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 4.567049808429119, |
|
"grad_norm": 0.7350003123283386, |
|
"learning_rate": 0.00013855087010742562, |
|
"loss": 0.0728, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 4.582375478927203, |
|
"grad_norm": 1.1646071672439575, |
|
"learning_rate": 0.00013817408785379943, |
|
"loss": 0.092, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 4.597701149425287, |
|
"grad_norm": 0.6288233399391174, |
|
"learning_rate": 0.00013779667014289065, |
|
"loss": 0.0678, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 4.6130268199233715, |
|
"grad_norm": 0.7127698063850403, |
|
"learning_rate": 0.00013741862325730738, |
|
"loss": 0.0921, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 4.628352490421456, |
|
"grad_norm": 0.8102079629898071, |
|
"learning_rate": 0.00013703995349013113, |
|
"loss": 0.0851, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 4.64367816091954, |
|
"grad_norm": 0.778022050857544, |
|
"learning_rate": 0.00013666066714481206, |
|
"loss": 0.0885, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 4.659003831417625, |
|
"grad_norm": 0.6419159770011902, |
|
"learning_rate": 0.0001362807705350641, |
|
"loss": 0.0736, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 4.674329501915709, |
|
"grad_norm": 0.7336333394050598, |
|
"learning_rate": 0.00013590026998475986, |
|
"loss": 0.0761, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 4.689655172413794, |
|
"grad_norm": 0.6584993600845337, |
|
"learning_rate": 0.00013551917182782529, |
|
"loss": 0.0786, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 4.689655172413794, |
|
"eval_loss": 2.256883144378662, |
|
"eval_runtime": 10.5286, |
|
"eval_samples_per_second": 9.498, |
|
"eval_steps_per_second": 4.749, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 4.704980842911877, |
|
"grad_norm": 0.7220829725265503, |
|
"learning_rate": 0.0001351374824081343, |
|
"loss": 0.0737, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 4.7203065134099615, |
|
"grad_norm": 0.8544161319732666, |
|
"learning_rate": 0.00013475520807940304, |
|
"loss": 0.0839, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 4.735632183908046, |
|
"grad_norm": 0.9264532327651978, |
|
"learning_rate": 0.00013437235520508432, |
|
"loss": 0.0904, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 4.75095785440613, |
|
"grad_norm": 0.6544135212898254, |
|
"learning_rate": 0.00013398893015826167, |
|
"loss": 0.0692, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 4.766283524904215, |
|
"grad_norm": 0.6521825790405273, |
|
"learning_rate": 0.00013360493932154302, |
|
"loss": 0.0696, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 4.781609195402299, |
|
"grad_norm": 0.7229333519935608, |
|
"learning_rate": 0.00013322038908695466, |
|
"loss": 0.0811, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 4.796934865900383, |
|
"grad_norm": 0.8600510954856873, |
|
"learning_rate": 0.00013283528585583484, |
|
"loss": 0.0623, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 4.812260536398467, |
|
"grad_norm": 0.8433498740196228, |
|
"learning_rate": 0.00013244963603872706, |
|
"loss": 0.0805, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 4.827586206896552, |
|
"grad_norm": 1.2378168106079102, |
|
"learning_rate": 0.00013206344605527355, |
|
"loss": 0.0745, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 4.842911877394636, |
|
"grad_norm": 1.4228192567825317, |
|
"learning_rate": 0.00013167672233410825, |
|
"loss": 0.1218, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 4.85823754789272, |
|
"grad_norm": 0.7594043612480164, |
|
"learning_rate": 0.00013128947131274988, |
|
"loss": 0.0744, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 4.873563218390805, |
|
"grad_norm": 0.8461570739746094, |
|
"learning_rate": 0.00013090169943749476, |
|
"loss": 0.0907, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 4.888888888888889, |
|
"grad_norm": 0.8196818232536316, |
|
"learning_rate": 0.00013051341316330946, |
|
"loss": 0.0835, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 4.904214559386973, |
|
"grad_norm": 2.694230794906616, |
|
"learning_rate": 0.00013012461895372344, |
|
"loss": 0.0844, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 4.919540229885057, |
|
"grad_norm": 1.4861178398132324, |
|
"learning_rate": 0.00012973532328072138, |
|
"loss": 0.0782, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 4.934865900383142, |
|
"grad_norm": 0.9646175503730774, |
|
"learning_rate": 0.00012934553262463548, |
|
"loss": 0.069, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 4.950191570881226, |
|
"grad_norm": 0.7597980499267578, |
|
"learning_rate": 0.00012895525347403756, |
|
"loss": 0.0763, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 4.950191570881226, |
|
"eval_loss": 2.252124547958374, |
|
"eval_runtime": 10.469, |
|
"eval_samples_per_second": 9.552, |
|
"eval_steps_per_second": 4.776, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 4.9655172413793105, |
|
"grad_norm": 0.7091509699821472, |
|
"learning_rate": 0.0001285644923256311, |
|
"loss": 0.0734, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 4.980842911877395, |
|
"grad_norm": 0.8412840366363525, |
|
"learning_rate": 0.00012817325568414297, |
|
"loss": 0.0982, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 4.9961685823754785, |
|
"grad_norm": 0.9467046856880188, |
|
"learning_rate": 0.00012778155006221538, |
|
"loss": 0.0725, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 5.011494252873563, |
|
"grad_norm": 1.2083613872528076, |
|
"learning_rate": 0.00012738938198029724, |
|
"loss": 0.0743, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 5.026819923371647, |
|
"grad_norm": 0.8673701882362366, |
|
"learning_rate": 0.0001269967579665357, |
|
"loss": 0.0423, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 5.042145593869732, |
|
"grad_norm": 0.36529555916786194, |
|
"learning_rate": 0.00012660368455666752, |
|
"loss": 0.027, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 5.057471264367816, |
|
"grad_norm": 0.44554996490478516, |
|
"learning_rate": 0.00012621016829391022, |
|
"loss": 0.0296, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 5.0727969348659006, |
|
"grad_norm": 0.9303228259086609, |
|
"learning_rate": 0.00012581621572885321, |
|
"loss": 0.0569, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 5.088122605363985, |
|
"grad_norm": 0.45792293548583984, |
|
"learning_rate": 0.00012542183341934872, |
|
"loss": 0.036, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 5.103448275862069, |
|
"grad_norm": 0.6033705472946167, |
|
"learning_rate": 0.0001250270279304026, |
|
"loss": 0.0409, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 5.118773946360153, |
|
"grad_norm": 0.5663286447525024, |
|
"learning_rate": 0.000124631805834065, |
|
"loss": 0.0258, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 5.134099616858237, |
|
"grad_norm": 0.6377267837524414, |
|
"learning_rate": 0.00012423617370932127, |
|
"loss": 0.039, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 5.149425287356322, |
|
"grad_norm": 0.4742782711982727, |
|
"learning_rate": 0.00012384013814198196, |
|
"loss": 0.0335, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 5.164750957854406, |
|
"grad_norm": 0.5032561421394348, |
|
"learning_rate": 0.00012344370572457366, |
|
"loss": 0.0269, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 5.180076628352491, |
|
"grad_norm": 0.4018470048904419, |
|
"learning_rate": 0.0001230468830562289, |
|
"loss": 0.0271, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 5.195402298850575, |
|
"grad_norm": 0.5031781196594238, |
|
"learning_rate": 0.00012264967674257646, |
|
"loss": 0.0252, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 5.210727969348659, |
|
"grad_norm": 0.6742706894874573, |
|
"learning_rate": 0.00012225209339563145, |
|
"loss": 0.0509, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 5.210727969348659, |
|
"eval_loss": 2.4545507431030273, |
|
"eval_runtime": 10.7404, |
|
"eval_samples_per_second": 9.311, |
|
"eval_steps_per_second": 4.655, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 5.226053639846743, |
|
"grad_norm": 0.6078564524650574, |
|
"learning_rate": 0.00012185413963368519, |
|
"loss": 0.0453, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 5.241379310344827, |
|
"grad_norm": 0.5548681616783142, |
|
"learning_rate": 0.00012145582208119497, |
|
"loss": 0.031, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 5.256704980842912, |
|
"grad_norm": 0.5871354937553406, |
|
"learning_rate": 0.00012105714736867391, |
|
"loss": 0.0391, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 5.272030651340996, |
|
"grad_norm": 0.5070196986198425, |
|
"learning_rate": 0.0001206581221325805, |
|
"loss": 0.0282, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 5.287356321839081, |
|
"grad_norm": 0.6400995850563049, |
|
"learning_rate": 0.0001202587530152081, |
|
"loss": 0.0326, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 5.302681992337165, |
|
"grad_norm": 0.5636530518531799, |
|
"learning_rate": 0.00011985904666457455, |
|
"loss": 0.0341, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 5.3180076628352495, |
|
"grad_norm": 0.27172422409057617, |
|
"learning_rate": 0.00011945900973431128, |
|
"loss": 0.0226, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 5.333333333333333, |
|
"grad_norm": 0.41421565413475037, |
|
"learning_rate": 0.00011905864888355263, |
|
"loss": 0.0322, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 5.3486590038314175, |
|
"grad_norm": 0.444100022315979, |
|
"learning_rate": 0.00011865797077682508, |
|
"loss": 0.0262, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 5.363984674329502, |
|
"grad_norm": 0.5755631923675537, |
|
"learning_rate": 0.00011825698208393619, |
|
"loss": 0.0314, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 5.379310344827586, |
|
"grad_norm": 0.5454833507537842, |
|
"learning_rate": 0.00011785568947986367, |
|
"loss": 0.0336, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 5.394636015325671, |
|
"grad_norm": 1.3440561294555664, |
|
"learning_rate": 0.00011745409964464424, |
|
"loss": 0.0345, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 5.409961685823755, |
|
"grad_norm": 0.4198431670665741, |
|
"learning_rate": 0.0001170522192632624, |
|
"loss": 0.0276, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 5.425287356321839, |
|
"grad_norm": 0.4718680679798126, |
|
"learning_rate": 0.00011665005502553911, |
|
"loss": 0.0288, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 5.440613026819923, |
|
"grad_norm": 0.9051384329795837, |
|
"learning_rate": 0.00011624761362602061, |
|
"loss": 0.0444, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 5.4559386973180075, |
|
"grad_norm": 0.5586571097373962, |
|
"learning_rate": 0.00011584490176386671, |
|
"loss": 0.027, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 5.471264367816092, |
|
"grad_norm": 0.5432120561599731, |
|
"learning_rate": 0.00011544192614273956, |
|
"loss": 0.0374, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 5.471264367816092, |
|
"eval_loss": 2.4692599773406982, |
|
"eval_runtime": 10.4877, |
|
"eval_samples_per_second": 9.535, |
|
"eval_steps_per_second": 4.768, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 5.486590038314176, |
|
"grad_norm": 0.884427547454834, |
|
"learning_rate": 0.00011503869347069185, |
|
"loss": 0.0558, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 5.501915708812261, |
|
"grad_norm": 0.43964701890945435, |
|
"learning_rate": 0.00011463521046005523, |
|
"loss": 0.0278, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 5.517241379310345, |
|
"grad_norm": 0.44980964064598083, |
|
"learning_rate": 0.00011423148382732853, |
|
"loss": 0.0275, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 5.53256704980843, |
|
"grad_norm": 0.40179964900016785, |
|
"learning_rate": 0.00011382752029306604, |
|
"loss": 0.0304, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 5.547892720306513, |
|
"grad_norm": 0.6193554401397705, |
|
"learning_rate": 0.00011342332658176555, |
|
"loss": 0.0305, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 5.563218390804598, |
|
"grad_norm": 0.4448515474796295, |
|
"learning_rate": 0.00011301890942175648, |
|
"loss": 0.0303, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 5.578544061302682, |
|
"grad_norm": 0.40030574798583984, |
|
"learning_rate": 0.0001126142755450878, |
|
"loss": 0.0263, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 5.593869731800766, |
|
"grad_norm": 0.5186451077461243, |
|
"learning_rate": 0.000112209431687416, |
|
"loss": 0.0278, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 5.609195402298851, |
|
"grad_norm": 0.5285075902938843, |
|
"learning_rate": 0.00011180438458789304, |
|
"loss": 0.0348, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 5.624521072796935, |
|
"grad_norm": 0.4877240061759949, |
|
"learning_rate": 0.00011139914098905406, |
|
"loss": 0.0386, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 5.639846743295019, |
|
"grad_norm": 0.5512449145317078, |
|
"learning_rate": 0.00011099370763670523, |
|
"loss": 0.0297, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 5.655172413793103, |
|
"grad_norm": 0.5295383334159851, |
|
"learning_rate": 0.00011058809127981134, |
|
"loss": 0.0344, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 5.670498084291188, |
|
"grad_norm": 0.5817351341247559, |
|
"learning_rate": 0.00011018229867038356, |
|
"loss": 0.0363, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 5.685823754789272, |
|
"grad_norm": 0.3530018627643585, |
|
"learning_rate": 0.00010977633656336706, |
|
"loss": 0.0212, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 5.7011494252873565, |
|
"grad_norm": 2.2889881134033203, |
|
"learning_rate": 0.00010937021171652841, |
|
"loss": 0.0352, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 5.716475095785441, |
|
"grad_norm": 0.846163809299469, |
|
"learning_rate": 0.00010896393089034336, |
|
"loss": 0.0477, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 5.731800766283525, |
|
"grad_norm": 0.31894299387931824, |
|
"learning_rate": 0.00010855750084788398, |
|
"loss": 0.0216, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 5.731800766283525, |
|
"eval_loss": 2.4762635231018066, |
|
"eval_runtime": 10.4616, |
|
"eval_samples_per_second": 9.559, |
|
"eval_steps_per_second": 4.779, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 5.747126436781609, |
|
"grad_norm": 0.6521170139312744, |
|
"learning_rate": 0.00010815092835470633, |
|
"loss": 0.0268, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 5.762452107279693, |
|
"grad_norm": 0.2925560772418976, |
|
"learning_rate": 0.00010774422017873771, |
|
"loss": 0.0223, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 5.777777777777778, |
|
"grad_norm": 0.7669603824615479, |
|
"learning_rate": 0.00010733738309016401, |
|
"loss": 0.027, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 5.793103448275862, |
|
"grad_norm": 0.30490854382514954, |
|
"learning_rate": 0.00010693042386131713, |
|
"loss": 0.02, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 5.8084291187739465, |
|
"grad_norm": 0.456485390663147, |
|
"learning_rate": 0.00010652334926656209, |
|
"loss": 0.0278, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 5.823754789272031, |
|
"grad_norm": 0.5804373621940613, |
|
"learning_rate": 0.00010611616608218429, |
|
"loss": 0.0347, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 5.8390804597701145, |
|
"grad_norm": 1.551376461982727, |
|
"learning_rate": 0.00010570888108627681, |
|
"loss": 0.0274, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 5.854406130268199, |
|
"grad_norm": 0.7403205037117004, |
|
"learning_rate": 0.00010530150105862748, |
|
"loss": 0.0285, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 5.869731800766283, |
|
"grad_norm": 0.7229623794555664, |
|
"learning_rate": 0.00010489403278060613, |
|
"loss": 0.0391, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 5.885057471264368, |
|
"grad_norm": 0.3897419571876526, |
|
"learning_rate": 0.00010448648303505151, |
|
"loss": 0.0231, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 5.900383141762452, |
|
"grad_norm": 0.5959421396255493, |
|
"learning_rate": 0.00010407885860615859, |
|
"loss": 0.0309, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 5.915708812260537, |
|
"grad_norm": 0.7538139224052429, |
|
"learning_rate": 0.00010367116627936548, |
|
"loss": 0.0306, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 5.931034482758621, |
|
"grad_norm": 0.46324053406715393, |
|
"learning_rate": 0.00010326341284124061, |
|
"loss": 0.0293, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 5.946360153256705, |
|
"grad_norm": 1.4018464088439941, |
|
"learning_rate": 0.00010285560507936961, |
|
"loss": 0.0393, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 5.961685823754789, |
|
"grad_norm": 0.5677470564842224, |
|
"learning_rate": 0.00010244774978224254, |
|
"loss": 0.0361, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 5.977011494252873, |
|
"grad_norm": 0.35945063829421997, |
|
"learning_rate": 0.00010203985373914056, |
|
"loss": 0.0206, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 5.992337164750958, |
|
"grad_norm": 0.35713624954223633, |
|
"learning_rate": 0.0001016319237400232, |
|
"loss": 0.0272, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 5.992337164750958, |
|
"eval_loss": 2.511009454727173, |
|
"eval_runtime": 10.521, |
|
"eval_samples_per_second": 9.505, |
|
"eval_steps_per_second": 4.752, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 6.003831417624521, |
|
"grad_norm": 0.6757388114929199, |
|
"learning_rate": 0.00010122396657541522, |
|
"loss": 0.035, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 6.019157088122605, |
|
"grad_norm": 0.3791247010231018, |
|
"learning_rate": 0.0001008159890362936, |
|
"loss": 0.0174, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 6.0344827586206895, |
|
"grad_norm": 0.19176137447357178, |
|
"learning_rate": 0.00010040799791397444, |
|
"loss": 0.0146, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 6.049808429118774, |
|
"grad_norm": 0.16038718819618225, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0118, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 6.065134099616858, |
|
"grad_norm": 0.14217466115951538, |
|
"learning_rate": 9.95920020860256e-05, |
|
"loss": 0.009, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 6.080459770114943, |
|
"grad_norm": 0.19670097529888153, |
|
"learning_rate": 9.918401096370644e-05, |
|
"loss": 0.0134, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 6.095785440613027, |
|
"grad_norm": 0.7063495516777039, |
|
"learning_rate": 9.877603342458483e-05, |
|
"loss": 0.0186, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 6.111111111111111, |
|
"grad_norm": 0.27073654532432556, |
|
"learning_rate": 9.836807625997683e-05, |
|
"loss": 0.0123, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 6.126436781609195, |
|
"grad_norm": 0.34357860684394836, |
|
"learning_rate": 9.79601462608595e-05, |
|
"loss": 0.0224, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 6.14176245210728, |
|
"grad_norm": 1.0311784744262695, |
|
"learning_rate": 9.755225021775749e-05, |
|
"loss": 0.0122, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 6.157088122605364, |
|
"grad_norm": 0.12156683206558228, |
|
"learning_rate": 9.71443949206304e-05, |
|
"loss": 0.011, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 6.172413793103448, |
|
"grad_norm": 0.15306659042835236, |
|
"learning_rate": 9.67365871587594e-05, |
|
"loss": 0.0101, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 6.187739463601533, |
|
"grad_norm": 0.40619829297065735, |
|
"learning_rate": 9.632883372063457e-05, |
|
"loss": 0.0124, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 6.203065134099617, |
|
"grad_norm": 0.2220255583524704, |
|
"learning_rate": 9.592114139384145e-05, |
|
"loss": 0.0115, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 6.218390804597701, |
|
"grad_norm": 0.36143144965171814, |
|
"learning_rate": 9.551351696494854e-05, |
|
"loss": 0.0143, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 6.233716475095785, |
|
"grad_norm": 0.19601793587207794, |
|
"learning_rate": 9.51059672193939e-05, |
|
"loss": 0.0121, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 6.24904214559387, |
|
"grad_norm": 0.17943957448005676, |
|
"learning_rate": 9.469849894137253e-05, |
|
"loss": 0.0117, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 6.24904214559387, |
|
"eval_loss": 2.7329955101013184, |
|
"eval_runtime": 10.5244, |
|
"eval_samples_per_second": 9.502, |
|
"eval_steps_per_second": 4.751, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 6.264367816091954, |
|
"grad_norm": 0.19360607862472534, |
|
"learning_rate": 9.42911189137232e-05, |
|
"loss": 0.0095, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 6.2796934865900385, |
|
"grad_norm": 0.24287296831607819, |
|
"learning_rate": 9.388383391781575e-05, |
|
"loss": 0.0116, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 6.295019157088123, |
|
"grad_norm": 0.554787814617157, |
|
"learning_rate": 9.347665073343794e-05, |
|
"loss": 0.0138, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 6.310344827586207, |
|
"grad_norm": 0.23142507672309875, |
|
"learning_rate": 9.306957613868292e-05, |
|
"loss": 0.0131, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 6.325670498084291, |
|
"grad_norm": 0.2346455603837967, |
|
"learning_rate": 9.266261690983602e-05, |
|
"loss": 0.011, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 6.340996168582375, |
|
"grad_norm": 0.8730548620223999, |
|
"learning_rate": 9.225577982126234e-05, |
|
"loss": 0.0151, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 6.35632183908046, |
|
"grad_norm": 0.3552612364292145, |
|
"learning_rate": 9.184907164529368e-05, |
|
"loss": 0.0232, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 6.371647509578544, |
|
"grad_norm": 0.22842758893966675, |
|
"learning_rate": 9.144249915211605e-05, |
|
"loss": 0.0153, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 6.3869731800766285, |
|
"grad_norm": 0.20680157840251923, |
|
"learning_rate": 9.103606910965666e-05, |
|
"loss": 0.0128, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 6.402298850574713, |
|
"grad_norm": 0.4528963565826416, |
|
"learning_rate": 9.062978828347161e-05, |
|
"loss": 0.0222, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 6.417624521072797, |
|
"grad_norm": 0.298604816198349, |
|
"learning_rate": 9.022366343663298e-05, |
|
"loss": 0.0168, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 6.432950191570881, |
|
"grad_norm": 0.11246322840452194, |
|
"learning_rate": 8.981770132961649e-05, |
|
"loss": 0.0089, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 6.448275862068965, |
|
"grad_norm": 0.2391061782836914, |
|
"learning_rate": 8.94119087201887e-05, |
|
"loss": 0.0105, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 6.46360153256705, |
|
"grad_norm": 0.10826307535171509, |
|
"learning_rate": 8.900629236329482e-05, |
|
"loss": 0.0089, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 6.478927203065134, |
|
"grad_norm": 0.18837091326713562, |
|
"learning_rate": 8.860085901094595e-05, |
|
"loss": 0.0117, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 6.494252873563219, |
|
"grad_norm": 0.24223893880844116, |
|
"learning_rate": 8.819561541210698e-05, |
|
"loss": 0.0109, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 6.509578544061303, |
|
"grad_norm": 0.38215088844299316, |
|
"learning_rate": 8.779056831258402e-05, |
|
"loss": 0.0115, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 6.509578544061303, |
|
"eval_loss": 2.640347480773926, |
|
"eval_runtime": 10.5535, |
|
"eval_samples_per_second": 9.475, |
|
"eval_steps_per_second": 4.738, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 6.5249042145593865, |
|
"grad_norm": 0.4854836165904999, |
|
"learning_rate": 8.738572445491226e-05, |
|
"loss": 0.0168, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 6.540229885057471, |
|
"grad_norm": 0.20515725016593933, |
|
"learning_rate": 8.698109057824354e-05, |
|
"loss": 0.0128, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 6.555555555555555, |
|
"grad_norm": 0.21756961941719055, |
|
"learning_rate": 8.657667341823448e-05, |
|
"loss": 0.0114, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 6.57088122605364, |
|
"grad_norm": 0.18275758624076843, |
|
"learning_rate": 8.617247970693398e-05, |
|
"loss": 0.0105, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 6.586206896551724, |
|
"grad_norm": 0.175423264503479, |
|
"learning_rate": 8.57685161726715e-05, |
|
"loss": 0.0102, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 6.601532567049809, |
|
"grad_norm": 0.3893040418624878, |
|
"learning_rate": 8.53647895399448e-05, |
|
"loss": 0.0151, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 6.616858237547893, |
|
"grad_norm": 0.3841419816017151, |
|
"learning_rate": 8.496130652930818e-05, |
|
"loss": 0.0135, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 6.6321839080459775, |
|
"grad_norm": 0.1184447631239891, |
|
"learning_rate": 8.455807385726046e-05, |
|
"loss": 0.0096, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 6.647509578544061, |
|
"grad_norm": 0.11839904636144638, |
|
"learning_rate": 8.415509823613331e-05, |
|
"loss": 0.0087, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 6.662835249042145, |
|
"grad_norm": 0.27116042375564575, |
|
"learning_rate": 8.375238637397942e-05, |
|
"loss": 0.0134, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 6.67816091954023, |
|
"grad_norm": 0.1837141215801239, |
|
"learning_rate": 8.334994497446091e-05, |
|
"loss": 0.0102, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 6.693486590038314, |
|
"grad_norm": 0.14119590818881989, |
|
"learning_rate": 8.294778073673762e-05, |
|
"loss": 0.0103, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 6.708812260536399, |
|
"grad_norm": 0.38409751653671265, |
|
"learning_rate": 8.254590035535579e-05, |
|
"loss": 0.0146, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 6.724137931034483, |
|
"grad_norm": 0.1519305408000946, |
|
"learning_rate": 8.214431052013634e-05, |
|
"loss": 0.0097, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 6.739463601532567, |
|
"grad_norm": 0.2955567240715027, |
|
"learning_rate": 8.174301791606385e-05, |
|
"loss": 0.0114, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 6.754789272030651, |
|
"grad_norm": 0.2837064862251282, |
|
"learning_rate": 8.134202922317495e-05, |
|
"loss": 0.0134, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 6.7701149425287355, |
|
"grad_norm": 0.13082526624202728, |
|
"learning_rate": 8.094135111644742e-05, |
|
"loss": 0.0092, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 6.7701149425287355, |
|
"eval_loss": 2.7746777534484863, |
|
"eval_runtime": 10.5408, |
|
"eval_samples_per_second": 9.487, |
|
"eval_steps_per_second": 4.743, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 6.78544061302682, |
|
"grad_norm": 0.5769606232643127, |
|
"learning_rate": 8.054099026568874e-05, |
|
"loss": 0.0147, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 6.800766283524904, |
|
"grad_norm": 0.1398877650499344, |
|
"learning_rate": 8.014095333542548e-05, |
|
"loss": 0.0098, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 6.816091954022989, |
|
"grad_norm": 0.16053611040115356, |
|
"learning_rate": 7.974124698479192e-05, |
|
"loss": 0.0074, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 6.831417624521073, |
|
"grad_norm": 0.27454668283462524, |
|
"learning_rate": 7.934187786741956e-05, |
|
"loss": 0.0103, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 6.846743295019158, |
|
"grad_norm": 0.36763104796409607, |
|
"learning_rate": 7.894285263132612e-05, |
|
"loss": 0.0153, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 6.862068965517241, |
|
"grad_norm": 0.21019311249256134, |
|
"learning_rate": 7.854417791880507e-05, |
|
"loss": 0.013, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 6.8773946360153255, |
|
"grad_norm": 0.2829742133617401, |
|
"learning_rate": 7.814586036631483e-05, |
|
"loss": 0.0118, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 6.89272030651341, |
|
"grad_norm": 0.30828389525413513, |
|
"learning_rate": 7.774790660436858e-05, |
|
"loss": 0.011, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 6.908045977011494, |
|
"grad_norm": 0.6878758072853088, |
|
"learning_rate": 7.735032325742355e-05, |
|
"loss": 0.0293, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 6.923371647509579, |
|
"grad_norm": 0.15684568881988525, |
|
"learning_rate": 7.695311694377115e-05, |
|
"loss": 0.01, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 6.938697318007663, |
|
"grad_norm": 0.32623958587646484, |
|
"learning_rate": 7.655629427542635e-05, |
|
"loss": 0.0117, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 6.954022988505747, |
|
"grad_norm": 0.10675598680973053, |
|
"learning_rate": 7.615986185801807e-05, |
|
"loss": 0.0077, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 6.969348659003831, |
|
"grad_norm": 0.3139125406742096, |
|
"learning_rate": 7.576382629067877e-05, |
|
"loss": 0.0134, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 6.984674329501916, |
|
"grad_norm": 0.37668049335479736, |
|
"learning_rate": 7.536819416593504e-05, |
|
"loss": 0.011, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 0.15798693895339966, |
|
"learning_rate": 7.497297206959746e-05, |
|
"loss": 0.0093, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 7.011494252873563, |
|
"grad_norm": 0.3846645653247833, |
|
"learning_rate": 7.457816658065134e-05, |
|
"loss": 0.0108, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 7.026819923371647, |
|
"grad_norm": 0.05968603119254112, |
|
"learning_rate": 7.41837842711468e-05, |
|
"loss": 0.0064, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 7.026819923371647, |
|
"eval_loss": 2.7342193126678467, |
|
"eval_runtime": 10.5281, |
|
"eval_samples_per_second": 9.498, |
|
"eval_steps_per_second": 4.749, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 7.042145593869732, |
|
"grad_norm": 0.05475788936018944, |
|
"learning_rate": 7.378983170608982e-05, |
|
"loss": 0.0054, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 7.057471264367816, |
|
"grad_norm": 0.055521685630083084, |
|
"learning_rate": 7.339631544333249e-05, |
|
"loss": 0.0057, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 7.0727969348659006, |
|
"grad_norm": 0.06325386464595795, |
|
"learning_rate": 7.300324203346431e-05, |
|
"loss": 0.0061, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 7.088122605363985, |
|
"grad_norm": 0.5059542655944824, |
|
"learning_rate": 7.261061801970277e-05, |
|
"loss": 0.0079, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 7.103448275862069, |
|
"grad_norm": 0.06388293951749802, |
|
"learning_rate": 7.221844993778464e-05, |
|
"loss": 0.0056, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 7.118773946360153, |
|
"grad_norm": 0.07516956329345703, |
|
"learning_rate": 7.182674431585704e-05, |
|
"loss": 0.006, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 7.134099616858237, |
|
"grad_norm": 0.14318601787090302, |
|
"learning_rate": 7.143550767436894e-05, |
|
"loss": 0.0067, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 7.149425287356322, |
|
"grad_norm": 0.1426093429327011, |
|
"learning_rate": 7.104474652596245e-05, |
|
"loss": 0.0079, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 7.164750957854406, |
|
"grad_norm": 0.05885975807905197, |
|
"learning_rate": 7.065446737536456e-05, |
|
"loss": 0.0055, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 7.180076628352491, |
|
"grad_norm": 0.06351395696401596, |
|
"learning_rate": 7.026467671927863e-05, |
|
"loss": 0.0059, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 7.195402298850575, |
|
"grad_norm": 0.0676102414727211, |
|
"learning_rate": 6.98753810462766e-05, |
|
"loss": 0.0062, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 7.210727969348659, |
|
"grad_norm": 0.07731365412473679, |
|
"learning_rate": 6.948658683669056e-05, |
|
"loss": 0.0058, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 7.226053639846743, |
|
"grad_norm": 0.06487540900707245, |
|
"learning_rate": 6.909830056250527e-05, |
|
"loss": 0.0061, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 7.241379310344827, |
|
"grad_norm": 0.09343966096639633, |
|
"learning_rate": 6.871052868725012e-05, |
|
"loss": 0.0062, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 7.256704980842912, |
|
"grad_norm": 0.1045990064740181, |
|
"learning_rate": 6.832327766589177e-05, |
|
"loss": 0.0063, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 7.272030651340996, |
|
"grad_norm": 0.05801545828580856, |
|
"learning_rate": 6.793655394472644e-05, |
|
"loss": 0.0057, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 7.287356321839081, |
|
"grad_norm": 0.06868793070316315, |
|
"learning_rate": 6.755036396127296e-05, |
|
"loss": 0.0059, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 7.287356321839081, |
|
"eval_loss": 2.8930225372314453, |
|
"eval_runtime": 10.5758, |
|
"eval_samples_per_second": 9.456, |
|
"eval_steps_per_second": 4.728, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 7.302681992337165, |
|
"grad_norm": 0.08218348026275635, |
|
"learning_rate": 6.716471414416519e-05, |
|
"loss": 0.0075, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 7.3180076628352495, |
|
"grad_norm": 0.08141635358333588, |
|
"learning_rate": 6.677961091304535e-05, |
|
"loss": 0.0061, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 7.333333333333333, |
|
"grad_norm": 0.05970093235373497, |
|
"learning_rate": 6.639506067845697e-05, |
|
"loss": 0.006, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 7.3486590038314175, |
|
"grad_norm": 0.07674306631088257, |
|
"learning_rate": 6.601106984173835e-05, |
|
"loss": 0.0058, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 7.363984674329502, |
|
"grad_norm": 0.07168275862932205, |
|
"learning_rate": 6.562764479491565e-05, |
|
"loss": 0.0054, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 7.379310344827586, |
|
"grad_norm": 0.06897211819887161, |
|
"learning_rate": 6.524479192059698e-05, |
|
"loss": 0.0059, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 7.394636015325671, |
|
"grad_norm": 0.5173123478889465, |
|
"learning_rate": 6.486251759186572e-05, |
|
"loss": 0.008, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 7.409961685823755, |
|
"grad_norm": 0.05815713480114937, |
|
"learning_rate": 6.448082817217471e-05, |
|
"loss": 0.0052, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 7.425287356321839, |
|
"grad_norm": 0.08304629474878311, |
|
"learning_rate": 6.409973001524012e-05, |
|
"loss": 0.0058, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 7.440613026819923, |
|
"grad_norm": 0.10966533422470093, |
|
"learning_rate": 6.371922946493591e-05, |
|
"loss": 0.0058, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 7.4559386973180075, |
|
"grad_norm": 0.06352514773607254, |
|
"learning_rate": 6.333933285518796e-05, |
|
"loss": 0.0054, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 7.471264367816092, |
|
"grad_norm": 0.16141043603420258, |
|
"learning_rate": 6.29600465098689e-05, |
|
"loss": 0.0106, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 7.486590038314176, |
|
"grad_norm": 0.06440207362174988, |
|
"learning_rate": 6.258137674269261e-05, |
|
"loss": 0.006, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 7.501915708812261, |
|
"grad_norm": 0.08629340678453445, |
|
"learning_rate": 6.220332985710936e-05, |
|
"loss": 0.0073, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 7.517241379310345, |
|
"grad_norm": 0.06371556222438812, |
|
"learning_rate": 6.182591214620057e-05, |
|
"loss": 0.006, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 7.53256704980843, |
|
"grad_norm": 0.08433310687541962, |
|
"learning_rate": 6.144912989257441e-05, |
|
"loss": 0.006, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 7.547892720306513, |
|
"grad_norm": 0.08213558048009872, |
|
"learning_rate": 6.107298936826086e-05, |
|
"loss": 0.0065, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 7.547892720306513, |
|
"eval_loss": 2.91325306892395, |
|
"eval_runtime": 10.6133, |
|
"eval_samples_per_second": 9.422, |
|
"eval_steps_per_second": 4.711, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 7.563218390804598, |
|
"grad_norm": 0.059887565672397614, |
|
"learning_rate": 6.069749683460765e-05, |
|
"loss": 0.0055, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 7.578544061302682, |
|
"grad_norm": 0.06606566160917282, |
|
"learning_rate": 6.0322658542175736e-05, |
|
"loss": 0.0045, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 7.593869731800766, |
|
"grad_norm": 0.076997309923172, |
|
"learning_rate": 5.994848073063551e-05, |
|
"loss": 0.0059, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 7.609195402298851, |
|
"grad_norm": 0.0730021744966507, |
|
"learning_rate": 5.957496962866262e-05, |
|
"loss": 0.0053, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 7.624521072796935, |
|
"grad_norm": 0.05936294421553612, |
|
"learning_rate": 5.920213145383466e-05, |
|
"loss": 0.0054, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 7.639846743295019, |
|
"grad_norm": 0.14003659784793854, |
|
"learning_rate": 5.8829972412527327e-05, |
|
"loss": 0.0073, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 7.655172413793103, |
|
"grad_norm": 0.05907728150486946, |
|
"learning_rate": 5.845849869981137e-05, |
|
"loss": 0.0042, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 7.670498084291188, |
|
"grad_norm": 0.057687729597091675, |
|
"learning_rate": 5.808771649934923e-05, |
|
"loss": 0.0052, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 7.685823754789272, |
|
"grad_norm": 0.09928648918867111, |
|
"learning_rate": 5.7717631983292375e-05, |
|
"loss": 0.0055, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 7.7011494252873565, |
|
"grad_norm": 0.07954944670200348, |
|
"learning_rate": 5.73482513121783e-05, |
|
"loss": 0.0057, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 7.716475095785441, |
|
"grad_norm": 0.06073677912354469, |
|
"learning_rate": 5.6979580634828125e-05, |
|
"loss": 0.0059, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 7.731800766283525, |
|
"grad_norm": 0.06618310511112213, |
|
"learning_rate": 5.6611626088244194e-05, |
|
"loss": 0.0056, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 7.747126436781609, |
|
"grad_norm": 0.06377172470092773, |
|
"learning_rate": 5.624439379750794e-05, |
|
"loss": 0.0053, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 7.762452107279693, |
|
"grad_norm": 0.06222354248166084, |
|
"learning_rate": 5.5877889875677845e-05, |
|
"loss": 0.0054, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 7.777777777777778, |
|
"grad_norm": 0.06755752861499786, |
|
"learning_rate": 5.551212042368792e-05, |
|
"loss": 0.0069, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 7.793103448275862, |
|
"grad_norm": 0.23886863887310028, |
|
"learning_rate": 5.514709153024571e-05, |
|
"loss": 0.007, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 7.8084291187739465, |
|
"grad_norm": 0.06176340579986572, |
|
"learning_rate": 5.478280927173145e-05, |
|
"loss": 0.0059, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 7.8084291187739465, |
|
"eval_loss": 2.921626091003418, |
|
"eval_runtime": 10.5435, |
|
"eval_samples_per_second": 9.485, |
|
"eval_steps_per_second": 4.742, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 7.823754789272031, |
|
"grad_norm": 0.056606221944093704, |
|
"learning_rate": 5.4419279712096437e-05, |
|
"loss": 0.0049, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 7.8390804597701145, |
|
"grad_norm": 0.06514956057071686, |
|
"learning_rate": 5.405650890276255e-05, |
|
"loss": 0.0061, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 7.854406130268199, |
|
"grad_norm": 0.05932604894042015, |
|
"learning_rate": 5.3694502882521125e-05, |
|
"loss": 0.0058, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 7.869731800766283, |
|
"grad_norm": 0.06986385583877563, |
|
"learning_rate": 5.333326767743263e-05, |
|
"loss": 0.0048, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 7.885057471264368, |
|
"grad_norm": 0.07194341719150543, |
|
"learning_rate": 5.297280930072632e-05, |
|
"loss": 0.0065, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 7.900383141762452, |
|
"grad_norm": 0.12007016688585281, |
|
"learning_rate": 5.261313375270014e-05, |
|
"loss": 0.0068, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 7.915708812260537, |
|
"grad_norm": 0.05479056015610695, |
|
"learning_rate": 5.2254247020620814e-05, |
|
"loss": 0.0052, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 7.931034482758621, |
|
"grad_norm": 0.18069668114185333, |
|
"learning_rate": 5.189615507862422e-05, |
|
"loss": 0.0077, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 7.946360153256705, |
|
"grad_norm": 0.08876926451921463, |
|
"learning_rate": 5.153886388761586e-05, |
|
"loss": 0.0063, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 7.961685823754789, |
|
"grad_norm": 0.05993456766009331, |
|
"learning_rate": 5.11823793951719e-05, |
|
"loss": 0.0048, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 7.977011494252873, |
|
"grad_norm": 0.05695677176117897, |
|
"learning_rate": 5.082670753543961e-05, |
|
"loss": 0.0049, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 7.992337164750958, |
|
"grad_norm": 0.0639839619398117, |
|
"learning_rate": 5.047185422903928e-05, |
|
"loss": 0.0054, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 8.007662835249041, |
|
"grad_norm": 0.1566697508096695, |
|
"learning_rate": 5.011782538296512e-05, |
|
"loss": 0.0103, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 8.022988505747126, |
|
"grad_norm": 0.0462418757379055, |
|
"learning_rate": 4.976462689048717e-05, |
|
"loss": 0.0043, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 8.03831417624521, |
|
"grad_norm": 0.046641357243061066, |
|
"learning_rate": 4.9412264631053216e-05, |
|
"loss": 0.0048, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 8.053639846743295, |
|
"grad_norm": 0.04404853284358978, |
|
"learning_rate": 4.9060744470190676e-05, |
|
"loss": 0.0044, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 8.068965517241379, |
|
"grad_norm": 0.053229521960020065, |
|
"learning_rate": 4.87100722594094e-05, |
|
"loss": 0.0058, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 8.068965517241379, |
|
"eval_loss": 2.9435019493103027, |
|
"eval_runtime": 10.5293, |
|
"eval_samples_per_second": 9.497, |
|
"eval_steps_per_second": 4.749, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 8.084291187739463, |
|
"grad_norm": 0.039271771907806396, |
|
"learning_rate": 4.836025383610382e-05, |
|
"loss": 0.0035, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 8.099616858237548, |
|
"grad_norm": 0.0491085946559906, |
|
"learning_rate": 4.801129502345605e-05, |
|
"loss": 0.0048, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 8.114942528735632, |
|
"grad_norm": 0.03886023536324501, |
|
"learning_rate": 4.7663201630338816e-05, |
|
"loss": 0.004, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 8.130268199233717, |
|
"grad_norm": 0.04504215344786644, |
|
"learning_rate": 4.7315979451218864e-05, |
|
"loss": 0.0047, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 8.145593869731801, |
|
"grad_norm": 0.05867081508040428, |
|
"learning_rate": 4.696963426606041e-05, |
|
"loss": 0.0058, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 8.160919540229886, |
|
"grad_norm": 0.0445120669901371, |
|
"learning_rate": 4.6624171840229e-05, |
|
"loss": 0.0043, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 8.17624521072797, |
|
"grad_norm": 0.05101229250431061, |
|
"learning_rate": 4.6279597924395436e-05, |
|
"loss": 0.0044, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 8.191570881226054, |
|
"grad_norm": 0.04617276415228844, |
|
"learning_rate": 4.593591825444028e-05, |
|
"loss": 0.0045, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 8.206896551724139, |
|
"grad_norm": 0.048301588743925095, |
|
"learning_rate": 4.559313855135795e-05, |
|
"loss": 0.0046, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 8.222222222222221, |
|
"grad_norm": 0.05069313570857048, |
|
"learning_rate": 4.5251264521162005e-05, |
|
"loss": 0.005, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 8.237547892720306, |
|
"grad_norm": 0.04811912775039673, |
|
"learning_rate": 4.491030185478976e-05, |
|
"loss": 0.0045, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 8.25287356321839, |
|
"grad_norm": 0.04650574177503586, |
|
"learning_rate": 4.457025622800771e-05, |
|
"loss": 0.0049, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 8.268199233716475, |
|
"grad_norm": 0.038902636617422104, |
|
"learning_rate": 4.423113330131707e-05, |
|
"loss": 0.0037, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 8.28352490421456, |
|
"grad_norm": 0.0576075054705143, |
|
"learning_rate": 4.389293871985949e-05, |
|
"loss": 0.0066, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 8.298850574712644, |
|
"grad_norm": 0.051424864679574966, |
|
"learning_rate": 4.355567811332311e-05, |
|
"loss": 0.0053, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 8.314176245210728, |
|
"grad_norm": 0.040568236261606216, |
|
"learning_rate": 4.3219357095848836e-05, |
|
"loss": 0.0038, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 8.329501915708812, |
|
"grad_norm": 0.051232922822237015, |
|
"learning_rate": 4.2883981265936876e-05, |
|
"loss": 0.0046, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 8.329501915708812, |
|
"eval_loss": 3.006831169128418, |
|
"eval_runtime": 10.5212, |
|
"eval_samples_per_second": 9.505, |
|
"eval_steps_per_second": 4.752, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 8.344827586206897, |
|
"grad_norm": 0.04653798043727875, |
|
"learning_rate": 4.25495562063537e-05, |
|
"loss": 0.0048, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 8.360153256704981, |
|
"grad_norm": 0.04423636198043823, |
|
"learning_rate": 4.2216087484038714e-05, |
|
"loss": 0.0038, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 8.375478927203066, |
|
"grad_norm": 0.04573935642838478, |
|
"learning_rate": 4.188358065001215e-05, |
|
"loss": 0.0045, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 8.39080459770115, |
|
"grad_norm": 0.044406238943338394, |
|
"learning_rate": 4.155204123928205e-05, |
|
"loss": 0.0041, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 8.406130268199234, |
|
"grad_norm": 0.044500816613435745, |
|
"learning_rate": 4.12214747707527e-05, |
|
"loss": 0.0044, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 8.421455938697317, |
|
"grad_norm": 0.039383914321660995, |
|
"learning_rate": 4.089188674713236e-05, |
|
"loss": 0.0038, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 8.436781609195402, |
|
"grad_norm": 0.04521704837679863, |
|
"learning_rate": 4.056328265484184e-05, |
|
"loss": 0.0046, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 8.452107279693486, |
|
"grad_norm": 0.047671083360910416, |
|
"learning_rate": 4.023566796392313e-05, |
|
"loss": 0.0042, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 8.46743295019157, |
|
"grad_norm": 0.04466583952307701, |
|
"learning_rate": 3.990904812794834e-05, |
|
"loss": 0.0043, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 8.482758620689655, |
|
"grad_norm": 0.05882612615823746, |
|
"learning_rate": 3.958342858392893e-05, |
|
"loss": 0.0059, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 8.49808429118774, |
|
"grad_norm": 0.048001233488321304, |
|
"learning_rate": 3.9258814752225284e-05, |
|
"loss": 0.0042, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 8.513409961685824, |
|
"grad_norm": 0.06287714838981628, |
|
"learning_rate": 3.893521203645618e-05, |
|
"loss": 0.0053, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 8.528735632183908, |
|
"grad_norm": 0.047715529799461365, |
|
"learning_rate": 3.8612625823409366e-05, |
|
"loss": 0.0041, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 8.544061302681992, |
|
"grad_norm": 0.05052071437239647, |
|
"learning_rate": 3.829106148295126e-05, |
|
"loss": 0.0046, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 8.559386973180077, |
|
"grad_norm": 0.24502001702785492, |
|
"learning_rate": 3.797052436793814e-05, |
|
"loss": 0.0066, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 8.574712643678161, |
|
"grad_norm": 0.046199604868888855, |
|
"learning_rate": 3.7651019814126654e-05, |
|
"loss": 0.0045, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 8.590038314176246, |
|
"grad_norm": 0.049519941210746765, |
|
"learning_rate": 3.7332553140085155e-05, |
|
"loss": 0.0051, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 8.590038314176246, |
|
"eval_loss": 3.0260815620422363, |
|
"eval_runtime": 10.5212, |
|
"eval_samples_per_second": 9.505, |
|
"eval_steps_per_second": 4.752, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 8.60536398467433, |
|
"grad_norm": 0.053081195801496506, |
|
"learning_rate": 3.701512964710513e-05, |
|
"loss": 0.0046, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 8.620689655172415, |
|
"grad_norm": 0.041760966181755066, |
|
"learning_rate": 3.669875461911297e-05, |
|
"loss": 0.0036, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 8.636015325670499, |
|
"grad_norm": 0.05594363436102867, |
|
"learning_rate": 3.638343332258203e-05, |
|
"loss": 0.0052, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 8.651340996168582, |
|
"grad_norm": 0.04741170257329941, |
|
"learning_rate": 3.606917100644488e-05, |
|
"loss": 0.0039, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 8.666666666666666, |
|
"grad_norm": 0.1333678662776947, |
|
"learning_rate": 3.5755972902005987e-05, |
|
"loss": 0.0048, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 8.68199233716475, |
|
"grad_norm": 0.060406796634197235, |
|
"learning_rate": 3.544384422285477e-05, |
|
"loss": 0.0056, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 8.697318007662835, |
|
"grad_norm": 0.04437935724854469, |
|
"learning_rate": 3.513279016477844e-05, |
|
"loss": 0.004, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 8.71264367816092, |
|
"grad_norm": 0.04306851327419281, |
|
"learning_rate": 3.4822815905675954e-05, |
|
"loss": 0.0043, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 8.727969348659004, |
|
"grad_norm": 0.049886684864759445, |
|
"learning_rate": 3.45139266054715e-05, |
|
"loss": 0.0054, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 8.743295019157088, |
|
"grad_norm": 0.039504941552877426, |
|
"learning_rate": 3.4206127406028745e-05, |
|
"loss": 0.0036, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 8.758620689655173, |
|
"grad_norm": 0.05250853672623634, |
|
"learning_rate": 3.389942343106522e-05, |
|
"loss": 0.0055, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 8.773946360153257, |
|
"grad_norm": 0.06467723846435547, |
|
"learning_rate": 3.359381978606701e-05, |
|
"loss": 0.0046, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 8.789272030651341, |
|
"grad_norm": 0.04862450435757637, |
|
"learning_rate": 3.328932155820377e-05, |
|
"loss": 0.0045, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 8.804597701149426, |
|
"grad_norm": 0.04701303318142891, |
|
"learning_rate": 3.298593381624406e-05, |
|
"loss": 0.0045, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 8.81992337164751, |
|
"grad_norm": 0.04837154597043991, |
|
"learning_rate": 3.2683661610470963e-05, |
|
"loss": 0.0039, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 8.835249042145595, |
|
"grad_norm": 0.04792990908026695, |
|
"learning_rate": 3.238250997259808e-05, |
|
"loss": 0.0041, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 8.850574712643677, |
|
"grad_norm": 0.04371470585465431, |
|
"learning_rate": 3.208248391568553e-05, |
|
"loss": 0.0044, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 8.850574712643677, |
|
"eval_loss": 3.0277657508850098, |
|
"eval_runtime": 10.5822, |
|
"eval_samples_per_second": 9.45, |
|
"eval_steps_per_second": 4.725, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 8.865900383141762, |
|
"grad_norm": 0.048086583614349365, |
|
"learning_rate": 3.178358843405684e-05, |
|
"loss": 0.0043, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 8.881226053639846, |
|
"grad_norm": 0.0496319979429245, |
|
"learning_rate": 3.1485828503215585e-05, |
|
"loss": 0.0047, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 8.89655172413793, |
|
"grad_norm": 0.05418609455227852, |
|
"learning_rate": 3.1189209079762607e-05, |
|
"loss": 0.0045, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 8.911877394636015, |
|
"grad_norm": 0.046972278505563736, |
|
"learning_rate": 3.089373510131354e-05, |
|
"loss": 0.0046, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 8.9272030651341, |
|
"grad_norm": 0.043504588305950165, |
|
"learning_rate": 3.0599411486416585e-05, |
|
"loss": 0.0039, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 8.942528735632184, |
|
"grad_norm": 0.05620258301496506, |
|
"learning_rate": 3.030624313447067e-05, |
|
"loss": 0.0048, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 8.957854406130268, |
|
"grad_norm": 0.05009399726986885, |
|
"learning_rate": 3.0014234925643837e-05, |
|
"loss": 0.0049, |
|
"step": 585 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 780, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 12, |
|
"save_steps": 65, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.74949251811115e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|