{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 6,
  "global_step": 78,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03,
      "grad_norm": 0.08957596868276596,
      "learning_rate": 2e-05,
      "loss": 1.0134,
      "step": 1
    },
    {
      "epoch": 0.03,
      "eval_loss": 1.0981205701828003,
      "eval_runtime": 2.5128,
      "eval_samples_per_second": 1.194,
      "eval_steps_per_second": 1.194,
      "step": 1
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.07771213352680206,
      "learning_rate": 4e-05,
      "loss": 0.9545,
      "step": 2
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.1224137470126152,
      "learning_rate": 6e-05,
      "loss": 1.1733,
      "step": 3
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.09190034121274948,
      "learning_rate": 8e-05,
      "loss": 0.9954,
      "step": 4
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.08263542503118515,
      "learning_rate": 0.0001,
      "loss": 0.9486,
      "step": 5
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.09250061959028244,
      "learning_rate": 0.00012,
      "loss": 0.972,
      "step": 6
    },
    {
      "epoch": 0.15,
      "eval_loss": 1.0735869407653809,
      "eval_runtime": 2.5357,
      "eval_samples_per_second": 1.183,
      "eval_steps_per_second": 1.183,
      "step": 6
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.1398034691810608,
      "learning_rate": 0.00014,
      "loss": 1.0445,
      "step": 7
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.0993918851017952,
      "learning_rate": 0.00016,
      "loss": 0.9169,
      "step": 8
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.07937725633382797,
      "learning_rate": 0.00018,
      "loss": 0.8462,
      "step": 9
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.10001373291015625,
      "learning_rate": 0.0002,
      "loss": 0.8708,
      "step": 10
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.1337287873029709,
      "learning_rate": 0.00019995690062269984,
      "loss": 0.86,
      "step": 11
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.11684636771678925,
      "learning_rate": 0.00019982763964192585,
      "loss": 0.7982,
      "step": 12
    },
    {
      "epoch": 0.31,
      "eval_loss": 0.8548387885093689,
      "eval_runtime": 2.5536,
      "eval_samples_per_second": 1.175,
      "eval_steps_per_second": 1.175,
      "step": 12
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.12103456258773804,
      "learning_rate": 0.0001996123284790336,
      "loss": 0.7906,
      "step": 13
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.1426106095314026,
      "learning_rate": 0.00019931115272956405,
      "loss": 0.7825,
      "step": 14
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.12367941439151764,
      "learning_rate": 0.0001989243720032624,
      "loss": 0.7341,
      "step": 15
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.10154826194047928,
      "learning_rate": 0.00019845231970029773,
      "loss": 0.7064,
      "step": 16
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.13628405332565308,
      "learning_rate": 0.0001978954027238763,
      "loss": 0.6988,
      "step": 17
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.11276472359895706,
      "learning_rate": 0.0001972541011294959,
      "loss": 0.6944,
      "step": 18
    },
    {
      "epoch": 0.46,
      "eval_loss": 0.7151015400886536,
      "eval_runtime": 2.5734,
      "eval_samples_per_second": 1.166,
      "eval_steps_per_second": 1.166,
      "step": 18
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.13381372392177582,
      "learning_rate": 0.00019652896771114414,
      "loss": 0.6956,
      "step": 19
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.11248588562011719,
      "learning_rate": 0.00019572062752479683,
      "loss": 0.7155,
      "step": 20
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.17762312293052673,
      "learning_rate": 0.00019482977734962753,
      "loss": 0.7357,
      "step": 21
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.10546916723251343,
      "learning_rate": 0.00019385718508739262,
      "loss": 0.6691,
      "step": 22
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.3150898516178131,
      "learning_rate": 0.00019280368910050942,
      "loss": 0.7167,
      "step": 23
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.13151158392429352,
      "learning_rate": 0.00019167019748939846,
      "loss": 0.6808,
      "step": 24
    },
    {
      "epoch": 0.62,
      "eval_loss": 0.6942548751831055,
      "eval_runtime": 2.5831,
      "eval_samples_per_second": 1.161,
      "eval_steps_per_second": 1.161,
      "step": 24
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.14906296133995056,
      "learning_rate": 0.00019045768730971196,
      "loss": 0.6762,
      "step": 25
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.19484123587608337,
      "learning_rate": 0.00018916720373012426,
      "loss": 0.6854,
      "step": 26
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.12819896638393402,
      "learning_rate": 0.00018779985913140924,
      "loss": 0.6873,
      "step": 27
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.21385614573955536,
      "learning_rate": 0.00018635683214758214,
      "loss": 0.6874,
      "step": 28
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.12286895513534546,
      "learning_rate": 0.0001848393666499315,
      "loss": 0.6843,
      "step": 29
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.08534862101078033,
      "learning_rate": 0.00018324877067481783,
      "loss": 0.6763,
      "step": 30
    },
    {
      "epoch": 0.77,
      "eval_loss": 0.6821426749229431,
      "eval_runtime": 2.5911,
      "eval_samples_per_second": 1.158,
      "eval_steps_per_second": 1.158,
      "step": 30
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.17990928888320923,
      "learning_rate": 0.0001815864152961624,
      "loss": 0.6789,
      "step": 31
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.12137839943170547,
      "learning_rate": 0.0001798537334435986,
      "loss": 0.6877,
      "step": 32
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.10240964591503143,
      "learning_rate": 0.00017805221866730458,
      "loss": 0.6725,
      "step": 33
    },
    {
      "epoch": 0.87,
      "grad_norm": 0.14333295822143555,
      "learning_rate": 0.00017618342385058145,
      "loss": 0.6745,
      "step": 34
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.0904482752084732,
      "learning_rate": 0.00017424895987128722,
      "loss": 0.6894,
      "step": 35
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.11753042787313461,
      "learning_rate": 0.00017225049421328023,
      "loss": 0.67,
      "step": 36
    },
    {
      "epoch": 0.92,
      "eval_loss": 0.6763580441474915,
      "eval_runtime": 2.5955,
      "eval_samples_per_second": 1.156,
      "eval_steps_per_second": 1.156,
      "step": 36
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.13719823956489563,
      "learning_rate": 0.00017018974952906884,
      "loss": 0.6589,
      "step": 37
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.1040361225605011,
      "learning_rate": 0.0001680685021549063,
      "loss": 0.666,
      "step": 38
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.07594098895788193,
      "learning_rate": 0.00016588858057961113,
      "loss": 0.645,
      "step": 39
    },
    {
      "epoch": 1.03,
      "grad_norm": 0.08139798045158386,
      "learning_rate": 0.0001636518638684325,
      "loss": 0.6542,
      "step": 40
    },
    {
      "epoch": 1.05,
      "grad_norm": 0.07313457876443863,
      "learning_rate": 0.0001613602800433194,
      "loss": 0.6458,
      "step": 41
    },
    {
      "epoch": 1.08,
      "grad_norm": 0.07903215289115906,
      "learning_rate": 0.00015901580442098968,
      "loss": 0.6424,
      "step": 42
    },
    {
      "epoch": 1.08,
      "eval_loss": 0.6730008125305176,
      "eval_runtime": 2.5989,
      "eval_samples_per_second": 1.154,
      "eval_steps_per_second": 1.154,
      "step": 42
    },
    {
      "epoch": 1.1,
      "grad_norm": 0.09322352707386017,
      "learning_rate": 0.00015662045791023173,
      "loss": 0.6567,
      "step": 43
    },
    {
      "epoch": 1.13,
      "grad_norm": 0.07249985635280609,
      "learning_rate": 0.00015417630526990615,
      "loss": 0.6384,
      "step": 44
    },
    {
      "epoch": 1.15,
      "grad_norm": 0.07686451077461243,
      "learning_rate": 0.0001516854533291494,
      "loss": 0.665,
      "step": 45
    },
    {
      "epoch": 1.18,
      "grad_norm": 0.07324113696813583,
      "learning_rate": 0.00014915004917131344,
      "loss": 0.6297,
      "step": 46
    },
    {
      "epoch": 1.21,
      "grad_norm": 0.09203895926475525,
      "learning_rate": 0.00014657227828320635,
      "loss": 0.6539,
      "step": 47
    },
    {
      "epoch": 1.23,
      "grad_norm": 0.09338624030351639,
      "learning_rate": 0.00014395436267123016,
      "loss": 0.6552,
      "step": 48
    },
    {
      "epoch": 1.23,
      "eval_loss": 0.6780009269714355,
      "eval_runtime": 2.6045,
      "eval_samples_per_second": 1.152,
      "eval_steps_per_second": 1.152,
      "step": 48
    },
    {
      "epoch": 1.26,
      "grad_norm": 0.0812142863869667,
      "learning_rate": 0.00014129855894603886,
      "loss": 0.6319,
      "step": 49
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.19316132366657257,
      "learning_rate": 0.00013860715637736818,
      "loss": 0.7,
      "step": 50
    },
    {
      "epoch": 1.31,
      "grad_norm": 0.10698059946298599,
      "learning_rate": 0.0001358824749207136,
      "loss": 0.6725,
      "step": 51
    },
    {
      "epoch": 1.33,
      "grad_norm": 0.14100198447704315,
      "learning_rate": 0.00013312686321755761,
      "loss": 0.6766,
      "step": 52
    },
    {
      "epoch": 1.36,
      "grad_norm": 0.09599179029464722,
      "learning_rate": 0.00013034269657086992,
      "loss": 0.645,
      "step": 53
    },
    {
      "epoch": 1.38,
      "grad_norm": 0.08999059349298477,
      "learning_rate": 0.000127532374897626,
      "loss": 0.6527,
      "step": 54
    },
    {
      "epoch": 1.38,
      "eval_loss": 0.6689873337745667,
      "eval_runtime": 2.6108,
      "eval_samples_per_second": 1.149,
      "eval_steps_per_second": 1.149,
      "step": 54
    },
    {
      "epoch": 1.41,
      "grad_norm": 0.13835830986499786,
      "learning_rate": 0.00012469832066010843,
      "loss": 0.6561,
      "step": 55
    },
    {
      "epoch": 1.44,
      "grad_norm": 0.10695886611938477,
      "learning_rate": 0.00012184297677777463,
      "loss": 0.6668,
      "step": 56
    },
    {
      "epoch": 1.46,
      "grad_norm": 0.0739368349313736,
      "learning_rate": 0.00011896880452149077,
      "loss": 0.643,
      "step": 57
    },
    {
      "epoch": 1.49,
      "grad_norm": 0.21791452169418335,
      "learning_rate": 0.00011607828139194683,
      "loss": 0.6768,
      "step": 58
    },
    {
      "epoch": 1.51,
      "grad_norm": 0.06241246312856674,
      "learning_rate": 0.00011317389898408189,
      "loss": 0.6252,
      "step": 59
    },
    {
      "epoch": 1.54,
      "grad_norm": 0.1302526593208313,
      "learning_rate": 0.00011025816083936036,
      "loss": 0.6624,
      "step": 60
    },
    {
      "epoch": 1.54,
      "eval_loss": 0.6632375121116638,
      "eval_runtime": 2.6043,
      "eval_samples_per_second": 1.152,
      "eval_steps_per_second": 1.152,
      "step": 60
    },
    {
      "epoch": 1.56,
      "grad_norm": 0.11702455580234528,
      "learning_rate": 0.0001073335802877504,
      "loss": 0.6522,
      "step": 61
    },
    {
      "epoch": 1.59,
      "grad_norm": 0.08904154598712921,
      "learning_rate": 0.00010440267828126478,
      "loss": 0.6472,
      "step": 62
    },
    {
      "epoch": 1.62,
      "grad_norm": 0.08021406084299088,
      "learning_rate": 0.00010146798122093166,
      "loss": 0.6279,
      "step": 63
    },
    {
      "epoch": 1.64,
      "grad_norm": 0.07384659349918365,
      "learning_rate": 9.853201877906836e-05,
      "loss": 0.6262,
      "step": 64
    },
    {
      "epoch": 1.67,
      "grad_norm": 0.06457240134477615,
      "learning_rate": 9.559732171873523e-05,
      "loss": 0.64,
      "step": 65
    },
    {
      "epoch": 1.69,
      "grad_norm": 0.07967618852853775,
      "learning_rate": 9.266641971224963e-05,
      "loss": 0.6228,
      "step": 66
    },
    {
      "epoch": 1.69,
      "eval_loss": 0.6625072360038757,
      "eval_runtime": 2.6047,
      "eval_samples_per_second": 1.152,
      "eval_steps_per_second": 1.152,
      "step": 66
    },
    {
      "epoch": 1.72,
      "grad_norm": 0.09555868804454803,
      "learning_rate": 8.974183916063968e-05,
      "loss": 0.635,
      "step": 67
    },
    {
      "epoch": 1.74,
      "grad_norm": 0.07187359035015106,
      "learning_rate": 8.682610101591814e-05,
      "loss": 0.6277,
      "step": 68
    },
    {
      "epoch": 1.77,
      "grad_norm": 0.091610848903656,
      "learning_rate": 8.392171860805319e-05,
      "loss": 0.6649,
      "step": 69
    },
    {
      "epoch": 1.79,
      "grad_norm": 0.065833680331707,
      "learning_rate": 8.103119547850924e-05,
      "loss": 0.6262,
      "step": 70
    },
    {
      "epoch": 1.82,
      "grad_norm": 0.09459354728460312,
      "learning_rate": 7.815702322222538e-05,
      "loss": 0.6359,
      "step": 71
    },
    {
      "epoch": 1.85,
      "grad_norm": 0.06780053675174713,
      "learning_rate": 7.530167933989161e-05,
      "loss": 0.6447,
      "step": 72
    },
    {
      "epoch": 1.85,
      "eval_loss": 0.6616933941841125,
      "eval_runtime": 2.607,
      "eval_samples_per_second": 1.151,
      "eval_steps_per_second": 1.151,
      "step": 72
    },
    {
      "epoch": 1.87,
      "grad_norm": 0.0954224094748497,
      "learning_rate": 7.246762510237403e-05,
      "loss": 0.6636,
      "step": 73
    },
    {
      "epoch": 1.9,
      "grad_norm": 0.0937703400850296,
      "learning_rate": 6.96573034291301e-05,
      "loss": 0.6381,
      "step": 74
    },
    {
      "epoch": 1.92,
      "grad_norm": 0.10935033112764359,
      "learning_rate": 6.687313678244242e-05,
      "loss": 0.628,
      "step": 75
    },
    {
      "epoch": 1.95,
      "grad_norm": 0.08154003322124481,
      "learning_rate": 6.411752507928642e-05,
      "loss": 0.6386,
      "step": 76
    },
    {
      "epoch": 1.97,
      "grad_norm": 0.12196218967437744,
      "learning_rate": 6.139284362263185e-05,
      "loss": 0.6317,
      "step": 77
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.11538293212652206,
      "learning_rate": 5.870144105396118e-05,
      "loss": 0.6409,
      "step": 78
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.6598871350288391,
      "eval_runtime": 2.6073,
      "eval_samples_per_second": 1.151,
      "eval_steps_per_second": 1.151,
      "step": 78
    }
  ],
  "logging_steps": 1,
  "max_steps": 117,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "total_flos": 1.2819075742826496e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}