|
{ |
|
"best_metric": 4.583011150360107, |
|
"best_model_checkpoint": "/home/p318482/babyLM_controlled/models_trained/de_clm/childes_30/checkpoint-32000", |
|
"epoch": 54.507337526205454, |
|
"eval_steps": 2000, |
|
"global_step": 52000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 2.0964360587002098, |
|
"eval_loss": 7.102903366088867, |
|
"eval_runtime": 0.9708, |
|
"eval_samples_per_second": 1416.286, |
|
"eval_steps_per_second": 88.582, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 4.1928721174004195, |
|
"grad_norm": 1.3964662551879883, |
|
"learning_rate": 1e-05, |
|
"loss": 6.9987, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 4.1928721174004195, |
|
"eval_loss": 5.884151935577393, |
|
"eval_runtime": 0.966, |
|
"eval_samples_per_second": 1423.408, |
|
"eval_steps_per_second": 89.028, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 6.289308176100629, |
|
"eval_loss": 5.54873514175415, |
|
"eval_runtime": 0.9657, |
|
"eval_samples_per_second": 1423.84, |
|
"eval_steps_per_second": 89.055, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 8.385744234800839, |
|
"grad_norm": 2.7172107696533203, |
|
"learning_rate": 2e-05, |
|
"loss": 5.2204, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 8.385744234800839, |
|
"eval_loss": 5.2793288230896, |
|
"eval_runtime": 0.9644, |
|
"eval_samples_per_second": 1425.779, |
|
"eval_steps_per_second": 89.176, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 10.482180293501049, |
|
"eval_loss": 5.10486364364624, |
|
"eval_runtime": 0.9641, |
|
"eval_samples_per_second": 1426.204, |
|
"eval_steps_per_second": 89.203, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 12.578616352201259, |
|
"grad_norm": 2.500443458557129, |
|
"learning_rate": 2.99925e-05, |
|
"loss": 4.7358, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 12.578616352201259, |
|
"eval_loss": 4.983631134033203, |
|
"eval_runtime": 0.9644, |
|
"eval_samples_per_second": 1425.809, |
|
"eval_steps_per_second": 89.178, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 14.675052410901468, |
|
"eval_loss": 4.882917404174805, |
|
"eval_runtime": 0.9686, |
|
"eval_samples_per_second": 1419.612, |
|
"eval_steps_per_second": 88.79, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 16.771488469601678, |
|
"grad_norm": 2.400749444961548, |
|
"learning_rate": 3.999e-05, |
|
"loss": 4.4216, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 16.771488469601678, |
|
"eval_loss": 4.802889823913574, |
|
"eval_runtime": 0.9763, |
|
"eval_samples_per_second": 1408.393, |
|
"eval_steps_per_second": 88.089, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 18.867924528301888, |
|
"eval_loss": 4.74226188659668, |
|
"eval_runtime": 0.976, |
|
"eval_samples_per_second": 1408.882, |
|
"eval_steps_per_second": 88.119, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 20.964360587002098, |
|
"grad_norm": 2.2613022327423096, |
|
"learning_rate": 4.9985e-05, |
|
"loss": 4.1842, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 20.964360587002098, |
|
"eval_loss": 4.690371513366699, |
|
"eval_runtime": 0.9669, |
|
"eval_samples_per_second": 1422.037, |
|
"eval_steps_per_second": 88.942, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 23.060796645702307, |
|
"eval_loss": 4.645771503448486, |
|
"eval_runtime": 0.966, |
|
"eval_samples_per_second": 1423.404, |
|
"eval_steps_per_second": 89.027, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 25.157232704402517, |
|
"grad_norm": 2.2588465213775635, |
|
"learning_rate": 5.9980000000000005e-05, |
|
"loss": 3.9858, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 25.157232704402517, |
|
"eval_loss": 4.623382568359375, |
|
"eval_runtime": 0.9666, |
|
"eval_samples_per_second": 1422.538, |
|
"eval_steps_per_second": 88.973, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 27.253668763102727, |
|
"eval_loss": 4.6056084632873535, |
|
"eval_runtime": 0.9825, |
|
"eval_samples_per_second": 1399.422, |
|
"eval_steps_per_second": 87.527, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 29.350104821802937, |
|
"grad_norm": 2.1845760345458984, |
|
"learning_rate": 6.997500000000001e-05, |
|
"loss": 3.8189, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 29.350104821802937, |
|
"eval_loss": 4.590851783752441, |
|
"eval_runtime": 1.0367, |
|
"eval_samples_per_second": 1326.293, |
|
"eval_steps_per_second": 82.954, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 31.446540880503143, |
|
"eval_loss": 4.586838245391846, |
|
"eval_runtime": 0.9705, |
|
"eval_samples_per_second": 1416.743, |
|
"eval_steps_per_second": 88.611, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 33.542976939203356, |
|
"grad_norm": 2.2118289470672607, |
|
"learning_rate": 7.997e-05, |
|
"loss": 3.6763, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 33.542976939203356, |
|
"eval_loss": 4.583011150360107, |
|
"eval_runtime": 0.9706, |
|
"eval_samples_per_second": 1416.604, |
|
"eval_steps_per_second": 88.602, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 35.63941299790356, |
|
"eval_loss": 4.57816743850708, |
|
"eval_runtime": 0.9657, |
|
"eval_samples_per_second": 1423.818, |
|
"eval_steps_per_second": 89.053, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 37.735849056603776, |
|
"grad_norm": 2.189404010772705, |
|
"learning_rate": 8.9965e-05, |
|
"loss": 3.5493, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 37.735849056603776, |
|
"eval_loss": 4.585381031036377, |
|
"eval_runtime": 0.9726, |
|
"eval_samples_per_second": 1413.783, |
|
"eval_steps_per_second": 88.426, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 39.83228511530398, |
|
"eval_loss": 4.596414566040039, |
|
"eval_runtime": 0.971, |
|
"eval_samples_per_second": 1416.025, |
|
"eval_steps_per_second": 88.566, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 41.928721174004195, |
|
"grad_norm": 2.1832594871520996, |
|
"learning_rate": 9.996000000000001e-05, |
|
"loss": 3.4327, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 41.928721174004195, |
|
"eval_loss": 4.610367774963379, |
|
"eval_runtime": 0.9687, |
|
"eval_samples_per_second": 1419.473, |
|
"eval_steps_per_second": 88.782, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 44.0251572327044, |
|
"eval_loss": 4.636893272399902, |
|
"eval_runtime": 0.9651, |
|
"eval_samples_per_second": 1424.742, |
|
"eval_steps_per_second": 89.111, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 46.121593291404615, |
|
"grad_norm": 2.1686387062072754, |
|
"learning_rate": 9.336333333333334e-05, |
|
"loss": 3.3112, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 46.121593291404615, |
|
"eval_loss": 4.66969108581543, |
|
"eval_runtime": 0.9642, |
|
"eval_samples_per_second": 1426.082, |
|
"eval_steps_per_second": 89.195, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 48.21802935010482, |
|
"eval_loss": 4.69525146484375, |
|
"eval_runtime": 0.966, |
|
"eval_samples_per_second": 1423.408, |
|
"eval_steps_per_second": 89.028, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 50.314465408805034, |
|
"grad_norm": 2.3040497303009033, |
|
"learning_rate": 8.67e-05, |
|
"loss": 3.1908, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 50.314465408805034, |
|
"eval_loss": 4.727965831756592, |
|
"eval_runtime": 0.9829, |
|
"eval_samples_per_second": 1398.863, |
|
"eval_steps_per_second": 87.493, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 52.41090146750524, |
|
"eval_loss": 4.762918472290039, |
|
"eval_runtime": 1.0048, |
|
"eval_samples_per_second": 1368.397, |
|
"eval_steps_per_second": 85.587, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 54.507337526205454, |
|
"grad_norm": 2.491647720336914, |
|
"learning_rate": 8.003666666666667e-05, |
|
"loss": 3.0857, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 54.507337526205454, |
|
"eval_loss": 4.792750835418701, |
|
"eval_runtime": 0.9654, |
|
"eval_samples_per_second": 1424.262, |
|
"eval_steps_per_second": 89.081, |
|
"step": 52000 |
|
} |
|
], |
|
"logging_steps": 4000, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 105, |
|
"save_steps": 4000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.3445443431727104e+16, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|