|
{ |
|
"best_metric": 1.0747156143188477, |
|
"best_model_checkpoint": "/root/finetuning_executions/finetuning_01_codet5p_src_fm_fc_dctx/checkpoint-17548", |
|
"epoch": 5.0, |
|
"eval_steps": 500, |
|
"global_step": 87740, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.5826328992843628, |
|
"learning_rate": 2.4750000000000002e-05, |
|
"loss": 1.404, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.5471534729003906, |
|
"learning_rate": 4.975e-05, |
|
"loss": 1.1707, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.150067687034607, |
|
"learning_rate": 4.977225672877847e-05, |
|
"loss": 1.1319, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.197001338005066, |
|
"learning_rate": 4.9542213020473895e-05, |
|
"loss": 1.1168, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.0678050518035889, |
|
"learning_rate": 4.931216931216932e-05, |
|
"loss": 1.098, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.196992039680481, |
|
"learning_rate": 4.9082125603864734e-05, |
|
"loss": 1.0811, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.0403153896331787, |
|
"learning_rate": 4.885208189556016e-05, |
|
"loss": 1.0642, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.290574073791504, |
|
"learning_rate": 4.862203818725558e-05, |
|
"loss": 1.0573, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.1538596153259277, |
|
"learning_rate": 4.8391994478951e-05, |
|
"loss": 1.0422, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.1622496843338013, |
|
"learning_rate": 4.8161950770646426e-05, |
|
"loss": 1.0302, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.2310576438903809, |
|
"learning_rate": 4.793190706234185e-05, |
|
"loss": 1.0327, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.7041934728622437, |
|
"learning_rate": 4.770186335403727e-05, |
|
"loss": 1.016, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.2586933374404907, |
|
"learning_rate": 4.747181964573269e-05, |
|
"loss": 1.022, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.9497622847557068, |
|
"learning_rate": 4.724177593742811e-05, |
|
"loss": 1.0008, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.1111699342727661, |
|
"learning_rate": 4.7011732229123534e-05, |
|
"loss": 0.9878, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.177351951599121, |
|
"learning_rate": 4.678168852081896e-05, |
|
"loss": 0.9804, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.1454102993011475, |
|
"learning_rate": 4.655164481251438e-05, |
|
"loss": 0.9739, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.0769121646881104, |
|
"learning_rate": 4.63216011042098e-05, |
|
"loss": 0.9873, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.2958855628967285, |
|
"learning_rate": 4.609155739590522e-05, |
|
"loss": 0.9615, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.1914528608322144, |
|
"learning_rate": 4.586151368760065e-05, |
|
"loss": 0.9637, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.036251425743103, |
|
"learning_rate": 4.563146997929607e-05, |
|
"loss": 0.9651, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.2706526517868042, |
|
"learning_rate": 4.5401426270991495e-05, |
|
"loss": 0.9538, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.1763725280761719, |
|
"learning_rate": 4.517138256268692e-05, |
|
"loss": 0.9481, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.0269743204116821, |
|
"learning_rate": 4.4941338854382334e-05, |
|
"loss": 0.9406, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.1849923133850098, |
|
"learning_rate": 4.471129514607776e-05, |
|
"loss": 0.9325, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.289588451385498, |
|
"learning_rate": 4.448125143777318e-05, |
|
"loss": 0.9198, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.0682235956192017, |
|
"learning_rate": 4.42512077294686e-05, |
|
"loss": 0.938, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.0467722415924072, |
|
"learning_rate": 4.4021164021164026e-05, |
|
"loss": 0.9191, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.0143468379974365, |
|
"learning_rate": 4.379112031285945e-05, |
|
"loss": 0.9193, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.0179190635681152, |
|
"learning_rate": 4.3561076604554865e-05, |
|
"loss": 0.9109, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.0898582935333252, |
|
"learning_rate": 4.333103289625029e-05, |
|
"loss": 0.9173, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.0596779584884644, |
|
"learning_rate": 4.310098918794571e-05, |
|
"loss": 0.9236, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.1058505773544312, |
|
"learning_rate": 4.2870945479641134e-05, |
|
"loss": 0.9006, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.2219908237457275, |
|
"learning_rate": 4.264090177133656e-05, |
|
"loss": 0.9, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.9692860245704651, |
|
"learning_rate": 4.241143317230274e-05, |
|
"loss": 0.9014, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.1939465999603271, |
|
"learning_rate": 4.218138946399816e-05, |
|
"loss": 0.8905, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.2843443155288696, |
|
"learning_rate": 4.1951345755693586e-05, |
|
"loss": 0.8988, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.074813723564148, |
|
"learning_rate": 4.1721302047389e-05, |
|
"loss": 0.8979, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.9322890043258667, |
|
"learning_rate": 4.1491258339084425e-05, |
|
"loss": 0.8782, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.005053162574768, |
|
"learning_rate": 4.126121463077985e-05, |
|
"loss": 0.8735, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.0346530675888062, |
|
"learning_rate": 4.103117092247527e-05, |
|
"loss": 0.884, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.113724946975708, |
|
"learning_rate": 4.0801127214170694e-05, |
|
"loss": 0.8765, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.9634864926338196, |
|
"learning_rate": 4.0571083505866117e-05, |
|
"loss": 0.8734, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.0747156143188477, |
|
"eval_runtime": 239.6824, |
|
"eval_samples_per_second": 251.479, |
|
"eval_steps_per_second": 3.93, |
|
"step": 17548 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.9827917218208313, |
|
"learning_rate": 4.034103979756153e-05, |
|
"loss": 0.8706, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 1.1454176902770996, |
|
"learning_rate": 4.0110996089256956e-05, |
|
"loss": 0.825, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.9637428522109985, |
|
"learning_rate": 3.988152749022314e-05, |
|
"loss": 0.829, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 1.0976300239562988, |
|
"learning_rate": 3.965148378191857e-05, |
|
"loss": 0.8276, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 1.104611873626709, |
|
"learning_rate": 3.9422015182884744e-05, |
|
"loss": 0.8206, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 1.373989224433899, |
|
"learning_rate": 3.9193121693121694e-05, |
|
"loss": 0.8225, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 1.0706731081008911, |
|
"learning_rate": 3.896307798481712e-05, |
|
"loss": 0.8084, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 1.1869192123413086, |
|
"learning_rate": 3.873303427651253e-05, |
|
"loss": 0.8222, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 1.2390427589416504, |
|
"learning_rate": 3.850299056820796e-05, |
|
"loss": 0.8231, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 1.0254757404327393, |
|
"learning_rate": 3.8272946859903386e-05, |
|
"loss": 0.8039, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 1.2318751811981201, |
|
"learning_rate": 3.804290315159881e-05, |
|
"loss": 0.8122, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 1.0611391067504883, |
|
"learning_rate": 3.781285944329423e-05, |
|
"loss": 0.8054, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 1.1104662418365479, |
|
"learning_rate": 3.7582815734989655e-05, |
|
"loss": 0.8065, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.9983170032501221, |
|
"learning_rate": 3.735277202668507e-05, |
|
"loss": 0.8058, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 1.5960198640823364, |
|
"learning_rate": 3.7122728318380494e-05, |
|
"loss": 0.8135, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 1.1167716979980469, |
|
"learning_rate": 3.689268461007592e-05, |
|
"loss": 0.7942, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 1.1541776657104492, |
|
"learning_rate": 3.666264090177134e-05, |
|
"loss": 0.7997, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 1.0213623046875, |
|
"learning_rate": 3.643259719346676e-05, |
|
"loss": 0.8009, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 1.2691611051559448, |
|
"learning_rate": 3.6202553485162186e-05, |
|
"loss": 0.801, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 1.0916258096694946, |
|
"learning_rate": 3.59725097768576e-05, |
|
"loss": 0.7868, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 1.2260217666625977, |
|
"learning_rate": 3.5742466068553025e-05, |
|
"loss": 0.7905, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 1.0053335428237915, |
|
"learning_rate": 3.5513572578789974e-05, |
|
"loss": 0.7976, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 1.2022624015808105, |
|
"learning_rate": 3.528352887048539e-05, |
|
"loss": 0.7821, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 1.7201203107833862, |
|
"learning_rate": 3.5053485162180814e-05, |
|
"loss": 0.7812, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 1.288846731185913, |
|
"learning_rate": 3.482344145387624e-05, |
|
"loss": 0.7773, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.9436947107315063, |
|
"learning_rate": 3.459339774557166e-05, |
|
"loss": 0.7816, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 1.2146368026733398, |
|
"learning_rate": 3.436335403726708e-05, |
|
"loss": 0.7879, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 1.2685799598693848, |
|
"learning_rate": 3.4133310328962506e-05, |
|
"loss": 0.7851, |
|
"step": 28400 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 1.0855278968811035, |
|
"learning_rate": 3.390326662065793e-05, |
|
"loss": 0.7878, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 1.039978265762329, |
|
"learning_rate": 3.367379802162411e-05, |
|
"loss": 0.7842, |
|
"step": 29200 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 1.144707202911377, |
|
"learning_rate": 3.344375431331953e-05, |
|
"loss": 0.7835, |
|
"step": 29600 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 1.112640380859375, |
|
"learning_rate": 3.321371060501495e-05, |
|
"loss": 0.7787, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.9749494194984436, |
|
"learning_rate": 3.2983666896710374e-05, |
|
"loss": 0.7665, |
|
"step": 30400 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 1.2835110425949097, |
|
"learning_rate": 3.2753623188405796e-05, |
|
"loss": 0.7778, |
|
"step": 30800 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 1.1826539039611816, |
|
"learning_rate": 3.252357948010122e-05, |
|
"loss": 0.7702, |
|
"step": 31200 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 1.0026845932006836, |
|
"learning_rate": 3.229353577179664e-05, |
|
"loss": 0.7807, |
|
"step": 31600 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 1.314173698425293, |
|
"learning_rate": 3.2063492063492065e-05, |
|
"loss": 0.7693, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 1.2866084575653076, |
|
"learning_rate": 3.183344835518749e-05, |
|
"loss": 0.7672, |
|
"step": 32400 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 1.2260661125183105, |
|
"learning_rate": 3.160340464688291e-05, |
|
"loss": 0.7679, |
|
"step": 32800 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 1.05460524559021, |
|
"learning_rate": 3.1373360938578334e-05, |
|
"loss": 0.7678, |
|
"step": 33200 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 1.020358920097351, |
|
"learning_rate": 3.114331723027376e-05, |
|
"loss": 0.7543, |
|
"step": 33600 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 1.2354198694229126, |
|
"learning_rate": 3.091327352196918e-05, |
|
"loss": 0.7627, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 1.1465234756469727, |
|
"learning_rate": 3.0683229813664596e-05, |
|
"loss": 0.7575, |
|
"step": 34400 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 1.2132583856582642, |
|
"learning_rate": 3.045318610536002e-05, |
|
"loss": 0.7667, |
|
"step": 34800 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.0832360982894897, |
|
"eval_runtime": 239.646, |
|
"eval_samples_per_second": 251.517, |
|
"eval_steps_per_second": 3.931, |
|
"step": 35096 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 1.150380253791809, |
|
"learning_rate": 3.0223142397055442e-05, |
|
"loss": 0.7454, |
|
"step": 35200 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 1.1517704725265503, |
|
"learning_rate": 2.9993098688750865e-05, |
|
"loss": 0.7151, |
|
"step": 35600 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 1.2216242551803589, |
|
"learning_rate": 2.9763054980446285e-05, |
|
"loss": 0.7085, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 1.1124303340911865, |
|
"learning_rate": 2.9533011272141708e-05, |
|
"loss": 0.7112, |
|
"step": 36400 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 1.2349203824996948, |
|
"learning_rate": 2.930296756383713e-05, |
|
"loss": 0.7028, |
|
"step": 36800 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 1.1383484601974487, |
|
"learning_rate": 2.9073498964803314e-05, |
|
"loss": 0.7043, |
|
"step": 37200 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 1.156599521636963, |
|
"learning_rate": 2.8843455256498737e-05, |
|
"loss": 0.7171, |
|
"step": 37600 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 1.2010080814361572, |
|
"learning_rate": 2.8613411548194156e-05, |
|
"loss": 0.7131, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 1.0973454713821411, |
|
"learning_rate": 2.838336783988958e-05, |
|
"loss": 0.7107, |
|
"step": 38400 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 1.1451126337051392, |
|
"learning_rate": 2.8153324131585002e-05, |
|
"loss": 0.7144, |
|
"step": 38800 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 1.3358917236328125, |
|
"learning_rate": 2.7923855532551185e-05, |
|
"loss": 0.7095, |
|
"step": 39200 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 1.142595887184143, |
|
"learning_rate": 2.7694386933517368e-05, |
|
"loss": 0.7057, |
|
"step": 39600 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 1.0060242414474487, |
|
"learning_rate": 2.746434322521279e-05, |
|
"loss": 0.7015, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 0.9941455721855164, |
|
"learning_rate": 2.7234299516908214e-05, |
|
"loss": 0.7132, |
|
"step": 40400 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 1.0412284135818481, |
|
"learning_rate": 2.7004255808603634e-05, |
|
"loss": 0.705, |
|
"step": 40800 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 1.099241852760315, |
|
"learning_rate": 2.6774212100299057e-05, |
|
"loss": 0.7084, |
|
"step": 41200 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 1.1212828159332275, |
|
"learning_rate": 2.654474350126524e-05, |
|
"loss": 0.6978, |
|
"step": 41600 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 1.0526177883148193, |
|
"learning_rate": 2.6315274902231422e-05, |
|
"loss": 0.7004, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 1.1355630159378052, |
|
"learning_rate": 2.6085231193926845e-05, |
|
"loss": 0.7066, |
|
"step": 42400 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 1.1076544523239136, |
|
"learning_rate": 2.5855187485622268e-05, |
|
"loss": 0.7059, |
|
"step": 42800 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 0.9984206557273865, |
|
"learning_rate": 2.562514377731769e-05, |
|
"loss": 0.6959, |
|
"step": 43200 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 1.267728567123413, |
|
"learning_rate": 2.5395675178283874e-05, |
|
"loss": 0.7109, |
|
"step": 43600 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 1.010473370552063, |
|
"learning_rate": 2.5165631469979294e-05, |
|
"loss": 0.707, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 1.3132517337799072, |
|
"learning_rate": 2.493616287094548e-05, |
|
"loss": 0.7049, |
|
"step": 44400 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 1.0806896686553955, |
|
"learning_rate": 2.4706119162640903e-05, |
|
"loss": 0.7032, |
|
"step": 44800 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 1.0918660163879395, |
|
"learning_rate": 2.4476075454336326e-05, |
|
"loss": 0.6876, |
|
"step": 45200 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 1.172402024269104, |
|
"learning_rate": 2.424603174603175e-05, |
|
"loss": 0.697, |
|
"step": 45600 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 1.3677465915679932, |
|
"learning_rate": 2.4015988037727168e-05, |
|
"loss": 0.694, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.9764865636825562, |
|
"learning_rate": 2.378594432942259e-05, |
|
"loss": 0.692, |
|
"step": 46400 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 1.168467402458191, |
|
"learning_rate": 2.3555900621118014e-05, |
|
"loss": 0.6907, |
|
"step": 46800 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 1.0712902545928955, |
|
"learning_rate": 2.3325856912813434e-05, |
|
"loss": 0.6969, |
|
"step": 47200 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 1.2307454347610474, |
|
"learning_rate": 2.3095813204508857e-05, |
|
"loss": 0.6912, |
|
"step": 47600 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 1.0218665599822998, |
|
"learning_rate": 2.286576949620428e-05, |
|
"loss": 0.6977, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 1.2914952039718628, |
|
"learning_rate": 2.2635725787899703e-05, |
|
"loss": 0.6902, |
|
"step": 48400 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 1.1566945314407349, |
|
"learning_rate": 2.2405682079595126e-05, |
|
"loss": 0.6812, |
|
"step": 48800 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 1.0954487323760986, |
|
"learning_rate": 2.2175638371290545e-05, |
|
"loss": 0.6884, |
|
"step": 49200 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 1.2148932218551636, |
|
"learning_rate": 2.1945594662985968e-05, |
|
"loss": 0.6815, |
|
"step": 49600 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 1.3392747640609741, |
|
"learning_rate": 2.171555095468139e-05, |
|
"loss": 0.68, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 1.1132878065109253, |
|
"learning_rate": 2.1485507246376814e-05, |
|
"loss": 0.6985, |
|
"step": 50400 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 1.1941944360733032, |
|
"learning_rate": 2.1255463538072234e-05, |
|
"loss": 0.6888, |
|
"step": 50800 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 1.3875670433044434, |
|
"learning_rate": 2.1025994939038417e-05, |
|
"loss": 0.689, |
|
"step": 51200 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 1.1829801797866821, |
|
"learning_rate": 2.079595123073384e-05, |
|
"loss": 0.6913, |
|
"step": 51600 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 1.1661145687103271, |
|
"learning_rate": 2.0565907522429263e-05, |
|
"loss": 0.6889, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 1.4086240530014038, |
|
"learning_rate": 2.0335863814124682e-05, |
|
"loss": 0.6835, |
|
"step": 52400 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 1.08446204662323, |
|
"eval_runtime": 239.6214, |
|
"eval_samples_per_second": 251.543, |
|
"eval_steps_per_second": 3.931, |
|
"step": 52644 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 1.4694873094558716, |
|
"learning_rate": 2.0105820105820105e-05, |
|
"loss": 0.6682, |
|
"step": 52800 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 1.2044637203216553, |
|
"learning_rate": 1.9875776397515528e-05, |
|
"loss": 0.6507, |
|
"step": 53200 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 1.100796103477478, |
|
"learning_rate": 1.964573268921095e-05, |
|
"loss": 0.6401, |
|
"step": 53600 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 1.0444916486740112, |
|
"learning_rate": 1.9415688980906374e-05, |
|
"loss": 0.6449, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 1.0386139154434204, |
|
"learning_rate": 1.918679549114332e-05, |
|
"loss": 0.6501, |
|
"step": 54400 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 1.1518653631210327, |
|
"learning_rate": 1.895675178283874e-05, |
|
"loss": 0.6461, |
|
"step": 54800 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 1.1994818449020386, |
|
"learning_rate": 1.8726708074534163e-05, |
|
"loss": 0.6556, |
|
"step": 55200 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 1.1442828178405762, |
|
"learning_rate": 1.8496664366229586e-05, |
|
"loss": 0.6361, |
|
"step": 55600 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 1.0276410579681396, |
|
"learning_rate": 1.826662065792501e-05, |
|
"loss": 0.6378, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 1.1680177450180054, |
|
"learning_rate": 1.8036576949620428e-05, |
|
"loss": 0.6439, |
|
"step": 56400 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 1.1519464254379272, |
|
"learning_rate": 1.780653324131585e-05, |
|
"loss": 0.6466, |
|
"step": 56800 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 1.1728110313415527, |
|
"learning_rate": 1.7576489533011274e-05, |
|
"loss": 0.6419, |
|
"step": 57200 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 1.1479480266571045, |
|
"learning_rate": 1.7347020933977457e-05, |
|
"loss": 0.6398, |
|
"step": 57600 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 1.3310575485229492, |
|
"learning_rate": 1.7116977225672877e-05, |
|
"loss": 0.6407, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 1.3201924562454224, |
|
"learning_rate": 1.68869335173683e-05, |
|
"loss": 0.643, |
|
"step": 58400 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 1.1604301929473877, |
|
"learning_rate": 1.6656889809063723e-05, |
|
"loss": 0.638, |
|
"step": 58800 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 1.0410959720611572, |
|
"learning_rate": 1.6426846100759146e-05, |
|
"loss": 0.6418, |
|
"step": 59200 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 1.3607354164123535, |
|
"learning_rate": 1.6196802392454565e-05, |
|
"loss": 0.6486, |
|
"step": 59600 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 1.0975886583328247, |
|
"learning_rate": 1.596675868414999e-05, |
|
"loss": 0.6309, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 1.1079365015029907, |
|
"learning_rate": 1.573671497584541e-05, |
|
"loss": 0.6362, |
|
"step": 60400 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 1.1985440254211426, |
|
"learning_rate": 1.5506671267540834e-05, |
|
"loss": 0.6402, |
|
"step": 60800 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 0.9779540300369263, |
|
"learning_rate": 1.5276627559236257e-05, |
|
"loss": 0.6441, |
|
"step": 61200 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 1.2140247821807861, |
|
"learning_rate": 1.5046583850931678e-05, |
|
"loss": 0.6393, |
|
"step": 61600 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"grad_norm": 1.3921316862106323, |
|
"learning_rate": 1.48165401426271e-05, |
|
"loss": 0.6407, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 1.286074161529541, |
|
"learning_rate": 1.4587071543593284e-05, |
|
"loss": 0.6447, |
|
"step": 62400 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 1.2328264713287354, |
|
"learning_rate": 1.4357027835288705e-05, |
|
"loss": 0.6387, |
|
"step": 62800 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 1.1091105937957764, |
|
"learning_rate": 1.4126984126984127e-05, |
|
"loss": 0.6287, |
|
"step": 63200 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 1.3000015020370483, |
|
"learning_rate": 1.389694041867955e-05, |
|
"loss": 0.6421, |
|
"step": 63600 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 0.9806800484657288, |
|
"learning_rate": 1.3666896710374971e-05, |
|
"loss": 0.6342, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 1.4063104391098022, |
|
"learning_rate": 1.3436853002070392e-05, |
|
"loss": 0.6323, |
|
"step": 64400 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 1.1137511730194092, |
|
"learning_rate": 1.3206809293765815e-05, |
|
"loss": 0.6316, |
|
"step": 64800 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"grad_norm": 1.307237148284912, |
|
"learning_rate": 1.2976765585461237e-05, |
|
"loss": 0.6209, |
|
"step": 65200 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 1.2854503393173218, |
|
"learning_rate": 1.2746721877156661e-05, |
|
"loss": 0.635, |
|
"step": 65600 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 1.4092977046966553, |
|
"learning_rate": 1.2516678168852084e-05, |
|
"loss": 0.6337, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 1.146622657775879, |
|
"learning_rate": 1.2286634460547504e-05, |
|
"loss": 0.621, |
|
"step": 66400 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 1.4500105381011963, |
|
"learning_rate": 1.2056590752242927e-05, |
|
"loss": 0.6351, |
|
"step": 66800 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 1.3374382257461548, |
|
"learning_rate": 1.182654704393835e-05, |
|
"loss": 0.6434, |
|
"step": 67200 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 1.172215223312378, |
|
"learning_rate": 1.1596503335633771e-05, |
|
"loss": 0.6299, |
|
"step": 67600 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 1.228962779045105, |
|
"learning_rate": 1.1366459627329192e-05, |
|
"loss": 0.6289, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 1.1930328607559204, |
|
"learning_rate": 1.1136415919024615e-05, |
|
"loss": 0.6387, |
|
"step": 68400 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 1.0659236907958984, |
|
"learning_rate": 1.0906947319990798e-05, |
|
"loss": 0.6323, |
|
"step": 68800 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 1.2611548900604248, |
|
"learning_rate": 1.0677478720956983e-05, |
|
"loss": 0.6287, |
|
"step": 69200 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"grad_norm": 1.186289668083191, |
|
"learning_rate": 1.0447435012652404e-05, |
|
"loss": 0.6232, |
|
"step": 69600 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"grad_norm": 1.2363786697387695, |
|
"learning_rate": 1.0217391304347827e-05, |
|
"loss": 0.6239, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 1.0999037027359009, |
|
"eval_runtime": 239.6686, |
|
"eval_samples_per_second": 251.493, |
|
"eval_steps_per_second": 3.93, |
|
"step": 70192 |
|
}, |
|
{ |
|
"epoch": 4.01, |
|
"grad_norm": 1.0348886251449585, |
|
"learning_rate": 9.98734759604325e-06, |
|
"loss": 0.6073, |
|
"step": 70400 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 1.2015148401260376, |
|
"learning_rate": 9.757303887738671e-06, |
|
"loss": 0.5995, |
|
"step": 70800 |
|
}, |
|
{ |
|
"epoch": 4.06, |
|
"grad_norm": 1.2094097137451172, |
|
"learning_rate": 9.527260179434092e-06, |
|
"loss": 0.5939, |
|
"step": 71200 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 1.1249605417251587, |
|
"learning_rate": 9.297216471129515e-06, |
|
"loss": 0.6087, |
|
"step": 71600 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"grad_norm": 1.1234875917434692, |
|
"learning_rate": 9.067172762824937e-06, |
|
"loss": 0.5966, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 4.13, |
|
"grad_norm": 1.3029675483703613, |
|
"learning_rate": 8.837704163791121e-06, |
|
"loss": 0.6023, |
|
"step": 72400 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"grad_norm": 1.1492359638214111, |
|
"learning_rate": 8.607660455486542e-06, |
|
"loss": 0.6014, |
|
"step": 72800 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"grad_norm": 1.1293883323669434, |
|
"learning_rate": 8.377616747181965e-06, |
|
"loss": 0.5975, |
|
"step": 73200 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 1.1434813737869263, |
|
"learning_rate": 8.147573038877387e-06, |
|
"loss": 0.5928, |
|
"step": 73600 |
|
}, |
|
{ |
|
"epoch": 4.22, |
|
"grad_norm": 1.2983373403549194, |
|
"learning_rate": 7.91752933057281e-06, |
|
"loss": 0.5987, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"grad_norm": 1.190185785293579, |
|
"learning_rate": 7.687485622268231e-06, |
|
"loss": 0.5948, |
|
"step": 74400 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"grad_norm": 1.068985939025879, |
|
"learning_rate": 7.457441913963653e-06, |
|
"loss": 0.6059, |
|
"step": 74800 |
|
}, |
|
{ |
|
"epoch": 4.29, |
|
"grad_norm": 1.0893745422363281, |
|
"learning_rate": 7.227398205659075e-06, |
|
"loss": 0.61, |
|
"step": 75200 |
|
}, |
|
{ |
|
"epoch": 4.31, |
|
"grad_norm": 1.1232637166976929, |
|
"learning_rate": 6.997354497354498e-06, |
|
"loss": 0.5947, |
|
"step": 75600 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"grad_norm": 1.1357260942459106, |
|
"learning_rate": 6.76731078904992e-06, |
|
"loss": 0.596, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"grad_norm": 1.1744706630706787, |
|
"learning_rate": 6.537267080745342e-06, |
|
"loss": 0.5909, |
|
"step": 76400 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 1.2433269023895264, |
|
"learning_rate": 6.307223372440764e-06, |
|
"loss": 0.6008, |
|
"step": 76800 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 1.1302207708358765, |
|
"learning_rate": 6.0777547734069475e-06, |
|
"loss": 0.6091, |
|
"step": 77200 |
|
}, |
|
{ |
|
"epoch": 4.42, |
|
"grad_norm": 1.309975504875183, |
|
"learning_rate": 5.848286174373131e-06, |
|
"loss": 0.5954, |
|
"step": 77600 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 1.2409945726394653, |
|
"learning_rate": 5.618242466068553e-06, |
|
"loss": 0.5986, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 4.47, |
|
"grad_norm": 1.274553894996643, |
|
"learning_rate": 5.388198757763975e-06, |
|
"loss": 0.6031, |
|
"step": 78400 |
|
}, |
|
{ |
|
"epoch": 4.49, |
|
"grad_norm": 1.2017253637313843, |
|
"learning_rate": 5.1581550494593975e-06, |
|
"loss": 0.5944, |
|
"step": 78800 |
|
}, |
|
{ |
|
"epoch": 4.51, |
|
"grad_norm": 1.2163805961608887, |
|
"learning_rate": 4.92811134115482e-06, |
|
"loss": 0.5921, |
|
"step": 79200 |
|
}, |
|
{ |
|
"epoch": 4.54, |
|
"grad_norm": 1.2417266368865967, |
|
"learning_rate": 4.698067632850242e-06, |
|
"loss": 0.5897, |
|
"step": 79600 |
|
}, |
|
{ |
|
"epoch": 4.56, |
|
"grad_norm": 1.212352991104126, |
|
"learning_rate": 4.468023924545664e-06, |
|
"loss": 0.6028, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"grad_norm": 1.2147575616836548, |
|
"learning_rate": 4.237980216241086e-06, |
|
"loss": 0.5964, |
|
"step": 80400 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 1.315047025680542, |
|
"learning_rate": 4.007936507936508e-06, |
|
"loss": 0.6008, |
|
"step": 80800 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"grad_norm": 1.3426513671875, |
|
"learning_rate": 3.7778927996319303e-06, |
|
"loss": 0.5913, |
|
"step": 81200 |
|
}, |
|
{ |
|
"epoch": 4.65, |
|
"grad_norm": 1.1529968976974487, |
|
"learning_rate": 3.5478490913273524e-06, |
|
"loss": 0.5966, |
|
"step": 81600 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"grad_norm": 1.2418185472488403, |
|
"learning_rate": 3.317805383022775e-06, |
|
"loss": 0.5878, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"grad_norm": 1.1235523223876953, |
|
"learning_rate": 3.0877616747181967e-06, |
|
"loss": 0.5931, |
|
"step": 82400 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 1.3443814516067505, |
|
"learning_rate": 2.85829307568438e-06, |
|
"loss": 0.6064, |
|
"step": 82800 |
|
}, |
|
{ |
|
"epoch": 4.74, |
|
"grad_norm": 1.2628613710403442, |
|
"learning_rate": 2.6288244766505636e-06, |
|
"loss": 0.6001, |
|
"step": 83200 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"grad_norm": 1.1324889659881592, |
|
"learning_rate": 2.3987807683459858e-06, |
|
"loss": 0.5916, |
|
"step": 83600 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"grad_norm": 1.1024839878082275, |
|
"learning_rate": 2.168737060041408e-06, |
|
"loss": 0.6048, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 4.81, |
|
"grad_norm": 1.3531837463378906, |
|
"learning_rate": 1.93869335173683e-06, |
|
"loss": 0.5896, |
|
"step": 84400 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"grad_norm": 1.1726586818695068, |
|
"learning_rate": 1.7086496434322524e-06, |
|
"loss": 0.5905, |
|
"step": 84800 |
|
}, |
|
{ |
|
"epoch": 4.86, |
|
"grad_norm": 1.0651309490203857, |
|
"learning_rate": 1.4791810443984358e-06, |
|
"loss": 0.6002, |
|
"step": 85200 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 1.277769684791565, |
|
"learning_rate": 1.2497124453646193e-06, |
|
"loss": 0.5899, |
|
"step": 85600 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"grad_norm": 1.2923473119735718, |
|
"learning_rate": 1.0196687370600414e-06, |
|
"loss": 0.5924, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 1.1012171506881714, |
|
"learning_rate": 7.896250287554636e-07, |
|
"loss": 0.591, |
|
"step": 86400 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"grad_norm": 1.2737475633621216, |
|
"learning_rate": 5.595813204508857e-07, |
|
"loss": 0.5968, |
|
"step": 86800 |
|
}, |
|
{ |
|
"epoch": 4.97, |
|
"grad_norm": 1.2628225088119507, |
|
"learning_rate": 3.295376121463078e-07, |
|
"loss": 0.591, |
|
"step": 87200 |
|
}, |
|
{ |
|
"epoch": 4.99, |
|
"grad_norm": 1.3517860174179077, |
|
"learning_rate": 9.949390384172993e-08, |
|
"loss": 0.5918, |
|
"step": 87600 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 1.1124831438064575, |
|
"eval_runtime": 239.5934, |
|
"eval_samples_per_second": 251.572, |
|
"eval_steps_per_second": 3.932, |
|
"step": 87740 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"step": 87740, |
|
"total_flos": 1.7097588901675008e+18, |
|
"train_loss": 0.74035006675955, |
|
"train_runtime": 31417.8788, |
|
"train_samples_per_second": 89.369, |
|
"train_steps_per_second": 2.793 |
|
} |
|
], |
|
"logging_steps": 400, |
|
"max_steps": 87740, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"total_flos": 1.7097588901675008e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|