{
"best_metric": 1.1765093803405762,
"best_model_checkpoint": "/root/finetuning_executions/finetuning_04_utg4java_src_fm_fc_dctx/checkpoint-52644",
"epoch": 5.0,
"eval_steps": 500,
"global_step": 87740,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02,
"grad_norm": 2.022568941116333,
"learning_rate": 2.45625e-05,
"loss": 3.9615,
"step": 400
},
{
"epoch": 0.05,
"grad_norm": 1.4656506776809692,
"learning_rate": 4.95625e-05,
"loss": 1.7529,
"step": 800
},
{
"epoch": 0.07,
"grad_norm": 1.1944507360458374,
"learning_rate": 4.977398205659076e-05,
"loss": 1.544,
"step": 1200
},
{
"epoch": 0.09,
"grad_norm": 1.3468198776245117,
"learning_rate": 4.954393834828618e-05,
"loss": 1.4706,
"step": 1600
},
{
"epoch": 0.11,
"grad_norm": 1.04371178150177,
"learning_rate": 4.93138946399816e-05,
"loss": 1.4141,
"step": 2000
},
{
"epoch": 0.14,
"grad_norm": 1.1084234714508057,
"learning_rate": 4.908385093167702e-05,
"loss": 1.376,
"step": 2400
},
{
"epoch": 0.16,
"grad_norm": 1.1234780550003052,
"learning_rate": 4.885380722337244e-05,
"loss": 1.3424,
"step": 2800
},
{
"epoch": 0.18,
"grad_norm": 1.5502243041992188,
"learning_rate": 4.8623763515067866e-05,
"loss": 1.3219,
"step": 3200
},
{
"epoch": 0.21,
"grad_norm": 1.0266711711883545,
"learning_rate": 4.839371980676329e-05,
"loss": 1.2912,
"step": 3600
},
{
"epoch": 0.23,
"grad_norm": 1.1499156951904297,
"learning_rate": 4.816367609845871e-05,
"loss": 1.2708,
"step": 4000
},
{
"epoch": 0.25,
"grad_norm": 1.2277108430862427,
"learning_rate": 4.7933632390154135e-05,
"loss": 1.266,
"step": 4400
},
{
"epoch": 0.27,
"grad_norm": 1.151228427886963,
"learning_rate": 4.770358868184955e-05,
"loss": 1.2418,
"step": 4800
},
{
"epoch": 0.3,
"grad_norm": 1.230625867843628,
"learning_rate": 4.7473544973544974e-05,
"loss": 1.2383,
"step": 5200
},
{
"epoch": 0.32,
"grad_norm": 0.9033710360527039,
"learning_rate": 4.72435012652404e-05,
"loss": 1.2078,
"step": 5600
},
{
"epoch": 0.34,
"grad_norm": 0.9645096659660339,
"learning_rate": 4.701345755693582e-05,
"loss": 1.1893,
"step": 6000
},
{
"epoch": 0.36,
"grad_norm": 1.1617854833602905,
"learning_rate": 4.678341384863124e-05,
"loss": 1.1773,
"step": 6400
},
{
"epoch": 0.39,
"grad_norm": 0.9844051003456116,
"learning_rate": 4.6553370140326666e-05,
"loss": 1.1652,
"step": 6800
},
{
"epoch": 0.41,
"grad_norm": 0.9416425824165344,
"learning_rate": 4.632332643202208e-05,
"loss": 1.1752,
"step": 7200
},
{
"epoch": 0.43,
"grad_norm": 1.103413701057434,
"learning_rate": 4.6093282723717505e-05,
"loss": 1.1423,
"step": 7600
},
{
"epoch": 0.46,
"grad_norm": 1.1644244194030762,
"learning_rate": 4.586323901541293e-05,
"loss": 1.1412,
"step": 8000
},
{
"epoch": 0.48,
"grad_norm": 0.9481434226036072,
"learning_rate": 4.563319530710835e-05,
"loss": 1.1406,
"step": 8400
},
{
"epoch": 0.5,
"grad_norm": 1.1336678266525269,
"learning_rate": 4.5403151598803774e-05,
"loss": 1.1247,
"step": 8800
},
{
"epoch": 0.52,
"grad_norm": 1.1613398790359497,
"learning_rate": 4.51731078904992e-05,
"loss": 1.1139,
"step": 9200
},
{
"epoch": 0.55,
"grad_norm": 0.9106454849243164,
"learning_rate": 4.494306418219462e-05,
"loss": 1.1018,
"step": 9600
},
{
"epoch": 0.57,
"grad_norm": 1.121297001838684,
"learning_rate": 4.471302047389004e-05,
"loss": 1.093,
"step": 10000
},
{
"epoch": 0.59,
"grad_norm": 1.2510058879852295,
"learning_rate": 4.4482976765585466e-05,
"loss": 1.0772,
"step": 10400
},
{
"epoch": 0.62,
"grad_norm": 1.0042085647583008,
"learning_rate": 4.425293305728089e-05,
"loss": 1.0952,
"step": 10800
},
{
"epoch": 0.64,
"grad_norm": 0.9691294431686401,
"learning_rate": 4.402288934897631e-05,
"loss": 1.0728,
"step": 11200
},
{
"epoch": 0.66,
"grad_norm": 0.900873064994812,
"learning_rate": 4.379284564067173e-05,
"loss": 1.0683,
"step": 11600
},
{
"epoch": 0.68,
"grad_norm": 0.942535936832428,
"learning_rate": 4.356280193236715e-05,
"loss": 1.0574,
"step": 12000
},
{
"epoch": 0.71,
"grad_norm": 0.9630743265151978,
"learning_rate": 4.3332758224062574e-05,
"loss": 1.0626,
"step": 12400
},
{
"epoch": 0.73,
"grad_norm": 0.9657124280929565,
"learning_rate": 4.3102714515758e-05,
"loss": 1.0664,
"step": 12800
},
{
"epoch": 0.75,
"grad_norm": 1.0142230987548828,
"learning_rate": 4.287267080745342e-05,
"loss": 1.0421,
"step": 13200
},
{
"epoch": 0.78,
"grad_norm": 1.021824598312378,
"learning_rate": 4.264262709914884e-05,
"loss": 1.0393,
"step": 13600
},
{
"epoch": 0.8,
"grad_norm": 0.866675853729248,
"learning_rate": 4.241258339084426e-05,
"loss": 1.0398,
"step": 14000
},
{
"epoch": 0.82,
"grad_norm": 1.0492345094680786,
"learning_rate": 4.218253968253968e-05,
"loss": 1.0266,
"step": 14400
},
{
"epoch": 0.84,
"grad_norm": 1.1229575872421265,
"learning_rate": 4.1953071083505865e-05,
"loss": 1.0328,
"step": 14800
},
{
"epoch": 0.87,
"grad_norm": 0.9247739315032959,
"learning_rate": 4.172302737520129e-05,
"loss": 1.033,
"step": 15200
},
{
"epoch": 0.89,
"grad_norm": 0.8983958959579468,
"learning_rate": 4.149298366689671e-05,
"loss": 1.0075,
"step": 15600
},
{
"epoch": 0.91,
"grad_norm": 0.9420925974845886,
"learning_rate": 4.1262939958592134e-05,
"loss": 1.0005,
"step": 16000
},
{
"epoch": 0.93,
"grad_norm": 0.9218415021896362,
"learning_rate": 4.103289625028756e-05,
"loss": 1.0117,
"step": 16400
},
{
"epoch": 0.96,
"grad_norm": 0.9700437784194946,
"learning_rate": 4.080285254198298e-05,
"loss": 1.0018,
"step": 16800
},
{
"epoch": 0.98,
"grad_norm": 0.9360381960868835,
"learning_rate": 4.0572808833678396e-05,
"loss": 0.9986,
"step": 17200
},
{
"epoch": 1.0,
"eval_loss": 1.2005890607833862,
"eval_runtime": 368.0591,
"eval_samples_per_second": 163.764,
"eval_steps_per_second": 2.559,
"step": 17548
},
{
"epoch": 1.0,
"grad_norm": 0.8369306921958923,
"learning_rate": 4.034276512537382e-05,
"loss": 0.9929,
"step": 17600
},
{
"epoch": 1.03,
"grad_norm": 0.9470133781433105,
"learning_rate": 4.011272141706924e-05,
"loss": 0.9462,
"step": 18000
},
{
"epoch": 1.05,
"grad_norm": 0.9572529196739197,
"learning_rate": 3.9882677708764665e-05,
"loss": 0.95,
"step": 18400
},
{
"epoch": 1.07,
"grad_norm": 1.0236432552337646,
"learning_rate": 3.965263400046009e-05,
"loss": 0.947,
"step": 18800
},
{
"epoch": 1.09,
"grad_norm": 1.105034589767456,
"learning_rate": 3.942259029215551e-05,
"loss": 0.9382,
"step": 19200
},
{
"epoch": 1.12,
"grad_norm": 1.322292447090149,
"learning_rate": 3.9192546583850934e-05,
"loss": 0.941,
"step": 19600
},
{
"epoch": 1.14,
"grad_norm": 1.137208104133606,
"learning_rate": 3.896250287554636e-05,
"loss": 0.9247,
"step": 20000
},
{
"epoch": 1.16,
"grad_norm": 1.1079697608947754,
"learning_rate": 3.873245916724178e-05,
"loss": 0.9383,
"step": 20400
},
{
"epoch": 1.19,
"grad_norm": 1.3894723653793335,
"learning_rate": 3.85024154589372e-05,
"loss": 0.9381,
"step": 20800
},
{
"epoch": 1.21,
"grad_norm": 0.900830864906311,
"learning_rate": 3.8272371750632626e-05,
"loss": 0.9175,
"step": 21200
},
{
"epoch": 1.23,
"grad_norm": 1.0307693481445312,
"learning_rate": 3.804232804232805e-05,
"loss": 0.924,
"step": 21600
},
{
"epoch": 1.25,
"grad_norm": 1.0316811800003052,
"learning_rate": 3.7812284334023465e-05,
"loss": 0.9165,
"step": 22000
},
{
"epoch": 1.28,
"grad_norm": 1.1016398668289185,
"learning_rate": 3.758224062571889e-05,
"loss": 0.9168,
"step": 22400
},
{
"epoch": 1.3,
"grad_norm": 1.0044025182724,
"learning_rate": 3.735277202668507e-05,
"loss": 0.9146,
"step": 22800
},
{
"epoch": 1.32,
"grad_norm": 1.1868786811828613,
"learning_rate": 3.7122728318380494e-05,
"loss": 0.9253,
"step": 23200
},
{
"epoch": 1.34,
"grad_norm": 1.041280746459961,
"learning_rate": 3.689268461007592e-05,
"loss": 0.9003,
"step": 23600
},
{
"epoch": 1.37,
"grad_norm": 1.0607842206954956,
"learning_rate": 3.666264090177134e-05,
"loss": 0.9075,
"step": 24000
},
{
"epoch": 1.39,
"grad_norm": 0.8795559406280518,
"learning_rate": 3.643259719346676e-05,
"loss": 0.907,
"step": 24400
},
{
"epoch": 1.41,
"grad_norm": 1.0717073678970337,
"learning_rate": 3.6203128594432946e-05,
"loss": 0.9081,
"step": 24800
},
{
"epoch": 1.44,
"grad_norm": 0.9691916108131409,
"learning_rate": 3.597308488612837e-05,
"loss": 0.8907,
"step": 25200
},
{
"epoch": 1.46,
"grad_norm": 1.1249741315841675,
"learning_rate": 3.574304117782379e-05,
"loss": 0.8942,
"step": 25600
},
{
"epoch": 1.48,
"grad_norm": 0.8723602890968323,
"learning_rate": 3.551299746951921e-05,
"loss": 0.9014,
"step": 26000
},
{
"epoch": 1.5,
"grad_norm": 1.1265227794647217,
"learning_rate": 3.528295376121463e-05,
"loss": 0.8862,
"step": 26400
},
{
"epoch": 1.53,
"grad_norm": 0.8394224047660828,
"learning_rate": 3.5052910052910054e-05,
"loss": 0.8823,
"step": 26800
},
{
"epoch": 1.55,
"grad_norm": 1.1697936058044434,
"learning_rate": 3.482344145387624e-05,
"loss": 0.8786,
"step": 27200
},
{
"epoch": 1.57,
"grad_norm": 0.8561883568763733,
"learning_rate": 3.459339774557166e-05,
"loss": 0.8816,
"step": 27600
},
{
"epoch": 1.6,
"grad_norm": 1.054382562637329,
"learning_rate": 3.436335403726708e-05,
"loss": 0.8862,
"step": 28000
},
{
"epoch": 1.62,
"grad_norm": 1.1177219152450562,
"learning_rate": 3.4133310328962506e-05,
"loss": 0.8867,
"step": 28400
},
{
"epoch": 1.64,
"grad_norm": 0.9827663898468018,
"learning_rate": 3.390326662065793e-05,
"loss": 0.8893,
"step": 28800
},
{
"epoch": 1.66,
"grad_norm": 0.9570045471191406,
"learning_rate": 3.3673222912353345e-05,
"loss": 0.8839,
"step": 29200
},
{
"epoch": 1.69,
"grad_norm": 3.5448367595672607,
"learning_rate": 3.3444904531861054e-05,
"loss": 0.8851,
"step": 29600
},
{
"epoch": 1.71,
"grad_norm": 1.0004465579986572,
"learning_rate": 3.321543593282724e-05,
"loss": 0.8824,
"step": 30000
},
{
"epoch": 1.73,
"grad_norm": 0.9247887134552002,
"learning_rate": 3.298539222452266e-05,
"loss": 0.8633,
"step": 30400
},
{
"epoch": 1.76,
"grad_norm": 1.1364272832870483,
"learning_rate": 3.275534851621808e-05,
"loss": 0.8755,
"step": 30800
},
{
"epoch": 1.78,
"grad_norm": 1.057548999786377,
"learning_rate": 3.2525304807913506e-05,
"loss": 0.867,
"step": 31200
},
{
"epoch": 1.8,
"grad_norm": 0.9271708130836487,
"learning_rate": 3.229526109960893e-05,
"loss": 0.8795,
"step": 31600
},
{
"epoch": 1.82,
"grad_norm": 1.1106973886489868,
"learning_rate": 3.2065217391304345e-05,
"loss": 0.8636,
"step": 32000
},
{
"epoch": 1.85,
"grad_norm": 1.1093181371688843,
"learning_rate": 3.183517368299977e-05,
"loss": 0.8633,
"step": 32400
},
{
"epoch": 1.87,
"grad_norm": 1.1222974061965942,
"learning_rate": 3.160512997469519e-05,
"loss": 0.8637,
"step": 32800
},
{
"epoch": 1.89,
"grad_norm": 0.9591478109359741,
"learning_rate": 3.1375086266390614e-05,
"loss": 0.8632,
"step": 33200
},
{
"epoch": 1.91,
"grad_norm": 0.9238194823265076,
"learning_rate": 3.1145042558086044e-05,
"loss": 0.8489,
"step": 33600
},
{
"epoch": 1.94,
"grad_norm": 1.0500926971435547,
"learning_rate": 3.091499884978146e-05,
"loss": 0.8581,
"step": 34000
},
{
"epoch": 1.96,
"grad_norm": 1.459464430809021,
"learning_rate": 3.068495514147688e-05,
"loss": 0.8522,
"step": 34400
},
{
"epoch": 1.98,
"grad_norm": 1.1386796236038208,
"learning_rate": 3.045548654244307e-05,
"loss": 0.8615,
"step": 34800
},
{
"epoch": 2.0,
"eval_loss": 1.1808243989944458,
"eval_runtime": 367.8213,
"eval_samples_per_second": 163.87,
"eval_steps_per_second": 2.561,
"step": 35096
},
{
"epoch": 2.01,
"grad_norm": 1.064687728881836,
"learning_rate": 3.022544283413849e-05,
"loss": 0.8377,
"step": 35200
},
{
"epoch": 2.03,
"grad_norm": 1.260934591293335,
"learning_rate": 2.999539912583391e-05,
"loss": 0.8038,
"step": 35600
},
{
"epoch": 2.05,
"grad_norm": 1.0460811853408813,
"learning_rate": 2.9765355417529335e-05,
"loss": 0.7998,
"step": 36000
},
{
"epoch": 2.07,
"grad_norm": 1.0479072332382202,
"learning_rate": 2.9535311709224754e-05,
"loss": 0.8006,
"step": 36400
},
{
"epoch": 2.1,
"grad_norm": 1.1579115390777588,
"learning_rate": 2.9305268000920177e-05,
"loss": 0.7927,
"step": 36800
},
{
"epoch": 2.12,
"grad_norm": 1.0748111009597778,
"learning_rate": 2.90752242926156e-05,
"loss": 0.7942,
"step": 37200
},
{
"epoch": 2.14,
"grad_norm": 1.0448018312454224,
"learning_rate": 2.884518058431102e-05,
"loss": 0.8047,
"step": 37600
},
{
"epoch": 2.17,
"grad_norm": 1.122674822807312,
"learning_rate": 2.8615136876006443e-05,
"loss": 0.8034,
"step": 38000
},
{
"epoch": 2.19,
"grad_norm": 1.0929648876190186,
"learning_rate": 2.8385093167701866e-05,
"loss": 0.7982,
"step": 38400
},
{
"epoch": 2.21,
"grad_norm": 1.0236926078796387,
"learning_rate": 2.8155049459397285e-05,
"loss": 0.804,
"step": 38800
},
{
"epoch": 2.23,
"grad_norm": 1.4479929208755493,
"learning_rate": 2.792558086036347e-05,
"loss": 0.7986,
"step": 39200
},
{
"epoch": 2.26,
"grad_norm": 1.0635368824005127,
"learning_rate": 2.769553715205889e-05,
"loss": 0.793,
"step": 39600
},
{
"epoch": 2.28,
"grad_norm": 0.9539445638656616,
"learning_rate": 2.7466068553025077e-05,
"loss": 0.7891,
"step": 40000
},
{
"epoch": 2.3,
"grad_norm": 1.0995815992355347,
"learning_rate": 2.723659995399126e-05,
"loss": 0.8014,
"step": 40400
},
{
"epoch": 2.33,
"grad_norm": 0.9494262337684631,
"learning_rate": 2.7006556245686683e-05,
"loss": 0.7913,
"step": 40800
},
{
"epoch": 2.35,
"grad_norm": 0.9940603375434875,
"learning_rate": 2.6776512537382103e-05,
"loss": 0.7967,
"step": 41200
},
{
"epoch": 2.37,
"grad_norm": 1.0324065685272217,
"learning_rate": 2.6546468829077526e-05,
"loss": 0.783,
"step": 41600
},
{
"epoch": 2.39,
"grad_norm": 0.9601799845695496,
"learning_rate": 2.631642512077295e-05,
"loss": 0.7871,
"step": 42000
},
{
"epoch": 2.42,
"grad_norm": 1.0334033966064453,
"learning_rate": 2.6086381412468368e-05,
"loss": 0.793,
"step": 42400
},
{
"epoch": 2.44,
"grad_norm": 1.070986270904541,
"learning_rate": 2.5856912813434555e-05,
"loss": 0.7908,
"step": 42800
},
{
"epoch": 2.46,
"grad_norm": 0.9531447887420654,
"learning_rate": 2.5626869105129974e-05,
"loss": 0.7806,
"step": 43200
},
{
"epoch": 2.48,
"grad_norm": 1.2823644876480103,
"learning_rate": 2.5397400506096157e-05,
"loss": 0.7953,
"step": 43600
},
{
"epoch": 2.51,
"grad_norm": 0.8898524641990662,
"learning_rate": 2.516735679779158e-05,
"loss": 0.7926,
"step": 44000
},
{
"epoch": 2.53,
"grad_norm": 1.2231683731079102,
"learning_rate": 2.4937313089487006e-05,
"loss": 0.7895,
"step": 44400
},
{
"epoch": 2.55,
"grad_norm": 0.9554850459098816,
"learning_rate": 2.4707269381182426e-05,
"loss": 0.7871,
"step": 44800
},
{
"epoch": 2.58,
"grad_norm": 1.007906436920166,
"learning_rate": 2.447722567287785e-05,
"loss": 0.7713,
"step": 45200
},
{
"epoch": 2.6,
"grad_norm": 1.0699195861816406,
"learning_rate": 2.4247181964573272e-05,
"loss": 0.7817,
"step": 45600
},
{
"epoch": 2.62,
"grad_norm": 1.1465818881988525,
"learning_rate": 2.401713825626869e-05,
"loss": 0.7779,
"step": 46000
},
{
"epoch": 2.64,
"grad_norm": 0.8878755569458008,
"learning_rate": 2.3787094547964114e-05,
"loss": 0.7759,
"step": 46400
},
{
"epoch": 2.67,
"grad_norm": 1.0451282262802124,
"learning_rate": 2.3557050839659537e-05,
"loss": 0.775,
"step": 46800
},
{
"epoch": 2.69,
"grad_norm": 1.015073299407959,
"learning_rate": 2.3327007131354957e-05,
"loss": 0.7799,
"step": 47200
},
{
"epoch": 2.71,
"grad_norm": 1.1620949506759644,
"learning_rate": 2.309696342305038e-05,
"loss": 0.7732,
"step": 47600
},
{
"epoch": 2.74,
"grad_norm": 0.8986193537712097,
"learning_rate": 2.2866919714745803e-05,
"loss": 0.7819,
"step": 48000
},
{
"epoch": 2.76,
"grad_norm": 1.1293410062789917,
"learning_rate": 2.2636876006441222e-05,
"loss": 0.7709,
"step": 48400
},
{
"epoch": 2.78,
"grad_norm": 1.0725595951080322,
"learning_rate": 2.2406832298136645e-05,
"loss": 0.7635,
"step": 48800
},
{
"epoch": 2.8,
"grad_norm": 0.98313307762146,
"learning_rate": 2.2176788589832072e-05,
"loss": 0.7692,
"step": 49200
},
{
"epoch": 2.83,
"grad_norm": 1.0612819194793701,
"learning_rate": 2.194674488152749e-05,
"loss": 0.7626,
"step": 49600
},
{
"epoch": 2.85,
"grad_norm": 1.1520133018493652,
"learning_rate": 2.1717276282493678e-05,
"loss": 0.7608,
"step": 50000
},
{
"epoch": 2.87,
"grad_norm": 1.0242923498153687,
"learning_rate": 2.1487232574189097e-05,
"loss": 0.7787,
"step": 50400
},
{
"epoch": 2.89,
"grad_norm": 1.09690523147583,
"learning_rate": 2.125718886588452e-05,
"loss": 0.7716,
"step": 50800
},
{
"epoch": 2.92,
"grad_norm": 1.1755789518356323,
"learning_rate": 2.1027145157579943e-05,
"loss": 0.7713,
"step": 51200
},
{
"epoch": 2.94,
"grad_norm": 0.9996896386146545,
"learning_rate": 2.0797101449275363e-05,
"loss": 0.7733,
"step": 51600
},
{
"epoch": 2.96,
"grad_norm": 1.0416631698608398,
"learning_rate": 2.0567057740970786e-05,
"loss": 0.7707,
"step": 52000
},
{
"epoch": 2.99,
"grad_norm": 1.2188984155654907,
"learning_rate": 2.033701403266621e-05,
"loss": 0.7639,
"step": 52400
},
{
"epoch": 3.0,
"eval_loss": 1.1765093803405762,
"eval_runtime": 367.9851,
"eval_samples_per_second": 163.797,
"eval_steps_per_second": 2.56,
"step": 52644
},
{
"epoch": 3.01,
"grad_norm": 1.4339423179626465,
"learning_rate": 2.0106970324361628e-05,
"loss": 0.7487,
"step": 52800
},
{
"epoch": 3.03,
"grad_norm": 1.022152304649353,
"learning_rate": 1.987692661605705e-05,
"loss": 0.7295,
"step": 53200
},
{
"epoch": 3.05,
"grad_norm": 1.0018941164016724,
"learning_rate": 1.9646882907752474e-05,
"loss": 0.7181,
"step": 53600
},
{
"epoch": 3.08,
"grad_norm": 0.9470273852348328,
"learning_rate": 1.9416839199447894e-05,
"loss": 0.7245,
"step": 54000
},
{
"epoch": 3.1,
"grad_norm": 0.9430755972862244,
"learning_rate": 1.918737060041408e-05,
"loss": 0.7293,
"step": 54400
},
{
"epoch": 3.12,
"grad_norm": 1.1047594547271729,
"learning_rate": 1.89573268921095e-05,
"loss": 0.7243,
"step": 54800
},
{
"epoch": 3.15,
"grad_norm": 1.011172890663147,
"learning_rate": 1.8727858293075682e-05,
"loss": 0.734,
"step": 55200
},
{
"epoch": 3.17,
"grad_norm": 1.0907740592956543,
"learning_rate": 1.849781458477111e-05,
"loss": 0.7139,
"step": 55600
},
{
"epoch": 3.19,
"grad_norm": 0.9432533979415894,
"learning_rate": 1.8267770876466532e-05,
"loss": 0.7144,
"step": 56000
},
{
"epoch": 3.21,
"grad_norm": 1.0588159561157227,
"learning_rate": 1.803772716816195e-05,
"loss": 0.7205,
"step": 56400
},
{
"epoch": 3.24,
"grad_norm": 1.0993419885635376,
"learning_rate": 1.7807683459857374e-05,
"loss": 0.7245,
"step": 56800
},
{
"epoch": 3.26,
"grad_norm": 1.082985520362854,
"learning_rate": 1.7577639751552797e-05,
"loss": 0.72,
"step": 57200
},
{
"epoch": 3.28,
"grad_norm": 1.021746039390564,
"learning_rate": 1.7347596043248217e-05,
"loss": 0.7165,
"step": 57600
},
{
"epoch": 3.31,
"grad_norm": 1.167556643486023,
"learning_rate": 1.7118127444214403e-05,
"loss": 0.7177,
"step": 58000
},
{
"epoch": 3.33,
"grad_norm": 1.1957422494888306,
"learning_rate": 1.6888083735909823e-05,
"loss": 0.7199,
"step": 58400
},
{
"epoch": 3.35,
"grad_norm": 1.0189940929412842,
"learning_rate": 1.6658040027605246e-05,
"loss": 0.7142,
"step": 58800
},
{
"epoch": 3.37,
"grad_norm": 1.7046924829483032,
"learning_rate": 1.642799631930067e-05,
"loss": 0.7187,
"step": 59200
},
{
"epoch": 3.4,
"grad_norm": 1.220874309539795,
"learning_rate": 1.6197952610996088e-05,
"loss": 0.7252,
"step": 59600
},
{
"epoch": 3.42,
"grad_norm": 1.0041950941085815,
"learning_rate": 1.596790890269151e-05,
"loss": 0.7053,
"step": 60000
},
{
"epoch": 3.44,
"grad_norm": 1.0260080099105835,
"learning_rate": 1.5737865194386934e-05,
"loss": 0.7118,
"step": 60400
},
{
"epoch": 3.46,
"grad_norm": 1.1826013326644897,
"learning_rate": 1.5507821486082354e-05,
"loss": 0.7176,
"step": 60800
},
{
"epoch": 3.49,
"grad_norm": 0.9184863567352295,
"learning_rate": 1.527835288704854e-05,
"loss": 0.7227,
"step": 61200
},
{
"epoch": 3.51,
"grad_norm": 1.1603368520736694,
"learning_rate": 1.5048309178743963e-05,
"loss": 0.7162,
"step": 61600
},
{
"epoch": 3.53,
"grad_norm": 1.2338982820510864,
"learning_rate": 1.4818265470439386e-05,
"loss": 0.718,
"step": 62000
},
{
"epoch": 3.56,
"grad_norm": 1.1764419078826904,
"learning_rate": 1.4588221762134807e-05,
"loss": 0.7213,
"step": 62400
},
{
"epoch": 3.58,
"grad_norm": 1.0851845741271973,
"learning_rate": 1.4358178053830229e-05,
"loss": 0.717,
"step": 62800
},
{
"epoch": 3.6,
"grad_norm": 1.0550730228424072,
"learning_rate": 1.4128134345525652e-05,
"loss": 0.7037,
"step": 63200
},
{
"epoch": 3.62,
"grad_norm": 1.1784164905548096,
"learning_rate": 1.3898665746491834e-05,
"loss": 0.7194,
"step": 63600
},
{
"epoch": 3.65,
"grad_norm": 0.9505347609519958,
"learning_rate": 1.3668622038187256e-05,
"loss": 0.7105,
"step": 64000
},
{
"epoch": 3.67,
"grad_norm": 1.266885757446289,
"learning_rate": 1.3438578329882679e-05,
"loss": 0.7101,
"step": 64400
},
{
"epoch": 3.69,
"grad_norm": 1.092987060546875,
"learning_rate": 1.32085346215781e-05,
"loss": 0.7089,
"step": 64800
},
{
"epoch": 3.72,
"grad_norm": 1.2043814659118652,
"learning_rate": 1.2978490913273523e-05,
"loss": 0.6956,
"step": 65200
},
{
"epoch": 3.74,
"grad_norm": 1.230975866317749,
"learning_rate": 1.2748447204968944e-05,
"loss": 0.7118,
"step": 65600
},
{
"epoch": 3.76,
"grad_norm": 1.2574669122695923,
"learning_rate": 1.2518978605935127e-05,
"loss": 0.7102,
"step": 66000
},
{
"epoch": 3.78,
"grad_norm": 1.1570420265197754,
"learning_rate": 1.2288934897630552e-05,
"loss": 0.696,
"step": 66400
},
{
"epoch": 3.81,
"grad_norm": 1.3489502668380737,
"learning_rate": 1.2058891189325973e-05,
"loss": 0.7108,
"step": 66800
},
{
"epoch": 3.83,
"grad_norm": 1.2118040323257446,
"learning_rate": 1.1828847481021394e-05,
"loss": 0.7211,
"step": 67200
},
{
"epoch": 3.85,
"grad_norm": 1.1167511940002441,
"learning_rate": 1.1599378881987579e-05,
"loss": 0.7044,
"step": 67600
},
{
"epoch": 3.88,
"grad_norm": 1.0914174318313599,
"learning_rate": 1.1369335173683e-05,
"loss": 0.7051,
"step": 68000
},
{
"epoch": 3.9,
"grad_norm": 1.059415340423584,
"learning_rate": 1.1139291465378423e-05,
"loss": 0.7142,
"step": 68400
},
{
"epoch": 3.92,
"grad_norm": 0.94761723279953,
"learning_rate": 1.0909247757073844e-05,
"loss": 0.7072,
"step": 68800
},
{
"epoch": 3.94,
"grad_norm": 1.2550498247146606,
"learning_rate": 1.0679204048769266e-05,
"loss": 0.7034,
"step": 69200
},
{
"epoch": 3.97,
"grad_norm": 1.0935131311416626,
"learning_rate": 1.0449160340464689e-05,
"loss": 0.6976,
"step": 69600
},
{
"epoch": 3.99,
"grad_norm": 1.8066422939300537,
"learning_rate": 1.0219116632160112e-05,
"loss": 0.698,
"step": 70000
},
{
"epoch": 4.0,
"eval_loss": 1.1811184883117676,
"eval_runtime": 368.0007,
"eval_samples_per_second": 163.79,
"eval_steps_per_second": 2.56,
"step": 70192
},
{
"epoch": 4.01,
"grad_norm": 1.1222364902496338,
"learning_rate": 9.989072923855533e-06,
"loss": 0.6825,
"step": 70400
},
{
"epoch": 4.03,
"grad_norm": 1.1273374557495117,
"learning_rate": 9.759029215550954e-06,
"loss": 0.6736,
"step": 70800
},
{
"epoch": 4.06,
"grad_norm": 1.1399248838424683,
"learning_rate": 9.528985507246377e-06,
"loss": 0.6667,
"step": 71200
},
{
"epoch": 4.08,
"grad_norm": 1.0817917585372925,
"learning_rate": 9.298941798941798e-06,
"loss": 0.6826,
"step": 71600
},
{
"epoch": 4.1,
"grad_norm": 1.085259199142456,
"learning_rate": 9.068898090637221e-06,
"loss": 0.6699,
"step": 72000
},
{
"epoch": 4.13,
"grad_norm": 1.1712231636047363,
"learning_rate": 8.838854382332644e-06,
"loss": 0.677,
"step": 72400
},
{
"epoch": 4.15,
"grad_norm": 1.1486331224441528,
"learning_rate": 8.608810674028066e-06,
"loss": 0.674,
"step": 72800
},
{
"epoch": 4.17,
"grad_norm": 1.0690157413482666,
"learning_rate": 8.378766965723487e-06,
"loss": 0.6712,
"step": 73200
},
{
"epoch": 4.19,
"grad_norm": 1.0190378427505493,
"learning_rate": 8.14872325741891e-06,
"loss": 0.666,
"step": 73600
},
{
"epoch": 4.22,
"grad_norm": 1.2103822231292725,
"learning_rate": 7.919254658385093e-06,
"loss": 0.673,
"step": 74000
},
{
"epoch": 4.24,
"grad_norm": 1.1462812423706055,
"learning_rate": 7.689210950080516e-06,
"loss": 0.669,
"step": 74400
},
{
"epoch": 4.26,
"grad_norm": 1.0003070831298828,
"learning_rate": 7.459167241775939e-06,
"loss": 0.6806,
"step": 74800
},
{
"epoch": 4.29,
"grad_norm": 1.0154987573623657,
"learning_rate": 7.22912353347136e-06,
"loss": 0.6838,
"step": 75200
},
{
"epoch": 4.31,
"grad_norm": 1.0686919689178467,
"learning_rate": 6.999079825166782e-06,
"loss": 0.6677,
"step": 75600
},
{
"epoch": 4.33,
"grad_norm": 1.0061548948287964,
"learning_rate": 6.769611226132966e-06,
"loss": 0.6696,
"step": 76000
},
{
"epoch": 4.35,
"grad_norm": 4.049505710601807,
"learning_rate": 6.539567517828388e-06,
"loss": 0.6625,
"step": 76400
},
{
"epoch": 4.38,
"grad_norm": 1.1459605693817139,
"learning_rate": 6.30952380952381e-06,
"loss": 0.6747,
"step": 76800
},
{
"epoch": 4.4,
"grad_norm": 1.0267729759216309,
"learning_rate": 6.079480101219232e-06,
"loss": 0.684,
"step": 77200
},
{
"epoch": 4.42,
"grad_norm": 1.109384298324585,
"learning_rate": 5.849436392914654e-06,
"loss": 0.6695,
"step": 77600
},
{
"epoch": 4.44,
"grad_norm": 1.1579252481460571,
"learning_rate": 5.619392684610076e-06,
"loss": 0.6727,
"step": 78000
},
{
"epoch": 4.47,
"grad_norm": 1.2438113689422607,
"learning_rate": 5.3893489763054985e-06,
"loss": 0.6778,
"step": 78400
},
{
"epoch": 4.49,
"grad_norm": 1.1091639995574951,
"learning_rate": 5.159305268000921e-06,
"loss": 0.6665,
"step": 78800
},
{
"epoch": 4.51,
"grad_norm": 1.0381008386611938,
"learning_rate": 4.929261559696342e-06,
"loss": 0.6641,
"step": 79200
},
{
"epoch": 4.54,
"grad_norm": 1.0532902479171753,
"learning_rate": 4.699217851391765e-06,
"loss": 0.6629,
"step": 79600
},
{
"epoch": 4.56,
"grad_norm": 1.0946133136749268,
"learning_rate": 4.469174143087187e-06,
"loss": 0.6765,
"step": 80000
},
{
"epoch": 4.58,
"grad_norm": 1.119805932044983,
"learning_rate": 4.239130434782608e-06,
"loss": 0.6707,
"step": 80400
},
{
"epoch": 4.6,
"grad_norm": 1.2089961767196655,
"learning_rate": 4.009086726478031e-06,
"loss": 0.6753,
"step": 80800
},
{
"epoch": 4.63,
"grad_norm": 1.2102470397949219,
"learning_rate": 3.779618127444214e-06,
"loss": 0.6644,
"step": 81200
},
{
"epoch": 4.65,
"grad_norm": 1.0439101457595825,
"learning_rate": 3.5495744191396367e-06,
"loss": 0.6713,
"step": 81600
},
{
"epoch": 4.67,
"grad_norm": 1.1744842529296875,
"learning_rate": 3.319530710835059e-06,
"loss": 0.6599,
"step": 82000
},
{
"epoch": 4.7,
"grad_norm": 0.9925849437713623,
"learning_rate": 3.089487002530481e-06,
"loss": 0.6656,
"step": 82400
},
{
"epoch": 4.72,
"grad_norm": 1.2233749628067017,
"learning_rate": 2.8600184034966646e-06,
"loss": 0.6813,
"step": 82800
},
{
"epoch": 4.74,
"grad_norm": 1.1348555088043213,
"learning_rate": 2.6299746951920863e-06,
"loss": 0.6738,
"step": 83200
},
{
"epoch": 4.76,
"grad_norm": 1.0024689435958862,
"learning_rate": 2.399930986887509e-06,
"loss": 0.6654,
"step": 83600
},
{
"epoch": 4.79,
"grad_norm": 1.049222469329834,
"learning_rate": 2.1704623878536926e-06,
"loss": 0.6782,
"step": 84000
},
{
"epoch": 4.81,
"grad_norm": 1.2224096059799194,
"learning_rate": 1.9404186795491143e-06,
"loss": 0.6623,
"step": 84400
},
{
"epoch": 4.83,
"grad_norm": 1.0780987739562988,
"learning_rate": 1.7103749712445366e-06,
"loss": 0.6633,
"step": 84800
},
{
"epoch": 4.86,
"grad_norm": 1.0614657402038574,
"learning_rate": 1.4803312629399585e-06,
"loss": 0.6748,
"step": 85200
},
{
"epoch": 4.88,
"grad_norm": 1.1810928583145142,
"learning_rate": 1.2502875546353809e-06,
"loss": 0.6623,
"step": 85600
},
{
"epoch": 4.9,
"grad_norm": 1.1799181699752808,
"learning_rate": 1.0202438463308028e-06,
"loss": 0.6653,
"step": 86000
},
{
"epoch": 4.92,
"grad_norm": 1.012217402458191,
"learning_rate": 7.902001380262249e-07,
"loss": 0.6632,
"step": 86400
},
{
"epoch": 4.95,
"grad_norm": 1.2088865041732788,
"learning_rate": 5.601564297216472e-07,
"loss": 0.6719,
"step": 86800
},
{
"epoch": 4.97,
"grad_norm": 1.1404399871826172,
"learning_rate": 3.301127214170693e-07,
"loss": 0.6636,
"step": 87200
},
{
"epoch": 4.99,
"grad_norm": 1.2054311037063599,
"learning_rate": 1.0006901311249138e-07,
"loss": 0.6656,
"step": 87600
},
{
"epoch": 5.0,
"eval_loss": 1.1946403980255127,
"eval_runtime": 367.8686,
"eval_samples_per_second": 163.849,
"eval_steps_per_second": 2.561,
"step": 87740
},
{
"epoch": 5.0,
"step": 87740,
"total_flos": 1.7097588901675008e+18,
"train_loss": 0.858644689469069,
"train_runtime": 51993.6603,
"train_samples_per_second": 54.003,
"train_steps_per_second": 1.688
}
],
"logging_steps": 400,
"max_steps": 87740,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"total_flos": 1.7097588901675008e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}