{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.999438727782975,
"eval_steps": 500,
"global_step": 1002,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.029934518241347054,
"grad_norm": 1.21135244631254,
"learning_rate": 8e-06,
"loss": 0.7943,
"step": 10
},
{
"epoch": 0.05986903648269411,
"grad_norm": 0.9257888985815798,
"learning_rate": 8e-06,
"loss": 0.6917,
"step": 20
},
{
"epoch": 0.08980355472404115,
"grad_norm": 0.7841012581900926,
"learning_rate": 8e-06,
"loss": 0.6676,
"step": 30
},
{
"epoch": 0.11973807296538821,
"grad_norm": 0.8186897658494727,
"learning_rate": 8e-06,
"loss": 0.6642,
"step": 40
},
{
"epoch": 0.14967259120673526,
"grad_norm": 0.9640808091551717,
"learning_rate": 8e-06,
"loss": 0.6563,
"step": 50
},
{
"epoch": 0.1796071094480823,
"grad_norm": 0.6573771107768287,
"learning_rate": 8e-06,
"loss": 0.6486,
"step": 60
},
{
"epoch": 0.20954162768942938,
"grad_norm": 0.7451852720331203,
"learning_rate": 8e-06,
"loss": 0.6459,
"step": 70
},
{
"epoch": 0.23947614593077643,
"grad_norm": 0.6760171899376464,
"learning_rate": 8e-06,
"loss": 0.6469,
"step": 80
},
{
"epoch": 0.2694106641721235,
"grad_norm": 0.9185224784748138,
"learning_rate": 8e-06,
"loss": 0.638,
"step": 90
},
{
"epoch": 0.2993451824134705,
"grad_norm": 0.8310714141547997,
"learning_rate": 8e-06,
"loss": 0.64,
"step": 100
},
{
"epoch": 0.3292797006548176,
"grad_norm": 0.6968599344480478,
"learning_rate": 8e-06,
"loss": 0.6357,
"step": 110
},
{
"epoch": 0.3592142188961646,
"grad_norm": 0.7783107345226523,
"learning_rate": 8e-06,
"loss": 0.6344,
"step": 120
},
{
"epoch": 0.3891487371375117,
"grad_norm": 0.7320920547129813,
"learning_rate": 8e-06,
"loss": 0.6347,
"step": 130
},
{
"epoch": 0.41908325537885877,
"grad_norm": 0.8310187430809345,
"learning_rate": 8e-06,
"loss": 0.6284,
"step": 140
},
{
"epoch": 0.4490177736202058,
"grad_norm": 0.742852301838313,
"learning_rate": 8e-06,
"loss": 0.6305,
"step": 150
},
{
"epoch": 0.47895229186155286,
"grad_norm": 0.7072242715008016,
"learning_rate": 8e-06,
"loss": 0.6294,
"step": 160
},
{
"epoch": 0.5088868101028999,
"grad_norm": 0.698214271861064,
"learning_rate": 8e-06,
"loss": 0.6239,
"step": 170
},
{
"epoch": 0.538821328344247,
"grad_norm": 0.6662946690363835,
"learning_rate": 8e-06,
"loss": 0.6253,
"step": 180
},
{
"epoch": 0.568755846585594,
"grad_norm": 0.6538562990065714,
"learning_rate": 8e-06,
"loss": 0.623,
"step": 190
},
{
"epoch": 0.598690364826941,
"grad_norm": 0.7549312483543436,
"learning_rate": 8e-06,
"loss": 0.6308,
"step": 200
},
{
"epoch": 0.6286248830682881,
"grad_norm": 0.806924453966631,
"learning_rate": 8e-06,
"loss": 0.6247,
"step": 210
},
{
"epoch": 0.6585594013096352,
"grad_norm": 0.8525309679457572,
"learning_rate": 8e-06,
"loss": 0.623,
"step": 220
},
{
"epoch": 0.6884939195509823,
"grad_norm": 0.7201883248635861,
"learning_rate": 8e-06,
"loss": 0.6257,
"step": 230
},
{
"epoch": 0.7184284377923292,
"grad_norm": 0.6818329042197464,
"learning_rate": 8e-06,
"loss": 0.6247,
"step": 240
},
{
"epoch": 0.7483629560336763,
"grad_norm": 0.6151687407500573,
"learning_rate": 8e-06,
"loss": 0.6212,
"step": 250
},
{
"epoch": 0.7782974742750234,
"grad_norm": 0.6622047387016661,
"learning_rate": 8e-06,
"loss": 0.6198,
"step": 260
},
{
"epoch": 0.8082319925163705,
"grad_norm": 0.661479958293304,
"learning_rate": 8e-06,
"loss": 0.6272,
"step": 270
},
{
"epoch": 0.8381665107577175,
"grad_norm": 0.7220669462375645,
"learning_rate": 8e-06,
"loss": 0.6174,
"step": 280
},
{
"epoch": 0.8681010289990645,
"grad_norm": 0.6245633494447357,
"learning_rate": 8e-06,
"loss": 0.6142,
"step": 290
},
{
"epoch": 0.8980355472404116,
"grad_norm": 0.7369753450517131,
"learning_rate": 8e-06,
"loss": 0.6155,
"step": 300
},
{
"epoch": 0.9279700654817586,
"grad_norm": 0.684240331799871,
"learning_rate": 8e-06,
"loss": 0.6142,
"step": 310
},
{
"epoch": 0.9579045837231057,
"grad_norm": 0.6592793440033525,
"learning_rate": 8e-06,
"loss": 0.6165,
"step": 320
},
{
"epoch": 0.9878391019644528,
"grad_norm": 0.7596640728440148,
"learning_rate": 8e-06,
"loss": 0.6094,
"step": 330
},
{
"epoch": 0.9998129092609915,
"eval_loss": 0.6207359433174133,
"eval_runtime": 516.5507,
"eval_samples_per_second": 17.425,
"eval_steps_per_second": 0.546,
"step": 334
},
{
"epoch": 1.0177736202057999,
"grad_norm": 1.0914053476567482,
"learning_rate": 8e-06,
"loss": 0.627,
"step": 340
},
{
"epoch": 1.047708138447147,
"grad_norm": 0.9628992294558122,
"learning_rate": 8e-06,
"loss": 0.5324,
"step": 350
},
{
"epoch": 1.077642656688494,
"grad_norm": 0.7102957206524368,
"learning_rate": 8e-06,
"loss": 0.5276,
"step": 360
},
{
"epoch": 1.1075771749298409,
"grad_norm": 0.6396993471164553,
"learning_rate": 8e-06,
"loss": 0.5282,
"step": 370
},
{
"epoch": 1.137511693171188,
"grad_norm": 0.6680934494961647,
"learning_rate": 8e-06,
"loss": 0.5287,
"step": 380
},
{
"epoch": 1.167446211412535,
"grad_norm": 0.6517910836945853,
"learning_rate": 8e-06,
"loss": 0.5311,
"step": 390
},
{
"epoch": 1.197380729653882,
"grad_norm": 0.6720240409719913,
"learning_rate": 8e-06,
"loss": 0.5318,
"step": 400
},
{
"epoch": 1.2273152478952292,
"grad_norm": 0.6803368260593882,
"learning_rate": 8e-06,
"loss": 0.5347,
"step": 410
},
{
"epoch": 1.2572497661365762,
"grad_norm": 0.6884652022904354,
"learning_rate": 8e-06,
"loss": 0.5383,
"step": 420
},
{
"epoch": 1.2871842843779233,
"grad_norm": 0.7329284898776822,
"learning_rate": 8e-06,
"loss": 0.5338,
"step": 430
},
{
"epoch": 1.3171188026192704,
"grad_norm": 0.6392516489495297,
"learning_rate": 8e-06,
"loss": 0.5359,
"step": 440
},
{
"epoch": 1.3470533208606175,
"grad_norm": 0.6262376414200107,
"learning_rate": 8e-06,
"loss": 0.5306,
"step": 450
},
{
"epoch": 1.3769878391019645,
"grad_norm": 0.6361024950573079,
"learning_rate": 8e-06,
"loss": 0.5411,
"step": 460
},
{
"epoch": 1.4069223573433116,
"grad_norm": 0.6335891265421159,
"learning_rate": 8e-06,
"loss": 0.5407,
"step": 470
},
{
"epoch": 1.4368568755846587,
"grad_norm": 0.6848282816282614,
"learning_rate": 8e-06,
"loss": 0.5428,
"step": 480
},
{
"epoch": 1.4667913938260055,
"grad_norm": 0.6098388511595073,
"learning_rate": 8e-06,
"loss": 0.5424,
"step": 490
},
{
"epoch": 1.4967259120673526,
"grad_norm": 0.6168997234574297,
"learning_rate": 8e-06,
"loss": 0.5386,
"step": 500
},
{
"epoch": 1.5266604303086997,
"grad_norm": 0.6272985821202596,
"learning_rate": 8e-06,
"loss": 0.542,
"step": 510
},
{
"epoch": 1.5565949485500468,
"grad_norm": 0.6696253220648498,
"learning_rate": 8e-06,
"loss": 0.5402,
"step": 520
},
{
"epoch": 1.5865294667913938,
"grad_norm": 0.6447015112747386,
"learning_rate": 8e-06,
"loss": 0.5408,
"step": 530
},
{
"epoch": 1.616463985032741,
"grad_norm": 0.6296023955791409,
"learning_rate": 8e-06,
"loss": 0.5449,
"step": 540
},
{
"epoch": 1.646398503274088,
"grad_norm": 0.6086610727578513,
"learning_rate": 8e-06,
"loss": 0.5442,
"step": 550
},
{
"epoch": 1.6763330215154348,
"grad_norm": 0.6145121966059982,
"learning_rate": 8e-06,
"loss": 0.5352,
"step": 560
},
{
"epoch": 1.706267539756782,
"grad_norm": 0.6924376605157173,
"learning_rate": 8e-06,
"loss": 0.5371,
"step": 570
},
{
"epoch": 1.736202057998129,
"grad_norm": 0.6574403524622597,
"learning_rate": 8e-06,
"loss": 0.5387,
"step": 580
},
{
"epoch": 1.766136576239476,
"grad_norm": 0.6340547035740907,
"learning_rate": 8e-06,
"loss": 0.5417,
"step": 590
},
{
"epoch": 1.7960710944808231,
"grad_norm": 0.6875606923110796,
"learning_rate": 8e-06,
"loss": 0.5422,
"step": 600
},
{
"epoch": 1.8260056127221702,
"grad_norm": 0.601235347108058,
"learning_rate": 8e-06,
"loss": 0.5404,
"step": 610
},
{
"epoch": 1.8559401309635173,
"grad_norm": 0.6002851968034844,
"learning_rate": 8e-06,
"loss": 0.5388,
"step": 620
},
{
"epoch": 1.8858746492048644,
"grad_norm": 0.6476875772663511,
"learning_rate": 8e-06,
"loss": 0.5402,
"step": 630
},
{
"epoch": 1.9158091674462114,
"grad_norm": 0.6584929467438523,
"learning_rate": 8e-06,
"loss": 0.5419,
"step": 640
},
{
"epoch": 1.9457436856875585,
"grad_norm": 0.635864107312744,
"learning_rate": 8e-06,
"loss": 0.5405,
"step": 650
},
{
"epoch": 1.9756782039289056,
"grad_norm": 0.6971331062500022,
"learning_rate": 8e-06,
"loss": 0.5452,
"step": 660
},
{
"epoch": 1.999625818521983,
"eval_loss": 0.6230265498161316,
"eval_runtime": 517.4275,
"eval_samples_per_second": 17.396,
"eval_steps_per_second": 0.545,
"step": 668
},
{
"epoch": 2.0056127221702527,
"grad_norm": 1.1175717427173644,
"learning_rate": 8e-06,
"loss": 0.5812,
"step": 670
},
{
"epoch": 2.0355472404115997,
"grad_norm": 0.9014193389938453,
"learning_rate": 8e-06,
"loss": 0.4457,
"step": 680
},
{
"epoch": 2.065481758652947,
"grad_norm": 0.7610276344475498,
"learning_rate": 8e-06,
"loss": 0.4401,
"step": 690
},
{
"epoch": 2.095416276894294,
"grad_norm": 0.7005120545670707,
"learning_rate": 8e-06,
"loss": 0.4442,
"step": 700
},
{
"epoch": 2.125350795135641,
"grad_norm": 0.694947791996958,
"learning_rate": 8e-06,
"loss": 0.4449,
"step": 710
},
{
"epoch": 2.155285313376988,
"grad_norm": 0.8511316426645517,
"learning_rate": 8e-06,
"loss": 0.4488,
"step": 720
},
{
"epoch": 2.185219831618335,
"grad_norm": 0.8835380141763632,
"learning_rate": 8e-06,
"loss": 0.4493,
"step": 730
},
{
"epoch": 2.2151543498596817,
"grad_norm": 0.711769327296804,
"learning_rate": 8e-06,
"loss": 0.4504,
"step": 740
},
{
"epoch": 2.245088868101029,
"grad_norm": 0.8058645279970468,
"learning_rate": 8e-06,
"loss": 0.4527,
"step": 750
},
{
"epoch": 2.275023386342376,
"grad_norm": 0.6752264716430066,
"learning_rate": 8e-06,
"loss": 0.4497,
"step": 760
},
{
"epoch": 2.304957904583723,
"grad_norm": 0.7456444513922865,
"learning_rate": 8e-06,
"loss": 0.4543,
"step": 770
},
{
"epoch": 2.33489242282507,
"grad_norm": 0.6829933479220452,
"learning_rate": 8e-06,
"loss": 0.4517,
"step": 780
},
{
"epoch": 2.364826941066417,
"grad_norm": 0.6401921390853881,
"learning_rate": 8e-06,
"loss": 0.4542,
"step": 790
},
{
"epoch": 2.394761459307764,
"grad_norm": 0.7127587062318401,
"learning_rate": 8e-06,
"loss": 0.4565,
"step": 800
},
{
"epoch": 2.4246959775491113,
"grad_norm": 0.7066267427455665,
"learning_rate": 8e-06,
"loss": 0.4632,
"step": 810
},
{
"epoch": 2.4546304957904583,
"grad_norm": 0.7226062602383946,
"learning_rate": 8e-06,
"loss": 0.4611,
"step": 820
},
{
"epoch": 2.4845650140318054,
"grad_norm": 0.7793500688968047,
"learning_rate": 8e-06,
"loss": 0.4613,
"step": 830
},
{
"epoch": 2.5144995322731525,
"grad_norm": 0.6820846969489297,
"learning_rate": 8e-06,
"loss": 0.4591,
"step": 840
},
{
"epoch": 2.5444340505144996,
"grad_norm": 0.6470236236262259,
"learning_rate": 8e-06,
"loss": 0.4629,
"step": 850
},
{
"epoch": 2.5743685687558466,
"grad_norm": 0.706376237919755,
"learning_rate": 8e-06,
"loss": 0.4607,
"step": 860
},
{
"epoch": 2.6043030869971937,
"grad_norm": 0.6665109583977343,
"learning_rate": 8e-06,
"loss": 0.4596,
"step": 870
},
{
"epoch": 2.634237605238541,
"grad_norm": 0.6583544195294855,
"learning_rate": 8e-06,
"loss": 0.4571,
"step": 880
},
{
"epoch": 2.664172123479888,
"grad_norm": 0.6766815167737668,
"learning_rate": 8e-06,
"loss": 0.4634,
"step": 890
},
{
"epoch": 2.694106641721235,
"grad_norm": 0.6485676514451294,
"learning_rate": 8e-06,
"loss": 0.4661,
"step": 900
},
{
"epoch": 2.724041159962582,
"grad_norm": 0.6787774852448176,
"learning_rate": 8e-06,
"loss": 0.4605,
"step": 910
},
{
"epoch": 2.753975678203929,
"grad_norm": 0.6424941259370515,
"learning_rate": 8e-06,
"loss": 0.4619,
"step": 920
},
{
"epoch": 2.7839101964452757,
"grad_norm": 0.6314188313732944,
"learning_rate": 8e-06,
"loss": 0.4676,
"step": 930
},
{
"epoch": 2.8138447146866232,
"grad_norm": 0.654263916123419,
"learning_rate": 8e-06,
"loss": 0.4651,
"step": 940
},
{
"epoch": 2.84377923292797,
"grad_norm": 0.7342445113024143,
"learning_rate": 8e-06,
"loss": 0.4657,
"step": 950
},
{
"epoch": 2.8737137511693174,
"grad_norm": 0.749469773156686,
"learning_rate": 8e-06,
"loss": 0.4674,
"step": 960
},
{
"epoch": 2.903648269410664,
"grad_norm": 0.6461650603511763,
"learning_rate": 8e-06,
"loss": 0.4638,
"step": 970
},
{
"epoch": 2.933582787652011,
"grad_norm": 0.6858936516605968,
"learning_rate": 8e-06,
"loss": 0.4688,
"step": 980
},
{
"epoch": 2.963517305893358,
"grad_norm": 0.6772228156335088,
"learning_rate": 8e-06,
"loss": 0.4707,
"step": 990
},
{
"epoch": 2.9934518241347052,
"grad_norm": 0.677502927868203,
"learning_rate": 8e-06,
"loss": 0.472,
"step": 1000
},
{
"epoch": 2.999438727782975,
"eval_loss": 0.6610371470451355,
"eval_runtime": 518.6132,
"eval_samples_per_second": 17.356,
"eval_steps_per_second": 0.544,
"step": 1002
},
{
"epoch": 2.999438727782975,
"step": 1002,
"total_flos": 3818092983484416.0,
"train_loss": 0.545673328840328,
"train_runtime": 90899.0902,
"train_samples_per_second": 5.644,
"train_steps_per_second": 0.011
}
],
"logging_steps": 10,
"max_steps": 1002,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3818092983484416.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}