{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.999498746867168,
  "eval_steps": 500,
  "global_step": 2244,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.013366750208855471,
      "grad_norm": 2.4323846059062397,
      "learning_rate": 5e-06,
      "loss": 1.0521,
      "step": 10
    },
    {
      "epoch": 0.026733500417710943,
      "grad_norm": 1.2707159898783558,
      "learning_rate": 5e-06,
      "loss": 0.9449,
      "step": 20
    },
    {
      "epoch": 0.040100250626566414,
      "grad_norm": 0.6645760066182232,
      "learning_rate": 5e-06,
      "loss": 0.9205,
      "step": 30
    },
    {
      "epoch": 0.053467000835421885,
      "grad_norm": 0.6860381528425127,
      "learning_rate": 5e-06,
      "loss": 0.9062,
      "step": 40
    },
    {
      "epoch": 0.06683375104427736,
      "grad_norm": 0.8462056832267063,
      "learning_rate": 5e-06,
      "loss": 0.8941,
      "step": 50
    },
    {
      "epoch": 0.08020050125313283,
      "grad_norm": 0.5498617128094427,
      "learning_rate": 5e-06,
      "loss": 0.8866,
      "step": 60
    },
    {
      "epoch": 0.0935672514619883,
      "grad_norm": 0.6217303867910247,
      "learning_rate": 5e-06,
      "loss": 0.8719,
      "step": 70
    },
    {
      "epoch": 0.10693400167084377,
      "grad_norm": 0.687429978149511,
      "learning_rate": 5e-06,
      "loss": 0.876,
      "step": 80
    },
    {
      "epoch": 0.12030075187969924,
      "grad_norm": 0.7639829931940186,
      "learning_rate": 5e-06,
      "loss": 0.8704,
      "step": 90
    },
    {
      "epoch": 0.1336675020885547,
      "grad_norm": 0.5349974897408032,
      "learning_rate": 5e-06,
      "loss": 0.8677,
      "step": 100
    },
    {
      "epoch": 0.14703425229741018,
      "grad_norm": 0.6212381364086903,
      "learning_rate": 5e-06,
      "loss": 0.8624,
      "step": 110
    },
    {
      "epoch": 0.16040100250626566,
      "grad_norm": 0.5610901155787884,
      "learning_rate": 5e-06,
      "loss": 0.8621,
      "step": 120
    },
    {
      "epoch": 0.17376775271512113,
      "grad_norm": 0.6155926013296407,
      "learning_rate": 5e-06,
      "loss": 0.8582,
      "step": 130
    },
    {
      "epoch": 0.1871345029239766,
      "grad_norm": 0.6528571036607788,
      "learning_rate": 5e-06,
      "loss": 0.8532,
      "step": 140
    },
    {
      "epoch": 0.20050125313283207,
      "grad_norm": 0.5372075443842537,
      "learning_rate": 5e-06,
      "loss": 0.8492,
      "step": 150
    },
    {
      "epoch": 0.21386800334168754,
      "grad_norm": 0.7095829143035569,
      "learning_rate": 5e-06,
      "loss": 0.8494,
      "step": 160
    },
    {
      "epoch": 0.227234753550543,
      "grad_norm": 0.7745444177509586,
      "learning_rate": 5e-06,
      "loss": 0.8476,
      "step": 170
    },
    {
      "epoch": 0.24060150375939848,
      "grad_norm": 0.7586050901974903,
      "learning_rate": 5e-06,
      "loss": 0.8494,
      "step": 180
    },
    {
      "epoch": 0.25396825396825395,
      "grad_norm": 0.5964597569119979,
      "learning_rate": 5e-06,
      "loss": 0.8498,
      "step": 190
    },
    {
      "epoch": 0.2673350041771094,
      "grad_norm": 0.6293549963407589,
      "learning_rate": 5e-06,
      "loss": 0.8432,
      "step": 200
    },
    {
      "epoch": 0.2807017543859649,
      "grad_norm": 0.5524407679849426,
      "learning_rate": 5e-06,
      "loss": 0.8475,
      "step": 210
    },
    {
      "epoch": 0.29406850459482037,
      "grad_norm": 0.524350214049005,
      "learning_rate": 5e-06,
      "loss": 0.8431,
      "step": 220
    },
    {
      "epoch": 0.30743525480367584,
      "grad_norm": 0.6760002252683699,
      "learning_rate": 5e-06,
      "loss": 0.8386,
      "step": 230
    },
    {
      "epoch": 0.3208020050125313,
      "grad_norm": 0.5906902446596286,
      "learning_rate": 5e-06,
      "loss": 0.8349,
      "step": 240
    },
    {
      "epoch": 0.3341687552213868,
      "grad_norm": 0.5723926384792003,
      "learning_rate": 5e-06,
      "loss": 0.8361,
      "step": 250
    },
    {
      "epoch": 0.34753550543024225,
      "grad_norm": 0.5616096712561062,
      "learning_rate": 5e-06,
      "loss": 0.8368,
      "step": 260
    },
    {
      "epoch": 0.3609022556390977,
      "grad_norm": 0.5507735559959206,
      "learning_rate": 5e-06,
      "loss": 0.835,
      "step": 270
    },
    {
      "epoch": 0.3742690058479532,
      "grad_norm": 0.4803949597709757,
      "learning_rate": 5e-06,
      "loss": 0.8414,
      "step": 280
    },
    {
      "epoch": 0.38763575605680867,
      "grad_norm": 0.5121852118343002,
      "learning_rate": 5e-06,
      "loss": 0.8325,
      "step": 290
    },
    {
      "epoch": 0.40100250626566414,
      "grad_norm": 0.5559477754717894,
      "learning_rate": 5e-06,
      "loss": 0.8364,
      "step": 300
    },
    {
      "epoch": 0.4143692564745196,
      "grad_norm": 0.7469026400245374,
      "learning_rate": 5e-06,
      "loss": 0.8306,
      "step": 310
    },
    {
      "epoch": 0.4277360066833751,
      "grad_norm": 0.5090947427034287,
      "learning_rate": 5e-06,
      "loss": 0.8339,
      "step": 320
    },
    {
      "epoch": 0.44110275689223055,
      "grad_norm": 0.6018861983279394,
      "learning_rate": 5e-06,
      "loss": 0.8283,
      "step": 330
    },
    {
      "epoch": 0.454469507101086,
      "grad_norm": 0.5434521657719814,
      "learning_rate": 5e-06,
      "loss": 0.8285,
      "step": 340
    },
    {
      "epoch": 0.4678362573099415,
      "grad_norm": 0.5903702809830117,
      "learning_rate": 5e-06,
      "loss": 0.8324,
      "step": 350
    },
    {
      "epoch": 0.48120300751879697,
      "grad_norm": 0.6243867601355255,
      "learning_rate": 5e-06,
      "loss": 0.8284,
      "step": 360
    },
    {
      "epoch": 0.49456975772765244,
      "grad_norm": 0.6094144532555286,
      "learning_rate": 5e-06,
      "loss": 0.8283,
      "step": 370
    },
    {
      "epoch": 0.5079365079365079,
      "grad_norm": 0.5482360219270039,
      "learning_rate": 5e-06,
      "loss": 0.8289,
      "step": 380
    },
    {
      "epoch": 0.5213032581453634,
      "grad_norm": 0.5061542985510644,
      "learning_rate": 5e-06,
      "loss": 0.8317,
      "step": 390
    },
    {
      "epoch": 0.5346700083542189,
      "grad_norm": 0.6652440131533577,
      "learning_rate": 5e-06,
      "loss": 0.8256,
      "step": 400
    },
    {
      "epoch": 0.5480367585630743,
      "grad_norm": 0.5613018728699922,
      "learning_rate": 5e-06,
      "loss": 0.8252,
      "step": 410
    },
    {
      "epoch": 0.5614035087719298,
      "grad_norm": 0.7255190718604577,
      "learning_rate": 5e-06,
      "loss": 0.8247,
      "step": 420
    },
    {
      "epoch": 0.5747702589807853,
      "grad_norm": 0.6781380945175464,
      "learning_rate": 5e-06,
      "loss": 0.823,
      "step": 430
    },
    {
      "epoch": 0.5881370091896407,
      "grad_norm": 0.5530197743336887,
      "learning_rate": 5e-06,
      "loss": 0.8251,
      "step": 440
    },
    {
      "epoch": 0.6015037593984962,
      "grad_norm": 0.571851888660113,
      "learning_rate": 5e-06,
      "loss": 0.8232,
      "step": 450
    },
    {
      "epoch": 0.6148705096073517,
      "grad_norm": 0.5208791337420644,
      "learning_rate": 5e-06,
      "loss": 0.8235,
      "step": 460
    },
    {
      "epoch": 0.6282372598162071,
      "grad_norm": 0.5198842932978275,
      "learning_rate": 5e-06,
      "loss": 0.8238,
      "step": 470
    },
    {
      "epoch": 0.6416040100250626,
      "grad_norm": 0.48452315583166233,
      "learning_rate": 5e-06,
      "loss": 0.8221,
      "step": 480
    },
    {
      "epoch": 0.6549707602339181,
      "grad_norm": 0.5219240912238245,
      "learning_rate": 5e-06,
      "loss": 0.8168,
      "step": 490
    },
    {
      "epoch": 0.6683375104427736,
      "grad_norm": 0.51813285089071,
      "learning_rate": 5e-06,
      "loss": 0.8173,
      "step": 500
    },
    {
      "epoch": 0.681704260651629,
      "grad_norm": 0.49897768190410446,
      "learning_rate": 5e-06,
      "loss": 0.8193,
      "step": 510
    },
    {
      "epoch": 0.6950710108604845,
      "grad_norm": 0.546834157816808,
      "learning_rate": 5e-06,
      "loss": 0.8129,
      "step": 520
    },
    {
      "epoch": 0.70843776106934,
      "grad_norm": 0.5295360571693272,
      "learning_rate": 5e-06,
      "loss": 0.8194,
      "step": 530
    },
    {
      "epoch": 0.7218045112781954,
      "grad_norm": 0.6854942956404928,
      "learning_rate": 5e-06,
      "loss": 0.8193,
      "step": 540
    },
    {
      "epoch": 0.7351712614870509,
      "grad_norm": 0.6819748794747951,
      "learning_rate": 5e-06,
      "loss": 0.8161,
      "step": 550
    },
    {
      "epoch": 0.7485380116959064,
      "grad_norm": 0.7134808000164234,
      "learning_rate": 5e-06,
      "loss": 0.8166,
      "step": 560
    },
    {
      "epoch": 0.7619047619047619,
      "grad_norm": 0.6412479917820569,
      "learning_rate": 5e-06,
      "loss": 0.8172,
      "step": 570
    },
    {
      "epoch": 0.7752715121136173,
      "grad_norm": 0.5246142664617556,
      "learning_rate": 5e-06,
      "loss": 0.8145,
      "step": 580
    },
    {
      "epoch": 0.7886382623224728,
      "grad_norm": 0.588843604202556,
      "learning_rate": 5e-06,
      "loss": 0.82,
      "step": 590
    },
    {
      "epoch": 0.8020050125313283,
      "grad_norm": 0.5124861711768851,
      "learning_rate": 5e-06,
      "loss": 0.8156,
      "step": 600
    },
    {
      "epoch": 0.8153717627401837,
      "grad_norm": 0.5015203839251716,
      "learning_rate": 5e-06,
      "loss": 0.8191,
      "step": 610
    },
    {
      "epoch": 0.8287385129490392,
      "grad_norm": 0.6441893371422894,
      "learning_rate": 5e-06,
      "loss": 0.812,
      "step": 620
    },
    {
      "epoch": 0.8421052631578947,
      "grad_norm": 0.5838304398634407,
      "learning_rate": 5e-06,
      "loss": 0.8086,
      "step": 630
    },
    {
      "epoch": 0.8554720133667502,
      "grad_norm": 0.5107304906894905,
      "learning_rate": 5e-06,
      "loss": 0.8155,
      "step": 640
    },
    {
      "epoch": 0.8688387635756056,
      "grad_norm": 0.5122885155184959,
      "learning_rate": 5e-06,
      "loss": 0.8131,
      "step": 650
    },
    {
      "epoch": 0.8822055137844611,
      "grad_norm": 0.5985811394437027,
      "learning_rate": 5e-06,
      "loss": 0.8104,
      "step": 660
    },
    {
      "epoch": 0.8955722639933166,
      "grad_norm": 0.5323936368547137,
      "learning_rate": 5e-06,
      "loss": 0.8186,
      "step": 670
    },
    {
      "epoch": 0.908939014202172,
      "grad_norm": 0.616312309430872,
      "learning_rate": 5e-06,
      "loss": 0.8124,
      "step": 680
    },
    {
      "epoch": 0.9223057644110275,
      "grad_norm": 0.6593022396181776,
      "learning_rate": 5e-06,
      "loss": 0.8156,
      "step": 690
    },
    {
      "epoch": 0.935672514619883,
      "grad_norm": 0.5181097754729659,
      "learning_rate": 5e-06,
      "loss": 0.8135,
      "step": 700
    },
    {
      "epoch": 0.9490392648287385,
      "grad_norm": 0.5160202542043503,
      "learning_rate": 5e-06,
      "loss": 0.8108,
      "step": 710
    },
    {
      "epoch": 0.9624060150375939,
      "grad_norm": 0.5439429222609182,
      "learning_rate": 5e-06,
      "loss": 0.8098,
      "step": 720
    },
    {
      "epoch": 0.9757727652464494,
      "grad_norm": 0.5666778381149935,
      "learning_rate": 5e-06,
      "loss": 0.8064,
      "step": 730
    },
    {
      "epoch": 0.9891395154553049,
      "grad_norm": 0.5087008142559319,
      "learning_rate": 5e-06,
      "loss": 0.8124,
      "step": 740
    },
    {
      "epoch": 0.9998329156223893,
      "eval_loss": 0.8087860345840454,
      "eval_runtime": 793.9439,
      "eval_samples_per_second": 25.391,
      "eval_steps_per_second": 0.397,
      "step": 748
    },
    {
      "epoch": 1.0025062656641603,
      "grad_norm": 0.6885103061332264,
      "learning_rate": 5e-06,
      "loss": 0.8763,
      "step": 750
    },
    {
      "epoch": 1.0158730158730158,
      "grad_norm": 0.6156521836752095,
      "learning_rate": 5e-06,
      "loss": 0.7692,
      "step": 760
    },
    {
      "epoch": 1.0292397660818713,
      "grad_norm": 0.6134559509903806,
      "learning_rate": 5e-06,
      "loss": 0.7719,
      "step": 770
    },
    {
      "epoch": 1.0426065162907268,
      "grad_norm": 0.635583159755333,
      "learning_rate": 5e-06,
      "loss": 0.7724,
      "step": 780
    },
    {
      "epoch": 1.0559732664995822,
      "grad_norm": 0.5771840092558814,
      "learning_rate": 5e-06,
      "loss": 0.7724,
      "step": 790
    },
    {
      "epoch": 1.0693400167084377,
      "grad_norm": 0.5138399093282234,
      "learning_rate": 5e-06,
      "loss": 0.7671,
      "step": 800
    },
    {
      "epoch": 1.0827067669172932,
      "grad_norm": 0.5865180500219783,
      "learning_rate": 5e-06,
      "loss": 0.7741,
      "step": 810
    },
    {
      "epoch": 1.0960735171261486,
      "grad_norm": 0.5737059877569465,
      "learning_rate": 5e-06,
      "loss": 0.7735,
      "step": 820
    },
    {
      "epoch": 1.1094402673350041,
      "grad_norm": 0.7198057887439943,
      "learning_rate": 5e-06,
      "loss": 0.7715,
      "step": 830
    },
    {
      "epoch": 1.1228070175438596,
      "grad_norm": 0.723247678442899,
      "learning_rate": 5e-06,
      "loss": 0.7688,
      "step": 840
    },
    {
      "epoch": 1.136173767752715,
      "grad_norm": 0.5724777994659187,
      "learning_rate": 5e-06,
      "loss": 0.7709,
      "step": 850
    },
    {
      "epoch": 1.1495405179615705,
      "grad_norm": 0.6343455699124487,
      "learning_rate": 5e-06,
      "loss": 0.7756,
      "step": 860
    },
    {
      "epoch": 1.162907268170426,
      "grad_norm": 0.5975092244071976,
      "learning_rate": 5e-06,
      "loss": 0.7762,
      "step": 870
    },
    {
      "epoch": 1.1762740183792815,
      "grad_norm": 0.5550810138685736,
      "learning_rate": 5e-06,
      "loss": 0.7713,
      "step": 880
    },
    {
      "epoch": 1.189640768588137,
      "grad_norm": 0.6031833100946619,
      "learning_rate": 5e-06,
      "loss": 0.7717,
      "step": 890
    },
    {
      "epoch": 1.2030075187969924,
      "grad_norm": 0.5674692784021945,
      "learning_rate": 5e-06,
      "loss": 0.7714,
      "step": 900
    },
    {
      "epoch": 1.2163742690058479,
      "grad_norm": 0.6831373781930358,
      "learning_rate": 5e-06,
      "loss": 0.7727,
      "step": 910
    },
    {
      "epoch": 1.2297410192147034,
      "grad_norm": 0.517398562451772,
      "learning_rate": 5e-06,
      "loss": 0.7715,
      "step": 920
    },
    {
      "epoch": 1.2431077694235588,
      "grad_norm": 0.5689793551691444,
      "learning_rate": 5e-06,
      "loss": 0.7682,
      "step": 930
    },
    {
      "epoch": 1.2564745196324143,
      "grad_norm": 0.6979997189308218,
      "learning_rate": 5e-06,
      "loss": 0.7753,
      "step": 940
    },
    {
      "epoch": 1.2698412698412698,
      "grad_norm": 0.5431703707142987,
      "learning_rate": 5e-06,
      "loss": 0.7726,
      "step": 950
    },
    {
      "epoch": 1.2832080200501252,
      "grad_norm": 0.5341233588300426,
      "learning_rate": 5e-06,
      "loss": 0.7721,
      "step": 960
    },
    {
      "epoch": 1.2965747702589807,
      "grad_norm": 0.5621957425809071,
      "learning_rate": 5e-06,
      "loss": 0.7702,
      "step": 970
    },
    {
      "epoch": 1.3099415204678362,
      "grad_norm": 0.6187116295591158,
      "learning_rate": 5e-06,
      "loss": 0.7755,
      "step": 980
    },
    {
      "epoch": 1.3233082706766917,
      "grad_norm": 0.6251656247161459,
      "learning_rate": 5e-06,
      "loss": 0.7742,
      "step": 990
    },
    {
      "epoch": 1.3366750208855471,
      "grad_norm": 0.6092934361550684,
      "learning_rate": 5e-06,
      "loss": 0.7732,
      "step": 1000
    },
    {
      "epoch": 1.3500417710944026,
      "grad_norm": 0.8086073910477094,
      "learning_rate": 5e-06,
      "loss": 0.7663,
      "step": 1010
    },
    {
      "epoch": 1.363408521303258,
      "grad_norm": 0.6337909009600926,
      "learning_rate": 5e-06,
      "loss": 0.7698,
      "step": 1020
    },
    {
      "epoch": 1.3767752715121135,
      "grad_norm": 0.6156017975821142,
      "learning_rate": 5e-06,
      "loss": 0.7687,
      "step": 1030
    },
    {
      "epoch": 1.390142021720969,
      "grad_norm": 0.4791494199069362,
      "learning_rate": 5e-06,
      "loss": 0.7707,
      "step": 1040
    },
    {
      "epoch": 1.4035087719298245,
      "grad_norm": 0.5102907384647386,
      "learning_rate": 5e-06,
      "loss": 0.7698,
      "step": 1050
    },
    {
      "epoch": 1.41687552213868,
      "grad_norm": 0.60763231448239,
      "learning_rate": 5e-06,
      "loss": 0.7722,
      "step": 1060
    },
    {
      "epoch": 1.4302422723475354,
      "grad_norm": 0.5538961425736992,
      "learning_rate": 5e-06,
      "loss": 0.7769,
      "step": 1070
    },
    {
      "epoch": 1.443609022556391,
      "grad_norm": 0.511489662319519,
      "learning_rate": 5e-06,
      "loss": 0.7709,
      "step": 1080
    },
    {
      "epoch": 1.4569757727652464,
      "grad_norm": 0.5006381424370965,
      "learning_rate": 5e-06,
      "loss": 0.7652,
      "step": 1090
    },
    {
      "epoch": 1.4703425229741018,
      "grad_norm": 0.6446877306415851,
      "learning_rate": 5e-06,
      "loss": 0.7668,
      "step": 1100
    },
    {
      "epoch": 1.4837092731829573,
      "grad_norm": 0.6472792025046472,
      "learning_rate": 5e-06,
      "loss": 0.7748,
      "step": 1110
    },
    {
      "epoch": 1.4970760233918128,
      "grad_norm": 0.5297094594069526,
      "learning_rate": 5e-06,
      "loss": 0.7716,
      "step": 1120
    },
    {
      "epoch": 1.5104427736006683,
      "grad_norm": 0.5172754876638852,
      "learning_rate": 5e-06,
      "loss": 0.7693,
      "step": 1130
    },
    {
      "epoch": 1.5238095238095237,
      "grad_norm": 0.5499645842959932,
      "learning_rate": 5e-06,
      "loss": 0.7663,
      "step": 1140
    },
    {
      "epoch": 1.5371762740183792,
      "grad_norm": 0.5115786493746641,
      "learning_rate": 5e-06,
      "loss": 0.7707,
      "step": 1150
    },
    {
      "epoch": 1.5505430242272347,
      "grad_norm": 0.5733666230248589,
      "learning_rate": 5e-06,
      "loss": 0.7708,
      "step": 1160
    },
    {
      "epoch": 1.5639097744360901,
      "grad_norm": 0.4914243878129098,
      "learning_rate": 5e-06,
      "loss": 0.769,
      "step": 1170
    },
    {
      "epoch": 1.5772765246449456,
      "grad_norm": 0.5986514689445189,
      "learning_rate": 5e-06,
      "loss": 0.7722,
      "step": 1180
    },
    {
      "epoch": 1.590643274853801,
      "grad_norm": 0.49301214049058534,
      "learning_rate": 5e-06,
      "loss": 0.7709,
      "step": 1190
    },
    {
      "epoch": 1.6040100250626566,
      "grad_norm": 0.49122462674305145,
      "learning_rate": 5e-06,
      "loss": 0.7684,
      "step": 1200
    },
    {
      "epoch": 1.617376775271512,
      "grad_norm": 0.5231320343494373,
      "learning_rate": 5e-06,
      "loss": 0.773,
      "step": 1210
    },
    {
      "epoch": 1.6307435254803675,
      "grad_norm": 0.5974519524827527,
      "learning_rate": 5e-06,
      "loss": 0.7703,
      "step": 1220
    },
    {
      "epoch": 1.644110275689223,
      "grad_norm": 0.49755848059450075,
      "learning_rate": 5e-06,
      "loss": 0.7684,
      "step": 1230
    },
    {
      "epoch": 1.6574770258980784,
      "grad_norm": 0.49980350150699104,
      "learning_rate": 5e-06,
      "loss": 0.7648,
      "step": 1240
    },
    {
      "epoch": 1.670843776106934,
      "grad_norm": 0.660197673406872,
      "learning_rate": 5e-06,
      "loss": 0.7663,
      "step": 1250
    },
    {
      "epoch": 1.6842105263157894,
      "grad_norm": 0.501447743813946,
      "learning_rate": 5e-06,
      "loss": 0.7687,
      "step": 1260
    },
    {
      "epoch": 1.6975772765246449,
      "grad_norm": 0.47339053427865196,
      "learning_rate": 5e-06,
      "loss": 0.7677,
      "step": 1270
    },
    {
      "epoch": 1.7109440267335003,
      "grad_norm": 0.4776630843112484,
      "learning_rate": 5e-06,
      "loss": 0.7705,
      "step": 1280
    },
    {
      "epoch": 1.7243107769423558,
      "grad_norm": 0.5805611285838953,
      "learning_rate": 5e-06,
      "loss": 0.7664,
      "step": 1290
    },
    {
      "epoch": 1.7376775271512113,
      "grad_norm": 0.5589747352729452,
      "learning_rate": 5e-06,
      "loss": 0.7643,
      "step": 1300
    },
    {
      "epoch": 1.7510442773600667,
      "grad_norm": 0.5862892637271495,
      "learning_rate": 5e-06,
      "loss": 0.767,
      "step": 1310
    },
    {
      "epoch": 1.7644110275689222,
      "grad_norm": 0.6267084370944045,
      "learning_rate": 5e-06,
      "loss": 0.7701,
      "step": 1320
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 0.5590629149887701,
      "learning_rate": 5e-06,
      "loss": 0.7725,
      "step": 1330
    },
    {
      "epoch": 1.7911445279866332,
      "grad_norm": 0.589200505231269,
      "learning_rate": 5e-06,
      "loss": 0.768,
      "step": 1340
    },
    {
      "epoch": 1.8045112781954886,
      "grad_norm": 0.4948446583957624,
      "learning_rate": 5e-06,
      "loss": 0.7685,
      "step": 1350
    },
    {
      "epoch": 1.817878028404344,
      "grad_norm": 0.471229575382462,
      "learning_rate": 5e-06,
      "loss": 0.7685,
      "step": 1360
    },
    {
      "epoch": 1.8312447786131996,
      "grad_norm": 0.5347363048336566,
      "learning_rate": 5e-06,
      "loss": 0.7668,
      "step": 1370
    },
    {
      "epoch": 1.844611528822055,
      "grad_norm": 0.6085798758140744,
      "learning_rate": 5e-06,
      "loss": 0.7685,
      "step": 1380
    },
    {
      "epoch": 1.8579782790309105,
      "grad_norm": 0.49237779847072155,
      "learning_rate": 5e-06,
      "loss": 0.766,
      "step": 1390
    },
    {
      "epoch": 1.871345029239766,
      "grad_norm": 0.5429938063483495,
      "learning_rate": 5e-06,
      "loss": 0.7675,
      "step": 1400
    },
    {
      "epoch": 1.8847117794486214,
      "grad_norm": 0.5315522378087794,
      "learning_rate": 5e-06,
      "loss": 0.7651,
      "step": 1410
    },
    {
      "epoch": 1.898078529657477,
      "grad_norm": 0.5774851920268103,
      "learning_rate": 5e-06,
      "loss": 0.7683,
      "step": 1420
    },
    {
      "epoch": 1.9114452798663324,
      "grad_norm": 0.4774206459938876,
      "learning_rate": 5e-06,
      "loss": 0.7651,
      "step": 1430
    },
    {
      "epoch": 1.9248120300751879,
      "grad_norm": 0.48893280928600313,
      "learning_rate": 5e-06,
      "loss": 0.7664,
      "step": 1440
    },
    {
      "epoch": 1.9381787802840433,
      "grad_norm": 0.47709822943051283,
      "learning_rate": 5e-06,
      "loss": 0.7667,
      "step": 1450
    },
    {
      "epoch": 1.9515455304928988,
      "grad_norm": 0.5221458173728611,
      "learning_rate": 5e-06,
      "loss": 0.7649,
      "step": 1460
    },
    {
      "epoch": 1.9649122807017543,
      "grad_norm": 0.5458985479332612,
      "learning_rate": 5e-06,
      "loss": 0.7653,
      "step": 1470
    },
    {
      "epoch": 1.9782790309106097,
      "grad_norm": 0.5449151757658263,
      "learning_rate": 5e-06,
      "loss": 0.7665,
      "step": 1480
    },
    {
      "epoch": 1.9916457811194652,
      "grad_norm": 0.5792068417255367,
      "learning_rate": 5e-06,
      "loss": 0.7674,
      "step": 1490
    },
    {
      "epoch": 1.9996658312447786,
      "eval_loss": 0.7951143383979797,
      "eval_runtime": 795.386,
      "eval_samples_per_second": 25.345,
      "eval_steps_per_second": 0.396,
      "step": 1496
    },
    {
      "epoch": 2.0050125313283207,
      "grad_norm": 0.7521880602206925,
      "learning_rate": 5e-06,
      "loss": 0.8233,
      "step": 1500
    },
    {
      "epoch": 2.018379281537176,
      "grad_norm": 0.6560054074439666,
      "learning_rate": 5e-06,
      "loss": 0.7256,
      "step": 1510
    },
    {
      "epoch": 2.0317460317460316,
      "grad_norm": 0.5201512747130638,
      "learning_rate": 5e-06,
      "loss": 0.7218,
      "step": 1520
    },
    {
      "epoch": 2.045112781954887,
      "grad_norm": 0.5262590120532872,
      "learning_rate": 5e-06,
      "loss": 0.7285,
      "step": 1530
    },
    {
      "epoch": 2.0584795321637426,
      "grad_norm": 0.5393650388873087,
      "learning_rate": 5e-06,
      "loss": 0.7229,
      "step": 1540
    },
    {
      "epoch": 2.071846282372598,
      "grad_norm": 0.5105428821348765,
      "learning_rate": 5e-06,
      "loss": 0.7231,
      "step": 1550
    },
    {
      "epoch": 2.0852130325814535,
      "grad_norm": 0.6021970483052078,
      "learning_rate": 5e-06,
      "loss": 0.7239,
      "step": 1560
    },
    {
      "epoch": 2.098579782790309,
      "grad_norm": 0.5009099309313954,
      "learning_rate": 5e-06,
      "loss": 0.7226,
      "step": 1570
    },
    {
      "epoch": 2.1119465329991645,
      "grad_norm": 0.5605434690720502,
      "learning_rate": 5e-06,
      "loss": 0.7277,
      "step": 1580
    },
    {
      "epoch": 2.12531328320802,
      "grad_norm": 0.5732299598938305,
      "learning_rate": 5e-06,
      "loss": 0.7286,
      "step": 1590
    },
    {
      "epoch": 2.1386800334168754,
      "grad_norm": 0.5399334511302041,
      "learning_rate": 5e-06,
      "loss": 0.726,
      "step": 1600
    },
    {
      "epoch": 2.152046783625731,
      "grad_norm": 0.505832452848056,
      "learning_rate": 5e-06,
      "loss": 0.7304,
      "step": 1610
    },
    {
      "epoch": 2.1654135338345863,
      "grad_norm": 0.5674143618926153,
      "learning_rate": 5e-06,
      "loss": 0.7232,
      "step": 1620
    },
    {
      "epoch": 2.178780284043442,
      "grad_norm": 0.5068914103748654,
      "learning_rate": 5e-06,
      "loss": 0.7336,
      "step": 1630
    },
    {
      "epoch": 2.1921470342522973,
      "grad_norm": 0.5118320329600874,
      "learning_rate": 5e-06,
      "loss": 0.7255,
      "step": 1640
    },
    {
      "epoch": 2.2055137844611528,
      "grad_norm": 0.5156250232792499,
      "learning_rate": 5e-06,
      "loss": 0.7295,
      "step": 1650
    },
    {
      "epoch": 2.2188805346700082,
      "grad_norm": 0.6165225897496419,
      "learning_rate": 5e-06,
      "loss": 0.7274,
      "step": 1660
    },
    {
      "epoch": 2.2322472848788637,
      "grad_norm": 0.5863877720536036,
      "learning_rate": 5e-06,
      "loss": 0.7256,
      "step": 1670
    },
    {
      "epoch": 2.245614035087719,
      "grad_norm": 0.5641007704480012,
      "learning_rate": 5e-06,
      "loss": 0.7308,
      "step": 1680
    },
    {
      "epoch": 2.2589807852965746,
      "grad_norm": 0.6101312501534099,
      "learning_rate": 5e-06,
      "loss": 0.7314,
      "step": 1690
    },
    {
      "epoch": 2.27234753550543,
      "grad_norm": 0.5200998469176243,
      "learning_rate": 5e-06,
      "loss": 0.7275,
      "step": 1700
    },
    {
      "epoch": 2.2857142857142856,
      "grad_norm": 0.5398343134194046,
      "learning_rate": 5e-06,
      "loss": 0.727,
      "step": 1710
    },
    {
      "epoch": 2.299081035923141,
      "grad_norm": 0.5247712631574941,
      "learning_rate": 5e-06,
      "loss": 0.727,
      "step": 1720
    },
    {
      "epoch": 2.3124477861319965,
      "grad_norm": 0.5655985095958795,
      "learning_rate": 5e-06,
      "loss": 0.7286,
      "step": 1730
    },
    {
      "epoch": 2.325814536340852,
      "grad_norm": 0.5927409653328921,
      "learning_rate": 5e-06,
      "loss": 0.7271,
      "step": 1740
    },
    {
      "epoch": 2.3391812865497075,
      "grad_norm": 0.6148593425957483,
      "learning_rate": 5e-06,
      "loss": 0.733,
      "step": 1750
    },
    {
      "epoch": 2.352548036758563,
      "grad_norm": 0.5969831864554942,
      "learning_rate": 5e-06,
      "loss": 0.7302,
      "step": 1760
    },
    {
      "epoch": 2.3659147869674184,
      "grad_norm": 0.4985456007136878,
      "learning_rate": 5e-06,
      "loss": 0.7341,
      "step": 1770
    },
    {
      "epoch": 2.379281537176274,
      "grad_norm": 0.5005254522981937,
      "learning_rate": 5e-06,
      "loss": 0.7244,
      "step": 1780
    },
    {
      "epoch": 2.3926482873851294,
      "grad_norm": 0.5288709360617612,
      "learning_rate": 5e-06,
      "loss": 0.7312,
      "step": 1790
    },
    {
      "epoch": 2.406015037593985,
      "grad_norm": 0.5355584900475018,
      "learning_rate": 5e-06,
      "loss": 0.727,
      "step": 1800
    },
    {
      "epoch": 2.4193817878028403,
      "grad_norm": 0.5666733459714918,
      "learning_rate": 5e-06,
      "loss": 0.731,
      "step": 1810
    },
    {
      "epoch": 2.4327485380116958,
      "grad_norm": 0.5939862506331437,
      "learning_rate": 5e-06,
      "loss": 0.7292,
      "step": 1820
    },
    {
      "epoch": 2.4461152882205512,
      "grad_norm": 0.5696153125681646,
      "learning_rate": 5e-06,
      "loss": 0.7295,
      "step": 1830
    },
    {
      "epoch": 2.4594820384294067,
      "grad_norm": 0.5263801998302109,
      "learning_rate": 5e-06,
      "loss": 0.7289,
      "step": 1840
    },
    {
      "epoch": 2.472848788638262,
      "grad_norm": 0.5564137280433736,
      "learning_rate": 5e-06,
      "loss": 0.7289,
      "step": 1850
    },
    {
      "epoch": 2.4862155388471177,
      "grad_norm": 0.6117589560276474,
      "learning_rate": 5e-06,
      "loss": 0.7281,
      "step": 1860
    },
    {
      "epoch": 2.499582289055973,
      "grad_norm": 0.5556838242891475,
      "learning_rate": 5e-06,
      "loss": 0.7296,
      "step": 1870
    },
    {
      "epoch": 2.5129490392648286,
      "grad_norm": 0.4681598446789898,
      "learning_rate": 5e-06,
      "loss": 0.7296,
      "step": 1880
    },
    {
      "epoch": 2.526315789473684,
      "grad_norm": 0.5231611697501862,
      "learning_rate": 5e-06,
      "loss": 0.7303,
      "step": 1890
    },
    {
      "epoch": 2.5396825396825395,
      "grad_norm": 0.5126109088017671,
      "learning_rate": 5e-06,
      "loss": 0.7324,
      "step": 1900
    },
    {
      "epoch": 2.553049289891395,
      "grad_norm": 0.5300428577804921,
      "learning_rate": 5e-06,
      "loss": 0.7273,
      "step": 1910
    },
    {
      "epoch": 2.5664160401002505,
      "grad_norm": 0.4968055663040118,
      "learning_rate": 5e-06,
      "loss": 0.729,
      "step": 1920
    },
    {
      "epoch": 2.579782790309106,
      "grad_norm": 0.568494743059541,
      "learning_rate": 5e-06,
      "loss": 0.7269,
      "step": 1930
    },
    {
      "epoch": 2.5931495405179614,
      "grad_norm": 0.5482221484283202,
      "learning_rate": 5e-06,
      "loss": 0.7285,
      "step": 1940
    },
    {
      "epoch": 2.606516290726817,
      "grad_norm": 0.47129332867964935,
      "learning_rate": 5e-06,
      "loss": 0.7292,
      "step": 1950
    },
    {
      "epoch": 2.6198830409356724,
      "grad_norm": 0.5198836974979396,
      "learning_rate": 5e-06,
      "loss": 0.7264,
      "step": 1960
    },
    {
      "epoch": 2.633249791144528,
      "grad_norm": 0.4945939304862693,
      "learning_rate": 5e-06,
      "loss": 0.7279,
      "step": 1970
    },
    {
      "epoch": 2.6466165413533833,
      "grad_norm": 0.5751403403674279,
      "learning_rate": 5e-06,
      "loss": 0.7282,
      "step": 1980
    },
    {
      "epoch": 2.659983291562239,
      "grad_norm": 0.5611452949151137,
      "learning_rate": 5e-06,
      "loss": 0.7331,
      "step": 1990
    },
    {
      "epoch": 2.6733500417710943,
      "grad_norm": 0.6119128996618558,
      "learning_rate": 5e-06,
      "loss": 0.7296,
      "step": 2000
    },
    {
      "epoch": 2.6867167919799497,
      "grad_norm": 0.4799215562608329,
      "learning_rate": 5e-06,
      "loss": 0.7298,
      "step": 2010
    },
    {
      "epoch": 2.700083542188805,
      "grad_norm": 0.5541418078345739,
      "learning_rate": 5e-06,
      "loss": 0.7268,
      "step": 2020
    },
    {
      "epoch": 2.7134502923976607,
      "grad_norm": 0.6870311878219804,
      "learning_rate": 5e-06,
      "loss": 0.7277,
      "step": 2030
    },
    {
      "epoch": 2.726817042606516,
      "grad_norm": 0.5687894755714459,
      "learning_rate": 5e-06,
      "loss": 0.7298,
      "step": 2040
    },
    {
      "epoch": 2.7401837928153716,
      "grad_norm": 0.5330460246090263,
      "learning_rate": 5e-06,
      "loss": 0.7325,
      "step": 2050
    },
    {
      "epoch": 2.753550543024227,
      "grad_norm": 0.5427879116319339,
      "learning_rate": 5e-06,
      "loss": 0.7296,
      "step": 2060
    },
    {
      "epoch": 2.7669172932330826,
      "grad_norm": 0.6013738539276209,
      "learning_rate": 5e-06,
      "loss": 0.7281,
      "step": 2070
    },
    {
      "epoch": 2.780284043441938,
      "grad_norm": 0.6091854363964149,
      "learning_rate": 5e-06,
      "loss": 0.7294,
      "step": 2080
    },
    {
      "epoch": 2.7936507936507935,
      "grad_norm": 0.5190279913663577,
      "learning_rate": 5e-06,
      "loss": 0.7248,
      "step": 2090
    },
    {
      "epoch": 2.807017543859649,
      "grad_norm": 0.5126718278939274,
      "learning_rate": 5e-06,
      "loss": 0.7311,
      "step": 2100
    },
    {
      "epoch": 2.8203842940685044,
      "grad_norm": 0.5571607138857257,
      "learning_rate": 5e-06,
      "loss": 0.7318,
      "step": 2110
    },
    {
      "epoch": 2.83375104427736,
      "grad_norm": 0.5341175882686895,
      "learning_rate": 5e-06,
      "loss": 0.7336,
      "step": 2120
    },
    {
      "epoch": 2.8471177944862154,
      "grad_norm": 0.4817774606348232,
      "learning_rate": 5e-06,
      "loss": 0.731,
      "step": 2130
    },
    {
      "epoch": 2.860484544695071,
      "grad_norm": 0.5487220776810837,
      "learning_rate": 5e-06,
      "loss": 0.7282,
      "step": 2140
    },
    {
      "epoch": 2.8738512949039263,
      "grad_norm": 0.6342699103351254,
      "learning_rate": 5e-06,
      "loss": 0.7335,
      "step": 2150
    },
    {
      "epoch": 2.887218045112782,
      "grad_norm": 0.5078552425291176,
      "learning_rate": 5e-06,
      "loss": 0.7273,
      "step": 2160
    },
    {
      "epoch": 2.9005847953216373,
      "grad_norm": 0.4819316377635323,
      "learning_rate": 5e-06,
      "loss": 0.7332,
      "step": 2170
    },
    {
      "epoch": 2.9139515455304927,
      "grad_norm": 0.4627017239179797,
      "learning_rate": 5e-06,
      "loss": 0.7306,
      "step": 2180
    },
    {
      "epoch": 2.927318295739348,
      "grad_norm": 0.4761325291977869,
      "learning_rate": 5e-06,
      "loss": 0.7314,
      "step": 2190
    },
    {
      "epoch": 2.9406850459482037,
      "grad_norm": 0.5784029020001881,
      "learning_rate": 5e-06,
      "loss": 0.7298,
      "step": 2200
    },
    {
      "epoch": 2.954051796157059,
      "grad_norm": 0.5120822643666457,
      "learning_rate": 5e-06,
      "loss": 0.731,
      "step": 2210
    },
    {
      "epoch": 2.9674185463659146,
      "grad_norm": 0.5116915736315969,
      "learning_rate": 5e-06,
      "loss": 0.7322,
      "step": 2220
    },
    {
      "epoch": 2.98078529657477,
      "grad_norm": 0.5021133290964584,
      "learning_rate": 5e-06,
      "loss": 0.7269,
      "step": 2230
    },
    {
      "epoch": 2.9941520467836256,
      "grad_norm": 0.5317540745896701,
      "learning_rate": 5e-06,
      "loss": 0.7322,
      "step": 2240
    },
    {
      "epoch": 2.999498746867168,
      "eval_loss": 0.7926730513572693,
      "eval_runtime": 792.6639,
      "eval_samples_per_second": 25.432,
      "eval_steps_per_second": 0.397,
      "step": 2244
    },
    {
      "epoch": 2.999498746867168,
      "step": 2244,
      "total_flos": 3758574199111680.0,
      "train_loss": 0.7796513685780625,
      "train_runtime": 132137.1731,
      "train_samples_per_second": 8.696,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 10,
  "max_steps": 2244,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3758574199111680.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}