diff --git "a/checkpoint-1200/trainer_state.json" "b/checkpoint-1200/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1200/trainer_state.json" @@ -0,0 +1,8826 @@ +{ + "best_metric": 0.0031958832405507565, + "best_model_checkpoint": "/home/paperspace/Data/models/akash_unifo_757/llm3br256/checkpoint-1200", + "epoch": 3.878787878787879, + "eval_steps": 25, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032323232323232323, + "grad_norm": 0.11503094434738159, + "learning_rate": 1.2936610608020701e-07, + "loss": 0.0131, + "step": 1 + }, + { + "epoch": 0.006464646464646465, + "grad_norm": 0.13206231594085693, + "learning_rate": 2.5873221216041403e-07, + "loss": 0.0155, + "step": 2 + }, + { + "epoch": 0.009696969696969697, + "grad_norm": 0.1161206066608429, + "learning_rate": 3.8809831824062096e-07, + "loss": 0.0245, + "step": 3 + }, + { + "epoch": 0.01292929292929293, + "grad_norm": 0.1274755299091339, + "learning_rate": 5.174644243208281e-07, + "loss": 0.0127, + "step": 4 + }, + { + "epoch": 0.01616161616161616, + "grad_norm": 0.12534309923648834, + "learning_rate": 6.468305304010349e-07, + "loss": 0.0137, + "step": 5 + }, + { + "epoch": 0.019393939393939394, + "grad_norm": 0.11610464006662369, + "learning_rate": 7.761966364812419e-07, + "loss": 0.0153, + "step": 6 + }, + { + "epoch": 0.022626262626262626, + "grad_norm": 0.10964969545602798, + "learning_rate": 9.055627425614489e-07, + "loss": 0.0205, + "step": 7 + }, + { + "epoch": 0.02585858585858586, + "grad_norm": 0.11367467045783997, + "learning_rate": 1.0349288486416561e-06, + "loss": 0.0167, + "step": 8 + }, + { + "epoch": 0.02909090909090909, + "grad_norm": 0.12127836048603058, + "learning_rate": 1.164294954721863e-06, + "loss": 0.0155, + "step": 9 + }, + { + "epoch": 0.03232323232323232, + "grad_norm": 0.12462533265352249, + "learning_rate": 1.2936610608020699e-06, + "loss": 0.0169, + "step": 10 + }, + { + "epoch": 0.035555555555555556, + "grad_norm": 0.13044221699237823, + "learning_rate": 1.423027166882277e-06, + "loss": 0.0104, + "step": 11 + }, + { + "epoch": 0.03878787878787879, + "grad_norm": 0.08799455314874649, + "learning_rate": 1.5523932729624839e-06, + "loss": 0.021, + "step": 12 + }, + { + "epoch": 0.04202020202020202, + "grad_norm": 0.09403660893440247, + "learning_rate": 1.6817593790426907e-06, + "loss": 0.0151, + "step": 13 + }, + { + "epoch": 0.04525252525252525, + "grad_norm": 0.07015115022659302, + "learning_rate": 1.8111254851228978e-06, + "loss": 0.0157, + "step": 14 + }, + { + "epoch": 0.048484848484848485, + "grad_norm": 0.07983457297086716, + "learning_rate": 1.9404915912031045e-06, + "loss": 0.0091, + "step": 15 + }, + { + "epoch": 0.05171717171717172, + "grad_norm": 0.06726181507110596, + "learning_rate": 2.0698576972833122e-06, + "loss": 0.0147, + "step": 16 + }, + { + "epoch": 0.05494949494949495, + "grad_norm": 0.06473492830991745, + "learning_rate": 2.199223803363519e-06, + "loss": 0.0091, + "step": 17 + }, + { + "epoch": 0.05818181818181818, + "grad_norm": 0.051432088017463684, + "learning_rate": 2.328589909443726e-06, + "loss": 0.0129, + "step": 18 + }, + { + "epoch": 0.061414141414141414, + "grad_norm": 0.048174694180488586, + "learning_rate": 2.457956015523933e-06, + "loss": 0.0119, + "step": 19 + }, + { + "epoch": 0.06464646464646465, + "grad_norm": 0.04263145476579666, + "learning_rate": 2.5873221216041398e-06, + "loss": 0.0112, + "step": 20 + }, + { + "epoch": 0.06787878787878789, + "grad_norm": 0.03820733353495598, + "learning_rate": 2.716688227684347e-06, + "loss": 0.0091, + "step": 21 + }, + { + "epoch": 0.07111111111111111, + "grad_norm": 0.03781511262059212, + "learning_rate": 2.846054333764554e-06, + "loss": 0.0103, + "step": 22 + }, + { + "epoch": 0.07434343434343435, + "grad_norm": 0.03252599388360977, + "learning_rate": 2.975420439844761e-06, + "loss": 0.01, + "step": 23 + }, + { + "epoch": 0.07757575757575758, + "grad_norm": 0.03773924708366394, + "learning_rate": 3.1047865459249677e-06, + "loss": 0.0115, + "step": 24 + }, + { + "epoch": 0.08080808080808081, + "grad_norm": 0.03268961235880852, + "learning_rate": 3.234152652005175e-06, + "loss": 0.0133, + "step": 25 + }, + { + "epoch": 0.08080808080808081, + "eval_loss": 0.01230227667838335, + "eval_runtime": 21.2392, + "eval_samples_per_second": 4.708, + "eval_steps_per_second": 1.177, + "step": 25 + }, + { + "epoch": 0.08404040404040404, + "grad_norm": 0.02927996963262558, + "learning_rate": 3.3635187580853815e-06, + "loss": 0.0092, + "step": 26 + }, + { + "epoch": 0.08727272727272728, + "grad_norm": 0.039579834789037704, + "learning_rate": 3.492884864165589e-06, + "loss": 0.0163, + "step": 27 + }, + { + "epoch": 0.0905050505050505, + "grad_norm": 0.024472083896398544, + "learning_rate": 3.6222509702457957e-06, + "loss": 0.0067, + "step": 28 + }, + { + "epoch": 0.09373737373737374, + "grad_norm": 0.03290821984410286, + "learning_rate": 3.751617076326003e-06, + "loss": 0.0127, + "step": 29 + }, + { + "epoch": 0.09696969696969697, + "grad_norm": 0.03189826384186745, + "learning_rate": 3.880983182406209e-06, + "loss": 0.0096, + "step": 30 + }, + { + "epoch": 0.10020202020202021, + "grad_norm": 0.030306048691272736, + "learning_rate": 4.010349288486417e-06, + "loss": 0.0082, + "step": 31 + }, + { + "epoch": 0.10343434343434343, + "grad_norm": 0.03980877622961998, + "learning_rate": 4.1397153945666245e-06, + "loss": 0.0133, + "step": 32 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 0.030887724831700325, + "learning_rate": 4.2690815006468305e-06, + "loss": 0.0074, + "step": 33 + }, + { + "epoch": 0.1098989898989899, + "grad_norm": 0.027738964185118675, + "learning_rate": 4.398447606727038e-06, + "loss": 0.0069, + "step": 34 + }, + { + "epoch": 0.11313131313131314, + "grad_norm": 0.03853800520300865, + "learning_rate": 4.527813712807244e-06, + "loss": 0.0128, + "step": 35 + }, + { + "epoch": 0.11636363636363636, + "grad_norm": 0.028064019978046417, + "learning_rate": 4.657179818887452e-06, + "loss": 0.0091, + "step": 36 + }, + { + "epoch": 0.1195959595959596, + "grad_norm": 0.026782216504216194, + "learning_rate": 4.786545924967659e-06, + "loss": 0.0071, + "step": 37 + }, + { + "epoch": 0.12282828282828283, + "grad_norm": 0.017248690128326416, + "learning_rate": 4.915912031047866e-06, + "loss": 0.0033, + "step": 38 + }, + { + "epoch": 0.12606060606060607, + "grad_norm": 0.0254344642162323, + "learning_rate": 5.045278137128073e-06, + "loss": 0.0076, + "step": 39 + }, + { + "epoch": 0.1292929292929293, + "grad_norm": 0.02050536312162876, + "learning_rate": 5.1746442432082795e-06, + "loss": 0.0072, + "step": 40 + }, + { + "epoch": 0.13252525252525252, + "grad_norm": 0.02892652526497841, + "learning_rate": 5.304010349288486e-06, + "loss": 0.0086, + "step": 41 + }, + { + "epoch": 0.13575757575757577, + "grad_norm": 0.025958560407161713, + "learning_rate": 5.433376455368694e-06, + "loss": 0.0092, + "step": 42 + }, + { + "epoch": 0.138989898989899, + "grad_norm": 0.026592133566737175, + "learning_rate": 5.5627425614489e-06, + "loss": 0.0077, + "step": 43 + }, + { + "epoch": 0.14222222222222222, + "grad_norm": 0.016826845705509186, + "learning_rate": 5.692108667529108e-06, + "loss": 0.0048, + "step": 44 + }, + { + "epoch": 0.14545454545454545, + "grad_norm": 0.02611837536096573, + "learning_rate": 5.821474773609315e-06, + "loss": 0.0103, + "step": 45 + }, + { + "epoch": 0.1486868686868687, + "grad_norm": 0.018496474251151085, + "learning_rate": 5.950840879689522e-06, + "loss": 0.0046, + "step": 46 + }, + { + "epoch": 0.15191919191919193, + "grad_norm": 0.016068018972873688, + "learning_rate": 6.0802069857697286e-06, + "loss": 0.0068, + "step": 47 + }, + { + "epoch": 0.15515151515151515, + "grad_norm": 0.023762917146086693, + "learning_rate": 6.2095730918499354e-06, + "loss": 0.0078, + "step": 48 + }, + { + "epoch": 0.15838383838383838, + "grad_norm": 0.022558651864528656, + "learning_rate": 6.338939197930142e-06, + "loss": 0.0079, + "step": 49 + }, + { + "epoch": 0.16161616161616163, + "grad_norm": 0.02249864488840103, + "learning_rate": 6.46830530401035e-06, + "loss": 0.0056, + "step": 50 + }, + { + "epoch": 0.16161616161616163, + "eval_loss": 0.009845715016126633, + "eval_runtime": 18.6136, + "eval_samples_per_second": 5.372, + "eval_steps_per_second": 1.343, + "step": 50 + }, + { + "epoch": 0.16484848484848486, + "grad_norm": 0.01815730892121792, + "learning_rate": 6.597671410090557e-06, + "loss": 0.0053, + "step": 51 + }, + { + "epoch": 0.16808080808080808, + "grad_norm": 0.023437149822711945, + "learning_rate": 6.727037516170763e-06, + "loss": 0.0089, + "step": 52 + }, + { + "epoch": 0.1713131313131313, + "grad_norm": 0.02191866748034954, + "learning_rate": 6.856403622250971e-06, + "loss": 0.0116, + "step": 53 + }, + { + "epoch": 0.17454545454545456, + "grad_norm": 0.018593238666653633, + "learning_rate": 6.985769728331178e-06, + "loss": 0.0067, + "step": 54 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 0.019710399210453033, + "learning_rate": 7.115135834411385e-06, + "loss": 0.0065, + "step": 55 + }, + { + "epoch": 0.181010101010101, + "grad_norm": 0.01702931523323059, + "learning_rate": 7.244501940491591e-06, + "loss": 0.0054, + "step": 56 + }, + { + "epoch": 0.18424242424242424, + "grad_norm": 0.020323824137449265, + "learning_rate": 7.373868046571798e-06, + "loss": 0.0072, + "step": 57 + }, + { + "epoch": 0.1874747474747475, + "grad_norm": 0.022320428863167763, + "learning_rate": 7.503234152652006e-06, + "loss": 0.0107, + "step": 58 + }, + { + "epoch": 0.1907070707070707, + "grad_norm": 0.01765402965247631, + "learning_rate": 7.632600258732213e-06, + "loss": 0.0055, + "step": 59 + }, + { + "epoch": 0.19393939393939394, + "grad_norm": 0.019662927836179733, + "learning_rate": 7.761966364812418e-06, + "loss": 0.0073, + "step": 60 + }, + { + "epoch": 0.19717171717171716, + "grad_norm": 0.016214415431022644, + "learning_rate": 7.891332470892627e-06, + "loss": 0.0083, + "step": 61 + }, + { + "epoch": 0.20040404040404042, + "grad_norm": 0.015475176274776459, + "learning_rate": 8.020698576972833e-06, + "loss": 0.0075, + "step": 62 + }, + { + "epoch": 0.20363636363636364, + "grad_norm": 0.016818247735500336, + "learning_rate": 8.15006468305304e-06, + "loss": 0.0081, + "step": 63 + }, + { + "epoch": 0.20686868686868687, + "grad_norm": 0.018314030021429062, + "learning_rate": 8.279430789133249e-06, + "loss": 0.0064, + "step": 64 + }, + { + "epoch": 0.2101010101010101, + "grad_norm": 0.021758757531642914, + "learning_rate": 8.408796895213454e-06, + "loss": 0.0105, + "step": 65 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 0.01629498228430748, + "learning_rate": 8.538163001293661e-06, + "loss": 0.0062, + "step": 66 + }, + { + "epoch": 0.21656565656565657, + "grad_norm": 0.017469746991991997, + "learning_rate": 8.66752910737387e-06, + "loss": 0.0058, + "step": 67 + }, + { + "epoch": 0.2197979797979798, + "grad_norm": 0.016880586743354797, + "learning_rate": 8.796895213454076e-06, + "loss": 0.0093, + "step": 68 + }, + { + "epoch": 0.22303030303030302, + "grad_norm": 0.020206857472658157, + "learning_rate": 8.926261319534282e-06, + "loss": 0.0068, + "step": 69 + }, + { + "epoch": 0.22626262626262628, + "grad_norm": 0.018911803141236305, + "learning_rate": 9.055627425614489e-06, + "loss": 0.008, + "step": 70 + }, + { + "epoch": 0.2294949494949495, + "grad_norm": 0.017788654193282127, + "learning_rate": 9.184993531694697e-06, + "loss": 0.0088, + "step": 71 + }, + { + "epoch": 0.23272727272727273, + "grad_norm": 0.013764932751655579, + "learning_rate": 9.314359637774904e-06, + "loss": 0.0053, + "step": 72 + }, + { + "epoch": 0.23595959595959595, + "grad_norm": 0.016450250521302223, + "learning_rate": 9.44372574385511e-06, + "loss": 0.0074, + "step": 73 + }, + { + "epoch": 0.2391919191919192, + "grad_norm": 0.02177259512245655, + "learning_rate": 9.573091849935318e-06, + "loss": 0.0119, + "step": 74 + }, + { + "epoch": 0.24242424242424243, + "grad_norm": 0.019282735884189606, + "learning_rate": 9.702457956015525e-06, + "loss": 0.0113, + "step": 75 + }, + { + "epoch": 0.24242424242424243, + "eval_loss": 0.00841269176453352, + "eval_runtime": 18.5455, + "eval_samples_per_second": 5.392, + "eval_steps_per_second": 1.348, + "step": 75 + }, + { + "epoch": 0.24565656565656566, + "grad_norm": 0.020014960318803787, + "learning_rate": 9.831824062095732e-06, + "loss": 0.0097, + "step": 76 + }, + { + "epoch": 0.24888888888888888, + "grad_norm": 0.021017547696828842, + "learning_rate": 9.961190168175938e-06, + "loss": 0.0085, + "step": 77 + }, + { + "epoch": 0.25212121212121213, + "grad_norm": 0.016342103481292725, + "learning_rate": 1.0090556274256145e-05, + "loss": 0.0065, + "step": 78 + }, + { + "epoch": 0.25535353535353533, + "grad_norm": 0.020554421469569206, + "learning_rate": 1.0219922380336352e-05, + "loss": 0.0092, + "step": 79 + }, + { + "epoch": 0.2585858585858586, + "grad_norm": 0.011228547431528568, + "learning_rate": 1.0349288486416559e-05, + "loss": 0.005, + "step": 80 + }, + { + "epoch": 0.26181818181818184, + "grad_norm": 0.01503695547580719, + "learning_rate": 1.0478654592496766e-05, + "loss": 0.0056, + "step": 81 + }, + { + "epoch": 0.26505050505050504, + "grad_norm": 0.016742516309022903, + "learning_rate": 1.0608020698576973e-05, + "loss": 0.0067, + "step": 82 + }, + { + "epoch": 0.2682828282828283, + "grad_norm": 0.015310220420360565, + "learning_rate": 1.073738680465718e-05, + "loss": 0.0084, + "step": 83 + }, + { + "epoch": 0.27151515151515154, + "grad_norm": 0.01845848560333252, + "learning_rate": 1.0866752910737388e-05, + "loss": 0.0069, + "step": 84 + }, + { + "epoch": 0.27474747474747474, + "grad_norm": 0.014536215923726559, + "learning_rate": 1.0996119016817593e-05, + "loss": 0.0057, + "step": 85 + }, + { + "epoch": 0.277979797979798, + "grad_norm": 0.016418730840086937, + "learning_rate": 1.11254851228978e-05, + "loss": 0.0064, + "step": 86 + }, + { + "epoch": 0.2812121212121212, + "grad_norm": 0.01798829808831215, + "learning_rate": 1.1254851228978009e-05, + "loss": 0.0048, + "step": 87 + }, + { + "epoch": 0.28444444444444444, + "grad_norm": 0.017619946971535683, + "learning_rate": 1.1384217335058216e-05, + "loss": 0.0112, + "step": 88 + }, + { + "epoch": 0.2876767676767677, + "grad_norm": 0.017026174813508987, + "learning_rate": 1.1513583441138421e-05, + "loss": 0.005, + "step": 89 + }, + { + "epoch": 0.2909090909090909, + "grad_norm": 0.019230445846915245, + "learning_rate": 1.164294954721863e-05, + "loss": 0.0108, + "step": 90 + }, + { + "epoch": 0.29414141414141415, + "grad_norm": 0.014893417246639729, + "learning_rate": 1.1772315653298836e-05, + "loss": 0.0067, + "step": 91 + }, + { + "epoch": 0.2973737373737374, + "grad_norm": 0.01982959918677807, + "learning_rate": 1.1901681759379043e-05, + "loss": 0.0092, + "step": 92 + }, + { + "epoch": 0.3006060606060606, + "grad_norm": 0.016334552317857742, + "learning_rate": 1.203104786545925e-05, + "loss": 0.0042, + "step": 93 + }, + { + "epoch": 0.30383838383838385, + "grad_norm": 0.012507308274507523, + "learning_rate": 1.2160413971539457e-05, + "loss": 0.0059, + "step": 94 + }, + { + "epoch": 0.30707070707070705, + "grad_norm": 0.014092699624598026, + "learning_rate": 1.2289780077619664e-05, + "loss": 0.0075, + "step": 95 + }, + { + "epoch": 0.3103030303030303, + "grad_norm": 0.01649198681116104, + "learning_rate": 1.2419146183699871e-05, + "loss": 0.0078, + "step": 96 + }, + { + "epoch": 0.31353535353535356, + "grad_norm": 0.017579909414052963, + "learning_rate": 1.254851228978008e-05, + "loss": 0.0067, + "step": 97 + }, + { + "epoch": 0.31676767676767675, + "grad_norm": 0.01603030413389206, + "learning_rate": 1.2677878395860285e-05, + "loss": 0.0056, + "step": 98 + }, + { + "epoch": 0.32, + "grad_norm": 0.01683826372027397, + "learning_rate": 1.2807244501940493e-05, + "loss": 0.0087, + "step": 99 + }, + { + "epoch": 0.32323232323232326, + "grad_norm": 0.01677022874355316, + "learning_rate": 1.29366106080207e-05, + "loss": 0.0083, + "step": 100 + }, + { + "epoch": 0.32323232323232326, + "eval_loss": 0.00736634898930788, + "eval_runtime": 18.5672, + "eval_samples_per_second": 5.386, + "eval_steps_per_second": 1.346, + "step": 100 + }, + { + "epoch": 0.32646464646464646, + "grad_norm": 0.014008735306560993, + "learning_rate": 1.3065976714100905e-05, + "loss": 0.0043, + "step": 101 + }, + { + "epoch": 0.3296969696969697, + "grad_norm": 0.010679924860596657, + "learning_rate": 1.3195342820181114e-05, + "loss": 0.0034, + "step": 102 + }, + { + "epoch": 0.3329292929292929, + "grad_norm": 0.01666383258998394, + "learning_rate": 1.332470892626132e-05, + "loss": 0.0079, + "step": 103 + }, + { + "epoch": 0.33616161616161616, + "grad_norm": 0.016743313521146774, + "learning_rate": 1.3454075032341526e-05, + "loss": 0.005, + "step": 104 + }, + { + "epoch": 0.3393939393939394, + "grad_norm": 0.02069983258843422, + "learning_rate": 1.3583441138421735e-05, + "loss": 0.0103, + "step": 105 + }, + { + "epoch": 0.3426262626262626, + "grad_norm": 0.01835963875055313, + "learning_rate": 1.3712807244501941e-05, + "loss": 0.0048, + "step": 106 + }, + { + "epoch": 0.34585858585858587, + "grad_norm": 0.019786424934864044, + "learning_rate": 1.384217335058215e-05, + "loss": 0.0143, + "step": 107 + }, + { + "epoch": 0.3490909090909091, + "grad_norm": 0.019772058352828026, + "learning_rate": 1.3971539456662355e-05, + "loss": 0.009, + "step": 108 + }, + { + "epoch": 0.3523232323232323, + "grad_norm": 0.018820039927959442, + "learning_rate": 1.4100905562742562e-05, + "loss": 0.0103, + "step": 109 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 0.01463607419282198, + "learning_rate": 1.423027166882277e-05, + "loss": 0.0064, + "step": 110 + }, + { + "epoch": 0.35878787878787877, + "grad_norm": 0.015627332031726837, + "learning_rate": 1.4359637774902976e-05, + "loss": 0.0064, + "step": 111 + }, + { + "epoch": 0.362020202020202, + "grad_norm": 0.014721403829753399, + "learning_rate": 1.4489003880983183e-05, + "loss": 0.0043, + "step": 112 + }, + { + "epoch": 0.3652525252525253, + "grad_norm": 0.015796532854437828, + "learning_rate": 1.4618369987063391e-05, + "loss": 0.0072, + "step": 113 + }, + { + "epoch": 0.36848484848484847, + "grad_norm": 0.0158319603651762, + "learning_rate": 1.4747736093143596e-05, + "loss": 0.0059, + "step": 114 + }, + { + "epoch": 0.3717171717171717, + "grad_norm": 0.015959493815898895, + "learning_rate": 1.4877102199223805e-05, + "loss": 0.0051, + "step": 115 + }, + { + "epoch": 0.374949494949495, + "grad_norm": 0.016709132120013237, + "learning_rate": 1.5006468305304012e-05, + "loss": 0.0077, + "step": 116 + }, + { + "epoch": 0.3781818181818182, + "grad_norm": 0.02095826528966427, + "learning_rate": 1.5135834411384217e-05, + "loss": 0.012, + "step": 117 + }, + { + "epoch": 0.3814141414141414, + "grad_norm": 0.012002995237708092, + "learning_rate": 1.5265200517464426e-05, + "loss": 0.0048, + "step": 118 + }, + { + "epoch": 0.3846464646464646, + "grad_norm": 0.019866062328219414, + "learning_rate": 1.5394566623544633e-05, + "loss": 0.0137, + "step": 119 + }, + { + "epoch": 0.3878787878787879, + "grad_norm": 0.013328887522220612, + "learning_rate": 1.5523932729624836e-05, + "loss": 0.0042, + "step": 120 + }, + { + "epoch": 0.39111111111111113, + "grad_norm": 0.012495558708906174, + "learning_rate": 1.5653298835705046e-05, + "loss": 0.0029, + "step": 121 + }, + { + "epoch": 0.39434343434343433, + "grad_norm": 0.01434282772243023, + "learning_rate": 1.5782664941785253e-05, + "loss": 0.0043, + "step": 122 + }, + { + "epoch": 0.3975757575757576, + "grad_norm": 0.019523974508047104, + "learning_rate": 1.591203104786546e-05, + "loss": 0.0098, + "step": 123 + }, + { + "epoch": 0.40080808080808084, + "grad_norm": 0.013115576468408108, + "learning_rate": 1.6041397153945667e-05, + "loss": 0.0038, + "step": 124 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.016748910769820213, + "learning_rate": 1.6170763260025874e-05, + "loss": 0.0086, + "step": 125 + }, + { + "epoch": 0.40404040404040403, + "eval_loss": 0.006461107637733221, + "eval_runtime": 18.517, + "eval_samples_per_second": 5.4, + "eval_steps_per_second": 1.35, + "step": 125 + }, + { + "epoch": 0.4072727272727273, + "grad_norm": 0.013579982332885265, + "learning_rate": 1.630012936610608e-05, + "loss": 0.0054, + "step": 126 + }, + { + "epoch": 0.4105050505050505, + "grad_norm": 0.0175962932407856, + "learning_rate": 1.6429495472186288e-05, + "loss": 0.0058, + "step": 127 + }, + { + "epoch": 0.41373737373737374, + "grad_norm": 0.014661675319075584, + "learning_rate": 1.6558861578266498e-05, + "loss": 0.0067, + "step": 128 + }, + { + "epoch": 0.416969696969697, + "grad_norm": 0.016726892441511154, + "learning_rate": 1.66882276843467e-05, + "loss": 0.0079, + "step": 129 + }, + { + "epoch": 0.4202020202020202, + "grad_norm": 0.012960881926119328, + "learning_rate": 1.6817593790426908e-05, + "loss": 0.0053, + "step": 130 + }, + { + "epoch": 0.42343434343434344, + "grad_norm": 0.019023625180125237, + "learning_rate": 1.694695989650712e-05, + "loss": 0.0097, + "step": 131 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 0.018510041758418083, + "learning_rate": 1.7076326002587322e-05, + "loss": 0.0053, + "step": 132 + }, + { + "epoch": 0.4298989898989899, + "grad_norm": 0.016019567847251892, + "learning_rate": 1.720569210866753e-05, + "loss": 0.0063, + "step": 133 + }, + { + "epoch": 0.43313131313131314, + "grad_norm": 0.01722690835595131, + "learning_rate": 1.733505821474774e-05, + "loss": 0.0063, + "step": 134 + }, + { + "epoch": 0.43636363636363634, + "grad_norm": 0.015361025929450989, + "learning_rate": 1.7464424320827943e-05, + "loss": 0.0037, + "step": 135 + }, + { + "epoch": 0.4395959595959596, + "grad_norm": 0.014419769868254662, + "learning_rate": 1.7593790426908153e-05, + "loss": 0.0046, + "step": 136 + }, + { + "epoch": 0.44282828282828285, + "grad_norm": 0.01369231566786766, + "learning_rate": 1.7723156532988356e-05, + "loss": 0.0039, + "step": 137 + }, + { + "epoch": 0.44606060606060605, + "grad_norm": 0.01594926416873932, + "learning_rate": 1.7852522639068563e-05, + "loss": 0.0037, + "step": 138 + }, + { + "epoch": 0.4492929292929293, + "grad_norm": 0.01753450371325016, + "learning_rate": 1.7981888745148774e-05, + "loss": 0.0063, + "step": 139 + }, + { + "epoch": 0.45252525252525255, + "grad_norm": 0.012597791850566864, + "learning_rate": 1.8111254851228977e-05, + "loss": 0.0039, + "step": 140 + }, + { + "epoch": 0.45575757575757575, + "grad_norm": 0.015224005095660686, + "learning_rate": 1.8240620957309184e-05, + "loss": 0.008, + "step": 141 + }, + { + "epoch": 0.458989898989899, + "grad_norm": 0.01901715062558651, + "learning_rate": 1.8369987063389394e-05, + "loss": 0.0104, + "step": 142 + }, + { + "epoch": 0.4622222222222222, + "grad_norm": 0.014205937273800373, + "learning_rate": 1.8499353169469598e-05, + "loss": 0.006, + "step": 143 + }, + { + "epoch": 0.46545454545454545, + "grad_norm": 0.017999831587076187, + "learning_rate": 1.8628719275549808e-05, + "loss": 0.0067, + "step": 144 + }, + { + "epoch": 0.4686868686868687, + "grad_norm": 0.01627945713698864, + "learning_rate": 1.8758085381630015e-05, + "loss": 0.0066, + "step": 145 + }, + { + "epoch": 0.4719191919191919, + "grad_norm": 0.015483185648918152, + "learning_rate": 1.888745148771022e-05, + "loss": 0.0048, + "step": 146 + }, + { + "epoch": 0.47515151515151516, + "grad_norm": 0.01489502564072609, + "learning_rate": 1.901681759379043e-05, + "loss": 0.0042, + "step": 147 + }, + { + "epoch": 0.4783838383838384, + "grad_norm": 0.013736383058130741, + "learning_rate": 1.9146183699870636e-05, + "loss": 0.0054, + "step": 148 + }, + { + "epoch": 0.4816161616161616, + "grad_norm": 0.014781012199819088, + "learning_rate": 1.927554980595084e-05, + "loss": 0.0038, + "step": 149 + }, + { + "epoch": 0.48484848484848486, + "grad_norm": 0.013498370535671711, + "learning_rate": 1.940491591203105e-05, + "loss": 0.0067, + "step": 150 + }, + { + "epoch": 0.48484848484848486, + "eval_loss": 0.006012982223182917, + "eval_runtime": 18.5388, + "eval_samples_per_second": 5.394, + "eval_steps_per_second": 1.349, + "step": 150 + }, + { + "epoch": 0.48808080808080806, + "grad_norm": 0.019391966983675957, + "learning_rate": 1.9534282018111256e-05, + "loss": 0.0062, + "step": 151 + }, + { + "epoch": 0.4913131313131313, + "grad_norm": 0.01793098822236061, + "learning_rate": 1.9663648124191463e-05, + "loss": 0.0048, + "step": 152 + }, + { + "epoch": 0.49454545454545457, + "grad_norm": 0.025553777813911438, + "learning_rate": 1.979301423027167e-05, + "loss": 0.0088, + "step": 153 + }, + { + "epoch": 0.49777777777777776, + "grad_norm": 0.015286175534129143, + "learning_rate": 1.9922380336351877e-05, + "loss": 0.0052, + "step": 154 + }, + { + "epoch": 0.501010101010101, + "grad_norm": 0.013409867882728577, + "learning_rate": 2.0051746442432084e-05, + "loss": 0.0029, + "step": 155 + }, + { + "epoch": 0.5042424242424243, + "grad_norm": 0.02216433919966221, + "learning_rate": 2.018111254851229e-05, + "loss": 0.0068, + "step": 156 + }, + { + "epoch": 0.5074747474747475, + "grad_norm": 0.01498087402433157, + "learning_rate": 2.0310478654592497e-05, + "loss": 0.0037, + "step": 157 + }, + { + "epoch": 0.5107070707070707, + "grad_norm": 0.06357154995203018, + "learning_rate": 2.0439844760672704e-05, + "loss": 0.0074, + "step": 158 + }, + { + "epoch": 0.5139393939393939, + "grad_norm": 0.014753567986190319, + "learning_rate": 2.056921086675291e-05, + "loss": 0.0042, + "step": 159 + }, + { + "epoch": 0.5171717171717172, + "grad_norm": 0.020152902230620384, + "learning_rate": 2.0698576972833118e-05, + "loss": 0.0072, + "step": 160 + }, + { + "epoch": 0.5204040404040404, + "grad_norm": 0.02215396985411644, + "learning_rate": 2.0827943078913325e-05, + "loss": 0.0063, + "step": 161 + }, + { + "epoch": 0.5236363636363637, + "grad_norm": 0.01737254299223423, + "learning_rate": 2.0957309184993532e-05, + "loss": 0.0045, + "step": 162 + }, + { + "epoch": 0.5268686868686868, + "grad_norm": 0.01855519600212574, + "learning_rate": 2.108667529107374e-05, + "loss": 0.0079, + "step": 163 + }, + { + "epoch": 0.5301010101010101, + "grad_norm": 0.0156853087246418, + "learning_rate": 2.1216041397153946e-05, + "loss": 0.0059, + "step": 164 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.015692561864852905, + "learning_rate": 2.1345407503234156e-05, + "loss": 0.0033, + "step": 165 + }, + { + "epoch": 0.5365656565656566, + "grad_norm": 0.01826147921383381, + "learning_rate": 2.147477360931436e-05, + "loss": 0.0082, + "step": 166 + }, + { + "epoch": 0.5397979797979798, + "grad_norm": 0.017402255907654762, + "learning_rate": 2.1604139715394566e-05, + "loss": 0.0085, + "step": 167 + }, + { + "epoch": 0.5430303030303031, + "grad_norm": 0.020618412643671036, + "learning_rate": 2.1733505821474777e-05, + "loss": 0.0057, + "step": 168 + }, + { + "epoch": 0.5462626262626262, + "grad_norm": 0.013431048952043056, + "learning_rate": 2.186287192755498e-05, + "loss": 0.0057, + "step": 169 + }, + { + "epoch": 0.5494949494949495, + "grad_norm": 0.020214086398482323, + "learning_rate": 2.1992238033635187e-05, + "loss": 0.0098, + "step": 170 + }, + { + "epoch": 0.5527272727272727, + "grad_norm": 0.018985526636242867, + "learning_rate": 2.2121604139715397e-05, + "loss": 0.0047, + "step": 171 + }, + { + "epoch": 0.555959595959596, + "grad_norm": 0.01286108884960413, + "learning_rate": 2.22509702457956e-05, + "loss": 0.0045, + "step": 172 + }, + { + "epoch": 0.5591919191919192, + "grad_norm": 0.017977848649024963, + "learning_rate": 2.238033635187581e-05, + "loss": 0.0075, + "step": 173 + }, + { + "epoch": 0.5624242424242424, + "grad_norm": 0.01748410426080227, + "learning_rate": 2.2509702457956018e-05, + "loss": 0.0054, + "step": 174 + }, + { + "epoch": 0.5656565656565656, + "grad_norm": 0.014239229261875153, + "learning_rate": 2.263906856403622e-05, + "loss": 0.0038, + "step": 175 + }, + { + "epoch": 0.5656565656565656, + "eval_loss": 0.00581002701073885, + "eval_runtime": 18.5437, + "eval_samples_per_second": 5.393, + "eval_steps_per_second": 1.348, + "step": 175 + }, + { + "epoch": 0.5688888888888889, + "grad_norm": 0.015481271781027317, + "learning_rate": 2.276843467011643e-05, + "loss": 0.0049, + "step": 176 + }, + { + "epoch": 0.5721212121212121, + "grad_norm": 0.01505062822252512, + "learning_rate": 2.289780077619664e-05, + "loss": 0.0042, + "step": 177 + }, + { + "epoch": 0.5753535353535354, + "grad_norm": 0.016980962827801704, + "learning_rate": 2.3027166882276842e-05, + "loss": 0.0068, + "step": 178 + }, + { + "epoch": 0.5785858585858585, + "grad_norm": 0.017680799588561058, + "learning_rate": 2.3156532988357052e-05, + "loss": 0.006, + "step": 179 + }, + { + "epoch": 0.5818181818181818, + "grad_norm": 0.017391802743077278, + "learning_rate": 2.328589909443726e-05, + "loss": 0.0088, + "step": 180 + }, + { + "epoch": 0.585050505050505, + "grad_norm": 0.015167100355029106, + "learning_rate": 2.3415265200517466e-05, + "loss": 0.0036, + "step": 181 + }, + { + "epoch": 0.5882828282828283, + "grad_norm": 0.0169716514647007, + "learning_rate": 2.3544631306597673e-05, + "loss": 0.0032, + "step": 182 + }, + { + "epoch": 0.5915151515151515, + "grad_norm": 0.013683251105248928, + "learning_rate": 2.367399741267788e-05, + "loss": 0.0026, + "step": 183 + }, + { + "epoch": 0.5947474747474748, + "grad_norm": 0.01659058965742588, + "learning_rate": 2.3803363518758087e-05, + "loss": 0.0058, + "step": 184 + }, + { + "epoch": 0.597979797979798, + "grad_norm": 0.019827809184789658, + "learning_rate": 2.3932729624838294e-05, + "loss": 0.007, + "step": 185 + }, + { + "epoch": 0.6012121212121212, + "grad_norm": 0.01573983021080494, + "learning_rate": 2.40620957309185e-05, + "loss": 0.0041, + "step": 186 + }, + { + "epoch": 0.6044444444444445, + "grad_norm": 0.016037194058299065, + "learning_rate": 2.4191461836998707e-05, + "loss": 0.0048, + "step": 187 + }, + { + "epoch": 0.6076767676767677, + "grad_norm": 0.017361117526888847, + "learning_rate": 2.4320827943078914e-05, + "loss": 0.0041, + "step": 188 + }, + { + "epoch": 0.610909090909091, + "grad_norm": 0.013419854454696178, + "learning_rate": 2.445019404915912e-05, + "loss": 0.0057, + "step": 189 + }, + { + "epoch": 0.6141414141414141, + "grad_norm": 0.014191025868058205, + "learning_rate": 2.4579560155239328e-05, + "loss": 0.0026, + "step": 190 + }, + { + "epoch": 0.6173737373737374, + "grad_norm": 0.017151927575469017, + "learning_rate": 2.4708926261319535e-05, + "loss": 0.0077, + "step": 191 + }, + { + "epoch": 0.6206060606060606, + "grad_norm": 0.01370987668633461, + "learning_rate": 2.4838292367399742e-05, + "loss": 0.0049, + "step": 192 + }, + { + "epoch": 0.6238383838383839, + "grad_norm": 0.015623660758137703, + "learning_rate": 2.496765847347995e-05, + "loss": 0.0038, + "step": 193 + }, + { + "epoch": 0.6270707070707071, + "grad_norm": 0.015143529511988163, + "learning_rate": 2.509702457956016e-05, + "loss": 0.0036, + "step": 194 + }, + { + "epoch": 0.6303030303030303, + "grad_norm": 0.025124063715338707, + "learning_rate": 2.5226390685640362e-05, + "loss": 0.0065, + "step": 195 + }, + { + "epoch": 0.6335353535353535, + "grad_norm": 0.016248000785708427, + "learning_rate": 2.535575679172057e-05, + "loss": 0.005, + "step": 196 + }, + { + "epoch": 0.6367676767676768, + "grad_norm": 0.018650708720088005, + "learning_rate": 2.548512289780078e-05, + "loss": 0.0038, + "step": 197 + }, + { + "epoch": 0.64, + "grad_norm": 0.019356263801455498, + "learning_rate": 2.5614489003880986e-05, + "loss": 0.0034, + "step": 198 + }, + { + "epoch": 0.6432323232323233, + "grad_norm": 0.014984376728534698, + "learning_rate": 2.574385510996119e-05, + "loss": 0.0041, + "step": 199 + }, + { + "epoch": 0.6464646464646465, + "grad_norm": 0.014227643609046936, + "learning_rate": 2.58732212160414e-05, + "loss": 0.0043, + "step": 200 + }, + { + "epoch": 0.6464646464646465, + "eval_loss": 0.005420052912086248, + "eval_runtime": 18.5123, + "eval_samples_per_second": 5.402, + "eval_steps_per_second": 1.35, + "step": 200 + }, + { + "epoch": 0.6496969696969697, + "grad_norm": 0.01470093335956335, + "learning_rate": 2.6002587322121607e-05, + "loss": 0.0058, + "step": 201 + }, + { + "epoch": 0.6529292929292929, + "grad_norm": 0.01742161437869072, + "learning_rate": 2.613195342820181e-05, + "loss": 0.006, + "step": 202 + }, + { + "epoch": 0.6561616161616162, + "grad_norm": 0.01952281780540943, + "learning_rate": 2.626131953428202e-05, + "loss": 0.0035, + "step": 203 + }, + { + "epoch": 0.6593939393939394, + "grad_norm": 0.015216818079352379, + "learning_rate": 2.6390685640362228e-05, + "loss": 0.0052, + "step": 204 + }, + { + "epoch": 0.6626262626262627, + "grad_norm": 0.013172456994652748, + "learning_rate": 2.652005174644243e-05, + "loss": 0.0041, + "step": 205 + }, + { + "epoch": 0.6658585858585858, + "grad_norm": 0.01898268423974514, + "learning_rate": 2.664941785252264e-05, + "loss": 0.0068, + "step": 206 + }, + { + "epoch": 0.6690909090909091, + "grad_norm": 0.02104322984814644, + "learning_rate": 2.677878395860285e-05, + "loss": 0.0059, + "step": 207 + }, + { + "epoch": 0.6723232323232323, + "grad_norm": 0.014322024770081043, + "learning_rate": 2.6908150064683052e-05, + "loss": 0.0038, + "step": 208 + }, + { + "epoch": 0.6755555555555556, + "grad_norm": 0.011105583049356937, + "learning_rate": 2.7037516170763262e-05, + "loss": 0.0016, + "step": 209 + }, + { + "epoch": 0.6787878787878788, + "grad_norm": 0.012854715809226036, + "learning_rate": 2.716688227684347e-05, + "loss": 0.0036, + "step": 210 + }, + { + "epoch": 0.682020202020202, + "grad_norm": 0.018748968839645386, + "learning_rate": 2.7296248382923673e-05, + "loss": 0.0044, + "step": 211 + }, + { + "epoch": 0.6852525252525252, + "grad_norm": 0.013730528764426708, + "learning_rate": 2.7425614489003883e-05, + "loss": 0.0024, + "step": 212 + }, + { + "epoch": 0.6884848484848485, + "grad_norm": 0.01574782282114029, + "learning_rate": 2.755498059508409e-05, + "loss": 0.0049, + "step": 213 + }, + { + "epoch": 0.6917171717171717, + "grad_norm": 0.016125742346048355, + "learning_rate": 2.76843467011643e-05, + "loss": 0.0047, + "step": 214 + }, + { + "epoch": 0.694949494949495, + "grad_norm": 0.01681477390229702, + "learning_rate": 2.7813712807244503e-05, + "loss": 0.004, + "step": 215 + }, + { + "epoch": 0.6981818181818182, + "grad_norm": 0.013295124284923077, + "learning_rate": 2.794307891332471e-05, + "loss": 0.0027, + "step": 216 + }, + { + "epoch": 0.7014141414141414, + "grad_norm": 0.01294485479593277, + "learning_rate": 2.807244501940492e-05, + "loss": 0.0039, + "step": 217 + }, + { + "epoch": 0.7046464646464646, + "grad_norm": 0.011234375648200512, + "learning_rate": 2.8201811125485124e-05, + "loss": 0.0026, + "step": 218 + }, + { + "epoch": 0.7078787878787879, + "grad_norm": 0.01438144315034151, + "learning_rate": 2.833117723156533e-05, + "loss": 0.0052, + "step": 219 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 0.013788231648504734, + "learning_rate": 2.846054333764554e-05, + "loss": 0.0037, + "step": 220 + }, + { + "epoch": 0.7143434343434344, + "grad_norm": 0.010624570772051811, + "learning_rate": 2.8589909443725745e-05, + "loss": 0.0037, + "step": 221 + }, + { + "epoch": 0.7175757575757575, + "grad_norm": 0.01703670434653759, + "learning_rate": 2.871927554980595e-05, + "loss": 0.0035, + "step": 222 + }, + { + "epoch": 0.7208080808080808, + "grad_norm": 0.020146546885371208, + "learning_rate": 2.8848641655886162e-05, + "loss": 0.0057, + "step": 223 + }, + { + "epoch": 0.724040404040404, + "grad_norm": 0.014769370667636395, + "learning_rate": 2.8978007761966365e-05, + "loss": 0.007, + "step": 224 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 0.017661113291978836, + "learning_rate": 2.9107373868046572e-05, + "loss": 0.004, + "step": 225 + }, + { + "epoch": 0.7272727272727273, + "eval_loss": 0.0052576931193470955, + "eval_runtime": 18.5949, + "eval_samples_per_second": 5.378, + "eval_steps_per_second": 1.344, + "step": 225 + }, + { + "epoch": 0.7305050505050505, + "grad_norm": 0.01635902002453804, + "learning_rate": 2.9236739974126783e-05, + "loss": 0.0044, + "step": 226 + }, + { + "epoch": 0.7337373737373737, + "grad_norm": 0.013843000866472721, + "learning_rate": 2.936610608020699e-05, + "loss": 0.0051, + "step": 227 + }, + { + "epoch": 0.7369696969696969, + "grad_norm": 0.01753757894039154, + "learning_rate": 2.9495472186287193e-05, + "loss": 0.0056, + "step": 228 + }, + { + "epoch": 0.7402020202020202, + "grad_norm": 0.021675704047083855, + "learning_rate": 2.9624838292367403e-05, + "loss": 0.0084, + "step": 229 + }, + { + "epoch": 0.7434343434343434, + "grad_norm": 0.01598481833934784, + "learning_rate": 2.975420439844761e-05, + "loss": 0.006, + "step": 230 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 0.016229236498475075, + "learning_rate": 2.9883570504527814e-05, + "loss": 0.0081, + "step": 231 + }, + { + "epoch": 0.74989898989899, + "grad_norm": 0.017028771340847015, + "learning_rate": 3.0012936610608024e-05, + "loss": 0.0081, + "step": 232 + }, + { + "epoch": 0.7531313131313131, + "grad_norm": 0.009784836322069168, + "learning_rate": 3.014230271668823e-05, + "loss": 0.0022, + "step": 233 + }, + { + "epoch": 0.7563636363636363, + "grad_norm": 0.015256540849804878, + "learning_rate": 3.0271668822768434e-05, + "loss": 0.0091, + "step": 234 + }, + { + "epoch": 0.7595959595959596, + "grad_norm": 0.019257735460996628, + "learning_rate": 3.0401034928848644e-05, + "loss": 0.0096, + "step": 235 + }, + { + "epoch": 0.7628282828282829, + "grad_norm": 0.01391427218914032, + "learning_rate": 3.053040103492885e-05, + "loss": 0.0052, + "step": 236 + }, + { + "epoch": 0.7660606060606061, + "grad_norm": 0.012768765911459923, + "learning_rate": 3.0659767141009055e-05, + "loss": 0.0043, + "step": 237 + }, + { + "epoch": 0.7692929292929293, + "grad_norm": 0.015724869444966316, + "learning_rate": 3.0789133247089265e-05, + "loss": 0.0066, + "step": 238 + }, + { + "epoch": 0.7725252525252525, + "grad_norm": 0.016083568334579468, + "learning_rate": 3.0918499353169475e-05, + "loss": 0.0082, + "step": 239 + }, + { + "epoch": 0.7757575757575758, + "grad_norm": 0.01483457162976265, + "learning_rate": 3.104786545924967e-05, + "loss": 0.0046, + "step": 240 + }, + { + "epoch": 0.778989898989899, + "grad_norm": 0.015515637584030628, + "learning_rate": 3.117723156532988e-05, + "loss": 0.0048, + "step": 241 + }, + { + "epoch": 0.7822222222222223, + "grad_norm": 0.016584735363721848, + "learning_rate": 3.130659767141009e-05, + "loss": 0.0052, + "step": 242 + }, + { + "epoch": 0.7854545454545454, + "grad_norm": 0.014646550640463829, + "learning_rate": 3.14359637774903e-05, + "loss": 0.0022, + "step": 243 + }, + { + "epoch": 0.7886868686868687, + "grad_norm": 0.014621698297560215, + "learning_rate": 3.1565329883570506e-05, + "loss": 0.006, + "step": 244 + }, + { + "epoch": 0.7919191919191919, + "grad_norm": 0.015543024055659771, + "learning_rate": 3.169469598965072e-05, + "loss": 0.0051, + "step": 245 + }, + { + "epoch": 0.7951515151515152, + "grad_norm": 0.02005600556731224, + "learning_rate": 3.182406209573092e-05, + "loss": 0.0119, + "step": 246 + }, + { + "epoch": 0.7983838383838384, + "grad_norm": 0.01678292639553547, + "learning_rate": 3.1953428201811124e-05, + "loss": 0.0052, + "step": 247 + }, + { + "epoch": 0.8016161616161617, + "grad_norm": 0.013058966025710106, + "learning_rate": 3.2082794307891334e-05, + "loss": 0.0033, + "step": 248 + }, + { + "epoch": 0.8048484848484848, + "grad_norm": 0.019511310383677483, + "learning_rate": 3.2212160413971544e-05, + "loss": 0.0048, + "step": 249 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.0210106261074543, + "learning_rate": 3.234152652005175e-05, + "loss": 0.0095, + "step": 250 + }, + { + "epoch": 0.8080808080808081, + "eval_loss": 0.0050358218140900135, + "eval_runtime": 18.5332, + "eval_samples_per_second": 5.396, + "eval_steps_per_second": 1.349, + "step": 250 + }, + { + "epoch": 0.8113131313131313, + "grad_norm": 0.008065695874392986, + "learning_rate": 3.247089262613196e-05, + "loss": 0.0022, + "step": 251 + }, + { + "epoch": 0.8145454545454546, + "grad_norm": 0.013688714243471622, + "learning_rate": 3.260025873221216e-05, + "loss": 0.0042, + "step": 252 + }, + { + "epoch": 0.8177777777777778, + "grad_norm": 0.023383015766739845, + "learning_rate": 3.2729624838292365e-05, + "loss": 0.006, + "step": 253 + }, + { + "epoch": 0.821010101010101, + "grad_norm": 0.011116388253867626, + "learning_rate": 3.2858990944372575e-05, + "loss": 0.0029, + "step": 254 + }, + { + "epoch": 0.8242424242424242, + "grad_norm": 0.01456495188176632, + "learning_rate": 3.2988357050452786e-05, + "loss": 0.0038, + "step": 255 + }, + { + "epoch": 0.8274747474747475, + "grad_norm": 0.014656679704785347, + "learning_rate": 3.3117723156532996e-05, + "loss": 0.0045, + "step": 256 + }, + { + "epoch": 0.8307070707070707, + "grad_norm": 0.024107573553919792, + "learning_rate": 3.324708926261319e-05, + "loss": 0.0047, + "step": 257 + }, + { + "epoch": 0.833939393939394, + "grad_norm": 0.015572698786854744, + "learning_rate": 3.33764553686934e-05, + "loss": 0.0064, + "step": 258 + }, + { + "epoch": 0.8371717171717171, + "grad_norm": 0.01332643162459135, + "learning_rate": 3.350582147477361e-05, + "loss": 0.0048, + "step": 259 + }, + { + "epoch": 0.8404040404040404, + "grad_norm": 0.018154941499233246, + "learning_rate": 3.3635187580853817e-05, + "loss": 0.006, + "step": 260 + }, + { + "epoch": 0.8436363636363636, + "grad_norm": 0.015191690064966679, + "learning_rate": 3.376455368693403e-05, + "loss": 0.0039, + "step": 261 + }, + { + "epoch": 0.8468686868686869, + "grad_norm": 0.014015432447195053, + "learning_rate": 3.389391979301424e-05, + "loss": 0.0046, + "step": 262 + }, + { + "epoch": 0.8501010101010101, + "grad_norm": 0.022445395588874817, + "learning_rate": 3.4023285899094434e-05, + "loss": 0.0113, + "step": 263 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 0.015556816942989826, + "learning_rate": 3.4152652005174644e-05, + "loss": 0.0052, + "step": 264 + }, + { + "epoch": 0.8565656565656565, + "grad_norm": 0.01582123525440693, + "learning_rate": 3.4282018111254854e-05, + "loss": 0.0037, + "step": 265 + }, + { + "epoch": 0.8597979797979798, + "grad_norm": 0.014743310399353504, + "learning_rate": 3.441138421733506e-05, + "loss": 0.004, + "step": 266 + }, + { + "epoch": 0.863030303030303, + "grad_norm": 0.013264276087284088, + "learning_rate": 3.454075032341527e-05, + "loss": 0.0041, + "step": 267 + }, + { + "epoch": 0.8662626262626263, + "grad_norm": 0.01359622087329626, + "learning_rate": 3.467011642949548e-05, + "loss": 0.004, + "step": 268 + }, + { + "epoch": 0.8694949494949495, + "grad_norm": 0.01856253668665886, + "learning_rate": 3.4799482535575675e-05, + "loss": 0.0047, + "step": 269 + }, + { + "epoch": 0.8727272727272727, + "grad_norm": 0.01446174643933773, + "learning_rate": 3.4928848641655885e-05, + "loss": 0.0048, + "step": 270 + }, + { + "epoch": 0.8759595959595959, + "grad_norm": 0.01740093156695366, + "learning_rate": 3.5058214747736096e-05, + "loss": 0.0083, + "step": 271 + }, + { + "epoch": 0.8791919191919192, + "grad_norm": 0.021578310057520866, + "learning_rate": 3.5187580853816306e-05, + "loss": 0.0069, + "step": 272 + }, + { + "epoch": 0.8824242424242424, + "grad_norm": 0.013721848838031292, + "learning_rate": 3.531694695989651e-05, + "loss": 0.0047, + "step": 273 + }, + { + "epoch": 0.8856565656565657, + "grad_norm": 0.015622769482433796, + "learning_rate": 3.544631306597671e-05, + "loss": 0.0064, + "step": 274 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.01654888316988945, + "learning_rate": 3.557567917205692e-05, + "loss": 0.0071, + "step": 275 + }, + { + "epoch": 0.8888888888888888, + "eval_loss": 0.004978961776942015, + "eval_runtime": 18.5395, + "eval_samples_per_second": 5.394, + "eval_steps_per_second": 1.348, + "step": 275 + }, + { + "epoch": 0.8921212121212121, + "grad_norm": 0.01643654704093933, + "learning_rate": 3.570504527813713e-05, + "loss": 0.0033, + "step": 276 + }, + { + "epoch": 0.8953535353535353, + "grad_norm": 0.015030475333333015, + "learning_rate": 3.583441138421734e-05, + "loss": 0.003, + "step": 277 + }, + { + "epoch": 0.8985858585858586, + "grad_norm": 0.010186896659433842, + "learning_rate": 3.596377749029755e-05, + "loss": 0.0035, + "step": 278 + }, + { + "epoch": 0.9018181818181819, + "grad_norm": 0.012999375350773335, + "learning_rate": 3.609314359637775e-05, + "loss": 0.0033, + "step": 279 + }, + { + "epoch": 0.9050505050505051, + "grad_norm": 0.015473639592528343, + "learning_rate": 3.6222509702457954e-05, + "loss": 0.0044, + "step": 280 + }, + { + "epoch": 0.9082828282828282, + "grad_norm": 0.011669456027448177, + "learning_rate": 3.6351875808538164e-05, + "loss": 0.0018, + "step": 281 + }, + { + "epoch": 0.9115151515151515, + "grad_norm": 0.011770956218242645, + "learning_rate": 3.648124191461837e-05, + "loss": 0.0036, + "step": 282 + }, + { + "epoch": 0.9147474747474748, + "grad_norm": 0.013007396832108498, + "learning_rate": 3.661060802069858e-05, + "loss": 0.005, + "step": 283 + }, + { + "epoch": 0.917979797979798, + "grad_norm": 0.012168454937636852, + "learning_rate": 3.673997412677879e-05, + "loss": 0.0043, + "step": 284 + }, + { + "epoch": 0.9212121212121213, + "grad_norm": 0.010970023460686207, + "learning_rate": 3.6869340232859e-05, + "loss": 0.0032, + "step": 285 + }, + { + "epoch": 0.9244444444444444, + "grad_norm": 0.012372443452477455, + "learning_rate": 3.6998706338939195e-05, + "loss": 0.0041, + "step": 286 + }, + { + "epoch": 0.9276767676767677, + "grad_norm": 0.013462092727422714, + "learning_rate": 3.7128072445019406e-05, + "loss": 0.0046, + "step": 287 + }, + { + "epoch": 0.9309090909090909, + "grad_norm": 0.01485317014157772, + "learning_rate": 3.7257438551099616e-05, + "loss": 0.0041, + "step": 288 + }, + { + "epoch": 0.9341414141414142, + "grad_norm": 0.014620418660342693, + "learning_rate": 3.738680465717982e-05, + "loss": 0.007, + "step": 289 + }, + { + "epoch": 0.9373737373737374, + "grad_norm": 0.008358921855688095, + "learning_rate": 3.751617076326003e-05, + "loss": 0.0018, + "step": 290 + }, + { + "epoch": 0.9406060606060606, + "grad_norm": 0.015606251545250416, + "learning_rate": 3.764553686934023e-05, + "loss": 0.0032, + "step": 291 + }, + { + "epoch": 0.9438383838383838, + "grad_norm": 0.017797425389289856, + "learning_rate": 3.777490297542044e-05, + "loss": 0.007, + "step": 292 + }, + { + "epoch": 0.9470707070707071, + "grad_norm": 0.011691603809595108, + "learning_rate": 3.790426908150065e-05, + "loss": 0.0042, + "step": 293 + }, + { + "epoch": 0.9503030303030303, + "grad_norm": 0.009671170264482498, + "learning_rate": 3.803363518758086e-05, + "loss": 0.0031, + "step": 294 + }, + { + "epoch": 0.9535353535353536, + "grad_norm": 0.010161872953176498, + "learning_rate": 3.816300129366106e-05, + "loss": 0.003, + "step": 295 + }, + { + "epoch": 0.9567676767676768, + "grad_norm": 0.014181291684508324, + "learning_rate": 3.829236739974127e-05, + "loss": 0.0039, + "step": 296 + }, + { + "epoch": 0.96, + "grad_norm": 0.014730575494468212, + "learning_rate": 3.8421733505821475e-05, + "loss": 0.0048, + "step": 297 + }, + { + "epoch": 0.9632323232323232, + "grad_norm": 0.012808237224817276, + "learning_rate": 3.855109961190168e-05, + "loss": 0.005, + "step": 298 + }, + { + "epoch": 0.9664646464646465, + "grad_norm": 0.010238195769488811, + "learning_rate": 3.868046571798189e-05, + "loss": 0.0035, + "step": 299 + }, + { + "epoch": 0.9696969696969697, + "grad_norm": 0.01267747487872839, + "learning_rate": 3.88098318240621e-05, + "loss": 0.0056, + "step": 300 + }, + { + "epoch": 0.9696969696969697, + "eval_loss": 0.004724177997559309, + "eval_runtime": 18.5689, + "eval_samples_per_second": 5.385, + "eval_steps_per_second": 1.346, + "step": 300 + }, + { + "epoch": 0.972929292929293, + "grad_norm": 0.009855328127741814, + "learning_rate": 3.893919793014231e-05, + "loss": 0.0022, + "step": 301 + }, + { + "epoch": 0.9761616161616161, + "grad_norm": 0.011048916727304459, + "learning_rate": 3.906856403622251e-05, + "loss": 0.0035, + "step": 302 + }, + { + "epoch": 0.9793939393939394, + "grad_norm": 0.02079099602997303, + "learning_rate": 3.9197930142302716e-05, + "loss": 0.012, + "step": 303 + }, + { + "epoch": 0.9826262626262626, + "grad_norm": 0.01305049005895853, + "learning_rate": 3.9327296248382926e-05, + "loss": 0.0047, + "step": 304 + }, + { + "epoch": 0.9858585858585859, + "grad_norm": 0.010051904246211052, + "learning_rate": 3.945666235446313e-05, + "loss": 0.0025, + "step": 305 + }, + { + "epoch": 0.9890909090909091, + "grad_norm": 0.01657235063612461, + "learning_rate": 3.958602846054334e-05, + "loss": 0.008, + "step": 306 + }, + { + "epoch": 0.9923232323232323, + "grad_norm": 0.008980581536889076, + "learning_rate": 3.971539456662355e-05, + "loss": 0.0022, + "step": 307 + }, + { + "epoch": 0.9955555555555555, + "grad_norm": 0.01711704581975937, + "learning_rate": 3.9844760672703754e-05, + "loss": 0.0082, + "step": 308 + }, + { + "epoch": 0.9987878787878788, + "grad_norm": 0.01096993125975132, + "learning_rate": 3.997412677878396e-05, + "loss": 0.0042, + "step": 309 + }, + { + "epoch": 1.002020202020202, + "grad_norm": 0.023890549317002296, + "learning_rate": 4.010349288486417e-05, + "loss": 0.0076, + "step": 310 + }, + { + "epoch": 1.0052525252525253, + "grad_norm": 0.009565077722072601, + "learning_rate": 4.023285899094437e-05, + "loss": 0.003, + "step": 311 + }, + { + "epoch": 1.0084848484848485, + "grad_norm": 0.012166207656264305, + "learning_rate": 4.036222509702458e-05, + "loss": 0.0057, + "step": 312 + }, + { + "epoch": 1.0117171717171718, + "grad_norm": 0.007330482825636864, + "learning_rate": 4.049159120310479e-05, + "loss": 0.0021, + "step": 313 + }, + { + "epoch": 1.014949494949495, + "grad_norm": 0.010659433901309967, + "learning_rate": 4.0620957309184995e-05, + "loss": 0.0031, + "step": 314 + }, + { + "epoch": 1.018181818181818, + "grad_norm": 0.009105874225497246, + "learning_rate": 4.07503234152652e-05, + "loss": 0.0028, + "step": 315 + }, + { + "epoch": 1.0214141414141413, + "grad_norm": 0.011084525845944881, + "learning_rate": 4.087968952134541e-05, + "loss": 0.0029, + "step": 316 + }, + { + "epoch": 1.0246464646464646, + "grad_norm": 0.008576706051826477, + "learning_rate": 4.100905562742562e-05, + "loss": 0.0024, + "step": 317 + }, + { + "epoch": 1.0278787878787878, + "grad_norm": 0.01535963173955679, + "learning_rate": 4.113842173350582e-05, + "loss": 0.0049, + "step": 318 + }, + { + "epoch": 1.031111111111111, + "grad_norm": 0.013791275210678577, + "learning_rate": 4.126778783958603e-05, + "loss": 0.0028, + "step": 319 + }, + { + "epoch": 1.0343434343434343, + "grad_norm": 0.01699056848883629, + "learning_rate": 4.1397153945666236e-05, + "loss": 0.0075, + "step": 320 + }, + { + "epoch": 1.0375757575757576, + "grad_norm": 0.008655001409351826, + "learning_rate": 4.152652005174644e-05, + "loss": 0.0037, + "step": 321 + }, + { + "epoch": 1.0408080808080808, + "grad_norm": 0.009002985432744026, + "learning_rate": 4.165588615782665e-05, + "loss": 0.0031, + "step": 322 + }, + { + "epoch": 1.044040404040404, + "grad_norm": 0.008515028282999992, + "learning_rate": 4.178525226390686e-05, + "loss": 0.002, + "step": 323 + }, + { + "epoch": 1.0472727272727274, + "grad_norm": 0.011139890179038048, + "learning_rate": 4.1914618369987064e-05, + "loss": 0.0031, + "step": 324 + }, + { + "epoch": 1.0505050505050506, + "grad_norm": 0.008947103284299374, + "learning_rate": 4.2043984476067274e-05, + "loss": 0.0017, + "step": 325 + }, + { + "epoch": 1.0505050505050506, + "eval_loss": 0.004780053161084652, + "eval_runtime": 18.5718, + "eval_samples_per_second": 5.385, + "eval_steps_per_second": 1.346, + "step": 325 + }, + { + "epoch": 1.0537373737373736, + "grad_norm": 0.01565450057387352, + "learning_rate": 4.217335058214748e-05, + "loss": 0.0033, + "step": 326 + }, + { + "epoch": 1.056969696969697, + "grad_norm": 0.016629492864012718, + "learning_rate": 4.230271668822768e-05, + "loss": 0.0071, + "step": 327 + }, + { + "epoch": 1.0602020202020201, + "grad_norm": 0.012543603777885437, + "learning_rate": 4.243208279430789e-05, + "loss": 0.0042, + "step": 328 + }, + { + "epoch": 1.0634343434343434, + "grad_norm": 0.010482177138328552, + "learning_rate": 4.25614489003881e-05, + "loss": 0.0028, + "step": 329 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 0.010985253378748894, + "learning_rate": 4.269081500646831e-05, + "loss": 0.0033, + "step": 330 + }, + { + "epoch": 1.06989898989899, + "grad_norm": 0.010153640992939472, + "learning_rate": 4.2820181112548515e-05, + "loss": 0.0028, + "step": 331 + }, + { + "epoch": 1.0731313131313132, + "grad_norm": 0.0068151215091347694, + "learning_rate": 4.294954721862872e-05, + "loss": 0.0016, + "step": 332 + }, + { + "epoch": 1.0763636363636364, + "grad_norm": 0.01169463712722063, + "learning_rate": 4.307891332470893e-05, + "loss": 0.0029, + "step": 333 + }, + { + "epoch": 1.0795959595959597, + "grad_norm": 0.016815535724163055, + "learning_rate": 4.320827943078913e-05, + "loss": 0.0069, + "step": 334 + }, + { + "epoch": 1.082828282828283, + "grad_norm": 0.01823563128709793, + "learning_rate": 4.333764553686934e-05, + "loss": 0.0096, + "step": 335 + }, + { + "epoch": 1.086060606060606, + "grad_norm": 0.009779485873878002, + "learning_rate": 4.346701164294955e-05, + "loss": 0.0023, + "step": 336 + }, + { + "epoch": 1.0892929292929292, + "grad_norm": 0.014083399437367916, + "learning_rate": 4.359637774902976e-05, + "loss": 0.0032, + "step": 337 + }, + { + "epoch": 1.0925252525252525, + "grad_norm": 0.01473916880786419, + "learning_rate": 4.372574385510996e-05, + "loss": 0.0047, + "step": 338 + }, + { + "epoch": 1.0957575757575757, + "grad_norm": 0.010457643307745457, + "learning_rate": 4.385510996119017e-05, + "loss": 0.0031, + "step": 339 + }, + { + "epoch": 1.098989898989899, + "grad_norm": 0.010025047697126865, + "learning_rate": 4.3984476067270374e-05, + "loss": 0.0035, + "step": 340 + }, + { + "epoch": 1.1022222222222222, + "grad_norm": 0.013103731907904148, + "learning_rate": 4.4113842173350584e-05, + "loss": 0.0046, + "step": 341 + }, + { + "epoch": 1.1054545454545455, + "grad_norm": 0.011729112826287746, + "learning_rate": 4.4243208279430794e-05, + "loss": 0.0031, + "step": 342 + }, + { + "epoch": 1.1086868686868687, + "grad_norm": 0.010078197345137596, + "learning_rate": 4.4372574385511e-05, + "loss": 0.0037, + "step": 343 + }, + { + "epoch": 1.111919191919192, + "grad_norm": 0.01012883149087429, + "learning_rate": 4.45019404915912e-05, + "loss": 0.0027, + "step": 344 + }, + { + "epoch": 1.1151515151515152, + "grad_norm": 0.01668359898030758, + "learning_rate": 4.463130659767141e-05, + "loss": 0.0057, + "step": 345 + }, + { + "epoch": 1.1183838383838385, + "grad_norm": 0.011654009111225605, + "learning_rate": 4.476067270375162e-05, + "loss": 0.0029, + "step": 346 + }, + { + "epoch": 1.1216161616161617, + "grad_norm": 0.018832651898264885, + "learning_rate": 4.4890038809831825e-05, + "loss": 0.0056, + "step": 347 + }, + { + "epoch": 1.1248484848484848, + "grad_norm": 0.006298431195318699, + "learning_rate": 4.5019404915912036e-05, + "loss": 0.0012, + "step": 348 + }, + { + "epoch": 1.128080808080808, + "grad_norm": 0.018230654299259186, + "learning_rate": 4.514877102199224e-05, + "loss": 0.0065, + "step": 349 + }, + { + "epoch": 1.1313131313131313, + "grad_norm": 0.007927543483674526, + "learning_rate": 4.527813712807244e-05, + "loss": 0.0024, + "step": 350 + }, + { + "epoch": 1.1313131313131313, + "eval_loss": 0.004599680192768574, + "eval_runtime": 18.5134, + "eval_samples_per_second": 5.401, + "eval_steps_per_second": 1.35, + "step": 350 + }, + { + "epoch": 1.1345454545454545, + "grad_norm": 0.012782711535692215, + "learning_rate": 4.540750323415265e-05, + "loss": 0.0043, + "step": 351 + }, + { + "epoch": 1.1377777777777778, + "grad_norm": 0.01866583153605461, + "learning_rate": 4.553686934023286e-05, + "loss": 0.0083, + "step": 352 + }, + { + "epoch": 1.141010101010101, + "grad_norm": 0.010514589957892895, + "learning_rate": 4.566623544631307e-05, + "loss": 0.0033, + "step": 353 + }, + { + "epoch": 1.1442424242424243, + "grad_norm": 0.01175905391573906, + "learning_rate": 4.579560155239328e-05, + "loss": 0.0036, + "step": 354 + }, + { + "epoch": 1.1474747474747475, + "grad_norm": 0.01182876992970705, + "learning_rate": 4.592496765847348e-05, + "loss": 0.0051, + "step": 355 + }, + { + "epoch": 1.1507070707070708, + "grad_norm": 0.007950863800942898, + "learning_rate": 4.6054333764553684e-05, + "loss": 0.002, + "step": 356 + }, + { + "epoch": 1.1539393939393938, + "grad_norm": 0.009339825250208378, + "learning_rate": 4.6183699870633894e-05, + "loss": 0.0022, + "step": 357 + }, + { + "epoch": 1.157171717171717, + "grad_norm": 0.012703759595751762, + "learning_rate": 4.6313065976714105e-05, + "loss": 0.0039, + "step": 358 + }, + { + "epoch": 1.1604040404040403, + "grad_norm": 0.019440334290266037, + "learning_rate": 4.6442432082794315e-05, + "loss": 0.0086, + "step": 359 + }, + { + "epoch": 1.1636363636363636, + "grad_norm": 0.008004146628081799, + "learning_rate": 4.657179818887452e-05, + "loss": 0.0015, + "step": 360 + }, + { + "epoch": 1.1668686868686868, + "grad_norm": 0.017837535589933395, + "learning_rate": 4.670116429495472e-05, + "loss": 0.0055, + "step": 361 + }, + { + "epoch": 1.17010101010101, + "grad_norm": 0.011339561082422733, + "learning_rate": 4.683053040103493e-05, + "loss": 0.0035, + "step": 362 + }, + { + "epoch": 1.1733333333333333, + "grad_norm": 0.012849444523453712, + "learning_rate": 4.6959896507115136e-05, + "loss": 0.0046, + "step": 363 + }, + { + "epoch": 1.1765656565656566, + "grad_norm": 0.015334355644881725, + "learning_rate": 4.7089262613195346e-05, + "loss": 0.0061, + "step": 364 + }, + { + "epoch": 1.1797979797979798, + "grad_norm": 0.010206463746726513, + "learning_rate": 4.7218628719275556e-05, + "loss": 0.0031, + "step": 365 + }, + { + "epoch": 1.183030303030303, + "grad_norm": 0.004871445242315531, + "learning_rate": 4.734799482535576e-05, + "loss": 0.0016, + "step": 366 + }, + { + "epoch": 1.1862626262626264, + "grad_norm": 0.012882773764431477, + "learning_rate": 4.747736093143596e-05, + "loss": 0.003, + "step": 367 + }, + { + "epoch": 1.1894949494949496, + "grad_norm": 0.009307224303483963, + "learning_rate": 4.760672703751617e-05, + "loss": 0.0018, + "step": 368 + }, + { + "epoch": 1.1927272727272726, + "grad_norm": 0.011766292154788971, + "learning_rate": 4.773609314359638e-05, + "loss": 0.0049, + "step": 369 + }, + { + "epoch": 1.195959595959596, + "grad_norm": 0.006270453333854675, + "learning_rate": 4.786545924967659e-05, + "loss": 0.0016, + "step": 370 + }, + { + "epoch": 1.1991919191919191, + "grad_norm": 0.011882314458489418, + "learning_rate": 4.79948253557568e-05, + "loss": 0.0045, + "step": 371 + }, + { + "epoch": 1.2024242424242424, + "grad_norm": 0.012190425768494606, + "learning_rate": 4.8124191461837e-05, + "loss": 0.0029, + "step": 372 + }, + { + "epoch": 1.2056565656565656, + "grad_norm": 0.017565051093697548, + "learning_rate": 4.8253557567917204e-05, + "loss": 0.0066, + "step": 373 + }, + { + "epoch": 1.208888888888889, + "grad_norm": 0.01365733053535223, + "learning_rate": 4.8382923673997415e-05, + "loss": 0.003, + "step": 374 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 0.00959896668791771, + "learning_rate": 4.8512289780077625e-05, + "loss": 0.0023, + "step": 375 + }, + { + "epoch": 1.2121212121212122, + "eval_loss": 0.004647642374038696, + "eval_runtime": 18.5462, + "eval_samples_per_second": 5.392, + "eval_steps_per_second": 1.348, + "step": 375 + }, + { + "epoch": 1.2153535353535354, + "grad_norm": 0.008952205069363117, + "learning_rate": 4.864165588615783e-05, + "loss": 0.003, + "step": 376 + }, + { + "epoch": 1.2185858585858587, + "grad_norm": 0.01386015210300684, + "learning_rate": 4.877102199223804e-05, + "loss": 0.0046, + "step": 377 + }, + { + "epoch": 1.221818181818182, + "grad_norm": 0.01027520652860403, + "learning_rate": 4.890038809831824e-05, + "loss": 0.0031, + "step": 378 + }, + { + "epoch": 1.225050505050505, + "grad_norm": 0.014104802161455154, + "learning_rate": 4.9029754204398446e-05, + "loss": 0.0055, + "step": 379 + }, + { + "epoch": 1.2282828282828282, + "grad_norm": 0.012762988917529583, + "learning_rate": 4.9159120310478656e-05, + "loss": 0.0026, + "step": 380 + }, + { + "epoch": 1.2315151515151515, + "grad_norm": 0.010741367004811764, + "learning_rate": 4.9288486416558866e-05, + "loss": 0.0028, + "step": 381 + }, + { + "epoch": 1.2347474747474747, + "grad_norm": 0.008360295556485653, + "learning_rate": 4.941785252263907e-05, + "loss": 0.0022, + "step": 382 + }, + { + "epoch": 1.237979797979798, + "grad_norm": 0.013385162688791752, + "learning_rate": 4.954721862871928e-05, + "loss": 0.0032, + "step": 383 + }, + { + "epoch": 1.2412121212121212, + "grad_norm": 0.009576680138707161, + "learning_rate": 4.9676584734799483e-05, + "loss": 0.0024, + "step": 384 + }, + { + "epoch": 1.2444444444444445, + "grad_norm": 0.011253474280238152, + "learning_rate": 4.980595084087969e-05, + "loss": 0.0035, + "step": 385 + }, + { + "epoch": 1.2476767676767677, + "grad_norm": 0.013251024298369884, + "learning_rate": 4.99353169469599e-05, + "loss": 0.0037, + "step": 386 + }, + { + "epoch": 1.250909090909091, + "grad_norm": 0.0128615228459239, + "learning_rate": 5.006468305304011e-05, + "loss": 0.0042, + "step": 387 + }, + { + "epoch": 1.2541414141414142, + "grad_norm": 0.011205179616808891, + "learning_rate": 5.019404915912032e-05, + "loss": 0.0022, + "step": 388 + }, + { + "epoch": 1.2573737373737375, + "grad_norm": 0.009263547137379646, + "learning_rate": 5.032341526520052e-05, + "loss": 0.0033, + "step": 389 + }, + { + "epoch": 1.2606060606060607, + "grad_norm": 0.013434207998216152, + "learning_rate": 5.0452781371280725e-05, + "loss": 0.0041, + "step": 390 + }, + { + "epoch": 1.2638383838383838, + "grad_norm": 0.008763938210904598, + "learning_rate": 5.058214747736093e-05, + "loss": 0.0026, + "step": 391 + }, + { + "epoch": 1.267070707070707, + "grad_norm": 0.013437739573419094, + "learning_rate": 5.071151358344114e-05, + "loss": 0.003, + "step": 392 + }, + { + "epoch": 1.2703030303030303, + "grad_norm": 0.017890624701976776, + "learning_rate": 5.084087968952135e-05, + "loss": 0.0095, + "step": 393 + }, + { + "epoch": 1.2735353535353535, + "grad_norm": 0.008945245295763016, + "learning_rate": 5.097024579560156e-05, + "loss": 0.0022, + "step": 394 + }, + { + "epoch": 1.2767676767676768, + "grad_norm": 0.010177022777497768, + "learning_rate": 5.109961190168176e-05, + "loss": 0.0026, + "step": 395 + }, + { + "epoch": 1.28, + "grad_norm": 0.010548125021159649, + "learning_rate": 5.122897800776197e-05, + "loss": 0.0022, + "step": 396 + }, + { + "epoch": 1.2832323232323233, + "grad_norm": 0.01482499111443758, + "learning_rate": 5.135834411384217e-05, + "loss": 0.0041, + "step": 397 + }, + { + "epoch": 1.2864646464646465, + "grad_norm": 0.009971357882022858, + "learning_rate": 5.148771021992238e-05, + "loss": 0.0032, + "step": 398 + }, + { + "epoch": 1.2896969696969696, + "grad_norm": 0.009518085978925228, + "learning_rate": 5.161707632600259e-05, + "loss": 0.0026, + "step": 399 + }, + { + "epoch": 1.2929292929292928, + "grad_norm": 0.010194380767643452, + "learning_rate": 5.17464424320828e-05, + "loss": 0.003, + "step": 400 + }, + { + "epoch": 1.2929292929292928, + "eval_loss": 0.004324791021645069, + "eval_runtime": 18.5425, + "eval_samples_per_second": 5.393, + "eval_steps_per_second": 1.348, + "step": 400 + }, + { + "epoch": 1.296161616161616, + "grad_norm": 0.013518190011382103, + "learning_rate": 5.1875808538163004e-05, + "loss": 0.0048, + "step": 401 + }, + { + "epoch": 1.2993939393939393, + "grad_norm": 0.009436222724616528, + "learning_rate": 5.2005174644243214e-05, + "loss": 0.0032, + "step": 402 + }, + { + "epoch": 1.3026262626262626, + "grad_norm": 0.014251810498535633, + "learning_rate": 5.213454075032341e-05, + "loss": 0.0038, + "step": 403 + }, + { + "epoch": 1.3058585858585858, + "grad_norm": 0.013866190798580647, + "learning_rate": 5.226390685640362e-05, + "loss": 0.0053, + "step": 404 + }, + { + "epoch": 1.309090909090909, + "grad_norm": 0.014752188697457314, + "learning_rate": 5.239327296248383e-05, + "loss": 0.0024, + "step": 405 + }, + { + "epoch": 1.3123232323232323, + "grad_norm": 0.010886342264711857, + "learning_rate": 5.252263906856404e-05, + "loss": 0.0041, + "step": 406 + }, + { + "epoch": 1.3155555555555556, + "grad_norm": 0.010374347679316998, + "learning_rate": 5.2652005174644245e-05, + "loss": 0.0037, + "step": 407 + }, + { + "epoch": 1.3187878787878788, + "grad_norm": 0.010170732624828815, + "learning_rate": 5.2781371280724455e-05, + "loss": 0.0051, + "step": 408 + }, + { + "epoch": 1.322020202020202, + "grad_norm": 0.010164180770516396, + "learning_rate": 5.2910737386804666e-05, + "loss": 0.004, + "step": 409 + }, + { + "epoch": 1.3252525252525253, + "grad_norm": 0.008564438670873642, + "learning_rate": 5.304010349288486e-05, + "loss": 0.0029, + "step": 410 + }, + { + "epoch": 1.3284848484848486, + "grad_norm": 0.008800626732409, + "learning_rate": 5.316946959896507e-05, + "loss": 0.003, + "step": 411 + }, + { + "epoch": 1.3317171717171719, + "grad_norm": 0.007803233340382576, + "learning_rate": 5.329883570504528e-05, + "loss": 0.0027, + "step": 412 + }, + { + "epoch": 1.3349494949494949, + "grad_norm": 0.00937716942280531, + "learning_rate": 5.3428201811125486e-05, + "loss": 0.0027, + "step": 413 + }, + { + "epoch": 1.3381818181818181, + "grad_norm": 0.008757539093494415, + "learning_rate": 5.35575679172057e-05, + "loss": 0.0031, + "step": 414 + }, + { + "epoch": 1.3414141414141414, + "grad_norm": 0.00794417317956686, + "learning_rate": 5.368693402328591e-05, + "loss": 0.0028, + "step": 415 + }, + { + "epoch": 1.3446464646464646, + "grad_norm": 0.011016054078936577, + "learning_rate": 5.3816300129366104e-05, + "loss": 0.0042, + "step": 416 + }, + { + "epoch": 1.347878787878788, + "grad_norm": 0.011397106572985649, + "learning_rate": 5.3945666235446314e-05, + "loss": 0.0043, + "step": 417 + }, + { + "epoch": 1.3511111111111112, + "grad_norm": 0.020235812291502953, + "learning_rate": 5.4075032341526524e-05, + "loss": 0.0082, + "step": 418 + }, + { + "epoch": 1.3543434343434344, + "grad_norm": 0.010009384714066982, + "learning_rate": 5.420439844760673e-05, + "loss": 0.0037, + "step": 419 + }, + { + "epoch": 1.3575757575757577, + "grad_norm": 0.010380405001342297, + "learning_rate": 5.433376455368694e-05, + "loss": 0.0032, + "step": 420 + }, + { + "epoch": 1.3608080808080807, + "grad_norm": 0.011236142367124557, + "learning_rate": 5.446313065976715e-05, + "loss": 0.0043, + "step": 421 + }, + { + "epoch": 1.364040404040404, + "grad_norm": 0.008906678296625614, + "learning_rate": 5.4592496765847345e-05, + "loss": 0.0025, + "step": 422 + }, + { + "epoch": 1.3672727272727272, + "grad_norm": 0.013497211039066315, + "learning_rate": 5.4721862871927555e-05, + "loss": 0.0039, + "step": 423 + }, + { + "epoch": 1.3705050505050504, + "grad_norm": 0.007696770131587982, + "learning_rate": 5.4851228978007766e-05, + "loss": 0.0028, + "step": 424 + }, + { + "epoch": 1.3737373737373737, + "grad_norm": 0.07293156534433365, + "learning_rate": 5.498059508408797e-05, + "loss": 0.005, + "step": 425 + }, + { + "epoch": 1.3737373737373737, + "eval_loss": 0.004379382357001305, + "eval_runtime": 18.5797, + "eval_samples_per_second": 5.382, + "eval_steps_per_second": 1.346, + "step": 425 + }, + { + "epoch": 1.376969696969697, + "grad_norm": 0.01241080742329359, + "learning_rate": 5.510996119016818e-05, + "loss": 0.0035, + "step": 426 + }, + { + "epoch": 1.3802020202020202, + "grad_norm": 0.010909633710980415, + "learning_rate": 5.523932729624839e-05, + "loss": 0.0036, + "step": 427 + }, + { + "epoch": 1.3834343434343435, + "grad_norm": 0.014454485848546028, + "learning_rate": 5.53686934023286e-05, + "loss": 0.0061, + "step": 428 + }, + { + "epoch": 1.3866666666666667, + "grad_norm": 0.006944994442164898, + "learning_rate": 5.5498059508408797e-05, + "loss": 0.0024, + "step": 429 + }, + { + "epoch": 1.38989898989899, + "grad_norm": 0.009738634340465069, + "learning_rate": 5.562742561448901e-05, + "loss": 0.0037, + "step": 430 + }, + { + "epoch": 1.3931313131313132, + "grad_norm": 0.01077584270387888, + "learning_rate": 5.575679172056921e-05, + "loss": 0.0041, + "step": 431 + }, + { + "epoch": 1.3963636363636365, + "grad_norm": 0.009695135056972504, + "learning_rate": 5.588615782664942e-05, + "loss": 0.0031, + "step": 432 + }, + { + "epoch": 1.3995959595959597, + "grad_norm": 0.014413881115615368, + "learning_rate": 5.601552393272963e-05, + "loss": 0.0037, + "step": 433 + }, + { + "epoch": 1.4028282828282828, + "grad_norm": 0.012725897133350372, + "learning_rate": 5.614489003880984e-05, + "loss": 0.0043, + "step": 434 + }, + { + "epoch": 1.406060606060606, + "grad_norm": 0.010477320291101933, + "learning_rate": 5.627425614489004e-05, + "loss": 0.003, + "step": 435 + }, + { + "epoch": 1.4092929292929293, + "grad_norm": 0.010203997604548931, + "learning_rate": 5.640362225097025e-05, + "loss": 0.0031, + "step": 436 + }, + { + "epoch": 1.4125252525252525, + "grad_norm": 0.011213167570531368, + "learning_rate": 5.653298835705045e-05, + "loss": 0.0045, + "step": 437 + }, + { + "epoch": 1.4157575757575758, + "grad_norm": 0.011191388592123985, + "learning_rate": 5.666235446313066e-05, + "loss": 0.0048, + "step": 438 + }, + { + "epoch": 1.418989898989899, + "grad_norm": 0.008038587868213654, + "learning_rate": 5.679172056921087e-05, + "loss": 0.0027, + "step": 439 + }, + { + "epoch": 1.4222222222222223, + "grad_norm": 0.010327471420168877, + "learning_rate": 5.692108667529108e-05, + "loss": 0.004, + "step": 440 + }, + { + "epoch": 1.4254545454545455, + "grad_norm": 0.01546016801148653, + "learning_rate": 5.7050452781371286e-05, + "loss": 0.0052, + "step": 441 + }, + { + "epoch": 1.4286868686868686, + "grad_norm": 0.00971939880400896, + "learning_rate": 5.717981888745149e-05, + "loss": 0.0029, + "step": 442 + }, + { + "epoch": 1.4319191919191918, + "grad_norm": 0.019665749743580818, + "learning_rate": 5.730918499353169e-05, + "loss": 0.01, + "step": 443 + }, + { + "epoch": 1.435151515151515, + "grad_norm": 0.00928762461990118, + "learning_rate": 5.74385510996119e-05, + "loss": 0.003, + "step": 444 + }, + { + "epoch": 1.4383838383838383, + "grad_norm": 0.009248577058315277, + "learning_rate": 5.7567917205692113e-05, + "loss": 0.0024, + "step": 445 + }, + { + "epoch": 1.4416161616161616, + "grad_norm": 0.006708938162773848, + "learning_rate": 5.7697283311772324e-05, + "loss": 0.0021, + "step": 446 + }, + { + "epoch": 1.4448484848484848, + "grad_norm": 0.014481152407824993, + "learning_rate": 5.782664941785253e-05, + "loss": 0.004, + "step": 447 + }, + { + "epoch": 1.448080808080808, + "grad_norm": 0.015891794115304947, + "learning_rate": 5.795601552393273e-05, + "loss": 0.0078, + "step": 448 + }, + { + "epoch": 1.4513131313131313, + "grad_norm": 0.01767084375023842, + "learning_rate": 5.8085381630012934e-05, + "loss": 0.0078, + "step": 449 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 0.009849028661847115, + "learning_rate": 5.8214747736093145e-05, + "loss": 0.0039, + "step": 450 + }, + { + "epoch": 1.4545454545454546, + "eval_loss": 0.003997213672846556, + "eval_runtime": 18.599, + "eval_samples_per_second": 5.377, + "eval_steps_per_second": 1.344, + "step": 450 + }, + { + "epoch": 1.4577777777777778, + "grad_norm": 0.007672747131437063, + "learning_rate": 5.8344113842173355e-05, + "loss": 0.0024, + "step": 451 + }, + { + "epoch": 1.461010101010101, + "grad_norm": 0.0118905920535326, + "learning_rate": 5.8473479948253565e-05, + "loss": 0.0048, + "step": 452 + }, + { + "epoch": 1.4642424242424243, + "grad_norm": 0.01564439944922924, + "learning_rate": 5.860284605433377e-05, + "loss": 0.0084, + "step": 453 + }, + { + "epoch": 1.4674747474747476, + "grad_norm": 0.005980201996862888, + "learning_rate": 5.873221216041398e-05, + "loss": 0.0018, + "step": 454 + }, + { + "epoch": 1.4707070707070706, + "grad_norm": 0.007433287333697081, + "learning_rate": 5.8861578266494176e-05, + "loss": 0.0039, + "step": 455 + }, + { + "epoch": 1.4739393939393939, + "grad_norm": 0.010296525433659554, + "learning_rate": 5.8990944372574386e-05, + "loss": 0.0039, + "step": 456 + }, + { + "epoch": 1.4771717171717171, + "grad_norm": 0.009413332678377628, + "learning_rate": 5.9120310478654596e-05, + "loss": 0.0027, + "step": 457 + }, + { + "epoch": 1.4804040404040404, + "grad_norm": 0.012635589577257633, + "learning_rate": 5.9249676584734806e-05, + "loss": 0.0053, + "step": 458 + }, + { + "epoch": 1.4836363636363636, + "grad_norm": 0.0071015129797160625, + "learning_rate": 5.937904269081501e-05, + "loss": 0.0026, + "step": 459 + }, + { + "epoch": 1.486868686868687, + "grad_norm": 0.010815092362463474, + "learning_rate": 5.950840879689522e-05, + "loss": 0.004, + "step": 460 + }, + { + "epoch": 1.4901010101010101, + "grad_norm": 0.010755320079624653, + "learning_rate": 5.963777490297542e-05, + "loss": 0.0043, + "step": 461 + }, + { + "epoch": 1.4933333333333334, + "grad_norm": 0.0068894485011696815, + "learning_rate": 5.976714100905563e-05, + "loss": 0.0024, + "step": 462 + }, + { + "epoch": 1.4965656565656564, + "grad_norm": 0.010043000802397728, + "learning_rate": 5.989650711513584e-05, + "loss": 0.004, + "step": 463 + }, + { + "epoch": 1.4997979797979797, + "grad_norm": 0.013079461641609669, + "learning_rate": 6.002587322121605e-05, + "loss": 0.0061, + "step": 464 + }, + { + "epoch": 1.503030303030303, + "grad_norm": 0.012679477222263813, + "learning_rate": 6.015523932729625e-05, + "loss": 0.0036, + "step": 465 + }, + { + "epoch": 1.5062626262626262, + "grad_norm": 0.009493381716310978, + "learning_rate": 6.028460543337646e-05, + "loss": 0.0038, + "step": 466 + }, + { + "epoch": 1.5094949494949494, + "grad_norm": 0.016118116676807404, + "learning_rate": 6.041397153945667e-05, + "loss": 0.0058, + "step": 467 + }, + { + "epoch": 1.5127272727272727, + "grad_norm": 0.01638084277510643, + "learning_rate": 6.054333764553687e-05, + "loss": 0.0056, + "step": 468 + }, + { + "epoch": 1.515959595959596, + "grad_norm": 0.007027367129921913, + "learning_rate": 6.067270375161708e-05, + "loss": 0.002, + "step": 469 + }, + { + "epoch": 1.5191919191919192, + "grad_norm": 0.00825897790491581, + "learning_rate": 6.080206985769729e-05, + "loss": 0.0018, + "step": 470 + }, + { + "epoch": 1.5224242424242425, + "grad_norm": 0.00937443133443594, + "learning_rate": 6.093143596377749e-05, + "loss": 0.0025, + "step": 471 + }, + { + "epoch": 1.5256565656565657, + "grad_norm": 0.009296581149101257, + "learning_rate": 6.10608020698577e-05, + "loss": 0.0033, + "step": 472 + }, + { + "epoch": 1.528888888888889, + "grad_norm": 0.008974263444542885, + "learning_rate": 6.119016817593791e-05, + "loss": 0.0033, + "step": 473 + }, + { + "epoch": 1.5321212121212122, + "grad_norm": 0.012877869419753551, + "learning_rate": 6.131953428201811e-05, + "loss": 0.0049, + "step": 474 + }, + { + "epoch": 1.5353535353535355, + "grad_norm": 0.011777269653975964, + "learning_rate": 6.144890038809832e-05, + "loss": 0.0045, + "step": 475 + }, + { + "epoch": 1.5353535353535355, + "eval_loss": 0.004129638429731131, + "eval_runtime": 18.5241, + "eval_samples_per_second": 5.398, + "eval_steps_per_second": 1.35, + "step": 475 + }, + { + "epoch": 1.5385858585858587, + "grad_norm": 0.010810826905071735, + "learning_rate": 6.157826649417853e-05, + "loss": 0.0027, + "step": 476 + }, + { + "epoch": 1.541818181818182, + "grad_norm": 0.008260673843324184, + "learning_rate": 6.170763260025874e-05, + "loss": 0.0022, + "step": 477 + }, + { + "epoch": 1.545050505050505, + "grad_norm": 0.018339315429329872, + "learning_rate": 6.183699870633895e-05, + "loss": 0.0081, + "step": 478 + }, + { + "epoch": 1.5482828282828283, + "grad_norm": 0.010011550970375538, + "learning_rate": 6.196636481241915e-05, + "loss": 0.0053, + "step": 479 + }, + { + "epoch": 1.5515151515151515, + "grad_norm": 0.012307565659284592, + "learning_rate": 6.209573091849934e-05, + "loss": 0.0052, + "step": 480 + }, + { + "epoch": 1.5547474747474748, + "grad_norm": 0.0053881146013736725, + "learning_rate": 6.222509702457955e-05, + "loss": 0.002, + "step": 481 + }, + { + "epoch": 1.557979797979798, + "grad_norm": 0.00998155027627945, + "learning_rate": 6.235446313065976e-05, + "loss": 0.0034, + "step": 482 + }, + { + "epoch": 1.561212121212121, + "grad_norm": 0.010800400748848915, + "learning_rate": 6.248382923673998e-05, + "loss": 0.004, + "step": 483 + }, + { + "epoch": 1.5644444444444443, + "grad_norm": 0.008864471688866615, + "learning_rate": 6.261319534282019e-05, + "loss": 0.0037, + "step": 484 + }, + { + "epoch": 1.5676767676767676, + "grad_norm": 0.012302475981414318, + "learning_rate": 6.27425614489004e-05, + "loss": 0.0051, + "step": 485 + }, + { + "epoch": 1.5709090909090908, + "grad_norm": 0.007560106460005045, + "learning_rate": 6.28719275549806e-05, + "loss": 0.0022, + "step": 486 + }, + { + "epoch": 1.574141414141414, + "grad_norm": 0.018010618165135384, + "learning_rate": 6.30012936610608e-05, + "loss": 0.0067, + "step": 487 + }, + { + "epoch": 1.5773737373737373, + "grad_norm": 0.00866749044507742, + "learning_rate": 6.313065976714101e-05, + "loss": 0.0022, + "step": 488 + }, + { + "epoch": 1.5806060606060606, + "grad_norm": 0.01100226491689682, + "learning_rate": 6.326002587322122e-05, + "loss": 0.0043, + "step": 489 + }, + { + "epoch": 1.5838383838383838, + "grad_norm": 0.008842782117426395, + "learning_rate": 6.338939197930143e-05, + "loss": 0.0025, + "step": 490 + }, + { + "epoch": 1.587070707070707, + "grad_norm": 0.010475879535079002, + "learning_rate": 6.351875808538163e-05, + "loss": 0.003, + "step": 491 + }, + { + "epoch": 1.5903030303030303, + "grad_norm": 0.005673507694154978, + "learning_rate": 6.364812419146184e-05, + "loss": 0.0012, + "step": 492 + }, + { + "epoch": 1.5935353535353536, + "grad_norm": 0.009791476652026176, + "learning_rate": 6.377749029754204e-05, + "loss": 0.0019, + "step": 493 + }, + { + "epoch": 1.5967676767676768, + "grad_norm": 0.009763398207724094, + "learning_rate": 6.390685640362225e-05, + "loss": 0.0027, + "step": 494 + }, + { + "epoch": 1.6, + "grad_norm": 0.009230737574398518, + "learning_rate": 6.403622250970246e-05, + "loss": 0.0023, + "step": 495 + }, + { + "epoch": 1.6032323232323233, + "grad_norm": 0.007090194150805473, + "learning_rate": 6.416558861578267e-05, + "loss": 0.002, + "step": 496 + }, + { + "epoch": 1.6064646464646466, + "grad_norm": 0.014568297192454338, + "learning_rate": 6.429495472186288e-05, + "loss": 0.0036, + "step": 497 + }, + { + "epoch": 1.6096969696969698, + "grad_norm": 0.011722132563591003, + "learning_rate": 6.442432082794309e-05, + "loss": 0.0035, + "step": 498 + }, + { + "epoch": 1.6129292929292929, + "grad_norm": 0.009481179527938366, + "learning_rate": 6.45536869340233e-05, + "loss": 0.0035, + "step": 499 + }, + { + "epoch": 1.6161616161616161, + "grad_norm": 0.011115781962871552, + "learning_rate": 6.46830530401035e-05, + "loss": 0.0032, + "step": 500 + }, + { + "epoch": 1.6161616161616161, + "eval_loss": 0.00416895467787981, + "eval_runtime": 18.5315, + "eval_samples_per_second": 5.396, + "eval_steps_per_second": 1.349, + "step": 500 + }, + { + "epoch": 1.6193939393939394, + "grad_norm": 0.010090421885251999, + "learning_rate": 6.48124191461837e-05, + "loss": 0.0036, + "step": 501 + }, + { + "epoch": 1.6226262626262626, + "grad_norm": 0.00535732414573431, + "learning_rate": 6.494178525226392e-05, + "loss": 0.0018, + "step": 502 + }, + { + "epoch": 1.625858585858586, + "grad_norm": 0.01589041016995907, + "learning_rate": 6.507115135834411e-05, + "loss": 0.0092, + "step": 503 + }, + { + "epoch": 1.6290909090909091, + "grad_norm": 0.011435211636126041, + "learning_rate": 6.520051746442432e-05, + "loss": 0.005, + "step": 504 + }, + { + "epoch": 1.6323232323232322, + "grad_norm": 0.008358100429177284, + "learning_rate": 6.532988357050453e-05, + "loss": 0.0032, + "step": 505 + }, + { + "epoch": 1.6355555555555554, + "grad_norm": 0.01174091175198555, + "learning_rate": 6.545924967658473e-05, + "loss": 0.0032, + "step": 506 + }, + { + "epoch": 1.6387878787878787, + "grad_norm": 0.011000048369169235, + "learning_rate": 6.558861578266494e-05, + "loss": 0.002, + "step": 507 + }, + { + "epoch": 1.642020202020202, + "grad_norm": 0.009679551236331463, + "learning_rate": 6.571798188874515e-05, + "loss": 0.0024, + "step": 508 + }, + { + "epoch": 1.6452525252525252, + "grad_norm": 0.01002445723861456, + "learning_rate": 6.584734799482536e-05, + "loss": 0.0019, + "step": 509 + }, + { + "epoch": 1.6484848484848484, + "grad_norm": 0.01417891588062048, + "learning_rate": 6.597671410090557e-05, + "loss": 0.0033, + "step": 510 + }, + { + "epoch": 1.6517171717171717, + "grad_norm": 0.013681537471711636, + "learning_rate": 6.610608020698578e-05, + "loss": 0.0036, + "step": 511 + }, + { + "epoch": 1.654949494949495, + "grad_norm": 0.009443574585020542, + "learning_rate": 6.623544631306599e-05, + "loss": 0.0023, + "step": 512 + }, + { + "epoch": 1.6581818181818182, + "grad_norm": 0.009754985570907593, + "learning_rate": 6.636481241914619e-05, + "loss": 0.0033, + "step": 513 + }, + { + "epoch": 1.6614141414141415, + "grad_norm": 0.010350651107728481, + "learning_rate": 6.649417852522638e-05, + "loss": 0.0024, + "step": 514 + }, + { + "epoch": 1.6646464646464647, + "grad_norm": 0.012001021765172482, + "learning_rate": 6.66235446313066e-05, + "loss": 0.0046, + "step": 515 + }, + { + "epoch": 1.667878787878788, + "grad_norm": 0.008749539963901043, + "learning_rate": 6.67529107373868e-05, + "loss": 0.0029, + "step": 516 + }, + { + "epoch": 1.6711111111111112, + "grad_norm": 0.012192701920866966, + "learning_rate": 6.688227684346702e-05, + "loss": 0.0057, + "step": 517 + }, + { + "epoch": 1.6743434343434345, + "grad_norm": 0.007326943334192038, + "learning_rate": 6.701164294954723e-05, + "loss": 0.0022, + "step": 518 + }, + { + "epoch": 1.6775757575757577, + "grad_norm": 0.01528987754136324, + "learning_rate": 6.714100905562742e-05, + "loss": 0.0081, + "step": 519 + }, + { + "epoch": 1.680808080808081, + "grad_norm": 0.011944162659347057, + "learning_rate": 6.727037516170763e-05, + "loss": 0.0041, + "step": 520 + }, + { + "epoch": 1.684040404040404, + "grad_norm": 0.007957994006574154, + "learning_rate": 6.739974126778784e-05, + "loss": 0.0029, + "step": 521 + }, + { + "epoch": 1.6872727272727273, + "grad_norm": 0.008053603582084179, + "learning_rate": 6.752910737386805e-05, + "loss": 0.0031, + "step": 522 + }, + { + "epoch": 1.6905050505050505, + "grad_norm": 0.008817918598651886, + "learning_rate": 6.765847347994826e-05, + "loss": 0.0045, + "step": 523 + }, + { + "epoch": 1.6937373737373738, + "grad_norm": 0.010544711723923683, + "learning_rate": 6.778783958602847e-05, + "loss": 0.0052, + "step": 524 + }, + { + "epoch": 1.696969696969697, + "grad_norm": 0.006746083032339811, + "learning_rate": 6.791720569210867e-05, + "loss": 0.0021, + "step": 525 + }, + { + "epoch": 1.696969696969697, + "eval_loss": 0.003925703000277281, + "eval_runtime": 18.5246, + "eval_samples_per_second": 5.398, + "eval_steps_per_second": 1.35, + "step": 525 + }, + { + "epoch": 1.70020202020202, + "grad_norm": 0.00789568293839693, + "learning_rate": 6.804657179818887e-05, + "loss": 0.0026, + "step": 526 + }, + { + "epoch": 1.7034343434343433, + "grad_norm": 0.004943890497088432, + "learning_rate": 6.817593790426908e-05, + "loss": 0.0016, + "step": 527 + }, + { + "epoch": 1.7066666666666666, + "grad_norm": 0.011704254895448685, + "learning_rate": 6.830530401034929e-05, + "loss": 0.0058, + "step": 528 + }, + { + "epoch": 1.7098989898989898, + "grad_norm": 0.010488653555512428, + "learning_rate": 6.84346701164295e-05, + "loss": 0.0038, + "step": 529 + }, + { + "epoch": 1.713131313131313, + "grad_norm": 0.014268319122493267, + "learning_rate": 6.856403622250971e-05, + "loss": 0.0035, + "step": 530 + }, + { + "epoch": 1.7163636363636363, + "grad_norm": 0.008911222219467163, + "learning_rate": 6.869340232858992e-05, + "loss": 0.0022, + "step": 531 + }, + { + "epoch": 1.7195959595959596, + "grad_norm": 0.007050775457173586, + "learning_rate": 6.882276843467012e-05, + "loss": 0.0024, + "step": 532 + }, + { + "epoch": 1.7228282828282828, + "grad_norm": 0.008809910155832767, + "learning_rate": 6.895213454075033e-05, + "loss": 0.0035, + "step": 533 + }, + { + "epoch": 1.726060606060606, + "grad_norm": 0.010169385932385921, + "learning_rate": 6.908150064683054e-05, + "loss": 0.0031, + "step": 534 + }, + { + "epoch": 1.7292929292929293, + "grad_norm": 0.00835514348000288, + "learning_rate": 6.921086675291075e-05, + "loss": 0.003, + "step": 535 + }, + { + "epoch": 1.7325252525252526, + "grad_norm": 0.011087946593761444, + "learning_rate": 6.934023285899096e-05, + "loss": 0.0047, + "step": 536 + }, + { + "epoch": 1.7357575757575758, + "grad_norm": 0.011055010370910168, + "learning_rate": 6.946959896507115e-05, + "loss": 0.003, + "step": 537 + }, + { + "epoch": 1.738989898989899, + "grad_norm": 0.011571435257792473, + "learning_rate": 6.959896507115135e-05, + "loss": 0.005, + "step": 538 + }, + { + "epoch": 1.7422222222222223, + "grad_norm": 0.010782378725707531, + "learning_rate": 6.972833117723156e-05, + "loss": 0.004, + "step": 539 + }, + { + "epoch": 1.7454545454545456, + "grad_norm": 0.013954983092844486, + "learning_rate": 6.985769728331177e-05, + "loss": 0.0061, + "step": 540 + }, + { + "epoch": 1.7486868686868688, + "grad_norm": 0.012455100193619728, + "learning_rate": 6.998706338939198e-05, + "loss": 0.0062, + "step": 541 + }, + { + "epoch": 1.7519191919191919, + "grad_norm": 0.012288283556699753, + "learning_rate": 7.011642949547219e-05, + "loss": 0.0035, + "step": 542 + }, + { + "epoch": 1.7551515151515151, + "grad_norm": 0.007455012295395136, + "learning_rate": 7.02457956015524e-05, + "loss": 0.0022, + "step": 543 + }, + { + "epoch": 1.7583838383838384, + "grad_norm": 0.00867515616118908, + "learning_rate": 7.037516170763261e-05, + "loss": 0.0035, + "step": 544 + }, + { + "epoch": 1.7616161616161616, + "grad_norm": 0.005005441606044769, + "learning_rate": 7.050452781371281e-05, + "loss": 0.002, + "step": 545 + }, + { + "epoch": 1.7648484848484849, + "grad_norm": 0.008555242791771889, + "learning_rate": 7.063389391979302e-05, + "loss": 0.0037, + "step": 546 + }, + { + "epoch": 1.768080808080808, + "grad_norm": 0.010418135672807693, + "learning_rate": 7.076326002587323e-05, + "loss": 0.0038, + "step": 547 + }, + { + "epoch": 1.7713131313131312, + "grad_norm": 0.006279077846556902, + "learning_rate": 7.089262613195343e-05, + "loss": 0.0017, + "step": 548 + }, + { + "epoch": 1.7745454545454544, + "grad_norm": 0.012131530791521072, + "learning_rate": 7.102199223803364e-05, + "loss": 0.0061, + "step": 549 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.009560024365782738, + "learning_rate": 7.115135834411385e-05, + "loss": 0.0028, + "step": 550 + }, + { + "epoch": 1.7777777777777777, + "eval_loss": 0.0038468276616185904, + "eval_runtime": 18.5364, + "eval_samples_per_second": 5.395, + "eval_steps_per_second": 1.349, + "step": 550 + }, + { + "epoch": 1.781010101010101, + "grad_norm": 0.00908865500241518, + "learning_rate": 7.128072445019404e-05, + "loss": 0.0028, + "step": 551 + }, + { + "epoch": 1.7842424242424242, + "grad_norm": 0.010756799951195717, + "learning_rate": 7.141009055627425e-05, + "loss": 0.0044, + "step": 552 + }, + { + "epoch": 1.7874747474747474, + "grad_norm": 0.007486172951757908, + "learning_rate": 7.153945666235446e-05, + "loss": 0.0026, + "step": 553 + }, + { + "epoch": 1.7907070707070707, + "grad_norm": 0.011368786916136742, + "learning_rate": 7.166882276843467e-05, + "loss": 0.0029, + "step": 554 + }, + { + "epoch": 1.793939393939394, + "grad_norm": 0.009916623122990131, + "learning_rate": 7.179818887451488e-05, + "loss": 0.0041, + "step": 555 + }, + { + "epoch": 1.7971717171717172, + "grad_norm": 0.008696876466274261, + "learning_rate": 7.19275549805951e-05, + "loss": 0.0028, + "step": 556 + }, + { + "epoch": 1.8004040404040405, + "grad_norm": 0.013026717118918896, + "learning_rate": 7.20569210866753e-05, + "loss": 0.0081, + "step": 557 + }, + { + "epoch": 1.8036363636363637, + "grad_norm": 0.014385446906089783, + "learning_rate": 7.21862871927555e-05, + "loss": 0.0053, + "step": 558 + }, + { + "epoch": 1.806868686868687, + "grad_norm": 0.009371618740260601, + "learning_rate": 7.231565329883571e-05, + "loss": 0.0022, + "step": 559 + }, + { + "epoch": 1.8101010101010102, + "grad_norm": 0.00902082584798336, + "learning_rate": 7.244501940491591e-05, + "loss": 0.0043, + "step": 560 + }, + { + "epoch": 1.8133333333333335, + "grad_norm": 0.007532780524343252, + "learning_rate": 7.257438551099612e-05, + "loss": 0.0026, + "step": 561 + }, + { + "epoch": 1.8165656565656567, + "grad_norm": 0.011961591430008411, + "learning_rate": 7.270375161707633e-05, + "loss": 0.0048, + "step": 562 + }, + { + "epoch": 1.8197979797979797, + "grad_norm": 0.006793793756514788, + "learning_rate": 7.283311772315654e-05, + "loss": 0.0026, + "step": 563 + }, + { + "epoch": 1.823030303030303, + "grad_norm": 0.005821246188133955, + "learning_rate": 7.296248382923674e-05, + "loss": 0.0021, + "step": 564 + }, + { + "epoch": 1.8262626262626263, + "grad_norm": 0.010227235034108162, + "learning_rate": 7.309184993531695e-05, + "loss": 0.0036, + "step": 565 + }, + { + "epoch": 1.8294949494949495, + "grad_norm": 0.008356478065252304, + "learning_rate": 7.322121604139716e-05, + "loss": 0.0023, + "step": 566 + }, + { + "epoch": 1.8327272727272728, + "grad_norm": 0.009778296574950218, + "learning_rate": 7.335058214747737e-05, + "loss": 0.0037, + "step": 567 + }, + { + "epoch": 1.835959595959596, + "grad_norm": 0.005213058087974787, + "learning_rate": 7.347994825355758e-05, + "loss": 0.0015, + "step": 568 + }, + { + "epoch": 1.839191919191919, + "grad_norm": 0.008941391482949257, + "learning_rate": 7.360931435963779e-05, + "loss": 0.0042, + "step": 569 + }, + { + "epoch": 1.8424242424242423, + "grad_norm": 0.011023417115211487, + "learning_rate": 7.3738680465718e-05, + "loss": 0.0043, + "step": 570 + }, + { + "epoch": 1.8456565656565656, + "grad_norm": 0.007842346094548702, + "learning_rate": 7.38680465717982e-05, + "loss": 0.0022, + "step": 571 + }, + { + "epoch": 1.8488888888888888, + "grad_norm": 0.013547302223742008, + "learning_rate": 7.399741267787839e-05, + "loss": 0.0056, + "step": 572 + }, + { + "epoch": 1.852121212121212, + "grad_norm": 0.006040791980922222, + "learning_rate": 7.41267787839586e-05, + "loss": 0.0018, + "step": 573 + }, + { + "epoch": 1.8553535353535353, + "grad_norm": 0.014373473823070526, + "learning_rate": 7.425614489003881e-05, + "loss": 0.0065, + "step": 574 + }, + { + "epoch": 1.8585858585858586, + "grad_norm": 0.013441539369523525, + "learning_rate": 7.438551099611902e-05, + "loss": 0.0072, + "step": 575 + }, + { + "epoch": 1.8585858585858586, + "eval_loss": 0.0037343467120081186, + "eval_runtime": 18.5469, + "eval_samples_per_second": 5.392, + "eval_steps_per_second": 1.348, + "step": 575 + }, + { + "epoch": 1.8618181818181818, + "grad_norm": 0.010687317699193954, + "learning_rate": 7.451487710219923e-05, + "loss": 0.0046, + "step": 576 + }, + { + "epoch": 1.865050505050505, + "grad_norm": 0.007853105664253235, + "learning_rate": 7.464424320827943e-05, + "loss": 0.0034, + "step": 577 + }, + { + "epoch": 1.8682828282828283, + "grad_norm": 0.0073550548404455185, + "learning_rate": 7.477360931435964e-05, + "loss": 0.0023, + "step": 578 + }, + { + "epoch": 1.8715151515151516, + "grad_norm": 0.010772529058158398, + "learning_rate": 7.490297542043985e-05, + "loss": 0.0036, + "step": 579 + }, + { + "epoch": 1.8747474747474748, + "grad_norm": 0.008650882169604301, + "learning_rate": 7.503234152652006e-05, + "loss": 0.0028, + "step": 580 + }, + { + "epoch": 1.877979797979798, + "grad_norm": 0.0067758322693407536, + "learning_rate": 7.516170763260027e-05, + "loss": 0.0027, + "step": 581 + }, + { + "epoch": 1.8812121212121213, + "grad_norm": 0.006348081398755312, + "learning_rate": 7.529107373868047e-05, + "loss": 0.0022, + "step": 582 + }, + { + "epoch": 1.8844444444444446, + "grad_norm": 0.009605488739907742, + "learning_rate": 7.542043984476068e-05, + "loss": 0.0034, + "step": 583 + }, + { + "epoch": 1.8876767676767678, + "grad_norm": 0.007978033274412155, + "learning_rate": 7.554980595084087e-05, + "loss": 0.0026, + "step": 584 + }, + { + "epoch": 1.8909090909090909, + "grad_norm": 0.004183103330433369, + "learning_rate": 7.567917205692108e-05, + "loss": 0.0013, + "step": 585 + }, + { + "epoch": 1.8941414141414141, + "grad_norm": 0.009894490242004395, + "learning_rate": 7.58085381630013e-05, + "loss": 0.0036, + "step": 586 + }, + { + "epoch": 1.8973737373737374, + "grad_norm": 0.009437436237931252, + "learning_rate": 7.59379042690815e-05, + "loss": 0.0032, + "step": 587 + }, + { + "epoch": 1.9006060606060606, + "grad_norm": 0.007200904656201601, + "learning_rate": 7.606727037516171e-05, + "loss": 0.0026, + "step": 588 + }, + { + "epoch": 1.9038383838383839, + "grad_norm": 0.010573303326964378, + "learning_rate": 7.619663648124192e-05, + "loss": 0.003, + "step": 589 + }, + { + "epoch": 1.907070707070707, + "grad_norm": 0.011452763341367245, + "learning_rate": 7.632600258732212e-05, + "loss": 0.0066, + "step": 590 + }, + { + "epoch": 1.9103030303030302, + "grad_norm": 0.006278225686401129, + "learning_rate": 7.645536869340233e-05, + "loss": 0.0026, + "step": 591 + }, + { + "epoch": 1.9135353535353534, + "grad_norm": 0.009631357155740261, + "learning_rate": 7.658473479948254e-05, + "loss": 0.0028, + "step": 592 + }, + { + "epoch": 1.9167676767676767, + "grad_norm": 0.007342373952269554, + "learning_rate": 7.671410090556275e-05, + "loss": 0.003, + "step": 593 + }, + { + "epoch": 1.92, + "grad_norm": 0.007070086430758238, + "learning_rate": 7.684346701164295e-05, + "loss": 0.0017, + "step": 594 + }, + { + "epoch": 1.9232323232323232, + "grad_norm": 0.00917192455381155, + "learning_rate": 7.697283311772316e-05, + "loss": 0.0017, + "step": 595 + }, + { + "epoch": 1.9264646464646464, + "grad_norm": 0.008652327582240105, + "learning_rate": 7.710219922380336e-05, + "loss": 0.0027, + "step": 596 + }, + { + "epoch": 1.9296969696969697, + "grad_norm": 0.012016909196972847, + "learning_rate": 7.723156532988357e-05, + "loss": 0.0036, + "step": 597 + }, + { + "epoch": 1.932929292929293, + "grad_norm": 0.006076544988900423, + "learning_rate": 7.736093143596378e-05, + "loss": 0.002, + "step": 598 + }, + { + "epoch": 1.9361616161616162, + "grad_norm": 0.004448462277650833, + "learning_rate": 7.749029754204399e-05, + "loss": 0.0015, + "step": 599 + }, + { + "epoch": 1.9393939393939394, + "grad_norm": 0.012907430529594421, + "learning_rate": 7.76196636481242e-05, + "loss": 0.0091, + "step": 600 + }, + { + "epoch": 1.9393939393939394, + "eval_loss": 0.004055679310113192, + "eval_runtime": 18.6094, + "eval_samples_per_second": 5.374, + "eval_steps_per_second": 1.343, + "step": 600 + }, + { + "epoch": 1.9426262626262627, + "grad_norm": 0.011052297428250313, + "learning_rate": 7.774902975420441e-05, + "loss": 0.0041, + "step": 601 + }, + { + "epoch": 1.945858585858586, + "grad_norm": 0.008796362206339836, + "learning_rate": 7.787839586028462e-05, + "loss": 0.0036, + "step": 602 + }, + { + "epoch": 1.9490909090909092, + "grad_norm": 0.014682842418551445, + "learning_rate": 7.800776196636481e-05, + "loss": 0.0048, + "step": 603 + }, + { + "epoch": 1.9523232323232325, + "grad_norm": 0.008703047409653664, + "learning_rate": 7.813712807244502e-05, + "loss": 0.0034, + "step": 604 + }, + { + "epoch": 1.9555555555555557, + "grad_norm": 0.006767407990992069, + "learning_rate": 7.826649417852523e-05, + "loss": 0.0024, + "step": 605 + }, + { + "epoch": 1.9587878787878787, + "grad_norm": 0.008383658714592457, + "learning_rate": 7.839586028460543e-05, + "loss": 0.0032, + "step": 606 + }, + { + "epoch": 1.962020202020202, + "grad_norm": 0.005494570825248957, + "learning_rate": 7.852522639068564e-05, + "loss": 0.0019, + "step": 607 + }, + { + "epoch": 1.9652525252525253, + "grad_norm": 0.009956276044249535, + "learning_rate": 7.865459249676585e-05, + "loss": 0.0032, + "step": 608 + }, + { + "epoch": 1.9684848484848485, + "grad_norm": 0.007628526072949171, + "learning_rate": 7.878395860284605e-05, + "loss": 0.0028, + "step": 609 + }, + { + "epoch": 1.9717171717171718, + "grad_norm": 0.007804389111697674, + "learning_rate": 7.891332470892626e-05, + "loss": 0.0029, + "step": 610 + }, + { + "epoch": 1.9749494949494948, + "grad_norm": 0.007861903868615627, + "learning_rate": 7.904269081500647e-05, + "loss": 0.0032, + "step": 611 + }, + { + "epoch": 1.978181818181818, + "grad_norm": 0.004911143332719803, + "learning_rate": 7.917205692108668e-05, + "loss": 0.0015, + "step": 612 + }, + { + "epoch": 1.9814141414141413, + "grad_norm": 0.009105902165174484, + "learning_rate": 7.930142302716689e-05, + "loss": 0.0052, + "step": 613 + }, + { + "epoch": 1.9846464646464645, + "grad_norm": 0.006514673586934805, + "learning_rate": 7.94307891332471e-05, + "loss": 0.0022, + "step": 614 + }, + { + "epoch": 1.9878787878787878, + "grad_norm": 0.009500369429588318, + "learning_rate": 7.956015523932731e-05, + "loss": 0.0035, + "step": 615 + }, + { + "epoch": 1.991111111111111, + "grad_norm": 0.008826248347759247, + "learning_rate": 7.968952134540751e-05, + "loss": 0.0035, + "step": 616 + }, + { + "epoch": 1.9943434343434343, + "grad_norm": 0.018535856157541275, + "learning_rate": 7.981888745148772e-05, + "loss": 0.011, + "step": 617 + }, + { + "epoch": 1.9975757575757576, + "grad_norm": 0.00722027663141489, + "learning_rate": 7.994825355756791e-05, + "loss": 0.0024, + "step": 618 + }, + { + "epoch": 2.000808080808081, + "grad_norm": 0.015815898776054382, + "learning_rate": 8.007761966364812e-05, + "loss": 0.0062, + "step": 619 + }, + { + "epoch": 2.004040404040404, + "grad_norm": 0.01083781011402607, + "learning_rate": 8.020698576972833e-05, + "loss": 0.004, + "step": 620 + }, + { + "epoch": 2.0072727272727273, + "grad_norm": 0.005312093999236822, + "learning_rate": 8.033635187580855e-05, + "loss": 0.0018, + "step": 621 + }, + { + "epoch": 2.0105050505050506, + "grad_norm": 0.006571098230779171, + "learning_rate": 8.046571798188874e-05, + "loss": 0.0022, + "step": 622 + }, + { + "epoch": 2.013737373737374, + "grad_norm": 0.005624288227409124, + "learning_rate": 8.059508408796895e-05, + "loss": 0.0017, + "step": 623 + }, + { + "epoch": 2.016969696969697, + "grad_norm": 0.007645309902727604, + "learning_rate": 8.072445019404916e-05, + "loss": 0.0025, + "step": 624 + }, + { + "epoch": 2.0202020202020203, + "grad_norm": 0.007025252562016249, + "learning_rate": 8.085381630012937e-05, + "loss": 0.0021, + "step": 625 + }, + { + "epoch": 2.0202020202020203, + "eval_loss": 0.003941759467124939, + "eval_runtime": 18.549, + "eval_samples_per_second": 5.391, + "eval_steps_per_second": 1.348, + "step": 625 + }, + { + "epoch": 2.0234343434343436, + "grad_norm": 0.006916250567883253, + "learning_rate": 8.098318240620958e-05, + "loss": 0.0023, + "step": 626 + }, + { + "epoch": 2.026666666666667, + "grad_norm": 0.007594529539346695, + "learning_rate": 8.111254851228979e-05, + "loss": 0.0028, + "step": 627 + }, + { + "epoch": 2.02989898989899, + "grad_norm": 0.010098805651068687, + "learning_rate": 8.124191461836999e-05, + "loss": 0.0042, + "step": 628 + }, + { + "epoch": 2.0331313131313133, + "grad_norm": 0.006069192662835121, + "learning_rate": 8.13712807244502e-05, + "loss": 0.0019, + "step": 629 + }, + { + "epoch": 2.036363636363636, + "grad_norm": 0.008519073016941547, + "learning_rate": 8.15006468305304e-05, + "loss": 0.0025, + "step": 630 + }, + { + "epoch": 2.0395959595959594, + "grad_norm": 0.007799296174198389, + "learning_rate": 8.163001293661061e-05, + "loss": 0.0028, + "step": 631 + }, + { + "epoch": 2.0428282828282827, + "grad_norm": 0.007101105526089668, + "learning_rate": 8.175937904269082e-05, + "loss": 0.0023, + "step": 632 + }, + { + "epoch": 2.046060606060606, + "grad_norm": 0.008387326262891293, + "learning_rate": 8.188874514877103e-05, + "loss": 0.0023, + "step": 633 + }, + { + "epoch": 2.049292929292929, + "grad_norm": 0.00731281191110611, + "learning_rate": 8.201811125485124e-05, + "loss": 0.002, + "step": 634 + }, + { + "epoch": 2.0525252525252524, + "grad_norm": 0.008349014446139336, + "learning_rate": 8.214747736093143e-05, + "loss": 0.0028, + "step": 635 + }, + { + "epoch": 2.0557575757575757, + "grad_norm": 0.006019700318574905, + "learning_rate": 8.227684346701164e-05, + "loss": 0.0014, + "step": 636 + }, + { + "epoch": 2.058989898989899, + "grad_norm": 0.010838819667696953, + "learning_rate": 8.240620957309186e-05, + "loss": 0.0038, + "step": 637 + }, + { + "epoch": 2.062222222222222, + "grad_norm": 0.007151686120778322, + "learning_rate": 8.253557567917207e-05, + "loss": 0.0027, + "step": 638 + }, + { + "epoch": 2.0654545454545454, + "grad_norm": 0.006484883837401867, + "learning_rate": 8.266494178525228e-05, + "loss": 0.0018, + "step": 639 + }, + { + "epoch": 2.0686868686868687, + "grad_norm": 0.008207079023122787, + "learning_rate": 8.279430789133247e-05, + "loss": 0.0028, + "step": 640 + }, + { + "epoch": 2.071919191919192, + "grad_norm": 0.005906389560550451, + "learning_rate": 8.292367399741268e-05, + "loss": 0.0018, + "step": 641 + }, + { + "epoch": 2.075151515151515, + "grad_norm": 0.010167015716433525, + "learning_rate": 8.305304010349288e-05, + "loss": 0.0033, + "step": 642 + }, + { + "epoch": 2.0783838383838384, + "grad_norm": 0.01006457768380642, + "learning_rate": 8.318240620957309e-05, + "loss": 0.0034, + "step": 643 + }, + { + "epoch": 2.0816161616161617, + "grad_norm": 0.012126877903938293, + "learning_rate": 8.33117723156533e-05, + "loss": 0.0032, + "step": 644 + }, + { + "epoch": 2.084848484848485, + "grad_norm": 0.0077040912583470345, + "learning_rate": 8.344113842173351e-05, + "loss": 0.003, + "step": 645 + }, + { + "epoch": 2.088080808080808, + "grad_norm": 0.006703955586999655, + "learning_rate": 8.357050452781372e-05, + "loss": 0.0023, + "step": 646 + }, + { + "epoch": 2.0913131313131315, + "grad_norm": 0.009514384903013706, + "learning_rate": 8.369987063389393e-05, + "loss": 0.0028, + "step": 647 + }, + { + "epoch": 2.0945454545454547, + "grad_norm": 0.008247634395956993, + "learning_rate": 8.382923673997413e-05, + "loss": 0.0015, + "step": 648 + }, + { + "epoch": 2.097777777777778, + "grad_norm": 0.006758189760148525, + "learning_rate": 8.395860284605434e-05, + "loss": 0.0017, + "step": 649 + }, + { + "epoch": 2.101010101010101, + "grad_norm": 0.00795058161020279, + "learning_rate": 8.408796895213455e-05, + "loss": 0.0026, + "step": 650 + }, + { + "epoch": 2.101010101010101, + "eval_loss": 0.004011084325611591, + "eval_runtime": 18.5451, + "eval_samples_per_second": 5.392, + "eval_steps_per_second": 1.348, + "step": 650 + }, + { + "epoch": 2.1042424242424245, + "grad_norm": 0.006842897739261389, + "learning_rate": 8.421733505821476e-05, + "loss": 0.0019, + "step": 651 + }, + { + "epoch": 2.1074747474747473, + "grad_norm": 0.008785001002252102, + "learning_rate": 8.434670116429496e-05, + "loss": 0.0018, + "step": 652 + }, + { + "epoch": 2.1107070707070705, + "grad_norm": 0.012285329401493073, + "learning_rate": 8.447606727037517e-05, + "loss": 0.0041, + "step": 653 + }, + { + "epoch": 2.113939393939394, + "grad_norm": 0.0077829184010624886, + "learning_rate": 8.460543337645536e-05, + "loss": 0.0025, + "step": 654 + }, + { + "epoch": 2.117171717171717, + "grad_norm": 0.008003472350537777, + "learning_rate": 8.473479948253557e-05, + "loss": 0.0032, + "step": 655 + }, + { + "epoch": 2.1204040404040403, + "grad_norm": 0.004586879629641771, + "learning_rate": 8.486416558861578e-05, + "loss": 0.0012, + "step": 656 + }, + { + "epoch": 2.1236363636363635, + "grad_norm": 0.005405848380178213, + "learning_rate": 8.499353169469599e-05, + "loss": 0.0018, + "step": 657 + }, + { + "epoch": 2.126868686868687, + "grad_norm": 0.004980962257832289, + "learning_rate": 8.51228978007762e-05, + "loss": 0.0021, + "step": 658 + }, + { + "epoch": 2.13010101010101, + "grad_norm": 0.007102985866367817, + "learning_rate": 8.525226390685641e-05, + "loss": 0.0027, + "step": 659 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 0.0070281801745295525, + "learning_rate": 8.538163001293662e-05, + "loss": 0.0024, + "step": 660 + }, + { + "epoch": 2.1365656565656566, + "grad_norm": 0.01416573766618967, + "learning_rate": 8.551099611901682e-05, + "loss": 0.0028, + "step": 661 + }, + { + "epoch": 2.13979797979798, + "grad_norm": 0.00932216364890337, + "learning_rate": 8.564036222509703e-05, + "loss": 0.0048, + "step": 662 + }, + { + "epoch": 2.143030303030303, + "grad_norm": 0.008925393223762512, + "learning_rate": 8.576972833117724e-05, + "loss": 0.0023, + "step": 663 + }, + { + "epoch": 2.1462626262626263, + "grad_norm": 0.006588651333004236, + "learning_rate": 8.589909443725744e-05, + "loss": 0.0028, + "step": 664 + }, + { + "epoch": 2.1494949494949496, + "grad_norm": 0.004025767557322979, + "learning_rate": 8.602846054333765e-05, + "loss": 0.0013, + "step": 665 + }, + { + "epoch": 2.152727272727273, + "grad_norm": 0.005784463603049517, + "learning_rate": 8.615782664941786e-05, + "loss": 0.0022, + "step": 666 + }, + { + "epoch": 2.155959595959596, + "grad_norm": 0.008596932515501976, + "learning_rate": 8.628719275549805e-05, + "loss": 0.0037, + "step": 667 + }, + { + "epoch": 2.1591919191919193, + "grad_norm": 0.006371675059199333, + "learning_rate": 8.641655886157827e-05, + "loss": 0.0016, + "step": 668 + }, + { + "epoch": 2.1624242424242426, + "grad_norm": 0.00666641304269433, + "learning_rate": 8.654592496765848e-05, + "loss": 0.0028, + "step": 669 + }, + { + "epoch": 2.165656565656566, + "grad_norm": 0.006822609808295965, + "learning_rate": 8.667529107373869e-05, + "loss": 0.0029, + "step": 670 + }, + { + "epoch": 2.168888888888889, + "grad_norm": 0.012946972623467445, + "learning_rate": 8.68046571798189e-05, + "loss": 0.0053, + "step": 671 + }, + { + "epoch": 2.172121212121212, + "grad_norm": 0.005195168312638998, + "learning_rate": 8.69340232858991e-05, + "loss": 0.0013, + "step": 672 + }, + { + "epoch": 2.175353535353535, + "grad_norm": 0.004933821968734264, + "learning_rate": 8.706338939197932e-05, + "loss": 0.0014, + "step": 673 + }, + { + "epoch": 2.1785858585858584, + "grad_norm": 0.00683283805847168, + "learning_rate": 8.719275549805951e-05, + "loss": 0.0027, + "step": 674 + }, + { + "epoch": 2.1818181818181817, + "grad_norm": 0.008817669935524464, + "learning_rate": 8.732212160413972e-05, + "loss": 0.0028, + "step": 675 + }, + { + "epoch": 2.1818181818181817, + "eval_loss": 0.00383288087323308, + "eval_runtime": 18.5346, + "eval_samples_per_second": 5.395, + "eval_steps_per_second": 1.349, + "step": 675 + }, + { + "epoch": 2.185050505050505, + "grad_norm": 0.004791957791894674, + "learning_rate": 8.745148771021992e-05, + "loss": 0.0014, + "step": 676 + }, + { + "epoch": 2.188282828282828, + "grad_norm": 0.0061204745434224606, + "learning_rate": 8.758085381630013e-05, + "loss": 0.0018, + "step": 677 + }, + { + "epoch": 2.1915151515151514, + "grad_norm": 0.011844735592603683, + "learning_rate": 8.771021992238034e-05, + "loss": 0.0048, + "step": 678 + }, + { + "epoch": 2.1947474747474747, + "grad_norm": 0.006783795543015003, + "learning_rate": 8.783958602846055e-05, + "loss": 0.0019, + "step": 679 + }, + { + "epoch": 2.197979797979798, + "grad_norm": 0.011243957094848156, + "learning_rate": 8.796895213454075e-05, + "loss": 0.0036, + "step": 680 + }, + { + "epoch": 2.201212121212121, + "grad_norm": 0.007772068493068218, + "learning_rate": 8.809831824062096e-05, + "loss": 0.0022, + "step": 681 + }, + { + "epoch": 2.2044444444444444, + "grad_norm": 0.0051286122761666775, + "learning_rate": 8.822768434670117e-05, + "loss": 0.0013, + "step": 682 + }, + { + "epoch": 2.2076767676767677, + "grad_norm": 0.01016032975167036, + "learning_rate": 8.835705045278138e-05, + "loss": 0.0039, + "step": 683 + }, + { + "epoch": 2.210909090909091, + "grad_norm": 0.006838168017566204, + "learning_rate": 8.848641655886159e-05, + "loss": 0.002, + "step": 684 + }, + { + "epoch": 2.214141414141414, + "grad_norm": 0.008824942633509636, + "learning_rate": 8.86157826649418e-05, + "loss": 0.0024, + "step": 685 + }, + { + "epoch": 2.2173737373737374, + "grad_norm": 0.00688344007357955, + "learning_rate": 8.8745148771022e-05, + "loss": 0.002, + "step": 686 + }, + { + "epoch": 2.2206060606060607, + "grad_norm": 0.0066194599494338036, + "learning_rate": 8.88745148771022e-05, + "loss": 0.0019, + "step": 687 + }, + { + "epoch": 2.223838383838384, + "grad_norm": 0.007829937152564526, + "learning_rate": 8.90038809831824e-05, + "loss": 0.0022, + "step": 688 + }, + { + "epoch": 2.227070707070707, + "grad_norm": 0.009328142739832401, + "learning_rate": 8.913324708926261e-05, + "loss": 0.002, + "step": 689 + }, + { + "epoch": 2.2303030303030305, + "grad_norm": 0.009020456112921238, + "learning_rate": 8.926261319534282e-05, + "loss": 0.0031, + "step": 690 + }, + { + "epoch": 2.2335353535353537, + "grad_norm": 0.00872059352695942, + "learning_rate": 8.939197930142303e-05, + "loss": 0.0026, + "step": 691 + }, + { + "epoch": 2.236767676767677, + "grad_norm": 0.009462382644414902, + "learning_rate": 8.952134540750324e-05, + "loss": 0.0034, + "step": 692 + }, + { + "epoch": 2.24, + "grad_norm": 0.0071613360196352005, + "learning_rate": 8.965071151358344e-05, + "loss": 0.002, + "step": 693 + }, + { + "epoch": 2.2432323232323235, + "grad_norm": 0.00632578507065773, + "learning_rate": 8.978007761966365e-05, + "loss": 0.0019, + "step": 694 + }, + { + "epoch": 2.2464646464646463, + "grad_norm": 0.007571411784738302, + "learning_rate": 8.990944372574386e-05, + "loss": 0.0023, + "step": 695 + }, + { + "epoch": 2.2496969696969695, + "grad_norm": 0.009794604033231735, + "learning_rate": 9.003880983182407e-05, + "loss": 0.0032, + "step": 696 + }, + { + "epoch": 2.252929292929293, + "grad_norm": 0.006740176118910313, + "learning_rate": 9.016817593790428e-05, + "loss": 0.002, + "step": 697 + }, + { + "epoch": 2.256161616161616, + "grad_norm": 0.008458360098302364, + "learning_rate": 9.029754204398448e-05, + "loss": 0.0022, + "step": 698 + }, + { + "epoch": 2.2593939393939393, + "grad_norm": 0.010913528501987457, + "learning_rate": 9.042690815006469e-05, + "loss": 0.0027, + "step": 699 + }, + { + "epoch": 2.2626262626262625, + "grad_norm": 0.00851623248308897, + "learning_rate": 9.055627425614489e-05, + "loss": 0.0027, + "step": 700 + }, + { + "epoch": 2.2626262626262625, + "eval_loss": 0.0037586898542940617, + "eval_runtime": 18.6064, + "eval_samples_per_second": 5.375, + "eval_steps_per_second": 1.344, + "step": 700 + }, + { + "epoch": 2.265858585858586, + "grad_norm": 0.008736968040466309, + "learning_rate": 9.06856403622251e-05, + "loss": 0.0026, + "step": 701 + }, + { + "epoch": 2.269090909090909, + "grad_norm": 0.007413066923618317, + "learning_rate": 9.08150064683053e-05, + "loss": 0.002, + "step": 702 + }, + { + "epoch": 2.2723232323232323, + "grad_norm": 0.009513185359537601, + "learning_rate": 9.094437257438552e-05, + "loss": 0.0029, + "step": 703 + }, + { + "epoch": 2.2755555555555556, + "grad_norm": 0.006239297799766064, + "learning_rate": 9.107373868046573e-05, + "loss": 0.0024, + "step": 704 + }, + { + "epoch": 2.278787878787879, + "grad_norm": 0.009737716056406498, + "learning_rate": 9.120310478654594e-05, + "loss": 0.0032, + "step": 705 + }, + { + "epoch": 2.282020202020202, + "grad_norm": 0.005764431785792112, + "learning_rate": 9.133247089262613e-05, + "loss": 0.0021, + "step": 706 + }, + { + "epoch": 2.2852525252525253, + "grad_norm": 0.01445918157696724, + "learning_rate": 9.146183699870634e-05, + "loss": 0.0041, + "step": 707 + }, + { + "epoch": 2.2884848484848486, + "grad_norm": 0.007700520567595959, + "learning_rate": 9.159120310478655e-05, + "loss": 0.0024, + "step": 708 + }, + { + "epoch": 2.291717171717172, + "grad_norm": 0.007032734341919422, + "learning_rate": 9.172056921086676e-05, + "loss": 0.002, + "step": 709 + }, + { + "epoch": 2.294949494949495, + "grad_norm": 0.012682800181210041, + "learning_rate": 9.184993531694696e-05, + "loss": 0.0023, + "step": 710 + }, + { + "epoch": 2.2981818181818183, + "grad_norm": 0.01120555866509676, + "learning_rate": 9.197930142302717e-05, + "loss": 0.0065, + "step": 711 + }, + { + "epoch": 2.3014141414141416, + "grad_norm": 0.009766733273863792, + "learning_rate": 9.210866752910737e-05, + "loss": 0.0034, + "step": 712 + }, + { + "epoch": 2.304646464646465, + "grad_norm": 0.009006375446915627, + "learning_rate": 9.223803363518758e-05, + "loss": 0.0028, + "step": 713 + }, + { + "epoch": 2.3078787878787876, + "grad_norm": 0.0110854497179389, + "learning_rate": 9.236739974126779e-05, + "loss": 0.0044, + "step": 714 + }, + { + "epoch": 2.311111111111111, + "grad_norm": 0.00787447765469551, + "learning_rate": 9.2496765847348e-05, + "loss": 0.0028, + "step": 715 + }, + { + "epoch": 2.314343434343434, + "grad_norm": 0.007719083689153194, + "learning_rate": 9.262613195342821e-05, + "loss": 0.0022, + "step": 716 + }, + { + "epoch": 2.3175757575757574, + "grad_norm": 0.004776225425302982, + "learning_rate": 9.275549805950842e-05, + "loss": 0.0014, + "step": 717 + }, + { + "epoch": 2.3208080808080807, + "grad_norm": 0.009256124496459961, + "learning_rate": 9.288486416558863e-05, + "loss": 0.0034, + "step": 718 + }, + { + "epoch": 2.324040404040404, + "grad_norm": 0.00416655233129859, + "learning_rate": 9.301423027166883e-05, + "loss": 0.0014, + "step": 719 + }, + { + "epoch": 2.327272727272727, + "grad_norm": 0.010706130415201187, + "learning_rate": 9.314359637774904e-05, + "loss": 0.0064, + "step": 720 + }, + { + "epoch": 2.3305050505050504, + "grad_norm": 0.010033998638391495, + "learning_rate": 9.327296248382925e-05, + "loss": 0.0032, + "step": 721 + }, + { + "epoch": 2.3337373737373737, + "grad_norm": 0.010281972587108612, + "learning_rate": 9.340232858990944e-05, + "loss": 0.0046, + "step": 722 + }, + { + "epoch": 2.336969696969697, + "grad_norm": 0.007038958836346865, + "learning_rate": 9.353169469598965e-05, + "loss": 0.0025, + "step": 723 + }, + { + "epoch": 2.34020202020202, + "grad_norm": 0.006714510731399059, + "learning_rate": 9.366106080206986e-05, + "loss": 0.0023, + "step": 724 + }, + { + "epoch": 2.3434343434343434, + "grad_norm": 0.008812773041427135, + "learning_rate": 9.379042690815006e-05, + "loss": 0.0032, + "step": 725 + }, + { + "epoch": 2.3434343434343434, + "eval_loss": 0.0038471692241728306, + "eval_runtime": 18.6172, + "eval_samples_per_second": 5.371, + "eval_steps_per_second": 1.343, + "step": 725 + }, + { + "epoch": 2.3466666666666667, + "grad_norm": 0.00827405508607626, + "learning_rate": 9.391979301423027e-05, + "loss": 0.0024, + "step": 726 + }, + { + "epoch": 2.34989898989899, + "grad_norm": 0.009304560720920563, + "learning_rate": 9.404915912031048e-05, + "loss": 0.0032, + "step": 727 + }, + { + "epoch": 2.353131313131313, + "grad_norm": 0.004189759958535433, + "learning_rate": 9.417852522639069e-05, + "loss": 0.001, + "step": 728 + }, + { + "epoch": 2.3563636363636364, + "grad_norm": 0.010822804644703865, + "learning_rate": 9.43078913324709e-05, + "loss": 0.0037, + "step": 729 + }, + { + "epoch": 2.3595959595959597, + "grad_norm": 0.010418107733130455, + "learning_rate": 9.443725743855111e-05, + "loss": 0.0029, + "step": 730 + }, + { + "epoch": 2.362828282828283, + "grad_norm": 0.013821220025420189, + "learning_rate": 9.456662354463132e-05, + "loss": 0.0037, + "step": 731 + }, + { + "epoch": 2.366060606060606, + "grad_norm": 0.006959732621908188, + "learning_rate": 9.469598965071152e-05, + "loss": 0.0021, + "step": 732 + }, + { + "epoch": 2.3692929292929295, + "grad_norm": 0.010071862488985062, + "learning_rate": 9.482535575679173e-05, + "loss": 0.0019, + "step": 733 + }, + { + "epoch": 2.3725252525252527, + "grad_norm": 0.010580536909401417, + "learning_rate": 9.495472186287193e-05, + "loss": 0.0041, + "step": 734 + }, + { + "epoch": 2.375757575757576, + "grad_norm": 0.009700599126517773, + "learning_rate": 9.508408796895214e-05, + "loss": 0.0036, + "step": 735 + }, + { + "epoch": 2.378989898989899, + "grad_norm": 0.008695641532540321, + "learning_rate": 9.521345407503235e-05, + "loss": 0.0034, + "step": 736 + }, + { + "epoch": 2.3822222222222225, + "grad_norm": 0.007013495545834303, + "learning_rate": 9.534282018111256e-05, + "loss": 0.0024, + "step": 737 + }, + { + "epoch": 2.3854545454545453, + "grad_norm": 0.007236323785036802, + "learning_rate": 9.547218628719275e-05, + "loss": 0.002, + "step": 738 + }, + { + "epoch": 2.3886868686868685, + "grad_norm": 0.005592238157987595, + "learning_rate": 9.560155239327296e-05, + "loss": 0.0016, + "step": 739 + }, + { + "epoch": 2.391919191919192, + "grad_norm": 0.00680518476292491, + "learning_rate": 9.573091849935317e-05, + "loss": 0.002, + "step": 740 + }, + { + "epoch": 2.395151515151515, + "grad_norm": 0.008009014651179314, + "learning_rate": 9.586028460543338e-05, + "loss": 0.0018, + "step": 741 + }, + { + "epoch": 2.3983838383838383, + "grad_norm": 0.006378913763910532, + "learning_rate": 9.59896507115136e-05, + "loss": 0.0019, + "step": 742 + }, + { + "epoch": 2.4016161616161615, + "grad_norm": 0.006395281758159399, + "learning_rate": 9.61190168175938e-05, + "loss": 0.002, + "step": 743 + }, + { + "epoch": 2.404848484848485, + "grad_norm": 0.008712049573659897, + "learning_rate": 9.6248382923674e-05, + "loss": 0.0026, + "step": 744 + }, + { + "epoch": 2.408080808080808, + "grad_norm": 0.009031414985656738, + "learning_rate": 9.63777490297542e-05, + "loss": 0.0029, + "step": 745 + }, + { + "epoch": 2.4113131313131313, + "grad_norm": 0.010614071041345596, + "learning_rate": 9.650711513583441e-05, + "loss": 0.0039, + "step": 746 + }, + { + "epoch": 2.4145454545454546, + "grad_norm": 0.006484184879809618, + "learning_rate": 9.663648124191462e-05, + "loss": 0.0015, + "step": 747 + }, + { + "epoch": 2.417777777777778, + "grad_norm": 0.005564129911363125, + "learning_rate": 9.676584734799483e-05, + "loss": 0.0018, + "step": 748 + }, + { + "epoch": 2.421010101010101, + "grad_norm": 0.00510470662266016, + "learning_rate": 9.689521345407504e-05, + "loss": 0.0013, + "step": 749 + }, + { + "epoch": 2.4242424242424243, + "grad_norm": 0.007081560790538788, + "learning_rate": 9.702457956015525e-05, + "loss": 0.0022, + "step": 750 + }, + { + "epoch": 2.4242424242424243, + "eval_loss": 0.0037562157958745956, + "eval_runtime": 18.5443, + "eval_samples_per_second": 5.393, + "eval_steps_per_second": 1.348, + "step": 750 + }, + { + "epoch": 2.4274747474747476, + "grad_norm": 0.009013883769512177, + "learning_rate": 9.715394566623545e-05, + "loss": 0.0031, + "step": 751 + }, + { + "epoch": 2.430707070707071, + "grad_norm": 0.009731669910252094, + "learning_rate": 9.728331177231566e-05, + "loss": 0.0041, + "step": 752 + }, + { + "epoch": 2.433939393939394, + "grad_norm": 0.010123556479811668, + "learning_rate": 9.741267787839587e-05, + "loss": 0.0042, + "step": 753 + }, + { + "epoch": 2.4371717171717173, + "grad_norm": 0.005915546324104071, + "learning_rate": 9.754204398447608e-05, + "loss": 0.0024, + "step": 754 + }, + { + "epoch": 2.4404040404040406, + "grad_norm": 0.008191730827093124, + "learning_rate": 9.767141009055629e-05, + "loss": 0.0032, + "step": 755 + }, + { + "epoch": 2.443636363636364, + "grad_norm": 0.005690063815563917, + "learning_rate": 9.780077619663648e-05, + "loss": 0.0021, + "step": 756 + }, + { + "epoch": 2.4468686868686866, + "grad_norm": 0.008176354691386223, + "learning_rate": 9.793014230271668e-05, + "loss": 0.0027, + "step": 757 + }, + { + "epoch": 2.45010101010101, + "grad_norm": 0.007075440138578415, + "learning_rate": 9.805950840879689e-05, + "loss": 0.0023, + "step": 758 + }, + { + "epoch": 2.453333333333333, + "grad_norm": 0.006414720788598061, + "learning_rate": 9.81888745148771e-05, + "loss": 0.0018, + "step": 759 + }, + { + "epoch": 2.4565656565656564, + "grad_norm": 0.011210867203772068, + "learning_rate": 9.831824062095731e-05, + "loss": 0.0039, + "step": 760 + }, + { + "epoch": 2.4597979797979797, + "grad_norm": 0.006865155417472124, + "learning_rate": 9.844760672703752e-05, + "loss": 0.002, + "step": 761 + }, + { + "epoch": 2.463030303030303, + "grad_norm": 0.010836309753358364, + "learning_rate": 9.857697283311773e-05, + "loss": 0.0036, + "step": 762 + }, + { + "epoch": 2.466262626262626, + "grad_norm": 0.007952204905450344, + "learning_rate": 9.870633893919794e-05, + "loss": 0.003, + "step": 763 + }, + { + "epoch": 2.4694949494949494, + "grad_norm": 0.007584880106151104, + "learning_rate": 9.883570504527814e-05, + "loss": 0.0023, + "step": 764 + }, + { + "epoch": 2.4727272727272727, + "grad_norm": 0.00835363008081913, + "learning_rate": 9.896507115135835e-05, + "loss": 0.0016, + "step": 765 + }, + { + "epoch": 2.475959595959596, + "grad_norm": 0.007696302607655525, + "learning_rate": 9.909443725743856e-05, + "loss": 0.0023, + "step": 766 + }, + { + "epoch": 2.479191919191919, + "grad_norm": 0.008753744885325432, + "learning_rate": 9.922380336351877e-05, + "loss": 0.0027, + "step": 767 + }, + { + "epoch": 2.4824242424242424, + "grad_norm": 0.007579338271170855, + "learning_rate": 9.935316946959897e-05, + "loss": 0.0028, + "step": 768 + }, + { + "epoch": 2.4856565656565657, + "grad_norm": 0.011279474943876266, + "learning_rate": 9.948253557567918e-05, + "loss": 0.0042, + "step": 769 + }, + { + "epoch": 2.488888888888889, + "grad_norm": 0.008233487606048584, + "learning_rate": 9.961190168175937e-05, + "loss": 0.0027, + "step": 770 + }, + { + "epoch": 2.492121212121212, + "grad_norm": 0.006320327054709196, + "learning_rate": 9.974126778783958e-05, + "loss": 0.0014, + "step": 771 + }, + { + "epoch": 2.4953535353535354, + "grad_norm": 0.007094175089150667, + "learning_rate": 9.98706338939198e-05, + "loss": 0.0019, + "step": 772 + }, + { + "epoch": 2.4985858585858587, + "grad_norm": 0.007033037953078747, + "learning_rate": 0.0001, + "loss": 0.0021, + "step": 773 + }, + { + "epoch": 2.501818181818182, + "grad_norm": 0.008634313941001892, + "learning_rate": 9.999999489471233e-05, + "loss": 0.0027, + "step": 774 + }, + { + "epoch": 2.505050505050505, + "grad_norm": 0.00898764282464981, + "learning_rate": 9.99999795788503e-05, + "loss": 0.0024, + "step": 775 + }, + { + "epoch": 2.505050505050505, + "eval_loss": 0.0035804486833512783, + "eval_runtime": 18.5794, + "eval_samples_per_second": 5.382, + "eval_steps_per_second": 1.346, + "step": 775 + }, + { + "epoch": 2.5082828282828284, + "grad_norm": 0.007207226939499378, + "learning_rate": 9.99999540524171e-05, + "loss": 0.0022, + "step": 776 + }, + { + "epoch": 2.5115151515151517, + "grad_norm": 0.012771569192409515, + "learning_rate": 9.999991831541789e-05, + "loss": 0.0045, + "step": 777 + }, + { + "epoch": 2.514747474747475, + "grad_norm": 0.0067327446304261684, + "learning_rate": 9.999987236786e-05, + "loss": 0.002, + "step": 778 + }, + { + "epoch": 2.517979797979798, + "grad_norm": 0.009945794008672237, + "learning_rate": 9.999981620975281e-05, + "loss": 0.0034, + "step": 779 + }, + { + "epoch": 2.5212121212121215, + "grad_norm": 0.006773203145712614, + "learning_rate": 9.999974984110779e-05, + "loss": 0.0023, + "step": 780 + }, + { + "epoch": 2.5244444444444447, + "grad_norm": 0.00785363744944334, + "learning_rate": 9.999967326193847e-05, + "loss": 0.0019, + "step": 781 + }, + { + "epoch": 2.5276767676767675, + "grad_norm": 0.010476037859916687, + "learning_rate": 9.999958647226049e-05, + "loss": 0.0033, + "step": 782 + }, + { + "epoch": 2.5309090909090908, + "grad_norm": 0.007686255965381861, + "learning_rate": 9.999948947209162e-05, + "loss": 0.0026, + "step": 783 + }, + { + "epoch": 2.534141414141414, + "grad_norm": 0.007367600221186876, + "learning_rate": 9.999938226145161e-05, + "loss": 0.0023, + "step": 784 + }, + { + "epoch": 2.5373737373737373, + "grad_norm": 0.00861043855547905, + "learning_rate": 9.999926484036237e-05, + "loss": 0.0026, + "step": 785 + }, + { + "epoch": 2.5406060606060605, + "grad_norm": 0.009773798286914825, + "learning_rate": 9.999913720884791e-05, + "loss": 0.0025, + "step": 786 + }, + { + "epoch": 2.543838383838384, + "grad_norm": 0.006338477600365877, + "learning_rate": 9.999899936693426e-05, + "loss": 0.0018, + "step": 787 + }, + { + "epoch": 2.547070707070707, + "grad_norm": 0.012471425347030163, + "learning_rate": 9.99988513146496e-05, + "loss": 0.0053, + "step": 788 + }, + { + "epoch": 2.5503030303030303, + "grad_norm": 0.009621040895581245, + "learning_rate": 9.999869305202412e-05, + "loss": 0.0031, + "step": 789 + }, + { + "epoch": 2.5535353535353535, + "grad_norm": 0.006214372348040342, + "learning_rate": 9.999852457909018e-05, + "loss": 0.0016, + "step": 790 + }, + { + "epoch": 2.556767676767677, + "grad_norm": 0.011059516109526157, + "learning_rate": 9.999834589588217e-05, + "loss": 0.0024, + "step": 791 + }, + { + "epoch": 2.56, + "grad_norm": 0.012596976943314075, + "learning_rate": 9.999815700243656e-05, + "loss": 0.0061, + "step": 792 + }, + { + "epoch": 2.5632323232323233, + "grad_norm": 0.012062342837452888, + "learning_rate": 9.999795789879196e-05, + "loss": 0.0031, + "step": 793 + }, + { + "epoch": 2.5664646464646466, + "grad_norm": 0.012593230232596397, + "learning_rate": 9.9997748584989e-05, + "loss": 0.0016, + "step": 794 + }, + { + "epoch": 2.56969696969697, + "grad_norm": 0.0052862451411783695, + "learning_rate": 9.999752906107042e-05, + "loss": 0.0017, + "step": 795 + }, + { + "epoch": 2.572929292929293, + "grad_norm": 0.009504672139883041, + "learning_rate": 9.999729932708109e-05, + "loss": 0.0031, + "step": 796 + }, + { + "epoch": 2.5761616161616163, + "grad_norm": 0.006016214843839407, + "learning_rate": 9.999705938306789e-05, + "loss": 0.0022, + "step": 797 + }, + { + "epoch": 2.579393939393939, + "grad_norm": 0.009827526286244392, + "learning_rate": 9.999680922907982e-05, + "loss": 0.0028, + "step": 798 + }, + { + "epoch": 2.5826262626262624, + "grad_norm": 0.00939889531582594, + "learning_rate": 9.999654886516798e-05, + "loss": 0.0025, + "step": 799 + }, + { + "epoch": 2.5858585858585856, + "grad_norm": 0.004873231053352356, + "learning_rate": 9.999627829138554e-05, + "loss": 0.0015, + "step": 800 + }, + { + "epoch": 2.5858585858585856, + "eval_loss": 0.003381554502993822, + "eval_runtime": 18.5553, + "eval_samples_per_second": 5.389, + "eval_steps_per_second": 1.347, + "step": 800 + }, + { + "epoch": 2.589090909090909, + "grad_norm": 0.011554384604096413, + "learning_rate": 9.999599750778772e-05, + "loss": 0.0024, + "step": 801 + }, + { + "epoch": 2.592323232323232, + "grad_norm": 0.00788146536797285, + "learning_rate": 9.999570651443191e-05, + "loss": 0.0028, + "step": 802 + }, + { + "epoch": 2.5955555555555554, + "grad_norm": 0.007881752215325832, + "learning_rate": 9.99954053113775e-05, + "loss": 0.003, + "step": 803 + }, + { + "epoch": 2.5987878787878786, + "grad_norm": 0.00717515591531992, + "learning_rate": 9.9995093898686e-05, + "loss": 0.0025, + "step": 804 + }, + { + "epoch": 2.602020202020202, + "grad_norm": 0.008232329972088337, + "learning_rate": 9.999477227642103e-05, + "loss": 0.0029, + "step": 805 + }, + { + "epoch": 2.605252525252525, + "grad_norm": 0.017594264820218086, + "learning_rate": 9.999444044464823e-05, + "loss": 0.0038, + "step": 806 + }, + { + "epoch": 2.6084848484848484, + "grad_norm": 0.006009047385305166, + "learning_rate": 9.999409840343539e-05, + "loss": 0.0023, + "step": 807 + }, + { + "epoch": 2.6117171717171717, + "grad_norm": 0.005063756834715605, + "learning_rate": 9.999374615285236e-05, + "loss": 0.0018, + "step": 808 + }, + { + "epoch": 2.614949494949495, + "grad_norm": 0.007926912046968937, + "learning_rate": 9.999338369297106e-05, + "loss": 0.003, + "step": 809 + }, + { + "epoch": 2.618181818181818, + "grad_norm": 0.00994369387626648, + "learning_rate": 9.999301102386553e-05, + "loss": 0.0031, + "step": 810 + }, + { + "epoch": 2.6214141414141414, + "grad_norm": 0.004684642422944307, + "learning_rate": 9.999262814561185e-05, + "loss": 0.0017, + "step": 811 + }, + { + "epoch": 2.6246464646464647, + "grad_norm": 0.01245308667421341, + "learning_rate": 9.999223505828821e-05, + "loss": 0.0037, + "step": 812 + }, + { + "epoch": 2.627878787878788, + "grad_norm": 0.00802707765251398, + "learning_rate": 9.999183176197491e-05, + "loss": 0.0027, + "step": 813 + }, + { + "epoch": 2.631111111111111, + "grad_norm": 0.008055022917687893, + "learning_rate": 9.999141825675426e-05, + "loss": 0.0036, + "step": 814 + }, + { + "epoch": 2.6343434343434344, + "grad_norm": 0.005763540044426918, + "learning_rate": 9.999099454271074e-05, + "loss": 0.0016, + "step": 815 + }, + { + "epoch": 2.6375757575757577, + "grad_norm": 0.008480647578835487, + "learning_rate": 9.999056061993089e-05, + "loss": 0.0027, + "step": 816 + }, + { + "epoch": 2.640808080808081, + "grad_norm": 0.010444406419992447, + "learning_rate": 9.999011648850329e-05, + "loss": 0.0025, + "step": 817 + }, + { + "epoch": 2.644040404040404, + "grad_norm": 0.010159408673644066, + "learning_rate": 9.998966214851864e-05, + "loss": 0.0042, + "step": 818 + }, + { + "epoch": 2.6472727272727274, + "grad_norm": 0.009814529679715633, + "learning_rate": 9.998919760006972e-05, + "loss": 0.0036, + "step": 819 + }, + { + "epoch": 2.6505050505050507, + "grad_norm": 0.009227470494806767, + "learning_rate": 9.998872284325142e-05, + "loss": 0.0027, + "step": 820 + }, + { + "epoch": 2.653737373737374, + "grad_norm": 0.008259695954620838, + "learning_rate": 9.998823787816066e-05, + "loss": 0.003, + "step": 821 + }, + { + "epoch": 2.656969696969697, + "grad_norm": 0.010044812224805355, + "learning_rate": 9.99877427048965e-05, + "loss": 0.0023, + "step": 822 + }, + { + "epoch": 2.6602020202020205, + "grad_norm": 0.007520666811615229, + "learning_rate": 9.998723732356006e-05, + "loss": 0.0026, + "step": 823 + }, + { + "epoch": 2.6634343434343437, + "grad_norm": 0.012261404655873775, + "learning_rate": 9.998672173425452e-05, + "loss": 0.0053, + "step": 824 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.006036568898707628, + "learning_rate": 9.998619593708518e-05, + "loss": 0.0022, + "step": 825 + }, + { + "epoch": 2.6666666666666665, + "eval_loss": 0.0035520135425031185, + "eval_runtime": 18.6008, + "eval_samples_per_second": 5.376, + "eval_steps_per_second": 1.344, + "step": 825 + }, + { + "epoch": 2.6698989898989898, + "grad_norm": 0.006456975359469652, + "learning_rate": 9.998565993215943e-05, + "loss": 0.0024, + "step": 826 + }, + { + "epoch": 2.673131313131313, + "grad_norm": 0.00519346771761775, + "learning_rate": 9.998511371958672e-05, + "loss": 0.0016, + "step": 827 + }, + { + "epoch": 2.6763636363636363, + "grad_norm": 0.007795286830514669, + "learning_rate": 9.998455729947858e-05, + "loss": 0.0036, + "step": 828 + }, + { + "epoch": 2.6795959595959595, + "grad_norm": 0.009498896077275276, + "learning_rate": 9.998399067194864e-05, + "loss": 0.0035, + "step": 829 + }, + { + "epoch": 2.682828282828283, + "grad_norm": 0.005344277247786522, + "learning_rate": 9.998341383711263e-05, + "loss": 0.0019, + "step": 830 + }, + { + "epoch": 2.686060606060606, + "grad_norm": 0.006172938738018274, + "learning_rate": 9.998282679508835e-05, + "loss": 0.0021, + "step": 831 + }, + { + "epoch": 2.6892929292929293, + "grad_norm": 0.006847708486020565, + "learning_rate": 9.998222954599563e-05, + "loss": 0.0018, + "step": 832 + }, + { + "epoch": 2.6925252525252525, + "grad_norm": 0.007936537265777588, + "learning_rate": 9.99816220899565e-05, + "loss": 0.0025, + "step": 833 + }, + { + "epoch": 2.695757575757576, + "grad_norm": 0.0077203791588544846, + "learning_rate": 9.998100442709497e-05, + "loss": 0.003, + "step": 834 + }, + { + "epoch": 2.698989898989899, + "grad_norm": 0.007416009437292814, + "learning_rate": 9.998037655753717e-05, + "loss": 0.0027, + "step": 835 + }, + { + "epoch": 2.7022222222222223, + "grad_norm": 0.006032218690961599, + "learning_rate": 9.997973848141137e-05, + "loss": 0.0019, + "step": 836 + }, + { + "epoch": 2.7054545454545456, + "grad_norm": 0.007939686998724937, + "learning_rate": 9.997909019884781e-05, + "loss": 0.0022, + "step": 837 + }, + { + "epoch": 2.708686868686869, + "grad_norm": 0.007059860974550247, + "learning_rate": 9.99784317099789e-05, + "loss": 0.003, + "step": 838 + }, + { + "epoch": 2.711919191919192, + "grad_norm": 0.01049652136862278, + "learning_rate": 9.997776301493914e-05, + "loss": 0.0034, + "step": 839 + }, + { + "epoch": 2.7151515151515153, + "grad_norm": 0.010508295148611069, + "learning_rate": 9.997708411386501e-05, + "loss": 0.0031, + "step": 840 + }, + { + "epoch": 2.718383838383838, + "grad_norm": 0.010989880189299583, + "learning_rate": 9.997639500689523e-05, + "loss": 0.004, + "step": 841 + }, + { + "epoch": 2.7216161616161614, + "grad_norm": 0.0070852129720151424, + "learning_rate": 9.997569569417049e-05, + "loss": 0.0022, + "step": 842 + }, + { + "epoch": 2.7248484848484846, + "grad_norm": 0.007061573676764965, + "learning_rate": 9.997498617583358e-05, + "loss": 0.0018, + "step": 843 + }, + { + "epoch": 2.728080808080808, + "grad_norm": 0.004920569248497486, + "learning_rate": 9.997426645202943e-05, + "loss": 0.0015, + "step": 844 + }, + { + "epoch": 2.731313131313131, + "grad_norm": 0.009148934856057167, + "learning_rate": 9.9973536522905e-05, + "loss": 0.0028, + "step": 845 + }, + { + "epoch": 2.7345454545454544, + "grad_norm": 0.006100498139858246, + "learning_rate": 9.997279638860933e-05, + "loss": 0.002, + "step": 846 + }, + { + "epoch": 2.7377777777777776, + "grad_norm": 0.010787500068545341, + "learning_rate": 9.99720460492936e-05, + "loss": 0.0022, + "step": 847 + }, + { + "epoch": 2.741010101010101, + "grad_norm": 0.007626068778336048, + "learning_rate": 9.997128550511099e-05, + "loss": 0.0019, + "step": 848 + }, + { + "epoch": 2.744242424242424, + "grad_norm": 0.007720778696238995, + "learning_rate": 9.997051475621687e-05, + "loss": 0.002, + "step": 849 + }, + { + "epoch": 2.7474747474747474, + "grad_norm": 0.010044184513390064, + "learning_rate": 9.996973380276857e-05, + "loss": 0.0045, + "step": 850 + }, + { + "epoch": 2.7474747474747474, + "eval_loss": 0.0034316859673708677, + "eval_runtime": 18.5217, + "eval_samples_per_second": 5.399, + "eval_steps_per_second": 1.35, + "step": 850 + }, + { + "epoch": 2.7507070707070707, + "grad_norm": 0.008457653224468231, + "learning_rate": 9.996894264492563e-05, + "loss": 0.0027, + "step": 851 + }, + { + "epoch": 2.753939393939394, + "grad_norm": 0.0105837257578969, + "learning_rate": 9.99681412828496e-05, + "loss": 0.0038, + "step": 852 + }, + { + "epoch": 2.757171717171717, + "grad_norm": 0.012494977563619614, + "learning_rate": 9.996732971670408e-05, + "loss": 0.0053, + "step": 853 + }, + { + "epoch": 2.7604040404040404, + "grad_norm": 0.006976769305765629, + "learning_rate": 9.996650794665487e-05, + "loss": 0.0018, + "step": 854 + }, + { + "epoch": 2.7636363636363637, + "grad_norm": 0.00825168564915657, + "learning_rate": 9.996567597286974e-05, + "loss": 0.0026, + "step": 855 + }, + { + "epoch": 2.766868686868687, + "grad_norm": 0.007835673168301582, + "learning_rate": 9.996483379551861e-05, + "loss": 0.0022, + "step": 856 + }, + { + "epoch": 2.77010101010101, + "grad_norm": 0.007407719735056162, + "learning_rate": 9.996398141477344e-05, + "loss": 0.0023, + "step": 857 + }, + { + "epoch": 2.7733333333333334, + "grad_norm": 0.008899042382836342, + "learning_rate": 9.996311883080832e-05, + "loss": 0.0032, + "step": 858 + }, + { + "epoch": 2.7765656565656567, + "grad_norm": 0.008769405074417591, + "learning_rate": 9.996224604379938e-05, + "loss": 0.0042, + "step": 859 + }, + { + "epoch": 2.77979797979798, + "grad_norm": 0.007881446741521358, + "learning_rate": 9.996136305392487e-05, + "loss": 0.0027, + "step": 860 + }, + { + "epoch": 2.783030303030303, + "grad_norm": 0.005930191837251186, + "learning_rate": 9.996046986136509e-05, + "loss": 0.0023, + "step": 861 + }, + { + "epoch": 2.7862626262626264, + "grad_norm": 0.007934244349598885, + "learning_rate": 9.995956646630246e-05, + "loss": 0.0027, + "step": 862 + }, + { + "epoch": 2.7894949494949497, + "grad_norm": 0.0088659031316638, + "learning_rate": 9.995865286892145e-05, + "loss": 0.0035, + "step": 863 + }, + { + "epoch": 2.792727272727273, + "grad_norm": 0.01819589175283909, + "learning_rate": 9.995772906940864e-05, + "loss": 0.0076, + "step": 864 + }, + { + "epoch": 2.795959595959596, + "grad_norm": 0.005735949147492647, + "learning_rate": 9.995679506795264e-05, + "loss": 0.0021, + "step": 865 + }, + { + "epoch": 2.7991919191919195, + "grad_norm": 0.008254305459558964, + "learning_rate": 9.995585086474424e-05, + "loss": 0.0029, + "step": 866 + }, + { + "epoch": 2.8024242424242423, + "grad_norm": 0.007419264875352383, + "learning_rate": 9.995489645997622e-05, + "loss": 0.0016, + "step": 867 + }, + { + "epoch": 2.8056565656565655, + "grad_norm": 0.013274376280605793, + "learning_rate": 9.99539318538435e-05, + "loss": 0.0042, + "step": 868 + }, + { + "epoch": 2.8088888888888888, + "grad_norm": 0.008460251614451408, + "learning_rate": 9.995295704654304e-05, + "loss": 0.0028, + "step": 869 + }, + { + "epoch": 2.812121212121212, + "grad_norm": 0.007807243149727583, + "learning_rate": 9.995197203827393e-05, + "loss": 0.0032, + "step": 870 + }, + { + "epoch": 2.8153535353535353, + "grad_norm": 0.008562683127820492, + "learning_rate": 9.995097682923733e-05, + "loss": 0.0019, + "step": 871 + }, + { + "epoch": 2.8185858585858585, + "grad_norm": 0.0074541945941746235, + "learning_rate": 9.994997141963644e-05, + "loss": 0.002, + "step": 872 + }, + { + "epoch": 2.821818181818182, + "grad_norm": 0.006698752287775278, + "learning_rate": 9.994895580967658e-05, + "loss": 0.0025, + "step": 873 + }, + { + "epoch": 2.825050505050505, + "grad_norm": 0.015189897269010544, + "learning_rate": 9.994792999956518e-05, + "loss": 0.0063, + "step": 874 + }, + { + "epoch": 2.8282828282828283, + "grad_norm": 0.009802022948861122, + "learning_rate": 9.994689398951169e-05, + "loss": 0.004, + "step": 875 + }, + { + "epoch": 2.8282828282828283, + "eval_loss": 0.003472257638350129, + "eval_runtime": 18.5556, + "eval_samples_per_second": 5.389, + "eval_steps_per_second": 1.347, + "step": 875 + }, + { + "epoch": 2.8315151515151515, + "grad_norm": 0.008691331371665001, + "learning_rate": 9.994584777972769e-05, + "loss": 0.0038, + "step": 876 + }, + { + "epoch": 2.834747474747475, + "grad_norm": 0.004085318651050329, + "learning_rate": 9.994479137042683e-05, + "loss": 0.0017, + "step": 877 + }, + { + "epoch": 2.837979797979798, + "grad_norm": 0.01081239152699709, + "learning_rate": 9.994372476182484e-05, + "loss": 0.0051, + "step": 878 + }, + { + "epoch": 2.8412121212121213, + "grad_norm": 0.00917616207152605, + "learning_rate": 9.994264795413953e-05, + "loss": 0.0027, + "step": 879 + }, + { + "epoch": 2.8444444444444446, + "grad_norm": 0.0047592418268322945, + "learning_rate": 9.99415609475908e-05, + "loss": 0.0014, + "step": 880 + }, + { + "epoch": 2.847676767676768, + "grad_norm": 0.009527614340186119, + "learning_rate": 9.994046374240062e-05, + "loss": 0.0032, + "step": 881 + }, + { + "epoch": 2.850909090909091, + "grad_norm": 0.007115239277482033, + "learning_rate": 9.993935633879306e-05, + "loss": 0.0027, + "step": 882 + }, + { + "epoch": 2.854141414141414, + "grad_norm": 0.008802005089819431, + "learning_rate": 9.993823873699426e-05, + "loss": 0.0026, + "step": 883 + }, + { + "epoch": 2.857373737373737, + "grad_norm": 0.008757531642913818, + "learning_rate": 9.993711093723245e-05, + "loss": 0.0036, + "step": 884 + }, + { + "epoch": 2.8606060606060604, + "grad_norm": 0.0062814452685415745, + "learning_rate": 9.993597293973796e-05, + "loss": 0.0026, + "step": 885 + }, + { + "epoch": 2.8638383838383836, + "grad_norm": 0.007916122674942017, + "learning_rate": 9.993482474474314e-05, + "loss": 0.0026, + "step": 886 + }, + { + "epoch": 2.867070707070707, + "grad_norm": 0.012285426259040833, + "learning_rate": 9.99336663524825e-05, + "loss": 0.0048, + "step": 887 + }, + { + "epoch": 2.87030303030303, + "grad_norm": 0.007117332424968481, + "learning_rate": 9.993249776319258e-05, + "loss": 0.0014, + "step": 888 + }, + { + "epoch": 2.8735353535353534, + "grad_norm": 0.007871873676776886, + "learning_rate": 9.993131897711202e-05, + "loss": 0.0029, + "step": 889 + }, + { + "epoch": 2.8767676767676766, + "grad_norm": 0.009144186042249203, + "learning_rate": 9.993012999448154e-05, + "loss": 0.003, + "step": 890 + }, + { + "epoch": 2.88, + "grad_norm": 0.006271008867770433, + "learning_rate": 9.992893081554397e-05, + "loss": 0.0026, + "step": 891 + }, + { + "epoch": 2.883232323232323, + "grad_norm": 0.004877634346485138, + "learning_rate": 9.992772144054415e-05, + "loss": 0.0017, + "step": 892 + }, + { + "epoch": 2.8864646464646464, + "grad_norm": 0.01159873977303505, + "learning_rate": 9.992650186972909e-05, + "loss": 0.0025, + "step": 893 + }, + { + "epoch": 2.8896969696969697, + "grad_norm": 0.006861403118818998, + "learning_rate": 9.99252721033478e-05, + "loss": 0.0022, + "step": 894 + }, + { + "epoch": 2.892929292929293, + "grad_norm": 0.009415537118911743, + "learning_rate": 9.992403214165147e-05, + "loss": 0.0036, + "step": 895 + }, + { + "epoch": 2.896161616161616, + "grad_norm": 0.007355588488280773, + "learning_rate": 9.992278198489327e-05, + "loss": 0.0035, + "step": 896 + }, + { + "epoch": 2.8993939393939394, + "grad_norm": 0.003762519918382168, + "learning_rate": 9.99215216333285e-05, + "loss": 0.0013, + "step": 897 + }, + { + "epoch": 2.9026262626262627, + "grad_norm": 0.007687287870794535, + "learning_rate": 9.992025108721454e-05, + "loss": 0.0023, + "step": 898 + }, + { + "epoch": 2.905858585858586, + "grad_norm": 0.006643611006438732, + "learning_rate": 9.991897034681087e-05, + "loss": 0.0024, + "step": 899 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 0.006356996949762106, + "learning_rate": 9.9917679412379e-05, + "loss": 0.0026, + "step": 900 + }, + { + "epoch": 2.909090909090909, + "eval_loss": 0.0033850145991891623, + "eval_runtime": 18.5793, + "eval_samples_per_second": 5.382, + "eval_steps_per_second": 1.346, + "step": 900 + }, + { + "epoch": 2.9123232323232324, + "grad_norm": 0.004237829241901636, + "learning_rate": 9.99163782841826e-05, + "loss": 0.0018, + "step": 901 + }, + { + "epoch": 2.9155555555555557, + "grad_norm": 0.006980978883802891, + "learning_rate": 9.991506696248731e-05, + "loss": 0.0022, + "step": 902 + }, + { + "epoch": 2.918787878787879, + "grad_norm": 0.007011194713413715, + "learning_rate": 9.991374544756098e-05, + "loss": 0.0034, + "step": 903 + }, + { + "epoch": 2.922020202020202, + "grad_norm": 0.006265897769480944, + "learning_rate": 9.991241373967344e-05, + "loss": 0.0023, + "step": 904 + }, + { + "epoch": 2.9252525252525254, + "grad_norm": 0.006889102049171925, + "learning_rate": 9.991107183909664e-05, + "loss": 0.0021, + "step": 905 + }, + { + "epoch": 2.9284848484848487, + "grad_norm": 0.00801191944628954, + "learning_rate": 9.990971974610466e-05, + "loss": 0.0028, + "step": 906 + }, + { + "epoch": 2.931717171717172, + "grad_norm": 0.006524492986500263, + "learning_rate": 9.990835746097356e-05, + "loss": 0.0018, + "step": 907 + }, + { + "epoch": 2.934949494949495, + "grad_norm": 0.007707702461630106, + "learning_rate": 9.990698498398155e-05, + "loss": 0.003, + "step": 908 + }, + { + "epoch": 2.9381818181818184, + "grad_norm": 0.006807916797697544, + "learning_rate": 9.990560231540889e-05, + "loss": 0.0019, + "step": 909 + }, + { + "epoch": 2.9414141414141413, + "grad_norm": 0.01063486561179161, + "learning_rate": 9.990420945553797e-05, + "loss": 0.0041, + "step": 910 + }, + { + "epoch": 2.9446464646464645, + "grad_norm": 0.007649898063391447, + "learning_rate": 9.990280640465321e-05, + "loss": 0.0021, + "step": 911 + }, + { + "epoch": 2.9478787878787878, + "grad_norm": 0.0075135622173547745, + "learning_rate": 9.990139316304112e-05, + "loss": 0.0018, + "step": 912 + }, + { + "epoch": 2.951111111111111, + "grad_norm": 0.008089892566204071, + "learning_rate": 9.989996973099032e-05, + "loss": 0.0018, + "step": 913 + }, + { + "epoch": 2.9543434343434343, + "grad_norm": 0.006691002286970615, + "learning_rate": 9.989853610879147e-05, + "loss": 0.0018, + "step": 914 + }, + { + "epoch": 2.9575757575757575, + "grad_norm": 0.008318419568240643, + "learning_rate": 9.989709229673736e-05, + "loss": 0.0012, + "step": 915 + }, + { + "epoch": 2.9608080808080808, + "grad_norm": 0.009121793322265148, + "learning_rate": 9.98956382951228e-05, + "loss": 0.003, + "step": 916 + }, + { + "epoch": 2.964040404040404, + "grad_norm": 0.008950279094278812, + "learning_rate": 9.989417410424475e-05, + "loss": 0.003, + "step": 917 + }, + { + "epoch": 2.9672727272727273, + "grad_norm": 0.014811480417847633, + "learning_rate": 9.98926997244022e-05, + "loss": 0.0041, + "step": 918 + }, + { + "epoch": 2.9705050505050505, + "grad_norm": 0.007069775369018316, + "learning_rate": 9.989121515589622e-05, + "loss": 0.003, + "step": 919 + }, + { + "epoch": 2.973737373737374, + "grad_norm": 0.0060707684606313705, + "learning_rate": 9.988972039902997e-05, + "loss": 0.0019, + "step": 920 + }, + { + "epoch": 2.976969696969697, + "grad_norm": 0.005915610119700432, + "learning_rate": 9.988821545410874e-05, + "loss": 0.002, + "step": 921 + }, + { + "epoch": 2.9802020202020203, + "grad_norm": 0.008888405747711658, + "learning_rate": 9.988670032143981e-05, + "loss": 0.0033, + "step": 922 + }, + { + "epoch": 2.9834343434343435, + "grad_norm": 0.0058620586059987545, + "learning_rate": 9.988517500133262e-05, + "loss": 0.0019, + "step": 923 + }, + { + "epoch": 2.986666666666667, + "grad_norm": 0.005796400364488363, + "learning_rate": 9.988363949409865e-05, + "loss": 0.0021, + "step": 924 + }, + { + "epoch": 2.98989898989899, + "grad_norm": 0.0063505168072879314, + "learning_rate": 9.988209380005144e-05, + "loss": 0.0019, + "step": 925 + }, + { + "epoch": 2.98989898989899, + "eval_loss": 0.003320756834000349, + "eval_runtime": 18.5511, + "eval_samples_per_second": 5.391, + "eval_steps_per_second": 1.348, + "step": 925 + }, + { + "epoch": 2.993131313131313, + "grad_norm": 0.008493457920849323, + "learning_rate": 9.98805379195067e-05, + "loss": 0.0033, + "step": 926 + }, + { + "epoch": 2.996363636363636, + "grad_norm": 0.009364234283566475, + "learning_rate": 9.987897185278208e-05, + "loss": 0.0048, + "step": 927 + }, + { + "epoch": 2.9995959595959594, + "grad_norm": 0.009802762418985367, + "learning_rate": 9.987739560019746e-05, + "loss": 0.0028, + "step": 928 + }, + { + "epoch": 3.0028282828282826, + "grad_norm": 0.018954146653413773, + "learning_rate": 9.987580916207468e-05, + "loss": 0.0086, + "step": 929 + }, + { + "epoch": 3.006060606060606, + "grad_norm": 0.006144994404166937, + "learning_rate": 9.987421253873775e-05, + "loss": 0.0018, + "step": 930 + }, + { + "epoch": 3.009292929292929, + "grad_norm": 0.004907108843326569, + "learning_rate": 9.987260573051269e-05, + "loss": 0.0015, + "step": 931 + }, + { + "epoch": 3.0125252525252524, + "grad_norm": 0.006426096893846989, + "learning_rate": 9.987098873772763e-05, + "loss": 0.0023, + "step": 932 + }, + { + "epoch": 3.0157575757575756, + "grad_norm": 0.006343123037368059, + "learning_rate": 9.986936156071278e-05, + "loss": 0.0018, + "step": 933 + }, + { + "epoch": 3.018989898989899, + "grad_norm": 0.010373006574809551, + "learning_rate": 9.986772419980044e-05, + "loss": 0.0034, + "step": 934 + }, + { + "epoch": 3.022222222222222, + "grad_norm": 0.004722005221992731, + "learning_rate": 9.986607665532497e-05, + "loss": 0.0012, + "step": 935 + }, + { + "epoch": 3.0254545454545454, + "grad_norm": 0.007018386386334896, + "learning_rate": 9.986441892762281e-05, + "loss": 0.0019, + "step": 936 + }, + { + "epoch": 3.0286868686868686, + "grad_norm": 0.006261017639189959, + "learning_rate": 9.98627510170325e-05, + "loss": 0.0014, + "step": 937 + }, + { + "epoch": 3.031919191919192, + "grad_norm": 0.008190566673874855, + "learning_rate": 9.986107292389464e-05, + "loss": 0.0017, + "step": 938 + }, + { + "epoch": 3.035151515151515, + "grad_norm": 0.006314157508313656, + "learning_rate": 9.985938464855191e-05, + "loss": 0.0015, + "step": 939 + }, + { + "epoch": 3.0383838383838384, + "grad_norm": 0.008036741055548191, + "learning_rate": 9.985768619134909e-05, + "loss": 0.0021, + "step": 940 + }, + { + "epoch": 3.0416161616161617, + "grad_norm": 0.005572419613599777, + "learning_rate": 9.985597755263302e-05, + "loss": 0.0019, + "step": 941 + }, + { + "epoch": 3.044848484848485, + "grad_norm": 0.007106456905603409, + "learning_rate": 9.985425873275263e-05, + "loss": 0.0017, + "step": 942 + }, + { + "epoch": 3.048080808080808, + "grad_norm": 0.006766545120626688, + "learning_rate": 9.98525297320589e-05, + "loss": 0.0016, + "step": 943 + }, + { + "epoch": 3.0513131313131314, + "grad_norm": 0.00921205710619688, + "learning_rate": 9.985079055090493e-05, + "loss": 0.0012, + "step": 944 + }, + { + "epoch": 3.0545454545454547, + "grad_norm": 0.00547370221465826, + "learning_rate": 9.984904118964588e-05, + "loss": 0.0017, + "step": 945 + }, + { + "epoch": 3.057777777777778, + "grad_norm": 0.007727344986051321, + "learning_rate": 9.984728164863898e-05, + "loss": 0.0016, + "step": 946 + }, + { + "epoch": 3.061010101010101, + "grad_norm": 0.006054291035979986, + "learning_rate": 9.984551192824355e-05, + "loss": 0.0017, + "step": 947 + }, + { + "epoch": 3.0642424242424244, + "grad_norm": 0.00898781418800354, + "learning_rate": 9.9843732028821e-05, + "loss": 0.0022, + "step": 948 + }, + { + "epoch": 3.0674747474747477, + "grad_norm": 0.0060675074346363544, + "learning_rate": 9.98419419507348e-05, + "loss": 0.0013, + "step": 949 + }, + { + "epoch": 3.0707070707070705, + "grad_norm": 0.007023282814770937, + "learning_rate": 9.98401416943505e-05, + "loss": 0.0015, + "step": 950 + }, + { + "epoch": 3.0707070707070705, + "eval_loss": 0.003585340455174446, + "eval_runtime": 18.5936, + "eval_samples_per_second": 5.378, + "eval_steps_per_second": 1.345, + "step": 950 + }, + { + "epoch": 3.0739393939393937, + "grad_norm": 0.007739969063550234, + "learning_rate": 9.983833126003572e-05, + "loss": 0.0022, + "step": 951 + }, + { + "epoch": 3.077171717171717, + "grad_norm": 0.008393503725528717, + "learning_rate": 9.98365106481602e-05, + "loss": 0.0019, + "step": 952 + }, + { + "epoch": 3.0804040404040403, + "grad_norm": 0.006369095295667648, + "learning_rate": 9.983467985909573e-05, + "loss": 0.0011, + "step": 953 + }, + { + "epoch": 3.0836363636363635, + "grad_norm": 0.008163919672369957, + "learning_rate": 9.983283889321615e-05, + "loss": 0.0018, + "step": 954 + }, + { + "epoch": 3.0868686868686868, + "grad_norm": 0.018781553953886032, + "learning_rate": 9.983098775089742e-05, + "loss": 0.0038, + "step": 955 + }, + { + "epoch": 3.09010101010101, + "grad_norm": 0.012334289960563183, + "learning_rate": 9.982912643251757e-05, + "loss": 0.004, + "step": 956 + }, + { + "epoch": 3.0933333333333333, + "grad_norm": 0.005684965290129185, + "learning_rate": 9.98272549384567e-05, + "loss": 0.0016, + "step": 957 + }, + { + "epoch": 3.0965656565656565, + "grad_norm": 0.00594430323690176, + "learning_rate": 9.982537326909697e-05, + "loss": 0.0015, + "step": 958 + }, + { + "epoch": 3.0997979797979798, + "grad_norm": 0.006609255447983742, + "learning_rate": 9.982348142482269e-05, + "loss": 0.0016, + "step": 959 + }, + { + "epoch": 3.103030303030303, + "grad_norm": 0.010460656136274338, + "learning_rate": 9.982157940602014e-05, + "loss": 0.0039, + "step": 960 + }, + { + "epoch": 3.1062626262626263, + "grad_norm": 0.006908283568918705, + "learning_rate": 9.981966721307778e-05, + "loss": 0.0025, + "step": 961 + }, + { + "epoch": 3.1094949494949495, + "grad_norm": 0.006505319382995367, + "learning_rate": 9.981774484638606e-05, + "loss": 0.0018, + "step": 962 + }, + { + "epoch": 3.112727272727273, + "grad_norm": 0.005758680868893862, + "learning_rate": 9.981581230633758e-05, + "loss": 0.0014, + "step": 963 + }, + { + "epoch": 3.115959595959596, + "grad_norm": 0.004635549616068602, + "learning_rate": 9.981386959332697e-05, + "loss": 0.0013, + "step": 964 + }, + { + "epoch": 3.1191919191919193, + "grad_norm": 0.010312603786587715, + "learning_rate": 9.981191670775097e-05, + "loss": 0.0029, + "step": 965 + }, + { + "epoch": 3.1224242424242425, + "grad_norm": 0.006679642479866743, + "learning_rate": 9.980995365000836e-05, + "loss": 0.0022, + "step": 966 + }, + { + "epoch": 3.125656565656566, + "grad_norm": 0.009541639126837254, + "learning_rate": 9.980798042050004e-05, + "loss": 0.0024, + "step": 967 + }, + { + "epoch": 3.128888888888889, + "grad_norm": 0.006234496366232634, + "learning_rate": 9.980599701962896e-05, + "loss": 0.0018, + "step": 968 + }, + { + "epoch": 3.1321212121212123, + "grad_norm": 0.007063133176416159, + "learning_rate": 9.980400344780015e-05, + "loss": 0.0023, + "step": 969 + }, + { + "epoch": 3.1353535353535356, + "grad_norm": 0.009879599325358868, + "learning_rate": 9.98019997054207e-05, + "loss": 0.0037, + "step": 970 + }, + { + "epoch": 3.1385858585858584, + "grad_norm": 0.006942450068891048, + "learning_rate": 9.979998579289984e-05, + "loss": 0.0018, + "step": 971 + }, + { + "epoch": 3.1418181818181816, + "grad_norm": 0.007005956023931503, + "learning_rate": 9.979796171064881e-05, + "loss": 0.0015, + "step": 972 + }, + { + "epoch": 3.145050505050505, + "grad_norm": 0.009488895535469055, + "learning_rate": 9.979592745908095e-05, + "loss": 0.0022, + "step": 973 + }, + { + "epoch": 3.148282828282828, + "grad_norm": 0.00824445765465498, + "learning_rate": 9.979388303861169e-05, + "loss": 0.0016, + "step": 974 + }, + { + "epoch": 3.1515151515151514, + "grad_norm": 0.006121410988271236, + "learning_rate": 9.97918284496585e-05, + "loss": 0.0018, + "step": 975 + }, + { + "epoch": 3.1515151515151514, + "eval_loss": 0.0034363586455583572, + "eval_runtime": 18.5459, + "eval_samples_per_second": 5.392, + "eval_steps_per_second": 1.348, + "step": 975 + }, + { + "epoch": 3.1547474747474746, + "grad_norm": 0.003954809159040451, + "learning_rate": 9.978976369264098e-05, + "loss": 0.0008, + "step": 976 + }, + { + "epoch": 3.157979797979798, + "grad_norm": 0.0071476600132882595, + "learning_rate": 9.978768876798075e-05, + "loss": 0.0019, + "step": 977 + }, + { + "epoch": 3.161212121212121, + "grad_norm": 0.006608667783439159, + "learning_rate": 9.978560367610156e-05, + "loss": 0.0015, + "step": 978 + }, + { + "epoch": 3.1644444444444444, + "grad_norm": 0.010076618753373623, + "learning_rate": 9.978350841742919e-05, + "loss": 0.0031, + "step": 979 + }, + { + "epoch": 3.1676767676767676, + "grad_norm": 0.008147617802023888, + "learning_rate": 9.978140299239152e-05, + "loss": 0.0023, + "step": 980 + }, + { + "epoch": 3.170909090909091, + "grad_norm": 0.006681856233626604, + "learning_rate": 9.977928740141851e-05, + "loss": 0.0021, + "step": 981 + }, + { + "epoch": 3.174141414141414, + "grad_norm": 0.007071407046169043, + "learning_rate": 9.977716164494217e-05, + "loss": 0.0018, + "step": 982 + }, + { + "epoch": 3.1773737373737374, + "grad_norm": 0.009117268025875092, + "learning_rate": 9.977502572339664e-05, + "loss": 0.0023, + "step": 983 + }, + { + "epoch": 3.1806060606060607, + "grad_norm": 0.007354568224400282, + "learning_rate": 9.977287963721804e-05, + "loss": 0.002, + "step": 984 + }, + { + "epoch": 3.183838383838384, + "grad_norm": 0.003036913461983204, + "learning_rate": 9.977072338684469e-05, + "loss": 0.0007, + "step": 985 + }, + { + "epoch": 3.187070707070707, + "grad_norm": 0.011151008307933807, + "learning_rate": 9.976855697271689e-05, + "loss": 0.0026, + "step": 986 + }, + { + "epoch": 3.1903030303030304, + "grad_norm": 0.008813554421067238, + "learning_rate": 9.976638039527704e-05, + "loss": 0.0023, + "step": 987 + }, + { + "epoch": 3.1935353535353537, + "grad_norm": 0.005291069392114878, + "learning_rate": 9.976419365496963e-05, + "loss": 0.0013, + "step": 988 + }, + { + "epoch": 3.196767676767677, + "grad_norm": 0.007516716606914997, + "learning_rate": 9.976199675224123e-05, + "loss": 0.0021, + "step": 989 + }, + { + "epoch": 3.2, + "grad_norm": 0.0058667948469519615, + "learning_rate": 9.975978968754045e-05, + "loss": 0.0014, + "step": 990 + }, + { + "epoch": 3.2032323232323234, + "grad_norm": 0.011559627018868923, + "learning_rate": 9.975757246131803e-05, + "loss": 0.0027, + "step": 991 + }, + { + "epoch": 3.2064646464646467, + "grad_norm": 0.008682847954332829, + "learning_rate": 9.975534507402671e-05, + "loss": 0.0025, + "step": 992 + }, + { + "epoch": 3.2096969696969695, + "grad_norm": 0.009858380071818829, + "learning_rate": 9.975310752612137e-05, + "loss": 0.0035, + "step": 993 + }, + { + "epoch": 3.2129292929292927, + "grad_norm": 0.00856295507401228, + "learning_rate": 9.975085981805897e-05, + "loss": 0.0021, + "step": 994 + }, + { + "epoch": 3.216161616161616, + "grad_norm": 0.0047248350456357, + "learning_rate": 9.974860195029847e-05, + "loss": 0.0013, + "step": 995 + }, + { + "epoch": 3.2193939393939393, + "grad_norm": 0.008117076009511948, + "learning_rate": 9.974633392330097e-05, + "loss": 0.0019, + "step": 996 + }, + { + "epoch": 3.2226262626262625, + "grad_norm": 0.008703903295099735, + "learning_rate": 9.974405573752965e-05, + "loss": 0.0027, + "step": 997 + }, + { + "epoch": 3.2258585858585858, + "grad_norm": 0.011345437727868557, + "learning_rate": 9.974176739344971e-05, + "loss": 0.0033, + "step": 998 + }, + { + "epoch": 3.229090909090909, + "grad_norm": 0.006451399996876717, + "learning_rate": 9.973946889152847e-05, + "loss": 0.0015, + "step": 999 + }, + { + "epoch": 3.2323232323232323, + "grad_norm": 0.004938072524964809, + "learning_rate": 9.973716023223531e-05, + "loss": 0.0013, + "step": 1000 + }, + { + "epoch": 3.2323232323232323, + "eval_loss": 0.003583682468160987, + "eval_runtime": 18.5886, + "eval_samples_per_second": 5.38, + "eval_steps_per_second": 1.345, + "step": 1000 + }, + { + "epoch": 3.2355555555555555, + "grad_norm": 0.008080260828137398, + "learning_rate": 9.97348414160417e-05, + "loss": 0.0021, + "step": 1001 + }, + { + "epoch": 3.2387878787878788, + "grad_norm": 0.00879309605807066, + "learning_rate": 9.973251244342114e-05, + "loss": 0.0022, + "step": 1002 + }, + { + "epoch": 3.242020202020202, + "grad_norm": 0.008178070187568665, + "learning_rate": 9.973017331484926e-05, + "loss": 0.0021, + "step": 1003 + }, + { + "epoch": 3.2452525252525253, + "grad_norm": 0.004717908333986998, + "learning_rate": 9.972782403080372e-05, + "loss": 0.0014, + "step": 1004 + }, + { + "epoch": 3.2484848484848485, + "grad_norm": 0.006190045271068811, + "learning_rate": 9.972546459176425e-05, + "loss": 0.0017, + "step": 1005 + }, + { + "epoch": 3.251717171717172, + "grad_norm": 0.010530155152082443, + "learning_rate": 9.972309499821273e-05, + "loss": 0.0034, + "step": 1006 + }, + { + "epoch": 3.254949494949495, + "grad_norm": 0.00893255416303873, + "learning_rate": 9.972071525063303e-05, + "loss": 0.0025, + "step": 1007 + }, + { + "epoch": 3.2581818181818183, + "grad_norm": 0.007108526304364204, + "learning_rate": 9.971832534951108e-05, + "loss": 0.0014, + "step": 1008 + }, + { + "epoch": 3.2614141414141415, + "grad_norm": 0.00837770476937294, + "learning_rate": 9.9715925295335e-05, + "loss": 0.0025, + "step": 1009 + }, + { + "epoch": 3.264646464646465, + "grad_norm": 0.0038234281819313765, + "learning_rate": 9.971351508859488e-05, + "loss": 0.0012, + "step": 1010 + }, + { + "epoch": 3.267878787878788, + "grad_norm": 0.00693462323397398, + "learning_rate": 9.971109472978288e-05, + "loss": 0.0022, + "step": 1011 + }, + { + "epoch": 3.2711111111111113, + "grad_norm": 0.006061007268726826, + "learning_rate": 9.97086642193933e-05, + "loss": 0.0017, + "step": 1012 + }, + { + "epoch": 3.274343434343434, + "grad_norm": 0.0076158661395311356, + "learning_rate": 9.970622355792247e-05, + "loss": 0.0019, + "step": 1013 + }, + { + "epoch": 3.2775757575757574, + "grad_norm": 0.003440310014411807, + "learning_rate": 9.970377274586879e-05, + "loss": 0.0008, + "step": 1014 + }, + { + "epoch": 3.2808080808080806, + "grad_norm": 0.006996039301156998, + "learning_rate": 9.970131178373277e-05, + "loss": 0.0015, + "step": 1015 + }, + { + "epoch": 3.284040404040404, + "grad_norm": 0.006500132381916046, + "learning_rate": 9.969884067201695e-05, + "loss": 0.002, + "step": 1016 + }, + { + "epoch": 3.287272727272727, + "grad_norm": 0.009818831458687782, + "learning_rate": 9.969635941122595e-05, + "loss": 0.0021, + "step": 1017 + }, + { + "epoch": 3.2905050505050504, + "grad_norm": 0.005738329142332077, + "learning_rate": 9.969386800186649e-05, + "loss": 0.0015, + "step": 1018 + }, + { + "epoch": 3.2937373737373736, + "grad_norm": 0.005722105037420988, + "learning_rate": 9.969136644444731e-05, + "loss": 0.0014, + "step": 1019 + }, + { + "epoch": 3.296969696969697, + "grad_norm": 0.005793727934360504, + "learning_rate": 9.968885473947932e-05, + "loss": 0.0016, + "step": 1020 + }, + { + "epoch": 3.30020202020202, + "grad_norm": 0.004094158299267292, + "learning_rate": 9.968633288747539e-05, + "loss": 0.001, + "step": 1021 + }, + { + "epoch": 3.3034343434343434, + "grad_norm": 0.011295527219772339, + "learning_rate": 9.968380088895052e-05, + "loss": 0.0015, + "step": 1022 + }, + { + "epoch": 3.3066666666666666, + "grad_norm": 0.006014000624418259, + "learning_rate": 9.968125874442179e-05, + "loss": 0.002, + "step": 1023 + }, + { + "epoch": 3.30989898989899, + "grad_norm": 0.010188030079007149, + "learning_rate": 9.96787064544083e-05, + "loss": 0.0022, + "step": 1024 + }, + { + "epoch": 3.313131313131313, + "grad_norm": 0.00868601817637682, + "learning_rate": 9.96761440194313e-05, + "loss": 0.0019, + "step": 1025 + }, + { + "epoch": 3.313131313131313, + "eval_loss": 0.0033993881661444902, + "eval_runtime": 18.5483, + "eval_samples_per_second": 5.391, + "eval_steps_per_second": 1.348, + "step": 1025 + }, + { + "epoch": 3.3163636363636364, + "grad_norm": 0.007088824175298214, + "learning_rate": 9.967357144001403e-05, + "loss": 0.002, + "step": 1026 + }, + { + "epoch": 3.3195959595959597, + "grad_norm": 0.007010690867900848, + "learning_rate": 9.967098871668186e-05, + "loss": 0.0017, + "step": 1027 + }, + { + "epoch": 3.322828282828283, + "grad_norm": 0.005562958773225546, + "learning_rate": 9.966839584996222e-05, + "loss": 0.0016, + "step": 1028 + }, + { + "epoch": 3.326060606060606, + "grad_norm": 0.0047673918306827545, + "learning_rate": 9.96657928403846e-05, + "loss": 0.0013, + "step": 1029 + }, + { + "epoch": 3.3292929292929294, + "grad_norm": 0.007019753102213144, + "learning_rate": 9.966317968848054e-05, + "loss": 0.0019, + "step": 1030 + }, + { + "epoch": 3.3325252525252527, + "grad_norm": 0.006734140682965517, + "learning_rate": 9.966055639478369e-05, + "loss": 0.0021, + "step": 1031 + }, + { + "epoch": 3.335757575757576, + "grad_norm": 0.008648472838103771, + "learning_rate": 9.965792295982978e-05, + "loss": 0.0033, + "step": 1032 + }, + { + "epoch": 3.338989898989899, + "grad_norm": 0.004571566358208656, + "learning_rate": 9.965527938415655e-05, + "loss": 0.0014, + "step": 1033 + }, + { + "epoch": 3.3422222222222224, + "grad_norm": 0.007610399276018143, + "learning_rate": 9.965262566830388e-05, + "loss": 0.0026, + "step": 1034 + }, + { + "epoch": 3.3454545454545457, + "grad_norm": 0.006182728800922632, + "learning_rate": 9.964996181281367e-05, + "loss": 0.0022, + "step": 1035 + }, + { + "epoch": 3.348686868686869, + "grad_norm": 0.006889553740620613, + "learning_rate": 9.964728781822992e-05, + "loss": 0.0018, + "step": 1036 + }, + { + "epoch": 3.3519191919191917, + "grad_norm": 0.005316969007253647, + "learning_rate": 9.964460368509867e-05, + "loss": 0.0013, + "step": 1037 + }, + { + "epoch": 3.355151515151515, + "grad_norm": 0.014395351521670818, + "learning_rate": 9.964190941396808e-05, + "loss": 0.0057, + "step": 1038 + }, + { + "epoch": 3.3583838383838382, + "grad_norm": 0.007097650784999132, + "learning_rate": 9.963920500538834e-05, + "loss": 0.0015, + "step": 1039 + }, + { + "epoch": 3.3616161616161615, + "grad_norm": 0.009330140426754951, + "learning_rate": 9.963649045991173e-05, + "loss": 0.0029, + "step": 1040 + }, + { + "epoch": 3.3648484848484848, + "grad_norm": 0.009492140263319016, + "learning_rate": 9.963376577809256e-05, + "loss": 0.0016, + "step": 1041 + }, + { + "epoch": 3.368080808080808, + "grad_norm": 0.00905213225632906, + "learning_rate": 9.963103096048728e-05, + "loss": 0.0027, + "step": 1042 + }, + { + "epoch": 3.3713131313131313, + "grad_norm": 0.010693477466702461, + "learning_rate": 9.962828600765433e-05, + "loss": 0.003, + "step": 1043 + }, + { + "epoch": 3.3745454545454545, + "grad_norm": 0.006187601946294308, + "learning_rate": 9.96255309201543e-05, + "loss": 0.0014, + "step": 1044 + }, + { + "epoch": 3.3777777777777778, + "grad_norm": 0.00784679502248764, + "learning_rate": 9.962276569854977e-05, + "loss": 0.0019, + "step": 1045 + }, + { + "epoch": 3.381010101010101, + "grad_norm": 0.022418344393372536, + "learning_rate": 9.96199903434055e-05, + "loss": 0.0017, + "step": 1046 + }, + { + "epoch": 3.3842424242424243, + "grad_norm": 0.005133874248713255, + "learning_rate": 9.961720485528819e-05, + "loss": 0.0013, + "step": 1047 + }, + { + "epoch": 3.3874747474747475, + "grad_norm": 0.007564987521618605, + "learning_rate": 9.961440923476666e-05, + "loss": 0.0019, + "step": 1048 + }, + { + "epoch": 3.390707070707071, + "grad_norm": 0.008043873123824596, + "learning_rate": 9.961160348241185e-05, + "loss": 0.0016, + "step": 1049 + }, + { + "epoch": 3.393939393939394, + "grad_norm": 0.0051131471991539, + "learning_rate": 9.96087875987967e-05, + "loss": 0.0012, + "step": 1050 + }, + { + "epoch": 3.393939393939394, + "eval_loss": 0.003347641322761774, + "eval_runtime": 18.537, + "eval_samples_per_second": 5.395, + "eval_steps_per_second": 1.349, + "step": 1050 + }, + { + "epoch": 3.3971717171717173, + "grad_norm": 0.008095295168459415, + "learning_rate": 9.960596158449627e-05, + "loss": 0.002, + "step": 1051 + }, + { + "epoch": 3.4004040404040405, + "grad_norm": 0.006727342493832111, + "learning_rate": 9.960312544008763e-05, + "loss": 0.0018, + "step": 1052 + }, + { + "epoch": 3.403636363636364, + "grad_norm": 0.006898078136146069, + "learning_rate": 9.960027916614998e-05, + "loss": 0.0012, + "step": 1053 + }, + { + "epoch": 3.406868686868687, + "grad_norm": 0.007435730658471584, + "learning_rate": 9.959742276326456e-05, + "loss": 0.0017, + "step": 1054 + }, + { + "epoch": 3.41010101010101, + "grad_norm": 0.008618218824267387, + "learning_rate": 9.959455623201465e-05, + "loss": 0.0021, + "step": 1055 + }, + { + "epoch": 3.413333333333333, + "grad_norm": 0.005248959641903639, + "learning_rate": 9.959167957298568e-05, + "loss": 0.0015, + "step": 1056 + }, + { + "epoch": 3.4165656565656564, + "grad_norm": 0.006079430691897869, + "learning_rate": 9.958879278676506e-05, + "loss": 0.0015, + "step": 1057 + }, + { + "epoch": 3.4197979797979796, + "grad_norm": 0.007255829870700836, + "learning_rate": 9.958589587394231e-05, + "loss": 0.002, + "step": 1058 + }, + { + "epoch": 3.423030303030303, + "grad_norm": 0.00781642273068428, + "learning_rate": 9.958298883510903e-05, + "loss": 0.0021, + "step": 1059 + }, + { + "epoch": 3.426262626262626, + "grad_norm": 0.008929956704378128, + "learning_rate": 9.958007167085886e-05, + "loss": 0.0018, + "step": 1060 + }, + { + "epoch": 3.4294949494949494, + "grad_norm": 0.008798833936452866, + "learning_rate": 9.95771443817875e-05, + "loss": 0.0025, + "step": 1061 + }, + { + "epoch": 3.4327272727272726, + "grad_norm": 0.008338157087564468, + "learning_rate": 9.957420696849275e-05, + "loss": 0.0021, + "step": 1062 + }, + { + "epoch": 3.435959595959596, + "grad_norm": 0.011783271096646786, + "learning_rate": 9.957125943157448e-05, + "loss": 0.0024, + "step": 1063 + }, + { + "epoch": 3.439191919191919, + "grad_norm": 0.005160772707313299, + "learning_rate": 9.956830177163461e-05, + "loss": 0.0014, + "step": 1064 + }, + { + "epoch": 3.4424242424242424, + "grad_norm": 0.007435562089085579, + "learning_rate": 9.95653339892771e-05, + "loss": 0.0025, + "step": 1065 + }, + { + "epoch": 3.4456565656565656, + "grad_norm": 0.009998551569879055, + "learning_rate": 9.956235608510802e-05, + "loss": 0.0026, + "step": 1066 + }, + { + "epoch": 3.448888888888889, + "grad_norm": 0.008799971081316471, + "learning_rate": 9.95593680597355e-05, + "loss": 0.0028, + "step": 1067 + }, + { + "epoch": 3.452121212121212, + "grad_norm": 0.013296165503561497, + "learning_rate": 9.955636991376971e-05, + "loss": 0.0044, + "step": 1068 + }, + { + "epoch": 3.4553535353535354, + "grad_norm": 0.008160554803907871, + "learning_rate": 9.955336164782292e-05, + "loss": 0.002, + "step": 1069 + }, + { + "epoch": 3.4585858585858587, + "grad_norm": 0.010213302448391914, + "learning_rate": 9.955034326250946e-05, + "loss": 0.0027, + "step": 1070 + }, + { + "epoch": 3.461818181818182, + "grad_norm": 0.009601346217095852, + "learning_rate": 9.954731475844571e-05, + "loss": 0.0036, + "step": 1071 + }, + { + "epoch": 3.465050505050505, + "grad_norm": 0.009020958095788956, + "learning_rate": 9.954427613625013e-05, + "loss": 0.0027, + "step": 1072 + }, + { + "epoch": 3.4682828282828284, + "grad_norm": 0.005136935506016016, + "learning_rate": 9.95412273965432e-05, + "loss": 0.0016, + "step": 1073 + }, + { + "epoch": 3.4715151515151517, + "grad_norm": 0.004412552807480097, + "learning_rate": 9.953816853994759e-05, + "loss": 0.0012, + "step": 1074 + }, + { + "epoch": 3.474747474747475, + "grad_norm": 0.00772328395396471, + "learning_rate": 9.953509956708789e-05, + "loss": 0.0018, + "step": 1075 + }, + { + "epoch": 3.474747474747475, + "eval_loss": 0.0034066771622747183, + "eval_runtime": 18.5582, + "eval_samples_per_second": 5.388, + "eval_steps_per_second": 1.347, + "step": 1075 + }, + { + "epoch": 3.477979797979798, + "grad_norm": 0.010046589188277721, + "learning_rate": 9.953202047859085e-05, + "loss": 0.0035, + "step": 1076 + }, + { + "epoch": 3.4812121212121214, + "grad_norm": 0.008048112504184246, + "learning_rate": 9.952893127508522e-05, + "loss": 0.0021, + "step": 1077 + }, + { + "epoch": 3.4844444444444447, + "grad_norm": 0.0077101015485823154, + "learning_rate": 9.95258319572019e-05, + "loss": 0.002, + "step": 1078 + }, + { + "epoch": 3.4876767676767675, + "grad_norm": 0.00719053577631712, + "learning_rate": 9.952272252557378e-05, + "loss": 0.002, + "step": 1079 + }, + { + "epoch": 3.4909090909090907, + "grad_norm": 0.006908205803483725, + "learning_rate": 9.951960298083583e-05, + "loss": 0.0017, + "step": 1080 + }, + { + "epoch": 3.494141414141414, + "grad_norm": 0.006123875733464956, + "learning_rate": 9.95164733236251e-05, + "loss": 0.0019, + "step": 1081 + }, + { + "epoch": 3.4973737373737372, + "grad_norm": 0.00980414543300867, + "learning_rate": 9.951333355458072e-05, + "loss": 0.0022, + "step": 1082 + }, + { + "epoch": 3.5006060606060605, + "grad_norm": 0.004448724910616875, + "learning_rate": 9.951018367434386e-05, + "loss": 0.0012, + "step": 1083 + }, + { + "epoch": 3.5038383838383838, + "grad_norm": 0.0064595406875014305, + "learning_rate": 9.950702368355775e-05, + "loss": 0.0012, + "step": 1084 + }, + { + "epoch": 3.507070707070707, + "grad_norm": 0.005067575257271528, + "learning_rate": 9.950385358286772e-05, + "loss": 0.0014, + "step": 1085 + }, + { + "epoch": 3.5103030303030303, + "grad_norm": 0.006261507514864206, + "learning_rate": 9.950067337292112e-05, + "loss": 0.0018, + "step": 1086 + }, + { + "epoch": 3.5135353535353535, + "grad_norm": 0.008754471316933632, + "learning_rate": 9.949748305436741e-05, + "loss": 0.0027, + "step": 1087 + }, + { + "epoch": 3.5167676767676768, + "grad_norm": 0.011027954518795013, + "learning_rate": 9.949428262785805e-05, + "loss": 0.0028, + "step": 1088 + }, + { + "epoch": 3.52, + "grad_norm": 0.012316351756453514, + "learning_rate": 9.949107209404665e-05, + "loss": 0.0025, + "step": 1089 + }, + { + "epoch": 3.5232323232323233, + "grad_norm": 0.005655081011354923, + "learning_rate": 9.948785145358879e-05, + "loss": 0.0014, + "step": 1090 + }, + { + "epoch": 3.5264646464646465, + "grad_norm": 0.010539629496634007, + "learning_rate": 9.948462070714219e-05, + "loss": 0.002, + "step": 1091 + }, + { + "epoch": 3.5296969696969698, + "grad_norm": 0.006462043151259422, + "learning_rate": 9.948137985536662e-05, + "loss": 0.0022, + "step": 1092 + }, + { + "epoch": 3.532929292929293, + "grad_norm": 0.007820974104106426, + "learning_rate": 9.947812889892387e-05, + "loss": 0.0016, + "step": 1093 + }, + { + "epoch": 3.5361616161616163, + "grad_norm": 0.006513669621199369, + "learning_rate": 9.947486783847784e-05, + "loss": 0.0019, + "step": 1094 + }, + { + "epoch": 3.5393939393939395, + "grad_norm": 0.007371378131210804, + "learning_rate": 9.947159667469446e-05, + "loss": 0.0014, + "step": 1095 + }, + { + "epoch": 3.542626262626263, + "grad_norm": 0.005782526917755604, + "learning_rate": 9.946831540824175e-05, + "loss": 0.0018, + "step": 1096 + }, + { + "epoch": 3.5458585858585856, + "grad_norm": 0.00670381635427475, + "learning_rate": 9.94650240397898e-05, + "loss": 0.0023, + "step": 1097 + }, + { + "epoch": 3.549090909090909, + "grad_norm": 0.010472368448972702, + "learning_rate": 9.946172257001069e-05, + "loss": 0.0031, + "step": 1098 + }, + { + "epoch": 3.552323232323232, + "grad_norm": 0.01276701781898737, + "learning_rate": 9.945841099957869e-05, + "loss": 0.0037, + "step": 1099 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 0.00712077459320426, + "learning_rate": 9.945508932917001e-05, + "loss": 0.0015, + "step": 1100 + }, + { + "epoch": 3.5555555555555554, + "eval_loss": 0.003400927409529686, + "eval_runtime": 18.5727, + "eval_samples_per_second": 5.384, + "eval_steps_per_second": 1.346, + "step": 1100 + }, + { + "epoch": 3.5587878787878786, + "grad_norm": 0.00810240674763918, + "learning_rate": 9.9451757559463e-05, + "loss": 0.0023, + "step": 1101 + }, + { + "epoch": 3.562020202020202, + "grad_norm": 0.011555838398635387, + "learning_rate": 9.944841569113803e-05, + "loss": 0.0029, + "step": 1102 + }, + { + "epoch": 3.565252525252525, + "grad_norm": 0.006031879689544439, + "learning_rate": 9.944506372487754e-05, + "loss": 0.0015, + "step": 1103 + }, + { + "epoch": 3.5684848484848484, + "grad_norm": 0.007457388564944267, + "learning_rate": 9.944170166136607e-05, + "loss": 0.0016, + "step": 1104 + }, + { + "epoch": 3.5717171717171716, + "grad_norm": 0.00727875716984272, + "learning_rate": 9.943832950129018e-05, + "loss": 0.0017, + "step": 1105 + }, + { + "epoch": 3.574949494949495, + "grad_norm": 0.014444559812545776, + "learning_rate": 9.943494724533848e-05, + "loss": 0.0041, + "step": 1106 + }, + { + "epoch": 3.578181818181818, + "grad_norm": 0.01081145741045475, + "learning_rate": 9.943155489420169e-05, + "loss": 0.0018, + "step": 1107 + }, + { + "epoch": 3.5814141414141414, + "grad_norm": 0.01200890727341175, + "learning_rate": 9.942815244857256e-05, + "loss": 0.0026, + "step": 1108 + }, + { + "epoch": 3.5846464646464646, + "grad_norm": 0.008189111016690731, + "learning_rate": 9.942473990914593e-05, + "loss": 0.003, + "step": 1109 + }, + { + "epoch": 3.587878787878788, + "grad_norm": 0.0054262722842395306, + "learning_rate": 9.942131727661863e-05, + "loss": 0.0014, + "step": 1110 + }, + { + "epoch": 3.591111111111111, + "grad_norm": 0.009128454141318798, + "learning_rate": 9.941788455168965e-05, + "loss": 0.0023, + "step": 1111 + }, + { + "epoch": 3.5943434343434344, + "grad_norm": 0.007988560944795609, + "learning_rate": 9.941444173505997e-05, + "loss": 0.003, + "step": 1112 + }, + { + "epoch": 3.5975757575757576, + "grad_norm": 0.0045745945535600185, + "learning_rate": 9.941098882743267e-05, + "loss": 0.0015, + "step": 1113 + }, + { + "epoch": 3.600808080808081, + "grad_norm": 0.009445126168429852, + "learning_rate": 9.940752582951283e-05, + "loss": 0.0025, + "step": 1114 + }, + { + "epoch": 3.604040404040404, + "grad_norm": 0.007550603710114956, + "learning_rate": 9.940405274200769e-05, + "loss": 0.0021, + "step": 1115 + }, + { + "epoch": 3.6072727272727274, + "grad_norm": 0.006470720749348402, + "learning_rate": 9.940056956562645e-05, + "loss": 0.002, + "step": 1116 + }, + { + "epoch": 3.6105050505050507, + "grad_norm": 0.006469183601438999, + "learning_rate": 9.939707630108044e-05, + "loss": 0.0014, + "step": 1117 + }, + { + "epoch": 3.613737373737374, + "grad_norm": 0.008634147234261036, + "learning_rate": 9.939357294908301e-05, + "loss": 0.0021, + "step": 1118 + }, + { + "epoch": 3.616969696969697, + "grad_norm": 0.0053526172414422035, + "learning_rate": 9.939005951034959e-05, + "loss": 0.0013, + "step": 1119 + }, + { + "epoch": 3.6202020202020204, + "grad_norm": 0.007559166289865971, + "learning_rate": 9.938653598559769e-05, + "loss": 0.0013, + "step": 1120 + }, + { + "epoch": 3.6234343434343437, + "grad_norm": 0.005990424659103155, + "learning_rate": 9.93830023755468e-05, + "loss": 0.0014, + "step": 1121 + }, + { + "epoch": 3.626666666666667, + "grad_norm": 0.009027826599776745, + "learning_rate": 9.937945868091856e-05, + "loss": 0.002, + "step": 1122 + }, + { + "epoch": 3.6298989898989897, + "grad_norm": 0.006613972131162882, + "learning_rate": 9.937590490243665e-05, + "loss": 0.0019, + "step": 1123 + }, + { + "epoch": 3.633131313131313, + "grad_norm": 0.009335087612271309, + "learning_rate": 9.937234104082676e-05, + "loss": 0.0025, + "step": 1124 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 0.005268748849630356, + "learning_rate": 9.936876709681668e-05, + "loss": 0.0012, + "step": 1125 + }, + { + "epoch": 3.6363636363636362, + "eval_loss": 0.0034194523468613625, + "eval_runtime": 18.5538, + "eval_samples_per_second": 5.39, + "eval_steps_per_second": 1.347, + "step": 1125 + }, + { + "epoch": 3.6395959595959595, + "grad_norm": 0.008097901940345764, + "learning_rate": 9.936518307113625e-05, + "loss": 0.0019, + "step": 1126 + }, + { + "epoch": 3.6428282828282827, + "grad_norm": 0.007424204144626856, + "learning_rate": 9.936158896451737e-05, + "loss": 0.0015, + "step": 1127 + }, + { + "epoch": 3.646060606060606, + "grad_norm": 0.00813052337616682, + "learning_rate": 9.9357984777694e-05, + "loss": 0.0026, + "step": 1128 + }, + { + "epoch": 3.6492929292929293, + "grad_norm": 0.010218325071036816, + "learning_rate": 9.935437051140216e-05, + "loss": 0.0021, + "step": 1129 + }, + { + "epoch": 3.6525252525252525, + "grad_norm": 0.006201726384460926, + "learning_rate": 9.935074616637992e-05, + "loss": 0.0015, + "step": 1130 + }, + { + "epoch": 3.6557575757575758, + "grad_norm": 0.010994481854140759, + "learning_rate": 9.934711174336742e-05, + "loss": 0.0045, + "step": 1131 + }, + { + "epoch": 3.658989898989899, + "grad_norm": 0.005597412586212158, + "learning_rate": 9.934346724310684e-05, + "loss": 0.0015, + "step": 1132 + }, + { + "epoch": 3.6622222222222223, + "grad_norm": 0.008483976125717163, + "learning_rate": 9.933981266634243e-05, + "loss": 0.0016, + "step": 1133 + }, + { + "epoch": 3.6654545454545455, + "grad_norm": 0.008453707210719585, + "learning_rate": 9.93361480138205e-05, + "loss": 0.0026, + "step": 1134 + }, + { + "epoch": 3.6686868686868688, + "grad_norm": 0.007491501048207283, + "learning_rate": 9.933247328628944e-05, + "loss": 0.0016, + "step": 1135 + }, + { + "epoch": 3.671919191919192, + "grad_norm": 0.00986399594694376, + "learning_rate": 9.93287884844996e-05, + "loss": 0.0028, + "step": 1136 + }, + { + "epoch": 3.6751515151515153, + "grad_norm": 0.0068638077937066555, + "learning_rate": 9.932509360920353e-05, + "loss": 0.0015, + "step": 1137 + }, + { + "epoch": 3.6783838383838385, + "grad_norm": 0.005731069948524237, + "learning_rate": 9.932138866115574e-05, + "loss": 0.0021, + "step": 1138 + }, + { + "epoch": 3.6816161616161613, + "grad_norm": 0.006409571971744299, + "learning_rate": 9.931767364111283e-05, + "loss": 0.0018, + "step": 1139 + }, + { + "epoch": 3.6848484848484846, + "grad_norm": 0.010908740572631359, + "learning_rate": 9.931394854983345e-05, + "loss": 0.002, + "step": 1140 + }, + { + "epoch": 3.688080808080808, + "grad_norm": 0.011212184093892574, + "learning_rate": 9.931021338807828e-05, + "loss": 0.004, + "step": 1141 + }, + { + "epoch": 3.691313131313131, + "grad_norm": 0.011185558512806892, + "learning_rate": 9.93064681566101e-05, + "loss": 0.0032, + "step": 1142 + }, + { + "epoch": 3.6945454545454544, + "grad_norm": 0.00570765370503068, + "learning_rate": 9.930271285619376e-05, + "loss": 0.0015, + "step": 1143 + }, + { + "epoch": 3.6977777777777776, + "grad_norm": 0.008618151769042015, + "learning_rate": 9.92989474875961e-05, + "loss": 0.0027, + "step": 1144 + }, + { + "epoch": 3.701010101010101, + "grad_norm": 0.009267282672226429, + "learning_rate": 9.929517205158605e-05, + "loss": 0.0032, + "step": 1145 + }, + { + "epoch": 3.704242424242424, + "grad_norm": 0.010019971989095211, + "learning_rate": 9.929138654893462e-05, + "loss": 0.0027, + "step": 1146 + }, + { + "epoch": 3.7074747474747474, + "grad_norm": 0.0062004695646464825, + "learning_rate": 9.928759098041482e-05, + "loss": 0.0018, + "step": 1147 + }, + { + "epoch": 3.7107070707070706, + "grad_norm": 0.005688016768544912, + "learning_rate": 9.928378534680178e-05, + "loss": 0.0012, + "step": 1148 + }, + { + "epoch": 3.713939393939394, + "grad_norm": 0.008614295162260532, + "learning_rate": 9.927996964887265e-05, + "loss": 0.0023, + "step": 1149 + }, + { + "epoch": 3.717171717171717, + "grad_norm": 0.004473026841878891, + "learning_rate": 9.927614388740663e-05, + "loss": 0.0018, + "step": 1150 + }, + { + "epoch": 3.717171717171717, + "eval_loss": 0.0032353622373193502, + "eval_runtime": 18.57, + "eval_samples_per_second": 5.385, + "eval_steps_per_second": 1.346, + "step": 1150 + }, + { + "epoch": 3.7204040404040404, + "grad_norm": 0.0077863396145403385, + "learning_rate": 9.9272308063185e-05, + "loss": 0.0016, + "step": 1151 + }, + { + "epoch": 3.7236363636363636, + "grad_norm": 0.010232211090624332, + "learning_rate": 9.926846217699104e-05, + "loss": 0.0028, + "step": 1152 + }, + { + "epoch": 3.726868686868687, + "grad_norm": 0.006621774286031723, + "learning_rate": 9.926460622961016e-05, + "loss": 0.002, + "step": 1153 + }, + { + "epoch": 3.73010101010101, + "grad_norm": 0.004830366000533104, + "learning_rate": 9.926074022182979e-05, + "loss": 0.0012, + "step": 1154 + }, + { + "epoch": 3.7333333333333334, + "grad_norm": 0.006570787634700537, + "learning_rate": 9.92568641544394e-05, + "loss": 0.0019, + "step": 1155 + }, + { + "epoch": 3.7365656565656566, + "grad_norm": 0.005988228600472212, + "learning_rate": 9.925297802823054e-05, + "loss": 0.0015, + "step": 1156 + }, + { + "epoch": 3.73979797979798, + "grad_norm": 0.007999586872756481, + "learning_rate": 9.924908184399677e-05, + "loss": 0.0024, + "step": 1157 + }, + { + "epoch": 3.743030303030303, + "grad_norm": 0.010493471287190914, + "learning_rate": 9.924517560253378e-05, + "loss": 0.002, + "step": 1158 + }, + { + "epoch": 3.7462626262626264, + "grad_norm": 0.006136655807495117, + "learning_rate": 9.924125930463924e-05, + "loss": 0.0013, + "step": 1159 + }, + { + "epoch": 3.7494949494949497, + "grad_norm": 0.0069641610607504845, + "learning_rate": 9.92373329511129e-05, + "loss": 0.0018, + "step": 1160 + }, + { + "epoch": 3.752727272727273, + "grad_norm": 0.0039301421493291855, + "learning_rate": 9.92333965427566e-05, + "loss": 0.0014, + "step": 1161 + }, + { + "epoch": 3.755959595959596, + "grad_norm": 0.006330995354801416, + "learning_rate": 9.922945008037417e-05, + "loss": 0.0018, + "step": 1162 + }, + { + "epoch": 3.7591919191919194, + "grad_norm": 0.010062163695693016, + "learning_rate": 9.922549356477152e-05, + "loss": 0.0026, + "step": 1163 + }, + { + "epoch": 3.7624242424242427, + "grad_norm": 0.00648066122084856, + "learning_rate": 9.922152699675664e-05, + "loss": 0.0015, + "step": 1164 + }, + { + "epoch": 3.765656565656566, + "grad_norm": 0.005979966837912798, + "learning_rate": 9.921755037713952e-05, + "loss": 0.0017, + "step": 1165 + }, + { + "epoch": 3.7688888888888887, + "grad_norm": 0.005628496408462524, + "learning_rate": 9.921356370673225e-05, + "loss": 0.0017, + "step": 1166 + }, + { + "epoch": 3.772121212121212, + "grad_norm": 0.008315288461744785, + "learning_rate": 9.920956698634896e-05, + "loss": 0.0019, + "step": 1167 + }, + { + "epoch": 3.7753535353535352, + "grad_norm": 0.005807152483612299, + "learning_rate": 9.92055602168058e-05, + "loss": 0.0014, + "step": 1168 + }, + { + "epoch": 3.7785858585858585, + "grad_norm": 0.006952639203518629, + "learning_rate": 9.920154339892104e-05, + "loss": 0.0016, + "step": 1169 + }, + { + "epoch": 3.7818181818181817, + "grad_norm": 0.0038634459488093853, + "learning_rate": 9.919751653351493e-05, + "loss": 0.0011, + "step": 1170 + }, + { + "epoch": 3.785050505050505, + "grad_norm": 0.007332623470574617, + "learning_rate": 9.919347962140979e-05, + "loss": 0.0023, + "step": 1171 + }, + { + "epoch": 3.7882828282828283, + "grad_norm": 0.008147547952830791, + "learning_rate": 9.918943266343004e-05, + "loss": 0.0023, + "step": 1172 + }, + { + "epoch": 3.7915151515151515, + "grad_norm": 0.010149548761546612, + "learning_rate": 9.91853756604021e-05, + "loss": 0.0022, + "step": 1173 + }, + { + "epoch": 3.7947474747474748, + "grad_norm": 0.009069012477993965, + "learning_rate": 9.918130861315444e-05, + "loss": 0.0026, + "step": 1174 + }, + { + "epoch": 3.797979797979798, + "grad_norm": 0.0058683850802481174, + "learning_rate": 9.91772315225176e-05, + "loss": 0.002, + "step": 1175 + }, + { + "epoch": 3.797979797979798, + "eval_loss": 0.003167589195072651, + "eval_runtime": 18.5504, + "eval_samples_per_second": 5.391, + "eval_steps_per_second": 1.348, + "step": 1175 + }, + { + "epoch": 3.8012121212121213, + "grad_norm": 0.007021645549684763, + "learning_rate": 9.917314438932421e-05, + "loss": 0.0021, + "step": 1176 + }, + { + "epoch": 3.8044444444444445, + "grad_norm": 0.006806994788348675, + "learning_rate": 9.916904721440887e-05, + "loss": 0.0032, + "step": 1177 + }, + { + "epoch": 3.8076767676767678, + "grad_norm": 0.004510834813117981, + "learning_rate": 9.916493999860828e-05, + "loss": 0.0018, + "step": 1178 + }, + { + "epoch": 3.810909090909091, + "grad_norm": 0.005397962871938944, + "learning_rate": 9.916082274276117e-05, + "loss": 0.0022, + "step": 1179 + }, + { + "epoch": 3.8141414141414143, + "grad_norm": 0.0048407320864498615, + "learning_rate": 9.915669544770836e-05, + "loss": 0.0012, + "step": 1180 + }, + { + "epoch": 3.8173737373737375, + "grad_norm": 0.008071119897067547, + "learning_rate": 9.915255811429267e-05, + "loss": 0.0027, + "step": 1181 + }, + { + "epoch": 3.8206060606060603, + "grad_norm": 0.005428985692560673, + "learning_rate": 9.914841074335898e-05, + "loss": 0.002, + "step": 1182 + }, + { + "epoch": 3.8238383838383836, + "grad_norm": 0.00699599226936698, + "learning_rate": 9.914425333575426e-05, + "loss": 0.0019, + "step": 1183 + }, + { + "epoch": 3.827070707070707, + "grad_norm": 0.007088671904057264, + "learning_rate": 9.914008589232749e-05, + "loss": 0.0024, + "step": 1184 + }, + { + "epoch": 3.83030303030303, + "grad_norm": 0.008141590282320976, + "learning_rate": 9.91359084139297e-05, + "loss": 0.0028, + "step": 1185 + }, + { + "epoch": 3.8335353535353534, + "grad_norm": 0.006589834112673998, + "learning_rate": 9.913172090141399e-05, + "loss": 0.002, + "step": 1186 + }, + { + "epoch": 3.8367676767676766, + "grad_norm": 0.008189876563847065, + "learning_rate": 9.912752335563548e-05, + "loss": 0.0019, + "step": 1187 + }, + { + "epoch": 3.84, + "grad_norm": 0.005289820954203606, + "learning_rate": 9.912331577745138e-05, + "loss": 0.0017, + "step": 1188 + }, + { + "epoch": 3.843232323232323, + "grad_norm": 0.0077620758675038815, + "learning_rate": 9.911909816772091e-05, + "loss": 0.0024, + "step": 1189 + }, + { + "epoch": 3.8464646464646464, + "grad_norm": 0.006231298670172691, + "learning_rate": 9.911487052730537e-05, + "loss": 0.0016, + "step": 1190 + }, + { + "epoch": 3.8496969696969696, + "grad_norm": 0.007889009080827236, + "learning_rate": 9.911063285706808e-05, + "loss": 0.002, + "step": 1191 + }, + { + "epoch": 3.852929292929293, + "grad_norm": 0.01032167673110962, + "learning_rate": 9.910638515787442e-05, + "loss": 0.0014, + "step": 1192 + }, + { + "epoch": 3.856161616161616, + "grad_norm": 0.007993797771632671, + "learning_rate": 9.910212743059182e-05, + "loss": 0.0022, + "step": 1193 + }, + { + "epoch": 3.8593939393939394, + "grad_norm": 0.005151242949068546, + "learning_rate": 9.909785967608977e-05, + "loss": 0.0015, + "step": 1194 + }, + { + "epoch": 3.8626262626262626, + "grad_norm": 0.007885465398430824, + "learning_rate": 9.909358189523978e-05, + "loss": 0.0024, + "step": 1195 + }, + { + "epoch": 3.865858585858586, + "grad_norm": 0.009114793501794338, + "learning_rate": 9.908929408891542e-05, + "loss": 0.0015, + "step": 1196 + }, + { + "epoch": 3.869090909090909, + "grad_norm": 0.00829376745969057, + "learning_rate": 9.908499625799235e-05, + "loss": 0.002, + "step": 1197 + }, + { + "epoch": 3.8723232323232324, + "grad_norm": 0.007342406082898378, + "learning_rate": 9.908068840334818e-05, + "loss": 0.0021, + "step": 1198 + }, + { + "epoch": 3.8755555555555556, + "grad_norm": 0.0075923847034573555, + "learning_rate": 9.907637052586265e-05, + "loss": 0.0019, + "step": 1199 + }, + { + "epoch": 3.878787878787879, + "grad_norm": 0.008298894390463829, + "learning_rate": 9.907204262641751e-05, + "loss": 0.002, + "step": 1200 + }, + { + "epoch": 3.878787878787879, + "eval_loss": 0.0031958832405507565, + "eval_runtime": 18.5457, + "eval_samples_per_second": 5.392, + "eval_steps_per_second": 1.348, + "step": 1200 + } + ], + "logging_steps": 1, + "max_steps": 7725, + "num_input_tokens_seen": 0, + "num_train_epochs": 25, + "save_steps": 100, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.684921203405947e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}