{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.014009649145849203, "eval_steps": 200, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 7.004824572924602e-05, "grad_norm": 6.222772121429443, "learning_rate": 9.99930017513135e-05, "loss": 1.1076, "num_input_tokens_seen": 16384, "step": 1 }, { "epoch": 0.00014009649145849205, "grad_norm": 6.042057037353516, "learning_rate": 9.998600350262697e-05, "loss": 1.1086, "num_input_tokens_seen": 32768, "step": 2 }, { "epoch": 0.00021014473718773804, "grad_norm": 7.119229316711426, "learning_rate": 9.997900525394046e-05, "loss": 1.4047, "num_input_tokens_seen": 49152, "step": 3 }, { "epoch": 0.0002801929829169841, "grad_norm": 7.133191108703613, "learning_rate": 9.997200700525395e-05, "loss": 1.3921, "num_input_tokens_seen": 65536, "step": 4 }, { "epoch": 0.0003502412286462301, "grad_norm": 6.1078338623046875, "learning_rate": 9.996500875656743e-05, "loss": 1.3171, "num_input_tokens_seen": 81920, "step": 5 }, { "epoch": 0.0004202894743754761, "grad_norm": 6.466420650482178, "learning_rate": 9.995801050788092e-05, "loss": 1.0732, "num_input_tokens_seen": 97344, "step": 6 }, { "epoch": 0.0004903377201047221, "grad_norm": 5.578189849853516, "learning_rate": 9.99510122591944e-05, "loss": 0.9929, "num_input_tokens_seen": 113728, "step": 7 }, { "epoch": 0.0005603859658339682, "grad_norm": 7.197720527648926, "learning_rate": 9.994401401050789e-05, "loss": 1.2512, "num_input_tokens_seen": 129528, "step": 8 }, { "epoch": 0.0006304342115632141, "grad_norm": 6.618913650512695, "learning_rate": 9.993701576182136e-05, "loss": 1.3495, "num_input_tokens_seen": 145704, "step": 9 }, { "epoch": 0.0007004824572924602, "grad_norm": 6.955508232116699, "learning_rate": 9.993001751313485e-05, "loss": 1.1823, "num_input_tokens_seen": 161664, "step": 10 }, { "epoch": 0.0007705307030217062, "grad_norm": 6.6807074546813965, "learning_rate": 9.992301926444835e-05, "loss": 1.1693, "num_input_tokens_seen": 177960, "step": 11 }, { "epoch": 0.0008405789487509522, "grad_norm": 6.784447193145752, "learning_rate": 9.991602101576183e-05, "loss": 1.3744, "num_input_tokens_seen": 194344, "step": 12 }, { "epoch": 0.0009106271944801982, "grad_norm": 6.7418437004089355, "learning_rate": 9.990902276707532e-05, "loss": 1.22, "num_input_tokens_seen": 210728, "step": 13 }, { "epoch": 0.0009806754402094443, "grad_norm": 6.43395471572876, "learning_rate": 9.990202451838879e-05, "loss": 1.1772, "num_input_tokens_seen": 227112, "step": 14 }, { "epoch": 0.0010507236859386903, "grad_norm": 6.09422492980957, "learning_rate": 9.989502626970228e-05, "loss": 1.195, "num_input_tokens_seen": 243496, "step": 15 }, { "epoch": 0.0011207719316679364, "grad_norm": 6.238271236419678, "learning_rate": 9.988802802101577e-05, "loss": 1.2623, "num_input_tokens_seen": 259744, "step": 16 }, { "epoch": 0.0011908201773971822, "grad_norm": 6.56187629699707, "learning_rate": 9.988102977232926e-05, "loss": 1.2721, "num_input_tokens_seen": 276128, "step": 17 }, { "epoch": 0.0012608684231264283, "grad_norm": 6.818358898162842, "learning_rate": 9.987403152364275e-05, "loss": 1.2649, "num_input_tokens_seen": 292512, "step": 18 }, { "epoch": 0.0013309166688556743, "grad_norm": 5.950352191925049, "learning_rate": 9.986703327495622e-05, "loss": 1.0024, "num_input_tokens_seen": 308632, "step": 19 }, { "epoch": 0.0014009649145849204, "grad_norm": 6.387479305267334, "learning_rate": 9.986003502626971e-05, "loss": 1.2783, "num_input_tokens_seen": 325016, "step": 20 }, { "epoch": 0.0014710131603141664, "grad_norm": 6.187346458435059, "learning_rate": 9.985303677758318e-05, "loss": 1.1701, "num_input_tokens_seen": 341384, "step": 21 }, { "epoch": 0.0015410614060434125, "grad_norm": 5.371951103210449, "learning_rate": 9.984603852889667e-05, "loss": 1.0483, "num_input_tokens_seen": 357768, "step": 22 }, { "epoch": 0.0016111096517726585, "grad_norm": 6.2206807136535645, "learning_rate": 9.983904028021016e-05, "loss": 1.2516, "num_input_tokens_seen": 374152, "step": 23 }, { "epoch": 0.0016811578975019044, "grad_norm": 6.121264457702637, "learning_rate": 9.983204203152365e-05, "loss": 1.1506, "num_input_tokens_seen": 390536, "step": 24 }, { "epoch": 0.0017512061432311504, "grad_norm": 6.353756904602051, "learning_rate": 9.982504378283714e-05, "loss": 1.3118, "num_input_tokens_seen": 406920, "step": 25 }, { "epoch": 0.0018212543889603965, "grad_norm": 6.270686149597168, "learning_rate": 9.981804553415061e-05, "loss": 1.0883, "num_input_tokens_seen": 422728, "step": 26 }, { "epoch": 0.0018913026346896425, "grad_norm": 6.117632865905762, "learning_rate": 9.98110472854641e-05, "loss": 1.3346, "num_input_tokens_seen": 439112, "step": 27 }, { "epoch": 0.0019613508804188886, "grad_norm": 6.429015159606934, "learning_rate": 9.980404903677759e-05, "loss": 1.2494, "num_input_tokens_seen": 455144, "step": 28 }, { "epoch": 0.0020313991261481346, "grad_norm": 6.4467620849609375, "learning_rate": 9.979705078809107e-05, "loss": 1.3335, "num_input_tokens_seen": 470360, "step": 29 }, { "epoch": 0.0021014473718773807, "grad_norm": 6.57926082611084, "learning_rate": 9.979005253940455e-05, "loss": 1.2126, "num_input_tokens_seen": 486120, "step": 30 }, { "epoch": 0.0021714956176066267, "grad_norm": 5.650569915771484, "learning_rate": 9.978305429071804e-05, "loss": 1.1363, "num_input_tokens_seen": 501896, "step": 31 }, { "epoch": 0.0022415438633358728, "grad_norm": 6.380292892456055, "learning_rate": 9.977605604203153e-05, "loss": 1.2251, "num_input_tokens_seen": 517752, "step": 32 }, { "epoch": 0.002311592109065119, "grad_norm": 5.704173564910889, "learning_rate": 9.976905779334502e-05, "loss": 1.1685, "num_input_tokens_seen": 534136, "step": 33 }, { "epoch": 0.0023816403547943644, "grad_norm": 5.342978000640869, "learning_rate": 9.97620595446585e-05, "loss": 1.2012, "num_input_tokens_seen": 550216, "step": 34 }, { "epoch": 0.0024516886005236105, "grad_norm": 5.7014241218566895, "learning_rate": 9.975506129597198e-05, "loss": 1.2342, "num_input_tokens_seen": 566600, "step": 35 }, { "epoch": 0.0025217368462528565, "grad_norm": 6.26229190826416, "learning_rate": 9.974806304728546e-05, "loss": 1.2041, "num_input_tokens_seen": 582984, "step": 36 }, { "epoch": 0.0025917850919821026, "grad_norm": 6.583463191986084, "learning_rate": 9.974106479859896e-05, "loss": 1.3021, "num_input_tokens_seen": 598968, "step": 37 }, { "epoch": 0.0026618333377113486, "grad_norm": 5.58498477935791, "learning_rate": 9.973406654991245e-05, "loss": 1.1622, "num_input_tokens_seen": 614840, "step": 38 }, { "epoch": 0.0027318815834405947, "grad_norm": 5.906906604766846, "learning_rate": 9.972706830122592e-05, "loss": 1.1971, "num_input_tokens_seen": 631224, "step": 39 }, { "epoch": 0.0028019298291698407, "grad_norm": 5.962359428405762, "learning_rate": 9.972007005253941e-05, "loss": 1.1326, "num_input_tokens_seen": 647000, "step": 40 }, { "epoch": 0.002871978074899087, "grad_norm": 6.447500705718994, "learning_rate": 9.971307180385289e-05, "loss": 1.0905, "num_input_tokens_seen": 662480, "step": 41 }, { "epoch": 0.002942026320628333, "grad_norm": 5.7290520668029785, "learning_rate": 9.970607355516638e-05, "loss": 1.3585, "num_input_tokens_seen": 678480, "step": 42 }, { "epoch": 0.003012074566357579, "grad_norm": 6.063445568084717, "learning_rate": 9.969907530647987e-05, "loss": 1.2841, "num_input_tokens_seen": 694256, "step": 43 }, { "epoch": 0.003082122812086825, "grad_norm": 5.302809238433838, "learning_rate": 9.969207705779335e-05, "loss": 1.1168, "num_input_tokens_seen": 710152, "step": 44 }, { "epoch": 0.003152171057816071, "grad_norm": 5.634128093719482, "learning_rate": 9.968507880910684e-05, "loss": 1.0609, "num_input_tokens_seen": 726184, "step": 45 }, { "epoch": 0.003222219303545317, "grad_norm": 5.652642726898193, "learning_rate": 9.967808056042032e-05, "loss": 1.2228, "num_input_tokens_seen": 742520, "step": 46 }, { "epoch": 0.0032922675492745627, "grad_norm": 5.340751647949219, "learning_rate": 9.96710823117338e-05, "loss": 1.0595, "num_input_tokens_seen": 758904, "step": 47 }, { "epoch": 0.0033623157950038087, "grad_norm": 5.422239780426025, "learning_rate": 9.966408406304728e-05, "loss": 1.1161, "num_input_tokens_seen": 775040, "step": 48 }, { "epoch": 0.0034323640407330548, "grad_norm": 5.29241418838501, "learning_rate": 9.965708581436077e-05, "loss": 1.0255, "num_input_tokens_seen": 790856, "step": 49 }, { "epoch": 0.003502412286462301, "grad_norm": 5.146270275115967, "learning_rate": 9.965008756567426e-05, "loss": 0.9762, "num_input_tokens_seen": 807064, "step": 50 }, { "epoch": 0.003572460532191547, "grad_norm": 5.825758457183838, "learning_rate": 9.964308931698775e-05, "loss": 1.2108, "num_input_tokens_seen": 823448, "step": 51 }, { "epoch": 0.003642508777920793, "grad_norm": 6.179538726806641, "learning_rate": 9.963609106830124e-05, "loss": 1.322, "num_input_tokens_seen": 838888, "step": 52 }, { "epoch": 0.003712557023650039, "grad_norm": 6.464454174041748, "learning_rate": 9.962909281961471e-05, "loss": 1.5077, "num_input_tokens_seen": 855272, "step": 53 }, { "epoch": 0.003782605269379285, "grad_norm": 5.4227294921875, "learning_rate": 9.96220945709282e-05, "loss": 1.2679, "num_input_tokens_seen": 871656, "step": 54 }, { "epoch": 0.003852653515108531, "grad_norm": 5.949041366577148, "learning_rate": 9.961509632224169e-05, "loss": 1.3618, "num_input_tokens_seen": 888040, "step": 55 }, { "epoch": 0.003922701760837777, "grad_norm": 6.050904750823975, "learning_rate": 9.960809807355516e-05, "loss": 1.3155, "num_input_tokens_seen": 904400, "step": 56 }, { "epoch": 0.003992750006567023, "grad_norm": 6.048308849334717, "learning_rate": 9.960109982486866e-05, "loss": 1.3131, "num_input_tokens_seen": 919952, "step": 57 }, { "epoch": 0.004062798252296269, "grad_norm": 5.683863162994385, "learning_rate": 9.959410157618214e-05, "loss": 1.1692, "num_input_tokens_seen": 936336, "step": 58 }, { "epoch": 0.004132846498025515, "grad_norm": 5.449287414550781, "learning_rate": 9.958710332749563e-05, "loss": 1.0613, "num_input_tokens_seen": 952152, "step": 59 }, { "epoch": 0.004202894743754761, "grad_norm": 5.31496524810791, "learning_rate": 9.958010507880912e-05, "loss": 0.9605, "num_input_tokens_seen": 967824, "step": 60 }, { "epoch": 0.004272942989484007, "grad_norm": 5.57105016708374, "learning_rate": 9.957310683012259e-05, "loss": 1.1701, "num_input_tokens_seen": 983864, "step": 61 }, { "epoch": 0.004342991235213253, "grad_norm": 5.3456830978393555, "learning_rate": 9.956610858143608e-05, "loss": 1.0995, "num_input_tokens_seen": 1000248, "step": 62 }, { "epoch": 0.004413039480942499, "grad_norm": 5.453295707702637, "learning_rate": 9.955911033274957e-05, "loss": 1.2413, "num_input_tokens_seen": 1016632, "step": 63 }, { "epoch": 0.0044830877266717455, "grad_norm": 4.975449562072754, "learning_rate": 9.955211208406306e-05, "loss": 1.0961, "num_input_tokens_seen": 1033016, "step": 64 }, { "epoch": 0.004553135972400991, "grad_norm": 5.542137145996094, "learning_rate": 9.954511383537655e-05, "loss": 1.1171, "num_input_tokens_seen": 1049400, "step": 65 }, { "epoch": 0.004623184218130238, "grad_norm": 5.213950157165527, "learning_rate": 9.953811558669002e-05, "loss": 1.2228, "num_input_tokens_seen": 1065784, "step": 66 }, { "epoch": 0.004693232463859483, "grad_norm": 5.496099948883057, "learning_rate": 9.953111733800351e-05, "loss": 1.1529, "num_input_tokens_seen": 1082168, "step": 67 }, { "epoch": 0.004763280709588729, "grad_norm": 5.64145565032959, "learning_rate": 9.952411908931698e-05, "loss": 1.2301, "num_input_tokens_seen": 1098024, "step": 68 }, { "epoch": 0.004833328955317975, "grad_norm": 5.566709995269775, "learning_rate": 9.951712084063047e-05, "loss": 1.2679, "num_input_tokens_seen": 1114408, "step": 69 }, { "epoch": 0.004903377201047221, "grad_norm": 6.443673133850098, "learning_rate": 9.951012259194396e-05, "loss": 1.2313, "num_input_tokens_seen": 1130792, "step": 70 }, { "epoch": 0.0049734254467764675, "grad_norm": 5.882962226867676, "learning_rate": 9.950312434325745e-05, "loss": 1.4304, "num_input_tokens_seen": 1147176, "step": 71 }, { "epoch": 0.005043473692505713, "grad_norm": 6.0052666664123535, "learning_rate": 9.949612609457094e-05, "loss": 1.3027, "num_input_tokens_seen": 1160968, "step": 72 }, { "epoch": 0.0051135219382349596, "grad_norm": 5.260256767272949, "learning_rate": 9.948912784588441e-05, "loss": 1.1526, "num_input_tokens_seen": 1177352, "step": 73 }, { "epoch": 0.005183570183964205, "grad_norm": 5.641814708709717, "learning_rate": 9.94821295971979e-05, "loss": 1.0666, "num_input_tokens_seen": 1193032, "step": 74 }, { "epoch": 0.005253618429693452, "grad_norm": 5.121115207672119, "learning_rate": 9.947513134851138e-05, "loss": 1.2404, "num_input_tokens_seen": 1208952, "step": 75 }, { "epoch": 0.005323666675422697, "grad_norm": 5.63930082321167, "learning_rate": 9.946813309982487e-05, "loss": 1.5127, "num_input_tokens_seen": 1225000, "step": 76 }, { "epoch": 0.005393714921151944, "grad_norm": 4.880716800689697, "learning_rate": 9.946113485113837e-05, "loss": 1.1484, "num_input_tokens_seen": 1241384, "step": 77 }, { "epoch": 0.005463763166881189, "grad_norm": 5.59611177444458, "learning_rate": 9.945413660245184e-05, "loss": 1.1678, "num_input_tokens_seen": 1257680, "step": 78 }, { "epoch": 0.005533811412610436, "grad_norm": 5.052026271820068, "learning_rate": 9.944713835376533e-05, "loss": 1.2207, "num_input_tokens_seen": 1274064, "step": 79 }, { "epoch": 0.0056038596583396815, "grad_norm": 5.285096168518066, "learning_rate": 9.944014010507881e-05, "loss": 1.1457, "num_input_tokens_seen": 1290448, "step": 80 }, { "epoch": 0.005673907904068927, "grad_norm": 5.4286580085754395, "learning_rate": 9.94331418563923e-05, "loss": 1.3047, "num_input_tokens_seen": 1306832, "step": 81 }, { "epoch": 0.005743956149798174, "grad_norm": 5.937953472137451, "learning_rate": 9.942614360770578e-05, "loss": 1.4353, "num_input_tokens_seen": 1323216, "step": 82 }, { "epoch": 0.005814004395527419, "grad_norm": 5.129006385803223, "learning_rate": 9.941914535901927e-05, "loss": 1.1434, "num_input_tokens_seen": 1339408, "step": 83 }, { "epoch": 0.005884052641256666, "grad_norm": 5.179675102233887, "learning_rate": 9.941214711033276e-05, "loss": 1.2452, "num_input_tokens_seen": 1355792, "step": 84 }, { "epoch": 0.005954100886985911, "grad_norm": 4.912832736968994, "learning_rate": 9.940514886164624e-05, "loss": 1.1255, "num_input_tokens_seen": 1372176, "step": 85 }, { "epoch": 0.006024149132715158, "grad_norm": 5.190899848937988, "learning_rate": 9.939815061295973e-05, "loss": 1.2543, "num_input_tokens_seen": 1388560, "step": 86 }, { "epoch": 0.006094197378444403, "grad_norm": 5.1751275062561035, "learning_rate": 9.939115236427321e-05, "loss": 1.3145, "num_input_tokens_seen": 1404944, "step": 87 }, { "epoch": 0.00616424562417365, "grad_norm": 5.450705528259277, "learning_rate": 9.938415411558669e-05, "loss": 1.2844, "num_input_tokens_seen": 1421328, "step": 88 }, { "epoch": 0.0062342938699028955, "grad_norm": 5.593935012817383, "learning_rate": 9.937715586690018e-05, "loss": 1.3284, "num_input_tokens_seen": 1437464, "step": 89 }, { "epoch": 0.006304342115632142, "grad_norm": 5.156428813934326, "learning_rate": 9.937015761821367e-05, "loss": 1.1682, "num_input_tokens_seen": 1452952, "step": 90 }, { "epoch": 0.006374390361361388, "grad_norm": 4.673638820648193, "learning_rate": 9.936315936952715e-05, "loss": 1.004, "num_input_tokens_seen": 1469336, "step": 91 }, { "epoch": 0.006444438607090634, "grad_norm": 4.996700763702393, "learning_rate": 9.935616112084064e-05, "loss": 1.087, "num_input_tokens_seen": 1485448, "step": 92 }, { "epoch": 0.00651448685281988, "grad_norm": 4.817474365234375, "learning_rate": 9.934916287215412e-05, "loss": 1.151, "num_input_tokens_seen": 1501472, "step": 93 }, { "epoch": 0.006584535098549125, "grad_norm": 5.400479316711426, "learning_rate": 9.934216462346761e-05, "loss": 1.3144, "num_input_tokens_seen": 1516424, "step": 94 }, { "epoch": 0.006654583344278372, "grad_norm": 5.232216835021973, "learning_rate": 9.933516637478108e-05, "loss": 1.0019, "num_input_tokens_seen": 1532792, "step": 95 }, { "epoch": 0.006724631590007617, "grad_norm": 5.392521381378174, "learning_rate": 9.932816812609457e-05, "loss": 1.3195, "num_input_tokens_seen": 1548600, "step": 96 }, { "epoch": 0.006794679835736864, "grad_norm": 5.5280866622924805, "learning_rate": 9.932116987740806e-05, "loss": 1.283, "num_input_tokens_seen": 1564088, "step": 97 }, { "epoch": 0.0068647280814661095, "grad_norm": 4.963179588317871, "learning_rate": 9.931417162872155e-05, "loss": 1.2716, "num_input_tokens_seen": 1580040, "step": 98 }, { "epoch": 0.006934776327195356, "grad_norm": 4.920302391052246, "learning_rate": 9.930717338003504e-05, "loss": 1.088, "num_input_tokens_seen": 1595880, "step": 99 }, { "epoch": 0.007004824572924602, "grad_norm": 4.935486793518066, "learning_rate": 9.930017513134851e-05, "loss": 1.0122, "num_input_tokens_seen": 1611864, "step": 100 }, { "epoch": 0.007074872818653848, "grad_norm": 5.099087238311768, "learning_rate": 9.9293176882662e-05, "loss": 1.1605, "num_input_tokens_seen": 1627472, "step": 101 }, { "epoch": 0.007144921064383094, "grad_norm": 5.3764328956604, "learning_rate": 9.928617863397548e-05, "loss": 1.2225, "num_input_tokens_seen": 1643856, "step": 102 }, { "epoch": 0.00721496931011234, "grad_norm": 5.281564712524414, "learning_rate": 9.927918038528898e-05, "loss": 1.1483, "num_input_tokens_seen": 1660240, "step": 103 }, { "epoch": 0.007285017555841586, "grad_norm": 5.395167827606201, "learning_rate": 9.927218213660247e-05, "loss": 1.6014, "num_input_tokens_seen": 1676624, "step": 104 }, { "epoch": 0.007355065801570832, "grad_norm": 5.322319507598877, "learning_rate": 9.926518388791594e-05, "loss": 1.0933, "num_input_tokens_seen": 1693008, "step": 105 }, { "epoch": 0.007425114047300078, "grad_norm": 5.301229953765869, "learning_rate": 9.925818563922943e-05, "loss": 1.1998, "num_input_tokens_seen": 1708424, "step": 106 }, { "epoch": 0.0074951622930293236, "grad_norm": 4.958597183227539, "learning_rate": 9.92511873905429e-05, "loss": 1.3285, "num_input_tokens_seen": 1724808, "step": 107 }, { "epoch": 0.00756521053875857, "grad_norm": 4.3913960456848145, "learning_rate": 9.924418914185639e-05, "loss": 0.9017, "num_input_tokens_seen": 1740752, "step": 108 }, { "epoch": 0.007635258784487816, "grad_norm": 5.401021480560303, "learning_rate": 9.923719089316988e-05, "loss": 1.3646, "num_input_tokens_seen": 1755176, "step": 109 }, { "epoch": 0.007705307030217062, "grad_norm": 4.894444942474365, "learning_rate": 9.923019264448337e-05, "loss": 0.9955, "num_input_tokens_seen": 1771560, "step": 110 }, { "epoch": 0.007775355275946308, "grad_norm": 4.878688335418701, "learning_rate": 9.922319439579686e-05, "loss": 1.1766, "num_input_tokens_seen": 1787944, "step": 111 }, { "epoch": 0.007845403521675554, "grad_norm": 4.9379777908325195, "learning_rate": 9.921619614711033e-05, "loss": 1.1631, "num_input_tokens_seen": 1803568, "step": 112 }, { "epoch": 0.0079154517674048, "grad_norm": 5.101811408996582, "learning_rate": 9.920919789842382e-05, "loss": 1.2165, "num_input_tokens_seen": 1819952, "step": 113 }, { "epoch": 0.007985500013134045, "grad_norm": 5.32574987411499, "learning_rate": 9.920219964973731e-05, "loss": 1.3012, "num_input_tokens_seen": 1835296, "step": 114 }, { "epoch": 0.008055548258863293, "grad_norm": 5.2391180992126465, "learning_rate": 9.919520140105079e-05, "loss": 1.2451, "num_input_tokens_seen": 1851224, "step": 115 }, { "epoch": 0.008125596504592538, "grad_norm": 4.865017890930176, "learning_rate": 9.918820315236427e-05, "loss": 1.1683, "num_input_tokens_seen": 1867608, "step": 116 }, { "epoch": 0.008195644750321784, "grad_norm": 4.943136215209961, "learning_rate": 9.918120490367776e-05, "loss": 1.31, "num_input_tokens_seen": 1883696, "step": 117 }, { "epoch": 0.00826569299605103, "grad_norm": 4.769871711730957, "learning_rate": 9.917420665499125e-05, "loss": 1.1212, "num_input_tokens_seen": 1900080, "step": 118 }, { "epoch": 0.008335741241780275, "grad_norm": 4.785780429840088, "learning_rate": 9.916720840630474e-05, "loss": 1.2415, "num_input_tokens_seen": 1916464, "step": 119 }, { "epoch": 0.008405789487509523, "grad_norm": 4.802333831787109, "learning_rate": 9.916021015761822e-05, "loss": 1.0513, "num_input_tokens_seen": 1932848, "step": 120 }, { "epoch": 0.008475837733238768, "grad_norm": 5.22212553024292, "learning_rate": 9.91532119089317e-05, "loss": 1.2574, "num_input_tokens_seen": 1949232, "step": 121 }, { "epoch": 0.008545885978968014, "grad_norm": 5.104204177856445, "learning_rate": 9.914621366024518e-05, "loss": 1.0436, "num_input_tokens_seen": 1964184, "step": 122 }, { "epoch": 0.00861593422469726, "grad_norm": 5.11055326461792, "learning_rate": 9.913921541155868e-05, "loss": 1.1939, "num_input_tokens_seen": 1980568, "step": 123 }, { "epoch": 0.008685982470426507, "grad_norm": 4.784866809844971, "learning_rate": 9.913221716287216e-05, "loss": 1.2056, "num_input_tokens_seen": 1996952, "step": 124 }, { "epoch": 0.008756030716155752, "grad_norm": 4.763037204742432, "learning_rate": 9.912521891418564e-05, "loss": 1.1403, "num_input_tokens_seen": 2013336, "step": 125 }, { "epoch": 0.008826078961884998, "grad_norm": 4.813408851623535, "learning_rate": 9.911822066549913e-05, "loss": 1.1897, "num_input_tokens_seen": 2029720, "step": 126 }, { "epoch": 0.008896127207614244, "grad_norm": 4.79008674621582, "learning_rate": 9.911122241681261e-05, "loss": 1.2315, "num_input_tokens_seen": 2046104, "step": 127 }, { "epoch": 0.008966175453343491, "grad_norm": 4.843508720397949, "learning_rate": 9.91042241681261e-05, "loss": 1.0883, "num_input_tokens_seen": 2061592, "step": 128 }, { "epoch": 0.009036223699072737, "grad_norm": 4.917592525482178, "learning_rate": 9.909722591943959e-05, "loss": 1.2512, "num_input_tokens_seen": 2077792, "step": 129 }, { "epoch": 0.009106271944801982, "grad_norm": 4.9154133796691895, "learning_rate": 9.909022767075307e-05, "loss": 1.3284, "num_input_tokens_seen": 2094176, "step": 130 }, { "epoch": 0.009176320190531228, "grad_norm": 5.2125420570373535, "learning_rate": 9.908322942206656e-05, "loss": 1.3469, "num_input_tokens_seen": 2110480, "step": 131 }, { "epoch": 0.009246368436260475, "grad_norm": 4.715712547302246, "learning_rate": 9.907623117338004e-05, "loss": 1.0844, "num_input_tokens_seen": 2126864, "step": 132 }, { "epoch": 0.009316416681989721, "grad_norm": 4.805694580078125, "learning_rate": 9.906923292469353e-05, "loss": 1.069, "num_input_tokens_seen": 2142848, "step": 133 }, { "epoch": 0.009386464927718966, "grad_norm": 4.961355209350586, "learning_rate": 9.9062234676007e-05, "loss": 1.3387, "num_input_tokens_seen": 2159232, "step": 134 }, { "epoch": 0.009456513173448212, "grad_norm": 4.582219123840332, "learning_rate": 9.905523642732049e-05, "loss": 1.2013, "num_input_tokens_seen": 2175616, "step": 135 }, { "epoch": 0.009526561419177458, "grad_norm": 5.195998191833496, "learning_rate": 9.904823817863398e-05, "loss": 1.2552, "num_input_tokens_seen": 2191872, "step": 136 }, { "epoch": 0.009596609664906705, "grad_norm": 4.934189319610596, "learning_rate": 9.904123992994747e-05, "loss": 1.2961, "num_input_tokens_seen": 2208208, "step": 137 }, { "epoch": 0.00966665791063595, "grad_norm": 4.981037616729736, "learning_rate": 9.903424168126096e-05, "loss": 1.1546, "num_input_tokens_seen": 2224592, "step": 138 }, { "epoch": 0.009736706156365196, "grad_norm": 5.469496250152588, "learning_rate": 9.902724343257443e-05, "loss": 1.3833, "num_input_tokens_seen": 2240976, "step": 139 }, { "epoch": 0.009806754402094442, "grad_norm": 4.889583587646484, "learning_rate": 9.902024518388792e-05, "loss": 1.2095, "num_input_tokens_seen": 2257360, "step": 140 }, { "epoch": 0.00987680264782369, "grad_norm": 4.532052516937256, "learning_rate": 9.901324693520141e-05, "loss": 1.143, "num_input_tokens_seen": 2272848, "step": 141 }, { "epoch": 0.009946850893552935, "grad_norm": 5.278079032897949, "learning_rate": 9.900624868651488e-05, "loss": 1.2849, "num_input_tokens_seen": 2289232, "step": 142 }, { "epoch": 0.01001689913928218, "grad_norm": 4.549891948699951, "learning_rate": 9.899925043782839e-05, "loss": 1.0482, "num_input_tokens_seen": 2305424, "step": 143 }, { "epoch": 0.010086947385011426, "grad_norm": 4.7777180671691895, "learning_rate": 9.899225218914186e-05, "loss": 1.1926, "num_input_tokens_seen": 2320968, "step": 144 }, { "epoch": 0.010156995630740673, "grad_norm": 4.320313453674316, "learning_rate": 9.898525394045535e-05, "loss": 1.0468, "num_input_tokens_seen": 2337352, "step": 145 }, { "epoch": 0.010227043876469919, "grad_norm": 4.915202617645264, "learning_rate": 9.897825569176882e-05, "loss": 1.1326, "num_input_tokens_seen": 2353064, "step": 146 }, { "epoch": 0.010297092122199165, "grad_norm": 4.569783687591553, "learning_rate": 9.897125744308231e-05, "loss": 0.8586, "num_input_tokens_seen": 2369128, "step": 147 }, { "epoch": 0.01036714036792841, "grad_norm": 4.591664791107178, "learning_rate": 9.89642591943958e-05, "loss": 1.1369, "num_input_tokens_seen": 2385512, "step": 148 }, { "epoch": 0.010437188613657656, "grad_norm": 4.913016319274902, "learning_rate": 9.895726094570929e-05, "loss": 1.1564, "num_input_tokens_seen": 2401208, "step": 149 }, { "epoch": 0.010507236859386903, "grad_norm": 4.908018112182617, "learning_rate": 9.895026269702278e-05, "loss": 1.1247, "num_input_tokens_seen": 2417592, "step": 150 }, { "epoch": 0.010577285105116149, "grad_norm": 4.536910057067871, "learning_rate": 9.894326444833625e-05, "loss": 1.014, "num_input_tokens_seen": 2433976, "step": 151 }, { "epoch": 0.010647333350845395, "grad_norm": 4.899227142333984, "learning_rate": 9.893626619964974e-05, "loss": 1.0418, "num_input_tokens_seen": 2448072, "step": 152 }, { "epoch": 0.01071738159657464, "grad_norm": 4.600861072540283, "learning_rate": 9.892926795096323e-05, "loss": 1.0459, "num_input_tokens_seen": 2464240, "step": 153 }, { "epoch": 0.010787429842303888, "grad_norm": 4.707681179046631, "learning_rate": 9.89222697022767e-05, "loss": 1.0859, "num_input_tokens_seen": 2480624, "step": 154 }, { "epoch": 0.010857478088033133, "grad_norm": 4.748518466949463, "learning_rate": 9.89152714535902e-05, "loss": 1.0608, "num_input_tokens_seen": 2497008, "step": 155 }, { "epoch": 0.010927526333762379, "grad_norm": 4.794179439544678, "learning_rate": 9.890827320490368e-05, "loss": 1.2243, "num_input_tokens_seen": 2513392, "step": 156 }, { "epoch": 0.010997574579491624, "grad_norm": 4.593925476074219, "learning_rate": 9.890127495621717e-05, "loss": 1.1002, "num_input_tokens_seen": 2529776, "step": 157 }, { "epoch": 0.011067622825220872, "grad_norm": 4.318257808685303, "learning_rate": 9.889427670753066e-05, "loss": 0.9561, "num_input_tokens_seen": 2546160, "step": 158 }, { "epoch": 0.011137671070950117, "grad_norm": 4.631777286529541, "learning_rate": 9.888727845884414e-05, "loss": 1.1553, "num_input_tokens_seen": 2562544, "step": 159 }, { "epoch": 0.011207719316679363, "grad_norm": 4.896609783172607, "learning_rate": 9.888028021015762e-05, "loss": 1.1779, "num_input_tokens_seen": 2578088, "step": 160 }, { "epoch": 0.011277767562408609, "grad_norm": 4.3978681564331055, "learning_rate": 9.88732819614711e-05, "loss": 1.1778, "num_input_tokens_seen": 2594416, "step": 161 }, { "epoch": 0.011347815808137854, "grad_norm": 4.82927942276001, "learning_rate": 9.886628371278459e-05, "loss": 1.0339, "num_input_tokens_seen": 2609776, "step": 162 }, { "epoch": 0.011417864053867102, "grad_norm": 4.413319110870361, "learning_rate": 9.885928546409809e-05, "loss": 1.0992, "num_input_tokens_seen": 2626160, "step": 163 }, { "epoch": 0.011487912299596347, "grad_norm": 4.626354694366455, "learning_rate": 9.885228721541156e-05, "loss": 1.1948, "num_input_tokens_seen": 2642464, "step": 164 }, { "epoch": 0.011557960545325593, "grad_norm": 4.328434467315674, "learning_rate": 9.884528896672505e-05, "loss": 1.1493, "num_input_tokens_seen": 2658528, "step": 165 }, { "epoch": 0.011628008791054838, "grad_norm": 4.57839822769165, "learning_rate": 9.883829071803853e-05, "loss": 1.0775, "num_input_tokens_seen": 2674912, "step": 166 }, { "epoch": 0.011698057036784086, "grad_norm": 5.103973865509033, "learning_rate": 9.883129246935202e-05, "loss": 1.2458, "num_input_tokens_seen": 2690792, "step": 167 }, { "epoch": 0.011768105282513331, "grad_norm": 4.558016300201416, "learning_rate": 9.88242942206655e-05, "loss": 1.0122, "num_input_tokens_seen": 2705616, "step": 168 }, { "epoch": 0.011838153528242577, "grad_norm": 4.811260223388672, "learning_rate": 9.8817295971979e-05, "loss": 1.2989, "num_input_tokens_seen": 2721704, "step": 169 }, { "epoch": 0.011908201773971823, "grad_norm": 4.726966857910156, "learning_rate": 9.881029772329248e-05, "loss": 1.176, "num_input_tokens_seen": 2738088, "step": 170 }, { "epoch": 0.01197825001970107, "grad_norm": 4.874902725219727, "learning_rate": 9.880329947460596e-05, "loss": 1.2586, "num_input_tokens_seen": 2754040, "step": 171 }, { "epoch": 0.012048298265430316, "grad_norm": 4.379549980163574, "learning_rate": 9.879630122591945e-05, "loss": 1.1771, "num_input_tokens_seen": 2770424, "step": 172 }, { "epoch": 0.012118346511159561, "grad_norm": 4.455331802368164, "learning_rate": 9.878930297723292e-05, "loss": 1.0714, "num_input_tokens_seen": 2786808, "step": 173 }, { "epoch": 0.012188394756888807, "grad_norm": 4.42273473739624, "learning_rate": 9.878230472854641e-05, "loss": 1.1798, "num_input_tokens_seen": 2803176, "step": 174 }, { "epoch": 0.012258443002618052, "grad_norm": 4.4078874588012695, "learning_rate": 9.87753064798599e-05, "loss": 1.1672, "num_input_tokens_seen": 2819448, "step": 175 }, { "epoch": 0.0123284912483473, "grad_norm": 4.79048490524292, "learning_rate": 9.876830823117339e-05, "loss": 1.3331, "num_input_tokens_seen": 2835832, "step": 176 }, { "epoch": 0.012398539494076545, "grad_norm": 4.212133884429932, "learning_rate": 9.876130998248688e-05, "loss": 1.0007, "num_input_tokens_seen": 2851776, "step": 177 }, { "epoch": 0.012468587739805791, "grad_norm": 5.7587738037109375, "learning_rate": 9.875431173380035e-05, "loss": 1.4729, "num_input_tokens_seen": 2867896, "step": 178 }, { "epoch": 0.012538635985535037, "grad_norm": 4.3469462394714355, "learning_rate": 9.874731348511384e-05, "loss": 0.957, "num_input_tokens_seen": 2884280, "step": 179 }, { "epoch": 0.012608684231264284, "grad_norm": 4.584625244140625, "learning_rate": 9.874031523642733e-05, "loss": 1.0753, "num_input_tokens_seen": 2899208, "step": 180 }, { "epoch": 0.01267873247699353, "grad_norm": 4.544627666473389, "learning_rate": 9.87333169877408e-05, "loss": 1.1706, "num_input_tokens_seen": 2915416, "step": 181 }, { "epoch": 0.012748780722722775, "grad_norm": 4.8749237060546875, "learning_rate": 9.872631873905429e-05, "loss": 1.3382, "num_input_tokens_seen": 2931360, "step": 182 }, { "epoch": 0.01281882896845202, "grad_norm": 4.593903541564941, "learning_rate": 9.871932049036778e-05, "loss": 1.1588, "num_input_tokens_seen": 2947744, "step": 183 }, { "epoch": 0.012888877214181268, "grad_norm": 4.478219509124756, "learning_rate": 9.871232224168127e-05, "loss": 1.1013, "num_input_tokens_seen": 2963664, "step": 184 }, { "epoch": 0.012958925459910514, "grad_norm": 5.028106212615967, "learning_rate": 9.870532399299476e-05, "loss": 1.3223, "num_input_tokens_seen": 2980048, "step": 185 }, { "epoch": 0.01302897370563976, "grad_norm": 4.866946697235107, "learning_rate": 9.869832574430823e-05, "loss": 1.2376, "num_input_tokens_seen": 2995992, "step": 186 }, { "epoch": 0.013099021951369005, "grad_norm": 4.421341419219971, "learning_rate": 9.869132749562172e-05, "loss": 1.2252, "num_input_tokens_seen": 3012000, "step": 187 }, { "epoch": 0.01316907019709825, "grad_norm": 4.88083028793335, "learning_rate": 9.86843292469352e-05, "loss": 1.2951, "num_input_tokens_seen": 3028384, "step": 188 }, { "epoch": 0.013239118442827498, "grad_norm": 4.654318809509277, "learning_rate": 9.86773309982487e-05, "loss": 1.2839, "num_input_tokens_seen": 3044768, "step": 189 }, { "epoch": 0.013309166688556744, "grad_norm": 4.626763820648193, "learning_rate": 9.867033274956219e-05, "loss": 1.2389, "num_input_tokens_seen": 3061152, "step": 190 }, { "epoch": 0.01337921493428599, "grad_norm": 4.178484916687012, "learning_rate": 9.866333450087566e-05, "loss": 1.1186, "num_input_tokens_seen": 3077056, "step": 191 }, { "epoch": 0.013449263180015235, "grad_norm": 4.755034923553467, "learning_rate": 9.865633625218915e-05, "loss": 1.0594, "num_input_tokens_seen": 3093400, "step": 192 }, { "epoch": 0.013519311425744482, "grad_norm": 4.437506198883057, "learning_rate": 9.864933800350263e-05, "loss": 1.2078, "num_input_tokens_seen": 3109784, "step": 193 }, { "epoch": 0.013589359671473728, "grad_norm": 5.140488624572754, "learning_rate": 9.864233975481611e-05, "loss": 1.4312, "num_input_tokens_seen": 3124976, "step": 194 }, { "epoch": 0.013659407917202973, "grad_norm": 4.72155237197876, "learning_rate": 9.86353415061296e-05, "loss": 1.1752, "num_input_tokens_seen": 3140632, "step": 195 }, { "epoch": 0.013729456162932219, "grad_norm": 4.914645671844482, "learning_rate": 9.862834325744309e-05, "loss": 1.2464, "num_input_tokens_seen": 3156616, "step": 196 }, { "epoch": 0.013799504408661466, "grad_norm": 4.23387336730957, "learning_rate": 9.862134500875658e-05, "loss": 0.9722, "num_input_tokens_seen": 3172840, "step": 197 }, { "epoch": 0.013869552654390712, "grad_norm": 4.659370422363281, "learning_rate": 9.861434676007005e-05, "loss": 1.1981, "num_input_tokens_seen": 3188584, "step": 198 }, { "epoch": 0.013939600900119958, "grad_norm": 4.580902576446533, "learning_rate": 9.860734851138354e-05, "loss": 1.1913, "num_input_tokens_seen": 3204432, "step": 199 }, { "epoch": 0.014009649145849203, "grad_norm": 4.208237648010254, "learning_rate": 9.860035026269702e-05, "loss": 1.2056, "num_input_tokens_seen": 3220816, "step": 200 }, { "epoch": 0.014009649145849203, "eval_loss": 1.2226407527923584, "eval_runtime": 0.3992, "eval_samples_per_second": 2.505, "eval_steps_per_second": 2.505, "num_input_tokens_seen": 3220816, "step": 200 } ], "logging_steps": 1, "max_steps": 14275, "num_input_tokens_seen": 3220816, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6916343792996352.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }