diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 3.68600129617628, + "epoch": 4.0, "eval_steps": 500, - "global_step": 45500, + "global_step": 49376, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -318507,6 +318507,27138 @@ "learning_rate": 1.7444529456771233e-05, "loss": 0.2034, "step": 45500 + }, + { + "epoch": 3.6860823071937783, + "grad_norm": 0.08332670480012894, + "learning_rate": 1.7440028804176606e-05, + "loss": 0.2416, + "step": 45501 + }, + { + "epoch": 3.6861633182112765, + "grad_norm": 0.07239510118961334, + "learning_rate": 1.743552815158198e-05, + "loss": 0.1988, + "step": 45502 + }, + { + "epoch": 3.686244329228775, + "grad_norm": 0.08834824711084366, + "learning_rate": 1.7431027498987353e-05, + "loss": 0.234, + "step": 45503 + }, + { + "epoch": 3.6863253402462735, + "grad_norm": 0.06722290068864822, + "learning_rate": 1.7426526846392727e-05, + "loss": 0.236, + "step": 45504 + }, + { + "epoch": 3.6864063512637717, + "grad_norm": 0.07778163254261017, + "learning_rate": 1.74220261937981e-05, + "loss": 0.2244, + "step": 45505 + }, + { + "epoch": 3.6864873622812704, + "grad_norm": 0.06533099710941315, + "learning_rate": 1.7417525541203474e-05, + "loss": 0.181, + "step": 45506 + }, + { + "epoch": 3.6865683732987686, + "grad_norm": 0.07727663218975067, + "learning_rate": 1.741302488860885e-05, + "loss": 0.2273, + "step": 45507 + }, + { + "epoch": 3.686649384316267, + "grad_norm": 0.08147032558917999, + "learning_rate": 1.7408524236014225e-05, + "loss": 0.2455, + "step": 45508 + }, + { + "epoch": 3.6867303953337656, + "grad_norm": 0.07328832149505615, + "learning_rate": 1.7404023583419595e-05, + "loss": 0.2112, + "step": 45509 + }, + { + "epoch": 3.686811406351264, + "grad_norm": 0.08036161959171295, + "learning_rate": 1.7399522930824972e-05, + "loss": 0.2313, + "step": 45510 + }, + { + "epoch": 3.686892417368762, + "grad_norm": 0.060651037842035294, + "learning_rate": 1.7395022278230346e-05, + "loss": 0.181, + "step": 45511 + }, + { + "epoch": 3.6869734283862607, + "grad_norm": 0.06498783826828003, + "learning_rate": 1.7390521625635716e-05, + "loss": 0.2286, + "step": 45512 + }, + { + "epoch": 3.687054439403759, + "grad_norm": 0.07742474973201752, + "learning_rate": 1.7386020973041093e-05, + "loss": 0.2486, + "step": 45513 + }, + { + "epoch": 3.6871354504212572, + "grad_norm": 0.08565546572208405, + "learning_rate": 1.7381520320446466e-05, + "loss": 0.2706, + "step": 45514 + }, + { + "epoch": 3.687216461438756, + "grad_norm": 0.05786954611539841, + "learning_rate": 1.737701966785184e-05, + "loss": 0.2004, + "step": 45515 + }, + { + "epoch": 3.687297472456254, + "grad_norm": 0.07234077900648117, + "learning_rate": 1.7372519015257214e-05, + "loss": 0.2385, + "step": 45516 + }, + { + "epoch": 3.6873784834737524, + "grad_norm": 0.06998328864574432, + "learning_rate": 1.7368018362662587e-05, + "loss": 0.2232, + "step": 45517 + }, + { + "epoch": 3.6874594944912507, + "grad_norm": 0.08726705610752106, + "learning_rate": 1.736351771006796e-05, + "loss": 0.2643, + "step": 45518 + }, + { + "epoch": 3.6875405055087493, + "grad_norm": 0.06822450459003448, + "learning_rate": 1.7359017057473334e-05, + "loss": 0.1948, + "step": 45519 + }, + { + "epoch": 3.6876215165262476, + "grad_norm": 0.08558908849954605, + "learning_rate": 1.7354516404878708e-05, + "loss": 0.214, + "step": 45520 + }, + { + "epoch": 3.687702527543746, + "grad_norm": 0.0689425840973854, + "learning_rate": 1.735001575228408e-05, + "loss": 0.213, + "step": 45521 + }, + { + "epoch": 3.687783538561244, + "grad_norm": 0.06478188931941986, + "learning_rate": 1.7345515099689455e-05, + "loss": 0.2408, + "step": 45522 + }, + { + "epoch": 3.6878645495787428, + "grad_norm": 0.06854352355003357, + "learning_rate": 1.734101444709483e-05, + "loss": 0.2573, + "step": 45523 + }, + { + "epoch": 3.687945560596241, + "grad_norm": 0.09567511081695557, + "learning_rate": 1.7336513794500202e-05, + "loss": 0.2461, + "step": 45524 + }, + { + "epoch": 3.6880265716137393, + "grad_norm": 0.08633017539978027, + "learning_rate": 1.7332013141905576e-05, + "loss": 0.2201, + "step": 45525 + }, + { + "epoch": 3.688107582631238, + "grad_norm": 0.07063650339841843, + "learning_rate": 1.7327512489310953e-05, + "loss": 0.1685, + "step": 45526 + }, + { + "epoch": 3.688188593648736, + "grad_norm": 0.07147509604692459, + "learning_rate": 1.7323011836716323e-05, + "loss": 0.2507, + "step": 45527 + }, + { + "epoch": 3.6882696046662344, + "grad_norm": 0.08037501573562622, + "learning_rate": 1.73185111841217e-05, + "loss": 0.2337, + "step": 45528 + }, + { + "epoch": 3.688350615683733, + "grad_norm": 0.06710902601480484, + "learning_rate": 1.7314010531527074e-05, + "loss": 0.2117, + "step": 45529 + }, + { + "epoch": 3.6884316267012314, + "grad_norm": 0.07238079607486725, + "learning_rate": 1.7309509878932444e-05, + "loss": 0.2188, + "step": 45530 + }, + { + "epoch": 3.6885126377187296, + "grad_norm": 0.07218148559331894, + "learning_rate": 1.730500922633782e-05, + "loss": 0.227, + "step": 45531 + }, + { + "epoch": 3.6885936487362283, + "grad_norm": 0.09336385875940323, + "learning_rate": 1.7300508573743195e-05, + "loss": 0.2507, + "step": 45532 + }, + { + "epoch": 3.6886746597537265, + "grad_norm": 0.08338413387537003, + "learning_rate": 1.7296007921148568e-05, + "loss": 0.2168, + "step": 45533 + }, + { + "epoch": 3.688755670771225, + "grad_norm": 0.07480164617300034, + "learning_rate": 1.7291507268553942e-05, + "loss": 0.2406, + "step": 45534 + }, + { + "epoch": 3.6888366817887235, + "grad_norm": 0.061553705483675, + "learning_rate": 1.7287006615959315e-05, + "loss": 0.1878, + "step": 45535 + }, + { + "epoch": 3.6889176928062217, + "grad_norm": 0.08677572011947632, + "learning_rate": 1.728250596336469e-05, + "loss": 0.226, + "step": 45536 + }, + { + "epoch": 3.68899870382372, + "grad_norm": 0.096988245844841, + "learning_rate": 1.7278005310770063e-05, + "loss": 0.289, + "step": 45537 + }, + { + "epoch": 3.6890797148412187, + "grad_norm": 0.06231336668133736, + "learning_rate": 1.7273504658175436e-05, + "loss": 0.2039, + "step": 45538 + }, + { + "epoch": 3.689160725858717, + "grad_norm": 0.07273116707801819, + "learning_rate": 1.726900400558081e-05, + "loss": 0.2481, + "step": 45539 + }, + { + "epoch": 3.689241736876215, + "grad_norm": 0.06380755454301834, + "learning_rate": 1.7264503352986183e-05, + "loss": 0.2278, + "step": 45540 + }, + { + "epoch": 3.6893227478937134, + "grad_norm": 0.07865901291370392, + "learning_rate": 1.726000270039156e-05, + "loss": 0.237, + "step": 45541 + }, + { + "epoch": 3.689403758911212, + "grad_norm": 0.08549056202173233, + "learning_rate": 1.725550204779693e-05, + "loss": 0.196, + "step": 45542 + }, + { + "epoch": 3.6894847699287103, + "grad_norm": 0.07149454206228256, + "learning_rate": 1.7251001395202304e-05, + "loss": 0.2474, + "step": 45543 + }, + { + "epoch": 3.6895657809462086, + "grad_norm": 0.07113026827573776, + "learning_rate": 1.724650074260768e-05, + "loss": 0.2095, + "step": 45544 + }, + { + "epoch": 3.689646791963707, + "grad_norm": 0.07473193854093552, + "learning_rate": 1.724200009001305e-05, + "loss": 0.2236, + "step": 45545 + }, + { + "epoch": 3.6897278029812055, + "grad_norm": 0.08211532980203629, + "learning_rate": 1.723749943741843e-05, + "loss": 0.2621, + "step": 45546 + }, + { + "epoch": 3.6898088139987038, + "grad_norm": 0.07540342956781387, + "learning_rate": 1.7232998784823802e-05, + "loss": 0.246, + "step": 45547 + }, + { + "epoch": 3.689889825016202, + "grad_norm": 0.06144927814602852, + "learning_rate": 1.7228498132229172e-05, + "loss": 0.1974, + "step": 45548 + }, + { + "epoch": 3.6899708360337007, + "grad_norm": 0.06908820569515228, + "learning_rate": 1.722399747963455e-05, + "loss": 0.2061, + "step": 45549 + }, + { + "epoch": 3.690051847051199, + "grad_norm": 0.07357897609472275, + "learning_rate": 1.7219496827039923e-05, + "loss": 0.1908, + "step": 45550 + }, + { + "epoch": 3.690132858068697, + "grad_norm": 0.07140400260686874, + "learning_rate": 1.7214996174445293e-05, + "loss": 0.2132, + "step": 45551 + }, + { + "epoch": 3.690213869086196, + "grad_norm": 0.07593827694654465, + "learning_rate": 1.721049552185067e-05, + "loss": 0.2472, + "step": 45552 + }, + { + "epoch": 3.690294880103694, + "grad_norm": 0.07725334167480469, + "learning_rate": 1.7205994869256044e-05, + "loss": 0.2395, + "step": 45553 + }, + { + "epoch": 3.6903758911211924, + "grad_norm": 0.08335854858160019, + "learning_rate": 1.7201494216661417e-05, + "loss": 0.2146, + "step": 45554 + }, + { + "epoch": 3.690456902138691, + "grad_norm": 0.06782727688550949, + "learning_rate": 1.719699356406679e-05, + "loss": 0.2199, + "step": 45555 + }, + { + "epoch": 3.6905379131561893, + "grad_norm": 0.08147486299276352, + "learning_rate": 1.7192492911472164e-05, + "loss": 0.2419, + "step": 45556 + }, + { + "epoch": 3.6906189241736875, + "grad_norm": 0.08341187983751297, + "learning_rate": 1.7187992258877538e-05, + "loss": 0.2484, + "step": 45557 + }, + { + "epoch": 3.690699935191186, + "grad_norm": 0.07176518440246582, + "learning_rate": 1.718349160628291e-05, + "loss": 0.2306, + "step": 45558 + }, + { + "epoch": 3.6907809462086845, + "grad_norm": 0.071227066218853, + "learning_rate": 1.7178990953688285e-05, + "loss": 0.2252, + "step": 45559 + }, + { + "epoch": 3.6908619572261827, + "grad_norm": 0.06585358828306198, + "learning_rate": 1.717449030109366e-05, + "loss": 0.2561, + "step": 45560 + }, + { + "epoch": 3.6909429682436814, + "grad_norm": 0.10919830203056335, + "learning_rate": 1.7169989648499032e-05, + "loss": 0.2707, + "step": 45561 + }, + { + "epoch": 3.6910239792611796, + "grad_norm": 0.08778324723243713, + "learning_rate": 1.716548899590441e-05, + "loss": 0.2543, + "step": 45562 + }, + { + "epoch": 3.691104990278678, + "grad_norm": 0.06133211776614189, + "learning_rate": 1.716098834330978e-05, + "loss": 0.2007, + "step": 45563 + }, + { + "epoch": 3.691186001296176, + "grad_norm": 0.06192644312977791, + "learning_rate": 1.7156487690715153e-05, + "loss": 0.2036, + "step": 45564 + }, + { + "epoch": 3.691267012313675, + "grad_norm": 0.07539551705121994, + "learning_rate": 1.715198703812053e-05, + "loss": 0.2636, + "step": 45565 + }, + { + "epoch": 3.691348023331173, + "grad_norm": 0.06967008113861084, + "learning_rate": 1.71474863855259e-05, + "loss": 0.2439, + "step": 45566 + }, + { + "epoch": 3.6914290343486713, + "grad_norm": 0.08135908842086792, + "learning_rate": 1.7142985732931278e-05, + "loss": 0.2111, + "step": 45567 + }, + { + "epoch": 3.6915100453661696, + "grad_norm": 0.07239343971014023, + "learning_rate": 1.713848508033665e-05, + "loss": 0.235, + "step": 45568 + }, + { + "epoch": 3.6915910563836682, + "grad_norm": 0.07607700675725937, + "learning_rate": 1.713398442774202e-05, + "loss": 0.2198, + "step": 45569 + }, + { + "epoch": 3.6916720674011665, + "grad_norm": 0.0712643638253212, + "learning_rate": 1.71294837751474e-05, + "loss": 0.2369, + "step": 45570 + }, + { + "epoch": 3.6917530784186647, + "grad_norm": 0.06370777636766434, + "learning_rate": 1.7124983122552772e-05, + "loss": 0.1989, + "step": 45571 + }, + { + "epoch": 3.6918340894361634, + "grad_norm": 0.06736060231924057, + "learning_rate": 1.7120482469958142e-05, + "loss": 0.2148, + "step": 45572 + }, + { + "epoch": 3.6919151004536617, + "grad_norm": 0.05656532570719719, + "learning_rate": 1.711598181736352e-05, + "loss": 0.1881, + "step": 45573 + }, + { + "epoch": 3.69199611147116, + "grad_norm": 0.07122478634119034, + "learning_rate": 1.7111481164768893e-05, + "loss": 0.2198, + "step": 45574 + }, + { + "epoch": 3.6920771224886586, + "grad_norm": 0.08074306696653366, + "learning_rate": 1.7106980512174266e-05, + "loss": 0.2857, + "step": 45575 + }, + { + "epoch": 3.692158133506157, + "grad_norm": 0.07378386706113815, + "learning_rate": 1.710247985957964e-05, + "loss": 0.203, + "step": 45576 + }, + { + "epoch": 3.692239144523655, + "grad_norm": 0.06506312638521194, + "learning_rate": 1.7097979206985014e-05, + "loss": 0.2338, + "step": 45577 + }, + { + "epoch": 3.692320155541154, + "grad_norm": 0.0626230537891388, + "learning_rate": 1.7093478554390387e-05, + "loss": 0.2434, + "step": 45578 + }, + { + "epoch": 3.692401166558652, + "grad_norm": 0.09149368107318878, + "learning_rate": 1.708897790179576e-05, + "loss": 0.2593, + "step": 45579 + }, + { + "epoch": 3.6924821775761503, + "grad_norm": 0.07215756922960281, + "learning_rate": 1.7084477249201138e-05, + "loss": 0.219, + "step": 45580 + }, + { + "epoch": 3.692563188593649, + "grad_norm": 0.07592841237783432, + "learning_rate": 1.7079976596606508e-05, + "loss": 0.2666, + "step": 45581 + }, + { + "epoch": 3.692644199611147, + "grad_norm": 0.0961163192987442, + "learning_rate": 1.707547594401188e-05, + "loss": 0.2635, + "step": 45582 + }, + { + "epoch": 3.6927252106286454, + "grad_norm": 0.08747298270463943, + "learning_rate": 1.707097529141726e-05, + "loss": 0.233, + "step": 45583 + }, + { + "epoch": 3.692806221646144, + "grad_norm": 0.08068632334470749, + "learning_rate": 1.706647463882263e-05, + "loss": 0.2191, + "step": 45584 + }, + { + "epoch": 3.6928872326636424, + "grad_norm": 0.07764396071434021, + "learning_rate": 1.7061973986228002e-05, + "loss": 0.2235, + "step": 45585 + }, + { + "epoch": 3.6929682436811406, + "grad_norm": 0.07049860060214996, + "learning_rate": 1.705747333363338e-05, + "loss": 0.2229, + "step": 45586 + }, + { + "epoch": 3.693049254698639, + "grad_norm": 0.06508810073137283, + "learning_rate": 1.705297268103875e-05, + "loss": 0.2215, + "step": 45587 + }, + { + "epoch": 3.693130265716137, + "grad_norm": 0.06824822723865509, + "learning_rate": 1.7048472028444127e-05, + "loss": 0.1971, + "step": 45588 + }, + { + "epoch": 3.693211276733636, + "grad_norm": 0.07630736380815506, + "learning_rate": 1.70439713758495e-05, + "loss": 0.2508, + "step": 45589 + }, + { + "epoch": 3.693292287751134, + "grad_norm": 0.06600065529346466, + "learning_rate": 1.703947072325487e-05, + "loss": 0.2006, + "step": 45590 + }, + { + "epoch": 3.6933732987686323, + "grad_norm": 0.0757368728518486, + "learning_rate": 1.7034970070660247e-05, + "loss": 0.235, + "step": 45591 + }, + { + "epoch": 3.693454309786131, + "grad_norm": 0.0807269960641861, + "learning_rate": 1.703046941806562e-05, + "loss": 0.2203, + "step": 45592 + }, + { + "epoch": 3.6935353208036292, + "grad_norm": 0.0880458652973175, + "learning_rate": 1.7025968765470995e-05, + "loss": 0.1987, + "step": 45593 + }, + { + "epoch": 3.6936163318211275, + "grad_norm": 0.07349114865064621, + "learning_rate": 1.7021468112876368e-05, + "loss": 0.242, + "step": 45594 + }, + { + "epoch": 3.693697342838626, + "grad_norm": 0.07621750235557556, + "learning_rate": 1.7016967460281742e-05, + "loss": 0.2247, + "step": 45595 + }, + { + "epoch": 3.6937783538561244, + "grad_norm": 0.07071096450090408, + "learning_rate": 1.7012466807687115e-05, + "loss": 0.2282, + "step": 45596 + }, + { + "epoch": 3.6938593648736227, + "grad_norm": 0.05581112951040268, + "learning_rate": 1.700796615509249e-05, + "loss": 0.196, + "step": 45597 + }, + { + "epoch": 3.6939403758911213, + "grad_norm": 0.08093393594026566, + "learning_rate": 1.7003465502497863e-05, + "loss": 0.2417, + "step": 45598 + }, + { + "epoch": 3.6940213869086196, + "grad_norm": 0.07044923305511475, + "learning_rate": 1.6998964849903236e-05, + "loss": 0.2366, + "step": 45599 + }, + { + "epoch": 3.694102397926118, + "grad_norm": 0.0671868696808815, + "learning_rate": 1.699446419730861e-05, + "loss": 0.1992, + "step": 45600 + }, + { + "epoch": 3.6941834089436165, + "grad_norm": 0.06780609488487244, + "learning_rate": 1.6989963544713987e-05, + "loss": 0.1734, + "step": 45601 + }, + { + "epoch": 3.6942644199611148, + "grad_norm": 0.07801494002342224, + "learning_rate": 1.6985462892119357e-05, + "loss": 0.2503, + "step": 45602 + }, + { + "epoch": 3.694345430978613, + "grad_norm": 0.07000548392534256, + "learning_rate": 1.698096223952473e-05, + "loss": 0.2211, + "step": 45603 + }, + { + "epoch": 3.6944264419961117, + "grad_norm": 0.09492857754230499, + "learning_rate": 1.6976461586930108e-05, + "loss": 0.2736, + "step": 45604 + }, + { + "epoch": 3.69450745301361, + "grad_norm": 0.08605027943849564, + "learning_rate": 1.6971960934335478e-05, + "loss": 0.2609, + "step": 45605 + }, + { + "epoch": 3.694588464031108, + "grad_norm": 0.07421616464853287, + "learning_rate": 1.6967460281740855e-05, + "loss": 0.2275, + "step": 45606 + }, + { + "epoch": 3.694669475048607, + "grad_norm": 0.08524744212627411, + "learning_rate": 1.696295962914623e-05, + "loss": 0.2311, + "step": 45607 + }, + { + "epoch": 3.694750486066105, + "grad_norm": 0.06704798340797424, + "learning_rate": 1.69584589765516e-05, + "loss": 0.2533, + "step": 45608 + }, + { + "epoch": 3.6948314970836034, + "grad_norm": 0.049864400178194046, + "learning_rate": 1.6953958323956976e-05, + "loss": 0.2026, + "step": 45609 + }, + { + "epoch": 3.6949125081011016, + "grad_norm": 0.07128980755805969, + "learning_rate": 1.694945767136235e-05, + "loss": 0.2313, + "step": 45610 + }, + { + "epoch": 3.6949935191186, + "grad_norm": 0.06947165727615356, + "learning_rate": 1.694495701876772e-05, + "loss": 0.2192, + "step": 45611 + }, + { + "epoch": 3.6950745301360985, + "grad_norm": 0.06866627931594849, + "learning_rate": 1.6940456366173096e-05, + "loss": 0.2308, + "step": 45612 + }, + { + "epoch": 3.695155541153597, + "grad_norm": 0.07855135202407837, + "learning_rate": 1.693595571357847e-05, + "loss": 0.2266, + "step": 45613 + }, + { + "epoch": 3.695236552171095, + "grad_norm": 0.06514532119035721, + "learning_rate": 1.6931455060983844e-05, + "loss": 0.2095, + "step": 45614 + }, + { + "epoch": 3.6953175631885937, + "grad_norm": 0.058014001697301865, + "learning_rate": 1.6926954408389217e-05, + "loss": 0.2925, + "step": 45615 + }, + { + "epoch": 3.695398574206092, + "grad_norm": 0.06363331526517868, + "learning_rate": 1.692245375579459e-05, + "loss": 0.2324, + "step": 45616 + }, + { + "epoch": 3.69547958522359, + "grad_norm": 0.06826046854257584, + "learning_rate": 1.6917953103199964e-05, + "loss": 0.2338, + "step": 45617 + }, + { + "epoch": 3.695560596241089, + "grad_norm": 0.08055315911769867, + "learning_rate": 1.6913452450605338e-05, + "loss": 0.2187, + "step": 45618 + }, + { + "epoch": 3.695641607258587, + "grad_norm": 0.07960078120231628, + "learning_rate": 1.6908951798010715e-05, + "loss": 0.2421, + "step": 45619 + }, + { + "epoch": 3.6957226182760854, + "grad_norm": 0.05961255356669426, + "learning_rate": 1.6904451145416085e-05, + "loss": 0.2017, + "step": 45620 + }, + { + "epoch": 3.695803629293584, + "grad_norm": 0.07862067967653275, + "learning_rate": 1.689995049282146e-05, + "loss": 0.2251, + "step": 45621 + }, + { + "epoch": 3.6958846403110823, + "grad_norm": 0.07465961575508118, + "learning_rate": 1.6895449840226836e-05, + "loss": 0.222, + "step": 45622 + }, + { + "epoch": 3.6959656513285806, + "grad_norm": 0.07449892908334732, + "learning_rate": 1.6890949187632206e-05, + "loss": 0.2295, + "step": 45623 + }, + { + "epoch": 3.6960466623460793, + "grad_norm": 0.0704660639166832, + "learning_rate": 1.688644853503758e-05, + "loss": 0.2101, + "step": 45624 + }, + { + "epoch": 3.6961276733635775, + "grad_norm": 0.07495720684528351, + "learning_rate": 1.6881947882442957e-05, + "loss": 0.2571, + "step": 45625 + }, + { + "epoch": 3.6962086843810757, + "grad_norm": 0.06861800700426102, + "learning_rate": 1.6877447229848327e-05, + "loss": 0.2147, + "step": 45626 + }, + { + "epoch": 3.6962896953985744, + "grad_norm": 0.07803268730640411, + "learning_rate": 1.6872946577253704e-05, + "loss": 0.2491, + "step": 45627 + }, + { + "epoch": 3.6963707064160727, + "grad_norm": 0.0892467275261879, + "learning_rate": 1.6868445924659077e-05, + "loss": 0.2448, + "step": 45628 + }, + { + "epoch": 3.696451717433571, + "grad_norm": 0.07141023129224777, + "learning_rate": 1.6863945272064448e-05, + "loss": 0.215, + "step": 45629 + }, + { + "epoch": 3.6965327284510696, + "grad_norm": 0.06902121752500534, + "learning_rate": 1.6859444619469825e-05, + "loss": 0.2112, + "step": 45630 + }, + { + "epoch": 3.696613739468568, + "grad_norm": 0.058419931679964066, + "learning_rate": 1.6854943966875198e-05, + "loss": 0.196, + "step": 45631 + }, + { + "epoch": 3.696694750486066, + "grad_norm": 0.06349142640829086, + "learning_rate": 1.685044331428057e-05, + "loss": 0.2146, + "step": 45632 + }, + { + "epoch": 3.6967757615035644, + "grad_norm": 0.07252699136734009, + "learning_rate": 1.6845942661685945e-05, + "loss": 0.2223, + "step": 45633 + }, + { + "epoch": 3.6968567725210626, + "grad_norm": 0.09001344442367554, + "learning_rate": 1.684144200909132e-05, + "loss": 0.253, + "step": 45634 + }, + { + "epoch": 3.6969377835385613, + "grad_norm": 0.058856382966041565, + "learning_rate": 1.6836941356496693e-05, + "loss": 0.2569, + "step": 45635 + }, + { + "epoch": 3.6970187945560595, + "grad_norm": 0.0726538598537445, + "learning_rate": 1.6832440703902066e-05, + "loss": 0.216, + "step": 45636 + }, + { + "epoch": 3.6970998055735578, + "grad_norm": 0.06102930009365082, + "learning_rate": 1.682794005130744e-05, + "loss": 0.221, + "step": 45637 + }, + { + "epoch": 3.6971808165910565, + "grad_norm": 0.06977268308401108, + "learning_rate": 1.6823439398712814e-05, + "loss": 0.2021, + "step": 45638 + }, + { + "epoch": 3.6972618276085547, + "grad_norm": 0.06037342548370361, + "learning_rate": 1.6818938746118187e-05, + "loss": 0.2138, + "step": 45639 + }, + { + "epoch": 3.697342838626053, + "grad_norm": 0.07608211785554886, + "learning_rate": 1.6814438093523564e-05, + "loss": 0.2652, + "step": 45640 + }, + { + "epoch": 3.6974238496435516, + "grad_norm": 0.07579577714204788, + "learning_rate": 1.6809937440928934e-05, + "loss": 0.2378, + "step": 45641 + }, + { + "epoch": 3.69750486066105, + "grad_norm": 0.08454374969005585, + "learning_rate": 1.6805436788334308e-05, + "loss": 0.2475, + "step": 45642 + }, + { + "epoch": 3.697585871678548, + "grad_norm": 0.07013771682977676, + "learning_rate": 1.6800936135739685e-05, + "loss": 0.2294, + "step": 45643 + }, + { + "epoch": 3.697666882696047, + "grad_norm": 0.07883304357528687, + "learning_rate": 1.6796435483145055e-05, + "loss": 0.2483, + "step": 45644 + }, + { + "epoch": 3.697747893713545, + "grad_norm": 0.08239931613206863, + "learning_rate": 1.679193483055043e-05, + "loss": 0.2347, + "step": 45645 + }, + { + "epoch": 3.6978289047310433, + "grad_norm": 0.06723158061504364, + "learning_rate": 1.6787434177955806e-05, + "loss": 0.2109, + "step": 45646 + }, + { + "epoch": 3.697909915748542, + "grad_norm": 0.08971650153398514, + "learning_rate": 1.6782933525361176e-05, + "loss": 0.217, + "step": 45647 + }, + { + "epoch": 3.6979909267660402, + "grad_norm": 0.08268365263938904, + "learning_rate": 1.6778432872766553e-05, + "loss": 0.2135, + "step": 45648 + }, + { + "epoch": 3.6980719377835385, + "grad_norm": 0.06352110952138901, + "learning_rate": 1.6773932220171927e-05, + "loss": 0.1921, + "step": 45649 + }, + { + "epoch": 3.698152948801037, + "grad_norm": 0.08296002447605133, + "learning_rate": 1.6769431567577297e-05, + "loss": 0.2022, + "step": 45650 + }, + { + "epoch": 3.6982339598185354, + "grad_norm": 0.07762415707111359, + "learning_rate": 1.6764930914982674e-05, + "loss": 0.2247, + "step": 45651 + }, + { + "epoch": 3.6983149708360337, + "grad_norm": 0.06467720866203308, + "learning_rate": 1.6760430262388047e-05, + "loss": 0.2153, + "step": 45652 + }, + { + "epoch": 3.6983959818535324, + "grad_norm": 0.07439886033535004, + "learning_rate": 1.675592960979342e-05, + "loss": 0.2479, + "step": 45653 + }, + { + "epoch": 3.6984769928710306, + "grad_norm": 0.05738019570708275, + "learning_rate": 1.6751428957198795e-05, + "loss": 0.212, + "step": 45654 + }, + { + "epoch": 3.698558003888529, + "grad_norm": 0.07403714954853058, + "learning_rate": 1.6746928304604168e-05, + "loss": 0.2277, + "step": 45655 + }, + { + "epoch": 3.698639014906027, + "grad_norm": 0.07722627371549606, + "learning_rate": 1.6742427652009542e-05, + "loss": 0.2012, + "step": 45656 + }, + { + "epoch": 3.6987200259235253, + "grad_norm": 0.0807010680437088, + "learning_rate": 1.6737926999414915e-05, + "loss": 0.2054, + "step": 45657 + }, + { + "epoch": 3.698801036941024, + "grad_norm": 0.08320476114749908, + "learning_rate": 1.673342634682029e-05, + "loss": 0.2418, + "step": 45658 + }, + { + "epoch": 3.6988820479585223, + "grad_norm": 0.0703231692314148, + "learning_rate": 1.6728925694225663e-05, + "loss": 0.2332, + "step": 45659 + }, + { + "epoch": 3.6989630589760205, + "grad_norm": 0.06997893750667572, + "learning_rate": 1.6724425041631036e-05, + "loss": 0.2516, + "step": 45660 + }, + { + "epoch": 3.699044069993519, + "grad_norm": 0.06285632401704788, + "learning_rate": 1.6719924389036413e-05, + "loss": 0.2355, + "step": 45661 + }, + { + "epoch": 3.6991250810110174, + "grad_norm": 0.07425893098115921, + "learning_rate": 1.6715423736441783e-05, + "loss": 0.2049, + "step": 45662 + }, + { + "epoch": 3.6992060920285157, + "grad_norm": 0.07462365180253983, + "learning_rate": 1.6710923083847157e-05, + "loss": 0.217, + "step": 45663 + }, + { + "epoch": 3.6992871030460144, + "grad_norm": 0.07210385799407959, + "learning_rate": 1.6706422431252534e-05, + "loss": 0.2285, + "step": 45664 + }, + { + "epoch": 3.6993681140635126, + "grad_norm": 0.06601511687040329, + "learning_rate": 1.6701921778657904e-05, + "loss": 0.2469, + "step": 45665 + }, + { + "epoch": 3.699449125081011, + "grad_norm": 0.07676741480827332, + "learning_rate": 1.669742112606328e-05, + "loss": 0.2265, + "step": 45666 + }, + { + "epoch": 3.6995301360985096, + "grad_norm": 0.058619819581508636, + "learning_rate": 1.6692920473468655e-05, + "loss": 0.2046, + "step": 45667 + }, + { + "epoch": 3.699611147116008, + "grad_norm": 0.07764916867017746, + "learning_rate": 1.6688419820874025e-05, + "loss": 0.2205, + "step": 45668 + }, + { + "epoch": 3.699692158133506, + "grad_norm": 0.06930316984653473, + "learning_rate": 1.6683919168279402e-05, + "loss": 0.2196, + "step": 45669 + }, + { + "epoch": 3.6997731691510047, + "grad_norm": 0.08031480759382248, + "learning_rate": 1.6679418515684776e-05, + "loss": 0.2509, + "step": 45670 + }, + { + "epoch": 3.699854180168503, + "grad_norm": 0.07526817917823792, + "learning_rate": 1.667491786309015e-05, + "loss": 0.2521, + "step": 45671 + }, + { + "epoch": 3.6999351911860012, + "grad_norm": 0.0837547555565834, + "learning_rate": 1.6670417210495523e-05, + "loss": 0.2373, + "step": 45672 + }, + { + "epoch": 3.7000162022035, + "grad_norm": 0.11006888747215271, + "learning_rate": 1.6665916557900896e-05, + "loss": 0.2406, + "step": 45673 + }, + { + "epoch": 3.700097213220998, + "grad_norm": 0.060090772807598114, + "learning_rate": 1.666141590530627e-05, + "loss": 0.2311, + "step": 45674 + }, + { + "epoch": 3.7001782242384964, + "grad_norm": 0.07697679847478867, + "learning_rate": 1.6656915252711644e-05, + "loss": 0.2144, + "step": 45675 + }, + { + "epoch": 3.7002592352559946, + "grad_norm": 0.075130395591259, + "learning_rate": 1.6652414600117017e-05, + "loss": 0.2075, + "step": 45676 + }, + { + "epoch": 3.7003402462734933, + "grad_norm": 0.06719817966222763, + "learning_rate": 1.664791394752239e-05, + "loss": 0.262, + "step": 45677 + }, + { + "epoch": 3.7004212572909916, + "grad_norm": 0.07148056477308273, + "learning_rate": 1.6643413294927764e-05, + "loss": 0.2244, + "step": 45678 + }, + { + "epoch": 3.70050226830849, + "grad_norm": 0.07177882641553879, + "learning_rate": 1.663891264233314e-05, + "loss": 0.2164, + "step": 45679 + }, + { + "epoch": 3.700583279325988, + "grad_norm": 0.06076192483305931, + "learning_rate": 1.663441198973851e-05, + "loss": 0.2045, + "step": 45680 + }, + { + "epoch": 3.7006642903434868, + "grad_norm": 0.05700770765542984, + "learning_rate": 1.6629911337143885e-05, + "loss": 0.2023, + "step": 45681 + }, + { + "epoch": 3.700745301360985, + "grad_norm": 0.07019194215536118, + "learning_rate": 1.6625410684549262e-05, + "loss": 0.2374, + "step": 45682 + }, + { + "epoch": 3.7008263123784833, + "grad_norm": 0.059162236750125885, + "learning_rate": 1.6620910031954632e-05, + "loss": 0.2114, + "step": 45683 + }, + { + "epoch": 3.700907323395982, + "grad_norm": 0.07613490521907806, + "learning_rate": 1.6616409379360006e-05, + "loss": 0.2293, + "step": 45684 + }, + { + "epoch": 3.70098833441348, + "grad_norm": 0.08490029722452164, + "learning_rate": 1.6611908726765383e-05, + "loss": 0.1955, + "step": 45685 + }, + { + "epoch": 3.7010693454309784, + "grad_norm": 0.07354255020618439, + "learning_rate": 1.6607408074170757e-05, + "loss": 0.2268, + "step": 45686 + }, + { + "epoch": 3.701150356448477, + "grad_norm": 0.0803595781326294, + "learning_rate": 1.660290742157613e-05, + "loss": 0.2502, + "step": 45687 + }, + { + "epoch": 3.7012313674659754, + "grad_norm": 0.07173190265893936, + "learning_rate": 1.6598406768981504e-05, + "loss": 0.2447, + "step": 45688 + }, + { + "epoch": 3.7013123784834736, + "grad_norm": 0.08350685238838196, + "learning_rate": 1.6593906116386877e-05, + "loss": 0.2585, + "step": 45689 + }, + { + "epoch": 3.7013933895009723, + "grad_norm": 0.06672927737236023, + "learning_rate": 1.658940546379225e-05, + "loss": 0.21, + "step": 45690 + }, + { + "epoch": 3.7014744005184705, + "grad_norm": 0.07220529764890671, + "learning_rate": 1.6584904811197625e-05, + "loss": 0.2327, + "step": 45691 + }, + { + "epoch": 3.701555411535969, + "grad_norm": 0.07617419213056564, + "learning_rate": 1.6580404158602998e-05, + "loss": 0.2163, + "step": 45692 + }, + { + "epoch": 3.7016364225534675, + "grad_norm": 0.06999776512384415, + "learning_rate": 1.6575903506008372e-05, + "loss": 0.2125, + "step": 45693 + }, + { + "epoch": 3.7017174335709657, + "grad_norm": 0.07320782542228699, + "learning_rate": 1.6571402853413745e-05, + "loss": 0.2258, + "step": 45694 + }, + { + "epoch": 3.701798444588464, + "grad_norm": 0.07582980394363403, + "learning_rate": 1.656690220081912e-05, + "loss": 0.213, + "step": 45695 + }, + { + "epoch": 3.7018794556059627, + "grad_norm": 0.05617687106132507, + "learning_rate": 1.6562401548224493e-05, + "loss": 0.1939, + "step": 45696 + }, + { + "epoch": 3.701960466623461, + "grad_norm": 0.058806758373975754, + "learning_rate": 1.6557900895629866e-05, + "loss": 0.2449, + "step": 45697 + }, + { + "epoch": 3.702041477640959, + "grad_norm": 0.0760156661272049, + "learning_rate": 1.655340024303524e-05, + "loss": 0.238, + "step": 45698 + }, + { + "epoch": 3.7021224886584574, + "grad_norm": 0.06211948022246361, + "learning_rate": 1.6548899590440613e-05, + "loss": 0.2389, + "step": 45699 + }, + { + "epoch": 3.702203499675956, + "grad_norm": 0.0635630339384079, + "learning_rate": 1.654439893784599e-05, + "loss": 0.217, + "step": 45700 + }, + { + "epoch": 3.7022845106934543, + "grad_norm": 0.07854915410280228, + "learning_rate": 1.653989828525136e-05, + "loss": 0.2497, + "step": 45701 + }, + { + "epoch": 3.7023655217109526, + "grad_norm": 0.06777413934469223, + "learning_rate": 1.6535397632656734e-05, + "loss": 0.24, + "step": 45702 + }, + { + "epoch": 3.702446532728451, + "grad_norm": 0.06990412622690201, + "learning_rate": 1.653089698006211e-05, + "loss": 0.2094, + "step": 45703 + }, + { + "epoch": 3.7025275437459495, + "grad_norm": 0.06906888633966446, + "learning_rate": 1.6526396327467485e-05, + "loss": 0.2311, + "step": 45704 + }, + { + "epoch": 3.7026085547634477, + "grad_norm": 0.06097817420959473, + "learning_rate": 1.652189567487286e-05, + "loss": 0.1887, + "step": 45705 + }, + { + "epoch": 3.702689565780946, + "grad_norm": 0.07380296289920807, + "learning_rate": 1.6517395022278232e-05, + "loss": 0.2236, + "step": 45706 + }, + { + "epoch": 3.7027705767984447, + "grad_norm": 0.07449514418840408, + "learning_rate": 1.6512894369683606e-05, + "loss": 0.2055, + "step": 45707 + }, + { + "epoch": 3.702851587815943, + "grad_norm": 0.07819279283285141, + "learning_rate": 1.650839371708898e-05, + "loss": 0.2339, + "step": 45708 + }, + { + "epoch": 3.702932598833441, + "grad_norm": 0.06513992697000504, + "learning_rate": 1.6503893064494353e-05, + "loss": 0.216, + "step": 45709 + }, + { + "epoch": 3.70301360985094, + "grad_norm": 0.08240225911140442, + "learning_rate": 1.6499392411899726e-05, + "loss": 0.2141, + "step": 45710 + }, + { + "epoch": 3.703094620868438, + "grad_norm": 0.08612393587827682, + "learning_rate": 1.64948917593051e-05, + "loss": 0.2298, + "step": 45711 + }, + { + "epoch": 3.7031756318859363, + "grad_norm": 0.06854256242513657, + "learning_rate": 1.6490391106710474e-05, + "loss": 0.2128, + "step": 45712 + }, + { + "epoch": 3.703256642903435, + "grad_norm": 0.05733377858996391, + "learning_rate": 1.6485890454115847e-05, + "loss": 0.2351, + "step": 45713 + }, + { + "epoch": 3.7033376539209333, + "grad_norm": 0.06590426713228226, + "learning_rate": 1.648138980152122e-05, + "loss": 0.2482, + "step": 45714 + }, + { + "epoch": 3.7034186649384315, + "grad_norm": 0.06614598631858826, + "learning_rate": 1.6476889148926595e-05, + "loss": 0.2188, + "step": 45715 + }, + { + "epoch": 3.70349967595593, + "grad_norm": 0.06795570999383926, + "learning_rate": 1.6472388496331968e-05, + "loss": 0.2445, + "step": 45716 + }, + { + "epoch": 3.7035806869734285, + "grad_norm": 0.07279584556818008, + "learning_rate": 1.6467887843737342e-05, + "loss": 0.2135, + "step": 45717 + }, + { + "epoch": 3.7036616979909267, + "grad_norm": 0.0802987739443779, + "learning_rate": 1.6463387191142715e-05, + "loss": 0.2216, + "step": 45718 + }, + { + "epoch": 3.7037427090084254, + "grad_norm": 0.07141979783773422, + "learning_rate": 1.645888653854809e-05, + "loss": 0.2343, + "step": 45719 + }, + { + "epoch": 3.7038237200259236, + "grad_norm": 0.058926984667778015, + "learning_rate": 1.6454385885953463e-05, + "loss": 0.2207, + "step": 45720 + }, + { + "epoch": 3.703904731043422, + "grad_norm": 0.07551530748605728, + "learning_rate": 1.644988523335884e-05, + "loss": 0.2285, + "step": 45721 + }, + { + "epoch": 3.70398574206092, + "grad_norm": 0.08013749122619629, + "learning_rate": 1.6445384580764213e-05, + "loss": 0.2577, + "step": 45722 + }, + { + "epoch": 3.704066753078419, + "grad_norm": 0.0685802772641182, + "learning_rate": 1.6440883928169583e-05, + "loss": 0.2172, + "step": 45723 + }, + { + "epoch": 3.704147764095917, + "grad_norm": 0.07025080174207687, + "learning_rate": 1.643638327557496e-05, + "loss": 0.2394, + "step": 45724 + }, + { + "epoch": 3.7042287751134153, + "grad_norm": 0.07615825533866882, + "learning_rate": 1.6431882622980334e-05, + "loss": 0.2186, + "step": 45725 + }, + { + "epoch": 3.7043097861309136, + "grad_norm": 0.07653851807117462, + "learning_rate": 1.6427381970385708e-05, + "loss": 0.2106, + "step": 45726 + }, + { + "epoch": 3.7043907971484122, + "grad_norm": 0.07739897817373276, + "learning_rate": 1.642288131779108e-05, + "loss": 0.2066, + "step": 45727 + }, + { + "epoch": 3.7044718081659105, + "grad_norm": 0.07978235185146332, + "learning_rate": 1.6418380665196455e-05, + "loss": 0.252, + "step": 45728 + }, + { + "epoch": 3.7045528191834087, + "grad_norm": 0.07335931062698364, + "learning_rate": 1.641388001260183e-05, + "loss": 0.1746, + "step": 45729 + }, + { + "epoch": 3.7046338302009074, + "grad_norm": 0.08139511197805405, + "learning_rate": 1.6409379360007202e-05, + "loss": 0.2766, + "step": 45730 + }, + { + "epoch": 3.7047148412184057, + "grad_norm": 0.06404054909944534, + "learning_rate": 1.6404878707412576e-05, + "loss": 0.1958, + "step": 45731 + }, + { + "epoch": 3.704795852235904, + "grad_norm": 0.062324702739715576, + "learning_rate": 1.640037805481795e-05, + "loss": 0.2066, + "step": 45732 + }, + { + "epoch": 3.7048768632534026, + "grad_norm": 0.08526472002267838, + "learning_rate": 1.6395877402223323e-05, + "loss": 0.2312, + "step": 45733 + }, + { + "epoch": 3.704957874270901, + "grad_norm": 0.06586598604917526, + "learning_rate": 1.6391376749628696e-05, + "loss": 0.2201, + "step": 45734 + }, + { + "epoch": 3.705038885288399, + "grad_norm": 0.08027635514736176, + "learning_rate": 1.638687609703407e-05, + "loss": 0.2203, + "step": 45735 + }, + { + "epoch": 3.7051198963058978, + "grad_norm": 0.0849221721291542, + "learning_rate": 1.6382375444439444e-05, + "loss": 0.2481, + "step": 45736 + }, + { + "epoch": 3.705200907323396, + "grad_norm": 0.08749920129776001, + "learning_rate": 1.637787479184482e-05, + "loss": 0.2237, + "step": 45737 + }, + { + "epoch": 3.7052819183408943, + "grad_norm": 0.07888040691614151, + "learning_rate": 1.637337413925019e-05, + "loss": 0.224, + "step": 45738 + }, + { + "epoch": 3.705362929358393, + "grad_norm": 0.08087297528982162, + "learning_rate": 1.6368873486655568e-05, + "loss": 0.2175, + "step": 45739 + }, + { + "epoch": 3.705443940375891, + "grad_norm": 0.0801047757267952, + "learning_rate": 1.636437283406094e-05, + "loss": 0.2298, + "step": 45740 + }, + { + "epoch": 3.7055249513933894, + "grad_norm": 0.06400655955076218, + "learning_rate": 1.635987218146631e-05, + "loss": 0.2214, + "step": 45741 + }, + { + "epoch": 3.705605962410888, + "grad_norm": 0.06360850483179092, + "learning_rate": 1.635537152887169e-05, + "loss": 0.2173, + "step": 45742 + }, + { + "epoch": 3.7056869734283864, + "grad_norm": 0.06993993371725082, + "learning_rate": 1.6350870876277062e-05, + "loss": 0.2188, + "step": 45743 + }, + { + "epoch": 3.7057679844458846, + "grad_norm": 0.08145015686750412, + "learning_rate": 1.6346370223682432e-05, + "loss": 0.2529, + "step": 45744 + }, + { + "epoch": 3.705848995463383, + "grad_norm": 0.08533214777708054, + "learning_rate": 1.634186957108781e-05, + "loss": 0.2268, + "step": 45745 + }, + { + "epoch": 3.7059300064808816, + "grad_norm": 0.08632870018482208, + "learning_rate": 1.6337368918493183e-05, + "loss": 0.2359, + "step": 45746 + }, + { + "epoch": 3.70601101749838, + "grad_norm": 0.07463864982128143, + "learning_rate": 1.6332868265898557e-05, + "loss": 0.2247, + "step": 45747 + }, + { + "epoch": 3.706092028515878, + "grad_norm": 0.06898374110460281, + "learning_rate": 1.632836761330393e-05, + "loss": 0.2013, + "step": 45748 + }, + { + "epoch": 3.7061730395333763, + "grad_norm": 0.06634795665740967, + "learning_rate": 1.6323866960709304e-05, + "loss": 0.1984, + "step": 45749 + }, + { + "epoch": 3.706254050550875, + "grad_norm": 0.08424997329711914, + "learning_rate": 1.6319366308114677e-05, + "loss": 0.2292, + "step": 45750 + }, + { + "epoch": 3.7063350615683732, + "grad_norm": 0.08333491533994675, + "learning_rate": 1.631486565552005e-05, + "loss": 0.2562, + "step": 45751 + }, + { + "epoch": 3.7064160725858715, + "grad_norm": 0.07007382810115814, + "learning_rate": 1.6310365002925425e-05, + "loss": 0.2016, + "step": 45752 + }, + { + "epoch": 3.70649708360337, + "grad_norm": 0.07411225140094757, + "learning_rate": 1.6305864350330798e-05, + "loss": 0.2552, + "step": 45753 + }, + { + "epoch": 3.7065780946208684, + "grad_norm": 0.08287201076745987, + "learning_rate": 1.6301363697736172e-05, + "loss": 0.2521, + "step": 45754 + }, + { + "epoch": 3.7066591056383666, + "grad_norm": 0.07097062468528748, + "learning_rate": 1.629686304514155e-05, + "loss": 0.2275, + "step": 45755 + }, + { + "epoch": 3.7067401166558653, + "grad_norm": 0.07343895733356476, + "learning_rate": 1.629236239254692e-05, + "loss": 0.197, + "step": 45756 + }, + { + "epoch": 3.7068211276733636, + "grad_norm": 0.06564123928546906, + "learning_rate": 1.6287861739952293e-05, + "loss": 0.2087, + "step": 45757 + }, + { + "epoch": 3.706902138690862, + "grad_norm": 0.0649932399392128, + "learning_rate": 1.628336108735767e-05, + "loss": 0.2356, + "step": 45758 + }, + { + "epoch": 3.7069831497083605, + "grad_norm": 0.07513502985239029, + "learning_rate": 1.627886043476304e-05, + "loss": 0.2562, + "step": 45759 + }, + { + "epoch": 3.7070641607258588, + "grad_norm": 0.07147255539894104, + "learning_rate": 1.6274359782168417e-05, + "loss": 0.2592, + "step": 45760 + }, + { + "epoch": 3.707145171743357, + "grad_norm": 0.0936325266957283, + "learning_rate": 1.626985912957379e-05, + "loss": 0.2757, + "step": 45761 + }, + { + "epoch": 3.7072261827608557, + "grad_norm": 0.06833754479885101, + "learning_rate": 1.626535847697916e-05, + "loss": 0.2258, + "step": 45762 + }, + { + "epoch": 3.707307193778354, + "grad_norm": 0.08121831715106964, + "learning_rate": 1.6260857824384538e-05, + "loss": 0.211, + "step": 45763 + }, + { + "epoch": 3.707388204795852, + "grad_norm": 0.0541922003030777, + "learning_rate": 1.625635717178991e-05, + "loss": 0.1754, + "step": 45764 + }, + { + "epoch": 3.707469215813351, + "grad_norm": 0.07351001352071762, + "learning_rate": 1.6251856519195285e-05, + "loss": 0.2369, + "step": 45765 + }, + { + "epoch": 3.707550226830849, + "grad_norm": 0.1002897396683693, + "learning_rate": 1.624735586660066e-05, + "loss": 0.2499, + "step": 45766 + }, + { + "epoch": 3.7076312378483474, + "grad_norm": 0.053547605872154236, + "learning_rate": 1.6242855214006032e-05, + "loss": 0.2213, + "step": 45767 + }, + { + "epoch": 3.7077122488658456, + "grad_norm": 0.06925728917121887, + "learning_rate": 1.6238354561411406e-05, + "loss": 0.2017, + "step": 45768 + }, + { + "epoch": 3.7077932598833443, + "grad_norm": 0.06350000202655792, + "learning_rate": 1.623385390881678e-05, + "loss": 0.2297, + "step": 45769 + }, + { + "epoch": 3.7078742709008425, + "grad_norm": 0.06750939041376114, + "learning_rate": 1.6229353256222153e-05, + "loss": 0.1952, + "step": 45770 + }, + { + "epoch": 3.707955281918341, + "grad_norm": 0.0888800173997879, + "learning_rate": 1.6224852603627526e-05, + "loss": 0.2372, + "step": 45771 + }, + { + "epoch": 3.708036292935839, + "grad_norm": 0.06730051338672638, + "learning_rate": 1.62203519510329e-05, + "loss": 0.2262, + "step": 45772 + }, + { + "epoch": 3.7081173039533377, + "grad_norm": 0.09187749773263931, + "learning_rate": 1.6215851298438277e-05, + "loss": 0.2275, + "step": 45773 + }, + { + "epoch": 3.708198314970836, + "grad_norm": 0.06515898555517197, + "learning_rate": 1.6211350645843647e-05, + "loss": 0.199, + "step": 45774 + }, + { + "epoch": 3.708279325988334, + "grad_norm": 0.07909893989562988, + "learning_rate": 1.620684999324902e-05, + "loss": 0.2394, + "step": 45775 + }, + { + "epoch": 3.708360337005833, + "grad_norm": 0.0797453448176384, + "learning_rate": 1.6202349340654398e-05, + "loss": 0.2138, + "step": 45776 + }, + { + "epoch": 3.708441348023331, + "grad_norm": 0.06687083095312119, + "learning_rate": 1.6197848688059768e-05, + "loss": 0.228, + "step": 45777 + }, + { + "epoch": 3.7085223590408294, + "grad_norm": 0.0814642459154129, + "learning_rate": 1.6193348035465145e-05, + "loss": 0.1993, + "step": 45778 + }, + { + "epoch": 3.708603370058328, + "grad_norm": 0.07986731082201004, + "learning_rate": 1.618884738287052e-05, + "loss": 0.251, + "step": 45779 + }, + { + "epoch": 3.7086843810758263, + "grad_norm": 0.06904210150241852, + "learning_rate": 1.618434673027589e-05, + "loss": 0.2449, + "step": 45780 + }, + { + "epoch": 3.7087653920933246, + "grad_norm": 0.06334845721721649, + "learning_rate": 1.6179846077681266e-05, + "loss": 0.2306, + "step": 45781 + }, + { + "epoch": 3.7088464031108233, + "grad_norm": 0.06799927353858948, + "learning_rate": 1.617534542508664e-05, + "loss": 0.2138, + "step": 45782 + }, + { + "epoch": 3.7089274141283215, + "grad_norm": 0.07648967951536179, + "learning_rate": 1.617084477249201e-05, + "loss": 0.2229, + "step": 45783 + }, + { + "epoch": 3.7090084251458197, + "grad_norm": 0.05069386214017868, + "learning_rate": 1.6166344119897387e-05, + "loss": 0.1997, + "step": 45784 + }, + { + "epoch": 3.7090894361633184, + "grad_norm": 0.07770735770463943, + "learning_rate": 1.616184346730276e-05, + "loss": 0.2248, + "step": 45785 + }, + { + "epoch": 3.7091704471808167, + "grad_norm": 0.08397640287876129, + "learning_rate": 1.6157342814708134e-05, + "loss": 0.2367, + "step": 45786 + }, + { + "epoch": 3.709251458198315, + "grad_norm": 0.059977419674396515, + "learning_rate": 1.6152842162113508e-05, + "loss": 0.2107, + "step": 45787 + }, + { + "epoch": 3.7093324692158136, + "grad_norm": 0.0799662172794342, + "learning_rate": 1.614834150951888e-05, + "loss": 0.2496, + "step": 45788 + }, + { + "epoch": 3.709413480233312, + "grad_norm": 0.058951485902071, + "learning_rate": 1.6143840856924255e-05, + "loss": 0.211, + "step": 45789 + }, + { + "epoch": 3.70949449125081, + "grad_norm": 0.07071194797754288, + "learning_rate": 1.613934020432963e-05, + "loss": 0.2352, + "step": 45790 + }, + { + "epoch": 3.7095755022683083, + "grad_norm": 0.05729727819561958, + "learning_rate": 1.6134839551735005e-05, + "loss": 0.1978, + "step": 45791 + }, + { + "epoch": 3.709656513285807, + "grad_norm": 0.06328321993350983, + "learning_rate": 1.6130338899140376e-05, + "loss": 0.2145, + "step": 45792 + }, + { + "epoch": 3.7097375243033053, + "grad_norm": 0.07447624206542969, + "learning_rate": 1.612583824654575e-05, + "loss": 0.2312, + "step": 45793 + }, + { + "epoch": 3.7098185353208035, + "grad_norm": 0.06918629258871078, + "learning_rate": 1.6121337593951126e-05, + "loss": 0.2223, + "step": 45794 + }, + { + "epoch": 3.7098995463383018, + "grad_norm": 0.059608764946460724, + "learning_rate": 1.6116836941356496e-05, + "loss": 0.2501, + "step": 45795 + }, + { + "epoch": 3.7099805573558005, + "grad_norm": 0.07887037843465805, + "learning_rate": 1.611233628876187e-05, + "loss": 0.2245, + "step": 45796 + }, + { + "epoch": 3.7100615683732987, + "grad_norm": 0.06873272359371185, + "learning_rate": 1.6107835636167247e-05, + "loss": 0.2225, + "step": 45797 + }, + { + "epoch": 3.710142579390797, + "grad_norm": 0.09617675095796585, + "learning_rate": 1.6103334983572617e-05, + "loss": 0.2345, + "step": 45798 + }, + { + "epoch": 3.7102235904082956, + "grad_norm": 0.07761441171169281, + "learning_rate": 1.6098834330977994e-05, + "loss": 0.2673, + "step": 45799 + }, + { + "epoch": 3.710304601425794, + "grad_norm": 0.09091390669345856, + "learning_rate": 1.6094333678383368e-05, + "loss": 0.2659, + "step": 45800 + }, + { + "epoch": 3.710385612443292, + "grad_norm": 0.06801667809486389, + "learning_rate": 1.6089833025788738e-05, + "loss": 0.2206, + "step": 45801 + }, + { + "epoch": 3.710466623460791, + "grad_norm": 0.07345068454742432, + "learning_rate": 1.6085332373194115e-05, + "loss": 0.2409, + "step": 45802 + }, + { + "epoch": 3.710547634478289, + "grad_norm": 0.08711735904216766, + "learning_rate": 1.608083172059949e-05, + "loss": 0.2144, + "step": 45803 + }, + { + "epoch": 3.7106286454957873, + "grad_norm": 0.06707657873630524, + "learning_rate": 1.607633106800486e-05, + "loss": 0.2124, + "step": 45804 + }, + { + "epoch": 3.710709656513286, + "grad_norm": 0.08471059054136276, + "learning_rate": 1.6071830415410236e-05, + "loss": 0.2525, + "step": 45805 + }, + { + "epoch": 3.7107906675307842, + "grad_norm": 0.0858485996723175, + "learning_rate": 1.606732976281561e-05, + "loss": 0.2109, + "step": 45806 + }, + { + "epoch": 3.7108716785482825, + "grad_norm": 0.08600111305713654, + "learning_rate": 1.6062829110220983e-05, + "loss": 0.2317, + "step": 45807 + }, + { + "epoch": 3.710952689565781, + "grad_norm": 0.06662708520889282, + "learning_rate": 1.6058328457626357e-05, + "loss": 0.2328, + "step": 45808 + }, + { + "epoch": 3.7110337005832794, + "grad_norm": 0.05524429306387901, + "learning_rate": 1.605382780503173e-05, + "loss": 0.2432, + "step": 45809 + }, + { + "epoch": 3.7111147116007777, + "grad_norm": 0.07598711550235748, + "learning_rate": 1.6049327152437104e-05, + "loss": 0.2376, + "step": 45810 + }, + { + "epoch": 3.7111957226182763, + "grad_norm": 0.0858222246170044, + "learning_rate": 1.6044826499842477e-05, + "loss": 0.2052, + "step": 45811 + }, + { + "epoch": 3.7112767336357746, + "grad_norm": 0.06710031628608704, + "learning_rate": 1.6040325847247854e-05, + "loss": 0.2005, + "step": 45812 + }, + { + "epoch": 3.711357744653273, + "grad_norm": 0.07461856305599213, + "learning_rate": 1.6035825194653225e-05, + "loss": 0.2281, + "step": 45813 + }, + { + "epoch": 3.711438755670771, + "grad_norm": 0.06366579234600067, + "learning_rate": 1.6031324542058598e-05, + "loss": 0.2136, + "step": 45814 + }, + { + "epoch": 3.7115197666882693, + "grad_norm": 0.07957981526851654, + "learning_rate": 1.6026823889463975e-05, + "loss": 0.2297, + "step": 45815 + }, + { + "epoch": 3.711600777705768, + "grad_norm": 0.06554809212684631, + "learning_rate": 1.6022323236869345e-05, + "loss": 0.23, + "step": 45816 + }, + { + "epoch": 3.7116817887232663, + "grad_norm": 0.06822206825017929, + "learning_rate": 1.601782258427472e-05, + "loss": 0.204, + "step": 45817 + }, + { + "epoch": 3.7117627997407645, + "grad_norm": 0.07761096954345703, + "learning_rate": 1.6013321931680096e-05, + "loss": 0.2212, + "step": 45818 + }, + { + "epoch": 3.711843810758263, + "grad_norm": 0.05761200562119484, + "learning_rate": 1.6008821279085466e-05, + "loss": 0.2218, + "step": 45819 + }, + { + "epoch": 3.7119248217757614, + "grad_norm": 0.06800486892461777, + "learning_rate": 1.6004320626490843e-05, + "loss": 0.202, + "step": 45820 + }, + { + "epoch": 3.7120058327932597, + "grad_norm": 0.08612167090177536, + "learning_rate": 1.5999819973896217e-05, + "loss": 0.2504, + "step": 45821 + }, + { + "epoch": 3.7120868438107584, + "grad_norm": 0.0777377337217331, + "learning_rate": 1.5995319321301587e-05, + "loss": 0.2192, + "step": 45822 + }, + { + "epoch": 3.7121678548282566, + "grad_norm": 0.0771232396364212, + "learning_rate": 1.5990818668706964e-05, + "loss": 0.233, + "step": 45823 + }, + { + "epoch": 3.712248865845755, + "grad_norm": 0.0695451945066452, + "learning_rate": 1.5986318016112338e-05, + "loss": 0.2525, + "step": 45824 + }, + { + "epoch": 3.7123298768632536, + "grad_norm": 0.08157804608345032, + "learning_rate": 1.598181736351771e-05, + "loss": 0.2623, + "step": 45825 + }, + { + "epoch": 3.712410887880752, + "grad_norm": 0.07781726866960526, + "learning_rate": 1.5977316710923085e-05, + "loss": 0.2288, + "step": 45826 + }, + { + "epoch": 3.71249189889825, + "grad_norm": 0.08144915848970413, + "learning_rate": 1.597281605832846e-05, + "loss": 0.245, + "step": 45827 + }, + { + "epoch": 3.7125729099157487, + "grad_norm": 0.0604521706700325, + "learning_rate": 1.5968315405733832e-05, + "loss": 0.1962, + "step": 45828 + }, + { + "epoch": 3.712653920933247, + "grad_norm": 0.056658048182725906, + "learning_rate": 1.5963814753139206e-05, + "loss": 0.1805, + "step": 45829 + }, + { + "epoch": 3.712734931950745, + "grad_norm": 0.06701702624559402, + "learning_rate": 1.595931410054458e-05, + "loss": 0.2013, + "step": 45830 + }, + { + "epoch": 3.712815942968244, + "grad_norm": 0.07957711070775986, + "learning_rate": 1.5954813447949953e-05, + "loss": 0.2567, + "step": 45831 + }, + { + "epoch": 3.712896953985742, + "grad_norm": 0.07916118204593658, + "learning_rate": 1.5950312795355326e-05, + "loss": 0.2017, + "step": 45832 + }, + { + "epoch": 3.7129779650032404, + "grad_norm": 0.06471701711416245, + "learning_rate": 1.5945812142760703e-05, + "loss": 0.2304, + "step": 45833 + }, + { + "epoch": 3.713058976020739, + "grad_norm": 0.0795748233795166, + "learning_rate": 1.5941311490166074e-05, + "loss": 0.2586, + "step": 45834 + }, + { + "epoch": 3.7131399870382373, + "grad_norm": 0.07704074680805206, + "learning_rate": 1.5936810837571447e-05, + "loss": 0.2104, + "step": 45835 + }, + { + "epoch": 3.7132209980557356, + "grad_norm": 0.09900951385498047, + "learning_rate": 1.5932310184976824e-05, + "loss": 0.2398, + "step": 45836 + }, + { + "epoch": 3.713302009073234, + "grad_norm": 0.08793462067842484, + "learning_rate": 1.5927809532382194e-05, + "loss": 0.2122, + "step": 45837 + }, + { + "epoch": 3.713383020090732, + "grad_norm": 0.07224460691213608, + "learning_rate": 1.592330887978757e-05, + "loss": 0.2259, + "step": 45838 + }, + { + "epoch": 3.7134640311082308, + "grad_norm": 0.06400369852781296, + "learning_rate": 1.5918808227192945e-05, + "loss": 0.2213, + "step": 45839 + }, + { + "epoch": 3.713545042125729, + "grad_norm": 0.06662207841873169, + "learning_rate": 1.5914307574598315e-05, + "loss": 0.2238, + "step": 45840 + }, + { + "epoch": 3.7136260531432272, + "grad_norm": 0.07715484499931335, + "learning_rate": 1.5909806922003692e-05, + "loss": 0.2254, + "step": 45841 + }, + { + "epoch": 3.713707064160726, + "grad_norm": 0.08571289479732513, + "learning_rate": 1.5905306269409066e-05, + "loss": 0.227, + "step": 45842 + }, + { + "epoch": 3.713788075178224, + "grad_norm": 0.08624354749917984, + "learning_rate": 1.5900805616814436e-05, + "loss": 0.2444, + "step": 45843 + }, + { + "epoch": 3.7138690861957224, + "grad_norm": 0.07141070067882538, + "learning_rate": 1.5896304964219813e-05, + "loss": 0.2302, + "step": 45844 + }, + { + "epoch": 3.713950097213221, + "grad_norm": 0.07181796431541443, + "learning_rate": 1.5891804311625187e-05, + "loss": 0.2712, + "step": 45845 + }, + { + "epoch": 3.7140311082307194, + "grad_norm": 0.08645230531692505, + "learning_rate": 1.588730365903056e-05, + "loss": 0.2544, + "step": 45846 + }, + { + "epoch": 3.7141121192482176, + "grad_norm": 0.08005709946155548, + "learning_rate": 1.5882803006435934e-05, + "loss": 0.2271, + "step": 45847 + }, + { + "epoch": 3.7141931302657163, + "grad_norm": 0.06787635385990143, + "learning_rate": 1.5878302353841307e-05, + "loss": 0.2092, + "step": 45848 + }, + { + "epoch": 3.7142741412832145, + "grad_norm": 0.07872307300567627, + "learning_rate": 1.587380170124668e-05, + "loss": 0.1951, + "step": 45849 + }, + { + "epoch": 3.714355152300713, + "grad_norm": 0.06552909314632416, + "learning_rate": 1.5869301048652055e-05, + "loss": 0.2069, + "step": 45850 + }, + { + "epoch": 3.7144361633182115, + "grad_norm": 0.06676136702299118, + "learning_rate": 1.586480039605743e-05, + "loss": 0.2355, + "step": 45851 + }, + { + "epoch": 3.7145171743357097, + "grad_norm": 0.07451087981462479, + "learning_rate": 1.5860299743462802e-05, + "loss": 0.2289, + "step": 45852 + }, + { + "epoch": 3.714598185353208, + "grad_norm": 0.08217606693506241, + "learning_rate": 1.5855799090868175e-05, + "loss": 0.2305, + "step": 45853 + }, + { + "epoch": 3.7146791963707066, + "grad_norm": 0.08405117690563202, + "learning_rate": 1.5851298438273552e-05, + "loss": 0.2559, + "step": 45854 + }, + { + "epoch": 3.714760207388205, + "grad_norm": 0.049762554466724396, + "learning_rate": 1.5846797785678923e-05, + "loss": 0.229, + "step": 45855 + }, + { + "epoch": 3.714841218405703, + "grad_norm": 0.07197723537683487, + "learning_rate": 1.5842297133084296e-05, + "loss": 0.2394, + "step": 45856 + }, + { + "epoch": 3.714922229423202, + "grad_norm": 0.07479429244995117, + "learning_rate": 1.5837796480489673e-05, + "loss": 0.2368, + "step": 45857 + }, + { + "epoch": 3.7150032404407, + "grad_norm": 0.0639149472117424, + "learning_rate": 1.5833295827895044e-05, + "loss": 0.2262, + "step": 45858 + }, + { + "epoch": 3.7150842514581983, + "grad_norm": 0.06918316334486008, + "learning_rate": 1.582879517530042e-05, + "loss": 0.2128, + "step": 45859 + }, + { + "epoch": 3.7151652624756966, + "grad_norm": 0.08518219739198685, + "learning_rate": 1.5824294522705794e-05, + "loss": 0.265, + "step": 45860 + }, + { + "epoch": 3.715246273493195, + "grad_norm": 0.07202792167663574, + "learning_rate": 1.5819793870111164e-05, + "loss": 0.2217, + "step": 45861 + }, + { + "epoch": 3.7153272845106935, + "grad_norm": 0.06419474631547928, + "learning_rate": 1.581529321751654e-05, + "loss": 0.2155, + "step": 45862 + }, + { + "epoch": 3.7154082955281917, + "grad_norm": 0.07064595073461533, + "learning_rate": 1.5810792564921915e-05, + "loss": 0.1976, + "step": 45863 + }, + { + "epoch": 3.71548930654569, + "grad_norm": 0.0711565911769867, + "learning_rate": 1.580629191232729e-05, + "loss": 0.2549, + "step": 45864 + }, + { + "epoch": 3.7155703175631887, + "grad_norm": 0.06996343284845352, + "learning_rate": 1.5801791259732662e-05, + "loss": 0.1958, + "step": 45865 + }, + { + "epoch": 3.715651328580687, + "grad_norm": 0.06321109086275101, + "learning_rate": 1.5797290607138036e-05, + "loss": 0.2098, + "step": 45866 + }, + { + "epoch": 3.715732339598185, + "grad_norm": 0.07822613418102264, + "learning_rate": 1.579278995454341e-05, + "loss": 0.2079, + "step": 45867 + }, + { + "epoch": 3.715813350615684, + "grad_norm": 0.08399234712123871, + "learning_rate": 1.5788289301948783e-05, + "loss": 0.2388, + "step": 45868 + }, + { + "epoch": 3.715894361633182, + "grad_norm": 0.06259548664093018, + "learning_rate": 1.5783788649354157e-05, + "loss": 0.2342, + "step": 45869 + }, + { + "epoch": 3.7159753726506803, + "grad_norm": 0.08495364338159561, + "learning_rate": 1.577928799675953e-05, + "loss": 0.2148, + "step": 45870 + }, + { + "epoch": 3.716056383668179, + "grad_norm": 0.0643046498298645, + "learning_rate": 1.5774787344164904e-05, + "loss": 0.2099, + "step": 45871 + }, + { + "epoch": 3.7161373946856773, + "grad_norm": 0.06250204145908356, + "learning_rate": 1.577028669157028e-05, + "loss": 0.1988, + "step": 45872 + }, + { + "epoch": 3.7162184057031755, + "grad_norm": 0.0693424865603447, + "learning_rate": 1.576578603897565e-05, + "loss": 0.2287, + "step": 45873 + }, + { + "epoch": 3.716299416720674, + "grad_norm": 0.07121361792087555, + "learning_rate": 1.5761285386381025e-05, + "loss": 0.235, + "step": 45874 + }, + { + "epoch": 3.7163804277381725, + "grad_norm": 0.07043685764074326, + "learning_rate": 1.57567847337864e-05, + "loss": 0.2493, + "step": 45875 + }, + { + "epoch": 3.7164614387556707, + "grad_norm": 0.07397102564573288, + "learning_rate": 1.5752284081191772e-05, + "loss": 0.2426, + "step": 45876 + }, + { + "epoch": 3.7165424497731694, + "grad_norm": 0.0942964181303978, + "learning_rate": 1.574778342859715e-05, + "loss": 0.2565, + "step": 45877 + }, + { + "epoch": 3.7166234607906676, + "grad_norm": 0.08170833438634872, + "learning_rate": 1.5743282776002522e-05, + "loss": 0.2895, + "step": 45878 + }, + { + "epoch": 3.716704471808166, + "grad_norm": 0.07267139106988907, + "learning_rate": 1.5738782123407893e-05, + "loss": 0.22, + "step": 45879 + }, + { + "epoch": 3.7167854828256646, + "grad_norm": 0.06435095518827438, + "learning_rate": 1.573428147081327e-05, + "loss": 0.2089, + "step": 45880 + }, + { + "epoch": 3.716866493843163, + "grad_norm": 0.07378797233104706, + "learning_rate": 1.5729780818218643e-05, + "loss": 0.2211, + "step": 45881 + }, + { + "epoch": 3.716947504860661, + "grad_norm": 0.08473730087280273, + "learning_rate": 1.5725280165624017e-05, + "loss": 0.2114, + "step": 45882 + }, + { + "epoch": 3.7170285158781593, + "grad_norm": 0.06100822240114212, + "learning_rate": 1.572077951302939e-05, + "loss": 0.2234, + "step": 45883 + }, + { + "epoch": 3.7171095268956575, + "grad_norm": 0.08396106958389282, + "learning_rate": 1.5716278860434764e-05, + "loss": 0.2526, + "step": 45884 + }, + { + "epoch": 3.7171905379131562, + "grad_norm": 0.08896417915821075, + "learning_rate": 1.5711778207840138e-05, + "loss": 0.2871, + "step": 45885 + }, + { + "epoch": 3.7172715489306545, + "grad_norm": 0.06886778771877289, + "learning_rate": 1.570727755524551e-05, + "loss": 0.2309, + "step": 45886 + }, + { + "epoch": 3.7173525599481527, + "grad_norm": 0.06046053022146225, + "learning_rate": 1.5702776902650885e-05, + "loss": 0.2314, + "step": 45887 + }, + { + "epoch": 3.7174335709656514, + "grad_norm": 0.0786987841129303, + "learning_rate": 1.569827625005626e-05, + "loss": 0.2126, + "step": 45888 + }, + { + "epoch": 3.7175145819831497, + "grad_norm": 0.08876615762710571, + "learning_rate": 1.5693775597461632e-05, + "loss": 0.2403, + "step": 45889 + }, + { + "epoch": 3.717595593000648, + "grad_norm": 0.06113407388329506, + "learning_rate": 1.5689274944867006e-05, + "loss": 0.211, + "step": 45890 + }, + { + "epoch": 3.7176766040181466, + "grad_norm": 0.06730061769485474, + "learning_rate": 1.568477429227238e-05, + "loss": 0.2186, + "step": 45891 + }, + { + "epoch": 3.717757615035645, + "grad_norm": 0.07700332999229431, + "learning_rate": 1.5680273639677753e-05, + "loss": 0.2191, + "step": 45892 + }, + { + "epoch": 3.717838626053143, + "grad_norm": 0.06986545771360397, + "learning_rate": 1.567577298708313e-05, + "loss": 0.2744, + "step": 45893 + }, + { + "epoch": 3.7179196370706418, + "grad_norm": 0.07376525551080704, + "learning_rate": 1.56712723344885e-05, + "loss": 0.2353, + "step": 45894 + }, + { + "epoch": 3.71800064808814, + "grad_norm": 0.08193989098072052, + "learning_rate": 1.5666771681893874e-05, + "loss": 0.25, + "step": 45895 + }, + { + "epoch": 3.7180816591056383, + "grad_norm": 0.06663582473993301, + "learning_rate": 1.566227102929925e-05, + "loss": 0.2445, + "step": 45896 + }, + { + "epoch": 3.718162670123137, + "grad_norm": 0.08676223456859589, + "learning_rate": 1.565777037670462e-05, + "loss": 0.2503, + "step": 45897 + }, + { + "epoch": 3.718243681140635, + "grad_norm": 0.05338404327630997, + "learning_rate": 1.5653269724109998e-05, + "loss": 0.197, + "step": 45898 + }, + { + "epoch": 3.7183246921581334, + "grad_norm": 0.08339407294988632, + "learning_rate": 1.564876907151537e-05, + "loss": 0.2584, + "step": 45899 + }, + { + "epoch": 3.718405703175632, + "grad_norm": 0.08016695827245712, + "learning_rate": 1.5644268418920745e-05, + "loss": 0.2498, + "step": 45900 + }, + { + "epoch": 3.7184867141931304, + "grad_norm": 0.06563158333301544, + "learning_rate": 1.563976776632612e-05, + "loss": 0.2013, + "step": 45901 + }, + { + "epoch": 3.7185677252106286, + "grad_norm": 0.07572820782661438, + "learning_rate": 1.5635267113731492e-05, + "loss": 0.2516, + "step": 45902 + }, + { + "epoch": 3.718648736228127, + "grad_norm": 0.0666859969496727, + "learning_rate": 1.5630766461136866e-05, + "loss": 0.2378, + "step": 45903 + }, + { + "epoch": 3.7187297472456255, + "grad_norm": 0.0793486088514328, + "learning_rate": 1.562626580854224e-05, + "loss": 0.2311, + "step": 45904 + }, + { + "epoch": 3.718810758263124, + "grad_norm": 0.07341185212135315, + "learning_rate": 1.5621765155947613e-05, + "loss": 0.2338, + "step": 45905 + }, + { + "epoch": 3.718891769280622, + "grad_norm": 0.07319411635398865, + "learning_rate": 1.5617264503352987e-05, + "loss": 0.1935, + "step": 45906 + }, + { + "epoch": 3.7189727802981203, + "grad_norm": 0.0693352222442627, + "learning_rate": 1.561276385075836e-05, + "loss": 0.2155, + "step": 45907 + }, + { + "epoch": 3.719053791315619, + "grad_norm": 0.07195544987916946, + "learning_rate": 1.5608263198163734e-05, + "loss": 0.2404, + "step": 45908 + }, + { + "epoch": 3.719134802333117, + "grad_norm": 0.08030025660991669, + "learning_rate": 1.5603762545569107e-05, + "loss": 0.2222, + "step": 45909 + }, + { + "epoch": 3.7192158133506155, + "grad_norm": 0.0607793889939785, + "learning_rate": 1.559926189297448e-05, + "loss": 0.1801, + "step": 45910 + }, + { + "epoch": 3.719296824368114, + "grad_norm": 0.0874897688627243, + "learning_rate": 1.5594761240379858e-05, + "loss": 0.2549, + "step": 45911 + }, + { + "epoch": 3.7193778353856124, + "grad_norm": 0.08436858654022217, + "learning_rate": 1.5590260587785228e-05, + "loss": 0.23, + "step": 45912 + }, + { + "epoch": 3.7194588464031106, + "grad_norm": 0.06420475244522095, + "learning_rate": 1.5585759935190602e-05, + "loss": 0.216, + "step": 45913 + }, + { + "epoch": 3.7195398574206093, + "grad_norm": 0.0704539343714714, + "learning_rate": 1.558125928259598e-05, + "loss": 0.258, + "step": 45914 + }, + { + "epoch": 3.7196208684381076, + "grad_norm": 0.07884382456541061, + "learning_rate": 1.557675863000135e-05, + "loss": 0.2243, + "step": 45915 + }, + { + "epoch": 3.719701879455606, + "grad_norm": 0.07424849271774292, + "learning_rate": 1.5572257977406723e-05, + "loss": 0.2347, + "step": 45916 + }, + { + "epoch": 3.7197828904731045, + "grad_norm": 0.0690733939409256, + "learning_rate": 1.55677573248121e-05, + "loss": 0.2222, + "step": 45917 + }, + { + "epoch": 3.7198639014906028, + "grad_norm": 0.06833771616220474, + "learning_rate": 1.5563256672217473e-05, + "loss": 0.2025, + "step": 45918 + }, + { + "epoch": 3.719944912508101, + "grad_norm": 0.07035204023122787, + "learning_rate": 1.5558756019622847e-05, + "loss": 0.2337, + "step": 45919 + }, + { + "epoch": 3.7200259235255997, + "grad_norm": 0.07200058549642563, + "learning_rate": 1.555425536702822e-05, + "loss": 0.2247, + "step": 45920 + }, + { + "epoch": 3.720106934543098, + "grad_norm": 0.07681140303611755, + "learning_rate": 1.5549754714433594e-05, + "loss": 0.2033, + "step": 45921 + }, + { + "epoch": 3.720187945560596, + "grad_norm": 0.076214499771595, + "learning_rate": 1.5545254061838968e-05, + "loss": 0.237, + "step": 45922 + }, + { + "epoch": 3.720268956578095, + "grad_norm": 0.060587845742702484, + "learning_rate": 1.554075340924434e-05, + "loss": 0.2238, + "step": 45923 + }, + { + "epoch": 3.720349967595593, + "grad_norm": 0.05092794820666313, + "learning_rate": 1.5536252756649715e-05, + "loss": 0.2015, + "step": 45924 + }, + { + "epoch": 3.7204309786130914, + "grad_norm": 0.08442278951406479, + "learning_rate": 1.553175210405509e-05, + "loss": 0.2763, + "step": 45925 + }, + { + "epoch": 3.7205119896305896, + "grad_norm": 0.07512500137090683, + "learning_rate": 1.5527251451460462e-05, + "loss": 0.2318, + "step": 45926 + }, + { + "epoch": 3.7205930006480883, + "grad_norm": 0.07768537849187851, + "learning_rate": 1.5522750798865836e-05, + "loss": 0.2908, + "step": 45927 + }, + { + "epoch": 3.7206740116655865, + "grad_norm": 0.07198259234428406, + "learning_rate": 1.551825014627121e-05, + "loss": 0.2386, + "step": 45928 + }, + { + "epoch": 3.720755022683085, + "grad_norm": 0.06927481293678284, + "learning_rate": 1.5513749493676583e-05, + "loss": 0.221, + "step": 45929 + }, + { + "epoch": 3.720836033700583, + "grad_norm": 0.06755954027175903, + "learning_rate": 1.5509248841081956e-05, + "loss": 0.2161, + "step": 45930 + }, + { + "epoch": 3.7209170447180817, + "grad_norm": 0.08039725571870804, + "learning_rate": 1.550474818848733e-05, + "loss": 0.2205, + "step": 45931 + }, + { + "epoch": 3.72099805573558, + "grad_norm": 0.06722427904605865, + "learning_rate": 1.5500247535892707e-05, + "loss": 0.2134, + "step": 45932 + }, + { + "epoch": 3.721079066753078, + "grad_norm": 0.07862570881843567, + "learning_rate": 1.5495746883298077e-05, + "loss": 0.2321, + "step": 45933 + }, + { + "epoch": 3.721160077770577, + "grad_norm": 0.0632789134979248, + "learning_rate": 1.549124623070345e-05, + "loss": 0.2378, + "step": 45934 + }, + { + "epoch": 3.721241088788075, + "grad_norm": 0.06634281575679779, + "learning_rate": 1.5486745578108828e-05, + "loss": 0.2357, + "step": 45935 + }, + { + "epoch": 3.7213220998055734, + "grad_norm": 0.07613595575094223, + "learning_rate": 1.54822449255142e-05, + "loss": 0.2295, + "step": 45936 + }, + { + "epoch": 3.721403110823072, + "grad_norm": 0.0799480453133583, + "learning_rate": 1.5477744272919575e-05, + "loss": 0.2222, + "step": 45937 + }, + { + "epoch": 3.7214841218405703, + "grad_norm": 0.07500126212835312, + "learning_rate": 1.547324362032495e-05, + "loss": 0.2368, + "step": 45938 + }, + { + "epoch": 3.7215651328580686, + "grad_norm": 0.06687241792678833, + "learning_rate": 1.5468742967730322e-05, + "loss": 0.2251, + "step": 45939 + }, + { + "epoch": 3.7216461438755672, + "grad_norm": 0.06396705657243729, + "learning_rate": 1.5464242315135696e-05, + "loss": 0.2216, + "step": 45940 + }, + { + "epoch": 3.7217271548930655, + "grad_norm": 0.06962263584136963, + "learning_rate": 1.545974166254107e-05, + "loss": 0.2254, + "step": 45941 + }, + { + "epoch": 3.7218081659105637, + "grad_norm": 0.06181929260492325, + "learning_rate": 1.5455241009946443e-05, + "loss": 0.2235, + "step": 45942 + }, + { + "epoch": 3.7218891769280624, + "grad_norm": 0.07112901657819748, + "learning_rate": 1.5450740357351817e-05, + "loss": 0.2422, + "step": 45943 + }, + { + "epoch": 3.7219701879455607, + "grad_norm": 0.07527320086956024, + "learning_rate": 1.544623970475719e-05, + "loss": 0.217, + "step": 45944 + }, + { + "epoch": 3.722051198963059, + "grad_norm": 0.08394154906272888, + "learning_rate": 1.5441739052162564e-05, + "loss": 0.2236, + "step": 45945 + }, + { + "epoch": 3.7221322099805576, + "grad_norm": 0.07716701924800873, + "learning_rate": 1.5437238399567938e-05, + "loss": 0.2107, + "step": 45946 + }, + { + "epoch": 3.722213220998056, + "grad_norm": 0.06959621608257294, + "learning_rate": 1.543273774697331e-05, + "loss": 0.2397, + "step": 45947 + }, + { + "epoch": 3.722294232015554, + "grad_norm": 0.06327780336141586, + "learning_rate": 1.5428237094378685e-05, + "loss": 0.1967, + "step": 45948 + }, + { + "epoch": 3.7223752430330523, + "grad_norm": 0.0753185898065567, + "learning_rate": 1.542373644178406e-05, + "loss": 0.2441, + "step": 45949 + }, + { + "epoch": 3.722456254050551, + "grad_norm": 0.062304459512233734, + "learning_rate": 1.5419235789189435e-05, + "loss": 0.239, + "step": 45950 + }, + { + "epoch": 3.7225372650680493, + "grad_norm": 0.10284210741519928, + "learning_rate": 1.541473513659481e-05, + "loss": 0.2458, + "step": 45951 + }, + { + "epoch": 3.7226182760855475, + "grad_norm": 0.07348388433456421, + "learning_rate": 1.541023448400018e-05, + "loss": 0.2355, + "step": 45952 + }, + { + "epoch": 3.7226992871030458, + "grad_norm": 0.06129736825823784, + "learning_rate": 1.5405733831405556e-05, + "loss": 0.197, + "step": 45953 + }, + { + "epoch": 3.7227802981205445, + "grad_norm": 0.07383542507886887, + "learning_rate": 1.540123317881093e-05, + "loss": 0.2116, + "step": 45954 + }, + { + "epoch": 3.7228613091380427, + "grad_norm": 0.07537295669317245, + "learning_rate": 1.53967325262163e-05, + "loss": 0.2375, + "step": 45955 + }, + { + "epoch": 3.722942320155541, + "grad_norm": 0.06659074127674103, + "learning_rate": 1.5392231873621677e-05, + "loss": 0.2277, + "step": 45956 + }, + { + "epoch": 3.7230233311730396, + "grad_norm": 0.07395710796117783, + "learning_rate": 1.538773122102705e-05, + "loss": 0.2246, + "step": 45957 + }, + { + "epoch": 3.723104342190538, + "grad_norm": 0.07573916763067245, + "learning_rate": 1.5383230568432424e-05, + "loss": 0.2409, + "step": 45958 + }, + { + "epoch": 3.723185353208036, + "grad_norm": 0.06378524005413055, + "learning_rate": 1.5378729915837798e-05, + "loss": 0.2007, + "step": 45959 + }, + { + "epoch": 3.723266364225535, + "grad_norm": 0.0667334794998169, + "learning_rate": 1.537422926324317e-05, + "loss": 0.2533, + "step": 45960 + }, + { + "epoch": 3.723347375243033, + "grad_norm": 0.06946897506713867, + "learning_rate": 1.5369728610648545e-05, + "loss": 0.1876, + "step": 45961 + }, + { + "epoch": 3.7234283862605313, + "grad_norm": 0.06870533525943756, + "learning_rate": 1.536522795805392e-05, + "loss": 0.1848, + "step": 45962 + }, + { + "epoch": 3.72350939727803, + "grad_norm": 0.0735953152179718, + "learning_rate": 1.5360727305459292e-05, + "loss": 0.201, + "step": 45963 + }, + { + "epoch": 3.7235904082955282, + "grad_norm": 0.063729427754879, + "learning_rate": 1.5356226652864666e-05, + "loss": 0.2096, + "step": 45964 + }, + { + "epoch": 3.7236714193130265, + "grad_norm": 0.06849828362464905, + "learning_rate": 1.535172600027004e-05, + "loss": 0.2433, + "step": 45965 + }, + { + "epoch": 3.723752430330525, + "grad_norm": 0.08339733630418777, + "learning_rate": 1.5347225347675413e-05, + "loss": 0.2275, + "step": 45966 + }, + { + "epoch": 3.7238334413480234, + "grad_norm": 0.07081194967031479, + "learning_rate": 1.5342724695080787e-05, + "loss": 0.1976, + "step": 45967 + }, + { + "epoch": 3.7239144523655217, + "grad_norm": 0.08006429672241211, + "learning_rate": 1.533822404248616e-05, + "loss": 0.2303, + "step": 45968 + }, + { + "epoch": 3.7239954633830203, + "grad_norm": 0.0940973311662674, + "learning_rate": 1.5333723389891537e-05, + "loss": 0.2473, + "step": 45969 + }, + { + "epoch": 3.7240764744005186, + "grad_norm": 0.0769064798951149, + "learning_rate": 1.5329222737296907e-05, + "loss": 0.2101, + "step": 45970 + }, + { + "epoch": 3.724157485418017, + "grad_norm": 0.06387367844581604, + "learning_rate": 1.5324722084702284e-05, + "loss": 0.2787, + "step": 45971 + }, + { + "epoch": 3.724238496435515, + "grad_norm": 0.07406259328126907, + "learning_rate": 1.5320221432107658e-05, + "loss": 0.234, + "step": 45972 + }, + { + "epoch": 3.7243195074530138, + "grad_norm": 0.06665657460689545, + "learning_rate": 1.5315720779513028e-05, + "loss": 0.2568, + "step": 45973 + }, + { + "epoch": 3.724400518470512, + "grad_norm": 0.0778389424085617, + "learning_rate": 1.5311220126918405e-05, + "loss": 0.2208, + "step": 45974 + }, + { + "epoch": 3.7244815294880103, + "grad_norm": 0.06476572155952454, + "learning_rate": 1.530671947432378e-05, + "loss": 0.2047, + "step": 45975 + }, + { + "epoch": 3.7245625405055085, + "grad_norm": 0.07310286909341812, + "learning_rate": 1.530221882172915e-05, + "loss": 0.2068, + "step": 45976 + }, + { + "epoch": 3.724643551523007, + "grad_norm": 0.06835059821605682, + "learning_rate": 1.5297718169134526e-05, + "loss": 0.2077, + "step": 45977 + }, + { + "epoch": 3.7247245625405054, + "grad_norm": 0.08028055727481842, + "learning_rate": 1.52932175165399e-05, + "loss": 0.2292, + "step": 45978 + }, + { + "epoch": 3.7248055735580037, + "grad_norm": 0.06749919801950455, + "learning_rate": 1.5288716863945273e-05, + "loss": 0.1826, + "step": 45979 + }, + { + "epoch": 3.7248865845755024, + "grad_norm": 0.05876849219202995, + "learning_rate": 1.5284216211350647e-05, + "loss": 0.2335, + "step": 45980 + }, + { + "epoch": 3.7249675955930006, + "grad_norm": 0.07642512768507004, + "learning_rate": 1.527971555875602e-05, + "loss": 0.2121, + "step": 45981 + }, + { + "epoch": 3.725048606610499, + "grad_norm": 0.06255366653203964, + "learning_rate": 1.5275214906161394e-05, + "loss": 0.2031, + "step": 45982 + }, + { + "epoch": 3.7251296176279975, + "grad_norm": 0.06796018779277802, + "learning_rate": 1.5270714253566768e-05, + "loss": 0.2456, + "step": 45983 + }, + { + "epoch": 3.725210628645496, + "grad_norm": 0.0734555795788765, + "learning_rate": 1.526621360097214e-05, + "loss": 0.2187, + "step": 45984 + }, + { + "epoch": 3.725291639662994, + "grad_norm": 0.06981045752763748, + "learning_rate": 1.5261712948377515e-05, + "loss": 0.2197, + "step": 45985 + }, + { + "epoch": 3.7253726506804927, + "grad_norm": 0.07305343449115753, + "learning_rate": 1.5257212295782888e-05, + "loss": 0.2002, + "step": 45986 + }, + { + "epoch": 3.725453661697991, + "grad_norm": 0.08334986120462418, + "learning_rate": 1.5252711643188264e-05, + "loss": 0.2253, + "step": 45987 + }, + { + "epoch": 3.725534672715489, + "grad_norm": 0.08052529394626617, + "learning_rate": 1.5248210990593637e-05, + "loss": 0.2338, + "step": 45988 + }, + { + "epoch": 3.725615683732988, + "grad_norm": 0.08272536844015121, + "learning_rate": 1.524371033799901e-05, + "loss": 0.2334, + "step": 45989 + }, + { + "epoch": 3.725696694750486, + "grad_norm": 0.07033362984657288, + "learning_rate": 1.5239209685404385e-05, + "loss": 0.2335, + "step": 45990 + }, + { + "epoch": 3.7257777057679844, + "grad_norm": 0.06358537077903748, + "learning_rate": 1.5234709032809758e-05, + "loss": 0.2542, + "step": 45991 + }, + { + "epoch": 3.725858716785483, + "grad_norm": 0.08354093134403229, + "learning_rate": 1.5230208380215133e-05, + "loss": 0.2444, + "step": 45992 + }, + { + "epoch": 3.7259397278029813, + "grad_norm": 0.07609385251998901, + "learning_rate": 1.5225707727620505e-05, + "loss": 0.2156, + "step": 45993 + }, + { + "epoch": 3.7260207388204796, + "grad_norm": 0.07945896685123444, + "learning_rate": 1.5221207075025879e-05, + "loss": 0.2156, + "step": 45994 + }, + { + "epoch": 3.726101749837978, + "grad_norm": 0.07363783568143845, + "learning_rate": 1.5216706422431254e-05, + "loss": 0.2019, + "step": 45995 + }, + { + "epoch": 3.7261827608554765, + "grad_norm": 0.07394561171531677, + "learning_rate": 1.5212205769836626e-05, + "loss": 0.1975, + "step": 45996 + }, + { + "epoch": 3.7262637718729748, + "grad_norm": 0.0713035985827446, + "learning_rate": 1.5207705117242001e-05, + "loss": 0.2476, + "step": 45997 + }, + { + "epoch": 3.726344782890473, + "grad_norm": 0.06830364465713501, + "learning_rate": 1.5203204464647375e-05, + "loss": 0.2395, + "step": 45998 + }, + { + "epoch": 3.7264257939079712, + "grad_norm": 0.07303628325462341, + "learning_rate": 1.5198703812052747e-05, + "loss": 0.2143, + "step": 45999 + }, + { + "epoch": 3.72650680492547, + "grad_norm": 0.07435938715934753, + "learning_rate": 1.5194203159458122e-05, + "loss": 0.2648, + "step": 46000 + }, + { + "epoch": 3.726587815942968, + "grad_norm": 0.0673922747373581, + "learning_rate": 1.5189702506863496e-05, + "loss": 0.2256, + "step": 46001 + }, + { + "epoch": 3.7266688269604664, + "grad_norm": 0.06162109971046448, + "learning_rate": 1.5185201854268868e-05, + "loss": 0.2254, + "step": 46002 + }, + { + "epoch": 3.726749837977965, + "grad_norm": 0.10011584311723709, + "learning_rate": 1.5180701201674243e-05, + "loss": 0.2665, + "step": 46003 + }, + { + "epoch": 3.7268308489954634, + "grad_norm": 0.07240945100784302, + "learning_rate": 1.5176200549079617e-05, + "loss": 0.2359, + "step": 46004 + }, + { + "epoch": 3.7269118600129616, + "grad_norm": 0.07601157575845718, + "learning_rate": 1.5171699896484992e-05, + "loss": 0.2338, + "step": 46005 + }, + { + "epoch": 3.7269928710304603, + "grad_norm": 0.08337153494358063, + "learning_rate": 1.5167199243890366e-05, + "loss": 0.2743, + "step": 46006 + }, + { + "epoch": 3.7270738820479585, + "grad_norm": 0.07271520048379898, + "learning_rate": 1.5162698591295738e-05, + "loss": 0.2244, + "step": 46007 + }, + { + "epoch": 3.7271548930654568, + "grad_norm": 0.07792726159095764, + "learning_rate": 1.5158197938701113e-05, + "loss": 0.247, + "step": 46008 + }, + { + "epoch": 3.7272359040829555, + "grad_norm": 0.07396295666694641, + "learning_rate": 1.5153697286106486e-05, + "loss": 0.265, + "step": 46009 + }, + { + "epoch": 3.7273169151004537, + "grad_norm": 0.06522729247808456, + "learning_rate": 1.5149196633511862e-05, + "loss": 0.2406, + "step": 46010 + }, + { + "epoch": 3.727397926117952, + "grad_norm": 0.07633328437805176, + "learning_rate": 1.5144695980917234e-05, + "loss": 0.2499, + "step": 46011 + }, + { + "epoch": 3.7274789371354506, + "grad_norm": 0.07195275276899338, + "learning_rate": 1.5140195328322607e-05, + "loss": 0.2147, + "step": 46012 + }, + { + "epoch": 3.727559948152949, + "grad_norm": 0.07411804050207138, + "learning_rate": 1.5135694675727983e-05, + "loss": 0.2065, + "step": 46013 + }, + { + "epoch": 3.727640959170447, + "grad_norm": 0.06602322310209274, + "learning_rate": 1.5131194023133354e-05, + "loss": 0.2006, + "step": 46014 + }, + { + "epoch": 3.727721970187946, + "grad_norm": 0.07067742198705673, + "learning_rate": 1.5126693370538728e-05, + "loss": 0.2145, + "step": 46015 + }, + { + "epoch": 3.727802981205444, + "grad_norm": 0.06092274561524391, + "learning_rate": 1.5122192717944103e-05, + "loss": 0.2087, + "step": 46016 + }, + { + "epoch": 3.7278839922229423, + "grad_norm": 0.07934263348579407, + "learning_rate": 1.5117692065349475e-05, + "loss": 0.2166, + "step": 46017 + }, + { + "epoch": 3.7279650032404406, + "grad_norm": 0.07236896455287933, + "learning_rate": 1.511319141275485e-05, + "loss": 0.2496, + "step": 46018 + }, + { + "epoch": 3.7280460142579392, + "grad_norm": 0.07020833343267441, + "learning_rate": 1.5108690760160224e-05, + "loss": 0.2294, + "step": 46019 + }, + { + "epoch": 3.7281270252754375, + "grad_norm": 0.0577675886452198, + "learning_rate": 1.5104190107565596e-05, + "loss": 0.2161, + "step": 46020 + }, + { + "epoch": 3.7282080362929357, + "grad_norm": 0.06877872347831726, + "learning_rate": 1.5099689454970973e-05, + "loss": 0.2433, + "step": 46021 + }, + { + "epoch": 3.728289047310434, + "grad_norm": 0.05976353585720062, + "learning_rate": 1.5095188802376345e-05, + "loss": 0.2133, + "step": 46022 + }, + { + "epoch": 3.7283700583279327, + "grad_norm": 0.07601095736026764, + "learning_rate": 1.509068814978172e-05, + "loss": 0.2678, + "step": 46023 + }, + { + "epoch": 3.728451069345431, + "grad_norm": 0.09052573144435883, + "learning_rate": 1.5086187497187094e-05, + "loss": 0.2468, + "step": 46024 + }, + { + "epoch": 3.728532080362929, + "grad_norm": 0.0643225833773613, + "learning_rate": 1.5081686844592466e-05, + "loss": 0.2259, + "step": 46025 + }, + { + "epoch": 3.728613091380428, + "grad_norm": 0.07924170792102814, + "learning_rate": 1.5077186191997841e-05, + "loss": 0.2399, + "step": 46026 + }, + { + "epoch": 3.728694102397926, + "grad_norm": 0.07029829919338226, + "learning_rate": 1.5072685539403215e-05, + "loss": 0.2177, + "step": 46027 + }, + { + "epoch": 3.7287751134154243, + "grad_norm": 0.07222612202167511, + "learning_rate": 1.5068184886808587e-05, + "loss": 0.2465, + "step": 46028 + }, + { + "epoch": 3.728856124432923, + "grad_norm": 0.08449197560548782, + "learning_rate": 1.5063684234213962e-05, + "loss": 0.2017, + "step": 46029 + }, + { + "epoch": 3.7289371354504213, + "grad_norm": 0.05938245728611946, + "learning_rate": 1.5059183581619335e-05, + "loss": 0.2076, + "step": 46030 + }, + { + "epoch": 3.7290181464679195, + "grad_norm": 0.07024046778678894, + "learning_rate": 1.505468292902471e-05, + "loss": 0.2041, + "step": 46031 + }, + { + "epoch": 3.729099157485418, + "grad_norm": 0.07162220031023026, + "learning_rate": 1.5050182276430083e-05, + "loss": 0.2144, + "step": 46032 + }, + { + "epoch": 3.7291801685029164, + "grad_norm": 0.06670884788036346, + "learning_rate": 1.5045681623835456e-05, + "loss": 0.2002, + "step": 46033 + }, + { + "epoch": 3.7292611795204147, + "grad_norm": 0.0706491693854332, + "learning_rate": 1.5041180971240832e-05, + "loss": 0.1878, + "step": 46034 + }, + { + "epoch": 3.7293421905379134, + "grad_norm": 0.07656066864728928, + "learning_rate": 1.5036680318646203e-05, + "loss": 0.1964, + "step": 46035 + }, + { + "epoch": 3.7294232015554116, + "grad_norm": 0.07184910029172897, + "learning_rate": 1.5032179666051579e-05, + "loss": 0.2506, + "step": 46036 + }, + { + "epoch": 3.72950421257291, + "grad_norm": 0.07077179104089737, + "learning_rate": 1.5027679013456952e-05, + "loss": 0.208, + "step": 46037 + }, + { + "epoch": 3.7295852235904086, + "grad_norm": 0.0829695537686348, + "learning_rate": 1.5023178360862324e-05, + "loss": 0.229, + "step": 46038 + }, + { + "epoch": 3.729666234607907, + "grad_norm": 0.06951703131198883, + "learning_rate": 1.5018677708267701e-05, + "loss": 0.2148, + "step": 46039 + }, + { + "epoch": 3.729747245625405, + "grad_norm": 0.0725255087018013, + "learning_rate": 1.5014177055673073e-05, + "loss": 0.2519, + "step": 46040 + }, + { + "epoch": 3.7298282566429033, + "grad_norm": 0.06751804798841476, + "learning_rate": 1.5009676403078445e-05, + "loss": 0.223, + "step": 46041 + }, + { + "epoch": 3.7299092676604015, + "grad_norm": 0.08554006367921829, + "learning_rate": 1.5005175750483822e-05, + "loss": 0.2673, + "step": 46042 + }, + { + "epoch": 3.7299902786779002, + "grad_norm": 0.06487467885017395, + "learning_rate": 1.5000675097889194e-05, + "loss": 0.2243, + "step": 46043 + }, + { + "epoch": 3.7300712896953985, + "grad_norm": 0.08155498653650284, + "learning_rate": 1.499617444529457e-05, + "loss": 0.2768, + "step": 46044 + }, + { + "epoch": 3.7301523007128967, + "grad_norm": 0.06040952354669571, + "learning_rate": 1.4991673792699943e-05, + "loss": 0.2139, + "step": 46045 + }, + { + "epoch": 3.7302333117303954, + "grad_norm": 0.07658139616250992, + "learning_rate": 1.4987173140105315e-05, + "loss": 0.2356, + "step": 46046 + }, + { + "epoch": 3.7303143227478937, + "grad_norm": 0.08683796226978302, + "learning_rate": 1.498267248751069e-05, + "loss": 0.2153, + "step": 46047 + }, + { + "epoch": 3.730395333765392, + "grad_norm": 0.08210854977369308, + "learning_rate": 1.4978171834916064e-05, + "loss": 0.2168, + "step": 46048 + }, + { + "epoch": 3.7304763447828906, + "grad_norm": 0.08197548985481262, + "learning_rate": 1.4973671182321436e-05, + "loss": 0.2389, + "step": 46049 + }, + { + "epoch": 3.730557355800389, + "grad_norm": 0.06574133038520813, + "learning_rate": 1.4969170529726811e-05, + "loss": 0.2428, + "step": 46050 + }, + { + "epoch": 3.730638366817887, + "grad_norm": 0.082312673330307, + "learning_rate": 1.4964669877132185e-05, + "loss": 0.2285, + "step": 46051 + }, + { + "epoch": 3.7307193778353858, + "grad_norm": 0.0676484927535057, + "learning_rate": 1.496016922453756e-05, + "loss": 0.2057, + "step": 46052 + }, + { + "epoch": 3.730800388852884, + "grad_norm": 0.06505704671144485, + "learning_rate": 1.4955668571942932e-05, + "loss": 0.2257, + "step": 46053 + }, + { + "epoch": 3.7308813998703823, + "grad_norm": 0.0626121386885643, + "learning_rate": 1.4951167919348305e-05, + "loss": 0.2512, + "step": 46054 + }, + { + "epoch": 3.730962410887881, + "grad_norm": 0.07555672526359558, + "learning_rate": 1.494666726675368e-05, + "loss": 0.1947, + "step": 46055 + }, + { + "epoch": 3.731043421905379, + "grad_norm": 0.05888738855719566, + "learning_rate": 1.4942166614159053e-05, + "loss": 0.208, + "step": 46056 + }, + { + "epoch": 3.7311244329228774, + "grad_norm": 0.0674942135810852, + "learning_rate": 1.493766596156443e-05, + "loss": 0.2275, + "step": 46057 + }, + { + "epoch": 3.731205443940376, + "grad_norm": 0.05916551873087883, + "learning_rate": 1.4933165308969801e-05, + "loss": 0.1929, + "step": 46058 + }, + { + "epoch": 3.7312864549578744, + "grad_norm": 0.08539033681154251, + "learning_rate": 1.4928664656375175e-05, + "loss": 0.2194, + "step": 46059 + }, + { + "epoch": 3.7313674659753726, + "grad_norm": 0.06798946112394333, + "learning_rate": 1.492416400378055e-05, + "loss": 0.2181, + "step": 46060 + }, + { + "epoch": 3.7314484769928713, + "grad_norm": 0.06484483182430267, + "learning_rate": 1.4919663351185922e-05, + "loss": 0.2043, + "step": 46061 + }, + { + "epoch": 3.7315294880103695, + "grad_norm": 0.0604698583483696, + "learning_rate": 1.4915162698591296e-05, + "loss": 0.1975, + "step": 46062 + }, + { + "epoch": 3.731610499027868, + "grad_norm": 0.0822327584028244, + "learning_rate": 1.4910662045996671e-05, + "loss": 0.2609, + "step": 46063 + }, + { + "epoch": 3.731691510045366, + "grad_norm": 0.09345957636833191, + "learning_rate": 1.4906161393402043e-05, + "loss": 0.2302, + "step": 46064 + }, + { + "epoch": 3.7317725210628643, + "grad_norm": 0.06493161618709564, + "learning_rate": 1.4901660740807418e-05, + "loss": 0.1933, + "step": 46065 + }, + { + "epoch": 3.731853532080363, + "grad_norm": 0.0702953115105629, + "learning_rate": 1.4897160088212792e-05, + "loss": 0.1897, + "step": 46066 + }, + { + "epoch": 3.731934543097861, + "grad_norm": 0.06461524218320847, + "learning_rate": 1.4892659435618164e-05, + "loss": 0.2407, + "step": 46067 + }, + { + "epoch": 3.7320155541153595, + "grad_norm": 0.08654788136482239, + "learning_rate": 1.488815878302354e-05, + "loss": 0.236, + "step": 46068 + }, + { + "epoch": 3.732096565132858, + "grad_norm": 0.0883837565779686, + "learning_rate": 1.4883658130428913e-05, + "loss": 0.2058, + "step": 46069 + }, + { + "epoch": 3.7321775761503564, + "grad_norm": 0.06732512265443802, + "learning_rate": 1.4879157477834288e-05, + "loss": 0.2224, + "step": 46070 + }, + { + "epoch": 3.7322585871678546, + "grad_norm": 0.06484067440032959, + "learning_rate": 1.487465682523966e-05, + "loss": 0.2419, + "step": 46071 + }, + { + "epoch": 3.7323395981853533, + "grad_norm": 0.0646497905254364, + "learning_rate": 1.4870156172645034e-05, + "loss": 0.1876, + "step": 46072 + }, + { + "epoch": 3.7324206092028516, + "grad_norm": 0.101152203977108, + "learning_rate": 1.4865655520050409e-05, + "loss": 0.2442, + "step": 46073 + }, + { + "epoch": 3.73250162022035, + "grad_norm": 0.08255450427532196, + "learning_rate": 1.486115486745578e-05, + "loss": 0.2307, + "step": 46074 + }, + { + "epoch": 3.7325826312378485, + "grad_norm": 0.08730298280715942, + "learning_rate": 1.4856654214861154e-05, + "loss": 0.2408, + "step": 46075 + }, + { + "epoch": 3.7326636422553467, + "grad_norm": 0.0731593444943428, + "learning_rate": 1.485215356226653e-05, + "loss": 0.1948, + "step": 46076 + }, + { + "epoch": 3.732744653272845, + "grad_norm": 0.07262025028467178, + "learning_rate": 1.4847652909671903e-05, + "loss": 0.2244, + "step": 46077 + }, + { + "epoch": 3.7328256642903437, + "grad_norm": 0.07813366502523422, + "learning_rate": 1.4843152257077279e-05, + "loss": 0.2623, + "step": 46078 + }, + { + "epoch": 3.732906675307842, + "grad_norm": 0.06445509940385818, + "learning_rate": 1.483865160448265e-05, + "loss": 0.2185, + "step": 46079 + }, + { + "epoch": 3.73298768632534, + "grad_norm": 0.07868556678295135, + "learning_rate": 1.4834150951888024e-05, + "loss": 0.212, + "step": 46080 + }, + { + "epoch": 3.733068697342839, + "grad_norm": 0.0866679698228836, + "learning_rate": 1.48296502992934e-05, + "loss": 0.2424, + "step": 46081 + }, + { + "epoch": 3.733149708360337, + "grad_norm": 0.06397400051355362, + "learning_rate": 1.4825149646698771e-05, + "loss": 0.2272, + "step": 46082 + }, + { + "epoch": 3.7332307193778353, + "grad_norm": 0.07194807380437851, + "learning_rate": 1.4820648994104147e-05, + "loss": 0.2261, + "step": 46083 + }, + { + "epoch": 3.733311730395334, + "grad_norm": 0.06789196282625198, + "learning_rate": 1.481614834150952e-05, + "loss": 0.2246, + "step": 46084 + }, + { + "epoch": 3.7333927414128323, + "grad_norm": 0.07250261306762695, + "learning_rate": 1.4811647688914892e-05, + "loss": 0.2306, + "step": 46085 + }, + { + "epoch": 3.7334737524303305, + "grad_norm": 0.10532908141613007, + "learning_rate": 1.4807147036320267e-05, + "loss": 0.2482, + "step": 46086 + }, + { + "epoch": 3.7335547634478288, + "grad_norm": 0.062470242381095886, + "learning_rate": 1.4802646383725641e-05, + "loss": 0.1862, + "step": 46087 + }, + { + "epoch": 3.733635774465327, + "grad_norm": 0.07458098232746124, + "learning_rate": 1.4798145731131013e-05, + "loss": 0.2172, + "step": 46088 + }, + { + "epoch": 3.7337167854828257, + "grad_norm": 0.07555191963911057, + "learning_rate": 1.4793645078536388e-05, + "loss": 0.2453, + "step": 46089 + }, + { + "epoch": 3.733797796500324, + "grad_norm": 0.08217816799879074, + "learning_rate": 1.4789144425941762e-05, + "loss": 0.2386, + "step": 46090 + }, + { + "epoch": 3.733878807517822, + "grad_norm": 0.07929562777280807, + "learning_rate": 1.4784643773347137e-05, + "loss": 0.2048, + "step": 46091 + }, + { + "epoch": 3.733959818535321, + "grad_norm": 0.09434565156698227, + "learning_rate": 1.4780143120752509e-05, + "loss": 0.238, + "step": 46092 + }, + { + "epoch": 3.734040829552819, + "grad_norm": 0.07045267522335052, + "learning_rate": 1.4775642468157883e-05, + "loss": 0.2308, + "step": 46093 + }, + { + "epoch": 3.7341218405703174, + "grad_norm": 0.0724666640162468, + "learning_rate": 1.4771141815563258e-05, + "loss": 0.1918, + "step": 46094 + }, + { + "epoch": 3.734202851587816, + "grad_norm": 0.08512706309556961, + "learning_rate": 1.4766641162968632e-05, + "loss": 0.2463, + "step": 46095 + }, + { + "epoch": 3.7342838626053143, + "grad_norm": 0.06601796299219131, + "learning_rate": 1.4762140510374007e-05, + "loss": 0.2152, + "step": 46096 + }, + { + "epoch": 3.7343648736228126, + "grad_norm": 0.0734667181968689, + "learning_rate": 1.4757639857779379e-05, + "loss": 0.2308, + "step": 46097 + }, + { + "epoch": 3.7344458846403112, + "grad_norm": 0.076157346367836, + "learning_rate": 1.4753139205184752e-05, + "loss": 0.1982, + "step": 46098 + }, + { + "epoch": 3.7345268956578095, + "grad_norm": 0.06739217787981033, + "learning_rate": 1.4748638552590128e-05, + "loss": 0.267, + "step": 46099 + }, + { + "epoch": 3.7346079066753077, + "grad_norm": 0.06588942557573318, + "learning_rate": 1.47441378999955e-05, + "loss": 0.2426, + "step": 46100 + }, + { + "epoch": 3.7346889176928064, + "grad_norm": 0.08635196089744568, + "learning_rate": 1.4739637247400873e-05, + "loss": 0.27, + "step": 46101 + }, + { + "epoch": 3.7347699287103047, + "grad_norm": 0.06514254957437515, + "learning_rate": 1.4735136594806248e-05, + "loss": 0.2243, + "step": 46102 + }, + { + "epoch": 3.734850939727803, + "grad_norm": 0.06313839554786682, + "learning_rate": 1.473063594221162e-05, + "loss": 0.2219, + "step": 46103 + }, + { + "epoch": 3.7349319507453016, + "grad_norm": 0.06508518010377884, + "learning_rate": 1.4726135289616996e-05, + "loss": 0.2105, + "step": 46104 + }, + { + "epoch": 3.7350129617628, + "grad_norm": 0.06646306067705154, + "learning_rate": 1.472163463702237e-05, + "loss": 0.2061, + "step": 46105 + }, + { + "epoch": 3.735093972780298, + "grad_norm": 0.07501812279224396, + "learning_rate": 1.4717133984427741e-05, + "loss": 0.2194, + "step": 46106 + }, + { + "epoch": 3.7351749837977968, + "grad_norm": 0.08135095983743668, + "learning_rate": 1.4712633331833116e-05, + "loss": 0.2341, + "step": 46107 + }, + { + "epoch": 3.735255994815295, + "grad_norm": 0.07120555639266968, + "learning_rate": 1.470813267923849e-05, + "loss": 0.23, + "step": 46108 + }, + { + "epoch": 3.7353370058327933, + "grad_norm": 0.05750159174203873, + "learning_rate": 1.4703632026643865e-05, + "loss": 0.2264, + "step": 46109 + }, + { + "epoch": 3.7354180168502915, + "grad_norm": 0.060754384845495224, + "learning_rate": 1.4699131374049237e-05, + "loss": 0.2099, + "step": 46110 + }, + { + "epoch": 3.7354990278677898, + "grad_norm": 0.08677901327610016, + "learning_rate": 1.4694630721454611e-05, + "loss": 0.2165, + "step": 46111 + }, + { + "epoch": 3.7355800388852884, + "grad_norm": 0.07808344811201096, + "learning_rate": 1.4690130068859986e-05, + "loss": 0.2626, + "step": 46112 + }, + { + "epoch": 3.7356610499027867, + "grad_norm": 0.08093435317277908, + "learning_rate": 1.468562941626536e-05, + "loss": 0.2381, + "step": 46113 + }, + { + "epoch": 3.735742060920285, + "grad_norm": 0.07375194877386093, + "learning_rate": 1.4681128763670732e-05, + "loss": 0.196, + "step": 46114 + }, + { + "epoch": 3.7358230719377836, + "grad_norm": 0.07342427223920822, + "learning_rate": 1.4676628111076107e-05, + "loss": 0.1924, + "step": 46115 + }, + { + "epoch": 3.735904082955282, + "grad_norm": 0.06829115003347397, + "learning_rate": 1.467212745848148e-05, + "loss": 0.2165, + "step": 46116 + }, + { + "epoch": 3.73598509397278, + "grad_norm": 0.07790973037481308, + "learning_rate": 1.4667626805886856e-05, + "loss": 0.2195, + "step": 46117 + }, + { + "epoch": 3.736066104990279, + "grad_norm": 0.06781540811061859, + "learning_rate": 1.4663126153292228e-05, + "loss": 0.1872, + "step": 46118 + }, + { + "epoch": 3.736147116007777, + "grad_norm": 0.08341260254383087, + "learning_rate": 1.4658625500697601e-05, + "loss": 0.2518, + "step": 46119 + }, + { + "epoch": 3.7362281270252753, + "grad_norm": 0.08755706995725632, + "learning_rate": 1.4654124848102977e-05, + "loss": 0.268, + "step": 46120 + }, + { + "epoch": 3.736309138042774, + "grad_norm": 0.08430002629756927, + "learning_rate": 1.4649624195508349e-05, + "loss": 0.2045, + "step": 46121 + }, + { + "epoch": 3.7363901490602722, + "grad_norm": 0.09913341701030731, + "learning_rate": 1.4645123542913724e-05, + "loss": 0.2489, + "step": 46122 + }, + { + "epoch": 3.7364711600777705, + "grad_norm": 0.08157473057508469, + "learning_rate": 1.4640622890319098e-05, + "loss": 0.2011, + "step": 46123 + }, + { + "epoch": 3.736552171095269, + "grad_norm": 0.06985532492399216, + "learning_rate": 1.463612223772447e-05, + "loss": 0.1987, + "step": 46124 + }, + { + "epoch": 3.7366331821127674, + "grad_norm": 0.0689232274889946, + "learning_rate": 1.4631621585129845e-05, + "loss": 0.1943, + "step": 46125 + }, + { + "epoch": 3.7367141931302656, + "grad_norm": 0.07660727947950363, + "learning_rate": 1.4627120932535218e-05, + "loss": 0.2355, + "step": 46126 + }, + { + "epoch": 3.7367952041477643, + "grad_norm": 0.07977120578289032, + "learning_rate": 1.462262027994059e-05, + "loss": 0.2234, + "step": 46127 + }, + { + "epoch": 3.7368762151652626, + "grad_norm": 0.09145716577768326, + "learning_rate": 1.4618119627345967e-05, + "loss": 0.2729, + "step": 46128 + }, + { + "epoch": 3.736957226182761, + "grad_norm": 0.06459056586027145, + "learning_rate": 1.4613618974751339e-05, + "loss": 0.2561, + "step": 46129 + }, + { + "epoch": 3.737038237200259, + "grad_norm": 0.07937449961900711, + "learning_rate": 1.4609118322156714e-05, + "loss": 0.2047, + "step": 46130 + }, + { + "epoch": 3.7371192482177578, + "grad_norm": 0.08180252462625504, + "learning_rate": 1.4604617669562088e-05, + "loss": 0.2194, + "step": 46131 + }, + { + "epoch": 3.737200259235256, + "grad_norm": 0.06420495361089706, + "learning_rate": 1.460011701696746e-05, + "loss": 0.1774, + "step": 46132 + }, + { + "epoch": 3.7372812702527543, + "grad_norm": 0.0610925629734993, + "learning_rate": 1.4595616364372835e-05, + "loss": 0.191, + "step": 46133 + }, + { + "epoch": 3.7373622812702525, + "grad_norm": 0.06337723135948181, + "learning_rate": 1.4591115711778209e-05, + "loss": 0.2092, + "step": 46134 + }, + { + "epoch": 3.737443292287751, + "grad_norm": 0.0656128078699112, + "learning_rate": 1.458661505918358e-05, + "loss": 0.2142, + "step": 46135 + }, + { + "epoch": 3.7375243033052494, + "grad_norm": 0.0756414532661438, + "learning_rate": 1.4582114406588956e-05, + "loss": 0.2215, + "step": 46136 + }, + { + "epoch": 3.7376053143227477, + "grad_norm": 0.06451795995235443, + "learning_rate": 1.457761375399433e-05, + "loss": 0.2521, + "step": 46137 + }, + { + "epoch": 3.7376863253402464, + "grad_norm": 0.07743828743696213, + "learning_rate": 1.4573113101399705e-05, + "loss": 0.2147, + "step": 46138 + }, + { + "epoch": 3.7377673363577446, + "grad_norm": 0.0701700821518898, + "learning_rate": 1.4568612448805077e-05, + "loss": 0.2088, + "step": 46139 + }, + { + "epoch": 3.737848347375243, + "grad_norm": 0.08126933127641678, + "learning_rate": 1.456411179621045e-05, + "loss": 0.251, + "step": 46140 + }, + { + "epoch": 3.7379293583927415, + "grad_norm": 0.07062458992004395, + "learning_rate": 1.4559611143615826e-05, + "loss": 0.2112, + "step": 46141 + }, + { + "epoch": 3.73801036941024, + "grad_norm": 0.08064697682857513, + "learning_rate": 1.4555110491021198e-05, + "loss": 0.2212, + "step": 46142 + }, + { + "epoch": 3.738091380427738, + "grad_norm": 0.06433752179145813, + "learning_rate": 1.4550609838426573e-05, + "loss": 0.2224, + "step": 46143 + }, + { + "epoch": 3.7381723914452367, + "grad_norm": 0.0687873363494873, + "learning_rate": 1.4546109185831947e-05, + "loss": 0.2246, + "step": 46144 + }, + { + "epoch": 3.738253402462735, + "grad_norm": 0.06918583065271378, + "learning_rate": 1.4541608533237318e-05, + "loss": 0.211, + "step": 46145 + }, + { + "epoch": 3.738334413480233, + "grad_norm": 0.07868772000074387, + "learning_rate": 1.4537107880642695e-05, + "loss": 0.2275, + "step": 46146 + }, + { + "epoch": 3.738415424497732, + "grad_norm": 0.06755036860704422, + "learning_rate": 1.4532607228048067e-05, + "loss": 0.195, + "step": 46147 + }, + { + "epoch": 3.73849643551523, + "grad_norm": 0.06591461598873138, + "learning_rate": 1.452810657545344e-05, + "loss": 0.1961, + "step": 46148 + }, + { + "epoch": 3.7385774465327284, + "grad_norm": 0.090169258415699, + "learning_rate": 1.4523605922858816e-05, + "loss": 0.243, + "step": 46149 + }, + { + "epoch": 3.738658457550227, + "grad_norm": 0.07051538676023483, + "learning_rate": 1.4519105270264188e-05, + "loss": 0.1963, + "step": 46150 + }, + { + "epoch": 3.7387394685677253, + "grad_norm": 0.08839746564626694, + "learning_rate": 1.4514604617669563e-05, + "loss": 0.2587, + "step": 46151 + }, + { + "epoch": 3.7388204795852236, + "grad_norm": 0.06953133642673492, + "learning_rate": 1.4510103965074937e-05, + "loss": 0.1979, + "step": 46152 + }, + { + "epoch": 3.738901490602722, + "grad_norm": 0.08543260395526886, + "learning_rate": 1.4505603312480309e-05, + "loss": 0.2256, + "step": 46153 + }, + { + "epoch": 3.7389825016202205, + "grad_norm": 0.06941500306129456, + "learning_rate": 1.4501102659885684e-05, + "loss": 0.2061, + "step": 46154 + }, + { + "epoch": 3.7390635126377187, + "grad_norm": 0.05090836063027382, + "learning_rate": 1.4496602007291058e-05, + "loss": 0.2119, + "step": 46155 + }, + { + "epoch": 3.739144523655217, + "grad_norm": 0.06198856979608536, + "learning_rate": 1.4492101354696433e-05, + "loss": 0.2057, + "step": 46156 + }, + { + "epoch": 3.7392255346727152, + "grad_norm": 0.07832913100719452, + "learning_rate": 1.4487600702101805e-05, + "loss": 0.1959, + "step": 46157 + }, + { + "epoch": 3.739306545690214, + "grad_norm": 0.07277149707078934, + "learning_rate": 1.4483100049507179e-05, + "loss": 0.2088, + "step": 46158 + }, + { + "epoch": 3.739387556707712, + "grad_norm": 0.08063270896673203, + "learning_rate": 1.4478599396912554e-05, + "loss": 0.2205, + "step": 46159 + }, + { + "epoch": 3.7394685677252104, + "grad_norm": 0.058787040412425995, + "learning_rate": 1.4474098744317926e-05, + "loss": 0.1775, + "step": 46160 + }, + { + "epoch": 3.739549578742709, + "grad_norm": 0.0893818661570549, + "learning_rate": 1.44695980917233e-05, + "loss": 0.2183, + "step": 46161 + }, + { + "epoch": 3.7396305897602073, + "grad_norm": 0.0777546837925911, + "learning_rate": 1.4465097439128675e-05, + "loss": 0.2185, + "step": 46162 + }, + { + "epoch": 3.7397116007777056, + "grad_norm": 0.0717715248465538, + "learning_rate": 1.4460596786534047e-05, + "loss": 0.2058, + "step": 46163 + }, + { + "epoch": 3.7397926117952043, + "grad_norm": 0.07632574439048767, + "learning_rate": 1.4456096133939424e-05, + "loss": 0.2519, + "step": 46164 + }, + { + "epoch": 3.7398736228127025, + "grad_norm": 0.07262930274009705, + "learning_rate": 1.4451595481344796e-05, + "loss": 0.2087, + "step": 46165 + }, + { + "epoch": 3.7399546338302008, + "grad_norm": 0.07834277302026749, + "learning_rate": 1.444709482875017e-05, + "loss": 0.2157, + "step": 46166 + }, + { + "epoch": 3.7400356448476995, + "grad_norm": 0.07401188462972641, + "learning_rate": 1.4442594176155545e-05, + "loss": 0.2323, + "step": 46167 + }, + { + "epoch": 3.7401166558651977, + "grad_norm": 0.06235014274716377, + "learning_rate": 1.4438093523560916e-05, + "loss": 0.2128, + "step": 46168 + }, + { + "epoch": 3.740197666882696, + "grad_norm": 0.09011727571487427, + "learning_rate": 1.4433592870966292e-05, + "loss": 0.251, + "step": 46169 + }, + { + "epoch": 3.7402786779001946, + "grad_norm": 0.07866238057613373, + "learning_rate": 1.4429092218371665e-05, + "loss": 0.2425, + "step": 46170 + }, + { + "epoch": 3.740359688917693, + "grad_norm": 0.06134997680783272, + "learning_rate": 1.4424591565777037e-05, + "loss": 0.191, + "step": 46171 + }, + { + "epoch": 3.740440699935191, + "grad_norm": 0.05997680500149727, + "learning_rate": 1.4420090913182413e-05, + "loss": 0.2054, + "step": 46172 + }, + { + "epoch": 3.74052171095269, + "grad_norm": 0.06824682652950287, + "learning_rate": 1.4415590260587786e-05, + "loss": 0.2271, + "step": 46173 + }, + { + "epoch": 3.740602721970188, + "grad_norm": 0.062017668038606644, + "learning_rate": 1.4411089607993158e-05, + "loss": 0.2037, + "step": 46174 + }, + { + "epoch": 3.7406837329876863, + "grad_norm": 0.06739827990531921, + "learning_rate": 1.4406588955398533e-05, + "loss": 0.2437, + "step": 46175 + }, + { + "epoch": 3.7407647440051845, + "grad_norm": 0.0841369777917862, + "learning_rate": 1.4402088302803907e-05, + "loss": 0.1901, + "step": 46176 + }, + { + "epoch": 3.7408457550226832, + "grad_norm": 0.06553324311971664, + "learning_rate": 1.4397587650209282e-05, + "loss": 0.211, + "step": 46177 + }, + { + "epoch": 3.7409267660401815, + "grad_norm": 0.07525978982448578, + "learning_rate": 1.4393086997614654e-05, + "loss": 0.2238, + "step": 46178 + }, + { + "epoch": 3.7410077770576797, + "grad_norm": 0.07539505511522293, + "learning_rate": 1.4388586345020028e-05, + "loss": 0.211, + "step": 46179 + }, + { + "epoch": 3.741088788075178, + "grad_norm": 0.063744455575943, + "learning_rate": 1.4384085692425403e-05, + "loss": 0.2007, + "step": 46180 + }, + { + "epoch": 3.7411697990926767, + "grad_norm": 0.07325825095176697, + "learning_rate": 1.4379585039830775e-05, + "loss": 0.2334, + "step": 46181 + }, + { + "epoch": 3.741250810110175, + "grad_norm": 0.07868387550115585, + "learning_rate": 1.4375084387236152e-05, + "loss": 0.2157, + "step": 46182 + }, + { + "epoch": 3.741331821127673, + "grad_norm": 0.07169875502586365, + "learning_rate": 1.4370583734641524e-05, + "loss": 0.2132, + "step": 46183 + }, + { + "epoch": 3.741412832145172, + "grad_norm": 0.07317327708005905, + "learning_rate": 1.4366083082046897e-05, + "loss": 0.253, + "step": 46184 + }, + { + "epoch": 3.74149384316267, + "grad_norm": 0.06858827918767929, + "learning_rate": 1.4361582429452273e-05, + "loss": 0.2038, + "step": 46185 + }, + { + "epoch": 3.7415748541801683, + "grad_norm": 0.05808888003230095, + "learning_rate": 1.4357081776857645e-05, + "loss": 0.1893, + "step": 46186 + }, + { + "epoch": 3.741655865197667, + "grad_norm": 0.06428666412830353, + "learning_rate": 1.4352581124263018e-05, + "loss": 0.2119, + "step": 46187 + }, + { + "epoch": 3.7417368762151653, + "grad_norm": 0.07074768841266632, + "learning_rate": 1.4348080471668394e-05, + "loss": 0.2288, + "step": 46188 + }, + { + "epoch": 3.7418178872326635, + "grad_norm": 0.08114055544137955, + "learning_rate": 1.4343579819073765e-05, + "loss": 0.2495, + "step": 46189 + }, + { + "epoch": 3.741898898250162, + "grad_norm": 0.07321310043334961, + "learning_rate": 1.433907916647914e-05, + "loss": 0.2476, + "step": 46190 + }, + { + "epoch": 3.7419799092676604, + "grad_norm": 0.08425623923540115, + "learning_rate": 1.4334578513884514e-05, + "loss": 0.2302, + "step": 46191 + }, + { + "epoch": 3.7420609202851587, + "grad_norm": 0.06360992789268494, + "learning_rate": 1.4330077861289886e-05, + "loss": 0.2127, + "step": 46192 + }, + { + "epoch": 3.7421419313026574, + "grad_norm": 0.07066339999437332, + "learning_rate": 1.4325577208695262e-05, + "loss": 0.2265, + "step": 46193 + }, + { + "epoch": 3.7422229423201556, + "grad_norm": 0.07735880464315414, + "learning_rate": 1.4321076556100635e-05, + "loss": 0.1989, + "step": 46194 + }, + { + "epoch": 3.742303953337654, + "grad_norm": 0.07494205236434937, + "learning_rate": 1.431657590350601e-05, + "loss": 0.2466, + "step": 46195 + }, + { + "epoch": 3.7423849643551526, + "grad_norm": 0.07548823207616806, + "learning_rate": 1.4312075250911382e-05, + "loss": 0.25, + "step": 46196 + }, + { + "epoch": 3.742465975372651, + "grad_norm": 0.0775391012430191, + "learning_rate": 1.4307574598316756e-05, + "loss": 0.2154, + "step": 46197 + }, + { + "epoch": 3.742546986390149, + "grad_norm": 0.06699874252080917, + "learning_rate": 1.4303073945722131e-05, + "loss": 0.2235, + "step": 46198 + }, + { + "epoch": 3.7426279974076473, + "grad_norm": 0.07778064161539078, + "learning_rate": 1.4298573293127503e-05, + "loss": 0.2105, + "step": 46199 + }, + { + "epoch": 3.742709008425146, + "grad_norm": 0.06393593549728394, + "learning_rate": 1.4294072640532877e-05, + "loss": 0.1742, + "step": 46200 + }, + { + "epoch": 3.7427900194426442, + "grad_norm": 0.0789911299943924, + "learning_rate": 1.4289571987938252e-05, + "loss": 0.2344, + "step": 46201 + }, + { + "epoch": 3.7428710304601425, + "grad_norm": 0.07193242013454437, + "learning_rate": 1.4285071335343626e-05, + "loss": 0.1964, + "step": 46202 + }, + { + "epoch": 3.7429520414776407, + "grad_norm": 0.04981725290417671, + "learning_rate": 1.4280570682749001e-05, + "loss": 0.1949, + "step": 46203 + }, + { + "epoch": 3.7430330524951394, + "grad_norm": 0.05438205972313881, + "learning_rate": 1.4276070030154373e-05, + "loss": 0.1834, + "step": 46204 + }, + { + "epoch": 3.7431140635126376, + "grad_norm": 0.07782084494829178, + "learning_rate": 1.4271569377559747e-05, + "loss": 0.2375, + "step": 46205 + }, + { + "epoch": 3.743195074530136, + "grad_norm": 0.08021578937768936, + "learning_rate": 1.4267068724965122e-05, + "loss": 0.2317, + "step": 46206 + }, + { + "epoch": 3.7432760855476346, + "grad_norm": 0.06510843336582184, + "learning_rate": 1.4262568072370494e-05, + "loss": 0.1998, + "step": 46207 + }, + { + "epoch": 3.743357096565133, + "grad_norm": 0.08877057582139969, + "learning_rate": 1.4258067419775867e-05, + "loss": 0.2367, + "step": 46208 + }, + { + "epoch": 3.743438107582631, + "grad_norm": 0.06314340233802795, + "learning_rate": 1.4253566767181243e-05, + "loss": 0.2043, + "step": 46209 + }, + { + "epoch": 3.7435191186001298, + "grad_norm": 0.06458821147680283, + "learning_rate": 1.4249066114586615e-05, + "loss": 0.1905, + "step": 46210 + }, + { + "epoch": 3.743600129617628, + "grad_norm": 0.054313499480485916, + "learning_rate": 1.424456546199199e-05, + "loss": 0.2063, + "step": 46211 + }, + { + "epoch": 3.7436811406351262, + "grad_norm": 0.059741366654634476, + "learning_rate": 1.4240064809397363e-05, + "loss": 0.1971, + "step": 46212 + }, + { + "epoch": 3.743762151652625, + "grad_norm": 0.07452462613582611, + "learning_rate": 1.4235564156802735e-05, + "loss": 0.2196, + "step": 46213 + }, + { + "epoch": 3.743843162670123, + "grad_norm": 0.07014549523591995, + "learning_rate": 1.423106350420811e-05, + "loss": 0.2512, + "step": 46214 + }, + { + "epoch": 3.7439241736876214, + "grad_norm": 0.08021154999732971, + "learning_rate": 1.4226562851613484e-05, + "loss": 0.2251, + "step": 46215 + }, + { + "epoch": 3.74400518470512, + "grad_norm": 0.06592398136854172, + "learning_rate": 1.422206219901886e-05, + "loss": 0.2422, + "step": 46216 + }, + { + "epoch": 3.7440861957226184, + "grad_norm": 0.0847722515463829, + "learning_rate": 1.4217561546424233e-05, + "loss": 0.2682, + "step": 46217 + }, + { + "epoch": 3.7441672067401166, + "grad_norm": 0.07960616797208786, + "learning_rate": 1.4213060893829605e-05, + "loss": 0.2091, + "step": 46218 + }, + { + "epoch": 3.7442482177576153, + "grad_norm": 0.06076245382428169, + "learning_rate": 1.420856024123498e-05, + "loss": 0.2446, + "step": 46219 + }, + { + "epoch": 3.7443292287751135, + "grad_norm": 0.07363170385360718, + "learning_rate": 1.4204059588640354e-05, + "loss": 0.218, + "step": 46220 + }, + { + "epoch": 3.744410239792612, + "grad_norm": 0.062311843037605286, + "learning_rate": 1.4199558936045726e-05, + "loss": 0.2067, + "step": 46221 + }, + { + "epoch": 3.74449125081011, + "grad_norm": 0.05965553969144821, + "learning_rate": 1.4195058283451101e-05, + "loss": 0.218, + "step": 46222 + }, + { + "epoch": 3.7445722618276087, + "grad_norm": 0.08739668875932693, + "learning_rate": 1.4190557630856475e-05, + "loss": 0.2213, + "step": 46223 + }, + { + "epoch": 3.744653272845107, + "grad_norm": 0.08150233328342438, + "learning_rate": 1.418605697826185e-05, + "loss": 0.2578, + "step": 46224 + }, + { + "epoch": 3.744734283862605, + "grad_norm": 0.09309550374746323, + "learning_rate": 1.4181556325667222e-05, + "loss": 0.2188, + "step": 46225 + }, + { + "epoch": 3.7448152948801035, + "grad_norm": 0.0847107321023941, + "learning_rate": 1.4177055673072596e-05, + "loss": 0.227, + "step": 46226 + }, + { + "epoch": 3.744896305897602, + "grad_norm": 0.06954275071620941, + "learning_rate": 1.4172555020477971e-05, + "loss": 0.2429, + "step": 46227 + }, + { + "epoch": 3.7449773169151004, + "grad_norm": 0.0696917250752449, + "learning_rate": 1.4168054367883343e-05, + "loss": 0.2489, + "step": 46228 + }, + { + "epoch": 3.7450583279325986, + "grad_norm": 0.08773817121982574, + "learning_rate": 1.4163553715288718e-05, + "loss": 0.2053, + "step": 46229 + }, + { + "epoch": 3.7451393389500973, + "grad_norm": 0.06404940038919449, + "learning_rate": 1.4159053062694092e-05, + "loss": 0.2722, + "step": 46230 + }, + { + "epoch": 3.7452203499675956, + "grad_norm": 0.06850332766771317, + "learning_rate": 1.4154552410099464e-05, + "loss": 0.2227, + "step": 46231 + }, + { + "epoch": 3.745301360985094, + "grad_norm": 0.06635698676109314, + "learning_rate": 1.4150051757504839e-05, + "loss": 0.1951, + "step": 46232 + }, + { + "epoch": 3.7453823720025925, + "grad_norm": 0.0643756315112114, + "learning_rate": 1.4145551104910213e-05, + "loss": 0.2088, + "step": 46233 + }, + { + "epoch": 3.7454633830200907, + "grad_norm": 0.08907685428857803, + "learning_rate": 1.4141050452315584e-05, + "loss": 0.2359, + "step": 46234 + }, + { + "epoch": 3.745544394037589, + "grad_norm": 0.07513679563999176, + "learning_rate": 1.4136549799720961e-05, + "loss": 0.1924, + "step": 46235 + }, + { + "epoch": 3.7456254050550877, + "grad_norm": 0.06546442955732346, + "learning_rate": 1.4132049147126333e-05, + "loss": 0.2136, + "step": 46236 + }, + { + "epoch": 3.745706416072586, + "grad_norm": 0.0697525292634964, + "learning_rate": 1.4127548494531709e-05, + "loss": 0.2123, + "step": 46237 + }, + { + "epoch": 3.745787427090084, + "grad_norm": 0.07150597870349884, + "learning_rate": 1.4123047841937082e-05, + "loss": 0.1814, + "step": 46238 + }, + { + "epoch": 3.745868438107583, + "grad_norm": 0.08265596628189087, + "learning_rate": 1.4118547189342454e-05, + "loss": 0.2069, + "step": 46239 + }, + { + "epoch": 3.745949449125081, + "grad_norm": 0.06267092376947403, + "learning_rate": 1.411404653674783e-05, + "loss": 0.2089, + "step": 46240 + }, + { + "epoch": 3.7460304601425793, + "grad_norm": 0.06582377105951309, + "learning_rate": 1.4109545884153203e-05, + "loss": 0.265, + "step": 46241 + }, + { + "epoch": 3.746111471160078, + "grad_norm": 0.061006009578704834, + "learning_rate": 1.4105045231558578e-05, + "loss": 0.2243, + "step": 46242 + }, + { + "epoch": 3.7461924821775763, + "grad_norm": 0.06815444678068161, + "learning_rate": 1.410054457896395e-05, + "loss": 0.2261, + "step": 46243 + }, + { + "epoch": 3.7462734931950745, + "grad_norm": 0.06948699802160263, + "learning_rate": 1.4096043926369324e-05, + "loss": 0.2476, + "step": 46244 + }, + { + "epoch": 3.7463545042125728, + "grad_norm": 0.06912975758314133, + "learning_rate": 1.4091543273774699e-05, + "loss": 0.2313, + "step": 46245 + }, + { + "epoch": 3.7464355152300715, + "grad_norm": 0.06383688002824783, + "learning_rate": 1.4087042621180071e-05, + "loss": 0.2016, + "step": 46246 + }, + { + "epoch": 3.7465165262475697, + "grad_norm": 0.08401155471801758, + "learning_rate": 1.4082541968585445e-05, + "loss": 0.2159, + "step": 46247 + }, + { + "epoch": 3.746597537265068, + "grad_norm": 0.06746657192707062, + "learning_rate": 1.407804131599082e-05, + "loss": 0.2241, + "step": 46248 + }, + { + "epoch": 3.746678548282566, + "grad_norm": 0.08890854567289352, + "learning_rate": 1.4073540663396192e-05, + "loss": 0.2303, + "step": 46249 + }, + { + "epoch": 3.746759559300065, + "grad_norm": 0.06370741128921509, + "learning_rate": 1.4069040010801567e-05, + "loss": 0.2286, + "step": 46250 + }, + { + "epoch": 3.746840570317563, + "grad_norm": 0.05881880968809128, + "learning_rate": 1.406453935820694e-05, + "loss": 0.195, + "step": 46251 + }, + { + "epoch": 3.7469215813350614, + "grad_norm": 0.06718897074460983, + "learning_rate": 1.4060038705612313e-05, + "loss": 0.2372, + "step": 46252 + }, + { + "epoch": 3.74700259235256, + "grad_norm": 0.06468194723129272, + "learning_rate": 1.405553805301769e-05, + "loss": 0.1888, + "step": 46253 + }, + { + "epoch": 3.7470836033700583, + "grad_norm": 0.06398289650678635, + "learning_rate": 1.4051037400423062e-05, + "loss": 0.1908, + "step": 46254 + }, + { + "epoch": 3.7471646143875565, + "grad_norm": 0.06329668313264847, + "learning_rate": 1.4046536747828437e-05, + "loss": 0.2213, + "step": 46255 + }, + { + "epoch": 3.7472456254050552, + "grad_norm": 0.07826821506023407, + "learning_rate": 1.404203609523381e-05, + "loss": 0.1977, + "step": 46256 + }, + { + "epoch": 3.7473266364225535, + "grad_norm": 0.0786462053656578, + "learning_rate": 1.4037535442639182e-05, + "loss": 0.2486, + "step": 46257 + }, + { + "epoch": 3.7474076474400517, + "grad_norm": 0.07066361606121063, + "learning_rate": 1.4033034790044558e-05, + "loss": 0.2057, + "step": 46258 + }, + { + "epoch": 3.7474886584575504, + "grad_norm": 0.07975254207849503, + "learning_rate": 1.4028534137449931e-05, + "loss": 0.2555, + "step": 46259 + }, + { + "epoch": 3.7475696694750487, + "grad_norm": 0.08178477734327316, + "learning_rate": 1.4024033484855303e-05, + "loss": 0.2112, + "step": 46260 + }, + { + "epoch": 3.747650680492547, + "grad_norm": 0.07174079120159149, + "learning_rate": 1.4019532832260678e-05, + "loss": 0.1987, + "step": 46261 + }, + { + "epoch": 3.7477316915100456, + "grad_norm": 0.06191396713256836, + "learning_rate": 1.4015032179666052e-05, + "loss": 0.2372, + "step": 46262 + }, + { + "epoch": 3.747812702527544, + "grad_norm": 0.0650116503238678, + "learning_rate": 1.4010531527071427e-05, + "loss": 0.2416, + "step": 46263 + }, + { + "epoch": 3.747893713545042, + "grad_norm": 0.07233452051877975, + "learning_rate": 1.40060308744768e-05, + "loss": 0.2519, + "step": 46264 + }, + { + "epoch": 3.7479747245625408, + "grad_norm": 0.06753069162368774, + "learning_rate": 1.4001530221882173e-05, + "loss": 0.1924, + "step": 46265 + }, + { + "epoch": 3.748055735580039, + "grad_norm": 0.08413957059383392, + "learning_rate": 1.3997029569287548e-05, + "loss": 0.2399, + "step": 46266 + }, + { + "epoch": 3.7481367465975373, + "grad_norm": 0.07612863928079605, + "learning_rate": 1.399252891669292e-05, + "loss": 0.1912, + "step": 46267 + }, + { + "epoch": 3.7482177576150355, + "grad_norm": 0.06556925177574158, + "learning_rate": 1.3988028264098297e-05, + "loss": 0.2088, + "step": 46268 + }, + { + "epoch": 3.7482987686325338, + "grad_norm": 0.07448197156190872, + "learning_rate": 1.3983527611503669e-05, + "loss": 0.2145, + "step": 46269 + }, + { + "epoch": 3.7483797796500324, + "grad_norm": 0.07443884760141373, + "learning_rate": 1.3979026958909041e-05, + "loss": 0.1992, + "step": 46270 + }, + { + "epoch": 3.7484607906675307, + "grad_norm": 0.07081007212400436, + "learning_rate": 1.3974526306314418e-05, + "loss": 0.2568, + "step": 46271 + }, + { + "epoch": 3.748541801685029, + "grad_norm": 0.06729474663734436, + "learning_rate": 1.397002565371979e-05, + "loss": 0.1957, + "step": 46272 + }, + { + "epoch": 3.7486228127025276, + "grad_norm": 0.06812798231840134, + "learning_rate": 1.3965525001125163e-05, + "loss": 0.2147, + "step": 46273 + }, + { + "epoch": 3.748703823720026, + "grad_norm": 0.07089778780937195, + "learning_rate": 1.3961024348530539e-05, + "loss": 0.2223, + "step": 46274 + }, + { + "epoch": 3.748784834737524, + "grad_norm": 0.06887844949960709, + "learning_rate": 1.395652369593591e-05, + "loss": 0.2128, + "step": 46275 + }, + { + "epoch": 3.748865845755023, + "grad_norm": 0.0692133754491806, + "learning_rate": 1.3952023043341286e-05, + "loss": 0.2063, + "step": 46276 + }, + { + "epoch": 3.748946856772521, + "grad_norm": 0.059787359088659286, + "learning_rate": 1.394752239074666e-05, + "loss": 0.2294, + "step": 46277 + }, + { + "epoch": 3.7490278677900193, + "grad_norm": 0.055009905248880386, + "learning_rate": 1.3943021738152031e-05, + "loss": 0.194, + "step": 46278 + }, + { + "epoch": 3.749108878807518, + "grad_norm": 0.07308180630207062, + "learning_rate": 1.3938521085557407e-05, + "loss": 0.2311, + "step": 46279 + }, + { + "epoch": 3.749189889825016, + "grad_norm": 0.0746115893125534, + "learning_rate": 1.393402043296278e-05, + "loss": 0.247, + "step": 46280 + }, + { + "epoch": 3.7492709008425145, + "grad_norm": 0.058564692735672, + "learning_rate": 1.3929519780368156e-05, + "loss": 0.1998, + "step": 46281 + }, + { + "epoch": 3.749351911860013, + "grad_norm": 0.06961791962385178, + "learning_rate": 1.3925019127773528e-05, + "loss": 0.2499, + "step": 46282 + }, + { + "epoch": 3.7494329228775114, + "grad_norm": 0.06499256938695908, + "learning_rate": 1.3920518475178901e-05, + "loss": 0.213, + "step": 46283 + }, + { + "epoch": 3.7495139338950096, + "grad_norm": 0.0894896611571312, + "learning_rate": 1.3916017822584276e-05, + "loss": 0.2478, + "step": 46284 + }, + { + "epoch": 3.7495949449125083, + "grad_norm": 0.08614595234394073, + "learning_rate": 1.3911517169989648e-05, + "loss": 0.209, + "step": 46285 + }, + { + "epoch": 3.7496759559300066, + "grad_norm": 0.06723782420158386, + "learning_rate": 1.3907016517395022e-05, + "loss": 0.2103, + "step": 46286 + }, + { + "epoch": 3.749756966947505, + "grad_norm": 0.07532931864261627, + "learning_rate": 1.3902515864800397e-05, + "loss": 0.2022, + "step": 46287 + }, + { + "epoch": 3.7498379779650035, + "grad_norm": 0.08021144568920135, + "learning_rate": 1.389801521220577e-05, + "loss": 0.2217, + "step": 46288 + }, + { + "epoch": 3.7499189889825018, + "grad_norm": 0.07499111443758011, + "learning_rate": 1.3893514559611146e-05, + "loss": 0.2636, + "step": 46289 + }, + { + "epoch": 3.75, + "grad_norm": 0.10113779455423355, + "learning_rate": 1.3889013907016518e-05, + "loss": 0.2517, + "step": 46290 + }, + { + "epoch": 3.7500810110174982, + "grad_norm": 0.05415019765496254, + "learning_rate": 1.3884513254421892e-05, + "loss": 0.2003, + "step": 46291 + }, + { + "epoch": 3.7501620220349965, + "grad_norm": 0.06942694634199142, + "learning_rate": 1.3880012601827267e-05, + "loss": 0.2233, + "step": 46292 + }, + { + "epoch": 3.750243033052495, + "grad_norm": 0.07195769250392914, + "learning_rate": 1.3875511949232639e-05, + "loss": 0.1826, + "step": 46293 + }, + { + "epoch": 3.7503240440699934, + "grad_norm": 0.05428317189216614, + "learning_rate": 1.3871011296638012e-05, + "loss": 0.21, + "step": 46294 + }, + { + "epoch": 3.7504050550874917, + "grad_norm": 0.050274260342121124, + "learning_rate": 1.3866510644043388e-05, + "loss": 0.2175, + "step": 46295 + }, + { + "epoch": 3.7504860661049904, + "grad_norm": 0.06542641669511795, + "learning_rate": 1.386200999144876e-05, + "loss": 0.2478, + "step": 46296 + }, + { + "epoch": 3.7505670771224886, + "grad_norm": 0.07026327401399612, + "learning_rate": 1.3857509338854135e-05, + "loss": 0.2171, + "step": 46297 + }, + { + "epoch": 3.750648088139987, + "grad_norm": 0.07940451055765152, + "learning_rate": 1.3853008686259509e-05, + "loss": 0.2378, + "step": 46298 + }, + { + "epoch": 3.7507290991574855, + "grad_norm": 0.08603038638830185, + "learning_rate": 1.384850803366488e-05, + "loss": 0.2269, + "step": 46299 + }, + { + "epoch": 3.750810110174984, + "grad_norm": 0.07071874290704727, + "learning_rate": 1.3844007381070256e-05, + "loss": 0.2226, + "step": 46300 + }, + { + "epoch": 3.750891121192482, + "grad_norm": 0.0803099051117897, + "learning_rate": 1.383950672847563e-05, + "loss": 0.2084, + "step": 46301 + }, + { + "epoch": 3.7509721322099807, + "grad_norm": 0.07697241753339767, + "learning_rate": 1.3835006075881005e-05, + "loss": 0.214, + "step": 46302 + }, + { + "epoch": 3.751053143227479, + "grad_norm": 0.06945455074310303, + "learning_rate": 1.3830505423286377e-05, + "loss": 0.2187, + "step": 46303 + }, + { + "epoch": 3.751134154244977, + "grad_norm": 0.0705169215798378, + "learning_rate": 1.382600477069175e-05, + "loss": 0.2639, + "step": 46304 + }, + { + "epoch": 3.751215165262476, + "grad_norm": 0.0709058865904808, + "learning_rate": 1.3821504118097126e-05, + "loss": 0.2424, + "step": 46305 + }, + { + "epoch": 3.751296176279974, + "grad_norm": 0.0878700241446495, + "learning_rate": 1.3817003465502497e-05, + "loss": 0.2605, + "step": 46306 + }, + { + "epoch": 3.7513771872974724, + "grad_norm": 0.0711369663476944, + "learning_rate": 1.3812502812907871e-05, + "loss": 0.2516, + "step": 46307 + }, + { + "epoch": 3.751458198314971, + "grad_norm": 0.07304982095956802, + "learning_rate": 1.3808002160313246e-05, + "loss": 0.2469, + "step": 46308 + }, + { + "epoch": 3.7515392093324693, + "grad_norm": 0.0640924721956253, + "learning_rate": 1.380350150771862e-05, + "loss": 0.1904, + "step": 46309 + }, + { + "epoch": 3.7516202203499676, + "grad_norm": 0.0668100118637085, + "learning_rate": 1.3799000855123995e-05, + "loss": 0.2016, + "step": 46310 + }, + { + "epoch": 3.7517012313674662, + "grad_norm": 0.0843009278178215, + "learning_rate": 1.3794500202529367e-05, + "loss": 0.229, + "step": 46311 + }, + { + "epoch": 3.7517822423849645, + "grad_norm": 0.07822394371032715, + "learning_rate": 1.378999954993474e-05, + "loss": 0.2195, + "step": 46312 + }, + { + "epoch": 3.7518632534024627, + "grad_norm": 0.07780709862709045, + "learning_rate": 1.3785498897340116e-05, + "loss": 0.2342, + "step": 46313 + }, + { + "epoch": 3.751944264419961, + "grad_norm": 0.06893333792686462, + "learning_rate": 1.3780998244745488e-05, + "loss": 0.232, + "step": 46314 + }, + { + "epoch": 3.7520252754374592, + "grad_norm": 0.06503161042928696, + "learning_rate": 1.3776497592150863e-05, + "loss": 0.24, + "step": 46315 + }, + { + "epoch": 3.752106286454958, + "grad_norm": 0.06948009878396988, + "learning_rate": 1.3771996939556237e-05, + "loss": 0.2304, + "step": 46316 + }, + { + "epoch": 3.752187297472456, + "grad_norm": 0.07049506157636642, + "learning_rate": 1.3767496286961609e-05, + "loss": 0.1865, + "step": 46317 + }, + { + "epoch": 3.7522683084899544, + "grad_norm": 0.07123062759637833, + "learning_rate": 1.3762995634366984e-05, + "loss": 0.2311, + "step": 46318 + }, + { + "epoch": 3.752349319507453, + "grad_norm": 0.07314299792051315, + "learning_rate": 1.3758494981772358e-05, + "loss": 0.236, + "step": 46319 + }, + { + "epoch": 3.7524303305249513, + "grad_norm": 0.07087218761444092, + "learning_rate": 1.375399432917773e-05, + "loss": 0.2167, + "step": 46320 + }, + { + "epoch": 3.7525113415424496, + "grad_norm": 0.09059132635593414, + "learning_rate": 1.3749493676583105e-05, + "loss": 0.2234, + "step": 46321 + }, + { + "epoch": 3.7525923525599483, + "grad_norm": 0.06358359009027481, + "learning_rate": 1.3744993023988478e-05, + "loss": 0.2308, + "step": 46322 + }, + { + "epoch": 3.7526733635774465, + "grad_norm": 0.06344583630561829, + "learning_rate": 1.3740492371393854e-05, + "loss": 0.2081, + "step": 46323 + }, + { + "epoch": 3.7527543745949448, + "grad_norm": 0.0792200043797493, + "learning_rate": 1.3735991718799227e-05, + "loss": 0.2335, + "step": 46324 + }, + { + "epoch": 3.7528353856124435, + "grad_norm": 0.06948511302471161, + "learning_rate": 1.37314910662046e-05, + "loss": 0.2181, + "step": 46325 + }, + { + "epoch": 3.7529163966299417, + "grad_norm": 0.06803888082504272, + "learning_rate": 1.3726990413609975e-05, + "loss": 0.2079, + "step": 46326 + }, + { + "epoch": 3.75299740764744, + "grad_norm": 0.06263738125562668, + "learning_rate": 1.3722489761015348e-05, + "loss": 0.2381, + "step": 46327 + }, + { + "epoch": 3.7530784186649386, + "grad_norm": 0.06731061637401581, + "learning_rate": 1.3717989108420723e-05, + "loss": 0.2008, + "step": 46328 + }, + { + "epoch": 3.753159429682437, + "grad_norm": 0.07265542447566986, + "learning_rate": 1.3713488455826095e-05, + "loss": 0.2197, + "step": 46329 + }, + { + "epoch": 3.753240440699935, + "grad_norm": 0.08350956439971924, + "learning_rate": 1.3708987803231469e-05, + "loss": 0.2395, + "step": 46330 + }, + { + "epoch": 3.753321451717434, + "grad_norm": 0.07093897461891174, + "learning_rate": 1.3704487150636844e-05, + "loss": 0.2328, + "step": 46331 + }, + { + "epoch": 3.753402462734932, + "grad_norm": 0.10086548328399658, + "learning_rate": 1.3699986498042216e-05, + "loss": 0.2574, + "step": 46332 + }, + { + "epoch": 3.7534834737524303, + "grad_norm": 0.06807972490787506, + "learning_rate": 1.369548584544759e-05, + "loss": 0.2357, + "step": 46333 + }, + { + "epoch": 3.7535644847699285, + "grad_norm": 0.09186132252216339, + "learning_rate": 1.3690985192852965e-05, + "loss": 0.2512, + "step": 46334 + }, + { + "epoch": 3.7536454957874272, + "grad_norm": 0.06752340495586395, + "learning_rate": 1.3686484540258337e-05, + "loss": 0.212, + "step": 46335 + }, + { + "epoch": 3.7537265068049255, + "grad_norm": 0.07947950065135956, + "learning_rate": 1.3681983887663712e-05, + "loss": 0.2415, + "step": 46336 + }, + { + "epoch": 3.7538075178224237, + "grad_norm": 0.07616253942251205, + "learning_rate": 1.3677483235069086e-05, + "loss": 0.2219, + "step": 46337 + }, + { + "epoch": 3.753888528839922, + "grad_norm": 0.074013851583004, + "learning_rate": 1.3672982582474458e-05, + "loss": 0.2383, + "step": 46338 + }, + { + "epoch": 3.7539695398574207, + "grad_norm": 0.07347376644611359, + "learning_rate": 1.3668481929879833e-05, + "loss": 0.1882, + "step": 46339 + }, + { + "epoch": 3.754050550874919, + "grad_norm": 0.06440410017967224, + "learning_rate": 1.3663981277285207e-05, + "loss": 0.2126, + "step": 46340 + }, + { + "epoch": 3.754131561892417, + "grad_norm": 0.061819300055503845, + "learning_rate": 1.3659480624690582e-05, + "loss": 0.2163, + "step": 46341 + }, + { + "epoch": 3.754212572909916, + "grad_norm": 0.09030622243881226, + "learning_rate": 1.3654979972095956e-05, + "loss": 0.2193, + "step": 46342 + }, + { + "epoch": 3.754293583927414, + "grad_norm": 0.06111063435673714, + "learning_rate": 1.3650479319501328e-05, + "loss": 0.2516, + "step": 46343 + }, + { + "epoch": 3.7543745949449123, + "grad_norm": 0.07280724495649338, + "learning_rate": 1.3645978666906703e-05, + "loss": 0.2058, + "step": 46344 + }, + { + "epoch": 3.754455605962411, + "grad_norm": 0.0639389231801033, + "learning_rate": 1.3641478014312076e-05, + "loss": 0.2089, + "step": 46345 + }, + { + "epoch": 3.7545366169799093, + "grad_norm": 0.08013645559549332, + "learning_rate": 1.3636977361717448e-05, + "loss": 0.2546, + "step": 46346 + }, + { + "epoch": 3.7546176279974075, + "grad_norm": 0.08095403015613556, + "learning_rate": 1.3632476709122824e-05, + "loss": 0.218, + "step": 46347 + }, + { + "epoch": 3.754698639014906, + "grad_norm": 0.06311283260583878, + "learning_rate": 1.3627976056528197e-05, + "loss": 0.2036, + "step": 46348 + }, + { + "epoch": 3.7547796500324044, + "grad_norm": 0.07423945516347885, + "learning_rate": 1.3623475403933573e-05, + "loss": 0.2177, + "step": 46349 + }, + { + "epoch": 3.7548606610499027, + "grad_norm": 0.07765256613492966, + "learning_rate": 1.3618974751338944e-05, + "loss": 0.2528, + "step": 46350 + }, + { + "epoch": 3.7549416720674014, + "grad_norm": 0.05665789544582367, + "learning_rate": 1.3614474098744318e-05, + "loss": 0.2193, + "step": 46351 + }, + { + "epoch": 3.7550226830848996, + "grad_norm": 0.07820606231689453, + "learning_rate": 1.3609973446149693e-05, + "loss": 0.2629, + "step": 46352 + }, + { + "epoch": 3.755103694102398, + "grad_norm": 0.06132517755031586, + "learning_rate": 1.3605472793555065e-05, + "loss": 0.2099, + "step": 46353 + }, + { + "epoch": 3.7551847051198965, + "grad_norm": 0.07045713812112808, + "learning_rate": 1.360097214096044e-05, + "loss": 0.2298, + "step": 46354 + }, + { + "epoch": 3.755265716137395, + "grad_norm": 0.07467834651470184, + "learning_rate": 1.3596471488365814e-05, + "loss": 0.2029, + "step": 46355 + }, + { + "epoch": 3.755346727154893, + "grad_norm": 0.07477845996618271, + "learning_rate": 1.3591970835771186e-05, + "loss": 0.2221, + "step": 46356 + }, + { + "epoch": 3.7554277381723913, + "grad_norm": 0.06774298846721649, + "learning_rate": 1.3587470183176561e-05, + "loss": 0.2328, + "step": 46357 + }, + { + "epoch": 3.75550874918989, + "grad_norm": 0.06271317601203918, + "learning_rate": 1.3582969530581935e-05, + "loss": 0.2151, + "step": 46358 + }, + { + "epoch": 3.755589760207388, + "grad_norm": 0.07417146861553192, + "learning_rate": 1.3578468877987307e-05, + "loss": 0.2502, + "step": 46359 + }, + { + "epoch": 3.7556707712248865, + "grad_norm": 0.07773631066083908, + "learning_rate": 1.3573968225392684e-05, + "loss": 0.2285, + "step": 46360 + }, + { + "epoch": 3.7557517822423847, + "grad_norm": 0.07055094838142395, + "learning_rate": 1.3569467572798056e-05, + "loss": 0.2339, + "step": 46361 + }, + { + "epoch": 3.7558327932598834, + "grad_norm": 0.07049763202667236, + "learning_rate": 1.3564966920203431e-05, + "loss": 0.2265, + "step": 46362 + }, + { + "epoch": 3.7559138042773816, + "grad_norm": 0.07336530089378357, + "learning_rate": 1.3560466267608805e-05, + "loss": 0.1979, + "step": 46363 + }, + { + "epoch": 3.75599481529488, + "grad_norm": 0.0836474746465683, + "learning_rate": 1.3555965615014177e-05, + "loss": 0.223, + "step": 46364 + }, + { + "epoch": 3.7560758263123786, + "grad_norm": 0.076979860663414, + "learning_rate": 1.3551464962419552e-05, + "loss": 0.2199, + "step": 46365 + }, + { + "epoch": 3.756156837329877, + "grad_norm": 0.05993069335818291, + "learning_rate": 1.3546964309824925e-05, + "loss": 0.2014, + "step": 46366 + }, + { + "epoch": 3.756237848347375, + "grad_norm": 0.07212058454751968, + "learning_rate": 1.35424636572303e-05, + "loss": 0.2198, + "step": 46367 + }, + { + "epoch": 3.7563188593648738, + "grad_norm": 0.06068550795316696, + "learning_rate": 1.3537963004635673e-05, + "loss": 0.2511, + "step": 46368 + }, + { + "epoch": 3.756399870382372, + "grad_norm": 0.0721498504281044, + "learning_rate": 1.3533462352041046e-05, + "loss": 0.2439, + "step": 46369 + }, + { + "epoch": 3.7564808813998702, + "grad_norm": 0.07898570597171783, + "learning_rate": 1.3528961699446422e-05, + "loss": 0.2429, + "step": 46370 + }, + { + "epoch": 3.756561892417369, + "grad_norm": 0.06375134736299515, + "learning_rate": 1.3524461046851793e-05, + "loss": 0.2478, + "step": 46371 + }, + { + "epoch": 3.756642903434867, + "grad_norm": 0.08874838799238205, + "learning_rate": 1.3519960394257167e-05, + "loss": 0.2353, + "step": 46372 + }, + { + "epoch": 3.7567239144523654, + "grad_norm": 0.07910864055156708, + "learning_rate": 1.3515459741662542e-05, + "loss": 0.2272, + "step": 46373 + }, + { + "epoch": 3.756804925469864, + "grad_norm": 0.07340925931930542, + "learning_rate": 1.3510959089067914e-05, + "loss": 0.2493, + "step": 46374 + }, + { + "epoch": 3.7568859364873624, + "grad_norm": 0.0714777335524559, + "learning_rate": 1.3506458436473291e-05, + "loss": 0.2262, + "step": 46375 + }, + { + "epoch": 3.7569669475048606, + "grad_norm": 0.08851076662540436, + "learning_rate": 1.3501957783878663e-05, + "loss": 0.2639, + "step": 46376 + }, + { + "epoch": 3.7570479585223593, + "grad_norm": 0.06556161493062973, + "learning_rate": 1.3497457131284035e-05, + "loss": 0.2123, + "step": 46377 + }, + { + "epoch": 3.7571289695398575, + "grad_norm": 0.0682496726512909, + "learning_rate": 1.3492956478689412e-05, + "loss": 0.2372, + "step": 46378 + }, + { + "epoch": 3.7572099805573558, + "grad_norm": 0.07470902800559998, + "learning_rate": 1.3488455826094784e-05, + "loss": 0.2004, + "step": 46379 + }, + { + "epoch": 3.757290991574854, + "grad_norm": 0.0711580142378807, + "learning_rate": 1.3483955173500158e-05, + "loss": 0.2295, + "step": 46380 + }, + { + "epoch": 3.7573720025923527, + "grad_norm": 0.06948163360357285, + "learning_rate": 1.3479454520905533e-05, + "loss": 0.2627, + "step": 46381 + }, + { + "epoch": 3.757453013609851, + "grad_norm": 0.08253484219312668, + "learning_rate": 1.3474953868310905e-05, + "loss": 0.2362, + "step": 46382 + }, + { + "epoch": 3.757534024627349, + "grad_norm": 0.0675487071275711, + "learning_rate": 1.347045321571628e-05, + "loss": 0.2605, + "step": 46383 + }, + { + "epoch": 3.7576150356448474, + "grad_norm": 0.0675104409456253, + "learning_rate": 1.3465952563121654e-05, + "loss": 0.1905, + "step": 46384 + }, + { + "epoch": 3.757696046662346, + "grad_norm": 0.07248274981975555, + "learning_rate": 1.3461451910527026e-05, + "loss": 0.2062, + "step": 46385 + }, + { + "epoch": 3.7577770576798444, + "grad_norm": 0.06134706735610962, + "learning_rate": 1.3456951257932401e-05, + "loss": 0.1993, + "step": 46386 + }, + { + "epoch": 3.7578580686973426, + "grad_norm": 0.07231275737285614, + "learning_rate": 1.3452450605337775e-05, + "loss": 0.2197, + "step": 46387 + }, + { + "epoch": 3.7579390797148413, + "grad_norm": 0.07492779940366745, + "learning_rate": 1.344794995274315e-05, + "loss": 0.1983, + "step": 46388 + }, + { + "epoch": 3.7580200907323396, + "grad_norm": 0.05680622532963753, + "learning_rate": 1.3443449300148522e-05, + "loss": 0.2109, + "step": 46389 + }, + { + "epoch": 3.758101101749838, + "grad_norm": 0.06529416888952255, + "learning_rate": 1.3438948647553895e-05, + "loss": 0.2125, + "step": 46390 + }, + { + "epoch": 3.7581821127673365, + "grad_norm": 0.07710691541433334, + "learning_rate": 1.343444799495927e-05, + "loss": 0.2367, + "step": 46391 + }, + { + "epoch": 3.7582631237848347, + "grad_norm": 0.0891771912574768, + "learning_rate": 1.3429947342364643e-05, + "loss": 0.2411, + "step": 46392 + }, + { + "epoch": 3.758344134802333, + "grad_norm": 0.07291162014007568, + "learning_rate": 1.3425446689770016e-05, + "loss": 0.1957, + "step": 46393 + }, + { + "epoch": 3.7584251458198317, + "grad_norm": 0.07850871235132217, + "learning_rate": 1.3420946037175391e-05, + "loss": 0.2449, + "step": 46394 + }, + { + "epoch": 3.75850615683733, + "grad_norm": 0.07013242691755295, + "learning_rate": 1.3416445384580763e-05, + "loss": 0.256, + "step": 46395 + }, + { + "epoch": 3.758587167854828, + "grad_norm": 0.09972033649682999, + "learning_rate": 1.341194473198614e-05, + "loss": 0.1975, + "step": 46396 + }, + { + "epoch": 3.758668178872327, + "grad_norm": 0.06596890836954117, + "learning_rate": 1.3407444079391512e-05, + "loss": 0.1988, + "step": 46397 + }, + { + "epoch": 3.758749189889825, + "grad_norm": 0.07399401068687439, + "learning_rate": 1.3402943426796886e-05, + "loss": 0.1893, + "step": 46398 + }, + { + "epoch": 3.7588302009073233, + "grad_norm": 0.06135550141334534, + "learning_rate": 1.3398442774202261e-05, + "loss": 0.1996, + "step": 46399 + }, + { + "epoch": 3.758911211924822, + "grad_norm": 0.06000867113471031, + "learning_rate": 1.3393942121607633e-05, + "loss": 0.1857, + "step": 46400 + }, + { + "epoch": 3.7589922229423203, + "grad_norm": 0.06590692698955536, + "learning_rate": 1.3389441469013008e-05, + "loss": 0.2164, + "step": 46401 + }, + { + "epoch": 3.7590732339598185, + "grad_norm": 0.06405100971460342, + "learning_rate": 1.3384940816418382e-05, + "loss": 0.1724, + "step": 46402 + }, + { + "epoch": 3.7591542449773168, + "grad_norm": 0.06549198925495148, + "learning_rate": 1.3380440163823754e-05, + "loss": 0.2295, + "step": 46403 + }, + { + "epoch": 3.7592352559948155, + "grad_norm": 0.06221283972263336, + "learning_rate": 1.337593951122913e-05, + "loss": 0.2328, + "step": 46404 + }, + { + "epoch": 3.7593162670123137, + "grad_norm": 0.07838263362646103, + "learning_rate": 1.3371438858634503e-05, + "loss": 0.2247, + "step": 46405 + }, + { + "epoch": 3.759397278029812, + "grad_norm": 0.07541157305240631, + "learning_rate": 1.3366938206039875e-05, + "loss": 0.2444, + "step": 46406 + }, + { + "epoch": 3.75947828904731, + "grad_norm": 0.06589043140411377, + "learning_rate": 1.336243755344525e-05, + "loss": 0.1966, + "step": 46407 + }, + { + "epoch": 3.759559300064809, + "grad_norm": 0.06893175840377808, + "learning_rate": 1.3357936900850624e-05, + "loss": 0.1866, + "step": 46408 + }, + { + "epoch": 3.759640311082307, + "grad_norm": 0.07315077632665634, + "learning_rate": 1.3353436248255999e-05, + "loss": 0.2394, + "step": 46409 + }, + { + "epoch": 3.7597213220998054, + "grad_norm": 0.06642105430364609, + "learning_rate": 1.334893559566137e-05, + "loss": 0.2194, + "step": 46410 + }, + { + "epoch": 3.759802333117304, + "grad_norm": 0.06641194224357605, + "learning_rate": 1.3344434943066744e-05, + "loss": 0.2434, + "step": 46411 + }, + { + "epoch": 3.7598833441348023, + "grad_norm": 0.07174092531204224, + "learning_rate": 1.333993429047212e-05, + "loss": 0.2248, + "step": 46412 + }, + { + "epoch": 3.7599643551523005, + "grad_norm": 0.08386538177728653, + "learning_rate": 1.3335433637877492e-05, + "loss": 0.2254, + "step": 46413 + }, + { + "epoch": 3.7600453661697992, + "grad_norm": 0.07093953341245651, + "learning_rate": 1.3330932985282869e-05, + "loss": 0.2036, + "step": 46414 + }, + { + "epoch": 3.7601263771872975, + "grad_norm": 0.06664960831403732, + "learning_rate": 1.332643233268824e-05, + "loss": 0.1924, + "step": 46415 + }, + { + "epoch": 3.7602073882047957, + "grad_norm": 0.07157678157091141, + "learning_rate": 1.3321931680093614e-05, + "loss": 0.2119, + "step": 46416 + }, + { + "epoch": 3.7602883992222944, + "grad_norm": 0.07037477195262909, + "learning_rate": 1.331743102749899e-05, + "loss": 0.1769, + "step": 46417 + }, + { + "epoch": 3.7603694102397927, + "grad_norm": 0.07439891993999481, + "learning_rate": 1.3312930374904361e-05, + "loss": 0.2255, + "step": 46418 + }, + { + "epoch": 3.760450421257291, + "grad_norm": 0.0761137455701828, + "learning_rate": 1.3308429722309735e-05, + "loss": 0.2132, + "step": 46419 + }, + { + "epoch": 3.7605314322747896, + "grad_norm": 0.061975765973329544, + "learning_rate": 1.330392906971511e-05, + "loss": 0.1905, + "step": 46420 + }, + { + "epoch": 3.760612443292288, + "grad_norm": 0.06800936907529831, + "learning_rate": 1.3299428417120482e-05, + "loss": 0.2167, + "step": 46421 + }, + { + "epoch": 3.760693454309786, + "grad_norm": 0.06478226184844971, + "learning_rate": 1.3294927764525857e-05, + "loss": 0.2169, + "step": 46422 + }, + { + "epoch": 3.7607744653272848, + "grad_norm": 0.06802545487880707, + "learning_rate": 1.3290427111931231e-05, + "loss": 0.1845, + "step": 46423 + }, + { + "epoch": 3.760855476344783, + "grad_norm": 0.09708104282617569, + "learning_rate": 1.3285926459336603e-05, + "loss": 0.2462, + "step": 46424 + }, + { + "epoch": 3.7609364873622813, + "grad_norm": 0.06333111971616745, + "learning_rate": 1.3281425806741978e-05, + "loss": 0.209, + "step": 46425 + }, + { + "epoch": 3.7610174983797795, + "grad_norm": 0.07289381325244904, + "learning_rate": 1.3276925154147352e-05, + "loss": 0.1954, + "step": 46426 + }, + { + "epoch": 3.761098509397278, + "grad_norm": 0.06891093403100967, + "learning_rate": 1.3272424501552727e-05, + "loss": 0.216, + "step": 46427 + }, + { + "epoch": 3.7611795204147764, + "grad_norm": 0.06211341172456741, + "learning_rate": 1.3267923848958099e-05, + "loss": 0.217, + "step": 46428 + }, + { + "epoch": 3.7612605314322747, + "grad_norm": 0.07642439752817154, + "learning_rate": 1.3263423196363473e-05, + "loss": 0.2142, + "step": 46429 + }, + { + "epoch": 3.761341542449773, + "grad_norm": 0.06167442724108696, + "learning_rate": 1.3258922543768848e-05, + "loss": 0.2275, + "step": 46430 + }, + { + "epoch": 3.7614225534672716, + "grad_norm": 0.0749720111489296, + "learning_rate": 1.3254421891174222e-05, + "loss": 0.2434, + "step": 46431 + }, + { + "epoch": 3.76150356448477, + "grad_norm": 0.07655184715986252, + "learning_rate": 1.3249921238579593e-05, + "loss": 0.2593, + "step": 46432 + }, + { + "epoch": 3.761584575502268, + "grad_norm": 0.07166169583797455, + "learning_rate": 1.3245420585984969e-05, + "loss": 0.2194, + "step": 46433 + }, + { + "epoch": 3.761665586519767, + "grad_norm": 0.07488848268985748, + "learning_rate": 1.3240919933390342e-05, + "loss": 0.228, + "step": 46434 + }, + { + "epoch": 3.761746597537265, + "grad_norm": 0.07291141152381897, + "learning_rate": 1.3236419280795718e-05, + "loss": 0.2271, + "step": 46435 + }, + { + "epoch": 3.7618276085547633, + "grad_norm": 0.06775379180908203, + "learning_rate": 1.323191862820109e-05, + "loss": 0.2234, + "step": 46436 + }, + { + "epoch": 3.761908619572262, + "grad_norm": 0.09158001095056534, + "learning_rate": 1.3227417975606463e-05, + "loss": 0.2341, + "step": 46437 + }, + { + "epoch": 3.76198963058976, + "grad_norm": 0.07301585376262665, + "learning_rate": 1.3222917323011838e-05, + "loss": 0.2097, + "step": 46438 + }, + { + "epoch": 3.7620706416072585, + "grad_norm": 0.06108618527650833, + "learning_rate": 1.321841667041721e-05, + "loss": 0.2379, + "step": 46439 + }, + { + "epoch": 3.762151652624757, + "grad_norm": 0.08330275863409042, + "learning_rate": 1.3213916017822586e-05, + "loss": 0.2056, + "step": 46440 + }, + { + "epoch": 3.7622326636422554, + "grad_norm": 0.08911581337451935, + "learning_rate": 1.320941536522796e-05, + "loss": 0.2901, + "step": 46441 + }, + { + "epoch": 3.7623136746597536, + "grad_norm": 0.0730244368314743, + "learning_rate": 1.3204914712633331e-05, + "loss": 0.2179, + "step": 46442 + }, + { + "epoch": 3.7623946856772523, + "grad_norm": 0.08556057512760162, + "learning_rate": 1.3200414060038706e-05, + "loss": 0.2335, + "step": 46443 + }, + { + "epoch": 3.7624756966947506, + "grad_norm": 0.07071968913078308, + "learning_rate": 1.319591340744408e-05, + "loss": 0.2253, + "step": 46444 + }, + { + "epoch": 3.762556707712249, + "grad_norm": 0.07956349104642868, + "learning_rate": 1.3191412754849452e-05, + "loss": 0.2367, + "step": 46445 + }, + { + "epoch": 3.7626377187297475, + "grad_norm": 0.06337764859199524, + "learning_rate": 1.3186912102254827e-05, + "loss": 0.1947, + "step": 46446 + }, + { + "epoch": 3.7627187297472457, + "grad_norm": 0.060510363429784775, + "learning_rate": 1.3182411449660201e-05, + "loss": 0.2314, + "step": 46447 + }, + { + "epoch": 3.762799740764744, + "grad_norm": 0.062422338873147964, + "learning_rate": 1.3177910797065576e-05, + "loss": 0.1888, + "step": 46448 + }, + { + "epoch": 3.7628807517822422, + "grad_norm": 0.07294078171253204, + "learning_rate": 1.317341014447095e-05, + "loss": 0.2152, + "step": 46449 + }, + { + "epoch": 3.762961762799741, + "grad_norm": 0.0869693011045456, + "learning_rate": 1.3168909491876322e-05, + "loss": 0.227, + "step": 46450 + }, + { + "epoch": 3.763042773817239, + "grad_norm": 0.08141772449016571, + "learning_rate": 1.3164408839281697e-05, + "loss": 0.2268, + "step": 46451 + }, + { + "epoch": 3.7631237848347374, + "grad_norm": 0.07432594150304794, + "learning_rate": 1.315990818668707e-05, + "loss": 0.1958, + "step": 46452 + }, + { + "epoch": 3.7632047958522357, + "grad_norm": 0.07830096781253815, + "learning_rate": 1.3155407534092446e-05, + "loss": 0.2131, + "step": 46453 + }, + { + "epoch": 3.7632858068697344, + "grad_norm": 0.06278984993696213, + "learning_rate": 1.3150906881497818e-05, + "loss": 0.2175, + "step": 46454 + }, + { + "epoch": 3.7633668178872326, + "grad_norm": 0.06053311750292778, + "learning_rate": 1.3146406228903191e-05, + "loss": 0.2049, + "step": 46455 + }, + { + "epoch": 3.763447828904731, + "grad_norm": 0.0874246209859848, + "learning_rate": 1.3141905576308567e-05, + "loss": 0.2286, + "step": 46456 + }, + { + "epoch": 3.7635288399222295, + "grad_norm": 0.05780855566263199, + "learning_rate": 1.3137404923713939e-05, + "loss": 0.2368, + "step": 46457 + }, + { + "epoch": 3.7636098509397278, + "grad_norm": 0.057224925607442856, + "learning_rate": 1.3132904271119312e-05, + "loss": 0.1832, + "step": 46458 + }, + { + "epoch": 3.763690861957226, + "grad_norm": 0.08486709743738174, + "learning_rate": 1.3128403618524688e-05, + "loss": 0.1981, + "step": 46459 + }, + { + "epoch": 3.7637718729747247, + "grad_norm": 0.08651462197303772, + "learning_rate": 1.312390296593006e-05, + "loss": 0.2557, + "step": 46460 + }, + { + "epoch": 3.763852883992223, + "grad_norm": 0.05747639387845993, + "learning_rate": 1.3119402313335435e-05, + "loss": 0.1794, + "step": 46461 + }, + { + "epoch": 3.763933895009721, + "grad_norm": 0.04803988337516785, + "learning_rate": 1.3114901660740808e-05, + "loss": 0.2113, + "step": 46462 + }, + { + "epoch": 3.76401490602722, + "grad_norm": 0.0615316703915596, + "learning_rate": 1.311040100814618e-05, + "loss": 0.2208, + "step": 46463 + }, + { + "epoch": 3.764095917044718, + "grad_norm": 0.09276503324508667, + "learning_rate": 1.3105900355551556e-05, + "loss": 0.2528, + "step": 46464 + }, + { + "epoch": 3.7641769280622164, + "grad_norm": 0.07706604897975922, + "learning_rate": 1.3101399702956929e-05, + "loss": 0.2107, + "step": 46465 + }, + { + "epoch": 3.764257939079715, + "grad_norm": 0.0845157578587532, + "learning_rate": 1.3096899050362301e-05, + "loss": 0.2054, + "step": 46466 + }, + { + "epoch": 3.7643389500972133, + "grad_norm": 0.08171258866786957, + "learning_rate": 1.3092398397767678e-05, + "loss": 0.2411, + "step": 46467 + }, + { + "epoch": 3.7644199611147116, + "grad_norm": 0.08288677036762238, + "learning_rate": 1.308789774517305e-05, + "loss": 0.206, + "step": 46468 + }, + { + "epoch": 3.7645009721322102, + "grad_norm": 0.06667889654636383, + "learning_rate": 1.3083397092578425e-05, + "loss": 0.2449, + "step": 46469 + }, + { + "epoch": 3.7645819831497085, + "grad_norm": 0.060472164303064346, + "learning_rate": 1.3078896439983799e-05, + "loss": 0.2135, + "step": 46470 + }, + { + "epoch": 3.7646629941672067, + "grad_norm": 0.07328839600086212, + "learning_rate": 1.307439578738917e-05, + "loss": 0.2034, + "step": 46471 + }, + { + "epoch": 3.764744005184705, + "grad_norm": 0.06965433806180954, + "learning_rate": 1.3069895134794546e-05, + "loss": 0.1813, + "step": 46472 + }, + { + "epoch": 3.7648250162022032, + "grad_norm": 0.08212985843420029, + "learning_rate": 1.306539448219992e-05, + "loss": 0.2399, + "step": 46473 + }, + { + "epoch": 3.764906027219702, + "grad_norm": 0.06601189821958542, + "learning_rate": 1.3060893829605295e-05, + "loss": 0.202, + "step": 46474 + }, + { + "epoch": 3.7649870382372, + "grad_norm": 0.08131560683250427, + "learning_rate": 1.3056393177010667e-05, + "loss": 0.2358, + "step": 46475 + }, + { + "epoch": 3.7650680492546984, + "grad_norm": 0.07608696073293686, + "learning_rate": 1.305189252441604e-05, + "loss": 0.1951, + "step": 46476 + }, + { + "epoch": 3.765149060272197, + "grad_norm": 0.07890737056732178, + "learning_rate": 1.3047391871821416e-05, + "loss": 0.2582, + "step": 46477 + }, + { + "epoch": 3.7652300712896953, + "grad_norm": 0.09459563344717026, + "learning_rate": 1.3042891219226788e-05, + "loss": 0.2385, + "step": 46478 + }, + { + "epoch": 3.7653110823071936, + "grad_norm": 0.06472989171743393, + "learning_rate": 1.3038390566632161e-05, + "loss": 0.1975, + "step": 46479 + }, + { + "epoch": 3.7653920933246923, + "grad_norm": 0.10067077726125717, + "learning_rate": 1.3033889914037537e-05, + "loss": 0.2349, + "step": 46480 + }, + { + "epoch": 3.7654731043421905, + "grad_norm": 0.07255268096923828, + "learning_rate": 1.3029389261442908e-05, + "loss": 0.2272, + "step": 46481 + }, + { + "epoch": 3.7655541153596888, + "grad_norm": 0.06430191546678543, + "learning_rate": 1.3024888608848285e-05, + "loss": 0.1932, + "step": 46482 + }, + { + "epoch": 3.7656351263771874, + "grad_norm": 0.08290670067071915, + "learning_rate": 1.3020387956253657e-05, + "loss": 0.2751, + "step": 46483 + }, + { + "epoch": 3.7657161373946857, + "grad_norm": 0.0648224726319313, + "learning_rate": 1.301588730365903e-05, + "loss": 0.2181, + "step": 46484 + }, + { + "epoch": 3.765797148412184, + "grad_norm": 0.07298358529806137, + "learning_rate": 1.3011386651064406e-05, + "loss": 0.2469, + "step": 46485 + }, + { + "epoch": 3.7658781594296826, + "grad_norm": 0.07706646621227264, + "learning_rate": 1.3006885998469778e-05, + "loss": 0.2174, + "step": 46486 + }, + { + "epoch": 3.765959170447181, + "grad_norm": 0.07118535041809082, + "learning_rate": 1.3002385345875153e-05, + "loss": 0.223, + "step": 46487 + }, + { + "epoch": 3.766040181464679, + "grad_norm": 0.09811242669820786, + "learning_rate": 1.2997884693280527e-05, + "loss": 0.2232, + "step": 46488 + }, + { + "epoch": 3.766121192482178, + "grad_norm": 0.0811871886253357, + "learning_rate": 1.2993384040685899e-05, + "loss": 0.2519, + "step": 46489 + }, + { + "epoch": 3.766202203499676, + "grad_norm": 0.09767928719520569, + "learning_rate": 1.2988883388091274e-05, + "loss": 0.2349, + "step": 46490 + }, + { + "epoch": 3.7662832145171743, + "grad_norm": 0.07688478380441666, + "learning_rate": 1.2984382735496648e-05, + "loss": 0.2133, + "step": 46491 + }, + { + "epoch": 3.766364225534673, + "grad_norm": 0.0702710896730423, + "learning_rate": 1.297988208290202e-05, + "loss": 0.1993, + "step": 46492 + }, + { + "epoch": 3.7664452365521712, + "grad_norm": 0.0628383606672287, + "learning_rate": 1.2975381430307395e-05, + "loss": 0.197, + "step": 46493 + }, + { + "epoch": 3.7665262475696695, + "grad_norm": 0.06867960095405579, + "learning_rate": 1.2970880777712769e-05, + "loss": 0.2463, + "step": 46494 + }, + { + "epoch": 3.7666072585871677, + "grad_norm": 0.08608069270849228, + "learning_rate": 1.2966380125118144e-05, + "loss": 0.224, + "step": 46495 + }, + { + "epoch": 3.766688269604666, + "grad_norm": 0.06254792213439941, + "learning_rate": 1.2961879472523516e-05, + "loss": 0.2338, + "step": 46496 + }, + { + "epoch": 3.7667692806221647, + "grad_norm": 0.0764245092868805, + "learning_rate": 1.295737881992889e-05, + "loss": 0.1998, + "step": 46497 + }, + { + "epoch": 3.766850291639663, + "grad_norm": 0.08328287303447723, + "learning_rate": 1.2952878167334265e-05, + "loss": 0.226, + "step": 46498 + }, + { + "epoch": 3.766931302657161, + "grad_norm": 0.08136755973100662, + "learning_rate": 1.2948377514739637e-05, + "loss": 0.2651, + "step": 46499 + }, + { + "epoch": 3.76701231367466, + "grad_norm": 0.08298899978399277, + "learning_rate": 1.2943876862145014e-05, + "loss": 0.2291, + "step": 46500 + }, + { + "epoch": 3.767093324692158, + "grad_norm": 0.06559506058692932, + "learning_rate": 1.2939376209550386e-05, + "loss": 0.2238, + "step": 46501 + }, + { + "epoch": 3.7671743357096563, + "grad_norm": 0.08516070246696472, + "learning_rate": 1.2934875556955758e-05, + "loss": 0.2544, + "step": 46502 + }, + { + "epoch": 3.767255346727155, + "grad_norm": 0.06356891244649887, + "learning_rate": 1.2930374904361135e-05, + "loss": 0.2192, + "step": 46503 + }, + { + "epoch": 3.7673363577446533, + "grad_norm": 0.07620159536600113, + "learning_rate": 1.2925874251766506e-05, + "loss": 0.2559, + "step": 46504 + }, + { + "epoch": 3.7674173687621515, + "grad_norm": 0.06729257851839066, + "learning_rate": 1.292137359917188e-05, + "loss": 0.1718, + "step": 46505 + }, + { + "epoch": 3.76749837977965, + "grad_norm": 0.0670362114906311, + "learning_rate": 1.2916872946577255e-05, + "loss": 0.1767, + "step": 46506 + }, + { + "epoch": 3.7675793907971484, + "grad_norm": 0.07508490234613419, + "learning_rate": 1.2912372293982627e-05, + "loss": 0.2409, + "step": 46507 + }, + { + "epoch": 3.7676604018146467, + "grad_norm": 0.07815518975257874, + "learning_rate": 1.2907871641388003e-05, + "loss": 0.2148, + "step": 46508 + }, + { + "epoch": 3.7677414128321454, + "grad_norm": 0.07622785121202469, + "learning_rate": 1.2903370988793376e-05, + "loss": 0.2404, + "step": 46509 + }, + { + "epoch": 3.7678224238496436, + "grad_norm": 0.07081098109483719, + "learning_rate": 1.2898870336198748e-05, + "loss": 0.1935, + "step": 46510 + }, + { + "epoch": 3.767903434867142, + "grad_norm": 0.0799146220088005, + "learning_rate": 1.2894369683604123e-05, + "loss": 0.2275, + "step": 46511 + }, + { + "epoch": 3.7679844458846405, + "grad_norm": 0.08372996002435684, + "learning_rate": 1.2889869031009497e-05, + "loss": 0.2283, + "step": 46512 + }, + { + "epoch": 3.768065456902139, + "grad_norm": 0.07471054792404175, + "learning_rate": 1.2885368378414872e-05, + "loss": 0.2373, + "step": 46513 + }, + { + "epoch": 3.768146467919637, + "grad_norm": 0.07189594954252243, + "learning_rate": 1.2880867725820244e-05, + "loss": 0.1982, + "step": 46514 + }, + { + "epoch": 3.7682274789371357, + "grad_norm": 0.07205279171466827, + "learning_rate": 1.2876367073225618e-05, + "loss": 0.2298, + "step": 46515 + }, + { + "epoch": 3.768308489954634, + "grad_norm": 0.06825879216194153, + "learning_rate": 1.2871866420630993e-05, + "loss": 0.2154, + "step": 46516 + }, + { + "epoch": 3.768389500972132, + "grad_norm": 0.06254030764102936, + "learning_rate": 1.2867365768036365e-05, + "loss": 0.2073, + "step": 46517 + }, + { + "epoch": 3.7684705119896305, + "grad_norm": 0.06677734106779099, + "learning_rate": 1.2862865115441739e-05, + "loss": 0.1922, + "step": 46518 + }, + { + "epoch": 3.7685515230071287, + "grad_norm": 0.06440742313861847, + "learning_rate": 1.2858364462847114e-05, + "loss": 0.2021, + "step": 46519 + }, + { + "epoch": 3.7686325340246274, + "grad_norm": 0.08662337809801102, + "learning_rate": 1.2853863810252487e-05, + "loss": 0.2315, + "step": 46520 + }, + { + "epoch": 3.7687135450421256, + "grad_norm": 0.06901124119758606, + "learning_rate": 1.2849363157657863e-05, + "loss": 0.1859, + "step": 46521 + }, + { + "epoch": 3.768794556059624, + "grad_norm": 0.08281880617141724, + "learning_rate": 1.2844862505063235e-05, + "loss": 0.2549, + "step": 46522 + }, + { + "epoch": 3.7688755670771226, + "grad_norm": 0.07393230497837067, + "learning_rate": 1.2840361852468608e-05, + "loss": 0.2464, + "step": 46523 + }, + { + "epoch": 3.768956578094621, + "grad_norm": 0.07908966392278671, + "learning_rate": 1.2835861199873984e-05, + "loss": 0.2262, + "step": 46524 + }, + { + "epoch": 3.769037589112119, + "grad_norm": 0.08938897401094437, + "learning_rate": 1.2831360547279356e-05, + "loss": 0.2342, + "step": 46525 + }, + { + "epoch": 3.7691186001296177, + "grad_norm": 0.0999879539012909, + "learning_rate": 1.282685989468473e-05, + "loss": 0.2072, + "step": 46526 + }, + { + "epoch": 3.769199611147116, + "grad_norm": 0.05360986292362213, + "learning_rate": 1.2822359242090104e-05, + "loss": 0.1837, + "step": 46527 + }, + { + "epoch": 3.7692806221646142, + "grad_norm": 0.06904050707817078, + "learning_rate": 1.2817858589495476e-05, + "loss": 0.2043, + "step": 46528 + }, + { + "epoch": 3.769361633182113, + "grad_norm": 0.0860978439450264, + "learning_rate": 1.2813357936900852e-05, + "loss": 0.2197, + "step": 46529 + }, + { + "epoch": 3.769442644199611, + "grad_norm": 0.0657324492931366, + "learning_rate": 1.2808857284306225e-05, + "loss": 0.2159, + "step": 46530 + }, + { + "epoch": 3.7695236552171094, + "grad_norm": 0.06086688116192818, + "learning_rate": 1.2804356631711597e-05, + "loss": 0.214, + "step": 46531 + }, + { + "epoch": 3.769604666234608, + "grad_norm": 0.07376797497272491, + "learning_rate": 1.2799855979116972e-05, + "loss": 0.249, + "step": 46532 + }, + { + "epoch": 3.7696856772521063, + "grad_norm": 0.0695323571562767, + "learning_rate": 1.2795355326522346e-05, + "loss": 0.2095, + "step": 46533 + }, + { + "epoch": 3.7697666882696046, + "grad_norm": 0.0635128989815712, + "learning_rate": 1.2790854673927721e-05, + "loss": 0.1936, + "step": 46534 + }, + { + "epoch": 3.7698476992871033, + "grad_norm": 0.07343027740716934, + "learning_rate": 1.2786354021333093e-05, + "loss": 0.2146, + "step": 46535 + }, + { + "epoch": 3.7699287103046015, + "grad_norm": 0.06408240646123886, + "learning_rate": 1.2781853368738467e-05, + "loss": 0.2182, + "step": 46536 + }, + { + "epoch": 3.7700097213220998, + "grad_norm": 0.08215359598398209, + "learning_rate": 1.2777352716143842e-05, + "loss": 0.2187, + "step": 46537 + }, + { + "epoch": 3.7700907323395985, + "grad_norm": 0.07449215650558472, + "learning_rate": 1.2772852063549216e-05, + "loss": 0.2312, + "step": 46538 + }, + { + "epoch": 3.7701717433570967, + "grad_norm": 0.07269278913736343, + "learning_rate": 1.2768351410954588e-05, + "loss": 0.2118, + "step": 46539 + }, + { + "epoch": 3.770252754374595, + "grad_norm": 0.0680905431509018, + "learning_rate": 1.2763850758359963e-05, + "loss": 0.1955, + "step": 46540 + }, + { + "epoch": 3.770333765392093, + "grad_norm": 0.06702621281147003, + "learning_rate": 1.2759350105765337e-05, + "loss": 0.2244, + "step": 46541 + }, + { + "epoch": 3.7704147764095914, + "grad_norm": 0.09051982313394547, + "learning_rate": 1.2754849453170712e-05, + "loss": 0.2369, + "step": 46542 + }, + { + "epoch": 3.77049578742709, + "grad_norm": 0.07590014487504959, + "learning_rate": 1.2750348800576084e-05, + "loss": 0.2358, + "step": 46543 + }, + { + "epoch": 3.7705767984445884, + "grad_norm": 0.0708058774471283, + "learning_rate": 1.2745848147981457e-05, + "loss": 0.2117, + "step": 46544 + }, + { + "epoch": 3.7706578094620866, + "grad_norm": 0.0713592991232872, + "learning_rate": 1.2741347495386833e-05, + "loss": 0.2053, + "step": 46545 + }, + { + "epoch": 3.7707388204795853, + "grad_norm": 0.07061046361923218, + "learning_rate": 1.2736846842792205e-05, + "loss": 0.2271, + "step": 46546 + }, + { + "epoch": 3.7708198314970836, + "grad_norm": 0.08287011831998825, + "learning_rate": 1.273234619019758e-05, + "loss": 0.2432, + "step": 46547 + }, + { + "epoch": 3.770900842514582, + "grad_norm": 0.07770534604787827, + "learning_rate": 1.2727845537602953e-05, + "loss": 0.2462, + "step": 46548 + }, + { + "epoch": 3.7709818535320805, + "grad_norm": 0.07193398475646973, + "learning_rate": 1.2723344885008325e-05, + "loss": 0.218, + "step": 46549 + }, + { + "epoch": 3.7710628645495787, + "grad_norm": 0.06820261478424072, + "learning_rate": 1.27188442324137e-05, + "loss": 0.236, + "step": 46550 + }, + { + "epoch": 3.771143875567077, + "grad_norm": 0.07764853537082672, + "learning_rate": 1.2714343579819074e-05, + "loss": 0.2336, + "step": 46551 + }, + { + "epoch": 3.7712248865845757, + "grad_norm": 0.07523754239082336, + "learning_rate": 1.2709842927224446e-05, + "loss": 0.2062, + "step": 46552 + }, + { + "epoch": 3.771305897602074, + "grad_norm": 0.07026119530200958, + "learning_rate": 1.2705342274629821e-05, + "loss": 0.2321, + "step": 46553 + }, + { + "epoch": 3.771386908619572, + "grad_norm": 0.06876692920923233, + "learning_rate": 1.2700841622035195e-05, + "loss": 0.2054, + "step": 46554 + }, + { + "epoch": 3.771467919637071, + "grad_norm": 0.06335452198982239, + "learning_rate": 1.269634096944057e-05, + "loss": 0.1917, + "step": 46555 + }, + { + "epoch": 3.771548930654569, + "grad_norm": 0.07451608031988144, + "learning_rate": 1.2691840316845944e-05, + "loss": 0.255, + "step": 46556 + }, + { + "epoch": 3.7716299416720673, + "grad_norm": 0.07577665895223618, + "learning_rate": 1.2687339664251316e-05, + "loss": 0.2117, + "step": 46557 + }, + { + "epoch": 3.771710952689566, + "grad_norm": 0.07046057283878326, + "learning_rate": 1.2682839011656691e-05, + "loss": 0.2154, + "step": 46558 + }, + { + "epoch": 3.7717919637070643, + "grad_norm": 0.09273316711187363, + "learning_rate": 1.2678338359062065e-05, + "loss": 0.3072, + "step": 46559 + }, + { + "epoch": 3.7718729747245625, + "grad_norm": 0.07682998478412628, + "learning_rate": 1.267383770646744e-05, + "loss": 0.1781, + "step": 46560 + }, + { + "epoch": 3.7719539857420608, + "grad_norm": 0.0753307119011879, + "learning_rate": 1.2669337053872812e-05, + "loss": 0.2, + "step": 46561 + }, + { + "epoch": 3.7720349967595594, + "grad_norm": 0.06994988024234772, + "learning_rate": 1.2664836401278186e-05, + "loss": 0.2132, + "step": 46562 + }, + { + "epoch": 3.7721160077770577, + "grad_norm": 0.08743558824062347, + "learning_rate": 1.2660335748683561e-05, + "loss": 0.2153, + "step": 46563 + }, + { + "epoch": 3.772197018794556, + "grad_norm": 0.06563448905944824, + "learning_rate": 1.2655835096088933e-05, + "loss": 0.2165, + "step": 46564 + }, + { + "epoch": 3.772278029812054, + "grad_norm": 0.0646919459104538, + "learning_rate": 1.2651334443494306e-05, + "loss": 0.1949, + "step": 46565 + }, + { + "epoch": 3.772359040829553, + "grad_norm": 0.06894893944263458, + "learning_rate": 1.2646833790899682e-05, + "loss": 0.2411, + "step": 46566 + }, + { + "epoch": 3.772440051847051, + "grad_norm": 0.06888721883296967, + "learning_rate": 1.2642333138305054e-05, + "loss": 0.2475, + "step": 46567 + }, + { + "epoch": 3.7725210628645494, + "grad_norm": 0.0673423707485199, + "learning_rate": 1.2637832485710429e-05, + "loss": 0.2775, + "step": 46568 + }, + { + "epoch": 3.772602073882048, + "grad_norm": 0.06546924263238907, + "learning_rate": 1.2633331833115803e-05, + "loss": 0.248, + "step": 46569 + }, + { + "epoch": 3.7726830848995463, + "grad_norm": 0.06502770632505417, + "learning_rate": 1.2628831180521174e-05, + "loss": 0.2148, + "step": 46570 + }, + { + "epoch": 3.7727640959170445, + "grad_norm": 0.07278035581111908, + "learning_rate": 1.2624330527926551e-05, + "loss": 0.2191, + "step": 46571 + }, + { + "epoch": 3.7728451069345432, + "grad_norm": 0.0673990324139595, + "learning_rate": 1.2619829875331923e-05, + "loss": 0.2305, + "step": 46572 + }, + { + "epoch": 3.7729261179520415, + "grad_norm": 0.07896624505519867, + "learning_rate": 1.2615329222737299e-05, + "loss": 0.2684, + "step": 46573 + }, + { + "epoch": 3.7730071289695397, + "grad_norm": 0.06549516320228577, + "learning_rate": 1.2610828570142672e-05, + "loss": 0.2069, + "step": 46574 + }, + { + "epoch": 3.7730881399870384, + "grad_norm": 0.06927746534347534, + "learning_rate": 1.2606327917548044e-05, + "loss": 0.1988, + "step": 46575 + }, + { + "epoch": 3.7731691510045366, + "grad_norm": 0.07695776224136353, + "learning_rate": 1.260182726495342e-05, + "loss": 0.2593, + "step": 46576 + }, + { + "epoch": 3.773250162022035, + "grad_norm": 0.0668836161494255, + "learning_rate": 1.2597326612358793e-05, + "loss": 0.2381, + "step": 46577 + }, + { + "epoch": 3.7733311730395336, + "grad_norm": 0.0662456750869751, + "learning_rate": 1.2592825959764165e-05, + "loss": 0.2232, + "step": 46578 + }, + { + "epoch": 3.773412184057032, + "grad_norm": 0.07020401209592819, + "learning_rate": 1.258832530716954e-05, + "loss": 0.2331, + "step": 46579 + }, + { + "epoch": 3.77349319507453, + "grad_norm": 0.07420952618122101, + "learning_rate": 1.2583824654574914e-05, + "loss": 0.2642, + "step": 46580 + }, + { + "epoch": 3.7735742060920288, + "grad_norm": 0.0619262158870697, + "learning_rate": 1.2579324001980289e-05, + "loss": 0.2069, + "step": 46581 + }, + { + "epoch": 3.773655217109527, + "grad_norm": 0.06421462446451187, + "learning_rate": 1.2574823349385661e-05, + "loss": 0.1812, + "step": 46582 + }, + { + "epoch": 3.7737362281270252, + "grad_norm": 0.06833804398775101, + "learning_rate": 1.2570322696791035e-05, + "loss": 0.1944, + "step": 46583 + }, + { + "epoch": 3.7738172391445235, + "grad_norm": 0.08516917377710342, + "learning_rate": 1.256582204419641e-05, + "loss": 0.2427, + "step": 46584 + }, + { + "epoch": 3.773898250162022, + "grad_norm": 0.08085508644580841, + "learning_rate": 1.2561321391601782e-05, + "loss": 0.232, + "step": 46585 + }, + { + "epoch": 3.7739792611795204, + "grad_norm": 0.07463680952787399, + "learning_rate": 1.2556820739007157e-05, + "loss": 0.2403, + "step": 46586 + }, + { + "epoch": 3.7740602721970187, + "grad_norm": 0.07635892182588577, + "learning_rate": 1.255232008641253e-05, + "loss": 0.2527, + "step": 46587 + }, + { + "epoch": 3.774141283214517, + "grad_norm": 0.07656805962324142, + "learning_rate": 1.2547819433817903e-05, + "loss": 0.2201, + "step": 46588 + }, + { + "epoch": 3.7742222942320156, + "grad_norm": 0.08051590621471405, + "learning_rate": 1.254331878122328e-05, + "loss": 0.2406, + "step": 46589 + }, + { + "epoch": 3.774303305249514, + "grad_norm": 0.08109819889068604, + "learning_rate": 1.2538818128628652e-05, + "loss": 0.2175, + "step": 46590 + }, + { + "epoch": 3.774384316267012, + "grad_norm": 0.07000181823968887, + "learning_rate": 1.2534317476034023e-05, + "loss": 0.1995, + "step": 46591 + }, + { + "epoch": 3.774465327284511, + "grad_norm": 0.08554103225469589, + "learning_rate": 1.25298168234394e-05, + "loss": 0.2115, + "step": 46592 + }, + { + "epoch": 3.774546338302009, + "grad_norm": 0.06602445989847183, + "learning_rate": 1.2525316170844772e-05, + "loss": 0.2711, + "step": 46593 + }, + { + "epoch": 3.7746273493195073, + "grad_norm": 0.06864854693412781, + "learning_rate": 1.2520815518250148e-05, + "loss": 0.2129, + "step": 46594 + }, + { + "epoch": 3.774708360337006, + "grad_norm": 0.06876920908689499, + "learning_rate": 1.2516314865655521e-05, + "loss": 0.2075, + "step": 46595 + }, + { + "epoch": 3.774789371354504, + "grad_norm": 0.0704292431473732, + "learning_rate": 1.2511814213060893e-05, + "loss": 0.2022, + "step": 46596 + }, + { + "epoch": 3.7748703823720025, + "grad_norm": 0.06541324406862259, + "learning_rate": 1.2507313560466268e-05, + "loss": 0.2333, + "step": 46597 + }, + { + "epoch": 3.774951393389501, + "grad_norm": 0.08963797986507416, + "learning_rate": 1.2502812907871642e-05, + "loss": 0.2533, + "step": 46598 + }, + { + "epoch": 3.7750324044069994, + "grad_norm": 0.0644608810544014, + "learning_rate": 1.2498312255277016e-05, + "loss": 0.2091, + "step": 46599 + }, + { + "epoch": 3.7751134154244976, + "grad_norm": 0.07457215338945389, + "learning_rate": 1.249381160268239e-05, + "loss": 0.2618, + "step": 46600 + }, + { + "epoch": 3.7751944264419963, + "grad_norm": 0.06820184737443924, + "learning_rate": 1.2489310950087765e-05, + "loss": 0.2127, + "step": 46601 + }, + { + "epoch": 3.7752754374594946, + "grad_norm": 0.060140807181596756, + "learning_rate": 1.2484810297493137e-05, + "loss": 0.2029, + "step": 46602 + }, + { + "epoch": 3.775356448476993, + "grad_norm": 0.05878528580069542, + "learning_rate": 1.248030964489851e-05, + "loss": 0.2461, + "step": 46603 + }, + { + "epoch": 3.7754374594944915, + "grad_norm": 0.0732777863740921, + "learning_rate": 1.2475808992303885e-05, + "loss": 0.2469, + "step": 46604 + }, + { + "epoch": 3.7755184705119897, + "grad_norm": 0.07616322487592697, + "learning_rate": 1.2471308339709259e-05, + "loss": 0.2513, + "step": 46605 + }, + { + "epoch": 3.775599481529488, + "grad_norm": 0.07453019171953201, + "learning_rate": 1.2466807687114631e-05, + "loss": 0.2097, + "step": 46606 + }, + { + "epoch": 3.7756804925469862, + "grad_norm": 0.07242731750011444, + "learning_rate": 1.2462307034520006e-05, + "loss": 0.2427, + "step": 46607 + }, + { + "epoch": 3.775761503564485, + "grad_norm": 0.07155609875917435, + "learning_rate": 1.245780638192538e-05, + "loss": 0.236, + "step": 46608 + }, + { + "epoch": 3.775842514581983, + "grad_norm": 0.07473588734865189, + "learning_rate": 1.2453305729330753e-05, + "loss": 0.2572, + "step": 46609 + }, + { + "epoch": 3.7759235255994814, + "grad_norm": 0.06979592144489288, + "learning_rate": 1.2448805076736127e-05, + "loss": 0.2232, + "step": 46610 + }, + { + "epoch": 3.7760045366169797, + "grad_norm": 0.0813089907169342, + "learning_rate": 1.24443044241415e-05, + "loss": 0.2052, + "step": 46611 + }, + { + "epoch": 3.7760855476344783, + "grad_norm": 0.08803302049636841, + "learning_rate": 1.2439803771546874e-05, + "loss": 0.2069, + "step": 46612 + }, + { + "epoch": 3.7761665586519766, + "grad_norm": 0.06549622863531113, + "learning_rate": 1.243530311895225e-05, + "loss": 0.2477, + "step": 46613 + }, + { + "epoch": 3.776247569669475, + "grad_norm": 0.07154756784439087, + "learning_rate": 1.2430802466357623e-05, + "loss": 0.2317, + "step": 46614 + }, + { + "epoch": 3.7763285806869735, + "grad_norm": 0.06701797246932983, + "learning_rate": 1.2426301813762995e-05, + "loss": 0.211, + "step": 46615 + }, + { + "epoch": 3.7764095917044718, + "grad_norm": 0.06661625951528549, + "learning_rate": 1.242180116116837e-05, + "loss": 0.2281, + "step": 46616 + }, + { + "epoch": 3.77649060272197, + "grad_norm": 0.06580673903226852, + "learning_rate": 1.2417300508573744e-05, + "loss": 0.2164, + "step": 46617 + }, + { + "epoch": 3.7765716137394687, + "grad_norm": 0.0783298909664154, + "learning_rate": 1.2412799855979118e-05, + "loss": 0.2132, + "step": 46618 + }, + { + "epoch": 3.776652624756967, + "grad_norm": 0.10125415027141571, + "learning_rate": 1.2408299203384491e-05, + "loss": 0.2638, + "step": 46619 + }, + { + "epoch": 3.776733635774465, + "grad_norm": 0.06981062144041061, + "learning_rate": 1.2403798550789865e-05, + "loss": 0.1883, + "step": 46620 + }, + { + "epoch": 3.776814646791964, + "grad_norm": 0.06791463494300842, + "learning_rate": 1.2399297898195238e-05, + "loss": 0.2145, + "step": 46621 + }, + { + "epoch": 3.776895657809462, + "grad_norm": 0.06609434634447098, + "learning_rate": 1.2394797245600614e-05, + "loss": 0.2248, + "step": 46622 + }, + { + "epoch": 3.7769766688269604, + "grad_norm": 0.065769724547863, + "learning_rate": 1.2390296593005986e-05, + "loss": 0.2087, + "step": 46623 + }, + { + "epoch": 3.777057679844459, + "grad_norm": 0.07486658543348312, + "learning_rate": 1.238579594041136e-05, + "loss": 0.2623, + "step": 46624 + }, + { + "epoch": 3.7771386908619573, + "grad_norm": 0.06038758158683777, + "learning_rate": 1.2381295287816734e-05, + "loss": 0.1932, + "step": 46625 + }, + { + "epoch": 3.7772197018794555, + "grad_norm": 0.07746127247810364, + "learning_rate": 1.2376794635222108e-05, + "loss": 0.2262, + "step": 46626 + }, + { + "epoch": 3.7773007128969542, + "grad_norm": 0.07121626287698746, + "learning_rate": 1.2372293982627482e-05, + "loss": 0.2153, + "step": 46627 + }, + { + "epoch": 3.7773817239144525, + "grad_norm": 0.07847585529088974, + "learning_rate": 1.2367793330032855e-05, + "loss": 0.1917, + "step": 46628 + }, + { + "epoch": 3.7774627349319507, + "grad_norm": 0.0689721554517746, + "learning_rate": 1.2363292677438229e-05, + "loss": 0.2269, + "step": 46629 + }, + { + "epoch": 3.777543745949449, + "grad_norm": 0.08667279779911041, + "learning_rate": 1.2358792024843602e-05, + "loss": 0.222, + "step": 46630 + }, + { + "epoch": 3.7776247569669477, + "grad_norm": 0.06442601978778839, + "learning_rate": 1.2354291372248978e-05, + "loss": 0.2319, + "step": 46631 + }, + { + "epoch": 3.777705767984446, + "grad_norm": 0.0721411257982254, + "learning_rate": 1.234979071965435e-05, + "loss": 0.1926, + "step": 46632 + }, + { + "epoch": 3.777786779001944, + "grad_norm": 0.06399735063314438, + "learning_rate": 1.2345290067059723e-05, + "loss": 0.1966, + "step": 46633 + }, + { + "epoch": 3.7778677900194424, + "grad_norm": 0.0750143975019455, + "learning_rate": 1.2340789414465099e-05, + "loss": 0.2406, + "step": 46634 + }, + { + "epoch": 3.777948801036941, + "grad_norm": 0.08481192588806152, + "learning_rate": 1.2336288761870472e-05, + "loss": 0.2353, + "step": 46635 + }, + { + "epoch": 3.7780298120544393, + "grad_norm": 0.060849450528621674, + "learning_rate": 1.2331788109275846e-05, + "loss": 0.2453, + "step": 46636 + }, + { + "epoch": 3.7781108230719376, + "grad_norm": 0.06765329837799072, + "learning_rate": 1.232728745668122e-05, + "loss": 0.1992, + "step": 46637 + }, + { + "epoch": 3.7781918340894363, + "grad_norm": 0.07488850504159927, + "learning_rate": 1.2322786804086593e-05, + "loss": 0.2427, + "step": 46638 + }, + { + "epoch": 3.7782728451069345, + "grad_norm": 0.07653640955686569, + "learning_rate": 1.2318286151491967e-05, + "loss": 0.2297, + "step": 46639 + }, + { + "epoch": 3.7783538561244328, + "grad_norm": 0.07188255339860916, + "learning_rate": 1.231378549889734e-05, + "loss": 0.2335, + "step": 46640 + }, + { + "epoch": 3.7784348671419314, + "grad_norm": 0.07283948361873627, + "learning_rate": 1.2309284846302714e-05, + "loss": 0.2368, + "step": 46641 + }, + { + "epoch": 3.7785158781594297, + "grad_norm": 0.056375570595264435, + "learning_rate": 1.2304784193708087e-05, + "loss": 0.1894, + "step": 46642 + }, + { + "epoch": 3.778596889176928, + "grad_norm": 0.07126890867948532, + "learning_rate": 1.2300283541113463e-05, + "loss": 0.2171, + "step": 46643 + }, + { + "epoch": 3.7786779001944266, + "grad_norm": 0.07158375531435013, + "learning_rate": 1.2295782888518836e-05, + "loss": 0.1882, + "step": 46644 + }, + { + "epoch": 3.778758911211925, + "grad_norm": 0.072811059653759, + "learning_rate": 1.229128223592421e-05, + "loss": 0.2374, + "step": 46645 + }, + { + "epoch": 3.778839922229423, + "grad_norm": 0.06893569976091385, + "learning_rate": 1.2286781583329584e-05, + "loss": 0.2205, + "step": 46646 + }, + { + "epoch": 3.778920933246922, + "grad_norm": 0.07594628632068634, + "learning_rate": 1.2282280930734957e-05, + "loss": 0.2129, + "step": 46647 + }, + { + "epoch": 3.77900194426442, + "grad_norm": 0.06287289410829544, + "learning_rate": 1.227778027814033e-05, + "loss": 0.2281, + "step": 46648 + }, + { + "epoch": 3.7790829552819183, + "grad_norm": 0.07265198230743408, + "learning_rate": 1.2273279625545704e-05, + "loss": 0.2196, + "step": 46649 + }, + { + "epoch": 3.779163966299417, + "grad_norm": 0.05850711092352867, + "learning_rate": 1.2268778972951078e-05, + "loss": 0.2111, + "step": 46650 + }, + { + "epoch": 3.779244977316915, + "grad_norm": 0.06421723961830139, + "learning_rate": 1.2264278320356452e-05, + "loss": 0.2301, + "step": 46651 + }, + { + "epoch": 3.7793259883344135, + "grad_norm": 0.06834607571363449, + "learning_rate": 1.2259777667761827e-05, + "loss": 0.2464, + "step": 46652 + }, + { + "epoch": 3.7794069993519117, + "grad_norm": 0.06908293813467026, + "learning_rate": 1.2255277015167199e-05, + "loss": 0.1841, + "step": 46653 + }, + { + "epoch": 3.7794880103694104, + "grad_norm": 0.06474535912275314, + "learning_rate": 1.2250776362572574e-05, + "loss": 0.1876, + "step": 46654 + }, + { + "epoch": 3.7795690213869086, + "grad_norm": 0.07207901775836945, + "learning_rate": 1.2246275709977948e-05, + "loss": 0.2289, + "step": 46655 + }, + { + "epoch": 3.779650032404407, + "grad_norm": 0.05818149074912071, + "learning_rate": 1.2241775057383321e-05, + "loss": 0.1996, + "step": 46656 + }, + { + "epoch": 3.779731043421905, + "grad_norm": 0.0827532559633255, + "learning_rate": 1.2237274404788695e-05, + "loss": 0.2101, + "step": 46657 + }, + { + "epoch": 3.779812054439404, + "grad_norm": 0.09072255343198776, + "learning_rate": 1.2232773752194068e-05, + "loss": 0.256, + "step": 46658 + }, + { + "epoch": 3.779893065456902, + "grad_norm": 0.07674512267112732, + "learning_rate": 1.2228273099599442e-05, + "loss": 0.2505, + "step": 46659 + }, + { + "epoch": 3.7799740764744003, + "grad_norm": 0.060501959174871445, + "learning_rate": 1.2223772447004816e-05, + "loss": 0.2001, + "step": 46660 + }, + { + "epoch": 3.780055087491899, + "grad_norm": 0.06538248807191849, + "learning_rate": 1.2219271794410191e-05, + "loss": 0.2453, + "step": 46661 + }, + { + "epoch": 3.7801360985093972, + "grad_norm": 0.07560870796442032, + "learning_rate": 1.2214771141815563e-05, + "loss": 0.1887, + "step": 46662 + }, + { + "epoch": 3.7802171095268955, + "grad_norm": 0.08395927399396896, + "learning_rate": 1.2210270489220938e-05, + "loss": 0.1986, + "step": 46663 + }, + { + "epoch": 3.780298120544394, + "grad_norm": 0.05529021471738815, + "learning_rate": 1.2205769836626312e-05, + "loss": 0.1969, + "step": 46664 + }, + { + "epoch": 3.7803791315618924, + "grad_norm": 0.09511775523424149, + "learning_rate": 1.2201269184031685e-05, + "loss": 0.233, + "step": 46665 + }, + { + "epoch": 3.7804601425793907, + "grad_norm": 0.06025183945894241, + "learning_rate": 1.2196768531437059e-05, + "loss": 0.2091, + "step": 46666 + }, + { + "epoch": 3.7805411535968894, + "grad_norm": 0.07725583016872406, + "learning_rate": 1.2192267878842433e-05, + "loss": 0.2462, + "step": 46667 + }, + { + "epoch": 3.7806221646143876, + "grad_norm": 0.07531146705150604, + "learning_rate": 1.2187767226247806e-05, + "loss": 0.1891, + "step": 46668 + }, + { + "epoch": 3.780703175631886, + "grad_norm": 0.06599050760269165, + "learning_rate": 1.218326657365318e-05, + "loss": 0.2141, + "step": 46669 + }, + { + "epoch": 3.7807841866493845, + "grad_norm": 0.08106547594070435, + "learning_rate": 1.2178765921058555e-05, + "loss": 0.2459, + "step": 46670 + }, + { + "epoch": 3.780865197666883, + "grad_norm": 0.07727594673633575, + "learning_rate": 1.2174265268463927e-05, + "loss": 0.2672, + "step": 46671 + }, + { + "epoch": 3.780946208684381, + "grad_norm": 0.07024858891963959, + "learning_rate": 1.2169764615869302e-05, + "loss": 0.1984, + "step": 46672 + }, + { + "epoch": 3.7810272197018797, + "grad_norm": 0.05269385129213333, + "learning_rate": 1.2165263963274676e-05, + "loss": 0.1987, + "step": 46673 + }, + { + "epoch": 3.781108230719378, + "grad_norm": 0.07926183193922043, + "learning_rate": 1.216076331068005e-05, + "loss": 0.225, + "step": 46674 + }, + { + "epoch": 3.781189241736876, + "grad_norm": 0.08314008265733719, + "learning_rate": 1.2156262658085423e-05, + "loss": 0.2404, + "step": 46675 + }, + { + "epoch": 3.7812702527543745, + "grad_norm": 0.08210525661706924, + "learning_rate": 1.2151762005490797e-05, + "loss": 0.2526, + "step": 46676 + }, + { + "epoch": 3.781351263771873, + "grad_norm": 0.08011859655380249, + "learning_rate": 1.214726135289617e-05, + "loss": 0.2305, + "step": 46677 + }, + { + "epoch": 3.7814322747893714, + "grad_norm": 0.07893484830856323, + "learning_rate": 1.2142760700301546e-05, + "loss": 0.2348, + "step": 46678 + }, + { + "epoch": 3.7815132858068696, + "grad_norm": 0.06235817074775696, + "learning_rate": 1.2138260047706918e-05, + "loss": 0.1975, + "step": 46679 + }, + { + "epoch": 3.781594296824368, + "grad_norm": 0.0649525374174118, + "learning_rate": 1.2133759395112291e-05, + "loss": 0.2165, + "step": 46680 + }, + { + "epoch": 3.7816753078418666, + "grad_norm": 0.06788064539432526, + "learning_rate": 1.2129258742517666e-05, + "loss": 0.1885, + "step": 46681 + }, + { + "epoch": 3.781756318859365, + "grad_norm": 0.07292314618825912, + "learning_rate": 1.212475808992304e-05, + "loss": 0.2394, + "step": 46682 + }, + { + "epoch": 3.781837329876863, + "grad_norm": 0.06189664825797081, + "learning_rate": 1.2120257437328412e-05, + "loss": 0.2327, + "step": 46683 + }, + { + "epoch": 3.7819183408943617, + "grad_norm": 0.054467201232910156, + "learning_rate": 1.2115756784733787e-05, + "loss": 0.1965, + "step": 46684 + }, + { + "epoch": 3.78199935191186, + "grad_norm": 0.06842909753322601, + "learning_rate": 1.211125613213916e-05, + "loss": 0.2055, + "step": 46685 + }, + { + "epoch": 3.7820803629293582, + "grad_norm": 0.07629824429750443, + "learning_rate": 1.2106755479544534e-05, + "loss": 0.2288, + "step": 46686 + }, + { + "epoch": 3.782161373946857, + "grad_norm": 0.06854870170354843, + "learning_rate": 1.210225482694991e-05, + "loss": 0.2034, + "step": 46687 + }, + { + "epoch": 3.782242384964355, + "grad_norm": 0.08357995003461838, + "learning_rate": 1.2097754174355282e-05, + "loss": 0.2294, + "step": 46688 + }, + { + "epoch": 3.7823233959818534, + "grad_norm": 0.05101915821433067, + "learning_rate": 1.2093253521760655e-05, + "loss": 0.1895, + "step": 46689 + }, + { + "epoch": 3.782404406999352, + "grad_norm": 0.0792124941945076, + "learning_rate": 1.208875286916603e-05, + "loss": 0.216, + "step": 46690 + }, + { + "epoch": 3.7824854180168503, + "grad_norm": 0.07666429132223129, + "learning_rate": 1.2084252216571404e-05, + "loss": 0.2374, + "step": 46691 + }, + { + "epoch": 3.7825664290343486, + "grad_norm": 0.08312845975160599, + "learning_rate": 1.2079751563976776e-05, + "loss": 0.2384, + "step": 46692 + }, + { + "epoch": 3.7826474400518473, + "grad_norm": 0.08143067359924316, + "learning_rate": 1.2075250911382151e-05, + "loss": 0.2452, + "step": 46693 + }, + { + "epoch": 3.7827284510693455, + "grad_norm": 0.06815093010663986, + "learning_rate": 1.2070750258787525e-05, + "loss": 0.2508, + "step": 46694 + }, + { + "epoch": 3.7828094620868438, + "grad_norm": 0.07493395358324051, + "learning_rate": 1.2066249606192899e-05, + "loss": 0.2458, + "step": 46695 + }, + { + "epoch": 3.7828904731043425, + "grad_norm": 0.079828180372715, + "learning_rate": 1.2061748953598272e-05, + "loss": 0.2392, + "step": 46696 + }, + { + "epoch": 3.7829714841218407, + "grad_norm": 0.0668768361210823, + "learning_rate": 1.2057248301003646e-05, + "loss": 0.2474, + "step": 46697 + }, + { + "epoch": 3.783052495139339, + "grad_norm": 0.06913848966360092, + "learning_rate": 1.205274764840902e-05, + "loss": 0.2297, + "step": 46698 + }, + { + "epoch": 3.783133506156837, + "grad_norm": 0.07236789911985397, + "learning_rate": 1.2048246995814395e-05, + "loss": 0.2377, + "step": 46699 + }, + { + "epoch": 3.7832145171743354, + "grad_norm": 0.06536900997161865, + "learning_rate": 1.2043746343219768e-05, + "loss": 0.2262, + "step": 46700 + }, + { + "epoch": 3.783295528191834, + "grad_norm": 0.08164361864328384, + "learning_rate": 1.203924569062514e-05, + "loss": 0.2633, + "step": 46701 + }, + { + "epoch": 3.7833765392093324, + "grad_norm": 0.0710197389125824, + "learning_rate": 1.2034745038030515e-05, + "loss": 0.2106, + "step": 46702 + }, + { + "epoch": 3.7834575502268306, + "grad_norm": 0.08425655961036682, + "learning_rate": 1.2030244385435889e-05, + "loss": 0.2288, + "step": 46703 + }, + { + "epoch": 3.7835385612443293, + "grad_norm": 0.06478645652532578, + "learning_rate": 1.2025743732841263e-05, + "loss": 0.2321, + "step": 46704 + }, + { + "epoch": 3.7836195722618275, + "grad_norm": 0.06343191862106323, + "learning_rate": 1.2021243080246636e-05, + "loss": 0.2063, + "step": 46705 + }, + { + "epoch": 3.783700583279326, + "grad_norm": 0.08273705095052719, + "learning_rate": 1.201674242765201e-05, + "loss": 0.2683, + "step": 46706 + }, + { + "epoch": 3.7837815942968245, + "grad_norm": 0.05265715345740318, + "learning_rate": 1.2012241775057383e-05, + "loss": 0.2125, + "step": 46707 + }, + { + "epoch": 3.7838626053143227, + "grad_norm": 0.06474486738443375, + "learning_rate": 1.2007741122462759e-05, + "loss": 0.1809, + "step": 46708 + }, + { + "epoch": 3.783943616331821, + "grad_norm": 0.07659123092889786, + "learning_rate": 1.200324046986813e-05, + "loss": 0.2185, + "step": 46709 + }, + { + "epoch": 3.7840246273493197, + "grad_norm": 0.0784589946269989, + "learning_rate": 1.1998739817273504e-05, + "loss": 0.2195, + "step": 46710 + }, + { + "epoch": 3.784105638366818, + "grad_norm": 0.0916212946176529, + "learning_rate": 1.199423916467888e-05, + "loss": 0.2416, + "step": 46711 + }, + { + "epoch": 3.784186649384316, + "grad_norm": 0.0649249330163002, + "learning_rate": 1.1989738512084253e-05, + "loss": 0.2101, + "step": 46712 + }, + { + "epoch": 3.784267660401815, + "grad_norm": 0.0661170706152916, + "learning_rate": 1.1985237859489627e-05, + "loss": 0.2285, + "step": 46713 + }, + { + "epoch": 3.784348671419313, + "grad_norm": 0.08094004541635513, + "learning_rate": 1.1980737206895e-05, + "loss": 0.2624, + "step": 46714 + }, + { + "epoch": 3.7844296824368113, + "grad_norm": 0.06313817203044891, + "learning_rate": 1.1976236554300374e-05, + "loss": 0.1953, + "step": 46715 + }, + { + "epoch": 3.78451069345431, + "grad_norm": 0.07197370380163193, + "learning_rate": 1.1971735901705748e-05, + "loss": 0.2086, + "step": 46716 + }, + { + "epoch": 3.7845917044718083, + "grad_norm": 0.060122050344944, + "learning_rate": 1.1967235249111123e-05, + "loss": 0.2372, + "step": 46717 + }, + { + "epoch": 3.7846727154893065, + "grad_norm": 0.0652763694524765, + "learning_rate": 1.1962734596516495e-05, + "loss": 0.2288, + "step": 46718 + }, + { + "epoch": 3.784753726506805, + "grad_norm": 0.07435501366853714, + "learning_rate": 1.1958233943921868e-05, + "loss": 0.2219, + "step": 46719 + }, + { + "epoch": 3.7848347375243034, + "grad_norm": 0.06599126011133194, + "learning_rate": 1.1953733291327244e-05, + "loss": 0.221, + "step": 46720 + }, + { + "epoch": 3.7849157485418017, + "grad_norm": 0.08771032840013504, + "learning_rate": 1.1949232638732617e-05, + "loss": 0.2255, + "step": 46721 + }, + { + "epoch": 3.7849967595593, + "grad_norm": 0.08068722486495972, + "learning_rate": 1.194473198613799e-05, + "loss": 0.24, + "step": 46722 + }, + { + "epoch": 3.785077770576798, + "grad_norm": 0.06992881000041962, + "learning_rate": 1.1940231333543365e-05, + "loss": 0.2148, + "step": 46723 + }, + { + "epoch": 3.785158781594297, + "grad_norm": 0.06782852113246918, + "learning_rate": 1.1935730680948738e-05, + "loss": 0.2005, + "step": 46724 + }, + { + "epoch": 3.785239792611795, + "grad_norm": 0.07253803312778473, + "learning_rate": 1.1931230028354112e-05, + "loss": 0.2628, + "step": 46725 + }, + { + "epoch": 3.7853208036292934, + "grad_norm": 0.05372246354818344, + "learning_rate": 1.1926729375759485e-05, + "loss": 0.1846, + "step": 46726 + }, + { + "epoch": 3.785401814646792, + "grad_norm": 0.0695682018995285, + "learning_rate": 1.1922228723164859e-05, + "loss": 0.2172, + "step": 46727 + }, + { + "epoch": 3.7854828256642903, + "grad_norm": 0.07446936517953873, + "learning_rate": 1.1917728070570233e-05, + "loss": 0.2159, + "step": 46728 + }, + { + "epoch": 3.7855638366817885, + "grad_norm": 0.07652425020933151, + "learning_rate": 1.1913227417975608e-05, + "loss": 0.2145, + "step": 46729 + }, + { + "epoch": 3.785644847699287, + "grad_norm": 0.07526133209466934, + "learning_rate": 1.1908726765380981e-05, + "loss": 0.2344, + "step": 46730 + }, + { + "epoch": 3.7857258587167855, + "grad_norm": 0.06465907394886017, + "learning_rate": 1.1904226112786353e-05, + "loss": 0.2097, + "step": 46731 + }, + { + "epoch": 3.7858068697342837, + "grad_norm": 0.055496908724308014, + "learning_rate": 1.1899725460191729e-05, + "loss": 0.204, + "step": 46732 + }, + { + "epoch": 3.7858878807517824, + "grad_norm": 0.07140771299600601, + "learning_rate": 1.1895224807597102e-05, + "loss": 0.1999, + "step": 46733 + }, + { + "epoch": 3.7859688917692806, + "grad_norm": 0.07559964805841446, + "learning_rate": 1.1890724155002476e-05, + "loss": 0.2583, + "step": 46734 + }, + { + "epoch": 3.786049902786779, + "grad_norm": 0.0660320520401001, + "learning_rate": 1.188622350240785e-05, + "loss": 0.231, + "step": 46735 + }, + { + "epoch": 3.7861309138042776, + "grad_norm": 0.06079673767089844, + "learning_rate": 1.1881722849813223e-05, + "loss": 0.2112, + "step": 46736 + }, + { + "epoch": 3.786211924821776, + "grad_norm": 0.07198115438222885, + "learning_rate": 1.1877222197218597e-05, + "loss": 0.1936, + "step": 46737 + }, + { + "epoch": 3.786292935839274, + "grad_norm": 0.07653490453958511, + "learning_rate": 1.1872721544623972e-05, + "loss": 0.2059, + "step": 46738 + }, + { + "epoch": 3.7863739468567728, + "grad_norm": 0.07065163552761078, + "learning_rate": 1.1868220892029344e-05, + "loss": 0.197, + "step": 46739 + }, + { + "epoch": 3.786454957874271, + "grad_norm": 0.08107943832874298, + "learning_rate": 1.1863720239434717e-05, + "loss": 0.2431, + "step": 46740 + }, + { + "epoch": 3.7865359688917692, + "grad_norm": 0.056850139051675797, + "learning_rate": 1.1859219586840093e-05, + "loss": 0.2156, + "step": 46741 + }, + { + "epoch": 3.786616979909268, + "grad_norm": 0.10843037068843842, + "learning_rate": 1.1854718934245466e-05, + "loss": 0.2501, + "step": 46742 + }, + { + "epoch": 3.786697990926766, + "grad_norm": 0.0759415253996849, + "learning_rate": 1.185021828165084e-05, + "loss": 0.2182, + "step": 46743 + }, + { + "epoch": 3.7867790019442644, + "grad_norm": 0.07525965571403503, + "learning_rate": 1.1845717629056214e-05, + "loss": 0.237, + "step": 46744 + }, + { + "epoch": 3.7868600129617627, + "grad_norm": 0.07234998047351837, + "learning_rate": 1.1841216976461587e-05, + "loss": 0.2268, + "step": 46745 + }, + { + "epoch": 3.786941023979261, + "grad_norm": 0.05831919237971306, + "learning_rate": 1.183671632386696e-05, + "loss": 0.2266, + "step": 46746 + }, + { + "epoch": 3.7870220349967596, + "grad_norm": 0.07513531297445297, + "learning_rate": 1.1832215671272336e-05, + "loss": 0.2511, + "step": 46747 + }, + { + "epoch": 3.787103046014258, + "grad_norm": 0.06514651328325272, + "learning_rate": 1.1827715018677708e-05, + "loss": 0.1951, + "step": 46748 + }, + { + "epoch": 3.787184057031756, + "grad_norm": 0.07075642794370651, + "learning_rate": 1.1823214366083082e-05, + "loss": 0.2222, + "step": 46749 + }, + { + "epoch": 3.787265068049255, + "grad_norm": 0.09682905673980713, + "learning_rate": 1.1818713713488457e-05, + "loss": 0.2183, + "step": 46750 + }, + { + "epoch": 3.787346079066753, + "grad_norm": 0.07281651347875595, + "learning_rate": 1.181421306089383e-05, + "loss": 0.2481, + "step": 46751 + }, + { + "epoch": 3.7874270900842513, + "grad_norm": 0.055947430431842804, + "learning_rate": 1.1809712408299204e-05, + "loss": 0.2086, + "step": 46752 + }, + { + "epoch": 3.78750810110175, + "grad_norm": 0.07676409929990768, + "learning_rate": 1.1805211755704578e-05, + "loss": 0.2487, + "step": 46753 + }, + { + "epoch": 3.787589112119248, + "grad_norm": 0.07692111283540726, + "learning_rate": 1.1800711103109951e-05, + "loss": 0.2786, + "step": 46754 + }, + { + "epoch": 3.7876701231367464, + "grad_norm": 0.06674608588218689, + "learning_rate": 1.1796210450515325e-05, + "loss": 0.2249, + "step": 46755 + }, + { + "epoch": 3.787751134154245, + "grad_norm": 0.06333047896623611, + "learning_rate": 1.17917097979207e-05, + "loss": 0.2187, + "step": 46756 + }, + { + "epoch": 3.7878321451717434, + "grad_norm": 0.07365437597036362, + "learning_rate": 1.1787209145326072e-05, + "loss": 0.2089, + "step": 46757 + }, + { + "epoch": 3.7879131561892416, + "grad_norm": 0.09069455415010452, + "learning_rate": 1.1782708492731446e-05, + "loss": 0.2148, + "step": 46758 + }, + { + "epoch": 3.7879941672067403, + "grad_norm": 0.07776268571615219, + "learning_rate": 1.1778207840136821e-05, + "loss": 0.2311, + "step": 46759 + }, + { + "epoch": 3.7880751782242386, + "grad_norm": 0.08250845968723297, + "learning_rate": 1.1773707187542195e-05, + "loss": 0.2531, + "step": 46760 + }, + { + "epoch": 3.788156189241737, + "grad_norm": 0.08193209767341614, + "learning_rate": 1.1769206534947568e-05, + "loss": 0.2188, + "step": 46761 + }, + { + "epoch": 3.7882372002592355, + "grad_norm": 0.07312662899494171, + "learning_rate": 1.1764705882352942e-05, + "loss": 0.2247, + "step": 46762 + }, + { + "epoch": 3.7883182112767337, + "grad_norm": 0.07228390127420425, + "learning_rate": 1.1760205229758315e-05, + "loss": 0.2473, + "step": 46763 + }, + { + "epoch": 3.788399222294232, + "grad_norm": 0.0762069895863533, + "learning_rate": 1.1755704577163689e-05, + "loss": 0.206, + "step": 46764 + }, + { + "epoch": 3.7884802333117307, + "grad_norm": 0.06151983514428139, + "learning_rate": 1.1751203924569063e-05, + "loss": 0.1886, + "step": 46765 + }, + { + "epoch": 3.788561244329229, + "grad_norm": 0.07053109258413315, + "learning_rate": 1.1746703271974436e-05, + "loss": 0.227, + "step": 46766 + }, + { + "epoch": 3.788642255346727, + "grad_norm": 0.06008979305624962, + "learning_rate": 1.174220261937981e-05, + "loss": 0.1776, + "step": 46767 + }, + { + "epoch": 3.7887232663642254, + "grad_norm": 0.06683298200368881, + "learning_rate": 1.1737701966785185e-05, + "loss": 0.2484, + "step": 46768 + }, + { + "epoch": 3.7888042773817237, + "grad_norm": 0.07074110209941864, + "learning_rate": 1.1733201314190557e-05, + "loss": 0.1979, + "step": 46769 + }, + { + "epoch": 3.7888852883992223, + "grad_norm": 0.07903765887022018, + "learning_rate": 1.1728700661595932e-05, + "loss": 0.2239, + "step": 46770 + }, + { + "epoch": 3.7889662994167206, + "grad_norm": 0.08197832852602005, + "learning_rate": 1.1724200009001306e-05, + "loss": 0.2449, + "step": 46771 + }, + { + "epoch": 3.789047310434219, + "grad_norm": 0.07146139442920685, + "learning_rate": 1.171969935640668e-05, + "loss": 0.2572, + "step": 46772 + }, + { + "epoch": 3.7891283214517175, + "grad_norm": 0.061843667179346085, + "learning_rate": 1.1715198703812053e-05, + "loss": 0.2238, + "step": 46773 + }, + { + "epoch": 3.7892093324692158, + "grad_norm": 0.07453931123018265, + "learning_rate": 1.1710698051217427e-05, + "loss": 0.1794, + "step": 46774 + }, + { + "epoch": 3.789290343486714, + "grad_norm": 0.06958021223545074, + "learning_rate": 1.17061973986228e-05, + "loss": 0.2162, + "step": 46775 + }, + { + "epoch": 3.7893713545042127, + "grad_norm": 0.09496792405843735, + "learning_rate": 1.1701696746028176e-05, + "loss": 0.2395, + "step": 46776 + }, + { + "epoch": 3.789452365521711, + "grad_norm": 0.06393705308437347, + "learning_rate": 1.169719609343355e-05, + "loss": 0.2149, + "step": 46777 + }, + { + "epoch": 3.789533376539209, + "grad_norm": 0.07385969907045364, + "learning_rate": 1.1692695440838921e-05, + "loss": 0.2408, + "step": 46778 + }, + { + "epoch": 3.789614387556708, + "grad_norm": 0.06838518381118774, + "learning_rate": 1.1688194788244296e-05, + "loss": 0.2357, + "step": 46779 + }, + { + "epoch": 3.789695398574206, + "grad_norm": 0.07647363841533661, + "learning_rate": 1.168369413564967e-05, + "loss": 0.2594, + "step": 46780 + }, + { + "epoch": 3.7897764095917044, + "grad_norm": 0.0688256323337555, + "learning_rate": 1.1679193483055044e-05, + "loss": 0.2013, + "step": 46781 + }, + { + "epoch": 3.789857420609203, + "grad_norm": 0.07052019238471985, + "learning_rate": 1.1674692830460417e-05, + "loss": 0.2156, + "step": 46782 + }, + { + "epoch": 3.7899384316267013, + "grad_norm": 0.07364077866077423, + "learning_rate": 1.1670192177865791e-05, + "loss": 0.2067, + "step": 46783 + }, + { + "epoch": 3.7900194426441995, + "grad_norm": 0.0811486765742302, + "learning_rate": 1.1665691525271165e-05, + "loss": 0.2418, + "step": 46784 + }, + { + "epoch": 3.7901004536616982, + "grad_norm": 0.08562948554754257, + "learning_rate": 1.166119087267654e-05, + "loss": 0.2343, + "step": 46785 + }, + { + "epoch": 3.7901814646791965, + "grad_norm": 0.08794140070676804, + "learning_rate": 1.1656690220081913e-05, + "loss": 0.2201, + "step": 46786 + }, + { + "epoch": 3.7902624756966947, + "grad_norm": 0.07163339853286743, + "learning_rate": 1.1652189567487285e-05, + "loss": 0.2319, + "step": 46787 + }, + { + "epoch": 3.790343486714193, + "grad_norm": 0.06453966349363327, + "learning_rate": 1.164768891489266e-05, + "loss": 0.2019, + "step": 46788 + }, + { + "epoch": 3.7904244977316917, + "grad_norm": 0.0652361512184143, + "learning_rate": 1.1643188262298034e-05, + "loss": 0.2053, + "step": 46789 + }, + { + "epoch": 3.79050550874919, + "grad_norm": 0.06884594261646271, + "learning_rate": 1.1638687609703408e-05, + "loss": 0.2362, + "step": 46790 + }, + { + "epoch": 3.790586519766688, + "grad_norm": 0.07537364959716797, + "learning_rate": 1.1634186957108781e-05, + "loss": 0.2326, + "step": 46791 + }, + { + "epoch": 3.7906675307841864, + "grad_norm": 0.06967596709728241, + "learning_rate": 1.1629686304514155e-05, + "loss": 0.1997, + "step": 46792 + }, + { + "epoch": 3.790748541801685, + "grad_norm": 0.08205866068601608, + "learning_rate": 1.1625185651919529e-05, + "loss": 0.2289, + "step": 46793 + }, + { + "epoch": 3.7908295528191833, + "grad_norm": 0.08043761551380157, + "learning_rate": 1.1620684999324904e-05, + "loss": 0.2037, + "step": 46794 + }, + { + "epoch": 3.7909105638366816, + "grad_norm": 0.07155504822731018, + "learning_rate": 1.1616184346730276e-05, + "loss": 0.2216, + "step": 46795 + }, + { + "epoch": 3.7909915748541803, + "grad_norm": 0.07402777671813965, + "learning_rate": 1.161168369413565e-05, + "loss": 0.237, + "step": 46796 + }, + { + "epoch": 3.7910725858716785, + "grad_norm": 0.07951734960079193, + "learning_rate": 1.1607183041541025e-05, + "loss": 0.2707, + "step": 46797 + }, + { + "epoch": 3.7911535968891767, + "grad_norm": 0.07128282636404037, + "learning_rate": 1.1602682388946398e-05, + "loss": 0.2531, + "step": 46798 + }, + { + "epoch": 3.7912346079066754, + "grad_norm": 0.07474765181541443, + "learning_rate": 1.1598181736351772e-05, + "loss": 0.2073, + "step": 46799 + }, + { + "epoch": 3.7913156189241737, + "grad_norm": 0.06123476102948189, + "learning_rate": 1.1593681083757146e-05, + "loss": 0.2372, + "step": 46800 + }, + { + "epoch": 3.791396629941672, + "grad_norm": 0.08566850423812866, + "learning_rate": 1.1589180431162519e-05, + "loss": 0.2129, + "step": 46801 + }, + { + "epoch": 3.7914776409591706, + "grad_norm": 0.0908622220158577, + "learning_rate": 1.1584679778567893e-05, + "loss": 0.2277, + "step": 46802 + }, + { + "epoch": 3.791558651976669, + "grad_norm": 0.05587979778647423, + "learning_rate": 1.1580179125973268e-05, + "loss": 0.2102, + "step": 46803 + }, + { + "epoch": 3.791639662994167, + "grad_norm": 0.08635761588811874, + "learning_rate": 1.157567847337864e-05, + "loss": 0.208, + "step": 46804 + }, + { + "epoch": 3.791720674011666, + "grad_norm": 0.07278425991535187, + "learning_rate": 1.1571177820784014e-05, + "loss": 0.2413, + "step": 46805 + }, + { + "epoch": 3.791801685029164, + "grad_norm": 0.08281299471855164, + "learning_rate": 1.1566677168189389e-05, + "loss": 0.2106, + "step": 46806 + }, + { + "epoch": 3.7918826960466623, + "grad_norm": 0.07331330329179764, + "learning_rate": 1.1562176515594762e-05, + "loss": 0.2529, + "step": 46807 + }, + { + "epoch": 3.791963707064161, + "grad_norm": 0.07193376123905182, + "learning_rate": 1.1557675863000134e-05, + "loss": 0.2057, + "step": 46808 + }, + { + "epoch": 3.792044718081659, + "grad_norm": 0.0806722342967987, + "learning_rate": 1.155317521040551e-05, + "loss": 0.2446, + "step": 46809 + }, + { + "epoch": 3.7921257290991575, + "grad_norm": 0.0638408288359642, + "learning_rate": 1.1548674557810883e-05, + "loss": 0.24, + "step": 46810 + }, + { + "epoch": 3.7922067401166557, + "grad_norm": 0.07036413997411728, + "learning_rate": 1.1544173905216257e-05, + "loss": 0.2169, + "step": 46811 + }, + { + "epoch": 3.7922877511341544, + "grad_norm": 0.06902635842561722, + "learning_rate": 1.153967325262163e-05, + "loss": 0.2231, + "step": 46812 + }, + { + "epoch": 3.7923687621516526, + "grad_norm": 0.06593969464302063, + "learning_rate": 1.1535172600027004e-05, + "loss": 0.2089, + "step": 46813 + }, + { + "epoch": 3.792449773169151, + "grad_norm": 0.07559620589017868, + "learning_rate": 1.1530671947432378e-05, + "loss": 0.2276, + "step": 46814 + }, + { + "epoch": 3.792530784186649, + "grad_norm": 0.06421814113855362, + "learning_rate": 1.1526171294837753e-05, + "loss": 0.2729, + "step": 46815 + }, + { + "epoch": 3.792611795204148, + "grad_norm": 0.06696971505880356, + "learning_rate": 1.1521670642243127e-05, + "loss": 0.2587, + "step": 46816 + }, + { + "epoch": 3.792692806221646, + "grad_norm": 0.08010052144527435, + "learning_rate": 1.1517169989648498e-05, + "loss": 0.1825, + "step": 46817 + }, + { + "epoch": 3.7927738172391443, + "grad_norm": 0.08656228333711624, + "learning_rate": 1.1512669337053874e-05, + "loss": 0.2341, + "step": 46818 + }, + { + "epoch": 3.792854828256643, + "grad_norm": 0.07407809793949127, + "learning_rate": 1.1508168684459247e-05, + "loss": 0.2599, + "step": 46819 + }, + { + "epoch": 3.7929358392741412, + "grad_norm": 0.07218343764543533, + "learning_rate": 1.1503668031864621e-05, + "loss": 0.2203, + "step": 46820 + }, + { + "epoch": 3.7930168502916395, + "grad_norm": 0.07398039847612381, + "learning_rate": 1.1499167379269995e-05, + "loss": 0.2177, + "step": 46821 + }, + { + "epoch": 3.793097861309138, + "grad_norm": 0.06317486613988876, + "learning_rate": 1.1494666726675368e-05, + "loss": 0.2141, + "step": 46822 + }, + { + "epoch": 3.7931788723266364, + "grad_norm": 0.07027360796928406, + "learning_rate": 1.1490166074080742e-05, + "loss": 0.2265, + "step": 46823 + }, + { + "epoch": 3.7932598833441347, + "grad_norm": 0.07729087769985199, + "learning_rate": 1.1485665421486117e-05, + "loss": 0.2256, + "step": 46824 + }, + { + "epoch": 3.7933408943616334, + "grad_norm": 0.07687903195619583, + "learning_rate": 1.1481164768891489e-05, + "loss": 0.2307, + "step": 46825 + }, + { + "epoch": 3.7934219053791316, + "grad_norm": 0.07208074629306793, + "learning_rate": 1.1476664116296863e-05, + "loss": 0.2155, + "step": 46826 + }, + { + "epoch": 3.79350291639663, + "grad_norm": 0.07030342519283295, + "learning_rate": 1.1472163463702238e-05, + "loss": 0.2055, + "step": 46827 + }, + { + "epoch": 3.7935839274141285, + "grad_norm": 0.06092243641614914, + "learning_rate": 1.1467662811107612e-05, + "loss": 0.2253, + "step": 46828 + }, + { + "epoch": 3.7936649384316268, + "grad_norm": 0.08080343902111053, + "learning_rate": 1.1463162158512985e-05, + "loss": 0.2315, + "step": 46829 + }, + { + "epoch": 3.793745949449125, + "grad_norm": 0.06889671087265015, + "learning_rate": 1.1458661505918359e-05, + "loss": 0.2071, + "step": 46830 + }, + { + "epoch": 3.7938269604666237, + "grad_norm": 0.061423543840646744, + "learning_rate": 1.1454160853323732e-05, + "loss": 0.1851, + "step": 46831 + }, + { + "epoch": 3.793907971484122, + "grad_norm": 0.06487643718719482, + "learning_rate": 1.1449660200729106e-05, + "loss": 0.225, + "step": 46832 + }, + { + "epoch": 3.79398898250162, + "grad_norm": 0.0830056294798851, + "learning_rate": 1.1445159548134481e-05, + "loss": 0.217, + "step": 46833 + }, + { + "epoch": 3.7940699935191184, + "grad_norm": 0.06530779600143433, + "learning_rate": 1.1440658895539853e-05, + "loss": 0.2175, + "step": 46834 + }, + { + "epoch": 3.794151004536617, + "grad_norm": 0.07887805998325348, + "learning_rate": 1.1436158242945227e-05, + "loss": 0.2445, + "step": 46835 + }, + { + "epoch": 3.7942320155541154, + "grad_norm": 0.06664882600307465, + "learning_rate": 1.1431657590350602e-05, + "loss": 0.2433, + "step": 46836 + }, + { + "epoch": 3.7943130265716136, + "grad_norm": 0.06435833871364594, + "learning_rate": 1.1427156937755976e-05, + "loss": 0.2001, + "step": 46837 + }, + { + "epoch": 3.794394037589112, + "grad_norm": 0.062327224761247635, + "learning_rate": 1.1422656285161348e-05, + "loss": 0.2213, + "step": 46838 + }, + { + "epoch": 3.7944750486066106, + "grad_norm": 0.07332039624452591, + "learning_rate": 1.1418155632566723e-05, + "loss": 0.1763, + "step": 46839 + }, + { + "epoch": 3.794556059624109, + "grad_norm": 0.09450053423643112, + "learning_rate": 1.1413654979972096e-05, + "loss": 0.2553, + "step": 46840 + }, + { + "epoch": 3.794637070641607, + "grad_norm": 0.06693083792924881, + "learning_rate": 1.140915432737747e-05, + "loss": 0.1962, + "step": 46841 + }, + { + "epoch": 3.7947180816591057, + "grad_norm": 0.07547756284475327, + "learning_rate": 1.1404653674782845e-05, + "loss": 0.217, + "step": 46842 + }, + { + "epoch": 3.794799092676604, + "grad_norm": 0.06735184788703918, + "learning_rate": 1.1400153022188217e-05, + "loss": 0.2216, + "step": 46843 + }, + { + "epoch": 3.7948801036941022, + "grad_norm": 0.06307090818881989, + "learning_rate": 1.1395652369593591e-05, + "loss": 0.2395, + "step": 46844 + }, + { + "epoch": 3.794961114711601, + "grad_norm": 0.06501543521881104, + "learning_rate": 1.1391151716998966e-05, + "loss": 0.2125, + "step": 46845 + }, + { + "epoch": 3.795042125729099, + "grad_norm": 0.09293830394744873, + "learning_rate": 1.138665106440434e-05, + "loss": 0.2492, + "step": 46846 + }, + { + "epoch": 3.7951231367465974, + "grad_norm": 0.06201693415641785, + "learning_rate": 1.1382150411809712e-05, + "loss": 0.2137, + "step": 46847 + }, + { + "epoch": 3.795204147764096, + "grad_norm": 0.08595079183578491, + "learning_rate": 1.1377649759215087e-05, + "loss": 0.2071, + "step": 46848 + }, + { + "epoch": 3.7952851587815943, + "grad_norm": 0.07711045444011688, + "learning_rate": 1.137314910662046e-05, + "loss": 0.2485, + "step": 46849 + }, + { + "epoch": 3.7953661697990926, + "grad_norm": 0.08922844380140305, + "learning_rate": 1.1368648454025834e-05, + "loss": 0.23, + "step": 46850 + }, + { + "epoch": 3.7954471808165913, + "grad_norm": 0.07273554801940918, + "learning_rate": 1.1364147801431208e-05, + "loss": 0.2015, + "step": 46851 + }, + { + "epoch": 3.7955281918340895, + "grad_norm": 0.06916754692792892, + "learning_rate": 1.1359647148836581e-05, + "loss": 0.2357, + "step": 46852 + }, + { + "epoch": 3.7956092028515878, + "grad_norm": 0.08604365587234497, + "learning_rate": 1.1355146496241955e-05, + "loss": 0.2403, + "step": 46853 + }, + { + "epoch": 3.7956902138690864, + "grad_norm": 0.07010132074356079, + "learning_rate": 1.135064584364733e-05, + "loss": 0.2159, + "step": 46854 + }, + { + "epoch": 3.7957712248865847, + "grad_norm": 0.08245091885328293, + "learning_rate": 1.1346145191052702e-05, + "loss": 0.2221, + "step": 46855 + }, + { + "epoch": 3.795852235904083, + "grad_norm": 0.07700005918741226, + "learning_rate": 1.1341644538458076e-05, + "loss": 0.2152, + "step": 46856 + }, + { + "epoch": 3.795933246921581, + "grad_norm": 0.08508516103029251, + "learning_rate": 1.1337143885863451e-05, + "loss": 0.2322, + "step": 46857 + }, + { + "epoch": 3.79601425793908, + "grad_norm": 0.08205549418926239, + "learning_rate": 1.1332643233268825e-05, + "loss": 0.1954, + "step": 46858 + }, + { + "epoch": 3.796095268956578, + "grad_norm": 0.06523613631725311, + "learning_rate": 1.1328142580674198e-05, + "loss": 0.2025, + "step": 46859 + }, + { + "epoch": 3.7961762799740764, + "grad_norm": 0.08023390918970108, + "learning_rate": 1.1323641928079572e-05, + "loss": 0.2702, + "step": 46860 + }, + { + "epoch": 3.7962572909915746, + "grad_norm": 0.08578156679868698, + "learning_rate": 1.1319141275484946e-05, + "loss": 0.2349, + "step": 46861 + }, + { + "epoch": 3.7963383020090733, + "grad_norm": 0.06631217896938324, + "learning_rate": 1.1314640622890319e-05, + "loss": 0.2388, + "step": 46862 + }, + { + "epoch": 3.7964193130265715, + "grad_norm": 0.07069762051105499, + "learning_rate": 1.1310139970295694e-05, + "loss": 0.2054, + "step": 46863 + }, + { + "epoch": 3.79650032404407, + "grad_norm": 0.07219357043504715, + "learning_rate": 1.1305639317701066e-05, + "loss": 0.2596, + "step": 46864 + }, + { + "epoch": 3.7965813350615685, + "grad_norm": 0.05612671747803688, + "learning_rate": 1.130113866510644e-05, + "loss": 0.2227, + "step": 46865 + }, + { + "epoch": 3.7966623460790667, + "grad_norm": 0.08102644979953766, + "learning_rate": 1.1296638012511815e-05, + "loss": 0.2556, + "step": 46866 + }, + { + "epoch": 3.796743357096565, + "grad_norm": 0.05597464740276337, + "learning_rate": 1.1292137359917189e-05, + "loss": 0.2073, + "step": 46867 + }, + { + "epoch": 3.7968243681140637, + "grad_norm": 0.060989245772361755, + "learning_rate": 1.1287636707322562e-05, + "loss": 0.2173, + "step": 46868 + }, + { + "epoch": 3.796905379131562, + "grad_norm": 0.07221823185682297, + "learning_rate": 1.1283136054727936e-05, + "loss": 0.2246, + "step": 46869 + }, + { + "epoch": 3.79698639014906, + "grad_norm": 0.07029426097869873, + "learning_rate": 1.127863540213331e-05, + "loss": 0.2219, + "step": 46870 + }, + { + "epoch": 3.797067401166559, + "grad_norm": 0.06306787580251694, + "learning_rate": 1.1274134749538683e-05, + "loss": 0.1918, + "step": 46871 + }, + { + "epoch": 3.797148412184057, + "grad_norm": 0.0684630498290062, + "learning_rate": 1.1269634096944059e-05, + "loss": 0.2177, + "step": 46872 + }, + { + "epoch": 3.7972294232015553, + "grad_norm": 0.06849277019500732, + "learning_rate": 1.126513344434943e-05, + "loss": 0.2131, + "step": 46873 + }, + { + "epoch": 3.797310434219054, + "grad_norm": 0.08130700141191483, + "learning_rate": 1.1260632791754806e-05, + "loss": 0.2531, + "step": 46874 + }, + { + "epoch": 3.7973914452365523, + "grad_norm": 0.0685175359249115, + "learning_rate": 1.125613213916018e-05, + "loss": 0.2292, + "step": 46875 + }, + { + "epoch": 3.7974724562540505, + "grad_norm": 0.054702240973711014, + "learning_rate": 1.1251631486565553e-05, + "loss": 0.1895, + "step": 46876 + }, + { + "epoch": 3.797553467271549, + "grad_norm": 0.07435936480760574, + "learning_rate": 1.1247130833970927e-05, + "loss": 0.2479, + "step": 46877 + }, + { + "epoch": 3.7976344782890474, + "grad_norm": 0.062439724802970886, + "learning_rate": 1.12426301813763e-05, + "loss": 0.2002, + "step": 46878 + }, + { + "epoch": 3.7977154893065457, + "grad_norm": 0.07218343019485474, + "learning_rate": 1.1238129528781674e-05, + "loss": 0.1966, + "step": 46879 + }, + { + "epoch": 3.797796500324044, + "grad_norm": 0.055278975516557693, + "learning_rate": 1.1233628876187047e-05, + "loss": 0.2051, + "step": 46880 + }, + { + "epoch": 3.7978775113415426, + "grad_norm": 0.07765598595142365, + "learning_rate": 1.1229128223592421e-05, + "loss": 0.2112, + "step": 46881 + }, + { + "epoch": 3.797958522359041, + "grad_norm": 0.07940090447664261, + "learning_rate": 1.1224627570997795e-05, + "loss": 0.2491, + "step": 46882 + }, + { + "epoch": 3.798039533376539, + "grad_norm": 0.06536324322223663, + "learning_rate": 1.122012691840317e-05, + "loss": 0.2599, + "step": 46883 + }, + { + "epoch": 3.7981205443940373, + "grad_norm": 0.06517624109983444, + "learning_rate": 1.1215626265808543e-05, + "loss": 0.1863, + "step": 46884 + }, + { + "epoch": 3.798201555411536, + "grad_norm": 0.06190231814980507, + "learning_rate": 1.1211125613213917e-05, + "loss": 0.2044, + "step": 46885 + }, + { + "epoch": 3.7982825664290343, + "grad_norm": 0.0682215765118599, + "learning_rate": 1.120662496061929e-05, + "loss": 0.2717, + "step": 46886 + }, + { + "epoch": 3.7983635774465325, + "grad_norm": 0.08895383030176163, + "learning_rate": 1.1202124308024664e-05, + "loss": 0.2608, + "step": 46887 + }, + { + "epoch": 3.798444588464031, + "grad_norm": 0.07362218201160431, + "learning_rate": 1.1197623655430038e-05, + "loss": 0.224, + "step": 46888 + }, + { + "epoch": 3.7985255994815295, + "grad_norm": 0.06116807460784912, + "learning_rate": 1.1193123002835411e-05, + "loss": 0.2436, + "step": 46889 + }, + { + "epoch": 3.7986066104990277, + "grad_norm": 0.06734014302492142, + "learning_rate": 1.1188622350240785e-05, + "loss": 0.2235, + "step": 46890 + }, + { + "epoch": 3.7986876215165264, + "grad_norm": 0.06932863593101501, + "learning_rate": 1.1184121697646159e-05, + "loss": 0.2126, + "step": 46891 + }, + { + "epoch": 3.7987686325340246, + "grad_norm": 0.06863842904567719, + "learning_rate": 1.1179621045051534e-05, + "loss": 0.2258, + "step": 46892 + }, + { + "epoch": 3.798849643551523, + "grad_norm": 0.06340306252241135, + "learning_rate": 1.1175120392456908e-05, + "loss": 0.2538, + "step": 46893 + }, + { + "epoch": 3.7989306545690216, + "grad_norm": 0.0711321234703064, + "learning_rate": 1.117061973986228e-05, + "loss": 0.2343, + "step": 46894 + }, + { + "epoch": 3.79901166558652, + "grad_norm": 0.08093009144067764, + "learning_rate": 1.1166119087267655e-05, + "loss": 0.2195, + "step": 46895 + }, + { + "epoch": 3.799092676604018, + "grad_norm": 0.08460341393947601, + "learning_rate": 1.1161618434673028e-05, + "loss": 0.2329, + "step": 46896 + }, + { + "epoch": 3.7991736876215167, + "grad_norm": 0.06339055299758911, + "learning_rate": 1.1157117782078402e-05, + "loss": 0.2479, + "step": 46897 + }, + { + "epoch": 3.799254698639015, + "grad_norm": 0.06539372354745865, + "learning_rate": 1.1152617129483776e-05, + "loss": 0.2171, + "step": 46898 + }, + { + "epoch": 3.7993357096565132, + "grad_norm": 0.0670798122882843, + "learning_rate": 1.114811647688915e-05, + "loss": 0.2258, + "step": 46899 + }, + { + "epoch": 3.799416720674012, + "grad_norm": 0.07918045669794083, + "learning_rate": 1.1143615824294523e-05, + "loss": 0.2393, + "step": 46900 + }, + { + "epoch": 3.79949773169151, + "grad_norm": 0.08320426940917969, + "learning_rate": 1.1139115171699898e-05, + "loss": 0.211, + "step": 46901 + }, + { + "epoch": 3.7995787427090084, + "grad_norm": 0.07436665892601013, + "learning_rate": 1.1134614519105272e-05, + "loss": 0.2237, + "step": 46902 + }, + { + "epoch": 3.7996597537265067, + "grad_norm": 0.0866570994257927, + "learning_rate": 1.1130113866510644e-05, + "loss": 0.2256, + "step": 46903 + }, + { + "epoch": 3.7997407647440054, + "grad_norm": 0.07788897305727005, + "learning_rate": 1.1125613213916019e-05, + "loss": 0.2382, + "step": 46904 + }, + { + "epoch": 3.7998217757615036, + "grad_norm": 0.06670193374156952, + "learning_rate": 1.1121112561321393e-05, + "loss": 0.2062, + "step": 46905 + }, + { + "epoch": 3.799902786779002, + "grad_norm": 0.0638703927397728, + "learning_rate": 1.1116611908726766e-05, + "loss": 0.2127, + "step": 46906 + }, + { + "epoch": 3.7999837977965, + "grad_norm": 0.06693035364151001, + "learning_rate": 1.111211125613214e-05, + "loss": 0.2352, + "step": 46907 + }, + { + "epoch": 3.8000648088139988, + "grad_norm": 0.08188080787658691, + "learning_rate": 1.1107610603537513e-05, + "loss": 0.2332, + "step": 46908 + }, + { + "epoch": 3.800145819831497, + "grad_norm": 0.06827425211668015, + "learning_rate": 1.1103109950942887e-05, + "loss": 0.2249, + "step": 46909 + }, + { + "epoch": 3.8002268308489953, + "grad_norm": 0.06886734068393707, + "learning_rate": 1.1098609298348262e-05, + "loss": 0.2149, + "step": 46910 + }, + { + "epoch": 3.800307841866494, + "grad_norm": 0.08088071644306183, + "learning_rate": 1.1094108645753634e-05, + "loss": 0.2399, + "step": 46911 + }, + { + "epoch": 3.800388852883992, + "grad_norm": 0.08014461398124695, + "learning_rate": 1.1089607993159008e-05, + "loss": 0.2109, + "step": 46912 + }, + { + "epoch": 3.8004698639014904, + "grad_norm": 0.08599234372377396, + "learning_rate": 1.1085107340564383e-05, + "loss": 0.2075, + "step": 46913 + }, + { + "epoch": 3.800550874918989, + "grad_norm": 0.07755023241043091, + "learning_rate": 1.1080606687969757e-05, + "loss": 0.2339, + "step": 46914 + }, + { + "epoch": 3.8006318859364874, + "grad_norm": 0.06985119730234146, + "learning_rate": 1.107610603537513e-05, + "loss": 0.2058, + "step": 46915 + }, + { + "epoch": 3.8007128969539856, + "grad_norm": 0.0760570541024208, + "learning_rate": 1.1071605382780504e-05, + "loss": 0.1955, + "step": 46916 + }, + { + "epoch": 3.8007939079714843, + "grad_norm": 0.06698282063007355, + "learning_rate": 1.1067104730185877e-05, + "loss": 0.1952, + "step": 46917 + }, + { + "epoch": 3.8008749189889826, + "grad_norm": 0.07477567344903946, + "learning_rate": 1.1062604077591251e-05, + "loss": 0.214, + "step": 46918 + }, + { + "epoch": 3.800955930006481, + "grad_norm": 0.07477416098117828, + "learning_rate": 1.1058103424996626e-05, + "loss": 0.2861, + "step": 46919 + }, + { + "epoch": 3.8010369410239795, + "grad_norm": 0.07379210740327835, + "learning_rate": 1.1053602772401998e-05, + "loss": 0.2168, + "step": 46920 + }, + { + "epoch": 3.8011179520414777, + "grad_norm": 0.07364281266927719, + "learning_rate": 1.1049102119807372e-05, + "loss": 0.2101, + "step": 46921 + }, + { + "epoch": 3.801198963058976, + "grad_norm": 0.06869223713874817, + "learning_rate": 1.1044601467212747e-05, + "loss": 0.1867, + "step": 46922 + }, + { + "epoch": 3.8012799740764747, + "grad_norm": 0.072836734354496, + "learning_rate": 1.104010081461812e-05, + "loss": 0.2036, + "step": 46923 + }, + { + "epoch": 3.801360985093973, + "grad_norm": 0.060940057039260864, + "learning_rate": 1.1035600162023493e-05, + "loss": 0.196, + "step": 46924 + }, + { + "epoch": 3.801441996111471, + "grad_norm": 0.07574928551912308, + "learning_rate": 1.1031099509428868e-05, + "loss": 0.2082, + "step": 46925 + }, + { + "epoch": 3.8015230071289694, + "grad_norm": 0.08537424355745316, + "learning_rate": 1.1026598856834242e-05, + "loss": 0.233, + "step": 46926 + }, + { + "epoch": 3.8016040181464676, + "grad_norm": 0.06580768525600433, + "learning_rate": 1.1022098204239615e-05, + "loss": 0.197, + "step": 46927 + }, + { + "epoch": 3.8016850291639663, + "grad_norm": 0.06638256460428238, + "learning_rate": 1.1017597551644989e-05, + "loss": 0.2215, + "step": 46928 + }, + { + "epoch": 3.8017660401814646, + "grad_norm": 0.09222010523080826, + "learning_rate": 1.1013096899050362e-05, + "loss": 0.2341, + "step": 46929 + }, + { + "epoch": 3.801847051198963, + "grad_norm": 0.07473092526197433, + "learning_rate": 1.1008596246455736e-05, + "loss": 0.2388, + "step": 46930 + }, + { + "epoch": 3.8019280622164615, + "grad_norm": 0.07252812385559082, + "learning_rate": 1.1004095593861111e-05, + "loss": 0.2267, + "step": 46931 + }, + { + "epoch": 3.8020090732339598, + "grad_norm": 0.077595055103302, + "learning_rate": 1.0999594941266485e-05, + "loss": 0.2154, + "step": 46932 + }, + { + "epoch": 3.802090084251458, + "grad_norm": 0.06827554106712341, + "learning_rate": 1.0995094288671857e-05, + "loss": 0.2222, + "step": 46933 + }, + { + "epoch": 3.8021710952689567, + "grad_norm": 0.07482986897230148, + "learning_rate": 1.0990593636077232e-05, + "loss": 0.1913, + "step": 46934 + }, + { + "epoch": 3.802252106286455, + "grad_norm": 0.07571776211261749, + "learning_rate": 1.0986092983482606e-05, + "loss": 0.2262, + "step": 46935 + }, + { + "epoch": 3.802333117303953, + "grad_norm": 0.08740594238042831, + "learning_rate": 1.098159233088798e-05, + "loss": 0.2704, + "step": 46936 + }, + { + "epoch": 3.802414128321452, + "grad_norm": 0.08148932456970215, + "learning_rate": 1.0977091678293353e-05, + "loss": 0.1973, + "step": 46937 + }, + { + "epoch": 3.80249513933895, + "grad_norm": 0.07424408197402954, + "learning_rate": 1.0972591025698727e-05, + "loss": 0.1896, + "step": 46938 + }, + { + "epoch": 3.8025761503564484, + "grad_norm": 0.06565883755683899, + "learning_rate": 1.09680903731041e-05, + "loss": 0.193, + "step": 46939 + }, + { + "epoch": 3.802657161373947, + "grad_norm": 0.0705159530043602, + "learning_rate": 1.0963589720509475e-05, + "loss": 0.218, + "step": 46940 + }, + { + "epoch": 3.8027381723914453, + "grad_norm": 0.07602539658546448, + "learning_rate": 1.0959089067914847e-05, + "loss": 0.2325, + "step": 46941 + }, + { + "epoch": 3.8028191834089435, + "grad_norm": 0.08233897387981415, + "learning_rate": 1.0954588415320221e-05, + "loss": 0.2499, + "step": 46942 + }, + { + "epoch": 3.8029001944264422, + "grad_norm": 0.07934436202049255, + "learning_rate": 1.0950087762725596e-05, + "loss": 0.2348, + "step": 46943 + }, + { + "epoch": 3.8029812054439405, + "grad_norm": 0.06771920621395111, + "learning_rate": 1.094558711013097e-05, + "loss": 0.2272, + "step": 46944 + }, + { + "epoch": 3.8030622164614387, + "grad_norm": 0.0798344835639, + "learning_rate": 1.0941086457536343e-05, + "loss": 0.2718, + "step": 46945 + }, + { + "epoch": 3.8031432274789374, + "grad_norm": 0.062396757304668427, + "learning_rate": 1.0936585804941717e-05, + "loss": 0.2419, + "step": 46946 + }, + { + "epoch": 3.8032242384964356, + "grad_norm": 0.0885952040553093, + "learning_rate": 1.093208515234709e-05, + "loss": 0.2712, + "step": 46947 + }, + { + "epoch": 3.803305249513934, + "grad_norm": 0.06853952258825302, + "learning_rate": 1.0927584499752464e-05, + "loss": 0.2351, + "step": 46948 + }, + { + "epoch": 3.803386260531432, + "grad_norm": 0.05572928115725517, + "learning_rate": 1.092308384715784e-05, + "loss": 0.181, + "step": 46949 + }, + { + "epoch": 3.8034672715489304, + "grad_norm": 0.07789760828018188, + "learning_rate": 1.0918583194563211e-05, + "loss": 0.2282, + "step": 46950 + }, + { + "epoch": 3.803548282566429, + "grad_norm": 0.06651580333709717, + "learning_rate": 1.0914082541968585e-05, + "loss": 0.2261, + "step": 46951 + }, + { + "epoch": 3.8036292935839273, + "grad_norm": 0.07149095833301544, + "learning_rate": 1.090958188937396e-05, + "loss": 0.2143, + "step": 46952 + }, + { + "epoch": 3.8037103046014256, + "grad_norm": 0.07038377225399017, + "learning_rate": 1.0905081236779334e-05, + "loss": 0.2192, + "step": 46953 + }, + { + "epoch": 3.8037913156189243, + "grad_norm": 0.07646401226520538, + "learning_rate": 1.0900580584184706e-05, + "loss": 0.2406, + "step": 46954 + }, + { + "epoch": 3.8038723266364225, + "grad_norm": 0.09022786468267441, + "learning_rate": 1.0896079931590081e-05, + "loss": 0.2274, + "step": 46955 + }, + { + "epoch": 3.8039533376539207, + "grad_norm": 0.0737614631652832, + "learning_rate": 1.0891579278995455e-05, + "loss": 0.2387, + "step": 46956 + }, + { + "epoch": 3.8040343486714194, + "grad_norm": 0.06396791338920593, + "learning_rate": 1.0887078626400828e-05, + "loss": 0.2446, + "step": 46957 + }, + { + "epoch": 3.8041153596889177, + "grad_norm": 0.09200059622526169, + "learning_rate": 1.0882577973806204e-05, + "loss": 0.225, + "step": 46958 + }, + { + "epoch": 3.804196370706416, + "grad_norm": 0.07705798000097275, + "learning_rate": 1.0878077321211576e-05, + "loss": 0.2335, + "step": 46959 + }, + { + "epoch": 3.8042773817239146, + "grad_norm": 0.06533969193696976, + "learning_rate": 1.087357666861695e-05, + "loss": 0.2366, + "step": 46960 + }, + { + "epoch": 3.804358392741413, + "grad_norm": 0.06600935012102127, + "learning_rate": 1.0869076016022324e-05, + "loss": 0.2587, + "step": 46961 + }, + { + "epoch": 3.804439403758911, + "grad_norm": 0.06288919597864151, + "learning_rate": 1.0864575363427698e-05, + "loss": 0.2536, + "step": 46962 + }, + { + "epoch": 3.80452041477641, + "grad_norm": 0.09356596320867538, + "learning_rate": 1.086007471083307e-05, + "loss": 0.2503, + "step": 46963 + }, + { + "epoch": 3.804601425793908, + "grad_norm": 0.0647389367222786, + "learning_rate": 1.0855574058238445e-05, + "loss": 0.2205, + "step": 46964 + }, + { + "epoch": 3.8046824368114063, + "grad_norm": 0.06880980730056763, + "learning_rate": 1.0851073405643819e-05, + "loss": 0.2179, + "step": 46965 + }, + { + "epoch": 3.804763447828905, + "grad_norm": 0.06399885565042496, + "learning_rate": 1.0846572753049192e-05, + "loss": 0.2462, + "step": 46966 + }, + { + "epoch": 3.804844458846403, + "grad_norm": 0.07660163193941116, + "learning_rate": 1.0842072100454566e-05, + "loss": 0.2162, + "step": 46967 + }, + { + "epoch": 3.8049254698639015, + "grad_norm": 0.06323090195655823, + "learning_rate": 1.083757144785994e-05, + "loss": 0.2289, + "step": 46968 + }, + { + "epoch": 3.8050064808814, + "grad_norm": 0.05920364335179329, + "learning_rate": 1.0833070795265313e-05, + "loss": 0.2378, + "step": 46969 + }, + { + "epoch": 3.8050874918988984, + "grad_norm": 0.07590261846780777, + "learning_rate": 1.0828570142670689e-05, + "loss": 0.205, + "step": 46970 + }, + { + "epoch": 3.8051685029163966, + "grad_norm": 0.07777293771505356, + "learning_rate": 1.082406949007606e-05, + "loss": 0.2218, + "step": 46971 + }, + { + "epoch": 3.805249513933895, + "grad_norm": 0.09066983312368393, + "learning_rate": 1.0819568837481436e-05, + "loss": 0.2632, + "step": 46972 + }, + { + "epoch": 3.805330524951393, + "grad_norm": 0.07806780189275742, + "learning_rate": 1.081506818488681e-05, + "loss": 0.1912, + "step": 46973 + }, + { + "epoch": 3.805411535968892, + "grad_norm": 0.0826018676161766, + "learning_rate": 1.0810567532292183e-05, + "loss": 0.2424, + "step": 46974 + }, + { + "epoch": 3.80549254698639, + "grad_norm": 0.08856657892465591, + "learning_rate": 1.0806066879697557e-05, + "loss": 0.2551, + "step": 46975 + }, + { + "epoch": 3.8055735580038883, + "grad_norm": 0.06531494110822678, + "learning_rate": 1.080156622710293e-05, + "loss": 0.223, + "step": 46976 + }, + { + "epoch": 3.805654569021387, + "grad_norm": 0.07311249524354935, + "learning_rate": 1.0797065574508304e-05, + "loss": 0.2724, + "step": 46977 + }, + { + "epoch": 3.8057355800388852, + "grad_norm": 0.060811128467321396, + "learning_rate": 1.0792564921913677e-05, + "loss": 0.2253, + "step": 46978 + }, + { + "epoch": 3.8058165910563835, + "grad_norm": 0.06457604467868805, + "learning_rate": 1.0788064269319053e-05, + "loss": 0.2496, + "step": 46979 + }, + { + "epoch": 3.805897602073882, + "grad_norm": 0.08397595584392548, + "learning_rate": 1.0783563616724425e-05, + "loss": 0.2444, + "step": 46980 + }, + { + "epoch": 3.8059786130913804, + "grad_norm": 0.07146035879850388, + "learning_rate": 1.07790629641298e-05, + "loss": 0.2103, + "step": 46981 + }, + { + "epoch": 3.8060596241088787, + "grad_norm": 0.08192295581102371, + "learning_rate": 1.0774562311535174e-05, + "loss": 0.2084, + "step": 46982 + }, + { + "epoch": 3.8061406351263773, + "grad_norm": 0.057976480573415756, + "learning_rate": 1.0770061658940547e-05, + "loss": 0.2378, + "step": 46983 + }, + { + "epoch": 3.8062216461438756, + "grad_norm": 0.05698695778846741, + "learning_rate": 1.076556100634592e-05, + "loss": 0.1849, + "step": 46984 + }, + { + "epoch": 3.806302657161374, + "grad_norm": 0.06821150332689285, + "learning_rate": 1.0761060353751294e-05, + "loss": 0.2393, + "step": 46985 + }, + { + "epoch": 3.8063836681788725, + "grad_norm": 0.07215702533721924, + "learning_rate": 1.0756559701156668e-05, + "loss": 0.2485, + "step": 46986 + }, + { + "epoch": 3.8064646791963708, + "grad_norm": 0.08539849519729614, + "learning_rate": 1.0752059048562042e-05, + "loss": 0.2288, + "step": 46987 + }, + { + "epoch": 3.806545690213869, + "grad_norm": 0.08093149214982986, + "learning_rate": 1.0747558395967417e-05, + "loss": 0.2345, + "step": 46988 + }, + { + "epoch": 3.8066267012313677, + "grad_norm": 0.06490255147218704, + "learning_rate": 1.0743057743372789e-05, + "loss": 0.2143, + "step": 46989 + }, + { + "epoch": 3.806707712248866, + "grad_norm": 0.08042438328266144, + "learning_rate": 1.0738557090778164e-05, + "loss": 0.2459, + "step": 46990 + }, + { + "epoch": 3.806788723266364, + "grad_norm": 0.09989763796329498, + "learning_rate": 1.0734056438183538e-05, + "loss": 0.2358, + "step": 46991 + }, + { + "epoch": 3.806869734283863, + "grad_norm": 0.06737187504768372, + "learning_rate": 1.0729555785588911e-05, + "loss": 0.2002, + "step": 46992 + }, + { + "epoch": 3.806950745301361, + "grad_norm": 0.0732245147228241, + "learning_rate": 1.0725055132994285e-05, + "loss": 0.2308, + "step": 46993 + }, + { + "epoch": 3.8070317563188594, + "grad_norm": 0.08116856962442398, + "learning_rate": 1.0720554480399658e-05, + "loss": 0.2367, + "step": 46994 + }, + { + "epoch": 3.8071127673363576, + "grad_norm": 0.08011330664157867, + "learning_rate": 1.0716053827805032e-05, + "loss": 0.2478, + "step": 46995 + }, + { + "epoch": 3.807193778353856, + "grad_norm": 0.06650631129741669, + "learning_rate": 1.0711553175210406e-05, + "loss": 0.2343, + "step": 46996 + }, + { + "epoch": 3.8072747893713546, + "grad_norm": 0.075144462287426, + "learning_rate": 1.070705252261578e-05, + "loss": 0.2326, + "step": 46997 + }, + { + "epoch": 3.807355800388853, + "grad_norm": 0.05947421118617058, + "learning_rate": 1.0702551870021153e-05, + "loss": 0.1932, + "step": 46998 + }, + { + "epoch": 3.807436811406351, + "grad_norm": 0.08503783494234085, + "learning_rate": 1.0698051217426528e-05, + "loss": 0.2376, + "step": 46999 + }, + { + "epoch": 3.8075178224238497, + "grad_norm": 0.06805943697690964, + "learning_rate": 1.0693550564831902e-05, + "loss": 0.1997, + "step": 47000 + }, + { + "epoch": 3.807598833441348, + "grad_norm": 0.07300718128681183, + "learning_rate": 1.0689049912237275e-05, + "loss": 0.2294, + "step": 47001 + }, + { + "epoch": 3.807679844458846, + "grad_norm": 0.06472918391227722, + "learning_rate": 1.0684549259642649e-05, + "loss": 0.2222, + "step": 47002 + }, + { + "epoch": 3.807760855476345, + "grad_norm": 0.08821234107017517, + "learning_rate": 1.0680048607048023e-05, + "loss": 0.2618, + "step": 47003 + }, + { + "epoch": 3.807841866493843, + "grad_norm": 0.061438728123903275, + "learning_rate": 1.0675547954453396e-05, + "loss": 0.2295, + "step": 47004 + }, + { + "epoch": 3.8079228775113414, + "grad_norm": 0.0627550482749939, + "learning_rate": 1.067104730185877e-05, + "loss": 0.2232, + "step": 47005 + }, + { + "epoch": 3.80800388852884, + "grad_norm": 0.07976122945547104, + "learning_rate": 1.0666546649264143e-05, + "loss": 0.2091, + "step": 47006 + }, + { + "epoch": 3.8080848995463383, + "grad_norm": 0.06467067450284958, + "learning_rate": 1.0662045996669517e-05, + "loss": 0.2178, + "step": 47007 + }, + { + "epoch": 3.8081659105638366, + "grad_norm": 0.07180802524089813, + "learning_rate": 1.0657545344074892e-05, + "loss": 0.1963, + "step": 47008 + }, + { + "epoch": 3.8082469215813353, + "grad_norm": 0.06795401126146317, + "learning_rate": 1.0653044691480266e-05, + "loss": 0.2451, + "step": 47009 + }, + { + "epoch": 3.8083279325988335, + "grad_norm": 0.10496599227190018, + "learning_rate": 1.0648544038885638e-05, + "loss": 0.2441, + "step": 47010 + }, + { + "epoch": 3.8084089436163318, + "grad_norm": 0.0665077492594719, + "learning_rate": 1.0644043386291013e-05, + "loss": 0.2213, + "step": 47011 + }, + { + "epoch": 3.8084899546338304, + "grad_norm": 0.07035201042890549, + "learning_rate": 1.0639542733696387e-05, + "loss": 0.2269, + "step": 47012 + }, + { + "epoch": 3.8085709656513287, + "grad_norm": 0.07154831290245056, + "learning_rate": 1.063504208110176e-05, + "loss": 0.2242, + "step": 47013 + }, + { + "epoch": 3.808651976668827, + "grad_norm": 0.0710812658071518, + "learning_rate": 1.0630541428507134e-05, + "loss": 0.2438, + "step": 47014 + }, + { + "epoch": 3.808732987686325, + "grad_norm": 0.07493715733289719, + "learning_rate": 1.0626040775912508e-05, + "loss": 0.1937, + "step": 47015 + }, + { + "epoch": 3.808813998703824, + "grad_norm": 0.0741448625922203, + "learning_rate": 1.0621540123317881e-05, + "loss": 0.2626, + "step": 47016 + }, + { + "epoch": 3.808895009721322, + "grad_norm": 0.07966463267803192, + "learning_rate": 1.0617039470723256e-05, + "loss": 0.2063, + "step": 47017 + }, + { + "epoch": 3.8089760207388204, + "grad_norm": 0.0726398155093193, + "learning_rate": 1.061253881812863e-05, + "loss": 0.2499, + "step": 47018 + }, + { + "epoch": 3.8090570317563186, + "grad_norm": 0.07825640588998795, + "learning_rate": 1.0608038165534002e-05, + "loss": 0.2546, + "step": 47019 + }, + { + "epoch": 3.8091380427738173, + "grad_norm": 0.07793726027011871, + "learning_rate": 1.0603537512939377e-05, + "loss": 0.22, + "step": 47020 + }, + { + "epoch": 3.8092190537913155, + "grad_norm": 0.06841432303190231, + "learning_rate": 1.059903686034475e-05, + "loss": 0.186, + "step": 47021 + }, + { + "epoch": 3.809300064808814, + "grad_norm": 0.06918314844369888, + "learning_rate": 1.0594536207750124e-05, + "loss": 0.2519, + "step": 47022 + }, + { + "epoch": 3.8093810758263125, + "grad_norm": 0.07555932551622391, + "learning_rate": 1.0590035555155498e-05, + "loss": 0.2431, + "step": 47023 + }, + { + "epoch": 3.8094620868438107, + "grad_norm": 0.06252846866846085, + "learning_rate": 1.0585534902560872e-05, + "loss": 0.2124, + "step": 47024 + }, + { + "epoch": 3.809543097861309, + "grad_norm": 0.07834303379058838, + "learning_rate": 1.0581034249966245e-05, + "loss": 0.2625, + "step": 47025 + }, + { + "epoch": 3.8096241088788076, + "grad_norm": 0.08204419165849686, + "learning_rate": 1.057653359737162e-05, + "loss": 0.2187, + "step": 47026 + }, + { + "epoch": 3.809705119896306, + "grad_norm": 0.06510677188634872, + "learning_rate": 1.0572032944776992e-05, + "loss": 0.2681, + "step": 47027 + }, + { + "epoch": 3.809786130913804, + "grad_norm": 0.06332890689373016, + "learning_rate": 1.0567532292182366e-05, + "loss": 0.2501, + "step": 47028 + }, + { + "epoch": 3.809867141931303, + "grad_norm": 0.057162806391716, + "learning_rate": 1.0563031639587741e-05, + "loss": 0.2032, + "step": 47029 + }, + { + "epoch": 3.809948152948801, + "grad_norm": 0.06294887512922287, + "learning_rate": 1.0558530986993115e-05, + "loss": 0.2121, + "step": 47030 + }, + { + "epoch": 3.8100291639662993, + "grad_norm": 0.06234182044863701, + "learning_rate": 1.0554030334398489e-05, + "loss": 0.2268, + "step": 47031 + }, + { + "epoch": 3.810110174983798, + "grad_norm": 0.06718173623085022, + "learning_rate": 1.0549529681803862e-05, + "loss": 0.2252, + "step": 47032 + }, + { + "epoch": 3.8101911860012962, + "grad_norm": 0.07185987383127213, + "learning_rate": 1.0545029029209236e-05, + "loss": 0.24, + "step": 47033 + }, + { + "epoch": 3.8102721970187945, + "grad_norm": 0.07447812706232071, + "learning_rate": 1.054052837661461e-05, + "loss": 0.2539, + "step": 47034 + }, + { + "epoch": 3.810353208036293, + "grad_norm": 0.06620065122842789, + "learning_rate": 1.0536027724019985e-05, + "loss": 0.2141, + "step": 47035 + }, + { + "epoch": 3.8104342190537914, + "grad_norm": 0.07579522579908371, + "learning_rate": 1.0531527071425357e-05, + "loss": 0.227, + "step": 47036 + }, + { + "epoch": 3.8105152300712897, + "grad_norm": 0.07805155217647552, + "learning_rate": 1.052702641883073e-05, + "loss": 0.2272, + "step": 47037 + }, + { + "epoch": 3.810596241088788, + "grad_norm": 0.06929910182952881, + "learning_rate": 1.0522525766236105e-05, + "loss": 0.2378, + "step": 47038 + }, + { + "epoch": 3.8106772521062866, + "grad_norm": 0.0670161247253418, + "learning_rate": 1.0518025113641479e-05, + "loss": 0.1883, + "step": 47039 + }, + { + "epoch": 3.810758263123785, + "grad_norm": 0.06071063503623009, + "learning_rate": 1.0513524461046851e-05, + "loss": 0.2017, + "step": 47040 + }, + { + "epoch": 3.810839274141283, + "grad_norm": 0.07034777849912643, + "learning_rate": 1.0509023808452226e-05, + "loss": 0.2325, + "step": 47041 + }, + { + "epoch": 3.8109202851587813, + "grad_norm": 0.0668913945555687, + "learning_rate": 1.05045231558576e-05, + "loss": 0.2328, + "step": 47042 + }, + { + "epoch": 3.81100129617628, + "grad_norm": 0.07246463745832443, + "learning_rate": 1.0500022503262973e-05, + "loss": 0.2207, + "step": 47043 + }, + { + "epoch": 3.8110823071937783, + "grad_norm": 0.07312662899494171, + "learning_rate": 1.0495521850668349e-05, + "loss": 0.1934, + "step": 47044 + }, + { + "epoch": 3.8111633182112765, + "grad_norm": 0.08061020821332932, + "learning_rate": 1.049102119807372e-05, + "loss": 0.2648, + "step": 47045 + }, + { + "epoch": 3.811244329228775, + "grad_norm": 0.06351017206907272, + "learning_rate": 1.0486520545479094e-05, + "loss": 0.2229, + "step": 47046 + }, + { + "epoch": 3.8113253402462735, + "grad_norm": 0.06795711815357208, + "learning_rate": 1.048201989288447e-05, + "loss": 0.2055, + "step": 47047 + }, + { + "epoch": 3.8114063512637717, + "grad_norm": 0.06773922592401505, + "learning_rate": 1.0477519240289843e-05, + "loss": 0.2385, + "step": 47048 + }, + { + "epoch": 3.8114873622812704, + "grad_norm": 0.07266738265752792, + "learning_rate": 1.0473018587695215e-05, + "loss": 0.2618, + "step": 47049 + }, + { + "epoch": 3.8115683732987686, + "grad_norm": 0.08389297127723694, + "learning_rate": 1.046851793510059e-05, + "loss": 0.2324, + "step": 47050 + }, + { + "epoch": 3.811649384316267, + "grad_norm": 0.06552331149578094, + "learning_rate": 1.0464017282505964e-05, + "loss": 0.2226, + "step": 47051 + }, + { + "epoch": 3.8117303953337656, + "grad_norm": 0.07753307372331619, + "learning_rate": 1.0459516629911338e-05, + "loss": 0.2392, + "step": 47052 + }, + { + "epoch": 3.811811406351264, + "grad_norm": 0.07049240171909332, + "learning_rate": 1.0455015977316711e-05, + "loss": 0.1888, + "step": 47053 + }, + { + "epoch": 3.811892417368762, + "grad_norm": 0.07749372720718384, + "learning_rate": 1.0450515324722085e-05, + "loss": 0.2226, + "step": 47054 + }, + { + "epoch": 3.8119734283862607, + "grad_norm": 0.07106560468673706, + "learning_rate": 1.0446014672127458e-05, + "loss": 0.2248, + "step": 47055 + }, + { + "epoch": 3.812054439403759, + "grad_norm": 0.0729873925447464, + "learning_rate": 1.0441514019532834e-05, + "loss": 0.2439, + "step": 47056 + }, + { + "epoch": 3.8121354504212572, + "grad_norm": 0.06278953701257706, + "learning_rate": 1.0437013366938206e-05, + "loss": 0.1792, + "step": 47057 + }, + { + "epoch": 3.812216461438756, + "grad_norm": 0.08032325655221939, + "learning_rate": 1.043251271434358e-05, + "loss": 0.2306, + "step": 47058 + }, + { + "epoch": 3.812297472456254, + "grad_norm": 0.07312075048685074, + "learning_rate": 1.0428012061748955e-05, + "loss": 0.2361, + "step": 47059 + }, + { + "epoch": 3.8123784834737524, + "grad_norm": 0.09086312353610992, + "learning_rate": 1.0423511409154328e-05, + "loss": 0.2511, + "step": 47060 + }, + { + "epoch": 3.8124594944912507, + "grad_norm": 0.07702907919883728, + "learning_rate": 1.0419010756559702e-05, + "loss": 0.1923, + "step": 47061 + }, + { + "epoch": 3.8125405055087493, + "grad_norm": 0.09750400483608246, + "learning_rate": 1.0414510103965075e-05, + "loss": 0.2327, + "step": 47062 + }, + { + "epoch": 3.8126215165262476, + "grad_norm": 0.0724315345287323, + "learning_rate": 1.0410009451370449e-05, + "loss": 0.2207, + "step": 47063 + }, + { + "epoch": 3.812702527543746, + "grad_norm": 0.06693287193775177, + "learning_rate": 1.0405508798775823e-05, + "loss": 0.2127, + "step": 47064 + }, + { + "epoch": 3.812783538561244, + "grad_norm": 0.07102993130683899, + "learning_rate": 1.0401008146181198e-05, + "loss": 0.2034, + "step": 47065 + }, + { + "epoch": 3.8128645495787428, + "grad_norm": 0.07117815315723419, + "learning_rate": 1.039650749358657e-05, + "loss": 0.2773, + "step": 47066 + }, + { + "epoch": 3.812945560596241, + "grad_norm": 0.06364418566226959, + "learning_rate": 1.0392006840991943e-05, + "loss": 0.243, + "step": 47067 + }, + { + "epoch": 3.8130265716137393, + "grad_norm": 0.10085894912481308, + "learning_rate": 1.0387506188397319e-05, + "loss": 0.2299, + "step": 47068 + }, + { + "epoch": 3.813107582631238, + "grad_norm": 0.06981120258569717, + "learning_rate": 1.0383005535802692e-05, + "loss": 0.1809, + "step": 47069 + }, + { + "epoch": 3.813188593648736, + "grad_norm": 0.06516273319721222, + "learning_rate": 1.0378504883208064e-05, + "loss": 0.1991, + "step": 47070 + }, + { + "epoch": 3.8132696046662344, + "grad_norm": 0.08382715284824371, + "learning_rate": 1.037400423061344e-05, + "loss": 0.2271, + "step": 47071 + }, + { + "epoch": 3.813350615683733, + "grad_norm": 0.08625270426273346, + "learning_rate": 1.0369503578018813e-05, + "loss": 0.2135, + "step": 47072 + }, + { + "epoch": 3.8134316267012314, + "grad_norm": 0.07801330834627151, + "learning_rate": 1.0365002925424187e-05, + "loss": 0.2405, + "step": 47073 + }, + { + "epoch": 3.8135126377187296, + "grad_norm": 0.08635073155164719, + "learning_rate": 1.0360502272829562e-05, + "loss": 0.2164, + "step": 47074 + }, + { + "epoch": 3.8135936487362283, + "grad_norm": 0.07388366758823395, + "learning_rate": 1.0356001620234934e-05, + "loss": 0.2158, + "step": 47075 + }, + { + "epoch": 3.8136746597537265, + "grad_norm": 0.06860913336277008, + "learning_rate": 1.0351500967640307e-05, + "loss": 0.2161, + "step": 47076 + }, + { + "epoch": 3.813755670771225, + "grad_norm": 0.07081514596939087, + "learning_rate": 1.0347000315045683e-05, + "loss": 0.2519, + "step": 47077 + }, + { + "epoch": 3.8138366817887235, + "grad_norm": 0.07424487173557281, + "learning_rate": 1.0342499662451056e-05, + "loss": 0.2206, + "step": 47078 + }, + { + "epoch": 3.8139176928062217, + "grad_norm": 0.07972376048564911, + "learning_rate": 1.033799900985643e-05, + "loss": 0.2378, + "step": 47079 + }, + { + "epoch": 3.81399870382372, + "grad_norm": 0.07115171104669571, + "learning_rate": 1.0333498357261804e-05, + "loss": 0.2134, + "step": 47080 + }, + { + "epoch": 3.8140797148412187, + "grad_norm": 0.07627512514591217, + "learning_rate": 1.0328997704667177e-05, + "loss": 0.19, + "step": 47081 + }, + { + "epoch": 3.814160725858717, + "grad_norm": 0.07725601643323898, + "learning_rate": 1.032449705207255e-05, + "loss": 0.2713, + "step": 47082 + }, + { + "epoch": 3.814241736876215, + "grad_norm": 0.07265699654817581, + "learning_rate": 1.0319996399477924e-05, + "loss": 0.2425, + "step": 47083 + }, + { + "epoch": 3.8143227478937134, + "grad_norm": 0.07284402847290039, + "learning_rate": 1.0315495746883298e-05, + "loss": 0.2464, + "step": 47084 + }, + { + "epoch": 3.814403758911212, + "grad_norm": 0.07427023351192474, + "learning_rate": 1.0310995094288672e-05, + "loss": 0.2099, + "step": 47085 + }, + { + "epoch": 3.8144847699287103, + "grad_norm": 0.05913237854838371, + "learning_rate": 1.0306494441694047e-05, + "loss": 0.1989, + "step": 47086 + }, + { + "epoch": 3.8145657809462086, + "grad_norm": 0.07824961096048355, + "learning_rate": 1.030199378909942e-05, + "loss": 0.2294, + "step": 47087 + }, + { + "epoch": 3.814646791963707, + "grad_norm": 0.06919219344854355, + "learning_rate": 1.0297493136504794e-05, + "loss": 0.2452, + "step": 47088 + }, + { + "epoch": 3.8147278029812055, + "grad_norm": 0.09075573086738586, + "learning_rate": 1.0292992483910168e-05, + "loss": 0.2306, + "step": 47089 + }, + { + "epoch": 3.8148088139987038, + "grad_norm": 0.07499659061431885, + "learning_rate": 1.0288491831315541e-05, + "loss": 0.1971, + "step": 47090 + }, + { + "epoch": 3.814889825016202, + "grad_norm": 0.06980670988559723, + "learning_rate": 1.0283991178720915e-05, + "loss": 0.2373, + "step": 47091 + }, + { + "epoch": 3.8149708360337007, + "grad_norm": 0.06824301928281784, + "learning_rate": 1.0279490526126289e-05, + "loss": 0.2478, + "step": 47092 + }, + { + "epoch": 3.815051847051199, + "grad_norm": 0.06680431962013245, + "learning_rate": 1.0274989873531662e-05, + "loss": 0.2322, + "step": 47093 + }, + { + "epoch": 3.815132858068697, + "grad_norm": 0.06365301460027695, + "learning_rate": 1.0270489220937036e-05, + "loss": 0.2449, + "step": 47094 + }, + { + "epoch": 3.815213869086196, + "grad_norm": 0.07429978251457214, + "learning_rate": 1.0265988568342411e-05, + "loss": 0.2178, + "step": 47095 + }, + { + "epoch": 3.815294880103694, + "grad_norm": 0.07181497663259506, + "learning_rate": 1.0261487915747783e-05, + "loss": 0.2281, + "step": 47096 + }, + { + "epoch": 3.8153758911211924, + "grad_norm": 0.06791014224290848, + "learning_rate": 1.0256987263153158e-05, + "loss": 0.2259, + "step": 47097 + }, + { + "epoch": 3.815456902138691, + "grad_norm": 0.08312923461198807, + "learning_rate": 1.0252486610558532e-05, + "loss": 0.2231, + "step": 47098 + }, + { + "epoch": 3.8155379131561893, + "grad_norm": 0.07397427409887314, + "learning_rate": 1.0247985957963905e-05, + "loss": 0.2101, + "step": 47099 + }, + { + "epoch": 3.8156189241736875, + "grad_norm": 0.08106406778097153, + "learning_rate": 1.0243485305369279e-05, + "loss": 0.2218, + "step": 47100 + }, + { + "epoch": 3.815699935191186, + "grad_norm": 0.07498326152563095, + "learning_rate": 1.0238984652774653e-05, + "loss": 0.2559, + "step": 47101 + }, + { + "epoch": 3.8157809462086845, + "grad_norm": 0.07188771665096283, + "learning_rate": 1.0234484000180026e-05, + "loss": 0.2475, + "step": 47102 + }, + { + "epoch": 3.8158619572261827, + "grad_norm": 0.07121875882148743, + "learning_rate": 1.02299833475854e-05, + "loss": 0.238, + "step": 47103 + }, + { + "epoch": 3.8159429682436814, + "grad_norm": 0.067873515188694, + "learning_rate": 1.0225482694990775e-05, + "loss": 0.1926, + "step": 47104 + }, + { + "epoch": 3.8160239792611796, + "grad_norm": 0.07478199154138565, + "learning_rate": 1.0220982042396147e-05, + "loss": 0.245, + "step": 47105 + }, + { + "epoch": 3.816104990278678, + "grad_norm": 0.06381670385599136, + "learning_rate": 1.0216481389801522e-05, + "loss": 0.265, + "step": 47106 + }, + { + "epoch": 3.816186001296176, + "grad_norm": 0.07955259829759598, + "learning_rate": 1.0211980737206896e-05, + "loss": 0.2075, + "step": 47107 + }, + { + "epoch": 3.816267012313675, + "grad_norm": 0.09649759531021118, + "learning_rate": 1.020748008461227e-05, + "loss": 0.236, + "step": 47108 + }, + { + "epoch": 3.816348023331173, + "grad_norm": 0.06495945900678635, + "learning_rate": 1.0202979432017643e-05, + "loss": 0.2247, + "step": 47109 + }, + { + "epoch": 3.8164290343486713, + "grad_norm": 0.07418833673000336, + "learning_rate": 1.0198478779423017e-05, + "loss": 0.2183, + "step": 47110 + }, + { + "epoch": 3.8165100453661696, + "grad_norm": 0.0698823407292366, + "learning_rate": 1.019397812682839e-05, + "loss": 0.1966, + "step": 47111 + }, + { + "epoch": 3.8165910563836682, + "grad_norm": 0.08725552260875702, + "learning_rate": 1.0189477474233764e-05, + "loss": 0.2079, + "step": 47112 + }, + { + "epoch": 3.8166720674011665, + "grad_norm": 0.060063328593969345, + "learning_rate": 1.0184976821639138e-05, + "loss": 0.1972, + "step": 47113 + }, + { + "epoch": 3.8167530784186647, + "grad_norm": 0.06648138910531998, + "learning_rate": 1.0180476169044511e-05, + "loss": 0.2085, + "step": 47114 + }, + { + "epoch": 3.8168340894361634, + "grad_norm": 0.06786450743675232, + "learning_rate": 1.0175975516449886e-05, + "loss": 0.1979, + "step": 47115 + }, + { + "epoch": 3.8169151004536617, + "grad_norm": 0.06979463994503021, + "learning_rate": 1.017147486385526e-05, + "loss": 0.2417, + "step": 47116 + }, + { + "epoch": 3.81699611147116, + "grad_norm": 0.08767568320035934, + "learning_rate": 1.0166974211260634e-05, + "loss": 0.2051, + "step": 47117 + }, + { + "epoch": 3.8170771224886586, + "grad_norm": 0.061907850205898285, + "learning_rate": 1.0162473558666007e-05, + "loss": 0.2234, + "step": 47118 + }, + { + "epoch": 3.817158133506157, + "grad_norm": 0.06363208591938019, + "learning_rate": 1.0157972906071381e-05, + "loss": 0.1849, + "step": 47119 + }, + { + "epoch": 3.817239144523655, + "grad_norm": 0.07457547634840012, + "learning_rate": 1.0153472253476755e-05, + "loss": 0.2401, + "step": 47120 + }, + { + "epoch": 3.817320155541154, + "grad_norm": 0.07944779843091965, + "learning_rate": 1.0148971600882128e-05, + "loss": 0.2281, + "step": 47121 + }, + { + "epoch": 3.817401166558652, + "grad_norm": 0.07503432035446167, + "learning_rate": 1.0144470948287502e-05, + "loss": 0.2235, + "step": 47122 + }, + { + "epoch": 3.8174821775761503, + "grad_norm": 0.07481568306684494, + "learning_rate": 1.0139970295692875e-05, + "loss": 0.2295, + "step": 47123 + }, + { + "epoch": 3.817563188593649, + "grad_norm": 0.07879388332366943, + "learning_rate": 1.013546964309825e-05, + "loss": 0.2554, + "step": 47124 + }, + { + "epoch": 3.817644199611147, + "grad_norm": 0.0720246359705925, + "learning_rate": 1.0130968990503624e-05, + "loss": 0.1888, + "step": 47125 + }, + { + "epoch": 3.8177252106286454, + "grad_norm": 0.08811241388320923, + "learning_rate": 1.0126468337908996e-05, + "loss": 0.2323, + "step": 47126 + }, + { + "epoch": 3.817806221646144, + "grad_norm": 0.06210675463080406, + "learning_rate": 1.0121967685314371e-05, + "loss": 0.2095, + "step": 47127 + }, + { + "epoch": 3.8178872326636424, + "grad_norm": 0.0800977572798729, + "learning_rate": 1.0117467032719745e-05, + "loss": 0.2031, + "step": 47128 + }, + { + "epoch": 3.8179682436811406, + "grad_norm": 0.06897516548633575, + "learning_rate": 1.0112966380125119e-05, + "loss": 0.2186, + "step": 47129 + }, + { + "epoch": 3.818049254698639, + "grad_norm": 0.06213768571615219, + "learning_rate": 1.0108465727530494e-05, + "loss": 0.229, + "step": 47130 + }, + { + "epoch": 3.818130265716137, + "grad_norm": 0.07513062655925751, + "learning_rate": 1.0103965074935866e-05, + "loss": 0.2606, + "step": 47131 + }, + { + "epoch": 3.818211276733636, + "grad_norm": 0.0878511592745781, + "learning_rate": 1.009946442234124e-05, + "loss": 0.2577, + "step": 47132 + }, + { + "epoch": 3.818292287751134, + "grad_norm": 0.10174962133169174, + "learning_rate": 1.0094963769746615e-05, + "loss": 0.2628, + "step": 47133 + }, + { + "epoch": 3.8183732987686323, + "grad_norm": 0.08247354626655579, + "learning_rate": 1.0090463117151988e-05, + "loss": 0.223, + "step": 47134 + }, + { + "epoch": 3.818454309786131, + "grad_norm": 0.10168582201004028, + "learning_rate": 1.008596246455736e-05, + "loss": 0.2132, + "step": 47135 + }, + { + "epoch": 3.8185353208036292, + "grad_norm": 0.06533244997262955, + "learning_rate": 1.0081461811962736e-05, + "loss": 0.2634, + "step": 47136 + }, + { + "epoch": 3.8186163318211275, + "grad_norm": 0.05962993949651718, + "learning_rate": 1.0076961159368109e-05, + "loss": 0.2111, + "step": 47137 + }, + { + "epoch": 3.818697342838626, + "grad_norm": 0.06415306776762009, + "learning_rate": 1.0072460506773483e-05, + "loss": 0.2213, + "step": 47138 + }, + { + "epoch": 3.8187783538561244, + "grad_norm": 0.0562756322324276, + "learning_rate": 1.0067959854178856e-05, + "loss": 0.19, + "step": 47139 + }, + { + "epoch": 3.8188593648736227, + "grad_norm": 0.06657944619655609, + "learning_rate": 1.006345920158423e-05, + "loss": 0.2282, + "step": 47140 + }, + { + "epoch": 3.8189403758911213, + "grad_norm": 0.06836859881877899, + "learning_rate": 1.0058958548989604e-05, + "loss": 0.2284, + "step": 47141 + }, + { + "epoch": 3.8190213869086196, + "grad_norm": 0.07675183564424515, + "learning_rate": 1.0054457896394979e-05, + "loss": 0.2459, + "step": 47142 + }, + { + "epoch": 3.819102397926118, + "grad_norm": 0.08422303944826126, + "learning_rate": 1.004995724380035e-05, + "loss": 0.2848, + "step": 47143 + }, + { + "epoch": 3.8191834089436165, + "grad_norm": 0.07882612198591232, + "learning_rate": 1.0045456591205724e-05, + "loss": 0.2573, + "step": 47144 + }, + { + "epoch": 3.8192644199611148, + "grad_norm": 0.058982573449611664, + "learning_rate": 1.00409559386111e-05, + "loss": 0.2202, + "step": 47145 + }, + { + "epoch": 3.819345430978613, + "grad_norm": 0.07594690471887589, + "learning_rate": 1.0036455286016473e-05, + "loss": 0.2612, + "step": 47146 + }, + { + "epoch": 3.8194264419961117, + "grad_norm": 0.06958582997322083, + "learning_rate": 1.0031954633421847e-05, + "loss": 0.1971, + "step": 47147 + }, + { + "epoch": 3.81950745301361, + "grad_norm": 0.09242573380470276, + "learning_rate": 1.002745398082722e-05, + "loss": 0.2689, + "step": 47148 + }, + { + "epoch": 3.819588464031108, + "grad_norm": 0.08315322548151016, + "learning_rate": 1.0022953328232594e-05, + "loss": 0.2526, + "step": 47149 + }, + { + "epoch": 3.819669475048607, + "grad_norm": 0.06824847310781479, + "learning_rate": 1.0018452675637968e-05, + "loss": 0.2435, + "step": 47150 + }, + { + "epoch": 3.819750486066105, + "grad_norm": 0.07796567678451538, + "learning_rate": 1.0013952023043343e-05, + "loss": 0.221, + "step": 47151 + }, + { + "epoch": 3.8198314970836034, + "grad_norm": 0.07060608267784119, + "learning_rate": 1.0009451370448715e-05, + "loss": 0.2133, + "step": 47152 + }, + { + "epoch": 3.8199125081011016, + "grad_norm": 0.07457584887742996, + "learning_rate": 1.0004950717854088e-05, + "loss": 0.2156, + "step": 47153 + }, + { + "epoch": 3.8199935191186, + "grad_norm": 0.09381989389657974, + "learning_rate": 1.0000450065259464e-05, + "loss": 0.2183, + "step": 47154 + }, + { + "epoch": 3.8200745301360985, + "grad_norm": 0.0633532926440239, + "learning_rate": 9.995949412664837e-06, + "loss": 0.2166, + "step": 47155 + }, + { + "epoch": 3.820155541153597, + "grad_norm": 0.06335076689720154, + "learning_rate": 9.99144876007021e-06, + "loss": 0.1908, + "step": 47156 + }, + { + "epoch": 3.820236552171095, + "grad_norm": 0.0725879818201065, + "learning_rate": 9.986948107475585e-06, + "loss": 0.2631, + "step": 47157 + }, + { + "epoch": 3.8203175631885937, + "grad_norm": 0.07099777460098267, + "learning_rate": 9.982447454880958e-06, + "loss": 0.2009, + "step": 47158 + }, + { + "epoch": 3.820398574206092, + "grad_norm": 0.08141002804040909, + "learning_rate": 9.977946802286332e-06, + "loss": 0.2486, + "step": 47159 + }, + { + "epoch": 3.82047958522359, + "grad_norm": 0.06455133110284805, + "learning_rate": 9.973446149691707e-06, + "loss": 0.2197, + "step": 47160 + }, + { + "epoch": 3.820560596241089, + "grad_norm": 0.08412550389766693, + "learning_rate": 9.968945497097079e-06, + "loss": 0.2283, + "step": 47161 + }, + { + "epoch": 3.820641607258587, + "grad_norm": 0.08144482970237732, + "learning_rate": 9.964444844502453e-06, + "loss": 0.198, + "step": 47162 + }, + { + "epoch": 3.8207226182760854, + "grad_norm": 0.06888513267040253, + "learning_rate": 9.959944191907828e-06, + "loss": 0.2045, + "step": 47163 + }, + { + "epoch": 3.820803629293584, + "grad_norm": 0.06686069816350937, + "learning_rate": 9.955443539313202e-06, + "loss": 0.1933, + "step": 47164 + }, + { + "epoch": 3.8208846403110823, + "grad_norm": 0.1000877320766449, + "learning_rate": 9.950942886718573e-06, + "loss": 0.2362, + "step": 47165 + }, + { + "epoch": 3.8209656513285806, + "grad_norm": 0.07413871586322784, + "learning_rate": 9.946442234123949e-06, + "loss": 0.2607, + "step": 47166 + }, + { + "epoch": 3.8210466623460793, + "grad_norm": 0.07616769522428513, + "learning_rate": 9.941941581529322e-06, + "loss": 0.2294, + "step": 47167 + }, + { + "epoch": 3.8211276733635775, + "grad_norm": 0.08212704211473465, + "learning_rate": 9.937440928934696e-06, + "loss": 0.208, + "step": 47168 + }, + { + "epoch": 3.8212086843810757, + "grad_norm": 0.06919477134943008, + "learning_rate": 9.93294027634007e-06, + "loss": 0.2393, + "step": 47169 + }, + { + "epoch": 3.8212896953985744, + "grad_norm": 0.07358191162347794, + "learning_rate": 9.928439623745443e-06, + "loss": 0.2176, + "step": 47170 + }, + { + "epoch": 3.8213707064160727, + "grad_norm": 0.0823577418923378, + "learning_rate": 9.923938971150817e-06, + "loss": 0.2783, + "step": 47171 + }, + { + "epoch": 3.821451717433571, + "grad_norm": 0.06688446551561356, + "learning_rate": 9.919438318556192e-06, + "loss": 0.2079, + "step": 47172 + }, + { + "epoch": 3.8215327284510696, + "grad_norm": 0.0666053518652916, + "learning_rate": 9.914937665961566e-06, + "loss": 0.2031, + "step": 47173 + }, + { + "epoch": 3.821613739468568, + "grad_norm": 0.08584258705377579, + "learning_rate": 9.910437013366938e-06, + "loss": 0.2129, + "step": 47174 + }, + { + "epoch": 3.821694750486066, + "grad_norm": 0.07315590977668762, + "learning_rate": 9.905936360772313e-06, + "loss": 0.2045, + "step": 47175 + }, + { + "epoch": 3.8217757615035644, + "grad_norm": 0.05562111362814903, + "learning_rate": 9.901435708177686e-06, + "loss": 0.181, + "step": 47176 + }, + { + "epoch": 3.8218567725210626, + "grad_norm": 0.07396908849477768, + "learning_rate": 9.89693505558306e-06, + "loss": 0.2028, + "step": 47177 + }, + { + "epoch": 3.8219377835385613, + "grad_norm": 0.0769689679145813, + "learning_rate": 9.892434402988434e-06, + "loss": 0.2081, + "step": 47178 + }, + { + "epoch": 3.8220187945560595, + "grad_norm": 0.09181632101535797, + "learning_rate": 9.887933750393807e-06, + "loss": 0.2465, + "step": 47179 + }, + { + "epoch": 3.8220998055735578, + "grad_norm": 0.07666897028684616, + "learning_rate": 9.883433097799181e-06, + "loss": 0.2228, + "step": 47180 + }, + { + "epoch": 3.8221808165910565, + "grad_norm": 0.06831949949264526, + "learning_rate": 9.878932445204556e-06, + "loss": 0.2263, + "step": 47181 + }, + { + "epoch": 3.8222618276085547, + "grad_norm": 0.06521642953157425, + "learning_rate": 9.874431792609928e-06, + "loss": 0.2294, + "step": 47182 + }, + { + "epoch": 3.822342838626053, + "grad_norm": 0.07536672800779343, + "learning_rate": 9.869931140015302e-06, + "loss": 0.2143, + "step": 47183 + }, + { + "epoch": 3.8224238496435516, + "grad_norm": 0.08660756051540375, + "learning_rate": 9.865430487420677e-06, + "loss": 0.2777, + "step": 47184 + }, + { + "epoch": 3.82250486066105, + "grad_norm": 0.0728485956788063, + "learning_rate": 9.86092983482605e-06, + "loss": 0.2351, + "step": 47185 + }, + { + "epoch": 3.822585871678548, + "grad_norm": 0.06775187700986862, + "learning_rate": 9.856429182231424e-06, + "loss": 0.2358, + "step": 47186 + }, + { + "epoch": 3.822666882696047, + "grad_norm": 0.07535045593976974, + "learning_rate": 9.851928529636798e-06, + "loss": 0.2201, + "step": 47187 + }, + { + "epoch": 3.822747893713545, + "grad_norm": 0.07898474484682083, + "learning_rate": 9.847427877042171e-06, + "loss": 0.2667, + "step": 47188 + }, + { + "epoch": 3.8228289047310433, + "grad_norm": 0.08030825108289719, + "learning_rate": 9.842927224447545e-06, + "loss": 0.2676, + "step": 47189 + }, + { + "epoch": 3.822909915748542, + "grad_norm": 0.07572636753320694, + "learning_rate": 9.83842657185292e-06, + "loss": 0.2874, + "step": 47190 + }, + { + "epoch": 3.8229909267660402, + "grad_norm": 0.07902305573225021, + "learning_rate": 9.833925919258292e-06, + "loss": 0.2218, + "step": 47191 + }, + { + "epoch": 3.8230719377835385, + "grad_norm": 0.07862576842308044, + "learning_rate": 9.829425266663666e-06, + "loss": 0.2515, + "step": 47192 + }, + { + "epoch": 3.823152948801037, + "grad_norm": 0.07469526678323746, + "learning_rate": 9.824924614069041e-06, + "loss": 0.221, + "step": 47193 + }, + { + "epoch": 3.8232339598185354, + "grad_norm": 0.07735791057348251, + "learning_rate": 9.820423961474415e-06, + "loss": 0.2396, + "step": 47194 + }, + { + "epoch": 3.8233149708360337, + "grad_norm": 0.06498521566390991, + "learning_rate": 9.815923308879788e-06, + "loss": 0.2311, + "step": 47195 + }, + { + "epoch": 3.8233959818535324, + "grad_norm": 0.07805728167295456, + "learning_rate": 9.811422656285162e-06, + "loss": 0.2454, + "step": 47196 + }, + { + "epoch": 3.8234769928710306, + "grad_norm": 0.065180204808712, + "learning_rate": 9.806922003690536e-06, + "loss": 0.2308, + "step": 47197 + }, + { + "epoch": 3.823558003888529, + "grad_norm": 0.07017407566308975, + "learning_rate": 9.802421351095909e-06, + "loss": 0.2326, + "step": 47198 + }, + { + "epoch": 3.823639014906027, + "grad_norm": 0.08247072994709015, + "learning_rate": 9.797920698501283e-06, + "loss": 0.23, + "step": 47199 + }, + { + "epoch": 3.8237200259235253, + "grad_norm": 0.06262676417827606, + "learning_rate": 9.793420045906656e-06, + "loss": 0.2121, + "step": 47200 + }, + { + "epoch": 3.823801036941024, + "grad_norm": 0.10090301930904388, + "learning_rate": 9.78891939331203e-06, + "loss": 0.2373, + "step": 47201 + }, + { + "epoch": 3.8238820479585223, + "grad_norm": 0.08574521541595459, + "learning_rate": 9.784418740717405e-06, + "loss": 0.2366, + "step": 47202 + }, + { + "epoch": 3.8239630589760205, + "grad_norm": 0.0797303095459938, + "learning_rate": 9.779918088122779e-06, + "loss": 0.2047, + "step": 47203 + }, + { + "epoch": 3.824044069993519, + "grad_norm": 0.07990000396966934, + "learning_rate": 9.775417435528152e-06, + "loss": 0.194, + "step": 47204 + }, + { + "epoch": 3.8241250810110174, + "grad_norm": 0.06872112303972244, + "learning_rate": 9.770916782933526e-06, + "loss": 0.1983, + "step": 47205 + }, + { + "epoch": 3.8242060920285157, + "grad_norm": 0.06418676674365997, + "learning_rate": 9.7664161303389e-06, + "loss": 0.2198, + "step": 47206 + }, + { + "epoch": 3.8242871030460144, + "grad_norm": 0.0675126165151596, + "learning_rate": 9.761915477744273e-06, + "loss": 0.191, + "step": 47207 + }, + { + "epoch": 3.8243681140635126, + "grad_norm": 0.0721009373664856, + "learning_rate": 9.757414825149647e-06, + "loss": 0.2582, + "step": 47208 + }, + { + "epoch": 3.824449125081011, + "grad_norm": 0.05475969240069389, + "learning_rate": 9.75291417255502e-06, + "loss": 0.1922, + "step": 47209 + }, + { + "epoch": 3.8245301360985096, + "grad_norm": 0.07951171696186066, + "learning_rate": 9.748413519960394e-06, + "loss": 0.2298, + "step": 47210 + }, + { + "epoch": 3.824611147116008, + "grad_norm": 0.059227924793958664, + "learning_rate": 9.74391286736577e-06, + "loss": 0.2276, + "step": 47211 + }, + { + "epoch": 3.824692158133506, + "grad_norm": 0.06179160252213478, + "learning_rate": 9.739412214771141e-06, + "loss": 0.2209, + "step": 47212 + }, + { + "epoch": 3.8247731691510047, + "grad_norm": 0.06057712808251381, + "learning_rate": 9.734911562176517e-06, + "loss": 0.1778, + "step": 47213 + }, + { + "epoch": 3.824854180168503, + "grad_norm": 0.06524763256311417, + "learning_rate": 9.73041090958189e-06, + "loss": 0.2312, + "step": 47214 + }, + { + "epoch": 3.8249351911860012, + "grad_norm": 0.06625210493803024, + "learning_rate": 9.725910256987264e-06, + "loss": 0.2464, + "step": 47215 + }, + { + "epoch": 3.8250162022035, + "grad_norm": 0.05672033131122589, + "learning_rate": 9.721409604392637e-06, + "loss": 0.2107, + "step": 47216 + }, + { + "epoch": 3.825097213220998, + "grad_norm": 0.08716113865375519, + "learning_rate": 9.716908951798011e-06, + "loss": 0.2429, + "step": 47217 + }, + { + "epoch": 3.8251782242384964, + "grad_norm": 0.09075607359409332, + "learning_rate": 9.712408299203385e-06, + "loss": 0.2329, + "step": 47218 + }, + { + "epoch": 3.8252592352559946, + "grad_norm": 0.08668406307697296, + "learning_rate": 9.707907646608758e-06, + "loss": 0.2409, + "step": 47219 + }, + { + "epoch": 3.8253402462734933, + "grad_norm": 0.07025102525949478, + "learning_rate": 9.703406994014133e-06, + "loss": 0.1961, + "step": 47220 + }, + { + "epoch": 3.8254212572909916, + "grad_norm": 0.05928102135658264, + "learning_rate": 9.698906341419505e-06, + "loss": 0.2193, + "step": 47221 + }, + { + "epoch": 3.82550226830849, + "grad_norm": 0.0666092187166214, + "learning_rate": 9.69440568882488e-06, + "loss": 0.2267, + "step": 47222 + }, + { + "epoch": 3.825583279325988, + "grad_norm": 0.07980141043663025, + "learning_rate": 9.689905036230254e-06, + "loss": 0.232, + "step": 47223 + }, + { + "epoch": 3.8256642903434868, + "grad_norm": 0.09296827018260956, + "learning_rate": 9.685404383635628e-06, + "loss": 0.2557, + "step": 47224 + }, + { + "epoch": 3.825745301360985, + "grad_norm": 0.08211761713027954, + "learning_rate": 9.680903731041001e-06, + "loss": 0.2107, + "step": 47225 + }, + { + "epoch": 3.8258263123784833, + "grad_norm": 0.0850875973701477, + "learning_rate": 9.676403078446375e-06, + "loss": 0.2087, + "step": 47226 + }, + { + "epoch": 3.825907323395982, + "grad_norm": 0.0714445635676384, + "learning_rate": 9.671902425851749e-06, + "loss": 0.2302, + "step": 47227 + }, + { + "epoch": 3.82598833441348, + "grad_norm": 0.08113661408424377, + "learning_rate": 9.667401773257124e-06, + "loss": 0.2145, + "step": 47228 + }, + { + "epoch": 3.8260693454309784, + "grad_norm": 0.0769774541258812, + "learning_rate": 9.662901120662496e-06, + "loss": 0.2095, + "step": 47229 + }, + { + "epoch": 3.826150356448477, + "grad_norm": 0.0717720314860344, + "learning_rate": 9.65840046806787e-06, + "loss": 0.2377, + "step": 47230 + }, + { + "epoch": 3.8262313674659754, + "grad_norm": 0.06568815559148788, + "learning_rate": 9.653899815473245e-06, + "loss": 0.2288, + "step": 47231 + }, + { + "epoch": 3.8263123784834736, + "grad_norm": 0.0737098678946495, + "learning_rate": 9.649399162878618e-06, + "loss": 0.2059, + "step": 47232 + }, + { + "epoch": 3.8263933895009723, + "grad_norm": 0.06918147206306458, + "learning_rate": 9.644898510283992e-06, + "loss": 0.2238, + "step": 47233 + }, + { + "epoch": 3.8264744005184705, + "grad_norm": 0.0679457038640976, + "learning_rate": 9.640397857689366e-06, + "loss": 0.2414, + "step": 47234 + }, + { + "epoch": 3.826555411535969, + "grad_norm": 0.07485439628362656, + "learning_rate": 9.63589720509474e-06, + "loss": 0.1962, + "step": 47235 + }, + { + "epoch": 3.8266364225534675, + "grad_norm": 0.07543614506721497, + "learning_rate": 9.631396552500113e-06, + "loss": 0.2554, + "step": 47236 + }, + { + "epoch": 3.8267174335709657, + "grad_norm": 0.07579874992370605, + "learning_rate": 9.626895899905488e-06, + "loss": 0.1838, + "step": 47237 + }, + { + "epoch": 3.826798444588464, + "grad_norm": 0.06702306121587753, + "learning_rate": 9.62239524731086e-06, + "loss": 0.246, + "step": 47238 + }, + { + "epoch": 3.8268794556059627, + "grad_norm": 0.06871180236339569, + "learning_rate": 9.617894594716234e-06, + "loss": 0.232, + "step": 47239 + }, + { + "epoch": 3.826960466623461, + "grad_norm": 0.0695720836520195, + "learning_rate": 9.613393942121609e-06, + "loss": 0.2181, + "step": 47240 + }, + { + "epoch": 3.827041477640959, + "grad_norm": 0.07191573828458786, + "learning_rate": 9.608893289526983e-06, + "loss": 0.2541, + "step": 47241 + }, + { + "epoch": 3.8271224886584574, + "grad_norm": 0.07103823870420456, + "learning_rate": 9.604392636932354e-06, + "loss": 0.2188, + "step": 47242 + }, + { + "epoch": 3.827203499675956, + "grad_norm": 0.057614583522081375, + "learning_rate": 9.59989198433773e-06, + "loss": 0.247, + "step": 47243 + }, + { + "epoch": 3.8272845106934543, + "grad_norm": 0.08780107647180557, + "learning_rate": 9.595391331743103e-06, + "loss": 0.2499, + "step": 47244 + }, + { + "epoch": 3.8273655217109526, + "grad_norm": 0.07042431086301804, + "learning_rate": 9.590890679148477e-06, + "loss": 0.2408, + "step": 47245 + }, + { + "epoch": 3.827446532728451, + "grad_norm": 0.09787672013044357, + "learning_rate": 9.586390026553852e-06, + "loss": 0.233, + "step": 47246 + }, + { + "epoch": 3.8275275437459495, + "grad_norm": 0.05550922080874443, + "learning_rate": 9.581889373959224e-06, + "loss": 0.254, + "step": 47247 + }, + { + "epoch": 3.8276085547634477, + "grad_norm": 0.06615682691335678, + "learning_rate": 9.577388721364598e-06, + "loss": 0.2103, + "step": 47248 + }, + { + "epoch": 3.827689565780946, + "grad_norm": 0.0725463330745697, + "learning_rate": 9.572888068769973e-06, + "loss": 0.2129, + "step": 47249 + }, + { + "epoch": 3.8277705767984447, + "grad_norm": 0.08158998191356659, + "learning_rate": 9.568387416175347e-06, + "loss": 0.2423, + "step": 47250 + }, + { + "epoch": 3.827851587815943, + "grad_norm": 0.07956718653440475, + "learning_rate": 9.563886763580719e-06, + "loss": 0.2078, + "step": 47251 + }, + { + "epoch": 3.827932598833441, + "grad_norm": 0.07327364385128021, + "learning_rate": 9.559386110986094e-06, + "loss": 0.2675, + "step": 47252 + }, + { + "epoch": 3.82801360985094, + "grad_norm": 0.0754021480679512, + "learning_rate": 9.554885458391467e-06, + "loss": 0.2211, + "step": 47253 + }, + { + "epoch": 3.828094620868438, + "grad_norm": 0.07733095437288284, + "learning_rate": 9.550384805796841e-06, + "loss": 0.2136, + "step": 47254 + }, + { + "epoch": 3.8281756318859363, + "grad_norm": 0.0626547783613205, + "learning_rate": 9.545884153202215e-06, + "loss": 0.2049, + "step": 47255 + }, + { + "epoch": 3.828256642903435, + "grad_norm": 0.0786009356379509, + "learning_rate": 9.541383500607588e-06, + "loss": 0.226, + "step": 47256 + }, + { + "epoch": 3.8283376539209333, + "grad_norm": 0.07420649379491806, + "learning_rate": 9.536882848012962e-06, + "loss": 0.1857, + "step": 47257 + }, + { + "epoch": 3.8284186649384315, + "grad_norm": 0.08987807482481003, + "learning_rate": 9.532382195418337e-06, + "loss": 0.2385, + "step": 47258 + }, + { + "epoch": 3.82849967595593, + "grad_norm": 0.08396057784557343, + "learning_rate": 9.527881542823709e-06, + "loss": 0.227, + "step": 47259 + }, + { + "epoch": 3.8285806869734285, + "grad_norm": 0.08503457903862, + "learning_rate": 9.523380890229083e-06, + "loss": 0.2937, + "step": 47260 + }, + { + "epoch": 3.8286616979909267, + "grad_norm": 0.07922942191362381, + "learning_rate": 9.518880237634458e-06, + "loss": 0.2653, + "step": 47261 + }, + { + "epoch": 3.8287427090084254, + "grad_norm": 0.07605444639921188, + "learning_rate": 9.514379585039832e-06, + "loss": 0.2179, + "step": 47262 + }, + { + "epoch": 3.8288237200259236, + "grad_norm": 0.062154121696949005, + "learning_rate": 9.509878932445205e-06, + "loss": 0.2167, + "step": 47263 + }, + { + "epoch": 3.828904731043422, + "grad_norm": 0.059175748378038406, + "learning_rate": 9.505378279850579e-06, + "loss": 0.2376, + "step": 47264 + }, + { + "epoch": 3.82898574206092, + "grad_norm": 0.06250623613595963, + "learning_rate": 9.500877627255952e-06, + "loss": 0.2287, + "step": 47265 + }, + { + "epoch": 3.829066753078419, + "grad_norm": 0.06687180697917938, + "learning_rate": 9.496376974661326e-06, + "loss": 0.2141, + "step": 47266 + }, + { + "epoch": 3.829147764095917, + "grad_norm": 0.08554664254188538, + "learning_rate": 9.491876322066701e-06, + "loss": 0.2137, + "step": 47267 + }, + { + "epoch": 3.8292287751134153, + "grad_norm": 0.07720212638378143, + "learning_rate": 9.487375669472073e-06, + "loss": 0.2481, + "step": 47268 + }, + { + "epoch": 3.8293097861309136, + "grad_norm": 0.07368236035108566, + "learning_rate": 9.482875016877447e-06, + "loss": 0.1931, + "step": 47269 + }, + { + "epoch": 3.8293907971484122, + "grad_norm": 0.07523944973945618, + "learning_rate": 9.478374364282822e-06, + "loss": 0.198, + "step": 47270 + }, + { + "epoch": 3.8294718081659105, + "grad_norm": 0.07883183658123016, + "learning_rate": 9.473873711688196e-06, + "loss": 0.2222, + "step": 47271 + }, + { + "epoch": 3.8295528191834087, + "grad_norm": 0.06268235296010971, + "learning_rate": 9.469373059093568e-06, + "loss": 0.2399, + "step": 47272 + }, + { + "epoch": 3.8296338302009074, + "grad_norm": 0.06675557792186737, + "learning_rate": 9.464872406498943e-06, + "loss": 0.2021, + "step": 47273 + }, + { + "epoch": 3.8297148412184057, + "grad_norm": 0.07962408661842346, + "learning_rate": 9.460371753904317e-06, + "loss": 0.2065, + "step": 47274 + }, + { + "epoch": 3.829795852235904, + "grad_norm": 0.06950665265321732, + "learning_rate": 9.45587110130969e-06, + "loss": 0.2035, + "step": 47275 + }, + { + "epoch": 3.8298768632534026, + "grad_norm": 0.07848667353391647, + "learning_rate": 9.451370448715065e-06, + "loss": 0.2406, + "step": 47276 + }, + { + "epoch": 3.829957874270901, + "grad_norm": 0.07826760411262512, + "learning_rate": 9.446869796120437e-06, + "loss": 0.2433, + "step": 47277 + }, + { + "epoch": 3.830038885288399, + "grad_norm": 0.06589142233133316, + "learning_rate": 9.442369143525811e-06, + "loss": 0.2277, + "step": 47278 + }, + { + "epoch": 3.8301198963058978, + "grad_norm": 0.0657556802034378, + "learning_rate": 9.437868490931186e-06, + "loss": 0.2145, + "step": 47279 + }, + { + "epoch": 3.830200907323396, + "grad_norm": 0.08655353635549545, + "learning_rate": 9.43336783833656e-06, + "loss": 0.2212, + "step": 47280 + }, + { + "epoch": 3.8302819183408943, + "grad_norm": 0.06460024416446686, + "learning_rate": 9.428867185741932e-06, + "loss": 0.2437, + "step": 47281 + }, + { + "epoch": 3.830362929358393, + "grad_norm": 0.06946887820959091, + "learning_rate": 9.424366533147307e-06, + "loss": 0.2367, + "step": 47282 + }, + { + "epoch": 3.830443940375891, + "grad_norm": 0.07267460972070694, + "learning_rate": 9.41986588055268e-06, + "loss": 0.2402, + "step": 47283 + }, + { + "epoch": 3.8305249513933894, + "grad_norm": 0.06545892357826233, + "learning_rate": 9.415365227958054e-06, + "loss": 0.243, + "step": 47284 + }, + { + "epoch": 3.830605962410888, + "grad_norm": 0.07193044573068619, + "learning_rate": 9.410864575363428e-06, + "loss": 0.2309, + "step": 47285 + }, + { + "epoch": 3.8306869734283864, + "grad_norm": 0.07626271992921829, + "learning_rate": 9.406363922768801e-06, + "loss": 0.2003, + "step": 47286 + }, + { + "epoch": 3.8307679844458846, + "grad_norm": 0.06547614932060242, + "learning_rate": 9.401863270174175e-06, + "loss": 0.2529, + "step": 47287 + }, + { + "epoch": 3.830848995463383, + "grad_norm": 0.07183431833982468, + "learning_rate": 9.39736261757955e-06, + "loss": 0.1978, + "step": 47288 + }, + { + "epoch": 3.8309300064808816, + "grad_norm": 0.06502663344144821, + "learning_rate": 9.392861964984924e-06, + "loss": 0.2103, + "step": 47289 + }, + { + "epoch": 3.83101101749838, + "grad_norm": 0.0723065510392189, + "learning_rate": 9.388361312390296e-06, + "loss": 0.2106, + "step": 47290 + }, + { + "epoch": 3.831092028515878, + "grad_norm": 0.06067093834280968, + "learning_rate": 9.383860659795671e-06, + "loss": 0.2475, + "step": 47291 + }, + { + "epoch": 3.8311730395333763, + "grad_norm": 0.0675438940525055, + "learning_rate": 9.379360007201045e-06, + "loss": 0.2143, + "step": 47292 + }, + { + "epoch": 3.831254050550875, + "grad_norm": 0.08510125428438187, + "learning_rate": 9.374859354606418e-06, + "loss": 0.2459, + "step": 47293 + }, + { + "epoch": 3.8313350615683732, + "grad_norm": 0.08360667526721954, + "learning_rate": 9.370358702011792e-06, + "loss": 0.2058, + "step": 47294 + }, + { + "epoch": 3.8314160725858715, + "grad_norm": 0.06987543404102325, + "learning_rate": 9.365858049417166e-06, + "loss": 0.2325, + "step": 47295 + }, + { + "epoch": 3.83149708360337, + "grad_norm": 0.07574071735143661, + "learning_rate": 9.36135739682254e-06, + "loss": 0.2243, + "step": 47296 + }, + { + "epoch": 3.8315780946208684, + "grad_norm": 0.06666197627782822, + "learning_rate": 9.356856744227914e-06, + "loss": 0.2158, + "step": 47297 + }, + { + "epoch": 3.8316591056383666, + "grad_norm": 0.08488959819078445, + "learning_rate": 9.352356091633286e-06, + "loss": 0.2025, + "step": 47298 + }, + { + "epoch": 3.8317401166558653, + "grad_norm": 0.07799491286277771, + "learning_rate": 9.34785543903866e-06, + "loss": 0.209, + "step": 47299 + }, + { + "epoch": 3.8318211276733636, + "grad_norm": 0.07381635904312134, + "learning_rate": 9.343354786444035e-06, + "loss": 0.2521, + "step": 47300 + }, + { + "epoch": 3.831902138690862, + "grad_norm": 0.05815286561846733, + "learning_rate": 9.338854133849409e-06, + "loss": 0.2023, + "step": 47301 + }, + { + "epoch": 3.8319831497083605, + "grad_norm": 0.08299100399017334, + "learning_rate": 9.334353481254782e-06, + "loss": 0.1874, + "step": 47302 + }, + { + "epoch": 3.8320641607258588, + "grad_norm": 0.0680263414978981, + "learning_rate": 9.329852828660156e-06, + "loss": 0.2688, + "step": 47303 + }, + { + "epoch": 3.832145171743357, + "grad_norm": 0.07331136614084244, + "learning_rate": 9.32535217606553e-06, + "loss": 0.2756, + "step": 47304 + }, + { + "epoch": 3.8322261827608557, + "grad_norm": 0.07893449068069458, + "learning_rate": 9.320851523470903e-06, + "loss": 0.2511, + "step": 47305 + }, + { + "epoch": 3.832307193778354, + "grad_norm": 0.06692652404308319, + "learning_rate": 9.316350870876279e-06, + "loss": 0.1834, + "step": 47306 + }, + { + "epoch": 3.832388204795852, + "grad_norm": 0.10165359824895859, + "learning_rate": 9.31185021828165e-06, + "loss": 0.2666, + "step": 47307 + }, + { + "epoch": 3.832469215813351, + "grad_norm": 0.06731588393449783, + "learning_rate": 9.307349565687024e-06, + "loss": 0.2005, + "step": 47308 + }, + { + "epoch": 3.832550226830849, + "grad_norm": 0.09456529468297958, + "learning_rate": 9.3028489130924e-06, + "loss": 0.2468, + "step": 47309 + }, + { + "epoch": 3.8326312378483474, + "grad_norm": 0.07092951238155365, + "learning_rate": 9.298348260497773e-06, + "loss": 0.2123, + "step": 47310 + }, + { + "epoch": 3.8327122488658456, + "grad_norm": 0.07524535059928894, + "learning_rate": 9.293847607903147e-06, + "loss": 0.227, + "step": 47311 + }, + { + "epoch": 3.8327932598833443, + "grad_norm": 0.0925503671169281, + "learning_rate": 9.28934695530852e-06, + "loss": 0.2284, + "step": 47312 + }, + { + "epoch": 3.8328742709008425, + "grad_norm": 0.0901738703250885, + "learning_rate": 9.284846302713894e-06, + "loss": 0.2441, + "step": 47313 + }, + { + "epoch": 3.832955281918341, + "grad_norm": 0.06687404960393906, + "learning_rate": 9.280345650119267e-06, + "loss": 0.2427, + "step": 47314 + }, + { + "epoch": 3.833036292935839, + "grad_norm": 0.07051212340593338, + "learning_rate": 9.275844997524641e-06, + "loss": 0.2208, + "step": 47315 + }, + { + "epoch": 3.8331173039533377, + "grad_norm": 0.08045321702957153, + "learning_rate": 9.271344344930015e-06, + "loss": 0.2244, + "step": 47316 + }, + { + "epoch": 3.833198314970836, + "grad_norm": 0.06755134463310242, + "learning_rate": 9.266843692335388e-06, + "loss": 0.1857, + "step": 47317 + }, + { + "epoch": 3.833279325988334, + "grad_norm": 0.06752123683691025, + "learning_rate": 9.262343039740764e-06, + "loss": 0.2187, + "step": 47318 + }, + { + "epoch": 3.833360337005833, + "grad_norm": 0.07143384963274002, + "learning_rate": 9.257842387146137e-06, + "loss": 0.2296, + "step": 47319 + }, + { + "epoch": 3.833441348023331, + "grad_norm": 0.06815242767333984, + "learning_rate": 9.25334173455151e-06, + "loss": 0.1858, + "step": 47320 + }, + { + "epoch": 3.8335223590408294, + "grad_norm": 0.08003578335046768, + "learning_rate": 9.248841081956884e-06, + "loss": 0.2072, + "step": 47321 + }, + { + "epoch": 3.833603370058328, + "grad_norm": 0.07147976011037827, + "learning_rate": 9.244340429362258e-06, + "loss": 0.2295, + "step": 47322 + }, + { + "epoch": 3.8336843810758263, + "grad_norm": 0.07325585186481476, + "learning_rate": 9.239839776767632e-06, + "loss": 0.2423, + "step": 47323 + }, + { + "epoch": 3.8337653920933246, + "grad_norm": 0.06136981397867203, + "learning_rate": 9.235339124173005e-06, + "loss": 0.1829, + "step": 47324 + }, + { + "epoch": 3.8338464031108233, + "grad_norm": 0.07087627798318863, + "learning_rate": 9.230838471578379e-06, + "loss": 0.1777, + "step": 47325 + }, + { + "epoch": 3.8339274141283215, + "grad_norm": 0.07408089935779572, + "learning_rate": 9.226337818983752e-06, + "loss": 0.2293, + "step": 47326 + }, + { + "epoch": 3.8340084251458197, + "grad_norm": 0.08621295541524887, + "learning_rate": 9.221837166389128e-06, + "loss": 0.2657, + "step": 47327 + }, + { + "epoch": 3.8340894361633184, + "grad_norm": 0.06372913718223572, + "learning_rate": 9.2173365137945e-06, + "loss": 0.2012, + "step": 47328 + }, + { + "epoch": 3.8341704471808167, + "grad_norm": 0.07300638407468796, + "learning_rate": 9.212835861199875e-06, + "loss": 0.2104, + "step": 47329 + }, + { + "epoch": 3.834251458198315, + "grad_norm": 0.05497792363166809, + "learning_rate": 9.208335208605248e-06, + "loss": 0.2106, + "step": 47330 + }, + { + "epoch": 3.8343324692158136, + "grad_norm": 0.06223801150918007, + "learning_rate": 9.203834556010622e-06, + "loss": 0.1981, + "step": 47331 + }, + { + "epoch": 3.834413480233312, + "grad_norm": 0.08193052560091019, + "learning_rate": 9.199333903415996e-06, + "loss": 0.2798, + "step": 47332 + }, + { + "epoch": 3.83449449125081, + "grad_norm": 0.07334339618682861, + "learning_rate": 9.19483325082137e-06, + "loss": 0.2237, + "step": 47333 + }, + { + "epoch": 3.8345755022683083, + "grad_norm": 0.07783214747905731, + "learning_rate": 9.190332598226743e-06, + "loss": 0.2113, + "step": 47334 + }, + { + "epoch": 3.834656513285807, + "grad_norm": 0.07094062864780426, + "learning_rate": 9.185831945632118e-06, + "loss": 0.2396, + "step": 47335 + }, + { + "epoch": 3.8347375243033053, + "grad_norm": 0.07203572988510132, + "learning_rate": 9.181331293037492e-06, + "loss": 0.2329, + "step": 47336 + }, + { + "epoch": 3.8348185353208035, + "grad_norm": 0.06293873488903046, + "learning_rate": 9.176830640442864e-06, + "loss": 0.2344, + "step": 47337 + }, + { + "epoch": 3.8348995463383018, + "grad_norm": 0.07198001444339752, + "learning_rate": 9.172329987848239e-06, + "loss": 0.2002, + "step": 47338 + }, + { + "epoch": 3.8349805573558005, + "grad_norm": 0.07168415933847427, + "learning_rate": 9.167829335253613e-06, + "loss": 0.2593, + "step": 47339 + }, + { + "epoch": 3.8350615683732987, + "grad_norm": 0.06184476613998413, + "learning_rate": 9.163328682658986e-06, + "loss": 0.2313, + "step": 47340 + }, + { + "epoch": 3.835142579390797, + "grad_norm": 0.05653228610754013, + "learning_rate": 9.15882803006436e-06, + "loss": 0.1854, + "step": 47341 + }, + { + "epoch": 3.8352235904082956, + "grad_norm": 0.06754690408706665, + "learning_rate": 9.154327377469733e-06, + "loss": 0.2537, + "step": 47342 + }, + { + "epoch": 3.835304601425794, + "grad_norm": 0.07572982460260391, + "learning_rate": 9.149826724875107e-06, + "loss": 0.264, + "step": 47343 + }, + { + "epoch": 3.835385612443292, + "grad_norm": 0.07662025839090347, + "learning_rate": 9.145326072280482e-06, + "loss": 0.2123, + "step": 47344 + }, + { + "epoch": 3.835466623460791, + "grad_norm": 0.07293036580085754, + "learning_rate": 9.140825419685854e-06, + "loss": 0.231, + "step": 47345 + }, + { + "epoch": 3.835547634478289, + "grad_norm": 0.061868395656347275, + "learning_rate": 9.136324767091228e-06, + "loss": 0.2088, + "step": 47346 + }, + { + "epoch": 3.8356286454957873, + "grad_norm": 0.05718451365828514, + "learning_rate": 9.131824114496603e-06, + "loss": 0.2455, + "step": 47347 + }, + { + "epoch": 3.835709656513286, + "grad_norm": 0.06627894937992096, + "learning_rate": 9.127323461901977e-06, + "loss": 0.2288, + "step": 47348 + }, + { + "epoch": 3.8357906675307842, + "grad_norm": 0.08713270723819733, + "learning_rate": 9.12282280930735e-06, + "loss": 0.2371, + "step": 47349 + }, + { + "epoch": 3.8358716785482825, + "grad_norm": 0.05953427404165268, + "learning_rate": 9.118322156712724e-06, + "loss": 0.214, + "step": 47350 + }, + { + "epoch": 3.835952689565781, + "grad_norm": 0.06580721586942673, + "learning_rate": 9.113821504118098e-06, + "loss": 0.2079, + "step": 47351 + }, + { + "epoch": 3.8360337005832794, + "grad_norm": 0.08798111230134964, + "learning_rate": 9.109320851523471e-06, + "loss": 0.2701, + "step": 47352 + }, + { + "epoch": 3.8361147116007777, + "grad_norm": 0.0650232806801796, + "learning_rate": 9.104820198928846e-06, + "loss": 0.2124, + "step": 47353 + }, + { + "epoch": 3.8361957226182763, + "grad_norm": 0.0631076991558075, + "learning_rate": 9.100319546334218e-06, + "loss": 0.2093, + "step": 47354 + }, + { + "epoch": 3.8362767336357746, + "grad_norm": 0.0658016949892044, + "learning_rate": 9.095818893739592e-06, + "loss": 0.2163, + "step": 47355 + }, + { + "epoch": 3.836357744653273, + "grad_norm": 0.06750229001045227, + "learning_rate": 9.091318241144967e-06, + "loss": 0.2013, + "step": 47356 + }, + { + "epoch": 3.836438755670771, + "grad_norm": 0.07032305002212524, + "learning_rate": 9.08681758855034e-06, + "loss": 0.2533, + "step": 47357 + }, + { + "epoch": 3.8365197666882693, + "grad_norm": 0.0590241402387619, + "learning_rate": 9.082316935955713e-06, + "loss": 0.2212, + "step": 47358 + }, + { + "epoch": 3.836600777705768, + "grad_norm": 0.06706096976995468, + "learning_rate": 9.077816283361088e-06, + "loss": 0.2238, + "step": 47359 + }, + { + "epoch": 3.8366817887232663, + "grad_norm": 0.08542868494987488, + "learning_rate": 9.073315630766462e-06, + "loss": 0.2208, + "step": 47360 + }, + { + "epoch": 3.8367627997407645, + "grad_norm": 0.06941386312246323, + "learning_rate": 9.068814978171835e-06, + "loss": 0.2698, + "step": 47361 + }, + { + "epoch": 3.836843810758263, + "grad_norm": 0.060006238520145416, + "learning_rate": 9.06431432557721e-06, + "loss": 0.1835, + "step": 47362 + }, + { + "epoch": 3.8369248217757614, + "grad_norm": 0.06161088868975639, + "learning_rate": 9.059813672982582e-06, + "loss": 0.204, + "step": 47363 + }, + { + "epoch": 3.8370058327932597, + "grad_norm": 0.0850311890244484, + "learning_rate": 9.055313020387956e-06, + "loss": 0.2353, + "step": 47364 + }, + { + "epoch": 3.8370868438107584, + "grad_norm": 0.06477893143892288, + "learning_rate": 9.050812367793331e-06, + "loss": 0.2299, + "step": 47365 + }, + { + "epoch": 3.8371678548282566, + "grad_norm": 0.06894608587026596, + "learning_rate": 9.046311715198705e-06, + "loss": 0.232, + "step": 47366 + }, + { + "epoch": 3.837248865845755, + "grad_norm": 0.07398436218500137, + "learning_rate": 9.041811062604077e-06, + "loss": 0.2443, + "step": 47367 + }, + { + "epoch": 3.8373298768632536, + "grad_norm": 0.07592013478279114, + "learning_rate": 9.037310410009452e-06, + "loss": 0.2166, + "step": 47368 + }, + { + "epoch": 3.837410887880752, + "grad_norm": 0.07261329144239426, + "learning_rate": 9.032809757414826e-06, + "loss": 0.2163, + "step": 47369 + }, + { + "epoch": 3.83749189889825, + "grad_norm": 0.0708971843123436, + "learning_rate": 9.0283091048202e-06, + "loss": 0.256, + "step": 47370 + }, + { + "epoch": 3.8375729099157487, + "grad_norm": 0.07381059974431992, + "learning_rate": 9.023808452225573e-06, + "loss": 0.2307, + "step": 47371 + }, + { + "epoch": 3.837653920933247, + "grad_norm": 0.04990634694695473, + "learning_rate": 9.019307799630947e-06, + "loss": 0.2111, + "step": 47372 + }, + { + "epoch": 3.837734931950745, + "grad_norm": 0.06506974995136261, + "learning_rate": 9.01480714703632e-06, + "loss": 0.2267, + "step": 47373 + }, + { + "epoch": 3.837815942968244, + "grad_norm": 0.08760213106870651, + "learning_rate": 9.010306494441695e-06, + "loss": 0.2154, + "step": 47374 + }, + { + "epoch": 3.837896953985742, + "grad_norm": 0.07203999906778336, + "learning_rate": 9.005805841847069e-06, + "loss": 0.2392, + "step": 47375 + }, + { + "epoch": 3.8379779650032404, + "grad_norm": 0.0878889188170433, + "learning_rate": 9.001305189252441e-06, + "loss": 0.2725, + "step": 47376 + }, + { + "epoch": 3.838058976020739, + "grad_norm": 0.08162927627563477, + "learning_rate": 8.996804536657816e-06, + "loss": 0.217, + "step": 47377 + }, + { + "epoch": 3.8381399870382373, + "grad_norm": 0.07394520938396454, + "learning_rate": 8.99230388406319e-06, + "loss": 0.185, + "step": 47378 + }, + { + "epoch": 3.8382209980557356, + "grad_norm": 0.0606919601559639, + "learning_rate": 8.987803231468564e-06, + "loss": 0.2269, + "step": 47379 + }, + { + "epoch": 3.838302009073234, + "grad_norm": 0.07362279295921326, + "learning_rate": 8.983302578873937e-06, + "loss": 0.2205, + "step": 47380 + }, + { + "epoch": 3.838383020090732, + "grad_norm": 0.06929288804531097, + "learning_rate": 8.97880192627931e-06, + "loss": 0.1886, + "step": 47381 + }, + { + "epoch": 3.8384640311082308, + "grad_norm": 0.05175120756030083, + "learning_rate": 8.974301273684684e-06, + "loss": 0.1816, + "step": 47382 + }, + { + "epoch": 3.838545042125729, + "grad_norm": 0.06899210810661316, + "learning_rate": 8.96980062109006e-06, + "loss": 0.2392, + "step": 47383 + }, + { + "epoch": 3.8386260531432272, + "grad_norm": 0.07615984231233597, + "learning_rate": 8.965299968495432e-06, + "loss": 0.2399, + "step": 47384 + }, + { + "epoch": 3.838707064160726, + "grad_norm": 0.08293159306049347, + "learning_rate": 8.960799315900805e-06, + "loss": 0.2523, + "step": 47385 + }, + { + "epoch": 3.838788075178224, + "grad_norm": 0.06818405538797379, + "learning_rate": 8.95629866330618e-06, + "loss": 0.2465, + "step": 47386 + }, + { + "epoch": 3.8388690861957224, + "grad_norm": 0.07864660769701004, + "learning_rate": 8.951798010711554e-06, + "loss": 0.2487, + "step": 47387 + }, + { + "epoch": 3.838950097213221, + "grad_norm": 0.06878384947776794, + "learning_rate": 8.947297358116926e-06, + "loss": 0.2125, + "step": 47388 + }, + { + "epoch": 3.8390311082307194, + "grad_norm": 0.08059203624725342, + "learning_rate": 8.942796705522301e-06, + "loss": 0.1948, + "step": 47389 + }, + { + "epoch": 3.8391121192482176, + "grad_norm": 0.07346974313259125, + "learning_rate": 8.938296052927675e-06, + "loss": 0.2186, + "step": 47390 + }, + { + "epoch": 3.8391931302657163, + "grad_norm": 0.06083647534251213, + "learning_rate": 8.933795400333048e-06, + "loss": 0.2405, + "step": 47391 + }, + { + "epoch": 3.8392741412832145, + "grad_norm": 0.07009142637252808, + "learning_rate": 8.929294747738424e-06, + "loss": 0.2472, + "step": 47392 + }, + { + "epoch": 3.839355152300713, + "grad_norm": 0.07019902765750885, + "learning_rate": 8.924794095143796e-06, + "loss": 0.1912, + "step": 47393 + }, + { + "epoch": 3.8394361633182115, + "grad_norm": 0.07780317962169647, + "learning_rate": 8.92029344254917e-06, + "loss": 0.2321, + "step": 47394 + }, + { + "epoch": 3.8395171743357097, + "grad_norm": 0.061078991740942, + "learning_rate": 8.915792789954545e-06, + "loss": 0.1962, + "step": 47395 + }, + { + "epoch": 3.839598185353208, + "grad_norm": 0.07453559339046478, + "learning_rate": 8.911292137359918e-06, + "loss": 0.2545, + "step": 47396 + }, + { + "epoch": 3.8396791963707066, + "grad_norm": 0.0780182033777237, + "learning_rate": 8.90679148476529e-06, + "loss": 0.2747, + "step": 47397 + }, + { + "epoch": 3.839760207388205, + "grad_norm": 0.07100244611501694, + "learning_rate": 8.902290832170665e-06, + "loss": 0.2418, + "step": 47398 + }, + { + "epoch": 3.839841218405703, + "grad_norm": 0.053392693400382996, + "learning_rate": 8.897790179576039e-06, + "loss": 0.2013, + "step": 47399 + }, + { + "epoch": 3.839922229423202, + "grad_norm": 0.052919309586286545, + "learning_rate": 8.893289526981413e-06, + "loss": 0.2098, + "step": 47400 + }, + { + "epoch": 3.8400032404407, + "grad_norm": 0.0662708431482315, + "learning_rate": 8.888788874386786e-06, + "loss": 0.2121, + "step": 47401 + }, + { + "epoch": 3.8400842514581983, + "grad_norm": 0.07481697201728821, + "learning_rate": 8.88428822179216e-06, + "loss": 0.2062, + "step": 47402 + }, + { + "epoch": 3.8401652624756966, + "grad_norm": 0.07575065642595291, + "learning_rate": 8.879787569197533e-06, + "loss": 0.2033, + "step": 47403 + }, + { + "epoch": 3.840246273493195, + "grad_norm": 0.06485338509082794, + "learning_rate": 8.875286916602909e-06, + "loss": 0.2298, + "step": 47404 + }, + { + "epoch": 3.8403272845106935, + "grad_norm": 0.06933486461639404, + "learning_rate": 8.870786264008282e-06, + "loss": 0.2329, + "step": 47405 + }, + { + "epoch": 3.8404082955281917, + "grad_norm": 0.07605458796024323, + "learning_rate": 8.866285611413654e-06, + "loss": 0.2347, + "step": 47406 + }, + { + "epoch": 3.84048930654569, + "grad_norm": 0.07894845306873322, + "learning_rate": 8.86178495881903e-06, + "loss": 0.2167, + "step": 47407 + }, + { + "epoch": 3.8405703175631887, + "grad_norm": 0.0782691240310669, + "learning_rate": 8.857284306224403e-06, + "loss": 0.264, + "step": 47408 + }, + { + "epoch": 3.840651328580687, + "grad_norm": 0.07806120067834854, + "learning_rate": 8.852783653629777e-06, + "loss": 0.2047, + "step": 47409 + }, + { + "epoch": 3.840732339598185, + "grad_norm": 0.06680679321289062, + "learning_rate": 8.84828300103515e-06, + "loss": 0.1997, + "step": 47410 + }, + { + "epoch": 3.840813350615684, + "grad_norm": 0.09053415805101395, + "learning_rate": 8.843782348440524e-06, + "loss": 0.2392, + "step": 47411 + }, + { + "epoch": 3.840894361633182, + "grad_norm": 0.08757054805755615, + "learning_rate": 8.839281695845897e-06, + "loss": 0.2489, + "step": 47412 + }, + { + "epoch": 3.8409753726506803, + "grad_norm": 0.10022596269845963, + "learning_rate": 8.834781043251273e-06, + "loss": 0.2216, + "step": 47413 + }, + { + "epoch": 3.841056383668179, + "grad_norm": 0.06728453189134598, + "learning_rate": 8.830280390656645e-06, + "loss": 0.1942, + "step": 47414 + }, + { + "epoch": 3.8411373946856773, + "grad_norm": 0.07960285991430283, + "learning_rate": 8.825779738062018e-06, + "loss": 0.2327, + "step": 47415 + }, + { + "epoch": 3.8412184057031755, + "grad_norm": 0.0784974917769432, + "learning_rate": 8.821279085467394e-06, + "loss": 0.2411, + "step": 47416 + }, + { + "epoch": 3.841299416720674, + "grad_norm": 0.06307407468557358, + "learning_rate": 8.816778432872767e-06, + "loss": 0.198, + "step": 47417 + }, + { + "epoch": 3.8413804277381725, + "grad_norm": 0.07177886366844177, + "learning_rate": 8.81227778027814e-06, + "loss": 0.2221, + "step": 47418 + }, + { + "epoch": 3.8414614387556707, + "grad_norm": 0.07093466073274612, + "learning_rate": 8.807777127683514e-06, + "loss": 0.2285, + "step": 47419 + }, + { + "epoch": 3.8415424497731694, + "grad_norm": 0.07861575484275818, + "learning_rate": 8.803276475088888e-06, + "loss": 0.2226, + "step": 47420 + }, + { + "epoch": 3.8416234607906676, + "grad_norm": 0.08279623091220856, + "learning_rate": 8.798775822494262e-06, + "loss": 0.2354, + "step": 47421 + }, + { + "epoch": 3.841704471808166, + "grad_norm": 0.07097911089658737, + "learning_rate": 8.794275169899637e-06, + "loss": 0.1956, + "step": 47422 + }, + { + "epoch": 3.8417854828256646, + "grad_norm": 0.08817955106496811, + "learning_rate": 8.789774517305009e-06, + "loss": 0.2312, + "step": 47423 + }, + { + "epoch": 3.841866493843163, + "grad_norm": 0.06684723496437073, + "learning_rate": 8.785273864710382e-06, + "loss": 0.2451, + "step": 47424 + }, + { + "epoch": 3.841947504860661, + "grad_norm": 0.07929820567369461, + "learning_rate": 8.780773212115758e-06, + "loss": 0.2249, + "step": 47425 + }, + { + "epoch": 3.8420285158781593, + "grad_norm": 0.069803386926651, + "learning_rate": 8.776272559521131e-06, + "loss": 0.2111, + "step": 47426 + }, + { + "epoch": 3.8421095268956575, + "grad_norm": 0.07593082636594772, + "learning_rate": 8.771771906926505e-06, + "loss": 0.1928, + "step": 47427 + }, + { + "epoch": 3.8421905379131562, + "grad_norm": 0.0848335325717926, + "learning_rate": 8.767271254331879e-06, + "loss": 0.2202, + "step": 47428 + }, + { + "epoch": 3.8422715489306545, + "grad_norm": 0.07958927750587463, + "learning_rate": 8.762770601737252e-06, + "loss": 0.2404, + "step": 47429 + }, + { + "epoch": 3.8423525599481527, + "grad_norm": 0.08629477769136429, + "learning_rate": 8.758269949142626e-06, + "loss": 0.2137, + "step": 47430 + }, + { + "epoch": 3.8424335709656514, + "grad_norm": 0.0668138712644577, + "learning_rate": 8.753769296548e-06, + "loss": 0.2097, + "step": 47431 + }, + { + "epoch": 3.8425145819831497, + "grad_norm": 0.06344747543334961, + "learning_rate": 8.749268643953373e-06, + "loss": 0.2491, + "step": 47432 + }, + { + "epoch": 3.842595593000648, + "grad_norm": 0.05329788476228714, + "learning_rate": 8.744767991358748e-06, + "loss": 0.1815, + "step": 47433 + }, + { + "epoch": 3.8426766040181466, + "grad_norm": 0.06972850859165192, + "learning_rate": 8.740267338764122e-06, + "loss": 0.234, + "step": 47434 + }, + { + "epoch": 3.842757615035645, + "grad_norm": 0.06892770528793335, + "learning_rate": 8.735766686169495e-06, + "loss": 0.2041, + "step": 47435 + }, + { + "epoch": 3.842838626053143, + "grad_norm": 0.07953707873821259, + "learning_rate": 8.731266033574869e-06, + "loss": 0.2542, + "step": 47436 + }, + { + "epoch": 3.8429196370706418, + "grad_norm": 0.09926214069128036, + "learning_rate": 8.726765380980243e-06, + "loss": 0.2195, + "step": 47437 + }, + { + "epoch": 3.84300064808814, + "grad_norm": 0.08672218024730682, + "learning_rate": 8.722264728385616e-06, + "loss": 0.2131, + "step": 47438 + }, + { + "epoch": 3.8430816591056383, + "grad_norm": 0.07387741655111313, + "learning_rate": 8.71776407579099e-06, + "loss": 0.2321, + "step": 47439 + }, + { + "epoch": 3.843162670123137, + "grad_norm": 0.10262610018253326, + "learning_rate": 8.713263423196363e-06, + "loss": 0.2587, + "step": 47440 + }, + { + "epoch": 3.843243681140635, + "grad_norm": 0.0705871507525444, + "learning_rate": 8.708762770601737e-06, + "loss": 0.2413, + "step": 47441 + }, + { + "epoch": 3.8433246921581334, + "grad_norm": 0.0629960298538208, + "learning_rate": 8.704262118007112e-06, + "loss": 0.2274, + "step": 47442 + }, + { + "epoch": 3.843405703175632, + "grad_norm": 0.06705893576145172, + "learning_rate": 8.699761465412486e-06, + "loss": 0.2104, + "step": 47443 + }, + { + "epoch": 3.8434867141931304, + "grad_norm": 0.0768185630440712, + "learning_rate": 8.695260812817858e-06, + "loss": 0.2321, + "step": 47444 + }, + { + "epoch": 3.8435677252106286, + "grad_norm": 0.07738561928272247, + "learning_rate": 8.690760160223233e-06, + "loss": 0.2219, + "step": 47445 + }, + { + "epoch": 3.843648736228127, + "grad_norm": 0.087811179459095, + "learning_rate": 8.686259507628607e-06, + "loss": 0.2231, + "step": 47446 + }, + { + "epoch": 3.8437297472456255, + "grad_norm": 0.0710371658205986, + "learning_rate": 8.68175885503398e-06, + "loss": 0.2017, + "step": 47447 + }, + { + "epoch": 3.843810758263124, + "grad_norm": 0.06479393690824509, + "learning_rate": 8.677258202439354e-06, + "loss": 0.1882, + "step": 47448 + }, + { + "epoch": 3.843891769280622, + "grad_norm": 0.06025944650173187, + "learning_rate": 8.672757549844728e-06, + "loss": 0.2261, + "step": 47449 + }, + { + "epoch": 3.8439727802981203, + "grad_norm": 0.05862006917595863, + "learning_rate": 8.668256897250101e-06, + "loss": 0.213, + "step": 47450 + }, + { + "epoch": 3.844053791315619, + "grad_norm": 0.07549209147691727, + "learning_rate": 8.663756244655476e-06, + "loss": 0.2305, + "step": 47451 + }, + { + "epoch": 3.844134802333117, + "grad_norm": 0.07217980176210403, + "learning_rate": 8.65925559206085e-06, + "loss": 0.2431, + "step": 47452 + }, + { + "epoch": 3.8442158133506155, + "grad_norm": 0.07103247195482254, + "learning_rate": 8.654754939466222e-06, + "loss": 0.2087, + "step": 47453 + }, + { + "epoch": 3.844296824368114, + "grad_norm": 0.06943337619304657, + "learning_rate": 8.650254286871597e-06, + "loss": 0.2815, + "step": 47454 + }, + { + "epoch": 3.8443778353856124, + "grad_norm": 0.06522897630929947, + "learning_rate": 8.645753634276971e-06, + "loss": 0.1963, + "step": 47455 + }, + { + "epoch": 3.8444588464031106, + "grad_norm": 0.07871834933757782, + "learning_rate": 8.641252981682345e-06, + "loss": 0.2545, + "step": 47456 + }, + { + "epoch": 3.8445398574206093, + "grad_norm": 0.054104987531900406, + "learning_rate": 8.636752329087718e-06, + "loss": 0.2043, + "step": 47457 + }, + { + "epoch": 3.8446208684381076, + "grad_norm": 0.06654616445302963, + "learning_rate": 8.632251676493092e-06, + "loss": 0.2432, + "step": 47458 + }, + { + "epoch": 3.844701879455606, + "grad_norm": 0.09337121993303299, + "learning_rate": 8.627751023898465e-06, + "loss": 0.2573, + "step": 47459 + }, + { + "epoch": 3.8447828904731045, + "grad_norm": 0.0734034851193428, + "learning_rate": 8.62325037130384e-06, + "loss": 0.237, + "step": 47460 + }, + { + "epoch": 3.8448639014906028, + "grad_norm": 0.0719435065984726, + "learning_rate": 8.618749718709214e-06, + "loss": 0.1828, + "step": 47461 + }, + { + "epoch": 3.844944912508101, + "grad_norm": 0.0561227910220623, + "learning_rate": 8.614249066114586e-06, + "loss": 0.1634, + "step": 47462 + }, + { + "epoch": 3.8450259235255997, + "grad_norm": 0.06853722035884857, + "learning_rate": 8.609748413519961e-06, + "loss": 0.2296, + "step": 47463 + }, + { + "epoch": 3.845106934543098, + "grad_norm": 0.07261405140161514, + "learning_rate": 8.605247760925335e-06, + "loss": 0.2562, + "step": 47464 + }, + { + "epoch": 3.845187945560596, + "grad_norm": 0.06826271861791611, + "learning_rate": 8.600747108330709e-06, + "loss": 0.2318, + "step": 47465 + }, + { + "epoch": 3.845268956578095, + "grad_norm": 0.07757086306810379, + "learning_rate": 8.596246455736082e-06, + "loss": 0.1849, + "step": 47466 + }, + { + "epoch": 3.845349967595593, + "grad_norm": 0.07977145165205002, + "learning_rate": 8.591745803141456e-06, + "loss": 0.2338, + "step": 47467 + }, + { + "epoch": 3.8454309786130914, + "grad_norm": 0.06785174459218979, + "learning_rate": 8.58724515054683e-06, + "loss": 0.2292, + "step": 47468 + }, + { + "epoch": 3.8455119896305896, + "grad_norm": 0.07069353014230728, + "learning_rate": 8.582744497952205e-06, + "loss": 0.2281, + "step": 47469 + }, + { + "epoch": 3.8455930006480883, + "grad_norm": 0.0715603157877922, + "learning_rate": 8.578243845357577e-06, + "loss": 0.2052, + "step": 47470 + }, + { + "epoch": 3.8456740116655865, + "grad_norm": 0.08521236479282379, + "learning_rate": 8.57374319276295e-06, + "loss": 0.2068, + "step": 47471 + }, + { + "epoch": 3.845755022683085, + "grad_norm": 0.06399007141590118, + "learning_rate": 8.569242540168326e-06, + "loss": 0.2399, + "step": 47472 + }, + { + "epoch": 3.845836033700583, + "grad_norm": 0.06669558584690094, + "learning_rate": 8.5647418875737e-06, + "loss": 0.2103, + "step": 47473 + }, + { + "epoch": 3.8459170447180817, + "grad_norm": 0.06358367949724197, + "learning_rate": 8.560241234979071e-06, + "loss": 0.2175, + "step": 47474 + }, + { + "epoch": 3.84599805573558, + "grad_norm": 0.07232925295829773, + "learning_rate": 8.555740582384446e-06, + "loss": 0.264, + "step": 47475 + }, + { + "epoch": 3.846079066753078, + "grad_norm": 0.0711951032280922, + "learning_rate": 8.55123992978982e-06, + "loss": 0.2351, + "step": 47476 + }, + { + "epoch": 3.846160077770577, + "grad_norm": 0.07407108694314957, + "learning_rate": 8.546739277195194e-06, + "loss": 0.248, + "step": 47477 + }, + { + "epoch": 3.846241088788075, + "grad_norm": 0.06144161522388458, + "learning_rate": 8.542238624600569e-06, + "loss": 0.2372, + "step": 47478 + }, + { + "epoch": 3.8463220998055734, + "grad_norm": 0.07053658366203308, + "learning_rate": 8.53773797200594e-06, + "loss": 0.2325, + "step": 47479 + }, + { + "epoch": 3.846403110823072, + "grad_norm": 0.08187015354633331, + "learning_rate": 8.533237319411314e-06, + "loss": 0.2449, + "step": 47480 + }, + { + "epoch": 3.8464841218405703, + "grad_norm": 0.06435208022594452, + "learning_rate": 8.52873666681669e-06, + "loss": 0.2177, + "step": 47481 + }, + { + "epoch": 3.8465651328580686, + "grad_norm": 0.06776495277881622, + "learning_rate": 8.524236014222063e-06, + "loss": 0.197, + "step": 47482 + }, + { + "epoch": 3.8466461438755672, + "grad_norm": 0.07252959907054901, + "learning_rate": 8.519735361627435e-06, + "loss": 0.2276, + "step": 47483 + }, + { + "epoch": 3.8467271548930655, + "grad_norm": 0.08307579159736633, + "learning_rate": 8.51523470903281e-06, + "loss": 0.2332, + "step": 47484 + }, + { + "epoch": 3.8468081659105637, + "grad_norm": 0.07153511792421341, + "learning_rate": 8.510734056438184e-06, + "loss": 0.2307, + "step": 47485 + }, + { + "epoch": 3.8468891769280624, + "grad_norm": 0.07924754917621613, + "learning_rate": 8.506233403843558e-06, + "loss": 0.2227, + "step": 47486 + }, + { + "epoch": 3.8469701879455607, + "grad_norm": 0.08361861109733582, + "learning_rate": 8.501732751248931e-06, + "loss": 0.2129, + "step": 47487 + }, + { + "epoch": 3.847051198963059, + "grad_norm": 0.06627753376960754, + "learning_rate": 8.497232098654305e-06, + "loss": 0.2173, + "step": 47488 + }, + { + "epoch": 3.8471322099805576, + "grad_norm": 0.08294981718063354, + "learning_rate": 8.492731446059679e-06, + "loss": 0.2502, + "step": 47489 + }, + { + "epoch": 3.847213220998056, + "grad_norm": 0.060485485941171646, + "learning_rate": 8.488230793465054e-06, + "loss": 0.1837, + "step": 47490 + }, + { + "epoch": 3.847294232015554, + "grad_norm": 0.06141113489866257, + "learning_rate": 8.483730140870427e-06, + "loss": 0.2074, + "step": 47491 + }, + { + "epoch": 3.8473752430330523, + "grad_norm": 0.07824747264385223, + "learning_rate": 8.4792294882758e-06, + "loss": 0.2496, + "step": 47492 + }, + { + "epoch": 3.847456254050551, + "grad_norm": 0.058931589126586914, + "learning_rate": 8.474728835681175e-06, + "loss": 0.1947, + "step": 47493 + }, + { + "epoch": 3.8475372650680493, + "grad_norm": 0.08158572763204575, + "learning_rate": 8.470228183086548e-06, + "loss": 0.2319, + "step": 47494 + }, + { + "epoch": 3.8476182760855475, + "grad_norm": 0.06126769632101059, + "learning_rate": 8.465727530491922e-06, + "loss": 0.2286, + "step": 47495 + }, + { + "epoch": 3.8476992871030458, + "grad_norm": 0.05657956004142761, + "learning_rate": 8.461226877897295e-06, + "loss": 0.1731, + "step": 47496 + }, + { + "epoch": 3.8477802981205445, + "grad_norm": 0.07108405232429504, + "learning_rate": 8.456726225302669e-06, + "loss": 0.2344, + "step": 47497 + }, + { + "epoch": 3.8478613091380427, + "grad_norm": 0.07252726703882217, + "learning_rate": 8.452225572708043e-06, + "loss": 0.218, + "step": 47498 + }, + { + "epoch": 3.847942320155541, + "grad_norm": 0.07801450788974762, + "learning_rate": 8.447724920113418e-06, + "loss": 0.2097, + "step": 47499 + }, + { + "epoch": 3.8480233311730396, + "grad_norm": 0.06933198869228363, + "learning_rate": 8.44322426751879e-06, + "loss": 0.2117, + "step": 47500 + }, + { + "epoch": 3.848104342190538, + "grad_norm": 0.06913826614618301, + "learning_rate": 8.438723614924163e-06, + "loss": 0.2026, + "step": 47501 + }, + { + "epoch": 3.848185353208036, + "grad_norm": 0.08163614571094513, + "learning_rate": 8.434222962329539e-06, + "loss": 0.1878, + "step": 47502 + }, + { + "epoch": 3.848266364225535, + "grad_norm": 0.06842586398124695, + "learning_rate": 8.429722309734912e-06, + "loss": 0.2108, + "step": 47503 + }, + { + "epoch": 3.848347375243033, + "grad_norm": 0.07575143128633499, + "learning_rate": 8.425221657140284e-06, + "loss": 0.2302, + "step": 47504 + }, + { + "epoch": 3.8484283862605313, + "grad_norm": 0.06495438516139984, + "learning_rate": 8.42072100454566e-06, + "loss": 0.1875, + "step": 47505 + }, + { + "epoch": 3.84850939727803, + "grad_norm": 0.06851106882095337, + "learning_rate": 8.416220351951033e-06, + "loss": 0.2367, + "step": 47506 + }, + { + "epoch": 3.8485904082955282, + "grad_norm": 0.08247493207454681, + "learning_rate": 8.411719699356407e-06, + "loss": 0.2409, + "step": 47507 + }, + { + "epoch": 3.8486714193130265, + "grad_norm": 0.07875152677297592, + "learning_rate": 8.407219046761782e-06, + "loss": 0.246, + "step": 47508 + }, + { + "epoch": 3.848752430330525, + "grad_norm": 0.08067942410707474, + "learning_rate": 8.402718394167154e-06, + "loss": 0.2295, + "step": 47509 + }, + { + "epoch": 3.8488334413480234, + "grad_norm": 0.07155679911375046, + "learning_rate": 8.398217741572528e-06, + "loss": 0.2452, + "step": 47510 + }, + { + "epoch": 3.8489144523655217, + "grad_norm": 0.06344729661941528, + "learning_rate": 8.393717088977903e-06, + "loss": 0.2275, + "step": 47511 + }, + { + "epoch": 3.8489954633830203, + "grad_norm": 0.06181887909770012, + "learning_rate": 8.389216436383276e-06, + "loss": 0.1943, + "step": 47512 + }, + { + "epoch": 3.8490764744005186, + "grad_norm": 0.07348298281431198, + "learning_rate": 8.384715783788648e-06, + "loss": 0.1927, + "step": 47513 + }, + { + "epoch": 3.849157485418017, + "grad_norm": 0.06353019177913666, + "learning_rate": 8.380215131194024e-06, + "loss": 0.2485, + "step": 47514 + }, + { + "epoch": 3.849238496435515, + "grad_norm": 0.07545365393161774, + "learning_rate": 8.375714478599397e-06, + "loss": 0.2549, + "step": 47515 + }, + { + "epoch": 3.8493195074530138, + "grad_norm": 0.07624714821577072, + "learning_rate": 8.371213826004771e-06, + "loss": 0.2302, + "step": 47516 + }, + { + "epoch": 3.849400518470512, + "grad_norm": 0.06423209607601166, + "learning_rate": 8.366713173410144e-06, + "loss": 0.2249, + "step": 47517 + }, + { + "epoch": 3.8494815294880103, + "grad_norm": 0.08231019973754883, + "learning_rate": 8.362212520815518e-06, + "loss": 0.2052, + "step": 47518 + }, + { + "epoch": 3.8495625405055085, + "grad_norm": 0.06847333908081055, + "learning_rate": 8.357711868220892e-06, + "loss": 0.1864, + "step": 47519 + }, + { + "epoch": 3.849643551523007, + "grad_norm": 0.07198107242584229, + "learning_rate": 8.353211215626267e-06, + "loss": 0.2418, + "step": 47520 + }, + { + "epoch": 3.8497245625405054, + "grad_norm": 0.06648959964513779, + "learning_rate": 8.34871056303164e-06, + "loss": 0.2507, + "step": 47521 + }, + { + "epoch": 3.8498055735580037, + "grad_norm": 0.06837810575962067, + "learning_rate": 8.344209910437012e-06, + "loss": 0.2359, + "step": 47522 + }, + { + "epoch": 3.8498865845755024, + "grad_norm": 0.06529515236616135, + "learning_rate": 8.339709257842388e-06, + "loss": 0.2084, + "step": 47523 + }, + { + "epoch": 3.8499675955930006, + "grad_norm": 0.058206506073474884, + "learning_rate": 8.335208605247761e-06, + "loss": 0.186, + "step": 47524 + }, + { + "epoch": 3.850048606610499, + "grad_norm": 0.07718270272016525, + "learning_rate": 8.330707952653135e-06, + "loss": 0.2177, + "step": 47525 + }, + { + "epoch": 3.8501296176279975, + "grad_norm": 0.07879050821065903, + "learning_rate": 8.326207300058509e-06, + "loss": 0.2429, + "step": 47526 + }, + { + "epoch": 3.850210628645496, + "grad_norm": 0.0804138109087944, + "learning_rate": 8.321706647463882e-06, + "loss": 0.2276, + "step": 47527 + }, + { + "epoch": 3.850291639662994, + "grad_norm": 0.07796037197113037, + "learning_rate": 8.317205994869256e-06, + "loss": 0.2471, + "step": 47528 + }, + { + "epoch": 3.8503726506804927, + "grad_norm": 0.0824827328324318, + "learning_rate": 8.312705342274631e-06, + "loss": 0.2451, + "step": 47529 + }, + { + "epoch": 3.850453661697991, + "grad_norm": 0.06677177548408508, + "learning_rate": 8.308204689680003e-06, + "loss": 0.2119, + "step": 47530 + }, + { + "epoch": 3.850534672715489, + "grad_norm": 0.09421838819980621, + "learning_rate": 8.303704037085378e-06, + "loss": 0.2343, + "step": 47531 + }, + { + "epoch": 3.850615683732988, + "grad_norm": 0.06931822001934052, + "learning_rate": 8.299203384490752e-06, + "loss": 0.2017, + "step": 47532 + }, + { + "epoch": 3.850696694750486, + "grad_norm": 0.07877426594495773, + "learning_rate": 8.294702731896126e-06, + "loss": 0.2199, + "step": 47533 + }, + { + "epoch": 3.8507777057679844, + "grad_norm": 0.07271779328584671, + "learning_rate": 8.290202079301499e-06, + "loss": 0.2334, + "step": 47534 + }, + { + "epoch": 3.850858716785483, + "grad_norm": 0.0686742514371872, + "learning_rate": 8.285701426706873e-06, + "loss": 0.2049, + "step": 47535 + }, + { + "epoch": 3.8509397278029813, + "grad_norm": 0.08275677263736725, + "learning_rate": 8.281200774112246e-06, + "loss": 0.2126, + "step": 47536 + }, + { + "epoch": 3.8510207388204796, + "grad_norm": 0.059194862842559814, + "learning_rate": 8.27670012151762e-06, + "loss": 0.1964, + "step": 47537 + }, + { + "epoch": 3.851101749837978, + "grad_norm": 0.05792196840047836, + "learning_rate": 8.272199468922995e-06, + "loss": 0.1838, + "step": 47538 + }, + { + "epoch": 3.8511827608554765, + "grad_norm": 0.07841230928897858, + "learning_rate": 8.267698816328367e-06, + "loss": 0.2224, + "step": 47539 + }, + { + "epoch": 3.8512637718729748, + "grad_norm": 0.06587585806846619, + "learning_rate": 8.263198163733742e-06, + "loss": 0.2365, + "step": 47540 + }, + { + "epoch": 3.851344782890473, + "grad_norm": 0.06362347304821014, + "learning_rate": 8.258697511139116e-06, + "loss": 0.2163, + "step": 47541 + }, + { + "epoch": 3.8514257939079712, + "grad_norm": 0.06370340287685394, + "learning_rate": 8.25419685854449e-06, + "loss": 0.2243, + "step": 47542 + }, + { + "epoch": 3.85150680492547, + "grad_norm": 0.07428768277168274, + "learning_rate": 8.249696205949863e-06, + "loss": 0.2428, + "step": 47543 + }, + { + "epoch": 3.851587815942968, + "grad_norm": 0.08028055727481842, + "learning_rate": 8.245195553355237e-06, + "loss": 0.2172, + "step": 47544 + }, + { + "epoch": 3.8516688269604664, + "grad_norm": 0.06244838237762451, + "learning_rate": 8.24069490076061e-06, + "loss": 0.2239, + "step": 47545 + }, + { + "epoch": 3.851749837977965, + "grad_norm": 0.07794243097305298, + "learning_rate": 8.236194248165984e-06, + "loss": 0.2254, + "step": 47546 + }, + { + "epoch": 3.8518308489954634, + "grad_norm": 0.07461123168468475, + "learning_rate": 8.231693595571358e-06, + "loss": 0.2043, + "step": 47547 + }, + { + "epoch": 3.8519118600129616, + "grad_norm": 0.07418804615736008, + "learning_rate": 8.227192942976731e-06, + "loss": 0.2453, + "step": 47548 + }, + { + "epoch": 3.8519928710304603, + "grad_norm": 0.06374292820692062, + "learning_rate": 8.222692290382107e-06, + "loss": 0.205, + "step": 47549 + }, + { + "epoch": 3.8520738820479585, + "grad_norm": 0.061300892382860184, + "learning_rate": 8.21819163778748e-06, + "loss": 0.223, + "step": 47550 + }, + { + "epoch": 3.8521548930654568, + "grad_norm": 0.06799773126840591, + "learning_rate": 8.213690985192854e-06, + "loss": 0.2759, + "step": 47551 + }, + { + "epoch": 3.8522359040829555, + "grad_norm": 0.07855547964572906, + "learning_rate": 8.209190332598227e-06, + "loss": 0.2448, + "step": 47552 + }, + { + "epoch": 3.8523169151004537, + "grad_norm": 0.07066880166530609, + "learning_rate": 8.204689680003601e-06, + "loss": 0.2482, + "step": 47553 + }, + { + "epoch": 3.852397926117952, + "grad_norm": 0.08537422120571136, + "learning_rate": 8.200189027408975e-06, + "loss": 0.2267, + "step": 47554 + }, + { + "epoch": 3.8524789371354506, + "grad_norm": 0.07067983597517014, + "learning_rate": 8.195688374814348e-06, + "loss": 0.2373, + "step": 47555 + }, + { + "epoch": 3.852559948152949, + "grad_norm": 0.058485932648181915, + "learning_rate": 8.191187722219722e-06, + "loss": 0.1834, + "step": 47556 + }, + { + "epoch": 3.852640959170447, + "grad_norm": 0.06360162794589996, + "learning_rate": 8.186687069625095e-06, + "loss": 0.1954, + "step": 47557 + }, + { + "epoch": 3.852721970187946, + "grad_norm": 0.07332653552293777, + "learning_rate": 8.18218641703047e-06, + "loss": 0.2465, + "step": 47558 + }, + { + "epoch": 3.852802981205444, + "grad_norm": 0.07406601309776306, + "learning_rate": 8.177685764435844e-06, + "loss": 0.1827, + "step": 47559 + }, + { + "epoch": 3.8528839922229423, + "grad_norm": 0.06085878983139992, + "learning_rate": 8.173185111841216e-06, + "loss": 0.247, + "step": 47560 + }, + { + "epoch": 3.8529650032404406, + "grad_norm": 0.07216692715883255, + "learning_rate": 8.168684459246591e-06, + "loss": 0.2053, + "step": 47561 + }, + { + "epoch": 3.8530460142579392, + "grad_norm": 0.07375588268041611, + "learning_rate": 8.164183806651965e-06, + "loss": 0.2224, + "step": 47562 + }, + { + "epoch": 3.8531270252754375, + "grad_norm": 0.07635340839624405, + "learning_rate": 8.159683154057339e-06, + "loss": 0.2446, + "step": 47563 + }, + { + "epoch": 3.8532080362929357, + "grad_norm": 0.08759939670562744, + "learning_rate": 8.155182501462712e-06, + "loss": 0.2265, + "step": 47564 + }, + { + "epoch": 3.853289047310434, + "grad_norm": 0.0754440650343895, + "learning_rate": 8.150681848868086e-06, + "loss": 0.2195, + "step": 47565 + }, + { + "epoch": 3.8533700583279327, + "grad_norm": 0.08304356038570404, + "learning_rate": 8.14618119627346e-06, + "loss": 0.2309, + "step": 47566 + }, + { + "epoch": 3.853451069345431, + "grad_norm": 0.07643993943929672, + "learning_rate": 8.141680543678835e-06, + "loss": 0.2244, + "step": 47567 + }, + { + "epoch": 3.853532080362929, + "grad_norm": 0.08395697921514511, + "learning_rate": 8.137179891084208e-06, + "loss": 0.1951, + "step": 47568 + }, + { + "epoch": 3.853613091380428, + "grad_norm": 0.07898442447185516, + "learning_rate": 8.13267923848958e-06, + "loss": 0.2543, + "step": 47569 + }, + { + "epoch": 3.853694102397926, + "grad_norm": 0.0748404785990715, + "learning_rate": 8.128178585894956e-06, + "loss": 0.2381, + "step": 47570 + }, + { + "epoch": 3.8537751134154243, + "grad_norm": 0.05667143315076828, + "learning_rate": 8.12367793330033e-06, + "loss": 0.2349, + "step": 47571 + }, + { + "epoch": 3.853856124432923, + "grad_norm": 0.058684345334768295, + "learning_rate": 8.119177280705703e-06, + "loss": 0.2218, + "step": 47572 + }, + { + "epoch": 3.8539371354504213, + "grad_norm": 0.05510832741856575, + "learning_rate": 8.114676628111076e-06, + "loss": 0.1891, + "step": 47573 + }, + { + "epoch": 3.8540181464679195, + "grad_norm": 0.07431642711162567, + "learning_rate": 8.11017597551645e-06, + "loss": 0.2462, + "step": 47574 + }, + { + "epoch": 3.854099157485418, + "grad_norm": 0.07371494174003601, + "learning_rate": 8.105675322921824e-06, + "loss": 0.218, + "step": 47575 + }, + { + "epoch": 3.8541801685029164, + "grad_norm": 0.07297206670045853, + "learning_rate": 8.101174670327199e-06, + "loss": 0.23, + "step": 47576 + }, + { + "epoch": 3.8542611795204147, + "grad_norm": 0.07745885103940964, + "learning_rate": 8.096674017732573e-06, + "loss": 0.2658, + "step": 47577 + }, + { + "epoch": 3.8543421905379134, + "grad_norm": 0.07080569863319397, + "learning_rate": 8.092173365137944e-06, + "loss": 0.2377, + "step": 47578 + }, + { + "epoch": 3.8544232015554116, + "grad_norm": 0.07197723537683487, + "learning_rate": 8.08767271254332e-06, + "loss": 0.2297, + "step": 47579 + }, + { + "epoch": 3.85450421257291, + "grad_norm": 0.0773099884390831, + "learning_rate": 8.083172059948693e-06, + "loss": 0.2535, + "step": 47580 + }, + { + "epoch": 3.8545852235904086, + "grad_norm": 0.06363910436630249, + "learning_rate": 8.078671407354067e-06, + "loss": 0.2199, + "step": 47581 + }, + { + "epoch": 3.854666234607907, + "grad_norm": 0.06943506747484207, + "learning_rate": 8.07417075475944e-06, + "loss": 0.2324, + "step": 47582 + }, + { + "epoch": 3.854747245625405, + "grad_norm": 0.0709523931145668, + "learning_rate": 8.069670102164814e-06, + "loss": 0.2093, + "step": 47583 + }, + { + "epoch": 3.8548282566429033, + "grad_norm": 0.06644728779792786, + "learning_rate": 8.065169449570188e-06, + "loss": 0.1882, + "step": 47584 + }, + { + "epoch": 3.8549092676604015, + "grad_norm": 0.06858302652835846, + "learning_rate": 8.060668796975563e-06, + "loss": 0.2329, + "step": 47585 + }, + { + "epoch": 3.8549902786779002, + "grad_norm": 0.06228305771946907, + "learning_rate": 8.056168144380935e-06, + "loss": 0.2061, + "step": 47586 + }, + { + "epoch": 3.8550712896953985, + "grad_norm": 0.07189958542585373, + "learning_rate": 8.051667491786309e-06, + "loss": 0.2127, + "step": 47587 + }, + { + "epoch": 3.8551523007128967, + "grad_norm": 0.07673601806163788, + "learning_rate": 8.047166839191684e-06, + "loss": 0.181, + "step": 47588 + }, + { + "epoch": 3.8552333117303954, + "grad_norm": 0.07827714085578918, + "learning_rate": 8.042666186597057e-06, + "loss": 0.2282, + "step": 47589 + }, + { + "epoch": 3.8553143227478937, + "grad_norm": 0.08124902099370956, + "learning_rate": 8.03816553400243e-06, + "loss": 0.2034, + "step": 47590 + }, + { + "epoch": 3.855395333765392, + "grad_norm": 0.08442587405443192, + "learning_rate": 8.033664881407805e-06, + "loss": 0.2236, + "step": 47591 + }, + { + "epoch": 3.8554763447828906, + "grad_norm": 0.06765611469745636, + "learning_rate": 8.029164228813178e-06, + "loss": 0.2152, + "step": 47592 + }, + { + "epoch": 3.855557355800389, + "grad_norm": 0.06260386109352112, + "learning_rate": 8.024663576218552e-06, + "loss": 0.2243, + "step": 47593 + }, + { + "epoch": 3.855638366817887, + "grad_norm": 0.06030689924955368, + "learning_rate": 8.020162923623927e-06, + "loss": 0.2254, + "step": 47594 + }, + { + "epoch": 3.8557193778353858, + "grad_norm": 0.06486286222934723, + "learning_rate": 8.015662271029299e-06, + "loss": 0.2008, + "step": 47595 + }, + { + "epoch": 3.855800388852884, + "grad_norm": 0.06872467696666718, + "learning_rate": 8.011161618434673e-06, + "loss": 0.2458, + "step": 47596 + }, + { + "epoch": 3.8558813998703823, + "grad_norm": 0.06587346643209457, + "learning_rate": 8.006660965840048e-06, + "loss": 0.2315, + "step": 47597 + }, + { + "epoch": 3.855962410887881, + "grad_norm": 0.07045286148786545, + "learning_rate": 8.002160313245422e-06, + "loss": 0.1869, + "step": 47598 + }, + { + "epoch": 3.856043421905379, + "grad_norm": 0.09421037882566452, + "learning_rate": 7.997659660650794e-06, + "loss": 0.2215, + "step": 47599 + }, + { + "epoch": 3.8561244329228774, + "grad_norm": 0.09723097085952759, + "learning_rate": 7.993159008056169e-06, + "loss": 0.2381, + "step": 47600 + }, + { + "epoch": 3.856205443940376, + "grad_norm": 0.0791918933391571, + "learning_rate": 7.988658355461542e-06, + "loss": 0.2435, + "step": 47601 + }, + { + "epoch": 3.8562864549578744, + "grad_norm": 0.06977692991495132, + "learning_rate": 7.984157702866916e-06, + "loss": 0.2726, + "step": 47602 + }, + { + "epoch": 3.8563674659753726, + "grad_norm": 0.09157250821590424, + "learning_rate": 7.97965705027229e-06, + "loss": 0.234, + "step": 47603 + }, + { + "epoch": 3.8564484769928713, + "grad_norm": 0.07410857826471329, + "learning_rate": 7.975156397677663e-06, + "loss": 0.2527, + "step": 47604 + }, + { + "epoch": 3.8565294880103695, + "grad_norm": 0.05430734157562256, + "learning_rate": 7.970655745083037e-06, + "loss": 0.209, + "step": 47605 + }, + { + "epoch": 3.856610499027868, + "grad_norm": 0.08034774661064148, + "learning_rate": 7.966155092488412e-06, + "loss": 0.208, + "step": 47606 + }, + { + "epoch": 3.856691510045366, + "grad_norm": 0.07333406805992126, + "learning_rate": 7.961654439893786e-06, + "loss": 0.2328, + "step": 47607 + }, + { + "epoch": 3.8567725210628643, + "grad_norm": 0.06615491956472397, + "learning_rate": 7.957153787299158e-06, + "loss": 0.1903, + "step": 47608 + }, + { + "epoch": 3.856853532080363, + "grad_norm": 0.07822062820196152, + "learning_rate": 7.952653134704533e-06, + "loss": 0.2189, + "step": 47609 + }, + { + "epoch": 3.856934543097861, + "grad_norm": 0.06766904890537262, + "learning_rate": 7.948152482109907e-06, + "loss": 0.2208, + "step": 47610 + }, + { + "epoch": 3.8570155541153595, + "grad_norm": 0.08099085837602615, + "learning_rate": 7.94365182951528e-06, + "loss": 0.2364, + "step": 47611 + }, + { + "epoch": 3.857096565132858, + "grad_norm": 0.06338600814342499, + "learning_rate": 7.939151176920654e-06, + "loss": 0.2428, + "step": 47612 + }, + { + "epoch": 3.8571775761503564, + "grad_norm": 0.06889151781797409, + "learning_rate": 7.934650524326027e-06, + "loss": 0.232, + "step": 47613 + }, + { + "epoch": 3.8572585871678546, + "grad_norm": 0.08298153430223465, + "learning_rate": 7.930149871731401e-06, + "loss": 0.2129, + "step": 47614 + }, + { + "epoch": 3.8573395981853533, + "grad_norm": 0.09067533910274506, + "learning_rate": 7.925649219136776e-06, + "loss": 0.2504, + "step": 47615 + }, + { + "epoch": 3.8574206092028516, + "grad_norm": 0.07296990603208542, + "learning_rate": 7.921148566542148e-06, + "loss": 0.208, + "step": 47616 + }, + { + "epoch": 3.85750162022035, + "grad_norm": 0.09865029901266098, + "learning_rate": 7.916647913947522e-06, + "loss": 0.2474, + "step": 47617 + }, + { + "epoch": 3.8575826312378485, + "grad_norm": 0.06511934846639633, + "learning_rate": 7.912147261352897e-06, + "loss": 0.2175, + "step": 47618 + }, + { + "epoch": 3.8576636422553467, + "grad_norm": 0.08340641856193542, + "learning_rate": 7.90764660875827e-06, + "loss": 0.2267, + "step": 47619 + }, + { + "epoch": 3.857744653272845, + "grad_norm": 0.06559319794178009, + "learning_rate": 7.903145956163644e-06, + "loss": 0.2252, + "step": 47620 + }, + { + "epoch": 3.8578256642903437, + "grad_norm": 0.07104986160993576, + "learning_rate": 7.898645303569018e-06, + "loss": 0.2364, + "step": 47621 + }, + { + "epoch": 3.857906675307842, + "grad_norm": 0.06943206489086151, + "learning_rate": 7.894144650974391e-06, + "loss": 0.2119, + "step": 47622 + }, + { + "epoch": 3.85798768632534, + "grad_norm": 0.05858639255166054, + "learning_rate": 7.889643998379765e-06, + "loss": 0.2225, + "step": 47623 + }, + { + "epoch": 3.858068697342839, + "grad_norm": 0.0890335813164711, + "learning_rate": 7.88514334578514e-06, + "loss": 0.2403, + "step": 47624 + }, + { + "epoch": 3.858149708360337, + "grad_norm": 0.061061326414346695, + "learning_rate": 7.880642693190512e-06, + "loss": 0.2216, + "step": 47625 + }, + { + "epoch": 3.8582307193778353, + "grad_norm": 0.049848560243844986, + "learning_rate": 7.876142040595886e-06, + "loss": 0.2138, + "step": 47626 + }, + { + "epoch": 3.858311730395334, + "grad_norm": 0.08001705259084702, + "learning_rate": 7.871641388001261e-06, + "loss": 0.2296, + "step": 47627 + }, + { + "epoch": 3.8583927414128323, + "grad_norm": 0.09020736813545227, + "learning_rate": 7.867140735406635e-06, + "loss": 0.2792, + "step": 47628 + }, + { + "epoch": 3.8584737524303305, + "grad_norm": 0.09471940994262695, + "learning_rate": 7.862640082812008e-06, + "loss": 0.2496, + "step": 47629 + }, + { + "epoch": 3.8585547634478288, + "grad_norm": 0.08428974449634552, + "learning_rate": 7.858139430217382e-06, + "loss": 0.2456, + "step": 47630 + }, + { + "epoch": 3.858635774465327, + "grad_norm": 0.0708090215921402, + "learning_rate": 7.853638777622756e-06, + "loss": 0.2004, + "step": 47631 + }, + { + "epoch": 3.8587167854828257, + "grad_norm": 0.07518108189105988, + "learning_rate": 7.84913812502813e-06, + "loss": 0.2188, + "step": 47632 + }, + { + "epoch": 3.858797796500324, + "grad_norm": 0.07420327514410019, + "learning_rate": 7.844637472433503e-06, + "loss": 0.2057, + "step": 47633 + }, + { + "epoch": 3.858878807517822, + "grad_norm": 0.08655344694852829, + "learning_rate": 7.840136819838876e-06, + "loss": 0.2228, + "step": 47634 + }, + { + "epoch": 3.858959818535321, + "grad_norm": 0.0829836055636406, + "learning_rate": 7.83563616724425e-06, + "loss": 0.2225, + "step": 47635 + }, + { + "epoch": 3.859040829552819, + "grad_norm": 0.06697103381156921, + "learning_rate": 7.831135514649625e-06, + "loss": 0.1977, + "step": 47636 + }, + { + "epoch": 3.8591218405703174, + "grad_norm": 0.0661018043756485, + "learning_rate": 7.826634862054999e-06, + "loss": 0.2293, + "step": 47637 + }, + { + "epoch": 3.859202851587816, + "grad_norm": 0.07524219155311584, + "learning_rate": 7.822134209460373e-06, + "loss": 0.2315, + "step": 47638 + }, + { + "epoch": 3.8592838626053143, + "grad_norm": 0.06726586073637009, + "learning_rate": 7.817633556865746e-06, + "loss": 0.2015, + "step": 47639 + }, + { + "epoch": 3.8593648736228126, + "grad_norm": 0.06795717775821686, + "learning_rate": 7.81313290427112e-06, + "loss": 0.2322, + "step": 47640 + }, + { + "epoch": 3.8594458846403112, + "grad_norm": 0.07243220508098602, + "learning_rate": 7.808632251676493e-06, + "loss": 0.2447, + "step": 47641 + }, + { + "epoch": 3.8595268956578095, + "grad_norm": 0.06129208952188492, + "learning_rate": 7.804131599081867e-06, + "loss": 0.2262, + "step": 47642 + }, + { + "epoch": 3.8596079066753077, + "grad_norm": 0.06558024883270264, + "learning_rate": 7.79963094648724e-06, + "loss": 0.2312, + "step": 47643 + }, + { + "epoch": 3.8596889176928064, + "grad_norm": 0.07786918431520462, + "learning_rate": 7.795130293892614e-06, + "loss": 0.198, + "step": 47644 + }, + { + "epoch": 3.8597699287103047, + "grad_norm": 0.06113965064287186, + "learning_rate": 7.79062964129799e-06, + "loss": 0.2327, + "step": 47645 + }, + { + "epoch": 3.859850939727803, + "grad_norm": 0.06368988752365112, + "learning_rate": 7.786128988703361e-06, + "loss": 0.2153, + "step": 47646 + }, + { + "epoch": 3.8599319507453016, + "grad_norm": 0.06596409529447556, + "learning_rate": 7.781628336108737e-06, + "loss": 0.2012, + "step": 47647 + }, + { + "epoch": 3.8600129617628, + "grad_norm": 0.06855443865060806, + "learning_rate": 7.77712768351411e-06, + "loss": 0.1867, + "step": 47648 + }, + { + "epoch": 3.860093972780298, + "grad_norm": 0.08048706501722336, + "learning_rate": 7.772627030919484e-06, + "loss": 0.231, + "step": 47649 + }, + { + "epoch": 3.8601749837977968, + "grad_norm": 0.06914348155260086, + "learning_rate": 7.768126378324857e-06, + "loss": 0.1876, + "step": 47650 + }, + { + "epoch": 3.860255994815295, + "grad_norm": 0.059210408478975296, + "learning_rate": 7.763625725730231e-06, + "loss": 0.1888, + "step": 47651 + }, + { + "epoch": 3.8603370058327933, + "grad_norm": 0.07270028442144394, + "learning_rate": 7.759125073135605e-06, + "loss": 0.2167, + "step": 47652 + }, + { + "epoch": 3.8604180168502915, + "grad_norm": 0.07310686260461807, + "learning_rate": 7.754624420540978e-06, + "loss": 0.219, + "step": 47653 + }, + { + "epoch": 3.8604990278677898, + "grad_norm": 0.07835370302200317, + "learning_rate": 7.750123767946354e-06, + "loss": 0.2336, + "step": 47654 + }, + { + "epoch": 3.8605800388852884, + "grad_norm": 0.06291946768760681, + "learning_rate": 7.745623115351725e-06, + "loss": 0.2268, + "step": 47655 + }, + { + "epoch": 3.8606610499027867, + "grad_norm": 0.07243097573518753, + "learning_rate": 7.7411224627571e-06, + "loss": 0.2729, + "step": 47656 + }, + { + "epoch": 3.860742060920285, + "grad_norm": 0.07852333039045334, + "learning_rate": 7.736621810162474e-06, + "loss": 0.2527, + "step": 47657 + }, + { + "epoch": 3.8608230719377836, + "grad_norm": 0.0972912460565567, + "learning_rate": 7.732121157567848e-06, + "loss": 0.2353, + "step": 47658 + }, + { + "epoch": 3.860904082955282, + "grad_norm": 0.05944295972585678, + "learning_rate": 7.727620504973222e-06, + "loss": 0.1901, + "step": 47659 + }, + { + "epoch": 3.86098509397278, + "grad_norm": 0.06972935795783997, + "learning_rate": 7.723119852378595e-06, + "loss": 0.2063, + "step": 47660 + }, + { + "epoch": 3.861066104990279, + "grad_norm": 0.07527864724397659, + "learning_rate": 7.718619199783969e-06, + "loss": 0.2041, + "step": 47661 + }, + { + "epoch": 3.861147116007777, + "grad_norm": 0.06941043585538864, + "learning_rate": 7.714118547189342e-06, + "loss": 0.2499, + "step": 47662 + }, + { + "epoch": 3.8612281270252753, + "grad_norm": 0.06668348610401154, + "learning_rate": 7.709617894594718e-06, + "loss": 0.2021, + "step": 47663 + }, + { + "epoch": 3.861309138042774, + "grad_norm": 0.06672746688127518, + "learning_rate": 7.70511724200009e-06, + "loss": 0.2108, + "step": 47664 + }, + { + "epoch": 3.8613901490602722, + "grad_norm": 0.07876428961753845, + "learning_rate": 7.700616589405465e-06, + "loss": 0.2696, + "step": 47665 + }, + { + "epoch": 3.8614711600777705, + "grad_norm": 0.07343681901693344, + "learning_rate": 7.696115936810838e-06, + "loss": 0.2279, + "step": 47666 + }, + { + "epoch": 3.861552171095269, + "grad_norm": 0.06768275797367096, + "learning_rate": 7.691615284216212e-06, + "loss": 0.233, + "step": 47667 + }, + { + "epoch": 3.8616331821127674, + "grad_norm": 0.07021360844373703, + "learning_rate": 7.687114631621586e-06, + "loss": 0.2214, + "step": 47668 + }, + { + "epoch": 3.8617141931302656, + "grad_norm": 0.06176622211933136, + "learning_rate": 7.68261397902696e-06, + "loss": 0.2481, + "step": 47669 + }, + { + "epoch": 3.8617952041477643, + "grad_norm": 0.08668699860572815, + "learning_rate": 7.678113326432333e-06, + "loss": 0.2582, + "step": 47670 + }, + { + "epoch": 3.8618762151652626, + "grad_norm": 0.07205960154533386, + "learning_rate": 7.673612673837706e-06, + "loss": 0.2159, + "step": 47671 + }, + { + "epoch": 3.861957226182761, + "grad_norm": 0.0772663801908493, + "learning_rate": 7.66911202124308e-06, + "loss": 0.2653, + "step": 47672 + }, + { + "epoch": 3.862038237200259, + "grad_norm": 0.10105203837156296, + "learning_rate": 7.664611368648454e-06, + "loss": 0.2585, + "step": 47673 + }, + { + "epoch": 3.8621192482177578, + "grad_norm": 0.06276296079158783, + "learning_rate": 7.660110716053829e-06, + "loss": 0.1984, + "step": 47674 + }, + { + "epoch": 3.862200259235256, + "grad_norm": 0.06691402941942215, + "learning_rate": 7.655610063459203e-06, + "loss": 0.2076, + "step": 47675 + }, + { + "epoch": 3.8622812702527543, + "grad_norm": 0.06842536479234695, + "learning_rate": 7.651109410864575e-06, + "loss": 0.2234, + "step": 47676 + }, + { + "epoch": 3.8623622812702525, + "grad_norm": 0.08435767143964767, + "learning_rate": 7.64660875826995e-06, + "loss": 0.2745, + "step": 47677 + }, + { + "epoch": 3.862443292287751, + "grad_norm": 0.06818706542253494, + "learning_rate": 7.642108105675323e-06, + "loss": 0.2251, + "step": 47678 + }, + { + "epoch": 3.8625243033052494, + "grad_norm": 0.07116655260324478, + "learning_rate": 7.637607453080697e-06, + "loss": 0.2734, + "step": 47679 + }, + { + "epoch": 3.8626053143227477, + "grad_norm": 0.06385378539562225, + "learning_rate": 7.63310680048607e-06, + "loss": 0.2295, + "step": 47680 + }, + { + "epoch": 3.8626863253402464, + "grad_norm": 0.07078246772289276, + "learning_rate": 7.628606147891444e-06, + "loss": 0.2238, + "step": 47681 + }, + { + "epoch": 3.8627673363577446, + "grad_norm": 0.09647151082754135, + "learning_rate": 7.624105495296819e-06, + "loss": 0.2137, + "step": 47682 + }, + { + "epoch": 3.862848347375243, + "grad_norm": 0.07162981480360031, + "learning_rate": 7.619604842702192e-06, + "loss": 0.2156, + "step": 47683 + }, + { + "epoch": 3.8629293583927415, + "grad_norm": 0.06130341812968254, + "learning_rate": 7.615104190107567e-06, + "loss": 0.2207, + "step": 47684 + }, + { + "epoch": 3.86301036941024, + "grad_norm": 0.0788789615035057, + "learning_rate": 7.6106035375129395e-06, + "loss": 0.2426, + "step": 47685 + }, + { + "epoch": 3.863091380427738, + "grad_norm": 0.07394041866064072, + "learning_rate": 7.606102884918313e-06, + "loss": 0.2206, + "step": 47686 + }, + { + "epoch": 3.8631723914452367, + "grad_norm": 0.07278414815664291, + "learning_rate": 7.6016022323236875e-06, + "loss": 0.2395, + "step": 47687 + }, + { + "epoch": 3.863253402462735, + "grad_norm": 0.07134126871824265, + "learning_rate": 7.597101579729061e-06, + "loss": 0.2305, + "step": 47688 + }, + { + "epoch": 3.863334413480233, + "grad_norm": 0.07982486486434937, + "learning_rate": 7.592600927134434e-06, + "loss": 0.2611, + "step": 47689 + }, + { + "epoch": 3.863415424497732, + "grad_norm": 0.06409557163715363, + "learning_rate": 7.588100274539808e-06, + "loss": 0.2097, + "step": 47690 + }, + { + "epoch": 3.86349643551523, + "grad_norm": 0.0836891233921051, + "learning_rate": 7.583599621945183e-06, + "loss": 0.214, + "step": 47691 + }, + { + "epoch": 3.8635774465327284, + "grad_norm": 0.06255742907524109, + "learning_rate": 7.579098969350556e-06, + "loss": 0.2042, + "step": 47692 + }, + { + "epoch": 3.863658457550227, + "grad_norm": 0.08783721923828125, + "learning_rate": 7.574598316755931e-06, + "loss": 0.2644, + "step": 47693 + }, + { + "epoch": 3.8637394685677253, + "grad_norm": 0.05096682161092758, + "learning_rate": 7.570097664161304e-06, + "loss": 0.166, + "step": 47694 + }, + { + "epoch": 3.8638204795852236, + "grad_norm": 0.07620611041784286, + "learning_rate": 7.565597011566677e-06, + "loss": 0.2563, + "step": 47695 + }, + { + "epoch": 3.863901490602722, + "grad_norm": 0.0640488713979721, + "learning_rate": 7.561096358972052e-06, + "loss": 0.2226, + "step": 47696 + }, + { + "epoch": 3.8639825016202205, + "grad_norm": 0.05692597106099129, + "learning_rate": 7.556595706377425e-06, + "loss": 0.2158, + "step": 47697 + }, + { + "epoch": 3.8640635126377187, + "grad_norm": 0.08596763014793396, + "learning_rate": 7.552095053782798e-06, + "loss": 0.2586, + "step": 47698 + }, + { + "epoch": 3.864144523655217, + "grad_norm": 0.06715045869350433, + "learning_rate": 7.5475944011881725e-06, + "loss": 0.2337, + "step": 47699 + }, + { + "epoch": 3.8642255346727152, + "grad_norm": 0.06749710440635681, + "learning_rate": 7.543093748593547e-06, + "loss": 0.2108, + "step": 47700 + }, + { + "epoch": 3.864306545690214, + "grad_norm": 0.08679332584142685, + "learning_rate": 7.5385930959989205e-06, + "loss": 0.2405, + "step": 47701 + }, + { + "epoch": 3.864387556707712, + "grad_norm": 0.07425546646118164, + "learning_rate": 7.534092443404293e-06, + "loss": 0.2116, + "step": 47702 + }, + { + "epoch": 3.8644685677252104, + "grad_norm": 0.07318223267793655, + "learning_rate": 7.529591790809668e-06, + "loss": 0.2119, + "step": 47703 + }, + { + "epoch": 3.864549578742709, + "grad_norm": 0.0675434023141861, + "learning_rate": 7.525091138215041e-06, + "loss": 0.2143, + "step": 47704 + }, + { + "epoch": 3.8646305897602073, + "grad_norm": 0.07143059372901917, + "learning_rate": 7.520590485620416e-06, + "loss": 0.2349, + "step": 47705 + }, + { + "epoch": 3.8647116007777056, + "grad_norm": 0.07418014854192734, + "learning_rate": 7.516089833025789e-06, + "loss": 0.2372, + "step": 47706 + }, + { + "epoch": 3.8647926117952043, + "grad_norm": 0.0735107809305191, + "learning_rate": 7.511589180431162e-06, + "loss": 0.2023, + "step": 47707 + }, + { + "epoch": 3.8648736228127025, + "grad_norm": 0.06702899187803268, + "learning_rate": 7.507088527836537e-06, + "loss": 0.2527, + "step": 47708 + }, + { + "epoch": 3.8649546338302008, + "grad_norm": 0.0680052861571312, + "learning_rate": 7.502587875241911e-06, + "loss": 0.2181, + "step": 47709 + }, + { + "epoch": 3.8650356448476995, + "grad_norm": 0.07556014508008957, + "learning_rate": 7.498087222647285e-06, + "loss": 0.1964, + "step": 47710 + }, + { + "epoch": 3.8651166558651977, + "grad_norm": 0.06902261078357697, + "learning_rate": 7.493586570052657e-06, + "loss": 0.212, + "step": 47711 + }, + { + "epoch": 3.865197666882696, + "grad_norm": 0.07958406209945679, + "learning_rate": 7.489085917458032e-06, + "loss": 0.2346, + "step": 47712 + }, + { + "epoch": 3.8652786779001946, + "grad_norm": 0.076631560921669, + "learning_rate": 7.4845852648634055e-06, + "loss": 0.2123, + "step": 47713 + }, + { + "epoch": 3.865359688917693, + "grad_norm": 0.06917714327573776, + "learning_rate": 7.48008461226878e-06, + "loss": 0.2468, + "step": 47714 + }, + { + "epoch": 3.865440699935191, + "grad_norm": 0.08754683285951614, + "learning_rate": 7.475583959674153e-06, + "loss": 0.2226, + "step": 47715 + }, + { + "epoch": 3.86552171095269, + "grad_norm": 0.07786770164966583, + "learning_rate": 7.471083307079526e-06, + "loss": 0.2222, + "step": 47716 + }, + { + "epoch": 3.865602721970188, + "grad_norm": 0.06999102234840393, + "learning_rate": 7.466582654484901e-06, + "loss": 0.2052, + "step": 47717 + }, + { + "epoch": 3.8656837329876863, + "grad_norm": 0.0734010711312294, + "learning_rate": 7.462082001890275e-06, + "loss": 0.1994, + "step": 47718 + }, + { + "epoch": 3.8657647440051845, + "grad_norm": 0.07305295020341873, + "learning_rate": 7.457581349295648e-06, + "loss": 0.2284, + "step": 47719 + }, + { + "epoch": 3.8658457550226832, + "grad_norm": 0.07487478107213974, + "learning_rate": 7.4530806967010215e-06, + "loss": 0.2056, + "step": 47720 + }, + { + "epoch": 3.8659267660401815, + "grad_norm": 0.07713651657104492, + "learning_rate": 7.448580044106396e-06, + "loss": 0.2125, + "step": 47721 + }, + { + "epoch": 3.8660077770576797, + "grad_norm": 0.07192755490541458, + "learning_rate": 7.44407939151177e-06, + "loss": 0.2273, + "step": 47722 + }, + { + "epoch": 3.866088788075178, + "grad_norm": 0.07173226028680801, + "learning_rate": 7.439578738917144e-06, + "loss": 0.2011, + "step": 47723 + }, + { + "epoch": 3.8661697990926767, + "grad_norm": 0.09787497669458389, + "learning_rate": 7.435078086322517e-06, + "loss": 0.2243, + "step": 47724 + }, + { + "epoch": 3.866250810110175, + "grad_norm": 0.09983297437429428, + "learning_rate": 7.43057743372789e-06, + "loss": 0.277, + "step": 47725 + }, + { + "epoch": 3.866331821127673, + "grad_norm": 0.0823056623339653, + "learning_rate": 7.426076781133265e-06, + "loss": 0.2216, + "step": 47726 + }, + { + "epoch": 3.866412832145172, + "grad_norm": 0.059470660984516144, + "learning_rate": 7.421576128538639e-06, + "loss": 0.2341, + "step": 47727 + }, + { + "epoch": 3.86649384316267, + "grad_norm": 0.06996002048254013, + "learning_rate": 7.417075475944012e-06, + "loss": 0.2243, + "step": 47728 + }, + { + "epoch": 3.8665748541801683, + "grad_norm": 0.07670509815216064, + "learning_rate": 7.412574823349386e-06, + "loss": 0.1981, + "step": 47729 + }, + { + "epoch": 3.866655865197667, + "grad_norm": 0.07600407302379608, + "learning_rate": 7.40807417075476e-06, + "loss": 0.2125, + "step": 47730 + }, + { + "epoch": 3.8667368762151653, + "grad_norm": 0.0708894208073616, + "learning_rate": 7.403573518160134e-06, + "loss": 0.2284, + "step": 47731 + }, + { + "epoch": 3.8668178872326635, + "grad_norm": 0.07632531970739365, + "learning_rate": 7.3990728655655065e-06, + "loss": 0.2346, + "step": 47732 + }, + { + "epoch": 3.866898898250162, + "grad_norm": 0.06774915009737015, + "learning_rate": 7.394572212970881e-06, + "loss": 0.2466, + "step": 47733 + }, + { + "epoch": 3.8669799092676604, + "grad_norm": 0.07281273603439331, + "learning_rate": 7.3900715603762545e-06, + "loss": 0.2315, + "step": 47734 + }, + { + "epoch": 3.8670609202851587, + "grad_norm": 0.06819894909858704, + "learning_rate": 7.385570907781629e-06, + "loss": 0.2103, + "step": 47735 + }, + { + "epoch": 3.8671419313026574, + "grad_norm": 0.07330294698476791, + "learning_rate": 7.381070255187003e-06, + "loss": 0.2339, + "step": 47736 + }, + { + "epoch": 3.8672229423201556, + "grad_norm": 0.0813525915145874, + "learning_rate": 7.376569602592376e-06, + "loss": 0.2204, + "step": 47737 + }, + { + "epoch": 3.867303953337654, + "grad_norm": 0.05763068050146103, + "learning_rate": 7.37206894999775e-06, + "loss": 0.2022, + "step": 47738 + }, + { + "epoch": 3.8673849643551526, + "grad_norm": 0.07878169417381287, + "learning_rate": 7.367568297403124e-06, + "loss": 0.2099, + "step": 47739 + }, + { + "epoch": 3.867465975372651, + "grad_norm": 0.06456834822893143, + "learning_rate": 7.363067644808498e-06, + "loss": 0.2102, + "step": 47740 + }, + { + "epoch": 3.867546986390149, + "grad_norm": 0.06957262009382248, + "learning_rate": 7.358566992213871e-06, + "loss": 0.2178, + "step": 47741 + }, + { + "epoch": 3.8676279974076473, + "grad_norm": 0.06936120241880417, + "learning_rate": 7.354066339619245e-06, + "loss": 0.2231, + "step": 47742 + }, + { + "epoch": 3.867709008425146, + "grad_norm": 0.08154875040054321, + "learning_rate": 7.349565687024619e-06, + "loss": 0.2084, + "step": 47743 + }, + { + "epoch": 3.8677900194426442, + "grad_norm": 0.07597959786653519, + "learning_rate": 7.345065034429993e-06, + "loss": 0.2453, + "step": 47744 + }, + { + "epoch": 3.8678710304601425, + "grad_norm": 0.05550116300582886, + "learning_rate": 7.340564381835366e-06, + "loss": 0.2199, + "step": 47745 + }, + { + "epoch": 3.8679520414776407, + "grad_norm": 0.06986594945192337, + "learning_rate": 7.33606372924074e-06, + "loss": 0.2177, + "step": 47746 + }, + { + "epoch": 3.8680330524951394, + "grad_norm": 0.09499722719192505, + "learning_rate": 7.331563076646114e-06, + "loss": 0.2531, + "step": 47747 + }, + { + "epoch": 3.8681140635126376, + "grad_norm": 0.07436975091695786, + "learning_rate": 7.327062424051488e-06, + "loss": 0.2138, + "step": 47748 + }, + { + "epoch": 3.868195074530136, + "grad_norm": 0.08323274552822113, + "learning_rate": 7.322561771456862e-06, + "loss": 0.2353, + "step": 47749 + }, + { + "epoch": 3.8682760855476346, + "grad_norm": 0.06948281079530716, + "learning_rate": 7.318061118862235e-06, + "loss": 0.2377, + "step": 47750 + }, + { + "epoch": 3.868357096565133, + "grad_norm": 0.08530833572149277, + "learning_rate": 7.313560466267609e-06, + "loss": 0.2491, + "step": 47751 + }, + { + "epoch": 3.868438107582631, + "grad_norm": 0.07238127291202545, + "learning_rate": 7.309059813672984e-06, + "loss": 0.2442, + "step": 47752 + }, + { + "epoch": 3.8685191186001298, + "grad_norm": 0.06473894417285919, + "learning_rate": 7.304559161078357e-06, + "loss": 0.2375, + "step": 47753 + }, + { + "epoch": 3.868600129617628, + "grad_norm": 0.06814999133348465, + "learning_rate": 7.30005850848373e-06, + "loss": 0.2127, + "step": 47754 + }, + { + "epoch": 3.8686811406351262, + "grad_norm": 0.08416423946619034, + "learning_rate": 7.295557855889104e-06, + "loss": 0.2159, + "step": 47755 + }, + { + "epoch": 3.868762151652625, + "grad_norm": 0.06730145215988159, + "learning_rate": 7.291057203294478e-06, + "loss": 0.2514, + "step": 47756 + }, + { + "epoch": 3.868843162670123, + "grad_norm": 0.061186712235212326, + "learning_rate": 7.2865565506998525e-06, + "loss": 0.1958, + "step": 47757 + }, + { + "epoch": 3.8689241736876214, + "grad_norm": 0.07807473093271255, + "learning_rate": 7.282055898105225e-06, + "loss": 0.1886, + "step": 47758 + }, + { + "epoch": 3.86900518470512, + "grad_norm": 0.07652845978736877, + "learning_rate": 7.277555245510599e-06, + "loss": 0.2328, + "step": 47759 + }, + { + "epoch": 3.8690861957226184, + "grad_norm": 0.07402211427688599, + "learning_rate": 7.273054592915973e-06, + "loss": 0.2609, + "step": 47760 + }, + { + "epoch": 3.8691672067401166, + "grad_norm": 0.05133407190442085, + "learning_rate": 7.268553940321348e-06, + "loss": 0.2124, + "step": 47761 + }, + { + "epoch": 3.8692482177576153, + "grad_norm": 0.08077628910541534, + "learning_rate": 7.26405328772672e-06, + "loss": 0.2662, + "step": 47762 + }, + { + "epoch": 3.8693292287751135, + "grad_norm": 0.06384014338254929, + "learning_rate": 7.259552635132094e-06, + "loss": 0.2063, + "step": 47763 + }, + { + "epoch": 3.869410239792612, + "grad_norm": 0.06897179782390594, + "learning_rate": 7.2550519825374685e-06, + "loss": 0.227, + "step": 47764 + }, + { + "epoch": 3.86949125081011, + "grad_norm": 0.0741807222366333, + "learning_rate": 7.250551329942842e-06, + "loss": 0.1986, + "step": 47765 + }, + { + "epoch": 3.8695722618276087, + "grad_norm": 0.07519097626209259, + "learning_rate": 7.246050677348217e-06, + "loss": 0.2383, + "step": 47766 + }, + { + "epoch": 3.869653272845107, + "grad_norm": 0.07148484885692596, + "learning_rate": 7.241550024753589e-06, + "loss": 0.2481, + "step": 47767 + }, + { + "epoch": 3.869734283862605, + "grad_norm": 0.07364317774772644, + "learning_rate": 7.237049372158963e-06, + "loss": 0.2098, + "step": 47768 + }, + { + "epoch": 3.8698152948801035, + "grad_norm": 0.07559014856815338, + "learning_rate": 7.232548719564337e-06, + "loss": 0.2394, + "step": 47769 + }, + { + "epoch": 3.869896305897602, + "grad_norm": 0.07173273712396622, + "learning_rate": 7.228048066969712e-06, + "loss": 0.2245, + "step": 47770 + }, + { + "epoch": 3.8699773169151004, + "grad_norm": 0.07271765172481537, + "learning_rate": 7.223547414375085e-06, + "loss": 0.1872, + "step": 47771 + }, + { + "epoch": 3.8700583279325986, + "grad_norm": 0.06458864361047745, + "learning_rate": 7.219046761780458e-06, + "loss": 0.2289, + "step": 47772 + }, + { + "epoch": 3.8701393389500973, + "grad_norm": 0.07476481795310974, + "learning_rate": 7.214546109185833e-06, + "loss": 0.2548, + "step": 47773 + }, + { + "epoch": 3.8702203499675956, + "grad_norm": 0.07546652108430862, + "learning_rate": 7.210045456591206e-06, + "loss": 0.2514, + "step": 47774 + }, + { + "epoch": 3.870301360985094, + "grad_norm": 0.06332651525735855, + "learning_rate": 7.205544803996579e-06, + "loss": 0.1829, + "step": 47775 + }, + { + "epoch": 3.8703823720025925, + "grad_norm": 0.07313700765371323, + "learning_rate": 7.2010441514019535e-06, + "loss": 0.2082, + "step": 47776 + }, + { + "epoch": 3.8704633830200907, + "grad_norm": 0.06601651757955551, + "learning_rate": 7.196543498807327e-06, + "loss": 0.2213, + "step": 47777 + }, + { + "epoch": 3.870544394037589, + "grad_norm": 0.07746600359678268, + "learning_rate": 7.1920428462127015e-06, + "loss": 0.2392, + "step": 47778 + }, + { + "epoch": 3.8706254050550877, + "grad_norm": 0.08070232719182968, + "learning_rate": 7.187542193618076e-06, + "loss": 0.2264, + "step": 47779 + }, + { + "epoch": 3.870706416072586, + "grad_norm": 0.06326809525489807, + "learning_rate": 7.183041541023449e-06, + "loss": 0.2145, + "step": 47780 + }, + { + "epoch": 3.870787427090084, + "grad_norm": 0.058843906968832016, + "learning_rate": 7.178540888428822e-06, + "loss": 0.1969, + "step": 47781 + }, + { + "epoch": 3.870868438107583, + "grad_norm": 0.05918506532907486, + "learning_rate": 7.174040235834197e-06, + "loss": 0.2175, + "step": 47782 + }, + { + "epoch": 3.870949449125081, + "grad_norm": 0.071584552526474, + "learning_rate": 7.16953958323957e-06, + "loss": 0.1764, + "step": 47783 + }, + { + "epoch": 3.8710304601425793, + "grad_norm": 0.05277230218052864, + "learning_rate": 7.165038930644943e-06, + "loss": 0.2319, + "step": 47784 + }, + { + "epoch": 3.871111471160078, + "grad_norm": 0.07672587037086487, + "learning_rate": 7.160538278050318e-06, + "loss": 0.209, + "step": 47785 + }, + { + "epoch": 3.8711924821775763, + "grad_norm": 0.08189419656991959, + "learning_rate": 7.156037625455691e-06, + "loss": 0.2412, + "step": 47786 + }, + { + "epoch": 3.8712734931950745, + "grad_norm": 0.06885275989770889, + "learning_rate": 7.151536972861066e-06, + "loss": 0.2351, + "step": 47787 + }, + { + "epoch": 3.8713545042125728, + "grad_norm": 0.07223542034626007, + "learning_rate": 7.147036320266438e-06, + "loss": 0.2375, + "step": 47788 + }, + { + "epoch": 3.8714355152300715, + "grad_norm": 0.07468521595001221, + "learning_rate": 7.142535667671813e-06, + "loss": 0.2284, + "step": 47789 + }, + { + "epoch": 3.8715165262475697, + "grad_norm": 0.06014881283044815, + "learning_rate": 7.1380350150771865e-06, + "loss": 0.1972, + "step": 47790 + }, + { + "epoch": 3.871597537265068, + "grad_norm": 0.0729193314909935, + "learning_rate": 7.133534362482561e-06, + "loss": 0.2033, + "step": 47791 + }, + { + "epoch": 3.871678548282566, + "grad_norm": 0.0688919946551323, + "learning_rate": 7.129033709887934e-06, + "loss": 0.2597, + "step": 47792 + }, + { + "epoch": 3.871759559300065, + "grad_norm": 0.08578962832689285, + "learning_rate": 7.124533057293307e-06, + "loss": 0.2209, + "step": 47793 + }, + { + "epoch": 3.871840570317563, + "grad_norm": 0.06939221173524857, + "learning_rate": 7.120032404698682e-06, + "loss": 0.2107, + "step": 47794 + }, + { + "epoch": 3.8719215813350614, + "grad_norm": 0.07219339162111282, + "learning_rate": 7.115531752104055e-06, + "loss": 0.2238, + "step": 47795 + }, + { + "epoch": 3.87200259235256, + "grad_norm": 0.06591325253248215, + "learning_rate": 7.11103109950943e-06, + "loss": 0.2357, + "step": 47796 + }, + { + "epoch": 3.8720836033700583, + "grad_norm": 0.06282512098550797, + "learning_rate": 7.1065304469148025e-06, + "loss": 0.209, + "step": 47797 + }, + { + "epoch": 3.8721646143875565, + "grad_norm": 0.07997066527605057, + "learning_rate": 7.102029794320177e-06, + "loss": 0.2465, + "step": 47798 + }, + { + "epoch": 3.8722456254050552, + "grad_norm": 0.08144117146730423, + "learning_rate": 7.097529141725551e-06, + "loss": 0.2519, + "step": 47799 + }, + { + "epoch": 3.8723266364225535, + "grad_norm": 0.06622984260320663, + "learning_rate": 7.093028489130925e-06, + "loss": 0.1985, + "step": 47800 + }, + { + "epoch": 3.8724076474400517, + "grad_norm": 0.06841832399368286, + "learning_rate": 7.088527836536298e-06, + "loss": 0.2252, + "step": 47801 + }, + { + "epoch": 3.8724886584575504, + "grad_norm": 0.06852193176746368, + "learning_rate": 7.084027183941671e-06, + "loss": 0.2244, + "step": 47802 + }, + { + "epoch": 3.8725696694750487, + "grad_norm": 0.09205551445484161, + "learning_rate": 7.079526531347046e-06, + "loss": 0.2398, + "step": 47803 + }, + { + "epoch": 3.872650680492547, + "grad_norm": 0.07046569138765335, + "learning_rate": 7.0750258787524195e-06, + "loss": 0.2459, + "step": 47804 + }, + { + "epoch": 3.8727316915100456, + "grad_norm": 0.0705006867647171, + "learning_rate": 7.070525226157792e-06, + "loss": 0.2379, + "step": 47805 + }, + { + "epoch": 3.872812702527544, + "grad_norm": 0.06745926290750504, + "learning_rate": 7.066024573563167e-06, + "loss": 0.2444, + "step": 47806 + }, + { + "epoch": 3.872893713545042, + "grad_norm": 0.06479792296886444, + "learning_rate": 7.061523920968541e-06, + "loss": 0.1962, + "step": 47807 + }, + { + "epoch": 3.8729747245625408, + "grad_norm": 0.07828095555305481, + "learning_rate": 7.057023268373915e-06, + "loss": 0.2304, + "step": 47808 + }, + { + "epoch": 3.873055735580039, + "grad_norm": 0.05744514614343643, + "learning_rate": 7.052522615779289e-06, + "loss": 0.2172, + "step": 47809 + }, + { + "epoch": 3.8731367465975373, + "grad_norm": 0.06972257792949677, + "learning_rate": 7.048021963184662e-06, + "loss": 0.2432, + "step": 47810 + }, + { + "epoch": 3.8732177576150355, + "grad_norm": 0.058357443660497665, + "learning_rate": 7.0435213105900355e-06, + "loss": 0.1878, + "step": 47811 + }, + { + "epoch": 3.8732987686325338, + "grad_norm": 0.06073423847556114, + "learning_rate": 7.03902065799541e-06, + "loss": 0.2022, + "step": 47812 + }, + { + "epoch": 3.8733797796500324, + "grad_norm": 0.059745658189058304, + "learning_rate": 7.034520005400784e-06, + "loss": 0.1826, + "step": 47813 + }, + { + "epoch": 3.8734607906675307, + "grad_norm": 0.07808952778577805, + "learning_rate": 7.030019352806156e-06, + "loss": 0.2375, + "step": 47814 + }, + { + "epoch": 3.873541801685029, + "grad_norm": 0.07170235365629196, + "learning_rate": 7.025518700211531e-06, + "loss": 0.2244, + "step": 47815 + }, + { + "epoch": 3.8736228127025276, + "grad_norm": 0.07987401634454727, + "learning_rate": 7.021018047616905e-06, + "loss": 0.2086, + "step": 47816 + }, + { + "epoch": 3.873703823720026, + "grad_norm": 0.0680292397737503, + "learning_rate": 7.016517395022279e-06, + "loss": 0.2353, + "step": 47817 + }, + { + "epoch": 3.873784834737524, + "grad_norm": 0.06402581185102463, + "learning_rate": 7.012016742427652e-06, + "loss": 0.1892, + "step": 47818 + }, + { + "epoch": 3.873865845755023, + "grad_norm": 0.07108760625123978, + "learning_rate": 7.007516089833026e-06, + "loss": 0.2752, + "step": 47819 + }, + { + "epoch": 3.873946856772521, + "grad_norm": 0.07389482855796814, + "learning_rate": 7.0030154372384e-06, + "loss": 0.2215, + "step": 47820 + }, + { + "epoch": 3.8740278677900193, + "grad_norm": 0.07002807408571243, + "learning_rate": 6.998514784643774e-06, + "loss": 0.2196, + "step": 47821 + }, + { + "epoch": 3.874108878807518, + "grad_norm": 0.07865195721387863, + "learning_rate": 6.9940141320491485e-06, + "loss": 0.2383, + "step": 47822 + }, + { + "epoch": 3.874189889825016, + "grad_norm": 0.07489572465419769, + "learning_rate": 6.9895134794545205e-06, + "loss": 0.2034, + "step": 47823 + }, + { + "epoch": 3.8742709008425145, + "grad_norm": 0.0735786184668541, + "learning_rate": 6.985012826859895e-06, + "loss": 0.2558, + "step": 47824 + }, + { + "epoch": 3.874351911860013, + "grad_norm": 0.07100185006856918, + "learning_rate": 6.980512174265269e-06, + "loss": 0.2405, + "step": 47825 + }, + { + "epoch": 3.8744329228775114, + "grad_norm": 0.06581882387399673, + "learning_rate": 6.976011521670643e-06, + "loss": 0.1944, + "step": 47826 + }, + { + "epoch": 3.8745139338950096, + "grad_norm": 0.06488057225942612, + "learning_rate": 6.971510869076016e-06, + "loss": 0.224, + "step": 47827 + }, + { + "epoch": 3.8745949449125083, + "grad_norm": 0.07766662538051605, + "learning_rate": 6.96701021648139e-06, + "loss": 0.234, + "step": 47828 + }, + { + "epoch": 3.8746759559300066, + "grad_norm": 0.06239504739642143, + "learning_rate": 6.962509563886764e-06, + "loss": 0.1958, + "step": 47829 + }, + { + "epoch": 3.874756966947505, + "grad_norm": 0.06142323091626167, + "learning_rate": 6.958008911292138e-06, + "loss": 0.219, + "step": 47830 + }, + { + "epoch": 3.8748379779650035, + "grad_norm": 0.07786870002746582, + "learning_rate": 6.953508258697511e-06, + "loss": 0.239, + "step": 47831 + }, + { + "epoch": 3.8749189889825018, + "grad_norm": 0.062448933720588684, + "learning_rate": 6.949007606102885e-06, + "loss": 0.2096, + "step": 47832 + }, + { + "epoch": 3.875, + "grad_norm": 0.05940355360507965, + "learning_rate": 6.944506953508259e-06, + "loss": 0.1905, + "step": 47833 + }, + { + "epoch": 3.8750810110174982, + "grad_norm": 0.05543764308094978, + "learning_rate": 6.9400063009136335e-06, + "loss": 0.1915, + "step": 47834 + }, + { + "epoch": 3.8751620220349965, + "grad_norm": 0.08461619913578033, + "learning_rate": 6.935505648319006e-06, + "loss": 0.2633, + "step": 47835 + }, + { + "epoch": 3.875243033052495, + "grad_norm": 0.0797097235918045, + "learning_rate": 6.93100499572438e-06, + "loss": 0.2521, + "step": 47836 + }, + { + "epoch": 3.8753240440699934, + "grad_norm": 0.07515867799520493, + "learning_rate": 6.926504343129754e-06, + "loss": 0.2429, + "step": 47837 + }, + { + "epoch": 3.8754050550874917, + "grad_norm": 0.0636037066578865, + "learning_rate": 6.922003690535128e-06, + "loss": 0.1941, + "step": 47838 + }, + { + "epoch": 3.8754860661049904, + "grad_norm": 0.0706285685300827, + "learning_rate": 6.917503037940502e-06, + "loss": 0.2353, + "step": 47839 + }, + { + "epoch": 3.8755670771224886, + "grad_norm": 0.0567207969725132, + "learning_rate": 6.913002385345875e-06, + "loss": 0.1933, + "step": 47840 + }, + { + "epoch": 3.875648088139987, + "grad_norm": 0.07477942109107971, + "learning_rate": 6.908501732751249e-06, + "loss": 0.2183, + "step": 47841 + }, + { + "epoch": 3.8757290991574855, + "grad_norm": 0.057915329933166504, + "learning_rate": 6.904001080156623e-06, + "loss": 0.2068, + "step": 47842 + }, + { + "epoch": 3.875810110174984, + "grad_norm": 0.0781538337469101, + "learning_rate": 6.899500427561998e-06, + "loss": 0.1967, + "step": 47843 + }, + { + "epoch": 3.875891121192482, + "grad_norm": 0.07939945161342621, + "learning_rate": 6.89499977496737e-06, + "loss": 0.2356, + "step": 47844 + }, + { + "epoch": 3.8759721322099807, + "grad_norm": 0.05997350439429283, + "learning_rate": 6.890499122372744e-06, + "loss": 0.2141, + "step": 47845 + }, + { + "epoch": 3.876053143227479, + "grad_norm": 0.06055265665054321, + "learning_rate": 6.885998469778118e-06, + "loss": 0.2461, + "step": 47846 + }, + { + "epoch": 3.876134154244977, + "grad_norm": 0.07572788745164871, + "learning_rate": 6.881497817183492e-06, + "loss": 0.2126, + "step": 47847 + }, + { + "epoch": 3.876215165262476, + "grad_norm": 0.06472624838352203, + "learning_rate": 6.876997164588865e-06, + "loss": 0.189, + "step": 47848 + }, + { + "epoch": 3.876296176279974, + "grad_norm": 0.08194227516651154, + "learning_rate": 6.872496511994239e-06, + "loss": 0.1988, + "step": 47849 + }, + { + "epoch": 3.8763771872974724, + "grad_norm": 0.08095631748437881, + "learning_rate": 6.867995859399614e-06, + "loss": 0.2483, + "step": 47850 + }, + { + "epoch": 3.876458198314971, + "grad_norm": 0.075938880443573, + "learning_rate": 6.863495206804987e-06, + "loss": 0.2077, + "step": 47851 + }, + { + "epoch": 3.8765392093324693, + "grad_norm": 0.07606460154056549, + "learning_rate": 6.858994554210362e-06, + "loss": 0.2077, + "step": 47852 + }, + { + "epoch": 3.8766202203499676, + "grad_norm": 0.06377337127923965, + "learning_rate": 6.8544939016157345e-06, + "loss": 0.2375, + "step": 47853 + }, + { + "epoch": 3.8767012313674662, + "grad_norm": 0.06751275062561035, + "learning_rate": 6.849993249021108e-06, + "loss": 0.2163, + "step": 47854 + }, + { + "epoch": 3.8767822423849645, + "grad_norm": 0.06989561021327972, + "learning_rate": 6.8454925964264825e-06, + "loss": 0.2183, + "step": 47855 + }, + { + "epoch": 3.8768632534024627, + "grad_norm": 0.05857057496905327, + "learning_rate": 6.840991943831856e-06, + "loss": 0.2215, + "step": 47856 + }, + { + "epoch": 3.876944264419961, + "grad_norm": 0.09082645177841187, + "learning_rate": 6.836491291237229e-06, + "loss": 0.2536, + "step": 47857 + }, + { + "epoch": 3.8770252754374592, + "grad_norm": 0.0694134309887886, + "learning_rate": 6.831990638642603e-06, + "loss": 0.196, + "step": 47858 + }, + { + "epoch": 3.877106286454958, + "grad_norm": 0.07403349131345749, + "learning_rate": 6.827489986047978e-06, + "loss": 0.2341, + "step": 47859 + }, + { + "epoch": 3.877187297472456, + "grad_norm": 0.061810147017240524, + "learning_rate": 6.822989333453351e-06, + "loss": 0.2047, + "step": 47860 + }, + { + "epoch": 3.8772683084899544, + "grad_norm": 0.07295320928096771, + "learning_rate": 6.818488680858724e-06, + "loss": 0.257, + "step": 47861 + }, + { + "epoch": 3.877349319507453, + "grad_norm": 0.06367763131856918, + "learning_rate": 6.813988028264099e-06, + "loss": 0.2078, + "step": 47862 + }, + { + "epoch": 3.8774303305249513, + "grad_norm": 0.084407739341259, + "learning_rate": 6.809487375669472e-06, + "loss": 0.1852, + "step": 47863 + }, + { + "epoch": 3.8775113415424496, + "grad_norm": 0.07967525720596313, + "learning_rate": 6.804986723074847e-06, + "loss": 0.2155, + "step": 47864 + }, + { + "epoch": 3.8775923525599483, + "grad_norm": 0.0668228268623352, + "learning_rate": 6.80048607048022e-06, + "loss": 0.2117, + "step": 47865 + }, + { + "epoch": 3.8776733635774465, + "grad_norm": 0.06333678960800171, + "learning_rate": 6.795985417885593e-06, + "loss": 0.1996, + "step": 47866 + }, + { + "epoch": 3.8777543745949448, + "grad_norm": 0.07337368279695511, + "learning_rate": 6.7914847652909675e-06, + "loss": 0.2246, + "step": 47867 + }, + { + "epoch": 3.8778353856124435, + "grad_norm": 0.0809299424290657, + "learning_rate": 6.786984112696342e-06, + "loss": 0.2155, + "step": 47868 + }, + { + "epoch": 3.8779163966299417, + "grad_norm": 0.0768105685710907, + "learning_rate": 6.7824834601017155e-06, + "loss": 0.205, + "step": 47869 + }, + { + "epoch": 3.87799740764744, + "grad_norm": 0.07321664690971375, + "learning_rate": 6.777982807507088e-06, + "loss": 0.2521, + "step": 47870 + }, + { + "epoch": 3.8780784186649386, + "grad_norm": 0.06322470307350159, + "learning_rate": 6.773482154912463e-06, + "loss": 0.2193, + "step": 47871 + }, + { + "epoch": 3.878159429682437, + "grad_norm": 0.0774395614862442, + "learning_rate": 6.768981502317836e-06, + "loss": 0.2615, + "step": 47872 + }, + { + "epoch": 3.878240440699935, + "grad_norm": 0.07283959537744522, + "learning_rate": 6.764480849723211e-06, + "loss": 0.2156, + "step": 47873 + }, + { + "epoch": 3.878321451717434, + "grad_norm": 0.0624484121799469, + "learning_rate": 6.7599801971285835e-06, + "loss": 0.227, + "step": 47874 + }, + { + "epoch": 3.878402462734932, + "grad_norm": 0.07772690057754517, + "learning_rate": 6.755479544533957e-06, + "loss": 0.2378, + "step": 47875 + }, + { + "epoch": 3.8784834737524303, + "grad_norm": 0.06441731750965118, + "learning_rate": 6.750978891939332e-06, + "loss": 0.2066, + "step": 47876 + }, + { + "epoch": 3.8785644847699285, + "grad_norm": 0.07770302146673203, + "learning_rate": 6.746478239344706e-06, + "loss": 0.2372, + "step": 47877 + }, + { + "epoch": 3.8786454957874272, + "grad_norm": 0.07686355710029602, + "learning_rate": 6.741977586750079e-06, + "loss": 0.2351, + "step": 47878 + }, + { + "epoch": 3.8787265068049255, + "grad_norm": 0.09807416796684265, + "learning_rate": 6.737476934155452e-06, + "loss": 0.2255, + "step": 47879 + }, + { + "epoch": 3.8788075178224237, + "grad_norm": 0.06583904474973679, + "learning_rate": 6.732976281560827e-06, + "loss": 0.2109, + "step": 47880 + }, + { + "epoch": 3.878888528839922, + "grad_norm": 0.07584837824106216, + "learning_rate": 6.7284756289662005e-06, + "loss": 0.2419, + "step": 47881 + }, + { + "epoch": 3.8789695398574207, + "grad_norm": 0.05641576275229454, + "learning_rate": 6.723974976371575e-06, + "loss": 0.2139, + "step": 47882 + }, + { + "epoch": 3.879050550874919, + "grad_norm": 0.07774477452039719, + "learning_rate": 6.719474323776948e-06, + "loss": 0.2532, + "step": 47883 + }, + { + "epoch": 3.879131561892417, + "grad_norm": 0.06663135439157486, + "learning_rate": 6.714973671182321e-06, + "loss": 0.2139, + "step": 47884 + }, + { + "epoch": 3.879212572909916, + "grad_norm": 0.07237344980239868, + "learning_rate": 6.710473018587696e-06, + "loss": 0.188, + "step": 47885 + }, + { + "epoch": 3.879293583927414, + "grad_norm": 0.0650007426738739, + "learning_rate": 6.70597236599307e-06, + "loss": 0.2663, + "step": 47886 + }, + { + "epoch": 3.8793745949449123, + "grad_norm": 0.07538321614265442, + "learning_rate": 6.701471713398443e-06, + "loss": 0.2219, + "step": 47887 + }, + { + "epoch": 3.879455605962411, + "grad_norm": 0.0606030635535717, + "learning_rate": 6.6969710608038165e-06, + "loss": 0.2124, + "step": 47888 + }, + { + "epoch": 3.8795366169799093, + "grad_norm": 0.07034909725189209, + "learning_rate": 6.692470408209191e-06, + "loss": 0.2611, + "step": 47889 + }, + { + "epoch": 3.8796176279974075, + "grad_norm": 0.055674366652965546, + "learning_rate": 6.687969755614565e-06, + "loss": 0.1966, + "step": 47890 + }, + { + "epoch": 3.879698639014906, + "grad_norm": 0.06106676533818245, + "learning_rate": 6.683469103019937e-06, + "loss": 0.184, + "step": 47891 + }, + { + "epoch": 3.8797796500324044, + "grad_norm": 0.06368546187877655, + "learning_rate": 6.678968450425312e-06, + "loss": 0.2065, + "step": 47892 + }, + { + "epoch": 3.8798606610499027, + "grad_norm": 0.08897584676742554, + "learning_rate": 6.674467797830685e-06, + "loss": 0.2394, + "step": 47893 + }, + { + "epoch": 3.8799416720674014, + "grad_norm": 0.06789989769458771, + "learning_rate": 6.66996714523606e-06, + "loss": 0.2254, + "step": 47894 + }, + { + "epoch": 3.8800226830848996, + "grad_norm": 0.057918865233659744, + "learning_rate": 6.665466492641434e-06, + "loss": 0.2368, + "step": 47895 + }, + { + "epoch": 3.880103694102398, + "grad_norm": 0.06686865538358688, + "learning_rate": 6.660965840046807e-06, + "loss": 0.1985, + "step": 47896 + }, + { + "epoch": 3.8801847051198965, + "grad_norm": 0.07179523259401321, + "learning_rate": 6.656465187452181e-06, + "loss": 0.2398, + "step": 47897 + }, + { + "epoch": 3.880265716137395, + "grad_norm": 0.09124450385570526, + "learning_rate": 6.651964534857555e-06, + "loss": 0.2308, + "step": 47898 + }, + { + "epoch": 3.880346727154893, + "grad_norm": 0.06625550985336304, + "learning_rate": 6.647463882262929e-06, + "loss": 0.2009, + "step": 47899 + }, + { + "epoch": 3.8804277381723913, + "grad_norm": 0.08759491890668869, + "learning_rate": 6.6429632296683015e-06, + "loss": 0.2358, + "step": 47900 + }, + { + "epoch": 3.88050874918989, + "grad_norm": 0.08081474155187607, + "learning_rate": 6.638462577073676e-06, + "loss": 0.2259, + "step": 47901 + }, + { + "epoch": 3.880589760207388, + "grad_norm": 0.08028419315814972, + "learning_rate": 6.6339619244790495e-06, + "loss": 0.2476, + "step": 47902 + }, + { + "epoch": 3.8806707712248865, + "grad_norm": 0.0740567073225975, + "learning_rate": 6.629461271884424e-06, + "loss": 0.2456, + "step": 47903 + }, + { + "epoch": 3.8807517822423847, + "grad_norm": 0.06059883162379265, + "learning_rate": 6.624960619289797e-06, + "loss": 0.2061, + "step": 47904 + }, + { + "epoch": 3.8808327932598834, + "grad_norm": 0.06069719418883324, + "learning_rate": 6.620459966695171e-06, + "loss": 0.1905, + "step": 47905 + }, + { + "epoch": 3.8809138042773816, + "grad_norm": 0.07481228560209274, + "learning_rate": 6.615959314100545e-06, + "loss": 0.2472, + "step": 47906 + }, + { + "epoch": 3.88099481529488, + "grad_norm": 0.0695427656173706, + "learning_rate": 6.611458661505919e-06, + "loss": 0.2197, + "step": 47907 + }, + { + "epoch": 3.8810758263123786, + "grad_norm": 0.07964444160461426, + "learning_rate": 6.606958008911293e-06, + "loss": 0.2039, + "step": 47908 + }, + { + "epoch": 3.881156837329877, + "grad_norm": 0.06760581582784653, + "learning_rate": 6.602457356316666e-06, + "loss": 0.24, + "step": 47909 + }, + { + "epoch": 3.881237848347375, + "grad_norm": 0.08289235085248947, + "learning_rate": 6.59795670372204e-06, + "loss": 0.2075, + "step": 47910 + }, + { + "epoch": 3.8813188593648738, + "grad_norm": 0.05402611568570137, + "learning_rate": 6.593456051127414e-06, + "loss": 0.1988, + "step": 47911 + }, + { + "epoch": 3.881399870382372, + "grad_norm": 0.06173928454518318, + "learning_rate": 6.588955398532788e-06, + "loss": 0.2041, + "step": 47912 + }, + { + "epoch": 3.8814808813998702, + "grad_norm": 0.06798407435417175, + "learning_rate": 6.584454745938161e-06, + "loss": 0.2773, + "step": 47913 + }, + { + "epoch": 3.881561892417369, + "grad_norm": 0.07197803258895874, + "learning_rate": 6.579954093343535e-06, + "loss": 0.2325, + "step": 47914 + }, + { + "epoch": 3.881642903434867, + "grad_norm": 0.0872403159737587, + "learning_rate": 6.575453440748909e-06, + "loss": 0.2524, + "step": 47915 + }, + { + "epoch": 3.8817239144523654, + "grad_norm": 0.07195434719324112, + "learning_rate": 6.570952788154283e-06, + "loss": 0.2117, + "step": 47916 + }, + { + "epoch": 3.881804925469864, + "grad_norm": 0.05543294548988342, + "learning_rate": 6.566452135559656e-06, + "loss": 0.1988, + "step": 47917 + }, + { + "epoch": 3.8818859364873624, + "grad_norm": 0.0677102655172348, + "learning_rate": 6.56195148296503e-06, + "loss": 0.2404, + "step": 47918 + }, + { + "epoch": 3.8819669475048606, + "grad_norm": 0.07490658760070801, + "learning_rate": 6.557450830370404e-06, + "loss": 0.2111, + "step": 47919 + }, + { + "epoch": 3.8820479585223593, + "grad_norm": 0.06448684632778168, + "learning_rate": 6.552950177775778e-06, + "loss": 0.1931, + "step": 47920 + }, + { + "epoch": 3.8821289695398575, + "grad_norm": 0.06765627861022949, + "learning_rate": 6.5484495251811505e-06, + "loss": 0.2379, + "step": 47921 + }, + { + "epoch": 3.8822099805573558, + "grad_norm": 0.05987018346786499, + "learning_rate": 6.543948872586525e-06, + "loss": 0.2281, + "step": 47922 + }, + { + "epoch": 3.882290991574854, + "grad_norm": 0.07392005622386932, + "learning_rate": 6.5394482199918994e-06, + "loss": 0.2199, + "step": 47923 + }, + { + "epoch": 3.8823720025923527, + "grad_norm": 0.07084795087575912, + "learning_rate": 6.534947567397273e-06, + "loss": 0.2391, + "step": 47924 + }, + { + "epoch": 3.882453013609851, + "grad_norm": 0.06273969262838364, + "learning_rate": 6.5304469148026475e-06, + "loss": 0.2309, + "step": 47925 + }, + { + "epoch": 3.882534024627349, + "grad_norm": 0.08438969403505325, + "learning_rate": 6.52594626220802e-06, + "loss": 0.2198, + "step": 47926 + }, + { + "epoch": 3.8826150356448474, + "grad_norm": 0.0724400132894516, + "learning_rate": 6.521445609613394e-06, + "loss": 0.2152, + "step": 47927 + }, + { + "epoch": 3.882696046662346, + "grad_norm": 0.060646384954452515, + "learning_rate": 6.516944957018768e-06, + "loss": 0.2088, + "step": 47928 + }, + { + "epoch": 3.8827770576798444, + "grad_norm": 0.07137048989534378, + "learning_rate": 6.512444304424143e-06, + "loss": 0.2723, + "step": 47929 + }, + { + "epoch": 3.8828580686973426, + "grad_norm": 0.07384997606277466, + "learning_rate": 6.507943651829515e-06, + "loss": 0.2015, + "step": 47930 + }, + { + "epoch": 3.8829390797148413, + "grad_norm": 0.082190603017807, + "learning_rate": 6.503442999234889e-06, + "loss": 0.2473, + "step": 47931 + }, + { + "epoch": 3.8830200907323396, + "grad_norm": 0.07313903421163559, + "learning_rate": 6.4989423466402635e-06, + "loss": 0.2301, + "step": 47932 + }, + { + "epoch": 3.883101101749838, + "grad_norm": 0.0644293874502182, + "learning_rate": 6.494441694045637e-06, + "loss": 0.2226, + "step": 47933 + }, + { + "epoch": 3.8831821127673365, + "grad_norm": 0.0785566046833992, + "learning_rate": 6.48994104145101e-06, + "loss": 0.2528, + "step": 47934 + }, + { + "epoch": 3.8832631237848347, + "grad_norm": 0.08307922631502151, + "learning_rate": 6.485440388856384e-06, + "loss": 0.207, + "step": 47935 + }, + { + "epoch": 3.883344134802333, + "grad_norm": 0.07435651123523712, + "learning_rate": 6.480939736261758e-06, + "loss": 0.2639, + "step": 47936 + }, + { + "epoch": 3.8834251458198317, + "grad_norm": 0.06681717187166214, + "learning_rate": 6.476439083667132e-06, + "loss": 0.1967, + "step": 47937 + }, + { + "epoch": 3.88350615683733, + "grad_norm": 0.06831416487693787, + "learning_rate": 6.471938431072507e-06, + "loss": 0.2202, + "step": 47938 + }, + { + "epoch": 3.883587167854828, + "grad_norm": 0.06847839057445526, + "learning_rate": 6.467437778477879e-06, + "loss": 0.1872, + "step": 47939 + }, + { + "epoch": 3.883668178872327, + "grad_norm": 0.07324326038360596, + "learning_rate": 6.462937125883253e-06, + "loss": 0.2338, + "step": 47940 + }, + { + "epoch": 3.883749189889825, + "grad_norm": 0.06632672995328903, + "learning_rate": 6.458436473288628e-06, + "loss": 0.2185, + "step": 47941 + }, + { + "epoch": 3.8838302009073233, + "grad_norm": 0.09587211906909943, + "learning_rate": 6.453935820694001e-06, + "loss": 0.2197, + "step": 47942 + }, + { + "epoch": 3.883911211924822, + "grad_norm": 0.07188690453767776, + "learning_rate": 6.449435168099374e-06, + "loss": 0.206, + "step": 47943 + }, + { + "epoch": 3.8839922229423203, + "grad_norm": 0.06359101831912994, + "learning_rate": 6.4449345155047485e-06, + "loss": 0.1849, + "step": 47944 + }, + { + "epoch": 3.8840732339598185, + "grad_norm": 0.08528807014226913, + "learning_rate": 6.440433862910122e-06, + "loss": 0.2048, + "step": 47945 + }, + { + "epoch": 3.8841542449773168, + "grad_norm": 0.09145567566156387, + "learning_rate": 6.4359332103154965e-06, + "loss": 0.2217, + "step": 47946 + }, + { + "epoch": 3.8842352559948155, + "grad_norm": 0.0667794719338417, + "learning_rate": 6.431432557720869e-06, + "loss": 0.2307, + "step": 47947 + }, + { + "epoch": 3.8843162670123137, + "grad_norm": 0.08978570997714996, + "learning_rate": 6.426931905126244e-06, + "loss": 0.2436, + "step": 47948 + }, + { + "epoch": 3.884397278029812, + "grad_norm": 0.09070317447185516, + "learning_rate": 6.422431252531617e-06, + "loss": 0.2234, + "step": 47949 + }, + { + "epoch": 3.88447828904731, + "grad_norm": 0.07344508171081543, + "learning_rate": 6.417930599936992e-06, + "loss": 0.1885, + "step": 47950 + }, + { + "epoch": 3.884559300064809, + "grad_norm": 0.0661882534623146, + "learning_rate": 6.413429947342365e-06, + "loss": 0.1988, + "step": 47951 + }, + { + "epoch": 3.884640311082307, + "grad_norm": 0.07575134187936783, + "learning_rate": 6.408929294747738e-06, + "loss": 0.2317, + "step": 47952 + }, + { + "epoch": 3.8847213220998054, + "grad_norm": 0.08309940248727798, + "learning_rate": 6.404428642153113e-06, + "loss": 0.2172, + "step": 47953 + }, + { + "epoch": 3.884802333117304, + "grad_norm": 0.06687768548727036, + "learning_rate": 6.399927989558486e-06, + "loss": 0.211, + "step": 47954 + }, + { + "epoch": 3.8848833441348023, + "grad_norm": 0.10296569019556046, + "learning_rate": 6.395427336963861e-06, + "loss": 0.2346, + "step": 47955 + }, + { + "epoch": 3.8849643551523005, + "grad_norm": 0.0639154314994812, + "learning_rate": 6.390926684369233e-06, + "loss": 0.2162, + "step": 47956 + }, + { + "epoch": 3.8850453661697992, + "grad_norm": 0.07917410880327225, + "learning_rate": 6.386426031774608e-06, + "loss": 0.2054, + "step": 47957 + }, + { + "epoch": 3.8851263771872975, + "grad_norm": 0.08458743989467621, + "learning_rate": 6.3819253791799815e-06, + "loss": 0.2153, + "step": 47958 + }, + { + "epoch": 3.8852073882047957, + "grad_norm": 0.07421503216028214, + "learning_rate": 6.377424726585356e-06, + "loss": 0.2597, + "step": 47959 + }, + { + "epoch": 3.8852883992222944, + "grad_norm": 0.08566386252641678, + "learning_rate": 6.372924073990729e-06, + "loss": 0.2534, + "step": 47960 + }, + { + "epoch": 3.8853694102397927, + "grad_norm": 0.07812653481960297, + "learning_rate": 6.368423421396102e-06, + "loss": 0.2537, + "step": 47961 + }, + { + "epoch": 3.885450421257291, + "grad_norm": 0.09011181443929672, + "learning_rate": 6.363922768801477e-06, + "loss": 0.2197, + "step": 47962 + }, + { + "epoch": 3.8855314322747896, + "grad_norm": 0.07989250868558884, + "learning_rate": 6.35942211620685e-06, + "loss": 0.2303, + "step": 47963 + }, + { + "epoch": 3.885612443292288, + "grad_norm": 0.08667084574699402, + "learning_rate": 6.354921463612223e-06, + "loss": 0.247, + "step": 47964 + }, + { + "epoch": 3.885693454309786, + "grad_norm": 0.07098482549190521, + "learning_rate": 6.3504208110175975e-06, + "loss": 0.2297, + "step": 47965 + }, + { + "epoch": 3.8857744653272848, + "grad_norm": 0.06345818936824799, + "learning_rate": 6.345920158422972e-06, + "loss": 0.1796, + "step": 47966 + }, + { + "epoch": 3.885855476344783, + "grad_norm": 0.06717762351036072, + "learning_rate": 6.341419505828346e-06, + "loss": 0.2043, + "step": 47967 + }, + { + "epoch": 3.8859364873622813, + "grad_norm": 0.06535407900810242, + "learning_rate": 6.33691885323372e-06, + "loss": 0.2326, + "step": 47968 + }, + { + "epoch": 3.8860174983797795, + "grad_norm": 0.07947888225317001, + "learning_rate": 6.332418200639093e-06, + "loss": 0.2643, + "step": 47969 + }, + { + "epoch": 3.886098509397278, + "grad_norm": 0.07099038362503052, + "learning_rate": 6.327917548044466e-06, + "loss": 0.2183, + "step": 47970 + }, + { + "epoch": 3.8861795204147764, + "grad_norm": 0.08765587210655212, + "learning_rate": 6.323416895449841e-06, + "loss": 0.2197, + "step": 47971 + }, + { + "epoch": 3.8862605314322747, + "grad_norm": 0.06536171585321426, + "learning_rate": 6.3189162428552145e-06, + "loss": 0.1928, + "step": 47972 + }, + { + "epoch": 3.886341542449773, + "grad_norm": 0.08230137079954147, + "learning_rate": 6.314415590260587e-06, + "loss": 0.231, + "step": 47973 + }, + { + "epoch": 3.8864225534672716, + "grad_norm": 0.06578972190618515, + "learning_rate": 6.309914937665962e-06, + "loss": 0.2168, + "step": 47974 + }, + { + "epoch": 3.88650356448477, + "grad_norm": 0.06204579770565033, + "learning_rate": 6.305414285071336e-06, + "loss": 0.2093, + "step": 47975 + }, + { + "epoch": 3.886584575502268, + "grad_norm": 0.06162213534116745, + "learning_rate": 6.30091363247671e-06, + "loss": 0.2351, + "step": 47976 + }, + { + "epoch": 3.886665586519767, + "grad_norm": 0.07075859606266022, + "learning_rate": 6.2964129798820825e-06, + "loss": 0.1968, + "step": 47977 + }, + { + "epoch": 3.886746597537265, + "grad_norm": 0.08070050179958344, + "learning_rate": 6.291912327287457e-06, + "loss": 0.2021, + "step": 47978 + }, + { + "epoch": 3.8868276085547633, + "grad_norm": 0.07428973913192749, + "learning_rate": 6.2874116746928305e-06, + "loss": 0.249, + "step": 47979 + }, + { + "epoch": 3.886908619572262, + "grad_norm": 0.08572898060083389, + "learning_rate": 6.282911022098205e-06, + "loss": 0.2257, + "step": 47980 + }, + { + "epoch": 3.88698963058976, + "grad_norm": 0.06448810547590256, + "learning_rate": 6.278410369503579e-06, + "loss": 0.1896, + "step": 47981 + }, + { + "epoch": 3.8870706416072585, + "grad_norm": 0.05051233246922493, + "learning_rate": 6.273909716908951e-06, + "loss": 0.2156, + "step": 47982 + }, + { + "epoch": 3.887151652624757, + "grad_norm": 0.08261623233556747, + "learning_rate": 6.269409064314326e-06, + "loss": 0.2675, + "step": 47983 + }, + { + "epoch": 3.8872326636422554, + "grad_norm": 0.05112019553780556, + "learning_rate": 6.2649084117197e-06, + "loss": 0.2252, + "step": 47984 + }, + { + "epoch": 3.8873136746597536, + "grad_norm": 0.09426064044237137, + "learning_rate": 6.260407759125074e-06, + "loss": 0.2285, + "step": 47985 + }, + { + "epoch": 3.8873946856772523, + "grad_norm": 0.08980914205312729, + "learning_rate": 6.255907106530447e-06, + "loss": 0.2585, + "step": 47986 + }, + { + "epoch": 3.8874756966947506, + "grad_norm": 0.0838213637471199, + "learning_rate": 6.251406453935821e-06, + "loss": 0.1998, + "step": 47987 + }, + { + "epoch": 3.887556707712249, + "grad_norm": 0.07811640948057175, + "learning_rate": 6.246905801341195e-06, + "loss": 0.2306, + "step": 47988 + }, + { + "epoch": 3.8876377187297475, + "grad_norm": 0.08223123103380203, + "learning_rate": 6.242405148746568e-06, + "loss": 0.2695, + "step": 47989 + }, + { + "epoch": 3.8877187297472457, + "grad_norm": 0.0751977488398552, + "learning_rate": 6.237904496151943e-06, + "loss": 0.2244, + "step": 47990 + }, + { + "epoch": 3.887799740764744, + "grad_norm": 0.06286336481571198, + "learning_rate": 6.2334038435573155e-06, + "loss": 0.202, + "step": 47991 + }, + { + "epoch": 3.8878807517822422, + "grad_norm": 0.08371991664171219, + "learning_rate": 6.22890319096269e-06, + "loss": 0.229, + "step": 47992 + }, + { + "epoch": 3.887961762799741, + "grad_norm": 0.054850783199071884, + "learning_rate": 6.2244025383680635e-06, + "loss": 0.2143, + "step": 47993 + }, + { + "epoch": 3.888042773817239, + "grad_norm": 0.056063272058963776, + "learning_rate": 6.219901885773437e-06, + "loss": 0.1868, + "step": 47994 + }, + { + "epoch": 3.8881237848347374, + "grad_norm": 0.07119081169366837, + "learning_rate": 6.2154012331788116e-06, + "loss": 0.2235, + "step": 47995 + }, + { + "epoch": 3.8882047958522357, + "grad_norm": 0.10805629193782806, + "learning_rate": 6.210900580584185e-06, + "loss": 0.1982, + "step": 47996 + }, + { + "epoch": 3.8882858068697344, + "grad_norm": 0.0731663703918457, + "learning_rate": 6.206399927989559e-06, + "loss": 0.2528, + "step": 47997 + }, + { + "epoch": 3.8883668178872326, + "grad_norm": 0.07942456007003784, + "learning_rate": 6.201899275394932e-06, + "loss": 0.2717, + "step": 47998 + }, + { + "epoch": 3.888447828904731, + "grad_norm": 0.0707758367061615, + "learning_rate": 6.197398622800307e-06, + "loss": 0.2002, + "step": 47999 + }, + { + "epoch": 3.8885288399222295, + "grad_norm": 0.06386274099349976, + "learning_rate": 6.19289797020568e-06, + "loss": 0.2274, + "step": 48000 + }, + { + "epoch": 3.8886098509397278, + "grad_norm": 0.07666601985692978, + "learning_rate": 6.188397317611054e-06, + "loss": 0.2049, + "step": 48001 + }, + { + "epoch": 3.888690861957226, + "grad_norm": 0.07881522923707962, + "learning_rate": 6.183896665016428e-06, + "loss": 0.2338, + "step": 48002 + }, + { + "epoch": 3.8887718729747247, + "grad_norm": 0.07827413827180862, + "learning_rate": 6.179396012421801e-06, + "loss": 0.2311, + "step": 48003 + }, + { + "epoch": 3.888852883992223, + "grad_norm": 0.08375756442546844, + "learning_rate": 6.174895359827175e-06, + "loss": 0.2089, + "step": 48004 + }, + { + "epoch": 3.888933895009721, + "grad_norm": 0.06420580297708511, + "learning_rate": 6.170394707232549e-06, + "loss": 0.2367, + "step": 48005 + }, + { + "epoch": 3.88901490602722, + "grad_norm": 0.08399081230163574, + "learning_rate": 6.165894054637923e-06, + "loss": 0.2573, + "step": 48006 + }, + { + "epoch": 3.889095917044718, + "grad_norm": 0.07225336134433746, + "learning_rate": 6.1613934020432965e-06, + "loss": 0.2114, + "step": 48007 + }, + { + "epoch": 3.8891769280622164, + "grad_norm": 0.07664166390895844, + "learning_rate": 6.15689274944867e-06, + "loss": 0.2658, + "step": 48008 + }, + { + "epoch": 3.889257939079715, + "grad_norm": 0.07305591553449631, + "learning_rate": 6.152392096854044e-06, + "loss": 0.2189, + "step": 48009 + }, + { + "epoch": 3.8893389500972133, + "grad_norm": 0.06890987604856491, + "learning_rate": 6.147891444259418e-06, + "loss": 0.2175, + "step": 48010 + }, + { + "epoch": 3.8894199611147116, + "grad_norm": 0.06536436080932617, + "learning_rate": 6.143390791664792e-06, + "loss": 0.251, + "step": 48011 + }, + { + "epoch": 3.8895009721322102, + "grad_norm": 0.07678571343421936, + "learning_rate": 6.138890139070165e-06, + "loss": 0.2241, + "step": 48012 + }, + { + "epoch": 3.8895819831497085, + "grad_norm": 0.06775952875614166, + "learning_rate": 6.134389486475539e-06, + "loss": 0.2281, + "step": 48013 + }, + { + "epoch": 3.8896629941672067, + "grad_norm": 0.09381411224603653, + "learning_rate": 6.129888833880913e-06, + "loss": 0.2554, + "step": 48014 + }, + { + "epoch": 3.889744005184705, + "grad_norm": 0.05685148388147354, + "learning_rate": 6.125388181286287e-06, + "loss": 0.239, + "step": 48015 + }, + { + "epoch": 3.8898250162022032, + "grad_norm": 0.07663068175315857, + "learning_rate": 6.120887528691661e-06, + "loss": 0.2633, + "step": 48016 + }, + { + "epoch": 3.889906027219702, + "grad_norm": 0.0696638897061348, + "learning_rate": 6.116386876097034e-06, + "loss": 0.2184, + "step": 48017 + }, + { + "epoch": 3.8899870382372, + "grad_norm": 0.06149803474545479, + "learning_rate": 6.111886223502408e-06, + "loss": 0.2162, + "step": 48018 + }, + { + "epoch": 3.8900680492546984, + "grad_norm": 0.07225798070430756, + "learning_rate": 6.1073855709077814e-06, + "loss": 0.2452, + "step": 48019 + }, + { + "epoch": 3.890149060272197, + "grad_norm": 0.06644891202449799, + "learning_rate": 6.102884918313156e-06, + "loss": 0.2403, + "step": 48020 + }, + { + "epoch": 3.8902300712896953, + "grad_norm": 0.07242048531770706, + "learning_rate": 6.0983842657185295e-06, + "loss": 0.2397, + "step": 48021 + }, + { + "epoch": 3.8903110823071936, + "grad_norm": 0.07406973838806152, + "learning_rate": 6.093883613123903e-06, + "loss": 0.223, + "step": 48022 + }, + { + "epoch": 3.8903920933246923, + "grad_norm": 0.06930666416883469, + "learning_rate": 6.0893829605292775e-06, + "loss": 0.2271, + "step": 48023 + }, + { + "epoch": 3.8904731043421905, + "grad_norm": 0.07608542591333389, + "learning_rate": 6.084882307934651e-06, + "loss": 0.2076, + "step": 48024 + }, + { + "epoch": 3.8905541153596888, + "grad_norm": 0.06247393414378166, + "learning_rate": 6.080381655340025e-06, + "loss": 0.2051, + "step": 48025 + }, + { + "epoch": 3.8906351263771874, + "grad_norm": 0.061167728155851364, + "learning_rate": 6.075881002745398e-06, + "loss": 0.2088, + "step": 48026 + }, + { + "epoch": 3.8907161373946857, + "grad_norm": 0.06517788022756577, + "learning_rate": 6.071380350150773e-06, + "loss": 0.241, + "step": 48027 + }, + { + "epoch": 3.890797148412184, + "grad_norm": 0.07304881513118744, + "learning_rate": 6.0668796975561456e-06, + "loss": 0.2088, + "step": 48028 + }, + { + "epoch": 3.8908781594296826, + "grad_norm": 0.06430026143789291, + "learning_rate": 6.06237904496152e-06, + "loss": 0.235, + "step": 48029 + }, + { + "epoch": 3.890959170447181, + "grad_norm": 0.06506291031837463, + "learning_rate": 6.057878392366894e-06, + "loss": 0.1662, + "step": 48030 + }, + { + "epoch": 3.891040181464679, + "grad_norm": 0.06876938790082932, + "learning_rate": 6.053377739772267e-06, + "loss": 0.2141, + "step": 48031 + }, + { + "epoch": 3.891121192482178, + "grad_norm": 0.08055271953344345, + "learning_rate": 6.048877087177641e-06, + "loss": 0.2293, + "step": 48032 + }, + { + "epoch": 3.891202203499676, + "grad_norm": 0.07058732956647873, + "learning_rate": 6.044376434583015e-06, + "loss": 0.2289, + "step": 48033 + }, + { + "epoch": 3.8912832145171743, + "grad_norm": 0.086977019906044, + "learning_rate": 6.039875781988388e-06, + "loss": 0.2359, + "step": 48034 + }, + { + "epoch": 3.891364225534673, + "grad_norm": 0.07203526794910431, + "learning_rate": 6.0353751293937625e-06, + "loss": 0.2086, + "step": 48035 + }, + { + "epoch": 3.8914452365521712, + "grad_norm": 0.06266345083713531, + "learning_rate": 6.030874476799136e-06, + "loss": 0.1983, + "step": 48036 + }, + { + "epoch": 3.8915262475696695, + "grad_norm": 0.07486934214830399, + "learning_rate": 6.02637382420451e-06, + "loss": 0.235, + "step": 48037 + }, + { + "epoch": 3.8916072585871677, + "grad_norm": 0.07059779763221741, + "learning_rate": 6.021873171609884e-06, + "loss": 0.2082, + "step": 48038 + }, + { + "epoch": 3.891688269604666, + "grad_norm": 0.06780023127794266, + "learning_rate": 6.017372519015258e-06, + "loss": 0.2632, + "step": 48039 + }, + { + "epoch": 3.8917692806221647, + "grad_norm": 0.0676514059305191, + "learning_rate": 6.012871866420631e-06, + "loss": 0.2406, + "step": 48040 + }, + { + "epoch": 3.891850291639663, + "grad_norm": 0.06825751811265945, + "learning_rate": 6.008371213826005e-06, + "loss": 0.2402, + "step": 48041 + }, + { + "epoch": 3.891931302657161, + "grad_norm": 0.06496651470661163, + "learning_rate": 6.003870561231379e-06, + "loss": 0.2294, + "step": 48042 + }, + { + "epoch": 3.89201231367466, + "grad_norm": 0.0673632025718689, + "learning_rate": 5.999369908636752e-06, + "loss": 0.206, + "step": 48043 + }, + { + "epoch": 3.892093324692158, + "grad_norm": 0.059508103877305984, + "learning_rate": 5.994869256042127e-06, + "loss": 0.2146, + "step": 48044 + }, + { + "epoch": 3.8921743357096563, + "grad_norm": 0.07362375408411026, + "learning_rate": 5.9903686034475e-06, + "loss": 0.2195, + "step": 48045 + }, + { + "epoch": 3.892255346727155, + "grad_norm": 0.07658713310956955, + "learning_rate": 5.985867950852874e-06, + "loss": 0.2371, + "step": 48046 + }, + { + "epoch": 3.8923363577446533, + "grad_norm": 0.06624037772417068, + "learning_rate": 5.981367298258247e-06, + "loss": 0.1921, + "step": 48047 + }, + { + "epoch": 3.8924173687621515, + "grad_norm": 0.08692865073680878, + "learning_rate": 5.976866645663622e-06, + "loss": 0.2362, + "step": 48048 + }, + { + "epoch": 3.89249837977965, + "grad_norm": 0.07544679939746857, + "learning_rate": 5.972365993068995e-06, + "loss": 0.221, + "step": 48049 + }, + { + "epoch": 3.8925793907971484, + "grad_norm": 0.05986184626817703, + "learning_rate": 5.967865340474369e-06, + "loss": 0.2001, + "step": 48050 + }, + { + "epoch": 3.8926604018146467, + "grad_norm": 0.08265157043933868, + "learning_rate": 5.963364687879743e-06, + "loss": 0.3066, + "step": 48051 + }, + { + "epoch": 3.8927414128321454, + "grad_norm": 0.07029622048139572, + "learning_rate": 5.958864035285116e-06, + "loss": 0.2507, + "step": 48052 + }, + { + "epoch": 3.8928224238496436, + "grad_norm": 0.07296372205018997, + "learning_rate": 5.954363382690491e-06, + "loss": 0.2628, + "step": 48053 + }, + { + "epoch": 3.892903434867142, + "grad_norm": 0.08202331513166428, + "learning_rate": 5.949862730095864e-06, + "loss": 0.236, + "step": 48054 + }, + { + "epoch": 3.8929844458846405, + "grad_norm": 0.07177530229091644, + "learning_rate": 5.945362077501238e-06, + "loss": 0.2239, + "step": 48055 + }, + { + "epoch": 3.893065456902139, + "grad_norm": 0.06391312927007675, + "learning_rate": 5.9408614249066115e-06, + "loss": 0.2415, + "step": 48056 + }, + { + "epoch": 3.893146467919637, + "grad_norm": 0.07368528842926025, + "learning_rate": 5.936360772311986e-06, + "loss": 0.2102, + "step": 48057 + }, + { + "epoch": 3.8932274789371357, + "grad_norm": 0.09229487180709839, + "learning_rate": 5.931860119717359e-06, + "loss": 0.2683, + "step": 48058 + }, + { + "epoch": 3.893308489954634, + "grad_norm": 0.07881072908639908, + "learning_rate": 5.927359467122733e-06, + "loss": 0.205, + "step": 48059 + }, + { + "epoch": 3.893389500972132, + "grad_norm": 0.08213283866643906, + "learning_rate": 5.922858814528107e-06, + "loss": 0.2216, + "step": 48060 + }, + { + "epoch": 3.8934705119896305, + "grad_norm": 0.06197618693113327, + "learning_rate": 5.91835816193348e-06, + "loss": 0.2131, + "step": 48061 + }, + { + "epoch": 3.8935515230071287, + "grad_norm": 0.06296448409557343, + "learning_rate": 5.913857509338854e-06, + "loss": 0.2414, + "step": 48062 + }, + { + "epoch": 3.8936325340246274, + "grad_norm": 0.057206038385629654, + "learning_rate": 5.9093568567442285e-06, + "loss": 0.2052, + "step": 48063 + }, + { + "epoch": 3.8937135450421256, + "grad_norm": 0.06184779480099678, + "learning_rate": 5.904856204149602e-06, + "loss": 0.1972, + "step": 48064 + }, + { + "epoch": 3.893794556059624, + "grad_norm": 0.06405945867300034, + "learning_rate": 5.900355551554976e-06, + "loss": 0.1939, + "step": 48065 + }, + { + "epoch": 3.8938755670771226, + "grad_norm": 0.08158735185861588, + "learning_rate": 5.89585489896035e-06, + "loss": 0.2208, + "step": 48066 + }, + { + "epoch": 3.893956578094621, + "grad_norm": 0.06656576693058014, + "learning_rate": 5.891354246365723e-06, + "loss": 0.2169, + "step": 48067 + }, + { + "epoch": 3.894037589112119, + "grad_norm": 0.06311673671007156, + "learning_rate": 5.886853593771097e-06, + "loss": 0.2384, + "step": 48068 + }, + { + "epoch": 3.8941186001296177, + "grad_norm": 0.0772138312458992, + "learning_rate": 5.882352941176471e-06, + "loss": 0.2311, + "step": 48069 + }, + { + "epoch": 3.894199611147116, + "grad_norm": 0.0871867686510086, + "learning_rate": 5.8778522885818445e-06, + "loss": 0.22, + "step": 48070 + }, + { + "epoch": 3.8942806221646142, + "grad_norm": 0.07590407133102417, + "learning_rate": 5.873351635987218e-06, + "loss": 0.2274, + "step": 48071 + }, + { + "epoch": 3.894361633182113, + "grad_norm": 0.0857917070388794, + "learning_rate": 5.868850983392593e-06, + "loss": 0.2184, + "step": 48072 + }, + { + "epoch": 3.894442644199611, + "grad_norm": 0.07525145262479782, + "learning_rate": 5.864350330797966e-06, + "loss": 0.1988, + "step": 48073 + }, + { + "epoch": 3.8945236552171094, + "grad_norm": 0.06835979968309402, + "learning_rate": 5.85984967820334e-06, + "loss": 0.2214, + "step": 48074 + }, + { + "epoch": 3.894604666234608, + "grad_norm": 0.06695519387722015, + "learning_rate": 5.855349025608713e-06, + "loss": 0.2288, + "step": 48075 + }, + { + "epoch": 3.8946856772521063, + "grad_norm": 0.08336454629898071, + "learning_rate": 5.850848373014088e-06, + "loss": 0.2016, + "step": 48076 + }, + { + "epoch": 3.8947666882696046, + "grad_norm": 0.06542129814624786, + "learning_rate": 5.846347720419461e-06, + "loss": 0.1926, + "step": 48077 + }, + { + "epoch": 3.8948476992871033, + "grad_norm": 0.07468671351671219, + "learning_rate": 5.841847067824835e-06, + "loss": 0.2031, + "step": 48078 + }, + { + "epoch": 3.8949287103046015, + "grad_norm": 0.08390598744153976, + "learning_rate": 5.837346415230209e-06, + "loss": 0.2465, + "step": 48079 + }, + { + "epoch": 3.8950097213220998, + "grad_norm": 0.0719221755862236, + "learning_rate": 5.832845762635582e-06, + "loss": 0.2255, + "step": 48080 + }, + { + "epoch": 3.8950907323395985, + "grad_norm": 0.07787550985813141, + "learning_rate": 5.828345110040957e-06, + "loss": 0.1899, + "step": 48081 + }, + { + "epoch": 3.8951717433570967, + "grad_norm": 0.06885991245508194, + "learning_rate": 5.82384445744633e-06, + "loss": 0.1858, + "step": 48082 + }, + { + "epoch": 3.895252754374595, + "grad_norm": 0.08002043515443802, + "learning_rate": 5.819343804851704e-06, + "loss": 0.2095, + "step": 48083 + }, + { + "epoch": 3.895333765392093, + "grad_norm": 0.06390658766031265, + "learning_rate": 5.8148431522570775e-06, + "loss": 0.2274, + "step": 48084 + }, + { + "epoch": 3.8954147764095914, + "grad_norm": 0.0804852768778801, + "learning_rate": 5.810342499662452e-06, + "loss": 0.2423, + "step": 48085 + }, + { + "epoch": 3.89549578742709, + "grad_norm": 0.06975232064723969, + "learning_rate": 5.805841847067825e-06, + "loss": 0.198, + "step": 48086 + }, + { + "epoch": 3.8955767984445884, + "grad_norm": 0.09129457175731659, + "learning_rate": 5.801341194473199e-06, + "loss": 0.1861, + "step": 48087 + }, + { + "epoch": 3.8956578094620866, + "grad_norm": 0.07378769665956497, + "learning_rate": 5.796840541878573e-06, + "loss": 0.2115, + "step": 48088 + }, + { + "epoch": 3.8957388204795853, + "grad_norm": 0.07973496615886688, + "learning_rate": 5.792339889283946e-06, + "loss": 0.2271, + "step": 48089 + }, + { + "epoch": 3.8958198314970836, + "grad_norm": 0.07622456550598145, + "learning_rate": 5.78783923668932e-06, + "loss": 0.2271, + "step": 48090 + }, + { + "epoch": 3.895900842514582, + "grad_norm": 0.06588424742221832, + "learning_rate": 5.7833385840946944e-06, + "loss": 0.2054, + "step": 48091 + }, + { + "epoch": 3.8959818535320805, + "grad_norm": 0.06296347826719284, + "learning_rate": 5.778837931500067e-06, + "loss": 0.1969, + "step": 48092 + }, + { + "epoch": 3.8960628645495787, + "grad_norm": 0.07207436114549637, + "learning_rate": 5.774337278905442e-06, + "loss": 0.2263, + "step": 48093 + }, + { + "epoch": 3.896143875567077, + "grad_norm": 0.07418610155582428, + "learning_rate": 5.769836626310815e-06, + "loss": 0.2296, + "step": 48094 + }, + { + "epoch": 3.8962248865845757, + "grad_norm": 0.08219721913337708, + "learning_rate": 5.765335973716189e-06, + "loss": 0.2525, + "step": 48095 + }, + { + "epoch": 3.896305897602074, + "grad_norm": 0.06975866854190826, + "learning_rate": 5.760835321121563e-06, + "loss": 0.2565, + "step": 48096 + }, + { + "epoch": 3.896386908619572, + "grad_norm": 0.06241089478135109, + "learning_rate": 5.756334668526937e-06, + "loss": 0.2418, + "step": 48097 + }, + { + "epoch": 3.896467919637071, + "grad_norm": 0.0809803307056427, + "learning_rate": 5.7518340159323105e-06, + "loss": 0.2403, + "step": 48098 + }, + { + "epoch": 3.896548930654569, + "grad_norm": 0.06909254193305969, + "learning_rate": 5.747333363337684e-06, + "loss": 0.2054, + "step": 48099 + }, + { + "epoch": 3.8966299416720673, + "grad_norm": 0.06723940372467041, + "learning_rate": 5.7428327107430586e-06, + "loss": 0.2459, + "step": 48100 + }, + { + "epoch": 3.896710952689566, + "grad_norm": 0.07118137925863266, + "learning_rate": 5.738332058148431e-06, + "loss": 0.2237, + "step": 48101 + }, + { + "epoch": 3.8967919637070643, + "grad_norm": 0.07935565710067749, + "learning_rate": 5.733831405553806e-06, + "loss": 0.2189, + "step": 48102 + }, + { + "epoch": 3.8968729747245625, + "grad_norm": 0.08424384146928787, + "learning_rate": 5.729330752959179e-06, + "loss": 0.2415, + "step": 48103 + }, + { + "epoch": 3.8969539857420608, + "grad_norm": 0.07037527859210968, + "learning_rate": 5.724830100364553e-06, + "loss": 0.1857, + "step": 48104 + }, + { + "epoch": 3.8970349967595594, + "grad_norm": 0.06845095008611679, + "learning_rate": 5.7203294477699266e-06, + "loss": 0.2233, + "step": 48105 + }, + { + "epoch": 3.8971160077770577, + "grad_norm": 0.08850236237049103, + "learning_rate": 5.715828795175301e-06, + "loss": 0.279, + "step": 48106 + }, + { + "epoch": 3.897197018794556, + "grad_norm": 0.06084459275007248, + "learning_rate": 5.711328142580674e-06, + "loss": 0.201, + "step": 48107 + }, + { + "epoch": 3.897278029812054, + "grad_norm": 0.07755912840366364, + "learning_rate": 5.706827489986048e-06, + "loss": 0.2315, + "step": 48108 + }, + { + "epoch": 3.897359040829553, + "grad_norm": 0.06639105826616287, + "learning_rate": 5.702326837391423e-06, + "loss": 0.2502, + "step": 48109 + }, + { + "epoch": 3.897440051847051, + "grad_norm": 0.07094542682170868, + "learning_rate": 5.6978261847967954e-06, + "loss": 0.1805, + "step": 48110 + }, + { + "epoch": 3.8975210628645494, + "grad_norm": 0.07018940150737762, + "learning_rate": 5.69332553220217e-06, + "loss": 0.1724, + "step": 48111 + }, + { + "epoch": 3.897602073882048, + "grad_norm": 0.06900765746831894, + "learning_rate": 5.6888248796075435e-06, + "loss": 0.2331, + "step": 48112 + }, + { + "epoch": 3.8976830848995463, + "grad_norm": 0.0696382075548172, + "learning_rate": 5.684324227012917e-06, + "loss": 0.2468, + "step": 48113 + }, + { + "epoch": 3.8977640959170445, + "grad_norm": 0.06245898827910423, + "learning_rate": 5.679823574418291e-06, + "loss": 0.2286, + "step": 48114 + }, + { + "epoch": 3.8978451069345432, + "grad_norm": 0.07008770108222961, + "learning_rate": 5.675322921823665e-06, + "loss": 0.2411, + "step": 48115 + }, + { + "epoch": 3.8979261179520415, + "grad_norm": 0.05966051667928696, + "learning_rate": 5.670822269229038e-06, + "loss": 0.1998, + "step": 48116 + }, + { + "epoch": 3.8980071289695397, + "grad_norm": 0.07350362837314606, + "learning_rate": 5.666321616634412e-06, + "loss": 0.2289, + "step": 48117 + }, + { + "epoch": 3.8980881399870384, + "grad_norm": 0.06739611178636551, + "learning_rate": 5.661820964039786e-06, + "loss": 0.2126, + "step": 48118 + }, + { + "epoch": 3.8981691510045366, + "grad_norm": 0.06899629533290863, + "learning_rate": 5.6573203114451596e-06, + "loss": 0.2236, + "step": 48119 + }, + { + "epoch": 3.898250162022035, + "grad_norm": 0.06305073946714401, + "learning_rate": 5.652819658850533e-06, + "loss": 0.2044, + "step": 48120 + }, + { + "epoch": 3.8983311730395336, + "grad_norm": 0.0846519023180008, + "learning_rate": 5.648319006255908e-06, + "loss": 0.2821, + "step": 48121 + }, + { + "epoch": 3.898412184057032, + "grad_norm": 0.06912169605493546, + "learning_rate": 5.643818353661281e-06, + "loss": 0.2427, + "step": 48122 + }, + { + "epoch": 3.89849319507453, + "grad_norm": 0.07149054855108261, + "learning_rate": 5.639317701066655e-06, + "loss": 0.2001, + "step": 48123 + }, + { + "epoch": 3.8985742060920288, + "grad_norm": 0.05685052648186684, + "learning_rate": 5.634817048472029e-06, + "loss": 0.2235, + "step": 48124 + }, + { + "epoch": 3.898655217109527, + "grad_norm": 0.0757313072681427, + "learning_rate": 5.630316395877403e-06, + "loss": 0.2373, + "step": 48125 + }, + { + "epoch": 3.8987362281270252, + "grad_norm": 0.08734872937202454, + "learning_rate": 5.6258157432827765e-06, + "loss": 0.2707, + "step": 48126 + }, + { + "epoch": 3.8988172391445235, + "grad_norm": 0.07042701542377472, + "learning_rate": 5.62131509068815e-06, + "loss": 0.2077, + "step": 48127 + }, + { + "epoch": 3.898898250162022, + "grad_norm": 0.06531260907649994, + "learning_rate": 5.616814438093524e-06, + "loss": 0.2404, + "step": 48128 + }, + { + "epoch": 3.8989792611795204, + "grad_norm": 0.06730636954307556, + "learning_rate": 5.612313785498897e-06, + "loss": 0.2318, + "step": 48129 + }, + { + "epoch": 3.8990602721970187, + "grad_norm": 0.08711560815572739, + "learning_rate": 5.607813132904272e-06, + "loss": 0.2223, + "step": 48130 + }, + { + "epoch": 3.899141283214517, + "grad_norm": 0.06874527782201767, + "learning_rate": 5.603312480309645e-06, + "loss": 0.2563, + "step": 48131 + }, + { + "epoch": 3.8992222942320156, + "grad_norm": 0.08103624731302261, + "learning_rate": 5.598811827715019e-06, + "loss": 0.2288, + "step": 48132 + }, + { + "epoch": 3.899303305249514, + "grad_norm": 0.07119531184434891, + "learning_rate": 5.5943111751203925e-06, + "loss": 0.2334, + "step": 48133 + }, + { + "epoch": 3.899384316267012, + "grad_norm": 0.06922445446252823, + "learning_rate": 5.589810522525767e-06, + "loss": 0.2301, + "step": 48134 + }, + { + "epoch": 3.899465327284511, + "grad_norm": 0.07962538301944733, + "learning_rate": 5.58530986993114e-06, + "loss": 0.2631, + "step": 48135 + }, + { + "epoch": 3.899546338302009, + "grad_norm": 0.07506944984197617, + "learning_rate": 5.580809217336514e-06, + "loss": 0.2203, + "step": 48136 + }, + { + "epoch": 3.8996273493195073, + "grad_norm": 0.08897983282804489, + "learning_rate": 5.576308564741888e-06, + "loss": 0.2219, + "step": 48137 + }, + { + "epoch": 3.899708360337006, + "grad_norm": 0.06361392885446548, + "learning_rate": 5.571807912147261e-06, + "loss": 0.2409, + "step": 48138 + }, + { + "epoch": 3.899789371354504, + "grad_norm": 0.07459505647420883, + "learning_rate": 5.567307259552636e-06, + "loss": 0.2043, + "step": 48139 + }, + { + "epoch": 3.8998703823720025, + "grad_norm": 0.059818338602781296, + "learning_rate": 5.5628066069580095e-06, + "loss": 0.2136, + "step": 48140 + }, + { + "epoch": 3.899951393389501, + "grad_norm": 0.08024513721466064, + "learning_rate": 5.558305954363383e-06, + "loss": 0.2156, + "step": 48141 + }, + { + "epoch": 3.9000324044069994, + "grad_norm": 0.06556698679924011, + "learning_rate": 5.553805301768757e-06, + "loss": 0.2035, + "step": 48142 + }, + { + "epoch": 3.9001134154244976, + "grad_norm": 0.08273103833198547, + "learning_rate": 5.549304649174131e-06, + "loss": 0.251, + "step": 48143 + }, + { + "epoch": 3.9001944264419963, + "grad_norm": 0.07010772824287415, + "learning_rate": 5.544803996579504e-06, + "loss": 0.2426, + "step": 48144 + }, + { + "epoch": 3.9002754374594946, + "grad_norm": 0.07389405369758606, + "learning_rate": 5.540303343984878e-06, + "loss": 0.2398, + "step": 48145 + }, + { + "epoch": 3.900356448476993, + "grad_norm": 0.051351238042116165, + "learning_rate": 5.535802691390252e-06, + "loss": 0.2364, + "step": 48146 + }, + { + "epoch": 3.9004374594944915, + "grad_norm": 0.0765320435166359, + "learning_rate": 5.5313020387956255e-06, + "loss": 0.2237, + "step": 48147 + }, + { + "epoch": 3.9005184705119897, + "grad_norm": 0.07203897833824158, + "learning_rate": 5.526801386200999e-06, + "loss": 0.2089, + "step": 48148 + }, + { + "epoch": 3.900599481529488, + "grad_norm": 0.06963177770376205, + "learning_rate": 5.522300733606374e-06, + "loss": 0.21, + "step": 48149 + }, + { + "epoch": 3.9006804925469862, + "grad_norm": 0.07618485391139984, + "learning_rate": 5.517800081011746e-06, + "loss": 0.2284, + "step": 48150 + }, + { + "epoch": 3.900761503564485, + "grad_norm": 0.08152638375759125, + "learning_rate": 5.513299428417121e-06, + "loss": 0.2354, + "step": 48151 + }, + { + "epoch": 3.900842514581983, + "grad_norm": 0.07824641466140747, + "learning_rate": 5.508798775822494e-06, + "loss": 0.2582, + "step": 48152 + }, + { + "epoch": 3.9009235255994814, + "grad_norm": 0.07195582240819931, + "learning_rate": 5.504298123227868e-06, + "loss": 0.1995, + "step": 48153 + }, + { + "epoch": 3.9010045366169797, + "grad_norm": 0.07551611214876175, + "learning_rate": 5.4997974706332424e-06, + "loss": 0.2055, + "step": 48154 + }, + { + "epoch": 3.9010855476344783, + "grad_norm": 0.06853777915239334, + "learning_rate": 5.495296818038616e-06, + "loss": 0.1924, + "step": 48155 + }, + { + "epoch": 3.9011665586519766, + "grad_norm": 0.09760008007287979, + "learning_rate": 5.49079616544399e-06, + "loss": 0.2279, + "step": 48156 + }, + { + "epoch": 3.901247569669475, + "grad_norm": 0.060199085623025894, + "learning_rate": 5.486295512849363e-06, + "loss": 0.1821, + "step": 48157 + }, + { + "epoch": 3.9013285806869735, + "grad_norm": 0.07289555668830872, + "learning_rate": 5.481794860254738e-06, + "loss": 0.241, + "step": 48158 + }, + { + "epoch": 3.9014095917044718, + "grad_norm": 0.08234255015850067, + "learning_rate": 5.4772942076601105e-06, + "loss": 0.2462, + "step": 48159 + }, + { + "epoch": 3.90149060272197, + "grad_norm": 0.07204477488994598, + "learning_rate": 5.472793555065485e-06, + "loss": 0.2149, + "step": 48160 + }, + { + "epoch": 3.9015716137394687, + "grad_norm": 0.0728304460644722, + "learning_rate": 5.4682929024708585e-06, + "loss": 0.2578, + "step": 48161 + }, + { + "epoch": 3.901652624756967, + "grad_norm": 0.08322672545909882, + "learning_rate": 5.463792249876232e-06, + "loss": 0.2468, + "step": 48162 + }, + { + "epoch": 3.901733635774465, + "grad_norm": 0.07119564712047577, + "learning_rate": 5.459291597281606e-06, + "loss": 0.2027, + "step": 48163 + }, + { + "epoch": 3.901814646791964, + "grad_norm": 0.06412103772163391, + "learning_rate": 5.45479094468698e-06, + "loss": 0.2256, + "step": 48164 + }, + { + "epoch": 3.901895657809462, + "grad_norm": 0.07036664336919785, + "learning_rate": 5.450290292092353e-06, + "loss": 0.2439, + "step": 48165 + }, + { + "epoch": 3.9019766688269604, + "grad_norm": 0.07236220687627792, + "learning_rate": 5.445789639497727e-06, + "loss": 0.232, + "step": 48166 + }, + { + "epoch": 3.902057679844459, + "grad_norm": 0.07063250243663788, + "learning_rate": 5.441288986903102e-06, + "loss": 0.1938, + "step": 48167 + }, + { + "epoch": 3.9021386908619573, + "grad_norm": 0.08155050873756409, + "learning_rate": 5.436788334308475e-06, + "loss": 0.207, + "step": 48168 + }, + { + "epoch": 3.9022197018794555, + "grad_norm": 0.09283901005983353, + "learning_rate": 5.432287681713849e-06, + "loss": 0.2062, + "step": 48169 + }, + { + "epoch": 3.9023007128969542, + "grad_norm": 0.07058636099100113, + "learning_rate": 5.427787029119223e-06, + "loss": 0.2262, + "step": 48170 + }, + { + "epoch": 3.9023817239144525, + "grad_norm": 0.07090901583433151, + "learning_rate": 5.423286376524596e-06, + "loss": 0.217, + "step": 48171 + }, + { + "epoch": 3.9024627349319507, + "grad_norm": 0.06685709208250046, + "learning_rate": 5.41878572392997e-06, + "loss": 0.2017, + "step": 48172 + }, + { + "epoch": 3.902543745949449, + "grad_norm": 0.0714755430817604, + "learning_rate": 5.414285071335344e-06, + "loss": 0.2207, + "step": 48173 + }, + { + "epoch": 3.9026247569669477, + "grad_norm": 0.06478474289178848, + "learning_rate": 5.409784418740718e-06, + "loss": 0.1963, + "step": 48174 + }, + { + "epoch": 3.902705767984446, + "grad_norm": 0.059301264584064484, + "learning_rate": 5.4052837661460915e-06, + "loss": 0.1878, + "step": 48175 + }, + { + "epoch": 3.902786779001944, + "grad_norm": 0.07554680854082108, + "learning_rate": 5.400783113551465e-06, + "loss": 0.196, + "step": 48176 + }, + { + "epoch": 3.9028677900194424, + "grad_norm": 0.071644127368927, + "learning_rate": 5.396282460956839e-06, + "loss": 0.2209, + "step": 48177 + }, + { + "epoch": 3.902948801036941, + "grad_norm": 0.07129331678152084, + "learning_rate": 5.391781808362212e-06, + "loss": 0.1822, + "step": 48178 + }, + { + "epoch": 3.9030298120544393, + "grad_norm": 0.08478628844022751, + "learning_rate": 5.387281155767587e-06, + "loss": 0.2269, + "step": 48179 + }, + { + "epoch": 3.9031108230719376, + "grad_norm": 0.06071711704134941, + "learning_rate": 5.38278050317296e-06, + "loss": 0.2276, + "step": 48180 + }, + { + "epoch": 3.9031918340894363, + "grad_norm": 0.0666237398982048, + "learning_rate": 5.378279850578334e-06, + "loss": 0.2171, + "step": 48181 + }, + { + "epoch": 3.9032728451069345, + "grad_norm": 0.07781444489955902, + "learning_rate": 5.373779197983708e-06, + "loss": 0.2382, + "step": 48182 + }, + { + "epoch": 3.9033538561244328, + "grad_norm": 0.07283270359039307, + "learning_rate": 5.369278545389082e-06, + "loss": 0.2168, + "step": 48183 + }, + { + "epoch": 3.9034348671419314, + "grad_norm": 0.07119312882423401, + "learning_rate": 5.364777892794456e-06, + "loss": 0.2312, + "step": 48184 + }, + { + "epoch": 3.9035158781594297, + "grad_norm": 0.06919389218091965, + "learning_rate": 5.360277240199829e-06, + "loss": 0.2035, + "step": 48185 + }, + { + "epoch": 3.903596889176928, + "grad_norm": 0.09564153850078583, + "learning_rate": 5.355776587605203e-06, + "loss": 0.2171, + "step": 48186 + }, + { + "epoch": 3.9036779001944266, + "grad_norm": 0.06871563196182251, + "learning_rate": 5.3512759350105764e-06, + "loss": 0.2065, + "step": 48187 + }, + { + "epoch": 3.903758911211925, + "grad_norm": 0.07969631999731064, + "learning_rate": 5.346775282415951e-06, + "loss": 0.2581, + "step": 48188 + }, + { + "epoch": 3.903839922229423, + "grad_norm": 0.07645699381828308, + "learning_rate": 5.3422746298213245e-06, + "loss": 0.2623, + "step": 48189 + }, + { + "epoch": 3.903920933246922, + "grad_norm": 0.07592182606458664, + "learning_rate": 5.337773977226698e-06, + "loss": 0.2024, + "step": 48190 + }, + { + "epoch": 3.90400194426442, + "grad_norm": 0.07032636553049088, + "learning_rate": 5.333273324632072e-06, + "loss": 0.2, + "step": 48191 + }, + { + "epoch": 3.9040829552819183, + "grad_norm": 0.06590494513511658, + "learning_rate": 5.328772672037446e-06, + "loss": 0.2019, + "step": 48192 + }, + { + "epoch": 3.904163966299417, + "grad_norm": 0.07423686981201172, + "learning_rate": 5.324272019442819e-06, + "loss": 0.2317, + "step": 48193 + }, + { + "epoch": 3.904244977316915, + "grad_norm": 0.0696929395198822, + "learning_rate": 5.319771366848193e-06, + "loss": 0.2315, + "step": 48194 + }, + { + "epoch": 3.9043259883344135, + "grad_norm": 0.08379188925027847, + "learning_rate": 5.315270714253567e-06, + "loss": 0.2312, + "step": 48195 + }, + { + "epoch": 3.9044069993519117, + "grad_norm": 0.07730215787887573, + "learning_rate": 5.3107700616589406e-06, + "loss": 0.2267, + "step": 48196 + }, + { + "epoch": 3.9044880103694104, + "grad_norm": 0.07866203784942627, + "learning_rate": 5.306269409064315e-06, + "loss": 0.2381, + "step": 48197 + }, + { + "epoch": 3.9045690213869086, + "grad_norm": 0.08509163558483124, + "learning_rate": 5.301768756469689e-06, + "loss": 0.2215, + "step": 48198 + }, + { + "epoch": 3.904650032404407, + "grad_norm": 0.07737905532121658, + "learning_rate": 5.297268103875062e-06, + "loss": 0.2104, + "step": 48199 + }, + { + "epoch": 3.904731043421905, + "grad_norm": 0.08024188876152039, + "learning_rate": 5.292767451280436e-06, + "loss": 0.2141, + "step": 48200 + }, + { + "epoch": 3.904812054439404, + "grad_norm": 0.0622677244246006, + "learning_rate": 5.28826679868581e-06, + "loss": 0.1994, + "step": 48201 + }, + { + "epoch": 3.904893065456902, + "grad_norm": 0.08390653878450394, + "learning_rate": 5.283766146091183e-06, + "loss": 0.2637, + "step": 48202 + }, + { + "epoch": 3.9049740764744003, + "grad_norm": 0.07863643020391464, + "learning_rate": 5.2792654934965575e-06, + "loss": 0.2527, + "step": 48203 + }, + { + "epoch": 3.905055087491899, + "grad_norm": 0.07339465618133545, + "learning_rate": 5.274764840901931e-06, + "loss": 0.2585, + "step": 48204 + }, + { + "epoch": 3.9051360985093972, + "grad_norm": 0.06816806644201279, + "learning_rate": 5.270264188307305e-06, + "loss": 0.1963, + "step": 48205 + }, + { + "epoch": 3.9052171095268955, + "grad_norm": 0.07438135147094727, + "learning_rate": 5.265763535712678e-06, + "loss": 0.2358, + "step": 48206 + }, + { + "epoch": 3.905298120544394, + "grad_norm": 0.09304346889257431, + "learning_rate": 5.261262883118053e-06, + "loss": 0.2418, + "step": 48207 + }, + { + "epoch": 3.9053791315618924, + "grad_norm": 0.07517294585704803, + "learning_rate": 5.2567622305234255e-06, + "loss": 0.2178, + "step": 48208 + }, + { + "epoch": 3.9054601425793907, + "grad_norm": 0.06006520986557007, + "learning_rate": 5.2522615779288e-06, + "loss": 0.2233, + "step": 48209 + }, + { + "epoch": 3.9055411535968894, + "grad_norm": 0.061688102781772614, + "learning_rate": 5.247760925334174e-06, + "loss": 0.1981, + "step": 48210 + }, + { + "epoch": 3.9056221646143876, + "grad_norm": 0.11346621811389923, + "learning_rate": 5.243260272739547e-06, + "loss": 0.2113, + "step": 48211 + }, + { + "epoch": 3.905703175631886, + "grad_norm": 0.06851883232593536, + "learning_rate": 5.238759620144922e-06, + "loss": 0.2678, + "step": 48212 + }, + { + "epoch": 3.9057841866493845, + "grad_norm": 0.07190129160881042, + "learning_rate": 5.234258967550295e-06, + "loss": 0.2227, + "step": 48213 + }, + { + "epoch": 3.905865197666883, + "grad_norm": 0.07402346283197403, + "learning_rate": 5.229758314955669e-06, + "loss": 0.23, + "step": 48214 + }, + { + "epoch": 3.905946208684381, + "grad_norm": 0.0806785300374031, + "learning_rate": 5.225257662361042e-06, + "loss": 0.2571, + "step": 48215 + }, + { + "epoch": 3.9060272197018797, + "grad_norm": 0.06920921057462692, + "learning_rate": 5.220757009766417e-06, + "loss": 0.2173, + "step": 48216 + }, + { + "epoch": 3.906108230719378, + "grad_norm": 0.06963902711868286, + "learning_rate": 5.21625635717179e-06, + "loss": 0.19, + "step": 48217 + }, + { + "epoch": 3.906189241736876, + "grad_norm": 0.06644149869680405, + "learning_rate": 5.211755704577164e-06, + "loss": 0.2332, + "step": 48218 + }, + { + "epoch": 3.9062702527543745, + "grad_norm": 0.06656293570995331, + "learning_rate": 5.207255051982538e-06, + "loss": 0.1953, + "step": 48219 + }, + { + "epoch": 3.906351263771873, + "grad_norm": 0.07155372947454453, + "learning_rate": 5.202754399387911e-06, + "loss": 0.2218, + "step": 48220 + }, + { + "epoch": 3.9064322747893714, + "grad_norm": 0.06269364804029465, + "learning_rate": 5.198253746793285e-06, + "loss": 0.1846, + "step": 48221 + }, + { + "epoch": 3.9065132858068696, + "grad_norm": 0.06916271895170212, + "learning_rate": 5.193753094198659e-06, + "loss": 0.2343, + "step": 48222 + }, + { + "epoch": 3.906594296824368, + "grad_norm": 0.07343260943889618, + "learning_rate": 5.189252441604032e-06, + "loss": 0.2274, + "step": 48223 + }, + { + "epoch": 3.9066753078418666, + "grad_norm": 0.08011267334222794, + "learning_rate": 5.1847517890094065e-06, + "loss": 0.2191, + "step": 48224 + }, + { + "epoch": 3.906756318859365, + "grad_norm": 0.07275541871786118, + "learning_rate": 5.180251136414781e-06, + "loss": 0.2005, + "step": 48225 + }, + { + "epoch": 3.906837329876863, + "grad_norm": 0.08077385276556015, + "learning_rate": 5.175750483820154e-06, + "loss": 0.2796, + "step": 48226 + }, + { + "epoch": 3.9069183408943617, + "grad_norm": 0.08260789513587952, + "learning_rate": 5.171249831225528e-06, + "loss": 0.2247, + "step": 48227 + }, + { + "epoch": 3.90699935191186, + "grad_norm": 0.05853225663304329, + "learning_rate": 5.166749178630902e-06, + "loss": 0.1863, + "step": 48228 + }, + { + "epoch": 3.9070803629293582, + "grad_norm": 0.08310955762863159, + "learning_rate": 5.162248526036275e-06, + "loss": 0.2158, + "step": 48229 + }, + { + "epoch": 3.907161373946857, + "grad_norm": 0.07807844877243042, + "learning_rate": 5.157747873441649e-06, + "loss": 0.2443, + "step": 48230 + }, + { + "epoch": 3.907242384964355, + "grad_norm": 0.07691746205091476, + "learning_rate": 5.1532472208470235e-06, + "loss": 0.2138, + "step": 48231 + }, + { + "epoch": 3.9073233959818534, + "grad_norm": 0.07122568041086197, + "learning_rate": 5.148746568252397e-06, + "loss": 0.2198, + "step": 48232 + }, + { + "epoch": 3.907404406999352, + "grad_norm": 0.06453979015350342, + "learning_rate": 5.144245915657771e-06, + "loss": 0.2408, + "step": 48233 + }, + { + "epoch": 3.9074854180168503, + "grad_norm": 0.07711661607027054, + "learning_rate": 5.139745263063144e-06, + "loss": 0.201, + "step": 48234 + }, + { + "epoch": 3.9075664290343486, + "grad_norm": 0.08223751932382584, + "learning_rate": 5.135244610468518e-06, + "loss": 0.2204, + "step": 48235 + }, + { + "epoch": 3.9076474400518473, + "grad_norm": 0.08006234467029572, + "learning_rate": 5.1307439578738915e-06, + "loss": 0.2498, + "step": 48236 + }, + { + "epoch": 3.9077284510693455, + "grad_norm": 0.06987843662500381, + "learning_rate": 5.126243305279266e-06, + "loss": 0.2437, + "step": 48237 + }, + { + "epoch": 3.9078094620868438, + "grad_norm": 0.06831565499305725, + "learning_rate": 5.1217426526846395e-06, + "loss": 0.1795, + "step": 48238 + }, + { + "epoch": 3.9078904731043425, + "grad_norm": 0.0886467844247818, + "learning_rate": 5.117242000090013e-06, + "loss": 0.2054, + "step": 48239 + }, + { + "epoch": 3.9079714841218407, + "grad_norm": 0.09053325653076172, + "learning_rate": 5.112741347495388e-06, + "loss": 0.2823, + "step": 48240 + }, + { + "epoch": 3.908052495139339, + "grad_norm": 0.07012274116277695, + "learning_rate": 5.108240694900761e-06, + "loss": 0.2037, + "step": 48241 + }, + { + "epoch": 3.908133506156837, + "grad_norm": 0.07728597521781921, + "learning_rate": 5.103740042306135e-06, + "loss": 0.2068, + "step": 48242 + }, + { + "epoch": 3.9082145171743354, + "grad_norm": 0.06844566017389297, + "learning_rate": 5.099239389711508e-06, + "loss": 0.1873, + "step": 48243 + }, + { + "epoch": 3.908295528191834, + "grad_norm": 0.0649128332734108, + "learning_rate": 5.094738737116882e-06, + "loss": 0.1839, + "step": 48244 + }, + { + "epoch": 3.9083765392093324, + "grad_norm": 0.06959527730941772, + "learning_rate": 5.090238084522256e-06, + "loss": 0.2126, + "step": 48245 + }, + { + "epoch": 3.9084575502268306, + "grad_norm": 0.0665079727768898, + "learning_rate": 5.08573743192763e-06, + "loss": 0.2057, + "step": 48246 + }, + { + "epoch": 3.9085385612443293, + "grad_norm": 0.09884722530841827, + "learning_rate": 5.081236779333004e-06, + "loss": 0.2282, + "step": 48247 + }, + { + "epoch": 3.9086195722618275, + "grad_norm": 0.07287062704563141, + "learning_rate": 5.076736126738377e-06, + "loss": 0.2146, + "step": 48248 + }, + { + "epoch": 3.908700583279326, + "grad_norm": 0.07356756180524826, + "learning_rate": 5.072235474143751e-06, + "loss": 0.2424, + "step": 48249 + }, + { + "epoch": 3.9087815942968245, + "grad_norm": 0.0727347582578659, + "learning_rate": 5.067734821549125e-06, + "loss": 0.1947, + "step": 48250 + }, + { + "epoch": 3.9088626053143227, + "grad_norm": 0.06179845333099365, + "learning_rate": 5.063234168954498e-06, + "loss": 0.2131, + "step": 48251 + }, + { + "epoch": 3.908943616331821, + "grad_norm": 0.08347577601671219, + "learning_rate": 5.0587335163598725e-06, + "loss": 0.2417, + "step": 48252 + }, + { + "epoch": 3.9090246273493197, + "grad_norm": 0.07791955024003983, + "learning_rate": 5.054232863765247e-06, + "loss": 0.2335, + "step": 48253 + }, + { + "epoch": 3.909105638366818, + "grad_norm": 0.07559989392757416, + "learning_rate": 5.04973221117062e-06, + "loss": 0.203, + "step": 48254 + }, + { + "epoch": 3.909186649384316, + "grad_norm": 0.06269510090351105, + "learning_rate": 5.045231558575994e-06, + "loss": 0.2261, + "step": 48255 + }, + { + "epoch": 3.909267660401815, + "grad_norm": 0.06181592866778374, + "learning_rate": 5.040730905981368e-06, + "loss": 0.2074, + "step": 48256 + }, + { + "epoch": 3.909348671419313, + "grad_norm": 0.07220128178596497, + "learning_rate": 5.036230253386741e-06, + "loss": 0.2157, + "step": 48257 + }, + { + "epoch": 3.9094296824368113, + "grad_norm": 0.07272609323263168, + "learning_rate": 5.031729600792115e-06, + "loss": 0.2441, + "step": 48258 + }, + { + "epoch": 3.90951069345431, + "grad_norm": 0.0771983340382576, + "learning_rate": 5.0272289481974894e-06, + "loss": 0.2157, + "step": 48259 + }, + { + "epoch": 3.9095917044718083, + "grad_norm": 0.08448202162981033, + "learning_rate": 5.022728295602862e-06, + "loss": 0.207, + "step": 48260 + }, + { + "epoch": 3.9096727154893065, + "grad_norm": 0.07362006604671478, + "learning_rate": 5.018227643008237e-06, + "loss": 0.2275, + "step": 48261 + }, + { + "epoch": 3.909753726506805, + "grad_norm": 0.05870373547077179, + "learning_rate": 5.01372699041361e-06, + "loss": 0.2229, + "step": 48262 + }, + { + "epoch": 3.9098347375243034, + "grad_norm": 0.07628266513347626, + "learning_rate": 5.009226337818984e-06, + "loss": 0.2208, + "step": 48263 + }, + { + "epoch": 3.9099157485418017, + "grad_norm": 0.08400886505842209, + "learning_rate": 5.0047256852243574e-06, + "loss": 0.2524, + "step": 48264 + }, + { + "epoch": 3.9099967595593, + "grad_norm": 0.0675138533115387, + "learning_rate": 5.000225032629732e-06, + "loss": 0.1857, + "step": 48265 + }, + { + "epoch": 3.910077770576798, + "grad_norm": 0.08384126424789429, + "learning_rate": 4.995724380035105e-06, + "loss": 0.2271, + "step": 48266 + }, + { + "epoch": 3.910158781594297, + "grad_norm": 0.0884663388133049, + "learning_rate": 4.991223727440479e-06, + "loss": 0.267, + "step": 48267 + }, + { + "epoch": 3.910239792611795, + "grad_norm": 0.06389939039945602, + "learning_rate": 4.9867230748458536e-06, + "loss": 0.2102, + "step": 48268 + }, + { + "epoch": 3.9103208036292934, + "grad_norm": 0.060799382627010345, + "learning_rate": 4.982222422251226e-06, + "loss": 0.2267, + "step": 48269 + }, + { + "epoch": 3.910401814646792, + "grad_norm": 0.07198642194271088, + "learning_rate": 4.977721769656601e-06, + "loss": 0.2249, + "step": 48270 + }, + { + "epoch": 3.9104828256642903, + "grad_norm": 0.0687641128897667, + "learning_rate": 4.973221117061974e-06, + "loss": 0.1983, + "step": 48271 + }, + { + "epoch": 3.9105638366817885, + "grad_norm": 0.07991275936365128, + "learning_rate": 4.968720464467348e-06, + "loss": 0.253, + "step": 48272 + }, + { + "epoch": 3.910644847699287, + "grad_norm": 0.07373076677322388, + "learning_rate": 4.9642198118727216e-06, + "loss": 0.235, + "step": 48273 + }, + { + "epoch": 3.9107258587167855, + "grad_norm": 0.07304935157299042, + "learning_rate": 4.959719159278096e-06, + "loss": 0.1938, + "step": 48274 + }, + { + "epoch": 3.9108068697342837, + "grad_norm": 0.06456219404935837, + "learning_rate": 4.955218506683469e-06, + "loss": 0.2212, + "step": 48275 + }, + { + "epoch": 3.9108878807517824, + "grad_norm": 0.06691355258226395, + "learning_rate": 4.950717854088843e-06, + "loss": 0.198, + "step": 48276 + }, + { + "epoch": 3.9109688917692806, + "grad_norm": 0.07013965398073196, + "learning_rate": 4.946217201494217e-06, + "loss": 0.2197, + "step": 48277 + }, + { + "epoch": 3.911049902786779, + "grad_norm": 0.0730832889676094, + "learning_rate": 4.9417165488995904e-06, + "loss": 0.1978, + "step": 48278 + }, + { + "epoch": 3.9111309138042776, + "grad_norm": 0.06756222993135452, + "learning_rate": 4.937215896304964e-06, + "loss": 0.2059, + "step": 48279 + }, + { + "epoch": 3.911211924821776, + "grad_norm": 0.05772652477025986, + "learning_rate": 4.9327152437103385e-06, + "loss": 0.2107, + "step": 48280 + }, + { + "epoch": 3.911292935839274, + "grad_norm": 0.07321856915950775, + "learning_rate": 4.928214591115712e-06, + "loss": 0.2468, + "step": 48281 + }, + { + "epoch": 3.9113739468567728, + "grad_norm": 0.07220222055912018, + "learning_rate": 4.923713938521086e-06, + "loss": 0.2097, + "step": 48282 + }, + { + "epoch": 3.911454957874271, + "grad_norm": 0.08239828050136566, + "learning_rate": 4.91921328592646e-06, + "loss": 0.2462, + "step": 48283 + }, + { + "epoch": 3.9115359688917692, + "grad_norm": 0.053629614412784576, + "learning_rate": 4.914712633331833e-06, + "loss": 0.1856, + "step": 48284 + }, + { + "epoch": 3.911616979909268, + "grad_norm": 0.07192451506853104, + "learning_rate": 4.910211980737207e-06, + "loss": 0.2489, + "step": 48285 + }, + { + "epoch": 3.911697990926766, + "grad_norm": 0.07000795751810074, + "learning_rate": 4.905711328142581e-06, + "loss": 0.2101, + "step": 48286 + }, + { + "epoch": 3.9117790019442644, + "grad_norm": 0.06989791244268417, + "learning_rate": 4.9012106755479546e-06, + "loss": 0.2376, + "step": 48287 + }, + { + "epoch": 3.9118600129617627, + "grad_norm": 0.08663799613714218, + "learning_rate": 4.896710022953328e-06, + "loss": 0.2485, + "step": 48288 + }, + { + "epoch": 3.911941023979261, + "grad_norm": 0.07052966952323914, + "learning_rate": 4.892209370358703e-06, + "loss": 0.1894, + "step": 48289 + }, + { + "epoch": 3.9120220349967596, + "grad_norm": 0.05605921521782875, + "learning_rate": 4.887708717764076e-06, + "loss": 0.1966, + "step": 48290 + }, + { + "epoch": 3.912103046014258, + "grad_norm": 0.06981483846902847, + "learning_rate": 4.88320806516945e-06, + "loss": 0.2424, + "step": 48291 + }, + { + "epoch": 3.912184057031756, + "grad_norm": 0.05360450968146324, + "learning_rate": 4.878707412574823e-06, + "loss": 0.1981, + "step": 48292 + }, + { + "epoch": 3.912265068049255, + "grad_norm": 0.07205212116241455, + "learning_rate": 4.874206759980197e-06, + "loss": 0.1913, + "step": 48293 + }, + { + "epoch": 3.912346079066753, + "grad_norm": 0.06900376826524734, + "learning_rate": 4.869706107385571e-06, + "loss": 0.2177, + "step": 48294 + }, + { + "epoch": 3.9124270900842513, + "grad_norm": 0.07426581531763077, + "learning_rate": 4.865205454790945e-06, + "loss": 0.2662, + "step": 48295 + }, + { + "epoch": 3.91250810110175, + "grad_norm": 0.0741383284330368, + "learning_rate": 4.860704802196319e-06, + "loss": 0.2323, + "step": 48296 + }, + { + "epoch": 3.912589112119248, + "grad_norm": 0.07219462841749191, + "learning_rate": 4.856204149601692e-06, + "loss": 0.2199, + "step": 48297 + }, + { + "epoch": 3.9126701231367464, + "grad_norm": 0.06240931153297424, + "learning_rate": 4.851703497007067e-06, + "loss": 0.2398, + "step": 48298 + }, + { + "epoch": 3.912751134154245, + "grad_norm": 0.0665903314948082, + "learning_rate": 4.84720284441244e-06, + "loss": 0.2174, + "step": 48299 + }, + { + "epoch": 3.9128321451717434, + "grad_norm": 0.06555341929197311, + "learning_rate": 4.842702191817814e-06, + "loss": 0.1886, + "step": 48300 + }, + { + "epoch": 3.9129131561892416, + "grad_norm": 0.06640680134296417, + "learning_rate": 4.8382015392231875e-06, + "loss": 0.1735, + "step": 48301 + }, + { + "epoch": 3.9129941672067403, + "grad_norm": 0.07242927700281143, + "learning_rate": 4.833700886628562e-06, + "loss": 0.2254, + "step": 48302 + }, + { + "epoch": 3.9130751782242386, + "grad_norm": 0.06851553171873093, + "learning_rate": 4.829200234033935e-06, + "loss": 0.1927, + "step": 48303 + }, + { + "epoch": 3.913156189241737, + "grad_norm": 0.07618321478366852, + "learning_rate": 4.824699581439309e-06, + "loss": 0.2333, + "step": 48304 + }, + { + "epoch": 3.9132372002592355, + "grad_norm": 0.07696599513292313, + "learning_rate": 4.820198928844683e-06, + "loss": 0.2151, + "step": 48305 + }, + { + "epoch": 3.9133182112767337, + "grad_norm": 0.07403657585382462, + "learning_rate": 4.815698276250056e-06, + "loss": 0.2144, + "step": 48306 + }, + { + "epoch": 3.913399222294232, + "grad_norm": 0.07694181799888611, + "learning_rate": 4.81119762365543e-06, + "loss": 0.2745, + "step": 48307 + }, + { + "epoch": 3.9134802333117307, + "grad_norm": 0.06381962448358536, + "learning_rate": 4.8066969710608045e-06, + "loss": 0.2746, + "step": 48308 + }, + { + "epoch": 3.913561244329229, + "grad_norm": 0.06298644095659256, + "learning_rate": 4.802196318466177e-06, + "loss": 0.1824, + "step": 48309 + }, + { + "epoch": 3.913642255346727, + "grad_norm": 0.07825368642807007, + "learning_rate": 4.797695665871552e-06, + "loss": 0.2237, + "step": 48310 + }, + { + "epoch": 3.9137232663642254, + "grad_norm": 0.05910641327500343, + "learning_rate": 4.793195013276926e-06, + "loss": 0.1973, + "step": 48311 + }, + { + "epoch": 3.9138042773817237, + "grad_norm": 0.08719029277563095, + "learning_rate": 4.788694360682299e-06, + "loss": 0.2168, + "step": 48312 + }, + { + "epoch": 3.9138852883992223, + "grad_norm": 0.07956347614526749, + "learning_rate": 4.784193708087673e-06, + "loss": 0.2315, + "step": 48313 + }, + { + "epoch": 3.9139662994167206, + "grad_norm": 0.07194239646196365, + "learning_rate": 4.779693055493047e-06, + "loss": 0.2449, + "step": 48314 + }, + { + "epoch": 3.914047310434219, + "grad_norm": 0.06650910526514053, + "learning_rate": 4.7751924028984205e-06, + "loss": 0.2138, + "step": 48315 + }, + { + "epoch": 3.9141283214517175, + "grad_norm": 0.07512077689170837, + "learning_rate": 4.770691750303794e-06, + "loss": 0.2245, + "step": 48316 + }, + { + "epoch": 3.9142093324692158, + "grad_norm": 0.09508345276117325, + "learning_rate": 4.766191097709169e-06, + "loss": 0.3053, + "step": 48317 + }, + { + "epoch": 3.914290343486714, + "grad_norm": 0.07228030264377594, + "learning_rate": 4.761690445114541e-06, + "loss": 0.2202, + "step": 48318 + }, + { + "epoch": 3.9143713545042127, + "grad_norm": 0.07119833678007126, + "learning_rate": 4.757189792519916e-06, + "loss": 0.1919, + "step": 48319 + }, + { + "epoch": 3.914452365521711, + "grad_norm": 0.059811804443597794, + "learning_rate": 4.752689139925289e-06, + "loss": 0.1986, + "step": 48320 + }, + { + "epoch": 3.914533376539209, + "grad_norm": 0.06910062581300735, + "learning_rate": 4.748188487330663e-06, + "loss": 0.2151, + "step": 48321 + }, + { + "epoch": 3.914614387556708, + "grad_norm": 0.06613852828741074, + "learning_rate": 4.743687834736037e-06, + "loss": 0.2022, + "step": 48322 + }, + { + "epoch": 3.914695398574206, + "grad_norm": 0.06570880860090256, + "learning_rate": 4.739187182141411e-06, + "loss": 0.1867, + "step": 48323 + }, + { + "epoch": 3.9147764095917044, + "grad_norm": 0.0788615420460701, + "learning_rate": 4.734686529546784e-06, + "loss": 0.2258, + "step": 48324 + }, + { + "epoch": 3.914857420609203, + "grad_norm": 0.0693746879696846, + "learning_rate": 4.730185876952158e-06, + "loss": 0.2217, + "step": 48325 + }, + { + "epoch": 3.9149384316267013, + "grad_norm": 0.08763092756271362, + "learning_rate": 4.725685224357533e-06, + "loss": 0.2442, + "step": 48326 + }, + { + "epoch": 3.9150194426441995, + "grad_norm": 0.06549570709466934, + "learning_rate": 4.7211845717629055e-06, + "loss": 0.2142, + "step": 48327 + }, + { + "epoch": 3.9151004536616982, + "grad_norm": 0.08535436540842056, + "learning_rate": 4.71668391916828e-06, + "loss": 0.249, + "step": 48328 + }, + { + "epoch": 3.9151814646791965, + "grad_norm": 0.0782715380191803, + "learning_rate": 4.7121832665736535e-06, + "loss": 0.2268, + "step": 48329 + }, + { + "epoch": 3.9152624756966947, + "grad_norm": 0.07017436623573303, + "learning_rate": 4.707682613979027e-06, + "loss": 0.231, + "step": 48330 + }, + { + "epoch": 3.915343486714193, + "grad_norm": 0.06549299508333206, + "learning_rate": 4.703181961384401e-06, + "loss": 0.2067, + "step": 48331 + }, + { + "epoch": 3.9154244977316917, + "grad_norm": 0.07573448866605759, + "learning_rate": 4.698681308789775e-06, + "loss": 0.2563, + "step": 48332 + }, + { + "epoch": 3.91550550874919, + "grad_norm": 0.07193144410848618, + "learning_rate": 4.694180656195148e-06, + "loss": 0.2246, + "step": 48333 + }, + { + "epoch": 3.915586519766688, + "grad_norm": 0.0836506187915802, + "learning_rate": 4.689680003600522e-06, + "loss": 0.2372, + "step": 48334 + }, + { + "epoch": 3.9156675307841864, + "grad_norm": 0.07173941284418106, + "learning_rate": 4.685179351005896e-06, + "loss": 0.2322, + "step": 48335 + }, + { + "epoch": 3.915748541801685, + "grad_norm": 0.0687011256814003, + "learning_rate": 4.68067869841127e-06, + "loss": 0.2385, + "step": 48336 + }, + { + "epoch": 3.9158295528191833, + "grad_norm": 0.06658374518156052, + "learning_rate": 4.676178045816643e-06, + "loss": 0.2497, + "step": 48337 + }, + { + "epoch": 3.9159105638366816, + "grad_norm": 0.0663018673658371, + "learning_rate": 4.671677393222018e-06, + "loss": 0.197, + "step": 48338 + }, + { + "epoch": 3.9159915748541803, + "grad_norm": 0.058599188923835754, + "learning_rate": 4.667176740627391e-06, + "loss": 0.2165, + "step": 48339 + }, + { + "epoch": 3.9160725858716785, + "grad_norm": 0.06904032081365585, + "learning_rate": 4.662676088032765e-06, + "loss": 0.2292, + "step": 48340 + }, + { + "epoch": 3.9161535968891767, + "grad_norm": 0.08278106898069382, + "learning_rate": 4.658175435438139e-06, + "loss": 0.2045, + "step": 48341 + }, + { + "epoch": 3.9162346079066754, + "grad_norm": 0.0761694386601448, + "learning_rate": 4.653674782843512e-06, + "loss": 0.2144, + "step": 48342 + }, + { + "epoch": 3.9163156189241737, + "grad_norm": 0.07530530542135239, + "learning_rate": 4.6491741302488865e-06, + "loss": 0.2537, + "step": 48343 + }, + { + "epoch": 3.916396629941672, + "grad_norm": 0.08102653175592422, + "learning_rate": 4.64467347765426e-06, + "loss": 0.2358, + "step": 48344 + }, + { + "epoch": 3.9164776409591706, + "grad_norm": 0.07261081039905548, + "learning_rate": 4.640172825059634e-06, + "loss": 0.249, + "step": 48345 + }, + { + "epoch": 3.916558651976669, + "grad_norm": 0.09279650449752808, + "learning_rate": 4.635672172465007e-06, + "loss": 0.2493, + "step": 48346 + }, + { + "epoch": 3.916639662994167, + "grad_norm": 0.08426755666732788, + "learning_rate": 4.631171519870382e-06, + "loss": 0.2518, + "step": 48347 + }, + { + "epoch": 3.916720674011666, + "grad_norm": 0.06982297450304031, + "learning_rate": 4.626670867275755e-06, + "loss": 0.2133, + "step": 48348 + }, + { + "epoch": 3.916801685029164, + "grad_norm": 0.062374137341976166, + "learning_rate": 4.622170214681129e-06, + "loss": 0.2058, + "step": 48349 + }, + { + "epoch": 3.9168826960466623, + "grad_norm": 0.06334874778985977, + "learning_rate": 4.617669562086503e-06, + "loss": 0.2218, + "step": 48350 + }, + { + "epoch": 3.916963707064161, + "grad_norm": 0.07470595091581345, + "learning_rate": 4.613168909491876e-06, + "loss": 0.2633, + "step": 48351 + }, + { + "epoch": 3.917044718081659, + "grad_norm": 0.06525980681180954, + "learning_rate": 4.60866825689725e-06, + "loss": 0.2238, + "step": 48352 + }, + { + "epoch": 3.9171257290991575, + "grad_norm": 0.0697428286075592, + "learning_rate": 4.604167604302624e-06, + "loss": 0.2647, + "step": 48353 + }, + { + "epoch": 3.9172067401166557, + "grad_norm": 0.07528168708086014, + "learning_rate": 4.599666951707998e-06, + "loss": 0.2266, + "step": 48354 + }, + { + "epoch": 3.9172877511341544, + "grad_norm": 0.07216835767030716, + "learning_rate": 4.5951662991133714e-06, + "loss": 0.2087, + "step": 48355 + }, + { + "epoch": 3.9173687621516526, + "grad_norm": 0.06816212087869644, + "learning_rate": 4.590665646518746e-06, + "loss": 0.1967, + "step": 48356 + }, + { + "epoch": 3.917449773169151, + "grad_norm": 0.07805240899324417, + "learning_rate": 4.5861649939241195e-06, + "loss": 0.2293, + "step": 48357 + }, + { + "epoch": 3.917530784186649, + "grad_norm": 0.06178323179483414, + "learning_rate": 4.581664341329493e-06, + "loss": 0.2204, + "step": 48358 + }, + { + "epoch": 3.917611795204148, + "grad_norm": 0.07420182973146439, + "learning_rate": 4.577163688734867e-06, + "loss": 0.2364, + "step": 48359 + }, + { + "epoch": 3.917692806221646, + "grad_norm": 0.06499535590410233, + "learning_rate": 4.572663036140241e-06, + "loss": 0.2054, + "step": 48360 + }, + { + "epoch": 3.9177738172391443, + "grad_norm": 0.07386469841003418, + "learning_rate": 4.568162383545614e-06, + "loss": 0.224, + "step": 48361 + }, + { + "epoch": 3.917854828256643, + "grad_norm": 0.07269761711359024, + "learning_rate": 4.563661730950988e-06, + "loss": 0.1961, + "step": 48362 + }, + { + "epoch": 3.9179358392741412, + "grad_norm": 0.07524976879358292, + "learning_rate": 4.559161078356362e-06, + "loss": 0.22, + "step": 48363 + }, + { + "epoch": 3.9180168502916395, + "grad_norm": 0.07413894683122635, + "learning_rate": 4.5546604257617356e-06, + "loss": 0.2123, + "step": 48364 + }, + { + "epoch": 3.918097861309138, + "grad_norm": 0.06486944854259491, + "learning_rate": 4.550159773167109e-06, + "loss": 0.2077, + "step": 48365 + }, + { + "epoch": 3.9181788723266364, + "grad_norm": 0.06430947780609131, + "learning_rate": 4.545659120572484e-06, + "loss": 0.2326, + "step": 48366 + }, + { + "epoch": 3.9182598833441347, + "grad_norm": 0.08823602646589279, + "learning_rate": 4.541158467977856e-06, + "loss": 0.2792, + "step": 48367 + }, + { + "epoch": 3.9183408943616334, + "grad_norm": 0.05794184282422066, + "learning_rate": 4.536657815383231e-06, + "loss": 0.2169, + "step": 48368 + }, + { + "epoch": 3.9184219053791316, + "grad_norm": 0.08605548739433289, + "learning_rate": 4.532157162788605e-06, + "loss": 0.2541, + "step": 48369 + }, + { + "epoch": 3.91850291639663, + "grad_norm": 0.08970436453819275, + "learning_rate": 4.527656510193978e-06, + "loss": 0.2634, + "step": 48370 + }, + { + "epoch": 3.9185839274141285, + "grad_norm": 0.07923775911331177, + "learning_rate": 4.5231558575993525e-06, + "loss": 0.2305, + "step": 48371 + }, + { + "epoch": 3.9186649384316268, + "grad_norm": 0.07092005014419556, + "learning_rate": 4.518655205004726e-06, + "loss": 0.2317, + "step": 48372 + }, + { + "epoch": 3.918745949449125, + "grad_norm": 0.08713727444410324, + "learning_rate": 4.5141545524101e-06, + "loss": 0.2262, + "step": 48373 + }, + { + "epoch": 3.9188269604666237, + "grad_norm": 0.06912411004304886, + "learning_rate": 4.509653899815473e-06, + "loss": 0.2173, + "step": 48374 + }, + { + "epoch": 3.918907971484122, + "grad_norm": 0.0842410996556282, + "learning_rate": 4.505153247220848e-06, + "loss": 0.2268, + "step": 48375 + }, + { + "epoch": 3.91898898250162, + "grad_norm": 0.06546398997306824, + "learning_rate": 4.5006525946262205e-06, + "loss": 0.2216, + "step": 48376 + }, + { + "epoch": 3.9190699935191184, + "grad_norm": 0.09463486075401306, + "learning_rate": 4.496151942031595e-06, + "loss": 0.3016, + "step": 48377 + }, + { + "epoch": 3.919151004536617, + "grad_norm": 0.0747930109500885, + "learning_rate": 4.4916512894369686e-06, + "loss": 0.2222, + "step": 48378 + }, + { + "epoch": 3.9192320155541154, + "grad_norm": 0.06386351585388184, + "learning_rate": 4.487150636842342e-06, + "loss": 0.204, + "step": 48379 + }, + { + "epoch": 3.9193130265716136, + "grad_norm": 0.06935272365808487, + "learning_rate": 4.482649984247716e-06, + "loss": 0.227, + "step": 48380 + }, + { + "epoch": 3.919394037589112, + "grad_norm": 0.0631873607635498, + "learning_rate": 4.47814933165309e-06, + "loss": 0.2332, + "step": 48381 + }, + { + "epoch": 3.9194750486066106, + "grad_norm": 0.07327888160943985, + "learning_rate": 4.473648679058463e-06, + "loss": 0.225, + "step": 48382 + }, + { + "epoch": 3.919556059624109, + "grad_norm": 0.07921938598155975, + "learning_rate": 4.469148026463837e-06, + "loss": 0.2254, + "step": 48383 + }, + { + "epoch": 3.919637070641607, + "grad_norm": 0.06906044483184814, + "learning_rate": 4.464647373869212e-06, + "loss": 0.2089, + "step": 48384 + }, + { + "epoch": 3.9197180816591057, + "grad_norm": 0.0595407597720623, + "learning_rate": 4.460146721274585e-06, + "loss": 0.1853, + "step": 48385 + }, + { + "epoch": 3.919799092676604, + "grad_norm": 0.08025841414928436, + "learning_rate": 4.455646068679959e-06, + "loss": 0.2379, + "step": 48386 + }, + { + "epoch": 3.9198801036941022, + "grad_norm": 0.06751351803541183, + "learning_rate": 4.451145416085333e-06, + "loss": 0.1824, + "step": 48387 + }, + { + "epoch": 3.919961114711601, + "grad_norm": 0.07128855586051941, + "learning_rate": 4.446644763490706e-06, + "loss": 0.2065, + "step": 48388 + }, + { + "epoch": 3.920042125729099, + "grad_norm": 0.07042013853788376, + "learning_rate": 4.44214411089608e-06, + "loss": 0.2286, + "step": 48389 + }, + { + "epoch": 3.9201231367465974, + "grad_norm": 0.07936536520719528, + "learning_rate": 4.437643458301454e-06, + "loss": 0.2273, + "step": 48390 + }, + { + "epoch": 3.920204147764096, + "grad_norm": 0.07564130425453186, + "learning_rate": 4.433142805706827e-06, + "loss": 0.2124, + "step": 48391 + }, + { + "epoch": 3.9202851587815943, + "grad_norm": 0.0812234953045845, + "learning_rate": 4.4286421531122015e-06, + "loss": 0.2354, + "step": 48392 + }, + { + "epoch": 3.9203661697990926, + "grad_norm": 0.07354864478111267, + "learning_rate": 4.424141500517575e-06, + "loss": 0.2063, + "step": 48393 + }, + { + "epoch": 3.9204471808165913, + "grad_norm": 0.060475897043943405, + "learning_rate": 4.419640847922949e-06, + "loss": 0.1771, + "step": 48394 + }, + { + "epoch": 3.9205281918340895, + "grad_norm": 0.06647646427154541, + "learning_rate": 4.415140195328322e-06, + "loss": 0.2066, + "step": 48395 + }, + { + "epoch": 3.9206092028515878, + "grad_norm": 0.07337382435798645, + "learning_rate": 4.410639542733697e-06, + "loss": 0.2287, + "step": 48396 + }, + { + "epoch": 3.9206902138690864, + "grad_norm": 0.07359199970960617, + "learning_rate": 4.40613889013907e-06, + "loss": 0.244, + "step": 48397 + }, + { + "epoch": 3.9207712248865847, + "grad_norm": 0.074060820043087, + "learning_rate": 4.401638237544444e-06, + "loss": 0.2418, + "step": 48398 + }, + { + "epoch": 3.920852235904083, + "grad_norm": 0.07838385552167892, + "learning_rate": 4.3971375849498185e-06, + "loss": 0.2612, + "step": 48399 + }, + { + "epoch": 3.920933246921581, + "grad_norm": 0.07551511377096176, + "learning_rate": 4.392636932355191e-06, + "loss": 0.2409, + "step": 48400 + }, + { + "epoch": 3.92101425793908, + "grad_norm": 0.08527684211730957, + "learning_rate": 4.388136279760566e-06, + "loss": 0.204, + "step": 48401 + }, + { + "epoch": 3.921095268956578, + "grad_norm": 0.06834513694047928, + "learning_rate": 4.383635627165939e-06, + "loss": 0.2157, + "step": 48402 + }, + { + "epoch": 3.9211762799740764, + "grad_norm": 0.06227676197886467, + "learning_rate": 4.379134974571313e-06, + "loss": 0.2034, + "step": 48403 + }, + { + "epoch": 3.9212572909915746, + "grad_norm": 0.07426196336746216, + "learning_rate": 4.3746343219766865e-06, + "loss": 0.2606, + "step": 48404 + }, + { + "epoch": 3.9213383020090733, + "grad_norm": 0.07575037330389023, + "learning_rate": 4.370133669382061e-06, + "loss": 0.22, + "step": 48405 + }, + { + "epoch": 3.9214193130265715, + "grad_norm": 0.06229977682232857, + "learning_rate": 4.3656330167874345e-06, + "loss": 0.2217, + "step": 48406 + }, + { + "epoch": 3.92150032404407, + "grad_norm": 0.07139471918344498, + "learning_rate": 4.361132364192808e-06, + "loss": 0.2437, + "step": 48407 + }, + { + "epoch": 3.9215813350615685, + "grad_norm": 0.06911724805831909, + "learning_rate": 4.356631711598182e-06, + "loss": 0.1807, + "step": 48408 + }, + { + "epoch": 3.9216623460790667, + "grad_norm": 0.07534075528383255, + "learning_rate": 4.352131059003556e-06, + "loss": 0.2281, + "step": 48409 + }, + { + "epoch": 3.921743357096565, + "grad_norm": 0.07420405745506287, + "learning_rate": 4.347630406408929e-06, + "loss": 0.2289, + "step": 48410 + }, + { + "epoch": 3.9218243681140637, + "grad_norm": 0.07598921656608582, + "learning_rate": 4.343129753814303e-06, + "loss": 0.2104, + "step": 48411 + }, + { + "epoch": 3.921905379131562, + "grad_norm": 0.061629679054021835, + "learning_rate": 4.338629101219677e-06, + "loss": 0.2472, + "step": 48412 + }, + { + "epoch": 3.92198639014906, + "grad_norm": 0.07201918214559555, + "learning_rate": 4.334128448625051e-06, + "loss": 0.1946, + "step": 48413 + }, + { + "epoch": 3.922067401166559, + "grad_norm": 0.07076304405927658, + "learning_rate": 4.329627796030425e-06, + "loss": 0.221, + "step": 48414 + }, + { + "epoch": 3.922148412184057, + "grad_norm": 0.0704672783613205, + "learning_rate": 4.325127143435799e-06, + "loss": 0.2151, + "step": 48415 + }, + { + "epoch": 3.9222294232015553, + "grad_norm": 0.06875283271074295, + "learning_rate": 4.320626490841172e-06, + "loss": 0.1926, + "step": 48416 + }, + { + "epoch": 3.922310434219054, + "grad_norm": 0.05916054546833038, + "learning_rate": 4.316125838246546e-06, + "loss": 0.2194, + "step": 48417 + }, + { + "epoch": 3.9223914452365523, + "grad_norm": 0.060847945511341095, + "learning_rate": 4.31162518565192e-06, + "loss": 0.2302, + "step": 48418 + }, + { + "epoch": 3.9224724562540505, + "grad_norm": 0.07540497928857803, + "learning_rate": 4.307124533057293e-06, + "loss": 0.2381, + "step": 48419 + }, + { + "epoch": 3.922553467271549, + "grad_norm": 0.07262145727872849, + "learning_rate": 4.3026238804626675e-06, + "loss": 0.209, + "step": 48420 + }, + { + "epoch": 3.9226344782890474, + "grad_norm": 0.061988167464733124, + "learning_rate": 4.298123227868041e-06, + "loss": 0.1932, + "step": 48421 + }, + { + "epoch": 3.9227154893065457, + "grad_norm": 0.06330009549856186, + "learning_rate": 4.293622575273415e-06, + "loss": 0.222, + "step": 48422 + }, + { + "epoch": 3.922796500324044, + "grad_norm": 0.08140560984611511, + "learning_rate": 4.289121922678788e-06, + "loss": 0.2162, + "step": 48423 + }, + { + "epoch": 3.9228775113415426, + "grad_norm": 0.07807426899671555, + "learning_rate": 4.284621270084163e-06, + "loss": 0.2249, + "step": 48424 + }, + { + "epoch": 3.922958522359041, + "grad_norm": 0.07387779653072357, + "learning_rate": 4.2801206174895355e-06, + "loss": 0.2144, + "step": 48425 + }, + { + "epoch": 3.923039533376539, + "grad_norm": 0.06375554203987122, + "learning_rate": 4.27561996489491e-06, + "loss": 0.2085, + "step": 48426 + }, + { + "epoch": 3.9231205443940373, + "grad_norm": 0.07752867043018341, + "learning_rate": 4.2711193123002844e-06, + "loss": 0.2388, + "step": 48427 + }, + { + "epoch": 3.923201555411536, + "grad_norm": 0.07950585335493088, + "learning_rate": 4.266618659705657e-06, + "loss": 0.2315, + "step": 48428 + }, + { + "epoch": 3.9232825664290343, + "grad_norm": 0.0622391402721405, + "learning_rate": 4.262118007111032e-06, + "loss": 0.2157, + "step": 48429 + }, + { + "epoch": 3.9233635774465325, + "grad_norm": 0.0833074301481247, + "learning_rate": 4.257617354516405e-06, + "loss": 0.2555, + "step": 48430 + }, + { + "epoch": 3.923444588464031, + "grad_norm": 0.06539009511470795, + "learning_rate": 4.253116701921779e-06, + "loss": 0.2225, + "step": 48431 + }, + { + "epoch": 3.9235255994815295, + "grad_norm": 0.07489827275276184, + "learning_rate": 4.2486160493271525e-06, + "loss": 0.2581, + "step": 48432 + }, + { + "epoch": 3.9236066104990277, + "grad_norm": 0.06998512893915176, + "learning_rate": 4.244115396732527e-06, + "loss": 0.176, + "step": 48433 + }, + { + "epoch": 3.9236876215165264, + "grad_norm": 0.06104741990566254, + "learning_rate": 4.2396147441379e-06, + "loss": 0.2076, + "step": 48434 + }, + { + "epoch": 3.9237686325340246, + "grad_norm": 0.07176847755908966, + "learning_rate": 4.235114091543274e-06, + "loss": 0.2334, + "step": 48435 + }, + { + "epoch": 3.923849643551523, + "grad_norm": 0.05985512211918831, + "learning_rate": 4.230613438948648e-06, + "loss": 0.2031, + "step": 48436 + }, + { + "epoch": 3.9239306545690216, + "grad_norm": 0.06858471781015396, + "learning_rate": 4.226112786354021e-06, + "loss": 0.249, + "step": 48437 + }, + { + "epoch": 3.92401166558652, + "grad_norm": 0.07977214455604553, + "learning_rate": 4.221612133759395e-06, + "loss": 0.2112, + "step": 48438 + }, + { + "epoch": 3.924092676604018, + "grad_norm": 0.07993137091398239, + "learning_rate": 4.217111481164769e-06, + "loss": 0.1969, + "step": 48439 + }, + { + "epoch": 3.9241736876215167, + "grad_norm": 0.06767088174819946, + "learning_rate": 4.212610828570142e-06, + "loss": 0.2476, + "step": 48440 + }, + { + "epoch": 3.924254698639015, + "grad_norm": 0.0661626085639, + "learning_rate": 4.2081101759755166e-06, + "loss": 0.2613, + "step": 48441 + }, + { + "epoch": 3.9243357096565132, + "grad_norm": 0.07554125785827637, + "learning_rate": 4.203609523380891e-06, + "loss": 0.207, + "step": 48442 + }, + { + "epoch": 3.924416720674012, + "grad_norm": 0.06287429481744766, + "learning_rate": 4.199108870786264e-06, + "loss": 0.2114, + "step": 48443 + }, + { + "epoch": 3.92449773169151, + "grad_norm": 0.07063333690166473, + "learning_rate": 4.194608218191638e-06, + "loss": 0.2113, + "step": 48444 + }, + { + "epoch": 3.9245787427090084, + "grad_norm": 0.074994757771492, + "learning_rate": 4.190107565597012e-06, + "loss": 0.2693, + "step": 48445 + }, + { + "epoch": 3.9246597537265067, + "grad_norm": 0.07786433398723602, + "learning_rate": 4.1856069130023854e-06, + "loss": 0.2375, + "step": 48446 + }, + { + "epoch": 3.9247407647440054, + "grad_norm": 0.08209618926048279, + "learning_rate": 4.181106260407759e-06, + "loss": 0.2616, + "step": 48447 + }, + { + "epoch": 3.9248217757615036, + "grad_norm": 0.060023292899131775, + "learning_rate": 4.1766056078131335e-06, + "loss": 0.2174, + "step": 48448 + }, + { + "epoch": 3.924902786779002, + "grad_norm": 0.08991419523954391, + "learning_rate": 4.172104955218506e-06, + "loss": 0.2533, + "step": 48449 + }, + { + "epoch": 3.9249837977965, + "grad_norm": 0.07790505886077881, + "learning_rate": 4.167604302623881e-06, + "loss": 0.2452, + "step": 48450 + }, + { + "epoch": 3.9250648088139988, + "grad_norm": 0.07460498064756393, + "learning_rate": 4.163103650029254e-06, + "loss": 0.2162, + "step": 48451 + }, + { + "epoch": 3.925145819831497, + "grad_norm": 0.07503633946180344, + "learning_rate": 4.158602997434628e-06, + "loss": 0.1964, + "step": 48452 + }, + { + "epoch": 3.9252268308489953, + "grad_norm": 0.06638554483652115, + "learning_rate": 4.1541023448400015e-06, + "loss": 0.1769, + "step": 48453 + }, + { + "epoch": 3.925307841866494, + "grad_norm": 0.0684497132897377, + "learning_rate": 4.149601692245376e-06, + "loss": 0.1895, + "step": 48454 + }, + { + "epoch": 3.925388852883992, + "grad_norm": 0.06736957281827927, + "learning_rate": 4.1451010396507496e-06, + "loss": 0.2522, + "step": 48455 + }, + { + "epoch": 3.9254698639014904, + "grad_norm": 0.07692014425992966, + "learning_rate": 4.140600387056123e-06, + "loss": 0.2327, + "step": 48456 + }, + { + "epoch": 3.925550874918989, + "grad_norm": 0.07177066057920456, + "learning_rate": 4.136099734461498e-06, + "loss": 0.198, + "step": 48457 + }, + { + "epoch": 3.9256318859364874, + "grad_norm": 0.05650921165943146, + "learning_rate": 4.131599081866871e-06, + "loss": 0.2021, + "step": 48458 + }, + { + "epoch": 3.9257128969539856, + "grad_norm": 0.07662155479192734, + "learning_rate": 4.127098429272245e-06, + "loss": 0.1941, + "step": 48459 + }, + { + "epoch": 3.9257939079714843, + "grad_norm": 0.07912242412567139, + "learning_rate": 4.1225977766776184e-06, + "loss": 0.2419, + "step": 48460 + }, + { + "epoch": 3.9258749189889826, + "grad_norm": 0.09148503839969635, + "learning_rate": 4.118097124082992e-06, + "loss": 0.1989, + "step": 48461 + }, + { + "epoch": 3.925955930006481, + "grad_norm": 0.072085440158844, + "learning_rate": 4.113596471488366e-06, + "loss": 0.2192, + "step": 48462 + }, + { + "epoch": 3.9260369410239795, + "grad_norm": 0.0772864818572998, + "learning_rate": 4.10909581889374e-06, + "loss": 0.2057, + "step": 48463 + }, + { + "epoch": 3.9261179520414777, + "grad_norm": 0.07115405797958374, + "learning_rate": 4.104595166299114e-06, + "loss": 0.2298, + "step": 48464 + }, + { + "epoch": 3.926198963058976, + "grad_norm": 0.08247821033000946, + "learning_rate": 4.100094513704487e-06, + "loss": 0.2068, + "step": 48465 + }, + { + "epoch": 3.9262799740764747, + "grad_norm": 0.09296160936355591, + "learning_rate": 4.095593861109861e-06, + "loss": 0.2661, + "step": 48466 + }, + { + "epoch": 3.926360985093973, + "grad_norm": 0.0736137181520462, + "learning_rate": 4.091093208515235e-06, + "loss": 0.2183, + "step": 48467 + }, + { + "epoch": 3.926441996111471, + "grad_norm": 0.07801380753517151, + "learning_rate": 4.086592555920608e-06, + "loss": 0.2269, + "step": 48468 + }, + { + "epoch": 3.9265230071289694, + "grad_norm": 0.07622180134057999, + "learning_rate": 4.0820919033259826e-06, + "loss": 0.2498, + "step": 48469 + }, + { + "epoch": 3.9266040181464676, + "grad_norm": 0.08292534947395325, + "learning_rate": 4.077591250731356e-06, + "loss": 0.2366, + "step": 48470 + }, + { + "epoch": 3.9266850291639663, + "grad_norm": 0.06659669429063797, + "learning_rate": 4.07309059813673e-06, + "loss": 0.2112, + "step": 48471 + }, + { + "epoch": 3.9267660401814646, + "grad_norm": 0.07388713955879211, + "learning_rate": 4.068589945542104e-06, + "loss": 0.2077, + "step": 48472 + }, + { + "epoch": 3.926847051198963, + "grad_norm": 0.09441567212343216, + "learning_rate": 4.064089292947478e-06, + "loss": 0.2082, + "step": 48473 + }, + { + "epoch": 3.9269280622164615, + "grad_norm": 0.06913454085588455, + "learning_rate": 4.059588640352851e-06, + "loss": 0.2489, + "step": 48474 + }, + { + "epoch": 3.9270090732339598, + "grad_norm": 0.0766616016626358, + "learning_rate": 4.055087987758225e-06, + "loss": 0.2211, + "step": 48475 + }, + { + "epoch": 3.927090084251458, + "grad_norm": 0.07652173191308975, + "learning_rate": 4.0505873351635995e-06, + "loss": 0.1975, + "step": 48476 + }, + { + "epoch": 3.9271710952689567, + "grad_norm": 0.06538225710391998, + "learning_rate": 4.046086682568972e-06, + "loss": 0.2346, + "step": 48477 + }, + { + "epoch": 3.927252106286455, + "grad_norm": 0.06540494412183762, + "learning_rate": 4.041586029974347e-06, + "loss": 0.1988, + "step": 48478 + }, + { + "epoch": 3.927333117303953, + "grad_norm": 0.07058582454919815, + "learning_rate": 4.03708537737972e-06, + "loss": 0.2295, + "step": 48479 + }, + { + "epoch": 3.927414128321452, + "grad_norm": 0.06702663749456406, + "learning_rate": 4.032584724785094e-06, + "loss": 0.2176, + "step": 48480 + }, + { + "epoch": 3.92749513933895, + "grad_norm": 0.07759080082178116, + "learning_rate": 4.0280840721904675e-06, + "loss": 0.2394, + "step": 48481 + }, + { + "epoch": 3.9275761503564484, + "grad_norm": 0.07646369934082031, + "learning_rate": 4.023583419595842e-06, + "loss": 0.2168, + "step": 48482 + }, + { + "epoch": 3.927657161373947, + "grad_norm": 0.0533691830933094, + "learning_rate": 4.019082767001215e-06, + "loss": 0.2078, + "step": 48483 + }, + { + "epoch": 3.9277381723914453, + "grad_norm": 0.0704951360821724, + "learning_rate": 4.014582114406589e-06, + "loss": 0.2282, + "step": 48484 + }, + { + "epoch": 3.9278191834089435, + "grad_norm": 0.07538004964590073, + "learning_rate": 4.010081461811964e-06, + "loss": 0.2543, + "step": 48485 + }, + { + "epoch": 3.9279001944264422, + "grad_norm": 0.08612173050642014, + "learning_rate": 4.005580809217336e-06, + "loss": 0.2632, + "step": 48486 + }, + { + "epoch": 3.9279812054439405, + "grad_norm": 0.07190410047769547, + "learning_rate": 4.001080156622711e-06, + "loss": 0.216, + "step": 48487 + }, + { + "epoch": 3.9280622164614387, + "grad_norm": 0.07013415545225143, + "learning_rate": 3.996579504028084e-06, + "loss": 0.1973, + "step": 48488 + }, + { + "epoch": 3.9281432274789374, + "grad_norm": 0.06593007594347, + "learning_rate": 3.992078851433458e-06, + "loss": 0.2084, + "step": 48489 + }, + { + "epoch": 3.9282242384964356, + "grad_norm": 0.07043427973985672, + "learning_rate": 3.987578198838832e-06, + "loss": 0.2394, + "step": 48490 + }, + { + "epoch": 3.928305249513934, + "grad_norm": 0.06437504291534424, + "learning_rate": 3.983077546244206e-06, + "loss": 0.171, + "step": 48491 + }, + { + "epoch": 3.928386260531432, + "grad_norm": 0.07793059200048447, + "learning_rate": 3.978576893649579e-06, + "loss": 0.1901, + "step": 48492 + }, + { + "epoch": 3.9284672715489304, + "grad_norm": 0.0729973241686821, + "learning_rate": 3.974076241054953e-06, + "loss": 0.1898, + "step": 48493 + }, + { + "epoch": 3.928548282566429, + "grad_norm": 0.08081178367137909, + "learning_rate": 3.969575588460327e-06, + "loss": 0.2452, + "step": 48494 + }, + { + "epoch": 3.9286292935839273, + "grad_norm": 0.08382073789834976, + "learning_rate": 3.9650749358657005e-06, + "loss": 0.2119, + "step": 48495 + }, + { + "epoch": 3.9287103046014256, + "grad_norm": 0.07415829598903656, + "learning_rate": 3.960574283271074e-06, + "loss": 0.1941, + "step": 48496 + }, + { + "epoch": 3.9287913156189243, + "grad_norm": 0.06375744938850403, + "learning_rate": 3.9560736306764485e-06, + "loss": 0.2514, + "step": 48497 + }, + { + "epoch": 3.9288723266364225, + "grad_norm": 0.07081378996372223, + "learning_rate": 3.951572978081822e-06, + "loss": 0.2356, + "step": 48498 + }, + { + "epoch": 3.9289533376539207, + "grad_norm": 0.06521176546812057, + "learning_rate": 3.947072325487196e-06, + "loss": 0.2177, + "step": 48499 + }, + { + "epoch": 3.9290343486714194, + "grad_norm": 0.07624541968107224, + "learning_rate": 3.94257167289257e-06, + "loss": 0.1909, + "step": 48500 + }, + { + "epoch": 3.9291153596889177, + "grad_norm": 0.08679436892271042, + "learning_rate": 3.938071020297943e-06, + "loss": 0.2492, + "step": 48501 + }, + { + "epoch": 3.929196370706416, + "grad_norm": 0.06400018185377121, + "learning_rate": 3.933570367703317e-06, + "loss": 0.2234, + "step": 48502 + }, + { + "epoch": 3.9292773817239146, + "grad_norm": 0.07425360381603241, + "learning_rate": 3.929069715108691e-06, + "loss": 0.2425, + "step": 48503 + }, + { + "epoch": 3.929358392741413, + "grad_norm": 0.08528902381658554, + "learning_rate": 3.924569062514065e-06, + "loss": 0.2177, + "step": 48504 + }, + { + "epoch": 3.929439403758911, + "grad_norm": 0.05845775082707405, + "learning_rate": 3.920068409919438e-06, + "loss": 0.1795, + "step": 48505 + }, + { + "epoch": 3.92952041477641, + "grad_norm": 0.06745034456253052, + "learning_rate": 3.915567757324813e-06, + "loss": 0.2215, + "step": 48506 + }, + { + "epoch": 3.929601425793908, + "grad_norm": 0.07435566931962967, + "learning_rate": 3.911067104730186e-06, + "loss": 0.2253, + "step": 48507 + }, + { + "epoch": 3.9296824368114063, + "grad_norm": 0.08194363862276077, + "learning_rate": 3.90656645213556e-06, + "loss": 0.2381, + "step": 48508 + }, + { + "epoch": 3.929763447828905, + "grad_norm": 0.07673655450344086, + "learning_rate": 3.9020657995409335e-06, + "loss": 0.2508, + "step": 48509 + }, + { + "epoch": 3.929844458846403, + "grad_norm": 0.06954783946275711, + "learning_rate": 3.897565146946307e-06, + "loss": 0.2466, + "step": 48510 + }, + { + "epoch": 3.9299254698639015, + "grad_norm": 0.05643027275800705, + "learning_rate": 3.893064494351681e-06, + "loss": 0.2215, + "step": 48511 + }, + { + "epoch": 3.9300064808814, + "grad_norm": 0.06844652444124222, + "learning_rate": 3.888563841757055e-06, + "loss": 0.2043, + "step": 48512 + }, + { + "epoch": 3.9300874918988984, + "grad_norm": 0.07119164615869522, + "learning_rate": 3.884063189162429e-06, + "loss": 0.2427, + "step": 48513 + }, + { + "epoch": 3.9301685029163966, + "grad_norm": 0.07002280652523041, + "learning_rate": 3.879562536567802e-06, + "loss": 0.2559, + "step": 48514 + }, + { + "epoch": 3.930249513933895, + "grad_norm": 0.06633292883634567, + "learning_rate": 3.875061883973177e-06, + "loss": 0.2434, + "step": 48515 + }, + { + "epoch": 3.930330524951393, + "grad_norm": 0.06863868981599808, + "learning_rate": 3.87056123137855e-06, + "loss": 0.2181, + "step": 48516 + }, + { + "epoch": 3.930411535968892, + "grad_norm": 0.0700906440615654, + "learning_rate": 3.866060578783924e-06, + "loss": 0.2224, + "step": 48517 + }, + { + "epoch": 3.93049254698639, + "grad_norm": 0.07740800082683563, + "learning_rate": 3.861559926189298e-06, + "loss": 0.1978, + "step": 48518 + }, + { + "epoch": 3.9305735580038883, + "grad_norm": 0.07479584217071533, + "learning_rate": 3.857059273594671e-06, + "loss": 0.2067, + "step": 48519 + }, + { + "epoch": 3.930654569021387, + "grad_norm": 0.07812254130840302, + "learning_rate": 3.852558621000045e-06, + "loss": 0.2554, + "step": 48520 + }, + { + "epoch": 3.9307355800388852, + "grad_norm": 0.08209620416164398, + "learning_rate": 3.848057968405419e-06, + "loss": 0.2061, + "step": 48521 + }, + { + "epoch": 3.9308165910563835, + "grad_norm": 0.06698629260063171, + "learning_rate": 3.843557315810793e-06, + "loss": 0.2113, + "step": 48522 + }, + { + "epoch": 3.930897602073882, + "grad_norm": 0.0727633461356163, + "learning_rate": 3.8390566632161664e-06, + "loss": 0.2138, + "step": 48523 + }, + { + "epoch": 3.9309786130913804, + "grad_norm": 0.07083755731582642, + "learning_rate": 3.83455601062154e-06, + "loss": 0.2036, + "step": 48524 + }, + { + "epoch": 3.9310596241088787, + "grad_norm": 0.07174476236104965, + "learning_rate": 3.8300553580269145e-06, + "loss": 0.2266, + "step": 48525 + }, + { + "epoch": 3.9311406351263773, + "grad_norm": 0.07710791379213333, + "learning_rate": 3.825554705432287e-06, + "loss": 0.2302, + "step": 48526 + }, + { + "epoch": 3.9312216461438756, + "grad_norm": 0.07124129682779312, + "learning_rate": 3.821054052837662e-06, + "loss": 0.2003, + "step": 48527 + }, + { + "epoch": 3.931302657161374, + "grad_norm": 0.07960180193185806, + "learning_rate": 3.816553400243035e-06, + "loss": 0.2458, + "step": 48528 + }, + { + "epoch": 3.9313836681788725, + "grad_norm": 0.08520202338695526, + "learning_rate": 3.8120527476484093e-06, + "loss": 0.2287, + "step": 48529 + }, + { + "epoch": 3.9314646791963708, + "grad_norm": 0.06994429975748062, + "learning_rate": 3.8075520950537834e-06, + "loss": 0.2281, + "step": 48530 + }, + { + "epoch": 3.931545690213869, + "grad_norm": 0.07213662564754486, + "learning_rate": 3.8030514424591565e-06, + "loss": 0.1761, + "step": 48531 + }, + { + "epoch": 3.9316267012313677, + "grad_norm": 0.07031689584255219, + "learning_rate": 3.7985507898645306e-06, + "loss": 0.2011, + "step": 48532 + }, + { + "epoch": 3.931707712248866, + "grad_norm": 0.0783347338438034, + "learning_rate": 3.794050137269904e-06, + "loss": 0.2522, + "step": 48533 + }, + { + "epoch": 3.931788723266364, + "grad_norm": 0.07658717036247253, + "learning_rate": 3.789549484675278e-06, + "loss": 0.2342, + "step": 48534 + }, + { + "epoch": 3.931869734283863, + "grad_norm": 0.07086281478404999, + "learning_rate": 3.785048832080652e-06, + "loss": 0.2134, + "step": 48535 + }, + { + "epoch": 3.931950745301361, + "grad_norm": 0.06902775913476944, + "learning_rate": 3.780548179486026e-06, + "loss": 0.1919, + "step": 48536 + }, + { + "epoch": 3.9320317563188594, + "grad_norm": 0.06554694473743439, + "learning_rate": 3.776047526891399e-06, + "loss": 0.2314, + "step": 48537 + }, + { + "epoch": 3.9321127673363576, + "grad_norm": 0.08830972760915756, + "learning_rate": 3.7715468742967735e-06, + "loss": 0.2198, + "step": 48538 + }, + { + "epoch": 3.932193778353856, + "grad_norm": 0.07285527884960175, + "learning_rate": 3.7670462217021466e-06, + "loss": 0.2034, + "step": 48539 + }, + { + "epoch": 3.9322747893713546, + "grad_norm": 0.08034533262252808, + "learning_rate": 3.7625455691075207e-06, + "loss": 0.2121, + "step": 48540 + }, + { + "epoch": 3.932355800388853, + "grad_norm": 0.07474691420793533, + "learning_rate": 3.7580449165128947e-06, + "loss": 0.2456, + "step": 48541 + }, + { + "epoch": 3.932436811406351, + "grad_norm": 0.06668350100517273, + "learning_rate": 3.7535442639182683e-06, + "loss": 0.239, + "step": 48542 + }, + { + "epoch": 3.9325178224238497, + "grad_norm": 0.06283923983573914, + "learning_rate": 3.7490436113236423e-06, + "loss": 0.1962, + "step": 48543 + }, + { + "epoch": 3.932598833441348, + "grad_norm": 0.08116850256919861, + "learning_rate": 3.744542958729016e-06, + "loss": 0.256, + "step": 48544 + }, + { + "epoch": 3.932679844458846, + "grad_norm": 0.07628383487462997, + "learning_rate": 3.74004230613439e-06, + "loss": 0.2212, + "step": 48545 + }, + { + "epoch": 3.932760855476345, + "grad_norm": 0.07699400186538696, + "learning_rate": 3.735541653539763e-06, + "loss": 0.194, + "step": 48546 + }, + { + "epoch": 3.932841866493843, + "grad_norm": 0.08026984333992004, + "learning_rate": 3.7310410009451376e-06, + "loss": 0.1822, + "step": 48547 + }, + { + "epoch": 3.9329228775113414, + "grad_norm": 0.0703667625784874, + "learning_rate": 3.7265403483505108e-06, + "loss": 0.2552, + "step": 48548 + }, + { + "epoch": 3.93300388852884, + "grad_norm": 0.06922667473554611, + "learning_rate": 3.722039695755885e-06, + "loss": 0.2323, + "step": 48549 + }, + { + "epoch": 3.9330848995463383, + "grad_norm": 0.07332278788089752, + "learning_rate": 3.7175390431612584e-06, + "loss": 0.2274, + "step": 48550 + }, + { + "epoch": 3.9331659105638366, + "grad_norm": 0.07613363862037659, + "learning_rate": 3.7130383905666324e-06, + "loss": 0.2087, + "step": 48551 + }, + { + "epoch": 3.9332469215813353, + "grad_norm": 0.09742595255374908, + "learning_rate": 3.708537737972006e-06, + "loss": 0.2368, + "step": 48552 + }, + { + "epoch": 3.9333279325988335, + "grad_norm": 0.0734672024846077, + "learning_rate": 3.70403708537738e-06, + "loss": 0.2262, + "step": 48553 + }, + { + "epoch": 3.9334089436163318, + "grad_norm": 0.06810317933559418, + "learning_rate": 3.6995364327827532e-06, + "loss": 0.2113, + "step": 48554 + }, + { + "epoch": 3.9334899546338304, + "grad_norm": 0.07320161908864975, + "learning_rate": 3.6950357801881273e-06, + "loss": 0.2495, + "step": 48555 + }, + { + "epoch": 3.9335709656513287, + "grad_norm": 0.07830044627189636, + "learning_rate": 3.6905351275935017e-06, + "loss": 0.1991, + "step": 48556 + }, + { + "epoch": 3.933651976668827, + "grad_norm": 0.06398749351501465, + "learning_rate": 3.686034474998875e-06, + "loss": 0.2219, + "step": 48557 + }, + { + "epoch": 3.933732987686325, + "grad_norm": 0.07395961135625839, + "learning_rate": 3.681533822404249e-06, + "loss": 0.211, + "step": 48558 + }, + { + "epoch": 3.933813998703824, + "grad_norm": 0.0828605592250824, + "learning_rate": 3.6770331698096225e-06, + "loss": 0.2104, + "step": 48559 + }, + { + "epoch": 3.933895009721322, + "grad_norm": 0.06945844739675522, + "learning_rate": 3.6725325172149965e-06, + "loss": 0.2022, + "step": 48560 + }, + { + "epoch": 3.9339760207388204, + "grad_norm": 0.08338592201471329, + "learning_rate": 3.66803186462037e-06, + "loss": 0.2406, + "step": 48561 + }, + { + "epoch": 3.9340570317563186, + "grad_norm": 0.07173708081245422, + "learning_rate": 3.663531212025744e-06, + "loss": 0.2097, + "step": 48562 + }, + { + "epoch": 3.9341380427738173, + "grad_norm": 0.06706194579601288, + "learning_rate": 3.6590305594311174e-06, + "loss": 0.2191, + "step": 48563 + }, + { + "epoch": 3.9342190537913155, + "grad_norm": 0.06909432262182236, + "learning_rate": 3.654529906836492e-06, + "loss": 0.226, + "step": 48564 + }, + { + "epoch": 3.934300064808814, + "grad_norm": 0.07070121169090271, + "learning_rate": 3.650029254241865e-06, + "loss": 0.2187, + "step": 48565 + }, + { + "epoch": 3.9343810758263125, + "grad_norm": 0.07241905480623245, + "learning_rate": 3.645528601647239e-06, + "loss": 0.2526, + "step": 48566 + }, + { + "epoch": 3.9344620868438107, + "grad_norm": 0.059708014130592346, + "learning_rate": 3.6410279490526126e-06, + "loss": 0.2184, + "step": 48567 + }, + { + "epoch": 3.934543097861309, + "grad_norm": 0.08609946072101593, + "learning_rate": 3.6365272964579866e-06, + "loss": 0.2065, + "step": 48568 + }, + { + "epoch": 3.9346241088788076, + "grad_norm": 0.07386572659015656, + "learning_rate": 3.63202664386336e-06, + "loss": 0.2091, + "step": 48569 + }, + { + "epoch": 3.934705119896306, + "grad_norm": 0.06636470556259155, + "learning_rate": 3.6275259912687343e-06, + "loss": 0.2014, + "step": 48570 + }, + { + "epoch": 3.934786130913804, + "grad_norm": 0.07500702887773514, + "learning_rate": 3.6230253386741083e-06, + "loss": 0.2051, + "step": 48571 + }, + { + "epoch": 3.934867141931303, + "grad_norm": 0.06589839607477188, + "learning_rate": 3.6185246860794815e-06, + "loss": 0.2176, + "step": 48572 + }, + { + "epoch": 3.934948152948801, + "grad_norm": 0.06833402067422867, + "learning_rate": 3.614024033484856e-06, + "loss": 0.2436, + "step": 48573 + }, + { + "epoch": 3.9350291639662993, + "grad_norm": 0.07177960872650146, + "learning_rate": 3.609523380890229e-06, + "loss": 0.263, + "step": 48574 + }, + { + "epoch": 3.935110174983798, + "grad_norm": 0.06268391013145447, + "learning_rate": 3.605022728295603e-06, + "loss": 0.2019, + "step": 48575 + }, + { + "epoch": 3.9351911860012962, + "grad_norm": 0.0845642238855362, + "learning_rate": 3.6005220757009767e-06, + "loss": 0.2184, + "step": 48576 + }, + { + "epoch": 3.9352721970187945, + "grad_norm": 0.05564136430621147, + "learning_rate": 3.5960214231063508e-06, + "loss": 0.2024, + "step": 48577 + }, + { + "epoch": 3.935353208036293, + "grad_norm": 0.06291015446186066, + "learning_rate": 3.5915207705117244e-06, + "loss": 0.2304, + "step": 48578 + }, + { + "epoch": 3.9354342190537914, + "grad_norm": 0.07351517677307129, + "learning_rate": 3.5870201179170984e-06, + "loss": 0.234, + "step": 48579 + }, + { + "epoch": 3.9355152300712897, + "grad_norm": 0.07256698608398438, + "learning_rate": 3.5825194653224716e-06, + "loss": 0.2334, + "step": 48580 + }, + { + "epoch": 3.935596241088788, + "grad_norm": 0.08631699532270432, + "learning_rate": 3.5780188127278456e-06, + "loss": 0.2292, + "step": 48581 + }, + { + "epoch": 3.9356772521062866, + "grad_norm": 0.06245209649205208, + "learning_rate": 3.573518160133219e-06, + "loss": 0.2052, + "step": 48582 + }, + { + "epoch": 3.935758263123785, + "grad_norm": 0.06966308504343033, + "learning_rate": 3.5690175075385932e-06, + "loss": 0.2312, + "step": 48583 + }, + { + "epoch": 3.935839274141283, + "grad_norm": 0.057079412043094635, + "learning_rate": 3.564516854943967e-06, + "loss": 0.2015, + "step": 48584 + }, + { + "epoch": 3.9359202851587813, + "grad_norm": 0.0594225637614727, + "learning_rate": 3.560016202349341e-06, + "loss": 0.2156, + "step": 48585 + }, + { + "epoch": 3.93600129617628, + "grad_norm": 0.06722415238618851, + "learning_rate": 3.555515549754715e-06, + "loss": 0.1988, + "step": 48586 + }, + { + "epoch": 3.9360823071937783, + "grad_norm": 0.06993760913610458, + "learning_rate": 3.5510148971600885e-06, + "loss": 0.2112, + "step": 48587 + }, + { + "epoch": 3.9361633182112765, + "grad_norm": 0.08021251857280731, + "learning_rate": 3.5465142445654625e-06, + "loss": 0.2323, + "step": 48588 + }, + { + "epoch": 3.936244329228775, + "grad_norm": 0.07788848876953125, + "learning_rate": 3.5420135919708357e-06, + "loss": 0.2113, + "step": 48589 + }, + { + "epoch": 3.9363253402462735, + "grad_norm": 0.06252850592136383, + "learning_rate": 3.5375129393762097e-06, + "loss": 0.214, + "step": 48590 + }, + { + "epoch": 3.9364063512637717, + "grad_norm": 0.0875447690486908, + "learning_rate": 3.5330122867815833e-06, + "loss": 0.2773, + "step": 48591 + }, + { + "epoch": 3.9364873622812704, + "grad_norm": 0.067319855093956, + "learning_rate": 3.5285116341869574e-06, + "loss": 0.2524, + "step": 48592 + }, + { + "epoch": 3.9365683732987686, + "grad_norm": 0.06517969071865082, + "learning_rate": 3.524010981592331e-06, + "loss": 0.2369, + "step": 48593 + }, + { + "epoch": 3.936649384316267, + "grad_norm": 0.06585026532411575, + "learning_rate": 3.519510328997705e-06, + "loss": 0.2006, + "step": 48594 + }, + { + "epoch": 3.9367303953337656, + "grad_norm": 0.0712442547082901, + "learning_rate": 3.515009676403078e-06, + "loss": 0.1914, + "step": 48595 + }, + { + "epoch": 3.936811406351264, + "grad_norm": 0.07155847549438477, + "learning_rate": 3.5105090238084526e-06, + "loss": 0.2596, + "step": 48596 + }, + { + "epoch": 3.936892417368762, + "grad_norm": 0.0668448656797409, + "learning_rate": 3.506008371213826e-06, + "loss": 0.2167, + "step": 48597 + }, + { + "epoch": 3.9369734283862607, + "grad_norm": 0.06561889499425888, + "learning_rate": 3.5015077186192e-06, + "loss": 0.1859, + "step": 48598 + }, + { + "epoch": 3.937054439403759, + "grad_norm": 0.05717025324702263, + "learning_rate": 3.4970070660245743e-06, + "loss": 0.2039, + "step": 48599 + }, + { + "epoch": 3.9371354504212572, + "grad_norm": 0.0749245211482048, + "learning_rate": 3.4925064134299475e-06, + "loss": 0.2545, + "step": 48600 + }, + { + "epoch": 3.937216461438756, + "grad_norm": 0.0730648785829544, + "learning_rate": 3.4880057608353215e-06, + "loss": 0.2204, + "step": 48601 + }, + { + "epoch": 3.937297472456254, + "grad_norm": 0.06548863649368286, + "learning_rate": 3.483505108240695e-06, + "loss": 0.2004, + "step": 48602 + }, + { + "epoch": 3.9373784834737524, + "grad_norm": 0.06222781166434288, + "learning_rate": 3.479004455646069e-06, + "loss": 0.1914, + "step": 48603 + }, + { + "epoch": 3.9374594944912507, + "grad_norm": 0.061245959252119064, + "learning_rate": 3.4745038030514423e-06, + "loss": 0.2335, + "step": 48604 + }, + { + "epoch": 3.9375405055087493, + "grad_norm": 0.09785652905702591, + "learning_rate": 3.4700031504568167e-06, + "loss": 0.2717, + "step": 48605 + }, + { + "epoch": 3.9376215165262476, + "grad_norm": 0.07860812544822693, + "learning_rate": 3.46550249786219e-06, + "loss": 0.2264, + "step": 48606 + }, + { + "epoch": 3.937702527543746, + "grad_norm": 0.08016818761825562, + "learning_rate": 3.461001845267564e-06, + "loss": 0.2651, + "step": 48607 + }, + { + "epoch": 3.937783538561244, + "grad_norm": 0.07426834851503372, + "learning_rate": 3.4565011926729376e-06, + "loss": 0.2745, + "step": 48608 + }, + { + "epoch": 3.9378645495787428, + "grad_norm": 0.0777416005730629, + "learning_rate": 3.4520005400783116e-06, + "loss": 0.2511, + "step": 48609 + }, + { + "epoch": 3.937945560596241, + "grad_norm": 0.08751657605171204, + "learning_rate": 3.447499887483685e-06, + "loss": 0.2446, + "step": 48610 + }, + { + "epoch": 3.9380265716137393, + "grad_norm": 0.08031098544597626, + "learning_rate": 3.442999234889059e-06, + "loss": 0.2586, + "step": 48611 + }, + { + "epoch": 3.938107582631238, + "grad_norm": 0.0816061943769455, + "learning_rate": 3.4384985822944324e-06, + "loss": 0.2311, + "step": 48612 + }, + { + "epoch": 3.938188593648736, + "grad_norm": 0.07019905745983124, + "learning_rate": 3.433997929699807e-06, + "loss": 0.2114, + "step": 48613 + }, + { + "epoch": 3.9382696046662344, + "grad_norm": 0.061768822371959686, + "learning_rate": 3.429497277105181e-06, + "loss": 0.2184, + "step": 48614 + }, + { + "epoch": 3.938350615683733, + "grad_norm": 0.07346883416175842, + "learning_rate": 3.424996624510554e-06, + "loss": 0.2302, + "step": 48615 + }, + { + "epoch": 3.9384316267012314, + "grad_norm": 0.078464575111866, + "learning_rate": 3.420495971915928e-06, + "loss": 0.1974, + "step": 48616 + }, + { + "epoch": 3.9385126377187296, + "grad_norm": 0.08362342417240143, + "learning_rate": 3.4159953193213017e-06, + "loss": 0.2104, + "step": 48617 + }, + { + "epoch": 3.9385936487362283, + "grad_norm": 0.07425287365913391, + "learning_rate": 3.4114946667266757e-06, + "loss": 0.1987, + "step": 48618 + }, + { + "epoch": 3.9386746597537265, + "grad_norm": 0.05362372472882271, + "learning_rate": 3.4069940141320493e-06, + "loss": 0.1655, + "step": 48619 + }, + { + "epoch": 3.938755670771225, + "grad_norm": 0.0676586851477623, + "learning_rate": 3.4024933615374233e-06, + "loss": 0.1871, + "step": 48620 + }, + { + "epoch": 3.9388366817887235, + "grad_norm": 0.06317394226789474, + "learning_rate": 3.3979927089427965e-06, + "loss": 0.2735, + "step": 48621 + }, + { + "epoch": 3.9389176928062217, + "grad_norm": 0.06809334456920624, + "learning_rate": 3.393492056348171e-06, + "loss": 0.2171, + "step": 48622 + }, + { + "epoch": 3.93899870382372, + "grad_norm": 0.0660707876086235, + "learning_rate": 3.388991403753544e-06, + "loss": 0.2299, + "step": 48623 + }, + { + "epoch": 3.9390797148412187, + "grad_norm": 0.08089926093816757, + "learning_rate": 3.384490751158918e-06, + "loss": 0.2247, + "step": 48624 + }, + { + "epoch": 3.939160725858717, + "grad_norm": 0.06330505758523941, + "learning_rate": 3.3799900985642918e-06, + "loss": 0.2135, + "step": 48625 + }, + { + "epoch": 3.939241736876215, + "grad_norm": 0.06716379523277283, + "learning_rate": 3.375489445969666e-06, + "loss": 0.232, + "step": 48626 + }, + { + "epoch": 3.9393227478937134, + "grad_norm": 0.06354763358831406, + "learning_rate": 3.3709887933750394e-06, + "loss": 0.2079, + "step": 48627 + }, + { + "epoch": 3.939403758911212, + "grad_norm": 0.07505752146244049, + "learning_rate": 3.3664881407804134e-06, + "loss": 0.2218, + "step": 48628 + }, + { + "epoch": 3.9394847699287103, + "grad_norm": 0.08010073751211166, + "learning_rate": 3.3619874881857875e-06, + "loss": 0.2033, + "step": 48629 + }, + { + "epoch": 3.9395657809462086, + "grad_norm": 0.07689400762319565, + "learning_rate": 3.3574868355911606e-06, + "loss": 0.2303, + "step": 48630 + }, + { + "epoch": 3.939646791963707, + "grad_norm": 0.0748937800526619, + "learning_rate": 3.352986182996535e-06, + "loss": 0.2066, + "step": 48631 + }, + { + "epoch": 3.9397278029812055, + "grad_norm": 0.08342572301626205, + "learning_rate": 3.3484855304019083e-06, + "loss": 0.2457, + "step": 48632 + }, + { + "epoch": 3.9398088139987038, + "grad_norm": 0.08893360197544098, + "learning_rate": 3.3439848778072823e-06, + "loss": 0.2338, + "step": 48633 + }, + { + "epoch": 3.939889825016202, + "grad_norm": 0.06578105688095093, + "learning_rate": 3.339484225212656e-06, + "loss": 0.2092, + "step": 48634 + }, + { + "epoch": 3.9399708360337007, + "grad_norm": 0.06503915041685104, + "learning_rate": 3.33498357261803e-06, + "loss": 0.2476, + "step": 48635 + }, + { + "epoch": 3.940051847051199, + "grad_norm": 0.0719245970249176, + "learning_rate": 3.3304829200234035e-06, + "loss": 0.2677, + "step": 48636 + }, + { + "epoch": 3.940132858068697, + "grad_norm": 0.06504833698272705, + "learning_rate": 3.3259822674287776e-06, + "loss": 0.2252, + "step": 48637 + }, + { + "epoch": 3.940213869086196, + "grad_norm": 0.08234488219022751, + "learning_rate": 3.3214816148341507e-06, + "loss": 0.2319, + "step": 48638 + }, + { + "epoch": 3.940294880103694, + "grad_norm": 0.06820017844438553, + "learning_rate": 3.3169809622395248e-06, + "loss": 0.1929, + "step": 48639 + }, + { + "epoch": 3.9403758911211924, + "grad_norm": 0.05836334079504013, + "learning_rate": 3.3124803096448984e-06, + "loss": 0.2172, + "step": 48640 + }, + { + "epoch": 3.940456902138691, + "grad_norm": 0.06549455970525742, + "learning_rate": 3.3079796570502724e-06, + "loss": 0.2515, + "step": 48641 + }, + { + "epoch": 3.9405379131561893, + "grad_norm": 0.07402468472719193, + "learning_rate": 3.3034790044556464e-06, + "loss": 0.2103, + "step": 48642 + }, + { + "epoch": 3.9406189241736875, + "grad_norm": 0.06810709089040756, + "learning_rate": 3.29897835186102e-06, + "loss": 0.2017, + "step": 48643 + }, + { + "epoch": 3.940699935191186, + "grad_norm": 0.06563235074281693, + "learning_rate": 3.294477699266394e-06, + "loss": 0.2042, + "step": 48644 + }, + { + "epoch": 3.9407809462086845, + "grad_norm": 0.08226774632930756, + "learning_rate": 3.2899770466717677e-06, + "loss": 0.2507, + "step": 48645 + }, + { + "epoch": 3.9408619572261827, + "grad_norm": 0.06738846004009247, + "learning_rate": 3.2854763940771417e-06, + "loss": 0.2144, + "step": 48646 + }, + { + "epoch": 3.9409429682436814, + "grad_norm": 0.06427872180938721, + "learning_rate": 3.280975741482515e-06, + "loss": 0.2197, + "step": 48647 + }, + { + "epoch": 3.9410239792611796, + "grad_norm": 0.07773533463478088, + "learning_rate": 3.276475088887889e-06, + "loss": 0.2123, + "step": 48648 + }, + { + "epoch": 3.941104990278678, + "grad_norm": 0.08229377865791321, + "learning_rate": 3.2719744362932625e-06, + "loss": 0.2247, + "step": 48649 + }, + { + "epoch": 3.941186001296176, + "grad_norm": 0.0808233693242073, + "learning_rate": 3.2674737836986365e-06, + "loss": 0.2419, + "step": 48650 + }, + { + "epoch": 3.941267012313675, + "grad_norm": 0.08312023431062698, + "learning_rate": 3.26297313110401e-06, + "loss": 0.2376, + "step": 48651 + }, + { + "epoch": 3.941348023331173, + "grad_norm": 0.0641668364405632, + "learning_rate": 3.258472478509384e-06, + "loss": 0.2425, + "step": 48652 + }, + { + "epoch": 3.9414290343486713, + "grad_norm": 0.06582210958003998, + "learning_rate": 3.2539718259147573e-06, + "loss": 0.2453, + "step": 48653 + }, + { + "epoch": 3.9415100453661696, + "grad_norm": 0.07841324061155319, + "learning_rate": 3.2494711733201318e-06, + "loss": 0.2307, + "step": 48654 + }, + { + "epoch": 3.9415910563836682, + "grad_norm": 0.0673704668879509, + "learning_rate": 3.244970520725505e-06, + "loss": 0.2163, + "step": 48655 + }, + { + "epoch": 3.9416720674011665, + "grad_norm": 0.0752340778708458, + "learning_rate": 3.240469868130879e-06, + "loss": 0.2501, + "step": 48656 + }, + { + "epoch": 3.9417530784186647, + "grad_norm": 0.07765194028615952, + "learning_rate": 3.2359692155362534e-06, + "loss": 0.2192, + "step": 48657 + }, + { + "epoch": 3.9418340894361634, + "grad_norm": 0.06525509059429169, + "learning_rate": 3.2314685629416266e-06, + "loss": 0.223, + "step": 48658 + }, + { + "epoch": 3.9419151004536617, + "grad_norm": 0.06445495784282684, + "learning_rate": 3.2269679103470006e-06, + "loss": 0.2472, + "step": 48659 + }, + { + "epoch": 3.94199611147116, + "grad_norm": 0.06818129867315292, + "learning_rate": 3.2224672577523742e-06, + "loss": 0.2138, + "step": 48660 + }, + { + "epoch": 3.9420771224886586, + "grad_norm": 0.07735411822795868, + "learning_rate": 3.2179666051577483e-06, + "loss": 0.2252, + "step": 48661 + }, + { + "epoch": 3.942158133506157, + "grad_norm": 0.07455040514469147, + "learning_rate": 3.213465952563122e-06, + "loss": 0.2383, + "step": 48662 + }, + { + "epoch": 3.942239144523655, + "grad_norm": 0.07096820324659348, + "learning_rate": 3.208965299968496e-06, + "loss": 0.2368, + "step": 48663 + }, + { + "epoch": 3.942320155541154, + "grad_norm": 0.06698424369096756, + "learning_rate": 3.204464647373869e-06, + "loss": 0.2426, + "step": 48664 + }, + { + "epoch": 3.942401166558652, + "grad_norm": 0.05881981924176216, + "learning_rate": 3.199963994779243e-06, + "loss": 0.1789, + "step": 48665 + }, + { + "epoch": 3.9424821775761503, + "grad_norm": 0.07194176316261292, + "learning_rate": 3.1954633421846167e-06, + "loss": 0.2194, + "step": 48666 + }, + { + "epoch": 3.942563188593649, + "grad_norm": 0.06709557771682739, + "learning_rate": 3.1909626895899907e-06, + "loss": 0.2023, + "step": 48667 + }, + { + "epoch": 3.942644199611147, + "grad_norm": 0.08858643472194672, + "learning_rate": 3.1864620369953643e-06, + "loss": 0.2341, + "step": 48668 + }, + { + "epoch": 3.9427252106286454, + "grad_norm": 0.08386151492595673, + "learning_rate": 3.1819613844007384e-06, + "loss": 0.2279, + "step": 48669 + }, + { + "epoch": 3.942806221646144, + "grad_norm": 0.07175328582525253, + "learning_rate": 3.1774607318061115e-06, + "loss": 0.2233, + "step": 48670 + }, + { + "epoch": 3.9428872326636424, + "grad_norm": 0.0729939341545105, + "learning_rate": 3.172960079211486e-06, + "loss": 0.2228, + "step": 48671 + }, + { + "epoch": 3.9429682436811406, + "grad_norm": 0.07817017287015915, + "learning_rate": 3.16845942661686e-06, + "loss": 0.23, + "step": 48672 + }, + { + "epoch": 3.943049254698639, + "grad_norm": 0.06868911534547806, + "learning_rate": 3.163958774022233e-06, + "loss": 0.2288, + "step": 48673 + }, + { + "epoch": 3.943130265716137, + "grad_norm": 0.07988391816616058, + "learning_rate": 3.1594581214276072e-06, + "loss": 0.2186, + "step": 48674 + }, + { + "epoch": 3.943211276733636, + "grad_norm": 0.062225591391325, + "learning_rate": 3.154957468832981e-06, + "loss": 0.2169, + "step": 48675 + }, + { + "epoch": 3.943292287751134, + "grad_norm": 0.07657849788665771, + "learning_rate": 3.150456816238355e-06, + "loss": 0.2065, + "step": 48676 + }, + { + "epoch": 3.9433732987686323, + "grad_norm": 0.09984945505857468, + "learning_rate": 3.1459561636437285e-06, + "loss": 0.2521, + "step": 48677 + }, + { + "epoch": 3.943454309786131, + "grad_norm": 0.08694443851709366, + "learning_rate": 3.1414555110491025e-06, + "loss": 0.2002, + "step": 48678 + }, + { + "epoch": 3.9435353208036292, + "grad_norm": 0.06631725281476974, + "learning_rate": 3.1369548584544757e-06, + "loss": 0.1917, + "step": 48679 + }, + { + "epoch": 3.9436163318211275, + "grad_norm": 0.06001517176628113, + "learning_rate": 3.13245420585985e-06, + "loss": 0.203, + "step": 48680 + }, + { + "epoch": 3.943697342838626, + "grad_norm": 0.060536958277225494, + "learning_rate": 3.1279535532652233e-06, + "loss": 0.2354, + "step": 48681 + }, + { + "epoch": 3.9437783538561244, + "grad_norm": 0.07151622325181961, + "learning_rate": 3.1234529006705973e-06, + "loss": 0.2221, + "step": 48682 + }, + { + "epoch": 3.9438593648736227, + "grad_norm": 0.07806843519210815, + "learning_rate": 3.1189522480759714e-06, + "loss": 0.1761, + "step": 48683 + }, + { + "epoch": 3.9439403758911213, + "grad_norm": 0.061488077044487, + "learning_rate": 3.114451595481345e-06, + "loss": 0.1776, + "step": 48684 + }, + { + "epoch": 3.9440213869086196, + "grad_norm": 0.08169107884168625, + "learning_rate": 3.1099509428867186e-06, + "loss": 0.2266, + "step": 48685 + }, + { + "epoch": 3.944102397926118, + "grad_norm": 0.0697530210018158, + "learning_rate": 3.1054502902920926e-06, + "loss": 0.1994, + "step": 48686 + }, + { + "epoch": 3.9441834089436165, + "grad_norm": 0.06424905359745026, + "learning_rate": 3.100949637697466e-06, + "loss": 0.2624, + "step": 48687 + }, + { + "epoch": 3.9442644199611148, + "grad_norm": 0.0686437338590622, + "learning_rate": 3.09644898510284e-06, + "loss": 0.199, + "step": 48688 + }, + { + "epoch": 3.944345430978613, + "grad_norm": 0.07491966336965561, + "learning_rate": 3.091948332508214e-06, + "loss": 0.2548, + "step": 48689 + }, + { + "epoch": 3.9444264419961117, + "grad_norm": 0.08908192068338394, + "learning_rate": 3.0874476799135874e-06, + "loss": 0.2572, + "step": 48690 + }, + { + "epoch": 3.94450745301361, + "grad_norm": 0.07038780301809311, + "learning_rate": 3.0829470273189614e-06, + "loss": 0.2381, + "step": 48691 + }, + { + "epoch": 3.944588464031108, + "grad_norm": 0.06175041198730469, + "learning_rate": 3.078446374724335e-06, + "loss": 0.251, + "step": 48692 + }, + { + "epoch": 3.944669475048607, + "grad_norm": 0.07298500090837479, + "learning_rate": 3.073945722129709e-06, + "loss": 0.2434, + "step": 48693 + }, + { + "epoch": 3.944750486066105, + "grad_norm": 0.07891558855772018, + "learning_rate": 3.0694450695350827e-06, + "loss": 0.246, + "step": 48694 + }, + { + "epoch": 3.9448314970836034, + "grad_norm": 0.07210226356983185, + "learning_rate": 3.0649444169404567e-06, + "loss": 0.2258, + "step": 48695 + }, + { + "epoch": 3.9449125081011016, + "grad_norm": 0.07565312087535858, + "learning_rate": 3.0604437643458303e-06, + "loss": 0.2653, + "step": 48696 + }, + { + "epoch": 3.9449935191186, + "grad_norm": 0.06832912564277649, + "learning_rate": 3.055943111751204e-06, + "loss": 0.1948, + "step": 48697 + }, + { + "epoch": 3.9450745301360985, + "grad_norm": 0.07523760199546814, + "learning_rate": 3.051442459156578e-06, + "loss": 0.2343, + "step": 48698 + }, + { + "epoch": 3.945155541153597, + "grad_norm": 0.0894690528512001, + "learning_rate": 3.0469418065619515e-06, + "loss": 0.2323, + "step": 48699 + }, + { + "epoch": 3.945236552171095, + "grad_norm": 0.06247549131512642, + "learning_rate": 3.0424411539673256e-06, + "loss": 0.2019, + "step": 48700 + }, + { + "epoch": 3.9453175631885937, + "grad_norm": 0.06587101519107819, + "learning_rate": 3.037940501372699e-06, + "loss": 0.2405, + "step": 48701 + }, + { + "epoch": 3.945398574206092, + "grad_norm": 0.07532519102096558, + "learning_rate": 3.0334398487780728e-06, + "loss": 0.1945, + "step": 48702 + }, + { + "epoch": 3.94547958522359, + "grad_norm": 0.06462553888559341, + "learning_rate": 3.028939196183447e-06, + "loss": 0.2456, + "step": 48703 + }, + { + "epoch": 3.945560596241089, + "grad_norm": 0.05730045959353447, + "learning_rate": 3.0244385435888204e-06, + "loss": 0.1876, + "step": 48704 + }, + { + "epoch": 3.945641607258587, + "grad_norm": 0.08065466582775116, + "learning_rate": 3.019937890994194e-06, + "loss": 0.2045, + "step": 48705 + }, + { + "epoch": 3.9457226182760854, + "grad_norm": 0.08913881331682205, + "learning_rate": 3.015437238399568e-06, + "loss": 0.2335, + "step": 48706 + }, + { + "epoch": 3.945803629293584, + "grad_norm": 0.0686987042427063, + "learning_rate": 3.010936585804942e-06, + "loss": 0.2315, + "step": 48707 + }, + { + "epoch": 3.9458846403110823, + "grad_norm": 0.07149714231491089, + "learning_rate": 3.0064359332103157e-06, + "loss": 0.1983, + "step": 48708 + }, + { + "epoch": 3.9459656513285806, + "grad_norm": 0.09264829754829407, + "learning_rate": 3.0019352806156897e-06, + "loss": 0.2337, + "step": 48709 + }, + { + "epoch": 3.9460466623460793, + "grad_norm": 0.06271969527006149, + "learning_rate": 2.9974346280210633e-06, + "loss": 0.2145, + "step": 48710 + }, + { + "epoch": 3.9461276733635775, + "grad_norm": 0.06298379600048065, + "learning_rate": 2.992933975426437e-06, + "loss": 0.1905, + "step": 48711 + }, + { + "epoch": 3.9462086843810757, + "grad_norm": 0.09435836970806122, + "learning_rate": 2.988433322831811e-06, + "loss": 0.2418, + "step": 48712 + }, + { + "epoch": 3.9462896953985744, + "grad_norm": 0.06588099151849747, + "learning_rate": 2.9839326702371845e-06, + "loss": 0.2022, + "step": 48713 + }, + { + "epoch": 3.9463707064160727, + "grad_norm": 0.06596425175666809, + "learning_rate": 2.979432017642558e-06, + "loss": 0.1856, + "step": 48714 + }, + { + "epoch": 3.946451717433571, + "grad_norm": 0.06752095371484756, + "learning_rate": 2.974931365047932e-06, + "loss": 0.2004, + "step": 48715 + }, + { + "epoch": 3.9465327284510696, + "grad_norm": 0.07633611559867859, + "learning_rate": 2.9704307124533058e-06, + "loss": 0.2349, + "step": 48716 + }, + { + "epoch": 3.946613739468568, + "grad_norm": 0.08213532716035843, + "learning_rate": 2.9659300598586794e-06, + "loss": 0.2315, + "step": 48717 + }, + { + "epoch": 3.946694750486066, + "grad_norm": 0.06958873569965363, + "learning_rate": 2.9614294072640534e-06, + "loss": 0.2222, + "step": 48718 + }, + { + "epoch": 3.9467757615035644, + "grad_norm": 0.0933084487915039, + "learning_rate": 2.956928754669427e-06, + "loss": 0.2254, + "step": 48719 + }, + { + "epoch": 3.9468567725210626, + "grad_norm": 0.07122278958559036, + "learning_rate": 2.952428102074801e-06, + "loss": 0.2559, + "step": 48720 + }, + { + "epoch": 3.9469377835385613, + "grad_norm": 0.08493492752313614, + "learning_rate": 2.947927449480175e-06, + "loss": 0.2415, + "step": 48721 + }, + { + "epoch": 3.9470187945560595, + "grad_norm": 0.07494331151247025, + "learning_rate": 2.9434267968855487e-06, + "loss": 0.2334, + "step": 48722 + }, + { + "epoch": 3.9470998055735578, + "grad_norm": 0.07146378606557846, + "learning_rate": 2.9389261442909223e-06, + "loss": 0.217, + "step": 48723 + }, + { + "epoch": 3.9471808165910565, + "grad_norm": 0.05992691218852997, + "learning_rate": 2.9344254916962963e-06, + "loss": 0.2004, + "step": 48724 + }, + { + "epoch": 3.9472618276085547, + "grad_norm": 0.07522998005151749, + "learning_rate": 2.92992483910167e-06, + "loss": 0.1857, + "step": 48725 + }, + { + "epoch": 3.947342838626053, + "grad_norm": 0.07378930598497391, + "learning_rate": 2.925424186507044e-06, + "loss": 0.1981, + "step": 48726 + }, + { + "epoch": 3.9474238496435516, + "grad_norm": 0.08331488817930222, + "learning_rate": 2.9209235339124175e-06, + "loss": 0.2355, + "step": 48727 + }, + { + "epoch": 3.94750486066105, + "grad_norm": 0.08735651522874832, + "learning_rate": 2.916422881317791e-06, + "loss": 0.2182, + "step": 48728 + }, + { + "epoch": 3.947585871678548, + "grad_norm": 0.06788183748722076, + "learning_rate": 2.911922228723165e-06, + "loss": 0.2102, + "step": 48729 + }, + { + "epoch": 3.947666882696047, + "grad_norm": 0.0716099888086319, + "learning_rate": 2.9074215761285388e-06, + "loss": 0.2277, + "step": 48730 + }, + { + "epoch": 3.947747893713545, + "grad_norm": 0.08619321882724762, + "learning_rate": 2.9029209235339124e-06, + "loss": 0.2234, + "step": 48731 + }, + { + "epoch": 3.9478289047310433, + "grad_norm": 0.0653134360909462, + "learning_rate": 2.8984202709392864e-06, + "loss": 0.2295, + "step": 48732 + }, + { + "epoch": 3.947909915748542, + "grad_norm": 0.08110470324754715, + "learning_rate": 2.89391961834466e-06, + "loss": 0.2422, + "step": 48733 + }, + { + "epoch": 3.9479909267660402, + "grad_norm": 0.08325320482254028, + "learning_rate": 2.8894189657500336e-06, + "loss": 0.22, + "step": 48734 + }, + { + "epoch": 3.9480719377835385, + "grad_norm": 0.07577069103717804, + "learning_rate": 2.8849183131554076e-06, + "loss": 0.1979, + "step": 48735 + }, + { + "epoch": 3.948152948801037, + "grad_norm": 0.07868307828903198, + "learning_rate": 2.8804176605607816e-06, + "loss": 0.241, + "step": 48736 + }, + { + "epoch": 3.9482339598185354, + "grad_norm": 0.07431667298078537, + "learning_rate": 2.8759170079661552e-06, + "loss": 0.2376, + "step": 48737 + }, + { + "epoch": 3.9483149708360337, + "grad_norm": 0.08248012512922287, + "learning_rate": 2.8714163553715293e-06, + "loss": 0.2488, + "step": 48738 + }, + { + "epoch": 3.9483959818535324, + "grad_norm": 0.07398643344640732, + "learning_rate": 2.866915702776903e-06, + "loss": 0.1939, + "step": 48739 + }, + { + "epoch": 3.9484769928710306, + "grad_norm": 0.06579329073429108, + "learning_rate": 2.8624150501822765e-06, + "loss": 0.2379, + "step": 48740 + }, + { + "epoch": 3.948558003888529, + "grad_norm": 0.06077032908797264, + "learning_rate": 2.8579143975876505e-06, + "loss": 0.2155, + "step": 48741 + }, + { + "epoch": 3.948639014906027, + "grad_norm": 0.07503858208656311, + "learning_rate": 2.853413744993024e-06, + "loss": 0.1803, + "step": 48742 + }, + { + "epoch": 3.9487200259235253, + "grad_norm": 0.08480262011289597, + "learning_rate": 2.8489130923983977e-06, + "loss": 0.2527, + "step": 48743 + }, + { + "epoch": 3.948801036941024, + "grad_norm": 0.062442317605018616, + "learning_rate": 2.8444124398037717e-06, + "loss": 0.2152, + "step": 48744 + }, + { + "epoch": 3.9488820479585223, + "grad_norm": 0.07862494140863419, + "learning_rate": 2.8399117872091453e-06, + "loss": 0.2158, + "step": 48745 + }, + { + "epoch": 3.9489630589760205, + "grad_norm": 0.07681937515735626, + "learning_rate": 2.835411134614519e-06, + "loss": 0.2325, + "step": 48746 + }, + { + "epoch": 3.949044069993519, + "grad_norm": 0.0824529230594635, + "learning_rate": 2.830910482019893e-06, + "loss": 0.2331, + "step": 48747 + }, + { + "epoch": 3.9491250810110174, + "grad_norm": 0.06473097205162048, + "learning_rate": 2.8264098294252666e-06, + "loss": 0.2245, + "step": 48748 + }, + { + "epoch": 3.9492060920285157, + "grad_norm": 0.08356422930955887, + "learning_rate": 2.8219091768306406e-06, + "loss": 0.2531, + "step": 48749 + }, + { + "epoch": 3.9492871030460144, + "grad_norm": 0.0842425748705864, + "learning_rate": 2.8174085242360146e-06, + "loss": 0.2154, + "step": 48750 + }, + { + "epoch": 3.9493681140635126, + "grad_norm": 0.05836378410458565, + "learning_rate": 2.8129078716413882e-06, + "loss": 0.2178, + "step": 48751 + }, + { + "epoch": 3.949449125081011, + "grad_norm": 0.060801442712545395, + "learning_rate": 2.808407219046762e-06, + "loss": 0.2085, + "step": 48752 + }, + { + "epoch": 3.9495301360985096, + "grad_norm": 0.06836865842342377, + "learning_rate": 2.803906566452136e-06, + "loss": 0.2641, + "step": 48753 + }, + { + "epoch": 3.949611147116008, + "grad_norm": 0.06536520272493362, + "learning_rate": 2.7994059138575095e-06, + "loss": 0.228, + "step": 48754 + }, + { + "epoch": 3.949692158133506, + "grad_norm": 0.0712912380695343, + "learning_rate": 2.7949052612628835e-06, + "loss": 0.2276, + "step": 48755 + }, + { + "epoch": 3.9497731691510047, + "grad_norm": 0.060562681406736374, + "learning_rate": 2.790404608668257e-06, + "loss": 0.1713, + "step": 48756 + }, + { + "epoch": 3.949854180168503, + "grad_norm": 0.07603670656681061, + "learning_rate": 2.7859039560736307e-06, + "loss": 0.2296, + "step": 48757 + }, + { + "epoch": 3.9499351911860012, + "grad_norm": 0.08450186252593994, + "learning_rate": 2.7814033034790047e-06, + "loss": 0.2233, + "step": 48758 + }, + { + "epoch": 3.9500162022035, + "grad_norm": 0.07135901600122452, + "learning_rate": 2.7769026508843783e-06, + "loss": 0.2052, + "step": 48759 + }, + { + "epoch": 3.950097213220998, + "grad_norm": 0.07295495271682739, + "learning_rate": 2.772401998289752e-06, + "loss": 0.2177, + "step": 48760 + }, + { + "epoch": 3.9501782242384964, + "grad_norm": 0.07470090687274933, + "learning_rate": 2.767901345695126e-06, + "loss": 0.2413, + "step": 48761 + }, + { + "epoch": 3.9502592352559946, + "grad_norm": 0.08046706765890121, + "learning_rate": 2.7634006931004996e-06, + "loss": 0.2657, + "step": 48762 + }, + { + "epoch": 3.9503402462734933, + "grad_norm": 0.06358887255191803, + "learning_rate": 2.758900040505873e-06, + "loss": 0.2253, + "step": 48763 + }, + { + "epoch": 3.9504212572909916, + "grad_norm": 0.0710747092962265, + "learning_rate": 2.754399387911247e-06, + "loss": 0.2431, + "step": 48764 + }, + { + "epoch": 3.95050226830849, + "grad_norm": 0.08081281930208206, + "learning_rate": 2.7498987353166212e-06, + "loss": 0.2467, + "step": 48765 + }, + { + "epoch": 3.950583279325988, + "grad_norm": 0.07632328569889069, + "learning_rate": 2.745398082721995e-06, + "loss": 0.2273, + "step": 48766 + }, + { + "epoch": 3.9506642903434868, + "grad_norm": 0.09434718638658524, + "learning_rate": 2.740897430127369e-06, + "loss": 0.2381, + "step": 48767 + }, + { + "epoch": 3.950745301360985, + "grad_norm": 0.0825645849108696, + "learning_rate": 2.7363967775327425e-06, + "loss": 0.2702, + "step": 48768 + }, + { + "epoch": 3.9508263123784833, + "grad_norm": 0.07329137623310089, + "learning_rate": 2.731896124938116e-06, + "loss": 0.228, + "step": 48769 + }, + { + "epoch": 3.950907323395982, + "grad_norm": 0.06830742955207825, + "learning_rate": 2.72739547234349e-06, + "loss": 0.2123, + "step": 48770 + }, + { + "epoch": 3.95098833441348, + "grad_norm": 0.06959126144647598, + "learning_rate": 2.7228948197488637e-06, + "loss": 0.2137, + "step": 48771 + }, + { + "epoch": 3.9510693454309784, + "grad_norm": 0.07482346892356873, + "learning_rate": 2.7183941671542373e-06, + "loss": 0.2218, + "step": 48772 + }, + { + "epoch": 3.951150356448477, + "grad_norm": 0.0668899193406105, + "learning_rate": 2.7138935145596113e-06, + "loss": 0.2187, + "step": 48773 + }, + { + "epoch": 3.9512313674659754, + "grad_norm": 0.061617180705070496, + "learning_rate": 2.709392861964985e-06, + "loss": 0.2002, + "step": 48774 + }, + { + "epoch": 3.9513123784834736, + "grad_norm": 0.08560971915721893, + "learning_rate": 2.704892209370359e-06, + "loss": 0.2289, + "step": 48775 + }, + { + "epoch": 3.9513933895009723, + "grad_norm": 0.08538944274187088, + "learning_rate": 2.7003915567757326e-06, + "loss": 0.2263, + "step": 48776 + }, + { + "epoch": 3.9514744005184705, + "grad_norm": 0.07387255877256393, + "learning_rate": 2.695890904181106e-06, + "loss": 0.2137, + "step": 48777 + }, + { + "epoch": 3.951555411535969, + "grad_norm": 0.07042668759822845, + "learning_rate": 2.69139025158648e-06, + "loss": 0.1982, + "step": 48778 + }, + { + "epoch": 3.9516364225534675, + "grad_norm": 0.09200357645750046, + "learning_rate": 2.686889598991854e-06, + "loss": 0.21, + "step": 48779 + }, + { + "epoch": 3.9517174335709657, + "grad_norm": 0.11538971960544586, + "learning_rate": 2.682388946397228e-06, + "loss": 0.2395, + "step": 48780 + }, + { + "epoch": 3.951798444588464, + "grad_norm": 0.09120815247297287, + "learning_rate": 2.6778882938026014e-06, + "loss": 0.266, + "step": 48781 + }, + { + "epoch": 3.9518794556059627, + "grad_norm": 0.06618034094572067, + "learning_rate": 2.6733876412079754e-06, + "loss": 0.1992, + "step": 48782 + }, + { + "epoch": 3.951960466623461, + "grad_norm": 0.07401160895824432, + "learning_rate": 2.668886988613349e-06, + "loss": 0.207, + "step": 48783 + }, + { + "epoch": 3.952041477640959, + "grad_norm": 0.08141220360994339, + "learning_rate": 2.664386336018723e-06, + "loss": 0.2225, + "step": 48784 + }, + { + "epoch": 3.9521224886584574, + "grad_norm": 0.06158602237701416, + "learning_rate": 2.6598856834240967e-06, + "loss": 0.1807, + "step": 48785 + }, + { + "epoch": 3.952203499675956, + "grad_norm": 0.07168226689100266, + "learning_rate": 2.6553850308294703e-06, + "loss": 0.1947, + "step": 48786 + }, + { + "epoch": 3.9522845106934543, + "grad_norm": 0.10002150386571884, + "learning_rate": 2.6508843782348443e-06, + "loss": 0.227, + "step": 48787 + }, + { + "epoch": 3.9523655217109526, + "grad_norm": 0.060031428933143616, + "learning_rate": 2.646383725640218e-06, + "loss": 0.2223, + "step": 48788 + }, + { + "epoch": 3.952446532728451, + "grad_norm": 0.07038126140832901, + "learning_rate": 2.6418830730455915e-06, + "loss": 0.22, + "step": 48789 + }, + { + "epoch": 3.9525275437459495, + "grad_norm": 0.06693422794342041, + "learning_rate": 2.6373824204509655e-06, + "loss": 0.2072, + "step": 48790 + }, + { + "epoch": 3.9526085547634477, + "grad_norm": 0.07882367074489594, + "learning_rate": 2.632881767856339e-06, + "loss": 0.2548, + "step": 48791 + }, + { + "epoch": 3.952689565780946, + "grad_norm": 0.06965406239032745, + "learning_rate": 2.6283811152617127e-06, + "loss": 0.23, + "step": 48792 + }, + { + "epoch": 3.9527705767984447, + "grad_norm": 0.060692980885505676, + "learning_rate": 2.623880462667087e-06, + "loss": 0.2233, + "step": 48793 + }, + { + "epoch": 3.952851587815943, + "grad_norm": 0.06531994789838791, + "learning_rate": 2.619379810072461e-06, + "loss": 0.2353, + "step": 48794 + }, + { + "epoch": 3.952932598833441, + "grad_norm": 0.06411170959472656, + "learning_rate": 2.6148791574778344e-06, + "loss": 0.2196, + "step": 48795 + }, + { + "epoch": 3.95301360985094, + "grad_norm": 0.08259644359350204, + "learning_rate": 2.6103785048832084e-06, + "loss": 0.2309, + "step": 48796 + }, + { + "epoch": 3.953094620868438, + "grad_norm": 0.07093773037195206, + "learning_rate": 2.605877852288582e-06, + "loss": 0.2239, + "step": 48797 + }, + { + "epoch": 3.9531756318859363, + "grad_norm": 0.06640201807022095, + "learning_rate": 2.6013771996939556e-06, + "loss": 0.2004, + "step": 48798 + }, + { + "epoch": 3.953256642903435, + "grad_norm": 0.07526429742574692, + "learning_rate": 2.5968765470993297e-06, + "loss": 0.2639, + "step": 48799 + }, + { + "epoch": 3.9533376539209333, + "grad_norm": 0.06677767634391785, + "learning_rate": 2.5923758945047033e-06, + "loss": 0.2551, + "step": 48800 + }, + { + "epoch": 3.9534186649384315, + "grad_norm": 0.06592489778995514, + "learning_rate": 2.587875241910077e-06, + "loss": 0.2013, + "step": 48801 + }, + { + "epoch": 3.95349967595593, + "grad_norm": 0.07010385394096375, + "learning_rate": 2.583374589315451e-06, + "loss": 0.2542, + "step": 48802 + }, + { + "epoch": 3.9535806869734285, + "grad_norm": 0.05288232862949371, + "learning_rate": 2.5788739367208245e-06, + "loss": 0.2347, + "step": 48803 + }, + { + "epoch": 3.9536616979909267, + "grad_norm": 0.07920708507299423, + "learning_rate": 2.5743732841261985e-06, + "loss": 0.2599, + "step": 48804 + }, + { + "epoch": 3.9537427090084254, + "grad_norm": 0.07881376147270203, + "learning_rate": 2.569872631531572e-06, + "loss": 0.2526, + "step": 48805 + }, + { + "epoch": 3.9538237200259236, + "grad_norm": 0.07075386494398117, + "learning_rate": 2.5653719789369457e-06, + "loss": 0.2357, + "step": 48806 + }, + { + "epoch": 3.953904731043422, + "grad_norm": 0.0738983005285263, + "learning_rate": 2.5608713263423198e-06, + "loss": 0.2094, + "step": 48807 + }, + { + "epoch": 3.95398574206092, + "grad_norm": 0.06531957536935806, + "learning_rate": 2.556370673747694e-06, + "loss": 0.1814, + "step": 48808 + }, + { + "epoch": 3.954066753078419, + "grad_norm": 0.07314100116491318, + "learning_rate": 2.5518700211530674e-06, + "loss": 0.2491, + "step": 48809 + }, + { + "epoch": 3.954147764095917, + "grad_norm": 0.06974782794713974, + "learning_rate": 2.547369368558441e-06, + "loss": 0.2168, + "step": 48810 + }, + { + "epoch": 3.9542287751134153, + "grad_norm": 0.07167565077543259, + "learning_rate": 2.542868715963815e-06, + "loss": 0.2411, + "step": 48811 + }, + { + "epoch": 3.9543097861309136, + "grad_norm": 0.057915229350328445, + "learning_rate": 2.5383680633691886e-06, + "loss": 0.2, + "step": 48812 + }, + { + "epoch": 3.9543907971484122, + "grad_norm": 0.0739583894610405, + "learning_rate": 2.5338674107745627e-06, + "loss": 0.2263, + "step": 48813 + }, + { + "epoch": 3.9544718081659105, + "grad_norm": 0.08724256604909897, + "learning_rate": 2.5293667581799363e-06, + "loss": 0.2154, + "step": 48814 + }, + { + "epoch": 3.9545528191834087, + "grad_norm": 0.08466099947690964, + "learning_rate": 2.52486610558531e-06, + "loss": 0.2697, + "step": 48815 + }, + { + "epoch": 3.9546338302009074, + "grad_norm": 0.06930121034383774, + "learning_rate": 2.520365452990684e-06, + "loss": 0.1939, + "step": 48816 + }, + { + "epoch": 3.9547148412184057, + "grad_norm": 0.08779342472553253, + "learning_rate": 2.5158648003960575e-06, + "loss": 0.2267, + "step": 48817 + }, + { + "epoch": 3.954795852235904, + "grad_norm": 0.06339520961046219, + "learning_rate": 2.511364147801431e-06, + "loss": 0.2178, + "step": 48818 + }, + { + "epoch": 3.9548768632534026, + "grad_norm": 0.06660187989473343, + "learning_rate": 2.506863495206805e-06, + "loss": 0.2536, + "step": 48819 + }, + { + "epoch": 3.954957874270901, + "grad_norm": 0.06031389907002449, + "learning_rate": 2.5023628426121787e-06, + "loss": 0.1746, + "step": 48820 + }, + { + "epoch": 3.955038885288399, + "grad_norm": 0.07642403244972229, + "learning_rate": 2.4978621900175523e-06, + "loss": 0.2516, + "step": 48821 + }, + { + "epoch": 3.9551198963058978, + "grad_norm": 0.06653022021055222, + "learning_rate": 2.4933615374229268e-06, + "loss": 0.2368, + "step": 48822 + }, + { + "epoch": 3.955200907323396, + "grad_norm": 0.05918882414698601, + "learning_rate": 2.4888608848283004e-06, + "loss": 0.2194, + "step": 48823 + }, + { + "epoch": 3.9552819183408943, + "grad_norm": 0.090346559882164, + "learning_rate": 2.484360232233674e-06, + "loss": 0.226, + "step": 48824 + }, + { + "epoch": 3.955362929358393, + "grad_norm": 0.06755279004573822, + "learning_rate": 2.479859579639048e-06, + "loss": 0.2484, + "step": 48825 + }, + { + "epoch": 3.955443940375891, + "grad_norm": 0.06781566888093948, + "learning_rate": 2.4753589270444216e-06, + "loss": 0.1975, + "step": 48826 + }, + { + "epoch": 3.9555249513933894, + "grad_norm": 0.07741211354732513, + "learning_rate": 2.4708582744497952e-06, + "loss": 0.2321, + "step": 48827 + }, + { + "epoch": 3.955605962410888, + "grad_norm": 0.07529742270708084, + "learning_rate": 2.4663576218551692e-06, + "loss": 0.2329, + "step": 48828 + }, + { + "epoch": 3.9556869734283864, + "grad_norm": 0.06810367852449417, + "learning_rate": 2.461856969260543e-06, + "loss": 0.2081, + "step": 48829 + }, + { + "epoch": 3.9557679844458846, + "grad_norm": 0.06922873854637146, + "learning_rate": 2.4573563166659165e-06, + "loss": 0.2262, + "step": 48830 + }, + { + "epoch": 3.955848995463383, + "grad_norm": 0.07545153796672821, + "learning_rate": 2.4528556640712905e-06, + "loss": 0.2295, + "step": 48831 + }, + { + "epoch": 3.9559300064808816, + "grad_norm": 0.07698988914489746, + "learning_rate": 2.448355011476664e-06, + "loss": 0.2162, + "step": 48832 + }, + { + "epoch": 3.95601101749838, + "grad_norm": 0.06655178219079971, + "learning_rate": 2.443854358882038e-06, + "loss": 0.2242, + "step": 48833 + }, + { + "epoch": 3.956092028515878, + "grad_norm": 0.06900296360254288, + "learning_rate": 2.4393537062874117e-06, + "loss": 0.2319, + "step": 48834 + }, + { + "epoch": 3.9561730395333763, + "grad_norm": 0.0699462965130806, + "learning_rate": 2.4348530536927853e-06, + "loss": 0.2429, + "step": 48835 + }, + { + "epoch": 3.956254050550875, + "grad_norm": 0.07188187539577484, + "learning_rate": 2.4303524010981593e-06, + "loss": 0.2344, + "step": 48836 + }, + { + "epoch": 3.9563350615683732, + "grad_norm": 0.07619171589612961, + "learning_rate": 2.4258517485035334e-06, + "loss": 0.2031, + "step": 48837 + }, + { + "epoch": 3.9564160725858715, + "grad_norm": 0.0603308379650116, + "learning_rate": 2.421351095908907e-06, + "loss": 0.2033, + "step": 48838 + }, + { + "epoch": 3.95649708360337, + "grad_norm": 0.0679556280374527, + "learning_rate": 2.416850443314281e-06, + "loss": 0.2109, + "step": 48839 + }, + { + "epoch": 3.9565780946208684, + "grad_norm": 0.06686028093099594, + "learning_rate": 2.4123497907196546e-06, + "loss": 0.2583, + "step": 48840 + }, + { + "epoch": 3.9566591056383666, + "grad_norm": 0.07004310190677643, + "learning_rate": 2.407849138125028e-06, + "loss": 0.232, + "step": 48841 + }, + { + "epoch": 3.9567401166558653, + "grad_norm": 0.05624713748693466, + "learning_rate": 2.4033484855304022e-06, + "loss": 0.2123, + "step": 48842 + }, + { + "epoch": 3.9568211276733636, + "grad_norm": 0.0720772072672844, + "learning_rate": 2.398847832935776e-06, + "loss": 0.2112, + "step": 48843 + }, + { + "epoch": 3.956902138690862, + "grad_norm": 0.07022903114557266, + "learning_rate": 2.3943471803411494e-06, + "loss": 0.199, + "step": 48844 + }, + { + "epoch": 3.9569831497083605, + "grad_norm": 0.0740879699587822, + "learning_rate": 2.3898465277465235e-06, + "loss": 0.2469, + "step": 48845 + }, + { + "epoch": 3.9570641607258588, + "grad_norm": 0.07837864011526108, + "learning_rate": 2.385345875151897e-06, + "loss": 0.2414, + "step": 48846 + }, + { + "epoch": 3.957145171743357, + "grad_norm": 0.06510287523269653, + "learning_rate": 2.3808452225572707e-06, + "loss": 0.2119, + "step": 48847 + }, + { + "epoch": 3.9572261827608557, + "grad_norm": 0.06796301901340485, + "learning_rate": 2.3763445699626447e-06, + "loss": 0.1879, + "step": 48848 + }, + { + "epoch": 3.957307193778354, + "grad_norm": 0.0652165561914444, + "learning_rate": 2.3718439173680183e-06, + "loss": 0.2306, + "step": 48849 + }, + { + "epoch": 3.957388204795852, + "grad_norm": 0.08012350648641586, + "learning_rate": 2.367343264773392e-06, + "loss": 0.2511, + "step": 48850 + }, + { + "epoch": 3.957469215813351, + "grad_norm": 0.06359118223190308, + "learning_rate": 2.3628426121787664e-06, + "loss": 0.1956, + "step": 48851 + }, + { + "epoch": 3.957550226830849, + "grad_norm": 0.09778131544589996, + "learning_rate": 2.35834195958414e-06, + "loss": 0.2337, + "step": 48852 + }, + { + "epoch": 3.9576312378483474, + "grad_norm": 0.07835190743207932, + "learning_rate": 2.3538413069895136e-06, + "loss": 0.2037, + "step": 48853 + }, + { + "epoch": 3.9577122488658456, + "grad_norm": 0.07351531833410263, + "learning_rate": 2.3493406543948876e-06, + "loss": 0.276, + "step": 48854 + }, + { + "epoch": 3.9577932598833443, + "grad_norm": 0.0607091523706913, + "learning_rate": 2.344840001800261e-06, + "loss": 0.2262, + "step": 48855 + }, + { + "epoch": 3.9578742709008425, + "grad_norm": 0.06498251855373383, + "learning_rate": 2.340339349205635e-06, + "loss": 0.2159, + "step": 48856 + }, + { + "epoch": 3.957955281918341, + "grad_norm": 0.07029843330383301, + "learning_rate": 2.335838696611009e-06, + "loss": 0.1888, + "step": 48857 + }, + { + "epoch": 3.958036292935839, + "grad_norm": 0.07102137804031372, + "learning_rate": 2.3313380440163824e-06, + "loss": 0.2604, + "step": 48858 + }, + { + "epoch": 3.9581173039533377, + "grad_norm": 0.0782492458820343, + "learning_rate": 2.326837391421756e-06, + "loss": 0.2622, + "step": 48859 + }, + { + "epoch": 3.958198314970836, + "grad_norm": 0.06843463331460953, + "learning_rate": 2.32233673882713e-06, + "loss": 0.2179, + "step": 48860 + }, + { + "epoch": 3.958279325988334, + "grad_norm": 0.07084222882986069, + "learning_rate": 2.3178360862325037e-06, + "loss": 0.1941, + "step": 48861 + }, + { + "epoch": 3.958360337005833, + "grad_norm": 0.0704958513379097, + "learning_rate": 2.3133354336378777e-06, + "loss": 0.226, + "step": 48862 + }, + { + "epoch": 3.958441348023331, + "grad_norm": 0.06495588272809982, + "learning_rate": 2.3088347810432513e-06, + "loss": 0.2503, + "step": 48863 + }, + { + "epoch": 3.9585223590408294, + "grad_norm": 0.0666610449552536, + "learning_rate": 2.304334128448625e-06, + "loss": 0.2016, + "step": 48864 + }, + { + "epoch": 3.958603370058328, + "grad_norm": 0.0791105106472969, + "learning_rate": 2.299833475853999e-06, + "loss": 0.211, + "step": 48865 + }, + { + "epoch": 3.9586843810758263, + "grad_norm": 0.08585468679666519, + "learning_rate": 2.295332823259373e-06, + "loss": 0.2206, + "step": 48866 + }, + { + "epoch": 3.9587653920933246, + "grad_norm": 0.07571440190076828, + "learning_rate": 2.2908321706647466e-06, + "loss": 0.2165, + "step": 48867 + }, + { + "epoch": 3.9588464031108233, + "grad_norm": 0.07079493254423141, + "learning_rate": 2.2863315180701206e-06, + "loss": 0.2351, + "step": 48868 + }, + { + "epoch": 3.9589274141283215, + "grad_norm": 0.08157934993505478, + "learning_rate": 2.281830865475494e-06, + "loss": 0.209, + "step": 48869 + }, + { + "epoch": 3.9590084251458197, + "grad_norm": 0.058229394257068634, + "learning_rate": 2.2773302128808678e-06, + "loss": 0.216, + "step": 48870 + }, + { + "epoch": 3.9590894361633184, + "grad_norm": 0.06710825115442276, + "learning_rate": 2.272829560286242e-06, + "loss": 0.2343, + "step": 48871 + }, + { + "epoch": 3.9591704471808167, + "grad_norm": 0.07763312757015228, + "learning_rate": 2.2683289076916154e-06, + "loss": 0.2398, + "step": 48872 + }, + { + "epoch": 3.959251458198315, + "grad_norm": 0.07329755276441574, + "learning_rate": 2.263828255096989e-06, + "loss": 0.1929, + "step": 48873 + }, + { + "epoch": 3.9593324692158136, + "grad_norm": 0.052778083831071854, + "learning_rate": 2.259327602502363e-06, + "loss": 0.2198, + "step": 48874 + }, + { + "epoch": 3.959413480233312, + "grad_norm": 0.06382036954164505, + "learning_rate": 2.2548269499077366e-06, + "loss": 0.2334, + "step": 48875 + }, + { + "epoch": 3.95949449125081, + "grad_norm": 0.06736230105161667, + "learning_rate": 2.2503262973131103e-06, + "loss": 0.1772, + "step": 48876 + }, + { + "epoch": 3.9595755022683083, + "grad_norm": 0.09311016649007797, + "learning_rate": 2.2458256447184843e-06, + "loss": 0.2461, + "step": 48877 + }, + { + "epoch": 3.959656513285807, + "grad_norm": 0.067777618765831, + "learning_rate": 2.241324992123858e-06, + "loss": 0.2162, + "step": 48878 + }, + { + "epoch": 3.9597375243033053, + "grad_norm": 0.07997038960456848, + "learning_rate": 2.2368243395292315e-06, + "loss": 0.241, + "step": 48879 + }, + { + "epoch": 3.9598185353208035, + "grad_norm": 0.07150693982839584, + "learning_rate": 2.232323686934606e-06, + "loss": 0.2265, + "step": 48880 + }, + { + "epoch": 3.9598995463383018, + "grad_norm": 0.06763894110918045, + "learning_rate": 2.2278230343399795e-06, + "loss": 0.2165, + "step": 48881 + }, + { + "epoch": 3.9599805573558005, + "grad_norm": 0.07978539168834686, + "learning_rate": 2.223322381745353e-06, + "loss": 0.2443, + "step": 48882 + }, + { + "epoch": 3.9600615683732987, + "grad_norm": 0.06002137437462807, + "learning_rate": 2.218821729150727e-06, + "loss": 0.249, + "step": 48883 + }, + { + "epoch": 3.960142579390797, + "grad_norm": 0.06918959319591522, + "learning_rate": 2.2143210765561008e-06, + "loss": 0.2428, + "step": 48884 + }, + { + "epoch": 3.9602235904082956, + "grad_norm": 0.06803537160158157, + "learning_rate": 2.2098204239614744e-06, + "loss": 0.2001, + "step": 48885 + }, + { + "epoch": 3.960304601425794, + "grad_norm": 0.06053481996059418, + "learning_rate": 2.2053197713668484e-06, + "loss": 0.1956, + "step": 48886 + }, + { + "epoch": 3.960385612443292, + "grad_norm": 0.062240056693553925, + "learning_rate": 2.200819118772222e-06, + "loss": 0.2152, + "step": 48887 + }, + { + "epoch": 3.960466623460791, + "grad_norm": 0.06525016576051712, + "learning_rate": 2.1963184661775956e-06, + "loss": 0.1922, + "step": 48888 + }, + { + "epoch": 3.960547634478289, + "grad_norm": 0.07532958686351776, + "learning_rate": 2.1918178135829696e-06, + "loss": 0.2079, + "step": 48889 + }, + { + "epoch": 3.9606286454957873, + "grad_norm": 0.08174377679824829, + "learning_rate": 2.1873171609883432e-06, + "loss": 0.2211, + "step": 48890 + }, + { + "epoch": 3.960709656513286, + "grad_norm": 0.07705192267894745, + "learning_rate": 2.1828165083937173e-06, + "loss": 0.2116, + "step": 48891 + }, + { + "epoch": 3.9607906675307842, + "grad_norm": 0.08387063443660736, + "learning_rate": 2.178315855799091e-06, + "loss": 0.1979, + "step": 48892 + }, + { + "epoch": 3.9608716785482825, + "grad_norm": 0.07427896559238434, + "learning_rate": 2.1738152032044645e-06, + "loss": 0.2114, + "step": 48893 + }, + { + "epoch": 3.960952689565781, + "grad_norm": 0.0671217292547226, + "learning_rate": 2.1693145506098385e-06, + "loss": 0.2102, + "step": 48894 + }, + { + "epoch": 3.9610337005832794, + "grad_norm": 0.0731186717748642, + "learning_rate": 2.1648138980152125e-06, + "loss": 0.2358, + "step": 48895 + }, + { + "epoch": 3.9611147116007777, + "grad_norm": 0.08874718844890594, + "learning_rate": 2.160313245420586e-06, + "loss": 0.2466, + "step": 48896 + }, + { + "epoch": 3.9611957226182763, + "grad_norm": 0.08343454450368881, + "learning_rate": 2.15581259282596e-06, + "loss": 0.2146, + "step": 48897 + }, + { + "epoch": 3.9612767336357746, + "grad_norm": 0.07566836476325989, + "learning_rate": 2.1513119402313338e-06, + "loss": 0.2411, + "step": 48898 + }, + { + "epoch": 3.961357744653273, + "grad_norm": 0.08290202170610428, + "learning_rate": 2.1468112876367074e-06, + "loss": 0.2512, + "step": 48899 + }, + { + "epoch": 3.961438755670771, + "grad_norm": 0.07356373220682144, + "learning_rate": 2.1423106350420814e-06, + "loss": 0.2484, + "step": 48900 + }, + { + "epoch": 3.9615197666882693, + "grad_norm": 0.08584269881248474, + "learning_rate": 2.137809982447455e-06, + "loss": 0.2302, + "step": 48901 + }, + { + "epoch": 3.961600777705768, + "grad_norm": 0.06294574588537216, + "learning_rate": 2.1333093298528286e-06, + "loss": 0.2307, + "step": 48902 + }, + { + "epoch": 3.9616817887232663, + "grad_norm": 0.06678062677383423, + "learning_rate": 2.1288086772582026e-06, + "loss": 0.2059, + "step": 48903 + }, + { + "epoch": 3.9617627997407645, + "grad_norm": 0.06297358125448227, + "learning_rate": 2.1243080246635762e-06, + "loss": 0.2176, + "step": 48904 + }, + { + "epoch": 3.961843810758263, + "grad_norm": 0.07585348933935165, + "learning_rate": 2.11980737206895e-06, + "loss": 0.2497, + "step": 48905 + }, + { + "epoch": 3.9619248217757614, + "grad_norm": 0.07568811625242233, + "learning_rate": 2.115306719474324e-06, + "loss": 0.2114, + "step": 48906 + }, + { + "epoch": 3.9620058327932597, + "grad_norm": 0.07441581040620804, + "learning_rate": 2.1108060668796975e-06, + "loss": 0.287, + "step": 48907 + }, + { + "epoch": 3.9620868438107584, + "grad_norm": 0.06748269498348236, + "learning_rate": 2.106305414285071e-06, + "loss": 0.2269, + "step": 48908 + }, + { + "epoch": 3.9621678548282566, + "grad_norm": 0.07350584864616394, + "learning_rate": 2.1018047616904455e-06, + "loss": 0.2298, + "step": 48909 + }, + { + "epoch": 3.962248865845755, + "grad_norm": 0.07611532509326935, + "learning_rate": 2.097304109095819e-06, + "loss": 0.236, + "step": 48910 + }, + { + "epoch": 3.9623298768632536, + "grad_norm": 0.07814755290746689, + "learning_rate": 2.0928034565011927e-06, + "loss": 0.2699, + "step": 48911 + }, + { + "epoch": 3.962410887880752, + "grad_norm": 0.07571461796760559, + "learning_rate": 2.0883028039065667e-06, + "loss": 0.2033, + "step": 48912 + }, + { + "epoch": 3.96249189889825, + "grad_norm": 0.08205527067184448, + "learning_rate": 2.0838021513119403e-06, + "loss": 0.2449, + "step": 48913 + }, + { + "epoch": 3.9625729099157487, + "grad_norm": 0.06475473195314407, + "learning_rate": 2.079301498717314e-06, + "loss": 0.2245, + "step": 48914 + }, + { + "epoch": 3.962653920933247, + "grad_norm": 0.06566368043422699, + "learning_rate": 2.074800846122688e-06, + "loss": 0.1976, + "step": 48915 + }, + { + "epoch": 3.962734931950745, + "grad_norm": 0.08065775781869888, + "learning_rate": 2.0703001935280616e-06, + "loss": 0.2169, + "step": 48916 + }, + { + "epoch": 3.962815942968244, + "grad_norm": 0.06467075645923615, + "learning_rate": 2.0657995409334356e-06, + "loss": 0.2001, + "step": 48917 + }, + { + "epoch": 3.962896953985742, + "grad_norm": 0.06762147694826126, + "learning_rate": 2.0612988883388092e-06, + "loss": 0.2267, + "step": 48918 + }, + { + "epoch": 3.9629779650032404, + "grad_norm": 0.09147316217422485, + "learning_rate": 2.056798235744183e-06, + "loss": 0.2093, + "step": 48919 + }, + { + "epoch": 3.963058976020739, + "grad_norm": 0.050940994173288345, + "learning_rate": 2.052297583149557e-06, + "loss": 0.2315, + "step": 48920 + }, + { + "epoch": 3.9631399870382373, + "grad_norm": 0.06474053114652634, + "learning_rate": 2.0477969305549304e-06, + "loss": 0.2059, + "step": 48921 + }, + { + "epoch": 3.9632209980557356, + "grad_norm": 0.06484822183847427, + "learning_rate": 2.043296277960304e-06, + "loss": 0.2061, + "step": 48922 + }, + { + "epoch": 3.963302009073234, + "grad_norm": 0.0787152349948883, + "learning_rate": 2.038795625365678e-06, + "loss": 0.2333, + "step": 48923 + }, + { + "epoch": 3.963383020090732, + "grad_norm": 0.07503559440374374, + "learning_rate": 2.034294972771052e-06, + "loss": 0.2119, + "step": 48924 + }, + { + "epoch": 3.9634640311082308, + "grad_norm": 0.06612849235534668, + "learning_rate": 2.0297943201764257e-06, + "loss": 0.2236, + "step": 48925 + }, + { + "epoch": 3.963545042125729, + "grad_norm": 0.07943277806043625, + "learning_rate": 2.0252936675817997e-06, + "loss": 0.1927, + "step": 48926 + }, + { + "epoch": 3.9636260531432272, + "grad_norm": 0.08900720626115799, + "learning_rate": 2.0207930149871733e-06, + "loss": 0.2616, + "step": 48927 + }, + { + "epoch": 3.963707064160726, + "grad_norm": 0.07647895067930222, + "learning_rate": 2.016292362392547e-06, + "loss": 0.2124, + "step": 48928 + }, + { + "epoch": 3.963788075178224, + "grad_norm": 0.08729333430528641, + "learning_rate": 2.011791709797921e-06, + "loss": 0.225, + "step": 48929 + }, + { + "epoch": 3.9638690861957224, + "grad_norm": 0.06749184429645538, + "learning_rate": 2.0072910572032946e-06, + "loss": 0.2568, + "step": 48930 + }, + { + "epoch": 3.963950097213221, + "grad_norm": 0.07493914663791656, + "learning_rate": 2.002790404608668e-06, + "loss": 0.2483, + "step": 48931 + }, + { + "epoch": 3.9640311082307194, + "grad_norm": 0.07415442168712616, + "learning_rate": 1.998289752014042e-06, + "loss": 0.2709, + "step": 48932 + }, + { + "epoch": 3.9641121192482176, + "grad_norm": 0.055181387811899185, + "learning_rate": 1.993789099419416e-06, + "loss": 0.1915, + "step": 48933 + }, + { + "epoch": 3.9641931302657163, + "grad_norm": 0.0670875683426857, + "learning_rate": 1.9892884468247894e-06, + "loss": 0.2323, + "step": 48934 + }, + { + "epoch": 3.9642741412832145, + "grad_norm": 0.06657449901103973, + "learning_rate": 1.9847877942301634e-06, + "loss": 0.2334, + "step": 48935 + }, + { + "epoch": 3.964355152300713, + "grad_norm": 0.06159026175737381, + "learning_rate": 1.980287141635537e-06, + "loss": 0.1616, + "step": 48936 + }, + { + "epoch": 3.9644361633182115, + "grad_norm": 0.06844350695610046, + "learning_rate": 1.975786489040911e-06, + "loss": 0.2003, + "step": 48937 + }, + { + "epoch": 3.9645171743357097, + "grad_norm": 0.07265136390924454, + "learning_rate": 1.971285836446285e-06, + "loss": 0.2481, + "step": 48938 + }, + { + "epoch": 3.964598185353208, + "grad_norm": 0.07296200096607208, + "learning_rate": 1.9667851838516587e-06, + "loss": 0.2074, + "step": 48939 + }, + { + "epoch": 3.9646791963707066, + "grad_norm": 0.07084230333566666, + "learning_rate": 1.9622845312570323e-06, + "loss": 0.2267, + "step": 48940 + }, + { + "epoch": 3.964760207388205, + "grad_norm": 0.06154124438762665, + "learning_rate": 1.9577838786624063e-06, + "loss": 0.218, + "step": 48941 + }, + { + "epoch": 3.964841218405703, + "grad_norm": 0.05799073725938797, + "learning_rate": 1.95328322606778e-06, + "loss": 0.2076, + "step": 48942 + }, + { + "epoch": 3.964922229423202, + "grad_norm": 0.06497079133987427, + "learning_rate": 1.9487825734731535e-06, + "loss": 0.2096, + "step": 48943 + }, + { + "epoch": 3.9650032404407, + "grad_norm": 0.07975282520055771, + "learning_rate": 1.9442819208785276e-06, + "loss": 0.2206, + "step": 48944 + }, + { + "epoch": 3.9650842514581983, + "grad_norm": 0.06756958365440369, + "learning_rate": 1.939781268283901e-06, + "loss": 0.2282, + "step": 48945 + }, + { + "epoch": 3.9651652624756966, + "grad_norm": 0.08484717458486557, + "learning_rate": 1.935280615689275e-06, + "loss": 0.2343, + "step": 48946 + }, + { + "epoch": 3.965246273493195, + "grad_norm": 0.07329592853784561, + "learning_rate": 1.930779963094649e-06, + "loss": 0.2059, + "step": 48947 + }, + { + "epoch": 3.9653272845106935, + "grad_norm": 0.05684712901711464, + "learning_rate": 1.9262793105000224e-06, + "loss": 0.228, + "step": 48948 + }, + { + "epoch": 3.9654082955281917, + "grad_norm": 0.05936472862958908, + "learning_rate": 1.9217786579053964e-06, + "loss": 0.2148, + "step": 48949 + }, + { + "epoch": 3.96548930654569, + "grad_norm": 0.07850753515958786, + "learning_rate": 1.91727800531077e-06, + "loss": 0.2136, + "step": 48950 + }, + { + "epoch": 3.9655703175631887, + "grad_norm": 0.0721631646156311, + "learning_rate": 1.9127773527161436e-06, + "loss": 0.1994, + "step": 48951 + }, + { + "epoch": 3.965651328580687, + "grad_norm": 0.06391099095344543, + "learning_rate": 1.9082767001215177e-06, + "loss": 0.2079, + "step": 48952 + }, + { + "epoch": 3.965732339598185, + "grad_norm": 0.06663154065608978, + "learning_rate": 1.9037760475268917e-06, + "loss": 0.2159, + "step": 48953 + }, + { + "epoch": 3.965813350615684, + "grad_norm": 0.060114867985248566, + "learning_rate": 1.8992753949322653e-06, + "loss": 0.2138, + "step": 48954 + }, + { + "epoch": 3.965894361633182, + "grad_norm": 0.07615365833044052, + "learning_rate": 1.894774742337639e-06, + "loss": 0.2277, + "step": 48955 + }, + { + "epoch": 3.9659753726506803, + "grad_norm": 0.06525284796953201, + "learning_rate": 1.890274089743013e-06, + "loss": 0.2232, + "step": 48956 + }, + { + "epoch": 3.966056383668179, + "grad_norm": 0.06666583567857742, + "learning_rate": 1.8857734371483867e-06, + "loss": 0.2199, + "step": 48957 + }, + { + "epoch": 3.9661373946856773, + "grad_norm": 0.07888475805521011, + "learning_rate": 1.8812727845537603e-06, + "loss": 0.2368, + "step": 48958 + }, + { + "epoch": 3.9662184057031755, + "grad_norm": 0.05811379477381706, + "learning_rate": 1.8767721319591341e-06, + "loss": 0.2513, + "step": 48959 + }, + { + "epoch": 3.966299416720674, + "grad_norm": 0.07906852662563324, + "learning_rate": 1.872271479364508e-06, + "loss": 0.2457, + "step": 48960 + }, + { + "epoch": 3.9663804277381725, + "grad_norm": 0.07660603523254395, + "learning_rate": 1.8677708267698816e-06, + "loss": 0.2245, + "step": 48961 + }, + { + "epoch": 3.9664614387556707, + "grad_norm": 0.07632768154144287, + "learning_rate": 1.8632701741752554e-06, + "loss": 0.2333, + "step": 48962 + }, + { + "epoch": 3.9665424497731694, + "grad_norm": 0.07934503257274628, + "learning_rate": 1.8587695215806292e-06, + "loss": 0.2479, + "step": 48963 + }, + { + "epoch": 3.9666234607906676, + "grad_norm": 0.06583593785762787, + "learning_rate": 1.854268868986003e-06, + "loss": 0.1923, + "step": 48964 + }, + { + "epoch": 3.966704471808166, + "grad_norm": 0.07885053008794785, + "learning_rate": 1.8497682163913766e-06, + "loss": 0.2003, + "step": 48965 + }, + { + "epoch": 3.9667854828256646, + "grad_norm": 0.07885368168354034, + "learning_rate": 1.8452675637967509e-06, + "loss": 0.2529, + "step": 48966 + }, + { + "epoch": 3.966866493843163, + "grad_norm": 0.05907120555639267, + "learning_rate": 1.8407669112021245e-06, + "loss": 0.2194, + "step": 48967 + }, + { + "epoch": 3.966947504860661, + "grad_norm": 0.05766279622912407, + "learning_rate": 1.8362662586074983e-06, + "loss": 0.2244, + "step": 48968 + }, + { + "epoch": 3.9670285158781593, + "grad_norm": 0.08073402941226959, + "learning_rate": 1.831765606012872e-06, + "loss": 0.1892, + "step": 48969 + }, + { + "epoch": 3.9671095268956575, + "grad_norm": 0.08297807723283768, + "learning_rate": 1.827264953418246e-06, + "loss": 0.2275, + "step": 48970 + }, + { + "epoch": 3.9671905379131562, + "grad_norm": 0.07996219396591187, + "learning_rate": 1.8227643008236195e-06, + "loss": 0.2753, + "step": 48971 + }, + { + "epoch": 3.9672715489306545, + "grad_norm": 0.06728029996156693, + "learning_rate": 1.8182636482289933e-06, + "loss": 0.2084, + "step": 48972 + }, + { + "epoch": 3.9673525599481527, + "grad_norm": 0.07134328782558441, + "learning_rate": 1.8137629956343671e-06, + "loss": 0.2387, + "step": 48973 + }, + { + "epoch": 3.9674335709656514, + "grad_norm": 0.06936592608690262, + "learning_rate": 1.8092623430397407e-06, + "loss": 0.226, + "step": 48974 + }, + { + "epoch": 3.9675145819831497, + "grad_norm": 0.060569167137145996, + "learning_rate": 1.8047616904451146e-06, + "loss": 0.2323, + "step": 48975 + }, + { + "epoch": 3.967595593000648, + "grad_norm": 0.06395695358514786, + "learning_rate": 1.8002610378504884e-06, + "loss": 0.2362, + "step": 48976 + }, + { + "epoch": 3.9676766040181466, + "grad_norm": 0.0779598280787468, + "learning_rate": 1.7957603852558622e-06, + "loss": 0.215, + "step": 48977 + }, + { + "epoch": 3.967757615035645, + "grad_norm": 0.06363376975059509, + "learning_rate": 1.7912597326612358e-06, + "loss": 0.1997, + "step": 48978 + }, + { + "epoch": 3.967838626053143, + "grad_norm": 0.07200288027524948, + "learning_rate": 1.7867590800666096e-06, + "loss": 0.2048, + "step": 48979 + }, + { + "epoch": 3.9679196370706418, + "grad_norm": 0.07821229100227356, + "learning_rate": 1.7822584274719834e-06, + "loss": 0.2087, + "step": 48980 + }, + { + "epoch": 3.96800064808814, + "grad_norm": 0.062123917043209076, + "learning_rate": 1.7777577748773574e-06, + "loss": 0.2389, + "step": 48981 + }, + { + "epoch": 3.9680816591056383, + "grad_norm": 0.06906332820653915, + "learning_rate": 1.7732571222827313e-06, + "loss": 0.2279, + "step": 48982 + }, + { + "epoch": 3.968162670123137, + "grad_norm": 0.08044512569904327, + "learning_rate": 1.7687564696881049e-06, + "loss": 0.2301, + "step": 48983 + }, + { + "epoch": 3.968243681140635, + "grad_norm": 0.06454446911811829, + "learning_rate": 1.7642558170934787e-06, + "loss": 0.1969, + "step": 48984 + }, + { + "epoch": 3.9683246921581334, + "grad_norm": 0.08553464710712433, + "learning_rate": 1.7597551644988525e-06, + "loss": 0.2333, + "step": 48985 + }, + { + "epoch": 3.968405703175632, + "grad_norm": 0.06968870759010315, + "learning_rate": 1.7552545119042263e-06, + "loss": 0.2341, + "step": 48986 + }, + { + "epoch": 3.9684867141931304, + "grad_norm": 0.06505095958709717, + "learning_rate": 1.7507538593096e-06, + "loss": 0.2056, + "step": 48987 + }, + { + "epoch": 3.9685677252106286, + "grad_norm": 0.07731178402900696, + "learning_rate": 1.7462532067149737e-06, + "loss": 0.216, + "step": 48988 + }, + { + "epoch": 3.968648736228127, + "grad_norm": 0.05740072578191757, + "learning_rate": 1.7417525541203475e-06, + "loss": 0.2334, + "step": 48989 + }, + { + "epoch": 3.9687297472456255, + "grad_norm": 0.06639726459980011, + "learning_rate": 1.7372519015257211e-06, + "loss": 0.2386, + "step": 48990 + }, + { + "epoch": 3.968810758263124, + "grad_norm": 0.0703684538602829, + "learning_rate": 1.732751248931095e-06, + "loss": 0.2012, + "step": 48991 + }, + { + "epoch": 3.968891769280622, + "grad_norm": 0.07246779650449753, + "learning_rate": 1.7282505963364688e-06, + "loss": 0.2345, + "step": 48992 + }, + { + "epoch": 3.9689727802981203, + "grad_norm": 0.060479771345853806, + "learning_rate": 1.7237499437418426e-06, + "loss": 0.1812, + "step": 48993 + }, + { + "epoch": 3.969053791315619, + "grad_norm": 0.06786667555570602, + "learning_rate": 1.7192492911472162e-06, + "loss": 0.2288, + "step": 48994 + }, + { + "epoch": 3.969134802333117, + "grad_norm": 0.07820719480514526, + "learning_rate": 1.7147486385525904e-06, + "loss": 0.2097, + "step": 48995 + }, + { + "epoch": 3.9692158133506155, + "grad_norm": 0.06689919531345367, + "learning_rate": 1.710247985957964e-06, + "loss": 0.2155, + "step": 48996 + }, + { + "epoch": 3.969296824368114, + "grad_norm": 0.07324912399053574, + "learning_rate": 1.7057473333633379e-06, + "loss": 0.2158, + "step": 48997 + }, + { + "epoch": 3.9693778353856124, + "grad_norm": 0.07657838612794876, + "learning_rate": 1.7012466807687117e-06, + "loss": 0.2181, + "step": 48998 + }, + { + "epoch": 3.9694588464031106, + "grad_norm": 0.07740187644958496, + "learning_rate": 1.6967460281740855e-06, + "loss": 0.2351, + "step": 48999 + }, + { + "epoch": 3.9695398574206093, + "grad_norm": 0.048127420246601105, + "learning_rate": 1.692245375579459e-06, + "loss": 0.2236, + "step": 49000 + }, + { + "epoch": 3.9696208684381076, + "grad_norm": 0.07730501145124435, + "learning_rate": 1.687744722984833e-06, + "loss": 0.2137, + "step": 49001 + }, + { + "epoch": 3.969701879455606, + "grad_norm": 0.07064519822597504, + "learning_rate": 1.6832440703902067e-06, + "loss": 0.2536, + "step": 49002 + }, + { + "epoch": 3.9697828904731045, + "grad_norm": 0.07275062054395676, + "learning_rate": 1.6787434177955803e-06, + "loss": 0.2273, + "step": 49003 + }, + { + "epoch": 3.9698639014906028, + "grad_norm": 0.08206738531589508, + "learning_rate": 1.6742427652009541e-06, + "loss": 0.2448, + "step": 49004 + }, + { + "epoch": 3.969944912508101, + "grad_norm": 0.07399595528841019, + "learning_rate": 1.669742112606328e-06, + "loss": 0.2251, + "step": 49005 + }, + { + "epoch": 3.9700259235255997, + "grad_norm": 0.07586342096328735, + "learning_rate": 1.6652414600117018e-06, + "loss": 0.2214, + "step": 49006 + }, + { + "epoch": 3.970106934543098, + "grad_norm": 0.07879979908466339, + "learning_rate": 1.6607408074170754e-06, + "loss": 0.2474, + "step": 49007 + }, + { + "epoch": 3.970187945560596, + "grad_norm": 0.061993636190891266, + "learning_rate": 1.6562401548224492e-06, + "loss": 0.2171, + "step": 49008 + }, + { + "epoch": 3.970268956578095, + "grad_norm": 0.0825590267777443, + "learning_rate": 1.6517395022278232e-06, + "loss": 0.2091, + "step": 49009 + }, + { + "epoch": 3.970349967595593, + "grad_norm": 0.07057903707027435, + "learning_rate": 1.647238849633197e-06, + "loss": 0.2551, + "step": 49010 + }, + { + "epoch": 3.9704309786130914, + "grad_norm": 0.07474622130393982, + "learning_rate": 1.6427381970385708e-06, + "loss": 0.2076, + "step": 49011 + }, + { + "epoch": 3.9705119896305896, + "grad_norm": 0.06988409161567688, + "learning_rate": 1.6382375444439444e-06, + "loss": 0.2123, + "step": 49012 + }, + { + "epoch": 3.9705930006480883, + "grad_norm": 0.09014685451984406, + "learning_rate": 1.6337368918493183e-06, + "loss": 0.2558, + "step": 49013 + }, + { + "epoch": 3.9706740116655865, + "grad_norm": 0.05605039373040199, + "learning_rate": 1.629236239254692e-06, + "loss": 0.2196, + "step": 49014 + }, + { + "epoch": 3.970755022683085, + "grad_norm": 0.06118842214345932, + "learning_rate": 1.6247355866600659e-06, + "loss": 0.2252, + "step": 49015 + }, + { + "epoch": 3.970836033700583, + "grad_norm": 0.0719040036201477, + "learning_rate": 1.6202349340654395e-06, + "loss": 0.2198, + "step": 49016 + }, + { + "epoch": 3.9709170447180817, + "grad_norm": 0.07295656204223633, + "learning_rate": 1.6157342814708133e-06, + "loss": 0.2345, + "step": 49017 + }, + { + "epoch": 3.97099805573558, + "grad_norm": 0.07177040725946426, + "learning_rate": 1.6112336288761871e-06, + "loss": 0.2319, + "step": 49018 + }, + { + "epoch": 3.971079066753078, + "grad_norm": 0.06768198311328888, + "learning_rate": 1.606732976281561e-06, + "loss": 0.244, + "step": 49019 + }, + { + "epoch": 3.971160077770577, + "grad_norm": 0.07683272659778595, + "learning_rate": 1.6022323236869345e-06, + "loss": 0.2378, + "step": 49020 + }, + { + "epoch": 3.971241088788075, + "grad_norm": 0.06740285456180573, + "learning_rate": 1.5977316710923084e-06, + "loss": 0.2183, + "step": 49021 + }, + { + "epoch": 3.9713220998055734, + "grad_norm": 0.0736578181385994, + "learning_rate": 1.5932310184976822e-06, + "loss": 0.2054, + "step": 49022 + }, + { + "epoch": 3.971403110823072, + "grad_norm": 0.10093529522418976, + "learning_rate": 1.5887303659030558e-06, + "loss": 0.2277, + "step": 49023 + }, + { + "epoch": 3.9714841218405703, + "grad_norm": 0.08745209872722626, + "learning_rate": 1.58422971330843e-06, + "loss": 0.2632, + "step": 49024 + }, + { + "epoch": 3.9715651328580686, + "grad_norm": 0.07922547310590744, + "learning_rate": 1.5797290607138036e-06, + "loss": 0.2189, + "step": 49025 + }, + { + "epoch": 3.9716461438755672, + "grad_norm": 0.06714785844087601, + "learning_rate": 1.5752284081191774e-06, + "loss": 0.1954, + "step": 49026 + }, + { + "epoch": 3.9717271548930655, + "grad_norm": 0.07381433248519897, + "learning_rate": 1.5707277555245512e-06, + "loss": 0.2117, + "step": 49027 + }, + { + "epoch": 3.9718081659105637, + "grad_norm": 0.06595097482204437, + "learning_rate": 1.566227102929925e-06, + "loss": 0.2259, + "step": 49028 + }, + { + "epoch": 3.9718891769280624, + "grad_norm": 0.07303611189126968, + "learning_rate": 1.5617264503352987e-06, + "loss": 0.2157, + "step": 49029 + }, + { + "epoch": 3.9719701879455607, + "grad_norm": 0.09227094054222107, + "learning_rate": 1.5572257977406725e-06, + "loss": 0.2355, + "step": 49030 + }, + { + "epoch": 3.972051198963059, + "grad_norm": 0.07859620451927185, + "learning_rate": 1.5527251451460463e-06, + "loss": 0.2492, + "step": 49031 + }, + { + "epoch": 3.9721322099805576, + "grad_norm": 0.07130131870508194, + "learning_rate": 1.54822449255142e-06, + "loss": 0.2507, + "step": 49032 + }, + { + "epoch": 3.972213220998056, + "grad_norm": 0.08751437067985535, + "learning_rate": 1.5437238399567937e-06, + "loss": 0.22, + "step": 49033 + }, + { + "epoch": 3.972294232015554, + "grad_norm": 0.07616513967514038, + "learning_rate": 1.5392231873621675e-06, + "loss": 0.2547, + "step": 49034 + }, + { + "epoch": 3.9723752430330523, + "grad_norm": 0.07148608565330505, + "learning_rate": 1.5347225347675413e-06, + "loss": 0.2185, + "step": 49035 + }, + { + "epoch": 3.972456254050551, + "grad_norm": 0.07048953324556351, + "learning_rate": 1.5302218821729152e-06, + "loss": 0.2212, + "step": 49036 + }, + { + "epoch": 3.9725372650680493, + "grad_norm": 0.06672168523073196, + "learning_rate": 1.525721229578289e-06, + "loss": 0.1841, + "step": 49037 + }, + { + "epoch": 3.9726182760855475, + "grad_norm": 0.07018128037452698, + "learning_rate": 1.5212205769836628e-06, + "loss": 0.1995, + "step": 49038 + }, + { + "epoch": 3.9726992871030458, + "grad_norm": 0.07781115919351578, + "learning_rate": 1.5167199243890364e-06, + "loss": 0.2275, + "step": 49039 + }, + { + "epoch": 3.9727802981205445, + "grad_norm": 0.06993318349123001, + "learning_rate": 1.5122192717944102e-06, + "loss": 0.2015, + "step": 49040 + }, + { + "epoch": 3.9728613091380427, + "grad_norm": 0.0710805132985115, + "learning_rate": 1.507718619199784e-06, + "loss": 0.2146, + "step": 49041 + }, + { + "epoch": 3.972942320155541, + "grad_norm": 0.08362104743719101, + "learning_rate": 1.5032179666051578e-06, + "loss": 0.2333, + "step": 49042 + }, + { + "epoch": 3.9730233311730396, + "grad_norm": 0.07563608884811401, + "learning_rate": 1.4987173140105317e-06, + "loss": 0.214, + "step": 49043 + }, + { + "epoch": 3.973104342190538, + "grad_norm": 0.07376503944396973, + "learning_rate": 1.4942166614159055e-06, + "loss": 0.1837, + "step": 49044 + }, + { + "epoch": 3.973185353208036, + "grad_norm": 0.07147838920354843, + "learning_rate": 1.489716008821279e-06, + "loss": 0.2029, + "step": 49045 + }, + { + "epoch": 3.973266364225535, + "grad_norm": 0.08322165906429291, + "learning_rate": 1.4852153562266529e-06, + "loss": 0.2219, + "step": 49046 + }, + { + "epoch": 3.973347375243033, + "grad_norm": 0.07755421847105026, + "learning_rate": 1.4807147036320267e-06, + "loss": 0.2314, + "step": 49047 + }, + { + "epoch": 3.9734283862605313, + "grad_norm": 0.06550732254981995, + "learning_rate": 1.4762140510374005e-06, + "loss": 0.218, + "step": 49048 + }, + { + "epoch": 3.97350939727803, + "grad_norm": 0.062065187841653824, + "learning_rate": 1.4717133984427743e-06, + "loss": 0.2148, + "step": 49049 + }, + { + "epoch": 3.9735904082955282, + "grad_norm": 0.06424777954816818, + "learning_rate": 1.4672127458481481e-06, + "loss": 0.223, + "step": 49050 + }, + { + "epoch": 3.9736714193130265, + "grad_norm": 0.06383474171161652, + "learning_rate": 1.462712093253522e-06, + "loss": 0.216, + "step": 49051 + }, + { + "epoch": 3.973752430330525, + "grad_norm": 0.07289434969425201, + "learning_rate": 1.4582114406588956e-06, + "loss": 0.2535, + "step": 49052 + }, + { + "epoch": 3.9738334413480234, + "grad_norm": 0.08658458292484283, + "learning_rate": 1.4537107880642694e-06, + "loss": 0.2414, + "step": 49053 + }, + { + "epoch": 3.9739144523655217, + "grad_norm": 0.050192661583423615, + "learning_rate": 1.4492101354696432e-06, + "loss": 0.2052, + "step": 49054 + }, + { + "epoch": 3.9739954633830203, + "grad_norm": 0.061246853321790695, + "learning_rate": 1.4447094828750168e-06, + "loss": 0.2015, + "step": 49055 + }, + { + "epoch": 3.9740764744005186, + "grad_norm": 0.08989216387271881, + "learning_rate": 1.4402088302803908e-06, + "loss": 0.2483, + "step": 49056 + }, + { + "epoch": 3.974157485418017, + "grad_norm": 0.06548626720905304, + "learning_rate": 1.4357081776857646e-06, + "loss": 0.2245, + "step": 49057 + }, + { + "epoch": 3.974238496435515, + "grad_norm": 0.09296084940433502, + "learning_rate": 1.4312075250911382e-06, + "loss": 0.211, + "step": 49058 + }, + { + "epoch": 3.9743195074530138, + "grad_norm": 0.08216899633407593, + "learning_rate": 1.426706872496512e-06, + "loss": 0.2369, + "step": 49059 + }, + { + "epoch": 3.974400518470512, + "grad_norm": 0.06864972412586212, + "learning_rate": 1.4222062199018859e-06, + "loss": 0.2543, + "step": 49060 + }, + { + "epoch": 3.9744815294880103, + "grad_norm": 0.06113716587424278, + "learning_rate": 1.4177055673072595e-06, + "loss": 0.2291, + "step": 49061 + }, + { + "epoch": 3.9745625405055085, + "grad_norm": 0.07123496383428574, + "learning_rate": 1.4132049147126333e-06, + "loss": 0.192, + "step": 49062 + }, + { + "epoch": 3.974643551523007, + "grad_norm": 0.07263696938753128, + "learning_rate": 1.4087042621180073e-06, + "loss": 0.2148, + "step": 49063 + }, + { + "epoch": 3.9747245625405054, + "grad_norm": 0.07747448235750198, + "learning_rate": 1.404203609523381e-06, + "loss": 0.2068, + "step": 49064 + }, + { + "epoch": 3.9748055735580037, + "grad_norm": 0.06405054777860641, + "learning_rate": 1.3997029569287547e-06, + "loss": 0.2361, + "step": 49065 + }, + { + "epoch": 3.9748865845755024, + "grad_norm": 0.08656203746795654, + "learning_rate": 1.3952023043341286e-06, + "loss": 0.2595, + "step": 49066 + }, + { + "epoch": 3.9749675955930006, + "grad_norm": 0.06898082792758942, + "learning_rate": 1.3907016517395024e-06, + "loss": 0.1922, + "step": 49067 + }, + { + "epoch": 3.975048606610499, + "grad_norm": 0.08081252872943878, + "learning_rate": 1.386200999144876e-06, + "loss": 0.2472, + "step": 49068 + }, + { + "epoch": 3.9751296176279975, + "grad_norm": 0.1028057113289833, + "learning_rate": 1.3817003465502498e-06, + "loss": 0.2194, + "step": 49069 + }, + { + "epoch": 3.975210628645496, + "grad_norm": 0.0644211396574974, + "learning_rate": 1.3771996939556236e-06, + "loss": 0.2284, + "step": 49070 + }, + { + "epoch": 3.975291639662994, + "grad_norm": 0.07005336135625839, + "learning_rate": 1.3726990413609974e-06, + "loss": 0.2258, + "step": 49071 + }, + { + "epoch": 3.9753726506804927, + "grad_norm": 0.07396363466978073, + "learning_rate": 1.3681983887663712e-06, + "loss": 0.2059, + "step": 49072 + }, + { + "epoch": 3.975453661697991, + "grad_norm": 0.08447012305259705, + "learning_rate": 1.363697736171745e-06, + "loss": 0.2223, + "step": 49073 + }, + { + "epoch": 3.975534672715489, + "grad_norm": 0.07208585739135742, + "learning_rate": 1.3591970835771186e-06, + "loss": 0.2292, + "step": 49074 + }, + { + "epoch": 3.975615683732988, + "grad_norm": 0.07174715399742126, + "learning_rate": 1.3546964309824925e-06, + "loss": 0.2476, + "step": 49075 + }, + { + "epoch": 3.975696694750486, + "grad_norm": 0.06257397681474686, + "learning_rate": 1.3501957783878663e-06, + "loss": 0.2405, + "step": 49076 + }, + { + "epoch": 3.9757777057679844, + "grad_norm": 0.07049359381198883, + "learning_rate": 1.34569512579324e-06, + "loss": 0.2424, + "step": 49077 + }, + { + "epoch": 3.975858716785483, + "grad_norm": 0.06485053151845932, + "learning_rate": 1.341194473198614e-06, + "loss": 0.2209, + "step": 49078 + }, + { + "epoch": 3.9759397278029813, + "grad_norm": 0.07532617449760437, + "learning_rate": 1.3366938206039877e-06, + "loss": 0.2469, + "step": 49079 + }, + { + "epoch": 3.9760207388204796, + "grad_norm": 0.07897515594959259, + "learning_rate": 1.3321931680093615e-06, + "loss": 0.2256, + "step": 49080 + }, + { + "epoch": 3.976101749837978, + "grad_norm": 0.07902484387159348, + "learning_rate": 1.3276925154147351e-06, + "loss": 0.2179, + "step": 49081 + }, + { + "epoch": 3.9761827608554765, + "grad_norm": 0.07536326348781586, + "learning_rate": 1.323191862820109e-06, + "loss": 0.2311, + "step": 49082 + }, + { + "epoch": 3.9762637718729748, + "grad_norm": 0.1022820770740509, + "learning_rate": 1.3186912102254828e-06, + "loss": 0.2329, + "step": 49083 + }, + { + "epoch": 3.976344782890473, + "grad_norm": 0.08580672740936279, + "learning_rate": 1.3141905576308564e-06, + "loss": 0.2162, + "step": 49084 + }, + { + "epoch": 3.9764257939079712, + "grad_norm": 0.06963769346475601, + "learning_rate": 1.3096899050362304e-06, + "loss": 0.2087, + "step": 49085 + }, + { + "epoch": 3.97650680492547, + "grad_norm": 0.06818298995494843, + "learning_rate": 1.3051892524416042e-06, + "loss": 0.1951, + "step": 49086 + }, + { + "epoch": 3.976587815942968, + "grad_norm": 0.0778006762266159, + "learning_rate": 1.3006885998469778e-06, + "loss": 0.2266, + "step": 49087 + }, + { + "epoch": 3.9766688269604664, + "grad_norm": 0.06969862431287766, + "learning_rate": 1.2961879472523516e-06, + "loss": 0.2652, + "step": 49088 + }, + { + "epoch": 3.976749837977965, + "grad_norm": 0.07298214733600616, + "learning_rate": 1.2916872946577254e-06, + "loss": 0.2344, + "step": 49089 + }, + { + "epoch": 3.9768308489954634, + "grad_norm": 0.07190950959920883, + "learning_rate": 1.2871866420630993e-06, + "loss": 0.2142, + "step": 49090 + }, + { + "epoch": 3.9769118600129616, + "grad_norm": 0.07995373755693436, + "learning_rate": 1.2826859894684729e-06, + "loss": 0.1973, + "step": 49091 + }, + { + "epoch": 3.9769928710304603, + "grad_norm": 0.0613616481423378, + "learning_rate": 1.278185336873847e-06, + "loss": 0.2093, + "step": 49092 + }, + { + "epoch": 3.9770738820479585, + "grad_norm": 0.06585995107889175, + "learning_rate": 1.2736846842792205e-06, + "loss": 0.2344, + "step": 49093 + }, + { + "epoch": 3.9771548930654568, + "grad_norm": 0.07106611132621765, + "learning_rate": 1.2691840316845943e-06, + "loss": 0.2236, + "step": 49094 + }, + { + "epoch": 3.9772359040829555, + "grad_norm": 0.08742845058441162, + "learning_rate": 1.2646833790899681e-06, + "loss": 0.2795, + "step": 49095 + }, + { + "epoch": 3.9773169151004537, + "grad_norm": 0.06771603226661682, + "learning_rate": 1.260182726495342e-06, + "loss": 0.1974, + "step": 49096 + }, + { + "epoch": 3.977397926117952, + "grad_norm": 0.07327091693878174, + "learning_rate": 1.2556820739007155e-06, + "loss": 0.2038, + "step": 49097 + }, + { + "epoch": 3.9774789371354506, + "grad_norm": 0.08291606605052948, + "learning_rate": 1.2511814213060894e-06, + "loss": 0.2405, + "step": 49098 + }, + { + "epoch": 3.977559948152949, + "grad_norm": 0.07166020572185516, + "learning_rate": 1.2466807687114634e-06, + "loss": 0.2089, + "step": 49099 + }, + { + "epoch": 3.977640959170447, + "grad_norm": 0.0633031502366066, + "learning_rate": 1.242180116116837e-06, + "loss": 0.2083, + "step": 49100 + }, + { + "epoch": 3.977721970187946, + "grad_norm": 0.07655423879623413, + "learning_rate": 1.2376794635222108e-06, + "loss": 0.2237, + "step": 49101 + }, + { + "epoch": 3.977802981205444, + "grad_norm": 0.06801167130470276, + "learning_rate": 1.2331788109275846e-06, + "loss": 0.2008, + "step": 49102 + }, + { + "epoch": 3.9778839922229423, + "grad_norm": 0.09980975836515427, + "learning_rate": 1.2286781583329582e-06, + "loss": 0.2363, + "step": 49103 + }, + { + "epoch": 3.9779650032404406, + "grad_norm": 0.08504997938871384, + "learning_rate": 1.224177505738332e-06, + "loss": 0.241, + "step": 49104 + }, + { + "epoch": 3.9780460142579392, + "grad_norm": 0.07652637362480164, + "learning_rate": 1.2196768531437059e-06, + "loss": 0.2589, + "step": 49105 + }, + { + "epoch": 3.9781270252754375, + "grad_norm": 0.06277074664831161, + "learning_rate": 1.2151762005490797e-06, + "loss": 0.2218, + "step": 49106 + }, + { + "epoch": 3.9782080362929357, + "grad_norm": 0.06175466999411583, + "learning_rate": 1.2106755479544535e-06, + "loss": 0.2167, + "step": 49107 + }, + { + "epoch": 3.978289047310434, + "grad_norm": 0.06797228753566742, + "learning_rate": 1.2061748953598273e-06, + "loss": 0.1944, + "step": 49108 + }, + { + "epoch": 3.9783700583279327, + "grad_norm": 0.07684358209371567, + "learning_rate": 1.2016742427652011e-06, + "loss": 0.2701, + "step": 49109 + }, + { + "epoch": 3.978451069345431, + "grad_norm": 0.06503838300704956, + "learning_rate": 1.1971735901705747e-06, + "loss": 0.1913, + "step": 49110 + }, + { + "epoch": 3.978532080362929, + "grad_norm": 0.06464909762144089, + "learning_rate": 1.1926729375759485e-06, + "loss": 0.1783, + "step": 49111 + }, + { + "epoch": 3.978613091380428, + "grad_norm": 0.09183213114738464, + "learning_rate": 1.1881722849813223e-06, + "loss": 0.2243, + "step": 49112 + }, + { + "epoch": 3.978694102397926, + "grad_norm": 0.05835896357893944, + "learning_rate": 1.183671632386696e-06, + "loss": 0.1829, + "step": 49113 + }, + { + "epoch": 3.9787751134154243, + "grad_norm": 0.07257760316133499, + "learning_rate": 1.17917097979207e-06, + "loss": 0.2254, + "step": 49114 + }, + { + "epoch": 3.978856124432923, + "grad_norm": 0.06942275911569595, + "learning_rate": 1.1746703271974438e-06, + "loss": 0.2233, + "step": 49115 + }, + { + "epoch": 3.9789371354504213, + "grad_norm": 0.0803317055106163, + "learning_rate": 1.1701696746028174e-06, + "loss": 0.2426, + "step": 49116 + }, + { + "epoch": 3.9790181464679195, + "grad_norm": 0.08098768442869186, + "learning_rate": 1.1656690220081912e-06, + "loss": 0.2706, + "step": 49117 + }, + { + "epoch": 3.979099157485418, + "grad_norm": 0.07547670602798462, + "learning_rate": 1.161168369413565e-06, + "loss": 0.2932, + "step": 49118 + }, + { + "epoch": 3.9791801685029164, + "grad_norm": 0.060938771814107895, + "learning_rate": 1.1566677168189388e-06, + "loss": 0.2588, + "step": 49119 + }, + { + "epoch": 3.9792611795204147, + "grad_norm": 0.0760703831911087, + "learning_rate": 1.1521670642243124e-06, + "loss": 0.2226, + "step": 49120 + }, + { + "epoch": 3.9793421905379134, + "grad_norm": 0.0770978331565857, + "learning_rate": 1.1476664116296865e-06, + "loss": 0.2161, + "step": 49121 + }, + { + "epoch": 3.9794232015554116, + "grad_norm": 0.08614175021648407, + "learning_rate": 1.1431657590350603e-06, + "loss": 0.2087, + "step": 49122 + }, + { + "epoch": 3.97950421257291, + "grad_norm": 0.07408390194177628, + "learning_rate": 1.1386651064404339e-06, + "loss": 0.2756, + "step": 49123 + }, + { + "epoch": 3.9795852235904086, + "grad_norm": 0.07045788317918777, + "learning_rate": 1.1341644538458077e-06, + "loss": 0.25, + "step": 49124 + }, + { + "epoch": 3.979666234607907, + "grad_norm": 0.06843449175357819, + "learning_rate": 1.1296638012511815e-06, + "loss": 0.235, + "step": 49125 + }, + { + "epoch": 3.979747245625405, + "grad_norm": 0.06088000163435936, + "learning_rate": 1.1251631486565551e-06, + "loss": 0.1662, + "step": 49126 + }, + { + "epoch": 3.9798282566429033, + "grad_norm": 0.09108071774244308, + "learning_rate": 1.120662496061929e-06, + "loss": 0.1939, + "step": 49127 + }, + { + "epoch": 3.9799092676604015, + "grad_norm": 0.08113054931163788, + "learning_rate": 1.116161843467303e-06, + "loss": 0.2037, + "step": 49128 + }, + { + "epoch": 3.9799902786779002, + "grad_norm": 0.06638524681329727, + "learning_rate": 1.1116611908726766e-06, + "loss": 0.2, + "step": 49129 + }, + { + "epoch": 3.9800712896953985, + "grad_norm": 0.07080232352018356, + "learning_rate": 1.1071605382780504e-06, + "loss": 0.2203, + "step": 49130 + }, + { + "epoch": 3.9801523007128967, + "grad_norm": 0.06715063750743866, + "learning_rate": 1.1026598856834242e-06, + "loss": 0.2034, + "step": 49131 + }, + { + "epoch": 3.9802333117303954, + "grad_norm": 0.07261967658996582, + "learning_rate": 1.0981592330887978e-06, + "loss": 0.2407, + "step": 49132 + }, + { + "epoch": 3.9803143227478937, + "grad_norm": 0.08318926393985748, + "learning_rate": 1.0936585804941716e-06, + "loss": 0.1799, + "step": 49133 + }, + { + "epoch": 3.980395333765392, + "grad_norm": 0.07309187203645706, + "learning_rate": 1.0891579278995454e-06, + "loss": 0.2382, + "step": 49134 + }, + { + "epoch": 3.9804763447828906, + "grad_norm": 0.07083363831043243, + "learning_rate": 1.0846572753049192e-06, + "loss": 0.2248, + "step": 49135 + }, + { + "epoch": 3.980557355800389, + "grad_norm": 0.07788734883069992, + "learning_rate": 1.080156622710293e-06, + "loss": 0.2758, + "step": 49136 + }, + { + "epoch": 3.980638366817887, + "grad_norm": 0.07184889167547226, + "learning_rate": 1.0756559701156669e-06, + "loss": 0.2069, + "step": 49137 + }, + { + "epoch": 3.9807193778353858, + "grad_norm": 0.05995340272784233, + "learning_rate": 1.0711553175210407e-06, + "loss": 0.1908, + "step": 49138 + }, + { + "epoch": 3.980800388852884, + "grad_norm": 0.06793571263551712, + "learning_rate": 1.0666546649264143e-06, + "loss": 0.2183, + "step": 49139 + }, + { + "epoch": 3.9808813998703823, + "grad_norm": 0.0675991103053093, + "learning_rate": 1.0621540123317881e-06, + "loss": 0.2088, + "step": 49140 + }, + { + "epoch": 3.980962410887881, + "grad_norm": 0.06840286403894424, + "learning_rate": 1.057653359737162e-06, + "loss": 0.2176, + "step": 49141 + }, + { + "epoch": 3.981043421905379, + "grad_norm": 0.07902107387781143, + "learning_rate": 1.0531527071425355e-06, + "loss": 0.2281, + "step": 49142 + }, + { + "epoch": 3.9811244329228774, + "grad_norm": 0.06392716616392136, + "learning_rate": 1.0486520545479096e-06, + "loss": 0.2276, + "step": 49143 + }, + { + "epoch": 3.981205443940376, + "grad_norm": 0.07346402853727341, + "learning_rate": 1.0441514019532834e-06, + "loss": 0.2349, + "step": 49144 + }, + { + "epoch": 3.9812864549578744, + "grad_norm": 0.07689003646373749, + "learning_rate": 1.039650749358657e-06, + "loss": 0.2228, + "step": 49145 + }, + { + "epoch": 3.9813674659753726, + "grad_norm": 0.0874786451458931, + "learning_rate": 1.0351500967640308e-06, + "loss": 0.2419, + "step": 49146 + }, + { + "epoch": 3.9814484769928713, + "grad_norm": 0.07139569520950317, + "learning_rate": 1.0306494441694046e-06, + "loss": 0.252, + "step": 49147 + }, + { + "epoch": 3.9815294880103695, + "grad_norm": 0.08377675712108612, + "learning_rate": 1.0261487915747784e-06, + "loss": 0.2182, + "step": 49148 + }, + { + "epoch": 3.981610499027868, + "grad_norm": 0.0838305801153183, + "learning_rate": 1.021648138980152e-06, + "loss": 0.237, + "step": 49149 + }, + { + "epoch": 3.981691510045366, + "grad_norm": 0.07349993288516998, + "learning_rate": 1.017147486385526e-06, + "loss": 0.2277, + "step": 49150 + }, + { + "epoch": 3.9817725210628643, + "grad_norm": 0.07764997333288193, + "learning_rate": 1.0126468337908999e-06, + "loss": 0.2132, + "step": 49151 + }, + { + "epoch": 3.981853532080363, + "grad_norm": 0.07144745439291, + "learning_rate": 1.0081461811962735e-06, + "loss": 0.2223, + "step": 49152 + }, + { + "epoch": 3.981934543097861, + "grad_norm": 0.06616366654634476, + "learning_rate": 1.0036455286016473e-06, + "loss": 0.2418, + "step": 49153 + }, + { + "epoch": 3.9820155541153595, + "grad_norm": 0.0774175375699997, + "learning_rate": 9.99144876007021e-07, + "loss": 0.2486, + "step": 49154 + }, + { + "epoch": 3.982096565132858, + "grad_norm": 0.08736024051904678, + "learning_rate": 9.946442234123947e-07, + "loss": 0.2434, + "step": 49155 + }, + { + "epoch": 3.9821775761503564, + "grad_norm": 0.06919243931770325, + "learning_rate": 9.901435708177685e-07, + "loss": 0.2726, + "step": 49156 + }, + { + "epoch": 3.9822585871678546, + "grad_norm": 0.07379870116710663, + "learning_rate": 9.856429182231425e-07, + "loss": 0.2308, + "step": 49157 + }, + { + "epoch": 3.9823395981853533, + "grad_norm": 0.05552654340863228, + "learning_rate": 9.811422656285161e-07, + "loss": 0.2211, + "step": 49158 + }, + { + "epoch": 3.9824206092028516, + "grad_norm": 0.07735942304134369, + "learning_rate": 9.7664161303389e-07, + "loss": 0.2101, + "step": 49159 + }, + { + "epoch": 3.98250162022035, + "grad_norm": 0.07008510828018188, + "learning_rate": 9.721409604392638e-07, + "loss": 0.2402, + "step": 49160 + }, + { + "epoch": 3.9825826312378485, + "grad_norm": 0.06766650825738907, + "learning_rate": 9.676403078446376e-07, + "loss": 0.2189, + "step": 49161 + }, + { + "epoch": 3.9826636422553467, + "grad_norm": 0.06831780821084976, + "learning_rate": 9.631396552500112e-07, + "loss": 0.1818, + "step": 49162 + }, + { + "epoch": 3.982744653272845, + "grad_norm": 0.07361584156751633, + "learning_rate": 9.58639002655385e-07, + "loss": 0.2391, + "step": 49163 + }, + { + "epoch": 3.9828256642903437, + "grad_norm": 0.07479403913021088, + "learning_rate": 9.541383500607588e-07, + "loss": 0.2105, + "step": 49164 + }, + { + "epoch": 3.982906675307842, + "grad_norm": 0.07533962279558182, + "learning_rate": 9.496376974661326e-07, + "loss": 0.2128, + "step": 49165 + }, + { + "epoch": 3.98298768632534, + "grad_norm": 0.07377752661705017, + "learning_rate": 9.451370448715065e-07, + "loss": 0.2785, + "step": 49166 + }, + { + "epoch": 3.983068697342839, + "grad_norm": 0.06769118458032608, + "learning_rate": 9.406363922768802e-07, + "loss": 0.2386, + "step": 49167 + }, + { + "epoch": 3.983149708360337, + "grad_norm": 0.06710786372423172, + "learning_rate": 9.36135739682254e-07, + "loss": 0.2021, + "step": 49168 + }, + { + "epoch": 3.9832307193778353, + "grad_norm": 0.07649517804384232, + "learning_rate": 9.316350870876277e-07, + "loss": 0.2129, + "step": 49169 + }, + { + "epoch": 3.983311730395334, + "grad_norm": 0.07384105026721954, + "learning_rate": 9.271344344930015e-07, + "loss": 0.206, + "step": 49170 + }, + { + "epoch": 3.9833927414128323, + "grad_norm": 0.07594034075737, + "learning_rate": 9.226337818983754e-07, + "loss": 0.2499, + "step": 49171 + }, + { + "epoch": 3.9834737524303305, + "grad_norm": 0.07467283308506012, + "learning_rate": 9.181331293037491e-07, + "loss": 0.236, + "step": 49172 + }, + { + "epoch": 3.9835547634478288, + "grad_norm": 0.07646078616380692, + "learning_rate": 9.13632476709123e-07, + "loss": 0.2835, + "step": 49173 + }, + { + "epoch": 3.983635774465327, + "grad_norm": 0.079698346555233, + "learning_rate": 9.091318241144967e-07, + "loss": 0.2269, + "step": 49174 + }, + { + "epoch": 3.9837167854828257, + "grad_norm": 0.08032261580228806, + "learning_rate": 9.046311715198704e-07, + "loss": 0.2175, + "step": 49175 + }, + { + "epoch": 3.983797796500324, + "grad_norm": 0.08225135505199432, + "learning_rate": 9.001305189252442e-07, + "loss": 0.2507, + "step": 49176 + }, + { + "epoch": 3.983878807517822, + "grad_norm": 0.08566189557313919, + "learning_rate": 8.956298663306179e-07, + "loss": 0.2036, + "step": 49177 + }, + { + "epoch": 3.983959818535321, + "grad_norm": 0.07229893654584885, + "learning_rate": 8.911292137359917e-07, + "loss": 0.2109, + "step": 49178 + }, + { + "epoch": 3.984040829552819, + "grad_norm": 0.07611256837844849, + "learning_rate": 8.866285611413656e-07, + "loss": 0.2151, + "step": 49179 + }, + { + "epoch": 3.9841218405703174, + "grad_norm": 0.06633207947015762, + "learning_rate": 8.821279085467393e-07, + "loss": 0.2284, + "step": 49180 + }, + { + "epoch": 3.984202851587816, + "grad_norm": 0.0713014304637909, + "learning_rate": 8.776272559521132e-07, + "loss": 0.2267, + "step": 49181 + }, + { + "epoch": 3.9842838626053143, + "grad_norm": 0.05671551451086998, + "learning_rate": 8.731266033574869e-07, + "loss": 0.231, + "step": 49182 + }, + { + "epoch": 3.9843648736228126, + "grad_norm": 0.08908804506063461, + "learning_rate": 8.686259507628606e-07, + "loss": 0.243, + "step": 49183 + }, + { + "epoch": 3.9844458846403112, + "grad_norm": 0.08369207382202148, + "learning_rate": 8.641252981682344e-07, + "loss": 0.2124, + "step": 49184 + }, + { + "epoch": 3.9845268956578095, + "grad_norm": 0.075348399579525, + "learning_rate": 8.596246455736081e-07, + "loss": 0.2084, + "step": 49185 + }, + { + "epoch": 3.9846079066753077, + "grad_norm": 0.07731541991233826, + "learning_rate": 8.55123992978982e-07, + "loss": 0.2089, + "step": 49186 + }, + { + "epoch": 3.9846889176928064, + "grad_norm": 0.06858506798744202, + "learning_rate": 8.506233403843558e-07, + "loss": 0.1975, + "step": 49187 + }, + { + "epoch": 3.9847699287103047, + "grad_norm": 0.06854752451181412, + "learning_rate": 8.461226877897295e-07, + "loss": 0.2154, + "step": 49188 + }, + { + "epoch": 3.984850939727803, + "grad_norm": 0.0722111314535141, + "learning_rate": 8.416220351951034e-07, + "loss": 0.2499, + "step": 49189 + }, + { + "epoch": 3.9849319507453016, + "grad_norm": 0.07207582145929337, + "learning_rate": 8.371213826004771e-07, + "loss": 0.2138, + "step": 49190 + }, + { + "epoch": 3.9850129617628, + "grad_norm": 0.07529518753290176, + "learning_rate": 8.326207300058509e-07, + "loss": 0.2467, + "step": 49191 + }, + { + "epoch": 3.985093972780298, + "grad_norm": 0.0700589045882225, + "learning_rate": 8.281200774112246e-07, + "loss": 0.237, + "step": 49192 + }, + { + "epoch": 3.9851749837977968, + "grad_norm": 0.06596378237009048, + "learning_rate": 8.236194248165985e-07, + "loss": 0.19, + "step": 49193 + }, + { + "epoch": 3.985255994815295, + "grad_norm": 0.058588989078998566, + "learning_rate": 8.191187722219722e-07, + "loss": 0.2075, + "step": 49194 + }, + { + "epoch": 3.9853370058327933, + "grad_norm": 0.08515679091215134, + "learning_rate": 8.14618119627346e-07, + "loss": 0.2337, + "step": 49195 + }, + { + "epoch": 3.9854180168502915, + "grad_norm": 0.08090581744909286, + "learning_rate": 8.101174670327197e-07, + "loss": 0.2216, + "step": 49196 + }, + { + "epoch": 3.9854990278677898, + "grad_norm": 0.06988976150751114, + "learning_rate": 8.056168144380936e-07, + "loss": 0.2209, + "step": 49197 + }, + { + "epoch": 3.9855800388852884, + "grad_norm": 0.08122329413890839, + "learning_rate": 8.011161618434673e-07, + "loss": 0.2666, + "step": 49198 + }, + { + "epoch": 3.9856610499027867, + "grad_norm": 0.06371399760246277, + "learning_rate": 7.966155092488411e-07, + "loss": 0.2513, + "step": 49199 + }, + { + "epoch": 3.985742060920285, + "grad_norm": 0.07604499161243439, + "learning_rate": 7.92114856654215e-07, + "loss": 0.2035, + "step": 49200 + }, + { + "epoch": 3.9858230719377836, + "grad_norm": 0.07374300062656403, + "learning_rate": 7.876142040595887e-07, + "loss": 0.2181, + "step": 49201 + }, + { + "epoch": 3.985904082955282, + "grad_norm": 0.07626064866781235, + "learning_rate": 7.831135514649625e-07, + "loss": 0.239, + "step": 49202 + }, + { + "epoch": 3.98598509397278, + "grad_norm": 0.05903629586100578, + "learning_rate": 7.786128988703362e-07, + "loss": 0.2137, + "step": 49203 + }, + { + "epoch": 3.986066104990279, + "grad_norm": 0.07717857509851456, + "learning_rate": 7.7411224627571e-07, + "loss": 0.2144, + "step": 49204 + }, + { + "epoch": 3.986147116007777, + "grad_norm": 0.07685103267431259, + "learning_rate": 7.696115936810838e-07, + "loss": 0.2063, + "step": 49205 + }, + { + "epoch": 3.9862281270252753, + "grad_norm": 0.06778260320425034, + "learning_rate": 7.651109410864576e-07, + "loss": 0.2426, + "step": 49206 + }, + { + "epoch": 3.986309138042774, + "grad_norm": 0.06964470446109772, + "learning_rate": 7.606102884918314e-07, + "loss": 0.2167, + "step": 49207 + }, + { + "epoch": 3.9863901490602722, + "grad_norm": 0.0588604211807251, + "learning_rate": 7.561096358972051e-07, + "loss": 0.2095, + "step": 49208 + }, + { + "epoch": 3.9864711600777705, + "grad_norm": 0.06138395890593529, + "learning_rate": 7.516089833025789e-07, + "loss": 0.2115, + "step": 49209 + }, + { + "epoch": 3.986552171095269, + "grad_norm": 0.07522926479578018, + "learning_rate": 7.471083307079527e-07, + "loss": 0.228, + "step": 49210 + }, + { + "epoch": 3.9866331821127674, + "grad_norm": 0.06714662909507751, + "learning_rate": 7.426076781133264e-07, + "loss": 0.2457, + "step": 49211 + }, + { + "epoch": 3.9867141931302656, + "grad_norm": 0.06937988847494125, + "learning_rate": 7.381070255187003e-07, + "loss": 0.2112, + "step": 49212 + }, + { + "epoch": 3.9867952041477643, + "grad_norm": 0.06987298280000687, + "learning_rate": 7.336063729240741e-07, + "loss": 0.2207, + "step": 49213 + }, + { + "epoch": 3.9868762151652626, + "grad_norm": 0.07640345394611359, + "learning_rate": 7.291057203294478e-07, + "loss": 0.2241, + "step": 49214 + }, + { + "epoch": 3.986957226182761, + "grad_norm": 0.06703965365886688, + "learning_rate": 7.246050677348216e-07, + "loss": 0.2068, + "step": 49215 + }, + { + "epoch": 3.987038237200259, + "grad_norm": 0.06497196108102798, + "learning_rate": 7.201044151401954e-07, + "loss": 0.2513, + "step": 49216 + }, + { + "epoch": 3.9871192482177578, + "grad_norm": 0.08134781569242477, + "learning_rate": 7.156037625455691e-07, + "loss": 0.2198, + "step": 49217 + }, + { + "epoch": 3.987200259235256, + "grad_norm": 0.07386305928230286, + "learning_rate": 7.111031099509429e-07, + "loss": 0.2293, + "step": 49218 + }, + { + "epoch": 3.9872812702527543, + "grad_norm": 0.06610637903213501, + "learning_rate": 7.066024573563166e-07, + "loss": 0.2115, + "step": 49219 + }, + { + "epoch": 3.9873622812702525, + "grad_norm": 0.073157899081707, + "learning_rate": 7.021018047616905e-07, + "loss": 0.2124, + "step": 49220 + }, + { + "epoch": 3.987443292287751, + "grad_norm": 0.0644218772649765, + "learning_rate": 6.976011521670643e-07, + "loss": 0.2333, + "step": 49221 + }, + { + "epoch": 3.9875243033052494, + "grad_norm": 0.07077537477016449, + "learning_rate": 6.93100499572438e-07, + "loss": 0.1877, + "step": 49222 + }, + { + "epoch": 3.9876053143227477, + "grad_norm": 0.0648820549249649, + "learning_rate": 6.885998469778118e-07, + "loss": 0.2393, + "step": 49223 + }, + { + "epoch": 3.9876863253402464, + "grad_norm": 0.06722790747880936, + "learning_rate": 6.840991943831856e-07, + "loss": 0.2105, + "step": 49224 + }, + { + "epoch": 3.9877673363577446, + "grad_norm": 0.07644420117139816, + "learning_rate": 6.795985417885593e-07, + "loss": 0.2234, + "step": 49225 + }, + { + "epoch": 3.987848347375243, + "grad_norm": 0.07702749967575073, + "learning_rate": 6.750978891939331e-07, + "loss": 0.2136, + "step": 49226 + }, + { + "epoch": 3.9879293583927415, + "grad_norm": 0.06729872524738312, + "learning_rate": 6.70597236599307e-07, + "loss": 0.191, + "step": 49227 + }, + { + "epoch": 3.98801036941024, + "grad_norm": 0.07493147253990173, + "learning_rate": 6.660965840046808e-07, + "loss": 0.2039, + "step": 49228 + }, + { + "epoch": 3.988091380427738, + "grad_norm": 0.06625507026910782, + "learning_rate": 6.615959314100545e-07, + "loss": 0.2119, + "step": 49229 + }, + { + "epoch": 3.9881723914452367, + "grad_norm": 0.052606746554374695, + "learning_rate": 6.570952788154282e-07, + "loss": 0.2399, + "step": 49230 + }, + { + "epoch": 3.988253402462735, + "grad_norm": 0.06827935576438904, + "learning_rate": 6.525946262208021e-07, + "loss": 0.2288, + "step": 49231 + }, + { + "epoch": 3.988334413480233, + "grad_norm": 0.08367155492305756, + "learning_rate": 6.480939736261758e-07, + "loss": 0.2511, + "step": 49232 + }, + { + "epoch": 3.988415424497732, + "grad_norm": 0.08619492501020432, + "learning_rate": 6.435933210315496e-07, + "loss": 0.2279, + "step": 49233 + }, + { + "epoch": 3.98849643551523, + "grad_norm": 0.08508004993200302, + "learning_rate": 6.390926684369234e-07, + "loss": 0.2145, + "step": 49234 + }, + { + "epoch": 3.9885774465327284, + "grad_norm": 0.07652024179697037, + "learning_rate": 6.345920158422972e-07, + "loss": 0.3088, + "step": 49235 + }, + { + "epoch": 3.988658457550227, + "grad_norm": 0.09197687357664108, + "learning_rate": 6.30091363247671e-07, + "loss": 0.2427, + "step": 49236 + }, + { + "epoch": 3.9887394685677253, + "grad_norm": 0.05859651789069176, + "learning_rate": 6.255907106530447e-07, + "loss": 0.2136, + "step": 49237 + }, + { + "epoch": 3.9888204795852236, + "grad_norm": 0.08267036080360413, + "learning_rate": 6.210900580584185e-07, + "loss": 0.2678, + "step": 49238 + }, + { + "epoch": 3.988901490602722, + "grad_norm": 0.05907399207353592, + "learning_rate": 6.165894054637923e-07, + "loss": 0.2066, + "step": 49239 + }, + { + "epoch": 3.9889825016202205, + "grad_norm": 0.07600349932909012, + "learning_rate": 6.12088752869166e-07, + "loss": 0.2264, + "step": 49240 + }, + { + "epoch": 3.9890635126377187, + "grad_norm": 0.08025678247213364, + "learning_rate": 6.075881002745398e-07, + "loss": 0.2387, + "step": 49241 + }, + { + "epoch": 3.989144523655217, + "grad_norm": 0.06902442127466202, + "learning_rate": 6.030874476799137e-07, + "loss": 0.2211, + "step": 49242 + }, + { + "epoch": 3.9892255346727152, + "grad_norm": 0.08164741843938828, + "learning_rate": 5.985867950852874e-07, + "loss": 0.2501, + "step": 49243 + }, + { + "epoch": 3.989306545690214, + "grad_norm": 0.058244526386260986, + "learning_rate": 5.940861424906612e-07, + "loss": 0.2106, + "step": 49244 + }, + { + "epoch": 3.989387556707712, + "grad_norm": 0.06369130313396454, + "learning_rate": 5.89585489896035e-07, + "loss": 0.1867, + "step": 49245 + }, + { + "epoch": 3.9894685677252104, + "grad_norm": 0.05307641252875328, + "learning_rate": 5.850848373014087e-07, + "loss": 0.1732, + "step": 49246 + }, + { + "epoch": 3.989549578742709, + "grad_norm": 0.06525785475969315, + "learning_rate": 5.805841847067825e-07, + "loss": 0.2048, + "step": 49247 + }, + { + "epoch": 3.9896305897602073, + "grad_norm": 0.06935491412878036, + "learning_rate": 5.760835321121562e-07, + "loss": 0.1843, + "step": 49248 + }, + { + "epoch": 3.9897116007777056, + "grad_norm": 0.058017581701278687, + "learning_rate": 5.715828795175301e-07, + "loss": 0.21, + "step": 49249 + }, + { + "epoch": 3.9897926117952043, + "grad_norm": 0.0743686705827713, + "learning_rate": 5.670822269229039e-07, + "loss": 0.2, + "step": 49250 + }, + { + "epoch": 3.9898736228127025, + "grad_norm": 0.07183623313903809, + "learning_rate": 5.625815743282776e-07, + "loss": 0.2428, + "step": 49251 + }, + { + "epoch": 3.9899546338302008, + "grad_norm": 0.06011229380965233, + "learning_rate": 5.580809217336515e-07, + "loss": 0.2315, + "step": 49252 + }, + { + "epoch": 3.9900356448476995, + "grad_norm": 0.0707809180021286, + "learning_rate": 5.535802691390252e-07, + "loss": 0.2014, + "step": 49253 + }, + { + "epoch": 3.9901166558651977, + "grad_norm": 0.06794430315494537, + "learning_rate": 5.490796165443989e-07, + "loss": 0.2359, + "step": 49254 + }, + { + "epoch": 3.990197666882696, + "grad_norm": 0.07199983298778534, + "learning_rate": 5.445789639497727e-07, + "loss": 0.2156, + "step": 49255 + }, + { + "epoch": 3.9902786779001946, + "grad_norm": 0.10244887322187424, + "learning_rate": 5.400783113551465e-07, + "loss": 0.2666, + "step": 49256 + }, + { + "epoch": 3.990359688917693, + "grad_norm": 0.058576490730047226, + "learning_rate": 5.355776587605203e-07, + "loss": 0.197, + "step": 49257 + }, + { + "epoch": 3.990440699935191, + "grad_norm": 0.07232537120580673, + "learning_rate": 5.310770061658941e-07, + "loss": 0.203, + "step": 49258 + }, + { + "epoch": 3.99052171095269, + "grad_norm": 0.07895112782716751, + "learning_rate": 5.265763535712678e-07, + "loss": 0.1973, + "step": 49259 + }, + { + "epoch": 3.990602721970188, + "grad_norm": 0.0817318707704544, + "learning_rate": 5.220757009766417e-07, + "loss": 0.2765, + "step": 49260 + }, + { + "epoch": 3.9906837329876863, + "grad_norm": 0.06556425988674164, + "learning_rate": 5.175750483820154e-07, + "loss": 0.2576, + "step": 49261 + }, + { + "epoch": 3.9907647440051845, + "grad_norm": 0.07194989919662476, + "learning_rate": 5.130743957873892e-07, + "loss": 0.2051, + "step": 49262 + }, + { + "epoch": 3.9908457550226832, + "grad_norm": 0.07065384835004807, + "learning_rate": 5.08573743192763e-07, + "loss": 0.2483, + "step": 49263 + }, + { + "epoch": 3.9909267660401815, + "grad_norm": 0.07403761893510818, + "learning_rate": 5.040730905981367e-07, + "loss": 0.2408, + "step": 49264 + }, + { + "epoch": 3.9910077770576797, + "grad_norm": 0.06701704114675522, + "learning_rate": 4.995724380035106e-07, + "loss": 0.1916, + "step": 49265 + }, + { + "epoch": 3.991088788075178, + "grad_norm": 0.07365620136260986, + "learning_rate": 4.950717854088843e-07, + "loss": 0.2207, + "step": 49266 + }, + { + "epoch": 3.9911697990926767, + "grad_norm": 0.07243810594081879, + "learning_rate": 4.905711328142581e-07, + "loss": 0.1908, + "step": 49267 + }, + { + "epoch": 3.991250810110175, + "grad_norm": 0.06687340140342712, + "learning_rate": 4.860704802196319e-07, + "loss": 0.2387, + "step": 49268 + }, + { + "epoch": 3.991331821127673, + "grad_norm": 0.07321549952030182, + "learning_rate": 4.815698276250056e-07, + "loss": 0.2107, + "step": 49269 + }, + { + "epoch": 3.991412832145172, + "grad_norm": 0.06943608075380325, + "learning_rate": 4.770691750303794e-07, + "loss": 0.2445, + "step": 49270 + }, + { + "epoch": 3.99149384316267, + "grad_norm": 0.06473980844020844, + "learning_rate": 4.7256852243575323e-07, + "loss": 0.1774, + "step": 49271 + }, + { + "epoch": 3.9915748541801683, + "grad_norm": 0.08363772183656693, + "learning_rate": 4.68067869841127e-07, + "loss": 0.2475, + "step": 49272 + }, + { + "epoch": 3.991655865197667, + "grad_norm": 0.08146770298480988, + "learning_rate": 4.6356721724650075e-07, + "loss": 0.2079, + "step": 49273 + }, + { + "epoch": 3.9917368762151653, + "grad_norm": 0.07280329614877701, + "learning_rate": 4.5906656465187457e-07, + "loss": 0.2295, + "step": 49274 + }, + { + "epoch": 3.9918178872326635, + "grad_norm": 0.08392112702131271, + "learning_rate": 4.5456591205724833e-07, + "loss": 0.2488, + "step": 49275 + }, + { + "epoch": 3.991898898250162, + "grad_norm": 0.06567014008760452, + "learning_rate": 4.500652594626221e-07, + "loss": 0.2126, + "step": 49276 + }, + { + "epoch": 3.9919799092676604, + "grad_norm": 0.06484349817037582, + "learning_rate": 4.4556460686799585e-07, + "loss": 0.1919, + "step": 49277 + }, + { + "epoch": 3.9920609202851587, + "grad_norm": 0.07832783460617065, + "learning_rate": 4.4106395427336967e-07, + "loss": 0.2157, + "step": 49278 + }, + { + "epoch": 3.9921419313026574, + "grad_norm": 0.0768279954791069, + "learning_rate": 4.3656330167874343e-07, + "loss": 0.2463, + "step": 49279 + }, + { + "epoch": 3.9922229423201556, + "grad_norm": 0.06600246578454971, + "learning_rate": 4.320626490841172e-07, + "loss": 0.2531, + "step": 49280 + }, + { + "epoch": 3.992303953337654, + "grad_norm": 0.0632348507642746, + "learning_rate": 4.27561996489491e-07, + "loss": 0.2087, + "step": 49281 + }, + { + "epoch": 3.9923849643551526, + "grad_norm": 0.10359801352024078, + "learning_rate": 4.2306134389486477e-07, + "loss": 0.1958, + "step": 49282 + }, + { + "epoch": 3.992465975372651, + "grad_norm": 0.0757150650024414, + "learning_rate": 4.1856069130023853e-07, + "loss": 0.2425, + "step": 49283 + }, + { + "epoch": 3.992546986390149, + "grad_norm": 0.07178405672311783, + "learning_rate": 4.140600387056123e-07, + "loss": 0.2043, + "step": 49284 + }, + { + "epoch": 3.9926279974076473, + "grad_norm": 0.09844893217086792, + "learning_rate": 4.095593861109861e-07, + "loss": 0.2262, + "step": 49285 + }, + { + "epoch": 3.992709008425146, + "grad_norm": 0.07299178838729858, + "learning_rate": 4.0505873351635987e-07, + "loss": 0.2082, + "step": 49286 + }, + { + "epoch": 3.9927900194426442, + "grad_norm": 0.08776606619358063, + "learning_rate": 4.0055808092173363e-07, + "loss": 0.2229, + "step": 49287 + }, + { + "epoch": 3.9928710304601425, + "grad_norm": 0.06883752346038818, + "learning_rate": 3.960574283271075e-07, + "loss": 0.1953, + "step": 49288 + }, + { + "epoch": 3.9929520414776407, + "grad_norm": 0.06461341679096222, + "learning_rate": 3.9155677573248126e-07, + "loss": 0.2013, + "step": 49289 + }, + { + "epoch": 3.9930330524951394, + "grad_norm": 0.058432161808013916, + "learning_rate": 3.87056123137855e-07, + "loss": 0.2223, + "step": 49290 + }, + { + "epoch": 3.9931140635126376, + "grad_norm": 0.06838857382535934, + "learning_rate": 3.825554705432288e-07, + "loss": 0.2357, + "step": 49291 + }, + { + "epoch": 3.993195074530136, + "grad_norm": 0.07145708054304123, + "learning_rate": 3.7805481794860255e-07, + "loss": 0.2288, + "step": 49292 + }, + { + "epoch": 3.9932760855476346, + "grad_norm": 0.06667095422744751, + "learning_rate": 3.7355416535397637e-07, + "loss": 0.22, + "step": 49293 + }, + { + "epoch": 3.993357096565133, + "grad_norm": 0.06883816421031952, + "learning_rate": 3.6905351275935013e-07, + "loss": 0.2138, + "step": 49294 + }, + { + "epoch": 3.993438107582631, + "grad_norm": 0.07182978838682175, + "learning_rate": 3.645528601647239e-07, + "loss": 0.2382, + "step": 49295 + }, + { + "epoch": 3.9935191186001298, + "grad_norm": 0.08456697314977646, + "learning_rate": 3.600522075700977e-07, + "loss": 0.2418, + "step": 49296 + }, + { + "epoch": 3.993600129617628, + "grad_norm": 0.06518872827291489, + "learning_rate": 3.5555155497547147e-07, + "loss": 0.2274, + "step": 49297 + }, + { + "epoch": 3.9936811406351262, + "grad_norm": 0.07191832363605499, + "learning_rate": 3.5105090238084523e-07, + "loss": 0.2078, + "step": 49298 + }, + { + "epoch": 3.993762151652625, + "grad_norm": 0.07570493221282959, + "learning_rate": 3.46550249786219e-07, + "loss": 0.1972, + "step": 49299 + }, + { + "epoch": 3.993843162670123, + "grad_norm": 0.0672534927725792, + "learning_rate": 3.420495971915928e-07, + "loss": 0.2069, + "step": 49300 + }, + { + "epoch": 3.9939241736876214, + "grad_norm": 0.08297023177146912, + "learning_rate": 3.3754894459696657e-07, + "loss": 0.2224, + "step": 49301 + }, + { + "epoch": 3.99400518470512, + "grad_norm": 0.0740421712398529, + "learning_rate": 3.330482920023404e-07, + "loss": 0.2122, + "step": 49302 + }, + { + "epoch": 3.9940861957226184, + "grad_norm": 0.07864964753389359, + "learning_rate": 3.285476394077141e-07, + "loss": 0.2519, + "step": 49303 + }, + { + "epoch": 3.9941672067401166, + "grad_norm": 0.07417288422584534, + "learning_rate": 3.240469868130879e-07, + "loss": 0.2207, + "step": 49304 + }, + { + "epoch": 3.9942482177576153, + "grad_norm": 0.06941584497690201, + "learning_rate": 3.195463342184617e-07, + "loss": 0.2537, + "step": 49305 + }, + { + "epoch": 3.9943292287751135, + "grad_norm": 0.07486558705568314, + "learning_rate": 3.150456816238355e-07, + "loss": 0.2112, + "step": 49306 + }, + { + "epoch": 3.994410239792612, + "grad_norm": 0.06437824666500092, + "learning_rate": 3.1054502902920925e-07, + "loss": 0.2062, + "step": 49307 + }, + { + "epoch": 3.99449125081011, + "grad_norm": 0.07233487814664841, + "learning_rate": 3.06044376434583e-07, + "loss": 0.2241, + "step": 49308 + }, + { + "epoch": 3.9945722618276087, + "grad_norm": 0.06914579123258591, + "learning_rate": 3.015437238399568e-07, + "loss": 0.1937, + "step": 49309 + }, + { + "epoch": 3.994653272845107, + "grad_norm": 0.07031723856925964, + "learning_rate": 2.970430712453306e-07, + "loss": 0.2298, + "step": 49310 + }, + { + "epoch": 3.994734283862605, + "grad_norm": 0.0636644959449768, + "learning_rate": 2.9254241865070435e-07, + "loss": 0.2067, + "step": 49311 + }, + { + "epoch": 3.9948152948801035, + "grad_norm": 0.08062580972909927, + "learning_rate": 2.880417660560781e-07, + "loss": 0.2336, + "step": 49312 + }, + { + "epoch": 3.994896305897602, + "grad_norm": 0.07863558083772659, + "learning_rate": 2.8354111346145193e-07, + "loss": 0.202, + "step": 49313 + }, + { + "epoch": 3.9949773169151004, + "grad_norm": 0.0746251717209816, + "learning_rate": 2.7904046086682574e-07, + "loss": 0.2108, + "step": 49314 + }, + { + "epoch": 3.9950583279325986, + "grad_norm": 0.06599914282560349, + "learning_rate": 2.7453980827219945e-07, + "loss": 0.203, + "step": 49315 + }, + { + "epoch": 3.9951393389500973, + "grad_norm": 0.0633784607052803, + "learning_rate": 2.7003915567757327e-07, + "loss": 0.214, + "step": 49316 + }, + { + "epoch": 3.9952203499675956, + "grad_norm": 0.07074976712465286, + "learning_rate": 2.6553850308294703e-07, + "loss": 0.2186, + "step": 49317 + }, + { + "epoch": 3.995301360985094, + "grad_norm": 0.08301674574613571, + "learning_rate": 2.6103785048832084e-07, + "loss": 0.2604, + "step": 49318 + }, + { + "epoch": 3.9953823720025925, + "grad_norm": 0.07280449569225311, + "learning_rate": 2.565371978936946e-07, + "loss": 0.2365, + "step": 49319 + }, + { + "epoch": 3.9954633830200907, + "grad_norm": 0.06467054784297943, + "learning_rate": 2.5203654529906837e-07, + "loss": 0.2106, + "step": 49320 + }, + { + "epoch": 3.995544394037589, + "grad_norm": 0.07908143103122711, + "learning_rate": 2.4753589270444213e-07, + "loss": 0.2395, + "step": 49321 + }, + { + "epoch": 3.9956254050550877, + "grad_norm": 0.06821177899837494, + "learning_rate": 2.4303524010981594e-07, + "loss": 0.2398, + "step": 49322 + }, + { + "epoch": 3.995706416072586, + "grad_norm": 0.07683097571134567, + "learning_rate": 2.385345875151897e-07, + "loss": 0.2641, + "step": 49323 + }, + { + "epoch": 3.995787427090084, + "grad_norm": 0.0672578513622284, + "learning_rate": 2.340339349205635e-07, + "loss": 0.201, + "step": 49324 + }, + { + "epoch": 3.995868438107583, + "grad_norm": 0.10541965812444687, + "learning_rate": 2.2953328232593728e-07, + "loss": 0.1953, + "step": 49325 + }, + { + "epoch": 3.995949449125081, + "grad_norm": 0.07959267497062683, + "learning_rate": 2.2503262973131105e-07, + "loss": 0.2044, + "step": 49326 + }, + { + "epoch": 3.9960304601425793, + "grad_norm": 0.07636445015668869, + "learning_rate": 2.2053197713668483e-07, + "loss": 0.2205, + "step": 49327 + }, + { + "epoch": 3.996111471160078, + "grad_norm": 0.07331163436174393, + "learning_rate": 2.160313245420586e-07, + "loss": 0.2267, + "step": 49328 + }, + { + "epoch": 3.9961924821775763, + "grad_norm": 0.06154733523726463, + "learning_rate": 2.1153067194743239e-07, + "loss": 0.2145, + "step": 49329 + }, + { + "epoch": 3.9962734931950745, + "grad_norm": 0.07518443465232849, + "learning_rate": 2.0703001935280615e-07, + "loss": 0.2381, + "step": 49330 + }, + { + "epoch": 3.9963545042125728, + "grad_norm": 0.06782988458871841, + "learning_rate": 2.0252936675817994e-07, + "loss": 0.2327, + "step": 49331 + }, + { + "epoch": 3.9964355152300715, + "grad_norm": 0.08090611547231674, + "learning_rate": 1.9802871416355375e-07, + "loss": 0.2294, + "step": 49332 + }, + { + "epoch": 3.9965165262475697, + "grad_norm": 0.05896861106157303, + "learning_rate": 1.935280615689275e-07, + "loss": 0.2398, + "step": 49333 + }, + { + "epoch": 3.996597537265068, + "grad_norm": 0.06596443057060242, + "learning_rate": 1.8902740897430128e-07, + "loss": 0.2233, + "step": 49334 + }, + { + "epoch": 3.996678548282566, + "grad_norm": 0.0906447172164917, + "learning_rate": 1.8452675637967506e-07, + "loss": 0.25, + "step": 49335 + }, + { + "epoch": 3.996759559300065, + "grad_norm": 0.07093532383441925, + "learning_rate": 1.8002610378504885e-07, + "loss": 0.2599, + "step": 49336 + }, + { + "epoch": 3.996840570317563, + "grad_norm": 0.07621223479509354, + "learning_rate": 1.7552545119042261e-07, + "loss": 0.2288, + "step": 49337 + }, + { + "epoch": 3.9969215813350614, + "grad_norm": 0.06504885852336884, + "learning_rate": 1.710247985957964e-07, + "loss": 0.1997, + "step": 49338 + }, + { + "epoch": 3.99700259235256, + "grad_norm": 0.07198863476514816, + "learning_rate": 1.665241460011702e-07, + "loss": 0.2141, + "step": 49339 + }, + { + "epoch": 3.9970836033700583, + "grad_norm": 0.06278213858604431, + "learning_rate": 1.6202349340654395e-07, + "loss": 0.219, + "step": 49340 + }, + { + "epoch": 3.9971646143875565, + "grad_norm": 0.06734520196914673, + "learning_rate": 1.5752284081191774e-07, + "loss": 0.2033, + "step": 49341 + }, + { + "epoch": 3.9972456254050552, + "grad_norm": 0.05727878585457802, + "learning_rate": 1.530221882172915e-07, + "loss": 0.2046, + "step": 49342 + }, + { + "epoch": 3.9973266364225535, + "grad_norm": 0.08659632503986359, + "learning_rate": 1.485215356226653e-07, + "loss": 0.2383, + "step": 49343 + }, + { + "epoch": 3.9974076474400517, + "grad_norm": 0.06606918573379517, + "learning_rate": 1.4402088302803906e-07, + "loss": 0.1683, + "step": 49344 + }, + { + "epoch": 3.9974886584575504, + "grad_norm": 0.07601425051689148, + "learning_rate": 1.3952023043341287e-07, + "loss": 0.2367, + "step": 49345 + }, + { + "epoch": 3.9975696694750487, + "grad_norm": 0.07500436156988144, + "learning_rate": 1.3501957783878663e-07, + "loss": 0.2605, + "step": 49346 + }, + { + "epoch": 3.997650680492547, + "grad_norm": 0.07162158936262131, + "learning_rate": 1.3051892524416042e-07, + "loss": 0.2128, + "step": 49347 + }, + { + "epoch": 3.9977316915100456, + "grad_norm": 0.07516414672136307, + "learning_rate": 1.2601827264953418e-07, + "loss": 0.1984, + "step": 49348 + }, + { + "epoch": 3.997812702527544, + "grad_norm": 0.07357397675514221, + "learning_rate": 1.2151762005490797e-07, + "loss": 0.2305, + "step": 49349 + }, + { + "epoch": 3.997893713545042, + "grad_norm": 0.07578955590724945, + "learning_rate": 1.1701696746028175e-07, + "loss": 0.2729, + "step": 49350 + }, + { + "epoch": 3.9979747245625408, + "grad_norm": 0.08693145960569382, + "learning_rate": 1.1251631486565552e-07, + "loss": 0.2081, + "step": 49351 + }, + { + "epoch": 3.998055735580039, + "grad_norm": 0.06208210065960884, + "learning_rate": 1.080156622710293e-07, + "loss": 0.1848, + "step": 49352 + }, + { + "epoch": 3.9981367465975373, + "grad_norm": 0.06679573655128479, + "learning_rate": 1.0351500967640307e-07, + "loss": 0.2396, + "step": 49353 + }, + { + "epoch": 3.9982177576150355, + "grad_norm": 0.07043427228927612, + "learning_rate": 9.901435708177688e-08, + "loss": 0.2277, + "step": 49354 + }, + { + "epoch": 3.9982987686325338, + "grad_norm": 0.06167418509721756, + "learning_rate": 9.451370448715064e-08, + "loss": 0.2084, + "step": 49355 + }, + { + "epoch": 3.9983797796500324, + "grad_norm": 0.058673106133937836, + "learning_rate": 9.001305189252443e-08, + "loss": 0.2388, + "step": 49356 + }, + { + "epoch": 3.9984607906675307, + "grad_norm": 0.0721500888466835, + "learning_rate": 8.55123992978982e-08, + "loss": 0.2257, + "step": 49357 + }, + { + "epoch": 3.998541801685029, + "grad_norm": 0.07805679738521576, + "learning_rate": 8.101174670327198e-08, + "loss": 0.285, + "step": 49358 + }, + { + "epoch": 3.9986228127025276, + "grad_norm": 0.0756671130657196, + "learning_rate": 7.651109410864575e-08, + "loss": 0.2225, + "step": 49359 + }, + { + "epoch": 3.998703823720026, + "grad_norm": 0.0644574761390686, + "learning_rate": 7.201044151401953e-08, + "loss": 0.2034, + "step": 49360 + }, + { + "epoch": 3.998784834737524, + "grad_norm": 0.09972898662090302, + "learning_rate": 6.750978891939332e-08, + "loss": 0.2121, + "step": 49361 + }, + { + "epoch": 3.998865845755023, + "grad_norm": 0.07944905012845993, + "learning_rate": 6.300913632476709e-08, + "loss": 0.2141, + "step": 49362 + }, + { + "epoch": 3.998946856772521, + "grad_norm": 0.07724474370479584, + "learning_rate": 5.8508483730140874e-08, + "loss": 0.2399, + "step": 49363 + }, + { + "epoch": 3.9990278677900193, + "grad_norm": 0.06207426264882088, + "learning_rate": 5.400783113551465e-08, + "loss": 0.2202, + "step": 49364 + }, + { + "epoch": 3.999108878807518, + "grad_norm": 0.07973155379295349, + "learning_rate": 4.950717854088844e-08, + "loss": 0.2378, + "step": 49365 + }, + { + "epoch": 3.999189889825016, + "grad_norm": 0.07270248979330063, + "learning_rate": 4.500652594626221e-08, + "loss": 0.2186, + "step": 49366 + }, + { + "epoch": 3.9992709008425145, + "grad_norm": 0.06034340709447861, + "learning_rate": 4.050587335163599e-08, + "loss": 0.1782, + "step": 49367 + }, + { + "epoch": 3.999351911860013, + "grad_norm": 0.08102718740701675, + "learning_rate": 3.6005220757009764e-08, + "loss": 0.2278, + "step": 49368 + }, + { + "epoch": 3.9994329228775114, + "grad_norm": 0.060907378792762756, + "learning_rate": 3.1504568162383546e-08, + "loss": 0.2229, + "step": 49369 + }, + { + "epoch": 3.9995139338950096, + "grad_norm": 0.07012410461902618, + "learning_rate": 2.7003915567757325e-08, + "loss": 0.2241, + "step": 49370 + }, + { + "epoch": 3.9995949449125083, + "grad_norm": 0.0700206607580185, + "learning_rate": 2.2503262973131107e-08, + "loss": 0.2083, + "step": 49371 + }, + { + "epoch": 3.9996759559300066, + "grad_norm": 0.08111918717622757, + "learning_rate": 1.8002610378504882e-08, + "loss": 0.2562, + "step": 49372 + }, + { + "epoch": 3.999756966947505, + "grad_norm": 0.07391811162233353, + "learning_rate": 1.3501957783878662e-08, + "loss": 0.1937, + "step": 49373 + }, + { + "epoch": 3.9998379779650035, + "grad_norm": 0.05770977959036827, + "learning_rate": 9.001305189252441e-09, + "loss": 0.191, + "step": 49374 + }, + { + "epoch": 3.9999189889825018, + "grad_norm": 0.05946144461631775, + "learning_rate": 4.5006525946262205e-09, + "loss": 0.1792, + "step": 49375 + }, + { + "epoch": 4.0, + "grad_norm": 0.07257899641990662, + "learning_rate": 0.0, + "loss": 0.2477, + "step": 49376 } ], "logging_steps": 1, @@ -318521,12 +345653,12 @@ "should_evaluate": false, "should_log": false, "should_save": true, - "should_training_stop": false + "should_training_stop": true }, "attributes": {} } }, - "total_flos": 2.985306826203359e+19, + "total_flos": 3.2393504748092785e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null