{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.08410959480202704, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 8.410959480202704e-05, "grad_norm": 0.27725711464881897, "learning_rate": 0.00019999950652018584, "loss": 0.8848, "step": 1 }, { "epoch": 0.00016821918960405408, "grad_norm": 0.2778919041156769, "learning_rate": 0.0001999980260856137, "loss": 0.5957, "step": 2 }, { "epoch": 0.0002523287844060811, "grad_norm": 0.5110663771629333, "learning_rate": 0.000199995558710895, "loss": 1.2037, "step": 3 }, { "epoch": 0.00033643837920810816, "grad_norm": 0.4022953510284424, "learning_rate": 0.00019999210442038162, "loss": 1.1604, "step": 4 }, { "epoch": 0.0004205479740101352, "grad_norm": 0.5099120736122131, "learning_rate": 0.00019998766324816607, "loss": 1.1557, "step": 5 }, { "epoch": 0.0005046575688121622, "grad_norm": 0.4215346872806549, "learning_rate": 0.0001999822352380809, "loss": 1.0749, "step": 6 }, { "epoch": 0.0005887671636141893, "grad_norm": 0.5019777417182922, "learning_rate": 0.00019997582044369843, "loss": 0.8256, "step": 7 }, { "epoch": 0.0006728767584162163, "grad_norm": 0.39011430740356445, "learning_rate": 0.00019996841892833, "loss": 0.7847, "step": 8 }, { "epoch": 0.0007569863532182434, "grad_norm": 0.49417048692703247, "learning_rate": 0.00019996003076502565, "loss": 1.0241, "step": 9 }, { "epoch": 0.0008410959480202704, "grad_norm": 0.5206520557403564, "learning_rate": 0.00019995065603657316, "loss": 0.8424, "step": 10 }, { "epoch": 0.0009252055428222974, "grad_norm": 0.4732626974582672, "learning_rate": 0.0001999402948354973, "loss": 0.7602, "step": 11 }, { "epoch": 0.0010093151376243245, "grad_norm": 0.7119062542915344, "learning_rate": 0.00019992894726405893, "loss": 0.9964, "step": 12 }, { "epoch": 0.0010934247324263515, "grad_norm": 0.6489611864089966, "learning_rate": 0.000199916613434254, "loss": 0.8901, "step": 13 }, { "epoch": 0.0011775343272283786, "grad_norm": 0.6794676184654236, "learning_rate": 0.0001999032934678125, "loss": 0.8788, "step": 14 }, { "epoch": 0.0012616439220304056, "grad_norm": 0.5384562015533447, "learning_rate": 0.00019988898749619702, "loss": 0.8936, "step": 15 }, { "epoch": 0.0013457535168324326, "grad_norm": 0.6276252865791321, "learning_rate": 0.00019987369566060176, "loss": 0.7279, "step": 16 }, { "epoch": 0.0014298631116344597, "grad_norm": 0.8388938903808594, "learning_rate": 0.00019985741811195097, "loss": 0.9929, "step": 17 }, { "epoch": 0.0015139727064364867, "grad_norm": 0.5140799283981323, "learning_rate": 0.00019984015501089752, "loss": 0.6327, "step": 18 }, { "epoch": 0.0015980823012385138, "grad_norm": 0.6707816123962402, "learning_rate": 0.0001998219065278212, "loss": 0.8234, "step": 19 }, { "epoch": 0.0016821918960405408, "grad_norm": 0.721817672252655, "learning_rate": 0.00019980267284282717, "loss": 0.8038, "step": 20 }, { "epoch": 0.0017663014908425678, "grad_norm": 0.6394887566566467, "learning_rate": 0.00019978245414574417, "loss": 0.7706, "step": 21 }, { "epoch": 0.0018504110856445949, "grad_norm": 0.523406982421875, "learning_rate": 0.00019976125063612252, "loss": 0.6581, "step": 22 }, { "epoch": 0.001934520680446622, "grad_norm": 0.5513215065002441, "learning_rate": 0.00019973906252323238, "loss": 0.9306, "step": 23 }, { "epoch": 0.002018630275248649, "grad_norm": 0.5572922825813293, "learning_rate": 0.0001997158900260614, "loss": 0.7959, "step": 24 }, { "epoch": 0.002102739870050676, "grad_norm": 0.567721426486969, "learning_rate": 0.0001996917333733128, "loss": 0.684, "step": 25 }, { "epoch": 0.002186849464852703, "grad_norm": 0.8525415062904358, "learning_rate": 0.00019966659280340297, "loss": 0.6086, "step": 26 }, { "epoch": 0.0022709590596547303, "grad_norm": 0.6331901550292969, "learning_rate": 0.00019964046856445924, "loss": 0.8775, "step": 27 }, { "epoch": 0.002355068654456757, "grad_norm": 0.5875001549720764, "learning_rate": 0.00019961336091431727, "loss": 0.7884, "step": 28 }, { "epoch": 0.0024391782492587844, "grad_norm": 0.4703105092048645, "learning_rate": 0.00019958527012051857, "loss": 0.8803, "step": 29 }, { "epoch": 0.002523287844060811, "grad_norm": 0.46804890036582947, "learning_rate": 0.00019955619646030802, "loss": 0.7249, "step": 30 }, { "epoch": 0.0026073974388628384, "grad_norm": 0.7693579792976379, "learning_rate": 0.00019952614022063084, "loss": 1.0132, "step": 31 }, { "epoch": 0.0026915070336648653, "grad_norm": 0.7424771785736084, "learning_rate": 0.00019949510169813003, "loss": 0.9041, "step": 32 }, { "epoch": 0.0027756166284668925, "grad_norm": 0.6431573033332825, "learning_rate": 0.00019946308119914323, "loss": 0.8694, "step": 33 }, { "epoch": 0.0028597262232689193, "grad_norm": 0.5848168730735779, "learning_rate": 0.0001994300790396999, "loss": 0.7372, "step": 34 }, { "epoch": 0.0029438358180709466, "grad_norm": 0.5710355043411255, "learning_rate": 0.000199396095545518, "loss": 0.7431, "step": 35 }, { "epoch": 0.0030279454128729734, "grad_norm": 0.531509518623352, "learning_rate": 0.00019936113105200085, "loss": 0.7768, "step": 36 }, { "epoch": 0.0031120550076750007, "grad_norm": 0.4804302752017975, "learning_rate": 0.00019932518590423394, "loss": 0.7301, "step": 37 }, { "epoch": 0.0031961646024770275, "grad_norm": 0.5045036673545837, "learning_rate": 0.00019928826045698136, "loss": 0.6649, "step": 38 }, { "epoch": 0.0032802741972790548, "grad_norm": 0.41377341747283936, "learning_rate": 0.0001992503550746824, "loss": 0.8951, "step": 39 }, { "epoch": 0.0033643837920810816, "grad_norm": 0.534492552280426, "learning_rate": 0.0001992114701314478, "loss": 0.6731, "step": 40 }, { "epoch": 0.003448493386883109, "grad_norm": 0.36469462513923645, "learning_rate": 0.0001991716060110563, "loss": 0.6212, "step": 41 }, { "epoch": 0.0035326029816851357, "grad_norm": 0.44110405445098877, "learning_rate": 0.00019913076310695068, "loss": 0.8254, "step": 42 }, { "epoch": 0.003616712576487163, "grad_norm": 0.4397999942302704, "learning_rate": 0.00019908894182223388, "loss": 0.7209, "step": 43 }, { "epoch": 0.0037008221712891897, "grad_norm": 0.5739462971687317, "learning_rate": 0.00019904614256966512, "loss": 0.8095, "step": 44 }, { "epoch": 0.003784931766091217, "grad_norm": 0.5561273097991943, "learning_rate": 0.00019900236577165576, "loss": 0.7944, "step": 45 }, { "epoch": 0.003869041360893244, "grad_norm": 0.5051570534706116, "learning_rate": 0.0001989576118602651, "loss": 0.8223, "step": 46 }, { "epoch": 0.003953150955695271, "grad_norm": 0.4156644642353058, "learning_rate": 0.00019891188127719618, "loss": 0.7488, "step": 47 }, { "epoch": 0.004037260550497298, "grad_norm": 0.520811140537262, "learning_rate": 0.0001988651744737914, "loss": 0.8407, "step": 48 }, { "epoch": 0.004121370145299325, "grad_norm": 0.7556874752044678, "learning_rate": 0.00019881749191102808, "loss": 0.7462, "step": 49 }, { "epoch": 0.004205479740101352, "grad_norm": 0.692021369934082, "learning_rate": 0.00019876883405951377, "loss": 0.9717, "step": 50 }, { "epoch": 0.004289589334903379, "grad_norm": 0.7206075191497803, "learning_rate": 0.00019871920139948192, "loss": 0.7717, "step": 51 }, { "epoch": 0.004373698929705406, "grad_norm": 0.5266078114509583, "learning_rate": 0.0001986685944207868, "loss": 0.843, "step": 52 }, { "epoch": 0.004457808524507433, "grad_norm": 0.4582868218421936, "learning_rate": 0.0001986170136228989, "loss": 0.709, "step": 53 }, { "epoch": 0.004541918119309461, "grad_norm": 0.5274102687835693, "learning_rate": 0.00019856445951489982, "loss": 0.6895, "step": 54 }, { "epoch": 0.004626027714111487, "grad_norm": 0.5545452237129211, "learning_rate": 0.0001985109326154774, "loss": 0.7461, "step": 55 }, { "epoch": 0.004710137308913514, "grad_norm": 0.6029804944992065, "learning_rate": 0.00019845643345292054, "loss": 0.7165, "step": 56 }, { "epoch": 0.004794246903715541, "grad_norm": 0.5293462872505188, "learning_rate": 0.00019840096256511398, "loss": 0.6772, "step": 57 }, { "epoch": 0.004878356498517569, "grad_norm": 0.5571961402893066, "learning_rate": 0.00019834452049953297, "loss": 0.5852, "step": 58 }, { "epoch": 0.0049624660933195956, "grad_norm": 0.5451956391334534, "learning_rate": 0.00019828710781323792, "loss": 0.7897, "step": 59 }, { "epoch": 0.005046575688121622, "grad_norm": 0.6682764887809753, "learning_rate": 0.0001982287250728689, "loss": 0.8288, "step": 60 }, { "epoch": 0.005130685282923649, "grad_norm": 0.6017076373100281, "learning_rate": 0.0001981693728546399, "loss": 0.8949, "step": 61 }, { "epoch": 0.005214794877725677, "grad_norm": 0.5544684529304504, "learning_rate": 0.0001981090517443334, "loss": 0.6256, "step": 62 }, { "epoch": 0.005298904472527704, "grad_norm": 0.5648895502090454, "learning_rate": 0.00019804776233729444, "loss": 0.6528, "step": 63 }, { "epoch": 0.0053830140673297305, "grad_norm": 0.4875372350215912, "learning_rate": 0.0001979855052384247, "loss": 0.7416, "step": 64 }, { "epoch": 0.005467123662131757, "grad_norm": 0.649088978767395, "learning_rate": 0.00019792228106217658, "loss": 0.7148, "step": 65 }, { "epoch": 0.005551233256933785, "grad_norm": 0.39869123697280884, "learning_rate": 0.00019785809043254722, "loss": 0.5991, "step": 66 }, { "epoch": 0.005635342851735812, "grad_norm": 0.6871259212493896, "learning_rate": 0.0001977929339830722, "loss": 0.6648, "step": 67 }, { "epoch": 0.005719452446537839, "grad_norm": 0.5564553737640381, "learning_rate": 0.00019772681235681936, "loss": 0.842, "step": 68 }, { "epoch": 0.0058035620413398655, "grad_norm": 0.6598843336105347, "learning_rate": 0.00019765972620638248, "loss": 1.0846, "step": 69 }, { "epoch": 0.005887671636141893, "grad_norm": 0.5316952466964722, "learning_rate": 0.00019759167619387476, "loss": 0.8352, "step": 70 }, { "epoch": 0.00597178123094392, "grad_norm": 0.923975944519043, "learning_rate": 0.00019752266299092236, "loss": 0.761, "step": 71 }, { "epoch": 0.006055890825745947, "grad_norm": 0.6253910064697266, "learning_rate": 0.00019745268727865774, "loss": 0.8125, "step": 72 }, { "epoch": 0.006140000420547974, "grad_norm": 0.5049079060554504, "learning_rate": 0.0001973817497477129, "loss": 0.7948, "step": 73 }, { "epoch": 0.006224110015350001, "grad_norm": 0.5132893919944763, "learning_rate": 0.00019730985109821266, "loss": 0.7674, "step": 74 }, { "epoch": 0.006308219610152028, "grad_norm": 0.5373795032501221, "learning_rate": 0.00019723699203976766, "loss": 0.9345, "step": 75 }, { "epoch": 0.006392329204954055, "grad_norm": 0.658785343170166, "learning_rate": 0.0001971631732914674, "loss": 0.7366, "step": 76 }, { "epoch": 0.006476438799756082, "grad_norm": 0.42672979831695557, "learning_rate": 0.0001970883955818731, "loss": 0.8569, "step": 77 }, { "epoch": 0.0065605483945581095, "grad_norm": 0.6121531128883362, "learning_rate": 0.0001970126596490106, "loss": 0.7555, "step": 78 }, { "epoch": 0.006644657989360136, "grad_norm": 0.5736799836158752, "learning_rate": 0.00019693596624036292, "loss": 0.7651, "step": 79 }, { "epoch": 0.006728767584162163, "grad_norm": 0.5765815377235413, "learning_rate": 0.0001968583161128631, "loss": 0.7362, "step": 80 }, { "epoch": 0.00681287717896419, "grad_norm": 0.5843616127967834, "learning_rate": 0.00019677971003288655, "loss": 0.8952, "step": 81 }, { "epoch": 0.006896986773766218, "grad_norm": 0.5047760009765625, "learning_rate": 0.00019670014877624353, "loss": 0.6147, "step": 82 }, { "epoch": 0.0069810963685682445, "grad_norm": 0.5199994444847107, "learning_rate": 0.00019661963312817148, "loss": 0.8876, "step": 83 }, { "epoch": 0.007065205963370271, "grad_norm": 0.49524548649787903, "learning_rate": 0.0001965381638833274, "loss": 0.7281, "step": 84 }, { "epoch": 0.007149315558172298, "grad_norm": 0.4999952018260956, "learning_rate": 0.00019645574184577982, "loss": 0.7967, "step": 85 }, { "epoch": 0.007233425152974326, "grad_norm": 0.5383725762367249, "learning_rate": 0.000196372367829001, "loss": 0.7426, "step": 86 }, { "epoch": 0.007317534747776353, "grad_norm": 0.4904189705848694, "learning_rate": 0.00019628804265585877, "loss": 0.8622, "step": 87 }, { "epoch": 0.0074016443425783795, "grad_norm": 0.5760535001754761, "learning_rate": 0.0001962027671586086, "loss": 0.7718, "step": 88 }, { "epoch": 0.007485753937380406, "grad_norm": 0.5867496728897095, "learning_rate": 0.0001961165421788852, "loss": 0.7467, "step": 89 }, { "epoch": 0.007569863532182434, "grad_norm": 0.5310566425323486, "learning_rate": 0.0001960293685676943, "loss": 0.7393, "step": 90 }, { "epoch": 0.007653973126984461, "grad_norm": 0.4217927157878876, "learning_rate": 0.0001959412471854043, "loss": 0.696, "step": 91 }, { "epoch": 0.007738082721786488, "grad_norm": 0.6704983711242676, "learning_rate": 0.0001958521789017376, "loss": 0.8934, "step": 92 }, { "epoch": 0.007822192316588514, "grad_norm": 0.543559193611145, "learning_rate": 0.00019576216459576222, "loss": 0.547, "step": 93 }, { "epoch": 0.007906301911390542, "grad_norm": 0.47226864099502563, "learning_rate": 0.00019567120515588308, "loss": 0.6723, "step": 94 }, { "epoch": 0.007990411506192568, "grad_norm": 0.41364991664886475, "learning_rate": 0.00019557930147983302, "loss": 0.6751, "step": 95 }, { "epoch": 0.008074521100994596, "grad_norm": 0.595898449420929, "learning_rate": 0.00019548645447466431, "loss": 0.8247, "step": 96 }, { "epoch": 0.008158630695796623, "grad_norm": 0.6959226131439209, "learning_rate": 0.00019539266505673938, "loss": 0.6472, "step": 97 }, { "epoch": 0.00824274029059865, "grad_norm": 0.4702865779399872, "learning_rate": 0.00019529793415172192, "loss": 0.6569, "step": 98 }, { "epoch": 0.008326849885400677, "grad_norm": 0.5658578872680664, "learning_rate": 0.00019520226269456768, "loss": 0.8219, "step": 99 }, { "epoch": 0.008410959480202705, "grad_norm": 0.5993213057518005, "learning_rate": 0.00019510565162951537, "loss": 0.7014, "step": 100 }, { "epoch": 0.00849506907500473, "grad_norm": 0.5463736057281494, "learning_rate": 0.00019500810191007718, "loss": 0.657, "step": 101 }, { "epoch": 0.008579178669806758, "grad_norm": 0.5861630439758301, "learning_rate": 0.00019490961449902946, "loss": 0.7181, "step": 102 }, { "epoch": 0.008663288264608784, "grad_norm": 0.5364252328872681, "learning_rate": 0.0001948101903684032, "loss": 0.7727, "step": 103 }, { "epoch": 0.008747397859410812, "grad_norm": 0.6941062808036804, "learning_rate": 0.00019470983049947444, "loss": 0.7413, "step": 104 }, { "epoch": 0.00883150745421284, "grad_norm": 0.7105836272239685, "learning_rate": 0.00019460853588275454, "loss": 0.8972, "step": 105 }, { "epoch": 0.008915617049014866, "grad_norm": 0.4372139871120453, "learning_rate": 0.00019450630751798048, "loss": 0.5836, "step": 106 }, { "epoch": 0.008999726643816893, "grad_norm": 0.6489446759223938, "learning_rate": 0.000194403146414105, "loss": 0.877, "step": 107 }, { "epoch": 0.009083836238618921, "grad_norm": 0.5020645260810852, "learning_rate": 0.00019429905358928646, "loss": 0.6035, "step": 108 }, { "epoch": 0.009167945833420947, "grad_norm": 0.7856727242469788, "learning_rate": 0.00019419403007087907, "loss": 0.715, "step": 109 }, { "epoch": 0.009252055428222975, "grad_norm": 0.6128354072570801, "learning_rate": 0.00019408807689542257, "loss": 0.8376, "step": 110 }, { "epoch": 0.009336165023025, "grad_norm": 0.5398645401000977, "learning_rate": 0.00019398119510863197, "loss": 0.7696, "step": 111 }, { "epoch": 0.009420274617827028, "grad_norm": 0.575010359287262, "learning_rate": 0.00019387338576538744, "loss": 0.6847, "step": 112 }, { "epoch": 0.009504384212629056, "grad_norm": 0.5950291156768799, "learning_rate": 0.00019376464992972356, "loss": 0.8757, "step": 113 }, { "epoch": 0.009588493807431082, "grad_norm": 0.5944623947143555, "learning_rate": 0.00019365498867481923, "loss": 0.7609, "step": 114 }, { "epoch": 0.00967260340223311, "grad_norm": 0.4505338668823242, "learning_rate": 0.00019354440308298675, "loss": 0.8388, "step": 115 }, { "epoch": 0.009756712997035137, "grad_norm": 0.4795793294906616, "learning_rate": 0.00019343289424566122, "loss": 0.8994, "step": 116 }, { "epoch": 0.009840822591837163, "grad_norm": 0.4607990086078644, "learning_rate": 0.00019332046326338986, "loss": 0.6956, "step": 117 }, { "epoch": 0.009924932186639191, "grad_norm": 0.5157178640365601, "learning_rate": 0.0001932071112458211, "loss": 0.5357, "step": 118 }, { "epoch": 0.010009041781441217, "grad_norm": 0.5431911945343018, "learning_rate": 0.00019309283931169356, "loss": 0.7704, "step": 119 }, { "epoch": 0.010093151376243245, "grad_norm": 0.5151223540306091, "learning_rate": 0.00019297764858882514, "loss": 0.6825, "step": 120 }, { "epoch": 0.010177260971045272, "grad_norm": 0.5834304094314575, "learning_rate": 0.00019286154021410173, "loss": 0.5308, "step": 121 }, { "epoch": 0.010261370565847298, "grad_norm": 0.6810638904571533, "learning_rate": 0.00019274451533346615, "loss": 0.7112, "step": 122 }, { "epoch": 0.010345480160649326, "grad_norm": 0.48631688952445984, "learning_rate": 0.00019262657510190666, "loss": 0.7143, "step": 123 }, { "epoch": 0.010429589755451354, "grad_norm": 0.6056327819824219, "learning_rate": 0.0001925077206834458, "loss": 0.6615, "step": 124 }, { "epoch": 0.01051369935025338, "grad_norm": 0.4841483235359192, "learning_rate": 0.0001923879532511287, "loss": 0.7337, "step": 125 }, { "epoch": 0.010597808945055407, "grad_norm": 0.5701360702514648, "learning_rate": 0.0001922672739870115, "loss": 0.5952, "step": 126 }, { "epoch": 0.010681918539857433, "grad_norm": 0.6007951498031616, "learning_rate": 0.00019214568408214985, "loss": 0.7754, "step": 127 }, { "epoch": 0.010766028134659461, "grad_norm": 0.6508734822273254, "learning_rate": 0.00019202318473658705, "loss": 0.9641, "step": 128 }, { "epoch": 0.010850137729461489, "grad_norm": 0.608518123626709, "learning_rate": 0.00019189977715934213, "loss": 0.8748, "step": 129 }, { "epoch": 0.010934247324263515, "grad_norm": 0.43734872341156006, "learning_rate": 0.00019177546256839812, "loss": 0.742, "step": 130 }, { "epoch": 0.011018356919065542, "grad_norm": 0.5520758628845215, "learning_rate": 0.0001916502421906898, "loss": 0.5509, "step": 131 }, { "epoch": 0.01110246651386757, "grad_norm": 0.3889513909816742, "learning_rate": 0.00019152411726209176, "loss": 0.4892, "step": 132 }, { "epoch": 0.011186576108669596, "grad_norm": 0.5818157196044922, "learning_rate": 0.00019139708902740613, "loss": 0.6986, "step": 133 }, { "epoch": 0.011270685703471624, "grad_norm": 0.5319333076477051, "learning_rate": 0.0001912691587403503, "loss": 0.6992, "step": 134 }, { "epoch": 0.011354795298273651, "grad_norm": 0.54970782995224, "learning_rate": 0.00019114032766354453, "loss": 0.7275, "step": 135 }, { "epoch": 0.011438904893075677, "grad_norm": 0.4689725637435913, "learning_rate": 0.00019101059706849957, "loss": 0.7424, "step": 136 }, { "epoch": 0.011523014487877705, "grad_norm": 0.625424861907959, "learning_rate": 0.00019087996823560402, "loss": 0.6456, "step": 137 }, { "epoch": 0.011607124082679731, "grad_norm": 0.4590742290019989, "learning_rate": 0.0001907484424541117, "loss": 0.792, "step": 138 }, { "epoch": 0.011691233677481759, "grad_norm": 0.48178377747535706, "learning_rate": 0.00019061602102212898, "loss": 0.8383, "step": 139 }, { "epoch": 0.011775343272283786, "grad_norm": 0.49192577600479126, "learning_rate": 0.00019048270524660196, "loss": 0.7532, "step": 140 }, { "epoch": 0.011859452867085812, "grad_norm": 0.5243375301361084, "learning_rate": 0.0001903484964433035, "loss": 0.8344, "step": 141 }, { "epoch": 0.01194356246188784, "grad_norm": 0.4813317060470581, "learning_rate": 0.00019021339593682028, "loss": 0.9669, "step": 142 }, { "epoch": 0.012027672056689868, "grad_norm": 0.49398431181907654, "learning_rate": 0.00019007740506053983, "loss": 0.6194, "step": 143 }, { "epoch": 0.012111781651491894, "grad_norm": 0.5070907473564148, "learning_rate": 0.0001899405251566371, "loss": 0.4834, "step": 144 }, { "epoch": 0.012195891246293921, "grad_norm": 0.5289168953895569, "learning_rate": 0.00018980275757606157, "loss": 0.6171, "step": 145 }, { "epoch": 0.012280000841095947, "grad_norm": 0.6366864442825317, "learning_rate": 0.00018966410367852362, "loss": 0.8194, "step": 146 }, { "epoch": 0.012364110435897975, "grad_norm": 0.7337288856506348, "learning_rate": 0.00018952456483248119, "loss": 0.8086, "step": 147 }, { "epoch": 0.012448220030700003, "grad_norm": 0.683866560459137, "learning_rate": 0.0001893841424151264, "loss": 0.609, "step": 148 }, { "epoch": 0.012532329625502029, "grad_norm": 0.5067821145057678, "learning_rate": 0.0001892428378123718, "loss": 0.8969, "step": 149 }, { "epoch": 0.012616439220304056, "grad_norm": 0.48310670256614685, "learning_rate": 0.0001891006524188368, "loss": 0.6375, "step": 150 }, { "epoch": 0.012700548815106084, "grad_norm": 0.5184181928634644, "learning_rate": 0.00018895758763783383, "loss": 0.7395, "step": 151 }, { "epoch": 0.01278465840990811, "grad_norm": 0.4921519458293915, "learning_rate": 0.00018881364488135448, "loss": 0.8635, "step": 152 }, { "epoch": 0.012868768004710138, "grad_norm": 0.42886897921562195, "learning_rate": 0.00018866882557005567, "loss": 0.6536, "step": 153 }, { "epoch": 0.012952877599512164, "grad_norm": 0.5397732257843018, "learning_rate": 0.00018852313113324552, "loss": 0.698, "step": 154 }, { "epoch": 0.013036987194314191, "grad_norm": 0.6344212889671326, "learning_rate": 0.00018837656300886937, "loss": 0.7285, "step": 155 }, { "epoch": 0.013121096789116219, "grad_norm": 0.5104556679725647, "learning_rate": 0.00018822912264349534, "loss": 0.5943, "step": 156 }, { "epoch": 0.013205206383918245, "grad_norm": 0.5612802505493164, "learning_rate": 0.00018808081149230036, "loss": 0.7107, "step": 157 }, { "epoch": 0.013289315978720273, "grad_norm": 0.47731515765190125, "learning_rate": 0.00018793163101905563, "loss": 0.7419, "step": 158 }, { "epoch": 0.0133734255735223, "grad_norm": 0.47317004203796387, "learning_rate": 0.00018778158269611218, "loss": 0.6444, "step": 159 }, { "epoch": 0.013457535168324326, "grad_norm": 0.5630394816398621, "learning_rate": 0.00018763066800438636, "loss": 0.66, "step": 160 }, { "epoch": 0.013541644763126354, "grad_norm": 0.4079178273677826, "learning_rate": 0.0001874788884333453, "loss": 0.6648, "step": 161 }, { "epoch": 0.01362575435792838, "grad_norm": 0.4921400547027588, "learning_rate": 0.00018732624548099204, "loss": 0.6975, "step": 162 }, { "epoch": 0.013709863952730408, "grad_norm": 0.5111605525016785, "learning_rate": 0.0001871727406538509, "loss": 0.7385, "step": 163 }, { "epoch": 0.013793973547532435, "grad_norm": 0.6458659172058105, "learning_rate": 0.0001870183754669526, "loss": 0.7494, "step": 164 }, { "epoch": 0.013878083142334461, "grad_norm": 0.5011044144630432, "learning_rate": 0.00018686315144381913, "loss": 0.5897, "step": 165 }, { "epoch": 0.013962192737136489, "grad_norm": 0.524375319480896, "learning_rate": 0.000186707070116449, "loss": 0.721, "step": 166 }, { "epoch": 0.014046302331938517, "grad_norm": 0.6293932795524597, "learning_rate": 0.0001865501330253019, "loss": 0.8962, "step": 167 }, { "epoch": 0.014130411926740543, "grad_norm": 0.6843417882919312, "learning_rate": 0.00018639234171928353, "loss": 0.9065, "step": 168 }, { "epoch": 0.01421452152154257, "grad_norm": 0.5260113477706909, "learning_rate": 0.0001862336977557304, "loss": 0.6079, "step": 169 }, { "epoch": 0.014298631116344596, "grad_norm": 0.4797818064689636, "learning_rate": 0.0001860742027003944, "loss": 0.839, "step": 170 }, { "epoch": 0.014382740711146624, "grad_norm": 0.5858710408210754, "learning_rate": 0.00018591385812742725, "loss": 0.7166, "step": 171 }, { "epoch": 0.014466850305948652, "grad_norm": 0.47610747814178467, "learning_rate": 0.00018575266561936523, "loss": 0.5801, "step": 172 }, { "epoch": 0.014550959900750678, "grad_norm": 0.7322710156440735, "learning_rate": 0.00018559062676711332, "loss": 0.7788, "step": 173 }, { "epoch": 0.014635069495552705, "grad_norm": 0.5088084936141968, "learning_rate": 0.0001854277431699295, "loss": 0.6265, "step": 174 }, { "epoch": 0.014719179090354733, "grad_norm": 0.5596153736114502, "learning_rate": 0.00018526401643540922, "loss": 0.7236, "step": 175 }, { "epoch": 0.014803288685156759, "grad_norm": 0.7297357320785522, "learning_rate": 0.00018509944817946922, "loss": 0.6347, "step": 176 }, { "epoch": 0.014887398279958787, "grad_norm": 0.45782700181007385, "learning_rate": 0.00018493404002633166, "loss": 0.6243, "step": 177 }, { "epoch": 0.014971507874760813, "grad_norm": 0.46937909722328186, "learning_rate": 0.00018476779360850832, "loss": 0.7372, "step": 178 }, { "epoch": 0.01505561746956284, "grad_norm": 0.4801153540611267, "learning_rate": 0.00018460071056678422, "loss": 0.6242, "step": 179 }, { "epoch": 0.015139727064364868, "grad_norm": 0.42098918557167053, "learning_rate": 0.00018443279255020152, "loss": 0.823, "step": 180 }, { "epoch": 0.015223836659166894, "grad_norm": 0.5428348183631897, "learning_rate": 0.00018426404121604323, "loss": 0.9499, "step": 181 }, { "epoch": 0.015307946253968922, "grad_norm": 0.5740225315093994, "learning_rate": 0.00018409445822981693, "loss": 0.8347, "step": 182 }, { "epoch": 0.01539205584877095, "grad_norm": 0.5669785141944885, "learning_rate": 0.00018392404526523817, "loss": 0.7831, "step": 183 }, { "epoch": 0.015476165443572975, "grad_norm": 0.5455140471458435, "learning_rate": 0.0001837528040042142, "loss": 0.6875, "step": 184 }, { "epoch": 0.015560275038375003, "grad_norm": 0.5965580344200134, "learning_rate": 0.00018358073613682706, "loss": 0.8344, "step": 185 }, { "epoch": 0.01564438463317703, "grad_norm": 0.4952075779438019, "learning_rate": 0.00018340784336131713, "loss": 0.4143, "step": 186 }, { "epoch": 0.01572849422797906, "grad_norm": 0.584922730922699, "learning_rate": 0.00018323412738406635, "loss": 0.6031, "step": 187 }, { "epoch": 0.015812603822781084, "grad_norm": 0.5585885047912598, "learning_rate": 0.00018305958991958127, "loss": 0.7149, "step": 188 }, { "epoch": 0.01589671341758311, "grad_norm": 0.6081206202507019, "learning_rate": 0.0001828842326904762, "loss": 0.7388, "step": 189 }, { "epoch": 0.015980823012385136, "grad_norm": 0.5838598608970642, "learning_rate": 0.00018270805742745617, "loss": 0.7166, "step": 190 }, { "epoch": 0.016064932607187166, "grad_norm": 0.5591777563095093, "learning_rate": 0.00018253106586929997, "loss": 0.6833, "step": 191 }, { "epoch": 0.01614904220198919, "grad_norm": 0.49327778816223145, "learning_rate": 0.00018235325976284275, "loss": 0.6518, "step": 192 }, { "epoch": 0.016233151796791218, "grad_norm": 0.5115340948104858, "learning_rate": 0.00018217464086295904, "loss": 0.9741, "step": 193 }, { "epoch": 0.016317261391593247, "grad_norm": 0.44589102268218994, "learning_rate": 0.00018199521093254523, "loss": 0.7523, "step": 194 }, { "epoch": 0.016401370986395273, "grad_norm": 0.4889369606971741, "learning_rate": 0.00018181497174250236, "loss": 0.5771, "step": 195 }, { "epoch": 0.0164854805811973, "grad_norm": 0.5447149276733398, "learning_rate": 0.00018163392507171842, "loss": 0.7554, "step": 196 }, { "epoch": 0.01656959017599933, "grad_norm": 0.4478766918182373, "learning_rate": 0.00018145207270705096, "loss": 0.7293, "step": 197 }, { "epoch": 0.016653699770801354, "grad_norm": 0.4789084494113922, "learning_rate": 0.0001812694164433094, "loss": 0.7598, "step": 198 }, { "epoch": 0.01673780936560338, "grad_norm": 0.5047253966331482, "learning_rate": 0.00018108595808323736, "loss": 0.703, "step": 199 }, { "epoch": 0.01682191896040541, "grad_norm": 0.44823339581489563, "learning_rate": 0.00018090169943749476, "loss": 0.71, "step": 200 }, { "epoch": 0.016906028555207436, "grad_norm": 0.5810935497283936, "learning_rate": 0.00018071664232464002, "loss": 0.7506, "step": 201 }, { "epoch": 0.01699013815000946, "grad_norm": 0.44463852047920227, "learning_rate": 0.0001805307885711122, "loss": 0.7075, "step": 202 }, { "epoch": 0.01707424774481149, "grad_norm": 0.5899863839149475, "learning_rate": 0.00018034414001121278, "loss": 0.7882, "step": 203 }, { "epoch": 0.017158357339613517, "grad_norm": 0.5106207132339478, "learning_rate": 0.00018015669848708767, "loss": 0.8387, "step": 204 }, { "epoch": 0.017242466934415543, "grad_norm": 0.5090427994728088, "learning_rate": 0.00017996846584870908, "loss": 0.8835, "step": 205 }, { "epoch": 0.01732657652921757, "grad_norm": 0.5673806071281433, "learning_rate": 0.0001797794439538571, "loss": 0.6051, "step": 206 }, { "epoch": 0.0174106861240196, "grad_norm": 0.4103122651576996, "learning_rate": 0.0001795896346681016, "loss": 0.6471, "step": 207 }, { "epoch": 0.017494795718821624, "grad_norm": 0.5361846089363098, "learning_rate": 0.00017939903986478355, "loss": 0.6601, "step": 208 }, { "epoch": 0.01757890531362365, "grad_norm": 0.6034552454948425, "learning_rate": 0.00017920766142499672, "loss": 0.7289, "step": 209 }, { "epoch": 0.01766301490842568, "grad_norm": 0.5352544784545898, "learning_rate": 0.00017901550123756906, "loss": 0.6441, "step": 210 }, { "epoch": 0.017747124503227706, "grad_norm": 0.48942509293556213, "learning_rate": 0.00017882256119904403, "loss": 0.7517, "step": 211 }, { "epoch": 0.01783123409802973, "grad_norm": 0.6330734491348267, "learning_rate": 0.00017862884321366188, "loss": 0.8312, "step": 212 }, { "epoch": 0.01791534369283176, "grad_norm": 0.5588996410369873, "learning_rate": 0.000178434349193341, "loss": 0.8497, "step": 213 }, { "epoch": 0.017999453287633787, "grad_norm": 0.4862910807132721, "learning_rate": 0.0001782390810576588, "loss": 0.7367, "step": 214 }, { "epoch": 0.018083562882435813, "grad_norm": 0.45474931597709656, "learning_rate": 0.000178043040733833, "loss": 0.6998, "step": 215 }, { "epoch": 0.018167672477237842, "grad_norm": 0.4923236072063446, "learning_rate": 0.00017784623015670238, "loss": 0.8464, "step": 216 }, { "epoch": 0.018251782072039868, "grad_norm": 0.6574183702468872, "learning_rate": 0.00017764865126870786, "loss": 0.7606, "step": 217 }, { "epoch": 0.018335891666841894, "grad_norm": 0.4556383192539215, "learning_rate": 0.00017745030601987337, "loss": 0.5689, "step": 218 }, { "epoch": 0.018420001261643924, "grad_norm": 0.49504584074020386, "learning_rate": 0.00017725119636778644, "loss": 0.6945, "step": 219 }, { "epoch": 0.01850411085644595, "grad_norm": 0.48922199010849, "learning_rate": 0.00017705132427757895, "loss": 0.6024, "step": 220 }, { "epoch": 0.018588220451247976, "grad_norm": 0.4516255259513855, "learning_rate": 0.00017685069172190766, "loss": 0.6825, "step": 221 }, { "epoch": 0.01867233004605, "grad_norm": 0.5373181104660034, "learning_rate": 0.00017664930068093498, "loss": 0.7165, "step": 222 }, { "epoch": 0.01875643964085203, "grad_norm": 0.45531630516052246, "learning_rate": 0.00017644715314230918, "loss": 0.614, "step": 223 }, { "epoch": 0.018840549235654057, "grad_norm": 0.38286691904067993, "learning_rate": 0.0001762442511011448, "loss": 0.6237, "step": 224 }, { "epoch": 0.018924658830456083, "grad_norm": 0.8253074288368225, "learning_rate": 0.0001760405965600031, "loss": 0.7572, "step": 225 }, { "epoch": 0.019008768425258112, "grad_norm": 0.49788135290145874, "learning_rate": 0.0001758361915288722, "loss": 0.836, "step": 226 }, { "epoch": 0.019092878020060138, "grad_norm": 0.7003273367881775, "learning_rate": 0.0001756310380251472, "loss": 0.6137, "step": 227 }, { "epoch": 0.019176987614862164, "grad_norm": 0.5549869537353516, "learning_rate": 0.00017542513807361037, "loss": 0.7504, "step": 228 }, { "epoch": 0.019261097209664194, "grad_norm": 0.5484214425086975, "learning_rate": 0.00017521849370641114, "loss": 0.8757, "step": 229 }, { "epoch": 0.01934520680446622, "grad_norm": 0.48561275005340576, "learning_rate": 0.00017501110696304596, "loss": 0.6577, "step": 230 }, { "epoch": 0.019429316399268245, "grad_norm": 0.5252694487571716, "learning_rate": 0.00017480297989033825, "loss": 0.7415, "step": 231 }, { "epoch": 0.019513425994070275, "grad_norm": 0.5625510811805725, "learning_rate": 0.00017459411454241822, "loss": 0.7563, "step": 232 }, { "epoch": 0.0195975355888723, "grad_norm": 0.5950868725776672, "learning_rate": 0.00017438451298070252, "loss": 0.5738, "step": 233 }, { "epoch": 0.019681645183674327, "grad_norm": 0.3883379399776459, "learning_rate": 0.00017417417727387394, "loss": 0.6791, "step": 234 }, { "epoch": 0.019765754778476356, "grad_norm": 0.3784724175930023, "learning_rate": 0.000173963109497861, "loss": 0.6673, "step": 235 }, { "epoch": 0.019849864373278382, "grad_norm": 0.5025336146354675, "learning_rate": 0.0001737513117358174, "loss": 0.678, "step": 236 }, { "epoch": 0.019933973968080408, "grad_norm": 0.5078927874565125, "learning_rate": 0.0001735387860781016, "loss": 0.7093, "step": 237 }, { "epoch": 0.020018083562882434, "grad_norm": 0.67873215675354, "learning_rate": 0.00017332553462225602, "loss": 0.7362, "step": 238 }, { "epoch": 0.020102193157684464, "grad_norm": 0.6332508325576782, "learning_rate": 0.00017311155947298643, "loss": 0.848, "step": 239 }, { "epoch": 0.02018630275248649, "grad_norm": 0.6372016668319702, "learning_rate": 0.00017289686274214118, "loss": 0.7129, "step": 240 }, { "epoch": 0.020270412347288515, "grad_norm": 0.6516194939613342, "learning_rate": 0.0001726814465486903, "loss": 0.7255, "step": 241 }, { "epoch": 0.020354521942090545, "grad_norm": 0.5692350268363953, "learning_rate": 0.0001724653130187047, "loss": 0.6044, "step": 242 }, { "epoch": 0.02043863153689257, "grad_norm": 0.6217871308326721, "learning_rate": 0.00017224846428533499, "loss": 0.8463, "step": 243 }, { "epoch": 0.020522741131694597, "grad_norm": 0.4982471466064453, "learning_rate": 0.0001720309024887907, "loss": 0.7304, "step": 244 }, { "epoch": 0.020606850726496626, "grad_norm": 0.46791961789131165, "learning_rate": 0.00017181262977631888, "loss": 0.6765, "step": 245 }, { "epoch": 0.020690960321298652, "grad_norm": 0.54993736743927, "learning_rate": 0.00017159364830218312, "loss": 0.5422, "step": 246 }, { "epoch": 0.020775069916100678, "grad_norm": 0.557114839553833, "learning_rate": 0.00017137396022764214, "loss": 0.8051, "step": 247 }, { "epoch": 0.020859179510902708, "grad_norm": 0.4503937065601349, "learning_rate": 0.00017115356772092857, "loss": 0.6993, "step": 248 }, { "epoch": 0.020943289105704734, "grad_norm": 0.5274853706359863, "learning_rate": 0.0001709324729572274, "loss": 0.5102, "step": 249 }, { "epoch": 0.02102739870050676, "grad_norm": 0.5487212538719177, "learning_rate": 0.00017071067811865476, "loss": 0.7262, "step": 250 }, { "epoch": 0.02111150829530879, "grad_norm": 0.4630710482597351, "learning_rate": 0.00017048818539423615, "loss": 0.6907, "step": 251 }, { "epoch": 0.021195617890110815, "grad_norm": 0.4759877622127533, "learning_rate": 0.00017026499697988493, "loss": 0.6945, "step": 252 }, { "epoch": 0.02127972748491284, "grad_norm": 0.44283753633499146, "learning_rate": 0.00017004111507838064, "loss": 0.491, "step": 253 }, { "epoch": 0.021363837079714867, "grad_norm": 0.6089275479316711, "learning_rate": 0.00016981654189934727, "loss": 0.7725, "step": 254 }, { "epoch": 0.021447946674516896, "grad_norm": 0.48466405272483826, "learning_rate": 0.00016959127965923142, "loss": 0.6735, "step": 255 }, { "epoch": 0.021532056269318922, "grad_norm": 0.5322965383529663, "learning_rate": 0.0001693653305812805, "loss": 0.7648, "step": 256 }, { "epoch": 0.021616165864120948, "grad_norm": 0.5427155494689941, "learning_rate": 0.00016913869689552064, "loss": 0.6212, "step": 257 }, { "epoch": 0.021700275458922978, "grad_norm": 0.6860820651054382, "learning_rate": 0.00016891138083873487, "loss": 0.9867, "step": 258 }, { "epoch": 0.021784385053725003, "grad_norm": 0.5238876342773438, "learning_rate": 0.00016868338465444085, "loss": 0.7087, "step": 259 }, { "epoch": 0.02186849464852703, "grad_norm": 0.613838255405426, "learning_rate": 0.00016845471059286887, "loss": 0.6237, "step": 260 }, { "epoch": 0.02195260424332906, "grad_norm": 0.6442722082138062, "learning_rate": 0.00016822536091093965, "loss": 0.6983, "step": 261 }, { "epoch": 0.022036713838131085, "grad_norm": 0.5362198352813721, "learning_rate": 0.00016799533787224192, "loss": 0.6856, "step": 262 }, { "epoch": 0.02212082343293311, "grad_norm": 0.5040967464447021, "learning_rate": 0.00016776464374701025, "loss": 0.5694, "step": 263 }, { "epoch": 0.02220493302773514, "grad_norm": 0.6717795133590698, "learning_rate": 0.00016753328081210245, "loss": 0.7468, "step": 264 }, { "epoch": 0.022289042622537166, "grad_norm": 0.5034369230270386, "learning_rate": 0.00016730125135097735, "loss": 0.7802, "step": 265 }, { "epoch": 0.022373152217339192, "grad_norm": 0.49784526228904724, "learning_rate": 0.000167068557653672, "loss": 0.6088, "step": 266 }, { "epoch": 0.02245726181214122, "grad_norm": 0.6437605619430542, "learning_rate": 0.0001668352020167793, "loss": 0.6883, "step": 267 }, { "epoch": 0.022541371406943247, "grad_norm": 0.7116613984107971, "learning_rate": 0.00016660118674342517, "loss": 0.6185, "step": 268 }, { "epoch": 0.022625481001745273, "grad_norm": 0.5817685723304749, "learning_rate": 0.00016636651414324587, "loss": 0.6651, "step": 269 }, { "epoch": 0.022709590596547303, "grad_norm": 0.37707141041755676, "learning_rate": 0.00016613118653236518, "loss": 0.6047, "step": 270 }, { "epoch": 0.02279370019134933, "grad_norm": 0.4310030937194824, "learning_rate": 0.0001658952062333717, "loss": 0.7464, "step": 271 }, { "epoch": 0.022877809786151355, "grad_norm": 0.5687549710273743, "learning_rate": 0.00016565857557529566, "loss": 0.7018, "step": 272 }, { "epoch": 0.02296191938095338, "grad_norm": 0.4826352894306183, "learning_rate": 0.00016542129689358612, "loss": 0.6731, "step": 273 }, { "epoch": 0.02304602897575541, "grad_norm": 0.58961021900177, "learning_rate": 0.0001651833725300879, "loss": 0.7081, "step": 274 }, { "epoch": 0.023130138570557436, "grad_norm": 0.3250476121902466, "learning_rate": 0.00016494480483301836, "loss": 0.3532, "step": 275 }, { "epoch": 0.023214248165359462, "grad_norm": 0.6899309754371643, "learning_rate": 0.00016470559615694446, "loss": 0.7164, "step": 276 }, { "epoch": 0.02329835776016149, "grad_norm": 0.5115370154380798, "learning_rate": 0.00016446574886275913, "loss": 0.8014, "step": 277 }, { "epoch": 0.023382467354963517, "grad_norm": 0.4905194640159607, "learning_rate": 0.00016422526531765846, "loss": 0.5492, "step": 278 }, { "epoch": 0.023466576949765543, "grad_norm": 0.5156686305999756, "learning_rate": 0.00016398414789511786, "loss": 0.5411, "step": 279 }, { "epoch": 0.023550686544567573, "grad_norm": 0.6288424730300903, "learning_rate": 0.000163742398974869, "loss": 0.7018, "step": 280 }, { "epoch": 0.0236347961393696, "grad_norm": 0.5037186145782471, "learning_rate": 0.00016350002094287609, "loss": 0.7827, "step": 281 }, { "epoch": 0.023718905734171625, "grad_norm": 0.4114580750465393, "learning_rate": 0.00016325701619131246, "loss": 0.707, "step": 282 }, { "epoch": 0.023803015328973654, "grad_norm": 0.4033624529838562, "learning_rate": 0.00016301338711853693, "loss": 0.7176, "step": 283 }, { "epoch": 0.02388712492377568, "grad_norm": 0.7627992630004883, "learning_rate": 0.00016276913612907007, "loss": 0.7968, "step": 284 }, { "epoch": 0.023971234518577706, "grad_norm": 0.5841469168663025, "learning_rate": 0.00016252426563357055, "loss": 0.7753, "step": 285 }, { "epoch": 0.024055344113379735, "grad_norm": 0.7086002826690674, "learning_rate": 0.00016227877804881127, "loss": 0.701, "step": 286 }, { "epoch": 0.02413945370818176, "grad_norm": 0.6601248979568481, "learning_rate": 0.00016203267579765563, "loss": 0.709, "step": 287 }, { "epoch": 0.024223563302983787, "grad_norm": 0.6370972990989685, "learning_rate": 0.00016178596130903344, "loss": 0.7909, "step": 288 }, { "epoch": 0.024307672897785813, "grad_norm": 0.5954873561859131, "learning_rate": 0.00016153863701791717, "loss": 0.5179, "step": 289 }, { "epoch": 0.024391782492587843, "grad_norm": 0.6457350850105286, "learning_rate": 0.00016129070536529766, "loss": 0.8393, "step": 290 }, { "epoch": 0.02447589208738987, "grad_norm": 0.6270333528518677, "learning_rate": 0.00016104216879816026, "loss": 0.6952, "step": 291 }, { "epoch": 0.024560001682191895, "grad_norm": 0.4968658983707428, "learning_rate": 0.00016079302976946055, "loss": 0.8266, "step": 292 }, { "epoch": 0.024644111276993924, "grad_norm": 0.6642244458198547, "learning_rate": 0.00016054329073810015, "loss": 0.7603, "step": 293 }, { "epoch": 0.02472822087179595, "grad_norm": 0.4561289846897125, "learning_rate": 0.00016029295416890248, "loss": 0.5671, "step": 294 }, { "epoch": 0.024812330466597976, "grad_norm": 0.5380001664161682, "learning_rate": 0.00016004202253258842, "loss": 0.7924, "step": 295 }, { "epoch": 0.024896440061400005, "grad_norm": 0.48755741119384766, "learning_rate": 0.0001597904983057519, "loss": 0.5793, "step": 296 }, { "epoch": 0.02498054965620203, "grad_norm": 0.4303657114505768, "learning_rate": 0.00015953838397083552, "loss": 0.5611, "step": 297 }, { "epoch": 0.025064659251004057, "grad_norm": 0.5174483060836792, "learning_rate": 0.00015928568201610595, "loss": 0.8403, "step": 298 }, { "epoch": 0.025148768845806087, "grad_norm": 0.4314514398574829, "learning_rate": 0.00015903239493562948, "loss": 0.6122, "step": 299 }, { "epoch": 0.025232878440608113, "grad_norm": 0.5762454867362976, "learning_rate": 0.00015877852522924732, "loss": 0.6049, "step": 300 }, { "epoch": 0.02531698803541014, "grad_norm": 0.40056511759757996, "learning_rate": 0.00015852407540255104, "loss": 0.6657, "step": 301 }, { "epoch": 0.025401097630212168, "grad_norm": 0.3158378303050995, "learning_rate": 0.00015826904796685762, "loss": 0.4039, "step": 302 }, { "epoch": 0.025485207225014194, "grad_norm": 0.4131661653518677, "learning_rate": 0.00015801344543918495, "loss": 0.6172, "step": 303 }, { "epoch": 0.02556931681981622, "grad_norm": 0.46788647770881653, "learning_rate": 0.00015775727034222675, "loss": 0.6595, "step": 304 }, { "epoch": 0.025653426414618246, "grad_norm": 0.8410905003547668, "learning_rate": 0.00015750052520432787, "loss": 0.6009, "step": 305 }, { "epoch": 0.025737536009420275, "grad_norm": 0.4922102987766266, "learning_rate": 0.0001572432125594591, "loss": 0.737, "step": 306 }, { "epoch": 0.0258216456042223, "grad_norm": 0.48740994930267334, "learning_rate": 0.00015698533494719238, "loss": 0.6165, "step": 307 }, { "epoch": 0.025905755199024327, "grad_norm": 0.6041740775108337, "learning_rate": 0.00015672689491267567, "loss": 0.5473, "step": 308 }, { "epoch": 0.025989864793826357, "grad_norm": 0.5123183727264404, "learning_rate": 0.00015646789500660773, "loss": 0.7032, "step": 309 }, { "epoch": 0.026073974388628383, "grad_norm": 0.4975537359714508, "learning_rate": 0.00015620833778521307, "loss": 0.7373, "step": 310 }, { "epoch": 0.02615808398343041, "grad_norm": 0.41992077231407166, "learning_rate": 0.0001559482258102167, "loss": 0.7797, "step": 311 }, { "epoch": 0.026242193578232438, "grad_norm": 0.5822920203208923, "learning_rate": 0.00015568756164881882, "loss": 0.6739, "step": 312 }, { "epoch": 0.026326303173034464, "grad_norm": 0.5167087316513062, "learning_rate": 0.00015542634787366942, "loss": 0.6933, "step": 313 }, { "epoch": 0.02641041276783649, "grad_norm": 0.5934262871742249, "learning_rate": 0.00015516458706284303, "loss": 0.5772, "step": 314 }, { "epoch": 0.02649452236263852, "grad_norm": 0.43231528997421265, "learning_rate": 0.0001549022817998132, "loss": 0.7363, "step": 315 }, { "epoch": 0.026578631957440545, "grad_norm": 0.6113074421882629, "learning_rate": 0.00015463943467342693, "loss": 0.8104, "step": 316 }, { "epoch": 0.02666274155224257, "grad_norm": 0.6448655724525452, "learning_rate": 0.00015437604827787927, "loss": 0.7304, "step": 317 }, { "epoch": 0.0267468511470446, "grad_norm": 0.630864143371582, "learning_rate": 0.00015411212521268758, "loss": 0.7209, "step": 318 }, { "epoch": 0.026830960741846627, "grad_norm": 0.5093964338302612, "learning_rate": 0.00015384766808266602, "loss": 0.7396, "step": 319 }, { "epoch": 0.026915070336648653, "grad_norm": 0.5301403999328613, "learning_rate": 0.00015358267949789966, "loss": 0.8367, "step": 320 }, { "epoch": 0.02699917993145068, "grad_norm": 0.5352559089660645, "learning_rate": 0.00015331716207371888, "loss": 0.6919, "step": 321 }, { "epoch": 0.027083289526252708, "grad_norm": 0.4436148703098297, "learning_rate": 0.0001530511184306734, "loss": 0.6293, "step": 322 }, { "epoch": 0.027167399121054734, "grad_norm": 0.5910261273384094, "learning_rate": 0.00015278455119450664, "loss": 0.7454, "step": 323 }, { "epoch": 0.02725150871585676, "grad_norm": 0.5505343079566956, "learning_rate": 0.0001525174629961296, "loss": 0.7661, "step": 324 }, { "epoch": 0.02733561831065879, "grad_norm": 0.5973290205001831, "learning_rate": 0.0001522498564715949, "loss": 0.754, "step": 325 }, { "epoch": 0.027419727905460815, "grad_norm": 0.5219733715057373, "learning_rate": 0.00015198173426207094, "loss": 0.9089, "step": 326 }, { "epoch": 0.02750383750026284, "grad_norm": 0.48295462131500244, "learning_rate": 0.00015171309901381572, "loss": 0.7115, "step": 327 }, { "epoch": 0.02758794709506487, "grad_norm": 0.3980119228363037, "learning_rate": 0.00015144395337815064, "loss": 0.6346, "step": 328 }, { "epoch": 0.027672056689866897, "grad_norm": 0.44824841618537903, "learning_rate": 0.00015117430001143452, "loss": 0.7356, "step": 329 }, { "epoch": 0.027756166284668923, "grad_norm": 0.4944741129875183, "learning_rate": 0.00015090414157503714, "loss": 0.7034, "step": 330 }, { "epoch": 0.027840275879470952, "grad_norm": 0.4512578845024109, "learning_rate": 0.00015063348073531324, "loss": 0.7027, "step": 331 }, { "epoch": 0.027924385474272978, "grad_norm": 0.5666627287864685, "learning_rate": 0.0001503623201635761, "loss": 0.6587, "step": 332 }, { "epoch": 0.028008495069075004, "grad_norm": 0.4888990521430969, "learning_rate": 0.000150090662536071, "loss": 0.5327, "step": 333 }, { "epoch": 0.028092604663877033, "grad_norm": 0.5111712217330933, "learning_rate": 0.0001498185105339491, "loss": 0.6042, "step": 334 }, { "epoch": 0.02817671425867906, "grad_norm": 0.4700281620025635, "learning_rate": 0.00014954586684324078, "loss": 0.583, "step": 335 }, { "epoch": 0.028260823853481085, "grad_norm": 0.5448441505432129, "learning_rate": 0.00014927273415482915, "loss": 0.6319, "step": 336 }, { "epoch": 0.02834493344828311, "grad_norm": 0.6051144599914551, "learning_rate": 0.00014899911516442365, "loss": 0.5532, "step": 337 }, { "epoch": 0.02842904304308514, "grad_norm": 0.6583679914474487, "learning_rate": 0.00014872501257253323, "loss": 0.7324, "step": 338 }, { "epoch": 0.028513152637887167, "grad_norm": 0.468326210975647, "learning_rate": 0.0001484504290844398, "loss": 0.6747, "step": 339 }, { "epoch": 0.028597262232689193, "grad_norm": 0.5111216306686401, "learning_rate": 0.00014817536741017152, "loss": 0.8554, "step": 340 }, { "epoch": 0.028681371827491222, "grad_norm": 0.42130234837532043, "learning_rate": 0.00014789983026447612, "loss": 0.5064, "step": 341 }, { "epoch": 0.028765481422293248, "grad_norm": 0.6524533033370972, "learning_rate": 0.0001476238203667939, "loss": 0.7676, "step": 342 }, { "epoch": 0.028849591017095274, "grad_norm": 0.5613992810249329, "learning_rate": 0.0001473473404412312, "loss": 0.563, "step": 343 }, { "epoch": 0.028933700611897303, "grad_norm": 0.5069441795349121, "learning_rate": 0.0001470703932165333, "loss": 0.6787, "step": 344 }, { "epoch": 0.02901781020669933, "grad_norm": 0.47367653250694275, "learning_rate": 0.00014679298142605734, "loss": 0.8746, "step": 345 }, { "epoch": 0.029101919801501355, "grad_norm": 0.4816168546676636, "learning_rate": 0.00014651510780774583, "loss": 0.6131, "step": 346 }, { "epoch": 0.029186029396303385, "grad_norm": 0.5362951159477234, "learning_rate": 0.00014623677510409918, "loss": 0.6675, "step": 347 }, { "epoch": 0.02927013899110541, "grad_norm": 0.4960172772407532, "learning_rate": 0.00014595798606214882, "loss": 0.624, "step": 348 }, { "epoch": 0.029354248585907437, "grad_norm": 0.3880994915962219, "learning_rate": 0.00014567874343342997, "loss": 0.4903, "step": 349 }, { "epoch": 0.029438358180709466, "grad_norm": 0.5134441256523132, "learning_rate": 0.00014539904997395468, "loss": 0.7855, "step": 350 }, { "epoch": 0.029522467775511492, "grad_norm": 0.5130655765533447, "learning_rate": 0.00014511890844418453, "loss": 0.723, "step": 351 }, { "epoch": 0.029606577370313518, "grad_norm": 0.63141930103302, "learning_rate": 0.00014483832160900326, "loss": 0.8834, "step": 352 }, { "epoch": 0.029690686965115544, "grad_norm": 0.5271434783935547, "learning_rate": 0.00014455729223768966, "loss": 0.5313, "step": 353 }, { "epoch": 0.029774796559917573, "grad_norm": 0.44754379987716675, "learning_rate": 0.0001442758231038902, "loss": 0.6777, "step": 354 }, { "epoch": 0.0298589061547196, "grad_norm": 0.5021183490753174, "learning_rate": 0.00014399391698559152, "loss": 0.476, "step": 355 }, { "epoch": 0.029943015749521625, "grad_norm": 0.5433945059776306, "learning_rate": 0.0001437115766650933, "loss": 0.6773, "step": 356 }, { "epoch": 0.030027125344323655, "grad_norm": 0.42916494607925415, "learning_rate": 0.00014342880492898048, "loss": 0.5919, "step": 357 }, { "epoch": 0.03011123493912568, "grad_norm": 0.5809445381164551, "learning_rate": 0.0001431456045680959, "loss": 0.7717, "step": 358 }, { "epoch": 0.030195344533927707, "grad_norm": 0.5388799905776978, "learning_rate": 0.00014286197837751286, "loss": 0.6828, "step": 359 }, { "epoch": 0.030279454128729736, "grad_norm": 0.5621784329414368, "learning_rate": 0.00014257792915650728, "loss": 0.5758, "step": 360 }, { "epoch": 0.030363563723531762, "grad_norm": 0.5903728604316711, "learning_rate": 0.00014229345970853032, "loss": 0.6692, "step": 361 }, { "epoch": 0.030447673318333788, "grad_norm": 0.5247724652290344, "learning_rate": 0.00014200857284118066, "loss": 0.513, "step": 362 }, { "epoch": 0.030531782913135817, "grad_norm": 0.6713283658027649, "learning_rate": 0.00014172327136617656, "loss": 0.6672, "step": 363 }, { "epoch": 0.030615892507937843, "grad_norm": 0.4961901903152466, "learning_rate": 0.00014143755809932845, "loss": 0.5941, "step": 364 }, { "epoch": 0.03070000210273987, "grad_norm": 0.5194029211997986, "learning_rate": 0.00014115143586051088, "loss": 0.7219, "step": 365 }, { "epoch": 0.0307841116975419, "grad_norm": 0.5475531816482544, "learning_rate": 0.00014086490747363493, "loss": 0.7603, "step": 366 }, { "epoch": 0.030868221292343925, "grad_norm": 0.6493870615959167, "learning_rate": 0.00014057797576662, "loss": 0.6935, "step": 367 }, { "epoch": 0.03095233088714595, "grad_norm": 0.7089727520942688, "learning_rate": 0.00014029064357136628, "loss": 0.7187, "step": 368 }, { "epoch": 0.031036440481947977, "grad_norm": 0.4623405337333679, "learning_rate": 0.00014000291372372647, "loss": 0.5853, "step": 369 }, { "epoch": 0.031120550076750006, "grad_norm": 0.5741824507713318, "learning_rate": 0.00013971478906347806, "loss": 0.6912, "step": 370 }, { "epoch": 0.031204659671552032, "grad_norm": 0.5672146081924438, "learning_rate": 0.00013942627243429512, "loss": 0.9029, "step": 371 }, { "epoch": 0.03128876926635406, "grad_norm": 0.6760275959968567, "learning_rate": 0.00013913736668372026, "loss": 0.6691, "step": 372 }, { "epoch": 0.031372878861156084, "grad_norm": 0.7026247978210449, "learning_rate": 0.00013884807466313663, "loss": 0.6567, "step": 373 }, { "epoch": 0.03145698845595812, "grad_norm": 0.5599506497383118, "learning_rate": 0.00013855839922773968, "loss": 0.7411, "step": 374 }, { "epoch": 0.03154109805076014, "grad_norm": 0.370534747838974, "learning_rate": 0.000138268343236509, "loss": 0.5276, "step": 375 }, { "epoch": 0.03162520764556217, "grad_norm": 0.5061807036399841, "learning_rate": 0.00013797790955218014, "loss": 0.6831, "step": 376 }, { "epoch": 0.031709317240364195, "grad_norm": 0.5783600807189941, "learning_rate": 0.00013768710104121627, "loss": 0.5448, "step": 377 }, { "epoch": 0.03179342683516622, "grad_norm": 0.5063284039497375, "learning_rate": 0.00013739592057378003, "loss": 0.6389, "step": 378 }, { "epoch": 0.031877536429968246, "grad_norm": 0.5093449950218201, "learning_rate": 0.0001371043710237051, "loss": 0.7035, "step": 379 }, { "epoch": 0.03196164602477027, "grad_norm": 0.5934507250785828, "learning_rate": 0.00013681245526846783, "loss": 0.7793, "step": 380 }, { "epoch": 0.032045755619572305, "grad_norm": 0.5645430088043213, "learning_rate": 0.0001365201761891588, "loss": 0.9189, "step": 381 }, { "epoch": 0.03212986521437433, "grad_norm": 0.5648247599601746, "learning_rate": 0.00013622753667045457, "loss": 0.7319, "step": 382 }, { "epoch": 0.03221397480917636, "grad_norm": 0.48350241780281067, "learning_rate": 0.00013593453960058908, "loss": 0.65, "step": 383 }, { "epoch": 0.03229808440397838, "grad_norm": 0.4776511788368225, "learning_rate": 0.00013564118787132506, "loss": 0.5555, "step": 384 }, { "epoch": 0.03238219399878041, "grad_norm": 0.339843213558197, "learning_rate": 0.00013534748437792573, "loss": 0.5663, "step": 385 }, { "epoch": 0.032466303593582435, "grad_norm": 0.522289514541626, "learning_rate": 0.0001350534320191259, "loss": 0.7173, "step": 386 }, { "epoch": 0.03255041318838447, "grad_norm": 0.6175903081893921, "learning_rate": 0.0001347590336971037, "loss": 0.6715, "step": 387 }, { "epoch": 0.032634522783186494, "grad_norm": 0.5097800493240356, "learning_rate": 0.0001344642923174517, "loss": 0.7981, "step": 388 }, { "epoch": 0.03271863237798852, "grad_norm": 0.45350587368011475, "learning_rate": 0.00013416921078914835, "loss": 0.7346, "step": 389 }, { "epoch": 0.032802741972790546, "grad_norm": 0.7196130156517029, "learning_rate": 0.00013387379202452917, "loss": 0.6926, "step": 390 }, { "epoch": 0.03288685156759257, "grad_norm": 0.44739460945129395, "learning_rate": 0.00013357803893925807, "loss": 0.6764, "step": 391 }, { "epoch": 0.0329709611623946, "grad_norm": 0.42210137844085693, "learning_rate": 0.00013328195445229868, "loss": 0.5693, "step": 392 }, { "epoch": 0.033055070757196624, "grad_norm": 0.25353190302848816, "learning_rate": 0.00013298554148588528, "loss": 0.2683, "step": 393 }, { "epoch": 0.03313918035199866, "grad_norm": 0.5350023508071899, "learning_rate": 0.00013268880296549425, "loss": 0.6929, "step": 394 }, { "epoch": 0.03322328994680068, "grad_norm": 0.5112295150756836, "learning_rate": 0.00013239174181981495, "loss": 0.6803, "step": 395 }, { "epoch": 0.03330739954160271, "grad_norm": 0.5625605583190918, "learning_rate": 0.00013209436098072095, "loss": 0.7018, "step": 396 }, { "epoch": 0.033391509136404735, "grad_norm": 0.6036564111709595, "learning_rate": 0.00013179666338324108, "loss": 0.6632, "step": 397 }, { "epoch": 0.03347561873120676, "grad_norm": 0.6720706224441528, "learning_rate": 0.0001314986519655305, "loss": 0.6026, "step": 398 }, { "epoch": 0.033559728326008786, "grad_norm": 0.5537442564964294, "learning_rate": 0.0001312003296688415, "loss": 0.7609, "step": 399 }, { "epoch": 0.03364383792081082, "grad_norm": 0.567884087562561, "learning_rate": 0.00013090169943749476, "loss": 0.4756, "step": 400 }, { "epoch": 0.033727947515612845, "grad_norm": 0.47909706830978394, "learning_rate": 0.0001306027642188501, "loss": 0.6392, "step": 401 }, { "epoch": 0.03381205711041487, "grad_norm": 0.48807093501091003, "learning_rate": 0.00013030352696327742, "loss": 0.4313, "step": 402 }, { "epoch": 0.0338961667052169, "grad_norm": 0.5797786712646484, "learning_rate": 0.00013000399062412763, "loss": 0.6879, "step": 403 }, { "epoch": 0.03398027630001892, "grad_norm": 0.5691391229629517, "learning_rate": 0.0001297041581577035, "loss": 0.611, "step": 404 }, { "epoch": 0.03406438589482095, "grad_norm": 0.491915762424469, "learning_rate": 0.0001294040325232304, "loss": 0.7652, "step": 405 }, { "epoch": 0.03414849548962298, "grad_norm": 0.6337294578552246, "learning_rate": 0.00012910361668282719, "loss": 0.6599, "step": 406 }, { "epoch": 0.03423260508442501, "grad_norm": 0.5192781686782837, "learning_rate": 0.00012880291360147693, "loss": 0.6136, "step": 407 }, { "epoch": 0.034316714679227034, "grad_norm": 0.6465744376182556, "learning_rate": 0.0001285019262469976, "loss": 0.7522, "step": 408 }, { "epoch": 0.03440082427402906, "grad_norm": 0.5028295516967773, "learning_rate": 0.00012820065759001293, "loss": 0.9119, "step": 409 }, { "epoch": 0.034484933868831086, "grad_norm": 0.519190788269043, "learning_rate": 0.00012789911060392294, "loss": 0.4264, "step": 410 }, { "epoch": 0.03456904346363311, "grad_norm": 0.6284279227256775, "learning_rate": 0.0001275972882648746, "loss": 0.7378, "step": 411 }, { "epoch": 0.03465315305843514, "grad_norm": 0.679527997970581, "learning_rate": 0.00012729519355173254, "loss": 0.7436, "step": 412 }, { "epoch": 0.03473726265323717, "grad_norm": 0.49145928025245667, "learning_rate": 0.00012699282944604967, "loss": 0.8936, "step": 413 }, { "epoch": 0.0348213722480392, "grad_norm": 0.6400472521781921, "learning_rate": 0.00012669019893203759, "loss": 0.9139, "step": 414 }, { "epoch": 0.03490548184284122, "grad_norm": 0.46638303995132446, "learning_rate": 0.0001263873049965373, "loss": 0.5398, "step": 415 }, { "epoch": 0.03498959143764325, "grad_norm": 0.6132822632789612, "learning_rate": 0.00012608415062898972, "loss": 0.7835, "step": 416 }, { "epoch": 0.035073701032445274, "grad_norm": 0.5918437838554382, "learning_rate": 0.000125780738821406, "loss": 0.6589, "step": 417 }, { "epoch": 0.0351578106272473, "grad_norm": 0.45070680975914, "learning_rate": 0.00012547707256833823, "loss": 0.5931, "step": 418 }, { "epoch": 0.03524192022204933, "grad_norm": 0.5677469372749329, "learning_rate": 0.00012517315486684972, "loss": 0.7679, "step": 419 }, { "epoch": 0.03532602981685136, "grad_norm": 0.4486066997051239, "learning_rate": 0.0001248689887164855, "loss": 0.5029, "step": 420 }, { "epoch": 0.035410139411653385, "grad_norm": 0.5143264532089233, "learning_rate": 0.00012456457711924266, "loss": 0.512, "step": 421 }, { "epoch": 0.03549424900645541, "grad_norm": 0.5197947025299072, "learning_rate": 0.00012425992307954075, "loss": 0.6199, "step": 422 }, { "epoch": 0.03557835860125744, "grad_norm": 0.5069385170936584, "learning_rate": 0.0001239550296041922, "loss": 0.5604, "step": 423 }, { "epoch": 0.03566246819605946, "grad_norm": 0.5928438305854797, "learning_rate": 0.00012364989970237248, "loss": 0.8877, "step": 424 }, { "epoch": 0.035746577790861496, "grad_norm": 0.5464084148406982, "learning_rate": 0.00012334453638559057, "loss": 0.7264, "step": 425 }, { "epoch": 0.03583068738566352, "grad_norm": 0.5650874972343445, "learning_rate": 0.00012303894266765908, "loss": 0.6815, "step": 426 }, { "epoch": 0.03591479698046555, "grad_norm": 0.5731722116470337, "learning_rate": 0.00012273312156466464, "loss": 0.9198, "step": 427 }, { "epoch": 0.035998906575267574, "grad_norm": 0.3634185194969177, "learning_rate": 0.00012242707609493814, "loss": 0.6057, "step": 428 }, { "epoch": 0.0360830161700696, "grad_norm": 0.5701162815093994, "learning_rate": 0.00012212080927902474, "loss": 0.6718, "step": 429 }, { "epoch": 0.036167125764871626, "grad_norm": 0.5000714063644409, "learning_rate": 0.00012181432413965428, "loss": 0.7592, "step": 430 }, { "epoch": 0.03625123535967365, "grad_norm": 0.5954747200012207, "learning_rate": 0.00012150762370171136, "loss": 0.7098, "step": 431 }, { "epoch": 0.036335344954475685, "grad_norm": 0.5186465978622437, "learning_rate": 0.00012120071099220549, "loss": 0.5954, "step": 432 }, { "epoch": 0.03641945454927771, "grad_norm": 0.5555379390716553, "learning_rate": 0.00012089358904024117, "loss": 0.7133, "step": 433 }, { "epoch": 0.036503564144079736, "grad_norm": 0.46339845657348633, "learning_rate": 0.00012058626087698814, "loss": 0.7121, "step": 434 }, { "epoch": 0.03658767373888176, "grad_norm": 0.45588961243629456, "learning_rate": 0.00012027872953565125, "loss": 0.705, "step": 435 }, { "epoch": 0.03667178333368379, "grad_norm": 0.6387124061584473, "learning_rate": 0.00011997099805144069, "loss": 0.6375, "step": 436 }, { "epoch": 0.036755892928485814, "grad_norm": 0.5656935572624207, "learning_rate": 0.000119663069461542, "loss": 0.7211, "step": 437 }, { "epoch": 0.03684000252328785, "grad_norm": 0.4781017303466797, "learning_rate": 0.00011935494680508606, "loss": 0.6053, "step": 438 }, { "epoch": 0.03692411211808987, "grad_norm": 0.5316019654273987, "learning_rate": 0.00011904663312311901, "loss": 0.6341, "step": 439 }, { "epoch": 0.0370082217128919, "grad_norm": 0.535388171672821, "learning_rate": 0.00011873813145857249, "loss": 0.7128, "step": 440 }, { "epoch": 0.037092331307693925, "grad_norm": 0.7049708366394043, "learning_rate": 0.00011842944485623335, "loss": 0.673, "step": 441 }, { "epoch": 0.03717644090249595, "grad_norm": 0.5531265139579773, "learning_rate": 0.00011812057636271374, "loss": 0.7491, "step": 442 }, { "epoch": 0.03726055049729798, "grad_norm": 0.5389276146888733, "learning_rate": 0.000117811529026421, "loss": 0.6979, "step": 443 }, { "epoch": 0.0373446600921, "grad_norm": 0.3878541588783264, "learning_rate": 0.00011750230589752762, "loss": 0.4515, "step": 444 }, { "epoch": 0.037428769686902036, "grad_norm": 0.7032809853553772, "learning_rate": 0.00011719291002794096, "loss": 0.6921, "step": 445 }, { "epoch": 0.03751287928170406, "grad_norm": 0.5832718014717102, "learning_rate": 0.00011688334447127338, "loss": 0.508, "step": 446 }, { "epoch": 0.03759698887650609, "grad_norm": 0.4945680499076843, "learning_rate": 0.00011657361228281199, "loss": 0.5809, "step": 447 }, { "epoch": 0.037681098471308114, "grad_norm": 0.6034420728683472, "learning_rate": 0.00011626371651948838, "loss": 0.6125, "step": 448 }, { "epoch": 0.03776520806611014, "grad_norm": 0.6533922553062439, "learning_rate": 0.00011595366023984864, "loss": 0.5984, "step": 449 }, { "epoch": 0.037849317660912166, "grad_norm": 0.46035146713256836, "learning_rate": 0.0001156434465040231, "loss": 0.6094, "step": 450 }, { "epoch": 0.0379334272557142, "grad_norm": 0.6323526501655579, "learning_rate": 0.00011533307837369607, "loss": 0.9116, "step": 451 }, { "epoch": 0.038017536850516225, "grad_norm": 0.4355090260505676, "learning_rate": 0.00011502255891207572, "loss": 0.4778, "step": 452 }, { "epoch": 0.03810164644531825, "grad_norm": 0.5190046429634094, "learning_rate": 0.00011471189118386375, "loss": 0.5997, "step": 453 }, { "epoch": 0.038185756040120276, "grad_norm": 0.534466564655304, "learning_rate": 0.00011440107825522521, "loss": 0.7846, "step": 454 }, { "epoch": 0.0382698656349223, "grad_norm": 0.6099439859390259, "learning_rate": 0.00011409012319375827, "loss": 0.5393, "step": 455 }, { "epoch": 0.03835397522972433, "grad_norm": 0.4053705036640167, "learning_rate": 0.0001137790290684638, "loss": 0.547, "step": 456 }, { "epoch": 0.03843808482452636, "grad_norm": 0.3879355788230896, "learning_rate": 0.00011346779894971527, "loss": 0.5772, "step": 457 }, { "epoch": 0.03852219441932839, "grad_norm": 0.7428577542304993, "learning_rate": 0.00011315643590922827, "loss": 0.9059, "step": 458 }, { "epoch": 0.03860630401413041, "grad_norm": 0.41162705421447754, "learning_rate": 0.0001128449430200303, "loss": 0.6041, "step": 459 }, { "epoch": 0.03869041360893244, "grad_norm": 0.5828412175178528, "learning_rate": 0.00011253332335643043, "loss": 0.4467, "step": 460 }, { "epoch": 0.038774523203734465, "grad_norm": 0.5897490978240967, "learning_rate": 0.00011222157999398895, "loss": 0.8586, "step": 461 }, { "epoch": 0.03885863279853649, "grad_norm": 0.4602713882923126, "learning_rate": 0.00011190971600948699, "loss": 0.6802, "step": 462 }, { "epoch": 0.03894274239333852, "grad_norm": 0.5070551633834839, "learning_rate": 0.00011159773448089614, "loss": 0.5585, "step": 463 }, { "epoch": 0.03902685198814055, "grad_norm": 0.4254646897315979, "learning_rate": 0.00011128563848734816, "loss": 0.7244, "step": 464 }, { "epoch": 0.039110961582942576, "grad_norm": 0.5656306147575378, "learning_rate": 0.00011097343110910452, "loss": 0.8105, "step": 465 }, { "epoch": 0.0391950711777446, "grad_norm": 0.5550054311752319, "learning_rate": 0.000110661115427526, "loss": 0.4696, "step": 466 }, { "epoch": 0.03927918077254663, "grad_norm": 0.47665438055992126, "learning_rate": 0.00011034869452504226, "loss": 0.4867, "step": 467 }, { "epoch": 0.039363290367348654, "grad_norm": 0.6439712047576904, "learning_rate": 0.00011003617148512149, "loss": 0.6037, "step": 468 }, { "epoch": 0.03944739996215068, "grad_norm": 0.49243077635765076, "learning_rate": 0.00010972354939223996, "loss": 0.646, "step": 469 }, { "epoch": 0.03953150955695271, "grad_norm": 0.4704420268535614, "learning_rate": 0.00010941083133185146, "loss": 0.7801, "step": 470 }, { "epoch": 0.03961561915175474, "grad_norm": 0.4875670075416565, "learning_rate": 0.00010909802039035701, "loss": 0.5314, "step": 471 }, { "epoch": 0.039699728746556764, "grad_norm": 0.5254621505737305, "learning_rate": 0.00010878511965507434, "loss": 0.659, "step": 472 }, { "epoch": 0.03978383834135879, "grad_norm": 0.5760181546211243, "learning_rate": 0.00010847213221420736, "loss": 0.9349, "step": 473 }, { "epoch": 0.039867947936160816, "grad_norm": 0.4383930563926697, "learning_rate": 0.00010815906115681578, "loss": 0.8152, "step": 474 }, { "epoch": 0.03995205753096284, "grad_norm": 0.5969526171684265, "learning_rate": 0.0001078459095727845, "loss": 0.7431, "step": 475 }, { "epoch": 0.04003616712576487, "grad_norm": 0.5625801086425781, "learning_rate": 0.00010753268055279329, "loss": 0.7582, "step": 476 }, { "epoch": 0.0401202767205669, "grad_norm": 0.5394108295440674, "learning_rate": 0.0001072193771882861, "loss": 0.6964, "step": 477 }, { "epoch": 0.04020438631536893, "grad_norm": 0.8828211426734924, "learning_rate": 0.00010690600257144061, "loss": 0.6602, "step": 478 }, { "epoch": 0.04028849591017095, "grad_norm": 0.6621891856193542, "learning_rate": 0.0001065925597951378, "loss": 0.6224, "step": 479 }, { "epoch": 0.04037260550497298, "grad_norm": 0.5769515633583069, "learning_rate": 0.00010627905195293135, "loss": 0.7293, "step": 480 }, { "epoch": 0.040456715099775005, "grad_norm": 0.47239354252815247, "learning_rate": 0.00010596548213901708, "loss": 0.6623, "step": 481 }, { "epoch": 0.04054082469457703, "grad_norm": 0.5481030344963074, "learning_rate": 0.00010565185344820247, "loss": 0.8693, "step": 482 }, { "epoch": 0.040624934289379064, "grad_norm": 0.5839731693267822, "learning_rate": 0.00010533816897587606, "loss": 0.7281, "step": 483 }, { "epoch": 0.04070904388418109, "grad_norm": 0.5088500380516052, "learning_rate": 0.00010502443181797697, "loss": 0.6444, "step": 484 }, { "epoch": 0.040793153478983116, "grad_norm": 0.514184832572937, "learning_rate": 0.00010471064507096426, "loss": 0.678, "step": 485 }, { "epoch": 0.04087726307378514, "grad_norm": 0.48522424697875977, "learning_rate": 0.0001043968118317865, "loss": 0.6344, "step": 486 }, { "epoch": 0.04096137266858717, "grad_norm": 0.5985432863235474, "learning_rate": 0.00010408293519785101, "loss": 0.6916, "step": 487 }, { "epoch": 0.041045482263389194, "grad_norm": 0.5493951439857483, "learning_rate": 0.00010376901826699348, "loss": 0.6433, "step": 488 }, { "epoch": 0.041129591858191226, "grad_norm": 0.5034871101379395, "learning_rate": 0.00010345506413744726, "loss": 0.6704, "step": 489 }, { "epoch": 0.04121370145299325, "grad_norm": 0.5221031904220581, "learning_rate": 0.00010314107590781284, "loss": 0.4167, "step": 490 }, { "epoch": 0.04129781104779528, "grad_norm": 0.5250301957130432, "learning_rate": 0.00010282705667702734, "loss": 0.5978, "step": 491 }, { "epoch": 0.041381920642597304, "grad_norm": 0.5023308396339417, "learning_rate": 0.00010251300954433376, "loss": 0.448, "step": 492 }, { "epoch": 0.04146603023739933, "grad_norm": 0.6943901181221008, "learning_rate": 0.00010219893760925052, "loss": 0.7664, "step": 493 }, { "epoch": 0.041550139832201356, "grad_norm": 0.4787749946117401, "learning_rate": 0.00010188484397154084, "loss": 0.8114, "step": 494 }, { "epoch": 0.04163424942700338, "grad_norm": 0.5617536902427673, "learning_rate": 0.00010157073173118208, "loss": 0.6003, "step": 495 }, { "epoch": 0.041718359021805415, "grad_norm": 0.3669779300689697, "learning_rate": 0.00010125660398833528, "loss": 0.5983, "step": 496 }, { "epoch": 0.04180246861660744, "grad_norm": 0.46090903878211975, "learning_rate": 0.00010094246384331442, "loss": 0.7085, "step": 497 }, { "epoch": 0.04188657821140947, "grad_norm": 0.4907785952091217, "learning_rate": 0.00010062831439655591, "loss": 0.6197, "step": 498 }, { "epoch": 0.04197068780621149, "grad_norm": 0.4900142252445221, "learning_rate": 0.00010031415874858797, "loss": 0.53, "step": 499 }, { "epoch": 0.04205479740101352, "grad_norm": 0.5708168745040894, "learning_rate": 0.0001, "loss": 0.7168, "step": 500 }, { "epoch": 0.042138906995815545, "grad_norm": 0.5526877045631409, "learning_rate": 9.968584125141204e-05, "loss": 0.7179, "step": 501 }, { "epoch": 0.04222301659061758, "grad_norm": 0.4733210802078247, "learning_rate": 9.937168560344412e-05, "loss": 0.4766, "step": 502 }, { "epoch": 0.042307126185419604, "grad_norm": 0.5687675476074219, "learning_rate": 9.90575361566856e-05, "loss": 0.7382, "step": 503 }, { "epoch": 0.04239123578022163, "grad_norm": 0.5211085677146912, "learning_rate": 9.874339601166473e-05, "loss": 0.7914, "step": 504 }, { "epoch": 0.042475345375023656, "grad_norm": 0.6340366005897522, "learning_rate": 9.842926826881796e-05, "loss": 0.7195, "step": 505 }, { "epoch": 0.04255945496982568, "grad_norm": 0.5366642475128174, "learning_rate": 9.81151560284592e-05, "loss": 0.6782, "step": 506 }, { "epoch": 0.04264356456462771, "grad_norm": 0.6311143040657043, "learning_rate": 9.78010623907495e-05, "loss": 0.7104, "step": 507 }, { "epoch": 0.042727674159429734, "grad_norm": 0.5550001859664917, "learning_rate": 9.748699045566626e-05, "loss": 0.834, "step": 508 }, { "epoch": 0.042811783754231766, "grad_norm": 0.601676881313324, "learning_rate": 9.717294332297268e-05, "loss": 0.7551, "step": 509 }, { "epoch": 0.04289589334903379, "grad_norm": 0.4123936891555786, "learning_rate": 9.685892409218717e-05, "loss": 0.7682, "step": 510 }, { "epoch": 0.04298000294383582, "grad_norm": 0.7674761414527893, "learning_rate": 9.654493586255278e-05, "loss": 0.6967, "step": 511 }, { "epoch": 0.043064112538637844, "grad_norm": 0.6111290454864502, "learning_rate": 9.623098173300654e-05, "loss": 0.5819, "step": 512 }, { "epoch": 0.04314822213343987, "grad_norm": 0.43844571709632874, "learning_rate": 9.591706480214901e-05, "loss": 0.6972, "step": 513 }, { "epoch": 0.043232331728241896, "grad_norm": 0.538962185382843, "learning_rate": 9.560318816821353e-05, "loss": 0.7395, "step": 514 }, { "epoch": 0.04331644132304393, "grad_norm": 0.5768121480941772, "learning_rate": 9.528935492903575e-05, "loss": 0.8873, "step": 515 }, { "epoch": 0.043400550917845955, "grad_norm": 0.492720365524292, "learning_rate": 9.497556818202306e-05, "loss": 0.5948, "step": 516 }, { "epoch": 0.04348466051264798, "grad_norm": 0.516925036907196, "learning_rate": 9.466183102412395e-05, "loss": 0.6319, "step": 517 }, { "epoch": 0.04356877010745001, "grad_norm": 0.6040381193161011, "learning_rate": 9.434814655179755e-05, "loss": 0.6935, "step": 518 }, { "epoch": 0.04365287970225203, "grad_norm": 0.48684102296829224, "learning_rate": 9.403451786098294e-05, "loss": 0.5688, "step": 519 }, { "epoch": 0.04373698929705406, "grad_norm": 0.5389447212219238, "learning_rate": 9.372094804706867e-05, "loss": 0.8248, "step": 520 }, { "epoch": 0.04382109889185609, "grad_norm": 0.4723369777202606, "learning_rate": 9.340744020486222e-05, "loss": 0.5912, "step": 521 }, { "epoch": 0.04390520848665812, "grad_norm": 0.5267740488052368, "learning_rate": 9.309399742855942e-05, "loss": 0.5262, "step": 522 }, { "epoch": 0.043989318081460144, "grad_norm": 0.52064049243927, "learning_rate": 9.278062281171393e-05, "loss": 0.6672, "step": 523 }, { "epoch": 0.04407342767626217, "grad_norm": 0.46649792790412903, "learning_rate": 9.246731944720675e-05, "loss": 0.7222, "step": 524 }, { "epoch": 0.044157537271064196, "grad_norm": 0.5226630568504333, "learning_rate": 9.215409042721552e-05, "loss": 0.6677, "step": 525 }, { "epoch": 0.04424164686586622, "grad_norm": 0.5565252304077148, "learning_rate": 9.184093884318425e-05, "loss": 0.5528, "step": 526 }, { "epoch": 0.04432575646066825, "grad_norm": 0.5086897611618042, "learning_rate": 9.152786778579267e-05, "loss": 0.6953, "step": 527 }, { "epoch": 0.04440986605547028, "grad_norm": 0.5770063400268555, "learning_rate": 9.121488034492569e-05, "loss": 0.7933, "step": 528 }, { "epoch": 0.044493975650272306, "grad_norm": 0.6236185431480408, "learning_rate": 9.090197960964301e-05, "loss": 0.7398, "step": 529 }, { "epoch": 0.04457808524507433, "grad_norm": 0.605636477470398, "learning_rate": 9.058916866814858e-05, "loss": 0.7508, "step": 530 }, { "epoch": 0.04466219483987636, "grad_norm": 0.5256229043006897, "learning_rate": 9.027645060776006e-05, "loss": 0.7415, "step": 531 }, { "epoch": 0.044746304434678384, "grad_norm": 0.4349639117717743, "learning_rate": 8.99638285148785e-05, "loss": 0.7829, "step": 532 }, { "epoch": 0.04483041402948041, "grad_norm": 0.5248279571533203, "learning_rate": 8.965130547495776e-05, "loss": 0.6153, "step": 533 }, { "epoch": 0.04491452362428244, "grad_norm": 0.47936955094337463, "learning_rate": 8.933888457247402e-05, "loss": 0.5984, "step": 534 }, { "epoch": 0.04499863321908447, "grad_norm": 0.4791763722896576, "learning_rate": 8.902656889089548e-05, "loss": 0.6953, "step": 535 }, { "epoch": 0.045082742813886495, "grad_norm": 0.47786253690719604, "learning_rate": 8.871436151265184e-05, "loss": 0.6712, "step": 536 }, { "epoch": 0.04516685240868852, "grad_norm": 0.580772340297699, "learning_rate": 8.840226551910387e-05, "loss": 0.6813, "step": 537 }, { "epoch": 0.04525096200349055, "grad_norm": 0.6560893058776855, "learning_rate": 8.809028399051302e-05, "loss": 0.5794, "step": 538 }, { "epoch": 0.04533507159829257, "grad_norm": 0.479247510433197, "learning_rate": 8.777842000601105e-05, "loss": 0.6182, "step": 539 }, { "epoch": 0.045419181193094606, "grad_norm": 0.5315003395080566, "learning_rate": 8.746667664356956e-05, "loss": 0.6151, "step": 540 }, { "epoch": 0.04550329078789663, "grad_norm": 0.4729808568954468, "learning_rate": 8.715505697996971e-05, "loss": 0.6079, "step": 541 }, { "epoch": 0.04558740038269866, "grad_norm": 0.5379468202590942, "learning_rate": 8.684356409077176e-05, "loss": 0.6553, "step": 542 }, { "epoch": 0.045671509977500684, "grad_norm": 0.6452252864837646, "learning_rate": 8.653220105028474e-05, "loss": 0.7077, "step": 543 }, { "epoch": 0.04575561957230271, "grad_norm": 0.5322025418281555, "learning_rate": 8.62209709315362e-05, "loss": 0.5957, "step": 544 }, { "epoch": 0.045839729167104735, "grad_norm": 0.6063228249549866, "learning_rate": 8.590987680624174e-05, "loss": 0.8442, "step": 545 }, { "epoch": 0.04592383876190676, "grad_norm": 0.6377241015434265, "learning_rate": 8.559892174477479e-05, "loss": 0.6145, "step": 546 }, { "epoch": 0.046007948356708794, "grad_norm": 0.49134746193885803, "learning_rate": 8.528810881613626e-05, "loss": 0.7311, "step": 547 }, { "epoch": 0.04609205795151082, "grad_norm": 0.7310553193092346, "learning_rate": 8.497744108792429e-05, "loss": 0.7093, "step": 548 }, { "epoch": 0.046176167546312846, "grad_norm": 0.44871386885643005, "learning_rate": 8.466692162630392e-05, "loss": 0.488, "step": 549 }, { "epoch": 0.04626027714111487, "grad_norm": 0.604282021522522, "learning_rate": 8.435655349597689e-05, "loss": 0.5964, "step": 550 }, { "epoch": 0.0463443867359169, "grad_norm": 0.46552160382270813, "learning_rate": 8.404633976015134e-05, "loss": 0.7442, "step": 551 }, { "epoch": 0.046428496330718924, "grad_norm": 0.5520333051681519, "learning_rate": 8.373628348051165e-05, "loss": 0.5222, "step": 552 }, { "epoch": 0.04651260592552096, "grad_norm": 0.5013942718505859, "learning_rate": 8.342638771718802e-05, "loss": 0.6123, "step": 553 }, { "epoch": 0.04659671552032298, "grad_norm": 0.5522478222846985, "learning_rate": 8.311665552872662e-05, "loss": 0.6076, "step": 554 }, { "epoch": 0.04668082511512501, "grad_norm": 0.43584609031677246, "learning_rate": 8.280708997205904e-05, "loss": 0.5835, "step": 555 }, { "epoch": 0.046764934709927035, "grad_norm": 0.6730960011482239, "learning_rate": 8.249769410247239e-05, "loss": 0.5367, "step": 556 }, { "epoch": 0.04684904430472906, "grad_norm": 0.4698852002620697, "learning_rate": 8.218847097357898e-05, "loss": 0.7015, "step": 557 }, { "epoch": 0.04693315389953109, "grad_norm": 0.6545222401618958, "learning_rate": 8.187942363728625e-05, "loss": 0.7149, "step": 558 }, { "epoch": 0.04701726349433311, "grad_norm": 0.5913302898406982, "learning_rate": 8.157055514376666e-05, "loss": 0.634, "step": 559 }, { "epoch": 0.047101373089135146, "grad_norm": 0.6566689610481262, "learning_rate": 8.126186854142752e-05, "loss": 0.6646, "step": 560 }, { "epoch": 0.04718548268393717, "grad_norm": 0.5921803116798401, "learning_rate": 8.095336687688102e-05, "loss": 0.5967, "step": 561 }, { "epoch": 0.0472695922787392, "grad_norm": 0.5838126540184021, "learning_rate": 8.064505319491398e-05, "loss": 0.6807, "step": 562 }, { "epoch": 0.047353701873541224, "grad_norm": 0.5889870524406433, "learning_rate": 8.033693053845801e-05, "loss": 0.6156, "step": 563 }, { "epoch": 0.04743781146834325, "grad_norm": 0.5544804334640503, "learning_rate": 8.002900194855932e-05, "loss": 0.7279, "step": 564 }, { "epoch": 0.047521921063145275, "grad_norm": 0.5051852464675903, "learning_rate": 7.972127046434878e-05, "loss": 0.6375, "step": 565 }, { "epoch": 0.04760603065794731, "grad_norm": 0.6946410536766052, "learning_rate": 7.941373912301189e-05, "loss": 0.8067, "step": 566 }, { "epoch": 0.047690140252749334, "grad_norm": 0.5952439904212952, "learning_rate": 7.910641095975886e-05, "loss": 0.8346, "step": 567 }, { "epoch": 0.04777424984755136, "grad_norm": 0.5868372321128845, "learning_rate": 7.879928900779456e-05, "loss": 0.8255, "step": 568 }, { "epoch": 0.047858359442353386, "grad_norm": 0.6287057399749756, "learning_rate": 7.849237629828869e-05, "loss": 0.5474, "step": 569 }, { "epoch": 0.04794246903715541, "grad_norm": 0.5471776127815247, "learning_rate": 7.818567586034577e-05, "loss": 0.5467, "step": 570 }, { "epoch": 0.04802657863195744, "grad_norm": 0.8107778429985046, "learning_rate": 7.787919072097531e-05, "loss": 0.6715, "step": 571 }, { "epoch": 0.04811068822675947, "grad_norm": 0.6035006046295166, "learning_rate": 7.75729239050619e-05, "loss": 0.6016, "step": 572 }, { "epoch": 0.0481947978215615, "grad_norm": 0.6550392508506775, "learning_rate": 7.726687843533538e-05, "loss": 0.5897, "step": 573 }, { "epoch": 0.04827890741636352, "grad_norm": 0.5324556827545166, "learning_rate": 7.696105733234098e-05, "loss": 0.5424, "step": 574 }, { "epoch": 0.04836301701116555, "grad_norm": 0.5881845355033875, "learning_rate": 7.66554636144095e-05, "loss": 0.6065, "step": 575 }, { "epoch": 0.048447126605967575, "grad_norm": 0.537804901599884, "learning_rate": 7.635010029762756e-05, "loss": 0.5698, "step": 576 }, { "epoch": 0.0485312362007696, "grad_norm": 0.5901293754577637, "learning_rate": 7.604497039580785e-05, "loss": 0.6954, "step": 577 }, { "epoch": 0.04861534579557163, "grad_norm": 0.5681110620498657, "learning_rate": 7.574007692045928e-05, "loss": 0.6799, "step": 578 }, { "epoch": 0.04869945539037366, "grad_norm": 0.5399741530418396, "learning_rate": 7.543542288075739e-05, "loss": 0.6881, "step": 579 }, { "epoch": 0.048783564985175686, "grad_norm": 0.605559229850769, "learning_rate": 7.513101128351454e-05, "loss": 0.608, "step": 580 }, { "epoch": 0.04886767457997771, "grad_norm": 0.5992718935012817, "learning_rate": 7.48268451331503e-05, "loss": 0.6869, "step": 581 }, { "epoch": 0.04895178417477974, "grad_norm": 0.8363056182861328, "learning_rate": 7.45229274316618e-05, "loss": 0.7678, "step": 582 }, { "epoch": 0.04903589376958176, "grad_norm": 0.421335905790329, "learning_rate": 7.421926117859403e-05, "loss": 0.508, "step": 583 }, { "epoch": 0.04912000336438379, "grad_norm": 0.4553971588611603, "learning_rate": 7.391584937101033e-05, "loss": 0.6228, "step": 584 }, { "epoch": 0.04920411295918582, "grad_norm": 0.5710154175758362, "learning_rate": 7.361269500346274e-05, "loss": 0.6005, "step": 585 }, { "epoch": 0.04928822255398785, "grad_norm": 0.5714388489723206, "learning_rate": 7.330980106796246e-05, "loss": 0.7007, "step": 586 }, { "epoch": 0.049372332148789874, "grad_norm": 0.5537533760070801, "learning_rate": 7.300717055395039e-05, "loss": 0.6398, "step": 587 }, { "epoch": 0.0494564417435919, "grad_norm": 0.43929553031921387, "learning_rate": 7.270480644826749e-05, "loss": 0.6056, "step": 588 }, { "epoch": 0.049540551338393926, "grad_norm": 0.485586017370224, "learning_rate": 7.240271173512546e-05, "loss": 0.5453, "step": 589 }, { "epoch": 0.04962466093319595, "grad_norm": 0.43666794896125793, "learning_rate": 7.210088939607708e-05, "loss": 0.5794, "step": 590 }, { "epoch": 0.04970877052799798, "grad_norm": 0.5430508852005005, "learning_rate": 7.179934240998706e-05, "loss": 0.8037, "step": 591 }, { "epoch": 0.04979288012280001, "grad_norm": 0.6481785774230957, "learning_rate": 7.149807375300239e-05, "loss": 0.7141, "step": 592 }, { "epoch": 0.04987698971760204, "grad_norm": 0.5810648202896118, "learning_rate": 7.119708639852312e-05, "loss": 0.5569, "step": 593 }, { "epoch": 0.04996109931240406, "grad_norm": 0.6031057238578796, "learning_rate": 7.089638331717284e-05, "loss": 0.6682, "step": 594 }, { "epoch": 0.05004520890720609, "grad_norm": 0.48759695887565613, "learning_rate": 7.059596747676962e-05, "loss": 0.634, "step": 595 }, { "epoch": 0.050129318502008115, "grad_norm": 0.4522048532962799, "learning_rate": 7.029584184229653e-05, "loss": 0.6403, "step": 596 }, { "epoch": 0.05021342809681014, "grad_norm": 0.4637600779533386, "learning_rate": 6.999600937587239e-05, "loss": 0.798, "step": 597 }, { "epoch": 0.050297537691612174, "grad_norm": 0.5411447882652283, "learning_rate": 6.969647303672262e-05, "loss": 0.6316, "step": 598 }, { "epoch": 0.0503816472864142, "grad_norm": 0.48814845085144043, "learning_rate": 6.939723578114993e-05, "loss": 0.775, "step": 599 }, { "epoch": 0.050465756881216225, "grad_norm": 0.6729063391685486, "learning_rate": 6.909830056250527e-05, "loss": 0.6383, "step": 600 }, { "epoch": 0.05054986647601825, "grad_norm": 0.5428520441055298, "learning_rate": 6.879967033115853e-05, "loss": 0.5884, "step": 601 }, { "epoch": 0.05063397607082028, "grad_norm": 0.5439213514328003, "learning_rate": 6.850134803446954e-05, "loss": 0.6811, "step": 602 }, { "epoch": 0.0507180856656223, "grad_norm": 0.5737605690956116, "learning_rate": 6.820333661675893e-05, "loss": 0.559, "step": 603 }, { "epoch": 0.050802195260424336, "grad_norm": 0.610083281993866, "learning_rate": 6.790563901927907e-05, "loss": 0.6291, "step": 604 }, { "epoch": 0.05088630485522636, "grad_norm": 0.6092396378517151, "learning_rate": 6.760825818018508e-05, "loss": 0.8043, "step": 605 }, { "epoch": 0.05097041445002839, "grad_norm": 0.43380871415138245, "learning_rate": 6.731119703450577e-05, "loss": 0.8899, "step": 606 }, { "epoch": 0.051054524044830414, "grad_norm": 0.6090589761734009, "learning_rate": 6.701445851411472e-05, "loss": 0.7548, "step": 607 }, { "epoch": 0.05113863363963244, "grad_norm": 0.5451867580413818, "learning_rate": 6.671804554770135e-05, "loss": 0.5946, "step": 608 }, { "epoch": 0.051222743234434466, "grad_norm": 0.6997014880180359, "learning_rate": 6.642196106074194e-05, "loss": 0.6974, "step": 609 }, { "epoch": 0.05130685282923649, "grad_norm": 0.5238434672355652, "learning_rate": 6.612620797547087e-05, "loss": 0.6132, "step": 610 }, { "epoch": 0.051390962424038525, "grad_norm": 0.5040790438652039, "learning_rate": 6.583078921085167e-05, "loss": 0.7068, "step": 611 }, { "epoch": 0.05147507201884055, "grad_norm": 0.5203388333320618, "learning_rate": 6.55357076825483e-05, "loss": 0.7418, "step": 612 }, { "epoch": 0.05155918161364258, "grad_norm": 0.5750290751457214, "learning_rate": 6.52409663028963e-05, "loss": 0.5788, "step": 613 }, { "epoch": 0.0516432912084446, "grad_norm": 0.4557915925979614, "learning_rate": 6.494656798087412e-05, "loss": 0.6357, "step": 614 }, { "epoch": 0.05172740080324663, "grad_norm": 0.48326605558395386, "learning_rate": 6.465251562207431e-05, "loss": 0.6878, "step": 615 }, { "epoch": 0.051811510398048655, "grad_norm": 0.5541282296180725, "learning_rate": 6.435881212867493e-05, "loss": 0.5612, "step": 616 }, { "epoch": 0.05189561999285069, "grad_norm": 0.5726374983787537, "learning_rate": 6.406546039941094e-05, "loss": 0.5127, "step": 617 }, { "epoch": 0.051979729587652714, "grad_norm": 0.4574257731437683, "learning_rate": 6.377246332954544e-05, "loss": 0.6469, "step": 618 }, { "epoch": 0.05206383918245474, "grad_norm": 0.5137540698051453, "learning_rate": 6.347982381084123e-05, "loss": 0.6729, "step": 619 }, { "epoch": 0.052147948777256765, "grad_norm": 0.5800307393074036, "learning_rate": 6.318754473153221e-05, "loss": 0.7838, "step": 620 }, { "epoch": 0.05223205837205879, "grad_norm": 0.45078638195991516, "learning_rate": 6.289562897629492e-05, "loss": 0.4851, "step": 621 }, { "epoch": 0.05231616796686082, "grad_norm": 0.45273396372795105, "learning_rate": 6.260407942621998e-05, "loss": 0.4557, "step": 622 }, { "epoch": 0.05240027756166284, "grad_norm": 0.46480339765548706, "learning_rate": 6.231289895878375e-05, "loss": 0.5967, "step": 623 }, { "epoch": 0.052484387156464876, "grad_norm": 0.5039330124855042, "learning_rate": 6.20220904478199e-05, "loss": 0.6384, "step": 624 }, { "epoch": 0.0525684967512669, "grad_norm": 0.637080192565918, "learning_rate": 6.173165676349103e-05, "loss": 0.8325, "step": 625 }, { "epoch": 0.05265260634606893, "grad_norm": 0.5003658533096313, "learning_rate": 6.144160077226036e-05, "loss": 0.5859, "step": 626 }, { "epoch": 0.052736715940870954, "grad_norm": 0.6767755150794983, "learning_rate": 6.11519253368634e-05, "loss": 0.6984, "step": 627 }, { "epoch": 0.05282082553567298, "grad_norm": 0.5123332142829895, "learning_rate": 6.086263331627976e-05, "loss": 0.5831, "step": 628 }, { "epoch": 0.052904935130475006, "grad_norm": 0.6226670742034912, "learning_rate": 6.05737275657049e-05, "loss": 0.7954, "step": 629 }, { "epoch": 0.05298904472527704, "grad_norm": 0.584923267364502, "learning_rate": 6.0285210936521955e-05, "loss": 0.7863, "step": 630 }, { "epoch": 0.053073154320079065, "grad_norm": 0.5787730813026428, "learning_rate": 5.999708627627354e-05, "loss": 0.7289, "step": 631 }, { "epoch": 0.05315726391488109, "grad_norm": 0.5147082805633545, "learning_rate": 5.9709356428633746e-05, "loss": 0.5297, "step": 632 }, { "epoch": 0.05324137350968312, "grad_norm": 0.6487715840339661, "learning_rate": 5.9422024233380013e-05, "loss": 0.6826, "step": 633 }, { "epoch": 0.05332548310448514, "grad_norm": 0.5044452548027039, "learning_rate": 5.913509252636511e-05, "loss": 0.5771, "step": 634 }, { "epoch": 0.05340959269928717, "grad_norm": 0.6998575329780579, "learning_rate": 5.884856413948913e-05, "loss": 0.7294, "step": 635 }, { "epoch": 0.0534937022940892, "grad_norm": 0.4688400626182556, "learning_rate": 5.856244190067159e-05, "loss": 0.6579, "step": 636 }, { "epoch": 0.05357781188889123, "grad_norm": 0.4600168466567993, "learning_rate": 5.82767286338235e-05, "loss": 0.5962, "step": 637 }, { "epoch": 0.05366192148369325, "grad_norm": 0.6258283257484436, "learning_rate": 5.799142715881938e-05, "loss": 0.6653, "step": 638 }, { "epoch": 0.05374603107849528, "grad_norm": 0.5476671457290649, "learning_rate": 5.770654029146969e-05, "loss": 0.6163, "step": 639 }, { "epoch": 0.053830140673297305, "grad_norm": 0.4993649125099182, "learning_rate": 5.7422070843492734e-05, "loss": 0.6052, "step": 640 }, { "epoch": 0.05391425026809933, "grad_norm": 0.5785011053085327, "learning_rate": 5.713802162248718e-05, "loss": 0.6716, "step": 641 }, { "epoch": 0.05399835986290136, "grad_norm": 0.6870988607406616, "learning_rate": 5.6854395431904094e-05, "loss": 0.6964, "step": 642 }, { "epoch": 0.05408246945770339, "grad_norm": 0.40551093220710754, "learning_rate": 5.657119507101954e-05, "loss": 0.6014, "step": 643 }, { "epoch": 0.054166579052505416, "grad_norm": 0.4204919934272766, "learning_rate": 5.6288423334906735e-05, "loss": 0.6235, "step": 644 }, { "epoch": 0.05425068864730744, "grad_norm": 0.6323750019073486, "learning_rate": 5.6006083014408484e-05, "loss": 0.7131, "step": 645 }, { "epoch": 0.05433479824210947, "grad_norm": 0.4721733033657074, "learning_rate": 5.572417689610987e-05, "loss": 0.5972, "step": 646 }, { "epoch": 0.054418907836911494, "grad_norm": 0.5242834687232971, "learning_rate": 5.544270776231038e-05, "loss": 0.7438, "step": 647 }, { "epoch": 0.05450301743171352, "grad_norm": 0.5273514986038208, "learning_rate": 5.5161678390996796e-05, "loss": 0.753, "step": 648 }, { "epoch": 0.05458712702651555, "grad_norm": 0.5042054653167725, "learning_rate": 5.488109155581549e-05, "loss": 0.5714, "step": 649 }, { "epoch": 0.05467123662131758, "grad_norm": 0.6366623640060425, "learning_rate": 5.4600950026045326e-05, "loss": 0.6925, "step": 650 }, { "epoch": 0.054755346216119605, "grad_norm": 0.4337083101272583, "learning_rate": 5.4321256566570036e-05, "loss": 0.49, "step": 651 }, { "epoch": 0.05483945581092163, "grad_norm": 0.5946835875511169, "learning_rate": 5.404201393785122e-05, "loss": 0.6322, "step": 652 }, { "epoch": 0.05492356540572366, "grad_norm": 0.37681615352630615, "learning_rate": 5.3763224895900846e-05, "loss": 0.6986, "step": 653 }, { "epoch": 0.05500767500052568, "grad_norm": 0.4949415326118469, "learning_rate": 5.348489219225416e-05, "loss": 0.5161, "step": 654 }, { "epoch": 0.05509178459532771, "grad_norm": 0.5567456483840942, "learning_rate": 5.320701857394268e-05, "loss": 0.6031, "step": 655 }, { "epoch": 0.05517589419012974, "grad_norm": 0.6035876870155334, "learning_rate": 5.292960678346675e-05, "loss": 0.6714, "step": 656 }, { "epoch": 0.05526000378493177, "grad_norm": 0.830230712890625, "learning_rate": 5.265265955876879e-05, "loss": 0.6222, "step": 657 }, { "epoch": 0.05534411337973379, "grad_norm": 0.5250204801559448, "learning_rate": 5.237617963320608e-05, "loss": 0.7947, "step": 658 }, { "epoch": 0.05542822297453582, "grad_norm": 0.5689926147460938, "learning_rate": 5.210016973552391e-05, "loss": 0.752, "step": 659 }, { "epoch": 0.055512332569337845, "grad_norm": 0.5850744843482971, "learning_rate": 5.182463258982846e-05, "loss": 0.758, "step": 660 }, { "epoch": 0.05559644216413987, "grad_norm": 0.8335689306259155, "learning_rate": 5.1549570915560206e-05, "loss": 0.8418, "step": 661 }, { "epoch": 0.055680551758941904, "grad_norm": 0.5002387762069702, "learning_rate": 5.127498742746675e-05, "loss": 0.5669, "step": 662 }, { "epoch": 0.05576466135374393, "grad_norm": 0.5952647924423218, "learning_rate": 5.100088483557634e-05, "loss": 0.6905, "step": 663 }, { "epoch": 0.055848770948545956, "grad_norm": 0.6531146764755249, "learning_rate": 5.072726584517086e-05, "loss": 0.527, "step": 664 }, { "epoch": 0.05593288054334798, "grad_norm": 0.48260536789894104, "learning_rate": 5.045413315675924e-05, "loss": 0.6368, "step": 665 }, { "epoch": 0.05601699013815001, "grad_norm": 0.5707049369812012, "learning_rate": 5.018148946605092e-05, "loss": 0.676, "step": 666 }, { "epoch": 0.056101099732952034, "grad_norm": 0.5223572850227356, "learning_rate": 4.990933746392899e-05, "loss": 0.6228, "step": 667 }, { "epoch": 0.05618520932775407, "grad_norm": 0.6292992234230042, "learning_rate": 4.9637679836423924e-05, "loss": 0.5522, "step": 668 }, { "epoch": 0.05626931892255609, "grad_norm": 0.6064843535423279, "learning_rate": 4.9366519264686725e-05, "loss": 0.6125, "step": 669 }, { "epoch": 0.05635342851735812, "grad_norm": 0.6772509813308716, "learning_rate": 4.909585842496287e-05, "loss": 0.7981, "step": 670 }, { "epoch": 0.056437538112160145, "grad_norm": 0.5350778698921204, "learning_rate": 4.8825699988565485e-05, "loss": 0.8081, "step": 671 }, { "epoch": 0.05652164770696217, "grad_norm": 0.4920620620250702, "learning_rate": 4.8556046621849346e-05, "loss": 0.4888, "step": 672 }, { "epoch": 0.0566057573017642, "grad_norm": 0.5023487210273743, "learning_rate": 4.828690098618429e-05, "loss": 0.7385, "step": 673 }, { "epoch": 0.05668986689656622, "grad_norm": 0.7149183750152588, "learning_rate": 4.8018265737929044e-05, "loss": 0.6377, "step": 674 }, { "epoch": 0.056773976491368255, "grad_norm": 0.5488125681877136, "learning_rate": 4.7750143528405126e-05, "loss": 0.6302, "step": 675 }, { "epoch": 0.05685808608617028, "grad_norm": 0.6204077005386353, "learning_rate": 4.748253700387042e-05, "loss": 0.7237, "step": 676 }, { "epoch": 0.05694219568097231, "grad_norm": 0.4989658296108246, "learning_rate": 4.721544880549337e-05, "loss": 0.6335, "step": 677 }, { "epoch": 0.05702630527577433, "grad_norm": 0.582874059677124, "learning_rate": 4.694888156932658e-05, "loss": 0.6944, "step": 678 }, { "epoch": 0.05711041487057636, "grad_norm": 0.5402207374572754, "learning_rate": 4.668283792628114e-05, "loss": 0.752, "step": 679 }, { "epoch": 0.057194524465378385, "grad_norm": 0.46678709983825684, "learning_rate": 4.6417320502100316e-05, "loss": 0.625, "step": 680 }, { "epoch": 0.05727863406018042, "grad_norm": 0.5934203267097473, "learning_rate": 4.615233191733398e-05, "loss": 0.7576, "step": 681 }, { "epoch": 0.057362743654982444, "grad_norm": 0.6489574909210205, "learning_rate": 4.588787478731242e-05, "loss": 0.6303, "step": 682 }, { "epoch": 0.05744685324978447, "grad_norm": 0.503392219543457, "learning_rate": 4.5623951722120736e-05, "loss": 0.6418, "step": 683 }, { "epoch": 0.057530962844586496, "grad_norm": 0.7234579920768738, "learning_rate": 4.5360565326573104e-05, "loss": 0.8319, "step": 684 }, { "epoch": 0.05761507243938852, "grad_norm": 0.6499959826469421, "learning_rate": 4.5097718200186814e-05, "loss": 0.6247, "step": 685 }, { "epoch": 0.05769918203419055, "grad_norm": 0.5063481330871582, "learning_rate": 4.483541293715698e-05, "loss": 0.6584, "step": 686 }, { "epoch": 0.05778329162899258, "grad_norm": 0.5390834808349609, "learning_rate": 4.457365212633058e-05, "loss": 0.7036, "step": 687 }, { "epoch": 0.05786740122379461, "grad_norm": 0.4784635305404663, "learning_rate": 4.431243835118124e-05, "loss": 0.4651, "step": 688 }, { "epoch": 0.05795151081859663, "grad_norm": 0.5334429740905762, "learning_rate": 4.4051774189783315e-05, "loss": 0.5427, "step": 689 }, { "epoch": 0.05803562041339866, "grad_norm": 0.5312195420265198, "learning_rate": 4.379166221478697e-05, "loss": 0.6366, "step": 690 }, { "epoch": 0.058119730008200685, "grad_norm": 0.6458830237388611, "learning_rate": 4.3532104993392306e-05, "loss": 0.7941, "step": 691 }, { "epoch": 0.05820383960300271, "grad_norm": 0.5415295362472534, "learning_rate": 4.327310508732437e-05, "loss": 0.5574, "step": 692 }, { "epoch": 0.058287949197804736, "grad_norm": 0.4901331663131714, "learning_rate": 4.301466505280762e-05, "loss": 0.6664, "step": 693 }, { "epoch": 0.05837205879260677, "grad_norm": 0.5454357266426086, "learning_rate": 4.2756787440540936e-05, "loss": 0.7233, "step": 694 }, { "epoch": 0.058456168387408795, "grad_norm": 0.6961559057235718, "learning_rate": 4.249947479567218e-05, "loss": 0.707, "step": 695 }, { "epoch": 0.05854027798221082, "grad_norm": 0.6365110874176025, "learning_rate": 4.224272965777326e-05, "loss": 0.7353, "step": 696 }, { "epoch": 0.05862438757701285, "grad_norm": 0.527192234992981, "learning_rate": 4.1986554560815096e-05, "loss": 0.7661, "step": 697 }, { "epoch": 0.05870849717181487, "grad_norm": 0.48179832100868225, "learning_rate": 4.173095203314241e-05, "loss": 0.665, "step": 698 }, { "epoch": 0.0587926067666169, "grad_norm": 0.6591465473175049, "learning_rate": 4.1475924597449024e-05, "loss": 0.7179, "step": 699 }, { "epoch": 0.05887671636141893, "grad_norm": 0.4673934876918793, "learning_rate": 4.12214747707527e-05, "loss": 0.6477, "step": 700 }, { "epoch": 0.05896082595622096, "grad_norm": 0.8560356497764587, "learning_rate": 4.096760506437057e-05, "loss": 0.4083, "step": 701 }, { "epoch": 0.059044935551022984, "grad_norm": 0.5286902189254761, "learning_rate": 4.071431798389408e-05, "loss": 0.7202, "step": 702 }, { "epoch": 0.05912904514582501, "grad_norm": 0.5945466160774231, "learning_rate": 4.0461616029164526e-05, "loss": 0.8511, "step": 703 }, { "epoch": 0.059213154740627036, "grad_norm": 0.7009603381156921, "learning_rate": 4.020950169424815e-05, "loss": 0.8503, "step": 704 }, { "epoch": 0.05929726433542906, "grad_norm": 0.5323594808578491, "learning_rate": 3.9957977467411615e-05, "loss": 0.6064, "step": 705 }, { "epoch": 0.05938137393023109, "grad_norm": 0.4955034852027893, "learning_rate": 3.9707045831097555e-05, "loss": 0.6115, "step": 706 }, { "epoch": 0.05946548352503312, "grad_norm": 0.5311620831489563, "learning_rate": 3.945670926189987e-05, "loss": 0.6827, "step": 707 }, { "epoch": 0.05954959311983515, "grad_norm": 0.432479590177536, "learning_rate": 3.920697023053949e-05, "loss": 0.5194, "step": 708 }, { "epoch": 0.05963370271463717, "grad_norm": 0.544047474861145, "learning_rate": 3.895783120183976e-05, "loss": 0.6654, "step": 709 }, { "epoch": 0.0597178123094392, "grad_norm": 0.4839647114276886, "learning_rate": 3.8709294634702376e-05, "loss": 0.5999, "step": 710 }, { "epoch": 0.059801921904241225, "grad_norm": 0.6779318451881409, "learning_rate": 3.846136298208285e-05, "loss": 0.655, "step": 711 }, { "epoch": 0.05988603149904325, "grad_norm": 0.5102113485336304, "learning_rate": 3.821403869096658e-05, "loss": 0.6759, "step": 712 }, { "epoch": 0.05997014109384528, "grad_norm": 0.5493488311767578, "learning_rate": 3.796732420234443e-05, "loss": 0.5339, "step": 713 }, { "epoch": 0.06005425068864731, "grad_norm": 0.5925408005714417, "learning_rate": 3.7721221951188765e-05, "loss": 0.751, "step": 714 }, { "epoch": 0.060138360283449335, "grad_norm": 0.6988538503646851, "learning_rate": 3.747573436642951e-05, "loss": 0.759, "step": 715 }, { "epoch": 0.06022246987825136, "grad_norm": 0.5451468825340271, "learning_rate": 3.7230863870929964e-05, "loss": 0.5549, "step": 716 }, { "epoch": 0.06030657947305339, "grad_norm": 0.5933196544647217, "learning_rate": 3.698661288146311e-05, "loss": 0.5338, "step": 717 }, { "epoch": 0.06039068906785541, "grad_norm": 0.5368034839630127, "learning_rate": 3.674298380868756e-05, "loss": 0.6672, "step": 718 }, { "epoch": 0.060474798662657446, "grad_norm": 0.6387892961502075, "learning_rate": 3.649997905712396e-05, "loss": 0.6483, "step": 719 }, { "epoch": 0.06055890825745947, "grad_norm": 0.5074757933616638, "learning_rate": 3.6257601025131026e-05, "loss": 0.5061, "step": 720 }, { "epoch": 0.0606430178522615, "grad_norm": 0.7502937912940979, "learning_rate": 3.601585210488218e-05, "loss": 0.7917, "step": 721 }, { "epoch": 0.060727127447063524, "grad_norm": 0.5781522989273071, "learning_rate": 3.577473468234156e-05, "loss": 0.825, "step": 722 }, { "epoch": 0.06081123704186555, "grad_norm": 0.559719443321228, "learning_rate": 3.553425113724088e-05, "loss": 0.5209, "step": 723 }, { "epoch": 0.060895346636667576, "grad_norm": 0.714248538017273, "learning_rate": 3.52944038430556e-05, "loss": 0.5418, "step": 724 }, { "epoch": 0.0609794562314696, "grad_norm": 0.5317071080207825, "learning_rate": 3.5055195166981645e-05, "loss": 0.6363, "step": 725 }, { "epoch": 0.061063565826271635, "grad_norm": 0.5349583625793457, "learning_rate": 3.481662746991214e-05, "loss": 0.5828, "step": 726 }, { "epoch": 0.06114767542107366, "grad_norm": 0.5363706350326538, "learning_rate": 3.4578703106413904e-05, "loss": 0.5334, "step": 727 }, { "epoch": 0.06123178501587569, "grad_norm": 0.4438410699367523, "learning_rate": 3.4341424424704375e-05, "loss": 0.7108, "step": 728 }, { "epoch": 0.06131589461067771, "grad_norm": 0.6724744439125061, "learning_rate": 3.4104793766628304e-05, "loss": 0.6905, "step": 729 }, { "epoch": 0.06140000420547974, "grad_norm": 0.5650711059570312, "learning_rate": 3.386881346763483e-05, "loss": 0.5404, "step": 730 }, { "epoch": 0.061484113800281764, "grad_norm": 0.6993920803070068, "learning_rate": 3.363348585675414e-05, "loss": 0.6725, "step": 731 }, { "epoch": 0.0615682233950838, "grad_norm": 0.5407055616378784, "learning_rate": 3.339881325657484e-05, "loss": 0.6004, "step": 732 }, { "epoch": 0.06165233298988582, "grad_norm": 0.5904746055603027, "learning_rate": 3.316479798322072e-05, "loss": 0.6763, "step": 733 }, { "epoch": 0.06173644258468785, "grad_norm": 0.5810348987579346, "learning_rate": 3.2931442346328004e-05, "loss": 0.8605, "step": 734 }, { "epoch": 0.061820552179489875, "grad_norm": 0.48206856846809387, "learning_rate": 3.269874864902269e-05, "loss": 0.6533, "step": 735 }, { "epoch": 0.0619046617742919, "grad_norm": 0.5678583383560181, "learning_rate": 3.246671918789755e-05, "loss": 0.5872, "step": 736 }, { "epoch": 0.06198877136909393, "grad_norm": 0.5930237174034119, "learning_rate": 3.223535625298979e-05, "loss": 0.7073, "step": 737 }, { "epoch": 0.06207288096389595, "grad_norm": 0.4858033061027527, "learning_rate": 3.200466212775808e-05, "loss": 0.5829, "step": 738 }, { "epoch": 0.062156990558697986, "grad_norm": 0.6299439668655396, "learning_rate": 3.1774639089060363e-05, "loss": 0.4702, "step": 739 }, { "epoch": 0.06224110015350001, "grad_norm": 0.5381211042404175, "learning_rate": 3.154528940713113e-05, "loss": 0.7213, "step": 740 }, { "epoch": 0.06232520974830204, "grad_norm": 0.4895922839641571, "learning_rate": 3.1316615345559185e-05, "loss": 0.571, "step": 741 }, { "epoch": 0.062409319343104064, "grad_norm": 0.507371723651886, "learning_rate": 3.108861916126518e-05, "loss": 0.6783, "step": 742 }, { "epoch": 0.06249342893790609, "grad_norm": 0.5576531291007996, "learning_rate": 3.086130310447937e-05, "loss": 0.6561, "step": 743 }, { "epoch": 0.06257753853270812, "grad_norm": 0.42016541957855225, "learning_rate": 3.063466941871952e-05, "loss": 0.5153, "step": 744 }, { "epoch": 0.06266164812751014, "grad_norm": 0.7200675010681152, "learning_rate": 3.0408720340768572e-05, "loss": 0.6269, "step": 745 }, { "epoch": 0.06274575772231217, "grad_norm": 0.5375575423240662, "learning_rate": 3.018345810065275e-05, "loss": 0.5603, "step": 746 }, { "epoch": 0.0628298673171142, "grad_norm": 0.45930182933807373, "learning_rate": 2.9958884921619367e-05, "loss": 0.7863, "step": 747 }, { "epoch": 0.06291397691191623, "grad_norm": 0.5124622583389282, "learning_rate": 2.9735003020115092e-05, "loss": 0.7125, "step": 748 }, { "epoch": 0.06299808650671826, "grad_norm": 0.5122153162956238, "learning_rate": 2.9511814605763855e-05, "loss": 0.7104, "step": 749 }, { "epoch": 0.06308219610152029, "grad_norm": 0.56404048204422, "learning_rate": 2.9289321881345254e-05, "loss": 0.6569, "step": 750 }, { "epoch": 0.06316630569632231, "grad_norm": 0.4476301968097687, "learning_rate": 2.9067527042772636e-05, "loss": 0.6703, "step": 751 }, { "epoch": 0.06325041529112434, "grad_norm": 0.432229220867157, "learning_rate": 2.8846432279071467e-05, "loss": 0.7372, "step": 752 }, { "epoch": 0.06333452488592636, "grad_norm": 0.5445795059204102, "learning_rate": 2.8626039772357882e-05, "loss": 0.5326, "step": 753 }, { "epoch": 0.06341863448072839, "grad_norm": 0.645452618598938, "learning_rate": 2.840635169781688e-05, "loss": 0.3862, "step": 754 }, { "epoch": 0.06350274407553042, "grad_norm": 0.4100976288318634, "learning_rate": 2.8187370223681132e-05, "loss": 0.8218, "step": 755 }, { "epoch": 0.06358685367033244, "grad_norm": 0.7901608347892761, "learning_rate": 2.7969097511209308e-05, "loss": 0.6172, "step": 756 }, { "epoch": 0.06367096326513447, "grad_norm": 0.4609740376472473, "learning_rate": 2.775153571466502e-05, "loss": 0.7415, "step": 757 }, { "epoch": 0.06375507285993649, "grad_norm": 0.684585452079773, "learning_rate": 2.753468698129533e-05, "loss": 0.696, "step": 758 }, { "epoch": 0.06383918245473852, "grad_norm": 0.5767757296562195, "learning_rate": 2.7318553451309726e-05, "loss": 0.6223, "step": 759 }, { "epoch": 0.06392329204954054, "grad_norm": 0.5188784003257751, "learning_rate": 2.7103137257858868e-05, "loss": 0.7564, "step": 760 }, { "epoch": 0.06400740164434258, "grad_norm": 0.6304595470428467, "learning_rate": 2.688844052701359e-05, "loss": 0.7426, "step": 761 }, { "epoch": 0.06409151123914461, "grad_norm": 0.5373560190200806, "learning_rate": 2.6674465377744017e-05, "loss": 0.7388, "step": 762 }, { "epoch": 0.06417562083394664, "grad_norm": 0.5632065534591675, "learning_rate": 2.646121392189841e-05, "loss": 0.5892, "step": 763 }, { "epoch": 0.06425973042874866, "grad_norm": 0.5747659802436829, "learning_rate": 2.624868826418262e-05, "loss": 0.4149, "step": 764 }, { "epoch": 0.06434384002355069, "grad_norm": 0.7094728946685791, "learning_rate": 2.603689050213902e-05, "loss": 0.6391, "step": 765 }, { "epoch": 0.06442794961835271, "grad_norm": 0.6141682863235474, "learning_rate": 2.582582272612609e-05, "loss": 0.5478, "step": 766 }, { "epoch": 0.06451205921315474, "grad_norm": 0.5336157083511353, "learning_rate": 2.561548701929749e-05, "loss": 0.6272, "step": 767 }, { "epoch": 0.06459616880795677, "grad_norm": 0.5744621753692627, "learning_rate": 2.540588545758179e-05, "loss": 0.6357, "step": 768 }, { "epoch": 0.06468027840275879, "grad_norm": 0.5280835628509521, "learning_rate": 2.5197020109661772e-05, "loss": 0.6744, "step": 769 }, { "epoch": 0.06476438799756082, "grad_norm": 0.5798782110214233, "learning_rate": 2.4988893036954043e-05, "loss": 0.9299, "step": 770 }, { "epoch": 0.06484849759236284, "grad_norm": 0.7400964498519897, "learning_rate": 2.4781506293588873e-05, "loss": 0.7047, "step": 771 }, { "epoch": 0.06493260718716487, "grad_norm": 0.4643491208553314, "learning_rate": 2.4574861926389615e-05, "loss": 0.5174, "step": 772 }, { "epoch": 0.0650167167819669, "grad_norm": 0.4251709580421448, "learning_rate": 2.436896197485282e-05, "loss": 0.6295, "step": 773 }, { "epoch": 0.06510082637676894, "grad_norm": 0.48561739921569824, "learning_rate": 2.4163808471127812e-05, "loss": 0.6543, "step": 774 }, { "epoch": 0.06518493597157096, "grad_norm": 0.6305271983146667, "learning_rate": 2.3959403439996907e-05, "loss": 0.5729, "step": 775 }, { "epoch": 0.06526904556637299, "grad_norm": 0.6219698190689087, "learning_rate": 2.37557488988552e-05, "loss": 0.7075, "step": 776 }, { "epoch": 0.06535315516117501, "grad_norm": 0.5188342332839966, "learning_rate": 2.3552846857690846e-05, "loss": 0.7433, "step": 777 }, { "epoch": 0.06543726475597704, "grad_norm": 1.439207673072815, "learning_rate": 2.3350699319065026e-05, "loss": 0.8646, "step": 778 }, { "epoch": 0.06552137435077907, "grad_norm": 0.5281020402908325, "learning_rate": 2.3149308278092342e-05, "loss": 0.7782, "step": 779 }, { "epoch": 0.06560548394558109, "grad_norm": 0.5641273260116577, "learning_rate": 2.2948675722421086e-05, "loss": 0.8682, "step": 780 }, { "epoch": 0.06568959354038312, "grad_norm": 0.6752006411552429, "learning_rate": 2.2748803632213557e-05, "loss": 0.652, "step": 781 }, { "epoch": 0.06577370313518514, "grad_norm": 0.48423561453819275, "learning_rate": 2.254969398012663e-05, "loss": 0.3943, "step": 782 }, { "epoch": 0.06585781272998717, "grad_norm": 0.5390269756317139, "learning_rate": 2.235134873129213e-05, "loss": 0.657, "step": 783 }, { "epoch": 0.0659419223247892, "grad_norm": 0.469672828912735, "learning_rate": 2.2153769843297667e-05, "loss": 0.5305, "step": 784 }, { "epoch": 0.06602603191959122, "grad_norm": 0.6228986978530884, "learning_rate": 2.195695926616702e-05, "loss": 0.546, "step": 785 }, { "epoch": 0.06611014151439325, "grad_norm": 0.537236750125885, "learning_rate": 2.1760918942341192e-05, "loss": 0.7306, "step": 786 }, { "epoch": 0.06619425110919529, "grad_norm": 0.48184770345687866, "learning_rate": 2.1565650806658975e-05, "loss": 0.6454, "step": 787 }, { "epoch": 0.06627836070399731, "grad_norm": 0.6069773435592651, "learning_rate": 2.137115678633811e-05, "loss": 0.6599, "step": 788 }, { "epoch": 0.06636247029879934, "grad_norm": 0.5169489979743958, "learning_rate": 2.1177438800956007e-05, "loss": 0.5624, "step": 789 }, { "epoch": 0.06644657989360137, "grad_norm": 0.4128231704235077, "learning_rate": 2.098449876243096e-05, "loss": 0.5663, "step": 790 }, { "epoch": 0.06653068948840339, "grad_norm": 0.715587854385376, "learning_rate": 2.07923385750033e-05, "loss": 0.7287, "step": 791 }, { "epoch": 0.06661479908320542, "grad_norm": 0.44507458806037903, "learning_rate": 2.0600960135216462e-05, "loss": 0.6717, "step": 792 }, { "epoch": 0.06669890867800744, "grad_norm": 0.5170130729675293, "learning_rate": 2.0410365331898416e-05, "loss": 0.6406, "step": 793 }, { "epoch": 0.06678301827280947, "grad_norm": 0.46590495109558105, "learning_rate": 2.0220556046142893e-05, "loss": 0.6061, "step": 794 }, { "epoch": 0.0668671278676115, "grad_norm": 0.5688216686248779, "learning_rate": 2.0031534151290943e-05, "loss": 0.7645, "step": 795 }, { "epoch": 0.06695123746241352, "grad_norm": 0.6071348786354065, "learning_rate": 1.9843301512912327e-05, "loss": 0.8112, "step": 796 }, { "epoch": 0.06703534705721555, "grad_norm": 0.639324963092804, "learning_rate": 1.965585998878724e-05, "loss": 0.683, "step": 797 }, { "epoch": 0.06711945665201757, "grad_norm": 0.5158023238182068, "learning_rate": 1.946921142888781e-05, "loss": 0.6532, "step": 798 }, { "epoch": 0.06720356624681961, "grad_norm": 0.5088067650794983, "learning_rate": 1.928335767535997e-05, "loss": 0.6117, "step": 799 }, { "epoch": 0.06728767584162164, "grad_norm": 0.4564967155456543, "learning_rate": 1.9098300562505266e-05, "loss": 0.6276, "step": 800 }, { "epoch": 0.06737178543642366, "grad_norm": 0.620216429233551, "learning_rate": 1.891404191676265e-05, "loss": 0.655, "step": 801 }, { "epoch": 0.06745589503122569, "grad_norm": 0.698749303817749, "learning_rate": 1.8730583556690605e-05, "loss": 0.5483, "step": 802 }, { "epoch": 0.06754000462602772, "grad_norm": 0.48406967520713806, "learning_rate": 1.854792729294905e-05, "loss": 0.7465, "step": 803 }, { "epoch": 0.06762411422082974, "grad_norm": 0.5097486972808838, "learning_rate": 1.8366074928281607e-05, "loss": 0.4625, "step": 804 }, { "epoch": 0.06770822381563177, "grad_norm": 0.5742330551147461, "learning_rate": 1.818502825749764e-05, "loss": 0.6649, "step": 805 }, { "epoch": 0.0677923334104338, "grad_norm": 0.49662643671035767, "learning_rate": 1.8004789067454764e-05, "loss": 0.7763, "step": 806 }, { "epoch": 0.06787644300523582, "grad_norm": 0.6889787912368774, "learning_rate": 1.7825359137040988e-05, "loss": 0.6646, "step": 807 }, { "epoch": 0.06796055260003785, "grad_norm": 0.47545164823532104, "learning_rate": 1.7646740237157256e-05, "loss": 0.6404, "step": 808 }, { "epoch": 0.06804466219483987, "grad_norm": 0.5143394470214844, "learning_rate": 1.7468934130700044e-05, "loss": 0.6727, "step": 809 }, { "epoch": 0.0681287717896419, "grad_norm": 0.5231197476387024, "learning_rate": 1.7291942572543807e-05, "loss": 0.6471, "step": 810 }, { "epoch": 0.06821288138444392, "grad_norm": 0.5731273293495178, "learning_rate": 1.7115767309523812e-05, "loss": 0.7455, "step": 811 }, { "epoch": 0.06829699097924596, "grad_norm": 0.5894562005996704, "learning_rate": 1.6940410080418723e-05, "loss": 0.6111, "step": 812 }, { "epoch": 0.06838110057404799, "grad_norm": 0.5060394406318665, "learning_rate": 1.6765872615933677e-05, "loss": 0.5798, "step": 813 }, { "epoch": 0.06846521016885002, "grad_norm": 0.5075900554656982, "learning_rate": 1.6592156638682886e-05, "loss": 0.6851, "step": 814 }, { "epoch": 0.06854931976365204, "grad_norm": 0.40963247418403625, "learning_rate": 1.6419263863172997e-05, "loss": 0.53, "step": 815 }, { "epoch": 0.06863342935845407, "grad_norm": 0.5541537404060364, "learning_rate": 1.6247195995785837e-05, "loss": 0.8987, "step": 816 }, { "epoch": 0.0687175389532561, "grad_norm": 0.6535544395446777, "learning_rate": 1.6075954734761845e-05, "loss": 0.5659, "step": 817 }, { "epoch": 0.06880164854805812, "grad_norm": 0.4968286454677582, "learning_rate": 1.5905541770183096e-05, "loss": 0.6928, "step": 818 }, { "epoch": 0.06888575814286015, "grad_norm": 0.852871835231781, "learning_rate": 1.5735958783956794e-05, "loss": 0.5483, "step": 819 }, { "epoch": 0.06896986773766217, "grad_norm": 0.6427825689315796, "learning_rate": 1.5567207449798515e-05, "loss": 0.5945, "step": 820 }, { "epoch": 0.0690539773324642, "grad_norm": 0.6467059850692749, "learning_rate": 1.539928943321579e-05, "loss": 0.5874, "step": 821 }, { "epoch": 0.06913808692726622, "grad_norm": 0.5707730054855347, "learning_rate": 1.5232206391491699e-05, "loss": 0.616, "step": 822 }, { "epoch": 0.06922219652206825, "grad_norm": 0.6659408211708069, "learning_rate": 1.5065959973668353e-05, "loss": 0.782, "step": 823 }, { "epoch": 0.06930630611687028, "grad_norm": 0.5345143675804138, "learning_rate": 1.4900551820530828e-05, "loss": 0.7579, "step": 824 }, { "epoch": 0.06939041571167232, "grad_norm": 0.5110843777656555, "learning_rate": 1.4735983564590783e-05, "loss": 0.778, "step": 825 }, { "epoch": 0.06947452530647434, "grad_norm": 0.4799653887748718, "learning_rate": 1.4572256830070497e-05, "loss": 0.6927, "step": 826 }, { "epoch": 0.06955863490127637, "grad_norm": 0.5991430282592773, "learning_rate": 1.4409373232886702e-05, "loss": 0.6772, "step": 827 }, { "epoch": 0.0696427444960784, "grad_norm": 0.6813725233078003, "learning_rate": 1.4247334380634792e-05, "loss": 0.6461, "step": 828 }, { "epoch": 0.06972685409088042, "grad_norm": 0.6396465301513672, "learning_rate": 1.4086141872572789e-05, "loss": 0.7156, "step": 829 }, { "epoch": 0.06981096368568245, "grad_norm": 0.6053589582443237, "learning_rate": 1.3925797299605647e-05, "loss": 0.8174, "step": 830 }, { "epoch": 0.06989507328048447, "grad_norm": 0.5992446541786194, "learning_rate": 1.3766302244269624e-05, "loss": 0.7147, "step": 831 }, { "epoch": 0.0699791828752865, "grad_norm": 0.5334646701812744, "learning_rate": 1.3607658280716473e-05, "loss": 0.5372, "step": 832 }, { "epoch": 0.07006329247008852, "grad_norm": 0.4751766622066498, "learning_rate": 1.3449866974698122e-05, "loss": 0.4352, "step": 833 }, { "epoch": 0.07014740206489055, "grad_norm": 0.4926972985267639, "learning_rate": 1.3292929883550998e-05, "loss": 0.6821, "step": 834 }, { "epoch": 0.07023151165969257, "grad_norm": 0.4819171726703644, "learning_rate": 1.3136848556180892e-05, "loss": 0.6051, "step": 835 }, { "epoch": 0.0703156212544946, "grad_norm": 0.48462870717048645, "learning_rate": 1.2981624533047432e-05, "loss": 0.5676, "step": 836 }, { "epoch": 0.07039973084929663, "grad_norm": 0.5962949395179749, "learning_rate": 1.2827259346149122e-05, "loss": 0.5559, "step": 837 }, { "epoch": 0.07048384044409867, "grad_norm": 0.5228509306907654, "learning_rate": 1.2673754519008008e-05, "loss": 0.6072, "step": 838 }, { "epoch": 0.07056795003890069, "grad_norm": 0.6193320751190186, "learning_rate": 1.2521111566654731e-05, "loss": 0.6156, "step": 839 }, { "epoch": 0.07065205963370272, "grad_norm": 0.48941028118133545, "learning_rate": 1.2369331995613665e-05, "loss": 0.5834, "step": 840 }, { "epoch": 0.07073616922850474, "grad_norm": 0.5763342380523682, "learning_rate": 1.2218417303887842e-05, "loss": 0.6142, "step": 841 }, { "epoch": 0.07082027882330677, "grad_norm": 0.5619479417800903, "learning_rate": 1.206836898094439e-05, "loss": 0.5918, "step": 842 }, { "epoch": 0.0709043884181088, "grad_norm": 0.4731215536594391, "learning_rate": 1.191918850769964e-05, "loss": 0.4884, "step": 843 }, { "epoch": 0.07098849801291082, "grad_norm": 0.7118493318557739, "learning_rate": 1.1770877356504683e-05, "loss": 0.6499, "step": 844 }, { "epoch": 0.07107260760771285, "grad_norm": 0.7185307741165161, "learning_rate": 1.1623436991130654e-05, "loss": 0.7735, "step": 845 }, { "epoch": 0.07115671720251487, "grad_norm": 0.5481952428817749, "learning_rate": 1.1476868866754486e-05, "loss": 0.919, "step": 846 }, { "epoch": 0.0712408267973169, "grad_norm": 0.596553385257721, "learning_rate": 1.1331174429944347e-05, "loss": 0.6963, "step": 847 }, { "epoch": 0.07132493639211893, "grad_norm": 0.5250246524810791, "learning_rate": 1.1186355118645554e-05, "loss": 0.6193, "step": 848 }, { "epoch": 0.07140904598692095, "grad_norm": 0.5260514616966248, "learning_rate": 1.1042412362166222e-05, "loss": 0.6361, "step": 849 }, { "epoch": 0.07149315558172299, "grad_norm": 0.5367104411125183, "learning_rate": 1.0899347581163221e-05, "loss": 0.5347, "step": 850 }, { "epoch": 0.07157726517652502, "grad_norm": 0.5701801776885986, "learning_rate": 1.0757162187628222e-05, "loss": 0.6477, "step": 851 }, { "epoch": 0.07166137477132704, "grad_norm": 0.494419127702713, "learning_rate": 1.0615857584873623e-05, "loss": 0.6831, "step": 852 }, { "epoch": 0.07174548436612907, "grad_norm": 0.6777026057243347, "learning_rate": 1.0475435167518843e-05, "loss": 0.6998, "step": 853 }, { "epoch": 0.0718295939609311, "grad_norm": 0.6337087154388428, "learning_rate": 1.0335896321476413e-05, "loss": 0.6884, "step": 854 }, { "epoch": 0.07191370355573312, "grad_norm": 0.508190393447876, "learning_rate": 1.0197242423938446e-05, "loss": 0.6033, "step": 855 }, { "epoch": 0.07199781315053515, "grad_norm": 0.588848888874054, "learning_rate": 1.0059474843362892e-05, "loss": 0.6572, "step": 856 }, { "epoch": 0.07208192274533717, "grad_norm": 0.6991647481918335, "learning_rate": 9.922594939460194e-06, "loss": 0.5044, "step": 857 }, { "epoch": 0.0721660323401392, "grad_norm": 0.6183068752288818, "learning_rate": 9.786604063179728e-06, "loss": 0.5294, "step": 858 }, { "epoch": 0.07225014193494123, "grad_norm": 0.6094670295715332, "learning_rate": 9.651503556696516e-06, "loss": 0.4282, "step": 859 }, { "epoch": 0.07233425152974325, "grad_norm": 0.5989966988563538, "learning_rate": 9.517294753398064e-06, "loss": 0.6284, "step": 860 }, { "epoch": 0.07241836112454528, "grad_norm": 0.5924322009086609, "learning_rate": 9.383978977871021e-06, "loss": 0.4946, "step": 861 }, { "epoch": 0.0725024707193473, "grad_norm": 0.6521950364112854, "learning_rate": 9.251557545888312e-06, "loss": 0.6351, "step": 862 }, { "epoch": 0.07258658031414934, "grad_norm": 0.5028837323188782, "learning_rate": 9.120031764395987e-06, "loss": 0.6651, "step": 863 }, { "epoch": 0.07267068990895137, "grad_norm": 0.5665683746337891, "learning_rate": 8.989402931500434e-06, "loss": 0.6975, "step": 864 }, { "epoch": 0.0727547995037534, "grad_norm": 0.6868077516555786, "learning_rate": 8.85967233645547e-06, "loss": 0.6654, "step": 865 }, { "epoch": 0.07283890909855542, "grad_norm": 0.4886159300804138, "learning_rate": 8.730841259649725e-06, "loss": 0.6435, "step": 866 }, { "epoch": 0.07292301869335745, "grad_norm": 0.5298961997032166, "learning_rate": 8.602910972593892e-06, "loss": 0.6013, "step": 867 }, { "epoch": 0.07300712828815947, "grad_norm": 0.5650634765625, "learning_rate": 8.475882737908248e-06, "loss": 0.6252, "step": 868 }, { "epoch": 0.0730912378829615, "grad_norm": 0.6334890723228455, "learning_rate": 8.34975780931021e-06, "loss": 0.5932, "step": 869 }, { "epoch": 0.07317534747776352, "grad_norm": 0.42107874155044556, "learning_rate": 8.224537431601886e-06, "loss": 0.7897, "step": 870 }, { "epoch": 0.07325945707256555, "grad_norm": 0.4216519594192505, "learning_rate": 8.100222840657878e-06, "loss": 0.671, "step": 871 }, { "epoch": 0.07334356666736758, "grad_norm": 0.4145600497722626, "learning_rate": 7.976815263412963e-06, "loss": 0.5159, "step": 872 }, { "epoch": 0.0734276762621696, "grad_norm": 0.6570865511894226, "learning_rate": 7.854315917850163e-06, "loss": 0.685, "step": 873 }, { "epoch": 0.07351178585697163, "grad_norm": 0.42912426590919495, "learning_rate": 7.73272601298851e-06, "loss": 0.5689, "step": 874 }, { "epoch": 0.07359589545177365, "grad_norm": 0.5340107679367065, "learning_rate": 7.612046748871327e-06, "loss": 0.6738, "step": 875 }, { "epoch": 0.0736800050465757, "grad_norm": 0.6763095855712891, "learning_rate": 7.492279316554207e-06, "loss": 0.6679, "step": 876 }, { "epoch": 0.07376411464137772, "grad_norm": 0.6157491207122803, "learning_rate": 7.3734248980933395e-06, "loss": 0.7077, "step": 877 }, { "epoch": 0.07384822423617975, "grad_norm": 0.6420180797576904, "learning_rate": 7.255484666533874e-06, "loss": 0.6328, "step": 878 }, { "epoch": 0.07393233383098177, "grad_norm": 0.7011188268661499, "learning_rate": 7.138459785898266e-06, "loss": 0.6339, "step": 879 }, { "epoch": 0.0740164434257838, "grad_norm": 0.5735622048377991, "learning_rate": 7.022351411174866e-06, "loss": 0.6621, "step": 880 }, { "epoch": 0.07410055302058582, "grad_norm": 0.49764999747276306, "learning_rate": 6.907160688306425e-06, "loss": 0.5328, "step": 881 }, { "epoch": 0.07418466261538785, "grad_norm": 0.6401441097259521, "learning_rate": 6.7928887541789055e-06, "loss": 0.6566, "step": 882 }, { "epoch": 0.07426877221018988, "grad_norm": 0.44652143120765686, "learning_rate": 6.679536736610137e-06, "loss": 0.5332, "step": 883 }, { "epoch": 0.0743528818049919, "grad_norm": 0.6002373695373535, "learning_rate": 6.5671057543387985e-06, "loss": 0.7656, "step": 884 }, { "epoch": 0.07443699139979393, "grad_norm": 0.6372143030166626, "learning_rate": 6.455596917013273e-06, "loss": 0.7022, "step": 885 }, { "epoch": 0.07452110099459595, "grad_norm": 0.4614960253238678, "learning_rate": 6.345011325180772e-06, "loss": 0.62, "step": 886 }, { "epoch": 0.07460521058939798, "grad_norm": 0.6100625395774841, "learning_rate": 6.235350070276447e-06, "loss": 0.7091, "step": 887 }, { "epoch": 0.0746893201842, "grad_norm": 0.6311874985694885, "learning_rate": 6.126614234612593e-06, "loss": 0.6988, "step": 888 }, { "epoch": 0.07477342977900205, "grad_norm": 0.6123420000076294, "learning_rate": 6.018804891368035e-06, "loss": 0.6147, "step": 889 }, { "epoch": 0.07485753937380407, "grad_norm": 0.5420098304748535, "learning_rate": 5.911923104577455e-06, "loss": 0.6469, "step": 890 }, { "epoch": 0.0749416489686061, "grad_norm": 0.5997559428215027, "learning_rate": 5.805969929120947e-06, "loss": 0.6006, "step": 891 }, { "epoch": 0.07502575856340812, "grad_norm": 0.801465630531311, "learning_rate": 5.700946410713548e-06, "loss": 0.7753, "step": 892 }, { "epoch": 0.07510986815821015, "grad_norm": 0.5048848986625671, "learning_rate": 5.5968535858950345e-06, "loss": 0.6048, "step": 893 }, { "epoch": 0.07519397775301218, "grad_norm": 0.6630212664604187, "learning_rate": 5.49369248201953e-06, "loss": 0.8072, "step": 894 }, { "epoch": 0.0752780873478142, "grad_norm": 0.7513829469680786, "learning_rate": 5.39146411724547e-06, "loss": 0.6736, "step": 895 }, { "epoch": 0.07536219694261623, "grad_norm": 1.04941987991333, "learning_rate": 5.290169500525577e-06, "loss": 0.6241, "step": 896 }, { "epoch": 0.07544630653741825, "grad_norm": 0.4719288647174835, "learning_rate": 5.189809631596798e-06, "loss": 0.5016, "step": 897 }, { "epoch": 0.07553041613222028, "grad_norm": 0.5207799673080444, "learning_rate": 5.0903855009705514e-06, "loss": 0.6855, "step": 898 }, { "epoch": 0.0756145257270223, "grad_norm": 0.5883520245552063, "learning_rate": 4.991898089922819e-06, "loss": 0.841, "step": 899 }, { "epoch": 0.07569863532182433, "grad_norm": 0.48410382866859436, "learning_rate": 4.8943483704846475e-06, "loss": 0.7806, "step": 900 }, { "epoch": 0.07578274491662636, "grad_norm": 0.5785539150238037, "learning_rate": 4.797737305432337e-06, "loss": 0.7894, "step": 901 }, { "epoch": 0.0758668545114284, "grad_norm": 0.6412384510040283, "learning_rate": 4.702065848278126e-06, "loss": 0.5901, "step": 902 }, { "epoch": 0.07595096410623042, "grad_norm": 0.6484128832817078, "learning_rate": 4.607334943260655e-06, "loss": 0.5787, "step": 903 }, { "epoch": 0.07603507370103245, "grad_norm": 0.6749901175498962, "learning_rate": 4.513545525335705e-06, "loss": 0.6713, "step": 904 }, { "epoch": 0.07611918329583447, "grad_norm": 0.6375152468681335, "learning_rate": 4.420698520166988e-06, "loss": 0.5882, "step": 905 }, { "epoch": 0.0762032928906365, "grad_norm": 0.5309505462646484, "learning_rate": 4.328794844116946e-06, "loss": 0.6259, "step": 906 }, { "epoch": 0.07628740248543853, "grad_norm": 0.5349912047386169, "learning_rate": 4.237835404237778e-06, "loss": 0.7206, "step": 907 }, { "epoch": 0.07637151208024055, "grad_norm": 0.5662460327148438, "learning_rate": 4.147821098262405e-06, "loss": 0.8227, "step": 908 }, { "epoch": 0.07645562167504258, "grad_norm": 0.5176487565040588, "learning_rate": 4.0587528145957235e-06, "loss": 0.6219, "step": 909 }, { "epoch": 0.0765397312698446, "grad_norm": 0.680101752281189, "learning_rate": 3.970631432305694e-06, "loss": 0.822, "step": 910 }, { "epoch": 0.07662384086464663, "grad_norm": 0.6776043176651001, "learning_rate": 3.883457821114811e-06, "loss": 0.5682, "step": 911 }, { "epoch": 0.07670795045944866, "grad_norm": 0.5205222368240356, "learning_rate": 3.797232841391407e-06, "loss": 0.6134, "step": 912 }, { "epoch": 0.07679206005425068, "grad_norm": 0.5915586948394775, "learning_rate": 3.711957344141237e-06, "loss": 0.748, "step": 913 }, { "epoch": 0.07687616964905272, "grad_norm": 0.5967119336128235, "learning_rate": 3.627632170999029e-06, "loss": 0.6232, "step": 914 }, { "epoch": 0.07696027924385475, "grad_norm": 0.48944488167762756, "learning_rate": 3.5442581542201923e-06, "loss": 0.5955, "step": 915 }, { "epoch": 0.07704438883865677, "grad_norm": 0.6073352098464966, "learning_rate": 3.461836116672612e-06, "loss": 0.7206, "step": 916 }, { "epoch": 0.0771284984334588, "grad_norm": 0.6056821346282959, "learning_rate": 3.380366871828522e-06, "loss": 0.6869, "step": 917 }, { "epoch": 0.07721260802826083, "grad_norm": 0.50712651014328, "learning_rate": 3.2998512237565005e-06, "loss": 0.5963, "step": 918 }, { "epoch": 0.07729671762306285, "grad_norm": 0.5057475566864014, "learning_rate": 3.2202899671134546e-06, "loss": 0.7265, "step": 919 }, { "epoch": 0.07738082721786488, "grad_norm": 0.4675261378288269, "learning_rate": 3.1416838871368924e-06, "loss": 0.4619, "step": 920 }, { "epoch": 0.0774649368126669, "grad_norm": 0.5350446105003357, "learning_rate": 3.064033759637064e-06, "loss": 0.6337, "step": 921 }, { "epoch": 0.07754904640746893, "grad_norm": 0.653113067150116, "learning_rate": 2.9873403509894203e-06, "loss": 0.6033, "step": 922 }, { "epoch": 0.07763315600227096, "grad_norm": 0.6161055564880371, "learning_rate": 2.9116044181269007e-06, "loss": 0.5878, "step": 923 }, { "epoch": 0.07771726559707298, "grad_norm": 0.569428563117981, "learning_rate": 2.836826708532603e-06, "loss": 0.6588, "step": 924 }, { "epoch": 0.07780137519187501, "grad_norm": 0.7410022020339966, "learning_rate": 2.7630079602323442e-06, "loss": 0.4855, "step": 925 }, { "epoch": 0.07788548478667703, "grad_norm": 0.6758790016174316, "learning_rate": 2.690148901787337e-06, "loss": 0.6881, "step": 926 }, { "epoch": 0.07796959438147907, "grad_norm": 0.49047234654426575, "learning_rate": 2.618250252287113e-06, "loss": 0.5011, "step": 927 }, { "epoch": 0.0780537039762811, "grad_norm": 0.5375692248344421, "learning_rate": 2.5473127213422763e-06, "loss": 0.7307, "step": 928 }, { "epoch": 0.07813781357108313, "grad_norm": 0.581413745880127, "learning_rate": 2.4773370090776626e-06, "loss": 0.6392, "step": 929 }, { "epoch": 0.07822192316588515, "grad_norm": 0.6516633033752441, "learning_rate": 2.4083238061252567e-06, "loss": 0.6592, "step": 930 }, { "epoch": 0.07830603276068718, "grad_norm": 0.4725818634033203, "learning_rate": 2.3402737936175425e-06, "loss": 0.6819, "step": 931 }, { "epoch": 0.0783901423554892, "grad_norm": 0.6635422110557556, "learning_rate": 2.273187643180652e-06, "loss": 0.6987, "step": 932 }, { "epoch": 0.07847425195029123, "grad_norm": 0.5256288647651672, "learning_rate": 2.2070660169278166e-06, "loss": 0.5697, "step": 933 }, { "epoch": 0.07855836154509326, "grad_norm": 0.4704425036907196, "learning_rate": 2.141909567452793e-06, "loss": 0.4441, "step": 934 }, { "epoch": 0.07864247113989528, "grad_norm": 0.4934440553188324, "learning_rate": 2.0777189378234143e-06, "loss": 0.6377, "step": 935 }, { "epoch": 0.07872658073469731, "grad_norm": 0.5220550298690796, "learning_rate": 2.014494761575314e-06, "loss": 0.5492, "step": 936 }, { "epoch": 0.07881069032949933, "grad_norm": 0.7263022065162659, "learning_rate": 1.9522376627055583e-06, "loss": 0.6481, "step": 937 }, { "epoch": 0.07889479992430136, "grad_norm": 0.6368206739425659, "learning_rate": 1.8909482556666024e-06, "loss": 0.5843, "step": 938 }, { "epoch": 0.07897890951910339, "grad_norm": 0.522922933101654, "learning_rate": 1.8306271453601199e-06, "loss": 0.7028, "step": 939 }, { "epoch": 0.07906301911390543, "grad_norm": 0.5848362445831299, "learning_rate": 1.771274927131139e-06, "loss": 0.7176, "step": 940 }, { "epoch": 0.07914712870870745, "grad_norm": 0.5615982413291931, "learning_rate": 1.712892186762083e-06, "loss": 0.5697, "step": 941 }, { "epoch": 0.07923123830350948, "grad_norm": 0.4720976948738098, "learning_rate": 1.6554795004670388e-06, "loss": 0.4611, "step": 942 }, { "epoch": 0.0793153478983115, "grad_norm": 0.5845123529434204, "learning_rate": 1.5990374348860305e-06, "loss": 0.6374, "step": 943 }, { "epoch": 0.07939945749311353, "grad_norm": 0.5101219415664673, "learning_rate": 1.543566547079467e-06, "loss": 0.551, "step": 944 }, { "epoch": 0.07948356708791555, "grad_norm": 0.6655003428459167, "learning_rate": 1.4890673845226133e-06, "loss": 0.7873, "step": 945 }, { "epoch": 0.07956767668271758, "grad_norm": 0.552952766418457, "learning_rate": 1.4355404851001952e-06, "loss": 0.7553, "step": 946 }, { "epoch": 0.0796517862775196, "grad_norm": 0.37656185030937195, "learning_rate": 1.3829863771011253e-06, "loss": 0.5656, "step": 947 }, { "epoch": 0.07973589587232163, "grad_norm": 0.7558304667472839, "learning_rate": 1.3314055792131964e-06, "loss": 0.6068, "step": 948 }, { "epoch": 0.07982000546712366, "grad_norm": 0.584898054599762, "learning_rate": 1.280798600518085e-06, "loss": 0.5412, "step": 949 }, { "epoch": 0.07990411506192568, "grad_norm": 0.542145848274231, "learning_rate": 1.231165940486234e-06, "loss": 0.563, "step": 950 }, { "epoch": 0.07998822465672771, "grad_norm": 0.6324697732925415, "learning_rate": 1.1825080889719563e-06, "loss": 0.5728, "step": 951 }, { "epoch": 0.08007233425152974, "grad_norm": 0.5715826749801636, "learning_rate": 1.134825526208605e-06, "loss": 0.6769, "step": 952 }, { "epoch": 0.08015644384633178, "grad_norm": 0.5874225497245789, "learning_rate": 1.0881187228038215e-06, "loss": 0.579, "step": 953 }, { "epoch": 0.0802405534411338, "grad_norm": 0.5252524018287659, "learning_rate": 1.0423881397349068e-06, "loss": 0.6326, "step": 954 }, { "epoch": 0.08032466303593583, "grad_norm": 0.6097508668899536, "learning_rate": 9.976342283442463e-07, "loss": 0.6524, "step": 955 }, { "epoch": 0.08040877263073785, "grad_norm": 0.41464993357658386, "learning_rate": 9.538574303348813e-07, "loss": 0.4743, "step": 956 }, { "epoch": 0.08049288222553988, "grad_norm": 0.46832600235939026, "learning_rate": 9.110581777661331e-07, "loss": 0.5314, "step": 957 }, { "epoch": 0.0805769918203419, "grad_norm": 0.5717182159423828, "learning_rate": 8.692368930493521e-07, "loss": 0.7329, "step": 958 }, { "epoch": 0.08066110141514393, "grad_norm": 0.5042733550071716, "learning_rate": 8.283939889437209e-07, "loss": 0.7913, "step": 959 }, { "epoch": 0.08074521100994596, "grad_norm": 0.626025915145874, "learning_rate": 7.885298685522235e-07, "loss": 0.7545, "step": 960 }, { "epoch": 0.08082932060474798, "grad_norm": 0.5646237730979919, "learning_rate": 7.496449253176274e-07, "loss": 0.5513, "step": 961 }, { "epoch": 0.08091343019955001, "grad_norm": 0.6173761487007141, "learning_rate": 7.117395430186414e-07, "loss": 0.6928, "step": 962 }, { "epoch": 0.08099753979435204, "grad_norm": 0.5799218416213989, "learning_rate": 6.748140957660631e-07, "loss": 0.8141, "step": 963 }, { "epoch": 0.08108164938915406, "grad_norm": 0.5109354853630066, "learning_rate": 6.388689479991605e-07, "loss": 0.8059, "step": 964 }, { "epoch": 0.0811657589839561, "grad_norm": 0.5159839391708374, "learning_rate": 6.039044544820404e-07, "loss": 0.5749, "step": 965 }, { "epoch": 0.08124986857875813, "grad_norm": 0.6682405471801758, "learning_rate": 5.699209603001076e-07, "loss": 0.7188, "step": 966 }, { "epoch": 0.08133397817356015, "grad_norm": 0.8375939130783081, "learning_rate": 5.369188008567672e-07, "loss": 0.9425, "step": 967 }, { "epoch": 0.08141808776836218, "grad_norm": 0.6056286692619324, "learning_rate": 5.048983018699827e-07, "loss": 0.6488, "step": 968 }, { "epoch": 0.0815021973631642, "grad_norm": 0.5580015778541565, "learning_rate": 4.738597793691679e-07, "loss": 0.5895, "step": 969 }, { "epoch": 0.08158630695796623, "grad_norm": 0.5345575213432312, "learning_rate": 4.438035396920004e-07, "loss": 0.6993, "step": 970 }, { "epoch": 0.08167041655276826, "grad_norm": 0.5608043074607849, "learning_rate": 4.1472987948143473e-07, "loss": 0.7225, "step": 971 }, { "epoch": 0.08175452614757028, "grad_norm": 0.5627851486206055, "learning_rate": 3.866390856827495e-07, "loss": 0.6505, "step": 972 }, { "epoch": 0.08183863574237231, "grad_norm": 0.7156445384025574, "learning_rate": 3.595314355407609e-07, "loss": 0.8306, "step": 973 }, { "epoch": 0.08192274533717434, "grad_norm": 0.8838444352149963, "learning_rate": 3.3340719659701313e-07, "loss": 0.5709, "step": 974 }, { "epoch": 0.08200685493197636, "grad_norm": 0.5641219615936279, "learning_rate": 3.0826662668720364e-07, "loss": 0.7521, "step": 975 }, { "epoch": 0.08209096452677839, "grad_norm": 0.636716902256012, "learning_rate": 2.841099739386066e-07, "loss": 0.6791, "step": 976 }, { "epoch": 0.08217507412158041, "grad_norm": 0.5187475681304932, "learning_rate": 2.609374767676309e-07, "loss": 0.5493, "step": 977 }, { "epoch": 0.08225918371638245, "grad_norm": 0.555397629737854, "learning_rate": 2.387493638774774e-07, "loss": 0.8756, "step": 978 }, { "epoch": 0.08234329331118448, "grad_norm": 0.4511655271053314, "learning_rate": 2.175458542558517e-07, "loss": 0.43, "step": 979 }, { "epoch": 0.0824274029059865, "grad_norm": 0.7089412212371826, "learning_rate": 1.973271571728441e-07, "loss": 0.6912, "step": 980 }, { "epoch": 0.08251151250078853, "grad_norm": 0.46385011076927185, "learning_rate": 1.7809347217881966e-07, "loss": 0.6673, "step": 981 }, { "epoch": 0.08259562209559056, "grad_norm": 0.5081963539123535, "learning_rate": 1.598449891024978e-07, "loss": 0.6091, "step": 982 }, { "epoch": 0.08267973169039258, "grad_norm": 0.4684099853038788, "learning_rate": 1.425818880490315e-07, "loss": 0.6555, "step": 983 }, { "epoch": 0.08276384128519461, "grad_norm": 0.5321608185768127, "learning_rate": 1.2630433939825327e-07, "loss": 0.6384, "step": 984 }, { "epoch": 0.08284795087999663, "grad_norm": 0.5801177620887756, "learning_rate": 1.1101250380300965e-07, "loss": 0.6731, "step": 985 }, { "epoch": 0.08293206047479866, "grad_norm": 0.5255969166755676, "learning_rate": 9.670653218752934e-08, "loss": 0.6411, "step": 986 }, { "epoch": 0.08301617006960069, "grad_norm": 0.5361884236335754, "learning_rate": 8.33865657459909e-08, "loss": 0.4983, "step": 987 }, { "epoch": 0.08310027966440271, "grad_norm": 0.42637842893600464, "learning_rate": 7.105273594107953e-08, "loss": 0.501, "step": 988 }, { "epoch": 0.08318438925920474, "grad_norm": 0.5724627375602722, "learning_rate": 5.970516450271025e-08, "loss": 0.7244, "step": 989 }, { "epoch": 0.08326849885400676, "grad_norm": 0.5623000264167786, "learning_rate": 4.934396342684e-08, "loss": 0.6702, "step": 990 }, { "epoch": 0.0833526084488088, "grad_norm": 0.505596399307251, "learning_rate": 3.996923497434635e-08, "loss": 0.7967, "step": 991 }, { "epoch": 0.08343671804361083, "grad_norm": 0.7859603762626648, "learning_rate": 3.1581071670006015e-08, "loss": 0.7663, "step": 992 }, { "epoch": 0.08352082763841286, "grad_norm": 0.5909059643745422, "learning_rate": 2.417955630159563e-08, "loss": 0.6006, "step": 993 }, { "epoch": 0.08360493723321488, "grad_norm": 0.4647238552570343, "learning_rate": 1.7764761919103477e-08, "loss": 0.6949, "step": 994 }, { "epoch": 0.08368904682801691, "grad_norm": 0.5446833372116089, "learning_rate": 1.2336751833941229e-08, "loss": 0.5849, "step": 995 }, { "epoch": 0.08377315642281893, "grad_norm": 0.5875367522239685, "learning_rate": 7.895579618388827e-09, "loss": 0.769, "step": 996 }, { "epoch": 0.08385726601762096, "grad_norm": 0.769709587097168, "learning_rate": 4.4412891050171765e-09, "loss": 0.8165, "step": 997 }, { "epoch": 0.08394137561242299, "grad_norm": 0.5166672468185425, "learning_rate": 1.973914386288467e-09, "loss": 0.696, "step": 998 }, { "epoch": 0.08402548520722501, "grad_norm": 0.6133189797401428, "learning_rate": 4.934798141786879e-10, "loss": 0.7572, "step": 999 }, { "epoch": 0.08410959480202704, "grad_norm": 0.4853799045085907, "learning_rate": 0.0, "loss": 0.6055, "step": 1000 }, { "epoch": 0.08410959480202704, "eval_loss": 0.6270027160644531, "eval_runtime": 118.2303, "eval_samples_per_second": 21.179, "eval_steps_per_second": 21.179, "step": 1000 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.774858388504576e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }