diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6827 @@ +{ + "best_metric": 1.2795084714889526, + "best_model_checkpoint": "saved_model/c2s_jun2024/checkpoint-9692", + "epoch": 2.0, + "eval_steps": 500, + "global_step": 9692, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": NaN, + "learning_rate": 0.0, + "loss": 72.6113, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 18.23219108581543, + "learning_rate": 2.5e-06, + "loss": 74.5495, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 16.582162857055664, + "learning_rate": 7.5e-06, + "loss": 73.7367, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 14.804972648620605, + "learning_rate": 1.2e-05, + "loss": 72.8853, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 13.634269714355469, + "learning_rate": 1.7000000000000003e-05, + "loss": 70.9592, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 13.762855529785156, + "learning_rate": 2.2000000000000003e-05, + "loss": 66.9603, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 16.27646827697754, + "learning_rate": 2.7000000000000002e-05, + "loss": 61.4318, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 27.16312026977539, + "learning_rate": 3.15e-05, + "loss": 53.3651, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 28.43309783935547, + "learning_rate": 3.65e-05, + "loss": 33.9745, + "step": 80 + }, + { + "epoch": 0.02, + "grad_norm": 12.292057991027832, + "learning_rate": 4.15e-05, + "loss": 13.4627, + "step": 90 + }, + { + "epoch": 0.02, + "grad_norm": 9.148832321166992, + "learning_rate": 4.6500000000000005e-05, + "loss": 6.8387, + "step": 100 + }, + { + "epoch": 0.02, + "grad_norm": 6.579999923706055, + "learning_rate": 5.1500000000000005e-05, + "loss": 4.7847, + "step": 110 + }, + { + "epoch": 0.02, + "grad_norm": 6.650771141052246, + "learning_rate": 5.65e-05, + "loss": 4.1684, + "step": 120 + }, + { + "epoch": 0.03, + "grad_norm": 6.5379791259765625, + "learning_rate": 6.15e-05, + "loss": 3.8221, + "step": 130 + }, + { + "epoch": 0.03, + "grad_norm": 6.095062732696533, + "learning_rate": 6.65e-05, + "loss": 3.5635, + "step": 140 + }, + { + "epoch": 0.03, + "grad_norm": 7.0580973625183105, + "learning_rate": 7.15e-05, + "loss": 3.4446, + "step": 150 + }, + { + "epoch": 0.03, + "grad_norm": 6.517209053039551, + "learning_rate": 7.65e-05, + "loss": 3.2972, + "step": 160 + }, + { + "epoch": 0.04, + "grad_norm": 5.954787731170654, + "learning_rate": 8.15e-05, + "loss": 3.2621, + "step": 170 + }, + { + "epoch": 0.04, + "grad_norm": 6.085761547088623, + "learning_rate": 8.65e-05, + "loss": 3.2072, + "step": 180 + }, + { + "epoch": 0.04, + "grad_norm": 6.4346442222595215, + "learning_rate": 9.15e-05, + "loss": 3.0868, + "step": 190 + }, + { + "epoch": 0.04, + "grad_norm": 6.535578727722168, + "learning_rate": 9.65e-05, + "loss": 3.0201, + "step": 200 + }, + { + "epoch": 0.04, + "grad_norm": 5.239222526550293, + "learning_rate": 9.999378367177788e-05, + "loss": 2.9792, + "step": 210 + }, + { + "epoch": 0.05, + "grad_norm": 5.576033592224121, + "learning_rate": 9.997306257770411e-05, + "loss": 3.0079, + "step": 220 + }, + { + "epoch": 0.05, + "grad_norm": 5.455887794494629, + "learning_rate": 9.995234148363033e-05, + "loss": 2.8296, + "step": 230 + }, + { + "epoch": 0.05, + "grad_norm": 4.566660404205322, + "learning_rate": 9.993162038955657e-05, + "loss": 2.7655, + "step": 240 + }, + { + "epoch": 0.05, + "grad_norm": 4.954742908477783, + "learning_rate": 9.99108992954828e-05, + "loss": 2.5655, + "step": 250 + }, + { + "epoch": 0.05, + "grad_norm": 3.5510752201080322, + "learning_rate": 9.989017820140904e-05, + "loss": 2.4527, + "step": 260 + }, + { + "epoch": 0.06, + "grad_norm": 3.358351230621338, + "learning_rate": 9.986945710733528e-05, + "loss": 2.2679, + "step": 270 + }, + { + "epoch": 0.06, + "grad_norm": 2.9349524974823, + "learning_rate": 9.98487360132615e-05, + "loss": 2.1456, + "step": 280 + }, + { + "epoch": 0.06, + "grad_norm": 3.3249402046203613, + "learning_rate": 9.982801491918775e-05, + "loss": 2.0943, + "step": 290 + }, + { + "epoch": 0.06, + "grad_norm": 2.92372989654541, + "learning_rate": 9.980729382511397e-05, + "loss": 2.0194, + "step": 300 + }, + { + "epoch": 0.06, + "grad_norm": 3.124359130859375, + "learning_rate": 9.97865727310402e-05, + "loss": 1.9523, + "step": 310 + }, + { + "epoch": 0.07, + "grad_norm": 2.372561454772949, + "learning_rate": 9.976585163696644e-05, + "loss": 1.905, + "step": 320 + }, + { + "epoch": 0.07, + "grad_norm": 2.5799174308776855, + "learning_rate": 9.974513054289267e-05, + "loss": 1.9159, + "step": 330 + }, + { + "epoch": 0.07, + "grad_norm": 2.1826956272125244, + "learning_rate": 9.97244094488189e-05, + "loss": 1.8362, + "step": 340 + }, + { + "epoch": 0.07, + "grad_norm": 2.1002371311187744, + "learning_rate": 9.970368835474514e-05, + "loss": 1.844, + "step": 350 + }, + { + "epoch": 0.07, + "grad_norm": 3.1345527172088623, + "learning_rate": 9.968296726067136e-05, + "loss": 1.8084, + "step": 360 + }, + { + "epoch": 0.08, + "grad_norm": 1.9457321166992188, + "learning_rate": 9.96622461665976e-05, + "loss": 1.7775, + "step": 370 + }, + { + "epoch": 0.08, + "grad_norm": 1.9511795043945312, + "learning_rate": 9.964152507252383e-05, + "loss": 1.7872, + "step": 380 + }, + { + "epoch": 0.08, + "grad_norm": 1.9775121212005615, + "learning_rate": 9.962080397845007e-05, + "loss": 1.7665, + "step": 390 + }, + { + "epoch": 0.08, + "grad_norm": 1.9561394453048706, + "learning_rate": 9.96000828843763e-05, + "loss": 1.7664, + "step": 400 + }, + { + "epoch": 0.08, + "grad_norm": 2.7436013221740723, + "learning_rate": 9.957936179030253e-05, + "loss": 1.7016, + "step": 410 + }, + { + "epoch": 0.09, + "grad_norm": 1.6739649772644043, + "learning_rate": 9.955864069622876e-05, + "loss": 1.7219, + "step": 420 + }, + { + "epoch": 0.09, + "grad_norm": 1.940246343612671, + "learning_rate": 9.9537919602155e-05, + "loss": 1.7174, + "step": 430 + }, + { + "epoch": 0.09, + "grad_norm": 1.8286395072937012, + "learning_rate": 9.951719850808123e-05, + "loss": 1.6698, + "step": 440 + }, + { + "epoch": 0.09, + "grad_norm": 2.0042293071746826, + "learning_rate": 9.949647741400747e-05, + "loss": 1.6908, + "step": 450 + }, + { + "epoch": 0.09, + "grad_norm": 1.6445887088775635, + "learning_rate": 9.94757563199337e-05, + "loss": 1.6796, + "step": 460 + }, + { + "epoch": 0.1, + "grad_norm": 2.068713903427124, + "learning_rate": 9.945503522585992e-05, + "loss": 1.6685, + "step": 470 + }, + { + "epoch": 0.1, + "grad_norm": 1.8053257465362549, + "learning_rate": 9.943431413178617e-05, + "loss": 1.6522, + "step": 480 + }, + { + "epoch": 0.1, + "grad_norm": 1.580461859703064, + "learning_rate": 9.94135930377124e-05, + "loss": 1.6425, + "step": 490 + }, + { + "epoch": 0.1, + "grad_norm": 1.607007384300232, + "learning_rate": 9.939287194363863e-05, + "loss": 1.632, + "step": 500 + }, + { + "epoch": 0.11, + "grad_norm": 1.649885654449463, + "learning_rate": 9.937215084956486e-05, + "loss": 1.5966, + "step": 510 + }, + { + "epoch": 0.11, + "grad_norm": 1.7667235136032104, + "learning_rate": 9.93514297554911e-05, + "loss": 1.5942, + "step": 520 + }, + { + "epoch": 0.11, + "grad_norm": 1.595691442489624, + "learning_rate": 9.933070866141732e-05, + "loss": 1.609, + "step": 530 + }, + { + "epoch": 0.11, + "grad_norm": 1.5232254266738892, + "learning_rate": 9.930998756734357e-05, + "loss": 1.5614, + "step": 540 + }, + { + "epoch": 0.11, + "grad_norm": 1.4872910976409912, + "learning_rate": 9.928926647326979e-05, + "loss": 1.5657, + "step": 550 + }, + { + "epoch": 0.12, + "grad_norm": 1.609491229057312, + "learning_rate": 9.926854537919603e-05, + "loss": 1.5935, + "step": 560 + }, + { + "epoch": 0.12, + "grad_norm": 1.6403166055679321, + "learning_rate": 9.924782428512226e-05, + "loss": 1.6159, + "step": 570 + }, + { + "epoch": 0.12, + "grad_norm": 1.6648396253585815, + "learning_rate": 9.922710319104848e-05, + "loss": 1.6012, + "step": 580 + }, + { + "epoch": 0.12, + "grad_norm": 1.6322458982467651, + "learning_rate": 9.920638209697473e-05, + "loss": 1.5541, + "step": 590 + }, + { + "epoch": 0.12, + "grad_norm": 1.5503164529800415, + "learning_rate": 9.918566100290095e-05, + "loss": 1.5733, + "step": 600 + }, + { + "epoch": 0.13, + "grad_norm": 1.6093209981918335, + "learning_rate": 9.916493990882719e-05, + "loss": 1.5144, + "step": 610 + }, + { + "epoch": 0.13, + "grad_norm": 1.6871626377105713, + "learning_rate": 9.914421881475342e-05, + "loss": 1.573, + "step": 620 + }, + { + "epoch": 0.13, + "grad_norm": 1.7600977420806885, + "learning_rate": 9.912349772067966e-05, + "loss": 1.5577, + "step": 630 + }, + { + "epoch": 0.13, + "grad_norm": 1.4892425537109375, + "learning_rate": 9.910277662660588e-05, + "loss": 1.5751, + "step": 640 + }, + { + "epoch": 0.13, + "grad_norm": 1.5667476654052734, + "learning_rate": 9.908205553253213e-05, + "loss": 1.5298, + "step": 650 + }, + { + "epoch": 0.14, + "grad_norm": 1.3411659002304077, + "learning_rate": 9.906133443845835e-05, + "loss": 1.5409, + "step": 660 + }, + { + "epoch": 0.14, + "grad_norm": 1.5329233407974243, + "learning_rate": 9.904061334438459e-05, + "loss": 1.5165, + "step": 670 + }, + { + "epoch": 0.14, + "grad_norm": 1.5168925523757935, + "learning_rate": 9.901989225031082e-05, + "loss": 1.5222, + "step": 680 + }, + { + "epoch": 0.14, + "grad_norm": 1.6860578060150146, + "learning_rate": 9.899917115623706e-05, + "loss": 1.5179, + "step": 690 + }, + { + "epoch": 0.14, + "grad_norm": 1.4629698991775513, + "learning_rate": 9.897845006216329e-05, + "loss": 1.5593, + "step": 700 + }, + { + "epoch": 0.15, + "grad_norm": 1.3701924085617065, + "learning_rate": 9.895772896808953e-05, + "loss": 1.52, + "step": 710 + }, + { + "epoch": 0.15, + "grad_norm": 1.4276106357574463, + "learning_rate": 9.893700787401575e-05, + "loss": 1.5546, + "step": 720 + }, + { + "epoch": 0.15, + "grad_norm": 1.5609627962112427, + "learning_rate": 9.8916286779942e-05, + "loss": 1.5071, + "step": 730 + }, + { + "epoch": 0.15, + "grad_norm": 1.9602493047714233, + "learning_rate": 9.889556568586822e-05, + "loss": 1.5192, + "step": 740 + }, + { + "epoch": 0.15, + "grad_norm": 1.4681726694107056, + "learning_rate": 9.887484459179444e-05, + "loss": 1.5065, + "step": 750 + }, + { + "epoch": 0.16, + "grad_norm": 1.547143816947937, + "learning_rate": 9.885412349772069e-05, + "loss": 1.5303, + "step": 760 + }, + { + "epoch": 0.16, + "grad_norm": 1.7585084438323975, + "learning_rate": 9.883340240364691e-05, + "loss": 1.5412, + "step": 770 + }, + { + "epoch": 0.16, + "grad_norm": 1.4589301347732544, + "learning_rate": 9.881268130957315e-05, + "loss": 1.5008, + "step": 780 + }, + { + "epoch": 0.16, + "grad_norm": 1.5748664140701294, + "learning_rate": 9.879196021549938e-05, + "loss": 1.4856, + "step": 790 + }, + { + "epoch": 0.17, + "grad_norm": 1.4392333030700684, + "learning_rate": 9.877123912142562e-05, + "loss": 1.4593, + "step": 800 + }, + { + "epoch": 0.17, + "grad_norm": 1.439276933670044, + "learning_rate": 9.875051802735185e-05, + "loss": 1.4565, + "step": 810 + }, + { + "epoch": 0.17, + "grad_norm": 1.5028575658798218, + "learning_rate": 9.872979693327809e-05, + "loss": 1.5106, + "step": 820 + }, + { + "epoch": 0.17, + "grad_norm": 1.5902388095855713, + "learning_rate": 9.870907583920431e-05, + "loss": 1.459, + "step": 830 + }, + { + "epoch": 0.17, + "grad_norm": 1.5270620584487915, + "learning_rate": 9.868835474513056e-05, + "loss": 1.4705, + "step": 840 + }, + { + "epoch": 0.18, + "grad_norm": 1.354683518409729, + "learning_rate": 9.866763365105678e-05, + "loss": 1.4468, + "step": 850 + }, + { + "epoch": 0.18, + "grad_norm": 1.3697203397750854, + "learning_rate": 9.864691255698301e-05, + "loss": 1.4669, + "step": 860 + }, + { + "epoch": 0.18, + "grad_norm": 1.5006585121154785, + "learning_rate": 9.862619146290925e-05, + "loss": 1.4641, + "step": 870 + }, + { + "epoch": 0.18, + "grad_norm": 1.3566001653671265, + "learning_rate": 9.860547036883548e-05, + "loss": 1.4545, + "step": 880 + }, + { + "epoch": 0.18, + "grad_norm": 1.3500274419784546, + "learning_rate": 9.85847492747617e-05, + "loss": 1.477, + "step": 890 + }, + { + "epoch": 0.19, + "grad_norm": 1.3306142091751099, + "learning_rate": 9.856402818068794e-05, + "loss": 1.4469, + "step": 900 + }, + { + "epoch": 0.19, + "grad_norm": 1.2983628511428833, + "learning_rate": 9.854330708661418e-05, + "loss": 1.4603, + "step": 910 + }, + { + "epoch": 0.19, + "grad_norm": 1.3828344345092773, + "learning_rate": 9.852258599254041e-05, + "loss": 1.4686, + "step": 920 + }, + { + "epoch": 0.19, + "grad_norm": 1.427741527557373, + "learning_rate": 9.850186489846665e-05, + "loss": 1.4756, + "step": 930 + }, + { + "epoch": 0.19, + "grad_norm": 1.267404556274414, + "learning_rate": 9.848114380439287e-05, + "loss": 1.4777, + "step": 940 + }, + { + "epoch": 0.2, + "grad_norm": 1.3213374614715576, + "learning_rate": 9.846042271031912e-05, + "loss": 1.4526, + "step": 950 + }, + { + "epoch": 0.2, + "grad_norm": 1.6813840866088867, + "learning_rate": 9.843970161624534e-05, + "loss": 1.49, + "step": 960 + }, + { + "epoch": 0.2, + "grad_norm": 1.2110322713851929, + "learning_rate": 9.841898052217157e-05, + "loss": 1.4796, + "step": 970 + }, + { + "epoch": 0.2, + "grad_norm": 1.3316526412963867, + "learning_rate": 9.839825942809781e-05, + "loss": 1.4523, + "step": 980 + }, + { + "epoch": 0.2, + "grad_norm": 1.313766598701477, + "learning_rate": 9.837753833402404e-05, + "loss": 1.4195, + "step": 990 + }, + { + "epoch": 0.21, + "grad_norm": 1.4528905153274536, + "learning_rate": 9.835681723995028e-05, + "loss": 1.4433, + "step": 1000 + }, + { + "epoch": 0.21, + "grad_norm": 1.3782751560211182, + "learning_rate": 9.833609614587651e-05, + "loss": 1.4673, + "step": 1010 + }, + { + "epoch": 0.21, + "grad_norm": 1.5674275159835815, + "learning_rate": 9.831537505180273e-05, + "loss": 1.4296, + "step": 1020 + }, + { + "epoch": 0.21, + "grad_norm": 1.3901402950286865, + "learning_rate": 9.829465395772898e-05, + "loss": 1.4516, + "step": 1030 + }, + { + "epoch": 0.21, + "grad_norm": 1.1594748497009277, + "learning_rate": 9.82739328636552e-05, + "loss": 1.4225, + "step": 1040 + }, + { + "epoch": 0.22, + "grad_norm": 1.2297048568725586, + "learning_rate": 9.825321176958144e-05, + "loss": 1.4416, + "step": 1050 + }, + { + "epoch": 0.22, + "grad_norm": 1.1866023540496826, + "learning_rate": 9.823249067550768e-05, + "loss": 1.444, + "step": 1060 + }, + { + "epoch": 0.22, + "grad_norm": 1.407461404800415, + "learning_rate": 9.82117695814339e-05, + "loss": 1.4415, + "step": 1070 + }, + { + "epoch": 0.22, + "grad_norm": 1.302164912223816, + "learning_rate": 9.819104848736013e-05, + "loss": 1.4405, + "step": 1080 + }, + { + "epoch": 0.22, + "grad_norm": 1.3704490661621094, + "learning_rate": 9.817032739328637e-05, + "loss": 1.4408, + "step": 1090 + }, + { + "epoch": 0.23, + "grad_norm": 1.2673710584640503, + "learning_rate": 9.81496062992126e-05, + "loss": 1.4221, + "step": 1100 + }, + { + "epoch": 0.23, + "grad_norm": 1.3337206840515137, + "learning_rate": 9.812888520513884e-05, + "loss": 1.4193, + "step": 1110 + }, + { + "epoch": 0.23, + "grad_norm": 1.3280502557754517, + "learning_rate": 9.810816411106507e-05, + "loss": 1.4736, + "step": 1120 + }, + { + "epoch": 0.23, + "grad_norm": 1.2532864809036255, + "learning_rate": 9.80874430169913e-05, + "loss": 1.4665, + "step": 1130 + }, + { + "epoch": 0.24, + "grad_norm": 1.2475242614746094, + "learning_rate": 9.806672192291754e-05, + "loss": 1.426, + "step": 1140 + }, + { + "epoch": 0.24, + "grad_norm": 1.7034567594528198, + "learning_rate": 9.804600082884376e-05, + "loss": 1.4473, + "step": 1150 + }, + { + "epoch": 0.24, + "grad_norm": 1.3586080074310303, + "learning_rate": 9.802527973477e-05, + "loss": 1.3959, + "step": 1160 + }, + { + "epoch": 0.24, + "grad_norm": 1.2611415386199951, + "learning_rate": 9.800455864069623e-05, + "loss": 1.4401, + "step": 1170 + }, + { + "epoch": 0.24, + "grad_norm": 1.3101681470870972, + "learning_rate": 9.798383754662247e-05, + "loss": 1.4431, + "step": 1180 + }, + { + "epoch": 0.25, + "grad_norm": 1.1770988702774048, + "learning_rate": 9.796311645254869e-05, + "loss": 1.4108, + "step": 1190 + }, + { + "epoch": 0.25, + "grad_norm": 1.2325702905654907, + "learning_rate": 9.794239535847494e-05, + "loss": 1.4141, + "step": 1200 + }, + { + "epoch": 0.25, + "grad_norm": 1.2543164491653442, + "learning_rate": 9.792167426440116e-05, + "loss": 1.4133, + "step": 1210 + }, + { + "epoch": 0.25, + "grad_norm": 1.1258199214935303, + "learning_rate": 9.79009531703274e-05, + "loss": 1.4041, + "step": 1220 + }, + { + "epoch": 0.25, + "grad_norm": 1.3423455953598022, + "learning_rate": 9.788023207625363e-05, + "loss": 1.4144, + "step": 1230 + }, + { + "epoch": 0.26, + "grad_norm": 1.248947024345398, + "learning_rate": 9.785951098217985e-05, + "loss": 1.4043, + "step": 1240 + }, + { + "epoch": 0.26, + "grad_norm": 1.129650354385376, + "learning_rate": 9.78387898881061e-05, + "loss": 1.4216, + "step": 1250 + }, + { + "epoch": 0.26, + "grad_norm": 1.2218910455703735, + "learning_rate": 9.781806879403232e-05, + "loss": 1.3976, + "step": 1260 + }, + { + "epoch": 0.26, + "grad_norm": 1.153981328010559, + "learning_rate": 9.779734769995856e-05, + "loss": 1.4304, + "step": 1270 + }, + { + "epoch": 0.26, + "grad_norm": 1.1724766492843628, + "learning_rate": 9.77766266058848e-05, + "loss": 1.43, + "step": 1280 + }, + { + "epoch": 0.27, + "grad_norm": 1.2830730676651, + "learning_rate": 9.775590551181103e-05, + "loss": 1.4429, + "step": 1290 + }, + { + "epoch": 0.27, + "grad_norm": 1.2320913076400757, + "learning_rate": 9.773518441773725e-05, + "loss": 1.3898, + "step": 1300 + }, + { + "epoch": 0.27, + "grad_norm": 1.2313491106033325, + "learning_rate": 9.77144633236635e-05, + "loss": 1.4273, + "step": 1310 + }, + { + "epoch": 0.27, + "grad_norm": 1.1946086883544922, + "learning_rate": 9.769374222958972e-05, + "loss": 1.4234, + "step": 1320 + }, + { + "epoch": 0.27, + "grad_norm": 1.127300500869751, + "learning_rate": 9.767302113551596e-05, + "loss": 1.4144, + "step": 1330 + }, + { + "epoch": 0.28, + "grad_norm": 1.4888228178024292, + "learning_rate": 9.765230004144219e-05, + "loss": 1.4092, + "step": 1340 + }, + { + "epoch": 0.28, + "grad_norm": 1.3795928955078125, + "learning_rate": 9.763157894736843e-05, + "loss": 1.3647, + "step": 1350 + }, + { + "epoch": 0.28, + "grad_norm": 1.1433610916137695, + "learning_rate": 9.761085785329466e-05, + "loss": 1.415, + "step": 1360 + }, + { + "epoch": 0.28, + "grad_norm": 1.040281891822815, + "learning_rate": 9.75901367592209e-05, + "loss": 1.4244, + "step": 1370 + }, + { + "epoch": 0.28, + "grad_norm": 1.1311726570129395, + "learning_rate": 9.756941566514712e-05, + "loss": 1.3852, + "step": 1380 + }, + { + "epoch": 0.29, + "grad_norm": 1.2847346067428589, + "learning_rate": 9.754869457107337e-05, + "loss": 1.4225, + "step": 1390 + }, + { + "epoch": 0.29, + "grad_norm": 1.2235894203186035, + "learning_rate": 9.752797347699959e-05, + "loss": 1.3973, + "step": 1400 + }, + { + "epoch": 0.29, + "grad_norm": 1.1802481412887573, + "learning_rate": 9.750725238292582e-05, + "loss": 1.3923, + "step": 1410 + }, + { + "epoch": 0.29, + "grad_norm": 1.141739010810852, + "learning_rate": 9.748653128885206e-05, + "loss": 1.4049, + "step": 1420 + }, + { + "epoch": 0.3, + "grad_norm": 1.2155243158340454, + "learning_rate": 9.746581019477828e-05, + "loss": 1.3866, + "step": 1430 + }, + { + "epoch": 0.3, + "grad_norm": 1.4717819690704346, + "learning_rate": 9.744508910070453e-05, + "loss": 1.4264, + "step": 1440 + }, + { + "epoch": 0.3, + "grad_norm": 1.1440094709396362, + "learning_rate": 9.742436800663075e-05, + "loss": 1.4291, + "step": 1450 + }, + { + "epoch": 0.3, + "grad_norm": 1.3254936933517456, + "learning_rate": 9.740364691255699e-05, + "loss": 1.3973, + "step": 1460 + }, + { + "epoch": 0.3, + "grad_norm": 1.2041431665420532, + "learning_rate": 9.738292581848322e-05, + "loss": 1.3779, + "step": 1470 + }, + { + "epoch": 0.31, + "grad_norm": 1.1422394514083862, + "learning_rate": 9.736220472440946e-05, + "loss": 1.3918, + "step": 1480 + }, + { + "epoch": 0.31, + "grad_norm": 1.2341557741165161, + "learning_rate": 9.734148363033568e-05, + "loss": 1.4065, + "step": 1490 + }, + { + "epoch": 0.31, + "grad_norm": 1.1723967790603638, + "learning_rate": 9.732076253626193e-05, + "loss": 1.4003, + "step": 1500 + }, + { + "epoch": 0.31, + "grad_norm": 1.279010534286499, + "learning_rate": 9.730004144218815e-05, + "loss": 1.3762, + "step": 1510 + }, + { + "epoch": 0.31, + "grad_norm": 1.2639541625976562, + "learning_rate": 9.727932034811438e-05, + "loss": 1.3932, + "step": 1520 + }, + { + "epoch": 0.32, + "grad_norm": 1.1406500339508057, + "learning_rate": 9.725859925404062e-05, + "loss": 1.4318, + "step": 1530 + }, + { + "epoch": 0.32, + "grad_norm": 1.1991297006607056, + "learning_rate": 9.723787815996685e-05, + "loss": 1.3742, + "step": 1540 + }, + { + "epoch": 0.32, + "grad_norm": 1.1058017015457153, + "learning_rate": 9.721715706589309e-05, + "loss": 1.3975, + "step": 1550 + }, + { + "epoch": 0.32, + "grad_norm": 1.3658838272094727, + "learning_rate": 9.719643597181932e-05, + "loss": 1.4245, + "step": 1560 + }, + { + "epoch": 0.32, + "grad_norm": 1.0663561820983887, + "learning_rate": 9.717571487774555e-05, + "loss": 1.3779, + "step": 1570 + }, + { + "epoch": 0.33, + "grad_norm": 1.1523654460906982, + "learning_rate": 9.715499378367178e-05, + "loss": 1.4306, + "step": 1580 + }, + { + "epoch": 0.33, + "grad_norm": 1.223913311958313, + "learning_rate": 9.713427268959802e-05, + "loss": 1.3748, + "step": 1590 + }, + { + "epoch": 0.33, + "grad_norm": 1.0876872539520264, + "learning_rate": 9.711355159552424e-05, + "loss": 1.3806, + "step": 1600 + }, + { + "epoch": 0.33, + "grad_norm": 1.3317033052444458, + "learning_rate": 9.709283050145049e-05, + "loss": 1.3586, + "step": 1610 + }, + { + "epoch": 0.33, + "grad_norm": 1.2402222156524658, + "learning_rate": 9.707210940737671e-05, + "loss": 1.3886, + "step": 1620 + }, + { + "epoch": 0.34, + "grad_norm": 1.1467841863632202, + "learning_rate": 9.705138831330294e-05, + "loss": 1.3634, + "step": 1630 + }, + { + "epoch": 0.34, + "grad_norm": 1.1589218378067017, + "learning_rate": 9.703066721922918e-05, + "loss": 1.3466, + "step": 1640 + }, + { + "epoch": 0.34, + "grad_norm": 0.9369345307350159, + "learning_rate": 9.700994612515541e-05, + "loss": 1.3819, + "step": 1650 + }, + { + "epoch": 0.34, + "grad_norm": 1.0450528860092163, + "learning_rate": 9.698922503108165e-05, + "loss": 1.3482, + "step": 1660 + }, + { + "epoch": 0.34, + "grad_norm": 1.0236886739730835, + "learning_rate": 9.696850393700788e-05, + "loss": 1.3468, + "step": 1670 + }, + { + "epoch": 0.35, + "grad_norm": 1.0324066877365112, + "learning_rate": 9.69477828429341e-05, + "loss": 1.3926, + "step": 1680 + }, + { + "epoch": 0.35, + "grad_norm": 1.1705087423324585, + "learning_rate": 9.692706174886035e-05, + "loss": 1.3547, + "step": 1690 + }, + { + "epoch": 0.35, + "grad_norm": 1.1479854583740234, + "learning_rate": 9.690634065478658e-05, + "loss": 1.3517, + "step": 1700 + }, + { + "epoch": 0.35, + "grad_norm": 1.1700282096862793, + "learning_rate": 9.688561956071281e-05, + "loss": 1.3635, + "step": 1710 + }, + { + "epoch": 0.35, + "grad_norm": 1.079822301864624, + "learning_rate": 9.686489846663905e-05, + "loss": 1.3878, + "step": 1720 + }, + { + "epoch": 0.36, + "grad_norm": 1.188466191291809, + "learning_rate": 9.684417737256528e-05, + "loss": 1.36, + "step": 1730 + }, + { + "epoch": 0.36, + "grad_norm": 1.1050995588302612, + "learning_rate": 9.68234562784915e-05, + "loss": 1.3513, + "step": 1740 + }, + { + "epoch": 0.36, + "grad_norm": 1.2480050325393677, + "learning_rate": 9.680273518441774e-05, + "loss": 1.362, + "step": 1750 + }, + { + "epoch": 0.36, + "grad_norm": 1.1782851219177246, + "learning_rate": 9.678201409034397e-05, + "loss": 1.378, + "step": 1760 + }, + { + "epoch": 0.37, + "grad_norm": 1.1327308416366577, + "learning_rate": 9.676129299627021e-05, + "loss": 1.3836, + "step": 1770 + }, + { + "epoch": 0.37, + "grad_norm": 1.0974417924880981, + "learning_rate": 9.674057190219644e-05, + "loss": 1.3589, + "step": 1780 + }, + { + "epoch": 0.37, + "grad_norm": 1.1006550788879395, + "learning_rate": 9.671985080812266e-05, + "loss": 1.3734, + "step": 1790 + }, + { + "epoch": 0.37, + "grad_norm": 1.0745372772216797, + "learning_rate": 9.669912971404891e-05, + "loss": 1.4078, + "step": 1800 + }, + { + "epoch": 0.37, + "grad_norm": 1.2572031021118164, + "learning_rate": 9.667840861997513e-05, + "loss": 1.3535, + "step": 1810 + }, + { + "epoch": 0.38, + "grad_norm": 1.065767526626587, + "learning_rate": 9.665768752590137e-05, + "loss": 1.3657, + "step": 1820 + }, + { + "epoch": 0.38, + "grad_norm": 1.2773245573043823, + "learning_rate": 9.66369664318276e-05, + "loss": 1.3813, + "step": 1830 + }, + { + "epoch": 0.38, + "grad_norm": 1.0642096996307373, + "learning_rate": 9.661624533775384e-05, + "loss": 1.3829, + "step": 1840 + }, + { + "epoch": 0.38, + "grad_norm": 1.1348739862442017, + "learning_rate": 9.659552424368008e-05, + "loss": 1.3864, + "step": 1850 + }, + { + "epoch": 0.38, + "grad_norm": 1.136107087135315, + "learning_rate": 9.657480314960631e-05, + "loss": 1.3523, + "step": 1860 + }, + { + "epoch": 0.39, + "grad_norm": 1.1533474922180176, + "learning_rate": 9.655408205553253e-05, + "loss": 1.3669, + "step": 1870 + }, + { + "epoch": 0.39, + "grad_norm": 1.1027289628982544, + "learning_rate": 9.653336096145878e-05, + "loss": 1.3256, + "step": 1880 + }, + { + "epoch": 0.39, + "grad_norm": 0.9988449811935425, + "learning_rate": 9.6512639867385e-05, + "loss": 1.4024, + "step": 1890 + }, + { + "epoch": 0.39, + "grad_norm": 1.2975176572799683, + "learning_rate": 9.649191877331124e-05, + "loss": 1.3751, + "step": 1900 + }, + { + "epoch": 0.39, + "grad_norm": 1.2186543941497803, + "learning_rate": 9.647119767923747e-05, + "loss": 1.3444, + "step": 1910 + }, + { + "epoch": 0.4, + "grad_norm": 1.1342490911483765, + "learning_rate": 9.64504765851637e-05, + "loss": 1.3449, + "step": 1920 + }, + { + "epoch": 0.4, + "grad_norm": 1.148695707321167, + "learning_rate": 9.642975549108993e-05, + "loss": 1.3325, + "step": 1930 + }, + { + "epoch": 0.4, + "grad_norm": 0.9545331001281738, + "learning_rate": 9.640903439701616e-05, + "loss": 1.3375, + "step": 1940 + }, + { + "epoch": 0.4, + "grad_norm": 1.0941437482833862, + "learning_rate": 9.63883133029424e-05, + "loss": 1.3671, + "step": 1950 + }, + { + "epoch": 0.4, + "grad_norm": 1.0803030729293823, + "learning_rate": 9.636759220886863e-05, + "loss": 1.3648, + "step": 1960 + }, + { + "epoch": 0.41, + "grad_norm": 1.0937373638153076, + "learning_rate": 9.634687111479487e-05, + "loss": 1.3518, + "step": 1970 + }, + { + "epoch": 0.41, + "grad_norm": 1.1884483098983765, + "learning_rate": 9.632615002072109e-05, + "loss": 1.3461, + "step": 1980 + }, + { + "epoch": 0.41, + "grad_norm": 1.1179327964782715, + "learning_rate": 9.630542892664734e-05, + "loss": 1.3765, + "step": 1990 + }, + { + "epoch": 0.41, + "grad_norm": 0.9843894839286804, + "learning_rate": 9.628470783257356e-05, + "loss": 1.3379, + "step": 2000 + }, + { + "epoch": 0.41, + "grad_norm": 1.0279515981674194, + "learning_rate": 9.62639867384998e-05, + "loss": 1.3389, + "step": 2010 + }, + { + "epoch": 0.42, + "grad_norm": 1.0797231197357178, + "learning_rate": 9.624326564442603e-05, + "loss": 1.346, + "step": 2020 + }, + { + "epoch": 0.42, + "grad_norm": 1.1976298093795776, + "learning_rate": 9.622254455035227e-05, + "loss": 1.3366, + "step": 2030 + }, + { + "epoch": 0.42, + "grad_norm": 1.057880163192749, + "learning_rate": 9.620182345627849e-05, + "loss": 1.3264, + "step": 2040 + }, + { + "epoch": 0.42, + "grad_norm": 1.1059492826461792, + "learning_rate": 9.618110236220474e-05, + "loss": 1.3446, + "step": 2050 + }, + { + "epoch": 0.43, + "grad_norm": 1.0970298051834106, + "learning_rate": 9.616038126813096e-05, + "loss": 1.3521, + "step": 2060 + }, + { + "epoch": 0.43, + "grad_norm": 1.0951462984085083, + "learning_rate": 9.61396601740572e-05, + "loss": 1.3669, + "step": 2070 + }, + { + "epoch": 0.43, + "grad_norm": 1.0926049947738647, + "learning_rate": 9.611893907998343e-05, + "loss": 1.354, + "step": 2080 + }, + { + "epoch": 0.43, + "grad_norm": 1.0136979818344116, + "learning_rate": 9.609821798590965e-05, + "loss": 1.3321, + "step": 2090 + }, + { + "epoch": 0.43, + "grad_norm": 1.129214882850647, + "learning_rate": 9.60774968918359e-05, + "loss": 1.382, + "step": 2100 + }, + { + "epoch": 0.44, + "grad_norm": 1.1166954040527344, + "learning_rate": 9.605677579776212e-05, + "loss": 1.3337, + "step": 2110 + }, + { + "epoch": 0.44, + "grad_norm": 1.204231858253479, + "learning_rate": 9.603605470368836e-05, + "loss": 1.3642, + "step": 2120 + }, + { + "epoch": 0.44, + "grad_norm": 1.0265048742294312, + "learning_rate": 9.601533360961459e-05, + "loss": 1.3662, + "step": 2130 + }, + { + "epoch": 0.44, + "grad_norm": 1.0513389110565186, + "learning_rate": 9.599461251554083e-05, + "loss": 1.3395, + "step": 2140 + }, + { + "epoch": 0.44, + "grad_norm": 1.17727792263031, + "learning_rate": 9.597389142146705e-05, + "loss": 1.3738, + "step": 2150 + }, + { + "epoch": 0.45, + "grad_norm": 1.0676214694976807, + "learning_rate": 9.59531703273933e-05, + "loss": 1.3383, + "step": 2160 + }, + { + "epoch": 0.45, + "grad_norm": 0.9273681640625, + "learning_rate": 9.593244923331952e-05, + "loss": 1.367, + "step": 2170 + }, + { + "epoch": 0.45, + "grad_norm": 1.0774747133255005, + "learning_rate": 9.591172813924575e-05, + "loss": 1.3369, + "step": 2180 + }, + { + "epoch": 0.45, + "grad_norm": 1.131264090538025, + "learning_rate": 9.589100704517199e-05, + "loss": 1.3457, + "step": 2190 + }, + { + "epoch": 0.45, + "grad_norm": 1.106242060661316, + "learning_rate": 9.587028595109822e-05, + "loss": 1.321, + "step": 2200 + }, + { + "epoch": 0.46, + "grad_norm": 1.054598331451416, + "learning_rate": 9.584956485702446e-05, + "loss": 1.3424, + "step": 2210 + }, + { + "epoch": 0.46, + "grad_norm": 1.0380080938339233, + "learning_rate": 9.58288437629507e-05, + "loss": 1.3425, + "step": 2220 + }, + { + "epoch": 0.46, + "grad_norm": 1.1068315505981445, + "learning_rate": 9.580812266887692e-05, + "loss": 1.321, + "step": 2230 + }, + { + "epoch": 0.46, + "grad_norm": 1.1228212118148804, + "learning_rate": 9.578740157480316e-05, + "loss": 1.3301, + "step": 2240 + }, + { + "epoch": 0.46, + "grad_norm": 0.9643247127532959, + "learning_rate": 9.576668048072939e-05, + "loss": 1.3403, + "step": 2250 + }, + { + "epoch": 0.47, + "grad_norm": 0.9587458372116089, + "learning_rate": 9.574595938665562e-05, + "loss": 1.3402, + "step": 2260 + }, + { + "epoch": 0.47, + "grad_norm": 1.0192015171051025, + "learning_rate": 9.572523829258186e-05, + "loss": 1.3595, + "step": 2270 + }, + { + "epoch": 0.47, + "grad_norm": 1.1033486127853394, + "learning_rate": 9.570451719850808e-05, + "loss": 1.3515, + "step": 2280 + }, + { + "epoch": 0.47, + "grad_norm": 1.244828462600708, + "learning_rate": 9.568379610443431e-05, + "loss": 1.3148, + "step": 2290 + }, + { + "epoch": 0.47, + "grad_norm": 1.031778335571289, + "learning_rate": 9.566307501036055e-05, + "loss": 1.3343, + "step": 2300 + }, + { + "epoch": 0.48, + "grad_norm": 1.0581692457199097, + "learning_rate": 9.564235391628678e-05, + "loss": 1.3352, + "step": 2310 + }, + { + "epoch": 0.48, + "grad_norm": 0.9989519119262695, + "learning_rate": 9.562163282221302e-05, + "loss": 1.3206, + "step": 2320 + }, + { + "epoch": 0.48, + "grad_norm": 1.1149669885635376, + "learning_rate": 9.560091172813925e-05, + "loss": 1.3355, + "step": 2330 + }, + { + "epoch": 0.48, + "grad_norm": 1.1359626054763794, + "learning_rate": 9.558019063406548e-05, + "loss": 1.3233, + "step": 2340 + }, + { + "epoch": 0.48, + "grad_norm": 1.1091575622558594, + "learning_rate": 9.555946953999172e-05, + "loss": 1.3678, + "step": 2350 + }, + { + "epoch": 0.49, + "grad_norm": 1.0405771732330322, + "learning_rate": 9.553874844591795e-05, + "loss": 1.3555, + "step": 2360 + }, + { + "epoch": 0.49, + "grad_norm": 1.068385124206543, + "learning_rate": 9.551802735184418e-05, + "loss": 1.346, + "step": 2370 + }, + { + "epoch": 0.49, + "grad_norm": 1.0115128755569458, + "learning_rate": 9.549730625777042e-05, + "loss": 1.3448, + "step": 2380 + }, + { + "epoch": 0.49, + "grad_norm": 1.026138424873352, + "learning_rate": 9.547658516369665e-05, + "loss": 1.3286, + "step": 2390 + }, + { + "epoch": 0.5, + "grad_norm": 1.375127911567688, + "learning_rate": 9.545586406962289e-05, + "loss": 1.3931, + "step": 2400 + }, + { + "epoch": 0.5, + "grad_norm": 1.2297391891479492, + "learning_rate": 9.543514297554912e-05, + "loss": 1.3223, + "step": 2410 + }, + { + "epoch": 0.5, + "grad_norm": 1.143249750137329, + "learning_rate": 9.541442188147534e-05, + "loss": 1.3142, + "step": 2420 + }, + { + "epoch": 0.5, + "grad_norm": 1.1182348728179932, + "learning_rate": 9.539370078740158e-05, + "loss": 1.3414, + "step": 2430 + }, + { + "epoch": 0.5, + "grad_norm": 1.0450687408447266, + "learning_rate": 9.537297969332781e-05, + "loss": 1.3119, + "step": 2440 + }, + { + "epoch": 0.51, + "grad_norm": 1.105624794960022, + "learning_rate": 9.535225859925403e-05, + "loss": 1.3275, + "step": 2450 + }, + { + "epoch": 0.51, + "grad_norm": 1.1117305755615234, + "learning_rate": 9.533153750518028e-05, + "loss": 1.3384, + "step": 2460 + }, + { + "epoch": 0.51, + "grad_norm": 1.122660756111145, + "learning_rate": 9.53108164111065e-05, + "loss": 1.3509, + "step": 2470 + }, + { + "epoch": 0.51, + "grad_norm": 0.994361937046051, + "learning_rate": 9.529009531703274e-05, + "loss": 1.3638, + "step": 2480 + }, + { + "epoch": 0.51, + "grad_norm": 1.1339287757873535, + "learning_rate": 9.526937422295898e-05, + "loss": 1.3282, + "step": 2490 + }, + { + "epoch": 0.52, + "grad_norm": 1.0273313522338867, + "learning_rate": 9.524865312888521e-05, + "loss": 1.3261, + "step": 2500 + }, + { + "epoch": 0.52, + "grad_norm": 1.067122220993042, + "learning_rate": 9.522793203481145e-05, + "loss": 1.3502, + "step": 2510 + }, + { + "epoch": 0.52, + "grad_norm": 0.9780186414718628, + "learning_rate": 9.520721094073768e-05, + "loss": 1.3209, + "step": 2520 + }, + { + "epoch": 0.52, + "grad_norm": 1.0634074211120605, + "learning_rate": 9.51864898466639e-05, + "loss": 1.3508, + "step": 2530 + }, + { + "epoch": 0.52, + "grad_norm": 1.0088226795196533, + "learning_rate": 9.516576875259015e-05, + "loss": 1.2848, + "step": 2540 + }, + { + "epoch": 0.53, + "grad_norm": 1.156569242477417, + "learning_rate": 9.514504765851637e-05, + "loss": 1.3336, + "step": 2550 + }, + { + "epoch": 0.53, + "grad_norm": 0.9981438517570496, + "learning_rate": 9.512432656444261e-05, + "loss": 1.3237, + "step": 2560 + }, + { + "epoch": 0.53, + "grad_norm": 1.0465401411056519, + "learning_rate": 9.510360547036884e-05, + "loss": 1.3347, + "step": 2570 + }, + { + "epoch": 0.53, + "grad_norm": 3.282174825668335, + "learning_rate": 9.508288437629508e-05, + "loss": 1.3234, + "step": 2580 + }, + { + "epoch": 0.53, + "grad_norm": 1.0925480127334595, + "learning_rate": 9.50621632822213e-05, + "loss": 1.3604, + "step": 2590 + }, + { + "epoch": 0.54, + "grad_norm": 1.0559757947921753, + "learning_rate": 9.504144218814753e-05, + "loss": 1.3411, + "step": 2600 + }, + { + "epoch": 0.54, + "grad_norm": 1.0160987377166748, + "learning_rate": 9.502072109407377e-05, + "loss": 1.3299, + "step": 2610 + }, + { + "epoch": 0.54, + "grad_norm": 1.0814076662063599, + "learning_rate": 9.5e-05, + "loss": 1.3053, + "step": 2620 + }, + { + "epoch": 0.54, + "grad_norm": 1.1541906595230103, + "learning_rate": 9.497927890592624e-05, + "loss": 1.3368, + "step": 2630 + }, + { + "epoch": 0.54, + "grad_norm": 1.0476430654525757, + "learning_rate": 9.495855781185246e-05, + "loss": 1.3266, + "step": 2640 + }, + { + "epoch": 0.55, + "grad_norm": 1.0859614610671997, + "learning_rate": 9.493783671777871e-05, + "loss": 1.3077, + "step": 2650 + }, + { + "epoch": 0.55, + "grad_norm": 1.047561526298523, + "learning_rate": 9.491711562370493e-05, + "loss": 1.301, + "step": 2660 + }, + { + "epoch": 0.55, + "grad_norm": 1.1071749925613403, + "learning_rate": 9.489639452963117e-05, + "loss": 1.3069, + "step": 2670 + }, + { + "epoch": 0.55, + "grad_norm": 1.0207133293151855, + "learning_rate": 9.48756734355574e-05, + "loss": 1.3128, + "step": 2680 + }, + { + "epoch": 0.56, + "grad_norm": 1.1883114576339722, + "learning_rate": 9.485495234148364e-05, + "loss": 1.2987, + "step": 2690 + }, + { + "epoch": 0.56, + "grad_norm": 1.1708128452301025, + "learning_rate": 9.483423124740986e-05, + "loss": 1.3386, + "step": 2700 + }, + { + "epoch": 0.56, + "grad_norm": 1.0731940269470215, + "learning_rate": 9.481351015333611e-05, + "loss": 1.3165, + "step": 2710 + }, + { + "epoch": 0.56, + "grad_norm": 1.02231764793396, + "learning_rate": 9.479278905926233e-05, + "loss": 1.3364, + "step": 2720 + }, + { + "epoch": 0.56, + "grad_norm": 0.9825921654701233, + "learning_rate": 9.477206796518856e-05, + "loss": 1.3078, + "step": 2730 + }, + { + "epoch": 0.57, + "grad_norm": 1.1280665397644043, + "learning_rate": 9.47513468711148e-05, + "loss": 1.3359, + "step": 2740 + }, + { + "epoch": 0.57, + "grad_norm": 0.9910861253738403, + "learning_rate": 9.473062577704103e-05, + "loss": 1.3361, + "step": 2750 + }, + { + "epoch": 0.57, + "grad_norm": 1.0153850317001343, + "learning_rate": 9.470990468296727e-05, + "loss": 1.3059, + "step": 2760 + }, + { + "epoch": 0.57, + "grad_norm": 1.01111900806427, + "learning_rate": 9.468918358889349e-05, + "loss": 1.3226, + "step": 2770 + }, + { + "epoch": 0.57, + "grad_norm": 1.0714573860168457, + "learning_rate": 9.466846249481973e-05, + "loss": 1.3195, + "step": 2780 + }, + { + "epoch": 0.58, + "grad_norm": 1.0012733936309814, + "learning_rate": 9.464774140074596e-05, + "loss": 1.2948, + "step": 2790 + }, + { + "epoch": 0.58, + "grad_norm": 0.9637882709503174, + "learning_rate": 9.46270203066722e-05, + "loss": 1.3208, + "step": 2800 + }, + { + "epoch": 0.58, + "grad_norm": 1.0453296899795532, + "learning_rate": 9.460629921259843e-05, + "loss": 1.3095, + "step": 2810 + }, + { + "epoch": 0.58, + "grad_norm": 1.0107698440551758, + "learning_rate": 9.458557811852467e-05, + "loss": 1.3164, + "step": 2820 + }, + { + "epoch": 0.58, + "grad_norm": 1.1132638454437256, + "learning_rate": 9.456485702445089e-05, + "loss": 1.3162, + "step": 2830 + }, + { + "epoch": 0.59, + "grad_norm": 1.0389189720153809, + "learning_rate": 9.454413593037714e-05, + "loss": 1.317, + "step": 2840 + }, + { + "epoch": 0.59, + "grad_norm": 1.0654906034469604, + "learning_rate": 9.452341483630336e-05, + "loss": 1.305, + "step": 2850 + }, + { + "epoch": 0.59, + "grad_norm": 1.2564867734909058, + "learning_rate": 9.45026937422296e-05, + "loss": 1.3301, + "step": 2860 + }, + { + "epoch": 0.59, + "grad_norm": 1.0308964252471924, + "learning_rate": 9.448197264815583e-05, + "loss": 1.334, + "step": 2870 + }, + { + "epoch": 0.59, + "grad_norm": 1.0542854070663452, + "learning_rate": 9.446125155408206e-05, + "loss": 1.3001, + "step": 2880 + }, + { + "epoch": 0.6, + "grad_norm": 1.2161365747451782, + "learning_rate": 9.444053046000829e-05, + "loss": 1.2985, + "step": 2890 + }, + { + "epoch": 0.6, + "grad_norm": 1.206581473350525, + "learning_rate": 9.441980936593454e-05, + "loss": 1.3177, + "step": 2900 + }, + { + "epoch": 0.6, + "grad_norm": 1.1631922721862793, + "learning_rate": 9.439908827186076e-05, + "loss": 1.3269, + "step": 2910 + }, + { + "epoch": 0.6, + "grad_norm": 0.9827607274055481, + "learning_rate": 9.437836717778699e-05, + "loss": 1.3228, + "step": 2920 + }, + { + "epoch": 0.6, + "grad_norm": 1.0078628063201904, + "learning_rate": 9.435764608371323e-05, + "loss": 1.3047, + "step": 2930 + }, + { + "epoch": 0.61, + "grad_norm": 1.1704260110855103, + "learning_rate": 9.433692498963945e-05, + "loss": 1.3074, + "step": 2940 + }, + { + "epoch": 0.61, + "grad_norm": 1.075964093208313, + "learning_rate": 9.43162038955657e-05, + "loss": 1.3252, + "step": 2950 + }, + { + "epoch": 0.61, + "grad_norm": 0.9463378190994263, + "learning_rate": 9.429548280149192e-05, + "loss": 1.3201, + "step": 2960 + }, + { + "epoch": 0.61, + "grad_norm": 1.01523756980896, + "learning_rate": 9.427476170741815e-05, + "loss": 1.3127, + "step": 2970 + }, + { + "epoch": 0.61, + "grad_norm": 0.9392449259757996, + "learning_rate": 9.425404061334439e-05, + "loss": 1.3254, + "step": 2980 + }, + { + "epoch": 0.62, + "grad_norm": 1.01919424533844, + "learning_rate": 9.423331951927062e-05, + "loss": 1.3021, + "step": 2990 + }, + { + "epoch": 0.62, + "grad_norm": 1.1243764162063599, + "learning_rate": 9.421259842519685e-05, + "loss": 1.3112, + "step": 3000 + }, + { + "epoch": 0.62, + "grad_norm": 1.0084974765777588, + "learning_rate": 9.41918773311231e-05, + "loss": 1.3173, + "step": 3010 + }, + { + "epoch": 0.62, + "grad_norm": 0.9945486783981323, + "learning_rate": 9.417115623704932e-05, + "loss": 1.3114, + "step": 3020 + }, + { + "epoch": 0.63, + "grad_norm": 1.1148301362991333, + "learning_rate": 9.415043514297555e-05, + "loss": 1.3275, + "step": 3030 + }, + { + "epoch": 0.63, + "grad_norm": 1.2701823711395264, + "learning_rate": 9.412971404890179e-05, + "loss": 1.3094, + "step": 3040 + }, + { + "epoch": 0.63, + "grad_norm": 1.1923747062683105, + "learning_rate": 9.410899295482802e-05, + "loss": 1.2812, + "step": 3050 + }, + { + "epoch": 0.63, + "grad_norm": 1.2106274366378784, + "learning_rate": 9.408827186075426e-05, + "loss": 1.3011, + "step": 3060 + }, + { + "epoch": 0.63, + "grad_norm": 1.0127681493759155, + "learning_rate": 9.406755076668049e-05, + "loss": 1.3059, + "step": 3070 + }, + { + "epoch": 0.64, + "grad_norm": 1.042222499847412, + "learning_rate": 9.404682967260671e-05, + "loss": 1.2961, + "step": 3080 + }, + { + "epoch": 0.64, + "grad_norm": 0.9650092124938965, + "learning_rate": 9.402610857853296e-05, + "loss": 1.3264, + "step": 3090 + }, + { + "epoch": 0.64, + "grad_norm": 1.0504155158996582, + "learning_rate": 9.400538748445918e-05, + "loss": 1.2853, + "step": 3100 + }, + { + "epoch": 0.64, + "grad_norm": 1.0501419305801392, + "learning_rate": 9.39846663903854e-05, + "loss": 1.3179, + "step": 3110 + }, + { + "epoch": 0.64, + "grad_norm": 1.056299090385437, + "learning_rate": 9.396394529631165e-05, + "loss": 1.2962, + "step": 3120 + }, + { + "epoch": 0.65, + "grad_norm": 1.0278836488723755, + "learning_rate": 9.394322420223788e-05, + "loss": 1.2828, + "step": 3130 + }, + { + "epoch": 0.65, + "grad_norm": 0.9813990592956543, + "learning_rate": 9.392250310816411e-05, + "loss": 1.2986, + "step": 3140 + }, + { + "epoch": 0.65, + "grad_norm": 1.0665332078933716, + "learning_rate": 9.390178201409035e-05, + "loss": 1.2891, + "step": 3150 + }, + { + "epoch": 0.65, + "grad_norm": 1.0281347036361694, + "learning_rate": 9.388106092001658e-05, + "loss": 1.299, + "step": 3160 + }, + { + "epoch": 0.65, + "grad_norm": 1.0530226230621338, + "learning_rate": 9.386033982594282e-05, + "loss": 1.2887, + "step": 3170 + }, + { + "epoch": 0.66, + "grad_norm": 1.0053261518478394, + "learning_rate": 9.383961873186905e-05, + "loss": 1.327, + "step": 3180 + }, + { + "epoch": 0.66, + "grad_norm": 1.1362097263336182, + "learning_rate": 9.381889763779527e-05, + "loss": 1.3001, + "step": 3190 + }, + { + "epoch": 0.66, + "grad_norm": 1.0610814094543457, + "learning_rate": 9.379817654372152e-05, + "loss": 1.2535, + "step": 3200 + }, + { + "epoch": 0.66, + "grad_norm": 0.9906120300292969, + "learning_rate": 9.377745544964774e-05, + "loss": 1.291, + "step": 3210 + }, + { + "epoch": 0.66, + "grad_norm": 1.0676803588867188, + "learning_rate": 9.375673435557398e-05, + "loss": 1.3032, + "step": 3220 + }, + { + "epoch": 0.67, + "grad_norm": 1.056851863861084, + "learning_rate": 9.373601326150021e-05, + "loss": 1.2879, + "step": 3230 + }, + { + "epoch": 0.67, + "grad_norm": 1.048841118812561, + "learning_rate": 9.371529216742645e-05, + "loss": 1.2798, + "step": 3240 + }, + { + "epoch": 0.67, + "grad_norm": 1.047361969947815, + "learning_rate": 9.369457107335268e-05, + "loss": 1.3195, + "step": 3250 + }, + { + "epoch": 0.67, + "grad_norm": 1.076904296875, + "learning_rate": 9.367384997927892e-05, + "loss": 1.3013, + "step": 3260 + }, + { + "epoch": 0.67, + "grad_norm": 1.0863533020019531, + "learning_rate": 9.365312888520514e-05, + "loss": 1.2971, + "step": 3270 + }, + { + "epoch": 0.68, + "grad_norm": 1.0460786819458008, + "learning_rate": 9.363240779113138e-05, + "loss": 1.3023, + "step": 3280 + }, + { + "epoch": 0.68, + "grad_norm": 0.906493604183197, + "learning_rate": 9.361168669705761e-05, + "loss": 1.3053, + "step": 3290 + }, + { + "epoch": 0.68, + "grad_norm": 1.1181541681289673, + "learning_rate": 9.359096560298383e-05, + "loss": 1.3142, + "step": 3300 + }, + { + "epoch": 0.68, + "grad_norm": 1.0198432207107544, + "learning_rate": 9.357024450891008e-05, + "loss": 1.293, + "step": 3310 + }, + { + "epoch": 0.69, + "grad_norm": 1.0075292587280273, + "learning_rate": 9.35495234148363e-05, + "loss": 1.299, + "step": 3320 + }, + { + "epoch": 0.69, + "grad_norm": 1.024592399597168, + "learning_rate": 9.352880232076254e-05, + "loss": 1.2983, + "step": 3330 + }, + { + "epoch": 0.69, + "grad_norm": 0.9931455254554749, + "learning_rate": 9.350808122668877e-05, + "loss": 1.279, + "step": 3340 + }, + { + "epoch": 0.69, + "grad_norm": 1.0673152208328247, + "learning_rate": 9.348736013261501e-05, + "loss": 1.2816, + "step": 3350 + }, + { + "epoch": 0.69, + "grad_norm": 1.068587303161621, + "learning_rate": 9.346663903854124e-05, + "loss": 1.2934, + "step": 3360 + }, + { + "epoch": 0.7, + "grad_norm": 0.9838789701461792, + "learning_rate": 9.344591794446748e-05, + "loss": 1.2917, + "step": 3370 + }, + { + "epoch": 0.7, + "grad_norm": 1.0613404512405396, + "learning_rate": 9.34251968503937e-05, + "loss": 1.2879, + "step": 3380 + }, + { + "epoch": 0.7, + "grad_norm": 1.0173070430755615, + "learning_rate": 9.340447575631995e-05, + "loss": 1.2966, + "step": 3390 + }, + { + "epoch": 0.7, + "grad_norm": 1.1227622032165527, + "learning_rate": 9.338375466224617e-05, + "loss": 1.2554, + "step": 3400 + }, + { + "epoch": 0.7, + "grad_norm": 1.007338523864746, + "learning_rate": 9.33630335681724e-05, + "loss": 1.3115, + "step": 3410 + }, + { + "epoch": 0.71, + "grad_norm": 1.0479813814163208, + "learning_rate": 9.334231247409864e-05, + "loss": 1.3048, + "step": 3420 + }, + { + "epoch": 0.71, + "grad_norm": 1.0560479164123535, + "learning_rate": 9.332159138002486e-05, + "loss": 1.2919, + "step": 3430 + }, + { + "epoch": 0.71, + "grad_norm": 1.1081204414367676, + "learning_rate": 9.33008702859511e-05, + "loss": 1.2967, + "step": 3440 + }, + { + "epoch": 0.71, + "grad_norm": 1.0260145664215088, + "learning_rate": 9.328014919187733e-05, + "loss": 1.3195, + "step": 3450 + }, + { + "epoch": 0.71, + "grad_norm": 1.057966947555542, + "learning_rate": 9.325942809780357e-05, + "loss": 1.2896, + "step": 3460 + }, + { + "epoch": 0.72, + "grad_norm": 1.0711556673049927, + "learning_rate": 9.32387070037298e-05, + "loss": 1.2817, + "step": 3470 + }, + { + "epoch": 0.72, + "grad_norm": 1.0118924379348755, + "learning_rate": 9.321798590965604e-05, + "loss": 1.3052, + "step": 3480 + }, + { + "epoch": 0.72, + "grad_norm": 1.0227614641189575, + "learning_rate": 9.319726481558226e-05, + "loss": 1.3186, + "step": 3490 + }, + { + "epoch": 0.72, + "grad_norm": 1.0655134916305542, + "learning_rate": 9.317654372150851e-05, + "loss": 1.3087, + "step": 3500 + }, + { + "epoch": 0.72, + "grad_norm": 1.1255359649658203, + "learning_rate": 9.315582262743473e-05, + "loss": 1.2749, + "step": 3510 + }, + { + "epoch": 0.73, + "grad_norm": 1.0832923650741577, + "learning_rate": 9.313510153336096e-05, + "loss": 1.2892, + "step": 3520 + }, + { + "epoch": 0.73, + "grad_norm": 1.067236304283142, + "learning_rate": 9.31143804392872e-05, + "loss": 1.284, + "step": 3530 + }, + { + "epoch": 0.73, + "grad_norm": 1.1556322574615479, + "learning_rate": 9.309365934521344e-05, + "loss": 1.2604, + "step": 3540 + }, + { + "epoch": 0.73, + "grad_norm": 1.151723861694336, + "learning_rate": 9.307293825113966e-05, + "loss": 1.3045, + "step": 3550 + }, + { + "epoch": 0.73, + "grad_norm": 1.0258938074111938, + "learning_rate": 9.30522171570659e-05, + "loss": 1.2859, + "step": 3560 + }, + { + "epoch": 0.74, + "grad_norm": 1.0165237188339233, + "learning_rate": 9.303149606299213e-05, + "loss": 1.3212, + "step": 3570 + }, + { + "epoch": 0.74, + "grad_norm": 0.9969586133956909, + "learning_rate": 9.301077496891836e-05, + "loss": 1.3038, + "step": 3580 + }, + { + "epoch": 0.74, + "grad_norm": 1.1335457563400269, + "learning_rate": 9.29900538748446e-05, + "loss": 1.2747, + "step": 3590 + }, + { + "epoch": 0.74, + "grad_norm": 1.0744903087615967, + "learning_rate": 9.296933278077082e-05, + "loss": 1.3078, + "step": 3600 + }, + { + "epoch": 0.74, + "grad_norm": 1.2294646501541138, + "learning_rate": 9.294861168669707e-05, + "loss": 1.2631, + "step": 3610 + }, + { + "epoch": 0.75, + "grad_norm": 1.0542582273483276, + "learning_rate": 9.292789059262329e-05, + "loss": 1.2778, + "step": 3620 + }, + { + "epoch": 0.75, + "grad_norm": 1.0787122249603271, + "learning_rate": 9.290716949854952e-05, + "loss": 1.2952, + "step": 3630 + }, + { + "epoch": 0.75, + "grad_norm": 1.182387351989746, + "learning_rate": 9.288644840447576e-05, + "loss": 1.2955, + "step": 3640 + }, + { + "epoch": 0.75, + "grad_norm": 1.0466411113739014, + "learning_rate": 9.2865727310402e-05, + "loss": 1.3085, + "step": 3650 + }, + { + "epoch": 0.76, + "grad_norm": 1.0271363258361816, + "learning_rate": 9.284500621632823e-05, + "loss": 1.2881, + "step": 3660 + }, + { + "epoch": 0.76, + "grad_norm": 1.1320871114730835, + "learning_rate": 9.282428512225446e-05, + "loss": 1.2671, + "step": 3670 + }, + { + "epoch": 0.76, + "grad_norm": 1.1176432371139526, + "learning_rate": 9.280356402818069e-05, + "loss": 1.299, + "step": 3680 + }, + { + "epoch": 0.76, + "grad_norm": 0.9895033240318298, + "learning_rate": 9.278284293410694e-05, + "loss": 1.2984, + "step": 3690 + }, + { + "epoch": 0.76, + "grad_norm": 1.191007137298584, + "learning_rate": 9.276212184003316e-05, + "loss": 1.2808, + "step": 3700 + }, + { + "epoch": 0.77, + "grad_norm": 1.0878729820251465, + "learning_rate": 9.274140074595939e-05, + "loss": 1.2864, + "step": 3710 + }, + { + "epoch": 0.77, + "grad_norm": 1.1144053936004639, + "learning_rate": 9.272067965188563e-05, + "loss": 1.3175, + "step": 3720 + }, + { + "epoch": 0.77, + "grad_norm": 1.128405213356018, + "learning_rate": 9.269995855781186e-05, + "loss": 1.3147, + "step": 3730 + }, + { + "epoch": 0.77, + "grad_norm": 1.0539438724517822, + "learning_rate": 9.267923746373808e-05, + "loss": 1.2927, + "step": 3740 + }, + { + "epoch": 0.77, + "grad_norm": 1.0515836477279663, + "learning_rate": 9.265851636966433e-05, + "loss": 1.2743, + "step": 3750 + }, + { + "epoch": 0.78, + "grad_norm": 1.1526955366134644, + "learning_rate": 9.263779527559055e-05, + "loss": 1.2911, + "step": 3760 + }, + { + "epoch": 0.78, + "grad_norm": 1.010903000831604, + "learning_rate": 9.261707418151679e-05, + "loss": 1.2735, + "step": 3770 + }, + { + "epoch": 0.78, + "grad_norm": 1.1373246908187866, + "learning_rate": 9.259635308744302e-05, + "loss": 1.2952, + "step": 3780 + }, + { + "epoch": 0.78, + "grad_norm": 0.9458179473876953, + "learning_rate": 9.257563199336925e-05, + "loss": 1.2936, + "step": 3790 + }, + { + "epoch": 0.78, + "grad_norm": 0.949252724647522, + "learning_rate": 9.25549108992955e-05, + "loss": 1.287, + "step": 3800 + }, + { + "epoch": 0.79, + "grad_norm": 1.1074215173721313, + "learning_rate": 9.253418980522172e-05, + "loss": 1.2937, + "step": 3810 + }, + { + "epoch": 0.79, + "grad_norm": 0.995959460735321, + "learning_rate": 9.251346871114795e-05, + "loss": 1.265, + "step": 3820 + }, + { + "epoch": 0.79, + "grad_norm": 1.0461138486862183, + "learning_rate": 9.249274761707419e-05, + "loss": 1.2822, + "step": 3830 + }, + { + "epoch": 0.79, + "grad_norm": 1.0449700355529785, + "learning_rate": 9.247202652300042e-05, + "loss": 1.2896, + "step": 3840 + }, + { + "epoch": 0.79, + "grad_norm": 1.0590384006500244, + "learning_rate": 9.245130542892664e-05, + "loss": 1.2923, + "step": 3850 + }, + { + "epoch": 0.8, + "grad_norm": 1.178272008895874, + "learning_rate": 9.243058433485289e-05, + "loss": 1.2797, + "step": 3860 + }, + { + "epoch": 0.8, + "grad_norm": 1.0651668310165405, + "learning_rate": 9.240986324077911e-05, + "loss": 1.2632, + "step": 3870 + }, + { + "epoch": 0.8, + "grad_norm": 1.0944633483886719, + "learning_rate": 9.238914214670535e-05, + "loss": 1.2853, + "step": 3880 + }, + { + "epoch": 0.8, + "grad_norm": 1.042576551437378, + "learning_rate": 9.236842105263158e-05, + "loss": 1.2884, + "step": 3890 + }, + { + "epoch": 0.8, + "grad_norm": 1.1282780170440674, + "learning_rate": 9.234769995855782e-05, + "loss": 1.2937, + "step": 3900 + }, + { + "epoch": 0.81, + "grad_norm": 0.9996076822280884, + "learning_rate": 9.232697886448405e-05, + "loss": 1.2657, + "step": 3910 + }, + { + "epoch": 0.81, + "grad_norm": 0.9630957245826721, + "learning_rate": 9.230625777041029e-05, + "loss": 1.2679, + "step": 3920 + }, + { + "epoch": 0.81, + "grad_norm": 1.0428036451339722, + "learning_rate": 9.228553667633651e-05, + "loss": 1.2921, + "step": 3930 + }, + { + "epoch": 0.81, + "grad_norm": 1.1940115690231323, + "learning_rate": 9.226481558226275e-05, + "loss": 1.2759, + "step": 3940 + }, + { + "epoch": 0.82, + "grad_norm": 1.1081199645996094, + "learning_rate": 9.224409448818898e-05, + "loss": 1.2636, + "step": 3950 + }, + { + "epoch": 0.82, + "grad_norm": 0.989032506942749, + "learning_rate": 9.22233733941152e-05, + "loss": 1.2489, + "step": 3960 + }, + { + "epoch": 0.82, + "grad_norm": 1.0575727224349976, + "learning_rate": 9.220265230004145e-05, + "loss": 1.2777, + "step": 3970 + }, + { + "epoch": 0.82, + "grad_norm": 1.0007938146591187, + "learning_rate": 9.218193120596767e-05, + "loss": 1.286, + "step": 3980 + }, + { + "epoch": 0.82, + "grad_norm": 1.0560977458953857, + "learning_rate": 9.216121011189391e-05, + "loss": 1.2939, + "step": 3990 + }, + { + "epoch": 0.83, + "grad_norm": 1.308013916015625, + "learning_rate": 9.214048901782014e-05, + "loss": 1.2728, + "step": 4000 + }, + { + "epoch": 0.83, + "grad_norm": 1.2011935710906982, + "learning_rate": 9.211976792374638e-05, + "loss": 1.2853, + "step": 4010 + }, + { + "epoch": 0.83, + "grad_norm": 1.0514850616455078, + "learning_rate": 9.209904682967261e-05, + "loss": 1.3102, + "step": 4020 + }, + { + "epoch": 0.83, + "grad_norm": 1.0865683555603027, + "learning_rate": 9.207832573559885e-05, + "loss": 1.2835, + "step": 4030 + }, + { + "epoch": 0.83, + "grad_norm": 1.2012451887130737, + "learning_rate": 9.205760464152507e-05, + "loss": 1.2801, + "step": 4040 + }, + { + "epoch": 0.84, + "grad_norm": 1.0710102319717407, + "learning_rate": 9.203688354745132e-05, + "loss": 1.2745, + "step": 4050 + }, + { + "epoch": 0.84, + "grad_norm": 1.0107301473617554, + "learning_rate": 9.201616245337754e-05, + "loss": 1.2928, + "step": 4060 + }, + { + "epoch": 0.84, + "grad_norm": 1.0026335716247559, + "learning_rate": 9.199544135930378e-05, + "loss": 1.2916, + "step": 4070 + }, + { + "epoch": 0.84, + "grad_norm": 0.9443692564964294, + "learning_rate": 9.197472026523001e-05, + "loss": 1.2956, + "step": 4080 + }, + { + "epoch": 0.84, + "grad_norm": 0.9472268223762512, + "learning_rate": 9.195399917115625e-05, + "loss": 1.2875, + "step": 4090 + }, + { + "epoch": 0.85, + "grad_norm": 1.0817506313323975, + "learning_rate": 9.193327807708247e-05, + "loss": 1.2779, + "step": 4100 + }, + { + "epoch": 0.85, + "grad_norm": 1.0539813041687012, + "learning_rate": 9.19125569830087e-05, + "loss": 1.2661, + "step": 4110 + }, + { + "epoch": 0.85, + "grad_norm": 0.9975004196166992, + "learning_rate": 9.189183588893494e-05, + "loss": 1.2499, + "step": 4120 + }, + { + "epoch": 0.85, + "grad_norm": 1.0313421487808228, + "learning_rate": 9.187111479486117e-05, + "loss": 1.2907, + "step": 4130 + }, + { + "epoch": 0.85, + "grad_norm": 1.0273147821426392, + "learning_rate": 9.185039370078741e-05, + "loss": 1.2929, + "step": 4140 + }, + { + "epoch": 0.86, + "grad_norm": 0.9810879230499268, + "learning_rate": 9.182967260671363e-05, + "loss": 1.2974, + "step": 4150 + }, + { + "epoch": 0.86, + "grad_norm": 1.0243279933929443, + "learning_rate": 9.180895151263988e-05, + "loss": 1.2406, + "step": 4160 + }, + { + "epoch": 0.86, + "grad_norm": 1.0115349292755127, + "learning_rate": 9.17882304185661e-05, + "loss": 1.2765, + "step": 4170 + }, + { + "epoch": 0.86, + "grad_norm": 1.1206727027893066, + "learning_rate": 9.176750932449234e-05, + "loss": 1.2956, + "step": 4180 + }, + { + "epoch": 0.86, + "grad_norm": 0.99837327003479, + "learning_rate": 9.174678823041857e-05, + "loss": 1.2614, + "step": 4190 + }, + { + "epoch": 0.87, + "grad_norm": 1.0117827653884888, + "learning_rate": 9.17260671363448e-05, + "loss": 1.2611, + "step": 4200 + }, + { + "epoch": 0.87, + "grad_norm": 1.1119413375854492, + "learning_rate": 9.170534604227104e-05, + "loss": 1.3006, + "step": 4210 + }, + { + "epoch": 0.87, + "grad_norm": 1.0567435026168823, + "learning_rate": 9.168462494819728e-05, + "loss": 1.2544, + "step": 4220 + }, + { + "epoch": 0.87, + "grad_norm": 1.1326267719268799, + "learning_rate": 9.16639038541235e-05, + "loss": 1.2895, + "step": 4230 + }, + { + "epoch": 0.87, + "grad_norm": 1.0858250856399536, + "learning_rate": 9.164318276004975e-05, + "loss": 1.2672, + "step": 4240 + }, + { + "epoch": 0.88, + "grad_norm": 1.1111207008361816, + "learning_rate": 9.162246166597597e-05, + "loss": 1.255, + "step": 4250 + }, + { + "epoch": 0.88, + "grad_norm": 1.0791577100753784, + "learning_rate": 9.16017405719022e-05, + "loss": 1.2865, + "step": 4260 + }, + { + "epoch": 0.88, + "grad_norm": 1.1741734743118286, + "learning_rate": 9.158101947782844e-05, + "loss": 1.2898, + "step": 4270 + }, + { + "epoch": 0.88, + "grad_norm": 0.9900088310241699, + "learning_rate": 9.156029838375466e-05, + "loss": 1.26, + "step": 4280 + }, + { + "epoch": 0.89, + "grad_norm": 1.2227307558059692, + "learning_rate": 9.15395772896809e-05, + "loss": 1.2818, + "step": 4290 + }, + { + "epoch": 0.89, + "grad_norm": 1.0696605443954468, + "learning_rate": 9.151885619560713e-05, + "loss": 1.2739, + "step": 4300 + }, + { + "epoch": 0.89, + "grad_norm": 1.145063042640686, + "learning_rate": 9.149813510153336e-05, + "loss": 1.2886, + "step": 4310 + }, + { + "epoch": 0.89, + "grad_norm": 1.0415133237838745, + "learning_rate": 9.14774140074596e-05, + "loss": 1.2999, + "step": 4320 + }, + { + "epoch": 0.89, + "grad_norm": 0.9151254296302795, + "learning_rate": 9.145669291338584e-05, + "loss": 1.2883, + "step": 4330 + }, + { + "epoch": 0.9, + "grad_norm": 1.1220918893814087, + "learning_rate": 9.143597181931206e-05, + "loss": 1.2751, + "step": 4340 + }, + { + "epoch": 0.9, + "grad_norm": 1.0417348146438599, + "learning_rate": 9.14152507252383e-05, + "loss": 1.2675, + "step": 4350 + }, + { + "epoch": 0.9, + "grad_norm": 1.1090322732925415, + "learning_rate": 9.139452963116453e-05, + "loss": 1.2521, + "step": 4360 + }, + { + "epoch": 0.9, + "grad_norm": 1.003110408782959, + "learning_rate": 9.137380853709076e-05, + "loss": 1.2461, + "step": 4370 + }, + { + "epoch": 0.9, + "grad_norm": 1.0214240550994873, + "learning_rate": 9.1353087443017e-05, + "loss": 1.3003, + "step": 4380 + }, + { + "epoch": 0.91, + "grad_norm": 1.0389635562896729, + "learning_rate": 9.133236634894323e-05, + "loss": 1.2642, + "step": 4390 + }, + { + "epoch": 0.91, + "grad_norm": 1.1644842624664307, + "learning_rate": 9.131164525486945e-05, + "loss": 1.247, + "step": 4400 + }, + { + "epoch": 0.91, + "grad_norm": 1.0494420528411865, + "learning_rate": 9.12909241607957e-05, + "loss": 1.2727, + "step": 4410 + }, + { + "epoch": 0.91, + "grad_norm": 1.1759871244430542, + "learning_rate": 9.127020306672192e-05, + "loss": 1.2569, + "step": 4420 + }, + { + "epoch": 0.91, + "grad_norm": 1.0006252527236938, + "learning_rate": 9.124948197264816e-05, + "loss": 1.2761, + "step": 4430 + }, + { + "epoch": 0.92, + "grad_norm": 1.0942422151565552, + "learning_rate": 9.12287608785744e-05, + "loss": 1.2778, + "step": 4440 + }, + { + "epoch": 0.92, + "grad_norm": 1.1273877620697021, + "learning_rate": 9.120803978450062e-05, + "loss": 1.2738, + "step": 4450 + }, + { + "epoch": 0.92, + "grad_norm": 1.0103455781936646, + "learning_rate": 9.118731869042686e-05, + "loss": 1.2559, + "step": 4460 + }, + { + "epoch": 0.92, + "grad_norm": 1.0412319898605347, + "learning_rate": 9.116659759635309e-05, + "loss": 1.2839, + "step": 4470 + }, + { + "epoch": 0.92, + "grad_norm": 1.1623831987380981, + "learning_rate": 9.114587650227932e-05, + "loss": 1.2684, + "step": 4480 + }, + { + "epoch": 0.93, + "grad_norm": 1.213977336883545, + "learning_rate": 9.112515540820556e-05, + "loss": 1.2587, + "step": 4490 + }, + { + "epoch": 0.93, + "grad_norm": 1.1630234718322754, + "learning_rate": 9.110443431413179e-05, + "loss": 1.2557, + "step": 4500 + }, + { + "epoch": 0.93, + "grad_norm": 1.0047425031661987, + "learning_rate": 9.108371322005801e-05, + "loss": 1.2785, + "step": 4510 + }, + { + "epoch": 0.93, + "grad_norm": 1.0434467792510986, + "learning_rate": 9.106299212598426e-05, + "loss": 1.2916, + "step": 4520 + }, + { + "epoch": 0.93, + "grad_norm": 1.0278736352920532, + "learning_rate": 9.104227103191048e-05, + "loss": 1.2494, + "step": 4530 + }, + { + "epoch": 0.94, + "grad_norm": 0.9865803122520447, + "learning_rate": 9.102154993783672e-05, + "loss": 1.2487, + "step": 4540 + }, + { + "epoch": 0.94, + "grad_norm": 1.0419522523880005, + "learning_rate": 9.100082884376295e-05, + "loss": 1.2618, + "step": 4550 + }, + { + "epoch": 0.94, + "grad_norm": 1.1537505388259888, + "learning_rate": 9.098010774968919e-05, + "loss": 1.2613, + "step": 4560 + }, + { + "epoch": 0.94, + "grad_norm": 1.0565922260284424, + "learning_rate": 9.095938665561542e-05, + "loss": 1.2592, + "step": 4570 + }, + { + "epoch": 0.95, + "grad_norm": 1.106313705444336, + "learning_rate": 9.093866556154166e-05, + "loss": 1.2585, + "step": 4580 + }, + { + "epoch": 0.95, + "grad_norm": 1.0392132997512817, + "learning_rate": 9.091794446746788e-05, + "loss": 1.2626, + "step": 4590 + }, + { + "epoch": 0.95, + "grad_norm": 1.1017506122589111, + "learning_rate": 9.089722337339413e-05, + "loss": 1.2696, + "step": 4600 + }, + { + "epoch": 0.95, + "grad_norm": 1.134838581085205, + "learning_rate": 9.087650227932035e-05, + "loss": 1.2626, + "step": 4610 + }, + { + "epoch": 0.95, + "grad_norm": 1.3448967933654785, + "learning_rate": 9.085578118524659e-05, + "loss": 1.2728, + "step": 4620 + }, + { + "epoch": 0.96, + "grad_norm": 0.9399920701980591, + "learning_rate": 9.083506009117282e-05, + "loss": 1.253, + "step": 4630 + }, + { + "epoch": 0.96, + "grad_norm": 1.0530354976654053, + "learning_rate": 9.081433899709904e-05, + "loss": 1.2501, + "step": 4640 + }, + { + "epoch": 0.96, + "grad_norm": 1.0609047412872314, + "learning_rate": 9.079361790302529e-05, + "loss": 1.2599, + "step": 4650 + }, + { + "epoch": 0.96, + "grad_norm": 1.120044469833374, + "learning_rate": 9.077289680895151e-05, + "loss": 1.2511, + "step": 4660 + }, + { + "epoch": 0.96, + "grad_norm": 1.068034291267395, + "learning_rate": 9.075217571487775e-05, + "loss": 1.2623, + "step": 4670 + }, + { + "epoch": 0.97, + "grad_norm": 0.9914749264717102, + "learning_rate": 9.073145462080398e-05, + "loss": 1.2754, + "step": 4680 + }, + { + "epoch": 0.97, + "grad_norm": 1.084227204322815, + "learning_rate": 9.071073352673022e-05, + "loss": 1.2861, + "step": 4690 + }, + { + "epoch": 0.97, + "grad_norm": 0.9811500906944275, + "learning_rate": 9.069001243265644e-05, + "loss": 1.2361, + "step": 4700 + }, + { + "epoch": 0.97, + "grad_norm": 1.0811760425567627, + "learning_rate": 9.066929133858269e-05, + "loss": 1.247, + "step": 4710 + }, + { + "epoch": 0.97, + "grad_norm": 1.0787228345870972, + "learning_rate": 9.064857024450891e-05, + "loss": 1.2609, + "step": 4720 + }, + { + "epoch": 0.98, + "grad_norm": 1.0164875984191895, + "learning_rate": 9.062784915043515e-05, + "loss": 1.289, + "step": 4730 + }, + { + "epoch": 0.98, + "grad_norm": 1.1449209451675415, + "learning_rate": 9.060712805636138e-05, + "loss": 1.269, + "step": 4740 + }, + { + "epoch": 0.98, + "grad_norm": 1.2805284261703491, + "learning_rate": 9.058640696228762e-05, + "loss": 1.2803, + "step": 4750 + }, + { + "epoch": 0.98, + "grad_norm": 1.03864586353302, + "learning_rate": 9.056568586821385e-05, + "loss": 1.2635, + "step": 4760 + }, + { + "epoch": 0.98, + "grad_norm": 0.9715671539306641, + "learning_rate": 9.054496477414009e-05, + "loss": 1.2232, + "step": 4770 + }, + { + "epoch": 0.99, + "grad_norm": 1.1221860647201538, + "learning_rate": 9.052424368006631e-05, + "loss": 1.2536, + "step": 4780 + }, + { + "epoch": 0.99, + "grad_norm": 0.9467464089393616, + "learning_rate": 9.050352258599254e-05, + "loss": 1.2529, + "step": 4790 + }, + { + "epoch": 0.99, + "grad_norm": 1.1306428909301758, + "learning_rate": 9.048280149191878e-05, + "loss": 1.2621, + "step": 4800 + }, + { + "epoch": 0.99, + "grad_norm": 1.038275122642517, + "learning_rate": 9.0462080397845e-05, + "loss": 1.2785, + "step": 4810 + }, + { + "epoch": 0.99, + "grad_norm": 1.1947646141052246, + "learning_rate": 9.044135930377125e-05, + "loss": 1.2643, + "step": 4820 + }, + { + "epoch": 1.0, + "grad_norm": 1.0585174560546875, + "learning_rate": 9.042063820969747e-05, + "loss": 1.248, + "step": 4830 + }, + { + "epoch": 1.0, + "grad_norm": 1.1931108236312866, + "learning_rate": 9.03999171156237e-05, + "loss": 1.2884, + "step": 4840 + }, + { + "epoch": 1.0, + "eval_loss": 1.280229926109314, + "eval_runtime": 1606.712, + "eval_samples_per_second": 262.53, + "eval_steps_per_second": 4.102, + "step": 4846 + }, + { + "epoch": 1.0, + "grad_norm": 1.0185681581497192, + "learning_rate": 9.037919602154994e-05, + "loss": 1.2701, + "step": 4850 + }, + { + "epoch": 1.0, + "grad_norm": 1.2571942806243896, + "learning_rate": 9.035847492747618e-05, + "loss": 1.2309, + "step": 4860 + }, + { + "epoch": 1.0, + "grad_norm": 1.1584192514419556, + "learning_rate": 9.033775383340241e-05, + "loss": 1.2859, + "step": 4870 + }, + { + "epoch": 1.01, + "grad_norm": 1.05573570728302, + "learning_rate": 9.031703273932865e-05, + "loss": 1.2502, + "step": 4880 + }, + { + "epoch": 1.01, + "grad_norm": 1.0500950813293457, + "learning_rate": 9.029631164525487e-05, + "loss": 1.2288, + "step": 4890 + }, + { + "epoch": 1.01, + "grad_norm": 1.0603646039962769, + "learning_rate": 9.027559055118112e-05, + "loss": 1.2609, + "step": 4900 + }, + { + "epoch": 1.01, + "grad_norm": 1.085644245147705, + "learning_rate": 9.025486945710734e-05, + "loss": 1.2392, + "step": 4910 + }, + { + "epoch": 1.02, + "grad_norm": 1.0088363885879517, + "learning_rate": 9.023414836303357e-05, + "loss": 1.2361, + "step": 4920 + }, + { + "epoch": 1.02, + "grad_norm": 1.0100585222244263, + "learning_rate": 9.021342726895981e-05, + "loss": 1.2382, + "step": 4930 + }, + { + "epoch": 1.02, + "grad_norm": 1.067159652709961, + "learning_rate": 9.019270617488604e-05, + "loss": 1.2433, + "step": 4940 + }, + { + "epoch": 1.02, + "grad_norm": 1.0431448221206665, + "learning_rate": 9.017198508081226e-05, + "loss": 1.2461, + "step": 4950 + }, + { + "epoch": 1.02, + "grad_norm": 1.3945410251617432, + "learning_rate": 9.01512639867385e-05, + "loss": 1.2406, + "step": 4960 + }, + { + "epoch": 1.03, + "grad_norm": 1.0631392002105713, + "learning_rate": 9.013054289266474e-05, + "loss": 1.2265, + "step": 4970 + }, + { + "epoch": 1.03, + "grad_norm": 1.0823907852172852, + "learning_rate": 9.010982179859097e-05, + "loss": 1.2388, + "step": 4980 + }, + { + "epoch": 1.03, + "grad_norm": 1.1125047206878662, + "learning_rate": 9.00891007045172e-05, + "loss": 1.2553, + "step": 4990 + }, + { + "epoch": 1.03, + "grad_norm": 1.0436159372329712, + "learning_rate": 9.006837961044343e-05, + "loss": 1.2362, + "step": 5000 + }, + { + "epoch": 1.03, + "grad_norm": 1.2972134351730347, + "learning_rate": 9.004765851636968e-05, + "loss": 1.2387, + "step": 5010 + }, + { + "epoch": 1.04, + "grad_norm": 1.0587375164031982, + "learning_rate": 9.00269374222959e-05, + "loss": 1.2348, + "step": 5020 + }, + { + "epoch": 1.04, + "grad_norm": 1.0204670429229736, + "learning_rate": 9.000621632822213e-05, + "loss": 1.2391, + "step": 5030 + }, + { + "epoch": 1.04, + "grad_norm": 1.0182541608810425, + "learning_rate": 8.998549523414837e-05, + "loss": 1.2337, + "step": 5040 + }, + { + "epoch": 1.04, + "grad_norm": 0.9534347057342529, + "learning_rate": 8.99647741400746e-05, + "loss": 1.2403, + "step": 5050 + }, + { + "epoch": 1.04, + "grad_norm": 1.1534489393234253, + "learning_rate": 8.994405304600084e-05, + "loss": 1.251, + "step": 5060 + }, + { + "epoch": 1.05, + "grad_norm": 1.0630913972854614, + "learning_rate": 8.992333195192707e-05, + "loss": 1.2741, + "step": 5070 + }, + { + "epoch": 1.05, + "grad_norm": 1.0464857816696167, + "learning_rate": 8.99026108578533e-05, + "loss": 1.27, + "step": 5080 + }, + { + "epoch": 1.05, + "grad_norm": 1.1541072130203247, + "learning_rate": 8.988188976377954e-05, + "loss": 1.2183, + "step": 5090 + }, + { + "epoch": 1.05, + "grad_norm": 1.007450819015503, + "learning_rate": 8.986116866970576e-05, + "loss": 1.2345, + "step": 5100 + }, + { + "epoch": 1.05, + "grad_norm": 0.984767496585846, + "learning_rate": 8.9840447575632e-05, + "loss": 1.2661, + "step": 5110 + }, + { + "epoch": 1.06, + "grad_norm": 1.2972489595413208, + "learning_rate": 8.981972648155824e-05, + "loss": 1.2577, + "step": 5120 + }, + { + "epoch": 1.06, + "grad_norm": 1.1882805824279785, + "learning_rate": 8.979900538748446e-05, + "loss": 1.2685, + "step": 5130 + }, + { + "epoch": 1.06, + "grad_norm": 1.1580913066864014, + "learning_rate": 8.977828429341069e-05, + "loss": 1.2269, + "step": 5140 + }, + { + "epoch": 1.06, + "grad_norm": 1.0898735523223877, + "learning_rate": 8.975756319933693e-05, + "loss": 1.2315, + "step": 5150 + }, + { + "epoch": 1.06, + "grad_norm": 1.1261813640594482, + "learning_rate": 8.973684210526316e-05, + "loss": 1.2617, + "step": 5160 + }, + { + "epoch": 1.07, + "grad_norm": 1.0266163349151611, + "learning_rate": 8.97161210111894e-05, + "loss": 1.2379, + "step": 5170 + }, + { + "epoch": 1.07, + "grad_norm": 1.0973056554794312, + "learning_rate": 8.969539991711563e-05, + "loss": 1.2527, + "step": 5180 + }, + { + "epoch": 1.07, + "grad_norm": 1.0927435159683228, + "learning_rate": 8.967467882304185e-05, + "loss": 1.2433, + "step": 5190 + }, + { + "epoch": 1.07, + "grad_norm": 1.1209070682525635, + "learning_rate": 8.96539577289681e-05, + "loss": 1.2512, + "step": 5200 + }, + { + "epoch": 1.08, + "grad_norm": 1.222163200378418, + "learning_rate": 8.963323663489432e-05, + "loss": 1.2377, + "step": 5210 + }, + { + "epoch": 1.08, + "grad_norm": 1.1234538555145264, + "learning_rate": 8.961251554082056e-05, + "loss": 1.2409, + "step": 5220 + }, + { + "epoch": 1.08, + "grad_norm": 1.1121318340301514, + "learning_rate": 8.95917944467468e-05, + "loss": 1.2402, + "step": 5230 + }, + { + "epoch": 1.08, + "grad_norm": 1.0124129056930542, + "learning_rate": 8.957107335267303e-05, + "loss": 1.2609, + "step": 5240 + }, + { + "epoch": 1.08, + "grad_norm": 1.0647163391113281, + "learning_rate": 8.955035225859925e-05, + "loss": 1.2144, + "step": 5250 + }, + { + "epoch": 1.09, + "grad_norm": 1.0653977394104004, + "learning_rate": 8.95296311645255e-05, + "loss": 1.2337, + "step": 5260 + }, + { + "epoch": 1.09, + "grad_norm": 1.0555377006530762, + "learning_rate": 8.950891007045172e-05, + "loss": 1.2121, + "step": 5270 + }, + { + "epoch": 1.09, + "grad_norm": 1.0859911441802979, + "learning_rate": 8.948818897637796e-05, + "loss": 1.232, + "step": 5280 + }, + { + "epoch": 1.09, + "grad_norm": 1.0167226791381836, + "learning_rate": 8.946746788230419e-05, + "loss": 1.2252, + "step": 5290 + }, + { + "epoch": 1.09, + "grad_norm": 1.0780484676361084, + "learning_rate": 8.944674678823041e-05, + "loss": 1.2537, + "step": 5300 + }, + { + "epoch": 1.1, + "grad_norm": 1.0674623250961304, + "learning_rate": 8.942602569415666e-05, + "loss": 1.2511, + "step": 5310 + }, + { + "epoch": 1.1, + "grad_norm": 1.1867153644561768, + "learning_rate": 8.940530460008288e-05, + "loss": 1.2406, + "step": 5320 + }, + { + "epoch": 1.1, + "grad_norm": 1.0617753267288208, + "learning_rate": 8.938458350600912e-05, + "loss": 1.2648, + "step": 5330 + }, + { + "epoch": 1.1, + "grad_norm": 1.1268622875213623, + "learning_rate": 8.936386241193535e-05, + "loss": 1.2461, + "step": 5340 + }, + { + "epoch": 1.1, + "grad_norm": 1.06646728515625, + "learning_rate": 8.934314131786159e-05, + "loss": 1.235, + "step": 5350 + }, + { + "epoch": 1.11, + "grad_norm": 1.0061641931533813, + "learning_rate": 8.932242022378781e-05, + "loss": 1.2256, + "step": 5360 + }, + { + "epoch": 1.11, + "grad_norm": 1.1114619970321655, + "learning_rate": 8.930169912971406e-05, + "loss": 1.2481, + "step": 5370 + }, + { + "epoch": 1.11, + "grad_norm": 1.121626615524292, + "learning_rate": 8.928097803564028e-05, + "loss": 1.2592, + "step": 5380 + }, + { + "epoch": 1.11, + "grad_norm": 1.0986332893371582, + "learning_rate": 8.926025694156652e-05, + "loss": 1.2397, + "step": 5390 + }, + { + "epoch": 1.11, + "grad_norm": 1.055759072303772, + "learning_rate": 8.923953584749275e-05, + "loss": 1.2468, + "step": 5400 + }, + { + "epoch": 1.12, + "grad_norm": 1.035019874572754, + "learning_rate": 8.921881475341899e-05, + "loss": 1.2455, + "step": 5410 + }, + { + "epoch": 1.12, + "grad_norm": 1.0355908870697021, + "learning_rate": 8.919809365934522e-05, + "loss": 1.2584, + "step": 5420 + }, + { + "epoch": 1.12, + "grad_norm": 1.0616044998168945, + "learning_rate": 8.917737256527146e-05, + "loss": 1.2472, + "step": 5430 + }, + { + "epoch": 1.12, + "grad_norm": 1.0461037158966064, + "learning_rate": 8.915665147119768e-05, + "loss": 1.2531, + "step": 5440 + }, + { + "epoch": 1.12, + "grad_norm": 1.2218214273452759, + "learning_rate": 8.913593037712393e-05, + "loss": 1.2486, + "step": 5450 + }, + { + "epoch": 1.13, + "grad_norm": 1.0007187128067017, + "learning_rate": 8.911520928305015e-05, + "loss": 1.2464, + "step": 5460 + }, + { + "epoch": 1.13, + "grad_norm": 1.056276798248291, + "learning_rate": 8.909448818897638e-05, + "loss": 1.2375, + "step": 5470 + }, + { + "epoch": 1.13, + "grad_norm": 1.0595409870147705, + "learning_rate": 8.907376709490262e-05, + "loss": 1.2645, + "step": 5480 + }, + { + "epoch": 1.13, + "grad_norm": 1.1598786115646362, + "learning_rate": 8.905304600082884e-05, + "loss": 1.2502, + "step": 5490 + }, + { + "epoch": 1.13, + "grad_norm": 1.0664838552474976, + "learning_rate": 8.903232490675508e-05, + "loss": 1.2712, + "step": 5500 + }, + { + "epoch": 1.14, + "grad_norm": 1.0016567707061768, + "learning_rate": 8.901160381268131e-05, + "loss": 1.2386, + "step": 5510 + }, + { + "epoch": 1.14, + "grad_norm": 1.0359711647033691, + "learning_rate": 8.899088271860755e-05, + "loss": 1.238, + "step": 5520 + }, + { + "epoch": 1.14, + "grad_norm": 1.1633154153823853, + "learning_rate": 8.897016162453378e-05, + "loss": 1.2139, + "step": 5530 + }, + { + "epoch": 1.14, + "grad_norm": 1.085390567779541, + "learning_rate": 8.894944053046002e-05, + "loss": 1.2564, + "step": 5540 + }, + { + "epoch": 1.15, + "grad_norm": 1.041222333908081, + "learning_rate": 8.892871943638624e-05, + "loss": 1.2361, + "step": 5550 + }, + { + "epoch": 1.15, + "grad_norm": 1.0352637767791748, + "learning_rate": 8.890799834231249e-05, + "loss": 1.2104, + "step": 5560 + }, + { + "epoch": 1.15, + "grad_norm": 1.0879154205322266, + "learning_rate": 8.888727724823871e-05, + "loss": 1.2408, + "step": 5570 + }, + { + "epoch": 1.15, + "grad_norm": 1.177937388420105, + "learning_rate": 8.886655615416494e-05, + "loss": 1.221, + "step": 5580 + }, + { + "epoch": 1.15, + "grad_norm": 1.0638147592544556, + "learning_rate": 8.884583506009118e-05, + "loss": 1.2309, + "step": 5590 + }, + { + "epoch": 1.16, + "grad_norm": 1.0461276769638062, + "learning_rate": 8.882511396601741e-05, + "loss": 1.2498, + "step": 5600 + }, + { + "epoch": 1.16, + "grad_norm": 0.9356592297554016, + "learning_rate": 8.880439287194365e-05, + "loss": 1.2584, + "step": 5610 + }, + { + "epoch": 1.16, + "grad_norm": 0.9423561096191406, + "learning_rate": 8.878367177786988e-05, + "loss": 1.2094, + "step": 5620 + }, + { + "epoch": 1.16, + "grad_norm": 1.1970158815383911, + "learning_rate": 8.87629506837961e-05, + "loss": 1.2134, + "step": 5630 + }, + { + "epoch": 1.16, + "grad_norm": 1.1081819534301758, + "learning_rate": 8.874222958972234e-05, + "loss": 1.2134, + "step": 5640 + }, + { + "epoch": 1.17, + "grad_norm": 1.1248120069503784, + "learning_rate": 8.872150849564858e-05, + "loss": 1.2288, + "step": 5650 + }, + { + "epoch": 1.17, + "grad_norm": 1.0750600099563599, + "learning_rate": 8.87007874015748e-05, + "loss": 1.2191, + "step": 5660 + }, + { + "epoch": 1.17, + "grad_norm": 1.058366060256958, + "learning_rate": 8.868006630750105e-05, + "loss": 1.2274, + "step": 5670 + }, + { + "epoch": 1.17, + "grad_norm": 0.9436173439025879, + "learning_rate": 8.865934521342727e-05, + "loss": 1.2556, + "step": 5680 + }, + { + "epoch": 1.17, + "grad_norm": 1.0624874830245972, + "learning_rate": 8.86386241193535e-05, + "loss": 1.2383, + "step": 5690 + }, + { + "epoch": 1.18, + "grad_norm": 1.0870168209075928, + "learning_rate": 8.861790302527974e-05, + "loss": 1.2432, + "step": 5700 + }, + { + "epoch": 1.18, + "grad_norm": 1.0561186075210571, + "learning_rate": 8.859718193120597e-05, + "loss": 1.2329, + "step": 5710 + }, + { + "epoch": 1.18, + "grad_norm": 1.2139157056808472, + "learning_rate": 8.857646083713221e-05, + "loss": 1.2183, + "step": 5720 + }, + { + "epoch": 1.18, + "grad_norm": 1.0662713050842285, + "learning_rate": 8.855573974305844e-05, + "loss": 1.2371, + "step": 5730 + }, + { + "epoch": 1.18, + "grad_norm": 1.1198335886001587, + "learning_rate": 8.853501864898466e-05, + "loss": 1.2396, + "step": 5740 + }, + { + "epoch": 1.19, + "grad_norm": 1.2191115617752075, + "learning_rate": 8.851429755491091e-05, + "loss": 1.2355, + "step": 5750 + }, + { + "epoch": 1.19, + "grad_norm": 1.039201259613037, + "learning_rate": 8.849357646083714e-05, + "loss": 1.2251, + "step": 5760 + }, + { + "epoch": 1.19, + "grad_norm": 0.9540196061134338, + "learning_rate": 8.847285536676337e-05, + "loss": 1.2424, + "step": 5770 + }, + { + "epoch": 1.19, + "grad_norm": 1.0640240907669067, + "learning_rate": 8.84521342726896e-05, + "loss": 1.2414, + "step": 5780 + }, + { + "epoch": 1.19, + "grad_norm": 1.0465424060821533, + "learning_rate": 8.843141317861584e-05, + "loss": 1.2241, + "step": 5790 + }, + { + "epoch": 1.2, + "grad_norm": 1.1881465911865234, + "learning_rate": 8.841069208454206e-05, + "loss": 1.2182, + "step": 5800 + }, + { + "epoch": 1.2, + "grad_norm": 1.0851510763168335, + "learning_rate": 8.83899709904683e-05, + "loss": 1.2115, + "step": 5810 + }, + { + "epoch": 1.2, + "grad_norm": 1.0743972063064575, + "learning_rate": 8.836924989639453e-05, + "loss": 1.2211, + "step": 5820 + }, + { + "epoch": 1.2, + "grad_norm": 1.049249529838562, + "learning_rate": 8.834852880232077e-05, + "loss": 1.2452, + "step": 5830 + }, + { + "epoch": 1.21, + "grad_norm": 1.0910190343856812, + "learning_rate": 8.8327807708247e-05, + "loss": 1.249, + "step": 5840 + }, + { + "epoch": 1.21, + "grad_norm": 1.0976841449737549, + "learning_rate": 8.830708661417322e-05, + "loss": 1.2123, + "step": 5850 + }, + { + "epoch": 1.21, + "grad_norm": 1.3569914102554321, + "learning_rate": 8.828636552009947e-05, + "loss": 1.2338, + "step": 5860 + }, + { + "epoch": 1.21, + "grad_norm": 1.068427324295044, + "learning_rate": 8.82656444260257e-05, + "loss": 1.2351, + "step": 5870 + }, + { + "epoch": 1.21, + "grad_norm": 1.1309690475463867, + "learning_rate": 8.824492333195193e-05, + "loss": 1.2367, + "step": 5880 + }, + { + "epoch": 1.22, + "grad_norm": 1.1933242082595825, + "learning_rate": 8.822420223787816e-05, + "loss": 1.2261, + "step": 5890 + }, + { + "epoch": 1.22, + "grad_norm": 1.029557466506958, + "learning_rate": 8.82034811438044e-05, + "loss": 1.2499, + "step": 5900 + }, + { + "epoch": 1.22, + "grad_norm": 1.106086015701294, + "learning_rate": 8.818276004973062e-05, + "loss": 1.2236, + "step": 5910 + }, + { + "epoch": 1.22, + "grad_norm": 1.0114145278930664, + "learning_rate": 8.816203895565687e-05, + "loss": 1.2445, + "step": 5920 + }, + { + "epoch": 1.22, + "grad_norm": 1.1829569339752197, + "learning_rate": 8.814131786158309e-05, + "loss": 1.2334, + "step": 5930 + }, + { + "epoch": 1.23, + "grad_norm": 1.1952738761901855, + "learning_rate": 8.812059676750933e-05, + "loss": 1.2402, + "step": 5940 + }, + { + "epoch": 1.23, + "grad_norm": 1.0816442966461182, + "learning_rate": 8.809987567343556e-05, + "loss": 1.252, + "step": 5950 + }, + { + "epoch": 1.23, + "grad_norm": 1.1453193426132202, + "learning_rate": 8.80791545793618e-05, + "loss": 1.233, + "step": 5960 + }, + { + "epoch": 1.23, + "grad_norm": 1.045602798461914, + "learning_rate": 8.805843348528803e-05, + "loss": 1.2195, + "step": 5970 + }, + { + "epoch": 1.23, + "grad_norm": 1.0934393405914307, + "learning_rate": 8.803771239121425e-05, + "loss": 1.2177, + "step": 5980 + }, + { + "epoch": 1.24, + "grad_norm": 1.0120600461959839, + "learning_rate": 8.801699129714049e-05, + "loss": 1.2329, + "step": 5990 + }, + { + "epoch": 1.24, + "grad_norm": 1.103251576423645, + "learning_rate": 8.799627020306672e-05, + "loss": 1.2379, + "step": 6000 + }, + { + "epoch": 1.24, + "grad_norm": 1.0402165651321411, + "learning_rate": 8.797554910899296e-05, + "loss": 1.2256, + "step": 6010 + }, + { + "epoch": 1.24, + "grad_norm": 1.0983202457427979, + "learning_rate": 8.79548280149192e-05, + "loss": 1.2296, + "step": 6020 + }, + { + "epoch": 1.24, + "grad_norm": 1.0412105321884155, + "learning_rate": 8.793410692084543e-05, + "loss": 1.2228, + "step": 6030 + }, + { + "epoch": 1.25, + "grad_norm": 1.0473746061325073, + "learning_rate": 8.791338582677165e-05, + "loss": 1.2228, + "step": 6040 + }, + { + "epoch": 1.25, + "grad_norm": 1.004840612411499, + "learning_rate": 8.78926647326979e-05, + "loss": 1.2108, + "step": 6050 + }, + { + "epoch": 1.25, + "grad_norm": 1.1168406009674072, + "learning_rate": 8.787194363862412e-05, + "loss": 1.2104, + "step": 6060 + }, + { + "epoch": 1.25, + "grad_norm": 1.1124876737594604, + "learning_rate": 8.785122254455036e-05, + "loss": 1.2336, + "step": 6070 + }, + { + "epoch": 1.25, + "grad_norm": 1.059415340423584, + "learning_rate": 8.783050145047659e-05, + "loss": 1.2438, + "step": 6080 + }, + { + "epoch": 1.26, + "grad_norm": 1.063344955444336, + "learning_rate": 8.780978035640283e-05, + "loss": 1.2352, + "step": 6090 + }, + { + "epoch": 1.26, + "grad_norm": 1.0552620887756348, + "learning_rate": 8.778905926232905e-05, + "loss": 1.2131, + "step": 6100 + }, + { + "epoch": 1.26, + "grad_norm": 1.1673498153686523, + "learning_rate": 8.77683381682553e-05, + "loss": 1.213, + "step": 6110 + }, + { + "epoch": 1.26, + "grad_norm": 1.1740206480026245, + "learning_rate": 8.774761707418152e-05, + "loss": 1.2162, + "step": 6120 + }, + { + "epoch": 1.26, + "grad_norm": 1.0944995880126953, + "learning_rate": 8.772689598010775e-05, + "loss": 1.2005, + "step": 6130 + }, + { + "epoch": 1.27, + "grad_norm": 1.0152305364608765, + "learning_rate": 8.770617488603399e-05, + "loss": 1.2198, + "step": 6140 + }, + { + "epoch": 1.27, + "grad_norm": 1.1265654563903809, + "learning_rate": 8.768545379196021e-05, + "loss": 1.2362, + "step": 6150 + }, + { + "epoch": 1.27, + "grad_norm": 1.151207447052002, + "learning_rate": 8.766473269788646e-05, + "loss": 1.2561, + "step": 6160 + }, + { + "epoch": 1.27, + "grad_norm": 1.035855770111084, + "learning_rate": 8.764401160381268e-05, + "loss": 1.2391, + "step": 6170 + }, + { + "epoch": 1.28, + "grad_norm": 1.091009497642517, + "learning_rate": 8.762329050973892e-05, + "loss": 1.2342, + "step": 6180 + }, + { + "epoch": 1.28, + "grad_norm": 1.057400107383728, + "learning_rate": 8.760256941566515e-05, + "loss": 1.2216, + "step": 6190 + }, + { + "epoch": 1.28, + "grad_norm": 0.9303261041641235, + "learning_rate": 8.758184832159139e-05, + "loss": 1.2352, + "step": 6200 + }, + { + "epoch": 1.28, + "grad_norm": 1.2061206102371216, + "learning_rate": 8.756112722751761e-05, + "loss": 1.215, + "step": 6210 + }, + { + "epoch": 1.28, + "grad_norm": 1.0886818170547485, + "learning_rate": 8.754040613344386e-05, + "loss": 1.2266, + "step": 6220 + }, + { + "epoch": 1.29, + "grad_norm": 1.279563546180725, + "learning_rate": 8.751968503937008e-05, + "loss": 1.237, + "step": 6230 + }, + { + "epoch": 1.29, + "grad_norm": 1.0063716173171997, + "learning_rate": 8.749896394529631e-05, + "loss": 1.2119, + "step": 6240 + }, + { + "epoch": 1.29, + "grad_norm": 1.0414812564849854, + "learning_rate": 8.747824285122255e-05, + "loss": 1.2177, + "step": 6250 + }, + { + "epoch": 1.29, + "grad_norm": 1.0915932655334473, + "learning_rate": 8.745752175714878e-05, + "loss": 1.2422, + "step": 6260 + }, + { + "epoch": 1.29, + "grad_norm": 1.0544465780258179, + "learning_rate": 8.743680066307502e-05, + "loss": 1.234, + "step": 6270 + }, + { + "epoch": 1.3, + "grad_norm": 1.1502125263214111, + "learning_rate": 8.741607956900125e-05, + "loss": 1.2183, + "step": 6280 + }, + { + "epoch": 1.3, + "grad_norm": 1.3113386631011963, + "learning_rate": 8.739535847492748e-05, + "loss": 1.202, + "step": 6290 + }, + { + "epoch": 1.3, + "grad_norm": 1.087583303451538, + "learning_rate": 8.737463738085371e-05, + "loss": 1.241, + "step": 6300 + }, + { + "epoch": 1.3, + "grad_norm": 1.135565161705017, + "learning_rate": 8.735391628677995e-05, + "loss": 1.2411, + "step": 6310 + }, + { + "epoch": 1.3, + "grad_norm": 1.0986896753311157, + "learning_rate": 8.733319519270617e-05, + "loss": 1.2178, + "step": 6320 + }, + { + "epoch": 1.31, + "grad_norm": 1.6357513666152954, + "learning_rate": 8.731247409863242e-05, + "loss": 1.2102, + "step": 6330 + }, + { + "epoch": 1.31, + "grad_norm": 1.0731899738311768, + "learning_rate": 8.729175300455864e-05, + "loss": 1.213, + "step": 6340 + }, + { + "epoch": 1.31, + "grad_norm": 1.1432324647903442, + "learning_rate": 8.727103191048487e-05, + "loss": 1.2395, + "step": 6350 + }, + { + "epoch": 1.31, + "grad_norm": 1.0547071695327759, + "learning_rate": 8.725031081641111e-05, + "loss": 1.2121, + "step": 6360 + }, + { + "epoch": 1.31, + "grad_norm": 1.1022225618362427, + "learning_rate": 8.722958972233734e-05, + "loss": 1.2274, + "step": 6370 + }, + { + "epoch": 1.32, + "grad_norm": 1.0772980451583862, + "learning_rate": 8.720886862826358e-05, + "loss": 1.2244, + "step": 6380 + }, + { + "epoch": 1.32, + "grad_norm": 1.073470115661621, + "learning_rate": 8.718814753418981e-05, + "loss": 1.2243, + "step": 6390 + }, + { + "epoch": 1.32, + "grad_norm": 1.1750410795211792, + "learning_rate": 8.716742644011604e-05, + "loss": 1.2191, + "step": 6400 + }, + { + "epoch": 1.32, + "grad_norm": 1.206298828125, + "learning_rate": 8.714670534604228e-05, + "loss": 1.2284, + "step": 6410 + }, + { + "epoch": 1.32, + "grad_norm": 1.1253398656845093, + "learning_rate": 8.71259842519685e-05, + "loss": 1.2222, + "step": 6420 + }, + { + "epoch": 1.33, + "grad_norm": 1.1828629970550537, + "learning_rate": 8.710526315789474e-05, + "loss": 1.225, + "step": 6430 + }, + { + "epoch": 1.33, + "grad_norm": 1.0462387800216675, + "learning_rate": 8.708454206382098e-05, + "loss": 1.2156, + "step": 6440 + }, + { + "epoch": 1.33, + "grad_norm": 1.3468637466430664, + "learning_rate": 8.706382096974721e-05, + "loss": 1.2111, + "step": 6450 + }, + { + "epoch": 1.33, + "grad_norm": 1.1359398365020752, + "learning_rate": 8.704309987567345e-05, + "loss": 1.2022, + "step": 6460 + }, + { + "epoch": 1.34, + "grad_norm": 1.0748237371444702, + "learning_rate": 8.702237878159967e-05, + "loss": 1.2227, + "step": 6470 + }, + { + "epoch": 1.34, + "grad_norm": 1.0240176916122437, + "learning_rate": 8.70016576875259e-05, + "loss": 1.2042, + "step": 6480 + }, + { + "epoch": 1.34, + "grad_norm": 1.0575100183486938, + "learning_rate": 8.698093659345214e-05, + "loss": 1.2166, + "step": 6490 + }, + { + "epoch": 1.34, + "grad_norm": 1.0095828771591187, + "learning_rate": 8.696021549937837e-05, + "loss": 1.2352, + "step": 6500 + }, + { + "epoch": 1.34, + "grad_norm": 1.140643835067749, + "learning_rate": 8.69394944053046e-05, + "loss": 1.2143, + "step": 6510 + }, + { + "epoch": 1.35, + "grad_norm": 1.0403310060501099, + "learning_rate": 8.691877331123084e-05, + "loss": 1.2268, + "step": 6520 + }, + { + "epoch": 1.35, + "grad_norm": 1.1176542043685913, + "learning_rate": 8.689805221715706e-05, + "loss": 1.2123, + "step": 6530 + }, + { + "epoch": 1.35, + "grad_norm": 1.041100025177002, + "learning_rate": 8.68773311230833e-05, + "loss": 1.2356, + "step": 6540 + }, + { + "epoch": 1.35, + "grad_norm": 0.9852685928344727, + "learning_rate": 8.685661002900954e-05, + "loss": 1.2043, + "step": 6550 + }, + { + "epoch": 1.35, + "grad_norm": 1.206588864326477, + "learning_rate": 8.683588893493577e-05, + "loss": 1.2132, + "step": 6560 + }, + { + "epoch": 1.36, + "grad_norm": 1.1841477155685425, + "learning_rate": 8.6815167840862e-05, + "loss": 1.241, + "step": 6570 + }, + { + "epoch": 1.36, + "grad_norm": 1.1257410049438477, + "learning_rate": 8.679444674678824e-05, + "loss": 1.2241, + "step": 6580 + }, + { + "epoch": 1.36, + "grad_norm": 1.053404688835144, + "learning_rate": 8.677372565271446e-05, + "loss": 1.2409, + "step": 6590 + }, + { + "epoch": 1.36, + "grad_norm": 1.109271764755249, + "learning_rate": 8.675300455864071e-05, + "loss": 1.2036, + "step": 6600 + }, + { + "epoch": 1.36, + "grad_norm": 1.0103741884231567, + "learning_rate": 8.673228346456693e-05, + "loss": 1.2128, + "step": 6610 + }, + { + "epoch": 1.37, + "grad_norm": 1.149715542793274, + "learning_rate": 8.671156237049317e-05, + "loss": 1.208, + "step": 6620 + }, + { + "epoch": 1.37, + "grad_norm": 1.0523351430892944, + "learning_rate": 8.66908412764194e-05, + "loss": 1.2146, + "step": 6630 + }, + { + "epoch": 1.37, + "grad_norm": 1.2250432968139648, + "learning_rate": 8.667012018234562e-05, + "loss": 1.2454, + "step": 6640 + }, + { + "epoch": 1.37, + "grad_norm": 0.9975298643112183, + "learning_rate": 8.664939908827186e-05, + "loss": 1.228, + "step": 6650 + }, + { + "epoch": 1.37, + "grad_norm": 1.0323069095611572, + "learning_rate": 8.66286779941981e-05, + "loss": 1.2223, + "step": 6660 + }, + { + "epoch": 1.38, + "grad_norm": 1.0941072702407837, + "learning_rate": 8.660795690012433e-05, + "loss": 1.2214, + "step": 6670 + }, + { + "epoch": 1.38, + "grad_norm": 1.2442706823349, + "learning_rate": 8.658723580605056e-05, + "loss": 1.2316, + "step": 6680 + }, + { + "epoch": 1.38, + "grad_norm": 1.1723049879074097, + "learning_rate": 8.65665147119768e-05, + "loss": 1.2184, + "step": 6690 + }, + { + "epoch": 1.38, + "grad_norm": 1.1901644468307495, + "learning_rate": 8.654579361790302e-05, + "loss": 1.2382, + "step": 6700 + }, + { + "epoch": 1.38, + "grad_norm": 1.1003718376159668, + "learning_rate": 8.652507252382927e-05, + "loss": 1.2272, + "step": 6710 + }, + { + "epoch": 1.39, + "grad_norm": 1.1174200773239136, + "learning_rate": 8.650435142975549e-05, + "loss": 1.2248, + "step": 6720 + }, + { + "epoch": 1.39, + "grad_norm": 1.2489265203475952, + "learning_rate": 8.648363033568173e-05, + "loss": 1.2252, + "step": 6730 + }, + { + "epoch": 1.39, + "grad_norm": 1.2472732067108154, + "learning_rate": 8.646290924160796e-05, + "loss": 1.227, + "step": 6740 + }, + { + "epoch": 1.39, + "grad_norm": 1.0011487007141113, + "learning_rate": 8.64421881475342e-05, + "loss": 1.2267, + "step": 6750 + }, + { + "epoch": 1.39, + "grad_norm": 1.060117244720459, + "learning_rate": 8.642146705346042e-05, + "loss": 1.2263, + "step": 6760 + }, + { + "epoch": 1.4, + "grad_norm": 1.102454423904419, + "learning_rate": 8.640074595938667e-05, + "loss": 1.2205, + "step": 6770 + }, + { + "epoch": 1.4, + "grad_norm": 1.0496042966842651, + "learning_rate": 8.638002486531289e-05, + "loss": 1.2207, + "step": 6780 + }, + { + "epoch": 1.4, + "grad_norm": 1.005552053451538, + "learning_rate": 8.635930377123912e-05, + "loss": 1.1992, + "step": 6790 + }, + { + "epoch": 1.4, + "grad_norm": 1.0828580856323242, + "learning_rate": 8.633858267716536e-05, + "loss": 1.2164, + "step": 6800 + }, + { + "epoch": 1.41, + "grad_norm": 1.0597617626190186, + "learning_rate": 8.631786158309158e-05, + "loss": 1.2081, + "step": 6810 + }, + { + "epoch": 1.41, + "grad_norm": 1.1591441631317139, + "learning_rate": 8.629714048901783e-05, + "loss": 1.2315, + "step": 6820 + }, + { + "epoch": 1.41, + "grad_norm": 1.1969908475875854, + "learning_rate": 8.627641939494405e-05, + "loss": 1.2307, + "step": 6830 + }, + { + "epoch": 1.41, + "grad_norm": 1.1163939237594604, + "learning_rate": 8.625569830087029e-05, + "loss": 1.2272, + "step": 6840 + }, + { + "epoch": 1.41, + "grad_norm": 1.047241449356079, + "learning_rate": 8.623497720679652e-05, + "loss": 1.1991, + "step": 6850 + }, + { + "epoch": 1.42, + "grad_norm": 1.0410964488983154, + "learning_rate": 8.621425611272276e-05, + "loss": 1.211, + "step": 6860 + }, + { + "epoch": 1.42, + "grad_norm": 1.0674628019332886, + "learning_rate": 8.619353501864899e-05, + "loss": 1.2195, + "step": 6870 + }, + { + "epoch": 1.42, + "grad_norm": 0.9892363548278809, + "learning_rate": 8.617281392457523e-05, + "loss": 1.2344, + "step": 6880 + }, + { + "epoch": 1.42, + "grad_norm": 1.08130943775177, + "learning_rate": 8.615209283050145e-05, + "loss": 1.2299, + "step": 6890 + }, + { + "epoch": 1.42, + "grad_norm": 1.0285786390304565, + "learning_rate": 8.613137173642768e-05, + "loss": 1.2166, + "step": 6900 + }, + { + "epoch": 1.43, + "grad_norm": 1.0203038454055786, + "learning_rate": 8.611065064235392e-05, + "loss": 1.2043, + "step": 6910 + }, + { + "epoch": 1.43, + "grad_norm": 1.17252516746521, + "learning_rate": 8.608992954828015e-05, + "loss": 1.2278, + "step": 6920 + }, + { + "epoch": 1.43, + "grad_norm": 1.233001708984375, + "learning_rate": 8.606920845420639e-05, + "loss": 1.233, + "step": 6930 + }, + { + "epoch": 1.43, + "grad_norm": 1.0858713388442993, + "learning_rate": 8.604848736013262e-05, + "loss": 1.2174, + "step": 6940 + }, + { + "epoch": 1.43, + "grad_norm": 1.069405198097229, + "learning_rate": 8.602776626605885e-05, + "loss": 1.2176, + "step": 6950 + }, + { + "epoch": 1.44, + "grad_norm": 1.0860111713409424, + "learning_rate": 8.60070451719851e-05, + "loss": 1.2179, + "step": 6960 + }, + { + "epoch": 1.44, + "grad_norm": 1.111624836921692, + "learning_rate": 8.598632407791132e-05, + "loss": 1.2167, + "step": 6970 + }, + { + "epoch": 1.44, + "grad_norm": 1.092925786972046, + "learning_rate": 8.596560298383755e-05, + "loss": 1.2154, + "step": 6980 + }, + { + "epoch": 1.44, + "grad_norm": 1.115460753440857, + "learning_rate": 8.594488188976379e-05, + "loss": 1.2052, + "step": 6990 + }, + { + "epoch": 1.44, + "grad_norm": 1.229777455329895, + "learning_rate": 8.592416079569001e-05, + "loss": 1.1769, + "step": 7000 + }, + { + "epoch": 1.45, + "grad_norm": 1.1182063817977905, + "learning_rate": 8.590343970161626e-05, + "loss": 1.2035, + "step": 7010 + }, + { + "epoch": 1.45, + "grad_norm": 1.0315207242965698, + "learning_rate": 8.588271860754248e-05, + "loss": 1.2158, + "step": 7020 + }, + { + "epoch": 1.45, + "grad_norm": 1.1051239967346191, + "learning_rate": 8.586199751346871e-05, + "loss": 1.1874, + "step": 7030 + }, + { + "epoch": 1.45, + "grad_norm": 1.0437036752700806, + "learning_rate": 8.584127641939495e-05, + "loss": 1.2421, + "step": 7040 + }, + { + "epoch": 1.45, + "grad_norm": 1.1060231924057007, + "learning_rate": 8.582055532532118e-05, + "loss": 1.2392, + "step": 7050 + }, + { + "epoch": 1.46, + "grad_norm": 1.0951759815216064, + "learning_rate": 8.57998342312474e-05, + "loss": 1.2348, + "step": 7060 + }, + { + "epoch": 1.46, + "grad_norm": 1.7351630926132202, + "learning_rate": 8.577911313717365e-05, + "loss": 1.2198, + "step": 7070 + }, + { + "epoch": 1.46, + "grad_norm": 1.3410096168518066, + "learning_rate": 8.575839204309988e-05, + "loss": 1.2069, + "step": 7080 + }, + { + "epoch": 1.46, + "grad_norm": 1.1723905801773071, + "learning_rate": 8.573767094902611e-05, + "loss": 1.2264, + "step": 7090 + }, + { + "epoch": 1.47, + "grad_norm": 1.0400230884552002, + "learning_rate": 8.571694985495235e-05, + "loss": 1.2285, + "step": 7100 + }, + { + "epoch": 1.47, + "grad_norm": 1.1555556058883667, + "learning_rate": 8.569622876087858e-05, + "loss": 1.1982, + "step": 7110 + }, + { + "epoch": 1.47, + "grad_norm": 1.0779943466186523, + "learning_rate": 8.567550766680482e-05, + "loss": 1.2241, + "step": 7120 + }, + { + "epoch": 1.47, + "grad_norm": 1.011435627937317, + "learning_rate": 8.565478657273105e-05, + "loss": 1.1919, + "step": 7130 + }, + { + "epoch": 1.47, + "grad_norm": 1.1413905620574951, + "learning_rate": 8.563406547865727e-05, + "loss": 1.2026, + "step": 7140 + }, + { + "epoch": 1.48, + "grad_norm": 1.305766224861145, + "learning_rate": 8.561334438458351e-05, + "loss": 1.2243, + "step": 7150 + }, + { + "epoch": 1.48, + "grad_norm": 1.1241309642791748, + "learning_rate": 8.559262329050974e-05, + "loss": 1.2305, + "step": 7160 + }, + { + "epoch": 1.48, + "grad_norm": 1.1354045867919922, + "learning_rate": 8.557190219643596e-05, + "loss": 1.2203, + "step": 7170 + }, + { + "epoch": 1.48, + "grad_norm": 1.0294325351715088, + "learning_rate": 8.555118110236221e-05, + "loss": 1.2356, + "step": 7180 + }, + { + "epoch": 1.48, + "grad_norm": 1.1068978309631348, + "learning_rate": 8.553046000828844e-05, + "loss": 1.2191, + "step": 7190 + }, + { + "epoch": 1.49, + "grad_norm": 1.0707268714904785, + "learning_rate": 8.550973891421467e-05, + "loss": 1.2168, + "step": 7200 + }, + { + "epoch": 1.49, + "grad_norm": 1.158420443534851, + "learning_rate": 8.54890178201409e-05, + "loss": 1.1924, + "step": 7210 + }, + { + "epoch": 1.49, + "grad_norm": 1.018847107887268, + "learning_rate": 8.546829672606714e-05, + "loss": 1.2326, + "step": 7220 + }, + { + "epoch": 1.49, + "grad_norm": 1.1226823329925537, + "learning_rate": 8.544757563199338e-05, + "loss": 1.2029, + "step": 7230 + }, + { + "epoch": 1.49, + "grad_norm": 1.1094051599502563, + "learning_rate": 8.542685453791961e-05, + "loss": 1.2005, + "step": 7240 + }, + { + "epoch": 1.5, + "grad_norm": 1.1425297260284424, + "learning_rate": 8.540613344384583e-05, + "loss": 1.2109, + "step": 7250 + }, + { + "epoch": 1.5, + "grad_norm": 1.0846836566925049, + "learning_rate": 8.538541234977208e-05, + "loss": 1.2098, + "step": 7260 + }, + { + "epoch": 1.5, + "grad_norm": 1.0861196517944336, + "learning_rate": 8.53646912556983e-05, + "loss": 1.2201, + "step": 7270 + }, + { + "epoch": 1.5, + "grad_norm": 1.0006380081176758, + "learning_rate": 8.534397016162454e-05, + "loss": 1.2457, + "step": 7280 + }, + { + "epoch": 1.5, + "grad_norm": 1.0799134969711304, + "learning_rate": 8.532324906755077e-05, + "loss": 1.2117, + "step": 7290 + }, + { + "epoch": 1.51, + "grad_norm": 1.1802451610565186, + "learning_rate": 8.530252797347701e-05, + "loss": 1.2277, + "step": 7300 + }, + { + "epoch": 1.51, + "grad_norm": 1.0235154628753662, + "learning_rate": 8.528180687940323e-05, + "loss": 1.204, + "step": 7310 + }, + { + "epoch": 1.51, + "grad_norm": 1.1273279190063477, + "learning_rate": 8.526108578532946e-05, + "loss": 1.2179, + "step": 7320 + }, + { + "epoch": 1.51, + "grad_norm": 0.9829218983650208, + "learning_rate": 8.52403646912557e-05, + "loss": 1.1913, + "step": 7330 + }, + { + "epoch": 1.51, + "grad_norm": 1.1805267333984375, + "learning_rate": 8.521964359718194e-05, + "loss": 1.1822, + "step": 7340 + }, + { + "epoch": 1.52, + "grad_norm": 1.026557207107544, + "learning_rate": 8.519892250310817e-05, + "loss": 1.1912, + "step": 7350 + }, + { + "epoch": 1.52, + "grad_norm": 1.04379141330719, + "learning_rate": 8.517820140903439e-05, + "loss": 1.2093, + "step": 7360 + }, + { + "epoch": 1.52, + "grad_norm": 1.0553088188171387, + "learning_rate": 8.515748031496064e-05, + "loss": 1.2213, + "step": 7370 + }, + { + "epoch": 1.52, + "grad_norm": 0.8822417855262756, + "learning_rate": 8.513675922088686e-05, + "loss": 1.2223, + "step": 7380 + }, + { + "epoch": 1.52, + "grad_norm": 0.9738419055938721, + "learning_rate": 8.51160381268131e-05, + "loss": 1.2011, + "step": 7390 + }, + { + "epoch": 1.53, + "grad_norm": 1.0899471044540405, + "learning_rate": 8.509738914214672e-05, + "loss": 1.2406, + "step": 7400 + }, + { + "epoch": 1.53, + "grad_norm": 1.1138182878494263, + "learning_rate": 8.507666804807294e-05, + "loss": 1.1963, + "step": 7410 + }, + { + "epoch": 1.53, + "grad_norm": 1.1311627626419067, + "learning_rate": 8.505594695399917e-05, + "loss": 1.1977, + "step": 7420 + }, + { + "epoch": 1.53, + "grad_norm": 2.138723134994507, + "learning_rate": 8.503522585992541e-05, + "loss": 1.2104, + "step": 7430 + }, + { + "epoch": 1.54, + "grad_norm": 1.0873496532440186, + "learning_rate": 8.501450476585164e-05, + "loss": 1.2113, + "step": 7440 + }, + { + "epoch": 1.54, + "grad_norm": 0.9628106355667114, + "learning_rate": 8.499378367177787e-05, + "loss": 1.2258, + "step": 7450 + }, + { + "epoch": 1.54, + "grad_norm": 1.1409735679626465, + "learning_rate": 8.497306257770411e-05, + "loss": 1.1964, + "step": 7460 + }, + { + "epoch": 1.54, + "grad_norm": 1.111512541770935, + "learning_rate": 8.495234148363034e-05, + "loss": 1.2061, + "step": 7470 + }, + { + "epoch": 1.54, + "grad_norm": 1.0905687808990479, + "learning_rate": 8.493162038955657e-05, + "loss": 1.2025, + "step": 7480 + }, + { + "epoch": 1.55, + "grad_norm": 1.1417665481567383, + "learning_rate": 8.49108992954828e-05, + "loss": 1.1861, + "step": 7490 + }, + { + "epoch": 1.55, + "grad_norm": 1.024594783782959, + "learning_rate": 8.489017820140904e-05, + "loss": 1.1893, + "step": 7500 + }, + { + "epoch": 1.55, + "grad_norm": 1.112746238708496, + "learning_rate": 8.486945710733528e-05, + "loss": 1.2196, + "step": 7510 + }, + { + "epoch": 1.55, + "grad_norm": 1.1404857635498047, + "learning_rate": 8.484873601326151e-05, + "loss": 1.2131, + "step": 7520 + }, + { + "epoch": 1.55, + "grad_norm": 1.1401077508926392, + "learning_rate": 8.482801491918773e-05, + "loss": 1.2107, + "step": 7530 + }, + { + "epoch": 1.56, + "grad_norm": 1.2235480546951294, + "learning_rate": 8.480729382511397e-05, + "loss": 1.2363, + "step": 7540 + }, + { + "epoch": 1.56, + "grad_norm": 1.0837748050689697, + "learning_rate": 8.47865727310402e-05, + "loss": 1.2139, + "step": 7550 + }, + { + "epoch": 1.56, + "grad_norm": 1.22895348072052, + "learning_rate": 8.476585163696643e-05, + "loss": 1.1998, + "step": 7560 + }, + { + "epoch": 1.56, + "grad_norm": 1.105093240737915, + "learning_rate": 8.474513054289267e-05, + "loss": 1.2212, + "step": 7570 + }, + { + "epoch": 1.56, + "grad_norm": 1.222209095954895, + "learning_rate": 8.47244094488189e-05, + "loss": 1.2453, + "step": 7580 + }, + { + "epoch": 1.57, + "grad_norm": 1.1390637159347534, + "learning_rate": 8.470368835474513e-05, + "loss": 1.2071, + "step": 7590 + }, + { + "epoch": 1.57, + "grad_norm": 1.0356426239013672, + "learning_rate": 8.468296726067137e-05, + "loss": 1.2157, + "step": 7600 + }, + { + "epoch": 1.57, + "grad_norm": 1.0771561861038208, + "learning_rate": 8.46622461665976e-05, + "loss": 1.212, + "step": 7610 + }, + { + "epoch": 1.57, + "grad_norm": 1.3607767820358276, + "learning_rate": 8.464152507252384e-05, + "loss": 1.2253, + "step": 7620 + }, + { + "epoch": 1.57, + "grad_norm": 1.0732100009918213, + "learning_rate": 8.462080397845007e-05, + "loss": 1.1938, + "step": 7630 + }, + { + "epoch": 1.58, + "grad_norm": 1.0551025867462158, + "learning_rate": 8.460008288437629e-05, + "loss": 1.2163, + "step": 7640 + }, + { + "epoch": 1.58, + "grad_norm": 1.127244472503662, + "learning_rate": 8.457936179030254e-05, + "loss": 1.1956, + "step": 7650 + }, + { + "epoch": 1.58, + "grad_norm": 1.0608160495758057, + "learning_rate": 8.455864069622876e-05, + "loss": 1.191, + "step": 7660 + }, + { + "epoch": 1.58, + "grad_norm": 1.0632834434509277, + "learning_rate": 8.4537919602155e-05, + "loss": 1.1971, + "step": 7670 + }, + { + "epoch": 1.58, + "grad_norm": 1.1187163591384888, + "learning_rate": 8.451719850808123e-05, + "loss": 1.2156, + "step": 7680 + }, + { + "epoch": 1.59, + "grad_norm": 1.135420799255371, + "learning_rate": 8.449647741400747e-05, + "loss": 1.1953, + "step": 7690 + }, + { + "epoch": 1.59, + "grad_norm": 1.1564319133758545, + "learning_rate": 8.447575631993369e-05, + "loss": 1.2169, + "step": 7700 + }, + { + "epoch": 1.59, + "grad_norm": 1.1321407556533813, + "learning_rate": 8.445503522585993e-05, + "loss": 1.1869, + "step": 7710 + }, + { + "epoch": 1.59, + "grad_norm": 1.097676157951355, + "learning_rate": 8.443431413178616e-05, + "loss": 1.2039, + "step": 7720 + }, + { + "epoch": 1.6, + "grad_norm": 0.9873398542404175, + "learning_rate": 8.44135930377124e-05, + "loss": 1.2115, + "step": 7730 + }, + { + "epoch": 1.6, + "grad_norm": 1.0774983167648315, + "learning_rate": 8.439287194363863e-05, + "loss": 1.2089, + "step": 7740 + }, + { + "epoch": 1.6, + "grad_norm": 1.199475884437561, + "learning_rate": 8.437215084956485e-05, + "loss": 1.1913, + "step": 7750 + }, + { + "epoch": 1.6, + "grad_norm": 1.2517530918121338, + "learning_rate": 8.43514297554911e-05, + "loss": 1.24, + "step": 7760 + }, + { + "epoch": 1.6, + "grad_norm": 1.1117113828659058, + "learning_rate": 8.433070866141732e-05, + "loss": 1.1907, + "step": 7770 + }, + { + "epoch": 1.61, + "grad_norm": 1.1518152952194214, + "learning_rate": 8.430998756734356e-05, + "loss": 1.2194, + "step": 7780 + }, + { + "epoch": 1.61, + "grad_norm": 1.0633752346038818, + "learning_rate": 8.428926647326979e-05, + "loss": 1.2173, + "step": 7790 + }, + { + "epoch": 1.61, + "grad_norm": 1.1065930128097534, + "learning_rate": 8.426854537919603e-05, + "loss": 1.1803, + "step": 7800 + }, + { + "epoch": 1.61, + "grad_norm": 1.03267240524292, + "learning_rate": 8.424782428512226e-05, + "loss": 1.2178, + "step": 7810 + }, + { + "epoch": 1.61, + "grad_norm": 0.9949610233306885, + "learning_rate": 8.42271031910485e-05, + "loss": 1.1999, + "step": 7820 + }, + { + "epoch": 1.62, + "grad_norm": 1.0859277248382568, + "learning_rate": 8.420638209697472e-05, + "loss": 1.2056, + "step": 7830 + }, + { + "epoch": 1.62, + "grad_norm": 1.0535070896148682, + "learning_rate": 8.418566100290096e-05, + "loss": 1.2004, + "step": 7840 + }, + { + "epoch": 1.62, + "grad_norm": 1.1210662126541138, + "learning_rate": 8.416493990882719e-05, + "loss": 1.2223, + "step": 7850 + }, + { + "epoch": 1.62, + "grad_norm": 1.0601907968521118, + "learning_rate": 8.414421881475343e-05, + "loss": 1.1985, + "step": 7860 + }, + { + "epoch": 1.62, + "grad_norm": 1.0329095125198364, + "learning_rate": 8.412349772067966e-05, + "loss": 1.202, + "step": 7870 + }, + { + "epoch": 1.63, + "grad_norm": 1.1331712007522583, + "learning_rate": 8.410277662660588e-05, + "loss": 1.1923, + "step": 7880 + }, + { + "epoch": 1.63, + "grad_norm": 1.1752616167068481, + "learning_rate": 8.408205553253212e-05, + "loss": 1.195, + "step": 7890 + }, + { + "epoch": 1.63, + "grad_norm": 1.1065553426742554, + "learning_rate": 8.406133443845835e-05, + "loss": 1.2173, + "step": 7900 + }, + { + "epoch": 1.63, + "grad_norm": 1.1917232275009155, + "learning_rate": 8.404061334438459e-05, + "loss": 1.1926, + "step": 7910 + }, + { + "epoch": 1.63, + "grad_norm": 1.1937179565429688, + "learning_rate": 8.401989225031082e-05, + "loss": 1.18, + "step": 7920 + }, + { + "epoch": 1.64, + "grad_norm": 1.1231236457824707, + "learning_rate": 8.399917115623706e-05, + "loss": 1.2006, + "step": 7930 + }, + { + "epoch": 1.64, + "grad_norm": 1.2102551460266113, + "learning_rate": 8.397845006216328e-05, + "loss": 1.209, + "step": 7940 + }, + { + "epoch": 1.64, + "grad_norm": 1.0514934062957764, + "learning_rate": 8.395772896808953e-05, + "loss": 1.1959, + "step": 7950 + }, + { + "epoch": 1.64, + "grad_norm": 1.0771079063415527, + "learning_rate": 8.393700787401575e-05, + "loss": 1.2303, + "step": 7960 + }, + { + "epoch": 1.64, + "grad_norm": 1.1473889350891113, + "learning_rate": 8.391628677994198e-05, + "loss": 1.189, + "step": 7970 + }, + { + "epoch": 1.65, + "grad_norm": 1.101943850517273, + "learning_rate": 8.389556568586822e-05, + "loss": 1.1792, + "step": 7980 + }, + { + "epoch": 1.65, + "grad_norm": 1.217653751373291, + "learning_rate": 8.387484459179446e-05, + "loss": 1.2032, + "step": 7990 + }, + { + "epoch": 1.65, + "grad_norm": 1.1558287143707275, + "learning_rate": 8.385412349772068e-05, + "loss": 1.2048, + "step": 8000 + }, + { + "epoch": 1.65, + "grad_norm": 1.1772674322128296, + "learning_rate": 8.383340240364693e-05, + "loss": 1.208, + "step": 8010 + }, + { + "epoch": 1.65, + "grad_norm": 1.1206742525100708, + "learning_rate": 8.381268130957315e-05, + "loss": 1.2158, + "step": 8020 + }, + { + "epoch": 1.66, + "grad_norm": 1.0629864931106567, + "learning_rate": 8.379196021549938e-05, + "loss": 1.1989, + "step": 8030 + }, + { + "epoch": 1.66, + "grad_norm": 1.0601340532302856, + "learning_rate": 8.377123912142562e-05, + "loss": 1.2149, + "step": 8040 + }, + { + "epoch": 1.66, + "grad_norm": 1.2142128944396973, + "learning_rate": 8.375051802735184e-05, + "loss": 1.2045, + "step": 8050 + }, + { + "epoch": 1.66, + "grad_norm": 1.015764832496643, + "learning_rate": 8.372979693327809e-05, + "loss": 1.2177, + "step": 8060 + }, + { + "epoch": 1.67, + "grad_norm": 1.1525570154190063, + "learning_rate": 8.370907583920431e-05, + "loss": 1.2203, + "step": 8070 + }, + { + "epoch": 1.67, + "grad_norm": 1.0959409475326538, + "learning_rate": 8.368835474513054e-05, + "loss": 1.2022, + "step": 8080 + }, + { + "epoch": 1.67, + "grad_norm": 1.073423981666565, + "learning_rate": 8.366763365105678e-05, + "loss": 1.2225, + "step": 8090 + }, + { + "epoch": 1.67, + "grad_norm": 1.1058803796768188, + "learning_rate": 8.364691255698301e-05, + "loss": 1.202, + "step": 8100 + }, + { + "epoch": 1.67, + "grad_norm": 1.067132592201233, + "learning_rate": 8.362619146290924e-05, + "loss": 1.2215, + "step": 8110 + }, + { + "epoch": 1.68, + "grad_norm": 1.1303818225860596, + "learning_rate": 8.360547036883548e-05, + "loss": 1.2049, + "step": 8120 + }, + { + "epoch": 1.68, + "grad_norm": 1.1500287055969238, + "learning_rate": 8.35847492747617e-05, + "loss": 1.2241, + "step": 8130 + }, + { + "epoch": 1.68, + "grad_norm": 0.9954307675361633, + "learning_rate": 8.356402818068794e-05, + "loss": 1.1806, + "step": 8140 + }, + { + "epoch": 1.68, + "grad_norm": 1.0905861854553223, + "learning_rate": 8.354330708661418e-05, + "loss": 1.2108, + "step": 8150 + }, + { + "epoch": 1.68, + "grad_norm": 1.1111913919448853, + "learning_rate": 8.352258599254041e-05, + "loss": 1.2026, + "step": 8160 + }, + { + "epoch": 1.69, + "grad_norm": 0.9808979034423828, + "learning_rate": 8.350186489846665e-05, + "loss": 1.2101, + "step": 8170 + }, + { + "epoch": 1.69, + "grad_norm": 1.127181053161621, + "learning_rate": 8.348114380439288e-05, + "loss": 1.1955, + "step": 8180 + }, + { + "epoch": 1.69, + "grad_norm": 1.0933669805526733, + "learning_rate": 8.34604227103191e-05, + "loss": 1.1904, + "step": 8190 + }, + { + "epoch": 1.69, + "grad_norm": 1.2010207176208496, + "learning_rate": 8.343970161624535e-05, + "loss": 1.2018, + "step": 8200 + }, + { + "epoch": 1.69, + "grad_norm": 1.019642949104309, + "learning_rate": 8.341898052217157e-05, + "loss": 1.221, + "step": 8210 + }, + { + "epoch": 1.7, + "grad_norm": 1.115064024925232, + "learning_rate": 8.339825942809781e-05, + "loss": 1.193, + "step": 8220 + }, + { + "epoch": 1.7, + "grad_norm": 1.008520245552063, + "learning_rate": 8.337753833402404e-05, + "loss": 1.222, + "step": 8230 + }, + { + "epoch": 1.7, + "grad_norm": 1.0627065896987915, + "learning_rate": 8.335681723995027e-05, + "loss": 1.1968, + "step": 8240 + }, + { + "epoch": 1.7, + "grad_norm": 1.2253535985946655, + "learning_rate": 8.33360961458765e-05, + "loss": 1.2135, + "step": 8250 + }, + { + "epoch": 1.7, + "grad_norm": 1.0848592519760132, + "learning_rate": 8.331537505180274e-05, + "loss": 1.2087, + "step": 8260 + }, + { + "epoch": 1.71, + "grad_norm": 1.1441541910171509, + "learning_rate": 8.329465395772897e-05, + "loss": 1.1789, + "step": 8270 + }, + { + "epoch": 1.71, + "grad_norm": 1.1108758449554443, + "learning_rate": 8.32739328636552e-05, + "loss": 1.1778, + "step": 8280 + }, + { + "epoch": 1.71, + "grad_norm": 1.1133983135223389, + "learning_rate": 8.325321176958144e-05, + "loss": 1.2062, + "step": 8290 + }, + { + "epoch": 1.71, + "grad_norm": 1.1412779092788696, + "learning_rate": 8.323249067550766e-05, + "loss": 1.2129, + "step": 8300 + }, + { + "epoch": 1.71, + "grad_norm": 1.0338503122329712, + "learning_rate": 8.321176958143391e-05, + "loss": 1.1727, + "step": 8310 + }, + { + "epoch": 1.72, + "grad_norm": 1.1191462278366089, + "learning_rate": 8.319104848736013e-05, + "loss": 1.1997, + "step": 8320 + }, + { + "epoch": 1.72, + "grad_norm": 1.0909239053726196, + "learning_rate": 8.317032739328637e-05, + "loss": 1.2144, + "step": 8330 + }, + { + "epoch": 1.72, + "grad_norm": 1.1151865720748901, + "learning_rate": 8.31496062992126e-05, + "loss": 1.2033, + "step": 8340 + }, + { + "epoch": 1.72, + "grad_norm": 1.143604040145874, + "learning_rate": 8.312888520513884e-05, + "loss": 1.2088, + "step": 8350 + }, + { + "epoch": 1.73, + "grad_norm": 1.2317973375320435, + "learning_rate": 8.310816411106507e-05, + "loss": 1.2179, + "step": 8360 + }, + { + "epoch": 1.73, + "grad_norm": 1.1043517589569092, + "learning_rate": 8.308744301699131e-05, + "loss": 1.1913, + "step": 8370 + }, + { + "epoch": 1.73, + "grad_norm": 1.149396300315857, + "learning_rate": 8.306672192291753e-05, + "loss": 1.2066, + "step": 8380 + }, + { + "epoch": 1.73, + "grad_norm": 1.0468456745147705, + "learning_rate": 8.304600082884377e-05, + "loss": 1.1823, + "step": 8390 + }, + { + "epoch": 1.73, + "grad_norm": 1.0730814933776855, + "learning_rate": 8.302527973477e-05, + "loss": 1.2053, + "step": 8400 + }, + { + "epoch": 1.74, + "grad_norm": 1.2069722414016724, + "learning_rate": 8.300455864069622e-05, + "loss": 1.1999, + "step": 8410 + }, + { + "epoch": 1.74, + "grad_norm": 1.0964728593826294, + "learning_rate": 8.298383754662247e-05, + "loss": 1.1927, + "step": 8420 + }, + { + "epoch": 1.74, + "grad_norm": 1.0547986030578613, + "learning_rate": 8.296311645254869e-05, + "loss": 1.2198, + "step": 8430 + }, + { + "epoch": 1.74, + "grad_norm": 0.9480800628662109, + "learning_rate": 8.294239535847493e-05, + "loss": 1.1673, + "step": 8440 + }, + { + "epoch": 1.74, + "grad_norm": 1.042935848236084, + "learning_rate": 8.292167426440116e-05, + "loss": 1.211, + "step": 8450 + }, + { + "epoch": 1.75, + "grad_norm": 1.1812864542007446, + "learning_rate": 8.29009531703274e-05, + "loss": 1.1775, + "step": 8460 + }, + { + "epoch": 1.75, + "grad_norm": 1.0850603580474854, + "learning_rate": 8.288023207625363e-05, + "loss": 1.2023, + "step": 8470 + }, + { + "epoch": 1.75, + "grad_norm": 1.0494657754898071, + "learning_rate": 8.285951098217987e-05, + "loss": 1.1897, + "step": 8480 + }, + { + "epoch": 1.75, + "grad_norm": 1.2138115167617798, + "learning_rate": 8.283878988810609e-05, + "loss": 1.1691, + "step": 8490 + }, + { + "epoch": 1.75, + "grad_norm": 1.0943918228149414, + "learning_rate": 8.281806879403234e-05, + "loss": 1.1972, + "step": 8500 + }, + { + "epoch": 1.76, + "grad_norm": 1.0463635921478271, + "learning_rate": 8.279734769995856e-05, + "loss": 1.2312, + "step": 8510 + }, + { + "epoch": 1.76, + "grad_norm": 1.1013418436050415, + "learning_rate": 8.27766266058848e-05, + "loss": 1.2311, + "step": 8520 + }, + { + "epoch": 1.76, + "grad_norm": 0.9996619820594788, + "learning_rate": 8.275590551181103e-05, + "loss": 1.2129, + "step": 8530 + }, + { + "epoch": 1.76, + "grad_norm": 1.0416114330291748, + "learning_rate": 8.273518441773727e-05, + "loss": 1.1822, + "step": 8540 + }, + { + "epoch": 1.76, + "grad_norm": 1.0862529277801514, + "learning_rate": 8.271446332366349e-05, + "loss": 1.2145, + "step": 8550 + }, + { + "epoch": 1.77, + "grad_norm": 1.1965521574020386, + "learning_rate": 8.269374222958972e-05, + "loss": 1.1969, + "step": 8560 + }, + { + "epoch": 1.77, + "grad_norm": 1.1601965427398682, + "learning_rate": 8.267302113551596e-05, + "loss": 1.2112, + "step": 8570 + }, + { + "epoch": 1.77, + "grad_norm": 1.0845617055892944, + "learning_rate": 8.265230004144219e-05, + "loss": 1.1885, + "step": 8580 + }, + { + "epoch": 1.77, + "grad_norm": 1.0907461643218994, + "learning_rate": 8.263157894736843e-05, + "loss": 1.2091, + "step": 8590 + }, + { + "epoch": 1.77, + "grad_norm": 1.027777075767517, + "learning_rate": 8.261085785329465e-05, + "loss": 1.1919, + "step": 8600 + }, + { + "epoch": 1.78, + "grad_norm": 1.2468430995941162, + "learning_rate": 8.25901367592209e-05, + "loss": 1.1948, + "step": 8610 + }, + { + "epoch": 1.78, + "grad_norm": 1.1889891624450684, + "learning_rate": 8.256941566514712e-05, + "loss": 1.2143, + "step": 8620 + }, + { + "epoch": 1.78, + "grad_norm": 1.2794381380081177, + "learning_rate": 8.254869457107336e-05, + "loss": 1.2132, + "step": 8630 + }, + { + "epoch": 1.78, + "grad_norm": 1.0493546724319458, + "learning_rate": 8.252797347699959e-05, + "loss": 1.1766, + "step": 8640 + }, + { + "epoch": 1.78, + "grad_norm": 1.001658320426941, + "learning_rate": 8.250725238292583e-05, + "loss": 1.1715, + "step": 8650 + }, + { + "epoch": 1.79, + "grad_norm": 1.06058669090271, + "learning_rate": 8.248653128885205e-05, + "loss": 1.1793, + "step": 8660 + }, + { + "epoch": 1.79, + "grad_norm": 1.09765625, + "learning_rate": 8.24658101947783e-05, + "loss": 1.1954, + "step": 8670 + }, + { + "epoch": 1.79, + "grad_norm": 1.0230962038040161, + "learning_rate": 8.244508910070452e-05, + "loss": 1.2158, + "step": 8680 + }, + { + "epoch": 1.79, + "grad_norm": 1.028173804283142, + "learning_rate": 8.242436800663075e-05, + "loss": 1.1874, + "step": 8690 + }, + { + "epoch": 1.8, + "grad_norm": 1.0620988607406616, + "learning_rate": 8.240364691255699e-05, + "loss": 1.2101, + "step": 8700 + }, + { + "epoch": 1.8, + "grad_norm": 1.0968023538589478, + "learning_rate": 8.238292581848322e-05, + "loss": 1.2131, + "step": 8710 + }, + { + "epoch": 1.8, + "grad_norm": 1.1402392387390137, + "learning_rate": 8.236220472440946e-05, + "loss": 1.2173, + "step": 8720 + }, + { + "epoch": 1.8, + "grad_norm": 1.1263937950134277, + "learning_rate": 8.234148363033568e-05, + "loss": 1.1796, + "step": 8730 + }, + { + "epoch": 1.8, + "grad_norm": 1.1383157968521118, + "learning_rate": 8.232076253626191e-05, + "loss": 1.1724, + "step": 8740 + }, + { + "epoch": 1.81, + "grad_norm": 1.255072832107544, + "learning_rate": 8.230004144218815e-05, + "loss": 1.2019, + "step": 8750 + }, + { + "epoch": 1.81, + "grad_norm": 1.4536107778549194, + "learning_rate": 8.227932034811438e-05, + "loss": 1.189, + "step": 8760 + }, + { + "epoch": 1.81, + "grad_norm": 1.0804840326309204, + "learning_rate": 8.225859925404062e-05, + "loss": 1.2009, + "step": 8770 + }, + { + "epoch": 1.81, + "grad_norm": 1.1545298099517822, + "learning_rate": 8.223787815996686e-05, + "loss": 1.2047, + "step": 8780 + }, + { + "epoch": 1.81, + "grad_norm": 1.2308061122894287, + "learning_rate": 8.221715706589308e-05, + "loss": 1.1958, + "step": 8790 + }, + { + "epoch": 1.82, + "grad_norm": 1.2053345441818237, + "learning_rate": 8.219643597181933e-05, + "loss": 1.183, + "step": 8800 + }, + { + "epoch": 1.82, + "grad_norm": 1.2016477584838867, + "learning_rate": 8.217571487774555e-05, + "loss": 1.1831, + "step": 8810 + }, + { + "epoch": 1.82, + "grad_norm": 1.2991312742233276, + "learning_rate": 8.215499378367178e-05, + "loss": 1.1893, + "step": 8820 + }, + { + "epoch": 1.82, + "grad_norm": 1.0688191652297974, + "learning_rate": 8.213427268959802e-05, + "loss": 1.1962, + "step": 8830 + }, + { + "epoch": 1.82, + "grad_norm": 1.0042345523834229, + "learning_rate": 8.211355159552425e-05, + "loss": 1.1935, + "step": 8840 + }, + { + "epoch": 1.83, + "grad_norm": 1.1283092498779297, + "learning_rate": 8.209283050145047e-05, + "loss": 1.1983, + "step": 8850 + }, + { + "epoch": 1.83, + "grad_norm": 1.1149276494979858, + "learning_rate": 8.207210940737672e-05, + "loss": 1.1967, + "step": 8860 + }, + { + "epoch": 1.83, + "grad_norm": 1.0818403959274292, + "learning_rate": 8.205138831330294e-05, + "loss": 1.187, + "step": 8870 + }, + { + "epoch": 1.83, + "grad_norm": 1.1263095140457153, + "learning_rate": 8.203066721922918e-05, + "loss": 1.1866, + "step": 8880 + }, + { + "epoch": 1.83, + "grad_norm": 1.0879017114639282, + "learning_rate": 8.200994612515541e-05, + "loss": 1.2018, + "step": 8890 + }, + { + "epoch": 1.84, + "grad_norm": 1.1448615789413452, + "learning_rate": 8.198922503108164e-05, + "loss": 1.1879, + "step": 8900 + }, + { + "epoch": 1.84, + "grad_norm": 0.9900506138801575, + "learning_rate": 8.196850393700788e-05, + "loss": 1.1985, + "step": 8910 + }, + { + "epoch": 1.84, + "grad_norm": 1.0438898801803589, + "learning_rate": 8.19477828429341e-05, + "loss": 1.1881, + "step": 8920 + }, + { + "epoch": 1.84, + "grad_norm": 1.1896075010299683, + "learning_rate": 8.192706174886034e-05, + "loss": 1.2022, + "step": 8930 + }, + { + "epoch": 1.84, + "grad_norm": 1.152300238609314, + "learning_rate": 8.190634065478658e-05, + "loss": 1.1828, + "step": 8940 + }, + { + "epoch": 1.85, + "grad_norm": 1.0616978406906128, + "learning_rate": 8.188561956071281e-05, + "loss": 1.1788, + "step": 8950 + }, + { + "epoch": 1.85, + "grad_norm": 1.0310215950012207, + "learning_rate": 8.186489846663903e-05, + "loss": 1.1898, + "step": 8960 + }, + { + "epoch": 1.85, + "grad_norm": 1.1227622032165527, + "learning_rate": 8.184417737256528e-05, + "loss": 1.1936, + "step": 8970 + }, + { + "epoch": 1.85, + "grad_norm": 1.0663117170333862, + "learning_rate": 8.18234562784915e-05, + "loss": 1.1857, + "step": 8980 + }, + { + "epoch": 1.86, + "grad_norm": 1.202330470085144, + "learning_rate": 8.180273518441774e-05, + "loss": 1.1811, + "step": 8990 + }, + { + "epoch": 1.86, + "grad_norm": 1.036596417427063, + "learning_rate": 8.178201409034397e-05, + "loss": 1.2013, + "step": 9000 + }, + { + "epoch": 1.86, + "grad_norm": 1.0503324270248413, + "learning_rate": 8.176129299627021e-05, + "loss": 1.1755, + "step": 9010 + }, + { + "epoch": 1.86, + "grad_norm": 1.0510581731796265, + "learning_rate": 8.174057190219644e-05, + "loss": 1.1936, + "step": 9020 + }, + { + "epoch": 1.86, + "grad_norm": 1.0580915212631226, + "learning_rate": 8.171985080812268e-05, + "loss": 1.2006, + "step": 9030 + }, + { + "epoch": 1.87, + "grad_norm": 1.0723899602890015, + "learning_rate": 8.16991297140489e-05, + "loss": 1.1841, + "step": 9040 + }, + { + "epoch": 1.87, + "grad_norm": 1.0289912223815918, + "learning_rate": 8.167840861997515e-05, + "loss": 1.1762, + "step": 9050 + }, + { + "epoch": 1.87, + "grad_norm": 1.2664040327072144, + "learning_rate": 8.165768752590137e-05, + "loss": 1.1695, + "step": 9060 + }, + { + "epoch": 1.87, + "grad_norm": 1.0689359903335571, + "learning_rate": 8.163696643182759e-05, + "loss": 1.1888, + "step": 9070 + }, + { + "epoch": 1.87, + "grad_norm": 1.131007194519043, + "learning_rate": 8.161624533775384e-05, + "loss": 1.1856, + "step": 9080 + }, + { + "epoch": 1.88, + "grad_norm": 1.06068754196167, + "learning_rate": 8.159552424368006e-05, + "loss": 1.2055, + "step": 9090 + }, + { + "epoch": 1.88, + "grad_norm": 1.0843102931976318, + "learning_rate": 8.15748031496063e-05, + "loss": 1.1898, + "step": 9100 + }, + { + "epoch": 1.88, + "grad_norm": 1.141878366470337, + "learning_rate": 8.155408205553253e-05, + "loss": 1.1885, + "step": 9110 + }, + { + "epoch": 1.88, + "grad_norm": 1.1082584857940674, + "learning_rate": 8.153336096145877e-05, + "loss": 1.1843, + "step": 9120 + }, + { + "epoch": 1.88, + "grad_norm": 1.1285297870635986, + "learning_rate": 8.1512639867385e-05, + "loss": 1.2126, + "step": 9130 + }, + { + "epoch": 1.89, + "grad_norm": 1.2701120376586914, + "learning_rate": 8.149191877331124e-05, + "loss": 1.1816, + "step": 9140 + }, + { + "epoch": 1.89, + "grad_norm": 1.1763367652893066, + "learning_rate": 8.147119767923746e-05, + "loss": 1.2031, + "step": 9150 + }, + { + "epoch": 1.89, + "grad_norm": 1.0942009687423706, + "learning_rate": 8.145047658516371e-05, + "loss": 1.1922, + "step": 9160 + }, + { + "epoch": 1.89, + "grad_norm": 1.1373592615127563, + "learning_rate": 8.142975549108993e-05, + "loss": 1.1891, + "step": 9170 + }, + { + "epoch": 1.89, + "grad_norm": 1.0547840595245361, + "learning_rate": 8.140903439701617e-05, + "loss": 1.2119, + "step": 9180 + }, + { + "epoch": 1.9, + "grad_norm": 0.973156213760376, + "learning_rate": 8.13883133029424e-05, + "loss": 1.1929, + "step": 9190 + }, + { + "epoch": 1.9, + "grad_norm": 1.07832670211792, + "learning_rate": 8.136759220886864e-05, + "loss": 1.1693, + "step": 9200 + }, + { + "epoch": 1.9, + "grad_norm": 1.173492670059204, + "learning_rate": 8.134687111479487e-05, + "loss": 1.1998, + "step": 9210 + }, + { + "epoch": 1.9, + "grad_norm": 1.161414623260498, + "learning_rate": 8.132615002072109e-05, + "loss": 1.1769, + "step": 9220 + }, + { + "epoch": 1.9, + "grad_norm": 1.1352944374084473, + "learning_rate": 8.130542892664733e-05, + "loss": 1.1837, + "step": 9230 + }, + { + "epoch": 1.91, + "grad_norm": 1.102401852607727, + "learning_rate": 8.128470783257356e-05, + "loss": 1.175, + "step": 9240 + }, + { + "epoch": 1.91, + "grad_norm": 1.2963926792144775, + "learning_rate": 8.12639867384998e-05, + "loss": 1.1782, + "step": 9250 + }, + { + "epoch": 1.91, + "grad_norm": 1.0536015033721924, + "learning_rate": 8.124326564442602e-05, + "loss": 1.1799, + "step": 9260 + }, + { + "epoch": 1.91, + "grad_norm": 1.1193984746932983, + "learning_rate": 8.122254455035227e-05, + "loss": 1.1729, + "step": 9270 + }, + { + "epoch": 1.91, + "grad_norm": 1.1112126111984253, + "learning_rate": 8.120182345627849e-05, + "loss": 1.168, + "step": 9280 + }, + { + "epoch": 1.92, + "grad_norm": 1.0647683143615723, + "learning_rate": 8.118110236220473e-05, + "loss": 1.2057, + "step": 9290 + }, + { + "epoch": 1.92, + "grad_norm": 1.1709637641906738, + "learning_rate": 8.116038126813096e-05, + "loss": 1.1757, + "step": 9300 + }, + { + "epoch": 1.92, + "grad_norm": 1.0774716138839722, + "learning_rate": 8.11396601740572e-05, + "loss": 1.1784, + "step": 9310 + }, + { + "epoch": 1.92, + "grad_norm": 1.088477373123169, + "learning_rate": 8.111893907998343e-05, + "loss": 1.1726, + "step": 9320 + }, + { + "epoch": 1.93, + "grad_norm": 1.0596317052841187, + "learning_rate": 8.109821798590967e-05, + "loss": 1.2029, + "step": 9330 + }, + { + "epoch": 1.93, + "grad_norm": 1.1406421661376953, + "learning_rate": 8.107749689183589e-05, + "loss": 1.2154, + "step": 9340 + }, + { + "epoch": 1.93, + "grad_norm": 1.1345916986465454, + "learning_rate": 8.105677579776214e-05, + "loss": 1.1684, + "step": 9350 + }, + { + "epoch": 1.93, + "grad_norm": 1.2776840925216675, + "learning_rate": 8.103605470368836e-05, + "loss": 1.1915, + "step": 9360 + }, + { + "epoch": 1.93, + "grad_norm": 1.1104062795639038, + "learning_rate": 8.101533360961459e-05, + "loss": 1.1935, + "step": 9370 + }, + { + "epoch": 1.94, + "grad_norm": 1.0262198448181152, + "learning_rate": 8.099461251554083e-05, + "loss": 1.2047, + "step": 9380 + }, + { + "epoch": 1.94, + "grad_norm": 1.1350001096725464, + "learning_rate": 8.097389142146705e-05, + "loss": 1.1798, + "step": 9390 + }, + { + "epoch": 1.94, + "grad_norm": 1.0631601810455322, + "learning_rate": 8.095317032739328e-05, + "loss": 1.1744, + "step": 9400 + }, + { + "epoch": 1.94, + "grad_norm": 1.1678036451339722, + "learning_rate": 8.093244923331952e-05, + "loss": 1.208, + "step": 9410 + }, + { + "epoch": 1.94, + "grad_norm": 1.0665799379348755, + "learning_rate": 8.091172813924576e-05, + "loss": 1.1792, + "step": 9420 + }, + { + "epoch": 1.95, + "grad_norm": 1.1364054679870605, + "learning_rate": 8.089100704517199e-05, + "loss": 1.2036, + "step": 9430 + }, + { + "epoch": 1.95, + "grad_norm": 1.0586035251617432, + "learning_rate": 8.087028595109823e-05, + "loss": 1.201, + "step": 9440 + }, + { + "epoch": 1.95, + "grad_norm": 0.9820613861083984, + "learning_rate": 8.084956485702445e-05, + "loss": 1.1912, + "step": 9450 + }, + { + "epoch": 1.95, + "grad_norm": 1.0832290649414062, + "learning_rate": 8.08288437629507e-05, + "loss": 1.2044, + "step": 9460 + }, + { + "epoch": 1.95, + "grad_norm": 1.2247551679611206, + "learning_rate": 8.080812266887692e-05, + "loss": 1.1613, + "step": 9470 + }, + { + "epoch": 1.96, + "grad_norm": 1.0764803886413574, + "learning_rate": 8.078740157480315e-05, + "loss": 1.1885, + "step": 9480 + }, + { + "epoch": 1.96, + "grad_norm": 1.1871037483215332, + "learning_rate": 8.076668048072939e-05, + "loss": 1.2089, + "step": 9490 + }, + { + "epoch": 1.96, + "grad_norm": 1.0709806680679321, + "learning_rate": 8.074595938665562e-05, + "loss": 1.1743, + "step": 9500 + }, + { + "epoch": 1.96, + "grad_norm": 1.2077934741973877, + "learning_rate": 8.072523829258184e-05, + "loss": 1.192, + "step": 9510 + }, + { + "epoch": 1.96, + "grad_norm": 1.111323356628418, + "learning_rate": 8.070451719850809e-05, + "loss": 1.178, + "step": 9520 + }, + { + "epoch": 1.97, + "grad_norm": 1.1351996660232544, + "learning_rate": 8.06858682138417e-05, + "loss": 1.1921, + "step": 9530 + }, + { + "epoch": 1.97, + "grad_norm": 1.1319098472595215, + "learning_rate": 8.066514711976792e-05, + "loss": 1.2069, + "step": 9540 + }, + { + "epoch": 1.97, + "grad_norm": 1.080480694770813, + "learning_rate": 8.064442602569417e-05, + "loss": 1.1949, + "step": 9550 + }, + { + "epoch": 1.97, + "grad_norm": 1.126483678817749, + "learning_rate": 8.062370493162039e-05, + "loss": 1.1523, + "step": 9560 + }, + { + "epoch": 1.97, + "grad_norm": 1.0840978622436523, + "learning_rate": 8.060298383754663e-05, + "loss": 1.2074, + "step": 9570 + }, + { + "epoch": 1.98, + "grad_norm": 1.1075447797775269, + "learning_rate": 8.058226274347286e-05, + "loss": 1.1937, + "step": 9580 + }, + { + "epoch": 1.98, + "grad_norm": 1.0781664848327637, + "learning_rate": 8.05615416493991e-05, + "loss": 1.1837, + "step": 9590 + }, + { + "epoch": 1.98, + "grad_norm": 1.0375909805297852, + "learning_rate": 8.054082055532532e-05, + "loss": 1.1928, + "step": 9600 + }, + { + "epoch": 1.98, + "grad_norm": 1.101149320602417, + "learning_rate": 8.052009946125155e-05, + "loss": 1.2006, + "step": 9610 + }, + { + "epoch": 1.99, + "grad_norm": 0.9537743926048279, + "learning_rate": 8.049937836717779e-05, + "loss": 1.1939, + "step": 9620 + }, + { + "epoch": 1.99, + "grad_norm": 1.1106035709381104, + "learning_rate": 8.047865727310402e-05, + "loss": 1.1874, + "step": 9630 + }, + { + "epoch": 1.99, + "grad_norm": 1.1406303644180298, + "learning_rate": 8.045793617903026e-05, + "loss": 1.1885, + "step": 9640 + }, + { + "epoch": 1.99, + "grad_norm": 1.0510179996490479, + "learning_rate": 8.043721508495648e-05, + "loss": 1.1831, + "step": 9650 + }, + { + "epoch": 1.99, + "grad_norm": 1.1432597637176514, + "learning_rate": 8.041649399088273e-05, + "loss": 1.1746, + "step": 9660 + }, + { + "epoch": 2.0, + "grad_norm": 1.0710557699203491, + "learning_rate": 8.039577289680895e-05, + "loss": 1.1668, + "step": 9670 + }, + { + "epoch": 2.0, + "grad_norm": 1.067130446434021, + "learning_rate": 8.037505180273519e-05, + "loss": 1.1895, + "step": 9680 + }, + { + "epoch": 2.0, + "grad_norm": 1.1639565229415894, + "learning_rate": 8.035433070866142e-05, + "loss": 1.1786, + "step": 9690 + }, + { + "epoch": 2.0, + "eval_loss": 1.2795084714889526, + "eval_runtime": 1604.6697, + "eval_samples_per_second": 262.864, + "eval_steps_per_second": 4.107, + "step": 9692 + } + ], + "logging_steps": 10, + "max_steps": 48460, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 4.1335523730815713e+18, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}