{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9938573003622615, "eval_steps": 500, "global_step": 297, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010080327610647345, "grad_norm": 20.48233413696289, "learning_rate": 6.666666666666667e-07, "loss": 1.318, "step": 1 }, { "epoch": 0.02016065522129469, "grad_norm": 22.18445587158203, "learning_rate": 1.3333333333333334e-06, "loss": 1.2034, "step": 2 }, { "epoch": 0.03024098283194204, "grad_norm": 20.34792137145996, "learning_rate": 2.0000000000000003e-06, "loss": 1.0455, "step": 3 }, { "epoch": 0.04032131044258938, "grad_norm": 8.893705368041992, "learning_rate": 2.666666666666667e-06, "loss": 2.0515, "step": 4 }, { "epoch": 0.05040163805323673, "grad_norm": 6.274827003479004, "learning_rate": 3.3333333333333333e-06, "loss": 1.0793, "step": 5 }, { "epoch": 0.06048196566388408, "grad_norm": 6.051919937133789, "learning_rate": 4.000000000000001e-06, "loss": 0.9434, "step": 6 }, { "epoch": 0.07056229327453142, "grad_norm": 7.502919673919678, "learning_rate": 4.666666666666667e-06, "loss": 0.9741, "step": 7 }, { "epoch": 0.08064262088517876, "grad_norm": 6.437217712402344, "learning_rate": 5.333333333333334e-06, "loss": 1.1227, "step": 8 }, { "epoch": 0.09072294849582611, "grad_norm": 5.169360160827637, "learning_rate": 6e-06, "loss": 0.9156, "step": 9 }, { "epoch": 0.10080327610647347, "grad_norm": 5.1474432945251465, "learning_rate": 6.666666666666667e-06, "loss": 1.0925, "step": 10 }, { "epoch": 0.1108836037171208, "grad_norm": 6.759896755218506, "learning_rate": 7.333333333333333e-06, "loss": 0.8985, "step": 11 }, { "epoch": 0.12096393132776816, "grad_norm": 5.231770992279053, "learning_rate": 8.000000000000001e-06, "loss": 1.6954, "step": 12 }, { "epoch": 0.1310442589384155, "grad_norm": 5.028665542602539, "learning_rate": 8.666666666666668e-06, "loss": 0.9763, "step": 13 }, { "epoch": 0.14112458654906285, "grad_norm": 7.008236408233643, "learning_rate": 9.333333333333334e-06, "loss": 0.9969, "step": 14 }, { "epoch": 0.15120491415971019, "grad_norm": 4.675139904022217, "learning_rate": 1e-05, "loss": 0.8619, "step": 15 }, { "epoch": 0.16128524177035752, "grad_norm": 5.249491214752197, "learning_rate": 1.0666666666666667e-05, "loss": 0.9176, "step": 16 }, { "epoch": 0.1713655693810049, "grad_norm": 4.5402092933654785, "learning_rate": 1.1333333333333334e-05, "loss": 0.8809, "step": 17 }, { "epoch": 0.18144589699165223, "grad_norm": 4.799923896789551, "learning_rate": 1.2e-05, "loss": 1.005, "step": 18 }, { "epoch": 0.19152622460229957, "grad_norm": 3.9302682876586914, "learning_rate": 1.2666666666666667e-05, "loss": 0.8784, "step": 19 }, { "epoch": 0.20160655221294693, "grad_norm": 4.542870044708252, "learning_rate": 1.3333333333333333e-05, "loss": 1.0378, "step": 20 }, { "epoch": 0.21168687982359427, "grad_norm": 10.4898042678833, "learning_rate": 1.4e-05, "loss": 0.8458, "step": 21 }, { "epoch": 0.2217672074342416, "grad_norm": 4.678144454956055, "learning_rate": 1.4666666666666666e-05, "loss": 0.9237, "step": 22 }, { "epoch": 0.23184753504488895, "grad_norm": 4.897671222686768, "learning_rate": 1.5333333333333334e-05, "loss": 0.9661, "step": 23 }, { "epoch": 0.2419278626555363, "grad_norm": 5.067351818084717, "learning_rate": 1.6000000000000003e-05, "loss": 0.9975, "step": 24 }, { "epoch": 0.25200819026618365, "grad_norm": 3.7107274532318115, "learning_rate": 1.6666666666666667e-05, "loss": 0.9316, "step": 25 }, { "epoch": 0.262088517876831, "grad_norm": 4.553698539733887, "learning_rate": 1.7333333333333336e-05, "loss": 0.871, "step": 26 }, { "epoch": 0.27216884548747833, "grad_norm": 5.10447359085083, "learning_rate": 1.8e-05, "loss": 0.8687, "step": 27 }, { "epoch": 0.2822491730981257, "grad_norm": 3.7116832733154297, "learning_rate": 1.866666666666667e-05, "loss": 0.8586, "step": 28 }, { "epoch": 0.29232950070877306, "grad_norm": 5.299854755401611, "learning_rate": 1.9333333333333333e-05, "loss": 0.8595, "step": 29 }, { "epoch": 0.30240982831942037, "grad_norm": 3.825899600982666, "learning_rate": 2e-05, "loss": 0.9615, "step": 30 }, { "epoch": 0.31249015593006774, "grad_norm": 4.806526184082031, "learning_rate": 1.999930778307066e-05, "loss": 0.8595, "step": 31 }, { "epoch": 0.32257048354071505, "grad_norm": 3.5444633960723877, "learning_rate": 1.9997231228115487e-05, "loss": 0.9748, "step": 32 }, { "epoch": 0.3326508111513624, "grad_norm": 4.36836051940918, "learning_rate": 1.9993770622619784e-05, "loss": 0.8577, "step": 33 }, { "epoch": 0.3427311387620098, "grad_norm": 3.0740671157836914, "learning_rate": 1.9988926445681495e-05, "loss": 0.8407, "step": 34 }, { "epoch": 0.3528114663726571, "grad_norm": 3.8326187133789062, "learning_rate": 1.998269936794487e-05, "loss": 0.8997, "step": 35 }, { "epoch": 0.36289179398330446, "grad_norm": 3.6133008003234863, "learning_rate": 1.9975090251507637e-05, "loss": 0.9572, "step": 36 }, { "epoch": 0.3729721215939518, "grad_norm": 4.111402988433838, "learning_rate": 1.9966100149801648e-05, "loss": 0.8465, "step": 37 }, { "epoch": 0.38305244920459913, "grad_norm": 3.105464220046997, "learning_rate": 1.9955730307447015e-05, "loss": 0.84, "step": 38 }, { "epoch": 0.3931327768152465, "grad_norm": 3.377089738845825, "learning_rate": 1.9943982160079823e-05, "loss": 0.977, "step": 39 }, { "epoch": 0.40321310442589386, "grad_norm": 3.674912214279175, "learning_rate": 1.9930857334153374e-05, "loss": 0.9114, "step": 40 }, { "epoch": 0.4132934320365412, "grad_norm": 3.491791248321533, "learning_rate": 1.9916357646713006e-05, "loss": 0.8507, "step": 41 }, { "epoch": 0.42337375964718854, "grad_norm": 3.5988316535949707, "learning_rate": 1.9900485105144544e-05, "loss": 0.8459, "step": 42 }, { "epoch": 0.4334540872578359, "grad_norm": 3.147287368774414, "learning_rate": 1.988324190689639e-05, "loss": 0.9254, "step": 43 }, { "epoch": 0.4435344148684832, "grad_norm": 3.4546704292297363, "learning_rate": 1.9864630439175282e-05, "loss": 0.9388, "step": 44 }, { "epoch": 0.4536147424791306, "grad_norm": 3.39437198638916, "learning_rate": 1.9844653278615836e-05, "loss": 0.8751, "step": 45 }, { "epoch": 0.4636950700897779, "grad_norm": 2.966585159301758, "learning_rate": 1.9823313190923797e-05, "loss": 0.833, "step": 46 }, { "epoch": 0.47377539770042526, "grad_norm": 6.668085098266602, "learning_rate": 1.9800613130493158e-05, "loss": 0.9399, "step": 47 }, { "epoch": 0.4838557253110726, "grad_norm": 4.198956489562988, "learning_rate": 1.9776556239997146e-05, "loss": 0.8604, "step": 48 }, { "epoch": 0.49393605292171994, "grad_norm": 3.0325896739959717, "learning_rate": 1.9751145849953135e-05, "loss": 0.8399, "step": 49 }, { "epoch": 0.5040163805323673, "grad_norm": 3.0284290313720703, "learning_rate": 1.972438547826156e-05, "loss": 0.962, "step": 50 }, { "epoch": 0.5140967081430147, "grad_norm": 3.7126681804656982, "learning_rate": 1.9696278829718882e-05, "loss": 0.8381, "step": 51 }, { "epoch": 0.524177035753662, "grad_norm": 3.0753326416015625, "learning_rate": 1.9666829795504693e-05, "loss": 1.2808, "step": 52 }, { "epoch": 0.5342573633643093, "grad_norm": 3.3041043281555176, "learning_rate": 1.9636042452643004e-05, "loss": 1.0921, "step": 53 }, { "epoch": 0.5443376909749567, "grad_norm": 2.974684953689575, "learning_rate": 1.9603921063437795e-05, "loss": 1.1766, "step": 54 }, { "epoch": 0.554418018585604, "grad_norm": 3.104491710662842, "learning_rate": 1.9570470074882947e-05, "loss": 0.8245, "step": 55 }, { "epoch": 0.5644983461962514, "grad_norm": 2.8233213424682617, "learning_rate": 1.9535694118046584e-05, "loss": 0.8327, "step": 56 }, { "epoch": 0.5745786738068988, "grad_norm": 3.0855748653411865, "learning_rate": 1.949959800742991e-05, "loss": 0.8333, "step": 57 }, { "epoch": 0.5846590014175461, "grad_norm": 3.309098482131958, "learning_rate": 1.9462186740300697e-05, "loss": 0.8437, "step": 58 }, { "epoch": 0.5947393290281934, "grad_norm": 3.7956173419952393, "learning_rate": 1.942346549600144e-05, "loss": 0.8533, "step": 59 }, { "epoch": 0.6048196566388407, "grad_norm": 2.885507106781006, "learning_rate": 1.9383439635232296e-05, "loss": 0.9791, "step": 60 }, { "epoch": 0.6148999842494881, "grad_norm": 2.976921319961548, "learning_rate": 1.9342114699308962e-05, "loss": 0.9537, "step": 61 }, { "epoch": 0.6249803118601355, "grad_norm": 3.9822583198547363, "learning_rate": 1.9299496409395482e-05, "loss": 0.8513, "step": 62 }, { "epoch": 0.6350606394707828, "grad_norm": 3.2742626667022705, "learning_rate": 1.9255590665712214e-05, "loss": 0.8606, "step": 63 }, { "epoch": 0.6451409670814301, "grad_norm": 2.989588975906372, "learning_rate": 1.921040354671897e-05, "loss": 1.0236, "step": 64 }, { "epoch": 0.6552212946920775, "grad_norm": 3.3849992752075195, "learning_rate": 1.9163941308273504e-05, "loss": 0.8274, "step": 65 }, { "epoch": 0.6653016223027248, "grad_norm": 2.9485199451446533, "learning_rate": 1.911621038276542e-05, "loss": 0.8366, "step": 66 }, { "epoch": 0.6753819499133722, "grad_norm": 2.8090484142303467, "learning_rate": 1.9067217378225655e-05, "loss": 1.0603, "step": 67 }, { "epoch": 0.6854622775240196, "grad_norm": 2.795466899871826, "learning_rate": 1.9016969077411645e-05, "loss": 1.0391, "step": 68 }, { "epoch": 0.6955426051346669, "grad_norm": 3.0513088703155518, "learning_rate": 1.8965472436868288e-05, "loss": 0.9469, "step": 69 }, { "epoch": 0.7056229327453142, "grad_norm": 2.89764404296875, "learning_rate": 1.891273458596486e-05, "loss": 0.8358, "step": 70 }, { "epoch": 0.7157032603559615, "grad_norm": 2.940281629562378, "learning_rate": 1.8858762825908e-05, "loss": 0.9117, "step": 71 }, { "epoch": 0.7257835879666089, "grad_norm": 3.0357506275177, "learning_rate": 1.8803564628730916e-05, "loss": 0.8441, "step": 72 }, { "epoch": 0.7358639155772563, "grad_norm": 3.0957512855529785, "learning_rate": 1.874714763625892e-05, "loss": 0.8154, "step": 73 }, { "epoch": 0.7459442431879036, "grad_norm": 3.3382112979888916, "learning_rate": 1.8689519659051467e-05, "loss": 1.0091, "step": 74 }, { "epoch": 0.7560245707985509, "grad_norm": 2.7359678745269775, "learning_rate": 1.8630688675320844e-05, "loss": 0.9901, "step": 75 }, { "epoch": 0.7661048984091983, "grad_norm": 2.9160029888153076, "learning_rate": 1.8570662829827632e-05, "loss": 1.0645, "step": 76 }, { "epoch": 0.7761852260198456, "grad_norm": 3.7096657752990723, "learning_rate": 1.8509450432753123e-05, "loss": 0.8458, "step": 77 }, { "epoch": 0.786265553630493, "grad_norm": 2.9605114459991455, "learning_rate": 1.8447059958548822e-05, "loss": 0.8315, "step": 78 }, { "epoch": 0.7963458812411404, "grad_norm": 3.1716909408569336, "learning_rate": 1.8383500044763226e-05, "loss": 0.8427, "step": 79 }, { "epoch": 0.8064262088517877, "grad_norm": 4.014035224914551, "learning_rate": 1.8318779490846005e-05, "loss": 0.8391, "step": 80 }, { "epoch": 0.816506536462435, "grad_norm": 2.6693358421325684, "learning_rate": 1.8252907256929777e-05, "loss": 1.0444, "step": 81 }, { "epoch": 0.8265868640730824, "grad_norm": 3.824836254119873, "learning_rate": 1.818589246258964e-05, "loss": 0.9738, "step": 82 }, { "epoch": 0.8366671916837297, "grad_norm": 3.0832369327545166, "learning_rate": 1.8117744385580627e-05, "loss": 0.9109, "step": 83 }, { "epoch": 0.8467475192943771, "grad_norm": 3.707331657409668, "learning_rate": 1.804847246055326e-05, "loss": 0.8324, "step": 84 }, { "epoch": 0.8568278469050244, "grad_norm": 3.3008220195770264, "learning_rate": 1.797808627774738e-05, "loss": 0.9383, "step": 85 }, { "epoch": 0.8669081745156718, "grad_norm": 3.1887435913085938, "learning_rate": 1.7906595581664462e-05, "loss": 0.8441, "step": 86 }, { "epoch": 0.8769885021263191, "grad_norm": 2.8672823905944824, "learning_rate": 1.7834010269718526e-05, "loss": 0.8353, "step": 87 }, { "epoch": 0.8870688297369664, "grad_norm": 3.0346055030822754, "learning_rate": 1.776034039086592e-05, "loss": 1.348, "step": 88 }, { "epoch": 0.8971491573476138, "grad_norm": 3.2000229358673096, "learning_rate": 1.768559614421411e-05, "loss": 0.8544, "step": 89 }, { "epoch": 0.9072294849582612, "grad_norm": 3.007753610610962, "learning_rate": 1.7609787877609678e-05, "loss": 0.8505, "step": 90 }, { "epoch": 0.9173098125689085, "grad_norm": 2.9321534633636475, "learning_rate": 1.753292608620573e-05, "loss": 0.8402, "step": 91 }, { "epoch": 0.9273901401795558, "grad_norm": 2.6721303462982178, "learning_rate": 1.7455021411008906e-05, "loss": 0.8421, "step": 92 }, { "epoch": 0.9374704677902032, "grad_norm": 3.0125327110290527, "learning_rate": 1.7376084637406222e-05, "loss": 0.8443, "step": 93 }, { "epoch": 0.9475507954008505, "grad_norm": 2.6522045135498047, "learning_rate": 1.7296126693671886e-05, "loss": 1.2249, "step": 94 }, { "epoch": 0.9576311230114979, "grad_norm": 2.6742281913757324, "learning_rate": 1.721515864945435e-05, "loss": 0.846, "step": 95 }, { "epoch": 0.9677114506221453, "grad_norm": 2.9093592166900635, "learning_rate": 1.7133191714243805e-05, "loss": 0.9391, "step": 96 }, { "epoch": 0.9777917782327926, "grad_norm": 3.675670623779297, "learning_rate": 1.7050237235820287e-05, "loss": 0.8723, "step": 97 }, { "epoch": 0.9878721058434399, "grad_norm": 2.698991298675537, "learning_rate": 1.6966306698682672e-05, "loss": 0.8491, "step": 98 }, { "epoch": 0.9979524334540872, "grad_norm": 2.8708646297454834, "learning_rate": 1.6881411722458688e-05, "loss": 1.1059, "step": 99 }, { "epoch": 1.0080327610647346, "grad_norm": 3.9162817001342773, "learning_rate": 1.6795564060296295e-05, "loss": 0.711, "step": 100 }, { "epoch": 1.018113088675382, "grad_norm": 3.2829103469848633, "learning_rate": 1.6708775597236507e-05, "loss": 0.7179, "step": 101 }, { "epoch": 1.0281934162860293, "grad_norm": 4.782021999359131, "learning_rate": 1.6621058348568008e-05, "loss": 0.759, "step": 102 }, { "epoch": 1.0382737438966767, "grad_norm": 4.203643798828125, "learning_rate": 1.6532424458163692e-05, "loss": 0.7717, "step": 103 }, { "epoch": 1.048354071507324, "grad_norm": 4.421259880065918, "learning_rate": 1.6442886196799465e-05, "loss": 0.651, "step": 104 }, { "epoch": 1.0584343991179714, "grad_norm": 3.6463119983673096, "learning_rate": 1.6352455960455385e-05, "loss": 0.7719, "step": 105 }, { "epoch": 1.0685147267286186, "grad_norm": 3.408778429031372, "learning_rate": 1.6261146268599564e-05, "loss": 0.6591, "step": 106 }, { "epoch": 1.078595054339266, "grad_norm": 3.7105519771575928, "learning_rate": 1.6168969762454897e-05, "loss": 0.6645, "step": 107 }, { "epoch": 1.0886753819499133, "grad_norm": 3.8667266368865967, "learning_rate": 1.607593920324899e-05, "loss": 0.7159, "step": 108 }, { "epoch": 1.0987557095605607, "grad_norm": 3.4244542121887207, "learning_rate": 1.598206747044746e-05, "loss": 0.8331, "step": 109 }, { "epoch": 1.108836037171208, "grad_norm": 3.326638698577881, "learning_rate": 1.5887367559970825e-05, "loss": 0.6831, "step": 110 }, { "epoch": 1.1189163647818554, "grad_norm": 3.7026546001434326, "learning_rate": 1.5791852582395334e-05, "loss": 0.6642, "step": 111 }, { "epoch": 1.1289966923925028, "grad_norm": 3.1158483028411865, "learning_rate": 1.569553576113789e-05, "loss": 0.6531, "step": 112 }, { "epoch": 1.1390770200031501, "grad_norm": 3.1516995429992676, "learning_rate": 1.5598430430625335e-05, "loss": 0.6734, "step": 113 }, { "epoch": 1.1491573476137975, "grad_norm": 3.048830270767212, "learning_rate": 1.5500550034448415e-05, "loss": 0.6412, "step": 114 }, { "epoch": 1.1592376752244449, "grad_norm": 2.9057071208953857, "learning_rate": 1.540190812350059e-05, "loss": 0.6441, "step": 115 }, { "epoch": 1.1693180028350922, "grad_norm": 4.930371284484863, "learning_rate": 1.5302518354101992e-05, "loss": 0.6499, "step": 116 }, { "epoch": 1.1793983304457396, "grad_norm": 3.2132606506347656, "learning_rate": 1.5202394486108823e-05, "loss": 0.7648, "step": 117 }, { "epoch": 1.1894786580563868, "grad_norm": 3.247512102127075, "learning_rate": 1.5101550381008377e-05, "loss": 0.6341, "step": 118 }, { "epoch": 1.1995589856670341, "grad_norm": 2.8837008476257324, "learning_rate": 1.5000000000000002e-05, "loss": 0.6785, "step": 119 }, { "epoch": 1.2096393132776815, "grad_norm": 3.362884998321533, "learning_rate": 1.4897757402062285e-05, "loss": 0.6433, "step": 120 }, { "epoch": 1.2197196408883288, "grad_norm": 3.0538456439971924, "learning_rate": 1.4794836742006667e-05, "loss": 0.7454, "step": 121 }, { "epoch": 1.2297999684989762, "grad_norm": 3.1768131256103516, "learning_rate": 1.4691252268517794e-05, "loss": 0.7879, "step": 122 }, { "epoch": 1.2398802961096236, "grad_norm": 3.0788283348083496, "learning_rate": 1.4587018322180906e-05, "loss": 0.7103, "step": 123 }, { "epoch": 1.249960623720271, "grad_norm": 2.7563843727111816, "learning_rate": 1.4482149333496455e-05, "loss": 0.6592, "step": 124 }, { "epoch": 1.2600409513309183, "grad_norm": 3.421998977661133, "learning_rate": 1.4376659820882308e-05, "loss": 0.7862, "step": 125 }, { "epoch": 1.2701212789415657, "grad_norm": 3.0229763984680176, "learning_rate": 1.4270564388663761e-05, "loss": 0.6586, "step": 126 }, { "epoch": 1.2802016065522128, "grad_norm": 2.8393478393554688, "learning_rate": 1.4163877725051677e-05, "loss": 0.7359, "step": 127 }, { "epoch": 1.2902819341628602, "grad_norm": 3.009399175643921, "learning_rate": 1.4056614600108998e-05, "loss": 0.6755, "step": 128 }, { "epoch": 1.3003622617735076, "grad_norm": 2.9821226596832275, "learning_rate": 1.3948789863705914e-05, "loss": 0.629, "step": 129 }, { "epoch": 1.310442589384155, "grad_norm": 3.1034419536590576, "learning_rate": 1.3840418443464015e-05, "loss": 0.6466, "step": 130 }, { "epoch": 1.3205229169948023, "grad_norm": 3.318528413772583, "learning_rate": 1.3731515342689654e-05, "loss": 1.0047, "step": 131 }, { "epoch": 1.3306032446054497, "grad_norm": 2.9240915775299072, "learning_rate": 1.3622095638296827e-05, "loss": 0.668, "step": 132 }, { "epoch": 1.340683572216097, "grad_norm": 3.9342963695526123, "learning_rate": 1.3512174478719896e-05, "loss": 0.6465, "step": 133 }, { "epoch": 1.3507638998267444, "grad_norm": 2.6329188346862793, "learning_rate": 1.340176708181637e-05, "loss": 0.8036, "step": 134 }, { "epoch": 1.3608442274373918, "grad_norm": 3.2634246349334717, "learning_rate": 1.32908887327601e-05, "loss": 0.6348, "step": 135 }, { "epoch": 1.3709245550480391, "grad_norm": 2.8796093463897705, "learning_rate": 1.317955478192515e-05, "loss": 0.7079, "step": 136 }, { "epoch": 1.3810048826586865, "grad_norm": 3.15555477142334, "learning_rate": 1.306778064276064e-05, "loss": 0.7197, "step": 137 }, { "epoch": 1.3910852102693338, "grad_norm": 2.8696324825286865, "learning_rate": 1.2955581789656844e-05, "loss": 0.6422, "step": 138 }, { "epoch": 1.4011655378799812, "grad_norm": 2.841829538345337, "learning_rate": 1.2842973755802872e-05, "loss": 0.6522, "step": 139 }, { "epoch": 1.4112458654906286, "grad_norm": 2.869424819946289, "learning_rate": 1.2729972131036212e-05, "loss": 0.7462, "step": 140 }, { "epoch": 1.4213261931012757, "grad_norm": 3.1612401008605957, "learning_rate": 1.2616592559684408e-05, "loss": 0.6471, "step": 141 }, { "epoch": 1.431406520711923, "grad_norm": 2.7609212398529053, "learning_rate": 1.25028507383992e-05, "loss": 0.7627, "step": 142 }, { "epoch": 1.4414868483225705, "grad_norm": 2.678645610809326, "learning_rate": 1.2388762413983447e-05, "loss": 0.6729, "step": 143 }, { "epoch": 1.4515671759332178, "grad_norm": 2.8031179904937744, "learning_rate": 1.2274343381211067e-05, "loss": 0.6497, "step": 144 }, { "epoch": 1.4616475035438652, "grad_norm": 2.735318660736084, "learning_rate": 1.2159609480640361e-05, "loss": 0.6786, "step": 145 }, { "epoch": 1.4717278311545126, "grad_norm": 2.9970738887786865, "learning_rate": 1.2044576596421003e-05, "loss": 0.6498, "step": 146 }, { "epoch": 1.48180815876516, "grad_norm": 2.8578624725341797, "learning_rate": 1.192926065409497e-05, "loss": 0.6432, "step": 147 }, { "epoch": 1.4918884863758073, "grad_norm": 2.9433505535125732, "learning_rate": 1.1813677618391759e-05, "loss": 0.6274, "step": 148 }, { "epoch": 1.5019688139864544, "grad_norm": 2.799851417541504, "learning_rate": 1.1697843491018189e-05, "loss": 0.6507, "step": 149 }, { "epoch": 1.5120491415971018, "grad_norm": 2.589261770248413, "learning_rate": 1.1581774308443042e-05, "loss": 0.8801, "step": 150 }, { "epoch": 1.5221294692077492, "grad_norm": 2.7499136924743652, "learning_rate": 1.1465486139676955e-05, "loss": 0.6428, "step": 151 }, { "epoch": 1.5322097968183965, "grad_norm": 3.6993484497070312, "learning_rate": 1.134899508404775e-05, "loss": 0.6641, "step": 152 }, { "epoch": 1.542290124429044, "grad_norm": 5.174093723297119, "learning_rate": 1.1232317268971586e-05, "loss": 0.7828, "step": 153 }, { "epoch": 1.5523704520396913, "grad_norm": 2.734003782272339, "learning_rate": 1.1115468847720245e-05, "loss": 0.7631, "step": 154 }, { "epoch": 1.5624507796503386, "grad_norm": 3.996946334838867, "learning_rate": 1.0998465997184798e-05, "loss": 0.6416, "step": 155 }, { "epoch": 1.572531107260986, "grad_norm": 3.302497386932373, "learning_rate": 1.088132491563602e-05, "loss": 0.6543, "step": 156 }, { "epoch": 1.5826114348716334, "grad_norm": 2.9945664405822754, "learning_rate": 1.0764061820481872e-05, "loss": 0.6902, "step": 157 }, { "epoch": 1.5926917624822807, "grad_norm": 2.6038448810577393, "learning_rate": 1.0646692946022285e-05, "loss": 0.6289, "step": 158 }, { "epoch": 1.602772090092928, "grad_norm": 2.6396548748016357, "learning_rate": 1.0529234541201631e-05, "loss": 0.8164, "step": 159 }, { "epoch": 1.6128524177035755, "grad_norm": 2.770799160003662, "learning_rate": 1.041170286735918e-05, "loss": 0.6438, "step": 160 }, { "epoch": 1.6229327453142228, "grad_norm": 2.666429042816162, "learning_rate": 1.0294114195977796e-05, "loss": 0.6912, "step": 161 }, { "epoch": 1.6330130729248702, "grad_norm": 2.9112017154693604, "learning_rate": 1.0176484806431288e-05, "loss": 0.7345, "step": 162 }, { "epoch": 1.6430934005355176, "grad_norm": 3.0811562538146973, "learning_rate": 1.0058830983730622e-05, "loss": 0.7558, "step": 163 }, { "epoch": 1.6531737281461647, "grad_norm": 2.8588948249816895, "learning_rate": 9.94116901626938e-06, "loss": 0.648, "step": 164 }, { "epoch": 1.663254055756812, "grad_norm": 2.7404801845550537, "learning_rate": 9.823515193568715e-06, "loss": 0.695, "step": 165 }, { "epoch": 1.6733343833674594, "grad_norm": 2.9604055881500244, "learning_rate": 9.705885804022207e-06, "loss": 0.6304, "step": 166 }, { "epoch": 1.6834147109781068, "grad_norm": 3.1369986534118652, "learning_rate": 9.588297132640824e-06, "loss": 0.8216, "step": 167 }, { "epoch": 1.6934950385887542, "grad_norm": 2.912094831466675, "learning_rate": 9.470765458798369e-06, "loss": 0.6653, "step": 168 }, { "epoch": 1.7035753661994015, "grad_norm": 2.6910438537597656, "learning_rate": 9.353307053977717e-06, "loss": 0.6645, "step": 169 }, { "epoch": 1.7136556938100487, "grad_norm": 2.958939790725708, "learning_rate": 9.235938179518131e-06, "loss": 0.6222, "step": 170 }, { "epoch": 1.723736021420696, "grad_norm": 2.7143542766571045, "learning_rate": 9.118675084363986e-06, "loss": 0.7051, "step": 171 }, { "epoch": 1.7338163490313434, "grad_norm": 2.6610677242279053, "learning_rate": 9.001534002815209e-06, "loss": 0.6333, "step": 172 }, { "epoch": 1.7438966766419908, "grad_norm": 2.9667537212371826, "learning_rate": 8.884531152279757e-06, "loss": 0.6832, "step": 173 }, { "epoch": 1.7539770042526381, "grad_norm": 2.64609956741333, "learning_rate": 8.767682731028415e-06, "loss": 0.9484, "step": 174 }, { "epoch": 1.7640573318632855, "grad_norm": 2.682523012161255, "learning_rate": 8.651004915952252e-06, "loss": 0.8721, "step": 175 }, { "epoch": 1.7741376594739329, "grad_norm": 2.5906975269317627, "learning_rate": 8.534513860323047e-06, "loss": 0.9793, "step": 176 }, { "epoch": 1.7842179870845802, "grad_norm": 2.636467456817627, "learning_rate": 8.418225691556962e-06, "loss": 0.9016, "step": 177 }, { "epoch": 1.7942983146952276, "grad_norm": 3.5005948543548584, "learning_rate": 8.302156508981816e-06, "loss": 0.738, "step": 178 }, { "epoch": 1.804378642305875, "grad_norm": 2.7986643314361572, "learning_rate": 8.18632238160824e-06, "loss": 0.6635, "step": 179 }, { "epoch": 1.8144589699165223, "grad_norm": 2.8597512245178223, "learning_rate": 8.070739345905032e-06, "loss": 0.7473, "step": 180 }, { "epoch": 1.8245392975271697, "grad_norm": 2.7487239837646484, "learning_rate": 7.955423403578998e-06, "loss": 0.7526, "step": 181 }, { "epoch": 1.834619625137817, "grad_norm": 2.68874454498291, "learning_rate": 7.840390519359644e-06, "loss": 0.6491, "step": 182 }, { "epoch": 1.8446999527484644, "grad_norm": 2.8393709659576416, "learning_rate": 7.725656618788938e-06, "loss": 0.6401, "step": 183 }, { "epoch": 1.8547802803591118, "grad_norm": 2.8322646617889404, "learning_rate": 7.611237586016558e-06, "loss": 0.7692, "step": 184 }, { "epoch": 1.8648606079697592, "grad_norm": 2.760575771331787, "learning_rate": 7.497149261600803e-06, "loss": 0.926, "step": 185 }, { "epoch": 1.8749409355804063, "grad_norm": 2.6379311084747314, "learning_rate": 7.383407440315595e-06, "loss": 0.654, "step": 186 }, { "epoch": 1.8850212631910537, "grad_norm": 2.6411261558532715, "learning_rate": 7.27002786896379e-06, "loss": 0.7753, "step": 187 }, { "epoch": 1.895101590801701, "grad_norm": 2.6866044998168945, "learning_rate": 7.157026244197132e-06, "loss": 0.6479, "step": 188 }, { "epoch": 1.9051819184123484, "grad_norm": 2.743093252182007, "learning_rate": 7.044418210343161e-06, "loss": 0.7825, "step": 189 }, { "epoch": 1.9152622460229958, "grad_norm": 2.7608628273010254, "learning_rate": 6.932219357239362e-06, "loss": 0.6497, "step": 190 }, { "epoch": 1.9253425736336431, "grad_norm": 2.581033706665039, "learning_rate": 6.820445218074849e-06, "loss": 0.658, "step": 191 }, { "epoch": 1.9354229012442903, "grad_norm": 2.8341994285583496, "learning_rate": 6.7091112672399e-06, "loss": 0.8367, "step": 192 }, { "epoch": 1.9455032288549376, "grad_norm": 2.712247133255005, "learning_rate": 6.5982329181836325e-06, "loss": 0.647, "step": 193 }, { "epoch": 1.955583556465585, "grad_norm": 2.683356761932373, "learning_rate": 6.487825521280109e-06, "loss": 0.7316, "step": 194 }, { "epoch": 1.9656638840762324, "grad_norm": 2.6433842182159424, "learning_rate": 6.3779043617031775e-06, "loss": 0.84, "step": 195 }, { "epoch": 1.9757442116868797, "grad_norm": 4.231180667877197, "learning_rate": 6.268484657310351e-06, "loss": 0.7416, "step": 196 }, { "epoch": 1.9858245392975271, "grad_norm": 3.0023813247680664, "learning_rate": 6.159581556535989e-06, "loss": 0.8632, "step": 197 }, { "epoch": 1.9959048669081745, "grad_norm": 2.6306092739105225, "learning_rate": 6.051210136294089e-06, "loss": 0.6557, "step": 198 }, { "epoch": 2.005985194518822, "grad_norm": 3.3288090229034424, "learning_rate": 5.943385399891004e-06, "loss": 0.5327, "step": 199 }, { "epoch": 2.016065522129469, "grad_norm": 3.394890069961548, "learning_rate": 5.8361222749483246e-06, "loss": 0.5682, "step": 200 }, { "epoch": 2.0261458497401166, "grad_norm": 4.5706658363342285, "learning_rate": 5.729435611336239e-06, "loss": 0.521, "step": 201 }, { "epoch": 2.036226177350764, "grad_norm": 3.595043420791626, "learning_rate": 5.6233401791176946e-06, "loss": 0.4973, "step": 202 }, { "epoch": 2.0463065049614113, "grad_norm": 3.381319761276245, "learning_rate": 5.517850666503547e-06, "loss": 0.7273, "step": 203 }, { "epoch": 2.0563868325720587, "grad_norm": 3.813019037246704, "learning_rate": 5.412981677819094e-06, "loss": 0.5748, "step": 204 }, { "epoch": 2.066467160182706, "grad_norm": 2.9697418212890625, "learning_rate": 5.308747731482207e-06, "loss": 0.6197, "step": 205 }, { "epoch": 2.0765474877933534, "grad_norm": 4.898626804351807, "learning_rate": 5.205163257993341e-06, "loss": 0.4839, "step": 206 }, { "epoch": 2.0866278154040008, "grad_norm": 4.427509784698486, "learning_rate": 5.1022425979377174e-06, "loss": 0.455, "step": 207 }, { "epoch": 2.096708143014648, "grad_norm": 3.7625739574432373, "learning_rate": 5.000000000000003e-06, "loss": 0.538, "step": 208 }, { "epoch": 2.1067884706252955, "grad_norm": 2.9984402656555176, "learning_rate": 4.89844961899163e-06, "loss": 0.5426, "step": 209 }, { "epoch": 2.116868798235943, "grad_norm": 3.2791342735290527, "learning_rate": 4.797605513891179e-06, "loss": 0.5505, "step": 210 }, { "epoch": 2.12694912584659, "grad_norm": 3.1224496364593506, "learning_rate": 4.697481645898012e-06, "loss": 0.5466, "step": 211 }, { "epoch": 2.137029453457237, "grad_norm": 2.9908924102783203, "learning_rate": 4.598091876499417e-06, "loss": 0.4739, "step": 212 }, { "epoch": 2.1471097810678845, "grad_norm": 3.272909164428711, "learning_rate": 4.4994499655515865e-06, "loss": 0.4773, "step": 213 }, { "epoch": 2.157190108678532, "grad_norm": 3.1660666465759277, "learning_rate": 4.4015695693746685e-06, "loss": 0.6012, "step": 214 }, { "epoch": 2.1672704362891793, "grad_norm": 2.9826879501342773, "learning_rate": 4.304464238862115e-06, "loss": 0.559, "step": 215 }, { "epoch": 2.1773507638998266, "grad_norm": 3.156632900238037, "learning_rate": 4.208147417604665e-06, "loss": 0.4658, "step": 216 }, { "epoch": 2.187431091510474, "grad_norm": 2.7207696437835693, "learning_rate": 4.112632440029176e-06, "loss": 0.4746, "step": 217 }, { "epoch": 2.1975114191211214, "grad_norm": 3.170917272567749, "learning_rate": 4.017932529552543e-06, "loss": 0.4555, "step": 218 }, { "epoch": 2.2075917467317687, "grad_norm": 2.873971939086914, "learning_rate": 3.924060796751012e-06, "loss": 0.4927, "step": 219 }, { "epoch": 2.217672074342416, "grad_norm": 3.0037410259246826, "learning_rate": 3.83103023754511e-06, "loss": 0.5199, "step": 220 }, { "epoch": 2.2277524019530635, "grad_norm": 3.2414352893829346, "learning_rate": 3.7388537314004394e-06, "loss": 0.4665, "step": 221 }, { "epoch": 2.237832729563711, "grad_norm": 2.9535632133483887, "learning_rate": 3.647544039544615e-06, "loss": 0.4625, "step": 222 }, { "epoch": 2.247913057174358, "grad_norm": 2.874563455581665, "learning_rate": 3.557113803200537e-06, "loss": 0.4651, "step": 223 }, { "epoch": 2.2579933847850056, "grad_norm": 2.8400771617889404, "learning_rate": 3.4675755418363054e-06, "loss": 0.4741, "step": 224 }, { "epoch": 2.268073712395653, "grad_norm": 3.162914752960205, "learning_rate": 3.378941651431996e-06, "loss": 0.5043, "step": 225 }, { "epoch": 2.2781540400063003, "grad_norm": 3.108367681503296, "learning_rate": 3.2912244027634953e-06, "loss": 0.4612, "step": 226 }, { "epoch": 2.2882343676169477, "grad_norm": 2.9453072547912598, "learning_rate": 3.204435939703705e-06, "loss": 0.5951, "step": 227 }, { "epoch": 2.298314695227595, "grad_norm": 2.926748752593994, "learning_rate": 3.1185882775413123e-06, "loss": 0.4727, "step": 228 }, { "epoch": 2.3083950228382424, "grad_norm": 2.7505362033843994, "learning_rate": 3.0336933013173307e-06, "loss": 0.4771, "step": 229 }, { "epoch": 2.3184753504488897, "grad_norm": 3.1303627490997314, "learning_rate": 2.949762764179711e-06, "loss": 0.5534, "step": 230 }, { "epoch": 2.328555678059537, "grad_norm": 2.6740784645080566, "learning_rate": 2.8668082857562006e-06, "loss": 0.4713, "step": 231 }, { "epoch": 2.3386360056701845, "grad_norm": 2.6228513717651367, "learning_rate": 2.7848413505456564e-06, "loss": 0.532, "step": 232 }, { "epoch": 2.348716333280832, "grad_norm": 3.3503799438476562, "learning_rate": 2.7038733063281177e-06, "loss": 0.5022, "step": 233 }, { "epoch": 2.358796660891479, "grad_norm": 2.798093557357788, "learning_rate": 2.6239153625937786e-06, "loss": 0.4674, "step": 234 }, { "epoch": 2.368876988502126, "grad_norm": 2.9283840656280518, "learning_rate": 2.544978588991096e-06, "loss": 0.5145, "step": 235 }, { "epoch": 2.3789573161127735, "grad_norm": 2.9294168949127197, "learning_rate": 2.4670739137942723e-06, "loss": 0.5262, "step": 236 }, { "epoch": 2.389037643723421, "grad_norm": 2.9340579509735107, "learning_rate": 2.390212122390323e-06, "loss": 0.4654, "step": 237 }, { "epoch": 2.3991179713340682, "grad_norm": 2.9282586574554443, "learning_rate": 2.3144038557858915e-06, "loss": 0.7147, "step": 238 }, { "epoch": 2.4091982989447156, "grad_norm": 2.9570140838623047, "learning_rate": 2.2396596091340805e-06, "loss": 0.4643, "step": 239 }, { "epoch": 2.419278626555363, "grad_norm": 2.7537012100219727, "learning_rate": 2.165989730281475e-06, "loss": 0.4467, "step": 240 }, { "epoch": 2.4293589541660103, "grad_norm": 2.764420986175537, "learning_rate": 2.0934044183355384e-06, "loss": 0.4774, "step": 241 }, { "epoch": 2.4394392817766577, "grad_norm": 3.11476993560791, "learning_rate": 2.0219137222526188e-06, "loss": 0.5792, "step": 242 }, { "epoch": 2.449519609387305, "grad_norm": 4.960219860076904, "learning_rate": 1.9515275394467446e-06, "loss": 0.457, "step": 243 }, { "epoch": 2.4595999369979524, "grad_norm": 3.0883636474609375, "learning_rate": 1.882255614419376e-06, "loss": 0.6268, "step": 244 }, { "epoch": 2.4696802646086, "grad_norm": 2.9060003757476807, "learning_rate": 1.8141075374103634e-06, "loss": 0.5785, "step": 245 }, { "epoch": 2.479760592219247, "grad_norm": 2.7196030616760254, "learning_rate": 1.7470927430702277e-06, "loss": 0.4658, "step": 246 }, { "epoch": 2.4898409198298945, "grad_norm": 2.8663458824157715, "learning_rate": 1.6812205091539979e-06, "loss": 0.4635, "step": 247 }, { "epoch": 2.499921247440542, "grad_norm": 2.7674198150634766, "learning_rate": 1.6164999552367767e-06, "loss": 0.475, "step": 248 }, { "epoch": 2.5100015750511893, "grad_norm": 2.971801280975342, "learning_rate": 1.5529400414511809e-06, "loss": 0.5657, "step": 249 }, { "epoch": 2.5200819026618366, "grad_norm": 3.0309677124023438, "learning_rate": 1.4905495672468784e-06, "loss": 0.4609, "step": 250 }, { "epoch": 2.530162230272484, "grad_norm": 2.9311161041259766, "learning_rate": 1.4293371701723701e-06, "loss": 0.5184, "step": 251 }, { "epoch": 2.5402425578831314, "grad_norm": 2.813347578048706, "learning_rate": 1.369311324679159e-06, "loss": 0.4675, "step": 252 }, { "epoch": 2.5503228854937783, "grad_norm": 2.8366470336914062, "learning_rate": 1.3104803409485357e-06, "loss": 0.5566, "step": 253 }, { "epoch": 2.5604032131044256, "grad_norm": 2.9654905796051025, "learning_rate": 1.252852363741084e-06, "loss": 0.4739, "step": 254 }, { "epoch": 2.570483540715073, "grad_norm": 2.7376723289489746, "learning_rate": 1.196435371269089e-06, "loss": 0.4755, "step": 255 }, { "epoch": 2.5805638683257204, "grad_norm": 2.603374719619751, "learning_rate": 1.1412371740920036e-06, "loss": 0.45, "step": 256 }, { "epoch": 2.5906441959363677, "grad_norm": 2.7828543186187744, "learning_rate": 1.0872654140351458e-06, "loss": 0.7043, "step": 257 }, { "epoch": 2.600724523547015, "grad_norm": 2.6468091011047363, "learning_rate": 1.0345275631317165e-06, "loss": 0.6008, "step": 258 }, { "epoch": 2.6108048511576625, "grad_norm": 2.7856605052948, "learning_rate": 9.830309225883562e-07, "loss": 0.55, "step": 259 }, { "epoch": 2.62088517876831, "grad_norm": 2.949894428253174, "learning_rate": 9.327826217743452e-07, "loss": 0.4517, "step": 260 }, { "epoch": 2.630965506378957, "grad_norm": 3.2041678428649902, "learning_rate": 8.837896172345827e-07, "loss": 0.5641, "step": 261 }, { "epoch": 2.6410458339896046, "grad_norm": 3.0084447860717773, "learning_rate": 8.360586917264979e-07, "loss": 0.4635, "step": 262 }, { "epoch": 2.651126161600252, "grad_norm": 2.7008798122406006, "learning_rate": 7.895964532810318e-07, "loss": 0.5861, "step": 263 }, { "epoch": 2.6612064892108993, "grad_norm": 2.8974366188049316, "learning_rate": 7.4440933428779e-07, "loss": 0.4619, "step": 264 }, { "epoch": 2.6712868168215467, "grad_norm": 2.954524040222168, "learning_rate": 7.005035906045199e-07, "loss": 0.4819, "step": 265 }, { "epoch": 2.681367144432194, "grad_norm": 2.6953723430633545, "learning_rate": 6.578853006910402e-07, "loss": 0.4843, "step": 266 }, { "epoch": 2.6914474720428414, "grad_norm": 3.108698844909668, "learning_rate": 6.165603647677054e-07, "loss": 0.4586, "step": 267 }, { "epoch": 2.7015277996534888, "grad_norm": 2.9222874641418457, "learning_rate": 5.765345039985648e-07, "loss": 0.4737, "step": 268 }, { "epoch": 2.711608127264136, "grad_norm": 2.904561996459961, "learning_rate": 5.378132596993047e-07, "loss": 0.4641, "step": 269 }, { "epoch": 2.7216884548747835, "grad_norm": 3.0697550773620605, "learning_rate": 5.004019925700921e-07, "loss": 0.5555, "step": 270 }, { "epoch": 2.731768782485431, "grad_norm": 2.980802536010742, "learning_rate": 4.6430588195341853e-07, "loss": 0.5908, "step": 271 }, { "epoch": 2.7418491100960782, "grad_norm": 3.690682888031006, "learning_rate": 4.295299251170537e-07, "loss": 0.5399, "step": 272 }, { "epoch": 2.7519294377067256, "grad_norm": 2.920565128326416, "learning_rate": 3.960789365622075e-07, "loss": 0.7304, "step": 273 }, { "epoch": 2.762009765317373, "grad_norm": 2.853963851928711, "learning_rate": 3.6395754735699896e-07, "loss": 0.571, "step": 274 }, { "epoch": 2.7720900929280203, "grad_norm": 2.8439598083496094, "learning_rate": 3.3317020449530666e-07, "loss": 0.5728, "step": 275 }, { "epoch": 2.7821704205386677, "grad_norm": 2.6904118061065674, "learning_rate": 3.0372117028111825e-07, "loss": 0.4688, "step": 276 }, { "epoch": 2.792250748149315, "grad_norm": 2.8385207653045654, "learning_rate": 2.7561452173844206e-07, "loss": 0.5202, "step": 277 }, { "epoch": 2.8023310757599624, "grad_norm": 2.739987373352051, "learning_rate": 2.488541500468666e-07, "loss": 0.586, "step": 278 }, { "epoch": 2.81241140337061, "grad_norm": 2.9050772190093994, "learning_rate": 2.2344376000285606e-07, "loss": 0.4698, "step": 279 }, { "epoch": 2.822491730981257, "grad_norm": 2.7418956756591797, "learning_rate": 1.993868695068457e-07, "loss": 0.4432, "step": 280 }, { "epoch": 2.832572058591904, "grad_norm": 2.746795892715454, "learning_rate": 1.766868090762075e-07, "loss": 0.5331, "step": 281 }, { "epoch": 2.8426523862025515, "grad_norm": 2.7366440296173096, "learning_rate": 1.553467213841664e-07, "loss": 0.4813, "step": 282 }, { "epoch": 2.852732713813199, "grad_norm": 2.755558729171753, "learning_rate": 1.3536956082472074e-07, "loss": 0.4859, "step": 283 }, { "epoch": 2.862813041423846, "grad_norm": 2.7609097957611084, "learning_rate": 1.1675809310361497e-07, "loss": 0.4655, "step": 284 }, { "epoch": 2.8728933690344935, "grad_norm": 3.0042688846588135, "learning_rate": 9.951489485545696e-08, "loss": 0.4557, "step": 285 }, { "epoch": 2.882973696645141, "grad_norm": 2.9627745151519775, "learning_rate": 8.364235328699566e-08, "loss": 0.4968, "step": 286 }, { "epoch": 2.8930540242557883, "grad_norm": 2.8256924152374268, "learning_rate": 6.914266584662988e-08, "loss": 0.4728, "step": 287 }, { "epoch": 2.9031343518664356, "grad_norm": 2.7654502391815186, "learning_rate": 5.6017839920180506e-08, "loss": 0.4594, "step": 288 }, { "epoch": 2.913214679477083, "grad_norm": 5.463405609130859, "learning_rate": 4.426969255298841e-08, "loss": 0.4786, "step": 289 }, { "epoch": 2.9232950070877304, "grad_norm": 2.978433132171631, "learning_rate": 3.38998501983534e-08, "loss": 0.4687, "step": 290 }, { "epoch": 2.9333753346983777, "grad_norm": 2.887697458267212, "learning_rate": 2.4909748492362162e-08, "loss": 0.5466, "step": 291 }, { "epoch": 2.943455662309025, "grad_norm": 2.823009729385376, "learning_rate": 1.730063205513277e-08, "loss": 0.5068, "step": 292 }, { "epoch": 2.9535359899196725, "grad_norm": 2.9089882373809814, "learning_rate": 1.1073554318509206e-08, "loss": 0.4521, "step": 293 }, { "epoch": 2.96361631753032, "grad_norm": 2.763700246810913, "learning_rate": 6.229377380218005e-09, "loss": 0.456, "step": 294 }, { "epoch": 2.973696645140967, "grad_norm": 2.9146535396575928, "learning_rate": 2.7687718845148538e-09, "loss": 0.4586, "step": 295 }, { "epoch": 2.9837769727516146, "grad_norm": 2.999981641769409, "learning_rate": 6.922169293421821e-10, "loss": 0.464, "step": 296 }, { "epoch": 2.9938573003622615, "grad_norm": 4.130459785461426, "learning_rate": 0.0, "loss": 0.4729, "step": 297 }, { "epoch": 2.9938573003622615, "step": 297, "total_flos": 1.3286712322107113e+19, "train_loss": 0.7266113918638389, "train_runtime": 26017.2965, "train_samples_per_second": 5.856, "train_steps_per_second": 0.011 } ], "logging_steps": 1.0, "max_steps": 297, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3286712322107113e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }