|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9938573003622615, |
|
"eval_steps": 500, |
|
"global_step": 297, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.010080327610647345, |
|
"grad_norm": 20.48233413696289, |
|
"learning_rate": 6.666666666666667e-07, |
|
"loss": 1.318, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.02016065522129469, |
|
"grad_norm": 22.18445587158203, |
|
"learning_rate": 1.3333333333333334e-06, |
|
"loss": 1.2034, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.03024098283194204, |
|
"grad_norm": 20.34792137145996, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 1.0455, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.04032131044258938, |
|
"grad_norm": 8.893705368041992, |
|
"learning_rate": 2.666666666666667e-06, |
|
"loss": 2.0515, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.05040163805323673, |
|
"grad_norm": 6.274827003479004, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 1.0793, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.06048196566388408, |
|
"grad_norm": 6.051919937133789, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.9434, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.07056229327453142, |
|
"grad_norm": 7.502919673919678, |
|
"learning_rate": 4.666666666666667e-06, |
|
"loss": 0.9741, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.08064262088517876, |
|
"grad_norm": 6.437217712402344, |
|
"learning_rate": 5.333333333333334e-06, |
|
"loss": 1.1227, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.09072294849582611, |
|
"grad_norm": 5.169360160827637, |
|
"learning_rate": 6e-06, |
|
"loss": 0.9156, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.10080327610647347, |
|
"grad_norm": 5.1474432945251465, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 1.0925, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.1108836037171208, |
|
"grad_norm": 6.759896755218506, |
|
"learning_rate": 7.333333333333333e-06, |
|
"loss": 0.8985, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.12096393132776816, |
|
"grad_norm": 5.231770992279053, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 1.6954, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.1310442589384155, |
|
"grad_norm": 5.028665542602539, |
|
"learning_rate": 8.666666666666668e-06, |
|
"loss": 0.9763, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.14112458654906285, |
|
"grad_norm": 7.008236408233643, |
|
"learning_rate": 9.333333333333334e-06, |
|
"loss": 0.9969, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.15120491415971019, |
|
"grad_norm": 4.675139904022217, |
|
"learning_rate": 1e-05, |
|
"loss": 0.8619, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.16128524177035752, |
|
"grad_norm": 5.249491214752197, |
|
"learning_rate": 1.0666666666666667e-05, |
|
"loss": 0.9176, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.1713655693810049, |
|
"grad_norm": 4.5402092933654785, |
|
"learning_rate": 1.1333333333333334e-05, |
|
"loss": 0.8809, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.18144589699165223, |
|
"grad_norm": 4.799923896789551, |
|
"learning_rate": 1.2e-05, |
|
"loss": 1.005, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.19152622460229957, |
|
"grad_norm": 3.9302682876586914, |
|
"learning_rate": 1.2666666666666667e-05, |
|
"loss": 0.8784, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.20160655221294693, |
|
"grad_norm": 4.542870044708252, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 1.0378, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.21168687982359427, |
|
"grad_norm": 10.4898042678833, |
|
"learning_rate": 1.4e-05, |
|
"loss": 0.8458, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.2217672074342416, |
|
"grad_norm": 4.678144454956055, |
|
"learning_rate": 1.4666666666666666e-05, |
|
"loss": 0.9237, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.23184753504488895, |
|
"grad_norm": 4.897671222686768, |
|
"learning_rate": 1.5333333333333334e-05, |
|
"loss": 0.9661, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.2419278626555363, |
|
"grad_norm": 5.067351818084717, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.9975, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.25200819026618365, |
|
"grad_norm": 3.7107274532318115, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.9316, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.262088517876831, |
|
"grad_norm": 4.553698539733887, |
|
"learning_rate": 1.7333333333333336e-05, |
|
"loss": 0.871, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.27216884548747833, |
|
"grad_norm": 5.10447359085083, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.8687, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.2822491730981257, |
|
"grad_norm": 3.7116832733154297, |
|
"learning_rate": 1.866666666666667e-05, |
|
"loss": 0.8586, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.29232950070877306, |
|
"grad_norm": 5.299854755401611, |
|
"learning_rate": 1.9333333333333333e-05, |
|
"loss": 0.8595, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.30240982831942037, |
|
"grad_norm": 3.825899600982666, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9615, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.31249015593006774, |
|
"grad_norm": 4.806526184082031, |
|
"learning_rate": 1.999930778307066e-05, |
|
"loss": 0.8595, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.32257048354071505, |
|
"grad_norm": 3.5444633960723877, |
|
"learning_rate": 1.9997231228115487e-05, |
|
"loss": 0.9748, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.3326508111513624, |
|
"grad_norm": 4.36836051940918, |
|
"learning_rate": 1.9993770622619784e-05, |
|
"loss": 0.8577, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.3427311387620098, |
|
"grad_norm": 3.0740671157836914, |
|
"learning_rate": 1.9988926445681495e-05, |
|
"loss": 0.8407, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.3528114663726571, |
|
"grad_norm": 3.8326187133789062, |
|
"learning_rate": 1.998269936794487e-05, |
|
"loss": 0.8997, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.36289179398330446, |
|
"grad_norm": 3.6133008003234863, |
|
"learning_rate": 1.9975090251507637e-05, |
|
"loss": 0.9572, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.3729721215939518, |
|
"grad_norm": 4.111402988433838, |
|
"learning_rate": 1.9966100149801648e-05, |
|
"loss": 0.8465, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.38305244920459913, |
|
"grad_norm": 3.105464220046997, |
|
"learning_rate": 1.9955730307447015e-05, |
|
"loss": 0.84, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.3931327768152465, |
|
"grad_norm": 3.377089738845825, |
|
"learning_rate": 1.9943982160079823e-05, |
|
"loss": 0.977, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.40321310442589386, |
|
"grad_norm": 3.674912214279175, |
|
"learning_rate": 1.9930857334153374e-05, |
|
"loss": 0.9114, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.4132934320365412, |
|
"grad_norm": 3.491791248321533, |
|
"learning_rate": 1.9916357646713006e-05, |
|
"loss": 0.8507, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.42337375964718854, |
|
"grad_norm": 3.5988316535949707, |
|
"learning_rate": 1.9900485105144544e-05, |
|
"loss": 0.8459, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.4334540872578359, |
|
"grad_norm": 3.147287368774414, |
|
"learning_rate": 1.988324190689639e-05, |
|
"loss": 0.9254, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.4435344148684832, |
|
"grad_norm": 3.4546704292297363, |
|
"learning_rate": 1.9864630439175282e-05, |
|
"loss": 0.9388, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.4536147424791306, |
|
"grad_norm": 3.39437198638916, |
|
"learning_rate": 1.9844653278615836e-05, |
|
"loss": 0.8751, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.4636950700897779, |
|
"grad_norm": 2.966585159301758, |
|
"learning_rate": 1.9823313190923797e-05, |
|
"loss": 0.833, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.47377539770042526, |
|
"grad_norm": 6.668085098266602, |
|
"learning_rate": 1.9800613130493158e-05, |
|
"loss": 0.9399, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.4838557253110726, |
|
"grad_norm": 4.198956489562988, |
|
"learning_rate": 1.9776556239997146e-05, |
|
"loss": 0.8604, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.49393605292171994, |
|
"grad_norm": 3.0325896739959717, |
|
"learning_rate": 1.9751145849953135e-05, |
|
"loss": 0.8399, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.5040163805323673, |
|
"grad_norm": 3.0284290313720703, |
|
"learning_rate": 1.972438547826156e-05, |
|
"loss": 0.962, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.5140967081430147, |
|
"grad_norm": 3.7126681804656982, |
|
"learning_rate": 1.9696278829718882e-05, |
|
"loss": 0.8381, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.524177035753662, |
|
"grad_norm": 3.0753326416015625, |
|
"learning_rate": 1.9666829795504693e-05, |
|
"loss": 1.2808, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.5342573633643093, |
|
"grad_norm": 3.3041043281555176, |
|
"learning_rate": 1.9636042452643004e-05, |
|
"loss": 1.0921, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.5443376909749567, |
|
"grad_norm": 2.974684953689575, |
|
"learning_rate": 1.9603921063437795e-05, |
|
"loss": 1.1766, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.554418018585604, |
|
"grad_norm": 3.104491710662842, |
|
"learning_rate": 1.9570470074882947e-05, |
|
"loss": 0.8245, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.5644983461962514, |
|
"grad_norm": 2.8233213424682617, |
|
"learning_rate": 1.9535694118046584e-05, |
|
"loss": 0.8327, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.5745786738068988, |
|
"grad_norm": 3.0855748653411865, |
|
"learning_rate": 1.949959800742991e-05, |
|
"loss": 0.8333, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.5846590014175461, |
|
"grad_norm": 3.309098482131958, |
|
"learning_rate": 1.9462186740300697e-05, |
|
"loss": 0.8437, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.5947393290281934, |
|
"grad_norm": 3.7956173419952393, |
|
"learning_rate": 1.942346549600144e-05, |
|
"loss": 0.8533, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.6048196566388407, |
|
"grad_norm": 2.885507106781006, |
|
"learning_rate": 1.9383439635232296e-05, |
|
"loss": 0.9791, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.6148999842494881, |
|
"grad_norm": 2.976921319961548, |
|
"learning_rate": 1.9342114699308962e-05, |
|
"loss": 0.9537, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.6249803118601355, |
|
"grad_norm": 3.9822583198547363, |
|
"learning_rate": 1.9299496409395482e-05, |
|
"loss": 0.8513, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.6350606394707828, |
|
"grad_norm": 3.2742626667022705, |
|
"learning_rate": 1.9255590665712214e-05, |
|
"loss": 0.8606, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.6451409670814301, |
|
"grad_norm": 2.989588975906372, |
|
"learning_rate": 1.921040354671897e-05, |
|
"loss": 1.0236, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.6552212946920775, |
|
"grad_norm": 3.3849992752075195, |
|
"learning_rate": 1.9163941308273504e-05, |
|
"loss": 0.8274, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.6653016223027248, |
|
"grad_norm": 2.9485199451446533, |
|
"learning_rate": 1.911621038276542e-05, |
|
"loss": 0.8366, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.6753819499133722, |
|
"grad_norm": 2.8090484142303467, |
|
"learning_rate": 1.9067217378225655e-05, |
|
"loss": 1.0603, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.6854622775240196, |
|
"grad_norm": 2.795466899871826, |
|
"learning_rate": 1.9016969077411645e-05, |
|
"loss": 1.0391, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.6955426051346669, |
|
"grad_norm": 3.0513088703155518, |
|
"learning_rate": 1.8965472436868288e-05, |
|
"loss": 0.9469, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.7056229327453142, |
|
"grad_norm": 2.89764404296875, |
|
"learning_rate": 1.891273458596486e-05, |
|
"loss": 0.8358, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.7157032603559615, |
|
"grad_norm": 2.940281629562378, |
|
"learning_rate": 1.8858762825908e-05, |
|
"loss": 0.9117, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.7257835879666089, |
|
"grad_norm": 3.0357506275177, |
|
"learning_rate": 1.8803564628730916e-05, |
|
"loss": 0.8441, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.7358639155772563, |
|
"grad_norm": 3.0957512855529785, |
|
"learning_rate": 1.874714763625892e-05, |
|
"loss": 0.8154, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.7459442431879036, |
|
"grad_norm": 3.3382112979888916, |
|
"learning_rate": 1.8689519659051467e-05, |
|
"loss": 1.0091, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.7560245707985509, |
|
"grad_norm": 2.7359678745269775, |
|
"learning_rate": 1.8630688675320844e-05, |
|
"loss": 0.9901, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.7661048984091983, |
|
"grad_norm": 2.9160029888153076, |
|
"learning_rate": 1.8570662829827632e-05, |
|
"loss": 1.0645, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.7761852260198456, |
|
"grad_norm": 3.7096657752990723, |
|
"learning_rate": 1.8509450432753123e-05, |
|
"loss": 0.8458, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.786265553630493, |
|
"grad_norm": 2.9605114459991455, |
|
"learning_rate": 1.8447059958548822e-05, |
|
"loss": 0.8315, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.7963458812411404, |
|
"grad_norm": 3.1716909408569336, |
|
"learning_rate": 1.8383500044763226e-05, |
|
"loss": 0.8427, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.8064262088517877, |
|
"grad_norm": 4.014035224914551, |
|
"learning_rate": 1.8318779490846005e-05, |
|
"loss": 0.8391, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.816506536462435, |
|
"grad_norm": 2.6693358421325684, |
|
"learning_rate": 1.8252907256929777e-05, |
|
"loss": 1.0444, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.8265868640730824, |
|
"grad_norm": 3.824836254119873, |
|
"learning_rate": 1.818589246258964e-05, |
|
"loss": 0.9738, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.8366671916837297, |
|
"grad_norm": 3.0832369327545166, |
|
"learning_rate": 1.8117744385580627e-05, |
|
"loss": 0.9109, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.8467475192943771, |
|
"grad_norm": 3.707331657409668, |
|
"learning_rate": 1.804847246055326e-05, |
|
"loss": 0.8324, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.8568278469050244, |
|
"grad_norm": 3.3008220195770264, |
|
"learning_rate": 1.797808627774738e-05, |
|
"loss": 0.9383, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.8669081745156718, |
|
"grad_norm": 3.1887435913085938, |
|
"learning_rate": 1.7906595581664462e-05, |
|
"loss": 0.8441, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.8769885021263191, |
|
"grad_norm": 2.8672823905944824, |
|
"learning_rate": 1.7834010269718526e-05, |
|
"loss": 0.8353, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.8870688297369664, |
|
"grad_norm": 3.0346055030822754, |
|
"learning_rate": 1.776034039086592e-05, |
|
"loss": 1.348, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.8971491573476138, |
|
"grad_norm": 3.2000229358673096, |
|
"learning_rate": 1.768559614421411e-05, |
|
"loss": 0.8544, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.9072294849582612, |
|
"grad_norm": 3.007753610610962, |
|
"learning_rate": 1.7609787877609678e-05, |
|
"loss": 0.8505, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.9173098125689085, |
|
"grad_norm": 2.9321534633636475, |
|
"learning_rate": 1.753292608620573e-05, |
|
"loss": 0.8402, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.9273901401795558, |
|
"grad_norm": 2.6721303462982178, |
|
"learning_rate": 1.7455021411008906e-05, |
|
"loss": 0.8421, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.9374704677902032, |
|
"grad_norm": 3.0125327110290527, |
|
"learning_rate": 1.7376084637406222e-05, |
|
"loss": 0.8443, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.9475507954008505, |
|
"grad_norm": 2.6522045135498047, |
|
"learning_rate": 1.7296126693671886e-05, |
|
"loss": 1.2249, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.9576311230114979, |
|
"grad_norm": 2.6742281913757324, |
|
"learning_rate": 1.721515864945435e-05, |
|
"loss": 0.846, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.9677114506221453, |
|
"grad_norm": 2.9093592166900635, |
|
"learning_rate": 1.7133191714243805e-05, |
|
"loss": 0.9391, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.9777917782327926, |
|
"grad_norm": 3.675670623779297, |
|
"learning_rate": 1.7050237235820287e-05, |
|
"loss": 0.8723, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.9878721058434399, |
|
"grad_norm": 2.698991298675537, |
|
"learning_rate": 1.6966306698682672e-05, |
|
"loss": 0.8491, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.9979524334540872, |
|
"grad_norm": 2.8708646297454834, |
|
"learning_rate": 1.6881411722458688e-05, |
|
"loss": 1.1059, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.0080327610647346, |
|
"grad_norm": 3.9162817001342773, |
|
"learning_rate": 1.6795564060296295e-05, |
|
"loss": 0.711, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.018113088675382, |
|
"grad_norm": 3.2829103469848633, |
|
"learning_rate": 1.6708775597236507e-05, |
|
"loss": 0.7179, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 1.0281934162860293, |
|
"grad_norm": 4.782021999359131, |
|
"learning_rate": 1.6621058348568008e-05, |
|
"loss": 0.759, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.0382737438966767, |
|
"grad_norm": 4.203643798828125, |
|
"learning_rate": 1.6532424458163692e-05, |
|
"loss": 0.7717, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 1.048354071507324, |
|
"grad_norm": 4.421259880065918, |
|
"learning_rate": 1.6442886196799465e-05, |
|
"loss": 0.651, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.0584343991179714, |
|
"grad_norm": 3.6463119983673096, |
|
"learning_rate": 1.6352455960455385e-05, |
|
"loss": 0.7719, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.0685147267286186, |
|
"grad_norm": 3.408778429031372, |
|
"learning_rate": 1.6261146268599564e-05, |
|
"loss": 0.6591, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.078595054339266, |
|
"grad_norm": 3.7105519771575928, |
|
"learning_rate": 1.6168969762454897e-05, |
|
"loss": 0.6645, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.0886753819499133, |
|
"grad_norm": 3.8667266368865967, |
|
"learning_rate": 1.607593920324899e-05, |
|
"loss": 0.7159, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.0987557095605607, |
|
"grad_norm": 3.4244542121887207, |
|
"learning_rate": 1.598206747044746e-05, |
|
"loss": 0.8331, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.108836037171208, |
|
"grad_norm": 3.326638698577881, |
|
"learning_rate": 1.5887367559970825e-05, |
|
"loss": 0.6831, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.1189163647818554, |
|
"grad_norm": 3.7026546001434326, |
|
"learning_rate": 1.5791852582395334e-05, |
|
"loss": 0.6642, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.1289966923925028, |
|
"grad_norm": 3.1158483028411865, |
|
"learning_rate": 1.569553576113789e-05, |
|
"loss": 0.6531, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.1390770200031501, |
|
"grad_norm": 3.1516995429992676, |
|
"learning_rate": 1.5598430430625335e-05, |
|
"loss": 0.6734, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.1491573476137975, |
|
"grad_norm": 3.048830270767212, |
|
"learning_rate": 1.5500550034448415e-05, |
|
"loss": 0.6412, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.1592376752244449, |
|
"grad_norm": 2.9057071208953857, |
|
"learning_rate": 1.540190812350059e-05, |
|
"loss": 0.6441, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.1693180028350922, |
|
"grad_norm": 4.930371284484863, |
|
"learning_rate": 1.5302518354101992e-05, |
|
"loss": 0.6499, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.1793983304457396, |
|
"grad_norm": 3.2132606506347656, |
|
"learning_rate": 1.5202394486108823e-05, |
|
"loss": 0.7648, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.1894786580563868, |
|
"grad_norm": 3.247512102127075, |
|
"learning_rate": 1.5101550381008377e-05, |
|
"loss": 0.6341, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.1995589856670341, |
|
"grad_norm": 2.8837008476257324, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 0.6785, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.2096393132776815, |
|
"grad_norm": 3.362884998321533, |
|
"learning_rate": 1.4897757402062285e-05, |
|
"loss": 0.6433, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.2197196408883288, |
|
"grad_norm": 3.0538456439971924, |
|
"learning_rate": 1.4794836742006667e-05, |
|
"loss": 0.7454, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.2297999684989762, |
|
"grad_norm": 3.1768131256103516, |
|
"learning_rate": 1.4691252268517794e-05, |
|
"loss": 0.7879, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.2398802961096236, |
|
"grad_norm": 3.0788283348083496, |
|
"learning_rate": 1.4587018322180906e-05, |
|
"loss": 0.7103, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.249960623720271, |
|
"grad_norm": 2.7563843727111816, |
|
"learning_rate": 1.4482149333496455e-05, |
|
"loss": 0.6592, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.2600409513309183, |
|
"grad_norm": 3.421998977661133, |
|
"learning_rate": 1.4376659820882308e-05, |
|
"loss": 0.7862, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.2701212789415657, |
|
"grad_norm": 3.0229763984680176, |
|
"learning_rate": 1.4270564388663761e-05, |
|
"loss": 0.6586, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.2802016065522128, |
|
"grad_norm": 2.8393478393554688, |
|
"learning_rate": 1.4163877725051677e-05, |
|
"loss": 0.7359, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.2902819341628602, |
|
"grad_norm": 3.009399175643921, |
|
"learning_rate": 1.4056614600108998e-05, |
|
"loss": 0.6755, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.3003622617735076, |
|
"grad_norm": 2.9821226596832275, |
|
"learning_rate": 1.3948789863705914e-05, |
|
"loss": 0.629, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.310442589384155, |
|
"grad_norm": 3.1034419536590576, |
|
"learning_rate": 1.3840418443464015e-05, |
|
"loss": 0.6466, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.3205229169948023, |
|
"grad_norm": 3.318528413772583, |
|
"learning_rate": 1.3731515342689654e-05, |
|
"loss": 1.0047, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.3306032446054497, |
|
"grad_norm": 2.9240915775299072, |
|
"learning_rate": 1.3622095638296827e-05, |
|
"loss": 0.668, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.340683572216097, |
|
"grad_norm": 3.9342963695526123, |
|
"learning_rate": 1.3512174478719896e-05, |
|
"loss": 0.6465, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.3507638998267444, |
|
"grad_norm": 2.6329188346862793, |
|
"learning_rate": 1.340176708181637e-05, |
|
"loss": 0.8036, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.3608442274373918, |
|
"grad_norm": 3.2634246349334717, |
|
"learning_rate": 1.32908887327601e-05, |
|
"loss": 0.6348, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.3709245550480391, |
|
"grad_norm": 2.8796093463897705, |
|
"learning_rate": 1.317955478192515e-05, |
|
"loss": 0.7079, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.3810048826586865, |
|
"grad_norm": 3.15555477142334, |
|
"learning_rate": 1.306778064276064e-05, |
|
"loss": 0.7197, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.3910852102693338, |
|
"grad_norm": 2.8696324825286865, |
|
"learning_rate": 1.2955581789656844e-05, |
|
"loss": 0.6422, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.4011655378799812, |
|
"grad_norm": 2.841829538345337, |
|
"learning_rate": 1.2842973755802872e-05, |
|
"loss": 0.6522, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.4112458654906286, |
|
"grad_norm": 2.869424819946289, |
|
"learning_rate": 1.2729972131036212e-05, |
|
"loss": 0.7462, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.4213261931012757, |
|
"grad_norm": 3.1612401008605957, |
|
"learning_rate": 1.2616592559684408e-05, |
|
"loss": 0.6471, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.431406520711923, |
|
"grad_norm": 2.7609212398529053, |
|
"learning_rate": 1.25028507383992e-05, |
|
"loss": 0.7627, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.4414868483225705, |
|
"grad_norm": 2.678645610809326, |
|
"learning_rate": 1.2388762413983447e-05, |
|
"loss": 0.6729, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.4515671759332178, |
|
"grad_norm": 2.8031179904937744, |
|
"learning_rate": 1.2274343381211067e-05, |
|
"loss": 0.6497, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.4616475035438652, |
|
"grad_norm": 2.735318660736084, |
|
"learning_rate": 1.2159609480640361e-05, |
|
"loss": 0.6786, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.4717278311545126, |
|
"grad_norm": 2.9970738887786865, |
|
"learning_rate": 1.2044576596421003e-05, |
|
"loss": 0.6498, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.48180815876516, |
|
"grad_norm": 2.8578624725341797, |
|
"learning_rate": 1.192926065409497e-05, |
|
"loss": 0.6432, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.4918884863758073, |
|
"grad_norm": 2.9433505535125732, |
|
"learning_rate": 1.1813677618391759e-05, |
|
"loss": 0.6274, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.5019688139864544, |
|
"grad_norm": 2.799851417541504, |
|
"learning_rate": 1.1697843491018189e-05, |
|
"loss": 0.6507, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.5120491415971018, |
|
"grad_norm": 2.589261770248413, |
|
"learning_rate": 1.1581774308443042e-05, |
|
"loss": 0.8801, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.5221294692077492, |
|
"grad_norm": 2.7499136924743652, |
|
"learning_rate": 1.1465486139676955e-05, |
|
"loss": 0.6428, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.5322097968183965, |
|
"grad_norm": 3.6993484497070312, |
|
"learning_rate": 1.134899508404775e-05, |
|
"loss": 0.6641, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.542290124429044, |
|
"grad_norm": 5.174093723297119, |
|
"learning_rate": 1.1232317268971586e-05, |
|
"loss": 0.7828, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.5523704520396913, |
|
"grad_norm": 2.734003782272339, |
|
"learning_rate": 1.1115468847720245e-05, |
|
"loss": 0.7631, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.5624507796503386, |
|
"grad_norm": 3.996946334838867, |
|
"learning_rate": 1.0998465997184798e-05, |
|
"loss": 0.6416, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.572531107260986, |
|
"grad_norm": 3.302497386932373, |
|
"learning_rate": 1.088132491563602e-05, |
|
"loss": 0.6543, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.5826114348716334, |
|
"grad_norm": 2.9945664405822754, |
|
"learning_rate": 1.0764061820481872e-05, |
|
"loss": 0.6902, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.5926917624822807, |
|
"grad_norm": 2.6038448810577393, |
|
"learning_rate": 1.0646692946022285e-05, |
|
"loss": 0.6289, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.602772090092928, |
|
"grad_norm": 2.6396548748016357, |
|
"learning_rate": 1.0529234541201631e-05, |
|
"loss": 0.8164, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.6128524177035755, |
|
"grad_norm": 2.770799160003662, |
|
"learning_rate": 1.041170286735918e-05, |
|
"loss": 0.6438, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.6229327453142228, |
|
"grad_norm": 2.666429042816162, |
|
"learning_rate": 1.0294114195977796e-05, |
|
"loss": 0.6912, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.6330130729248702, |
|
"grad_norm": 2.9112017154693604, |
|
"learning_rate": 1.0176484806431288e-05, |
|
"loss": 0.7345, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.6430934005355176, |
|
"grad_norm": 3.0811562538146973, |
|
"learning_rate": 1.0058830983730622e-05, |
|
"loss": 0.7558, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.6531737281461647, |
|
"grad_norm": 2.8588948249816895, |
|
"learning_rate": 9.94116901626938e-06, |
|
"loss": 0.648, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.663254055756812, |
|
"grad_norm": 2.7404801845550537, |
|
"learning_rate": 9.823515193568715e-06, |
|
"loss": 0.695, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.6733343833674594, |
|
"grad_norm": 2.9604055881500244, |
|
"learning_rate": 9.705885804022207e-06, |
|
"loss": 0.6304, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.6834147109781068, |
|
"grad_norm": 3.1369986534118652, |
|
"learning_rate": 9.588297132640824e-06, |
|
"loss": 0.8216, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.6934950385887542, |
|
"grad_norm": 2.912094831466675, |
|
"learning_rate": 9.470765458798369e-06, |
|
"loss": 0.6653, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.7035753661994015, |
|
"grad_norm": 2.6910438537597656, |
|
"learning_rate": 9.353307053977717e-06, |
|
"loss": 0.6645, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.7136556938100487, |
|
"grad_norm": 2.958939790725708, |
|
"learning_rate": 9.235938179518131e-06, |
|
"loss": 0.6222, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.723736021420696, |
|
"grad_norm": 2.7143542766571045, |
|
"learning_rate": 9.118675084363986e-06, |
|
"loss": 0.7051, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.7338163490313434, |
|
"grad_norm": 2.6610677242279053, |
|
"learning_rate": 9.001534002815209e-06, |
|
"loss": 0.6333, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.7438966766419908, |
|
"grad_norm": 2.9667537212371826, |
|
"learning_rate": 8.884531152279757e-06, |
|
"loss": 0.6832, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.7539770042526381, |
|
"grad_norm": 2.64609956741333, |
|
"learning_rate": 8.767682731028415e-06, |
|
"loss": 0.9484, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.7640573318632855, |
|
"grad_norm": 2.682523012161255, |
|
"learning_rate": 8.651004915952252e-06, |
|
"loss": 0.8721, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.7741376594739329, |
|
"grad_norm": 2.5906975269317627, |
|
"learning_rate": 8.534513860323047e-06, |
|
"loss": 0.9793, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.7842179870845802, |
|
"grad_norm": 2.636467456817627, |
|
"learning_rate": 8.418225691556962e-06, |
|
"loss": 0.9016, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.7942983146952276, |
|
"grad_norm": 3.5005948543548584, |
|
"learning_rate": 8.302156508981816e-06, |
|
"loss": 0.738, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.804378642305875, |
|
"grad_norm": 2.7986643314361572, |
|
"learning_rate": 8.18632238160824e-06, |
|
"loss": 0.6635, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.8144589699165223, |
|
"grad_norm": 2.8597512245178223, |
|
"learning_rate": 8.070739345905032e-06, |
|
"loss": 0.7473, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.8245392975271697, |
|
"grad_norm": 2.7487239837646484, |
|
"learning_rate": 7.955423403578998e-06, |
|
"loss": 0.7526, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.834619625137817, |
|
"grad_norm": 2.68874454498291, |
|
"learning_rate": 7.840390519359644e-06, |
|
"loss": 0.6491, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.8446999527484644, |
|
"grad_norm": 2.8393709659576416, |
|
"learning_rate": 7.725656618788938e-06, |
|
"loss": 0.6401, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.8547802803591118, |
|
"grad_norm": 2.8322646617889404, |
|
"learning_rate": 7.611237586016558e-06, |
|
"loss": 0.7692, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.8648606079697592, |
|
"grad_norm": 2.760575771331787, |
|
"learning_rate": 7.497149261600803e-06, |
|
"loss": 0.926, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.8749409355804063, |
|
"grad_norm": 2.6379311084747314, |
|
"learning_rate": 7.383407440315595e-06, |
|
"loss": 0.654, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.8850212631910537, |
|
"grad_norm": 2.6411261558532715, |
|
"learning_rate": 7.27002786896379e-06, |
|
"loss": 0.7753, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.895101590801701, |
|
"grad_norm": 2.6866044998168945, |
|
"learning_rate": 7.157026244197132e-06, |
|
"loss": 0.6479, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.9051819184123484, |
|
"grad_norm": 2.743093252182007, |
|
"learning_rate": 7.044418210343161e-06, |
|
"loss": 0.7825, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.9152622460229958, |
|
"grad_norm": 2.7608628273010254, |
|
"learning_rate": 6.932219357239362e-06, |
|
"loss": 0.6497, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.9253425736336431, |
|
"grad_norm": 2.581033706665039, |
|
"learning_rate": 6.820445218074849e-06, |
|
"loss": 0.658, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.9354229012442903, |
|
"grad_norm": 2.8341994285583496, |
|
"learning_rate": 6.7091112672399e-06, |
|
"loss": 0.8367, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.9455032288549376, |
|
"grad_norm": 2.712247133255005, |
|
"learning_rate": 6.5982329181836325e-06, |
|
"loss": 0.647, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.955583556465585, |
|
"grad_norm": 2.683356761932373, |
|
"learning_rate": 6.487825521280109e-06, |
|
"loss": 0.7316, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.9656638840762324, |
|
"grad_norm": 2.6433842182159424, |
|
"learning_rate": 6.3779043617031775e-06, |
|
"loss": 0.84, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.9757442116868797, |
|
"grad_norm": 4.231180667877197, |
|
"learning_rate": 6.268484657310351e-06, |
|
"loss": 0.7416, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.9858245392975271, |
|
"grad_norm": 3.0023813247680664, |
|
"learning_rate": 6.159581556535989e-06, |
|
"loss": 0.8632, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.9959048669081745, |
|
"grad_norm": 2.6306092739105225, |
|
"learning_rate": 6.051210136294089e-06, |
|
"loss": 0.6557, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 2.005985194518822, |
|
"grad_norm": 3.3288090229034424, |
|
"learning_rate": 5.943385399891004e-06, |
|
"loss": 0.5327, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 2.016065522129469, |
|
"grad_norm": 3.394890069961548, |
|
"learning_rate": 5.8361222749483246e-06, |
|
"loss": 0.5682, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.0261458497401166, |
|
"grad_norm": 4.5706658363342285, |
|
"learning_rate": 5.729435611336239e-06, |
|
"loss": 0.521, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 2.036226177350764, |
|
"grad_norm": 3.595043420791626, |
|
"learning_rate": 5.6233401791176946e-06, |
|
"loss": 0.4973, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 2.0463065049614113, |
|
"grad_norm": 3.381319761276245, |
|
"learning_rate": 5.517850666503547e-06, |
|
"loss": 0.7273, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 2.0563868325720587, |
|
"grad_norm": 3.813019037246704, |
|
"learning_rate": 5.412981677819094e-06, |
|
"loss": 0.5748, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 2.066467160182706, |
|
"grad_norm": 2.9697418212890625, |
|
"learning_rate": 5.308747731482207e-06, |
|
"loss": 0.6197, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 2.0765474877933534, |
|
"grad_norm": 4.898626804351807, |
|
"learning_rate": 5.205163257993341e-06, |
|
"loss": 0.4839, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 2.0866278154040008, |
|
"grad_norm": 4.427509784698486, |
|
"learning_rate": 5.1022425979377174e-06, |
|
"loss": 0.455, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 2.096708143014648, |
|
"grad_norm": 3.7625739574432373, |
|
"learning_rate": 5.000000000000003e-06, |
|
"loss": 0.538, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 2.1067884706252955, |
|
"grad_norm": 2.9984402656555176, |
|
"learning_rate": 4.89844961899163e-06, |
|
"loss": 0.5426, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 2.116868798235943, |
|
"grad_norm": 3.2791342735290527, |
|
"learning_rate": 4.797605513891179e-06, |
|
"loss": 0.5505, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.12694912584659, |
|
"grad_norm": 3.1224496364593506, |
|
"learning_rate": 4.697481645898012e-06, |
|
"loss": 0.5466, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 2.137029453457237, |
|
"grad_norm": 2.9908924102783203, |
|
"learning_rate": 4.598091876499417e-06, |
|
"loss": 0.4739, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 2.1471097810678845, |
|
"grad_norm": 3.272909164428711, |
|
"learning_rate": 4.4994499655515865e-06, |
|
"loss": 0.4773, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 2.157190108678532, |
|
"grad_norm": 3.1660666465759277, |
|
"learning_rate": 4.4015695693746685e-06, |
|
"loss": 0.6012, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 2.1672704362891793, |
|
"grad_norm": 2.9826879501342773, |
|
"learning_rate": 4.304464238862115e-06, |
|
"loss": 0.559, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 2.1773507638998266, |
|
"grad_norm": 3.156632900238037, |
|
"learning_rate": 4.208147417604665e-06, |
|
"loss": 0.4658, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 2.187431091510474, |
|
"grad_norm": 2.7207696437835693, |
|
"learning_rate": 4.112632440029176e-06, |
|
"loss": 0.4746, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 2.1975114191211214, |
|
"grad_norm": 3.170917272567749, |
|
"learning_rate": 4.017932529552543e-06, |
|
"loss": 0.4555, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 2.2075917467317687, |
|
"grad_norm": 2.873971939086914, |
|
"learning_rate": 3.924060796751012e-06, |
|
"loss": 0.4927, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 2.217672074342416, |
|
"grad_norm": 3.0037410259246826, |
|
"learning_rate": 3.83103023754511e-06, |
|
"loss": 0.5199, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.2277524019530635, |
|
"grad_norm": 3.2414352893829346, |
|
"learning_rate": 3.7388537314004394e-06, |
|
"loss": 0.4665, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 2.237832729563711, |
|
"grad_norm": 2.9535632133483887, |
|
"learning_rate": 3.647544039544615e-06, |
|
"loss": 0.4625, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 2.247913057174358, |
|
"grad_norm": 2.874563455581665, |
|
"learning_rate": 3.557113803200537e-06, |
|
"loss": 0.4651, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 2.2579933847850056, |
|
"grad_norm": 2.8400771617889404, |
|
"learning_rate": 3.4675755418363054e-06, |
|
"loss": 0.4741, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 2.268073712395653, |
|
"grad_norm": 3.162914752960205, |
|
"learning_rate": 3.378941651431996e-06, |
|
"loss": 0.5043, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.2781540400063003, |
|
"grad_norm": 3.108367681503296, |
|
"learning_rate": 3.2912244027634953e-06, |
|
"loss": 0.4612, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 2.2882343676169477, |
|
"grad_norm": 2.9453072547912598, |
|
"learning_rate": 3.204435939703705e-06, |
|
"loss": 0.5951, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 2.298314695227595, |
|
"grad_norm": 2.926748752593994, |
|
"learning_rate": 3.1185882775413123e-06, |
|
"loss": 0.4727, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 2.3083950228382424, |
|
"grad_norm": 2.7505362033843994, |
|
"learning_rate": 3.0336933013173307e-06, |
|
"loss": 0.4771, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 2.3184753504488897, |
|
"grad_norm": 3.1303627490997314, |
|
"learning_rate": 2.949762764179711e-06, |
|
"loss": 0.5534, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.328555678059537, |
|
"grad_norm": 2.6740784645080566, |
|
"learning_rate": 2.8668082857562006e-06, |
|
"loss": 0.4713, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 2.3386360056701845, |
|
"grad_norm": 2.6228513717651367, |
|
"learning_rate": 2.7848413505456564e-06, |
|
"loss": 0.532, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 2.348716333280832, |
|
"grad_norm": 3.3503799438476562, |
|
"learning_rate": 2.7038733063281177e-06, |
|
"loss": 0.5022, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 2.358796660891479, |
|
"grad_norm": 2.798093557357788, |
|
"learning_rate": 2.6239153625937786e-06, |
|
"loss": 0.4674, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 2.368876988502126, |
|
"grad_norm": 2.9283840656280518, |
|
"learning_rate": 2.544978588991096e-06, |
|
"loss": 0.5145, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 2.3789573161127735, |
|
"grad_norm": 2.9294168949127197, |
|
"learning_rate": 2.4670739137942723e-06, |
|
"loss": 0.5262, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 2.389037643723421, |
|
"grad_norm": 2.9340579509735107, |
|
"learning_rate": 2.390212122390323e-06, |
|
"loss": 0.4654, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 2.3991179713340682, |
|
"grad_norm": 2.9282586574554443, |
|
"learning_rate": 2.3144038557858915e-06, |
|
"loss": 0.7147, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 2.4091982989447156, |
|
"grad_norm": 2.9570140838623047, |
|
"learning_rate": 2.2396596091340805e-06, |
|
"loss": 0.4643, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 2.419278626555363, |
|
"grad_norm": 2.7537012100219727, |
|
"learning_rate": 2.165989730281475e-06, |
|
"loss": 0.4467, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.4293589541660103, |
|
"grad_norm": 2.764420986175537, |
|
"learning_rate": 2.0934044183355384e-06, |
|
"loss": 0.4774, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 2.4394392817766577, |
|
"grad_norm": 3.11476993560791, |
|
"learning_rate": 2.0219137222526188e-06, |
|
"loss": 0.5792, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 2.449519609387305, |
|
"grad_norm": 4.960219860076904, |
|
"learning_rate": 1.9515275394467446e-06, |
|
"loss": 0.457, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 2.4595999369979524, |
|
"grad_norm": 3.0883636474609375, |
|
"learning_rate": 1.882255614419376e-06, |
|
"loss": 0.6268, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 2.4696802646086, |
|
"grad_norm": 2.9060003757476807, |
|
"learning_rate": 1.8141075374103634e-06, |
|
"loss": 0.5785, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 2.479760592219247, |
|
"grad_norm": 2.7196030616760254, |
|
"learning_rate": 1.7470927430702277e-06, |
|
"loss": 0.4658, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 2.4898409198298945, |
|
"grad_norm": 2.8663458824157715, |
|
"learning_rate": 1.6812205091539979e-06, |
|
"loss": 0.4635, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 2.499921247440542, |
|
"grad_norm": 2.7674198150634766, |
|
"learning_rate": 1.6164999552367767e-06, |
|
"loss": 0.475, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 2.5100015750511893, |
|
"grad_norm": 2.971801280975342, |
|
"learning_rate": 1.5529400414511809e-06, |
|
"loss": 0.5657, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 2.5200819026618366, |
|
"grad_norm": 3.0309677124023438, |
|
"learning_rate": 1.4905495672468784e-06, |
|
"loss": 0.4609, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.530162230272484, |
|
"grad_norm": 2.9311161041259766, |
|
"learning_rate": 1.4293371701723701e-06, |
|
"loss": 0.5184, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 2.5402425578831314, |
|
"grad_norm": 2.813347578048706, |
|
"learning_rate": 1.369311324679159e-06, |
|
"loss": 0.4675, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 2.5503228854937783, |
|
"grad_norm": 2.8366470336914062, |
|
"learning_rate": 1.3104803409485357e-06, |
|
"loss": 0.5566, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 2.5604032131044256, |
|
"grad_norm": 2.9654905796051025, |
|
"learning_rate": 1.252852363741084e-06, |
|
"loss": 0.4739, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 2.570483540715073, |
|
"grad_norm": 2.7376723289489746, |
|
"learning_rate": 1.196435371269089e-06, |
|
"loss": 0.4755, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.5805638683257204, |
|
"grad_norm": 2.603374719619751, |
|
"learning_rate": 1.1412371740920036e-06, |
|
"loss": 0.45, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 2.5906441959363677, |
|
"grad_norm": 2.7828543186187744, |
|
"learning_rate": 1.0872654140351458e-06, |
|
"loss": 0.7043, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 2.600724523547015, |
|
"grad_norm": 2.6468091011047363, |
|
"learning_rate": 1.0345275631317165e-06, |
|
"loss": 0.6008, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 2.6108048511576625, |
|
"grad_norm": 2.7856605052948, |
|
"learning_rate": 9.830309225883562e-07, |
|
"loss": 0.55, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 2.62088517876831, |
|
"grad_norm": 2.949894428253174, |
|
"learning_rate": 9.327826217743452e-07, |
|
"loss": 0.4517, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.630965506378957, |
|
"grad_norm": 3.2041678428649902, |
|
"learning_rate": 8.837896172345827e-07, |
|
"loss": 0.5641, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 2.6410458339896046, |
|
"grad_norm": 3.0084447860717773, |
|
"learning_rate": 8.360586917264979e-07, |
|
"loss": 0.4635, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 2.651126161600252, |
|
"grad_norm": 2.7008798122406006, |
|
"learning_rate": 7.895964532810318e-07, |
|
"loss": 0.5861, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 2.6612064892108993, |
|
"grad_norm": 2.8974366188049316, |
|
"learning_rate": 7.4440933428779e-07, |
|
"loss": 0.4619, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 2.6712868168215467, |
|
"grad_norm": 2.954524040222168, |
|
"learning_rate": 7.005035906045199e-07, |
|
"loss": 0.4819, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.681367144432194, |
|
"grad_norm": 2.6953723430633545, |
|
"learning_rate": 6.578853006910402e-07, |
|
"loss": 0.4843, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 2.6914474720428414, |
|
"grad_norm": 3.108698844909668, |
|
"learning_rate": 6.165603647677054e-07, |
|
"loss": 0.4586, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 2.7015277996534888, |
|
"grad_norm": 2.9222874641418457, |
|
"learning_rate": 5.765345039985648e-07, |
|
"loss": 0.4737, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 2.711608127264136, |
|
"grad_norm": 2.904561996459961, |
|
"learning_rate": 5.378132596993047e-07, |
|
"loss": 0.4641, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 2.7216884548747835, |
|
"grad_norm": 3.0697550773620605, |
|
"learning_rate": 5.004019925700921e-07, |
|
"loss": 0.5555, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.731768782485431, |
|
"grad_norm": 2.980802536010742, |
|
"learning_rate": 4.6430588195341853e-07, |
|
"loss": 0.5908, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 2.7418491100960782, |
|
"grad_norm": 3.690682888031006, |
|
"learning_rate": 4.295299251170537e-07, |
|
"loss": 0.5399, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 2.7519294377067256, |
|
"grad_norm": 2.920565128326416, |
|
"learning_rate": 3.960789365622075e-07, |
|
"loss": 0.7304, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 2.762009765317373, |
|
"grad_norm": 2.853963851928711, |
|
"learning_rate": 3.6395754735699896e-07, |
|
"loss": 0.571, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 2.7720900929280203, |
|
"grad_norm": 2.8439598083496094, |
|
"learning_rate": 3.3317020449530666e-07, |
|
"loss": 0.5728, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.7821704205386677, |
|
"grad_norm": 2.6904118061065674, |
|
"learning_rate": 3.0372117028111825e-07, |
|
"loss": 0.4688, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 2.792250748149315, |
|
"grad_norm": 2.8385207653045654, |
|
"learning_rate": 2.7561452173844206e-07, |
|
"loss": 0.5202, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 2.8023310757599624, |
|
"grad_norm": 2.739987373352051, |
|
"learning_rate": 2.488541500468666e-07, |
|
"loss": 0.586, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 2.81241140337061, |
|
"grad_norm": 2.9050772190093994, |
|
"learning_rate": 2.2344376000285606e-07, |
|
"loss": 0.4698, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 2.822491730981257, |
|
"grad_norm": 2.7418956756591797, |
|
"learning_rate": 1.993868695068457e-07, |
|
"loss": 0.4432, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.832572058591904, |
|
"grad_norm": 2.746795892715454, |
|
"learning_rate": 1.766868090762075e-07, |
|
"loss": 0.5331, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 2.8426523862025515, |
|
"grad_norm": 2.7366440296173096, |
|
"learning_rate": 1.553467213841664e-07, |
|
"loss": 0.4813, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 2.852732713813199, |
|
"grad_norm": 2.755558729171753, |
|
"learning_rate": 1.3536956082472074e-07, |
|
"loss": 0.4859, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 2.862813041423846, |
|
"grad_norm": 2.7609097957611084, |
|
"learning_rate": 1.1675809310361497e-07, |
|
"loss": 0.4655, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 2.8728933690344935, |
|
"grad_norm": 3.0042688846588135, |
|
"learning_rate": 9.951489485545696e-08, |
|
"loss": 0.4557, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.882973696645141, |
|
"grad_norm": 2.9627745151519775, |
|
"learning_rate": 8.364235328699566e-08, |
|
"loss": 0.4968, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 2.8930540242557883, |
|
"grad_norm": 2.8256924152374268, |
|
"learning_rate": 6.914266584662988e-08, |
|
"loss": 0.4728, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 2.9031343518664356, |
|
"grad_norm": 2.7654502391815186, |
|
"learning_rate": 5.6017839920180506e-08, |
|
"loss": 0.4594, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 2.913214679477083, |
|
"grad_norm": 5.463405609130859, |
|
"learning_rate": 4.426969255298841e-08, |
|
"loss": 0.4786, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 2.9232950070877304, |
|
"grad_norm": 2.978433132171631, |
|
"learning_rate": 3.38998501983534e-08, |
|
"loss": 0.4687, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.9333753346983777, |
|
"grad_norm": 2.887697458267212, |
|
"learning_rate": 2.4909748492362162e-08, |
|
"loss": 0.5466, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 2.943455662309025, |
|
"grad_norm": 2.823009729385376, |
|
"learning_rate": 1.730063205513277e-08, |
|
"loss": 0.5068, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 2.9535359899196725, |
|
"grad_norm": 2.9089882373809814, |
|
"learning_rate": 1.1073554318509206e-08, |
|
"loss": 0.4521, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 2.96361631753032, |
|
"grad_norm": 2.763700246810913, |
|
"learning_rate": 6.229377380218005e-09, |
|
"loss": 0.456, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 2.973696645140967, |
|
"grad_norm": 2.9146535396575928, |
|
"learning_rate": 2.7687718845148538e-09, |
|
"loss": 0.4586, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.9837769727516146, |
|
"grad_norm": 2.999981641769409, |
|
"learning_rate": 6.922169293421821e-10, |
|
"loss": 0.464, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 2.9938573003622615, |
|
"grad_norm": 4.130459785461426, |
|
"learning_rate": 0.0, |
|
"loss": 0.4729, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 2.9938573003622615, |
|
"step": 297, |
|
"total_flos": 1.3286712322107113e+19, |
|
"train_loss": 0.7266113918638389, |
|
"train_runtime": 26017.2965, |
|
"train_samples_per_second": 5.856, |
|
"train_steps_per_second": 0.011 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 297, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.3286712322107113e+19, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|