MHGanainy/gpt2-xl-lora-multi-3

Browse files

Files changed (5) hide show

README.md +2 -0
all_results.json +10 -10
eval_results.json +5 -5
train_results.json +5 -5
trainer_state.json +237 -321

README.md CHANGED Viewed

@@ -15,6 +15,8 @@ should probably proofread and complete it, then remove this comment. -->
 # gpt2-xl-lora-multi-3
 This model is a fine-tuned version of [openai-community/gpt2-xl](https://huggingface.co/openai-community/gpt2-xl) on an unknown dataset.
 ## Model description

 # gpt2-xl-lora-multi-3
 This model is a fine-tuned version of [openai-community/gpt2-xl](https://huggingface.co/openai-community/gpt2-xl) on an unknown dataset.
+It achieves the following results on the evaluation set:
+- Loss: 2.4837
 ## Model description

all_results.json CHANGED Viewed

@@ -1,13 +1,13 @@
 {
     "epoch": 1.0,
-    "eval_loss": 2.447016954421997,
-    "eval_runtime": 255.0441,
-    "eval_samples_per_second": 7.795,
-    "eval_steps_per_second": 0.49,
-    "perplexity": 11.553829661360664,
-    "total_flos": 1.0085141819823227e+18,
-    "train_loss": 2.6225888340860033,
-    "train_runtime": 2201.4938,
-    "train_samples_per_second": 50.3,
-    "train_steps_per_second": 3.144
 }

 {
     "epoch": 1.0,
+    "eval_loss": 2.4837162494659424,
+    "eval_runtime": 2242.6453,
+    "eval_samples_per_second": 8.521,
+    "eval_steps_per_second": 0.533,
+    "perplexity": 11.985723695080148,
+    "total_flos": 8.372955480242258e+17,
+    "train_loss": 2.6846868539578486,
+    "train_runtime": 1624.688,
+    "train_samples_per_second": 56.585,
+    "train_steps_per_second": 3.537
 }

eval_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "epoch": 1.0,
-    "eval_loss": 2.447016954421997,
-    "eval_runtime": 255.0441,
-    "eval_samples_per_second": 7.795,
-    "eval_steps_per_second": 0.49,
-    "perplexity": 11.553829661360664
 }

 {
     "epoch": 1.0,
+    "eval_loss": 2.4837162494659424,
+    "eval_runtime": 2242.6453,
+    "eval_samples_per_second": 8.521,
+    "eval_steps_per_second": 0.533,
+    "perplexity": 11.985723695080148
 }

train_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "epoch": 1.0,
-    "total_flos": 1.0085141819823227e+18,
-    "train_loss": 2.6225888340860033,
-    "train_runtime": 2201.4938,
-    "train_samples_per_second": 50.3,
-    "train_steps_per_second": 3.144
 }

 {
     "epoch": 1.0,
+    "total_flos": 8.372955480242258e+17,
+    "train_loss": 2.6846868539578486,
+    "train_runtime": 1624.688,
+    "train_samples_per_second": 56.585,
+    "train_steps_per_second": 3.537
 }

trainer_state.json CHANGED Viewed

@@ -3,506 +3,422 @@
   "best_model_checkpoint": null,
   "epoch": 1.0,
   "eval_steps": 500,
-  "global_step": 6921,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.014448779078167894,
-      "grad_norm": 0.1319790631532669,
-      "learning_rate": 3.576589595375723e-07,
-      "loss": 2.9859,
       "step": 100
     },
     {
-      "epoch": 0.02889755815633579,
-      "grad_norm": 0.09819761663675308,
-      "learning_rate": 7.153179190751446e-07,
-      "loss": 2.9932,
       "step": 200
     },
     {
-      "epoch": 0.043346337234503686,
-      "grad_norm": 0.1437925547361374,
-      "learning_rate": 1.0729768786127169e-06,
-      "loss": 2.9802,
       "step": 300
     },
     {
-      "epoch": 0.05779511631267158,
-      "grad_norm": 0.1754620224237442,
-      "learning_rate": 1.4306358381502892e-06,
-      "loss": 2.951,
       "step": 400
     },
     {
-      "epoch": 0.07224389539083947,
-      "grad_norm": 0.16474293172359467,
-      "learning_rate": 1.791907514450867e-06,
-      "loss": 2.9437,
       "step": 500
     },
     {
-      "epoch": 0.08669267446900737,
-      "grad_norm": 0.19413641095161438,
-      "learning_rate": 2.153179190751445e-06,
-      "loss": 2.909,
       "step": 600
     },
     {
-      "epoch": 0.10114145354717527,
-      "grad_norm": 0.21083518862724304,
-      "learning_rate": 2.5144508670520235e-06,
-      "loss": 2.8579,
       "step": 700
     },
     {
-      "epoch": 0.11559023262534315,
-      "grad_norm": 0.16336515545845032,
-      "learning_rate": 2.8757225433526016e-06,
-      "loss": 2.8228,
       "step": 800
     },
     {
-      "epoch": 0.13003901170351106,
-      "grad_norm": 0.20088078081607819,
-      "learning_rate": 3.2369942196531797e-06,
-      "loss": 2.8003,
       "step": 900
     },
     {
-      "epoch": 0.14448779078167895,
-      "grad_norm": 0.15995082259178162,
-      "learning_rate": 3.5982658959537574e-06,
-      "loss": 2.7788,
       "step": 1000
     },
     {
-      "epoch": 0.15893656985984683,
-      "grad_norm": 0.15541793406009674,
-      "learning_rate": 3.9595375722543355e-06,
-      "loss": 2.731,
       "step": 1100
     },
     {
-      "epoch": 0.17338534893801474,
-      "grad_norm": 0.16839973628520966,
-      "learning_rate": 4.320809248554913e-06,
-      "loss": 2.71,
       "step": 1200
     },
     {
-      "epoch": 0.18783412801618263,
-      "grad_norm": 0.16297543048858643,
-      "learning_rate": 4.682080924855492e-06,
-      "loss": 2.6887,
       "step": 1300
     },
     {
-      "epoch": 0.20228290709435054,
-      "grad_norm": 0.1565973162651062,
-      "learning_rate": 5.043352601156069e-06,
-      "loss": 2.666,
       "step": 1400
     },
     {
-      "epoch": 0.21673168617251842,
-      "grad_norm": 0.13941773772239685,
-      "learning_rate": 5.404624277456648e-06,
-      "loss": 2.6619,
       "step": 1500
     },
     {
-      "epoch": 0.2311804652506863,
-      "grad_norm": 0.14650990068912506,
-      "learning_rate": 5.765895953757226e-06,
-      "loss": 2.6402,
       "step": 1600
     },
     {
-      "epoch": 0.24562924432885422,
-      "grad_norm": 0.16068528592586517,
-      "learning_rate": 6.127167630057804e-06,
-      "loss": 2.639,
       "step": 1700
     },
     {
-      "epoch": 0.26007802340702213,
-      "grad_norm": 0.16715380549430847,
-      "learning_rate": 6.488439306358382e-06,
-      "loss": 2.6312,
       "step": 1800
     },
     {
-      "epoch": 0.27452680248519,
-      "grad_norm": 0.2185903787612915,
-      "learning_rate": 6.84971098265896e-06,
-      "loss": 2.627,
       "step": 1900
     },
     {
-      "epoch": 0.2889755815633579,
-      "grad_norm": 0.16029363870620728,
-      "learning_rate": 7.210982658959538e-06,
-      "loss": 2.6231,
       "step": 2000
     },
     {
-      "epoch": 0.3034243606415258,
-      "grad_norm": 0.16060592234134674,
-      "learning_rate": 7.5722543352601166e-06,
-      "loss": 2.6127,
       "step": 2100
     },
     {
-      "epoch": 0.31787313971969366,
-      "grad_norm": 0.16263119876384735,
-      "learning_rate": 7.933526011560694e-06,
-      "loss": 2.6082,
       "step": 2200
     },
     {
-      "epoch": 0.3323219187978616,
-      "grad_norm": 0.1585131585597992,
-      "learning_rate": 8.294797687861273e-06,
-      "loss": 2.6037,
       "step": 2300
     },
     {
-      "epoch": 0.3467706978760295,
-      "grad_norm": 0.16766057908535004,
-      "learning_rate": 8.656069364161851e-06,
-      "loss": 2.6005,
       "step": 2400
     },
     {
-      "epoch": 0.36121947695419737,
-      "grad_norm": 0.1492636650800705,
-      "learning_rate": 9.017341040462428e-06,
-      "loss": 2.5915,
       "step": 2500
     },
     {
-      "epoch": 0.37566825603236526,
-      "grad_norm": 0.17886599898338318,
-      "learning_rate": 9.378612716763007e-06,
-      "loss": 2.5881,
       "step": 2600
     },
     {
-      "epoch": 0.39011703511053314,
-      "grad_norm": 0.18520204722881317,
-      "learning_rate": 9.739884393063585e-06,
-      "loss": 2.5915,
       "step": 2700
     },
     {
-      "epoch": 0.4045658141887011,
-      "grad_norm": 0.1520683318376541,
-      "learning_rate": 1.0101156069364162e-05,
-      "loss": 2.5806,
       "step": 2800
     },
     {
-      "epoch": 0.41901459326686896,
-      "grad_norm": 0.1854362040758133,
-      "learning_rate": 1.046242774566474e-05,
-      "loss": 2.5711,
       "step": 2900
     },
     {
-      "epoch": 0.43346337234503685,
-      "grad_norm": 0.18411140143871307,
-      "learning_rate": 1.0823699421965319e-05,
-      "loss": 2.5841,
       "step": 3000
     },
     {
-      "epoch": 0.44791215142320473,
-      "grad_norm": 0.1857396364212036,
-      "learning_rate": 1.1184971098265898e-05,
-      "loss": 2.5727,
       "step": 3100
     },
     {
-      "epoch": 0.4623609305013726,
-      "grad_norm": 0.23173969984054565,
-      "learning_rate": 1.1546242774566474e-05,
-      "loss": 2.5793,
       "step": 3200
     },
     {
-      "epoch": 0.47680970957954055,
-      "grad_norm": 0.17829596996307373,
-      "learning_rate": 1.1907514450867053e-05,
-      "loss": 2.574,
       "step": 3300
     },
     {
-      "epoch": 0.49125848865770844,
-      "grad_norm": 0.1748592108488083,
-      "learning_rate": 1.2268786127167632e-05,
-      "loss": 2.5675,
       "step": 3400
     },
     {
-      "epoch": 0.5057072677358763,
-      "grad_norm": 0.18341368436813354,
-      "learning_rate": 1.263005780346821e-05,
-      "loss": 2.5695,
       "step": 3500
     },
     {
-      "epoch": 0.5201560468140443,
-      "grad_norm": 0.17201048135757446,
-      "learning_rate": 1.2991329479768787e-05,
-      "loss": 2.565,
       "step": 3600
     },
     {
-      "epoch": 0.5346048258922121,
-      "grad_norm": 0.21022534370422363,
-      "learning_rate": 1.3352601156069365e-05,
-      "loss": 2.5662,
       "step": 3700
     },
     {
-      "epoch": 0.54905360497038,
-      "grad_norm": 0.17963945865631104,
-      "learning_rate": 1.3713872832369944e-05,
-      "loss": 2.5645,
       "step": 3800
     },
     {
-      "epoch": 0.5635023840485479,
-      "grad_norm": 0.18298958241939545,
-      "learning_rate": 1.4075144508670523e-05,
-      "loss": 2.5634,
       "step": 3900
     },
     {
-      "epoch": 0.5779511631267158,
-      "grad_norm": 0.221732035279274,
-      "learning_rate": 1.4436416184971101e-05,
-      "loss": 2.5532,
       "step": 4000
     },
     {
-      "epoch": 0.5923999422048837,
-      "grad_norm": 0.19882583618164062,
-      "learning_rate": 1.4797687861271676e-05,
-      "loss": 2.5595,
       "step": 4100
     },
     {
-      "epoch": 0.6068487212830516,
-      "grad_norm": 0.21510709822177887,
-      "learning_rate": 1.5158959537572255e-05,
-      "loss": 2.5632,
       "step": 4200
     },
     {
-      "epoch": 0.6212975003612194,
-      "grad_norm": 0.17533738911151886,
-      "learning_rate": 1.5520231213872833e-05,
-      "loss": 2.5448,
       "step": 4300
     },
     {
-      "epoch": 0.6357462794393873,
-      "grad_norm": 0.19394342601299286,
-      "learning_rate": 1.5881502890173412e-05,
-      "loss": 2.5597,
       "step": 4400
     },
     {
-      "epoch": 0.6501950585175552,
-      "grad_norm": 0.196193665266037,
-      "learning_rate": 1.624277456647399e-05,
-      "loss": 2.551,
       "step": 4500
     },
     {
-      "epoch": 0.6646438375957232,
-      "grad_norm": 0.1857462376356125,
-      "learning_rate": 1.660404624277457e-05,
-      "loss": 2.5478,
       "step": 4600
     },
     {
-      "epoch": 0.6790926166738911,
-      "grad_norm": 0.20186574757099152,
-      "learning_rate": 1.6965317919075147e-05,
-      "loss": 2.5508,
       "step": 4700
     },
     {
-      "epoch": 0.693541395752059,
-      "grad_norm": 0.22574672102928162,
-      "learning_rate": 1.7326589595375726e-05,
-      "loss": 2.5492,
       "step": 4800
     },
     {
-      "epoch": 0.7079901748302269,
-      "grad_norm": 0.21713367104530334,
-      "learning_rate": 1.76878612716763e-05,
-      "loss": 2.5326,
       "step": 4900
     },
     {
-      "epoch": 0.7224389539083947,
-      "grad_norm": 0.2183527648448944,
-      "learning_rate": 1.804913294797688e-05,
-      "loss": 2.5512,
       "step": 5000
     },
     {
-      "epoch": 0.7368877329865626,
-      "grad_norm": 0.23467661440372467,
-      "learning_rate": 1.8410404624277458e-05,
-      "loss": 2.5462,
       "step": 5100
     },
     {
-      "epoch": 0.7513365120647305,
-      "grad_norm": 0.20273762941360474,
-      "learning_rate": 1.8771676300578037e-05,
-      "loss": 2.5493,
       "step": 5200
     },
     {
-      "epoch": 0.7657852911428984,
-      "grad_norm": 0.20707377791404724,
-      "learning_rate": 1.9132947976878615e-05,
-      "loss": 2.5425,
       "step": 5300
     },
     {
-      "epoch": 0.7802340702210663,
-      "grad_norm": 0.20679670572280884,
-      "learning_rate": 1.949421965317919e-05,
-      "loss": 2.5424,
       "step": 5400
     },
     {
-      "epoch": 0.7946828492992342,
-      "grad_norm": 0.22911733388900757,
-      "learning_rate": 1.985549132947977e-05,
-      "loss": 2.5369,
       "step": 5500
     },
     {
-      "epoch": 0.8091316283774022,
-      "grad_norm": 0.2293398678302765,
-      "learning_rate": 1.990752970486014e-05,
-      "loss": 2.5414,
       "step": 5600
     },
     {
-      "epoch": 0.82358040745557,
-      "grad_norm": 0.21145911514759064,
-      "learning_rate": 1.934861471129627e-05,
-      "loss": 2.5341,
       "step": 5700
     },
-    {
-      "epoch": 0.8380291865337379,
-      "grad_norm": 0.22642740607261658,
-      "learning_rate": 1.8310755913154726e-05,
-      "loss": 2.5312,
-      "step": 5800
-    },
-    {
-      "epoch": 0.8524779656119058,
-      "grad_norm": 0.22286154329776764,
-      "learning_rate": 1.6847124401206384e-05,
-      "loss": 2.5354,
-      "step": 5900
-    },
-    {
-      "epoch": 0.8669267446900737,
-      "grad_norm": 0.22432397305965424,
-      "learning_rate": 1.5032704252395315e-05,
-      "loss": 2.5355,
-      "step": 6000
-    },
-    {
-      "epoch": 0.8813755237682416,
-      "grad_norm": 0.21558962762355804,
-      "learning_rate": 1.2960450981095643e-05,
-      "loss": 2.5481,
-      "step": 6100
-    },
-    {
-      "epoch": 0.8958243028464095,
-      "grad_norm": 0.2449900358915329,
-      "learning_rate": 1.0736529286191087e-05,
-      "loss": 2.5352,
-      "step": 6200
-    },
-    {
-      "epoch": 0.9102730819245773,
-      "grad_norm": 0.22407910227775574,
-      "learning_rate": 8.474874071465144e-06,
-      "loss": 2.5261,
-      "step": 6300
-    },
-    {
-      "epoch": 0.9247218610027452,
-      "grad_norm": 0.20298534631729126,
-      "learning_rate": 6.291353386531074e-06,
-      "loss": 2.5389,
-      "step": 6400
-    },
-    {
-      "epoch": 0.9391706400809131,
-      "grad_norm": 0.2112707942724228,
-      "learning_rate": 4.297832329749687e-06,
-      "loss": 2.5369,
-      "step": 6500
-    },
-    {
-      "epoch": 0.9536194191590811,
-      "grad_norm": 0.22108633816242218,
-      "learning_rate": 2.596442028451194e-06,
-      "loss": 2.5331,
-      "step": 6600
-    },
-    {
-      "epoch": 0.968068198237249,
-      "grad_norm": 0.21827644109725952,
-      "learning_rate": 1.2743473053542842e-06,
-      "loss": 2.5279,
-      "step": 6700
-    },
-    {
-      "epoch": 0.9825169773154169,
-      "grad_norm": 0.22307871282100677,
-      "learning_rate": 3.9928109162008953e-07,
-      "loss": 2.5346,
-      "step": 6800
-    },
-    {
-      "epoch": 0.9969657563935848,
-      "grad_norm": 0.22532746195793152,
-      "learning_rate": 1.6074365378105915e-08,
-      "loss": 2.5428,
-      "step": 6900
-    },
     {
       "epoch": 1.0,
-      "step": 6921,
-      "total_flos": 1.0085141819823227e+18,
-      "train_loss": 2.6225888340860033,
-      "train_runtime": 2201.4938,
-      "train_samples_per_second": 50.3,
-      "train_steps_per_second": 3.144
     }
   ],
   "logging_steps": 100,
-  "max_steps": 6921,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 500,
@@ -518,7 +434,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.0085141819823227e+18,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

   "best_model_checkpoint": null,
   "epoch": 1.0,
   "eval_steps": 500,
+  "global_step": 5746,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.01740341106856944,
+      "grad_norm": 0.046589791774749756,
+      "learning_rate": 4.351610095735422e-07,
+      "loss": 2.7999,
       "step": 100
     },
     {
+      "epoch": 0.03480682213713888,
+      "grad_norm": 0.04616040736436844,
+      "learning_rate": 8.703220191470844e-07,
+      "loss": 2.7996,
       "step": 200
     },
     {
+      "epoch": 0.05221023320570832,
+      "grad_norm": 0.05549981817603111,
+      "learning_rate": 1.305483028720627e-06,
+      "loss": 2.797,
       "step": 300
     },
     {
+      "epoch": 0.06961364427427776,
+      "grad_norm": 0.063571035861969,
+      "learning_rate": 1.7406440382941688e-06,
+      "loss": 2.7909,
       "step": 400
     },
     {
+      "epoch": 0.0870170553428472,
+      "grad_norm": 0.08422163128852844,
+      "learning_rate": 2.1758050478677113e-06,
+      "loss": 2.7951,
       "step": 500
     },
     {
+      "epoch": 0.10442046641141664,
+      "grad_norm": 0.09257014095783234,
+      "learning_rate": 2.610966057441254e-06,
+      "loss": 2.7803,
       "step": 600
     },
     {
+      "epoch": 0.12182387747998608,
+      "grad_norm": 0.11055697500705719,
+      "learning_rate": 3.046127067014796e-06,
+      "loss": 2.7681,
       "step": 700
     },
     {
+      "epoch": 0.1392272885485555,
+      "grad_norm": 0.10759040713310242,
+      "learning_rate": 3.4812880765883376e-06,
+      "loss": 2.7611,
       "step": 800
     },
     {
+      "epoch": 0.15663069961712495,
+      "grad_norm": 0.12318646907806396,
+      "learning_rate": 3.9164490861618806e-06,
+      "loss": 2.7402,
       "step": 900
     },
     {
+      "epoch": 0.1740341106856944,
+      "grad_norm": 0.12962989509105682,
+      "learning_rate": 4.351610095735423e-06,
+      "loss": 2.7451,
       "step": 1000
     },
     {
+      "epoch": 0.19143752175426385,
+      "grad_norm": 0.13981275260448456,
+      "learning_rate": 4.786771105308965e-06,
+      "loss": 2.735,
       "step": 1100
     },
     {
+      "epoch": 0.20884093282283328,
+      "grad_norm": 0.14711035788059235,
+      "learning_rate": 5.221932114882508e-06,
+      "loss": 2.7469,
       "step": 1200
     },
     {
+      "epoch": 0.22624434389140272,
+      "grad_norm": 0.15727241337299347,
+      "learning_rate": 5.657093124456049e-06,
+      "loss": 2.7327,
       "step": 1300
     },
     {
+      "epoch": 0.24364775495997215,
+      "grad_norm": 0.15055705606937408,
+      "learning_rate": 6.092254134029592e-06,
+      "loss": 2.7234,
       "step": 1400
     },
     {
+      "epoch": 0.2610511660285416,
+      "grad_norm": 0.16661331057548523,
+      "learning_rate": 6.527415143603134e-06,
+      "loss": 2.7174,
       "step": 1500
     },
     {
+      "epoch": 0.278454577097111,
+      "grad_norm": 0.17976854741573334,
+      "learning_rate": 6.962576153176675e-06,
+      "loss": 2.719,
       "step": 1600
     },
     {
+      "epoch": 0.2958579881656805,
+      "grad_norm": 0.1790621429681778,
+      "learning_rate": 7.397737162750218e-06,
+      "loss": 2.7173,
       "step": 1700
     },
     {
+      "epoch": 0.3132613992342499,
+      "grad_norm": 0.19079644978046417,
+      "learning_rate": 7.832898172323761e-06,
+      "loss": 2.7131,
       "step": 1800
     },
     {
+      "epoch": 0.33066481030281936,
+      "grad_norm": 0.19005636870861053,
+      "learning_rate": 8.268059181897302e-06,
+      "loss": 2.7168,
       "step": 1900
     },
     {
+      "epoch": 0.3480682213713888,
+      "grad_norm": 0.19910404086112976,
+      "learning_rate": 8.703220191470845e-06,
+      "loss": 2.7061,
       "step": 2000
     },
     {
+      "epoch": 0.3654716324399582,
+      "grad_norm": 0.20510949194431305,
+      "learning_rate": 9.138381201044387e-06,
+      "loss": 2.6862,
       "step": 2100
     },
     {
+      "epoch": 0.3828750435085277,
+      "grad_norm": 0.20418143272399902,
+      "learning_rate": 9.57354221061793e-06,
+      "loss": 2.6802,
       "step": 2200
     },
     {
+      "epoch": 0.4002784545770971,
+      "grad_norm": 0.21713656187057495,
+      "learning_rate": 1.000870322019147e-05,
+      "loss": 2.6923,
       "step": 2300
     },
     {
+      "epoch": 0.41768186564566656,
+      "grad_norm": 0.2298802137374878,
+      "learning_rate": 1.0443864229765015e-05,
+      "loss": 2.6818,
       "step": 2400
     },
     {
+      "epoch": 0.43508527671423597,
+      "grad_norm": 0.2294008880853653,
+      "learning_rate": 1.0879025239338557e-05,
+      "loss": 2.6896,
       "step": 2500
     },
     {
+      "epoch": 0.45248868778280543,
+      "grad_norm": 0.21464629471302032,
+      "learning_rate": 1.1314186248912098e-05,
+      "loss": 2.6805,
       "step": 2600
     },
     {
+      "epoch": 0.4698920988513749,
+      "grad_norm": 0.25449061393737793,
+      "learning_rate": 1.174934725848564e-05,
+      "loss": 2.6806,
       "step": 2700
     },
     {
+      "epoch": 0.4872955099199443,
+      "grad_norm": 0.24079586565494537,
+      "learning_rate": 1.2184508268059184e-05,
+      "loss": 2.6844,
       "step": 2800
     },
     {
+      "epoch": 0.5046989209885138,
+      "grad_norm": 0.2414436638355255,
+      "learning_rate": 1.2619669277632725e-05,
+      "loss": 2.6817,
       "step": 2900
     },
     {
+      "epoch": 0.5221023320570832,
+      "grad_norm": 0.2530564069747925,
+      "learning_rate": 1.3054830287206268e-05,
+      "loss": 2.6556,
       "step": 3000
     },
     {
+      "epoch": 0.5395057431256526,
+      "grad_norm": 0.26441535353660583,
+      "learning_rate": 1.348999129677981e-05,
+      "loss": 2.6749,
       "step": 3100
     },
     {
+      "epoch": 0.556909154194222,
+      "grad_norm": 0.2584131062030792,
+      "learning_rate": 1.392515230635335e-05,
+      "loss": 2.6575,
       "step": 3200
     },
     {
+      "epoch": 0.5743125652627915,
+      "grad_norm": 0.25025609135627747,
+      "learning_rate": 1.4360313315926895e-05,
+      "loss": 2.6658,
       "step": 3300
     },
     {
+      "epoch": 0.591715976331361,
+      "grad_norm": 0.26518625020980835,
+      "learning_rate": 1.4795474325500436e-05,
+      "loss": 2.6586,
       "step": 3400
     },
     {
+      "epoch": 0.6091193873999304,
+      "grad_norm": 0.26597312092781067,
+      "learning_rate": 1.5230635335073978e-05,
+      "loss": 2.6451,
       "step": 3500
     },
     {
+      "epoch": 0.6265227984684998,
+      "grad_norm": 0.2725384831428528,
+      "learning_rate": 1.5665796344647522e-05,
+      "loss": 2.6521,
       "step": 3600
     },
     {
+      "epoch": 0.6439262095370692,
+      "grad_norm": 0.2752222716808319,
+      "learning_rate": 1.6100957354221064e-05,
+      "loss": 2.6398,
       "step": 3700
     },
     {
+      "epoch": 0.6613296206056387,
+      "grad_norm": 0.2558598518371582,
+      "learning_rate": 1.6536118363794605e-05,
+      "loss": 2.6486,
       "step": 3800
     },
     {
+      "epoch": 0.6787330316742082,
+      "grad_norm": 0.26938167214393616,
+      "learning_rate": 1.697127937336815e-05,
+      "loss": 2.641,
       "step": 3900
     },
     {
+      "epoch": 0.6961364427427776,
+      "grad_norm": 0.28793784976005554,
+      "learning_rate": 1.740644038294169e-05,
+      "loss": 2.6344,
       "step": 4000
     },
     {
+      "epoch": 0.713539853811347,
+      "grad_norm": 0.2677360773086548,
+      "learning_rate": 1.7841601392515232e-05,
+      "loss": 2.6542,
       "step": 4100
     },
     {
+      "epoch": 0.7309432648799165,
+      "grad_norm": 0.28143930435180664,
+      "learning_rate": 1.8276762402088773e-05,
+      "loss": 2.6446,
       "step": 4200
     },
     {
+      "epoch": 0.7483466759484859,
+      "grad_norm": 0.28870299458503723,
+      "learning_rate": 1.8711923411662314e-05,
+      "loss": 2.6243,
       "step": 4300
     },
     {
+      "epoch": 0.7657500870170554,
+      "grad_norm": 0.296633780002594,
+      "learning_rate": 1.914708442123586e-05,
+      "loss": 2.6306,
       "step": 4400
     },
     {
+      "epoch": 0.7831534980856247,
+      "grad_norm": 0.2806219160556793,
+      "learning_rate": 1.95822454308094e-05,
+      "loss": 2.6356,
       "step": 4500
     },
     {
+      "epoch": 0.8005569091541942,
+      "grad_norm": 0.2914940416812897,
+      "learning_rate": 1.999940297883134e-05,
+      "loss": 2.644,
       "step": 4600
     },
     {
+      "epoch": 0.8179603202227637,
+      "grad_norm": 0.28510311245918274,
+      "learning_rate": 1.9599117132813187e-05,
+      "loss": 2.6357,
       "step": 4700
     },
     {
+      "epoch": 0.8353637312913331,
+      "grad_norm": 0.3171123266220093,
+      "learning_rate": 1.8486908682093175e-05,
+      "loss": 2.6307,
       "step": 4800
     },
     {
+      "epoch": 0.8527671423599026,
+      "grad_norm": 0.2955775558948517,
+      "learning_rate": 1.674526503944611e-05,
+      "loss": 2.6315,
       "step": 4900
     },
     {
+      "epoch": 0.8701705534284719,
+      "grad_norm": 0.2767013907432556,
+      "learning_rate": 1.450335594635761e-05,
+      "loss": 2.6138,
       "step": 5000
     },
     {
+      "epoch": 0.8875739644970414,
+      "grad_norm": 0.27960339188575745,
+      "learning_rate": 1.1927453544210397e-05,
+      "loss": 2.6305,
       "step": 5100
     },
     {
+      "epoch": 0.9049773755656109,
+      "grad_norm": 0.31521016359329224,
+      "learning_rate": 9.20860073020234e-06,
+      "loss": 2.6249,
       "step": 5200
     },
     {
+      "epoch": 0.9223807866341803,
+      "grad_norm": 0.2640378773212433,
+      "learning_rate": 6.548442379624425e-06,
+      "loss": 2.6257,
       "step": 5300
     },
     {
+      "epoch": 0.9397841977027498,
+      "grad_norm": 0.28068870306015015,
+      "learning_rate": 4.144270267924306e-06,
+      "loss": 2.6261,
       "step": 5400
     },
     {
+      "epoch": 0.9571876087713191,
+      "grad_norm": 0.2999429702758789,
+      "learning_rate": 2.1743908422712135e-06,
+      "loss": 2.6245,
       "step": 5500
     },
     {
+      "epoch": 0.9745910198398886,
+      "grad_norm": 0.2793658971786499,
+      "learning_rate": 7.849010480670938e-07,
+      "loss": 2.6209,
       "step": 5600
     },
     {
+      "epoch": 0.9919944309084581,
+      "grad_norm": 0.30049070715904236,
+      "learning_rate": 7.885298685522235e-08,
+      "loss": 2.6215,
       "step": 5700
     },
     {
       "epoch": 1.0,
+      "step": 5746,
+      "total_flos": 8.372955480242258e+17,
+      "train_loss": 2.6846868539578486,
+      "train_runtime": 1624.688,
+      "train_samples_per_second": 56.585,
+      "train_steps_per_second": 3.537
     }
   ],
   "logging_steps": 100,
+  "max_steps": 5746,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 500,
       "attributes": {}
     }
   },
+  "total_flos": 8.372955480242258e+17,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null