Model save

Browse files

Files changed (4) hide show

README.md +7 -7
all_results.json +7 -12
train_results.json +7 -7
trainer_state.json +262 -101

README.md CHANGED Viewed

@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
 This model is a fine-tuned version of [mistralai/Mistral-7B-v0.3](https://huggingface.co/mistralai/Mistral-7B-v0.3) on the generator dataset.
 It achieves the following results on the evaluation set:
-- Loss: 1.5500
 ## Model description
@@ -48,22 +48,22 @@ The following hyperparameters were used during training:
 - gradient_accumulation_steps: 2
 - total_train_batch_size: 192
 - total_eval_batch_size: 96
-- optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
 - num_epochs: 1
 ### Training results
-| Training Loss | Epoch  | Step | Validation Loss |
-|:-------------:|:------:|:----:|:---------------:|
-| 1.5519        | 0.9951 | 102  | 1.5500          |
 ### Framework versions
 - PEFT 0.13.1.dev0
-- Transformers 4.46.2
-- Pytorch 2.5.1+cu124
 - Datasets 3.1.0
 - Tokenizers 0.20.3

 This model is a fine-tuned version of [mistralai/Mistral-7B-v0.3](https://huggingface.co/mistralai/Mistral-7B-v0.3) on the generator dataset.
 It achieves the following results on the evaluation set:
+- Loss: 1.4864
 ## Model description
 - gradient_accumulation_steps: 2
 - total_train_batch_size: 192
 - total_eval_batch_size: 96
+- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
 - num_epochs: 1
 ### Training results
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:-----:|:----:|:---------------:|
+| 0.982         | 1.0   | 216  | 1.4864          |
 ### Framework versions
 - PEFT 0.13.1.dev0
+- Transformers 4.46.3
+- Pytorch 2.3.1+cu121
 - Datasets 3.1.0
 - Tokenizers 0.20.3

all_results.json CHANGED Viewed

@@ -1,14 +1,9 @@
 {
-    "epoch": 0.9951219512195122,
-    "eval_loss": 1.5499577522277832,
-    "eval_runtime": 1.7323,
-    "eval_samples": 518,
-    "eval_samples_per_second": 112.567,
-    "eval_steps_per_second": 1.732,
-    "total_flos": 4.281868708751606e+17,
-    "train_loss": 1.6186961426454431,
-    "train_runtime": 363.7432,
-    "train_samples": 51241,
-    "train_samples_per_second": 53.848,
-    "train_steps_per_second": 0.28
 }

 {
+    "epoch": 1.0,
+    "total_flos": 9.067486658407956e+17,
+    "train_loss": 1.0541787544886272,
+    "train_runtime": 774.5406,
+    "train_samples": 116368,
+    "train_samples_per_second": 53.491,
+    "train_steps_per_second": 0.279
 }

train_results.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
-    "epoch": 0.9951219512195122,
-    "total_flos": 4.281868708751606e+17,
-    "train_loss": 1.6186961426454431,
-    "train_runtime": 363.7432,
-    "train_samples": 51241,
-    "train_samples_per_second": 53.848,
-    "train_steps_per_second": 0.28
 }

 {
+    "epoch": 1.0,
+    "total_flos": 9.067486658407956e+17,
+    "train_loss": 1.0541787544886272,
+    "train_runtime": 774.5406,
+    "train_samples": 116368,
+    "train_samples_per_second": 53.491,
+    "train_steps_per_second": 0.279
 }

trainer_state.json CHANGED Viewed

@@ -1,180 +1,341 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.9951219512195122,
   "eval_steps": 500,
-  "global_step": 102,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.00975609756097561,
-      "grad_norm": 2.371129274368286,
-      "learning_rate": 1.8181818181818182e-05,
-      "loss": 2.0473,
       "step": 1
     },
     {
-      "epoch": 0.04878048780487805,
-      "grad_norm": 2.3514864444732666,
-      "learning_rate": 9.090909090909092e-05,
-      "loss": 2.0124,
       "step": 5
     },
     {
-      "epoch": 0.0975609756097561,
-      "grad_norm": 1.9732930660247803,
-      "learning_rate": 0.00018181818181818183,
-      "loss": 1.9222,
       "step": 10
     },
     {
-      "epoch": 0.14634146341463414,
-      "grad_norm": 2.1772541999816895,
-      "learning_rate": 0.00019904804439875633,
-      "loss": 1.7928,
       "step": 15
     },
     {
-      "epoch": 0.1951219512195122,
-      "grad_norm": 1.5341215133666992,
-      "learning_rate": 0.00019521176659107142,
-      "loss": 1.7354,
       "step": 20
     },
     {
-      "epoch": 0.24390243902439024,
-      "grad_norm": 1.4509671926498413,
-      "learning_rate": 0.000188545602565321,
-      "loss": 1.6622,
       "step": 25
     },
     {
-      "epoch": 0.2926829268292683,
-      "grad_norm": 0.9245131015777588,
-      "learning_rate": 0.00017924768419510904,
-      "loss": 1.5826,
       "step": 30
     },
     {
-      "epoch": 0.34146341463414637,
-      "grad_norm": 0.8397857546806335,
-      "learning_rate": 0.00016759436441447545,
-      "loss": 1.5755,
       "step": 35
     },
     {
-      "epoch": 0.3902439024390244,
-      "grad_norm": 0.7870491743087769,
-      "learning_rate": 0.00015393200344991995,
-      "loss": 1.5685,
       "step": 40
     },
     {
-      "epoch": 0.43902439024390244,
-      "grad_norm": 0.7064230442047119,
-      "learning_rate": 0.0001386666742941419,
-      "loss": 1.5434,
       "step": 45
     },
     {
-      "epoch": 0.4878048780487805,
-      "grad_norm": 0.8221641182899475,
-      "learning_rate": 0.00012225209339563145,
-      "loss": 1.5552,
       "step": 50
     },
     {
-      "epoch": 0.5365853658536586,
-      "grad_norm": 0.7419559359550476,
-      "learning_rate": 0.00010517613528842097,
-      "loss": 1.5569,
       "step": 55
     },
     {
-      "epoch": 0.5853658536585366,
-      "grad_norm": 0.7535139322280884,
-      "learning_rate": 8.79463319744677e-05,
-      "loss": 1.5627,
       "step": 60
     },
     {
-      "epoch": 0.6341463414634146,
-      "grad_norm": 0.709441065788269,
-      "learning_rate": 7.107478804634325e-05,
-      "loss": 1.5634,
       "step": 65
     },
     {
-      "epoch": 0.6829268292682927,
-      "grad_norm": 0.6745243668556213,
-      "learning_rate": 5.506295990328385e-05,
-      "loss": 1.5537,
       "step": 70
     },
     {
-      "epoch": 0.7317073170731707,
-      "grad_norm": 0.6785560250282288,
-      "learning_rate": 4.038675145307747e-05,
-      "loss": 1.5373,
       "step": 75
     },
     {
-      "epoch": 0.7804878048780488,
-      "grad_norm": 0.7758954167366028,
-      "learning_rate": 2.7482369285662378e-05,
-      "loss": 1.5341,
       "step": 80
     },
     {
-      "epoch": 0.8292682926829268,
-      "grad_norm": 0.6606050729751587,
-      "learning_rate": 1.6733357731279377e-05,
-      "loss": 1.5326,
       "step": 85
     },
     {
-      "epoch": 0.8780487804878049,
-      "grad_norm": 0.8155940771102905,
-      "learning_rate": 8.45919914746337e-06,
-      "loss": 1.5411,
       "step": 90
     },
     {
-      "epoch": 0.926829268292683,
-      "grad_norm": 0.7202953100204468,
-      "learning_rate": 2.905818257394799e-06,
-      "loss": 1.5414,
       "step": 95
     },
     {
-      "epoch": 0.975609756097561,
-      "grad_norm": 0.670049786567688,
-      "learning_rate": 2.382727698752474e-07,
-      "loss": 1.5519,
       "step": 100
     },
     {
-      "epoch": 0.9951219512195122,
-      "eval_loss": 1.5499577522277832,
-      "eval_runtime": 1.7555,
-      "eval_samples_per_second": 111.078,
-      "eval_steps_per_second": 1.709,
-      "step": 102
     },
     {
-      "epoch": 0.9951219512195122,
-      "step": 102,
-      "total_flos": 4.281868708751606e+17,
-      "train_loss": 1.6186961426454431,
-      "train_runtime": 363.7432,
-      "train_samples_per_second": 53.848,
-      "train_steps_per_second": 0.28
     }
   ],
   "logging_steps": 5,
-  "max_steps": 102,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 100,
@@ -190,7 +351,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 4.281868708751606e+17,
   "train_batch_size": 12,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 1.0,
   "eval_steps": 500,
+  "global_step": 216,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.004629629629629629,
+      "grad_norm": 2.620875597000122,
+      "learning_rate": 9.090909090909091e-06,
+      "loss": 1.5605,
       "step": 1
     },
     {
+      "epoch": 0.023148148148148147,
+      "grad_norm": 2.674976348876953,
+      "learning_rate": 4.545454545454546e-05,
+      "loss": 1.5457,
       "step": 5
     },
     {
+      "epoch": 0.046296296296296294,
+      "grad_norm": 2.3885157108306885,
+      "learning_rate": 9.090909090909092e-05,
+      "loss": 1.5006,
       "step": 10
     },
     {
+      "epoch": 0.06944444444444445,
+      "grad_norm": 2.1752545833587646,
+      "learning_rate": 0.00013636363636363637,
+      "loss": 1.4093,
       "step": 15
     },
     {
+      "epoch": 0.09259259259259259,
+      "grad_norm": 2.1516635417938232,
+      "learning_rate": 0.00018181818181818183,
+      "loss": 1.301,
       "step": 20
     },
     {
+      "epoch": 0.11574074074074074,
+      "grad_norm": 1.6158533096313477,
+      "learning_rate": 0.0001998820159279591,
+      "loss": 1.195,
       "step": 25
     },
     {
+      "epoch": 0.1388888888888889,
+      "grad_norm": 0.7115136384963989,
+      "learning_rate": 0.00019916201012264254,
+      "loss": 1.1232,
       "step": 30
     },
     {
+      "epoch": 0.16203703703703703,
+      "grad_norm": 0.5917097926139832,
+      "learning_rate": 0.00019779225723955707,
+      "loss": 1.0867,
       "step": 35
     },
     {
+      "epoch": 0.18518518518518517,
+      "grad_norm": 0.6770131587982178,
+      "learning_rate": 0.00019578173241879872,
+      "loss": 1.0683,
       "step": 40
     },
     {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.5598504543304443,
+      "learning_rate": 0.00019314360938108425,
+      "loss": 1.0576,
       "step": 45
     },
     {
+      "epoch": 0.23148148148148148,
+      "grad_norm": 0.5453623533248901,
+      "learning_rate": 0.00018989517410853955,
+      "loss": 1.0375,
       "step": 50
     },
     {
+      "epoch": 0.25462962962962965,
+      "grad_norm": 0.507411539554596,
+      "learning_rate": 0.00018605771158039253,
+      "loss": 1.0349,
       "step": 55
     },
     {
+      "epoch": 0.2777777777777778,
+      "grad_norm": 0.5281575918197632,
+      "learning_rate": 0.0001816563663057211,
+      "loss": 1.0306,
       "step": 60
     },
     {
+      "epoch": 0.30092592592592593,
+      "grad_norm": 0.49278953671455383,
+      "learning_rate": 0.00017671997756709863,
+      "loss": 1.0232,
       "step": 65
     },
     {
+      "epoch": 0.32407407407407407,
+      "grad_norm": 0.44363367557525635,
+      "learning_rate": 0.00017128089045468294,
+      "loss": 1.0206,
       "step": 70
     },
     {
+      "epoch": 0.3472222222222222,
+      "grad_norm": 0.4600500464439392,
+      "learning_rate": 0.00016537474392892528,
+      "loss": 1.0185,
       "step": 75
     },
     {
+      "epoch": 0.37037037037037035,
+      "grad_norm": 0.4178927540779114,
+      "learning_rate": 0.00015904023730059228,
+      "loss": 1.0105,
       "step": 80
     },
     {
+      "epoch": 0.39351851851851855,
+      "grad_norm": 0.5482760071754456,
+      "learning_rate": 0.000152318876658213,
+      "loss": 1.0164,
       "step": 85
     },
     {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.4235095679759979,
+      "learning_rate": 0.00014525470290445392,
+      "loss": 1.0151,
       "step": 90
     },
     {
+      "epoch": 0.4398148148148148,
+      "grad_norm": 0.4932386875152588,
+      "learning_rate": 0.00013789400318343068,
+      "loss": 1.0081,
       "step": 95
     },
     {
+      "epoch": 0.46296296296296297,
+      "grad_norm": 0.4402116537094116,
+      "learning_rate": 0.00013028500758979506,
+      "loss": 1.0094,
       "step": 100
     },
     {
+      "epoch": 0.4861111111111111,
+      "grad_norm": 0.4497814476490021,
+      "learning_rate": 0.00012247757314687297,
+      "loss": 0.9996,
+      "step": 105
+    },
+    {
+      "epoch": 0.5092592592592593,
+      "grad_norm": 0.43658843636512756,
+      "learning_rate": 0.00011452285712454904,
+      "loss": 1.004,
+      "step": 110
+    },
+    {
+      "epoch": 0.5324074074074074,
+      "grad_norm": 0.4577714800834656,
+      "learning_rate": 0.00010647298183744359,
+      "loss": 0.9936,
+      "step": 115
+    },
+    {
+      "epoch": 0.5555555555555556,
+      "grad_norm": 0.44585293531417847,
+      "learning_rate": 9.838069311974986e-05,
+      "loss": 0.9953,
+      "step": 120
+    },
+    {
+      "epoch": 0.5787037037037037,
+      "grad_norm": 0.4536885619163513,
+      "learning_rate": 9.02990147145352e-05,
+      "loss": 0.9972,
+      "step": 125
+    },
+    {
+      "epoch": 0.6018518518518519,
+      "grad_norm": 0.4714517593383789,
+      "learning_rate": 8.228090084207774e-05,
+      "loss": 0.9963,
+      "step": 130
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.45539769530296326,
+      "learning_rate": 7.437888922374276e-05,
+      "loss": 1.0039,
+      "step": 135
+    },
+    {
+      "epoch": 0.6481481481481481,
+      "grad_norm": 0.4661619961261749,
+      "learning_rate": 6.664475683491796e-05,
+      "loss": 0.996,
+      "step": 140
+    },
+    {
+      "epoch": 0.6712962962962963,
+      "grad_norm": 0.4308771789073944,
+      "learning_rate": 5.9129180642644414e-05,
+      "loss": 0.9968,
+      "step": 145
+    },
+    {
+      "epoch": 0.6944444444444444,
+      "grad_norm": 0.42372000217437744,
+      "learning_rate": 5.1881405550919493e-05,
+      "loss": 0.997,
+      "step": 150
+    },
+    {
+      "epoch": 0.7175925925925926,
+      "grad_norm": 0.4466856122016907,
+      "learning_rate": 4.494892172941965e-05,
+      "loss": 0.997,
+      "step": 155
+    },
+    {
+      "epoch": 0.7407407407407407,
+      "grad_norm": 0.47718337178230286,
+      "learning_rate": 3.8377153439907266e-05,
+      "loss": 0.9932,
+      "step": 160
+    },
+    {
+      "epoch": 0.7638888888888888,
+      "grad_norm": 0.4494944214820862,
+      "learning_rate": 3.2209161399249674e-05,
+      "loss": 0.981,
+      "step": 165
+    },
+    {
+      "epoch": 0.7870370370370371,
+      "grad_norm": 0.4661237597465515,
+      "learning_rate": 2.6485360629279987e-05,
+      "loss": 0.988,
+      "step": 170
+    },
+    {
+      "epoch": 0.8101851851851852,
+      "grad_norm": 0.4337325394153595,
+      "learning_rate": 2.1243255642254578e-05,
+      "loss": 0.9888,
+      "step": 175
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 0.4312609136104584,
+      "learning_rate": 1.65171946970729e-05,
+      "loss": 0.9938,
+      "step": 180
+    },
+    {
+      "epoch": 0.8564814814814815,
+      "grad_norm": 0.41870856285095215,
+      "learning_rate": 1.233814473646524e-05,
+      "loss": 0.9948,
+      "step": 185
+    },
+    {
+      "epoch": 0.8796296296296297,
+      "grad_norm": 0.47287535667419434,
+      "learning_rate": 8.733488479845997e-06,
+      "loss": 0.9905,
+      "step": 190
+    },
+    {
+      "epoch": 0.9027777777777778,
+      "grad_norm": 0.42414429783821106,
+      "learning_rate": 5.726845001356573e-06,
+      "loss": 0.9834,
+      "step": 195
+    },
+    {
+      "epoch": 0.9259259259259259,
+      "grad_norm": 0.4528570771217346,
+      "learning_rate": 3.3379149687388867e-06,
+      "loss": 0.9822,
+      "step": 200
+    },
+    {
+      "epoch": 0.9490740740740741,
+      "grad_norm": 0.4307001233100891,
+      "learning_rate": 1.5823515570925763e-06,
+      "loss": 0.9802,
+      "step": 205
+    },
+    {
+      "epoch": 0.9722222222222222,
+      "grad_norm": 0.42982199788093567,
+      "learning_rate": 4.7165788333860536e-07,
+      "loss": 0.9846,
+      "step": 210
+    },
+    {
+      "epoch": 0.9953703703703703,
+      "grad_norm": 0.4325573146343231,
+      "learning_rate": 1.3111633436779791e-08,
+      "loss": 0.982,
+      "step": 215
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.4864426851272583,
+      "eval_runtime": 0.5986,
+      "eval_samples_per_second": 18.377,
+      "eval_steps_per_second": 1.671,
+      "step": 216
     },
     {
+      "epoch": 1.0,
+      "step": 216,
+      "total_flos": 9.067486658407956e+17,
+      "train_loss": 1.0541787544886272,
+      "train_runtime": 774.5406,
+      "train_samples_per_second": 53.491,
+      "train_steps_per_second": 0.279
     }
   ],
   "logging_steps": 5,
+  "max_steps": 216,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 100,
       "attributes": {}
     }
   },
+  "total_flos": 9.067486658407956e+17,
   "train_batch_size": 12,
   "trial_name": null,
   "trial_params": null