Training in progress, epoch 0

Browse files

Files changed (9) hide show

README.md +82 -0
adapter_config.json +34 -0
adapter_model.safetensors +3 -0
all_results.json +16 -0
eval_results.json +11 -0
preprocessor_config.json +22 -0
train_results.json +8 -0
trainer_state.json +1571 -0
training_args.bin +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,82 @@

+---
+license: apache-2.0
+library_name: peft
+tags:
+- generated_from_trainer
+datasets:
+- medmnist-v2
+metrics:
+- accuracy
+- precision
+- recall
+- f1
+base_model: google/vit-base-patch16-224-in21k
+model-index:
+- name: organc-beit-base-finetuned
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# organc-beit-base-finetuned
+This model is a LoRA adapter (trained with PEFT) for [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k), fine-tuned on the medmnist-v2 dataset.
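+It achieves the following results in the final evaluation run performed after training. Note that these figures differ from the last row of the training results table below: the logged runtime and throughput imply about 8,268 samples for this final run versus about 2,392 samples for each per-epoch evaluation, so it was presumably computed on a different, larger split (likely the test split; to be confirmed):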
+- Loss: 0.2607
+- Accuracy: 0.9128
+- Precision: 0.9094
+- Recall: 0.8976
+- F1: 0.9019
+## Model description
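+A LoRA adapter for the `ViTForImageClassification` base model `google/vit-base-patch16-224-in21k`. LoRA (rank 16, alpha 16, dropout 0.1, bias `none`) is applied to the attention `query` and `value` modules, and the `classifier` head is listed in `modules_to_save`, so it is stored alongside the adapter weights. More information needed on the intended task and label set.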
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
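+Images are preprocessed with `ViTImageProcessor`: resized to 224x224, rescaled by 1/255, and normalized with a per-channel mean of 0.5 and standard deviation of 0.5. The logged throughput implies roughly 13,000 training samples per epoch and about 2,392 samples in each per-epoch evaluation. The model name suggests the OrganC subset of MedMNIST v2, but the exact subset and split definitions are not recorded here; more information needed.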
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.005
+- train_batch_size: 16
+- eval_batch_size: 16
+- seed: 42
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 64
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: linear
+- num_epochs: 10
+- mixed_precision_training: Native AMP
+### Training results
+| Training Loss | Epoch | Step | Validation Loss | Accuracy | Precision | Recall | F1     |
+|:-------------:|:-----:|:----:|:---------------:|:--------:|:---------:|:------:|:------:|
+| 0.7007        | 1.0   | 203  | 0.2457          | 0.9114   | 0.9019    | 0.8644 | 0.8529 |
+| 0.6322        | 2.0   | 406  | 0.2148          | 0.9423   | 0.9424    | 0.9292 | 0.9340 |
+| 0.6353        | 3.0   | 609  | 0.1218          | 0.9632   | 0.9546    | 0.9542 | 0.9529 |
+| 0.6176        | 4.0   | 813  | 0.0839          | 0.9799   | 0.9775    | 0.9793 | 0.9782 |
+| 0.4913        | 5.0   | 1016 | 0.1008          | 0.9712   | 0.9713    | 0.9717 | 0.9707 |
+| 0.4943        | 6.0   | 1219 | 0.0805          | 0.9799   | 0.9843    | 0.9820 | 0.9828 |
+| 0.426         | 7.0   | 1422 | 0.0671          | 0.9799   | 0.9782    | 0.9787 | 0.9781 |
+| 0.4152        | 8.0   | 1626 | 0.0566          | 0.9870   | 0.9862    | 0.9896 | 0.9878 |
+| 0.2927        | 9.0   | 1829 | 0.0658          | 0.9837   | 0.9850    | 0.9850 | 0.9848 |
+| 0.3154        | 9.99  | 2030 | 0.0537          | 0.9841   | 0.9855    | 0.9854 | 0.9853 |
+### Framework versions
+- PEFT 0.10.0
+- Transformers 4.38.2
+- Pytorch 2.2.1+cu121
+- Datasets 2.18.0
+- Tokenizers 0.15.2

adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": {
+    "base_model_class": "ViTForImageClassification",
+    "parent_library": "transformers.models.vit.modeling_vit"
+  },
+  "base_model_name_or_path": "google/vit-base-patch16-224-in21k",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": [
+    "classifier"
+  ],
+  "peft_type": "LORA",
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "value",
+    "query"
+  ],
+  "task_type": null,
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7263a4d313c3148cd1e6b8fadb91f659f4d278a890ebd78dfb535b6f740cc665
+size 2400284

all_results.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+    "epoch": 9.99,
+    "eval_accuracy": 0.9127963231736816,
+    "eval_f1": 0.9018775351313549,
+    "eval_loss": 0.26074379682540894,
+    "eval_precision": 0.9094064911689247,
+    "eval_recall": 0.8975649435800629,
+    "eval_runtime": 44.5141,
+    "eval_samples_per_second": 185.739,
+    "eval_steps_per_second": 11.614,
+    "total_flos": 1.0133154899356189e+19,
+    "train_loss": 0.5219255947714369,
+    "train_runtime": 1565.9689,
+    "train_samples_per_second": 83.016,
+    "train_steps_per_second": 1.296
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+    "epoch": 9.99,
+    "eval_accuracy": 0.9127963231736816,
+    "eval_f1": 0.9018775351313549,
+    "eval_loss": 0.26074379682540894,
+    "eval_precision": 0.9094064911689247,
+    "eval_recall": 0.8975649435800629,
+    "eval_runtime": 44.5141,
+    "eval_samples_per_second": 185.739,
+    "eval_steps_per_second": 11.614
+}

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "ViTImageProcessor",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 224,
+    "width": 224
+  }
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 9.99,
+    "total_flos": 1.0133154899356189e+19,
+    "train_loss": 0.5219255947714369,
+    "train_runtime": 1565.9689,
+    "train_samples_per_second": 83.016,
+    "train_steps_per_second": 1.296
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1571 @@

+{
+  "best_metric": 0.9870401337792643,
+  "best_model_checkpoint": "vit-base-patch16-224-in21k-finetuned-lora-medmnistv2/checkpoint-1626",
+  "epoch": 9.98769987699877,
+  "eval_steps": 500,
+  "global_step": 2030,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.05,
+      "grad_norm": 1.1422470808029175,
+      "learning_rate": 0.004975369458128079,
+      "loss": 1.9628,
+      "step": 10
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 1.255900502204895,
+      "learning_rate": 0.004950738916256157,
+      "loss": 1.3552,
+      "step": 20
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 1.5407381057739258,
+      "learning_rate": 0.00493103448275862,
+      "loss": 1.0858,
+      "step": 30
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.6224355697631836,
+      "learning_rate": 0.0049064039408866994,
+      "loss": 1.0843,
+      "step": 40
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.8357070088386536,
+      "learning_rate": 0.0048817733990147785,
+      "loss": 0.96,
+      "step": 50
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.080548644065857,
+      "learning_rate": 0.004857142857142858,
+      "loss": 0.8978,
+      "step": 60
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.4522780179977417,
+      "learning_rate": 0.004832512315270936,
+      "loss": 0.8698,
+      "step": 70
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.9388962388038635,
+      "learning_rate": 0.004807881773399015,
+      "loss": 0.9796,
+      "step": 80
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.6392161846160889,
+      "learning_rate": 0.004783251231527094,
+      "loss": 0.814,
+      "step": 90
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.7927560210227966,
+      "learning_rate": 0.004758620689655172,
+      "loss": 0.8127,
+      "step": 100
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.9725190997123718,
+      "learning_rate": 0.004733990147783251,
+      "loss": 0.7416,
+      "step": 110
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.2956442832946777,
+      "learning_rate": 0.00470935960591133,
+      "loss": 0.7281,
+      "step": 120
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.2394323348999023,
+      "learning_rate": 0.0046847290640394095,
+      "loss": 0.7632,
+      "step": 130
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.1683493852615356,
+      "learning_rate": 0.004660098522167488,
+      "loss": 0.7589,
+      "step": 140
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.8499715328216553,
+      "learning_rate": 0.004635467980295567,
+      "loss": 0.6864,
+      "step": 150
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.9673293232917786,
+      "learning_rate": 0.004610837438423646,
+      "loss": 0.6827,
+      "step": 160
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.7566954493522644,
+      "learning_rate": 0.0045886699507389165,
+      "loss": 0.667,
+      "step": 170
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.0029590129852295,
+      "learning_rate": 0.004564039408866995,
+      "loss": 0.7118,
+      "step": 180
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.9083508849143982,
+      "learning_rate": 0.004539408866995074,
+      "loss": 0.6388,
+      "step": 190
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.9889862537384033,
+      "learning_rate": 0.004514778325123153,
+      "loss": 0.7007,
+      "step": 200
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.9113712374581939,
+      "eval_f1": 0.8529028766456601,
+      "eval_loss": 0.2457016110420227,
+      "eval_precision": 0.9018735870596898,
+      "eval_recall": 0.8643919959588723,
+      "eval_runtime": 12.8411,
+      "eval_samples_per_second": 186.276,
+      "eval_steps_per_second": 11.681,
+      "step": 203
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.352356195449829,
+      "learning_rate": 0.004490147783251232,
+      "loss": 0.7043,
+      "step": 210
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.9853256940841675,
+      "learning_rate": 0.00446551724137931,
+      "loss": 0.5867,
+      "step": 220
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 1.2557787895202637,
+      "learning_rate": 0.004440886699507389,
+      "loss": 0.6101,
+      "step": 230
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.9615167379379272,
+      "learning_rate": 0.004416256157635468,
+      "loss": 0.7143,
+      "step": 240
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.9772002696990967,
+      "learning_rate": 0.004391625615763547,
+      "loss": 0.6388,
+      "step": 250
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.6855661869049072,
+      "learning_rate": 0.004366995073891626,
+      "loss": 0.5621,
+      "step": 260
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 2.439969062805176,
+      "learning_rate": 0.004342364532019705,
+      "loss": 0.6839,
+      "step": 270
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 1.0421786308288574,
+      "learning_rate": 0.004317733990147784,
+      "loss": 0.6545,
+      "step": 280
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.445427417755127,
+      "learning_rate": 0.004293103448275862,
+      "loss": 0.6383,
+      "step": 290
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 1.585347056388855,
+      "learning_rate": 0.00426847290640394,
+      "loss": 0.6446,
+      "step": 300
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 1.3732099533081055,
+      "learning_rate": 0.004243842364532019,
+      "loss": 0.5853,
+      "step": 310
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 1.9869935512542725,
+      "learning_rate": 0.0042192118226600985,
+      "loss": 0.6442,
+      "step": 320
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 2.5270144939422607,
+      "learning_rate": 0.004194581280788178,
+      "loss": 0.6744,
+      "step": 330
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 1.2350406646728516,
+      "learning_rate": 0.004169950738916256,
+      "loss": 0.711,
+      "step": 340
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 2.2132787704467773,
+      "learning_rate": 0.004145320197044335,
+      "loss": 0.7377,
+      "step": 350
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 1.414409875869751,
+      "learning_rate": 0.004120689655172414,
+      "loss": 0.6735,
+      "step": 360
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 1.505163311958313,
+      "learning_rate": 0.004096059113300492,
+      "loss": 0.6941,
+      "step": 370
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 1.286877155303955,
+      "learning_rate": 0.004071428571428571,
+      "loss": 0.5817,
+      "step": 380
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 2.4780466556549072,
+      "learning_rate": 0.00404679802955665,
+      "loss": 0.6474,
+      "step": 390
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 1.735863208770752,
+      "learning_rate": 0.0040221674876847295,
+      "loss": 0.6322,
+      "step": 400
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.9423076923076923,
+      "eval_f1": 0.9340154565022668,
+      "eval_loss": 0.21482966840267181,
+      "eval_precision": 0.9423994421185976,
+      "eval_recall": 0.9291886861477701,
+      "eval_runtime": 12.9083,
+      "eval_samples_per_second": 185.306,
+      "eval_steps_per_second": 11.62,
+      "step": 406
+    },
+    {
+      "epoch": 2.02,
+      "grad_norm": 1.8992809057235718,
+      "learning_rate": 0.003997536945812808,
+      "loss": 0.7819,
+      "step": 410
+    },
+    {
+      "epoch": 2.07,
+      "grad_norm": 3.8816006183624268,
+      "learning_rate": 0.003972906403940887,
+      "loss": 0.7579,
+      "step": 420
+    },
+    {
+      "epoch": 2.12,
+      "grad_norm": 2.3438549041748047,
+      "learning_rate": 0.003948275862068966,
+      "loss": 0.6899,
+      "step": 430
+    },
+    {
+      "epoch": 2.16,
+      "grad_norm": 2.111189126968384,
+      "learning_rate": 0.003923645320197044,
+      "loss": 0.6711,
+      "step": 440
+    },
+    {
+      "epoch": 2.21,
+      "grad_norm": 2.7600784301757812,
+      "learning_rate": 0.0038990147783251232,
+      "loss": 0.6031,
+      "step": 450
+    },
+    {
+      "epoch": 2.26,
+      "grad_norm": 1.7545028924942017,
+      "learning_rate": 0.0038743842364532023,
+      "loss": 0.6801,
+      "step": 460
+    },
+    {
+      "epoch": 2.31,
+      "grad_norm": 3.5373642444610596,
+      "learning_rate": 0.003849753694581281,
+      "loss": 0.6683,
+      "step": 470
+    },
+    {
+      "epoch": 2.36,
+      "grad_norm": 2.0872020721435547,
+      "learning_rate": 0.00382512315270936,
+      "loss": 0.5974,
+      "step": 480
+    },
+    {
+      "epoch": 2.41,
+      "grad_norm": 2.178804636001587,
+      "learning_rate": 0.0038004926108374383,
+      "loss": 0.5688,
+      "step": 490
+    },
+    {
+      "epoch": 2.46,
+      "grad_norm": 2.1402218341827393,
+      "learning_rate": 0.003775862068965517,
+      "loss": 0.6617,
+      "step": 500
+    },
+    {
+      "epoch": 2.51,
+      "grad_norm": 3.2831871509552,
+      "learning_rate": 0.003751231527093596,
+      "loss": 0.6706,
+      "step": 510
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 1.9515079259872437,
+      "learning_rate": 0.0037266009852216747,
+      "loss": 0.6616,
+      "step": 520
+    },
+    {
+      "epoch": 2.61,
+      "grad_norm": 1.7052913904190063,
+      "learning_rate": 0.003701970443349754,
+      "loss": 0.6816,
+      "step": 530
+    },
+    {
+      "epoch": 2.66,
+      "grad_norm": 1.4746874570846558,
+      "learning_rate": 0.0036773399014778324,
+      "loss": 0.613,
+      "step": 540
+    },
+    {
+      "epoch": 2.71,
+      "grad_norm": 1.1124660968780518,
+      "learning_rate": 0.0036527093596059115,
+      "loss": 0.6295,
+      "step": 550
+    },
+    {
+      "epoch": 2.76,
+      "grad_norm": 1.4814788103103638,
+      "learning_rate": 0.00362807881773399,
+      "loss": 0.618,
+      "step": 560
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 1.1870466470718384,
+      "learning_rate": 0.003603448275862069,
+      "loss": 0.6529,
+      "step": 570
+    },
+    {
+      "epoch": 2.85,
+      "grad_norm": 1.1089264154434204,
+      "learning_rate": 0.003578817733990148,
+      "loss": 0.5042,
+      "step": 580
+    },
+    {
+      "epoch": 2.9,
+      "grad_norm": 2.6037137508392334,
+      "learning_rate": 0.0035541871921182266,
+      "loss": 0.7198,
+      "step": 590
+    },
+    {
+      "epoch": 2.95,
+      "grad_norm": 1.4666210412979126,
+      "learning_rate": 0.0035295566502463057,
+      "loss": 0.6353,
+      "step": 600
+    },
+    {
+      "epoch": 3.0,
+      "eval_accuracy": 0.9632107023411371,
+      "eval_f1": 0.9529335555710525,
+      "eval_loss": 0.121844083070755,
+      "eval_precision": 0.9546285621864314,
+      "eval_recall": 0.9542450303236223,
+      "eval_runtime": 12.9205,
+      "eval_samples_per_second": 185.132,
+      "eval_steps_per_second": 11.609,
+      "step": 609
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 1.863990068435669,
+      "learning_rate": 0.0035049261083743843,
+      "loss": 0.6389,
+      "step": 610
+    },
+    {
+      "epoch": 3.05,
+      "grad_norm": 1.3299729824066162,
+      "learning_rate": 0.0034802955665024634,
+      "loss": 0.6781,
+      "step": 620
+    },
+    {
+      "epoch": 3.1,
+      "grad_norm": 2.5126378536224365,
+      "learning_rate": 0.003455665024630542,
+      "loss": 0.6492,
+      "step": 630
+    },
+    {
+      "epoch": 3.15,
+      "grad_norm": 1.380492925643921,
+      "learning_rate": 0.0034310344827586207,
+      "loss": 0.631,
+      "step": 640
+    },
+    {
+      "epoch": 3.2,
+      "grad_norm": 2.03764009475708,
+      "learning_rate": 0.0034064039408867,
+      "loss": 0.6221,
+      "step": 650
+    },
+    {
+      "epoch": 3.25,
+      "grad_norm": 1.1895209550857544,
+      "learning_rate": 0.0033817733990147785,
+      "loss": 0.5881,
+      "step": 660
+    },
+    {
+      "epoch": 3.3,
+      "grad_norm": 1.354785442352295,
+      "learning_rate": 0.003357142857142857,
+      "loss": 0.584,
+      "step": 670
+    },
+    {
+      "epoch": 3.35,
+      "grad_norm": 1.9887776374816895,
+      "learning_rate": 0.003332512315270936,
+      "loss": 0.5914,
+      "step": 680
+    },
+    {
+      "epoch": 3.39,
+      "grad_norm": 1.7261571884155273,
+      "learning_rate": 0.0033078817733990145,
+      "loss": 0.5608,
+      "step": 690
+    },
+    {
+      "epoch": 3.44,
+      "grad_norm": 1.3888462781906128,
+      "learning_rate": 0.0032832512315270936,
+      "loss": 0.5832,
+      "step": 700
+    },
+    {
+      "epoch": 3.49,
+      "grad_norm": 1.6422044038772583,
+      "learning_rate": 0.003258620689655172,
+      "loss": 0.5759,
+      "step": 710
+    },
+    {
+      "epoch": 3.54,
+      "grad_norm": 1.2814769744873047,
+      "learning_rate": 0.0032339901477832513,
+      "loss": 0.5845,
+      "step": 720
+    },
+    {
+      "epoch": 3.59,
+      "grad_norm": 1.835681676864624,
+      "learning_rate": 0.00320935960591133,
+      "loss": 0.5756,
+      "step": 730
+    },
+    {
+      "epoch": 3.64,
+      "grad_norm": 1.3922501802444458,
+      "learning_rate": 0.003184729064039409,
+      "loss": 0.5878,
+      "step": 740
+    },
+    {
+      "epoch": 3.69,
+      "grad_norm": 1.3808457851409912,
+      "learning_rate": 0.0031600985221674877,
+      "loss": 0.5593,
+      "step": 750
+    },
+    {
+      "epoch": 3.74,
+      "grad_norm": 1.3295152187347412,
+      "learning_rate": 0.0031379310344827587,
+      "loss": 0.5563,
+      "step": 760
+    },
+    {
+      "epoch": 3.79,
+      "grad_norm": 2.4613001346588135,
+      "learning_rate": 0.003113300492610838,
+      "loss": 0.5341,
+      "step": 770
+    },
+    {
+      "epoch": 3.84,
+      "grad_norm": 1.1632391214370728,
+      "learning_rate": 0.0030886699507389165,
+      "loss": 0.6108,
+      "step": 780
+    },
+    {
+      "epoch": 3.89,
+      "grad_norm": 1.0384527444839478,
+      "learning_rate": 0.0030640394088669956,
+      "loss": 0.5597,
+      "step": 790
+    },
+    {
+      "epoch": 3.94,
+      "grad_norm": 1.5166749954223633,
+      "learning_rate": 0.0030394088669950738,
+      "loss": 0.5153,
+      "step": 800
+    },
+    {
+      "epoch": 3.99,
+      "grad_norm": 1.5253658294677734,
+      "learning_rate": 0.0030147783251231524,
+      "loss": 0.6176,
+      "step": 810
+    },
+    {
+      "epoch": 4.0,
+      "eval_accuracy": 0.979933110367893,
+      "eval_f1": 0.9781756712736359,
+      "eval_loss": 0.08385530859231949,
+      "eval_precision": 0.9775461189264221,
+      "eval_recall": 0.979303313253159,
+      "eval_runtime": 12.9634,
+      "eval_samples_per_second": 184.519,
+      "eval_steps_per_second": 11.571,
+      "step": 813
+    },
+    {
+      "epoch": 4.03,
+      "grad_norm": 1.8829090595245361,
+      "learning_rate": 0.0029901477832512315,
+      "loss": 0.5208,
+      "step": 820
+    },
+    {
+      "epoch": 4.08,
+      "grad_norm": 1.4730515480041504,
+      "learning_rate": 0.00296551724137931,
+      "loss": 0.5205,
+      "step": 830
+    },
+    {
+      "epoch": 4.13,
+      "grad_norm": 0.6617820858955383,
+      "learning_rate": 0.0029408866995073893,
+      "loss": 0.5029,
+      "step": 840
+    },
+    {
+      "epoch": 4.18,
+      "grad_norm": 0.9558489918708801,
+      "learning_rate": 0.002916256157635468,
+      "loss": 0.533,
+      "step": 850
+    },
+    {
+      "epoch": 4.23,
+      "grad_norm": 2.0359411239624023,
+      "learning_rate": 0.0028916256157635466,
+      "loss": 0.4633,
+      "step": 860
+    },
+    {
+      "epoch": 4.28,
+      "grad_norm": 2.325270414352417,
+      "learning_rate": 0.0028669950738916257,
+      "loss": 0.5877,
+      "step": 870
+    },
+    {
+      "epoch": 4.33,
+      "grad_norm": 1.3358855247497559,
+      "learning_rate": 0.0028423645320197043,
+      "loss": 0.4447,
+      "step": 880
+    },
+    {
+      "epoch": 4.38,
+      "grad_norm": 1.4927520751953125,
+      "learning_rate": 0.0028177339901477834,
+      "loss": 0.5783,
+      "step": 890
+    },
+    {
+      "epoch": 4.43,
+      "grad_norm": 1.231078028678894,
+      "learning_rate": 0.002793103448275862,
+      "loss": 0.5606,
+      "step": 900
+    },
+    {
+      "epoch": 4.48,
+      "grad_norm": 1.4861023426055908,
+      "learning_rate": 0.002768472906403941,
+      "loss": 0.5006,
+      "step": 910
+    },
+    {
+      "epoch": 4.53,
+      "grad_norm": 1.0326859951019287,
+      "learning_rate": 0.00274384236453202,
+      "loss": 0.4989,
+      "step": 920
+    },
+    {
+      "epoch": 4.58,
+      "grad_norm": 1.290980577468872,
+      "learning_rate": 0.0027192118226600985,
+      "loss": 0.514,
+      "step": 930
+    },
+    {
+      "epoch": 4.62,
+      "grad_norm": 1.3768541812896729,
+      "learning_rate": 0.0026945812807881776,
+      "loss": 0.5221,
+      "step": 940
+    },
+    {
+      "epoch": 4.67,
+      "grad_norm": 4.485782623291016,
+      "learning_rate": 0.0026699507389162562,
+      "loss": 0.4992,
+      "step": 950
+    },
+    {
+      "epoch": 4.72,
+      "grad_norm": 1.4199550151824951,
+      "learning_rate": 0.0026453201970443353,
+      "loss": 0.5256,
+      "step": 960
+    },
+    {
+      "epoch": 4.77,
+      "grad_norm": 1.4900827407836914,
+      "learning_rate": 0.002620689655172414,
+      "loss": 0.5111,
+      "step": 970
+    },
+    {
+      "epoch": 4.82,
+      "grad_norm": 1.874714970588684,
+      "learning_rate": 0.002596059113300493,
+      "loss": 0.4774,
+      "step": 980
+    },
+    {
+      "epoch": 4.87,
+      "grad_norm": 1.254228115081787,
+      "learning_rate": 0.0025714285714285713,
+      "loss": 0.4789,
+      "step": 990
+    },
+    {
+      "epoch": 4.92,
+      "grad_norm": 2.317281723022461,
+      "learning_rate": 0.00254679802955665,
+      "loss": 0.5338,
+      "step": 1000
+    },
+    {
+      "epoch": 4.97,
+      "grad_norm": 1.482914686203003,
+      "learning_rate": 0.002522167487684729,
+      "loss": 0.4913,
+      "step": 1010
+    },
+    {
+      "epoch": 5.0,
+      "eval_accuracy": 0.9711538461538461,
+      "eval_f1": 0.9707062805947578,
+      "eval_loss": 0.10083355009555817,
+      "eval_precision": 0.9712614809682641,
+      "eval_recall": 0.9717429310005482,
+      "eval_runtime": 12.9897,
+      "eval_samples_per_second": 184.146,
+      "eval_steps_per_second": 11.548,
+      "step": 1016
+    },
+    {
+      "epoch": 5.02,
+      "grad_norm": 1.070559024810791,
+      "learning_rate": 0.002497536945812808,
+      "loss": 0.4719,
+      "step": 1020
+    },
+    {
+      "epoch": 5.07,
+      "grad_norm": 1.206444263458252,
+      "learning_rate": 0.002472906403940887,
+      "loss": 0.4973,
+      "step": 1030
+    },
+    {
+      "epoch": 5.12,
+      "grad_norm": 1.2651773691177368,
+      "learning_rate": 0.0024482758620689654,
+      "loss": 0.5203,
+      "step": 1040
+    },
+    {
+      "epoch": 5.17,
+      "grad_norm": 1.162040114402771,
+      "learning_rate": 0.002423645320197044,
+      "loss": 0.4781,
+      "step": 1050
+    },
+    {
+      "epoch": 5.22,
+      "grad_norm": 1.2628921270370483,
+      "learning_rate": 0.002399014778325123,
+      "loss": 0.558,
+      "step": 1060
+    },
+    {
+      "epoch": 5.26,
+      "grad_norm": 0.6029661297798157,
+      "learning_rate": 0.002374384236453202,
+      "loss": 0.4991,
+      "step": 1070
+    },
+    {
+      "epoch": 5.31,
+      "grad_norm": 1.7291909456253052,
+      "learning_rate": 0.002349753694581281,
+      "loss": 0.5331,
+      "step": 1080
+    },
+    {
+      "epoch": 5.36,
+      "grad_norm": 0.8556851148605347,
+      "learning_rate": 0.0023251231527093596,
+      "loss": 0.4594,
+      "step": 1090
+    },
+    {
+      "epoch": 5.41,
+      "grad_norm": 0.9938213229179382,
+      "learning_rate": 0.0023004926108374387,
+      "loss": 0.4841,
+      "step": 1100
+    },
+    {
+      "epoch": 5.46,
+      "grad_norm": 2.558023452758789,
+      "learning_rate": 0.0022758620689655173,
+      "loss": 0.5241,
+      "step": 1110
+    },
+    {
+      "epoch": 5.51,
+      "grad_norm": 1.3117694854736328,
+      "learning_rate": 0.0022512315270935964,
+      "loss": 0.4821,
+      "step": 1120
+    },
+    {
+      "epoch": 5.56,
+      "grad_norm": 1.2305413484573364,
+      "learning_rate": 0.0022266009852216747,
+      "loss": 0.4234,
+      "step": 1130
+    },
+    {
+      "epoch": 5.61,
+      "grad_norm": 1.0140172243118286,
+      "learning_rate": 0.0022019704433497538,
+      "loss": 0.4008,
+      "step": 1140
+    },
+    {
+      "epoch": 5.66,
+      "grad_norm": 1.2372822761535645,
+      "learning_rate": 0.0021773399014778324,
+      "loss": 0.4507,
+      "step": 1150
+    },
+    {
+      "epoch": 5.71,
+      "grad_norm": 0.8806868195533752,
+      "learning_rate": 0.0021527093596059115,
+      "loss": 0.5398,
+      "step": 1160
+    },
+    {
+      "epoch": 5.76,
+      "grad_norm": 0.9182419180870056,
+      "learning_rate": 0.00212807881773399,
+      "loss": 0.4085,
+      "step": 1170
+    },
+    {
+      "epoch": 5.81,
+      "grad_norm": 1.4466331005096436,
+      "learning_rate": 0.0021034482758620692,
+      "loss": 0.4966,
+      "step": 1180
+    },
+    {
+      "epoch": 5.85,
+      "grad_norm": 1.1195909976959229,
+      "learning_rate": 0.002078817733990148,
+      "loss": 0.4349,
+      "step": 1190
+    },
+    {
+      "epoch": 5.9,
+      "grad_norm": 1.7363073825836182,
+      "learning_rate": 0.0020541871921182266,
+      "loss": 0.4528,
+      "step": 1200
+    },
+    {
+      "epoch": 5.95,
+      "grad_norm": 1.3727185726165771,
+      "learning_rate": 0.0020295566502463057,
+      "loss": 0.4943,
+      "step": 1210
+    },
+    {
+      "epoch": 6.0,
+      "eval_accuracy": 0.979933110367893,
+      "eval_f1": 0.9828335162681155,
+      "eval_loss": 0.08049997687339783,
+      "eval_precision": 0.9843107832686134,
+      "eval_recall": 0.9819606365532088,
+      "eval_runtime": 12.9302,
+      "eval_samples_per_second": 184.993,
+      "eval_steps_per_second": 11.601,
+      "step": 1219
+    },
+    {
+      "epoch": 6.0,
+      "grad_norm": 0.9201492667198181,
+      "learning_rate": 0.0020049261083743843,
+      "loss": 0.4137,
+      "step": 1220
+    },
+    {
+      "epoch": 6.05,
+      "grad_norm": 1.0448100566864014,
+      "learning_rate": 0.001980295566502463,
+      "loss": 0.4731,
+      "step": 1230
+    },
+    {
+      "epoch": 6.1,
+      "grad_norm": 7.53134298324585,
+      "learning_rate": 0.001955665024630542,
+      "loss": 0.4018,
+      "step": 1240
+    },
+    {
+      "epoch": 6.15,
+      "grad_norm": 1.0926949977874756,
+      "learning_rate": 0.0019310344827586207,
+      "loss": 0.4643,
+      "step": 1250
+    },
+    {
+      "epoch": 6.2,
+      "grad_norm": 1.1615904569625854,
+      "learning_rate": 0.0019064039408866996,
+      "loss": 0.4885,
+      "step": 1260
+    },
+    {
+      "epoch": 6.25,
+      "grad_norm": 1.0068614482879639,
+      "learning_rate": 0.0018817733990147785,
+      "loss": 0.4001,
+      "step": 1270
+    },
+    {
+      "epoch": 6.3,
+      "grad_norm": 1.080955982208252,
+      "learning_rate": 0.0018571428571428573,
+      "loss": 0.4136,
+      "step": 1280
+    },
+    {
+      "epoch": 6.35,
+      "grad_norm": 1.4813597202301025,
+      "learning_rate": 0.001832512315270936,
+      "loss": 0.3908,
+      "step": 1290
+    },
+    {
+      "epoch": 6.4,
+      "grad_norm": 0.8198271989822388,
+      "learning_rate": 0.0018078817733990149,
+      "loss": 0.4096,
+      "step": 1300
+    },
+    {
+      "epoch": 6.45,
+      "grad_norm": 0.9625017642974854,
+      "learning_rate": 0.0017832512315270935,
+      "loss": 0.408,
+      "step": 1310
+    },
+    {
+      "epoch": 6.49,
+      "grad_norm": 0.969749391078949,
+      "learning_rate": 0.0017586206896551724,
+      "loss": 0.4371,
+      "step": 1320
+    },
+    {
+      "epoch": 6.54,
+      "grad_norm": 1.013454794883728,
+      "learning_rate": 0.0017339901477832513,
+      "loss": 0.4635,
+      "step": 1330
+    },
+    {
+      "epoch": 6.59,
+      "grad_norm": 1.1573213338851929,
+      "learning_rate": 0.0017093596059113301,
+      "loss": 0.3503,
+      "step": 1340
+    },
+    {
+      "epoch": 6.64,
+      "grad_norm": 0.9733229875564575,
+      "learning_rate": 0.0016847290640394088,
+      "loss": 0.447,
+      "step": 1350
+    },
+    {
+      "epoch": 6.69,
+      "grad_norm": 0.6445940136909485,
+      "learning_rate": 0.0016600985221674877,
+      "loss": 0.4405,
+      "step": 1360
+    },
+    {
+      "epoch": 6.74,
+      "grad_norm": 1.0442399978637695,
+      "learning_rate": 0.0016354679802955666,
+      "loss": 0.3957,
+      "step": 1370
+    },
+    {
+      "epoch": 6.79,
+      "grad_norm": 1.03929603099823,
+      "learning_rate": 0.0016108374384236454,
+      "loss": 0.4509,
+      "step": 1380
+    },
+    {
+      "epoch": 6.84,
+      "grad_norm": 0.7856884598731995,
+      "learning_rate": 0.0015862068965517243,
+      "loss": 0.3689,
+      "step": 1390
+    },
+    {
+      "epoch": 6.89,
+      "grad_norm": 0.6504011750221252,
+      "learning_rate": 0.001561576354679803,
+      "loss": 0.4059,
+      "step": 1400
+    },
+    {
+      "epoch": 6.94,
+      "grad_norm": 1.0724025964736938,
+      "learning_rate": 0.0015369458128078816,
+      "loss": 0.4188,
+      "step": 1410
+    },
+    {
+      "epoch": 6.99,
+      "grad_norm": 0.8927863836288452,
+      "learning_rate": 0.0015123152709359605,
+      "loss": 0.426,
+      "step": 1420
+    },
+    {
+      "epoch": 7.0,
+      "eval_accuracy": 0.979933110367893,
+      "eval_f1": 0.9780623859702803,
+      "eval_loss": 0.06709808856248856,
+      "eval_precision": 0.9782146710296142,
+      "eval_recall": 0.9786844286441465,
+      "eval_runtime": 12.9674,
+      "eval_samples_per_second": 184.462,
+      "eval_steps_per_second": 11.567,
+      "step": 1422
+    },
+    {
+      "epoch": 7.04,
+      "grad_norm": 0.7571507096290588,
+      "learning_rate": 0.0014876847290640394,
+      "loss": 0.3513,
+      "step": 1430
+    },
+    {
+      "epoch": 7.08,
+      "grad_norm": 0.9653282761573792,
+      "learning_rate": 0.0014630541871921182,
+      "loss": 0.3661,
+      "step": 1440
+    },
+    {
+      "epoch": 7.13,
+      "grad_norm": 1.1139851808547974,
+      "learning_rate": 0.001438423645320197,
+      "loss": 0.3978,
+      "step": 1450
+    },
+    {
+      "epoch": 7.18,
+      "grad_norm": 1.7360243797302246,
+      "learning_rate": 0.001413793103448276,
+      "loss": 0.3504,
+      "step": 1460
+    },
+    {
+      "epoch": 7.23,
+      "grad_norm": 1.009930968284607,
+      "learning_rate": 0.0013891625615763549,
+      "loss": 0.3718,
+      "step": 1470
+    },
+    {
+      "epoch": 7.28,
+      "grad_norm": 1.2110596895217896,
+      "learning_rate": 0.0013645320197044337,
+      "loss": 0.4109,
+      "step": 1480
+    },
+    {
+      "epoch": 7.33,
+      "grad_norm": 1.1028022766113281,
+      "learning_rate": 0.0013399014778325122,
+      "loss": 0.4052,
+      "step": 1490
+    },
+    {
+      "epoch": 7.38,
+      "grad_norm": 1.0521297454833984,
+      "learning_rate": 0.001315270935960591,
+      "loss": 0.4152,
+      "step": 1500
+    },
+    {
+      "epoch": 7.43,
+      "grad_norm": 0.8621806502342224,
+      "learning_rate": 0.00129064039408867,
+      "loss": 0.3573,
+      "step": 1510
+    },
+    {
+      "epoch": 7.48,
+      "grad_norm": 1.0334779024124146,
+      "learning_rate": 0.0012660098522167488,
+      "loss": 0.3673,
+      "step": 1520
+    },
+    {
+      "epoch": 7.53,
+      "grad_norm": 0.7882916331291199,
+      "learning_rate": 0.0012413793103448277,
+      "loss": 0.3391,
+      "step": 1530
+    },
+    {
+      "epoch": 7.58,
+      "grad_norm": 1.2786757946014404,
+      "learning_rate": 0.0012167487684729065,
+      "loss": 0.3939,
+      "step": 1540
+    },
+    {
+      "epoch": 7.63,
+      "grad_norm": 1.4354298114776611,
+      "learning_rate": 0.0011921182266009852,
+      "loss": 0.4541,
+      "step": 1550
+    },
+    {
+      "epoch": 7.68,
+      "grad_norm": 1.0900424718856812,
+      "learning_rate": 0.001167487684729064,
+      "loss": 0.3183,
+      "step": 1560
+    },
+    {
+      "epoch": 7.72,
+      "grad_norm": 0.8424840569496155,
+      "learning_rate": 0.001142857142857143,
+      "loss": 0.3408,
+      "step": 1570
+    },
+    {
+      "epoch": 7.77,
+      "grad_norm": 0.6967119574546814,
+      "learning_rate": 0.0011182266009852216,
+      "loss": 0.4029,
+      "step": 1580
+    },
+    {
+      "epoch": 7.82,
+      "grad_norm": 0.7286412119865417,
+      "learning_rate": 0.0010935960591133005,
+      "loss": 0.3717,
+      "step": 1590
+    },
+    {
+      "epoch": 7.87,
+      "grad_norm": 0.9532930254936218,
+      "learning_rate": 0.0010689655172413793,
+      "loss": 0.3215,
+      "step": 1600
+    },
+    {
+      "epoch": 7.92,
+      "grad_norm": 1.0920195579528809,
+      "learning_rate": 0.001044334975369458,
+      "loss": 0.3885,
+      "step": 1610
+    },
+    {
+      "epoch": 7.97,
+      "grad_norm": 0.8902508020401001,
+      "learning_rate": 0.0010197044334975369,
+      "loss": 0.4152,
+      "step": 1620
+    },
+    {
+      "epoch": 8.0,
+      "eval_accuracy": 0.9870401337792643,
+      "eval_f1": 0.9877879659430536,
+      "eval_loss": 0.056644052267074585,
+      "eval_precision": 0.9862146260055851,
+      "eval_recall": 0.9896401486739793,
+      "eval_runtime": 12.9254,
+      "eval_samples_per_second": 185.062,
+      "eval_steps_per_second": 11.605,
+      "step": 1626
+    },
+    {
+      "epoch": 8.02,
+      "grad_norm": 0.998824954032898,
+      "learning_rate": 0.0009950738916256158,
+      "loss": 0.289,
+      "step": 1630
+    },
+    {
+      "epoch": 8.07,
+      "grad_norm": 0.6882240772247314,
+      "learning_rate": 0.0009704433497536946,
+      "loss": 0.3786,
+      "step": 1640
+    },
+    {
+      "epoch": 8.12,
+      "grad_norm": 0.8674384951591492,
+      "learning_rate": 0.0009458128078817735,
+      "loss": 0.4118,
+      "step": 1650
+    },
+    {
+      "epoch": 8.17,
+      "grad_norm": 1.109112024307251,
+      "learning_rate": 0.0009211822660098522,
+      "loss": 0.3298,
+      "step": 1660
+    },
+    {
+      "epoch": 8.22,
+      "grad_norm": 0.8515803217887878,
+      "learning_rate": 0.000896551724137931,
+      "loss": 0.2702,
+      "step": 1670
+    },
+    {
+      "epoch": 8.27,
+      "grad_norm": 1.003696322441101,
+      "learning_rate": 0.0008719211822660099,
+      "loss": 0.3509,
+      "step": 1680
+    },
+    {
+      "epoch": 8.31,
+      "grad_norm": 0.8540720343589783,
+      "learning_rate": 0.0008472906403940888,
+      "loss": 0.31,
+      "step": 1690
+    },
+    {
+      "epoch": 8.36,
+      "grad_norm": 1.6798268556594849,
+      "learning_rate": 0.0008226600985221674,
+      "loss": 0.3727,
+      "step": 1700
+    },
+    {
+      "epoch": 8.41,
+      "grad_norm": 0.8054636120796204,
+      "learning_rate": 0.0007980295566502463,
+      "loss": 0.3159,
+      "step": 1710
+    },
+    {
+      "epoch": 8.46,
+      "grad_norm": 1.4890103340148926,
+      "learning_rate": 0.0007733990147783252,
+      "loss": 0.3387,
+      "step": 1720
+    },
+    {
+      "epoch": 8.51,
+      "grad_norm": 0.7350850105285645,
+      "learning_rate": 0.000748768472906404,
+      "loss": 0.3237,
+      "step": 1730
+    },
+    {
+      "epoch": 8.56,
+      "grad_norm": 1.4844990968704224,
+      "learning_rate": 0.0007241379310344828,
+      "loss": 0.3395,
+      "step": 1740
+    },
+    {
+      "epoch": 8.61,
+      "grad_norm": 0.887062132358551,
+      "learning_rate": 0.0006995073891625616,
+      "loss": 0.3418,
+      "step": 1750
+    },
+    {
+      "epoch": 8.66,
+      "grad_norm": 0.7276543378829956,
+      "learning_rate": 0.0006748768472906404,
+      "loss": 0.3433,
+      "step": 1760
+    },
+    {
+      "epoch": 8.71,
+      "grad_norm": 0.6962388753890991,
+      "learning_rate": 0.0006502463054187192,
+      "loss": 0.3588,
+      "step": 1770
+    },
+    {
+      "epoch": 8.76,
+      "grad_norm": 1.764138102531433,
+      "learning_rate": 0.0006256157635467981,
+      "loss": 0.3565,
+      "step": 1780
+    },
+    {
+      "epoch": 8.81,
+      "grad_norm": 0.7240989208221436,
+      "learning_rate": 0.0006009852216748769,
+      "loss": 0.3361,
+      "step": 1790
+    },
+    {
+      "epoch": 8.86,
+      "grad_norm": 0.9319175481796265,
+      "learning_rate": 0.0005763546798029557,
+      "loss": 0.3313,
+      "step": 1800
+    },
+    {
+      "epoch": 8.91,
+      "grad_norm": 0.8015260100364685,
+      "learning_rate": 0.0005517241379310345,
+      "loss": 0.3776,
+      "step": 1810
+    },
+    {
+      "epoch": 8.95,
+      "grad_norm": 0.6193283200263977,
+      "learning_rate": 0.0005270935960591134,
+      "loss": 0.2927,
+      "step": 1820
+    },
+    {
+      "epoch": 9.0,
+      "eval_accuracy": 0.9836956521739131,
+      "eval_f1": 0.9847638806045893,
+      "eval_loss": 0.06579139083623886,
+      "eval_precision": 0.9850023004987147,
+      "eval_recall": 0.9849849759564541,
+      "eval_runtime": 12.7582,
+      "eval_samples_per_second": 187.488,
+      "eval_steps_per_second": 11.757,
+      "step": 1829
+    },
+    {
+      "epoch": 9.0,
+      "grad_norm": 0.6149892210960388,
+      "learning_rate": 0.0005024630541871921,
+      "loss": 0.3451,
+      "step": 1830
+    },
+    {
+      "epoch": 9.05,
+      "grad_norm": 0.7571848034858704,
+      "learning_rate": 0.00047783251231527096,
+      "loss": 0.3187,
+      "step": 1840
+    },
+    {
+      "epoch": 9.1,
+      "grad_norm": 0.8765355348587036,
+      "learning_rate": 0.00045320197044334973,
+      "loss": 0.3447,
+      "step": 1850
+    },
+    {
+      "epoch": 9.15,
+      "grad_norm": 0.5563300848007202,
+      "learning_rate": 0.0004285714285714286,
+      "loss": 0.3529,
+      "step": 1860
+    },
+    {
+      "epoch": 9.2,
+      "grad_norm": 0.7253230810165405,
+      "learning_rate": 0.0004039408866995074,
+      "loss": 0.2695,
+      "step": 1870
+    },
+    {
+      "epoch": 9.25,
+      "grad_norm": 0.6665922403335571,
+      "learning_rate": 0.0003793103448275862,
+      "loss": 0.3091,
+      "step": 1880
+    },
+    {
+      "epoch": 9.3,
+      "grad_norm": 1.43943452835083,
+      "learning_rate": 0.00035467980295566506,
+      "loss": 0.2942,
+      "step": 1890
+    },
+    {
+      "epoch": 9.35,
+      "grad_norm": 2.071869373321533,
+      "learning_rate": 0.00033004926108374383,
+      "loss": 0.3097,
+      "step": 1900
+    },
+    {
+      "epoch": 9.4,
+      "grad_norm": 0.6491404175758362,
+      "learning_rate": 0.0003054187192118227,
+      "loss": 0.2895,
+      "step": 1910
+    },
+    {
+      "epoch": 9.45,
+      "grad_norm": 0.9849537014961243,
+      "learning_rate": 0.0002807881773399015,
+      "loss": 0.3157,
+      "step": 1920
+    },
+    {
+      "epoch": 9.5,
+      "grad_norm": 8.456196784973145,
+      "learning_rate": 0.0002561576354679803,
+      "loss": 0.322,
+      "step": 1930
+    },
+    {
+      "epoch": 9.54,
+      "grad_norm": 0.5972003936767578,
+      "learning_rate": 0.0002315270935960591,
+      "loss": 0.2867,
+      "step": 1940
+    },
+    {
+      "epoch": 9.59,
+      "grad_norm": 0.42175132036209106,
+      "learning_rate": 0.00020689655172413793,
+      "loss": 0.2804,
+      "step": 1950
+    },
+    {
+      "epoch": 9.64,
+      "grad_norm": 1.2799464464187622,
+      "learning_rate": 0.00018226600985221675,
+      "loss": 0.2941,
+      "step": 1960
+    },
+    {
+      "epoch": 9.69,
+      "grad_norm": 0.8577640056610107,
+      "learning_rate": 0.00015763546798029557,
+      "loss": 0.3202,
+      "step": 1970
+    },
+    {
+      "epoch": 9.74,
+      "grad_norm": 0.9166315197944641,
+      "learning_rate": 0.00013300492610837438,
+      "loss": 0.3404,
+      "step": 1980
+    },
+    {
+      "epoch": 9.79,
+      "grad_norm": 1.3336009979248047,
+      "learning_rate": 0.0001083743842364532,
+      "loss": 0.3071,
+      "step": 1990
+    },
+    {
+      "epoch": 9.84,
+      "grad_norm": 0.7033586502075195,
+      "learning_rate": 8.374384236453201e-05,
+      "loss": 0.3037,
+      "step": 2000
+    },
+    {
+      "epoch": 9.89,
+      "grad_norm": 0.48131078481674194,
+      "learning_rate": 5.9113300492610844e-05,
+      "loss": 0.289,
+      "step": 2010
+    },
+    {
+      "epoch": 9.94,
+      "grad_norm": 0.8269424438476562,
+      "learning_rate": 3.4482758620689657e-05,
+      "loss": 0.3021,
+      "step": 2020
+    },
+    {
+      "epoch": 9.99,
+      "grad_norm": 1.1306647062301636,
+      "learning_rate": 9.852216748768473e-06,
+      "loss": 0.3154,
+      "step": 2030
+    },
+    {
+      "epoch": 9.99,
+      "eval_accuracy": 0.9841137123745819,
+      "eval_f1": 0.9852639526448546,
+      "eval_loss": 0.0536968968808651,
+      "eval_precision": 0.9855005493602992,
+      "eval_recall": 0.9854135689141749,
+      "eval_runtime": 12.8562,
+      "eval_samples_per_second": 186.058,
+      "eval_steps_per_second": 11.668,
+      "step": 2030
+    },
+    {
+      "epoch": 9.99,
+      "step": 2030,
+      "total_flos": 1.0133154899356189e+19,
+      "train_loss": 0.5219255947714369,
+      "train_runtime": 1565.9689,
+      "train_samples_per_second": 83.016,
+      "train_steps_per_second": 1.296
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 2030,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "total_flos": 1.0133154899356189e+19,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1c062acd83ee2ca580a9704ecd3d8efd31c27d20ee1585c2f9b475a09eeb40b3
+size 5112