Upload 12 files

Browse files

Files changed (13) hide show

.gitattributes +1 -0
README.md +202 -3
adapter_config.json +29 -0
adapter_model.safetensors +3 -0
config.json +37 -0
optimizer.pt +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
special_tokens_map.json +5 -0
tokenizer.json +3 -0
tokenizer_config.json +14 -0
trainer_state.json +1473 -0
training_args.bin +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,3 +1,202 @@
----
-license: cc-by-sa-4.0
----

+---
+base_model: meta-llama/Llama-3.2-1B
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
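+- **Model type:** LoRA adapter (PEFT; rank 16, alpha 32, applied to the `q_proj` and `v_proj` attention projections) for causal language modeling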
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** meta-llama/Llama-3.2-1B
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.2
+- Transformers 4.46.2

adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "meta-llama/Llama-3.2-1B",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:10537d2d22002792e1ac8e8538bbcd3a441994cbc815a0a8790470612ef85c63
+size 6824216

config.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "_attn_implementation_autoset": true,
+  "_name_or_path": "meta-llama/Llama-3.2-1B",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": 128001,
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "float16",
+  "transformers_version": "4.46.2",
+  "use_cache": true,
+  "vocab_size": 128256
+}

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:89d296f6d9344bd12e70f36a7751a7c872649d9e0cd5e806943410db1a1d8975
+size 13685562

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e3999dbe7fd81ee82657626bacf614e181a00f55ab3158e59b929b2ed6052a6a
+size 14308

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c8ef07665904841379c613e25046770311b3dc46250ff9db0dfe289e56bc1cde
+size 1064

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "pad_token": "<|end_of_text|>",
+  "eos_token": "<|end_of_text|>",
+  "unk_token": null
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "bos_token": "<|begin_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|end_of_text|>",
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 131072,
+  "vocab_file": null,
+  "name_or_path": "meta-llama/Llama-3.2-1B",
+  "padding_side": "right",
+  "pad_token": "<|end_of_text|>"
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1473 @@

+{
+  "best_metric": 2.931408643722534,
+  "best_model_checkpoint": "/content/drive/My Drive/Hugh Mann/Llama3.2-1B-SMS-All/checkpoint-1000",
+  "epoch": 0.8733307403085699,
+  "eval_steps": 50,
+  "global_step": 1684,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.005186049526772981,
+      "grad_norm": 4.40848970413208,
+      "learning_rate": 5.000000000000001e-07,
+      "loss": 5.4447,
+      "step": 10
+    },
+    {
+      "epoch": 0.010372099053545962,
+      "grad_norm": 3.9115631580352783,
+      "learning_rate": 1.0000000000000002e-06,
+      "loss": 5.2681,
+      "step": 20
+    },
+    {
+      "epoch": 0.015558148580318941,
+      "grad_norm": 4.100546836853027,
+      "learning_rate": 1.5e-06,
+      "loss": 5.3836,
+      "step": 30
+    },
+    {
+      "epoch": 0.020744198107091924,
+      "grad_norm": 3.005075693130493,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 5.2503,
+      "step": 40
+    },
+    {
+      "epoch": 0.025930247633864905,
+      "grad_norm": 4.4445929527282715,
+      "learning_rate": 2.5e-06,
+      "loss": 5.1054,
+      "step": 50
+    },
+    {
+      "epoch": 0.025930247633864905,
+      "eval_loss": 5.573875904083252,
+      "eval_runtime": 86.7128,
+      "eval_samples_per_second": 79.066,
+      "eval_steps_per_second": 9.883,
+      "step": 50
+    },
+    {
+      "epoch": 0.031116297160637883,
+      "grad_norm": 4.169810771942139,
+      "learning_rate": 3e-06,
+      "loss": 5.1836,
+      "step": 60
+    },
+    {
+      "epoch": 0.036302346687410864,
+      "grad_norm": 3.448535442352295,
+      "learning_rate": 3.5e-06,
+      "loss": 5.206,
+      "step": 70
+    },
+    {
+      "epoch": 0.04148839621418385,
+      "grad_norm": 4.423728942871094,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 5.1756,
+      "step": 80
+    },
+    {
+      "epoch": 0.046674445740956826,
+      "grad_norm": 2.7958264350891113,
+      "learning_rate": 4.5e-06,
+      "loss": 5.0787,
+      "step": 90
+    },
+    {
+      "epoch": 0.05186049526772981,
+      "grad_norm": 3.927069902420044,
+      "learning_rate": 5e-06,
+      "loss": 5.2152,
+      "step": 100
+    },
+    {
+      "epoch": 0.05186049526772981,
+      "eval_loss": 5.308503150939941,
+      "eval_runtime": 86.6743,
+      "eval_samples_per_second": 79.101,
+      "eval_steps_per_second": 9.888,
+      "step": 100
+    },
+    {
+      "epoch": 0.05704654479450279,
+      "grad_norm": 3.202113151550293,
+      "learning_rate": 5.500000000000001e-06,
+      "loss": 5.0395,
+      "step": 110
+    },
+    {
+      "epoch": 0.062232594321275765,
+      "grad_norm": 3.387860059738159,
+      "learning_rate": 6e-06,
+      "loss": 5.0002,
+      "step": 120
+    },
+    {
+      "epoch": 0.06741864384804874,
+      "grad_norm": 3.1187663078308105,
+      "learning_rate": 6.5000000000000004e-06,
+      "loss": 4.8854,
+      "step": 130
+    },
+    {
+      "epoch": 0.07260469337482173,
+      "grad_norm": 2.3404035568237305,
+      "learning_rate": 7e-06,
+      "loss": 4.7552,
+      "step": 140
+    },
+    {
+      "epoch": 0.07779074290159471,
+      "grad_norm": 1.9955620765686035,
+      "learning_rate": 7.500000000000001e-06,
+      "loss": 4.6971,
+      "step": 150
+    },
+    {
+      "epoch": 0.07779074290159471,
+      "eval_loss": 4.777082443237305,
+      "eval_runtime": 86.9581,
+      "eval_samples_per_second": 78.843,
+      "eval_steps_per_second": 9.855,
+      "step": 150
+    },
+    {
+      "epoch": 0.0829767924283677,
+      "grad_norm": 2.9313905239105225,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 4.5428,
+      "step": 160
+    },
+    {
+      "epoch": 0.08816284195514067,
+      "grad_norm": 2.6419179439544678,
+      "learning_rate": 8.5e-06,
+      "loss": 4.4158,
+      "step": 170
+    },
+    {
+      "epoch": 0.09334889148191365,
+      "grad_norm": 3.8043394088745117,
+      "learning_rate": 9e-06,
+      "loss": 4.1911,
+      "step": 180
+    },
+    {
+      "epoch": 0.09853494100868664,
+      "grad_norm": 4.278996467590332,
+      "learning_rate": 9.5e-06,
+      "loss": 4.1579,
+      "step": 190
+    },
+    {
+      "epoch": 0.10372099053545962,
+      "grad_norm": 5.0697526931762695,
+      "learning_rate": 1e-05,
+      "loss": 3.7557,
+      "step": 200
+    },
+    {
+      "epoch": 0.10372099053545962,
+      "eval_loss": 3.8124358654022217,
+      "eval_runtime": 86.9124,
+      "eval_samples_per_second": 78.884,
+      "eval_steps_per_second": 9.861,
+      "step": 200
+    },
+    {
+      "epoch": 0.10890704006223259,
+      "grad_norm": 2.8836448192596436,
+      "learning_rate": 9.942129629629629e-06,
+      "loss": 3.628,
+      "step": 210
+    },
+    {
+      "epoch": 0.11409308958900558,
+      "grad_norm": 2.7785139083862305,
+      "learning_rate": 9.88425925925926e-06,
+      "loss": 3.523,
+      "step": 220
+    },
+    {
+      "epoch": 0.11927913911577856,
+      "grad_norm": 2.3677213191986084,
+      "learning_rate": 9.826388888888889e-06,
+      "loss": 3.3988,
+      "step": 230
+    },
+    {
+      "epoch": 0.12446518864255153,
+      "grad_norm": 1.9909400939941406,
+      "learning_rate": 9.768518518518519e-06,
+      "loss": 3.1866,
+      "step": 240
+    },
+    {
+      "epoch": 0.12965123816932453,
+      "grad_norm": 2.019188404083252,
+      "learning_rate": 9.710648148148149e-06,
+      "loss": 3.119,
+      "step": 250
+    },
+    {
+      "epoch": 0.12965123816932453,
+      "eval_loss": 3.190325975418091,
+      "eval_runtime": 86.8985,
+      "eval_samples_per_second": 78.897,
+      "eval_steps_per_second": 9.862,
+      "step": 250
+    },
+    {
+      "epoch": 0.13483728769609749,
+      "grad_norm": 1.8608777523040771,
+      "learning_rate": 9.652777777777779e-06,
+      "loss": 3.0325,
+      "step": 260
+    },
+    {
+      "epoch": 0.14002333722287047,
+      "grad_norm": 1.8644330501556396,
+      "learning_rate": 9.594907407407407e-06,
+      "loss": 3.0687,
+      "step": 270
+    },
+    {
+      "epoch": 0.14520938674964345,
+      "grad_norm": 1.6900441646575928,
+      "learning_rate": 9.537037037037037e-06,
+      "loss": 2.96,
+      "step": 280
+    },
+    {
+      "epoch": 0.15039543627641644,
+      "grad_norm": 1.8331371545791626,
+      "learning_rate": 9.479166666666667e-06,
+      "loss": 2.8971,
+      "step": 290
+    },
+    {
+      "epoch": 0.15558148580318942,
+      "grad_norm": 1.2244369983673096,
+      "learning_rate": 9.421296296296297e-06,
+      "loss": 2.941,
+      "step": 300
+    },
+    {
+      "epoch": 0.15558148580318942,
+      "eval_loss": 3.0352883338928223,
+      "eval_runtime": 86.8833,
+      "eval_samples_per_second": 78.91,
+      "eval_steps_per_second": 9.864,
+      "step": 300
+    },
+    {
+      "epoch": 0.1607675353299624,
+      "grad_norm": 1.599660873413086,
+      "learning_rate": 9.363425925925927e-06,
+      "loss": 2.9286,
+      "step": 310
+    },
+    {
+      "epoch": 0.1659535848567354,
+      "grad_norm": 1.4191596508026123,
+      "learning_rate": 9.305555555555557e-06,
+      "loss": 3.0023,
+      "step": 320
+    },
+    {
+      "epoch": 0.17113963438350835,
+      "grad_norm": 1.3025909662246704,
+      "learning_rate": 9.247685185185185e-06,
+      "loss": 2.9386,
+      "step": 330
+    },
+    {
+      "epoch": 0.17632568391028133,
+      "grad_norm": 1.919588327407837,
+      "learning_rate": 9.189814814814815e-06,
+      "loss": 2.8861,
+      "step": 340
+    },
+    {
+      "epoch": 0.18151173343705432,
+      "grad_norm": 1.416272521018982,
+      "learning_rate": 9.131944444444445e-06,
+      "loss": 3.0905,
+      "step": 350
+    },
+    {
+      "epoch": 0.18151173343705432,
+      "eval_loss": 3.011669635772705,
+      "eval_runtime": 86.7963,
+      "eval_samples_per_second": 78.99,
+      "eval_steps_per_second": 9.874,
+      "step": 350
+    },
+    {
+      "epoch": 0.1866977829638273,
+      "grad_norm": 2.392760753631592,
+      "learning_rate": 9.074074074074075e-06,
+      "loss": 2.899,
+      "step": 360
+    },
+    {
+      "epoch": 0.1918838324906003,
+      "grad_norm": 2.157073736190796,
+      "learning_rate": 9.016203703703704e-06,
+      "loss": 2.8805,
+      "step": 370
+    },
+    {
+      "epoch": 0.19706988201737327,
+      "grad_norm": 1.3468859195709229,
+      "learning_rate": 8.958333333333334e-06,
+      "loss": 2.9088,
+      "step": 380
+    },
+    {
+      "epoch": 0.20225593154414626,
+      "grad_norm": 1.31523597240448,
+      "learning_rate": 8.900462962962964e-06,
+      "loss": 2.9042,
+      "step": 390
+    },
+    {
+      "epoch": 0.20744198107091924,
+      "grad_norm": 1.5310451984405518,
+      "learning_rate": 8.842592592592594e-06,
+      "loss": 2.8714,
+      "step": 400
+    },
+    {
+      "epoch": 0.20744198107091924,
+      "eval_loss": 2.996051073074341,
+      "eval_runtime": 86.8896,
+      "eval_samples_per_second": 78.905,
+      "eval_steps_per_second": 9.863,
+      "step": 400
+    },
+    {
+      "epoch": 0.2126280305976922,
+      "grad_norm": 1.4996702671051025,
+      "learning_rate": 8.784722222222224e-06,
+      "loss": 2.8562,
+      "step": 410
+    },
+    {
+      "epoch": 0.21781408012446518,
+      "grad_norm": 1.5760114192962646,
+      "learning_rate": 8.726851851851854e-06,
+      "loss": 2.8765,
+      "step": 420
+    },
+    {
+      "epoch": 0.22300012965123817,
+      "grad_norm": 1.4508875608444214,
+      "learning_rate": 8.668981481481482e-06,
+      "loss": 3.0157,
+      "step": 430
+    },
+    {
+      "epoch": 0.22818617917801115,
+      "grad_norm": 1.8075891733169556,
+      "learning_rate": 8.611111111111112e-06,
+      "loss": 2.9959,
+      "step": 440
+    },
+    {
+      "epoch": 0.23337222870478413,
+      "grad_norm": 1.3157118558883667,
+      "learning_rate": 8.553240740740742e-06,
+      "loss": 2.8601,
+      "step": 450
+    },
+    {
+      "epoch": 0.23337222870478413,
+      "eval_loss": 2.985109806060791,
+      "eval_runtime": 86.8685,
+      "eval_samples_per_second": 78.924,
+      "eval_steps_per_second": 9.865,
+      "step": 450
+    },
+    {
+      "epoch": 0.23855827823155712,
+      "grad_norm": 1.4455087184906006,
+      "learning_rate": 8.495370370370372e-06,
+      "loss": 2.938,
+      "step": 460
+    },
+    {
+      "epoch": 0.2437443277583301,
+      "grad_norm": 1.647705078125,
+      "learning_rate": 8.4375e-06,
+      "loss": 2.9344,
+      "step": 470
+    },
+    {
+      "epoch": 0.24893037728510306,
+      "grad_norm": 1.3752045631408691,
+      "learning_rate": 8.37962962962963e-06,
+      "loss": 2.9085,
+      "step": 480
+    },
+    {
+      "epoch": 0.2541164268118761,
+      "grad_norm": 1.541107416152954,
+      "learning_rate": 8.32175925925926e-06,
+      "loss": 2.8821,
+      "step": 490
+    },
+    {
+      "epoch": 0.25930247633864906,
+      "grad_norm": 1.626123070716858,
+      "learning_rate": 8.263888888888888e-06,
+      "loss": 2.8736,
+      "step": 500
+    },
+    {
+      "epoch": 0.25930247633864906,
+      "eval_loss": 2.976685047149658,
+      "eval_runtime": 87.0158,
+      "eval_samples_per_second": 78.79,
+      "eval_steps_per_second": 9.849,
+      "step": 500
+    },
+    {
+      "epoch": 0.26448852586542204,
+      "grad_norm": 1.8355194330215454,
+      "learning_rate": 8.20601851851852e-06,
+      "loss": 2.9574,
+      "step": 510
+    },
+    {
+      "epoch": 0.26967457539219497,
+      "grad_norm": 1.9231911897659302,
+      "learning_rate": 8.148148148148148e-06,
+      "loss": 2.8982,
+      "step": 520
+    },
+    {
+      "epoch": 0.27486062491896796,
+      "grad_norm": 1.8790838718414307,
+      "learning_rate": 8.090277777777778e-06,
+      "loss": 2.9279,
+      "step": 530
+    },
+    {
+      "epoch": 0.28004667444574094,
+      "grad_norm": 1.870382308959961,
+      "learning_rate": 8.032407407407408e-06,
+      "loss": 2.868,
+      "step": 540
+    },
+    {
+      "epoch": 0.2852327239725139,
+      "grad_norm": 1.8041619062423706,
+      "learning_rate": 7.974537037037038e-06,
+      "loss": 2.9388,
+      "step": 550
+    },
+    {
+      "epoch": 0.2852327239725139,
+      "eval_loss": 2.969029664993286,
+      "eval_runtime": 87.0488,
+      "eval_samples_per_second": 78.76,
+      "eval_steps_per_second": 9.845,
+      "step": 550
+    },
+    {
+      "epoch": 0.2904187734992869,
+      "grad_norm": 1.3382441997528076,
+      "learning_rate": 7.916666666666667e-06,
+      "loss": 2.8644,
+      "step": 560
+    },
+    {
+      "epoch": 0.2956048230260599,
+      "grad_norm": 1.5517808198928833,
+      "learning_rate": 7.858796296296297e-06,
+      "loss": 2.99,
+      "step": 570
+    },
+    {
+      "epoch": 0.3007908725528329,
+      "grad_norm": 1.3038547039031982,
+      "learning_rate": 7.800925925925926e-06,
+      "loss": 2.8649,
+      "step": 580
+    },
+    {
+      "epoch": 0.30597692207960586,
+      "grad_norm": 1.515796184539795,
+      "learning_rate": 7.743055555555556e-06,
+      "loss": 2.8558,
+      "step": 590
+    },
+    {
+      "epoch": 0.31116297160637885,
+      "grad_norm": 1.7148423194885254,
+      "learning_rate": 7.685185185185185e-06,
+      "loss": 2.8933,
+      "step": 600
+    },
+    {
+      "epoch": 0.31116297160637885,
+      "eval_loss": 2.9627323150634766,
+      "eval_runtime": 87.0142,
+      "eval_samples_per_second": 78.792,
+      "eval_steps_per_second": 9.849,
+      "step": 600
+    },
+    {
+      "epoch": 0.31634902113315183,
+      "grad_norm": 1.7385451793670654,
+      "learning_rate": 7.627314814814816e-06,
+      "loss": 2.9509,
+      "step": 610
+    },
+    {
+      "epoch": 0.3215350706599248,
+      "grad_norm": 1.514612078666687,
+      "learning_rate": 7.569444444444445e-06,
+      "loss": 2.9305,
+      "step": 620
+    },
+    {
+      "epoch": 0.3267211201866978,
+      "grad_norm": 1.6873114109039307,
+      "learning_rate": 7.511574074074075e-06,
+      "loss": 2.908,
+      "step": 630
+    },
+    {
+      "epoch": 0.3319071697134708,
+      "grad_norm": 1.757570505142212,
+      "learning_rate": 7.453703703703704e-06,
+      "loss": 2.9484,
+      "step": 640
+    },
+    {
+      "epoch": 0.33709321924024377,
+      "grad_norm": 1.4156616926193237,
+      "learning_rate": 7.395833333333335e-06,
+      "loss": 2.9533,
+      "step": 650
+    },
+    {
+      "epoch": 0.33709321924024377,
+      "eval_loss": 2.9573678970336914,
+      "eval_runtime": 86.9385,
+      "eval_samples_per_second": 78.86,
+      "eval_steps_per_second": 9.858,
+      "step": 650
+    },
+    {
+      "epoch": 0.3422792687670167,
+      "grad_norm": 2.0717520713806152,
+      "learning_rate": 7.337962962962964e-06,
+      "loss": 2.8436,
+      "step": 660
+    },
+    {
+      "epoch": 0.3474653182937897,
+      "grad_norm": 1.478298544883728,
+      "learning_rate": 7.280092592592594e-06,
+      "loss": 2.9204,
+      "step": 670
+    },
+    {
+      "epoch": 0.35265136782056267,
+      "grad_norm": 1.712254524230957,
+      "learning_rate": 7.222222222222223e-06,
+      "loss": 2.9538,
+      "step": 680
+    },
+    {
+      "epoch": 0.35783741734733565,
+      "grad_norm": 1.3590025901794434,
+      "learning_rate": 7.164351851851853e-06,
+      "loss": 2.9414,
+      "step": 690
+    },
+    {
+      "epoch": 0.36302346687410864,
+      "grad_norm": 2.1505372524261475,
+      "learning_rate": 7.106481481481482e-06,
+      "loss": 3.0889,
+      "step": 700
+    },
+    {
+      "epoch": 0.36302346687410864,
+      "eval_loss": 2.9526450634002686,
+      "eval_runtime": 87.0066,
+      "eval_samples_per_second": 78.799,
+      "eval_steps_per_second": 9.85,
+      "step": 700
+    },
+    {
+      "epoch": 0.3682095164008816,
+      "grad_norm": 1.6749532222747803,
+      "learning_rate": 7.048611111111112e-06,
+      "loss": 2.9108,
+      "step": 710
+    },
+    {
+      "epoch": 0.3733955659276546,
+      "grad_norm": 1.8624922037124634,
+      "learning_rate": 6.990740740740741e-06,
+      "loss": 2.7685,
+      "step": 720
+    },
+    {
+      "epoch": 0.3785816154544276,
+      "grad_norm": 1.6713345050811768,
+      "learning_rate": 6.932870370370371e-06,
+      "loss": 2.9072,
+      "step": 730
+    },
+    {
+      "epoch": 0.3837676649812006,
+      "grad_norm": 1.852766513824463,
+      "learning_rate": 6.875e-06,
+      "loss": 2.8404,
+      "step": 740
+    },
+    {
+      "epoch": 0.38895371450797356,
+      "grad_norm": 1.5447180271148682,
+      "learning_rate": 6.817129629629629e-06,
+      "loss": 2.8887,
+      "step": 750
+    },
+    {
+      "epoch": 0.38895371450797356,
+      "eval_loss": 2.9479236602783203,
+      "eval_runtime": 87.1235,
+      "eval_samples_per_second": 78.693,
+      "eval_steps_per_second": 9.837,
+      "step": 750
+    },
+    {
+      "epoch": 0.39413976403474654,
+      "grad_norm": 1.8909484148025513,
+      "learning_rate": 6.75925925925926e-06,
+      "loss": 2.9101,
+      "step": 760
+    },
+    {
+      "epoch": 0.3993258135615195,
+      "grad_norm": 2.3639767169952393,
+      "learning_rate": 6.701388888888889e-06,
+      "loss": 2.6818,
+      "step": 770
+    },
+    {
+      "epoch": 0.4045118630882925,
+      "grad_norm": 1.801645040512085,
+      "learning_rate": 6.643518518518519e-06,
+      "loss": 2.8709,
+      "step": 780
+    },
+    {
+      "epoch": 0.4096979126150655,
+      "grad_norm": 1.6030060052871704,
+      "learning_rate": 6.5856481481481484e-06,
+      "loss": 2.9025,
+      "step": 790
+    },
+    {
+      "epoch": 0.4148839621418385,
+      "grad_norm": 1.7171462774276733,
+      "learning_rate": 6.5277777777777784e-06,
+      "loss": 3.0349,
+      "step": 800
+    },
+    {
+      "epoch": 0.4148839621418385,
+      "eval_loss": 2.943669557571411,
+      "eval_runtime": 86.9632,
+      "eval_samples_per_second": 78.838,
+      "eval_steps_per_second": 9.855,
+      "step": 800
+    },
+    {
+      "epoch": 0.4200700116686114,
+      "grad_norm": 1.5010789632797241,
+      "learning_rate": 6.4699074074074076e-06,
+      "loss": 2.8971,
+      "step": 810
+    },
+    {
+      "epoch": 0.4252560611953844,
+      "grad_norm": 1.8026429414749146,
+      "learning_rate": 6.4120370370370375e-06,
+      "loss": 2.9048,
+      "step": 820
+    },
+    {
+      "epoch": 0.4304421107221574,
+      "grad_norm": 2.0750973224639893,
+      "learning_rate": 6.354166666666667e-06,
+      "loss": 2.8126,
+      "step": 830
+    },
+    {
+      "epoch": 0.43562816024893036,
+      "grad_norm": 1.7901376485824585,
+      "learning_rate": 6.296296296296297e-06,
+      "loss": 2.8484,
+      "step": 840
+    },
+    {
+      "epoch": 0.44081420977570335,
+      "grad_norm": 1.805792212486267,
+      "learning_rate": 6.238425925925926e-06,
+      "loss": 2.8988,
+      "step": 850
+    },
+    {
+      "epoch": 0.44081420977570335,
+      "eval_loss": 2.9402518272399902,
+      "eval_runtime": 87.0076,
+      "eval_samples_per_second": 78.798,
+      "eval_steps_per_second": 9.85,
+      "step": 850
+    },
+    {
+      "epoch": 0.44600025930247633,
+      "grad_norm": 1.8961949348449707,
+      "learning_rate": 6.180555555555557e-06,
+      "loss": 2.8334,
+      "step": 860
+    },
+    {
+      "epoch": 0.4511863088292493,
+      "grad_norm": 1.4542121887207031,
+      "learning_rate": 6.122685185185186e-06,
+      "loss": 2.9224,
+      "step": 870
+    },
+    {
+      "epoch": 0.4563723583560223,
+      "grad_norm": 1.6367915868759155,
+      "learning_rate": 6.064814814814816e-06,
+      "loss": 2.8376,
+      "step": 880
+    },
+    {
+      "epoch": 0.4615584078827953,
+      "grad_norm": 1.796552062034607,
+      "learning_rate": 6.006944444444445e-06,
+      "loss": 2.8408,
+      "step": 890
+    },
+    {
+      "epoch": 0.46674445740956827,
+      "grad_norm": 1.7545686960220337,
+      "learning_rate": 5.949074074074075e-06,
+      "loss": 3.0174,
+      "step": 900
+    },
+    {
+      "epoch": 0.46674445740956827,
+      "eval_loss": 2.9366812705993652,
+      "eval_runtime": 87.091,
+      "eval_samples_per_second": 78.722,
+      "eval_steps_per_second": 9.84,
+      "step": 900
+    },
+    {
+      "epoch": 0.47193050693634125,
+      "grad_norm": 1.7723950147628784,
+      "learning_rate": 5.891203703703704e-06,
+      "loss": 2.8517,
+      "step": 910
+    },
+    {
+      "epoch": 0.47711655646311424,
+      "grad_norm": 1.7314085960388184,
+      "learning_rate": 5.833333333333334e-06,
+      "loss": 2.8079,
+      "step": 920
+    },
+    {
+      "epoch": 0.4823026059898872,
+      "grad_norm": 1.993122935295105,
+      "learning_rate": 5.775462962962963e-06,
+      "loss": 2.9262,
+      "step": 930
+    },
+    {
+      "epoch": 0.4874886555166602,
+      "grad_norm": 2.3595573902130127,
+      "learning_rate": 5.717592592592593e-06,
+      "loss": 2.9056,
+      "step": 940
+    },
+    {
+      "epoch": 0.4926747050434332,
+      "grad_norm": 1.8839762210845947,
+      "learning_rate": 5.659722222222222e-06,
+      "loss": 2.8617,
+      "step": 950
+    },
+    {
+      "epoch": 0.4926747050434332,
+      "eval_loss": 2.9337050914764404,
+      "eval_runtime": 87.1298,
+      "eval_samples_per_second": 78.687,
+      "eval_steps_per_second": 9.836,
+      "step": 950
+    },
+    {
+      "epoch": 0.4978607545702061,
+      "grad_norm": 2.0222883224487305,
+      "learning_rate": 5.601851851851853e-06,
+      "loss": 2.869,
+      "step": 960
+    },
+    {
+      "epoch": 0.5030468040969791,
+      "grad_norm": 1.7449781894683838,
+      "learning_rate": 5.543981481481482e-06,
+      "loss": 2.885,
+      "step": 970
+    },
+    {
+      "epoch": 0.5082328536237521,
+      "grad_norm": 2.0181517601013184,
+      "learning_rate": 5.486111111111112e-06,
+      "loss": 2.8688,
+      "step": 980
+    },
+    {
+      "epoch": 0.5134189031505251,
+      "grad_norm": 1.6998732089996338,
+      "learning_rate": 5.428240740740741e-06,
+      "loss": 2.917,
+      "step": 990
+    },
+    {
+      "epoch": 0.5186049526772981,
+      "grad_norm": 1.8947477340698242,
+      "learning_rate": 5.370370370370371e-06,
+      "loss": 2.8555,
+      "step": 1000
+    },
+    {
+      "epoch": 0.5186049526772981,
+      "eval_loss": 2.931408643722534,
+      "eval_runtime": 87.0111,
+      "eval_samples_per_second": 78.795,
+      "eval_steps_per_second": 9.849,
+      "step": 1000
+    },
+    {
+      "epoch": 0.523791002204071,
+      "grad_norm": 2.1364452838897705,
+      "learning_rate": 5.3125e-06,
+      "loss": 2.7673,
+      "step": 1010
+    },
+    {
+      "epoch": 0.5289770517308441,
+      "grad_norm": 2.374993085861206,
+      "learning_rate": 5.2546296296296295e-06,
+      "loss": 2.8901,
+      "step": 1020
+    },
+    {
+      "epoch": 0.534163101257617,
+      "grad_norm": 2.291280746459961,
+      "learning_rate": 5.1967592592592595e-06,
+      "loss": 2.9203,
+      "step": 1030
+    },
+    {
+      "epoch": 0.5393491507843899,
+      "grad_norm": 2.6651227474212646,
+      "learning_rate": 5.138888888888889e-06,
+      "loss": 2.8047,
+      "step": 1040
+    },
+    {
+      "epoch": 0.544535200311163,
+      "grad_norm": 1.8160004615783691,
+      "learning_rate": 5.081018518518519e-06,
+      "loss": 2.8169,
+      "step": 1050
+    },
+    {
+      "epoch": 0.544535200311163,
+      "eval_loss": 2.9287192821502686,
+      "eval_runtime": 87.097,
+      "eval_samples_per_second": 78.717,
+      "eval_steps_per_second": 9.84,
+      "step": 1050
+    },
+    {
+      "epoch": 0.5497212498379359,
+      "grad_norm": 1.9724560976028442,
+      "learning_rate": 5.023148148148148e-06,
+      "loss": 2.9291,
+      "step": 1060
+    },
+    {
+      "epoch": 0.554907299364709,
+      "grad_norm": 2.5785677433013916,
+      "learning_rate": 4.9652777777777786e-06,
+      "loss": 2.8179,
+      "step": 1070
+    },
+    {
+      "epoch": 0.5600933488914819,
+      "grad_norm": 1.6687511205673218,
+      "learning_rate": 4.907407407407408e-06,
+      "loss": 2.7731,
+      "step": 1080
+    },
+    {
+      "epoch": 0.5652793984182549,
+      "grad_norm": 2.003852605819702,
+      "learning_rate": 4.849537037037038e-06,
+      "loss": 2.8838,
+      "step": 1090
+    },
+    {
+      "epoch": 0.5704654479450278,
+      "grad_norm": 2.078784227371216,
+      "learning_rate": 4.791666666666668e-06,
+      "loss": 2.9151,
+      "step": 1100
+    },
+    {
+      "epoch": 0.5704654479450278,
+      "eval_loss": 2.9263577461242676,
+      "eval_runtime": 87.0972,
+      "eval_samples_per_second": 78.717,
+      "eval_steps_per_second": 9.84,
+      "step": 1100
+    },
+    {
+      "epoch": 0.5756514974718009,
+      "grad_norm": 1.9661970138549805,
+      "learning_rate": 4.733796296296297e-06,
+      "loss": 2.9136,
+      "step": 1110
+    },
+    {
+      "epoch": 0.5808375469985738,
+      "grad_norm": 2.3089287281036377,
+      "learning_rate": 4.675925925925927e-06,
+      "loss": 2.8187,
+      "step": 1120
+    },
+    {
+      "epoch": 0.5860235965253469,
+      "grad_norm": 1.668616771697998,
+      "learning_rate": 4.618055555555556e-06,
+      "loss": 2.777,
+      "step": 1130
+    },
+    {
+      "epoch": 0.5912096460521198,
+      "grad_norm": 1.6333500146865845,
+      "learning_rate": 4.560185185185186e-06,
+      "loss": 2.8093,
+      "step": 1140
+    },
+    {
+      "epoch": 0.5963956955788928,
+      "grad_norm": 1.801171064376831,
+      "learning_rate": 4.502314814814815e-06,
+      "loss": 2.774,
+      "step": 1150
+    },
+    {
+      "epoch": 0.5963956955788928,
+      "eval_loss": 2.92441725730896,
+      "eval_runtime": 87.1718,
+      "eval_samples_per_second": 78.649,
+      "eval_steps_per_second": 9.831,
+      "step": 1150
+    },
+    {
+      "epoch": 0.6015817451056658,
+      "grad_norm": 1.9352556467056274,
+      "learning_rate": 4.444444444444444e-06,
+      "loss": 2.8634,
+      "step": 1160
+    },
+    {
+      "epoch": 0.6067677946324388,
+      "grad_norm": 2.076831817626953,
+      "learning_rate": 4.386574074074074e-06,
+      "loss": 2.8592,
+      "step": 1170
+    },
+    {
+      "epoch": 0.6119538441592117,
+      "grad_norm": 2.5992085933685303,
+      "learning_rate": 4.328703703703704e-06,
+      "loss": 2.7938,
+      "step": 1180
+    },
+    {
+      "epoch": 0.6171398936859847,
+      "grad_norm": 1.8197615146636963,
+      "learning_rate": 4.270833333333333e-06,
+      "loss": 2.8091,
+      "step": 1190
+    },
+    {
+      "epoch": 0.6223259432127577,
+      "grad_norm": 2.15179181098938,
+      "learning_rate": 4.212962962962963e-06,
+      "loss": 2.7899,
+      "step": 1200
+    },
+    {
+      "epoch": 0.6223259432127577,
+      "eval_loss": 2.922563314437866,
+      "eval_runtime": 87.083,
+      "eval_samples_per_second": 78.729,
+      "eval_steps_per_second": 9.841,
+      "step": 1200
+    },
+    {
+      "epoch": 0.6275119927395306,
+      "grad_norm": 2.6506097316741943,
+      "learning_rate": 4.155092592592593e-06,
+      "loss": 2.8336,
+      "step": 1210
+    },
+    {
+      "epoch": 0.6326980422663037,
+      "grad_norm": 2.2627806663513184,
+      "learning_rate": 4.097222222222222e-06,
+      "loss": 2.8635,
+      "step": 1220
+    },
+    {
+      "epoch": 0.6378840917930766,
+      "grad_norm": 2.34142804145813,
+      "learning_rate": 4.039351851851852e-06,
+      "loss": 2.9534,
+      "step": 1230
+    },
+    {
+      "epoch": 0.6430701413198496,
+      "grad_norm": 1.9826796054840088,
+      "learning_rate": 3.9814814814814814e-06,
+      "loss": 2.8219,
+      "step": 1240
+    },
+    {
+      "epoch": 0.6482561908466226,
+      "grad_norm": 2.4062864780426025,
+      "learning_rate": 3.9236111111111114e-06,
+      "loss": 2.8348,
+      "step": 1250
+    },
+    {
+      "epoch": 0.6482561908466226,
+      "eval_loss": 2.921142339706421,
+      "eval_runtime": 87.113,
+      "eval_samples_per_second": 78.702,
+      "eval_steps_per_second": 9.838,
+      "step": 1250
+    },
+    {
+      "epoch": 0.6534422403733956,
+      "grad_norm": 1.7716823816299438,
+      "learning_rate": 3.865740740740741e-06,
+      "loss": 2.8568,
+      "step": 1260
+    },
+    {
+      "epoch": 0.6586282899001685,
+      "grad_norm": 2.330777406692505,
+      "learning_rate": 3.8078703703703705e-06,
+      "loss": 2.9541,
+      "step": 1270
+    },
+    {
+      "epoch": 0.6638143394269416,
+      "grad_norm": 1.8923736810684204,
+      "learning_rate": 3.7500000000000005e-06,
+      "loss": 2.8021,
+      "step": 1280
+    },
+    {
+      "epoch": 0.6690003889537145,
+      "grad_norm": 1.858708381652832,
+      "learning_rate": 3.69212962962963e-06,
+      "loss": 2.8218,
+      "step": 1290
+    },
+    {
+      "epoch": 0.6741864384804875,
+      "grad_norm": 1.7053884267807007,
+      "learning_rate": 3.6342592592592596e-06,
+      "loss": 2.7704,
+      "step": 1300
+    },
+    {
+      "epoch": 0.6741864384804875,
+      "eval_loss": 2.919332504272461,
+      "eval_runtime": 87.1672,
+      "eval_samples_per_second": 78.653,
+      "eval_steps_per_second": 9.832,
+      "step": 1300
+    },
+    {
+      "epoch": 0.6793724880072605,
+      "grad_norm": 2.1638433933258057,
+      "learning_rate": 3.576388888888889e-06,
+      "loss": 2.8667,
+      "step": 1310
+    },
+    {
+      "epoch": 0.6845585375340334,
+      "grad_norm": 1.951277256011963,
+      "learning_rate": 3.5185185185185187e-06,
+      "loss": 2.8129,
+      "step": 1320
+    },
+    {
+      "epoch": 0.6897445870608064,
+      "grad_norm": 2.4789998531341553,
+      "learning_rate": 3.4606481481481487e-06,
+      "loss": 2.8462,
+      "step": 1330
+    },
+    {
+      "epoch": 0.6949306365875794,
+      "grad_norm": 1.6878948211669922,
+      "learning_rate": 3.4027777777777783e-06,
+      "loss": 2.8111,
+      "step": 1340
+    },
+    {
+      "epoch": 0.7001166861143524,
+      "grad_norm": 1.99266517162323,
+      "learning_rate": 3.344907407407408e-06,
+      "loss": 2.9152,
+      "step": 1350
+    },
+    {
+      "epoch": 0.7001166861143524,
+      "eval_loss": 2.917971134185791,
+      "eval_runtime": 87.0384,
+      "eval_samples_per_second": 78.77,
+      "eval_steps_per_second": 9.846,
+      "step": 1350
+    },
+    {
+      "epoch": 0.7053027356411253,
+      "grad_norm": 1.6022475957870483,
+      "learning_rate": 3.2870370370370374e-06,
+      "loss": 2.8767,
+      "step": 1360
+    },
+    {
+      "epoch": 0.7104887851678984,
+      "grad_norm": 2.165276527404785,
+      "learning_rate": 3.229166666666667e-06,
+      "loss": 2.8321,
+      "step": 1370
+    },
+    {
+      "epoch": 0.7156748346946713,
+      "grad_norm": 2.1206817626953125,
+      "learning_rate": 3.171296296296297e-06,
+      "loss": 2.8714,
+      "step": 1380
+    },
+    {
+      "epoch": 0.7208608842214443,
+      "grad_norm": 2.4431049823760986,
+      "learning_rate": 3.1134259259259265e-06,
+      "loss": 2.8628,
+      "step": 1390
+    },
+    {
+      "epoch": 0.7260469337482173,
+      "grad_norm": 2.304499387741089,
+      "learning_rate": 3.055555555555556e-06,
+      "loss": 2.7895,
+      "step": 1400
+    },
+    {
+      "epoch": 0.7260469337482173,
+      "eval_loss": 2.917088270187378,
+      "eval_runtime": 87.1021,
+      "eval_samples_per_second": 78.712,
+      "eval_steps_per_second": 9.839,
+      "step": 1400
+    },
+    {
+      "epoch": 0.7312329832749903,
+      "grad_norm": 1.3344799280166626,
+      "learning_rate": 2.9976851851851856e-06,
+      "loss": 2.9102,
+      "step": 1410
+    },
+    {
+      "epoch": 0.7364190328017632,
+      "grad_norm": 1.719092607498169,
+      "learning_rate": 2.9398148148148147e-06,
+      "loss": 2.7918,
+      "step": 1420
+    },
+    {
+      "epoch": 0.7416050823285363,
+      "grad_norm": 1.8856807947158813,
+      "learning_rate": 2.8819444444444443e-06,
+      "loss": 2.9078,
+      "step": 1430
+    },
+    {
+      "epoch": 0.7467911318553092,
+      "grad_norm": 2.222034215927124,
+      "learning_rate": 2.8240740740740743e-06,
+      "loss": 2.8047,
+      "step": 1440
+    },
+    {
+      "epoch": 0.7519771813820822,
+      "grad_norm": 2.0205276012420654,
+      "learning_rate": 2.766203703703704e-06,
+      "loss": 2.8561,
+      "step": 1450
+    },
+    {
+      "epoch": 0.7519771813820822,
+      "eval_loss": 2.9157726764678955,
+      "eval_runtime": 87.0003,
+      "eval_samples_per_second": 78.804,
+      "eval_steps_per_second": 9.851,
+      "step": 1450
+    },
+    {
+      "epoch": 0.7571632309088552,
+      "grad_norm": 2.2088751792907715,
+      "learning_rate": 2.7083333333333334e-06,
+      "loss": 2.8221,
+      "step": 1460
+    },
+    {
+      "epoch": 0.7623492804356281,
+      "grad_norm": 1.8010945320129395,
+      "learning_rate": 2.650462962962963e-06,
+      "loss": 2.9598,
+      "step": 1470
+    },
+    {
+      "epoch": 0.7675353299624011,
+      "grad_norm": 1.8427363634109497,
+      "learning_rate": 2.5925925925925925e-06,
+      "loss": 2.9063,
+      "step": 1480
+    },
+    {
+      "epoch": 0.7727213794891741,
+      "grad_norm": 2.112938404083252,
+      "learning_rate": 2.5347222222222225e-06,
+      "loss": 2.8391,
+      "step": 1490
+    },
+    {
+      "epoch": 0.7779074290159471,
+      "grad_norm": 1.8270915746688843,
+      "learning_rate": 2.476851851851852e-06,
+      "loss": 2.7957,
+      "step": 1500
+    },
+    {
+      "epoch": 0.7779074290159471,
+      "eval_loss": 2.9150469303131104,
+      "eval_runtime": 87.2201,
+      "eval_samples_per_second": 78.606,
+      "eval_steps_per_second": 9.826,
+      "step": 1500
+    },
+    {
+      "epoch": 0.78309347854272,
+      "grad_norm": 2.6664621829986572,
+      "learning_rate": 2.4189814814814816e-06,
+      "loss": 2.8278,
+      "step": 1510
+    },
+    {
+      "epoch": 0.7882795280694931,
+      "grad_norm": 2.025564193725586,
+      "learning_rate": 2.361111111111111e-06,
+      "loss": 2.8109,
+      "step": 1520
+    },
+    {
+      "epoch": 0.793465577596266,
+      "grad_norm": 1.752272605895996,
+      "learning_rate": 2.3032407407407407e-06,
+      "loss": 2.8855,
+      "step": 1530
+    },
+    {
+      "epoch": 0.798651627123039,
+      "grad_norm": 1.9612879753112793,
+      "learning_rate": 2.2453703703703707e-06,
+      "loss": 2.8185,
+      "step": 1540
+    },
+    {
+      "epoch": 0.803837676649812,
+      "grad_norm": 1.9773454666137695,
+      "learning_rate": 2.1875000000000002e-06,
+      "loss": 2.8071,
+      "step": 1550
+    },
+    {
+      "epoch": 0.803837676649812,
+      "eval_loss": 2.9142231941223145,
+      "eval_runtime": 87.0585,
+      "eval_samples_per_second": 78.752,
+      "eval_steps_per_second": 9.844,
+      "step": 1550
+    },
+    {
+      "epoch": 0.809023726176585,
+      "grad_norm": 3.341409206390381,
+      "learning_rate": 2.1296296296296298e-06,
+      "loss": 2.8467,
+      "step": 1560
+    },
+    {
+      "epoch": 0.814209775703358,
+      "grad_norm": 1.7923222780227661,
+      "learning_rate": 2.0717592592592593e-06,
+      "loss": 2.8776,
+      "step": 1570
+    },
+    {
+      "epoch": 0.819395825230131,
+      "grad_norm": 1.8666836023330688,
+      "learning_rate": 2.0138888888888893e-06,
+      "loss": 2.8347,
+      "step": 1580
+    },
+    {
+      "epoch": 0.8245818747569039,
+      "grad_norm": 2.2585649490356445,
+      "learning_rate": 1.956018518518519e-06,
+      "loss": 2.8068,
+      "step": 1590
+    },
+    {
+      "epoch": 0.829767924283677,
+      "grad_norm": 1.8757206201553345,
+      "learning_rate": 1.8981481481481484e-06,
+      "loss": 2.8491,
+      "step": 1600
+    },
+    {
+      "epoch": 0.829767924283677,
+      "eval_loss": 2.913360834121704,
+      "eval_runtime": 87.2275,
+      "eval_samples_per_second": 78.599,
+      "eval_steps_per_second": 9.825,
+      "step": 1600
+    },
+    {
+      "epoch": 0.8349539738104499,
+      "grad_norm": 1.6995919942855835,
+      "learning_rate": 1.840277777777778e-06,
+      "loss": 2.8562,
+      "step": 1610
+    },
+    {
+      "epoch": 0.8401400233372228,
+      "grad_norm": 1.780346393585205,
+      "learning_rate": 1.7824074074074073e-06,
+      "loss": 2.8461,
+      "step": 1620
+    },
+    {
+      "epoch": 0.8453260728639959,
+      "grad_norm": 2.147747755050659,
+      "learning_rate": 1.724537037037037e-06,
+      "loss": 2.7899,
+      "step": 1630
+    },
+    {
+      "epoch": 0.8505121223907688,
+      "grad_norm": 1.769389033317566,
+      "learning_rate": 1.6666666666666667e-06,
+      "loss": 2.8092,
+      "step": 1640
+    },
+    {
+      "epoch": 0.8556981719175418,
+      "grad_norm": 3.257025957107544,
+      "learning_rate": 1.6087962962962964e-06,
+      "loss": 2.8567,
+      "step": 1650
+    },
+    {
+      "epoch": 0.8556981719175418,
+      "eval_loss": 2.9123799800872803,
+      "eval_runtime": 87.0603,
+      "eval_samples_per_second": 78.75,
+      "eval_steps_per_second": 9.844,
+      "step": 1650
+    },
+    {
+      "epoch": 0.8608842214443148,
+      "grad_norm": 1.7707329988479614,
+      "learning_rate": 1.550925925925926e-06,
+      "loss": 2.856,
+      "step": 1660
+    },
+    {
+      "epoch": 0.8660702709710878,
+      "grad_norm": 1.8951716423034668,
+      "learning_rate": 1.4930555555555555e-06,
+      "loss": 2.8768,
+      "step": 1670
+    },
+    {
+      "epoch": 0.8712563204978607,
+      "grad_norm": 2.0087685585021973,
+      "learning_rate": 1.4351851851851853e-06,
+      "loss": 2.8264,
+      "step": 1680
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1928,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.6099718731923456e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c2f9bb710eae82136aef820a5f3e9478da07c3b8a8ce4419dc998b680738e7e7
+size 5304