Training in progress, step 206, checkpoint
Browse files- checkpoint-206/1_Pooling/config.json +10 -0
- checkpoint-206/README.md +733 -0
- checkpoint-206/added_tokens.json +3 -0
- checkpoint-206/config.json +35 -0
- checkpoint-206/config_sentence_transformers.json +10 -0
- checkpoint-206/modules.json +14 -0
- checkpoint-206/optimizer.pt +3 -0
- checkpoint-206/pytorch_model.bin +3 -0
- checkpoint-206/rng_state.pth +3 -0
- checkpoint-206/scheduler.pt +3 -0
- checkpoint-206/sentence_bert_config.json +4 -0
- checkpoint-206/special_tokens_map.json +51 -0
- checkpoint-206/spm.model +3 -0
- checkpoint-206/tokenizer.json +0 -0
- checkpoint-206/tokenizer_config.json +65 -0
- checkpoint-206/trainer_state.json +2213 -0
- checkpoint-206/training_args.bin +3 -0
checkpoint-206/1_Pooling/config.json
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"word_embedding_dimension": 768,
|
3 |
+
"pooling_mode_cls_token": false,
|
4 |
+
"pooling_mode_mean_tokens": true,
|
5 |
+
"pooling_mode_max_tokens": false,
|
6 |
+
"pooling_mode_mean_sqrt_len_tokens": false,
|
7 |
+
"pooling_mode_weightedmean_tokens": false,
|
8 |
+
"pooling_mode_lasttoken": false,
|
9 |
+
"include_prompt": true
|
10 |
+
}
|
checkpoint-206/README.md
ADDED
@@ -0,0 +1,733 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
base_model: bobox/DeBERTa-small-ST-v1-test-step3
|
3 |
+
datasets: []
|
4 |
+
language: []
|
5 |
+
library_name: sentence-transformers
|
6 |
+
metrics:
|
7 |
+
- pearson_cosine
|
8 |
+
- spearman_cosine
|
9 |
+
- pearson_manhattan
|
10 |
+
- spearman_manhattan
|
11 |
+
- pearson_euclidean
|
12 |
+
- spearman_euclidean
|
13 |
+
- pearson_dot
|
14 |
+
- spearman_dot
|
15 |
+
- pearson_max
|
16 |
+
- spearman_max
|
17 |
+
pipeline_tag: sentence-similarity
|
18 |
+
tags:
|
19 |
+
- sentence-transformers
|
20 |
+
- sentence-similarity
|
21 |
+
- feature-extraction
|
22 |
+
- generated_from_trainer
|
23 |
+
- dataset_size:163205
|
24 |
+
- loss:CachedGISTEmbedLoss
|
25 |
+
widget:
|
26 |
+
- source_sentence: interview question on why you want to leave your current job?
|
27 |
+
sentences:
|
28 |
+
- Carrier proteins bind and carry the molecules across the cell membrane. These
|
29 |
+
proteins bind a molecule on one side of the membrane, change shape as they carry
|
30 |
+
the molecule across the membrane, and deposit the molecule on the other side of
|
31 |
+
the membrane. Even though a protein is involved in both these methods of transport,
|
32 |
+
neither method requires energy. Therefore these are still types of passive transport.
|
33 |
+
- '[''Desire to learn.'', ''Desire to take on more responsibility.'', ''Desire to
|
34 |
+
take on less responsibility.'', ''Desire to relocate.'', ''Desire for a career
|
35 |
+
change.'', ''Desire to gain a new skill or grow a current skill.'', ''Company
|
36 |
+
reorganization has led to change in job content.'']'
|
37 |
+
- The small intestine is a narrow tube that starts at the stomach and ends at the
|
38 |
+
large intestine. In adults, it’s about 7 meters (23 feet) long. Most chemical
|
39 |
+
digestion and almost all nutrient absorption take place in the small intestine.
|
40 |
+
- source_sentence: 'They say a number of people from the Mujahideen-e-Khalq (MEK)
|
41 |
+
group were injured at Camp Liberty in Baghdad.
|
42 |
+
|
43 |
+
Baghdad has in the past repeatedly denied attacking the group.
|
44 |
+
|
45 |
+
MEK members fought with Iraq against Iran in the 1980s, but have since fallen
|
46 |
+
out with the current Iraqi government.
|
47 |
+
|
48 |
+
In an emailed message, the Paris-based National Council of Resistance of Iran
|
49 |
+
(NCRI), the MEK''s parent group, said dozens of missiles hit the camp on Thursday
|
50 |
+
evening.
|
51 |
+
|
52 |
+
It said two residents were killed and a third later died in hospital of his wounds.
|
53 |
+
|
54 |
+
The US condemned the attack "in the strongest terms" and urged Iraq to better
|
55 |
+
protect the camp.
|
56 |
+
|
57 |
+
An Iranian-backed Shia militia, al-Mukhtar Army, said it had fired rockets at
|
58 |
+
the camp, Reuters news agency reported.
|
59 |
+
|
60 |
+
The camp is located in a former US military base, near Baghdad''s airport.
|
61 |
+
|
62 |
+
The Iraqi authorities have made no public comments on the report. However, one
|
63 |
+
security official was quoted by the Associated Press as saying four rockets hit
|
64 |
+
the camp, injuring two people.
|
65 |
+
|
66 |
+
In September, the MEK accused Iraqi forces of attacking Camp Ashraf north-east
|
67 |
+
of Baghdad and killing 52 of the group''s members.
|
68 |
+
|
69 |
+
In recent years, Baghdad has been trying to dismantle MEK camps and eject the
|
70 |
+
group.
|
71 |
+
|
72 |
+
Iran considers the MEK a terrorist group.
|
73 |
+
|
74 |
+
The group was removed from the US state department''s list of terrorist organisations
|
75 |
+
last year.'
|
76 |
+
sentences:
|
77 |
+
- A rocket attack has killed three members of an Iranian opposition group in Iraq,
|
78 |
+
the group and its parent organisation say.
|
79 |
+
- Directions See How It's Made. 1 Preheat the oven to 375 degrees. 2 Heat a large
|
80 |
+
skillet over medium heat. 3 Add the spinach. Spread the spinach mixture evenly
|
81 |
+
on the bottom of the prepared baking 1 dish. Bake until the egg whites are set,
|
82 |
+
about 25 minutes. Let the casserole sit for about 5 minutes, and then cut into
|
83 |
+
pieces and 1 serve. Submit a Correction.
|
84 |
+
- 'Toll free phone number: 011-44-871-246-0002. Ryan Airlines www.ryanair.com is
|
85 |
+
a low-fare airlines headquartered in Dublin, Ireland. Transporting over 103 million
|
86 |
+
passengers last year there are 1600 daily flights with 185 destinations. The Ryan
|
87 |
+
fleet consists of 300 new Boeing 737-800 aircraft in operation with 283 738 aircraft
|
88 |
+
on order.'
|
89 |
+
- source_sentence: In what unit is heat measured in?
|
90 |
+
sentences:
|
91 |
+
- The heat that is either absorbed or released is measured in joules. The mass is
|
92 |
+
measured in grams. The change in temperature is given by , where is the final
|
93 |
+
temperature and is the initial temperature.
|
94 |
+
- The nitrogen atom of a primary amine is bonded to two hydrogen atoms and one carbon.
|
95 |
+
The nitrogen atom of a secondary amine is bonded to one hydrogen and two carbons.
|
96 |
+
The nitrogen atom of a tertiary amine is bonded to three carbon atoms. Amines
|
97 |
+
are typically named by a common system rather than by IUPAC guidelines. The common
|
98 |
+
system for naming amines along with several examples is shown below.
|
99 |
+
- 'Seattle Symphony Live @ Benaroya Hall — Windborne''s The Music of David Bowie:
|
100 |
+
A Rock Symphony with the Seattle Symphony Tuesday, 10 January, 2017 7:30PM Join
|
101 |
+
conductor Brent Havens and a full rock band on a symphonic musical odyssey that
|
102 |
+
explores the incredible range of David Bowie’s Music.'
|
103 |
+
- source_sentence: meristematic tissue definition
|
104 |
+
sentences:
|
105 |
+
- In this chapter, you saw how pressure and buoyancy of fluids can be used to make
|
106 |
+
work easier — from raising a car on a lift to floating a ship on the ocean. Devices
|
107 |
+
that make work easier are called machines in physics.
|
108 |
+
- A Land Rover is splashing water as it crosses a river.
|
109 |
+
- "meristem. n. 1. (Botany) a plant tissue responsible for growth, whose cells divide\
|
110 |
+
\ and differentiate to form the tissues and organs of the plant. Meristems occur\
|
111 |
+
\ within the stem (see cambium) and leaves and at the tips of stems and roots.\
|
112 |
+
\ [C19: from Greek meristos divided, from merizein to divide, from meris portion].\
|
113 |
+
\ (Ë\x88mÉ\x9Br É\x99Ë\x8CstÉ\x9Bm)."
|
114 |
+
- source_sentence: More than 190 countries and territories around the world had confirmed
|
115 |
+
coronavirus cases by March 22 , 2020 .
|
116 |
+
sentences:
|
117 |
+
- when electricity flows to a light bulb , the light bulb will come on
|
118 |
+
- As of 22 March , more than 337,000 cases of COVID-19 have been reported in over
|
119 |
+
190 countries and territories , resulting in more than 14,400 deaths and 96,000
|
120 |
+
recoveries .
|
121 |
+
- a greenhouse is used to protect plants by keeping them warm
|
122 |
+
model-index:
|
123 |
+
- name: SentenceTransformer based on bobox/DeBERTa-small-ST-v1-test-step3
|
124 |
+
results:
|
125 |
+
- task:
|
126 |
+
type: semantic-similarity
|
127 |
+
name: Semantic Similarity
|
128 |
+
dataset:
|
129 |
+
name: sts test
|
130 |
+
type: sts-test
|
131 |
+
metrics:
|
132 |
+
- type: pearson_cosine
|
133 |
+
value: 0.8785914848590666
|
134 |
+
name: Pearson Cosine
|
135 |
+
- type: spearman_cosine
|
136 |
+
value: 0.9048987433800361
|
137 |
+
name: Spearman Cosine
|
138 |
+
- type: pearson_manhattan
|
139 |
+
value: 0.9087606701935215
|
140 |
+
name: Pearson Manhattan
|
141 |
+
- type: spearman_manhattan
|
142 |
+
value: 0.9056138237858093
|
143 |
+
name: Spearman Manhattan
|
144 |
+
- type: pearson_euclidean
|
145 |
+
value: 0.9086611488562145
|
146 |
+
name: Pearson Euclidean
|
147 |
+
- type: spearman_euclidean
|
148 |
+
value: 0.9052247563192726
|
149 |
+
name: Spearman Euclidean
|
150 |
+
- type: pearson_dot
|
151 |
+
value: 0.8570818659891223
|
152 |
+
name: Pearson Dot
|
153 |
+
- type: spearman_dot
|
154 |
+
value: 0.8616398023022556
|
155 |
+
name: Spearman Dot
|
156 |
+
- type: pearson_max
|
157 |
+
value: 0.9087606701935215
|
158 |
+
name: Pearson Max
|
159 |
+
- type: spearman_max
|
160 |
+
value: 0.9056138237858093
|
161 |
+
name: Spearman Max
|
162 |
+
---
|
163 |
+
|
164 |
+
# SentenceTransformer based on bobox/DeBERTa-small-ST-v1-test-step3
|
165 |
+
|
166 |
+
This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [bobox/DeBERTa-small-ST-v1-test-step3](https://huggingface.co/bobox/DeBERTa-small-ST-v1-test-step3) on the bobox/enhanced_nli-50_k dataset. It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
|
167 |
+
|
168 |
+
## Model Details
|
169 |
+
|
170 |
+
### Model Description
|
171 |
+
- **Model Type:** Sentence Transformer
|
172 |
+
- **Base model:** [bobox/DeBERTa-small-ST-v1-test-step3](https://huggingface.co/bobox/DeBERTa-small-ST-v1-test-step3) <!-- at revision df9aaa75fe0c2791e5ed35ff33de1689d9a5f5ff -->
|
173 |
+
- **Maximum Sequence Length:** 512 tokens
|
174 |
+
- **Output Dimensionality:** 768 tokens
|
175 |
+
- **Similarity Function:** Cosine Similarity
|
176 |
+
- **Training Dataset:**
|
177 |
+
- bobox/enhanced_nli-50_k
|
178 |
+
<!-- - **Language:** Unknown -->
|
179 |
+
<!-- - **License:** Unknown -->
|
180 |
+
|
181 |
+
### Model Sources
|
182 |
+
|
183 |
+
- **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
|
184 |
+
- **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
|
185 |
+
- **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)
|
186 |
+
|
187 |
+
### Full Model Architecture
|
188 |
+
|
189 |
+
```
|
190 |
+
SentenceTransformer(
|
191 |
+
(0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: DebertaV2Model
|
192 |
+
(1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
|
193 |
+
)
|
194 |
+
```
|
195 |
+
|
196 |
+
## Usage
|
197 |
+
|
198 |
+
### Direct Usage (Sentence Transformers)
|
199 |
+
|
200 |
+
First install the Sentence Transformers library:
|
201 |
+
|
202 |
+
```bash
|
203 |
+
pip install -U sentence-transformers
|
204 |
+
```
|
205 |
+
|
206 |
+
Then you can load this model and run inference.
|
207 |
+
```python
|
208 |
+
from sentence_transformers import SentenceTransformer
|
209 |
+
|
210 |
+
# Download from the 🤗 Hub
|
211 |
+
model = SentenceTransformer("bobox/DeBERTa-small-ST-v1-test-UnifiedDatasets-Ft-checkpoints-tmp")
|
212 |
+
# Run inference
|
213 |
+
sentences = [
|
214 |
+
'More than 190 countries and territories around the world had confirmed coronavirus cases by March 22 , 2020 .',
|
215 |
+
'As of 22 March , more than 337,000 cases of COVID-19 have been reported in over 190 countries and territories , resulting in more than 14,400 deaths and 96,000 recoveries .',
|
216 |
+
'a greenhouse is used to protect plants by keeping them warm',
|
217 |
+
]
|
218 |
+
embeddings = model.encode(sentences)
|
219 |
+
print(embeddings.shape)
|
220 |
+
# [3, 768]
|
221 |
+
|
222 |
+
# Get the similarity scores for the embeddings
|
223 |
+
similarities = model.similarity(embeddings, embeddings)
|
224 |
+
print(similarities.shape)
|
225 |
+
# [3, 3]
|
226 |
+
```
|
227 |
+
|
228 |
+
<!--
|
229 |
+
### Direct Usage (Transformers)
|
230 |
+
|
231 |
+
<details><summary>Click to see the direct usage in Transformers</summary>
|
232 |
+
|
233 |
+
</details>
|
234 |
+
-->
|
235 |
+
|
236 |
+
<!--
|
237 |
+
### Downstream Usage (Sentence Transformers)
|
238 |
+
|
239 |
+
You can finetune this model on your own dataset.
|
240 |
+
|
241 |
+
<details><summary>Click to expand</summary>
|
242 |
+
|
243 |
+
</details>
|
244 |
+
-->
|
245 |
+
|
246 |
+
<!--
|
247 |
+
### Out-of-Scope Use
|
248 |
+
|
249 |
+
*List how the model may foreseeably be misused and address what users ought not to do with the model.*
|
250 |
+
-->
|
251 |
+
|
252 |
+
## Evaluation
|
253 |
+
|
254 |
+
### Metrics
|
255 |
+
|
256 |
+
#### Semantic Similarity
|
257 |
+
* Dataset: `sts-test`
|
258 |
+
* Evaluated with [<code>EmbeddingSimilarityEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.EmbeddingSimilarityEvaluator)
|
259 |
+
|
260 |
+
| Metric | Value |
|
261 |
+
|:--------------------|:-----------|
|
262 |
+
| pearson_cosine | 0.8786 |
|
263 |
+
| **spearman_cosine** | **0.9049** |
|
264 |
+
| pearson_manhattan | 0.9088 |
|
265 |
+
| spearman_manhattan | 0.9056 |
|
266 |
+
| pearson_euclidean | 0.9087 |
|
267 |
+
| spearman_euclidean | 0.9052 |
|
268 |
+
| pearson_dot | 0.8571 |
|
269 |
+
| spearman_dot | 0.8616 |
|
270 |
+
| pearson_max | 0.9088 |
|
271 |
+
| spearman_max | 0.9056 |
|
272 |
+
|
273 |
+
<!--
|
274 |
+
## Bias, Risks and Limitations
|
275 |
+
|
276 |
+
*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
|
277 |
+
-->
|
278 |
+
|
279 |
+
<!--
|
280 |
+
### Recommendations
|
281 |
+
|
282 |
+
*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
|
283 |
+
-->
|
284 |
+
|
285 |
+
## Training Details
|
286 |
+
|
287 |
+
### Training Dataset
|
288 |
+
|
289 |
+
#### bobox/enhanced_nli-50_k
|
290 |
+
|
291 |
+
* Dataset: bobox/enhanced_nli-50_k
|
292 |
+
* Size: 163,205 training samples
|
293 |
+
* Columns: <code>sentence1</code> and <code>sentence2</code>
|
294 |
+
* Approximate statistics based on the first 1000 samples:
|
295 |
+
| | sentence1 | sentence2 |
|
296 |
+
|:--------|:-----------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------|
|
297 |
+
| type | string | string |
|
298 |
+
| details | <ul><li>min: 4 tokens</li><li>mean: 36.63 tokens</li><li>max: 370 tokens</li></ul> | <ul><li>min: 2 tokens</li><li>mean: 54.93 tokens</li><li>max: 363 tokens</li></ul> |
|
299 |
+
* Samples:
|
300 |
+
| sentence1 | sentence2 |
|
301 |
+
|:----------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
302 |
+
| <code>apple customer number</code> | <code>Apple Customer Service. 800-676-2775. Before dialing Apple Customer Service. We've tried for years to eliminate that 44 minute wait for you. But the fact is that it can still take many phone calls to Apple to resolve your issue. Now you can hire GetHuman to do all the work on the phone for you.</code> |
|
303 |
+
| <code>Molly 's Game grossed more than $ 28.5 million in the US and Canada and less than $ 24.65 in other countries .</code> | <code>, Molly 's Game has grossed $ 28.8 million in the United States and Canada , and $ 24.6 million in other territories , for a worldwide total of $ 53.4 million .</code> |
|
304 |
+
| <code>mawk definition</code> | <code>Definitions for mawk. Here are all the possible meanings and translations of the word mawk. Wiktionary(0.00 / 0 votes)Rate this definition: Origin: From mawk, mauk, a contraction of mathek, from maðkr, a diminutive of a base from maþa- (Old English maþa), from Indo-European *math-, moth- used in reference to insects and vermin. Cognate with Danish madike, Swedish mask, archaic English maddock (modern maggot).</code> |
|
305 |
+
* Loss: [<code>CachedGISTEmbedLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#cachedgistembedloss) with these parameters:
|
306 |
+
```json
|
307 |
+
{'guide': SentenceTransformer(
|
308 |
+
(0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel
|
309 |
+
(1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
|
310 |
+
(2): Normalize()
|
311 |
+
), 'temperature': 0.025}
|
312 |
+
```
|
313 |
+
|
314 |
+
### Evaluation Dataset
|
315 |
+
|
316 |
+
#### bobox/enhanced_nli-50_k
|
317 |
+
|
318 |
+
* Dataset: bobox/enhanced_nli-50_k
|
319 |
+
* Size: 3,052 evaluation samples
|
320 |
+
* Columns: <code>sentence1</code> and <code>sentence2</code>
|
321 |
+
* Approximate statistics based on the first 1000 samples:
|
322 |
+
| | sentence1 | sentence2 |
|
323 |
+
|:--------|:-----------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------|
|
324 |
+
| type | string | string |
|
325 |
+
| details | <ul><li>min: 4 tokens</li><li>mean: 33.13 tokens</li><li>max: 313 tokens</li></ul> | <ul><li>min: 2 tokens</li><li>mean: 58.53 tokens</li><li>max: 422 tokens</li></ul> |
|
326 |
+
* Samples:
|
327 |
+
| sentence1 | sentence2 |
|
328 |
+
|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
329 |
+
| <code>Dwight Beare, 27, from Melbourne, Australia, was killed in the crash near to the 16th milestone on the TT course on the Isle of Man.<br>Mr Beare suffered fatal spine and back injuries on the west of the island on 4 June at about 14:10 BST.<br>A verdict of misadventure was recorded by Coroner John Needham at Douglas Court House earlier.<br>Mr Needham said: "I am not in a position to give an exact reason for the loss of control but Mr Beare would have died almost instantaneously".<br>The inquest heard from a witness who said the carpenter and his passenger, Benjamin Binns, were thrown into the air after their vehicle hit the road side.<br>Mr Binns, who was airlifted to hospital with a broken leg, said before the race both he and Mr Beare had been "excited and confident".<br>"I will miss my friend dearly and his memory will live on," he said.<br>No defects were found on the vehicle.<br>Mr Beare moved to Onchan on the Isle of Man to pursue his road racing passion.<br>He made his TT debut in 2014 finishing 12th in the second race of the week, with his father Noel as his passenger.<br>He returned in 2015 when he came 17th.</code> | <code>A TT racer died after being thrown from his sidecar when he hit the side of a road, an inquest has heard.</code> |
|
330 |
+
| <code>are yeezy adidas or nike?</code> | <code>Adidas Yeezy is a fashion collaboration between the German sportswear brand Adidas and American designer Kanye West. The collaboration has become notable for its high-end sneakers, and the Yeezy Boost sneaker line has been considered one of the most influential sneaker brands in the world.</code> |
|
331 |
+
| <code>what remains are changed into natural gas by heat and pressure change?</code> | <code>heat and pressure change the remains of prehistoric living things into natural gas. Dinosaurs and Other Prehistoric Creatures Dinosaurs are just one group of prehistoric animals. <br> heat and pressure change the remains of dinosaurs into natural gas</code> |
|
332 |
+
* Loss: [<code>CachedGISTEmbedLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#cachedgistembedloss) with these parameters:
|
333 |
+
```json
|
334 |
+
{'guide': SentenceTransformer(
|
335 |
+
(0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel
|
336 |
+
(1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
|
337 |
+
(2): Normalize()
|
338 |
+
), 'temperature': 0.025}
|
339 |
+
```
|
340 |
+
|
341 |
+
### Training Hyperparameters
|
342 |
+
#### Non-Default Hyperparameters
|
343 |
+
|
344 |
+
- `eval_strategy`: steps
|
345 |
+
- `per_device_train_batch_size`: 320
|
346 |
+
- `per_device_eval_batch_size`: 128
|
347 |
+
- `learning_rate`: 2e-05
|
348 |
+
- `weight_decay`: 0.0001
|
349 |
+
- `num_train_epochs`: 2
|
350 |
+
- `lr_scheduler_type`: cosine_with_restarts
|
351 |
+
- `lr_scheduler_kwargs`: {'num_cycles': 3}
|
352 |
+
- `warmup_ratio`: 0.25
|
353 |
+
- `save_safetensors`: False
|
354 |
+
- `fp16`: True
|
355 |
+
- `push_to_hub`: True
|
356 |
+
- `hub_model_id`: bobox/DeBERTa-small-ST-v1-test-UnifiedDatasets-Ft-checkpoints-tmp
|
357 |
+
- `hub_strategy`: all_checkpoints
|
358 |
+
- `batch_sampler`: no_duplicates
|
359 |
+
|
360 |
+
#### All Hyperparameters
|
361 |
+
<details><summary>Click to expand</summary>
|
362 |
+
|
363 |
+
- `overwrite_output_dir`: False
|
364 |
+
- `do_predict`: False
|
365 |
+
- `eval_strategy`: steps
|
366 |
+
- `prediction_loss_only`: True
|
367 |
+
- `per_device_train_batch_size`: 320
|
368 |
+
- `per_device_eval_batch_size`: 128
|
369 |
+
- `per_gpu_train_batch_size`: None
|
370 |
+
- `per_gpu_eval_batch_size`: None
|
371 |
+
- `gradient_accumulation_steps`: 1
|
372 |
+
- `eval_accumulation_steps`: None
|
373 |
+
- `torch_empty_cache_steps`: None
|
374 |
+
- `learning_rate`: 2e-05
|
375 |
+
- `weight_decay`: 0.0001
|
376 |
+
- `adam_beta1`: 0.9
|
377 |
+
- `adam_beta2`: 0.999
|
378 |
+
- `adam_epsilon`: 1e-08
|
379 |
+
- `max_grad_norm`: 1.0
|
380 |
+
- `num_train_epochs`: 2
|
381 |
+
- `max_steps`: -1
|
382 |
+
- `lr_scheduler_type`: cosine_with_restarts
|
383 |
+
- `lr_scheduler_kwargs`: {'num_cycles': 3}
|
384 |
+
- `warmup_ratio`: 0.25
|
385 |
+
- `warmup_steps`: 0
|
386 |
+
- `log_level`: passive
|
387 |
+
- `log_level_replica`: warning
|
388 |
+
- `log_on_each_node`: True
|
389 |
+
- `logging_nan_inf_filter`: True
|
390 |
+
- `save_safetensors`: False
|
391 |
+
- `save_on_each_node`: False
|
392 |
+
- `save_only_model`: False
|
393 |
+
- `restore_callback_states_from_checkpoint`: False
|
394 |
+
- `no_cuda`: False
|
395 |
+
- `use_cpu`: False
|
396 |
+
- `use_mps_device`: False
|
397 |
+
- `seed`: 42
|
398 |
+
- `data_seed`: None
|
399 |
+
- `jit_mode_eval`: False
|
400 |
+
- `use_ipex`: False
|
401 |
+
- `bf16`: False
|
402 |
+
- `fp16`: True
|
403 |
+
- `fp16_opt_level`: O1
|
404 |
+
- `half_precision_backend`: auto
|
405 |
+
- `bf16_full_eval`: False
|
406 |
+
- `fp16_full_eval`: False
|
407 |
+
- `tf32`: None
|
408 |
+
- `local_rank`: 0
|
409 |
+
- `ddp_backend`: None
|
410 |
+
- `tpu_num_cores`: None
|
411 |
+
- `tpu_metrics_debug`: False
|
412 |
+
- `debug`: []
|
413 |
+
- `dataloader_drop_last`: False
|
414 |
+
- `dataloader_num_workers`: 0
|
415 |
+
- `dataloader_prefetch_factor`: None
|
416 |
+
- `past_index`: -1
|
417 |
+
- `disable_tqdm`: False
|
418 |
+
- `remove_unused_columns`: True
|
419 |
+
- `label_names`: None
|
420 |
+
- `load_best_model_at_end`: False
|
421 |
+
- `ignore_data_skip`: False
|
422 |
+
- `fsdp`: []
|
423 |
+
- `fsdp_min_num_params`: 0
|
424 |
+
- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
|
425 |
+
- `fsdp_transformer_layer_cls_to_wrap`: None
|
426 |
+
- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
|
427 |
+
- `deepspeed`: None
|
428 |
+
- `label_smoothing_factor`: 0.0
|
429 |
+
- `optim`: adamw_torch
|
430 |
+
- `optim_args`: None
|
431 |
+
- `adafactor`: False
|
432 |
+
- `group_by_length`: False
|
433 |
+
- `length_column_name`: length
|
434 |
+
- `ddp_find_unused_parameters`: None
|
435 |
+
- `ddp_bucket_cap_mb`: None
|
436 |
+
- `ddp_broadcast_buffers`: False
|
437 |
+
- `dataloader_pin_memory`: True
|
438 |
+
- `dataloader_persistent_workers`: False
|
439 |
+
- `skip_memory_metrics`: True
|
440 |
+
- `use_legacy_prediction_loop`: False
|
441 |
+
- `push_to_hub`: True
|
442 |
+
- `resume_from_checkpoint`: None
|
443 |
+
- `hub_model_id`: bobox/DeBERTa-small-ST-v1-test-UnifiedDatasets-Ft-checkpoints-tmp
|
444 |
+
- `hub_strategy`: all_checkpoints
|
445 |
+
- `hub_private_repo`: False
|
446 |
+
- `hub_always_push`: False
|
447 |
+
- `gradient_checkpointing`: False
|
448 |
+
- `gradient_checkpointing_kwargs`: None
|
449 |
+
- `include_inputs_for_metrics`: False
|
450 |
+
- `eval_do_concat_batches`: True
|
451 |
+
- `fp16_backend`: auto
|
452 |
+
- `push_to_hub_model_id`: None
|
453 |
+
- `push_to_hub_organization`: None
|
454 |
+
- `mp_parameters`:
|
455 |
+
- `auto_find_batch_size`: False
|
456 |
+
- `full_determinism`: False
|
457 |
+
- `torchdynamo`: None
|
458 |
+
- `ray_scope`: last
|
459 |
+
- `ddp_timeout`: 1800
|
460 |
+
- `torch_compile`: False
|
461 |
+
- `torch_compile_backend`: None
|
462 |
+
- `torch_compile_mode`: None
|
463 |
+
- `dispatch_batches`: None
|
464 |
+
- `split_batches`: None
|
465 |
+
- `include_tokens_per_second`: False
|
466 |
+
- `include_num_input_tokens_seen`: False
|
467 |
+
- `neftune_noise_alpha`: None
|
468 |
+
- `optim_target_modules`: None
|
469 |
+
- `batch_eval_metrics`: False
|
470 |
+
- `eval_on_start`: False
|
471 |
+
- `eval_use_gather_object`: False
|
472 |
+
- `batch_sampler`: no_duplicates
|
473 |
+
- `multi_dataset_batch_sampler`: proportional
|
474 |
+
|
475 |
+
</details>
|
476 |
+
|
477 |
+
### Training Logs
|
478 |
+
<details><summary>Click to expand</summary>
|
479 |
+
|
480 |
+
| Epoch | Step | Training Loss | loss | sts-test_spearman_cosine |
|
481 |
+
|:------:|:----:|:-------------:|:------:|:------------------------:|
|
482 |
+
| 0.0020 | 1 | 0.107 | - | - |
|
483 |
+
| 0.0039 | 2 | 0.1529 | - | - |
|
484 |
+
| 0.0059 | 3 | 0.1874 | - | - |
|
485 |
+
| 0.0078 | 4 | 0.1682 | - | - |
|
486 |
+
| 0.0098 | 5 | 0.1438 | 0.1470 | 0.9078 |
|
487 |
+
| 0.0117 | 6 | 0.2961 | - | - |
|
488 |
+
| 0.0137 | 7 | 0.3019 | - | - |
|
489 |
+
| 0.0157 | 8 | 0.1184 | - | - |
|
490 |
+
| 0.0176 | 9 | 0.3176 | - | - |
|
491 |
+
| 0.0196 | 10 | 0.2234 | 0.1468 | 0.9078 |
|
492 |
+
| 0.0215 | 11 | 0.1881 | - | - |
|
493 |
+
| 0.0235 | 12 | 0.1593 | - | - |
|
494 |
+
| 0.0254 | 13 | 0.1833 | - | - |
|
495 |
+
| 0.0274 | 14 | 0.1352 | - | - |
|
496 |
+
| 0.0294 | 15 | 0.3143 | 0.1462 | 0.9077 |
|
497 |
+
| 0.0313 | 16 | 0.1583 | - | - |
|
498 |
+
| 0.0333 | 17 | 0.2015 | - | - |
|
499 |
+
| 0.0352 | 18 | 0.1476 | - | - |
|
500 |
+
| 0.0372 | 19 | 0.1676 | - | - |
|
501 |
+
| 0.0391 | 20 | 0.1525 | 0.1454 | 0.9079 |
|
502 |
+
| 0.0411 | 21 | 0.1717 | - | - |
|
503 |
+
| 0.0431 | 22 | 0.198 | - | - |
|
504 |
+
| 0.0450 | 23 | 0.3062 | - | - |
|
505 |
+
| 0.0470 | 24 | 0.1241 | - | - |
|
506 |
+
| 0.0489 | 25 | 0.1087 | 0.1446 | 0.9082 |
|
507 |
+
| 0.0509 | 26 | 0.1767 | - | - |
|
508 |
+
| 0.0528 | 27 | 0.1951 | - | - |
|
509 |
+
| 0.0548 | 28 | 0.1621 | - | - |
|
510 |
+
| 0.0568 | 29 | 0.221 | - | - |
|
511 |
+
| 0.0587 | 30 | 0.2241 | 0.1435 | 0.9083 |
|
512 |
+
| 0.0607 | 31 | 0.2093 | - | - |
|
513 |
+
| 0.0626 | 32 | 0.1615 | - | - |
|
514 |
+
| 0.0646 | 33 | 0.1615 | - | - |
|
515 |
+
| 0.0665 | 34 | 0.1772 | - | - |
|
516 |
+
| 0.0685 | 35 | 0.2324 | 0.1423 | 0.9084 |
|
517 |
+
| 0.0705 | 36 | 0.2611 | - | - |
|
518 |
+
| 0.0724 | 37 | 0.214 | - | - |
|
519 |
+
| 0.0744 | 38 | 0.1985 | - | - |
|
520 |
+
| 0.0763 | 39 | 0.1855 | - | - |
|
521 |
+
| 0.0783 | 40 | 0.1234 | 0.1410 | 0.9085 |
|
522 |
+
| 0.0802 | 41 | 0.1492 | - | - |
|
523 |
+
| 0.0822 | 42 | 0.2022 | - | - |
|
524 |
+
| 0.0841 | 43 | 0.2146 | - | - |
|
525 |
+
| 0.0861 | 44 | 0.1688 | - | - |
|
526 |
+
| 0.0881 | 45 | 0.175 | 0.1396 | 0.9087 |
|
527 |
+
| 0.0900 | 46 | 0.2123 | - | - |
|
528 |
+
| 0.0920 | 47 | 0.1118 | - | - |
|
529 |
+
| 0.0939 | 48 | 0.3009 | - | - |
|
530 |
+
| 0.0959 | 49 | 0.1071 | - | - |
|
531 |
+
| 0.0978 | 50 | 0.2608 | 0.1382 | 0.9085 |
|
532 |
+
| 0.0998 | 51 | 0.1368 | - | - |
|
533 |
+
| 0.1018 | 52 | 0.2307 | - | - |
|
534 |
+
| 0.1037 | 53 | 0.1366 | - | - |
|
535 |
+
| 0.1057 | 54 | 0.1857 | - | - |
|
536 |
+
| 0.1076 | 55 | 0.2155 | 0.1367 | 0.9085 |
|
537 |
+
| 0.1096 | 56 | 0.2022 | - | - |
|
538 |
+
| 0.1115 | 57 | 0.2076 | - | - |
|
539 |
+
| 0.1135 | 58 | 0.4133 | - | - |
|
540 |
+
| 0.1155 | 59 | 0.1823 | - | - |
|
541 |
+
| 0.1174 | 60 | 0.1136 | 0.1353 | 0.9088 |
|
542 |
+
| 0.1194 | 61 | 0.1687 | - | - |
|
543 |
+
| 0.1213 | 62 | 0.1591 | - | - |
|
544 |
+
| 0.1233 | 63 | 0.1653 | - | - |
|
545 |
+
| 0.1252 | 64 | 0.1799 | - | - |
|
546 |
+
| 0.1272 | 65 | 0.1578 | 0.1337 | 0.9090 |
|
547 |
+
| 0.1292 | 66 | 0.1844 | - | - |
|
548 |
+
| 0.1311 | 67 | 0.1489 | - | - |
|
549 |
+
| 0.1331 | 68 | 0.1845 | - | - |
|
550 |
+
| 0.1350 | 69 | 0.1364 | - | - |
|
551 |
+
| 0.1370 | 70 | 0.1584 | 0.1321 | 0.9086 |
|
552 |
+
| 0.1389 | 71 | 0.2279 | - | - |
|
553 |
+
| 0.1409 | 72 | 0.2028 | - | - |
|
554 |
+
| 0.1429 | 73 | 0.2291 | - | - |
|
555 |
+
| 0.1448 | 74 | 0.2419 | - | - |
|
556 |
+
| 0.1468 | 75 | 0.1329 | 0.1306 | 0.9083 |
|
557 |
+
| 0.1487 | 76 | 0.204 | - | - |
|
558 |
+
| 0.1507 | 77 | 0.2239 | - | - |
|
559 |
+
| 0.1526 | 78 | 0.2181 | - | - |
|
560 |
+
| 0.1546 | 79 | 0.1285 | - | - |
|
561 |
+
| 0.1566 | 80 | 0.1067 | 0.1292 | 0.9079 |
|
562 |
+
| 0.1585 | 81 | 0.1189 | - | - |
|
563 |
+
| 0.1605 | 82 | 0.236 | - | - |
|
564 |
+
| 0.1624 | 83 | 0.1584 | - | - |
|
565 |
+
| 0.1644 | 84 | 0.1925 | - | - |
|
566 |
+
| 0.1663 | 85 | 0.129 | 0.1278 | 0.9079 |
|
567 |
+
| 0.1683 | 86 | 0.1376 | - | - |
|
568 |
+
| 0.1703 | 87 | 0.1691 | - | - |
|
569 |
+
| 0.1722 | 88 | 0.1045 | - | - |
|
570 |
+
| 0.1742 | 89 | 0.165 | - | - |
|
571 |
+
| 0.1761 | 90 | 0.2926 | 0.1267 | 0.9076 |
|
572 |
+
| 0.1781 | 91 | 0.1048 | - | - |
|
573 |
+
| 0.1800 | 92 | 0.1596 | - | - |
|
574 |
+
| 0.1820 | 93 | 0.2474 | - | - |
|
575 |
+
| 0.1840 | 94 | 0.1652 | - | - |
|
576 |
+
| 0.1859 | 95 | 0.2483 | 0.1253 | 0.9076 |
|
577 |
+
| 0.1879 | 96 | 0.1623 | - | - |
|
578 |
+
| 0.1898 | 97 | 0.1955 | - | - |
|
579 |
+
| 0.1918 | 98 | 0.2023 | - | - |
|
580 |
+
| 0.1937 | 99 | 0.1886 | - | - |
|
581 |
+
| 0.1957 | 100 | 0.1284 | 0.1229 | 0.9079 |
|
582 |
+
| 0.1977 | 101 | 0.2005 | - | - |
|
583 |
+
| 0.1996 | 102 | 0.2301 | - | - |
|
584 |
+
| 0.2016 | 103 | 0.2249 | - | - |
|
585 |
+
| 0.2035 | 104 | 0.214 | - | - |
|
586 |
+
| 0.2055 | 105 | 0.1429 | 0.1208 | 0.9077 |
|
587 |
+
| 0.2074 | 106 | 0.17 | - | - |
|
588 |
+
| 0.2094 | 107 | 0.1955 | - | - |
|
589 |
+
| 0.2114 | 108 | 0.1964 | - | - |
|
590 |
+
| 0.2133 | 109 | 0.1246 | - | - |
|
591 |
+
| 0.2153 | 110 | 0.1295 | 0.1190 | 0.9072 |
|
592 |
+
| 0.2172 | 111 | 0.2203 | - | - |
|
593 |
+
| 0.2192 | 112 | 0.2195 | - | - |
|
594 |
+
| 0.2211 | 113 | 0.1823 | - | - |
|
595 |
+
| 0.2231 | 114 | 0.174 | - | - |
|
596 |
+
| 0.2250 | 115 | 0.207 | 0.1175 | 0.9069 |
|
597 |
+
| 0.2270 | 116 | 0.2156 | - | - |
|
598 |
+
| 0.2290 | 117 | 0.2202 | - | - |
|
599 |
+
| 0.2309 | 118 | 0.2718 | - | - |
|
600 |
+
| 0.2329 | 119 | 0.1387 | - | - |
|
601 |
+
| 0.2348 | 120 | 0.1506 | 0.1168 | 0.9069 |
|
602 |
+
| 0.2368 | 121 | 0.1185 | - | - |
|
603 |
+
| 0.2387 | 122 | 0.1681 | - | - |
|
604 |
+
| 0.2407 | 123 | 0.2321 | - | - |
|
605 |
+
| 0.2427 | 124 | 0.1457 | - | - |
|
606 |
+
| 0.2446 | 125 | 0.2027 | 0.1165 | 0.9071 |
|
607 |
+
| 0.2466 | 126 | 0.1821 | - | - |
|
608 |
+
| 0.2485 | 127 | 0.1258 | - | - |
|
609 |
+
| 0.2505 | 128 | 0.184 | - | - |
|
610 |
+
| 0.2524 | 129 | 0.2015 | - | - |
|
611 |
+
| 0.2544 | 130 | 0.1323 | 0.1154 | 0.9074 |
|
612 |
+
| 0.2564 | 131 | 0.1939 | - | - |
|
613 |
+
| 0.2583 | 132 | 0.1428 | - | - |
|
614 |
+
| 0.2603 | 133 | 0.1063 | - | - |
|
615 |
+
| 0.2622 | 134 | 0.1602 | - | - |
|
616 |
+
| 0.2642 | 135 | 0.1814 | 0.1139 | 0.9067 |
|
617 |
+
| 0.2661 | 136 | 0.1518 | - | - |
|
618 |
+
| 0.2681 | 137 | 0.1379 | - | - |
|
619 |
+
| 0.2701 | 138 | 0.1708 | - | - |
|
620 |
+
| 0.2720 | 139 | 0.2046 | - | - |
|
621 |
+
| 0.2740 | 140 | 0.1259 | 0.1124 | 0.9063 |
|
622 |
+
| 0.2759 | 141 | 0.1181 | - | - |
|
623 |
+
| 0.2779 | 142 | 0.2144 | - | - |
|
624 |
+
| 0.2798 | 143 | 0.1822 | - | - |
|
625 |
+
| 0.2818 | 144 | 0.1667 | - | - |
|
626 |
+
| 0.2838 | 145 | 0.0779 | 0.1118 | 0.9060 |
|
627 |
+
| 0.2857 | 146 | 0.147 | - | - |
|
628 |
+
| 0.2877 | 147 | 0.1913 | - | - |
|
629 |
+
| 0.2896 | 148 | 0.1357 | - | - |
|
630 |
+
| 0.2916 | 149 | 0.1128 | - | - |
|
631 |
+
| 0.2935 | 150 | 0.0996 | 0.1113 | 0.9054 |
|
632 |
+
| 0.2955 | 151 | 0.1956 | - | - |
|
633 |
+
| 0.2975 | 152 | 0.0942 | - | - |
|
634 |
+
| 0.2994 | 153 | 0.1406 | - | - |
|
635 |
+
| 0.3014 | 154 | 0.2868 | - | - |
|
636 |
+
| 0.3033 | 155 | 0.1102 | 0.1114 | 0.9048 |
|
637 |
+
| 0.3053 | 156 | 0.1659 | - | - |
|
638 |
+
| 0.3072 | 157 | 0.1645 | - | - |
|
639 |
+
| 0.3092 | 158 | 0.151 | - | - |
|
640 |
+
| 0.3112 | 159 | 0.158 | - | - |
|
641 |
+
| 0.3131 | 160 | 0.2323 | 0.1113 | 0.9048 |
|
642 |
+
| 0.3151 | 161 | 0.1157 | - | - |
|
643 |
+
| 0.3170 | 162 | 0.1507 | - | - |
|
644 |
+
| 0.3190 | 163 | 0.1879 | - | - |
|
645 |
+
| 0.3209 | 164 | 0.143 | - | - |
|
646 |
+
| 0.3229 | 165 | 0.2227 | 0.1116 | 0.9050 |
|
647 |
+
| 0.3249 | 166 | 0.1624 | - | - |
|
648 |
+
| 0.3268 | 167 | 0.1345 | - | - |
|
649 |
+
| 0.3288 | 168 | 0.1765 | - | - |
|
650 |
+
| 0.3307 | 169 | 0.1368 | - | - |
|
651 |
+
| 0.3327 | 170 | 0.0962 | 0.1113 | 0.9056 |
|
652 |
+
| 0.3346 | 171 | 0.1783 | - | - |
|
653 |
+
| 0.3366 | 172 | 0.2019 | - | - |
|
654 |
+
| 0.3386 | 173 | 0.1761 | - | - |
|
655 |
+
| 0.3405 | 174 | 0.1855 | - | - |
|
656 |
+
| 0.3425 | 175 | 0.1922 | 0.1106 | 0.9054 |
|
657 |
+
| 0.3444 | 176 | 0.1538 | - | - |
|
658 |
+
| 0.3464 | 177 | 0.1049 | - | - |
|
659 |
+
| 0.3483 | 178 | 0.1619 | - | - |
|
660 |
+
| 0.3503 | 179 | 0.0731 | - | - |
|
661 |
+
| 0.3523 | 180 | 0.1205 | 0.1097 | 0.9059 |
|
662 |
+
| 0.3542 | 181 | 0.169 | - | - |
|
663 |
+
| 0.3562 | 182 | 0.1688 | - | - |
|
664 |
+
| 0.3581 | 183 | 0.1274 | - | - |
|
665 |
+
| 0.3601 | 184 | 0.1477 | - | - |
|
666 |
+
| 0.3620 | 185 | 0.1418 | 0.1094 | 0.9055 |
|
667 |
+
| 0.3640 | 186 | 0.2477 | - | - |
|
668 |
+
| 0.3659 | 187 | 0.1713 | - | - |
|
669 |
+
| 0.3679 | 188 | 0.1703 | - | - |
|
670 |
+
| 0.3699 | 189 | 0.1176 | - | - |
|
671 |
+
| 0.3718 | 190 | 0.1811 | 0.1084 | 0.9048 |
|
672 |
+
| 0.3738 | 191 | 0.162 | - | - |
|
673 |
+
| 0.3757 | 192 | 0.1141 | - | - |
|
674 |
+
| 0.3777 | 193 | 0.154 | - | - |
|
675 |
+
| 0.3796 | 194 | 0.2461 | - | - |
|
676 |
+
| 0.3816 | 195 | 0.1573 | 0.1076 | 0.9046 |
|
677 |
+
| 0.3836 | 196 | 0.1197 | - | - |
|
678 |
+
| 0.3855 | 197 | 0.1395 | - | - |
|
679 |
+
| 0.3875 | 198 | 0.0847 | - | - |
|
680 |
+
| 0.3894 | 199 | 0.1848 | - | - |
|
681 |
+
| 0.3914 | 200 | 0.1377 | 0.1072 | 0.9047 |
|
682 |
+
| 0.3933 | 201 | 0.1109 | - | - |
|
683 |
+
| 0.3953 | 202 | 0.1051 | - | - |
|
684 |
+
| 0.3973 | 203 | 0.0975 | - | - |
|
685 |
+
| 0.3992 | 204 | 0.127 | - | - |
|
686 |
+
| 0.4012 | 205 | 0.1297 | 0.1069 | 0.9049 |
|
687 |
+
| 0.4031 | 206 | 0.0783 | - | - |
|
688 |
+
|
689 |
+
</details>
|
690 |
+
|
691 |
+
### Framework Versions
|
692 |
+
- Python: 3.10.14
|
693 |
+
- Sentence Transformers: 3.0.1
|
694 |
+
- Transformers: 4.44.0
|
695 |
+
- PyTorch: 2.4.0
|
696 |
+
- Accelerate: 0.33.0
|
697 |
+
- Datasets: 2.21.0
|
698 |
+
- Tokenizers: 0.19.1
|
699 |
+
|
700 |
+
## Citation
|
701 |
+
|
702 |
+
### BibTeX
|
703 |
+
|
704 |
+
#### Sentence Transformers
|
705 |
+
```bibtex
|
706 |
+
@inproceedings{reimers-2019-sentence-bert,
|
707 |
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
708 |
+
author = "Reimers, Nils and Gurevych, Iryna",
|
709 |
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
710 |
+
month = "11",
|
711 |
+
year = "2019",
|
712 |
+
publisher = "Association for Computational Linguistics",
|
713 |
+
url = "https://arxiv.org/abs/1908.10084",
|
714 |
+
}
|
715 |
+
```
|
716 |
+
|
717 |
+
<!--
|
718 |
+
## Glossary
|
719 |
+
|
720 |
+
*Clearly define terms in order to be accessible across audiences.*
|
721 |
+
-->
|
722 |
+
|
723 |
+
<!--
|
724 |
+
## Model Card Authors
|
725 |
+
|
726 |
+
*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
|
727 |
+
-->
|
728 |
+
|
729 |
+
<!--
|
730 |
+
## Model Card Contact
|
731 |
+
|
732 |
+
*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
|
733 |
+
-->
|
checkpoint-206/added_tokens.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"[MASK]": 128000
|
3 |
+
}
|
checkpoint-206/config.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "bobox/DeBERTa-small-ST-v1-test-step3",
|
3 |
+
"architectures": [
|
4 |
+
"DebertaV2Model"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"hidden_act": "gelu",
|
8 |
+
"hidden_dropout_prob": 0.1,
|
9 |
+
"hidden_size": 768,
|
10 |
+
"initializer_range": 0.02,
|
11 |
+
"intermediate_size": 3072,
|
12 |
+
"layer_norm_eps": 1e-07,
|
13 |
+
"max_position_embeddings": 512,
|
14 |
+
"max_relative_positions": -1,
|
15 |
+
"model_type": "deberta-v2",
|
16 |
+
"norm_rel_ebd": "layer_norm",
|
17 |
+
"num_attention_heads": 12,
|
18 |
+
"num_hidden_layers": 6,
|
19 |
+
"pad_token_id": 0,
|
20 |
+
"pooler_dropout": 0,
|
21 |
+
"pooler_hidden_act": "gelu",
|
22 |
+
"pooler_hidden_size": 768,
|
23 |
+
"pos_att_type": [
|
24 |
+
"p2c",
|
25 |
+
"c2p"
|
26 |
+
],
|
27 |
+
"position_biased_input": false,
|
28 |
+
"position_buckets": 256,
|
29 |
+
"relative_attention": true,
|
30 |
+
"share_att_key": true,
|
31 |
+
"torch_dtype": "float32",
|
32 |
+
"transformers_version": "4.44.0",
|
33 |
+
"type_vocab_size": 0,
|
34 |
+
"vocab_size": 128100
|
35 |
+
}
|
checkpoint-206/config_sentence_transformers.json
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"__version__": {
|
3 |
+
"sentence_transformers": "3.0.1",
|
4 |
+
"transformers": "4.44.0",
|
5 |
+
"pytorch": "2.4.0"
|
6 |
+
},
|
7 |
+
"prompts": {},
|
8 |
+
"default_prompt_name": null,
|
9 |
+
"similarity_fn_name": null
|
10 |
+
}
|
checkpoint-206/modules.json
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"idx": 0,
|
4 |
+
"name": "0",
|
5 |
+
"path": "",
|
6 |
+
"type": "sentence_transformers.models.Transformer"
|
7 |
+
},
|
8 |
+
{
|
9 |
+
"idx": 1,
|
10 |
+
"name": "1",
|
11 |
+
"path": "1_Pooling",
|
12 |
+
"type": "sentence_transformers.models.Pooling"
|
13 |
+
}
|
14 |
+
]
|
checkpoint-206/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:22fd561cf8189d98056838fb978c385ff94a9d18c0fcf7fc9962b2a1127115e6
|
3 |
+
size 1130520122
|
checkpoint-206/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:835e1a950aafde50a5c498724053dfe69aca89911f95905be112175bc51a75f4
|
3 |
+
size 565251810
|
checkpoint-206/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6c039da187e466f9333bdfd85aea3b08a2b6daa5e4c64a1538e2195ef50dcb72
|
3 |
+
size 14244
|
checkpoint-206/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:019c75093bbfb69f10e80aa350a238b9b054cf41c2d7260f32b195feb944ab96
|
3 |
+
size 1064
|
checkpoint-206/sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"max_seq_length": 512,
|
3 |
+
"do_lower_case": false
|
4 |
+
}
|
checkpoint-206/special_tokens_map.json
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": {
|
3 |
+
"content": "[CLS]",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": false,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"cls_token": {
|
10 |
+
"content": "[CLS]",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": false,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"eos_token": {
|
17 |
+
"content": "[SEP]",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": false,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
},
|
23 |
+
"mask_token": {
|
24 |
+
"content": "[MASK]",
|
25 |
+
"lstrip": false,
|
26 |
+
"normalized": false,
|
27 |
+
"rstrip": false,
|
28 |
+
"single_word": false
|
29 |
+
},
|
30 |
+
"pad_token": {
|
31 |
+
"content": "[PAD]",
|
32 |
+
"lstrip": false,
|
33 |
+
"normalized": false,
|
34 |
+
"rstrip": false,
|
35 |
+
"single_word": false
|
36 |
+
},
|
37 |
+
"sep_token": {
|
38 |
+
"content": "[SEP]",
|
39 |
+
"lstrip": false,
|
40 |
+
"normalized": false,
|
41 |
+
"rstrip": false,
|
42 |
+
"single_word": false
|
43 |
+
},
|
44 |
+
"unk_token": {
|
45 |
+
"content": "[UNK]",
|
46 |
+
"lstrip": false,
|
47 |
+
"normalized": true,
|
48 |
+
"rstrip": false,
|
49 |
+
"single_word": false
|
50 |
+
}
|
51 |
+
}
|
checkpoint-206/spm.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
|
3 |
+
size 2464616
|
checkpoint-206/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
checkpoint-206/tokenizer_config.json
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"0": {
|
4 |
+
"content": "[PAD]",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"1": {
|
12 |
+
"content": "[CLS]",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"2": {
|
20 |
+
"content": "[SEP]",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
},
|
27 |
+
"3": {
|
28 |
+
"content": "[UNK]",
|
29 |
+
"lstrip": false,
|
30 |
+
"normalized": true,
|
31 |
+
"rstrip": false,
|
32 |
+
"single_word": false,
|
33 |
+
"special": true
|
34 |
+
},
|
35 |
+
"128000": {
|
36 |
+
"content": "[MASK]",
|
37 |
+
"lstrip": false,
|
38 |
+
"normalized": false,
|
39 |
+
"rstrip": false,
|
40 |
+
"single_word": false,
|
41 |
+
"special": true
|
42 |
+
}
|
43 |
+
},
|
44 |
+
"bos_token": "[CLS]",
|
45 |
+
"clean_up_tokenization_spaces": true,
|
46 |
+
"cls_token": "[CLS]",
|
47 |
+
"do_lower_case": false,
|
48 |
+
"eos_token": "[SEP]",
|
49 |
+
"mask_token": "[MASK]",
|
50 |
+
"max_length": 512,
|
51 |
+
"model_max_length": 512,
|
52 |
+
"pad_to_multiple_of": null,
|
53 |
+
"pad_token": "[PAD]",
|
54 |
+
"pad_token_type_id": 0,
|
55 |
+
"padding_side": "right",
|
56 |
+
"sep_token": "[SEP]",
|
57 |
+
"sp_model_kwargs": {},
|
58 |
+
"split_by_punct": false,
|
59 |
+
"stride": 0,
|
60 |
+
"tokenizer_class": "DebertaV2Tokenizer",
|
61 |
+
"truncation_side": "right",
|
62 |
+
"truncation_strategy": "longest_first",
|
63 |
+
"unk_token": "[UNK]",
|
64 |
+
"vocab_type": "spm"
|
65 |
+
}
|
checkpoint-206/trainer_state.json
ADDED
@@ -0,0 +1,2213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": null,
|
3 |
+
"best_model_checkpoint": null,
|
4 |
+
"epoch": 0.40313111545988256,
|
5 |
+
"eval_steps": 5,
|
6 |
+
"global_step": 206,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.0019569471624266144,
|
13 |
+
"grad_norm": 3.932948112487793,
|
14 |
+
"learning_rate": 7.8125e-08,
|
15 |
+
"loss": 0.107,
|
16 |
+
"step": 1
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 0.003913894324853229,
|
20 |
+
"grad_norm": 4.482716083526611,
|
21 |
+
"learning_rate": 1.5625e-07,
|
22 |
+
"loss": 0.1529,
|
23 |
+
"step": 2
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"epoch": 0.005870841487279843,
|
27 |
+
"grad_norm": 4.672689437866211,
|
28 |
+
"learning_rate": 2.3437500000000003e-07,
|
29 |
+
"loss": 0.1874,
|
30 |
+
"step": 3
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"epoch": 0.007827788649706457,
|
34 |
+
"grad_norm": 4.226949214935303,
|
35 |
+
"learning_rate": 3.125e-07,
|
36 |
+
"loss": 0.1682,
|
37 |
+
"step": 4
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"epoch": 0.009784735812133072,
|
41 |
+
"grad_norm": 4.327479362487793,
|
42 |
+
"learning_rate": 3.90625e-07,
|
43 |
+
"loss": 0.1438,
|
44 |
+
"step": 5
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"epoch": 0.009784735812133072,
|
48 |
+
"eval_loss": 0.1470455378293991,
|
49 |
+
"eval_runtime": 107.3614,
|
50 |
+
"eval_samples_per_second": 28.427,
|
51 |
+
"eval_steps_per_second": 0.224,
|
52 |
+
"eval_sts-test_pearson_cosine": 0.8861388036460539,
|
53 |
+
"eval_sts-test_pearson_dot": 0.8769528313548112,
|
54 |
+
"eval_sts-test_pearson_euclidean": 0.9079831987750276,
|
55 |
+
"eval_sts-test_pearson_manhattan": 0.9086786527495163,
|
56 |
+
"eval_sts-test_pearson_max": 0.9086786527495163,
|
57 |
+
"eval_sts-test_spearman_cosine": 0.9077902566323186,
|
58 |
+
"eval_sts-test_spearman_dot": 0.8794770733264693,
|
59 |
+
"eval_sts-test_spearman_euclidean": 0.903967335376697,
|
60 |
+
"eval_sts-test_spearman_manhattan": 0.9043498244078092,
|
61 |
+
"eval_sts-test_spearman_max": 0.9077902566323186,
|
62 |
+
"step": 5
|
63 |
+
},
|
64 |
+
{
|
65 |
+
"epoch": 0.011741682974559686,
|
66 |
+
"grad_norm": 5.27250337600708,
|
67 |
+
"learning_rate": 4.6875000000000006e-07,
|
68 |
+
"loss": 0.2961,
|
69 |
+
"step": 6
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"epoch": 0.0136986301369863,
|
73 |
+
"grad_norm": 5.903276443481445,
|
74 |
+
"learning_rate": 5.468750000000001e-07,
|
75 |
+
"loss": 0.3019,
|
76 |
+
"step": 7
|
77 |
+
},
|
78 |
+
{
|
79 |
+
"epoch": 0.015655577299412915,
|
80 |
+
"grad_norm": 4.000335693359375,
|
81 |
+
"learning_rate": 6.25e-07,
|
82 |
+
"loss": 0.1184,
|
83 |
+
"step": 8
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"epoch": 0.01761252446183953,
|
87 |
+
"grad_norm": 5.876769065856934,
|
88 |
+
"learning_rate": 7.03125e-07,
|
89 |
+
"loss": 0.3176,
|
90 |
+
"step": 9
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"epoch": 0.019569471624266144,
|
94 |
+
"grad_norm": 4.8437933921813965,
|
95 |
+
"learning_rate": 7.8125e-07,
|
96 |
+
"loss": 0.2234,
|
97 |
+
"step": 10
|
98 |
+
},
|
99 |
+
{
|
100 |
+
"epoch": 0.019569471624266144,
|
101 |
+
"eval_loss": 0.1467687040567398,
|
102 |
+
"eval_runtime": 107.2549,
|
103 |
+
"eval_samples_per_second": 28.456,
|
104 |
+
"eval_steps_per_second": 0.224,
|
105 |
+
"eval_sts-test_pearson_cosine": 0.8861409457129842,
|
106 |
+
"eval_sts-test_pearson_dot": 0.876972814890145,
|
107 |
+
"eval_sts-test_pearson_euclidean": 0.9080268416052204,
|
108 |
+
"eval_sts-test_pearson_manhattan": 0.9087444298597203,
|
109 |
+
"eval_sts-test_pearson_max": 0.9087444298597203,
|
110 |
+
"eval_sts-test_spearman_cosine": 0.9078342918735278,
|
111 |
+
"eval_sts-test_spearman_dot": 0.8794190309404447,
|
112 |
+
"eval_sts-test_spearman_euclidean": 0.9039501508923226,
|
113 |
+
"eval_sts-test_spearman_manhattan": 0.9044244247605487,
|
114 |
+
"eval_sts-test_spearman_max": 0.9078342918735278,
|
115 |
+
"step": 10
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"epoch": 0.021526418786692758,
|
119 |
+
"grad_norm": 4.726498603820801,
|
120 |
+
"learning_rate": 8.59375e-07,
|
121 |
+
"loss": 0.1881,
|
122 |
+
"step": 11
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"epoch": 0.023483365949119372,
|
126 |
+
"grad_norm": 4.818070411682129,
|
127 |
+
"learning_rate": 9.375000000000001e-07,
|
128 |
+
"loss": 0.1593,
|
129 |
+
"step": 12
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"epoch": 0.025440313111545987,
|
133 |
+
"grad_norm": 4.98201322555542,
|
134 |
+
"learning_rate": 1.0156250000000001e-06,
|
135 |
+
"loss": 0.1833,
|
136 |
+
"step": 13
|
137 |
+
},
|
138 |
+
{
|
139 |
+
"epoch": 0.0273972602739726,
|
140 |
+
"grad_norm": 4.269514560699463,
|
141 |
+
"learning_rate": 1.0937500000000001e-06,
|
142 |
+
"loss": 0.1352,
|
143 |
+
"step": 14
|
144 |
+
},
|
145 |
+
{
|
146 |
+
"epoch": 0.029354207436399216,
|
147 |
+
"grad_norm": 6.1525492668151855,
|
148 |
+
"learning_rate": 1.1718750000000001e-06,
|
149 |
+
"loss": 0.3143,
|
150 |
+
"step": 15
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"epoch": 0.029354207436399216,
|
154 |
+
"eval_loss": 0.1462097316980362,
|
155 |
+
"eval_runtime": 107.0721,
|
156 |
+
"eval_samples_per_second": 28.504,
|
157 |
+
"eval_steps_per_second": 0.224,
|
158 |
+
"eval_sts-test_pearson_cosine": 0.8860829119688085,
|
159 |
+
"eval_sts-test_pearson_dot": 0.8768990080043222,
|
160 |
+
"eval_sts-test_pearson_euclidean": 0.9080646402781543,
|
161 |
+
"eval_sts-test_pearson_manhattan": 0.9088063929836994,
|
162 |
+
"eval_sts-test_pearson_max": 0.9088063929836994,
|
163 |
+
"eval_sts-test_spearman_cosine": 0.907713597721555,
|
164 |
+
"eval_sts-test_spearman_dot": 0.8795110842851269,
|
165 |
+
"eval_sts-test_spearman_euclidean": 0.9040110126078148,
|
166 |
+
"eval_sts-test_spearman_manhattan": 0.9045081991218733,
|
167 |
+
"eval_sts-test_spearman_max": 0.907713597721555,
|
168 |
+
"step": 15
|
169 |
+
},
|
170 |
+
{
|
171 |
+
"epoch": 0.03131115459882583,
|
172 |
+
"grad_norm": 4.751354694366455,
|
173 |
+
"learning_rate": 1.25e-06,
|
174 |
+
"loss": 0.1583,
|
175 |
+
"step": 16
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"epoch": 0.033268101761252444,
|
179 |
+
"grad_norm": 5.435980319976807,
|
180 |
+
"learning_rate": 1.328125e-06,
|
181 |
+
"loss": 0.2015,
|
182 |
+
"step": 17
|
183 |
+
},
|
184 |
+
{
|
185 |
+
"epoch": 0.03522504892367906,
|
186 |
+
"grad_norm": 4.1765851974487305,
|
187 |
+
"learning_rate": 1.40625e-06,
|
188 |
+
"loss": 0.1476,
|
189 |
+
"step": 18
|
190 |
+
},
|
191 |
+
{
|
192 |
+
"epoch": 0.03718199608610567,
|
193 |
+
"grad_norm": 4.689794540405273,
|
194 |
+
"learning_rate": 1.484375e-06,
|
195 |
+
"loss": 0.1676,
|
196 |
+
"step": 19
|
197 |
+
},
|
198 |
+
{
|
199 |
+
"epoch": 0.03913894324853229,
|
200 |
+
"grad_norm": 4.203744888305664,
|
201 |
+
"learning_rate": 1.5625e-06,
|
202 |
+
"loss": 0.1525,
|
203 |
+
"step": 20
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"epoch": 0.03913894324853229,
|
207 |
+
"eval_loss": 0.14544810354709625,
|
208 |
+
"eval_runtime": 107.1845,
|
209 |
+
"eval_samples_per_second": 28.474,
|
210 |
+
"eval_steps_per_second": 0.224,
|
211 |
+
"eval_sts-test_pearson_cosine": 0.8861436293943533,
|
212 |
+
"eval_sts-test_pearson_dot": 0.8769239163708102,
|
213 |
+
"eval_sts-test_pearson_euclidean": 0.9082269545633608,
|
214 |
+
"eval_sts-test_pearson_manhattan": 0.9089828403051001,
|
215 |
+
"eval_sts-test_pearson_max": 0.9089828403051001,
|
216 |
+
"eval_sts-test_spearman_cosine": 0.907929343552723,
|
217 |
+
"eval_sts-test_spearman_dot": 0.8796122221358714,
|
218 |
+
"eval_sts-test_spearman_euclidean": 0.9043074002120102,
|
219 |
+
"eval_sts-test_spearman_manhattan": 0.9047217521412333,
|
220 |
+
"eval_sts-test_spearman_max": 0.907929343552723,
|
221 |
+
"step": 20
|
222 |
+
},
|
223 |
+
{
|
224 |
+
"epoch": 0.0410958904109589,
|
225 |
+
"grad_norm": 5.152130603790283,
|
226 |
+
"learning_rate": 1.640625e-06,
|
227 |
+
"loss": 0.1717,
|
228 |
+
"step": 21
|
229 |
+
},
|
230 |
+
{
|
231 |
+
"epoch": 0.043052837573385516,
|
232 |
+
"grad_norm": 5.343059062957764,
|
233 |
+
"learning_rate": 1.71875e-06,
|
234 |
+
"loss": 0.198,
|
235 |
+
"step": 22
|
236 |
+
},
|
237 |
+
{
|
238 |
+
"epoch": 0.04500978473581213,
|
239 |
+
"grad_norm": 5.224748134613037,
|
240 |
+
"learning_rate": 1.796875e-06,
|
241 |
+
"loss": 0.3062,
|
242 |
+
"step": 23
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"epoch": 0.046966731898238745,
|
246 |
+
"grad_norm": 4.6179423332214355,
|
247 |
+
"learning_rate": 1.8750000000000003e-06,
|
248 |
+
"loss": 0.1241,
|
249 |
+
"step": 24
|
250 |
+
},
|
251 |
+
{
|
252 |
+
"epoch": 0.04892367906066536,
|
253 |
+
"grad_norm": 4.200148105621338,
|
254 |
+
"learning_rate": 1.953125e-06,
|
255 |
+
"loss": 0.1087,
|
256 |
+
"step": 25
|
257 |
+
},
|
258 |
+
{
|
259 |
+
"epoch": 0.04892367906066536,
|
260 |
+
"eval_loss": 0.14457188546657562,
|
261 |
+
"eval_runtime": 107.3809,
|
262 |
+
"eval_samples_per_second": 28.422,
|
263 |
+
"eval_steps_per_second": 0.224,
|
264 |
+
"eval_sts-test_pearson_cosine": 0.8862905994058754,
|
265 |
+
"eval_sts-test_pearson_dot": 0.877015249192232,
|
266 |
+
"eval_sts-test_pearson_euclidean": 0.9085054742522269,
|
267 |
+
"eval_sts-test_pearson_manhattan": 0.9092575877809899,
|
268 |
+
"eval_sts-test_pearson_max": 0.9092575877809899,
|
269 |
+
"eval_sts-test_spearman_cosine": 0.9082294902628751,
|
270 |
+
"eval_sts-test_spearman_dot": 0.8798810429630494,
|
271 |
+
"eval_sts-test_spearman_euclidean": 0.9047149499495015,
|
272 |
+
"eval_sts-test_spearman_manhattan": 0.9051023616193669,
|
273 |
+
"eval_sts-test_spearman_max": 0.9082294902628751,
|
274 |
+
"step": 25
|
275 |
+
},
|
276 |
+
{
|
277 |
+
"epoch": 0.050880626223091974,
|
278 |
+
"grad_norm": 4.890737533569336,
|
279 |
+
"learning_rate": 2.0312500000000002e-06,
|
280 |
+
"loss": 0.1767,
|
281 |
+
"step": 26
|
282 |
+
},
|
283 |
+
{
|
284 |
+
"epoch": 0.05283757338551859,
|
285 |
+
"grad_norm": 4.683767795562744,
|
286 |
+
"learning_rate": 2.109375e-06,
|
287 |
+
"loss": 0.1951,
|
288 |
+
"step": 27
|
289 |
+
},
|
290 |
+
{
|
291 |
+
"epoch": 0.0547945205479452,
|
292 |
+
"grad_norm": 4.656280040740967,
|
293 |
+
"learning_rate": 2.1875000000000002e-06,
|
294 |
+
"loss": 0.1621,
|
295 |
+
"step": 28
|
296 |
+
},
|
297 |
+
{
|
298 |
+
"epoch": 0.05675146771037182,
|
299 |
+
"grad_norm": 4.446409702301025,
|
300 |
+
"learning_rate": 2.265625e-06,
|
301 |
+
"loss": 0.221,
|
302 |
+
"step": 29
|
303 |
+
},
|
304 |
+
{
|
305 |
+
"epoch": 0.05870841487279843,
|
306 |
+
"grad_norm": 5.765133857727051,
|
307 |
+
"learning_rate": 2.3437500000000002e-06,
|
308 |
+
"loss": 0.2241,
|
309 |
+
"step": 30
|
310 |
+
},
|
311 |
+
{
|
312 |
+
"epoch": 0.05870841487279843,
|
313 |
+
"eval_loss": 0.14350731670856476,
|
314 |
+
"eval_runtime": 107.3747,
|
315 |
+
"eval_samples_per_second": 28.424,
|
316 |
+
"eval_steps_per_second": 0.224,
|
317 |
+
"eval_sts-test_pearson_cosine": 0.8863784941826807,
|
318 |
+
"eval_sts-test_pearson_dot": 0.8768948467465629,
|
319 |
+
"eval_sts-test_pearson_euclidean": 0.9088066170487232,
|
320 |
+
"eval_sts-test_pearson_manhattan": 0.9095658568102677,
|
321 |
+
"eval_sts-test_pearson_max": 0.9095658568102677,
|
322 |
+
"eval_sts-test_spearman_cosine": 0.9082580415676429,
|
323 |
+
"eval_sts-test_spearman_dot": 0.8801849487791585,
|
324 |
+
"eval_sts-test_spearman_euclidean": 0.9051721735871375,
|
325 |
+
"eval_sts-test_spearman_manhattan": 0.9054862826908437,
|
326 |
+
"eval_sts-test_spearman_max": 0.9082580415676429,
|
327 |
+
"step": 30
|
328 |
+
},
|
329 |
+
{
|
330 |
+
"epoch": 0.060665362035225046,
|
331 |
+
"grad_norm": 5.359245777130127,
|
332 |
+
"learning_rate": 2.421875e-06,
|
333 |
+
"loss": 0.2093,
|
334 |
+
"step": 31
|
335 |
+
},
|
336 |
+
{
|
337 |
+
"epoch": 0.06262230919765166,
|
338 |
+
"grad_norm": 4.439486503601074,
|
339 |
+
"learning_rate": 2.5e-06,
|
340 |
+
"loss": 0.1615,
|
341 |
+
"step": 32
|
342 |
+
},
|
343 |
+
{
|
344 |
+
"epoch": 0.06457925636007827,
|
345 |
+
"grad_norm": 3.689824342727661,
|
346 |
+
"learning_rate": 2.5781250000000004e-06,
|
347 |
+
"loss": 0.1615,
|
348 |
+
"step": 33
|
349 |
+
},
|
350 |
+
{
|
351 |
+
"epoch": 0.06653620352250489,
|
352 |
+
"grad_norm": 4.842885494232178,
|
353 |
+
"learning_rate": 2.65625e-06,
|
354 |
+
"loss": 0.1772,
|
355 |
+
"step": 34
|
356 |
+
},
|
357 |
+
{
|
358 |
+
"epoch": 0.0684931506849315,
|
359 |
+
"grad_norm": 5.209301948547363,
|
360 |
+
"learning_rate": 2.7343750000000004e-06,
|
361 |
+
"loss": 0.2324,
|
362 |
+
"step": 35
|
363 |
+
},
|
364 |
+
{
|
365 |
+
"epoch": 0.0684931506849315,
|
366 |
+
"eval_loss": 0.14226235449314117,
|
367 |
+
"eval_runtime": 107.3108,
|
368 |
+
"eval_samples_per_second": 28.441,
|
369 |
+
"eval_steps_per_second": 0.224,
|
370 |
+
"eval_sts-test_pearson_cosine": 0.8863574366132135,
|
371 |
+
"eval_sts-test_pearson_dot": 0.8765683077424664,
|
372 |
+
"eval_sts-test_pearson_euclidean": 0.9091012263251723,
|
373 |
+
"eval_sts-test_pearson_manhattan": 0.9098631032540263,
|
374 |
+
"eval_sts-test_pearson_max": 0.9098631032540263,
|
375 |
+
"eval_sts-test_spearman_cosine": 0.9083728733043733,
|
376 |
+
"eval_sts-test_spearman_dot": 0.8800282746130272,
|
377 |
+
"eval_sts-test_spearman_euclidean": 0.9052579170039636,
|
378 |
+
"eval_sts-test_spearman_manhattan": 0.9059997586640487,
|
379 |
+
"eval_sts-test_spearman_max": 0.9083728733043733,
|
380 |
+
"step": 35
|
381 |
+
},
|
382 |
+
{
|
383 |
+
"epoch": 0.07045009784735812,
|
384 |
+
"grad_norm": 4.740983009338379,
|
385 |
+
"learning_rate": 2.8125e-06,
|
386 |
+
"loss": 0.2611,
|
387 |
+
"step": 36
|
388 |
+
},
|
389 |
+
{
|
390 |
+
"epoch": 0.07240704500978473,
|
391 |
+
"grad_norm": 5.090059757232666,
|
392 |
+
"learning_rate": 2.8906250000000004e-06,
|
393 |
+
"loss": 0.214,
|
394 |
+
"step": 37
|
395 |
+
},
|
396 |
+
{
|
397 |
+
"epoch": 0.07436399217221135,
|
398 |
+
"grad_norm": 5.123153209686279,
|
399 |
+
"learning_rate": 2.96875e-06,
|
400 |
+
"loss": 0.1985,
|
401 |
+
"step": 38
|
402 |
+
},
|
403 |
+
{
|
404 |
+
"epoch": 0.07632093933463796,
|
405 |
+
"grad_norm": 5.401946067810059,
|
406 |
+
"learning_rate": 3.0468750000000004e-06,
|
407 |
+
"loss": 0.1855,
|
408 |
+
"step": 39
|
409 |
+
},
|
410 |
+
{
|
411 |
+
"epoch": 0.07827788649706457,
|
412 |
+
"grad_norm": 4.838700294494629,
|
413 |
+
"learning_rate": 3.125e-06,
|
414 |
+
"loss": 0.1234,
|
415 |
+
"step": 40
|
416 |
+
},
|
417 |
+
{
|
418 |
+
"epoch": 0.07827788649706457,
|
419 |
+
"eval_loss": 0.14100149273872375,
|
420 |
+
"eval_runtime": 107.3059,
|
421 |
+
"eval_samples_per_second": 28.442,
|
422 |
+
"eval_steps_per_second": 0.224,
|
423 |
+
"eval_sts-test_pearson_cosine": 0.8864265749012155,
|
424 |
+
"eval_sts-test_pearson_dot": 0.8764612424174422,
|
425 |
+
"eval_sts-test_pearson_euclidean": 0.9094092487009695,
|
426 |
+
"eval_sts-test_pearson_manhattan": 0.9101707626021143,
|
427 |
+
"eval_sts-test_pearson_max": 0.9101707626021143,
|
428 |
+
"eval_sts-test_spearman_cosine": 0.908505695048183,
|
429 |
+
"eval_sts-test_spearman_dot": 0.8802103674956289,
|
430 |
+
"eval_sts-test_spearman_euclidean": 0.9054564783507572,
|
431 |
+
"eval_sts-test_spearman_manhattan": 0.9063046490079084,
|
432 |
+
"eval_sts-test_spearman_max": 0.908505695048183,
|
433 |
+
"step": 40
|
434 |
+
},
|
435 |
+
{
|
436 |
+
"epoch": 0.08023483365949119,
|
437 |
+
"grad_norm": 3.8856801986694336,
|
438 |
+
"learning_rate": 3.2031250000000004e-06,
|
439 |
+
"loss": 0.1492,
|
440 |
+
"step": 41
|
441 |
+
},
|
442 |
+
{
|
443 |
+
"epoch": 0.0821917808219178,
|
444 |
+
"grad_norm": 5.678151607513428,
|
445 |
+
"learning_rate": 3.28125e-06,
|
446 |
+
"loss": 0.2022,
|
447 |
+
"step": 42
|
448 |
+
},
|
449 |
+
{
|
450 |
+
"epoch": 0.08414872798434442,
|
451 |
+
"grad_norm": 5.104148864746094,
|
452 |
+
"learning_rate": 3.3593750000000003e-06,
|
453 |
+
"loss": 0.2146,
|
454 |
+
"step": 43
|
455 |
+
},
|
456 |
+
{
|
457 |
+
"epoch": 0.08610567514677103,
|
458 |
+
"grad_norm": 4.76043701171875,
|
459 |
+
"learning_rate": 3.4375e-06,
|
460 |
+
"loss": 0.1688,
|
461 |
+
"step": 44
|
462 |
+
},
|
463 |
+
{
|
464 |
+
"epoch": 0.08806262230919765,
|
465 |
+
"grad_norm": 5.128803730010986,
|
466 |
+
"learning_rate": 3.5156250000000003e-06,
|
467 |
+
"loss": 0.175,
|
468 |
+
"step": 45
|
469 |
+
},
|
470 |
+
{
|
471 |
+
"epoch": 0.08806262230919765,
|
472 |
+
"eval_loss": 0.13962982594966888,
|
473 |
+
"eval_runtime": 107.4144,
|
474 |
+
"eval_samples_per_second": 28.413,
|
475 |
+
"eval_steps_per_second": 0.223,
|
476 |
+
"eval_sts-test_pearson_cosine": 0.886410908658177,
|
477 |
+
"eval_sts-test_pearson_dot": 0.8762836795862763,
|
478 |
+
"eval_sts-test_pearson_euclidean": 0.9096890242379734,
|
479 |
+
"eval_sts-test_pearson_manhattan": 0.9104590803642174,
|
480 |
+
"eval_sts-test_pearson_max": 0.9104590803642174,
|
481 |
+
"eval_sts-test_spearman_cosine": 0.9086694846648755,
|
482 |
+
"eval_sts-test_spearman_dot": 0.8801346931126159,
|
483 |
+
"eval_sts-test_spearman_euclidean": 0.9057376952773407,
|
484 |
+
"eval_sts-test_spearman_manhattan": 0.9064708999439774,
|
485 |
+
"eval_sts-test_spearman_max": 0.9086694846648755,
|
486 |
+
"step": 45
|
487 |
+
},
|
488 |
+
{
|
489 |
+
"epoch": 0.09001956947162426,
|
490 |
+
"grad_norm": 4.968522548675537,
|
491 |
+
"learning_rate": 3.59375e-06,
|
492 |
+
"loss": 0.2123,
|
493 |
+
"step": 46
|
494 |
+
},
|
495 |
+
{
|
496 |
+
"epoch": 0.09197651663405088,
|
497 |
+
"grad_norm": 4.343472957611084,
|
498 |
+
"learning_rate": 3.6718750000000003e-06,
|
499 |
+
"loss": 0.1118,
|
500 |
+
"step": 47
|
501 |
+
},
|
502 |
+
{
|
503 |
+
"epoch": 0.09393346379647749,
|
504 |
+
"grad_norm": 6.252938270568848,
|
505 |
+
"learning_rate": 3.7500000000000005e-06,
|
506 |
+
"loss": 0.3009,
|
507 |
+
"step": 48
|
508 |
+
},
|
509 |
+
{
|
510 |
+
"epoch": 0.0958904109589041,
|
511 |
+
"grad_norm": 3.411029815673828,
|
512 |
+
"learning_rate": 3.828125000000001e-06,
|
513 |
+
"loss": 0.1071,
|
514 |
+
"step": 49
|
515 |
+
},
|
516 |
+
{
|
517 |
+
"epoch": 0.09784735812133072,
|
518 |
+
"grad_norm": 5.379226207733154,
|
519 |
+
"learning_rate": 3.90625e-06,
|
520 |
+
"loss": 0.2608,
|
521 |
+
"step": 50
|
522 |
+
},
|
523 |
+
{
|
524 |
+
"epoch": 0.09784735812133072,
|
525 |
+
"eval_loss": 0.13823722302913666,
|
526 |
+
"eval_runtime": 107.3656,
|
527 |
+
"eval_samples_per_second": 28.426,
|
528 |
+
"eval_steps_per_second": 0.224,
|
529 |
+
"eval_sts-test_pearson_cosine": 0.8863074884351817,
|
530 |
+
"eval_sts-test_pearson_dot": 0.8763122134205692,
|
531 |
+
"eval_sts-test_pearson_euclidean": 0.9097700018848961,
|
532 |
+
"eval_sts-test_pearson_manhattan": 0.9105724410858811,
|
533 |
+
"eval_sts-test_pearson_max": 0.9105724410858811,
|
534 |
+
"eval_sts-test_spearman_cosine": 0.9085105281844131,
|
535 |
+
"eval_sts-test_spearman_dot": 0.8801239975611433,
|
536 |
+
"eval_sts-test_spearman_euclidean": 0.9059798443527296,
|
537 |
+
"eval_sts-test_spearman_manhattan": 0.9065691737139927,
|
538 |
+
"eval_sts-test_spearman_max": 0.9085105281844131,
|
539 |
+
"step": 50
|
540 |
+
},
|
541 |
+
{
|
542 |
+
"epoch": 0.09980430528375733,
|
543 |
+
"grad_norm": 4.599095821380615,
|
544 |
+
"learning_rate": 3.984375e-06,
|
545 |
+
"loss": 0.1368,
|
546 |
+
"step": 51
|
547 |
+
},
|
548 |
+
{
|
549 |
+
"epoch": 0.10176125244618395,
|
550 |
+
"grad_norm": 5.634761333465576,
|
551 |
+
"learning_rate": 4.0625000000000005e-06,
|
552 |
+
"loss": 0.2307,
|
553 |
+
"step": 52
|
554 |
+
},
|
555 |
+
{
|
556 |
+
"epoch": 0.10371819960861056,
|
557 |
+
"grad_norm": 4.678525924682617,
|
558 |
+
"learning_rate": 4.140625000000001e-06,
|
559 |
+
"loss": 0.1366,
|
560 |
+
"step": 53
|
561 |
+
},
|
562 |
+
{
|
563 |
+
"epoch": 0.10567514677103718,
|
564 |
+
"grad_norm": 4.931070327758789,
|
565 |
+
"learning_rate": 4.21875e-06,
|
566 |
+
"loss": 0.1857,
|
567 |
+
"step": 54
|
568 |
+
},
|
569 |
+
{
|
570 |
+
"epoch": 0.10763209393346379,
|
571 |
+
"grad_norm": 4.903087139129639,
|
572 |
+
"learning_rate": 4.296875e-06,
|
573 |
+
"loss": 0.2155,
|
574 |
+
"step": 55
|
575 |
+
},
|
576 |
+
{
|
577 |
+
"epoch": 0.10763209393346379,
|
578 |
+
"eval_loss": 0.1367325782775879,
|
579 |
+
"eval_runtime": 107.3012,
|
580 |
+
"eval_samples_per_second": 28.443,
|
581 |
+
"eval_steps_per_second": 0.224,
|
582 |
+
"eval_sts-test_pearson_cosine": 0.88603017002284,
|
583 |
+
"eval_sts-test_pearson_dot": 0.8761626193697236,
|
584 |
+
"eval_sts-test_pearson_euclidean": 0.9096799681812165,
|
585 |
+
"eval_sts-test_pearson_manhattan": 0.9104977957475867,
|
586 |
+
"eval_sts-test_pearson_max": 0.9104977957475867,
|
587 |
+
"eval_sts-test_spearman_cosine": 0.9084685067499666,
|
588 |
+
"eval_sts-test_spearman_dot": 0.8802836700617878,
|
589 |
+
"eval_sts-test_spearman_euclidean": 0.9058409364373706,
|
590 |
+
"eval_sts-test_spearman_manhattan": 0.9064240006220393,
|
591 |
+
"eval_sts-test_spearman_max": 0.9084685067499666,
|
592 |
+
"step": 55
|
593 |
+
},
|
594 |
+
{
|
595 |
+
"epoch": 0.1095890410958904,
|
596 |
+
"grad_norm": 5.408311367034912,
|
597 |
+
"learning_rate": 4.3750000000000005e-06,
|
598 |
+
"loss": 0.2022,
|
599 |
+
"step": 56
|
600 |
+
},
|
601 |
+
{
|
602 |
+
"epoch": 0.11154598825831702,
|
603 |
+
"grad_norm": 4.5926713943481445,
|
604 |
+
"learning_rate": 4.453125000000001e-06,
|
605 |
+
"loss": 0.2076,
|
606 |
+
"step": 57
|
607 |
+
},
|
608 |
+
{
|
609 |
+
"epoch": 0.11350293542074363,
|
610 |
+
"grad_norm": 6.475535869598389,
|
611 |
+
"learning_rate": 4.53125e-06,
|
612 |
+
"loss": 0.4133,
|
613 |
+
"step": 58
|
614 |
+
},
|
615 |
+
{
|
616 |
+
"epoch": 0.11545988258317025,
|
617 |
+
"grad_norm": 4.997581481933594,
|
618 |
+
"learning_rate": 4.609375e-06,
|
619 |
+
"loss": 0.1823,
|
620 |
+
"step": 59
|
621 |
+
},
|
622 |
+
{
|
623 |
+
"epoch": 0.11741682974559686,
|
624 |
+
"grad_norm": 3.899284601211548,
|
625 |
+
"learning_rate": 4.6875000000000004e-06,
|
626 |
+
"loss": 0.1136,
|
627 |
+
"step": 60
|
628 |
+
},
|
629 |
+
{
|
630 |
+
"epoch": 0.11741682974559686,
|
631 |
+
"eval_loss": 0.13528631627559662,
|
632 |
+
"eval_runtime": 107.3435,
|
633 |
+
"eval_samples_per_second": 28.432,
|
634 |
+
"eval_steps_per_second": 0.224,
|
635 |
+
"eval_sts-test_pearson_cosine": 0.8860224650016398,
|
636 |
+
"eval_sts-test_pearson_dot": 0.8762739756970772,
|
637 |
+
"eval_sts-test_pearson_euclidean": 0.9099016820022997,
|
638 |
+
"eval_sts-test_pearson_manhattan": 0.9107281338135995,
|
639 |
+
"eval_sts-test_pearson_max": 0.9107281338135995,
|
640 |
+
"eval_sts-test_spearman_cosine": 0.9087510214631306,
|
641 |
+
"eval_sts-test_spearman_dot": 0.8808623486228402,
|
642 |
+
"eval_sts-test_spearman_euclidean": 0.9060555634870038,
|
643 |
+
"eval_sts-test_spearman_manhattan": 0.9067256241238172,
|
644 |
+
"eval_sts-test_spearman_max": 0.9087510214631306,
|
645 |
+
"step": 60
|
646 |
+
},
|
647 |
+
{
|
648 |
+
"epoch": 0.11937377690802348,
|
649 |
+
"grad_norm": 4.476404190063477,
|
650 |
+
"learning_rate": 4.765625000000001e-06,
|
651 |
+
"loss": 0.1687,
|
652 |
+
"step": 61
|
653 |
+
},
|
654 |
+
{
|
655 |
+
"epoch": 0.12133072407045009,
|
656 |
+
"grad_norm": 4.893277168273926,
|
657 |
+
"learning_rate": 4.84375e-06,
|
658 |
+
"loss": 0.1591,
|
659 |
+
"step": 62
|
660 |
+
},
|
661 |
+
{
|
662 |
+
"epoch": 0.1232876712328767,
|
663 |
+
"grad_norm": 4.510354042053223,
|
664 |
+
"learning_rate": 4.921875e-06,
|
665 |
+
"loss": 0.1653,
|
666 |
+
"step": 63
|
667 |
+
},
|
668 |
+
{
|
669 |
+
"epoch": 0.12524461839530332,
|
670 |
+
"grad_norm": 4.400285243988037,
|
671 |
+
"learning_rate": 5e-06,
|
672 |
+
"loss": 0.1799,
|
673 |
+
"step": 64
|
674 |
+
},
|
675 |
+
{
|
676 |
+
"epoch": 0.12720156555772993,
|
677 |
+
"grad_norm": 4.631839752197266,
|
678 |
+
"learning_rate": 5.078125000000001e-06,
|
679 |
+
"loss": 0.1578,
|
680 |
+
"step": 65
|
681 |
+
},
|
682 |
+
{
|
683 |
+
"epoch": 0.12720156555772993,
|
684 |
+
"eval_loss": 0.1336735188961029,
|
685 |
+
"eval_runtime": 107.4984,
|
686 |
+
"eval_samples_per_second": 28.391,
|
687 |
+
"eval_steps_per_second": 0.223,
|
688 |
+
"eval_sts-test_pearson_cosine": 0.886014179849858,
|
689 |
+
"eval_sts-test_pearson_dot": 0.8762492282837839,
|
690 |
+
"eval_sts-test_pearson_euclidean": 0.9101155794045166,
|
691 |
+
"eval_sts-test_pearson_manhattan": 0.9109538919103571,
|
692 |
+
"eval_sts-test_pearson_max": 0.9109538919103571,
|
693 |
+
"eval_sts-test_spearman_cosine": 0.9089514176116413,
|
694 |
+
"eval_sts-test_spearman_dot": 0.8810853441583534,
|
695 |
+
"eval_sts-test_spearman_euclidean": 0.9061670836303911,
|
696 |
+
"eval_sts-test_spearman_manhattan": 0.9072153371772234,
|
697 |
+
"eval_sts-test_spearman_max": 0.9089514176116413,
|
698 |
+
"step": 65
|
699 |
+
},
|
700 |
+
{
|
701 |
+
"epoch": 0.12915851272015655,
|
702 |
+
"grad_norm": 4.043459415435791,
|
703 |
+
"learning_rate": 5.156250000000001e-06,
|
704 |
+
"loss": 0.1844,
|
705 |
+
"step": 66
|
706 |
+
},
|
707 |
+
{
|
708 |
+
"epoch": 0.13111545988258316,
|
709 |
+
"grad_norm": 4.447835922241211,
|
710 |
+
"learning_rate": 5.234375e-06,
|
711 |
+
"loss": 0.1489,
|
712 |
+
"step": 67
|
713 |
+
},
|
714 |
+
{
|
715 |
+
"epoch": 0.13307240704500978,
|
716 |
+
"grad_norm": 5.372109889984131,
|
717 |
+
"learning_rate": 5.3125e-06,
|
718 |
+
"loss": 0.1845,
|
719 |
+
"step": 68
|
720 |
+
},
|
721 |
+
{
|
722 |
+
"epoch": 0.1350293542074364,
|
723 |
+
"grad_norm": 3.5112483501434326,
|
724 |
+
"learning_rate": 5.390625000000001e-06,
|
725 |
+
"loss": 0.1364,
|
726 |
+
"step": 69
|
727 |
+
},
|
728 |
+
{
|
729 |
+
"epoch": 0.136986301369863,
|
730 |
+
"grad_norm": 4.305239200592041,
|
731 |
+
"learning_rate": 5.468750000000001e-06,
|
732 |
+
"loss": 0.1584,
|
733 |
+
"step": 70
|
734 |
+
},
|
735 |
+
{
|
736 |
+
"epoch": 0.136986301369863,
|
737 |
+
"eval_loss": 0.1320798397064209,
|
738 |
+
"eval_runtime": 107.505,
|
739 |
+
"eval_samples_per_second": 28.389,
|
740 |
+
"eval_steps_per_second": 0.223,
|
741 |
+
"eval_sts-test_pearson_cosine": 0.88578311613969,
|
742 |
+
"eval_sts-test_pearson_dot": 0.875928774505713,
|
743 |
+
"eval_sts-test_pearson_euclidean": 0.91024619729973,
|
744 |
+
"eval_sts-test_pearson_manhattan": 0.9110959495329505,
|
745 |
+
"eval_sts-test_pearson_max": 0.9110959495329505,
|
746 |
+
"eval_sts-test_spearman_cosine": 0.9086066538938818,
|
747 |
+
"eval_sts-test_spearman_dot": 0.8801235500485294,
|
748 |
+
"eval_sts-test_spearman_euclidean": 0.9060052183179386,
|
749 |
+
"eval_sts-test_spearman_manhattan": 0.907439182986703,
|
750 |
+
"eval_sts-test_spearman_max": 0.9086066538938818,
|
751 |
+
"step": 70
|
752 |
+
},
|
753 |
+
{
|
754 |
+
"epoch": 0.13894324853228962,
|
755 |
+
"grad_norm": 5.093306064605713,
|
756 |
+
"learning_rate": 5.546875e-06,
|
757 |
+
"loss": 0.2279,
|
758 |
+
"step": 71
|
759 |
+
},
|
760 |
+
{
|
761 |
+
"epoch": 0.14090019569471623,
|
762 |
+
"grad_norm": 4.953585147857666,
|
763 |
+
"learning_rate": 5.625e-06,
|
764 |
+
"loss": 0.2028,
|
765 |
+
"step": 72
|
766 |
+
},
|
767 |
+
{
|
768 |
+
"epoch": 0.14285714285714285,
|
769 |
+
"grad_norm": 4.1561102867126465,
|
770 |
+
"learning_rate": 5.7031250000000006e-06,
|
771 |
+
"loss": 0.2291,
|
772 |
+
"step": 73
|
773 |
+
},
|
774 |
+
{
|
775 |
+
"epoch": 0.14481409001956946,
|
776 |
+
"grad_norm": 5.00941801071167,
|
777 |
+
"learning_rate": 5.781250000000001e-06,
|
778 |
+
"loss": 0.2419,
|
779 |
+
"step": 74
|
780 |
+
},
|
781 |
+
{
|
782 |
+
"epoch": 0.14677103718199608,
|
783 |
+
"grad_norm": 3.6476099491119385,
|
784 |
+
"learning_rate": 5.859375e-06,
|
785 |
+
"loss": 0.1329,
|
786 |
+
"step": 75
|
787 |
+
},
|
788 |
+
{
|
789 |
+
"epoch": 0.14677103718199608,
|
790 |
+
"eval_loss": 0.13061992824077606,
|
791 |
+
"eval_runtime": 107.3395,
|
792 |
+
"eval_samples_per_second": 28.433,
|
793 |
+
"eval_steps_per_second": 0.224,
|
794 |
+
"eval_sts-test_pearson_cosine": 0.8854112983780439,
|
795 |
+
"eval_sts-test_pearson_dot": 0.8752625071185561,
|
796 |
+
"eval_sts-test_pearson_euclidean": 0.9103378320010516,
|
797 |
+
"eval_sts-test_pearson_manhattan": 0.9112261622276095,
|
798 |
+
"eval_sts-test_pearson_max": 0.9112261622276095,
|
799 |
+
"eval_sts-test_spearman_cosine": 0.9082604133844965,
|
800 |
+
"eval_sts-test_spearman_dot": 0.8794192099454903,
|
801 |
+
"eval_sts-test_spearman_euclidean": 0.9060063370994732,
|
802 |
+
"eval_sts-test_spearman_manhattan": 0.90766132824825,
|
803 |
+
"eval_sts-test_spearman_max": 0.9082604133844965,
|
804 |
+
"step": 75
|
805 |
+
},
|
806 |
+
{
|
807 |
+
"epoch": 0.1487279843444227,
|
808 |
+
"grad_norm": 4.10636568069458,
|
809 |
+
"learning_rate": 5.9375e-06,
|
810 |
+
"loss": 0.204,
|
811 |
+
"step": 76
|
812 |
+
},
|
813 |
+
{
|
814 |
+
"epoch": 0.1506849315068493,
|
815 |
+
"grad_norm": 4.767779350280762,
|
816 |
+
"learning_rate": 6.0156250000000005e-06,
|
817 |
+
"loss": 0.2239,
|
818 |
+
"step": 77
|
819 |
+
},
|
820 |
+
{
|
821 |
+
"epoch": 0.15264187866927592,
|
822 |
+
"grad_norm": 5.366302490234375,
|
823 |
+
"learning_rate": 6.093750000000001e-06,
|
824 |
+
"loss": 0.2181,
|
825 |
+
"step": 78
|
826 |
+
},
|
827 |
+
{
|
828 |
+
"epoch": 0.15459882583170254,
|
829 |
+
"grad_norm": 4.087960720062256,
|
830 |
+
"learning_rate": 6.171875e-06,
|
831 |
+
"loss": 0.1285,
|
832 |
+
"step": 79
|
833 |
+
},
|
834 |
+
{
|
835 |
+
"epoch": 0.15655577299412915,
|
836 |
+
"grad_norm": 3.7557668685913086,
|
837 |
+
"learning_rate": 6.25e-06,
|
838 |
+
"loss": 0.1067,
|
839 |
+
"step": 80
|
840 |
+
},
|
841 |
+
{
|
842 |
+
"epoch": 0.15655577299412915,
|
843 |
+
"eval_loss": 0.12924787402153015,
|
844 |
+
"eval_runtime": 107.2528,
|
845 |
+
"eval_samples_per_second": 28.456,
|
846 |
+
"eval_steps_per_second": 0.224,
|
847 |
+
"eval_sts-test_pearson_cosine": 0.8850894038300653,
|
848 |
+
"eval_sts-test_pearson_dot": 0.874941916465686,
|
849 |
+
"eval_sts-test_pearson_euclidean": 0.9101863990952803,
|
850 |
+
"eval_sts-test_pearson_manhattan": 0.9110826056950171,
|
851 |
+
"eval_sts-test_pearson_max": 0.9110826056950171,
|
852 |
+
"eval_sts-test_spearman_cosine": 0.9078700928826409,
|
853 |
+
"eval_sts-test_spearman_dot": 0.8792947566875607,
|
854 |
+
"eval_sts-test_spearman_euclidean": 0.9059290069197888,
|
855 |
+
"eval_sts-test_spearman_manhattan": 0.9075206750336968,
|
856 |
+
"eval_sts-test_spearman_max": 0.9078700928826409,
|
857 |
+
"step": 80
|
858 |
+
},
|
859 |
+
{
|
860 |
+
"epoch": 0.15851272015655576,
|
861 |
+
"grad_norm": 3.5708839893341064,
|
862 |
+
"learning_rate": 6.3281250000000005e-06,
|
863 |
+
"loss": 0.1189,
|
864 |
+
"step": 81
|
865 |
+
},
|
866 |
+
{
|
867 |
+
"epoch": 0.16046966731898238,
|
868 |
+
"grad_norm": 4.602839469909668,
|
869 |
+
"learning_rate": 6.406250000000001e-06,
|
870 |
+
"loss": 0.236,
|
871 |
+
"step": 82
|
872 |
+
},
|
873 |
+
{
|
874 |
+
"epoch": 0.162426614481409,
|
875 |
+
"grad_norm": 4.304513931274414,
|
876 |
+
"learning_rate": 6.484375000000001e-06,
|
877 |
+
"loss": 0.1584,
|
878 |
+
"step": 83
|
879 |
+
},
|
880 |
+
{
|
881 |
+
"epoch": 0.1643835616438356,
|
882 |
+
"grad_norm": 4.165163516998291,
|
883 |
+
"learning_rate": 6.5625e-06,
|
884 |
+
"loss": 0.1925,
|
885 |
+
"step": 84
|
886 |
+
},
|
887 |
+
{
|
888 |
+
"epoch": 0.16634050880626222,
|
889 |
+
"grad_norm": 3.9157192707061768,
|
890 |
+
"learning_rate": 6.6406250000000005e-06,
|
891 |
+
"loss": 0.129,
|
892 |
+
"step": 85
|
893 |
+
},
|
894 |
+
{
|
895 |
+
"epoch": 0.16634050880626222,
|
896 |
+
"eval_loss": 0.1278335303068161,
|
897 |
+
"eval_runtime": 107.1978,
|
898 |
+
"eval_samples_per_second": 28.471,
|
899 |
+
"eval_steps_per_second": 0.224,
|
900 |
+
"eval_sts-test_pearson_cosine": 0.8845993101894516,
|
901 |
+
"eval_sts-test_pearson_dot": 0.8740701762146532,
|
902 |
+
"eval_sts-test_pearson_euclidean": 0.9100055922999684,
|
903 |
+
"eval_sts-test_pearson_manhattan": 0.9108899080028133,
|
904 |
+
"eval_sts-test_pearson_max": 0.9108899080028133,
|
905 |
+
"eval_sts-test_spearman_cosine": 0.9078923342595523,
|
906 |
+
"eval_sts-test_spearman_dot": 0.8788126513485913,
|
907 |
+
"eval_sts-test_spearman_euclidean": 0.9057257466905491,
|
908 |
+
"eval_sts-test_spearman_manhattan": 0.9070083178420268,
|
909 |
+
"eval_sts-test_spearman_max": 0.9078923342595523,
|
910 |
+
"step": 85
|
911 |
+
},
|
912 |
+
{
|
913 |
+
"epoch": 0.16829745596868884,
|
914 |
+
"grad_norm": 4.233823776245117,
|
915 |
+
"learning_rate": 6.718750000000001e-06,
|
916 |
+
"loss": 0.1376,
|
917 |
+
"step": 86
|
918 |
+
},
|
919 |
+
{
|
920 |
+
"epoch": 0.17025440313111545,
|
921 |
+
"grad_norm": 4.670790195465088,
|
922 |
+
"learning_rate": 6.796875000000001e-06,
|
923 |
+
"loss": 0.1691,
|
924 |
+
"step": 87
|
925 |
+
},
|
926 |
+
{
|
927 |
+
"epoch": 0.17221135029354206,
|
928 |
+
"grad_norm": 3.742030382156372,
|
929 |
+
"learning_rate": 6.875e-06,
|
930 |
+
"loss": 0.1045,
|
931 |
+
"step": 88
|
932 |
+
},
|
933 |
+
{
|
934 |
+
"epoch": 0.17416829745596868,
|
935 |
+
"grad_norm": 4.242702960968018,
|
936 |
+
"learning_rate": 6.9531250000000004e-06,
|
937 |
+
"loss": 0.165,
|
938 |
+
"step": 89
|
939 |
+
},
|
940 |
+
{
|
941 |
+
"epoch": 0.1761252446183953,
|
942 |
+
"grad_norm": 5.499476909637451,
|
943 |
+
"learning_rate": 7.031250000000001e-06,
|
944 |
+
"loss": 0.2926,
|
945 |
+
"step": 90
|
946 |
+
},
|
947 |
+
{
|
948 |
+
"epoch": 0.1761252446183953,
|
949 |
+
"eval_loss": 0.12669824063777924,
|
950 |
+
"eval_runtime": 107.2778,
|
951 |
+
"eval_samples_per_second": 28.45,
|
952 |
+
"eval_steps_per_second": 0.224,
|
953 |
+
"eval_sts-test_pearson_cosine": 0.8844194771150324,
|
954 |
+
"eval_sts-test_pearson_dot": 0.873458365713796,
|
955 |
+
"eval_sts-test_pearson_euclidean": 0.9099396625521212,
|
956 |
+
"eval_sts-test_pearson_manhattan": 0.910745898918033,
|
957 |
+
"eval_sts-test_pearson_max": 0.910745898918033,
|
958 |
+
"eval_sts-test_spearman_cosine": 0.907622707909669,
|
959 |
+
"eval_sts-test_spearman_dot": 0.8783740442356941,
|
960 |
+
"eval_sts-test_spearman_euclidean": 0.9058808545625318,
|
961 |
+
"eval_sts-test_spearman_manhattan": 0.906889458491771,
|
962 |
+
"eval_sts-test_spearman_max": 0.907622707909669,
|
963 |
+
"step": 90
|
964 |
+
},
|
965 |
+
{
|
966 |
+
"epoch": 0.1780821917808219,
|
967 |
+
"grad_norm": 2.992021083831787,
|
968 |
+
"learning_rate": 7.109375000000001e-06,
|
969 |
+
"loss": 0.1048,
|
970 |
+
"step": 91
|
971 |
+
},
|
972 |
+
{
|
973 |
+
"epoch": 0.18003913894324852,
|
974 |
+
"grad_norm": 4.298286437988281,
|
975 |
+
"learning_rate": 7.1875e-06,
|
976 |
+
"loss": 0.1596,
|
977 |
+
"step": 92
|
978 |
+
},
|
979 |
+
{
|
980 |
+
"epoch": 0.18199608610567514,
|
981 |
+
"grad_norm": 5.210509300231934,
|
982 |
+
"learning_rate": 7.265625e-06,
|
983 |
+
"loss": 0.2474,
|
984 |
+
"step": 93
|
985 |
+
},
|
986 |
+
{
|
987 |
+
"epoch": 0.18395303326810175,
|
988 |
+
"grad_norm": 4.527407169342041,
|
989 |
+
"learning_rate": 7.343750000000001e-06,
|
990 |
+
"loss": 0.1652,
|
991 |
+
"step": 94
|
992 |
+
},
|
993 |
+
{
|
994 |
+
"epoch": 0.18590998043052837,
|
995 |
+
"grad_norm": 5.302050590515137,
|
996 |
+
"learning_rate": 7.421875000000001e-06,
|
997 |
+
"loss": 0.2483,
|
998 |
+
"step": 95
|
999 |
+
},
|
1000 |
+
{
|
1001 |
+
"epoch": 0.18590998043052837,
|
1002 |
+
"eval_loss": 0.1252526491880417,
|
1003 |
+
"eval_runtime": 107.5519,
|
1004 |
+
"eval_samples_per_second": 28.377,
|
1005 |
+
"eval_steps_per_second": 0.223,
|
1006 |
+
"eval_sts-test_pearson_cosine": 0.884272350180128,
|
1007 |
+
"eval_sts-test_pearson_dot": 0.8727334938335432,
|
1008 |
+
"eval_sts-test_pearson_euclidean": 0.9099441972021025,
|
1009 |
+
"eval_sts-test_pearson_manhattan": 0.9106991509833859,
|
1010 |
+
"eval_sts-test_pearson_max": 0.9106991509833859,
|
1011 |
+
"eval_sts-test_spearman_cosine": 0.9075948278738224,
|
1012 |
+
"eval_sts-test_spearman_dot": 0.87780624023116,
|
1013 |
+
"eval_sts-test_spearman_euclidean": 0.9060086194138042,
|
1014 |
+
"eval_sts-test_spearman_manhattan": 0.9069788267607697,
|
1015 |
+
"eval_sts-test_spearman_max": 0.9075948278738224,
|
1016 |
+
"step": 95
|
1017 |
+
},
|
1018 |
+
{
|
1019 |
+
"epoch": 0.18786692759295498,
|
1020 |
+
"grad_norm": 3.690441608428955,
|
1021 |
+
"learning_rate": 7.500000000000001e-06,
|
1022 |
+
"loss": 0.1623,
|
1023 |
+
"step": 96
|
1024 |
+
},
|
1025 |
+
{
|
1026 |
+
"epoch": 0.1898238747553816,
|
1027 |
+
"grad_norm": 4.585984706878662,
|
1028 |
+
"learning_rate": 7.578125e-06,
|
1029 |
+
"loss": 0.1955,
|
1030 |
+
"step": 97
|
1031 |
+
},
|
1032 |
+
{
|
1033 |
+
"epoch": 0.1917808219178082,
|
1034 |
+
"grad_norm": 4.493942737579346,
|
1035 |
+
"learning_rate": 7.656250000000001e-06,
|
1036 |
+
"loss": 0.2023,
|
1037 |
+
"step": 98
|
1038 |
+
},
|
1039 |
+
{
|
1040 |
+
"epoch": 0.19373776908023482,
|
1041 |
+
"grad_norm": 4.569936275482178,
|
1042 |
+
"learning_rate": 7.734375e-06,
|
1043 |
+
"loss": 0.1886,
|
1044 |
+
"step": 99
|
1045 |
+
},
|
1046 |
+
{
|
1047 |
+
"epoch": 0.19569471624266144,
|
1048 |
+
"grad_norm": 3.7703664302825928,
|
1049 |
+
"learning_rate": 7.8125e-06,
|
1050 |
+
"loss": 0.1284,
|
1051 |
+
"step": 100
|
1052 |
+
},
|
1053 |
+
{
|
1054 |
+
"epoch": 0.19569471624266144,
|
1055 |
+
"eval_loss": 0.12290485948324203,
|
1056 |
+
"eval_runtime": 107.6958,
|
1057 |
+
"eval_samples_per_second": 28.339,
|
1058 |
+
"eval_steps_per_second": 0.223,
|
1059 |
+
"eval_sts-test_pearson_cosine": 0.8836376979322419,
|
1060 |
+
"eval_sts-test_pearson_dot": 0.8710695777275684,
|
1061 |
+
"eval_sts-test_pearson_euclidean": 0.9098265834859519,
|
1062 |
+
"eval_sts-test_pearson_manhattan": 0.9106248996071287,
|
1063 |
+
"eval_sts-test_pearson_max": 0.9106248996071287,
|
1064 |
+
"eval_sts-test_spearman_cosine": 0.9078868298544011,
|
1065 |
+
"eval_sts-test_spearman_dot": 0.8773200625274038,
|
1066 |
+
"eval_sts-test_spearman_euclidean": 0.9063156130669492,
|
1067 |
+
"eval_sts-test_spearman_manhattan": 0.9071474495136926,
|
1068 |
+
"eval_sts-test_spearman_max": 0.9078868298544011,
|
1069 |
+
"step": 100
|
1070 |
+
},
|
1071 |
+
{
|
1072 |
+
"epoch": 0.19765166340508805,
|
1073 |
+
"grad_norm": 4.356619358062744,
|
1074 |
+
"learning_rate": 7.890625e-06,
|
1075 |
+
"loss": 0.2005,
|
1076 |
+
"step": 101
|
1077 |
+
},
|
1078 |
+
{
|
1079 |
+
"epoch": 0.19960861056751467,
|
1080 |
+
"grad_norm": 4.293449878692627,
|
1081 |
+
"learning_rate": 7.96875e-06,
|
1082 |
+
"loss": 0.2301,
|
1083 |
+
"step": 102
|
1084 |
+
},
|
1085 |
+
{
|
1086 |
+
"epoch": 0.20156555772994128,
|
1087 |
+
"grad_norm": 4.654509544372559,
|
1088 |
+
"learning_rate": 8.046875e-06,
|
1089 |
+
"loss": 0.2249,
|
1090 |
+
"step": 103
|
1091 |
+
},
|
1092 |
+
{
|
1093 |
+
"epoch": 0.2035225048923679,
|
1094 |
+
"grad_norm": 4.510340213775635,
|
1095 |
+
"learning_rate": 8.125000000000001e-06,
|
1096 |
+
"loss": 0.214,
|
1097 |
+
"step": 104
|
1098 |
+
},
|
1099 |
+
{
|
1100 |
+
"epoch": 0.2054794520547945,
|
1101 |
+
"grad_norm": 3.880908489227295,
|
1102 |
+
"learning_rate": 8.203125000000001e-06,
|
1103 |
+
"loss": 0.1429,
|
1104 |
+
"step": 105
|
1105 |
+
},
|
1106 |
+
{
|
1107 |
+
"epoch": 0.2054794520547945,
|
1108 |
+
"eval_loss": 0.12076468020677567,
|
1109 |
+
"eval_runtime": 107.7074,
|
1110 |
+
"eval_samples_per_second": 28.336,
|
1111 |
+
"eval_steps_per_second": 0.223,
|
1112 |
+
"eval_sts-test_pearson_cosine": 0.8828542959864998,
|
1113 |
+
"eval_sts-test_pearson_dot": 0.8689355363147886,
|
1114 |
+
"eval_sts-test_pearson_euclidean": 0.9096459762354197,
|
1115 |
+
"eval_sts-test_pearson_manhattan": 0.9104979967855148,
|
1116 |
+
"eval_sts-test_pearson_max": 0.9104979967855148,
|
1117 |
+
"eval_sts-test_spearman_cosine": 0.9076751563880199,
|
1118 |
+
"eval_sts-test_spearman_dot": 0.8750991469270715,
|
1119 |
+
"eval_sts-test_spearman_euclidean": 0.906379383614432,
|
1120 |
+
"eval_sts-test_spearman_manhattan": 0.9071111562407043,
|
1121 |
+
"eval_sts-test_spearman_max": 0.9076751563880199,
|
1122 |
+
"step": 105
|
1123 |
+
},
|
1124 |
+
{
|
1125 |
+
"epoch": 0.20743639921722112,
|
1126 |
+
"grad_norm": 3.8524463176727295,
|
1127 |
+
"learning_rate": 8.281250000000001e-06,
|
1128 |
+
"loss": 0.17,
|
1129 |
+
"step": 106
|
1130 |
+
},
|
1131 |
+
{
|
1132 |
+
"epoch": 0.20939334637964774,
|
1133 |
+
"grad_norm": 4.660905838012695,
|
1134 |
+
"learning_rate": 8.359375e-06,
|
1135 |
+
"loss": 0.1955,
|
1136 |
+
"step": 107
|
1137 |
+
},
|
1138 |
+
{
|
1139 |
+
"epoch": 0.21135029354207435,
|
1140 |
+
"grad_norm": 4.391407012939453,
|
1141 |
+
"learning_rate": 8.4375e-06,
|
1142 |
+
"loss": 0.1964,
|
1143 |
+
"step": 108
|
1144 |
+
},
|
1145 |
+
{
|
1146 |
+
"epoch": 0.21330724070450097,
|
1147 |
+
"grad_norm": 3.908740758895874,
|
1148 |
+
"learning_rate": 8.515625e-06,
|
1149 |
+
"loss": 0.1246,
|
1150 |
+
"step": 109
|
1151 |
+
},
|
1152 |
+
{
|
1153 |
+
"epoch": 0.21526418786692758,
|
1154 |
+
"grad_norm": 3.295600414276123,
|
1155 |
+
"learning_rate": 8.59375e-06,
|
1156 |
+
"loss": 0.1295,
|
1157 |
+
"step": 110
|
1158 |
+
},
|
1159 |
+
{
|
1160 |
+
"epoch": 0.21526418786692758,
|
1161 |
+
"eval_loss": 0.11901199817657471,
|
1162 |
+
"eval_runtime": 107.5373,
|
1163 |
+
"eval_samples_per_second": 28.381,
|
1164 |
+
"eval_steps_per_second": 0.223,
|
1165 |
+
"eval_sts-test_pearson_cosine": 0.8820675142963768,
|
1166 |
+
"eval_sts-test_pearson_dot": 0.8664913359514981,
|
1167 |
+
"eval_sts-test_pearson_euclidean": 0.9093761405951237,
|
1168 |
+
"eval_sts-test_pearson_manhattan": 0.910248319457324,
|
1169 |
+
"eval_sts-test_pearson_max": 0.910248319457324,
|
1170 |
+
"eval_sts-test_spearman_cosine": 0.9071699146469111,
|
1171 |
+
"eval_sts-test_spearman_dot": 0.8726812810253556,
|
1172 |
+
"eval_sts-test_spearman_euclidean": 0.9064896954737618,
|
1173 |
+
"eval_sts-test_spearman_manhattan": 0.9068174537121922,
|
1174 |
+
"eval_sts-test_spearman_max": 0.9071699146469111,
|
1175 |
+
"step": 110
|
1176 |
+
},
|
1177 |
+
{
|
1178 |
+
"epoch": 0.2172211350293542,
|
1179 |
+
"grad_norm": 5.0308518409729,
|
1180 |
+
"learning_rate": 8.671875e-06,
|
1181 |
+
"loss": 0.2203,
|
1182 |
+
"step": 111
|
1183 |
+
},
|
1184 |
+
{
|
1185 |
+
"epoch": 0.2191780821917808,
|
1186 |
+
"grad_norm": 4.501624584197998,
|
1187 |
+
"learning_rate": 8.750000000000001e-06,
|
1188 |
+
"loss": 0.2195,
|
1189 |
+
"step": 112
|
1190 |
+
},
|
1191 |
+
{
|
1192 |
+
"epoch": 0.22113502935420742,
|
1193 |
+
"grad_norm": 4.200097560882568,
|
1194 |
+
"learning_rate": 8.828125000000001e-06,
|
1195 |
+
"loss": 0.1823,
|
1196 |
+
"step": 113
|
1197 |
+
},
|
1198 |
+
{
|
1199 |
+
"epoch": 0.22309197651663404,
|
1200 |
+
"grad_norm": 3.6750545501708984,
|
1201 |
+
"learning_rate": 8.906250000000001e-06,
|
1202 |
+
"loss": 0.174,
|
1203 |
+
"step": 114
|
1204 |
+
},
|
1205 |
+
{
|
1206 |
+
"epoch": 0.22504892367906065,
|
1207 |
+
"grad_norm": 4.105295181274414,
|
1208 |
+
"learning_rate": 8.984375000000002e-06,
|
1209 |
+
"loss": 0.207,
|
1210 |
+
"step": 115
|
1211 |
+
},
|
1212 |
+
{
|
1213 |
+
"epoch": 0.22504892367906065,
|
1214 |
+
"eval_loss": 0.11745984107255936,
|
1215 |
+
"eval_runtime": 107.5979,
|
1216 |
+
"eval_samples_per_second": 28.365,
|
1217 |
+
"eval_steps_per_second": 0.223,
|
1218 |
+
"eval_sts-test_pearson_cosine": 0.882042560326929,
|
1219 |
+
"eval_sts-test_pearson_dot": 0.8653067979173212,
|
1220 |
+
"eval_sts-test_pearson_euclidean": 0.9095832495385563,
|
1221 |
+
"eval_sts-test_pearson_manhattan": 0.9103602950988618,
|
1222 |
+
"eval_sts-test_pearson_max": 0.9103602950988618,
|
1223 |
+
"eval_sts-test_spearman_cosine": 0.9068824772949942,
|
1224 |
+
"eval_sts-test_spearman_dot": 0.8714208617482668,
|
1225 |
+
"eval_sts-test_spearman_euclidean": 0.906395180809703,
|
1226 |
+
"eval_sts-test_spearman_manhattan": 0.9068741088091138,
|
1227 |
+
"eval_sts-test_spearman_max": 0.9068824772949942,
|
1228 |
+
"step": 115
|
1229 |
+
},
|
1230 |
+
{
|
1231 |
+
"epoch": 0.22700587084148727,
|
1232 |
+
"grad_norm": 4.654273509979248,
|
1233 |
+
"learning_rate": 9.0625e-06,
|
1234 |
+
"loss": 0.2156,
|
1235 |
+
"step": 116
|
1236 |
+
},
|
1237 |
+
{
|
1238 |
+
"epoch": 0.22896281800391388,
|
1239 |
+
"grad_norm": 4.661588191986084,
|
1240 |
+
"learning_rate": 9.140625e-06,
|
1241 |
+
"loss": 0.2202,
|
1242 |
+
"step": 117
|
1243 |
+
},
|
1244 |
+
{
|
1245 |
+
"epoch": 0.2309197651663405,
|
1246 |
+
"grad_norm": 5.366416931152344,
|
1247 |
+
"learning_rate": 9.21875e-06,
|
1248 |
+
"loss": 0.2718,
|
1249 |
+
"step": 118
|
1250 |
+
},
|
1251 |
+
{
|
1252 |
+
"epoch": 0.2328767123287671,
|
1253 |
+
"grad_norm": 3.672802448272705,
|
1254 |
+
"learning_rate": 9.296875e-06,
|
1255 |
+
"loss": 0.1387,
|
1256 |
+
"step": 119
|
1257 |
+
},
|
1258 |
+
{
|
1259 |
+
"epoch": 0.23483365949119372,
|
1260 |
+
"grad_norm": 3.7878501415252686,
|
1261 |
+
"learning_rate": 9.375000000000001e-06,
|
1262 |
+
"loss": 0.1506,
|
1263 |
+
"step": 120
|
1264 |
+
},
|
1265 |
+
{
|
1266 |
+
"epoch": 0.23483365949119372,
|
1267 |
+
"eval_loss": 0.11679373681545258,
|
1268 |
+
"eval_runtime": 107.6687,
|
1269 |
+
"eval_samples_per_second": 28.346,
|
1270 |
+
"eval_steps_per_second": 0.223,
|
1271 |
+
"eval_sts-test_pearson_cosine": 0.882107468031623,
|
1272 |
+
"eval_sts-test_pearson_dot": 0.8647556765462645,
|
1273 |
+
"eval_sts-test_pearson_euclidean": 0.9099443435071429,
|
1274 |
+
"eval_sts-test_pearson_manhattan": 0.9105934104125866,
|
1275 |
+
"eval_sts-test_pearson_max": 0.9105934104125866,
|
1276 |
+
"eval_sts-test_spearman_cosine": 0.9068624287298908,
|
1277 |
+
"eval_sts-test_spearman_dot": 0.8710628964083971,
|
1278 |
+
"eval_sts-test_spearman_euclidean": 0.906624531024334,
|
1279 |
+
"eval_sts-test_spearman_manhattan": 0.9069254385059298,
|
1280 |
+
"eval_sts-test_spearman_max": 0.9069254385059298,
|
1281 |
+
"step": 120
|
1282 |
+
},
|
1283 |
+
{
|
1284 |
+
"epoch": 0.23679060665362034,
|
1285 |
+
"grad_norm": 3.4761197566986084,
|
1286 |
+
"learning_rate": 9.453125000000001e-06,
|
1287 |
+
"loss": 0.1185,
|
1288 |
+
"step": 121
|
1289 |
+
},
|
1290 |
+
{
|
1291 |
+
"epoch": 0.23874755381604695,
|
1292 |
+
"grad_norm": 3.9917871952056885,
|
1293 |
+
"learning_rate": 9.531250000000001e-06,
|
1294 |
+
"loss": 0.1681,
|
1295 |
+
"step": 122
|
1296 |
+
},
|
1297 |
+
{
|
1298 |
+
"epoch": 0.24070450097847357,
|
1299 |
+
"grad_norm": 4.491674423217773,
|
1300 |
+
"learning_rate": 9.609375000000001e-06,
|
1301 |
+
"loss": 0.2321,
|
1302 |
+
"step": 123
|
1303 |
+
},
|
1304 |
+
{
|
1305 |
+
"epoch": 0.24266144814090018,
|
1306 |
+
"grad_norm": 3.903496503829956,
|
1307 |
+
"learning_rate": 9.6875e-06,
|
1308 |
+
"loss": 0.1457,
|
1309 |
+
"step": 124
|
1310 |
+
},
|
1311 |
+
{
|
1312 |
+
"epoch": 0.2446183953033268,
|
1313 |
+
"grad_norm": 5.046339988708496,
|
1314 |
+
"learning_rate": 9.765625e-06,
|
1315 |
+
"loss": 0.2027,
|
1316 |
+
"step": 125
|
1317 |
+
},
|
1318 |
+
{
|
1319 |
+
"epoch": 0.2446183953033268,
|
1320 |
+
"eval_loss": 0.11647585779428482,
|
1321 |
+
"eval_runtime": 107.5396,
|
1322 |
+
"eval_samples_per_second": 28.38,
|
1323 |
+
"eval_steps_per_second": 0.223,
|
1324 |
+
"eval_sts-test_pearson_cosine": 0.8824938293263067,
|
1325 |
+
"eval_sts-test_pearson_dot": 0.8653100788410637,
|
1326 |
+
"eval_sts-test_pearson_euclidean": 0.9104636052712812,
|
1327 |
+
"eval_sts-test_pearson_manhattan": 0.9109341151161342,
|
1328 |
+
"eval_sts-test_pearson_max": 0.9109341151161342,
|
1329 |
+
"eval_sts-test_spearman_cosine": 0.9070702535877924,
|
1330 |
+
"eval_sts-test_spearman_dot": 0.8716920543922986,
|
1331 |
+
"eval_sts-test_spearman_euclidean": 0.9070027239343528,
|
1332 |
+
"eval_sts-test_spearman_manhattan": 0.9073061822378479,
|
1333 |
+
"eval_sts-test_spearman_max": 0.9073061822378479,
|
1334 |
+
"step": 125
|
1335 |
+
},
|
1336 |
+
{
|
1337 |
+
"epoch": 0.2465753424657534,
|
1338 |
+
"grad_norm": 4.304446697235107,
|
1339 |
+
"learning_rate": 9.84375e-06,
|
1340 |
+
"loss": 0.1821,
|
1341 |
+
"step": 126
|
1342 |
+
},
|
1343 |
+
{
|
1344 |
+
"epoch": 0.24853228962818003,
|
1345 |
+
"grad_norm": 3.208357810974121,
|
1346 |
+
"learning_rate": 9.921875e-06,
|
1347 |
+
"loss": 0.1258,
|
1348 |
+
"step": 127
|
1349 |
+
},
|
1350 |
+
{
|
1351 |
+
"epoch": 0.25048923679060664,
|
1352 |
+
"grad_norm": 4.275379657745361,
|
1353 |
+
"learning_rate": 1e-05,
|
1354 |
+
"loss": 0.184,
|
1355 |
+
"step": 128
|
1356 |
+
},
|
1357 |
+
{
|
1358 |
+
"epoch": 0.25244618395303325,
|
1359 |
+
"grad_norm": 4.408608436584473,
|
1360 |
+
"learning_rate": 1.0078125000000001e-05,
|
1361 |
+
"loss": 0.2015,
|
1362 |
+
"step": 129
|
1363 |
+
},
|
1364 |
+
{
|
1365 |
+
"epoch": 0.25440313111545987,
|
1366 |
+
"grad_norm": 3.565253973007202,
|
1367 |
+
"learning_rate": 1.0156250000000001e-05,
|
1368 |
+
"loss": 0.1323,
|
1369 |
+
"step": 130
|
1370 |
+
},
|
1371 |
+
{
|
1372 |
+
"epoch": 0.25440313111545987,
|
1373 |
+
"eval_loss": 0.1154385656118393,
|
1374 |
+
"eval_runtime": 107.5442,
|
1375 |
+
"eval_samples_per_second": 28.379,
|
1376 |
+
"eval_steps_per_second": 0.223,
|
1377 |
+
"eval_sts-test_pearson_cosine": 0.8820850631122565,
|
1378 |
+
"eval_sts-test_pearson_dot": 0.8648589750662984,
|
1379 |
+
"eval_sts-test_pearson_euclidean": 0.9105884442785888,
|
1380 |
+
"eval_sts-test_pearson_manhattan": 0.9109040210291837,
|
1381 |
+
"eval_sts-test_pearson_max": 0.9109040210291837,
|
1382 |
+
"eval_sts-test_spearman_cosine": 0.9074317095260507,
|
1383 |
+
"eval_sts-test_spearman_dot": 0.8710452196601474,
|
1384 |
+
"eval_sts-test_spearman_euclidean": 0.9070635408985837,
|
1385 |
+
"eval_sts-test_spearman_manhattan": 0.9074422260724778,
|
1386 |
+
"eval_sts-test_spearman_max": 0.9074422260724778,
|
1387 |
+
"step": 130
|
1388 |
+
},
|
1389 |
+
{
|
1390 |
+
"epoch": 0.2563600782778865,
|
1391 |
+
"grad_norm": 4.261953353881836,
|
1392 |
+
"learning_rate": 1.0234375000000001e-05,
|
1393 |
+
"loss": 0.1939,
|
1394 |
+
"step": 131
|
1395 |
+
},
|
1396 |
+
{
|
1397 |
+
"epoch": 0.2583170254403131,
|
1398 |
+
"grad_norm": 3.806480646133423,
|
1399 |
+
"learning_rate": 1.0312500000000002e-05,
|
1400 |
+
"loss": 0.1428,
|
1401 |
+
"step": 132
|
1402 |
+
},
|
1403 |
+
{
|
1404 |
+
"epoch": 0.2602739726027397,
|
1405 |
+
"grad_norm": 2.824733257293701,
|
1406 |
+
"learning_rate": 1.0390625e-05,
|
1407 |
+
"loss": 0.1063,
|
1408 |
+
"step": 133
|
1409 |
+
},
|
1410 |
+
{
|
1411 |
+
"epoch": 0.2622309197651663,
|
1412 |
+
"grad_norm": 4.076455116271973,
|
1413 |
+
"learning_rate": 1.046875e-05,
|
1414 |
+
"loss": 0.1602,
|
1415 |
+
"step": 134
|
1416 |
+
},
|
1417 |
+
{
|
1418 |
+
"epoch": 0.26418786692759294,
|
1419 |
+
"grad_norm": 3.7571659088134766,
|
1420 |
+
"learning_rate": 1.0546875e-05,
|
1421 |
+
"loss": 0.1814,
|
1422 |
+
"step": 135
|
1423 |
+
},
|
1424 |
+
{
|
1425 |
+
"epoch": 0.26418786692759294,
|
1426 |
+
"eval_loss": 0.11387230455875397,
|
1427 |
+
"eval_runtime": 107.5968,
|
1428 |
+
"eval_samples_per_second": 28.365,
|
1429 |
+
"eval_steps_per_second": 0.223,
|
1430 |
+
"eval_sts-test_pearson_cosine": 0.8812889193869892,
|
1431 |
+
"eval_sts-test_pearson_dot": 0.8634898982579755,
|
1432 |
+
"eval_sts-test_pearson_euclidean": 0.9104977472627025,
|
1433 |
+
"eval_sts-test_pearson_manhattan": 0.9107178140804983,
|
1434 |
+
"eval_sts-test_pearson_max": 0.9107178140804983,
|
1435 |
+
"eval_sts-test_spearman_cosine": 0.9066986391131981,
|
1436 |
+
"eval_sts-test_spearman_dot": 0.870129116588204,
|
1437 |
+
"eval_sts-test_spearman_euclidean": 0.9070359293703052,
|
1438 |
+
"eval_sts-test_spearman_manhattan": 0.9073414909830857,
|
1439 |
+
"eval_sts-test_spearman_max": 0.9073414909830857,
|
1440 |
+
"step": 135
|
1441 |
+
},
|
1442 |
+
{
|
1443 |
+
"epoch": 0.26614481409001955,
|
1444 |
+
"grad_norm": 3.864948034286499,
|
1445 |
+
"learning_rate": 1.0625e-05,
|
1446 |
+
"loss": 0.1518,
|
1447 |
+
"step": 136
|
1448 |
+
},
|
1449 |
+
{
|
1450 |
+
"epoch": 0.26810176125244617,
|
1451 |
+
"grad_norm": 3.5900001525878906,
|
1452 |
+
"learning_rate": 1.0703125000000001e-05,
|
1453 |
+
"loss": 0.1379,
|
1454 |
+
"step": 137
|
1455 |
+
},
|
1456 |
+
{
|
1457 |
+
"epoch": 0.2700587084148728,
|
1458 |
+
"grad_norm": 4.291954517364502,
|
1459 |
+
"learning_rate": 1.0781250000000001e-05,
|
1460 |
+
"loss": 0.1708,
|
1461 |
+
"step": 138
|
1462 |
+
},
|
1463 |
+
{
|
1464 |
+
"epoch": 0.2720156555772994,
|
1465 |
+
"grad_norm": 3.8340342044830322,
|
1466 |
+
"learning_rate": 1.0859375000000001e-05,
|
1467 |
+
"loss": 0.2046,
|
1468 |
+
"step": 139
|
1469 |
+
},
|
1470 |
+
{
|
1471 |
+
"epoch": 0.273972602739726,
|
1472 |
+
"grad_norm": 3.749396562576294,
|
1473 |
+
"learning_rate": 1.0937500000000002e-05,
|
1474 |
+
"loss": 0.1259,
|
1475 |
+
"step": 140
|
1476 |
+
},
|
1477 |
+
{
|
1478 |
+
"epoch": 0.273972602739726,
|
1479 |
+
"eval_loss": 0.1124362125992775,
|
1480 |
+
"eval_runtime": 107.5142,
|
1481 |
+
"eval_samples_per_second": 28.387,
|
1482 |
+
"eval_steps_per_second": 0.223,
|
1483 |
+
"eval_sts-test_pearson_cosine": 0.8805714116282963,
|
1484 |
+
"eval_sts-test_pearson_dot": 0.8618911680351633,
|
1485 |
+
"eval_sts-test_pearson_euclidean": 0.9102979980912764,
|
1486 |
+
"eval_sts-test_pearson_manhattan": 0.9105232760600299,
|
1487 |
+
"eval_sts-test_pearson_max": 0.9105232760600299,
|
1488 |
+
"eval_sts-test_spearman_cosine": 0.9063180743863257,
|
1489 |
+
"eval_sts-test_spearman_dot": 0.8687826406354595,
|
1490 |
+
"eval_sts-test_spearman_euclidean": 0.9070556199253175,
|
1491 |
+
"eval_sts-test_spearman_manhattan": 0.9073570196707885,
|
1492 |
+
"eval_sts-test_spearman_max": 0.9073570196707885,
|
1493 |
+
"step": 140
|
1494 |
+
},
|
    {
      "epoch": 0.2759295499021526,
      "grad_norm": 2.8815276622772217,
      "learning_rate": 1.1015625e-05,
      "loss": 0.1181,
      "step": 141
    },
    {
      "epoch": 0.27788649706457924,
      "grad_norm": 3.766554355621338,
      "learning_rate": 1.109375e-05,
      "loss": 0.2144,
      "step": 142
    },
    {
      "epoch": 0.27984344422700586,
      "grad_norm": 4.289268493652344,
      "learning_rate": 1.1171875e-05,
      "loss": 0.1822,
      "step": 143
    },
    {
      "epoch": 0.28180039138943247,
      "grad_norm": 3.9036617279052734,
      "learning_rate": 1.125e-05,
      "loss": 0.1667,
      "step": 144
    },
    {
      "epoch": 0.2837573385518591,
      "grad_norm": 3.321366786956787,
      "learning_rate": 1.1328125000000001e-05,
      "loss": 0.0779,
      "step": 145
    },
    {
      "epoch": 0.2837573385518591,
      "eval_loss": 0.1118142157793045,
      "eval_runtime": 107.3173,
      "eval_samples_per_second": 28.439,
      "eval_steps_per_second": 0.224,
      "eval_sts-test_pearson_cosine": 0.8796044904115364,
      "eval_sts-test_pearson_dot": 0.8607678603166254,
      "eval_sts-test_pearson_euclidean": 0.9097479995877322,
      "eval_sts-test_pearson_manhattan": 0.9098650580518599,
      "eval_sts-test_pearson_max": 0.9098650580518599,
      "eval_sts-test_spearman_cosine": 0.9059690592987342,
      "eval_sts-test_spearman_dot": 0.8685229490656053,
      "eval_sts-test_spearman_euclidean": 0.90680836920613,
      "eval_sts-test_spearman_manhattan": 0.9069437865231001,
      "eval_sts-test_spearman_max": 0.9069437865231001,
      "step": 145
    },
    {
      "epoch": 0.2857142857142857,
      "grad_norm": 3.460301160812378,
      "learning_rate": 1.1406250000000001e-05,
      "loss": 0.147,
      "step": 146
    },
    {
      "epoch": 0.2876712328767123,
      "grad_norm": 3.8999266624450684,
      "learning_rate": 1.1484375000000001e-05,
      "loss": 0.1913,
      "step": 147
    },
    {
      "epoch": 0.2896281800391389,
      "grad_norm": 3.539788007736206,
      "learning_rate": 1.1562500000000002e-05,
      "loss": 0.1357,
      "step": 148
    },
    {
      "epoch": 0.29158512720156554,
      "grad_norm": 3.499439001083374,
      "learning_rate": 1.1640625000000002e-05,
      "loss": 0.1128,
      "step": 149
    },
    {
      "epoch": 0.29354207436399216,
      "grad_norm": 3.2960240840911865,
      "learning_rate": 1.171875e-05,
      "loss": 0.0996,
      "step": 150
    },
    {
      "epoch": 0.29354207436399216,
      "eval_loss": 0.11132737249135971,
      "eval_runtime": 107.5867,
      "eval_samples_per_second": 28.368,
      "eval_steps_per_second": 0.223,
      "eval_sts-test_pearson_cosine": 0.8787852416493207,
      "eval_sts-test_pearson_dot": 0.8593025559452621,
      "eval_sts-test_pearson_euclidean": 0.9091617970047303,
      "eval_sts-test_pearson_manhattan": 0.9091664157178929,
      "eval_sts-test_pearson_max": 0.9091664157178929,
      "eval_sts-test_spearman_cosine": 0.9054375485671886,
      "eval_sts-test_spearman_dot": 0.867029912731804,
      "eval_sts-test_spearman_euclidean": 0.9062253050214613,
      "eval_sts-test_spearman_manhattan": 0.9062610165280517,
      "eval_sts-test_spearman_max": 0.9062610165280517,
      "step": 150
    },
    {
      "epoch": 0.29549902152641877,
      "grad_norm": 4.271719932556152,
      "learning_rate": 1.1796875e-05,
      "loss": 0.1956,
      "step": 151
    },
    {
      "epoch": 0.2974559686888454,
      "grad_norm": 3.168663501739502,
      "learning_rate": 1.1875e-05,
      "loss": 0.0942,
      "step": 152
    },
    {
      "epoch": 0.299412915851272,
      "grad_norm": 3.816993236541748,
      "learning_rate": 1.1953125000000001e-05,
      "loss": 0.1406,
      "step": 153
    },
    {
      "epoch": 0.3013698630136986,
      "grad_norm": 5.383023738861084,
      "learning_rate": 1.2031250000000001e-05,
      "loss": 0.2868,
      "step": 154
    },
    {
      "epoch": 0.30332681017612523,
      "grad_norm": 3.123462677001953,
      "learning_rate": 1.2109375000000001e-05,
      "loss": 0.1102,
      "step": 155
    },
    {
      "epoch": 0.30332681017612523,
      "eval_loss": 0.11142811924219131,
      "eval_runtime": 107.3019,
      "eval_samples_per_second": 28.443,
      "eval_steps_per_second": 0.224,
      "eval_sts-test_pearson_cosine": 0.8780761726881443,
      "eval_sts-test_pearson_dot": 0.8581767032057357,
      "eval_sts-test_pearson_euclidean": 0.9081534036571242,
      "eval_sts-test_pearson_manhattan": 0.9081724370385316,
      "eval_sts-test_pearson_max": 0.9081724370385316,
      "eval_sts-test_spearman_cosine": 0.9048428490545583,
      "eval_sts-test_spearman_dot": 0.8670075818523697,
      "eval_sts-test_spearman_euclidean": 0.9052714766361651,
      "eval_sts-test_spearman_manhattan": 0.9054467225757737,
      "eval_sts-test_spearman_max": 0.9054467225757737,
      "step": 155
    },
    {
      "epoch": 0.30528375733855184,
      "grad_norm": 4.1034979820251465,
      "learning_rate": 1.2187500000000001e-05,
      "loss": 0.1659,
      "step": 156
    },
    {
      "epoch": 0.30724070450097846,
      "grad_norm": 3.60249400138855,
      "learning_rate": 1.2265625000000002e-05,
      "loss": 0.1645,
      "step": 157
    },
    {
      "epoch": 0.30919765166340507,
      "grad_norm": 3.771853446960449,
      "learning_rate": 1.234375e-05,
      "loss": 0.151,
      "step": 158
    },
    {
      "epoch": 0.3111545988258317,
      "grad_norm": 4.291686058044434,
      "learning_rate": 1.2421875e-05,
      "loss": 0.158,
      "step": 159
    },
    {
      "epoch": 0.3131115459882583,
      "grad_norm": 5.1689453125,
      "learning_rate": 1.25e-05,
      "loss": 0.2323,
      "step": 160
    },
    {
      "epoch": 0.3131115459882583,
      "eval_loss": 0.11126424372196198,
      "eval_runtime": 107.301,
      "eval_samples_per_second": 28.443,
      "eval_steps_per_second": 0.224,
      "eval_sts-test_pearson_cosine": 0.8777597983330929,
      "eval_sts-test_pearson_dot": 0.8577739588604719,
      "eval_sts-test_pearson_euclidean": 0.9075483317216817,
      "eval_sts-test_pearson_manhattan": 0.9075908461381532,
      "eval_sts-test_pearson_max": 0.9075908461381532,
      "eval_sts-test_spearman_cosine": 0.9047649818597372,
      "eval_sts-test_spearman_dot": 0.867389712873391,
      "eval_sts-test_spearman_euclidean": 0.9048189966322366,
      "eval_sts-test_spearman_manhattan": 0.9049692713679889,
      "eval_sts-test_spearman_max": 0.9049692713679889,
      "step": 160
    },
    {
      "epoch": 0.3150684931506849,
      "grad_norm": 3.304703712463379,
      "learning_rate": 1.2578125e-05,
      "loss": 0.1157,
      "step": 161
    },
    {
      "epoch": 0.31702544031311153,
      "grad_norm": 4.064731121063232,
      "learning_rate": 1.2656250000000001e-05,
      "loss": 0.1507,
      "step": 162
    },
    {
      "epoch": 0.31898238747553814,
      "grad_norm": 4.615545749664307,
      "learning_rate": 1.2734375000000001e-05,
      "loss": 0.1879,
      "step": 163
    },
    {
      "epoch": 0.32093933463796476,
      "grad_norm": 3.767533540725708,
      "learning_rate": 1.2812500000000001e-05,
      "loss": 0.143,
      "step": 164
    },
    {
      "epoch": 0.32289628180039137,
      "grad_norm": 4.727967262268066,
      "learning_rate": 1.2890625000000002e-05,
      "loss": 0.2227,
      "step": 165
    },
    {
      "epoch": 0.32289628180039137,
      "eval_loss": 0.11155427247285843,
      "eval_runtime": 107.2898,
      "eval_samples_per_second": 28.446,
      "eval_steps_per_second": 0.224,
      "eval_sts-test_pearson_cosine": 0.8775899700998113,
      "eval_sts-test_pearson_dot": 0.8571711542435376,
      "eval_sts-test_pearson_euclidean": 0.907399950708088,
      "eval_sts-test_pearson_manhattan": 0.9073879045697356,
      "eval_sts-test_pearson_max": 0.907399950708088,
      "eval_sts-test_spearman_cosine": 0.9049959431197784,
      "eval_sts-test_spearman_dot": 0.8667648957618442,
      "eval_sts-test_spearman_euclidean": 0.9048916279294749,
      "eval_sts-test_spearman_manhattan": 0.9050786882020909,
      "eval_sts-test_spearman_max": 0.9050786882020909,
      "step": 165
    },
    {
      "epoch": 0.324853228962818,
      "grad_norm": 4.0150017738342285,
      "learning_rate": 1.2968750000000002e-05,
      "loss": 0.1624,
      "step": 166
    },
    {
      "epoch": 0.3268101761252446,
      "grad_norm": 3.021153450012207,
      "learning_rate": 1.3046875e-05,
      "loss": 0.1345,
      "step": 167
    },
    {
      "epoch": 0.3287671232876712,
      "grad_norm": 3.869710922241211,
      "learning_rate": 1.3125e-05,
      "loss": 0.1765,
      "step": 168
    },
    {
      "epoch": 0.33072407045009783,
      "grad_norm": 3.538076162338257,
      "learning_rate": 1.3203125e-05,
      "loss": 0.1368,
      "step": 169
    },
    {
      "epoch": 0.33268101761252444,
      "grad_norm": 3.378551483154297,
      "learning_rate": 1.3281250000000001e-05,
      "loss": 0.0962,
      "step": 170
    },
    {
      "epoch": 0.33268101761252444,
      "eval_loss": 0.11131894588470459,
      "eval_runtime": 107.3532,
      "eval_samples_per_second": 28.43,
      "eval_steps_per_second": 0.224,
      "eval_sts-test_pearson_cosine": 0.8782576778514848,
      "eval_sts-test_pearson_dot": 0.8576530243239538,
      "eval_sts-test_pearson_euclidean": 0.9077401564122008,
      "eval_sts-test_pearson_manhattan": 0.907609849534313,
      "eval_sts-test_pearson_max": 0.9077401564122008,
      "eval_sts-test_spearman_cosine": 0.9055560946586144,
      "eval_sts-test_spearman_dot": 0.8666707838591381,
      "eval_sts-test_spearman_euclidean": 0.9054064016892602,
      "eval_sts-test_spearman_manhattan": 0.9054834186101147,
      "eval_sts-test_spearman_max": 0.9055560946586144,
      "step": 170
    },
    {
      "epoch": 0.33463796477495106,
      "grad_norm": 4.588249683380127,
      "learning_rate": 1.3359375000000001e-05,
      "loss": 0.1783,
      "step": 171
    },
    {
      "epoch": 0.33659491193737767,
      "grad_norm": 4.370199680328369,
      "learning_rate": 1.3437500000000001e-05,
      "loss": 0.2019,
      "step": 172
    },
    {
      "epoch": 0.3385518590998043,
      "grad_norm": 4.000157356262207,
      "learning_rate": 1.3515625000000002e-05,
      "loss": 0.1761,
      "step": 173
    },
    {
      "epoch": 0.3405088062622309,
      "grad_norm": 4.3335862159729,
      "learning_rate": 1.3593750000000002e-05,
      "loss": 0.1855,
      "step": 174
    },
    {
      "epoch": 0.3424657534246575,
      "grad_norm": 4.247244358062744,
      "learning_rate": 1.3671875e-05,
      "loss": 0.1922,
      "step": 175
    },
    {
      "epoch": 0.3424657534246575,
      "eval_loss": 0.1105586364865303,
      "eval_runtime": 107.3507,
      "eval_samples_per_second": 28.43,
      "eval_steps_per_second": 0.224,
      "eval_sts-test_pearson_cosine": 0.8775475016000731,
      "eval_sts-test_pearson_dot": 0.8543732981082479,
      "eval_sts-test_pearson_euclidean": 0.9076643456809551,
      "eval_sts-test_pearson_manhattan": 0.9075054089199206,
      "eval_sts-test_pearson_max": 0.9076643456809551,
      "eval_sts-test_spearman_cosine": 0.905357578063082,
      "eval_sts-test_spearman_dot": 0.8628476388472094,
      "eval_sts-test_spearman_euclidean": 0.9054710672619708,
      "eval_sts-test_spearman_manhattan": 0.9055309444497123,
      "eval_sts-test_spearman_max": 0.9055309444497123,
      "step": 175
    },
    {
      "epoch": 0.34442270058708413,
      "grad_norm": 3.881108522415161,
      "learning_rate": 1.375e-05,
      "loss": 0.1538,
      "step": 176
    },
    {
      "epoch": 0.34637964774951074,
      "grad_norm": 3.4271416664123535,
      "learning_rate": 1.3828125e-05,
      "loss": 0.1049,
      "step": 177
    },
    {
      "epoch": 0.34833659491193736,
      "grad_norm": 3.7847940921783447,
      "learning_rate": 1.3906250000000001e-05,
      "loss": 0.1619,
      "step": 178
    },
    {
      "epoch": 0.350293542074364,
      "grad_norm": 2.3725311756134033,
      "learning_rate": 1.3984375000000001e-05,
      "loss": 0.0731,
      "step": 179
    },
    {
      "epoch": 0.3522504892367906,
      "grad_norm": 3.6820032596588135,
      "learning_rate": 1.4062500000000001e-05,
      "loss": 0.1205,
      "step": 180
    },
    {
      "epoch": 0.3522504892367906,
      "eval_loss": 0.10974939167499542,
      "eval_runtime": 107.353,
      "eval_samples_per_second": 28.43,
      "eval_steps_per_second": 0.224,
      "eval_sts-test_pearson_cosine": 0.8782123578217031,
      "eval_sts-test_pearson_dot": 0.852106566478191,
      "eval_sts-test_pearson_euclidean": 0.9088860377565003,
      "eval_sts-test_pearson_manhattan": 0.9087269620613702,
      "eval_sts-test_pearson_max": 0.9088860377565003,
      "eval_sts-test_spearman_cosine": 0.9058966517578029,
      "eval_sts-test_spearman_dot": 0.8595467858069799,
      "eval_sts-test_spearman_euclidean": 0.9064047128283795,
      "eval_sts-test_spearman_manhattan": 0.9067846510375924,
      "eval_sts-test_spearman_max": 0.9067846510375924,
      "step": 180
    },
    {
      "epoch": 0.3542074363992172,
      "grad_norm": 3.7714688777923584,
      "learning_rate": 1.4140625000000002e-05,
      "loss": 0.169,
      "step": 181
    },
    {
      "epoch": 0.3561643835616438,
      "grad_norm": 3.7113559246063232,
      "learning_rate": 1.4218750000000002e-05,
      "loss": 0.1688,
      "step": 182
    },
    {
      "epoch": 0.35812133072407043,
      "grad_norm": 3.1639597415924072,
      "learning_rate": 1.4296875000000002e-05,
      "loss": 0.1274,
      "step": 183
    },
    {
      "epoch": 0.36007827788649704,
      "grad_norm": 4.144288539886475,
      "learning_rate": 1.4375e-05,
      "loss": 0.1477,
      "step": 184
    },
    {
      "epoch": 0.36203522504892366,
      "grad_norm": 3.4342098236083984,
      "learning_rate": 1.4453125e-05,
      "loss": 0.1418,
      "step": 185
    },
    {
      "epoch": 0.36203522504892366,
      "eval_loss": 0.10942607372999191,
      "eval_runtime": 107.2679,
      "eval_samples_per_second": 28.452,
      "eval_steps_per_second": 0.224,
      "eval_sts-test_pearson_cosine": 0.8778855142398189,
      "eval_sts-test_pearson_dot": 0.8501658695420333,
      "eval_sts-test_pearson_euclidean": 0.9088432870055996,
      "eval_sts-test_pearson_manhattan": 0.9086435133118579,
      "eval_sts-test_pearson_max": 0.9088432870055996,
      "eval_sts-test_spearman_cosine": 0.9055185931015683,
      "eval_sts-test_spearman_dot": 0.8575025481866207,
      "eval_sts-test_spearman_euclidean": 0.9063994321795352,
      "eval_sts-test_spearman_manhattan": 0.9064969899293684,
      "eval_sts-test_spearman_max": 0.9064969899293684,
      "step": 185
    },
    {
      "epoch": 0.3639921722113503,
      "grad_norm": 4.744626045227051,
      "learning_rate": 1.453125e-05,
      "loss": 0.2477,
      "step": 186
    },
    {
      "epoch": 0.3659491193737769,
      "grad_norm": 4.062248229980469,
      "learning_rate": 1.4609375000000001e-05,
      "loss": 0.1713,
      "step": 187
    },
    {
      "epoch": 0.3679060665362035,
      "grad_norm": 3.989694833755493,
      "learning_rate": 1.4687500000000001e-05,
      "loss": 0.1703,
      "step": 188
    },
    {
      "epoch": 0.3698630136986301,
      "grad_norm": 3.3543660640716553,
      "learning_rate": 1.4765625000000001e-05,
      "loss": 0.1176,
      "step": 189
    },
    {
      "epoch": 0.37181996086105673,
      "grad_norm": 4.307045936584473,
      "learning_rate": 1.4843750000000002e-05,
      "loss": 0.1811,
      "step": 190
    },
    {
      "epoch": 0.37181996086105673,
      "eval_loss": 0.10837770998477936,
      "eval_runtime": 107.3429,
      "eval_samples_per_second": 28.432,
      "eval_steps_per_second": 0.224,
      "eval_sts-test_pearson_cosine": 0.8774103555789884,
      "eval_sts-test_pearson_dot": 0.84942827650618,
      "eval_sts-test_pearson_euclidean": 0.9086430009253119,
      "eval_sts-test_pearson_manhattan": 0.9084642534632353,
      "eval_sts-test_pearson_max": 0.9086430009253119,
      "eval_sts-test_spearman_cosine": 0.9048482639571866,
      "eval_sts-test_spearman_dot": 0.8562155914115267,
      "eval_sts-test_spearman_euclidean": 0.9060070531196555,
      "eval_sts-test_spearman_manhattan": 0.9061608184537963,
      "eval_sts-test_spearman_max": 0.9061608184537963,
      "step": 190
    },
    {
      "epoch": 0.37377690802348335,
      "grad_norm": 4.140930652618408,
      "learning_rate": 1.4921875000000002e-05,
      "loss": 0.162,
      "step": 191
    },
    {
      "epoch": 0.37573385518590996,
      "grad_norm": 2.7555642127990723,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 0.1141,
      "step": 192
    },
    {
      "epoch": 0.3776908023483366,
      "grad_norm": 4.070343017578125,
      "learning_rate": 1.5078125e-05,
      "loss": 0.154,
      "step": 193
    },
    {
      "epoch": 0.3796477495107632,
      "grad_norm": 4.453440189361572,
      "learning_rate": 1.515625e-05,
      "loss": 0.2461,
      "step": 194
    },
    {
      "epoch": 0.3816046966731898,
      "grad_norm": 3.7656772136688232,
      "learning_rate": 1.5234375000000001e-05,
      "loss": 0.1573,
      "step": 195
    },
    {
      "epoch": 0.3816046966731898,
      "eval_loss": 0.10762027651071548,
      "eval_runtime": 107.299,
      "eval_samples_per_second": 28.444,
      "eval_steps_per_second": 0.224,
      "eval_sts-test_pearson_cosine": 0.8779461080888007,
      "eval_sts-test_pearson_dot": 0.8521074278329072,
      "eval_sts-test_pearson_euclidean": 0.9087045359990432,
      "eval_sts-test_pearson_manhattan": 0.9086340705654771,
      "eval_sts-test_pearson_max": 0.9087045359990432,
      "eval_sts-test_spearman_cosine": 0.9045706718827756,
      "eval_sts-test_spearman_dot": 0.8584340456924826,
      "eval_sts-test_spearman_euclidean": 0.9055143864829975,
      "eval_sts-test_spearman_manhattan": 0.9058283613329196,
      "eval_sts-test_spearman_max": 0.9058283613329196,
      "step": 195
    },
    {
      "epoch": 0.3835616438356164,
      "grad_norm": 3.063400983810425,
      "learning_rate": 1.5312500000000003e-05,
      "loss": 0.1197,
      "step": 196
    },
    {
      "epoch": 0.38551859099804303,
      "grad_norm": 3.893153429031372,
      "learning_rate": 1.5390625e-05,
      "loss": 0.1395,
      "step": 197
    },
    {
      "epoch": 0.38747553816046965,
      "grad_norm": 2.95540714263916,
      "learning_rate": 1.546875e-05,
      "loss": 0.0847,
      "step": 198
    },
    {
      "epoch": 0.38943248532289626,
      "grad_norm": 3.4665300846099854,
      "learning_rate": 1.5546875e-05,
      "loss": 0.1848,
      "step": 199
    },
    {
      "epoch": 0.3913894324853229,
      "grad_norm": 3.6926543712615967,
      "learning_rate": 1.5625e-05,
      "loss": 0.1377,
      "step": 200
    },
    {
      "epoch": 0.3913894324853229,
      "eval_loss": 0.10723523795604706,
      "eval_runtime": 107.245,
      "eval_samples_per_second": 28.458,
      "eval_steps_per_second": 0.224,
      "eval_sts-test_pearson_cosine": 0.877994665901344,
      "eval_sts-test_pearson_dot": 0.854134605280733,
      "eval_sts-test_pearson_euclidean": 0.9085191117850383,
      "eval_sts-test_pearson_manhattan": 0.9086424100414001,
      "eval_sts-test_pearson_max": 0.9086424100414001,
      "eval_sts-test_spearman_cosine": 0.904685279863199,
      "eval_sts-test_spearman_dot": 0.8598855528557127,
      "eval_sts-test_spearman_euclidean": 0.9052407772708506,
      "eval_sts-test_spearman_manhattan": 0.9058868959828196,
      "eval_sts-test_spearman_max": 0.9058868959828196,
      "step": 200
    },
    {
      "epoch": 0.3933463796477495,
      "grad_norm": 3.303112268447876,
      "learning_rate": 1.5703125e-05,
      "loss": 0.1109,
      "step": 201
    },
    {
      "epoch": 0.3953033268101761,
      "grad_norm": 3.4490058422088623,
      "learning_rate": 1.578125e-05,
      "loss": 0.1051,
      "step": 202
    },
    {
      "epoch": 0.3972602739726027,
      "grad_norm": 2.6598286628723145,
      "learning_rate": 1.5859375e-05,
      "loss": 0.0975,
      "step": 203
    },
    {
      "epoch": 0.39921722113502933,
      "grad_norm": 3.373512029647827,
      "learning_rate": 1.59375e-05,
      "loss": 0.127,
      "step": 204
    },
    {
      "epoch": 0.40117416829745595,
      "grad_norm": 3.1471354961395264,
      "learning_rate": 1.6015625e-05,
      "loss": 0.1297,
      "step": 205
    },
    {
      "epoch": 0.40117416829745595,
      "eval_loss": 0.10685314983129501,
      "eval_runtime": 107.3321,
      "eval_samples_per_second": 28.435,
      "eval_steps_per_second": 0.224,
      "eval_sts-test_pearson_cosine": 0.8785914848590666,
      "eval_sts-test_pearson_dot": 0.8570818659891223,
      "eval_sts-test_pearson_euclidean": 0.9086611488562145,
      "eval_sts-test_pearson_manhattan": 0.9087606701935215,
      "eval_sts-test_pearson_max": 0.9087606701935215,
      "eval_sts-test_spearman_cosine": 0.9048987433800361,
      "eval_sts-test_spearman_dot": 0.8616398023022556,
      "eval_sts-test_spearman_euclidean": 0.9052247563192726,
      "eval_sts-test_spearman_manhattan": 0.9056138237858093,
      "eval_sts-test_spearman_max": 0.9056138237858093,
      "step": 205
    },
    {
      "epoch": 0.40313111545988256,
      "grad_norm": 2.6924684047698975,
      "learning_rate": 1.609375e-05,
      "loss": 0.0783,
      "step": 206
    }
  ],
  "logging_steps": 1,
  "max_steps": 1022,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 103,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 320,
  "trial_name": null,
  "trial_params": null
}
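The entries above follow the usual `trainer_state.json` layout written by the Hugging Face `Trainer`: a `log_history` list that mixes per-step training records (`loss`, `grad_norm`, `learning_rate`, `step`) with periodic evaluation records (`eval_loss` plus the STS-test correlation metrics every 5 steps). As a minimal sketch of how one might pull the evaluation curve out of this checkpoint (the file path and metric keys come from the dump above; everything else is an illustrative assumption):

```python
import json

# Assumes the directory layout shown in this commit.
STATE_FILE = "checkpoint-206/trainer_state.json"

with open(STATE_FILE) as f:
    state = json.load(f)

# Evaluation records are the log_history entries that carry "eval_loss".
eval_records = [r for r in state["log_history"] if "eval_loss" in r]

for r in eval_records:
    print(
        f"step {r['step']:>4}  "
        f"eval_loss {r['eval_loss']:.4f}  "
        f"spearman_cosine {r['eval_sts-test_spearman_cosine']:.4f}"
    )
```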
checkpoint-206/training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8a4ab13b16ebf2fbc29de19d38dcf8ae372111604b2fa45007db3f5dfb73ba23
size 5688
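The three lines above are a Git LFS pointer, not the binary itself; the 5.7 kB `training_args.bin` is only materialized when the repository is cloned with Git LFS enabled. Once fetched, the file is a pickled `transformers.TrainingArguments` object, so it can be inspected roughly as follows (a hedged sketch, assuming a local clone and a recent PyTorch where `weights_only` defaults to true for `torch.load`):

```python
import torch

# training_args.bin stores a pickled TrainingArguments object, not tensors,
# so weights_only=False is required on newer PyTorch versions.
args = torch.load("checkpoint-206/training_args.bin", weights_only=False)

print(args.per_device_train_batch_size)
print(args.learning_rate)
print(args.num_train_epochs)
```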