tomaarsen's picture
tomaarsen HF staff
Add new SentenceTransformer model.
0da87e5 verified
|
raw
history blame
49.3 kB
metadata
language:
  - en
library_name: sentence-transformers
tags:
  - sentence-transformers
  - sentence-similarity
  - feature-extraction
  - loss:MatryoshkaLoss
  - loss:MultipleNegativesRankingLoss
base_model: distilbert/distilroberta-base
metrics:
  - pearson_cosine
  - spearman_cosine
  - pearson_manhattan
  - spearman_manhattan
  - pearson_euclidean
  - spearman_euclidean
  - pearson_dot
  - spearman_dot
  - pearson_max
  - spearman_max
widget:
  - source_sentence: The gate is yellow.
    sentences:
      - A yellow dog is playing in the snow.
      - A turtle walks over the ground.
      - Three men are on stage playing guitars.
  - source_sentence: A woman is reading.
    sentences:
      - A woman is writing something.
      - A tiger walks around aimlessly.
      - Gunmen 'kill 10 tourists' in Kashmir
  - source_sentence: A man jumping rope
    sentences:
      - A man is climbing a rope.
      - Bombings kill 19 people in Iraq
      - Kittens are eating from dishes.
  - source_sentence: A baby is laughing.
    sentences:
      - A baby is crawling happily.
      - Kittens are eating from dishes.
      - SFG meeting reviews situation in Mali
  - source_sentence: A man shoots a man.
    sentences:
      - A man is shooting off guns.
      - A man is erasing a chalk board.
      - A girl is riding a bicycle.
pipeline_tag: sentence-similarity
# Training carbon footprint, as measured by the tool named in `source`.
# Units below follow the "Environmental Impact" / "Training Hardware"
# sections of this card, which report the same figures rounded.
co2_eq_emissions:
  # grams of CO2 (reported as 0.134 kg in the card body)
  emissions: 134.46101750442273
  # kWh
  energy_consumed: 0.34592314293320514
  source: codecarbon
  training_type: fine-tuning
  on_cloud: false
  cpu_model: 13th Gen Intel(R) Core(TM) i7-13700K
  # GB
  ram_total_size: 31.777088165283203
  # hours
  hours_used: 1.296
  hardware_used: 1 x NVIDIA GeForce RTX 3090
model-index:
  - name: SentenceTransformer based on distilbert/distilroberta-base
    results:
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 768
          type: sts-dev-768
        metrics:
          - type: pearson_cosine
            value: 0.8481251400932781
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.851870210632031
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8393267568646925
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8384807951588668
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8409860761844343
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8402437232149903
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.778375740024104
            name: Pearson Dot
          - type: spearman_dot
            value: 0.7779671330832745
            name: Spearman Dot
          - type: pearson_max
            value: 0.8481251400932781
            name: Pearson Max
          - type: spearman_max
            value: 0.851870210632031
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 512
          type: sts-dev-512
        metrics:
          - type: pearson_cosine
            value: 0.8481027005283404
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8523762836460506
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8386304289845581
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8377488866945335
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8402060724091132
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8394674780683281
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.7711669414347555
            name: Pearson Dot
          - type: spearman_dot
            value: 0.7713442697629354
            name: Spearman Dot
          - type: pearson_max
            value: 0.8481027005283404
            name: Pearson Max
          - type: spearman_max
            value: 0.8523762836460506
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 256
          type: sts-dev-256
        metrics:
          - type: pearson_cosine
            value: 0.842129976172463
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8488334736505414
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8313278330554295
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8315716535622544
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8333448222091957
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8335338271135746
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.7445817504026263
            name: Pearson Dot
          - type: spearman_dot
            value: 0.7450058498333884
            name: Spearman Dot
          - type: pearson_max
            value: 0.842129976172463
            name: Pearson Max
          - type: spearman_max
            value: 0.8488334736505414
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 128
          type: sts-dev-128
        metrics:
          - type: pearson_cosine
            value: 0.8346971467711455
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8445473333837453
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8240728025222037
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8248062249521573
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8254381823447683
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8261820268848477
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.7083986436033697
            name: Pearson Dot
          - type: spearman_dot
            value: 0.7093343189476312
            name: Spearman Dot
          - type: pearson_max
            value: 0.8346971467711455
            name: Pearson Max
          - type: spearman_max
            value: 0.8445473333837453
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 64
          type: sts-dev-64
        metrics:
          - type: pearson_cosine
            value: 0.8201235619233855
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8352180907883887
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8032422421113089
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8047180797117756
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8059536263441476
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8072309964597537
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.6360301824635421
            name: Pearson Dot
          - type: spearman_dot
            value: 0.6388601952951507
            name: Spearman Dot
          - type: pearson_max
            value: 0.8201235619233855
            name: Pearson Max
          - type: spearman_max
            value: 0.8352180907883887
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 768
          type: sts-test-768
        metrics:
          - type: pearson_cosine
            value: 0.8262197279185375
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8297611922199533
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8103738584802076
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8032653500693283
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8113711464219397
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8047844488402207
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.7351063083543349
            name: Pearson Dot
          - type: spearman_dot
            value: 0.7222898603318773
            name: Spearman Dot
          - type: pearson_max
            value: 0.8262197279185375
            name: Pearson Max
          - type: spearman_max
            value: 0.8297611922199533
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 512
          type: sts-test-512
        metrics:
          - type: pearson_cosine
            value: 0.8265289700873992
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8303420710627304
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8092042518460232
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8021561300791633
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8099517575676378
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8034311442407586
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.7239156858292818
            name: Pearson Dot
          - type: spearman_dot
            value: 0.7141021600172974
            name: Spearman Dot
          - type: pearson_max
            value: 0.8265289700873992
            name: Pearson Max
          - type: spearman_max
            value: 0.8303420710627304
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 256
          type: sts-test-256
        metrics:
          - type: pearson_cosine
            value: 0.8247713863827557
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8304669772286988
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8012313573943666
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.7951476656544464
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8028104839960224
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.7974260171623634
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.7011271518071694
            name: Pearson Dot
          - type: spearman_dot
            value: 0.6946104528279369
            name: Spearman Dot
          - type: pearson_max
            value: 0.8247713863827557
            name: Pearson Max
          - type: spearman_max
            value: 0.8304669772286988
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 128
          type: sts-test-128
        metrics:
          - type: pearson_cosine
            value: 0.8205553018873636
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8283987535951244
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.7931877193499666
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.7878356187942884
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.7946730313407452
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.7891423743206649
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.6617612604436709
            name: Pearson Dot
          - type: spearman_dot
            value: 0.658567255717814
            name: Spearman Dot
          - type: pearson_max
            value: 0.8205553018873636
            name: Pearson Max
          - type: spearman_max
            value: 0.8283987535951244
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 64
          type: sts-test-64
        metrics:
          - type: pearson_cosine
            value: 0.8118818737650724
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8241392189948019
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.7761319753952881
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.7738169467058665
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.7777045912119006
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.7745630850628562
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.5934162536230442
            name: Pearson Dot
          - type: spearman_dot
            value: 0.5884207612393454
            name: Spearman Dot
          - type: pearson_max
            value: 0.8118818737650724
            name: Pearson Max
          - type: spearman_max
            value: 0.8241392189948019
            name: Spearman Max

SentenceTransformer based on distilbert/distilroberta-base

This is a sentence-transformers model finetuned from distilbert/distilroberta-base on the sentence-transformers/all-nli dataset. It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.

Model Details

Model Description

Model Sources

Full Model Architecture

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: RobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

Usage

Direct Usage (Sentence Transformers)

First install the Sentence Transformers library:

pip install -U sentence-transformers

Then you can load this model and run inference.

from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub
model = SentenceTransformer("tomaarsen/distilroberta-base-nli-matryoshka-v3")
# Run inference: encode three sentences into one embedding each
sentences = [
    'A man shoots a man.',
    'A man is shooting off guns.',
    'A man is erasing a chalk board.',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 768]

# Get the pairwise similarity scores for the embeddings.
# `similarity` compares two collections of embeddings, so pass the same
# embeddings twice to obtain the full sentence-vs-sentence matrix.
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]

Evaluation

Metrics

Semantic Similarity (dataset: sts-dev-768)

Metric Value
pearson_cosine 0.8481
spearman_cosine 0.8519
pearson_manhattan 0.8393
spearman_manhattan 0.8385
pearson_euclidean 0.841
spearman_euclidean 0.8402
pearson_dot 0.7784
spearman_dot 0.778
pearson_max 0.8481
spearman_max 0.8519

Semantic Similarity (dataset: sts-dev-512)

Metric Value
pearson_cosine 0.8481
spearman_cosine 0.8524
pearson_manhattan 0.8386
spearman_manhattan 0.8377
pearson_euclidean 0.8402
spearman_euclidean 0.8395
pearson_dot 0.7712
spearman_dot 0.7713
pearson_max 0.8481
spearman_max 0.8524

Semantic Similarity (dataset: sts-dev-256)

Metric Value
pearson_cosine 0.8421
spearman_cosine 0.8488
pearson_manhattan 0.8313
spearman_manhattan 0.8316
pearson_euclidean 0.8333
spearman_euclidean 0.8335
pearson_dot 0.7446
spearman_dot 0.745
pearson_max 0.8421
spearman_max 0.8488

Semantic Similarity (dataset: sts-dev-128)

Metric Value
pearson_cosine 0.8347
spearman_cosine 0.8445
pearson_manhattan 0.8241
spearman_manhattan 0.8248
pearson_euclidean 0.8254
spearman_euclidean 0.8262
pearson_dot 0.7084
spearman_dot 0.7093
pearson_max 0.8347
spearman_max 0.8445

Semantic Similarity (dataset: sts-dev-64)

Metric Value
pearson_cosine 0.8201
spearman_cosine 0.8352
pearson_manhattan 0.8032
spearman_manhattan 0.8047
pearson_euclidean 0.806
spearman_euclidean 0.8072
pearson_dot 0.636
spearman_dot 0.6389
pearson_max 0.8201
spearman_max 0.8352

Semantic Similarity (dataset: sts-test-768)

Metric Value
pearson_cosine 0.8262
spearman_cosine 0.8298
pearson_manhattan 0.8104
spearman_manhattan 0.8033
pearson_euclidean 0.8114
spearman_euclidean 0.8048
pearson_dot 0.7351
spearman_dot 0.7223
pearson_max 0.8262
spearman_max 0.8298

Semantic Similarity (dataset: sts-test-512)

Metric Value
pearson_cosine 0.8265
spearman_cosine 0.8303
pearson_manhattan 0.8092
spearman_manhattan 0.8022
pearson_euclidean 0.81
spearman_euclidean 0.8034
pearson_dot 0.7239
spearman_dot 0.7141
pearson_max 0.8265
spearman_max 0.8303

Semantic Similarity (dataset: sts-test-256)

Metric Value
pearson_cosine 0.8248
spearman_cosine 0.8305
pearson_manhattan 0.8012
spearman_manhattan 0.7951
pearson_euclidean 0.8028
spearman_euclidean 0.7974
pearson_dot 0.7011
spearman_dot 0.6946
pearson_max 0.8248
spearman_max 0.8305

Semantic Similarity (dataset: sts-test-128)

Metric Value
pearson_cosine 0.8206
spearman_cosine 0.8284
pearson_manhattan 0.7932
spearman_manhattan 0.7878
pearson_euclidean 0.7947
spearman_euclidean 0.7891
pearson_dot 0.6618
spearman_dot 0.6586
pearson_max 0.8206
spearman_max 0.8284

Semantic Similarity (dataset: sts-test-64)

Metric Value
pearson_cosine 0.8119
spearman_cosine 0.8241
pearson_manhattan 0.7761
spearman_manhattan 0.7738
pearson_euclidean 0.7777
spearman_euclidean 0.7746
pearson_dot 0.5934
spearman_dot 0.5884
pearson_max 0.8119
spearman_max 0.8241

Training Details

Training Dataset

sentence-transformers/all-nli

  • Dataset: sentence-transformers/all-nli at 65dd388
  • Size: 557,850 training samples
  • Columns: anchor, positive, and negative
  • Approximate statistics based on the first 1000 samples:
    |         | anchor | positive | negative |
    |---------|--------|----------|----------|
    | type    | string | string | string |
    | details | min: 7 tokens; mean: 10.38 tokens; max: 45 tokens | min: 6 tokens; mean: 12.8 tokens; max: 39 tokens | min: 6 tokens; mean: 13.4 tokens; max: 50 tokens |
  • Samples:
    | anchor | positive | negative |
    |--------|----------|----------|
    | A person on a horse jumps over a broken down airplane. | A person is outdoors, on a horse. | A person is at a diner, ordering an omelette. |
    | Children smiling and waving at camera | There are children present | The kids are frowning |
    | A boy is jumping on skateboard in the middle of a red bridge. | The boy does a skateboarding trick. | The boy skates down the sidewalk. |
  • Loss: MatryoshkaLoss with these parameters:
    {
        "loss": "MultipleNegativesRankingLoss",
        "matryoshka_dims": [
            768,
            512,
            256,
            128,
            64
        ],
        "matryoshka_weights": [
            1,
            1,
            1,
            1,
            1
        ],
        "n_dims_per_step": -1
    }
    

Evaluation Dataset

sentence-transformers/stsb

  • Dataset: sentence-transformers/stsb at ab7a5ac
  • Size: 1,500 evaluation samples
  • Columns: sentence1, sentence2, and score
  • Approximate statistics based on the first 1000 samples:
    |         | sentence1 | sentence2 | score |
    |---------|-----------|-----------|-------|
    | type    | string | string | float |
    | details | min: 5 tokens; mean: 15.0 tokens; max: 44 tokens | min: 6 tokens; mean: 14.99 tokens; max: 61 tokens | min: 0.0; mean: 0.47; max: 1.0 |
  • Samples:
    | sentence1 | sentence2 | score |
    |-----------|-----------|-------|
    | A man with a hard hat is dancing. | A man wearing a hard hat is dancing. | 1.0 |
    | A young child is riding a horse. | A child is riding a horse. | 0.95 |
    | A man is feeding a mouse to a snake. | The man is feeding a mouse to the snake. | 1.0 |
  • Loss: MatryoshkaLoss with these parameters:
    {
        "loss": "MultipleNegativesRankingLoss",
        "matryoshka_dims": [
            768,
            512,
            256,
            128,
            64
        ],
        "matryoshka_weights": [
            1,
            1,
            1,
            1,
            1
        ],
        "n_dims_per_step": -1
    }
    

Training Hyperparameters

Non-Default Hyperparameters

  • eval_strategy: steps
  • per_device_train_batch_size: 128
  • per_device_eval_batch_size: 128
  • num_train_epochs: 1
  • warmup_ratio: 0.1
  • fp16: True
  • batch_sampler: no_duplicates

All Hyperparameters

Click to expand
  • overwrite_output_dir: False
  • do_predict: False
  • eval_strategy: steps
  • prediction_loss_only: False
  • per_device_train_batch_size: 128
  • per_device_eval_batch_size: 128
  • per_gpu_train_batch_size: None
  • per_gpu_eval_batch_size: None
  • gradient_accumulation_steps: 1
  • eval_accumulation_steps: None
  • learning_rate: 5e-05
  • weight_decay: 0.0
  • adam_beta1: 0.9
  • adam_beta2: 0.999
  • adam_epsilon: 1e-08
  • max_grad_norm: 1.0
  • num_train_epochs: 1
  • max_steps: -1
  • lr_scheduler_type: linear
  • lr_scheduler_kwargs: {}
  • warmup_ratio: 0.1
  • warmup_steps: 0
  • log_level: passive
  • log_level_replica: warning
  • log_on_each_node: True
  • logging_nan_inf_filter: True
  • save_safetensors: True
  • save_on_each_node: False
  • save_only_model: False
  • no_cuda: False
  • use_cpu: False
  • use_mps_device: False
  • seed: 42
  • data_seed: None
  • jit_mode_eval: False
  • use_ipex: False
  • bf16: False
  • fp16: True
  • fp16_opt_level: O1
  • half_precision_backend: auto
  • bf16_full_eval: False
  • fp16_full_eval: False
  • tf32: None
  • local_rank: 0
  • ddp_backend: None
  • tpu_num_cores: None
  • tpu_metrics_debug: False
  • debug: []
  • dataloader_drop_last: False
  • dataloader_num_workers: 0
  • dataloader_prefetch_factor: None
  • past_index: -1
  • disable_tqdm: False
  • remove_unused_columns: True
  • label_names: None
  • load_best_model_at_end: False
  • ignore_data_skip: False
  • fsdp: []
  • fsdp_min_num_params: 0
  • fsdp_config: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
  • fsdp_transformer_layer_cls_to_wrap: None
  • accelerator_config: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
  • deepspeed: None
  • label_smoothing_factor: 0.0
  • optim: adamw_torch
  • optim_args: None
  • adafactor: False
  • group_by_length: False
  • length_column_name: length
  • ddp_find_unused_parameters: None
  • ddp_bucket_cap_mb: None
  • ddp_broadcast_buffers: None
  • dataloader_pin_memory: True
  • dataloader_persistent_workers: False
  • skip_memory_metrics: True
  • use_legacy_prediction_loop: False
  • push_to_hub: False
  • resume_from_checkpoint: None
  • hub_model_id: None
  • hub_strategy: every_save
  • hub_private_repo: False
  • hub_always_push: False
  • gradient_checkpointing: False
  • gradient_checkpointing_kwargs: None
  • include_inputs_for_metrics: False
  • eval_do_concat_batches: True
  • fp16_backend: auto
  • push_to_hub_model_id: None
  • push_to_hub_organization: None
  • mp_parameters:
  • auto_find_batch_size: False
  • full_determinism: False
  • torchdynamo: None
  • ray_scope: last
  • ddp_timeout: 1800
  • torch_compile: False
  • torch_compile_backend: None
  • torch_compile_mode: None
  • dispatch_batches: None
  • split_batches: None
  • include_tokens_per_second: False
  • include_num_input_tokens_seen: False
  • neftune_noise_alpha: None
  • optim_target_modules: None
  • batch_sampler: no_duplicates
  • multi_dataset_batch_sampler: proportional

Training Logs

Epoch Step Training Loss loss sts-dev-128_spearman_cosine sts-dev-256_spearman_cosine sts-dev-512_spearman_cosine sts-dev-64_spearman_cosine sts-dev-768_spearman_cosine sts-test-128_spearman_cosine sts-test-256_spearman_cosine sts-test-512_spearman_cosine sts-test-64_spearman_cosine sts-test-768_spearman_cosine
0.0229 100 19.9245 11.3900 0.7772 0.7998 0.8049 0.7902 0.7919 - - - - -
0.0459 200 10.6055 11.1510 0.7809 0.7996 0.8055 0.7954 0.7954 - - - - -
0.0688 300 9.6389 11.1229 0.7836 0.8029 0.8114 0.7923 0.8083 - - - - -
0.0918 400 8.6917 11.0299 0.7976 0.8117 0.8142 0.8002 0.8087 - - - - -
0.1147 500 8.3064 11.3586 0.7895 0.8058 0.8120 0.7978 0.8065 - - - - -
0.1376 600 7.8026 11.5047 0.7876 0.8015 0.8065 0.7934 0.8016 - - - - -
0.1606 700 7.9978 11.5823 0.7944 0.8067 0.8072 0.7994 0.8045 - - - - -
0.1835 800 6.9249 11.5862 0.7945 0.8054 0.8085 0.8012 0.8033 - - - - -
0.2065 900 7.1059 11.2365 0.7895 0.8035 0.8072 0.7956 0.8031 - - - - -
0.2294 1000 6.5483 11.3770 0.7853 0.7994 0.8039 0.7894 0.8024 - - - - -
0.2524 1100 6.6684 11.5038 0.7968 0.8087 0.8115 0.8002 0.8065 - - - - -
0.2753 1200 6.4661 11.4057 0.7980 0.8082 0.8103 0.8057 0.8070 - - - - -
0.2982 1300 6.501 11.2521 0.7974 0.8100 0.8111 0.8025 0.8079 - - - - -
0.3212 1400 6.0769 11.1458 0.7971 0.8103 0.8124 0.7982 0.8082 - - - - -
0.3441 1500 6.1919 11.3180 0.8039 0.8129 0.8144 0.8094 0.8098 - - - - -
0.3671 1600 5.8213 11.6196 0.7924 0.8072 0.8090 0.8003 0.8012 - - - - -
0.3900 1700 5.534 11.0700 0.7979 0.8104 0.8132 0.8028 0.8101 - - - - -
0.4129 1800 5.7536 11.0916 0.7934 0.8087 0.8149 0.8008 0.8085 - - - - -
0.4359 1900 5.3778 11.2658 0.7942 0.8084 0.8104 0.7980 0.8049 - - - - -
0.4588 2000 5.4925 11.4851 0.7932 0.8062 0.8086 0.7932 0.8057 - - - - -
0.4818 2100 5.3125 11.4833 0.7987 0.8119 0.8154 0.8012 0.8124 - - - - -
0.5047 2200 5.1914 11.2848 0.7784 0.7971 0.8037 0.7911 0.8004 - - - - -
0.5276 2300 5.2921 11.5364 0.7698 0.7910 0.7974 0.7839 0.7900 - - - - -
0.5506 2400 5.288 11.3944 0.7873 0.8011 0.8051 0.7877 0.8003 - - - - -
0.5735 2500 5.3697 11.4532 0.7949 0.8077 0.8111 0.7955 0.8069 - - - - -
0.5965 2600 5.1521 11.2788 0.7973 0.8095 0.8130 0.7940 0.8088 - - - - -
0.6194 2700 5.2316 11.2472 0.7948 0.8077 0.8102 0.7939 0.8053 - - - - -
0.6423 2800 5.2599 11.4171 0.7882 0.8029 0.8065 0.7888 0.8019 - - - - -
0.6653 2900 5.4052 11.4026 0.7871 0.8005 0.8021 0.7833 0.7985 - - - - -
0.6882 3000 5.3474 11.2084 0.7895 0.8047 0.8079 0.7928 0.8050 - - - - -
0.7112 3100 5.0336 11.3999 0.8023 0.8150 0.8182 0.8024 0.8168 - - - - -
0.7341 3200 5.2496 11.2307 0.8015 0.8137 0.8167 0.8000 0.8140 - - - - -
0.7571 3300 3.8712 10.9468 0.8396 0.8440 0.8471 0.8284 0.8479 - - - - -
0.7800 3400 2.7068 10.9292 0.8414 0.8453 0.8489 0.8305 0.8497 - - - - -
0.8029 3500 2.3418 10.8626 0.8427 0.8467 0.8504 0.8322 0.8504 - - - - -
0.8259 3600 2.2419 10.9065 0.8421 0.8467 0.8504 0.8320 0.8502 - - - - -
0.8488 3700 2.125 10.9517 0.8424 0.8472 0.8509 0.8324 0.8510 - - - - -
0.8718 3800 1.9942 11.0142 0.8438 0.8482 0.8519 0.8337 0.8517 - - - - -
0.8947 3900 2.031 10.9662 0.8433 0.8480 0.8519 0.8340 0.8515 - - - - -
0.9176 4000 1.9734 11.0054 0.8452 0.8495 0.8531 0.8354 0.8528 - - - - -
0.9406 4100 1.9468 11.0183 0.8447 0.8490 0.8526 0.8348 0.8522 - - - - -
0.9635 4200 1.9008 11.0154 0.8445 0.8485 0.8521 0.8352 0.8517 - - - - -
0.9865 4300 1.8511 10.9966 0.8445 0.8488 0.8524 0.8352 0.8519 - - - - -
1.0 4359 - - - - - - - 0.8284 0.8305 0.8303 0.8241 0.8298

Environmental Impact

Carbon emissions were measured using CodeCarbon.

  • Energy Consumed: 0.346 kWh
  • Carbon Emitted: 0.134 kg of CO2
  • Hours Used: 1.296 hours

Training Hardware

  • On Cloud: No
  • GPU Model: 1 x NVIDIA GeForce RTX 3090
  • CPU Model: 13th Gen Intel(R) Core(TM) i7-13700K
  • RAM Size: 31.78 GB

Framework Versions

  • Python: 3.11.6
  • Sentence Transformers: 3.0.0.dev0
  • Transformers: 4.41.0.dev0
  • PyTorch: 2.3.0+cu121
  • Accelerate: 0.26.1
  • Datasets: 2.18.0
  • Tokenizers: 0.19.1

Citation

BibTeX

Sentence Transformers

@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

MatryoshkaLoss

@misc{kusupati2024matryoshka,
    title={Matryoshka Representation Learning}, 
    author={Aditya Kusupati and Gantavya Bhatt and Aniket Rege and Matthew Wallingford and Aditya Sinha and Vivek Ramanujan and William Howard-Snyder and Kaifeng Chen and Sham Kakade and Prateek Jain and Ali Farhadi},
    year={2024},
    eprint={2205.13147},
    archivePrefix={arXiv},
    primaryClass={cs.LG}
}

MultipleNegativesRankingLoss

@misc{henderson2017efficient,
    title={Efficient Natural Language Response Suggestion for Smart Reply}, 
    author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
    year={2017},
    eprint={1705.00652},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}