---
tags:
  - sentence-transformers
  - sentence-similarity
  - feature-extraction
  - generated_from_trainer
  - dataset_size:1432
  - loss:MultipleNegativesRankingLoss
base_model: denaya/indoSBERT-large
widget:
  - source_sentence: >-
      Input-output domestik Indonesia: 17 sektor usaha, harga produsen, data
      tahun 2016 (juta Rp)
    sentences:
      - 'Impor Besi dan Baja Menurut Negara Asal Utama, 2017-2023 '
      - >-
        IHK dan Rata-rata Upah per Bulan Buruh Hotel di Bawah Mandor
        (Supervisor), 1996-2014 (1996=100) 
      - >-
        Tabel Input-Output Indonesia Transaksi Domestik Atas Dasar Harga
        Produsen (17 Lapangan Usaha), 2016 (Juta Rupiah) 
  - source_sentence: 'Gaji bulanan: beda umur, beda jenis pekerjaan (9 sektor), 2017'
    sentences:
      - >-
        Rata-rata Upah/Gaji Bersih Sebulan Buruh/Karyawan/Pegawai Menurut
        Kelompok Umur dan Lapangan Pekerjaan Utama di 9 Sektor (Rupiah), 2017 
      - >-
        Ekspor Rumput Laut dan Ganggang Lainnya menurut Negara Tujuan Utama,
        2012-2023 
      - 'Rata-Rata Harga Valuta Asing Terpilih menurut Provinsi 2017 '
  - source_sentence: Ringkasan aliran dana kuartal terakhir 2009 dalam Rupiah
    sentences:
      - >-
        Jumlah Perahu/Kapal, Luas Usaha Budidaya dan Produksi menurut Sub Sektor
        Perikanan, 2002-2016 
      - >-
        Jumlah Pendapatan Menurut Golongan Rumah Tangga (miliar rupiah) 2000,
        2005, dan 2008 
      - 'Ringkasan Neraca Arus Dana, Triwulan IV, 2009, (Miliar Rupiah) '
  - source_sentence: >-
      Berapa total transaksi (harga pembeli) untuk 9 sektor ekonomi di Indonesia
      tahun 2005? (miliar rupiah)
    sentences:
      - >-
        Jumlah Rumah Tangga Perikanan Budidaya Menurut Provinsi dan Jenis
        Budidaya, 2000-2016 
      - >-
        Transaksi Total Atas Dasar Harga Pembeli 9 Sektor Ekonomi (miliar
        rupiah), 2005 
      - >-
        Perbandingan Indeks dan Tingkat Inflasi Desember 2023 Kota-kota di Luar
        Pulau Jawa dan Sumatera dengan Nasional (2018=100) 
  - source_sentence: >-
      Bagaimana kaitan antara pendidikan dan kegiatan mingguan penduduk usia 15+
      pada tahun 2022?
    sentences:
      - 'Persentase Perkembangan Distribusi Pengeluaran '
      - >-
        Rata-rata Pendapatan Bersih Pekerja Bebas Menurut Provinsi dan Lapangan
        Pekerjaan Utama (ribu rupiah), 2018 
      - >-
        Penduduk Berumur 15 Tahun Ke Atas Menurut Pendidikan Tertinggi yang
        Ditamatkan dan Jenis Kegiatan Selama Seminggu yang Lalu, 2008-2024 
pipeline_tag: sentence-similarity
library_name: sentence-transformers
metrics:
  - cosine_accuracy@1
  - cosine_accuracy@3
  - cosine_accuracy@5
  - cosine_accuracy@10
  - cosine_precision@1
  - cosine_precision@3
  - cosine_precision@5
  - cosine_precision@10
  - cosine_recall@1
  - cosine_recall@3
  - cosine_recall@5
  - cosine_recall@10
  - cosine_ndcg@10
  - cosine_mrr@10
  - cosine_map@100
  - cosine_accuracy
  - cosine_accuracy_threshold
  - cosine_f1
  - cosine_f1_threshold
  - cosine_precision
  - cosine_recall
  - cosine_ap
  - cosine_mcc
model-index:
  - name: SentenceTransformer based on denaya/indoSBERT-large
    results:
      - task:
          type: information-retrieval
          name: Information Retrieval
        dataset:
          name: eval
          type: eval
        metrics:
          - type: cosine_accuracy@1
            value: 0.9120521172638436
            name: Cosine Accuracy@1
          - type: cosine_accuracy@3
            value: 0.990228013029316
            name: Cosine Accuracy@3
          - type: cosine_accuracy@5
            value: 0.993485342019544
            name: Cosine Accuracy@5
          - type: cosine_accuracy@10
            value: 0.996742671009772
            name: Cosine Accuracy@10
          - type: cosine_precision@1
            value: 0.9120521172638436
            name: Cosine Precision@1
          - type: cosine_precision@3
            value: 0.3572204125950054
            name: Cosine Precision@3
          - type: cosine_precision@5
            value: 0.23778501628664495
            name: Cosine Precision@5
          - type: cosine_precision@10
            value: 0.13745928338762217
            name: Cosine Precision@10
          - type: cosine_recall@1
            value: 0.7097252402956855
            name: Cosine Recall@1
          - type: cosine_recall@3
            value: 0.7867346590488319
            name: Cosine Recall@3
          - type: cosine_recall@5
            value: 0.8052359035035943
            name: Cosine Recall@5
          - type: cosine_recall@10
            value: 0.8221312325947948
            name: Cosine Recall@10
          - type: cosine_ndcg@10
            value: 0.8348212945928647
            name: Cosine Ndcg@10
          - type: cosine_mrr@10
            value: 0.9497052892818366
            name: Cosine Mrr@10
          - type: cosine_map@100
            value: 0.7729410950742827
            name: Cosine Map@100
      - task:
          type: binary-classification
          name: Binary Classification
        dataset:
          name: quora duplicates dev
          type: quora_duplicates_dev
        metrics:
          - type: cosine_accuracy
            value: 0.9914529914529915
            name: Cosine Accuracy
          - type: cosine_accuracy_threshold
            value: 0.31953397393226624
            name: Cosine Accuracy Threshold
          - type: cosine_f1
            value: 0.9850953206239168
            name: Cosine F1
          - type: cosine_f1_threshold
            value: 0.30364981293678284
            name: Cosine F1 Threshold
          - type: cosine_precision
            value: 0.988865692414753
            name: Cosine Precision
          - type: cosine_recall
            value: 0.981353591160221
            name: Cosine Recall
          - type: cosine_ap
            value: 0.9956970583311449
            name: Cosine Ap
          - type: cosine_mcc
            value: 0.9791180702139771
            name: Cosine Mcc
---

SentenceTransformer based on denaya/indoSBERT-large

This is a sentence-transformers model finetuned from denaya/indoSBERT-large. It maps sentences & paragraphs to a 256-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.

Model Details

Model Description

  • Model Type: Sentence Transformer
  • Base model: denaya/indoSBERT-large
  • Maximum Sequence Length: 256 tokens
  • Output Dimensionality: 256 dimensions
  • Similarity Function: Cosine Similarity

Model Sources

  • Documentation: Sentence Transformers Documentation (https://sbert.net)
  • Repository: Sentence Transformers on GitHub (https://github.com/UKPLab/sentence-transformers)
  • Hugging Face: Sentence Transformers on Hugging Face (https://huggingface.co/models?library=sentence-transformers)

Full Model Architecture

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Dense({'in_features': 1024, 'out_features': 256, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
)
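
As a quick check of the architecture above, the loaded model should report a 256-token sequence limit and 256-dimensional embeddings (a minimal sketch):

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("yahyaabd/allstats-search-large-bpstable-v1")

# The Dense head projects the 1024-dim pooled BERT output down to 256 dims.
print(model.max_seq_length)                      # 256
print(model.get_sentence_embedding_dimension())  # 256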

Usage

Direct Usage (Sentence Transformers)

First install the Sentence Transformers library:

pip install -U sentence-transformers

Then you can load this model and run inference.

from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub
model = SentenceTransformer("yahyaabd/allstats-search-large-bpstable-v1")
# Run inference
sentences = [
    'Bagaimana kaitan antara pendidikan dan kegiatan mingguan penduduk usia 15+ pada tahun 2022?',
    'Penduduk Berumur 15 Tahun Ke Atas Menurut Pendidikan Tertinggi yang Ditamatkan dan Jenis Kegiatan Selama Seminggu yang Lalu, 2008-2024 ',
    'Persentase Perkembangan Distribusi Pengeluaran ',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 256]

# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]
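
Since the model was trained to match free-form queries against BPS table titles, a common pattern is to embed a title catalogue once and rank it per query. A minimal sketch using util.semantic_search (the three-title corpus below is a hypothetical stand-in, reusing examples from this card):

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("yahyaabd/allstats-search-large-bpstable-v1")

# Hypothetical catalogue of BPS table titles; replace with your own corpus.
corpus = [
    "Ringkasan Neraca Arus Dana, Triwulan IV, 2009, (Miliar Rupiah)",
    "Impor Besi dan Baja Menurut Negara Asal Utama, 2017-2023",
    "Rata-Rata Harga Valuta Asing Terpilih menurut Provinsi 2017",
]
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

query_embedding = model.encode(
    "Ringkasan aliran dana kuartal terakhir 2009 dalam Rupiah",
    convert_to_tensor=True,
)

# Rank the whole catalogue by cosine similarity to the query.
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=3)[0]
for hit in hits:
    print(f"{hit['score']:.4f}  {corpus[hit['corpus_id']]}")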

Evaluation

Metrics

Information Retrieval

Metric Value
cosine_accuracy@1 0.9121
cosine_accuracy@3 0.9902
cosine_accuracy@5 0.9935
cosine_accuracy@10 0.9967
cosine_precision@1 0.9121
cosine_precision@3 0.3572
cosine_precision@5 0.2378
cosine_precision@10 0.1375
cosine_recall@1 0.7097
cosine_recall@3 0.7867
cosine_recall@5 0.8052
cosine_recall@10 0.8221
cosine_ndcg@10 0.8348
cosine_mrr@10 0.9497
cosine_map@100 0.7729
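
These figures are the kind of report produced by an InformationRetrievalEvaluator on a held-out split. A minimal sketch of running the same evaluation on your own data (the ids and relevance mapping below are placeholders, not the actual eval split):

from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator

model = SentenceTransformer("yahyaabd/allstats-search-large-bpstable-v1")

# Placeholder split: query ids to text, doc ids to text, and
# each query id mapped to the set of relevant doc ids.
queries = {"q1": "Ringkasan aliran dana kuartal terakhir 2009 dalam Rupiah"}
corpus = {"d1": "Ringkasan Neraca Arus Dana, Triwulan IV, 2009, (Miliar Rupiah)"}
relevant_docs = {"q1": {"d1"}}

evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs, name="eval")
results = evaluator(model)
print(results["eval_cosine_ndcg@10"])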

Binary Classification

Metric Value
cosine_accuracy 0.9915
cosine_accuracy_threshold 0.3195
cosine_f1 0.9851
cosine_f1_threshold 0.3036
cosine_precision 0.9889
cosine_recall 0.9814
cosine_ap 0.9957
cosine_mcc 0.9791
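
The thresholds above suggest a simple duplicate check: embed both texts and compare their cosine similarity against roughly 0.30, the dev-set F1 optimum. A hedged sketch; the threshold was tuned on this card's dev set and may not transfer to other data:

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("yahyaabd/allstats-search-large-bpstable-v1")

text_a = "Gaji bulanan: beda umur, beda jenis pekerjaan (9 sektor), 2017"
text_b = (
    "Rata-rata Upah/Gaji Bersih Sebulan Buruh/Karyawan/Pegawai Menurut "
    "Kelompok Umur dan Lapangan Pekerjaan Utama di 9 Sektor (Rupiah), 2017"
)

embeddings = model.encode([text_a, text_b])
score = model.similarity(embeddings[0], embeddings[1]).item()

# 0.3036 is the dev-set F1-optimal threshold reported above.
print("duplicate" if score >= 0.3036 else "not duplicate", round(score, 4))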

Training Details

Training Dataset

Unnamed Dataset

  • Size: 1,432 training samples
  • Columns: sentence_0, sentence_1, and label
  • Approximate statistics based on the first 1000 samples:
    • sentence_0: string, min 4 / mean 16.84 / max 32 tokens
    • sentence_1: string, min 3 / mean 20.88 / max 48 tokens
    • label: int, 1: 100.00%
  • Samples:
    • sentence_0: Average monthly net wage/salary of employees by age group and type of work (Rupiah), 2018
      sentence_1: Rata-rata Upah/Gaji Bersih Sebulan Buruh/Karyawan/Pegawai Menurut Kelompok Umur dan Jenis Pekerjaan (Rupiah), 2018
      label: 1
    • sentence_0: Cek average real wage buruh industri pengolahan (level bawah) sekitar tahun 2009
      sentence_1: Rata-rata Upah Riil Per Bulan Buruh Industri Pengolahan di Bawah Mandor, 2005-2014 (1996=100)
      label: 1
    • sentence_0: Dimana saya bisa lihat rekapitulasi dokumen RPB kabupaten/kota?
      sentence_1: Rekap Dokumen RPB Kabupaten/Kota
      label: 1
  • Loss: MultipleNegativesRankingLoss with these parameters (see the sketch below):
    {
        "scale": 20.0,
        "similarity_fct": "cos_sim"
    }
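
With this loss, every other sentence_1 in a batch serves as an in-batch negative for a given sentence_0. A minimal sketch of constructing the loss with these parameters (standard sentence-transformers API):

from sentence_transformers import SentenceTransformer, losses, util

model = SentenceTransformer("denaya/indoSBERT-large")

# scale=20.0 multiplies the in-batch cosine similarities before the
# softmax cross-entropy; cos_sim is the default similarity function.
loss = losses.MultipleNegativesRankingLoss(
    model, scale=20.0, similarity_fct=util.cos_sim
)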

Training Hyperparameters

Non-Default Hyperparameters

  • eval_strategy: steps
  • per_device_train_batch_size: 16
  • per_device_eval_batch_size: 16
  • num_train_epochs: 30
  • fp16: True
  • multi_dataset_batch_sampler: round_robin
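
Put together, a hedged sketch of what this fine-tuning run likely looked like (the single pair below stands in for the 1,432 training pairs, which are not published; the eval dataset and evaluators are omitted for brevity):

from datasets import Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    losses,
)

model = SentenceTransformer("denaya/indoSBERT-large")

# Stand-in for the 1,432 (sentence_0, sentence_1) pairs described above.
train_dataset = Dataset.from_dict({
    "sentence_0": ["Dimana saya bisa lihat rekapitulasi dokumen RPB kabupaten/kota?"],
    "sentence_1": ["Rekap Dokumen RPB Kabupaten/Kota"],
})

args = SentenceTransformerTrainingArguments(
    output_dir="allstats-search-large-bpstable-v1",
    num_train_epochs=30,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,  # requires a GPU; set False on CPU
    # eval_strategy="steps" was used with a held-out eval set, omitted here
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=losses.MultipleNegativesRankingLoss(model),
)
trainer.train()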

All Hyperparameters

  • overwrite_output_dir: False
  • do_predict: False
  • eval_strategy: steps
  • prediction_loss_only: True
  • per_device_train_batch_size: 16
  • per_device_eval_batch_size: 16
  • per_gpu_train_batch_size: None
  • per_gpu_eval_batch_size: None
  • gradient_accumulation_steps: 1
  • eval_accumulation_steps: None
  • torch_empty_cache_steps: None
  • learning_rate: 5e-05
  • weight_decay: 0.0
  • adam_beta1: 0.9
  • adam_beta2: 0.999
  • adam_epsilon: 1e-08
  • max_grad_norm: 1
  • num_train_epochs: 30
  • max_steps: -1
  • lr_scheduler_type: linear
  • lr_scheduler_kwargs: {}
  • warmup_ratio: 0.0
  • warmup_steps: 0
  • log_level: passive
  • log_level_replica: warning
  • log_on_each_node: True
  • logging_nan_inf_filter: True
  • save_safetensors: True
  • save_on_each_node: False
  • save_only_model: False
  • restore_callback_states_from_checkpoint: False
  • no_cuda: False
  • use_cpu: False
  • use_mps_device: False
  • seed: 42
  • data_seed: None
  • jit_mode_eval: False
  • use_ipex: False
  • bf16: False
  • fp16: True
  • fp16_opt_level: O1
  • half_precision_backend: auto
  • bf16_full_eval: False
  • fp16_full_eval: False
  • tf32: None
  • local_rank: 0
  • ddp_backend: None
  • tpu_num_cores: None
  • tpu_metrics_debug: False
  • debug: []
  • dataloader_drop_last: False
  • dataloader_num_workers: 0
  • dataloader_prefetch_factor: None
  • past_index: -1
  • disable_tqdm: False
  • remove_unused_columns: True
  • label_names: None
  • load_best_model_at_end: False
  • ignore_data_skip: False
  • fsdp: []
  • fsdp_min_num_params: 0
  • fsdp_config: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
  • fsdp_transformer_layer_cls_to_wrap: None
  • accelerator_config: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
  • deepspeed: None
  • label_smoothing_factor: 0.0
  • optim: adamw_torch
  • optim_args: None
  • adafactor: False
  • group_by_length: False
  • length_column_name: length
  • ddp_find_unused_parameters: None
  • ddp_bucket_cap_mb: None
  • ddp_broadcast_buffers: False
  • dataloader_pin_memory: True
  • dataloader_persistent_workers: False
  • skip_memory_metrics: True
  • use_legacy_prediction_loop: False
  • push_to_hub: False
  • resume_from_checkpoint: None
  • hub_model_id: None
  • hub_strategy: every_save
  • hub_private_repo: None
  • hub_always_push: False
  • gradient_checkpointing: False
  • gradient_checkpointing_kwargs: None
  • include_inputs_for_metrics: False
  • include_for_metrics: []
  • eval_do_concat_batches: True
  • fp16_backend: auto
  • push_to_hub_model_id: None
  • push_to_hub_organization: None
  • mp_parameters:
  • auto_find_batch_size: False
  • full_determinism: False
  • torchdynamo: None
  • ray_scope: last
  • ddp_timeout: 1800
  • torch_compile: False
  • torch_compile_backend: None
  • torch_compile_mode: None
  • dispatch_batches: None
  • split_batches: None
  • include_tokens_per_second: False
  • include_num_input_tokens_seen: False
  • neftune_noise_alpha: None
  • optim_target_modules: None
  • batch_eval_metrics: False
  • eval_on_start: False
  • use_liger_kernel: False
  • eval_use_gather_object: False
  • average_tokens_across_devices: False
  • prompts: None
  • batch_sampler: batch_sampler
  • multi_dataset_batch_sampler: round_robin

Training Logs

Epoch Step Training Loss eval_cosine_ndcg@10 quora_duplicates_dev_cosine_ap
0.2222 20 - 0.7769 -
0.4444 40 - 0.8167 -
0.6667 60 - 0.8221 -
0.8889 80 - 0.8282 -
1.0 90 - 0.8256 -
1.1111 100 - 0.8278 -
1.3333 120 - 0.8388 -
1.5556 140 - 0.8347 -
1.7778 160 - 0.8351 -
2.0 180 - 0.8407 -
2.2222 200 - 0.8302 -
2.4444 220 - 0.8261 -
2.6667 240 - 0.8217 -
2.8889 260 - 0.8161 -
3.0 270 - 0.8143 -
3.1111 280 - 0.8133 -
3.3333 300 - 0.8259 -
3.5556 320 - 0.8342 -
3.7778 340 - 0.8267 -
4.0 360 - 0.8190 -
4.2222 380 - 0.8193 -
4.4444 400 - 0.8281 -
4.6667 420 - 0.8283 -
4.8889 440 - 0.8197 -
5.0 450 - 0.8211 -
5.1111 460 - 0.8118 -
5.3333 480 - 0.8298 -
5.5556 500 0.0412 0.8283 -
5.7778 520 - 0.8264 -
6.0 540 - 0.8271 -
6.2222 560 - 0.8243 -
6.4444 580 - 0.8256 -
6.6667 600 - 0.8356 -
6.8889 620 - 0.8332 -
7.0 630 - 0.8250 -
7.1111 640 - 0.8179 -
7.3333 660 - 0.8356 -
7.5556 680 - 0.8400 -
7.7778 700 - 0.8349 -
8.0 720 - 0.8281 -
8.2222 740 - 0.8330 -
8.4444 760 - 0.8338 -
8.6667 780 - 0.8338 -
8.8889 800 - 0.8344 -
9.0 810 - 0.8319 -
9.1111 820 - 0.8328 -
9.3333 840 - 0.8325 -
9.5556 860 - 0.8375 -
9.7778 880 - 0.8306 -
10.0 900 - 0.8263 -
10.2222 920 - 0.8280 -
10.4444 940 - 0.8272 -
10.6667 960 - 0.8280 -
10.8889 980 - 0.8313 -
11.0 990 - 0.8307 -
11.1111 1000 0.0198 0.8324 -
11.3333 1020 - 0.8303 -
11.5556 1040 - 0.8262 -
11.7778 1060 - 0.8294 -
12.0 1080 - 0.8309 -
12.2222 1100 - 0.8274 -
12.4444 1120 - 0.8312 -
12.6667 1140 - 0.8371 -
12.8889 1160 - 0.8408 -
13.0 1170 - 0.8374 -
13.1111 1180 - 0.8344 -
13.3333 1200 - 0.8341 -
13.5556 1220 - 0.8333 -
13.7778 1240 - 0.8388 -
14.0 1260 - 0.8414 -
14.2222 1280 - 0.8344 -
14.4444 1300 - 0.8328 -
14.6667 1320 - 0.8340 -
14.8889 1340 - 0.8317 -
15.0 1350 - 0.8260 -
15.1111 1360 - 0.8252 -
15.3333 1380 - 0.8244 -
15.5556 1400 - 0.8269 -
15.7778 1420 - 0.8275 -
16.0 1440 - 0.8281 -
16.2222 1460 - 0.8294 -
16.4444 1480 - 0.8299 -
16.6667 1500 0.0136 0.8318 -
16.8889 1520 - 0.8320 -
17.0 1530 - 0.8332 -
17.1111 1540 - 0.8337 -
17.3333 1560 - 0.8299 -
17.5556 1580 - 0.8283 -
17.7778 1600 - 0.8309 -
18.0 1620 - 0.8329 -
18.2222 1640 - 0.8317 -
18.4444 1660 - 0.8313 -
18.6667 1680 - 0.8317 -
18.8889 1700 - 0.8356 -
19.0 1710 - 0.8345 -
19.1111 1720 - 0.8358 -
19.3333 1740 - 0.8334 -
19.5556 1760 - 0.8335 -
19.7778 1780 - 0.8318 -
20.0 1800 - 0.8326 -
20.2222 1820 - 0.8318 -
20.4444 1840 - 0.8335 -
20.6667 1860 - 0.8333 -
20.8889 1880 - 0.8335 -
21.0 1890 - 0.8341 -
21.1111 1900 - 0.8341 -
21.3333 1920 - 0.8355 -
21.5556 1940 - 0.8360 -
21.7778 1960 - 0.8343 -
22.0 1980 - 0.8351 -
22.2222 2000 0.015 0.8342 -
22.4444 2020 - 0.8342 -
22.6667 2040 - 0.8339 -
22.8889 2060 - 0.8342 -
23.0 2070 - 0.8345 -
23.1111 2080 - 0.8354 -
23.3333 2100 - 0.8366 -
23.5556 2120 - 0.8379 -
23.7778 2140 - 0.8386 -
24.0 2160 - 0.8367 -
24.2222 2180 - 0.8357 -
24.4444 2200 - 0.8372 -
24.6667 2220 - 0.8377 -
24.8889 2240 - 0.8373 -
25.0 2250 - 0.8367 -
25.1111 2260 - 0.8366 -
25.3333 2280 - 0.8369 -
25.5556 2300 - 0.8373 -
25.7778 2320 - 0.8366 -
26.0 2340 - 0.8354 -
26.2222 2360 - 0.8347 -
26.4444 2380 - 0.8344 -
26.6667 2400 - 0.8341 -
26.8889 2420 - 0.8343 -
27.0 2430 - 0.8344 -
27.1111 2440 - 0.8345 -
27.3333 2460 - 0.8344 -
27.5556 2480 - 0.8347 -
27.7778 2500 0.0136 0.8342 -
28.0 2520 - 0.8347 -
28.2222 2540 - 0.8346 -
28.4444 2560 - 0.8346 -
28.6667 2580 - 0.8347 -
28.8889 2600 - 0.8348 -
29.0 2610 - 0.8348 -
29.1111 2620 - 0.8348 -
29.3333 2640 - 0.8348 -
29.5556 2660 - 0.8348 -
29.7778 2680 - 0.8348 -
30.0 2700 - 0.8348 -
-1 -1 - - 0.9957

Framework Versions

  • Python: 3.10.12
  • Sentence Transformers: 3.4.0
  • Transformers: 4.48.1
  • PyTorch: 2.5.1+cu124
  • Accelerate: 1.3.0
  • Datasets: 3.2.0
  • Tokenizers: 0.21.0
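
To reproduce this environment closely, a pinned install along these lines should work (a sketch; install the matching CUDA build of PyTorch separately):

pip install sentence-transformers==3.4.0 transformers==4.48.1 accelerate==1.3.0 datasets==3.2.0 tokenizers==0.21.0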

Citation

BibTeX

Sentence Transformers

@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

MultipleNegativesRankingLoss

@misc{henderson2017efficient,
    title={Efficient Natural Language Response Suggestion for Smart Reply},
    author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
    year={2017},
    eprint={1705.00652},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}