---
tags:
  - sentence-transformers
  - sentence-similarity
  - feature-extraction
  - generated_from_trainer
  - dataset_size:1432
  - loss:MultipleNegativesRankingLoss
base_model: denaya/indoSBERT-large
widget:
  - source_sentence: >-
      Input-output domestik Indonesia: 17 sektor usaha, harga produsen, data
      tahun 2016 (juta Rp)
    sentences:
      - 'Impor Besi dan Baja Menurut Negara Asal Utama, 2017-2023 '
      - >-
        IHK dan Rata-rata Upah per Bulan Buruh Hotel di Bawah Mandor
        (Supervisor), 1996-2014 (1996=100) 
      - >-
        Tabel Input-Output Indonesia Transaksi Domestik Atas Dasar Harga
        Produsen (17 Lapangan Usaha), 2016 (Juta Rupiah) 
  - source_sentence: 'Gaji bulanan: beda umur, beda jenis pekerjaan (9 sektor), 2017'
    sentences:
      - >-
        Rata-rata Upah/Gaji Bersih Sebulan Buruh/Karyawan/Pegawai Menurut
        Kelompok Umur dan Lapangan Pekerjaan Utama di 9 Sektor (Rupiah), 2017 
      - >-
        Ekspor Rumput Laut dan Ganggang Lainnya menurut Negara Tujuan Utama,
        2012-2023 
      - 'Rata-Rata Harga Valuta Asing Terpilih menurut Provinsi 2017 '
  - source_sentence: Ringkasan aliran dana kuartal terakhir 2009 dalam Rupiah
    sentences:
      - >-
        Jumlah Perahu/Kapal, Luas Usaha Budidaya dan Produksi menurut Sub Sektor
        Perikanan, 2002-2016 
      - >-
        Jumlah Pendapatan Menurut Golongan Rumah Tangga (miliar rupiah) 2000,
        2005, dan 2008 
      - 'Ringkasan Neraca Arus Dana, Triwulan IV, 2009, (Miliar Rupiah) '
  - source_sentence: >-
      Berapa total transaksi (harga pembeli) untuk 9 sektor ekonomi di Indonesia
      tahun 2005? (miliar rupiah)
    sentences:
      - >-
        Jumlah Rumah Tangga Perikanan Budidaya Menurut Provinsi dan Jenis
        Budidaya, 2000-2016 
      - >-
        Transaksi Total Atas Dasar Harga Pembeli 9 Sektor Ekonomi (miliar
        rupiah), 2005 
      - >-
        Perbandingan Indeks dan Tingkat Inflasi Desember 2023 Kota-kota di Luar
        Pulau Jawa dan Sumatera dengan Nasional (2018=100) 
  - source_sentence: >-
      Bagaimana kaitan antara pendidikan dan kegiatan mingguan penduduk usia 15+
      pada tahun 2022?
    sentences:
      - 'Persentase Perkembangan Distribusi Pengeluaran '
      - >-
        Rata-rata Pendapatan Bersih Pekerja Bebas Menurut Provinsi dan Lapangan
        Pekerjaan Utama (ribu rupiah), 2018 
      - >-
        Penduduk Berumur 15 Tahun Ke Atas Menurut Pendidikan Tertinggi yang
        Ditamatkan dan Jenis Kegiatan Selama Seminggu yang Lalu, 2008-2024 
pipeline_tag: sentence-similarity
library_name: sentence-transformers
metrics:
  - cosine_accuracy@1
  - cosine_accuracy@3
  - cosine_accuracy@5
  - cosine_accuracy@10
  - cosine_precision@1
  - cosine_precision@3
  - cosine_precision@5
  - cosine_precision@10
  - cosine_recall@1
  - cosine_recall@3
  - cosine_recall@5
  - cosine_recall@10
  - cosine_ndcg@10
  - cosine_mrr@10
  - cosine_map@100
  - cosine_accuracy
  - cosine_accuracy_threshold
  - cosine_f1
  - cosine_f1_threshold
  - cosine_precision
  - cosine_recall
  - cosine_ap
  - cosine_mcc
model-index:
  - name: SentenceTransformer based on denaya/indoSBERT-large
    results:
      - task:
          type: information-retrieval
          name: Information Retrieval
        dataset:
          name: eval
          type: eval
        metrics:
          - type: cosine_accuracy@1
            value: 0.9120521172638436
            name: Cosine Accuracy@1
          - type: cosine_accuracy@3
            value: 0.990228013029316
            name: Cosine Accuracy@3
          - type: cosine_accuracy@5
            value: 0.993485342019544
            name: Cosine Accuracy@5
          - type: cosine_accuracy@10
            value: 0.996742671009772
            name: Cosine Accuracy@10
          - type: cosine_precision@1
            value: 0.9120521172638436
            name: Cosine Precision@1
          - type: cosine_precision@3
            value: 0.3572204125950054
            name: Cosine Precision@3
          - type: cosine_precision@5
            value: 0.23778501628664495
            name: Cosine Precision@5
          - type: cosine_precision@10
            value: 0.13745928338762217
            name: Cosine Precision@10
          - type: cosine_recall@1
            value: 0.7097252402956855
            name: Cosine Recall@1
          - type: cosine_recall@3
            value: 0.7867346590488319
            name: Cosine Recall@3
          - type: cosine_recall@5
            value: 0.8052359035035943
            name: Cosine Recall@5
          - type: cosine_recall@10
            value: 0.8221312325947948
            name: Cosine Recall@10
          - type: cosine_ndcg@10
            value: 0.8348212945928647
            name: Cosine Ndcg@10
          - type: cosine_mrr@10
            value: 0.9497052892818366
            name: Cosine Mrr@10
          - type: cosine_map@100
            value: 0.7729410950742827
            name: Cosine Map@100
      - task:
          type: binary-classification
          name: Binary Classification
        dataset:
          name: quora duplicates dev
          type: quora_duplicates_dev
        metrics:
          - type: cosine_accuracy
            value: 0.9914529914529915
            name: Cosine Accuracy
          - type: cosine_accuracy_threshold
            value: 0.31953397393226624
            name: Cosine Accuracy Threshold
          - type: cosine_f1
            value: 0.9850953206239168
            name: Cosine F1
          - type: cosine_f1_threshold
            value: 0.30364981293678284
            name: Cosine F1 Threshold
          - type: cosine_precision
            value: 0.988865692414753
            name: Cosine Precision
          - type: cosine_recall
            value: 0.981353591160221
            name: Cosine Recall
          - type: cosine_ap
            value: 0.9956970583311449
            name: Cosine Ap
          - type: cosine_mcc
            value: 0.9791180702139771
            name: Cosine Mcc
---

SentenceTransformer based on denaya/indoSBERT-large

This is a sentence-transformers model finetuned from denaya/indoSBERT-large. It maps sentences & paragraphs to a 256-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.

Model Details

Model Description

  • Model Type: Sentence Transformer
  • Base model: denaya/indoSBERT-large
  • Maximum Sequence Length: 256 tokens
  • Output Dimensionality: 256 dimensions
  • Similarity Function: Cosine Similarity

Model Sources

  • Documentation: Sentence Transformers Documentation (https://sbert.net)
  • Repository: Sentence Transformers on GitHub (https://github.com/UKPLab/sentence-transformers)
  • Hugging Face: Sentence Transformers on Hugging Face (https://huggingface.co/models?library=sentence-transformers)

Full Model Architecture

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Dense({'in_features': 1024, 'out_features': 256, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
)
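
As a quick check of the architecture above, the loaded model should report a 256-token sequence limit and 256-dimensional embeddings (a minimal sketch):

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("yahyaabd/allstats-search-large-bpstable-v1")

# The Dense head projects the 1024-dim pooled BERT output down to 256 dims.
print(model.max_seq_length)                      # 256
print(model.get_sentence_embedding_dimension())  # 256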

Usage

Direct Usage (Sentence Transformers)

First install the Sentence Transformers library:

pip install -U sentence-transformers

Then you can load this model and run inference.

from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub
model = SentenceTransformer("yahyaabd/allstats-search-large-bpstable-v1")
# Run inference
sentences = [
    'Bagaimana kaitan antara pendidikan dan kegiatan mingguan penduduk usia 15+ pada tahun 2022?',
    'Penduduk Berumur 15 Tahun Ke Atas Menurut Pendidikan Tertinggi yang Ditamatkan dan Jenis Kegiatan Selama Seminggu yang Lalu, 2008-2024 ',
    'Persentase Perkembangan Distribusi Pengeluaran ',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 256]

# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]
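
Since the model was trained to match free-form queries against BPS table titles, a common pattern is to embed a title catalogue once and rank it per query. A minimal sketch using util.semantic_search (the three-title corpus below is a hypothetical stand-in, reusing examples from this card):

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("yahyaabd/allstats-search-large-bpstable-v1")

# Hypothetical catalogue of BPS table titles; replace with your own corpus.
corpus = [
    "Ringkasan Neraca Arus Dana, Triwulan IV, 2009, (Miliar Rupiah)",
    "Impor Besi dan Baja Menurut Negara Asal Utama, 2017-2023",
    "Rata-Rata Harga Valuta Asing Terpilih menurut Provinsi 2017",
]
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

query_embedding = model.encode(
    "Ringkasan aliran dana kuartal terakhir 2009 dalam Rupiah",
    convert_to_tensor=True,
)

# Rank the whole catalogue by cosine similarity to the query.
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=3)[0]
for hit in hits:
    print(f"{hit['score']:.4f}  {corpus[hit['corpus_id']]}")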

Evaluation

Metrics

Information Retrieval

Metric Value
cosine_accuracy@1 0.9121
cosine_accuracy@3 0.9902
cosine_accuracy@5 0.9935
cosine_accuracy@10 0.9967
cosine_precision@1 0.9121
cosine_precision@3 0.3572
cosine_precision@5 0.2378
cosine_precision@10 0.1375
cosine_recall@1 0.7097
cosine_recall@3 0.7867
cosine_recall@5 0.8052
cosine_recall@10 0.8221
cosine_ndcg@10 0.8348
cosine_mrr@10 0.9497
cosine_map@100 0.7729
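
These figures are the kind of report produced by an InformationRetrievalEvaluator on a held-out split. A minimal sketch of running the same evaluation on your own data (the ids and relevance mapping below are placeholders, not the actual eval split):

from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator

model = SentenceTransformer("yahyaabd/allstats-search-large-bpstable-v1")

# Placeholder split: query ids to text, doc ids to text, and
# each query id mapped to the set of relevant doc ids.
queries = {"q1": "Ringkasan aliran dana kuartal terakhir 2009 dalam Rupiah"}
corpus = {"d1": "Ringkasan Neraca Arus Dana, Triwulan IV, 2009, (Miliar Rupiah)"}
relevant_docs = {"q1": {"d1"}}

evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs, name="eval")
results = evaluator(model)
print(results["eval_cosine_ndcg@10"])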

Binary Classification

Metric Value
cosine_accuracy 0.9915
cosine_accuracy_threshold 0.3195
cosine_f1 0.9851
cosine_f1_threshold 0.3036
cosine_precision 0.9889
cosine_recall 0.9814
cosine_ap 0.9957
cosine_mcc 0.9791
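
The thresholds above suggest a simple duplicate check: embed both texts and compare their cosine similarity against roughly 0.30, the dev-set F1 optimum. A hedged sketch; the threshold was tuned on this card's dev set and may not transfer to other data:

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("yahyaabd/allstats-search-large-bpstable-v1")

text_a = "Gaji bulanan: beda umur, beda jenis pekerjaan (9 sektor), 2017"
text_b = (
    "Rata-rata Upah/Gaji Bersih Sebulan Buruh/Karyawan/Pegawai Menurut "
    "Kelompok Umur dan Lapangan Pekerjaan Utama di 9 Sektor (Rupiah), 2017"
)

embeddings = model.encode([text_a, text_b])
score = model.similarity(embeddings[0], embeddings[1]).item()

# 0.3036 is the dev-set F1-optimal threshold reported above.
print("duplicate" if score >= 0.3036 else "not duplicate", round(score, 4))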

Training Details

Training Dataset

Unnamed Dataset

  • Size: 1,432 training samples
  • Columns: sentence_0, sentence_1, and label
  • Approximate statistics based on the first 1000 samples:
    • sentence_0: string, min 4 / mean 16.84 / max 32 tokens
    • sentence_1: string, min 3 / mean 20.88 / max 48 tokens
    • label: int, 1: 100.00%
  • Samples:
    • sentence_0: Average monthly net wage/salary of employees by age group and type of work (Rupiah), 2018
      sentence_1: Rata-rata Upah/Gaji Bersih Sebulan Buruh/Karyawan/Pegawai Menurut Kelompok Umur dan Jenis Pekerjaan (Rupiah), 2018
      label: 1
    • sentence_0: Cek average real wage buruh industri pengolahan (level bawah) sekitar tahun 2009
      sentence_1: Rata-rata Upah Riil Per Bulan Buruh Industri Pengolahan di Bawah Mandor, 2005-2014 (1996=100)
      label: 1
    • sentence_0: Dimana saya bisa lihat rekapitulasi dokumen RPB kabupaten/kota?
      sentence_1: Rekap Dokumen RPB Kabupaten/Kota
      label: 1
  • Loss: MultipleNegativesRankingLoss with these parameters (see the sketch below):
    {
        "scale": 20.0,
        "similarity_fct": "cos_sim"
    }
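
With this loss, every other sentence_1 in a batch serves as an in-batch negative for a given sentence_0. A minimal sketch of constructing the loss with these parameters (standard sentence-transformers API):

from sentence_transformers import SentenceTransformer, losses, util

model = SentenceTransformer("denaya/indoSBERT-large")

# scale=20.0 multiplies the in-batch cosine similarities before the
# softmax cross-entropy; cos_sim is the default similarity function.
loss = losses.MultipleNegativesRankingLoss(
    model, scale=20.0, similarity_fct=util.cos_sim
)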

Training Hyperparameters

Non-Default Hyperparameters

  • eval_strategy: steps
  • per_device_train_batch_size: 16
  • per_device_eval_batch_size: 16
  • num_train_epochs: 30
  • fp16: True
  • multi_dataset_batch_sampler: round_robin
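
Put together, a hedged sketch of what this fine-tuning run likely looked like (the single pair below stands in for the 1,432 training pairs, which are not published; the eval dataset and evaluators are omitted for brevity):

from datasets import Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    losses,
)

model = SentenceTransformer("denaya/indoSBERT-large")

# Stand-in for the 1,432 (sentence_0, sentence_1) pairs described above.
train_dataset = Dataset.from_dict({
    "sentence_0": ["Dimana saya bisa lihat rekapitulasi dokumen RPB kabupaten/kota?"],
    "sentence_1": ["Rekap Dokumen RPB Kabupaten/Kota"],
})

args = SentenceTransformerTrainingArguments(
    output_dir="allstats-search-large-bpstable-v1",
    num_train_epochs=30,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,  # requires a GPU; set False on CPU
    # eval_strategy="steps" was used with a held-out eval set, omitted here
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=losses.MultipleNegativesRankingLoss(model),
)
trainer.train()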

All Hyperparameters

  • overwrite_output_dir: False
  • do_predict: False
  • eval_strategy: steps
  • prediction_loss_only: True
  • per_device_train_batch_size: 16
  • per_device_eval_batch_size: 16
  • per_gpu_train_batch_size: None
  • per_gpu_eval_batch_size: None
  • gradient_accumulation_steps: 1
  • eval_accumulation_steps: None
  • torch_empty_cache_steps: None
  • learning_rate: 5e-05
  • weight_decay: 0.0
  • adam_beta1: 0.9
  • adam_beta2: 0.999
  • adam_epsilon: 1e-08
  • max_grad_norm: 1
  • num_train_epochs: 30
  • max_steps: -1
  • lr_scheduler_type: linear
  • lr_scheduler_kwargs: {}
  • warmup_ratio: 0.0
  • warmup_steps: 0
  • log_level: passive
  • log_level_replica: warning
  • log_on_each_node: True
  • logging_nan_inf_filter: True
  • save_safetensors: True
  • save_on_each_node: False
  • save_only_model: False
  • restore_callback_states_from_checkpoint: False
  • no_cuda: False
  • use_cpu: False
  • use_mps_device: False
  • seed: 42
  • data_seed: None
  • jit_mode_eval: False
  • use_ipex: False
  • bf16: False
  • fp16: True
  • fp16_opt_level: O1
  • half_precision_backend: auto
  • bf16_full_eval: False
  • fp16_full_eval: False
  • tf32: None
  • local_rank: 0
  • ddp_backend: None
  • tpu_num_cores: None
  • tpu_metrics_debug: False
  • debug: []
  • dataloader_drop_last: False
  • dataloader_num_workers: 0
  • dataloader_prefetch_factor: None
  • past_index: -1
  • disable_tqdm: False
  • remove_unused_columns: True
  • label_names: None
  • load_best_model_at_end: False
  • ignore_data_skip: False
  • fsdp: []
  • fsdp_min_num_params: 0
  • fsdp_config: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
  • fsdp_transformer_layer_cls_to_wrap: None
  • accelerator_config: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
  • deepspeed: None
  • label_smoothing_factor: 0.0
  • optim: adamw_torch
  • optim_args: None
  • adafactor: False
  • group_by_length: False
  • length_column_name: length
  • ddp_find_unused_parameters: None
  • ddp_bucket_cap_mb: None
  • ddp_broadcast_buffers: False
  • dataloader_pin_memory: True
  • dataloader_persistent_workers: False
  • skip_memory_metrics: True
  • use_legacy_prediction_loop: False
  • push_to_hub: False
  • resume_from_checkpoint: None
  • hub_model_id: None
  • hub_strategy: every_save
  • hub_private_repo: None
  • hub_always_push: False
  • gradient_checkpointing: False
  • gradient_checkpointing_kwargs: None
  • include_inputs_for_metrics: False
  • include_for_metrics: []
  • eval_do_concat_batches: True
  • fp16_backend: auto
  • push_to_hub_model_id: None
  • push_to_hub_organization: None
  • mp_parameters:
  • auto_find_batch_size: False
  • full_determinism: False
  • torchdynamo: None
  • ray_scope: last
  • ddp_timeout: 1800
  • torch_compile: False
  • torch_compile_backend: None
  • torch_compile_mode: None
  • dispatch_batches: None
  • split_batches: None
  • include_tokens_per_second: False
  • include_num_input_tokens_seen: False
  • neftune_noise_alpha: None
  • optim_target_modules: None
  • batch_eval_metrics: False
  • eval_on_start: False
  • use_liger_kernel: False
  • eval_use_gather_object: False
  • average_tokens_across_devices: False
  • prompts: None
  • batch_sampler: batch_sampler
  • multi_dataset_batch_sampler: round_robin

Training Logs

Epoch Step Training Loss eval_cosine_ndcg@10 quora_duplicates_dev_cosine_ap
0.2222 20 - 0.7769 -
0.4444 40 - 0.8167 -
0.6667 60 - 0.8221 -
0.8889 80 - 0.8282 -
1.0 90 - 0.8256 -
1.1111 100 - 0.8278 -
1.3333 120 - 0.8388 -
1.5556 140 - 0.8347 -
1.7778 160 - 0.8351 -
2.0 180 - 0.8407 -
2.2222 200 - 0.8302 -
2.4444 220 - 0.8261 -
2.6667 240 - 0.8217 -
2.8889 260 - 0.8161 -
3.0 270 - 0.8143 -
3.1111 280 - 0.8133 -
3.3333 300 - 0.8259 -
3.5556 320 - 0.8342 -
3.7778 340 - 0.8267 -
4.0 360 - 0.8190 -
4.2222 380 - 0.8193 -
4.4444 400 - 0.8281 -
4.6667 420 - 0.8283 -
4.8889 440 - 0.8197 -
5.0 450 - 0.8211 -
5.1111 460 - 0.8118 -
5.3333 480 - 0.8298 -
5.5556 500 0.0412 0.8283 -
5.7778 520 - 0.8264 -
6.0 540 - 0.8271 -
6.2222 560 - 0.8243 -
6.4444 580 - 0.8256 -
6.6667 600 - 0.8356 -
6.8889 620 - 0.8332 -
7.0 630 - 0.8250 -
7.1111 640 - 0.8179 -
7.3333 660 - 0.8356 -
7.5556 680 - 0.8400 -
7.7778 700 - 0.8349 -
8.0 720 - 0.8281 -
8.2222 740 - 0.8330 -
8.4444 760 - 0.8338 -
8.6667 780 - 0.8338 -
8.8889 800 - 0.8344 -
9.0 810 - 0.8319 -
9.1111 820 - 0.8328 -
9.3333 840 - 0.8325 -
9.5556 860 - 0.8375 -
9.7778 880 - 0.8306 -
10.0 900 - 0.8263 -
10.2222 920 - 0.8280 -
10.4444 940 - 0.8272 -
10.6667 960 - 0.8280 -
10.8889 980 - 0.8313 -
11.0 990 - 0.8307 -
11.1111 1000 0.0198 0.8324 -
11.3333 1020 - 0.8303 -
11.5556 1040 - 0.8262 -
11.7778 1060 - 0.8294 -
12.0 1080 - 0.8309 -
12.2222 1100 - 0.8274 -
12.4444 1120 - 0.8312 -
12.6667 1140 - 0.8371 -
12.8889 1160 - 0.8408 -
13.0 1170 - 0.8374 -
13.1111 1180 - 0.8344 -
13.3333 1200 - 0.8341 -
13.5556 1220 - 0.8333 -
13.7778 1240 - 0.8388 -
14.0 1260 - 0.8414 -
14.2222 1280 - 0.8344 -
14.4444 1300 - 0.8328 -
14.6667 1320 - 0.8340 -
14.8889 1340 - 0.8317 -
15.0 1350 - 0.8260 -
15.1111 1360 - 0.8252 -
15.3333 1380 - 0.8244 -
15.5556 1400 - 0.8269 -
15.7778 1420 - 0.8275 -
16.0 1440 - 0.8281 -
16.2222 1460 - 0.8294 -
16.4444 1480 - 0.8299 -
16.6667 1500 0.0136 0.8318 -
16.8889 1520 - 0.8320 -
17.0 1530 - 0.8332 -
17.1111 1540 - 0.8337 -
17.3333 1560 - 0.8299 -
17.5556 1580 - 0.8283 -
17.7778 1600 - 0.8309 -
18.0 1620 - 0.8329 -
18.2222 1640 - 0.8317 -
18.4444 1660 - 0.8313 -
18.6667 1680 - 0.8317 -
18.8889 1700 - 0.8356 -
19.0 1710 - 0.8345 -
19.1111 1720 - 0.8358 -
19.3333 1740 - 0.8334 -
19.5556 1760 - 0.8335 -
19.7778 1780 - 0.8318 -
20.0 1800 - 0.8326 -
20.2222 1820 - 0.8318 -
20.4444 1840 - 0.8335 -
20.6667 1860 - 0.8333 -
20.8889 1880 - 0.8335 -
21.0 1890 - 0.8341 -
21.1111 1900 - 0.8341 -
21.3333 1920 - 0.8355 -
21.5556 1940 - 0.8360 -
21.7778 1960 - 0.8343 -
22.0 1980 - 0.8351 -
22.2222 2000 0.015 0.8342 -
22.4444 2020 - 0.8342 -
22.6667 2040 - 0.8339 -
22.8889 2060 - 0.8342 -
23.0 2070 - 0.8345 -
23.1111 2080 - 0.8354 -
23.3333 2100 - 0.8366 -
23.5556 2120 - 0.8379 -
23.7778 2140 - 0.8386 -
24.0 2160 - 0.8367 -
24.2222 2180 - 0.8357 -
24.4444 2200 - 0.8372 -
24.6667 2220 - 0.8377 -
24.8889 2240 - 0.8373 -
25.0 2250 - 0.8367 -
25.1111 2260 - 0.8366 -
25.3333 2280 - 0.8369 -
25.5556 2300 - 0.8373 -
25.7778 2320 - 0.8366 -
26.0 2340 - 0.8354 -
26.2222 2360 - 0.8347 -
26.4444 2380 - 0.8344 -
26.6667 2400 - 0.8341 -
26.8889 2420 - 0.8343 -
27.0 2430 - 0.8344 -
27.1111 2440 - 0.8345 -
27.3333 2460 - 0.8344 -
27.5556 2480 - 0.8347 -
27.7778 2500 0.0136 0.8342 -
28.0 2520 - 0.8347 -
28.2222 2540 - 0.8346 -
28.4444 2560 - 0.8346 -
28.6667 2580 - 0.8347 -
28.8889 2600 - 0.8348 -
29.0 2610 - 0.8348 -
29.1111 2620 - 0.8348 -
29.3333 2640 - 0.8348 -
29.5556 2660 - 0.8348 -
29.7778 2680 - 0.8348 -
30.0 2700 - 0.8348 -
-1 -1 - - 0.9957

Framework Versions

  • Python: 3.10.12
  • Sentence Transformers: 3.4.0
  • Transformers: 4.48.1
  • PyTorch: 2.5.1+cu124
  • Accelerate: 1.3.0
  • Datasets: 3.2.0
  • Tokenizers: 0.21.0
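
To reproduce this environment closely, a pinned install along these lines should work (a sketch; install the matching CUDA build of PyTorch separately):

pip install sentence-transformers==3.4.0 transformers==4.48.1 accelerate==1.3.0 datasets==3.2.0 tokenizers==0.21.0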

Citation

BibTeX

Sentence Transformers

@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

MultipleNegativesRankingLoss

@misc{henderson2017efficient,
    title={Efficient Natural Language Response Suggestion for Smart Reply},
    author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
    year={2017},
    eprint={1705.00652},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}