|
--- |
|
base_model: Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2 |
|
datasets: |
|
- Omartificial-Intelligence-Space/Arabic-stsb |
|
- Omartificial-Intelligence-Space/Arabic-NLi-Pair-Class |
|
language: |
|
- ar |
|
library_name: sentence-transformers |
|
metrics: |
|
- pearson_cosine |
|
- spearman_cosine |
|
- pearson_manhattan |
|
- spearman_manhattan |
|
- pearson_euclidean |
|
- spearman_euclidean |
|
- pearson_dot |
|
- spearman_dot |
|
- pearson_max |
|
- spearman_max |
|
pipeline_tag: sentence-similarity |
|
tags: |
|
- mteb |
|
- sentence-transformers |
|
- sentence-similarity |
|
- feature-extraction |
|
- generated_from_trainer |
|
- dataset_size:947818 |
|
- loss:SoftmaxLoss |
|
- loss:CosineSimilarityLoss |
|
- transformers |
|
model-index: |
|
- name: Omartificial-Intelligence-Space/GATE-AraBert-v1 |
|
results: |
|
- dataset: |
|
config: ara-ara |
|
name: MTEB MLQARetrieval (ara-ara) |
|
revision: 397ed406c1a7902140303e7faf60fff35b58d285 |
|
split: test |
|
type: facebook/mlqa |
|
metrics: |
|
- type: ndcg_at_1 |
|
value: 37.409 |
|
- type: ndcg_at_3 |
|
value: 44.269 |
|
- type: ndcg_at_5 |
|
value: 46.23 |
|
- type: ndcg_at_10 |
|
value: 48.076 |
|
- type: ndcg_at_20 |
|
value: 49.679 |
|
- type: ndcg_at_100 |
|
value: 52.037 |
|
- type: ndcg_at_1000 |
|
value: 53.958 |
|
- type: map_at_1 |
|
value: 37.399 |
|
- type: map_at_3 |
|
value: 42.577999999999996 |
|
- type: map_at_5 |
|
value: 43.661 |
|
- type: map_at_10 |
|
value: 44.42 |
|
- type: map_at_20 |
|
value: 44.861000000000004 |
|
- type: map_at_100 |
|
value: 45.179 |
|
- type: map_at_1000 |
|
value: 45.242 |
|
- type: recall_at_1 |
|
value: 37.399 |
|
- type: recall_at_3 |
|
value: 49.156 |
|
- type: recall_at_5 |
|
value: 53.937999999999995 |
|
- type: recall_at_10 |
|
value: 59.657000000000004 |
|
- type: recall_at_20 |
|
value: 65.995 |
|
- type: recall_at_100 |
|
value: 78.821 |
|
- type: recall_at_1000 |
|
value: 94.45 |
|
- type: precision_at_1 |
|
value: 37.409 |
|
- type: precision_at_3 |
|
value: 16.389 |
|
- type: precision_at_5 |
|
value: 10.789 |
|
- type: precision_at_10 |
|
value: 5.9670000000000005 |
|
- type: precision_at_20 |
|
value: 3.3000000000000003 |
|
- type: precision_at_100 |
|
value: 0.788 |
|
- type: precision_at_1000 |
|
value: 0.094 |
|
- type: mrr_at_1 |
|
value: 37.4086 |
|
- type: mrr_at_3 |
|
value: 42.587 |
|
- type: mrr_at_5 |
|
value: 43.6699 |
|
- type: mrr_at_10 |
|
value: 44.4297 |
|
- type: mrr_at_20 |
|
value: 44.8704 |
|
- type: mrr_at_100 |
|
value: 45.1881 |
|
- type: mrr_at_1000 |
|
value: 45.251000000000005 |
|
- type: nauc_ndcg_at_1_max |
|
value: 61.8437 |
|
- type: nauc_ndcg_at_1_std |
|
value: 10.782 |
|
- type: nauc_ndcg_at_1_diff1 |
|
value: 66.1842 |
|
- type: nauc_ndcg_at_3_max |
|
value: 63.157399999999996 |
|
- type: nauc_ndcg_at_3_std |
|
value: 13.114899999999999 |
|
- type: nauc_ndcg_at_3_diff1 |
|
value: 60.312 |
|
- type: nauc_ndcg_at_5_max |
|
value: 63.027100000000004 |
|
- type: nauc_ndcg_at_5_std |
|
value: 13.995099999999999 |
|
- type: nauc_ndcg_at_5_diff1 |
|
value: 59.272499999999994 |
|
- type: nauc_ndcg_at_10_max |
|
value: 63.0273 |
|
- type: nauc_ndcg_at_10_std |
|
value: 14.898700000000002 |
|
- type: nauc_ndcg_at_10_diff1 |
|
value: 58.2739 |
|
- type: nauc_ndcg_at_20_max |
|
value: 62.785199999999996 |
|
- type: nauc_ndcg_at_20_std |
|
value: 15.259800000000002 |
|
- type: nauc_ndcg_at_20_diff1 |
|
value: 57.8913 |
|
- type: nauc_ndcg_at_100_max |
|
value: 62.641999999999996 |
|
- type: nauc_ndcg_at_100_std |
|
value: 15.738299999999999 |
|
- type: nauc_ndcg_at_100_diff1 |
|
value: 58.2303 |
|
- type: nauc_ndcg_at_1000_max |
|
value: 62.7624 |
|
- type: nauc_ndcg_at_1000_std |
|
value: 15.1653 |
|
- type: nauc_ndcg_at_1000_diff1 |
|
value: 58.9359 |
|
- type: nauc_map_at_1_max |
|
value: 61.800900000000006 |
|
- type: nauc_map_at_1_std |
|
value: 10.7369 |
|
- type: nauc_map_at_1_diff1 |
|
value: 66.18270000000001 |
|
- type: nauc_map_at_3_max |
|
value: 62.8757 |
|
- type: nauc_map_at_3_std |
|
value: 12.5061 |
|
- type: nauc_map_at_3_diff1 |
|
value: 61.767 |
|
- type: nauc_map_at_5_max |
|
value: 62.793299999999995 |
|
- type: nauc_map_at_5_std |
|
value: 12.964500000000001 |
|
- type: nauc_map_at_5_diff1 |
|
value: 61.211000000000006 |
|
- type: nauc_map_at_10_max |
|
value: 62.8054 |
|
- type: nauc_map_at_10_std |
|
value: 13.328000000000001 |
|
- type: nauc_map_at_10_diff1 |
|
value: 60.833400000000005 |
|
- type: nauc_map_at_20_max |
|
value: 62.734199999999994 |
|
- type: nauc_map_at_20_std |
|
value: 13.4114 |
|
- type: nauc_map_at_20_diff1 |
|
value: 60.747099999999996 |
|
- type: nauc_map_at_100_max |
|
value: 62.7054 |
|
- type: nauc_map_at_100_std |
|
value: 13.4556 |
|
- type: nauc_map_at_100_diff1 |
|
value: 60.79259999999999 |
|
- type: nauc_map_at_1000_max |
|
value: 62.71099999999999 |
|
- type: nauc_map_at_1000_std |
|
value: 13.444400000000002 |
|
- type: nauc_map_at_1000_diff1 |
|
value: 60.815 |
|
- type: nauc_recall_at_1_max |
|
value: 61.800900000000006 |
|
- type: nauc_recall_at_1_std |
|
value: 10.7369 |
|
- type: nauc_recall_at_1_diff1 |
|
value: 66.18270000000001 |
|
- type: nauc_recall_at_3_max |
|
value: 63.914300000000004 |
|
- type: nauc_recall_at_3_std |
|
value: 14.8614 |
|
- type: nauc_recall_at_3_diff1 |
|
value: 56.044700000000006 |
|
- type: nauc_recall_at_5_max |
|
value: 63.6523 |
|
- type: nauc_recall_at_5_std |
|
value: 17.2352 |
|
- type: nauc_recall_at_5_diff1 |
|
value: 53.2316 |
|
- type: nauc_recall_at_10_max |
|
value: 63.6138 |
|
- type: nauc_recall_at_10_std |
|
value: 20.4315 |
|
- type: nauc_recall_at_10_diff1 |
|
value: 49.4388 |
|
- type: nauc_recall_at_20_max |
|
value: 62.605 |
|
- type: nauc_recall_at_20_std |
|
value: 22.8045 |
|
- type: nauc_recall_at_20_diff1 |
|
value: 46.5945 |
|
- type: nauc_recall_at_100_max |
|
value: 61.5178 |
|
- type: nauc_recall_at_100_std |
|
value: 30.4825 |
|
- type: nauc_recall_at_100_diff1 |
|
value: 44.9405 |
|
- type: nauc_recall_at_1000_max |
|
value: 63.473 |
|
- type: nauc_recall_at_1000_std |
|
value: 39.1421 |
|
- type: nauc_recall_at_1000_diff1 |
|
value: 43.4873 |
|
- type: nauc_precision_at_1_max |
|
value: 61.8437 |
|
- type: nauc_precision_at_1_std |
|
value: 10.782 |
|
- type: nauc_precision_at_1_diff1 |
|
value: 66.1842 |
|
- type: nauc_precision_at_3_max |
|
value: 63.962799999999994 |
|
- type: nauc_precision_at_3_std |
|
value: 14.908299999999999 |
|
- type: nauc_precision_at_3_diff1 |
|
value: 56.0511 |
|
- type: nauc_precision_at_5_max |
|
value: 63.7072 |
|
- type: nauc_precision_at_5_std |
|
value: 17.2854 |
|
- type: nauc_precision_at_5_diff1 |
|
value: 53.2417 |
|
- type: nauc_precision_at_10_max |
|
value: 63.672200000000004 |
|
- type: nauc_precision_at_10_std |
|
value: 20.485300000000002 |
|
- type: nauc_precision_at_10_diff1 |
|
value: 49.4491 |
|
- type: nauc_precision_at_20_max |
|
value: 62.674600000000005 |
|
- type: nauc_precision_at_20_std |
|
value: 22.8667 |
|
- type: nauc_precision_at_20_diff1 |
|
value: 46.6088 |
|
- type: nauc_precision_at_100_max |
|
value: 61.622600000000006 |
|
- type: nauc_precision_at_100_std |
|
value: 30.5766 |
|
- type: nauc_precision_at_100_diff1 |
|
value: 44.9643 |
|
- type: nauc_precision_at_1000_max |
|
value: 63.131400000000006 |
|
- type: nauc_precision_at_1000_std |
|
value: 39.6527 |
|
- type: nauc_precision_at_1000_diff1 |
|
value: 42.9196 |
|
- type: nauc_mrr_at_1_max |
|
value: 61.8437 |
|
- type: nauc_mrr_at_1_std |
|
value: 10.782 |
|
- type: nauc_mrr_at_1_diff1 |
|
value: 66.1842 |
|
- type: nauc_mrr_at_3_max |
|
value: 62.9188 |
|
- type: nauc_mrr_at_3_std |
|
value: 12.5514 |
|
- type: nauc_mrr_at_3_diff1 |
|
value: 61.768699999999995 |
|
- type: nauc_mrr_at_5_max |
|
value: 62.836800000000004 |
|
- type: nauc_mrr_at_5_std |
|
value: 13.0102 |
|
- type: nauc_mrr_at_5_diff1 |
|
value: 61.2128 |
|
- type: nauc_mrr_at_10_max |
|
value: 62.8492 |
|
- type: nauc_mrr_at_10_std |
|
value: 13.3741 |
|
- type: nauc_mrr_at_10_diff1 |
|
value: 60.8352 |
|
- type: nauc_mrr_at_20_max |
|
value: 62.7783 |
|
- type: nauc_mrr_at_20_std |
|
value: 13.4578 |
|
- type: nauc_mrr_at_20_diff1 |
|
value: 60.74889999999999 |
|
- type: nauc_mrr_at_100_max |
|
value: 62.7497 |
|
- type: nauc_mrr_at_100_std |
|
value: 13.5022 |
|
- type: nauc_mrr_at_100_diff1 |
|
value: 60.7944 |
|
- type: nauc_mrr_at_1000_max |
|
value: 62.7546 |
|
- type: nauc_mrr_at_1000_std |
|
value: 13.490499999999999 |
|
- type: nauc_mrr_at_1000_diff1 |
|
value: 60.8168 |
|
- type: main_score |
|
value: 48.076 |
|
task: |
|
type: Retrieval |
|
- dataset: |
|
config: ar-ar |
|
name: MTEB STS17 (ar-ar) |
|
revision: faeb762787bd10488a50c8b5be4a3b82e411949c |
|
split: test |
|
type: mteb/sts17-crosslingual-sts |
|
metrics: |
|
- type: cosine_pearson |
|
value: 82.06597171670848 |
|
- type: cosine_spearman |
|
value: 82.7809395809498 |
|
- type: euclidean_pearson |
|
value: 79.23996991139896 |
|
- type: euclidean_spearman |
|
value: 81.5287595404711 |
|
- type: main_score |
|
value: 82.7809395809498 |
|
- type: manhattan_pearson |
|
value: 78.95407006608013 |
|
- type: manhattan_spearman |
|
value: 81.15109493737467 |
|
task: |
|
type: STS |
|
- dataset: |
|
config: ar |
|
name: MTEB STS22.v2 (ar) |
|
revision: d31f33a128469b20e357535c39b82fb3c3f6f2bd |
|
split: test |
|
type: mteb/sts22-crosslingual-sts |
|
metrics: |
|
- type: cosine_pearson |
|
value: 54.912880452465004 |
|
- type: cosine_spearman |
|
value: 63.09788380910325 |
|
- type: euclidean_pearson |
|
value: 57.92665617677832 |
|
- type: euclidean_spearman |
|
value: 62.76032598469037 |
|
- type: main_score |
|
value: 63.09788380910325 |
|
- type: manhattan_pearson |
|
value: 58.0736648155273 |
|
- type: manhattan_spearman |
|
value: 62.94190582776664 |
|
task: |
|
type: STS |
|
- dataset: |
|
config: ar |
|
name: MTEB STS22 (ar) |
|
revision: de9d86b3b84231dc21f76c7b7af1f28e2f57f6e3 |
|
split: test |
|
type: mteb/sts22-crosslingual-sts |
|
metrics: |
|
- type: cosine_pearson |
|
value: 51.72534929358701 |
|
- type: cosine_spearman |
|
value: 59.75149627160101 |
|
- type: euclidean_pearson |
|
value: 53.894835373598774 |
|
- type: euclidean_spearman |
|
value: 59.44278354697161 |
|
- type: main_score |
|
value: 59.75149627160101 |
|
- type: manhattan_pearson |
|
value: 54.076675975406985 |
|
- type: manhattan_spearman |
|
value: 59.610061143235725 |
|
task: |
|
type: STS |
|
widget: |
|
- source_sentence: امرأة تكتب شيئاً |
|
sentences: |
|
- مراهق يتحدث إلى فتاة عبر كاميرا الإنترنت |
|
- امرأة تقطع البصل الأخضر. |
|
- مجموعة من كبار السن يتظاهرون حول طاولة الطعام. |
|
- source_sentence: تتشكل النجوم في مناطق تكوين النجوم، والتي تنشأ نفسها من السحب الجزيئية. |
|
sentences: |
|
- لاعب كرة السلة على وشك تسجيل نقاط لفريقه. |
|
- المقال التالي مأخوذ من نسختي من "أطلس البطريق الجديد للتاريخ الوسطى" |
|
- قد يكون من الممكن أن يوجد نظام شمسي مثل نظامنا خارج المجرة |
|
- source_sentence: >- |
|
تحت السماء الزرقاء مع الغيوم البيضاء، يصل طفل لمس مروحة طائرة واقفة على حقل |
|
من العشب. |
|
sentences: |
|
- امرأة تحمل كأساً |
|
- طفل يحاول لمس مروحة طائرة |
|
- اثنان من عازبين عن الشرب يستعدون للعشاء |
|
- source_sentence: رجل في منتصف العمر يحلق لحيته في غرفة ذات جدران بيضاء والتي لا تبدو كحمام |
|
sentences: |
|
- فتى يخطط اسمه على مكتبه |
|
- رجل ينام |
|
- المرأة وحدها وهي نائمة في غرفة نومها |
|
- source_sentence: الكلب البني مستلقي على جانبه على سجادة بيج، مع جسم أخضر في المقدمة. |
|
sentences: |
|
- شخص طويل القامة |
|
- المرأة تنظر من النافذة. |
|
- لقد مات الكلب |
|
license: apache-2.0 |
|
--- |
|
|
|
# GATE-AraBert-V1 |
|
|
|
This is **GATE | General Arabic Text Embedding**, a model trained using SentenceTransformers in a **multi-task** setup. The model is trained on the **AllNLI** and **STS** datasets. |
|
|
|
## Model Details |
|
|
|
### Model Description |
|
- **Model Type:** Sentence Transformer |
|
- **Base model:** [Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2](https://huggingface.co/Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2) <!-- at revision 5ce4f80f3ede26de623d6ac10681399dba5c684a --> |
|
- **Maximum Sequence Length:** 512 tokens |
|
- **Output Dimensionality:** 768 dimensions |
|
- **Similarity Function:** Cosine Similarity |
|
- **Training Datasets:** |
|
- [all-nli](https://huggingface.co/datasets/Omartificial-Intelligence-Space/Arabic-NLi-Pair-Class) |
|
- [sts](https://huggingface.co/datasets/Omartificial-Intelligence-Space/arabic-stsb) |
|
- **Language:** ar |
|
|
|
|
|
## Usage |
|
|
|
### Direct Usage (Sentence Transformers) |
|
|
|
First install the Sentence Transformers library: |
|
|
|
```bash |
|
pip install -U sentence-transformers |
|
``` |
|
|
|
Then you can load this model and run inference. |
|
```python |
|
from sentence_transformers import SentenceTransformer |
|
|
|
# Download from the 🤗 Hub |
|
model = SentenceTransformer("Omartificial-Intelligence-Space/GATE-AraBert-v1") |
|
# Run inference |
|
sentences = [ |
|
'الكلب البني مستلقي على جانبه على سجادة بيج، مع جسم أخضر في المقدمة.', |
|
'لقد مات الكلب', |
|
'شخص طويل القامة', |
|
] |
|
embeddings = model.encode(sentences) |
|
print(embeddings.shape) |
|
# [3, 768] |
|
|
|
# Get the similarity scores for the embeddings |
|
similarities = model.similarity(embeddings, embeddings) |
|
print(similarities.shape) |
|
# [3, 3] |
|
``` |
|
|
|
|
|
## Evaluation |
|
|
|
### Metrics |
|
|
|
#### Semantic Similarity |
|
* Dataset: `sts-dev` |
|
* Evaluated with [<code>EmbeddingSimilarityEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.EmbeddingSimilarityEvaluator) |
|
|
|
| Metric | Value | |
|
|:--------------------|:----------| |
|
| pearson_cosine | 0.8391 | |
|
| **spearman_cosine** | **0.841** | |
|
| pearson_manhattan | 0.8277 | |
|
| spearman_manhattan | 0.8361 | |
|
| pearson_euclidean | 0.8274 | |
|
| spearman_euclidean | 0.8358 | |
|
| pearson_dot | 0.8154 | |
|
| spearman_dot | 0.818 | |
|
| pearson_max | 0.8391 | |
|
| spearman_max | 0.841 | |
|
|
|
#### Semantic Similarity |
|
* Dataset: `sts-test` |
|
* Evaluated with [<code>EmbeddingSimilarityEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.EmbeddingSimilarityEvaluator) |
|
|
|
| Metric | Value | |
|
|:--------------------|:-----------| |
|
| pearson_cosine | 0.813 | |
|
| **spearman_cosine** | **0.8173** | |
|
| pearson_manhattan | 0.8114 | |
|
| spearman_manhattan | 0.8164 | |
|
| pearson_euclidean | 0.8103 | |
|
| spearman_euclidean | 0.8158 | |
|
| pearson_dot | 0.7908 | |
|
| spearman_dot | 0.7887 | |
|
| pearson_max | 0.813 | |
|
| spearman_max | 0.8173 | |
|
|
|
|
|
## <span style="color:blue">Acknowledgments</span> |
|
|
|
The author would like to thank Prince Sultan University for their invaluable support in this project. Their contributions and resources have been instrumental in the development and fine-tuning of these models. |
|
|
|
|
|
```markdown |
|
## Citation |
|
|
|
If you use GATE, please cite it as follows: |
|
|
|
@misc{nacar2025GATE, |
|
title={GATE: General Arabic Text Embedding for Enhanced Semantic Textual Similarity with Hybrid Loss Training}, |
|
author={Omer Nacar and Anis Koubaa and Serry Taiseer Sibaee and Lahouari Ghouti}, |
|
year={2025}, |
|
note={Submitted to COLING 2025}, |
|
url={https://huggingface.co/Omartificial-Intelligence-Space/GATE-AraBert-v1}, |
|
} |
|
|
|
|
|
|