kevinkrahn
committed on
Commit
•
fbc88e2
1
Parent(s):
ee1ab63
Update README.md
Browse files
README.md
CHANGED
@@ -6,24 +6,30 @@ tags:
|
|
6 |
- feature-extraction
|
7 |
- sentence-similarity
|
8 |
- transformers
|
|
|
9 |
|
10 |
---
|
11 |
|
12 |
-
#
|
13 |
|
14 |
-
|
15 |
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
## Usage (Sentence-Transformers)
|
19 |
|
20 |
-
|
21 |
|
22 |
-
|
23 |
-
|
24 |
-
```
|
25 |
|
26 |
-
|
27 |
|
28 |
```python
|
29 |
from sentence_transformers import SentenceTransformer
|
@@ -44,7 +50,7 @@ from transformers import AutoTokenizer, AutoModel
|
|
44 |
import torch
|
45 |
|
46 |
|
47 |
-
def cls_pooling(model_output
|
48 |
return model_output[0][:,0]
|
49 |
|
50 |
|
@@ -52,8 +58,8 @@ def cls_pooling(model_output, attention_mask):
|
|
52 |
sentences = ['This is an example sentence', 'Each sentence is converted']
|
53 |
|
54 |
# Load model from HuggingFace Hub
|
55 |
-
|
56 |
-
|
57 |
|
58 |
# Tokenize sentences
|
59 |
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
|
@@ -63,30 +69,61 @@ with torch.no_grad():
|
|
63 |
model_output = model(**encoded_input)
|
64 |
|
65 |
# Perform pooling. In this case, cls pooling.
|
66 |
-
sentence_embeddings = cls_pooling(model_output
|
67 |
|
68 |
print("Sentence embeddings:")
|
69 |
print(sentence_embeddings)
|
70 |
-
```
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
## Evaluation Results
|
75 |
-
|
76 |
-
<!--- Describe how your model was evaluated -->
|
77 |
-
|
78 |
-
For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name=kevinkrahn/shlm-grc-en)
|
79 |
|
|
|
80 |
|
|
|
81 |
|
82 |
-
## Full Model Architecture
|
83 |
```
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
```
|
89 |
|
90 |
-
|
91 |
-
|
92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
- feature-extraction
|
7 |
- sentence-similarity
|
8 |
- transformers
|
9 |
+
- semantic-search
|
10 |
|
11 |
---
|
12 |
|
13 |
+
# shlm-grc-en
|
14 |
|
15 |
+
## Sentence embeddings for English and Ancient Greek
|
16 |
|
17 |
+
This model creates sentence embeddings in a shared vector space for Ancient Greek and English text.
|
18 |
+
|
19 |
+
The base model uses a modified version of the HLM architecture described in [Heidelberg-Boston @ SIGTYP 2024 Shared Task: Enhancing Low-Resource Language Analysis With Character-Aware Hierarchical Transformers](https://aclanthology.org/2024.sigtyp-1.16/).
|
20 |
+
|
21 |
+
This model is trained to produce sentence embeddings using the multilingual knowledge distillation method and datasets described in [Sentence Embedding Models for Ancient Greek Using Multilingual Knowledge Distillation](https://aclanthology.org/2023.alp-1.2/).
|
22 |
+
|
23 |
+
This model was distilled from `BAAI/bge-base-en-v1.5` for embedding English and Ancient Greek text.
|
24 |
|
25 |
## Usage (Sentence-Transformers)
|
26 |
|
27 |
+
**This model is currently incompatible with the latest version of the sentence-transformers library.**
|
28 |
|
29 |
+
For now, use either HuggingFace Transformers directly (see below) or the following fork of sentence-transformers:
|
30 |
+
https://github.com/kevinkrahn/sentence-transformers
|
|
|
31 |
|
32 |
+
You can use the model with sentence-transformers like this:
|
33 |
|
34 |
```python
|
35 |
from sentence_transformers import SentenceTransformer
|
|
|
50 |
import torch
|
51 |
|
52 |
|
53 |
+
def cls_pooling(model_output):
|
54 |
return model_output[0][:,0]
|
55 |
|
56 |
|
|
|
58 |
sentences = ['This is an example sentence', 'Each sentence is converted']
|
59 |
|
60 |
# Load model from HuggingFace Hub
|
61 |
+
model = AutoModel.from_pretrained('kevinkrahn/shlm-grc-en', trust_remote_code=True)
|
62 |
+
tokenizer = AutoTokenizer.from_pretrained('kevinkrahn/shlm-grc-en', trust_remote_code=True)
|
63 |
|
64 |
# Tokenize sentences
|
65 |
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
|
|
|
69 |
model_output = model(**encoded_input)
|
70 |
|
71 |
# Perform pooling. In this case, cls pooling.
|
72 |
+
sentence_embeddings = cls_pooling(model_output)
|
73 |
|
74 |
print("Sentence embeddings:")
|
75 |
print(sentence_embeddings)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
|
77 |
+
```
|
78 |
|
79 |
+
## Citing & Authors
|
80 |
|
|
|
81 |
```
|
82 |
+
@inproceedings{riemenschneider-krahn-2024-heidelberg,
|
83 |
+
title = "Heidelberg-Boston @ {SIGTYP} 2024 Shared Task: Enhancing Low-Resource Language Analysis With Character-Aware Hierarchical Transformers",
|
84 |
+
author = "Riemenschneider, Frederick and
|
85 |
+
Krahn, Kevin",
|
86 |
+
editor = "Hahn, Michael and
|
87 |
+
Sorokin, Alexey and
|
88 |
+
Kumar, Ritesh and
|
89 |
+
Shcherbakov, Andreas and
|
90 |
+
Otmakhova, Yulia and
|
91 |
+
Yang, Jinrui and
|
92 |
+
Serikov, Oleg and
|
93 |
+
Rani, Priya and
|
94 |
+
Ponti, Edoardo M. and
|
95 |
+
Murado{\u{g}}lu, Saliha and
|
96 |
+
Gao, Rena and
|
97 |
+
Cotterell, Ryan and
|
98 |
+
Vylomova, Ekaterina",
|
99 |
+
booktitle = "Proceedings of the 6th Workshop on Research in Computational Linguistic Typology and Multilingual NLP",
|
100 |
+
month = mar,
|
101 |
+
year = "2024",
|
102 |
+
address = "St. Julian's, Malta",
|
103 |
+
publisher = "Association for Computational Linguistics",
|
104 |
+
url = "https://aclanthology.org/2024.sigtyp-1.16",
|
105 |
+
pages = "131--141",
|
106 |
+
}
|
107 |
```
|
108 |
|
109 |
+
```
|
110 |
+
@inproceedings{krahn-etal-2023-sentence,
|
111 |
+
title = "Sentence Embedding Models for {A}ncient {G}reek Using Multilingual Knowledge Distillation",
|
112 |
+
author = "Krahn, Kevin and
|
113 |
+
Tate, Derrick and
|
114 |
+
Lamicela, Andrew C.",
|
115 |
+
editor = "Anderson, Adam and
|
116 |
+
Gordin, Shai and
|
117 |
+
Li, Bin and
|
118 |
+
Liu, Yudong and
|
119 |
+
Passarotti, Marco C.",
|
120 |
+
booktitle = "Proceedings of the Ancient Language Processing Workshop",
|
121 |
+
month = sep,
|
122 |
+
year = "2023",
|
123 |
+
address = "Varna, Bulgaria",
|
124 |
+
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
|
125 |
+
url = "https://aclanthology.org/2023.alp-1.2",
|
126 |
+
pages = "13--22",
|
127 |
+
}
|
128 |
+
|
129 |
+
```
|