Update README.md
Browse files
README.md
CHANGED
@@ -33,7 +33,7 @@ import torch
|
|
33 |
from transformers import AutoModel, AutoTokenizer
|
34 |
|
35 |
# Load swissBERT for sentence embeddings model
|
36 |
-
model_name="jgrosjean-mathesis/swissbert-for-sentence-embeddings"
|
37 |
model = AutoModel.from_pretrained(model_name)
|
38 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
39 |
|
@@ -41,13 +41,13 @@ def generate_sentence_embedding(sentence, language):
|
|
41 |
|
42 |
# Set adapter to specified language
|
43 |
if "de" in language:
|
44 |
-
|
45 |
if "fr" in language:
|
46 |
-
|
47 |
if "it" in language:
|
48 |
-
|
49 |
if "rm" in language:
|
50 |
-
|
51 |
|
52 |
# Tokenize input sentence
|
53 |
inputs = tokenizer(sentence, padding=True, truncation=True, return_tensors="pt", max_length=512)
|
@@ -86,11 +86,15 @@ embedding_1 = generate_sentence_embedding(sentence_1, language="de")
|
|
86 |
embedding_2 = generate_sentence_embedding(sentence_2, language="fr")
|
87 |
|
88 |
# Compute cosine similarity
|
89 |
-
cosine_score = cosine_similarity(
|
90 |
|
91 |
# Output the score
|
92 |
print("The cosine score for", sentence_1, "and", sentence_2, "is", cosine_score)
|
93 |
```
|
|
|
|
|
|
|
|
|
94 |
|
95 |
## Bias, Risks, and Limitations
|
96 |
|
@@ -123,8 +127,6 @@ Batch size: 512
|
|
123 |
|
124 |
<!-- This section describes the evaluation protocols and provides the results. -->
|
125 |
|
126 |
-
### Testing Data, Factors & Metrics
|
127 |
-
|
128 |
#### Baseline
|
129 |
|
130 |
The first baseline is [distiluse-base-multilingual-cased](https://www.sbert.net/examples/training/multilingual/README.html), a high-performing Sentence Transformer model that is able to process German, French and Italian (and more).
|
|
|
33 |
from transformers import AutoModel, AutoTokenizer
|
34 |
|
35 |
# Load swissBERT for sentence embeddings model
|
36 |
+
model_name = "jgrosjean-mathesis/swissbert-for-sentence-embeddings"
|
37 |
model = AutoModel.from_pretrained(model_name)
|
38 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
39 |
|
|
|
41 |
|
42 |
# Set adapter to specified language
|
43 |
if "de" in language:
|
44 |
+
model.set_default_language("de_CH")
|
45 |
if "fr" in language:
|
46 |
+
model.set_default_language("fr_CH")
|
47 |
if "it" in language:
|
48 |
+
model.set_default_language("it_CH")
|
49 |
if "rm" in language:
|
50 |
+
model.set_default_language("rm_CH")
|
51 |
|
52 |
# Tokenize input sentence
|
53 |
inputs = tokenizer(sentence, padding=True, truncation=True, return_tensors="pt", max_length=512)
|
|
|
86 |
embedding_2 = generate_sentence_embedding(sentence_2, language="fr")
|
87 |
|
88 |
# Compute cosine similarity
|
89 |
+
cosine_score = cosine_similarity(embedding_1, embedding_2)
|
90 |
|
91 |
# Output the score
|
92 |
print("The cosine score for", sentence_1, "and", sentence_2, "is", cosine_score)
|
93 |
```
|
94 |
+
Output:
|
95 |
+
```
|
96 |
+
The cosine score for ['Der Zug kommt um 9 Uhr in Zürich an.'] and ['Le train arrive à Lausanne à 9h.'] is [[0.85555995]]
|
97 |
+
```
|
98 |
|
99 |
## Bias, Risks, and Limitations
|
100 |
|
|
|
127 |
|
128 |
<!-- This section describes the evaluation protocols and provides the results. -->
|
129 |
|
|
|
|
|
130 |
#### Baseline
|
131 |
|
132 |
The first baseline is [distiluse-base-multilingual-cased](https://www.sbert.net/examples/training/multilingual/README.html), a high-performing Sentence Transformer model that is able to process German, French and Italian (and more).
|