jgrosjean committed on
Commit d8f35d4
1 Parent(s): ef4d93a

Update README.md

Files changed (1)
  1. README.md +33 -16
README.md CHANGED
@@ -40,10 +40,19 @@ from transformers import AutoModel, AutoTokenizer
 model_name="jgrosjean-mathesis/swissbert-for-sentence-embeddings"
 model = AutoModel.from_pretrained(model_name)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-model.set_default_language("de_CH")
-
-def generate_sentence_embedding(sentence, ):
-
+
+def generate_sentence_embedding(sentence, language):
+
+    # Set adapter to specified language
+    if "de" in language:
+        model.set_default_language("de_CH")
+    if "fr" in language:
+        model.set_default_language("fr_CH")
+    if "it" in language:
+        model.set_default_language("it_CH")
+    if "rm" in language:
+        model.set_default_language("rm_CH")
+
     # Tokenize input sentence
     inputs = tokenizer(sentence, padding=True, truncation=True, return_tensors="pt", max_length=512)
 
@@ -56,7 +65,7 @@ def generate_sentence_embedding(sentence, ):
 
     return embedding
 
-sentence_embedding = generate_sentence_embedding("Wir feiern am 1. August den Schweizer Nationalfeiertag.")
+sentence_embedding = generate_sentence_embedding("Wir feiern am 1. August den Schweizer Nationalfeiertag.", language="de")
 print(sentence_embedding)
 ```
 Output:
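The two hunks above show only the edges of `generate_sentence_embedding`; the forward pass and pooling in between are unchanged context that the diff omits. For orientation, here is a minimal sketch of how the updated function could look end to end. The pooling step is an assumption (mean pooling over the attention mask), not something this commit shows:

```python
import torch
from transformers import AutoModel, AutoTokenizer

model_name = "jgrosjean-mathesis/swissbert-for-sentence-embeddings"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def generate_sentence_embedding(sentence, language):

    # Set adapter to specified language (the logic added by this commit)
    if "de" in language:
        model.set_default_language("de_CH")
    if "fr" in language:
        model.set_default_language("fr_CH")
    if "it" in language:
        model.set_default_language("it_CH")
    if "rm" in language:
        model.set_default_language("rm_CH")

    # Tokenize input sentence
    inputs = tokenizer(sentence, padding=True, truncation=True,
                       return_tensors="pt", max_length=512)

    # Forward pass without gradient tracking (assumed)
    with torch.no_grad():
        outputs = model(**inputs)

    # Mean-pool token embeddings over non-padding positions (assumed)
    mask = inputs["attention_mask"].unsqueeze(-1)
    embedding = (outputs.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1)

    return embedding
```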
@@ -67,6 +76,26 @@ tensor([[ 5.6306e-02, -2.8375e-01, -4.1495e-02, 7.4393e-02, -3.1552e-01,
 ...]])
 ```
 
+### Semantic Textual Similarity
+
+```python
+from sklearn.metrics.pairwise import cosine_similarity
+
+# Define two sentences
+sentence_1 = ["Der Zug kommt um 9 Uhr in Zürich an."]
+sentence_2 = ["Le train arrive à Lausanne à 9h."]
+
+# Compute embeddings for both
+embedding_1 = generate_sentence_embedding(sentence_1, language="de")
+embedding_2 = generate_sentence_embedding(sentence_2, language="fr")
+
+# Compute cosine similarity
+cosine_score = cosine_similarity(embedding_1, embedding_2)
+
+# Output the score
+print("The cosine score for", sentence_1, "and", sentence_2, "is", cosine_score)
+```
+
 ## Bias, Risks, and Limitations
 
 <!-- This section is meant to convey both technical and sociotechnical limitations. -->
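One caveat on the Semantic Textual Similarity snippet added above: scikit-learn's `cosine_similarity` expects two 2-D array-likes and returns a matrix of pairwise scores, so the two `(1, hidden_size)` embeddings produce a `1×1` matrix rather than a bare scalar. If the embeddings are torch tensors that still track gradients, they also need detaching before NumPy conversion. A hedged usage sketch; whether `.detach()` is needed depends on how the embeddings were computed:

```python
# Convert the (1, hidden_size) torch tensors to NumPy before scoring;
# .detach() is only required if the embeddings still track gradients.
score_matrix = cosine_similarity(embedding_1.detach().numpy(),
                                 embedding_2.detach().numpy())
print(score_matrix[0][0])  # scalar similarity for the sentence pair
```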
@@ -162,18 +191,6 @@ Carbon emissions can be estimated using the [Machine Learning Impact calculator]
 
 [More Information Needed]
 
-### Compute Infrastructure
-
-[More Information Needed]
-
-#### Hardware
-
-[More Information Needed]
-
-#### Software
-
-[More Information Needed]
-
 ## Citation [optional]
 
 <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
 
40
  model_name="jgrosjean-mathesis/swissbert-for-sentence-embeddings"
41
  model = AutoModel.from_pretrained(model_name)
42
  tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 
 
43
 
44
+ def generate_sentence_embedding(sentence, language):
45
+
46
+ # Set adapter to specified language
47
+ if "de" in language:
48
+ model.set_default_language("de_CH")
49
+ if "fr" in language:
50
+ model.set_default_language("fr_CH")
51
+ if "it" in language:
52
+ model.set_default_language("it_CH")
53
+ if "rm" in language:
54
+ model.set_default_language("rm_CH")
55
+
56
  # Tokenize input sentence
57
  inputs = tokenizer(sentence, padding=True, truncation=True, return_tensors="pt", max_length=512)
58
 
 
65
 
66
  return embedding
67
 
68
+ sentence_embedding = generate_sentence_embedding("Wir feiern am 1. August den Schweizer Nationalfeiertag.", language="de")
69
  print(sentence_embedding)
70
  ```
71
  Output:
 
76
  ...]])
77
  ```
78
 
79
+ ### Semantic Textual Similarity
80
+
81
+ ```python
82
+ from sklearn.metrics.pairwise import cosine_similarity
83
+
84
+ # Define two sentences
85
+ sentence_1 = ["Der Zug kommt um 9 Uhr in Zürich an."]
86
+ sentence_2 = ["Le train arrive à Lausanne à 9h."]
87
+
88
+ #Compute embedding for both
89
+ embedding_1 = generate_sentence_embedding(sentence_1, language="de")
90
+ embedding_2 = generate_sentence_embedding(sentence_2, language="fr")
91
+
92
+ #Compute cosine-similarity
93
+ cosine_score = cosine_similarity((embedding_1, embedding_2)
94
+
95
+ #Output the score
96
+ print("The cosine score for", sentence_1, "and", sentence_2, "is", cosine_score)
97
+ ```
98
+
99
  ## Bias, Risks, and Limitations
100
 
101
  <!-- This section is meant to convey both technical and sociotechnical limitations. -->
 
191
 
192
  [More Information Needed]
193
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  ## Citation [optional]
195
 
196
  <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
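A closing design note on the adapter selection this commit introduces: four independent `if` checks work, but a lookup table makes the supported languages explicit and fails loudly on anything else. A sketch of an equivalent refactor for well-formed language tags; the names below are hypothetical and not part of the commit:

```python
# Hypothetical refactor of the adapter selection; not part of this commit.
ADAPTER_BY_PREFIX = {"de": "de_CH", "fr": "fr_CH", "it": "it_CH", "rm": "rm_CH"}

def set_language_adapter(model, language):
    # Activate the adapter whose language prefix appears in the given tag
    for prefix, adapter in ADAPTER_BY_PREFIX.items():
        if prefix in language:
            model.set_default_language(adapter)
            return
    raise ValueError(f"No swissbert adapter for language {language!r}")
```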