ai-forever committed
Commit da6ad08
1 parent: 49cf776

Update README.md

Files changed (1):
  1. README.md +10 -0
README.md CHANGED
 
# ruELECTRA large model multitask (cased) for Sentence Embeddings in the Russian language

For better quality, use mean token embeddings.

## Usage (HuggingFace Models Repository)

You can use the model directly from the model repository to compute sentence embeddings:

```python
from transformers import AutoTokenizer, AutoModel
import torch

# Mean pooling: take the attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

# Sentences we want sentence embeddings for
sentences = ['Привет! Как твои дела?',                  # 'Hi! How are you?'
             'А правда, что 42 твое любимое число?']    # 'Is it true that 42 is your favorite number?'

# Load tokenizer and model from the Hugging Face model repository
tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruELECTRA-large")
model = AutoModel.from_pretrained("ai-forever/ruELECTRA-large")

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=24, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
```
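
A minimal sketch of how the resulting embeddings might be compared, assuming the snippet above has already been run (variable names follow that snippet; cosine similarity via `torch.nn.functional` is one common choice, not prescribed by the model card):

```python
import torch.nn.functional as F

# `sentence_embeddings` has shape (2, hidden_size), one vector per input sentence.
# Cosine similarity between the two sentence vectors; values closer to 1.0
# indicate more similar sentences.
similarity = F.cosine_similarity(sentence_embeddings[0], sentence_embeddings[1], dim=0)
print(f"cosine similarity: {similarity.item():.4f}")
```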