nickprock committed
Commit 415e069 · 1 Parent(s): 21e0c10

Update README.md

Files changed (1)
  1. README.md +33 -13
README.md CHANGED
@@ -65,34 +65,54 @@ Without [sentence-transformers](https://www.SBERT.net), you can use the model like this:
 from transformers import AutoTokenizer, AutoModel
 import torch
 
-
 #Mean Pooling - Take attention mask into account for correct averaging
 def mean_pooling(model_output, attention_mask):
-    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
+    token_embeddings = model_output.last_hidden_state
     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
 
 
+#Encode text
+def encode(texts):
+    # Tokenize sentences
+    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
+
+    # Compute token embeddings
+    with torch.no_grad():
+        model_output = model(**encoded_input, return_dict=True)
+
+    # Perform pooling
+    embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+
+    return embeddings
+
+
 # Sentences we want sentence embeddings for
 query = "Quante persone vivono a Londra?"
 docs = ["A Londra vivono circa 9 milioni di persone", "Londra è conosciuta per il suo quartiere finanziario"]
 
 # Load model from HuggingFace Hub
-tokenizer = AutoTokenizer.from_pretrained('nickprock/sentence-bert-base-italian-uncased')
-model = AutoModel.from_pretrained('nickprock/sentence-bert-base-italian-uncased')
+tokenizer = AutoTokenizer.from_pretrained("nickprock/mmarco-bert-base-italian-uncased")
+model = AutoModel.from_pretrained("nickprock/mmarco-bert-base-italian-uncased")
+
+#Encode query and docs
+query_emb = encode(query)
+doc_emb = encode(docs)
+
+#Compute dot score between query and all document embeddings
+scores = torch.mm(query_emb, doc_emb.transpose(0, 1))[0].cpu().tolist()
 
-# Tokenize sentences
-encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+#Combine docs & scores
+doc_score_pairs = list(zip(docs, scores))
 
-# Compute token embeddings
-with torch.no_grad():
-    model_output = model(**encoded_input)
+#Sort by decreasing score
+doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
 
-# Perform pooling. In this case, mean pooling.
-sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+#Output passages & scores
+print("Query:", query)
+for doc, score in doc_score_pairs:
+    print(score, doc)
 
-print("Sentence embeddings:")
-print(sentence_embeddings)
 ```
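
The hunk's context line refers back to the sentence-transformers section of the same README. For comparison, here is a minimal sketch of the same query/document retrieval flow using the sentence-transformers API; the model id and sentences are taken from the updated snippet, while the `SentenceTransformer`/`util.dot_score` calls are an assumed equivalent usage, not part of this commit.

```python
from sentence_transformers import SentenceTransformer, util

# Assumed equivalent of the updated transformers snippet: encode() applies the
# model's own tokenization and pooling, so no manual mean_pooling is needed.
model = SentenceTransformer("nickprock/mmarco-bert-base-italian-uncased")

query = "Quante persone vivono a Londra?"
docs = ["A Londra vivono circa 9 milioni di persone", "Londra è conosciuta per il suo quartiere finanziario"]

# Encode query and documents as tensors
query_emb = model.encode(query, convert_to_tensor=True)
doc_emb = model.encode(docs, convert_to_tensor=True)

# Dot-product scores between the query and each document, highest first
scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()
for doc, score in sorted(zip(docs, scores), key=lambda x: x[1], reverse=True):
    print(score, doc)
```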