Tom Aarsen
committed on
Commit · 10a34f1
1 Parent(s): 00b4c3f
embeddings models -> embedding models
README.md CHANGED
@@ -59,14 +59,14 @@ from transformers import AutoTokenizer, AutoModel
 import torch
 import torch.nn.functional as F
 
-#Mean Pooling - Take average of all tokens
+# Mean Pooling - Take average of all tokens
 def mean_pooling(model_output, attention_mask):
-    token_embeddings = model_output.last_hidden_state #First element of model_output contains all token embeddings
+    token_embeddings = model_output.last_hidden_state # First element of model_output contains all token embeddings
     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
 
 
-#Encode text
+# Encode text
 def encode(texts):
     # Tokenize sentences
     encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
@@ -92,27 +92,27 @@ docs = ["Around 9 Million people live in London", "London is known for its finan
 tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-cos-v1")
 model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-cos-v1")
 
-#Encode query and docs
+# Encode query and docs
 query_emb = encode(query)
 doc_emb = encode(docs)
 
-#Compute dot score between query and all document embeddings
+# Compute dot score between query and all document embeddings
 scores = torch.mm(query_emb, doc_emb.transpose(0, 1))[0].cpu().tolist()
 
-#Combine docs & scores
+# Combine docs & scores
 doc_score_pairs = list(zip(docs, scores))
 
-#Sort by decreasing score
+# Sort by decreasing score
 doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
 
-#Output passages & scores
+# Output passages & scores
 for doc, score in doc_score_pairs:
     print(score, doc)
 ```
 
 ## Usage (Text Embeddings Inference (TEI))
 
-[Text Embeddings Inference (TEI)](https://github.com/huggingface/text-embeddings-inference) is a blazing fast inference solution for text embeddings models.
+[Text Embeddings Inference (TEI)](https://github.com/huggingface/text-embeddings-inference) is a blazing fast inference solution for text embedding models.
 
 - CPU:
 ```bash
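
For context, the Python snippet these hunks touch can be assembled into a runnable whole. The diff truncates `encode` after the tokenizer call, so everything past that point (the forward pass, pooling, and normalization) is an assumption based on the usual sentence-transformers model-card pattern for a cos-normalized model, and the `query` value is likewise assumed; treat this as a minimal sketch rather than the card's verbatim code.

```python
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

# Mean Pooling - average all token embeddings, masking out padding positions
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output.last_hidden_state
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Encode text; the body after the tokenizer call is assumed, since the diff truncates here
def encode(texts):
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    # L2-normalize so the dot product below equals cosine similarity (cos-v1 model)
    return F.normalize(embeddings, p=2, dim=1)

query = "How many people live in London?"  # assumed example query
docs = ["Around 9 Million people live in London", "London is known for its financial district"]

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-cos-v1")
model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-cos-v1")

# Encode query and docs
query_emb = encode(query)
doc_emb = encode(docs)

# Dot score between the query and every document embedding
scores = torch.mm(query_emb, doc_emb.transpose(0, 1))[0].cpu().tolist()

# Pair docs with scores, sort by decreasing score, and print
for doc, score in sorted(zip(docs, scores), key=lambda x: x[1], reverse=True):
    print(score, doc)
```

The TEI hunk is cut off at the opening ```bash fence, so the launch command itself isn't reproduced here. Once a TEI server is serving this model, it exposes an HTTP `/embed` route; a minimal query sketch, assuming the default port 8080 and the `requests` package:

```python
import requests

# Assumes a TEI server for this model is already listening on localhost:8080
response = requests.post(
    "http://127.0.0.1:8080/embed",
    json={"inputs": ["How many people live in London?"]},
)
response.raise_for_status()
embeddings = response.json()  # one embedding vector per input
print(len(embeddings[0]))     # 768 dimensions for an mpnet-base model
```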