Update README.md
README.md
CHANGED
@@ -60,11 +60,12 @@ from transformers import AutoModelForMaskedLM, AutoTokenizer
 
 
 # get sparse vector from dense vectors with shape batch_size * seq_len * vocab_size
-def get_sparse_vector(feature, output):
+def get_sparse_vector(feature, output, prune_ratio=0.1):
     values, _ = torch.max(output*feature["attention_mask"].unsqueeze(-1), dim=1)
     values = torch.log(1 + torch.relu(values))
     values[:,special_token_ids] = 0
-    return values
+    max_values = values.max(dim=-1)[0].unsqueeze(1) * prune_ratio
+    return values * (values > max_values)
 
 # transform the sparse vector to a dict of (token, weight)
 def transform_sparse_vector_to_dict(sparse_vector):
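The change adds threshold pruning to the sparse vectors: every token whose weight is at most `prune_ratio` times the largest weight in its vector is zeroed out. A minimal sketch of the new step on a made-up toy tensor (not part of the diff):

```python
import torch

# toy batch of two sparse vectors over a 4-token vocab
values = torch.tensor([[2.0, 0.15, 0.9, 0.05],
                       [1.0, 0.20, 0.05, 0.0]])
prune_ratio = 0.1

# per-row threshold: 10% of each vector's largest weight, broadcast over the vocab dim
max_values = values.max(dim=-1)[0].unsqueeze(1) * prune_ratio
print(values * (values > max_values))
# tensor([[2.0000, 0.0000, 0.9000, 0.0000],
#         [1.0000, 0.2000, 0.0000, 0.0000]])
```

Because the comparison is strict (`>`), a weight sitting exactly at the threshold is also dropped; the per-row maxima broadcast across the vocab dimension, so the same code handles a whole batch at once.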
@@ -127,7 +128,7 @@ document_sparse_vector = get_sparse_vector(feature_document, output)
 
 # get similarity score
 sim_score = torch.matmul(query_sparse_vector[0],document_sparse_vector[0])
-print(sim_score) # tensor(7.
+print(sim_score) # tensor(7.6317, grad_fn=<DotBackward0>)
 
 
 query_token_weight = transform_sparse_vector_to_dict(query_sparse_vector)[0]
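For context, `torch.matmul` on two 1-D tensors is a plain dot product, so only tokens that carry nonzero weight in both the query vector and the document vector contribute to `sim_score`. A standalone illustration with hypothetical weights (not from the README):

```python
import torch

q = torch.tensor([0.0, 1.5, 0.0, 0.8])  # hypothetical query sparse vector
d = torch.tensor([0.3, 2.0, 0.0, 0.0])  # hypothetical document sparse vector

# only index 1 is nonzero in both vectors, so the score is 1.5 * 2.0
print(torch.matmul(q, d))  # tensor(3.)
```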
@@ -143,7 +144,6 @@ for token in sorted(query_token_weight, key=lambda x:query_token_weight[x], reve
 # score in query: 1.6406, score in document: 0.9018, token: now
 # score in query: 1.6108, score in document: 0.3141, token: ?
 # score in query: 1.2721, score in document: 1.3446, token: ny
-# score in query: 0.6005, score in document: 0.1804, token: in
 ```
 
 The above code sample shows an example of neural sparse search. Although there are no overlapping tokens in the original query and document, the model still performs a good match.
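Since the score is a dot product, it decomposes token by token: each shared token contributes its query weight times its document weight. Recomputing that product for just the three tokens printed above gives a partial sum (the full 7.6317 also collects every other token the two vectors share):

```python
# (token, weight) pairs copied from the printed output above
query_token_weight = {"now": 1.6406, "?": 1.6108, "ny": 1.2721}
document_token_weight = {"now": 0.9018, "?": 0.3141, "ny": 1.3446}

# each shared token contributes query weight * document weight to the score
partial = sum(w * document_token_weight[t] for t, w in query_token_weight.items())
print(round(partial, 4))  # 3.6959
```

This is what the closing sentence is getting at: the match comes from tokens the model weights highly on both sides rather than from literal word overlap between the two original texts.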