File size: 2,497 Bytes
5f773d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# Description: Semantic search with Facebook's Faiss library: documents are embedded and indexed, then retrieved by similarity to a query
# Reference: https://deepnote.com/blog/semantic-search-using-faiss-and-mpnet

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import faiss

class SemanticEmbedding:
    """Turn sentences into L2-normalised embedding vectors.

    A Hugging Face tokenizer/model pair is loaded once in the constructor;
    ``get_embedding`` runs the model without gradient tracking, mean-pools the
    token embeddings under the attention mask and normalises each row.
    """

    def __init__(self, model_name="sentence-transformers/all-mpnet-base-v2"):
        """Load the tokenizer and the model identified by ``model_name``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings, counting only unmasked positions.

        Args:
            model_output: Model result whose first element holds the token
                embeddings.
            attention_mask: Mask tensor; it is given a trailing axis and
                expanded to the size of the token embeddings.

        Returns:
            Tensor of the masked sum over axis 1 divided by the mask total.
        """
        hidden_states = model_output[0]
        mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
        masked_total = torch.sum(hidden_states * mask, 1)
        # Clamp the divisor so a fully masked row does not divide by zero.
        mask_total = torch.clamp(mask.sum(1), min=1e-9)
        return masked_total / mask_total

    def get_embedding(self, sentences):
        """Embed ``sentences`` and return the result as a NumPy array.

        Args:
            sentences: Input handed unchanged to the tokenizer (padding and
                truncation enabled, PyTorch tensors requested).

        Returns:
            NumPy array of the pooled embeddings, L2-normalised along axis 1.
        """
        batch = self.tokenizer(
            sentences, padding=True, truncation=True, return_tensors="pt"
        )

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            outputs = self.model(**batch)

        pooled = self.mean_pooling(outputs, batch["attention_mask"])
        unit_vectors = F.normalize(pooled, p=2, dim=1)
        return unit_vectors.detach().numpy()


class FaissForQuerySearch:
    """Inner-product FAISS index that maps stored documents to labels.

    Every vector added to the index gets one entry in ``uuid``, ``labels`` and
    ``doc_map`` at the same position, so a position returned by the index can
    be used directly to look up the label of the matching document.

    Attributes are kept public because callers may read them:
        index:   the ``faiss.IndexFlatIP`` holding the embeddings.
        doc_map: index position -> original document text.
        model:   object exposing ``get_embedding(text)``.
        ctr:     next free index position (number of registered vectors).
        uuid:    caller-supplied identifier per index position.
        labels:  predicted label per index position.
    """

    def __init__(self, model, dim=768):
        """Create an empty index.

        Args:
            model: Encoder exposing ``get_embedding(text)``; its output is
                passed straight to the FAISS index.
            dim: Dimensionality of the embedding vectors.
        """
        self.index = faiss.IndexFlatIP(dim)
        # Maintaining the document data
        self.doc_map = dict()
        self.model = model
        self.ctr = 0
        self.uuid = []
        self.labels = []

    def search_query(self, query, k=1):
        """Return the labels of the ``k`` stored documents closest to ``query``.

        Only the first row of the search result is used, i.e. the first query
        when ``query`` embeds to several vectors.

        Args:
            query: Text handed to ``model.get_embedding``.
            k: Number of neighbours requested from the index.

        Returns:
            List of labels in the order the index returned the hits; it is
            shorter than ``k`` when the index holds fewer than ``k`` vectors.
        """
        _scores, positions = self.index.search(self.model.get_embedding(query), k)
        known = len(self.labels)
        # FAISS pads missing neighbours with -1; the bounds check drops those
        # and anything else that has no registered label.
        return [self.labels[pos] for pos in positions[0] if 0 <= pos < known]

    def add_summary(self, document_text, id, predicted_label):
        """Embed ``document_text`` and register it under ``id``/``predicted_label``.

        When ``document_text`` embeds to several vectors (e.g. a list of text
        chunks), every vector is registered with the same ``id`` and label so
        that index positions stay aligned with ``uuid``, ``labels`` and
        ``doc_map``.

        Args:
            document_text: Text (or sequence of texts) to index.
            id: Caller-supplied identifier. The name shadows the builtin but is
                kept so existing keyword callers continue to work.
            predicted_label: Label returned by ``search_query`` for this entry.
        """
        vectors = self.model.get_embedding(document_text)
        self.index.add(vectors)  # index

        row_count = len(vectors)
        # One stored text per indexed row: the matching chunk when the input is
        # a sequence of that many texts, otherwise the input itself.
        texts = [document_text] * row_count
        if (
            row_count > 1
            and not isinstance(document_text, str)
            and len(document_text) == row_count
        ):
            texts = list(document_text)

        for text in texts:
            self.uuid.append(id)  # appending the uuid
            self.labels.append(predicted_label)  # appending the predicted label
            self.doc_map[self.ctr] = text  # store the original document text
            self.ctr += 1