from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import numpy as np
import streamlit as st

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
# Load the ColBERT vocabulary file (one token per line) and map "<line index>" -> token
with open("/home/ubuntu/AI-search-with-amazon-opensearch-service/OpenSearchApp/utilities/colbert_vocab.txt", "r") as f:
    vocab_arr = f.read().split("\n")
vocab_dict = {str(index): token for index, token in enumerate(vocab_arr)}
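# Assumption (not stated in this file): colbert_vocab.txt lists the tokenizer's
# vocabulary one token per line in id order, so a tokenizer id looked up in
# vocab_dict returns its token string, e.g. vocab_dict["101"] == "[CLS]" and
# vocab_dict["102"] == "[SEP]" for the standard BERT/MiniLM vocabulary.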
  


def vectorise(sentence, token_level_vectors):
    print("-------------colbert ---- 2-----------")
    # Tokenize the sentence
    encoded_input = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')
    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)
    # Token-level mode: return the input token ids plus one vector per token
    if token_level_vectors:
        return encoded_input['input_ids'][0].tolist(), model_output['last_hidden_state'][0]

    # Sentence-level mode: mean-pool over tokens, then L2-normalize
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    return sentence_embeddings[0].tolist()
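
# Hedged sketch, not part of the original file: search() below assumes each
# OpenSearch document stores one vector field per description token, named
# roughly "description-token-<position>-<token>". A hypothetical indexing-time
# helper built on vectorise(..., True) could produce those fields like this
# (the exact field-name layout is an assumption inferred from the parsing in
# search()).
def build_token_fields(description):
    token_ids, token_vectors = vectorise(description, True)
    fields = {}
    for pos, token_id in enumerate(token_ids):
        token = vocab_dict[str(token_id)]
        if token not in ('[SEP]', '[CLS]'):
            field_name = "description-token-" + str(pos) + "-" + token
            fields[field_name] = token_vectors[pos].tolist()
    return fields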

def search(hits):
    # Re-rank OpenSearch hits with ColBERT-style late interaction: each query
    # token is compared against the document's stored token vectors and the
    # per-token maxima are summed into the document score.
    print("-------------COLBERT------------4------------------------------------------")
    token_ids, token_vectors = vectorise(st.session_state.input_text, True)
    final_docs = []
    for j in hits:
        max_score_dict_list = []
        doc = {
            "_id": j["_id"],
            "_score": j["_score"],
            "_source": {
                "description": j["_source"]["description"],
                "caption": j["_source"]["title"],
                "image_s3_url": j["_source"]["image_s3_url"],
                "price": j["_source"]["price"],
                "style": j["_source"]["style"],
                "category": j["_source"]["category"],
                "gender_affinity": j["_source"].get("gender_affinity", ""),
            },
        }
        #print(j["_source"]["title"])
        source_doc_token_keys = list(j["_source"].keys())
        with_s = [x for x in source_doc_token_keys if x.startswith("description-token-")]
        add_score = 0
        
        # MaxSim: keep the best dot-product score for each query token against
        # the document's token vectors, then sum those maxima.
        for index, i in enumerate(token_vectors):
            token = vocab_dict[str(token_ids[index])]
            if token != '[SEP]' and token != '[CLS]':
                query_token_vector = np.array(i)
                print("query token: " + token)
                print("-----------------")
                scores = []
                for m in with_s:
                    m_arr = m.split("-")
                    if m_arr[-1] != '[SEP]' and m_arr[-1] != '[CLS]':
                        doc_token_vector = np.array(j["_source"][m])
                        score = np.dot(query_token_vector, doc_token_vector)
                        scores.append({"doc_token": m_arr[3], "score": score})

                if not scores:
                    continue
                newlist = sorted(scores, key=lambda d: d['score'], reverse=True)
                max_score = newlist[0]['score']
                add_score += max_score
                max_score_dict_list.append(newlist[0])
                print(newlist[0])
        max_score_dict_list_sorted = sorted(max_score_dict_list, key=lambda d: d['score'], reverse=True)
        print(max_score_dict_list_sorted)
        # print(add_score)
        doc["total_score"] = add_score
        doc['max_score_dict_list_sorted'] = max_score_dict_list_sorted
        final_docs.append(doc)
    final_docs_sorted = sorted(final_docs, key=lambda d: d['total_score'], reverse=True)
    print("-------------COLBERT-----final--------")
    print(final_docs_sorted)
    return final_docs_sorted
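

# Minimal usage sketch (assumption: run directly, outside the Streamlit app,
# only to sanity-check the embedding path; search() needs OpenSearch hits and
# st.session_state.input_text, so it is not exercised here).
if __name__ == "__main__":
    sentence_vector = vectorise("red leather handbag", False)
    print(len(sentence_vector))  # 384 dimensions for all-MiniLM-L6-v2

    ids, token_vecs = vectorise("red leather handbag", True)
    print(len(ids), token_vecs.shape)  # one 384-dim vector per input token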