Spaces:
Running
on
T4
mvectors
Browse files
semantic_search/all_search_execute.py
CHANGED
@@ -217,7 +217,6 @@ def handler(input_,session_id):
|
|
217 |
if(st.session_state.input_mvector_rerank):
|
218 |
query_vector = cb.vectorise(query,False)
|
219 |
vector_field = "description_vector"
|
220 |
-
print("-------------COLBERT-----1-------------------------------------------------")
|
221 |
vector_payload = {"knn": {}}
|
222 |
vector_payload["knn"][vector_field]= {
|
223 |
"vector":query_vector,
|
|
|
217 |
if(st.session_state.input_mvector_rerank):
|
218 |
query_vector = cb.vectorise(query,False)
|
219 |
vector_field = "description_vector"
|
|
|
220 |
vector_payload = {"knn": {}}
|
221 |
vector_payload["knn"][vector_field]= {
|
222 |
"vector":query_vector,
|
utilities/mvectors.py
CHANGED
@@ -8,7 +8,7 @@ import json
|
|
8 |
|
9 |
runtime = boto3.client('sagemaker-runtime',aws_access_key_id=st.secrets['user_access_key'],
|
10 |
aws_secret_access_key=st.secrets['user_secret_key'],region_name='us-east-1')
|
11 |
-
# Load
|
12 |
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
|
13 |
endpoint_name = 'huggingface-pytorch-inference-2025-05-21-16-31-07-967'
|
14 |
|
@@ -20,7 +20,6 @@ def mean_pooling(token_embeddings, attention_mask):
|
|
20 |
|
21 |
|
22 |
def vectorise(sentence,token_level_vectors):
|
23 |
-
print("-------------colbert ---- 2-----------")
|
24 |
encoded_input = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')
|
25 |
# Get input IDs (token IDs)
|
26 |
input_ids = encoded_input['input_ids'][0]
|
@@ -48,7 +47,6 @@ def vectorise(sentence,token_level_vectors):
|
|
48 |
return sentence_embeddings[0].tolist()
|
49 |
|
50 |
def search(hits):
|
51 |
-
print("-------------COLBERT------------4------------------------------------------")
|
52 |
tokens,token_vectors = vectorise(st.session_state.input_text,True)
|
53 |
final_docs = []
|
54 |
for ind,j in enumerate(hits):
|
@@ -64,7 +62,6 @@ def search(hits):
|
|
64 |
doc["_source"]["gender_affinity"] = j["_source"]["gender_affinity"]
|
65 |
else:
|
66 |
doc["_source"]["gender_affinity"] = ""
|
67 |
-
#print(j["_source"]["title"])
|
68 |
source_doc_token_keys = list(j["_source"].keys())
|
69 |
with_s = [x for x in source_doc_token_keys if x.startswith("description-token-")]
|
70 |
add_score = 0
|
@@ -79,26 +76,22 @@ def search(hits):
|
|
79 |
for m in with_s:
|
80 |
m_arr = m.split("-")
|
81 |
if(m_arr[-1]!='[SEP]' and m_arr[-1]!='[CLS]'):
|
82 |
-
#print("document token: "+m_arr[3])
|
83 |
doc_token_vector = np.array(j["_source"][m])
|
84 |
score = np.dot(query_token_vector,doc_token_vector)
|
85 |
scores.append({"doc_token":m_arr[3],"score":score})
|
86 |
-
|
87 |
-
|
88 |
newlist = sorted(scores, key=lambda d: d['score'], reverse=True)
|
89 |
max_score = newlist[0]['score']
|
90 |
add_score+=max_score
|
91 |
max_score_dict_list.append(newlist[0])
|
92 |
-
|
93 |
max_score_dict_list_sorted = sorted(max_score_dict_list, key=lambda d: d['score'], reverse=True)
|
94 |
print(max_score_dict_list_sorted)
|
95 |
-
|
96 |
doc["total_score"] = add_score
|
97 |
doc['max_score_dict_list_sorted'] = max_score_dict_list_sorted
|
98 |
final_docs.append(doc)
|
99 |
final_docs_sorted = sorted(final_docs, key=lambda d: d['total_score'], reverse=True)
|
100 |
-
print("-------------COLBERT-----final--------")
|
101 |
-
print(final_docs_sorted)
|
102 |
return final_docs_sorted
|
103 |
|
104 |
|
|
|
8 |
|
9 |
runtime = boto3.client('sagemaker-runtime',aws_access_key_id=st.secrets['user_access_key'],
|
10 |
aws_secret_access_key=st.secrets['user_secret_key'],region_name='us-east-1')
|
11 |
+
# Load Tokenizer from HuggingFace Hub
|
12 |
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
|
13 |
endpoint_name = 'huggingface-pytorch-inference-2025-05-21-16-31-07-967'
|
14 |
|
|
|
20 |
|
21 |
|
22 |
def vectorise(sentence,token_level_vectors):
|
|
|
23 |
encoded_input = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')
|
24 |
# Get input IDs (token IDs)
|
25 |
input_ids = encoded_input['input_ids'][0]
|
|
|
47 |
return sentence_embeddings[0].tolist()
|
48 |
|
49 |
def search(hits):
|
|
|
50 |
tokens,token_vectors = vectorise(st.session_state.input_text,True)
|
51 |
final_docs = []
|
52 |
for ind,j in enumerate(hits):
|
|
|
62 |
doc["_source"]["gender_affinity"] = j["_source"]["gender_affinity"]
|
63 |
else:
|
64 |
doc["_source"]["gender_affinity"] = ""
|
|
|
65 |
source_doc_token_keys = list(j["_source"].keys())
|
66 |
with_s = [x for x in source_doc_token_keys if x.startswith("description-token-")]
|
67 |
add_score = 0
|
|
|
76 |
for m in with_s:
|
77 |
m_arr = m.split("-")
|
78 |
if(m_arr[-1]!='[SEP]' and m_arr[-1]!='[CLS]'):
|
|
|
79 |
doc_token_vector = np.array(j["_source"][m])
|
80 |
score = np.dot(query_token_vector,doc_token_vector)
|
81 |
scores.append({"doc_token":m_arr[3],"score":score})
|
82 |
+
|
|
|
83 |
newlist = sorted(scores, key=lambda d: d['score'], reverse=True)
|
84 |
max_score = newlist[0]['score']
|
85 |
add_score+=max_score
|
86 |
max_score_dict_list.append(newlist[0])
|
87 |
+
|
88 |
max_score_dict_list_sorted = sorted(max_score_dict_list, key=lambda d: d['score'], reverse=True)
|
89 |
print(max_score_dict_list_sorted)
|
90 |
+
|
91 |
doc["total_score"] = add_score
|
92 |
doc['max_score_dict_list_sorted'] = max_score_dict_list_sorted
|
93 |
final_docs.append(doc)
|
94 |
final_docs_sorted = sorted(final_docs, key=lambda d: d['total_score'], reverse=True)
|
|
|
|
|
95 |
return final_docs_sorted
|
96 |
|
97 |
|