prasadnu committed
Commit f3cf390 · 1 Parent(s): c005379
semantic_search/all_search_execute.py CHANGED
@@ -217,7 +217,6 @@ def handler(input_,session_id):
     if(st.session_state.input_mvector_rerank):
         query_vector = cb.vectorise(query,False)
         vector_field = "description_vector"
-        print("-------------COLBERT-----1-------------------------------------------------")
         vector_payload = {"knn": {}}
         vector_payload["knn"][vector_field]= {
             "vector":query_vector,
 
utilities/mvectors.py CHANGED
@@ -8,7 +8,7 @@ import json
 
 runtime = boto3.client('sagemaker-runtime',aws_access_key_id=st.secrets['user_access_key'],
                        aws_secret_access_key=st.secrets['user_secret_key'],region_name='us-east-1')
-# Load model from HuggingFace Hub
+# Load Tokenizer from HuggingFace Hub
 tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
 endpoint_name = 'huggingface-pytorch-inference-2025-05-21-16-31-07-967'
 
@@ -20,7 +20,6 @@ def mean_pooling(token_embeddings, attention_mask):
 
 
 def vectorise(sentence,token_level_vectors):
-    print("-------------colbert ---- 2-----------")
     encoded_input = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')
     # Get input IDs (token IDs)
     input_ids = encoded_input['input_ids'][0]
@@ -48,7 +47,6 @@ def vectorise(sentence,token_level_vectors):
     return sentence_embeddings[0].tolist()
 
 def search(hits):
-    print("-------------COLBERT------------4------------------------------------------")
     tokens,token_vectors = vectorise(st.session_state.input_text,True)
     final_docs = []
     for ind,j in enumerate(hits):
@@ -64,7 +62,6 @@ def search(hits):
             doc["_source"]["gender_affinity"] = j["_source"]["gender_affinity"]
         else:
             doc["_source"]["gender_affinity"] = ""
-        #print(j["_source"]["title"])
         source_doc_token_keys = list(j["_source"].keys())
         with_s = [x for x in source_doc_token_keys if x.startswith("description-token-")]
         add_score = 0
@@ -79,26 +76,22 @@ def search(hits):
             for m in with_s:
                 m_arr = m.split("-")
                 if(m_arr[-1]!='[SEP]' and m_arr[-1]!='[CLS]'):
-                    #print("document token: "+m_arr[3])
                     doc_token_vector = np.array(j["_source"][m])
                     score = np.dot(query_token_vector,doc_token_vector)
                     scores.append({"doc_token":m_arr[3],"score":score})
-                    #print({"doc_token":m_arr[3],"score":score})
-
+
             newlist = sorted(scores, key=lambda d: d['score'], reverse=True)
             max_score = newlist[0]['score']
             add_score+=max_score
             max_score_dict_list.append(newlist[0])
-            print(newlist[0])
+
         max_score_dict_list_sorted = sorted(max_score_dict_list, key=lambda d: d['score'], reverse=True)
         print(max_score_dict_list_sorted)
-        # print(add_score)
+
         doc["total_score"] = add_score
         doc['max_score_dict_list_sorted'] = max_score_dict_list_sorted
         final_docs.append(doc)
     final_docs_sorted = sorted(final_docs, key=lambda d: d['total_score'], reverse=True)
-    print("-------------COLBERT-----final--------")
-    print(final_docs_sorted)
     return final_docs_sorted
 
 
 
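Note: the cleaned-up search() keeps, for each query token, only the best dot-product score against the document's description-token-* vectors and sums those maxima into the document's total score, a ColBERT-style MaxSim aggregation. A self-contained sketch of that aggregation is below; the vectors and field names are made-up stand-ins for the SageMaker embeddings and stored token fields, not data from this commit.

import numpy as np

# Hypothetical token embeddings; key names mirror the "description-token-<i>-<token>"
# fields read in search(), but the values are invented for illustration.
query_token_vectors = {"running": np.array([0.9, 0.1]), "shoes": np.array([0.2, 0.8])}
doc_token_vectors = {
    "description-token-1-trail": np.array([0.7, 0.2]),
    "description-token-2-sneakers": np.array([0.1, 0.9]),
}

total_score = 0.0
for q_tok, q_vec in query_token_vectors.items():
    # MaxSim: best dot-product match for this query token across all document tokens.
    best = max(float(np.dot(q_vec, d_vec)) for d_vec in doc_token_vectors.values())
    total_score += best

print(total_score)  # sum of per-query-token maxima, used to re-rank documents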