ej68okap committed on
Commit
a53d884
·
1 Parent(s): 9832882

new code added

Browse files
Files changed (1) hide show
  1. milvus_manager.py +68 -6
milvus_manager.py CHANGED
@@ -99,17 +99,17 @@ class MilvusManager:
99
  self.client.create_index(
100
  collection_name=self.collection_name, index_params=index_params, sync=True
101
  )
102
-
103
- def search(self, data, topk):
104
  """
105
- Search for the top-k most similar vectors in the collection.
106
 
107
  Args:
108
  data (array-like): Query vector.
109
  topk (int): Number of top results to return.
 
110
 
111
  Returns:
112
- list: Sorted list of top-k results.
113
  """
114
  search_params = {"metric_type": "IP", "params": {}} # Search parameters for Inner Product
115
  results = self.client.search(
@@ -155,9 +155,71 @@ class MilvusManager:
155
  score, doc_id = future.result()
156
  scores.append((score, doc_id))
157
 
 
 
 
158
  # Sort scores in descending order and return the top-k results
159
- scores.sort(key=lambda x: x[0], reverse=True)
160
- return scores[:topk] if len(scores) >= topk else scores
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
  def insert(self, data):
163
  """
 
99
  self.client.create_index(
100
  collection_name=self.collection_name, index_params=index_params, sync=True
101
  )
102
+ def search(self, data, topk, threshold=0.7):
 
103
  """
104
+ Search for the top-k most similar vectors in the collection, filtered by a relevance threshold.
105
 
106
  Args:
107
  data (array-like): Query vector.
108
  topk (int): Number of top results to return.
109
+ threshold (float): Minimum score threshold for relevance (default is 0.7).
110
 
111
  Returns:
112
+ list: Sorted list of top-k results that meet the threshold.
113
  """
114
  search_params = {"metric_type": "IP", "params": {}} # Search parameters for Inner Product
115
  results = self.client.search(
 
155
  score, doc_id = future.result()
156
  scores.append((score, doc_id))
157
 
158
+ # Filter scores by threshold
159
+ filtered_scores = [item for item in scores if item[0] >= threshold]
160
+
161
  # Sort scores in descending order and return the top-k results
162
+ filtered_scores.sort(key=lambda x: x[0], reverse=True)
163
+ return filtered_scores[:topk] if len(filtered_scores) >= topk else filtered_scores
164
+
165
+ # def search(self, data, topk):
166
+ # """
167
+ # Search for the top-k most similar vectors in the collection.
168
+
169
+ # Args:
170
+ # data (array-like): Query vector.
171
+ # topk (int): Number of top results to return.
172
+
173
+ # Returns:
174
+ # list: Sorted list of top-k results.
175
+ # """
176
+ # search_params = {"metric_type": "IP", "params": {}} # Search parameters for Inner Product
177
+ # results = self.client.search(
178
+ # self.collection_name,
179
+ # data,
180
+ # limit=50, # Initial retrieval limit
181
+ # output_fields=["vector", "seq_id", "doc_id"], # Fields to include in the output
182
+ # search_params=search_params,
183
+ # )
184
+
185
+ # # Collect unique document IDs from the search results
186
+ # doc_ids = set()
187
+ # for r_id in range(len(results)):
188
+ # for r in range(len(results[r_id])):
189
+ # doc_ids.add(results[r_id][r]["entity"]["doc_id"])
190
+
191
+ # scores = []
192
+
193
+ # # Function to rerank a single document based on its relevance to the query
194
+ # def rerank_single_doc(doc_id, data, client, collection_name):
195
+ # doc_colbert_vecs = client.query(
196
+ # collection_name=collection_name,
197
+ # filter=f"doc_id in [{doc_id}, {doc_id + 1}]", # Query documents by ID
198
+ # output_fields=["seq_id", "vector", "doc"], # Fields to retrieve
199
+ # limit=1000, # Retrieve a maximum of 1000 vectors per document
200
+ # )
201
+ # # Compute the maximum similarity score for the document
202
+ # doc_vecs = np.vstack(
203
+ # [doc_colbert_vecs[i]["vector"] for i in range(len(doc_colbert_vecs))]
204
+ # )
205
+ # score = np.dot(data, doc_vecs.T).max(1).sum()
206
+ # return (score, doc_id)
207
+
208
+ # # Use multithreading to rerank documents in parallel
209
+ # with concurrent.futures.ThreadPoolExecutor(max_workers=300) as executor:
210
+ # futures = {
211
+ # executor.submit(
212
+ # rerank_single_doc, doc_id, data, self.client, self.collection_name
213
+ # ): doc_id
214
+ # for doc_id in doc_ids
215
+ # }
216
+ # for future in concurrent.futures.as_completed(futures):
217
+ # score, doc_id = future.result()
218
+ # scores.append((score, doc_id))
219
+
220
+ # # Sort scores in descending order and return the top-k results
221
+ # scores.sort(key=lambda x: x[0], reverse=True)
222
+ # return scores[:topk] if len(scores) >= topk else scores
223
 
224
  def insert(self, data):
225
  """