Spaces:
Runtime error
Runtime error
ej68okap
committed on
Commit
·
a53d884
1
Parent(s):
9832882
new code added
Browse files- milvus_manager.py +68 -6
milvus_manager.py
CHANGED
@@ -99,17 +99,17 @@ class MilvusManager:
|
|
99 |
self.client.create_index(
|
100 |
collection_name=self.collection_name, index_params=index_params, sync=True
|
101 |
)
|
102 |
-
|
103 |
-
def search(self, data, topk):
|
104 |
"""
|
105 |
-
Search for the top-k most similar vectors in the collection.
|
106 |
|
107 |
Args:
|
108 |
data (array-like): Query vector.
|
109 |
topk (int): Number of top results to return.
|
|
|
110 |
|
111 |
Returns:
|
112 |
-
list: Sorted list of top-k results.
|
113 |
"""
|
114 |
search_params = {"metric_type": "IP", "params": {}} # Search parameters for Inner Product
|
115 |
results = self.client.search(
|
@@ -155,9 +155,71 @@ class MilvusManager:
|
|
155 |
score, doc_id = future.result()
|
156 |
scores.append((score, doc_id))
|
157 |
|
|
|
|
|
|
|
158 |
# Sort scores in descending order and return the top-k results
|
159 |
-
|
160 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
161 |
|
162 |
def insert(self, data):
|
163 |
"""
|
|
|
99 |
self.client.create_index(
|
100 |
collection_name=self.collection_name, index_params=index_params, sync=True
|
101 |
)
|
102 |
+
def search(self, data, topk, threshold=0.7):
|
|
|
103 |
"""
|
104 |
+
Search for the top-k most similar vectors in the collection, filtered by a relevance threshold.
|
105 |
|
106 |
Args:
|
107 |
data (array-like): Query vector.
|
108 |
topk (int): Number of top results to return.
|
109 |
+
threshold (float): Minimum score threshold for relevance (default is 0.7).
|
110 |
|
111 |
Returns:
|
112 |
+
list: Sorted list of top-k results that meet the threshold.
|
113 |
"""
|
114 |
search_params = {"metric_type": "IP", "params": {}} # Search parameters for Inner Product
|
115 |
results = self.client.search(
|
|
|
155 |
score, doc_id = future.result()
|
156 |
scores.append((score, doc_id))
|
157 |
|
158 |
+
# Filter scores by threshold
|
159 |
+
filtered_scores = [item for item in scores if item[0] >= threshold]
|
160 |
+
|
161 |
# Sort scores in descending order and return the top-k results
|
162 |
+
filtered_scores.sort(key=lambda x: x[0], reverse=True)
|
163 |
+
return filtered_scores[:topk] if len(filtered_scores) >= topk else filtered_scores
|
164 |
+
|
165 |
+
# def search(self, data, topk):
|
166 |
+
# """
|
167 |
+
# Search for the top-k most similar vectors in the collection.
|
168 |
+
|
169 |
+
# Args:
|
170 |
+
# data (array-like): Query vector.
|
171 |
+
# topk (int): Number of top results to return.
|
172 |
+
|
173 |
+
# Returns:
|
174 |
+
# list: Sorted list of top-k results.
|
175 |
+
# """
|
176 |
+
# search_params = {"metric_type": "IP", "params": {}} # Search parameters for Inner Product
|
177 |
+
# results = self.client.search(
|
178 |
+
# self.collection_name,
|
179 |
+
# data,
|
180 |
+
# limit=50, # Initial retrieval limit
|
181 |
+
# output_fields=["vector", "seq_id", "doc_id"], # Fields to include in the output
|
182 |
+
# search_params=search_params,
|
183 |
+
# )
|
184 |
+
|
185 |
+
# # Collect unique document IDs from the search results
|
186 |
+
# doc_ids = set()
|
187 |
+
# for r_id in range(len(results)):
|
188 |
+
# for r in range(len(results[r_id])):
|
189 |
+
# doc_ids.add(results[r_id][r]["entity"]["doc_id"])
|
190 |
+
|
191 |
+
# scores = []
|
192 |
+
|
193 |
+
# # Function to rerank a single document based on its relevance to the query
|
194 |
+
# def rerank_single_doc(doc_id, data, client, collection_name):
|
195 |
+
# doc_colbert_vecs = client.query(
|
196 |
+
# collection_name=collection_name,
|
197 |
+
# filter=f"doc_id in [{doc_id}, {doc_id + 1}]", # Query documents by ID
|
198 |
+
# output_fields=["seq_id", "vector", "doc"], # Fields to retrieve
|
199 |
+
# limit=1000, # Retrieve a maximum of 1000 vectors per document
|
200 |
+
# )
|
201 |
+
# # Compute the maximum similarity score for the document
|
202 |
+
# doc_vecs = np.vstack(
|
203 |
+
# [doc_colbert_vecs[i]["vector"] for i in range(len(doc_colbert_vecs))]
|
204 |
+
# )
|
205 |
+
# score = np.dot(data, doc_vecs.T).max(1).sum()
|
206 |
+
# return (score, doc_id)
|
207 |
+
|
208 |
+
# # Use multithreading to rerank documents in parallel
|
209 |
+
# with concurrent.futures.ThreadPoolExecutor(max_workers=300) as executor:
|
210 |
+
# futures = {
|
211 |
+
# executor.submit(
|
212 |
+
# rerank_single_doc, doc_id, data, self.client, self.collection_name
|
213 |
+
# ): doc_id
|
214 |
+
# for doc_id in doc_ids
|
215 |
+
# }
|
216 |
+
# for future in concurrent.futures.as_completed(futures):
|
217 |
+
# score, doc_id = future.result()
|
218 |
+
# scores.append((score, doc_id))
|
219 |
+
|
220 |
+
# # Sort scores in descending order and return the top-k results
|
221 |
+
# scores.sort(key=lambda x: x[0], reverse=True)
|
222 |
+
# return scores[:topk] if len(scores) >= topk else scores
|
223 |
|
224 |
def insert(self, data):
|
225 |
"""
|