# Import necessary modules
from pymilvus import MilvusClient, DataType  # Milvus client and data type definitions
import numpy as np  # For numerical operations
import concurrent.futures  # For concurrent execution of tasks

class MilvusManager:
    """
    A manager class for interacting with the Milvus database, handling collection creation,
    data insertion, and search functionality.
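
    Typical usage: construct once with create_collection=True, insert
    multi-vector (ColBERT-style) embeddings, then call search() with a
    multi-vector query. A runnable sketch is at the bottom of this file.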
    """

    def __init__(self, milvus_uri, collection_name, create_collection, dim=128):
        """
        Initialize the MilvusManager.

        Args:
            milvus_uri (str): URI for connecting to the Milvus server.
            collection_name (str): Name of the collection in Milvus.
            create_collection (bool): Whether to create a new collection.
            dim (int): Dimensionality of the vector embeddings (default is 128).
        """
        self.client = MilvusClient(uri=milvus_uri)  # Initialize the Milvus client
        self.collection_name = collection_name
        self.dim = dim

        # Load the collection into memory if it already exists on the server
        if self.client.has_collection(collection_name=self.collection_name):
            self.client.load_collection(collection_name)

        if create_collection:
            self.create_collection()  # Create a new collection
            self.create_index()       # Create an index for the collection

    def create_collection(self):
        """
        Create a new collection in Milvus with a predefined schema.
        """
        # Drop the collection if it already exists
        if self.client.has_collection(collection_name=self.collection_name):
            self.client.drop_collection(collection_name=self.collection_name)

        # Define the schema for the collection
        schema = self.client.create_schema(
            auto_id=True,  # Enable automatic primary-key assignment
            enable_dynamic_field=True,  # Allow fields not declared in the schema
        )
        schema.add_field(field_name="pk", datatype=DataType.INT64, is_primary=True)  # Primary key
        schema.add_field(
            field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=self.dim  # Vector field
        )
        schema.add_field(field_name="seq_id", datatype=DataType.INT16)  # Sequence ID
        schema.add_field(field_name="doc_id", datatype=DataType.INT64)  # Document ID
        schema.add_field(field_name="doc", datatype=DataType.VARCHAR, max_length=65535)  # Document path

        # Create the collection with the specified schema
        self.client.create_collection(
            collection_name=self.collection_name, schema=schema
        )

    def create_index(self):
        """
        Create an HNSW index for the vector field in the collection.
        """
        # Release the collection and drop any existing index before rebuilding it
        self.client.release_collection(collection_name=self.collection_name)
        self.client.drop_index(
            collection_name=self.collection_name, index_name="vector_index"
        )

        # Define the HNSW index parameters
        index_params = self.client.prepare_index_params()
        index_params.add_index(
            field_name="vector",
            index_name="vector_index",
            index_type="HNSW",  # Hierarchical Navigable Small World graph index
            metric_type="IP",  # Inner Product (dot product) as similarity metric
            params={
                "M": 16,  # Maximum number of graph connections per node
                "efConstruction": 500,  # Candidate-list size during construction
            },
        )

        # Create the index and synchronize with the server
        self.client.create_index(
            collection_name=self.collection_name, index_params=index_params, sync=True
        )

    def create_scalar_index(self):
        """
        Create an inverted index for scalar fields such as document IDs.
        """
        self.client.release_collection(collection_name=self.collection_name)

        index_params = self.client.prepare_index_params()
        index_params.add_index(
            field_name="doc_id",
            index_name="int32_index",
            index_type="INVERTED",  # Inverted index for scalar data
        )

        self.client.create_index(
            collection_name=self.collection_name, index_params=index_params, sync=True
        )

    def search(self, data, topk, threshold=0.7):
        """
        Search for the top-k most similar vectors in the collection, filtered by a relevance threshold.

        Args:
            data (array-like): Query vector.
            topk (int): Number of top results to return.
            threshold (float): Minimum score threshold for relevance (default is 0.7).

        Returns:
            list: Sorted list of top-k results that meet the threshold.
        """
        search_params = {"metric_type": "IP", "params": {}}  # Search parameters for Inner Product
        results = self.client.search(
            self.collection_name,
            data,
            limit=50,  # Initial retrieval limit
            output_fields=["vector", "seq_id", "doc_id"],  # Fields to include in the output
            search_params=search_params,
        )

        # Collect unique document IDs from the search results
        doc_ids = set()
        for r_id in range(len(results)):
            for r in range(len(results[r_id])):
                doc_ids.add(results[r_id][r]["entity"]["doc_id"])

        scores = []

        # Function to rerank a single document based on its relevance to the query
        def rerank_single_doc(doc_id, data, client, collection_name):
            doc_colbert_vecs = client.query(
                collection_name=collection_name,
                filter=f"doc_id in [{doc_id}, {doc_id + 1}]",  # Query documents by ID
                output_fields=["seq_id", "vector", "doc"],  # Fields to retrieve
                limit=1000,  # Retrieve a maximum of 1000 vectors per document
            )
            # Compute the maximum similarity score for the document
            doc_vecs = np.vstack(
                [doc_colbert_vecs[i]["vector"] for i in range(len(doc_colbert_vecs))]
            )
            score = np.dot(data, doc_vecs.T).max(1).sum()
            return (score, doc_id)
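        # Illustrative MaxSim arithmetic (example values, not from the data):
        # with query vectors q1=[1, 0], q2=[0, 1] and document vectors
        # d1=[1, 0], d2=[0.5, 0.5], np.dot(data, doc_vecs.T) yields
        # [[1.0, 0.5], [0.0, 0.5]]; the row-wise maxima are 1.0 and 0.5,
        # so this document's score is 1.5.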

        # Rerank the candidate documents in parallel; the tasks are I/O-bound
        # Milvus queries, so a large thread pool is reasonable here
        with concurrent.futures.ThreadPoolExecutor(max_workers=300) as executor:
            futures = {
                executor.submit(
                    rerank_single_doc, doc_id, data, self.client, self.collection_name
                ): doc_id
                for doc_id in doc_ids
            }
            for future in concurrent.futures.as_completed(futures):
                score, doc_id = future.result()
                scores.append((score, doc_id))

        # Filter scores by threshold
        filtered_scores = [item for item in scores if item[0] >= threshold]

        # Sort scores in descending order and return the top-k results
        filtered_scores.sort(key=lambda x: x[0], reverse=True)
        return filtered_scores[:topk]  # Slicing already handles fewer than topk results

    def insert(self, data):
        """
        Insert a batch of data into the collection.

        Args:
            data (dict): Dictionary containing vector embeddings and metadata.
        """
        colbert_vecs = list(data["colbert_vecs"])
        seq_length = len(colbert_vecs)
        doc_ids = [data["doc_id"]] * seq_length
        seq_ids = list(range(seq_length))
        docs = [""] * seq_length
        docs[0] = data["filepath"]  # Store the file path once, in the first entry

        # Insert the data into the collection
        self.client.insert(
            self.collection_name,
            [
                {
                    "vector": colbert_vecs[i],
                    "seq_id": seq_ids[i],
                    "doc_id": doc_ids[i],
                    "doc": docs[i],
                }
                for i in range(seq_length)
            ],
        )

    def get_images_as_doc(self, images_with_vectors: list):
        """
        Convert image data with vectors into document-like format for insertion.

        Args:
            images_with_vectors (list): List of dictionaries containing image vectors and file paths.

        Returns:
            list: Transformed data ready for insertion.
        """
        images_data = []
        for i, image in enumerate(images_with_vectors):
            data = {
                "colbert_vecs": image["colbert_vecs"],
                "doc_id": i,
                "filepath": image["filepath"],
            }
            images_data.append(data)
        return images_data

    def insert_images_data(self, image_data):
        """
        Insert processed image data into the collection.

        Args:
            image_data (list): List of image data dictionaries.
        """
        data = self.get_images_as_doc(image_data)
        for item in data:
            self.insert(item)  # Insert each document's vectors as its own batch
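

# ---------------------------------------------------------------------------
# Minimal usage sketch (added for illustration; not part of the original
# module). It assumes Milvus Lite with a local "milvus_demo.db" file, a
# hypothetical collection name, and random vectors standing in for real
# ColPali embeddings.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    manager = MilvusManager(
        milvus_uri="milvus_demo.db",     # assumption: Milvus Lite local file
        collection_name="colpali_demo",  # hypothetical collection name
        create_collection=True,
        dim=128,
    )

    # Two fake "images", each represented by 5 random 128-dim vectors
    fake_images = [
        {
            "colbert_vecs": np.random.rand(5, 128).tolist(),
            "filepath": f"page_{i}.png",  # hypothetical file paths
        }
        for i in range(2)
    ]
    manager.insert_images_data(fake_images)

    # Load the collection before searching
    manager.client.load_collection(manager.collection_name)

    # Query with 3 random 128-dim query vectors; threshold=0.0 so the
    # random placeholder data is not filtered out
    query = np.random.rand(3, 128)
    print(manager.search(query, topk=2, threshold=0.0))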