# Import necessary modules
from pymilvus import MilvusClient, DataType  # Milvus client and data type definitions
import numpy as np  # For numerical operations
import concurrent.futures  # For concurrent execution of tasks


class MilvusManager:
    """
    A manager class for interacting with the Milvus database, handling collection
    creation, data insertion, and search functionality.
    """

    def __init__(self, milvus_uri, collection_name, create_collection, dim=128):
        """
        Initialize the MilvusManager.

        Args:
            milvus_uri (str): URI for connecting to the Milvus server.
            collection_name (str): Name of the collection in Milvus.
            create_collection (bool): Whether to create a new collection.
            dim (int): Dimensionality of the vector embeddings (default is 128).
        """
        self.client = MilvusClient(uri=milvus_uri)  # Initialize the Milvus client
        self.collection_name = collection_name
        self.dim = dim

        # Load the collection if it exists, otherwise create it
        if self.client.has_collection(collection_name=self.collection_name):
            self.client.load_collection(collection_name)
        if create_collection:
            self.create_collection()  # Create a new collection
            self.create_index()  # Create an index for the collection

    def create_collection(self):
        """
        Create a new collection in Milvus with a predefined schema.
        """
        # Drop the collection if it already exists
        if self.client.has_collection(collection_name=self.collection_name):
            self.client.drop_collection(collection_name=self.collection_name)

        # Define the schema for the collection
        schema = self.client.create_schema(
            auto_id=True,  # Enable automatic ID assignment
            enable_dynamic_field=True,  # Allow dynamic fields
        )
        schema.add_field(field_name="pk", datatype=DataType.INT64, is_primary=True)  # Primary key
        schema.add_field(
            field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=self.dim  # Vector field
        )
        schema.add_field(field_name="seq_id", datatype=DataType.INT16)  # Sequence ID
        schema.add_field(field_name="doc_id", datatype=DataType.INT64)  # Document ID
        schema.add_field(field_name="doc", datatype=DataType.VARCHAR, max_length=65535)  # Document path

        # Create the collection with the specified schema
        self.client.create_collection(
            collection_name=self.collection_name, schema=schema
        )

    def create_index(self):
        """
        Create an HNSW index for the vector field in the collection.
        """
        # Release the collection before updating the index
        self.client.release_collection(collection_name=self.collection_name)
        self.client.drop_index(collection_name=self.collection_name, index_name="vector")

        # Define the HNSW index parameters
        index_params = self.client.prepare_index_params()
        index_params.add_index(
            field_name="vector",
            index_name="vector_index",
            index_type="HNSW",  # Hierarchical Navigable Small World graph index
            metric_type="IP",  # Inner Product (dot product) as similarity metric
            params={
                "M": 16,  # Maximum number of graph connections per node
                "efConstruction": 500,  # Size of the candidate list during construction
            },
        )

        # Create the index and synchronize with the server
        self.client.create_index(
            collection_name=self.collection_name, index_params=index_params, sync=True
        )

    def create_scalar_index(self):
        """
        Create an inverted index for scalar fields such as document IDs.
""" self.client.release_collection(collection_name=self.collection_name) index_params = self.client.prepare_index_params() index_params.add_index( field_name="doc_id", index_name="int32_index", index_type="INVERTED", # Inverted index for scalar data ) self.client.create_index( collection_name=self.collection_name, index_params=index_params, sync=True ) def search(self, data, topk, threshold=0.7): """ Search for the top-k most similar vectors in the collection, filtered by a relevance threshold. Args: data (array-like): Query vector. topk (int): Number of top results to return. threshold (float): Minimum score threshold for relevance (default is 0.5). Returns: list: Sorted list of top-k results that meet the threshold. """ search_params = {"metric_type": "IP", "params": {}} # Search parameters for Inner Product results = self.client.search( self.collection_name, data, limit=50, # Initial retrieval limit output_fields=["vector", "seq_id", "doc_id"], # Fields to include in the output search_params=search_params, ) # Collect unique document IDs from the search results doc_ids = set() for r_id in range(len(results)): for r in range(len(results[r_id])): doc_ids.add(results[r_id][r]["entity"]["doc_id"]) scores = [] # Function to rerank a single document based on its relevance to the query def rerank_single_doc(doc_id, data, client, collection_name): doc_colbert_vecs = client.query( collection_name=collection_name, filter=f"doc_id in [{doc_id}, {doc_id + 1}]", # Query documents by ID output_fields=["seq_id", "vector", "doc"], # Fields to retrieve limit=1000, # Retrieve a maximum of 1000 vectors per document ) # Compute the maximum similarity score for the document doc_vecs = np.vstack( [doc_colbert_vecs[i]["vector"] for i in range(len(doc_colbert_vecs))] ) score = np.dot(data, doc_vecs.T).max(1).sum() return (score, doc_id) # Use multithreading to rerank documents in parallel with concurrent.futures.ThreadPoolExecutor(max_workers=300) as executor: futures = { executor.submit( rerank_single_doc, doc_id, data, self.client, self.collection_name ): doc_id for doc_id in doc_ids } for future in concurrent.futures.as_completed(futures): score, doc_id = future.result() scores.append((score, doc_id)) # Filter scores by threshold filtered_scores = [item for item in scores if item[0] >= threshold] # Sort scores in descending order and return the top-k results filtered_scores.sort(key=lambda x: x[0], reverse=True) return filtered_scores[:topk] if len(filtered_scores) >= topk else filtered_scores # def search(self, data, topk): # """ # Search for the top-k most similar vectors in the collection. # Args: # data (array-like): Query vector. # topk (int): Number of top results to return. # Returns: # list: Sorted list of top-k results. 
# """ # search_params = {"metric_type": "IP", "params": {}} # Search parameters for Inner Product # results = self.client.search( # self.collection_name, # data, # limit=50, # Initial retrieval limit # output_fields=["vector", "seq_id", "doc_id"], # Fields to include in the output # search_params=search_params, # ) # # Collect unique document IDs from the search results # doc_ids = set() # for r_id in range(len(results)): # for r in range(len(results[r_id])): # doc_ids.add(results[r_id][r]["entity"]["doc_id"]) # scores = [] # # Function to rerank a single document based on its relevance to the query # def rerank_single_doc(doc_id, data, client, collection_name): # doc_colbert_vecs = client.query( # collection_name=collection_name, # filter=f"doc_id in [{doc_id}, {doc_id + 1}]", # Query documents by ID # output_fields=["seq_id", "vector", "doc"], # Fields to retrieve # limit=1000, # Retrieve a maximum of 1000 vectors per document # ) # # Compute the maximum similarity score for the document # doc_vecs = np.vstack( # [doc_colbert_vecs[i]["vector"] for i in range(len(doc_colbert_vecs))] # ) # score = np.dot(data, doc_vecs.T).max(1).sum() # return (score, doc_id) # # Use multithreading to rerank documents in parallel # with concurrent.futures.ThreadPoolExecutor(max_workers=300) as executor: # futures = { # executor.submit( # rerank_single_doc, doc_id, data, self.client, self.collection_name # ): doc_id # for doc_id in doc_ids # } # for future in concurrent.futures.as_completed(futures): # score, doc_id = future.result() # scores.append((score, doc_id)) # # Sort scores in descending order and return the top-k results # scores.sort(key=lambda x: x[0], reverse=True) # return scores[:topk] if len(scores) >= topk else scores def insert(self, data): """ Insert a batch of data into the collection. Args: data (dict): Dictionary containing vector embeddings and metadata. """ colbert_vecs = [vec for vec in data["colbert_vecs"]] seq_length = len(colbert_vecs) doc_ids = [data["doc_id"] for i in range(seq_length)] seq_ids = list(range(seq_length)) docs = [""] * seq_length docs[0] = data["filepath"] # Store file path in the first entry # Insert the data into the collection self.client.insert( self.collection_name, [ { "vector": colbert_vecs[i], "seq_id": seq_ids[i], "doc_id": doc_ids[i], "doc": docs[i], } for i in range(seq_length) ], ) def get_images_as_doc(self, images_with_vectors: list): """ Convert image data with vectors into document-like format for insertion. Args: images_with_vectors (list): List of dictionaries containing image vectors and file paths. Returns: list: Transformed data ready for insertion. """ images_data = [] for i in range(len(images_with_vectors)): data = { "colbert_vecs": images_with_vectors[i]["colbert_vecs"], "doc_id": i, "filepath": images_with_vectors[i]["filepath"], } images_data.append(data) return images_data def insert_images_data(self, image_data): """ Insert processed image data into the collection. Args: image_data (list): List of image data dictionaries. """ data = self.get_images_as_doc(image_data) for i in range(len(data)): self.insert(data[i]) # Insert each item individually