File size: 3,874 Bytes
241c492
 
 
 
 
273089c
241c492
 
 
273089c
 
 
241c492
 
 
 
 
 
 
 
 
 
 
 
 
273089c
 
241c492
 
273089c
 
241c492
 
 
 
 
 
 
 
 
 
 
 
 
273089c
 
241c492
273089c
 
 
241c492
273089c
 
241c492
273089c
241c492
 
273089c
 
 
 
241c492
273089c
 
 
 
241c492
273089c
 
241c492
 
 
 
 
 
 
 
 
273089c
 
241c492
273089c
 
 
241c492
 
273089c
241c492
 
273089c
241c492
273089c
 
241c492
 
273089c
241c492
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# Import necessary modules and classes
from colpali_manager import ColpaliManager  # Manages processing of images and text with the ColPali model
from milvus_manager import MilvusManager  # Manages interactions with the Milvus database
from pdf_manager import PdfManager  # Handles PDF processing tasks
import hashlib  # Library for creating hashed identifiers

# Initialize managers
# Both objects are created once, when this module is imported, and are shared
# by every Middleware instance: the class methods below reference these
# module-level names directly instead of holding their own copies.
# NOTE(review): the cost of these constructors (e.g. model loading) is not
# visible from this file — confirm before importing in latency-sensitive code.
pdf_manager = PdfManager()  # PDF manager instance for handling PDF-related operations
colpali_manager = ColpaliManager()  # ColPali manager instance for processing images and text


class Middleware:
    """
    Glue layer between PDF rendering, ColPali embedding and the Milvus store.

    Each instance owns one ``MilvusManager``; PDF rendering and embedding are
    delegated to the shared module-level ``pdf_manager`` and
    ``colpali_manager`` objects.
    """

    def __init__(self, id: str, create_collection=True):
        """
        Bind this instance to a Milvus database file derived from ``id``.

        Args:
            id (str): Identifier for the user/session; here it is only used to
                derive the database file name.
            create_collection (bool): Forwarded unchanged to ``MilvusManager``.
        """
        # The MD5 digest is used purely to build a short file name; only its
        # first 8 hex characters are kept.
        digest = hashlib.md5(id.encode()).hexdigest()
        db_file = f"milvus_{digest[:8]}.db"

        # "colpali" is the second positional argument to MilvusManager
        # (presumably the collection name — confirm against MilvusManager).
        self.milvus_manager = MilvusManager(db_file, "colpali", create_collection)

    def index(self, pdf_path: str, id: str, max_pages: int, pages: list[int] = None):
        """
        Render a PDF to page images, embed them and store the embeddings.

        Args:
            pdf_path (str): Path to the PDF file.
            id (str): Identifier passed through to ``pdf_manager.save_images``.
            max_pages (int): Page limit passed through to ``pdf_manager.save_images``.
            pages (list[int], optional): Accepted by the signature, but this
                method does not currently read or forward it.

        Returns:
            list[str]: The image paths returned by ``pdf_manager.save_images``.
        """
        print(f"Indexing {pdf_path}, id: {id}, max_pages: {max_pages}")

        # Step 1: render the PDF pages to image files.
        saved_paths = pdf_manager.save_images(id, pdf_path, max_pages)
        print(f"Saved {len(saved_paths)} images")

        # Step 2: embed every rendered page with the ColPali model.
        embeddings = colpali_manager.process_images(saved_paths)

        # Step 3: pair each image path with the embedding at the same position.
        records = []
        for position, path in enumerate(saved_paths):
            records.append({
                "colbert_vecs": embeddings[position],
                "filepath": path,
            })

        print(f"Inserting {len(records)} images data to Milvus")

        # Step 4: persist the records through the Milvus manager.
        self.milvus_manager.insert_images_data(records)

        print("Indexing completed")

        return saved_paths

    def search(self, search_queries: list[str]):
        """
        Run each text query against the indexed data, one query at a time.

        Args:
            search_queries (list[str]): Text queries to look up.

        Returns:
            list: One entry per query, in input order; each entry is whatever
                ``self.milvus_manager.search`` returned for that query
                (called with ``topk=1``).
        """
        print(f"Searching for {len(search_queries)} queries")

        results_per_query = []

        for query in search_queries:
            print(f"Searching for query: {query}")

            # process_text is given a one-element batch; its single result is
            # the embedding used for this query.
            embedded_batch = colpali_manager.process_text([query])
            query_vec = embedded_batch[0]

            search_res = self.milvus_manager.search(query_vec, topk=1)

            print(f"Search result: {search_res} for query: {query}")

            results_per_query.append(search_res)

        return results_per_query