Spaces:
Runtime error
Runtime error
File size: 3,874 Bytes
241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
# Import necessary modules and classes
from colpali_manager import ColpaliManager # Manages processing of images and text with the ColPali model
from milvus_manager import MilvusManager # Manages interactions with the Milvus database
from pdf_manager import PdfManager # Handles PDF processing tasks
import hashlib # Library for creating hashed identifiers
# Module-level manager instances, constructed once when this module is imported
# and referenced as globals by the Middleware methods below.
pdf_manager = PdfManager() # Middleware.index calls its save_images() to turn PDF pages into image files
colpali_manager = ColpaliManager() # Middleware.index/search call its process_images()/process_text() to get embeddings
class Middleware:
    """
    Glue layer between PDF rendering, ColPali embedding and Milvus storage.

    Each instance owns a ``MilvusManager`` bound to a database file whose name
    is derived from the id passed to the constructor. PDF rendering and
    embedding are delegated to the module-level ``pdf_manager`` and
    ``colpali_manager`` instances.
    """

    def __init__(self, id: str, create_collection=True):
        """
        Set up the Milvus manager for the given user/session id.

        Args:
            id (str): Unique identifier for the user/session. Only a short
                hash of it is used, to build the database file name.
            create_collection (bool): Passed through unchanged to
                ``MilvusManager`` as its third positional argument.
        """
        milvus_db_name = self._db_name_for(id)
        # "colpali" is the second positional argument given to MilvusManager
        # (presumably the collection name -- confirm against MilvusManager).
        self.milvus_manager = MilvusManager(milvus_db_name, "colpali", create_collection)

    @staticmethod
    def _db_name_for(raw_id: str) -> str:
        """
        Return the Milvus database file name derived from ``raw_id``.

        The name is ``milvus_<first 8 hex chars of md5(raw_id)>.db``.
        """
        # MD5 is used purely as a short, stable fingerprint for a file name,
        # not for security. Declaring that keeps the call working on Python
        # builds that restrict MD5 (e.g. FIPS mode); the digest is unchanged.
        digest = hashlib.md5(raw_id.encode(), usedforsecurity=False).hexdigest()
        return f"milvus_{digest[:8]}.db"

    def index(self, pdf_path: str, id: str, max_pages: int, pages: list[int] = None):
        """
        Index the content of a PDF file into the Milvus database.

        Args:
            pdf_path (str): Path to the PDF file.
            id (str): Unique identifier for the session; forwarded to
                ``pdf_manager.save_images``.
            max_pages (int): Forwarded to ``pdf_manager.save_images`` as the
                page limit.
            pages (list[int], optional): Accepted for interface compatibility.
                NOTE: this value is currently NOT forwarded to
                ``pdf_manager.save_images``, so it has no effect here.

        Returns:
            The value returned by ``pdf_manager.save_images`` (expected to be
            a list of saved image file paths).
        """
        print(f"Indexing {pdf_path}, id: {id}, max_pages: {max_pages}")

        # Convert PDF pages into image files and save them
        image_paths = pdf_manager.save_images(id, pdf_path, max_pages)
        print(f"Saved {len(image_paths)} images")

        # Generate one embedding per image using the ColPali manager
        colbert_vecs = colpali_manager.process_images(image_paths)

        # Pair each image path with the embedding at the same position.
        # Indexed access is kept so a too-short embedding list still fails
        # loudly with IndexError instead of being silently truncated.
        images_data = [
            {
                "colbert_vecs": colbert_vecs[position],  # Image embedding
                "filepath": image_path,  # Corresponding image file path
            }
            for position, image_path in enumerate(image_paths)
        ]
        print(f"Inserting {len(images_data)} images data to Milvus")

        # Insert the image data into the Milvus database
        self.milvus_manager.insert_images_data(images_data)
        print("Indexing completed")

        return image_paths

    def search(self, search_queries: list[str]):
        """
        Search the indexed database once per text query.

        Args:
            search_queries (list[str]): Queries to run. A single bare string
                is treated as a one-element list rather than being iterated
                character by character.

        Returns:
            list: One entry per query, in input order; each entry is whatever
            ``MilvusManager.search`` returned for that query with ``topk=1``.
        """
        if isinstance(search_queries, str):
            # Iterating a str would issue one search per character.
            search_queries = [search_queries]

        print(f"Searching for {len(search_queries)} queries")

        final_res = []
        for query in search_queries:
            print(f"Searching for query: {query}")

            # process_text takes a batch; embed this query alone and unwrap it
            query_vec = colpali_manager.process_text([query])[0]

            # Ask the Milvus manager for the single best match
            search_res = self.milvus_manager.search(query_vec, topk=1)
            print(f"Search result: {search_res} for query: {query}")

            final_res.append(search_res)

        return final_res
|