|
from colpali_manager import ColpaliManager |
|
from milvus_manager import MilvusManager |
|
from pdf_manager import PdfManager |
|
import hashlib |
|
|
|
|
|
|
|
# Module-level helpers, constructed once at import time and shared by every
# Middleware instance: `pdf_manager` is used by Middleware.index to save page
# images, `colpali_manager` by Middleware.index and Middleware.search to embed
# images and query text.
pdf_manager = PdfManager()
colpali_manager = ColpaliManager()
|
|
|
|
|
|
|
class Middleware:
    """Glue between the PDF renderer, the ColPali embedder and a Milvus store.

    `index` saves a PDF's pages as images, embeds them and inserts the
    embeddings into Milvus; `search` embeds text queries and looks them up
    in the same store.
    """

    def __init__(self, id: str, create_collection=True):
        """Create the Milvus manager backing this instance.

        Args:
            id: Identifier forwarded to ``MilvusManager`` as its second
                argument (presumably the collection name; confirm against
                ``MilvusManager``).
            create_collection: Forwarded unchanged to ``MilvusManager``.
        """
        # NOTE(review): the database suffix is a constant, so every instance
        # opens "milvus_0.db" regardless of `id`. The file imports `hashlib`
        # without using it; confirm whether a per-id hash was intended before
        # changing this, since it determines the on-disk database file name.
        hashed_id = 0
        milvus_db_name = f"milvus_{hashed_id}.db"
        self.milvus_manager = MilvusManager(milvus_db_name, id, create_collection)

    def index(self, pdf_path: str, id: str, max_pages: int, pages: list[int] = None):
        """Save a PDF's pages as images, embed them and insert them into Milvus.

        Args:
            pdf_path: Path of the PDF to index. ``None`` means "nothing to
                index".
            id: Identifier passed to ``pdf_manager.save_images``.
            max_pages: Page limit passed to ``pdf_manager.save_images``.
            pages: Currently unused; kept for interface compatibility.

        Returns:
            The value returned by ``pdf_manager.save_images`` (used here as a
            sequence of image paths), or ``None`` when ``pdf_path`` is
            ``None``.
        """
        # The original guard compared type(pdf_path) to None, which can never
        # be true; an identity check is what makes the early exit reachable.
        if pdf_path is None:
            print("no docs")
            return None

        print(f"Indexing {pdf_path}, id: {id}, max_pages: {max_pages}")

        image_paths = pdf_manager.save_images(id, pdf_path, max_pages)
        print(f"Saved {len(image_paths)} images")

        colbert_vecs = colpali_manager.process_images(image_paths)

        # Pair each saved image with the embedding at the same position.
        # Indexing (rather than zip) keeps the IndexError if fewer embeddings
        # than images come back.
        images_data = [
            {
                "colbert_vecs": colbert_vecs[i],
                "filepath": path,
            }
            for i, path in enumerate(image_paths)
        ]

        print(f"Inserting {len(images_data)} images data to Milvus")
        self.milvus_manager.insert_images_data(images_data)
        print("Indexing completed")

        return image_paths

    def search(self, search_queries: list[str]):
        """Run each text query against Milvus and collect the results.

        Args:
            search_queries: Query strings; each one is embedded and searched
                separately.

        Returns:
            A list with one entry per query, in input order, holding whatever
            ``self.milvus_manager.search(query_vec, topk=1)`` returned.
        """
        print(f"Searching for {len(search_queries)} queries")

        final_res = []

        for query in search_queries:
            print(f"Searching for query: {query}")
            # process_text takes a batch; send a single-item batch and take
            # its only result.
            query_vec = colpali_manager.process_text([query])[0]
            search_res = self.milvus_manager.search(query_vec, topk=1)
            print(f"Search result: {search_res} for query: {query}")
            final_res.append(search_res)

        return final_res
|
|
|
|