import os

import chromadb
import fitz
from chromadb.utils.data_loaders import ImageLoader
from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction  # type: ignore
from pdf2image import convert_from_path

# On-disk location of the persistent ChromaDB store used by this module.
path = "mm_vdb2"
client = chromadb.PersistentClient(path=path)


def extract_and_store_images2(pdf_path, images_dir=r'extracted_images2', dpi=300):
    """Render each page of a PDF to a PNG and index the images in ChromaDB.

    The function works in two steps:

    1. Every page of ``pdf_path`` is rasterized and written to
       ``images_dir`` as ``page_<n>.png`` (1-based page numbers).
    2. Every ``.jpeg`` / ``.png`` file found in ``images_dir`` is added by
       URI to the ``"image2"`` collection, which is created on first use
       with an OpenCLIP embedding function and an image data loader.

    Args:
        pdf_path: Path of the PDF file to convert.
        images_dir: Directory that receives the page images and is then
            scanned for images to index. Created if it does not exist.
        dpi: Rendering resolution passed to ``convert_from_path``.

    Returns:
        The ChromaDB collection named ``"image2"``.
    """
    # Save into the same directory that is indexed below; previously a
    # hard-coded directory was used here and ``images_dir`` was ignored,
    # so a non-default ``images_dir`` indexed the wrong (or a missing) folder.
    os.makedirs(images_dir, exist_ok=True)

    # Step 1: convert the PDF to one image per page and save each as PNG.
    pages = convert_from_path(pdf_path, dpi=dpi)
    for page_number, page in enumerate(pages, start=1):
        output_path = os.path.join(images_dir, f'page_{page_number}.png')
        page.save(output_path, 'PNG')
        print(f"Saved: {output_path}")

    print("Image extraction complete.")

    # Step 2: add the extracted images to ChromaDB.
    image_collection2 = client.get_or_create_collection(
        name="image2",
        embedding_function=OpenCLIPEmbeddingFunction(),
        data_loader=ImageLoader(),
    )

    ids = []
    uris = []
    # The id is the file's position in the sorted directory listing
    # (non-image entries still consume a position).
    # NOTE(review): the sort is lexicographic, so "page_10.png" comes before
    # "page_2.png"; ids therefore do not follow page order — confirm whether
    # callers depend on that.
    for index, filename in enumerate(sorted(os.listdir(images_dir))):
        if filename.endswith(('.jpeg', '.png')):
            ids.append(str(index))
            uris.append(os.path.join(images_dir, filename))

    if not ids:
        # Chroma rejects an empty ``ids`` list, so skip the call entirely.
        print("No images found to add to the database.")
        return image_collection2

    image_collection2.add(ids=ids, uris=uris)
    print("Images added to the database.")

    return image_collection2