"""Helpers for image encoding, embedding requests, and PDF-to-image conversion."""

import base64
import io
import os
import tempfile
import uuid
from io import BytesIO
from multiprocessing import Pool
from typing import Any, List, Tuple, Type, Literal, Optional, Union, Dict

import numpy as np
from loguru import logger
from pdf2image import convert_from_path
from PIL import Image, ImageFile


def encode_image(image_path) -> str:
    """Read the file at ``image_path`` and return its bytes as a base64 string.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        str: The base64 text (UTF-8 decoded) of the raw file content.
    """
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


def load_image_from_base64(image) -> Image.Image:
    """Decode a base64 payload and open it as a PIL image.

    Args:
        image: Base64-encoded image data (without a ``data:`` URI prefix,
            since the value is passed straight to ``base64.b64decode``).

    Returns:
        Image.Image: The image opened from the decoded bytes.
    """
    return Image.open(BytesIO(base64.b64decode(image)))


def pil_image_to_base64(image: Image.Image) -> str:
    """Convert a PIL Image object to its base64 representation.

    The image is serialized as PNG before being encoded.

    Args:
        image (Image.Image): The PIL Image object to be converted.

    Returns:
        str: The base64 representation of the PNG-encoded image.
    """
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


def scale_image(image: Image.Image, new_height: int = 1024) -> Image.Image:
    """Scale an image to a new height while maintaining the aspect ratio.

    Args:
        image (Image.Image): The image to resize.
        new_height (int): Target height in pixels.

    Returns:
        Image.Image: A resized copy of ``image``.
    """
    width, height = image.size
    aspect_ratio = width / height
    # Truncation can yield 0 for very narrow images, which ``resize`` rejects;
    # keep at least one pixel of width.
    new_width = max(1, int(new_height * aspect_ratio))
    return image.resize((new_width, new_height))


def unflatten_array(flat_list, vector_size=128) -> np.ndarray:
    """Reshape a flat sequence into rows of ``vector_size`` elements.

    Args:
        flat_list: Flat sequence of values; its length must be a multiple of
            ``vector_size`` or ``reshape`` raises ``ValueError``.
        vector_size (int): Number of elements per row.

    Returns:
        np.ndarray: Array of shape ``(-1, vector_size)``.
    """
    return np.array(flat_list).reshape(-1, vector_size)


def get_image_embedding(image_list: list[Image.Image], openai_client, model: str, flatten: bool = False) -> list:
    """Request embeddings for one or more images.

    Each image is PNG-encoded, wrapped in a ``data:image/png;base64,`` URI and
    sent through ``openai_client.embeddings.create`` with ``modality`` set to
    ``"image"``.

    Args:
        image_list: The images to embed. A single non-list value is wrapped
            into a one-element list.
        openai_client: Client object exposing ``embeddings.create``.
        model (str): Name of the embedding model to request.
        flatten (bool): When True the request uses ``encoding_format="base64"``,
            otherwise ``"float"``.

    Returns:
        list: One ``embedding`` entry per item in the response data.
        NOTE(review): per the original documentation this is
        list[list[float]] if flatten, else list[list[list[float]]] with
        shape (number of images, vectors per image, vector dim = 128) --
        this depends on the serving backend; confirm.
    """
    if not isinstance(image_list, list):
        image_list = [image_list]

    input_base64_list = [
        f"data:image/png;base64,{pil_image_to_base64(image)}" for image in image_list
    ]

    response = openai_client.embeddings.create(
        input=input_base64_list,
        model=model,
        extra_body={
            "modality": "image",
            "encoding_format": "float" if not flatten else "base64",
        },
    )
    return [item.embedding for item in response.data]


def get_text_embedding(texts: list[str], openai_client, model: str, flatten: bool = False) -> list:
    """Request embeddings for one or more texts.

    Args:
        texts: The texts to embed. A single non-list value is wrapped into a
            one-element list.
        openai_client: Client object exposing ``embeddings.create``.
        model (str): Name of the embedding model to request.
        flatten (bool): When True the request uses ``encoding_format="base64"``,
            otherwise ``"float"``.

    Returns:
        list: One ``embedding`` entry per item in the response data.
        NOTE(review): per the original documentation this is
        list[list[float]] if flatten, else list[list[list[float]]] with
        shape (number of texts, vectors per text, vector dim = 128) --
        this depends on the serving backend; confirm.
    """
    if not isinstance(texts, list):
        texts = [texts]

    response = openai_client.embeddings.create(
        input=texts,
        model=model,
        extra_body={
            "encoding_format": "float" if not flatten else "base64",
        },
    )
    return [item.embedding for item in response.data]


def load_images(image_paths):
    """Load images from a list of paths and return a list of PIL image objects.

    Loading is best-effort: a path that cannot be opened is logged and
    skipped, so the result may be shorter than ``image_paths`` and is not
    guaranteed to line up with it index-for-index.

    Args:
        image_paths (list): List of image paths.

    Returns:
        list: List of PIL image objects that were opened successfully.
    """
    images = []
    for path in image_paths:
        try:
            img = Image.open(path)
            images.append(img)
        except Exception as e:
            logger.error(f"Error loading image at path {path}: {str(e)}")
    return images


def process_pdf(pdf_path: str, output_folder: str, thread_count=1):
    """Render every page of a PDF to an in-memory PIL image at 200 dpi.

    Args:
        pdf_path (str): Path of the PDF file to convert.
        output_folder (str): Currently unused; kept so existing callers that
            pass it (e.g. ``pdf_folder_to_images``) keep working.
        thread_count (int): Forwarded to ``convert_from_path``.

    Returns:
        list: The page images returned by ``convert_from_path``, fully loaded.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        images = convert_from_path(pdf_path, dpi=200, output_folder=temp_dir, thread_count=thread_count)
        # The pages are rendered into temp_dir and Pillow opens files lazily.
        # Force the pixel data into memory (which also releases the file
        # handles) before the temporary directory is deleted; otherwise the
        # returned images would point at files that no longer exist.
        for image in images:
            image.load()
    return images


def pdf_folder_to_images(pdf_folder: str, output_folder: str, process_count: int = 2):
    """Convert every ``.pdf`` file in a folder to page images using a process pool.

    Args:
        pdf_folder (str): Folder scanned (non-recursively) for files whose
            name ends with ``.pdf``, case-insensitively.
        output_folder (str): Forwarded to ``process_pdf`` for each file.
        process_count (int): Number of worker processes; ``None`` means
            ``os.cpu_count()``.

    Returns:
        list | None: All page images of all PDFs as one flat list, ordered by
        PDF path and then page order. ``None`` if any step raised; the
        exception is logged instead of propagated.
    """
    try:
        if process_count is None:
            process_count = os.cpu_count()

        # os.listdir returns entries in arbitrary order; sort so the output
        # order is reproducible between runs.
        pdf_files = sorted(
            os.path.join(pdf_folder, f)
            for f in os.listdir(pdf_folder)
            if f.lower().endswith('.pdf')
        )

        # One (pdf_file, output_folder) argument tuple per PDF for starmap.
        args = [(pdf_file, output_folder) for pdf_file in pdf_files]

        with Pool(process_count) as pool:
            all_images = pool.starmap(process_pdf, args)

        result = [img for sublist in all_images for img in sublist]
        logger.debug(f"Number of pdfs processed: {len(all_images)} - Number of images: {len(result)}")
        return result
    except Exception as e:
        logger.exception(f"Error during processing pdf: {e}")
        return None