|
from PIL import Image |
|
import numpy as np |
|
import base64 |
|
import io |
|
from io import BytesIO |
|
from PIL import Image, ImageFile |
|
from pdf2image import convert_from_path |
|
import tempfile |
|
from multiprocessing import Pool |
|
import os |
|
from loguru import logger |
|
import uuid |
|
|
|
from typing import Any, List, Tuple, Type, Literal, Optional, Union, Dict |
|
|
|
def encode_image(image_path):
    """
    Read a file from disk and return its contents as a base64 string.

    Args:
        image_path: Path of the file to read; it is opened in binary mode.

    Returns:
        str: The base64 encoding of the file's bytes, decoded as UTF-8 text.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
|
|
|
def load_image_from_base64(image):
    """
    Decode a base64 payload and open it as a PIL image.

    Args:
        image: Base64-encoded image data, in any form accepted by
            ``base64.b64decode``.

    Returns:
        The object returned by ``Image.open`` for the decoded bytes.
    """
    decoded_bytes = base64.b64decode(image)
    stream = BytesIO(decoded_bytes)
    return Image.open(stream)
|
|
|
def pil_image_to_base64(image: Image.Image) -> str:
    """
    Convert a PIL Image object to its base64 representation.

    The image is written in PNG format to an in-memory buffer and the
    resulting bytes are base64-encoded.

    Args:
        image (Image.Image): The PIL Image object to be converted.

    Returns:
        str: The base64 representation of the PNG-encoded image.
    """
    # Serialize into memory so no file on disk is needed.
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
|
|
|
def scale_image(image: Image.Image, new_height: int = 1024) -> Image.Image:
    """
    Scale an image to a new height while maintaining the aspect ratio.

    Args:
        image (Image.Image): The image to scale.
        new_height (int): Target height in pixels. Defaults to 1024.

    Returns:
        Image.Image: The result of ``image.resize`` at the computed size.
    """
    width, height = image.size
    aspect_ratio = width / height

    # int() truncates, so a very narrow image would get a width of 0, which
    # resize rejects; keep at least one pixel column.
    new_width = max(1, int(new_height * aspect_ratio))

    return image.resize((new_width, new_height))
|
|
|
def unflatten_array(flat_list, vector_size=128):
    """
    Rebuild a 2-D array from a flat sequence of values.

    Args:
        flat_list: Values to regroup; converted with ``np.array``.
        vector_size (int): Number of columns per row. Defaults to 128.

    Returns:
        numpy.ndarray: The values reshaped to ``(-1, vector_size)``, i.e. one
        row per ``vector_size`` consecutive values.
    """
    values = np.array(flat_list)
    target_shape = (-1, vector_size)
    return values.reshape(target_shape)
|
|
|
def get_image_embedding(image_list: list[Image.Image], openai_client, model: str, flatten: bool = False) -> list:
    """
    Get the embeddings of one or more images.

    Each image is PNG-encoded, wrapped in a ``data:image/png;base64,`` URI and
    sent to ``openai_client.embeddings.create`` in a single request.

    Args:
        image_list (list[Image.Image]): The images to be embedded. A value
            that is not a list is wrapped in a one-element list.
        openai_client: Client object exposing ``embeddings.create``.
        model (str): Model name forwarded to the embeddings endpoint.
        flatten (bool): When True, ``encoding_format`` is requested as
            ``"base64"`` instead of ``"float"``.

    Returns:
        list: The ``embedding`` attribute of every item in the response's
        ``data``, in response order. As documented by the original author:
        list[list[float]] if flatten,
        else: list[list[list[float]]] with shape = (number of images (m),
        number of vectors for each image (n), vector dim = 128).
        NOTE(review): these shapes depend on the serving model - confirm.
    """
    if not isinstance(image_list, list):
        image_list = [image_list]

    input_base64_list = [
        f"data:image/png;base64,{pil_image_to_base64(image)}"
        for image in image_list
    ]

    response = openai_client.embeddings.create(
        input=input_base64_list,
        model=model,
        extra_body={
            "modality": "image",
            "encoding_format": "base64" if flatten else "float",
        },
    )

    return [item.embedding for item in response.data]
|
|
|
def get_text_embedding(texts: list[str], openai_client, model: str, flatten: bool = False) -> list:
    """
    Get the embeddings of one or more texts.

    Args:
        texts (list[str]): The texts to be embedded. A value that is not a
            list is wrapped in a one-element list.
        openai_client: Client object exposing ``embeddings.create``.
        model (str): Model name forwarded to the embeddings endpoint.
        flatten (bool): When True, ``encoding_format`` is requested as
            ``"base64"`` instead of ``"float"``.

    Returns:
        list: The ``embedding`` attribute of every item in the response's
        ``data``, in response order. As documented by the original author:
        list[list[float]] if flatten,
        else: list[list[list[float]]] with shape = (number of texts (m),
        number of vectors for each text (n), vector dim = 128).
        NOTE(review): these shapes depend on the serving model - confirm.
    """
    batch = texts if isinstance(texts, list) else [texts]
    wire_format = "base64" if flatten else "float"

    response = openai_client.embeddings.create(
        input=batch,
        model=model,
        extra_body={"encoding_format": wire_format},
    )

    return [item.embedding for item in response.data]
|
|
|
def load_images(image_paths):
    """
    Open each path with PIL and collect the images that could be opened.

    A path that raises while being opened is logged as an error and skipped,
    so the returned list may be shorter than ``image_paths``.

    Args:
        image_paths (list): List of image paths.

    Returns:
        list: List of PIL image objects, in input order.
    """
    loaded = []
    for image_path in image_paths:
        try:
            opened = Image.open(image_path)
        except Exception as exc:
            logger.error(f"Error loading image at path {image_path}: {str(exc)}")
        else:
            loaded.append(opened)
    return loaded
|
|
|
|
|
def process_pdf(pdf_path: str, output_folder: str, thread_count=1):
    """
    Render the pages of a PDF to PIL images.

    Pages are rasterized at 200 DPI by ``convert_from_path`` into a temporary
    directory that is removed before this function returns.

    Args:
        pdf_path (str): Path of the PDF file to convert.
        output_folder (str): Currently unused by this function; kept so
            existing callers that pass it keep working.
        thread_count (int): Forwarded to ``convert_from_path``. Defaults to 1.

    Returns:
        list: The images returned by ``convert_from_path``, with their pixel
        data loaded into memory.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        images = convert_from_path(pdf_path, dpi=200, output_folder=temp_dir, thread_count=thread_count)

        # With an explicit output_folder the returned images can be lazily
        # backed by files inside temp_dir. Read the pixel data now, while the
        # files still exist, so the result does not depend on a directory
        # that is deleted when this block exits.
        for image in images:
            image.load()

    return images
|
|
|
|
|
def pdf_folder_to_images(pdf_folder: str, output_folder: str, process_count: int = 2):
    """
    Convert every PDF in a folder to images using a pool of worker processes.

    Args:
        pdf_folder (str): Folder whose direct entries are scanned for names
            ending in ``.pdf`` (case-insensitive).
        output_folder (str): Passed to ``process_pdf`` for each PDF.
        process_count (int): Number of worker processes. ``None`` selects
            ``os.cpu_count()``. Defaults to 2.

    Returns:
        list: The images of all PDFs flattened into a single list. If any
        exception is raised it is logged and the function falls through,
        returning ``None``.
    """
    try:
        worker_total = os.cpu_count() if process_count is None else process_count

        # One (pdf_path, output_folder) argument tuple per PDF found.
        jobs = []
        for entry in os.listdir(pdf_folder):
            if entry.lower().endswith('.pdf'):
                jobs.append((os.path.join(pdf_folder, entry), output_folder))

        with Pool(worker_total) as pool:
            per_pdf_images = pool.starmap(process_pdf, jobs)

        flattened = []
        for page_images in per_pdf_images:
            flattened.extend(page_images)

        logger.debug(f"Number of pdfs processed: {len(per_pdf_images)} - Number of images: {len(flattened)}")
        return flattened
    except Exception as e:
        logger.exception(f"Error during processing pdf: {e}")
|
|
|
|
|
|