root
upload
e676d24
from PIL import Image
import numpy as np
import base64
import io
from io import BytesIO
from PIL import Image, ImageFile
from pdf2image import convert_from_path
import tempfile
from multiprocessing import Pool
import os
from loguru import logger
import uuid
from typing import Any, List, Tuple, Type, Literal, Optional, Union, Dict
def encode_image(image_path):
    """
    Read a file from disk and return its contents as base64 text.

    Args:
        image_path: Path of the file to read (opened in binary mode).
    Returns:
        str: The base64 encoding of the file's bytes, decoded as UTF-8.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
def load_image_from_base64(image):
    """
    Decode a base64 payload and open it as a PIL image.

    Args:
        image: Base64-encoded image data accepted by ``base64.b64decode``.
    Returns:
        The object returned by ``Image.open`` for the decoded bytes.
    """
    decoded_bytes = base64.b64decode(image)
    stream = BytesIO(decoded_bytes)
    return Image.open(stream)
def pil_image_to_base64(image: Image.Image) -> str:
    """
    Convert a PIL Image object to its base64 representation.

    The image is serialized as PNG into an in-memory buffer and the
    resulting bytes are base64-encoded.

    Args:
        image (Image.Image): The PIL Image object to be converted.
    Returns:
        str: The base64 representation of the PNG-encoded image.
    """
    # Serialize to PNG in memory so nothing is written to disk.
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
def scale_image(image: Image.Image, new_height: int = 1024) -> Image.Image:
    """
    Scale an image to a new height while maintaining the aspect ratio.

    Args:
        image (Image.Image): The image to scale.
        new_height (int): Target height in pixels.
    Returns:
        Image.Image: The result of ``image.resize`` with height ``new_height``
        and a width derived from the original aspect ratio (truncated to an
        integer, but never smaller than 1 pixel).
    """
    width, height = image.size
    aspect_ratio = width / height
    # Clamp to 1: for very tall, narrow images the truncated width would
    # otherwise be 0, which is not a valid target size for resize().
    new_width = max(1, int(new_height * aspect_ratio))
    return image.resize((new_width, new_height))
def unflatten_array(flat_list, vector_size=128):
    """
    Reshape a flat sequence into a 2-D array with ``vector_size`` columns.

    Args:
        flat_list: Flat sequence of values; its length must be a multiple
            of ``vector_size`` for the reshape to succeed.
        vector_size: Number of columns in the result.
    Returns:
        numpy.ndarray: Array of shape ``(len(flat_list) / vector_size, vector_size)``.
    """
    as_array = np.array(flat_list)
    target_shape = (-1, vector_size)
    return as_array.reshape(target_shape)
def get_image_embedding(image_list: list[Image.Image], openai_client, model: str, flatten: bool = False) -> list:
    """
    Get the embeddings of one or more images.

    Each image is PNG-encoded, wrapped in a ``data:image/png;base64,`` URI and
    sent in a single ``openai_client.embeddings.create`` request with
    ``modality="image"``.

    Args:
        image_list (list[Image.Image]): The images to be embedded. Any
            non-list value is treated as a single image and wrapped in a list.
        openai_client: Client object exposing ``embeddings.create``.
        model (str): Model name forwarded to the embeddings endpoint.
        flatten (bool): When true, ``encoding_format="base64"`` is requested,
            otherwise ``"float"``.
    Returns:
        list[list[float]] if flatten,
        else: list[list[list[float]]] with shape = (number of images (m), number of vectors for each image (n), vector dim = 128).
        An empty list is returned, without contacting the endpoint, when
        ``image_list`` is an empty list.
    """
    if not isinstance(image_list, list):
        image_list = [image_list]
    if not image_list:
        # Nothing to embed: skip the remote request entirely.
        return []

    input_base64_list = [f"data:image/png;base64,{pil_image_to_base64(image)}" for image in image_list]

    # NOTE(review): the shapes documented above assume the serving endpoint
    # returns a flat list for "base64" and nested vectors for "float" -
    # confirm against the server implementation.
    embedding = openai_client.embeddings.create(
        input=input_base64_list,
        model=model,
        extra_body={
            "modality": "image",
            "encoding_format": "float" if not flatten else "base64",
        },
    )
    return [embed.embedding for embed in embedding.data]
def get_text_embedding(texts: list[str], openai_client, model: str, flatten: bool = False) -> list:
    """
    Get the embeddings of one or more texts.

    All texts are sent in a single ``openai_client.embeddings.create`` request.

    Args:
        texts (list[str]): The texts to be embedded. Any non-list value is
            treated as a single text and wrapped in a list.
        openai_client: Client object exposing ``embeddings.create``.
        model (str): Model name forwarded to the embeddings endpoint.
        flatten (bool): When true, ``encoding_format="base64"`` is requested,
            otherwise ``"float"``.
    Returns:
        list[list[float]] if flatten,
        else: list[list[list[float]]] with shape = (number of texts (m), number of vectors for each text (n), vector dim = 128).
        An empty list is returned, without contacting the endpoint, when
        ``texts`` is an empty list.
    """
    if not isinstance(texts, list):
        texts = [texts]
    if not texts:
        # Nothing to embed: skip the remote request entirely.
        return []

    # NOTE(review): the shapes documented above assume the serving endpoint
    # returns a flat list for "base64" and nested vectors for "float" -
    # confirm against the server implementation.
    embedding = openai_client.embeddings.create(
        input=texts,
        model=model,
        extra_body={
            "encoding_format": "float" if not flatten else "base64",
        },
    )
    return [embed.embedding for embed in embedding.data]
def load_images(image_paths):
    """
    Open each path with PIL and collect the images that could be opened.

    A path that raises while being opened is reported through
    ``logger.error`` and skipped, so the result may be shorter than the input.

    Args:
        image_paths (list): Paths of the images to open.
    Returns:
        list: The successfully opened PIL image objects, in input order.
    """
    opened_images = []
    for path in image_paths:
        try:
            opened_images.append(Image.open(path))
        except Exception as e:
            # Best effort: report the failure and continue with the next path.
            logger.error(f"Error loading image at path {path}: {str(e)}")
    return opened_images
def process_pdf(pdf_path: str, output_folder: str, thread_count=1):
    """
    Render a PDF file to a list of in-memory PIL images.

    The PDF is converted at 200 dpi using a temporary directory as the
    scratch folder for ``convert_from_path``.

    Args:
        pdf_path (str): Path of the PDF file to convert.
        output_folder (str): Currently unused; kept so existing callers that
            pass it (such as ``pdf_folder_to_images``) keep working.
        thread_count (int): Forwarded to ``convert_from_path``.
    Returns:
        list: The images returned by ``convert_from_path``, with their pixel
        data already loaded.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        images = convert_from_path(pdf_path, dpi=200, output_folder=temp_dir, thread_count=thread_count)
        # The returned images may still be lazily backed by files inside
        # temp_dir. Read the pixel data now, before the directory is removed,
        # so the images stay usable (and picklable) after this function
        # returns.
        for image in images:
            image.load()
    return images
def pdf_folder_to_images(pdf_folder: str, output_folder: str, process_count: int = 2):
    """
    Convert every PDF in a folder to images using a pool of worker processes.

    Files whose name ends with ``.pdf`` (case-insensitive) are processed in
    sorted path order, one ``process_pdf`` call per file.

    Args:
        pdf_folder (str): Folder that is listed for PDF files (not recursive).
        output_folder (str): Forwarded unchanged to ``process_pdf``.
        process_count (int): Maximum number of worker processes. ``None``
            means "use ``os.cpu_count()``".
    Returns:
        list: All page images of all PDFs, flattened into one list; an empty
        list when the folder contains no PDF files. ``None`` is returned when
        any exception occurs; the exception is logged, not raised.
    """
    try:
        if process_count is None:
            # cpu_count() may itself be None when it cannot be determined.
            process_count = os.cpu_count() or 1

        # Sort so the order of the result does not depend on the arbitrary
        # order in which os.listdir reports directory entries.
        pdf_files = sorted(
            os.path.join(pdf_folder, name)
            for name in os.listdir(pdf_folder)
            if name.lower().endswith('.pdf')
        )

        if pdf_files:
            # Create a list of tuples containing (pdf_file, output_folder)
            args = [(pdf_file, output_folder) for pdf_file in pdf_files]
            # Never start more workers than there are files to process.
            with Pool(min(process_count, len(pdf_files))) as pool:
                all_images = pool.starmap(process_pdf, args)
        else:
            # No work to do: avoid spawning worker processes at all.
            all_images = []

        result = [img for sublist in all_images for img in sublist]
        logger.debug(f"Number of pdfs processed: {len(all_images)} - Number of images: {len(result)}")
        return result
    except Exception as e:
        logger.exception(f"Error during processing pdf: {e}")
        # Preserve the historical contract: failures yield None, not an exception.
        return None