Spaces:
Sleeping
Sleeping
from PIL import Image | |
import numpy as np | |
import base64 | |
import io | |
from io import BytesIO | |
from PIL import Image, ImageFile | |
from pdf2image import convert_from_path | |
import tempfile | |
from multiprocessing import Pool | |
import os | |
from loguru import logger | |
import uuid | |
from typing import Any, List, Tuple, Type, Literal, Optional, Union, Dict | |
def encode_image(image_path):
    """
    Read a file from disk and return its contents as a base64 string.
    Args:
        image_path: Path of the file to read; it is opened in binary mode.
    Returns:
        str: The file's bytes, base64-encoded and decoded as UTF-8 text.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
def load_image_from_base64(image):
    """
    Decode a base64 payload and open the result as a PIL image.
    Args:
        image: Base64-encoded image data, in any form base64.b64decode accepts.
    Returns:
        The object returned by Image.open for the decoded bytes.
    """
    decoded_bytes = base64.b64decode(image)
    stream = BytesIO(decoded_bytes)
    return Image.open(stream)
def pil_image_to_base64(image: Image.Image) -> str:
    """
    Convert a PIL Image object to its base64 representation.

    The image is serialized as PNG into an in-memory buffer and the
    resulting bytes are base64-encoded.
    Args:
        image (Image.Image): The PIL Image object to be converted.
    Returns:
        str: The base64 representation of the PNG-encoded image.
    """
    # Scope the buffer with a context manager so it is released as soon as
    # the encoded string has been produced.
    with io.BytesIO() as buffer:
        image.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode("utf-8")
def scale_image(image: Image.Image, new_height: int = 1024) -> Image.Image:
    """
    Scale an image to a new height while maintaining the aspect ratio.
    Args:
        image (Image.Image): The image to resize.
        new_height (int): Target height in pixels.
    Returns:
        Image.Image: The result of image.resize with the computed width and
        the requested height.
    """
    width, height = image.size
    aspect_ratio = width / height
    # int() truncates, so a very narrow image could yield a width of 0,
    # which is not a usable size for resize; keep at least one pixel.
    new_width = max(1, int(new_height * aspect_ratio))
    return image.resize((new_width, new_height))
def unflatten_array(flat_list, vector_size=128):
    """
    Reshape a flat sequence into a 2-D array of fixed-width rows.
    Args:
        flat_list: Flat sequence of values; its length must be a multiple
            of vector_size.
        vector_size: Number of values per row.
    Returns:
        A new NumPy array with vector_size columns; the number of rows is
        inferred from the input length.
    """
    flat_array = np.array(flat_list)
    target_shape = (-1, vector_size)
    return flat_array.reshape(target_shape)
def get_image_embedding(image_list: Union[Image.Image, list[Image.Image]], openai_client, model: str, flatten: bool = False) -> list:
    """
    Get the embeddings of one or more images.

    Each image is PNG-encoded, wrapped in a "data:image/png;base64,..."
    data URI and sent in a single embeddings request.
    Args:
        image_list: A list (or tuple) of PIL images, or a single image,
            which is treated as a batch of one.
        openai_client: Client object exposing embeddings.create(...).
        model (str): Name of the embedding model to request.
        flatten (bool): Selects the requested "encoding_format":
            "base64" when True, "float" when False.
    Returns:
        list: One entry per item in the response's data, holding that item's
        embedding attribute. Per the original author's notes this is
        list[list[float]] when flatten is set, otherwise
        list[list[list[float]]] with shape (number of images, vectors per
        image, 128) -- this depends on the serving backend; TODO confirm.
    """
    if isinstance(image_list, tuple):
        # A tuple is a batch, not a single image; previously it was wrapped
        # as one element and failed during PNG encoding.
        image_list = list(image_list)
    elif not isinstance(image_list, list):
        image_list = [image_list]
    input_base64_list = [f"data:image/png;base64,{pil_image_to_base64(image)}" for image in image_list]
    response = openai_client.embeddings.create(
        input=input_base64_list,
        model=model,
        extra_body={
            "modality": "image",
            "encoding_format": "base64" if flatten else "float",
        },
    )
    return [item.embedding for item in response.data]
def get_text_embedding(texts: Union[str, list[str]], openai_client, model: str, flatten: bool = False) -> list:
    """
    Get the embeddings of one or more texts.
    Args:
        texts: A list (or tuple) of strings, or a single value, which is
            treated as a batch of one.
        openai_client: Client object exposing embeddings.create(...).
        model (str): Name of the embedding model to request.
        flatten (bool): Selects the requested "encoding_format":
            "base64" when True, "float" when False.
    Returns:
        list: One entry per item in the response's data, holding that item's
        embedding attribute. Per the original author's notes this is
        list[list[float]] when flatten is set, otherwise
        list[list[list[float]]] with shape (number of texts, vectors per
        text, 128) -- this depends on the serving backend; TODO confirm.
    """
    if isinstance(texts, tuple):
        # A tuple is a batch of texts; previously it was wrapped as a single
        # nested input instead of being sent as separate items.
        texts = list(texts)
    elif not isinstance(texts, list):
        texts = [texts]
    response = openai_client.embeddings.create(
        input=texts,
        model=model,
        extra_body={
            "encoding_format": "base64" if flatten else "float",
        },
    )
    return [item.embedding for item in response.data]
def load_images(image_paths):
    """
    Open every path in image_paths with PIL and collect the results.

    Paths that fail to open are logged and skipped, so the returned list
    may be shorter than the input.
    Args:
        image_paths (list): List of image paths.
    Returns:
        list: PIL image objects for the paths that opened successfully, in
        input order.
    """
    loaded = []
    for path in image_paths:
        try:
            opened = Image.open(path)
        except Exception as exc:
            # Best effort: report the failure and carry on with the rest.
            logger.error(f"Error loading image at path {path}: {str(exc)}")
        else:
            loaded.append(opened)
    return loaded
def process_pdf(pdf_path: str, output_folder: str, thread_count=1):
    """
    Render the pages of a PDF to PIL images at 200 DPI.

    Rendering goes through a temporary directory that is removed before
    this function returns.
    Args:
        pdf_path (str): Path of the PDF file to convert.
        output_folder (str): Currently unused; kept so existing callers that
            pass it (e.g. via Pool.starmap) keep working.
        thread_count (int): Forwarded to convert_from_path.
    Returns:
        list: The images produced by convert_from_path, with their pixel
        data loaded into memory.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        images = convert_from_path(pdf_path, dpi=200, output_folder=temp_dir, thread_count=thread_count)
        # When an output folder is supplied, pdf2image may hand back images
        # that are still backed by files inside it. Force the pixel data
        # into memory while those files exist, so the returned images do not
        # depend on a directory that is about to be deleted.
        for image in images:
            image.load()
    return images
def pdf_folder_to_images(pdf_folder: str, output_folder: str, process_count: Optional[int] = 2) -> Optional[list]:
    """
    Convert every PDF in a folder to images using a pool of worker processes.
    Args:
        pdf_folder (str): Folder scanned (non-recursively) for files whose
            name ends with ".pdf", case-insensitively.
        output_folder (str): Forwarded unchanged to process_pdf for each PDF.
        process_count (Optional[int]): Number of worker processes; None
            means os.cpu_count().
    Returns:
        Optional[list]: The images of all PDFs flattened into one list,
        ordered by PDF file name and then by process_pdf's own order.
        None if any error occurred; the error is logged, not raised.
    """
    try:
        if process_count is None:
            process_count = os.cpu_count()
        # os.listdir returns entries in arbitrary order; sort so the output
        # order is reproducible across runs and platforms.
        pdf_files = [os.path.join(pdf_folder, f) for f in sorted(os.listdir(pdf_folder))
                     if f.lower().endswith('.pdf')]
        # One (pdf_file, output_folder) argument tuple per PDF for starmap.
        args = [(pdf_file, output_folder) for pdf_file in pdf_files]
        with Pool(process_count) as pool:
            all_images = pool.starmap(process_pdf, args)
        result = [img for sublist in all_images for img in sublist]
        logger.debug(f"Number of pdfs processed: {len(all_images)} - Number of images: {len(result)}")
        return result
    except Exception as e:
        # Best-effort boundary: log with traceback and signal failure with
        # an explicit None, as the original did implicitly.
        logger.exception(f"Error during processing pdf: {e}")
        return None