Spaces:

tsystems
/

visual_document_retrieval

Running

visual_document_retrieval / server /app /vdr_utils.py

root

upload

e676d24 6 months ago

5.46 kB

	from PIL import Image
	import numpy as np
	import base64
	import io
	from io import BytesIO
	from PIL import Image, ImageFile
	from pdf2image import convert_from_path
	import tempfile
	from multiprocessing import Pool
	import os
	from loguru import logger
	import uuid

	from typing import Any, List, Tuple, Type, Literal, Optional, Union, Dict

	def encode_image(image_path):
	    """
	    Read a file from disk and return its contents as base64 text.

	    Args:
	        image_path: Path of the file to read (opened in binary mode).

	    Returns:
	        str: The file's bytes, base64-encoded and decoded as UTF-8 text.
	    """
	    with open(image_path, "rb") as handle:
	        raw_bytes = handle.read()

	    encoded = base64.b64encode(raw_bytes)
	    return encoded.decode('utf-8')

	def load_image_from_base64(image):
	    """
	    Open a PIL image from base64-encoded image data.

	    Args:
	        image: Base64-encoded image file content.

	    Returns:
	        The object returned by ``Image.open`` for the decoded bytes.
	    """
	    decoded = base64.b64decode(image)
	    stream = BytesIO(decoded)
	    return Image.open(stream)

	def pil_image_to_base64(image: "Image.Image", image_format: str = "PNG") -> str:
	    """
	    Convert a PIL Image object to its base64 representation.

	    Args:
	        image (Image.Image): The PIL Image object to be converted.
	        image_format (str): Format name passed to ``image.save``. Defaults to
	            "PNG", which is what this function always used before the
	            parameter existed.

	    Returns:
	        str: The base64 representation of the encoded image bytes.
	    """
	    # Serialise into an in-memory buffer so nothing touches the filesystem.
	    with io.BytesIO() as buffer:
	        image.save(buffer, format=image_format)
	        return base64.b64encode(buffer.getvalue()).decode("utf-8")

	def scale_image(image: "Image.Image", new_height: int = 1024) -> "Image.Image":
	    """
	    Scale an image to a new height while maintaining the aspect ratio.

	    Args:
	        image (Image.Image): The image to scale; its ``size`` is read as
	            ``(width, height)``.
	        new_height (int): Target height in pixels. Must be positive.

	    Returns:
	        Image.Image: The result of ``image.resize((new_width, new_height))``.

	    Raises:
	        ValueError: If ``new_height`` is not positive.
	    """
	    if new_height <= 0:
	        raise ValueError(f"new_height must be a positive integer, got {new_height}")

	    width, height = image.size
	    aspect_ratio = width / height

	    # Truncation can yield 0 for very tall, narrow images, which resize()
	    # cannot handle; keep at least one pixel of width.
	    new_width = max(1, int(new_height * aspect_ratio))

	    return image.resize((new_width, new_height))

	def unflatten_array(flat_list, vector_size=128):
	    """
	    Reshape a flat sequence into a 2-D array with ``vector_size`` columns.

	    Args:
	        flat_list: Flat sequence of values; converted with ``np.array``.
	        vector_size: Number of columns in the result (default 128).

	    Returns:
	        np.ndarray: Array of shape ``(-1, vector_size)``, i.e. the row count
	        is inferred from the input length.
	    """
	    flat = np.array(flat_list)
	    return flat.reshape((-1, vector_size))

	def get_image_embedding(image_list: "list[Image.Image]", openai_client, model: str, flatten: bool = False) -> list:
	    """
	    Get the embeddings of one or more images.

	    Every image is encoded with ``pil_image_to_base64``, wrapped in a
	    ``data:image/png;base64,`` URL and sent in a single
	    ``openai_client.embeddings.create`` call with ``modality="image"``.

	    Args:
	        image_list: A list or tuple of PIL images. A single image is also
	            accepted and treated as a one-element batch.
	        openai_client: Client object exposing
	            ``embeddings.create(input=..., model=..., extra_body=...)``.
	        model (str): Model name forwarded to the embeddings endpoint.
	        flatten (bool): When True the request asks for ``encoding_format``
	            "base64", otherwise "float".

	    Returns:
	        list: The ``embedding`` attribute of each item in the response's
	        ``data``, in response order; an empty list when no images are given.
	        Per the original author's note (server-dependent, not verifiable
	        from this file): list[list[float]] if flatten, else
	        list[list[list[float]]] with shape = (number of images (m), number
	        of vectors for each image (n), vector dim = 128).
	    """
	    # Normalise the input: tuples are batches too, anything else is one image.
	    if isinstance(image_list, tuple):
	        image_list = list(image_list)
	    elif not isinstance(image_list, list):
	        image_list = [image_list]

	    # Nothing to embed: skip the remote call entirely.
	    if not image_list:
	        return []

	    input_base64_list = [
	        f"data:image/png;base64,{pil_image_to_base64(image)}" for image in image_list
	    ]

	    response = openai_client.embeddings.create(
	        input=input_base64_list,
	        model=model,
	        extra_body={
	            "modality": "image",
	            "encoding_format": "base64" if flatten else "float",
	        },
	    )

	    return [item.embedding for item in response.data]

	def get_text_embedding(texts: list[str], openai_client, model: str, flatten: bool = False) -> list:
	    """
	    Get the embeddings of one or more texts.

	    All texts are sent in a single ``openai_client.embeddings.create`` call.

	    Args:
	        texts: A list or tuple of strings. A single string is also accepted
	            and treated as a one-element batch.
	        openai_client: Client object exposing
	            ``embeddings.create(input=..., model=..., extra_body=...)``.
	        model (str): Model name forwarded to the embeddings endpoint.
	        flatten (bool): When True the request asks for ``encoding_format``
	            "base64", otherwise "float".

	    Returns:
	        list: The ``embedding`` attribute of each item in the response's
	        ``data``, in response order; an empty list when no texts are given.
	        Per the original author's note (server-dependent, not verifiable
	        from this file): list[list[float]] if flatten, else
	        list[list[list[float]]] with shape = (number of texts (m), number
	        of vectors for each text (n), vector dim = 128).
	    """
	    # Normalise the input: tuples are batches too, anything else is one text.
	    if isinstance(texts, tuple):
	        texts = list(texts)
	    elif not isinstance(texts, list):
	        texts = [texts]

	    # Nothing to embed: skip the remote call entirely.
	    if not texts:
	        return []

	    response = openai_client.embeddings.create(
	        input=texts,
	        model=model,
	        extra_body={
	            "encoding_format": "base64" if flatten else "float",
	        },
	    )

	    return [item.embedding for item in response.data]

	def load_images(image_paths):
	    """
	    Open every image in a list of paths, skipping the ones that fail.

	    Args:
	        image_paths (list): Paths handed to ``Image.open`` one by one.

	    Returns:
	        list: The successfully opened PIL image objects, in input order.
	        Paths that raise are logged via ``logger.error`` and left out.
	    """
	    loaded = []
	    for path in image_paths:
	        try:
	            opened = Image.open(path)
	        except Exception as exc:
	            # Best effort: report the failure and carry on with the rest.
	            logger.error(f"Error loading image at path {path}: {str(exc)}")
	        else:
	            loaded.append(opened)
	    return loaded


	def process_pdf(pdf_path: str, output_folder: str, thread_count=1):
	    """
	    Render the pages of a PDF file to PIL images.

	    Pages are rendered by ``convert_from_path`` at 200 dpi into a temporary
	    directory that is removed before this function returns.

	    Args:
	        pdf_path (str): Path of the PDF file to convert.
	        output_folder (str): Currently unused; kept so existing callers that
	            pass it (e.g. via ``starmap``) keep working.
	        thread_count: Forwarded to ``convert_from_path``.

	    Returns:
	        list: The images returned by ``convert_from_path``, with their pixel
	        data already loaded.
	    """
	    with tempfile.TemporaryDirectory() as temp_dir:
	        images = convert_from_path(pdf_path, dpi=200, output_folder=temp_dir, thread_count=thread_count)

	        # With output_folder set, the returned images are backed by files in
	        # temp_dir. Read the pixel data now, while those files still exist,
	        # so the images stay usable after the directory is cleaned up.
	        for image in images:
	            image.load()

	    return images


	def pdf_folder_to_images(pdf_folder: str, output_folder: str, process_count: int = 2):
	    """
	    Convert every PDF in a folder to images using a process pool.

	    Args:
	        pdf_folder (str): Folder scanned (non-recursively) for files whose
	            name ends with ".pdf", case-insensitively.
	        output_folder (str): Passed through to ``process_pdf`` for each file.
	        process_count (int): Number of worker processes; ``None`` means
	            ``os.cpu_count()``.

	    Returns:
	        list: All images from all PDFs in one flat list, or ``None`` if any
	        exception occurred (the exception is logged, not re-raised).
	    """
	    try:
	        worker_count = os.cpu_count() if process_count is None else process_count

	        # One (pdf_file, output_folder) argument tuple per PDF found.
	        jobs = []
	        for entry in os.listdir(pdf_folder):
	            if entry.lower().endswith('.pdf'):
	                jobs.append((os.path.join(pdf_folder, entry), output_folder))

	        with Pool(worker_count) as pool:
	            per_pdf_images = pool.starmap(process_pdf, jobs)

	        # Flatten the per-PDF lists into a single list of images.
	        flattened = []
	        for page_group in per_pdf_images:
	            flattened.extend(page_group)

	        logger.debug(f"Number of pdfs processed: {len(per_pdf_images)} - Number of images: {len(flattened)}")
	        return flattened
	    except Exception as e:
	        logger.exception(f"Error during processing pdf: {e}")
	        return None