import base64
import io
from typing import Any, List

import pdf2image


class Preprocessor:
    """Base class for file preprocessors.

    Subclasses override :meth:`run` to turn the file at a given path into
    whatever representation the downstream consumer expects.
    """

    def run(self, file_path: str) -> Any:
        """Preprocess the file at ``file_path``.

        Args:
            file_path: Path of the file to preprocess.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError("Preprocess method is not implemented")


# Convert PDF to image
class PdfPreprocessor(Preprocessor):
    """Preprocessor that renders each page of a PDF to a base64 JPEG string."""

    def run(self, file_path: str) -> List[str]:
        """Convert every page of a PDF into a base64-encoded JPEG.

        The PDF is rasterised exactly once; each resulting page image is
        then encoded independently.

        Args:
            file_path: Path of the PDF file to convert.

        Returns:
            One base64 string (decoded as UTF-8 text) per page image, in the
            order returned by ``pdf2image.convert_from_path``. The list is
            empty if the conversion yields no images.
        """
        pages = pdf2image.convert_from_path(file_path)
        return [self._encode_page(page) for page in pages]

    @staticmethod
    def _encode_page(page: Any) -> str:
        """Serialise one page image as JPEG and return it base64-encoded."""
        # Write into an in-memory buffer so no temporary file is needed.
        with io.BytesIO() as buffer:
            page.save(buffer, format="JPEG")
            jpeg_bytes = buffer.getvalue()
        return base64.b64encode(jpeg_bytes).decode("utf-8")