# Spaces: Sleeping
import base64
import io
from typing import Any, List

import pdf2image
class Preprocessor:
    """Base class for file preprocessors.

    Subclasses override :meth:`run` to turn the file at a given path into
    whatever representation they produce.
    """

    def run(self, file_path: str) -> Any:
        """Preprocess the file at ``file_path``.

        Args:
            file_path: Path of the file to preprocess.

        Returns:
            The preprocessed content; the concrete type is chosen by the
            subclass.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("Preprocess method is not implemented")
# Convert every page of a PDF to a base64-encoded JPEG image
class PdfPreprocessor(Preprocessor):
    """Preprocessor that renders a PDF into base64-encoded JPEG pages."""

    def run(self, file_path: str) -> List[str]:
        """Render each page of a PDF and encode it as a base64 JPEG.

        Args:
            file_path: Path of the PDF file, handed to
                ``pdf2image.convert_from_path``.

        Returns:
            One base64 string per rendered page, in the order the pages were
            returned by ``pdf2image``. The list is empty when no pages come
            back.
        """
        # Render the document exactly once. An earlier version converted the
        # PDF twice and threw away a separately encoded first page, which
        # doubled the rendering cost and raised IndexError on an empty result.
        pages = pdf2image.convert_from_path(file_path)
        return [self._to_base64_jpeg(page) for page in pages]

    @staticmethod
    def _to_base64_jpeg(image: Any) -> str:
        """Serialize one rendered page to JPEG and return it base64-encoded."""
        with io.BytesIO() as buffer:
            image.save(buffer, format="JPEG")
            # Read the bytes before the buffer is closed by the ``with`` exit.
            return base64.b64encode(buffer.getvalue()).decode("utf-8")