File size: 1,243 Bytes
f745baf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44


import base64
import io
import pdf2image
from typing import Any


class Preprocessor:
    """Base class for file preprocessors.

    Subclasses override :meth:`run` to turn the file at a given path into
    whatever representation they produce.
    """

    def run(self, file_path: str) -> Any:
        """Preprocess the file at ``file_path``.

        The base implementation performs no work and always raises.

        Args:
            file_path: Path of the file to preprocess (unused here).

        Raises:
            NotImplementedError: Always; subclasses must provide ``run``.
        """
        message = "Preprocess method is not implemented"
        raise NotImplementedError(message)

# Convert PDF to image
class PdfPreprocessor(Preprocessor):
    """Preprocessor that renders a PDF into base64-encoded JPEG images."""

    def run(self, file_path: str) -> list[str]:
        """Convert every page of a PDF into a base64-encoded JPEG string.

        Args:
            file_path: Path to the PDF, handed to
                ``pdf2image.convert_from_path``.

        Returns:
            One base64 string per image returned by pdf2image, in the same
            order. The list is empty when the conversion yields no images.
        """
        # Rasterise the document exactly once. The earlier version converted
        # the PDF twice and encoded the first page separately, only to throw
        # that result away (and crash with IndexError on an empty result).
        pages = pdf2image.convert_from_path(file_path)
        return [self._to_base64_jpeg(page) for page in pages]

    @staticmethod
    def _to_base64_jpeg(image: Any) -> str:
        """Serialise one image as JPEG and return it base64-encoded.

        Args:
            image: Object exposing ``save(buffer, format=...)``; presumably a
                PIL image as produced by pdf2image — confirm against caller.

        Returns:
            The JPEG bytes, base64-encoded and decoded as UTF-8 text.
        """
        with io.BytesIO() as buffer:
            image.save(buffer, format="JPEG")
            return base64.b64encode(buffer.getvalue()).decode("utf-8")