File size: 2,893 Bytes
241c492
 
 
 
273089c
 
241c492
 
 
 
 
273089c
241c492
 
 
 
273089c
241c492
273089c
241c492
 
 
 
 
 
273089c
 
241c492
273089c
241c492
273089c
241c492
273089c
 
 
241c492
 
 
 
 
 
 
 
 
 
 
 
 
273089c
 
241c492
 
273089c
 
241c492
273089c
 
241c492
273089c
241c492
273089c
241c492
273089c
 
 
241c492
273089c
 
 
241c492
273089c
 
241c492
273089c
 
241c492
273089c
241c492
273089c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# Import necessary modules
from pdf2image import convert_from_path  # Convert PDF pages to images
import os  # For file and directory operations
import shutil  # For removing and recreating directories

class PdfManager:
    """
    Helper for rendering PDF pages to PNG files on disk.

    Images are written under ``pages/<id>/``; that directory is wiped and
    recreated on every call to :meth:`save_images`.
    """

    def __init__(self):
        """
        Initialize the PdfManager.

        The manager is stateless; no attributes are set.
        """
        pass

    def clear_and_recreate_dir(self, output_folder):
        """
        Delete ``output_folder`` (if present) and create it again, empty.

        Args:
            output_folder (str): Path to the directory to be cleared and recreated.
        """
        print(f"Clearing output folder {output_folder}")

        # Remove the directory and everything in it if it already exists
        if os.path.exists(output_folder):
            shutil.rmtree(output_folder)

        # Recreate the (now empty) directory, including missing parents
        os.makedirs(output_folder)

    @staticmethod
    def _select_indices(page_count, max_pages, pages):
        """
        Return the 0-based page indices to save, in document order.

        Args:
            page_count (int): Number of rendered pages available.
            max_pages (int): Cap on how many pages to keep; a falsy value
                (``None`` or ``0``) means no cap.
            pages (list[int]): 0-based indices to keep; a falsy value
                (``None`` or empty) means every page is eligible.

        Returns:
            list[int]: The selected indices in ascending order.
        """
        # Build the membership set once instead of scanning the list per page
        wanted = set(pages) if pages else None

        selected = []
        for index in range(page_count):
            # Stop as soon as the cap is reached
            if max_pages and len(selected) >= max_pages:
                break

            # Skip pages the caller did not ask for
            if wanted is not None and index not in wanted:
                continue

            selected.append(index)

        return selected

    def save_images(self, id, pdf_path, max_pages, pages: list[int] = None) -> list[str]:
        """
        Convert PDF pages to PNG images and save them under ``pages/<id>/``.

        Args:
            id (str): Unique identifier used as the output sub-folder name.
            pdf_path (str): Path to the PDF file to be processed.
            max_pages (int): Maximum number of pages to save; a falsy value
                disables the limit.
            pages (list[int], optional): 0-based indices of the pages to save
                (they are compared against the position of each page in the
                converted list). ``None`` or an empty list selects all pages.

        Returns:
            list[str]: Paths of the images that were actually written, in page
            order. Files are named ``page_<n>.png`` where ``n`` is the 1-based
            page number.
        """
        # Define the output folder for the images
        output_folder = f"pages/{id}/"

        # Convert first so a failing conversion does not wipe existing output.
        # Note: every page of the PDF is rendered before any filtering happens.
        images = convert_from_path(pdf_path)
        print(f"Saving images from {pdf_path} to {output_folder}. Max pages: {max_pages}")

        # Clear the existing directory and recreate it
        self.clear_and_recreate_dir(output_folder)

        saved_paths = []
        for index in self._select_indices(len(images), max_pages, pages):
            # Path format (including the doubled separator that results from
            # the trailing slash on output_folder) is kept for compatibility.
            full_save_path = f"{output_folder}/page_{index + 1}.png"

            # Save the image in PNG format
            images[index].save(full_save_path, "PNG")

            # Record the real path so the return value matches what is on
            # disk even when `pages` skips some pages.
            saved_paths.append(full_save_path)

        return saved_paths