# Import necessary modules
import os  # For file and directory operations
import shutil  # For removing and recreating directories
from typing import Optional

from pdf2image import convert_from_path  # Convert PDF pages to images


class PdfManager:
    """
    A manager class for handling PDF-related operations, such as converting
    pages to images and managing output directories.
    """

    def __init__(self):
        """
        Initialize the PdfManager.

        Currently, no attributes are set during initialization.
        """

    def clear_and_recreate_dir(self, output_folder):
        """
        Clear the specified directory and recreate it empty.

        Args:
            output_folder (str): Path to the directory to be cleared and
                recreated. Everything inside it is deleted.
        """
        print(f"Clearing output folder {output_folder}")

        # Remove the directory (and all of its contents) if it exists
        if os.path.exists(output_folder):
            shutil.rmtree(output_folder)

        # Recreate the directory, including any missing parents
        os.makedirs(output_folder)

    def save_images(
        self,
        id,
        pdf_path,
        max_pages,
        pages: Optional[list[int]] = None,
    ) -> list[str]:
        """
        Convert PDF pages to PNG images and save them under ``pages/<id>/``.

        The output directory is wiped and recreated on every call.

        Args:
            id (str): Unique identifier used as the output sub-folder name.
            pdf_path (str): Path to the PDF file to be processed.
            max_pages (int): Maximum number of pages to save. A falsy value
                (``None`` or ``0``) means no limit.
            pages (list[int], optional): 0-based indices of the pages to
                convert (index 0 is saved as ``page_1.png``). ``None`` or an
                empty list means all pages.

        Returns:
            list[str]: Paths of the images that were actually written, in
            page order.
        """
        # Define the output folder for the images
        output_folder = f"pages/{id}/"

        # Build the filter set once so the membership test in the loop is O(1).
        # An empty/None `pages` means "no filter".
        wanted = set(pages) if pages else None

        # Without a page filter only the first `max_pages` pages can ever be
        # saved, so tell pdf2image to stop there instead of rasterising the
        # whole document. With a filter the kept pages may be anywhere, so
        # everything is converted.
        convert_kwargs = {}
        if wanted is None and isinstance(max_pages, int) and max_pages > 0:
            convert_kwargs["last_page"] = int(max_pages)

        # Convert the PDF pages to images
        images = convert_from_path(pdf_path, **convert_kwargs)

        print(f"Saving images from {pdf_path} to {output_folder}. Max pages: {max_pages}")

        # Clear the existing directory and recreate it
        self.clear_and_recreate_dir(output_folder)

        saved_paths = []  # Paths of the pages written so far

        for i, image in enumerate(images):
            # Stop processing once the maximum number of pages is reached
            if max_pages and len(saved_paths) >= max_pages:
                break

            # Skip pages not in the specified list (if provided)
            if wanted is not None and i not in wanted:
                continue

            # `output_folder` already ends with "/", so the path contains
            # "//". Harmless on disk, and kept so returned strings match what
            # existing callers have always received.
            full_save_path = f"{output_folder}/page_{i + 1}.png"

            # Save the image in PNG format
            image.save(full_save_path, "PNG")
            saved_paths.append(full_save_path)

        # Return the paths that were really written. Rebuilding them from a
        # counter would name the wrong files whenever `pages` skips a page.
        return saved_paths