# Hugging Face Space status at capture time: Runtime error
# Import necessary modules
from pdf2image import convert_from_path  # Convert PDF pages to images
import os  # For file and directory operations
import shutil  # For removing and recreating directories
class PdfManager:
    """
    A manager class for handling PDF-related operations, such as converting pages to images
    and managing output directories.
    """

    def __init__(self):
        """
        Initialize the PdfManager.

        No attributes are set during initialization.
        """
        pass

    def clear_and_recreate_dir(self, output_folder):
        """
        Clear the specified directory and recreate it.

        Any existing content of the directory is deleted. Missing parent
        directories are created as needed.

        Args:
            output_folder (str): Path to the directory to be cleared and recreated.
        """
        print(f"Clearing output folder {output_folder}")
        # Remove the directory (and everything in it) if it exists
        if os.path.exists(output_folder):
            shutil.rmtree(output_folder)
        # Recreate the directory, now empty
        os.makedirs(output_folder)

    def save_images(self, id, pdf_path, max_pages, pages: list[int] = None) -> list[str]:
        """
        Convert PDF pages to images and save them to a specified directory.

        Images are written as PNG files named ``page_<n>.png`` (``n`` is the
        1-based page position) under ``pages/<id>/``. That directory is
        cleared before anything is written.

        Args:
            id (str): Unique identifier for the output folder.
            pdf_path (str): Path to the PDF file to be processed.
            max_pages (int): Maximum number of pages to save. A falsy value
                (``None`` or ``0``) disables the limit.
            pages (list[int], optional): Zero-based indices of the pages to
                save. ``None`` or an empty list selects every page.

        Returns:
            list[str]: Paths of the images that were actually saved, in page order.
        """
        # Define the output folder for the images
        output_folder = f"pages/{id}/"

        # Convert the PDF pages to images
        images = convert_from_path(pdf_path)
        print(f"Saving images from {pdf_path} to {output_folder}. Max pages: {max_pages}")

        # Clear the existing directory and recreate it
        self.clear_and_recreate_dir(output_folder)

        # Record each path as it is written. Rebuilding the list afterwards
        # from a counter would report page_1..page_N even when `pages`
        # selected different pages, i.e. files that were never created.
        saved_paths = []
        for i, image in enumerate(images):
            # Stop processing if the maximum number of pages is reached
            if max_pages and len(saved_paths) >= max_pages:
                break
            # Skip pages not in the specified list (if provided)
            if pages and i not in pages:
                continue
            # output_folder already ends with "/"; the extra separator is
            # kept so returned strings match what callers already receive.
            full_save_path = f"{output_folder}/page_{i + 1}.png"
            image.save(full_save_path, "PNG")
            saved_paths.append(full_save_path)

        return saved_paths