Spaces:
Runtime error
Runtime error
File size: 2,893 Bytes
241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c 241c492 273089c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
# Import necessary modules
from pdf2image import convert_from_path # Convert PDF pages to images
import os # For file and directory operations
import shutil # For removing and recreating directories
class PdfManager:
    """
    A manager class for handling PDF-related operations, such as converting pages to images
    and managing output directories.
    """

    def __init__(self):
        """
        Initialize the PdfManager.

        The manager is stateless; no attributes are set during initialization.
        """

    def clear_and_recreate_dir(self, output_folder):
        """
        Clear the specified directory and recreate it.

        Any existing directory at ``output_folder`` is deleted together with its
        contents, then an empty directory (including missing parents) is created.

        Args:
            output_folder (str): Path to the directory to be cleared and recreated.
        """
        print(f"Clearing output folder {output_folder}")

        # Remove the directory (and everything inside it) if it exists
        if os.path.exists(output_folder):
            shutil.rmtree(output_folder)

        # Recreate the directory, creating intermediate directories as needed
        os.makedirs(output_folder)

    @staticmethod
    def _select_page_indices(total_pages, max_pages, pages=None):
        """
        Return the zero-based indices of the pages that should be saved.

        Args:
            total_pages (int): Number of pages available.
            max_pages (int): Maximum number of pages to select; a falsy value
                (``0`` or ``None``) means no limit.
            pages (list[int], optional): Zero-based indices to keep; a falsy value
                (``None`` or an empty list) means every page is eligible.

        Returns:
            list[int]: Selected indices in ascending order.
        """
        # A set gives O(1) membership tests; None means "no filter"
        wanted = set(pages) if pages else None

        selected = []
        for index in range(total_pages):
            # The limit counts pages actually selected, not pages scanned
            if max_pages and len(selected) >= max_pages:
                break
            if wanted is not None and index not in wanted:
                continue
            selected.append(index)
        return selected

    def save_images(self, id, pdf_path, max_pages, pages: list[int] = None) -> list[str]:
        """
        Convert PDF pages to images and save them to a specified directory.

        Images are written as ``pages/<id>/page_<n>.png`` where ``n`` is the
        one-based page number. The output directory is cleared before saving.

        Args:
            id (str): Unique identifier for the output folder.
            pdf_path (str): Path to the PDF file to be processed.
            max_pages (int): Maximum number of pages to convert and save; a falsy
                value (``0`` or ``None``) means no limit.
            pages (list[int], optional): Zero-based indices of the pages to save
                (index 0 is saved as ``page_1.png``). ``None`` or an empty list
                saves all pages.

        Returns:
            list[str]: Paths of the images that were actually written, in page order.
        """
        # Define the output folder for the images
        output_folder = f"pages/{id}/"

        # Convert before clearing the folder, so a failed conversion leaves any
        # previously saved images untouched.
        # NOTE(review): every page is rendered even when max_pages is small;
        # pdf2image's first_page/last_page arguments could bound this - confirm
        # before changing.
        images = convert_from_path(pdf_path)

        print(f"Saving images from {pdf_path} to {output_folder}. Max pages: {max_pages}")

        # Clear the existing directory and recreate it
        self.clear_and_recreate_dir(output_folder)

        saved_paths = []
        for index in self._select_page_indices(len(images), max_pages, pages):
            # os.path.join avoids the doubled separator that plain concatenation
            # would produce, since output_folder already ends with "/"
            full_save_path = os.path.join(output_folder, f"page_{index + 1}.png")
            images[index].save(full_save_path, "PNG")
            # Record the real path so the return value matches the files on disk
            # even when the `pages` filter skips some pages
            saved_paths.append(full_save_path)

        return saved_paths
|