multimodal_rag / pdf_manager.py
ej68okap
new code added
241c492
# Import necessary modules
from pdf2image import convert_from_path # Convert PDF pages to images
import os # For file and directory operations
import shutil # For removing and recreating directories
class PdfManager:
    """
    A manager class for handling PDF-related operations, such as converting pages to images
    and managing output directories.
    """

    def __init__(self):
        """
        Initialize the PdfManager.
        The manager is stateless; no attributes are set during initialization.
        """

    def clear_and_recreate_dir(self, output_folder):
        """
        Clear the specified directory and recreate it.

        Any existing directory at `output_folder` is deleted together with its
        contents, then an empty directory is created at the same path.

        Args:
            output_folder (str): Path to the directory to be cleared and recreated.
        """
        print(f"Clearing output folder {output_folder}")
        # Remove the directory if it exists
        if os.path.exists(output_folder):
            shutil.rmtree(output_folder)  # Delete the folder and its contents
        # Recreate the directory
        os.makedirs(output_folder)

    def save_images(self, id, pdf_path, max_pages, pages: list[int] = None) -> list[str]:
        """
        Convert PDF pages to images and save them to a specified directory.

        Images are written as ``pages/<id>//page_<n>.png`` where ``<n>`` is the
        1-based position of the page in the PDF. The ``pages/<id>/`` directory
        is wiped and recreated on every call.

        Args:
            id (str): Unique identifier for the output folder.
            pdf_path (str): Path to the PDF file to be processed.
            max_pages (int): Maximum number of pages to save. A falsy value
                (``0`` or ``None``) disables the limit.
            pages (list[int], optional): Zero-based indices of the pages to save.
                ``None`` or an empty list selects every page.

        Returns:
            list[str]: Paths of the images that were actually written, in page order.
        """
        # Define the output folder for the images
        output_folder = f"pages/{id}/"
        # Convert the PDF pages to images.
        # NOTE: every page is rendered here, before the max_pages / pages filters
        # are applied below.
        images = convert_from_path(pdf_path)
        print(f"Saving images from {pdf_path} to {output_folder}. Max pages: {max_pages}")
        # Clear the existing directory and recreate it (done only after a
        # successful conversion, so a failed conversion leaves old output intact)
        self.clear_and_recreate_dir(output_folder)

        # Record each path as it is written so the return value always matches
        # the files on disk, even when `pages` selects a non-contiguous subset.
        saved_paths = []
        for i, image in enumerate(images):
            # Stop processing if the maximum number of pages is reached
            if max_pages and len(saved_paths) >= max_pages:
                break
            # Skip pages not in the specified list (if provided)
            if pages and i not in pages:
                continue
            # The doubled slash (output_folder already ends in "/") is kept so
            # returned paths stay identical to what existing callers receive.
            full_save_path = f"{output_folder}/page_{i + 1}.png"
            # Save the image in PNG format
            image.save(full_save_path, "PNG")
            saved_paths.append(full_save_path)
        # Return the paths of the saved images
        return saved_paths