ocr / pdftoimage.py
washeed's picture
Upload 18 files
b692870 verified
import os
from pdf2image import convert_from_path
def convert_pdf_to_images(pdf_path, output_format="png", max_pages=None):
    """Convert a single PDF file to one image file per page.

    The images are written as ``page_<n>.<output_format>`` into a folder
    that is created next to the PDF and named after it (the PDF's file name
    without its extension). Conversion is best-effort: any failure is
    reported on stdout and the function returns normally.

    Args:
        pdf_path (str): Path to the PDF file.
        output_format (str, optional): Desired output format for images
            (default: "png"). Supported formats are "png", "jpg", and "ppm".
        max_pages (int, optional): Maximum number of pages to convert
            (default: None, all pages). A falsy value such as 0 is passed
            on as None.
    """
    # Pillow identifies writers by format name, not by file extension:
    # "JPG" and "TIF" are not registered names and make Image.save() fail.
    pil_format_aliases = {"JPG": "JPEG", "TIF": "TIFF"}

    try:
        # File name without directory or extension; used as the folder name.
        pdf_name, _ = os.path.splitext(os.path.basename(pdf_path))

        images = convert_from_path(
            pdf_path,
            fmt=output_format,
            first_page=1,
            last_page=max_pages or None,  # None means "up to the last page"
        )

        # Only create the output folder once the conversion has succeeded,
        # so a failed conversion leaves nothing behind.
        buffer_folder_path = os.path.join(os.path.dirname(pdf_path), pdf_name)
        os.makedirs(buffer_folder_path, exist_ok=True)

        pil_format = output_format.upper()
        pil_format = pil_format_aliases.get(pil_format, pil_format)

        for page_number, image in enumerate(images, start=1):
            image_path = os.path.join(
                buffer_folder_path, f"page_{page_number}.{output_format}"
            )
            image.save(image_path, pil_format)
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and carry on
        # so that one bad PDF does not abort a batch run.
        print(f"Error converting {pdf_path}: {e}")
def convert_pdfs(pdf_folder_path, output_format="png", max_pages=None):
    """Convert all PDF files in a folder to images sequentially.

    Each regular file directly inside ``pdf_folder_path`` whose name ends in
    ".pdf" (in any letter case) is passed to ``convert_pdf_to_images``.
    Files are processed in sorted name order; sub-folders are not searched.

    Args:
        pdf_folder_path (str): Path to the folder containing PDF files.
        output_format (str, optional): Desired output format for images
            (default: "png"). Supported formats are "png", "jpg", and "ppm".
        max_pages (int, optional): Maximum number of pages to convert per
            PDF (default: None, all pages).
    """
    # os.listdir() returns entries in arbitrary order; sort for repeatable runs.
    for filename in sorted(os.listdir(pdf_folder_path)):
        # Case-insensitive match so files such as "SCAN.PDF" are not skipped.
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(pdf_folder_path, filename)

        # A directory can also be named "*.pdf"; only real files are converted.
        if not os.path.isfile(pdf_path):
            continue

        convert_pdf_to_images(pdf_path, output_format, max_pages)
# Example usage
# convert_pdfs("input", output_format="png", max_pages=2)  # Convert PDFs to PNG, keeping only the first 2 pages