Spaces:
Sleeping
Sleeping
"""Responsible for (pre)processing images and PDFs before they are passed to the OCR | |
engine and other miscellaneous actions concerning processing. | |
""" | |
import os | |
from pathlib import Path | |
from typing import List | |
# import cv2 | |
# import numpy as np | |
import pyocr | |
from pdf2image import pdf2image | |
from PIL import Image #, ImageOps | |
# Resolution (dots per inch) used when rasterising PDF pages: passed as ``dpi``
# to pdf2image and as ``-density`` to ImageMagick.
PDF_CONVERSION_DPI = 300

# An image is rotated only when the orientation-detection confidence is
# strictly greater than this value; otherwise it is copied unchanged.
ROTATION_CONFIDENCE_THRESHOLD = 2.0
# def rotate_image(image: Image, angle: float): | |
# """Rotates the given image by the given angle. | |
# Args: | |
# image(PIL.Image.Image): The image to be rotated. | |
# angle(float): The angle to rotate the image by. | |
# Returns: The rotated image. | |
# """ | |
# image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) | |
# height, width, _ = image.shape # Get the image height, width, and channels | |
# # Compute the rotation matrix | |
# rotation_matrix = cv2.getRotationMatrix2D((width / 2, height / 2), angle, 1) | |
# # Apply the rotation to the image | |
# rotated_image = cv2.warpAffine(image, rotation_matrix, (width, height)) | |
# rotated_image = Image.fromarray(cv2.cvtColor(rotated_image, cv2.COLOR_BGR2RGB)) | |
# return rotated_image | |
# class PDF_CONVERTER(enum.Enum): | |
# PDF2IMAGE = 1 | |
# IMAGEMAGICK = 2 | |
def correct_orientation(image: Image.Image) -> Image.Image:
    """Return an upright copy of ``image`` based on Tesseract's orientation guess.

    Args:
        image(PIL.Image.Image): The pillow image to be corrected.

    Returns: A new pillow image. It is rotated by the detected angle when the
        detection confidence exceeds ROTATION_CONFIDENCE_THRESHOLD; otherwise
        it is a plain copy. The original image is never closed here.

    Raises:
        Exception: If Tesseract is not available through pyocr.
    """
    if not pyocr.tesseract.is_available():
        raise Exception("Tesseract is not available.")

    # NOTE: EXIF-based transposition is deliberately not applied here; the
    # original author noted that EXIF rotation is apparent, not actual.
    try:
        detected = pyocr.tesseract.detect_orientation(image)
    except pyocr.PyocrException as e:
        # Best effort: report the failure and fall through with the defaults
        # below (angle 0, confidence 100), i.e. a zero-degree rotation.
        print("Orientation detection failed: {}".format(e))
        detected = {}

    detected_angle = detected.get("angle", 0)
    detected_confidence = detected.get("confidence", 100)

    if detected_confidence > ROTATION_CONFIDENCE_THRESHOLD:
        # expand=True grows the canvas so nothing is cropped by the rotation.
        return image.rotate(detected_angle, expand=True)
    return image.copy()
def convert_pdf_to_image_pdf2image(pdf_bytes: bytes) -> List[Image.Image]:
    """Rasterise an in-memory PDF into pillow images using pdf2image.

    Args:
        pdf_bytes(bytes): The raw bytes of the PDF to be converted.

    Returns: A list of pillow images, one per page of the PDF, rendered at
        PDF_CONVERSION_DPI.
    """
    return pdf2image.convert_from_bytes(pdf_bytes, dpi=PDF_CONVERSION_DPI)
def convert_pdf_to_image_ImageMagick(filename: Path, dest_folder: Path) -> Path:
    """Converts a PDF to JPEG page images using the ImageMagick CLI.

    The command executed is equivalent to::

        magick convert -density <PDF_CONVERSION_DPI> <filename> -quality 100 <dest_folder>/page.jpg

    Args:
        filename(pathlib.Path): The path to the PDF to be converted.
        dest_folder(pathlib.Path): The destination folder for the converted pages.
            The output target is ``page.jpg`` inside this folder; for
            multi-page PDFs ImageMagick adds a numeric page suffix to the
            file name.

    Returns: dest_folder

    Raises:
        FileNotFoundError: If the ``magick`` executable cannot be found.
        subprocess.CalledProcessError: If ImageMagick exits with a non-zero
            status.
    """
    # Imported locally so this function does not depend on a module-level
    # import being present.
    import subprocess

    output_target = Path(dest_folder) / "page.jpg"

    # Each argument is a separate list element. The previous implementation
    # glued adjacent f-strings together without spaces, which produced an
    # invalid command line, and interpolated unquoted paths into a shell string.
    command = [
        "magick",
        "convert",
        "-density",
        str(PDF_CONVERSION_DPI),
        str(filename),
        "-quality",
        "100",
        str(output_target),
    ]
    # check=True surfaces conversion failures instead of silently returning a
    # folder that contains no pages.
    subprocess.run(command, check=True)
    return dest_folder
def preprocess_image(image: Image.Image) -> Image.Image:
    """Prepare a single image for OCR.

    Steps performed:
        1. Orientation correction

    Args:
        image(PIL.Image.Image): The image to be preprocessed. It is closed by
            this function once the corrected image has been produced.

    Returns: The preprocessed pillow image.
    """
    upright = correct_orientation(image)
    # The input is no longer needed once the corrected image exists.
    image.close()
    return upright
def preprocess_pdf_pdf2image(pdf_bytes: bytes) -> List[Image.Image]:
    """Prepare every page of an in-memory PDF for OCR.

    Steps performed:
        1. PDF to image conversion
        2. Orientation correction

    Args:
        pdf_bytes(bytes): The bytes of the PDF to be preprocessed.

    Returns: A list of pillow images corresponding to each page from the PDF.
    """
    processed_pages = []
    for page in convert_pdf_to_image_pdf2image(pdf_bytes):
        processed_pages.append(preprocess_image(page))
        # Release the raw page explicitly after it has been preprocessed.
        page.close()
    return processed_pages
def preprocess_pdf_ImageMagick(filename: Path) -> List[Image.Image]:
    """Preprocesses a PDF for future use with OCR, converting via ImageMagick.

    The following operations are performed:
        1. PDF to image conversion (into a temporary folder)
        2. Orientation correction

    Args:
        filename(pathlib.Path): The path to the PDF to be preprocessed.

    Returns: A list of pillow images corresponding to each page from the PDF,
        ordered by the numeric page suffix of the generated file names.
    """
    # Imported locally so this function does not depend on module-level
    # imports being present.
    import re
    import tempfile

    def _page_index(page_path: Path) -> int:
        # Sort numerically on the trailing digits of the file stem so that,
        # for example, page 10 comes after page 2. Files without a numeric
        # suffix sort first.
        match = re.search(r"(\d+)$", page_path.stem)
        return int(match.group(1)) if match else 0

    result = []
    # Previously ``dest_folder`` was read before it was ever assigned; the
    # pages are now written to a temporary folder that is removed afterwards.
    with tempfile.TemporaryDirectory() as temp_dir:
        dest_folder = convert_pdf_to_image_ImageMagick(filename, Path(temp_dir))
        for page_path in sorted(dest_folder.glob("*.jpg"), key=_page_index):
            # glob yields paths, not images, so each page has to be opened
            # before preprocessing. The ``with`` guarantees the file handle is
            # released even if preprocessing fails, so the temporary folder
            # can be deleted.
            with Image.open(page_path) as image:
                result.append(preprocess_image(image))
    return result
if __name__ == '__main__':
    # Manual smoke test: preprocess and display a single image ...
    image_path = 'examples/upright.jpeg'
    source_image = Image.open(image_path)
    processed_image = preprocess_image(source_image)
    source_image.close()
    processed_image.show()
    processed_image.close()

    # ... then preprocess and display every page of a PDF.
    pdf_path = 'examples/rotated.pdf'
    pdf_content = Path(pdf_path).read_bytes()
    for page in preprocess_pdf_pdf2image(pdf_content):
        page.show()
        page.close()