Spaces:

AutomaticReimbursementTool
/

demo

Sleeping

File size: 1,889 Bytes

317211f

"""Responsible for extracting text from images and PDFs using OCR engines or other modules.
"""
from io import BytesIO
from typing import List

import pyocr.tesseract
import pypdf
from PIL import Image


def extract_text_from_pdf_pypdf(bytes_stream: BytesIO) -> str:
    """Extract the text of every page of a PDF using pypdf.

    Pages are read in the order pypdf yields them, and the text of each
    page (including the last one) is followed by two newline characters.

    Args:
        bytes_stream (BytesIO): In-memory stream holding the PDF file to
            extract text from.

    Returns:
        str: The concatenated text of all pages; an empty string when the
        reader yields no pages.
    """
    pdf_reader = pypdf.PdfReader(bytes_stream)
    # Build the result with a single join rather than growing a string with
    # ``+=`` once per page, which can degrade to quadratic time.
    return "".join(page.extract_text() + "\n\n" for page in pdf_reader.pages)


def extract_text_from_image_pyocr_tesseract(
    image: Image.Image, lang: str = "eng"
) -> str:
    """Extract text from the given image using tesseract via pyocr.

    Args:
        image (PIL.Image.Image): The image to extract text from. The image
            is not closed by this function.
        lang (str): Language code forwarded to
            ``pyocr.tesseract.image_to_string``. Defaults to ``"eng"``,
            which was previously the hard-coded value.

    Returns:
        str: The text returned by ``pyocr.tesseract.image_to_string``.

    Raises:
        RuntimeError: If ``pyocr.tesseract.is_available()`` reports that
            tesseract cannot be used.
    """
    if not pyocr.tesseract.is_available():
        # RuntimeError is a subclass of Exception, so callers that caught
        # the previous bare ``Exception`` keep working.
        raise RuntimeError("Tesseract is not available.")
    return pyocr.tesseract.image_to_string(image, lang=lang)


def extract_text_from_images_pyocr_tesseract(images: List[Image.Image]) -> str:
    """Extract text from the given images using tesseract via pyocr.

    Images are processed in order and each one is closed as soon as it has
    been handled, whether or not text extraction succeeded for it. The text
    of each image (including the last one) is followed by two newline
    characters.

    Args:
        images (List[PIL.Image.Image]): The images to extract text from.
            Every image that is processed is closed by this function.

    Returns:
        str: The concatenated text of all images; an empty string when
        ``images`` is empty.

    Raises:
        Exception: Whatever ``extract_text_from_image_pyocr_tesseract``
            raises is propagated. The image being processed at that moment
            is still closed; images not yet reached are left untouched.
    """
    parts = []
    for image in images:
        try:
            parts.append(extract_text_from_image_pyocr_tesseract(image) + "\n\n")
        finally:
            # Release the image even if OCR fails, so it is not leaked.
            image.close()
    return "".join(parts)

if __name__ == '__main__':
    # Manual smoke test: run both extractors against the bundled example
    # files and print the results, separated by a dashed line.
    filename = 'examples/upright.pdf'
    with open(filename, 'rb') as file:
        bytes_stream = BytesIO(file.read())
    text = extract_text_from_pdf_pypdf(bytes_stream)
    print(text)
    print("-"*25)
    filename = 'examples/upright.jpeg'
    # Use the image as a context manager so it is closed even if OCR raises.
    with Image.open(filename) as image:
        text = extract_text_from_image_pyocr_tesseract(image)
    print(text)