"""Responsible for extracting text from images and PDFs using OCR engines or other modules. """ from io import BytesIO from typing import List import pyocr.tesseract import pypdf from PIL import Image def extract_text_from_pdf_pypdf(bytes_stream: BytesIO) -> str: """Extracts text from the given PDF file using pypdf. Args: bytes_stream (BytesIO): The PDF file to extract text from. Returns: The extracted text """ pdf_reader = pypdf.PdfReader(bytes_stream) text = "" for page in pdf_reader.pages: text += page.extract_text() text += "\n\n" return text def extract_text_from_image_pyocr_tesseract(image: Image.Image) -> str: """Extracts text from the given image using tesseract via pyocr. Args: image(PIL.Image.Image): The image to extract text from. Returns: The extracted text. """ if not pyocr.tesseract.is_available(): raise Exception("Tesseract is not available.") text = pyocr.tesseract.image_to_string(image, lang="eng") return text def extract_text_from_images_pyocr_tesseract(images: List[Image.Image]) -> str: """Extracts text from the given images using tesseract via pyocr. Args: images(List[PIL.Image.Image]): The images to extract text from. Returns: The extracted text. """ text = "" for image in images: text += extract_text_from_image_pyocr_tesseract(image) text += "\n\n" image.close() return text if __name__ == '__main__': filename = 'examples/upright.pdf' with open(filename, 'rb') as file: bytes_stream = BytesIO(file.read()) text = extract_text_from_pdf_pypdf(bytes_stream) print(text) print("-"*25) filename = 'examples/upright.jpeg' image = Image.open(filename) text = extract_text_from_image_pyocr_tesseract(image) print(text) image.close()