# demo/extract.py
# Repository page header: ankur-bohra, "Initial commit" (317211f), 1.89 kB.
"""Responsible for extracting text from images and PDFs using OCR engines or other modules.
"""
from io import BytesIO
from typing import List
import pyocr.tesseract
import pypdf
from PIL import Image
def extract_text_from_pdf_pypdf(bytes_stream: BytesIO) -> str:
    """Pulls the text out of a PDF using pypdf.

    Pages are read in document order, and the text of every page
    (including the last one) is followed by a blank-line separator.

    Args:
        bytes_stream (BytesIO): The PDF file to extract text from.

    Returns:
        The text of all pages joined into one string.
    """
    reader = pypdf.PdfReader(bytes_stream)
    # Each page contributes its text plus a "\n\n" terminator.
    return "".join(page.extract_text() + "\n\n" for page in reader.pages)
def extract_text_from_image_pyocr_tesseract(image: Image.Image) -> str:
    """Extracts text from the given image using tesseract via pyocr.

    The image is handed to ``pyocr.tesseract.image_to_string`` with the
    language fixed to English (``lang="eng"``). The image is not closed
    by this function.

    Args:
        image (PIL.Image.Image): The image to extract text from.

    Returns:
        The text returned by the OCR call.

    Raises:
        RuntimeError: If ``pyocr.tesseract.is_available()`` reports that
            tesseract cannot be used.
    """
    if not pyocr.tesseract.is_available():
        # RuntimeError is a subclass of Exception, so callers that already
        # catch Exception keep working while new callers can be specific.
        raise RuntimeError("Tesseract is not available.")
    return pyocr.tesseract.image_to_string(image, lang="eng")
def extract_text_from_images_pyocr_tesseract(images: List[Image.Image]) -> str:
    """Extracts text from the given images using tesseract via pyocr.

    Images are processed in order, and the text of every image (including
    the last one) is followed by a blank-line separator.

    Note:
        Each image is closed once it has been processed, so the caller
        must not use the images after passing them in.

    Args:
        images (List[PIL.Image.Image]): The images to extract text from.

    Returns:
        The text of all images joined into one string; an empty string
        when ``images`` is empty.
    """
    parts = []
    for image in images:
        try:
            parts.append(extract_text_from_image_pyocr_tesseract(image))
        finally:
            # Close the image even when OCR raises, so its resources are
            # released instead of leaking on the error path.
            image.close()
        parts.append("\n\n")
    return "".join(parts)
if __name__ == '__main__':
    # Manual smoke test: run both extractors on the bundled example files
    # and print what they return.
    filename = 'examples/upright.pdf'
    with open(filename, 'rb') as file:
        bytes_stream = BytesIO(file.read())
    text = extract_text_from_pdf_pypdf(bytes_stream)
    print(text)

    print("-"*25)

    filename = 'examples/upright.jpeg'
    # The context manager closes the image even if OCR raises.
    with Image.open(filename) as image:
        text = extract_text_from_image_pyocr_tesseract(image)
    print(text)