Spaces:
Sleeping
Sleeping
"""Responsible for extracting text from images and PDFs using OCR engines or other modules. | |
""" | |
from io import BytesIO | |
from typing import List | |
import pyocr.tesseract | |
import pypdf | |
from PIL import Image | |
def extract_text_from_pdf_pypdf(bytes_stream: BytesIO) -> str:
    """Pull the embedded text out of a PDF using pypdf.

    Pages are processed in document order. The text of every page is
    followed by two newline characters, so the result also ends with that
    separator (and is an empty string for a PDF without pages).

    Args:
        bytes_stream (BytesIO): In-memory contents of the PDF file.

    Returns: The concatenated text of all pages.
    """
    reader = pypdf.PdfReader(bytes_stream)
    chunks = []
    for pdf_page in reader.pages:
        chunks.append(pdf_page.extract_text())
        # Page separator; appended after every page, including the last one.
        chunks.append("\n\n")
    return "".join(chunks)
def extract_text_from_image_pyocr_tesseract(image: Image.Image, lang: str = "eng") -> str:
    """Extracts text from the given image using tesseract via pyocr.

    Args:
        image (PIL.Image.Image): The image to extract text from. It is not
            closed by this function.
        lang (str): Language code handed to tesseract. Defaults to "eng",
            which was previously hard-coded.

    Returns: The extracted text.

    Raises:
        RuntimeError: If pyocr reports that tesseract is not available.
    """
    if not pyocr.tesseract.is_available():
        # RuntimeError is a subclass of Exception, so callers that catch
        # Exception keep working while new callers can be more specific.
        raise RuntimeError("Tesseract is not available.")
    return pyocr.tesseract.image_to_string(image, lang=lang)
def extract_text_from_images_pyocr_tesseract(images: List[Image.Image]) -> str:
    """Run tesseract (via pyocr) over a sequence of images and merge the text.

    Each image is closed right after its text has been extracted, so the
    caller must not use the images afterwards. The text of every image is
    followed by two newline characters.

    Args:
        images (List[PIL.Image.Image]): The images to extract text from.

    Returns: The extracted text of all images, in input order.
    """
    pieces = []
    for img in images:
        pieces.append(extract_text_from_image_pyocr_tesseract(img))
        pieces.append("\n\n")
        # Release the image only after its text was extracted successfully.
        img.close()
    return "".join(pieces)
if __name__ == '__main__':
    # Manual smoke test: extract text from the bundled example PDF and image.
    filename = 'examples/upright.pdf'
    with open(filename, 'rb') as file:
        bytes_stream = BytesIO(file.read())
    # The PDF is fully buffered in memory, so the file handle is no longer needed.
    text = extract_text_from_pdf_pypdf(bytes_stream)
    print(text)
    print("-"*25)
    filename = 'examples/upright.jpeg'
    # Context manager guarantees the image is closed even if OCR raises.
    with Image.open(filename) as image:
        text = extract_text_from_image_pyocr_tesseract(image)
    print(text)