Spaces:
Sleeping
Sleeping
File size: 1,889 Bytes
317211f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
"""Responsible for extracting text from images and PDFs using OCR engines or other modules.
"""
from io import BytesIO
from typing import List
import pyocr.tesseract
import pypdf
from PIL import Image
def extract_text_from_pdf_pypdf(bytes_stream: BytesIO) -> str:
    """Extracts text from the given PDF file using pypdf.

    Args:
        bytes_stream (BytesIO): The PDF file to extract text from.

    Returns:
        The text of all pages in document order. The text of every page
        (including the last one) is followed by two newline characters.
    """
    pdf_reader = pypdf.PdfReader(bytes_stream)
    # Assemble the result with a single join rather than repeated `+=`,
    # which can degrade to quadratic copying on documents with many pages.
    return "".join(page.extract_text() + "\n\n" for page in pdf_reader.pages)
def extract_text_from_image_pyocr_tesseract(
    image: Image.Image, lang: str = "eng"
) -> str:
    """Extracts text from the given image using tesseract via pyocr.

    Args:
        image (PIL.Image.Image): The image to extract text from.
        lang (str): Language code handed to tesseract through pyocr.
            Defaults to "eng", which was previously hard-coded.

    Returns:
        The extracted text.

    Raises:
        RuntimeError: If pyocr reports that tesseract is not available.
    """
    if not pyocr.tesseract.is_available():
        # RuntimeError is still caught by `except Exception`, so existing
        # callers keep working while new ones can catch something narrower.
        raise RuntimeError("Tesseract is not available.")
    return pyocr.tesseract.image_to_string(image, lang=lang)
def extract_text_from_images_pyocr_tesseract(images: List[Image.Image]) -> str:
    """Extracts text from the given images using tesseract via pyocr.

    Each image is closed as soon as it has been processed, so the caller
    must not use the images after this call.

    Args:
        images (List[PIL.Image.Image]): The images to extract text from.

    Returns:
        The extracted text of all images in input order. The text of every
        image (including the last one) is followed by two newline characters.
    """
    page_texts = []
    for image in images:
        try:
            page_texts.append(extract_text_from_image_pyocr_tesseract(image))
        finally:
            # Close the image even when OCR raises, so the image being
            # processed is not left open on the error path.
            image.close()
    return "".join(page_text + "\n\n" for page_text in page_texts)
if __name__ == '__main__':
    # Manual smoke test: run both extractors against the bundled examples.

    # 1) Text extraction from a PDF via pypdf.
    filename = 'examples/upright.pdf'
    with open(filename, 'rb') as file:
        bytes_stream = BytesIO(file.read())
    text = extract_text_from_pdf_pypdf(bytes_stream)
    print(text)

    print("-"*25)

    # 2) OCR of a single image via pyocr/tesseract. The context manager
    #    closes the image even if OCR raises.
    filename = 'examples/upright.jpeg'
    with Image.open(filename) as image:
        text = extract_text_from_image_pyocr_tesseract(image)
    print(text)