Spaces:

AutomaticReimbursementTool
/

demo

Sleeping

App Files Files Community

demo / extract.py

ankur-bohra

Initial commit

317211f over 1 year ago

raw

history blame

1.89 kB

	"""Responsible for extracting text from images and PDFs using OCR engines or other modules.
	"""
	from io import BytesIO
	from typing import List

	import pyocr.tesseract
	import pypdf
	from PIL import Image


	def extract_text_from_pdf_pypdf(bytes_stream: BytesIO) -> str:
	    """Reads the text of every page of a PDF using pypdf.

	    Args:
	        bytes_stream (BytesIO): In-memory PDF contents, handed to ``pypdf.PdfReader``.

	    Returns: The text of all pages in reader order; each page's text
	        (including the last page's) is followed by two newline characters.
	    """
	    reader = pypdf.PdfReader(bytes_stream)
	    # One pass over the pages; the separator is appended after every page.
	    return "".join(page.extract_text() + "\n\n" for page in reader.pages)


	def extract_text_from_image_pyocr_tesseract(image: Image.Image, *, lang: str = "eng") -> str:
	    """Extracts text from the given image using tesseract via pyocr.

	    Args:
	        image(PIL.Image.Image): The image to extract text from.
	        lang(str): Language value forwarded as ``lang`` to
	            ``pyocr.tesseract.image_to_string``. Keyword-only; defaults to
	            "eng", the value that was previously hard-coded.

	    Returns: The extracted text.

	    Raises:
	        RuntimeError: If pyocr reports that tesseract is not available.
	    """
	    # Fail early with an explicit message when pyocr reports tesseract as unavailable.
	    if not pyocr.tesseract.is_available():
	        # RuntimeError subclasses Exception, so existing ``except Exception``
	        # handlers keep working; the message is unchanged.
	        raise RuntimeError("Tesseract is not available.")
	    return pyocr.tesseract.image_to_string(image, lang=lang)


	def extract_text_from_images_pyocr_tesseract(images: List[Image.Image]) -> str:
	    """Runs tesseract (via pyocr) over several images and concatenates the results.

	    Each image is closed immediately after its text has been extracted, so
	    callers should not expect the images to still be open afterwards.

	    Args:
	        images(List[PIL.Image.Image]): The images to process, in order.

	    Returns: The text of all images; each image's text (including the
	        last one's) is followed by two newline characters.
	    """
	    pieces = []
	    for current in images:
	        pieces.append(extract_text_from_image_pyocr_tesseract(current))
	        pieces.append("\n\n")
	        # Close only after this image's OCR call has returned.
	        current.close()
	    return "".join(pieces)

	if __name__ == '__main__':
	    # Manual demo: print the text extracted from the example PDF, a divider
	    # line, and then the text extracted from the example JPEG.
	    filename = 'examples/upright.pdf'
	    with open(filename, 'rb') as file:
	        # Read the whole file into memory so the PDF helper gets a BytesIO.
	        bytes_stream = BytesIO(file.read())
	    text = extract_text_from_pdf_pypdf(bytes_stream)
	    print(text)
	    print("-" * 25)
	    filename = 'examples/upright.jpeg'
	    # The context manager releases the image file even if OCR raises.
	    with Image.open(filename) as image:
	        text = extract_text_from_image_pyocr_tesseract(image)
	    print(text)