Spaces:

AutomaticReimbursementTool
/

demo

Sleeping

App Files Files Community

demo / main.py

ankur-bohra

Initial commit

317211f over 1 year ago

raw

history blame

1.85 kB

	from pathlib import Path

	import categories
	import processing
	import extract
	from PIL import Image
	from pydantic import BaseModel
	from io import BytesIO

	def categorize_and_parse_text(text: str) -> BaseModel:
	    """Categorize the text, then parse information from it.

	    The text is first passed to ``categories.categorize_text``. The category
	    it reports is printed and then handed, together with the text, to
	    ``categories.run_category_chain``.

	    Args:
	        text(str): The text to categorize and parse information from.

	    Returns: The value produced by ``categories.run_category_chain`` for the
	        detected category (annotated here as a ``BaseModel``).
	    """
	    detected_category = categories.categorize_text(text)
	    print("Categorized as category", detected_category)
	    return categories.run_category_chain(detected_category, text)

	def process_pdf(filename: Path, *, min_text_length: int = 20) -> BaseModel:
	    """Processes the given PDF file and extracts information from it.

	    The text embedded in the PDF is extracted first. If that text is shorter
	    than ``min_text_length`` characters, the pages are converted to images
	    and the text is obtained via OCR instead. The resulting text is then
	    categorized and parsed.

	    Args:
	        filename(Path): The PDF file to process.
	        min_text_length(int): Minimum length of the embedded text for it to
	            be used as-is. Shorter text triggers the OCR fallback. Defaults
	            to 20, the threshold this function has always used.

	    Returns: The extracted information.
	    """
	    # Read the file once; the same bytes feed both the text extractor and,
	    # if needed, the image conversion below.
	    with open(filename, "rb") as f:
	        pdf_bytes = f.read()

	    text = extract.extract_text_from_pdf_pypdf(BytesIO(pdf_bytes))
	    # If the embedded text is too short, a pdf scanner probably added a
	    # watermark and the real content only exists as page images.
	    if len(text) < min_text_length:
	        # Try to extract text from images
	        images = processing.convert_pdf_to_image_pdf2image(pdf_bytes)
	        text = extract.extract_text_from_images_pyocr_tesseract(images)

	    return categorize_and_parse_text(text)

	def process_image(filename: Path) -> BaseModel:
	    """Processes the given image file and extracts information from it.

	    The image is opened, its text is extracted via OCR, and that text is
	    then categorized and parsed.

	    Args:
	        filename(Path): The image file to process.

	    Returns: The extracted information.
	    """
	    # Use the image as a context manager so it is closed even when the OCR
	    # step raises; a trailing image.close() would be skipped in that case.
	    with Image.open(filename) as image:
	        text = extract.extract_text_from_image_pyocr_tesseract(image)

	    return categorize_and_parse_text(text)

	if __name__ == "__main__":
	    # Script entry point: run the PDF pipeline on the example file and
	    # print the result's JSON representation with a 4-space indent.
	    example_pdf = Path("examples/example1.pdf")
	    parsed = process_pdf(example_pdf)
	    print(parsed.json(indent=4))