demo / main.py
ankur-bohra's picture
Initial commit
317211f
raw
history blame
1.85 kB
from pathlib import Path
import categories
import processing
import extract
from PIL import Image
from pydantic import BaseModel
from io import BytesIO
def categorize_and_parse_text(text: str) -> BaseModel:
    """Pick a category for the text, then run that category's chain on it.

    Args:
        text(str): The text to categorize and parse information from.

    Returns: The value produced by categories.run_category_chain for the
        chosen category and the given text.
    """
    detected = categories.categorize_text(text)
    # Echo the chosen category so a console run shows which chain was used.
    print("Categorized as category", detected)
    return categories.run_category_chain(detected, text)
def process_pdf(filename: Path) -> BaseModel:
    """Read a PDF file, obtain its text, and categorize/parse that text.

    The text is first taken from extract.extract_text_from_pdf_pypdf. If that
    yields fewer than 20 characters, the PDF is converted to images and the
    text is taken from those images instead.

    Args:
        filename(Path): The PDF file to process.

    Returns: The result of categorize_and_parse_text for the obtained text.
    """
    with open(filename, "rb") as pdf_file:
        raw_pdf = pdf_file.read()

    extracted = extract.extract_text_from_pdf_pypdf(BytesIO(raw_pdf))

    # A near-empty result most likely means the only embedded text is
    # something like a scanner watermark, so fall back to the image route.
    has_enough_text = len(extracted) >= 20
    if not has_enough_text:
        pages = processing.convert_pdf_to_image_pdf2image(raw_pdf)
        extracted = extract.extract_text_from_images_pyocr_tesseract(pages)

    return categorize_and_parse_text(extracted)
def process_image(filename: Path) -> BaseModel:
    """Open an image file, extract its text, and categorize/parse that text.

    The text comes from extract.extract_text_from_image_pyocr_tesseract and is
    then handed to categorize_and_parse_text.

    Args:
        filename(Path): The image file to process.

    Returns: The result of categorize_and_parse_text for the extracted text.
    """
    # Use the image as a context manager so its file handle is released even
    # when text extraction raises; an explicit close() after the call would be
    # skipped on error and leak the handle.
    with Image.open(filename) as image:
        text = extract.extract_text_from_image_pyocr_tesseract(image)
    return categorize_and_parse_text(text)
if __name__ == "__main__":
    # Demo entry point: process the bundled example PDF and print the parsed
    # model as indented JSON.
    parsed = process_pdf(Path("examples/example1.pdf"))
    print(parsed.json(indent=4))