Spaces:

AutomaticReimbursementTool
/

demo

Sleeping

File size: 2,022 Bytes

from pathlib import Path

import categories
import processing
import extract
from PIL import Image
from pydantic import BaseModel
from io import BytesIO

def categorize_and_parse_text(text: str) -> BaseModel:
    """Determines the category of the text, then parses it with that category's chain.

    Args:
        text(str): The text to categorize and parse information from.

    Returns: Whatever ``categories.run_category_chain`` produces for the
        detected category and the text (annotated as a pydantic ``BaseModel``).
    """
    # First pick the category, then hand both category and text to the
    # category-specific chain.
    return categories.run_category_chain(categories.categorize_text(text), text)

def process_pdf(filename: Path, extract_only=False) -> BaseModel:
    """Reads a PDF file, extracts its text and (optionally) parses it.

    The embedded text layer is tried first; when it yields fewer than 20
    characters the pages are converted to images and run through OCR instead.

    Args:
        filename(Path): The PDF file to process.
        extract_only: When true, stop after text extraction and return the
            extracted text instead of the parsed result.

    Returns: The extracted information, or the extracted text itself when
        ``extract_only`` is true.
    """
    with open(filename, "rb") as pdf_file:
        raw_pdf = pdf_file.read()

    extracted = extract.extract_text_from_pdf_pypdf(BytesIO(raw_pdf))

    # A near-empty text layer usually means the document is a scan whose only
    # embedded text is a scanner watermark, so fall back to OCR on the pages.
    if len(extracted) < 20:
        page_images = processing.preprocess_pdf_pdf2image(raw_pdf)
        extracted = extract.extract_text_from_images_pyocr_tesseract(page_images)

    return extracted if extract_only else categorize_and_parse_text(extracted)

def process_image(filename: Path, extract_only=False) -> BaseModel:
    """Processes the given image file and extracts information from it.

    Args:
        filename(Path): The image file to process.
        extract_only: When true, stop after OCR and return the extracted text
            instead of the parsed result.

    Returns: The extracted information, or the OCR text itself when
        ``extract_only`` is true.
    """
    # The context manager guarantees the file opened by Image.open is closed,
    # even when preprocessing or OCR raises.
    with Image.open(filename) as source_image:
        image = processing.preprocess_image(source_image)
        try:
            text = extract.extract_text_from_image_pyocr_tesseract(image)
        finally:
            # preprocess_image may return a new image object rather than the
            # one it was given; release that one as well. If it is the same
            # object, the enclosing ``with`` already takes care of it.
            if image is not source_image:
                image.close()
    if extract_only:
        return text
    result = categorize_and_parse_text(text)
    return result

if __name__ == "__main__":
    # Demo run: process the bundled example PDF and pretty-print the parsed
    # result as indented JSON.
    example_pdf = Path("examples/example1.pdf")
    print(process_pdf(example_pdf).json(indent=4))