from contextlib import ExitStack
from io import BytesIO
from pathlib import Path
from typing import Union

from PIL import Image
from pydantic import BaseModel

import categories
import extract
import processing

# Embedded PDF text shorter than this many characters is treated as unusable
# and triggers the OCR fallback in process_pdf.
_MIN_EMBEDDED_TEXT_LENGTH = 20


def categorize_and_parse_text(text: str) -> BaseModel:
    """Categorizes the text and parses the information from it.

    The text is first passed to ``categories.categorize_text``; the resulting
    category and the text are then handed to ``categories.run_category_chain``.

    Args:
        text(str): The text to categorize and parse information from.

    Returns:
        The result of running the category chain on the text.
    """
    category = categories.categorize_text(text)
    return categories.run_category_chain(category, text)


def process_pdf(filename: Path, extract_only=False) -> Union[BaseModel, str]:
    """Processes the given PDF file and extracts information from it.

    Args:
        filename(Path): The PDF file to process.
        extract_only(bool): If True, skip categorization and return the
            extracted text as-is.

    Returns:
        The extracted text when ``extract_only`` is True, otherwise the
        result of ``categorize_and_parse_text`` for that text.
    """
    with open(filename, "rb") as f:
        pdf_bytes = f.read()

    text = extract.extract_text_from_pdf_pypdf(BytesIO(pdf_bytes))

    # If the encoded text is too short, a pdf scanner probably added a
    # watermark, so fall back to extracting text from the page images.
    if len(text) < _MIN_EMBEDDED_TEXT_LENGTH:
        images = processing.preprocess_pdf_pdf2image(pdf_bytes)
        text = extract.extract_text_from_images_pyocr_tesseract(images)

    if extract_only:
        return text

    return categorize_and_parse_text(text)


def process_image(filename: Path, extract_only=False) -> Union[BaseModel, str]:
    """Processes the given image file and extracts information from it.

    Args:
        filename(Path): The image file to process.
        extract_only(bool): If True, skip categorization and return the
            extracted text as-is.

    Returns:
        The extracted text when ``extract_only`` is True, otherwise the
        result of ``categorize_and_parse_text`` for that text.
    """
    # ExitStack guarantees every image opened or produced here is closed,
    # even when preprocessing or text extraction raises.
    with ExitStack() as cleanup:
        source_image = Image.open(filename)
        cleanup.callback(source_image.close)

        prepared_image = processing.preprocess_image(source_image)
        # Preprocessing may hand back a new object; close it as well, but do
        # not register the same image twice.
        if prepared_image is not source_image:
            cleanup.callback(prepared_image.close)

        text = extract.extract_text_from_image_pyocr_tesseract(prepared_image)

    if extract_only:
        return text

    return categorize_and_parse_text(text)


if __name__ == "__main__":
    filename = Path("examples/example1.pdf")
    result = process_pdf(filename)
    # NOTE(review): `.json(indent=...)` looks like the pydantic v1 API;
    # confirm it matches the installed pydantic version.
    print(result.json(indent=4))