Spaces:
Sleeping
Sleeping
from pathlib import Path | |
import categories | |
import processing | |
import extract | |
from PIL import Image | |
from pydantic import BaseModel | |
from io import BytesIO | |
def categorize_and_parse_text(text: str) -> BaseModel:
    """Categorize the text, then run the matching category chain on it.

    Args:
        text(str): The text to categorize and parse information from.

    Returns: The result of ``categories.run_category_chain`` for the
        detected category and the given text.
    """
    detected_category = categories.categorize_text(text)
    return categories.run_category_chain(detected_category, text)
def process_pdf(filename: Path, extract_only=False) -> BaseModel:
    """Extract text from a PDF file and, unless told otherwise, parse it.

    The text embedded in the PDF is tried first; if that yields fewer than
    20 characters, the PDF is converted to images and run through OCR.

    Args:
        filename(Path): The PDF file to process.
        extract_only: When truthy, skip categorization and return the
            extracted text as-is.

    Returns: The extracted text when ``extract_only`` is truthy, otherwise
        the result of ``categorize_and_parse_text`` for that text.
    """
    with open(filename, "rb") as pdf_file:
        raw_pdf = pdf_file.read()

    extracted = extract.extract_text_from_pdf_pypdf(BytesIO(raw_pdf))

    # Almost no embedded text usually means a scanned document where the
    # scanner only added a watermark, so fall back to OCR on page images.
    needs_ocr = len(extracted) < 20
    if needs_ocr:
        page_images = processing.preprocess_pdf_pdf2image(raw_pdf)
        extracted = extract.extract_text_from_images_pyocr_tesseract(page_images)

    return extracted if extract_only else categorize_and_parse_text(extracted)
def process_image(filename: Path, extract_only=False) -> BaseModel:
    """Extract text from an image file and, unless told otherwise, parse it.

    Args:
        filename(Path): The image file to process.
        extract_only: When truthy, skip categorization and return the
            extracted text as-is.

    Returns: The extracted text when ``extract_only`` is truthy, otherwise
        the result of ``categorize_and_parse_text`` for that text.
    """
    # The context manager closes the image opened from disk even when
    # preprocessing or OCR raises. Previously the name was rebound to the
    # preprocessed image, so the originally opened image was never closed.
    with Image.open(filename) as source_image:
        processed_image = processing.preprocess_image(source_image)
        try:
            text = extract.extract_text_from_image_pyocr_tesseract(processed_image)
        finally:
            # Preprocessing may hand back the very same object; in that case
            # the enclosing ``with`` already takes care of closing it.
            if processed_image is not source_image:
                processed_image.close()

    if extract_only:
        return text
    return categorize_and_parse_text(text)
if __name__ == "__main__":
    # Demo run: parse the bundled example PDF and pretty-print the result.
    example_pdf = Path("examples/example1.pdf")
    parsed = process_pdf(example_pdf)
    print(parsed.json(indent=4))