Spaces:
Sleeping
Sleeping
File size: 1,852 Bytes
317211f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
from pathlib import Path
import categories
import processing
import extract
from PIL import Image
from pydantic import BaseModel
from io import BytesIO
def categorize_and_parse_text(text: str) -> BaseModel:
    """Categorize the text, then run the chain registered for that category.

    The text is handed to ``categories.categorize_text`` to pick a category.
    The chosen category is printed, and ``categories.run_category_chain`` is
    then called with that category and the same text.

    Args:
        text(str): The text to categorize and parse information from.

    Returns: Whatever ``categories.run_category_chain`` produces for the
        detected category (annotated as a pydantic ``BaseModel``).
    """
    detected_category = categories.categorize_text(text)
    print("Categorized as category", detected_category)
    return categories.run_category_chain(detected_category, text)
def process_pdf(filename: Path, *, min_text_length: int = 20) -> BaseModel:
    """Process the given PDF file and extract information from it.

    The PDF's embedded text is extracted first. When that yields fewer than
    ``min_text_length`` characters, the PDF is converted to images and the
    text is extracted from those images instead. The resulting text is then
    categorized and parsed.

    Args:
        filename(Path): The PDF file to process.
        min_text_length(int): Minimum number of characters the embedded text
            must contain to be used as-is; anything shorter triggers the
            image-based fallback. Defaults to 20, the value that was
            previously hard-coded.

    Returns: The extracted information.
    """
    # Read the whole file once; the same bytes feed both extraction paths.
    pdf_bytes = Path(filename).read_bytes()

    text = extract.extract_text_from_pdf_pypdf(BytesIO(pdf_bytes))

    # If the encoded text is too short, a pdf scanner probably added a
    # watermark and the real content only exists as page images.
    if len(text) < min_text_length:
        # Try to extract text from images
        images = processing.convert_pdf_to_image_pdf2image(pdf_bytes)
        text = extract.extract_text_from_images_pyocr_tesseract(images)

    return categorize_and_parse_text(text)
def process_image(filename: Path) -> BaseModel:
    """Process the given image file and extract information from it.

    The image is opened, its text is extracted, and that text is then
    categorized and parsed.

    Args:
        filename(Path): The image file to process.

    Returns: The extracted information.
    """
    # Use the image as a context manager so it is closed even when the
    # text extraction raises; a trailing image.close() would be skipped.
    with Image.open(filename) as image:
        text = extract.extract_text_from_image_pyocr_tesseract(image)

    return categorize_and_parse_text(text)
if __name__ == "__main__":
    # Script entry point: run the PDF pipeline on the bundled example file
    # and print the parsed result as indented JSON.
    filename = Path("examples/example1.pdf")
    result = process_pdf(filename)
    # NOTE(review): `.json(indent=...)` is the pydantic v1 serialization API;
    # confirm the installed pydantic version still accepts it.
    print(result.json(indent=4))