# Hugging Face Space: davanstrien/extractous-demo (status: Running)
# File size: 1,763 bytes

import gradio as gr
from extractous import Extractor, TesseractOcrConfig


def extract_document(file):
    """
    Pull the text content and metadata out of an uploaded document.

    Returns a ``(text, metadata)`` pair of strings. When no file was
    supplied, or when extraction raises, the first element carries a
    human-readable message and the second a metadata placeholder, so the
    caller always receives two displayable strings.
    """
    no_metadata = "No metadata available"

    # Guard clause: nothing was uploaded.
    if file is None:
        return "Please upload a file", no_metadata

    try:
        # Build the extractor and enable English OCR in one chained call so
        # image-based or scanned documents can be processed as well.
        doc_extractor = Extractor().set_ocr_config(
            TesseractOcrConfig().set_language("eng")
        )

        text, meta = doc_extractor.extract_file_to_string(file)

        # Metadata is stringified here, inside the try, so a failure while
        # rendering it is reported the same way as an extraction failure.
        return text, str(meta)
    except Exception as err:
        return f"Error extracting document: {err!s}", no_metadata


# Build the Gradio interface: one file upload wired to extract_document, with
# one text box for the extracted text and one for the stringified metadata.
demo = gr.Interface(
    fn=extract_document,
    inputs=gr.File(label="Upload Document"),
    outputs=[
        gr.Textbox(label="Extracted Text", lines=10),
        gr.Textbox(label="Metadata", lines=3),
    ],
    # Spelled "Extractous" to match the library name used in the description
    # and links below (the title previously read "Extractus").
    title="Extractous Demo",
    description="""
    Upload a document to extract its text content and metadata using [Extractous](https://github.com/yobix-ai/extractous).
    
    **Supported formats include:**
    - PDF files (with OCR support)
    - Microsoft Office (DOC, DOCX, PPT, PPTX, etc.)
    - Web Documents (HTML, XML)
    - Text Files (TXT, Markdown)
    - Images (with OCR capability)
    - And more
    """,
    article="""
    This demo showcases document text and metadata extraction capabilities.
    For more information, visit [Extractous on GitHub](https://github.com/yobix-ai/extractous).
    """,
    examples=[
        ["2412.13663v2.pdf"],  # Add example files to demo directory
    ],
)

# Launch the interface only when this file is executed as a script, so
# importing the module does not start the app.
if __name__ == "__main__":
    demo.launch()