File size: 1,763 Bytes
f77a9a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01f73b8
f77a9a3
01f73b8
f77a9a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import gradio as gr
from extractous import Extractor, TesseractOcrConfig


def extract_document(file):
    """
    Pull the text content and metadata out of an uploaded document.

    ``file`` is handed unchanged to ``Extractor.extract_file_to_string``.
    The return value is always a pair of strings ``(text, metadata)``:
    a prompt when nothing was uploaded, an error message when extraction
    raised, and otherwise the extracted text with the stringified metadata.
    """
    no_metadata = "No metadata available"

    # Guard clause: nothing to do until a file has been provided.
    if file is None:
        return "Please upload a file", no_metadata

    try:
        # Default extractor with Tesseract OCR (English) enabled, so that
        # image-based or scanned documents can be read as well.
        ocr_extractor = Extractor().set_ocr_config(
            TesseractOcrConfig().set_language("eng")
        )
        text, metadata = ocr_extractor.extract_file_to_string(file)
        metadata_text = str(metadata)
    except Exception as exc:
        # Report the failure in the text output instead of raising.
        return f"Error extracting document: {exc!s}", no_metadata

    return text, metadata_text


# Build the Gradio interface: a single file-upload input wired to
# extract_document, whose two returned strings fill the "Extracted Text"
# and "Metadata" textboxes in that order.
demo = gr.Interface(
    fn=extract_document,
    inputs=gr.File(label="Upload Document"),
    outputs=[
        gr.Textbox(label="Extracted Text", lines=10),
        gr.Textbox(label="Metadata", lines=3),
    ],
    # Title uses the same spelling of the library name as the links below.
    title="Extractous Demo",
    description="""
    Upload a document to extract its text content and metadata using [Extractous](https://github.com/yobix-ai/extractous).
    
    **Supported formats include:**
    - PDF files (with OCR support)
    - Microsoft Office (DOC, DOCX, PPT, PPTX, etc.)
    - Web Documents (HTML, XML)
    - Text Files (TXT, Markdown)
    - Images (with OCR capability)
    - And more
    """,
    article="""
    This demo showcases document text and metadata extraction capabilities.
    For more information, visit [Extractous on GitHub](https://github.com/yobix-ai/extractous).
    """,
    examples=[
        ["2412.13663v2.pdf"],  # Add example files to demo directory
    ],
)

# Launch the app only when this file is executed as a script, so that
# importing the module does not start the server.
if __name__ == "__main__":
    demo.launch()