Spaces:

davanstrien
/

extractous-demo

Running

App Files Files Community

davanstrien HF staff committed on Jan 30

Commit

f77a9a3

1 Parent(s): bab3281

Add document text extraction functionality using Extractous and Gradio

Browse files

Files changed (1) hide show

app.py +58 -0

app.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import os
+
+import gradio as gr
+from extractous import Extractor, TesseractOcrConfig
+def extract_document(file):
+    """
+    Extract text and metadata from an uploaded document
+    """
+    if file is None:
+        return "Please upload a file", "No metadata available"
+    try:
+        # Create an extractor with default settings
+        extractor = Extractor()
+        # Optional: Add OCR config for image-based or scanned documents
+        extractor = extractor.set_ocr_config(TesseractOcrConfig().set_language("eng"))
+        # Extract text and metadata
+        result, metadata = extractor.extract_file_to_string(file)
+        return result, str(metadata)
+    except Exception as e:
+        return f"Error extracting document: {str(e)}", "No metadata available"
+# Create the Gradio interface
+demo = gr.Interface(
+    fn=extract_document,
+    inputs=gr.File(label="Upload Document"),
+    outputs=[
+        gr.Textbox(label="Extracted Text", lines=10),
+        gr.Textbox(label="Metadata", lines=3),
+    ],
+    title="Document Text Extraction Demo",
+    description="""
+    Upload a document to extract its text content and metadata using Extractous.
+    **Supported formats include:**
+    - PDF files (with OCR support)
+    - Microsoft Office (DOC, DOCX, PPT, PPTX, etc.)
+    - Web Documents (HTML, XML)
+    - Text Files (TXT, Markdown)
+    - Images (with OCR capability)
+    - And more
+    """,
+    article="""
+    This demo showcases document text and metadata extraction capabilities.
+    For more information, visit [Extractous on GitHub](https://github.com/yobix-ai/extractous).
+    """,
+    examples=[
+        ["2412.13663v2.pdf"],  # Add example files to demo directory
+    ],
+)
+if __name__ == "__main__":
+    demo.launch()