# Spaces status: Running
import gradio as gr | |
from extractous import Extractor, TesseractOcrConfig | |
def extract_document(file):
    """Extract text and metadata from an uploaded document.

    Args:
        file: The upload received from the UI, or ``None`` when nothing was
            uploaded. A ``str`` / ``os.PathLike`` value is used as the path
            directly; any other object is assumed to expose its path as
            ``.name`` (temp-file style wrapper) -- TODO confirm against the
            Gradio version in use.

    Returns:
        A ``(text, metadata)`` tuple of strings. On success ``text`` is the
        extracted content and ``metadata`` is ``str()`` of the metadata
        object returned by the extractor. When no file is given, or when
        extraction raises, ``text`` carries a human-readable message and
        ``metadata`` is ``"No metadata available"``.
    """
    if file is None:
        return "Please upload a file", "No metadata available"

    # Resolve a filesystem path. Path-like values are passed through as-is
    # (``Path.name`` would be only the basename); other objects are unwrapped
    # via their ``.name`` attribute when they have one.
    if isinstance(file, str) or hasattr(file, "__fspath__"):
        path = file
    else:
        path = getattr(file, "name", file)

    try:
        # Create an extractor with default settings
        extractor = Extractor()
        # Optional: add OCR config for image-based or scanned documents
        extractor = extractor.set_ocr_config(
            TesseractOcrConfig().set_language("eng")
        )
        # Extract text and metadata
        result, metadata = extractor.extract_file_to_string(path)
        return result, str(metadata)
    except Exception as e:
        # Deliberately broad: this is the UI boundary, so any failure is
        # reported in the output textbox instead of crashing the handler.
        return f"Error extracting document: {e}", "No metadata available"
# Create the Gradio interface: one file upload in, two text boxes out
# (extracted text and stringified metadata), both filled by extract_document.
demo = gr.Interface(
    fn=extract_document,
    inputs=gr.File(label="Upload Document"),
    outputs=[
        gr.Textbox(label="Extracted Text", lines=10),
        gr.Textbox(label="Metadata", lines=3),
    ],
    # Title spelling matches the library name used in the links below.
    title="Extractous Demo",
    description="""
Upload a document to extract its text content and metadata using [Extractous](https://github.com/yobix-ai/extractous).
**Supported formats include:**
- PDF files (with OCR support)
- Microsoft Office (DOC, DOCX, PPT, PPTX, etc.)
- Web Documents (HTML, XML)
- Text Files (TXT, Markdown)
- Images (with OCR capability)
- And more
""",
    article="""
This demo showcases document text and metadata extraction capabilities.
For more information, visit [Extractous on GitHub](https://github.com/yobix-ai/extractous).
""",
    examples=[
        ["2412.13663v2.pdf"],  # Add example files to demo directory
    ],
)

# Start the web UI only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()