"""Gradio demo: extract text and metadata from uploaded documents via Extractous."""

import os

import gradio as gr
from extractous import Extractor, TesseractOcrConfig

# Returned in the metadata slot whenever extraction did not produce metadata.
NO_METADATA = "No metadata available"

# Markdown shown under the title. Kept at column 0 so the list bullets are
# never interpreted as an indented code block by the markdown renderer.
DESCRIPTION = """
Upload a document to extract its text content and metadata using [Extractous](https://github.com/yobix-ai/extractous).

**Supported formats include:**
- PDF files (with OCR support)
- Microsoft Office (DOC, DOCX, PPT, PPTX, etc.)
- Web Documents (HTML, XML)
- Text Files (TXT, Markdown)
- Images (with OCR capability)
- And more
"""

# Markdown shown below the interface.
ARTICLE = """
This demo showcases document text and metadata extraction capabilities.
For more information, visit [Extractous on GitHub](https://github.com/yobix-ai/extractous).
"""


def _resolve_upload_path(file):
    """Return the filesystem path for a value produced by ``gr.File``.

    Depending on the Gradio version / ``type`` setting, the component hands
    the callback either a path string or a temp-file wrapper whose ``name``
    attribute holds the path. Both are normalised to ``str`` here.
    """
    if isinstance(file, (str, os.PathLike)):
        return os.fspath(file)
    return file.name


def extract_document(file):
    """Extract text and metadata from an uploaded document.

    Args:
        file: The value supplied by the ``gr.File`` input: ``None`` when
            nothing was uploaded, otherwise a path string or a file wrapper
            exposing ``.name``.

    Returns:
        A ``(text, metadata)`` tuple of strings for the two output textboxes.
        On failure the first element carries the error message and the second
        is a "no metadata" placeholder; exceptions are never propagated so the
        UI always receives displayable values.
    """
    if file is None:
        return "Please upload a file", NO_METADATA

    try:
        # OCR config lets image-based or scanned documents be read as well.
        ocr_config = TesseractOcrConfig().set_language("eng")
        extractor = Extractor().set_ocr_config(ocr_config)

        text, metadata = extractor.extract_file_to_string(
            _resolve_upload_path(file)
        )
    except Exception as e:  # UI boundary: report the failure instead of crashing
        return f"Error extracting document: {e}", NO_METADATA

    return text, str(metadata)


# Create the Gradio interface
demo = gr.Interface(
    fn=extract_document,
    inputs=gr.File(label="Upload Document"),
    outputs=[
        gr.Textbox(label="Extracted Text", lines=10),
        gr.Textbox(label="Metadata", lines=3),
    ],
    title="Extractous Demo",
    description=DESCRIPTION,
    article=ARTICLE,
    examples=[
        ["2412.13663v2.pdf"],  # Add example files to demo directory
    ],
)

if __name__ == "__main__":
    demo.launch()