Spaces:

davanstrien
/

extractous-demo

Running

extractous-demo / app.py

Refactor app.py: Remove unnecessary blank line

f4ed82b 5 months ago

1.76 kB

	import gradio as gr
	from extractous import Extractor, TesseractOcrConfig


	def extract_document(file):
	    """Extract text and metadata from an uploaded document.

	    Builds an ``Extractor`` with a Tesseract OCR config set to English and
	    runs ``extract_file_to_string`` on ``file``.

	    Args:
	        file: The value supplied by the upload component, or ``None`` when
	            nothing has been uploaded.

	    Returns:
	        A ``(text, metadata)`` pair of strings. When ``file`` is ``None`` the
	        first item is a prompt to upload a file; when extraction raises, the
	        first item is an error message. In both of those cases the second
	        item is a fixed "no metadata" placeholder.
	    """
	    no_metadata = "No metadata available"

	    # Guard clause: nothing was uploaded, so there is nothing to extract.
	    if file is None:
	        return "Please upload a file", no_metadata

	    try:
	        # Default extractor, with OCR enabled for image-based or scanned
	        # documents.
	        extractor = Extractor().set_ocr_config(
	            TesseractOcrConfig().set_language("eng")
	        )
	        text, metadata = extractor.extract_file_to_string(file)
	        return text, str(metadata)
	    except Exception as err:
	        # Report any failure in the UI instead of raising.
	        return f"Error extracting document: {str(err)}", no_metadata


	# Create the Gradio interface: one file upload wired to extract_document,
	# with two text boxes receiving its (text, metadata) return values.
	demo = gr.Interface(
	    fn=extract_document,
	    inputs=gr.File(label="Upload Document"),
	    outputs=[
	        gr.Textbox(label="Extracted Text", lines=10),
	        gr.Textbox(label="Metadata", lines=3),
	    ],
	    # The library is spelled "Extractous", matching the links below.
	    title="Extractous Demo",
	    description="""
	Upload a document to extract its text content and metadata using [Extractous](https://github.com/yobix-ai/extractous).

	Supported formats include:
	- PDF files (with OCR support)
	- Microsoft Office (DOC, DOCX, PPT, PPTX, etc.)
	- Web Documents (HTML, XML)
	- Text Files (TXT, Markdown)
	- Images (with OCR capability)
	- And more
	""",
	    article="""
	This demo showcases document text and metadata extraction capabilities.
	For more information, visit [Extractous on GitHub](https://github.com/yobix-ai/extractous).
	""",
	    examples=[
	        ["2412.13663v2.pdf"],  # Add example files to demo directory
	    ],
	)

	# Launch the app only when this file is run as a script, not on import.
	if __name__ == "__main__":
	    demo.launch()