davanstrien HF staff committed on
Commit
f77a9a3
·
1 Parent(s): bab3281

Add document text extraction functionality using Extractous and Gradio

Browse files
Files changed (1) hide show
  1. app.py +58 -0
app.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from extractous import Extractor, TesseractOcrConfig
3
+
4
+
5
+
6
def extract_document(file):
    """Extract text and metadata from an uploaded document.

    Args:
        file: The uploaded document as delivered by the ``gr.File`` input.
            May be a filesystem path (``str`` or ``os.PathLike``) or an
            object exposing its path through a ``name`` attribute
            (presumably the temp-file wrapper some Gradio versions pass;
            verify against the installed Gradio). ``None`` means nothing
            was uploaded.

    Returns:
        A ``(text, metadata)`` tuple of strings, one per output textbox.
        When no file is given, or extraction raises, ``text`` carries a
        user-facing message and ``metadata`` is ``"No metadata available"``.
    """
    import os  # function-scope import keeps this block self-contained

    if file is None:
        return "Please upload a file", "No metadata available"

    try:
        # Normalise the upload to a path before handing it to the extractor.
        if isinstance(file, (str, os.PathLike)):
            path = os.fspath(file)
        else:
            path = getattr(file, "name", file)

        # Enable OCR (English) for image-based or scanned documents.
        extractor = Extractor().set_ocr_config(
            TesseractOcrConfig().set_language("eng")
        )

        result, metadata = extractor.extract_file_to_string(path)
        return result, str(metadata)
    except Exception as e:
        # UI boundary: surface the failure in the output boxes rather than
        # letting the exception propagate.
        return f"Error extracting document: {e}", "No metadata available"
26
+
27
+
28
# Gradio UI: one file upload feeds extract_document, whose two return values
# fill the "Extracted Text" and "Metadata" textboxes, in that order.
_DESCRIPTION = (
    "\n"
    "Upload a document to extract its text content and metadata using Extractous.\n"
    "\n"
    "**Supported formats include:**\n"
    "- PDF files (with OCR support)\n"
    "- Microsoft Office (DOC, DOCX, PPT, PPTX, etc.)\n"
    "- Web Documents (HTML, XML)\n"
    "- Text Files (TXT, Markdown)\n"
    "- Images (with OCR capability)\n"
    "- And more\n"
)

_ARTICLE = (
    "\n"
    "This demo showcases document text and metadata extraction capabilities.\n"
    "For more information, visit "
    "[Extractous on GitHub](https://github.com/yobix-ai/extractous).\n"
)

demo = gr.Interface(
    fn=extract_document,
    inputs=gr.File(label="Upload Document"),
    outputs=[
        gr.Textbox(label="Extracted Text", lines=10),
        gr.Textbox(label="Metadata", lines=3),
    ],
    title="Document Text Extraction Demo",
    description=_DESCRIPTION,
    article=_ARTICLE,
    # Example files must be added to the demo directory.
    examples=[["2412.13663v2.pdf"]],
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()