okewunmi committed on
Commit
2d31420
·
verified ·
1 Parent(s): e594ed7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +164 -26
app.py CHANGED
@@ -1,66 +1,204 @@
1
  import gradio as gr
2
  import fitz # PyMuPDF
 
 
 
 
 
3
 
4
def extract_text_from_pdf(pdf_file):
    """Extract the plain text of every page of an uploaded PDF.

    Args:
        pdf_file: ``None`` when nothing is uploaded, a filesystem path
            string, or an object exposing the path as ``.name``.

    Returns:
        The concatenated page text (one trailing newline per page), or a
        human-readable message when no file was given, the PDF contains no
        text, or processing failed. Errors are reported as strings rather
        than raised.
    """
    if pdf_file is None:
        return "No file uploaded"

    try:
        # Accept both a plain path string and a file wrapper with ``.name``.
        pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

        doc = fitz.open(pdf_path)
        try:
            # Extract text from each page
            text = "".join(page.get_text("text") + "\n" for page in doc)
        finally:
            # Release the document even if a page fails to render.
            doc.close()

        if not text.strip():
            return "No text found in the PDF file"

        return text
    except Exception as e:
        return f"Error processing PDF: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
# Create the Gradio interface: upload column on the left, extracted text on
# the right.
with gr.Blocks(title="PDF Text Extraction App") as demo:
    gr.Markdown("# 📄 PDF Text Extraction App")
    gr.Markdown("Upload a PDF file to extract its text content.")

    with gr.Row():
        with gr.Column():
            pdf_input = gr.File(
                label="Upload PDF File",
                file_types=[".pdf"],
                type="filepath"
            )
            extract_btn = gr.Button("Extract Text", variant="primary")

        with gr.Column():
            text_output = gr.Textbox(
                label="Extracted Text",
                lines=20,
                max_lines=30,
                placeholder="Extracted text will appear here..."
            )

    # Connect the button to the function
    extract_btn.click(
        fn=extract_text_from_pdf,
        inputs=pdf_input,
        outputs=text_output
    )

    # Also allow automatic extraction when file is uploaded
    pdf_input.change(
        fn=extract_text_from_pdf,
        inputs=pdf_input,
        outputs=text_output
    )
 
 
 
 
 
 
 
 
 
63
 
64
# Launch the app only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
 
 
 
 
 
 
1
  import gradio as gr
2
  import fitz # PyMuPDF
3
+ import requests
4
+ import os
5
+ import tempfile
6
+ import base64
7
+ from typing import Optional, Tuple
8
 
9
# OCR.space API configuration.
# An unset, empty, or whitespace-only OCR_API_KEY falls back to the
# placeholder so the "not configured" check elsewhere still triggers instead
# of sending a blank key to the API.
OCR_API_KEY = (os.getenv('OCR_API_KEY') or '').strip() or 'your_ocr_space_api_key_here'
OCR_API_URL = 'https://api.ocr.space/parse/image'
12
+
13
def extract_text_with_ocr(pdf_file_path: str) -> str:
    """Extract text from the first page of a PDF using the OCR.space API.

    The first page is rendered to a PNG at 2x zoom, base64-encoded, and
    posted to ``OCR_API_URL`` together with ``OCR_API_KEY``.

    Args:
        pdf_file_path: Path of the PDF to run OCR on.

    Returns:
        The recognised text of the first parsed result, or a message
        describing the failure. Failure messages start with
        ``"OCR Error:"`` (the API reported a processing error),
        ``"OCR API Error:"`` (non-200 HTTP status) or
        ``"OCR processing error:"`` (any exception raised locally).
        Errors are reported as strings rather than raised.
    """
    try:
        # Convert PDF to image first (using first page)
        doc = fitz.open(pdf_file_path)
        try:
            page = doc[0]  # Get first page

            # Convert page to image
            mat = fitz.Matrix(2.0, 2.0)  # Higher resolution
            pix = page.get_pixmap(matrix=mat)
            img_data = pix.tobytes("png")
        finally:
            # Always release the document, even if rendering fails.
            doc.close()

        # Encode image to base64
        img_base64 = base64.b64encode(img_data).decode('utf-8')

        # Prepare OCR.space API request
        payload = {
            'apikey': OCR_API_KEY,
            'language': 'eng',
            'isOverlayRequired': False,
            'base64Image': f'data:image/png;base64,{img_base64}',
            'iscreatesearchablepdf': False,
            'issearchablepdfhidetextlayer': False
        }

        # Make API request
        response = requests.post(OCR_API_URL, data=payload, timeout=60)

        if response.status_code != 200:
            return f"OCR API Error: {response.status_code}"

        result = response.json()
        if result.get('IsErroredOnProcessing', False):
            return f"OCR Error: {result.get('ErrorMessage', 'Unknown error')}"

        parsed_results = result.get('ParsedResults', [])
        if not parsed_results:
            return "No text extracted from OCR"

        # Treat a missing *or empty* ParsedText as "no text" rather than
        # handing an empty string back as if it were a result.
        return parsed_results[0].get('ParsedText') or 'No text found'

    except Exception as e:
        return f"OCR processing error: {str(e)}"
57
+
58
def extract_text_from_pdf(pdf_file) -> Tuple[str, str]:
    """Extract text from an uploaded PDF file, with an OCR fallback.

    Native text extraction is attempted first. If the pages hold 50 or
    fewer characters of real text (the injected page headers are not
    counted), the PDF is treated as image-based and OCR is tried instead.

    Args:
        pdf_file: ``None`` when nothing is uploaded, a filesystem path
            string, or an object exposing the path as ``.name``.

    Returns:
        A ``(text, status)`` pair: the extracted text or a user-facing
        message, and a short status label for the UI. Errors are reported
        through the pair rather than raised.
    """
    if pdf_file is None:
        return "No file uploaded", "❌ Error"

    # Minimum amount of native text for the PDF to count as text-based.
    min_native_chars = 50

    try:
        # Accept both a plain path string and a file wrapper with ``.name``.
        pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

        # Primary method: PyMuPDF text extraction
        doc = fitz.open(pdf_path)
        try:
            page_texts = [page.get_text("text") for page in doc]
        finally:
            # Release the document even if a page fails to extract.
            doc.close()

        # Only pages that actually contain text get a section.
        sections = [
            f"\n--- Page {page_num} ---\n{page_text}\n"
            for page_num, page_text in enumerate(page_texts, start=1)
            if page_text.strip()
        ]

        # Measure the real page text only, so the headers added above cannot
        # push a near-empty scanned PDF over the threshold.
        native_chars = sum(len(page_text.strip()) for page_text in page_texts)
        if native_chars > min_native_chars:
            return "".join(sections).strip(), "✅ Success"

        # If no text or very little text, try OCR fallback
        status = "⚠️ Using OCR (Image-based PDF detected)"

        # Check if OCR API key is configured
        if OCR_API_KEY == 'your_ocr_space_api_key_here':
            return ("No extractable text found. This appears to be an image-based PDF.\n"
                    "To extract text from image-based PDFs, please:\n"
                    "1. Get a free API key from https://ocr.space/ocrapi\n"
                    "2. Set the OCR_API_KEY environment variable\n"
                    "3. Restart the application"), "❌ OCR Not Configured"

        # Try OCR extraction
        ocr_text = extract_text_with_ocr(pdf_path)

        # Every failure prefix the OCR helper can produce, including HTTP
        # failures ("OCR API Error:"), must be reported as a failure instead
        # of being shown as extracted text.
        if ocr_text.startswith(("OCR Error:", "OCR API Error:", "OCR processing error:")):
            return f"Primary extraction failed, OCR fallback error:\n{ocr_text}", "❌ OCR Failed"

        return f"Extracted using OCR:\n\n{ocr_text}", status

    except Exception as e:
        # Complete fallback error handling
        detail = str(e)
        lowered = detail.lower()

        # Try to provide helpful error messages. Matching is case-insensitive
        # because library messages are not consistently capitalised.
        if isinstance(e, FileNotFoundError) or "no such file" in lowered:
            error_msg = "File not found. Please try uploading the PDF again."
        elif "not a pdf" in lowered:
            error_msg = "Invalid file format. Please upload a valid PDF file."
        elif "encrypted" in lowered:
            error_msg = "This PDF is password-protected. Please provide an unlocked PDF."
        elif "corrupted" in lowered:
            error_msg = "This PDF file appears to be corrupted. Please try a different file."
        else:
            error_msg = f"Error processing PDF: {detail}"

        return error_msg, "❌ Error"
116
+
117
def clear_output() -> Tuple[str, str]:
    """Reset the UI outputs.

    Returns:
        An empty string for the extracted-text box and the idle status label.
    """
    return "", "🔄 Ready"
120
 
121
# Create the Gradio interface: a narrow control column (upload, buttons,
# status, OCR notes) beside a wider column holding the extracted text.
with gr.Blocks(title="PDF Text Extraction App", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📄 PDF Text Extraction App")
    gr.Markdown("""
    Upload a PDF file to extract its text content.

    **Features:**
    - ✅ Direct text extraction from text-based PDFs
    - 🔍 OCR fallback for image-based PDFs (requires OCR.space API key)
    - 📊 Status indicators for extraction method used
    """)

    with gr.Row():
        with gr.Column(scale=1):
            pdf_input = gr.File(
                label="📎 Upload PDF File",
                file_types=[".pdf"],
                type="filepath"
            )

            with gr.Row():
                extract_btn = gr.Button("🔍 Extract Text", variant="primary", size="lg")
                clear_btn = gr.Button("🗑️ Clear", variant="secondary")

            # Status indicator
            status_output = gr.Textbox(
                label="Status",
                value="🔄 Ready",
                interactive=False,
                max_lines=1
            )

            # OCR Configuration info
            gr.Markdown("""
            **OCR Configuration:**
            Set `OCR_API_KEY` environment variable for image-based PDF support.
            Get free API key at: https://ocr.space/ocrapi
            """)

        with gr.Column(scale=2):
            text_output = gr.Textbox(
                label="📝 Extracted Text",
                lines=25,
                max_lines=50,
                placeholder="Extracted text will appear here...",
                show_copy_button=True
            )

    # Event handlers: every handler updates both the text box and the status.
    extract_btn.click(
        fn=extract_text_from_pdf,
        inputs=pdf_input,
        outputs=[text_output, status_output]
    )

    clear_btn.click(
        fn=clear_output,
        outputs=[text_output, status_output]
    )

    # Auto-extract when file is uploaded
    pdf_input.change(
        fn=extract_text_from_pdf,
        inputs=pdf_input,
        outputs=[text_output, status_output]
    )

    # Footer
    gr.Markdown("""
    ---
    **Tips:**
    - For best results with image-based PDFs, ensure good image quality
    - Large PDFs may take longer to process
    - OCR works best with clear, high-contrast text
    """)
196
 
197
# Launch the app only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces
        server_port=7860,
        share=False,
        debug=True
    )