Spaces:

okewunmi
/

pdf-text-extraction

Sleeping

App Files Community

okewunmi committed on Jul 17

Commit

2d31420

verified ·

1 Parent(s): e594ed7

Update app.py

Browse files

Files changed (1) hide show

app.py +164 -26

app.py CHANGED Viewed

@@ -1,66 +1,204 @@
 import gradio as gr
 import fitz  # PyMuPDF
-def extract_text_from_pdf(pdf_file):
-    """Extract text from uploaded PDF file"""
     if pdf_file is None:
-        return "No file uploaded"
     try:
-        # Open the PDF file
         doc = fitz.open(pdf_file.name)
         text = ""
         # Extract text from each page
-        for page in doc:
-            text += page.get_text("text") + "\n"
         doc.close()
-        if not text.strip():
-            return "No text found in the PDF file"
-        return text
     except Exception as e:
-        return f"Error processing PDF: {str(e)}"
 # Create the Gradio interface
-with gr.Blocks(title="PDF Text Extraction App") as demo:
     gr.Markdown("# 📄 PDF Text Extraction App")
-    gr.Markdown("Upload a PDF file to extract its text content.")
     with gr.Row():
-        with gr.Column():
             pdf_input = gr.File(
-                label="Upload PDF File",
                 file_types=[".pdf"],
                 type="filepath"
             )
-            extract_btn = gr.Button("Extract Text", variant="primary")
-        with gr.Column():
             text_output = gr.Textbox(
-                label="Extracted Text",
-                lines=20,
-                max_lines=30,
-                placeholder="Extracted text will appear here..."
             )
-    # Connect the button to the function
     extract_btn.click(
         fn=extract_text_from_pdf,
         inputs=pdf_input,
-        outputs=text_output
     )
-    # Also allow automatic extraction when file is uploaded
     pdf_input.change(
         fn=extract_text_from_pdf,
         inputs=pdf_input,
-        outputs=text_output
     )
 # Launch the app
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
 import fitz  # PyMuPDF
+import requests
+import os
+import tempfile
+import base64
+from typing import Optional, Tuple
# OCR.space API configuration.
# The key is read from the OCR_API_KEY environment variable. Surrounding
# whitespace is stripped and a blank value is treated as unset, so a variable
# that is defined but empty falls back to the placeholder instead of being
# sent to the API as a bogus key.
OCR_API_KEY = os.getenv('OCR_API_KEY', '').strip() or 'your_ocr_space_api_key_here'
# Endpoint that receives the OCR POST requests.
OCR_API_URL = 'https://api.ocr.space/parse/image'
def extract_text_with_ocr(pdf_file_path: str) -> str:
    """Extract text from the first page of a PDF using the OCR.space API.

    The first page is rendered to a PNG at 2x zoom, base64-encoded and posted
    to ``OCR_API_URL`` together with ``OCR_API_KEY``.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        The recognised text on success. Failures are never raised; they are
        reported as a string starting with ``"OCR Error:"``,
        ``"OCR API Error:"`` or ``"OCR processing error:"``. When the service
        answers without any text, ``"No text extracted from OCR"`` or
        ``"No text found"`` is returned.
    """
    try:
        # The context manager closes the document even when rendering fails
        # (for example when the PDF has no pages and doc[0] raises).
        with fitz.open(pdf_file_path) as doc:
            page = doc[0]  # only the first page is sent to the OCR service
            zoom = fitz.Matrix(2.0, 2.0)  # 2x zoom for a higher-resolution image
            img_data = page.get_pixmap(matrix=zoom).tobytes("png")

        img_base64 = base64.b64encode(img_data).decode('utf-8')

        payload = {
            'apikey': OCR_API_KEY,
            'language': 'eng',
            'isOverlayRequired': False,
            'base64Image': f'data:image/png;base64,{img_base64}',
            'iscreatesearchablepdf': False,
            'issearchablepdfhidetextlayer': False,
        }

        response = requests.post(OCR_API_URL, data=payload, timeout=60)
        if response.status_code != 200:
            return f"OCR API Error: {response.status_code}"

        result = response.json()
        if not isinstance(result, dict):
            # A JSON body that is not an object cannot be inspected with
            # .get(); report its content directly as the error.
            return f"OCR Error: {result}"

        if result.get('IsErroredOnProcessing', False):
            message = result.get('ErrorMessage') or 'Unknown error'
            # Join a list of messages so the user does not see a list repr.
            if isinstance(message, (list, tuple)):
                message = '; '.join(str(part) for part in message)
            return f"OCR Error: {message}"

        parsed_results = result.get('ParsedResults') or []
        if not parsed_results:
            return "No text extracted from OCR"

        # An empty ParsedText is reported with the same sentinel as a missing one.
        return parsed_results[0].get('ParsedText') or 'No text found'
    except Exception as e:
        # Best-effort fallback: callers expect an error string, not an exception.
        return f"OCR processing error: {str(e)}"
def extract_text_from_pdf(pdf_file) -> Tuple[str, str]:
    """Extract text from an uploaded PDF, falling back to OCR when needed.

    Text is first read with PyMuPDF. If the pages yield 50 characters or fewer
    (page headers excluded), the PDF is treated as image-based and
    ``extract_text_with_ocr`` is tried, provided ``OCR_API_KEY`` has been
    configured.

    Args:
        pdf_file: The uploaded file, either a path string or an object whose
            ``name`` attribute holds the path, or ``None`` when nothing was
            uploaded.

    Returns:
        A ``(text, status)`` tuple. ``text`` is the extracted text or a
        user-facing error/help message; ``status`` is a short status label.
        Errors are reported through the tuple rather than raised.
    """
    if pdf_file is None:
        return "No file uploaded", "❌ Error"

    # Accept both a plain path string and a file-like object exposing `.name`.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    min_text_chars = 50  # arbitrary threshold for "meaningful" text
    status = "✅ Success"

    try:
        # Primary method: PyMuPDF text extraction. The context manager closes
        # the document even if a page fails to extract.
        page_sections = []
        extracted_chars = 0
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                page_text = page.get_text("text")
                stripped = page_text.strip()
                if stripped:
                    # Count only real page text so that the injected page
                    # headers cannot push a near-empty PDF over the threshold.
                    extracted_chars += len(stripped)
                    page_sections.append(f"\n--- Page {page_num} ---\n{page_text}\n")

        if extracted_chars > min_text_chars:
            return "".join(page_sections).strip(), status

        # Little or no text: assume an image-based PDF and try OCR.
        status = "⚠️ Using OCR (Image-based PDF detected)"

        if OCR_API_KEY == 'your_ocr_space_api_key_here':
            return ("No extractable text found. This appears to be an image-based PDF.\n"
                    "To extract text from image-based PDFs, please:\n"
                    "1. Get a free API key from https://ocr.space/ocrapi\n"
                    "2. Set the OCR_API_KEY environment variable\n"
                    "3. Restart the application"), "❌ OCR Not Configured"

        ocr_text = extract_text_with_ocr(pdf_path)

        # Every error prefix the OCR helper can return, including HTTP-level
        # failures ("OCR API Error:"), must be reported as a failure.
        ocr_failure_prefixes = ("OCR Error:", "OCR API Error:", "OCR processing error:")
        if ocr_text.startswith(ocr_failure_prefixes):
            return f"Primary extraction failed, OCR fallback error:\n{ocr_text}", "❌ OCR Failed"

        return f"Extracted using OCR:\n\n{ocr_text}", status

    except Exception as e:
        # Top-level boundary for the UI: turn any failure into a readable
        # message. Matching is case-insensitive because the capitalisation of
        # library error messages is not guaranteed.
        detail = str(e)
        lowered = detail.lower()
        if "no such file" in lowered:
            error_msg = "File not found. Please try uploading the PDF again."
        elif "not a pdf" in lowered:
            error_msg = "Invalid file format. Please upload a valid PDF file."
        elif "encrypted" in lowered:
            error_msg = "This PDF is password-protected. Please provide an unlocked PDF."
        elif "corrupted" in lowered:
            error_msg = "This PDF file appears to be corrupted. Please try a different file."
        else:
            error_msg = f"Error processing PDF: {detail}"
        return error_msg, "❌ Error"
def clear_output():
    """Return the blank text and idle status used to reset the output widgets."""
    cleared_text, idle_status = "", "🔄 Ready"
    return cleared_text, idle_status
 # Create the Gradio interface
+with gr.Blocks(title="PDF Text Extraction App", theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 📄 PDF Text Extraction App")
+    gr.Markdown("""
+    Upload a PDF file to extract its text content.
+    **Features:**
+    - ✅ Direct text extraction from text-based PDFs
+    - 🔍 OCR fallback for image-based PDFs (requires OCR.space API key)
+    - 📊 Status indicators for extraction method used
+    """)
     with gr.Row():
+        with gr.Column(scale=1):
             pdf_input = gr.File(
+                label="📎 Upload PDF File",
                 file_types=[".pdf"],
                 type="filepath"
             )
+            with gr.Row():
+                extract_btn = gr.Button("🔍 Extract Text", variant="primary", size="lg")
+                clear_btn = gr.Button("🗑️ Clear", variant="secondary")
+            # Status indicator
+            status_output = gr.Textbox(
+                label="Status",
+                value="🔄 Ready",
+                interactive=False,
+                max_lines=1
+            )
+            # OCR Configuration info
+            gr.Markdown("""
+            **OCR Configuration:**
+            Set `OCR_API_KEY` environment variable for image-based PDF support.
+            Get free API key at: https://ocr.space/ocrapi
+            """)
+        with gr.Column(scale=2):
             text_output = gr.Textbox(
+                label="📝 Extracted Text",
+                lines=25,
+                max_lines=50,
+                placeholder="Extracted text will appear here...",
+                show_copy_button=True
             )
+    # Event handlers
     extract_btn.click(
         fn=extract_text_from_pdf,
         inputs=pdf_input,
+        outputs=[text_output, status_output]
     )
+    clear_btn.click(
+        fn=clear_output,
+        outputs=[text_output, status_output]
+    )
+    # Auto-extract when file is uploaded
     pdf_input.change(
         fn=extract_text_from_pdf,
         inputs=pdf_input,
+        outputs=[text_output, status_output]
     )
+    # Footer
+    gr.Markdown("""
+    ---
+    **Tips:**
+    - For best results with image-based PDFs, ensure good image quality
+    - Large PDFs may take longer to process
+    - OCR works best with clear, high-contrast text
+    """)
 # Launch the app
 if __name__ == "__main__":
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        debug=True
+    )