File size: 5,378 Bytes
7dec78f
 
e92fbe1
7dec78f
17d36dc
 
 
b8d5f22
 
 
3aebabb
b8d5f22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f7269c
b8d5f22
 
 
 
 
 
 
 
 
 
 
 
 
 
17d36dc
 
 
 
b8d5f22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7dec78f
 
 
7c51401
7dec78f
 
 
 
 
 
 
 
5e94ef1
17d36dc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import html
import io
import json
import os
import tempfile

import camelot
import gradio as gr
import pandas as pd
import tabula
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
from PIL import Image
from PyPDF2 import PdfReader

def _iter_images(element):
    """Yield every LTImage located at or below a pdfminer layout element."""
    if isinstance(element, LTImage):
        yield element
    elif isinstance(element, LTFigure):
        # Figures are containers and may nest other figures, so recurse.
        for child in element:
            yield from _iter_images(child)


def _save_image(lt_image, index):
    """Save one embedded image as ``extracted_image_<index>.png`` in the CWD.

    Returns the ``{"filename": ...}`` record stored in the images list.
    Raises whatever PIL raises when the stream is not a format it can open
    (e.g. raw pixel data without a container); the caller treats that as a
    skipped image.
    """
    # pdfminer's PDFStream has no read(); get_data() returns the stream bytes
    # after applying the PDF filters.
    image_data = lt_image.stream.get_data()
    image_filename = f"extracted_image_{index}.png"
    with Image.open(io.BytesIO(image_data)) as image:
        image.save(image_filename)
    return {"filename": image_filename}


def _extract_tables(pdf_path):
    """Return all tables as DataFrames (tabula-py preferred, camelot fallback)."""
    # Both libraries shell out / parse heavily and raise a wide range of
    # exception types, so table extraction is deliberately best-effort.
    try:
        return list(tabula.read_pdf(pdf_path, pages='all', multiple_tables=True))
    except Exception as e:
        print(f"tabula-py failed: {e}. Trying camelot...")
    try:
        # camelot reads only page 1 unless told otherwise; match tabula.
        return [table.df for table in camelot.read_pdf(pdf_path, pages='all')]
    except Exception as e:
        print(f"camelot also failed: {e}. No tables extracted.")
    return []


def _table_to_jsonable(table):
    """Convert a DataFrame to a dict, mapping missing cells to None (JSON null)."""
    # Without this, NaN cells are emitted as the bare token NaN, which is not
    # valid JSON.
    return table.astype(object).where(table.notna(), None).to_dict()


def _format_json(text, tables, images):
    """Render the extracted content as an indented JSON document."""
    json_data = {
        "text": text,
        "tables": [_table_to_jsonable(table) for table in tables],
        "images": images,
    }
    return json.dumps(json_data, indent=4)


def _format_markdown(text, tables, images):
    """Render the extracted content as Markdown."""
    parts = [f"# Extracted Text\n\n{text}\n\n# Tables\n"]
    for i, table in enumerate(tables):
        parts.append(f"## Table {i+1}\n")
        parts.append(table.to_markdown(index=False) + "\n\n")

    # Images are referenced by absolute path (working directory + filename).
    parts.append("\n\n# Images\n\n")
    for image in images:
        image_path = os.path.join(os.getcwd(), image["filename"])
        parts.append(f'![Image]({image_path})\n')
    return "".join(parts)


def _format_html(text, tables, images):
    """Render the extracted content as an HTML fragment."""
    # The text comes from an arbitrary uploaded PDF, so escape it before
    # embedding it in markup.
    parts = [f"<p>{html.escape(text)}</p>\n\n<h2>Tables</h2>\n"]
    for i, table in enumerate(tables):
        parts.append(f"<h2>Table {i+1}</h2>\n")
        parts.append(table.to_html() + "<br>")

    # Images are referenced by absolute path (working directory + filename).
    parts.append("\n\n<h2>Images</h2>\n\n")
    for image in images:
        image_path = os.path.join(os.getcwd(), image["filename"])
        src = html.escape(image_path, quote=True)
        parts.append(f'<img src="{src}" alt="Image"><br>\n')
    return "".join(parts)


# Output format name -> (renderer, file suffix of the generated download).
_FORMATTERS = {
    "JSON": (_format_json, ".json"),
    "Markdown": (_format_markdown, ".md"),
    "HTML": (_format_html, ".html"),
}


def _write_download_file(content, suffix):
    """Write ``content`` to a persistent temp file and return its path."""
    with tempfile.NamedTemporaryFile(
        "w", suffix=suffix, prefix="parsed_pdf_", encoding="utf-8", delete=False
    ) as handle:
        handle.write(content)
    return handle.name


def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
    """
    Parses a PDF file, extracts text, tables, and images, and formats the output.

    Args:
        pdf_file: The uploaded PDF, either as a filesystem path or as an
            upload object exposing the path through a ``name`` attribute.
        output_format: Desired output format ("JSON", "Markdown", or "HTML").
        progress: Gradio Progress object for displaying progress.

    Returns:
        tuple: ``(text, download_path)`` where ``text`` is the extracted text
               and ``download_path`` is the path of a temporary file holding
               the formatted output (a file output component needs a path,
               not the raw content). ``download_path`` is None when
               ``output_format`` is not one of the supported values.
               Returns an empty string and None if there is an error.
    """
    try:
        if pdf_file is None:
            raise ValueError("no PDF file was provided")
        # Accept both plain paths and upload wrappers that carry the path in
        # ``.name`` (pathlib paths must go through fspath: their ``.name`` is
        # only the basename).
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = os.fspath(pdf_file)
        else:
            pdf_path = pdf_file.name

        text_parts = []
        images = []
        with open(pdf_path, 'rb') as file:
            # Materialise the generator so the page count is known for the
            # progress fraction.
            pages = list(extract_pages(file))
            for i, page in enumerate(pages):
                progress(i / len(pages))  # Update progress bar
                for element in page:
                    if isinstance(element, LTTextBoxHorizontal):
                        text_parts.append(element.get_text())
                        continue
                    for lt_image in _iter_images(element):
                        # One undecodable image must not abort the whole parse.
                        try:
                            images.append(_save_image(lt_image, len(images)))
                        except Exception as e:
                            print(f"Error extracting image: {e}")
        text = "".join(text_parts)

        tables = _extract_tables(pdf_path)

        formatter = _FORMATTERS.get(output_format)
        if formatter is None:
            # Keep the extracted text instead of discarding it when no (or an
            # unknown) format was selected.
            print(f"Unsupported output format: {output_format!r}. No download generated.")
            return text, None

        render, suffix = formatter
        download_path = _write_download_file(render(text, tables, images), suffix)
        return text, download_path

    except Exception as main_e:
        # UI boundary: report the failure and hand back empty outputs rather
        # than propagating into the web framework.
        print(f"A main error occurred: {main_e}")
        return "", None # Return empty string and None in case of error

# Gradio UI: a PDF upload and an output-format selector feed parse_pdf, whose
# two return values populate the text box and the file download.
iface = gr.Interface(
    fn=parse_pdf,
    # parse_pdf's `progress` parameter is supplied through its default
    # argument, so it is intentionally not listed as an input component.
    inputs=[
        gr.File(label="PDF File", file_types=[".pdf"]),
        # A default value is set so the function never receives None as the
        # format when the user leaves the dropdown untouched.
        gr.Dropdown(["JSON", "Markdown", "HTML"], value="JSON", label="Output Format"),
    ],
    outputs=[
        gr.Text(label="Output Text"),
        gr.File(label="Download Output")
    ],
    title="PDF Parser",
    description="Parse a PDF and choose the output format."
)

if __name__ == "__main__":
    # share=False keeps the app local (no public share link is requested).
    iface.launch(share=False)