Spaces:

sblumenf
/

pdf-convert

Sleeping

File size: 3,735 Bytes

import json
import gradio as gr
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
import os
import io
from PIL import Image

def _iter_images(element):
    """Yield every LTImage at or below *element*, descending into nested LTFigure containers."""
    if isinstance(element, LTImage):
        yield element
    elif isinstance(element, LTFigure):
        for child in element:
            yield from _iter_images(child)


def _save_image(lt_image, index):
    """Save one embedded image as ``extracted_image_<index>.png``; return the filename or None."""
    stream = getattr(lt_image, "stream", None)
    if stream is None:
        return None

    image_filename = f"extracted_image_{index}.png"
    try:
        # pdfminer's PDFStream exposes its payload through get_data();
        # it is not a file object and has no read() method.
        image = Image.open(io.BytesIO(stream.get_data()))
        image.save(image_filename)
    except (OSError, ValueError):
        # Pillow cannot identify the payload (e.g. a raw bitmap without a
        # container format) or cannot write it as PNG. Skip this image so
        # one bad image does not abort the whole conversion.
        return None
    return image_filename


def _write_output(content, suffix):
    """Write *content* to a new UTF-8 temporary file and return the file's path."""
    import tempfile

    with tempfile.NamedTemporaryFile(
        "w", suffix=suffix, delete=False, encoding="utf-8"
    ) as out:
        out.write(content)
    return out.name


def parse_pdf(pdf_file, output_format):
    """Extract text, images and tables from a PDF and export them to a file.

    Args:
        pdf_file: Path to the PDF (``str`` or ``os.PathLike``), or an object
            exposing the path as ``.name`` (the temp-file wrapper some Gradio
            versions pass for a file upload).
        output_format: One of ``"JSON"``, ``"Markdown"`` or ``"HTML"``.

    Returns:
        A ``(text, output_path)`` tuple: the concatenated text of all
        horizontal text boxes, and the path of a temporary file holding the
        export in the requested format (a path is what the ``gr.File`` output
        component consumes).

    Raises:
        ValueError: If no file is given or ``output_format`` is not supported.

    Side effects:
        Every image that can be decoded is written to the current working
        directory as ``extracted_image_<n>.png``.
    """
    if output_format not in ("JSON", "Markdown", "HTML"):
        raise ValueError(
            f"Unsupported output format: {output_format!r}. "
            "Choose JSON, Markdown or HTML."
        )
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")

    # Normalise the input to a plain path string; camelot needs a path too.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    text_parts = []
    images = []

    # extract_pages() is lazy, so the file must stay open while iterating.
    with open(pdf_path, 'rb') as file:
        for page in extract_pages(file):
            for element in page:
                if isinstance(element, LTTextBoxHorizontal):
                    text_parts.append(element.get_text())
                elif isinstance(element, (LTFigure, LTImage)):
                    for lt_image in _iter_images(element):
                        image_filename = _save_image(lt_image, len(images))
                        if image_filename is not None:
                            images.append({"filename": image_filename})

    text = "".join(text_parts)

    # Table extraction with Camelot; imported here so the dependency is only
    # loaded when a PDF is actually parsed.
    import camelot
    tables = camelot.read_pdf(pdf_path)

    if output_format == "JSON":
        json_data = {
            "text": text,
            "tables": [table.df.to_dict() for table in tables],
            "images": images
        }
        output_path = _write_output(json.dumps(json_data), ".json")

    elif output_format == "Markdown":
        parts = [f"# Extracted Text\n\n{text}\n\n# Tables\n"]
        for table in tables:
            parts.append(table.df.to_markdown(index=False) + "\n\n")

        # Images are referenced by absolute path under the working directory,
        # which is where _save_image() wrote them.
        image_tags = [
            f'![Image {number}]({os.path.join(os.getcwd(), image["filename"])})'
            for number, image in enumerate(images, start=1)
        ]
        parts.append("\n\n# Images\n\n" + "\n".join(image_tags))

        output_path = _write_output("".join(parts), ".md")

    else:  # "HTML" (validated above)
        from html import escape

        # The PDF text is untrusted: escape it so it cannot inject markup.
        parts = [f"<p>{escape(text)}</p>\n\n<h2>Tables</h2>\n"]
        for table in tables:
            parts.append(table.df.to_html() + "<br>")

        image_tags = []
        for number, image in enumerate(images, start=1):
            image_path = escape(os.path.join(os.getcwd(), image["filename"]))
            image_tags.append(f'<img src="{image_path}" alt="Image {number}">')
        parts.append("\n\n<h2>Images</h2>\n\n" + "\n".join(image_tags))

        output_path = _write_output("".join(parts), ".html")

    return text, output_path

# Gradio UI: upload a PDF, pick an export format, get the extracted text plus
# a downloadable file in that format.
iface = gr.Interface(
    fn=parse_pdf,
    inputs=[
        "file",
        # Pre-select a format: without a default the dropdown submits None,
        # and parse_pdf only handles these three choices.
        gr.Dropdown(
            ["JSON", "Markdown", "HTML"],
            value="JSON",
            label="Output Format",
        ),
    ],
    outputs=[
        gr.Text(label="Output Text"),
        gr.File(label="Download Output")
    ],
    title="PDF Parser",
    description="Parse a PDF and choose the output format."
)

if __name__ == "__main__":
    # share=False keeps the app local (no public Gradio share link).
    iface.launch(share=False)