# Spaces: Sleeping
# (Page-status text captured together with the source; it is not part of the program.)
import html
import io
import json
import os
import tempfile

import gradio as gr
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTFigure, LTImage, LTTextBoxHorizontal
from PIL import Image
def _iter_images(element):
    """Yield every LTImage at or below *element*, descending into nested figures."""
    if isinstance(element, LTImage):
        yield element
    elif isinstance(element, LTFigure):
        for child in element:
            yield from _iter_images(child)


def _save_image(lt_image, index):
    """Save one embedded image as ``extracted_image_<index>.png``.

    Returns the filename, or ``None`` when Pillow cannot read or write the
    image data.
    """
    filename = f"extracted_image_{index}.png"
    try:
        # pdfminer's PDFStream exposes its decoded bytes through get_data().
        picture = Image.open(io.BytesIO(lt_image.stream.get_data()))
        picture.save(filename)
    except (OSError, ValueError):
        # Not every PDF image stream is a self-describing image file (and not
        # every mode can be written as PNG); skip those instead of aborting
        # the whole parse.
        return None
    return filename


def _image_path(image):
    """Return the absolute path of an extracted image record."""
    return os.path.join(os.getcwd(), image["filename"])


def _render_json(text, tables, images):
    """Serialize the extracted text, tables and image records as JSON."""
    payload = {
        "text": text,
        "tables": [table.df.to_dict() for table in tables],
        "images": images,
    }
    return json.dumps(payload)


def _render_markdown(text, tables, images):
    """Render the extracted content as a Markdown document."""
    parts = [f"# Extracted Text\n\n{text}\n\n# Tables\n"]
    parts.extend(table.df.to_markdown(index=False) + "\n\n" for table in tables)
    image_tags = [
        f"![Image {number}]({_image_path(image)})"
        for number, image in enumerate(images, start=1)
    ]
    parts.append("\n\n# Images\n\n" + "\n".join(image_tags))
    return "".join(parts)


def _render_html(text, tables, images):
    """Render the extracted content as an HTML fragment."""
    # The text comes from an uploaded PDF, so escape it before embedding it
    # in markup.
    parts = [f"<p>{html.escape(text)}</p>\n\n<h2>Tables</h2>\n"]
    parts.extend(table.df.to_html() + "<br>" for table in tables)
    image_tags = [
        f'<img src="{html.escape(_image_path(image))}" alt="Image {number}">'
        for number, image in enumerate(images, start=1)
    ]
    parts.append("\n\n<h2>Images</h2>\n\n" + "\n".join(image_tags))
    return "".join(parts)


def _write_output(content, suffix):
    """Write *content* to a new UTF-8 temporary file and return its path."""
    with tempfile.NamedTemporaryFile(
        mode="w",
        encoding="utf-8",
        suffix=suffix,
        prefix="parsed_pdf_",
        delete=False,  # the caller hands this path on, so it must outlive us
    ) as handle:
        handle.write(content)
    return handle.name


# Supported output formats: file suffix and renderer for each.
_RENDERERS = {
    "JSON": (".json", _render_json),
    "Markdown": (".md", _render_markdown),
    "HTML": (".html", _render_html),
}


def parse_pdf(pdf_file, output_format):
    """Extract text, tables and images from a PDF and export them to a file.

    Args:
        pdf_file: Path to the PDF (``str`` or ``os.PathLike``), or an upload
            object exposing the path as ``.name``.
        output_format: One of ``"JSON"``, ``"Markdown"`` or ``"HTML"``.

    Returns:
        A ``(text, output_path)`` tuple: the concatenated text of the
        horizontal text boxes, and the path of a temporary file holding the
        rendered output.

    Raises:
        ValueError: If *output_format* is not supported or no file was given.
    """
    # Validate up front so a bad request fails before any parsing work.
    if output_format not in _RENDERERS:
        supported = ", ".join(_RENDERERS)
        raise ValueError(
            f"Unsupported output format: {output_format!r} (expected one of: {supported})"
        )
    if pdf_file is None:
        raise ValueError("No PDF file was provided")

    # Accept either a plain path or an upload wrapper that carries it in .name.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    text_parts = []
    images = []
    with open(pdf_path, "rb") as handle:
        # extract_pages() is lazy, so it has to be consumed while the file
        # is still open.
        for page in extract_pages(handle):
            for element in page:
                if isinstance(element, LTTextBoxHorizontal):
                    text_parts.append(element.get_text())
                    continue
                for lt_image in _iter_images(element):
                    filename = _save_image(lt_image, len(images))
                    if filename is not None:
                        images.append({"filename": filename})
    text = "".join(text_parts)

    # Kept function-local so that loading this module does not itself
    # require camelot.
    import camelot

    # camelot reads only the first page unless told otherwise; text and
    # images above cover every page, so tables should too.
    tables = camelot.read_pdf(pdf_path, pages="all")

    suffix, render = _RENDERERS[output_format]
    output_path = _write_output(render(text, tables, images), suffix)
    return text, output_path
# Gradio UI: a PDF upload and an output-format selector go in; the extracted
# text and a downloadable file come out.
iface = gr.Interface(
    fn=parse_pdf,
    inputs=[
        "file",
        # Preselect a format so the handler never receives an empty choice.
        gr.Dropdown(
            ["JSON", "Markdown", "HTML"],
            value="JSON",
            label="Output Format",
        ),
    ],
    outputs=[
        gr.Text(label="Output Text"),
        gr.File(label="Download Output"),
    ],
    title="PDF Parser",
    description="Parse a PDF and choose the output format.",
)


if __name__ == "__main__":
    # Launch only when run as a script; share=False requests no public link.
    iface.launch(share=False)