Spaces:
Sleeping
Sleeping
File size: 3,735 Bytes
7dec78f 17d36dc f598e4b 7dec78f 17d36dc 7dec78f 17d36dc 7dec78f 17d36dc 7dec78f 17d36dc 546291c 7dec78f 17d36dc c506d0d 546291c 7dec78f 5e94ef1 17d36dc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
import html
import io
import json
import os
import tempfile

import gradio as gr
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
from PIL import Image
# Supported output formats mapped to the extension of the generated download file.
_OUTPUT_EXTENSIONS = {"JSON": ".json", "Markdown": ".md", "HTML": ".html"}


def _iter_images(element):
    """Yield every LTImage at or below *element*, descending into nested figures."""
    if isinstance(element, LTImage):
        yield element
    elif isinstance(element, LTFigure):
        for child in element:
            yield from _iter_images(child)


def _save_image(lt_image, out_dir, index):
    """Save one embedded image as a PNG in *out_dir*; return its path, or None if Pillow cannot read it."""
    # pdfminer's PDFStream exposes get_data() (filter-decoded bytes); it has no read().
    raw_bytes = lt_image.stream.get_data()
    image_path = os.path.join(out_dir, f"extracted_image_{index}.png")
    try:
        picture = Image.open(io.BytesIO(raw_bytes))
        if picture.mode == "CMYK":
            # PNG cannot store CMYK pixels.
            picture = picture.convert("RGB")
        picture.save(image_path)
    except (OSError, ValueError):
        # Best effort: streams that are not a self-contained image file
        # (e.g. bare pixel data) are skipped instead of aborting the parse.
        return None
    return image_path


def _render_output(text, tables, images, output_format):
    """Render the extracted text, tables and image references as one string in *output_format*."""
    if output_format == "JSON":
        return json.dumps(
            {
                "text": text,
                "tables": [table.df.to_dict() for table in tables],
                "images": images,
            }
        )

    if output_format == "Markdown":
        parts = [f"# Extracted Text\n\n{text}\n\n# Tables\n"]
        parts.extend(table.df.to_markdown(index=False) + "\n\n" for table in tables)
        image_tags = [
            f'![Image {number}]({image["filename"]})'
            for number, image in enumerate(images, start=1)
        ]
        parts.append("\n\n# Images\n\n" + "\n".join(image_tags))
        return "".join(parts)

    # HTML: escape extracted text and paths so PDF content cannot inject markup.
    parts = [f"<p>{html.escape(text)}</p>\n\n<h2>Tables</h2>\n"]
    parts.extend(table.df.to_html() + "<br>" for table in tables)
    image_tags = [
        f'<img src="{html.escape(image["filename"])}" alt="Image {number}">'
        for number, image in enumerate(images, start=1)
    ]
    parts.append("\n\n<h2>Images</h2>\n\n" + "\n".join(image_tags))
    return "".join(parts)


def parse_pdf(pdf_file, output_format):
    """Extract text, tables and images from a PDF and write them to a download file.

    Args:
        pdf_file: Path to the PDF, or a file-like wrapper exposing the path
            through a ``name`` attribute.
        output_format: One of ``"JSON"``, ``"Markdown"`` or ``"HTML"``.

    Returns:
        A ``(text, download_path)`` tuple: the concatenated text of all
        horizontal text boxes, and the path of the file holding the rendered
        output. The second item is a path because it feeds a file-download
        component.

    Raises:
        ValueError: If ``output_format`` is not supported or no file was given.
    """
    # Validate up front so a bad request fails before any parsing work is done.
    if output_format not in _OUTPUT_EXTENSIONS:
        supported = ", ".join(_OUTPUT_EXTENSIONS)
        raise ValueError(
            f"Unsupported output format: {output_format!r} (expected one of: {supported})"
        )
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")

    # NOTE(review): some Gradio versions appear to pass a temp-file wrapper
    # instead of a plain path; accept both.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Per-call directory so concurrent requests never overwrite each other's files.
    out_dir = tempfile.mkdtemp(prefix="pdf_parse_")

    text_parts = []
    images = []
    with open(pdf_path, "rb") as handle:
        # extract_pages is lazy, so it must be consumed while the file is open.
        for page in extract_pages(handle):
            for element in page:
                if isinstance(element, LTTextBoxHorizontal):
                    text_parts.append(element.get_text())
                    continue
                for lt_image in _iter_images(element):
                    saved_path = _save_image(lt_image, out_dir, len(images))
                    if saved_path is not None:
                        images.append({"filename": saved_path})
    text = "".join(text_parts)

    # Imported lazily: Camelot is heavy and only needed for table extraction.
    import camelot

    # Camelot reads only page 1 by default; scan every page to match the text pass.
    tables = camelot.read_pdf(pdf_path, pages="all")

    rendered = _render_output(text, tables, images, output_format)
    download_path = os.path.join(
        out_dir, "parsed_output" + _OUTPUT_EXTENSIONS[output_format]
    )
    with open(download_path, "w", encoding="utf-8") as out_file:
        out_file.write(rendered)

    return text, download_path
# Gradio UI: takes an uploaded file and an output-format choice, and shows the
# extracted text next to a download of the rendered result.
iface = gr.Interface(
    fn=parse_pdf,
    inputs=[
        "file",
        # Explicit default so a submit never sends an empty format, whatever
        # the installed Gradio version selects on its own.
        gr.Dropdown(["JSON", "Markdown", "HTML"], value="JSON", label="Output Format"),
    ],
    outputs=[
        gr.Text(label="Output Text"),
        gr.File(label="Download Output"),
    ],
    title="PDF Parser",
    description="Parse a PDF and choose the output format.",
)
# Launch the app only when run as a script, without requesting a share link.
if __name__ == "__main__":
    iface.launch(share=False)