"""Gradio app that extracts text, tables, and images from an uploaded PDF."""

import html
import io
import json
import os
import tempfile

import gradio as gr
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
from PIL import Image

# Supported output formats mapped to the extension of the generated file.
_OUTPUT_EXTENSIONS = {"JSON": ".json", "Markdown": ".md", "HTML": ".html"}


def _collect_images(element, images, out_dir):
    """Save every LTImage reachable from *element* as a PNG in *out_dir*.

    LTFigure containers are walked recursively because images may be nested
    more than one level deep. Each saved image appends a
    ``{"filename": <basename>}`` record to *images*.
    """
    if isinstance(element, LTImage):
        image_filename = f"extracted_image_{len(images)}.png"
        try:
            # PDFStream exposes its decoded bytes via get_data(); it has no
            # file-like read() method.
            image_data = element.stream.get_data()
            image = Image.open(io.BytesIO(image_data))
            image.save(os.path.join(out_dir, image_filename))
        except (OSError, ValueError):
            # Best effort: streams Pillow cannot decode (e.g. raw pixel
            # data) or cannot write as PNG are skipped so that one bad
            # image does not abort the whole parse.
            return
        images.append({"filename": image_filename})
    elif isinstance(element, LTFigure):
        for child in element:
            _collect_images(child, images, out_dir)


def _render_json(text, tables, images):
    """Serialize the extracted content as a JSON document."""
    json_data = {
        "text": text,
        "tables": [table.df.to_dict() for table in tables],
        "images": images,
    }
    return json.dumps(json_data)


def _render_markdown(text, tables, images, image_dir):
    """Render the extracted content as Markdown."""
    parts = [f"# Extracted Text\n\n{text}\n\n# Tables\n"]
    parts.extend(table.df.to_markdown(index=False) + "\n\n" for table in tables)
    image_tags = [
        f'![Image {number}]({os.path.join(image_dir, image["filename"])})'
        for number, image in enumerate(images, start=1)
    ]
    parts.append("\n\n# Images\n\n" + "\n".join(image_tags))
    return "".join(parts)


def _render_html(text, tables, images, image_dir):
    """Render the extracted content as an HTML fragment.

    Text and image paths are escaped because they originate from an
    uploaded (untrusted) PDF.
    """
    parts = [f"<p>{html.escape(text)}</p>\n\n<h2>Tables</h2>\n"]
    parts.extend(table.df.to_html() + "<br>" for table in tables)
    image_tags = []
    for number, image in enumerate(images, start=1):
        image_path = html.escape(
            os.path.join(image_dir, image["filename"]), quote=True
        )
        image_tags.append(f'<img src="{image_path}" alt="Image {number}">')
    parts.append("\n\n<h2>Images</h2>\n\n" + "\n".join(image_tags))
    return "".join(parts)


def parse_pdf(pdf_file, output_format):
    """Extract text, tables, and images from a PDF and write a report file.

    Args:
        pdf_file: Path to the PDF, or an uploaded-file object exposing the
            path through a ``name`` attribute.
        output_format: One of ``"JSON"``, ``"Markdown"`` or ``"HTML"``.

    Returns:
        A ``(text, output_path)`` tuple: the extracted plain text and the
        path of the generated report file (suitable for a ``gr.File``
        output).

    Raises:
        ValueError: If no file was supplied or *output_format* is not one
            of the supported formats.
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")
    if output_format not in _OUTPUT_EXTENSIONS:
        raise ValueError(
            f"Unsupported output format: {output_format!r}. "
            f"Choose one of {', '.join(_OUTPUT_EXTENSIONS)}."
        )

    # The upload may arrive as a plain path or as a tempfile-like wrapper.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    # A per-call directory keeps extracted images and the report from
    # clobbering files produced by other requests.
    out_dir = tempfile.mkdtemp(prefix="pdf_parser_")

    text_parts = []
    images = []
    with open(pdf_path, "rb") as file:
        # extract_pages is lazy, so iterate while the file is still open.
        for page in extract_pages(file):
            for element in page:
                if isinstance(element, LTTextBoxHorizontal):
                    text_parts.append(element.get_text())
                elif isinstance(element, (LTFigure, LTImage)):
                    _collect_images(element, images, out_dir)
    text = "".join(text_parts)

    # Imported lazily so the module can be loaded without Camelot installed.
    import camelot

    # Camelot only scans page 1 by default; request every page so tables
    # cover the same range as the extracted text.
    tables = camelot.read_pdf(pdf_path, pages="all")

    if output_format == "JSON":
        content = _render_json(text, tables, images)
    elif output_format == "Markdown":
        content = _render_markdown(text, tables, images, out_dir)
    else:
        content = _render_html(text, tables, images, out_dir)

    output_path = os.path.join(
        out_dir, "parsed_output" + _OUTPUT_EXTENSIONS[output_format]
    )
    with open(output_path, "w", encoding="utf-8") as output_file:
        output_file.write(content)

    return text, output_path


iface = gr.Interface(
    fn=parse_pdf,
    inputs=["file", gr.Dropdown(["JSON", "Markdown", "HTML"])],
    outputs=[
        gr.Text(label="Output Text"),
        gr.File(label="Download Output"),
    ],
    title="PDF Parser",
    description="Parse a PDF and choose the output format.",
)

if __name__ == "__main__":
    iface.launch(share=False)