Spaces:

sblumenf
/

pdf-convert

Sleeping

App Files Files Community

pdf-convert / app.py

sblumenf

Update app.py

17d36dc verified 7 months ago

raw

history blame

3.74 kB

	import json
	import gradio as gr
	from pdfminer.high_level import extract_pages, extract_text
	from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
	import os
	import io
	from PIL import Image

	def parse_pdf(pdf_file, output_format):
	    """Extract text, tables and images from a PDF and render them for download.

	    Text comes from every horizontal text box on every page, images from
	    ``LTImage`` objects (including those nested inside ``LTFigure``
	    containers) and tables from Camelot.  Extracted images are saved as
	    ``extracted_image_<n>.png`` in the current working directory.

	    Args:
	        pdf_file: Path to the PDF (``str`` or path-like), or an upload
	            object that exposes the path as ``.name``.
	        output_format: ``"JSON"``, ``"Markdown"`` or ``"HTML"``.

	    Returns:
	        A ``(text, download_path)`` tuple: the extracted plain text and the
	        path of a UTF-8 file holding the rendered output.  A path is
	        returned (not the content itself) because the value is bound to a
	        ``gr.File`` output component.

	    Raises:
	        ValueError: If ``output_format`` is not supported or no PDF was given.
	    """
	    import tempfile

	    # Validate first so bad input fails with a clear message before any
	    # parsing work is done (previously an unknown format surfaced as an
	    # UnboundLocalError at the very end).
	    extensions = {"JSON": ".json", "Markdown": ".md", "HTML": ".html"}
	    if output_format not in extensions:
	        raise ValueError(
	            f"Unsupported output format {output_format!r}; "
	            f"choose one of {', '.join(extensions)}."
	        )
	    if pdf_file is None:
	        raise ValueError("No PDF file was provided.")

	    if isinstance(pdf_file, (str, os.PathLike)):
	        pdf_path = os.fspath(pdf_file)
	    else:
	        # Upload wrappers (e.g. a temp-file object) carry the path in .name.
	        pdf_path = pdf_file.name

	    text_parts = []
	    images = []
	    with open(pdf_path, 'rb') as file:
	        # extract_pages() yields pages lazily, so it must be consumed while
	        # the file handle is still open.
	        for page in extract_pages(file):
	            for element in page:
	                if isinstance(element, LTTextBoxHorizontal):
	                    text_parts.append(element.get_text())
	                else:
	                    _collect_images(element, images)
	    text = "".join(text_parts)

	    # Table extraction is delegated to Camelot (imported lazily, as before).
	    # NOTE(review): read_pdf() is called with its default page selection,
	    # which appears to be the first page only - confirm whether
	    # pages="all" is wanted here.
	    import camelot
	    tables = camelot.read_pdf(pdf_path)

	    download_data = _render_output(text, tables, images, output_format)

	    # delete=False: the file must outlive this function so the UI can serve it.
	    with tempfile.NamedTemporaryFile(
	        "w",
	        suffix=extensions[output_format],
	        prefix="parsed_pdf_",
	        delete=False,
	        encoding="utf-8",
	    ) as out:
	        out.write(download_data)

	    return text, out.name


	def _collect_images(element, images):
	    """Save every image inside *element* as PNG and record it in *images*.

	    ``LTFigure`` containers are searched recursively; layout objects that are
	    neither figures nor images are ignored.
	    """
	    if isinstance(element, LTFigure):
	        for child in element:
	            _collect_images(child, images)
	        return
	    if not isinstance(element, LTImage):
	        return

	    image_filename = f"extracted_image_{len(images)}.png"
	    try:
	        # PDFStream exposes its bytes through get_data(); it has no read().
	        image = Image.open(io.BytesIO(element.stream.get_data()))
	        image.save(image_filename)
	    except (OSError, ValueError):
	        # The stream is not always a self-contained image file (it can be
	        # raw pixel data) and some modes cannot be written as PNG.  Skip
	        # that image instead of aborting the whole document.
	        return
	    images.append({"filename": image_filename})


	def _render_output(text, tables, images, output_format):
	    """Render the extracted content as a JSON, Markdown or HTML string."""
	    if output_format == "JSON":
	        return json.dumps({
	            "text": text,
	            "tables": [table.df.to_dict() for table in tables],
	            "images": images,
	        })

	    # Images are referenced by absolute path under the working directory.
	    image_paths = [
	        os.path.join(os.getcwd(), image["filename"]) for image in images
	    ]

	    if output_format == "Markdown":
	        parts = [f"# Extracted Text\n\n{text}\n\n# Tables\n"]
	        parts.extend(
	            table.df.to_markdown(index=False) + "\n\n" for table in tables
	        )
	        image_tags = [
	            f'![Image {number}]({path})'
	            for number, path in enumerate(image_paths, start=1)
	        ]
	        parts.append("\n\n# Images\n\n" + "\n".join(image_tags))
	        return "".join(parts)

	    # HTML: the text comes straight from the PDF, so escape it before
	    # embedding it in markup.
	    import html
	    parts = [f"<p>{html.escape(text)}</p>\n\n<h2>Tables</h2>\n"]
	    parts.extend(table.df.to_html() + "<br>" for table in tables)
	    image_tags = [
	        f'<img src="{html.escape(path)}" alt="Image {number}">'
	        for number, path in enumerate(image_paths, start=1)
	    ]
	    parts.append("\n\n<h2>Images</h2>\n\n" + "\n".join(image_tags))
	    return "".join(parts)

	# Gradio UI: upload a PDF, pick an output format, get the extracted text
	# plus a downloadable file.
	iface = gr.Interface(
	    fn=parse_pdf,
	    inputs=[
	        "file",
	        # Pre-select a format so that submitting without touching the
	        # dropdown never hands None to parse_pdf.
	        gr.Dropdown(
	            choices=["JSON", "Markdown", "HTML"],
	            value="JSON",
	            label="Output Format",
	        ),
	    ],
	    outputs=[
	        gr.Text(label="Output Text"),
	        gr.File(label="Download Output"),
	    ],
	    title="PDF Parser",
	    description="Parse a PDF and choose the output format.",
	)

	if __name__ == "__main__":
	    # Launch only when run as a script; no public share link is requested.
	    iface.launch(share=False)