Spaces:

sblumenf
/

pdf-convert

Sleeping

App Files Files Community

pdf-convert / app.py

sblumenf

Update app.py

c506d0d verified 2 months ago

raw

history blame

3.8 kB

	import html
	import json
	import os # Import os for file path manipulation
	import tempfile

	import gradio as gr
	from pdfminer.high_level import extract_pages, extract_text
	from pdfminer.image import ImageWriter
	from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage

	# Maps each supported output format to the extension of the download file.
	_OUTPUT_EXTENSIONS = {"JSON": ".json", "Markdown": ".md", "HTML": ".html"}


	def _iter_images(element):
	    """Yield each LTImage at or below *element*, descending into LTFigure."""
	    if isinstance(element, LTImage):
	        yield element
	    elif isinstance(element, LTFigure):
	        # A figure is a container; its images are among its children.
	        for child in element:
	            yield from _iter_images(child)


	def _render_output(text, tables, images, output_format):
	    """Serialize the extracted data to UTF-8 bytes in *output_format*."""
	    if output_format == "JSON":
	        payload = {"text": text, "tables": tables, "images": images}
	        return json.dumps(payload).encode("utf-8")

	    names = [entry["filename"] for entry in images]

	    if output_format == "Markdown":
	        # Images are saved next to the output file, so relative links work.
	        image_links = "".join(f"\n![{name}]({name})\n" for name in names)
	        markdown_text = f"# Extracted Text\n\n{text}\n\n# Images\n{image_links}"
	        return markdown_text.encode("utf-8")

	    # HTML: the PDF text is untrusted input, so escape it before embedding.
	    html_tables = "<table></table>"  # table extraction is not implemented
	    image_tags = "".join(
	        f'<img src="{html.escape(name, quote=True)}">\n' for name in names
	    )
	    html_text = (
	        f"<p>{html.escape(text)}</p>\n\n<h2>Tables</h2>\n{html_tables}"
	        f"\n\n<h2>Images</h2>\n{image_tags}"
	    )
	    return html_text.encode("utf-8")


	def parse_pdf(pdf_file, output_format):
	    """Extract text and images from a PDF and write them in the chosen format.

	    Args:
	        pdf_file: Path to the PDF, or an object exposing the path as ``.name``
	            (the uploaded-file wrapper some Gradio versions pass).
	        output_format: One of ``"JSON"``, ``"Markdown"`` or ``"HTML"``.

	    Returns:
	        A ``(text, output_path)`` tuple: the concatenated text of every
	        horizontal text box, and the path of the generated output file.
	        Extracted images are saved in the same directory as that file.

	    Raises:
	        ValueError: If ``output_format`` is unsupported or no file was given.

	    Errors raised by pdfminer while parsing or exporting images propagate.
	    Table extraction is not implemented; ``tables`` is always empty.
	    """
	    # Validate before touching the filesystem so bad requests fail fast.
	    if output_format not in _OUTPUT_EXTENSIONS:
	        supported = ", ".join(_OUTPUT_EXTENSIONS)
	        raise ValueError(
	            f"Unsupported output format {output_format!r}; choose one of: {supported}"
	        )
	    if pdf_file is None:
	        raise ValueError("No PDF file was provided.")

	    if isinstance(pdf_file, (str, bytes, os.PathLike)):
	        pdf_path = pdf_file
	    else:
	        pdf_path = pdf_file.name

	    # A fresh directory per call keeps concurrent requests from overwriting
	    # each other's images and output files.
	    output_dir = tempfile.mkdtemp(prefix="pdf_convert_")
	    image_writer = ImageWriter(output_dir)

	    text_parts = []
	    tables = []  # Placeholder for extracted table data
	    images = []  # One {"filename": ...} dict per exported image

	    with open(pdf_path, "rb") as file:
	        # extract_pages is lazy: pages must be consumed while the file is open.
	        for page in extract_pages(file):
	            for element in page:
	                if isinstance(element, LTTextBoxHorizontal):
	                    text_parts.append(element.get_text())
	                    continue
	                for image in _iter_images(element):
	                    # ImageWriter picks the file type from the image stream
	                    # and returns the name it saved under in output_dir.
	                    images.append({"filename": image_writer.export_image(image)})

	    text = "".join(text_parts)
	    download_data = _render_output(text, tables, images, output_format)

	    output_path = os.path.join(output_dir, "output" + _OUTPUT_EXTENSIONS[output_format])
	    with open(output_path, "wb") as output_file:
	        output_file.write(download_data)

	    return text, output_path

	# Gradio UI: a file upload plus a format dropdown feed parse_pdf, whose two
	# return values fill the text box and the download slot, in that order.
	iface = gr.Interface(
	    title="PDF Parser",
	    description="Parse a PDF and choose the output format.",
	    fn=parse_pdf,
	    inputs=["file", gr.Dropdown(choices=["JSON", "Markdown", "HTML"])],
	    outputs=[gr.Text(label="Output Text"), gr.File(label="Download Output")],
	)

	if __name__ == "__main__":
	    # share=False because Gradio warns about sharing on Hugging Face Spaces.
	    iface.launch(share=False)