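# NOTE: "Spaces: Sleeping" appears to be status-banner text captured from the
# hosting page (Hugging Face Spaces); it is not part of the program.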
import html
import json
import os  # Import os for file path manipulation
import tempfile

import gradio as gr
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
def _iter_images(element):
    """Yield every LTImage in *element*, descending into nested figures."""
    if isinstance(element, LTImage):
        yield element
    elif isinstance(element, LTFigure):
        # A figure is a layout container; images usually sit inside it.
        for child in element:
            yield from _iter_images(child)


def parse_pdf(pdf_file, output_format):
    """Extract text and embedded images from a PDF and build a download file.

    Args:
        pdf_file: Path to the PDF, or an object exposing that path as
            ``.name`` (the temp-file wrapper some Gradio versions pass for
            file inputs).
        output_format: ``"JSON"``, ``"Markdown"`` or ``"HTML"``. Any other
            value (including ``None``) falls back to a plain-text file.

    Returns:
        A ``(text, download_path)`` tuple: the concatenated text of all
        horizontal text boxes, and the path of the generated output file.
    """
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # One private directory per call so concurrent requests never overwrite
    # each other's images or output file.
    out_dir = tempfile.mkdtemp(prefix="pdf_parser_")

    text_parts = []
    tables = []  # Placeholder: table extraction is not implemented yet.
    images = []

    # extract_pages() is lazy, so the file must stay open while iterating.
    with open(pdf_path, "rb") as file:
        for page in extract_pages(file):
            for element in page:
                if isinstance(element, LTTextBoxHorizontal):
                    text_parts.append(element.get_text())
                    continue
                for image in _iter_images(element):
                    image_filename = f"extracted_image_{len(images)}.jpg"
                    image_path = os.path.join(out_dir, image_filename)
                    # NOTE(review): the stream bytes are only a valid JPEG
                    # when the PDF stores the image DCT-encoded; other
                    # encodings need real conversion (e.g. pdfminer's
                    # ImageWriter) -- confirm before relying on ".jpg".
                    with open(image_path, "wb") as image_file:
                        image_file.write(image.stream.get_data())
                    images.append({"filename": image_filename})

    text = "".join(text_parts)

    if output_format == "JSON":
        payload = json.dumps(
            {
                "text": text,
                "tables": tables,
                "images": images,
            }
        )
        extension = ".json"
    elif output_format == "Markdown":
        image_lines = "".join(f"- {item['filename']}\n" for item in images)
        payload = f"# Extracted Text\n\n{text}\n\n# Images\n{image_lines}"
        extension = ".md"
    elif output_format == "HTML":
        image_items = "".join(
            f"<li>{html.escape(item['filename'])}</li>\n" for item in images
        )
        # Escape the extracted text: it comes from an untrusted document and
        # may contain "<", ">" or "&".
        payload = (
            f"<p>{html.escape(text)}</p>\n\n"
            "<h2>Tables</h2>\n<table></table>\n\n"
            f"<h2>Images</h2>\n<ul>\n{image_items}</ul>\n"
        )
        extension = ".html"
    else:
        # Unknown or missing format: still hand back something downloadable.
        payload = text
        extension = ".txt"

    download_path = os.path.join(out_dir, f"output{extension}")
    with open(download_path, "wb") as download_file:
        download_file.write(payload.encode("utf-8"))

    return text, download_path
# Gradio UI: a PDF upload plus an output-format selector feed parse_pdf, which
# returns the extracted text and a file for download.
iface = gr.Interface(
    fn=parse_pdf,
    inputs=[
        "file",
        # Preselect a format so parse_pdf never receives None when the user
        # submits without touching the dropdown.
        gr.Dropdown(
            ["JSON", "Markdown", "HTML"],
            value="JSON",
            label="Output Format",
        ),
    ],
    outputs=[
        gr.Text(label="Output Text"),
        gr.File(label="Download Output"),
    ],
    title="PDF Parser",
    description="Parse a PDF and choose the output format.",
)

if __name__ == "__main__":
    # share=False: Gradio warns about share links on Hugging Face Spaces.
    iface.launch(share=False)