"""Gradio app that extracts text and images from a PDF for download."""

import html
import json
import os  # Import os for file path manipulation
import tempfile

import gradio as gr
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.image import ImageWriter
from pdfminer.layout import LTFigure, LTImage, LTTextBoxHorizontal

# Maps each supported output format to the extension of the generated file.
_OUTPUT_EXTENSIONS = {"JSON": ".json", "Markdown": ".md", "HTML": ".html"}


def _iter_images(element):
    """Yield every LTImage in *element*, descending into nested figures."""
    if isinstance(element, LTImage):
        yield element
    elif isinstance(element, LTFigure):
        # A figure is a container; the actual images are among its children.
        for child in element:
            yield from _iter_images(child)


def _render_output(output_format, text, tables, images):
    """Serialize the extracted data as a string in *output_format*.

    Args:
        output_format: One of "JSON", "Markdown" or "HTML".
        text: The extracted text.
        tables: Extracted tables (currently always empty; see parse_pdf).
        images: List of ``{"filename": ...}`` dicts for the saved images.

    Raises:
        ValueError: If *output_format* is not a supported format.
    """
    if output_format == "JSON":
        return json.dumps({"text": text, "tables": tables, "images": images})

    if output_format == "Markdown":
        image_lines = "".join(f"- {image['filename']}\n" for image in images)
        return f"# Extracted Text\n\n{text}\n\n# Images\n{image_lines}"

    if output_format == "HTML":
        # NOTE(review): the original markup was lost from the source; the
        # <p>/<h2> structure below is a reconstruction -- confirm.
        html_tables = ""  # Table extraction is not implemented yet.
        image_items = "".join(
            f"<li>{html.escape(image['filename'])}</li>\n" for image in images
        )
        image_list = f"<ul>\n{image_items}</ul>\n" if images else ""
        # Escape the text so characters such as "<" or "&" coming from the
        # PDF cannot break (or inject markup into) the generated page.
        return (
            f"<p>{html.escape(text)}</p>\n\n"
            f"<h2>Tables</h2>\n{html_tables}\n\n"
            f"<h2>Images</h2>\n{image_list}"
        )

    raise ValueError(f"Unsupported output format: {output_format!r}")


def parse_pdf(pdf_file, output_format):
    """Extract text and images from a PDF and write them out for download.

    Args:
        pdf_file: Path to the PDF, or an object with a ``name`` attribute
            holding that path (older Gradio versions pass a tempfile wrapper).
        output_format: One of "JSON", "Markdown" or "HTML".

    Returns:
        A ``(text, output_path)`` tuple: the extracted text and the path of
        the generated file in the chosen format.

    Raises:
        ValueError: If no file was provided or the format is unsupported.
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")
    if output_format not in _OUTPUT_EXTENSIONS:
        raise ValueError(f"Unsupported output format: {output_format!r}")

    pdf_path = getattr(pdf_file, "name", pdf_file)

    # The directory is deliberately not removed here: the returned file must
    # still exist after this function returns so that it can be downloaded.
    output_dir = tempfile.mkdtemp(prefix="pdf_parser_")
    image_writer = ImageWriter(output_dir)

    text_parts = []
    tables = []  # Placeholder: table extraction is not implemented yet.
    images = []
    for page in extract_pages(pdf_path):
        for element in page:
            if isinstance(element, LTTextBoxHorizontal):
                text_parts.append(element.get_text())
                continue
            for image in _iter_images(element):
                # ImageWriter picks the extension matching the image's real
                # encoding and returns the name of the file it wrote.
                images.append({"filename": image_writer.export_image(image)})
    text = "".join(text_parts)

    document = _render_output(output_format, text, tables, images)
    output_path = os.path.join(
        output_dir, "output" + _OUTPUT_EXTENSIONS[output_format]
    )
    with open(output_path, "w", encoding="utf-8") as output_file:
        output_file.write(document)

    return text, output_path


iface = gr.Interface(
    fn=parse_pdf,
    inputs=[
        "file",
        # Preselect a format so the callback never receives None.
        gr.Dropdown(["JSON", "Markdown", "HTML"], value="JSON"),
    ],
    outputs=[
        gr.Text(label="Output Text"),
        gr.File(label="Download Output"),
    ],
    title="PDF Parser",
    description="Parse a PDF and choose the output format.",
)

if __name__ == "__main__":
    # Set share=False as Gradio warns about it on Hugging Face Spaces
    iface.launch(share=False)