pdf-convert / app.py
sblumenf's picture
Update app.py
7d2c1f9 verified
raw
history blame
1.75 kB
from marker import convert
import gradio as gr
def parse_pdf(pdf_content, output_format):
    """Convert the bytes of a PDF into the requested output format.

    The input type and the requested format are both checked before any
    conversion work starts, and the PDF is converted exactly once per call.

    Args:
        pdf_content (bytes): The raw content of the PDF file.
        output_format (str): One of "Markdown", "HTML", or "JSON".

    Returns:
        The value returned by ``convert`` for the chosen format (presumably
        text; confirm against the marker API in use).

    Raises:
        TypeError: If ``pdf_content`` is not ``bytes``.
        ValueError: If ``output_format`` is not a supported format.
    """
    if not isinstance(pdf_content, bytes):
        raise TypeError("Input must be bytes representing the PDF content.")

    # Each branch performs a single conversion. Previously a default
    # conversion always ran first, so HTML and JSON requests converted the
    # document twice and unsupported formats paid for a wasted conversion.
    if output_format == "Markdown":
        # Default conversion output is used as-is for Markdown.
        return convert(pdf_content)
    if output_format == "HTML":
        return convert(pdf_content, output_format="html")
    if output_format == "JSON":
        return convert(pdf_content, output_format="json")

    raise ValueError("Invalid output format. Supported formats: Markdown, HTML, JSON")
# Define the Gradio interface. The upload is requested as raw bytes
# (type="binary") because parse_pdf rejects anything that is not `bytes`;
# "pdf" is not a valid value for gr.File's `type` argument.
iface = gr.Interface(
    fn=parse_pdf,
    inputs=[
        # file_types restricts the upload picker to PDF documents.
        gr.File(type="binary", file_types=[".pdf"], label="Upload PDF"),
        # A default selection avoids passing None to parse_pdf when the user
        # submits without choosing a format.
        gr.Dropdown(
            ["Markdown", "HTML", "JSON"],
            value="Markdown",
            label="Output Format",
        ),
    ],
    outputs="text",
    title="PDF Parser",
    description="Parse a PDF and choose the output format.",
)
# Launch the app (not applicable on Hugging Face Spaces)
# if __name__ == "__main__":
# iface.launch() # For local development (comment out for deployment)