Spaces:
Sleeping
Sleeping
from marker import convert | |
import gradio as gr | |
def parse_pdf(pdf_content, output_format):
    """Parse the content of a PDF into the specified output format.

    Args:
        pdf_content (bytes): The content of the PDF file in bytes format.
        output_format (str): The desired output format: "Markdown",
            "HTML", or "JSON" (case-sensitive).

    Returns:
        The value returned by ``convert`` for the chosen format
        (presumably a str -- confirm against the marker API).

    Raises:
        TypeError: If ``pdf_content`` is not a ``bytes`` object.
        ValueError: If ``output_format`` is not one of the supported names.
    """
    if not isinstance(pdf_content, bytes):
        raise TypeError("Input must be bytes representing the PDF content.")

    # Extra keyword arguments handed to ``convert`` for each supported format.
    # The plain call (no keyword) is what this code has always treated as the
    # Markdown result.
    convert_kwargs = {
        "Markdown": {},
        "HTML": {"output_format": "html"},
        "JSON": {"output_format": "json"},
    }

    # Reject unsupported formats *before* converting: conversion is the
    # expensive step, and an invalid request should fail fast.
    if not isinstance(output_format, str) or output_format not in convert_kwargs:
        raise ValueError("Invalid output format. Supported formats: Markdown, HTML, JSON")

    # Convert exactly once, directly into the requested format, instead of
    # always producing Markdown first and then converting a second time.
    return convert(pdf_content, **convert_kwargs[output_format])
# Define the Gradio interface that exposes parse_pdf as a web form.
iface = gr.Interface(
    fn=parse_pdf,
    inputs=[
        # type="binary" makes Gradio pass the uploaded file's raw bytes, which
        # is what parse_pdf requires; "pdf" is not a valid value for `type`.
        # file_types restricts the upload picker to PDF files.
        gr.File(type="binary", file_types=[".pdf"], label="Upload PDF"),
        # Default to Markdown so an untouched dropdown does not submit None,
        # which parse_pdf would reject as an invalid format.
        gr.Dropdown(
            ["Markdown", "HTML", "JSON"],
            value="Markdown",
            label="Output format",
        ),
    ],
    outputs="text",
    title="PDF Parser",
    description="Parse a PDF and choose the output format.",
)
# Launch the app (not applicable on Hugging Face Spaces) | |
# if __name__ == "__main__": | |
# iface.launch() # For local development (comment out for deployment) |