File size: 1,748 Bytes
e147409
919f74f
f3515e2
7d2c1f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b0be64
b4b5bbe
7d2c1f9
b4b5bbe
 
 
f3515e2
 
7d2c1f9
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from marker import convert
import gradio as gr

def parse_pdf(pdf_content, output_format):
    """
    Parse the content of a PDF into the specified output format.

    Both arguments are validated before any conversion work starts, and the
    PDF is converted exactly once, directly in the requested format.

    Args:
        pdf_content (bytes): The content of the PDF file in bytes format.
        output_format (str): The desired output format; exactly one of
            "Markdown", "HTML", or "JSON" (the comparison is case-sensitive).

    Returns:
        Whatever ``convert`` returns for the chosen format (presumably the
        parsed text as a str -- confirm against the marker API).

    Raises:
        TypeError: If ``pdf_content`` is not a ``bytes`` object.
        ValueError: If ``output_format`` is not one of the supported formats.
    """

    if not isinstance(pdf_content, bytes):
        raise TypeError("Input must be bytes representing the PDF content.")

    # Resolve the extra keyword arguments for ``convert`` up front. Doing this
    # before converting means an unsupported format fails fast instead of
    # after an expensive conversion whose result would be thrown away.
    if output_format == "Markdown":
        # Markdown uses the plain call with no extra arguments.
        convert_kwargs = {}
    elif output_format == "HTML":
        convert_kwargs = {"output_format": "html"}
    elif output_format == "JSON":
        convert_kwargs = {"output_format": "json"}
    else:
        raise ValueError("Invalid output format. Supported formats: Markdown, HTML, JSON")

    # Single in-memory conversion (no temp files) for deployment compatibility.
    return convert(pdf_content, **convert_kwargs)

# Define the Gradio interface around parse_pdf.
# The File input uses type="binary" so the callback receives the uploaded
# file's raw bytes, which is what parse_pdf requires ("pdf" is not a valid
# value for `type`). Restricting uploads to PDFs is done with `file_types`.
iface = gr.Interface(
    fn=parse_pdf,
    inputs=[
        gr.File(type="binary", file_types=[".pdf"], label="Upload PDF"),
        gr.Dropdown(["Markdown", "HTML", "JSON"]),
    ],
    outputs="text",
    title="PDF Parser",
    description="Parse a PDF and choose the output format."
)

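# Launch the app. Currently disabled: uncomment the two lines below to start
# the Gradio server when this file is run directly as a script.
# NOTE(review): Hugging Face Spaces typically runs the app file as a script,
# so launch() is likely needed there as well -- confirm before deploying.
# if __name__ == "__main__":
#    iface.launch()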