import gradio as gr
from marker import convert

# Maps each user-facing format name to the extra keyword arguments passed to
# ``convert``. Markdown is the converter's default output, so it needs none.
# NOTE(review): assumes ``marker.convert`` accepts raw PDF bytes plus an
# ``output_format`` keyword, as the original code did -- confirm against the
# installed marker-pdf version.
_CONVERT_KWARGS = {
    "Markdown": {},
    "HTML": {"output_format": "html"},
    "JSON": {"output_format": "json"},
}


def parse_pdf(pdf_content, output_format):
    """Convert the raw bytes of a PDF into the requested output format.

    Args:
        pdf_content (bytes): The content of the PDF file.
        output_format (str): One of "Markdown", "HTML" or "JSON".

    Returns:
        The result of ``convert`` for the chosen format (documented by the
        original author as text).

    Raises:
        TypeError: If ``pdf_content`` is not a ``bytes`` object (this includes
            ``None``, which is what the UI passes when no file was uploaded).
        ValueError: If ``output_format`` is not a supported format name.
    """
    if not isinstance(pdf_content, bytes):
        raise TypeError("Input must be bytes representing the PDF content.")

    # Validate the format *before* converting so that a bad choice fails fast
    # instead of paying for a full PDF conversion first.
    try:
        convert_kwargs = _CONVERT_KWARGS[output_format]
    except (KeyError, TypeError):
        # TypeError covers unhashable values, which can never be a valid name.
        raise ValueError(
            "Invalid output format. Supported formats: Markdown, HTML, JSON"
        ) from None

    # Convert exactly once; the original converted to Markdown unconditionally
    # and then converted a second time for HTML/JSON.
    return convert(pdf_content, **convert_kwargs)


# Gradio interface definition.
# ``type="binary"`` makes Gradio hand the uploaded file to ``parse_pdf`` as
# bytes (which is what it validates); ``file_types`` restricts the picker to
# PDFs. The dropdown defaults to Markdown so submitting without touching it
# does not send ``None`` as the format.
iface = gr.Interface(
    fn=parse_pdf,
    inputs=[
        gr.File(type="binary", file_types=[".pdf"], label="Upload PDF"),
        gr.Dropdown(
            ["Markdown", "HTML", "JSON"],
            value="Markdown",
            label="Output format",
        ),
    ],
    outputs="text",
    title="PDF Parser",
    description="Parse a PDF and choose the output format.",
)

# Start the web server only when executed as a script (locally or as the app
# entry point); importing this module stays free of side effects.
if __name__ == "__main__":
    iface.launch()