# app.py
"""Gradio app that converts an uploaded PDF to Markdown or per-page JSON."""
import json

import fitz  # PyMuPDF
import gradio as gr
from markdownify import markdownify as md


def convert_pdf_to_markdown(path):
    """Extract each page of a PDF as HTML and convert it to Markdown.

    Args:
        path: Filesystem path of the PDF to open.

    Returns:
        A list with one dict per page, in document order, of the form
        ``{"page": <1-based page number>, "markdown": <str>}``.
    """
    # Open the document as a context manager so the underlying file handle
    # is released even if extraction or conversion fails part-way through.
    with fitz.open(path) as doc:
        return [
            {
                "page": page_number,
                # `or ""` guards against a falsy result from get_text();
                # strip() trims leading/trailing whitespace of the page.
                "markdown": md(page.get_text("html") or "").strip(),
            }
            for page_number, page in enumerate(doc, start=1)
        ]


def process_upload(pdf_file, output_format):
    """Convert the uploaded PDF and render it in the requested format.

    Args:
        pdf_file: The upload handed over by Gradio: either a tempfile-like
            object exposing ``.name`` or a plain path string. ``None`` when
            nothing was uploaded.
        output_format: ``"markdown"`` for a single Markdown document; any
            other value (the UI offers ``"json"``) yields JSON.

    Returns:
        The Markdown of all pages joined by horizontal rules, or a
        pretty-printed JSON string ``{"pages": [...]}``.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if pdf_file is None:
        # Surface a readable message in the UI instead of an AttributeError.
        raise gr.Error("Please upload a PDF file before converting.")

    # Accept both file-like uploads (with .name) and plain path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    pages = convert_pdf_to_markdown(pdf_path)

    if output_format == "markdown":
        # Separate pages with a Markdown horizontal rule.
        return "\n\n---\n\n".join(page["markdown"] for page in pages)

    # ensure_ascii=False keeps non-ASCII text readable in the output.
    return json.dumps({"pages": pages}, indent=2, ensure_ascii=False)


# Gradio interface
demo = gr.Interface(
    fn=process_upload,
    inputs=[
        gr.File(label="Upload your PDF", file_types=[".pdf"]),
        gr.Radio(
            choices=["markdown", "json"],
            value="markdown",
            label="Output format",
        ),
    ],
    outputs=gr.Code(label="Converted Output"),
    title="PDF → Markdown/JSON Converter",
    description=(
        "Upload a PDF and get back a professionally converted Markdown "
        "or a structured JSON with each page’s Markdown. "
        "PDFs with images or complex tables may still need manual review."
    ),
    examples=[
        # you can add example PDFs here if desired
    ],
)


if __name__ == "__main__":
    # Bind to all interfaces on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)