# app.py
import json

import fitz  # PyMuPDF
import gradio as gr
from markdownify import markdownify as md
def convert_pdf_to_markdown(path):
    """Convert each page of a PDF to Markdown.

    Every page is extracted as HTML with PyMuPDF and passed through
    markdownify; leading and trailing whitespace is stripped from each
    page's result.

    Args:
        path: Location of the PDF, handed directly to ``fitz.open``.

    Returns:
        A list with one ``{"page": <1-based number>, "markdown": <str>}``
        dict per page, in document order.
    """
    pages_md = []
    # The context manager closes the document even when a page fails to
    # convert, so the underlying file handle is not leaked.
    with fitz.open(path) as doc:
        for number, page in enumerate(doc, start=1):
            # Fall back to an empty string if no HTML comes back.
            html = page.get_text("html") or ""
            # Only the surrounding whitespace is trimmed; blank lines inside
            # the converted Markdown are left as markdownify produced them.
            pages_md.append({"page": number, "markdown": md(html).strip()})
    return pages_md
def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF and render the result in the chosen format.

    Args:
        pdf_file: The upload handed over by Gradio, either a filesystem
            path string or a tempfile-like object exposing the path as
            ``.name``. ``None`` means nothing was uploaded.
        output_format: ``"markdown"`` for a single Markdown document; any
            other value produces JSON.

    Returns:
        For ``"markdown"``, the Markdown of all pages joined by ``---``
        separators. Otherwise a pretty-printed JSON string of the form
        ``{"pages": [{"page": ..., "markdown": ...}, ...]}``.

    Raises:
        gr.Error: If no file was uploaded.
    """
    # Submitting without a file passes None; report that readably instead
    # of failing with an AttributeError on ``.name``.
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file first.")

    # A plain string is already the path; file-like uploads carry it in
    # ``.name``.
    path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    pages = convert_pdf_to_markdown(path)

    if output_format == "markdown":
        return "\n\n---\n\n".join(page["markdown"] for page in pages)
    # ensure_ascii=False keeps non-ASCII text readable in the JSON output.
    return json.dumps({"pages": pages}, indent=2, ensure_ascii=False)
# Gradio interface
# Input widgets: the PDF to convert and the desired output format.
_pdf_input = gr.File(label="Upload your PDF", file_types=[".pdf"])
_format_input = gr.Radio(
    choices=["markdown", "json"],
    value="markdown",
    label="Output format",
)
# Output widget that displays the converted text.
_result_output = gr.Code(label="Converted Output")

# Blurb shown under the title of the interface.
_DESCRIPTION = (
    "Upload a PDF and get back a professionally converted Markdown "
    "or a structured JSON with each page’s Markdown. "
    "PDFs with images or complex tables may still need manual review."
)

demo = gr.Interface(
    fn=process_upload,
    inputs=[_pdf_input, _format_input],
    outputs=_result_output,
    title="PDF → Markdown/JSON Converter",
    description=_DESCRIPTION,
    examples=[],  # example PDFs can be added here if desired
)

if __name__ == "__main__":
    # When run as a script, serve on all interfaces at port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)