euler314's picture
Update app.py
e219826 verified
raw
history blame
1.77 kB
# app.py
import fitz # PyMuPDF
from markdownify import markdownify as md
import json
import gradio as gr
def convert_pdf_to_markdown(path):
    """Convert every page of a PDF into Markdown.

    Each page is exported as HTML by PyMuPDF and that HTML is then converted
    to Markdown with markdownify.

    Args:
        path: Filesystem path of the PDF to open.

    Returns:
        A list with one dict per page, in document order, shaped like
        ``{"page": <1-based page number>, "markdown": <page text>}``.
    """
    pages_md = []
    # Open the document as a context manager so it is closed (and its file
    # handle released) even when a page fails to convert.
    with fitz.open(path) as doc:
        for i, page in enumerate(doc, start=1):
            # Fall back to an empty string if no HTML comes back for a page.
            html = page.get_text("html") or ""
            # Only leading/trailing whitespace is trimmed here; blank lines
            # inside the converted Markdown are left untouched.
            page_md = md(html).strip()
            pages_md.append({"page": i, "markdown": page_md})
    return pages_md
def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF and render it in the requested format.

    Args:
        pdf_file: The upload handed over by Gradio. May be a path string,
            an object exposing the path as ``.name`` (tempfile-like), or
            ``None`` when nothing has been uploaded.
        output_format: ``"markdown"`` to get all pages joined into a single
            Markdown document; any other value produces JSON.

    Returns:
        An empty string when no file was supplied. Otherwise either the
        pages' Markdown joined by a ``---`` rule, or a pretty-printed JSON
        document of the form ``{"pages": [{"page": ..., "markdown": ...}]}``.
    """
    # Nothing uploaded yet: return empty output instead of failing with an
    # AttributeError on ``None.name``.
    if pdf_file is None:
        return ""

    # Accept both a plain path string and a file-like object that carries
    # its path in ``.name``.
    path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    pages = convert_pdf_to_markdown(path)

    if output_format == "markdown":
        # Separate pages with a horizontal rule.
        return "\n\n---\n\n".join(p["markdown"] for p in pages)

    # ensure_ascii=False keeps non-ASCII text readable instead of \u-escaped.
    return json.dumps({"pages": pages}, indent=2, ensure_ascii=False)
# Gradio UI: one file upload plus a format selector feeding process_upload,
# with the converted text shown in a code viewer.
demo = gr.Interface(
    title="PDF → Markdown/JSON Converter",
    description=(
        "Upload a PDF and get back a professionally converted Markdown or a "
        "structured JSON with each page’s Markdown. PDFs with images or "
        "complex tables may still need manual review."
    ),
    fn=process_upload,
    inputs=[
        # The upload widget only offers .pdf files.
        gr.File(file_types=[".pdf"], label="Upload your PDF"),
        # Selected value is passed to process_upload as output_format.
        gr.Radio(
            label="Output format",
            choices=["markdown", "json"],
            value="markdown",
        ),
    ],
    outputs=gr.Code(label="Converted Output"),
    examples=[
        # you can add example PDFs here if desired
    ],
)
if __name__ == "__main__":
    # Start the web server only when run as a script, not on import.
    # "0.0.0.0" binds to all network interfaces rather than just localhost.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
    )