Spaces:
Sleeping
Sleeping
File size: 3,015 Bytes
cdb52cd f25ee15 b89a1c3 683fa93 b89a1c3 0532015 b89a1c3 0532015 cdb52cd 0532015 f25ee15 b89a1c3 cdb52cd f25ee15 b89a1c3 f25ee15 b89a1c3 cdb52cd b89a1c3 cdb52cd b89a1c3 f25ee15 cdb52cd f25ee15 b89a1c3 0532015 f25ee15 cdb52cd b89a1c3 f25ee15 b89a1c3 cdb52cd b89a1c3 cdb52cd b89a1c3 e219826 f25ee15 e219826 f25ee15 e219826 b89a1c3 683fa93 e219826 0532015 cdb52cd e219826 b89a1c3 e219826 cdb52cd b89a1c3 0532015 683fa93 b89a1c3 cdb52cd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
# app.py
import os
import shutil
import subprocess
import tempfile
import json
import pypandoc
import gradio as gr
def run_marker(pdf_path, out_dir, fmt):
    """Invoke the ``marker_single`` command-line tool on one PDF.

    Args:
        pdf_path: Path of the PDF passed as the positional argument.
        out_dir: Value handed to ``--output_dir``.
        fmt: Value handed to ``--output_format``.

    Raises:
        subprocess.CalledProcessError: If the command exits with a
            non-zero status (``check=True``).
    """
    # Flag/value pairs, emitted in this order after the input path.
    options = {
        "--output_format": fmt,
        "--output_dir": out_dir,
        "--extract_images": "True",  # boolean is passed as an explicit value
        "--paginate_output": "True",
    }
    argv = ["marker_single", pdf_path]
    for flag, value in options.items():
        argv += [flag, value]
    subprocess.run(argv, check=True)
def collect_outputs(out_dir, ext):
    """Recursively gather all files under *out_dir* with the given extension.

    The match is case-insensitive on both the file name and *ext*, so
    ``".md"`` and ``".MD"`` select the same files.

    Args:
        out_dir: Directory tree to search.
        ext: File-name suffix to match, e.g. ``".md"``.

    Returns:
        A list of full paths. Files of a directory come before the files
        of its subdirectories; both file names and subdirectory names are
        visited in sorted order, so the result is deterministic.
    """
    wanted = ext.lower()
    collected = []
    for root, dirs, files in os.walk(out_dir):
        # Sorting in place makes os.walk descend in a stable order instead
        # of whatever order the OS happens to list the entries in.
        dirs.sort()
        collected.extend(
            os.path.join(root, fname)
            for fname in sorted(files)
            if fname.lower().endswith(wanted)
        )
    return collected
def _write_temp_text(text, suffix):
    """Write *text* to a new UTF-8 temporary file and return its path."""
    with tempfile.NamedTemporaryFile(
        "w", suffix=suffix, delete=False, encoding="utf-8"
    ) as handle:
        handle.write(text)
    return handle.name


def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF and return the path of the result file.

    Args:
        pdf_file: The upload, either a filesystem path (``str`` /
            ``os.PathLike``) or an object whose ``.name`` attribute is the
            path. NOTE(review): which of the two Gradio passes depends on
            the Gradio version and the ``gr.File`` ``type`` setting.
        output_format: One of ``"markdown"``, ``"json"`` or ``"docx"``.

    Returns:
        Path of a temporary ``.md``, ``.json`` or ``.docx`` file. A path is
        returned for every format because the interface's output component
        is a file download. The caller owns the file; it is not deleted
        here.

    Raises:
        ValueError: If no file was uploaded.
        KeyError: If *output_format* is not a supported choice.
        subprocess.CalledProcessError: If the Marker CLI fails.
    """
    if pdf_file is None:
        raise ValueError("No PDF file was uploaded.")
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    # Map the UI choice to Marker's format. Done before any temp directory
    # exists so that an unknown choice cannot leak one.
    fmt = {
        "markdown": "markdown",
        "json": "json",
        "docx": "markdown",  # produce .md before converting to DOCX
    }[output_format]

    separator = "\n\n---\n\n"
    out_dir = tempfile.mkdtemp()
    try:
        run_marker(pdf_path, out_dir, fmt)

        # Read every generated page file.
        ext = ".json" if output_format == "json" else ".md"
        page_paths = collect_outputs(out_dir, ext)
        pages = []
        for path in page_paths:
            with open(path, "r", encoding="utf-8") as f:
                pages.append(f.read())

        if output_format == "docx":
            md_path = os.path.join(out_dir, "combined.md")
            with open(md_path, "w", encoding="utf-8") as f:
                f.write(separator.join(pages))

            # Pages are collected recursively, so resources they reference
            # by relative path may live next to them in a subdirectory
            # rather than in out_dir itself; search both.
            resource_dirs = list(
                dict.fromkeys(
                    [out_dir, *(os.path.dirname(p) for p in page_paths)]
                )
            )

            # mkstemp creates the file atomically, unlike the deprecated
            # and race-prone mktemp; Pandoc then overwrites it.
            fd, docx_path = tempfile.mkstemp(suffix=".docx")
            os.close(fd)
            try:
                pypandoc.convert_file(
                    md_path,
                    "docx",
                    outputfile=docx_path,
                    extra_args=[
                        f"--resource-path={os.pathsep.join(resource_dirs)}"
                    ],
                )
            except Exception:
                os.remove(docx_path)  # don't leave an empty placeholder
                raise
            return docx_path

        if output_format == "markdown":
            return _write_temp_text(separator.join(pages), ".md")
        return _write_temp_text(
            json.dumps({"pages": pages}, indent=2, ensure_ascii=False),
            ".json",
        )
    finally:
        # Always remove Marker's working directory, including on failure.
        shutil.rmtree(out_dir, ignore_errors=True)
# Gradio Interface
# Input widgets, in the order process_upload receives them.
_pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
_format_choice = gr.Radio(
    label="Output format",
    value="markdown",
    choices=["markdown", "json", "docx"],
)

# Help text shown under the title; sentences are joined by single spaces.
_description = " ".join(
    [
        "Upload a PDF (with images & math).",
        "Choose **Markdown** or **JSON** to get text + LaTeX math and extracted images.",
        "Or choose **DOCX** to get a Word document with everything embedded.",
    ]
)

demo = gr.Interface(
    title="PDF → Markdown/JSON/DOCX Converter",
    description=_description,
    fn=process_upload,
    inputs=[_pdf_input, _format_choice],
    outputs=gr.File(label="Download Result"),
)

if __name__ == "__main__":
    # Serve on all interfaces at port 7860 without a public share link.
    demo.launch(share=False, server_port=7860, server_name="0.0.0.0")
|