File size: 3,015 Bytes
cdb52cd
 
f25ee15
 
 
 
 
b89a1c3
 
683fa93
 
b89a1c3
 
0532015
 
b89a1c3
0532015
 
cdb52cd
 
0532015
 
f25ee15
b89a1c3
cdb52cd
f25ee15
 
 
b89a1c3
f25ee15
b89a1c3
 
 
cdb52cd
b89a1c3
cdb52cd
 
 
 
 
 
 
 
 
b89a1c3
f25ee15
cdb52cd
 
f25ee15
b89a1c3
0532015
f25ee15
 
cdb52cd
b89a1c3
 
 
 
f25ee15
b89a1c3
 
 
 
 
 
 
 
cdb52cd
b89a1c3
 
 
cdb52cd
b89a1c3
e219826
f25ee15
e219826
f25ee15
e219826
b89a1c3
683fa93
e219826
 
0532015
cdb52cd
 
 
 
 
e219826
b89a1c3
 
e219826
cdb52cd
b89a1c3
 
0532015
683fa93
 
b89a1c3
cdb52cd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# app.py

import os
import shutil
import subprocess
import tempfile
import json

import pypandoc
import gradio as gr

def run_marker(pdf_path, out_dir, fmt):
    """Invoke the ``marker_single`` CLI on one PDF.

    Args:
        pdf_path: Path of the PDF to convert.
        out_dir: Directory passed to Marker as ``--output_dir``.
        fmt: Value passed to Marker as ``--output_format``.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (``check=True``).
    """
    # Option/value pairs in the order they appear on the command line.
    # Boolean options are passed with an explicit "True" value.
    options = {
        "--output_format": fmt,
        "--output_dir": out_dir,
        "--extract_images": "True",
        "--paginate_output": "True",
    }

    argv = ["marker_single", pdf_path]
    for flag, value in options.items():
        argv.extend((flag, value))

    subprocess.run(argv, check=True)

def collect_outputs(out_dir, ext):
    """Recursively gather all files under ``out_dir`` with the given extension.

    The match is case-insensitive on both the file name and ``ext``.
    Traversal order is deterministic: files of a directory come first (sorted
    by name), followed by its sub-directories in sorted order.

    Args:
        out_dir: Root directory to search.
        ext: File-name suffix to match, e.g. ``".md"``.

    Returns:
        List of matching file paths; empty when nothing matches.
    """
    suffix = ext.lower()
    collected = []
    for root, dirs, files in os.walk(out_dir):
        # Sorting in place steers os.walk's top-down descent, so the order of
        # sub-directories no longer depends on the file system.
        dirs.sort()
        collected.extend(
            os.path.join(root, fname)
            for fname in sorted(files)
            if fname.lower().endswith(suffix)
        )
    return collected

def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF with Marker and return the path of the result.

    Args:
        pdf_file: The upload handed over by the Gradio file input: either an
            object exposing the path as ``.name`` or the path string itself.
        output_format: One of ``"markdown"``, ``"json"`` or ``"docx"``.

    Returns:
        Path of a temporary ``.md``, ``.json`` or ``.docx`` file holding the
        converted document. A path (not the text itself) is returned because
        the interface's output component is a file download.

    Raises:
        ValueError: If no file was uploaded.
        KeyError: If ``output_format`` is not one of the supported choices.
        subprocess.CalledProcessError: If the Marker CLI exits with an error.
    """
    if pdf_file is None:
        raise ValueError("No PDF file was uploaded.")
    # NOTE(review): depending on the Gradio version the upload arrives as a
    # path string or as a tempfile-like wrapper with `.name`; accept both.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # 1) Map the Gradio choice to Marker's format (validated before any
    #    temporary resources are created).
    fmt = {
        "markdown": "markdown",
        "json":     "json",
        "docx":     "markdown"   # produce .md before converting to DOCX
    }[output_format]

    # 2) Create temp dir for Marker outputs; always removed in `finally`.
    out_dir = tempfile.mkdtemp()
    try:
        # 3) Run Marker CLI
        run_marker(pdf_path, out_dir, fmt)

        # 4) Read the generated pages
        ext = ".json" if output_format == "json" else ".md"
        sources = collect_outputs(out_dir, ext)
        pages = []
        for path in sources:
            with open(path, "r", encoding="utf-8") as f:
                pages.append(f.read())

        # 5) DOCX branch: combine markdown and convert via Pandoc
        if output_format == "docx":
            md_path = os.path.join(out_dir, "combined.md")
            with open(md_path, "w", encoding="utf-8") as f:
                f.write("\n\n---\n\n".join(pages))

            # Pages are collected recursively, so relative image links may
            # point into nested folders; let Pandoc search those as well.
            resource_dirs = [out_dir]
            for path in sources:
                folder = os.path.dirname(path)
                if folder not in resource_dirs:
                    resource_dirs.append(folder)

            # mkstemp creates the file atomically (mktemp is deprecated and
            # race-prone); Pandoc then overwrites it.
            fd, docx_path = tempfile.mkstemp(suffix=".docx")
            os.close(fd)
            try:
                pypandoc.convert_file(
                    md_path,
                    "docx",
                    outputfile=docx_path,
                    extra_args=[
                        "--resource-path=" + os.pathsep.join(resource_dirs)
                    ]
                )
            except Exception:
                # Do not leave an empty result file behind on failure.
                os.remove(docx_path)
                raise
            return docx_path

        # 6) Non-DOCX: write the Markdown or JSON text to a downloadable file
        if output_format == "markdown":
            text = "\n\n---\n\n".join(pages)
        else:
            text = json.dumps({"pages": pages}, indent=2, ensure_ascii=False)

        fd, result_path = tempfile.mkstemp(suffix=ext)
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(text)
        return result_path
    finally:
        # Clean up Marker's working directory on success and on error alike.
        shutil.rmtree(out_dir, ignore_errors=True)

# Gradio Interface
# Input widgets: the PDF to convert and the desired output format.
_pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
_format_input = gr.Radio(
    label="Output format",
    choices=["markdown", "json", "docx"],
    value="markdown",
)

# Help text shown under the title.
_description = (
    "Upload a PDF (with images & math). "
    "Choose **Markdown** or **JSON** to get text + LaTeX math and extracted images. "
    "Or choose **DOCX** to get a Word document with everything embedded."
)

# Wire the widgets to `process_upload`; the result is offered as a download.
demo = gr.Interface(
    fn=process_upload,
    inputs=[_pdf_input, _format_input],
    outputs=gr.File(label="Download Result"),
    title="PDF → Markdown/JSON/DOCX Converter",
    description=_description,
)

if __name__ == "__main__":
    # Bind to all interfaces on port 7860; no public share link.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)