Spaces:

euler314
/

file_extension_change

Sleeping

File size: 2,986 Bytes

f25ee15
 
 
 
 
b89a1c3
 
683fa93
 
b89a1c3
 
0532015
 
b89a1c3
0532015
 
b89a1c3
f25ee15
0532015
 
f25ee15
b89a1c3
 
f25ee15
 
 
b89a1c3
f25ee15
b89a1c3
 
 
 
 
 
 
f25ee15
b89a1c3
 
f25ee15
b89a1c3
0532015
f25ee15
 
b89a1c3
 
 
 
 
 
 
 
 
 
f25ee15
b89a1c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e219826
f25ee15
e219826
f25ee15
e219826
b89a1c3
683fa93
e219826
 
0532015
b89a1c3
 
 
e219826
b89a1c3
 
e219826
b89a1c3
 
 
0532015
683fa93
 
b89a1c3
683fa93

import os
import shutil
import subprocess
import tempfile
import json

import pypandoc
import gradio as gr

def run_marker(pdf_path, out_dir, fmt):
    """Invoke the ``marker_single`` command-line tool on a single PDF.

    Args:
        pdf_path: Path of the PDF handed to ``marker_single``.
        out_dir: Directory passed as ``--output_dir``.
        fmt: Value passed as ``--output_format``.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (``check=True``).
    """
    # Options that carry a value.
    valued_options = ["--output_format", fmt, "--output_dir", out_dir]
    # Bare switches; the image switch is meant to make sure images get saved.
    switches = ["--extract_images", "--paginate_output"]

    argv = ["marker_single", pdf_path, *valued_options, *switches]
    subprocess.run(argv, check=True)

def collect_outputs(out_dir, ext):
    """Recursively gather all files under ``out_dir`` with the given extension.

    The extension comparison is case-insensitive on both sides. Results are
    returned in a deterministic order: directories are visited top-down in
    sorted order, and the files inside each directory are sorted by name.

    Args:
        out_dir: Root directory to search.
        ext: Filename suffix to match, e.g. ``".md"``.

    Returns:
        A list of matching file paths, each joined onto the directory it was
        found in. Empty when nothing matches.
    """
    suffix = ext.lower()
    collected = []
    for root, dirs, files in os.walk(out_dir):
        # Sorting in place makes os.walk descend in a stable order instead of
        # whatever order the filesystem happens to list the sub-directories.
        dirs.sort()
        collected.extend(
            os.path.join(root, fname)
            for fname in sorted(files)
            if fname.lower().endswith(suffix)
        )
    return collected

def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF and return the path of the generated file.

    Marker is run into a scratch directory, its output files are read back,
    and the result is written to a fresh temporary file whose path is
    returned (the Gradio output component is a file download, so it needs a
    path rather than the document text).

    Args:
        pdf_file: The upload from Gradio; either a filesystem path or an
            object exposing the path as ``.name``.
        output_format: One of ``"markdown"``, ``"json"`` or ``"docx"``.

    Returns:
        Path to a temporary ``.md``, ``.json`` or ``.docx`` file.

    Raises:
        ValueError: If no file was uploaded.
        KeyError: If ``output_format`` is not a supported format.
        subprocess.CalledProcessError: If ``marker_single`` fails.
    """
    if pdf_file is None:
        raise ValueError("No PDF file was uploaded.")

    # Requested format -> (format Marker should emit, extension to collect).
    # DOCX has no Marker mode of its own: it is built from Marker's Markdown.
    marker_fmt, ext = {
        "markdown": ("markdown", ".md"),
        "json": ("json", ".json"),
        "docx": ("markdown", ".md"),
    }[output_format]

    # Accept both a plain path and a tempfile-like wrapper with ``.name``.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    # 1) Scratch dir for Marker outputs; always removed, even on failure.
    out_dir = tempfile.mkdtemp()
    try:
        run_marker(pdf_path, out_dir, marker_fmt)

        # 2) Read pages
        page_paths = collect_outputs(out_dir, ext)
        pages = []
        for path in page_paths:
            with open(path, "r", encoding="utf-8") as f:
                pages.append(f.read())

        # 3) If Word requested, first join markdown then convert.
        if output_format == "docx":
            md_path = os.path.join(out_dir, "combined.md")
            with open(md_path, "w", encoding="utf-8") as f:
                f.write("\n\n---\n\n".join(pages))

            # Relative image links in the Markdown are relative to the files
            # they came from, so let Pandoc search those directories too.
            resource_dirs = list(
                dict.fromkeys([out_dir, *(os.path.dirname(p) for p in page_paths)])
            )

            # mkstemp creates the file atomically (mktemp is race-prone);
            # Pandoc only needs the name, so close the descriptor right away.
            fd, docx_path = tempfile.mkstemp(suffix=".docx")
            os.close(fd)
            try:
                pypandoc.convert_file(
                    md_path,
                    "docx",
                    outputfile=docx_path,
                    extra_args=[
                        f"--resource-path={os.pathsep.join(resource_dirs)}"
                    ],
                )
            except Exception:
                # Do not leave an empty .docx behind when conversion fails.
                os.remove(docx_path)
                raise
            return docx_path

        # 4) Non-docx: join or wrap JSON, then persist for download.
        if output_format == "markdown":
            text = "\n\n---\n\n".join(pages)
        else:
            text = json.dumps({"pages": pages}, indent=2, ensure_ascii=False)

        fd, result_path = tempfile.mkstemp(suffix=ext)
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(text)
        return result_path
    finally:
        shutil.rmtree(out_dir, ignore_errors=True)

# Gradio Interface
# Input widgets, in the order process_upload receives them:
# the uploaded PDF first, then the chosen output format.
_INPUT_COMPONENTS = [
    gr.File(label="Upload PDF", file_types=[".pdf"]),
    gr.Radio(
        choices=["markdown", "json", "docx"],
        value="markdown",
        label="Output format",
    ),
]

# Help text shown with the interface.
_DESCRIPTION = (
    "Upload a PDF (even with images & math). "
    "Choose **Markdown** or **JSON** to get text + LaTeX math and extracted images. "
    "Or choose **DOCX** to get a Word document with everything embedded."
)

demo = gr.Interface(
    fn=process_upload,
    inputs=_INPUT_COMPONENTS,
    outputs=gr.File(label="Download Result"),
    title="PDF → Markdown/JSON/DOCX Converter",
    description=_DESCRIPTION,
)

if __name__ == "__main__":
    # Serve on every network interface, port 7860, when run as a script.
    demo.launch(server_name="0.0.0.0", server_port=7860)