import os
import shutil
import subprocess
import tempfile
import json

import pypandoc
import gradio as gr

def run_marker(pdf_path, out_dir, fmt):
    """Invoke the ``marker_single`` CLI on one PDF.

    Args:
        pdf_path: Path of the PDF to convert.
        out_dir: Directory passed to Marker as ``--output_dir``.
        fmt: Value passed to Marker as ``--output_format``.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    options = [
        "--output_format", fmt,
        "--output_dir", out_dir,
        "--extract_images",  # ask Marker to save images
        "--paginate_output",
    ]
    # Argument list (no shell), so paths are passed through verbatim.
    subprocess.run(["marker_single", pdf_path, *options], check=True)

def collect_outputs(out_dir, ext):
    """Recursively gather all files under *out_dir* ending in *ext*.

    The match is done against the lower-cased file name, so *ext* should be
    given in lower case (e.g. ``".md"``).

    Args:
        out_dir: Root directory to walk.
        ext: File-name suffix to match (anything ``str.endswith`` accepts).

    Returns:
        A list of full paths in a deterministic order: directories are
        visited top-down in sorted order, and files are sorted by name
        within each directory. Empty if nothing matches.
    """
    collected = []
    for root, dirs, files in os.walk(out_dir):
        # os.walk yields sub-directories in arbitrary OS order; sorting the
        # list in place fixes the traversal order so results are stable.
        dirs.sort()
        collected.extend(
            os.path.join(root, fname)
            for fname in sorted(files)
            if fname.lower().endswith(ext)
        )
    return collected

def _write_result_file(text, suffix):
    """Write *text* (UTF-8) to a fresh temp file with *suffix*; return its path."""
    fd, path = tempfile.mkstemp(suffix=suffix)
    with os.fdopen(fd, "w", encoding="utf-8") as f:
        f.write(text)
    return path


def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF and return the path of the result file.

    Marker is run into a private temp directory, the per-format output files
    are read back, and a single downloadable file is produced. The Marker
    temp directory is always removed, even when conversion fails.

    Args:
        pdf_file: The uploaded file. Either an object exposing the on-disk
            path as ``.name`` or a plain path string is accepted.
        output_format: One of ``"markdown"``, ``"json"`` or ``"docx"``.

    Returns:
        Path to a newly created ``.md``, ``.json`` or ``.docx`` file. The
        file is created outside the Marker temp directory and is not deleted
        by this function.

    Raises:
        ValueError: If no file was supplied or the format is unknown.
        subprocess.CalledProcessError: If Marker exits non-zero.
    """
    if pdf_file is None:
        raise ValueError("No PDF file was uploaded.")
    if output_format not in ("markdown", "json", "docx"):
        raise ValueError(f"Unsupported output format: {output_format!r}")

    # Accept both a file wrapper (path in .name) and a bare path string.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # DOCX is produced from Marker's markdown output, so only JSON needs a
    # different Marker format / file extension.
    wants_json = output_format == "json"
    marker_fmt = "json" if wants_json else "markdown"
    ext = ".json" if wants_json else ".md"

    # 1) Temp dir for Marker outputs; removed in the finally clause below.
    out_dir = tempfile.mkdtemp()
    try:
        run_marker(pdf_path, out_dir, marker_fmt)

        # 2) Read every matching output file.
        page_paths = collect_outputs(out_dir, ext)
        pages = []
        for path in page_paths:
            with open(path, "r", encoding="utf-8") as f:
                pages.append(f.read())

        # 3) Markdown: join pages with a horizontal rule between them.
        if output_format == "markdown":
            return _write_result_file("\n\n---\n\n".join(pages), ".md")

        # 4) JSON: wrap the raw page texts in a single object.
        if wants_json:
            payload = json.dumps({"pages": pages}, indent=2, ensure_ascii=False)
            return _write_result_file(payload, ".json")

        # 5) DOCX: join the markdown into one file, then convert via Pandoc.
        md_path = os.path.join(out_dir, "combined.md")
        with open(md_path, "w", encoding="utf-8") as f:
            f.write("\n\n---\n\n".join(pages))

        # Relative image links in the markdown are resolved against the
        # resource path. Besides out_dir itself, include every directory a
        # markdown file was found in, in case the outputs (and their images)
        # live in sub-directories of out_dir.
        search_dirs = dict.fromkeys(
            [out_dir, *(os.path.dirname(p) for p in page_paths)]
        )
        resource_path = os.pathsep.join(search_dirs)

        # mkstemp creates the file atomically (mktemp only returns a name
        # and is race-prone); Pandoc then overwrites it.
        fd, docx_path = tempfile.mkstemp(suffix=".docx")
        os.close(fd)
        try:
            pypandoc.convert_file(
                md_path,
                "docx",
                outputfile=docx_path,
                extra_args=[f"--resource-path={resource_path}"],
            )
        except Exception:
            # Don't leave an empty .docx behind when conversion fails.
            if os.path.exists(docx_path):
                os.remove(docx_path)
            raise
        return docx_path
    finally:
        # The results above live outside out_dir, so this is safe on all paths.
        shutil.rmtree(out_dir, ignore_errors=True)

# Gradio Interface: one PDF upload plus a format selector in, one file out.
demo = gr.Interface(
    title="PDF → Markdown/JSON/DOCX Converter",
    description=(
        "Upload a PDF (even with images & math). "
        "Choose **Markdown** or **JSON** to get text + LaTeX math and extracted images. "
        "Or choose **DOCX** to get a Word document with everything embedded."
    ),
    fn=process_upload,
    inputs=[
        # Upload widget restricted to .pdf files.
        gr.File(file_types=[".pdf"], label="Upload PDF"),
        # Format selector; "markdown" is preselected.
        gr.Radio(
            label="Output format",
            value="markdown",
            choices=["markdown", "json", "docx"],
        ),
    ],
    outputs=gr.File(label="Download Result"),
)

if __name__ == "__main__":
    # Listen on all interfaces, port 7860, when run as a script.
    demo.launch(server_port=7860, server_name="0.0.0.0")