Spaces:
Sleeping
Sleeping
File size: 2,986 Bytes
f25ee15 b89a1c3 683fa93 b89a1c3 0532015 b89a1c3 0532015 b89a1c3 f25ee15 0532015 f25ee15 b89a1c3 f25ee15 b89a1c3 f25ee15 b89a1c3 f25ee15 b89a1c3 f25ee15 b89a1c3 0532015 f25ee15 b89a1c3 f25ee15 b89a1c3 e219826 f25ee15 e219826 f25ee15 e219826 b89a1c3 683fa93 e219826 0532015 b89a1c3 e219826 b89a1c3 e219826 b89a1c3 0532015 683fa93 b89a1c3 683fa93 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import os
import shutil
import subprocess
import tempfile
import json
import pypandoc
import gradio as gr
def run_marker(pdf_path, out_dir, fmt):
    """Convert a single PDF by invoking the ``marker_single`` command-line tool.

    Args:
        pdf_path: Path of the PDF handed to ``marker_single``.
        out_dir: Directory passed as ``--output_dir``.
        fmt: Value passed as ``--output_format``.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (``check=True``).
    """
    marker_args = ["marker_single", pdf_path]
    marker_args += ["--output_format", fmt]
    marker_args += ["--output_dir", out_dir]
    # Flags requesting that images be saved and that the output be paginated.
    marker_args += ["--extract_images", "--paginate_output"]
    subprocess.run(marker_args, check=True)
def collect_outputs(out_dir, ext):
    """Recursively gather all files under ``out_dir`` whose name ends with ``ext``.

    The match is case-insensitive on both the file name and ``ext``.

    Args:
        out_dir: Root directory to search.
        ext: File-name suffix to match, e.g. ``".md"``.

    Returns:
        A list of paths (each prefixed with ``out_dir``) in a deterministic
        order: directories are visited top-down in sorted order, and the files
        inside each directory are sorted by name.
    """
    suffix = ext.lower()
    collected = []
    for root, dirs, files in os.walk(out_dir):
        # os.walk lists sub-directories in arbitrary filesystem order; sorting
        # ``dirs`` in place pins the traversal order so callers that join the
        # results get the same sequence on every run.
        dirs.sort()
        collected.extend(
            os.path.join(root, fname)
            for fname in sorted(files)
            if fname.lower().endswith(suffix)
        )
    return collected
def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF and return the path of the resulting file.

    Marker is run into a temporary directory, every produced page file is read
    back, and the pages are combined into one downloadable file.

    Args:
        pdf_file: The upload handed over by Gradio: either an object exposing
            the file path as ``.name`` or a plain path string.
        output_format: One of ``"markdown"``, ``"json"`` or ``"docx"``.

    Returns:
        Path to a temporary ``.md``, ``.json`` or ``.docx`` file holding the
        result. A path (not the text itself) is returned because the
        interface's output component is a file download.

    Raises:
        gr.Error: If no file was uploaded.
        KeyError: If ``output_format`` is not a supported format.
        subprocess.CalledProcessError: If Marker exits with an error.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file first.")
    # Accept both a tempfile-like object (path in ``.name``) and a path string.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # DOCX is produced by Pandoc from Marker's markdown, so Marker itself is
    # asked for markdown in that case. Looked up before any temp dir is made so
    # an unsupported format cannot leak one.
    marker_formats = {"markdown": "markdown", "json": "json", "docx": "markdown"}
    fmt = marker_formats[output_format]
    ext = ".json" if fmt == "json" else ".md"

    # 1) Temp dir for Marker outputs; removed in ``finally`` on every path.
    out_dir = tempfile.mkdtemp()
    try:
        run_marker(pdf_path, out_dir, fmt)

        # 2) Read pages
        page_paths = collect_outputs(out_dir, ext)
        pages = []
        for path in page_paths:
            with open(path, "r", encoding="utf-8") as f:
                pages.append(f.read())

        # 3) If Word requested, first join markdown then convert.
        if output_format == "docx":
            md_path = os.path.join(out_dir, "combined.md")
            with open(md_path, "w", encoding="utf-8") as f:
                f.write("\n\n---\n\n".join(pages))

            # Pages may live in sub-directories of out_dir; presumably their
            # image links are relative to the page file, so every page
            # directory is added to Pandoc's resource search path.
            resource_dirs = [out_dir]
            for path in page_paths:
                page_dir = os.path.dirname(path)
                if page_dir not in resource_dirs:
                    resource_dirs.append(page_dir)

            # mkstemp creates the file atomically (mktemp is race-prone and
            # deprecated); Pandoc then overwrites it.
            fd, docx_path = tempfile.mkstemp(suffix=".docx")
            os.close(fd)
            try:
                pypandoc.convert_file(
                    md_path,
                    "docx",
                    outputfile=docx_path,
                    extra_args=["--resource-path=" + os.pathsep.join(resource_dirs)],
                )
            except Exception:
                # Do not leave an empty .docx behind when conversion fails.
                os.remove(docx_path)
                raise
            return docx_path

        # 4) Non-docx: join markdown, or wrap the page texts in a JSON object.
        if output_format == "markdown":
            text = "\n\n---\n\n".join(pages)
        else:
            text = json.dumps({"pages": pages}, indent=2, ensure_ascii=False)

        # Write the text to a file that outlives out_dir so it can be downloaded.
        fd, result_path = tempfile.mkstemp(suffix=ext)
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(text)
        return result_path
    finally:
        # Clean up Marker outputs whether conversion succeeded or failed.
        shutil.rmtree(out_dir, ignore_errors=True)
# Gradio Interface: a PDF upload plus an output-format picker, wired to
# process_upload, with the result offered as a file download.
demo = gr.Interface(
    title="PDF → Markdown/JSON/DOCX Converter",
    description=(
        "Upload a PDF (even with images & math). "
        "Choose **Markdown** or **JSON** to get text + LaTeX math and extracted images. "
        "Or choose **DOCX** to get a Word document with everything embedded."
    ),
    fn=process_upload,
    inputs=[
        gr.File(file_types=[".pdf"], label="Upload PDF"),
        gr.Radio(
            label="Output format",
            choices=["markdown", "json", "docx"],
            value="markdown",
        ),
    ],
    outputs=gr.File(label="Download Result"),
)

if __name__ == "__main__":
    # Serve on all network interfaces, port 7860.
    demo.launch(server_port=7860, server_name="0.0.0.0")
|