Spaces:
Running
Running
File size: 1,667 Bytes
f25ee15 683fa93 e219826 0532015 f25ee15 0532015 f25ee15 0532015 f25ee15 0532015 f25ee15 0532015 f25ee15 e219826 f25ee15 e219826 f25ee15 e219826 683fa93 e219826 0532015 e219826 f25ee15 0532015 e219826 f25ee15 0532015 683fa93 0532015 683fa93 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
import os
import shutil
import subprocess
import tempfile
import json
import gradio as gr
def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF to Markdown or JSON text via ``marker_single``.

    The converter is run as a subprocess writing into a scratch directory;
    every generated file of the requested type is read back and the scratch
    directory is removed before returning, whether or not conversion succeeds.

    Args:
        pdf_file: The uploaded file. Either an object exposing its path as
            ``.name`` or a plain path string is accepted.
        output_format: ``"markdown"`` selects Markdown output; any other
            value selects JSON.

    Returns:
        For Markdown, the text of every generated ``.md`` file joined with a
        ``---`` separator. Otherwise a JSON string of the form
        ``{"pages": [...]}`` whose entries are the raw text of each generated
        ``.json`` file (the file contents are not parsed).

    Raises:
        ValueError: If no file was supplied.
        subprocess.CalledProcessError: If ``marker_single`` exits non-zero.
    """
    if pdf_file is None:
        raise ValueError("No PDF file was uploaded.")

    # NOTE(review): depending on the Gradio version/configuration the File
    # input may hand over a path string instead of a wrapper with ``.name``;
    # accept both.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    fmt = "markdown" if output_format == "markdown" else "json"
    suffix = ".md" if fmt == "markdown" else ".json"

    # The context manager removes the scratch directory even when the
    # converter fails, so failed runs do not leave files behind.
    with tempfile.TemporaryDirectory() as out_dir:
        cmd = [
            "marker_single",
            pdf_path,
            "--output_format", fmt,
            "--output_dir", out_dir,
            "--paginate_output",
        ]
        subprocess.run(cmd, check=True)

        # Recursively read only the files matching the requested format.
        pages = []
        for root, dirs, files in os.walk(out_dir):
            dirs.sort()  # visit subdirectories in a deterministic order
            for fname in sorted(files):
                if fname.lower().endswith(suffix):
                    path = os.path.join(root, fname)
                    with open(path, "r", encoding="utf-8") as f:
                        pages.append(f.read())

    if fmt == "markdown":
        return "\n\n---\n\n".join(pages)
    return json.dumps({"pages": pages}, indent=2, ensure_ascii=False)
# Gradio front end: a PDF upload and a format selector are passed to
# process_upload, and the returned text is displayed in a code viewer.
demo = gr.Interface(
    title="PDF → Markdown/JSON with LaTeX Support",
    description=(
        "Upload a PDF and get back Markdown or structured JSON, "
        "with math preserved as LaTeX."
    ),
    fn=process_upload,
    inputs=[
        # Upload widget restricted to .pdf files.
        gr.File(file_types=[".pdf"], label="Upload PDF"),
        # Format selector; "markdown" is preselected.
        gr.Radio(["markdown", "json"], label="Output format", value="markdown"),
    ],
    outputs=gr.Code(label="Converted Output"),
)
# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    # Listen on all interfaces on port 7860.
    demo.launch(server_port=7860, server_name="0.0.0.0")
|