euler314 committed on
Commit
0532015
·
verified ·
1 Parent(s): ff1f350

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -42
app.py CHANGED
@@ -1,56 +1,47 @@
1
- # app.py
2
- import fitz # PyMuPDF
3
- from markdownify import markdownify as md
4
- import json
5
  import gradio as gr
6
 
7
- def convert_pdf_to_markdown(path):
8
- """Extract each page as HTML, convert to Markdown."""
9
- doc = fitz.open(path)
10
- pages_md = []
11
- for i, page in enumerate(doc, start=1):
12
- html = page.get_text("html") or ""
13
- # Convert the page HTML to Markdown and strip leading/trailing whitespace
14
- page_md = md(html).strip()
15
- pages_md.append({"page": i, "markdown": page_md})
16
- return pages_md
17
-
18
  def process_upload(pdf_file, output_format):
19
- """
20
- pdf_file: tempfile-like object from Gradio
21
- output_format: "markdown" or "json"
22
- """
23
- # Convert and collect
24
- pages = convert_pdf_to_markdown(pdf_file.name)
25
-
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  if output_format == "markdown":
27
- # Join all pages
28
- full_md = "\n\n---\n\n".join(p["markdown"] for p in pages)
29
- return full_md
30
  else:
31
- # Return pretty JSON
32
- return json.dumps({"pages": pages}, indent=2, ensure_ascii=False)
33
 
34
- # Gradio interface
35
  demo = gr.Interface(
36
  fn=process_upload,
37
  inputs=[
38
- gr.File(label="Upload your PDF", file_types=[".pdf"]),
39
- gr.Radio(choices=["markdown", "json"],
40
- value="markdown",
41
- label="Output format")
42
  ],
43
- outputs=gr.Code(label="Converted Output"),
44
- title="PDF → Markdown/JSON Converter",
45
  description=(
46
- "Upload a PDF and get back a professionally converted Markdown "
47
- "or a structured JSON with each page’s Markdown. "
48
- "PDFs with images or complex tables may still need manual review."
49
- ),
50
- examples=[
51
- # you can add example PDFs here if desired
52
- ]
53
  )
54
 
55
- if __name__ == "__main__":
56
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
+ import os, shutil, subprocess, tempfile, json
 
 
 
2
  import gradio as gr
3
 
 
 
 
 
 
 
 
 
 
 
 
4
  def process_upload(pdf_file, output_format):
5
+ # Create temp output directory
6
+ out_dir = tempfile.mkdtemp()
7
+ # Run Marker CLI: outputs files marker-0000.md or marker-0000.json
8
+ fmt = "markdown" if output_format=="markdown" else "json"
9
+ cmd = [
10
+ "marker_single",
11
+ pdf_file.name,
12
+ "--output_format", fmt,
13
+ "--output_dir", out_dir,
14
+ "--paginate_output" # page separators
15
+ ]
16
+ subprocess.run(cmd, check=True)
17
+ # Read and combine results
18
+ results = []
19
+ for fname in sorted(os.listdir(out_dir)):
20
+ path = os.path.join(out_dir, fname)
21
+ with open(path, 'r', encoding='utf-8') as f:
22
+ results.append(f.read())
23
+ # Cleanup
24
+ shutil.rmtree(out_dir)
25
  if output_format == "markdown":
26
+ return "\n\n---\n\n".join(results)
 
 
27
  else:
28
+ # If JSON, combine into list of pages
29
+ return json.dumps({"pages": results}, indent=2, ensure_ascii=False)
30
 
 
31
  demo = gr.Interface(
32
  fn=process_upload,
33
  inputs=[
34
+ gr.File(label="Upload PDF", file_types=[".pdf"]),
35
+ gr.Radio(["markdown","json"], value="markdown", label="Output format")
 
 
36
  ],
37
+ outputs=gr.Code(label="Output"),
38
+ title="PDF → Markdown/JSON with LaTeX Support",
39
  description=(
40
+ "Uploads a PDF and uses Marker to extract text, structure, and LaTeX math. "
41
+ "Choose Markdown to get a single .md with `$...$`/`$$...$$` math, "
42
+ "or JSON for a page-by-page array."
43
+ )
 
 
 
44
  )
45
 
46
+ if __name__=="__main__":
47
  demo.launch(server_name="0.0.0.0", server_port=7860)