taprosoft committed on
Commit
acbe414
·
1 Parent(s): df456bd

feat: add img2table gmft

Browse files
app.py CHANGED
@@ -1,3 +1,7 @@
 
 
 
 
1
  from utils import fix_problematic_imports, prepare_env_mineru
2
 
3
  fix_problematic_imports() # noqa
@@ -13,18 +17,23 @@ from gradio_pdf import PDF
13
 
14
  from backends import (
15
  convert_docling,
 
 
 
16
  convert_marker,
17
  convert_mineru,
18
  convert_unstructured,
 
19
  )
20
  from backends.settings import ENABLE_DEBUG_MODE
21
  from utils import remove_images_from_markdown, trim_pages
22
 
23
  TRIMMED_PDF_PATH = Path("/tmp/trimmed_input")
24
  TRIMMED_PDF_PATH.mkdir(exist_ok=True)
 
25
 
26
 
27
- def convert_document(path, method, enabled=True):
28
  if enabled:
29
  print("Processing file", path, "with method", method)
30
  else:
@@ -33,7 +42,11 @@ def convert_document(path, method, enabled=True):
33
  # benchmarking
34
  start = time.time()
35
 
36
- path = trim_pages(path, output_path=TRIMMED_PDF_PATH)
 
 
 
 
37
  file_name = Path(path).stem
38
  debug_image_paths = []
39
  text = "unknown method"
@@ -51,6 +64,16 @@ def convert_document(path, method, enabled=True):
51
  )
52
  elif method == "MinerU":
53
  text, debug_image_paths = convert_mineru(path, file_name)
 
 
 
 
 
 
 
 
 
 
54
 
55
  duration = time.time() - start
56
  duration_message = f"Conversion with {method} took *{duration:.2f} seconds*"
@@ -63,6 +86,51 @@ def convert_document(path, method, enabled=True):
63
  )
64
 
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  def show_tabs(selected_methods):
67
  visible_tabs = []
68
  for method in SUPPORTED_METHODS:
@@ -79,14 +147,25 @@ latex_delimiters = [
79
  # startup test (also for loading models the first time)
80
  start_startup = time.time()
81
  WARMUP_PDF_PATH = "table.pdf"
82
- SUPPORTED_METHODS = ["PyMuPDF", "Docling", "Marker", "MinerU", "Unstructured"]
 
 
 
 
 
 
 
 
 
 
83
 
84
- print("Warm-up sequence")
85
- for method in SUPPORTED_METHODS:
86
- for _ in range(1):
87
- convert_document(WARMUP_PDF_PATH, method)
88
- startup_duration = time.time() - start_startup
89
- print(f"Total start-up time: {startup_duration:.2f} seconds")
 
90
 
91
  with gr.Blocks(
92
  theme=gr.themes.Ocean(),
@@ -106,7 +185,28 @@ with gr.Blocks(
106
  ".pdf",
107
  ],
108
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  progress_status = gr.Markdown("", show_label=False, container=False)
 
 
 
 
 
110
 
111
  with gr.Column(variant="panel", scale=5):
112
  with gr.Row():
@@ -116,12 +216,6 @@ with gr.Blocks(
116
  value=SUPPORTED_METHODS[:2],
117
  multiselect=True,
118
  )
119
- with gr.Row():
120
- visual_checkbox = gr.Checkbox(
121
- label="Enable debug visualization",
122
- visible=ENABLE_DEBUG_MODE,
123
- value=True,
124
- )
125
  with gr.Row():
126
  convert_btn = gr.Button("Convert", variant="primary", scale=2)
127
  clear_btn = gr.ClearButton(value="Clear", scale=1)
@@ -210,11 +304,14 @@ with gr.Blocks(
210
 
211
  return msg
212
 
213
- def process_method(input_file, selected_methods, method=method):
214
  if input_file is None:
215
  raise ValueError("Please upload a PDF file first!")
216
  return convert_document(
217
- input_file, method=method, enabled=method in selected_methods
 
 
 
218
  )
219
 
220
  click_event = click_event.then(
@@ -222,25 +319,35 @@ with gr.Blocks(
222
  inputs=[methods],
223
  outputs=[progress_status],
224
  ).then(
225
- fn=lambda input_file, methods, method=method: process_method(
226
- input_file, methods, method
227
  ),
228
- inputs=[input_file, methods],
229
  outputs=output_components[idx * 4 : (idx + 1) * 4],
230
  )
231
 
232
- click_event.then(
233
- lambda: "All tasks completed.",
234
- outputs=[progress_status],
 
 
 
 
 
235
  )
236
 
237
  clear_btn.add(
238
  [
239
  input_file,
240
  pdf_preview,
 
241
  ]
242
  + output_components
243
  )
 
 
 
 
244
 
245
  visual_checkbox.change(
246
  fn=lambda state: [gr.update(visible=state)] * len(visualization_sub_tabs),
@@ -248,4 +355,7 @@ with gr.Blocks(
248
  outputs=visualization_sub_tabs,
249
  )
250
 
251
- demo.launch(show_error=True)
 
 
 
 
1
+ import os
2
+ import zipfile
3
+ from collections import defaultdict
4
+
5
  from utils import fix_problematic_imports, prepare_env_mineru
6
 
7
  fix_problematic_imports() # noqa
 
17
 
18
  from backends import (
19
  convert_docling,
20
+ convert_gemini,
21
+ convert_gmft,
22
+ convert_img2table,
23
  convert_marker,
24
  convert_mineru,
25
  convert_unstructured,
26
+ convert_zerox,
27
  )
28
  from backends.settings import ENABLE_DEBUG_MODE
29
  from utils import remove_images_from_markdown, trim_pages
30
 
31
  TRIMMED_PDF_PATH = Path("/tmp/trimmed_input")
32
  TRIMMED_PDF_PATH.mkdir(exist_ok=True)
33
+ DO_WARMUP = os.getenv("DO_WARMUP", "True").lower() == "true"
34
 
35
 
36
+ def convert_document(path, method, start_page=0, enabled=True):
37
  if enabled:
38
  print("Processing file", path, "with method", method)
39
  else:
 
42
  # benchmarking
43
  start = time.time()
44
 
45
+ path = trim_pages(
46
+ path,
47
+ output_path=TRIMMED_PDF_PATH,
48
+ start_page=start_page,
49
+ )
50
  file_name = Path(path).stem
51
  debug_image_paths = []
52
  text = "unknown method"
 
64
  )
65
  elif method == "MinerU":
66
  text, debug_image_paths = convert_mineru(path, file_name)
67
+ elif method == "Gemini (API)":
68
+ text, debug_image_paths = convert_gemini(path, file_name)
69
+ elif method == "Zerox":
70
+ text, debug_image_paths = convert_zerox(path, file_name)
71
+ elif method == "Img2Table":
72
+ text, debug_image_paths = convert_img2table(path, file_name)
73
+ elif method == "GMFT":
74
+ text, debug_image_paths = convert_gmft(path, file_name)
75
+ else:
76
+ raise ValueError(f"Unsupported method: {method}")
77
 
78
  duration = time.time() - start
79
  duration_message = f"Conversion with {method} took *{duration:.2f} seconds*"
 
86
  )
87
 
88
 
89
def to_zip_file(file_path, methods, *output_components):
    """Bundle the selected methods' Markdown and debug images into one zip.

    Args:
        file_path: Path of the uploaded input file. The archive and the
            intermediate ``<method>.md`` files are written into its parent
            directory.
        methods: Names of the methods currently selected in the UI.
        *output_components: Flattened component values, four per entry of
            ``SUPPORTED_METHODS`` in the same order. Within each group of
            four, index 2 is read as the Markdown text and index 3 as the
            list of debug images.

    Returns:
        A ``gr.update`` that points the download component at the archive
        and makes it visible. When no selected method matches
        ``SUPPORTED_METHODS`` the archive is created empty.
    """
    output_dir = Path(file_path).parent
    zip_file_path = output_dir / "output.zip"

    with zipfile.ZipFile(zip_file_path, "w", zipfile.ZIP_DEFLATED) as zipf:
        for idx, method_name in enumerate(SUPPORTED_METHODS):
            if method_name not in methods:
                continue

            markdown_text = output_components[idx * 4 + 2]
            debug_images = output_components[idx * 4 + 3]

            # Each method gets its own Markdown file, and that file is added
            # to the archive right away so that every archive entry holds the
            # text of the method it is named after.
            markdown_path = output_dir / f"{method_name}.md"
            with open(markdown_path, "w", encoding="utf-8") as f:
                # A method that produced no text still gets an (empty) file.
                f.write(markdown_text or "")
            zipf.write(markdown_path, f"{method_name}/{method_name}.md")

            # Each debug image entry is unpacked as a 2-tuple whose first
            # item is the image file path; the second item is not used here.
            for debug_image_path, _ in debug_images or []:
                debug_image_name = Path(debug_image_path).name
                zipf.write(
                    debug_image_path,
                    f"{method_name}/{debug_image_name}",
                )

    return gr.update(
        value=str(zip_file_path),
        visible=True,
    )
132
+
133
+
134
  def show_tabs(selected_methods):
135
  visible_tabs = []
136
  for method in SUPPORTED_METHODS:
 
147
  # startup test (also for loading models the first time)
148
  start_startup = time.time()
149
  WARMUP_PDF_PATH = "table.pdf"
150
+ SUPPORTED_METHODS = [
151
+ "PyMuPDF",
152
+ "Docling",
153
+ "Marker",
154
+ "MinerU",
155
+ "Unstructured",
156
+ "Gemini (API)",
157
+ "Img2Table",
158
+ "GMFT",
159
+ # "Zerox"
160
+ ]
161
 
162
+ if DO_WARMUP:
163
+ print("Warm-up sequence")
164
+ for method in SUPPORTED_METHODS:
165
+ for _ in range(1):
166
+ convert_document(WARMUP_PDF_PATH, method)
167
+ startup_duration = time.time() - start_startup
168
+ print(f"Total start-up time: {startup_duration:.2f} seconds")
169
 
170
  with gr.Blocks(
171
  theme=gr.themes.Ocean(),
 
185
  ".pdf",
186
  ],
187
  )
188
+ with gr.Accordion(
189
+ "Advanced settings",
190
+ open=False,
191
+ ):
192
+ start_page = gr.Number(
193
+ label="Starting page (only max 5 consecutive pages are processed)",
194
+ minimum=1,
195
+ maximum=100,
196
+ step=1,
197
+ value=1,
198
+ )
199
+ visual_checkbox = gr.Checkbox(
200
+ label="Enable debug visualization",
201
+ visible=ENABLE_DEBUG_MODE,
202
+ value=True,
203
+ )
204
  progress_status = gr.Markdown("", show_label=False, container=False)
205
+ output_file = gr.File(
206
+ label="Download output",
207
+ interactive=False,
208
+ visible=False,
209
+ )
210
 
211
  with gr.Column(variant="panel", scale=5):
212
  with gr.Row():
 
216
  value=SUPPORTED_METHODS[:2],
217
  multiselect=True,
218
  )
 
 
 
 
 
 
219
  with gr.Row():
220
  convert_btn = gr.Button("Convert", variant="primary", scale=2)
221
  clear_btn = gr.ClearButton(value="Clear", scale=1)
 
304
 
305
  return msg
306
 
307
+ def process_method(input_file, start_page, selected_methods, method=method):
308
  if input_file is None:
309
  raise ValueError("Please upload a PDF file first!")
310
  return convert_document(
311
+ input_file,
312
+ method=method,
313
+ start_page=start_page - 1,
314
+ enabled=method in selected_methods,
315
  )
316
 
317
  click_event = click_event.then(
 
319
  inputs=[methods],
320
  outputs=[progress_status],
321
  ).then(
322
+ fn=lambda input_file, start_page, methods, method=method: process_method(
323
+ input_file, start_page, methods, method
324
  ),
325
+ inputs=[input_file, start_page, methods],
326
  outputs=output_components[idx * 4 : (idx + 1) * 4],
327
  )
328
 
329
+ click_event.then(lambda: "All tasks completed.", outputs=[progress_status],).then(
330
+ fn=to_zip_file,
331
+ inputs=[
332
+ input_file,
333
+ methods,
334
+ ]
335
+ + output_components,
336
+ outputs=[output_file],
337
  )
338
 
339
  clear_btn.add(
340
  [
341
  input_file,
342
  pdf_preview,
343
+ output_file,
344
  ]
345
  + output_components
346
  )
347
+ clear_btn.click(
348
+ fn=lambda: gr.update(visible=False),
349
+ outputs=[output_file],
350
+ )
351
 
352
  visual_checkbox.change(
353
  fn=lambda state: [gr.update(visible=state)] * len(visualization_sub_tabs),
 
355
  outputs=visualization_sub_tabs,
356
  )
357
 
358
+ demo.launch(
359
+ show_error=True,
360
+ max_file_size="50mb",
361
+ )
backends/__init__.py CHANGED
@@ -1,11 +1,19 @@
1
  from .docling import convert_docling
 
 
 
2
  from .marker import convert_marker
3
  from .mineru import convert_mineru
4
  from .unstructured import convert_unstructured
 
5
 
6
  __all__ = [
7
  "convert_docling",
8
  "convert_marker",
9
  "convert_mineru",
10
  "convert_unstructured",
 
 
 
 
11
  ]
 
1
  from .docling import convert_docling
2
+ from .gemini import convert_gemini
3
+ from .gmft import convert_gmft
4
+ from .img2table import convert_img2table
5
  from .marker import convert_marker
6
  from .mineru import convert_mineru
7
  from .unstructured import convert_unstructured
8
+ from .zerox import convert_zerox
9
 
10
  __all__ = [
11
  "convert_docling",
12
  "convert_marker",
13
  "convert_mineru",
14
  "convert_unstructured",
15
+ "convert_gemini",
16
+ "convert_zerox",
17
+ "convert_img2table",
18
+ "convert_gmft",
19
  ]
backends/gemini.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+
4
+ from google import genai
5
+ from google.genai import types
6
+
7
+ # Create a client
8
+ client = genai.Client(api_key=os.getenv("GEMINI_API_KEY", ""))
9
+ MODEL_NAME = "gemini-2.0-flash"
10
+ PROMPT = """
11
+ Convert the following document to markdown, preserving header, table and figure structure as much as possible.
12
+ Return only the markdown with no explanation text. Do not include delimiters like ```markdown or ```html.
13
+
14
+ RULES:
15
+ - You must include all information on the page. Do not exclude headers, footers, or subtext.
16
+ - Return tables in Markdown format.
17
+ - Must format headers / sub-headers in Markdown format (#, ##, etc).
18
+ - Attempt to merge line-breaks in to coherent paragraphs.
19
+ - Charts & infographics must be interpreted to a text-based markdown format. Prefer table format when applicable.
20
+ - Do not include any images URL / tag in the markdown.
21
+ - Page numbers should be wrapped in brackets. Ex: <page_number>14<page_number> or <page_number>9/22<page_number>
22
+ - Prefer using ☐ and ☑ for check boxes.
23
+ """ # noqa: E501
24
+
25
+
26
def convert_gemini(path: str, file_name: str):
    """Send the PDF at ``path`` to the Gemini model and return its text.

    Args:
        path: Location of the PDF file; its raw bytes are uploaded inline.
        file_name: Not used by this backend.

    Returns:
        A ``(text, debug_image_paths)`` tuple where ``text`` is
        ``response.text`` and the debug image list is always empty.
    """
    # Cap the response length; the config is handed over as a plain dict.
    config_dict = types.GenerationConfig(max_output_tokens=8192).to_json_dict()

    pdf_part = types.Part.from_bytes(
        data=Path(path).read_bytes(),
        mime_type="application/pdf",
    )
    response = client.models.generate_content(
        model=MODEL_NAME,
        contents=[PROMPT, pdf_part],
        config=config_dict,
    )
    return response.text, []
backends/gmft.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ from gmft.auto import AutoFormatConfig, AutoTableFormatter, CroppedTable, TableDetector
4
+ from gmft.pdf_bindings import PyPDFium2Document
5
+
6
+ from .settings import ENABLE_DEBUG_MODE
7
+
8
+ detector = TableDetector()
9
+ config = AutoFormatConfig()
10
+ config.semantic_spanning_cells = True # [Experimental] better spanning cells
11
+ config.enable_multi_header = True # multi-headers
12
+ formatter = AutoTableFormatter(config)
13
+
14
+
15
+ GMFT_DEBUG_PATH = Path("/tmp/gmft")
16
+ GMFT_DEBUG_PATH.mkdir(exist_ok=True)
17
+
18
+
19
def ingest_pdf(pdf_path) -> list[CroppedTable]:
    """Run the table detector over every page of the PDF at ``pdf_path``.

    Returns:
        All tables found by ``detector.extract``, in page order.
    """
    # NOTE(review): the document is opened here and never closed in this
    # function - confirm whether the caller is expected to release it.
    document = PyPDFium2Document(pdf_path)

    found = []
    for pdf_page in document:
        found.extend(detector.extract(pdf_page))
    return found
26
+
27
+
28
def convert_gmft(path: str, file_name: str):
    """Extract the tables of a PDF with GMFT and render them as HTML.

    Args:
        path: Location of the PDF file.
        file_name: Name of the per-document sub-directory created under
            ``GMFT_DEBUG_PATH`` for debug images.

    Returns:
        A ``(content, debug_image_paths)`` tuple: the HTML of all formatted
        tables joined by blank lines, and the paths of the table images
        saved when ``ENABLE_DEBUG_MODE`` is set.
    """
    detected_tables = ingest_pdf(path)
    html_chunks = []
    saved_images = []

    debug_dir = GMFT_DEBUG_PATH / file_name
    debug_dir.mkdir(exist_ok=True)

    for table_idx, cropped_table in enumerate(detected_tables):
        formatted = formatter.extract(cropped_table, dpi=72 * 2)

        frame = formatted.df()
        if frame is not None:
            html_chunks.append(frame.fillna("").to_html(index=False))

        # NOTE(review): the debug snapshot is taken for every table here,
        # independent of whether a DataFrame was produced - confirm against
        # the original nesting.
        if ENABLE_DEBUG_MODE:
            snapshot_path = debug_dir / f"table_{table_idx}.png"
            formatted.image().save(snapshot_path)
            saved_images.append(snapshot_path)

    return "\n\n".join(html_chunks), saved_images
backends/img2table.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ import cv2
4
+ from img2table.document import PDF
5
+ from img2table.ocr import SuryaOCR
6
+
7
+ from .settings import ENABLE_DEBUG_MODE
8
+
9
+ ocr = SuryaOCR(
10
+ langs=["en"],
11
+ )
12
+ IMG2TABLE_DEBUG_PATH = Path("/tmp/img2table")
13
+ IMG2TABLE_DEBUG_PATH.mkdir(exist_ok=True)
14
+
15
+
16
def convert_img2table(path: str, file_name: str):
    """Extract the tables of a PDF with img2table and return them as HTML.

    Args:
        path: Location of the PDF file.
        file_name: Name of the per-document sub-directory created under
            ``IMG2TABLE_DEBUG_PATH`` when debug mode is on.

    Returns:
        A ``(content, debug_image_paths)`` tuple: each table's title (or an
        empty string) followed by its HTML, joined by blank lines, and the
        paths of the annotated page images written in debug mode.
    """
    document = PDF(path)
    tables_by_page = document.extract_tables(
        ocr=ocr,
        implicit_rows=False,
        implicit_columns=False,
        borderless_tables=True,
        min_confidence=50,
    )
    annotated_paths = []

    if ENABLE_DEBUG_MODE:
        debug_dir = IMG2TABLE_DEBUG_PATH / file_name
        debug_dir.mkdir(exist_ok=True)

        page_images = document.images
        page_numbers = document.pages or range(len(page_images))
        for position, page_number in enumerate(page_numbers):
            canvas = page_images[position]
            # Outline every detected cell directly on the page image.
            for detected in tables_by_page[page_number]:
                for cells in detected.content.values():
                    for cell in cells:
                        top_left = (cell.bbox.x1, cell.bbox.y1)
                        bottom_right = (cell.bbox.x2, cell.bbox.y2)
                        cv2.rectangle(canvas, top_left, bottom_right, (0, 0, 255), 2)
            out_path = debug_dir / f"page_{position}.png"
            annotated_paths.append(out_path)
            cv2.imwrite(str(out_path), canvas)

    rendered = []
    for page_tables in tables_by_page.values():
        for detected in page_tables:
            rendered.append((detected.title or "") + "\n\n" + detected.html)

    return "\n\n".join(rendered), annotated_paths
backends/zerox.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import re
3
+ from pathlib import Path
4
+
5
+ from pyzerox import zerox
6
+
7
+
8
def remove_images_from_markdown(markdown_text):
    """Strip HTML ``<img ...>`` tags and Markdown ``![alt](target)`` images."""
    image_patterns = (
        r"<img[^>]*>",  # HTML image tag
        r"!\[[^\]]*\]\([^)]*\)",  # Markdown inline image
    )
    for pattern in image_patterns:
        markdown_text = re.sub(pattern, "", markdown_text)
    return markdown_text
13
+
14
+
15
+ ZEROX_DEBUG_PATH = Path("/tmp/zerox_debug")
16
+ ZEROX_DEBUG_PATH.mkdir(exist_ok=True)
17
+ MODEL_NAME = "gemini/gemini-2.0-flash"
18
+
19
+
20
def clean_up_html_code_block(text: str):
    """Delete every ```` ```html ```` marker and every remaining triple backtick."""
    # The longer marker goes first so its "html" suffix is removed with it.
    for fence in ("```html", "```"):
        text = text.replace(fence, "")
    return text
25
+
26
+
27
def convert_zerox(path: str, file_name: str):
    """Convert the PDF at ``path`` to Markdown with zerox.

    Args:
        path: Location of the PDF file.
        file_name: Name of the per-document output directory created under
            ``ZEROX_DEBUG_PATH``.

    Returns:
        A ``(text, debug_image_paths)`` tuple: the page contents joined by
        blank lines with code-fence markers and images stripped, and an
        always-empty debug image list.
    """
    target_dir = ZEROX_DEBUG_PATH / file_name
    target_dir.mkdir(exist_ok=True)

    async def _run_zerox():
        result = await zerox(
            concurrency=4,
            file_path=path,
            model=MODEL_NAME,
            output_dir=target_dir,
        )
        return result

    # zerox is awaited, so drive it to completion from this synchronous call.
    converted = asyncio.run(_run_zerox())

    page_texts = [page.content for page in converted.pages]
    merged = "\n\n".join(page_texts)
    return remove_images_from_markdown(clean_up_html_code_block(merged)), []
requirements.txt CHANGED
@@ -14,5 +14,7 @@ unstructured[pdf]
14
  ultralytics>=8.3.48
15
  transformers<5.0.0,>=4.45.2
16
  struct-eqtable==0.3.2
17
- openai
18
  doclayout_yolo==0.0.2b1
 
 
 
 
14
  ultralytics>=8.3.48
15
  transformers<5.0.0,>=4.45.2
16
  struct-eqtable==0.3.2
 
17
  doclayout_yolo==0.0.2b1
18
+ openai
19
+ opencv-contrib-python
20
+ gmft
utils.py CHANGED
@@ -14,14 +14,14 @@ def remove_images_from_markdown(markdown_text):
14
 
15
 
16
  @functools.lru_cache(maxsize=None)
17
- def trim_pages(pdf_path, output_path, trim_pages=5):
18
  doc = pymupdf.open(pdf_path)
19
  parent_dir_name = Path(pdf_path).parent.name
20
  output_file_path = Path(output_path) / f"{parent_dir_name}.pdf"
21
 
22
  num_pages = len(doc)
23
  if num_pages > trim_pages:
24
- to_select = list(range(trim_pages))
25
  doc.select(to_select)
26
  doc.ez_save(output_file_path)
27
  print("Trimmed pdf to with pages", to_select, "path", output_file_path)
 
14
 
15
 
16
  @functools.lru_cache(maxsize=None)
17
+ def trim_pages(pdf_path, output_path, start_page=0, trim_pages=5):
18
  doc = pymupdf.open(pdf_path)
19
  parent_dir_name = Path(pdf_path).parent.name
20
  output_file_path = Path(output_path) / f"{parent_dir_name}.pdf"
21
 
22
  num_pages = len(doc)
23
  if num_pages > trim_pages:
24
+ to_select = list(range(start_page, min(start_page + trim_pages, num_pages)))
25
  doc.select(to_select)
26
  doc.ez_save(output_file_path)
27
  print("Trimmed pdf to with pages", to_select, "path", output_file_path)