taprosoft committed on
Commit
9adfc08
·
1 Parent(s): d381432

fix: disable formula recognition and add env var to toggle

Browse files
backends/docling.py CHANGED
@@ -10,7 +10,7 @@ from docling.datamodel.settings import settings
10
  from docling.document_converter import DocumentConverter, PdfFormatOption
11
  from docling_core.types.doc import ImageRefMode
12
 
13
- from .settings import ENABLE_DEBUG_MODE
14
 
15
  DOCLING_DEBUG_PATH = Path("/tmp/docling")
16
 
@@ -20,7 +20,7 @@ pipeline_options = PdfPipelineOptions()
20
  pipeline_options.accelerator_options = accelerator_options
21
  pipeline_options.do_ocr = True
22
  pipeline_options.do_table_structure = True
23
- pipeline_options.do_formula_enrichment = True
24
  pipeline_options.generate_picture_images = True
25
  pipeline_options.images_scale = 2.0
26
 
 
10
  from docling.document_converter import DocumentConverter, PdfFormatOption
11
  from docling_core.types.doc import ImageRefMode
12
 
13
+ from .settings import ENABLE_DEBUG_MODE, ENABLE_FORMULA
14
 
15
  DOCLING_DEBUG_PATH = Path("/tmp/docling")
16
 
 
20
  pipeline_options.accelerator_options = accelerator_options
21
  pipeline_options.do_ocr = True
22
  pipeline_options.do_table_structure = True
23
+ pipeline_options.do_formula_enrichment = ENABLE_FORMULA
24
  pipeline_options.generate_picture_images = True
25
  pipeline_options.images_scale = 2.0
26
 
backends/marker.py CHANGED
@@ -6,11 +6,18 @@ from pathlib import Path
6
  from marker.converters.pdf import PdfConverter
7
  from marker.models import create_model_dict
8
  from marker.output import text_from_rendered
 
9
  from marker.settings import settings
10
 
11
- from .settings import ENABLE_DEBUG_MODE
12
 
13
  # Marker init
 
 
 
 
 
 
14
  marker_converter = PdfConverter(
15
  artifact_dict=create_model_dict(),
16
  config={
 
6
  from marker.converters.pdf import PdfConverter
7
  from marker.models import create_model_dict
8
  from marker.output import text_from_rendered
9
+ from marker.processors.equation import EquationProcessor
10
  from marker.settings import settings
11
 
12
+ from .settings import ENABLE_DEBUG_MODE, ENABLE_FORMULA
13
 
14
  # Marker init
15
+ if not ENABLE_FORMULA:
16
+ PdfConverter.default_processors = (
17
+ processor
18
+ for processor in PdfConverter.default_processors
19
+ if processor != EquationProcessor
20
+ )
21
  marker_converter = PdfConverter(
22
  artifact_dict=create_model_dict(),
23
  config={
backends/mineru.py CHANGED
@@ -7,7 +7,7 @@ import pymupdf
7
  from magic_pdf.data.data_reader_writer import FileBasedDataReader
8
  from magic_pdf.tools.common import do_parse, prepare_env
9
 
10
- from .settings import ENABLE_DEBUG_MODE
11
 
12
  MINERU_DEBUG_PATH = Path("/tmp/mineru")
13
  MINERU_DEBUG_PATH.mkdir(exist_ok=True)
@@ -52,7 +52,7 @@ def do_process_mineru(input_path, output_dir):
52
  f_dump_orig_pdf=False,
53
  f_draw_layout_bbox=ENABLE_DEBUG_MODE,
54
  f_draw_char_bbox=False,
55
- formula_enable=True,
56
  table_enable=True,
57
  )
58
  return local_md_dir, file_name
 
7
  from magic_pdf.data.data_reader_writer import FileBasedDataReader
8
  from magic_pdf.tools.common import do_parse, prepare_env
9
 
10
+ from .settings import ENABLE_DEBUG_MODE, ENABLE_FORMULA
11
 
12
  MINERU_DEBUG_PATH = Path("/tmp/mineru")
13
  MINERU_DEBUG_PATH.mkdir(exist_ok=True)
 
52
  f_dump_orig_pdf=False,
53
  f_draw_layout_bbox=ENABLE_DEBUG_MODE,
54
  f_draw_char_bbox=False,
55
+ formula_enable=ENABLE_FORMULA,
56
  table_enable=True,
57
  )
58
  return local_md_dir, file_name
backends/settings.py CHANGED
@@ -1,3 +1,4 @@
1
  import os
2
 
3
  ENABLE_DEBUG_MODE = os.environ.get("ENABLE_DEBUG_MODE", "True").lower() == "true"
 
 
1
  import os
2
 
3
  ENABLE_DEBUG_MODE = os.environ.get("ENABLE_DEBUG_MODE", "True").lower() == "true"
4
+ ENABLE_FORMULA = os.environ.get("ENABLE_FORMULA", "False").lower() == "true"