Spaces:
Running
on
T4
Running
on
T4
taprosoft
committed on
Commit
·
9adfc08
1
Parent(s):
d381432
fix: disable formula recognition and add env var to toggle
Browse files
- backends/docling.py +2 -2
- backends/marker.py +8 -1
- backends/mineru.py +2 -2
- backends/settings.py +1 -0
backends/docling.py
CHANGED
@@ -10,7 +10,7 @@ from docling.datamodel.settings import settings
|
|
10 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
11 |
from docling_core.types.doc import ImageRefMode
|
12 |
|
13 |
-
from .settings import ENABLE_DEBUG_MODE
|
14 |
|
15 |
DOCLING_DEBUG_PATH = Path("/tmp/docling")
|
16 |
|
@@ -20,7 +20,7 @@ pipeline_options = PdfPipelineOptions()
|
|
20 |
pipeline_options.accelerator_options = accelerator_options
|
21 |
pipeline_options.do_ocr = True
|
22 |
pipeline_options.do_table_structure = True
|
23 |
-
pipeline_options.do_formula_enrichment = True
|
24 |
pipeline_options.generate_picture_images = True
|
25 |
pipeline_options.images_scale = 2.0
|
26 |
|
|
|
10 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
11 |
from docling_core.types.doc import ImageRefMode
|
12 |
|
13 |
+
from .settings import ENABLE_DEBUG_MODE, ENABLE_FORMULA
|
14 |
|
15 |
DOCLING_DEBUG_PATH = Path("/tmp/docling")
|
16 |
|
|
|
20 |
pipeline_options.accelerator_options = accelerator_options
|
21 |
pipeline_options.do_ocr = True
|
22 |
pipeline_options.do_table_structure = True
|
23 |
+
pipeline_options.do_formula_enrichment = ENABLE_FORMULA
|
24 |
pipeline_options.generate_picture_images = True
|
25 |
pipeline_options.images_scale = 2.0
|
26 |
|
backends/marker.py
CHANGED
@@ -6,11 +6,18 @@ from pathlib import Path
|
|
6 |
from marker.converters.pdf import PdfConverter
|
7 |
from marker.models import create_model_dict
|
8 |
from marker.output import text_from_rendered
|
|
|
9 |
from marker.settings import settings
|
10 |
|
11 |
-
from .settings import ENABLE_DEBUG_MODE
|
12 |
|
13 |
# Marker init
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
marker_converter = PdfConverter(
|
15 |
artifact_dict=create_model_dict(),
|
16 |
config={
|
|
|
6 |
from marker.converters.pdf import PdfConverter
|
7 |
from marker.models import create_model_dict
|
8 |
from marker.output import text_from_rendered
|
9 |
+
from marker.processors.equation import EquationProcessor
|
10 |
from marker.settings import settings
|
11 |
|
12 |
+
from .settings import ENABLE_DEBUG_MODE, ENABLE_FORMULA
|
13 |
|
14 |
# Marker init
|
15 |
+
if not ENABLE_FORMULA:
    # Formula recognition is toggled off: rebuild the converter's class-level
    # processor defaults without EquationProcessor (presumably the step that
    # performs equation recognition -- confirm against marker).
    #
    # The result is materialized as a tuple on purpose. A bare generator
    # expression is a single-use iterator: after its first full iteration it
    # yields nothing, so any later reader of default_processors (e.g. a second
    # PdfConverter instance) would see an empty sequence.
    PdfConverter.default_processors = tuple(
        processor
        for processor in PdfConverter.default_processors
        if processor != EquationProcessor
    )
|
21 |
marker_converter = PdfConverter(
|
22 |
artifact_dict=create_model_dict(),
|
23 |
config={
|
backends/mineru.py
CHANGED
@@ -7,7 +7,7 @@ import pymupdf
|
|
7 |
from magic_pdf.data.data_reader_writer import FileBasedDataReader
|
8 |
from magic_pdf.tools.common import do_parse, prepare_env
|
9 |
|
10 |
-
from .settings import ENABLE_DEBUG_MODE
|
11 |
|
12 |
MINERU_DEBUG_PATH = Path("/tmp/mineru")
|
13 |
MINERU_DEBUG_PATH.mkdir(exist_ok=True)
|
@@ -52,7 +52,7 @@ def do_process_mineru(input_path, output_dir):
|
|
52 |
f_dump_orig_pdf=False,
|
53 |
f_draw_layout_bbox=ENABLE_DEBUG_MODE,
|
54 |
f_draw_char_bbox=False,
|
55 |
-
formula_enable=True,
|
56 |
table_enable=True,
|
57 |
)
|
58 |
return local_md_dir, file_name
|
|
|
7 |
from magic_pdf.data.data_reader_writer import FileBasedDataReader
|
8 |
from magic_pdf.tools.common import do_parse, prepare_env
|
9 |
|
10 |
+
from .settings import ENABLE_DEBUG_MODE, ENABLE_FORMULA
|
11 |
|
12 |
MINERU_DEBUG_PATH = Path("/tmp/mineru")
|
13 |
MINERU_DEBUG_PATH.mkdir(exist_ok=True)
|
|
|
52 |
f_dump_orig_pdf=False,
|
53 |
f_draw_layout_bbox=ENABLE_DEBUG_MODE,
|
54 |
f_draw_char_bbox=False,
|
55 |
+
formula_enable=ENABLE_FORMULA,
|
56 |
table_enable=True,
|
57 |
)
|
58 |
return local_md_dir, file_name
|
backends/settings.py
CHANGED
@@ -1,3 +1,4 @@
|
|
1 |
import os
|
2 |
|
3 |
ENABLE_DEBUG_MODE = os.environ.get("ENABLE_DEBUG_MODE", "True").lower() == "true"
|
|
|
|
1 |
import os
|
2 |
|
3 |
ENABLE_DEBUG_MODE = os.environ.get("ENABLE_DEBUG_MODE", "True").lower() == "true"
|
4 |
+
ENABLE_FORMULA = os.environ.get("ENABLE_FORMULA", "False").lower() == "true"
|