|
import warnings |
|
warnings.filterwarnings("ignore", category=UserWarning) |
|
|
|
import pypdfium2 as pdfium |
|
from PIL import Image |
|
|
|
from marker.utils import flush_cuda_memory |
|
from marker.tables.table import format_tables |
|
from marker.debug.data import dump_bbox_debug_data |
|
from marker.layout.layout import surya_layout, annotate_block_types |
|
from marker.layout.order import surya_order, sort_blocks_in_reading_order |
|
from marker.ocr.lang import replace_langs_with_codes, validate_langs |
|
from marker.ocr.detection import surya_detection |
|
from marker.ocr.recognition import run_ocr |
|
from marker.pdf.extract_text import get_text_blocks |
|
from marker.cleaners.headers import filter_header_footer, filter_common_titles |
|
from marker.equations.equations import replace_equations |
|
from marker.pdf.utils import find_filetype |
|
from marker.postprocessors.editor import edit_full_text |
|
from marker.cleaners.code import identify_code_blocks, indent_blocks |
|
from marker.cleaners.bullets import replace_bullets |
|
from marker.cleaners.headings import split_heading_blocks |
|
from marker.cleaners.fontstyle import find_bold_italic |
|
from marker.postprocessors.markdown import merge_spans, merge_lines, get_full_text |
|
from marker.cleaners.text import cleanup_text |
|
from marker.images.extract import extract_images |
|
from marker.images.save import images_to_dict |
|
|
|
from typing import List, Dict, Tuple, Optional |
|
from marker.settings import settings |
|
|
|
|
|
def convert_single_pdf(
    fname: str,
    model_lst: List,
    max_pages=None,
    metadata: Optional[Dict] = None,
    langs: Optional[List[str]] = None,
    batch_multiplier: int = 1
) -> Tuple[str, Dict[str, Image.Image], Dict]:
    """Convert one document into text, a dict of images, and metadata.

    Args:
        fname: Path of the file to convert; passed to ``find_filetype`` and
            ``pdfium.PdfDocument``.
        model_lst: Sequence of exactly six models, unpacked in this order:
            texify, layout, order, edit, detection, ocr.
        max_pages: Forwarded unchanged to ``get_text_blocks`` as
            ``max_pages``.
        metadata: Optional dict. When truthy, its ``"languages"`` entry (if
            present) replaces ``langs``.
        langs: Languages for OCR. Defaults to ``[settings.DEFAULT_LANG]``
            when ``None``.
        batch_multiplier: Forwarded to every model-backed stage.

    Returns:
        A 3-tuple ``(full_text, doc_images, out_meta)``. When the file type
        is ``"other"`` or no text blocks could be extracted, the result is
        ``("", {}, out_meta)`` so that callers can always unpack three
        values.

    Raises:
        ValueError: If ``model_lst`` does not contain exactly six items
            (raised by the tuple unpacking). Anything raised by the helper
            stages propagates unchanged.
    """
    if langs is None:
        langs = [settings.DEFAULT_LANG]

    # Languages supplied through metadata take precedence over the argument.
    if metadata:
        langs = metadata.get("languages", langs)

    langs = replace_langs_with_codes(langs)
    validate_langs(langs)

    filetype = find_filetype(fname)

    # Metadata accumulated across the pipeline and returned to the caller.
    out_meta = {
        "languages": langs,
        "filetype": filetype,
    }

    if filetype == "other":
        # Keep the declared 3-tuple shape on the early exit: empty text,
        # no images, and the metadata gathered so far.
        return "", {}, out_meta

    doc = pdfium.PdfDocument(fname)
    pages, toc = get_text_blocks(
        doc,
        max_pages=max_pages,
    )
    out_meta.update({
        "toc": toc,
        "pages": len(pages),
    })

    # The order of model_lst is fixed by this unpacking.
    texify_model, layout_model, order_model, edit_model, detection_model, ocr_model = model_lst

    # Each model-backed stage below is followed by flush_cuda_memory().
    surya_detection(doc, pages, detection_model, batch_multiplier=batch_multiplier)
    flush_cuda_memory()

    pages, ocr_stats = run_ocr(doc, pages, langs, ocr_model, batch_multiplier=batch_multiplier)
    flush_cuda_memory()

    out_meta["ocr_stats"] = ocr_stats

    # Short-circuits on the first block instead of materialising all of them.
    has_blocks = any(True for page in pages for _ in page.blocks)
    if not has_blocks:
        print(f"Could not extract any text blocks for {fname}")
        # Same 3-tuple shape as the normal return path.
        return "", {}, out_meta

    surya_layout(doc, pages, layout_model, batch_multiplier=batch_multiplier)
    flush_cuda_memory()

    # The returned ids are applied to every block further down via
    # block.filter_spans(); their count is reported in block_stats.
    bad_span_ids = filter_header_footer(pages)
    out_meta["block_stats"] = {"header_footer": len(bad_span_ids)}

    annotate_block_types(pages)

    dump_bbox_debug_data(doc, pages)

    surya_order(doc, pages, order_model, batch_multiplier=batch_multiplier)
    sort_blocks_in_reading_order(pages)
    flush_cuda_memory()

    code_block_count = identify_code_blocks(pages)
    out_meta["block_stats"]["code"] = code_block_count
    indent_blocks(pages)

    table_count = format_tables(pages)
    out_meta["block_stats"]["table"] = table_count

    for page in pages:
        for block in page.blocks:
            block.filter_spans(bad_span_ids)
            block.filter_bad_span_types()

    # `filtered` (not `pages`) is what feeds the span/line merging below.
    filtered, eq_stats = replace_equations(
        doc,
        pages,
        texify_model,
        batch_multiplier=batch_multiplier
    )
    flush_cuda_memory()
    out_meta["block_stats"]["equations"] = eq_stats

    # Image extraction is optional and controlled by settings.
    if settings.EXTRACT_IMAGES:
        extract_images(doc, pages)

    split_heading_blocks(pages)
    find_bold_italic(pages)

    # Assemble the final text from the equation-processed blocks.
    merged_lines = merge_spans(filtered)
    text_blocks = merge_lines(merged_lines)
    text_blocks = filter_common_titles(text_blocks)
    full_text = get_full_text(text_blocks)

    full_text = cleanup_text(full_text)

    full_text = replace_bullets(full_text)

    full_text, edit_stats = edit_full_text(
        full_text,
        edit_model,
        batch_multiplier=batch_multiplier
    )
    flush_cuda_memory()
    out_meta["postprocess_stats"] = {"edit": edit_stats}
    doc_images = images_to_dict(pages)

    return full_text, doc_images, out_meta