# marker-io / marker / convert.py
# Ritvik19's picture
# Add all files and directories
# c8a32e7
# raw
# history blame
# 5.16 kB
import warnings
warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
import pypdfium2 as pdfium
from PIL import Image
from marker.utils import flush_cuda_memory
from marker.tables.table import format_tables
from marker.debug.data import dump_bbox_debug_data
from marker.layout.layout import surya_layout, annotate_block_types
from marker.layout.order import surya_order, sort_blocks_in_reading_order
from marker.ocr.lang import replace_langs_with_codes, validate_langs
from marker.ocr.detection import surya_detection
from marker.ocr.recognition import run_ocr
from marker.pdf.extract_text import get_text_blocks
from marker.cleaners.headers import filter_header_footer, filter_common_titles
from marker.equations.equations import replace_equations
from marker.pdf.utils import find_filetype
from marker.postprocessors.editor import edit_full_text
from marker.cleaners.code import identify_code_blocks, indent_blocks
from marker.cleaners.bullets import replace_bullets
from marker.cleaners.headings import split_heading_blocks
from marker.cleaners.fontstyle import find_bold_italic
from marker.postprocessors.markdown import merge_spans, merge_lines, get_full_text
from marker.cleaners.text import cleanup_text
from marker.images.extract import extract_images
from marker.images.save import images_to_dict
from typing import List, Dict, Tuple, Optional
from marker.settings import settings
def convert_single_pdf(
    fname: str,
    model_lst: List,
    max_pages=None,
    metadata: Optional[Dict] = None,
    langs: Optional[List[str]] = None,
    batch_multiplier: int = 1
) -> Tuple[str, Dict[str, Image.Image], Dict]:
    """Convert a single document into text, extracted images, and metadata.

    The pipeline runs, in order: text-block extraction, line detection, OCR,
    layout analysis, header/footer filtering, reading-order sorting, code and
    table formatting, equation replacement, optional image extraction,
    heading/font-style cleanup, text merging, and a final editor-model pass.

    Args:
        fname: Path of the file to convert.
        model_lst: Six models, unpacked in this order: texify, layout, order,
            edit, detection, ocr.
        max_pages: Forwarded to ``get_text_blocks`` to limit the pages read.
        metadata: Optional dict; when truthy, its ``"languages"`` entry (if
            present) overrides ``langs``.
        langs: OCR languages. Defaults to ``[settings.DEFAULT_LANG]``.
        batch_multiplier: Forwarded to every model-backed step.

    Returns:
        A ``(full_text, images, out_meta)`` tuple. When the filetype is
        ``"other"`` or no text blocks could be extracted, ``full_text`` is
        ``""`` and ``images`` is an empty dict, so callers can always unpack
        three values.
    """
    # Set language needed for OCR
    if langs is None:
        langs = [settings.DEFAULT_LANG]

    if metadata:
        langs = metadata.get("languages", langs)

    langs = replace_langs_with_codes(langs)
    validate_langs(langs)

    # Find the filetype
    filetype = find_filetype(fname)

    # Setup output metadata
    out_meta = {
        "languages": langs,
        "filetype": filetype,
    }

    if filetype == "other":  # We can't process this file
        # Keep the same 3-tuple shape as the success path.
        return "", {}, out_meta

    # Get initial text blocks from the pdf
    # NOTE(review): doc is never explicitly closed in this function — confirm
    # whether relying on garbage collection is intended.
    doc = pdfium.PdfDocument(fname)
    pages, toc = get_text_blocks(
        doc,
        max_pages=max_pages,
    )
    out_meta.update({
        "toc": toc,
        "pages": len(pages),
    })

    # Unpack models from list
    texify_model, layout_model, order_model, edit_model, detection_model, ocr_model = model_lst

    # Identify text lines on pages
    surya_detection(doc, pages, detection_model, batch_multiplier=batch_multiplier)
    flush_cuda_memory()

    # OCR pages as needed
    pages, ocr_stats = run_ocr(doc, pages, langs, ocr_model, batch_multiplier=batch_multiplier)
    flush_cuda_memory()

    out_meta["ocr_stats"] = ocr_stats

    # Short-circuit on the first block found instead of building a full list.
    has_blocks = any(True for page in pages for _ in page.blocks)
    if not has_blocks:
        print(f"Could not extract any text blocks for {fname}")
        # Keep the same 3-tuple shape as the success path.
        return "", {}, out_meta

    surya_layout(doc, pages, layout_model, batch_multiplier=batch_multiplier)
    flush_cuda_memory()

    # Find headers and footers
    bad_span_ids = filter_header_footer(pages)
    out_meta["block_stats"] = {"header_footer": len(bad_span_ids)}

    # Add block types in
    annotate_block_types(pages)

    # Dump debug data if flags are set
    dump_bbox_debug_data(doc, pages)

    # Find reading order for blocks, then sort blocks by that order
    surya_order(doc, pages, order_model, batch_multiplier=batch_multiplier)
    sort_blocks_in_reading_order(pages)
    flush_cuda_memory()

    # Fix code blocks
    code_block_count = identify_code_blocks(pages)
    out_meta["block_stats"]["code"] = code_block_count
    indent_blocks(pages)

    # Fix table blocks
    table_count = format_tables(pages)
    out_meta["block_stats"]["table"] = table_count

    # Drop the header/footer spans found earlier, plus spans of unwanted types
    for page in pages:
        for block in page.blocks:
            block.filter_spans(bad_span_ids)
            block.filter_bad_span_types()

    filtered, eq_stats = replace_equations(
        doc,
        pages,
        texify_model,
        batch_multiplier=batch_multiplier
    )
    flush_cuda_memory()
    out_meta["block_stats"]["equations"] = eq_stats

    # Extract images and figures
    if settings.EXTRACT_IMAGES:
        extract_images(doc, pages)

    # Split out headers
    split_heading_blocks(pages)
    find_bold_italic(pages)

    # Build the output text from the result returned by replace_equations
    merged_lines = merge_spans(filtered)
    text_blocks = merge_lines(merged_lines)
    text_blocks = filter_common_titles(text_blocks)
    full_text = get_full_text(text_blocks)

    # Handle empty blocks being joined
    full_text = cleanup_text(full_text)

    # Replace bullet characters with a -
    full_text = replace_bullets(full_text)

    # Postprocess text with editor model
    full_text, edit_stats = edit_full_text(
        full_text,
        edit_model,
        batch_multiplier=batch_multiplier
    )
    flush_cuda_memory()
    out_meta["postprocess_stats"] = {"edit": edit_stats}

    doc_images = images_to_dict(pages)

    return full_text, doc_images, out_meta