File size: 5,164 Bytes
c8a32e7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
import warnings
warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
import pypdfium2 as pdfium
from PIL import Image
from marker.utils import flush_cuda_memory
from marker.tables.table import format_tables
from marker.debug.data import dump_bbox_debug_data
from marker.layout.layout import surya_layout, annotate_block_types
from marker.layout.order import surya_order, sort_blocks_in_reading_order
from marker.ocr.lang import replace_langs_with_codes, validate_langs
from marker.ocr.detection import surya_detection
from marker.ocr.recognition import run_ocr
from marker.pdf.extract_text import get_text_blocks
from marker.cleaners.headers import filter_header_footer, filter_common_titles
from marker.equations.equations import replace_equations
from marker.pdf.utils import find_filetype
from marker.postprocessors.editor import edit_full_text
from marker.cleaners.code import identify_code_blocks, indent_blocks
from marker.cleaners.bullets import replace_bullets
from marker.cleaners.headings import split_heading_blocks
from marker.cleaners.fontstyle import find_bold_italic
from marker.postprocessors.markdown import merge_spans, merge_lines, get_full_text
from marker.cleaners.text import cleanup_text
from marker.images.extract import extract_images
from marker.images.save import images_to_dict
from typing import List, Dict, Tuple, Optional
from marker.settings import settings
def convert_single_pdf(
        fname: str,
        model_lst: List,
        max_pages=None,
        metadata: Optional[Dict]=None,
        langs: Optional[List[str]] = None,
        batch_multiplier: int = 1
) -> Tuple[str, Dict[str, Image.Image], Dict]:
    """Convert one document into text, extracted images and conversion metadata.

    The stages run in a fixed order: initial text-block extraction, line
    detection, OCR, layout, header/footer detection, block-type annotation,
    reading order, code/table/equation handling, optional image extraction,
    heading and font-style cleanup, merging into full text, and a final
    editor-model pass.

    Args:
        fname: Path of the file to convert; passed to ``find_filetype`` and
            ``pdfium.PdfDocument``.
        model_lst: Sequence of exactly six models, unpacked in the order
            texify, layout, order, edit, detection, OCR.
        max_pages: Forwarded unchanged to ``get_text_blocks``.
            NOTE(review): ``None`` presumably means "no page limit" - confirm
            against ``get_text_blocks``.
        metadata: Optional dict. When truthy, its ``"languages"`` entry (if
            present) takes precedence over ``langs``.
        langs: Languages used for OCR. Defaults to
            ``[settings.DEFAULT_LANG]`` when ``None``.
        batch_multiplier: Forwarded to every model-backed stage.

    Returns:
        A ``(full_text, images, out_meta)`` tuple. On the two early exits
        (file type reported as ``"other"``, or no text blocks found after
        OCR) ``full_text`` is ``""`` and ``images`` is an empty dict, so the
        result always has three elements and callers can unpack it
        unconditionally.
    """
    # Set language needed for OCR
    if langs is None:
        langs = [settings.DEFAULT_LANG]

    if metadata:
        langs = metadata.get("languages", langs)

    langs = replace_langs_with_codes(langs)
    validate_langs(langs)

    # Find the filetype
    filetype = find_filetype(fname)

    # Setup output metadata
    out_meta = {
        "languages": langs,
        "filetype": filetype,
    }

    if filetype == "other":  # We can't process this file
        # Keep the (text, images, metadata) shape promised by the signature.
        return "", {}, out_meta

    # Get initial text blocks from the pdf
    doc = pdfium.PdfDocument(fname)
    pages, toc = get_text_blocks(
        doc,
        max_pages=max_pages,
    )
    out_meta.update({
        "toc": toc,
        "pages": len(pages),
    })

    # Unpack models from list
    texify_model, layout_model, order_model, edit_model, detection_model, ocr_model = model_lst

    # Identify text lines on pages
    surya_detection(doc, pages, detection_model, batch_multiplier=batch_multiplier)
    flush_cuda_memory()

    # OCR pages as needed
    pages, ocr_stats = run_ocr(doc, pages, langs, ocr_model, batch_multiplier=batch_multiplier)
    flush_cuda_memory()

    out_meta["ocr_stats"] = ocr_stats
    if not any(page.blocks for page in pages):
        print(f"Could not extract any text blocks for {fname}")
        # Same three-element shape as the normal return path.
        return "", {}, out_meta

    surya_layout(doc, pages, layout_model, batch_multiplier=batch_multiplier)
    flush_cuda_memory()

    # Find headers and footers
    bad_span_ids = filter_header_footer(pages)
    out_meta["block_stats"] = {"header_footer": len(bad_span_ids)}

    # Add block types in
    annotate_block_types(pages)

    # Dump debug data if flags are set
    dump_bbox_debug_data(doc, pages)

    # Find reading order for blocks, then sort blocks by that order
    surya_order(doc, pages, order_model, batch_multiplier=batch_multiplier)
    sort_blocks_in_reading_order(pages)
    flush_cuda_memory()

    # Fix code blocks
    code_block_count = identify_code_blocks(pages)
    out_meta["block_stats"]["code"] = code_block_count
    indent_blocks(pages)

    # Fix table blocks
    table_count = format_tables(pages)
    out_meta["block_stats"]["table"] = table_count

    # Drop the header/footer spans found above, plus spans of unwanted types
    for page in pages:
        for block in page.blocks:
            block.filter_spans(bad_span_ids)
            block.filter_bad_span_types()

    filtered, eq_stats = replace_equations(
        doc,
        pages,
        texify_model,
        batch_multiplier=batch_multiplier
    )
    flush_cuda_memory()
    out_meta["block_stats"]["equations"] = eq_stats

    # Extract images and figures
    if settings.EXTRACT_IMAGES:
        extract_images(doc, pages)

    # Split out headers
    split_heading_blocks(pages)
    find_bold_italic(pages)

    # Merge the equation-replaced output into lines, then into text blocks
    merged_lines = merge_spans(filtered)
    text_blocks = merge_lines(merged_lines)
    text_blocks = filter_common_titles(text_blocks)
    full_text = get_full_text(text_blocks)

    # Handle empty blocks being joined
    full_text = cleanup_text(full_text)

    # Replace bullet characters with a -
    full_text = replace_bullets(full_text)

    # Postprocess text with editor model
    full_text, edit_stats = edit_full_text(
        full_text,
        edit_model,
        batch_multiplier=batch_multiplier
    )
    flush_cuda_memory()
    out_meta["postprocess_stats"] = {"edit": edit_stats}
    doc_images = images_to_dict(pages)

    return full_text, doc_images, out_meta