Spaces:

Ritvik19
/

marker-io

Running

App Files Files Community

marker-io / marker /convert.py

Ritvik19

Add all files and directories

c8a32e7 6 months ago

raw

history blame contribute delete

5.16 kB

	import warnings
	warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings

	import pypdfium2 as pdfium
	from PIL import Image

	from marker.utils import flush_cuda_memory
	from marker.tables.table import format_tables
	from marker.debug.data import dump_bbox_debug_data
	from marker.layout.layout import surya_layout, annotate_block_types
	from marker.layout.order import surya_order, sort_blocks_in_reading_order
	from marker.ocr.lang import replace_langs_with_codes, validate_langs
	from marker.ocr.detection import surya_detection
	from marker.ocr.recognition import run_ocr
	from marker.pdf.extract_text import get_text_blocks
	from marker.cleaners.headers import filter_header_footer, filter_common_titles
	from marker.equations.equations import replace_equations
	from marker.pdf.utils import find_filetype
	from marker.postprocessors.editor import edit_full_text
	from marker.cleaners.code import identify_code_blocks, indent_blocks
	from marker.cleaners.bullets import replace_bullets
	from marker.cleaners.headings import split_heading_blocks
	from marker.cleaners.fontstyle import find_bold_italic
	from marker.postprocessors.markdown import merge_spans, merge_lines, get_full_text
	from marker.cleaners.text import cleanup_text
	from marker.images.extract import extract_images
	from marker.images.save import images_to_dict

	from typing import List, Dict, Tuple, Optional
	from marker.settings import settings


	def convert_single_pdf(
	    fname: str,
	    model_lst: List,
	    max_pages=None,
	    metadata: Optional[Dict]=None,
	    langs: Optional[List[str]] = None,
	    batch_multiplier: int = 1
	) -> Tuple[str, Dict[str, Image.Image], Dict]:
	    """Convert a single document into text, extracted images and metadata.

	    Args:
	        fname: Path of the file to convert.
	        model_lst: Six models, unpacked in the order texify, layout, order,
	            edit, detection, OCR.
	        max_pages: Forwarded unchanged to ``get_text_blocks``
	            (presumably a page limit; confirm against that helper).
	        metadata: Optional dict. When it is truthy, its ``"languages"`` entry
	            (if present) replaces ``langs``, even an explicitly passed one.
	        langs: Languages used for OCR. Defaults to
	            ``[settings.DEFAULT_LANG]`` when ``None``.
	        batch_multiplier: Forwarded to every model-backed stage.

	    Returns:
	        A ``(full_text, doc_images, out_meta)`` tuple. When the filetype is
	        ``"other"`` or no text blocks could be extracted, ``full_text`` is
	        ``""`` and ``doc_images`` is an empty dict, so callers can always
	        unpack three values.
	    """
	    # Set language needed for OCR
	    if langs is None:
	        langs = [settings.DEFAULT_LANG]

	    if metadata:
	        langs = metadata.get("languages", langs)

	    langs = replace_langs_with_codes(langs)
	    validate_langs(langs)

	    # Find the filetype
	    filetype = find_filetype(fname)

	    # Setup output metadata
	    out_meta = {
	        "languages": langs,
	        "filetype": filetype,
	    }

	    if filetype == "other":  # We can't process this file
	        # Keep the 3-tuple shape promised by the signature.
	        return "", {}, out_meta

	    doc = pdfium.PdfDocument(fname)
	    try:
	        return _convert_open_document(
	            doc,
	            fname,
	            model_lst,
	            max_pages,
	            langs,
	            batch_multiplier,
	            out_meta,
	        )
	    finally:
	        # Release the PDF handle on every exit path, including errors
	        # raised by any pipeline stage.
	        doc.close()


	def _convert_open_document(doc, fname, model_lst, max_pages, langs, batch_multiplier, out_meta):
	    """Run the conversion pipeline on an already opened document.

	    Fills ``out_meta`` in place and returns ``(full_text, doc_images, out_meta)``.
	    The caller owns ``doc`` and is responsible for closing it.
	    """
	    # Get initial text blocks from the pdf
	    pages, toc = get_text_blocks(
	        doc,
	        max_pages=max_pages,
	    )
	    out_meta.update({
	        "toc": toc,
	        "pages": len(pages),
	    })

	    # Unpack models from list
	    texify_model, layout_model, order_model, edit_model, detection_model, ocr_model = model_lst

	    # Identify text lines on pages
	    surya_detection(doc, pages, detection_model, batch_multiplier=batch_multiplier)
	    flush_cuda_memory()

	    # OCR pages as needed
	    pages, ocr_stats = run_ocr(doc, pages, langs, ocr_model, batch_multiplier=batch_multiplier)
	    flush_cuda_memory()

	    out_meta["ocr_stats"] = ocr_stats

	    # Stop early when no page contributed a single block; short-circuits on
	    # the first block found instead of materialising a list of all of them.
	    has_blocks = any(True for page in pages for _block in page.blocks)
	    if not has_blocks:
	        print(f"Could not extract any text blocks for {fname}")
	        # Keep the 3-tuple shape promised by the signature.
	        return "", {}, out_meta

	    surya_layout(doc, pages, layout_model, batch_multiplier=batch_multiplier)
	    flush_cuda_memory()

	    # Find headers and footers
	    bad_span_ids = filter_header_footer(pages)
	    out_meta["block_stats"] = {"header_footer": len(bad_span_ids)}

	    # Add block types in
	    annotate_block_types(pages)

	    # Dump debug data if flags are set
	    dump_bbox_debug_data(doc, pages)

	    # Find reading order for blocks, then sort blocks by it
	    surya_order(doc, pages, order_model, batch_multiplier=batch_multiplier)
	    sort_blocks_in_reading_order(pages)
	    flush_cuda_memory()

	    # Fix code blocks
	    code_block_count = identify_code_blocks(pages)
	    out_meta["block_stats"]["code"] = code_block_count
	    indent_blocks(pages)

	    # Fix table blocks
	    table_count = format_tables(pages)
	    out_meta["block_stats"]["table"] = table_count

	    # Drop the header/footer spans found above, plus spans of bad types
	    for page in pages:
	        for block in page.blocks:
	            block.filter_spans(bad_span_ids)
	            block.filter_bad_span_types()

	    filtered, eq_stats = replace_equations(
	        doc,
	        pages,
	        texify_model,
	        batch_multiplier=batch_multiplier
	    )
	    flush_cuda_memory()
	    out_meta["block_stats"]["equations"] = eq_stats

	    # Extract images and figures
	    if settings.EXTRACT_IMAGES:
	        extract_images(doc, pages)

	    # Split out headers
	    split_heading_blocks(pages)
	    find_bold_italic(pages)

	    # Build the output text from the result returned by replace_equations
	    merged_lines = merge_spans(filtered)
	    text_blocks = merge_lines(merged_lines)
	    text_blocks = filter_common_titles(text_blocks)
	    full_text = get_full_text(text_blocks)

	    # Handle empty blocks being joined
	    full_text = cleanup_text(full_text)

	    # Replace bullet characters with a -
	    full_text = replace_bullets(full_text)

	    # Postprocess text with editor model
	    full_text, edit_stats = edit_full_text(
	        full_text,
	        edit_model,
	        batch_multiplier=batch_multiplier
	    )
	    flush_cuda_memory()
	    out_meta["postprocess_stats"] = {"edit": edit_stats}
	    doc_images = images_to_dict(pages)

	    return full_text, doc_images, out_meta