Spaces:

Ritvik19
/

marker-io

Running

App Files Files Community

marker-io / marker /ocr /heuristics.py

Ritvik19

Add all files and directories

c8a32e7 6 months ago

raw

history blame contribute delete

2.47 kB

	import re
	from typing import List

	from marker.ocr.utils import alphanum_ratio
	from marker.schema.bbox import rescale_bbox, box_intersection_pct
	from marker.schema.page import Page
	from marker.settings import settings


	def should_ocr_page(page: Page, no_text: bool):
	    """Decide whether a page should be run through OCR.

	    Args:
	        page: Page whose preliminary text and detected lines are inspected.
	        no_text: True when the full document has no text and needs full OCR.

	    Returns:
	        True when any OCR trigger fires; otherwise the value of
	        ``settings.OCR_ALL_PAGES``.
	    """
	    # Every signal is computed up front; none of them is skipped when an
	    # earlier one already fired.
	    lines_covered = detected_line_coverage(page)
	    garbled_text = len(page.prelim_text) > 0 and detect_bad_ocr(page.prelim_text)

	    # Triggers: document-wide lack of text, bad preliminary text, or text
	    # not extracted for enough of the detected lines.
	    if no_text or garbled_text or lines_covered is False:
	        return True

	    # No trigger fired: fall back to the global "OCR everything" setting.
	    return settings.OCR_ALL_PAGES


	def detect_bad_ocr(text, space_threshold=.7, newline_threshold=.6, alphanum_threshold=.3):
	    """Return True when ``text`` looks like the product of a failed OCR pass.

	    The checks run in order and the first one that fires wins:

	    1. empty text;
	    2. share of whitespace runs above ``space_threshold``;
	    3. share of newline runs above ``newline_threshold``;
	    4. ``alphanum_ratio(text)`` below ``alphanum_threshold`` (garbled text);
	    5. more characters from ``settings.INVALID_CHARS`` than
	       ``max(4.0, 3% of len(text))``.

	    Whitespace and newlines are counted as *runs* (one per ``\\s+`` / ``\\n+``
	    match), while the remaining characters are counted individually.

	    NOTE(review): because runs are separated by at least one other character,
	    the whitespace share cannot exceed 2/3 once the text contains any
	    non-whitespace character, so with the default ``space_threshold`` of 0.7
	    that check only fires on whitespace-only text -- confirm this is intended.
	    """
	    def run_share(pattern):
	        # Matches of `pattern` relative to matches plus leftover characters.
	        run_count = len(re.findall(pattern, text))
	        leftover = len(re.sub(pattern, '', text))
	        return run_count / (run_count + leftover)

	    if not len(text):
	        # Nothing was extracted at all: treat it as a failed OCR.
	        return True

	    if run_share(r'\s+') > space_threshold:
	        return True

	    if run_share(r'\n+') > newline_threshold:
	        return True

	    if alphanum_ratio(text) < alphanum_threshold:
	        # Too few alphanumeric characters: garbled text.
	        return True

	    invalid_count = sum(1 for ch in text if ch in settings.INVALID_CHARS)
	    return invalid_count > max(4.0, len(text) * .03)


	def no_text_found(pages: List[Page]):
	    """Return True when no page carries any non-whitespace preliminary text.

	    Args:
	        pages: Pages whose ``prelim_text`` strings are inspected.

	    Returns:
	        True if every page's ``prelim_text`` is empty or whitespace-only
	        (also True for an empty page list), otherwise False.
	    """
	    # Concatenated text is blank exactly when each page's text is blank, so
	    # check page by page: this stops at the first page with real text and
	    # avoids building the whole document string with repeated `+=`.
	    return not any(page.prelim_text.strip() for page in pages)


	def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.65):
	    """Check whether enough detected text lines are matched by extracted lines.

	    For each entry in ``page.text_lines.bboxes`` the intersection percentages
	    against every line of every block on the page are added up; the detected
	    line counts as found when that total exceeds ``intersect_thresh``.

	    Args:
	        page: Page providing ``text_lines`` (detections), ``bbox`` and ``blocks``.
	        intersect_thresh: Summed intersection a detected line must exceed.
	        detection_thresh: Fraction of detected lines that must be found.

	    Returns:
	        False when there are no detected lines; otherwise whether the found
	        fraction is strictly greater than ``detection_thresh``.
	    """
	    covered = 0
	    for detection in page.text_lines.bboxes:
	        # Rescale the detected bbox to match the dimensions of the original page.
	        scaled_bbox = rescale_bbox(page.text_lines.image_bbox, page.bbox, detection.bbox)

	        # Accumulate with plain `+=` (not sum()) so float addition order and
	        # rounding stay exactly as a simple running total.
	        overlap = 0
	        for text_line in (ln for block in page.blocks for ln in block.lines):
	            overlap += box_intersection_pct(scaled_bbox, text_line.bbox)

	        if overlap > intersect_thresh:
	            covered += 1

	    detection_count = len(page.text_lines.bboxes)
	    # Without detections there is no ratio to compute; report "not covered".
	    return False if detection_count == 0 else (covered / detection_count > detection_thresh)