|
from marker.schema.merged import MergedLine, MergedBlock, FullyMergedBlock |
|
from marker.schema.page import Page |
|
import re |
|
import regex |
|
from typing import List |
|
|
|
|
|
def escape_markdown(text):
    """Return *text* with every ``#`` prefixed by a backslash.

    Only the hash character is escaped, so that body text containing ``#``
    is not interpreted as a markdown heading.
    """
    hash_pattern = re.compile(r"[#]")
    # ``\g<0>`` re-inserts the matched character after the literal backslash.
    return hash_pattern.sub(r'\\\g<0>', text)
|
|
|
|
|
def surround_text(s, char_to_insert):
    """Wrap the non-whitespace core of *s* in *char_to_insert*.

    Leading and trailing whitespace stay outside the inserted markers, so
    ``surround_text("  bold ", "**")`` gives ``"  **bold** "``.

    Args:
        s: The text to wrap.
        char_to_insert: Marker placed immediately before and after the
            stripped text (e.g. ``"*"`` or ``"**"``).

    Returns:
        The wrapped string. If *s* is empty or contains only whitespace it
        is returned unchanged: wrapping nothing would emit an empty marker
        pair such as ``****`` and duplicate the whitespace on both sides.
    """
    core = s.strip()
    if not core:
        return s

    # Locate the core by length so the surrounding whitespace is preserved
    # byte-for-byte on each side.
    lead_len = len(s) - len(s.lstrip())
    core_end = len(s.rstrip())
    return s[:lead_len] + char_to_insert + core + char_to_insert + s[core_end:]
|
|
|
|
|
def _lookahead_span(spans, position):
    """Return the span used as "next" for the span at *position*.

    Walks the spans after *position* and stops at the first one whose
    stripped text is longer than two characters. If none qualifies, the
    last span visited is returned; ``None`` when nothing follows.
    """
    candidate = None
    for candidate in spans[position + 1:]:
        if len(candidate.text.strip()) > 2:
            break
    return candidate


def merge_spans(pages: List[Page]) -> List[List[MergedBlock]]:
    """Collapse the spans of every line into a single text string.

    For each page, each block's lines are turned into ``MergedLine`` objects
    carrying the concatenated span text, the lower-cased font of every span,
    and the line's bbox. Lines without spans are skipped, and blocks that end
    up with no lines are dropped.

    Emphasis markers are added only to spans that are longer than three
    characters and are neither the first nor the last span of their line.
    Italic is checked before bold, and a marker is only added when the
    look-ahead span does not share the same style.

    Returns:
        One list of ``MergedBlock`` per input page, in page order.
    """
    result = []
    for page in pages:
        blocks_on_page = []
        for block in page.blocks:
            merged_lines = []
            for line in block.lines:
                spans = line.spans
                if len(spans) == 0:
                    continue

                pieces = []
                span_fonts = []
                last_index = len(spans) - 1
                for idx, span in enumerate(spans):
                    lowered_font = span.font.lower()
                    upcoming = _lookahead_span(spans, idx)
                    span_fonts.append(lowered_font)

                    piece = span.text
                    is_interior = 0 < idx < last_index
                    if is_interior and len(piece) > 3:
                        if span.italic and (not upcoming or not upcoming.italic):
                            piece = surround_text(piece, "*")
                        elif span.bold and (not upcoming or not upcoming.bold):
                            piece = surround_text(piece, "**")
                    pieces.append(piece)

                merged_lines.append(MergedLine(
                    text="".join(pieces),
                    fonts=span_fonts,
                    bbox=line.bbox
                ))

            if merged_lines:
                blocks_on_page.append(MergedBlock(
                    lines=merged_lines,
                    pnum=block.pnum,
                    bbox=block.bbox,
                    block_type=block.block_type
                ))
        result.append(blocks_on_page)

    return result
|
|
|
|
|
def block_surround(text, block_type):
    """Decorate a merged block's text according to its block type.

    * ``Section-header`` / ``Title``: unless the text already starts with
      ``#``, it is stripped, title-cased and prefixed with ``## `` / ``# ``.
    * ``Table``: wrapped in newlines.
    * ``Code``: wrapped in a fenced code block.
    * ``List-item`` / ``Text``: passed through ``escape_markdown``.
    * ``Formula``: if the stripped text is delimited by ``$$`` on both ends,
      the stripped text is wrapped in newlines.

    Any other block type, and any case not matched above, returns *text*
    unchanged.
    """
    if block_type in ("Section-header", "Title"):
        if text.startswith("#"):
            return text
        heading = text.strip().title()
        if block_type == "Title":
            return "# " + heading + "\n"
        return "\n## " + heading + "\n"

    if block_type in ("List-item", "Text"):
        return escape_markdown(text)

    if block_type == "Table":
        return "\n" + text + "\n"

    if block_type == "Code":
        return "\n```\n" + text + "\n```\n"

    if block_type == "Formula":
        trimmed = text.strip()
        if trimmed.startswith("$$") and trimmed.endswith("$$"):
            return "\n" + trimmed + "\n"

    return text
|
|
|
|
|
def line_separator(line1, line2, block_type, is_continuation=False):
    """Join *line2* onto *line1* using a separator chosen from context.

    Rules, checked in order:

    1. If *line1* ends in a letter/digit followed by a hyphen-like character
       and *line2* starts with a letter/digit, the trailing hyphen is removed
       and the lines are glued together with no separator (de-hyphenation).
    2. ``Title`` / ``Section-header`` blocks: joined with a single space.
    3. ``Formula`` blocks: joined with a newline.
    4. Text-like blocks where *line1* ends mid-sentence and *line2* starts
       with a letter/digit: joined with a single space.
    5. *is_continuation* is true: joined with a single space.
    6. Text-like blocks where *line1* ends with sentence-final punctuation:
       joined with a blank line.
    7. ``Table`` blocks: joined with a blank line.
    8. Anything else: joined with a newline.

    Args:
        line1: Text accumulated so far.
        line2: Text of the line being appended.
        block_type: Layout label of the block both lines belong to.
        is_continuation: Caller's hint that *line2* continues *line1*.

    Returns:
        The combined string.
    """
    # These fragments are interpolated *inside* ``[...]`` below, so they are
    # written without ``|``: in a character class ``|`` is a literal pipe, not
    # alternation, and would make "|" count as a letter.
    lowercase_letters = r'\p{Lo}\p{Ll}\d'
    all_letters = r'\p{L}\d'

    # Non-ASCII members are spelled as escapes so that a mis-decoded source
    # file cannot silently turn them into unrelated letters.
    em_dash = "\u2014"
    not_sign = "\u00ac"
    ideographic_full_stop = "\u3002"
    thai_mai_yamok = "\u0e46"

    hyphens = "-" + em_dash + not_sign

    # De-hyphenate: drop the trailing hyphen when the word clearly continues
    # on the next line.
    hyphen_pattern = regex.compile(rf'.*[{lowercase_letters}][{hyphens}]\s?$', regex.DOTALL)
    if line1 and hyphen_pattern.match(line1) and regex.match(rf"^\s?[{lowercase_letters}]", line2):
        # The pattern is anchored at the end, so element 0 is everything
        # before the trailing hyphen.
        line1 = regex.split(rf"[{hyphens}]\s?$", line1)[0]
        return line1.rstrip() + line2.lstrip()

    sentence_continuations = r',;\(' + em_dash + r'\"\'\*'
    sentence_ends = ideographic_full_stop + thai_mai_yamok + r'\.?!'
    line_end_pattern = regex.compile(rf'.*[{lowercase_letters}][{sentence_continuations}]?\s?$', regex.DOTALL)
    line_start_pattern = regex.compile(rf'^\s?[{all_letters}]', regex.DOTALL)
    sentence_end_pattern = regex.compile(rf'.*[{sentence_ends}]\s?$', regex.DOTALL)

    text_blocks = ["Text", "List-item", "Footnote", "Caption", "Figure"]
    if block_type in ["Title", "Section-header"]:
        return line1.rstrip() + " " + line2.lstrip()
    elif block_type == "Formula":
        return line1 + "\n" + line2
    elif line_end_pattern.match(line1) and line_start_pattern.match(line2) and block_type in text_blocks:
        return line1.rstrip() + " " + line2.lstrip()
    elif is_continuation:
        return line1.rstrip() + " " + line2.lstrip()
    elif block_type in text_blocks and sentence_end_pattern.match(line1):
        return line1 + "\n\n" + line2
    elif block_type == "Table":
        return line1 + "\n\n" + line2
    else:
        return line1 + "\n" + line2
|
|
|
|
|
def block_separator(line1, line2, block_type1, block_type2):
    """Return *line2* prefixed with the separator that follows the previous block.

    A blank line is used when the previous block (*block_type1*) is
    ``"Text"``; otherwise a single newline. *line1* and *block_type2* are
    accepted but not consulted.
    """
    prefix = "\n\n" if block_type1 == "Text" else "\n"
    return prefix + line2
|
|
|
|
|
def merge_lines(blocks: List[List[MergedBlock]]):
    """Merge the lines of consecutive same-type blocks into text blocks.

    Blocks are visited page by page. Lines are accumulated into one string
    via ``line_separator`` for as long as the block type stays the same; when
    the type changes, the accumulated text is passed through
    ``block_surround`` and emitted as a ``FullyMergedBlock``. Whatever is
    still pending after the last block is emitted unconditionally at the end.

    A line is flagged as a continuation when its height and left x-coordinate
    both equal those of the previously seen line. The previous line is
    tracked across block and page boundaries.

    Returns:
        The list of ``FullyMergedBlock`` objects in document order.
    """
    merged = []
    last_type = None
    last_line = None
    pending_text = ""
    current_type = ""

    for page_blocks in blocks:
        for merged_block in page_blocks:
            current_type = merged_block.block_type

            # A change of block type closes the run accumulated so far.
            if last_type and current_type != last_type:
                flushed = FullyMergedBlock(
                    text=block_surround(pending_text, last_type),
                    block_type=last_type
                )
                merged.append(flushed)
                pending_text = ""
            last_type = current_type

            for merged_line in merged_block.lines:
                height = merged_line.bbox[3] - merged_line.bbox[1]
                if last_line:
                    ref_height = last_line.bbox[3] - last_line.bbox[1]
                    ref_left = last_line.bbox[0]
                else:
                    ref_height = 0
                    ref_left = 0
                last_line = merged_line

                same_geometry = height == ref_height and merged_line.bbox[0] == ref_left
                if not pending_text:
                    pending_text = merged_line.text
                else:
                    pending_text = line_separator(pending_text, merged_line.text, current_type, same_geometry)

    # Emit whatever is still pending after the last block.
    merged.append(
        FullyMergedBlock(
            text=block_surround(pending_text, last_type),
            block_type=current_type
        )
    )
    return merged
|
|
|
|
|
def get_full_text(text_blocks):
    """Concatenate merged blocks into the final document string.

    The first block contributes its text as-is; every later block is
    appended through ``block_separator``, which receives the text and block
    type of both the previous and the current block.

    Returns:
        The joined text, or an empty string when *text_blocks* is empty.
    """
    pieces = []
    previous = None
    for current in text_blocks:
        if not previous:
            pieces.append(current.text)
        else:
            pieces.append(
                block_separator(previous.text, current.text, previous.block_type, current.block_type)
            )
        previous = current
    return "".join(pieces)
|
|