File size: 7,226 Bytes
c8a32e7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
from marker.schema.merged import MergedLine, MergedBlock, FullyMergedBlock
from marker.schema.page import Page
import re
import regex
from typing import List
def escape_markdown(text):
    """Return ``text`` with each ``#`` character preceded by a backslash.

    ``#`` is the only character escaped; everything else passes through
    unchanged.
    """
    # Character class of everything that must be escaped (currently only "#").
    needs_escape = re.compile(r"[#]")
    # The template emits a literal backslash followed by the matched character.
    return needs_escape.sub(r'\\\g<0>', text)
def surround_text(s, char_to_insert):
    """Wrap the non-whitespace core of ``s`` in ``char_to_insert``.

    Leading and trailing whitespace stay outside the inserted markers, so
    ``surround_text(" word ", "**")`` gives ``" **word** "``.

    Args:
        s: Text to wrap.
        char_to_insert: Marker placed immediately before and after the
            stripped text (for example ``"*"`` or ``"**"``).

    Returns:
        The wrapped string, or ``s`` unchanged when it is empty or contains
        only whitespace.
    """
    stripped_string = s.strip()
    if not stripped_string:
        # Nothing to wrap. Without this guard the leading and trailing matches
        # below would both capture the same whitespace run, so the output
        # would contain that whitespace twice around an empty marker pair.
        return s

    leading_whitespace = re.match(r'^(\s*)', s).group(1)
    trailing_whitespace = re.search(r'(\s*)$', s).group(1)
    modified_string = char_to_insert + stripped_string + char_to_insert
    return leading_whitespace + modified_string + trailing_whitespace
def merge_spans(pages: List[Page]) -> List[List[MergedBlock]]:
    """Collapse the spans of every line into a single text string per line.

    For each page, every block is rebuilt as a ``MergedBlock`` whose lines are
    ``MergedLine`` objects holding the concatenated span text, the lower-cased
    font of each span, and the original line bbox. Lines without spans are
    skipped, and blocks left with no lines are dropped.

    Interior spans (neither first nor last on the line) longer than three
    characters are wrapped in ``*`` when italic, otherwise in ``**`` when
    bold, but only if the look-ahead span does not carry the same style.

    Args:
        pages: Pages whose ``blocks`` / ``lines`` / ``spans`` are traversed.

    Returns:
        One list of merged blocks per input page, in page order.
    """
    result = []
    for page in pages:
        blocks_on_page = []
        for block in page.blocks:
            merged_lines = []
            for line in block.lines:
                spans = line.spans
                span_count = len(spans)
                if span_count == 0:
                    continue

                span_fonts = []
                pieces = []
                for idx, span in enumerate(spans):
                    span_fonts.append(span.font.lower())

                    # Look ahead to the first later span whose stripped text is
                    # longer than two characters; when none qualifies this ends
                    # up as the last span of the line (or None if there are no
                    # later spans at all).
                    lookahead = None
                    for candidate in spans[idx + 1:]:
                        lookahead = candidate
                        if len(candidate.text.strip()) > 2:
                            break

                    piece = span.text
                    # Don't bold or italicize very short sequences, and leave
                    # the first and last span alone so lines can be joined
                    # properly later.
                    is_interior = 0 < idx < span_count - 1
                    if is_interior and len(piece) > 3:
                        if span.italic and (not lookahead or not lookahead.italic):
                            piece = surround_text(piece, "*")
                        elif span.bold and (not lookahead or not lookahead.bold):
                            piece = surround_text(piece, "**")
                    pieces.append(piece)

                merged_lines.append(
                    MergedLine(text="".join(pieces), fonts=span_fonts, bbox=line.bbox)
                )

            if merged_lines:
                blocks_on_page.append(
                    MergedBlock(
                        lines=merged_lines,
                        pnum=block.pnum,
                        bbox=block.bbox,
                        block_type=block.block_type,
                    )
                )
        result.append(blocks_on_page)
    return result
def block_surround(text, block_type):
    """Decorate the merged text of a block according to its type.

    * "Section-header" / "Title": stripped, title-cased and prefixed with
      ``## `` / ``# `` unless the text already starts with ``#``.
    * "List-item" / "Text": passed through ``escape_markdown``.
    * "Table": wrapped in newlines.
    * "Code": wrapped in a fenced code block.
    * "Formula": stripped and wrapped in newlines, but only when the stripped
      text both starts and ends with ``$$``.

    Any other block type returns ``text`` unchanged.
    """
    if block_type in ("Section-header", "Title"):
        if text.startswith("#"):
            # Already a markdown heading; leave it as is.
            return text
        heading = text.strip().title()
        if block_type == "Title":
            return "# " + heading + "\n"
        return "\n## " + heading + "\n"

    if block_type in ("List-item", "Text"):
        return escape_markdown(text)

    if block_type == "Table":
        return "\n" + text + "\n"

    if block_type == "Code":
        return "\n```\n" + text + "\n```\n"

    if block_type == "Formula":
        core = text.strip()
        if core.startswith("$$") and core.endswith("$$"):
            return "\n" + core + "\n"

    return text
def line_separator(line1, line2, block_type, is_continuation=False):
    """Join ``line1`` and ``line2`` with a separator chosen from their content.

    The rules are tried in order and the first one that applies wins:

    1. ``line1`` ends with a lowercase/caseless letter or digit followed by a
       hyphen-like character, and ``line2`` starts with such a letter or
       digit: the hyphen is removed and the lines are concatenated directly.
    2. "Title" / "Section-header" blocks: joined with a single space.
    3. "Formula" blocks: joined with a newline.
    4. Text-like blocks where ``line1`` ends mid-sentence and ``line2`` starts
       with a letter or digit: joined with a single space.
    5. ``is_continuation`` is true: joined with a single space.
    6. Text-like blocks where ``line1`` ends with sentence-ending punctuation:
       joined with a blank line.
    7. "Table" blocks: joined with a blank line.
    8. Everything else: joined with a newline.

    Args:
        line1: Text accumulated so far.
        line2: Text of the line being appended.
        block_type: Type label of the block the lines belong to.
        is_continuation: Caller's hint that ``line2`` continues ``line1``.

    Returns:
        The joined string.
    """
    # Should cover latin-derived languages and russian
    # NOTE: these fragments are interpolated inside [...] character classes,
    # where "|" is a literal pipe rather than alternation. They therefore
    # list the classes back to back, so a "|" in the text is not mistaken
    # for a letter.
    lowercase_letters = r'\p{Lo}\p{Ll}\d'
    hyphens = r'-—¬'
    # Remove hyphen in current line if next line and current line appear to be joined
    hyphen_pattern = regex.compile(rf'.*[{lowercase_letters}][{hyphens}]\s?$', regex.DOTALL)
    if line1 and hyphen_pattern.match(line1) and regex.match(rf"^\s?[{lowercase_letters}]", line2):
        # Split on — or - from the right
        line1 = regex.split(rf"[{hyphens}]\s?$", line1)[0]
        return line1.rstrip() + line2.lstrip()

    all_letters = r'\p{L}\d'
    sentence_continuations = r',;\(\—\"\'\*'
    sentence_ends = r'。ๆ\.?!'

    line_end_pattern = regex.compile(rf'.*[{lowercase_letters}][{sentence_continuations}]?\s?$', regex.DOTALL)
    line_start_pattern = regex.compile(rf'^\s?[{all_letters}]', regex.DOTALL)
    sentence_end_pattern = regex.compile(rf'.*[{sentence_ends}]\s?$', regex.DOTALL)

    text_blocks = ["Text", "List-item", "Footnote", "Caption", "Figure"]
    if block_type in ["Title", "Section-header"]:
        return line1.rstrip() + " " + line2.lstrip()
    elif block_type == "Formula":
        return line1 + "\n" + line2
    elif line_end_pattern.match(line1) and line_start_pattern.match(line2) and block_type in text_blocks:
        return line1.rstrip() + " " + line2.lstrip()
    elif is_continuation:
        return line1.rstrip() + " " + line2.lstrip()
    elif block_type in text_blocks and sentence_end_pattern.match(line1):
        return line1 + "\n\n" + line2
    elif block_type == "Table":
        return line1 + "\n\n" + line2
    else:
        return line1 + "\n" + line2
def block_separator(line1, line2, block_type1, block_type2):
    """Return ``line2`` prefixed with the separator that follows the previous block.

    A blank line (two newlines) is used when the previous block type is
    "Text"; otherwise a single newline. ``line1`` and ``block_type2`` are
    accepted but not used.
    """
    gap = "\n\n" if block_type1 == "Text" else "\n"
    return gap + line2
def merge_lines(blocks: List[List[MergedBlock]]):
    """Merge the lines of consecutive same-type blocks into ``FullyMergedBlock`` items.

    Blocks are walked page by page. Lines are joined with ``line_separator``
    into a running text; whenever the block type changes, the running text is
    passed through ``block_surround`` and emitted as one ``FullyMergedBlock``.
    A final block is always emitted after the loop, even for empty input.

    Args:
        blocks: Per-page lists of merged blocks.

    Returns:
        The list of fully merged blocks in document order.
    """
    merged = []
    running_text = ""
    running_type = None
    block_type = ""
    last_line = None

    for page in blocks:
        for block in page:
            block_type = block.block_type

            # A change of block type closes the run collected so far.
            if running_type and block_type != running_type:
                merged.append(
                    FullyMergedBlock(
                        text=block_surround(running_text, running_type),
                        block_type=running_type,
                    )
                )
                running_text = ""
            running_type = block_type

            # Join lines in the block together properly
            for line in block.lines:
                height = line.bbox[3] - line.bbox[1]
                if last_line:
                    last_height = last_line.bbox[3] - last_line.bbox[1]
                    last_x = last_line.bbox[0]
                else:
                    last_height = 0
                    last_x = 0
                # The previous line is tracked across block and page
                # boundaries, not just within the current block.
                last_line = line

                # Same height and same left edge as the previous line is
                # treated as a continuation of it.
                is_continuation = height == last_height and line.bbox[0] == last_x

                if running_text:
                    running_text = line_separator(
                        running_text, line.text, block_type, is_continuation
                    )
                else:
                    running_text = line.text

    # Append the final block
    merged.append(
        FullyMergedBlock(
            text=block_surround(running_text, running_type),
            block_type=block_type,
        )
    )
    return merged
def get_full_text(text_blocks):
    """Concatenate the text of all blocks into one string.

    The first block contributes its text as is. Every later block contributes
    whatever ``block_separator`` returns for it and its predecessor.

    Args:
        text_blocks: Iterable of objects exposing ``text`` and ``block_type``.

    Returns:
        The combined text, or an empty string when there are no blocks.
    """
    pieces = []
    previous = None
    for current in text_blocks:
        if previous:
            pieces.append(
                block_separator(
                    previous.text, current.text, previous.block_type, current.block_type
                )
            )
        else:
            pieces.append(current.text)
        previous = current
    return "".join(pieces)
|