File size: 1,128 Bytes
c8a32e7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
import re
def sort_table_blocks(blocks, tolerance=5):
    """Return the blocks ordered bucket by bucket, left to right in each bucket.

    Every block must expose a bounding box, either as a ``bbox`` attribute
    or under a ``"bbox"`` key. Blocks are bucketed by
    ``round(bbox[1] / tolerance)`` (the second bbox component, presumably
    the top y coordinate), so blocks whose values fall in the same
    tolerance-sized band share a bucket.

    Args:
        blocks: Iterable of objects or mappings carrying a ``bbox``.
        tolerance: Divisor applied to ``bbox[1]`` before rounding.

    Returns:
        A new flat list: buckets in ascending key order, and within each
        bucket blocks in ascending ``bbox[0]`` (ties keep input order).
    """
    def bbox_of(item):
        # Support both attribute-style and mapping-style blocks.
        return item.bbox if hasattr(item, "bbox") else item["bbox"]

    rows = {}
    for item in blocks:
        row_key = round(bbox_of(item)[1] / tolerance)
        rows.setdefault(row_key, []).append(item)

    # Emit buckets top to bottom, each one sorted horizontally.
    ordered = []
    for row_key in sorted(rows):
        ordered += sorted(rows[row_key], key=lambda item: bbox_of(item)[0])
    return ordered
def replace_dots(text):
    """Collapse each run of four or more dots into a single space.

    A run may have whitespace (including newlines) before, between and
    after the dots; the whole run, surrounding whitespace included, is
    replaced by one space. Runs of fewer than four dots are left alone.

    Args:
        text: The string to clean.

    Returns:
        The text with every qualifying dot run replaced by ``' '``.
    """
    # No separate "does it match anywhere" pre-check: sub() already returns
    # the text unchanged when nothing matches, and the former DOTALL
    # ``.*(...){4,}.*`` probe only added a second backtracking-heavy scan.
    return re.sub(r'(\s*\.\s*){4,}', ' ', text)
def replace_newlines(text):
    """Strip *text* and turn each run of CR/LF characters into one space.

    Leading and trailing whitespace is removed first; every remaining
    run of ``\\r`` and/or ``\\n`` characters is then replaced by a single
    space.
    """
    trimmed = text.strip()
    return re.sub(r'[\r\n]+', ' ', trimmed)
|