|
import re |
|
|
|
|
|
def sort_table_blocks(blocks, tolerance=5): |
|
vertical_groups = {} |
|
for block in blocks: |
|
if hasattr(block, "bbox"): |
|
bbox = block.bbox |
|
else: |
|
bbox = block["bbox"] |
|
group_key = round(bbox[1] / tolerance) |
|
if group_key not in vertical_groups: |
|
vertical_groups[group_key] = [] |
|
vertical_groups[group_key].append(block) |
|
|
|
|
|
sorted_blocks = [] |
|
for _, group in sorted(vertical_groups.items()): |
|
sorted_group = sorted(group, key=lambda x: x.bbox[0] if hasattr(x, "bbox") else x["bbox"][0]) |
|
sorted_blocks.extend(sorted_group) |
|
|
|
return sorted_blocks |
|
|
|
|
|
def replace_dots(text): |
|
dot_pattern = re.compile(r'(\s*\.\s*){4,}') |
|
dot_multiline_pattern = re.compile(r'.*(\s*\.\s*){4,}.*', re.DOTALL) |
|
|
|
if dot_multiline_pattern.match(text): |
|
text = dot_pattern.sub(' ', text) |
|
return text |
|
|
|
|
|
def replace_newlines(text): |
|
|
|
newline_pattern = re.compile(r'[\r\n]+') |
|
return newline_pattern.sub(' ', text.strip()) |
|
|