Ritvik19's picture
Add all files and directories
c8a32e7
raw
history blame
1.13 kB
import re
def _block_bbox(block):
    """Return the bounding box of *block*.

    Blocks may expose the box either as a ``bbox`` attribute or as a
    ``"bbox"`` mapping key; the attribute wins when both exist.
    """
    if hasattr(block, "bbox"):
        return block.bbox
    return block["bbox"]


def sort_table_blocks(blocks, tolerance=5):
    """Order blocks row by row, and by ``bbox[0]`` within each row.

    Blocks are bucketed into rows by ``round(bbox[1] / tolerance)``, where
    ``bbox[1]`` is treated as the vertical position.  Rows are emitted in
    ascending bucket order and the blocks of a row are sorted by ``bbox[0]``
    (the horizontal position).  Sorting is stable, so blocks of the same row
    with an equal ``bbox[0]`` keep their input order.

    Note that bucketing by rounding is an approximation: two blocks closer
    than ``tolerance`` can still land in adjacent buckets when their
    positions straddle a rounding boundary.

    Args:
        blocks: Iterable of objects with a ``bbox`` attribute or mappings
            with a ``"bbox"`` key.  The box must be indexable with at least
            two numeric entries.
        tolerance: Height of a row bucket, in the same unit as the bbox
            coordinates.

    Returns:
        A new list containing the same block objects in sorted order.

    Raises:
        ZeroDivisionError: If ``tolerance`` is 0.
    """
    rows = {}
    for block in blocks:
        row_key = round(_block_bbox(block)[1] / tolerance)
        rows.setdefault(row_key, []).append(block)

    # Emit rows in ascending bucket order; inside a row, order by bbox[0].
    ordered = []
    for row_key in sorted(rows):
        ordered.extend(sorted(rows[row_key], key=lambda b: _block_bbox(b)[0]))
    return ordered
# A run of four or more dots, optionally separated and surrounded by
# whitespace (including newlines).  Written so the whitespace between two dots
# can only be consumed in one way, which keeps backtracking linear per start
# position.
_DOT_RUN_PATTERN = re.compile(r'\s*(?:\.\s*){4,}')


def replace_dots(text):
    """Collapse every run of four or more dots in *text* into a single space.

    A run may have whitespace, including line breaks, before, between and
    after its dots; that whitespace is part of the run and is replaced along
    with the dots.  Runs of fewer than four dots are left untouched.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; *text* unchanged when it contains no such run.
    """
    # No separate "does it match?" pre-check: sub() already returns the input
    # unchanged when nothing matches, so a pre-check only adds a second scan.
    return _DOT_RUN_PATTERN.sub(' ', text)
def replace_newlines(text):
    """Return *text* stripped of outer whitespace, with line breaks as spaces.

    Each run of consecutive ``\\r`` / ``\\n`` characters inside the stripped
    text is replaced by exactly one space; other whitespace is left as is.
    """
    trimmed = text.strip()
    # One space per run of line-break characters, however long the run is.
    return re.sub(r'[\r\n]+', ' ', trimmed)