File size: 3,384 Bytes
c8a32e7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
from collections import Counter

import numpy as np
from sklearn.cluster import DBSCAN

from marker.schema.bbox import rescale_bbox, box_intersection_pct
from marker.schema.page import Page
from marker.settings import settings
def cluster_coords(coords):
    """Collapse nearby 1-D coordinates into one representative value each.

    Duplicate values are dropped, the remaining ones are grouped with DBSCAN
    (``eps=5``, ``min_samples=1``), and every group is replaced by its mean.

    Args:
        coords: Sized collection of hashable numeric coordinates.

    Returns:
        The cluster means in ascending order, or an empty list when
        ``coords`` is empty.
    """
    if len(coords) == 0:
        return []

    # Shape the unique, sorted values as a single-feature column for DBSCAN.
    points = np.array(sorted(set(coords))).reshape(-1, 1)
    labels = DBSCAN(eps=5, min_samples=1).fit(points).labels_

    # One mean per distinct cluster label, returned in increasing order.
    return sorted(np.mean(points[labels == label]) for label in set(labels))
def find_column_separators(page: Page, table_box, round_factor=4, min_count=1):
    """Estimate the x-positions that separate the columns of a table.

    The text-line boxes of ``page`` are rescaled into page coordinates and
    filtered to those overlapping ``table_box``. Their left edges, right
    edges and horizontal centers are snapped to a grid, values that do not
    repeat often enough are discarded, and the survivors are clustered with
    ``cluster_coords``. Whichever of the three candidate lists has the most
    clusters is used (ties favour left edges, then right edges).

    Args:
        page: Page whose ``text_lines`` and ``bbox`` are read.
        table_box: Bounding box of the table; presumably
            ``(x0, y0, x1, y1)`` like the line boxes -- confirm with caller.
        round_factor: Grid size coordinates are floored to before counting.
        min_count: A snapped coordinate is kept only if it occurs more than
            this many times.

    Returns:
        List of separator positions, starting with ``page.bbox[0]`` and
        ending with ``page.bbox[2]``.
    """
    rescaled = [
        rescale_bbox(page.text_lines.image_bbox, page.bbox, line.bbox)
        for line in page.text_lines.bboxes
    ]
    line_boxes = [
        box for box in rescaled
        if box_intersection_pct(box, table_box) > settings.BBOX_INTERSECTION_THRESH
    ]

    def snap(value):
        # Floor division is required here: true division followed by
        # multiplication is a no-op and would leave near-identical edges
        # distinct, so they would never be counted as repeats.
        return value // round_factor * round_factor

    def keep_repeated(values):
        # Counter gives each frequency in one pass instead of list.count per item.
        counts = Counter(values)
        return [v for v in values if counts[v] > min_count]

    left_edges = [snap(box[0]) for box in line_boxes]
    right_edges = [snap(box[2]) for box in line_boxes]
    centers = [snap((box[0] + box[2]) / 2) for box in line_boxes]

    candidates = [
        cluster_coords(keep_repeated(values))
        for values in (left_edges, right_edges, centers)
    ]

    # Use the candidate list with the most separators (first one wins ties).
    separators = max(candidates, key=len)

    # Always bracket the result with the page's left and right bounds.
    return [page.bbox[0], *separators, page.bbox[2]]
def assign_cells_to_columns(page, table_box, rows, round_factor=4, tolerance=4):
    """Place the cells of each table row into shared columns.

    Column separators are obtained from ``find_column_separators``. Within a
    row, each cell goes to the first separator index that lies to the right
    of the cell's left edge (minus ``tolerance``) and is greater than the
    column used by the previous cell. Cells that match no separator are
    appended in extra columns after the last separator. Rows are then padded
    with ``""`` to equal length and columns that are blank in every row are
    removed.

    Args:
        page: Page passed through to ``find_column_separators``.
        table_box: Table bounding box passed through to
            ``find_column_separators``.
        rows: Sequence of rows; each cell is indexed as ``cell[0][0]`` (left
            edge) and ``cell[1]`` (text) -- presumably ``(bbox, text)``
            pairs, confirm with caller.
        round_factor: Forwarded to ``find_column_separators``.
        tolerance: Slack subtracted from a cell's left edge before comparing
            it with a separator.

    Returns:
        List of rows, each a list of cell texts of equal length. An empty
        list when ``rows`` is empty.
    """
    # Nothing to lay out; also avoids max() on an empty sequence below.
    if not rows:
        return []

    separators = find_column_separators(page, table_box, round_factor=round_factor)

    new_rows = []
    for row in rows:
        by_column = {}
        last_col_index = -1
        # Counts the cells of this row that fell past every separator.
        overflow = 0
        for cell in row:
            left_edge = cell[0][0]
            column_index = next(
                (
                    i for i, separator in enumerate(separators)
                    if left_edge - tolerance < separator and last_col_index < i
                ),
                None,
            )
            if column_index is None:
                # No separator fits: put the cell in a fresh trailing column.
                column_index = len(separators) + overflow
                overflow += 1
            by_column[column_index] = cell[1]
            last_col_index = column_index

        new_rows.append([by_column[idx] for idx in sorted(by_column)])

    # Pad rows to have the same length
    max_row_len = max((len(r) for r in new_rows), default=0)
    for row in new_rows:
        row.extend([""] * (max_row_len - len(row)))

    # Columns whose cells are all blank carry no information; drop them.
    cols_to_remove = {
        idx for idx, col in enumerate(zip(*new_rows))
        if not any(cell.strip() for cell in col)
    }
    return [
        [text for idx, text in enumerate(row) if idx not in cols_to_remove]
        for row in new_rows
    ]
|