File size: 30,807 Bytes
240e0a0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 |
# 定义这里的bbox是一个list [x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], 初始时候idx_x, idx_y都是None
# 其中x0, y0代表左上角坐标,x1, y1代表右下角坐标,坐标原点在左上角。
from magic_pdf.layout.layout_spiler_recog import get_spilter_of_page
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_vertical_full_overlap
from magic_pdf.libs.commons import mymax
X0_IDX = 0
Y0_IDX = 1
X1_IDX = 2
Y1_IDX = 3
CONTENT_IDX = 4
IDX_X = 5
IDX_Y = 6
CONTENT_TYPE_IDX = 7
X0_EXT_IDX = 8
Y0_EXT_IDX = 9
X1_EXT_IDX = 10
Y1_EXT_IDX = 11
def prepare_bboxes_for_layout_split(image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info, text_raw_blocks: dict, page_boundry, page):
"""
text_raw_blocks:结构参考test/assets/papre/pymu_textblocks.json
把bbox重新组装成一个list,每个元素[x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], 初始时候idx_x, idx_y都是None. 对于图片、公式来说,block_content是图片的地址, 对于段落来说,block_content是pymupdf里的block结构
"""
all_bboxes = []
for image in image_info:
box = image['bbox']
# 由于没有实现横向的栏切分,因此在这里先过滤掉一些小的图片。这些图片有可能影响layout,造成没有横向栏切分的情况下,layout切分不准确。例如 scihub_76500000/libgen.scimag76570000-76570999.zip_10.1186/s13287-019-1355-1
# 把长宽都小于50的去掉
if abs(box[0]-box[2]) < 50 and abs(box[1]-box[3]) < 50:
continue
all_bboxes.append([box[0], box[1], box[2], box[3], None, None, None, 'image', None, None, None, None])
for table in table_info:
box = table['bbox']
all_bboxes.append([box[0], box[1], box[2], box[3], None, None, None, 'table', None, None, None, None])
"""由于公式与段落混合,因此公式不再参与layout划分,无需加入all_bboxes"""
# 加入文本block
text_block_temp = []
for block in text_raw_blocks:
bbox = block['bbox']
text_block_temp.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'text', None, None, None, None])
text_block_new = resolve_bbox_overlap_for_layout_det(text_block_temp)
text_block_new = filter_lines_bbox(text_block_new) # 去掉线条bbox,有可能让layout探测陷入无限循环
"""找出会影响layout的色块、横向分割线"""
spilter_bboxes = get_spilter_of_page(page, [b['bbox'] for b in image_info]+[b['bbox'] for b in image_backup_info], [b['bbox'] for b in table_info], )
# 还要去掉存在于spilter_bboxes里的text_block
if len(spilter_bboxes) > 0:
text_block_new = [box for box in text_block_new if not any([_is_in_or_part_overlap(box[:4], spilter_bbox) for spilter_bbox in spilter_bboxes])]
for bbox in text_block_new:
all_bboxes.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'text', None, None, None, None])
for bbox in spilter_bboxes:
all_bboxes.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'spilter', None, None, None, None])
return all_bboxes
def resolve_bbox_overlap_for_layout_det(bboxes:list):
"""
1. 去掉bbox互相包含的,去掉被包含的
2. 上下方向上如果有重叠,就扩大大box范围,直到覆盖小box
"""
def _is_in_other_bbox(i:int):
"""
判断i个box是否被其他box有所包含
"""
for j in range(0, len(bboxes)):
if j!=i and _is_in(bboxes[i][:4], bboxes[j][:4]):
return True
# elif j!=i and _is_bottom_full_overlap(bboxes[i][:4], bboxes[j][:4]):
# return True
return False
# 首先去掉被包含的bbox
new_bbox_1 = []
for i in range(0, len(bboxes)):
if not _is_in_other_bbox(i):
new_bbox_1.append(bboxes[i])
# 其次扩展大的box
new_box = []
new_bbox_2 = []
len_1 = len(new_bbox_2)
while True:
merged_idx = []
for i in range(0, len(new_bbox_1)):
if i in merged_idx:
continue
for j in range(i+1, len(new_bbox_1)):
if j in merged_idx:
continue
bx1 = new_bbox_1[i]
bx2 = new_bbox_1[j]
if i!=j and _is_vertical_full_overlap(bx1[:4], bx2[:4]):
merged_box = min([bx1[0], bx2[0]]), min([bx1[1], bx2[1]]), max([bx1[2], bx2[2]]), max([bx1[3], bx2[3]])
new_bbox_2.append(merged_box)
merged_idx.append(i)
merged_idx.append(j)
for i in range(0, len(new_bbox_1)): # 没有合并的加入进来
if i not in merged_idx:
new_bbox_2.append(new_bbox_1[i])
if len(new_bbox_2)==0 or len_1==len(new_bbox_2):
break
else:
len_1 = len(new_bbox_2)
new_box = new_bbox_2
new_bbox_1, new_bbox_2 = new_bbox_2, []
return new_box
def filter_lines_bbox(bboxes: list):
"""
过滤掉bbox为空的行
"""
new_box = []
for box in bboxes:
x0, y0, x1, y1 = box[0], box[1], box[2], box[3]
if abs(x0-x1)<=1 or abs(y0-y1)<=1:
continue
else:
new_box.append(box)
return new_box
################################################################################
# 第一种排序算法
# 以下是基于延长线遮挡做的一个算法
#
################################################################################
def find_all_left_bbox(this_bbox, all_bboxes) -> list:
"""
寻找this_bbox左边的所有bbox
"""
left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX]]
return left_boxes
def find_all_top_bbox(this_bbox, all_bboxes) -> list:
"""
寻找this_bbox上面的所有bbox
"""
top_boxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX]]
return top_boxes
def get_and_set_idx_x(this_bbox, all_bboxes) -> int:
"""
寻找this_bbox在all_bboxes中的遮挡深度 idx_x
"""
if this_bbox[IDX_X] is not None:
return this_bbox[IDX_X]
else:
all_left_bboxes = find_all_left_bbox(this_bbox, all_bboxes)
if len(all_left_bboxes) == 0:
this_bbox[IDX_X] = 0
else:
all_left_bboxes_idx = [get_and_set_idx_x(bbox, all_bboxes) for bbox in all_left_bboxes]
max_idx_x = mymax(all_left_bboxes_idx)
this_bbox[IDX_X] = max_idx_x + 1
return this_bbox[IDX_X]
def get_and_set_idx_y(this_bbox, all_bboxes) -> int:
"""
寻找this_bbox在all_bboxes中y方向的遮挡深度 idx_y
"""
if this_bbox[IDX_Y] is not None:
return this_bbox[IDX_Y]
else:
all_top_bboxes = find_all_top_bbox(this_bbox, all_bboxes)
if len(all_top_bboxes) == 0:
this_bbox[IDX_Y] = 0
else:
all_top_bboxes_idx = [get_and_set_idx_y(bbox, all_bboxes) for bbox in all_top_bboxes]
max_idx_y = mymax(all_top_bboxes_idx)
this_bbox[IDX_Y] = max_idx_y + 1
return this_bbox[IDX_Y]
def bbox_sort(all_bboxes: list):
"""
排序
"""
all_bboxes_idx_x = [get_and_set_idx_x(bbox, all_bboxes) for bbox in all_bboxes]
all_bboxes_idx_y = [get_and_set_idx_y(bbox, all_bboxes) for bbox in all_bboxes]
all_bboxes_idx = [(idx_x, idx_y) for idx_x, idx_y in zip(all_bboxes_idx_x, all_bboxes_idx_y)]
all_bboxes_idx = [idx_x_y[0] * 100000 + idx_x_y[1] for idx_x_y in all_bboxes_idx] # 变换成一个点,保证能够先X,X相同时按Y排序
all_bboxes_idx = list(zip(all_bboxes_idx, all_bboxes))
all_bboxes_idx.sort(key=lambda x: x[0])
sorted_bboxes = [bbox for idx, bbox in all_bboxes_idx]
return sorted_bboxes
################################################################################
# 第二种排序算法
# 下面的算法在计算idx_x和idx_y的时候不考虑延长线,而只考虑实际的长或者宽被遮挡的情况
#
################################################################################
def find_left_nearest_bbox(this_bbox, all_bboxes) -> list:
"""
在all_bboxes里找到所有右侧高度和this_bbox有重叠的bbox
"""
left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX] and any([
box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]])]
# 然后再过滤一下,找到水平上距离this_bbox最近的那个
if len(left_boxes) > 0:
left_boxes.sort(key=lambda x: x[X1_IDX], reverse=True)
left_boxes = [left_boxes[0]]
else:
left_boxes = []
return left_boxes
def get_and_set_idx_x_2(this_bbox, all_bboxes):
"""
寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_x
这个遮挡深度不考虑延长线,而是被实际的长或者宽遮挡的情况
"""
if this_bbox[IDX_X] is not None:
return this_bbox[IDX_X]
else:
left_nearest_bbox = find_left_nearest_bbox(this_bbox, all_bboxes)
if len(left_nearest_bbox) == 0:
this_bbox[IDX_X] = 0
else:
left_idx_x = get_and_set_idx_x_2(left_nearest_bbox[0], all_bboxes)
this_bbox[IDX_X] = left_idx_x + 1
return this_bbox[IDX_X]
def find_top_nearest_bbox(this_bbox, all_bboxes) -> list:
"""
在all_bboxes里找到所有下侧宽度和this_bbox有重叠的bbox
"""
top_boxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([
box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
# 然后再过滤一下,找到水平上距离this_bbox最近的那个
if len(top_boxes) > 0:
top_boxes.sort(key=lambda x: x[Y1_IDX], reverse=True)
top_boxes = [top_boxes[0]]
else:
top_boxes = []
return top_boxes
def get_and_set_idx_y_2(this_bbox, all_bboxes):
"""
寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_y
这个遮挡深度不考虑延长线,而是被实际的长或者宽遮挡的情况
"""
if this_bbox[IDX_Y] is not None:
return this_bbox[IDX_Y]
else:
top_nearest_bbox = find_top_nearest_bbox(this_bbox, all_bboxes)
if len(top_nearest_bbox) == 0:
this_bbox[IDX_Y] = 0
else:
top_idx_y = get_and_set_idx_y_2(top_nearest_bbox[0], all_bboxes)
this_bbox[IDX_Y] = top_idx_y + 1
return this_bbox[IDX_Y]
def paper_bbox_sort(all_bboxes: list, page_width, page_height):
all_bboxes_idx_x = [get_and_set_idx_x_2(bbox, all_bboxes) for bbox in all_bboxes]
all_bboxes_idx_y = [get_and_set_idx_y_2(bbox, all_bboxes) for bbox in all_bboxes]
all_bboxes_idx = [(idx_x, idx_y) for idx_x, idx_y in zip(all_bboxes_idx_x, all_bboxes_idx_y)]
all_bboxes_idx = [idx_x_y[0] * 100000 + idx_x_y[1] for idx_x_y in all_bboxes_idx] # 变换成一个点,保证能够先X,X相同时按Y排序
all_bboxes_idx = list(zip(all_bboxes_idx, all_bboxes))
all_bboxes_idx.sort(key=lambda x: x[0])
sorted_bboxes = [bbox for idx, bbox in all_bboxes_idx]
return sorted_bboxes
################################################################################
"""
第三种排序算法, 假设page的最左侧为X0,最右侧为X1,最上侧为Y0,最下侧为Y1
这个排序算法在第二种算法基础上增加对bbox的预处理步骤。预处理思路如下:
1. 首先在水平方向上对bbox进行扩展。扩展方法是:
- 对每个bbox,找到其左边最近的bbox(也就是y方向有重叠),然后将其左边界扩展到左边最近bbox的右边界(x1+1),这里加1是为了避免重叠。如果没有左边的bbox,那么就将其左边界扩展到page的最左侧X0。
- 对每个bbox,找到其右边最近的bbox(也就是y方向有重叠),然后将其右边界扩展到右边最近bbox的左边界(x0-1),这里减1是为了避免重叠。如果没有右边的bbox,那么就将其右边界扩展到page的最右侧X1。
- 经过上面2个步骤,bbox扩展到了水平方向的最大范围。[左最近bbox.x1+1, 右最近bbox.x0-1]
2. 合并所有的连续水平方向的bbox, 合并方法是:
- 对bbox进行y方向排序,然后从上到下遍历所有bbox,如果当前bbox和下一个bbox的x0, x1等于X0, X1,那么就合并这两个bbox。
3. 然后在垂直方向上对bbox进行扩展。扩展方法是:
- 首先从page上切割掉合并后的水平bbox, 得到几个新的block
针对每个block
- x0: 扎到位于左侧x=x0延长线的左侧所有的bboxes, 找到最大的x1,让x0=x1+1。如果没有,则x0=X0
- x1: 找到位于右侧x=x1延长线右侧所有的bboxes, 找到最小的x0, 让x1=x0-1。如果没有,则x1=X1
随后在垂直方向上合并所有的连续的block,方法如下:
- 对block进行x方向排序,然后从左到右遍历所有block,如果当前block和下一个block的x0, x1相等,那么就合并这两个block。
如果垂直切分后所有小bbox都被分配到了一个block, 那么分割就完成了。这些合并后的block打上标签'GOOD_LAYOUT’
如果在某个垂直方向上无法被完全分割到一个block,那么就将这个block打上标签'BAD_LAYOUT'。
至此完成,一个页面的预处理,天然的block要么属于'GOOD_LAYOUT',要么属于'BAD_LAYOUT'。针对含有'BAD_LAYOUT'的页面,可以先按照自上而下,自左到右进行天然排序,也可以先过滤掉这种书籍。
(完成条件下次加强:进行水平方向切分,把混乱的layout部分尽可能切割出去)
"""
################################################################################
def find_left_neighbor_bboxes(this_bbox, all_bboxes) -> list:
"""
在all_bboxes里找到所有右侧高度和this_bbox有重叠的bbox
这里使用扩展之后的bbox
"""
left_boxes = [box for box in all_bboxes if box[X1_EXT_IDX] <= this_bbox[X0_EXT_IDX] and any([
box[Y0_EXT_IDX] < this_bbox[Y0_EXT_IDX] < box[Y1_EXT_IDX], box[Y0_EXT_IDX] < this_bbox[Y1_EXT_IDX] < box[Y1_EXT_IDX],
this_bbox[Y0_EXT_IDX] < box[Y0_EXT_IDX] < this_bbox[Y1_EXT_IDX], this_bbox[Y0_EXT_IDX] < box[Y1_EXT_IDX] < this_bbox[Y1_EXT_IDX],
box[Y0_EXT_IDX]==this_bbox[Y0_EXT_IDX] and box[Y1_EXT_IDX]==this_bbox[Y1_EXT_IDX]])]
# 然后再过滤一下,找到水平上距离this_bbox最近的那个
if len(left_boxes) > 0:
left_boxes.sort(key=lambda x: x[X1_EXT_IDX], reverse=True)
left_boxes = left_boxes
else:
left_boxes = []
return left_boxes
def find_top_neighbor_bboxes(this_bbox, all_bboxes) -> list:
"""
在all_bboxes里找到所有下侧宽度和this_bbox有重叠的bbox
这里使用扩展之后的bbox
"""
top_boxes = [box for box in all_bboxes if box[Y1_EXT_IDX] <= this_bbox[Y0_EXT_IDX] and any([
box[X0_EXT_IDX] < this_bbox[X0_EXT_IDX] < box[X1_EXT_IDX], box[X0_EXT_IDX] < this_bbox[X1_EXT_IDX] < box[X1_EXT_IDX],
this_bbox[X0_EXT_IDX] < box[X0_EXT_IDX] < this_bbox[X1_EXT_IDX], this_bbox[X0_EXT_IDX] < box[X1_EXT_IDX] < this_bbox[X1_EXT_IDX],
box[X0_EXT_IDX]==this_bbox[X0_EXT_IDX] and box[X1_EXT_IDX]==this_bbox[X1_EXT_IDX]])]
# 然后再过滤一下,找到水平上距离this_bbox最近的那个
if len(top_boxes) > 0:
top_boxes.sort(key=lambda x: x[Y1_EXT_IDX], reverse=True)
top_boxes = top_boxes
else:
top_boxes = []
return top_boxes
def get_and_set_idx_x_2_ext(this_bbox, all_bboxes):
"""
寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_x
这个遮挡深度不考虑延长线,而是被实际的长或者宽遮挡的情况
"""
if this_bbox[IDX_X] is not None:
return this_bbox[IDX_X]
else:
left_nearest_bbox = find_left_neighbor_bboxes(this_bbox, all_bboxes)
if len(left_nearest_bbox) == 0:
this_bbox[IDX_X] = 0
else:
left_idx_x = [get_and_set_idx_x_2(b, all_bboxes) for b in left_nearest_bbox]
this_bbox[IDX_X] = mymax(left_idx_x) + 1
return this_bbox[IDX_X]
def get_and_set_idx_y_2_ext(this_bbox, all_bboxes):
"""
寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_y
这个遮挡深度不考虑延长线,而是被实际的长或者宽遮挡的情况
"""
if this_bbox[IDX_Y] is not None:
return this_bbox[IDX_Y]
else:
top_nearest_bbox = find_top_neighbor_bboxes(this_bbox, all_bboxes)
if len(top_nearest_bbox) == 0:
this_bbox[IDX_Y] = 0
else:
top_idx_y = [get_and_set_idx_y_2_ext(b, all_bboxes) for b in top_nearest_bbox]
this_bbox[IDX_Y] = mymax(top_idx_y) + 1
return this_bbox[IDX_Y]
def _paper_bbox_sort_ext(all_bboxes: list):
all_bboxes_idx_x = [get_and_set_idx_x_2_ext(bbox, all_bboxes) for bbox in all_bboxes]
all_bboxes_idx_y = [get_and_set_idx_y_2_ext(bbox, all_bboxes) for bbox in all_bboxes]
all_bboxes_idx = [(idx_x, idx_y) for idx_x, idx_y in zip(all_bboxes_idx_x, all_bboxes_idx_y)]
all_bboxes_idx = [idx_x_y[0] * 100000 + idx_x_y[1] for idx_x_y in all_bboxes_idx] # 变换成一个点,保证能够先X,X相同时按Y排序
all_bboxes_idx = list(zip(all_bboxes_idx, all_bboxes))
all_bboxes_idx.sort(key=lambda x: x[0])
sorted_bboxes = [bbox for idx, bbox in all_bboxes_idx]
return sorted_bboxes
# ===============================================================================================
def find_left_bbox_ext_line(this_bbox, all_bboxes) -> list:
"""
寻找this_bbox左边的所有bbox, 使用延长线
"""
left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX]]
if len(left_boxes):
left_boxes.sort(key=lambda x: x[X1_IDX], reverse=True)
left_boxes = left_boxes[0]
else:
left_boxes = None
return left_boxes
def find_right_bbox_ext_line(this_bbox, all_bboxes) -> list:
"""
寻找this_bbox右边的所有bbox, 使用延长线
"""
right_boxes = [box for box in all_bboxes if box[X0_IDX] >= this_bbox[X1_IDX]]
if len(right_boxes):
right_boxes.sort(key=lambda x: x[X0_IDX])
right_boxes = right_boxes[0]
else:
right_boxes = None
return right_boxes
# =============================================================================================
def find_left_nearest_bbox_direct(this_bbox, all_bboxes) -> list:
"""
在all_bboxes里找到所有右侧高度和this_bbox有重叠的bbox, 不用延长线并且不能像
"""
left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX] and any([
box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]])]
# 然后再过滤一下,找到水平上距离this_bbox最近的那个——x1最大的那个
if len(left_boxes) > 0:
left_boxes.sort(key=lambda x: x[X1_EXT_IDX] if x[X1_EXT_IDX] else x[X1_IDX], reverse=True)
left_boxes = left_boxes[0]
else:
left_boxes = None
return left_boxes
def find_right_nearst_bbox_direct(this_bbox, all_bboxes) -> list:
"""
找到在this_bbox右侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
"""
right_bboxes = [box for box in all_bboxes if box[X0_IDX] >= this_bbox[X1_IDX] and any([
this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]])]
if len(right_bboxes)>0:
right_bboxes.sort(key=lambda x: x[X0_EXT_IDX] if x[X0_EXT_IDX] else x[X0_IDX])
right_bboxes = right_bboxes[0]
else:
right_bboxes = None
return right_bboxes
def reset_idx_x_y(all_boxes:list)->list:
for box in all_boxes:
box[IDX_X] = None
box[IDX_Y] = None
return all_boxes
# ===================================================================================================
def find_top_nearest_bbox_direct(this_bbox, bboxes_collection) -> list:
"""
找到在this_bbox上方且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
"""
top_bboxes = [box for box in bboxes_collection if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([
box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
# 然后再过滤一下,找到上方距离this_bbox最近的那个
if len(top_bboxes) > 0:
top_bboxes.sort(key=lambda x: x[Y1_IDX], reverse=True)
top_bboxes = top_bboxes[0]
else:
top_bboxes = None
return top_bboxes
def find_bottom_nearest_bbox_direct(this_bbox, bboxes_collection) -> list:
"""
找到在this_bbox下方且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
"""
bottom_bboxes = [box for box in bboxes_collection if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([
box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
# 然后再过滤一下,找到水平上距离this_bbox最近的那个
if len(bottom_bboxes) > 0:
bottom_bboxes.sort(key=lambda x: x[Y0_IDX])
bottom_bboxes = bottom_bboxes[0]
else:
bottom_bboxes = None
return bottom_bboxes
def find_boundry_bboxes(bboxes:list) -> tuple:
"""
找到bboxes的边界——找到所有bbox里最小的(x0, y0), 最大的(x1, y1)
"""
x0, y0, x1, y1 = bboxes[0][X0_IDX], bboxes[0][Y0_IDX], bboxes[0][X1_IDX], bboxes[0][Y1_IDX]
for box in bboxes:
x0 = min(box[X0_IDX], x0)
y0 = min(box[Y0_IDX], y0)
x1 = max(box[X1_IDX], x1)
y1 = max(box[Y1_IDX], y1)
return x0, y0, x1, y1
def extend_bbox_vertical(bboxes:list, boundry_x0, boundry_y0, boundry_x1, boundry_y1) -> list:
"""
在垂直方向上扩展能够直接垂直打通的bbox,也就是那些上下都没有其他box的bbox
"""
for box in bboxes:
top_nearest_bbox = find_top_nearest_bbox_direct(box, bboxes)
bottom_nearest_bbox = find_bottom_nearest_bbox_direct(box, bboxes)
if top_nearest_bbox is None and bottom_nearest_bbox is None: # 独占一列
box[X0_EXT_IDX] = box[X0_IDX]
box[Y0_EXT_IDX] = boundry_y0
box[X1_EXT_IDX] = box[X1_IDX]
box[Y1_EXT_IDX] = boundry_y1
# else:
# if top_nearest_bbox is None:
# box[Y0_EXT_IDX] = boundry_y0
# else:
# box[Y0_EXT_IDX] = top_nearest_bbox[Y1_IDX] + 1
# if bottom_nearest_bbox is None:
# box[Y1_EXT_IDX] = boundry_y1
# else:
# box[Y1_EXT_IDX] = bottom_nearest_bbox[Y0_IDX] - 1
# box[X0_EXT_IDX] = box[X0_IDX]
# box[X1_EXT_IDX] = box[X1_IDX]
return bboxes
# ===================================================================================================
def paper_bbox_sort_v2(all_bboxes: list, page_width:int, page_height:int):
"""
增加预处理行为的排序:
return:
[
{
"layout_bbox": [x0, y0, x1, y1],
"layout_label":"GOOD_LAYOUT/BAD_LAYOUT",
"content_bboxes": [] #每个元素都是[x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], 并且顺序就是阅读顺序
}
]
"""
sorted_layouts = [] # 最后的返回结果
page_x0, page_y0, page_x1, page_y1 = 1, 1, page_width-1, page_height-1
all_bboxes = paper_bbox_sort(all_bboxes) # 大致拍下序
# 首先在水平方向上扩展独占一行的bbox
for bbox in all_bboxes:
left_nearest_bbox = find_left_nearest_bbox_direct(bbox, all_bboxes) # 非扩展线
right_nearest_bbox = find_right_nearst_bbox_direct(bbox, all_bboxes)
if left_nearest_bbox is None and right_nearest_bbox is None: # 独占一行
bbox[X0_EXT_IDX] = page_x0
bbox[Y0_EXT_IDX] = bbox[Y0_IDX]
bbox[X1_EXT_IDX] = page_x1
bbox[Y1_EXT_IDX] = bbox[Y1_IDX]
# 此时独占一行的被成功扩展到指定的边界上,这个时候利用边界条件合并连续的bbox,成为一个group
if len(all_bboxes)==1:
return [{"layout_bbox": [page_x0, page_y0, page_x1, page_y1], "layout_label":"GOOD_LAYOUT", "content_bboxes": all_bboxes}]
if len(all_bboxes)==0:
return []
"""
然后合并所有连续水平方向的bbox.
"""
all_bboxes.sort(key=lambda x: x[Y0_IDX])
h_bboxes = []
h_bbox_group = []
v_boxes = []
for bbox in all_bboxes:
if bbox[X0_IDX] == page_x0 and bbox[X1_IDX] == page_x1:
h_bbox_group.append(bbox)
else:
if len(h_bbox_group)>0:
h_bboxes.append(h_bbox_group)
h_bbox_group = []
# 最后一个group
if len(h_bbox_group)>0:
h_bboxes.append(h_bbox_group)
"""
现在h_bboxes里面是所有的group了,每个group都是一个list
对h_bboxes里的每个group进行计算放回到sorted_layouts里
"""
for gp in h_bboxes:
gp.sort(key=lambda x: x[Y0_IDX])
block_info = {"layout_label":"GOOD_LAYOUT", "content_bboxes": gp}
# 然后计算这个group的layout_bbox,也就是最小的x0,y0, 最大的x1,y1
x0, y0, x1, y1 = gp[0][X0_EXT_IDX], gp[0][Y0_EXT_IDX], gp[-1][X1_EXT_IDX], gp[-1][Y1_EXT_IDX]
block_info["layout_bbox"] = [x0, y0, x1, y1]
sorted_layouts.append(block_info)
# 接下来利用这些连续的水平bbox的layout_bbox的y0, y1,从水平上切分开其余的为几个部分
h_split_lines = [page_y0]
for gp in h_bboxes:
layout_bbox = gp['layout_bbox']
y0, y1 = layout_bbox[1], layout_bbox[3]
h_split_lines.append(y0)
h_split_lines.append(y1)
h_split_lines.append(page_y1)
unsplited_bboxes = []
for i in range(0, len(h_split_lines), 2):
start_y0, start_y1 = h_split_lines[i:i+2]
# 然后找出[start_y0, start_y1]之间的其他bbox,这些组成一个未分割板块
bboxes_in_block = [bbox for bbox in all_bboxes if bbox[Y0_IDX]>=start_y0 and bbox[Y1_IDX]<=start_y1]
unsplited_bboxes.append(bboxes_in_block)
# ================== 至此,水平方向的 已经切分排序完毕====================================
"""
接下来针对每个非水平的部分切分垂直方向的
此时,只剩下了无法被完全水平打通的bbox了。对这些box,优先进行垂直扩展,然后进行垂直切分.
分3步:
1. 先把能完全垂直打通的隔离出去当做一个layout
2. 其余的先垂直切分
3. 垂直切分之后的部分再尝试水平切分
4. 剩下的不能被切分的各个部分当成一个layout
"""
# 对每部分进行垂直切分
for bboxes_in_block in unsplited_bboxes:
# 首先对这个block的bbox进行垂直方向上的扩展
boundry_x0, boundry_y0, boundry_x1, boundry_y1 = find_boundry_bboxes(bboxes_in_block)
# 进行垂直方向上的扩展
extended_vertical_bboxes = extend_bbox_vertical(bboxes_in_block, boundry_x0, boundry_y0, boundry_x1, boundry_y1)
# 然后对这个block进行垂直方向上的切分
extend_bbox_vertical.sort(key=lambda x: x[X0_IDX]) # x方向上从小到大,代表了从左到右读取
v_boxes_group = []
for bbox in extended_vertical_bboxes:
if bbox[Y0_IDX]==boundry_y0 and bbox[Y1_IDX]==boundry_y1:
v_boxes_group.append(bbox)
else:
if len(v_boxes_group)>0:
v_boxes.append(v_boxes_group)
v_boxes_group = []
if len(v_boxes_group)>0:
v_boxes.append(v_boxes_group)
# 把连续的垂直部分加入到sorted_layouts里。注意这个时候已经是连续的垂直部分了,因为上面已经做了
for gp in v_boxes:
gp.sort(key=lambda x: x[X0_IDX])
block_info = {"layout_label":"GOOD_LAYOUT", "content_bboxes": gp}
# 然后计算这个group的layout_bbox,也就是最小的x0,y0, 最大的x1,y1
x0, y0, x1, y1 = gp[0][X0_EXT_IDX], gp[0][Y0_EXT_IDX], gp[-1][X1_EXT_IDX], gp[-1][Y1_EXT_IDX]
block_info["layout_bbox"] = [x0, y0, x1, y1]
sorted_layouts.append(block_info)
# 在垂直方向上,划分子块,也就是用贯通的垂直线进行切分。这些被切分出来的块,极大可能是可被垂直切分的,如果不能完全的垂直切分,那么尝试水平切分。都不能的则当成一个layout
v_split_lines = [boundry_x0]
for gp in v_boxes:
layout_bbox = gp['layout_bbox']
x0, x1 = layout_bbox[0], layout_bbox[2]
v_split_lines.append(x0)
v_split_lines.append(x1)
v_split_lines.append(boundry_x1)
reset_idx_x_y(all_bboxes)
all_boxes = _paper_bbox_sort_ext(all_bboxes)
return all_boxes
|