import os
import warnings

from magic_pdf.libs.commons import fitz
|
|
|
|
|
def draw_bbox_on_page(raw_pdf_doc: fitz.Document, paras_dict: dict, save_path: str):
    """
    Draw the bounding boxes of every page entry on new pages and save them to save_path.

    For each value of ``paras_dict`` a page with the size of
    ``raw_pdf_doc[page_idx]`` is appended to the output document and the boxes
    stored under the following keys are drawn on it, in this order:

    * ``preproc_blocks``: blue fill, 20% opacity
    * ``images``: opaque yellow fill
    * ``image_backup``: yellow outline, no fill
    * ``droped_text_block``: black fill, 40% opacity
    * ``tables``: green fill, 20% opacity

    If ``save_path`` already exists, the new pages are appended to that file
    and written with an incremental save; otherwise a new PDF is created
    (together with its parent directory when needed).

    Args:
        raw_pdf_doc: Source document; only its page sizes are read.
        paras_dict: Mapping whose values hold ``page_idx`` plus the box lists
            named above. Every box is a mapping with a ``bbox`` entry that
            ``fitz.Rect`` accepts.
        save_path: Path of the PDF to create or extend.
    """
    color_map = {
        'image': fitz.pdfcolor['yellow'],
        'text': fitz.pdfcolor['blue'],
        'table': fitz.pdfcolor['green'],
    }
    # (key in the page entry, keyword arguments for Shape.finish), in drawing order.
    layers = (
        ('preproc_blocks', {'color': None, 'fill': color_map['text'], 'fill_opacity': 0.2}),
        ('images', {'color': None, 'fill': color_map['image']}),
        ('image_backup', {'color': color_map['image'], 'fill': None}),
        ('droped_text_block', {'color': None, 'fill': fitz.pdfcolor['black'], 'fill_opacity': 0.4}),
        ('tables', {'color': None, 'fill': color_map['table'], 'fill_opacity': 0.2}),
    )

    is_new_pdf = not os.path.exists(save_path)
    doc = fitz.open('') if is_new_pdf else fitz.open(save_path)
    try:
        for page_info in paras_dict.values():
            src_rect = raw_pdf_doc[page_info['page_idx']].rect
            new_page = doc.new_page(width=src_rect.width, height=src_rect.height)

            # One shape per page: each finish() closes one styled rectangle,
            # and a single commit() writes them all to the page.
            shape = new_page.new_shape()
            for key, style in layers:
                for block in page_info[key]:
                    shape.draw_rect(fitz.Rect(block['bbox']))
                    shape.finish(**style)
            shape.commit()

        if is_new_pdf:
            # dirname() is '' for a bare file name; makedirs('') would raise.
            parent_dir = os.path.dirname(save_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            doc.save(save_path)
        else:
            doc.saveIncr()
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
|
|
|
|
|
def debug_show_bbox(
    raw_pdf_doc: fitz.Document,
    page_idx: int,
    bboxes: list,
    droped_bboxes: list,
    expect_drop_bboxes: list,
    save_path: str,
    expected_page_id: int,
):
    """
    Overwrite save_path with a temporary one-page PDF showing three groups of boxes (debug aid).

    Nothing is done unless ``page_idx`` equals ``expected_page_id``. Otherwise
    an existing file at ``save_path`` is removed and a single page with the
    size of ``raw_pdf_doc[page_idx]`` is written, on which are drawn in order:

    * ``bboxes``: red outline, blue fill at 20% opacity
    * ``droped_bboxes``: yellow fill at 20% opacity, no outline
    * ``expect_drop_bboxes``: red outline, no fill

    Args:
        raw_pdf_doc: Source document; only the size of page ``page_idx`` is read.
        page_idx: Index of the page being inspected.
        bboxes: Boxes to draw; the first four items of each are passed to ``fitz.Rect``.
        droped_bboxes: Boxes drawn in the second style; same item layout.
        expect_drop_bboxes: Boxes drawn in the third style; same item layout.
        save_path: Output PDF path; the parent directory is created when needed.
        expected_page_id: The only page index for which output is produced.
    """
    if page_idx != expected_page_id:
        return

    if os.path.exists(save_path):
        os.remove(save_path)

    # (boxes, keyword arguments for Shape.finish), in drawing order.
    layers = (
        (bboxes, {'color': fitz.pdfcolor['red'], 'fill': fitz.pdfcolor['blue'], 'fill_opacity': 0.2}),
        (droped_bboxes, {'color': None, 'fill': fitz.pdfcolor['yellow'], 'fill_opacity': 0.2}),
        (expect_drop_bboxes, {'color': fitz.pdfcolor['red'], 'fill': None}),
    )

    doc = fitz.open('')
    try:
        src_rect = raw_pdf_doc[page_idx].rect
        new_page = doc.new_page(width=src_rect.width, height=src_rect.height)

        # One shape for the page: finish() closes each styled rectangle,
        # a single commit() writes them all.
        shape = new_page.new_shape()
        for boxes, style in layers:
            for bbox in boxes:
                shape.draw_rect(fitz.Rect(*bbox[0:4]))
                shape.finish(**style)
        shape.commit()

        # dirname() is '' for a bare file name; makedirs('') would raise.
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        doc.save(save_path)
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
|
|
|
|
|
def debug_show_page(page, bboxes1: list, bboxes2: list, bboxes3: list, save_path: str = "./tmp/debug.pdf"):
    """
    Overwrite save_path with a one-page PDF showing three groups of boxes (debug aid).

    The output page has the size of ``page.rect``. Drawn in order:

    * ``bboxes1``: red outline, blue fill at 20% opacity
    * ``bboxes2``: yellow fill at 20% opacity, no outline
    * ``bboxes3``: red outline, no fill

    Args:
        page: Object exposing ``rect.width`` and ``rect.height``; only the size is read.
        bboxes1: Boxes to draw; the first four items of each are passed to ``fitz.Rect``.
        bboxes2: Boxes drawn in the second style; same item layout.
        bboxes3: Boxes drawn in the third style; same item layout.
        save_path: Output PDF path. Defaults to ``./tmp/debug.pdf``; the parent
            directory is created when needed.
    """
    if os.path.exists(save_path):
        os.remove(save_path)

    # (boxes, keyword arguments for Shape.finish), in drawing order.
    layers = (
        (bboxes1, {'color': fitz.pdfcolor['red'], 'fill': fitz.pdfcolor['blue'], 'fill_opacity': 0.2}),
        (bboxes2, {'color': None, 'fill': fitz.pdfcolor['yellow'], 'fill_opacity': 0.2}),
        (bboxes3, {'color': fitz.pdfcolor['red'], 'fill': None}),
    )

    doc = fitz.open('')
    try:
        new_page = doc.new_page(width=page.rect.width, height=page.rect.height)

        # One shape for the page: finish() closes each styled rectangle,
        # a single commit() writes them all.
        shape = new_page.new_shape()
        for boxes, style in layers:
            for bbox in boxes:
                shape.draw_rect(fitz.Rect(*bbox[0:4]))
                shape.finish(**style)
        shape.commit()

        # dirname() is '' for a bare file name; makedirs('') would raise.
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        doc.save(save_path)
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
|
|
|
|
|
|
|
|
|
def draw_layout_bbox_on_page(raw_pdf_doc: fitz.Document, paras_dict: dict, header, footer, pdf_path: str):
    """
    Draw layout boxes, their index, and the header/footer areas on the pages of pdf_path.

    For each value of ``paras_dict`` the page ``page_idx`` of the output
    document receives:

    * one red-bordered rectangle per entry of ``layout_bboxes`` (filled pink
      at 40% opacity when ``layout_label`` is ``'U'``), labelled with the
      entry's position in the list;
    * the ``header`` and ``footer`` rectangles, when given, filled black at
      20% opacity.

    If ``pdf_path`` already exists it is modified and written with an
    incremental save. Otherwise a new PDF is created; the pages needed to
    reach ``page_idx`` are added as blank pages sized like the matching pages
    of ``raw_pdf_doc``.

    Args:
        raw_pdf_doc: Source document; only read for page sizes when a new PDF
            has to be created.
        paras_dict: Mapping whose values hold ``page_idx`` and
            ``layout_bboxes``; each layout is a mapping with ``layout_bbox``
            (at least four coordinates) and ``layout_label``.
        header: Rectangle-like value accepted by ``fitz.Rect``, or a falsy
            value to skip it.
        footer: Same as ``header``.
        pdf_path: Path of the PDF to modify or create.
    """
    border_offset = 1
    font_size = 10

    is_new_pdf = not os.path.exists(pdf_path)
    doc = fitz.open('') if is_new_pdf else fitz.open(pdf_path)
    try:
        for page_info in paras_dict.values():
            page_idx = page_info['page_idx']
            layouts = page_info['layout_bboxes']

            if is_new_pdf:
                # A freshly created document has no pages, so doc[page_idx]
                # would fail; add blank pages sized like the source pages.
                while len(doc) <= page_idx:
                    src_rect = raw_pdf_doc[len(doc)].rect
                    doc.new_page(width=src_rect.width, height=src_rect.height)

            page = doc[page_idx]
            shape = page.new_shape()
            for order, layout in enumerate(layouts):
                box = layout['layout_bbox']
                layout_label = layout['layout_label']
                fill_color = fitz.pdfcolor['pink'] if layout_label == 'U' else None
                # Narrow the box horizontally and extend it vertically by one unit.
                rect_box = [box[0] + 1, box[1] - border_offset, box[2] - 1, box[3] + border_offset]
                shape.draw_rect(fitz.Rect(*rect_box))
                shape.finish(color=fitz.pdfcolor['red'], fill=fill_color, fill_opacity=0.4)

                # Write the layout's index just inside the top-left corner of its box.
                shape.insert_text(
                    (rect_box[0] + 1, rect_box[1] + font_size),
                    f"{order}",
                    fontsize=font_size,
                    color=(0, 0, 0),
                )

            # Header and footer areas are drawn once per page.
            if header:
                shape.draw_rect(fitz.Rect(header))
                shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.2)
            if footer:
                shape.draw_rect(fitz.Rect(footer))
                shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.2)

            shape.commit()

        if is_new_pdf:
            # dirname() is '' for a bare file name; makedirs('') would raise.
            parent_dir = os.path.dirname(pdf_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            doc.save(pdf_path)
        else:
            doc.saveIncr()
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
|
|
|
|
|
def draw_layout_on_page(raw_pdf_doc: fitz.Document, page_idx: int, page_layout: list, pdf_path: str):
    """
    Deprecated: draw the layout boxes with a red border on page page_idx of pdf_path.

    Layouts are walked recursively through their ``sub_layout`` lists and only
    layouts without sub-layouts are drawn. A drawn layout is filled at 20%
    opacity when its ``layout_label`` is ``'U'``: yellow for top-level
    layouts, pink for nested ones.

    If ``pdf_path`` already exists it is modified and written with an
    incremental save. Otherwise a new PDF is created; the pages needed to
    reach ``page_idx`` are added as blank pages sized like the matching pages
    of ``raw_pdf_doc``.

    A ``DeprecationWarning`` is emitted on every call.

    Args:
        raw_pdf_doc: Source document; only read for page sizes when a new PDF
            has to be created.
        page_idx: Index of the page to draw on.
        page_layout: List of layout mappings with ``layout_bbox`` (at least
            four coordinates), ``layout_label`` and ``sub_layout``.
        pdf_path: Path of the PDF to modify or create.
    """
    # Using the exception class itself as a decorator would replace this
    # function with a non-callable exception instance, so warn at call time.
    warnings.warn("draw_layout_on_page is deprecated", DeprecationWarning, stacklevel=2)

    def draw(shape, layout, fill_color=fitz.pdfcolor['pink']):
        # Draw a leaf layout, or descend into its sub-layouts.
        border_offset = 1
        box = layout['layout_bbox']
        layout_label = layout['layout_label']
        sub_layouts = layout['sub_layout']
        if len(sub_layouts) == 0:
            leaf_fill = fill_color if layout_label == 'U' else None
            # Narrow the box horizontally and extend it vertically by one unit.
            rect_box = [box[0] + 1, box[1] - border_offset, box[2] - 1, box[3] + border_offset]
            shape.draw_rect(fitz.Rect(*rect_box))
            shape.finish(color=fitz.pdfcolor['red'], fill=leaf_fill, fill_opacity=0.2)
        for child in sub_layouts:
            # Nested layouts fall back to the default (pink) fill colour.
            draw(shape, child)

    is_new_pdf = not os.path.exists(pdf_path)
    doc = fitz.open('') if is_new_pdf else fitz.open(pdf_path)
    try:
        if is_new_pdf:
            # A freshly created document has no pages, so doc[page_idx]
            # would fail; add blank pages sized like the source pages.
            while len(doc) <= page_idx:
                src_rect = raw_pdf_doc[len(doc)].rect
                doc.new_page(width=src_rect.width, height=src_rect.height)

        page = doc[page_idx]
        shape = page.new_shape()
        for layout in page_layout:
            draw(shape, layout, fitz.pdfcolor['yellow'])
        # Write everything drawn above to the page in one go.
        shape.commit()

        if is_new_pdf:
            # dirname() is '' for a bare file name; makedirs('') would raise.
            parent_dir = os.path.dirname(pdf_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            doc.save(pdf_path)
        else:
            doc.saveIncr()
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
|
|