|
from magic_pdf.libs.commons import fitz |
|
|
|
from magic_pdf.para.commons import * |
|
|
|
|
|
# Switch stdout to UTF-8 so non-ASCII text can be printed.
# `reconfigure` is only available on io.TextIOWrapper (Python 3.7+), and
# sys.stdout may be None or a replacement stream without that method, so the
# call is guarded to keep a plain import of this module from raising.
if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
|
|
|
|
|
class DrawAnnos:
    """
    Draw paragraph bounding boxes as rectangle annotations on a PDF file.

    ----------------------------------------
                Color Code
    ----------------------------------------
        Red: (1, 0, 0)
        Green: (0, 1, 0)
        Blue: (0, 0, 1)
        Yellow: (1, 1, 0) - mix of red and green
        Cyan: (0, 1, 1) - mix of green and blue
        Magenta: (1, 0, 1) - mix of red and blue
        White: (1, 1, 1) - red, green and blue full intensity
        Black: (0, 0, 0) - no color component whatsoever
        Gray: (0.5, 0.5, 0.5) - equal and medium intensity of red, green and blue color components
        Orange: (1, 0.65, 0) - maximum intensity of red, medium intensity of green, no blue component
    """

    def __init__(self) -> None:
        pass

    def __is_nested_list(self, lst):
        """
        Return True if `lst` is a list that contains at least one list.

        Anything that is not a list (including None) yields False.
        """
        if not isinstance(lst, list):
            return False
        # A deeper nesting level always implies that a direct element is a
        # list, so checking the direct elements is sufficient.
        return any(isinstance(item, list) for item in lst)

    def __valid_rect(self, bbox):
        """
        Return True if `bbox` is a flat box whose first four items satisfy
        bbox[0] < bbox[2] and bbox[1] < bbox[3].

        Nested boxes, boxes with fewer than four items, non-indexable values
        (e.g. None or a bare number) and non-comparable coordinates are
        reported as invalid instead of raising.
        """
        try:
            x0, y0, x1, y1 = bbox[0], bbox[1], bbox[2], bbox[3]
            if isinstance(x0, list):
                return False
            return bool(x0 < x1 and y0 < y1)
        except (TypeError, IndexError, KeyError):
            return False

    def __draw_rect(self, page, bbox, color, width):
        """
        Add one rectangle annotation for `bbox` to `page`.

        Parameters
        ----------
        page : fitz.Page
            page that receives the annotation
        bbox : list
            flat box passed to fitz.Rect
        color : tuple
            stroke color of the rectangle
        width : float
            border width of the rectangle
        """
        rect_anno = page.add_rect_annot(fitz.Rect(bbox))
        rect_anno.set_colors(stroke=color)
        rect_anno.set_border(width=width)
        rect_anno.update()

    def __draw_nested_boxes(self, page, nested_bbox, color=(0, 1, 1)):
        """
        Recursively draw every valid flat box found inside `nested_bbox`.

        Parameters
        ----------
        page : fitz.Page
            page
        nested_bbox : list
            nested bbox
        color : tuple
            color, by default (0, 1, 1)  # cyan, used for combined paragraphs
        """
        if self.__is_nested_list(nested_bbox):
            for bbox in nested_bbox:
                self.__draw_nested_boxes(page, bbox, color)
        elif self.__valid_rect(nested_bbox):
            self.__draw_rect(page, nested_bbox, color, 1)

    def __draw_para_bbox(self, page, para_bbox, nested_color, flat_color):
        """
        Draw one paragraph box, which may be flat or a list of boxes.

        A nested list with more than one entry is drawn box by box in
        `nested_color` (border width 1); otherwise a valid flat box is drawn
        in `flat_color` (border width 0.5).
        """
        # NOTE(review): a nested list holding a single box matches neither
        # branch and is not drawn; this mirrors the previous behavior.
        if self.__is_nested_list(para_bbox) and len(para_bbox) > 1:
            self.__draw_nested_boxes(page, para_bbox, nested_color)
        elif self.__valid_rect(para_bbox):
            self.__draw_rect(page, para_bbox, flat_color, 0.5)

    def draw_annos(self, input_pdf_path, pdf_dic, output_pdf_path):
        """
        Annotate the paragraphs described by `pdf_dic` and save a new PDF.

        Every paragraph box is drawn in green (cyan when it is a list of
        several boxes); paragraphs flagged as titles are additionally drawn
        in blue.

        Parameters
        ----------
        input_pdf_path : str
            path of the PDF to open with `open_pdf`
        pdf_dic : dict or None
            maps "page_<index>" to a dict whose "para_blocks" entry is a list
            of blocks; each block may hold a "paras" dict whose values carry
            "para_bbox" and "is_para_title". None, or a page without an
            entry, means nothing is drawn for that page.
        output_pdf_path : str or None
            destination path; when None, "_anno" is inserted before the
            extension of `input_pdf_path`
        """
        # Local import so this method does not depend on what the module's
        # star import happens to expose.
        import os

        if pdf_dic is None:
            pdf_dic = {}

        if output_pdf_path is None:
            # splitext only touches the real extension, so ".pdf" elsewhere in
            # the path is left alone and the result never equals the input.
            root, ext = os.path.splitext(input_pdf_path)
            output_pdf_path = f"{root}_anno{ext}"

        pdf_doc = open_pdf(input_pdf_path)
        try:
            for page_id, page in enumerate(pdf_doc):
                # Pages without an entry are skipped instead of raising
                # KeyError (this is always the case when pdf_dic was None).
                page_data = pdf_dic.get(f"page_{page_id}") or {}
                for para_block in page_data.get("para_blocks") or []:
                    paras = para_block.get("paras") or {}
                    for para_content in paras.values():
                        para_bbox = para_content["para_bbox"]
                        self.__draw_para_bbox(
                            page, para_bbox, (0, 1, 1), (0, 1, 0)
                        )
                        if para_content["is_para_title"]:
                            self.__draw_para_bbox(
                                page, para_bbox, (0, 0, 1), (0, 0, 1)
                            )

            pdf_doc.save(output_pdf_path)
        finally:
            # Release the document even if drawing or saving fails.
            pdf_doc.close()
|
|