derful's picture
Upload folder using huggingface_hub
240e0a0 verified
from magic_pdf.libs.commons import fitz
from magic_pdf.para.commons import *
if sys.version_info[0] >= 3:
sys.stdout.reconfigure(encoding="utf-8") # type: ignore
class DrawAnnos:
"""
This class draws annotations on the pdf file
----------------------------------------
Color Code
----------------------------------------
Red: (1, 0, 0)
Green: (0, 1, 0)
Blue: (0, 0, 1)
Yellow: (1, 1, 0) - mix of red and green
Cyan: (0, 1, 1) - mix of green and blue
Magenta: (1, 0, 1) - mix of red and blue
White: (1, 1, 1) - red, green and blue full intensity
Black: (0, 0, 0) - no color component whatsoever
Gray: (0.5, 0.5, 0.5) - equal and medium intensity of red, green and blue color components
Orange: (1, 0.65, 0) - maximum intensity of red, medium intensity of green, no blue component
"""
def __init__(self) -> None:
pass
def __is_nested_list(self, lst):
"""
This function returns True if the given list is a nested list of any degree.
"""
if isinstance(lst, list):
return any(self.__is_nested_list(i) for i in lst) or any(isinstance(i, list) for i in lst)
return False
def __valid_rect(self, bbox):
# Ensure that the rectangle is not empty or invalid
if isinstance(bbox[0], list):
return False # It's a nested list, hence it can't be valid rect
else:
return bbox[0] < bbox[2] and bbox[1] < bbox[3]
def __draw_nested_boxes(self, page, nested_bbox, color=(0, 1, 1)):
"""
This function draws the nested boxes
Parameters
----------
page : fitz.Page
page
nested_bbox : list
nested bbox
color : tuple
color, by default (0, 1, 1) # draw with cyan color for combined paragraph
"""
if self.__is_nested_list(nested_bbox): # If it's a nested list
for bbox in nested_bbox:
self.__draw_nested_boxes(page, bbox, color) # Recursively call the function
elif self.__valid_rect(nested_bbox): # If valid rectangle
para_rect = fitz.Rect(nested_bbox)
para_anno = page.add_rect_annot(para_rect)
para_anno.set_colors(stroke=color) # draw with cyan color for combined paragraph
para_anno.set_border(width=1)
para_anno.update()
def draw_annos(self, input_pdf_path, pdf_dic, output_pdf_path):
pdf_doc = open_pdf(input_pdf_path)
if pdf_dic is None:
pdf_dic = {}
if output_pdf_path is None:
output_pdf_path = input_pdf_path.replace(".pdf", "_anno.pdf")
for page_id, page in enumerate(pdf_doc): # type: ignore
page_key = f"page_{page_id}"
for ele_key, ele_data in pdf_dic[page_key].items():
if ele_key == "para_blocks":
para_blocks = ele_data
for para_block in para_blocks:
if "paras" in para_block.keys():
paras = para_block["paras"]
for para_key, para_content in paras.items():
para_bbox = para_content["para_bbox"]
# print(f"para_bbox: {para_bbox}")
# print(f"is a nested list: {self.__is_nested_list(para_bbox)}")
if self.__is_nested_list(para_bbox) and len(para_bbox) > 1:
color = (0, 1, 1)
self.__draw_nested_boxes(
page, para_bbox, color
) # draw with cyan color for combined paragraph
else:
if self.__valid_rect(para_bbox):
para_rect = fitz.Rect(para_bbox)
para_anno = page.add_rect_annot(para_rect)
para_anno.set_colors(stroke=(0, 1, 0)) # draw with green color for normal paragraph
para_anno.set_border(width=0.5)
para_anno.update()
is_para_title = para_content["is_para_title"]
if is_para_title:
if self.__is_nested_list(para_content["para_bbox"]) and len(para_content["para_bbox"]) > 1:
color = (0, 0, 1)
self.__draw_nested_boxes(
page, para_content["para_bbox"], color
) # draw with cyan color for combined title
else:
if self.__valid_rect(para_content["para_bbox"]):
para_rect = fitz.Rect(para_content["para_bbox"])
if self.__valid_rect(para_content["para_bbox"]):
para_anno = page.add_rect_annot(para_rect)
para_anno.set_colors(stroke=(0, 0, 1)) # draw with blue color for normal title
para_anno.set_border(width=0.5)
para_anno.update()
pdf_doc.save(output_pdf_path)
pdf_doc.close()