File size: 5,643 Bytes
240e0a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
from magic_pdf.libs.commons import fitz

from magic_pdf.para.commons import *


# Switch stdout to UTF-8 encoding at import time.
# ``reconfigure`` is a method of ``io.TextIOWrapper`` (Python 3.7+). When
# stdout has been replaced by another stream object (or is None), the method
# is missing, so the call is skipped rather than failing the import.
if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore


class DrawAnnos:
    """
    This class draws annotations on the pdf file

    ----------------------------------------
                Color Code
    ----------------------------------------
        Red: (1, 0, 0)
        Green: (0, 1, 0)
        Blue: (0, 0, 1)
        Yellow: (1, 1, 0) - mix of red and green
        Cyan: (0, 1, 1) - mix of green and blue
        Magenta: (1, 0, 1) - mix of red and blue
        White: (1, 1, 1) - red, green and blue full intensity
        Black: (0, 0, 0) - no color component whatsoever
        Gray: (0.5, 0.5, 0.5) - equal and medium intensity of red, green and blue color components
        Orange: (1, 0.65, 0) - maximum intensity of red, medium intensity of green, no blue component
    """

    def __init__(self) -> None:
        pass

    def __is_nested_list(self, lst) -> bool:
        """
        Return True if ``lst`` is a list that contains at least one list.

        Anything that is not a ``list`` (including tuples and None) yields False.
        """
        # A list holding a deeper nested list necessarily holds a list, so a
        # direct element check is sufficient; no recursion is needed.
        return isinstance(lst, list) and any(isinstance(item, list) for item in lst)

    def __valid_rect(self, bbox) -> bool:
        """
        Return True if ``bbox`` describes a non-empty rectangle.

        ``bbox`` is read as ``[x0, y0, x1, y1]`` and is valid when ``x0 < x1``
        and ``y0 < y1``. Nested lists, None, sequences with fewer than four
        items and non-comparable coordinates are reported as invalid instead
        of raising.
        """
        try:
            if isinstance(bbox[0], list):
                return False  # It's a nested list, hence it can't be a valid rect
            return bool(bbox[0] < bbox[2] and bbox[1] < bbox[3])
        except (TypeError, IndexError):
            # None / scalar / too short / coordinates that cannot be compared
            return False

    @staticmethod
    def __default_output_path(input_pdf_path: str) -> str:
        """
        Build the default output path by inserting ``_anno`` before the extension.

        Only a trailing ``.pdf`` suffix (any letter case) is treated as the
        extension, so ``.pdf`` occurring elsewhere in the path is left alone.
        When the path has no such suffix, ``_anno.pdf`` is appended, which
        guarantees the result differs from the input path.
        """
        if input_pdf_path.lower().endswith(".pdf"):
            return f"{input_pdf_path[:-4]}_anno{input_pdf_path[-4:]}"
        return f"{input_pdf_path}_anno.pdf"

    def __draw_nested_boxes(self, page, nested_bbox, color=(0, 1, 1)):
        """
        This function draws the nested boxes

        Parameters
        ----------
        page : fitz.Page
            page
        nested_bbox : list
            nested bbox
        color : tuple
            color, by default (0, 1, 1)    # draw with cyan color for combined paragraph
        """
        if self.__is_nested_list(nested_bbox):  # If it's a nested list
            for bbox in nested_bbox:
                self.__draw_nested_boxes(page, bbox, color)  # Recursively call the function
        elif self.__valid_rect(nested_bbox):  # If valid rectangle
            para_anno = page.add_rect_annot(fitz.Rect(nested_bbox))
            para_anno.set_colors(stroke=color)
            para_anno.set_border(width=1)
            para_anno.update()

    def __draw_para_bbox(self, page, bbox, combined_color, single_color):
        """
        Draw one paragraph bbox on ``page``.

        A nested list with more than one entry is drawn box by box with
        ``combined_color`` (border width 1); a flat valid rectangle is drawn
        with ``single_color`` (border width 0.5). Anything else is skipped.
        """
        if self.__is_nested_list(bbox) and len(bbox) > 1:
            self.__draw_nested_boxes(page, bbox, combined_color)
        elif self.__valid_rect(bbox):
            # NOTE(review): a nested list with a single entry fails
            # __valid_rect and is therefore not drawn - confirm this is intended.
            anno = page.add_rect_annot(fitz.Rect(bbox))
            anno.set_colors(stroke=single_color)
            anno.set_border(width=0.5)
            anno.update()

    def draw_annos(self, input_pdf_path, pdf_dic, output_pdf_path):
        """
        Draw paragraph rectangles onto a pdf and save the annotated copy.

        Parameters
        ----------
        input_pdf_path : str
            path of the pdf to annotate
        pdf_dic : dict or None
            per-page data keyed by ``"page_<index>"``; each page entry may hold
            ``"para_blocks"``, a sequence of dicts whose optional ``"paras"``
            dict maps to entries with ``"para_bbox"`` and ``"is_para_title"``.
            None, or a missing page entry, means nothing is drawn for that page.
        output_pdf_path : str or None
            where to save the result; when None, ``_anno`` is inserted before
            the ``.pdf`` suffix of ``input_pdf_path``.

        Paragraphs are outlined in green (cyan for multi-box paragraphs).
        Title paragraphs get an additional blue outline on top.
        """
        if pdf_dic is None:
            pdf_dic = {}

        if output_pdf_path is None:
            output_pdf_path = self.__default_output_path(input_pdf_path)

        pdf_doc = open_pdf(input_pdf_path)
        try:
            for page_id, page in enumerate(pdf_doc):  # type: ignore
                # Pages without an entry are skipped instead of raising KeyError
                page_data = pdf_dic.get(f"page_{page_id}") or {}
                for para_block in page_data.get("para_blocks") or []:
                    if "paras" not in para_block:
                        continue
                    for para_content in para_block["paras"].values():
                        para_bbox = para_content["para_bbox"]
                        # cyan for combined paragraph, green for normal paragraph
                        self.__draw_para_bbox(page, para_bbox, (0, 1, 1), (0, 1, 0))

                        if para_content["is_para_title"]:
                            # blue for titles, both combined and normal
                            self.__draw_para_bbox(page, para_bbox, (0, 0, 1), (0, 0, 1))

            pdf_doc.save(output_pdf_path)
        finally:
            # Release the document even when drawing or saving fails
            pdf_doc.close()