# Source: Hugging Face Space "derful/MinerU" (Space status at capture time: Runtime error).
# File size: 22,749 bytes; revision 240e0a0.

import os
import unicodedata

from magic_pdf.para.commons import *


# Switch stdout to UTF-8. ``reconfigure`` is only available on ``io.TextIOWrapper``
# (Python 3.7+); stdout may have been replaced by a stream that lacks it (or be None),
# so the call is made only when the method actually exists instead of crashing on import.
if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore


class BlockContinuationProcessor:
    """
    This class is used to process the blocks to detect block continuations.
    """

    def __init__(self) -> None:
        pass

    def __is_similar_font_type(self, font_type1, font_type2, prefix_length_ratio=0.3):
        """
        This function checks if the two font types are similar.
        Definition of similar font types: the two font types have a common prefix,
        and the length of the common prefix is at least a certain ratio of the length of the shorter font type.

        Parameters
        ----------
        font_type1 : str
            font type 1
        font_type2 : str
            font type 2
        prefix_length_ratio : float
            minimum ratio of the common prefix length to the length of the shorter font type

        Returns
        -------
        bool
            True if the two font types are similar, False otherwise.
        """

        if isinstance(font_type1, list):
            font_type1 = font_type1[0] if font_type1 else ""
        if isinstance(font_type2, list):
            font_type2 = font_type2[0] if font_type2 else ""

        if font_type1 == font_type2:
            return True

        # Find the length of the common prefix
        common_prefix_length = len(os.path.commonprefix([font_type1, font_type2]))

        # Calculate the minimum prefix length based on the ratio
        min_prefix_length = int(min(len(font_type1), len(font_type2)) * prefix_length_ratio)

        return common_prefix_length >= min_prefix_length

    def __is_same_block_font(self, block1, block2):
        """
        Decide whether block1 and block2 are set in the same font.

        The blocks match when their font types are similar, their font sizes differ
        by less than 1, and their average character widths are relatively close.
        The width tolerance is 0.5 when block2's text is shorter than 20% of
        block1's text, and 0.2 otherwise.

        Parameters
        ----------
        block1 : dict
            block1
        block2 : dict
            block2

        Returns
        -------
        is_same : bool
            True if block1 and block2 have the same font, else False.
            Always False when either average char width is 0 or either text is empty.
        """
        font_type_a = safe_get(block1, "block_font_type", "")
        font_size_a = safe_get(block1, "block_font_size", 0)
        char_width_a = safe_get(block1, "avg_char_width", 0)

        font_type_b = safe_get(block2, "block_font_type", "")
        font_size_b = safe_get(block2, "block_font_size", 0)
        char_width_b = safe_get(block2, "avg_char_width", 0)

        # A list of sizes is represented by its first entry (0 when empty).
        if isinstance(font_size_a, list):
            font_size_a = font_size_a[0] if font_size_a else 0
        if isinstance(font_size_b, list):
            font_size_b = font_size_b[0] if font_size_b else 0

        text_a = safe_get(block1, "text", "")
        text_b = safe_get(block2, "text", "")

        # Without a usable width or any text there is nothing to compare.
        if char_width_a == 0 or char_width_b == 0:
            return False
        if not text_a or not text_b:
            return False

        width_tolerance = 0.5 if len(text_b) / len(text_a) < 0.2 else 0.2
        relative_width_gap = abs(char_width_a - char_width_b) / min(char_width_a, char_width_b)
        widths_match = relative_width_gap < width_tolerance

        sizes_match = abs(font_size_a - font_size_b) < 1

        return self.__is_similar_font_type(font_type_a, font_type_b) and widths_match and sizes_match

    def _is_alphabet_char(self, char):
        if (char >= "\u0041" and char <= "\u005a") or (char >= "\u0061" and char <= "\u007a"):
            return True
        else:
            return False

    def _is_chinese_char(self, char):
        if char >= "\u4e00" and char <= "\u9fa5":
            return True
        else:
            return False

    def _is_other_letter_char(self, char):
        try:
            cat = unicodedata.category(char)
            if cat == "Lu" or cat == "Ll":
                return not self._is_alphabet_char(char) and not self._is_chinese_char(char)
        except TypeError:
            print("The input to the function must be a single character.")
        return False

    def _is_year(self, s: str):
        try:
            number = int(s)
            return 1900 <= number <= 2099
        except ValueError:
            return False

    def __is_para_font_consistent(self, para_1, para_2):
        """
        This function compares the font of para1 and para2

        Parameters
        ----------
        para1 : dict
            para1
        para2 : dict
            para2

        Returns
        -------
        is_same : bool
            True if para1 and para2 have the same font, else False
        """
        if para_1 is None or para_2 is None:
            return False

        para_1_font_type = safe_get(para_1, "para_font_type", "")
        para_1_font_size = safe_get(para_1, "para_font_size", 0)
        para_1_font_color = safe_get(para_1, "para_font_color", "")

        para_2_font_type = safe_get(para_2, "para_font_type", "")
        para_2_font_size = safe_get(para_2, "para_font_size", 0)
        para_2_font_color = safe_get(para_2, "para_font_color", "")

        if isinstance(para_1_font_type, list):  # get the most common font type
            para_1_font_type = max(set(para_1_font_type), key=para_1_font_type.count)
        if isinstance(para_2_font_type, list):
            para_2_font_type = max(set(para_2_font_type), key=para_2_font_type.count)
        if isinstance(para_1_font_size, list):  # compute average font type
            para_1_font_size = sum(para_1_font_size) / len(para_1_font_size)
        if isinstance(para_2_font_size, list):  # compute average font type
            para_2_font_size = sum(para_2_font_size) / len(para_2_font_size)

        return (
            self.__is_similar_font_type(para_1_font_type, para_2_font_type)
            and abs(para_1_font_size - para_2_font_size) < 1.5
            # and para_font_color1 == para_font_color2
        )

    def _is_para_puncs_consistent(self, para_1, para_2):
        """
        Decide from punctuation and alignment whether para_2 (latter) continues
        para_1 (former).

        The decision depends on the last character of para_1's stripped text:

        * hyphen ("-" or "—"): para_2 must start with a hyphen, a lower-case ASCII
          letter, a Chinese character or another (non-ASCII) cased letter;
        * sentence-ending punctuation: para_2 must start with an upper-case ASCII
          letter, a Chinese character or another cased letter, and must not be
          indented to the right of para_1;
        * anything else: para_2 must start with an ASCII letter, a year
          (1900-2099), a Chinese character or another cased letter, or the two
          paragraphs must be right-aligned, or para_1 must be indented to the
          right of para_2.

        Parameters
        ----------
        para_1 : dict
            para1; reads "para_text", "para_bbox" and "para_font_size"
        para_2 : dict
            para2; reads "para_text", "para_bbox" and "para_font_size"

        Returns
        -------
        is_same : bool
            True if para_1 and para_2 are from the same paragraph by using the puncs,
            else False. False when either text is empty or either bbox is missing.
        """
        para_1_text = safe_get(para_1, "para_text", "").strip()
        para_2_text = safe_get(para_2, "para_text", "").strip()

        # Nothing to compare; checked first so that paragraphs without text never
        # reach the geometry code below.
        if not para_1_text or not para_2_text:
            return False

        para_1_bboxes = safe_get(para_1, "para_bbox", [])
        para_1_font_size = safe_get(para_1, "para_font_size", 0)

        para_2_bboxes = safe_get(para_2, "para_bbox", [])
        para_2_font_size = safe_get(para_2, "para_font_size", 0)

        # Without geometry the alignment cannot be evaluated; report "not continued"
        # rather than failing while unpacking an empty bbox.
        if len(para_1_bboxes) == 0 or len(para_2_bboxes) == 0:
            return False

        # The paragraphs meet at the last bbox of para_1 and the first bbox of para_2.
        if is_nested_list(para_1_bboxes):
            x0_1, _, x1_1, _ = para_1_bboxes[-1]
        else:
            x0_1, _, x1_1, _ = para_1_bboxes

        if is_nested_list(para_2_bboxes):
            x0_2, _, x1_2, _ = para_2_bboxes[0]
        else:
            x0_2, _, x1_2, _ = para_2_bboxes

        # Reduce per-line font sizes to a single number on BOTH sides. Previously only
        # para_2 was reduced, so a list-valued para_1 size raised TypeError below.
        if isinstance(para_1_font_size, (list, tuple)):
            para_1_font_size = para_1_font_size[-1] if para_1_font_size else 0
        if isinstance(para_2_font_size, (list, tuple)):
            para_2_font_size = para_2_font_size[0] if para_2_font_size else 0

        # Alignment tolerance: 80% of the mean font size of the two paragraphs.
        align_threshold = 0.5 * (para_1_font_size + para_2_font_size) * 0.8
        are_two_paras_right_aligned = abs(x1_1 - x1_2) < align_threshold
        is_para1_left_indent_than_para2 = x0_1 - x0_2 > align_threshold
        is_para2_left_indent_than_para1 = x0_2 - x0_1 > align_threshold

        # Sentence-ending punctuation and hyphens.
        end_puncs = (".", "?", "!", "。", "？", "！", "…")
        hyphens = ("-", "—")

        # Both texts are stripped and non-empty, so neither boundary character can
        # be a space; the former "ends/starts with a space" checks were unreachable.
        last_char = para_1_text[-1]
        first_char = para_2_text[0]

        starts_with_ascii_letter = self._is_alphabet_char(first_char)
        starts_with_cjk_or_other_letter = self._is_chinese_char(first_char) or self._is_other_letter_char(
            first_char
        )

        if last_char in hyphens:
            # A word or phrase is split across the boundary.
            return bool(
                first_char in hyphens
                or (starts_with_ascii_letter and first_char.islower())
                or starts_with_cjk_or_other_letter
            )

        if last_char in end_puncs:
            # para_1 ends a sentence; para_2 may start a new sentence of the same
            # paragraph unless it opens with a fresh indent.
            return bool(
                ((starts_with_ascii_letter and first_char.isupper()) or starts_with_cjk_or_other_letter)
                and not is_para2_left_indent_than_para1
            )

        # para_1 stops mid-sentence.
        return bool(
            starts_with_ascii_letter
            or self._is_year(para_2_text[0:4])
            or are_two_paras_right_aligned
            or is_para1_left_indent_than_para2
            or starts_with_cjk_or_other_letter
        )

    def _is_block_consistent(self, block1, block2):
        """
        Decide whether block1 and block2 originally belong to the same block.

        The decision is based solely on the block font comparison.

        Parameters
        ----------
        block1 : dict
            block1
        block2 : dict
            block2

        Returns
        -------
        is_same : bool
            True if block1 and block2 are from the same block, else False
        """
        fonts_match = self.__is_same_block_font(block1, block2)
        return fonts_match

    def _is_para_continued(self, para1, para2):
        """
        This function determines whether para1 and para2 are originally from the same paragraph

        Parameters
        ----------
        para1 : dict
            para1
        para2 : dict
            para2

        Returns
        -------
        is_same : bool
            True if para1 and para2 are from the same paragraph, else False
        """
        is_para_font_consistent = self.__is_para_font_consistent(para1, para2)
        is_para_puncs_consistent = self._is_para_puncs_consistent(para1, para2)

        return is_para_font_consistent and is_para_puncs_consistent

    def _are_boundaries_of_block_consistent(self, block1, block2):
        """
        This function checks if the boundaries of block1 and block2 are consistent

        Parameters
        ----------
        block1 : dict
            block1

        block2 : dict
            block2

        Returns
        -------
        is_consistent : bool
            True if the boundaries of block1 and block2 are consistent, else False
        """

        last_line_of_block1 = block1["lines"][-1]
        first_line_of_block2 = block2["lines"][0]

        spans_of_last_line_of_block1 = last_line_of_block1["spans"]
        spans_of_first_line_of_block2 = first_line_of_block2["spans"]

        font_type_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["font"].lower()
        font_size_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["size"]
        font_color_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["color"]
        font_flags_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["flags"]

        font_type_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["font"].lower()
        font_size_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["size"]
        font_color_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["color"]
        font_flags_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["flags"]

        return (
            self.__is_similar_font_type(font_type_of_last_line_of_block1, font_type_of_first_line_of_block2)
            and abs(font_size_of_last_line_of_block1 - font_size_of_first_line_of_block2) < 1
            # and font_color_of_last_line_of_block1 == font_color_of_first_line_of_block2
            and font_flags_of_last_line_of_block1 == font_flags_of_first_line_of_block2
        )

    def _get_last_paragraph(self, block):
        """
        Retrieves the last paragraph from a block.

        Parameters
        ----------
        block : dict
            The block from which to retrieve the paragraph.

        Returns
        -------
        dict
            The last paragraph of the block.
        """
        if block["paras"]:
            last_para_key = list(block["paras"].keys())[-1]
            return block["paras"][last_para_key]
        else:
            return None

    def _get_first_paragraph(self, block):
        """
        Retrieves the first paragraph from a block.

        Parameters
        ----------
        block : dict
            The block from which to retrieve the paragraph.

        Returns
        -------
        dict
            The first paragraph of the block.
        """
        if block["paras"]:
            first_para_key = list(block["paras"].keys())[0]
            return block["paras"][first_para_key]
        else:
            return None

    def should_merge_next_para(self, curr_para, next_para):
        """Return True if next_para is judged to continue curr_para, else False."""
        return bool(self._is_para_continued(curr_para, next_para))

    def batch_tag_paras(self, pdf_dict):
        the_last_page_id = len(pdf_dict) - 1

        for curr_page_idx, (curr_page_id, curr_page_content) in enumerate(pdf_dict.items()):
            if curr_page_id.startswith("page_") and curr_page_content.get("para_blocks", []):
                para_blocks_of_curr_page = curr_page_content["para_blocks"]
                next_page_idx = curr_page_idx + 1
                next_page_id = f"page_{next_page_idx}"
                next_page_content = pdf_dict.get(next_page_id, {})

                for i, current_block in enumerate(para_blocks_of_curr_page):
                    for para_id, curr_para in current_block["paras"].items():
                        curr_para["curr_para_location"] = [
                            curr_page_idx,
                            current_block["block_id"],
                            int(para_id.split("_")[-1]),
                        ]
                        curr_para["next_para_location"] = None  # 默认设置为None
                        curr_para["merge_next_para"] = False  # 默认设置为False

                    next_block = para_blocks_of_curr_page[i + 1] if i < len(para_blocks_of_curr_page) - 1 else None

                    if next_block:
                        curr_block_last_para_key = list(current_block["paras"].keys())[-1]
                        curr_blk_last_para = current_block["paras"][curr_block_last_para_key]

                        next_block_first_para_key = list(next_block["paras"].keys())[0]
                        next_blk_first_para = next_block["paras"][next_block_first_para_key]

                        if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
                            curr_blk_last_para["next_para_location"] = [
                                curr_page_idx,
                                next_block["block_id"],
                                int(next_block_first_para_key.split("_")[-1]),
                            ]
                            curr_blk_last_para["merge_next_para"] = True
                    else:
                        # Handle the case where the next block is in a different page
                        curr_block_last_para_key = list(current_block["paras"].keys())[-1]
                        curr_blk_last_para = current_block["paras"][curr_block_last_para_key]

                        while not next_page_content.get("para_blocks", []) and next_page_idx <= the_last_page_id:
                            next_page_idx += 1
                            next_page_id = f"page_{next_page_idx}"
                            next_page_content = pdf_dict.get(next_page_id, {})

                        if next_page_content.get("para_blocks", []):
                            next_blk_first_para_key = list(next_page_content["para_blocks"][0]["paras"].keys())[0]
                            next_blk_first_para = next_page_content["para_blocks"][0]["paras"][next_blk_first_para_key]

                            if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
                                curr_blk_last_para["next_para_location"] = [
                                    next_page_idx,
                                    next_page_content["para_blocks"][0]["block_id"],
                                    int(next_blk_first_para_key.split("_")[-1]),
                                ]
                                curr_blk_last_para["merge_next_para"] = True

        return pdf_dict

    def find_block_by_id(self, para_blocks, block_id):
        for block in para_blocks:
            if block.get("block_id") == block_id:
                return block
        return None

    def batch_merge_paras(self, pdf_dict):
        for page_id, page_content in pdf_dict.items():
            if page_id.startswith("page_") and page_content.get("para_blocks", []):
                para_blocks_of_page = page_content["para_blocks"]

                for i in range(len(para_blocks_of_page)):
                    current_block = para_blocks_of_page[i]
                    paras = current_block["paras"]

                    for para_id, curr_para in list(paras.items()):
                        # 跳过标题段落
                        if curr_para.get("is_para_title"):
                            continue

                        while curr_para.get("merge_next_para"):
                            next_para_location = curr_para.get("next_para_location")
                            if not next_para_location:
                                break

                            next_page_idx, next_block_id, next_para_id = next_para_location
                            next_page_id = f"page_{next_page_idx}"
                            next_page_content = pdf_dict.get(next_page_id)
                            if not next_page_content:
                                break

                            next_block = self.find_block_by_id(next_page_content.get("para_blocks", []), next_block_id)
                            if not next_block:
                                break

                            next_para = next_block["paras"].get(f"para_{next_para_id}")
                            if not next_para or next_para.get("is_para_title"):
                                break

                            # 合并段落文本
                            curr_para_text = curr_para.get("para_text", "")
                            next_para_text = next_para.get("para_text", "")
                            curr_para["para_text"] = curr_para_text + " " + next_para_text

                            # 更新 next_para_location
                            curr_para["next_para_location"] = next_para.get("next_para_location")

                            # 将下一个段落文本置为空，表示已被合并
                            next_para["para_text"] = ""

                            # 更新 merge_next_para 标记
                            curr_para["merge_next_para"] = next_para.get("merge_next_para", False)

        return pdf_dict