File size: 4,229 Bytes
9bf4bd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# Copyright (c) OpenMMLab. All rights reserved.
import json
from typing import Iterator, List, Optional, Tuple

import numpy as np

from mmocr.registry import DATA_PARSERS
from .base import BaseParser


@DATA_PARSERS.register_module()
class NAFAnnParser(BaseParser):
    """NAF dataset parser.

    The original annotation format of this dataset is stored in json files,
    which has the following keys that will be used here:
        - 'textBBs': List of text bounding box objects
            - 'poly_points': list of [x,y] pairs, the box corners going
                top-left,top-right,bottom-right,bottom-left
            - 'id': id of the textBB, used to match with the text
        - 'fieldBBs': List of field bounding box objects, read with the same
            'poly_points' and 'id' keys plus a 'type' key; fields whose
            'type' is 'blank' are skipped when parsing for detection.
        - 'transcriptions': Dict of transcription objects, use the 'id' key
            to match with the textBB.

    Some special characters are used in the transcription:
    "«text»" indicates that "text" had a strikethrough
    "¿" indicates the transcriber could not read a character
    "§" indicates the whole line or word was illegible
    "" (empty string) is if the field was blank

    Args:
        ignore (list(str), optional): The text of the ignored instances.
            Default: None, which is treated as ['#'].
        det (bool): Whether to parse the detection annotation. Default: True.
            If True, both 'textBBs' and 'fieldBBs' are parsed and every
            yielded text is None. If False, only 'fieldBBs' are parsed and
            boxes without a usable transcription are dropped.
    """

    def __init__(self,
                 ignore: Optional[List[str]] = None,
                 det: bool = True,
                 **kwargs) -> None:
        # Build the default per instance: a list literal in the signature
        # would be a single object shared by every parser instance.
        self.ignore = ['#'] if ignore is None else ignore
        self.det = det
        super().__init__(**kwargs)

    def parse_file(self, img_path: str, ann_path: str) -> Tuple:
        """Convert single annotation.

        Args:
            img_path (str): Path to the image; returned unchanged.
            ann_path (str): Path to the json annotation file of the image.

        Returns:
            tuple: ``(img_path, instances)`` where ``instances`` is a list of
            dicts with the keys 'poly', 'text' and 'ignore'. 'ignore' is True
            when the text is one of ``self.ignore``.
        """
        instances = [
            dict(poly=poly, text=text, ignore=text in self.ignore)
            for poly, text in self.loader(ann_path)
        ]
        return img_path, instances

    def loader(self,
               file_path: str) -> Iterator[Tuple[List, Optional[str]]]:
        """Load the annotation of the NAF dataset.

        Args:
            file_path (str): Path to the json file

        Yields:
            tuple: ``(poly, text)`` for every kept box. ``poly`` is the flat
            list of the 8 corner coordinates and ``text`` is the cleaned
            transcription, or None when ``self.det`` is True.
        """
        # The transcriptions contain non-ASCII markers, so decode explicitly
        # as UTF-8 instead of relying on the platform's locale encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # 'textBBs' contains the printed texts of the table while 'fieldBBs'
        # contains the text filled by human. 'textBBs' is only used for the
        # detection task.
        box_types = ['textBBs', 'fieldBBs'] if self.det else ['fieldBBs']
        for box_type in box_types:
            for anno in data[box_type]:
                if self.det:
                    # Skip blank fields.
                    if box_type == 'fieldBBs' and anno['type'] == 'blank':
                        continue
                    # Since detection task only need poly, we can skip the
                    # transcription part that can be empty.
                    text = None
                else:
                    # For tasks that need transcription, NAF dataset has
                    # several special cases:
                    # 1. The transcription for the whole image is not
                    # available.
                    # 2. The transcription for the certain text is not
                    # available.
                    # 3. If the length of the transcription is 0, it should
                    # be ignored.
                    if 'transcriptions' not in data:
                        break
                    transcriptions = data['transcriptions']
                    if anno['id'] not in transcriptions:
                        continue
                    text = transcriptions[anno['id']]
                    # Remove unicode control character
                    text = text.strip('\u202a')
                    # Remove strikethrough flag
                    text = text.replace('»', '').replace('«', '')
                    if not text:
                        continue
                # Flatten the four [x, y] corners into a list of 8 values;
                # the fixed (1, 8) shape rejects boxes that are not
                # quadrilaterals.
                poly = np.array(anno['poly_points']).reshape(1,
                                                             8)[0].tolist()
                yield poly, text