File size: 1,917 Bytes
9bf4bd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from typing import List

from mmocr.registry import DATA_PARSERS
from .icdar_txt_parser import ICDARTxtTextRecogAnnParser


@DATA_PARSERS.register_module()
class MJSynthAnnParser(ICDARTxtTextRecogAnnParser):
    """Annotation parser for the MJSynth text recognition dataset.

    Annotations are read from a txt file in which every line has the form:
        img_path, transcription

    Only the ``img`` field of each parsed line is consumed by this parser:
    the transcription is recovered from the image file name, as the second
    ``'_'``-separated token of its basename.

    This class defines no constructor of its own; the arguments below are
    those accepted by ``ICDARTxtTextRecogAnnParser``.

    Args:
        separator (str): The separator between each element in a line. Defaults
            to ','.
        ignore (str): The text to be ignored. Defaults to '#'.
        format (str): The format of the annotation. Defaults to 'img, text'.
        encoding (str): The encoding of the annotation file. Defaults to
            'utf-8-sig'.
        nproc (int): The number of processes to parse the annotation. Defaults
            to 1.
        base_name (bool): Whether to use the basename of the image path as the
            image name. Defaults to False.
        remove_strs (List[str], Optional): Used to remove redundant strings in
            the transcription. Defaults to ['"'].
    """

    def parse_files(self, img_dir: str, ann_path: str) -> List:
        """Build ``(image path, text)`` samples from one annotation file.

        Args:
            img_dir (str): Directory that each annotated image path is joined
                onto.
            ann_path (str): Path of the annotation file to load.

        Returns:
            List: One ``(osp.join(img_dir, img), text)`` tuple per kept line,
            in the order yielded by the loader. Lines whose cleaned text
            equals ``self.ignore`` are left out.
        """
        assert isinstance(ann_path, str)
        records = self.loader(
            file_path=ann_path,
            format=self.format,
            encoding=self.encoding,
            separator=self.sep)

        parsed = []
        for record in records:
            rel_path = record['img']
            # The label is embedded in the file name: take the token between
            # the first and second underscore of the basename.
            name_tokens = osp.basename(rel_path).split('_')
            word = name_tokens[1]
            if self.remove_strs is not None:
                # Strip every configured redundant substring from the label.
                for unwanted in self.remove_strs:
                    word = word.replace(unwanted, '')
            # Keep the sample unless its label is the "ignore" marker.
            if word != self.ignore:
                parsed.append((osp.join(img_dir, rel_path), word))

        return parsed