Mountchicken's picture
Upload 704 files
9bf4bd7
raw
history blame
1.92 kB
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from typing import List
from mmocr.registry import DATA_PARSERS
from .icdar_txt_parser import ICDARTxtTextRecogAnnParser
@DATA_PARSERS.register_module()
class MJSynthAnnParser(ICDARTxtTextRecogAnnParser):
    """Annotation parser for the MJSynth text recognition dataset.

    The dataset ships its annotations as txt files whose lines follow the
    layout:

        img_path, transcription

    Note that :meth:`parse_files` only reads the ``img`` field of each
    loaded record; the transcription is recovered from the image file name
    (the second ``'_'``-separated field of its basename), e.g. a basename of
    ``182_slinking_71711.jpg`` yields the text ``slinking``.

    Args:
        separator (str): The separator between each element in a line.
            Defaults to ','.
        ignore (str): The text to be ignored. Defaults to '#'.
        format (str): The format of the annotation. Defaults to 'img, text'.
        encoding (str): The encoding of the annotation file. Defaults to
            'utf-8-sig'.
        nproc (int): The number of processes to parse the annotation.
            Defaults to 1.
        base_name (bool): Whether to use the basename of the image path as
            the image name. Defaults to False.
        remove_strs (List[str], Optional): Used to remove redundant strings
            in the transcription. Defaults to ['"'].
    """

    def parse_files(self, img_dir: str, ann_path: str) -> List:
        """Build ``(image path, text)`` samples from one annotation file.

        Args:
            img_dir (str): Directory that is joined in front of every image
                path read from the annotation file.
            ann_path (str): Path of the annotation file handed to
                ``self.loader``.

        Returns:
            List: One ``(osp.join(img_dir, img), text)`` tuple per kept
            record. Records whose cleaned text equals ``self.ignore`` are
            left out.

        Raises:
            AssertionError: If ``ann_path`` is not a ``str``.
            IndexError: If an image basename contains no ``'_'``.
        """
        assert isinstance(ann_path, str)

        records = self.loader(
            file_path=ann_path,
            format=self.format,
            encoding=self.encoding,
            separator=self.sep)

        parsed = []
        for record in records:
            rel_path = record['img']
            # The word is encoded in the file name, between the first and
            # second underscore of the basename.
            word = osp.basename(rel_path).split('_')[1]
            if self.remove_strs is not None:
                for redundant in self.remove_strs:
                    word = word.replace(redundant, '')
            # Keep everything except records marked with the ignore text.
            if word != self.ignore:
                parsed.append((osp.join(img_dir, rel_path), word))
        return parsed