Spaces:
Sleeping
Sleeping
# Copyright (c) OpenMMLab. All rights reserved. | |
import os.path as osp | |
from typing import List | |
from mmocr.registry import DATA_PARSERS | |
from .icdar_txt_parser import ICDARTxtTextRecogAnnParser | |
class MJSynthAnnParser(ICDARTxtTextRecogAnnParser): | |
"""MJSynth Text Recognition Annotation Parser. | |
The original annotation format of this dataset is stored in txt files, | |
which is formed as the following format: | |
img_path, transcription | |
Args: | |
separator (str): The separator between each element in a line. Defaults | |
to ','. | |
ignore (str): The text to be ignored. Defaults to '#'. | |
format (str): The format of the annotation. Defaults to 'img, text'. | |
encoding (str): The encoding of the annotation file. Defaults to | |
'utf-8-sig'. | |
nproc (int): The number of processes to parse the annotation. Defaults | |
to 1. | |
base_name (bool): Whether to use the basename of the image path as the | |
image name. Defaults to False. | |
remove_strs (List[str], Optional): Used to remove redundant strings in | |
the transcription. Defaults to ['"']. | |
""" | |
def parse_files(self, img_dir: str, ann_path: str) -> List: | |
"""Parse annotations.""" | |
assert isinstance(ann_path, str) | |
samples = list() | |
for anno in self.loader( | |
file_path=ann_path, | |
format=self.format, | |
encoding=self.encoding, | |
separator=self.sep): | |
text = osp.basename(anno['img']).split('_')[1] | |
if self.remove_strs is not None: | |
for strs in self.remove_strs: | |
text = text.replace(strs, '') | |
if text == self.ignore: | |
continue | |
img_name = anno['img'] | |
samples.append((osp.join(img_dir, img_name), text)) | |
return samples | |