Spaces:

Mountchicken
/

MAERec-Gradio

Sleeping

App Files Files Community

MAERec-Gradio / mmocr /datasets /preparers /parsers /mjsynth_parser.py

Mountchicken

Upload 704 files

9bf4bd7 over 1 year ago

raw

history blame

1.92 kB

	# Copyright (c) OpenMMLab. All rights reserved.
	import os.path as osp
	from typing import List

	from mmocr.registry import DATA_PARSERS
	from .icdar_txt_parser import ICDARTxtTextRecogAnnParser


	@DATA_PARSERS.register_module()
	class MJSynthAnnParser(ICDARTxtTextRecogAnnParser):
	    """Annotation parser for the MJSynth text recognition dataset.

	    The annotation file is a txt file that is read line by line through the
	    loader inherited from the parent parser. Only the ``img`` field of each
	    parsed line is consumed here: the transcription is not read from a
	    separate column but recovered from the image file name, as the second
	    ``_``-separated token of its basename (e.g. ``1_word_42.jpg`` yields
	    ``word``).

	    The constructor is inherited unchanged from
	    ``ICDARTxtTextRecogAnnParser``.

	    Args:
	        separator (str): The separator between the elements of a line.
	            Defaults to ','.
	        ignore (str): Transcription value marking a sample to be skipped.
	            Defaults to '#'.
	        format (str): The format of the annotation. Defaults to
	            'img, text'.
	        encoding (str): The encoding of the annotation file. Defaults to
	            'utf-8-sig'.
	        nproc (int): The number of processes used to parse the annotation.
	            Defaults to 1.
	        base_name (bool): Whether to use the basename of the image path as
	            the image name. Defaults to False.
	        remove_strs (List[str], Optional): Substrings stripped from the
	            transcription. Defaults to ['"'].
	    """

	    def parse_files(self, img_dir: str, ann_path: str) -> List:
	        """Build the recognition samples listed in an annotation file.

	        Args:
	            img_dir (str): Directory joined in front of every image path.
	            ann_path (str): Path of the annotation file to read.

	        Returns:
	            List: ``(image_path, text)`` tuples, one per annotation line
	            whose cleaned text differs from ``self.ignore``.

	        Note:
	            An image basename that contains no ``_`` raises ``IndexError``.
	        """
	        assert isinstance(ann_path, str)
	        annotations = self.loader(
	            file_path=ann_path,
	            format=self.format,
	            encoding=self.encoding,
	            separator=self.sep)

	        parsed = []
	        for entry in annotations:
	            img_name = entry['img']
	            # The label is embedded in the file name: it is the second
	            # underscore-separated token of the basename.
	            label = osp.basename(img_name).split('_')[1]
	            if self.remove_strs is not None:
	                for redundant in self.remove_strs:
	                    label = label.replace(redundant, '')
	            # Samples whose cleaned label equals the ignore marker are
	            # dropped.
	            if label != self.ignore:
	                parsed.append((osp.join(img_dir, img_name), label))

	        return parsed