Spaces:

Mountchicken
/

MAERec-Gradio

Sleeping

[Update] Initial Update

174ad5e over 1 year ago

1.8 kB

	# Copyright (c) OpenMMLab. All rights reserved.
	import argparse
	import os.path as osp

	from mmocr.utils import dump_ocr_data


	def convert_annotations(root_path, split):
	    """Convert original annotations to mmocr format.

	    The annotation format of this dataset is as the following:
	        word_1.png, "flying"
	        word_2.png, "today"
	        word_3.png, "means"
	    See the format of converted annotation in mmocr.utils.dump_ocr_data.

	    Args:
	        root_path (str): The root path of the dataset
	        split (str): The split of dataset. Namely: Train or Test

	    Returns:
	        list[dict]: One entry per annotation line, each of the form
	        ``{'file_name': <image name>, 'anno_info': [{'text': <word>}]}``.

	    Raises:
	        ValueError: If a non-blank line does not contain the ``, "``
	            separator between the image name and the quoted text.
	    """
	    assert isinstance(root_path, str)
	    assert isinstance(split, str)

	    anno_path = osp.join(root_path, 'annotations',
	                         f'Challenge1_{split}_Task3_GT.txt')

	    img_info = []
	    # 'utf-8-sig' drops a leading byte-order mark so it does not end up
	    # glued to the first image name.
	    with open(anno_path, encoding='utf-8-sig') as f:
	        for anno in f:
	            anno = anno.rstrip('\n')
	            # Tolerate blank lines (e.g. a trailing empty line).
	            if not anno.strip():
	                continue

	            # Split on the first separator only: the text itself may
	            # contain a comma, or even another ', "' sequence.
	            dst_img_name, word = anno.split(', "', 1)

	            # Drop the closing quote. Working on the newline-stripped line
	            # also handles a last line that has no trailing newline.
	            if word.endswith('"'):
	                word = word[:-1]

	            img_info.append({
	                'file_name': dst_img_name,
	                'anno_info': [{
	                    'text': word
	                }]
	            })

	    return img_info


	def parse_args():
	    """Parse the command-line arguments of the converter.

	    Returns:
	        argparse.Namespace: Parsed arguments; ``root_path`` holds the
	        single positional argument.
	    """
	    cli = argparse.ArgumentParser(
	        description='Generate training and test set of IC11')
	    cli.add_argument('root_path', help='Root dir path of IC11')
	    return cli.parse_args()


	def main():
	    """Convert the Train and Test splits under the given root path.

	    For each split, the annotations are converted with
	    ``convert_annotations`` and handed to ``dump_ocr_data`` together with
	    the target path ``<root_path>/<split in lower case>_label.json`` and
	    the task name ``'textrecog'``.
	    """
	    root_path = parse_args().root_path

	    for split in ('Train', 'Test'):
	        records = convert_annotations(root_path, split)
	        label_file = osp.join(root_path, f'{split.lower()}_label.json')
	        dump_ocr_data(records, label_file, 'textrecog')
	        print(f'{split} split converted.')


	# Run the conversion only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == '__main__':
	    main()