# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import json
from typing import List, Tuple

from mmocr.datasets import RecogLMDBDataset
from mmocr.utils import StringStripper, dump_ocr_data, recog_anno_to_imginfo

# Legacy annotation formats accepted by ``parse_legacy_data`` and the CLI.
_SUPPORTED_FORMATS = ('txt', 'jsonl', 'lmdb')


def _drop_dot_slash(file_path: str) -> str:
    """Return ``file_path`` without a leading ``./`` (MJ-style paths)."""
    if file_path.startswith('./'):
        return file_path[2:]
    return file_path


def parse_legacy_data(in_path: str,
                      format: str) -> Tuple[List[str], List[str]]:
    """Load legacy data and return a list of file paths and labels.

    Args:
        in_path (str): Path to annotation file.
        format (str): Annotation format. Choices are 'txt', 'jsonl' and
            'lmdb'. For 'lmdb' format, the lmdb file should only contain
            labels. For lmdb file with labels and images, the conversion is
            unnecessary.

    Returns:
        tuple(list[str], list[str]): File paths and labels.

    Raises:
        ValueError: If ``format`` is not one of the supported formats, or
            if a non-blank 'txt' line has fewer than two
            whitespace-separated fields.
    """
    # Fail loudly instead of silently returning two empty lists.
    if format not in _SUPPORTED_FORMATS:
        raise ValueError(
            f'Unsupported format "{format}", expected one of '
            f'{_SUPPORTED_FORMATS}.')

    file_paths = []
    labels = []
    strip_cls = StringStripper()

    if format == 'lmdb':
        dataset = RecogLMDBDataset(
            in_path,
            parser_cfg=dict(type='LineJsonParser', keys=['filename', 'text']))
        for data_info in dataset:
            file_path = strip_cls(data_info['img_path'])
            label = strip_cls(data_info['instances'][0]['text'])
            file_paths.append(_drop_dot_slash(file_path))
            labels.append(label)
        return file_paths, labels

    # Read as UTF-8 explicitly so non-ASCII labels do not depend on the
    # platform's default encoding.
    with open(in_path, encoding='utf-8') as f:
        if format == 'txt':
            for line in f:
                line = strip_cls(line)
                if not line:
                    # Tolerate blank / trailing empty lines.
                    continue
                # Only the first two whitespace-separated fields are used:
                # the file path and the label.
                file_path, label = line.split()[:2]
                file_paths.append(_drop_dot_slash(file_path))
                labels.append(label)
        else:  # 'jsonl': one JSON object per line
            for line in f:
                if not line.strip():
                    # Tolerate blank / trailing empty lines.
                    continue
                datum = json.loads(line)
                file_paths.append(_drop_dot_slash(datum['filename']))
                labels.append(datum['text'])
    return file_paths, labels


def parse_args():
    """Parse input arguments.

    Returns:
        argparse.Namespace: Parsed arguments with ``in_path``, ``out_path``
        and ``format`` attributes.

    Raises:
        ValueError: If ``out_path`` does not end with ``.json``.
    """
    parser = argparse.ArgumentParser(
        description='Convert annotations for '
        'text recognition tasks in MMOCR 0.x into the latest openmmlab format.'
    )
    parser.add_argument(
        'in_path', help='The path to legacy recognition data file')
    parser.add_argument(
        'out_path', help='The output json path in openmmlab format')
    parser.add_argument(
        '--format',
        choices=list(_SUPPORTED_FORMATS),
        type=str,
        default='txt',
        help='Legacy data format')
    args = parser.parse_args()
    # Require a real ``.json`` suffix; a bare name such as ``json`` has no
    # extension and must be rejected.
    if not args.out_path.endswith('.json'):
        raise ValueError('The output path must be a json file.')
    return args


def main():
    """Convert a legacy recognition annotation file to the new format."""
    args = parse_args()
    file_paths, labels = parse_legacy_data(args.in_path, args.format)
    img_infos = recog_anno_to_imginfo(file_paths, labels)
    dump_ocr_data(img_infos, args.out_path, 'textrecog')
    print('finish')


if __name__ == '__main__':
    main()