Spaces:
Sleeping
Sleeping
# Copyright (c) OpenMMLab. All rights reserved.
import argparse | |
import os.path as osp | |
import time | |
import warnings | |
from mmengine import Config | |
from mmocr.datasets.preparers import DatasetPreparer | |
def parse_args():
    """Build the command-line interface and parse ``sys.argv``.

    Returns:
        argparse.Namespace: Parsed options with the attributes ``datasets``,
        ``nproc``, ``task``, ``splits``, ``lmdb``, ``overwrite_cfg`` and
        ``dataset_zoo_path``.
    """
    task_choices = ['textdet', 'textrecog', 'textspotting', 'kie']

    cli = argparse.ArgumentParser(
        description='Preparing datasets used in MMOCR.')

    # Positional: one or more dataset names.
    cli.add_argument(
        'datasets',
        nargs='+',
        help='A list of the dataset names that would like to prepare.')
    cli.add_argument(
        '--nproc', type=int, default=4, help='Number of processes to run')
    cli.add_argument(
        '--task',
        choices=task_choices,
        default='textdet',
        help='Task type. Options are "textdet", "textrecog", "textspotting"'
        ' and "kie".')
    cli.add_argument(
        '--splits',
        nargs='+',
        default=['train', 'test', 'val'],
        help='A list of the split that would like to prepare.')
    # ``store_true`` flags already default to False.
    cli.add_argument(
        '--lmdb',
        action='store_true',
        help='Whether to dump the textrecog dataset to LMDB format, It\'s a '
        'shortcut to force the dataset to be dumped in lmdb format. '
        'Applicable when --task=textrecog')
    cli.add_argument(
        '--overwrite-cfg',
        action='store_true',
        help='Whether to overwrite the dataset config file if it already'
        ' exists. If not specified, Dataset Preparer will not generate'
        ' new config for datasets whose configs are already in base.')
    cli.add_argument(
        '--dataset-zoo-path',
        default='./dataset_zoo',
        help='Path to dataset zoo config files.')

    return cli.parse_args()
def parse_meta(task: str, meta_path: str) -> None:
    """Validate the task against a dataset's meta file and show its license.

    The meta file is loaded with ``Config.fromfile``. If the file does not
    exist the function returns without doing anything. Otherwise ``task``
    must be listed in ``meta['Data']['Tasks']``. When a license type is
    declared, the dataset name, license type, license link and BibTeX entry
    are printed. A disclaimer is then printed, followed by a 5-second
    countdown during which the user can abort with ctrl+c.

    Args:
        task (str): Task type to prepare the dataset for.
        meta_path (str): Path to meta file.

    Raises:
        AssertionError: If ``task`` is not listed in the meta file.
    """
    try:
        meta = Config.fromfile(meta_path)
    except FileNotFoundError:
        # No meta file for this dataset: nothing to validate or display.
        return

    # Raised explicitly rather than via ``assert`` so the check is not
    # stripped when Python runs with -O. The exception type is kept as
    # AssertionError for backward compatibility.
    if task not in meta['Data']['Tasks']:
        raise AssertionError(f'Task {task} not supported!')

    # License related
    if meta['Data']['License']['Type']:
        print(f"\033[1;33;40mDataset Name: {meta['Name']}")
        print(f"License Type: {meta['Data']['License']['Type']}")
        print(f"License Link: {meta['Data']['License']['Link']}")
        print(f"BibTeX: {meta['Paper']['BibTeX']}\033[0m")
    print('\033[1;31;43mMMOCR does not own the dataset. Using this '
          'dataset you must accept the license provided by the owners, '
          'and cite the corresponding papers appropriately.')
    print('If you do not agree with the above license, please cancel '
          'the progress immediately by pressing ctrl+c. Otherwise, '
          'you are deemed to accept the terms and conditions.\033[0m')
    # Give the user five seconds to cancel before preparation starts.
    for remaining in range(5, 0, -1):
        print(f'{remaining}...')
        time.sleep(1)
def force_lmdb(cfg):
    """Rewrite a preparer config so that it dumps annotations as LMDB.

    Every configured split preparer gets its dumper type switched to
    ``TextRecogLMDBDumper``, the generated dataset name gets an ``_lmdb``
    suffix, and the annotation file names in the config generator are
    pointed at ``.lmdb`` files. The config is modified in place.

    Args:
        cfg (Config): Config object.

    Returns:
        Config: The same config object, after modification.

    Raises:
        ValueError: If a configured split preparer has no dumper.
    """
    for split_name in ('train', 'val', 'test'):
        preparer = cfg.get(f'{split_name}_preparer')
        if not preparer:
            continue
        if preparer.get('dumper') is None:
            raise ValueError(
                f'{split_name} split does not come with a dumper, '
                'so most likely the annotations are MMOCR-ready and do '
                'not need any adaptation, and it '
                'cannot be dumped in LMDB format.')
        preparer.dumper['type'] = 'TextRecogLMDBDumper'

    lmdb_dataset_name = f'{cfg.dataset_name}_lmdb'
    generator_cfg = cfg.config_generator
    generator_cfg['dataset_name'] = lmdb_dataset_name

    # Annotation lists used when the config does not define the key at all.
    fallback_anns = {
        'train_anns': [dict(ann_file='textrecog_train.lmdb')],
        'test_anns': [dict(ann_file='textrecog_test.lmdb')],
    }
    for anns_key in ('train_anns', 'val_anns', 'test_anns'):
        if anns_key not in generator_cfg:
            generator_cfg[anns_key] = fallback_anns.get(anns_key, [])
            continue

        anns = generator_cfg[anns_key]
        # It can be None when users want to clear out the default value;
        # leave such entries untouched.
        if not anns:
            continue

        for ann in anns:
            stem, _ = osp.splitext(ann['ann_file'])
            ann['ann_file'] = stem + '.lmdb'
        generator_cfg[anns_key] = anns

    return cfg
def main():
    """Prepare every dataset requested on the command line.

    For each dataset name, the matching directory under the dataset zoo is
    looked up; names without a directory are skipped with a warning. The
    meta file is checked via ``parse_meta``, the task config is loaded and
    extended with the command-line options, and a ``DatasetPreparer`` built
    from it is run on the requested splits.

    Raises:
        ValueError: If ``--lmdb`` is combined with a task other than
            ``textrecog``.
    """
    args = parse_args()
    use_lmdb = args.lmdb
    if use_lmdb and args.task != 'textrecog':
        raise ValueError('--lmdb only works with --task=textrecog')

    for dataset in args.datasets:
        dataset_dir = osp.join(args.dataset_zoo_path, dataset)
        if not osp.isdir(dataset_dir):
            warnings.warn(f'{dataset} is not supported yet. Please check '
                          'dataset zoo for supported datasets.')
            continue

        # Validate the task and show the license notice, if a meta file
        # exists for this dataset.
        parse_meta(args.task, osp.join(dataset_dir, 'metafile.yml'))

        cfg = Config.fromfile(osp.join(dataset_dir, args.task + '.py'))
        has_generator = cfg.get('config_generator', None) is not None
        if args.overwrite_cfg and has_generator:
            cfg.config_generator.overwrite_cfg = args.overwrite_cfg

        # Propagate the command-line options into the config.
        cfg.nproc = args.nproc
        cfg.task = args.task
        cfg.dataset_name = dataset
        if use_lmdb:
            cfg = force_lmdb(cfg)

        DatasetPreparer.from_file(cfg).run(args.splits)
# Run the preparation pipeline only when this file is executed as a script;
# importing it as a module does not call main().
if __name__ == '__main__':
    main()