Spaces:
Sleeping
Sleeping
File size: 7,680 Bytes
9bf4bd7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 |
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import os
import os.path as osp
import shutil
from typing import List, Optional, Union
from mmocr.registry import (CFG_GENERATORS, DATA_DUMPERS, DATA_GATHERERS,
DATA_OBTAINERS, DATA_PACKERS, DATA_PARSERS)
from mmocr.utils.typing_utils import ConfigType, OptConfigType
class DatasetPreparer:
    """Base class of dataset preparer.

    Dataset preparer is used to prepare dataset for MMOCR. It mainly consists
    of three steps:

    1. For each split:

       - Obtain the dataset
         - Download
         - Extract
         - Move/Rename
       - Gather the dataset
       - Parse the dataset
       - Pack the dataset to MMOCR format
       - Dump the dataset

    2. Delete useless files
    3. Generate the base config for this dataset

    After all these steps, the original datasets have been prepared for
    usage in MMOCR. Check out the dataset format used in MMOCR here:
    https://mmocr.readthedocs.io/en/dev-1.x/user_guides/dataset_prepare.html

    Args:
        data_root (str): Root directory of data.
        dataset_name (str): Dataset name. Defaults to ''.
        task (str): Task type. Options are 'textdet', 'textrecog',
            'textspotter', and 'kie'. Defaults to 'textdet'.
        nproc (int): Number of parallel processes. Defaults to 4.
        train_preparer (OptConfigType): cfg for train data prepare. It
            contains the following keys:

            - obtainer: cfg for data obtainer.
            - gatherer: cfg for data gatherer.
            - parser: cfg for data parser.
            - packer: cfg for data packer.
            - dumper: cfg for data dumper.

            Defaults to None.
        test_preparer (OptConfigType): cfg for test data prepare. Defaults to
            None.
        val_preparer (OptConfigType): cfg for val data prepare. Defaults to
            None.
        config_generator (OptConfigType): cfg for config generator. Defaults
            to None.
        delete (list[str], optional): List of files (relative to
            ``data_root``) to be deleted after preparation. Defaults to None.
    """

    def __init__(self,
                 data_root: str,
                 dataset_name: str = '',
                 task: str = 'textdet',
                 nproc: int = 4,
                 train_preparer: OptConfigType = None,
                 test_preparer: OptConfigType = None,
                 val_preparer: OptConfigType = None,
                 config_generator: OptConfigType = None,
                 delete: Optional[List[str]] = None) -> None:
        self.data_root = data_root
        self.nproc = nproc
        self.task = task
        self.dataset_name = dataset_name
        self.train_preparer = train_preparer
        self.test_preparer = test_preparer
        self.val_preparer = val_preparer
        self.config_generator = config_generator
        self.delete = delete

    def run(self,
            splits: Union[str, List] = ('train', 'test', 'val')) -> None:
        """Prepare the dataset for the given split(s), then clean up and
        generate the base config.

        Args:
            splits (str or list[str]): Split(s) to prepare. Each must be one
                of 'train', 'test' or 'val'. Defaults to all three.
        """
        if isinstance(splits, str):
            splits = [splits]
        # Note: a tuple default is used above to avoid a shared mutable
        # default argument; it is iterated/validated only, never mutated.
        assert set(splits).issubset({'train', 'test',
                                     'val'}), 'Invalid split name'
        for split in splits:
            self.loop(split, getattr(self, f'{split}_preparer'))
        self.clean()
        self.generate_config()

    @classmethod
    def from_file(cls, cfg: ConfigType) -> 'DatasetPreparer':
        """Create a DatasetPreparer from a config.

        Args:
            cfg (ConfigType): A config used for building the preparer. Keys
                of ``cfg`` can see :meth:`__init__`. ``data_root`` is
                required; all other keys fall back to the
                :meth:`__init__` defaults.

        Returns:
            DatasetPreparer: A DatasetPreparer built from ``cfg``.
        """
        # Deep-copy so that the defaults injected later (e.g. by
        # ``setdefault`` in :meth:`loop`) never leak back into the caller's
        # config object.
        cfg = copy.deepcopy(cfg)
        data_preparer = cls(
            data_root=cfg['data_root'],
            dataset_name=cfg.get('dataset_name', ''),
            task=cfg.get('task', 'textdet'),
            nproc=cfg.get('nproc', 4),
            train_preparer=cfg.get('train_preparer', None),
            test_preparer=cfg.get('test_preparer', None),
            val_preparer=cfg.get('val_preparer', None),
            delete=cfg.get('delete', None),
            config_generator=cfg.get('config_generator', None))
        return data_preparer

    def loop(self, split: str, cfg: ConfigType) -> None:
        """Run obtain/gather/parse/pack/dump for one split.

        Args:
            split (str): The split of the dataset.
            cfg (ConfigType): A config used for building obtainer, gatherer,
                parser, packer and dumper. If None, the split is skipped.

        Raises:
            ValueError: If only some of gatherer/parser/packer/dumper are
                configured; they must be all present or all absent.
        """
        if cfg is None:
            return

        # All ``setdefault`` calls below pass the value positionally:
        # built-in ``dict.setdefault`` is positional-only (a ``default=``
        # keyword raises TypeError), so this works for plain-dict configs as
        # well as mmengine ``ConfigDict``.

        # build obtainer and run it (downloads/extracts/moves raw data)
        obtainer = cfg.get('obtainer', None)
        if obtainer:
            print(f'Obtaining {split} Dataset...')
            obtainer.setdefault('task', self.task)
            obtainer.setdefault('data_root', self.data_root)
            obtainer = DATA_OBTAINERS.build(obtainer)
            obtainer()

        # The four processing stages form one pipeline and only make sense
        # together: either all are configured or none is.
        gatherer = cfg.get('gatherer', None)
        parser = cfg.get('parser', None)
        packer = cfg.get('packer', None)
        dumper = cfg.get('dumper', None)
        related = [gatherer, parser, packer, dumper]
        if all(item is None for item in related):  # no data process
            return
        if not all(item is not None for item in related):
            raise ValueError('gatherer, parser, packer and dumper should be '
                             'either all None or not None')

        # build gatherer: collects image/annotation file paths
        print(f'Gathering {split} Dataset...')
        gatherer.setdefault('split', split)
        gatherer.setdefault('data_root', self.data_root)
        gatherer.setdefault('ann_dir', 'annotations')
        gatherer.setdefault('img_dir', osp.join(f'{self.task}_imgs', split))
        gatherer = DATA_GATHERERS.build(gatherer)
        img_paths, ann_paths = gatherer()

        # build parser
        print(f'Parsing {split} Images and Annotations...')
        parser.setdefault('split', split)
        parser.setdefault('nproc', self.nproc)
        parser = DATA_PARSERS.build(parser)
        # Convert dataset annotations to MMOCR format
        samples = parser(img_paths, ann_paths)

        # build packer
        print(f'Packing {split} Annotations...')
        packer.setdefault('split', split)
        packer.setdefault('nproc', self.nproc)
        packer.setdefault('data_root', self.data_root)
        packer = DATA_PACKERS.build(packer)
        samples = packer(samples)

        # build dumper
        print(f'Dumping {split} Annotations...')
        # Dump annotation files
        dumper.setdefault('task', self.task)
        dumper.setdefault('split', split)
        dumper.setdefault('data_root', self.data_root)
        dumper = DATA_DUMPERS.build(dumper)
        dumper(samples)

    def generate_config(self) -> None:
        """Generate the base dataset config via the configured generator.

        No-op when ``config_generator`` is None.
        """
        if self.config_generator is None:
            return
        self.config_generator.setdefault('dataset_name', self.dataset_name)
        self.config_generator.setdefault('data_root', self.data_root)
        config_generator = CFG_GENERATORS.build(self.config_generator)
        print('Generating base configs...')
        config_generator()

    def clean(self) -> None:
        """Delete the files/directories listed in ``self.delete``.

        Paths are resolved relative to ``data_root``; missing paths are
        silently skipped (best-effort cleanup). No-op when ``delete`` is
        None.
        """
        if self.delete is None:
            return
        for d in self.delete:
            delete_file = osp.join(self.data_root, d)
            if osp.exists(delete_file):
                if osp.isdir(delete_file):
                    shutil.rmtree(delete_file)
                else:
                    os.remove(delete_file)
|