Spaces:

Mountchicken
/

MAERec-Gradio

Sleeping

File size: 5,606 Bytes

9bf4bd7

# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import json
import os
import os.path as osp

import cv2
import lmdb
import numpy as np

from mmocr.utils import list_from_file


def parse_line(line, format):
    if format == 'txt':
        img_name, text = line.split(' ')
    else:
        line = json.loads(line)
        img_name = line['filename']
        text = line['text']
    return img_name, text


def check_image_is_valid(imageBin):
    if imageBin is None:
        return False
    imageBuf = np.frombuffer(imageBin, dtype=np.uint8)
    img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE)
    imgH, imgW = img.shape[0], img.shape[1]
    if imgH * imgW == 0:
        return False
    return True


def write_cache(env, cache):
    with env.begin(write=True) as txn:
        cursor = txn.cursor()
        cursor.putmulti(cache, dupdata=False, overwrite=True)


def recog2lmdb(img_root,
               label_path,
               output,
               label_format='txt',
               label_only=False,
               batch_size=1000,
               encoding='utf-8',
               lmdb_map_size=1099511627776,
               verify=True):
    """Create text recognition dataset to LMDB format.

    Args:
        img_root (str): Path to images.
        label_path (str): Path to label file.
        output (str): LMDB output path.
        label_format (str): Format of the label file, either txt or jsonl.
        label_only (bool): Only convert label to lmdb format.
        batch_size (int): Number of files written to the cache each time.
        encoding (str): Label encoding method.
        lmdb_map_size (int): Maximum size database may grow to.
        verify (bool): If true, check the validity of
            every image.Defaults to True.

    E.g.
    This function supports MMOCR's recognition data format and the label file
    can be txt or jsonl, as follows:

        ├──img_root
        |      |—— img1.jpg
        |      |—— img2.jpg
        |      |—— ...
        |——label.txt (or label.jsonl)

        label.txt: img1.jpg HELLO
                   img2.jpg WORLD
                   ...

        label.jsonl: {'filename':'img1.jpg', 'text':'HELLO'}
                     {'filename':'img2.jpg', 'text':'WORLD'}
                     ...
    """
    # check label format
    assert osp.basename(label_path).split('.')[-1] == label_format
    # create lmdb env
    os.makedirs(output, exist_ok=True)
    env = lmdb.open(output, map_size=lmdb_map_size)
    # load label file
    anno_list = list_from_file(label_path, encoding=encoding)
    cache = []
    # index start from 1
    cnt = 1
    n_samples = len(anno_list)
    for anno in anno_list:
        label_key = 'label-%09d'.encode(encoding) % cnt
        img_name, text = parse_line(anno, label_format)
        if label_only:
            # convert only labels to lmdb
            line = json.dumps(
                dict(filename=img_name, text=text), ensure_ascii=False)
            cache.append((label_key, line.encode(encoding)))
        else:
            # convert both images and labels to lmdb
            img_path = osp.join(img_root, img_name)
            if not osp.exists(img_path):
                print('%s does not exist' % img_path)
                continue
            with open(img_path, 'rb') as f:
                image_bin = f.read()
            if verify:
                try:
                    if not check_image_is_valid(image_bin):
                        print('%s is not a valid image' % img_path)
                        continue
                except Exception:
                    print('error occurred at ', img_name)
            image_key = 'image-%09d'.encode(encoding) % cnt
            cache.append((image_key, image_bin))
            cache.append((label_key, text.encode(encoding)))

        if cnt % batch_size == 0:
            write_cache(env, cache)
            cache = []
            print('Written %d / %d' % (cnt, n_samples))
        cnt += 1
    n_samples = cnt - 1
    cache.append(
        ('num-samples'.encode(encoding), str(n_samples).encode(encoding)))
    write_cache(env, cache)
    print('Created lmdb dataset with %d samples' % n_samples)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('label_path', type=str, help='Path to label file')
    parser.add_argument('output', type=str, help='Output lmdb path')
    parser.add_argument(
        '--img-root', '-i', type=str, help='Input imglist path')
    parser.add_argument(
        '--label-only',
        action='store_true',
        help='Only converter label to lmdb')
    parser.add_argument(
        '--label-format',
        '-f',
        default='txt',
        choices=['txt', 'jsonl'],
        help='The format of the label file, either txt or jsonl')
    parser.add_argument(
        '--batch-size',
        '-b',
        type=int,
        default=1000,
        help='Processing batch size, defaults to 1000')
    parser.add_argument(
        '--encoding',
        '-e',
        type=str,
        default='utf8',
        help='Bytes coding scheme, defaults to utf8')
    parser.add_argument(
        '--lmdb-map-size',
        '-m',
        type=int,
        default=1099511627776,
        help='Maximum size database may grow to, '
        'defaults to 1099511627776 bytes (1TB)')
    opt = parser.parse_args()

    assert opt.img_root or opt.label_only
    recog2lmdb(opt.img_root, opt.label_path, opt.output, opt.label_format,
               opt.label_only, opt.batch_size, opt.encoding, opt.lmdb_map_size)


if __name__ == '__main__':
    main()