"""
dataset divide script  ver: Jan 9th 15:30 official release

ref:https://zhuanlan.zhihu.com/p/199238910
"""
import os
import random
import shutil
from shutil import copy2
from multiprocessing import Pool, cpu_count


def del_file(filepath):
    """
    Delete all files and sub-folders inside a directory
    :param filepath: path of the directory to clear
    :return:
    """
    del_list = os.listdir(filepath)
    for f in del_list:
        file_path = os.path.join(filepath, f)
        if os.path.isfile(file_path):
            os.remove(file_path)
        elif os.path.isdir(file_path):
            shutil.rmtree(file_path)


def make_and_clear_path(file_pack_path):
    if not os.path.exists(file_pack_path):
        os.makedirs(file_pack_path)
    del_file(file_pack_path)


def a_dataset_split(src_data_folder, target_data_folder, class_name, train_scale, val_scale, test_scale, com_num=None):
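    """
    Split one class folder into train/val/test according to the given ratios.
    :param com_num: optional class index, only used for progress printing in the parallel path
    """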
    current_class_data_path = os.path.join(src_data_folder, class_name)
    current_all_data = os.listdir(current_class_data_path)

    current_data_length = len(current_all_data)
    current_data_index_list = list(range(current_data_length))
    random.shuffle(current_data_index_list)

    train_folder = os.path.join(target_data_folder, 'train', class_name)
    val_folder = os.path.join(target_data_folder, 'val', class_name)
    test_folder = os.path.join(target_data_folder, 'test', class_name)

    train_stop_flag = current_data_length * train_scale
    val_stop_flag = current_data_length * (train_scale + val_scale)
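    # e.g. 100 images at 0.8/0.1/0.1 -> indices [0, 80) train, [80, 90) val, [90, 100) test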
    current_idx = 0
    train_num = 0
    val_num = 0
    test_num = 0
    for i in current_data_index_list:
        src_img_path = os.path.join(current_class_data_path, current_all_data[i])
        if current_idx < train_stop_flag:  # strict '<' avoids an off-by-one at the split boundary
            copy2(src_img_path, train_folder)
            # print("{} copied to {}".format(src_img_path, train_folder))
            train_num += 1

        elif current_idx < val_stop_flag:
            copy2(src_img_path, val_folder)
            # print("{} copied to {}".format(src_img_path, val_folder))
            val_num += 1

        else:
            copy2(src_img_path, test_folder)
            # print("{} copied to {}".format(src_img_path, test_folder))
            test_num += 1

        current_idx += 1

    print("*********************************{}*************************************".format(class_name) + '\n' +
          "{} class has been divided into {}:{}:{}, a total of {} images".format(class_name, train_scale, val_scale,
                                                                                 test_scale,
                                                                                 current_data_length) +
          '\n' + "Train set{}: {} pics".format(
        train_folder,
        train_num)
          + '\n' + "Validation set{}: {} pics".format(val_folder, val_num) + '\n' + "Test set{}: {} pics".format(
        test_folder, test_num)
          + '\n')

    if com_num is not None:
        print('processed class idx:', com_num)


def data_set_split(src_data_folder, target_data_folder='./dataset', train_scale=0.8, val_scale=0.2, test_scale=0.0,
                   Parallel_processing=False):
    """
    Read the source data folder and generate 'train', 'val' and 'test' splits in the target folder
    :param src_data_folder: source folder in imagenet format (one sub-folder per class)
    :param target_data_folder: target folder that will hold the train/val/test sub-folders
    :param train_scale: train set ratio
    :param val_scale: validation set ratio
    :param test_scale: test set ratio

    :param Parallel_processing: whether to process classes in parallel (one task per class)

    :return:
    """
    make_and_clear_path(target_data_folder)
    print("Begin dataset division")
    class_names = os.listdir(src_data_folder)
    # Create folder in the target directory
    split_names = ['train', 'val', 'test']
    for split_name in split_names:
        split_path = os.path.join(target_data_folder, split_name)
        # Then create category folder under the split_path directory
        for class_name in class_names:
            class_split_path = os.path.join(split_path, class_name)
            os.makedirs(class_split_path)

    if Parallel_processing:
        # Create the process pool: one task per class, leaving at least 2 cores free
        tasks_num = len(class_names)
        process_pool = Pool(max(1, min(cpu_count() - 2, tasks_num)))

        com_num = 0
        print("Start processing {} classes by multi-process".format(tasks_num))
        # Schedule tasks
        for class_name in class_names:
            # apply_async(func, args) dispatches each class to a free worker process;
            # note that worker exceptions are silently dropped unless the returned
            # AsyncResult is checked with .get()
            com_num += 1
            args = (src_data_folder, target_data_folder, class_name, train_scale, val_scale, test_scale, com_num)
            process_pool.apply_async(a_dataset_split, args)

        process_pool.close()  # Stop accepting new tasks
        process_pool.join()  # Wait for all workers to finish; must come after close()

    else:
        # Divide the dataset according to the proportion, and copy the data image
        # Traverse by category
        for class_name in class_names:
            a_dataset_split(src_data_folder, target_data_folder, class_name, train_scale, val_scale, test_scale)


def k_fold_split(src_data_folder, target_data_folder='./kfold', k=5):
    """
    Read the source data folder and generate k fold folders, each containing 'train' and 'val' splits.

    :param src_data_folder: imagenet-format folder (one sub-folder per class) to be divided by k-folding
    :param target_data_folder: target folder; k fold folders are generated inside, each in imagenet format with train and val
    :param k: the number of folds

    :return:
    """
    make_and_clear_path(target_data_folder)
    print("Begin dataset division")
    class_names = os.listdir(src_data_folder)  # Get category names

    # For each category, divide the data into k folds and copy the images out
    for class_name in class_names:  # Traverse by class first

        current_class_data_path = os.path.join(src_data_folder, class_name)
        current_class_data_names = os.listdir(current_class_data_path)

        current_data_length = len(current_class_data_names)
        random.shuffle(current_class_data_names)

        # Divide data
        split_num = current_data_length // k
        # Group every split_num names into one pack; if the length is not divisible by k
        # there are k+1 packs, and the extra last pack holds at most k-1 names
        # (this assumes every class has at least k images)
        temp_split_pack = [current_class_data_names[i:i + split_num] for i in range(0, current_data_length, split_num)]
        fold_name_pack = [temp_split_pack[i] for i in range(0, k)]  # Keep the first k packs
        if len(temp_split_pack) > k:
            # Not evenly divisible: hand out the extra pack's names to the folds one by one
            for pack_idx, name in enumerate(temp_split_pack[-1]):  # The extra pack has at most k-1 names
                fold_name_pack[pack_idx].append(name)
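        # Worked example: 17 names, k=5 -> split_num=3, packs of 3,3,3,3,3,2;
        # the trailing 2 names go to folds 0 and 1, giving pack sizes 4,4,3,3,3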

        print("{} class is divided into {} cross-validation, a total of {} images".format(class_name, k,
                                                                                          current_data_length))

        for p in range(1, k + 1):  # For each fold, start from 1
            # Folder
            train_folder = os.path.join(target_data_folder, 'fold_' + str(p), 'train', class_name)
            val_folder = os.path.join(target_data_folder, 'fold_' + str(p), 'val', class_name)
            os.makedirs(train_folder)
            os.makedirs(val_folder)

            pack_idx = p - 1  # Use the current fold of data as val set, and use the rest as train set

            # Copy divided data
            train_num = 0
            val_num = 0

            for j in range(k):
                if j == pack_idx:
                    for i in fold_name_pack[j]:
                        src_img_path = os.path.join(current_class_data_path, i)
                        copy2(src_img_path, val_folder)
                        val_num += 1
                        # print("{} has copied to {}".format(src_img_path, val_folder))
                else:
                    for i in fold_name_pack[j]:
                        src_img_path = os.path.join(current_class_data_path, i)
                        copy2(src_img_path, train_folder)
                        train_num += 1
                        # print("{} has copied to {}".format(src_img_path, train_folder))
            print("fold {}:  class:{}  train num: {}".format(p, class_name, train_num))
            print("fold {}:  class:{}  val num: {}".format(p, class_name, val_num))


if __name__ == '__main__':
    # step 1: create the train_val and test datasets
    src_data_folder = r'C:\Users\admin\Desktop\ROSE_5k'
    target_data_folder1 = r'C:\Users\admin\Desktop\ROSE_5000_train_val'  # _5fold
    data_set_split(src_data_folder, target_data_folder1, train_scale=0.8, val_scale=0.0, test_scale=0.2,
                   Parallel_processing=False)

    # step 2: create the 5-fold dataset from the train split of step 1
    src_data_folder = os.path.join(target_data_folder1, 'train')
    target_data_folder2 = r'C:\Users\admin\Desktop\ROSE_5000_5fold'
    k_fold_split(src_data_folder, target_data_folder2, k=5)

    # step 3: move the test set into each fold of the 5-fold dataset (sketch below)
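    # A minimal sketch of step 3, assuming "move" means copying the held-out test
    # split into every fold so each fold becomes a self-contained dataset; the
    # destination sub-folder name 'test' is an assumption, not fixed by this script.
    test_src = os.path.join(target_data_folder1, 'test')
    for p in range(1, 5 + 1):
        shutil.copytree(test_src, os.path.join(target_data_folder2, 'fold_' + str(p), 'test'))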