"""
dataset divide script ver: Jan 9th 15:30 official release
ref:https://zhuanlan.zhihu.com/p/199238910
"""
import os
import random
import shutil
from shutil import copy2
from multiprocessing import Pool, cpu_count
def del_file(filepath):
    """
    Delete all files and folders inside a directory
    :param filepath: path of the directory to clear
    """
    del_list = os.listdir(filepath)
    for f in del_list:
        file_path = os.path.join(filepath, f)
        if os.path.isfile(file_path):
            os.remove(file_path)
        elif os.path.isdir(file_path):
            shutil.rmtree(file_path)
def make_and_clear_path(file_pack_path):
    """Create the directory if it does not exist, then empty any existing contents."""
    if not os.path.exists(file_pack_path):
        os.makedirs(file_pack_path)
    del_file(file_pack_path)
def a_dataset_split(src_data_folder, target_data_folder, class_name, train_scale, val_scale, test_scale, com_num=None):
    """Split a single class folder into train/val/test sets by the given ratios."""
    current_class_data_path = os.path.join(src_data_folder, class_name)
    current_all_data = os.listdir(current_class_data_path)
    current_data_length = len(current_all_data)
    current_data_index_list = list(range(current_data_length))
    random.shuffle(current_data_index_list)
    train_folder = os.path.join(target_data_folder, 'train', class_name)
    val_folder = os.path.join(target_data_folder, 'val', class_name)
    test_folder = os.path.join(target_data_folder, 'test', class_name)
    train_stop_flag = current_data_length * train_scale
    val_stop_flag = current_data_length * (train_scale + val_scale)
    current_idx = 0
    train_num = 0
    val_num = 0
    test_num = 0
    for i in current_data_index_list:
        src_img_path = os.path.join(current_class_data_path, current_all_data[i])
        if current_idx < train_stop_flag:  # strict '<' avoids sending one extra image to train
            copy2(src_img_path, train_folder)
            # print("{} copied to {}".format(src_img_path, train_folder))
            train_num += 1
        elif current_idx < val_stop_flag:
            copy2(src_img_path, val_folder)
            # print("{} copied to {}".format(src_img_path, val_folder))
            val_num += 1
        else:
            copy2(src_img_path, test_folder)
            # print("{} copied to {}".format(src_img_path, test_folder))
            test_num += 1
        current_idx += 1
print("*********************************{}*************************************".format(class_name) + '\n' +
"{} class has been divided into {}:{}:{}, a total of {} images".format(class_name, train_scale, val_scale,
test_scale,
current_data_length) +
'\n' + "Train set{}: {} pics".format(
train_folder,
train_num)
+ '\n' + "Validation set{}: {} pics".format(val_folder, val_num) + '\n' + "Test set{}: {} pics".format(
test_folder, test_num)
+ '\n')
if com_num is not None:
print('processed class idx:', com_num)
def data_set_split(src_data_folder, target_data_folder='./dataset', train_scale=0.8, val_scale=0.2, test_scale=0.0,
                   Parallel_processing=False):
    """
    Read the source data folder and generate 'train', 'val' and 'test' splits
    :param src_data_folder: source folder, e.g. E:/biye/gogogo/note_book/torch_note/data/utils_test/data_split/src_data
    :param target_data_folder: target folder, e.g. E:/biye/gogogo/note_book/torch_note/data/utils_test/data_split/target_data
    :param train_scale: train set ratio
    :param val_scale: validation set ratio
    :param test_scale: test set ratio
    :param Parallel_processing: whether to process the classes in parallel
    """
make_and_clear_path(target_data_folder)
print("Begin dataset division")
class_names = os.listdir(src_data_folder)
    # Create the split folders in the target directory
    split_names = ['train', 'val', 'test']
    for split_name in split_names:
        split_path = os.path.join(target_data_folder, split_name)
        # Then create a folder for every class under the split directory
        for class_name in class_names:
            class_split_path = os.path.join(split_path, class_name)
            os.makedirs(class_split_path)
if Parallel_processing:
        # Create a process pool; leave at least 2 cores free, but use at least 1 worker
        tasks_num = len(class_names)
        process_pool = Pool(max(1, min(cpu_count() - 2, tasks_num)))
        com_num = 0
        print("Start processing {} classes with multiprocessing".format(tasks_num))
        # Schedule one task per class
        results = []
        for class_name in class_names:
            # apply_async(func, args) dispatches the call to a free worker process
            com_num += 1
            args = (src_data_folder, target_data_folder, class_name, train_scale, val_scale, test_scale, com_num)
            results.append(process_pool.apply_async(a_dataset_split, args))
        process_pool.close()  # the pool accepts no new tasks once closed
        process_pool.join()  # wait for all workers to finish; must come after close()
        for r in results:
            r.get()  # re-raise any exception from a worker; apply_async swallows errors otherwise
    else:
        # Split each class sequentially according to the ratios and copy the images
        for class_name in class_names:
            a_dataset_split(src_data_folder, target_data_folder, class_name, train_scale, val_scale, test_scale)
def k_fold_split(src_data_folder, target_data_folder='./kfold', k=5):
    """
    Read the source data folder and generate k folds, each containing 'train' and 'val' splits.
    :param src_data_folder: imagenet-format folder whose classes are divided by k-fold splitting
    :param target_data_folder: target folder; k fold folders are generated inside, each in imagenet format with 'train' and 'val'
    :param k: the number of folds
    """
make_and_clear_path(target_data_folder)
print("Begin dataset division")
    class_names = os.listdir(src_data_folder)  # get the class names
    # For each class, divide the images into k folds and copy them into place
    for class_name in class_names:
current_class_data_path = os.path.join(src_data_folder, class_name)
current_class_data_names = os.listdir(current_class_data_path)
current_data_length = len(current_class_data_names)
random.shuffle(current_class_data_names)
        # Partition the shuffled file names into packs of split_num images each
        split_num = current_data_length // k
        temp_split_pack = [current_class_data_names[i:i + split_num]
                           for i in range(0, current_data_length, split_num)]
        fold_name_pack = temp_split_pack[:k]  # the first k packs form the folds
        # If the images do not divide evenly, the packs beyond the first k hold the
        # remainder; distribute those names round-robin so no image is dropped
        leftovers = [name for pack in temp_split_pack[k:] for name in pack]
        for idx, name in enumerate(leftovers):
            fold_name_pack[idx % k].append(name)
        print("{} class is divided into {} folds for cross-validation, a total of {} images".format(
            class_name, k, current_data_length))
        for p in range(1, k + 1):  # folds are numbered from 1
            # Create this fold's train/val folders for the current class
            train_folder = os.path.join(target_data_folder, 'fold_' + str(p), 'train', class_name)
            val_folder = os.path.join(target_data_folder, 'fold_' + str(p), 'val', class_name)
            os.makedirs(train_folder)
            os.makedirs(val_folder)
            pack_idx = p - 1  # the current fold's pack becomes the val set; the rest form the train set
# Copy divided data
train_num = 0
val_num = 0
for j in range(k):
if j == pack_idx:
for i in fold_name_pack[j]:
src_img_path = os.path.join(current_class_data_path, i)
copy2(src_img_path, val_folder)
val_num += 1
# print("{} has copied to {}".format(src_img_path, val_folder))
else:
for i in fold_name_pack[j]:
src_img_path = os.path.join(current_class_data_path, i)
copy2(src_img_path, train_folder)
train_num += 1
# print("{} has copied to {}".format(src_img_path, train_folder))
print("fold {}: class:{} train num: {}".format(p, class_name, train_num))
print("fold {}: class:{} val num: {}".format(p, class_name, val_num))
if __name__ == '__main__':
    # step 1: create the train_val and test datasets
src_data_folder = r'C:\Users\admin\Desktop\ROSE_5k'
target_data_folder1 = r'C:\Users\admin\Desktop\ROSE_5000_train_val' # _5fold
data_set_split(src_data_folder, target_data_folder1, train_scale=0.8, val_scale=0.0, test_scale=0.2,
Parallel_processing=False)
    # step 2: create the 5-fold dataset from the train split
src_data_folder = os.path.join(target_data_folder1, 'train')
    target_data_folder2 = r'C:\Users\admin\Desktop\ROSE_5000_5fold'
k_fold_split(src_data_folder, target_data_folder2, k=5)
    # step 3: copy the test set into each fold of the 5-fold dataset; a sketch follows
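    # A minimal sketch for step 3, assuming each fold should receive an identical
    # copy of the held-out test split as a 'test' sub-folder (fold_1/test, ...);
    # the destination layout is an assumption, adjust it to what your training code expects
    test_src = os.path.join(target_data_folder1, 'test')
    for fold_idx in range(1, 6):
        fold_test_folder = os.path.join(target_data_folder2, 'fold_' + str(fold_idx), 'test')
        shutil.copytree(test_src, fold_test_folder)  # copies every class folder of the test split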