"""
dataset divide script ver: Jan 9th 15:30 official release
ref:https://zhuanlan.zhihu.com/p/199238910
"""
import os
import random
import shutil
from shutil import copy2
from multiprocessing import Pool, cpu_count
def del_file(filepath):
    """
    Remove every file and sub-directory inside *filepath*.

    The directory itself is kept; only its contents are deleted.
    :param filepath: path of the directory to empty
    :return: None
    """
    for entry_name in os.listdir(filepath):
        entry_path = os.path.join(filepath, entry_name)
        # Directories need a recursive delete; plain files use os.remove.
        if os.path.isdir(entry_path):
            shutil.rmtree(entry_path)
        elif os.path.isfile(entry_path):
            os.remove(entry_path)
def make_and_clear_path(file_pack_path):
    """
    Ensure *file_pack_path* exists as a directory, then empty it.

    :param file_pack_path: directory to (re)create in a clean state
    :return: None
    """
    # Create the directory if missing, then wipe whatever it contains.
    os.makedirs(file_pack_path, exist_ok=True)
    del_file(file_pack_path)
def a_dataset_split(src_data_folder, target_data_folder, class_name, train_scale, val_scale, test_scale, com_num=None):
    """
    Split one class folder into train/val/test subsets and copy the images.

    The destination folders <target_data_folder>/<split>/<class_name> must
    already exist (the driver `data_set_split` creates them).

    :param src_data_folder: root folder containing one sub-folder per class
    :param target_data_folder: root folder containing 'train'/'val'/'test' sub-folders
    :param class_name: name of the class sub-folder to split
    :param train_scale: train set ratio (0.0-1.0)
    :param val_scale: validation set ratio (0.0-1.0)
    :param test_scale: test set ratio (0.0-1.0)
    :param com_num: optional task index printed on completion (used by the
                    multi-process driver to show progress)
    :return: None
    """
    current_class_data_path = os.path.join(src_data_folder, class_name)
    current_all_data = os.listdir(current_class_data_path)
    current_data_length = len(current_all_data)
    current_data_index_list = list(range(current_data_length))
    random.shuffle(current_data_index_list)
    train_folder = os.path.join(target_data_folder, 'train', class_name)
    val_folder = os.path.join(target_data_folder, 'val', class_name)
    test_folder = os.path.join(target_data_folder, 'test', class_name)
    # Index boundaries between the splits within the shuffled order.
    train_stop_flag = current_data_length * train_scale
    val_stop_flag = current_data_length * (train_scale + val_scale)
    current_idx = 0
    train_num = 0
    val_num = 0
    test_num = 0
    for i in current_data_index_list:
        src_img_path = os.path.join(current_class_data_path, current_all_data[i])
        # BUG FIX: the original compared with `<=`, which put one extra image
        # into train (and copied an image into train even when train_scale == 0,
        # because 0 <= 0). Strict `<` yields exact proportional splits.
        if current_idx < train_stop_flag:
            copy2(src_img_path, train_folder)
            train_num += 1
        elif current_idx < val_stop_flag:
            copy2(src_img_path, val_folder)
            val_num += 1
        else:
            copy2(src_img_path, test_folder)
            test_num += 1
        current_idx += 1
    print("*********************************{}*************************************".format(class_name) + '\n' +
          "{} class has been divided into {}:{}:{}, a total of {} images".format(class_name, train_scale, val_scale,
                                                                                 test_scale, current_data_length) +
          '\n' + "Train set{}: {} pics".format(train_folder, train_num) +
          '\n' + "Validation set{}: {} pics".format(val_folder, val_num) +
          '\n' + "Test set{}: {} pics".format(test_folder, test_num) + '\n')
    if com_num is not None:
        print('processed class idx:', com_num)
def data_set_split(src_data_folder, target_data_folder='./dataset', train_scale=0.8, val_scale=0.2, test_scale=0.0,
                   Parallel_processing=False):
    """
    Read source data folder, generate divided folders as 'train', 'val' and 'test'
    :param src_data_folder: source folder E:/biye/gogogo/note_book/torch_note/data/utils_test/data_split/src_data
    :param target_data_folder: target folder E:/biye/gogogo/note_book/torch_note/data/utils_test/data_split/target_data
    :param train_scale: train set ratio
    :param val_scale: validation set ratio
    :param test_scale: test set ratio
    :param Parallel_processing: whether to process each class in a separate worker process
    :return: None
    """
    make_and_clear_path(target_data_folder)
    print("Begin dataset division")
    class_names = os.listdir(src_data_folder)
    # Create <target>/<split>/<class> folders up front so workers only copy files
    split_names = ['train', 'val', 'test']
    for split_name in split_names:
        split_path = os.path.join(target_data_folder, split_name)
        for class_name in class_names:
            os.makedirs(os.path.join(split_path, class_name))
    if Parallel_processing:
        tasks_num = len(class_names)
        # BUG FIX: on machines with <= 2 cores the original computed a pool size
        # of 0 or negative, which makes Pool() raise ValueError. Keep at least
        # one worker while still leaving 2 cores free when possible.
        process_pool = Pool(max(1, min(cpu_count() - 2, tasks_num)))
        print("start processing " + str(tasks_num) + " files by multi-process")
        # Schedule one task per class; keep the AsyncResult handles so worker
        # exceptions are not silently swallowed (apply_async defers them until .get()).
        async_results = []
        com_num = 0
        for class_name in class_names:
            com_num += 1
            args = (src_data_folder, target_data_folder, class_name, train_scale, val_scale, test_scale, com_num)
            async_results.append(process_pool.apply_async(a_dataset_split, args))
        process_pool.close()  # No new tasks accepted after close
        process_pool.join()   # Must follow close(); waits for all workers
        for res in async_results:
            res.get()  # Re-raise any exception that occurred in a worker
    else:
        # Sequential fallback: split each class in turn in this process
        for class_name in class_names:
            a_dataset_split(src_data_folder, target_data_folder, class_name, train_scale, val_scale, test_scale)
def k_fold_split(src_data_folder, target_data_folder='./kfold', k=5):
    """
    Read the source data folder, generate divided folders as 'train', 'val'.

    For each fold p in 1..k a folder <target>/fold_p/{train,val}/<class> is
    created; fold p's packet of images becomes its val set and the remaining
    packets become its train set.

    :param src_data_folder: organized imagenet format folders that need to be divided by k-folding
    :param target_data_folder: large target folder with k folders generated inside, k folders are in imagenet format with train and val inside
    :param k: the number of divided folds
    :return: None
    """
    make_and_clear_path(target_data_folder)
    print("Begin dataset division")
    class_names = os.listdir(src_data_folder)  # Get category name
    # Divide the dataset for each category according to the proportion, and copy and distribute the data images
    for class_name in class_names:  # Classification traversal first
        current_class_data_path = os.path.join(src_data_folder, class_name)
        current_class_data_names = os.listdir(current_class_data_path)
        current_data_length = len(current_class_data_names)
        random.shuffle(current_class_data_names)
        # Divide data: one packet of floor(length / k) names per fold.
        # NOTE(review): if a class has fewer than k images, split_num is 0 and
        # the range() step below raises ValueError — confirm every class holds
        # at least k images before calling this.
        split_num = current_data_length // k
        # Chunk the shuffled names into packets of split_num; when length is not
        # divisible by k an extra packet holds the leftovers.
        # NOTE(review): if (length mod k) >= split_num this produces more than
        # k+1 packets and only the last one is redistributed below, silently
        # dropping images — harmless for large classes, verify for small ones.
        temp_split_pack = [current_class_data_names[i:i + split_num] for i in range(0, current_data_length, split_num)]
        fold_name_pack = [temp_split_pack[i] for i in range(0, k)]  # Get the first k packets
        if len(
                temp_split_pack) > k:  # If it can't be divided equally at the end, the last one will have one more pack, and put the contents into different packs in turn
            for pack_idx, name in enumerate(temp_split_pack[-1]):  # The extra pack have at most k-1 data
                fold_name_pack[pack_idx].append(name)
        print("{} class is divided into {} cross-validation, a total of {} images".format(class_name, k,
                                                                                          current_data_length))
        for p in range(1, k + 1):  # For each fold, start from 1
            # Create this fold's train/val class folders (fresh tree, no exist_ok needed)
            train_folder = os.path.join(target_data_folder, 'fold_' + str(p), 'train', class_name)
            val_folder = os.path.join(target_data_folder, 'fold_' + str(p), 'val', class_name)
            os.makedirs(train_folder)
            os.makedirs(val_folder)
            pack_idx = p - 1  # Use the current fold of data as val set, and use the rest as train set
            # Copy divided data
            train_num = 0
            val_num = 0
            for j in range(k):
                if j == pack_idx:
                    for i in fold_name_pack[j]:
                        src_img_path = os.path.join(current_class_data_path, i)
                        copy2(src_img_path, val_folder)
                        val_num += 1
                        # print("{} has copied to {}".format(src_img_path, val_folder))
                else:
                    for i in fold_name_pack[j]:
                        src_img_path = os.path.join(current_class_data_path, i)
                        copy2(src_img_path, train_folder)
                        train_num += 1
                        # print("{} has copied to {}".format(src_img_path, train_folder))
            print("fold {}: class:{} train num: {}".format(p, class_name, train_num))
            print("fold {}: class:{} val num: {}".format(p, class_name, val_num))
if __name__ == '__main__':
    # step1: split the raw data into a train(+val) part and a held-out test part
    src_data_folder = r'C:\Users\admin\Desktop\ROSE_5k'
    target_data_folder1 = r'C:\Users\admin\Desktop\ROSE_5000_train_val'  # _5fold
    data_set_split(src_data_folder, target_data_folder1,
                   train_scale=0.8, val_scale=0.0, test_scale=0.2,
                   Parallel_processing=False)
    # step2: build the 5-fold cross-validation folders from the train part
    src_data_folder = os.path.join(target_data_folder1, 'train')
    target_data_folder2 = r'C:\Users\admin\Desktop\ROSE_5000_5fold'  #
    k_fold_split(src_data_folder, target_data_folder2, k=5)
    # step3: move the test dataset into file folder of the 5 fold dataset
|