PuzzleTuning_VPT / PuzzleTuning /dataprocessing /WSI_whole_cropping.py

init submit

edcf5ee verified 9 months ago

16.6 kB

	"""
	'JPG_cropping_960...' ver: 22 Nov 10
	Crop pathology images into patches, using average filtering to keep the useful pieces, which are mostly red/purple.

	Specially modified version:
	maximizes the efficiency of cropping at different patch sizes.
	"""
	import os

	os.add_dll_directory(r"D:\chrome_download\github220901\openslide-win64\bin")
	# Note: to use openslide, its DLL directory has to be registered like this before it is imported
	import openslide
	import shutil
	import PIL.Image as Image
	import numpy as np
	import openslide
	import torch
	from tqdm import tqdm
	import cv2
	from torchvision import transforms
	from PIL import ImageFile
	import pandas as pd

	# Let PIL load image files whose data is truncated instead of raising an error.
	ImageFile.LOAD_TRUNCATED_IMAGES = True
	# Switch off PIL's maximum-pixel-count (decompression bomb) check.
	Image.MAX_IMAGE_PIXELS = None

	# Reference microns-per-pixel value; SVS_cut_to_patch divides it by a slide's own MPP to get the resize ratio.
	STANDARD_MPP = 0.4942
	# Patch sizes, largest first: element 0 is the tile size read from the slide,
	# elements 1..3 are handed to cut_to_patch as its three sub-patch sizes.
	patch_size = [(3840, 3840), (960, 960), (384, 384), (96, 96)]


	def save_file(f_image, save_dir, suffix='.jpg'):
	    """
	    Write an image to ``save_dir + suffix`` with OpenCV, creating the parent folder if needed.

	    :param f_image: image to save; anything ``np.asarray`` accepts
	        (the callers in this file pass RGB PIL images)
	    :param save_dir: destination path WITHOUT the file extension
	    :param suffix: extension appended to ``save_dir``
	    :return: None; a failed write is reported on stdout
	    """
	    filepath, _ = os.path.split(save_dir)
	    # os.path.split gives '' for a bare file name and os.makedirs('') would raise, so skip it then.
	    # exist_ok removes the race between testing for the folder and creating it.
	    if filepath:
	        os.makedirs(filepath, exist_ok=True)

	    image_data = np.asarray(f_image)
	    # cv2.imwrite interprets colour arrays as BGR(A) while the images handed in are RGB(A);
	    # reorder the colour channels so the file on disk keeps the original colours.
	    if image_data.ndim == 3 and image_data.shape[2] == 3:
	        image_data = np.ascontiguousarray(image_data[:, :, ::-1])
	    elif image_data.ndim == 3 and image_data.shape[2] == 4:
	        image_data = np.ascontiguousarray(image_data[:, :, [2, 1, 0, 3]])

	    # imwrite signals a failed write through its return value instead of raising.
	    if not cv2.imwrite(save_dir + suffix, image_data):
	        print('failed to write image:', save_dir + suffix)


	def make_and_clear_path(file_pack_path):
	    """
	    Create the folder ``file_pack_path`` (including parents) unless the path already exists.

	    Despite the name nothing is cleared: an existing path is left untouched.
	    """
	    if os.path.exists(file_pack_path):
	        return
	    os.makedirs(file_pack_path)


	def find_all_files(root, suffix=None):
	    """
	    Recursively collect the paths of the files under ``root`` whose names end with ``suffix``.

	    :param root: folder to walk
	    :param suffix: None (keep every file), a single str, or a tuple/list of str
	        (a file is kept when its name ends with any of them)
	    :return: list of file paths; -1 (after printing a message) when ``suffix`` has any other type
	    """
	    suffix_type = type(suffix)
	    if suffix_type in (tuple, list):
	        def wanted(file_name):
	            return any(file_name.endswith(ending) for ending in suffix)
	    elif suffix_type is str or suffix is None:
	        def wanted(file_name):
	            return suffix is None or file_name.endswith(suffix)
	    else:
	        print('type of suffix is not legal :', type(suffix))
	        return -1

	    return [os.path.join(folder, file_name)
	            for folder, _, file_names in os.walk(root)
	            for file_name in file_names
	            if wanted(file_name)]


	def convert_to_npy(a_data_path, patch_size=(960, 960)):
	    """
	    Load an image and return, as a numpy array, its top-left region that holds whole patches only.

	    The same patch count ``min(w // patch_size[0], h // patch_size[1])`` is used along both axes,
	    so the returned region is ``factor * patch_size[0]`` wide and ``factor * patch_size[1]`` high.

	    :param a_data_path: path of the image file
	    :param patch_size: patch size, normalised with to_2tuple
	    :return: numpy array of the cropped image
	    """
	    patch_size = to_2tuple(patch_size)

	    # The context manager releases the file handle even when cropping fails.
	    with Image.open(a_data_path) as img:
	        w, h = img.size
	        factor = min(w // patch_size[0], h // patch_size[1])
	        cropped = img.crop([0, 0, factor * patch_size[0], factor * patch_size[1]])
	        # Materialise the pixels while the source image is still open.
	        numpy_img = np.array(cropped)

	    return numpy_img


	class to_patch:
	    """
	    Callable that cuts a (height, width, channel) image into equally sized patches.
	    """

	    def __init__(self, patch_size=(16, 16)):
	        size = to_2tuple(patch_size)
	        self.patch_h, self.patch_w = size[0], size[1]

	    def __call__(self, x):
	        """
	        :param x: image data laid out as (h, w, c); it is copied into a tensor
	        :return: tensor of shape (num_patches, c, patch_h, patch_w), patches ordered row by row
	        """
	        chw = torch.tensor(x).permute(2, 0, 1)
	        channels, height, width = chw.shape

	        rows = height // self.patch_h
	        cols = width // self.patch_w
	        total = rows * cols

	        # Keep the centred region that is an exact multiple of the patch size.
	        used_h = rows * self.patch_h
	        used_w = cols * self.patch_w
	        top = (height - used_h) // 2
	        left = (width - used_w) // 2
	        chw = chw[:, top:top + used_h, left:left + used_w]

	        # (c, h, w) -> (c, rows, patch_h, cols, patch_w) -> (rows, cols, patch_h, patch_w, c)
	        # -> (num_patches, patch_h * patch_w * c); this flat form can also feed a transformer encoder.
	        grid = chw.view(channels, rows, self.patch_h, cols, self.patch_w)
	        flat = grid.permute(1, 3, 2, 4, 0).reshape(total, -1)

	        # (num_patches, patch_h * patch_w * c) -> (num_patches, patch_h, patch_w, c)
	        # -> (num_patches, c, patch_h, patch_w)
	        return flat.view(total, self.patch_h, self.patch_w, channels).permute(0, 3, 1, 2)


	def to_2tuple(input):
	    """
	    Normalise a size given as an int, tuple or list into a 2-tuple.

	    :param input: an int ``n`` (gives ``(n, n)``) or a tuple/list; one element is duplicated,
	        two elements are kept, and a longer sequence is cut down to its first two elements
	    :return: the 2-tuple; None (after printing a message) for an empty tuple/list
	    :raises TypeError: if ``input`` is neither an int, a tuple nor a list
	    """
	    if isinstance(input, (tuple, list)):
	        if isinstance(input, tuple) and len(input) == 2:
	            return input
	        if len(input) >= 2:
	            return (input[0], input[1])
	        if len(input) == 1:
	            return (input[0], input[0])
	        # Empty sequence: keep the historical behaviour of reporting it and returning None.
	        if isinstance(input, tuple):
	            print('cannot handle none tuple')
	        else:
	            print('cannot handle none list')
	        return None

	    if isinstance(input, int):
	        return (input, input)

	    # The old code raised a tuple, which Python rejects with an unrelated
	    # "exceptions must derive from BaseException" TypeError; raise a real, descriptive one.
	    raise TypeError('cannot handle ' + str(type(input)))


	def pick_patch(patch):
	    """
	    Decide whether a patch has a suitable colour (overall clearly more red than green).

	    :param patch: image array accepted by array2img
	    :return: True when, for the patch shrunk to one pixel, red exceeds green by at least 30; else False
	    """
	    patch = array2img(patch)
	    # Shrinking to a single pixel gives an approximate average colour of the whole patch.
	    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same filter under its current name.
	    img_single = patch.resize((1, 1), Image.LANCZOS)
	    r, g, _ = img_single.getpixel((0, 0))
	    return r - g >= 30


	def array2img(patch):
	    """
	    Turn an image array into an RGB PIL image.

	    :param patch: array offering ``astype``; its values are cast to uint8
	    :return: PIL image converted to RGB mode
	    """
	    as_uint8 = patch.astype('uint8')
	    return Image.fromarray(as_uint8).convert('RGB')


	def make_name(former_name, patch_size, patch_num):
	    """
	    Build the name of a sub-patch so that it carries its horizontal x and vertical y position
	    on the original image, counted in steps of its own patch_size.

	    :param former_name: name of the parent patch, ending in '-<parent patch size>-<x>-<y>'
	    :param patch_size: edge length of the sub-patch
	    :param patch_num: 1-based, row-by-row index of the sub-patch inside its parent
	    :return: '<image name>-<patch_size>-<x>-<y>' (the name is also printed)
	    """
	    fields = former_name.split('-')
	    parent_size = int(fields[-3])
	    parent_x = int(fields[-2])
	    parent_y = int(fields[-1])
	    # Everything in front of the last three '-'-separated fields is the image name itself.
	    base_name = former_name.rsplit('-', 3)[0]

	    # Number of sub-patches per row (and per column) of the parent.
	    ratio = int(parent_size / patch_size)

	    # Turn the 1-based index into 0-based column/row offsets inside the parent.
	    remainder = patch_num % ratio
	    if remainder == 0:
	        col_offset = ratio - 1
	        row_offset = patch_num // ratio - 1
	    else:
	        col_offset = remainder - 1
	        row_offset = patch_num // ratio

	    x = parent_x * ratio + col_offset
	    y = parent_y * ratio + row_offset

	    img_name = '-'.join([base_name, str(patch_size), str(x), str(y)])
	    print(img_name)
	    return img_name


	def SVS_cut_to_patch(img, save_root,
	                     patch_size,
	                     img_name,
	                     class_name,
	                     name_dir_3840, name_dir_0, name_dir_1, name_dir_2,
	                     patient_folder=False,
	                     L=True, M=True, S=False):
	    """
	    Cut one whole-slide image into top-level tiles, keep the tiles that pass the colour
	    filter and pass each kept tile on to cut_to_patch for the smaller scales.

	    The slide's MPP-X property is compared with STANDARD_MPP. When the ratio lies strictly between
	    0.9 and 1.1 tiles are read at ``patch_size[0][0]`` pixels; otherwise they are read at
	    ``int(patch_size[0][0] * ratio)`` pixels and resized to ``patch_size[0]``.
	    Any exception raised while processing is printed and the rest of the slide is skipped.

	    :param img: path of the slide file
	    :param save_root: root folder of all outputs
	    :param patch_size: sequence of four size tuples: tile size followed by the three sub-patch sizes
	    :param img_name: slide name used as prefix of every saved tile
	    :param class_name: dataset name used in the output folder and CSV names
	    :param name_dir_3840: dict updated in place: '<tile path>-<resize ratio>' -> slide path
	    :param name_dir_0: record dict handed to cut_to_patch
	    :param name_dir_1: record dict handed to cut_to_patch
	    :param name_dir_2: record dict handed to cut_to_patch
	    :param patient_folder: when exactly True, also save each kept tile into a per-slide folder
	    :param L: forwarded to cut_to_patch
	    :param M: forwarded to cut_to_patch
	    :param S: forwarded to cut_to_patch
	    :return: None
	    """
	    slide = openslide.open_slide(img)
	    try:
	        MPP = slide.properties[openslide.PROPERTY_NAME_MPP_X]
	        print(MPP, img)
	        resize_ratio = STANDARD_MPP / float(MPP)
	        print(resize_ratio)

	        tile_size = patch_size[0][0]
	        keep_native_size = 1.1 > resize_ratio > 0.9
	        if keep_native_size:
	            patch_size_num_0 = tile_size
	        else:
	            patch_size_num_0 = int(tile_size * resize_ratio)
	        print(patch_size_num_0)

	        scale_root = os.path.join(save_root, str(tile_size))
	        save_root_0 = os.path.join(scale_root, class_name + '-' + str(tile_size))
	        make_and_clear_path(save_root_0)

	        w, h = slide.level_dimensions[0]
	        # Both ranges start at 1 and stop early, so tiles along the slide border are never read.
	        for i in range(1, w // patch_size_num_0 - 1):
	            for j in range(1, h // patch_size_num_0 - 1):
	                patch = slide.read_region((i * patch_size_num_0, j * patch_size_num_0), 0,
	                                          (patch_size_num_0, patch_size_num_0))
	                patch = patch.convert('RGB')
	                # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same filter.
	                if not keep_native_size:
	                    patch = patch.resize(patch_size[0], Image.LANCZOS)  # back to the nominal tile size

	                # A 1x1 thumbnail approximates the tile's average colour.
	                img_single = patch.resize((1, 1), Image.LANCZOS)
	                r, g, b = img_single.getpixel((0, 0))
	                colour_ok = r < 220 and g < 220 and b < 220 and r > 100 and b > 30 and r > g + 20
	                if not colour_ok:
	                    continue

	                tile_name = img_name + '-' + str(tile_size) + '-' + str(i) + '-' + str(j)
	                tile_path = os.path.join(save_root_0, tile_name)
	                save_file(patch, tile_path)
	                name_dir_3840[tile_path + '-' + str(resize_ratio)] = img
	                if patient_folder is True:
	                    save_root_patient_0 = os.path.join(save_root_0 + '-patient', img_name)
	                    save_file(patch, os.path.join(save_root_patient_0, tile_name))

	                # cut_to_patch re-opens the tile that was just written to disk.
	                cut_to_patch(tile_path + '.jpg', save_root,
	                             patch_size[1], patch_size[2], patch_size[3],
	                             img_name, class_name,
	                             name_dir_0, name_dir_1, name_dir_2,
	                             patient_folder=patient_folder,
	                             L=L, M=M, S=S)

	        # NOTE(review): the source indentation was lost; the CSV is assumed to be written once per
	        # slide, after both loops - confirm against the original file.
	        pd.DataFrame.from_dict(name_dir_3840, orient='index', columns=['origin path']).to_csv(
	            os.path.join(scale_root, class_name + '-' + str(tile_size) + '.csv')
	        )

	    except Exception as e:
	        # Best effort: report the problem and let the caller continue with the next slide.
	        print(e)
	    finally:
	        # Release the slide handle whether or not processing succeeded.
	        slide.close()


	def cut_to_patch(img,
	                 save_root,
	                 patch_size_0, patch_size_1, patch_size_2,
	                 img_name, class_name,
	                 name_dir_0, name_dir_1, name_dir_2,
	                 patient_folder=True,
	                 L=True, M=True, S=False
	                 ):
	    """
	    Split one saved tile into patches of up to three sizes, save the patches accepted by
	    pick_patch and rewrite the CSV record of every size.

	    :param img: path of the saved tile; its file name (without extension) is parsed by make_name
	    :param save_root: root folder of all outputs
	    :param patch_size_0: size tuple of the L scale (cut from the tile)
	    :param patch_size_1: size tuple of the M scale (cut from the tile)
	    :param patch_size_2: size tuple of the S scale (cut from each accepted M patch)
	    :param img_name: slide name, used as the per-slide folder name
	    :param class_name: dataset name used in the output folder and CSV names
	    :param name_dir_0: dict updated in place: saved L patch path -> ``img``
	    :param name_dir_1: dict updated in place: saved M patch path -> ``img``
	    :param name_dir_2: dict updated in place: saved S patch path -> ``img``
	    :param patient_folder: also save every patch into a per-slide folder
	    :param L: process the L scale
	    :param M: process the M scale
	    :param S: process the S scale; only looked at inside the M scale
	    :return: None
	    """
	    # NOTE(review): the source indentation was lost; the nesting below is reconstructed - confirm.
	    parent_name = os.path.split(img)[1].split('.')[0]
	    pixels = convert_to_npy(img)

	    sizes = [patch_size_0[0], patch_size_1[0], patch_size_2[0]]
	    scale_roots = [os.path.join(save_root, str(size)) for size in sizes]
	    class_roots = [os.path.join(scale_root, class_name + '-' + str(size))
	                   for scale_root, size in zip(scale_roots, sizes)]
	    patient_roots = [os.path.join(class_root + '-patient', img_name) for class_root in class_roots]

	    large_patches = to_patch(patch_size_0)(pixels)
	    medium_patches = to_patch(patch_size_1)(pixels)

	    if L:
	        # L scale: patches are stored at their native size, without resizing.
	        for index_0, tensor_0 in enumerate(large_patches, start=1):
	            array_0 = tensor_0.permute(1, 2, 0).numpy()
	            if not pick_patch(array_0):
	                continue
	            name_0 = make_name(parent_name, sizes[0], index_0)
	            target_0 = os.path.join(class_roots[0], name_0)
	            print(target_0)
	            image_0 = array2img(array_0)
	            if patient_folder:
	                save_file(image_0, os.path.join(patient_roots[0], name_0))
	            # remember which tile the patch came from (written to the .csv below)
	            name_dir_0[target_0] = img
	            save_file(image_0, target_0)

	    if M:
	        # M scale; the S scale is only reachable from here.
	        for index_1, tensor_1 in enumerate(medium_patches, start=1):
	            array_1 = tensor_1.permute(1, 2, 0).numpy()
	            if not pick_patch(array_1):
	                continue
	            name_1 = make_name(parent_name, sizes[1], index_1)
	            target_1 = os.path.join(class_roots[1], name_1)
	            print(target_1)

	            if S:
	                # S scale: every accepted patch is kept, no sampling.
	                small_patches = to_patch(patch_size_2)(array_1)
	                for index_2, tensor_2 in enumerate(small_patches, start=1):
	                    array_2 = tensor_2.permute(1, 2, 0).numpy()
	                    if not pick_patch(array_2):
	                        continue
	                    name_2 = make_name(name_1, sizes[2], index_2)
	                    image_2 = array2img(array_2)
	                    target_2 = os.path.join(class_roots[2], name_2)
	                    print(target_2)
	                    if patient_folder:
	                        save_file(image_2, os.path.join(patient_roots[2], name_2))
	                    name_dir_2[target_2] = img
	                    save_file(image_2, target_2)

	            image_1 = array2img(array_1)
	            if patient_folder:
	                save_file(image_1, os.path.join(patient_roots[1], name_1))
	            name_dir_1[target_1] = img
	            save_file(image_1, target_1)

	    # Rewrite the three record tables from the dicts accumulated so far.
	    for record, scale_root, size in zip((name_dir_0, name_dir_1, name_dir_2), scale_roots, sizes):
	        table = pd.DataFrame.from_dict(record, orient='index', columns=['origin path'])
	        table.to_csv(os.path.join(scale_root, class_name + '-' + str(size) + '.csv'))


	def read_and_convert(data_root, save_root, suffix=None, patient_folder=False, L=True, M=True, S=False,
	                     class_names=('PAIP2019',)):
	    """
	    Crop every slide of the selected datasets found under ``data_root``.

	    One sub-folder of ``data_root`` is one dataset ("class"); the record dicts are reset per dataset.

	    :param data_root: folder holding one sub-folder per dataset
	    :param save_root: root folder of all outputs
	    :param suffix: file-name ending(s) of the slides, as accepted by find_all_files
	    :param patient_folder: forwarded to SVS_cut_to_patch
	    :param L: forwarded to SVS_cut_to_patch
	    :param M: forwarded to SVS_cut_to_patch
	    :param S: forwarded to SVS_cut_to_patch
	    :param class_names: dataset folder names to process. The default keeps the selection that used
	        to be hard-coded; pass None to process every sub-folder of ``data_root``.
	        To resume an interrupted run, pass only the datasets that are still missing.
	    :return: None
	    """
	    # Listing first keeps the early failure for a missing/unreadable data_root.
	    available = os.listdir(data_root)

	    if class_names is None:
	        class_names = sorted(name for name in available
	                             if os.path.isdir(os.path.join(data_root, name)))
	    elif isinstance(class_names, str):
	        class_names = [class_names]

	    for class_name in class_names:
	        svs_class_root = os.path.join(data_root, class_name)
	        svs_all_files = find_all_files(svs_class_root, suffix)

	        # Record tables of this dataset; they are filled in place by the cropping functions.
	        name_dir_3840 = {}
	        name_dir_0 = {}
	        name_dir_1 = {}
	        name_dir_2 = {}
	        for img in svs_all_files:
	            img_name = os.path.split(img)[1].split('.')[0]
	            SVS_cut_to_patch(img, save_root, patch_size, img_name, class_name,
	                             name_dir_3840, name_dir_0, name_dir_1, name_dir_2,
	                             patient_folder, L=L, M=M, S=S)



	# Script entry point: crop the files ending in 'svs' found under the raw-data root and write
	# the results below the output root. Both paths are machine-specific Windows drive locations.
	if __name__ == '__main__':
	    read_and_convert(r'I:\Puzzle_Tuning_Datasets\Raw',
	                     r'X:\CPIA_WSI_no_sampling_no_rezising',
	                     'svs',
	                     patient_folder=False,
	                     L=True, M=True, S=False)
	# fixme: X: doesn't take the picture
	# fixed use image_data = np.asarray(f_image)
	# cv2.imwrite(save_dir+suffix, image_data)


	# 2023.5.1 E: CPTAC-(CCRCC CM HNSCC LSCC LUAD PDA SAR UCEC) Post-NAT-BRCA