diffcr-models / UnCRtainTS /data /dataLoader.py

Upload folder using huggingface_hub

3c8ff2e verified 4 months ago

22.2 kB

	import os
	import glob
	import warnings
	import numpy as np
	from natsort import natsorted

	from datetime import datetime

	to_date = lambda string: datetime.strptime(string, "%Y-%m-%d")
	S1_LAUNCH = to_date("2014-04-03")

	# s2cloudless: see https://github.com/sentinel-hub/sentinel2-cloud-detector
	from s2cloudless import S2PixelCloudDetector

	import rasterio
	from rasterio.merge import merge
	from scipy.ndimage import gaussian_filter
	from torch.utils.data import Dataset
	# import sys

	# sys.path.append(".")
	from util.detect_cloudshadow import get_cloud_mask, get_shadow_mask


	# utility functions used in the dataloaders of SEN12MS-CR and SEN12MS-CR-TS
	def read_tif(path_IMG):
	tif = rasterio.open(path_IMG)
	return tif


	def read_img(tif):
	return tif.read().astype(np.float32)


	def rescale(img, oldMin, oldMax):
	oldRange = oldMax - oldMin
	img = (img - oldMin) / oldRange
	return img


	def process_MS(img, method):
	if method == "default":
	intensity_min, intensity_max = (
	0,
	10000,
	) # define a reasonable range of MS intensities
	img = np.clip(
	img, intensity_min, intensity_max
	) # intensity clipping to a global unified MS intensity range
	img = rescale(
	img, intensity_min, intensity_max
	) # project to [0,1], preserve global intensities (across patches), gets mapped to [-1,+1] in wrapper
	if method == "resnet":
	intensity_min, intensity_max = (
	0,
	10000,
	) # define a reasonable range of MS intensities
	img = np.clip(
	img, intensity_min, intensity_max
	) # intensity clipping to a global unified MS intensity range
	img /= 2000 # project to [0,5], preserve global intensities (across patches)
	img = np.nan_to_num(img)
	return img


	def process_SAR(img, method):
	if method == "default":
	dB_min, dB_max = -25, 0 # define a reasonable range of SAR dB
	img = np.clip(
	img, dB_min, dB_max
	) # intensity clipping to a global unified SAR dB range
	img = rescale(
	img, dB_min, dB_max
	) # project to [0,1], preserve global intensities (across patches), gets mapped to [-1,+1] in wrapper
	if method == "resnet":
	# project SAR to [0, 2] range
	dB_min, dB_max = [-25.0, -32.5], [0, 0]
	img = np.concatenate(
	[
	(
	2
	* (np.clip(img[0], dB_min[0], dB_max[0]) - dB_min[0])
	/ (dB_max[0] - dB_min[0])
	)[None, ...],
	(
	2
	* (np.clip(img[1], dB_min[1], dB_max[1]) - dB_min[1])
	/ (dB_max[1] - dB_min[1])
	)[None, ...],
	],
	axis=0,
	)
	img = np.nan_to_num(img)
	return img


	def get_cloud_cloudshadow_mask(img, cloud_threshold=0.2):
	cloud_mask = get_cloud_mask(img, cloud_threshold, binarize=True)
	shadow_mask = get_shadow_mask(img)

	# encode clouds and shadows as segmentation masks
	cloud_cloudshadow_mask = np.zeros_like(cloud_mask)
	cloud_cloudshadow_mask[shadow_mask < 0] = -1
	cloud_cloudshadow_mask[cloud_mask > 0] = 1

	# label clouds and shadows
	cloud_cloudshadow_mask[cloud_cloudshadow_mask != 0] = 1
	return cloud_cloudshadow_mask


	# recursively apply function to nested dictionary
	def iterdict(dictionary, fct):
	for k, v in dictionary.items():
	if isinstance(v, dict):
	dictionary[k] = iterdict(v, fct)
	else:
	dictionary[k] = fct(v)
	return dictionary


	def get_cloud_map(img, detector, instance=None):
	# get cloud masks
	img = np.clip(img, 0, 10000)
	mask = np.ones((img.shape[-1], img.shape[-1]))
	# note: if your model may suffer from dark pixel artifacts,
	# you may consider adjusting these filtering parameters
	if not (img.mean() < 1e-5 and img.std() < 1e-5):
	if detector == "cloud_cloudshadow_mask":
	threshold = 0.2 # set to e.g. 0.2 or 0.4
	mask = get_cloud_cloudshadow_mask(img, threshold)
	elif detector == "s2cloudless_map":
	threshold = 0.5
	mask = instance.get_cloud_probability_maps(
	np.moveaxis(img / 10000, 0, -1)[None, ...]
	)[0, ...]
	mask[mask < threshold] = 0
	mask = gaussian_filter(mask, sigma=2)
	elif detector == "s2cloudless_mask":
	mask = instance.get_cloud_masks(np.moveaxis(img / 10000, 0, -1)[None, ...])[
	0, ...
	]
	else:
	mask = np.ones((img.shape[-1], img.shape[-1]))
	warnings.warn(f"Method {detector} not yet implemented!")
	else:
	warnings.warn(f"Encountered a blank sample, defaulting to cloudy mask.")
	return mask.astype(np.float32)


	# function to fetch paired data, which may differ in modalities or dates
	def get_pairedS1(patch_list, root_dir, mod=None, time=None):
	paired_list = []
	for patch in patch_list:
	seed, roi, modality, time_number, fname = patch.split("/")
	time = time_number if time is None else time # unless overwriting, ...
	mod = (
	modality if mod is None else mod
	) # keep the patch list's original time and modality
	n_patch = fname.split("patch_")[-1].split(".tif")[0]
	paired_dir = os.path.join(seed, roi, mod.upper(), str(time))
	candidates = os.path.join(
	root_dir,
	paired_dir,
	f"{mod}_{seed}_{roi}_ImgNo_{time}_*_patch_{n_patch}.tif",
	)
	paired_list.append(
	os.path.join(paired_dir, os.path.basename(glob.glob(candidates)[0]))
	)
	return paired_list






	""" SEN12MSCR data loader class, inherits from torch.utils.data.Dataset

	IN:
	root: str, path to your copy of the SEN12MS-CR-TS data set
	split: str, in [all \| train \| val \| test]
	region: str, [all \| africa \| america \| asiaEast \| asiaWest \| europa]
	cloud_masks: str, type of cloud mask detector to run on optical data, in []
	sample_type: str, [generic \| cloudy_cloudfree]
	n_input_samples: int, number of input samples in time series
	rescale_method: str, [default \| resnet]

	OUT:
	data_loader: SEN12MSCRTS instance, implements an iterator that can be traversed via __getitem__(pdx),
	which returns the pdx-th dictionary of patch-samples (whose structure depends on sample_type)
	"""


	class SEN12MSCR(Dataset):
	def __init__(
	self,
	root,
	split="all",
	region="all",
	cloud_masks="s2cloudless_mask",
	sample_type="pretrain",
	rescale_method="default",
	):
	self.root_dir = root # set root directory which contains all ROI
	self.region = region # region according to which the ROI are selected
	if self.region != "all":
	raise NotImplementedError # TODO: currently only supporting 'all'
	self.ROI = {
	"ROIs1158": ["106"],
	"ROIs1868": [
	"17",
	"36",
	"56",
	"73",
	"85",
	"100",
	"114",
	"119",
	"121",
	"126",
	"127",
	"139",
	"142",
	"143",
	],
	"ROIs1970": [
	"20",
	"21",
	"35",
	"40",
	"57",
	"65",
	"71",
	"82",
	"83",
	"91",
	"112",
	"116",
	"119",
	"128",
	"132",
	"133",
	"135",
	"139",
	"142",
	"144",
	"149",
	],
	"ROIs2017": [
	"8",
	"22",
	"25",
	"32",
	"49",
	"61",
	"63",
	"69",
	"75",
	"103",
	"108",
	"115",
	"116",
	"117",
	"130",
	"140",
	"146",
	],
	}

	# define splits conform with SEN12MS-CR-TS
	self.splits = {}
	self.splits["train"] = [
	"ROIs1970_fall_s1/s1_3",
	"ROIs1970_fall_s1/s1_22",
	"ROIs1970_fall_s1/s1_148",
	"ROIs1970_fall_s1/s1_107",
	"ROIs1970_fall_s1/s1_1",
	"ROIs1970_fall_s1/s1_114",
	"ROIs1970_fall_s1/s1_135",
	"ROIs1970_fall_s1/s1_40",
	"ROIs1970_fall_s1/s1_42",
	"ROIs1970_fall_s1/s1_31",
	"ROIs1970_fall_s1/s1_149",
	"ROIs1970_fall_s1/s1_64",
	"ROIs1970_fall_s1/s1_28",
	"ROIs1970_fall_s1/s1_144",
	"ROIs1970_fall_s1/s1_57",
	"ROIs1970_fall_s1/s1_35",
	"ROIs1970_fall_s1/s1_133",
	"ROIs1970_fall_s1/s1_30",
	"ROIs1970_fall_s1/s1_134",
	"ROIs1970_fall_s1/s1_141",
	"ROIs1970_fall_s1/s1_112",
	"ROIs1970_fall_s1/s1_116",
	"ROIs1970_fall_s1/s1_37",
	"ROIs1970_fall_s1/s1_26",
	"ROIs1970_fall_s1/s1_77",
	"ROIs1970_fall_s1/s1_100",
	"ROIs1970_fall_s1/s1_83",
	"ROIs1970_fall_s1/s1_71",
	"ROIs1970_fall_s1/s1_93",
	"ROIs1970_fall_s1/s1_119",
	"ROIs1970_fall_s1/s1_104",
	"ROIs1970_fall_s1/s1_136",
	"ROIs1970_fall_s1/s1_6",
	"ROIs1970_fall_s1/s1_41",
	"ROIs1970_fall_s1/s1_125",
	"ROIs1970_fall_s1/s1_91",
	"ROIs1970_fall_s1/s1_131",
	"ROIs1970_fall_s1/s1_120",
	"ROIs1970_fall_s1/s1_110",
	"ROIs1970_fall_s1/s1_19",
	"ROIs1970_fall_s1/s1_14",
	"ROIs1970_fall_s1/s1_81",
	"ROIs1970_fall_s1/s1_39",
	"ROIs1970_fall_s1/s1_109",
	"ROIs1970_fall_s1/s1_33",
	"ROIs1970_fall_s1/s1_88",
	"ROIs1970_fall_s1/s1_11",
	"ROIs1970_fall_s1/s1_128",
	"ROIs1970_fall_s1/s1_142",
	"ROIs1970_fall_s1/s1_122",
	"ROIs1970_fall_s1/s1_4",
	"ROIs1970_fall_s1/s1_27",
	"ROIs1970_fall_s1/s1_147",
	"ROIs1970_fall_s1/s1_85",
	"ROIs1970_fall_s1/s1_82",
	"ROIs1970_fall_s1/s1_105",
	"ROIs1158_spring_s1/s1_9",
	"ROIs1158_spring_s1/s1_1",
	"ROIs1158_spring_s1/s1_124",
	"ROIs1158_spring_s1/s1_40",
	"ROIs1158_spring_s1/s1_101",
	"ROIs1158_spring_s1/s1_21",
	"ROIs1158_spring_s1/s1_134",
	"ROIs1158_spring_s1/s1_145",
	"ROIs1158_spring_s1/s1_141",
	"ROIs1158_spring_s1/s1_66",
	"ROIs1158_spring_s1/s1_8",
	"ROIs1158_spring_s1/s1_26",
	"ROIs1158_spring_s1/s1_77",
	"ROIs1158_spring_s1/s1_113",
	"ROIs1158_spring_s1/s1_100",
	"ROIs1158_spring_s1/s1_117",
	"ROIs1158_spring_s1/s1_119",
	"ROIs1158_spring_s1/s1_6",
	"ROIs1158_spring_s1/s1_58",
	"ROIs1158_spring_s1/s1_120",
	"ROIs1158_spring_s1/s1_110",
	"ROIs1158_spring_s1/s1_126",
	"ROIs1158_spring_s1/s1_115",
	"ROIs1158_spring_s1/s1_121",
	"ROIs1158_spring_s1/s1_39",
	"ROIs1158_spring_s1/s1_109",
	"ROIs1158_spring_s1/s1_63",
	"ROIs1158_spring_s1/s1_75",
	"ROIs1158_spring_s1/s1_132",
	"ROIs1158_spring_s1/s1_128",
	"ROIs1158_spring_s1/s1_142",
	"ROIs1158_spring_s1/s1_15",
	"ROIs1158_spring_s1/s1_45",
	"ROIs1158_spring_s1/s1_97",
	"ROIs1158_spring_s1/s1_147",
	"ROIs1868_summer_s1/s1_90",
	"ROIs1868_summer_s1/s1_87",
	"ROIs1868_summer_s1/s1_25",
	"ROIs1868_summer_s1/s1_124",
	"ROIs1868_summer_s1/s1_114",
	"ROIs1868_summer_s1/s1_135",
	"ROIs1868_summer_s1/s1_40",
	"ROIs1868_summer_s1/s1_101",
	"ROIs1868_summer_s1/s1_42",
	"ROIs1868_summer_s1/s1_31",
	"ROIs1868_summer_s1/s1_36",
	"ROIs1868_summer_s1/s1_139",
	"ROIs1868_summer_s1/s1_56",
	"ROIs1868_summer_s1/s1_133",
	"ROIs1868_summer_s1/s1_55",
	"ROIs1868_summer_s1/s1_43",
	"ROIs1868_summer_s1/s1_113",
	"ROIs1868_summer_s1/s1_76",
	"ROIs1868_summer_s1/s1_123",
	"ROIs1868_summer_s1/s1_143",
	"ROIs1868_summer_s1/s1_93",
	"ROIs1868_summer_s1/s1_125",
	"ROIs1868_summer_s1/s1_89",
	"ROIs1868_summer_s1/s1_120",
	"ROIs1868_summer_s1/s1_126",
	"ROIs1868_summer_s1/s1_72",
	"ROIs1868_summer_s1/s1_115",
	"ROIs1868_summer_s1/s1_121",
	"ROIs1868_summer_s1/s1_146",
	"ROIs1868_summer_s1/s1_140",
	"ROIs1868_summer_s1/s1_95",
	"ROIs1868_summer_s1/s1_102",
	"ROIs1868_summer_s1/s1_7",
	"ROIs1868_summer_s1/s1_11",
	"ROIs1868_summer_s1/s1_132",
	"ROIs1868_summer_s1/s1_15",
	"ROIs1868_summer_s1/s1_137",
	"ROIs1868_summer_s1/s1_4",
	"ROIs1868_summer_s1/s1_27",
	"ROIs1868_summer_s1/s1_147",
	"ROIs1868_summer_s1/s1_86",
	"ROIs1868_summer_s1/s1_47",
	"ROIs2017_winter_s1/s1_68",
	"ROIs2017_winter_s1/s1_25",
	"ROIs2017_winter_s1/s1_62",
	"ROIs2017_winter_s1/s1_135",
	"ROIs2017_winter_s1/s1_42",
	"ROIs2017_winter_s1/s1_64",
	"ROIs2017_winter_s1/s1_21",
	"ROIs2017_winter_s1/s1_55",
	"ROIs2017_winter_s1/s1_112",
	"ROIs2017_winter_s1/s1_116",
	"ROIs2017_winter_s1/s1_8",
	"ROIs2017_winter_s1/s1_59",
	"ROIs2017_winter_s1/s1_49",
	"ROIs2017_winter_s1/s1_104",
	"ROIs2017_winter_s1/s1_81",
	"ROIs2017_winter_s1/s1_146",
	"ROIs2017_winter_s1/s1_75",
	"ROIs2017_winter_s1/s1_94",
	"ROIs2017_winter_s1/s1_102",
	"ROIs2017_winter_s1/s1_61",
	"ROIs2017_winter_s1/s1_47",
	"ROIs1868_summer_s1/s1_100", # note: this ROI is also used for testing in SEN12MS-CR-TS. If you wish to combine both datasets, please comment out this line
	]
	self.splits["val"] = [
	"ROIs2017_winter_s1/s1_22",
	"ROIs1868_summer_s1/s1_19",
	"ROIs1970_fall_s1/s1_65",
	"ROIs1158_spring_s1/s1_17",
	"ROIs2017_winter_s1/s1_107",
	"ROIs1868_summer_s1/s1_80",
	"ROIs1868_summer_s1/s1_127",
	"ROIs2017_winter_s1/s1_130",
	"ROIs1868_summer_s1/s1_17",
	"ROIs2017_winter_s1/s1_84",
	]
	self.splits["test"] = [
	"ROIs1158_spring_s1/s1_106",
	"ROIs1158_spring_s1/s1_123",
	"ROIs1158_spring_s1/s1_140",
	"ROIs1158_spring_s1/s1_31",
	"ROIs1158_spring_s1/s1_44",
	"ROIs1868_summer_s1/s1_119",
	"ROIs1868_summer_s1/s1_73",
	"ROIs1970_fall_s1/s1_139",
	"ROIs2017_winter_s1/s1_108",
	"ROIs2017_winter_s1/s1_63",
	]

	self.splits["all"] = (
	self.splits["train"] + self.splits["test"] + self.splits["val"]
	)
	self.split = split

	assert split in [
	"all",
	"train",
	"val",
	"test",
	], "Input dataset must be either assigned as all, train, test, or val!"
	assert sample_type in ["pretrain"], "Input data must be pretrain!"
	assert cloud_masks in [
	None,
	"cloud_cloudshadow_mask",
	"s2cloudless_map",
	"s2cloudless_mask",
	], "Unknown cloud mask type!"

	self.modalities = ["S1", "S2"]
	self.cloud_masks = cloud_masks # e.g. 'cloud_cloudshadow_mask', 's2cloudless_map', 's2cloudless_mask'
	self.sample_type = sample_type # e.g. 'pretrain'

	self.time_points = range(1)
	self.n_input_t = 1 # specifies the number of samples, if only part of the time series is used as an input

	if self.cloud_masks in ["s2cloudless_map", "s2cloudless_mask"]:
	self.cloud_detector = S2PixelCloudDetector(
	threshold=0.4, all_bands=True, average_over=4, dilation_size=2
	)
	else:
	self.cloud_detector = None

	self.paths = self.get_paths()
	self.n_samples = len(self.paths)

	# raise a warning if no data has been found
	if not self.n_samples:
	self.throw_warn()

	self.method = rescale_method

	# indexes all patches contained in the current data split
	def get_paths(
	self,
	): # assuming for the same ROI+num, the patch numbers are the same
	print(f"\nProcessing paths for {self.split} split of region {self.region}")

	paths = []
	seeds_S1 = natsorted(
	[s1dir for s1dir in os.listdir(self.root_dir) if "_s1" in s1dir]
	)
	for seed in seeds_S1:
	rois_S1 = natsorted(os.listdir(os.path.join(self.root_dir, seed)))
	for roi in rois_S1:
	roi_dir = os.path.join(self.root_dir, seed, roi)
	paths_S1 = natsorted(
	[os.path.join(roi_dir, s1patch) for s1patch in os.listdir(roi_dir)]
	)
	paths_S2 = [
	patch.replace("/s1", "/s2").replace("_s1", "_s2")
	for patch in paths_S1
	]
	paths_S2_cloudy = [
	patch.replace("/s1", "/s2_cloudy").replace("_s1", "_s2_cloudy")
	for patch in paths_S1
	]

	for pdx, _ in enumerate(paths_S1):
	# omit patches that are potentially unpaired
	if not all(
	[
	os.path.isfile(paths_S1[pdx]),
	os.path.isfile(paths_S2[pdx]),
	os.path.isfile(paths_S2_cloudy[pdx]),
	]
	):
	continue
	# don't add patch if not belonging to the selected split
	if not any(
	[
	split_roi in paths_S1[pdx]
	for split_roi in self.splits[self.split]
	]
	):
	continue
	sample = {
	"S1": paths_S1[pdx],
	"S2": paths_S2[pdx],
	"S2_cloudy": paths_S2_cloudy[pdx],
	}
	paths.append(sample)
	return paths

	def __getitem__(self, pdx): # get the triplet of patch with ID pdx
	s1_tif = read_tif(self.paths[pdx]["S1"])
	s2_tif = read_tif(self.paths[pdx]["S2"])
	s2_cloudy_tif = read_tif(self.paths[pdx]["S2_cloudy"])
	coord = list(s2_tif.bounds)
	s1 = process_SAR(read_img(s1_tif), self.method)
	s2 = read_img(s2_tif) # note: pre-processing happens after cloud detection
	s2_cloudy = read_img(
	s2_cloudy_tif
	) # note: pre-processing happens after cloud detection
	mask = (
	None
	if not self.cloud_masks
	else get_cloud_map(s2_cloudy, self.cloud_masks, self.cloud_detector)
	)

	sample = {
	"input": {
	"S1": s1,
	"S2": process_MS(s2_cloudy, self.method),
	"masks": mask,
	"coverage": np.mean(mask),
	"S1 path": os.path.join(self.root_dir, self.paths[pdx]["S1"]),
	"S2 path": os.path.join(self.root_dir, self.paths[pdx]["S2_cloudy"]),
	"coord": coord,
	},
	"target": {
	"S2": process_MS(s2, self.method),
	"S2 path": os.path.join(self.root_dir, self.paths[pdx]["S2"]),
	"coord": coord,
	},
	}
	return sample

	def throw_warn(self):
	warnings.warn(
	"""No data samples found! Please use the following directory structure:

	path/to/your/SEN12MSCR/directory:
	├───ROIs1158_spring_s1
	\| ├─s1_1
	\| \| \|...
	\| \| ├─ROIs1158_spring_s1_1_p407.tif
	\| \| \|...
	\| ...
	├───ROIs1158_spring_s2
	\| ├─s2_1
	\| \| \|...
	\| \| ├─ROIs1158_spring_s2_1_p407.tif
	\| \| \|...
	\| ...
	├───ROIs1158_spring_s2_cloudy
	\| ├─s2_cloudy_1
	\| \| \|...
	\| \| ├─ROIs1158_spring_s2_cloudy_1_p407.tif
	\| \| \|...
	\| ...
	...

	Note: Please arrange the dataset in a format as e.g. provided by the script dl_data.sh.
	"""
	)

	def __len__(self):
	# length of generated list
	return self.n_samples


	if __name__ == "__main__":
	dataset = SEN12MSCR(
	root="data2/SEN12MSCR",
	split="all",
	region="all",
	cloud_masks="s2cloudless_mask",
	sample_type="pretrain",
	rescale_method="default",
	)
	for each in dataset:
	print(f"{each['input']['S1'].shape}")
	print(f"{each['input']['S2'].shape}")
	print(f"{each['input']['masks'].shape}")
	print(f"{each['target']['S2'].shape}")
	# (2, 256, 256)
	# (13, 256, 256)
	# (256, 256)
	# (13, 256, 256)
	break