import os
import glob
import os.path as osp
import random

import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from loguru import logger

from src.utils.dataset import read_pretrain_gray


class PretrainDataset(Dataset):
    def __init__(self,
                 root_dir,
                 mode='train',
                 img_resize=None,
                 df=None,
                 img_padding=False,
                 frame_gap=2,
                 **kwargs):
        """
        Manage image pairs of the KAIST Multispectral Pedestrian Detection Benchmark dataset.

        Args:
            root_dir (str): KAIST Multispectral Pedestrian root directory containing the sequence
                folders (each sequence holds `visible` and `lwir` subfolders).
            mode (str): options are ['train', 'val'].
            img_resize (int, optional): the longer edge of resized images. None for no resize.
                640 is recommended. This is useful during training with batches and testing with
                memory-intensive algorithms.
            df (int, optional): image size division factor.
                NOTE: this will change the final image size after img_resize.
            img_padding (bool): If set to True, zero-pad the image to a squared size.
                This is useful during training.
            augment_fn (callable, optional): augments images with pre-defined visual effects.
        """
        super().__init__()
        self.root_dir = root_dir
        self.mode = mode

        # specify which part of the data is used for training and validation
        if mode == 'train':
            assert img_resize is not None and img_padding
            self.start_ratio = 0.0
            self.end_ratio = 0.9
        elif mode == 'val':
            assert img_resize is not None and img_padding
            self.start_ratio = 0.9
            self.end_ratio = 1.0
        else:
            raise NotImplementedError()

        # parameters for image resizing and padding
        self.img_resize = img_resize
        self.df = df
        self.img_padding = img_padding

        # for training XoFTR
        self.coarse_scale = kwargs.get('coarse_scale', 0.125)

        self.pair_paths = self.generate_kaist_pairs(root_dir, frame_gap=frame_gap, second_frame_range=0)

    def get_kaist_image_paths(self, root_dir):
        vis_img_paths = []
        lwir_img_paths = []
        img_num_per_folder = []
        # Recursively search for folders that contain both "visible" and "lwir" subfolders
        for folder, subfolders, filenames in os.walk(root_dir):
            if "visible" in subfolders and "lwir" in subfolders:
                vis_img_folder = osp.join(folder, "visible")
                lwir_img_folder = osp.join(folder, "lwir")
                # Use glob to find image files (add more extensions if needed)
                vis_imgs_i = glob.glob(osp.join(vis_img_folder, '*.jpg'))
                vis_imgs_i.sort()
                lwir_imgs_i = glob.glob(osp.join(lwir_img_folder, '*.jpg'))
                lwir_imgs_i.sort()
                vis_img_paths.append(vis_imgs_i)
                lwir_img_paths.append(lwir_imgs_i)
                img_num_per_folder.append(len(vis_imgs_i))
                assert len(vis_imgs_i) == len(lwir_imgs_i), \
                    f"Image numbers do not match in {folder}, {len(vis_imgs_i)} != {len(lwir_imgs_i)}"
        return vis_img_paths, lwir_img_paths, img_num_per_folder

    def generate_kaist_pairs(self, root_dir, frame_gap, second_frame_range):
        """
        Generate image pairs (Vis-TIR) from the KAIST Pedestrian dataset.

        Args:
            root_dir: root directory for the dataset
            frame_gap (int): the frame gap between consecutive images
            second_frame_range (int): the sampling range for the second image, i.e. for the first
                index i, the second index j is an element of [i - second_frame_range, i + second_frame_range]
        Returns:
            pair_paths (list)
        """
        vis_img_paths, lwir_img_paths, img_num_per_folder = self.get_kaist_image_paths(root_dir)
        pair_paths = []
        for i in range(len(img_num_per_folder)):
            num_img = img_num_per_folder[i]
            inds_vis = torch.arange(int(self.start_ratio * num_img),
                                    int(self.end_ratio * num_img),
                                    frame_gap, dtype=int)
            if second_frame_range > 0:
                # jitter the LWIR frame index around the visible frame index
                inds_lwir = inds_vis + torch.randint(-second_frame_range, second_frame_range,
                                                     (inds_vis.shape[0],))
                # clamp indices that exceed the valid range of this split
                inds_lwir[inds_lwir > int(self.end_ratio * num_img) - 1] = int(self.end_ratio * num_img) - 1
            else:
                inds_lwir = inds_vis
            for j, k in zip(inds_vis, inds_lwir):
                img_name0 = os.path.relpath(vis_img_paths[i][j], root_dir)
                img_name1 = os.path.relpath(lwir_img_paths[i][k], root_dir)
                # randomly swap the modality order of the pair
                if torch.rand(1) > 0.5:
                    img_name0, img_name1 = img_name1, img_name0
                pair_paths.append([img_name0, img_name1])
        random.shuffle(pair_paths)
        return pair_paths

    def __len__(self):
        return len(self.pair_paths)

    def __getitem__(self, idx):
        # read grayscale and normalized images, and masks. (1, h, w) and (h, w)
        img_name0 = osp.join(self.root_dir, self.pair_paths[idx][0])
        img_name1 = osp.join(self.root_dir, self.pair_paths[idx][1])

        if self.mode == "train" and torch.rand(1) > 0.5:
            img_name0, img_name1 = img_name1, img_name0

        image0, image0_norm, mask0, scale0, image0_mean, image0_std = read_pretrain_gray(
            img_name0, self.img_resize, self.df, self.img_padding, None)
        image1, image1_norm, mask1, scale1, image1_mean, image1_std = read_pretrain_gray(
            img_name1, self.img_resize, self.df, self.img_padding, None)

        data = {
            'image0': image0,   # (1, h, w)
            'image1': image1,
            'image0_norm': image0_norm,
            'image1_norm': image1_norm,
            'scale0': scale0,   # [scale_w, scale_h]
            'scale1': scale1,
            'image0_mean': image0_mean,
            'image0_std': image0_std,
            'image1_mean': image1_mean,
            'image1_std': image1_std,
            'dataset_name': 'PreTrain',
            'pair_id': idx,
            'pair_names': (self.pair_paths[idx][0], self.pair_paths[idx][1]),
        }

        # for XoFTR training
        if mask0 is not None:  # img_padding is True
            if self.coarse_scale:
                # downsample the padding masks to the coarse feature resolution
                [ts_mask_0, ts_mask_1] = F.interpolate(
                    torch.stack([mask0, mask1], dim=0)[None].float(),
                    scale_factor=self.coarse_scale,
                    mode='nearest',
                    recompute_scale_factor=False)[0].bool()
                data.update({'mask0': ts_mask_0, 'mask1': ts_mask_1})

        return data
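

# --- Minimal usage sketch (illustrative only) ---
# The block below shows one way this dataset could be instantiated and batched; it is not part
# of the training pipeline. The dataset path is a hypothetical placeholder, img_resize=640 follows
# the recommendation in the class docstring, df=8 is an assumed division factor, and the other
# arguments mirror the defaults defined above.
if __name__ == "__main__":
    from torch.utils.data import DataLoader

    dataset = PretrainDataset(
        root_dir="/path/to/kaist",   # hypothetical path to the KAIST dataset root
        mode="train",
        img_resize=640,
        df=8,                        # assumed value; should match the model's feature stride
        img_padding=True,
        frame_gap=2,
    )
    loader = DataLoader(dataset, batch_size=2, shuffle=True, num_workers=0)
    batch = next(iter(loader))
    # each batch holds raw and normalized grayscale pairs plus resize metadata and padding masks
    print(batch['image0'].shape, batch['scale0'], batch['pair_names'])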