import os
import os.path as osp
import glob
import random

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset

from src.utils.dataset import read_pretrain_gray

class PretrainDataset(Dataset):
    def __init__(self,
                 root_dir,
                 mode='train',
                 img_resize=None,
                 df=None,
                 img_padding=False,
                 frame_gap=2,
                 **kwargs):
        """
        Manage visible/LWIR image pairs of the KAIST Multispectral Pedestrian Detection Benchmark dataset.

        Args:
            root_dir (str): root directory of the KAIST Multispectral Pedestrian dataset.
            mode (str): options are ['train', 'val'].
            img_resize (int, optional): the longer edge of resized images. None for no resize. 640 is recommended.
                This is useful during training with batches and testing with memory-intensive algorithms.
            df (int, optional): image size division factor. NOTE: this will change the final image size after img_resize.
            img_padding (bool): if True, zero-pad the images to a squared size. This is useful during training.
            frame_gap (int): frame gap between consecutively sampled video frames.
            coarse_scale (float, via kwargs): scale of the coarse feature masks (default 0.125, i.e. 1/8).
        """
        super().__init__()
        self.root_dir = root_dir
        self.mode = mode

        # specify which part of the data is used for training and validation
        if mode == 'train':
            assert img_resize is not None and img_padding 
            self.start_ratio = 0.0
            self.end_ratio = 0.9
        elif mode == 'val':
            assert img_resize is not None and img_padding 
            self.start_ratio = 0.9
            self.end_ratio = 1.0
        else:
            raise NotImplementedError()
        
        # parameters for image resizing, padding 
        self.img_resize = img_resize
        self.df = df
        self.img_padding = img_padding

        # for training XoFTR (kwargs is a dict, so dict.get, not getattr, retrieves the value)
        self.coarse_scale = kwargs.get('coarse_scale', 0.125)

        self.pair_paths = self.generate_kaist_pairs(root_dir, frame_gap=frame_gap, second_frame_range=0)

    def get_kaist_image_paths(self, root_dir):
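        """Collect aligned visible/LWIR image path lists, one per sequence folder.

        Assumes the standard KAIST layout (e.g. setXX/VYYY/visible/*.jpg and
        setXX/VYYY/lwir/*.jpg); any folder containing both subfolders is used.
        Returns (vis_img_paths, lwir_img_paths, img_num_per_folder).
        """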
        vis_img_paths = []
        lwir_img_paths = []
        img_num_per_folder = []

        # Recursively search for folders that contain both "visible" and "lwir" subfolders
        for folder, subfolders, filenames in os.walk(root_dir):
            if "visible" in subfolders and "lwir" in subfolders:
                vis_img_folder = osp.join(folder, "visible")
                lwir_img_folder = osp.join(folder, "lwir")
                # Use glob to collect image files (extend the pattern if other extensions are needed)
                vis_imgs_i = sorted(glob.glob(osp.join(vis_img_folder, '*.jpg')))
                lwir_imgs_i = sorted(glob.glob(osp.join(lwir_img_folder, '*.jpg')))
                vis_img_paths.append(vis_imgs_i)
                lwir_img_paths.append(lwir_imgs_i)
                img_num_per_folder.append(len(vis_imgs_i))
                assert len(vis_imgs_i) == len(lwir_imgs_i), \
                    f"Image numbers do not match in {folder}, {len(vis_imgs_i)} != {len(lwir_imgs_i)}"
        return vis_img_paths, lwir_img_paths, img_num_per_folder
    
    def generate_kaist_pairs(self, root_dir, frame_gap, second_frame_range):
        """Generate visible-thermal (Vis-TIR) image pairs from the KAIST Pedestrian dataset.

        Args:
            root_dir (str): root directory of the dataset.
            frame_gap (int): frame gap between consecutively sampled images.
            second_frame_range (int): sampling range for the second image, i.e. for a first
                index i, the second index j lies in [i - second_frame_range, i + second_frame_range]
                (clamped to the split bounds). 0 pairs frames with identical indices.
        Returns:
            pair_paths (list): [img_name0, img_name1] relative-path pairs, randomly swapped and shuffled.
        """
        vis_img_paths, lwir_img_paths, img_num_per_folder = self.get_kaist_image_paths(root_dir)
        pair_paths = []
        for i in range(len(img_num_per_folder)):
            num_img = img_num_per_folder[i]
            inds_vis = torch.arange(int(self.start_ratio * num_img),
                                    int(self.end_ratio * num_img),
                                    frame_gap, dtype=torch.long)
            if second_frame_range > 0:
                # draw the second index from [i - range, i + range], then clamp to the split bounds
                inds_lwir = inds_vis + torch.randint(-second_frame_range, second_frame_range + 1, (inds_vis.shape[0],))
                inds_lwir.clamp_(int(self.start_ratio * num_img), int(self.end_ratio * num_img) - 1)
            else:
                inds_lwir = inds_vis
            for j, k in zip(inds_vis, inds_lwir):
                img_name0 = os.path.relpath(vis_img_paths[i][j], root_dir)
                img_name1 = os.path.relpath(lwir_img_paths[i][k], root_dir)

                # randomly swap the modalities so that either Vis or TIR can be image0
                if torch.rand(1) > 0.5:
                    img_name0, img_name1 = img_name1, img_name0

                pair_paths.append([img_name0, img_name1])
        
        random.shuffle(pair_paths)
        return pair_paths
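
    # Worked example (illustrative numbers): for a folder with num_img = 1000 in 'train'
    # mode (start_ratio = 0.0, end_ratio = 0.9) and frame_gap = 2, inds_vis = [0, 2, ..., 898],
    # yielding 450 pairs; with second_frame_range = 0 each visible frame is paired with the
    # thermal frame at the same index.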

    def __len__(self):
        return len(self.pair_paths)

    def __getitem__(self, idx):
        # read grayscale images (raw and normalized) and padding masks: images (1, h, w), masks (h, w)
        img_name0 = osp.join(self.root_dir, self.pair_paths[idx][0])
        img_name1 = osp.join(self.root_dir, self.pair_paths[idx][1])

        # randomly swap the pair order once more at load time (train mode only)
        if self.mode == "train" and torch.rand(1) > 0.5:
            img_name0, img_name1 = img_name1, img_name0

        image0, image0_norm, mask0, scale0, image0_mean, image0_std = read_pretrain_gray(
            img_name0, self.img_resize, self.df, self.img_padding, None)
        image1, image1_norm, mask1, scale1, image1_mean, image1_std = read_pretrain_gray(
            img_name1, self.img_resize, self.df, self.img_padding, None)

        data = {
            'image0': image0,  # (1, h, w)
            'image1': image1,
            'image0_norm': image0_norm,
            'image1_norm': image1_norm,
            'scale0': scale0,  # [scale_w, scale_h]
            'scale1': scale1,
            "image0_mean": image0_mean,
            "image0_std": image0_std,
            "image1_mean": image1_mean,
            "image1_std": image1_std,
            'dataset_name': 'PreTrain',
            'pair_id': idx,
            'pair_names': (self.pair_paths[idx][0], self.pair_paths[idx][1]),
        }

        # for XoFTR training: downsample padding masks to the coarse feature resolution
        if mask0 is not None and self.coarse_scale:  # masks exist only if img_padding is True
            [ts_mask_0, ts_mask_1] = F.interpolate(torch.stack([mask0, mask1], dim=0)[None].float(),
                                                   scale_factor=self.coarse_scale,
                                                   mode='nearest',
                                                   recompute_scale_factor=False)[0].bool()
            data.update({'mask0': ts_mask_0, 'mask1': ts_mask_1})

        return data
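

# Minimal usage sketch (assumptions: executed inside the XoFTR repo so that
# src.utils.dataset.read_pretrain_gray resolves, and the KAIST dataset extracted
# under the hypothetical path "data/kaist"; adjust paths to your setup).
if __name__ == "__main__":
    from torch.utils.data import DataLoader

    dataset = PretrainDataset(root_dir="data/kaist",
                              mode="train",
                              img_resize=640,
                              df=8,
                              img_padding=True,
                              frame_gap=2)
    loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=4)
    batch = next(iter(loader))
    # image tensors are batched to (N, 1, h, w); masks exist because img_padding=True
    print(len(dataset), batch["image0"].shape, batch["mask0"].shape)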