File size: 8,262 Bytes
753fd9a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198

import numpy as np
import random
import copy
import time
import warnings
import random

from torch.utils.data import Sampler
from torch._six import int_classes as _int_classes

class CustomGCSampler(Sampler):
    """Wraps another sampler to yield a mini-batch of indices.
    The structure of this sampler is way to complicated because it is a shorter/simplified version of 
    CustomBatchSampler. The relations between breeds are not relevant for the cvpr 2022 paper, but we kept 
    this structure which we were using for the experiments with clade related losses. ToDo: restructure 
    this sampler. 
    Args:
        data_sampler_info (dict): a dictionnary, containing information about the dataset and breeds. 
        batch_size (int): Size of mini-batch.
    """

    def __init__(self, data_sampler_info_gc, batch_size, add_nonflat=False, more_standing=False):
        if not isinstance(batch_size, _int_classes) or isinstance(batch_size, bool) or \
                batch_size <= 0:
            assert (batch_size == 12 and add_nonflat==False) or (batch_size == 14 and add_nonflat==True) 
            raise ValueError("batch_size should be a positive integer value, "
                             "but got batch_size={}".format(batch_size))
        self.data_sampler_info_gc = data_sampler_info_gc
        self.batch_size = batch_size
        self.add_nonflat = add_nonflat
        self.more_standing = more_standing

        self.n_images_tot = len(self.data_sampler_info_gc['name_list'])  # 4305

        # get full sorted image list
        self.pose_dict = {}
        self.dict_name_to_idx = {}
        for ind_img, img in enumerate(self.data_sampler_info_gc['name_list']):
            self.dict_name_to_idx[img] = ind_img
            pose = self.data_sampler_info_gc['gc_annots_categories'][img]['pose']
            if pose in self.pose_dict.keys():
                self.pose_dict[pose].append(img) 
            else:
                self.pose_dict[pose] = [img]

        # prepare non-flat images
        if self.add_nonflat:
            self.n_images_nonflat_tot = len(self.data_sampler_info_gc['name_list_nonflat'])

        # self.n_desired_batches = int(np.floor(len(self.data_sampler_info_gc['name_list']) / batch_size))        # 157
        self.n_desired_batches = int(np.ceil(len(self.get_list_for_group_index(ind_g=1, n_groups=5, shuffle=True, more_standing=self.more_standing)) / 3))

    def get_description(self):
        description = "\
            This sampler returns stanext data such that poses are more balanced. \n\
            -> works on top of stanext24_withgc_v2"
        return description

    def get_nonflat_idx_list(self, shuffle=True):
        all_nonflat_idxs = list(range(self.n_images_tot, self.n_images_tot + self.n_images_nonflat_tot))
        if shuffle:
            random.shuffle(all_nonflat_idxs)
        return all_nonflat_idxs

    def get_list_for_group_index(self, ind_g, n_groups=5, shuffle=True, return_info=False, more_standing=False):
        # availabe poses
        #   sitting_sym: 561
        #   lying_sym: 199
        #   jumping_touching: 21
        #   standing_4paws: 1999
        #   running: 132
        #   sitting_comp: 306
        #   onhindlegs: 16
        #   walking: 325
        #   lying_comp: 596
        #   standing_fewpaws: 98
        #   otherpose: 22
        #   downwardfacingdog: 14
        #   jumping_nottouching: 16
        # 
        # available groups (7 groups)
        #   89: 'otherpose', 'downwardfacingdog', 'jumping_nottouching', 'onhindlegs', 'jumping_touching' 
        #   561: 'sitting_sym'
        #   306: 'sitting_comp'
        #   199: 'lying_sym'
        #   596: 'lying_comp'
        #   555: 'standing_fewpaws', 'running', 'walking'
        #   1999: 'standing_4paws'
        #       -> sample: 2, 1.5, 1.5, 1.5, 1.5, 2, 2
        # 
        # available groups (5 groups)
        #   89: 'otherpose', 'downwardfacingdog', 'jumping_nottouching', 'onhindlegs', 'jumping_touching' 
        #   867: 'sitting_sym', 'sitting_comp'
        #   795: 'lying_sym', 'lying_comp'
        #   555: 'standing_fewpaws', 'running', 'walking'
        #   1999: 'standing_4paws'
        #       -> sample: 2, 3, 3, 2, 2
        assert (n_groups == 5)
        if more_standing:
            if ind_g == 0:
                n_samples_per_batch = 2
                pose_names = ['otherpose', 'downwardfacingdog', 'jumping_nottouching', 'onhindlegs', 'jumping_touching']
            elif ind_g == 1:
                n_samples_per_batch = 2
                pose_names = ['sitting_sym', 'sitting_comp']
            elif ind_g == 2:
                n_samples_per_batch = 2
                pose_names = ['lying_sym', 'lying_comp']
            elif ind_g == 3:
                n_samples_per_batch = 2
                pose_names = ['standing_fewpaws', 'running', 'walking']
            elif ind_g == 4:
                n_samples_per_batch = 4
                pose_names = ['standing_4paws']
            else: 
                raise ValueError
        else:
            if ind_g == 0:
                n_samples_per_batch = 2
                pose_names = ['otherpose', 'downwardfacingdog', 'jumping_nottouching', 'onhindlegs', 'jumping_touching']
            elif ind_g == 1:
                n_samples_per_batch = 3
                pose_names = ['sitting_sym', 'sitting_comp']
            elif ind_g == 2:
                n_samples_per_batch = 3
                pose_names = ['lying_sym', 'lying_comp']
            elif ind_g == 3:
                n_samples_per_batch = 2
                pose_names = ['standing_fewpaws', 'running', 'walking']
            elif ind_g == 4:
                n_samples_per_batch = 2
                pose_names = ['standing_4paws']
            else: 
                raise ValueError
        all_imgs_this_group = []
        for pose_name in pose_names:
            all_imgs_this_group.extend(self.pose_dict[pose_name])
        if shuffle:
            random.shuffle(all_imgs_this_group)
        if return_info:
            return all_imgs_this_group, pose_names, n_samples_per_batch
        else:
            return all_imgs_this_group


    def __iter__(self):

        n_groups = 5
        group_lists = {}
        n_samples_per_batch = {}
        for ind_g in range(n_groups):
            group_lists[ind_g], pose_names, n_samples_per_batch[ind_g] = self.get_list_for_group_index(ind_g, n_groups=5, shuffle=True, return_info=True, more_standing=self.more_standing)
        if self.add_nonflat:
            nonflat_idx_list = self.get_nonflat_idx_list()

        # we want to sample all sitting poses at least once per batch (and ths all other 
        #   images except standing on 4 paws)
        all_batches = []
        for ind in range(self.n_desired_batches):
            batch_with_idxs = []
            for ind_g in range(n_groups):
                for ind_s in range(n_samples_per_batch[ind_g]):
                    if len(group_lists[ind_g]) == 0:
                        group_lists[ind_g] = self.get_list_for_group_index(ind_g, n_groups=5, shuffle=True, more_standing=self.more_standing)
                    name = group_lists[ind_g].pop(0)
                    idx = self.dict_name_to_idx[name]
                    batch_with_idxs.append(idx)
            if self.add_nonflat:
                for ind_x in range(2):
                    if len(nonflat_idx_list) == 0:
                        nonflat_idx_list = self.get_nonflat_idx_list()
                    idx = nonflat_idx_list.pop(0)
                    batch_with_idxs.append(idx)
            all_batches.append(batch_with_idxs)

        for batch in all_batches:
            yield batch


    def __len__(self):
        # Since we are sampling pairs of dogs and not each breed has an even number of dogs, we can not 
        # guarantee to show each dog exacly once. What we do instead, is returning the same amount of 
        # batches as we would return with a standard sampler which is not based on dog pairs.    
        '''if self.drop_last:
            return len(self.sampler) // self.batch_size  # type: ignore
        else:
            return (len(self.sampler) + self.batch_size - 1) // self.batch_size  # type: ignore'''
        return self.n_desired_batches