File size: 3,953 Bytes
3424266
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# Copyright 2024 EPFL and Apple Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from torch.utils.data import Dataset
from typing import Any, Callable, Dict, List, Optional, Tuple, cast

from fourm.data.multimodal_dataset_folder import make_dataset, UNIFIED_EXTENSIONS
from fourm.data.modality_transforms import get_transform_key, RGBTransform, CaptionTransform, UnifiedDataTransform


class ImageCaptionDataset(Dataset):
    """
    Similar to MultiModalDatasetFolder, but specialized for image-caption datasets.

    Samples for the 'caption' and 'rgb' modalities are collected independently
    with ``make_dataset`` and paired by position: item ``i`` is built from the
    ``i``-th caption sample and the ``i``-th rgb sample. The class folders are
    discovered from the caption directory and reused for both modalities.

    Args:
        root: Root directory containing one sub-directory per modality.
        augmenter: Optional image augmenter forwarded to UnifiedDataTransform.
        modality_paths: Optional mapping from modality name to its directory
            name relative to ``root``. A modality missing from the mapping
            uses its own name as directory name.
        is_valid_file: Optional predicate on file paths forwarded to
            ``make_dataset``. When given, no extension list is passed.
        cache: If True, a per-modality cache path under
            ``<root>/dataloader_cache`` is passed to ``make_dataset``.

    Raises:
        RuntimeError: If any modality yields zero samples.
    """

    # Modalities loaded for every item, in loading order.
    _MODALITIES = ('caption', 'rgb')

    def __init__(self,
                 root: str,
                 augmenter: Optional[Callable] = None,
                 modality_paths: Optional[Dict[str, str]] = None,
                 is_valid_file: Optional[Callable[[str], bool]] = None,
                 cache: bool = False):
        self.root = root
        self.modality_paths = modality_paths or {}

        self.modality_transforms = {
            'rgb': RGBTransform(imagenet_default_mean_and_std=False),
            'caption': CaptionTransform()
        }

        self.transform = UnifiedDataTransform(transforms_dict=self.modality_transforms, image_augmenter=augmenter)

        # Class folders are taken from the caption directory only.
        classes, class_to_idx = self._find_classes(self._modality_dir('caption'))
        # Extensions and is_valid_file are mutually exclusive filters.
        extensions = UNIFIED_EXTENSIONS if is_valid_file is None else None

        samples = {}
        for mod in self._MODALITIES:
            cache_path = None
            if cache:
                cache_path = os.path.join(
                    self.root, 'dataloader_cache', f'{self.modality_paths.get(mod, mod)}.pkl')
            samples[mod] = make_dataset(
                self._modality_dir(mod),
                class_to_idx,
                extensions,
                is_valid_file,
                cache_path=cache_path)

        for mod, mod_samples in samples.items():
            if len(mod_samples) == 0:
                msg = "Found 0 logs in subfolders of: {}\n".format(self._modality_dir(mod))
                if extensions is not None:
                    msg += "Supported extensions are: {}".format(",".join(extensions))
                raise RuntimeError(msg)

        # Items are paired purely by index, so differing counts mean that
        # captions and images may not correspond. Surface this early instead
        # of failing (or silently mispairing) in the middle of an epoch.
        sizes = {mod: len(mod_samples) for mod, mod_samples in samples.items()}
        if len(set(sizes.values())) > 1:
            import warnings
            warnings.warn(
                "Modalities have different numbers of samples ({}); "
                "samples are paired by index and may be misaligned.".format(sizes),
                stacklevel=2)

        self.extensions = extensions
        self.classes = classes
        self.class_to_idx = class_to_idx
        self.samples = samples

    def _modality_dir(self, mod: str) -> str:
        """Return the directory of modality `mod` under the dataset root."""
        return os.path.join(self.root, self.modality_paths.get(mod, mod))

    def _find_classes(self, dir: str) -> Tuple[List[str], Dict[str, int]]:
        """
        Finds the class folders in a dataset.

        Args:
            dir (string): Root directory path.

        Returns:
            tuple: (classes, class_to_idx) where classes are the sorted names of
                the immediate sub-directories of (dir), and class_to_idx maps
                each class name to its index in that sorted list.

        Ensures:
            No class is a subdirectory of another.
        """
        # Use scandir as a context manager so the directory handle is released promptly.
        with os.scandir(dir) as entries:
            classes = sorted(entry.name for entry in entries if entry.is_dir())
        class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
        return classes, class_to_idx

    def __getitem__(self, index):
        """
        Load the caption and rgb samples at position `index`.

        Each modality is loaded through the ``load`` method of its transform,
        then the resulting dict is passed through ``self.transform`` (if set).

        Returns:
            dict: Maps 'caption' and 'rgb' to their loaded (and transformed) samples.
        """
        sample_dict = {}
        for mod in self._MODALITIES:
            # Each sample entry is a pair; only the path is needed here.
            path, _ = self.samples[mod][index]
            sample_dict[mod] = self.modality_transforms[get_transform_key(mod)].load(path)

        if self.transform is not None:
            sample_dict = self.transform(sample_dict)

        return sample_dict

    def __len__(self) -> int:
        """
        Return the number of items that can be loaded for every modality.

        This is the smallest per-modality sample count, so every valid index
        exists in all modalities. When all modalities have the same number of
        samples this equals that common count.
        """
        return min(len(mod_samples) for mod_samples in self.samples.values())