Add all of `fourm`
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
- .gitattributes +1 -0
- fourm/data/__init__.py +4 -0
- fourm/data/dataset_utils.py +85 -0
- fourm/data/image_augmenter.py +186 -0
- fourm/data/masking.py +747 -0
- fourm/data/modality_info.py +427 -0
- fourm/data/modality_transforms.py +1387 -0
- fourm/data/multimodal_dataset_folder.py +363 -0
- fourm/data/pretrain_utils.py +292 -0
- fourm/data/transfer_utils.py +53 -0
- fourm/data/unified_datasets.py +557 -0
- fourm/demo_4M_sampler.py +540 -0
- fourm/models/__init__.py +0 -0
- fourm/models/decoder_embeddings.py +268 -0
- fourm/models/encoder_embeddings.py +422 -0
- fourm/models/fm.py +1130 -0
- fourm/models/fm_utils.py +387 -0
- fourm/models/fm_vit.py +485 -0
- fourm/models/generate.py +1273 -0
- fourm/models/lora_utils.py +177 -0
- fourm/utils/__init__.py +22 -0
- fourm/utils/checkpoint.py +185 -0
- fourm/utils/clip/__init__.py +2 -0
- fourm/utils/clip/clip.py +236 -0
- fourm/utils/clip/model.py +504 -0
- fourm/utils/clip/simple_tokenizer.py +137 -0
- fourm/utils/data_constants.py +40 -0
- fourm/utils/dist.py +100 -0
- fourm/utils/fsdp_utils.py +116 -0
- fourm/utils/generation.py +99 -0
- fourm/utils/generation_datasets/PartiPrompts.tsv +0 -0
- fourm/utils/generation_datasets/__init__.py +3 -0
- fourm/utils/generation_datasets/empty_dataset.py +27 -0
- fourm/utils/generation_datasets/image_caption_dataset.py +99 -0
- fourm/utils/generation_datasets/parti_prompts_dataset.py +114 -0
- fourm/utils/hmr2_utils/hmr2/__init__.py +0 -0
- fourm/utils/hmr2_utils/hmr2/models/__init__.py +2 -0
- fourm/utils/hmr2_utils/hmr2/models/backbones/__init__.py +13 -0
- fourm/utils/hmr2_utils/hmr2/models/backbones/vit.py +353 -0
- fourm/utils/hmr2_utils/hmr2/models/components/__init__.py +0 -0
- fourm/utils/hmr2_utils/hmr2/models/components/pose_transformer.py +363 -0
- fourm/utils/hmr2_utils/hmr2/models/components/t_cond_mlp.py +204 -0
- fourm/utils/hmr2_utils/hmr2/models/heads/__init__.py +1 -0
- fourm/utils/hmr2_utils/hmr2/models/heads/smpl_head.py +116 -0
- fourm/utils/hmr2_utils/hmr2/models/hmr2.py +117 -0
- fourm/utils/hmr2_utils/hmr2/models/smpl_wrapper.py +47 -0
- fourm/utils/hmr2_utils/hmr2/utils/__init__.py +31 -0
- fourm/utils/hmr2_utils/hmr2/utils/geometry.py +109 -0
- fourm/utils/hmr2_utils/hmr2/utils/mesh_renderer.py +155 -0
- fourm/utils/hmr2_utils/hmr2/utils/render_openpose.py +155 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+fourm/utils/clip/bpe_simple_vocab_16e6.txt.gz filter=lfs diff=lfs merge=lfs -text
fourm/data/__init__.py
ADDED
@@ -0,0 +1,4 @@
+from .image_augmenter import *
+from .modality_transforms import *
+from .unified_datasets import build_fm_pretraining_dataset, build_fm_transfer_dataset, build_wds_fm_pretraining_dataloader, build_wds_divae_dataloader, build_huggingface_pretraining_dataloader, build_mixture_dataloader
+from .pretrain_utils import *
fourm/data/dataset_utils.py
ADDED
@@ -0,0 +1,85 @@
+# Copyright 2024 EPFL and Apple Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+from torch.utils.data import Dataset
+
+
+class RepeatedDatasetWrapper(Dataset):
+    def __init__(self, original_dataset, num_repeats):
+        """
+        Dataset wrapper that repeats the original dataset n times.
+
+        Args:
+            original_dataset (torch.utils.data.Dataset): The original dataset to be repeated.
+            num_repeats (int): The number of times the dataset should be repeated.
+        """
+        self.original_dataset = original_dataset
+        self.num_repeats = num_repeats
+
+    def __getitem__(self, index):
+        """
+        Retrieve the item at the given index.
+
+        Args:
+            index (int): The index of the item to be retrieved.
+        """
+        original_index = index % len(self.original_dataset)
+        return self.original_dataset[original_index]
+
+    def __len__(self):
+        """
+        Get the length of the dataset after repeating it n times.
+
+        Returns:
+            int: The length of the dataset.
+        """
+        return len(self.original_dataset) * self.num_repeats
+
+
+class SubsampleDatasetWrapper(Dataset):
+    def __init__(self, original_dataset, dataset_size, seed=0, return_orig_idx=False):
+        """
+        Dataset wrapper that randomly subsamples the original dataset.
+
+        Args:
+            original_dataset (torch.utils.data.Dataset): The original dataset to be subsampled.
+            dataset_size (int): The size of the subsampled dataset.
+            seed (int): The seed to use for selecting the subset of indices of the original dataset.
+            return_orig_idx (bool): Whether to return the original index of the item in the original dataset.
+        """
+        self.original_dataset = original_dataset
+        self.dataset_size = dataset_size or len(original_dataset)
+        self.return_orig_idx = return_orig_idx
+        np.random.seed(seed)
+        self.indices = np.random.permutation(len(self.original_dataset))[:self.dataset_size]
+
+    def __getitem__(self, index):
+        """
+        Retrieve the item at the given index.
+
+        Args:
+            index (int): The index of the item to be retrieved.
+        """
+        original_index = self.indices[index]
+        sample = self.original_dataset[original_index]
+        return sample, original_index if self.return_orig_idx else sample
+
+    def __len__(self):
+        """
+        Get the length of the dataset after subsampling it.
+
+        Returns:
+            int: The length of the dataset.
+        """
+        return len(self.indices)
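A minimal usage sketch of the two wrappers added above, assuming the fourm package and PyTorch are installed. The toy TensorDataset and the printed lengths are illustrative assumptions, not part of the commit.

import torch
from torch.utils.data import TensorDataset

from fourm.data.dataset_utils import RepeatedDatasetWrapper, SubsampleDatasetWrapper

# Toy dataset with 10 items, each a 1-tuple holding a scalar tensor.
base = TensorDataset(torch.arange(10))

# Repeat the dataset 3 times: indices wrap around the original length.
repeated = RepeatedDatasetWrapper(base, num_repeats=3)
print(len(repeated))   # 30

# Randomly keep 5 of the 10 items; the NumPy seed fixes the permutation.
subsampled = SubsampleDatasetWrapper(base, dataset_size=5, seed=0)
print(len(subsampled))  # 5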
fourm/data/image_augmenter.py
ADDED
@@ -0,0 +1,186 @@
+# Copyright 2024 EPFL and Apple Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import random
+from abc import ABC, abstractmethod
+
+import numpy as np
+import torchvision
+
+from fourm.utils import to_2tuple
+
+
+class AbstractImageAugmenter(ABC):
+    """Abstract class for image augmenters.
+    """
+
+    @abstractmethod
+    def __call__(self, mod_dict, crop_settings):
+        pass
+
+
+class RandomCropImageAugmenter(AbstractImageAugmenter):
+
+    def __init__(self, target_size=224, hflip=0.5, crop_scale=(0.2, 1.0), crop_ratio=(0.75, 1.3333), main_domain='rgb'):
+
+        self.target_size = to_2tuple(target_size)
+        self.hflip = hflip
+        self.crop_scale = crop_scale
+        self.crop_ratio = crop_ratio
+        self.main_domain = main_domain
+
+    def __call__(self, mod_dict, crop_settings):
+
+        if crop_settings is not None:
+            raise ValueError("Crop settings are provided but not used by this augmenter.")
+
+        image = mod_dict[self.main_domain] if self.main_domain is not None else mod_dict[list(mod_dict.keys())[0]]
+        # With torchvision 0.13+, can also be: orig_size = TF.get_dimensions(image)
+        orig_width, orig_height = image.size
+        orig_size = (orig_height, orig_width)
+
+        top, left, h, w = torchvision.transforms.RandomResizedCrop.get_params(
+            image, scale=self.crop_scale, ratio=self.crop_ratio
+        )
+        crop_coords = top, left, h, w
+        flip = random.random() < self.hflip
+        rand_aug_idx = None
+
+        return crop_coords, flip, orig_size, self.target_size, rand_aug_idx
+
+class NoImageAugmenter(AbstractImageAugmenter): # this is for non-image modalities like poses where we don't do any augs, e.g. during tokenization
+
+    def __init__(self, no_aug=True, main_domain='human_poses'):
+        self.target_size = None #to_2tuple(target_size)
+        self.no_aug = no_aug
+        self.main_domain = main_domain
+
+    def __call__(self, mod_dict, crop_settings):
+        # # With torchvision 0.13+, can also be: orig_size = TF.get_dimensions(image)
+        orig_size = (224, 224)
+
+        rand_aug_idx = 0
+        top, left, h, w, flip = 0, 0, 224, 224, 0
+        crop_coords = (top, left, h, w)
+
+        return crop_coords, flip, orig_size, self.target_size, rand_aug_idx
+
+class PreTokenizedImageAugmenter(AbstractImageAugmenter):
+
+    def __init__(self, target_size, no_aug=False, main_domain='rgb'):
+        self.target_size = to_2tuple(target_size)
+        self.no_aug = no_aug
+        self.main_domain = main_domain
+
+    def __call__(self, mod_dict, crop_settings):
+        # With torchvision 0.13+, can also be: orig_size = TF.get_dimensions(image)
+        if self.main_domain in mod_dict and 'tok' not in self.main_domain:
+            image = mod_dict[self.main_domain] if self.main_domain is not None else mod_dict[list(mod_dict.keys())[0]]
+            orig_width, orig_height = image.size
+            orig_size = (orig_height, orig_width)
+        else:
+            orig_size = None
+
+        rand_aug_idx = 0 if self.no_aug else np.random.randint(len(crop_settings))
+        top, left, h, w, flip = crop_settings[rand_aug_idx]
+        crop_coords = (top, left, h, w)
+
+        return crop_coords, flip, orig_size, self.target_size, rand_aug_idx
+
+
+class CenterCropImageAugmenter(AbstractImageAugmenter):
+    def __init__(self, target_size, hflip=0.0, main_domain='rgb'):
+        self.target_size = to_2tuple(target_size)
+        self.hflip = hflip
+        self.main_domain = main_domain
+
+    def __call__(self, mod_dict, crop_settings=None):
+        image = mod_dict[self.main_domain] if self.main_domain is not None else mod_dict[list(mod_dict.keys())[0]]
+        orig_width, orig_height = image.size
+        orig_size = (orig_height, orig_width)
+
+        if orig_height > orig_width:
+            h = w = orig_width
+            top = (orig_height - orig_width) // 2
+            left = 0
+        else:
+            h = w = orig_height
+            top = 0
+            left = (orig_width - orig_height) // 2
+
+        crop_coords = (top, left, h, w)
+        flip = random.random() < self.hflip
+        rand_aug_idx = None
+
+        return crop_coords, flip, orig_size, self.target_size, rand_aug_idx
+
+
+class PaddingImageAugmenter(AbstractImageAugmenter):
+    def __init__(self, target_size, hflip=0.0, main_domain='rgb'):
+        self.target_size = to_2tuple(target_size)
+        self.hflip = hflip
+        self.main_domain = main_domain
+
+    def __call__(self, mod_dict, crop_settings):
+        image = mod_dict[self.main_domain] if self.main_domain is not None else mod_dict[list(mod_dict.keys())[0]]
+        orig_width, orig_height = image.size
+        orig_size = (orig_height, orig_width)
+
+        h = w = max(orig_width, orig_height)
+        top = left = 0
+        crop_coords = (top, left, h, w)
+        flip = random.random() < self.hflip
+        rand_aug_idx = None
+
+        return crop_coords, flip, orig_size, self.target_size, rand_aug_idx
+
+
+class ScaleJitteringImageAugmenter(AbstractImageAugmenter):
+    def __init__(self, target_size, hflip=0.0, scale=(0.1, 2.0), main_domain='rgb'):
+        self.target_size = to_2tuple(target_size)
+        self.hflip = hflip
+        self.scale = scale
+        self.main_domain = main_domain
+
+    def scale_jitter(self, orig_height, orig_width):
+        rand_scale = np.random.uniform(self.scale[0], self.scale[1])
+        max_hw = max(orig_height, orig_width)
+        h = w = round(max_hw / rand_scale)
+        top = round(max(0, np.random.uniform(0, orig_height - h)))
+        left = round(max(0, np.random.uniform(0, orig_width - w)))
+
+        return top, left, h, w
+
+    def __call__(self, mod_dict, crop_settings):
+
+        if crop_settings is not None:
+            raise ValueError("Crop settings are provided but not used by this augmenter.")
+
+        image = mod_dict[self.main_domain] if self.main_domain is not None else mod_dict[list(mod_dict.keys())[0]]
+        # With torchvision 0.13+, can also be: orig_size = TF.get_dimensions(image)
+        orig_width, orig_height = image.size
+        orig_size = (orig_height, orig_width)
+
+        crop_coords = self.scale_jitter(orig_height, orig_width)
+        flip = random.random() < self.hflip
+        rand_aug_idx = None
+
+        return crop_coords, flip, orig_size, self.target_size, rand_aug_idx
+
+
+class EmptyAugmenter(AbstractImageAugmenter):
+    def __init__(self):
+        pass
+
+    def __call__(self, mod_dict, crop_settings):
+        return None, None, None, None, None
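A hedged example of how one of these augmenters might be called. The blank 640x480 PIL image and the 'rgb' key are assumptions mirroring the main_domain default; note the call returns crop parameters rather than a transformed image.

from PIL import Image

from fourm.data.image_augmenter import RandomCropImageAugmenter

augmenter = RandomCropImageAugmenter(target_size=224, hflip=0.5)
mod_dict = {'rgb': Image.new('RGB', (640, 480))}  # hypothetical placeholder image

# crop_settings must be None here, otherwise this augmenter raises a ValueError.
crop_coords, flip, orig_size, target_size, rand_aug_idx = augmenter(mod_dict, crop_settings=None)
print(crop_coords, flip, orig_size, target_size, rand_aug_idx)
# e.g. (top, left, h, w), a bool flip flag, (480, 640), (224, 224), None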
fourm/data/masking.py
ADDED
@@ -0,0 +1,747 @@
+# Copyright 2024 EPFL and Apple Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import random
+from typing import Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+from tokenizers import Tokenizer
+from torch.distributions import Dirichlet
+
+from fourm.data.modality_transforms import get_transform_key
+from fourm.utils import to_2tuple
+from fourm.utils.tokenizer import get_sentinel_to_id_mapping
+
+
+def sample_cosine(min_val: float = 0, max_val: float =1) -> float:
+    """Sample a value from a cosine distribution between min_val and max_val
+
+    Args:
+        min_val: Minimum value
+        max_val: Maximum value
+
+    Returns:
+        Sampled value
+    """
+
+    return min_val + 0.5 * (max_val - min_val) * (1 + math.cos(math.pi * random.uniform(0, 1)))
+
+
+def sample_uniform(min_val: float = 0, max_val: float =1) -> float:
+    """Sample a value from a uniform distribution between min_val and max_val
+
+    Args:
+        min_val: Minimum value
+        max_val: Maximum value
+
+    Returns:
+        Sampled value
+    """
+
+    return random.uniform(min_val, max_val)
+
+
+def simple_span_masking(sequence: List[int], sentinel_to_id: Dict[int, int], keep_prob: float) -> Tuple[List[int], List[int]]:
+    """Span masking for a sequence
+
+    Args:
+        sequence: Sequence to mask
+        sentinel_to_id: Mapping from sentinel to id
+        keep_prob: Probability of keeping a token
+
+    Returns:
+        Masked input sequence and masked target sequence
+    """
+    sequence_length = len(sequence)
+    # 0 for keep, 1 for mask
+    masks = torch.where(torch.rand(sequence_length) <= keep_prob, 0, 1).bool().tolist()
+
+    input_sequence = []
+    target_sequence = []
+
+    prev_mask = False
+    sentinel_count = 0
+    for token, mask in zip(sequence, masks):
+        if mask:
+            if not prev_mask:
+                sentinel_count += 1
+                input_sequence.append(sentinel_to_id[sentinel_count])
+                target_sequence.append(sentinel_to_id[sentinel_count])
+            prev_mask = True
+            target_sequence.append(token)
+        else:
+            prev_mask = False
+            input_sequence.append(token)
+
+    target_sequence.append(sentinel_to_id[sentinel_count + 1])
+    return input_sequence, target_sequence
+
+
+def chunk_span_masking(sequence_chunks: List[List[int]], sentinel_to_id: Dict[int, int], keep_prob: float) -> Tuple[List[int], List[int]]:
+    """Span masking where masking is performed at the chunk level.
+
+    Args:
+        sequence_chunks: Sequence chunks to mask
+        sentinel_to_id: Mapping from sentinel to id
+        keep_prob: Probability of keeping a token
+
+    Returns:
+        Masked input sequence and masked target sequence
+    """
+    chunk_length = len(sequence_chunks)
+    # 0 for keep, 1 for mask
+    masks = torch.where(torch.rand(chunk_length) <= keep_prob, 0, 1).bool().tolist()
+
+    input_sequence = []
+    target_sequence = []
+
+    prev_mask = False
+    sentinel_count = 0
+    for chunk, mask in zip(sequence_chunks, masks):
+        if mask:
+            if not prev_mask:
+                sentinel_count += 1
+                input_sequence.append(sentinel_to_id[sentinel_count])
+                target_sequence.append(sentinel_to_id[sentinel_count])
+            prev_mask = True
+            target_sequence.extend(chunk)
+        else:
+            prev_mask = False
+            input_sequence.extend(chunk)
+
+    target_sequence.append(sentinel_to_id[sentinel_count + 1])
+    return input_sequence, target_sequence
+
+
+
+class UnifiedMasking(object):
+    def __init__(self,
+                 modality_info: Dict,
+                 text_tokenizer: Optional[Tokenizer],
+                 input_tokens_range: Union[int, Tuple[int, int]],
+                 target_tokens_range: Optional[Union[int, Tuple[int, int]]],
+                 max_tries: int = 100,
+                 sampling_weights: Optional[List[float]] = None,):
+        """Performs masking on a dict of modalities (both image based and sequence based modalities)
+
+        Args:
+            modality_info: Dict with the modalities and their corresponding information
+            text_tokenizer: Tokenizer to use for text modalities
+            input_tokens_range: Range of number of tokens to mask in the input
+            target_tokens_range: Range of number of tokens to mask in the target
+            max_tries: Maximum number of tries to find a valid token budgets
+            sampling_weights: Sampling weights for the mixture of Dirichlet distributions
+        """
+        self.input_tokens_range = to_2tuple(input_tokens_range)
+        self.target_tokens_range = to_2tuple(target_tokens_range) if target_tokens_range is not None else None
+        self.modality_info = modality_info
+        self.num_modalities = len(modality_info)
+        self.max_tries = max_tries
+        self.min_tokens = torch.tensor([mod['min_tokens'] for mod in modality_info.values()])
+        self.max_tokens = torch.tensor([mod['max_tokens'] for mod in modality_info.values()])
+        self.mod_is_img = torch.tensor([mod['type'] == 'img' for mod in modality_info.values()])
+
+        # Dirichlet sampling (supports a mixture of multiple Dirichlet distributions)
+        eps = 1e-9
+        input_alphas = torch.tensor([mod["input_alphas"] for mod in modality_info.values()])
+        input_alphas = rearrange(input_alphas, "nmod nmix -> nmix nmod")
+        self.input_dirichlets = [Dirichlet(torch.clamp(input_alpha, min=eps)) for input_alpha in input_alphas]
+        target_alphas = torch.tensor([mod["target_alphas"] for mod in modality_info.values()])
+        target_alphas = rearrange(target_alphas, "nmod nmix -> nmix nmod")
+        self.target_dirichlets = [Dirichlet(torch.clamp(target_alpha, min=eps)) for target_alpha in target_alphas]
+        assert(len(self.input_dirichlets) == len(self.target_dirichlets))
+        self.num_dirichlets = len(self.input_dirichlets)
+        if sampling_weights is not None:
+            assert len(sampling_weights) == self.num_dirichlets
+            self.sampling_weights = torch.tensor(sampling_weights)
+        else:
+            self.sampling_weights = None
+
+        self.text_tokenizer = text_tokenizer
+        self.keep_prob_decay_factor = 0.9
+        self.sentinel_to_id = get_sentinel_to_id_mapping(text_tokenizer)
+        self.sentinel_ids = set(self.sentinel_to_id.values())
+        self.pad_id = text_tokenizer.token_to_id("[PAD]")
+        self.eos_id = text_tokenizer.token_to_id("[EOS]")
+
+    def input_token_budget(self, num_input_tokens, dir_idx=0):
+        """Sample a token budget for the input
+
+        Args:
+            num_input_tokens: Number of tokens in the input
+
+        Returns:
+            Token budget for the input
+        """
+        # Get the number of tokens for each modality
+        for i in range(self.max_tries):
+            input_token_budget = (self.input_dirichlets[dir_idx].sample() * num_input_tokens).floor().int()
+            diff = num_input_tokens - input_token_budget.sum()
+            # Adds the remaining tokens by sampling from the Dirichlet and taking the argmax
+            # This avoids adding tokens to modalities that shouldn't be sampled (i.e. with alphas ~=0)
+            input_token_budget += torch.bincount(self.input_dirichlets[dir_idx].sample_n(diff).argmax(dim=-1), minlength=len(input_token_budget))
+
+            # If token budget is over max tokens for a given modality, set it to max
+            input_token_budget = torch.clamp(input_token_budget, max=self.max_tokens)
+
+            if (input_token_budget >= self.min_tokens).all():
+                return input_token_budget.tolist()
+
+        print(f"More than max tries for input!")
+        return input_token_budget.tolist()
+
+    def target_token_budget(self, input_token_budget, num_target_tokens, dir_idx=0):
+        """Sample a token budget for the target
+
+        Args:
+            input_token_budget: Token budget for the input
+            num_target_tokens: Number of tokens in the target
+
+        Returns:
+            Token budget for the target
+        """
+        # We don't reduce the number of tokens for sequence based tasks
+        max_tokens_remaining = torch.where(self.mod_is_img, self.max_tokens - torch.tensor(input_token_budget), self.max_tokens)
+        max_tokens_remaining = torch.max(self.min_tokens, max_tokens_remaining)
+        for i in range(self.max_tries):
+            target_token_budget = (self.target_dirichlets[dir_idx].sample() * num_target_tokens).floor().int()
+            diff = num_target_tokens - target_token_budget.sum()
+            # Adds the remaining tokens by sampling from the Dirichlet and taking the argmax
+            # This avoids adding tokens to modalities that shouldn't be sampled (i.e. with alphas ~=0)
+            target_token_budget += torch.bincount(self.target_dirichlets[dir_idx].sample_n(diff).argmax(dim=-1), minlength=len(target_token_budget))
+
+            # If token budget is over max tokens for a given modality, set it to max
+            target_token_budget = torch.clamp(target_token_budget, max=max_tokens_remaining)
+
+            if (target_token_budget >= self.min_tokens).all():
+                return target_token_budget.tolist()
+
+        print(f"More than max tries for target!")
+        return target_token_budget.tolist()
+
+    def image_mask(self, tensor: torch.Tensor, num_tokens: int, input_budget: int, target_budget: int):
+        """Applies input and target masking to an image tensor
+
+        Args:
+            tensor: Image tensor
+            num_tokens: Number of tokens in the tensor
+            input_budget: Token budget for the input
+            target_budget: Token budget for the target
+
+        Returns:
+            Dictionary containing the masked image tensor, the input mask, the target mask, and the decoder attention mask
+        """
+        noise = torch.rand(num_tokens)
+        ids_shuffle = torch.argsort(noise, dim=0)
+
+        input_mask = torch.ones(num_tokens, dtype=torch.bool)
+        input_mask[:input_budget] = 0
+        input_mask = torch.gather(input_mask, dim=0, index=ids_shuffle)
+
+        if target_budget is None:
+            target_mask = ~input_mask
+        else:
+            target_mask = torch.ones(num_tokens, dtype=torch.bool)
+            target_mask[input_budget:input_budget + target_budget] = 0
+            target_mask = torch.gather(target_mask, dim=0, index=ids_shuffle)
+
+        decoder_attention_mask = torch.zeros(num_tokens, dtype=torch.int)
+        first_mask_token = torch.argmin(target_mask + torch.arange(target_mask.shape[0], device=target_mask.device) * 1e-6)
+        decoder_attention_mask[first_mask_token] = (~target_mask).sum() # Equiv. to target budget
+
+        return {"tensor": tensor, "input_mask": input_mask, "target_mask": target_mask, "decoder_attention_mask": decoder_attention_mask}
+
+    def sequence_token_mask(self, sequence_ids: str, max_tokens: int, input_budget: int, target_budget: int, keep_scheme: str, vocab_offset: int):
+        """Applies input and target masking to a sequence of tokens (e.g. DINOv2 global tokens)
+        The keep probability is sampled from a cosine schedule and does not depend on the number of tokens in the sequence.
+        If the keep probability results in a sequence that is too long, then it is lowered until the sequence is short enough.
+
+        Args:
+            sequence_ids: Sequence ids
+            max_tokens: Maximum number of tokens in the sequence
+            input_budget: Token budget for the input
+            target_budget: Token budget for the target
+            keep_scheme: Scheme for sampling the keep probability
+            vocab_offset: Offset to avoid overlap with sentinel tokens
+
+        Returns:
+            Dictionary containing the masked sequence tensor, the input mask, the target mask, and the decoder attention mask
+        """
+        seq_ids = sequence_ids
+        seq_ids = seq_ids + vocab_offset # Avoid overlap with sentinel tokens (needs to be substracted after decoding)
+
+        # If input budget is 0, treat it as if the whole sequence is completely masked
+        if input_budget == 0:
+            keep_prob = 0.
+            input_seq_ids = []
+            _, target_seq_ids = simple_span_masking(seq_ids, self.sentinel_to_id, keep_prob)
+        else:
+            if keep_scheme == 'random':
+                keep_prob = sample_uniform(0, 1)
+            elif keep_scheme == 'all':
+                keep_prob = 1.0
+            elif keep_scheme == 'binary':
+                keep_prob = random.choice([0., 1.])
+            else:
+                raise ValueError(f"Invalid keep scheme for sequence masking: {keep_scheme}")
+
+            input_seq_ids, target_seq_ids = simple_span_masking(seq_ids, self.sentinel_to_id, keep_prob)
+            # Keep lowering the keep_prob while we are over-budget
+            while len(input_seq_ids) > input_budget:
+                keep_prob = keep_prob * self.keep_prob_decay_factor
+                input_seq_ids, target_seq_ids = simple_span_masking(seq_ids, self.sentinel_to_id, keep_prob)
+
+        # Span masking can add up to (max_tokens + 1) * 2 tokens for input + target
+        max_length = (max_tokens + 1) * 2
+        tensor = torch.ones(max_length, dtype=torch.int) * self.pad_id
+        input_mask = torch.ones(max_length, dtype=torch.bool)
+        target_mask = torch.ones(max_length, dtype=torch.bool)
+        decoder_attention_mask = torch.zeros(max_length, dtype=torch.int)
+
+        # Set input and input mask
+        tensor[:len(input_seq_ids)] = torch.tensor(input_seq_ids, dtype=torch.int)
+        input_mask[:len(input_seq_ids)] = 0
+
+        if target_budget is None or len(target_seq_ids) <= target_budget:
+            tensor[input_budget:input_budget + len(target_seq_ids)] = torch.tensor(target_seq_ids, dtype=torch.int)
+            target_mask[input_budget:input_budget + len(target_seq_ids)] = 0
+            decoder_attention_mask[input_budget:input_budget + len(target_seq_ids)] = 1
+        else:
+            # Randomly choose sentinel token.
+            sentinel_indices = [i for i, token_id in enumerate(target_seq_ids) if token_id in self.sentinel_ids]
+            # If there is more than 1 sentinel, avoid sampling the very last one which indicates the end of the sequence
+            chosen_sentinel = np.random.randint(max(1, len(sentinel_indices) - 1))
+            # If length starting at this token g.t. budget, truncate until budget is reached
+            if len(target_seq_ids) - sentinel_indices[chosen_sentinel] >= target_budget:
+                target_seq_ids = target_seq_ids[sentinel_indices[chosen_sentinel]:sentinel_indices[chosen_sentinel] + target_budget]
+            # Otherwise, select earliest sentinel token such that we don't go over budget
+            # Note: We could also use the randomly chosen sentinel token, but that would waste budget
+            else:
+                for idx in sentinel_indices:
+                    if len(target_seq_ids) - idx <= target_budget:
+                        target_seq_ids = target_seq_ids[idx:]
+                        break
+
+            tensor[input_budget:input_budget + len(target_seq_ids)] = torch.tensor(target_seq_ids, dtype=torch.int)
+            target_mask[input_budget:input_budget + len(target_seq_ids)] = 0
+            decoder_attention_mask[input_budget:input_budget + len(target_seq_ids)] = 1
+
+        return {"tensor": tensor, "input_mask": input_mask, "target_mask": target_mask, "decoder_attention_mask": decoder_attention_mask}
+
+    def sequence_mask(self, sequence: Union[str, List[str]], max_tokens: int, input_budget: int, target_budget: int, keep_scheme: str):
+        """Applies input and target masking to a sequence
+
+        The keep probability is sampled from a cosine schedule and does not depend on the number of tokens in the sequence.
+        If the keep probability results in a sequence that is too long, then it is lowered until the sequence is short enough.
+
+        Args:
+            sequence: Sequence, can be either a str or list of strings
+            max_tokens: Maximum number of tokens in the sequence
+            input_budget: Token budget for the input
+            target_budget: Token budget for the target
+            keep_scheme: Scheme for sampling the keep probability
+
+        Returns:
+            Dictionary containing the masked sequence tensor, the input mask, the target mask, and the decoder attention mask
+        """
+        if isinstance(sequence, str):
+            # Tokenize the sequence and get the ids
+            seq_ids: List[int] = self.text_tokenizer.encode(sequence).ids
+            # Add EOS to all sequences
+            seq_ids.append(self.eos_id)
+            # Truncate sequence
+            seq_ids = seq_ids[:max_tokens]
+
+            # Use default span masking
+            span_masking_fn = simple_span_masking
+
+        elif isinstance(sequence, list):
+            # Tokenize the sequence chunks and get the ids
+            encoded_seq_chunks = self.text_tokenizer.encode_batch(sequence)
+            seq_ids: List[List[int]] = [seq.ids for seq in encoded_seq_chunks]
+            # Add EOS as an extra chunk
+            seq_ids.append([self.eos_id])
+            # Truncate sequence to keep all chunks below max token length
+            cumulative_token_count = np.cumsum(np.array([len(chunk) for chunk in seq_ids]))
+            seq_ids = [chunk for (chunk, token_count) in zip(seq_ids, cumulative_token_count) if token_count <= max_tokens]
+
+            # Span mask over chunks
+            span_masking_fn = chunk_span_masking
+
+        else:
+            raise ValueError(f"Invalid sequence: {sequence}")
+
+
+        # If input budget is 0, treat it as if the whole sequence is completely masked
+        if input_budget == 0:
+            keep_prob = 0.
+            input_seq_ids = []
+            _, target_seq_ids = span_masking_fn(seq_ids, self.sentinel_to_id, keep_prob)
+        else:
+            if keep_scheme == 'random':
+                keep_prob = sample_uniform(0, 1)
+            elif keep_scheme == 'all':
+                keep_prob = 1.0
+            elif keep_scheme == 'binary':
+                keep_prob = random.choice([0., 1.])
+            else:
+                raise ValueError(f"Invalid keep scheme for sequence masking: {keep_scheme}")
+
+            input_seq_ids, target_seq_ids = span_masking_fn(seq_ids, self.sentinel_to_id, keep_prob)
+            # Keep lowering the keep_prob while we are over-budget
+            while len(input_seq_ids) > input_budget:
+                keep_prob = keep_prob * self.keep_prob_decay_factor
+                input_seq_ids, target_seq_ids = span_masking_fn(seq_ids, self.sentinel_to_id, keep_prob)
+
+        # Span masking can add up to (max_tokens + 1) * 2 tokens for input + target
+        max_length = (max_tokens + 1) * 2
+        tensor = torch.ones(max_length, dtype=torch.int) * self.pad_id
+        input_mask = torch.ones(max_length, dtype=torch.bool)
+        target_mask = torch.ones(max_length, dtype=torch.bool)
+        decoder_attention_mask = torch.zeros(max_length, dtype=torch.int)
+
+        # Set input and input mask
+        tensor[:len(input_seq_ids)] = torch.tensor(input_seq_ids, dtype=torch.int)
+        input_mask[:len(input_seq_ids)] = 0
+
+        if target_budget is None or len(target_seq_ids) <= target_budget:
+            tensor[input_budget:input_budget + len(target_seq_ids)] = torch.tensor(target_seq_ids, dtype=torch.int)
+            target_mask[input_budget:input_budget + len(target_seq_ids)] = 0
+            decoder_attention_mask[input_budget:input_budget + len(target_seq_ids)] = 1
+        else:
+            # Randomly choose sentinel token.
+            sentinel_indices = [i for i, token_id in enumerate(target_seq_ids) if token_id in self.sentinel_ids]
+            # If there is more than 1 sentinel, avoid sampling the very last one which indicates the end of the sequence
+            chosen_sentinel = np.random.randint(max(1, len(sentinel_indices) - 1))
+            # If length starting at this token g.t. budget, truncate until budget is reached
+            if len(target_seq_ids) - sentinel_indices[chosen_sentinel] >= target_budget:
+                target_seq_ids = target_seq_ids[sentinel_indices[chosen_sentinel]:sentinel_indices[chosen_sentinel] + target_budget]
+            # Otherwise, select earliest sentinel token such that we don't go over budget
+            # Note: We could also use the randomly chosen sentinel token, but that would waste budget
+            else:
+                for idx in sentinel_indices:
+                    if len(target_seq_ids) - idx <= target_budget:
+                        target_seq_ids = target_seq_ids[idx:]
+                        break
+
+            tensor[input_budget:input_budget + len(target_seq_ids)] = torch.tensor(target_seq_ids, dtype=torch.int)
+            target_mask[input_budget:input_budget + len(target_seq_ids)] = 0
+            decoder_attention_mask[input_budget:input_budget + len(target_seq_ids)] = 1
+
+        return {"tensor": tensor, "input_mask": input_mask, "target_mask": target_mask, "decoder_attention_mask": decoder_attention_mask}
+
+
+    def sequence_emb_mask_span(self, emb_tensor: torch.Tensor, max_tokens: int, input_budget: int, target_budget: int, keep_scheme: str):
+        """Applies input masking to an sequence embedding tensor, target masking is not supported with sequence embeddings
+
+        Args:
+            emb_tensor: Sequence embedding tensor
+            max_tokens: Maximum number of tokens in the sequence
+            input_budget: Token budget for the input
+            target_budget: Token budget for the target (unused for now)
+            keep_scheme: Scheme for sampling the keep probability
+
+        Returns:
+            Dictionary containing the masked sequence embedding tensor, the input mask, the target mask, and the decoder attention mask
+        """
+        # Only supported as input modality now
+
+        # Make fake seq ids for sequence embeddings to reuse simple_span_masking function
+        fake_seq_ids = []
+        emb_dict = {}
+        id_num = len(self.sentinel_ids)
+        emb_ind = 0
+        while(len(fake_seq_ids) < len(emb_tensor)):
+            if id_num not in self.sentinel_ids: # replace with T5 sentinel_id
+                fake_seq_ids.append(id_num)
+                emb_dict[id_num] = emb_tensor[emb_ind, :]
+                emb_ind += 1
+            id_num += 1
+
+        # Truncate sequence
+        fake_seq_ids = fake_seq_ids[:max_tokens]
+
+        # If input budget is 0, treat it as if the whole sequence is completely masked
+        if input_budget == 0:
+            keep_prob = 0.
+            fake_input_seq_ids = []
+            _, fake_target_seq_ids = simple_span_masking(fake_seq_ids, self.sentinel_to_id, keep_prob)
+        else:
+            if keep_scheme == 'random':
+                keep_prob = sample_uniform(0, 1)
+            elif keep_scheme == 'all':
+                keep_prob = 1.0
+            elif keep_scheme == 'binary':
+                keep_prob = random.choice([0., 1.])
+            else:
+                raise ValueError(f"Invalid keep scheme for sequence masking: {keep_scheme}")
+
+            fake_input_seq_ids, fake_target_seq_ids = simple_span_masking(fake_seq_ids, self.sentinel_to_id, keep_prob)
+            # Keep lowering the keep_prob while we are over-budget
+            while len(fake_input_seq_ids) > input_budget:
+                keep_prob = keep_prob * self.keep_prob_decay_factor
+                fake_input_seq_ids, fake_target_seq_ids = simple_span_masking(fake_seq_ids, self.sentinel_to_id, keep_prob)
+
+        # Span masking can add up to max_tokens tokens for input
+        max_length = max_tokens
+        tensor = torch.zeros((max_length, emb_tensor.shape[1]), dtype=torch.float32)
+        input_mask = torch.ones(max_length, dtype=torch.bool)
+        target_mask = torch.ones(max_length, dtype=torch.bool)
+        decoder_attention_mask = torch.zeros(max_length, dtype=torch.int)
+
+        # Put tensor values back based on the fake seq ids
+        for i_, fake_id in enumerate(fake_input_seq_ids):
+            if fake_id in self.sentinel_ids:
+                tensor[i_, :] = torch.zeros_like(emb_tensor[0,:]) # TODO replace to learned embeddings later
+            else:
+                tensor[i_, :] = emb_dict[fake_id]
+
+        # Set input and input mask
+        input_mask[:len(fake_input_seq_ids)] = 0
+
+        return {"tensor": tensor, "input_mask": input_mask, "target_mask": target_mask, "decoder_attention_mask": decoder_attention_mask}
+
+
+    def __call__(self, mod_dict):
+        """Applies input and target masking to a dictionary of modalities
+
+        Args:
+            mod_dict: Dictionary of modalities
+
+        Returns:
+            Dictionary containing the masked modalities
+        """
+        if self.sampling_weights is not None:
+            # Sample masking scheme according to a list of weights
+            dir_idx = torch.multinomial(self.sampling_weights, 1).item()
+        else:
+            # Randomly sample masking scheme
+            dir_idx = random.randint(0, self.num_dirichlets - 1)
+
+        num_input_tokens = random.randint(*self.input_tokens_range)
+        num_target_tokens = random.randint(*self.target_tokens_range) if self.target_tokens_range is not None else None
+
+        input_token_budget = self.input_token_budget(num_input_tokens, dir_idx)
+
+        if num_target_tokens is not None:
+            target_token_budget = self.target_token_budget(input_token_budget, num_target_tokens, dir_idx)
+        else:
+            target_token_budget = [None] * self.num_modalities
+
+        masked_mod_dict = {}
+        for (mod_name, mod_info), input_budget, target_budget in zip(self.modality_info.items(), input_token_budget, target_token_budget):
+            mod_type = mod_info['type']
+            mod_name_load = mod_name if mod_name in mod_dict else get_transform_key(mod_name)
+            if mod_type == 'img':
+                masked_mod_dict[mod_name] = self.image_mask(mod_dict[mod_name_load], mod_info['max_tokens'], input_budget, target_budget)
+            elif mod_type == 'seq':
+                keep_scheme = 'random' if ('keep' not in mod_info) else mod_info['keep'][dir_idx]
+                masked_mod_dict[mod_name] = self.sequence_mask(mod_dict[mod_name_load], mod_info['max_tokens'], input_budget, target_budget, keep_scheme)
+            elif mod_type == 'seq_token':
+                keep_scheme = 'random' if ('keep' not in mod_info) else mod_info['keep'][dir_idx]
+                vocab_offset = mod_info.get('vocab_offset', 0) # Check if any space is allocated to sentinel tokens and other special tokens
+                masked_mod_dict[mod_name] = self.sequence_token_mask(mod_dict[mod_name_load], mod_info['max_tokens'], input_budget, target_budget, keep_scheme, vocab_offset=vocab_offset)
+            elif mod_type == "seq_emb":
+                keep_scheme = 'random' if ('keep' not in mod_info) else mod_info['keep'][dir_idx]
+                masked_mod_dict[mod_name] = self.sequence_emb_mask_span(mod_dict[mod_name_load], mod_info['max_tokens'], input_budget, target_budget, keep_scheme)
+            else:
+                raise ValueError(f"Invalid modality type: {mod_type}")
+
+        return masked_mod_dict
+
+
+class TransferMasking(object):
+    def __init__(self,
+                 modality_info: Dict,
+                 text_tokenizer: Optional[Tokenizer],
+                 input_modalities: List[str],
+                 target_modalities: List[str]):
+        """Performs masking for transfer on a dict of modalities (both image based and sequence based modalities),
+        by specifying which modalities are inputs and which are targets.
+
+        Args:
+            modality_info: Dict with the modalities and their corresponding information
+            text_tokenizer: Tokenizer to use for text modalities
+            input_modalities: List of modalities to use as input
+            target_modalities: List of modalities to use as target
+        """
+        self.modality_info = modality_info
+        self.num_modalities = len(modality_info)
+        self.min_tokens = torch.tensor([mod['min_tokens'] for mod in modality_info.values()])
+        self.max_tokens = torch.tensor([mod['max_tokens'] for mod in modality_info.values()])
+        self.mod_is_img = torch.tensor([mod['type'] == 'img' for mod in modality_info.values()])
+
+        self.input_modalities = set(input_modalities)
+        self.target_modalities = set(target_modalities)
+
+        # Tokenizer for text modalities
+        self.text_tokenizer = text_tokenizer
+        if self.text_tokenizer is not None:
+            self.keep_prob_decay_factor = 0.9
+            self.sentinel_to_id = get_sentinel_to_id_mapping(text_tokenizer)
+            self.sentinel_ids = set(self.sentinel_to_id.values())
+            self.pad_id = text_tokenizer.token_to_id("[PAD]")
+            self.eos_id = text_tokenizer.token_to_id("[EOS]")
+
+
+    def input_image(self, tensor: torch.Tensor, num_tokens: int):
+        """Applies masking for an image given as input
+
+        Args:
+            tensor: Image tensor
+            num_tokens: Number of tokens in the tensor
+
+        Returns:
+            Dictionary containing the masked image tensor, the input mask, the target mask, and the decoder attention mask
+        """
+
+        # Input mask
+        input_mask = torch.zeros(num_tokens, dtype=torch.bool)
+        # Target mask
+        target_mask = torch.ones(num_tokens, dtype=torch.bool)
+        # Decoder attention mask
+        decoder_attention_mask = torch.zeros(num_tokens, dtype=torch.int)
+
+        return {"tensor": tensor, "input_mask": input_mask, "target_mask": target_mask, "decoder_attention_mask": decoder_attention_mask}
+
+    def target_image(self, tensor: torch.Tensor, num_tokens: int):
+        """Applies masking for an image given as target
+
+        Args:
+            tensor: Image tensor
+            num_tokens: Number of tokens in the tensor
+
+        Returns:
+            Dictionary containing the masked image tensor, the input mask, the target mask, and the decoder attention mask
+        """
+
+        # Input mask
+        input_mask = torch.ones(num_tokens, dtype=torch.bool)
+        # Target mask
+        target_mask = torch.zeros(num_tokens, dtype=torch.bool)
+        # Decoder attention mask
+        decoder_attention_mask = torch.zeros(num_tokens, dtype=torch.int)
+        decoder_attention_mask[0] = num_tokens
+
+        return {"tensor": tensor, "input_mask": input_mask, "target_mask": target_mask, "decoder_attention_mask": decoder_attention_mask}
+
+
+    def input_sequence(self, sequence_str: str, max_tokens: int):
+        """Applies masking for a sequence given as input
+
+        Args:
+            sequence_str: Sequence string
+            max_tokens: Maximum number of tokens in the sequence
+
+        Returns:
+            Dictionary containing the masked sequence string, the input mask, the target mask, and the decoder attention mask
+        """
+        # Tokenize the text and get the ids
+        seq_ids = self.text_tokenizer.encode(sequence_str).ids
+        # Add EOS to all sequences
+        seq_ids.append(self.eos_id)
+        # Truncate sequence
+        seq_ids = seq_ids[:max_tokens]
+
+        keep_prob = 1.
+        input_seq_ids, target_seq_ids = simple_span_masking(seq_ids, self.sentinel_to_id, keep_prob)
+
+        # Span masking can add up to (max_tokens + 1) * 2 tokens for input + target
+        max_length = (max_tokens + 1) * 2
+        tensor = torch.ones(max_length, dtype=torch.int) * self.pad_id
+        input_mask = torch.ones(max_length, dtype=torch.bool)
+        target_mask = torch.ones(max_length, dtype=torch.bool)
+        decoder_attention_mask = torch.zeros(max_length, dtype=torch.int)
+
+        # Set input and input mask
+        tensor[:len(input_seq_ids)] = torch.tensor(input_seq_ids, dtype=torch.int)
+        input_mask[:len(input_seq_ids)] = 0
+
+        tensor[max_tokens:max_tokens + len(target_seq_ids)] = torch.tensor(target_seq_ids, dtype=torch.int)
+        target_mask[max_tokens:max_tokens + len(target_seq_ids)] = 0
+        decoder_attention_mask[max_tokens:max_tokens + len(target_seq_ids)] = 1
+
+
+        return {"tensor": tensor, "input_mask": input_mask, "target_mask": target_mask, "decoder_attention_mask": decoder_attention_mask}
+
+
+    def target_sequence(self, sequence_str: str, max_tokens: int):
+        """Applies masking for a sequence given as target
+
+        Args:
+            sequence_str: Sequence string
+            max_tokens: Maximum number of tokens in the sequence
+
+        Returns:
+            Dictionary containing the masked sequence string, the input mask, the target mask, and the decoder attention mask
+        """
+        # Tokenize the text and get the ids
+        seq_ids = self.text_tokenizer.encode(sequence_str).ids
+        # Add EOS to all sequences
+        seq_ids.append(self.eos_id)
+        # Truncate sequence
+        seq_ids = seq_ids[:max_tokens]
+
+        keep_prob = 0.
+        input_seq_ids = []
+        _, target_seq_ids = simple_span_masking(seq_ids, self.sentinel_to_id, keep_prob)
+
+        # Span masking can add up to (max_tokens + 1) * 2 tokens for input + target
+        max_length = (max_tokens + 1) * 2
+        tensor = torch.ones(max_length, dtype=torch.int) * self.pad_id
+        input_mask = torch.ones(max_length, dtype=torch.bool)
+        target_mask = torch.ones(max_length, dtype=torch.bool)
+        decoder_attention_mask = torch.zeros(max_length, dtype=torch.int)
+
+        # Set input and input mask
+        tensor[:len(input_seq_ids)] = torch.tensor(input_seq_ids, dtype=torch.int)
+        input_mask[:len(input_seq_ids)] = 0
+
+        tensor[max_tokens:max_tokens + len(target_seq_ids)] = torch.tensor(target_seq_ids, dtype=torch.int)
+        target_mask[max_tokens:max_tokens + len(target_seq_ids)] = 0
+        decoder_attention_mask[max_tokens:max_tokens + len(target_seq_ids)] = 1
+
+        return {"tensor": tensor, "input_mask": input_mask, "target_mask": target_mask,
+                "decoder_attention_mask": decoder_attention_mask}
+
+    def __call__(self, mod_dict):
+        """Applies input and target masking to a dictionary of modalities
+
+        Args:
+            mod_dict: Dictionary of modalities
+
+        Returns:
+            Dictionary containing the masked modalities
+        """
+        masked_mod_dict = {}
+        for mod_name, mod_info in self.modality_info.items():
+            mod_type = mod_info['type']
+            if mod_type == 'img' and mod_name in self.input_modalities:
+                masked_mod_dict[mod_name] = self.input_image(mod_dict[mod_name], mod_info['max_tokens'])
+            elif mod_type == 'img' and mod_name in self.target_modalities:
+                masked_mod_dict[mod_name] = self.target_image(mod_dict[mod_name], mod_info['max_tokens'])
+            elif mod_type == 'seq' and mod_name in self.input_modalities:
+                masked_mod_dict[mod_name] = self.input_sequence(mod_dict[mod_name], mod_info['max_tokens'])
+            elif mod_type == 'seq' and mod_name in self.target_modalities:
+                masked_mod_dict[mod_name] = self.target_sequence(mod_dict[mod_name], mod_info['max_tokens'])
+            else:
+                raise ValueError(f"Invalid modality type: {mod_type} or modality name not in input or target modalities: {mod_name}")
+
+        if 'mask_valid' in mod_dict:
+            masked_mod_dict['mask_valid'] = mod_dict['mask_valid']
+
+        return masked_mod_dict
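An illustrative sketch of the span-masking primitive defined above, using a small hand-made sentinel mapping instead of the tokenizer-derived one (an assumption for the example). With a keep probability between 0 and 1 the output is random, so no specific result is asserted.

from fourm.data.masking import simple_span_masking

# Hypothetical token ids and sentinel mapping; in the training pipeline these
# come from the text tokenizer via get_sentinel_to_id_mapping.
sequence = [101, 102, 103, 104, 105]
sentinel_to_id = {i: 1000 + i for i in range(1, 8)}

input_seq, target_seq = simple_span_masking(sequence, sentinel_to_id, keep_prob=0.5)
print(input_seq)   # kept tokens, with one sentinel id standing in for each masked span
print(target_seq)  # each sentinel id followed by its masked tokens, plus a closing sentinel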
fourm/data/modality_info.py
ADDED
@@ -0,0 +1,427 @@
1 |
+
# Copyright 2024 EPFL and Apple Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
from functools import partial
|
15 |
+
|
16 |
+
import fourm.utils.data_constants as data_constants
|
17 |
+
from fourm.data.modality_transforms import (CaptionTransform, DepthTransform,
|
18 |
+
DetectionTransform, MaskTransform,
|
19 |
+
NormalTransform, RGBTransform,
|
20 |
+
SemsegTransform, TokTransform,
|
21 |
+
CaptionEmbTransform, MetadataTransform,
|
22 |
+
HumanPoseTransform, ColorPaletteTransform,
|
23 |
+
SAMInstanceTokTransform, SAMInstanceTransform)
|
24 |
+
from fourm.models.decoder_embeddings import (ImageTokenDecoderEmbedding,
|
25 |
+
SequenceDecoderEmbedding)
|
26 |
+
from fourm.models.encoder_embeddings import (ImageEncoderEmbedding,
|
27 |
+
ImageTokenEncoderEmbedding,
|
28 |
+
SequenceEncoderEmbedding,
|
29 |
+
SequenceEmbEncoderEmbedding)
|
30 |
+
from fourm.utils import generate_uint15_hash
|
31 |
+
|
32 |
+
MODALITY_INFO = {
|
33 |
+
# 4M-7 modalities
|
34 |
+
'rgb@224': {
|
35 |
+
'input_size': 224,
|
36 |
+
'patch_size': 16,
|
37 |
+
'encoder_embedding': partial(ImageEncoderEmbedding, num_channels=3),
|
38 |
+
'decoder_embedding': None,
|
39 |
+
'min_tokens': 0,
|
40 |
+
'max_tokens': None, # Will be set to 196
|
41 |
+
'type': 'img',
|
42 |
+
'num_channels': 3,
|
43 |
+
'id': generate_uint15_hash('rgb@224'),
|
44 |
+
'path': 'rgb',
|
45 |
+
},
|
46 |
+
'rgb': { # used for tokenizer training
|
47 |
+
'type': 'img',
|
48 |
+
'num_channels': 3,
|
49 |
+
'id': generate_uint15_hash('rgb'),
|
50 |
+
'path': 'rgb',
|
51 |
+
},
|
52 |
+
'caption': {
|
53 |
+
'vocab_size': 30_000,
|
54 |
+
'encoder_embedding': partial(SequenceEncoderEmbedding, vocab_size=30_000, max_length=256, padding_idx=0),
|
55 |
+
'decoder_embedding': partial(SequenceDecoderEmbedding, vocab_size=30_000, max_length=256, padding_idx=0),
|
56 |
+
'min_tokens': 0,
|
57 |
+
'max_tokens': 256,
|
58 |
+
'type': 'seq',
|
59 |
+
'id': generate_uint15_hash('caption'),
|
60 |
+
},
|
61 |
+
'det': {
|
62 |
+
'vocab_size': 30_000,
|
63 |
+
'encoder_embedding': partial(SequenceEncoderEmbedding, vocab_size=30_000, max_length=256, padding_idx=0),
|
64 |
+
'decoder_embedding': partial(SequenceDecoderEmbedding, vocab_size=30_000, max_length=256, padding_idx=0),
|
65 |
+
'min_tokens': 0,
|
66 |
+
'max_tokens': 256,
|
67 |
+
'type': 'seq',
|
68 |
+
'id': generate_uint15_hash('det'),
|
69 |
+
},
|
70 |
+
'tok_rgb@224': {
|
71 |
+
'input_size': 224,
|
72 |
+
'patch_size': 16,
|
73 |
+
'vocab_size': 16384,
|
74 |
+
'encoder_embedding': partial(ImageTokenEncoderEmbedding, vocab_size=16384),
|
75 |
+
'decoder_embedding': partial(ImageTokenDecoderEmbedding, vocab_size=16384),
|
76 |
+
'min_tokens': 0,
|
77 |
+
'max_tokens': None, # Will be set to 196
|
78 |
+
'type': 'img',
|
79 |
+
'id': generate_uint15_hash('tok_rgb@224'),
|
80 |
+
'pretokenized': True,
|
81 |
+
},
|
82 |
+
'tok_depth@224': {
|
83 |
+
'input_size': 224,
|
84 |
+
'patch_size': 16,
|
85 |
+
'vocab_size': 8192,
|
86 |
+
'encoder_embedding': partial(ImageTokenEncoderEmbedding, vocab_size=8192),
|
87 |
+
'decoder_embedding': partial(ImageTokenDecoderEmbedding, vocab_size=8192),
|
88 |
+
'min_tokens': 0,
|
89 |
+
'max_tokens': None, # Will be set to 196
|
90 |
+
'type': 'img',
|
91 |
+
'id': generate_uint15_hash('tok_depth@224'),
|
92 |
+
'pretokenized': True,
|
93 |
+
},
|
94 |
+
'depth': { # used for tokenizer training
|
95 |
+
'type': 'img',
|
96 |
+
'num_channels': 1,
|
97 |
+
'id': generate_uint15_hash('depth'),
|
98 |
+
},
|
99 |
+
'tok_normal@224': {
|
100 |
+
'input_size': 224,
|
101 |
+
'patch_size': 16,
|
102 |
+
'vocab_size': 8192,
|
103 |
+
'encoder_embedding': partial(ImageTokenEncoderEmbedding, vocab_size=8192),
|
104 |
+
'decoder_embedding': partial(ImageTokenDecoderEmbedding, vocab_size=8192),
|
105 |
+
'min_tokens': 0,
|
106 |
+
'max_tokens': None, # Will be set to 196
|
107 |
+
'type': 'img',
|
108 |
+
'id': generate_uint15_hash('tok_normal@224'),
|
109 |
+
'pretokenized': True,
|
110 |
+
},
|
111 |
+
'normal': { # used for tokenizer training
|
112 |
+
'type': 'img',
|
113 |
+
'num_channels': 3,
|
114 |
+
'id': generate_uint15_hash('normal'),
|
115 |
+
},
|
116 |
+
'tok_semseg@224': {
|
117 |
+
'input_size': 224,
|
118 |
+
'patch_size': 16,
|
119 |
+
'vocab_size': 4096,
|
120 |
+
'encoder_embedding': partial(ImageTokenEncoderEmbedding, vocab_size=4096),
|
121 |
+
'decoder_embedding': partial(ImageTokenDecoderEmbedding, vocab_size=4096),
|
122 |
+
'min_tokens': 0,
|
123 |
+
'max_tokens': None, # Will be set to 196
|
124 |
+
'type': 'img',
|
125 |
+
'id': generate_uint15_hash('tok_semseg@224'),
|
126 |
+
'pretokenized': True,
|
127 |
+
},
|
128 |
+
'semseg_coco': { # used for tokenizer training
|
129 |
+
'type': 'img',
|
130 |
+
'num_channels': 64,
|
131 |
+
'num_labels': data_constants.COCO_SEMSEG_NUM_CLASSES,
|
132 |
+
'id': generate_uint15_hash('semseg_coco'),
|
133 |
+
},
|
134 |
+
'tok_clip@224': {
|
135 |
+
'input_size': 224,
|
136 |
+
'patch_size': 16,
|
137 |
+
'vocab_size': 8192,
|
138 |
+
'encoder_embedding': partial(ImageTokenEncoderEmbedding, vocab_size=8192),
|
139 |
+
'decoder_embedding': partial(ImageTokenDecoderEmbedding, vocab_size=8192),
|
140 |
+
'min_tokens': 0,
|
141 |
+
'max_tokens': None, # Will be set to 196
|
142 |
+
'type': 'img',
|
143 |
+
'id': generate_uint15_hash('tok_clip@224'),
|
144 |
+
'pretokenized': True,
|
145 |
+
},
|
146 |
+
'CLIP-B16': { # used for tokenizer training
|
147 |
+
'type': 'feature_map',
|
148 |
+
'num_channels': 512,
|
149 |
+
'id': generate_uint15_hash('CLIP-B16'),
|
150 |
+
},
|
151 |
+
|
152 |
+
# 4M-21 modalities
|
153 |
+
't5_caption': {
|
154 |
+
'encoder_embedding': partial(SequenceEmbEncoderEmbedding, max_length=77, padding_idx=0),
|
155 |
+
'decoder_embedding': None,
|
156 |
+
'min_tokens': 0,
|
157 |
+
'max_tokens': 77,
|
158 |
+
'type': 'seq_emb',
|
159 |
+
'id': generate_uint15_hash('t5_caption'),
|
160 |
+
},
|
161 |
+
'metadata': {
|
162 |
+
'vocab_size': 30_000,
|
163 |
+
'encoder_embedding': partial(SequenceEncoderEmbedding, vocab_size=30_000, max_length=40, padding_idx=0, sincos_pos_emb=True),
|
164 |
+
'decoder_embedding': partial(SequenceDecoderEmbedding, vocab_size=30_000, max_length=40, padding_idx=0, sincos_pos_emb=True),
|
165 |
+
'min_tokens': 0,
|
166 |
+
'max_tokens': 40, # At most 2x19=38 for 19 metadata types, +1 for EOS, +1 for sentinel
|
167 |
+
'type': 'seq',
|
168 |
+
'id': generate_uint15_hash('metadata'),
|
169 |
+
'shared_vocab': ['caption'],
|
170 |
+
'path': 'metadata',
|
171 |
+
},
|
172 |
+
'human_poses': {
|
173 |
+
'vocab_size': 30_000,
|
174 |
+
'encoder_embedding': partial(SequenceEncoderEmbedding, vocab_size=30_000, max_length=263, padding_idx=0, sincos_pos_emb=True),
|
175 |
+
'decoder_embedding': partial(SequenceDecoderEmbedding, vocab_size=30_000, max_length=263, padding_idx=0, sincos_pos_emb=True),
|
176 |
+
'min_tokens': 0,
|
177 |
+
'max_tokens': 275, # 7*39 pose tokens, +1 for EOS, +1 for sentinel
|
178 |
+
'type': 'seq',
|
179 |
+
'num_channels': 207, # for tokenization training, only the pose part is needed
|
180 |
+
'id': generate_uint15_hash('human_poses'),
|
181 |
+
'shared_vocab': ['caption'],
|
182 |
+
},
|
183 |
+
'color_palette': {
|
184 |
+
'vocab_size': 30_000,
|
185 |
+
'encoder_embedding': partial(SequenceEncoderEmbedding, vocab_size=30_000, max_length=23, padding_idx=0, sincos_pos_emb=True),
|
186 |
+
'decoder_embedding': partial(SequenceDecoderEmbedding, vocab_size=30_000, max_length=23, padding_idx=0, sincos_pos_emb=True),
|
187 |
+
'min_tokens': 0,
|
188 |
+
'max_tokens': 23, # 7x3=21 for 7 colors, +1 for EOS, +1 for sentinel
|
189 |
+
'type': 'seq',
|
190 |
+
'id': generate_uint15_hash('color_palette'),
|
191 |
+
'shared_vocab': ['caption'],
|
192 |
+
'path': 'color_palette',
|
193 |
+
},
|
194 |
+
'sam_mask': {
|
195 |
+
'encoder_embedding': None,
|
196 |
+
'decoder_embedding': None,
|
197 |
+
'min_tokens': 0,
|
198 |
+
'max_tokens': 64,
|
199 |
+
'type': 'img',
|
200 |
+
'num_channels': 1,
|
201 |
+
'id': generate_uint15_hash('sam_mask'),
|
202 |
+
},
|
203 |
+
'sam_instance': {
|
204 |
+
'vocab_size': 30_000,
|
205 |
+
'encoder_embedding': partial(SequenceEncoderEmbedding, vocab_size=30_000, max_length=290, padding_idx=0, sincos_pos_emb=True),
|
206 |
+
'decoder_embedding': partial(SequenceDecoderEmbedding, vocab_size=30_000, max_length=290, padding_idx=0, sincos_pos_emb=True),
|
207 |
+
'min_tokens': 0,
|
208 |
+
'max_tokens': 290,
|
209 |
+
'type': 'seq',
|
210 |
+
'id': generate_uint15_hash('sam_instance'),
|
211 |
+
'shared_vocab': ['caption'],
|
212 |
+
'pretokenized': True,
|
213 |
+
},
|
214 |
+
'tok_canny_edge@224': {
|
215 |
+
'input_size': 224,
|
216 |
+
'patch_size': 16,
|
217 |
+
'vocab_size': 8192,
|
218 |
+
'encoder_embedding': partial(ImageTokenEncoderEmbedding, vocab_size=8192),
|
219 |
+
'decoder_embedding': partial(ImageTokenDecoderEmbedding, vocab_size=8192),
|
220 |
+
'min_tokens': 0,
|
221 |
+
'max_tokens': None, # Will be set to 196
|
222 |
+
'type': 'img',
|
223 |
+
'id': generate_uint15_hash('tok_canny_edge@224'),
|
224 |
+
'pretokenized': True,
|
225 |
+
},
|
226 |
+
'canny_edge': { # used for tokenizer training
|
227 |
+
'type': 'img',
|
228 |
+
'num_channels': 1,
|
229 |
+
'id': generate_uint15_hash('canny_edge'),
|
230 |
+
},
|
231 |
+
'tok_sam_edge@224': {
|
232 |
+
'input_size': 224,
|
233 |
+
'patch_size': 16,
|
234 |
+
'vocab_size': 8192,
|
235 |
+
'encoder_embedding': partial(ImageTokenEncoderEmbedding, vocab_size=8192),
|
236 |
+
'decoder_embedding': partial(ImageTokenDecoderEmbedding, vocab_size=8192),
|
237 |
+
'min_tokens': 0,
|
238 |
+
'max_tokens': None, # Will be set to 196
|
239 |
+
'type': 'img',
|
240 |
+
'id': generate_uint15_hash('tok_sam_edge@224'),
|
241 |
+
'pretokenized': True,
|
242 |
+
},
|
243 |
+
'tok_dinov2@224': {
|
244 |
+
'input_size': 224,
|
245 |
+
'patch_size': 14,
|
246 |
+
'vocab_size': 8192,
|
247 |
+
'encoder_embedding': partial(ImageTokenEncoderEmbedding, vocab_size=8192),
|
248 |
+
'decoder_embedding': partial(ImageTokenDecoderEmbedding, vocab_size=8192),
|
249 |
+
'min_tokens': 0,
|
250 |
+
'max_tokens': None, # Will be set to 256
|
251 |
+
'type': 'img',
|
252 |
+
'id': generate_uint15_hash('tok_dinov2@224'),
|
253 |
+
'pretokenized': True,
|
254 |
+
},
|
255 |
+
'DINOv2-B14': { # used for tokenizer training
|
256 |
+
'type': 'feature_map',
|
257 |
+
'num_channels': 768,
|
258 |
+
'id': generate_uint15_hash('DINOv2-B14'),
|
259 |
+
},
|
260 |
+
'tok_imagebind@224': {
|
261 |
+
'input_size': 224,
|
262 |
+
'patch_size': 14,
|
263 |
+
'vocab_size': 8192,
|
264 |
+
'encoder_embedding': partial(ImageTokenEncoderEmbedding, vocab_size=8192),
|
265 |
+
'decoder_embedding': partial(ImageTokenDecoderEmbedding, vocab_size=8192),
|
266 |
+
'min_tokens': 0,
|
267 |
+
'max_tokens': None, # Will be set to 256
|
268 |
+
'type': 'img',
|
269 |
+
'id': generate_uint15_hash('tok_imagebind@224'),
|
270 |
+
'pretokenized': True,
|
271 |
+
},
|
272 |
+
'ImageBind-H14': { # used for tokenizer training
|
273 |
+
'type': 'feature_map',
|
274 |
+
'num_channels': 1280,
|
275 |
+
'id': generate_uint15_hash('ImageBind-H14'),
|
276 |
+
},
|
277 |
+
'tok_dinov2_global': {
|
278 |
+
'vocab_size': 8192,
|
279 |
+
'patch_size': 56,
|
280 |
+
'encoder_embedding': partial(ImageTokenEncoderEmbedding, vocab_size=8192, sincos_pos_emb=False),
|
281 |
+
'decoder_embedding': partial(ImageTokenDecoderEmbedding, vocab_size=8192, sincos_pos_emb=False),
|
282 |
+
'min_tokens': 0,
|
283 |
+
'max_tokens': 16,
|
284 |
+
'type': 'img',
|
285 |
+
'id': generate_uint15_hash('tok_dinov2_global'),
|
286 |
+
'pretokenized': True,
|
287 |
+
},
|
288 |
+
'DINOv2-B14-global': { # used for tokenizer training
|
289 |
+
'type': 'feature_map',
|
290 |
+
'num_channels': 768,
|
291 |
+
'id': generate_uint15_hash('DINOv2-B14-global'),
|
292 |
+
},
|
293 |
+
'tok_imagebind_global': {
|
294 |
+
'vocab_size': 8192,
|
295 |
+
'patch_size': 56,
|
296 |
+
'encoder_embedding': partial(ImageTokenEncoderEmbedding, vocab_size=8192, sincos_pos_emb=False),
|
297 |
+
'decoder_embedding': partial(ImageTokenDecoderEmbedding, vocab_size=8192, sincos_pos_emb=False),
|
298 |
+
'min_tokens': 0,
|
299 |
+
'max_tokens': 16,
|
300 |
+
'type': 'img',
|
301 |
+
'id': generate_uint15_hash('tok_imagebind_global'),
|
302 |
+
'pretokenized': True,
|
303 |
+
},
|
304 |
+
'ImageBind-H14-global': { # used for tokenizer training
|
305 |
+
'type': 'feature_map',
|
306 |
+
'num_channels': 1280,
|
307 |
+
'id': generate_uint15_hash('ImageBind-H14-global'),
|
308 |
+
},
|
309 |
+
|
310 |
+
### 224->448 super resolution modalities
|
311 |
+
'rgb@448': {
|
312 |
+
'input_size': 448,
|
313 |
+
'patch_size': 16,
|
314 |
+
'encoder_embedding': partial(ImageEncoderEmbedding, num_channels=3),
|
315 |
+
'decoder_embedding': None,
|
316 |
+
'min_tokens': 0,
|
317 |
+
'max_tokens': None, # Will be set to 784
|
318 |
+
'type': 'img',
|
319 |
+
'num_channels': 3,
|
320 |
+
'id': generate_uint15_hash('rgb@448'),
|
321 |
+
'path': 'rgb',
|
322 |
+
},
|
323 |
+
'tok_rgb@448': {
|
324 |
+
'input_size': 448,
|
325 |
+
'patch_size': 16,
|
326 |
+
'vocab_size': 16384,
|
327 |
+
'encoder_embedding': partial(ImageTokenEncoderEmbedding, vocab_size=16384),
|
328 |
+
'decoder_embedding': partial(ImageTokenDecoderEmbedding, vocab_size=16384),
|
329 |
+
'min_tokens': 0,
|
330 |
+
'max_tokens': None, # Will be set to 784
|
331 |
+
'type': 'img',
|
332 |
+
'id': generate_uint15_hash('tok_rgb@448'),
|
333 |
+
'pretokenized': True,
|
334 |
+
},
|
335 |
+
'tok_depth@448': {
|
336 |
+
'input_size': 448,
|
337 |
+
'patch_size': 16,
|
338 |
+
'vocab_size': 8192,
|
339 |
+
'encoder_embedding': partial(ImageTokenEncoderEmbedding, vocab_size=8192),
|
340 |
+
'decoder_embedding': partial(ImageTokenDecoderEmbedding, vocab_size=8192),
|
341 |
+
'min_tokens': 0,
|
342 |
+
'max_tokens': None, # Will be set to 784
|
343 |
+
'type': 'img',
|
344 |
+
'id': generate_uint15_hash('tok_depth@448'),
|
345 |
+
'pretokenized': True,
|
346 |
+
},
|
347 |
+
'tok_normal@448': {
|
348 |
+
'input_size': 448,
|
349 |
+
'patch_size': 16,
|
350 |
+
'vocab_size': 8192,
|
351 |
+
'encoder_embedding': partial(ImageTokenEncoderEmbedding, vocab_size=8192),
|
352 |
+
'decoder_embedding': partial(ImageTokenDecoderEmbedding, vocab_size=8192),
|
353 |
+
'min_tokens': 0,
|
354 |
+
'max_tokens': None, # Will be set to 784
|
355 |
+
'type': 'img',
|
356 |
+
'id': generate_uint15_hash('tok_normal@448'),
|
357 |
+
'pretokenized': True,
|
358 |
+
},
|
359 |
+
'tok_semseg@448': {
|
360 |
+
'input_size': 448,
|
361 |
+
'patch_size': 16,
|
362 |
+
'vocab_size': 4096,
|
363 |
+
'encoder_embedding': partial(ImageTokenEncoderEmbedding, vocab_size=4096),
|
364 |
+
'decoder_embedding': partial(ImageTokenDecoderEmbedding, vocab_size=4096),
|
365 |
+
'min_tokens': 0,
|
366 |
+
'max_tokens': None, # Will be set to 784
|
367 |
+
'type': 'img',
|
368 |
+
'id': generate_uint15_hash('tok_semseg@448'),
|
369 |
+
'pretokenized': True,
|
370 |
+
},
|
371 |
+
'tok_clip@448': {
|
372 |
+
'input_size': 448,
|
373 |
+
'patch_size': 16,
|
374 |
+
'vocab_size': 8192,
|
375 |
+
'encoder_embedding': partial(ImageTokenEncoderEmbedding, vocab_size=8192),
|
376 |
+
'decoder_embedding': partial(ImageTokenDecoderEmbedding, vocab_size=8192),
|
377 |
+
'min_tokens': 0,
|
378 |
+
'max_tokens': None, # Will be set to 784
|
379 |
+
'type': 'img',
|
380 |
+
'id': generate_uint15_hash('tok_clip@448'),
|
381 |
+
'pretokenized': True,
|
382 |
+
},
|
383 |
+
}
|
384 |
+
|
385 |
+
# Note: @res suffix is ignored for modality transforms
|
386 |
+
MODALITY_TRANSFORMS = {
|
387 |
+
# 4M-7 modalities
|
388 |
+
'rgb': RGBTransform(imagenet_default_mean_and_std=True),
|
389 |
+
'caption': CaptionTransform(aligned_captions=True),
|
390 |
+
'det': DetectionTransform(det_threshold=0.6, det_max_instances=None, bbox_order='dist_to_orig', coord_bins=1000, min_visibility=0.0),
|
391 |
+
'tok_rgb': TokTransform(),
|
392 |
+
'tok_depth': TokTransform(),
|
393 |
+
'tok_normal': TokTransform(),
|
394 |
+
'tok_semseg': TokTransform(),
|
395 |
+
'tok_clip': TokTransform(),
|
396 |
+
# 4M-21 modalities
|
397 |
+
't5_caption': CaptionEmbTransform(),
|
398 |
+
'metadata': MetadataTransform(special_vmin=0, special_vmax=999, shuffle=True, random_trunc=False, return_chunks=True),
|
399 |
+
'human_poses': HumanPoseTransform(coord_bins=1000),
|
400 |
+
'color_palette': ColorPaletteTransform(coord_bins=1000),
|
401 |
+
'sam_instance': SAMInstanceTokTransform(image_size=224, points_per_side=7, point_order='random'),
|
402 |
+
'tok_canny_edge': TokTransform(),
|
403 |
+
'tok_sam_edge': TokTransform(),
|
404 |
+
'tok_dinov2': TokTransform(),
|
405 |
+
'tok_imagebind': TokTransform(),
|
406 |
+
'tok_dinov2_global': TokTransform(),
|
407 |
+
'tok_imagebind_global': TokTransform(),
|
408 |
+
# Other
|
409 |
+
'mask_valid': MaskTransform(mask_pool_size=1),
|
410 |
+
}
|
411 |
+
|
412 |
+
MODALITY_TRANSFORMS_DIVAE = {
|
413 |
+
'rgb': RGBTransform(imagenet_default_mean_and_std=False),
|
414 |
+
'depth': DepthTransform(standardize_depth=True),
|
415 |
+
'normal': NormalTransform(standardize_surface_normals=False),
|
416 |
+
'mask_valid': MaskTransform(mask_pool_size=1),
|
417 |
+
'semseg_coco': SemsegTransform(shift_idx_by_one=True),
|
418 |
+
'canny_edge': RGBTransform(imagenet_default_mean_and_std=False),
|
419 |
+
'human_poses': HumanPoseTransform(coord_bins=1000, only_pose=True),
|
420 |
+
'sam_mask': SAMInstanceTransform(mask_size=64, max_instance_n=1),
|
421 |
+
}
|
422 |
+
|
423 |
+
MODALITY_TRANSFORMS_VQCONTROLNET = {
|
424 |
+
'rgb': RGBTransform(imagenet_default_mean_and_std=False),
|
425 |
+
'mask_valid': MaskTransform(mask_pool_size=1),
|
426 |
+
'caption': CaptionTransform(aligned_captions=True),
|
427 |
+
}
|
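Before the transform definitions that follow, a quick orientation note on the table above (an illustrative sketch with assumed usage, not part of the committed files): the entries with max_tokens set to None are placeholders for the patch count, which follows directly from input_size and patch_size.
from fourm.data.modality_info import MODALITY_INFO

info = MODALITY_INFO['tok_rgb@224']
num_patches = (info['input_size'] // info['patch_size']) ** 2
print(num_patches)  # 196, matching the "Will be set to 196" comments above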
fourm/data/modality_transforms.py
ADDED
@@ -0,0 +1,1387 @@
1 |
+
# Copyright 2024 EPFL and Apple Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
import gzip
|
15 |
+
import json
|
16 |
+
import random
|
17 |
+
from pathlib import Path
|
18 |
+
from typing import Optional, Tuple, List, Dict
|
19 |
+
from abc import ABC, abstractmethod
|
20 |
+
|
21 |
+
from PIL import Image
|
22 |
+
import cv2
|
23 |
+
|
24 |
+
import albumentations as A
|
25 |
+
import numpy as np
|
26 |
+
import torch
|
27 |
+
import torchvision.transforms.functional as TF
|
28 |
+
import torchvision.transforms as T
|
29 |
+
from einops import rearrange, repeat, reduce
|
30 |
+
|
31 |
+
from fourm.utils import to_2tuple
|
32 |
+
from fourm.utils.data_constants import (IMAGENET_DEFAULT_MEAN,
|
33 |
+
IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN,
|
34 |
+
IMAGENET_SURFACE_NORMAL_STD, IMAGENET_SURFACE_NORMAL_MEAN,
|
35 |
+
IMAGENET_INCEPTION_STD, SEG_IGNORE_INDEX, PAD_MASK_VALUE)
|
36 |
+
|
37 |
+
|
38 |
+
# The @-symbol is used to specify the resolution of a modality. Syntax: modality@resolution
|
39 |
+
def get_transform_key(mod_name):
|
40 |
+
return mod_name.split('@')[0]
|
41 |
+
|
42 |
+
def get_transform_resolution(mod_name, default_resolution, to_tuple=True):
|
43 |
+
res = int(mod_name.split('@')[1]) if '@' in mod_name else default_resolution
|
44 |
+
return to_2tuple(res) if to_tuple else res
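# Illustrative examples of the '@' resolution syntax (editor's addition, assumed values):
#   get_transform_key('tok_rgb@448')                          -> 'tok_rgb'
#   get_transform_resolution('tok_rgb@448', 224)               -> (448, 448)
#   get_transform_resolution('caption', 224, to_tuple=False)   -> 224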
|
45 |
+
|
46 |
+
def get_transform(mod_name, transforms_dict):
|
47 |
+
return transforms_dict.get(get_transform_key(mod_name), IdentityTransform())
|
48 |
+
|
49 |
+
def get_pil_resample_mode(resample_mode: str):
|
50 |
+
"""
|
51 |
+
Returns the PIL resampling mode for the given resample mode string.
|
52 |
+
|
53 |
+
Args:
|
54 |
+
resample_mode: Resampling mode string
|
55 |
+
"""
|
56 |
+
if resample_mode is None:
|
57 |
+
return None
|
58 |
+
elif resample_mode == "bilinear":
|
59 |
+
return Image.Resampling.BILINEAR if hasattr(Image, 'Resampling') else Image.BILINEAR
|
60 |
+
elif resample_mode == "bicubic":
|
61 |
+
return Image.Resampling.BICUBIC if hasattr(Image, 'Resampling') else Image.BICUBIC
|
62 |
+
elif resample_mode == "nearest":
|
63 |
+
return Image.Resampling.NEAREST if hasattr(Image, 'Resampling') else Image.NEAREST
|
64 |
+
else:
|
65 |
+
raise ValueError(f"Resample mode {resample_mode} is not supported.")
|
66 |
+
|
67 |
+
class UnifiedDataTransform(object):
|
68 |
+
def __init__(self, transforms_dict, image_augmenter, resample_mode: str = None, add_sizes: bool = False, **kwargs):
|
69 |
+
"""Unified data augmentation for FourM
|
70 |
+
|
71 |
+
Args:
|
72 |
+
transforms_dict (dict): Dict of transforms for each modality
|
73 |
+
image_augmenter (AbstractImageAugmenter): Image augmenter
|
74 |
+
resample_mode (str, optional): Resampling mode for PIL images (default: None -> uses default resampling mode for data type)
|
75 |
+
One out of ["bilinear", "bicubic", "nearest", None].
|
76 |
+
add_sizes (bool, optional): Whether to add crop coordinates and original size to the output dict
|
77 |
+
"""
|
78 |
+
|
79 |
+
self.transforms_dict = transforms_dict
|
80 |
+
self.image_augmenter = image_augmenter
|
81 |
+
self.resample_mode = resample_mode
|
82 |
+
self.add_sizes = add_sizes
|
83 |
+
|
84 |
+
def unified_image_augment(self, mod_dict, crop_settings):
|
85 |
+
"""Apply the image augmenter to all modalities where it is applicable
|
86 |
+
|
87 |
+
Args:
|
88 |
+
mod_dict (dict): Dict of modalities
|
89 |
+
crop_settings (dict): Crop settings
|
90 |
+
|
91 |
+
Returns:
|
92 |
+
dict: Transformed dict of modalities
|
93 |
+
"""
|
94 |
+
|
95 |
+
crop_coords, flip, orig_size, target_size, rand_aug_idx = self.image_augmenter(mod_dict, crop_settings)
|
96 |
+
|
97 |
+
mod_dict = {
|
98 |
+
k: self.transforms_dict[get_transform_key(k)].image_augment(
|
99 |
+
v, crop_coords=crop_coords, flip=flip, orig_size=orig_size,
|
100 |
+
target_size=get_transform_resolution(k, target_size), rand_aug_idx=rand_aug_idx,
|
101 |
+
resample_mode=self.resample_mode
|
102 |
+
)
|
103 |
+
for k, v in mod_dict.items()
|
104 |
+
}
|
105 |
+
|
106 |
+
if self.add_sizes:
|
107 |
+
mod_dict["crop_coords"] = torch.tensor(crop_coords)
|
108 |
+
mod_dict["orig_size"] = torch.tensor(orig_size)
|
109 |
+
|
110 |
+
return mod_dict
|
111 |
+
|
112 |
+
def __call__(self, mod_dict):
|
113 |
+
"""Apply the augmentation to a dict of modalities (both image based and sequence based modalities)
|
114 |
+
|
115 |
+
Args:
|
116 |
+
mod_dict (dict): Dict of modalities
|
117 |
+
|
118 |
+
Returns:
|
119 |
+
dict: Transformed dict of modalities
|
120 |
+
"""
|
121 |
+
crop_settings = mod_dict.pop("crop_settings", None)
|
122 |
+
|
123 |
+
mod_dict = {k: get_transform(k, self.transforms_dict).preprocess(v) for k, v in mod_dict.items()}
|
124 |
+
|
125 |
+
mod_dict = self.unified_image_augment(mod_dict, crop_settings)
|
126 |
+
|
127 |
+
mod_dict = {k: get_transform(k, self.transforms_dict).postprocess(v) for k, v in mod_dict.items()}
|
128 |
+
|
129 |
+
return mod_dict
|
130 |
+
|
131 |
+
def __repr__(self):
|
132 |
+
repr = "(UnifiedDataAugmentation,\n"
|
133 |
+
repr += ")"
|
134 |
+
return repr
|
135 |
+
|
136 |
+
|
137 |
+
class AbstractTransform(ABC):
|
138 |
+
|
139 |
+
@abstractmethod
|
140 |
+
def load(self, sample):
|
141 |
+
pass
|
142 |
+
|
143 |
+
@abstractmethod
|
144 |
+
def preprocess(self, sample):
|
145 |
+
pass
|
146 |
+
|
147 |
+
@abstractmethod
|
148 |
+
def image_augment(self, v, crop_coords: Tuple, flip: bool, orig_size: Tuple, target_size: Tuple,
|
149 |
+
rand_aug_idx: Optional[int], resample_mode: str = None):
|
150 |
+
pass
|
151 |
+
|
152 |
+
@abstractmethod
|
153 |
+
def postprocess(self, v):
|
154 |
+
pass
|
155 |
+
|
156 |
+
|
157 |
+
class ImageTransform(AbstractTransform):
|
158 |
+
|
159 |
+
@staticmethod
|
160 |
+
def pil_loader(path: str) -> Image.Image:
|
161 |
+
# open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
|
162 |
+
# with open(path, 'rb') as f:
|
163 |
+
# img = Image.open(f)
|
164 |
+
img = Image.open(path)
|
165 |
+
return img
|
166 |
+
|
167 |
+
|
168 |
+
@staticmethod
|
169 |
+
def image_hflip(img: Image, flip: bool):
|
170 |
+
"""Crop and resize an image
|
171 |
+
|
172 |
+
:param img: Image to flip
|
173 |
+
:param flip: Whether to flip the image
|
174 |
+
:return: Flipped image (if flip = True)
|
175 |
+
"""
|
176 |
+
if flip:
|
177 |
+
img = TF.hflip(img)
|
178 |
+
return img
|
179 |
+
|
180 |
+
@staticmethod
|
181 |
+
def image_crop_and_resize(img: Image, crop_coords: Tuple, target_size: Tuple, resample_mode: str = None):
|
182 |
+
"""Crop and resize an image
|
183 |
+
|
184 |
+
:param img: Image to crop and resize
|
185 |
+
:param crop_coords: Coordinates of the crop (top, left, h, w)
|
186 |
+
:param target_size: Coordinates of the resize (height, width)
|
187 |
+
:return: Cropped and resized image
|
188 |
+
"""
|
189 |
+
|
190 |
+
top, left, h, w = crop_coords
|
191 |
+
resize_height, resize_width = target_size
|
192 |
+
img = TF.crop(img, top, left, h, w)
|
193 |
+
resample_mode = get_pil_resample_mode(resample_mode)
|
194 |
+
img = img.resize((resize_height, resize_width), resample=resample_mode)
|
195 |
+
return img
|
196 |
+
|
197 |
+
|
198 |
+
class RGBTransform(ImageTransform):
|
199 |
+
|
200 |
+
def __init__(self, imagenet_default_mean_and_std=True, color_jitter=False, color_jitter_strength=0.5):
|
201 |
+
self.rgb_mean = IMAGENET_INCEPTION_MEAN if not imagenet_default_mean_and_std else IMAGENET_DEFAULT_MEAN
|
202 |
+
self.rgb_std = IMAGENET_INCEPTION_STD if not imagenet_default_mean_and_std else IMAGENET_DEFAULT_STD
|
203 |
+
self.color_jitter = color_jitter
|
204 |
+
self.color_jitter_transform = self.random_color_jitter(color_jitter_strength)
|
205 |
+
|
206 |
+
def random_color_jitter(self, strength=0.5):
|
207 |
+
# Color Jitter from Pix2Seq and SimCLR
|
208 |
+
# Source: https://github.com/google-research/pix2seq/blob/main/data/data_utils.py#L114
|
209 |
+
t = T.Compose([
|
210 |
+
T.RandomApply([T.ColorJitter(brightness=0.8 * strength, contrast=0.8 * strength, saturation=0.8 * strength, hue=0.2 * strength)], p=0.8),
|
211 |
+
T.RandomApply([T.Grayscale(num_output_channels=3)], p=0.2),
|
212 |
+
])
|
213 |
+
|
214 |
+
return t
|
215 |
+
|
216 |
+
def rgb_to_tensor(self, img):
|
217 |
+
img = TF.to_tensor(img)
|
218 |
+
img = TF.normalize(img, mean=self.rgb_mean, std=self.rgb_std)
|
219 |
+
return img
|
220 |
+
|
221 |
+
def load(self, path):
|
222 |
+
# TODO: Instead of converting to RGB here, do it either in the preprocess or the postprocess step. Makes it compatible with wds dataloading.
|
223 |
+
sample = self.pil_loader(path)
|
224 |
+
return sample
|
225 |
+
|
226 |
+
def preprocess(self, sample):
|
227 |
+
sample = sample.convert('RGB')
|
228 |
+
|
229 |
+
if self.color_jitter:
|
230 |
+
sample = self.color_jitter_transform(sample)
|
231 |
+
|
232 |
+
return sample
|
233 |
+
|
234 |
+
def image_augment(self, img, crop_coords: Tuple, flip: bool, orig_size: Tuple, target_size: Tuple,
|
235 |
+
rand_aug_idx: Optional[int], resample_mode: str = None):
|
236 |
+
img = self.image_crop_and_resize(img, crop_coords, target_size, resample_mode=resample_mode)
|
237 |
+
img = self.image_hflip(img, flip)
|
238 |
+
return img
|
239 |
+
|
240 |
+
def postprocess(self, sample):
|
241 |
+
sample = self.rgb_to_tensor(sample)
|
242 |
+
return sample
|
243 |
+
|
244 |
+
|
245 |
+
class DepthTransform(ImageTransform):
|
246 |
+
|
247 |
+
def __init__(self, standardize_depth=True):
|
248 |
+
self.standardize_depth = standardize_depth
|
249 |
+
|
250 |
+
def depth_to_tensor(self, img):
|
251 |
+
img = torch.Tensor(img / (2 ** 16 - 1.0))
|
252 |
+
img = img.unsqueeze(0) # 1 x H x W
|
253 |
+
if self.standardize_depth:
|
254 |
+
img = self.truncated_depth_standardization(img)
|
255 |
+
return img
|
256 |
+
|
257 |
+
@staticmethod
|
258 |
+
def truncated_depth_standardization(depth, thresh: float = 0.1):
|
259 |
+
"""Truncated depth standardization
|
260 |
+
|
261 |
+
:param depth: Depth map
|
262 |
+
:param thresh: Threshold
|
263 |
+
:return: Robustly standardized depth map
|
264 |
+
"""
|
265 |
+
# Flatten depth and remove bottom and top 10% of values
|
266 |
+
trunc_depth = torch.sort(depth.reshape(-1), dim=0)[0]
|
267 |
+
trunc_depth = trunc_depth[int(thresh * trunc_depth.shape[0]): int((1 - thresh) * trunc_depth.shape[0])]
|
268 |
+
return (depth - trunc_depth.mean()) / torch.sqrt(trunc_depth.var() + 1e-6)
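# Illustrative note (editor's addition): with thresh=0.1 and a flattened depth map of
# 100 values, only the sorted slice trunc_depth[10:90] contributes to the mean/std,
# so saturated near/far outliers do not dominate the standardization.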
|
269 |
+
|
270 |
+
def load(self, path):
|
271 |
+
sample = self.pil_loader(path)
|
272 |
+
return sample
|
273 |
+
|
274 |
+
def preprocess(self, sample):
|
275 |
+
return sample
|
276 |
+
|
277 |
+
def image_augment(self, img, crop_coords: Tuple, flip: bool, orig_size: Tuple, target_size: Tuple,
|
278 |
+
rand_aug_idx: Optional[int], resample_mode: str = None):
|
279 |
+
img = self.image_crop_and_resize(img, crop_coords, target_size, resample_mode=resample_mode)
|
280 |
+
img = self.image_hflip(img, flip)
|
281 |
+
return img
|
282 |
+
|
283 |
+
def postprocess(self, sample):
|
284 |
+
sample = np.array(sample)
|
285 |
+
sample = self.depth_to_tensor(sample)
|
286 |
+
return sample
|
287 |
+
|
288 |
+
|
289 |
+
class NormalTransform(ImageTransform):
|
290 |
+
|
291 |
+
def __init__(self, standardize_surface_normals=False):
|
292 |
+
self.normal_mean = (0.5, 0.5, 0.5) if not standardize_surface_normals else IMAGENET_SURFACE_NORMAL_MEAN
|
293 |
+
self.normal_std = (0.5, 0.5, 0.5) if not standardize_surface_normals else IMAGENET_SURFACE_NORMAL_STD
|
294 |
+
|
295 |
+
def normal_to_tensor(self, img):
|
296 |
+
img = TF.to_tensor(img)
|
297 |
+
img = TF.normalize(img, mean=self.normal_mean, std=self.normal_std)
|
298 |
+
return img
|
299 |
+
|
300 |
+
def load(self, path):
|
301 |
+
sample = self.pil_loader(path)
|
302 |
+
return sample
|
303 |
+
|
304 |
+
def preprocess(self, sample):
|
305 |
+
return sample
|
306 |
+
|
307 |
+
def image_hflip(self, img: Image, flip: bool):
|
308 |
+
if flip:
|
309 |
+
img = TF.hflip(img)
|
310 |
+
flipped_np = np.array(img)
|
311 |
+
flipped_np[:, :, 0] = 255 - flipped_np[:, :, 0]
|
312 |
+
img = Image.fromarray(flipped_np)
|
313 |
+
|
314 |
+
return img
|
315 |
+
|
316 |
+
def image_augment(self, img, crop_coords: Tuple, flip: bool, orig_size: Tuple, target_size: Tuple,
|
317 |
+
rand_aug_idx: Optional[int], resample_mode: str = None):
|
318 |
+
img = self.image_crop_and_resize(img, crop_coords, target_size, resample_mode=resample_mode)
|
319 |
+
img = self.image_hflip(img, flip)
|
320 |
+
return img
|
321 |
+
|
322 |
+
def postprocess(self, sample):
|
323 |
+
sample = self.normal_to_tensor(sample)
|
324 |
+
return sample
|
325 |
+
|
326 |
+
|
327 |
+
class SemsegTransform(ImageTransform):
|
328 |
+
|
329 |
+
def __init__(self, scale_factor=1.0, shift_idx_by_one=False, id_mapping: Optional[Dict] = None, select_channel=None):
|
330 |
+
self.scale_factor = scale_factor
|
331 |
+
self.shift_idx_by_one = shift_idx_by_one
|
332 |
+
self.id_mapping = id_mapping
|
333 |
+
self.select_channel = select_channel
|
334 |
+
|
335 |
+
def map_semseg_values(self, sample):
|
336 |
+
sample = np.asarray(sample)
|
337 |
+
mapping_fn = lambda x: self.id_mapping.get(x, x)
|
338 |
+
sample = np.vectorize(mapping_fn)(sample)
|
339 |
+
sample = Image.fromarray(sample, mode='P')
|
340 |
+
return sample
|
341 |
+
|
342 |
+
def semseg_to_tensor(self, img):
|
343 |
+
# Rescale to scale factor
|
344 |
+
if self.scale_factor != 1.0:
|
345 |
+
target_height, target_width = int(img.height * self.scale_factor), int(img.width * self.scale_factor)
|
346 |
+
img = img.resize((target_width, target_height))
|
347 |
+
# Using pil_to_tensor keeps it in uint8, to_tensor converts it to float (rescaled to [0, 1])
|
348 |
+
img = TF.pil_to_tensor(img).to(torch.long).squeeze(0)
|
349 |
+
# 255->0, 254->0, all else shifted up by one
|
350 |
+
return img
|
351 |
+
|
352 |
+
def load(self, path):
|
353 |
+
sample = self.pil_loader(path)
|
354 |
+
if self.select_channel is not None:
|
355 |
+
sample = sample.split()[self.select_channel]
|
356 |
+
return sample
|
357 |
+
|
358 |
+
def preprocess(self, sample):
|
359 |
+
sample = sample.convert('P')
|
360 |
+
|
361 |
+
if self.id_mapping is not None:
|
362 |
+
sample = self.map_semseg_values(sample)
|
363 |
+
|
364 |
+
if self.shift_idx_by_one:
|
365 |
+
sample = np.asarray(sample)
|
366 |
+
sample = sample + 1
|
367 |
+
sample = Image.fromarray(sample, mode='P')
|
368 |
+
|
369 |
+
return sample
|
370 |
+
|
371 |
+
def image_augment(self, img, crop_coords: Tuple, flip: bool, orig_size: Tuple, target_size: Tuple,
|
372 |
+
rand_aug_idx: Optional[int], resample_mode: str = None):
|
373 |
+
# Value for padding with TF.crop is always 0.
|
374 |
+
# Override resampling mode to 'nearest' for semseg
|
375 |
+
img = self.image_crop_and_resize(img, crop_coords, target_size, resample_mode='nearest')
|
376 |
+
img = self.image_hflip(img, flip)
|
377 |
+
return img
|
378 |
+
|
379 |
+
def postprocess(self, sample):
|
380 |
+
img = self.semseg_to_tensor(sample)
|
381 |
+
return img
|
382 |
+
|
383 |
+
|
384 |
+
class SAMInstanceTransform(AbstractTransform):
|
385 |
+
|
386 |
+
def __init__(self, mask_size=64, max_instance_n=20, bbox_area_threshold=0.0005):
|
387 |
+
self.mask_size = mask_size
|
388 |
+
self.max_instance_n = max_instance_n
|
389 |
+
self.bbox_area_threshold = bbox_area_threshold
|
390 |
+
|
391 |
+
def get_bbox(self, instance):
|
392 |
+
""" Gets bounding box of the given instance
|
393 |
+
"""
|
394 |
+
min_h, max_h = instance[:,:,1].min(), instance[:,:,1].max()
|
395 |
+
min_w, max_w = instance[:,:,0].min(), instance[:,:,0].max()
|
396 |
+
return [min_h, min_w, max_h, max_w]
|
397 |
+
|
398 |
+
def extend_instance_points(self, instance, border_fn):
|
399 |
+
""" Given an instance and a border function `border_fn`, extends the instance points with crossing points between the instance and
|
400 |
+
the crop borders. The crossing points are obtained using border_fn.
|
401 |
+
"""
|
402 |
+
p = instance[:,0]
|
403 |
+
p_next = np.roll(p, (-1), axis=(0))
|
404 |
+
final_points = []
|
405 |
+
for x, xn in zip(p, p_next):
|
406 |
+
final_points.append(x)
|
407 |
+
for r in border_fn(x, xn):
|
408 |
+
final_points.append(r.astype(np.int32))
|
409 |
+
p = np.stack(final_points)
|
410 |
+
return p[:,None]
|
411 |
+
|
412 |
+
def remove_redundant_lines(self, orig_instance, instance):
|
413 |
+
""" Removes the redundant lines added during cropping.
|
414 |
+
"""
|
415 |
+
final_points = []
|
416 |
+
for p in instance:
|
417 |
+
distance = cv2.pointPolygonTest(orig_instance, (p[0,0].item(), p[0,1].item()), measureDist=True)
|
418 |
+
if distance >= 0:
|
419 |
+
final_points.append(p[0])
|
420 |
+
return np.stack(final_points)[:,None]
|
421 |
+
|
422 |
+
def get_border_functions(self, crop_points):
|
423 |
+
""" Creates and returns a function `fn` using crop region coordinates given in crop_points.
|
424 |
+
`fn` receives two input points x and xn and returns all the crossing points between the line connecting
|
425 |
+
x and xn, and the borders of the cropping rectangle.
|
426 |
+
"""
|
427 |
+
p = crop_points[:,0]
|
428 |
+
p_next = np.roll(p, (-1), axis=(0))
|
429 |
+
def fn(x, xn):
|
430 |
+
output = []
|
431 |
+
c_diff = p_next - p
|
432 |
+
x_diff = x - xn
|
433 |
+
for diff, c in zip(c_diff, p):
|
434 |
+
A = np.array([
|
435 |
+
[diff[0], x_diff[0]],
|
436 |
+
[diff[1], x_diff[1]]
|
437 |
+
])
|
438 |
+
b = x - c
|
439 |
+
try:
|
440 |
+
lmbda = np.linalg.solve(A, b)
|
441 |
+
if 0 <= lmbda[0] <= 1 and 0 <= lmbda[1] <= 1:
|
442 |
+
output.append(lmbda[1] * xn + (1-lmbda[1]) * x)
|
443 |
+
except np.linalg.LinAlgError:  # singular system: the two segments are parallel
|
444 |
+
continue
|
445 |
+
return output
|
446 |
+
return fn
|
447 |
+
|
448 |
+
def crop_sample(self, sample, crop_coords):
|
449 |
+
""" Crop the sample using crop coordinates.
|
450 |
+
"""
|
451 |
+
top, left, h, w = crop_coords
|
452 |
+
crop_region = (left, top, left + w, top + h)
|
453 |
+
crop_points = np.array([
|
454 |
+
[crop_region[0], crop_region[1]],
|
455 |
+
[crop_region[2], crop_region[1]],
|
456 |
+
[crop_region[2], crop_region[3]],
|
457 |
+
[crop_region[0], crop_region[3]],
|
458 |
+
])[:,None]
|
459 |
+
border_functions = self.get_border_functions(crop_points)
|
460 |
+
cropped_sample = []
|
461 |
+
for instance in sample:
|
462 |
+
instance = self.extend_instance_points(instance, border_functions)
|
463 |
+
filter_condition = (
|
464 |
+
(instance[:, :, 0] > crop_region[0]) &
|
465 |
+
(instance[:, :, 0] < crop_region[2]) &
|
466 |
+
(instance[:, :, 1] > crop_region[1]) &
|
467 |
+
(instance[:, :, 1] < crop_region[3])
|
468 |
+
)
|
469 |
+
if not np.any(filter_condition):
|
470 |
+
continue
|
471 |
+
|
472 |
+
instance_copy = instance.copy()
|
473 |
+
instance_copy[:, :, 0] = np.clip(instance[:, :, 0], a_min=crop_region[0], a_max=crop_region[2])
|
474 |
+
instance_copy[:, :, 1] = np.clip(instance[:, :, 1], a_min=crop_region[1], a_max=crop_region[3])
|
475 |
+
instance_copy = self.remove_redundant_lines(instance, instance_copy)
|
476 |
+
instance_copy[:, :, 0] -= crop_region[0]
|
477 |
+
instance_copy[:, :, 1] -= crop_region[1]
|
478 |
+
|
479 |
+
cropped_sample.append(instance_copy)
|
480 |
+
return cropped_sample
|
481 |
+
|
482 |
+
def resize_sample(self, sample, original_size, target_size):
|
483 |
+
""" Resize the sample
|
484 |
+
"""
|
485 |
+
width_scale = target_size[1] / original_size[1]
|
486 |
+
height_scale = target_size[0] / original_size[0]
|
487 |
+
resized_sample = []
|
488 |
+
for instance in sample:
|
489 |
+
instance_copy = instance.copy()
|
490 |
+
instance_copy[:, :, 0] = np.round(width_scale * instance_copy[:, :, 0])
|
491 |
+
instance_copy[:, :, 1] = np.round(height_scale * instance_copy[:, :, 1])
|
492 |
+
resized_sample.append(instance_copy)
|
493 |
+
return resized_sample
|
494 |
+
|
495 |
+
def remove_tiny_instances(self, sample, image_size):
|
496 |
+
""" Remove instances that have an area ratio smaller than `bbox_area_threshold`.
|
497 |
+
"""
|
498 |
+
filtered_sample = []
|
499 |
+
for instance in sample:
|
500 |
+
min_h, min_w, max_h, max_w = self.get_bbox(instance)
|
501 |
+
bbox_area_ratio = (max_h - min_h) * (max_w - min_w) / (image_size[0] * image_size[1])
|
502 |
+
if bbox_area_ratio < self.bbox_area_threshold:
|
503 |
+
continue
|
504 |
+
filtered_sample.append(instance)
|
505 |
+
return filtered_sample
|
506 |
+
|
507 |
+
def hflip(self, sample, width):
|
508 |
+
""" Horizontal flipping the instances in a sample.
|
509 |
+
"""
|
510 |
+
flipped_sample = []
|
511 |
+
for instance in sample:
|
512 |
+
instance_copy = instance.copy()
|
513 |
+
instance_copy[:, :, 0] = width - instance_copy[:, :, 0]
|
514 |
+
flipped_sample.append(instance_copy)
|
515 |
+
return flipped_sample
|
516 |
+
|
517 |
+
def get_binary_masks(self, sample):
|
518 |
+
""" Creates the binary mask of each instance in the sample.
|
519 |
+
"""
|
520 |
+
if self.max_instance_n is None:
|
521 |
+
max_instance_n = len(sample)
|
522 |
+
else:
|
523 |
+
max_instance_n = self.max_instance_n
|
524 |
+
masks = np.zeros((max_instance_n, self.mask_size, self.mask_size))
|
525 |
+
bboxes = np.zeros((max_instance_n, 4))
|
526 |
+
valid = np.full(max_instance_n, False)
|
527 |
+
for i, instance in enumerate(sample):
|
528 |
+
bbox = self.get_bbox(instance)
|
529 |
+
min_h, min_w, max_h, max_w = bbox
|
530 |
+
instance_copy = instance.copy()
|
531 |
+
mask = np.zeros((self.mask_size, self.mask_size), dtype=np.uint8)
|
532 |
+
instance_copy[:,:,0] = (instance_copy[:,:,0] - min_w) / (max_w - min_w) * self.mask_size
|
533 |
+
instance_copy[:,:,1] = (instance_copy[:,:,1] - min_h) / (max_h - min_h) * self.mask_size
|
534 |
+
cv2.drawContours(mask, [instance_copy], 0, (255), thickness=cv2.FILLED)
|
535 |
+
masks[i] = mask / 255.0
|
536 |
+
bboxes[i] = np.array(bbox)
|
537 |
+
valid[i] = True
|
538 |
+
return masks, bboxes, valid
|
539 |
+
|
540 |
+
def load(self, path):
|
541 |
+
sample = np.load(path, allow_pickle=True)
|
542 |
+
return sample
|
543 |
+
|
544 |
+
def preprocess(self, sample):
|
545 |
+
if self.max_instance_n is None or len(sample) <= self.max_instance_n:
|
546 |
+
indices = np.arange(len(sample))
|
547 |
+
else:
|
548 |
+
indices = np.random.choice(len(sample), size=self.max_instance_n, replace=False)
|
549 |
+
return [p['points'] for i, p in enumerate(sample) if i in indices]
|
550 |
+
|
551 |
+
def image_augment(self, v, crop_coords: Tuple, flip: bool, orig_size: Tuple, target_size: Tuple,
|
552 |
+
rand_aug_idx: Optional[int], resample_mode: str = None):
|
553 |
+
v = self.crop_sample(v, crop_coords)
|
554 |
+
_, _, h, w = crop_coords
|
555 |
+
v = self.resize_sample(v, (h, w), target_size)
|
556 |
+
v = self.remove_tiny_instances(v, target_size)
|
557 |
+
if flip:
|
558 |
+
v = self.hflip(v, target_size[1])  # target_size is (height, width); hflip needs the width
|
559 |
+
return v
|
560 |
+
|
561 |
+
def postprocess(self, sample):
|
562 |
+
sample, bboxes, valid = self.get_binary_masks(sample)
|
563 |
+
return {
|
564 |
+
'instance': torch.from_numpy(sample).to(torch.float32),
|
565 |
+
'bbox': torch.from_numpy(bboxes).to(torch.float32),
|
566 |
+
'valid': torch.from_numpy(valid)
|
567 |
+
}
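# Shape note (editor's addition, derived from the defaults above): with mask_size=64
# and max_instance_n=20, the returned dict contains
#   'instance': float32 tensor of shape (20, 64, 64)  - per-instance binary masks
#   'bbox':     float32 tensor of shape (20, 4)        - [min_h, min_w, max_h, max_w]
#   'valid':    bool tensor of shape (20,)              - True for filled instance slots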
|
568 |
+
|
569 |
+
|
570 |
+
class MaskTransform(ImageTransform):
|
571 |
+
|
572 |
+
def __init__(self, mask_pool_size=1):
|
573 |
+
assert isinstance(mask_pool_size, int)
|
574 |
+
self.mask_pool_size = mask_pool_size # Use to expand masks
|
575 |
+
|
576 |
+
def mask_to_tensor(self, img):
|
577 |
+
mask = TF.to_tensor(img)
|
578 |
+
if self.mask_pool_size > 1:
|
579 |
+
mask = reduce(mask, 'c (h1 h2) (w1 w2) -> c h1 w1', 'min', h2=self.mask_pool_size, w2=self.mask_pool_size)
|
580 |
+
mask = repeat(mask, 'c h1 w1 -> c (h1 h2) (w1 w2)', h2=self.mask_pool_size, w2=self.mask_pool_size)
|
581 |
+
return (mask == 1.0)
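# Illustrative note (editor's addition): with mask_pool_size=2 the min-pool + repeat
# above invalidates any 2x2 block that contains a single invalid pixel, e.g.
# [[1, 1], [1, 0]] -> min 0 -> broadcast back to [[0, 0], [0, 0]] -> all False.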
|
582 |
+
|
583 |
+
def load(self, path):
|
584 |
+
sample = self.pil_loader(path)
|
585 |
+
return sample
|
586 |
+
|
587 |
+
def preprocess(self, sample):
|
588 |
+
return sample
|
589 |
+
|
590 |
+
def image_augment(self, img, crop_coords: Tuple, flip: bool, orig_size: Tuple, target_size: Tuple,
|
591 |
+
rand_aug_idx: Optional[int], resample_mode: str = None):
|
592 |
+
# Override resampling mode to 'nearest' for masks
|
593 |
+
img = self.image_crop_and_resize(img, crop_coords, target_size, resample_mode='nearest')
|
594 |
+
img = self.image_hflip(img, flip)
|
595 |
+
return img
|
596 |
+
|
597 |
+
def postprocess(self, sample):
|
598 |
+
sample = self.mask_to_tensor(sample)
|
599 |
+
return sample
|
600 |
+
|
601 |
+
|
602 |
+
class TokTransform(AbstractTransform):
|
603 |
+
|
604 |
+
def __init__(self):
|
605 |
+
pass
|
606 |
+
|
607 |
+
def load(self, path):
|
608 |
+
sample = np.load(path).astype(int)
|
609 |
+
return sample
|
610 |
+
|
611 |
+
def preprocess(self, sample):
|
612 |
+
return sample
|
613 |
+
|
614 |
+
def image_augment(self, v, crop_coords: Tuple, flip: bool, orig_size: Tuple, target_size: Tuple,
|
615 |
+
rand_aug_idx: Optional[int], resample_mode: str = None):
|
616 |
+
if rand_aug_idx is None:
|
617 |
+
raise ValueError("Crop settings / augmentation index are missing but a pre-tokenized modality is being used")
|
618 |
+
v = torch.tensor(v[rand_aug_idx])
|
619 |
+
return v
|
620 |
+
|
621 |
+
def postprocess(self, sample):
|
622 |
+
return sample
|
623 |
+
|
624 |
+
|
625 |
+
class DetectionTransform(AbstractTransform):
|
626 |
+
|
627 |
+
def __init__(self, det_threshold=0.6, det_max_instances=None, bbox_order='dist_to_orig', coord_bins=1000, min_visibility=0.0, return_raw=False):
|
628 |
+
self.det_threshold = det_threshold
|
629 |
+
self.det_max_instances = det_max_instances
|
630 |
+
self.coord_bins = coord_bins
|
631 |
+
self.min_visibility = min_visibility
|
632 |
+
self.return_raw = return_raw
|
633 |
+
|
634 |
+
if bbox_order == 'area':
|
635 |
+
self.bbox_order = self.order_bboxes_by_area
|
636 |
+
elif bbox_order == 'score':
|
637 |
+
self.bbox_order = self.order_bboxes_by_score
|
638 |
+
elif bbox_order == 'random':
|
639 |
+
self.bbox_order = self.shuffle_bboxes
|
640 |
+
else:
|
641 |
+
self.bbox_order = self.order_bboxes_by_dist_to_orig
|
642 |
+
|
643 |
+
@staticmethod
|
644 |
+
def order_bboxes_by_area(bboxes):
|
645 |
+
return sorted(bboxes, key=lambda x: (x[2] - x[0]) * (x[3] - x[1]), reverse=True)
|
646 |
+
|
647 |
+
@staticmethod
|
648 |
+
def order_bboxes_by_dist_to_orig(bboxes):
|
649 |
+
return sorted(bboxes, key=lambda x: x[0] ** 2 + x[1] ** 2)
|
650 |
+
|
651 |
+
@staticmethod
|
652 |
+
def order_bboxes_by_score(bboxes):
|
653 |
+
return sorted(bboxes, key=lambda x: x[5], reverse=True)
|
654 |
+
|
655 |
+
@staticmethod
|
656 |
+
def shuffle_bboxes(bboxes):
|
657 |
+
return sorted(bboxes, key=lambda x: random.random())
|
658 |
+
|
659 |
+
def convert_detection_instance(self, instances):
|
660 |
+
"""Convert instances dict to list of lists where each list takes the form:
|
661 |
+
[xmin, ymin, xmax, ymax, class_name, score]
|
662 |
+
"""
|
663 |
+
|
664 |
+
instances = [inst['boxes'] + [inst['class_name'], inst['score']] for inst in instances if inst['score'] >= self.det_threshold]
|
665 |
+
return instances
|
666 |
+
|
667 |
+
def bboxes_hflip(self, bboxes: List[Tuple], image_size: Tuple, flip: bool):
|
668 |
+
image_height, image_width = image_size
|
669 |
+
if flip:
|
670 |
+
bboxes = [tuple(A.bbox_hflip(bbox[:4], rows=image_height, cols=image_width)) + tuple(bbox[4:])
|
671 |
+
for bbox in bboxes]
|
672 |
+
|
673 |
+
return bboxes
|
674 |
+
|
675 |
+
def bboxes_crop_and_resize(self, bboxes: List[Tuple], crop_coords: Tuple, orig_size: Tuple):
|
676 |
+
"""Crop and resize bounding boxes
|
677 |
+
|
678 |
+
Args:
|
679 |
+
bboxes: Bounding boxes to crop and resize
|
680 |
+
crop_coords: Coordinates of the crop (top, left, h, w)
|
681 |
+
orig_size: Size of the original image
|
682 |
+
|
683 |
+
Returns:
|
684 |
+
Cropped and resized bounding boxes
|
685 |
+
"""
|
686 |
+
orig_height, orig_width = orig_size
|
687 |
+
top, left, h, w = crop_coords
|
688 |
+
xmin, ymin, xmax, ymax = left, top, left + w, top + h
|
689 |
+
bboxes = [tuple(A.bbox_crop(bbox[:4], x_min=xmin, y_min=ymin, x_max=xmax, y_max=ymax, rows=orig_height,
|
690 |
+
cols=orig_width)) + tuple(bbox[4:])
|
691 |
+
for bbox in bboxes]
|
692 |
+
bboxes = A.core.bbox_utils.filter_bboxes(bboxes, rows=h, cols=w, min_visibility=self.min_visibility)
|
693 |
+
# No need to resize, bounding boxes in albumentations format are scale invariant
|
694 |
+
|
695 |
+
return bboxes
|
696 |
+
|
697 |
+
def order_and_filter_bboxes(self, bboxes):
|
698 |
+
if self.det_max_instances is not None and len(bboxes) > self.det_max_instances:
|
699 |
+
bboxes = self.order_bboxes_by_score(bboxes)[:self.det_max_instances]
|
700 |
+
|
701 |
+
return self.bbox_order(bboxes)
|
702 |
+
|
703 |
+
def convert_bboxes_to_string(self, bboxes: List[Tuple]):
|
704 |
+
"""Convert bounding boxes to a string.
|
705 |
+
xmin, ymin, xmax, ymax are mapped to v0, v1, v2, v3 special tokens.
|
706 |
+
|
707 |
+
Args:
|
708 |
+
bboxes: Bounding boxes
|
709 |
+
|
710 |
+
Returns:
|
711 |
+
String representation of the bounding boxes
|
712 |
+
"""
|
713 |
+
# Remove score, quantize coordinates
|
714 |
+
bins = self.coord_bins
|
715 |
+
|
716 |
+
bboxes = [
|
717 |
+
[
|
718 |
+
f"v0={round(xmin * (bins - 1))}",
|
719 |
+
f"v1={round(ymin * (bins - 1))}",
|
720 |
+
f"v2={round(xmax * (bins - 1))}",
|
721 |
+
f"v3={round(ymax * (bins - 1))}",
|
722 |
+
cls,
|
723 |
+
]
|
724 |
+
for (xmin, ymin, xmax, ymax, cls, score) in bboxes
|
725 |
+
]
|
726 |
+
# Convert each bounding box to a string
|
727 |
+
bboxes = [' '.join(b) for b in bboxes]
|
728 |
+
# Convert the list to a str
|
729 |
+
return ' '.join(bboxes)
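# Worked example (editor's addition, hypothetical values): with coord_bins=1000, a box
# (xmin=0.0, ymin=0.25, xmax=0.5, ymax=1.0, 'dog', 0.9) is encoded as the string
#   "v0=0 v1=250 v2=500 v3=999 dog"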
|
730 |
+
|
731 |
+
def load(self, path):
|
732 |
+
with open(path, 'r') as f:
|
733 |
+
sample = json.load(f)
|
734 |
+
|
735 |
+
return sample
|
736 |
+
|
737 |
+
def preprocess(self, sample):
|
738 |
+
instances = sample['instances']
|
739 |
+
return self.convert_detection_instance(instances)
|
740 |
+
|
741 |
+
def image_augment(self, bboxes: List[Tuple], crop_coords: Tuple, flip: bool, orig_size: Tuple, target_size: Tuple,
|
742 |
+
rand_aug_idx=None, resample_mode: str = None):
|
743 |
+
bboxes = self.bboxes_crop_and_resize(bboxes, crop_coords, orig_size)
|
744 |
+
bboxes = self.bboxes_hflip(bboxes, target_size, flip)
|
745 |
+
bboxes = self.order_and_filter_bboxes(bboxes)
|
746 |
+
return bboxes
|
747 |
+
|
748 |
+
def postprocess(self, bboxes):
|
749 |
+
if self.return_raw:
|
750 |
+
return bboxes
|
751 |
+
bboxes = self.convert_bboxes_to_string(bboxes)
|
752 |
+
return bboxes
|
753 |
+
|
754 |
+
|
755 |
+
class CaptionTransform(AbstractTransform):
|
756 |
+
|
757 |
+
def __init__(self, aligned_captions=True, no_aug=False):
|
758 |
+
self.aligned_captions = aligned_captions
|
759 |
+
self.no_aug = no_aug
|
760 |
+
|
761 |
+
def load(self, path):
|
762 |
+
# Caption can be stored as .txt, .json, or .json.gz (in the JSON cases it may be a list of dicts)
|
763 |
+
if path.endswith('.txt'):
|
764 |
+
sample = Path(path).read_text()
|
765 |
+
elif path.endswith('.json'):
|
766 |
+
with open(path, 'r') as f:
|
767 |
+
sample = json.load(f)
|
768 |
+
elif path.endswith('.json.gz'):
|
769 |
+
with gzip.open(path, 'rb') as f:
|
770 |
+
sample = json.load(f)
|
771 |
+
return sample
|
772 |
+
|
773 |
+
def preprocess(self, sample):
|
774 |
+
return sample
|
775 |
+
|
776 |
+
def image_augment(self, val, crop_coords: Tuple, flip: bool, orig_size: Tuple, target_size: Tuple,
|
777 |
+
rand_aug_idx: Optional[int], resample_mode: str = None):
|
778 |
+
|
779 |
+
if isinstance(val, list) or isinstance(val, tuple):
|
780 |
+
if self.aligned_captions:
|
781 |
+
val = val[0] if rand_aug_idx is None else val[rand_aug_idx]
|
782 |
+
else:
|
783 |
+
val = random.choice(val) if not self.no_aug else val[0]
|
784 |
+
|
785 |
+
if isinstance(val, dict):
|
786 |
+
# If each caption is saved as a dict, extract the string
|
787 |
+
val = val["caption"]
|
788 |
+
assert isinstance(val, str)
|
789 |
+
|
790 |
+
return val
|
791 |
+
|
792 |
+
def postprocess(self, sample):
|
793 |
+
return sample
|
794 |
+
|
795 |
+
|
796 |
+
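# --- Illustrative sketch (not part of the original diff) ---
# With aligned_captions=True, the caption chosen for a sample follows the same augmentation index
# as the pre-tokenized image crops, keeping text and image in sync. The caption values below are
# hypothetical.
def _example_aligned_caption(rand_aug_idx=1):
    transform = CaptionTransform(aligned_captions=True)
    captions = [{"caption": "caption for crop 0"}, {"caption": "caption for crop 1"}]
    # The geometric arguments are unused by CaptionTransform.image_augment
    return transform.image_augment(captions, crop_coords=None, flip=False, orig_size=None,
                                   target_size=None, rand_aug_idx=rand_aug_idx)  # -> "caption for crop 1"

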
class CaptionEmbTransform(AbstractTransform):

    def __init__(self, aligned_captions=True, no_aug=False):
        self.aligned_captions = aligned_captions
        self.no_aug = no_aug

    def load(self, path):
        if path.endswith('.npz'):
            sample = np.load(path)
            sample = {'emb': sample['emb'], 'mask_valid': sample['mask_valid']}
        else:
            raise ValueError(f"Invalid file format for caption embedding: {path}")
        return sample

    def preprocess(self, sample):
        return sample

    def image_augment(self, val, crop_coords: Tuple, flip: bool, orig_size: Tuple, target_size: Tuple,
                      rand_aug_idx: Optional[int], resample_mode: str = None):

        emb = val['emb']
        mask_valid = val['mask_valid'].astype(bool)
        num_sequences = emb.shape[0]

        if num_sequences > 1:
            if self.aligned_captions:
                if rand_aug_idx is None:
                    emb, mask_valid = emb[0], mask_valid[0]
                else:
                    emb, mask_valid = emb[rand_aug_idx], mask_valid[rand_aug_idx]
            else:
                if self.no_aug:
                    emb, mask_valid = emb[0], mask_valid[0]
                else:
                    rand_idx = random.randint(0, num_sequences - 1)
                    emb, mask_valid = emb[rand_idx], mask_valid[rand_idx]
        else:
            emb, mask_valid = emb[0], mask_valid[0]

        emb = emb[mask_valid]  # Keep only valid embeddings

        return emb

    def postprocess(self, sample):
        return torch.tensor(sample)

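# --- Illustrative sketch (not part of the original diff) ---
# The .npz layout CaptionEmbTransform.load expects: a stack of caption embeddings plus a boolean
# validity mask, one row per stored caption/augmentation. Shapes and the file name are hypothetical.
def _example_caption_emb_npz(path='caption_emb_example.npz', n_captions=2, seq_len=8, dim=4):
    emb = np.random.randn(n_captions, seq_len, dim).astype(np.float32)
    mask_valid = np.zeros((n_captions, seq_len), dtype=bool)
    mask_valid[:, :5] = True  # only the first 5 tokens of each caption are valid
    np.savez(path, emb=emb, mask_valid=mask_valid)
    return path

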
class MetadataTransform(AbstractTransform):

    def __init__(self,
                 special_vmin: int = 0,
                 special_vmax: int = 999,
                 shuffle: bool = True,
                 random_trunc: bool = False,
                 return_chunks: bool = True,
                 return_raw: bool = False,
                 image_dim_bin_size: int = 32):
        """Metadata transform that takes in a metadata dictionary and converts
        it into a string, or list of strings (for chunked span masking).
        Uses special tokens v1 to denote metadata types, and v0 for their values.

        Args:
            special_vmin: Minimum value for special tokens
            special_vmax: Maximum value for special tokens
            shuffle: Whether to shuffle the metadata order
            random_trunc: Whether to randomly truncate the returned metadata
            return_chunks: Whether to return a list of strings (for chunked span masking),
                or a single string with all metadata concatenated
            return_raw: Whether to return the raw metadata dictionary
        """
        self.special_vmin = special_vmin
        self.special_vmax = special_vmax
        self.shuffle = shuffle
        self.random_trunc = random_trunc
        self.return_chunks = return_chunks
        self.return_raw = return_raw
        self.image_dim_bin_size = image_dim_bin_size

        # Explicit map to make sure that additional entries do not change existing IDs
        # TODO: Make this work with other text tokenizers
        self.metadata_id_map = {
            'original_width': 'v1=0',
            'original_height': 'v1=1',
            'caption_n_chars': 'v1=2',
            'caption_n_words': 'v1=3',
            'caption_n_sentences': 'v1=4',
            'n_humans': 'v1=5',
            'n_sam_instances': 'v1=6',
            'n_coco_instances': 'v1=7',
            'coco_instance_diversity': 'v1=8',
            'colorfulness': 'v1=9',
            'brightness': 'v1=10',
            'contrast': 'v1=11',
            'saturation': 'v1=12',
            'entropy': 'v1=13',
            'walkability': 'v1=14',
            'objectness': 'v1=15',
            'semantic_diversity': 'v1=16',
            'geometric_complexity': 'v1=17',
            'occlusion_score': 'v1=18',
            'watermark_score': 'v1=19',
            'aesthetic_score': 'v1=20',
        }
        self.id_metadata_map = {v: k for k, v in self.metadata_id_map.items()}

        # Image-dimension modalities are binned into 32 bins
        self.image_dim_modalities = ['original_height', 'original_width']

        # Integer modalities that don't undergo any scaling (except for truncation)
        self.metadata_int_modalities = [
            'caption_n_chars', 'caption_n_words', 'caption_n_sentences',
            'n_humans', 'n_sam_instances', 'n_coco_instances',
            'coco_instance_diversity', 'semantic_diversity',
        ]

        # Bin boundaries for manually defined metadata modalities.
        # Lowest and highest bin boundaries are implicitly set to -inf and +inf
        self.metadata_manual_bins = {
            'watermark_score': [0.5],
            'aesthetic_score': [4.5, 5.5],
        }

        # All other float or integer modalities that are binned into a defined number of bins
        # Dictionary entries are (vmin, vmax, num_bins)
        self.metadata_min_max_bins = {
            'colorfulness': (0, 150, 50),
            'brightness': (0, 255, 50),
            'contrast': (0, 127, 50),
            'saturation': (0, 255, 50),
            'entropy': (0, 10, 50),
            'walkability': (0, 1, 50),
            'objectness': (0, 1, 50),
            'geometric_complexity': (0, 0.75, 50),
            'occlusion_score': (0, 0.25, 50),
        }

    def image_dim_to_string(self, metadata, key, bin_size=32):
        value = metadata[key] // bin_size
        value = max(self.special_vmin, min(value, self.special_vmax))
        return f"{self.metadata_id_map[key]} v0={value}"

    def int_metadata_to_string(self, metadata, key):
        value = max(self.special_vmin, min(metadata[key], self.special_vmax))
        return f"{self.metadata_id_map[key]} v0={value}"

    def float_metadata_to_string(self, metadata, key, vmin, vmax, bins):
        value = max(vmin, min(metadata[key], vmax))
        value = (value - vmin) / (vmax - vmin)
        value = int(value * (bins - 1))
        return f"{self.metadata_id_map[key]} v0={value}"

    def manual_bin_metadata_to_string(self, metadata, key):
        value = metadata[key]
        bin_idx = 0
        for bin_value in self.metadata_manual_bins[key]:
            if value < bin_value:
                break
            bin_idx += 1
        return f"{self.metadata_id_map[key]} v0={bin_idx}"

    def metadata_to_string(self, metadata, keys: List[str] = None):
        keys = list(metadata.keys()) if keys is None else keys

        if self.shuffle:
            # Randomly shuffle
            random.shuffle(keys)
        if self.random_trunc:
            # Randomly truncate
            keys = keys[:random.randint(1, len(keys))]

        metadata_strings = []

        for key in keys:
            if key in self.image_dim_modalities:
                # Image dimension modalities
                metadata_str = self.image_dim_to_string(metadata, key, bin_size=self.image_dim_bin_size)
            elif key in self.metadata_int_modalities:
                # Integer modalities that don't undergo any scaling
                metadata_str = self.int_metadata_to_string(metadata, key)
            elif key in self.metadata_manual_bins:
                # Metadata modalities for which bin boundaries are manually defined
                metadata_str = self.manual_bin_metadata_to_string(metadata, key)
            else:
                # All other modalities
                vmin, vmax, bins = self.metadata_min_max_bins[key]
                metadata_str = self.float_metadata_to_string(metadata, key, vmin, vmax, bins)

            metadata_strings.append(metadata_str)

        if self.return_chunks:
            return metadata_strings
        else:
            return ' '.join(metadata_strings)

    def load(self, path):
        with open(path, 'r') as f:
            sample = json.load(f)
        return sample

    def preprocess(self, sample):
        return sample

    def image_augment(self, val, crop_coords: Tuple, flip: bool, orig_size: Tuple, target_size: Tuple,
                      rand_aug_idx=None, resample_mode: str = None):
        return val

    def postprocess(self, metadata):
        if self.return_raw:
            return metadata
        metadata_str = self.metadata_to_string(metadata)
        return metadata_str

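# --- Illustrative sketch (not part of the original diff) ---
# The min/max binning used by MetadataTransform.float_metadata_to_string above, shown for a
# hypothetical brightness value of 200 with (vmin, vmax, bins) = (0, 255, 50).
def _example_float_bin(value=200, vmin=0, vmax=255, bins=50):
    value = max(vmin, min(value, vmax))     # clamp to [vmin, vmax]
    value = (value - vmin) / (vmax - vmin)  # normalize to [0, 1]
    return int(value * (bins - 1))          # -> 38, serialized as "v1=10 v0=38" for brightness

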
class HumanPoseTransform(AbstractTransform):
|
1010 |
+
|
1011 |
+
def __init__(self, coord_bins=1000, only_pose=False, return_raw=False):
|
1012 |
+
self.coord_bins = coord_bins
|
1013 |
+
self.return_raw = return_raw
|
1014 |
+
self.only_pose = only_pose
|
1015 |
+
|
1016 |
+
def convert_humanpose_instance(self, instances, only_pose=False):
|
1017 |
+
"""Convert instances dict to list of lists where each list takes the form:
|
1018 |
+
[human, xmin xmax ymin ymax global val1 val2 ... val10 pose val1 val2 ... val 207 shape val1 val2 ... val10 camera val1 val2 val3 val4]
|
1019 |
+
Like for bounding boxes, xmin, ymin, xmax, and ymax map to v0, v1, v2, and v3 respectively.
|
1020 |
+
"""
|
1021 |
+
if only_pose: # used for tokenizer training for pose
|
1022 |
+
if len(instances) == 0:
|
1023 |
+
return torch.zeros(207)
|
1024 |
+
else:
|
1025 |
+
return torch.from_numpy(np.array(instances['pred_smpl_params']['body_pose'][0]).flatten()).float()
|
1026 |
+
if len(instances) == 0: #empty, i.e. there are no humans
|
1027 |
+
return 'none'
|
1028 |
+
|
1029 |
+
for k in instances:
|
1030 |
+
if k!='pred_smpl_params':
|
1031 |
+
instances[k] = torch.from_numpy(np.array(instances[k]))
|
1032 |
+
|
1033 |
+
smpl_params = (instances['pred_smpl_params'])
|
1034 |
+
|
1035 |
+
for k in smpl_params:
|
1036 |
+
smpl_params[k] = torch.from_numpy(np.array(smpl_params[k]))
|
1037 |
+
|
1038 |
+
total_num_instances = len(instances['bbox_xyxy'])
|
1039 |
+
instances_converted = []
|
1040 |
+
for ii in range(total_num_instances):
|
1041 |
+
instances_converted.append(['human'] + (np.array(instances['bbox_xyxy'][ii]).flatten().tolist()) + ['global'] + (np.array(instances['pred_smpl_params']['global_orient'][ii]).flatten().tolist()) + ['pose'] + (instances['pose_tokenized'][ii].flatten().tolist()) + ['shape'] + (instances['pred_smpl_params']['betas'][ii].flatten().tolist()) + ['camera'] + (instances['pred_cam'][ii].flatten().tolist()))
|
1042 |
+
return instances_converted
|
1043 |
+
|
1044 |
+
def humanposes_crop_and_resize(self, humanposes: List[Tuple], crop_coords: Tuple, orig_size: Tuple,):
|
1045 |
+
"""Crop and resize human poses (and their bounding boxes)
|
1046 |
+
"""
|
1047 |
+
orig_height, orig_width = orig_size
|
1048 |
+
top, left, h, w = crop_coords
|
1049 |
+
|
1050 |
+
humanposes_converted_resized = []
|
1051 |
+
for instance in humanposes:
|
1052 |
+
bbox_curr = instance[1:5]
|
1053 |
+
bbox_curr = np.array(bbox_curr)
|
1054 |
+
bbox_curr[0::2] = bbox_curr[0::2] / orig_width
|
1055 |
+
bbox_curr[1::2] = bbox_curr[1::2] / orig_height
|
1056 |
+
|
1057 |
+
xmin, ymin, xmax, ymax = left, top, left + w, top + h
|
1058 |
+
bbox_curr = A.bbox_crop(bbox_curr, x_min=xmin, y_min=ymin, x_max=xmax, y_max=ymax, rows=orig_height,
|
1059 |
+
cols=orig_width)
|
1060 |
+
bbox_curr = np.array(bbox_curr)
|
1061 |
+
if np.all(bbox_curr[1::2]<0) or np.all(bbox_curr[0::2]<0): #bbox is out of range, remove it
|
1062 |
+
continue
|
1063 |
+
if np.all(bbox_curr[1::2]>1.0) or np.all(bbox_curr[0::2]>1.0): #bbox is out of range, remove it
|
1064 |
+
continue
|
1065 |
+
bbox_curr = np.clip(bbox_curr, a_min=0, a_max=1.)
|
1066 |
+
|
1067 |
+
instance[1:5] = bbox_curr
|
1068 |
+
humanposes_converted_resized.append(instance)
|
1069 |
+
|
1070 |
+
# now return all instances, or none if there is no instance
|
1071 |
+
if len(humanposes_converted_resized)>0:
|
1072 |
+
pass
|
1073 |
+
else: #no valid masks remains
|
1074 |
+
return 'none'
|
1075 |
+
|
1076 |
+
humanpose_returned = humanposes_converted_resized
|
1077 |
+
|
1078 |
+
return humanpose_returned
|
1079 |
+
|
1080 |
+
def convert_humanposes_to_string(self, all_humanposes: List[Tuple]):
|
1081 |
+
"""Convert humanposes to a string
|
1082 |
+
range of global orientation: [-1, 1]
|
1083 |
+
range of object pose: [-1, 1]
|
1084 |
+
range of shape (betas): [-3, 3]
|
1085 |
+
range of camera: [-1, 19]
|
1086 |
+
"""
|
1087 |
+
bins = self.coord_bins
|
1088 |
+
|
1089 |
+
instance_final_all = ''
|
1090 |
+
|
1091 |
+
for humanposes in all_humanposes:
|
1092 |
+
human = humanposes[0]
|
1093 |
+
bboxes = humanposes[1:5]
|
1094 |
+
glob = humanposes[5]
|
1095 |
+
global_orient = np.array(humanposes[6:15])
|
1096 |
+
pose = humanposes[15]
|
1097 |
+
pose_params = np.array(humanposes[16:24])
|
1098 |
+
shape = humanposes[24]
|
1099 |
+
shape_params = np.array(humanposes[25:35])
|
1100 |
+
camera = humanposes[35]
|
1101 |
+
camera_params = np.clip(np.array(humanposes[36:]), a_min=-1., a_max=19.)
|
1102 |
+
|
1103 |
+
bboxes_new = [
|
1104 |
+
f"v0={round(bboxes[0] * (bins - 1))}",
|
1105 |
+
f"v1={round(bboxes[1] * (bins - 1))}",
|
1106 |
+
f"v2={round(bboxes[2] * (bins - 1))}",
|
1107 |
+
f"v3={round(bboxes[3] * (bins - 1))}"]
|
1108 |
+
|
1109 |
+
global_orient = 499.5*global_orient
|
1110 |
+
global_orient_new = []
|
1111 |
+
for ii in range(len(global_orient)):
|
1112 |
+
global_orient_curr = f"v0={round(global_orient[ii]+499.5)}"
|
1113 |
+
global_orient_new.append(global_orient_curr)
|
1114 |
+
|
1115 |
+
pose_params_new = []
|
1116 |
+
for ii in range(len(pose_params)):
|
1117 |
+
if pose_params[ii]<512:
|
1118 |
+
pose_params_curr = f"v0={round(pose_params[ii])}"
|
1119 |
+
else:
|
1120 |
+
pose_params_curr = f"v1={round(pose_params[ii] - 512)}"
|
1121 |
+
pose_params_new.append(pose_params_curr)
|
1122 |
+
|
1123 |
+
shape_params = 166.5*shape_params
|
1124 |
+
shape_params_new = []
|
1125 |
+
for ii in range(len(shape_params)):
|
1126 |
+
shape_params_curr = f"v0={round(shape_params[ii]+499.5)}"
|
1127 |
+
shape_params_new.append(shape_params_curr)
|
1128 |
+
|
1129 |
+
camera_params = 49.95*camera_params
|
1130 |
+
camera_params_new = []
|
1131 |
+
for ii in range(len(camera_params)):
|
1132 |
+
camera_params_curr = f"v0={round(camera_params[ii]+49.95)}"
|
1133 |
+
camera_params_new.append(camera_params_curr)
|
1134 |
+
|
1135 |
+
#randomly shuffle everything except bbox part of the sequence
|
1136 |
+
all_strings = [[pose]+pose_params_new, [glob] + global_orient_new, [camera] + camera_params_new, [shape] + shape_params_new ]
|
1137 |
+
rand_perm = torch.randperm(4)
|
1138 |
+
instance_final = [human] + bboxes_new + all_strings[rand_perm[0]] + all_strings[rand_perm[1]] + all_strings[rand_perm[2]] + all_strings[rand_perm[3]]
|
1139 |
+
|
1140 |
+
|
1141 |
+
instance_final = ', '.join(instance_final)
|
1142 |
+
instance_final = instance_final.replace(",", "")
|
1143 |
+
instance_final_all = instance_final_all + instance_final + ' '
|
1144 |
+
|
1145 |
+
return instance_final_all
|
1146 |
+
|
1147 |
+
def load(self, path):
|
1148 |
+
with open(path, 'r') as f:
|
1149 |
+
sample = json.load(f)
|
1150 |
+
|
1151 |
+
return sample
|
1152 |
+
|
1153 |
+
def preprocess(self, sample):
|
1154 |
+
instances = sample
|
1155 |
+
instances = self.convert_humanpose_instance(instances, only_pose=self.only_pose)
|
1156 |
+
return instances
|
1157 |
+
|
1158 |
+
def image_augment(self, humanposes: List[Tuple], crop_coords: Tuple, flip: bool, orig_size: Tuple, target_size: Tuple,
|
1159 |
+
rand_aug_idx=None, resample_mode: str = None):
|
1160 |
+
if humanposes=='none' or self.only_pose:
|
1161 |
+
return humanposes
|
1162 |
+
humanposes = self.humanposes_crop_and_resize(humanposes, crop_coords, orig_size)
|
1163 |
+
return humanposes
|
1164 |
+
|
1165 |
+
def postprocess(self, humanposes):
|
1166 |
+
if humanposes=='none' or self.only_pose:
|
1167 |
+
return humanposes if not self.return_raw else []
|
1168 |
+
if self.return_raw:
|
1169 |
+
return humanposes
|
1170 |
+
humanposes = self.convert_humanposes_to_string(humanposes)
|
1171 |
+
return humanposes
|
1172 |
+
|
1173 |
+
|
1174 |
+
class ColorPaletteTransform(AbstractTransform):

    def __init__(self, coord_bins=1000, return_raw=False):
        self.coord_bins = coord_bins
        self.return_raw = return_raw

    def convert_palette_instance(self, instances):
        """Convert colors to v0= v0= ...
        """
        length = random.randint(1, 7)
        instances_converted = np.array(instances[0][str(length)]).flatten().tolist()
        return instances_converted

    def palette_hflip(self, palettes: List[Tuple], image_size: Tuple, flip: bool):
        return palettes

    def convert_palettes_to_string(self, all_palettes: List[Tuple]):
        """Convert palettes to a string
        """
        colors = []
        len_palettes = len(all_palettes)
        colors.append(f"v1={round(len_palettes / 3)}")  # start with the length of the color palette to avoid confusion
        for ii in range(len(all_palettes)):
            color_new = f"v0={round(all_palettes[ii])}"
            colors.append(color_new)

        instance_final_all = colors
        instance_final_all = ', '.join(instance_final_all)
        instance_final_all = instance_final_all.replace(",", "")

        return instance_final_all

    def load(self, path):
        with open(path, 'r') as f:
            sample = json.load(f)
        return sample

    def preprocess(self, sample):
        if self.return_raw:
            return sample
        instances = sample
        instances = self.convert_palette_instance(instances)
        return instances

    def image_augment(self, palettes: List[Tuple], crop_coords: Tuple, flip: bool, orig_size: Tuple, target_size: Tuple,
                      rand_aug_idx=None, resample_mode: str = None):
        return palettes

    def postprocess(self, palettes):
        if self.return_raw:
            return palettes
        palettes = self.convert_palettes_to_string(palettes)
        return palettes

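# --- Illustrative sketch (not part of the original diff) ---
# How a flattened RGB palette is serialized by ColorPaletteTransform.convert_palettes_to_string
# above: the sequence starts with the number of colors as a v1 token, followed by one v0 token per
# channel value. The palette values are hypothetical.
def _example_palette_string():
    transform = ColorPaletteTransform()
    flat_palette = [255, 0, 0, 0, 255, 0, 0, 0, 255]  # three RGB colors, flattened
    return transform.convert_palettes_to_string(flat_palette)
    # -> 'v1=3 v0=255 v0=0 v0=0 v0=0 v0=255 v0=0 v0=0 v0=0 v0=255'

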
class SAMInstanceTokTransform(AbstractTransform):

    def __init__(self, image_size=224, points_per_side=7, point_order='random'):
        self.H, self.W = to_2tuple(image_size)
        self.points_per_h, self.points_per_w = to_2tuple(points_per_side)
        assert point_order in ['random', 'grid']
        self.point_order = point_order

    def get_query_points(self):
        if self.point_order == 'grid':
            # Create and cache grid query points
            if not hasattr(self, 'grid_query_points'):
                y, x = np.meshgrid(np.linspace(0, self.H, self.points_per_h + 2)[1:-1], np.linspace(0, self.W, self.points_per_w + 2)[1:-1])
                grid = np.stack((x, y), axis=2).astype(np.int32)
                self.grid_query_points = grid.reshape(-1, 2)
            return self.grid_query_points
        elif self.point_order == 'random':
            # Randomly sample query points
            y = np.random.randint(0, self.H, self.points_per_h)
            x = np.random.randint(0, self.W, self.points_per_w)
            return np.concatenate((x[:, None], y[:, None]), axis=1)
        else:
            raise ValueError(f"Query point order mode {self.point_order} is not supported.")

    def get_target_tokens(self, sample, query_points):
        instances_coords = [coords[0] for coords in sample['points']]
        tokens = sample['token_ids']
        bboxes = sample['bbox']

        instance_tokens_per_qpoint = dict()
        for point in query_points:
            point = (int(point[0].item()), int(point[1].item()))
            instance_tokens_per_qpoint[point] = []
            for i, (coords, tok, bbox) in enumerate(zip(instances_coords, tokens, bboxes)):
                # Calculate the distance from the query point to the instance
                distance = cv2.pointPolygonTest(coords, point, measureDist=True)
                # If the query point is inside the instance, add its corresponding token
                if distance >= 0:
                    instance_tokens_per_qpoint[point].append((tok, bbox))

        return instance_tokens_per_qpoint

    def convert_target_tokens_to_string(self, target_tokens):
        result_text = []
        query_points = list(target_tokens.keys())
        # Randomly shuffle query points order (mainly for grid order)
        random.shuffle(query_points)
        for point in query_points:

            # Add query point coordinates to the string
            result_text.append('point')
            result_text.append(f'v0={point[1]}')
            result_text.append(f'v1={point[0]}')

            # Randomly shuffle the order of instance tokens per query point
            random.shuffle(target_tokens[point])
            if len(target_tokens[point]) == 0:
                # If no instance tokens are found, add 'none' to the string
                result_text.append('none')
            else:
                for tok, bbox in target_tokens[point]:
                    result_text.append('polygon')

                    # Add bounding box coordinates to the string
                    ymin, xmin, ymax, xmax = bbox.astype(np.int32)
                    result_text.extend([
                        f'v0={xmin}',
                        f'v1={ymin}',
                        f'v2={xmax}',
                        f'v3={ymax}',
                    ])

                    # Add instance token ids to the string
                    for idx in tok.tolist():
                        if idx < 512:
                            result_text.append(f'v0={idx}')
                        else:
                            result_text.append(f'v1={idx - 512}')

        return " ".join(result_text)

    def load(self, path):
        sample = np.load(path, allow_pickle=True)
        return sample

    def preprocess(self, sample):
        for s in sample:
            s['token_ids'] = s['token_ids'].astype(np.int32)
        return sample

    def image_augment(self, v, crop_coords: Tuple, flip: bool, orig_size: Tuple, target_size: Tuple,
                      rand_aug_idx: Optional[int], resample_mode: str = None):
        if rand_aug_idx is None:
            raise ValueError("Crop settings / augmentation index are missing but a pre-tokenized modality is being used")
        v = v[rand_aug_idx]
        return v

    def postprocess(self, sample):
        query_points = self.get_query_points()
        target_tokens = self.get_target_tokens(sample, query_points)
        final_string = self.convert_target_tokens_to_string(target_tokens)
        return final_string

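# --- Illustrative sketch (not part of the original diff) ---
# The 'grid' query-point layout used by SAMInstanceTokTransform.get_query_points above: for a
# 224x224 image and points_per_side=7, linspace(0, 224, 9)[1:-1] yields 7 evenly spaced interior
# coordinates per axis, i.e. a 7x7 = 49-point query grid.
def _example_grid_query_points(image_size=224, points_per_side=7):
    transform = SAMInstanceTokTransform(image_size=image_size, points_per_side=points_per_side,
                                        point_order='grid')
    return transform.get_query_points()  # -> array of shape (49, 2) with (x, y) pairs

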
class CropSettingsTransform(AbstractTransform):

    def load(self, path):
        sample = np.load(path)
        return sample

    def preprocess(self, sample):
        raise NotImplementedError("CropSettingsTransform does not support preprocessing")

    def image_augment(self, val, crop_coords: Tuple, flip: bool, orig_size: Tuple, target_size: Tuple,
                      rand_aug_idx: Optional[int], resample_mode: str = None):
        raise NotImplementedError("CropSettingsTransform is not meant to be used for image augmentation")

    def postprocess(self, sample):
        raise NotImplementedError("CropSettingsTransform does not support postprocessing")


class IdentityTransform(AbstractTransform):

    def load(self, path):
        raise NotImplementedError("IdentityTransform does not support loading")

    def preprocess(self, sample):
        return sample

    def image_augment(self, val, crop_coords: Tuple, flip: bool, orig_size: Tuple, target_size: Tuple,
                      rand_aug_idx: Optional[int], resample_mode: str = None):
        return val

    def postprocess(self, sample):
        return sample


class JSONTransform(AbstractTransform):

    def load(self, path):
        if path.endswith('.json'):
            with open(path, 'r') as f:
                sample = json.load(f)
        elif path.endswith('.json.gz'):
            with gzip.open(path, 'rb') as f:
                sample = json.load(f)
        return sample

    def preprocess(self, sample):
        return sample

    def image_augment(self, val, crop_coords: Tuple, flip: bool, orig_size: Tuple, target_size: Tuple,
                      rand_aug_idx: Optional[int], resample_mode: str = None):
        return val

    def postprocess(self, sample):
        return sample

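# --- Illustrative sketch (not part of the original diff) ---
# A minimal mapping from modality names to the transforms defined above, in the spirit of the
# MODALITY_TRANSFORMS dict that fourm/data/modality_info.py builds. The modality keys shown here
# are assumptions for illustration only.
def _example_modality_transforms():
    return {
        'caption': CaptionTransform(aligned_captions=True),
        'metadata': MetadataTransform(shuffle=True, random_trunc=False, return_chunks=True),
        'color_palette': ColorPaletteTransform(),
        'crop_settings': CropSettingsTransform(),
    }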
fourm/data/multimodal_dataset_folder.py
ADDED
@@ -0,0 +1,363 @@
1 |
+
# Copyright 2024 EPFL and Apple Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
import os
|
15 |
+
import os.path
|
16 |
+
import pickle
|
17 |
+
import random
|
18 |
+
from copy import deepcopy
|
19 |
+
from typing import Any, Callable, Dict, List, Optional, Tuple, cast
|
20 |
+
|
21 |
+
import numpy as np
|
22 |
+
from torchvision.datasets.vision import VisionDataset
|
23 |
+
|
24 |
+
from fourm.data.modality_transforms import AbstractTransform, get_transform_key
|
25 |
+
|
26 |
+
IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', '.tiff', '.webp', '.jpx', '.npy', '.npz')
|
27 |
+
|
28 |
+
UNIFIED_EXTENSIONS = IMG_EXTENSIONS + ('.json', '.txt', '.json.gz')
|
29 |
+
|
30 |
+
|
31 |
+
def has_file_allowed_extension(filename: str, extensions: Tuple[str, ...]) -> bool:
|
32 |
+
"""Checks if a file is an allowed extension.
|
33 |
+
|
34 |
+
Args:
|
35 |
+
filename (string): path to a file
|
36 |
+
extensions (tuple of strings): extensions to consider (lowercase)
|
37 |
+
|
38 |
+
Returns:
|
39 |
+
bool: True if the filename ends with one of given extensions
|
40 |
+
"""
|
41 |
+
return filename.lower().endswith(extensions)
|
42 |
+
|
43 |
+
|
44 |
+
def is_image_file(filename: str) -> bool:
|
45 |
+
"""Checks if a file is an allowed image extension.
|
46 |
+
|
47 |
+
Args:
|
48 |
+
filename (string): path to a file
|
49 |
+
|
50 |
+
Returns:
|
51 |
+
bool: True if the filename ends with a known image extension
|
52 |
+
"""
|
53 |
+
return has_file_allowed_extension(filename, IMG_EXTENSIONS)
|
54 |
+
|
55 |
+
|
56 |
+
def make_dataset(
|
57 |
+
directory: str,
|
58 |
+
class_to_idx: Dict[str, int],
|
59 |
+
extensions: Optional[Tuple[str, ...]] = None,
|
60 |
+
is_valid_file: Optional[Callable[[str], bool]] = None,
|
61 |
+
cache_path: Optional[str] = None,
|
62 |
+
) -> List[Tuple[str, int]]:
|
63 |
+
if cache_path is not None and os.path.exists(cache_path):
|
64 |
+
# Load cached file paths from disk if it exists
|
65 |
+
with open(cache_path, 'rb') as f:
|
66 |
+
return pickle.load(f)
|
67 |
+
instances = []
|
68 |
+
directory = os.path.expanduser(directory)
|
69 |
+
both_none = extensions is None and is_valid_file is None
|
70 |
+
both_something = extensions is not None and is_valid_file is not None
|
71 |
+
if both_none or both_something:
|
72 |
+
raise ValueError("Both extensions and is_valid_file cannot be None or not None at the same time")
|
73 |
+
if extensions is not None:
|
74 |
+
def is_valid_file(x: str) -> bool:
|
75 |
+
return has_file_allowed_extension(x, cast(Tuple[str, ...], extensions))
|
76 |
+
is_valid_file = cast(Callable[[str], bool], is_valid_file)
|
77 |
+
for target_class in sorted(class_to_idx.keys()):
|
78 |
+
class_index = class_to_idx[target_class]
|
79 |
+
target_dir = os.path.join(directory, target_class)
|
80 |
+
if not os.path.isdir(target_dir):
|
81 |
+
continue
|
82 |
+
for root, _, fnames in sorted(os.walk(target_dir, followlinks=True)):
|
83 |
+
for fname in sorted(fnames):
|
84 |
+
path = os.path.join(root, fname)
|
85 |
+
if is_valid_file(path):
|
86 |
+
item = path, class_index
|
87 |
+
instances.append(item)
|
88 |
+
if cache_path is not None:
|
89 |
+
# Cache all file paths s.t. setting up the dataloader is instant in the future
|
90 |
+
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
|
91 |
+
with open(cache_path, 'wb') as f:
|
92 |
+
pickle.dump(instances, f)
|
93 |
+
return instances
|
94 |
+
|
95 |
+
|
96 |
+
class DatasetFolder(VisionDataset):
|
97 |
+
"""A generic data loader where the samples are arranged in this way: ::
|
98 |
+
|
99 |
+
root/class_x/xxx.ext
|
100 |
+
root/class_x/xxy.ext
|
101 |
+
root/class_x/xxz.ext
|
102 |
+
|
103 |
+
root/class_y/123.ext
|
104 |
+
root/class_y/nsdf3.ext
|
105 |
+
root/class_y/asd932_.ext
|
106 |
+
|
107 |
+
Args:
|
108 |
+
root (string): Root directory path.
|
109 |
+
loader (callable): A function to load a sample given its path.
|
110 |
+
extensions (tuple[string]): A list of allowed extensions.
|
111 |
+
both extensions and is_valid_file should not be passed.
|
112 |
+
transform (callable, optional): A function/transform that takes in
|
113 |
+
a sample and returns a transformed version.
|
114 |
+
E.g, ``transforms.RandomCrop`` for images.
|
115 |
+
target_transform (callable, optional): A function/transform that takes
|
116 |
+
in the target and transforms it.
|
117 |
+
is_valid_file (callable, optional): A function that takes path of a file
|
118 |
+
and check if the file is a valid file (used to check of corrupt logs)
|
119 |
+
both extensions and is_valid_file should not be passed.
|
120 |
+
|
121 |
+
Attributes:
|
122 |
+
classes (list): List of the class names sorted alphabetically.
|
123 |
+
class_to_idx (dict): Dict with items (class_name, class_index).
|
124 |
+
samples (list): List of (sample path, class_index) tuples
|
125 |
+
targets (list): The class_index value for each image in the dataset
|
126 |
+
"""
|
127 |
+
|
128 |
+
def __init__(
|
129 |
+
self,
|
130 |
+
root: str,
|
131 |
+
loader: Callable[[str], Any],
|
132 |
+
extensions: Optional[Tuple[str, ...]] = None,
|
133 |
+
transform: Optional[Callable] = None,
|
134 |
+
target_transform: Optional[Callable] = None,
|
135 |
+
is_valid_file: Optional[Callable[[str], bool]] = None,
|
136 |
+
) -> None:
|
137 |
+
super(DatasetFolder, self).__init__(root, transform=transform,
|
138 |
+
target_transform=target_transform)
|
139 |
+
classes, class_to_idx = self._find_classes(self.root)
|
140 |
+
samples = make_dataset(self.root, class_to_idx, extensions, is_valid_file)
|
141 |
+
if len(samples) == 0:
|
142 |
+
msg = "Found 0 logs in subfolders of: {}\n".format(self.root)
|
143 |
+
if extensions is not None:
|
144 |
+
msg += "Supported extensions are: {}".format(",".join(extensions))
|
145 |
+
raise RuntimeError(msg)
|
146 |
+
|
147 |
+
self.loader = loader
|
148 |
+
self.extensions = extensions
|
149 |
+
|
150 |
+
self.classes = classes
|
151 |
+
self.class_to_idx = class_to_idx
|
152 |
+
self.samples = samples
|
153 |
+
self.targets = [s[1] for s in samples]
|
154 |
+
|
155 |
+
def _find_classes(self, dir: str) -> Tuple[List[str], Dict[str, int]]:
|
156 |
+
"""
|
157 |
+
Finds the class folders in a dataset.
|
158 |
+
|
159 |
+
Args:
|
160 |
+
dir (string): Root directory path.
|
161 |
+
|
162 |
+
Returns:
|
163 |
+
tuple: (classes, class_to_idx) where classes are relative to (dir), and class_to_idx is a dictionary.
|
164 |
+
|
165 |
+
Ensures:
|
166 |
+
No class is a subdirectory of another.
|
167 |
+
"""
|
168 |
+
classes = [d.name for d in os.scandir(dir) if d.is_dir()]
|
169 |
+
classes.sort()
|
170 |
+
class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
|
171 |
+
return classes, class_to_idx
|
172 |
+
|
173 |
+
def __getitem__(self, index: int) -> Tuple[Any, Any]:
|
174 |
+
"""
|
175 |
+
Args:
|
176 |
+
index (int): Index
|
177 |
+
|
178 |
+
Returns:
|
179 |
+
tuple: (sample, target) where target is class_index of the target class.
|
180 |
+
"""
|
181 |
+
while True:
|
182 |
+
try:
|
183 |
+
path, target = self.samples[index]
|
184 |
+
sample = self.loader(path)
|
185 |
+
break
|
186 |
+
except Exception as e:
|
187 |
+
print(e)
|
188 |
+
index = random.randint(0, len(self.samples) - 1)
|
189 |
+
|
190 |
+
if self.transform is not None:
|
191 |
+
sample = self.transform(sample)
|
192 |
+
if self.target_transform is not None:
|
193 |
+
target = self.target_transform(target)
|
194 |
+
|
195 |
+
return sample, target
|
196 |
+
|
197 |
+
def __len__(self) -> int:
|
198 |
+
return len(self.samples)
|
199 |
+
|
200 |
+
|
201 |
+
class MultiModalDatasetFolder(VisionDataset):
|
202 |
+
"""A generic multi-modal dataset loader where the samples are arranged in this way: ::
|
203 |
+
|
204 |
+
root/modality_a/class_x/xxx.ext
|
205 |
+
root/modality_a/class_y/xxy.ext
|
206 |
+
root/modality_a/class_z/xxz.ext
|
207 |
+
|
208 |
+
root/modality_b/class_x/xxx.ext
|
209 |
+
root/modality_b/class_y/xxy.ext
|
210 |
+
root/modality_b/class_z/xxz.ext
|
211 |
+
|
212 |
+
Args:
|
213 |
+
root (string): Root directory path.
|
214 |
+
modalities (list): List of modalities as strings
|
215 |
+
modality_paths (dict): Dict of paths to modalities
|
216 |
+
modality_transforms (dict): Dict of transforms for each modality
|
217 |
+
loader (callable): A function to load a sample given its path.
|
218 |
+
transform (callable, optional): A function/transform that takes in
|
219 |
+
a sample and returns a transformed version.
|
220 |
+
E.g, ``transforms.RandomCrop`` for images.
|
221 |
+
target_transform (callable, optional): A function/transform that takes
|
222 |
+
in the target and transforms it.
|
223 |
+
is_valid_file (callable, optional): A function that takes path of a file
|
224 |
+
and check if the file is a valid file (used to check of corrupt logs)
|
225 |
+
both extensions and is_valid_file should not be passed.
|
226 |
+
max_samples (int, optional): Maximum number of samples to load. If None, all samples are loaded.
|
227 |
+
pre_shuffle (bool, optional): Whether to shuffle the sample during the init.
|
228 |
+
return_paths (bool, optional): Whether to return the paths of the samples.
|
229 |
+
cache (bool, optional): Whether to cache the samples in memory. If True, the samples are loaded only once and then cached in memory.
|
230 |
+
|
231 |
+
Attributes:
|
232 |
+
classes (list): List of the class names sorted alphabetically.
|
233 |
+
class_to_idx (dict): Dict with items (class_name, class_index).
|
234 |
+
samples (list): List of (sample path, class_index) tuples
|
235 |
+
targets (list): The class_index value for each image in the dataset
|
236 |
+
"""
|
237 |
+
|
238 |
+
def __init__(
|
239 |
+
self,
|
240 |
+
root: str,
|
241 |
+
modalities: List[str],
|
242 |
+
modality_paths: Dict[str, str],
|
243 |
+
modality_transforms: Dict[str, AbstractTransform],
|
244 |
+
transform: Optional[Callable] = None,
|
245 |
+
target_transform: Optional[Callable] = None,
|
246 |
+
is_valid_file: Optional[Callable[[str], bool]] = None,
|
247 |
+
max_samples: Optional[int] = None,
|
248 |
+
pre_shuffle: bool = False,
|
249 |
+
cache: bool = False,
|
250 |
+
return_path: bool = False,
|
251 |
+
) -> None:
|
252 |
+
super(MultiModalDatasetFolder, self).__init__(root, transform=transform, target_transform=target_transform)
|
253 |
+
self.modalities = modalities
|
254 |
+
# If modality_paths is not provided, use the default paths
|
255 |
+
self.modality_paths = modality_paths
|
256 |
+
for mod in self.modalities:
|
257 |
+
if mod not in self.modality_paths:
|
258 |
+
modality_paths[mod] = mod
|
259 |
+
self.modality_transforms = modality_transforms
|
260 |
+
self.return_path = return_path
|
261 |
+
|
262 |
+
classes, class_to_idx = self._find_classes(os.path.join(self.root, list(self.modality_paths.values())[0]))
|
263 |
+
extensions = UNIFIED_EXTENSIONS if is_valid_file is None else None
|
264 |
+
|
265 |
+
samples = {
|
266 |
+
mod: make_dataset(
|
267 |
+
os.path.join(self.root, f'{self.modality_paths[mod]}'),
|
268 |
+
class_to_idx,
|
269 |
+
extensions,
|
270 |
+
is_valid_file,
|
271 |
+
cache_path=os.path.join(self.root, 'dataloader_cache', f'{self.modality_paths[mod]}.pkl') if cache else None)
|
272 |
+
for mod in self.modalities
|
273 |
+
}
|
274 |
+
|
275 |
+
for mod, mod_samples in samples.items():
|
276 |
+
if len(mod_samples) == 0:
|
277 |
+
msg = "Found 0 logs in subfolders of: {}\n".format(os.path.join(self.root, f'{self.modality_paths[mod]}'))
|
278 |
+
if extensions is not None:
|
279 |
+
msg += "Supported extensions are: {}".format(",".join(extensions))
|
280 |
+
raise RuntimeError(msg)
|
281 |
+
|
282 |
+
self.extensions = extensions
|
283 |
+
|
284 |
+
self.classes = classes
|
285 |
+
self.class_to_idx = class_to_idx
|
286 |
+
self.samples = samples
|
287 |
+
|
288 |
+
# Select random subset of dataset if so specified
|
289 |
+
if isinstance(max_samples, int):
|
290 |
+
total_samples = len(list(self.samples.values())[0])
|
291 |
+
np.random.seed(0)
|
292 |
+
permutation = np.random.permutation(total_samples)
|
293 |
+
for task in samples:
|
294 |
+
self.samples[task] = [self.samples[task][i] for i in permutation][:max_samples]
|
295 |
+
|
296 |
+
if pre_shuffle:
|
297 |
+
total_samples = len(list(self.samples.values())[0])
|
298 |
+
np.random.seed(100)
|
299 |
+
permutation = np.random.permutation(total_samples)
|
300 |
+
for task in samples:
|
301 |
+
self.samples[task] = [self.samples[task][i] for i in permutation]
|
302 |
+
|
303 |
+
self.cache = {}
|
304 |
+
self.imgs = self.samples
|
305 |
+
|
306 |
+
def _find_classes(self, dir: str) -> Tuple[List[str], Dict[str, int]]:
|
307 |
+
"""
|
308 |
+
Finds the class folders in a dataset.
|
309 |
+
|
310 |
+
Args:
|
311 |
+
dir (string): Root directory path.
|
312 |
+
|
313 |
+
Returns:
|
314 |
+
tuple: (classes, class_to_idx) where classes are relative to (dir), and class_to_idx is a dictionary.
|
315 |
+
|
316 |
+
Ensures:
|
317 |
+
No class is a subdirectory of another.
|
318 |
+
"""
|
319 |
+
classes = [d.name for d in os.scandir(dir) if d.is_dir()]
|
320 |
+
classes.sort()
|
321 |
+
class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
|
322 |
+
return classes, class_to_idx
|
323 |
+
|
324 |
+
def get_class_and_file(self, path: str) -> Tuple[str, str]:
|
325 |
+
""" Extracts the class and file name from a path. """
|
326 |
+
class_id, file_name = path.split('/')[-2:]
|
327 |
+
file_name = file_name.split('.')[0]
|
328 |
+
return class_id, file_name
|
329 |
+
|
330 |
+
def __getitem__(self, index: int) -> Tuple[Any, Any]:
|
331 |
+
"""
|
332 |
+
Args:
|
333 |
+
index (int): Index
|
334 |
+
|
335 |
+
Returns:
|
336 |
+
tuple: (sample, target) where target is class_index of the target class.
|
337 |
+
"""
|
338 |
+
if index in self.cache:
|
339 |
+
sample_dict, target = deepcopy(self.cache[index])
|
340 |
+
else:
|
341 |
+
sample_dict = {}
|
342 |
+
for mod in self.modalities:
|
343 |
+
path, target = self.samples[mod][index]
|
344 |
+
sample = self.modality_transforms[get_transform_key(mod)].load(path)
|
345 |
+
sample_dict[mod] = sample
|
346 |
+
# self.cache[index] = deepcopy((sample_dict, target))
|
347 |
+
|
348 |
+
if self.transform is not None:
|
349 |
+
sample_dict = self.transform(sample_dict)
|
350 |
+
if self.target_transform is not None:
|
351 |
+
target = self.target_transform(target)
|
352 |
+
|
353 |
+
sample_dict['class_idx'] = target
|
354 |
+
|
355 |
+
if self.return_path and not index in self.cache:
|
356 |
+
class_id, file_name = self.get_class_and_file(path)
|
357 |
+
sample_dict['class_id'] = class_id
|
358 |
+
sample_dict['file_name'] = file_name
|
359 |
+
|
360 |
+
return sample_dict
|
361 |
+
|
362 |
+
def __len__(self) -> int:
|
363 |
+
return len(list(self.samples.values())[0])
|
fourm/data/pretrain_utils.py
ADDED
@@ -0,0 +1,292 @@
1 |
+
# Copyright 2024 EPFL and Apple Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
import copy
|
15 |
+
|
16 |
+
import torch
|
17 |
+
import yaml
|
18 |
+
|
19 |
+
import fourm.utils as utils
|
20 |
+
|
21 |
+
from fourm.data import (CenterCropImageAugmenter, EmptyAugmenter,
|
22 |
+
PreTokenizedImageAugmenter,RandomCropImageAugmenter, build_fm_pretraining_dataset,
|
23 |
+
build_huggingface_pretraining_dataloader,
|
24 |
+
build_wds_fm_pretraining_dataloader)
|
25 |
+
from fourm.data.modality_transforms import CaptionTransform
|
26 |
+
from fourm.data.modality_info import MODALITY_TRANSFORMS
|
27 |
+
|
28 |
+
|
29 |
+
def setup_sampling_mod_info(dataset_config, modality_info):
|
30 |
+
# Subset of modality info for each dataset
|
31 |
+
|
32 |
+
# Input and output modalities for one dataset
|
33 |
+
in_domains = sorted(dataset_config['in_domains'].split('-'))
|
34 |
+
out_domains = sorted(dataset_config['out_domains'].split('-'))
|
35 |
+
all_domains = sorted(list(set(in_domains) | set(out_domains)))
|
36 |
+
|
37 |
+
mod_info = copy.deepcopy(modality_info)
|
38 |
+
mod_info = {mod: mod_info[mod] for mod in all_domains}
|
39 |
+
|
40 |
+
# Dirichlet concentration parameter (Alpha)
|
41 |
+
if dataset_config.get('alphas_config', None) is None:
|
42 |
+
for mod in mod_info:
|
43 |
+
mod_info[mod]["input_alphas"] = [0.]
|
44 |
+
mod_info[mod]["target_alphas"] = [0.]
|
45 |
+
|
46 |
+
if 'input_alphas' in dataset_config:
|
47 |
+
input_alphas = dataset_config['input_alphas'].split('-')
|
48 |
+
if len(input_alphas) == 1:
|
49 |
+
input_alphas = [float(input_alphas[0])] * len(in_domains)
|
50 |
+
else:
|
51 |
+
input_alphas = [float(alpha) for alpha in input_alphas]
|
52 |
+
for mod, alpha in zip(in_domains, input_alphas):
|
53 |
+
mod_info[mod]['input_alphas'] = [alpha]
|
54 |
+
|
55 |
+
if 'target_alphas' in dataset_config:
|
56 |
+
target_alphas = dataset_config['target_alphas'].split('-')
|
57 |
+
if len(target_alphas) == 1:
|
58 |
+
target_alphas = [float(target_alphas[0])] * len(out_domains)
|
59 |
+
else:
|
60 |
+
target_alphas = [float(alpha) for alpha in target_alphas]
|
61 |
+
for mod, alpha in zip(out_domains, target_alphas):
|
62 |
+
mod_info[mod]["target_alphas"] = [alpha]
|
63 |
+
|
64 |
+
sampling_weights = None
|
65 |
+
else:
|
66 |
+
print(f"Loading alphas config from: {dataset_config['alphas_config']}")
|
67 |
+
with open(dataset_config['alphas_config'], "r") as f:
|
68 |
+
alphas_config = yaml.safe_load(f)
|
69 |
+
|
70 |
+
if 'sampling_weights' in alphas_config:
|
71 |
+
sampling_weights = alphas_config['sampling_weights']
|
72 |
+
alphas_config = alphas_config['alphas_mixture']
|
73 |
+
else:
|
74 |
+
sampling_weights = None
|
75 |
+
|
76 |
+
for mod in mod_info:
|
77 |
+
mod_info[mod]["input_alphas"] = alphas_config[mod]["input_alphas"]
|
78 |
+
mod_info[mod]["target_alphas"] = alphas_config[mod]["target_alphas"]
|
79 |
+
if modality_info[mod]['type'] in ['seq', 'seq_emb', 'seq_token']:
|
80 |
+
mod_info[mod]['keep'] = alphas_config[mod]['keep']
|
81 |
+
|
82 |
+
return mod_info, sampling_weights
|
83 |
+
|
84 |
+
def get_train_dataloader(dataset_config, modality_info, sampling_weights, text_tokenizer, input_size,
|
85 |
+
num_input_tokens, num_target_tokens, min_input_tokens, min_target_tokens,
|
86 |
+
num_tasks, num_workers, dataset_batch_size=None, epoch_size=None):
|
87 |
+
|
88 |
+
in_domains = sorted(list(dataset_config['in_domains'].split('-')))
|
89 |
+
out_domains = sorted(list(dataset_config['out_domains'].split('-')))
|
90 |
+
all_domains = sorted(list(set(in_domains) | set(out_domains)))
|
91 |
+
|
92 |
+
modality_transforms = MODALITY_TRANSFORMS
|
93 |
+
if 'caption' in modality_transforms:
|
94 |
+
modality_transforms['caption'] = CaptionTransform(
|
95 |
+
aligned_captions=dataset_config.get('aligned_captions', True)
|
96 |
+
)
|
97 |
+
|
98 |
+
if dataset_config['type'] == 'multimodal':
|
99 |
+
|
100 |
+
is_pretokenized = any([modality_info[mod].get('pretokenized', False) for mod in modality_info])
|
101 |
+
if is_pretokenized:
|
102 |
+
# Multi-modal training data augmentation (uses pre-tokenized data augmentation)
|
103 |
+
image_augmenter = PreTokenizedImageAugmenter(
|
104 |
+
target_size=input_size,
|
105 |
+
no_aug=(not dataset_config.get('tok_train_aug', True)),
|
106 |
+
main_domain=dataset_config['main_augment_domain']
|
107 |
+
)
|
108 |
+
else:
|
109 |
+
image_augmenter = RandomCropImageAugmenter(
|
110 |
+
target_size=input_size,
|
111 |
+
hflip=dataset_config.get('hflip'),
|
112 |
+
crop_scale=tuple(dataset_config.get('crop_scale')),
|
113 |
+
crop_ratio=tuple(dataset_config.get('crop_ratio')),
|
114 |
+
)
|
115 |
+
|
116 |
+
# Input and target token ranges
|
117 |
+
num_input_tokens = dataset_config.get('num_input_tokens', num_input_tokens)
|
118 |
+
num_target_tokens = dataset_config.get('num_target_tokens', num_target_tokens)
|
119 |
+
min_input_tokens = dataset_config.get('min_input_tokens', min_input_tokens)
|
120 |
+
min_target_tokens = dataset_config.get('min_target_tokens', min_target_tokens)
|
121 |
+
min_input_tokens = num_input_tokens if min_input_tokens is None else min_input_tokens
|
122 |
+
min_target_tokens = num_target_tokens if min_target_tokens is None else min_target_tokens
|
123 |
+
|
124 |
+
|
125 |
+
if dataset_config['use_wds']:
|
126 |
+
# Using webdataset
|
127 |
+
loader = build_wds_fm_pretraining_dataloader(
|
128 |
+
data_path=dataset_config['data_path'], all_domains=all_domains,
|
129 |
+
modality_info=modality_info, modality_transforms=modality_transforms,
|
130 |
+
image_augmenter=image_augmenter, text_tokenizer=text_tokenizer,
|
131 |
+
input_tokens_range=(min_input_tokens, num_input_tokens),
|
132 |
+
target_tokens_range=(min_target_tokens, num_target_tokens),
|
133 |
+
num_gpus=num_tasks, num_workers=num_workers,
|
134 |
+
batch_size=dataset_batch_size, epoch_size=epoch_size,
|
135 |
+
modality_name_map=dataset_config.get('modality_name_map', None),
|
136 |
+
shuffle_buffer_load=dataset_config.get('wds_shuffle_buffer_tar', 1_000),
|
137 |
+
shuffle_buffer_repeat=dataset_config.get('wds_shuffle_buffer_repeat', 1_000),
|
138 |
+
n_repeats=dataset_config.get('wds_n_repeats', 1),
|
139 |
+
sampling_weights=sampling_weights,
|
140 |
+
)
|
141 |
+
else:
|
142 |
+
dataset_train = build_fm_pretraining_dataset(
|
143 |
+
data_path=dataset_config['data_path'],
|
144 |
+
all_domains=all_domains, modality_info=modality_info, modality_transforms=modality_transforms,
|
145 |
+
image_augmenter=image_augmenter, text_tokenizer=text_tokenizer,
|
146 |
+
input_tokens_range=(min_input_tokens, num_input_tokens),
|
147 |
+
target_tokens_range=(min_target_tokens, num_target_tokens)
|
148 |
+
)
|
149 |
+
sampler_train = torch.utils.data.DistributedSampler(
|
150 |
+
dataset_train, num_replicas=num_tasks, rank=utils.get_rank(), shuffle=True, drop_last=True,
|
151 |
+
)
|
152 |
+
# DataLoader has batch size 1 as it then gets collated through the Mixture dataloader
|
153 |
+
loader = torch.utils.data.DataLoader(
|
154 |
+
dataset_train, sampler=sampler_train,
|
155 |
+
batch_size=1, num_workers=0,
|
156 |
+
pin_memory=False, drop_last=True,
|
157 |
+
collate_fn=lambda x: x[0],
|
158 |
+
)
|
159 |
+
|
160 |
+
elif dataset_config['type'] == 'huggingface':
|
161 |
+
|
162 |
+
# Input and target token ranges
|
163 |
+
num_input_tokens = dataset_config.get('num_input_tokens', num_input_tokens)
|
164 |
+
num_target_tokens = dataset_config.get('num_target_tokens', num_target_tokens)
|
165 |
+
|
166 |
+
if dataset_config.get('use_wds', False):
|
167 |
+
raise NotImplementedError('Webdataset not yet implemented for huggingface datasets.')
|
168 |
+
else:
|
169 |
+
loader = build_huggingface_pretraining_dataloader(
|
170 |
+
data_path=dataset_config['data_path'], all_domains=all_domains,
|
171 |
+
modality_info=modality_info, modality_transforms=modality_transforms,
|
172 |
+
image_augmenter=EmptyAugmenter(), text_tokenizer=text_tokenizer,
|
173 |
+
input_tokens_range=(num_input_tokens, num_input_tokens),
|
174 |
+
target_tokens_range=(num_target_tokens, num_target_tokens),
|
175 |
+
num_gpus=num_tasks, num_workers=num_workers,
|
176 |
+
batch_size=dataset_batch_size, epoch_size=epoch_size,
|
177 |
+
split='train', streaming=True, rename_text_to_caption=True,
|
178 |
+
shuffle_buffer_load=dataset_config.get('shuffle_buffer_load', 1_000),
|
179 |
+
shuffle_seed=0,
|
180 |
+
)
|
181 |
+
else:
|
182 |
+
raise NotImplementedError(f'Dataset type {dataset_config["type"]} not implemented.')
|
183 |
+
|
184 |
+
return loader
|
185 |
+
|
186 |
+
|
187 |
+
def cfgs_get(key, val_config, dataset_name, train_configs, default=None):
|
188 |
+
""" Try to retrieve a key from the validation set config.
|
189 |
+
If it does not exist, default to retrieving it from the train set config
|
190 |
+
with the same dataset name.
|
191 |
+
"""
|
192 |
+
    return val_config.get(key, train_configs[dataset_name].get(key, default))


def get_val_dataloader(dataset_config, dataset_name, train_configs, modality_info, sampling_weights, text_tokenizer,
                       input_size, num_input_tokens, num_target_tokens, min_input_tokens, min_target_tokens,
                       fixed_eval, fixed_eval_input_tokens, fixed_eval_target_tokens,
                       dist_eval, num_tasks, num_workers, batch_size, pin_mem):

    in_domains = sorted(list(cfgs_get('in_domains', dataset_config, dataset_name, train_configs).split('-')))
    out_domains = sorted(list(cfgs_get('out_domains', dataset_config, dataset_name, train_configs).split('-')))
    all_domains = sorted(list(set(in_domains) | set(out_domains)))

    modality_transforms = MODALITY_TRANSFORMS
    if 'caption' in modality_transforms:
        modality_transforms['caption'] = CaptionTransform(
            aligned_captions=cfgs_get('aligned_captions', dataset_config, dataset_name, train_configs, True)
        )

    dataset_type = cfgs_get('type', dataset_config, dataset_name, train_configs)

    if dataset_type == 'multimodal':

        main_augment_domain = cfgs_get('main_augment_domain', dataset_config, dataset_name, train_configs)
        is_pretokenized = any([modality_info[mod].get('pretokenized', False) for mod in modality_info])
        if is_pretokenized:
            eval_image_augmenter = PreTokenizedImageAugmenter(
                target_size=input_size, no_aug=True, main_domain=main_augment_domain
            )
        else:
            eval_image_augmenter = CenterCropImageAugmenter(
                target_size=input_size, main_domain=main_augment_domain
            )

        if fixed_eval:
            input_tokens_range = (fixed_eval_input_tokens, fixed_eval_input_tokens)
            target_tokens_range = (fixed_eval_target_tokens, fixed_eval_target_tokens)
        else:
            # Input and target token ranges
            num_input_tokens = dataset_config.get('num_input_tokens', num_input_tokens)
            num_target_tokens = dataset_config.get('num_target_tokens', num_target_tokens)
            min_input_tokens = dataset_config.get('min_input_tokens', min_input_tokens)
            min_target_tokens = dataset_config.get('min_target_tokens', min_target_tokens)
            min_input_tokens = num_input_tokens if min_input_tokens is None else min_input_tokens
            min_target_tokens = num_target_tokens if min_target_tokens is None else min_target_tokens
            input_tokens_range = (min_input_tokens, num_input_tokens)
            target_tokens_range = (min_target_tokens, num_target_tokens)

        dataset_val = build_fm_pretraining_dataset(
            data_path=cfgs_get('data_path', dataset_config, dataset_name, train_configs),
            all_domains=all_domains, modality_info=modality_info, modality_transforms=modality_transforms,
            image_augmenter=eval_image_augmenter, text_tokenizer=text_tokenizer,
            input_tokens_range=input_tokens_range, target_tokens_range=target_tokens_range
        )

        print("Warning: Eval stats may vary slightly as the masking applied on images is random.")
        if dist_eval:
            if len(dataset_val) % num_tasks != 0:
                print('Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. '
                      'This will slightly alter validation results as extra duplicate entries are added to achieve '
                      'equal num of samples per-process.')
            sampler_val = torch.utils.data.DistributedSampler(
                dataset_val, num_replicas=num_tasks, rank=utils.get_rank(), shuffle=False)
        else:
            sampler_val = torch.utils.data.SequentialSampler(dataset_val)
        loader = torch.utils.data.DataLoader(
            dataset_val, sampler=sampler_val,
            batch_size=batch_size,
            num_workers=num_workers,
            pin_memory=pin_mem,
            drop_last=False,
        )

    elif dataset_type == 'huggingface':

        if fixed_eval:
            input_tokens_range = (fixed_eval_input_tokens, fixed_eval_input_tokens)
            target_tokens_range = (fixed_eval_target_tokens, fixed_eval_target_tokens)
        else:
            # Input and target token ranges
            num_input_tokens = dataset_config.get('num_input_tokens', num_input_tokens)
            num_target_tokens = dataset_config.get('num_target_tokens', num_target_tokens)
            input_tokens_range = (num_input_tokens, num_input_tokens)
            target_tokens_range = (num_target_tokens, num_target_tokens)

        loader = build_huggingface_pretraining_dataloader(
            data_path=cfgs_get('data_path', dataset_config, dataset_name, train_configs),
            all_domains=all_domains, modality_info=modality_info, modality_transforms=modality_transforms,
            image_augmenter=EmptyAugmenter(), text_tokenizer=text_tokenizer,
            input_tokens_range=input_tokens_range, target_tokens_range=target_tokens_range,
            num_gpus=num_tasks, num_workers=num_workers,
            batch_size=batch_size, epoch_size=None,
            split='validation', streaming=True, rename_text_to_caption=True,
            shuffle_buffer_load=cfgs_get('shuffle_buffer_load', dataset_config, dataset_name, train_configs, 1_000),
            shuffle_seed=0,
        )

    else:
        raise NotImplementedError(f'Dataset type {dataset_type} not implemented.')

    return loader
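As a quick illustration of the config fallback used throughout get_val_dataloader (this sketch is not part of the diff; it assumes the return statement above belongs to the cfgs_get helper, and the dataset name, keys, and paths are invented):

# cfgs_get looks the key up in the validation config first, then falls back to the
# train config of the same dataset, and finally to the given default.
train_configs = {'cc12m': {'in_domains': 'caption-rgb@224', 'data_path': '/data/cc12m/train'}}
val_config = {'data_path': '/data/cc12m/val'}  # only overrides the path

cfgs_get('data_path', val_config, 'cc12m', train_configs)                    # '/data/cc12m/val'
cfgs_get('in_domains', val_config, 'cc12m', train_configs)                   # 'caption-rgb@224' (train fallback)
cfgs_get('shuffle_buffer_load', val_config, 'cc12m', train_configs, 1_000)   # 1000 (default)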
fourm/data/transfer_utils.py
ADDED
@@ -0,0 +1,53 @@
# Copyright 2024 EPFL and Apple Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch

def convert_samples_to_mod_dict(samples, input_mod, target_mod, num_input_tokens, num_target_tokens):
    """Converts a sample (e.g. a batch of RGB images) to a mod dict that can be passed directly to FourM.
    Assumes both the input modality and target modality are dense tasks.
    """

    B = samples.shape[0]
    device = samples.device

    if input_mod == target_mod:
        assert(num_input_tokens == num_target_tokens)
        mod_dict = {
            input_mod: {
                'tensor': samples,
                'input_mask': torch.zeros((B, num_input_tokens), dtype=torch.bool, device=device),
                'target_mask': torch.zeros((B, num_target_tokens), dtype=torch.bool, device=device),
                'decoder_attention_mask': torch.zeros((B, num_target_tokens), dtype=torch.int, device=device),
            },
        }
        mod_dict[input_mod]['decoder_attention_mask'][:, 0] = num_target_tokens

    else:
        mod_dict = {
            input_mod: {
                'tensor': samples,
                'input_mask': torch.zeros((B, num_input_tokens), dtype=torch.bool, device=samples.device),
                'target_mask': torch.ones((B, num_input_tokens), dtype=torch.bool, device=samples.device),
                'decoder_attention_mask': torch.zeros((B, num_input_tokens), dtype=torch.int, device=samples.device),
            },
            target_mod: {
                'tensor': torch.zeros((B, num_target_tokens), dtype=torch.long, device=samples.device),
                'input_mask': torch.ones((B, num_target_tokens), dtype=torch.bool, device=samples.device),
                'target_mask': torch.zeros((B, num_target_tokens), dtype=torch.bool, device=samples.device),
                'decoder_attention_mask': torch.ones((B, num_target_tokens), dtype=torch.int, device=samples.device),
            },
        }
        mod_dict[target_mod]['decoder_attention_mask'][:, 0] = num_target_tokens

    return mod_dict
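A rough usage sketch for convert_samples_to_mod_dict (not part of the diff; the modality names and token counts below are placeholders chosen for a 224x224 input with 16x16 patches):

import torch

samples = torch.randn(8, 3, 224, 224)  # hypothetical batch of 8 RGB images
mod_dict = convert_samples_to_mod_dict(
    samples, input_mod='rgb@224', target_mod='tok_depth@224',
    num_input_tokens=196, num_target_tokens=196,
)
# The input modality is fully visible to the encoder (its input_mask is all False),
# while the target modality is an empty int64 placeholder the decoder is asked to predict.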
fourm/data/unified_datasets.py
ADDED
@@ -0,0 +1,557 @@
1 |
+
# Copyright 2024 EPFL and Apple Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
import copy
|
15 |
+
import io
|
16 |
+
import itertools
|
17 |
+
import os
|
18 |
+
import re
|
19 |
+
from functools import partial
|
20 |
+
from typing import Any, Callable, Dict, Iterable, List, Optional
|
21 |
+
|
22 |
+
import braceexpand
|
23 |
+
import numpy as np
|
24 |
+
import torch
|
25 |
+
import webdataset as wds
|
26 |
+
from PIL import Image
|
27 |
+
from torch.utils.data import IterableDataset
|
28 |
+
from torch.utils.data._utils.collate import default_collate
|
29 |
+
from torchvision import transforms
|
30 |
+
from webdataset.filters import pipelinefilter, reraise_exception
|
31 |
+
from webdataset.handlers import warn_and_continue
|
32 |
+
|
33 |
+
try:
|
34 |
+
# Optionally load huggingface datasets
|
35 |
+
from datasets import load_dataset
|
36 |
+
from datasets.distributed import split_dataset_by_node
|
37 |
+
except ImportError:
|
38 |
+
print("Huggingface datasets not installed. Please install with `pip install datasets`.")
|
39 |
+
|
40 |
+
from fourm.data.masking import TransferMasking, UnifiedMasking
|
41 |
+
from fourm.data.modality_transforms import (CropSettingsTransform, IdentityTransform,
|
42 |
+
MaskTransform, UnifiedDataTransform,
|
43 |
+
get_transform_key)
|
44 |
+
from fourm.data.multimodal_dataset_folder import MultiModalDatasetFolder
|
45 |
+
from fourm.utils.dist import get_rank, get_world_size
|
46 |
+
|
47 |
+
|
48 |
+
def build_fm_pretraining_dataset(
|
49 |
+
data_path, all_domains, modality_info, modality_transforms,
|
50 |
+
image_augmenter, text_tokenizer,
|
51 |
+
input_tokens_range, target_tokens_range,
|
52 |
+
sampling_weights=None):
|
53 |
+
"""Builds the FourM pre-training dataset based on the given arguments.
|
54 |
+
This function should mainly used for smaller datasets (e.g. validation sets),
|
55 |
+
while large training sets should be loaded with build_wds_fm_pretraining_dataloader in webdataset format.
|
56 |
+
|
57 |
+
Args:
|
58 |
+
data_path: Path to the dataset.
|
59 |
+
all_domains: List of all modalities to be used.
|
60 |
+
modality_info: Dictionary containing information about the modalities.
|
61 |
+
modality_transforms: Dictionary containing the transforms for each modality.
|
62 |
+
image_augmenter: Image augmenter.
|
63 |
+
text_tokenizer: Text tokenizer (for sequence modalities).
|
64 |
+
input_tokens_range: Range of the input token budget.
|
65 |
+
target_tokens_range: Range of the target token budget.
|
66 |
+
sampling_weights: Sampling weights for the mixture of Dirichlet distributions.
|
67 |
+
|
68 |
+
Returns:
|
69 |
+
FourM pre-training dataset as a PyTorch Dataset.
|
70 |
+
"""
|
71 |
+
|
72 |
+
transform = transforms.Compose([
|
73 |
+
UnifiedDataTransform(transforms_dict=modality_transforms, image_augmenter=image_augmenter),
|
74 |
+
UnifiedMasking(modality_info=modality_info, text_tokenizer=text_tokenizer,
|
75 |
+
input_tokens_range=input_tokens_range, target_tokens_range=target_tokens_range,
|
76 |
+
sampling_weights=sampling_weights),
|
77 |
+
])
|
78 |
+
|
79 |
+
# Remove vq domains that require a tokenizer
|
80 |
+
modalities_without_vq = [mod for mod in all_domains if not modality_info[mod].get("requires_tokenizer", False)]
|
81 |
+
# If we are using a pre-tokenized modality, we default to pre-computed crop settings
|
82 |
+
if any([modality_info[domain].get("pretokenized", False) for domain in all_domains]):
|
83 |
+
modalities_without_vq.append("crop_settings")
|
84 |
+
modality_transforms = copy.deepcopy(modality_transforms)
|
85 |
+
modality_transforms["crop_settings"] = CropSettingsTransform()
|
86 |
+
|
87 |
+
modality_paths = {mod: modality_info[mod]['path'] for mod in modality_info if modality_info[mod].get('path', None) is not None}
|
88 |
+
|
89 |
+
return MultiModalDatasetFolder(root=data_path, modalities=modalities_without_vq, modality_paths=modality_paths,
|
90 |
+
modality_transforms=modality_transforms, transform=transform)
|
91 |
+
|
92 |
+
|
93 |
+
def build_fm_transfer_dataset(
|
94 |
+
data_path, modality_info, transform, modality_transforms, all_domains,
|
95 |
+
load_mask_valid: bool = False, max_samples: Optional[int] = None,
|
96 |
+
pre_shuffle: bool = False, cache: bool = False):
|
97 |
+
"""Builds the FourM transfer dataset based on the given arguments.
|
98 |
+
|
99 |
+
Args:
|
100 |
+
data_path: Path to the dataset.
|
101 |
+
modality_info: Dictionary containing information about the modalities.
|
102 |
+
transform: Transform to be applied to the dataset.
|
103 |
+
modality_transforms: Dictionary containing the transforms for each modality.
|
104 |
+
all_domains: List of all modalities to be used.
|
105 |
+
load_mask_valid: Whether to load the mask_valid "modality".
|
106 |
+
max_samples: Maximum number of samples to load.
|
107 |
+
pre_shuffle: Whether to shuffle the dataset before loading.
|
108 |
+
cache: Whether to cache the dataset in memory.
|
109 |
+
|
110 |
+
Returns:
|
111 |
+
FourM transfer dataset as a PyTorch Dataset.
|
112 |
+
"""
|
113 |
+
|
114 |
+
# Remove vq domains that require a tokenizer
|
115 |
+
modalities_without_vq = [mod for mod in all_domains if not modality_info[mod].get("requires_tokenizer", False)]
|
116 |
+
# If we are using a pre-tokenized modality, we default to pre-computed crop settings
|
117 |
+
if any([modality_info[domain].get("pretokenized", False) for domain in all_domains]):
|
118 |
+
modalities_without_vq.append("crop_settings")
|
119 |
+
modality_transforms = copy.deepcopy(modality_transforms)
|
120 |
+
modality_transforms["crop_settings"] = CropSettingsTransform()
|
121 |
+
|
122 |
+
if load_mask_valid:
|
123 |
+
modalities_without_vq.append("mask_valid")
|
124 |
+
modality_transforms = copy.deepcopy(modality_transforms)
|
125 |
+
modality_transforms["mask_valid"] = MaskTransform()
|
126 |
+
|
127 |
+
modality_paths = {mod: modality_info[mod]['path'] for mod in modality_info if modality_info[mod].get('path', None) is not None}
|
128 |
+
|
129 |
+
return MultiModalDatasetFolder(root=data_path, modalities=modalities_without_vq, modality_paths=modality_paths,
|
130 |
+
modality_transforms=modality_transforms, transform=transform, max_samples=max_samples,
|
131 |
+
pre_shuffle=pre_shuffle, cache=cache)
|
132 |
+
|
133 |
+
|
134 |
+
### Webdatasets (wds) functions
|
135 |
+
|
136 |
+
def _keyless_map(data, f, handler=reraise_exception):
|
137 |
+
"""Map samples without adding __key__."""
|
138 |
+
for sample in data:
|
139 |
+
try:
|
140 |
+
result = f(sample)
|
141 |
+
except Exception as exn:
|
142 |
+
if handler(exn):
|
143 |
+
continue
|
144 |
+
else:
|
145 |
+
break
|
146 |
+
if result is None:
|
147 |
+
continue
|
148 |
+
yield result
|
149 |
+
|
150 |
+
map = pipelinefilter(_keyless_map)
|
151 |
+
|
152 |
+
def check_dots(s):
|
153 |
+
if '.gz' in s:
|
154 |
+
return s.count('.') == 2
|
155 |
+
return s.count('.') == 1
|
156 |
+
|
157 |
+
def remove_ext_with_gz(s):
|
158 |
+
if s.endswith('.gz'):
|
159 |
+
s = s.replace(".gz", "")
|
160 |
+
return os.path.splitext(s)[0]
|
161 |
+
|
162 |
+
def wds_decoder(key, value):
|
163 |
+
if key == "png" or key.endswith(".png"):
|
164 |
+
img = Image.open(io.BytesIO(value))
|
165 |
+
return img
|
166 |
+
elif key == "jpg" or key.endswith(".jpg"):
|
167 |
+
img = Image.open(io.BytesIO(value))
|
168 |
+
return img
|
169 |
+
elif key == "jpeg" or key.endswith(".jpeg"):
|
170 |
+
img = Image.open(io.BytesIO(value))
|
171 |
+
return img
|
172 |
+
elif key == 'npy' or key.endswith("npy"):
|
173 |
+
content = np.load(io.BytesIO(value), allow_pickle=True)
|
174 |
+
# try:
|
175 |
+
# content = np.load(io.BytesIO(value))
|
176 |
+
# except:
|
177 |
+
# content = np.load(io.BytesIO(value), allow_pickle=True)
|
178 |
+
return content
|
179 |
+
elif key == "jpx" or key.endswith('.jpx'):
|
180 |
+
img = Image.open(io.BytesIO(value))
|
181 |
+
return img
|
182 |
+
elif 'output' in key:
|
183 |
+
return int(value)
|
184 |
+
else:
|
185 |
+
# If not an image, use the basic handlers (.txt, .json, .pickle, .npz, ...)
|
186 |
+
# See https://github.com/webdataset/webdataset/blob/main/webdataset/autodecode.py
|
187 |
+
return None
|
188 |
+
|
189 |
+
def repeat_fn(src, n_repeats=5):
|
190 |
+
"""
|
191 |
+
Repeat each sample n_repeats times.
|
192 |
+
E.g. A B C ... repeated 3 times becomes A A A B B B C C C ...
|
193 |
+
Depending on the downstream application, a shuffle should be added after this.
|
194 |
+
"""
|
195 |
+
for sample in src:
|
196 |
+
for _ in range(n_repeats):
|
197 |
+
yield sample
|
198 |
+
|
199 |
+
def remove_extensions(sample):
|
200 |
+
"""
|
201 |
+
In webdatasets, we identify the type of a given modality by adding an extension
|
202 |
+
in the form f"{modality_name}.{modality_extension}", e.g. "rgb.jpg" or "caption.json".
|
203 |
+
This function removes them and returns a dictionary of {f"{modality_name}": modality}.
|
204 |
+
"""
|
205 |
+
return {remove_ext_with_gz(k): v for k, v in sample.items()}
|
206 |
+
|
207 |
+
def filter_metadata(sample, metadata=['__key__', '__url__', 'file_name', 'class_name', 'class_idx']):
|
208 |
+
""" Filters out non-modality entries specified in metadata when loading tar files with webdatasets. """
|
209 |
+
return {k: v for k, v in sample.items() if k not in metadata}
|
210 |
+
|
211 |
+
def apply_modality_transforms(sample, modality_transforms):
|
212 |
+
""" Applies a dictionary of modality-specific transforms to a dictionary of modalities. """
|
213 |
+
return {k: (modality_transforms[get_transform_key(k)](v) if k in modality_transforms else v) for k, v in sample.items() }
|
214 |
+
|
215 |
+
def tok_to_int64(sample):
|
216 |
+
"""
|
217 |
+
Pre-computed tokens are saved as int16, but we need them as int64 instead.
|
218 |
+
"""
|
219 |
+
return {k: (v.astype('int64') if 'tok_' in k else v) for k, v in sample.items()}
|
220 |
+
|
221 |
+
def rename_modalities(sample, modality_paths):
|
222 |
+
"""
|
223 |
+
Renames modalities to their corresponding names in modality_paths.
|
224 |
+
"""
|
225 |
+
return {out_path: sample[loaded_path] for out_path, loaded_path in modality_paths.items()}
|
226 |
+
|
227 |
+
def extract_modality_names(s):
|
228 |
+
# Regular expression pattern to match anything enclosed in '{' and '}', and comma separated
|
229 |
+
pattern = r'\{([^}]*)\}'
|
230 |
+
match = re.search(pattern, s)
|
231 |
+
return match.group(1).split(',') if match else []
|
232 |
+
|
233 |
+
def identity(sample):
|
234 |
+
""" Identity function that does nothing. """
|
235 |
+
return sample
|
236 |
+
|
237 |
+
def multi_tarfile_samples(src_iter: Iterable[Dict[str, Any]],
|
238 |
+
modality_name_map: Dict[str, str] = None,
|
239 |
+
handler: Callable[[Exception], bool] = warn_and_continue):
|
240 |
+
"""Webdataset does not support splitting up shards by modality, so we need to do this manually.
|
241 |
+
Usually, we would need to save all modalities in the same tar file, e.g. shard_root_train/{00000..12345}.tar,
|
242 |
+
where each shard contains 1000 samples and each sample contains all modalities.
|
243 |
+
This is not flexible when adding new modalities, so we instead save each modality in a separate tar file,
|
244 |
+
e.g. shard_root_train_rgb/{00000..12345}.tar, shard_root_train_caption/{00000..12345}.tar, etc., where each shard contains
|
245 |
+
again 1000 samples, but each sample contains only one modality. All samples in all shards have to be aligned.
|
246 |
+
|
247 |
+
This function takes an iterator over shard URLs, where we use brace expansion to specify multiple tar files per modality.
|
248 |
+
E.g. shard_root_train_[rgb,caption]/00123.tar will be expanded to shard_root_train_rgb/00123.tar and shard_root_train_caption/00123.tar,
|
249 |
+
and the samples from these two tar files will be combined into a single sample.
|
250 |
+
|
251 |
+
Args:
|
252 |
+
src_iter: Iterator over shards that *already brace expanded the shard numbers*,
|
253 |
+
e.g. {'url': 'shard_root_train_[rgb,caption]/00000.tar'}, {'url': 'shard_root_train_[rgb,caption]/00001.tar'}, ...
|
254 |
+
This function will also work when no square braces for multiple modalities are used, e.g. {'url': 'shard_root_train/00000.tar'}, ...
|
255 |
+
It can be a drop-in replacement for wds.tarfile_samples.
|
256 |
+
modality_name_map: Optional dictionary specifying a mapping from modality folder names to arbitrary other names.
|
257 |
+
handler: Function that handles exceptions. If it returns True, the shard is skipped. If it returns False, the function exits.
|
258 |
+
|
259 |
+
Yields:
|
260 |
+
Dictionary of aligned samples from all modalities.
|
261 |
+
"""
|
262 |
+
for src in src_iter:
|
263 |
+
|
264 |
+
# Multi tar file URLs use brace expansion with square braces
|
265 |
+
multi_tar_urls = src['url'].translate(str.maketrans('[]', '{}'))
|
266 |
+
modality_names = extract_modality_names(multi_tar_urls)
|
267 |
+
if len(modality_names) == 0:
|
268 |
+
# Case where multi-modal braceexpand is not used, e.g. shard_dir/shard00000.tar
|
269 |
+
modality_names = [None]
|
270 |
+
multi_tar_urls = [multi_tar_urls]
|
271 |
+
elif len(modality_names) == 1:
|
272 |
+
# Brace expand doesn't work with a single entry, e.g. shard_dir/[foo]/shard00000.tar
|
273 |
+
multi_tar_urls = [multi_tar_urls.replace("{", "").replace("}", "")]
|
274 |
+
else:
|
275 |
+
# Remaining cases where multiple modalities are specified, e.g. shard_dir/[foo,bar]/shard00000.tar
|
276 |
+
multi_tar_urls = list(braceexpand.braceexpand(multi_tar_urls))
|
277 |
+
|
278 |
+
# Create tar iterators for shards of all modalities
|
279 |
+
tar_iters = [wds.tarfile_samples([{'url': tar_url}]) for tar_url in multi_tar_urls]
|
280 |
+
|
281 |
+
try:
|
282 |
+
# Loop over these iterators in parallel and combine the tar files from different modalities
|
283 |
+
for multi_tar_files in zip(*tar_iters):
|
284 |
+
|
285 |
+
merged_dict = {}
|
286 |
+
merged_dict['__key__'] = multi_tar_files[0]['__key__']
|
287 |
+
merged_dict['__url__'] = src['url']
|
288 |
+
|
289 |
+
for modality_name, modality_dict in zip(modality_names, multi_tar_files):
|
290 |
+
_key = modality_dict.pop('__key__')
|
291 |
+
_url = modality_dict.pop('__url__')
|
292 |
+
|
293 |
+
if _key != merged_dict['__key__']:
|
294 |
+
raise ValueError(f"Divergence detected! Trying to merge keys {_key} of {modality_name} and {merged_dict['__key__']} of merged_dict with modalities {merged_dict.keys()}.")
|
295 |
+
|
296 |
+
tar_is_multimodal = len(modality_dict) > 1
|
297 |
+
for k, v in modality_dict.items():
|
298 |
+
if tar_is_multimodal or check_dots(k) or modality_name is None:
|
299 |
+
# We don't change the keys in the following cases:
|
300 |
+
# 1. The shard contains multiple modalities. Then they *have* to follow the idx.modality_id.ext convention
|
301 |
+
# 2. If any key contains a dot, this means it already has the idx.modality_id.ext format (idx. is already removed at this stage)
|
302 |
+
# 3. If the modality name is None, no modality folder was specified (see beginning of function)
|
303 |
+
merged_dict[k] = v
|
304 |
+
else:
|
305 |
+
mapped_name = modality_name if modality_name_map is None else modality_name_map.get(modality_name, modality_name)
|
306 |
+
merged_dict[f'{mapped_name}.{k}'] = v
|
307 |
+
|
308 |
+
yield merged_dict
|
309 |
+
|
310 |
+
except Exception as e:
|
311 |
+
print(e)
|
312 |
+
print(f"Exception occurred while processing {src['url']}.")
|
313 |
+
if handler(e):
|
314 |
+
print('Skipping shard...')
|
315 |
+
continue
|
316 |
+
else:
|
317 |
+
break
|
318 |
+
|
319 |
+
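# --- Illustrative aside, not part of this file: how the square-bracket shard URLs consumed by
# --- multi_tarfile_samples above map onto brace expansion. The paths are made up.
import braceexpand
url = 'shard_root_train_[rgb,caption]/00123.tar'
expanded = list(braceexpand.braceexpand(url.translate(str.maketrans('[]', '{}'))))
# expanded == ['shard_root_train_rgb/00123.tar', 'shard_root_train_caption/00123.tar'];
# samples from these per-modality tar files are then zipped back into one multi-modal sample.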
def build_wds_fm_pretraining_dataloader(
|
320 |
+
data_path, all_domains, modality_info, modality_transforms, image_augmenter,
|
321 |
+
text_tokenizer, input_tokens_range, target_tokens_range,
|
322 |
+
num_gpus, num_workers, batch_size, epoch_size, sampling_weights=None, modality_name_map=None,
|
323 |
+
shuffle_buffer_load=1000, shuffle_buffer_repeat=5000, n_repeats=5):
|
324 |
+
"""Builds the WebDataset FourM pre-training dataloader based on the given arguments.
|
325 |
+
|
326 |
+
Args:
|
327 |
+
data_path: Path to the dataset.
|
328 |
+
all_domains: List of all modalities to be used.
|
329 |
+
modality_info: Dictionary containing information about the modalities.
|
330 |
+
modality_transforms: Dictionary containing the transforms for each modality.
|
331 |
+
image_augmenter: Image augmenter.
|
332 |
+
text_tokenizer: Text tokenizer (for sequence modalities).
|
333 |
+
input_tokens_range: Range of the input token budget.
|
334 |
+
target_tokens_range: Range of the target token budget.
|
335 |
+
num_gpus: Number of GPUs.
|
336 |
+
num_workers: Number of workers.
|
337 |
+
batch_size: Batch size.
|
338 |
+
epoch_size: Number of samples per "epoch". (Here, epoch refers to an interrupted training loop without evaluation or checkpointing).
|
339 |
+
sampling_weights: Sampling weights for the mixture of Dirichlet distributions.
|
340 |
+
modality_name_map: Optional dictionary specifying a mapping from modality folder names to arbitrary other names.
|
341 |
+
shuffle_buffer_load: Shuffle buffer size when loading samples from tar files (first shuffle).
|
342 |
+
shuffle_buffer_repeat: Shuffle buffer size after repeating samples (second shuffle).
|
343 |
+
n_repeats: Number of times to repeat each sample.
|
344 |
+
|
345 |
+
Returns:
|
346 |
+
FourM pre-training dataloader as a WebDataset DataLoader.
|
347 |
+
"""
|
348 |
+
|
349 |
+
modality_paths = {mod: modality_info[mod].get('path', None) or mod for mod in modality_info}
|
350 |
+
|
351 |
+
# Remove vq domains that require a tokenizer
|
352 |
+
modalities_without_vq = [mod for mod in all_domains if not modality_info[mod].get("requires_tokenizer", False)]
|
353 |
+
# If we are using a pre-tokenized modality, we default to pre-computed crop settings
|
354 |
+
if any([modality_info[domain].get("pretokenized", False) for domain in all_domains]):
|
355 |
+
modalities_without_vq.append("crop_settings")
|
356 |
+
modality_transforms = copy.deepcopy(modality_transforms)
|
357 |
+
modality_transforms["crop_settings"] = CropSettingsTransform()
|
358 |
+
modality_paths["crop_settings"] = "crop_settings"
|
359 |
+
|
360 |
+
# Webdatasets always adds __key__ to the dictionary, so we add a transform that does nothing with it
|
361 |
+
modality_transforms["__key__"] = IdentityTransform()
|
362 |
+
|
363 |
+
transform = transforms.Compose([
|
364 |
+
UnifiedDataTransform(transforms_dict=modality_transforms, image_augmenter=image_augmenter),
|
365 |
+
UnifiedMasking(modality_info=modality_info, text_tokenizer=text_tokenizer,
|
366 |
+
input_tokens_range=input_tokens_range, target_tokens_range=target_tokens_range,
|
367 |
+
sampling_weights=sampling_weights)
|
368 |
+
])
|
369 |
+
|
370 |
+
datapipe = wds.DataPipeline(
|
371 |
+
# Infinitely sample shards from the shard list with replacement. Each worker is seeded independently.
|
372 |
+
wds.ResampledShards(data_path),
|
373 |
+
partial(multi_tarfile_samples, modality_name_map=modality_name_map), # Extract individual samples from single or multi-modal tar files
|
374 |
+
wds.shuffle(shuffle_buffer_load), # Shuffle with a buffer of given size
|
375 |
+
wds.decode(wds_decoder), # Decode from bytes to PIL images, numpy arrays, etc.
|
376 |
+
wds.filters.compose(partial(repeat_fn, n_repeats=n_repeats)), # Repeats each sample n times -> A A A B B B C C C ...
|
377 |
+
wds.shuffle(shuffle_buffer_repeat), # Shuffle again with a buffer of given size
|
378 |
+
wds.map(remove_extensions), # Remove "file extensions" from dictionary keys
|
379 |
+
map(filter_metadata), # Remove non-task keys
|
380 |
+
map(tok_to_int64), # Convert pre-computed tokens to int64
|
381 |
+
map(partial(rename_modalities, modality_paths=modality_paths)), # Rename modalities to their corresponding names in modality_paths
|
382 |
+
map(transform), # Apply data augmentation and masking
|
383 |
+
wds.batched(batch_size, collation_fn=default_collate, partial=False)
|
384 |
+
if batch_size is not None else map(identity), # Batching
|
385 |
+
)
|
386 |
+
|
387 |
+
if epoch_size is not None:
|
388 |
+
batch_size_iter = batch_size if batch_size is not None else 1
|
389 |
+
datapipe = datapipe.with_epoch(epoch_size // (num_gpus * num_workers * batch_size_iter)) # Pre-define iterator length
|
390 |
+
|
391 |
+
if batch_size is not None:
|
392 |
+
# Perform multi-threaded dataloading
|
393 |
+
return wds.WebLoader(datapipe, num_workers=num_workers, batch_size=None)
|
394 |
+
else:
|
395 |
+
return datapipe
|
396 |
+
|
397 |
+
|
398 |
+
def build_wds_divae_dataloader(
|
399 |
+
data_path, modality_info, modality_transforms, image_augmenter,
|
400 |
+
num_gpus, num_workers, batch_size, epoch_size, shuffle_buffer_load=1000,
|
401 |
+
shuffle_buffer_repeat=5000, n_repeats=1):
|
402 |
+
|
403 |
+
modality_paths = {mod: modality_info[mod].get('path', None) or mod for mod in modality_info}
|
404 |
+
|
405 |
+
# Webdatasets always adds __key__ to the dictionary, so we add a transform that does nothing with it
|
406 |
+
modality_transforms["__key__"] = IdentityTransform()
|
407 |
+
|
408 |
+
transform = UnifiedDataTransform(transforms_dict=modality_transforms, image_augmenter=image_augmenter)
|
409 |
+
|
410 |
+
datapipe = wds.DataPipeline(
|
411 |
+
# Infinitely sample shards from the shard list with replacement. Each worker is seeded independently.
|
412 |
+
wds.ResampledShards(data_path),
|
413 |
+
multi_tarfile_samples, # Extract individual samples from single or multi-modal tar files
|
414 |
+
wds.shuffle(shuffle_buffer_load), # Shuffle with a buffer of given size
|
415 |
+
wds.decode(wds_decoder), # Decode from bytes to PIL images, numpy arrays, etc.
|
416 |
+
wds.filters.compose(partial(repeat_fn, n_repeats=n_repeats)), # Repeats each sample n times -> A A A B B B C C C ...
|
417 |
+
wds.shuffle(shuffle_buffer_repeat), # Shuffle again with a buffer of given size
|
418 |
+
map(remove_extensions), # Remove "file extensions" from dictionary keys
|
419 |
+
map(filter_metadata), # Remove non-task keys
|
420 |
+
map(tok_to_int64), # Convert pre-computed tokens to int64
|
421 |
+
map(partial(rename_modalities, modality_paths=modality_paths)), # Rename modalities to their corresponding names in modality_paths
|
422 |
+
map(transform), # Apply data augmentation and masking
|
423 |
+
wds.batched(batch_size, collation_fn=default_collate, partial=False)
|
424 |
+
if batch_size is not None else map(identity), # Batching
|
425 |
+
)
|
426 |
+
|
427 |
+
if epoch_size is not None:
|
428 |
+
batch_size_iter = batch_size if batch_size is not None else 1
|
429 |
+
datapipe = datapipe.with_epoch(epoch_size // (num_gpus * num_workers * batch_size_iter)) # Pre-define iterator length
|
430 |
+
|
431 |
+
if batch_size is not None:
|
432 |
+
# Perform multi-threaded dataloading
|
433 |
+
return wds.WebLoader(datapipe, num_workers=num_workers, batch_size=None)
|
434 |
+
else:
|
435 |
+
return datapipe
|
436 |
+
|
437 |
+
|
438 |
+
### Huggingface datasets functions
|
439 |
+
|
440 |
+
def text_to_caption(sample):
|
441 |
+
""" Rename "text" to "caption". """
|
442 |
+
return {'caption': sample['text']}
|
443 |
+
|
444 |
+
|
445 |
+
def build_huggingface_pretraining_dataloader(
|
446 |
+
data_path, all_domains, modality_info, modality_transforms, image_augmenter,
|
447 |
+
text_tokenizer, input_tokens_range, target_tokens_range,
|
448 |
+
num_gpus, num_workers, batch_size, epoch_size, split,
|
449 |
+
streaming=True, rename_text_to_caption=True, shuffle_buffer_load=10_000, shuffle_seed=0):
|
450 |
+
|
451 |
+
# Load huggingface dataset and split samples across workers. Shuffle samples in each worker
|
452 |
+
dataset = load_dataset(data_path, split=split, streaming=streaming)
|
453 |
+
dataset = split_dataset_by_node(dataset, rank=get_rank(), world_size=get_world_size())
|
454 |
+
dataset = dataset.shuffle(seed=shuffle_seed, buffer_size=shuffle_buffer_load)
|
455 |
+
|
456 |
+
modality_info = {mod: modality_info[mod] for mod in modality_info if mod in all_domains}
|
457 |
+
|
458 |
+
transform = transforms.Compose([
|
459 |
+
UnifiedDataTransform(transforms_dict=modality_transforms, image_augmenter=image_augmenter),
|
460 |
+
UnifiedMasking(modality_info=modality_info, text_tokenizer=text_tokenizer,
|
461 |
+
input_tokens_range=input_tokens_range, target_tokens_range=target_tokens_range)
|
462 |
+
])
|
463 |
+
|
464 |
+
datapipe = wds.DataPipeline(
|
465 |
+
dataset,
|
466 |
+
map(text_to_caption) if rename_text_to_caption else map(identity), # Rename "text" to "caption"
|
467 |
+
map(filter_metadata), # Remove non-task keys
|
468 |
+
map(transform), # Apply data augmentation and masking
|
469 |
+
wds.batched(batch_size, collation_fn=default_collate, partial=False)
|
470 |
+
if batch_size is not None else map(identity), # Batching
|
471 |
+
)
|
472 |
+
|
473 |
+
datapipe.n_shards = dataset.n_shards
|
474 |
+
num_workers = min(num_workers, dataset.n_shards)
|
475 |
+
|
476 |
+
if epoch_size is not None:
|
477 |
+
batch_size_iter = batch_size if batch_size is not None else 1
|
478 |
+
datapipe = datapipe.with_epoch(epoch_size // (num_gpus * num_workers * batch_size_iter)) # Pre-define iterator length
|
479 |
+
|
480 |
+
if batch_size is not None:
|
481 |
+
# Perform multi-threaded dataloading
|
482 |
+
return wds.WebLoader(datapipe, num_workers=num_workers, batch_size=None)
|
483 |
+
else:
|
484 |
+
return datapipe
|
485 |
+
|
486 |
+
|
487 |
+
### Multi-dataset loading utils
|
488 |
+
def make_empty_mod_dict(modality_info):
|
489 |
+
empty_mod_dicts = {}
|
490 |
+
|
491 |
+
for mod_name, mod_info in modality_info.items():
|
492 |
+
empty_mod = {}
|
493 |
+
|
494 |
+
# Tensor
|
495 |
+
if 'num_channels' in mod_info and 'input_size' in mod_info:
|
496 |
+
# Handle image-like modalities
|
497 |
+
max_tokens = mod_info['max_tokens']
|
498 |
+
empty_mod['tensor'] = torch.zeros((mod_info['num_channels'], mod_info['input_size'], mod_info['input_size']), dtype=torch.float32)
|
499 |
+
elif mod_name == 't5_caption':
|
500 |
+
# Handle T5 embedding
|
501 |
+
max_tokens = mod_info['max_tokens']
|
502 |
+
orig_emb_dim = mod_info['encoder_embedding']().orig_emb_dim
|
503 |
+
empty_mod['tensor'] = torch.zeros((max_tokens, orig_emb_dim), dtype=torch.float32)
|
504 |
+
elif mod_info['type'] in ['seq', 'seq_emb', 'seq_token']:
|
505 |
+
# Handle all other discrete sequence modalities
|
506 |
+
max_tokens = (mod_info['max_tokens'] + 1) * 2
|
507 |
+
empty_mod['tensor'] = torch.zeros((max_tokens), dtype=torch.int32)
|
508 |
+
else:
|
509 |
+
max_tokens = mod_info['max_tokens']
|
510 |
+
empty_mod['tensor'] = torch.zeros((max_tokens), dtype=torch.int32)
|
511 |
+
|
512 |
+
# Input and target masks
|
513 |
+
empty_mod['input_mask'] = torch.ones((max_tokens), dtype=torch.bool)
|
514 |
+
empty_mod['target_mask'] = torch.ones((max_tokens), dtype=torch.bool)
|
515 |
+
|
516 |
+
# Decoder attention mask
|
517 |
+
empty_mod['decoder_attention_mask'] = torch.zeros((max_tokens), dtype=torch.int32)
|
518 |
+
|
519 |
+
empty_mod_dicts[mod_name] = empty_mod
|
520 |
+
|
521 |
+
return empty_mod_dicts
|
522 |
+
|
523 |
+
|
524 |
+
class MixtureDataset(IterableDataset):
|
525 |
+
def __init__(self, data_iters, weights, modality_info):
|
526 |
+
self.orig_data_iters = data_iters
|
527 |
+
self.data_iters = [iter(data_iter) for data_iter in data_iters] # Create initial iterators
|
528 |
+
self.sampling_probs = np.array(weights) / sum(weights)
|
529 |
+
self.modality_info = modality_info
|
530 |
+
|
531 |
+
def reset_iterator(self, idx):
|
532 |
+
""" Reset the iterator when exhausted. """
|
533 |
+
self.data_iters[idx] = iter(self.orig_data_iters[idx])
|
534 |
+
|
535 |
+
def __iter__(self):
|
536 |
+
while True:
|
537 |
+
dataset_idx = np.random.choice(len(self.sampling_probs), p=self.sampling_probs)
|
538 |
+
try:
|
539 |
+
data = next(self.data_iters[dataset_idx])
|
540 |
+
except StopIteration: # If the iterator is exhausted
|
541 |
+
self.reset_iterator(dataset_idx) # Reset it
|
542 |
+
data = next(self.data_iters[dataset_idx])
|
543 |
+
|
544 |
+
mod_dict = make_empty_mod_dict(self.modality_info)
|
545 |
+
mod_dict.update(data)
|
546 |
+
yield mod_dict
|
547 |
+
|
548 |
+
|
549 |
+
def build_mixture_dataloader(data_iters, weights, modality_info, batch_size, num_workers, epoch_size, num_gpus):
|
550 |
+
mixture_pipe = wds.DataPipeline(
|
551 |
+
MixtureDataset(data_iters, weights, modality_info),
|
552 |
+
wds.batched(batch_size, collation_fn=default_collate, partial=False),
|
553 |
+
).with_epoch(epoch_size // (num_gpus * num_workers * batch_size)) # Pre-define iterator length
|
554 |
+
|
555 |
+
mixture_loader = wds.WebLoader(mixture_pipe, num_workers=num_workers, batch_size=None)
|
556 |
+
|
557 |
+
return mixture_loader
|
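A minimal sketch of how the mixture utilities above are meant to be combined (not part of the diff; the iterator names, weights, and sizes are placeholders):

# Two unbatched data pipelines (e.g. built with batch_size=None by the loaders above),
# sampled with 80/20 probability and batched into a single loader.
mixture_loader = build_mixture_dataloader(
    data_iters=[wds_iter_a, wds_iter_b], weights=[0.8, 0.2],
    modality_info=modality_info, batch_size=64,
    num_workers=8, epoch_size=100_000, num_gpus=1,
)
# Each yielded batch is a dict over modalities; modalities missing from a sampled dataset
# are filled in with the empty placeholders created by make_empty_mod_dict.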
fourm/demo_4M_sampler.py
ADDED
@@ -0,0 +1,540 @@
1 |
+
from typing import Optional, List
|
2 |
+
import os
|
3 |
+
import math
|
4 |
+
from PIL import Image
|
5 |
+
import numpy as np
|
6 |
+
import torch
|
7 |
+
import torch.nn as nn
|
8 |
+
import requests
|
9 |
+
from tokenizers import Tokenizer
|
10 |
+
import matplotlib.pyplot as plt
|
11 |
+
from torchvision.transforms.functional import center_crop
|
12 |
+
|
13 |
+
from fourm.models.fm import FM
|
14 |
+
from fourm.vq.vqvae import VQVAE, DiVAE
|
15 |
+
from fourm.models.generate import GenerationSampler, build_chained_generation_schedules, init_empty_target_modality, init_full_input_modality, custom_text
|
16 |
+
from fourm.utils.plotting_utils import decode_dict
|
17 |
+
from fourm.data.modality_info import MODALITY_INFO
|
18 |
+
from fourm.data.modality_transforms import RGBTransform
|
19 |
+
from fourm.utils import load_safetensors
|
20 |
+
from fourm.utils.plotting_utils import decode_dict, visualize_bboxes, plot_text_in_square, text_to_pil_image
|
21 |
+
|
22 |
+
# The flag below controls whether to allow TF32 on matmul. This flag defaults to False in PyTorch 1.12 and later.
|
23 |
+
torch.backends.cuda.matmul.allow_tf32 = True
|
24 |
+
# The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True.
|
25 |
+
torch.backends.cudnn.allow_tf32 = True
|
26 |
+
|
27 |
+
|
28 |
+
# Default chained generation order
|
29 |
+
DEFAULT_ORDER = [
|
30 |
+
'tok_clip@224', 'tok_dinov2@224', 'tok_imagebind@224', 'tok_depth@224', 'tok_normal@224',
|
31 |
+
'tok_semseg@224', 'tok_canny_edge@224', 'tok_sam_edge@224', 'tok_rgb@224',
|
32 |
+
'caption', 'det', 'human_poses', 'sam_instance', 'color_palette', 'metadata',
|
33 |
+
]
|
34 |
+
|
35 |
+
# Default super-resolution chained generation order
|
36 |
+
DEFAULT_ORDER_SR = [
|
37 |
+
'tok_clip@448', 'tok_depth@448', 'tok_normal@448',
|
38 |
+
'tok_semseg@448', 'tok_rgb@448',
|
39 |
+
]
|
40 |
+
|
41 |
+
# Default generation parameters for the case where the input contains RGB
|
42 |
+
DEFAULTS_RGB2X = {
|
43 |
+
'tok_clip@224/tok_depth@224/tok_normal@224/tok_semseg@224/tok_canny_edge@224/tok_sam_edge@224': {
|
44 |
+
'tokens_per_target': 196, 'autoregression_scheme': 'roar', 'decoding_steps': 1,
|
45 |
+
'token_decoding_schedule': 'linear', 'temp': 0.01, 'temp_schedule': 'constant',
|
46 |
+
'cfg_scale': 2.0, 'cfg_schedule': 'constant',
|
47 |
+
},
|
48 |
+
'tok_dinov2@224/tok_imagebind@224': {
|
49 |
+
'tokens_per_target': 256, 'autoregression_scheme': 'roar', 'decoding_steps': 1,
|
50 |
+
'token_decoding_schedule': 'linear', 'temp': 0.01, 'temp_schedule': 'constant',
|
51 |
+
'cfg_scale': 2.0, 'cfg_schedule': 'constant',
|
52 |
+
},
|
53 |
+
'caption/det': {
|
54 |
+
'tokens_per_target': 256, 'autoregression_scheme': 'autoregressive', 'decoding_steps': None,
|
55 |
+
'token_decoding_schedule': None, 'temp': 0.3, 'temp_schedule': 'constant',
|
56 |
+
'cfg_scale': 1.0, 'cfg_schedule': 'constant',
|
57 |
+
},
|
58 |
+
'human_poses': {
|
59 |
+
'tokens_per_target': 275, 'autoregression_scheme': 'autoregressive', 'decoding_steps': None,
|
60 |
+
'token_decoding_schedule': None, 'temp': 0.1, 'temp_schedule': 'constant',
|
61 |
+
'cfg_scale': 1.0, 'cfg_schedule': 'constant',
|
62 |
+
},
|
63 |
+
'sam_instance': {
|
64 |
+
'tokens_per_target': 256, 'autoregression_scheme': 'autoregressive', 'decoding_steps': None,
|
65 |
+
'token_decoding_schedule': None, 'temp': 0.01, 'temp_schedule': 'constant',
|
66 |
+
'cfg_scale': 1.0, 'cfg_schedule': 'constant',
|
67 |
+
},
|
68 |
+
'color_palette': {
|
69 |
+
'tokens_per_target': 23, 'autoregression_scheme': 'autoregressive', 'decoding_steps': None,
|
70 |
+
'token_decoding_schedule': None, 'temp': 0.1, 'temp_schedule': 'constant',
|
71 |
+
'cfg_scale': 1.0, 'cfg_schedule': 'constant',
|
72 |
+
},
|
73 |
+
'metadata': {
|
74 |
+
'tokens_per_target': 40, 'autoregression_scheme': 'autoregressive', 'decoding_steps': None,
|
75 |
+
'token_decoding_schedule': None, 'temp': 0.1, 'temp_schedule': 'constant',
|
76 |
+
'cfg_scale': 1.0, 'cfg_schedule': 'constant',
|
77 |
+
},
|
78 |
+
}
|
79 |
+
|
80 |
+
# Default generation parameters for the case where the target is RGB
|
81 |
+
DEFAULTS_X2RGB = {
|
82 |
+
'tok_clip@224': {
|
83 |
+
'tokens_per_target': 196, 'autoregression_scheme': 'roar', 'decoding_steps': 50,
|
84 |
+
'token_decoding_schedule': 'linear', 'temp': 5.0, 'temp_schedule': 'onex:0.5:0.5',
|
85 |
+
'cfg_scale': 3.0, 'cfg_schedule': 'constant',
|
86 |
+
},
|
87 |
+
'tok_dinov2@224/tok_imagebind@224': {
|
88 |
+
'tokens_per_target': 256, 'autoregression_scheme': 'roar', 'decoding_steps': 8,
|
89 |
+
'token_decoding_schedule': 'linear', 'temp': 0.01, 'temp_schedule': 'constant',
|
90 |
+
'cfg_scale': 2.0, 'cfg_schedule': 'constant',
|
91 |
+
},
|
92 |
+
'tok_depth@224/tok_normal@224/tok_semseg@224/tok_canny_edge@224/tok_sam_edge@224': {
|
93 |
+
'tokens_per_target': 196, 'autoregression_scheme': 'roar', 'decoding_steps': 8,
|
94 |
+
'token_decoding_schedule': 'linear', 'temp': 3.0, 'temp_schedule': 'onex:0.5:0.5',
|
95 |
+
'cfg_scale': 2.0, 'cfg_schedule': 'constant',
|
96 |
+
},
|
97 |
+
'tok_rgb@224': {
|
98 |
+
'tokens_per_target': 196, 'autoregression_scheme': 'roar', 'decoding_steps': 25,
|
99 |
+
'token_decoding_schedule': 'linear', 'temp': 3.0, 'temp_schedule': 'onex:0.5:0.5',
|
100 |
+
'cfg_scale': 2.0, 'cfg_schedule': 'constant',
|
101 |
+
},
|
102 |
+
'caption/det': {
|
103 |
+
'tokens_per_target': 256, 'autoregression_scheme': 'autoregressive', 'decoding_steps': None,
|
104 |
+
'token_decoding_schedule': None, 'temp': 0.3, 'temp_schedule': 'constant',
|
105 |
+
'cfg_scale': 1.0, 'cfg_schedule': 'constant',
|
106 |
+
},
|
107 |
+
'human_poses': {
|
108 |
+
'tokens_per_target': 275, 'autoregression_scheme': 'autoregressive', 'decoding_steps': None,
|
109 |
+
'token_decoding_schedule': None, 'temp': 0.1, 'temp_schedule': 'constant',
|
110 |
+
'cfg_scale': 1.0, 'cfg_schedule': 'constant',
|
111 |
+
},
|
112 |
+
'sam_instance': {
|
113 |
+
'tokens_per_target': 256, 'autoregression_scheme': 'autoregressive', 'decoding_steps': None,
|
114 |
+
'token_decoding_schedule': None, 'temp': 0.01, 'temp_schedule': 'constant',
|
115 |
+
'cfg_scale': 1.0, 'cfg_schedule': 'constant',
|
116 |
+
},
|
117 |
+
'color_palette': {
|
118 |
+
'tokens_per_target': 23, 'autoregression_scheme': 'autoregressive', 'decoding_steps': None,
|
119 |
+
'token_decoding_schedule': None, 'temp': 0.1, 'temp_schedule': 'constant',
|
120 |
+
'cfg_scale': 1.0, 'cfg_schedule': 'constant',
|
121 |
+
},
|
122 |
+
'metadata': {
|
123 |
+
'tokens_per_target': 40, 'autoregression_scheme': 'autoregressive', 'decoding_steps': None,
|
124 |
+
'token_decoding_schedule': None, 'temp': 0.1, 'temp_schedule': 'constant',
|
125 |
+
'cfg_scale': 1.0, 'cfg_schedule': 'constant',
|
126 |
+
},
|
127 |
+
}
|
128 |
+
|
129 |
+
# Default generation parameters for super-resolution
|
130 |
+
DEFAULTS_SR = {
|
131 |
+
'tok_clip@448/tok_depth@448/tok_normal@448/tok_semseg@448/tok_rgb@448': {
|
132 |
+
'tokens_per_target': 784, 'autoregression_scheme': 'maskgit', 'decoding_steps': 8,
|
133 |
+
'token_decoding_schedule': 'cosine', 'temp': 1.0, 'temp_schedule': 'constant',
|
134 |
+
'cfg_scale': 2.0, 'cfg_schedule': 'constant',
|
135 |
+
},
|
136 |
+
}
|
137 |
+
|
138 |
+
# Plotting names for each modality
|
139 |
+
MODALITY_PLOTTING_NAME_MAP = {
|
140 |
+
'caption': 'Caption',
|
141 |
+
'det': 'Bounding boxes',
|
142 |
+
'human_poses': 'Human poses',
|
143 |
+
'sam_instance': 'SAM instances (single pass)',
|
144 |
+
'color_palette': 'Color palette',
|
145 |
+
'metadata': 'Metadata',
|
146 |
+
'rgb@224': 'RGB (224x224)',
|
147 |
+
'rgb@448': 'RGB (448x448)',
|
148 |
+
'tok_rgb@224': 'RGB (tokenized, 224x224)',
|
149 |
+
'tok_rgb@448': 'RGB (tokenized, 448x448)',
|
150 |
+
'tok_clip@224': 'CLIP-B/16 (224x224)',
|
151 |
+
'tok_clip@448': 'CLIP-B/16 (448x448)',
|
152 |
+
'tok_depth@224': 'Depth (224x224)',
|
153 |
+
'tok_depth@448': 'Depth (448x448)',
|
154 |
+
'tok_normal@224': 'Normals (224x224)',
|
155 |
+
'tok_normal@448': 'Normals (448x448)',
|
156 |
+
'tok_semseg@224': 'Semantic segmentation (224x224)',
|
157 |
+
'tok_semseg@448': 'Semantic segmentation (448x448)',
|
158 |
+
'tok_canny_edge@224': 'Canny edges (224x224)',
|
159 |
+
'tok_sam_edge@224': 'SAM edges (224x224)',
|
160 |
+
'tok_dinov2@224': 'DINOv2-B/14 (224x224)',
|
161 |
+
'tok_imagebind@224': 'ImageBind-H/14 (224x224)',
|
162 |
+
}
|
163 |
+
|
164 |
+
# Optional fixed plotting order (by default, plotting order is determined by generation order)
|
165 |
+
MODALITY_PLOTTING_ORDER = [
|
166 |
+
'rgb@224', 'rgb@448', 'tok_rgb@224', 'tok_rgb@448',
|
167 |
+
'tok_depth@224', 'tok_depth@448', 'tok_normal@224', 'tok_normal@448',
|
168 |
+
'tok_semseg@224', 'tok_semseg@448', 'tok_canny_edge@224', 'tok_sam_edge@224',
|
169 |
+
'sam_instance', 'human_poses', 'det', 'caption', 'metadata', 'color_palette',
|
170 |
+
'tok_clip@224', 'tok_clip@448', 'tok_dinov2@224', 'tok_imagebind@224',
|
171 |
+
]
|
172 |
+
|
173 |
+
|
174 |
+
def get_value(defaults_dict, domain, key):
|
175 |
+
"""Look up a default value belonging to a given domain and key."""
|
176 |
+
for domains, defaults in defaults_dict.items():
|
177 |
+
if domain in domains:
|
178 |
+
return defaults[key]
|
179 |
+
|
180 |
+
def load_model(model_id, model_class):
|
181 |
+
"""Load a model from HuggingFace hub or a given .safetensors checkpoint path."""
|
182 |
+
if model_id.endswith('.safetensors'):
|
183 |
+
ckpt, config = load_safetensors(model_id)
|
184 |
+
model = model_class(config=config)
|
185 |
+
model.load_state_dict(ckpt)
|
186 |
+
else:
|
187 |
+
model = model_class.from_pretrained(model_id)
|
188 |
+
return model
|
189 |
+
|
190 |
+
def img_from_url(url: str):
|
191 |
+
rgb_transform = RGBTransform(imagenet_default_mean_and_std=True)
|
192 |
+
img_data = requests.get(url).content
|
193 |
+
with open('demo.png', 'wb') as handler:
|
194 |
+
handler.write(img_data)
|
195 |
+
img_pil = rgb_transform.load('./demo.png')
|
196 |
+
img_pil = rgb_transform.preprocess(img_pil)
|
197 |
+
img_pil = center_crop(img_pil, (min(img_pil.size), min(img_pil.size))).resize((224,224))
|
198 |
+
img = rgb_transform.postprocess(img_pil).unsqueeze(0)
|
199 |
+
return img
|
200 |
+
|
201 |
+
|
202 |
+
class Demo4MSampler(nn.Module):
|
203 |
+
"""Convenience wrapper for easy 4M loading and generation. Users can specify HuggingFace Hub
|
204 |
+
model URLs, or downloaded safetensors checkpoints paths, and the models will be automatically
|
205 |
+
loaded. The `forward` function can be used for RGB-2-all and {caption,det}-2-all generation.
|
206 |
+
This wrapper is only intended for quickly trying out 4M models. For more advanced usecases we
|
207 |
+
recommend looking at the generation notebooks in `./notebooks/`, and `./run_generation.py`.
|
208 |
+
|
209 |
+
Args:
|
210 |
+
fm: Hub or safetensors path of 4M base model
|
211 |
+
fm_sr: Hub or safetensors path of 4M super-resolution model
|
212 |
+
tok_rgb: Hub or safetensors path of RGB tokenizer
|
213 |
+
tok_depth: Hub or safetensors path of depth tokenizer
|
214 |
+
tok_normal: Hub or safetensors path of surface normal tokenizer
|
215 |
+
tok_edge: Hub or safetensors path of canny edge tokenizer (for SAM and RGB edges)
|
216 |
+
tok_semseg: Hub or safetensors path of COCO semantic segmentation tokenizer
|
217 |
+
tok_clip: Hub or safetensors path of CLIP-B/16 tokenizer
|
218 |
+
tok_dinov2: Hub or safetensors path of DINOv2-B/14 tokenizer
|
219 |
+
tok_imagebind: Hub or safetensors path of ImageBind-H/14 tokenizer
|
220 |
+
tok_sam_instance: Hub or safetensors path of SAM instance tokenizer
|
221 |
+
tok_human_poses: Hub or safetensors path of human poses tokenizer
|
222 |
+
tok_text: Path to text tokenizer JSON file
|
223 |
+
mods: Optional list of modalities to override default behavior of generating everything
|
224 |
+
mods_sr: Optional list of super-res modalities to override default behavior of generating everything
|
225 |
+
"""
|
226 |
+
def __init__(self,
|
227 |
+
fm: str = 'EPFL-VILAB/4M-21_XL_CC12M',
|
228 |
+
fm_sr: Optional[str] = 'EPFL-VILAB/4M-7-SR_L_CC12M',
|
229 |
+
tok_rgb: Optional[str] = 'EPFL-VILAB/4M_tokenizers_rgb_16k_224-448',
|
230 |
+
tok_depth: Optional[str] = 'EPFL-VILAB/4M_tokenizers_depth_8k_224-448',
|
231 |
+
tok_normal: Optional[str] = 'EPFL-VILAB/4M_tokenizers_normal_8k_224-448',
|
232 |
+
tok_edge: Optional[str] = 'EPFL-VILAB/4M_tokenizers_edge_8k_224-512',
|
233 |
+
tok_semseg: Optional[str] = 'EPFL-VILAB/4M_tokenizers_semseg_4k_224-448',
|
234 |
+
tok_clip: Optional[str] = 'EPFL-VILAB/4M_tokenizers_CLIP-B16_8k_224-448',
|
235 |
+
tok_dinov2: Optional[str] = 'EPFL-VILAB/4M_tokenizers_DINOv2-B14_8k_224-448',
|
236 |
+
tok_imagebind: Optional[str] = 'EPFL-VILAB/4M_tokenizers_ImageBind-H14_8k_224-448',
|
237 |
+
tok_sam_instance: Optional[str] = 'EPFL-VILAB/4M_tokenizers_sam-instance_1k_64',
|
238 |
+
tok_human_poses: Optional[str] = 'EPFL-VILAB/4M_tokenizers_human-poses_1k_8',
|
239 |
+
tok_text: str = './fourm/utils/tokenizer/trained/text_tokenizer_4m_wordpiece_30k.json',
|
240 |
+
mods: Optional[List[str]] = None,
|
241 |
+
mods_sr: Optional[List[str]] = None,
|
242 |
+
verbose: bool = True):
|
243 |
+
super().__init__()
|
244 |
+
|
245 |
+
self.verbose = verbose
|
246 |
+
if self.verbose:
|
247 |
+
print('Loading 4M models and tokenizers...', end='')
|
248 |
+
|
249 |
+
# Load 4M model and initialize sampler
|
250 |
+
fm = load_model(fm, FM)
|
251 |
+
self.sampler_fm = GenerationSampler(fm)
|
252 |
+
self.mods = mods or list(set(fm.encoder_modalities) | set(fm.decoder_modalities))
|
253 |
+
|
254 |
+
# Load optional 4M super-res model and initialize sampler
|
255 |
+
if fm_sr is not None:
|
256 |
+
fm_sr = load_model(fm_sr, FM)
|
257 |
+
self.sampler_fm_sr = GenerationSampler(fm_sr)
|
258 |
+
self.mods_sr = mods_sr or list(set(fm_sr.encoder_modalities) | set(fm_sr.decoder_modalities))
|
259 |
+
else:
|
260 |
+
self.sampler_fm_sr = None
|
261 |
+
|
262 |
+
# Load tokenizers
|
263 |
+
self.toks = {}
|
264 |
+
if ('tok_rgb@224' in self.mods or 'tok_rgb@448' in self.mods_sr) and tok_rgb is not None:
|
265 |
+
self.toks['tok_rgb'] = load_model(tok_rgb, DiVAE)
|
266 |
+
if ('tok_depth@224' in self.mods or 'tok_depth@448' in self.mods_sr) and tok_depth is not None:
|
267 |
+
self.toks['tok_depth'] = load_model(tok_depth, DiVAE)
|
268 |
+
if ('tok_normal@224' in self.mods or 'tok_normal@448' in self.mods_sr) and tok_normal is not None:
|
269 |
+
self.toks['tok_normal'] = load_model(tok_normal, DiVAE)
|
270 |
+
if ('tok_canny_edge@224' in self.mods or 'tok_sam_edge@224' in self.mods) and tok_edge is not None:
|
271 |
+
self.toks['tok_canny_edge'] = load_model(tok_edge, DiVAE)
|
272 |
+
self.toks['tok_sam_edge'] = self.toks['tok_canny_edge'] # Shared tokenizer
|
273 |
+
if ('tok_semseg@224' in self.mods or 'tok_semseg@448' in self.mods_sr) and tok_semseg is not None:
|
274 |
+
self.toks['tok_semseg'] = load_model(tok_semseg, VQVAE)
|
275 |
+
if ('tok_clip@224' in self.mods or 'tok_clip@448' in self.mods_sr) and tok_clip is not None:
|
276 |
+
self.toks['tok_clip'] = load_model(tok_clip, VQVAE)
|
277 |
+
if 'tok_dinov2@224' in self.mods and tok_dinov2 is not None:
|
278 |
+
self.toks['tok_dinov2'] = load_model(tok_dinov2, VQVAE)
|
279 |
+
if 'tok_imagebind@224' in self.mods and tok_imagebind is not None:
|
280 |
+
self.toks['tok_imagebind'] = load_model(tok_imagebind, VQVAE)
|
281 |
+
if 'sam_instance' in self.mods and tok_sam_instance is not None:
|
282 |
+
self.toks['sam_instance'] = load_model(tok_sam_instance, VQVAE)
|
283 |
+
if 'human_poses' in self.mods and tok_human_poses is not None:
|
284 |
+
self.toks['human_poses'] = load_model(tok_human_poses, VQVAE)
|
285 |
+
self.toks = nn.ModuleDict(self.toks)
|
286 |
+
self.tok_text = Tokenizer.from_file(tok_text)
|
287 |
+
|
288 |
+
if self.verbose:
|
289 |
+
print(' done!')
|
290 |
+
|
291 |
+
@property
|
292 |
+
def device(self):
|
293 |
+
return next(self.parameters()).device
|
294 |
+
|
295 |
+
def __setup_conds_and_targets(self, sample):
|
296 |
+
# Input and output modalities
|
297 |
+
cond_domains = [domain for domain in list(sample.keys()) if domain in self.mods]
|
298 |
+
target_domains = [domain for domain in DEFAULT_ORDER if (domain not in cond_domains and domain in self.mods)]
|
299 |
+
if 'rgb@224' in cond_domains:
|
300 |
+
# Do not generate tokenized RGB if pixel RGB is given as input
|
301 |
+
target_domains.remove('tok_rgb@224')
|
302 |
+
return cond_domains, target_domains
|
303 |
+
|
304 |
+
def __setup_sr_conds_and_targets(self, sample):
|
305 |
+
cond_domains_sr = [domain for domain in list(sample.keys()) if domain in self.mods_sr]
|
306 |
+
target_domains_sr = [domain for domain in DEFAULT_ORDER_SR if (domain.replace('448', '224') in cond_domains_sr and domain in self.mods_sr)]
|
307 |
+
return cond_domains_sr, target_domains_sr
|
308 |
+
|
309 |
+
def __setup_sample_and_schedule(self, sample, cond_domains, target_domains, cfg_grow_conditioning=True):
|
310 |
+
# 1 - Setup generation schedule
|
311 |
+
|
312 |
+
defaults = DEFAULTS_RGB2X if ('rgb@224' in cond_domains or 'tok_rgb@224' in cond_domains) else DEFAULTS_X2RGB
|
313 |
+
|
314 |
+
tokens_per_target = [get_value(defaults, domain, 'tokens_per_target') for domain in target_domains]
|
315 |
+
autoregression_schemes = [get_value(defaults, domain, 'autoregression_scheme') for domain in target_domains]
|
316 |
+
decoding_steps = [get_value(defaults, domain, 'decoding_steps') for domain in target_domains]
|
317 |
+
token_decoding_schedules = [get_value(defaults, domain, 'token_decoding_schedule') for domain in target_domains]
|
318 |
+
temps = [get_value(defaults, domain, 'temp') for domain in target_domains]
|
319 |
+
temp_schedules = [get_value(defaults, domain, 'temp_schedule') for domain in target_domains]
|
320 |
+
cfg_scales = [get_value(defaults, domain, 'cfg_scale') for domain in target_domains]
|
321 |
+
cfg_schedules = [get_value(defaults, domain, 'cfg_schedule') for domain in target_domains]
|
322 |
+
|
323 |
+
schedule = build_chained_generation_schedules(
|
324 |
+
cond_domains=cond_domains, target_domains=target_domains, tokens_per_target=tokens_per_target,
|
325 |
+
autoregression_schemes=autoregression_schemes, decoding_steps=decoding_steps,
|
326 |
+
token_decoding_schedules=token_decoding_schedules, temps=temps, temp_schedules=temp_schedules,
|
327 |
+
cfg_scales=cfg_scales, cfg_schedules=cfg_schedules, cfg_grow_conditioning=cfg_grow_conditioning,
|
328 |
+
)
|
329 |
+
|
330 |
+
# 2 - Setup sample
|
331 |
+
|
332 |
+
sample_dict = {}
|
333 |
+
|
334 |
+
# Handle special cases
|
335 |
+
if 'caption' in sample:
|
336 |
+
caption = sample.pop('caption')
|
337 |
+
sample_dict = custom_text(
|
338 |
+
sample_dict, input_text=caption, eos_token='[EOS]',
|
339 |
+
key='caption', device=self.device, text_tokenizer=self.tok_text
|
340 |
+
)
|
341 |
+
if 'det' in sample:
|
342 |
+
caption = sample.pop('det')
|
343 |
+
sample_dict = custom_text(
|
344 |
+
sample_dict, input_text=caption, eos_token='[EOS]',
|
345 |
+
key='det', device=self.device, text_tokenizer=self.tok_text
|
346 |
+
)
|
347 |
+
# Add remaining modalities
|
348 |
+
sample_dict.update({domain: {'tensor': tensor} for domain, tensor in sample.items()})
|
349 |
+
|
350 |
+
# Initialize these remaining input modalities (caption and det are already initialized by custom_text)
|
351 |
+
for cond_mod in sample.keys():
|
352 |
+
sample_dict = init_full_input_modality(sample_dict, MODALITY_INFO, cond_mod, self.device, eos_id=self.tok_text.token_to_id("[EOS]"))
|
353 |
+
|
354 |
+
# Initialize target modalities
|
355 |
+
for target_mod, ntoks in zip(target_domains, tokens_per_target):
|
356 |
+
sample_dict = init_empty_target_modality(sample_dict, MODALITY_INFO, target_mod, 1, ntoks, self.device)
|
357 |
+
|
358 |
+
return sample_dict, schedule
|
359 |
+
|
360 |
+
def __setup_sr_sample_and_schedule(self, out_dict, cond_domains_sr, target_domains_sr, cfg_grow_conditioning_sr=True):
|
361 |
+
# 1 - Setup generation schedule
|
362 |
+
|
363 |
+
tokens_per_target_sr = [get_value(DEFAULTS_SR, domain, 'tokens_per_target') for domain in target_domains_sr]
|
364 |
+
autoregression_schemes_sr = [get_value(DEFAULTS_SR, domain, 'autoregression_scheme') for domain in target_domains_sr]
|
365 |
+
decoding_steps_sr = [get_value(DEFAULTS_SR, domain, 'decoding_steps') for domain in target_domains_sr]
|
366 |
+
token_decoding_schedules_sr = [get_value(DEFAULTS_SR, domain, 'token_decoding_schedule') for domain in target_domains_sr]
|
367 |
+
temps_sr = [get_value(DEFAULTS_SR, domain, 'temp') for domain in target_domains_sr]
|
368 |
+
temp_schedules_sr = [get_value(DEFAULTS_SR, domain, 'temp_schedule') for domain in target_domains_sr]
|
369 |
+
cfg_scales_sr = [get_value(DEFAULTS_SR, domain, 'cfg_scale') for domain in target_domains_sr]
|
370 |
+
cfg_schedules_sr = [get_value(DEFAULTS_SR, domain, 'cfg_schedule') for domain in target_domains_sr]
|
371 |
+
|
372 |
+
schedule_sr = build_chained_generation_schedules(
|
373 |
+
cond_domains=cond_domains_sr, target_domains=target_domains_sr, tokens_per_target=tokens_per_target_sr,
|
374 |
+
autoregression_schemes=autoregression_schemes_sr, decoding_steps=decoding_steps_sr,
|
375 |
+
token_decoding_schedules=token_decoding_schedules_sr, temps=temps_sr, temp_schedules=temp_schedules_sr,
|
376 |
+
cfg_scales=cfg_scales_sr, cfg_schedules=cfg_schedules_sr, cfg_grow_conditioning=cfg_grow_conditioning_sr,
|
377 |
+
)
|
378 |
+
|
379 |
+
# 2 - Setup sample
|
380 |
+
|
381 |
+
sample_sr = out_dict
|
382 |
+
|
383 |
+
# Handle the case where the generated caption or bounding boxes are just [EOS]
|
384 |
+
if 'caption' in sample_sr and sample_sr['caption']['tensor'].shape[1] <= 1 and 'caption' in cond_domains_sr:
|
385 |
+
sample_sr = custom_text(
|
386 |
+
sample_sr, input_text='[S_1]', eos_token='[EOS]',
|
387 |
+
key='caption', device=self.device, text_tokenizer=self.tok_text
|
388 |
+
)
|
389 |
+
if 'det' in sample_sr and sample_sr['det']['tensor'].shape[1] <= 1 and 'det' in cond_domains_sr:
|
390 |
+
sample_sr = custom_text(
|
391 |
+
sample_sr, input_text='[S_1]', eos_token='[EOS]',
|
392 |
+
key='det', device=self.device, text_tokenizer=self.tok_text
|
393 |
+
)
|
394 |
+
|
395 |
+
# Initialize input modalities
|
396 |
+
for cond_mod in cond_domains_sr:
|
397 |
+
sample_sr = init_full_input_modality(sample_sr, MODALITY_INFO, cond_mod, self.device, eos_id=self.tok_text.token_to_id("[EOS]"))
|
398 |
+
|
399 |
+
# Initialize target modalities
|
400 |
+
for target_mod, ntoks in zip(target_domains_sr, tokens_per_target_sr):
|
401 |
+
sample_sr = init_empty_target_modality(sample_sr, MODALITY_INFO, target_mod, 1, ntoks, self.device)
|
402 |
+
|
403 |
+
return sample_sr, schedule_sr
|
404 |
+
|
405 |
+
def forward(self, sample, seed: Optional[int] = None, top_p: float = 0.8, top_k: float = 0.0, target_modalities: Optional[List[str]] = None, perform_sr: bool = True):
|
406 |
+
seed = seed or np.random.randint(np.iinfo(np.int64).max)
|
407 |
+
|
408 |
+
# Prepare the generation parameters and sample
|
409 |
+
cond_domains, target_domains = self.__setup_conds_and_targets(sample)
|
410 |
+
target_domains = target_modalities or target_domains
|
411 |
+
sample, generation_schedule = self.__setup_sample_and_schedule(sample, cond_domains, target_domains)
|
412 |
+
|
413 |
+
# Generation and decoding at the base resolution 224x224
|
414 |
+
if self.verbose:
|
415 |
+
print(f'Generating {cond_domains} -> {target_domains} ...')
|
416 |
+
out_dict = self.sampler_fm.generate(
|
417 |
+
sample, generation_schedule, text_tokenizer=self.tok_text,
|
418 |
+
verbose=self.verbose, seed=seed, top_p=top_p, top_k=top_k,
|
419 |
+
)
|
420 |
+
dec_dict = decode_dict(
|
421 |
+
out_dict, self.toks, self.tok_text, image_size=224,
|
422 |
+
patch_size=16, decoding_steps=50
|
423 |
+
)
|
424 |
+
|
425 |
+
# Optional upsampling to 448x448
|
426 |
+
if self.sampler_fm_sr is not None and perform_sr:
|
427 |
+
cond_domains_sr, target_domains_sr = self.__setup_sr_conds_and_targets(out_dict)
|
428 |
+
sample_sr, generation_schedule_sr = self.__setup_sr_sample_and_schedule(out_dict, cond_domains_sr, target_domains_sr)
|
429 |
+
|
430 |
+
if self.verbose:
|
431 |
+
print(f'Super-resolving {target_domains_sr} ...')
|
432 |
+
out_dict_sr = self.sampler_fm_sr.generate(
|
433 |
+
sample_sr, generation_schedule_sr, text_tokenizer=self.tok_text,
|
434 |
+
verbose=self.verbose, seed=seed+1, top_p=top_p, top_k=top_k,
|
435 |
+
)
|
436 |
+
dec_dict = decode_dict(
|
437 |
+
out_dict_sr, self.toks, self.tok_text, image_size=448,
|
438 |
+
patch_size=16, decoding_steps=50
|
439 |
+
)
|
440 |
+
|
441 |
+
# Remove padding tokens
|
442 |
+
if 'caption' in dec_dict:
|
443 |
+
dec_dict['caption'][0] = dec_dict['caption'][0].replace('[PAD]', '').strip()
|
444 |
+
if 'det' in dec_dict:
|
445 |
+
dec_dict['det'][0] = dec_dict['det'][0].replace('[PAD]', '').strip()
|
446 |
+
|
447 |
+
return dec_dict
|
448 |
+
|
449 |
+
def plot_modalities(self, mod_dict, ncols_max=5, figscale=4.0, save_path=None, use_fixed_plotting_order=False):
|
450 |
+
nmods = len(mod_dict)
|
451 |
+
ncols = min(nmods, ncols_max)
|
452 |
+
nrows = math.ceil(nmods / ncols)
|
453 |
+
|
454 |
+
fig, ax = plt.subplots(
|
455 |
+
nrows=nrows, ncols=ncols,
|
456 |
+
figsize=(ncols*figscale, nrows*figscale),
|
457 |
+
facecolor=(1, 1, 1)
|
458 |
+
)
|
459 |
+
|
460 |
+
if use_fixed_plotting_order:
|
461 |
+
mod_dict = {
|
462 |
+
k: mod_dict[k] for k in MODALITY_PLOTTING_ORDER
|
463 |
+
if k in mod_dict
|
464 |
+
}
|
465 |
+
|
466 |
+
for i, (mod_name, mod) in enumerate(mod_dict.items()):
|
467 |
+
if nrows == 1:
|
468 |
+
ax_i = ax[i]
|
469 |
+
else:
|
470 |
+
row, col = i // ncols, i % ncols
|
471 |
+
ax_i = ax[row,col]
|
472 |
+
|
473 |
+
if mod_name == 'det':
|
474 |
+
# Attempt to get the first available value from mod_dict according to the priority
|
475 |
+
keys_in_order = ['rgb@448', 'rgb@224', 'tok_rgb@448', 'tok_rgb@224']
|
476 |
+
rgb_background = next((mod_dict[key] for key in keys_in_order if key in mod_dict), np.ones((224, 224, 3)))
|
477 |
+
rgb_background = (255 * rgb_background).astype(np.uint8)
|
478 |
+
ax_i.imshow(visualize_bboxes(rgb_background, mod[0],).astype(np.uint8))
|
479 |
+
elif mod_name == 'caption':
|
480 |
+
plot_text_in_square(ax_i, mod[0], wrap_width=16, fontsize=14)
|
481 |
+
elif mod_name == 'metadata':
|
482 |
+
metadata_pred = ',\n'.join([f'{k}: {v:.2f}' if isinstance(v, float) else f'{k}: {v}' for k, v in mod.items()])
|
483 |
+
plot_text_in_square(ax_i, metadata_pred, wrap_width=36, fontsize=13)
|
484 |
+
else:
|
485 |
+
ax_i.imshow(mod)
|
486 |
+
|
487 |
+
ax_i.set_title(MODALITY_PLOTTING_NAME_MAP.get(mod_name, mod_name), fontsize=18)
|
488 |
+
|
489 |
+
for i, axis in enumerate(ax.flatten()):
|
490 |
+
axis.set_xticks([])
|
491 |
+
axis.set_yticks([])
|
492 |
+
if i >= len(mod_dict):
|
493 |
+
axis.spines['top'].set_visible(False)
|
494 |
+
axis.spines['right'].set_visible(False)
|
495 |
+
axis.spines['bottom'].set_visible(False)
|
496 |
+
axis.spines['left'].set_visible(False)
|
497 |
+
|
498 |
+
plt.tight_layout()
|
499 |
+
if save_path is not None:
|
500 |
+
os.makedirs(os.path.dirname(save_path), exist_ok=True)
|
501 |
+
plt.savefig(save_path, bbox_inches='tight', dpi=300)
|
502 |
+
plt.close()
|
503 |
+
else:
|
504 |
+
plt.show()
|
505 |
+
|
506 |
+
def modalities_to_pil(self, mod_dict, use_fixed_plotting_order=False, resize=None):
|
507 |
+
if use_fixed_plotting_order:
|
508 |
+
mod_dict = {
|
509 |
+
k: mod_dict[k] for k in MODALITY_PLOTTING_ORDER
|
510 |
+
if k in mod_dict
|
511 |
+
}
|
512 |
+
|
513 |
+
plotted_modalities = []
|
514 |
+
|
515 |
+
for i, (mod_name, mod) in enumerate(mod_dict.items()):
|
516 |
+
if mod_name == 'det':
|
517 |
+
# Attempt to get the first available value from mod_dict according to the priority
|
518 |
+
keys_in_order = ['rgb@448', 'rgb@224', 'tok_rgb@448', 'tok_rgb@224']
|
519 |
+
rgb_background = next((mod_dict[key] for key in keys_in_order if key in mod_dict), np.ones((224, 224, 3)))
|
520 |
+
rgb_background = (255 * rgb_background).astype(np.uint8)
|
521 |
+
img_pil = Image.fromarray(visualize_bboxes(rgb_background, mod[0],).astype(np.uint8))
|
522 |
+
elif mod_name == 'caption':
|
523 |
+
img_pil = text_to_pil_image(mod[0][:512], wrap_width=40, fontsize=14)
|
524 |
+
elif mod_name == 'metadata':
|
525 |
+
metadata_pred = ',\n'.join([f'{k}: {v:.2f}' if isinstance(v, float) else f'{k}: {v}' for k, v in mod.items()])
|
526 |
+
img_pil = text_to_pil_image(metadata_pred, wrap_width=36, fontsize=13)
|
527 |
+
else:
|
528 |
+
img_pil = Image.fromarray((255*mod).astype(np.uint8))
|
529 |
+
|
530 |
+
if resize is not None:
|
531 |
+
if mod_name in ['tok_clip@224', 'tok_dinov2@224', 'tok_imagebind@224', 'tok_clip@448']:
|
532 |
+
resample_mode = Image.Resampling.NEAREST
|
533 |
+
else:
|
534 |
+
resample_mode = Image.Resampling.BILINEAR
|
535 |
+
img_pil = img_pil.resize((resize, resize), resample=resample_mode)
|
536 |
+
|
537 |
+
plot_name = MODALITY_PLOTTING_NAME_MAP.get(mod_name, mod_name)
|
538 |
+
plotted_modalities.append((img_pil, plot_name))
|
539 |
+
|
540 |
+
return plotted_modalities
|
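A minimal usage sketch for the sampler implemented above. It assumes an instance `sampler` of the class defined earlier in demo_4M_sampler.py has already been constructed; the dummy RGB tensor and its preprocessing are illustrative assumptions, not taken from this diff.

import torch

# `sampler` is assumed to be an already-constructed instance of the demo sampler
# class whose methods (forward, plot_modalities, modalities_to_pil) are added above.
rgb = torch.randn(1, 3, 224, 224)   # placeholder for a preprocessed 224x224 RGB image
sample = {'rgb@224': rgb}           # condition on pixel RGB only

# forward() picks the remaining modalities as targets, generates them at 224x224,
# optionally super-resolves to 448x448, and returns a decoded modality dict.
dec_dict = sampler(sample, seed=42, top_p=0.8, top_k=0.0, perform_sr=True)

# Visualize the generated modalities, or convert them to PIL images.
sampler.plot_modalities(dec_dict, ncols_max=5)
pil_images = sampler.modalities_to_pil(dec_dict, resize=224)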
fourm/models/__init__.py
ADDED
File without changes
|
fourm/models/decoder_embeddings.py
ADDED
@@ -0,0 +1,268 @@
|
1 |
+
# Copyright 2024 EPFL and Apple Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
from functools import partial
|
15 |
+
from typing import Dict, List, Optional, Tuple, Union
|
16 |
+
|
17 |
+
import torch
|
18 |
+
import torch.nn as nn
|
19 |
+
from einops import repeat
|
20 |
+
|
21 |
+
from .fm_utils import build_1d_sincos_posemb, build_2d_sincos_posemb, pair
|
22 |
+
|
23 |
+
|
24 |
+
class SequenceDecoderEmbedding(nn.Module):
|
25 |
+
"""Embedding module for sequence inputs, like captions or a sequence of objects.
|
26 |
+
|
27 |
+
Args:
|
28 |
+
vocab_size: Vocabulary size
|
29 |
+
max_length: Maximum number of tokens in the sequence
|
30 |
+
dim_tokens: Dimension of output tokens. Can be set using init method.
|
31 |
+
sincos_pos_emb: Set to True (default) to use fixed 1D sin-cos positional embeddings
|
32 |
+
padding_idx: Padding index for word embedding
|
33 |
+
share_embedding: Set to True to share input and output embedding weights
|
34 |
+
"""
|
35 |
+
def __init__(self,
|
36 |
+
vocab_size: int,
|
37 |
+
max_length: int,
|
38 |
+
dim_tokens: Optional[int] = None,
|
39 |
+
sincos_pos_emb: bool = True,
|
40 |
+
max_sincos_pos_emb: int = 512,
|
41 |
+
padding_idx: int = 0,
|
42 |
+
share_embedding: bool = True,
|
43 |
+
**kwargs):
|
44 |
+
super().__init__()
|
45 |
+
self.vocab_size = vocab_size
|
46 |
+
self.max_length = max_length
|
47 |
+
self.dim_tokens = dim_tokens
|
48 |
+
self.sincos_pos_emb = sincos_pos_emb
|
49 |
+
self.padding_idx = padding_idx
|
50 |
+
self.max_sincos_pos_emb = max_sincos_pos_emb
|
51 |
+
self.share_embedding = share_embedding
|
52 |
+
|
53 |
+
if self.dim_tokens is not None:
|
54 |
+
self.init(dim_tokens=dim_tokens)
|
55 |
+
|
56 |
+
def init(self, dim_tokens: int = 768, init_std=0.02):
|
57 |
+
"""
|
58 |
+
Initialize parts of embedding module that are dependent on dimension of tokens.
|
59 |
+
Should be called when setting up FourM.
|
60 |
+
|
61 |
+
Args:
|
62 |
+
dim_tokens: Dimension of tokens
|
63 |
+
init_std: Standard deviation of init
|
64 |
+
"""
|
65 |
+
self.dim_tokens = dim_tokens
|
66 |
+
|
67 |
+
# Task embedding identifying from which task a given token comes from
|
68 |
+
# Fixed-size positional embeddings. Can be interpolated to different input sizes
|
69 |
+
|
70 |
+
if self.sincos_pos_emb:
|
71 |
+
if self.max_length > self.max_sincos_pos_emb:
|
72 |
+
raise ValueError(f"Max length ({self.max_length}) is greater than the number of posembs ({self.max_sincos_pos_emb}")
|
73 |
+
# Get all posembs, then truncate up to max length
|
74 |
+
pos_emb = build_1d_sincos_posemb(max_len=self.max_sincos_pos_emb, embed_dim=self.dim_tokens)[:self.max_length]
|
75 |
+
self.register_buffer("pos_emb", pos_emb)
|
76 |
+
else:
|
77 |
+
self.pos_emb = nn.Parameter(torch.zeros(1, self.max_length, self.dim_tokens))
|
78 |
+
nn.init.normal_(self.pos_emb, std=init_std)
|
79 |
+
|
80 |
+
self.mod_emb = nn.Parameter(torch.zeros(1, 1, self.dim_tokens))
|
81 |
+
nn.init.normal_(self.mod_emb, std=init_std)
|
82 |
+
|
83 |
+
# Token embedding
|
84 |
+
self.token_emb = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.dim_tokens, padding_idx=self.padding_idx)
|
85 |
+
|
86 |
+
# Output projection layer
|
87 |
+
self.to_logits = nn.Linear(self.dim_tokens, self.vocab_size, bias=False)
|
88 |
+
|
89 |
+
if self.share_embedding:
|
90 |
+
# Share input and output embedding weights
|
91 |
+
self.to_logits.weight = self.token_emb.weight
|
92 |
+
|
93 |
+
|
94 |
+
@torch.jit.ignore
|
95 |
+
def no_weight_decay(self):
|
96 |
+
return set()
|
97 |
+
|
98 |
+
def forward_embed(self, d: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
|
99 |
+
"""
|
100 |
+
Forward pass through embedding module, transforming sequence of ids to sequence of embeddings.
|
101 |
+
Creates corresponding modality and positional embeddings and adds them to the dict.
|
102 |
+
|
103 |
+
Args:
|
104 |
+
d (Dict[str, torch.Tensor]): Modality dict, with at least the following keys:
|
105 |
+
- 'tensor' (torch.Tensor): Token sequence for each batch. Shape (B, L) where B is the batch size and L is the sequence length.
|
106 |
+
- 'target_mask' (torch.Tensor): Mask for valid tokens in the target sequence (set to 0 for valid tokens and 1 otherwise). Shape (B, L).
|
107 |
+
|
108 |
+
Returns:
|
109 |
+
Dict[str, torch.Tensor]: Modality dict with added keys:
|
110 |
+
- 'x' (torch.Tensor): Embedded token sequence. Shape (B, L, D) where D is the embedding dimension.
|
111 |
+
- 'emb' (torch.Tensor): Sum of positional and modality embeddings for the target sequence. Shape (B, L, D).
|
112 |
+
- 'ids' (torch.Tensor): Original token sequence from input dict. Shape (B, L).
|
113 |
+
"""
|
114 |
+
ids = d['tensor']
|
115 |
+
B = ids.shape[0]
|
116 |
+
assert self.dim_tokens is not None, 'Need to call init(dim_tokens) function first'
|
117 |
+
|
118 |
+
# Map to embedding
|
119 |
+
x = self.token_emb(ids)
|
120 |
+
|
121 |
+
expanded_pos_emb = repeat(self.pos_emb, "() n d -> b n d", b=B)
|
122 |
+
|
123 |
+
# Target pos encoding
|
124 |
+
target_mask = d['target_mask']
|
125 |
+
target_pos_id = (~target_mask).int().cumsum(dim=1) - 1
|
126 |
+
target_pos_id[target_mask] = 0
|
127 |
+
# Sometimes the target sequence exceeds the max length; it will be truncated in the decoder
|
128 |
+
target_pos_id[target_pos_id >= self.max_length] = 0
|
129 |
+
target_pos_emb = torch.gather(expanded_pos_emb, dim=1, index=repeat(target_pos_id, "b n -> b n d", d=expanded_pos_emb.shape[2]))
|
130 |
+
target_pos_emb[target_mask] = 0
|
131 |
+
|
132 |
+
x_emb = target_pos_emb + self.mod_emb
|
133 |
+
|
134 |
+
|
135 |
+
d['x'] = x
|
136 |
+
d['emb'] = x_emb
|
137 |
+
d['ids'] = d['tensor']
|
138 |
+
|
139 |
+
return d
|
140 |
+
|
141 |
+
def forward_logits(self, x: torch.Tensor) -> torch.Tensor:
|
142 |
+
"""
|
143 |
+
Forward pass through output projection layer, transforming sequence of embeddings to logits.
|
144 |
+
|
145 |
+
Args:
|
146 |
+
x (torch.Tensor): Output tokens from the decoder. Shape (B, M, D)
|
147 |
+
|
148 |
+
Returns:
|
149 |
+
torch.Tensor: Logits for each token in the sequence. Shape (B, M, V)
|
150 |
+
"""
|
151 |
+
logits = self.to_logits(x)
|
152 |
+
return logits
|
153 |
+
|
154 |
+
|
155 |
+
|
156 |
+
class ImageTokenDecoderEmbedding(nn.Module):
|
157 |
+
"""Embedding module for tokenized spatial inputs.
|
158 |
+
|
159 |
+
Args:
|
160 |
+
vocab_size: Vocabulary size
|
161 |
+
patch_size: Int or tuple of the patch size over the full image size.
|
162 |
+
dim_tokens: Dimension of output tokens. Can be set using init method.
|
163 |
+
sincos_pos_emb: Set to True (default) to use fixed 2D sin-cos positional embeddings
|
164 |
+
image_size: Default image size. Used to initialize size of positional embeddings.
|
165 |
+
share_embedding: Set to True to share input and output embedding weights
|
166 |
+
"""
|
167 |
+
def __init__(self,
|
168 |
+
vocab_size: int,
|
169 |
+
patch_size: Union[int, Tuple[int,int]] = 16,
|
170 |
+
dim_tokens: Optional[int] = None,
|
171 |
+
sincos_pos_emb: bool = True,
|
172 |
+
image_size: Union[int, Tuple[int]] = 224,
|
173 |
+
share_embedding: bool = True,
|
174 |
+
**kwargs):
|
175 |
+
super().__init__()
|
176 |
+
self.vocab_size = vocab_size
|
177 |
+
self.patch_size = pair(patch_size)
|
178 |
+
self.dim_tokens = dim_tokens
|
179 |
+
self.sincos_pos_emb = sincos_pos_emb
|
180 |
+
self.image_size = pair(image_size)
|
181 |
+
self.num_patches = (self.image_size[0] // self.patch_size[0]) * (self.image_size[1] // self.patch_size[1])
|
182 |
+
self.share_embedding = share_embedding
|
183 |
+
|
184 |
+
if self.dim_tokens is not None:
|
185 |
+
self.init(dim_tokens=dim_tokens)
|
186 |
+
|
187 |
+
def init(self, dim_tokens: int = 768, init_std=0.02):
|
188 |
+
"""
|
189 |
+
Initialize parts of module that are dependent on dimension of tokens.
|
190 |
+
Should be called when setting up FourM.
|
191 |
+
|
192 |
+
Args:
|
193 |
+
dim_tokens: Dimension of tokens
|
194 |
+
init_std: Standard deviation of init
|
195 |
+
"""
|
196 |
+
self.dim_tokens = dim_tokens
|
197 |
+
|
198 |
+
# Task embedding identifying from which task a given token comes from
|
199 |
+
# Fixed-size positional embeddings. Can be interpolated to different input sizes
|
200 |
+
h_posemb = self.image_size[0] // self.patch_size[0]
|
201 |
+
w_posemb = self.image_size[1] // self.patch_size[1]
|
202 |
+
if self.sincos_pos_emb:
|
203 |
+
pos_emb = build_2d_sincos_posemb(h=h_posemb, w=w_posemb, embed_dim=self.dim_tokens)
|
204 |
+
self.register_buffer("pos_emb", pos_emb)
|
205 |
+
else:
|
206 |
+
self.pos_emb = nn.Parameter(torch.zeros(1, (h_posemb * w_posemb), self.dim_tokens))
|
207 |
+
nn.init.normal_(self.pos_emb, std=init_std)
|
208 |
+
|
209 |
+
self.mod_emb = nn.Parameter(torch.zeros(1, 1, self.dim_tokens))
|
210 |
+
nn.init.normal_(self.mod_emb, std=init_std)
|
211 |
+
|
212 |
+
# Token embedding (not needed if only masked tokens are given as input, but can be useful to train Token Critic)
|
213 |
+
self.token_emb = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.dim_tokens)
|
214 |
+
|
215 |
+
# Output projection layer
|
216 |
+
self.to_logits = nn.Linear(self.dim_tokens, self.vocab_size, bias=False)
|
217 |
+
|
218 |
+
if self.share_embedding:
|
219 |
+
# Share input and output embedding weights
|
220 |
+
self.to_logits.weight = self.token_emb.weight
|
221 |
+
|
222 |
+
@torch.jit.ignore
|
223 |
+
def no_weight_decay(self):
|
224 |
+
return set()
|
225 |
+
|
226 |
+
def forward_embed(self, d: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
|
227 |
+
"""
|
228 |
+
Forward pass through the embedding module, transforming tokenized spatial inputs to embeddings.
|
229 |
+
Creates corresponding modality and positional embeddings and adds them to the dict.
|
230 |
+
|
231 |
+
Args:
|
232 |
+
d (Dict[str, torch.Tensor]): Modality dict, with at least the following key:
|
233 |
+
- 'tensor' (torch.Tensor): Modality tokens for each batch (e.g. from tokenized images). Shape (B, H, W) where B is the batch size, H and W are height and width after tokenization.
|
234 |
+
|
235 |
+
|
236 |
+
Returns:
|
237 |
+
Dict[str, torch.Tensor]: Modality dict with added keys:
|
238 |
+
- 'x' (torch.Tensor): Embedded token sequence, which is replaced by mask tokens in the 4M decoder. Shape (B, H*W, D) where D is the embedding dimension.
|
239 |
+
- 'emb' (torch.Tensor): Sum of positional and modality embeddings for the token sequence. Shape (B, H*W, D).
|
240 |
+
- 'ids' (torch.Tensor): Reshaped token sequence from input dict, flattened in the spatial dimensions. Shape (B, H*W).
|
241 |
+
"""
|
242 |
+
ids = d['tensor']
|
243 |
+
B = ids.shape[0]
|
244 |
+
ids = ids.reshape(B, -1)
|
245 |
+
|
246 |
+
# Map to embedding
|
247 |
+
x = self.token_emb(ids)
|
248 |
+
|
249 |
+
# Create positional embedding + modality embedding
|
250 |
+
x_emb = repeat(self.pos_emb + self.mod_emb, '() n d -> b n d', b=B)
|
251 |
+
|
252 |
+
d['x'] = x
|
253 |
+
d['emb'] = x_emb
|
254 |
+
d['ids'] = ids
|
255 |
+
return d
|
256 |
+
|
257 |
+
def forward_logits(self, x: torch.Tensor) -> torch.Tensor:
|
258 |
+
"""
|
259 |
+
Forward pass through output projection layer, transforming sequence of embeddings to logits.
|
260 |
+
|
261 |
+
Args:
|
262 |
+
x (torch.Tensor): Output tokens from the decoder. Shape (B, M, D)
|
263 |
+
|
264 |
+
Returns:
|
265 |
+
torch.Tensor: Logits for each token in the sequence. Shape (B, M, V)
|
266 |
+
"""
|
267 |
+
logits = self.to_logits(x)
|
268 |
+
return logits
|
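A short sketch of how the decoder embedding module above might be exercised in isolation; the vocabulary size, token grid, and embedding dimension below are toy values chosen for illustration, not 4M defaults.

import torch
from fourm.models.decoder_embeddings import ImageTokenDecoderEmbedding

# 14x14 grid of image tokens (224 / 16 per side) with a toy vocabulary of 1024 codes.
emb = ImageTokenDecoderEmbedding(vocab_size=1024, patch_size=16, image_size=224, dim_tokens=768)

tokens = torch.randint(0, 1024, (2, 14, 14))         # (B, H, W) token ids from a tokenizer
d = emb.forward_embed({'tensor': tokens})
print(d['x'].shape, d['emb'].shape, d['ids'].shape)  # (2, 196, 768) (2, 196, 768) (2, 196)

# forward_logits projects decoder-dimensional embeddings back to vocabulary logits;
# it is applied to d['x'] here only to illustrate the shapes.
logits = emb.forward_logits(d['x'])                  # (2, 196, 1024)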
fourm/models/encoder_embeddings.py
ADDED
@@ -0,0 +1,422 @@
|
1 |
+
# Copyright 2024 EPFL and Apple Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
from typing import Dict, List, Optional, Tuple, Union
|
15 |
+
|
16 |
+
import torch
|
17 |
+
import torch.nn as nn
|
18 |
+
from einops import rearrange, repeat
|
19 |
+
|
20 |
+
from .fm_utils import build_1d_sincos_posemb, build_2d_sincos_posemb, pair
|
21 |
+
|
22 |
+
class SequenceEncoderEmbedding(nn.Module):
|
23 |
+
"""Embedding module for encoding sequence inputs, like captions or a sequence of objects.
|
24 |
+
|
25 |
+
Args:
|
26 |
+
vocab_size: Vocabulary size
|
27 |
+
max_length: Maximum number of tokens in the sequence
|
28 |
+
dim_tokens: Dimension of output tokens. Can be set using init method.
|
29 |
+
sincos_pos_emb: Set to True (default) to use fixed 1D sin-cos positional embeddings
|
30 |
+
max_sincos_pos_emb: Maximum allowed length for sin-cos positional embeddings
|
31 |
+
padding_idx: Padding index for word embedding
|
32 |
+
"""
|
33 |
+
|
34 |
+
def __init__(self,
|
35 |
+
vocab_size: int,
|
36 |
+
max_length: int,
|
37 |
+
dim_tokens: Optional[int] = None,
|
38 |
+
sincos_pos_emb: bool = True,
|
39 |
+
max_sincos_pos_emb: int = 512,
|
40 |
+
padding_idx: int = 0,
|
41 |
+
):
|
42 |
+
super().__init__()
|
43 |
+
self.vocab_size = vocab_size
|
44 |
+
self.max_length = max_length
|
45 |
+
self.dim_tokens = dim_tokens
|
46 |
+
self.sincos_pos_emb = sincos_pos_emb
|
47 |
+
self.padding_idx = padding_idx
|
48 |
+
self.max_sincos_pos_emb = max_sincos_pos_emb
|
49 |
+
|
50 |
+
if self.dim_tokens is not None:
|
51 |
+
self.init(dim_tokens=dim_tokens)
|
52 |
+
|
53 |
+
def init(self, dim_tokens: int = 768, init_std=0.02):
|
54 |
+
"""
|
55 |
+
Initialize parts of embedding module that are dependent on dimension of tokens.
|
56 |
+
Should be called when setting up FourM.
|
57 |
+
|
58 |
+
Args:
|
59 |
+
dim_tokens: Dimension of tokens
|
60 |
+
init_std: Standard deviation of init
|
61 |
+
"""
|
62 |
+
self.dim_tokens = dim_tokens
|
63 |
+
|
64 |
+
# Task embedding identifying from which task a given token comes from
|
65 |
+
# Fixed-size positional embeddings. Can be interpolated to different input sizes
|
66 |
+
if self.sincos_pos_emb:
|
67 |
+
if self.max_length > self.max_sincos_pos_emb:
|
68 |
+
raise ValueError(f"Max length ({self.max_length}) is greater than the number of posembs ({self.max_sincos_pos_emb}")
|
69 |
+
pos_emb = build_1d_sincos_posemb(max_len=self.max_sincos_pos_emb, embed_dim=self.dim_tokens)[:self.max_length]
|
70 |
+
self.register_buffer("pos_emb", pos_emb) # self.pos_emb is now a buffer for FSDP
|
71 |
+
else:
|
72 |
+
self.pos_emb = nn.Parameter(torch.zeros(1, self.max_length, self.dim_tokens))
|
73 |
+
nn.init.normal_(self.pos_emb, std=init_std)
|
74 |
+
|
75 |
+
self.mod_emb = nn.Parameter(torch.zeros(1, 1, self.dim_tokens))
|
76 |
+
nn.init.normal_(self.mod_emb, std=init_std)
|
77 |
+
|
78 |
+
# Token embedding
|
79 |
+
self.token_emb = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.dim_tokens,
|
80 |
+
padding_idx=self.padding_idx)
|
81 |
+
|
82 |
+
|
83 |
+
@torch.jit.ignore
|
84 |
+
def no_weight_decay(self):
|
85 |
+
return set()
|
86 |
+
|
87 |
+
def forward(self, d : Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
|
88 |
+
"""
|
89 |
+
Forward pass through embedding module, transforming sequence of ids to sequence of embeddings.
|
90 |
+
Creates corresponding modality and positional embeddings and adds them to the dict.
|
91 |
+
|
92 |
+
Args:
|
93 |
+
d (Dict[str, torch.Tensor]): Modality dict with at least the following keys:
|
94 |
+
- 'tensor' (torch.Tensor): Input token sequence for each batch. Shape (B, L) where B is the batch size and L is the sequence length.
|
95 |
+
- 'input_mask' (torch.Tensor): Mask for valid tokens in the input sequence (set to 0 for valid tokens and 1 otherwise). Shape (B, L).
|
96 |
+
|
97 |
+
Returns:
|
98 |
+
Dict[str, torch.Tensor]: Modality dict with added keys:
|
99 |
+
- 'x' (torch.Tensor): Embedded token sequence. Shape (B, L, D) where D is the embedding dimension.
|
100 |
+
- 'emb' (torch.Tensor): Sum of positional and modality embeddings for the input sequence. Shape (B, L, D).
|
101 |
+
"""
|
102 |
+
ids = d['tensor']
|
103 |
+
B = ids.shape[0]
|
104 |
+
assert self.dim_tokens is not None, 'Need to call init(dim_tokens) function first'
|
105 |
+
|
106 |
+
# Map to embedding
|
107 |
+
x = self.token_emb(ids)
|
108 |
+
|
109 |
+
expanded_pos_emb = repeat(self.pos_emb, "() n d -> b n d", b=B)
|
110 |
+
# Input pos encoding
|
111 |
+
input_mask = d['input_mask']
|
112 |
+
input_pos_id = (~input_mask).int().cumsum(dim=1) - 1
|
113 |
+
input_pos_id[input_mask] = 0
|
114 |
+
input_pos_emb = torch.gather(expanded_pos_emb, dim=1, index=repeat(input_pos_id, "b n -> b n d", d=expanded_pos_emb.shape[2]))
|
115 |
+
input_pos_emb[input_mask] = 0
|
116 |
+
|
117 |
+
x_emb = input_pos_emb + self.mod_emb
|
118 |
+
|
119 |
+
d['x'] = x
|
120 |
+
d['emb'] = x_emb
|
121 |
+
return d
|
122 |
+
|
123 |
+
class ImageTokenEncoderEmbedding(nn.Module):
|
124 |
+
"""Embedding module for tokenized spatial inputs.
|
125 |
+
|
126 |
+
Args:
|
127 |
+
vocab_size: Vocabulary size
|
128 |
+
patch_size: Int or tuple of the patch size over the full image size.
|
129 |
+
dim_tokens: Dimension of output tokens. Can be set using init method.
|
130 |
+
sincos_pos_emb: Set to True (default) to use fixed 2D sin-cos positional embeddings
|
131 |
+
image_size: Default image size. Used to initialize size of positional embeddings.
|
132 |
+
"""
|
133 |
+
def __init__(self,
|
134 |
+
vocab_size: int,
|
135 |
+
patch_size: Union[int, Tuple[int,int]] = 16,
|
136 |
+
dim_tokens: Optional[int] = None,
|
137 |
+
sincos_pos_emb: bool = True,
|
138 |
+
image_size: Union[int, Tuple[int]] = 224,
|
139 |
+
**kwargs):
|
140 |
+
|
141 |
+
super().__init__()
|
142 |
+
self.vocab_size = vocab_size
|
143 |
+
self.patch_size = pair(patch_size)
|
144 |
+
self.dim_tokens = dim_tokens
|
145 |
+
self.sincos_pos_emb = sincos_pos_emb
|
146 |
+
self.image_size = pair(image_size)
|
147 |
+
self.num_patches = (self.image_size[0] // self.patch_size[0]) * (self.image_size[1] // self.patch_size[1])
|
148 |
+
|
149 |
+
if self.dim_tokens is not None:
|
150 |
+
self.init(dim_tokens=dim_tokens)
|
151 |
+
|
152 |
+
def init(self, dim_tokens: int = 768, init_std=0.02):
|
153 |
+
"""
|
154 |
+
Initialize parts of module that are dependent on dimension of tokens.
|
155 |
+
Should be called when setting up FourM.
|
156 |
+
|
157 |
+
Args:
|
158 |
+
dim_tokens: Dimension of tokens
|
159 |
+
init_std: Standard deviation of init
|
160 |
+
"""
|
161 |
+
self.dim_tokens = dim_tokens
|
162 |
+
|
163 |
+
# Task embedding identifying from which task a given token comes from
|
164 |
+
# Fixed-size positional embeddings. Can be interpolated to different input sizes
|
165 |
+
h_posemb = self.image_size[0] // self.patch_size[0]
|
166 |
+
w_posemb = self.image_size[1] // self.patch_size[1]
|
167 |
+
if self.sincos_pos_emb:
|
168 |
+
pos_emb = build_2d_sincos_posemb(h=h_posemb, w=w_posemb, embed_dim=self.dim_tokens)
|
169 |
+
self.register_buffer("pos_emb", pos_emb) # self.pos_emb is now a buffer for FSDP
|
170 |
+
else:
|
171 |
+
self.pos_emb = nn.Parameter(torch.zeros(1, (h_posemb * w_posemb), self.dim_tokens))
|
172 |
+
nn.init.normal_(self.pos_emb, std=init_std)
|
173 |
+
|
174 |
+
self.mod_emb = nn.Parameter(torch.zeros(1, 1, self.dim_tokens))
|
175 |
+
nn.init.normal_(self.mod_emb, std=init_std)
|
176 |
+
|
177 |
+
# Token embedding
|
178 |
+
self.token_emb = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.dim_tokens)
|
179 |
+
|
180 |
+
@torch.jit.ignore
|
181 |
+
def no_weight_decay(self):
|
182 |
+
return set()
|
183 |
+
|
184 |
+
def forward(self, d: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
|
185 |
+
"""
|
186 |
+
Forward pass through embedding module, transforming image tokens to a sequence of embeddings.
|
187 |
+
Creates corresponding modality and positional embeddings and adds them to the dict.
|
188 |
+
|
189 |
+
Args:
|
190 |
+
d (Dict[str, torch.Tensor]): Modality dict with at least the following key:
|
191 |
+
- 'tensor' (torch.Tensor): Input image tokens for each batch. Shape (B, H, W) where B is the batch size, and H, W are height and width of the tokenized image. - 'input_mask' (torch.Tensor): Mask for valid tokens in the input sequence (set to 0 for valid tokens and 1 otherwise). Shape (B, L).
|
192 |
+
|
193 |
+
Returns:
|
194 |
+
Dict[str, torch.Tensor]: Modality dictionary with added keys:
|
195 |
+
- 'x' (torch.Tensor): Embedded token sequence. Shape (B, H*W, D).
|
196 |
+
- 'emb' (torch.Tensor): Sum of positional and modality embeddings for the input sequence. Shape (B, H*W, D).
|
197 |
+
"""
|
198 |
+
ids = d['tensor']
|
199 |
+
B = ids.shape[0]
|
200 |
+
ids = ids.reshape(B, -1)
|
201 |
+
|
202 |
+
# Map to embedding
|
203 |
+
x = self.token_emb(ids)
|
204 |
+
|
205 |
+
# Create positional embedding + modality embedding
|
206 |
+
x_emb = repeat(self.pos_emb + self.mod_emb, '() n d -> b n d', b=B)
|
207 |
+
|
208 |
+
d['x'] = x
|
209 |
+
d['emb'] = x_emb
|
210 |
+
|
211 |
+
return d
|
212 |
+
|
213 |
+
|
214 |
+
class ImageEncoderEmbedding(nn.Module):
|
215 |
+
"""Embedding module for spatial inputs, like images or feature maps.
|
216 |
+
Creates tokens from patches over the image.
|
217 |
+
|
218 |
+
This adapter / embedding differs from the one of MultiMAE by taking as input a dict and
|
219 |
+
separating positional embeddings and modality embeddings from the input projection.
|
220 |
+
Input projection is 'x', posemb + modemb is 'emb'
|
221 |
+
|
222 |
+
Args:
|
223 |
+
num_channels: Number of input channels of the image/feature map
|
224 |
+
patch_size: Int or tuple of the patch size over the full image size.
|
225 |
+
dim_tokens: Dimension of output tokens. Can be set using init method.
|
226 |
+
sincos_pos_emb: Set to True (default) to use fixed 2D sin-cos positional embeddings
|
227 |
+
image_size: Default image size. Used to initialize size of positional embeddings.
|
228 |
+
"""
|
229 |
+
def __init__(self,
|
230 |
+
num_channels: int,
|
231 |
+
patch_size: Union[int, Tuple[int,int]],
|
232 |
+
dim_tokens: Optional[int] = None,
|
233 |
+
sincos_pos_emb: bool = True,
|
234 |
+
image_size: Union[int, Tuple[int]] = 224):
|
235 |
+
|
236 |
+
super().__init__()
|
237 |
+
self.num_channels = num_channels
|
238 |
+
self.patch_size = pair(patch_size)
|
239 |
+
self.dim_tokens = dim_tokens
|
240 |
+
self.sincos_pos_emb = sincos_pos_emb
|
241 |
+
self.image_size = pair(image_size)
|
242 |
+
self.num_patches = (self.image_size[0] // self.patch_size[0]) * (self.image_size[1] // self.patch_size[1])
|
243 |
+
|
244 |
+
if self.dim_tokens is not None:
|
245 |
+
self.init(dim_tokens=dim_tokens)
|
246 |
+
|
247 |
+
def init(self, dim_tokens: int = 768, init_std=0.02):
|
248 |
+
"""
|
249 |
+
Initialize parts of encoder that are dependent on dimension of tokens.
|
250 |
+
Should be called when setting up FourM.
|
251 |
+
|
252 |
+
Args:
|
253 |
+
dim_tokens: Dimension of tokens
|
254 |
+
init_std: Standard deviation of init
|
255 |
+
"""
|
256 |
+
self.dim_tokens = dim_tokens
|
257 |
+
|
258 |
+
# Task embedding identifying from which task a given token comes from
|
259 |
+
# Fixed-size positional embeddings. Can be interpolated to different input sizes
|
260 |
+
h_posemb = self.image_size[0] // self.patch_size[0]
|
261 |
+
w_posemb = self.image_size[1] // self.patch_size[1]
|
262 |
+
if self.sincos_pos_emb:
|
263 |
+
pos_emb = build_2d_sincos_posemb(h=h_posemb, w=w_posemb, embed_dim=self.dim_tokens)
|
264 |
+
self.register_buffer("pos_emb", pos_emb) # self.pos_emb is now a buffer for FSDP
|
265 |
+
else:
|
266 |
+
self.pos_emb = nn.Parameter(torch.zeros(1, (h_posemb * w_posemb), self.dim_tokens))
|
267 |
+
nn.init.normal_(self.pos_emb, std=init_std)
|
268 |
+
|
269 |
+
self.mod_emb = nn.Parameter(torch.zeros(1, 1, self.dim_tokens))
|
270 |
+
nn.init.normal_(self.mod_emb, std=init_std)
|
271 |
+
|
272 |
+
# Image -> tokens projection
|
273 |
+
# No bias term here, so modality embedding fully comes from self.mod_emb
|
274 |
+
self.proj = nn.Linear(self.num_channels * self.patch_size[0] * self.patch_size[1], self.dim_tokens, bias=False)
|
275 |
+
|
276 |
+
@torch.jit.ignore
|
277 |
+
def no_weight_decay(self):
|
278 |
+
return set()
|
279 |
+
|
280 |
+
def forward(self, d: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
|
281 |
+
"""
|
282 |
+
Forward pass through embedding module, transforming image to sequence of tokens.
|
283 |
+
Creates corresponding modality and positional embeddings and adds them to the dict.
|
284 |
+
|
285 |
+
Args:
|
286 |
+
d (Dict[str, torch.Tensor]): Modality dict with at least the following key:
|
287 |
+
- 'tensor' (torch.Tensor): Input image for each batch. Shape (B, C, H, W) where B is the batch size, C is the number of channels, and H, W are height and width of the image.
|
288 |
+
|
289 |
+
|
290 |
+
Returns:
|
291 |
+
Dict[str, torch.Tensor]: Modality dict with added keys:
|
292 |
+
- 'x' (torch.Tensor): Embedded token sequence. Shape (B, (H / PH) * (W / PW), D), where PH and PW are the patch sizes
|
293 |
+
- 'emb' (torch.Tensor): Sum of positional and modality embeddings for the input sequence. Shape (B, (H / PH) * (W / PW), D)
|
294 |
+
"""
|
295 |
+
x = d['tensor']
|
296 |
+
B, C, H, W = x.shape
|
297 |
+
assert self.dim_tokens is not None, 'Need to call init(dim_tokens) function first'
|
298 |
+
assert (H % self.patch_size[0] == 0) and (W % self.patch_size[1] == 0), f'Image sizes {H}x{W} must be divisible by patch sizes {self.patch_size[0]}x{self.patch_size[1]}'
|
299 |
+
|
300 |
+
# Create patches [B, C, H, W] -> [B, (H*W), C]
|
301 |
+
x_patch = self.proj(rearrange(x, 'b d (nh ph) (nw pw) -> b (nh nw) (ph pw d)', ph=self.patch_size[0], pw=self.patch_size[1]))
|
302 |
+
|
303 |
+
# Create positional embedding + modality embedding
|
304 |
+
x_emb = repeat(self.pos_emb + self.mod_emb, '() n d -> b n d', b=B)
|
305 |
+
|
306 |
+
d['x'] = x_patch
|
307 |
+
d['emb'] = x_emb
|
308 |
+
|
309 |
+
return d
|
310 |
+
|
311 |
+
|
312 |
+
class SequenceEmbEncoderEmbedding(nn.Module):
|
313 |
+
"""Adapter for sequence emb inputs, like T5-XXL, CLIP text embeddings.
|
314 |
+
|
315 |
+
Args:
|
316 |
+
max_length: Maximum number of tokens in the sequence
|
317 |
+
dim_tokens: Dimension of output tokens. Can be set using init method.
|
318 |
+
sincos_pos_emb: Set to True (default) to use fixed 1D sin-cos positional embeddings
|
319 |
+
padding_idx: Padding index for word embedding
|
320 |
+
orig_emb_dim: Dimension of original embeddings
|
321 |
+
bottleneck_dim: Dimension of bottleneck layer
|
322 |
+
use_bottleneck: Set to True to use bottleneck layer
|
323 |
+
"""
|
324 |
+
def __init__(self,
|
325 |
+
max_length: int,
|
326 |
+
dim_tokens: Optional[int] = None,
|
327 |
+
sincos_pos_emb: bool = True,
|
328 |
+
max_sincos_pos_emb: int = 512,
|
329 |
+
padding_idx: int = 0,
|
330 |
+
orig_emb_dim: int = 4096,
|
331 |
+
bottleneck_dim: int = 64,
|
332 |
+
use_bottleneck: bool = False,
|
333 |
+
):
|
334 |
+
super().__init__()
|
335 |
+
self.max_length = max_length
|
336 |
+
self.dim_tokens = dim_tokens
|
337 |
+
self.sincos_pos_emb = sincos_pos_emb
|
338 |
+
self.padding_idx = padding_idx
|
339 |
+
self.max_sincos_pos_emb = max_sincos_pos_emb
|
340 |
+
self.orig_emb_dim = orig_emb_dim
|
341 |
+
self.use_bottleneck = use_bottleneck
|
342 |
+
if self.use_bottleneck:
|
343 |
+
self.bottleneck_dim = bottleneck_dim
|
344 |
+
|
345 |
+
if self.dim_tokens is not None:
|
346 |
+
self.init(dim_tokens=dim_tokens)
|
347 |
+
|
348 |
+
def init(self, dim_tokens: int = 768, init_std=0.02):
|
349 |
+
"""
|
350 |
+
Initialize parts of embedding module that are dependent on dimension of tokens.
|
351 |
+
Should be called when setting up FourM.
|
352 |
+
|
353 |
+
Args:
|
354 |
+
dim_tokens: Dimension of tokens
|
355 |
+
init_std: Standard deviation of init
|
356 |
+
"""
|
357 |
+
self.dim_tokens = dim_tokens
|
358 |
+
|
359 |
+
# Task embedding identifying from which task a given token comes from
|
360 |
+
# Fixed-size positional embeddings. Can be interpolated to different input sizes
|
361 |
+
if self.sincos_pos_emb:
|
362 |
+
if self.max_length > self.max_sincos_pos_emb:
|
363 |
+
raise ValueError(f"Max length ({self.max_length}) is greater than the number of posembs ({self.max_sincos_pos_emb}")
|
364 |
+
pos_emb = build_1d_sincos_posemb(max_len=self.max_sincos_pos_emb, embed_dim=self.dim_tokens)[:self.max_length]
|
365 |
+
self.register_buffer("pos_emb", pos_emb) # self.pos_emb is now a buffer for FSDP
|
366 |
+
else:
|
367 |
+
self.pos_emb = nn.Parameter(torch.zeros(1, self.max_length, self.dim_tokens))
|
368 |
+
nn.init.normal_(self.pos_emb, std=init_std)
|
369 |
+
|
370 |
+
self.mod_emb = nn.Parameter(torch.zeros(1, 1, self.dim_tokens))
|
371 |
+
nn.init.normal_(self.mod_emb, std=init_std)
|
372 |
+
|
373 |
+
# Token embedding projection
|
374 |
+
if self.use_bottleneck:
|
375 |
+
self.emb_proj = nn.Sequential(
|
376 |
+
nn.Linear(self.orig_emb_dim, self.bottleneck_dim),
|
377 |
+
nn.Linear(self.bottleneck_dim, self.dim_tokens),
|
378 |
+
)
|
379 |
+
else:
|
380 |
+
self.emb_proj = nn.Linear(self.orig_emb_dim, self.dim_tokens)
|
381 |
+
|
382 |
+
|
383 |
+
@torch.jit.ignore
|
384 |
+
def no_weight_decay(self):
|
385 |
+
return set()
|
386 |
+
|
387 |
+
def forward(self, d):
|
388 |
+
"""
|
389 |
+
Forward pass through embedding module, projecting original embeddings to the Transformer dimension.
|
390 |
+
Creates corresponding modality and positional embeddings and adds them to the dict.
|
391 |
+
|
392 |
+
Args:
|
393 |
+
d (Dict[str, torch.Tensor]): Modality dict with at least the following keys:
|
394 |
+
- 'tensor' (torch.Tensor): Input token sequence for each batch. Shape (B, L, E) where B is the batch size and L is the sequence length, and E is the dimension of the original embeddings.
|
395 |
+
- 'input_mask' (torch.Tensor): Mask for valid tokens in the input sequence (set to 0 for valid tokens and 1 otherwise). Shape (B, L).
|
396 |
+
|
397 |
+
Returns:
|
398 |
+
Dict[str, torch.Tensor]: Modality dict with added keys:
|
399 |
+
- 'x' (torch.Tensor): Embedded token sequence. Shape (B, L, D) where D is the Transformer embedding dimension.
|
400 |
+
- 'emb' (torch.Tensor): Sum of positional and modality embeddings for the input sequence. Shape (B, L, D).
|
401 |
+
"""
|
402 |
+
orig_emb = d['tensor']
|
403 |
+
B = orig_emb.shape[0]
|
404 |
+
assert self.dim_tokens is not None, 'Need to call init(dim_tokens) function first'
|
405 |
+
|
406 |
+
# Map to embedding
|
407 |
+
x = self.emb_proj(orig_emb)
|
408 |
+
|
409 |
+
expanded_pos_emb = repeat(self.pos_emb, "() n d -> b n d", b=B)
|
410 |
+
# Input pos encoding
|
411 |
+
input_mask = d['input_mask']
|
412 |
+
input_pos_id = (~input_mask).int().cumsum(dim=1) - 1
|
413 |
+
input_pos_id[input_mask] = 0
|
414 |
+
input_pos_emb = torch.gather(expanded_pos_emb, dim=1, index=repeat(input_pos_id, "b n -> b n d", d=expanded_pos_emb.shape[2]))
|
415 |
+
input_pos_emb[input_mask] = 0
|
416 |
+
|
417 |
+
x_emb = input_pos_emb + self.mod_emb
|
418 |
+
|
419 |
+
d['x'] = x
|
420 |
+
d['emb'] = x_emb
|
421 |
+
return d
|
422 |
+
|
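For completeness, a similar sketch for the patch-based encoder embedding above; the sizes are illustrative only (16x16 patches over a 224x224 RGB image yield 196 tokens).

import torch
from fourm.models.encoder_embeddings import ImageEncoderEmbedding

emb = ImageEncoderEmbedding(num_channels=3, patch_size=16, image_size=224, dim_tokens=768)

# 'x' holds the linear patch projection, 'emb' the summed positional + modality embeddings.
d = emb({'tensor': torch.randn(2, 3, 224, 224)})
print(d['x'].shape, d['emb'].shape)   # torch.Size([2, 196, 768]) torch.Size([2, 196, 768])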
fourm/models/fm.py
ADDED
@@ -0,0 +1,1130 @@
|
1 |
+
# Copyright 2024 EPFL and Apple Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
import math
|
15 |
+
import random
|
16 |
+
import copy
|
17 |
+
from functools import partial
|
18 |
+
from typing import Any, Dict, Optional, Tuple, Union
|
19 |
+
|
20 |
+
import torch
|
21 |
+
from einops import rearrange, repeat
|
22 |
+
from torch import nn
|
23 |
+
import torch.nn.functional as F
|
24 |
+
|
25 |
+
from fourm.utils.timm.registry import register_model
|
26 |
+
from huggingface_hub import PyTorchModelHubMixin
|
27 |
+
|
28 |
+
from .fm_utils import Block, DecoderBlock, LayerNorm
|
29 |
+
from fourm.data.modality_info import MODALITY_INFO
|
30 |
+
|
31 |
+
|
32 |
+
# Model definitions
|
33 |
+
__all__ = [
|
34 |
+
# GELU models
|
35 |
+
'fm_tiny_6e_6d_gelu',
|
36 |
+
'fm_small_8e_8d_gelu',
|
37 |
+
'fm_base_12e_12d_gelu',
|
38 |
+
'fm_large_24e_24d_gelu',
|
39 |
+
'fm_xlarge_24e_24d_gelu',
|
40 |
+
# SwiGLU models
|
41 |
+
'fm_tiny_6e_6d_swiglu_nobias',
|
42 |
+
'fm_small_8e_8d_swiglu_nobias',
|
43 |
+
'fm_base_12e_12d_swiglu_nobias',
|
44 |
+
'fm_large_24e_24d_swiglu_nobias',
|
45 |
+
'fm_xlarge_24e_24d_swiglu_nobias',
|
46 |
+
# SwiGLU + QKNorm models
|
47 |
+
'fm_base_12e_12d_swiglu_qknorm_nobias',
|
48 |
+
'fm_large_24e_24d_swiglu_qknorm_nobias',
|
49 |
+
'fm_xlarge_24e_24d_swiglu_qknorm_nobias',
|
50 |
+
]
|
51 |
+
|
52 |
+
|
53 |
+
|
54 |
+
class FourM(nn.Module):
|
55 |
+
"""4M model.
|
56 |
+
|
57 |
+
Args:
|
58 |
+
encoder_embeddings: Dict of encoder embedding modules.
|
59 |
+
decoder_embeddings: Dict of decoder embedding modules.
|
60 |
+
modality_info: Dict containing modality information.
|
61 |
+
dim: Embedding dimension.
|
62 |
+
encoder_depth: Number of encoder blocks.
|
63 |
+
decoder_depth: Number of decoder blocks.
|
64 |
+
num_heads: Number of attention heads.
|
65 |
+
mlp_ratio: Ratio of mlp hidden dim to embedding dim.
|
66 |
+
qkv_bias: If True, add a learnable bias to query, key, value projections.
|
67 |
+
proj_bias: If True, add a learnable bias to the last projection of the attention block.
|
68 |
+
mlp_bias: If True, add a learnable bias to linear layers in the MLP / feed-forward.
|
69 |
+
drop_path_rate_encoder: Stochastic depth rate for encoder.
|
70 |
+
drop_path_rate_decoder: Stochastic depth rate for decoder.
|
71 |
+
shared_drop_path: If True, shares drop path between encoder and decoder.
|
72 |
+
act_layer: Activation layer to be used.
|
73 |
+
norm_layer: Normalization layer to be used.
|
74 |
+
gated_mlp: If True, make the feedforward gated (e.g., SwiGLU).
|
75 |
+
qk_norm: If True, applies normalization to queries and keys (QKNorm).
|
76 |
+
decoder_causal_mask: If True, decoder will use a causal mask for all tokens.
|
77 |
+
decoder_sep_mask: If True, decoder attention is restricted to within each modality only.
|
78 |
+
num_register_tokens: Number of register tokens.
|
79 |
+
use_act_checkpoint: If True, use activation checkpoint for each block.
|
80 |
+
"""
|
81 |
+
def __init__(self,
|
82 |
+
encoder_embeddings: Dict[str, nn.Module],
|
83 |
+
decoder_embeddings: Dict[str, nn.Module],
|
84 |
+
modality_info: Dict[str, Any],
|
85 |
+
dim: int = 768,
|
86 |
+
encoder_depth: int = 12,
|
87 |
+
decoder_depth: int = 12,
|
88 |
+
num_heads: int = 12,
|
89 |
+
mlp_ratio: float = 4.0,
|
90 |
+
qkv_bias: bool = True,
|
91 |
+
proj_bias: bool = True,
|
92 |
+
mlp_bias: bool = True,
|
93 |
+
drop_path_rate_encoder: float = 0.0,
|
94 |
+
drop_path_rate_decoder: float = 0.0,
|
95 |
+
shared_drop_path: bool = False,
|
96 |
+
act_layer: nn.Module = nn.GELU,
|
97 |
+
norm_layer: Union[partial, nn.Module] = partial(LayerNorm, eps=1e-6),
|
98 |
+
gated_mlp: bool = False, # Make the feedforward gated for e.g. SwiGLU
|
99 |
+
qk_norm: bool = False,
|
100 |
+
decoder_causal_mask: bool = False,
|
101 |
+
decoder_sep_mask: bool = True,
|
102 |
+
num_register_tokens: int = 0,
|
103 |
+
use_act_checkpoint: bool = False,
|
104 |
+
share_modality_embeddings: bool = True,
|
105 |
+
):
|
106 |
+
super().__init__()
|
107 |
+
|
108 |
+
self.modality_info = modality_info
|
109 |
+
self.dim = dim
|
110 |
+
self.decoder_causal_mask = decoder_causal_mask
|
111 |
+
self.decoder_sep_mask = decoder_sep_mask
|
112 |
+
self.init_std = 0.02
|
113 |
+
self.use_act_checkpoint = use_act_checkpoint
|
114 |
+
self.num_register_tokens = num_register_tokens
|
115 |
+
|
116 |
+
|
117 |
+
# Encoder embeddings & init
|
118 |
+
self.encoder_modalities = set(encoder_embeddings.keys())
|
119 |
+
for emb in encoder_embeddings.values():
|
120 |
+
emb.init(dim_tokens=dim, init_std=self.init_std)
|
121 |
+
self.encoder_embeddings = nn.ModuleDict(encoder_embeddings)
|
122 |
+
|
123 |
+
# Decoder embeddings & init
|
124 |
+
self.decoder_modalities = set(decoder_embeddings.keys())
|
125 |
+
for emb in decoder_embeddings.values():
|
126 |
+
emb.init(dim_tokens=dim, init_std=self.init_std)
|
127 |
+
self.decoder_embeddings = nn.ModuleDict(decoder_embeddings)
|
128 |
+
|
129 |
+
# Share modality embeddings across the encoder and decoder embedding modules
|
130 |
+
if share_modality_embeddings:
|
131 |
+
self.share_modality_embeddings()
|
132 |
+
|
133 |
+
## Transformer encoder
|
134 |
+
if shared_drop_path:
|
135 |
+
dpr_encoder = [x.item() for x in torch.linspace(0, drop_path_rate_encoder, encoder_depth + decoder_depth)][:encoder_depth]
|
136 |
+
else:
|
137 |
+
dpr_encoder = [x.item() for x in torch.linspace(0, drop_path_rate_encoder, encoder_depth)] # stochastic depth decay rule
|
138 |
+
|
139 |
+
self.encoder = nn.ModuleList([
|
140 |
+
Block(dim=dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, proj_bias=proj_bias, mlp_bias=mlp_bias,
|
141 |
+
drop_path=dpr_encoder[i], act_layer=act_layer, norm_layer=norm_layer, gated_mlp=gated_mlp, qk_norm=qk_norm)
|
142 |
+
for i in range(encoder_depth)
|
143 |
+
])
|
144 |
+
self.encoder_norm = norm_layer(dim)
|
145 |
+
|
146 |
+
|
147 |
+
## Transformer decoder
|
148 |
+
if shared_drop_path:
|
149 |
+
dpr_decoder = [x.item() for x in torch.linspace(0, drop_path_rate_decoder, encoder_depth + decoder_depth)][encoder_depth:]
|
150 |
+
else:
|
151 |
+
dpr_decoder = [x.item() for x in torch.linspace(0, drop_path_rate_decoder, decoder_depth)] # stochastic depth decay rule
|
152 |
+
|
153 |
+
# Projection of encoder tokens before adding the embeddings again
|
154 |
+
self.decoder_proj_context = nn.Linear(dim, dim)
|
155 |
+
|
156 |
+
self.decoder = nn.ModuleList([
|
157 |
+
DecoderBlock(dim=dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, proj_bias=proj_bias, mlp_bias=mlp_bias,
|
158 |
+
drop_path=dpr_decoder[i], act_layer=act_layer, norm_layer=norm_layer, gated_mlp=gated_mlp, qk_norm=qk_norm)
|
159 |
+
for i in range(decoder_depth)
|
160 |
+
])
|
161 |
+
self.decoder_norm = norm_layer(dim)
|
162 |
+
|
163 |
+
self.mask_token = nn.Parameter(torch.zeros(1, 1, dim))
|
164 |
+
nn.init.normal_(self.mask_token, std=self.init_std)
|
165 |
+
|
166 |
+
# Additional register tokens that can be used by the encoder during fine-tuning
|
167 |
+
if self.num_register_tokens > 0:
|
168 |
+
self.register_tokens = nn.Parameter(torch.zeros(1, self.num_register_tokens, dim))
|
169 |
+
nn.init.normal_(self.register_tokens, std=self.init_std)
|
170 |
+
else:
|
171 |
+
self.register_tokens = None
|
172 |
+
|
173 |
+
# Weight init
|
174 |
+
self.init_weights()
|
175 |
+
|
176 |
+
def share_modality_embeddings(self):
|
177 |
+
"""Share modality embeddings across the encoder and decoder embedding modules."""
|
178 |
+
shared_modalities = self.encoder_modalities & self.decoder_modalities
|
179 |
+
for mod in shared_modalities:
|
180 |
+
self.decoder_embeddings[mod].mod_emb = self.encoder_embeddings[mod].mod_emb
|
181 |
+
|
182 |
+
def init_weights(self):
|
183 |
+
"""Weight initialization following MAE's initialization scheme"""
|
184 |
+
|
185 |
+
for name, m in self.named_modules():
|
186 |
+
# Skipping tokenizers to avoid reinitializing them
|
187 |
+
if "tokenizer" in name:
|
188 |
+
continue
|
189 |
+
# Linear
|
190 |
+
elif isinstance(m, nn.Linear):
|
191 |
+
if 'qkv' in name:
|
192 |
+
# treat the weights of Q, K, V separately
|
193 |
+
val = math.sqrt(6. / float(m.weight.shape[0] // 3 + m.weight.shape[1]))
|
194 |
+
nn.init.uniform_(m.weight, -val, val)
|
195 |
+
elif 'kv' in name:
|
196 |
+
# treat the weights of K, V separately
|
197 |
+
val = math.sqrt(6. / float(m.weight.shape[0] // 2 + m.weight.shape[1]))
|
198 |
+
nn.init.uniform_(m.weight, -val, val)
|
199 |
+
else:
|
200 |
+
nn.init.xavier_uniform_(m.weight)
|
201 |
+
if isinstance(m, nn.Linear) and m.bias is not None:
|
202 |
+
nn.init.constant_(m.bias, 0)
|
203 |
+
# LayerNorm
|
204 |
+
elif isinstance(m, nn.LayerNorm) or isinstance(m, LayerNorm):
|
205 |
+
nn.init.constant_(m.weight, 1.0)
|
206 |
+
if m.bias is not None:
|
207 |
+
nn.init.constant_(m.bias, 0)
|
208 |
+
# Embedding
|
209 |
+
elif isinstance(m, nn.Embedding):
|
210 |
+
nn.init.normal_(m.weight, std=self.init_std)
|
211 |
+
# Conv2d
|
212 |
+
elif isinstance(m, nn.Conv2d):
|
213 |
+
if '.proj' in name:
|
214 |
+
# From MAE, initialize projection like nn.Linear (instead of nn.Conv2d)
|
215 |
+
w = m.weight.data
|
216 |
+
nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
|
217 |
+
|
218 |
+
def get_num_layers_encoder(self):
|
219 |
+
return len(self.encoder)
|
220 |
+
|
221 |
+
def get_num_layers_decoder(self):
|
222 |
+
return len(self.decoder)
|
223 |
+
|
224 |
+
def get_num_layers(self):
|
225 |
+
return self.get_num_layers_encoder() + self.get_num_layers_decoder()
|
226 |
+
|
227 |
+
@torch.jit.ignore
|
228 |
+
def no_weight_decay(self):
|
229 |
+
no_wd_set = set()
|
230 |
+
|
231 |
+
for mod, emb_module in self.encoder_embeddings.items():
|
232 |
+
if hasattr(emb_module, 'no_weight_decay'):
|
233 |
+
to_skip = emb_module.no_weight_decay()
|
234 |
+
to_skip = set([f'encoder_embeddings.{mod}.{name}' for name in to_skip])
|
235 |
+
no_wd_set = no_wd_set | to_skip
|
236 |
+
|
237 |
+
for mod, emb_module in self.decoder_embeddings.items():
|
238 |
+
if hasattr(emb_module, 'no_weight_decay'):
|
239 |
+
to_skip = emb_module.no_weight_decay()
|
240 |
+
to_skip = set([f'decoder_embeddings.{mod}.{name}' for name in to_skip])
|
241 |
+
no_wd_set = no_wd_set | to_skip
|
242 |
+
|
243 |
+
return no_wd_set
|
244 |
+
|
245 |
+
def cat_encoder_tensors(self, mod_dict: Dict[str, torch.Tensor]) -> Tuple[torch.Tensor]:
|
246 |
+
"""Concatenate encoder tensors from different modalities.
|
247 |
+
|
248 |
+
Args:
|
249 |
+
mod_dict (dict): A dictionary containing information for each modality.
|
250 |
+
Expected keys for each modality are 'x' (input tokens),
|
251 |
+
'emb' (embeddings), 'input_mask', etc.
|
252 |
+
|
253 |
+
Returns:
|
254 |
+
tuple:
|
255 |
+
- encoder_tokens_all (torch.Tensor): Concatenated encoder tokens from all modalities. Shape (B, O, D) where O is the total number of all encoder tokens.
|
256 |
+
- emb_all (torch.Tensor): Concatenated encoder embeddings from all modalities. Shape (B, O, D)
|
257 |
+
- encoder_mask_all (torch.Tensor): Concatenated boolean masks indicating which tokens are part of the encoder input (set to 0 for valid tokens, 1 otherwise). Shape (B, O)
|
258 |
+
- mod_mask_all (torch.Tensor): Concatenated integer mask marking the modality type for each encoder token. Shape (B, O)
|
259 |
+
"""
|
260 |
+
|
261 |
+
encoder_tokens_all = []
|
262 |
+
emb_all = []
|
263 |
+
encoder_mask_all = []
|
264 |
+
mod_mask_all = []
|
265 |
+
|
266 |
+
for mod, d in mod_dict.items():
|
267 |
+
encoder_tokens_all.append(d['x'])
|
268 |
+
emb_all.append(d['emb'])
|
269 |
+
encoder_mask_all.append(d['input_mask'])
|
270 |
+
mod_mask_all.append(torch.full_like(d['input_mask'], self.modality_info[mod]['id'], dtype=torch.int16))
|
271 |
+
|
272 |
+
encoder_tokens_all = torch.cat(encoder_tokens_all, dim=1)
|
273 |
+
emb_all = torch.cat(emb_all, dim=1)
|
274 |
+
encoder_mask_all = torch.cat(encoder_mask_all, dim=1)
|
275 |
+
mod_mask_all = torch.cat(mod_mask_all, dim=1)
|
276 |
+
|
277 |
+
return encoder_tokens_all, emb_all, encoder_mask_all, mod_mask_all
|
278 |
+
|
279 |
+
def cat_decoder_tensors(self, mod_dict: Dict[str, Dict[str, torch.Tensor]]) -> Tuple[torch.Tensor]:
|
280 |
+
"""Concatenate decoder tensors from different modalities.
|
281 |
+
|
282 |
+
Args:
|
283 |
+
mod_dict (dict): A dictionary containing information for each modality.
|
284 |
+
Expected keys for each modality include 'x' (input tokens),
|
285 |
+
'ids' (target IDs), 'emb' (embeddings), 'target_mask', 'decoder_attention_mask', etc.
|
286 |
+
|
287 |
+
|
288 |
+
Returns:
|
289 |
+
tuple:
|
290 |
+
- decoder_tokens_all (torch.Tensor): Concatenated decoder tokens from all modalities. Shape (B, P, D) where P is the total number of all decoder tokens.
|
291 |
+
- emb_all (torch.Tensor): Concatenated decoder embeddings from all modalities. Shape (B, P, D)
|
292 |
+
- decoder_mask_all (torch.Tensor): Concatenated boolean masks indicating which tokens are part of the decoder input / target (set to 0 for valid tokens, 1 otherwise). Shape (B, P)
|
293 |
+
- target_ids_all (torch.Tensor): Concatenated target IDs from all modalities. Shape (B, P)
|
294 |
+
- attention_mask_all (torch.Tensor): Concatenated attention masks in compressed format, needs to be passed to adapt_decoder_attention_mask() to obtain the final attention mask. Shape (B, P)
|
295 |
+
- mod_mask_all (torch.Tensor): Concatenated integer mask marking the modality type for each decoder token. Shape (B, P)
|
296 |
+
"""
|
297 |
+
|
298 |
+
decoder_tokens_all = []
|
299 |
+
target_ids_all = []
|
300 |
+
emb_all = []
|
301 |
+
decoder_mask_all = []
|
302 |
+
attention_mask_all = []
|
303 |
+
mod_mask_all = []
|
304 |
+
|
305 |
+
# Shuffle order in which modalities are provided (useful for modality causal mask)
|
306 |
+
mod_dict = {mod: d for mod, d in random.sample(mod_dict.items(), len(mod_dict))}
|
307 |
+
|
308 |
+
for mod, d in mod_dict.items():
|
309 |
+
if self.modality_info[mod]['type'] in ['seq', 'seq_emb', 'seq_token']:
|
310 |
+
# Important: This makes the assumption that the target sequence appears sequentially
|
311 |
+
# before sorting / gathering
|
312 |
+
decoder_tokens_all.append(d['x'][:, :-1])
|
313 |
+
target_ids_all.append(d['ids'][:, 1:]) # Shifted left
|
314 |
+
emb_all.append(d['emb'][:, :-1])
|
315 |
+
# Logical or with left shifting removes the last unmasked position
|
316 |
+
decoder_mask_all.append(torch.logical_or(d['target_mask'][:, 1:], d['target_mask'][:, :-1]))
|
317 |
+
# Add attention mask ids
|
318 |
+
attention_mask_all.append(d['decoder_attention_mask'][:, :-1])
|
319 |
+
mod_mask_all.append(torch.full_like(d['ids'][:, :-1], self.modality_info[mod]['id'], dtype=torch.int16))
|
320 |
+
else:
|
321 |
+
# Important: For 2d / image modalities, the decoder input tokens are replaced by the mask token
|
322 |
+
decoder_tokens_all.append(torch.zeros_like(d['x']) + self.mask_token) # Replace x by mask token
|
323 |
+
target_ids_all.append(d['ids'])
|
324 |
+
emb_all.append(d['emb'])
|
325 |
+
decoder_mask_all.append(d['target_mask'])
|
326 |
+
attention_mask_all.append(d['decoder_attention_mask'])
|
327 |
+
mod_mask_all.append(torch.full_like(d['ids'], self.modality_info[mod]['id'], dtype=torch.int16))
|
328 |
+
|
329 |
+
decoder_tokens_all = torch.cat(decoder_tokens_all, dim=1)
|
330 |
+
emb_all = torch.cat(emb_all, dim=1)
|
331 |
+
decoder_mask_all = torch.cat(decoder_mask_all, dim=1)
|
332 |
+
target_ids_all = torch.cat(target_ids_all, dim=1)
|
333 |
+
attention_mask_all = torch.cat(attention_mask_all, dim=1)
|
334 |
+
mod_mask_all = torch.cat(mod_mask_all, dim=1)
|
335 |
+
|
336 |
+
return decoder_tokens_all, emb_all, decoder_mask_all, target_ids_all, attention_mask_all, mod_mask_all
|
337 |
+
|
338 |
+
def forward_mask_encoder(self, mod_dict: Dict[str, Dict[str, torch.Tensor]], num_encoder_tokens: int) -> Tuple[torch.Tensor]:
|
339 |
+
"""Concatenates and mask encoder tensors based on provided modality information.
|
340 |
+
|
341 |
+
This function consolidates encoder tokens from multiple modalities, then selects a specified number of them based on modality information (i.e. masking).
|
342 |
+
|
343 |
+
Args:
|
344 |
+
mod_dict (dict): Dictionary containing tensors for different modalities.
|
345 |
+
It is expected to have keys for each modality and values
|
346 |
+
containing the modalities' associated tensors.
|
347 |
+
num_encoder_tokens (int): Number of encoder tokens to retain after masking.
|
348 |
+
|
349 |
+
Returns:
|
350 |
+
tuple:
|
351 |
+
- encoder_tokens (torch.Tensor): Selected encoder tokens from all modalities. Shape (B, N, D) where N is the number of selected encoder tokens.
|
352 |
+
- encoder_emb (torch.Tensor): Corresponding embeddings for encoder tokens. Shape (B, N, D)
|
353 |
+
- encoder_mask (torch.Tensor): A boolean mask indicating which encoder tokens are valid (set to 0 for valid tokens, 1 otherwise). Shape (B, 1, N)
|
354 |
+
- mod_mask (torch.Tensor): An integer mask marking the modality type for each encoder token (with -1 indicating unassigned pad tokens). Shape (B, N)
|
355 |
+
|
356 |
+
Notes:
|
357 |
+
- If `num_register_tokens` is set and greater than 0, register tokens are added at the beginning of the sequence.
|
358 |
+
"""
|
359 |
+
B = list(mod_dict.values())[0]['tensor'].shape[0]
|
360 |
+
|
361 |
+
encoder_tokens_all, emb_all, encoder_mask_all, mod_mask_all = self.cat_encoder_tensors(mod_dict)
|
362 |
+
|
363 |
+
# Add arange multiplied by small constant to mask so they get sorted in a deterministic way
|
364 |
+
mask_arange = torch.arange(encoder_mask_all.shape[1], device=encoder_mask_all.device).unsqueeze(0) * 1e-6
|
365 |
+
ids_shuffle = torch.argsort(encoder_mask_all + mask_arange, dim=1)
|
366 |
+
# ids_restore = torch.argsort(ids_shuffle, dim=1)
|
367 |
+
ids_keep = ids_shuffle[:, :num_encoder_tokens]
|
368 |
+
|
369 |
+
encoder_tokens = torch.gather(encoder_tokens_all, dim=1,
|
370 |
+
index=repeat(ids_keep, "b n -> b n d", d=encoder_tokens_all.shape[2]))
|
371 |
+
encoder_emb = torch.gather(emb_all, dim=1, index=repeat(ids_keep, "b n -> b n d", d=emb_all.shape[2]))
|
372 |
+
encoder_mask = torch.gather(encoder_mask_all, dim=1, index=ids_keep)
|
373 |
+
mod_mask = torch.gather(mod_mask_all, dim=1, index=ids_keep)
|
374 |
+
|
375 |
+
if self.num_register_tokens > 0:
|
376 |
+
register_tokens = repeat(self.register_tokens, '() n d -> b n d', b=B)
|
377 |
+
# We add register tokens at the beginning of the sequence
|
378 |
+
encoder_tokens = torch.cat([register_tokens, encoder_tokens], dim=1)
|
379 |
+
encoder_emb = torch.cat([torch.zeros_like(register_tokens), encoder_emb], dim=1)
|
380 |
+
encoder_mask = torch.cat([torch.zeros((B, register_tokens.shape[1]), dtype=torch.bool, device=encoder_mask.device), encoder_mask], dim=1)
|
381 |
+
mod_mask = torch.cat([torch.full((B, register_tokens.shape[1]), -1, dtype=torch.int16, device=mod_mask.device), mod_mask], dim=1)
|
382 |
+
|
383 |
+
encoder_tokens[encoder_mask] = 0.
|
384 |
+
encoder_emb[encoder_mask] = 0.
|
385 |
+
mod_mask[encoder_mask] = -1
|
386 |
+
# Mask could be of shape 'b n1 n2' but not needed for masked_fill
|
387 |
+
# This means this mask can then be re-used for decoder cross-attention
|
388 |
+
encoder_mask = rearrange(encoder_mask, 'b n2 -> b 1 n2')
|
389 |
+
|
390 |
+
return encoder_tokens, encoder_emb, encoder_mask, mod_mask
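To make the selection above concrete: adding a tiny arange-scaled offset before argsort breaks ties deterministically, so unmasked tokens (mask value 0) move to the front in their original order and the first num_encoder_tokens indices are gathered. A minimal standalone sketch of just that step (tensor names and sizes are illustrative, not taken from the model):

import torch

B, N, D, num_keep = 2, 6, 4, 3
tokens = torch.randn(B, N, D)
mask = torch.tensor([[0, 1, 0, 1, 0, 1],
                     [1, 1, 0, 0, 0, 1]], dtype=torch.float32)  # 0 = keep, 1 = masked out

# Tie-break with a small arange so equal mask values keep their original order
mask_arange = torch.arange(N).unsqueeze(0) * 1e-6
ids_shuffle = torch.argsort(mask + mask_arange, dim=1)  # unmasked tokens sorted to the front
ids_keep = ids_shuffle[:, :num_keep]                    # indices of the surviving tokens

kept = torch.gather(tokens, dim=1, index=ids_keep.unsqueeze(-1).expand(-1, -1, D))
print(ids_keep)    # tensor([[0, 2, 4], [2, 3, 4]])
print(kept.shape)  # torch.Size([2, 3, 4])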
|
391 |
+
|
392 |
+
def forward_mask_decoder(self, mod_dict: Dict[str, Dict[str, torch.Tensor]], num_decoder_tokens: int) -> Tuple[torch.Tensor]:
|
393 |
+
"""Concatenates and mask decoder tensors based on provided modality information.
|
394 |
+
|
395 |
+
This function consolidates decoder tokens from multiple modalities, selects a specified number of them based on modality information, and applies appropriate masking.
|
396 |
+
|
397 |
+
Args:
|
398 |
+
mod_dict (dict): Dictionary containing tensors for different modalities.
|
399 |
+
It is expected to have keys for each modality and values
|
400 |
+
containing the modalities' associated tensors.
|
401 |
+
num_decoder_tokens (int): Number of decoder tokens to retain after masking.
|
402 |
+
|
403 |
+
Returns:
|
404 |
+
tuple:
|
405 |
+
- decoder_tokens (torch.Tensor): Selected decoder tokens from all modalities. Shape (B, M, D) where M is the number of selected decoder tokens.
|
406 |
+
- decoder_emb (torch.Tensor): Corresponding embeddings for decoder tokens. Shape (B, M, D)
|
407 |
+
- decoder_mask (torch.Tensor): A boolean mask indicating which decoder tokens are valid (set to 0 for valid tokens, 1 otherwise). Shape (B, 1, M)
|
408 |
+
- target_ids (torch.Tensor): IDs of the target tokens corresponding to the decoder tokens. Shape (B, M)
|
409 |
+
- decoder_attention_mask (torch.Tensor): Mask for the decoder self-attention layers. Shape (B, M, M)
|
410 |
+
- mod_mask (torch.Tensor): An integer mask marking the modality type for each decoder token (with -1 indicating unassigned pad tokens). Shape (B, M)
|
411 |
+
"""
|
412 |
+
# decoder_mask and target_mask are equivalent; we rename it here to harmonize with forward_mask_encoder
|
413 |
+
decoder_tokens_all, emb_all, decoder_mask_all, target_ids_all, decoder_attention_mask_all, mod_mask_all = self.cat_decoder_tensors(mod_dict)
|
414 |
+
|
415 |
+
# Add arange multiplied by small constant to mask so they get sorted in a deterministic way
|
416 |
+
mask_arange = torch.arange(decoder_mask_all.shape[1], device=decoder_mask_all.device).unsqueeze(0) * 1e-6
|
417 |
+
ids_shuffle = torch.argsort(decoder_mask_all + mask_arange, dim=1)
|
418 |
+
# ids_restore = torch.argsort(ids_shuffle, dim=1)
|
419 |
+
ids_keep = ids_shuffle[:, :num_decoder_tokens]
|
420 |
+
|
421 |
+
decoder_tokens = torch.gather(decoder_tokens_all, dim=1, index=repeat(ids_keep, "b n -> b n d", d=decoder_tokens_all.shape[2]))
|
422 |
+
decoder_emb = torch.gather(emb_all, dim=1, index=repeat(ids_keep, "b n -> b n d", d=emb_all.shape[2]))
|
423 |
+
decoder_mask = torch.gather(decoder_mask_all, dim=1, index=ids_keep)
|
424 |
+
target_ids = torch.gather(target_ids_all, dim=1, index=ids_keep)
|
425 |
+
decoder_attention_mask = torch.gather(decoder_attention_mask_all, dim=1, index=ids_keep)
|
426 |
+
mod_mask = torch.gather(mod_mask_all, dim=1, index=ids_keep)
|
427 |
+
|
428 |
+
decoder_tokens[decoder_mask] = 0.
|
429 |
+
decoder_emb[decoder_mask] = 0.
|
430 |
+
target_ids[decoder_mask] = 0
|
431 |
+
decoder_attention_mask = self.adapt_decoder_attention_mask(decoder_attention_mask, mod_mask)
|
432 |
+
mod_mask[decoder_mask] = -1
|
433 |
+
|
434 |
+
# This means this mask can then be re-used for decoder cross-attention
|
435 |
+
decoder_mask = rearrange(decoder_mask, 'b n2 -> b 1 n2')
|
436 |
+
|
437 |
+
|
438 |
+
return decoder_tokens, decoder_emb, decoder_mask, target_ids, decoder_attention_mask, mod_mask
|
439 |
+
|
440 |
+
def adapt_decoder_attention_mask(self, decoder_attention_mask: torch.Tensor, mod_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
|
441 |
+
"""
|
442 |
+
Transforms the compressed decoder attention mask to a full attention mask based on the specified constraints.
|
443 |
+
|
444 |
+
Args:
|
445 |
+
decoder_attention_mask (torch.Tensor): Initial attention mask indicating attention constraints. Shape (B, M) where M is the number of the decoder tokens.
|
446 |
+
mod_mask (torch.Tensor, optional): Modality mask to separate attention masks per modality. Shape (B, M)
|
447 |
+
|
448 |
+
Returns:
|
449 |
+
torch.Tensor: Adapted attention mask. Shape (B, M, M) where M is the number of the decoder tokens.
|
450 |
+
"""
|
451 |
+
B, N = decoder_attention_mask.shape
|
452 |
+
|
453 |
+
if self.decoder_causal_mask:
|
454 |
+
# For causal mode, tokens can only attend to preceding tokens and themselves.
|
455 |
+
causal_mask = torch.ones((N, N), dtype=torch.bool, device=decoder_attention_mask.device).triu(1)
|
456 |
+
causal_mask = repeat(causal_mask, "n1 n2 -> b n1 n2", b=B)
|
457 |
+
adapted_attention_mask = causal_mask
|
458 |
+
else:
|
459 |
+
# Cumulatively sum the attention mask to determine token-wise attention behavior.
|
460 |
+
# Examples:
|
461 |
+
# Mask [4, 0, 0, 0] -> Cumsum: [4, 4, 4, 4] -> All tokens attend to each other.
|
462 |
+
# Mask [1, 1, 1, 1] -> Cumsum: [1, 2, 3, 4] -> Strict autoregressive behavior.
|
463 |
+
# Mask [2, 0, 1, 1] -> Cumsum: [2, 2, 3, 4] -> Tokens 1 and 2 attend to each other, token 3 attends to tokens 1-3, and token 4 to all.
|
464 |
+
attention_arange = torch.arange(N, device=decoder_attention_mask.device)
|
465 |
+
attention_arange = repeat(attention_arange, "n2 -> b n1 n2", b=B, n1=N)
|
466 |
+
cumsum_mask = torch.cumsum(decoder_attention_mask, dim=-1)
|
467 |
+
cumsum_mask = rearrange(cumsum_mask, "b n -> b n 1")
|
468 |
+
adapted_attention_mask = (attention_arange >= cumsum_mask)
|
469 |
+
|
470 |
+
if self.decoder_sep_mask:
|
471 |
+
# Separate attention between tokens based on their modality using mod_mask.
|
472 |
+
sep_mask = repeat(mod_mask, "b n2 -> b n1 n2", n1=N) != repeat(mod_mask, "b n1 -> b n1 n2", n2=N)
|
473 |
+
adapted_attention_mask = adapted_attention_mask | sep_mask
|
474 |
+
|
475 |
+
return adapted_attention_mask
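A quick standalone check of the cumsum construction described in the comments above, using the third example ([2, 0, 1, 1]); True marks positions that may not be attended to:

import torch
from einops import rearrange, repeat

# Compressed mask [2, 0, 1, 1]: tokens 1-2 form a block, tokens 3 and 4 are autoregressive.
decoder_attention_mask = torch.tensor([[2, 0, 1, 1]])
B, N = decoder_attention_mask.shape

attention_arange = repeat(torch.arange(N), "n2 -> b n1 n2", b=B, n1=N)
cumsum_mask = rearrange(torch.cumsum(decoder_attention_mask, dim=-1), "b n -> b n 1")
blocked = attention_arange >= cumsum_mask  # True = attention is NOT allowed

print(blocked.int()[0])
# tensor([[0, 0, 1, 1],
#         [0, 0, 1, 1],
#         [0, 0, 0, 1],
#         [0, 0, 0, 0]])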
|
476 |
+
|
477 |
+
def forward_encoder(self,
|
478 |
+
x: torch.Tensor,
|
479 |
+
encoder_mask: torch.Tensor) -> torch.Tensor:
|
480 |
+
"""Forward pass for the encoder.
|
481 |
+
|
482 |
+
Args:
|
483 |
+
x (torch.Tensor): Encoder input tokens. Shape (B, N, D) where N is the number of encoder tokens.
|
484 |
+
encoder_mask (torch.Tensor): Encoder mask indicating which tokens are valid (set to 0 for valid tokens, 1 otherwise). Shape (B, 1, N)
|
485 |
+
|
486 |
+
Returns:
|
487 |
+
torch.Tensor: Encoder output. Shape (B, N, D)
|
488 |
+
"""
|
489 |
+
|
490 |
+
for blk in self.encoder:
|
491 |
+
x = blk(x, mask=encoder_mask)
|
492 |
+
|
493 |
+
x = self.encoder_norm(x)
|
494 |
+
|
495 |
+
return x
|
496 |
+
|
497 |
+
def forward_decoder(self,
|
498 |
+
y: torch.Tensor,
|
499 |
+
context: torch.Tensor,
|
500 |
+
encoder_mask: torch.Tensor,
|
501 |
+
decoder_attention_mask: torch.Tensor) -> torch.Tensor:
|
502 |
+
"""Forward pass for the decoder.
|
503 |
+
|
504 |
+
Args:
|
505 |
+
y (torch.Tensor): Decoder input tokens. Shape (B, M, D).
|
506 |
+
context (torch.Tensor): Context for the decoder (i.e. encoder output). Shape (B, N, D).
|
507 |
+
encoder_mask (torch.Tensor): Encoder mask indicating which tokens are valid (set to 0 for valid tokens, 1 otherwise). Shape (B, 1, N).
|
508 |
+
decoder_attention_mask (torch.Tensor): Decoder attention mask. Shape (B, M, M).
|
509 |
+
|
510 |
+
Returns:
|
511 |
+
torch.Tensor: Decoder output. Shape (B, M, D).
|
512 |
+
"""
|
513 |
+
|
514 |
+
for blk in self.decoder:
|
515 |
+
y = blk(y, context, sa_mask=decoder_attention_mask, xa_mask=encoder_mask)
|
516 |
+
|
517 |
+
y = self.decoder_norm(y)
|
518 |
+
|
519 |
+
return y
|
520 |
+
|
521 |
+
def forward_logits(self,
|
522 |
+
y: torch.Tensor,
|
523 |
+
decoder_mod_dict: Dict[str, Dict[str, torch.Tensor]],
|
524 |
+
decoder_mod_mask: torch.Tensor,
|
525 |
+
return_all_logits: bool = False) -> Dict[str, torch.Tensor]:
|
526 |
+
"""Forward computation of logits for each modality.
|
527 |
+
|
528 |
+
Args:
|
529 |
+
y (torch.Tensor): Decoder output. Shape (B, M, D).
|
530 |
+
decoder_mod_dict (dict): Dictionary containing tensor information for each modality in the decoder.
|
531 |
+
decoder_mod_mask (torch.Tensor): Integer mask indicating which tokens belong to which modality. Shape (B, M).
|
532 |
+
|
533 |
+
Returns:
|
534 |
+
Dict[str, torch.Tensor]: Dictionary of logits for each modality.
|
535 |
+
"""
|
536 |
+
|
537 |
+
mod_logits = {}
|
538 |
+
for mod, d in decoder_mod_dict.items():
|
539 |
+
idx = self.modality_info[mod]["id"]
|
540 |
+
if return_all_logits:
|
541 |
+
logits = self.decoder_embeddings[mod].forward_logits(y)
|
542 |
+
else:
|
543 |
+
logits = self.decoder_embeddings[mod].forward_logits(y[decoder_mod_mask == idx])
|
544 |
+
mod_logits[mod] = logits
|
545 |
+
return mod_logits
|
546 |
+
|
547 |
+
def forward_loss(self,
|
548 |
+
y: torch.Tensor,
|
549 |
+
target_ids: torch.Tensor,
|
550 |
+
decoder_mod_dict: Dict[str, Any],
|
551 |
+
decoder_mod_mask: torch.Tensor, loss_type: str) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
|
552 |
+
"""Computes the loss based on the specified loss type.
|
553 |
+
|
554 |
+
Args:
|
555 |
+
y (torch.Tensor): Decoder output. Shape (B, M, D).
|
556 |
+
target_ids (torch.Tensor): Ground truth token IDs. Shape (B, M).
|
557 |
+
decoder_mod_dict (dict): Dictionary containing tensor information for each modality in the decoder.
|
558 |
+
decoder_mod_mask (torch.Tensor): Integer mask indicating which tokens belong to which modality. Shape (B, M).
|
559 |
+
loss_type (str): The type of loss to compute. Either 'mod' or 'token'.
|
560 |
+
|
561 |
+
Returns:
|
562 |
+
Tuple[torch.Tensor, Dict[str, torch.Tensor]]: Total loss and dictionary of loss for each modality.
|
563 |
+
"""
|
564 |
+
if loss_type in ['mod', 'modality']:
|
565 |
+
loss, mod_loss = self.forward_mod_loss(y, target_ids, decoder_mod_dict, decoder_mod_mask)
|
566 |
+
elif loss_type == 'token':
|
567 |
+
loss, mod_loss = self.forward_token_loss(y, target_ids, decoder_mod_dict, decoder_mod_mask)
|
568 |
+
else:
|
569 |
+
raise ValueError("Invalid loss type")
|
570 |
+
|
571 |
+
return loss, mod_loss
|
572 |
+
|
573 |
+
def forward_mod_loss(self,
|
574 |
+
y: torch.Tensor,
|
575 |
+
target_ids: torch.Tensor,
|
576 |
+
decoder_mod_dict: Dict[str, Any],
|
577 |
+
decoder_mod_mask: torch.Tensor) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
|
578 |
+
"""Computes the modality-wise loss.
|
579 |
+
|
580 |
+
Args:
|
581 |
+
y (torch.Tensor): Decoder tokens. Shape (B, M, D).
|
582 |
+
target_ids (torch.Tensor): Ground truth token IDs. Shape (B, M).
|
583 |
+
decoder_mod_dict (dict): Dictionary containing tensor information for each modality in the decoder.
|
584 |
+
decoder_mod_mask (torch.Tensor): Mask indicating which tokens belong to which modality. Shape (B, M).
|
585 |
+
|
586 |
+
Returns:
|
587 |
+
Tuple[torch.Tensor, Dict[str, torch.Tensor]]: Total modality loss and dictionary of loss for each modality.
|
588 |
+
"""
|
589 |
+
mod_loss = {}
|
590 |
+
for mod, d in decoder_mod_dict.items():
|
591 |
+
idx = self.modality_info[mod]["id"]
|
592 |
+
logits = self.decoder_embeddings[mod].forward_logits(y[decoder_mod_mask == idx])
|
593 |
+
if logits.numel() == 0:
|
594 |
+
# If there are no logits / targets, set mod_loss to 0
|
595 |
+
mod_loss[mod] = torch.zeros(1, device=logits.device)
|
596 |
+
else:
|
597 |
+
loss = F.cross_entropy(logits, target_ids[decoder_mod_mask == idx].long(), reduction='mean')
|
598 |
+
mod_loss[mod] = loss
|
599 |
+
|
600 |
+
loss = sum(mod_loss.values()) / len(mod_loss)
|
601 |
+
|
602 |
+
return loss, mod_loss
|
603 |
+
|
604 |
+
def forward_token_loss(self,
|
605 |
+
y: torch.Tensor,
|
606 |
+
target_ids: torch.Tensor,
|
607 |
+
decoder_mod_dict: Dict[str, Any],
|
608 |
+
decoder_mod_mask: torch.Tensor) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
|
609 |
+
"""Computes the token-wise loss.
|
610 |
+
|
611 |
+
Args:
|
612 |
+
y (torch.Tensor): Decoder tokens. Shape (B, M, D).
|
613 |
+
target_ids (torch.Tensor): Ground truth token IDs. Shape (B, M).
|
614 |
+
decoder_mod_dict (dict): Dictionary containing tensor information for each modality in the decoder.
|
615 |
+
decoder_mod_mask (torch.Tensor): Mask indicating which tokens belong to which modality. Shape (B, M).
|
616 |
+
|
617 |
+
Returns:
|
618 |
+
Tuple[torch.Tensor, Dict[str, torch.Tensor]]: Total token loss and dictionary of loss for each modality.
|
619 |
+
"""
|
620 |
+
mod_loss = {}
|
621 |
+
mod_count = {}
|
622 |
+
|
623 |
+
for mod, d in decoder_mod_dict.items():
|
624 |
+
idx = self.modality_info[mod]["id"]
|
625 |
+
logits = self.decoder_embeddings[mod].forward_logits(y[decoder_mod_mask == idx])
|
626 |
+
if logits.numel() == 0:
|
627 |
+
# If there are no logits / targets, set mod_loss to 0
|
628 |
+
mod_loss[mod] = torch.zeros(1, device=logits.device)
|
629 |
+
mod_count[mod] = 0
|
630 |
+
else:
|
631 |
+
loss = F.cross_entropy(logits, target_ids[decoder_mod_mask == idx].long(), reduction='mean')
|
632 |
+
mod_loss[mod] = loss
|
633 |
+
mod_count[mod] = logits.numel()
|
634 |
+
|
635 |
+
loss = sum([mod_loss[mod] * mod_count[mod] for mod in mod_loss.keys()]) / sum(mod_count.values())
|
636 |
+
|
637 |
+
return loss, mod_loss
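To illustrate the difference between the two reductions with made-up numbers: two modalities with mean losses 2.0 and 1.0 and counts 10 and 90 give 1.5 under 'mod' averaging but 1.1 under the count-weighted 'token' averaging.

mod_loss = {'a': 2.0, 'b': 1.0}  # made-up per-modality mean losses
mod_count = {'a': 10, 'b': 90}   # made-up counts (the code above takes them from logits.numel())
loss_mod = sum(mod_loss.values()) / len(mod_loss)                                         # 1.5
loss_token = sum(mod_loss[m] * mod_count[m] for m in mod_loss) / sum(mod_count.values())  # 1.1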
|
638 |
+
|
639 |
+
|
640 |
+
def forward(self,
|
641 |
+
mod_dict: Dict[str, Dict[str, torch.Tensor]],
|
642 |
+
num_encoder_tokens: int,
|
643 |
+
num_decoder_tokens: int,
|
644 |
+
loss_type: str = 'mod',
|
645 |
+
return_logits: bool = False) -> Union[Dict[str, torch.Tensor], Tuple[torch.Tensor, Dict[str, torch.Tensor]]]:
|
646 |
+
"""
|
647 |
+
Forward pass for the model.
|
648 |
+
|
649 |
+
Args:
|
650 |
+
mod_dict (Dict[str, Dict[str, torch.Tensor]]): Dictionary containing the tensors, masks, and other info for each modality.
|
651 |
+
- mod_dict[modality_name]["tensor_name"]: Shape can vary based on tensor_name and modality.
|
652 |
+
num_encoder_tokens (int): Number of tokens to keep for the encoder.
|
653 |
+
num_decoder_tokens (int): Number of tokens to keep for the decoder.
|
654 |
+
loss_type (str, optional): The type of loss to compute. Can be 'mod' (average of loss per modality) or 'token' (average loss per token). Default is 'mod'.
|
655 |
+
return_logits (bool, optional): If True, return the logits. Default is False.
|
656 |
+
|
657 |
+
Returns:
|
658 |
+
Union[dict, tuple]:
|
659 |
+
- If return_logits is True: Dictionary of logits for each modality.
|
660 |
+
- Otherwise: Tuple containing the total loss and dictionary of loss for each modality.
|
661 |
+
"""
|
662 |
+
|
663 |
+
# Mod dicts
|
664 |
+
encoder_mod_dict = {mod: self.encoder_embeddings[mod](d)
|
665 |
+
for mod, d in mod_dict.items()
|
666 |
+
if mod in self.encoder_embeddings}
|
667 |
+
encoder_tokens, encoder_emb, encoder_mask, encoder_mod_mask = self.forward_mask_encoder(encoder_mod_dict, num_encoder_tokens)
|
668 |
+
|
669 |
+
decoder_mod_dict = {mod: self.decoder_embeddings[mod].forward_embed(d)
|
670 |
+
for mod, d in mod_dict.items()
|
671 |
+
if mod in self.decoder_embeddings}
|
672 |
+
decoder_tokens, decoder_emb, decoder_mask, target_ids, decoder_attention_mask, decoder_mod_mask = self.forward_mask_decoder(decoder_mod_dict, num_decoder_tokens)
|
673 |
+
|
674 |
+
# Encoder
|
675 |
+
x = encoder_tokens + encoder_emb
|
676 |
+
x = self.forward_encoder(x, encoder_mask=encoder_mask)
|
677 |
+
|
678 |
+
# Decoder
|
679 |
+
context = self.decoder_proj_context(x) + encoder_emb
|
680 |
+
y = decoder_tokens + decoder_emb
|
681 |
+
y = self.forward_decoder(y, context, encoder_mask=encoder_mask, decoder_attention_mask=decoder_attention_mask)
|
682 |
+
|
683 |
+
# Logits
|
684 |
+
if return_logits:
|
685 |
+
mod_logits = self.forward_logits(y, decoder_mod_dict, decoder_mod_mask, return_all_logits=True)
|
686 |
+
return mod_logits
|
687 |
+
|
688 |
+
# Loss
|
689 |
+
loss, mod_loss = self.forward_loss(y, target_ids, decoder_mod_dict, decoder_mod_mask, loss_type)
|
690 |
+
|
691 |
+
return loss, mod_loss
|
692 |
+
|
693 |
+
|
694 |
+
def freeze_encoder(self, freeze_embeddings=True):
|
695 |
+
for param in self.encoder.parameters():
|
696 |
+
param.requires_grad = False
|
697 |
+
|
698 |
+
for param in self.encoder_norm.parameters():
|
699 |
+
param.requires_grad = False
|
700 |
+
|
701 |
+
if freeze_embeddings:
|
702 |
+
for param in self.encoder_embeddings.parameters():
|
703 |
+
param.requires_grad = False
|
704 |
+
|
705 |
+
def freeze_encoder_except_specific_embeddings(self, frozen_embedding_domain):
|
706 |
+
frozen_embedding_domain = frozen_embedding_domain.split('-')
|
707 |
+
for param in self.encoder.parameters():
|
708 |
+
param.requires_grad = False
|
709 |
+
|
710 |
+
for param in self.encoder_norm.parameters():
|
711 |
+
param.requires_grad = False
|
712 |
+
|
713 |
+
for name, param in self.encoder_embeddings.named_parameters():
|
714 |
+
if name.split('.')[0] in frozen_embedding_domain:
|
715 |
+
param.requires_grad = False
|
716 |
+
|
717 |
+
def unfreeze_encoder(self, unfreeze_embeddings=True):
|
718 |
+
for param in self.encoder.parameters():
|
719 |
+
param.requires_grad = True
|
720 |
+
|
721 |
+
for param in self.encoder_norm.parameters():
|
722 |
+
param.requires_grad = True
|
723 |
+
|
724 |
+
if unfreeze_embeddings:
|
725 |
+
for param in self.encoder_embeddings.parameters():
|
726 |
+
param.requires_grad = True
|
727 |
+
|
728 |
+
def freeze_decoder(self, freeze_embeddings=True):
|
729 |
+
for param in self.decoder.parameters():
|
730 |
+
param.requires_grad = False
|
731 |
+
|
732 |
+
for param in self.decoder_norm.parameters():
|
733 |
+
param.requires_grad = False
|
734 |
+
|
735 |
+
if freeze_embeddings:
|
736 |
+
for param in self.decoder_embeddings.parameters():
|
737 |
+
param.requires_grad = False
|
738 |
+
|
739 |
+
def freeze_decoder_except_specific_embeddings(self, frozen_embedding_domain):
|
740 |
+
frozen_embedding_domain = frozen_embedding_domain.split('-')
|
741 |
+
for param in self.decoder.parameters():
|
742 |
+
param.requires_grad = False
|
743 |
+
|
744 |
+
for param in self.decoder_norm.parameters():
|
745 |
+
param.requires_grad = False
|
746 |
+
|
747 |
+
for name, param in self.decoder_embeddings.named_parameters():
|
748 |
+
if name.split('.')[0] in frozen_embedding_domain:
|
749 |
+
param.requires_grad = False
|
750 |
+
|
751 |
+
def unfreeze_decoder(self, unfreeze_embeddings=True):
|
752 |
+
for param in self.decoder.parameters():
|
753 |
+
param.requires_grad = True
|
754 |
+
|
755 |
+
for param in self.decoder_norm.parameters():
|
756 |
+
param.requires_grad = True
|
757 |
+
|
758 |
+
if unfreeze_embeddings:
|
759 |
+
for param in self.decoder_embeddings.parameters():
|
760 |
+
param.requires_grad = True
|
761 |
+
|
762 |
+
def freeze_shared_params(self):
|
763 |
+
self.freeze_encoder(freeze_embeddings=False)
|
764 |
+
self.freeze_decoder(freeze_embeddings=False)
|
765 |
+
|
766 |
+
def freeze_params_except_specific_embeddings(self, frozen_embedding_domain):
|
767 |
+
self.freeze_encoder_except_specific_embeddings(frozen_embedding_domain=frozen_embedding_domain)
|
768 |
+
self.freeze_decoder_except_specific_embeddings(frozen_embedding_domain=frozen_embedding_domain)
|
769 |
+
|
770 |
+
def unfreeze_shared_params(self):
|
771 |
+
self.unfreeze_encoder(unfreeze_embeddings=False)
|
772 |
+
self.unfreeze_decoder(unfreeze_embeddings=False)
|
773 |
+
|
774 |
+
def unfreeze_all(self):
|
775 |
+
self.unfreeze_encoder(unfreeze_embeddings=True)
|
776 |
+
self.unfreeze_decoder(unfreeze_embeddings=True)
|
777 |
+
|
778 |
+
|
779 |
+
################################################
|
780 |
+
|
781 |
+
# Wrapper for easy loading with Huggingface Hub
|
782 |
+
|
783 |
+
class FM(FourM, PyTorchModelHubMixin):
|
784 |
+
"""Wrapper around FourM for easy loading with Huggingface Hub.
|
785 |
+
|
786 |
+
Args:
|
787 |
+
config (dict): Dictionary containing the model and modality configuration,
|
788 |
+
used for loading from Huggingface Hub.
|
789 |
+
"""
|
790 |
+
def __init__(self, config: dict):
|
791 |
+
|
792 |
+
config = copy.deepcopy(config)
|
793 |
+
|
794 |
+
all_domains = sorted(list(set(config['domains_in']) | set(config['domains_out'])))
|
795 |
+
modality_info = {mod: MODALITY_INFO[mod] for mod in all_domains}
|
796 |
+
|
797 |
+
encoder_embeddings = {}
|
798 |
+
for mod in config['domains_in']:
|
799 |
+
info = modality_info[mod]
|
800 |
+
if info.get("encoder_embedding", None) is not None:
|
801 |
+
if info["type"] == "img":
|
802 |
+
image_size, patch_size = info.get('input_size', config['image_size']), info.get('patch_size', config['patch_size'])
|
803 |
+
encoder_embeddings[mod] = info["encoder_embedding"](patch_size=patch_size, image_size=image_size)
|
804 |
+
else:
|
805 |
+
encoder_embeddings[mod] = info["encoder_embedding"]()
|
806 |
+
|
807 |
+
decoder_embeddings = {}
|
808 |
+
for mod in config['domains_out']:
|
809 |
+
info = modality_info[mod]
|
810 |
+
if info.get("decoder_embedding", None) is not None:
|
811 |
+
if info["type"] == "img":
|
812 |
+
image_size, patch_size = info.get('input_size', config['image_size']), info.get('patch_size', config['patch_size'])
|
813 |
+
decoder_embeddings[mod] = info["decoder_embedding"](patch_size=patch_size, image_size=image_size, share_embedding=False)
|
814 |
+
else:
|
815 |
+
decoder_embeddings[mod] = info["decoder_embedding"](share_embedding=False)
|
816 |
+
|
817 |
+
config['norm_layer'] = partial(LayerNorm, eps=1e-6, bias=config['norm_bias'])
|
818 |
+
config['act_layer'] = getattr(torch.nn, config['act_layer'])
|
819 |
+
|
820 |
+
del config['norm_bias']
|
821 |
+
del config['domains_in']
|
822 |
+
del config['domains_out']
|
823 |
+
del config['image_size']
|
824 |
+
del config['patch_size']
|
825 |
+
|
826 |
+
super().__init__(
|
827 |
+
encoder_embeddings=encoder_embeddings,
|
828 |
+
decoder_embeddings=decoder_embeddings,
|
829 |
+
modality_info=modality_info,
|
830 |
+
**config
|
831 |
+
)
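Because FM mixes in PyTorchModelHubMixin, checkpoints published on the Hugging Face Hub can be loaded in one call. A minimal sketch; the repository id is only an example and should be replaced by an actual 4M checkpoint:

from fourm.models.fm import FM

# Example repo id (substitute a real 4M checkpoint on the Hub).
model = FM.from_pretrained('EPFL-VILAB/4M-7_B_CC12M').eval()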
|
832 |
+
|
833 |
+
|
834 |
+
################################################
|
835 |
+
|
836 |
+
# Model definitions
|
837 |
+
|
838 |
+
# GELU variants
|
839 |
+
@register_model
|
840 |
+
def fm_tiny_6e_6d_gelu(
|
841 |
+
encoder_embeddings: Dict[str, nn.Module],
|
842 |
+
decoder_embeddings: Dict[str, nn.Module],
|
843 |
+
**kwargs):
|
844 |
+
model = FourM(
|
845 |
+
encoder_embeddings=encoder_embeddings,
|
846 |
+
decoder_embeddings=decoder_embeddings,
|
847 |
+
encoder_depth=6,
|
848 |
+
decoder_depth=6,
|
849 |
+
dim=384,
|
850 |
+
num_heads=6,
|
851 |
+
mlp_ratio=4,
|
852 |
+
qkv_bias=True,
|
853 |
+
norm_layer=partial(nn.LayerNorm, eps=1e-6),
|
854 |
+
**kwargs
|
855 |
+
)
|
856 |
+
return model
|
857 |
+
|
858 |
+
|
859 |
+
@register_model
|
860 |
+
def fm_small_8e_8d_gelu(
|
861 |
+
encoder_embeddings: Dict[str, nn.Module],
|
862 |
+
decoder_embeddings: Dict[str, nn.Module],
|
863 |
+
**kwargs):
|
864 |
+
model = FourM(
|
865 |
+
encoder_embeddings=encoder_embeddings,
|
866 |
+
decoder_embeddings=decoder_embeddings,
|
867 |
+
encoder_depth=8,
|
868 |
+
decoder_depth=8,
|
869 |
+
dim=512,
|
870 |
+
num_heads=8,
|
871 |
+
mlp_ratio=4,
|
872 |
+
qkv_bias=True,
|
873 |
+
norm_layer=partial(nn.LayerNorm, eps=1e-6),
|
874 |
+
**kwargs
|
875 |
+
)
|
876 |
+
return model
|
877 |
+
|
878 |
+
|
879 |
+
@register_model
|
880 |
+
def fm_base_12e_12d_gelu(
|
881 |
+
encoder_embeddings: Dict[str, nn.Module],
|
882 |
+
decoder_embeddings: Dict[str, nn.Module],
|
883 |
+
**kwargs):
|
884 |
+
model = FourM(
|
885 |
+
encoder_embeddings=encoder_embeddings,
|
886 |
+
decoder_embeddings=decoder_embeddings,
|
887 |
+
encoder_depth=12,
|
888 |
+
decoder_depth=12,
|
889 |
+
dim=768,
|
890 |
+
num_heads=12,
|
891 |
+
mlp_ratio=4,
|
892 |
+
qkv_bias=True,
|
893 |
+
norm_layer=partial(nn.LayerNorm, eps=1e-6),
|
894 |
+
**kwargs
|
895 |
+
)
|
896 |
+
return model
|
897 |
+
|
898 |
+
|
899 |
+
@register_model
|
900 |
+
def fm_large_24e_24d_gelu(
|
901 |
+
encoder_embeddings: Dict[str, nn.Module],
|
902 |
+
decoder_embeddings: Dict[str, nn.Module],
|
903 |
+
**kwargs):
|
904 |
+
model = FourM(
|
905 |
+
encoder_embeddings=encoder_embeddings,
|
906 |
+
decoder_embeddings=decoder_embeddings,
|
907 |
+
encoder_depth=24,
|
908 |
+
decoder_depth=24,
|
909 |
+
dim=1024,
|
910 |
+
num_heads=16,
|
911 |
+
mlp_ratio=4,
|
912 |
+
qkv_bias=True,
|
913 |
+
norm_layer=partial(nn.LayerNorm, eps=1e-6),
|
914 |
+
**kwargs
|
915 |
+
)
|
916 |
+
return model
|
917 |
+
|
918 |
+
@register_model
|
919 |
+
def fm_xlarge_24e_24d_gelu(
|
920 |
+
encoder_embeddings: Dict[str, nn.Module],
|
921 |
+
decoder_embeddings: Dict[str, nn.Module],
|
922 |
+
**kwargs):
|
923 |
+
model = FourM(
|
924 |
+
encoder_embeddings=encoder_embeddings,
|
925 |
+
decoder_embeddings=decoder_embeddings,
|
926 |
+
encoder_depth=24,
|
927 |
+
decoder_depth=24,
|
928 |
+
dim=2048,
|
929 |
+
num_heads=32,
|
930 |
+
mlp_ratio=4,
|
931 |
+
qkv_bias=True,
|
932 |
+
norm_layer=partial(nn.LayerNorm, eps=1e-6),
|
933 |
+
**kwargs
|
934 |
+
)
|
935 |
+
return model
|
936 |
+
|
937 |
+
|
938 |
+
# SwiGLU variants
|
939 |
+
@register_model
|
940 |
+
def fm_tiny_6e_6d_swiglu_nobias(
|
941 |
+
encoder_embeddings: Dict[str, nn.Module],
|
942 |
+
decoder_embeddings: Dict[str, nn.Module],
|
943 |
+
**kwargs):
|
944 |
+
model = FourM(
|
945 |
+
encoder_embeddings=encoder_embeddings,
|
946 |
+
decoder_embeddings=decoder_embeddings,
|
947 |
+
encoder_depth=6,
|
948 |
+
decoder_depth=6,
|
949 |
+
dim=384,
|
950 |
+
num_heads=6,
|
951 |
+
mlp_ratio=4,
|
952 |
+
qkv_bias=False,
|
953 |
+
proj_bias=False,
|
954 |
+
mlp_bias=False,
|
955 |
+
norm_layer=partial(LayerNorm, eps=1e-6, bias=False),
|
956 |
+
act_layer=nn.SiLU,
|
957 |
+
gated_mlp=True,
|
958 |
+
**kwargs
|
959 |
+
)
|
960 |
+
return model
|
961 |
+
|
962 |
+
|
963 |
+
@register_model
|
964 |
+
def fm_small_8e_8d_swiglu_nobias(
|
965 |
+
encoder_embeddings: Dict[str, nn.Module],
|
966 |
+
decoder_embeddings: Dict[str, nn.Module],
|
967 |
+
**kwargs):
|
968 |
+
model = FourM(
|
969 |
+
encoder_embeddings=encoder_embeddings,
|
970 |
+
decoder_embeddings=decoder_embeddings,
|
971 |
+
encoder_depth=8,
|
972 |
+
decoder_depth=8,
|
973 |
+
dim=512,
|
974 |
+
num_heads=8,
|
975 |
+
mlp_ratio=4,
|
976 |
+
qkv_bias=False,
|
977 |
+
proj_bias=False,
|
978 |
+
mlp_bias=False,
|
979 |
+
norm_layer=partial(LayerNorm, eps=1e-6, bias=False),
|
980 |
+
act_layer=nn.SiLU,
|
981 |
+
gated_mlp=True,
|
982 |
+
**kwargs
|
983 |
+
)
|
984 |
+
return model
|
985 |
+
|
986 |
+
|
987 |
+
@register_model
|
988 |
+
def fm_base_12e_12d_swiglu_nobias(
|
989 |
+
encoder_embeddings: Dict[str, nn.Module],
|
990 |
+
decoder_embeddings: Dict[str, nn.Module],
|
991 |
+
**kwargs):
|
992 |
+
model = FourM(
|
993 |
+
encoder_embeddings=encoder_embeddings,
|
994 |
+
decoder_embeddings=decoder_embeddings,
|
995 |
+
encoder_depth=12,
|
996 |
+
decoder_depth=12,
|
997 |
+
dim=768,
|
998 |
+
num_heads=12,
|
999 |
+
mlp_ratio=4,
|
1000 |
+
qkv_bias=False,
|
1001 |
+
proj_bias=False,
|
1002 |
+
mlp_bias=False,
|
1003 |
+
norm_layer=partial(LayerNorm, eps=1e-6, bias=False),
|
1004 |
+
act_layer=nn.SiLU,
|
1005 |
+
gated_mlp=True,
|
1006 |
+
**kwargs
|
1007 |
+
)
|
1008 |
+
return model
|
1009 |
+
|
1010 |
+
@register_model
|
1011 |
+
def fm_large_24e_24d_swiglu_nobias(
|
1012 |
+
encoder_embeddings: Dict[str, nn.Module],
|
1013 |
+
decoder_embeddings: Dict[str, nn.Module],
|
1014 |
+
**kwargs):
|
1015 |
+
model = FourM(
|
1016 |
+
encoder_embeddings=encoder_embeddings,
|
1017 |
+
decoder_embeddings=decoder_embeddings,
|
1018 |
+
encoder_depth=24,
|
1019 |
+
decoder_depth=24,
|
1020 |
+
dim=1024,
|
1021 |
+
num_heads=16,
|
1022 |
+
mlp_ratio=4,
|
1023 |
+
qkv_bias=False,
|
1024 |
+
proj_bias=False,
|
1025 |
+
mlp_bias=False,
|
1026 |
+
norm_layer=partial(LayerNorm, eps=1e-6, bias=False),
|
1027 |
+
act_layer=nn.SiLU,
|
1028 |
+
gated_mlp=True,
|
1029 |
+
**kwargs
|
1030 |
+
)
|
1031 |
+
return model
|
1032 |
+
|
1033 |
+
@register_model
|
1034 |
+
def fm_xlarge_24e_24d_swiglu_nobias(
|
1035 |
+
encoder_embeddings: Dict[str, nn.Module],
|
1036 |
+
decoder_embeddings: Dict[str, nn.Module],
|
1037 |
+
**kwargs):
|
1038 |
+
model = FourM(
|
1039 |
+
encoder_embeddings=encoder_embeddings,
|
1040 |
+
decoder_embeddings=decoder_embeddings,
|
1041 |
+
encoder_depth=24,
|
1042 |
+
decoder_depth=24,
|
1043 |
+
dim=2048,
|
1044 |
+
num_heads=32,
|
1045 |
+
mlp_ratio=4,
|
1046 |
+
qkv_bias=False,
|
1047 |
+
proj_bias=False,
|
1048 |
+
mlp_bias=False,
|
1049 |
+
norm_layer=partial(LayerNorm, eps=1e-6, bias=False),
|
1050 |
+
act_layer=nn.SiLU,
|
1051 |
+
gated_mlp=True,
|
1052 |
+
**kwargs
|
1053 |
+
)
|
1054 |
+
return model
|
1055 |
+
|
1056 |
+
# SwiGLU + QKNorm variants
|
1057 |
+
|
1058 |
+
|
1059 |
+
@register_model
|
1060 |
+
def fm_base_12e_12d_swiglu_qknorm_nobias(
|
1061 |
+
encoder_embeddings: Dict[str, nn.Module],
|
1062 |
+
decoder_embeddings: Dict[str, nn.Module],
|
1063 |
+
**kwargs):
|
1064 |
+
model = FourM(
|
1065 |
+
encoder_embeddings=encoder_embeddings,
|
1066 |
+
decoder_embeddings=decoder_embeddings,
|
1067 |
+
encoder_depth=12,
|
1068 |
+
decoder_depth=12,
|
1069 |
+
dim=768,
|
1070 |
+
num_heads=12,
|
1071 |
+
mlp_ratio=4,
|
1072 |
+
qkv_bias=False,
|
1073 |
+
proj_bias=False,
|
1074 |
+
mlp_bias=False,
|
1075 |
+
norm_layer=partial(LayerNorm, eps=1e-6, bias=False),
|
1076 |
+
act_layer=nn.SiLU,
|
1077 |
+
gated_mlp=True,
|
1078 |
+
qk_norm=True,
|
1079 |
+
**kwargs
|
1080 |
+
)
|
1081 |
+
return model
|
1082 |
+
|
1083 |
+
|
1084 |
+
@register_model
|
1085 |
+
def fm_large_24e_24d_swiglu_qknorm_nobias(
|
1086 |
+
encoder_embeddings: Dict[str, nn.Module],
|
1087 |
+
decoder_embeddings: Dict[str, nn.Module],
|
1088 |
+
**kwargs):
|
1089 |
+
model = FourM(
|
1090 |
+
encoder_embeddings=encoder_embeddings,
|
1091 |
+
decoder_embeddings=decoder_embeddings,
|
1092 |
+
encoder_depth=24,
|
1093 |
+
decoder_depth=24,
|
1094 |
+
dim=1024,
|
1095 |
+
num_heads=16,
|
1096 |
+
mlp_ratio=4,
|
1097 |
+
qkv_bias=False,
|
1098 |
+
proj_bias=False,
|
1099 |
+
mlp_bias=False,
|
1100 |
+
norm_layer=partial(LayerNorm, eps=1e-6, bias=False),
|
1101 |
+
act_layer=nn.SiLU,
|
1102 |
+
gated_mlp=True,
|
1103 |
+
qk_norm=True,
|
1104 |
+
**kwargs
|
1105 |
+
)
|
1106 |
+
return model
|
1107 |
+
|
1108 |
+
@register_model
|
1109 |
+
def fm_xlarge_24e_24d_swiglu_qknorm_nobias(
|
1110 |
+
encoder_embeddings: Dict[str, nn.Module],
|
1111 |
+
decoder_embeddings: Dict[str, nn.Module],
|
1112 |
+
**kwargs):
|
1113 |
+
model = FourM(
|
1114 |
+
encoder_embeddings=encoder_embeddings,
|
1115 |
+
decoder_embeddings=decoder_embeddings,
|
1116 |
+
encoder_depth=24,
|
1117 |
+
decoder_depth=24,
|
1118 |
+
dim=2048,
|
1119 |
+
num_heads=32,
|
1120 |
+
mlp_ratio=4,
|
1121 |
+
qkv_bias=False,
|
1122 |
+
proj_bias=False,
|
1123 |
+
mlp_bias=False,
|
1124 |
+
norm_layer=partial(LayerNorm, eps=1e-6, bias=False),
|
1125 |
+
act_layer=nn.SiLU,
|
1126 |
+
gated_mlp=True,
|
1127 |
+
qk_norm=True,
|
1128 |
+
**kwargs
|
1129 |
+
)
|
1130 |
+
return model
|
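For a rough sanity check of the trunk sizes registered above, the constructors can also be called directly. A sketch under the assumption that passing empty embedding dicts is acceptable when one only wants to count encoder/decoder parameters (no forward pass is possible without modality embeddings):

from fourm.models.fm import fm_base_12e_12d_swiglu_nobias

model = fm_base_12e_12d_swiglu_nobias(
    encoder_embeddings={}, decoder_embeddings={}, modality_info={})
print(f"{sum(p.numel() for p in model.parameters()) / 1e6:.1f}M trunk parameters")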
fourm/models/fm_utils.py
ADDED
@@ -0,0 +1,387 @@
|
1 |
+
# Copyright 2024 EPFL and Apple Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
# --------------------------------------------------------
|
15 |
+
# Some functions are based on the timm code base
|
16 |
+
# https://github.com/huggingface/pytorch-image-models
|
17 |
+
# --------------------------------------------------------
|
18 |
+
|
19 |
+
import torch
|
20 |
+
import torch.nn as nn
|
21 |
+
import torch.nn.functional as F
|
22 |
+
from einops import rearrange
|
23 |
+
|
24 |
+
|
25 |
+
def pair(t):
|
26 |
+
return t if isinstance(t, tuple) else (t, t)
|
27 |
+
|
28 |
+
def softmax1(tensor):
|
29 |
+
# See https://www.evanmiller.org/attention-is-off-by-one.html
|
30 |
+
return F.pad(tensor, (0,1)).softmax(dim=-1)[...,:-1]
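Unlike a regular softmax, softmax1 implicitly appends a zero logit before normalizing, so attention weights can sum to less than one and approach zero when all logits are very negative. A tiny illustration (uses the softmax1 defined above):

import torch

logits = torch.tensor([[0.0, 0.0, 0.0],
                       [-1e4, -1e4, -1e4]])
print(torch.softmax(logits, dim=-1))  # each row sums to 1
print(softmax1(logits))               # first row sums to 0.75, second row is ~0 everywhere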
|
31 |
+
|
32 |
+
def build_1d_sincos_posemb(max_len, embed_dim=1024, temperature=10000.):
|
33 |
+
"""Sine-cosine positional embeddings from MoCo-v3, adapted back to 1d
|
34 |
+
|
35 |
+
Returns positional embedding of shape (1, N, D)
|
36 |
+
"""
|
37 |
+
arange = torch.arange(max_len, dtype=torch.float32) # Shape (N,)
|
38 |
+
assert embed_dim % 2 == 0, 'Embed dimension must be divisible by 2 for 1D sin-cos position embedding'
|
39 |
+
pos_dim = embed_dim // 2
|
40 |
+
omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim # Shape (D/2,)
|
41 |
+
omega = 1. / (temperature ** omega)
|
42 |
+
out = torch.einsum('n,d->nd', [arange, omega]) # Outer product, shape (N, D/2)
|
43 |
+
pos_emb = torch.cat([torch.sin(out), torch.cos(out)], dim=1).unsqueeze(0) # Shape (1, N, D)
|
44 |
+
return pos_emb
|
45 |
+
|
46 |
+
def build_2d_sincos_posemb(h, w, embed_dim=1024, temperature=10000.0):
|
47 |
+
"""Sine-cosine positional embeddings as used in MoCo-v3
|
48 |
+
|
49 |
+
Returns positional embedding of shape (1, N, D) where N = W*H
|
50 |
+
"""
|
51 |
+
grid_w = torch.arange(w, dtype=torch.float32) # Shape (W,)
|
52 |
+
grid_h = torch.arange(h, dtype=torch.float32) # Shape (H, )
|
53 |
+
grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing='ij') # Shapes (W, H)
|
54 |
+
assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
|
55 |
+
pos_dim = embed_dim // 4
|
56 |
+
omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim # Shape (D/4,)
|
57 |
+
omega = 1. / (temperature ** omega)
|
58 |
+
out_w = torch.einsum('n,d->nd', [grid_w.reshape(-1), omega]) # Outer product, shape (W*H, D/4)
|
59 |
+
out_h = torch.einsum('n,d->nd', [grid_h.reshape(-1), omega]) # Outer product, shape (W*H, D/4)
|
60 |
+
pos_emb = torch.cat([torch.sin(out_w), torch.cos(out_w), torch.sin(out_h), torch.cos(out_h)], dim=1).unsqueeze(0) # Shape (1, W*H, D)
|
61 |
+
return pos_emb
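As an illustrative usage (not part of this file): a 224x224 image tokenized into 16x16 patches yields a 14x14 grid, so the embedding below has shape (1, 196, 768) and is typically registered as a fixed, non-learned buffer:

pos_emb = build_2d_sincos_posemb(h=14, w=14, embed_dim=768)
print(pos_emb.shape)  # torch.Size([1, 196, 768])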
|
62 |
+
|
63 |
+
|
64 |
+
def drop_path(x, drop_prob: float = 0., training: bool = False):
|
65 |
+
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
66 |
+
Implementation from timm: https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py
|
67 |
+
"""
|
68 |
+
if drop_prob == 0. or not training:
|
69 |
+
return x
|
70 |
+
keep_prob = 1 - drop_prob
|
71 |
+
shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
|
72 |
+
random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
|
73 |
+
random_tensor.floor_() # binarize
|
74 |
+
output = x.div(keep_prob) * random_tensor
|
75 |
+
return output
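The division by keep_prob keeps the expected activation unchanged: each sample in the batch is either zeroed entirely or scaled by 1/keep_prob. A quick empirical check (illustrative only, uses the drop_path defined above):

import torch

x = torch.ones(10_000, 8)
out = drop_path(x, drop_prob=0.2, training=True)
print((out == 0).all(dim=1).float().mean())  # ~0.2: fraction of fully dropped samples
print(out.mean())                            # ~1.0: expectation is preserved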
|
76 |
+
|
77 |
+
|
78 |
+
class DropPath(nn.Module):
|
79 |
+
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
80 |
+
"""
|
81 |
+
|
82 |
+
def __init__(self, drop_prob=None):
|
83 |
+
super(DropPath, self).__init__()
|
84 |
+
self.drop_prob = drop_prob
|
85 |
+
|
86 |
+
def forward(self, x):
|
87 |
+
return drop_path(x, self.drop_prob, self.training)
|
88 |
+
|
89 |
+
def extra_repr(self) -> str:
|
90 |
+
return 'p={}'.format(self.drop_prob)
|
91 |
+
|
92 |
+
|
93 |
+
class LayerNorm(nn.Module):
|
94 |
+
"""Custom implementation of LayerNorm with the option to disable the bias term"""
|
95 |
+
def __init__(self, normalized_shape: int, eps=1e-5, bias=True):
|
96 |
+
super().__init__()
|
97 |
+
self.eps = eps
|
98 |
+
self.weight = nn.Parameter(torch.ones(normalized_shape))
|
99 |
+
if bias:
|
100 |
+
self.bias = nn.Parameter(torch.zeros(normalized_shape))
|
101 |
+
else:
|
102 |
+
self.register_buffer("bias", torch.zeros(normalized_shape))
|
103 |
+
|
104 |
+
# Normalized shape must be a tuple for F.layer_norm
|
105 |
+
self.normalized_shape = (normalized_shape,)
|
106 |
+
|
107 |
+
def forward(self, x):
|
108 |
+
return nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, eps=self.eps)
|
109 |
+
|
110 |
+
|
111 |
+
class Mlp(nn.Module):
|
112 |
+
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., bias=True):
|
113 |
+
super().__init__()
|
114 |
+
out_features = out_features or in_features
|
115 |
+
hidden_features = hidden_features or in_features
|
116 |
+
self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
|
117 |
+
self.act = act_layer()
|
118 |
+
self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
|
119 |
+
self.drop = nn.Dropout(drop)
|
120 |
+
|
121 |
+
def forward(self, x):
|
122 |
+
x = self.fc1(x)
|
123 |
+
x = self.act(x)
|
124 |
+
x = self.fc2(x)
|
125 |
+
x = self.drop(x)
|
126 |
+
return x
|
127 |
+
|
128 |
+
|
129 |
+
class GatedMlp(nn.Module):
|
130 |
+
"""Implements SwiGLU and other gated feed-forward layers from Noam Shazeer's paper: https://arxiv.org/abs/2002.05202
|
131 |
+
"""
|
132 |
+
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.SiLU, bias=True):
|
133 |
+
super().__init__()
|
134 |
+
out_features = out_features or in_features
|
135 |
+
# If gated, multiply hidden_dim by 2/3 to account for extra matmul
|
136 |
+
hidden_features = int(2 * (hidden_features or in_features) / 3)
|
137 |
+
self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
|
138 |
+
self.act = act_layer()
|
139 |
+
self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
|
140 |
+
self.fc3 = nn.Linear(in_features, hidden_features, bias=bias)
|
141 |
+
|
142 |
+
def forward(self, x):
|
143 |
+
x = self.fc2(self.act(self.fc1(x)) * self.fc3(x))
|
144 |
+
return x
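The 2/3 scaling of the hidden width keeps the gated block's parameter count on par with the plain Mlp, since it holds three weight matrices instead of two. With in_features=768 and mlp_ratio=4 (hidden 3072, scaled to 2048), both come to 4,718,592 weights; a quick check:

mlp = Mlp(in_features=768, hidden_features=3072, bias=False)
gated = GatedMlp(in_features=768, hidden_features=3072, bias=False)
print(sum(p.numel() for p in mlp.parameters()))    # 4718592 = 2 * 768 * 3072
print(sum(p.numel() for p in gated.parameters()))  # 4718592 = 3 * 768 * 2048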
|
145 |
+
|
146 |
+
|
147 |
+
class Attention(nn.Module):
|
148 |
+
def __init__(self, dim, num_heads=8, qkv_bias=False, proj_bias=True, attn_drop=0., proj_drop=0., allow_zero_attn=False):
|
149 |
+
super().__init__()
|
150 |
+
self.num_heads = num_heads
|
151 |
+
head_dim = dim // num_heads
|
152 |
+
self.scale = head_dim ** -0.5
|
153 |
+
self.allow_zero_attn = allow_zero_attn
|
154 |
+
|
155 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
156 |
+
self.attn_drop = nn.Dropout(attn_drop)
|
157 |
+
self.proj = nn.Linear(dim, dim, bias=proj_bias)
|
158 |
+
self.proj_drop = nn.Dropout(proj_drop)
|
159 |
+
|
160 |
+
def forward(self, x, mask=None):
|
161 |
+
B, N, C = x.shape
|
162 |
+
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
|
163 |
+
q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
|
164 |
+
|
165 |
+
        attn = (q @ k.transpose(-2, -1)) * self.scale

        if mask is not None:
            mask = mask.unsqueeze(1)  # Unsqueeze attention mask for multi-head
            attn = attn.masked_fill(mask, -torch.finfo(attn.dtype).max)

        if self.allow_zero_attn:
            attn = softmax1(attn)
        else:
            attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class CrossAttention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, proj_bias=True, attn_drop=0., proj_drop=0., allow_zero_attn=False):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim ** -0.5
        self.allow_zero_attn = allow_zero_attn

        self.q = nn.Linear(dim, dim, bias=qkv_bias)
        self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, context, mask=None):
        B, N, C = x.shape
        _, M, _ = context.shape

        q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
        kv = self.kv(context).reshape(B, M, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        k, v = kv[0], kv[1]

        attn = (q @ k.transpose(-2, -1)) * self.scale
        if mask is not None:
            mask = rearrange(mask, "b n m -> b 1 n m")  # Unsqueeze / reshape for multi-head
            attn = attn.masked_fill(mask, -torch.finfo(attn.dtype).max)

        if self.allow_zero_attn:
            attn = softmax1(attn)
        else:
            attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class NormAttention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, proj_bias=True, norm_layer=nn.LayerNorm, attn_drop=0., proj_drop=0., allow_zero_attn=False):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim ** -0.5
        self.allow_zero_attn = allow_zero_attn

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)

        self.q_norm = norm_layer(head_dim)
        self.k_norm = norm_layer(head_dim)

    def forward(self, x, mask=None):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)   # make torchscript happy (cannot use tensor as tuple)

        q = self.q_norm(q)
        k = self.k_norm(k)

        attn = (q @ k.transpose(-2, -1)) * self.scale

        if mask is not None:
            mask = mask.unsqueeze(1)  # Unsqueeze for multi-head
            attn = attn.masked_fill(mask, -torch.finfo(attn.dtype).max)

        if self.allow_zero_attn:
            attn = softmax1(attn)
        else:
            attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class NormCrossAttention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, proj_bias=True, norm_layer=nn.LayerNorm, attn_drop=0., proj_drop=0., allow_zero_attn=False):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim ** -0.5
        self.allow_zero_attn = allow_zero_attn

        self.q = nn.Linear(dim, dim, bias=qkv_bias)
        self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)

        self.q_norm = norm_layer(head_dim)
        self.k_norm = norm_layer(head_dim)

    def forward(self, x, context, mask=None):
        B, N, C = x.shape
        _, M, _ = context.shape

        q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
        kv = self.kv(context).reshape(B, M, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        k, v = kv[0], kv[1]

        q = self.q_norm(q)
        k = self.k_norm(k)

        attn = (q @ k.transpose(-2, -1)) * self.scale
        if mask is not None:
            mask = rearrange(mask, "b n m -> b 1 n m")  # Unsqueeze / reshape for multi-head
            attn = attn.masked_fill(mask, -torch.finfo(attn.dtype).max)

        if self.allow_zero_attn:
            attn = softmax1(attn)
        else:
            attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class Block(nn.Module):

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=True, proj_bias=True, mlp_bias=True, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, gated_mlp=False, qk_norm=False, allow_zero_attn=False):
        super().__init__()
        self.norm1 = norm_layer(dim)

        if not qk_norm:
            self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, proj_bias=proj_bias, attn_drop=attn_drop, proj_drop=drop, allow_zero_attn=allow_zero_attn)
        else:
            self.attn = NormAttention(dim, num_heads=num_heads, qkv_bias=qkv_bias, proj_bias=proj_bias, norm_layer=norm_layer, attn_drop=attn_drop, proj_drop=drop, allow_zero_attn=allow_zero_attn)

        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)

        if not gated_mlp:
            self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, bias=mlp_bias, drop=drop)
        else:
            self.mlp = GatedMlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, bias=mlp_bias)

    def forward(self, x, mask=None):
        x = x + self.drop_path(self.attn(self.norm1(x), mask))
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x


class DecoderBlock(nn.Module):
    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=True, proj_bias=True, mlp_bias=True, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, gated_mlp=False, qk_norm=False, allow_zero_attn=False):
        super().__init__()
        self.norm1 = norm_layer(dim)

        if not qk_norm:
            self.self_attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, proj_bias=proj_bias, attn_drop=attn_drop, proj_drop=drop, allow_zero_attn=allow_zero_attn)
            self.cross_attn = CrossAttention(dim, num_heads=num_heads, qkv_bias=qkv_bias, proj_bias=proj_bias, attn_drop=attn_drop, proj_drop=drop, allow_zero_attn=allow_zero_attn)
        else:
            self.self_attn = NormAttention(dim, num_heads=num_heads, qkv_bias=qkv_bias, proj_bias=proj_bias, norm_layer=norm_layer, attn_drop=attn_drop, proj_drop=drop, allow_zero_attn=allow_zero_attn)
            self.cross_attn = NormCrossAttention(dim, num_heads=num_heads, qkv_bias=qkv_bias, proj_bias=proj_bias, norm_layer=norm_layer, attn_drop=attn_drop, proj_drop=drop, allow_zero_attn=allow_zero_attn)

        self.query_norm = norm_layer(dim)
        self.context_norm = norm_layer(dim)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)

        if not gated_mlp:
            self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, bias=mlp_bias, drop=drop)
        else:
            self.mlp = GatedMlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, bias=mlp_bias)

    def forward(self, x, context, sa_mask=None, xa_mask=None):
        x = x + self.drop_path(self.self_attn(self.norm1(x), sa_mask))
        x = x + self.drop_path(self.cross_attn(self.query_norm(x), self.context_norm(context), xa_mask))
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x


class CrossAttentionBlock(nn.Module):
    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, gated_mlp=False, allow_zero_attn=False):
        super().__init__()
        self.cross_attn = CrossAttention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop, allow_zero_attn=allow_zero_attn)
        self.query_norm = norm_layer(dim)
        self.context_norm = norm_layer(dim)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        if not gated_mlp:
            self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
        else:
            self.mlp = GatedMlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer)

    def forward(self, x, context, xa_mask=None, **kwargs):
        x = x + self.drop_path(self.cross_attn(self.query_norm(x), self.context_norm(context), xa_mask))
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x
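The block classes above take boolean attention masks in which True marks positions that are masked out (they are filled with a large negative value before the softmax, see the masked_fill calls). A minimal usage sketch, not part of the committed files, assuming the fourm.models.fm_utils import path from this commit and arbitrary toy shapes:

import torch
from fourm.models.fm_utils import Block, DecoderBlock

dim, num_heads = 768, 12
x = torch.randn(2, 16, dim)    # (B, N, D) query tokens
ctx = torch.randn(2, 32, dim)  # (B, M, D) context tokens for cross-attention

# Encoder-style block: self-attention only. An all-False mask masks nothing out.
enc_block = Block(dim=dim, num_heads=num_heads)
sa_mask = torch.zeros(2, 16, 16, dtype=torch.bool)
y = enc_block(x, sa_mask)      # -> (2, 16, 768)

# Decoder-style block: self-attention over x, then cross-attention into ctx.
dec_block = DecoderBlock(dim=dim, num_heads=num_heads)
xa_mask = torch.zeros(2, 16, 32, dtype=torch.bool)  # queries may attend to all context tokens
z = dec_block(x, ctx, sa_mask=sa_mask, xa_mask=xa_mask)  # -> (2, 16, 768)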
fourm/models/fm_vit.py
ADDED
@@ -0,0 +1,485 @@
# Copyright 2024 EPFL and Apple Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import copy
from functools import partial
from typing import Optional, Union

import torch
from torch import nn

from fourm.utils.timm.registry import register_model
from huggingface_hub import PyTorchModelHubMixin

from .encoder_embeddings import ImageEncoderEmbedding
from .fm_utils import Block, LayerNorm
from fourm.data.modality_info import MODALITY_INFO


__all__ = [
    # GELU models
    'fm_vit_tiny_6e_gelu',
    'fm_vit_small_8e_gelu',
    'fm_vit_base_12e_gelu',
    'fm_vit_large_24e_gelu',
    'fm_vit_xlarge_24e_gelu',
    # SwiGLU models
    'fm_vit_tiny_6e_swiglu_nobias',
    'fm_vit_small_8e_swiglu_nobias',
    'fm_vit_base_12e_swiglu_nobias',
    'fm_vit_large_24e_swiglu_nobias',
    'fm_vit_xlarge_24e_swiglu_nobias',
    # SwiGLU + QKNorm models
    'fm_vit_base_12e_swiglu_qknorm_nobias',
    'fm_vit_large_24e_swiglu_qknorm_nobias',
    'fm_vit_xlarge_24e_swiglu_qknorm_nobias',
]

class FourMViT(nn.Module):
    """Modified 4M model, adapted to behave as a simple RGB-only ViT.

    Args:
        img_size (int): Input image size.
        patch_size (int): Patch size.
        in_chans (int): Number of input image channels.
        dim (int): Patch embedding dimension.
        encoder_depth (int): Depth of ViT / number of encoder blocks.
        num_heads (int): Number of attention heads in each ViT block.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool): If True, add a learnable bias to query, key, value.
        proj_bias (bool): If True, adds a bias to the attention out proj layer.
        mlp_bias (bool): If True, adds a learnable bias for the feedforward.
        drop_path_rate (float): Stochastic depth rate.
        drop_rate (float): Dropout rate.
        attn_drop_rate (float): Attention dropout rate.
        act_layer (nn.Module): Activation layer.
        norm_layer (nn.Module): Normalization layer.
        gated_mlp (bool): If True, makes the feedforward gated (e.g., for SwiGLU)
        qk_norm (bool): If True, normalizes the query and keys (as in ViT-22B)
        use_act_checkpoint (bool): If True, use activation checkpointing.
        encoder_norm (bool): If True, adds a norm layer after the last encoder block.
        output_head (Optional[nn.Module]): Optional output head after the encoder
    """
    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        dim=768,
        encoder_depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        qkv_bias: bool = True,
        proj_bias: bool = True,
        mlp_bias: bool = True,
        drop_path_rate: float = 0.0,
        drop_rate: float = 0.0,
        attn_drop_rate: float = 0.0,
        act_layer: nn.Module = nn.GELU,
        norm_layer: Union[partial, nn.Module] = partial(LayerNorm, eps=1e-6),
        gated_mlp: bool = False,  # Make the feedforward gated for e.g. SwiGLU
        qk_norm: bool = False,
        encoder_norm=True,
        output_head: Optional[nn.Module] = None,
    ):
        super().__init__()
        self.img_size = img_size
        self.init_std = 0.02
        rgb_embedding = ImageEncoderEmbedding(num_channels=in_chans, patch_size=patch_size,
                                              dim_tokens=dim, sincos_pos_emb=True, image_size=img_size)
        self.num_patches = rgb_embedding.num_patches
        self.encoder_embeddings = nn.ModuleDict({f"rgb@{img_size}": rgb_embedding})

        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, encoder_depth)]

        self.encoder = nn.ModuleList([
            Block(dim=dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, proj_bias=proj_bias, mlp_bias=mlp_bias,
                  drop_path=dpr[i], drop=drop_rate, attn_drop=attn_drop_rate, act_layer=act_layer, norm_layer=norm_layer,
                  gated_mlp=gated_mlp, qk_norm=qk_norm)
            for i in range(encoder_depth)
        ])

        self.encoder_norm = norm_layer(dim) if encoder_norm else nn.Identity()

        # Weight init
        self.init_weights()

        # Classification head is initialized after init_weights() to allow for special init scale
        if output_head is not None:
            self.output_head = output_head
            if hasattr(self.output_head, 'init'):
                self.output_head.init(dim)
        else:
            self.output_head = nn.Identity()

    def init_weights(self):
        """Weight initialization following MAE's initialization scheme"""

        for name, m in self.named_modules():
            # Skipping tokenizers to avoid reinitializing them
            if "tokenizer" in name:
                continue
            # Linear
            elif isinstance(m, nn.Linear):
                if 'qkv' in name:
                    # treat the weights of Q, K, V separately
                    val = math.sqrt(6. / float(m.weight.shape[0] // 3 + m.weight.shape[1]))
                    nn.init.uniform_(m.weight, -val, val)
                elif 'kv' in name:
                    # treat the weights of K, V separately
                    val = math.sqrt(6. / float(m.weight.shape[0] // 2 + m.weight.shape[1]))
                    nn.init.uniform_(m.weight, -val, val)
                else:
                    nn.init.xavier_uniform_(m.weight)
                if isinstance(m, nn.Linear) and m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            # LayerNorm
            elif isinstance(m, nn.LayerNorm) or isinstance(m, LayerNorm):
                nn.init.constant_(m.weight, 1.0)
                nn.init.constant_(m.bias, 0)

            # Embedding
            elif isinstance(m, nn.Embedding):
                nn.init.normal_(m.weight, std=self.init_std)
            # Conv2d
            elif isinstance(m, nn.Conv2d):
                if '.proj' in name:
                    # From MAE, initialize projection like nn.Linear (instead of nn.Conv2d)
                    w = m.weight.data
                    nn.init.xavier_uniform_(w.view([w.shape[0], -1]))

    def get_num_layers_encoder(self):
        return len(self.encoder)

    def get_num_layers(self):
        return self.get_num_layers_encoder()

    @torch.jit.ignore
    def no_weight_decay(self):
        no_wd_set = set()

        for mod, emb_module in self.encoder_embeddings.items():
            if hasattr(emb_module, 'no_weight_decay'):
                to_skip = emb_module.no_weight_decay()
                to_skip = set([f'encoder_embeddings.{mod}.{name}' for name in to_skip])
                no_wd_set = no_wd_set | to_skip

        return no_wd_set


    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass of the model.

        Args:
            x (torch.Tensor): Input tensor. Shape (B, C, H, W)

        Returns:
            torch.Tensor: Output tensor. Shape (B, num_classes).
        """
        rgb_dict = {'tensor': x}
        rgb_dict = self.encoder_embeddings[f'rgb@{self.img_size}'](rgb_dict)

        # Add embeddings to patchified RGB image
        x = rgb_dict['x'] + rgb_dict['emb']  # Shape: (B, N, D) with N = num_patches

        for blk in self.encoder:
            x = blk(x)

        x = self.encoder_norm(x)  # Shape: (B, N, D)

        out = self.output_head(x)

        return out


    def freeze_encoder(self, freeze_embeddings=True):
        for param in self.encoder.parameters():
            param.requires_grad = False

        for param in self.encoder_norm.parameters():
            param.requires_grad = False

        if freeze_embeddings:
            for param in self.encoder_embeddings.parameters():
                param.requires_grad = False

    def unfreeze_encoder(self, unfreeze_embeddings=True):
        for param in self.encoder.parameters():
            param.requires_grad = True

        for param in self.encoder_norm.parameters():
            param.requires_grad = True

        if unfreeze_embeddings:
            for param in self.encoder_embeddings.parameters():
                param.requires_grad = True


################################################

# Wrapper for easy loading with Huggingface Hub

class FMViT(FourMViT, PyTorchModelHubMixin):
    """Wrapper around FourMViT for easy loading with Huggingface Hub.

    Args:
        config (dict): Dictionary containing the model and modality configuration,
            used for loading from Huggingface Hub.
        output_head (nn.Module): Optional output head.
    """
    def __init__(self, config: dict, output_head: Optional[nn.Module] = None):

        config = copy.deepcopy(config)

        config['norm_layer'] = partial(LayerNorm, eps=1e-6, bias=config['norm_bias'])
        config['act_layer'] = getattr(torch.nn, config['act_layer'])

        img_size = config['image_size']
        config['img_size'] = img_size
        config['patch_size'] = MODALITY_INFO[f'rgb@{img_size}'].get('patch_size', config['patch_size'])
        config['in_chans'] = MODALITY_INFO[f'rgb@{img_size}'].get('num_channels', 3)

        for key in ['image_size', 'norm_bias', 'domains_in', 'domains_out', 'decoder_depth', 'share_modality_embeddings']:
            if key in config:
                del config[key]

        super().__init__(
            output_head=output_head,
            **config
        )


################################################

# Model definitions

# GELU variants
@register_model
def fm_vit_tiny_6e_gelu(**kwargs):
    model = FourMViT(
        encoder_depth=6,
        dim=384,
        num_heads=6,
        mlp_ratio=4,
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        **kwargs
    )
    return model


@register_model
def fm_vit_small_8e_gelu(**kwargs):
    model = FourMViT(
        encoder_depth=8,
        dim=512,
        num_heads=8,
        mlp_ratio=4,
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        **kwargs
    )
    return model


@register_model
def fm_vit_base_12e_gelu(**kwargs):
    model = FourMViT(
        encoder_depth=12,
        dim=768,
        num_heads=12,
        mlp_ratio=4,
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        **kwargs
    )
    return model


@register_model
def fm_vit_large_24e_gelu(**kwargs):
    model = FourMViT(
        encoder_depth=24,
        dim=1024,
        num_heads=16,
        mlp_ratio=4,
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        **kwargs
    )
    return model

@register_model
def fm_vit_xlarge_24e_gelu(**kwargs):
    model = FourMViT(
        encoder_depth=24,
        dim=2048,
        num_heads=32,
        mlp_ratio=4,
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        **kwargs
    )
    return model


# SwiGLU variants
@register_model
def fm_vit_tiny_6e_swiglu_nobias(**kwargs):
    model = FourMViT(
        encoder_depth=6,
        dim=384,
        num_heads=6,
        mlp_ratio=4,
        qkv_bias=False,
        proj_bias=False,
        mlp_bias=False,
        norm_layer=partial(LayerNorm, eps=1e-6, bias=False),
        act_layer=nn.SiLU,
        gated_mlp=True,
        **kwargs
    )
    return model


@register_model
def fm_vit_small_8e_swiglu_nobias(**kwargs):
    model = FourMViT(
        encoder_depth=8,
        dim=512,
        num_heads=8,
        mlp_ratio=4,
        qkv_bias=False,
        proj_bias=False,
        mlp_bias=False,
        norm_layer=partial(LayerNorm, eps=1e-6, bias=False),
        act_layer=nn.SiLU,
        gated_mlp=True,
        **kwargs
    )
    return model


@register_model
def fm_vit_base_12e_swiglu_nobias(**kwargs):
    model = FourMViT(
        encoder_depth=12,
        dim=768,
        num_heads=12,
        mlp_ratio=4,
        qkv_bias=False,
        proj_bias=False,
        mlp_bias=False,
        norm_layer=partial(LayerNorm, eps=1e-6, bias=False),
        act_layer=nn.SiLU,
        gated_mlp=True,
        **kwargs
    )
    return model


@register_model
def fm_vit_large_24e_swiglu_nobias(**kwargs):
    model = FourMViT(
        encoder_depth=24,
        dim=1024,
        num_heads=16,
        mlp_ratio=4,
        qkv_bias=False,
        proj_bias=False,
        mlp_bias=False,
        norm_layer=partial(LayerNorm, eps=1e-6, bias=False),
        act_layer=nn.SiLU,
        gated_mlp=True,
        **kwargs
    )
    return model

@register_model
def fm_vit_xlarge_24e_swiglu_nobias(**kwargs):
    model = FourMViT(
        encoder_depth=24,
        dim=2048,
        num_heads=32,
        mlp_ratio=4,
        qkv_bias=False,
        proj_bias=False,
        mlp_bias=False,
        norm_layer=partial(LayerNorm, eps=1e-6, bias=False),
        act_layer=nn.SiLU,
        gated_mlp=True,
        **kwargs
    )
    return model

# SwiGLU + QKNorm variants

@register_model
def fm_vit_base_12e_swiglu_qknorm_nobias(**kwargs):
    model = FourMViT(
        encoder_depth=12,
        dim=768,
        num_heads=12,
        mlp_ratio=4,
        qkv_bias=False,
        proj_bias=False,
        mlp_bias=False,
        norm_layer=partial(LayerNorm, eps=1e-6, bias=False),
        act_layer=nn.SiLU,
        gated_mlp=True,
        qk_norm=True,
        **kwargs
    )
    return model


@register_model
def fm_vit_large_24e_swiglu_qknorm_nobias(**kwargs):
    model = FourMViT(
        encoder_depth=24,
        dim=1024,
        num_heads=16,
        mlp_ratio=4,
        qkv_bias=False,
        proj_bias=False,
        mlp_bias=False,
        norm_layer=partial(LayerNorm, eps=1e-6, bias=False),
        act_layer=nn.SiLU,
        gated_mlp=True,
        qk_norm=True,
        **kwargs
    )
    return model

@register_model
def fm_vit_xlarge_24e_swiglu_qknorm_nobias(**kwargs):
    model = FourMViT(
        encoder_depth=24,
        dim=2048,
        num_heads=32,
        mlp_ratio=4,
        qkv_bias=False,
        proj_bias=False,
        mlp_bias=False,
        norm_layer=partial(LayerNorm, eps=1e-6, bias=False),
        act_layer=nn.SiLU,
        gated_mlp=True,
        qk_norm=True,
        **kwargs
    )
    return model
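A second illustrative sketch, not part of the committed files: instantiating one of the registered RGB-only ViT variants defined above. The linear head, class count, and input resolution are placeholder assumptions; note that output_head is applied to the full token sequence, so a pooling head would be needed for per-image logits.

import torch
from torch import nn
from fourm.models.fm_vit import fm_vit_base_12e_gelu

head = nn.Linear(768, 1000)  # hypothetical per-token classification head
model = fm_vit_base_12e_gelu(output_head=head)

images = torch.randn(2, 3, 224, 224)  # (B, C, H, W) at the default 224px resolution
token_logits = model(images)          # (2, 196, 1000): one prediction per 16x16 patch token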
fourm/models/generate.py
ADDED
@@ -0,0 +1,1273 @@
# Copyright 2024 EPFL and Apple Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
from typing import Union, List, Optional

import numpy as np
import torch
from einops import rearrange, repeat
from torch import nn
import torch.nn.functional as F

from fourm.utils import get_sentinel_to_id_mapping, merge_span_masking
from fourm.utils.generation import cosine_schedule, linear_schedule, onex_temp_schedule, linear_temp_schedule, continue_schedule
from tqdm import tqdm
import copy



def empty_img_modality(mod_dict, key):
    # Input mask
    mod_dict[key]['input_mask'][:] = True

    # Target Mask
    mod_dict[key]['target_mask'][:] = False

    return mod_dict

def empty_seq_modality(mod_dict, key, s1_id=5):
    # To create an empty sequence, we suppose an input budget of 1, and the rest assigned to targets

    # Input tensor
    # Input is [S_1], target is [S_1] ...... [S_2]
    # (so [S_1] [S_1] ..... [S_2] when combined)
    mod_dict[key]['tensor'][:] = 0
    mod_dict[key]['tensor'][:,[0,1]] = s1_id  # s1_id is id of the first sentinel token ([S_1])
    mod_dict[key]['tensor'][:,-1] = s1_id + 1

    # Input mask
    # Set first token to input (i.e. 0), rest to target (i.e. 1)
    mod_dict[key]['input_mask'][:] = True
    mod_dict[key]['input_mask'][:,0] = False

    # Target Mask
    mod_dict[key]['target_mask'] = ~mod_dict[key]['input_mask']

    # Decoder attn mask
    # WARNING: Not needed / used in GenerationSampler, where causal mask is enforced
    # First token is input, not part of target
    mod_dict[key]['decoder_attention_mask'][:] = 1
    mod_dict[key]['decoder_attention_mask'][:, 0] = 0

    return mod_dict

def empty_seq_emb_modality(mod_dict, key):
    # Tensor
    mod_dict[key]['tensor'] = torch.zeros_like(mod_dict[key]['tensor'])

    # Input mask
    mod_dict[key]['input_mask'] = torch.ones_like(mod_dict[key]['input_mask'])
    # It is crucial to specify the input mask as such, CFG won't work otherwise!
    mod_dict[key]['input_mask'][:, 0] = False

    # Target Mask
    mod_dict[key]['target_mask'] = torch.ones_like(mod_dict[key]['target_mask'])

    # Decoder attn mask
    mod_dict[key]['decoder_attention_mask'][:] = False

    return mod_dict


def init_empty_target_modality(mod_dict, modality_info, domain, batch_size, num_tokens, device):
    """
    Initializes an empty target modality dictionary for a given domain.
    Used to initialize target modality dictionaries for generation.
    """
    if modality_info[domain]['type'] == 'img':
        # Initialize mod dict
        mod_dict[domain] = {
            'tensor': torch.zeros((batch_size, num_tokens), dtype=torch.int64, device=device),
            'input_mask': torch.ones((batch_size, num_tokens), dtype=torch.bool, device=device),
            'target_mask': torch.zeros((batch_size, num_tokens), dtype=torch.bool, device=device),
        }
        # Set it to the correct values
        mod_dict = empty_img_modality(mod_dict, domain)

    elif modality_info[domain]['type'] in ['seq', 'seq_token', 'seq_emb']:
        # Initialize mod dict
        num_tokens = max(num_tokens, 2)
        mod_dict[domain] = {
            'tensor': torch.zeros((batch_size, num_tokens), dtype=torch.int32, device=device),
            'input_mask': torch.ones((batch_size, num_tokens), dtype=torch.bool, device=device),
            'target_mask': torch.zeros((batch_size, num_tokens), dtype=torch.bool, device=device),
            'decoder_attention_mask': torch.zeros((batch_size, num_tokens), dtype=torch.bool, device=device),
        }
        # Set it to the correct values
        if modality_info[domain]['type'] in ['seq', 'seq_token']:
            mod_dict = empty_seq_modality(mod_dict, domain)
        elif modality_info[domain]['type'] == 'seq_emb':
            mod_dict = empty_seq_emb_modality(mod_dict, domain)
    else:
        raise ValueError()

    return mod_dict

def init_full_input_modality(mod_dict, modality_info, domain, device, eos_id=3):
    if domain.startswith('rgb'):
        batch_size, _, H, W = mod_dict[domain]['tensor'].shape
        patch_size = modality_info[domain]['patch_size']
        num_tokens = (H // patch_size) * (W // patch_size)
        shape = (batch_size, num_tokens)
    else:
        shape = mod_dict[domain]['tensor'].shape
    if 'input_mask' not in mod_dict[domain]:
        mod_dict[domain]['input_mask'] = torch.zeros(shape, dtype=torch.bool, device=device)
    if 'target_mask' not in mod_dict[domain]:
        mod_dict[domain]['target_mask'] = torch.ones(shape, dtype=torch.bool, device=device)
    if 'decoder_attention_mask' not in mod_dict[domain]:
        mod_dict[domain]['decoder_attention_mask'] = torch.zeros(shape, dtype=torch.bool, device=device)

    if modality_info[domain]['type'] == 'img':
        mod_dict[domain]['input_mask'][:] = False
        mod_dict[domain]['target_mask'][:] = True

    elif modality_info[domain]['type'] in ['seq', 'seq_token']:
        if eos_id in mod_dict[domain]['tensor']:
            eos_idx = torch.where(mod_dict[domain]['tensor'] == eos_id)[1][0].item()
        else:
            mod_dict[domain]['tensor'][:,0] = eos_id
            eos_idx = 0
        mod_dict[domain]['input_mask'][:,:eos_idx+1] = False
        mod_dict[domain]['input_mask'][:,eos_idx+1:] = True
        mod_dict[domain]['target_mask'][:] = True

    elif modality_info[domain]['type'] in ['seq_emb']:
        # T5 caption has the valid mask saved alongside the embeddings
        mod_dict[domain]['input_mask'] = ~mod_dict[domain]['mask_valid']
        mod_dict[domain]['target_mask'] = torch.ones_like(mod_dict[domain]['mask_valid'])
        mod_dict[domain]['decoder_attention_mask'] = torch.zeros_like(mod_dict[domain]['mask_valid'])

    return mod_dict

def custom_text(sample, input_text, eos_token, key, device, text_tokenizer, target_max_len=50, start_token="[S_1]"):
    input_ids = text_tokenizer.encode(input_text).ids
    input_ids = torch.tensor(input_ids).unsqueeze(0)

    target_text = [start_token]
    target_text.extend(["[PAD]"] * (target_max_len - 2))
    target_text.append(eos_token)
    target_text = " ".join(target_text)
    target_ids = text_tokenizer.encode(target_text).ids
    target_ids = torch.tensor(target_ids).unsqueeze(0)

    all_ids = torch.cat([input_ids, target_ids], dim=1)

    input_mask = torch.cat([
        torch.zeros_like(input_ids, dtype=torch.bool),
        torch.ones_like(target_ids, dtype=torch.bool),
    ], dim=1)

    target_mask = torch.cat([
        torch.ones_like(input_ids, dtype=torch.bool),
        torch.zeros_like(target_ids, dtype=torch.bool),
    ], dim=1)

    sample[key] = {}
    sample[key]['tensor'] = all_ids.to(device)
    sample[key]['input_mask'] = input_mask.to(device)
    sample[key]['target_mask'] = target_mask.to(device)
    sample[key]['decoder_attention_mask'] = torch.zeros(all_ids.shape, dtype=torch.bool, device=device)

    return sample

def expand_to_batch(mod_dict, batch_size):
    for mod, d in mod_dict.items():
        for k, v in d.items():
            if k in ['tensor', 'input_mask', 'target_mask', 'decoder_attention_mask', 'mask_valid']:
                B = v.shape[0]
                if B == 1:
                    mod_dict[mod][k] = repeat(v, "1 ... -> b ...", b=batch_size)
                elif B != batch_size:
                    raise ValueError(f"Invalid batch size: {B} instead of {batch_size}")

    return mod_dict

def build_chained_generation_schedules(
        cond_domains: List[str],
        target_domains: List[str],
        tokens_per_target: List[int],
        autoregression_schemes: List[str],
        decoding_steps: List[int],
        token_decoding_schedules: List[str],
        temps: List[float],
        temp_schedules: List[float],
        cfg_scales: List[float],
        cfg_schedules: List[str],
        cfg_grow_conditioning: bool = False,
        modality_info: Optional[dict] = None,
    ):
    """
    Builds a list of chained generation schedules, where each schedule is a tuple of the form:
    (target_modality, schema, number of decoded tokens, temperature, guidance_scale, cfg_cond_domains)

    Args:
        cond_domains: List of conditioning domains
        target_domains: List of target domains
        tokens_per_target: List of number of tokens to decode for each target domain
        autoregression_schemes: List of autoregression schemes for each target domain. maskgit, roar, or autoregressive
        decoding_steps: List of number of maskgit steps for each target domain (if applicable)
        token_decoding_schedules: List of maskgit token schedules for each target domain (if applicable). cosine or linear
        temps: List of starting temperatures for each target domain
        temp_schedules: List of temperature schedules for each target domain. linear, constant, or onex:{min_t}:{power}
        cfg_scales: List of classifier-free guidance scales for each target domain
        cfg_schedules: List of classifier-free guidance schedules for each target domain. constant or cosine
        cfg_grow_conditioning: After every completed modality, add them to classifier-free guidance conditioning
        modality_info: Dictionary with metadata for each modality, optionally used to verify that the schedule is compatible with the modality
    """

    # List of {target_modality, schema, number of decoded tokens, temperature, guidance_scale, cfg_cond_domains} dicts
    chained_schedules = []

    cond_domains = cond_domains.copy()

    for target_idx in range(len(target_domains)):

        scheme = autoregression_schemes[target_idx]
        target_domain = target_domains[target_idx]
        ntoks = tokens_per_target[target_idx]
        maskgit_token_schedule_name = token_decoding_schedules[target_idx]
        temp = temps[target_idx]
        temp_schedule_name = temp_schedules[target_idx]
        cfg_scale = cfg_scales[target_idx]
        cfg_schedule_name = cfg_schedules[target_idx]

        # Auto-regressive (caption, detection, ...)
        if scheme == 'autoregressive':
            chained_schedules.append({
                'target_domain': target_domain,
                'scheme': scheme,
                'num_tokens': None,
                'temperature': temp,
                'cfg_scale': cfg_scale,
                'cfg_cond_domains': cond_domains.copy()
            })
            continue

        # Use modality info for (optional) assert if provided
        if modality_info is not None:
            assert modality_info[target_domain]['type'] not in ['seq', 'seq_token'], f'Illegal autoregressive scheme {scheme} for target domain {target_domain}'

        # Token schedule
        if scheme == 'maskgit':
            # MaskGIT token schedule setup
            num_steps = decoding_steps[target_idx]
            if maskgit_token_schedule_name == 'cosine':
                token_schedule = cosine_schedule(num_steps, (ntoks))
            elif maskgit_token_schedule_name == 'linear':
                token_schedule = linear_schedule(num_steps, (ntoks))
            else:
                raise ValueError(f'Illegal MaskGIT token schedule {maskgit_token_schedule_name}')
        elif scheme == 'roar':
            # ROAR token schedule setup (one-by-one, but random order)
            num_steps = decoding_steps[target_idx]
            token_schedule = linear_schedule(num_steps, ntoks)
        else:
            raise ValueError(f'Illegal decoding scheme {scheme}')

        # Temperature schedule
        if temp_schedule_name == 'linear':
            temp_schedule = linear_temp_schedule(temp, token_schedule)
        elif temp_schedule_name == 'constant':
            temp_schedule = temp * np.ones(num_steps)
        elif 'onex' in temp_schedule_name:
            # onex temperature schedule has to be formatted like onex:{min_t}:{power}
            min_t, power = [float(f) for f in temp_schedule_name.split(':')[1:]]
            temp_schedule = onex_temp_schedule(max_t=temp, min_t=min_t, token_schedule=token_schedule, power=power)
        else:
            raise ValueError(f'Illegal temperature schedule {temp_schedule_name}')

        # Classifier-free guidance scale schedule
        if cfg_schedule_name == 'constant':
            if isinstance(cfg_scale, float):
                cfg_schedule = cfg_scale * np.ones(num_steps)
            elif isinstance(cfg_scale, list):
                cfg_schedule = np.array(cfg_scale) * np.ones(num_steps).reshape(-1, 1)
        elif cfg_schedule_name == 'cosine':
            raise NotImplementedError()
        else:
            raise ValueError(f'Illegal guidance schedule {cfg_schedule_name}')

        # Concatenate schedule for this modality with previous ones
        schedule = [
            {
                'target_domain': target_domain,
                'scheme': scheme,
                'num_tokens': tok,
                'temperature': temp,
                'cfg_scale': cfg,
                'cfg_cond_domains': cond_domains.copy()
            }
            for tok, temp, cfg in zip(token_schedule, temp_schedule, cfg_schedule)
        ]
        chained_schedules.extend(schedule)

        # Optionally add this new modality to the ones affected by classifier-free guidance
        if cfg_grow_conditioning:
            cond_domains.append(target_domain)

    return chained_schedules


class GenerationSampler(nn.Module):
    """Sampler that wraps a trained 4M model for generation use cases.
    Implements standard autoregressive, MaskGIT, and ROAR generation schemes with chaining and weighted guidance."""

    def __init__(self, model):
        super().__init__()
        self.model = model


    def top_k_top_p_filtering(self, logits, top_k=0.0, top_p=0.0):
        # Compatible with batching
        # From https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
        if top_k > 0.0:
            if isinstance(top_k, int):
                k = min(top_k, logits.shape[-1])
            elif isinstance(top_k, float):
                k = min(int(top_k * logits.shape[-1]), logits.shape[-1])
            else:
                raise ValueError(f"Invalid value for top_k: {top_k}")

            # Remove all tokens with a probability less than the last token of the top-k
            indices_to_remove = logits < torch.topk(logits, k)[0][..., -1, None]
            logits[indices_to_remove] = float("-inf")

        if top_p > 0.0:
            sorted_logits, sorted_indices = torch.sort(logits, dim=1, descending=True)
            cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
            sorted_indices_to_remove = cum_probs > top_p
            # Shift the indices to the right to keep also the first token above the threshold
            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
            sorted_indices_to_remove[..., 0] = 0

            restore_indices = torch.argsort(sorted_indices, dim=-1)
            indices_to_remove = torch.gather(sorted_indices_to_remove, dim=-1, index=restore_indices)
            logits[indices_to_remove] = float("-inf")

        return logits

    def sample_tokens(self, logits, temperature=1.0, top_k=0.0, top_p=0.0):
        if np.isclose(temperature, 0, atol=1e-10):
            samples = torch.argmax(logits, dim=-1)
            # Since argmax is used, all sampled_probs will be 1 as we're selecting the max probability
            sampled_probs = torch.ones_like(samples, dtype=torch.float32)
        else:
            filtered_logits = self.top_k_top_p_filtering(logits, top_k, top_p)
            probs = F.softmax(filtered_logits / temperature, dim=-1)
            samples = torch.multinomial(probs, 1)[:, 0]
            sampled_probs = probs[torch.arange(len(samples)), samples]
        return samples, sampled_probs

    def sample_tokens_batched(self, logits, temperature=1.0, top_k=0.0, top_p=0.0):
        if logits.ndim > 2:
            B, N = logits.shape[0], logits.shape[1]
            logits = rearrange(logits, 'b n v -> (b n) v')
            samples, sampled_probs = self.sample_tokens(logits, temperature, top_k, top_p)
            samples = rearrange(samples, '(b n) -> b n', b=B, n=N)
            sampled_probs = rearrange(sampled_probs, '(b n) -> b n', b=B, n=N)
            return samples, sampled_probs
        else:
            return self.sample_tokens(logits, temperature, top_k, top_p)

    def select_tokens(self, logits, num_select, temperature=1.0, top_k=0.0, top_p=0.0, return_all_samples=False):
        samples, sampled_probs = self.sample_tokens(logits, temperature, top_k, top_p)
        top_indices = torch.topk(sampled_probs, num_select)[1]
        top_samples = samples[top_indices]
        if return_all_samples:
            return top_samples, top_indices, samples
        else:
            return top_samples, top_indices

    def select_tokens_batched(self, logits, num_select, temperature=1.0, top_k=0.0, top_p=0.0, return_all_samples=False):
        if logits.ndim > 2:
            samples, sampled_probs = self.sample_tokens_batched(logits, temperature, top_k, top_p)  # both of shape (B, N)
            top_indices = torch.topk(sampled_probs, num_select, dim=-1)[1]
            # Need to switch to gather instead of indexing here
            top_samples = torch.gather(samples, dim=-1, index=top_indices)
            if return_all_samples:
                return top_samples, top_indices, samples
            else:
                return top_samples, top_indices
        else:
            return self.sample_tokens(logits, num_select, temperature, top_k, top_p, return_all_samples)


    def forward_mask_encoder_generation(self, encoder_mod_dict):
        """Modification of forward_mask_encoder adapted for generation, with support for batching
        """
        # Form input
        B = list(encoder_mod_dict.values())[0]['tensor'].shape[0]

        encoder_tokens_all, emb_all, encoder_mask_all, mod_mask_all = self.model.cat_encoder_tensors(encoder_mod_dict)
        # Take max num encoder of tokens (although assuming it's the same everywhere would be better)
        num_encoder_tokens = (~encoder_mask_all.reshape(B, -1)).sum(dim=1).max()

        # Add arange multiplied by small constant to mask so they get sorted in a deterministic way
        mask_arange = torch.arange(encoder_mask_all.shape[1], device=encoder_mask_all.device).unsqueeze(0) * 1e-6
        ids_shuffle = torch.argsort(encoder_mask_all + mask_arange, dim=1)
        # ids_restore = torch.argsort(ids_shuffle, dim=1)
        ids_keep = ids_shuffle[:, :num_encoder_tokens]

        encoder_tokens = torch.gather(encoder_tokens_all, dim=1,
                                      index=repeat(ids_keep, "b n -> b n d", d=encoder_tokens_all.shape[2]))
        encoder_emb = torch.gather(emb_all, dim=1, index=repeat(ids_keep, "b n -> b n d", d=emb_all.shape[2]))
        encoder_mask = torch.gather(encoder_mask_all, dim=1, index=ids_keep)
        mod_mask = torch.gather(mod_mask_all, dim=1, index=ids_keep)

        if self.model.num_register_tokens > 0:
            prompt_tokens = repeat(self.prompt_tokens, '() n d -> b n d', b=B)
            # We add prompt tokens at the beginning of the sequence
            encoder_tokens = torch.cat([prompt_tokens, encoder_tokens], dim=1)
            encoder_emb = torch.cat([torch.zeros_like(prompt_tokens), encoder_emb], dim=1)
            encoder_mask = torch.cat([torch.zeros((B, prompt_tokens.shape[1]), dtype=torch.bool, device=encoder_mask.device), encoder_mask], dim=1)
            mod_mask = torch.cat([torch.full((B, prompt_tokens.shape[1]), -1, dtype=torch.int16, device=mod_mask.device), mod_mask], dim=1)

        encoder_tokens[encoder_mask] = 0.
        encoder_emb[encoder_mask] = 0.
        mod_mask[encoder_mask] = -1
        # Mask could be of shape 'b n1 n2' but not needed for masked_fill
        # This means this mask can then be re-used for decoder cross-attention
        encoder_mask = rearrange(encoder_mask, 'b n2 -> b 1 n2')

        return encoder_tokens, encoder_emb, encoder_mask, mod_mask


    def forward_mask_decoder_maskgit(self, mod_dict, target_mod, seed=None):
        """Modification of forward_mask_decoder for MaskGIT generation, with support for batching
        """
        if seed is not None:
            torch.manual_seed(seed)
        d = mod_dict[target_mod]
        decoder_tokens_all = torch.zeros_like(d['x']) + self.model.mask_token
        emb_all = d['emb']
        decoder_mask_all = d['target_mask']
        B = decoder_tokens_all.shape[0]  # Get batch size
        mod_mask_all = torch.full_like(d['ids'], self.model.modality_info[target_mod]['id'], dtype=torch.int16)
        mod_pos_all = torch.arange(d['x'].shape[1], device=d['x'].device).unsqueeze(0)
        mod_pos_all = repeat(mod_pos_all, '1 n -> b n', b=B)  # Added: Expansion for batching
        num_decoder_tokens = (~decoder_mask_all[0]).sum()  # Adapted for batching / Assumes num_decoder_tokens is the same across the batch


        # Add arange multiplied by small constant to mask so they get sorted in a deterministic way
        mask_arange = torch.arange(decoder_mask_all.shape[1], device=decoder_mask_all.device).unsqueeze(0) * 1e-6
        ids_shuffle = torch.argsort(decoder_mask_all + mask_arange, dim=1)
        # ids_restore = torch.argsort(ids_shuffle, dim=1)
        ids_keep = ids_shuffle[:, :num_decoder_tokens]

        decoder_tokens = torch.gather(decoder_tokens_all, dim=1, index=repeat(ids_keep, "b n -> b n d", d=decoder_tokens_all.shape[2]))
        decoder_emb = torch.gather(emb_all, dim=1, index=repeat(ids_keep, "b n -> b n d", d=emb_all.shape[2]))
        decoder_mask = torch.gather(decoder_mask_all, dim=1, index=ids_keep)
        mod_mask = torch.gather(mod_mask_all, dim=1, index=ids_keep)
        mod_pos = torch.gather(mod_pos_all, dim=1, index=ids_keep)

        decoder_tokens[decoder_mask] = 0.
        decoder_emb[decoder_mask] = 0.
        mod_mask[decoder_mask] = -1

        return decoder_tokens, decoder_emb, decoder_mask, mod_mask, mod_pos

    def forward_mask_decoder_roar(self, mod_dict, target_mod, num_select, seed=None):
        """Modification of forward_mask_decoder for ROAR generation, with support for batching
        """
        if seed is not None:
            torch.manual_seed(seed)
        d = mod_dict[target_mod]
        decoder_tokens_all = torch.zeros_like(d['x']) + self.model.mask_token
        emb_all = d['emb']
        decoder_mask_all = d['target_mask']
        B = decoder_tokens_all.shape[0]  # Get batch size
        mod_mask_all = torch.full_like(d['ids'], self.model.modality_info[target_mod]['id'], dtype=torch.int16)
        mod_pos_all = torch.arange(d['x'].shape[1], device=d['x'].device).unsqueeze(0)
        mod_pos_all = repeat(mod_pos_all, '1 n -> b n', b=B)  # Added: Expansion for batching
        # Only keep the first num_select tokens
        num_decoder_tokens = min(num_select, (~decoder_mask_all[0]).sum())  # Adapted for batching / Assumes num_decoder_tokens is the same across the batch

        # Add a small random number to the mask so they get sorted in a random way, but keeping the masked tokens first
        mask_rand = torch.rand(decoder_mask_all.shape[1], device=decoder_mask_all.device).unsqueeze(0) * 1e-6
        ids_shuffle = torch.argsort(decoder_mask_all + mask_rand, dim=1)
        # ids_restore = torch.argsort(ids_shuffle, dim=1)
        # Only keep the first num_select_tokens
        ids_keep = ids_shuffle[:, :num_decoder_tokens]

        decoder_tokens = torch.gather(decoder_tokens_all, dim=1, index=repeat(ids_keep, "b n -> b n d", d=decoder_tokens_all.shape[2]))
        decoder_emb = torch.gather(emb_all, dim=1, index=repeat(ids_keep, "b n -> b n d", d=emb_all.shape[2]))
        decoder_mask = torch.gather(decoder_mask_all, dim=1, index=ids_keep)
        mod_mask = torch.gather(mod_mask_all, dim=1, index=ids_keep)
        mod_pos = torch.gather(mod_pos_all, dim=1, index=ids_keep)

        decoder_tokens[decoder_mask] = 0.
        decoder_emb[decoder_mask] = 0.
        mod_mask[decoder_mask] = -1

        return decoder_tokens, decoder_emb, decoder_mask, mod_mask, mod_pos

    def forward_mask_decoder_autoregressive(self, mod_dict, target_mod, seed=None):
        # Adapted for batching
        if seed is not None:
            torch.manual_seed(seed)
        # This is the concatenation part
        d = mod_dict[target_mod]
        decoder_ids_all = d['ids']
        emb_all = d['emb']
        decoder_mask_all = d['target_mask']
        B = decoder_ids_all.shape[0]  # Get batch size
        mod_mask_all = torch.full_like(d['ids'], self.model.modality_info[target_mod]['id'], dtype=torch.int16)
        mod_pos_all = torch.arange(d['x'].shape[1], device=d['x'].device).unsqueeze(0)
        mod_pos_all = repeat(mod_pos_all, '1 n -> b n', b=B)
        num_decoder_tokens = (~decoder_mask_all[0]).sum()  # Adapted for batching, but assumes num_decoder_tokens is the same across the batch

        # Add arange multiplied by small constant to mask so they get sorted in a deterministic way
        mask_arange = torch.arange(decoder_mask_all.shape[1], device=decoder_mask_all.device).unsqueeze(0) * 1e-6
        ids_shuffle = torch.argsort(decoder_mask_all + mask_arange, dim=1)
        # ids_restore = torch.argsort(ids_shuffle, dim=1)
        ids_keep = ids_shuffle[:, :num_decoder_tokens]

        # Same as in forward_mask_decoder
        decoder_ids = torch.gather(decoder_ids_all, dim=1, index=ids_keep)
        decoder_emb = torch.gather(emb_all, dim=1, index=repeat(ids_keep, "b n -> b n d", d=emb_all.shape[2]))
        decoder_mask = torch.gather(decoder_mask_all, dim=1, index=ids_keep)
        mod_mask = torch.gather(mod_mask_all, dim=1, index=ids_keep)
        mod_pos = torch.gather(mod_pos_all, dim=1, index=ids_keep)

        decoder_ids[decoder_mask] = 0
        decoder_emb[decoder_mask] = 0.
        mod_mask[decoder_mask] = -1

        return decoder_ids, decoder_emb, decoder_mask, mod_mask, mod_pos

    def merge_sequences(self, mod_dict, pred_ids, target_mod, text_tokenizer, default_sentinel="[S_1]"):
        device = mod_dict[target_mod]['tensor'].device
        # Get input ids
        input_ids = mod_dict[target_mod]['tensor'].squeeze().detach().cpu()
        input_ids = input_ids[mod_dict[target_mod]['input_mask'].squeeze().detach().cpu() == 0]
        input_ids = input_ids.tolist()

        if len(input_ids) == 0:
            input_ids = [text_tokenizer.get_vocab()[default_sentinel]]

        # Get predicted ids
        pred_ids = pred_ids.squeeze().detach().cpu().tolist()
        if isinstance(pred_ids, int):
            pred_ids = [pred_ids]

        # Get sentinel ids using the tokenizer
        sentinel_ids = set(get_sentinel_to_id_mapping(text_tokenizer).values())
        # Perform merging
        merged_ids = merge_span_masking(input_ids, pred_ids, sentinel_ids)
        merged_ids = torch.tensor(merged_ids).unsqueeze(0)
        # Create new dict
        new_input_mask = torch.zeros_like(merged_ids, dtype=torch.bool)
        new_target_mask = torch.ones_like(merged_ids, dtype=torch.bool)
        new_dict = {'tensor': merged_ids.to(device),
                    'input_mask': new_input_mask.to(device),
                    'target_mask': new_target_mask.to(device)}
        new_dict['decoder_attention_mask'] = torch.zeros_like(new_target_mask, dtype=torch.bool)

        mod_dict[target_mod] = new_dict
        return mod_dict

    def merge_sequences_batched(self, mod_dict, pred_ids, target_mod, text_tokenizer, default_sentinel="[S_1]"):
        # Unbatches and calls merge sequence per batch, then regroups it into a batch

        pad_id = text_tokenizer.token_to_id("[PAD]")

        B = mod_dict[target_mod]['tensor'].shape[0]
        device = mod_dict[target_mod]['tensor'].device
|
588 |
+
|
589 |
+
tensors = torch.split(mod_dict[target_mod]['tensor'], 1)
|
590 |
+
input_masks = torch.split(mod_dict[target_mod]['input_mask'], 1)
|
591 |
+
pred_ids = torch.split(pred_ids, 1)
|
592 |
+
|
593 |
+
input_dicts = []
|
594 |
+
for t, im in zip(tensors, input_masks):
|
595 |
+
d = {target_mod: {'tensor': t, 'input_mask': im}}
|
596 |
+
input_dicts.append(d)
|
597 |
+
|
598 |
+
merged_tensors = []
|
599 |
+
merged_input_masks = []
|
600 |
+
merged_target_masks = []
|
601 |
+
merged_seq_lens = []
|
602 |
+
for input_d, pi in zip(input_dicts, pred_ids):
|
603 |
+
# Output of merge_sequences is mod_dict with modified target mod
|
604 |
+
merged_d = self.merge_sequences(input_d, pi, target_mod, text_tokenizer, default_sentinel)[target_mod]
|
605 |
+
merged_tensors.append(merged_d['tensor'])
|
606 |
+
merged_input_masks.append(merged_d['input_mask'])
|
607 |
+
merged_target_masks.append(merged_d['input_mask'])
|
608 |
+
merged_seq_lens.append(merged_d['tensor'].shape[1])
|
609 |
+
|
610 |
+
|
611 |
+
max_seq_len = max(merged_seq_lens)
|
612 |
+
|
613 |
+
for i in range(len(merged_tensors)):
|
614 |
+
# Right pad all tensors
|
615 |
+
p1d = (0, max_seq_len - merged_seq_lens[i])
|
616 |
+
merged_tensors[i] = F.pad(merged_tensors[i], p1d, "constant", pad_id)
|
617 |
+
merged_input_masks[i] = F.pad(merged_input_masks[i], p1d, "constant", True)
|
618 |
+
merged_target_masks[i] = F.pad(merged_target_masks[i], p1d, "constant", True)
|
619 |
+
|
620 |
+
new_dict = {'tensor': torch.cat(merged_tensors, dim=0).to(device),
|
621 |
+
'input_mask': torch.cat(merged_input_masks, dim=0).to(device),
|
622 |
+
'target_mask': torch.cat(merged_target_masks, dim=0).to(device)}
|
623 |
+
new_dict['decoder_attention_mask'] = torch.zeros_like(new_dict['target_mask'], dtype=torch.bool)
|
624 |
+
|
625 |
+
mod_dict[target_mod] = new_dict
|
626 |
+
return mod_dict
|
627 |
+
|
628 |
+
def forward_enc_dec_maskgit_batched(self, mod_dict, target_mod, seed=None):
|
629 |
+
# Encoder
|
630 |
+
encoder_mod_dict = {mod: self.model.encoder_embeddings[mod](d)
|
631 |
+
for mod, d in mod_dict.items()
|
632 |
+
if mod in self.model.encoder_embeddings}
|
633 |
+
encoder_tokens, encoder_emb, encoder_mask, encoder_mod_mask = self.forward_mask_encoder_generation(encoder_mod_dict)
|
634 |
+
x = encoder_tokens + encoder_emb
|
635 |
+
x = self.model.forward_encoder(x, encoder_mask)
|
636 |
+
|
637 |
+
# Decoder
|
638 |
+
context = self.model.decoder_proj_context(x) + encoder_emb
|
639 |
+
decoder_mod_dict = {target_mod: self.model.decoder_embeddings[target_mod].forward_embed(mod_dict[target_mod])}
|
640 |
+
decoder_tokens, decoder_emb, decoder_mask, decoder_mod_mask, mod_pos = self.forward_mask_decoder_maskgit(decoder_mod_dict, target_mod, seed=seed)
|
641 |
+
y = decoder_tokens + decoder_emb
|
642 |
+
y = self.model.forward_decoder(y, context, encoder_mask, None)
|
643 |
+
B, N, D = y.shape
|
644 |
+
logits = self.model.forward_logits(y, decoder_mod_dict, decoder_mod_mask)[target_mod]
|
645 |
+
logits = logits.reshape(B, N, -1)
|
646 |
+
|
647 |
+
|
648 |
+
return logits, mod_pos
|
649 |
+
|
650 |
+
def maskgit_step_batched(self, mod_dict, target_mod, num_select, temperature, top_k, top_p, seed=None):
|
651 |
+
logits, mod_pos = self.forward_enc_dec_maskgit_batched(mod_dict, target_mod, seed=seed)
|
652 |
+
|
653 |
+
# MaskGIT sampling
|
654 |
+
top_samples, top_indices = self.select_tokens_batched(logits, num_select,
|
655 |
+
temperature=temperature, top_k=top_k, top_p=top_p)
|
656 |
+
# Update mod dict
|
657 |
+
# We rely on gather / scatter for batched operations
|
658 |
+
top_pos = torch.gather(mod_pos, -1, top_indices) # (B, num_select)
|
659 |
+
mod_dict[target_mod]['tensor'] = torch.scatter(mod_dict[target_mod]['tensor'], -1, top_pos, top_samples)
|
660 |
+
mod_dict[target_mod]['input_mask'] = torch.scatter(mod_dict[target_mod]['input_mask'], -1, top_pos, torch.zeros_like(top_samples, dtype=torch.bool))
|
661 |
+
mod_dict[target_mod]['target_mask'] = torch.scatter(mod_dict[target_mod]['target_mask'], -1, top_pos, torch.ones_like(top_samples, dtype=torch.bool))
|
662 |
+
|
663 |
+
return mod_dict
|
664 |
+
|
665 |
+
def guided_maskgit_step_batched(self, mod_dict, target_mod, num_select, temperature, top_k, top_p,
|
666 |
+
conditioning=[], guidance_scale=1.0, seed=None, write_all_predictions=False):
|
667 |
+
|
668 |
+
### 1 - First pass, with conditioning
|
669 |
+
logits_cond, _ = self.forward_enc_dec_maskgit_batched(mod_dict, target_mod, seed=seed)
|
670 |
+
|
671 |
+
### 2 - Second pass, without conditioning
|
672 |
+
mod_dict_uncond = copy.deepcopy(mod_dict)
|
673 |
+
for mod in conditioning:
|
674 |
+
if self.model.modality_info[mod]['type'] in ['seq', 'seq_token']:
|
675 |
+
mod_dict_uncond = empty_seq_modality(mod_dict_uncond, mod)
|
676 |
+
elif self.model.modality_info[mod]['type'] in ['seq_emb']:
|
677 |
+
mod_dict_uncond = empty_seq_emb_modality(mod_dict_uncond, mod)
|
678 |
+
else:
|
679 |
+
mod_dict_uncond = empty_img_modality(mod_dict_uncond, mod)
|
680 |
+
|
681 |
+
logits_uncond, mod_pos = self.forward_enc_dec_maskgit_batched(mod_dict_uncond, target_mod, seed=seed)
|
682 |
+
|
683 |
+
### 3 - Classifier-free guidance
|
684 |
+
logits = logits_uncond + (logits_cond - logits_uncond) * guidance_scale
|
685 |
+
|
686 |
+
### 4 - MaskGIT sampling
|
687 |
+
top_samples, top_indices, all_samples = self.select_tokens_batched(
|
688 |
+
logits, num_select,
|
689 |
+
temperature=temperature, top_k=top_k, top_p=top_p,
|
690 |
+
return_all_samples=True
|
691 |
+
)
|
692 |
+
|
693 |
+
### 5 - Update mod dict
|
694 |
+
# We rely on gather / scatter for batched operations
|
695 |
+
top_pos = torch.gather(mod_pos, -1, top_indices) # (B, num_select)
|
696 |
+
if write_all_predictions:
|
697 |
+
mod_dict[target_mod]['tensor'][:, mod_pos] = all_samples
|
698 |
+
else:
|
699 |
+
mod_dict[target_mod]['tensor'] = torch.scatter(mod_dict[target_mod]['tensor'], -1, top_pos, top_samples)
|
700 |
+
mod_dict[target_mod]['input_mask'] = torch.scatter(mod_dict[target_mod]['input_mask'], -1, top_pos, torch.zeros_like(top_samples, dtype=torch.bool))
|
701 |
+
mod_dict[target_mod]['target_mask'] = torch.scatter(mod_dict[target_mod]['target_mask'], -1, top_pos, torch.ones_like(top_samples, dtype=torch.bool))
|
702 |
+
|
703 |
+
return mod_dict
|
704 |
+
|
705 |
+
def multi_guided_maskgit_step_batched(self, uncond_dict, cond_dicts, cond_weights, target_mod, num_select,
|
706 |
+
temperature, top_k, top_p, seed=None, write_all_predictions=False):
|
707 |
+
|
708 |
+
### 1 - Conditional forward passes (one for each guided condition)
|
709 |
+
logits_cond_all = []
|
710 |
+
for cond_dict in cond_dicts:
|
711 |
+
logits_cond_i, _ = self.forward_enc_dec_maskgit_batched(cond_dict, target_mod, seed=seed)
|
712 |
+
logits_cond_all.append(logits_cond_i)
|
713 |
+
|
714 |
+
### 2 - Unconditional forward pass
|
715 |
+
logits_uncond, mod_pos = self.forward_enc_dec_maskgit_batched(uncond_dict, target_mod, seed=seed)
|
716 |
+
|
717 |
+
### 3 - Conjunction of multiple conditions: l_uncond + sum_i{w_i * (l_cond_i - l_uncond)}
|
718 |
+
# See https://arxiv.org/abs/2206.01714
|
719 |
+
logits = logits_uncond + torch.stack([w * (logits_cond - logits_uncond) for w, logits_cond in zip(cond_weights, logits_cond_all)]).sum(dim=0)
|
720 |
+
|
721 |
+
### 4 - MaskGIT sampling
|
722 |
+
top_samples, top_indices, all_samples = self.select_tokens_batched(
|
723 |
+
logits, num_select,
|
724 |
+
temperature=temperature, top_k=top_k, top_p=top_p,
|
725 |
+
return_all_samples=True
|
726 |
+
)
|
727 |
+
|
728 |
+
### 5 - Update mod dict with newly generated tokens
|
729 |
+
# We rely on gather / scatter for batched operations
|
730 |
+
top_pos = torch.gather(mod_pos, -1, top_indices) # (B, num_select)
|
731 |
+
if write_all_predictions:
|
732 |
+
uncond_dict[target_mod]['tensor'][:, mod_pos] = all_samples
|
733 |
+
else:
|
734 |
+
uncond_dict[target_mod]['tensor'] = torch.scatter(uncond_dict[target_mod]['tensor'], -1, top_pos, top_samples)
|
735 |
+
uncond_dict[target_mod]['input_mask'] = torch.scatter(uncond_dict[target_mod]['input_mask'], -1, top_pos, torch.zeros_like(top_samples, dtype=torch.bool))
|
736 |
+
uncond_dict[target_mod]['target_mask'] = torch.scatter(uncond_dict[target_mod]['target_mask'], -1, top_pos, torch.ones_like(top_samples, dtype=torch.bool))
|
737 |
+
# Update conditioning dicts
|
738 |
+
for i in range(len(cond_dicts)):
|
739 |
+
cond_dicts[i][target_mod]['tensor'] = torch.scatter(cond_dicts[i][target_mod]['tensor'], -1, top_pos, top_samples)
|
740 |
+
cond_dicts[i][target_mod]['input_mask'] = torch.scatter(cond_dicts[i][target_mod]['input_mask'], -1, top_pos, torch.zeros_like(top_samples, dtype=torch.bool))
|
741 |
+
cond_dicts[i][target_mod]['target_mask'] = torch.scatter(cond_dicts[i][target_mod]['target_mask'], -1, top_pos, torch.ones_like(top_samples, dtype=torch.bool))
|
742 |
+
|
743 |
+
return uncond_dict, cond_dicts
|
744 |
+
|
745 |
+
def forward_enc_dec_roar_batched(self, mod_dict, target_mod, num_select, seed=None):
|
746 |
+
# Encoder
|
747 |
+
encoder_mod_dict = {mod: self.model.encoder_embeddings[mod](d)
|
748 |
+
for mod, d in mod_dict.items()
|
749 |
+
if mod in self.model.encoder_embeddings}
|
750 |
+
encoder_tokens, encoder_emb, encoder_mask, encoder_mod_mask = self.forward_mask_encoder_generation(encoder_mod_dict)
|
751 |
+
x = encoder_tokens + encoder_emb
|
752 |
+
x = self.model.forward_encoder(x, encoder_mask)
|
753 |
+
|
754 |
+
# Decoder
|
755 |
+
context = self.model.decoder_proj_context(x) + encoder_emb
|
756 |
+
decoder_mod_dict = {target_mod: self.model.decoder_embeddings[target_mod].forward_embed(mod_dict[target_mod])}
|
757 |
+
decoder_tokens, decoder_emb, decoder_mask, decoder_mod_mask, mod_pos = self.forward_mask_decoder_roar(decoder_mod_dict, target_mod, num_select, seed=seed)
|
758 |
+
y = decoder_tokens + decoder_emb
|
759 |
+
y = self.model.forward_decoder(y, context, encoder_mask, None)
|
760 |
+
B, N, D = y.shape
|
761 |
+
logits = self.model.forward_logits(y, decoder_mod_dict, decoder_mod_mask)[target_mod]
|
762 |
+
logits = logits.reshape(B, N, -1)
|
763 |
+
|
764 |
+
return logits, mod_pos
|
765 |
+
|
766 |
+
def roar_step_batched(self, mod_dict, target_mod, num_select, temperature, top_k, top_p, seed=None):
|
767 |
+
"""ROAR = Random Order Autoregression"""
|
768 |
+
|
769 |
+
logits, mod_pos = self.forward_enc_dec_roar_batched(mod_dict, target_mod, num_select, seed=seed)
|
770 |
+
|
771 |
+
# Simple sampling
|
772 |
+
samples, sampled_probs = self.sample_tokens_batched(logits, temperature, top_k=top_k, top_p=top_p)
|
773 |
+
|
774 |
+
# Update mod dict
|
775 |
+
# We rely on scatter for batched operations
|
776 |
+
select_pos = mod_pos
|
777 |
+
mod_dict[target_mod]['tensor'] = torch.scatter(mod_dict[target_mod]['tensor'], -1, select_pos, samples)
|
778 |
+
mod_dict[target_mod]['input_mask'] = torch.scatter(mod_dict[target_mod]['input_mask'], -1, select_pos, torch.zeros_like(samples, dtype=torch.bool))
|
779 |
+
mod_dict[target_mod]['target_mask'] = torch.scatter(mod_dict[target_mod]['target_mask'], -1, select_pos, torch.ones_like(samples, dtype=torch.bool))
|
780 |
+
|
781 |
+
return mod_dict
|
782 |
+
|
783 |
+
def guided_roar_step_batched(self, mod_dict, target_mod, num_select, temperature, top_k, top_p,
|
784 |
+
conditioning=[], guidance_scale=1.0, seed=None):
|
785 |
+
"""ROAR = Random Order Autoregression"""
|
786 |
+
|
787 |
+
### 1 - First pass, with conditioning
|
788 |
+
logits_cond, _ = self.forward_enc_dec_roar_batched(mod_dict, target_mod, num_select, seed=seed)
|
789 |
+
|
790 |
+
### 2 - Second pass, without conditioning
|
791 |
+
mod_dict_uncond = copy.deepcopy(mod_dict)
|
792 |
+
for mod in conditioning:
|
793 |
+
if self.model.modality_info[mod]['type'] in ['seq', 'seq_token']:
|
794 |
+
mod_dict_uncond = empty_seq_modality(mod_dict_uncond, mod)
|
795 |
+
elif self.model.modality_info[mod]['type'] in ['seq_emb']:
|
796 |
+
mod_dict_uncond = empty_seq_emb_modality(mod_dict_uncond, mod)
|
797 |
+
else:
|
798 |
+
mod_dict_uncond = empty_img_modality(mod_dict_uncond, mod)
|
799 |
+
|
800 |
+
logits_uncond, mod_pos = self.forward_enc_dec_roar_batched(mod_dict_uncond, target_mod, num_select, seed=seed)
|
801 |
+
|
802 |
+
### 3 - Classifier-free guidance
|
803 |
+
logits = logits_uncond + (logits_cond - logits_uncond) * guidance_scale
|
804 |
+
|
805 |
+
### 4 - Simple sampling
|
806 |
+
samples, sampled_probs = self.sample_tokens_batched(logits, temperature, top_k=top_k, top_p=top_p)
|
807 |
+
|
808 |
+
### 5 - Update mod dict
|
809 |
+
# We rely on gather / scatter for batched operations
|
810 |
+
select_pos = mod_pos
|
811 |
+
mod_dict[target_mod]['tensor'] = torch.scatter(mod_dict[target_mod]['tensor'], -1, select_pos, samples)
|
812 |
+
mod_dict[target_mod]['input_mask'] = torch.scatter(mod_dict[target_mod]['input_mask'], -1, select_pos, torch.zeros_like(samples, dtype=torch.bool))
|
813 |
+
mod_dict[target_mod]['target_mask'] = torch.scatter(mod_dict[target_mod]['target_mask'], -1, select_pos, torch.ones_like(samples, dtype=torch.bool))
|
814 |
+
|
815 |
+
return mod_dict
|
816 |
+
|
817 |
+
def multi_guided_roar_step_batched(self, uncond_dict, cond_dicts, cond_weights, target_mod,
|
818 |
+
num_select, temperature, top_k, top_p, seed=None):
|
819 |
+
|
820 |
+
### 1 - Conditional forward passes (one for each guided condition)
|
821 |
+
logits_cond_all = []
|
822 |
+
for cond_dict in cond_dicts:
|
823 |
+
logits_cond_i, _ = self.forward_enc_dec_roar_batched(cond_dict, target_mod, num_select, seed=seed)
|
824 |
+
logits_cond_all.append(logits_cond_i)
|
825 |
+
|
826 |
+
### 2 - Unconditional forward pass
|
827 |
+
logits_uncond, mod_pos = self.forward_enc_dec_roar_batched(uncond_dict, target_mod, num_select, seed=seed)
|
828 |
+
|
829 |
+
### 3 - Conjunction of multiple conditions: l_uncond + sum_i{w_i * (l_cond_i - l_uncond)}
|
830 |
+
# See https://arxiv.org/abs/2206.01714
|
831 |
+
logits = logits_uncond + torch.stack([w * (logits_cond - logits_uncond) for w, logits_cond in zip(cond_weights, logits_cond_all)]).sum(dim=0)
|
832 |
+
|
833 |
+
### 4 - Simple sampling
|
834 |
+
samples, sampled_probs = self.sample_tokens_batched(logits, temperature, top_k=top_k, top_p=top_p)
|
835 |
+
|
836 |
+
### 5 - Update mod dict
|
837 |
+
# We rely on gather / scatter for batched operations
|
838 |
+
select_pos = mod_pos
|
839 |
+
uncond_dict[target_mod]['tensor'] = torch.scatter(uncond_dict[target_mod]['tensor'], -1, select_pos, samples)
|
840 |
+
uncond_dict[target_mod]['input_mask'] = torch.scatter(uncond_dict[target_mod]['input_mask'], -1, select_pos, torch.zeros_like(samples, dtype=torch.bool))
|
841 |
+
uncond_dict[target_mod]['target_mask'] = torch.scatter(uncond_dict[target_mod]['target_mask'], -1, select_pos, torch.ones_like(samples, dtype=torch.bool))
|
842 |
+
# Update conditioning dicts
|
843 |
+
for i in range(len(cond_dicts)):
|
844 |
+
cond_dicts[i][target_mod]['tensor'] = torch.scatter(cond_dicts[i][target_mod]['tensor'], -1, select_pos, samples)
|
845 |
+
cond_dicts[i][target_mod]['input_mask'] = torch.scatter(cond_dicts[i][target_mod]['input_mask'], -1, select_pos, torch.zeros_like(samples, dtype=torch.bool))
|
846 |
+
cond_dicts[i][target_mod]['target_mask'] = torch.scatter(cond_dicts[i][target_mod]['target_mask'], -1, select_pos, torch.ones_like(samples, dtype=torch.bool))
|
847 |
+
|
848 |
+
return uncond_dict, cond_dicts
|
849 |
+
|
850 |
+
def autoregressive_step_batched(self, mod_dict, target_mod, temperature, top_k: Union[float, int], top_p: float,
|
851 |
+
use_eos=True, eos_token=None, start_tokens=None, text_tokenizer=None, seed=None):
|
852 |
+
|
853 |
+
# Encoder
|
854 |
+
encoder_mod_dict = {mod: self.model.encoder_embeddings[mod](d)
|
855 |
+
for mod, d in mod_dict.items()
|
856 |
+
if mod in self.model.encoder_embeddings}
|
857 |
+
encoder_tokens, encoder_emb, encoder_mask, encoder_mod_mask = self.forward_mask_encoder_generation(encoder_mod_dict)
|
858 |
+
x = encoder_tokens + encoder_emb
|
859 |
+
x = self.model.forward_encoder(x, encoder_mask) # B, N, D
|
860 |
+
|
861 |
+
# Get batch size
|
862 |
+
B = x.shape[0]
|
863 |
+
|
864 |
+
# Decoder
|
865 |
+
context = self.model.decoder_proj_context(x) + encoder_emb
|
866 |
+
decoder_mod_dict = {target_mod: self.model.decoder_embeddings[target_mod].forward_embed(mod_dict[target_mod])}
|
867 |
+
decoder_ids, decoder_emb, decoder_mask, decoder_mod_mask, mod_pos = self.forward_mask_decoder_autoregressive(decoder_mod_dict, target_mod, seed=seed)
|
868 |
+
device = decoder_ids.device
|
869 |
+
seq_len = self.model.modality_info[target_mod]['max_tokens']
|
870 |
+
|
871 |
+
if use_eos and eos_token is None:
|
872 |
+
# The eos_token is the final sentinel token provided
|
873 |
+
eos_token = decoder_ids[0][decoder_mask[0] == 0][-1] # Assumes the EOS token is the same for all
|
874 |
+
if use_eos:
|
875 |
+
eos_token = eos_token.to(device)
|
876 |
+
|
877 |
+
# If no start_tokens, just use the beginning of the actual target (i.e., a sentinel token)
|
878 |
+
out = decoder_ids[:, :1] if start_tokens is None else start_tokens.to(device)
|
879 |
+
# Set decoder_tokens to None, we do not use them for decoding
|
880 |
+
decoder_ids = None
|
881 |
+
|
882 |
+
# If all samples of the batch have eos, return early
|
883 |
+
if use_eos and (out == eos_token).any(dim=-1).all():
|
884 |
+
return out
|
885 |
+
|
886 |
+
y_emb = decoder_emb[:, :seq_len]
|
887 |
+
seq_len = y_emb.shape[1]
|
888 |
+
|
889 |
+
# Auto-regressive decoding and sampling
|
890 |
+
for i in range(seq_len):
|
891 |
+
cur_len = out.shape[1]
|
892 |
+
# Convert ids into word embeddings and add corresponding posembs + modemb
|
893 |
+
y = self.model.decoder_embeddings[target_mod].token_emb(out) + y_emb[:, :cur_len]
|
894 |
+
# Build causal mask
|
895 |
+
causal_mask = torch.ones((cur_len, cur_len), dtype=torch.bool, device=y.device).triu(1)
|
896 |
+
causal_mask = repeat(causal_mask, "n1 n2 -> b n1 n2", b=B)
|
897 |
+
|
898 |
+
y = self.model.forward_decoder(y, context, encoder_mask, causal_mask)
|
899 |
+
logits = self.model.forward_logits(y, decoder_mod_dict, decoder_mod_mask[:, :cur_len])[target_mod]
|
900 |
+
logits = rearrange(logits, "(b n) d -> b n d", b=B, n=cur_len)
|
901 |
+
last_logits = logits[:, -1]
|
902 |
+
|
903 |
+
# Sample token for the newly generated logit
|
904 |
+
if np.isclose(temperature, 0, atol=1e-10):
|
905 |
+
sample = torch.argmax(last_logits, dim=-1, keepdim=True)
|
906 |
+
else:
|
907 |
+
filtered_logits = self.top_k_top_p_filtering(last_logits, top_k, top_p)
|
908 |
+
probs = F.softmax(filtered_logits / temperature, dim=-1)
|
909 |
+
sample = torch.multinomial(probs, 1)
|
910 |
+
out = torch.cat((out, sample), dim=-1)
|
911 |
+
|
912 |
+
if use_eos and (out == eos_token).any(dim=-1).all():
|
913 |
+
break
|
914 |
+
|
915 |
+
mod_dict = self.merge_sequences_batched(mod_dict, out, target_mod, text_tokenizer)
|
916 |
+
|
917 |
+
return mod_dict
|
918 |
+
|
919 |
+
def guided_autoregressive_step_batched(self, mod_dict, target_mod, temperature, top_k: Union[float, int], top_p: float,
|
920 |
+
use_eos=True, eos_token=None, start_tokens=None, text_tokenizer=None,
|
921 |
+
conditioning=[], guidance_scale=1.0, seed=None):
|
922 |
+
|
923 |
+
### 1 - Encoder forward pass, with conditioning
|
924 |
+
|
925 |
+
# Encoder
|
926 |
+
encoder_mod_dict = {mod: self.model.encoder_embeddings[mod](d)
|
927 |
+
for mod, d in mod_dict.items()
|
928 |
+
if mod in self.model.encoder_embeddings}
|
929 |
+
encoder_tokens, encoder_emb, encoder_mask_cond, encoder_mod_mask = self.forward_mask_encoder_generation(encoder_mod_dict)
|
930 |
+
x = encoder_tokens + encoder_emb
|
931 |
+
x = self.model.forward_encoder(x, encoder_mask_cond) # B, N, D
|
932 |
+
|
933 |
+
# Get batch size
|
934 |
+
B = x.shape[0]
|
935 |
+
|
936 |
+
# Decoder
|
937 |
+
context_cond = self.model.decoder_proj_context(x) + encoder_emb
|
938 |
+
decoder_mod_dict_cond = {target_mod: self.model.decoder_embeddings[target_mod].forward_embed(mod_dict[target_mod])}
|
939 |
+
decoder_ids, decoder_emb, decoder_mask, decoder_mod_mask_cond, mod_pos = self.forward_mask_decoder_autoregressive(decoder_mod_dict_cond, target_mod, seed=seed)
|
940 |
+
device = decoder_ids.device
|
941 |
+
seq_len = self.model.modality_info[target_mod]['max_tokens']
|
942 |
+
|
943 |
+
|
944 |
+
### 2 - Encoder forward pass, without conditioning
|
945 |
+
|
946 |
+
mod_dict_uncond = copy.deepcopy(mod_dict)
|
947 |
+
for mod in conditioning:
|
948 |
+
if self.model.modality_info[mod]['type'] in ['seq', 'seq_token']:
|
949 |
+
mod_dict_uncond = empty_seq_modality(mod_dict_uncond, mod)
|
950 |
+
elif self.model.modality_info[mod]['type'] in ['seq_emb']:
|
951 |
+
mod_dict_uncond = empty_seq_emb_modality(mod_dict_uncond, mod)
|
952 |
+
else:
|
953 |
+
mod_dict_uncond = empty_img_modality(mod_dict_uncond, mod)
|
954 |
+
|
955 |
+
# Encoder
|
956 |
+
encoder_mod_dict = {mod: self.model.encoder_embeddings[mod](d)
|
957 |
+
for mod, d in mod_dict_uncond.items()
|
958 |
+
if mod in self.model.encoder_embeddings}
|
959 |
+
encoder_tokens, encoder_emb, encoder_mask_uncond, encoder_mod_mask = self.forward_mask_encoder_generation(encoder_mod_dict)
|
960 |
+
x = encoder_tokens + encoder_emb
|
961 |
+
x = self.model.forward_encoder(x, encoder_mask_uncond) # B, N, D
|
962 |
+
|
963 |
+
# Decoder
|
964 |
+
context_uncond = self.model.decoder_proj_context(x) + encoder_emb
|
965 |
+
decoder_mod_dict_uncond = {target_mod: self.model.decoder_embeddings[target_mod].forward_embed(mod_dict[target_mod])}
|
966 |
+
decoder_ids, decoder_emb, decoder_mask, decoder_mod_mask_uncond, mod_pos = self.forward_mask_decoder_autoregressive(decoder_mod_dict_uncond, target_mod, seed=seed)
|
967 |
+
|
968 |
+
|
969 |
+
if use_eos and eos_token is None:
|
970 |
+
# The eos_token is the final sentinel token provided
|
971 |
+
eos_token = decoder_ids[0][decoder_mask[0] == 0][-1] # Assumes the EOS token is the same for all
|
972 |
+
if use_eos:
|
973 |
+
eos_token = eos_token.to(device)
|
974 |
+
|
975 |
+
# If no start_tokens, just use the beginning of the actual target (i.e., a sentinel token)
|
976 |
+
out = decoder_ids[:, :1] if start_tokens is None else start_tokens.to(device)
|
977 |
+
# Set decoder_tokens to None, we do not use them for decoding
|
978 |
+
decoder_ids = None
|
979 |
+
|
980 |
+
# If all samples of the batch have eos, return early
|
981 |
+
if use_eos and (out == eos_token).any(dim=-1).all():
|
982 |
+
return out
|
983 |
+
|
984 |
+
y_emb = decoder_emb[:, :seq_len]
|
985 |
+
seq_len = y_emb.shape[1]
|
986 |
+
|
987 |
+
### 3 - Auto-regressive decoding and sampling
|
988 |
+
for i in range(seq_len):
|
989 |
+
cur_len = out.shape[1]
|
990 |
+
# Convert ids into word embeddings and add corresponding posembs + modemb
|
991 |
+
y = self.model.decoder_embeddings[target_mod].token_emb(out) + y_emb[:, :cur_len]
|
992 |
+
# Build causal mask
|
993 |
+
causal_mask = torch.ones((cur_len, cur_len), dtype=torch.bool, device=y.device).triu(1)
|
994 |
+
causal_mask = repeat(causal_mask, "n1 n2 -> b n1 n2", b=B)
|
995 |
+
|
996 |
+
### 3a - Decoder forward pass, with conditioning
|
997 |
+
y_cond = self.model.forward_decoder(y, context_cond, encoder_mask_cond, causal_mask)
|
998 |
+
logits_cond = self.model.forward_logits(y_cond, decoder_mod_dict_cond, decoder_mod_mask_cond[:, :cur_len])[target_mod]
|
999 |
+
logits_cond = rearrange(logits_cond, "(b n) d -> b n d", b=B, n=cur_len)
|
1000 |
+
last_logits_cond = logits_cond[:, -1]
|
1001 |
+
|
1002 |
+
### 3b - Decoder forward pass, without conditioning
|
1003 |
+
y_uncond = self.model.forward_decoder(y, context_uncond, encoder_mask_uncond, causal_mask)
|
1004 |
+
logits_uncond = self.model.forward_logits(y_uncond, decoder_mod_dict_uncond, decoder_mod_mask_uncond[:, :cur_len])[target_mod]
|
1005 |
+
logits_uncond = rearrange(logits_uncond, "(b n) d -> b n d", b=B, n=cur_len)
|
1006 |
+
last_logits_uncond = logits_uncond[:, -1]
|
1007 |
+
|
1008 |
+
### 3c - Classifier-free guidance
|
1009 |
+
last_logits = last_logits_uncond + (last_logits_cond - last_logits_uncond) * guidance_scale
|
1010 |
+
|
1011 |
+
# Sample token for the newly generated logit
|
1012 |
+
if np.isclose(temperature, 0, atol=1e-10):
|
1013 |
+
sample = torch.argmax(last_logits, dim=-1, keepdim=True)
|
1014 |
+
else:
|
1015 |
+
filtered_logits = self.top_k_top_p_filtering(last_logits, top_k, top_p)
|
1016 |
+
probs = F.softmax(filtered_logits / temperature, dim=-1)
|
1017 |
+
sample = torch.multinomial(probs, 1)
|
1018 |
+
out = torch.cat((out, sample), dim=-1)
|
1019 |
+
|
1020 |
+
if use_eos and (out == eos_token).any(dim=-1).all():
|
1021 |
+
break
|
1022 |
+
|
1023 |
+
mod_dict = self.merge_sequences_batched(mod_dict, out, target_mod, text_tokenizer)
|
1024 |
+
|
1025 |
+
return mod_dict
|
1026 |
+
|
1027 |
+
|
1028 |
+
@torch.no_grad()
|
1029 |
+
def generate(self, mod_dict, schedule, top_k=0.0, top_p=0.0, text_tokenizer=None, verbose=False, seed=None):
|
1030 |
+
""" Generates a sequence of tokens from the input modalities.
|
1031 |
+
:param mod_dict: Dictionary of modalities.
|
1032 |
+
:param schedule: Schedule of modalities to use.
|
1033 |
+
List of dictionaries containing {target_domain, scheme, num_tokens, temperature, cfg_scale, cfg_cond_domains}.
|
1034 |
+
:param top_k: top_k > 0: Keep only top k tokens with highest probability (a.k.a. top-k filtering).
|
1035 |
+
:param top_p: top_p > 0.0: Keep the top tokens with cumulative probability >= top_p (a.k.a. nucleus filtering).
|
1036 |
+
:param text_tokenizer: Text tokenizer.
|
1037 |
+
:param verbose: Whether to print progress.
|
1038 |
+
:param seed: Random seed.
|
1039 |
+
:return: Generated mod dict.
|
1040 |
+
"""
|
1041 |
+
|
1042 |
+
# Input embedding -> tokenizes the modalities - Many are placeholder for now
|
1043 |
+
mod_dict = copy.deepcopy(mod_dict)
|
1044 |
+
|
1045 |
+
for step, schedule_step_info in tqdm(enumerate(schedule), disable=not verbose):
|
1046 |
+
target_mod = schedule_step_info['target_domain']
|
1047 |
+
temp = schedule_step_info['temperature']
|
1048 |
+
cfg_scale = schedule_step_info.get('cfg_scale', 1.0)
|
1049 |
+
cfg_conditioning = schedule_step_info.get('cfg_cond_domains', [])
|
1050 |
+
seed_i = seed + step if seed is not None else None
|
1051 |
+
|
1052 |
+
if self.model.modality_info[target_mod]['type'] == 'img':
|
1053 |
+
scheme = schedule_step_info['scheme']
|
1054 |
+
num_select = schedule_step_info['num_tokens']
|
1055 |
+
|
1056 |
+
if scheme.lower() == 'maskgit':
|
1057 |
+
if cfg_scale == 1.0 or len(cfg_conditioning) == 0:
|
1058 |
+
mod_dict = self.maskgit_step_batched(
|
1059 |
+
mod_dict, target_mod, num_select, temperature=temp,
|
1060 |
+
top_k=top_k, top_p=top_p, seed=seed_i
|
1061 |
+
)
|
1062 |
+
else:
|
1063 |
+
mod_dict = self.guided_maskgit_step_batched(
|
1064 |
+
mod_dict, target_mod, num_select, temperature=temp, top_k=top_k, top_p=top_p,
|
1065 |
+
conditioning=cfg_conditioning, guidance_scale=cfg_scale, seed=seed_i
|
1066 |
+
)
|
1067 |
+
elif scheme.lower() == 'roar':
|
1068 |
+
if cfg_scale == 1.0 or len(cfg_conditioning) == 0:
|
1069 |
+
mod_dict = self.roar_step_batched(
|
1070 |
+
mod_dict, target_mod, num_select, temperature=temp,
|
1071 |
+
top_k=top_k, top_p=top_p, seed=seed_i
|
1072 |
+
)
|
1073 |
+
else:
|
1074 |
+
mod_dict = self.guided_roar_step_batched(
|
1075 |
+
mod_dict, target_mod, num_select, temperature=temp, top_k=top_k, top_p=top_p,
|
1076 |
+
conditioning=cfg_conditioning, guidance_scale=cfg_scale, seed=seed_i
|
1077 |
+
)
|
1078 |
+
else:
|
1079 |
+
raise ValueError("Invalid sampling scheme")
|
1080 |
+
elif self.model.modality_info[target_mod]['type'] in ['seq', 'seq_token']:
|
1081 |
+
if cfg_scale == 1.0 or len(cfg_conditioning) == 0:
|
1082 |
+
mod_dict = self.autoregressive_step_batched(
|
1083 |
+
mod_dict, target_mod, temperature=temp, top_k=top_k, top_p=top_p,
|
1084 |
+
text_tokenizer=text_tokenizer, seed=seed_i
|
1085 |
+
)
|
1086 |
+
else:
|
1087 |
+
mod_dict = self.guided_autoregressive_step_batched(
|
1088 |
+
mod_dict, target_mod, temperature=temp, top_k=top_k, top_p=top_p,
|
1089 |
+
text_tokenizer=text_tokenizer, conditioning=cfg_conditioning,
|
1090 |
+
guidance_scale=cfg_scale, seed=seed_i
|
1091 |
+
)
|
1092 |
+
else:
|
1093 |
+
raise ValueError("Invalid schedule")
|
1094 |
+
|
1095 |
+
return mod_dict
|
1096 |
+
|
1097 |
+
|
1098 |
+
@torch.no_grad()
|
1099 |
+
def generate_iter(self, mod_dict, schedule, top_k=0.0, top_p=0.0, text_tokenizer=None, verbose=False, seed=None):
|
1100 |
+
""" Iterator that generates a sequence of tokens from the input modalities step by step.
|
1101 |
+
:param mod_dict: Dictionary of modalities.
|
1102 |
+
:param schedule: Schedule of modalities to use.
|
1103 |
+
List of dictionaries containing {target_domain, scheme, num_tokens, temperature, cfg_scale, cfg_cond_domains}.
|
1104 |
+
:param top_k: top_k > 0: Keep only top k tokens with highest probability (a.k.a. top-k filtering).
|
1105 |
+
:param top_p: top_p > 0.0: Keep the top tokens with cumulative probability >= top_p (a.k.a. nucleus filtering).
|
1106 |
+
:param text_tokenizer: Text tokenizer.
|
1107 |
+
:param verbose: Whether to print progress.
|
1108 |
+
:param seed: Random seed.
|
1109 |
+
:return: Iterator of generated mod dict.
|
1110 |
+
"""
|
1111 |
+
|
1112 |
+
# Input embedding -> tokenizes the modalities - Many are placeholder for now
|
1113 |
+
mod_dict = copy.deepcopy(mod_dict)
|
1114 |
+
|
1115 |
+
for step, schedule_step_info in tqdm(enumerate(schedule), disable=not verbose):
|
1116 |
+
target_mod = schedule_step_info['target_domain']
|
1117 |
+
temp = schedule_step_info['temperature']
|
1118 |
+
cfg_scale = schedule_step_info.get('cfg_scale', 1.0)
|
1119 |
+
cfg_conditioning = schedule_step_info.get('cfg_cond_domains', [])
|
1120 |
+
seed_i = seed + step if seed is not None else None
|
1121 |
+
|
1122 |
+
if self.model.modality_info[target_mod]['type'] == 'img':
|
1123 |
+
scheme = schedule_step_info['scheme']
|
1124 |
+
num_select = schedule_step_info['num_tokens']
|
1125 |
+
|
1126 |
+
if scheme.lower() == 'maskgit':
|
1127 |
+
if cfg_scale == 1.0 or len(cfg_conditioning) == 0:
|
1128 |
+
mod_dict = self.maskgit_step_batched(
|
1129 |
+
mod_dict, target_mod, num_select, temperature=temp,
|
1130 |
+
top_k=top_k, top_p=top_p, seed=seed_i
|
1131 |
+
)
|
1132 |
+
else:
|
1133 |
+
mod_dict = self.guided_maskgit_step_batched(
|
1134 |
+
mod_dict, target_mod, num_select, temperature=temp, top_k=top_k, top_p=top_p,
|
1135 |
+
conditioning=cfg_conditioning, guidance_scale=cfg_scale, seed=seed_i,
|
1136 |
+
write_all_predictions=True
|
1137 |
+
)
|
1138 |
+
elif scheme.lower() == 'roar':
|
1139 |
+
if cfg_scale == 1.0 or len(cfg_conditioning) == 0:
|
1140 |
+
mod_dict = self.roar_step_batched(
|
1141 |
+
mod_dict, target_mod, num_select, temperature=temp,
|
1142 |
+
top_k=top_k, top_p=top_p, seed=seed_i
|
1143 |
+
)
|
1144 |
+
else:
|
1145 |
+
mod_dict = self.guided_roar_step_batched(
|
1146 |
+
mod_dict, target_mod, num_select, temperature=temp, top_k=top_k, top_p=top_p,
|
1147 |
+
conditioning=cfg_conditioning, guidance_scale=cfg_scale, seed=seed_i
|
1148 |
+
)
|
1149 |
+
else:
|
1150 |
+
raise ValueError("Invalid sampling scheme")
|
1151 |
+
elif self.model.modality_info[target_mod]['type'] in ['seq', 'seq_token']:
|
1152 |
+
if cfg_scale == 1.0 or len(cfg_conditioning) == 0:
|
1153 |
+
mod_dict = self.autoregressive_step_batched(
|
1154 |
+
mod_dict, target_mod, temperature=temp, top_k=top_k, top_p=top_p,
|
1155 |
+
text_tokenizer=text_tokenizer, seed=seed_i
|
1156 |
+
)
|
1157 |
+
else:
|
1158 |
+
mod_dict = self.guided_autoregressive_step_batched(
|
1159 |
+
mod_dict, target_mod, temperature=temp, top_k=top_k, top_p=top_p,
|
1160 |
+
text_tokenizer=text_tokenizer, conditioning=cfg_conditioning,
|
1161 |
+
guidance_scale=cfg_scale, seed=seed_i
|
1162 |
+
)
|
1163 |
+
else:
|
1164 |
+
raise ValueError("Invalid schedule")
|
1165 |
+
|
1166 |
+
yield mod_dict
|
1167 |
+
|
1168 |
+
@torch.no_grad()
|
1169 |
+
def generate_multi_guided(self, uncond_dict, cond_dicts, schedule, top_k=0.0, top_p=0.0,
|
1170 |
+
text_tokenizer=None, verbose=False, seed=None):
|
1171 |
+
# Generation function for multiple weighted conditions
|
1172 |
+
|
1173 |
+
# To detect when a modality has finished generating, we keep track of the current target modality
|
1174 |
+
cur_target_mod = schedule[0]['target_domain']
|
1175 |
+
|
1176 |
+
uncond_dict = copy.deepcopy(uncond_dict)
|
1177 |
+
cond_dicts = copy.deepcopy(cond_dicts)
|
1178 |
+
|
1179 |
+
# Add the to-be-generated modality to the conditional dicts
|
1180 |
+
for i in range(len(cond_dicts)):
|
1181 |
+
cond_dicts[i][cur_target_mod] = copy.deepcopy(uncond_dict[cur_target_mod])
|
1182 |
+
|
1183 |
+
for step, schedule_step_info in tqdm(enumerate(schedule), disable=not verbose):
|
1184 |
+
target_mod = schedule_step_info['target_domain']
|
1185 |
+
temp = schedule_step_info['temperature']
|
1186 |
+
num_select = schedule_step_info['num_tokens']
|
1187 |
+
cond_weights = schedule_step_info['cfg_scale']
|
1188 |
+
|
1189 |
+
# Once a modality is fully generated, add it as a new condition
|
1190 |
+
if cur_target_mod != target_mod:
|
1191 |
+
for i in range(len(cond_dicts)):
|
1192 |
+
# Remove the previously generated modality from the conditionings
|
1193 |
+
del cond_dicts[i][cur_target_mod]
|
1194 |
+
# Add the next modality to be generated to the conditionings
|
1195 |
+
cond_dicts[i][target_mod] = copy.deepcopy(uncond_dict[target_mod])
|
1196 |
+
|
1197 |
+
# Remove the fully generated modality from the unconditional dict inputs
|
1198 |
+
uncond_dict[cur_target_mod]['input_mask'][:] = True
|
1199 |
+
|
1200 |
+
# Add the previously generated modality as an additional condition
|
1201 |
+
new_cond = {}
|
1202 |
+
new_cond[cur_target_mod] = copy.deepcopy(uncond_dict[cur_target_mod])
|
1203 |
+
new_cond[cur_target_mod]['input_mask'][:] = False
|
1204 |
+
new_cond[cur_target_mod]['target_mask'][:] = True
|
1205 |
+
new_cond[target_mod] = copy.deepcopy(uncond_dict[target_mod])
|
1206 |
+
cond_dicts.append(new_cond)
|
1207 |
+
|
1208 |
+
cur_target_mod = target_mod
|
1209 |
+
|
1210 |
+
if self.model.modality_info[target_mod]['type'] == 'img':
|
1211 |
+
scheme = schedule_step_info['scheme']
|
1212 |
+
|
1213 |
+
if scheme.lower() == 'maskgit':
|
1214 |
+
uncond_dict, cond_dicts = self.multi_guided_maskgit_step_batched(
|
1215 |
+
uncond_dict, cond_dicts, cond_weights, target_mod, num_select, temp, top_k, top_p, seed=seed
|
1216 |
+
)
|
1217 |
+
elif scheme.lower() == 'roar':
|
1218 |
+
uncond_dict, cond_dicts = self.multi_guided_roar_step_batched(
|
1219 |
+
uncond_dict, cond_dicts, cond_weights, target_mod, num_select, temp, top_k, top_p, seed=seed
|
1220 |
+
)
|
1221 |
+
else:
|
1222 |
+
raise ValueError("Invalid sampling scheme")
|
1223 |
+
|
1224 |
+
else:
|
1225 |
+
raise NotImplementedError("Only image modalities are supported for now")
|
1226 |
+
|
1227 |
+
return uncond_dict
|
1228 |
+
|
1229 |
+
@torch.no_grad()
|
1230 |
+
def generate_sam_dense(self, mod_dict, schedule, text_tokenizer, batch_size=16,
|
1231 |
+
key='sam_instance', top_k=0.0, top_p=0.0, seed=None, verbose=False):
|
1232 |
+
# Generation function for dense SAM instance prediction
|
1233 |
+
|
1234 |
+
device = mod_dict[list(mod_dict.keys())[0]]['tensor'].device
|
1235 |
+
mod_dict = copy.deepcopy(mod_dict)
|
1236 |
+
# Repeat the input batch to match the batch size
|
1237 |
+
expanded_batch = expand_to_batch(copy.deepcopy(mod_dict), batch_size=batch_size)
|
1238 |
+
|
1239 |
+
# Filter the schedule to only include the key domain
|
1240 |
+
schedule = [s for s in schedule if s['target_domain'] == key]
|
1241 |
+
|
1242 |
+
out_dict = self.generate(
|
1243 |
+
expanded_batch, schedule, text_tokenizer=text_tokenizer,
|
1244 |
+
verbose=verbose, seed=seed,
|
1245 |
+
top_p=top_p, top_k=top_k,
|
1246 |
+
)
|
1247 |
+
|
1248 |
+
# Merge the batch generated sequences into one sequence
|
1249 |
+
sentinel_ids = set(get_sentinel_to_id_mapping(text_tokenizer).values())
|
1250 |
+
merged_seq = []
|
1251 |
+
|
1252 |
+
for i in range(batch_size):
|
1253 |
+
|
1254 |
+
input_seq = out_dict[key]['tensor'][i]
|
1255 |
+
input_seq = input_seq[out_dict[key]['input_mask'][i] == 0]
|
1256 |
+
input_seq = input_seq.tolist()
|
1257 |
+
|
1258 |
+
target_seq = out_dict[key]['tensor'][i]
|
1259 |
+
target_seq = target_seq[out_dict[key]['target_mask'][i] == 0]
|
1260 |
+
target_seq = target_seq.tolist()
|
1261 |
+
|
1262 |
+
merged_seq.extend(merge_span_masking(input_seq, target_seq, sentinel_ids=sentinel_ids))
|
1263 |
+
|
1264 |
+
merged_seq = torch.tensor(merged_seq, device=device).unsqueeze(0)
|
1265 |
+
|
1266 |
+
mod_dict[key] = {
|
1267 |
+
'tensor': merged_seq,
|
1268 |
+
'input_mask': torch.zeros(merged_seq.shape, dtype=torch.bool, device=device),
|
1269 |
+
'target_mask': torch.ones(merged_seq.shape, dtype=torch.bool, device=device),
|
1270 |
+
'decoder_attention_mask': torch.zeros(merged_seq.shape, dtype=torch.bool, device=device),
|
1271 |
+
}
|
1272 |
+
|
1273 |
+
return mod_dict
|
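For orientation, here is a minimal sketch of how the `generate` method above might be driven. It is not part of the diff: the sampler object, the modality keys (`tok_rgb@224`, `caption`, `det`), and the helper `run_generation` are assumptions for illustration; only the schedule keys (`target_domain`, `scheme`, `num_tokens`, `temperature`, `cfg_scale`, `cfg_cond_domains`) and the `generate` signature come from the code above.

def run_generation(sampler, mod_dict, text_tokenizer):
    # `sampler` is assumed to be an instance of the sampling class defined in this
    # file, already wrapping a trained 4M model; `mod_dict` is a pre-built modality dict.
    schedule = [
        # One MaskGIT step on an image token modality, guided by the caption condition.
        # In practice this entry is repeated until all image tokens are filled in.
        {'target_domain': 'tok_rgb@224', 'scheme': 'maskgit', 'num_tokens': 8,
         'temperature': 1.0, 'cfg_scale': 3.0, 'cfg_cond_domains': ['caption']},
        # A sequence modality (here hypothetically 'det' for bounding boxes) is decoded
        # autoregressively; 'scheme' and 'num_tokens' are not read for 'seq'-type modalities.
        {'target_domain': 'det', 'temperature': 0.7, 'cfg_scale': 1.0,
         'cfg_cond_domains': []},
    ]
    return sampler.generate(mod_dict, schedule, top_k=0.0, top_p=0.9,
                            text_tokenizer=text_tokenizer, verbose=True, seed=0)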
fourm/models/lora_utils.py
ADDED
@@ -0,0 +1,177 @@
1 |
+
# Copyright 2024 EPFL and Apple Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
from typing import List, Set, Optional, Type
|
15 |
+
|
16 |
+
import torch
|
17 |
+
import torch.nn as nn
|
18 |
+
|
19 |
+
|
20 |
+
SELF_ATTENTION_MODULES = {'Attention', 'NormAttention'}
|
21 |
+
CROSS_ATTENTION_MODULES = {'CrossAttention', 'NormCrossAttention'}
|
22 |
+
ATTENTION_MODULES = SELF_ATTENTION_MODULES | CROSS_ATTENTION_MODULES
|
23 |
+
MLP_MODULES = {'Mlp', 'GatedMlp', 'SwiGLUFFNFused'} # SwiGLUFFNFused is from DINOv2
|
24 |
+
TRANSFORMER_MODULES = ATTENTION_MODULES | MLP_MODULES
|
25 |
+
|
26 |
+
|
27 |
+
def get_LoRA_module_names(id: str) -> Set[str]:
|
28 |
+
""" Returns a list of module names that are LoRA-adapted for the given id. """
|
29 |
+
id = id.lower()
|
30 |
+
if id in ['selfattn', 'selfattention', 'self_attn', 'self_attention']:
|
31 |
+
return SELF_ATTENTION_MODULES
|
32 |
+
elif id in ['crossattn', 'crossattention', 'cross_attn', 'cross_attention']:
|
33 |
+
return CROSS_ATTENTION_MODULES
|
34 |
+
elif id in ['attn', 'attention']:
|
35 |
+
return ATTENTION_MODULES
|
36 |
+
elif id in ['mlp']:
|
37 |
+
return MLP_MODULES
|
38 |
+
elif id in ['all', 'transformer']:
|
39 |
+
return TRANSFORMER_MODULES
|
40 |
+
else:
|
41 |
+
raise ValueError(f'Unknown LoRA module id {id}.')
|
42 |
+
|
43 |
+
|
44 |
+
class LoRAWrapper(nn.Module):
|
45 |
+
"""Low-Rank Adaptation Wrapper for linear layers.
|
46 |
+
See https://arxiv.org/abs/2106.09685
|
47 |
+
|
48 |
+
Args:
|
49 |
+
linear: nn.Linear layer to wrap
|
50 |
+
rank: Rank of adaptation matrix B@A
|
51 |
+
scale: x = W_0@x + scale * B@A@x
|
52 |
+
num_packed_linear: Set to > 1 when wrapping e.g. packed kv, or qkv attention weights.
|
53 |
+
Weights will be initialized as if num_packed_linear = 1, but the LoRA bottleneck will
|
54 |
+
be num_packed_linear times larger.
|
55 |
+
"""
|
56 |
+
def __init__(self, linear: nn.Module, rank: int = 4, scale: float = 1.0, num_packed_linear: int = 1):
|
57 |
+
super().__init__()
|
58 |
+
self.rank = rank
|
59 |
+
self.scale = scale
|
60 |
+
self.in_features, self.out_features = linear.in_features, linear.out_features
|
61 |
+
assert num_packed_linear * rank <= min(self.in_features, self.out_features), \
|
62 |
+
f'LoRA rank {num_packed_linear} * {rank} must be less than or equal to {min(self.in_features, self.out_features)}'
|
63 |
+
|
64 |
+
self.linear = linear
|
65 |
+
self.lora_down = nn.Linear(self.in_features, num_packed_linear*rank, bias=False)
|
66 |
+
self.lora_up = nn.Linear(num_packed_linear*rank, self.out_features, bias=False)
|
67 |
+
|
68 |
+
nn.init.normal_(self.lora_down.weight, std=1/rank)
|
69 |
+
nn.init.zeros_(self.lora_up.weight)
|
70 |
+
|
71 |
+
def fuse_LoRA_into_linear(self) -> nn.Linear:
|
72 |
+
""" Returns a single nn.Linear layer with the LoRA matrix fused into the original one. """
|
73 |
+
fused_linear = nn.Linear(self.in_features, self.out_features, bias=self.linear.bias is not None)
|
74 |
+
fused_linear.weight.data = self.linear.weight + self.scale * (self.lora_up.weight @ self.lora_down.weight)
|
75 |
+
if self.linear.bias is not None:
|
76 |
+
fused_linear.bias.data = self.linear.bias
|
77 |
+
return fused_linear
|
78 |
+
|
79 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
80 |
+
""" LoRA adapted linear layer forward pass. """
|
81 |
+
return self.linear(x) + self.lora_up(self.lora_down(x)) * self.scale
|
82 |
+
|
83 |
+
|
84 |
+
def _find_modules(
|
85 |
+
model,
|
86 |
+
ancestor_class: Optional[Set[str]] = None,
|
87 |
+
search_class: List[Type[nn.Module]] = [nn.Linear],
|
88 |
+
exclude_children_of: Optional[List[Type[nn.Module]]] = [LoRAWrapper],
|
89 |
+
):
|
90 |
+
"""
|
91 |
+
Find all modules of a certain class (or union of classes) that are direct or
|
92 |
+
indirect descendants of other modules of a certain class (or union of classes).
|
93 |
+
|
94 |
+
Returns all matching modules, along with the parent of those modules and the
|
95 |
+
names they are referenced by.
|
96 |
+
|
97 |
+
Adapted from https://github.com/cloneofsimo/lora/blob/master/lora_diffusion/lora.py
|
98 |
+
"""
|
99 |
+
# Get the targets we should replace all linears under
|
100 |
+
if ancestor_class is not None:
|
101 |
+
ancestors = (
|
102 |
+
module
|
103 |
+
for module in model.modules()
|
104 |
+
if module.__class__.__name__ in ancestor_class
|
105 |
+
)
|
106 |
+
else:
|
107 |
+
# this, incase you want to naively iterate over all modules.
|
108 |
+
ancestors = [module for module in model.modules()]
|
109 |
+
|
110 |
+
# For each target find every linear_class module that isn't a child of a LoRA layer
|
111 |
+
for ancestor in ancestors:
|
112 |
+
for fullname, module in ancestor.named_modules():
|
113 |
+
if any([isinstance(module, _class) for _class in search_class]):
|
114 |
+
# Find the direct parent if this is a descendant, not a child, of target
|
115 |
+
*path, name = fullname.split(".")
|
116 |
+
parent = ancestor
|
117 |
+
while path:
|
118 |
+
parent = parent.get_submodule(path.pop(0))
|
119 |
+
# Skip this linear if it's a child of a LoRA layer
|
120 |
+
if exclude_children_of and any(
|
121 |
+
[isinstance(parent, _class) for _class in exclude_children_of]
|
122 |
+
):
|
123 |
+
continue
|
124 |
+
# Otherwise, yield it
|
125 |
+
yield parent, name, module
|
126 |
+
|
127 |
+
|
128 |
+
def inject_trainable_LoRA(
|
129 |
+
model: nn.Module,
|
130 |
+
rank: int = 4,
|
131 |
+
scale: float = 1.0,
|
132 |
+
target_replace_modules: Set[str] = ATTENTION_MODULES
|
133 |
+
) -> None:
|
134 |
+
"""Replaces all linear layers of the specified modules with LoRA-adapted linear layers.
|
135 |
+
Modifies the model in-place.
|
136 |
+
|
137 |
+
Args:
|
138 |
+
model: nn.Module to modify
|
139 |
+
rank: Rank of adaptation matrix B@A
|
140 |
+
scale: x = W_0@x + scale * B@A@x
|
141 |
+
target_replace_modules: Set of module names to replace linear layers in.
|
142 |
+
"""
|
143 |
+
for _module, name, _child_module in _find_modules(
|
144 |
+
model, target_replace_modules, search_class=[nn.Linear]
|
145 |
+
):
|
146 |
+
if sorted(name) == sorted('qkv'):
|
147 |
+
num_packed_linear = 3
|
148 |
+
elif sorted(name) in [sorted('kv'), sorted('qk'), sorted('qv')]:
|
149 |
+
num_packed_linear = 2
|
150 |
+
else:
|
151 |
+
num_packed_linear = 1
|
152 |
+
|
153 |
+
_module._modules[name] = LoRAWrapper(_child_module, rank=rank, scale=scale, num_packed_linear=num_packed_linear)
|
154 |
+
|
155 |
+
|
156 |
+
def fuse_LoRA_into_linear(
|
157 |
+
model: nn.Module,
|
158 |
+
target_replace_modules: Set[str] = ATTENTION_MODULES
|
159 |
+
) -> None:
|
160 |
+
"""Fuses all LoRA-adapted linear layers back into single linear layers.
|
161 |
+
Modifies the model in-place.
|
162 |
+
|
163 |
+
Args:
|
164 |
+
model: nn.Module to modify
|
165 |
+
target_replace_modules: Set of module names whose LoRA-wrapped linear layers should be fused.
|
166 |
+
"""
|
167 |
+
for _module, name, _child_module in _find_modules(
|
168 |
+
model, target_replace_modules, search_class=[LoRAWrapper]
|
169 |
+
):
|
170 |
+
_module._modules[name] = _module._modules[name].fuse_LoRA_into_linear()
|
171 |
+
|
172 |
+
|
173 |
+
def unfreeze_all_LoRA_layers(model: nn.Module) -> None:
|
174 |
+
""" Unfreezes all LoRA-adapted linear layers. """
|
175 |
+
for name, param in model.named_parameters():
|
176 |
+
if 'lora' in name:
|
177 |
+
param.requires_grad = True
|
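Below is a small, self-contained sketch of how these LoRA utilities could be used. The toy `Attention` module and its dimensions are invented for illustration so that the class-name matching in `_find_modules` has something to latch onto; only the functions themselves (`get_LoRA_module_names`, `inject_trainable_LoRA`, `unfreeze_all_LoRA_layers`, `fuse_LoRA_into_linear`) come from the file above.

import torch.nn as nn
from fourm.models.lora_utils import (inject_trainable_LoRA, fuse_LoRA_into_linear,
                                     unfreeze_all_LoRA_layers, get_LoRA_module_names)

class Attention(nn.Module):  # class name matches SELF_ATTENTION_MODULES above
    def __init__(self, dim=64):
        super().__init__()
        self.qkv = nn.Linear(dim, 3 * dim)   # packed qkv -> detected as num_packed_linear=3
        self.proj = nn.Linear(dim, dim)

model = nn.Sequential(Attention(), nn.Linear(64, 10))

# Wrap the linear layers inside self-attention blocks with LoRA adapters (in-place)
inject_trainable_LoRA(model, rank=4, scale=1.0,
                      target_replace_modules=get_LoRA_module_names('selfattn'))

# Freeze everything, then unfreeze only the LoRA parameters for training
for p in model.parameters():
    p.requires_grad = False
unfreeze_all_LoRA_layers(model)

# After training, fold the adapters back into plain nn.Linear layers
fuse_LoRA_into_linear(model, target_replace_modules=get_LoRA_module_names('selfattn'))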
fourm/utils/__init__.py
ADDED
@@ -0,0 +1,22 @@
1 |
+
from .misc import *
|
2 |
+
from .checkpoint import *
|
3 |
+
from .timm.cross_entropy import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy
|
4 |
+
from .data_constants import *
|
5 |
+
from .dist import *
|
6 |
+
from .logger import *
|
7 |
+
from .timm.metrics import AverageMeter, accuracy
|
8 |
+
from .timm.mixup import FastCollateMixup, Mixup
|
9 |
+
from .timm.model import freeze, get_state_dict, unfreeze, unwrap_model
|
10 |
+
from .timm.model_builder import create_model
|
11 |
+
from .timm.model_ema import ModelEma, ModelEmaV2
|
12 |
+
from .native_scaler import NativeScalerWithGradNormCount
|
13 |
+
from .scheduler import cosine_scheduler, constant_scheduler, inverse_sqrt_scheduler
|
14 |
+
from .optim_factory import create_optimizer
|
15 |
+
from .timm.registry import model_entrypoint, register_model
|
16 |
+
from .timm.transforms import *
|
17 |
+
from .timm.transforms_factory import create_transform
|
18 |
+
from .tokenizer.text_tokenizer import *
|
19 |
+
from .s3_utils import *
|
20 |
+
from .run_name import *
|
21 |
+
from .generation_datasets import *
|
22 |
+
from .seeds import *
|
fourm/utils/checkpoint.py
ADDED
@@ -0,0 +1,185 @@
1 |
+
# Copyright 2024 EPFL and Apple Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
# --------------------------------------------------------
|
15 |
+
# Based on the timm code base
|
16 |
+
# https://github.com/huggingface/pytorch-image-models
|
17 |
+
# --------------------------------------------------------
|
18 |
+
import io
|
19 |
+
import os
|
20 |
+
import ast
|
21 |
+
import json
|
22 |
+
from pathlib import Path
|
23 |
+
from safetensors.torch import load as load_st
|
24 |
+
|
25 |
+
import torch
|
26 |
+
|
27 |
+
from .dist import save_on_main, is_main_process
|
28 |
+
from .timm.model import get_state_dict
|
29 |
+
from .s3_utils import save_on_s3
|
30 |
+
|
31 |
+
|
32 |
+
def _load_checkpoint_for_ema(model_ema, checkpoint):
|
33 |
+
"""
|
34 |
+
Workaround for ModelEma._load_checkpoint to accept an already-loaded object
|
35 |
+
"""
|
36 |
+
mem_file = io.BytesIO()
|
37 |
+
torch.save(checkpoint, mem_file)
|
38 |
+
mem_file.seek(0)
|
39 |
+
model_ema._load_checkpoint(mem_file)
|
40 |
+
|
41 |
+
|
42 |
+
def load_state_dict(model, state_dict, prefix='', ignore_missing=''):
|
43 |
+
missing_keys = []
|
44 |
+
unexpected_keys = []
|
45 |
+
error_msgs = []
|
46 |
+
# copy state_dict so _load_from_state_dict can modify it
|
47 |
+
metadata = getattr(state_dict, '_metadata', None)
|
48 |
+
state_dict = state_dict.copy()
|
49 |
+
if metadata is not None:
|
50 |
+
state_dict._metadata = metadata
|
51 |
+
|
52 |
+
def load(module, prefix=''):
|
53 |
+
local_metadata = {} if metadata is None else metadata.get(
|
54 |
+
prefix[:-1], {})
|
55 |
+
module._load_from_state_dict(
|
56 |
+
state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
|
57 |
+
for name, child in module._modules.items():
|
58 |
+
if child is not None:
|
59 |
+
load(child, prefix + name + '.')
|
60 |
+
|
61 |
+
load(model, prefix=prefix)
|
62 |
+
|
63 |
+
warn_missing_keys = []
|
64 |
+
ignore_missing_keys = []
|
65 |
+
for key in missing_keys:
|
66 |
+
keep_flag = True
|
67 |
+
for ignore_key in ignore_missing.split('|'):
|
68 |
+
if ignore_key in key:
|
69 |
+
keep_flag = False
|
70 |
+
break
|
71 |
+
if keep_flag:
|
72 |
+
warn_missing_keys.append(key)
|
73 |
+
else:
|
74 |
+
ignore_missing_keys.append(key)
|
75 |
+
|
76 |
+
missing_keys = warn_missing_keys
|
77 |
+
|
78 |
+
if len(missing_keys) > 0:
|
79 |
+
print("Weights of {} not initialized from pretrained model: {}".format(
|
80 |
+
model.__class__.__name__, missing_keys))
|
81 |
+
if len(unexpected_keys) > 0:
|
82 |
+
print("Weights from pretrained model not used in {}: {}".format(
|
83 |
+
model.__class__.__name__, unexpected_keys))
|
84 |
+
if len(ignore_missing_keys) > 0:
|
85 |
+
print("Ignored weights of {} not initialized from pretrained model: {}".format(
|
86 |
+
model.__class__.__name__, ignore_missing_keys))
|
87 |
+
if len(error_msgs) > 0:
|
88 |
+
print('\n'.join(error_msgs))
|
89 |
+
|
90 |
+
|
91 |
+
def save_model(args, epoch, model, model_without_ddp, optimizer, loss_scaler, loss_balancer=None, model_ema=None, ckpt_name=None, use_s3=False, all_nodes=False):
|
92 |
+
output_dir = Path(args.output_dir)
|
93 |
+
epoch_name = str(epoch)
|
94 |
+
ckpt_name = ckpt_name or epoch_name
|
95 |
+
|
96 |
+
# Only create the save_dict on the main process, unless all_nodes is set to True
|
97 |
+
if is_main_process() or (all_nodes and args.gpu == 0):
|
98 |
+
checkpoint_path = os.path.join(output_dir, f'checkpoint-{ckpt_name}.pth')
|
99 |
+
|
100 |
+
to_save = {
|
101 |
+
'model': model_without_ddp.state_dict(),
|
102 |
+
'epoch': epoch,
|
103 |
+
'args': args,
|
104 |
+
'scaler': loss_scaler.state_dict(),
|
105 |
+
}
|
106 |
+
|
107 |
+
if optimizer is not None:
|
108 |
+
to_save['optimizer'] = optimizer.state_dict()
|
109 |
+
|
110 |
+
if loss_balancer is not None:
|
111 |
+
to_save['loss_balancer'] = loss_balancer.state_dict()
|
112 |
+
|
113 |
+
if model_ema is not None:
|
114 |
+
to_save['model_ema'] = get_state_dict(model_ema)
|
115 |
+
|
116 |
+
save_on_main(to_save, checkpoint_path)
|
117 |
+
|
118 |
+
if use_s3:
|
119 |
+
s3_path = os.path.join(args.s3_save_dir, f'checkpoint-{ckpt_name}.pth')
|
120 |
+
save_on_s3(checkpoint_path, s3_path, args.s3_endpoint)
|
121 |
+
|
122 |
+
|
123 |
+
def auto_load_model(args, model, model_without_ddp, optimizer, loss_scaler, model_ema=None):
|
124 |
+
output_dir = Path(args.output_dir)
|
125 |
+
# torch.amp
|
126 |
+
if args.auto_resume and len(args.resume) == 0:
|
127 |
+
import glob
|
128 |
+
all_checkpoints = glob.glob(os.path.join(output_dir, 'checkpoint-*.pth'))
|
129 |
+
latest_ckpt = -1
|
130 |
+
for ckpt in all_checkpoints:
|
131 |
+
t = ckpt.split('-')[-1].split('.')[0]
|
132 |
+
if t.isdigit():
|
133 |
+
latest_ckpt = max(int(t), latest_ckpt)
|
134 |
+
if latest_ckpt >= 0:
|
135 |
+
args.resume = os.path.join(output_dir, 'checkpoint-%d.pth' % latest_ckpt)
|
136 |
+
print("Auto resume checkpoint: %s" % args.resume)
|
137 |
+
|
138 |
+
if args.resume:
|
139 |
+
if args.resume.startswith('https'):
|
140 |
+
checkpoint = torch.hub.load_state_dict_from_url(
|
141 |
+
args.resume, map_location='cpu')
|
142 |
+
else:
|
143 |
+
checkpoint = torch.load(args.resume, map_location='cpu')
|
144 |
+
model_without_ddp.load_state_dict(checkpoint['model'])
|
145 |
+
print("Resume checkpoint %s" % args.resume)
|
146 |
+
|
147 |
+
if 'optimizer' in checkpoint and 'epoch' in checkpoint:
|
148 |
+
optimizer.load_state_dict(checkpoint['optimizer'])
|
149 |
+
args.start_epoch = checkpoint['epoch'] + 1
|
150 |
+
|
151 |
+
if 'scaler' in checkpoint:
|
152 |
+
loss_scaler.load_state_dict(checkpoint['scaler'])
|
153 |
+
print("With optim & sched!")
|
154 |
+
|
155 |
+
if hasattr(args, 'model_ema') and args.model_ema:
|
156 |
+
_load_checkpoint_for_ema(model_ema, {'state_dict_ema': checkpoint['model_ema']})
|
157 |
+
print("With EMA!")
|
158 |
+
|
159 |
+
def parse_metadata(metadata_str):
|
160 |
+
metadata = {}
|
161 |
+
for k, v in metadata_str.items():
|
162 |
+
try:
|
163 |
+
v_parsed = ast.literal_eval(v)
|
164 |
+
except:
|
165 |
+
v_parsed = v
|
166 |
+
metadata[k] = v_parsed
|
167 |
+
return metadata
|
168 |
+
|
169 |
+
def load_safetensors(safetensors_path, return_metadata=True):
|
170 |
+
with open(safetensors_path, 'rb') as f:
|
171 |
+
data = f.read()
|
172 |
+
|
173 |
+
tensors = load_st(data)
|
174 |
+
|
175 |
+
if not return_metadata:
|
176 |
+
return tensors
|
177 |
+
|
178 |
+
n_header = data[:8]
|
179 |
+
n = int.from_bytes(n_header, "little")
|
180 |
+
metadata_bytes = data[8 : 8 + n]
|
181 |
+
header = json.loads(metadata_bytes)
|
182 |
+
metadata = header.get("__metadata__", {})
|
183 |
+
metadata = parse_metadata(metadata)
|
184 |
+
|
185 |
+
return tensors, metadata
|
fourm/utils/clip/__init__.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
from .clip import *
|
2 |
+
from .model import *
|
fourm/utils/clip/clip.py
ADDED
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# --------------------------------------------------------
|
2 |
+
# Based on the CLIP code base
|
3 |
+
# https://github.com/openai/CLIP
|
4 |
+
# --------------------------------------------------------
|
5 |
+
|
6 |
+
import hashlib
|
7 |
+
import os
|
8 |
+
import urllib
|
9 |
+
import warnings
|
10 |
+
from typing import Any, Union, List
|
11 |
+
from pkg_resources import packaging
|
12 |
+
|
13 |
+
import torch
|
14 |
+
from PIL import Image
|
15 |
+
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
|
16 |
+
from tqdm import tqdm
|
17 |
+
|
18 |
+
from .model import build_model
|
19 |
+
from .simple_tokenizer import SimpleTokenizer as _Tokenizer
|
20 |
+
|
21 |
+
try:
|
22 |
+
from torchvision.transforms import InterpolationMode
|
23 |
+
BICUBIC = InterpolationMode.BICUBIC
|
24 |
+
except ImportError:
|
25 |
+
BICUBIC = Image.BICUBIC
|
26 |
+
|
27 |
+
|
28 |
+
if packaging.version.parse(torch.__version__) < packaging.version.parse("1.7.1"):
|
29 |
+
warnings.warn("PyTorch version 1.7.1 or higher is recommended")
|
30 |
+
|
31 |
+
|
32 |
+
__all__ = ["available_models", "load", "tokenize"]
|
33 |
+
_tokenizer = _Tokenizer()
|
34 |
+
|
35 |
+
_MODELS = {
|
36 |
+
"RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt",
|
37 |
+
"RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt",
|
38 |
+
"RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt",
|
39 |
+
"RN50x16": "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt",
|
40 |
+
"ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
|
41 |
+
"ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
|
42 |
+
"ViT-L/14": "https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt",
|
43 |
+
"ViT-L/14@336px": "https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt",
|
44 |
+
}
|
45 |
+
|
46 |
+
|
47 |
+
def _download(url: str, root: str):
|
48 |
+
os.makedirs(root, exist_ok=True)
|
49 |
+
filename = os.path.basename(url)
|
50 |
+
|
51 |
+
expected_sha256 = url.split("/")[-2]
|
52 |
+
download_target = os.path.join(root, filename)
|
53 |
+
|
54 |
+
if os.path.exists(download_target) and not os.path.isfile(download_target):
|
55 |
+
raise RuntimeError(f"{download_target} exists and is not a regular file")
|
56 |
+
|
57 |
+
if os.path.isfile(download_target):
|
58 |
+
if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256:
|
59 |
+
return download_target
|
60 |
+
else:
|
61 |
+
warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
|
62 |
+
|
63 |
+
with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
|
64 |
+
with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop:
|
65 |
+
while True:
|
66 |
+
buffer = source.read(8192)
|
67 |
+
if not buffer:
|
68 |
+
break
|
69 |
+
|
70 |
+
output.write(buffer)
|
71 |
+
loop.update(len(buffer))
|
72 |
+
|
73 |
+
if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256:
|
74 |
+
raise RuntimeError(f"Model has been downloaded but the SHA256 checksum does not not match")
|
75 |
+
|
76 |
+
return download_target
|
77 |
+
|
78 |
+
|
79 |
+
def _convert_image_to_rgb(image):
|
80 |
+
return image.convert("RGB")
|
81 |
+
|
82 |
+
|
83 |
+
def _transform(n_px):
|
84 |
+
return Compose([
|
85 |
+
Resize(n_px, interpolation=BICUBIC),
|
86 |
+
CenterCrop(n_px),
|
87 |
+
_convert_image_to_rgb,
|
88 |
+
ToTensor(),
|
89 |
+
Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
|
90 |
+
])
|
91 |
+
|
92 |
+
|
93 |
+
def available_models() -> List[str]:
|
94 |
+
"""Returns the names of available CLIP models"""
|
95 |
+
return list(_MODELS.keys())
|
96 |
+
|
97 |
+
|
98 |
+
def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", jit: bool = False, download_root: str = None):
|
99 |
+
"""Load a CLIP model
|
100 |
+
|
101 |
+
Parameters
|
102 |
+
----------
|
103 |
+
name : str
|
104 |
+
A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict
|
105 |
+
|
106 |
+
device : Union[str, torch.device]
|
107 |
+
The device to put the loaded model
|
108 |
+
|
109 |
+
jit : bool
|
110 |
+
Whether to load the optimized JIT model or more hackable non-JIT model (default).
|
111 |
+
|
112 |
+
download_root: str
|
113 |
+
path to download the model files; by default, it uses "~/.cache/clip"
|
114 |
+
|
115 |
+
Returns
|
116 |
+
-------
|
117 |
+
model : torch.nn.Module
|
118 |
+
The CLIP model
|
119 |
+
|
120 |
+
preprocess : Callable[[PIL.Image], torch.Tensor]
|
121 |
+
A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input
|
122 |
+
"""
|
123 |
+
if name in _MODELS:
|
124 |
+
model_path = _download(_MODELS[name], download_root or os.path.expanduser("~/.cache/clip"))
|
125 |
+
elif os.path.isfile(name):
|
126 |
+
model_path = name
|
127 |
+
else:
|
128 |
+
raise RuntimeError(f"Model {name} not found; available models = {available_models()}")
|
129 |
+
|
130 |
+
try:
|
131 |
+
# loading JIT archive
|
132 |
+
model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval()
|
133 |
+
state_dict = None
|
134 |
+
except RuntimeError:
|
135 |
+
# loading saved state dict
|
136 |
+
if jit:
|
137 |
+
warnings.warn(f"File {model_path} is not a JIT archive. Loading as a state dict instead")
|
138 |
+
jit = False
|
139 |
+
state_dict = torch.load(model_path, map_location="cpu")
|
140 |
+
|
141 |
+
if not jit:
|
142 |
+
model = build_model(state_dict or model.state_dict()).to(device)
|
143 |
+
if str(device) == "cpu":
|
144 |
+
model.float()
|
145 |
+
return model, _transform(model.visual.input_resolution)
|
146 |
+
|
147 |
+
# patch the device names
|
148 |
+
device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
|
149 |
+
device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1]
|
150 |
+
|
151 |
+
def patch_device(module):
|
152 |
+
try:
|
153 |
+
graphs = [module.graph] if hasattr(module, "graph") else []
|
154 |
+
except RuntimeError:
|
155 |
+
graphs = []
|
156 |
+
|
157 |
+
if hasattr(module, "forward1"):
|
158 |
+
graphs.append(module.forward1.graph)
|
159 |
+
|
160 |
+
for graph in graphs:
|
161 |
+
for node in graph.findAllNodes("prim::Constant"):
|
162 |
+
if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"):
|
163 |
+
node.copyAttributes(device_node)
|
164 |
+
|
165 |
+
model.apply(patch_device)
|
166 |
+
patch_device(model.encode_image)
|
167 |
+
patch_device(model.encode_text)
|
168 |
+
|
169 |
+
# patch dtype to float32 on CPU
|
170 |
+
if str(device) == "cpu":
|
171 |
+
float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[])
|
172 |
+
float_input = list(float_holder.graph.findNode("aten::to").inputs())[1]
|
173 |
+
float_node = float_input.node()
|
174 |
+
|
175 |
+
def patch_float(module):
|
176 |
+
try:
|
177 |
+
graphs = [module.graph] if hasattr(module, "graph") else []
|
178 |
+
except RuntimeError:
|
179 |
+
graphs = []
|
180 |
+
|
181 |
+
if hasattr(module, "forward1"):
|
182 |
+
graphs.append(module.forward1.graph)
|
183 |
+
|
184 |
+
for graph in graphs:
|
185 |
+
for node in graph.findAllNodes("aten::to"):
|
186 |
+
inputs = list(node.inputs())
|
187 |
+
for i in [1, 2]: # dtype can be the second or third argument to aten::to()
|
188 |
+
if inputs[i].node()["value"] == 5:
|
189 |
+
inputs[i].node().copyAttributes(float_node)
|
190 |
+
|
191 |
+
model.apply(patch_float)
|
192 |
+
patch_float(model.encode_image)
|
193 |
+
patch_float(model.encode_text)
|
194 |
+
|
195 |
+
model.float()
|
196 |
+
|
197 |
+
return model, _transform(model.input_resolution.item())
|
198 |
+
|
199 |
+
|
200 |
+
def tokenize(texts: Union[str, List[str]], context_length: int = 77, truncate: bool = False) -> torch.LongTensor:
|
201 |
+
"""
|
202 |
+
Returns the tokenized representation of given input string(s)
|
203 |
+
|
204 |
+
Parameters
|
205 |
+
----------
|
206 |
+
texts : Union[str, List[str]]
|
207 |
+
An input string or a list of input strings to tokenize
|
208 |
+
|
209 |
+
context_length : int
|
210 |
+
The context length to use; all CLIP models use 77 as the context length
|
211 |
+
|
212 |
+
truncate: bool
|
213 |
+
Whether to truncate the text in case its encoding is longer than the context length
|
214 |
+
|
215 |
+
Returns
|
216 |
+
-------
|
217 |
+
A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
|
218 |
+
"""
|
219 |
+
if isinstance(texts, str):
|
220 |
+
texts = [texts]
|
221 |
+
|
222 |
+
sot_token = _tokenizer.encoder["<|startoftext|>"]
|
223 |
+
eot_token = _tokenizer.encoder["<|endoftext|>"]
|
224 |
+
all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts]
|
225 |
+
result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
|
226 |
+
|
227 |
+
for i, tokens in enumerate(all_tokens):
|
228 |
+
if len(tokens) > context_length:
|
229 |
+
if truncate:
|
230 |
+
tokens = tokens[:context_length]
|
231 |
+
tokens[-1] = eot_token
|
232 |
+
else:
|
233 |
+
raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}")
|
234 |
+
result[i, :len(tokens)] = torch.tensor(tokens)
|
235 |
+
|
236 |
+
return result
|
fourm/utils/clip/model.py
ADDED
@@ -0,0 +1,504 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# --------------------------------------------------------
|
2 |
+
# Based on the CLIP code base
|
3 |
+
# https://github.com/openai/CLIP
|
4 |
+
# --------------------------------------------------------
|
5 |
+
|
6 |
+
from collections import OrderedDict
|
7 |
+
from typing import Tuple, Union
|
8 |
+
import math
|
9 |
+
import numpy as np
|
10 |
+
import torch
|
11 |
+
import torch.nn.functional as F
|
12 |
+
from torch import nn
|
13 |
+
|
14 |
+
|
15 |
+
class Bottleneck(nn.Module):
|
16 |
+
expansion = 4
|
17 |
+
|
18 |
+
def __init__(self, inplanes, planes, stride=1):
|
19 |
+
super().__init__()
|
20 |
+
|
21 |
+
# all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
|
22 |
+
self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
|
23 |
+
self.bn1 = nn.BatchNorm2d(planes)
|
24 |
+
|
25 |
+
self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
|
26 |
+
self.bn2 = nn.BatchNorm2d(planes)
|
27 |
+
|
28 |
+
self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
|
29 |
+
|
30 |
+
self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
|
31 |
+
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
|
32 |
+
|
33 |
+
self.relu = nn.ReLU(inplace=True)
|
34 |
+
self.downsample = None
|
35 |
+
self.stride = stride
|
36 |
+
|
37 |
+
if stride > 1 or inplanes != planes * Bottleneck.expansion:
|
38 |
+
# downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
|
39 |
+
self.downsample = nn.Sequential(OrderedDict([
|
40 |
+
("-1", nn.AvgPool2d(stride)),
|
41 |
+
("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)),
|
42 |
+
("1", nn.BatchNorm2d(planes * self.expansion))
|
43 |
+
]))
|
44 |
+
|
45 |
+
def forward(self, x: torch.Tensor):
|
46 |
+
identity = x
|
47 |
+
|
48 |
+
out = self.relu(self.bn1(self.conv1(x)))
|
49 |
+
out = self.relu(self.bn2(self.conv2(out)))
|
50 |
+
out = self.avgpool(out)
|
51 |
+
out = self.bn3(self.conv3(out))
|
52 |
+
|
53 |
+
if self.downsample is not None:
|
54 |
+
identity = self.downsample(x)
|
55 |
+
|
56 |
+
out += identity
|
57 |
+
out = self.relu(out)
|
58 |
+
return out
|
59 |
+
|
60 |
+
|
61 |
+
class AttentionPool2d(nn.Module):
|
62 |
+
def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
|
63 |
+
super().__init__()
|
64 |
+
self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
|
65 |
+
self.k_proj = nn.Linear(embed_dim, embed_dim)
|
66 |
+
self.q_proj = nn.Linear(embed_dim, embed_dim)
|
67 |
+
self.v_proj = nn.Linear(embed_dim, embed_dim)
|
68 |
+
self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
|
69 |
+
self.num_heads = num_heads
|
70 |
+
|
71 |
+
def forward(self, x, return_all_tokens=False):
|
72 |
+
x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1) # NCHW -> (HW)NC
|
73 |
+
x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
|
74 |
+
x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC
|
75 |
+
x, _ = F.multi_head_attention_forward(
|
76 |
+
query=x, key=x, value=x,
|
77 |
+
embed_dim_to_check=x.shape[-1],
|
78 |
+
num_heads=self.num_heads,
|
79 |
+
q_proj_weight=self.q_proj.weight,
|
80 |
+
k_proj_weight=self.k_proj.weight,
|
81 |
+
v_proj_weight=self.v_proj.weight,
|
82 |
+
in_proj_weight=None,
|
83 |
+
in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
|
84 |
+
bias_k=None,
|
85 |
+
bias_v=None,
|
86 |
+
add_zero_attn=False,
|
87 |
+
dropout_p=0,
|
88 |
+
out_proj_weight=self.c_proj.weight,
|
89 |
+
out_proj_bias=self.c_proj.bias,
|
90 |
+
use_separate_proj_weight=True,
|
91 |
+
training=self.training,
|
92 |
+
need_weights=False
|
93 |
+
)
|
94 |
+
if return_all_tokens:
|
95 |
+
return x
|
96 |
+
else:
|
97 |
+
return x[0]
|
98 |
+
|
99 |
+
|
100 |
+
class ModifiedResNet(nn.Module):
|
101 |
+
"""
|
102 |
+
A ResNet class that is similar to torchvision's but contains the following changes:
|
103 |
+
- There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
|
104 |
+
- Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
|
105 |
+
- The final pooling layer is a QKV attention instead of an average pool
|
106 |
+
"""
|
107 |
+
|
108 |
+
def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
|
109 |
+
super().__init__()
|
110 |
+
self.output_dim = output_dim
|
111 |
+
self.input_resolution = input_resolution
|
112 |
+
|
113 |
+
# the 3-layer stem
|
114 |
+
self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
|
115 |
+
self.bn1 = nn.BatchNorm2d(width // 2)
|
116 |
+
self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False)
|
117 |
+
self.bn2 = nn.BatchNorm2d(width // 2)
|
118 |
+
self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
|
119 |
+
self.bn3 = nn.BatchNorm2d(width)
|
120 |
+
self.avgpool = nn.AvgPool2d(2)
|
121 |
+
self.relu = nn.ReLU(inplace=True)
|
122 |
+
|
123 |
+
# residual layers
|
124 |
+
self._inplanes = width # this is a *mutable* variable used during construction
|
125 |
+
self.layer1 = self._make_layer(width, layers[0])
|
126 |
+
self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
|
127 |
+
self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
|
128 |
+
self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
|
129 |
+
|
130 |
+
embed_dim = width * 32 # the ResNet feature dimension
|
131 |
+
self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim)
|
132 |
+
|
133 |
+
def _make_layer(self, planes, blocks, stride=1):
|
134 |
+
layers = [Bottleneck(self._inplanes, planes, stride)]
|
135 |
+
|
136 |
+
self._inplanes = planes * Bottleneck.expansion
|
137 |
+
for _ in range(1, blocks):
|
138 |
+
layers.append(Bottleneck(self._inplanes, planes))
|
139 |
+
|
140 |
+
return nn.Sequential(*layers)
|
141 |
+
|
142 |
+
def forward(self, x, return_side_out=False, return_all_tokens=False):
|
143 |
+
def stem(x):
|
144 |
+
for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), (self.conv3, self.bn3)]:
|
145 |
+
x = self.relu(bn(conv(x)))
|
146 |
+
x = self.avgpool(x)
|
147 |
+
return x
|
148 |
+
out = []
|
149 |
+
x = x.type(self.conv1.weight.dtype)
|
150 |
+
x = stem(x)
|
151 |
+
x = self.layer1(x)
|
152 |
+
if return_side_out:
|
153 |
+
out.append(x)
|
154 |
+
x = self.layer2(x)
|
155 |
+
if return_side_out:
|
156 |
+
out.append(x)
|
157 |
+
x = self.layer3(x)
|
158 |
+
if return_side_out:
|
159 |
+
out.append(x)
|
160 |
+
x = self.layer4(x)
|
161 |
+
if return_side_out:
|
162 |
+
out.append(x)
|
163 |
+
x = self.attnpool(x, return_all_tokens)
|
164 |
+
out.append(x)
|
165 |
+
if len(out) == 1:
|
166 |
+
return x
|
167 |
+
else:
|
168 |
+
return out
|
169 |
+
|
170 |
+
|
171 |
+
class LayerNorm(nn.LayerNorm):
|
172 |
+
"""Subclass torch's LayerNorm to handle fp16."""
|
173 |
+
|
174 |
+
def forward(self, x: torch.Tensor):
|
175 |
+
orig_type = x.dtype
|
176 |
+
ret = super().forward(x.type(torch.float32))
|
177 |
+
return ret.type(orig_type)
|
178 |
+
|
179 |
+
|
180 |
+
class QuickGELU(nn.Module):
|
181 |
+
def forward(self, x: torch.Tensor):
|
182 |
+
return x * torch.sigmoid(1.702 * x)
|
183 |
+
|
184 |
+
|
185 |
+
class ResidualAttentionBlock(nn.Module):
|
186 |
+
def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
|
187 |
+
super().__init__()
|
188 |
+
|
189 |
+
self.attn = nn.MultiheadAttention(d_model, n_head)
|
190 |
+
self.ln_1 = LayerNorm(d_model)
|
191 |
+
self.mlp = nn.Sequential(OrderedDict([
|
192 |
+
("c_fc", nn.Linear(d_model, d_model * 4)),
|
193 |
+
("gelu", QuickGELU()),
|
194 |
+
("c_proj", nn.Linear(d_model * 4, d_model))
|
195 |
+
]))
|
196 |
+
self.ln_2 = LayerNorm(d_model)
|
197 |
+
self.attn_mask = attn_mask
|
198 |
+
|
199 |
+
def attention(self, x: torch.Tensor):
|
200 |
+
self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
|
201 |
+
return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
|
202 |
+
|
203 |
+
def forward(self, x: torch.Tensor):
|
204 |
+
x = x + self.attention(self.ln_1(x))
|
205 |
+
x = x + self.mlp(self.ln_2(x))
|
206 |
+
return x
|
207 |
+
|
208 |
+
|
209 |
+
class Transformer(nn.Module):
|
210 |
+
def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None):
|
211 |
+
super().__init__()
|
212 |
+
self.width = width
|
213 |
+
self.layers = layers
|
214 |
+
self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)])
|
215 |
+
|
216 |
+
def forward(self, x: torch.Tensor, return_intermediate_out: bool = False):
|
217 |
+
if return_intermediate_out:
|
218 |
+
output = []
|
219 |
+
for block in self.resblocks:
|
220 |
+
x = block(x)
|
221 |
+
output.append(x)
|
222 |
+
return output
|
223 |
+
|
224 |
+
return self.resblocks(x)
|
225 |
+
|
226 |
+
|
227 |
+
class VisionTransformer(nn.Module):
|
228 |
+
def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int):
|
229 |
+
super().__init__()
|
230 |
+
self.input_resolution = input_resolution
|
231 |
+
self.patch_size = patch_size
|
232 |
+
self.output_dim = output_dim
|
233 |
+
self.width = width
|
234 |
+
self.heads = heads
|
235 |
+
self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
|
236 |
+
|
237 |
+
scale = width ** -0.5
|
238 |
+
self.class_embedding = nn.Parameter(scale * torch.randn(width))
|
239 |
+
self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
|
240 |
+
self.ln_pre = LayerNorm(width)
|
241 |
+
|
242 |
+
self.transformer = Transformer(width, layers, heads)
|
243 |
+
|
244 |
+
self.ln_post = LayerNorm(width)
|
245 |
+
self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
|
246 |
+
|
247 |
+
def forward(self, x: torch.Tensor, return_all_tokens=False, return_all_final_tokens=False, return_final_tokens_no_cls=False, **kwargs):
|
248 |
+
|
249 |
+
B, nc, w, h = x.shape
|
250 |
+
|
251 |
+
x = self.conv1(x) # shape = [*, width, grid, grid]
|
252 |
+
x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
|
253 |
+
x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
|
254 |
+
|
255 |
+
x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width]
|
256 |
+
|
257 |
+
if x.shape[1] != self.positional_embedding.shape[0]:
|
258 |
+
x = x + self.interpolate_pos_encoding(x, w, h).to(x.dtype)
|
259 |
+
else:
|
260 |
+
x = x + self.positional_embedding.to(x.dtype)
|
261 |
+
|
262 |
+
x = self.ln_pre(x)
|
263 |
+
|
264 |
+
x = x.permute(1, 0, 2) # NLD -> LND
|
265 |
+
x = self.transformer(x)
|
266 |
+
x = x.permute(1, 0, 2) # LND -> NLD
|
267 |
+
|
268 |
+
if return_all_tokens:
|
269 |
+
x = self.ln_post(x)
|
270 |
+
return x[:, 1:, :]
|
271 |
+
|
272 |
+
if return_all_final_tokens:
|
273 |
+
return self.ln_post(x) @ self.proj
|
274 |
+
|
275 |
+
if return_final_tokens_no_cls:
|
276 |
+
return self.ln_post(x)[:, 1:, :] @ self.proj
|
277 |
+
|
278 |
+
x = self.ln_post(x[:, 0, :])
|
279 |
+
|
280 |
+
if self.proj is not None:
|
281 |
+
x = x @ self.proj
|
282 |
+
|
283 |
+
return x
|
284 |
+
|
285 |
+
def interpolate_pos_encoding(self, x, w, h):
|
286 |
+
npatch = x.shape[1] - 1
|
287 |
+
N = self.positional_embedding.shape[0] - 1 # 256 for large
|
288 |
+
if npatch == N and w == h:
|
289 |
+
return self.positional_embedding
|
290 |
+
class_pos_embed = self.positional_embedding[[0]]
|
291 |
+
patch_pos_embed = self.positional_embedding[1:]
|
292 |
+
dim = x.shape[-1]
|
293 |
+
w0 = w // self.patch_size
|
294 |
+
h0 = h // self.patch_size
|
295 |
+
# we add a small number to avoid floating point error in the interpolation
|
296 |
+
# see discussion at https://github.com/facebookresearch/dino/issues/8
|
297 |
+
w0, h0 = w0 + 0.1, h0 + 0.1
|
298 |
+
patch_pos_embed = nn.functional.interpolate(
|
299 |
+
patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(0, 3, 1, 2),
|
300 |
+
scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)),
|
301 |
+
mode='bicubic',
|
302 |
+
)
|
303 |
+
assert int(w0) == patch_pos_embed.shape[-2] and int(h0) == patch_pos_embed.shape[-1]
|
304 |
+
patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
|
305 |
+
return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)
|
306 |
+
|
307 |
+
|
308 |
+
class CLIP(nn.Module):
|
309 |
+
def __init__(self,
|
310 |
+
embed_dim: int, # 512
|
311 |
+
# vision
|
312 |
+
image_resolution: int, # 224
|
313 |
+
vision_layers: Union[Tuple[int, int, int, int], int], # 12
|
314 |
+
vision_width: int, # 768
|
315 |
+
vision_patch_size: int, # 16
|
316 |
+
# text
|
317 |
+
context_length: int, # 77
|
318 |
+
vocab_size: int, # 49408
|
319 |
+
transformer_width: int, # 512
|
320 |
+
transformer_heads: int, # 8
|
321 |
+
transformer_layers: int # 12
|
322 |
+
):
|
323 |
+
super().__init__()
|
324 |
+
self.context_length = context_length
|
325 |
+
|
326 |
+
if isinstance(vision_layers, (tuple, list)):
|
327 |
+
vision_heads = vision_width * 32 // 64
|
328 |
+
self.visual = ModifiedResNet(
|
329 |
+
layers=vision_layers,
|
330 |
+
output_dim=embed_dim,
|
331 |
+
heads=vision_heads,
|
332 |
+
input_resolution=image_resolution,
|
333 |
+
width=vision_width
|
334 |
+
)
|
335 |
+
else:
|
336 |
+
vision_heads = vision_width // 64
|
337 |
+
self.visual = VisionTransformer(
|
338 |
+
input_resolution=image_resolution,
|
339 |
+
patch_size=vision_patch_size,
|
340 |
+
width=vision_width,
|
341 |
+
layers=vision_layers,
|
342 |
+
heads=vision_heads,
|
343 |
+
output_dim=embed_dim
|
344 |
+
)
|
345 |
+
|
346 |
+
self.transformer = Transformer(
|
347 |
+
width=transformer_width,
|
348 |
+
layers=transformer_layers,
|
349 |
+
heads=transformer_heads,
|
350 |
+
attn_mask=self.build_attention_mask()
|
351 |
+
)
|
352 |
+
|
353 |
+
self.vocab_size = vocab_size
|
354 |
+
self.token_embedding = nn.Embedding(vocab_size, transformer_width)
|
355 |
+
self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
|
356 |
+
self.ln_final = LayerNorm(transformer_width)
|
357 |
+
|
358 |
+
self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
|
359 |
+
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
|
360 |
+
|
361 |
+
self.initialize_parameters()
|
362 |
+
|
363 |
+
def initialize_parameters(self):
|
364 |
+
nn.init.normal_(self.token_embedding.weight, std=0.02)
|
365 |
+
nn.init.normal_(self.positional_embedding, std=0.01)
|
366 |
+
|
367 |
+
if isinstance(self.visual, ModifiedResNet):
|
368 |
+
if self.visual.attnpool is not None:
|
369 |
+
std = self.visual.attnpool.c_proj.in_features ** -0.5
|
370 |
+
nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
|
371 |
+
nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
|
372 |
+
nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
|
373 |
+
nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)
|
374 |
+
|
375 |
+
for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]:
|
376 |
+
for name, param in resnet_block.named_parameters():
|
377 |
+
if name.endswith("bn3.weight"):
|
378 |
+
nn.init.zeros_(param)
|
379 |
+
|
380 |
+
proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
|
381 |
+
attn_std = self.transformer.width ** -0.5
|
382 |
+
fc_std = (2 * self.transformer.width) ** -0.5
|
383 |
+
for block in self.transformer.resblocks:
|
384 |
+
nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
|
385 |
+
nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
|
386 |
+
nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
|
387 |
+
nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
|
388 |
+
|
389 |
+
if self.text_projection is not None:
|
390 |
+
nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)
|
391 |
+
|
392 |
+
def build_attention_mask(self):
|
393 |
+
# lazily create causal attention mask, with full attention between the vision tokens
|
394 |
+
# pytorch uses additive attention mask; fill with -inf
|
395 |
+
mask = torch.empty(self.context_length, self.context_length)
|
396 |
+
mask.fill_(float("-inf"))
|
397 |
+
mask.triu_(1) # zero out the lower diagonal
|
398 |
+
return mask
|
399 |
+
|
400 |
+
@property
|
401 |
+
def dtype(self):
|
402 |
+
return self.visual.conv1.weight.dtype
|
403 |
+
|
404 |
+
def encode_image(self, image, return_side_out=False, return_all_tokens=False, return_all_final_tokens=False, **kwargs):
|
405 |
+
return self.visual(image.type(self.dtype), return_all_tokens, return_all_final_tokens, **kwargs)
|
406 |
+
|
407 |
+
def encode_text(self, text, return_all_tokens=False, return_patch_tokens=False):
|
408 |
+
x = self.token_embedding(text).type(self.dtype) # [batch_size, n_ctx, d_model]
|
409 |
+
|
410 |
+
x = x + self.positional_embedding.type(self.dtype)
|
411 |
+
x = x.permute(1, 0, 2) # NLD -> LND
|
412 |
+
x = self.transformer(x)
|
413 |
+
x = x.permute(1, 0, 2) # LND -> NLD
|
414 |
+
x = self.ln_final(x).type(self.dtype)
|
415 |
+
|
416 |
+
if return_patch_tokens:
|
417 |
+
return x
|
418 |
+
# x.shape = [batch_size, n_ctx, transformer.width]
|
419 |
+
# take features from the eot embedding (eot_token is the highest number in each sequence)
|
420 |
+
if return_all_tokens:
|
421 |
+
x = x @ self.text_projection
|
422 |
+
else:
|
423 |
+
x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
|
424 |
+
return x
|
425 |
+
|
426 |
+
def forward(self, image, text):
|
427 |
+
image_features = self.encode_image(image)
|
428 |
+
text_features = self.encode_text(text)
|
429 |
+
|
430 |
+
# normalized features
|
431 |
+
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
|
432 |
+
text_features = text_features / text_features.norm(dim=-1, keepdim=True)
|
433 |
+
|
434 |
+
# cosine similarity as logits
|
435 |
+
logit_scale = self.logit_scale.exp()
|
436 |
+
logits_per_image = logit_scale * image_features @ text_features.t()
|
437 |
+
logits_per_text = logits_per_image.t()
|
438 |
+
|
439 |
+
# shape = [global_batch_size, global_batch_size]
|
440 |
+
return logits_per_image, logits_per_text
|
441 |
+
|
442 |
+
|
443 |
+
def convert_weights(model: nn.Module):
|
444 |
+
"""Convert applicable model parameters to fp16"""
|
445 |
+
|
446 |
+
def _convert_weights_to_fp16(l):
|
447 |
+
if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
|
448 |
+
l.weight.data = l.weight.data.half()
|
449 |
+
if l.bias is not None:
|
450 |
+
l.bias.data = l.bias.data.half()
|
451 |
+
|
452 |
+
if isinstance(l, nn.MultiheadAttention):
|
453 |
+
for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
|
454 |
+
tensor = getattr(l, attr)
|
455 |
+
if tensor is not None:
|
456 |
+
tensor.data = tensor.data.half()
|
457 |
+
|
458 |
+
for name in ["text_projection", "proj"]:
|
459 |
+
if hasattr(l, name):
|
460 |
+
attr = getattr(l, name)
|
461 |
+
if attr is not None:
|
462 |
+
attr.data = attr.data.half()
|
463 |
+
|
464 |
+
model.apply(_convert_weights_to_fp16)
|
465 |
+
|
466 |
+
|
467 |
+
def build_model(state_dict: dict):
|
468 |
+
vit = "visual.proj" in state_dict
|
469 |
+
|
470 |
+
if vit:
|
471 |
+
vision_width = state_dict["visual.conv1.weight"].shape[0]
|
472 |
+
vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
|
473 |
+
vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
|
474 |
+
grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
|
475 |
+
image_resolution = vision_patch_size * grid_size
|
476 |
+
else:
|
477 |
+
counts: list = [len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
|
478 |
+
vision_layers = tuple(counts)
|
479 |
+
vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
|
480 |
+
output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
|
481 |
+
vision_patch_size = None
|
482 |
+
assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
|
483 |
+
image_resolution = output_width * 32
|
484 |
+
|
485 |
+
embed_dim = state_dict["text_projection"].shape[1]
|
486 |
+
context_length = state_dict["positional_embedding"].shape[0]
|
487 |
+
vocab_size = state_dict["token_embedding.weight"].shape[0]
|
488 |
+
transformer_width = state_dict["ln_final.weight"].shape[0]
|
489 |
+
transformer_heads = transformer_width // 64
|
490 |
+
transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith(f"transformer.resblocks")))
|
491 |
+
|
492 |
+
model = CLIP(
|
493 |
+
embed_dim,
|
494 |
+
image_resolution, vision_layers, vision_width, vision_patch_size,
|
495 |
+
context_length, vocab_size, transformer_width, transformer_heads, transformer_layers
|
496 |
+
)
|
497 |
+
|
498 |
+
for key in ["input_resolution", "context_length", "vocab_size"]:
|
499 |
+
if key in state_dict:
|
500 |
+
del state_dict[key]
|
501 |
+
|
502 |
+
convert_weights(model)
|
503 |
+
model.load_state_dict(state_dict)
|
504 |
+
return model.eval()
|
fourm/utils/clip/simple_tokenizer.py
ADDED
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# --------------------------------------------------------
|
2 |
+
# Based on the CLIP code base
|
3 |
+
# https://github.com/openai/CLIP
|
4 |
+
# --------------------------------------------------------
|
5 |
+
|
6 |
+
import gzip
|
7 |
+
import html
|
8 |
+
import os
|
9 |
+
from functools import lru_cache
|
10 |
+
|
11 |
+
import ftfy
|
12 |
+
import regex as re
|
13 |
+
|
14 |
+
|
15 |
+
@lru_cache()
|
16 |
+
def default_bpe():
|
17 |
+
return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")
|
18 |
+
|
19 |
+
|
20 |
+
@lru_cache()
|
21 |
+
def bytes_to_unicode():
|
22 |
+
"""
|
23 |
+
Returns list of utf-8 byte and a corresponding list of unicode strings.
|
24 |
+
The reversible bpe codes work on unicode strings.
|
25 |
+
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
26 |
+
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
27 |
+
This is a signficant percentage of your normal, say, 32K bpe vocab.
|
28 |
+
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
29 |
+
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
30 |
+
"""
|
31 |
+
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
|
32 |
+
cs = bs[:]
|
33 |
+
n = 0
|
34 |
+
for b in range(2**8):
|
35 |
+
if b not in bs:
|
36 |
+
bs.append(b)
|
37 |
+
cs.append(2**8+n)
|
38 |
+
n += 1
|
39 |
+
cs = [chr(n) for n in cs]
|
40 |
+
return dict(zip(bs, cs))
|
41 |
+
|
42 |
+
|
43 |
+
def get_pairs(word):
|
44 |
+
"""Return set of symbol pairs in a word.
|
45 |
+
Word is represented as tuple of symbols (symbols being variable-length strings).
|
46 |
+
"""
|
47 |
+
pairs = set()
|
48 |
+
prev_char = word[0]
|
49 |
+
for char in word[1:]:
|
50 |
+
pairs.add((prev_char, char))
|
51 |
+
prev_char = char
|
52 |
+
return pairs
|
53 |
+
|
54 |
+
|
55 |
+
def basic_clean(text):
|
56 |
+
text = ftfy.fix_text(text)
|
57 |
+
text = html.unescape(html.unescape(text))
|
58 |
+
return text.strip()
|
59 |
+
|
60 |
+
|
61 |
+
def whitespace_clean(text):
|
62 |
+
text = re.sub(r'\s+', ' ', text)
|
63 |
+
text = text.strip()
|
64 |
+
return text
|
65 |
+
|
66 |
+
|
67 |
+
class SimpleTokenizer(object):
|
68 |
+
def __init__(self, bpe_path: str = default_bpe()):
|
69 |
+
self.byte_encoder = bytes_to_unicode()
|
70 |
+
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
|
71 |
+
merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
|
72 |
+
merges = merges[1:49152-256-2+1]
|
73 |
+
merges = [tuple(merge.split()) for merge in merges]
|
74 |
+
vocab = list(bytes_to_unicode().values())
|
75 |
+
vocab = vocab + [v+'</w>' for v in vocab]
|
76 |
+
for merge in merges:
|
77 |
+
vocab.append(''.join(merge))
|
78 |
+
vocab.extend(['<|startoftext|>', '<|endoftext|>'])
|
79 |
+
self.encoder = dict(zip(vocab, range(len(vocab))))
|
80 |
+
self.decoder = {v: k for k, v in self.encoder.items()}
|
81 |
+
self.bpe_ranks = dict(zip(merges, range(len(merges))))
|
82 |
+
self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
|
83 |
+
self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
|
84 |
+
|
85 |
+
def bpe(self, token):
|
86 |
+
if token in self.cache:
|
87 |
+
return self.cache[token]
|
88 |
+
word = tuple(token[:-1]) + ( token[-1] + '</w>',)
|
89 |
+
pairs = get_pairs(word)
|
90 |
+
|
91 |
+
if not pairs:
|
92 |
+
return token+'</w>'
|
93 |
+
|
94 |
+
while True:
|
95 |
+
bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
|
96 |
+
if bigram not in self.bpe_ranks:
|
97 |
+
break
|
98 |
+
first, second = bigram
|
99 |
+
new_word = []
|
100 |
+
i = 0
|
101 |
+
while i < len(word):
|
102 |
+
try:
|
103 |
+
j = word.index(first, i)
|
104 |
+
new_word.extend(word[i:j])
|
105 |
+
i = j
|
106 |
+
except:
|
107 |
+
new_word.extend(word[i:])
|
108 |
+
break
|
109 |
+
|
110 |
+
if word[i] == first and i < len(word)-1 and word[i+1] == second:
|
111 |
+
new_word.append(first+second)
|
112 |
+
i += 2
|
113 |
+
else:
|
114 |
+
new_word.append(word[i])
|
115 |
+
i += 1
|
116 |
+
new_word = tuple(new_word)
|
117 |
+
word = new_word
|
118 |
+
if len(word) == 1:
|
119 |
+
break
|
120 |
+
else:
|
121 |
+
pairs = get_pairs(word)
|
122 |
+
word = ' '.join(word)
|
123 |
+
self.cache[token] = word
|
124 |
+
return word
|
125 |
+
|
126 |
+
def encode(self, text):
|
127 |
+
bpe_tokens = []
|
128 |
+
text = whitespace_clean(basic_clean(text)).lower()
|
129 |
+
for token in re.findall(self.pat, text):
|
130 |
+
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
|
131 |
+
bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
|
132 |
+
return bpe_tokens
|
133 |
+
|
134 |
+
def decode(self, tokens):
|
135 |
+
text = ''.join([self.decoder[token] for token in tokens])
|
136 |
+
text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
|
137 |
+
return text
|
fourm/utils/data_constants.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024 EPFL and Apple Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
DEFAULT_CROP_PCT = 0.875
|
15 |
+
IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
|
16 |
+
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
|
17 |
+
IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5)
|
18 |
+
IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5)
|
19 |
+
IMAGENET_DPN_MEAN = (124 / 255, 117 / 255, 104 / 255)
|
20 |
+
IMAGENET_DPN_STD = tuple([1 / (.0167 * 255)] * 3)
|
21 |
+
|
22 |
+
IMAGENET_SURFACE_NORMAL_MEAN = (0.501, 0.405, 0.137)
|
23 |
+
IMAGENET_SURFACE_NORMAL_STD = (0.114, 0.165, 0.081)
|
24 |
+
|
25 |
+
SEG_IGNORE_INDEX = 255
|
26 |
+
SEG_IGNORE_INDEX_V2 = 0
|
27 |
+
PAD_MASK_VALUE = 254
|
28 |
+
COCO_SEMSEG_NUM_CLASSES = 133 + 1 # One extra class for no-class
|
29 |
+
ADE20K_SEMSEG_NUM_CLASSES = 150 + 1 # One extra class for no-class
|
30 |
+
HYPERSIM_SEMSEG_NUM_CLASSES = 41
|
31 |
+
|
32 |
+
|
33 |
+
IMAGE_TASKS = {'rgb', 'depth', 'semseg', 'semseg_hypersim', 'semseg_coco', 'semseg_ade20k', 'normal'}
|
34 |
+
DETECTION_TASKS = {'det'} # 'det_coco', 'det_lvis'
|
35 |
+
TEXT_TASKS = {'caption'}
|
36 |
+
VISION_TASKS = IMAGE_TASKS | DETECTION_TASKS
|
37 |
+
SEQUENCE_TASKS = DETECTION_TASKS | TEXT_TASKS
|
38 |
+
|
39 |
+
NYU_MEAN = 2070.7764
|
40 |
+
NYU_STD = 777.5723
|
fourm/utils/dist.py
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024 EPFL and Apple Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
# --------------------------------------------------------
|
15 |
+
# Based on DETR and MMCV code bases
|
16 |
+
# https://github.com/facebookresearch/detr
|
17 |
+
# https://github.com/open-mmlab/mmcv
|
18 |
+
# --------------------------------------------------------
|
19 |
+
|
20 |
+
import os
|
21 |
+
import pickle
|
22 |
+
import shutil
|
23 |
+
import sys
|
24 |
+
import tempfile
|
25 |
+
import datetime
|
26 |
+
|
27 |
+
import torch
|
28 |
+
import torch.distributed as dist
|
29 |
+
|
30 |
+
|
31 |
+
def setup_for_distributed(is_main):
|
32 |
+
"""
|
33 |
+
This function disables printing when not in main process
|
34 |
+
"""
|
35 |
+
import builtins as __builtin__
|
36 |
+
builtin_print = __builtin__.print
|
37 |
+
|
38 |
+
def print(*args, **kwargs):
|
39 |
+
force = kwargs.pop('force', False)
|
40 |
+
if is_main or force or kwargs.get('file', None) == sys.stderr:
|
41 |
+
builtin_print(*args, **kwargs)
|
42 |
+
|
43 |
+
__builtin__.print = print
|
44 |
+
|
45 |
+
|
46 |
+
def is_dist_avail_and_initialized():
|
47 |
+
if not dist.is_available():
|
48 |
+
return False
|
49 |
+
if not dist.is_initialized():
|
50 |
+
return False
|
51 |
+
return True
|
52 |
+
|
53 |
+
|
54 |
+
def get_world_size():
|
55 |
+
if not is_dist_avail_and_initialized():
|
56 |
+
return 1
|
57 |
+
return dist.get_world_size()
|
58 |
+
|
59 |
+
|
60 |
+
def get_rank():
|
61 |
+
if not is_dist_avail_and_initialized():
|
62 |
+
return 0
|
63 |
+
return dist.get_rank()
|
64 |
+
|
65 |
+
|
66 |
+
def is_main_process():
|
67 |
+
return get_rank() == 0
|
68 |
+
|
69 |
+
|
70 |
+
def save_on_main(*args, **kwargs):
|
71 |
+
if is_main_process():
|
72 |
+
torch.save(*args, **kwargs)
|
73 |
+
|
74 |
+
def save_on_all(*args, **kwargs):
|
75 |
+
torch.save(*args, **kwargs)
|
76 |
+
|
77 |
+
|
78 |
+
def init_distributed_mode(args):
|
79 |
+
if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
|
80 |
+
args.rank = int(os.environ["RANK"])
|
81 |
+
args.world_size = int(os.environ['WORLD_SIZE'])
|
82 |
+
args.gpu = int(os.environ['LOCAL_RANK'])
|
83 |
+
else:
|
84 |
+
print('Not using distributed mode')
|
85 |
+
args.distributed = False
|
86 |
+
return
|
87 |
+
|
88 |
+
args.distributed = True
|
89 |
+
|
90 |
+
torch.cuda.set_device(args.gpu)
|
91 |
+
args.dist_backend = 'nccl'
|
92 |
+
print('| distributed init (rank {}): {}, gpu {}'.format(
|
93 |
+
args.rank, args.dist_url, args.gpu), flush=True)
|
94 |
+
# Set timeout to 1h20 in case some long download of dataset has to happen
|
95 |
+
torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
|
96 |
+
world_size=args.world_size, rank=args.rank,
|
97 |
+
timeout=datetime.timedelta(4800))
|
98 |
+
torch.distributed.barrier()
|
99 |
+
if ("print_all" not in args) or (not args.print_all):
|
100 |
+
setup_for_distributed(args.rank == 0)
|
fourm/utils/fsdp_utils.py
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024 EPFL and Apple Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
import os
|
15 |
+
from pathlib import Path
|
16 |
+
|
17 |
+
import torch
|
18 |
+
|
19 |
+
from .dist import save_on_main, is_main_process
|
20 |
+
from .s3_utils import save_on_s3
|
21 |
+
|
22 |
+
|
23 |
+
from torch.distributed.fsdp import (
|
24 |
+
FullyShardedDataParallel as FSDP,
|
25 |
+
FullStateDictConfig,
|
26 |
+
StateDictType,
|
27 |
+
)
|
28 |
+
|
29 |
+
from torch.distributed.fsdp.api import FullOptimStateDictConfig
|
30 |
+
|
31 |
+
|
32 |
+
|
33 |
+
def save_model_fsdp(args, epoch, model, optimizer, model_ema=None, ckpt_name=None, use_s3=False):
|
34 |
+
output_dir = Path(args.output_dir)
|
35 |
+
epoch_name = str(epoch)
|
36 |
+
ckpt_name = ckpt_name or epoch_name
|
37 |
+
|
38 |
+
|
39 |
+
with FSDP.state_dict_type(model,
|
40 |
+
StateDictType.FULL_STATE_DICT,
|
41 |
+
FullStateDictConfig(offload_to_cpu=True, rank0_only=True),
|
42 |
+
FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=True),
|
43 |
+
):
|
44 |
+
|
45 |
+
model_state_dict = model.state_dict()
|
46 |
+
if optimizer is not None:
|
47 |
+
optimizer_state_dict = FSDP.optim_state_dict(model, optimizer)
|
48 |
+
else:
|
49 |
+
optimizer_state_dict = None
|
50 |
+
|
51 |
+
# Only create the save_dict on the main process, not needed or recommended to do so on all ranks
|
52 |
+
# This make save_on_main() redundant
|
53 |
+
if is_main_process():
|
54 |
+
checkpoint_path = os.path.join(output_dir, f'checkpoint-{ckpt_name}.pth')
|
55 |
+
|
56 |
+
to_save = {
|
57 |
+
'model': model_state_dict,
|
58 |
+
'epoch': epoch,
|
59 |
+
'args': args,
|
60 |
+
}
|
61 |
+
|
62 |
+
if optimizer is not None:
|
63 |
+
to_save['optimizer'] = optimizer_state_dict
|
64 |
+
|
65 |
+
if model_ema is not None:
|
66 |
+
print("Model EMA is currently not supported for FSDP")
|
67 |
+
# to_save['model_ema'] = get_state_dict(model_ema)
|
68 |
+
|
69 |
+
save_on_main(to_save, checkpoint_path)
|
70 |
+
|
71 |
+
if use_s3:
|
72 |
+
s3_path = os.path.join(args.s3_save_dir, f'checkpoint-{ckpt_name}.pth')
|
73 |
+
save_on_s3(checkpoint_path, s3_path, args.s3_endpoint)
|
74 |
+
|
75 |
+
|
76 |
+
def auto_load_model_fsdp(args, model, optimizer, model_ema=None):
|
77 |
+
output_dir = Path(args.output_dir)
|
78 |
+
if args.auto_resume and len(args.resume) == 0:
|
79 |
+
import glob
|
80 |
+
all_checkpoints = glob.glob(os.path.join(output_dir, 'checkpoint-*.pth'))
|
81 |
+
latest_ckpt = -1
|
82 |
+
for ckpt in all_checkpoints:
|
83 |
+
t = ckpt.split('-')[-1].split('.')[0]
|
84 |
+
if t.isdigit():
|
85 |
+
latest_ckpt = max(int(t), latest_ckpt)
|
86 |
+
if latest_ckpt >= 0:
|
87 |
+
args.resume = os.path.join(output_dir, 'checkpoint-%d.pth' % latest_ckpt)
|
88 |
+
print("Auto resume checkpoint: %s" % args.resume)
|
89 |
+
|
90 |
+
if args.resume:
|
91 |
+
if args.resume.startswith('https'):
|
92 |
+
checkpoint = torch.hub.load_state_dict_from_url(
|
93 |
+
args.resume, map_location='cpu')
|
94 |
+
else:
|
95 |
+
checkpoint = torch.load(args.resume, map_location='cpu')
|
96 |
+
|
97 |
+
with FSDP.state_dict_type(
|
98 |
+
model,
|
99 |
+
StateDictType.FULL_STATE_DICT,
|
100 |
+
FullStateDictConfig(rank0_only=False),
|
101 |
+
FullOptimStateDictConfig(rank0_only=False),
|
102 |
+
):
|
103 |
+
|
104 |
+
model.load_state_dict(checkpoint['model'])
|
105 |
+
print("Resume checkpoint %s" % args.resume)
|
106 |
+
|
107 |
+
|
108 |
+
if 'optimizer' in checkpoint and 'epoch' in checkpoint:
|
109 |
+
optimizer_state_dict = FSDP.optim_state_dict_to_load(checkpoint['optimizer'], model, optimizer)
|
110 |
+
optimizer.load_state_dict(optimizer_state_dict)
|
111 |
+
args.start_epoch = checkpoint['epoch'] + 1
|
112 |
+
|
113 |
+
print("With optim & sched!")
|
114 |
+
|
115 |
+
if hasattr(args, 'model_ema') and args.model_ema:
|
116 |
+
print("Model EMA is currently not supported for FSDP")
|
fourm/utils/generation.py
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024 EPFL and Apple Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
import numpy as np
|
15 |
+
import math
|
16 |
+
|
17 |
+
|
18 |
+
def sample_to_batch(mod_dict, device, domains):
|
19 |
+
mod_dict = {
|
20 |
+
modality: {k: v.unsqueeze(0).to(device, non_blocking=True) for k, v in d.items()}
|
21 |
+
for modality, d in mod_dict.items() if modality in domains
|
22 |
+
}
|
23 |
+
|
24 |
+
return mod_dict
|
25 |
+
|
26 |
+
|
27 |
+
def unbatch(tensor):
|
28 |
+
return tensor.detach().squeeze(0).cpu()
|
29 |
+
|
30 |
+
|
31 |
+
def batch_to_sample(mod_dict, domains):
|
32 |
+
mod_dict = {
|
33 |
+
modality: {k: unbatch(v) for k, v in d.items()}
|
34 |
+
for modality, d in mod_dict.items() if modality in domains
|
35 |
+
}
|
36 |
+
|
37 |
+
return mod_dict
|
38 |
+
|
39 |
+
|
40 |
+
def batch_to_device(mod_dict, device, domains):
|
41 |
+
mod_dict = {
|
42 |
+
modality: {k: v.to(device, non_blocking=True) for k, v in d.items()}
|
43 |
+
for modality, d in mod_dict.items() if modality in domains
|
44 |
+
}
|
45 |
+
|
46 |
+
return mod_dict
|
47 |
+
|
48 |
+
|
49 |
+
def cosine_schedule(num_steps, total_tokens):
|
50 |
+
iters = np.arange(num_steps)
|
51 |
+
base_value = 1
|
52 |
+
final_value = 0
|
53 |
+
schedule = np.array(
|
54 |
+
[final_value + 0.5 * (base_value - final_value) * (1 + math.cos(math.pi * i / (len(iters)))) for i in iters])
|
55 |
+
schedule_tokens = [round(total_tokens * i) for i in (schedule[:-1] - schedule[1:])]
|
56 |
+
schedule_tokens.append(total_tokens - sum(schedule_tokens))
|
57 |
+
return np.array(schedule_tokens)
|
58 |
+
|
59 |
+
|
60 |
+
def linear_schedule(num_steps, total_tokens):
|
61 |
+
schedule = np.linspace(0, total_tokens, num_steps + 1, dtype=int)
|
62 |
+
schedule_tokens = np.diff(schedule)[::-1]
|
63 |
+
schedule_tokens.sort() # Sorts the array in ascending order.
|
64 |
+
schedule_tokens = schedule_tokens[::-1] # Reverses the array to descending order.
|
65 |
+
return np.trim_zeros(schedule_tokens, 'b') # Trims trailing zeros.
|
66 |
+
|
67 |
+
|
68 |
+
def continue_schedule(schedule, num_current_tokens):
|
69 |
+
schedule_cumsum = np.cumsum(schedule)
|
70 |
+
keep_mask = schedule_cumsum > num_current_tokens
|
71 |
+
diff = schedule_cumsum[keep_mask][0] - num_current_tokens
|
72 |
+
new_schedule = schedule[keep_mask]
|
73 |
+
new_schedule[0] = diff
|
74 |
+
return new_schedule
|
75 |
+
|
76 |
+
|
77 |
+
def decreasing_temp_schedule(max, min, token_schedule):
|
78 |
+
schedule_cumsum = np.cumsum(token_schedule) / np.sum(token_schedule)
|
79 |
+
temp_schedule = np.array([min + (max - min) * (1 - s) for s in schedule_cumsum])
|
80 |
+
return temp_schedule
|
81 |
+
|
82 |
+
|
83 |
+
def onex_temp_schedule(max_t, min_t, token_schedule, power=0.5, min_linspace=1, max_linspace=100):
|
84 |
+
"""Abitrary temperature schedule for one over x"""
|
85 |
+
x = np.linspace(min_linspace, max_linspace, num=sum(token_schedule))
|
86 |
+
y = 1/(x**power)
|
87 |
+
y = y - min(y)
|
88 |
+
y = y / max(y)
|
89 |
+
unscaled_schedule = y
|
90 |
+
schedule_cumsum = np.cumsum(token_schedule) / np.sum(token_schedule)
|
91 |
+
unscaled_schedule = [(1 - cs) * us for us, cs in zip(unscaled_schedule, schedule_cumsum)]
|
92 |
+
|
93 |
+
temp_schedule = np.array([min_t + (max_t - min_t) * s for s in unscaled_schedule]).clip(min=1e-9)
|
94 |
+
return temp_schedule
|
95 |
+
|
96 |
+
|
97 |
+
def linear_temp_schedule(temp, token_schedule):
|
98 |
+
""" Temperature that decays the temperature inversely proportional to the token schedule. """
|
99 |
+
return np.concatenate([np.array([temp * 1.0]), (temp * (token_schedule.sum() - token_schedule.cumsum()) / token_schedule.sum())[:-1]]).clip(min=1e-9)
|
fourm/utils/generation_datasets/PartiPrompts.tsv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
fourm/utils/generation_datasets/__init__.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
from .parti_prompts_dataset import *
|
2 |
+
from .empty_dataset import *
|
3 |
+
from .image_caption_dataset import *
|
fourm/utils/generation_datasets/empty_dataset.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024 EPFL and Apple Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
import torch
|
15 |
+
from torch.utils.data import Dataset
|
16 |
+
|
17 |
+
class EmptyDataset(Dataset):
|
18 |
+
"""Empty dataset"""
|
19 |
+
|
20 |
+
def __init__(self, dataset_size: int):
|
21 |
+
self.dataset_size = dataset_size
|
22 |
+
|
23 |
+
def __getitem__(self, index):
|
24 |
+
return {}
|
25 |
+
|
26 |
+
def __len__(self):
|
27 |
+
return self.dataset_size
|
fourm/utils/generation_datasets/image_caption_dataset.py
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024 EPFL and Apple Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
import os
|
15 |
+
from torch.utils.data import Dataset
|
16 |
+
from typing import Any, Callable, Dict, List, Optional, Tuple, cast
|
17 |
+
|
18 |
+
from fourm.data.multimodal_dataset_folder import make_dataset, UNIFIED_EXTENSIONS
|
19 |
+
from fourm.data.modality_transforms import get_transform_key, RGBTransform, CaptionTransform, UnifiedDataTransform
|
20 |
+
|
21 |
+
|
22 |
+
class ImageCaptionDataset(Dataset):
|
23 |
+
"""
|
24 |
+
Similar to MultiModalDatasetFolder, but specialized for image-caption datasets.
|
25 |
+
"""
|
26 |
+
def __init__(self,
|
27 |
+
root: str,
|
28 |
+
augmenter: Optional[Callable] = None,
|
29 |
+
modality_paths: Dict[str, str] = None,
|
30 |
+
is_valid_file: Optional[Callable[[str], bool]] = None,
|
31 |
+
cache=False):
|
32 |
+
self.root = root
|
33 |
+
self.modality_paths = modality_paths or {}
|
34 |
+
|
35 |
+
self.modality_transforms = {
|
36 |
+
'rgb': RGBTransform(imagenet_default_mean_and_std=False),
|
37 |
+
'caption': CaptionTransform()
|
38 |
+
}
|
39 |
+
|
40 |
+
self.transform = UnifiedDataTransform(transforms_dict=self.modality_transforms, image_augmenter=augmenter)
|
41 |
+
|
42 |
+
classes, class_to_idx = self._find_classes(os.path.join(self.root, self.modality_paths.get('caption', 'caption')))
|
43 |
+
extensions = UNIFIED_EXTENSIONS if is_valid_file is None else None
|
44 |
+
|
45 |
+
samples = {
|
46 |
+
mod: make_dataset(
|
47 |
+
os.path.join(self.root, self.modality_paths.get(mod, mod)),
|
48 |
+
class_to_idx,
|
49 |
+
extensions,
|
50 |
+
is_valid_file,
|
51 |
+
cache_path=os.path.join(self.root, 'dataloader_cache', f'{self.modality_paths.get(mod, mod)}.pkl') if cache else None)
|
52 |
+
for mod in ['caption', 'rgb']
|
53 |
+
}
|
54 |
+
|
55 |
+
for mod, mod_samples in samples.items():
|
56 |
+
if len(mod_samples) == 0:
|
57 |
+
msg = "Found 0 logs in subfolders of: {}\n".format(os.path.join(self.root, self.modality_paths.get(mod, mod)))
|
58 |
+
if extensions is not None:
|
59 |
+
msg += "Supported extensions are: {}".format(",".join(extensions))
|
60 |
+
raise RuntimeError(msg)
|
61 |
+
|
62 |
+
self.extensions = extensions
|
63 |
+
self.classes = classes
|
64 |
+
self.class_to_idx = class_to_idx
|
65 |
+
self.samples = samples
|
66 |
+
|
67 |
+
def _find_classes(self, dir: str) -> Tuple[List[str], Dict[str, int]]:
|
68 |
+
"""
|
69 |
+
Finds the class folders in a dataset.
|
70 |
+
|
71 |
+
Args:
|
72 |
+
dir (string): Root directory path.
|
73 |
+
|
74 |
+
Returns:
|
75 |
+
tuple: (classes, class_to_idx) where classes are relative to (dir), and class_to_idx is a dictionary.
|
76 |
+
|
77 |
+
Ensures:
|
78 |
+
No class is a subdirectory of another.
|
79 |
+
"""
|
80 |
+
classes = [d.name for d in os.scandir(dir) if d.is_dir()]
|
81 |
+
classes.sort()
|
82 |
+
class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
|
83 |
+
return classes, class_to_idx
|
84 |
+
|
85 |
+
def __getitem__(self, index):
|
86 |
+
|
87 |
+
sample_dict = {}
|
88 |
+
for mod in ['caption', 'rgb']:
|
89 |
+
path, _ = self.samples[mod][index]
|
90 |
+
sample = self.modality_transforms[get_transform_key(mod)].load(path)
|
91 |
+
sample_dict[mod] = sample
|
92 |
+
|
93 |
+
if self.transform is not None:
|
94 |
+
sample_dict = self.transform(sample_dict)
|
95 |
+
|
96 |
+
return sample_dict
|
97 |
+
|
98 |
+
def __len__(self) -> int:
|
99 |
+
return len(list(self.samples.values())[0])
|
fourm/utils/generation_datasets/parti_prompts_dataset.py
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024 EPFL and Apple Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
import torch
|
15 |
+
import pandas as pd
|
16 |
+
import numpy as np
|
17 |
+
from torch.utils.data import Dataset
|
18 |
+
|
19 |
+
|
20 |
+
class PartiPromptsDataset(Dataset):
|
21 |
+
"""
|
22 |
+
Parti Prompts caption dataset.
|
23 |
+
|
24 |
+
Args:
|
25 |
+
text_tokenizer (tokenizers.Tokenizer): The tokenizer to use for encoding the captions.
|
26 |
+
max_length (int): The maximum sequence length of the captions.
|
27 |
+
parti_prompts_csv (str): The path to the Parti Prompts dataset.
|
28 |
+
"""
|
29 |
+
def __init__(self, text_tokenizer, max_length=128, parti_prompts_csv='fourm/utils/generation_datasets/PartiPrompts.tsv', parti_prompts_t5_embs=None, llm_embedder=None):
|
30 |
+
self.text_tokenizer = text_tokenizer
|
31 |
+
self.max_length = max_length
|
32 |
+
self.parti_prompts = pd.read_csv(parti_prompts_csv, sep='\t')
|
33 |
+
|
34 |
+
self.pad_id = text_tokenizer.token_to_id("[PAD]")
|
35 |
+
self.eos_id = text_tokenizer.token_to_id("[EOS]")
|
36 |
+
if parti_prompts_t5_embs is not None:
|
37 |
+
# T5 Embeddings are saved as a numpy array, so we need to load it
|
38 |
+
self.t5_embs = np.load(parti_prompts_t5_embs)['emb']
|
39 |
+
self.t5_masks = np.load(parti_prompts_t5_embs)['mask_valid']
|
40 |
+
self.llm_embedder = None
|
41 |
+
elif llm_embedder is not None:
|
42 |
+
self.t5_embs = None
|
43 |
+
self.llm_embedder = llm_embedder
|
44 |
+
else:
|
45 |
+
self.t5_embs = None
|
46 |
+
self.llm_embedder = None
|
47 |
+
|
48 |
+
def __getitem__(self, index):
|
49 |
+
text = self.parti_prompts.Prompt[index]
|
50 |
+
seq_ids = self.text_tokenizer.encode(text).ids + [self.eos_id]
|
51 |
+
|
52 |
+
tensor = torch.ones(self.max_length, dtype=torch.int) * self.pad_id
|
53 |
+
tensor[:len(seq_ids)] = torch.tensor(seq_ids, dtype=torch.int)
|
54 |
+
|
55 |
+
out = {}
|
56 |
+
out['caption'] = {'tensor': tensor}
|
57 |
+
|
58 |
+
if self.t5_embs is not None:
|
59 |
+
|
60 |
+
t5_emb = torch.tensor(self.t5_embs[index], dtype=torch.float32)
|
61 |
+
t5_emb = pad_or_truncate(t5_emb, self.max_length)
|
62 |
+
|
63 |
+
t5_mask = torch.tensor(self.t5_masks[index], dtype=torch.bool)
|
64 |
+
t5_mask = pad_or_truncate(t5_mask, self.max_length)
|
65 |
+
|
66 |
+
ascii_tensor = text_to_tensor(text, max_length=self.max_length * 10) # Save ASCII as tensor
|
67 |
+
|
68 |
+
out['t5_caption'] = {
|
69 |
+
'tensor': t5_emb,
|
70 |
+
'mask_valid': t5_mask,
|
71 |
+
'ascii_tensor': ascii_tensor,
|
72 |
+
}
|
73 |
+
elif self.llm_embedder is not None:
|
74 |
+
t5_emb, _, t5_mask = self.llm_embedder.get_text_embeddings([text])
|
75 |
+
t5_emb = pad_or_truncate(t5_emb.squeeze(0), self.max_length)
|
76 |
+
t5_mask = pad_or_truncate(t5_mask.bool().squeeze(0), self.max_length)
|
77 |
+
ascii_tensor = text_to_tensor(text, max_length=self.max_length * 10) # Save ASCII as tensor
|
78 |
+
|
79 |
+
out['t5_caption'] = {
|
80 |
+
'tensor': t5_emb,
|
81 |
+
'mask_valid': t5_mask,
|
82 |
+
'ascii_tensor': ascii_tensor,
|
83 |
+
}
|
84 |
+
|
85 |
+
return out
|
86 |
+
|
87 |
+
def __len__(self):
|
88 |
+
return len(self.parti_prompts)
|
89 |
+
|
90 |
+
|
91 |
+
def pad_or_truncate(tensor, fixed_length, padding_value=0):
|
92 |
+
current_length = tensor.shape[0]
|
93 |
+
|
94 |
+
if current_length < fixed_length:
|
95 |
+
# Calculate padding sizes for all dimensions, but only pad along dim=0
|
96 |
+
padding_sizes = [0] * 2 * len(tensor.shape)
|
97 |
+
padding_sizes[1] = fixed_length - current_length
|
98 |
+
return torch.nn.functional.pad(tensor, padding_sizes, 'constant', padding_value)
|
99 |
+
else:
|
100 |
+
return tensor[:fixed_length]
|
101 |
+
|
102 |
+
def text_to_tensor(text, max_length=None):
|
103 |
+
"""Converts plaintext to a tensor with optional padding."""
|
104 |
+
ascii_values = [ord(c) for c in text]
|
105 |
+
if max_length:
|
106 |
+
while len(ascii_values) < max_length:
|
107 |
+
ascii_values.append(0) # Using 0 as the padding value
|
108 |
+
return torch.tensor(ascii_values, dtype=torch.int)
|
109 |
+
|
110 |
+
|
111 |
+
def tensor_to_text(tensor):
|
112 |
+
"""Converts tensor back to plaintext. Assumes padding with zeros."""
|
113 |
+
ascii_values = tensor.tolist()
|
114 |
+
return ''.join(chr(val) for val in ascii_values if val != 0)
|
fourm/utils/hmr2_utils/hmr2/__init__.py
ADDED
File without changes
|
fourm/utils/hmr2_utils/hmr2/models/__init__.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
from .smpl_wrapper import SMPL
|
2 |
+
from .hmr2 import HMR2
|
fourm/utils/hmr2_utils/hmr2/models/backbones/__init__.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# --------------------------------------------------------
|
2 |
+
# Based on the ViTPose and 4DHumans code bases
|
3 |
+
# https://github.com/ViTAE-Transformer/ViTPose/
|
4 |
+
# https://github.com/shubham-goel/4D-Humans
|
5 |
+
# --------------------------------------------------------
|
6 |
+
|
7 |
+
from .vit import vit
|
8 |
+
|
9 |
+
def create_backbone(cfg):
|
10 |
+
if cfg.MODEL.BACKBONE.TYPE == 'vit':
|
11 |
+
return vit(cfg)
|
12 |
+
else:
|
13 |
+
raise NotImplementedError('Backbone type is not implemented')
|
fourm/utils/hmr2_utils/hmr2/models/backbones/vit.py
ADDED
@@ -0,0 +1,353 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# --------------------------------------------------------
|
2 |
+
# Based on the ViTPose and 4DHumans code bases
|
3 |
+
# https://github.com/ViTAE-Transformer/ViTPose/
|
4 |
+
# https://github.com/shubham-goel/4D-Humans
|
5 |
+
# --------------------------------------------------------
|
6 |
+
|
7 |
+
import math
|
8 |
+
|
9 |
+
import torch
|
10 |
+
from functools import partial
|
11 |
+
import torch.nn as nn
|
12 |
+
import torch.nn.functional as F
|
13 |
+
import torch.utils.checkpoint as checkpoint
|
14 |
+
|
15 |
+
from timm.models.layers import drop_path, to_2tuple, trunc_normal_
|
16 |
+
|
17 |
+
def vit(cfg):
|
18 |
+
return ViT(
|
19 |
+
img_size=(256, 192),
|
20 |
+
patch_size=16,
|
21 |
+
embed_dim=1280,
|
22 |
+
depth=32,
|
23 |
+
num_heads=16,
|
24 |
+
ratio=1,
|
25 |
+
use_checkpoint=False,
|
26 |
+
mlp_ratio=4,
|
27 |
+
qkv_bias=True,
|
28 |
+
drop_path_rate=0.55,
|
29 |
+
)
|
30 |
+
|
31 |
+
def get_abs_pos(abs_pos, h, w, ori_h, ori_w, has_cls_token=True):
|
32 |
+
"""
|
33 |
+
Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token
|
34 |
+
dimension for the original embeddings.
|
35 |
+
Args:
|
36 |
+
abs_pos (Tensor): absolute positional embeddings with (1, num_position, C).
|
37 |
+
has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token.
|
38 |
+
hw (Tuple): size of input image tokens.
|
39 |
+
|
40 |
+
Returns:
|
41 |
+
Absolute positional embeddings after processing with shape (1, H, W, C)
|
42 |
+
"""
|
43 |
+
cls_token = None
|
44 |
+
B, L, C = abs_pos.shape
|
45 |
+
if has_cls_token:
|
46 |
+
cls_token = abs_pos[:, 0:1]
|
47 |
+
abs_pos = abs_pos[:, 1:]
|
48 |
+
|
49 |
+
if ori_h != h or ori_w != w:
|
50 |
+
new_abs_pos = F.interpolate(
|
51 |
+
abs_pos.reshape(1, ori_h, ori_w, -1).permute(0, 3, 1, 2),
|
52 |
+
size=(h, w),
|
53 |
+
mode="bicubic",
|
54 |
+
align_corners=False,
|
55 |
+
).permute(0, 2, 3, 1).reshape(B, -1, C)
|
56 |
+
|
57 |
+
else:
|
58 |
+
new_abs_pos = abs_pos
|
59 |
+
|
60 |
+
if cls_token is not None:
|
61 |
+
new_abs_pos = torch.cat([cls_token, new_abs_pos], dim=1)
|
62 |
+
return new_abs_pos
|
63 |
+
|
64 |
+
class DropPath(nn.Module):
|
65 |
+
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
66 |
+
"""
|
67 |
+
def __init__(self, drop_prob=None):
|
68 |
+
super(DropPath, self).__init__()
|
69 |
+
self.drop_prob = drop_prob
|
70 |
+
|
71 |
+
def forward(self, x):
|
72 |
+
return drop_path(x, self.drop_prob, self.training)
|
73 |
+
|
74 |
+
def extra_repr(self):
|
75 |
+
return 'p={}'.format(self.drop_prob)
|
76 |
+
|
77 |
+
class Mlp(nn.Module):
|
78 |
+
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
|
79 |
+
super().__init__()
|
80 |
+
out_features = out_features or in_features
|
81 |
+
hidden_features = hidden_features or in_features
|
82 |
+
self.fc1 = nn.Linear(in_features, hidden_features)
|
83 |
+
self.act = act_layer()
|
84 |
+
self.fc2 = nn.Linear(hidden_features, out_features)
|
85 |
+
self.drop = nn.Dropout(drop)
|
86 |
+
|
87 |
+
def forward(self, x):
|
88 |
+
x = self.fc1(x)
|
89 |
+
x = self.act(x)
|
90 |
+
x = self.fc2(x)
|
91 |
+
x = self.drop(x)
|
92 |
+
return x
|
93 |
+
|
94 |
+
class Attention(nn.Module):
|
95 |
+
def __init__(
|
96 |
+
self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
|
97 |
+
proj_drop=0., attn_head_dim=None,):
|
98 |
+
super().__init__()
|
99 |
+
self.num_heads = num_heads
|
100 |
+
head_dim = dim // num_heads
|
101 |
+
self.dim = dim
|
102 |
+
|
103 |
+
if attn_head_dim is not None:
|
104 |
+
head_dim = attn_head_dim
|
105 |
+
all_head_dim = head_dim * self.num_heads
|
106 |
+
|
107 |
+
self.scale = qk_scale or head_dim ** -0.5
|
108 |
+
|
109 |
+
self.qkv = nn.Linear(dim, all_head_dim * 3, bias=qkv_bias)
|
110 |
+
|
111 |
+
self.attn_drop = nn.Dropout(attn_drop)
|
112 |
+
self.proj = nn.Linear(all_head_dim, dim)
|
113 |
+
self.proj_drop = nn.Dropout(proj_drop)
|
114 |
+
|
115 |
+
def forward(self, x):
|
116 |
+
B, N, C = x.shape
|
117 |
+
qkv = self.qkv(x)
|
118 |
+
qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
|
119 |
+
q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
|
120 |
+
|
121 |
+
q = q * self.scale
|
122 |
+
attn = (q @ k.transpose(-2, -1))
|
123 |
+
|
124 |
+
attn = attn.softmax(dim=-1)
|
125 |
+
attn = self.attn_drop(attn)
|
126 |
+
|
127 |
+
x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
|
128 |
+
x = self.proj(x)
|
129 |
+
x = self.proj_drop(x)
|
130 |
+
|
131 |
+
return x
|
132 |
+
|
133 |
+
class Block(nn.Module):
|
134 |
+
|
135 |
+
def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None,
|
136 |
+
drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU,
|
137 |
+
norm_layer=nn.LayerNorm, attn_head_dim=None
|
138 |
+
):
|
139 |
+
super().__init__()
|
140 |
+
|
141 |
+
self.norm1 = norm_layer(dim)
|
142 |
+
self.attn = Attention(
|
143 |
+
dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
|
144 |
+
attn_drop=attn_drop, proj_drop=drop, attn_head_dim=attn_head_dim
|
145 |
+
)
|
146 |
+
|
147 |
+
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
|
148 |
+
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
|
149 |
+
self.norm2 = norm_layer(dim)
|
150 |
+
mlp_hidden_dim = int(dim * mlp_ratio)
|
151 |
+
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
|
152 |
+
|
153 |
+
def forward(self, x):
|
154 |
+
x = x + self.drop_path(self.attn(self.norm1(x)))
|
155 |
+
x = x + self.drop_path(self.mlp(self.norm2(x)))
|
156 |
+
return x
|
157 |
+
|
158 |
+
|
159 |
+
class PatchEmbed(nn.Module):
|
160 |
+
""" Image to Patch Embedding
|
161 |
+
"""
|
162 |
+
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, ratio=1):
|
163 |
+
super().__init__()
|
164 |
+
img_size = to_2tuple(img_size)
|
165 |
+
patch_size = to_2tuple(patch_size)
|
166 |
+
num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) * (ratio ** 2)
|
167 |
+
self.patch_shape = (int(img_size[0] // patch_size[0] * ratio), int(img_size[1] // patch_size[1] * ratio))
|
168 |
+
self.origin_patch_shape = (int(img_size[0] // patch_size[0]), int(img_size[1] // patch_size[1]))
|
169 |
+
self.img_size = img_size
|
170 |
+
self.patch_size = patch_size
|
171 |
+
self.num_patches = num_patches
|
172 |
+
|
173 |
+
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=(patch_size[0] // ratio), padding=4 + 2 * (ratio//2-1))
|
174 |
+
|
175 |
+
def forward(self, x, **kwargs):
|
176 |
+
B, C, H, W = x.shape
|
177 |
+
x = self.proj(x)
|
178 |
+
Hp, Wp = x.shape[2], x.shape[3]
|
179 |
+
|
180 |
+
x = x.flatten(2).transpose(1, 2)
|
181 |
+
return x, (Hp, Wp)
|
182 |
+
|
183 |
+
|
184 |
+
class HybridEmbed(nn.Module):
|
185 |
+
""" CNN Feature Map Embedding
|
186 |
+
Extract feature map from CNN, flatten, project to embedding dim.
|
187 |
+
"""
|
188 |
+
def __init__(self, backbone, img_size=224, feature_size=None, in_chans=3, embed_dim=768):
|
189 |
+
super().__init__()
|
190 |
+
assert isinstance(backbone, nn.Module)
|
191 |
+
img_size = to_2tuple(img_size)
|
192 |
+
self.img_size = img_size
|
193 |
+
self.backbone = backbone
|
194 |
+
if feature_size is None:
|
195 |
+
with torch.no_grad():
|
196 |
+
training = backbone.training
|
197 |
+
if training:
|
198 |
+
backbone.eval()
|
199 |
+
o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1]
|
200 |
+
feature_size = o.shape[-2:]
|
201 |
+
feature_dim = o.shape[1]
|
202 |
+
backbone.train(training)
|
203 |
+
else:
|
204 |
+
feature_size = to_2tuple(feature_size)
|
205 |
+
feature_dim = self.backbone.feature_info.channels()[-1]
|
206 |
+
self.num_patches = feature_size[0] * feature_size[1]
|
207 |
+
self.proj = nn.Linear(feature_dim, embed_dim)
|
208 |
+
|
209 |
+
def forward(self, x):
|
210 |
+
x = self.backbone(x)[-1]
|
211 |
+
x = x.flatten(2).transpose(1, 2)
|
212 |
+
x = self.proj(x)
|
213 |
+
return x
|
214 |
+
|
215 |
+
|
216 |
+
class ViT(nn.Module):
|
217 |
+
|
218 |
+
def __init__(self,
|
219 |
+
img_size=224, patch_size=16, in_chans=3, num_classes=80, embed_dim=768, depth=12,
|
220 |
+
num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
|
221 |
+
drop_path_rate=0., hybrid_backbone=None, norm_layer=None, use_checkpoint=False,
|
222 |
+
frozen_stages=-1, ratio=1, last_norm=True,
|
223 |
+
patch_padding='pad', freeze_attn=False, freeze_ffn=False,
|
224 |
+
):
|
225 |
+
# Protect mutable default arguments
|
226 |
+
super(ViT, self).__init__()
|
227 |
+
norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
|
228 |
+
self.num_classes = num_classes
|
229 |
+
self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
|
230 |
+
self.frozen_stages = frozen_stages
|
231 |
+
self.use_checkpoint = use_checkpoint
|
232 |
+
self.patch_padding = patch_padding
|
233 |
+
self.freeze_attn = freeze_attn
|
234 |
+
self.freeze_ffn = freeze_ffn
|
235 |
+
self.depth = depth
|
236 |
+
|
237 |
+
if hybrid_backbone is not None:
|
238 |
+
self.patch_embed = HybridEmbed(
|
239 |
+
hybrid_backbone, img_size=img_size, in_chans=in_chans, embed_dim=embed_dim)
|
240 |
+
else:
|
241 |
+
self.patch_embed = PatchEmbed(
|
242 |
+
img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, ratio=ratio)
|
243 |
+
num_patches = self.patch_embed.num_patches
|
244 |
+
|
245 |
+
# since the pretraining model has class token
|
246 |
+
self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
|
247 |
+
|
248 |
+
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
|
249 |
+
|
250 |
+
self.blocks = nn.ModuleList([
|
251 |
+
Block(
|
252 |
+
dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
|
253 |
+
drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
|
254 |
+
)
|
255 |
+
for i in range(depth)])
|
256 |
+
|
257 |
+
self.last_norm = norm_layer(embed_dim) if last_norm else nn.Identity()
|
258 |
+
|
259 |
+
if self.pos_embed is not None:
|
260 |
+
trunc_normal_(self.pos_embed, std=.02)
|
261 |
+
|
262 |
+
self._freeze_stages()
|
263 |
+
|
264 |
+
def _freeze_stages(self):
|
265 |
+
"""Freeze parameters."""
|
266 |
+
if self.frozen_stages >= 0:
|
267 |
+
self.patch_embed.eval()
|
268 |
+
for param in self.patch_embed.parameters():
|
269 |
+
param.requires_grad = False
|
270 |
+
|
271 |
+
for i in range(1, self.frozen_stages + 1):
|
272 |
+
m = self.blocks[i]
|
273 |
+
m.eval()
|
274 |
+
for param in m.parameters():
|
275 |
+
param.requires_grad = False
|
276 |
+
|
277 |
+
if self.freeze_attn:
|
278 |
+
for i in range(0, self.depth):
|
279 |
+
m = self.blocks[i]
|
280 |
+
m.attn.eval()
|
281 |
+
m.norm1.eval()
|
282 |
+
for param in m.attn.parameters():
|
283 |
+
param.requires_grad = False
|
284 |
+
for param in m.norm1.parameters():
|
285 |
+
param.requires_grad = False
|
286 |
+
|
287 |
+
if self.freeze_ffn:
|
288 |
+
self.pos_embed.requires_grad = False
|
289 |
+
self.patch_embed.eval()
|
290 |
+
for param in self.patch_embed.parameters():
|
291 |
+
param.requires_grad = False
|
292 |
+
for i in range(0, self.depth):
|
293 |
+
m = self.blocks[i]
|
294 |
+
m.mlp.eval()
|
295 |
+
m.norm2.eval()
|
296 |
+
for param in m.mlp.parameters():
|
297 |
+
param.requires_grad = False
|
298 |
+
for param in m.norm2.parameters():
|
299 |
+
param.requires_grad = False
|
300 |
+
|
301 |
+
def init_weights(self):
|
302 |
+
"""Initialize the weights in backbone.
|
303 |
+
Args:
|
304 |
+
pretrained (str, optional): Path to pre-trained weights.
|
305 |
+
Defaults to None.
|
306 |
+
"""
|
307 |
+
def _init_weights(m):
|
308 |
+
if isinstance(m, nn.Linear):
|
309 |
+
trunc_normal_(m.weight, std=.02)
|
310 |
+
if isinstance(m, nn.Linear) and m.bias is not None:
|
311 |
+
nn.init.constant_(m.bias, 0)
|
312 |
+
elif isinstance(m, nn.LayerNorm):
|
313 |
+
nn.init.constant_(m.bias, 0)
|
314 |
+
nn.init.constant_(m.weight, 1.0)
|
315 |
+
|
316 |
+
self.apply(_init_weights)
|
317 |
+
|
318 |
+
def get_num_layers(self):
|
319 |
+
return len(self.blocks)
|
320 |
+
|
321 |
+
@torch.jit.ignore
|
322 |
+
def no_weight_decay(self):
|
323 |
+
return {'pos_embed', 'cls_token'}
|
324 |
+
|
325 |
+
def forward_features(self, x):
|
326 |
+
B, C, H, W = x.shape
|
327 |
+
x, (Hp, Wp) = self.patch_embed(x)
|
328 |
+
|
329 |
+
if self.pos_embed is not None:
|
330 |
+
# fit for multiple GPU training
|
331 |
+
# since the first element for pos embed (sin-cos manner) is zero, it will cause no difference
|
332 |
+
x = x + self.pos_embed[:, 1:] + self.pos_embed[:, :1]
|
333 |
+
|
334 |
+
for blk in self.blocks:
|
335 |
+
if self.use_checkpoint:
|
336 |
+
x = checkpoint.checkpoint(blk, x)
|
337 |
+
else:
|
338 |
+
x = blk(x)
|
339 |
+
|
340 |
+
x = self.last_norm(x)
|
341 |
+
|
342 |
+
xp = x.permute(0, 2, 1).reshape(B, -1, Hp, Wp).contiguous()
|
343 |
+
|
344 |
+
return xp
|
345 |
+
|
346 |
+
def forward(self, x):
|
347 |
+
x = self.forward_features(x)
|
348 |
+
return x
|
349 |
+
|
350 |
+
def train(self, mode=True):
|
351 |
+
"""Convert the model into training mode."""
|
352 |
+
super().train(mode)
|
353 |
+
self._freeze_stages()
|
fourm/utils/hmr2_utils/hmr2/models/components/__init__.py
ADDED
File without changes
|
fourm/utils/hmr2_utils/hmr2/models/components/pose_transformer.py
ADDED
@@ -0,0 +1,363 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# --------------------------------------------------------
|
2 |
+
# Based on the 4DHumans code base
|
3 |
+
# https://github.com/shubham-goel/4D-Humans
|
4 |
+
# --------------------------------------------------------
|
5 |
+
|
6 |
+
from inspect import isfunction
|
7 |
+
from typing import Callable, Optional
|
8 |
+
|
9 |
+
import torch
|
10 |
+
from einops import rearrange
|
11 |
+
from einops.layers.torch import Rearrange
|
12 |
+
from torch import nn
|
13 |
+
|
14 |
+
from .t_cond_mlp import (
|
15 |
+
AdaptiveLayerNorm1D,
|
16 |
+
FrequencyEmbedder,
|
17 |
+
normalization_layer,
|
18 |
+
)
|
19 |
+
# from .vit import Attention, FeedForward
|
20 |
+
|
21 |
+
|
22 |
+
def exists(val):
|
23 |
+
return val is not None
|
24 |
+
|
25 |
+
|
26 |
+
def default(val, d):
|
27 |
+
if exists(val):
|
28 |
+
return val
|
29 |
+
return d() if isfunction(d) else d
|
30 |
+
|
31 |
+
|
32 |
+
class PreNorm(nn.Module):
|
33 |
+
def __init__(self, dim: int, fn: Callable, norm: str = "layer", norm_cond_dim: int = -1):
|
34 |
+
super().__init__()
|
35 |
+
self.norm = normalization_layer(norm, dim, norm_cond_dim)
|
36 |
+
self.fn = fn
|
37 |
+
|
38 |
+
def forward(self, x: torch.Tensor, *args, **kwargs):
|
39 |
+
if isinstance(self.norm, AdaptiveLayerNorm1D):
|
40 |
+
return self.fn(self.norm(x, *args), **kwargs)
|
41 |
+
else:
|
42 |
+
return self.fn(self.norm(x), **kwargs)
|
43 |
+
|
44 |
+
|
45 |
+
class FeedForward(nn.Module):
|
46 |
+
def __init__(self, dim, hidden_dim, dropout=0.0):
|
47 |
+
super().__init__()
|
48 |
+
self.net = nn.Sequential(
|
49 |
+
nn.Linear(dim, hidden_dim),
|
50 |
+
nn.GELU(),
|
51 |
+
nn.Dropout(dropout),
|
52 |
+
nn.Linear(hidden_dim, dim),
|
53 |
+
nn.Dropout(dropout),
|
54 |
+
)
|
55 |
+
|
56 |
+
def forward(self, x):
|
57 |
+
return self.net(x)
|
58 |
+
|
59 |
+
|
60 |
+
class Attention(nn.Module):
|
61 |
+
def __init__(self, dim, heads=8, dim_head=64, dropout=0.0):
|
62 |
+
super().__init__()
|
63 |
+
inner_dim = dim_head * heads
|
64 |
+
project_out = not (heads == 1 and dim_head == dim)
|
65 |
+
|
66 |
+
self.heads = heads
|
67 |
+
self.scale = dim_head**-0.5
|
68 |
+
|
69 |
+
self.attend = nn.Softmax(dim=-1)
|
70 |
+
self.dropout = nn.Dropout(dropout)
|
71 |
+
|
72 |
+
self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
|
73 |
+
|
74 |
+
self.to_out = (
|
75 |
+
nn.Sequential(nn.Linear(inner_dim, dim), nn.Dropout(dropout))
|
76 |
+
if project_out
|
77 |
+
else nn.Identity()
|
78 |
+
)
|
79 |
+
|
80 |
+
def forward(self, x):
|
81 |
+
qkv = self.to_qkv(x).chunk(3, dim=-1)
|
82 |
+
q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=self.heads), qkv)
|
83 |
+
|
84 |
+
dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale
|
85 |
+
|
86 |
+
attn = self.attend(dots)
|
87 |
+
attn = self.dropout(attn)
|
88 |
+
|
89 |
+
out = torch.matmul(attn, v)
|
90 |
+
out = rearrange(out, "b h n d -> b n (h d)")
|
91 |
+
return self.to_out(out)
|
92 |
+
|
93 |
+
|
94 |
+
class CrossAttention(nn.Module):
|
95 |
+
def __init__(self, dim, context_dim=None, heads=8, dim_head=64, dropout=0.0):
|
96 |
+
super().__init__()
|
97 |
+
inner_dim = dim_head * heads
|
98 |
+
project_out = not (heads == 1 and dim_head == dim)
|
99 |
+
|
100 |
+
self.heads = heads
|
101 |
+
self.scale = dim_head**-0.5
|
102 |
+
|
103 |
+
self.attend = nn.Softmax(dim=-1)
|
104 |
+
self.dropout = nn.Dropout(dropout)
|
105 |
+
|
106 |
+
context_dim = default(context_dim, dim)
|
107 |
+
self.to_kv = nn.Linear(context_dim, inner_dim * 2, bias=False)
|
108 |
+
self.to_q = nn.Linear(dim, inner_dim, bias=False)
|
109 |
+
|
110 |
+
self.to_out = (
|
111 |
+
nn.Sequential(nn.Linear(inner_dim, dim), nn.Dropout(dropout))
|
112 |
+
if project_out
|
113 |
+
else nn.Identity()
|
114 |
+
)
|
115 |
+
|
116 |
+
def forward(self, x, context=None):
|
117 |
+
context = default(context, x)
|
118 |
+
k, v = self.to_kv(context).chunk(2, dim=-1)
|
119 |
+
q = self.to_q(x)
|
120 |
+
q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=self.heads), [q, k, v])
|
121 |
+
|
122 |
+
dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale
|
123 |
+
|
124 |
+
attn = self.attend(dots)
|
125 |
+
attn = self.dropout(attn)
|
126 |
+
|
127 |
+
out = torch.matmul(attn, v)
|
128 |
+
out = rearrange(out, "b h n d -> b n (h d)")
|
129 |
+
return self.to_out(out)
|
130 |
+
|
131 |
+
|
132 |
+
class Transformer(nn.Module):
|
133 |
+
def __init__(
|
134 |
+
self,
|
135 |
+
dim: int,
|
136 |
+
depth: int,
|
137 |
+
heads: int,
|
138 |
+
dim_head: int,
|
139 |
+
mlp_dim: int,
|
140 |
+
dropout: float = 0.0,
|
141 |
+
norm: str = "layer",
|
142 |
+
norm_cond_dim: int = -1,
|
143 |
+
):
|
144 |
+
super().__init__()
|
145 |
+
self.layers = nn.ModuleList([])
|
146 |
+
for _ in range(depth):
|
147 |
+
sa = Attention(dim, heads=heads, dim_head=dim_head, dropout=dropout)
|
148 |
+
ff = FeedForward(dim, mlp_dim, dropout=dropout)
|
149 |
+
self.layers.append(
|
150 |
+
nn.ModuleList(
|
151 |
+
[
|
152 |
+
PreNorm(dim, sa, norm=norm, norm_cond_dim=norm_cond_dim),
|
153 |
+
PreNorm(dim, ff, norm=norm, norm_cond_dim=norm_cond_dim),
|
154 |
+
]
|
155 |
+
)
|
156 |
+
)
|
157 |
+
|
158 |
+
def forward(self, x: torch.Tensor, *args):
|
159 |
+
for attn, ff in self.layers:
|
160 |
+
x = attn(x, *args) + x
|
161 |
+
x = ff(x, *args) + x
|
162 |
+
return x
|
163 |
+
|
164 |
+
|
165 |
+
class TransformerCrossAttn(nn.Module):
|
166 |
+
def __init__(
|
167 |
+
self,
|
168 |
+
dim: int,
|
169 |
+
depth: int,
|
170 |
+
heads: int,
|
171 |
+
dim_head: int,
|
172 |
+
mlp_dim: int,
|
173 |
+
dropout: float = 0.0,
|
174 |
+
norm: str = "layer",
|
175 |
+
norm_cond_dim: int = -1,
|
176 |
+
context_dim: Optional[int] = None,
|
177 |
+
):
|
178 |
+
super().__init__()
|
179 |
+
self.layers = nn.ModuleList([])
|
180 |
+
for _ in range(depth):
|
181 |
+
sa = Attention(dim, heads=heads, dim_head=dim_head, dropout=dropout)
|
182 |
+
ca = CrossAttention(
|
183 |
+
dim, context_dim=context_dim, heads=heads, dim_head=dim_head, dropout=dropout
|
184 |
+
)
|
185 |
+
ff = FeedForward(dim, mlp_dim, dropout=dropout)
|
186 |
+
self.layers.append(
|
187 |
+
nn.ModuleList(
|
188 |
+
[
|
189 |
+
PreNorm(dim, sa, norm=norm, norm_cond_dim=norm_cond_dim),
|
190 |
+
PreNorm(dim, ca, norm=norm, norm_cond_dim=norm_cond_dim),
|
191 |
+
PreNorm(dim, ff, norm=norm, norm_cond_dim=norm_cond_dim),
|
192 |
+
]
|
193 |
+
)
|
194 |
+
)
|
195 |
+
|
196 |
+
def forward(self, x: torch.Tensor, *args, context=None, context_list=None):
|
197 |
+
if context_list is None:
|
198 |
+
context_list = [context] * len(self.layers)
|
199 |
+
if len(context_list) != len(self.layers):
|
200 |
+
raise ValueError(f"len(context_list) != len(self.layers) ({len(context_list)} != {len(self.layers)})")
|
201 |
+
|
202 |
+
for i, (self_attn, cross_attn, ff) in enumerate(self.layers):
|
203 |
+
x = self_attn(x, *args) + x
|
204 |
+
x = cross_attn(x, *args, context=context_list[i]) + x
|
205 |
+
x = ff(x, *args) + x
|
206 |
+
return x
|
207 |
+
|
208 |
+
|
209 |
+
class DropTokenDropout(nn.Module):
|
210 |
+
def __init__(self, p: float = 0.1):
|
211 |
+
super().__init__()
|
212 |
+
if p < 0 or p > 1:
|
213 |
+
raise ValueError(
|
214 |
+
"dropout probability has to be between 0 and 1, " "but got {}".format(p)
|
215 |
+
)
|
216 |
+
self.p = p
|
217 |
+
|
218 |
+
def forward(self, x: torch.Tensor):
|
219 |
+
# x: (batch_size, seq_len, dim)
|
220 |
+
if self.training and self.p > 0:
|
221 |
+
zero_mask = torch.full_like(x[0, :, 0], self.p).bernoulli().bool()
|
222 |
+
# TODO: permutation idx for each batch using torch.argsort
|
223 |
+
if zero_mask.any():
|
224 |
+
x = x[:, ~zero_mask, :]
|
225 |
+
return x
|
226 |
+
|
227 |
+
|
228 |
+
class ZeroTokenDropout(nn.Module):
|
229 |
+
def __init__(self, p: float = 0.1):
|
230 |
+
super().__init__()
|
231 |
+
if p < 0 or p > 1:
|
232 |
+
raise ValueError(
|
233 |
+
"dropout probability has to be between 0 and 1, " "but got {}".format(p)
|
234 |
+
)
|
235 |
+
self.p = p
|
236 |
+
|
237 |
+
def forward(self, x: torch.Tensor):
|
238 |
+
# x: (batch_size, seq_len, dim)
|
239 |
+
if self.training and self.p > 0:
|
240 |
+
zero_mask = torch.full_like(x[:, :, 0], self.p).bernoulli().bool()
|
241 |
+
# Zero-out the masked tokens
|
242 |
+
x[zero_mask, :] = 0
|
243 |
+
return x
|
244 |
+
|
245 |
+
|
246 |
+
class TransformerEncoder(nn.Module):
|
247 |
+
def __init__(
|
248 |
+
self,
|
249 |
+
num_tokens: int,
|
250 |
+
token_dim: int,
|
251 |
+
dim: int,
|
252 |
+
depth: int,
|
253 |
+
heads: int,
|
254 |
+
mlp_dim: int,
|
255 |
+
dim_head: int = 64,
|
256 |
+
dropout: float = 0.0,
|
257 |
+
emb_dropout: float = 0.0,
|
258 |
+
emb_dropout_type: str = "drop",
|
259 |
+
emb_dropout_loc: str = "token",
|
260 |
+
norm: str = "layer",
|
261 |
+
norm_cond_dim: int = -1,
|
262 |
+
token_pe_numfreq: int = -1,
|
263 |
+
):
|
264 |
+
super().__init__()
|
265 |
+
if token_pe_numfreq > 0:
|
266 |
+
token_dim_new = token_dim * (2 * token_pe_numfreq + 1)
|
267 |
+
self.to_token_embedding = nn.Sequential(
|
268 |
+
Rearrange("b n d -> (b n) d", n=num_tokens, d=token_dim),
|
269 |
+
FrequencyEmbedder(token_pe_numfreq, token_pe_numfreq - 1),
|
270 |
+
Rearrange("(b n) d -> b n d", n=num_tokens, d=token_dim_new),
|
271 |
+
nn.Linear(token_dim_new, dim),
|
272 |
+
)
|
273 |
+
else:
|
274 |
+
self.to_token_embedding = nn.Linear(token_dim, dim)
|
275 |
+
self.pos_embedding = nn.Parameter(torch.randn(1, num_tokens, dim))
|
276 |
+
if emb_dropout_type == "drop":
|
277 |
+
self.dropout = DropTokenDropout(emb_dropout)
|
278 |
+
elif emb_dropout_type == "zero":
|
279 |
+
self.dropout = ZeroTokenDropout(emb_dropout)
|
280 |
+
else:
|
281 |
+
raise ValueError(f"Unknown emb_dropout_type: {emb_dropout_type}")
|
282 |
+
self.emb_dropout_loc = emb_dropout_loc
|
283 |
+
|
284 |
+
self.transformer = Transformer(
|
285 |
+
dim, depth, heads, dim_head, mlp_dim, dropout, norm=norm, norm_cond_dim=norm_cond_dim
|
286 |
+
)
|
287 |
+
|
288 |
+
def forward(self, inp: torch.Tensor, *args, **kwargs):
|
289 |
+
x = inp
|
290 |
+
|
291 |
+
if self.emb_dropout_loc == "input":
|
292 |
+
x = self.dropout(x)
|
293 |
+
x = self.to_token_embedding(x)
|
294 |
+
|
295 |
+
if self.emb_dropout_loc == "token":
|
296 |
+
x = self.dropout(x)
|
297 |
+
b, n, _ = x.shape
|
298 |
+
x += self.pos_embedding[:, :n]
|
299 |
+
|
300 |
+
if self.emb_dropout_loc == "token_afterpos":
|
301 |
+
x = self.dropout(x)
|
302 |
+
x = self.transformer(x, *args)
|
303 |
+
return x
|
304 |
+
|
305 |
+
|
306 |
+
class TransformerDecoder(nn.Module):
|
307 |
+
def __init__(
|
308 |
+
self,
|
309 |
+
num_tokens: int,
|
310 |
+
token_dim: int,
|
311 |
+
dim: int,
|
312 |
+
depth: int,
|
313 |
+
heads: int,
|
314 |
+
mlp_dim: int,
|
315 |
+
dim_head: int = 64,
|
316 |
+
dropout: float = 0.0,
|
317 |
+
emb_dropout: float = 0.0,
|
318 |
+
emb_dropout_type: str = 'drop',
|
319 |
+
norm: str = "layer",
|
320 |
+
norm_cond_dim: int = -1,
|
321 |
+
context_dim: Optional[int] = None,
|
322 |
+
skip_token_embedding: bool = False,
|
323 |
+
):
|
324 |
+
super().__init__()
|
325 |
+
if not skip_token_embedding:
|
326 |
+
self.to_token_embedding = nn.Linear(token_dim, dim)
|
327 |
+
else:
|
328 |
+
self.to_token_embedding = nn.Identity()
|
329 |
+
if token_dim != dim:
|
330 |
+
raise ValueError(
|
331 |
+
f"token_dim ({token_dim}) != dim ({dim}) when skip_token_embedding is True"
|
332 |
+
)
|
333 |
+
|
334 |
+
self.pos_embedding = nn.Parameter(torch.randn(1, num_tokens, dim))
|
335 |
+
if emb_dropout_type == "drop":
|
336 |
+
self.dropout = DropTokenDropout(emb_dropout)
|
337 |
+
elif emb_dropout_type == "zero":
|
338 |
+
self.dropout = ZeroTokenDropout(emb_dropout)
|
339 |
+
elif emb_dropout_type == "normal":
|
340 |
+
self.dropout = nn.Dropout(emb_dropout)
|
341 |
+
|
342 |
+
self.transformer = TransformerCrossAttn(
|
343 |
+
dim,
|
344 |
+
depth,
|
345 |
+
heads,
|
346 |
+
dim_head,
|
347 |
+
mlp_dim,
|
348 |
+
dropout,
|
349 |
+
norm=norm,
|
350 |
+
norm_cond_dim=norm_cond_dim,
|
351 |
+
context_dim=context_dim,
|
352 |
+
)
|
353 |
+
|
354 |
+
def forward(self, inp: torch.Tensor, *args, context=None, context_list=None):
|
355 |
+
x = self.to_token_embedding(inp)
|
356 |
+
b, n, _ = x.shape
|
357 |
+
|
358 |
+
x = self.dropout(x)
|
359 |
+
x += self.pos_embedding[:, :n]
|
360 |
+
|
361 |
+
x = self.transformer(x, *args, context=context, context_list=context_list)
|
362 |
+
return x
|
363 |
+
|
fourm/utils/hmr2_utils/hmr2/models/components/t_cond_mlp.py
ADDED
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# --------------------------------------------------------
|
2 |
+
# Based on the 4DHumans code base
|
3 |
+
# https://github.com/shubham-goel/4D-Humans
|
4 |
+
# --------------------------------------------------------
|
5 |
+
|
6 |
+
import copy
|
7 |
+
from typing import List, Optional
|
8 |
+
|
9 |
+
import torch
|
10 |
+
|
11 |
+
|
12 |
+
class AdaptiveLayerNorm1D(torch.nn.Module):
|
13 |
+
def __init__(self, data_dim: int, norm_cond_dim: int):
|
14 |
+
super().__init__()
|
15 |
+
if data_dim <= 0:
|
16 |
+
raise ValueError(f"data_dim must be positive, but got {data_dim}")
|
17 |
+
if norm_cond_dim <= 0:
|
18 |
+
raise ValueError(f"norm_cond_dim must be positive, but got {norm_cond_dim}")
|
19 |
+
self.norm = torch.nn.LayerNorm(
|
20 |
+
data_dim
|
21 |
+
) # TODO: Check if elementwise_affine=True is correct
|
22 |
+
self.linear = torch.nn.Linear(norm_cond_dim, 2 * data_dim)
|
23 |
+
torch.nn.init.zeros_(self.linear.weight)
|
24 |
+
torch.nn.init.zeros_(self.linear.bias)
|
25 |
+
|
26 |
+
def forward(self, x: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
|
27 |
+
# x: (batch, ..., data_dim)
|
28 |
+
# t: (batch, norm_cond_dim)
|
29 |
+
# return: (batch, data_dim)
|
30 |
+
x = self.norm(x)
|
31 |
+
alpha, beta = self.linear(t).chunk(2, dim=-1)
|
32 |
+
|
33 |
+
# Add singleton dimensions to alpha and beta
|
34 |
+
if x.dim() > 2:
|
35 |
+
alpha = alpha.view(alpha.shape[0], *([1] * (x.dim() - 2)), alpha.shape[1])
|
36 |
+
beta = beta.view(beta.shape[0], *([1] * (x.dim() - 2)), beta.shape[1])
|
37 |
+
|
38 |
+
return x * (1 + alpha) + beta
|
39 |
+
|
40 |
+
|
41 |
+
class SequentialCond(torch.nn.Sequential):
|
42 |
+
def forward(self, input, *args, **kwargs):
|
43 |
+
for module in self:
|
44 |
+
if isinstance(module, (AdaptiveLayerNorm1D, SequentialCond, ResidualMLPBlock)):
|
45 |
+
# print(f'Passing on args to {module}', [a.shape for a in args])
|
46 |
+
input = module(input, *args, **kwargs)
|
47 |
+
else:
|
48 |
+
# print(f'Skipping passing args to {module}', [a.shape for a in args])
|
49 |
+
input = module(input)
|
50 |
+
return input
|
51 |
+
|
52 |
+
|
53 |
+
def normalization_layer(norm: Optional[str], dim: int, norm_cond_dim: int = -1):
|
54 |
+
if norm == "batch":
|
55 |
+
return torch.nn.BatchNorm1d(dim)
|
56 |
+
elif norm == "layer":
|
57 |
+
return torch.nn.LayerNorm(dim)
|
58 |
+
elif norm == "ada":
|
59 |
+
assert norm_cond_dim > 0, f"norm_cond_dim must be positive, got {norm_cond_dim}"
|
60 |
+
return AdaptiveLayerNorm1D(dim, norm_cond_dim)
|
61 |
+
elif norm is None:
|
62 |
+
return torch.nn.Identity()
|
63 |
+
else:
|
64 |
+
raise ValueError(f"Unknown norm: {norm}")
|
65 |
+
|
66 |
+
|
67 |
+
def linear_norm_activ_dropout(
|
68 |
+
input_dim: int,
|
69 |
+
output_dim: int,
|
70 |
+
activation: torch.nn.Module = torch.nn.ReLU(),
|
71 |
+
bias: bool = True,
|
72 |
+
norm: Optional[str] = "layer", # Options: ada/batch/layer
|
73 |
+
dropout: float = 0.0,
|
74 |
+
norm_cond_dim: int = -1,
|
75 |
+
) -> SequentialCond:
|
76 |
+
layers = []
|
77 |
+
layers.append(torch.nn.Linear(input_dim, output_dim, bias=bias))
|
78 |
+
if norm is not None:
|
79 |
+
layers.append(normalization_layer(norm, output_dim, norm_cond_dim))
|
80 |
+
layers.append(copy.deepcopy(activation))
|
81 |
+
if dropout > 0.0:
|
82 |
+
layers.append(torch.nn.Dropout(dropout))
|
83 |
+
return SequentialCond(*layers)
|
84 |
+
|
85 |
+
|
86 |
+
def create_simple_mlp(
|
87 |
+
input_dim: int,
|
88 |
+
hidden_dims: List[int],
|
89 |
+
output_dim: int,
|
90 |
+
activation: torch.nn.Module = torch.nn.ReLU(),
|
91 |
+
bias: bool = True,
|
92 |
+
norm: Optional[str] = "layer", # Options: ada/batch/layer
|
93 |
+
dropout: float = 0.0,
|
94 |
+
norm_cond_dim: int = -1,
|
95 |
+
) -> SequentialCond:
|
96 |
+
layers = []
|
97 |
+
prev_dim = input_dim
|
98 |
+
for hidden_dim in hidden_dims:
|
99 |
+
layers.extend(
|
100 |
+
linear_norm_activ_dropout(
|
101 |
+
prev_dim, hidden_dim, activation, bias, norm, dropout, norm_cond_dim
|
102 |
+
)
|
103 |
+
)
|
104 |
+
prev_dim = hidden_dim
|
105 |
+
layers.append(torch.nn.Linear(prev_dim, output_dim, bias=bias))
|
106 |
+
return SequentialCond(*layers)
|
107 |
+
|
108 |
+
|
109 |
+
class ResidualMLPBlock(torch.nn.Module):
|
110 |
+
def __init__(
|
111 |
+
self,
|
112 |
+
input_dim: int,
|
113 |
+
hidden_dim: int,
|
114 |
+
num_hidden_layers: int,
|
115 |
+
output_dim: int,
|
116 |
+
activation: torch.nn.Module = torch.nn.ReLU(),
|
117 |
+
bias: bool = True,
|
118 |
+
norm: Optional[str] = "layer", # Options: ada/batch/layer
|
119 |
+
dropout: float = 0.0,
|
120 |
+
norm_cond_dim: int = -1,
|
121 |
+
):
|
122 |
+
super().__init__()
|
123 |
+
if not (input_dim == output_dim == hidden_dim):
|
124 |
+
raise NotImplementedError(
|
125 |
+
f"input_dim {input_dim} != output_dim {output_dim} is not implemented"
|
126 |
+
)
|
127 |
+
|
128 |
+
layers = []
|
129 |
+
prev_dim = input_dim
|
130 |
+
for i in range(num_hidden_layers):
|
131 |
+
layers.append(
|
132 |
+
linear_norm_activ_dropout(
|
133 |
+
prev_dim, hidden_dim, activation, bias, norm, dropout, norm_cond_dim
|
134 |
+
)
|
135 |
+
)
|
136 |
+
prev_dim = hidden_dim
|
137 |
+
self.model = SequentialCond(*layers)
|
138 |
+
self.skip = torch.nn.Identity()
|
139 |
+
|
140 |
+
def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
|
141 |
+
return x + self.model(x, *args, **kwargs)
|
142 |
+
|
143 |
+
|
144 |
+
class ResidualMLP(torch.nn.Module):
|
145 |
+
def __init__(
|
146 |
+
self,
|
147 |
+
input_dim: int,
|
148 |
+
hidden_dim: int,
|
149 |
+
num_hidden_layers: int,
|
150 |
+
output_dim: int,
|
151 |
+
activation: torch.nn.Module = torch.nn.ReLU(),
|
152 |
+
bias: bool = True,
|
153 |
+
norm: Optional[str] = "layer", # Options: ada/batch/layer
|
154 |
+
dropout: float = 0.0,
|
155 |
+
num_blocks: int = 1,
|
156 |
+
norm_cond_dim: int = -1,
|
157 |
+
):
|
158 |
+
super().__init__()
|
159 |
+
self.input_dim = input_dim
|
160 |
+
self.model = SequentialCond(
|
161 |
+
linear_norm_activ_dropout(
|
162 |
+
input_dim, hidden_dim, activation, bias, norm, dropout, norm_cond_dim
|
163 |
+
),
|
164 |
+
*[
|
165 |
+
ResidualMLPBlock(
|
166 |
+
hidden_dim,
|
167 |
+
hidden_dim,
|
168 |
+
num_hidden_layers,
|
169 |
+
hidden_dim,
|
170 |
+
activation,
|
171 |
+
bias,
|
172 |
+
norm,
|
173 |
+
dropout,
|
174 |
+
norm_cond_dim,
|
175 |
+
)
|
176 |
+
for _ in range(num_blocks)
|
177 |
+
],
|
178 |
+
torch.nn.Linear(hidden_dim, output_dim, bias=bias),
|
179 |
+
)
|
180 |
+
|
181 |
+
def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
|
182 |
+
return self.model(x, *args, **kwargs)
|
183 |
+
|
184 |
+
|
185 |
+
class FrequencyEmbedder(torch.nn.Module):
|
186 |
+
def __init__(self, num_frequencies, max_freq_log2):
|
187 |
+
super().__init__()
|
188 |
+
frequencies = 2 ** torch.linspace(0, max_freq_log2, steps=num_frequencies)
|
189 |
+
self.register_buffer("frequencies", frequencies)
|
190 |
+
|
191 |
+
def forward(self, x):
|
192 |
+
# x should be of size (N,) or (N, D)
|
193 |
+
N = x.size(0)
|
194 |
+
if x.dim() == 1: # (N,)
|
195 |
+
x = x.unsqueeze(1) # (N, D) where D=1
|
196 |
+
x_unsqueezed = x.unsqueeze(-1) # (N, D, 1)
|
197 |
+
scaled = self.frequencies.view(1, 1, -1) * x_unsqueezed # (N, D, num_frequencies)
|
198 |
+
s = torch.sin(scaled)
|
199 |
+
c = torch.cos(scaled)
|
200 |
+
embedded = torch.cat([s, c, x_unsqueezed], dim=-1).view(
|
201 |
+
N, -1
|
202 |
+
) # (N, D * 2 * num_frequencies + D)
|
203 |
+
return embedded
|
204 |
+
|
fourm/utils/hmr2_utils/hmr2/models/heads/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from .smpl_head import build_smpl_head
|
fourm/utils/hmr2_utils/hmr2/models/heads/smpl_head.py
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# --------------------------------------------------------
|
2 |
+
# Based on the 4DHumans code base
|
3 |
+
# https://github.com/shubham-goel/4D-Humans
|
4 |
+
# --------------------------------------------------------
|
5 |
+
|
6 |
+
import torch
|
7 |
+
import torch.nn as nn
|
8 |
+
import torch.nn.functional as F
|
9 |
+
import numpy as np
|
10 |
+
import einops
|
11 |
+
|
12 |
+
from ...utils.geometry import rot6d_to_rotmat, aa_to_rotmat
|
13 |
+
from ..components.pose_transformer import TransformerDecoder
|
14 |
+
|
15 |
+
def build_smpl_head(cfg):
|
16 |
+
smpl_head_type = cfg.MODEL.SMPL_HEAD.get('TYPE', 'hmr')
|
17 |
+
if smpl_head_type == 'transformer_decoder':
|
18 |
+
return SMPLTransformerDecoderHead(cfg)
|
19 |
+
else:
|
20 |
+
raise ValueError('Unknown SMPL head type: {}'.format(smpl_head_type))
|
21 |
+
|
22 |
+
class SMPLTransformerDecoderHead(nn.Module):
|
23 |
+
""" Cross-attention based SMPL Transformer decoder
|
24 |
+
"""
|
25 |
+
|
26 |
+
def __init__(self, cfg):
|
27 |
+
super().__init__()
|
28 |
+
self.cfg = cfg
|
29 |
+
self.joint_rep_type = cfg.MODEL.SMPL_HEAD.get('JOINT_REP', '6d')
|
30 |
+
self.joint_rep_dim = {'6d': 6, 'aa': 3}[self.joint_rep_type]
|
31 |
+
npose = self.joint_rep_dim * (cfg.SMPL.NUM_BODY_JOINTS + 1)
|
32 |
+
self.npose = npose
|
33 |
+
self.input_is_mean_shape = cfg.MODEL.SMPL_HEAD.get('TRANSFORMER_INPUT', 'zero') == 'mean_shape'
|
34 |
+
transformer_args = dict(
|
35 |
+
num_tokens=1,
|
36 |
+
token_dim=(npose + 10 + 3) if self.input_is_mean_shape else 1,
|
37 |
+
dim=1024,
|
38 |
+
)
|
39 |
+
        transformer_args = (transformer_args | dict(cfg.MODEL.SMPL_HEAD.TRANSFORMER_DECODER))
        self.transformer = TransformerDecoder(
            **transformer_args
        )
        dim = transformer_args['dim']
        self.decpose = nn.Linear(dim, npose)
        self.decshape = nn.Linear(dim, 10)
        self.deccam = nn.Linear(dim, 3)

        if cfg.MODEL.SMPL_HEAD.get('INIT_DECODER_XAVIER', False):
            # True by default in MLP. False by default in Transformer
            nn.init.xavier_uniform_(self.decpose.weight, gain=0.01)
            nn.init.xavier_uniform_(self.decshape.weight, gain=0.01)
            nn.init.xavier_uniform_(self.deccam.weight, gain=0.01)

        mean_params = np.load(cfg.SMPL.MEAN_PARAMS)
        init_body_pose = torch.from_numpy(mean_params['pose'].astype(np.float32)).unsqueeze(0)
        init_betas = torch.from_numpy(mean_params['shape'].astype('float32')).unsqueeze(0)
        init_cam = torch.from_numpy(mean_params['cam'].astype(np.float32)).unsqueeze(0)
        self.register_buffer('init_body_pose', init_body_pose)
        self.register_buffer('init_betas', init_betas)
        self.register_buffer('init_cam', init_cam)

    def forward(self, x, **kwargs):

        batch_size = x.shape[0]
        # vit pretrained backbone is channel-first. Change to token-first
        x = einops.rearrange(x, 'b c h w -> b (h w) c')

        init_body_pose = self.init_body_pose.expand(batch_size, -1)
        init_betas = self.init_betas.expand(batch_size, -1)
        init_cam = self.init_cam.expand(batch_size, -1)

        # TODO: Convert init_body_pose to aa rep if needed
        if self.joint_rep_type == 'aa':
            raise NotImplementedError

        pred_body_pose = init_body_pose
        pred_betas = init_betas
        pred_cam = init_cam
        pred_body_pose_list = []
        pred_betas_list = []
        pred_cam_list = []
        for i in range(self.cfg.MODEL.SMPL_HEAD.get('IEF_ITERS', 1)):
            # Input token to transformer is zero token
            if self.input_is_mean_shape:
                token = torch.cat([pred_body_pose, pred_betas, pred_cam], dim=1)[:, None, :]
            else:
                token = torch.zeros(batch_size, 1, 1).to(x.device)

            # Pass through transformer
            token_out = self.transformer(token, context=x)
            token_out = token_out.squeeze(1)  # (B, C)

            # Readout from token_out
            pred_body_pose = self.decpose(token_out) + pred_body_pose
            pred_betas = self.decshape(token_out) + pred_betas
            pred_cam = self.deccam(token_out) + pred_cam
            pred_body_pose_list.append(pred_body_pose)
            pred_betas_list.append(pred_betas)
            pred_cam_list.append(pred_cam)

        # Convert self.joint_rep_type -> rotmat
        joint_conversion_fn = {
            '6d': rot6d_to_rotmat,
            'aa': lambda x: aa_to_rotmat(x.view(-1, 3).contiguous())
        }[self.joint_rep_type]

        pred_smpl_params_list = {}
        pred_smpl_params_list['body_pose'] = torch.cat([joint_conversion_fn(pbp).view(batch_size, -1, 3, 3)[:, 1:, :, :] for pbp in pred_body_pose_list], dim=0)
        pred_smpl_params_list['betas'] = torch.cat(pred_betas_list, dim=0)
        pred_smpl_params_list['cam'] = torch.cat(pred_cam_list, dim=0)
        pred_body_pose = joint_conversion_fn(pred_body_pose).view(batch_size, self.cfg.SMPL.NUM_BODY_JOINTS+1, 3, 3)

        pred_smpl_params = {'global_orient': pred_body_pose[:, [0]],
                            'body_pose': pred_body_pose[:, 1:],
                            'betas': pred_betas}
        return pred_smpl_params, pred_cam, pred_smpl_params_list
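The loop above is the iterative error feedback (IEF) readout: at each iteration the decoder token is projected to residual updates for pose, shape, and camera, which are added onto the running estimates. Below is a minimal, self-contained sketch of just that readout, with a random tensor standing in for the transformer output and illustrative dimensions that are not taken from any config.

import torch
import torch.nn as nn

B, C, npose = 2, 1280, 24 * 6                      # batch size, token width, 6D pose for 24 joints (illustrative)
decpose, decshape, deccam = nn.Linear(C, npose), nn.Linear(C, 10), nn.Linear(C, 3)
token_out = torch.randn(B, C)                      # stand-in for the TransformerDecoder output token
pred_pose = torch.zeros(B, npose)                  # the real head starts from the SMPL mean parameters
pred_betas, pred_cam = torch.zeros(B, 10), torch.zeros(B, 3)
for _ in range(3):                                 # IEF_ITERS
    pred_pose = decpose(token_out) + pred_pose     # residual update, as in the head above
    pred_betas = decshape(token_out) + pred_betas
    pred_cam = deccam(token_out) + pred_cam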
fourm/utils/hmr2_utils/hmr2/models/hmr2.py
ADDED
@@ -0,0 +1,117 @@
# --------------------------------------------------------
# Based on the 4DHumans code base
# https://github.com/shubham-goel/4D-Humans
# --------------------------------------------------------

import torch
from typing import Any, Dict, Mapping, Tuple

from yacs.config import CfgNode

from ..utils import SkeletonRenderer, MeshRenderer
from ..utils.geometry import perspective_projection
from .backbones import create_backbone
from .heads import build_smpl_head
from . import SMPL

class HMR2(torch.nn.Module):

    def __init__(self, cfg: CfgNode, init_renderer: bool = True):
        """
        Setup HMR2 model
        Args:
            cfg (CfgNode): Config file as a yacs CfgNode
        """
        super().__init__()

        # Save hyperparameters
        self.save_hyperparameters(logger=False, ignore=['init_renderer'])

        self.cfg = cfg
        # Create backbone feature extractor
        self.backbone = create_backbone(cfg)
        if cfg.MODEL.BACKBONE.get('PRETRAINED_WEIGHTS', None):
            self.backbone.load_state_dict(torch.load(cfg.MODEL.BACKBONE.PRETRAINED_WEIGHTS, map_location='cpu')['state_dict'])

        # Create SMPL head
        self.smpl_head = build_smpl_head(cfg)

        # Instantiate SMPL model
        smpl_cfg = {k.lower(): v for k, v in dict(cfg.SMPL).items()}
        self.smpl = SMPL(**smpl_cfg)

        # Buffer that shows whether we need to initialize ActNorm layers
        self.register_buffer('initialized', torch.tensor(False))
        # Setup renderer for visualization
        if init_renderer:
            self.renderer = SkeletonRenderer(self.cfg)
            self.mesh_renderer = MeshRenderer(self.cfg, faces=self.smpl.faces)
        else:
            self.renderer = None
            self.mesh_renderer = None

        # Disable automatic optimization since we use adversarial training
        self.automatic_optimization = False

    def forward_step(self, batch: Dict, train: bool = False) -> Dict:
        """
        Run a forward step of the network
        Args:
            batch (Dict): Dictionary containing batch data
            train (bool): Flag indicating whether it is training or validation mode
        Returns:
            Dict: Dictionary containing the regression output
        """

        # Use RGB image as input
        x = batch['img']
        batch_size = x.shape[0]

        # Compute conditioning features using the backbone
        # if using ViT backbone, we need to use a different aspect ratio
        conditioning_feats = self.backbone(x[:, :, :, 32:-32])

        pred_smpl_params, pred_cam, _ = self.smpl_head(conditioning_feats)

        # Store useful regression outputs to the output dict
        output = {}
        output['pred_cam'] = pred_cam
        output['pred_smpl_params'] = {k: v.clone() for k, v in pred_smpl_params.items()}

        # Compute camera translation
        device = pred_smpl_params['body_pose'].device
        dtype = pred_smpl_params['body_pose'].dtype
        focal_length = self.cfg.EXTRA.FOCAL_LENGTH * torch.ones(batch_size, 2, device=device, dtype=dtype)
        pred_cam_t = torch.stack([pred_cam[:, 1],
                                  pred_cam[:, 2],
                                  2 * focal_length[:, 0] / (self.cfg.MODEL.IMAGE_SIZE * pred_cam[:, 0] + 1e-9)], dim=-1)
        output['pred_cam_t'] = pred_cam_t
        output['focal_length'] = focal_length

        # Compute model vertices, joints and the projected joints
        pred_smpl_params['global_orient'] = pred_smpl_params['global_orient'].reshape(batch_size, -1, 3, 3)
        pred_smpl_params['body_pose'] = pred_smpl_params['body_pose'].reshape(batch_size, -1, 3, 3)
        pred_smpl_params['betas'] = pred_smpl_params['betas'].reshape(batch_size, -1)
        smpl_output = self.smpl(**{k: v.float() for k, v in pred_smpl_params.items()}, pose2rot=False)
        pred_keypoints_3d = smpl_output.joints
        pred_vertices = smpl_output.vertices
        output['pred_keypoints_3d'] = pred_keypoints_3d.reshape(batch_size, -1, 3)
        output['pred_vertices'] = pred_vertices.reshape(batch_size, -1, 3)
        pred_cam_t = pred_cam_t.reshape(-1, 3)
        focal_length = focal_length.reshape(-1, 2)
        pred_keypoints_2d = perspective_projection(pred_keypoints_3d,
                                                   translation=pred_cam_t,
                                                   focal_length=focal_length / self.cfg.MODEL.IMAGE_SIZE)

        output['pred_keypoints_2d'] = pred_keypoints_2d.reshape(batch_size, -1, 2)
        return output

    def forward(self, batch: Dict) -> Dict:
        """
        Run a forward step of the network in val mode
        Args:
            batch (Dict): Dictionary containing batch data
        Returns:
            Dict: Dictionary containing the regression output
        """
        return self.forward_step(batch, train=False)
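forward_step converts the predicted weak-perspective camera (s, tx, ty) into a full translation by recovering depth as 2·f / (IMAGE_SIZE·s). A small worked example of that conversion follows; FOCAL_LENGTH and IMAGE_SIZE are illustrative values here, not read from a real config.

import torch

FOCAL_LENGTH, IMAGE_SIZE = 5000.0, 256
pred_cam = torch.tensor([[0.9, 0.02, -0.01]])          # (s, tx, ty) for a single sample
focal_length = FOCAL_LENGTH * torch.ones(1, 2)
pred_cam_t = torch.stack([pred_cam[:, 1],
                          pred_cam[:, 2],
                          2 * focal_length[:, 0] / (IMAGE_SIZE * pred_cam[:, 0] + 1e-9)], dim=-1)
# pred_cam_t is approximately [[0.02, -0.01, 43.40]]: the larger the predicted scale, the closer the body.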
fourm/utils/hmr2_utils/hmr2/models/smpl_wrapper.py
ADDED
@@ -0,0 +1,47 @@
# --------------------------------------------------------
# Based on the 4DHumans and ProHMR code bases
# https://github.com/shubham-goel/4D-Humans
# https://github.com/nkolot/ProHMR
# --------------------------------------------------------

import torch
import numpy as np
import pickle
from typing import Optional
import smplx
from smplx.lbs import vertices2joints
from smplx.utils import SMPLOutput


class SMPL(smplx.SMPLLayer):
    def __init__(self, *args, joint_regressor_extra: Optional[str] = None, update_hips: bool = False, **kwargs):
        """
        Extension of the official SMPL implementation to support more joints.
        Args:
            Same as SMPLLayer.
            joint_regressor_extra (str): Path to extra joint regressor.
        """
        super(SMPL, self).__init__(*args, **kwargs)
        smpl_to_openpose = [24, 12, 17, 19, 21, 16, 18, 20, 0, 2, 5, 8, 1, 4,
                            7, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]

        if joint_regressor_extra is not None:
            self.register_buffer('joint_regressor_extra', torch.tensor(pickle.load(open(joint_regressor_extra, 'rb'), encoding='latin1'), dtype=torch.float32))
        self.register_buffer('joint_map', torch.tensor(smpl_to_openpose, dtype=torch.long))
        self.update_hips = update_hips

    def forward(self, *args, **kwargs) -> SMPLOutput:
        """
        Run forward pass. Same as SMPL and also append an extra set of joints if joint_regressor_extra is specified.
        """
        smpl_output = super(SMPL, self).forward(*args, **kwargs)
        joints = smpl_output.joints[:, self.joint_map, :]
        if self.update_hips:
            joints[:, [9, 12]] = joints[:, [9, 12]] + \
                0.25 * (joints[:, [9, 12]] - joints[:, [12, 9]]) + \
                0.5 * (joints[:, [8]] - 0.5 * (joints[:, [9, 12]] + joints[:, [12, 9]]))
        if hasattr(self, 'joint_regressor_extra'):
            extra_joints = vertices2joints(self.joint_regressor_extra, smpl_output.vertices)
            joints = torch.cat([joints, extra_joints], dim=1)
        smpl_output.joints = joints
        return smpl_output
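The wrapper's main addition is reordering SMPL joints (plus extra vertex-regressed joints at indices 25 and above) into the OpenPose convention via joint_map. A toy illustration of that gather, using a random placeholder tensor instead of real SMPL output:

import torch

joints = torch.randn(1, 45, 3)                          # placeholder joint set, not real SMPL output
joint_map = torch.tensor([24, 12, 17, 19, 21, 16, 18, 20, 0, 2, 5, 8, 1, 4,
                          7, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34])
openpose_joints = joints[:, joint_map, :]               # (1, 25, 3), OpenPose body-25 ordering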
fourm/utils/hmr2_utils/hmr2/utils/__init__.py
ADDED
@@ -0,0 +1,31 @@
# --------------------------------------------------------
# Based on the 4DHumans and ProHMR code bases
# https://github.com/shubham-goel/4D-Humans
# https://github.com/nkolot/ProHMR
# --------------------------------------------------------

import torch
from typing import Any

from .renderer import Renderer
from .mesh_renderer import MeshRenderer
from .skeleton_renderer import SkeletonRenderer
# from .pose_utils import eval_pose, Evaluator

def recursive_to(x: Any, target: torch.device):
    """
    Recursively transfer a batch of data to the target device
    Args:
        x (Any): Batch of data.
        target (torch.device): Target device.
    Returns:
        Batch of data where all tensors are transferred to the target device.
    """
    if isinstance(x, dict):
        return {k: recursive_to(v, target) for k, v in x.items()}
    elif isinstance(x, torch.Tensor):
        return x.to(target)
    elif isinstance(x, list):
        return [recursive_to(i, target) for i in x]
    else:
        return x
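A short usage example for recursive_to, assuming the function above is in scope: tensors nested inside dicts and lists are moved to the target device, while everything else is returned unchanged.

import torch

batch = {'img': torch.zeros(2, 3, 256, 256),
         'meta': {'ids': [torch.tensor(0), torch.tensor(1)], 'names': ['a', 'b']}}
batch = recursive_to(batch, torch.device('cpu'))   # tensors moved, the list of strings is left as-is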
fourm/utils/hmr2_utils/hmr2/utils/geometry.py
ADDED
@@ -0,0 +1,109 @@
# --------------------------------------------------------
# Based on the 4DHumans, ProHMR, and SPIN code bases
# https://github.com/shubham-goel/4D-Humans
# https://github.com/nkolot/ProHMR
# https://github.com/nkolot/SPIN
# --------------------------------------------------------

from typing import Optional
import torch
from torch.nn import functional as F

def aa_to_rotmat(theta: torch.Tensor):
    """
    Convert axis-angle representation to rotation matrix.
    Works by first converting it to a quaternion.
    Args:
        theta (torch.Tensor): Tensor of shape (B, 3) containing axis-angle representations.
    Returns:
        torch.Tensor: Corresponding rotation matrices with shape (B, 3, 3).
    """
    norm = torch.norm(theta + 1e-8, p=2, dim=1)
    angle = torch.unsqueeze(norm, -1)
    normalized = torch.div(theta, angle)
    angle = angle * 0.5
    v_cos = torch.cos(angle)
    v_sin = torch.sin(angle)
    quat = torch.cat([v_cos, v_sin * normalized], dim=1)
    return quat_to_rotmat(quat)

def quat_to_rotmat(quat: torch.Tensor) -> torch.Tensor:
    """
    Convert quaternion representation to rotation matrix.
    Args:
        quat (torch.Tensor) of shape (B, 4); 4 <===> (w, x, y, z).
    Returns:
        torch.Tensor: Corresponding rotation matrices with shape (B, 3, 3).
    """
    norm_quat = quat
    norm_quat = norm_quat / norm_quat.norm(p=2, dim=1, keepdim=True)
    w, x, y, z = norm_quat[:, 0], norm_quat[:, 1], norm_quat[:, 2], norm_quat[:, 3]

    B = quat.size(0)

    w2, x2, y2, z2 = w.pow(2), x.pow(2), y.pow(2), z.pow(2)
    wx, wy, wz = w*x, w*y, w*z
    xy, xz, yz = x*y, x*z, y*z

    rotMat = torch.stack([w2 + x2 - y2 - z2, 2*xy - 2*wz, 2*wy + 2*xz,
                          2*wz + 2*xy, w2 - x2 + y2 - z2, 2*yz - 2*wx,
                          2*xz - 2*wy, 2*wx + 2*yz, w2 - x2 - y2 + z2], dim=1).view(B, 3, 3)
    return rotMat


def rot6d_to_rotmat(x: torch.Tensor) -> torch.Tensor:
    """
    Convert 6D rotation representation to 3x3 rotation matrix.
    Based on Zhou et al., "On the Continuity of Rotation Representations in Neural Networks", CVPR 2019
    Args:
        x (torch.Tensor): (B,6) Batch of 6-D rotation representations.
    Returns:
        torch.Tensor: Batch of corresponding rotation matrices with shape (B,3,3).
    """
    x = x.reshape(-1, 2, 3).permute(0, 2, 1).contiguous()
    a1 = x[:, :, 0]
    a2 = x[:, :, 1]
    b1 = F.normalize(a1)
    b2 = F.normalize(a2 - torch.einsum('bi,bi->b', b1, a2).unsqueeze(-1) * b1)
    b3 = torch.cross(b1, b2)
    return torch.stack((b1, b2, b3), dim=-1)

def perspective_projection(points: torch.Tensor,
                           translation: torch.Tensor,
                           focal_length: torch.Tensor,
                           camera_center: Optional[torch.Tensor] = None,
                           rotation: Optional[torch.Tensor] = None) -> torch.Tensor:
    """
    Computes the perspective projection of a set of 3D points.
    Args:
        points (torch.Tensor): Tensor of shape (B, N, 3) containing the input 3D points.
        translation (torch.Tensor): Tensor of shape (B, 3) containing the 3D camera translation.
        focal_length (torch.Tensor): Tensor of shape (B, 2) containing the focal length in pixels.
        camera_center (torch.Tensor): Tensor of shape (B, 2) containing the camera center in pixels.
        rotation (torch.Tensor): Tensor of shape (B, 3, 3) containing the camera rotation.
    Returns:
        torch.Tensor: Tensor of shape (B, N, 2) containing the projection of the input points.
    """
    batch_size = points.shape[0]
    if rotation is None:
        rotation = torch.eye(3, device=points.device, dtype=points.dtype).unsqueeze(0).expand(batch_size, -1, -1)
    if camera_center is None:
        camera_center = torch.zeros(batch_size, 2, device=points.device, dtype=points.dtype)
    # Populate intrinsic camera matrix K.
    K = torch.zeros([batch_size, 3, 3], device=points.device, dtype=points.dtype)
    K[:, 0, 0] = focal_length[:, 0]
    K[:, 1, 1] = focal_length[:, 1]
    K[:, 2, 2] = 1.
    K[:, :-1, -1] = camera_center

    # Transform points
    points = torch.einsum('bij,bkj->bki', rotation, points)
    points = points + translation.unsqueeze(1)

    # Apply perspective distortion
    projected_points = points / points[:, :, -1].unsqueeze(-1)

    # Apply camera intrinsics
    projected_points = torch.einsum('bij,bkj->bki', K, projected_points)

    return projected_points[:, :, :-1]
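A quick sanity check for the conversions above, assuming the functions are in scope: the 6D vector holding the first two columns of the identity, and the identity quaternion (w, x, y, z) = (1, 0, 0, 0), both map to the identity rotation.

import torch

R = rot6d_to_rotmat(torch.tensor([[1., 0., 0., 0., 1., 0.]]))      # Gram-Schmidt recovers I
assert torch.allclose(R, torch.eye(3).unsqueeze(0), atol=1e-6)
R = quat_to_rotmat(torch.tensor([[1., 0., 0., 0.]]))
assert torch.allclose(R, torch.eye(3).unsqueeze(0), atol=1e-6)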
fourm/utils/hmr2_utils/hmr2/utils/mesh_renderer.py
ADDED
@@ -0,0 +1,155 @@
# --------------------------------------------------------
# Based on the 4DHumans and ProHMR code bases
# https://github.com/shubham-goel/4D-Humans
# https://github.com/nkolot/ProHMR
# --------------------------------------------------------

import os
if 'PYOPENGL_PLATFORM' not in os.environ:
    os.environ['PYOPENGL_PLATFORM'] = 'egl'
import torch
from torchvision.utils import make_grid
import numpy as np
import pyrender
import trimesh
import cv2
import torch.nn.functional as F

from .render_openpose import render_openpose

def create_raymond_lights():
    import pyrender
    thetas = np.pi * np.array([1.0 / 6.0, 1.0 / 6.0, 1.0 / 6.0])
    phis = np.pi * np.array([0.0, 2.0 / 3.0, 4.0 / 3.0])

    nodes = []

    for phi, theta in zip(phis, thetas):
        xp = np.sin(theta) * np.cos(phi)
        yp = np.sin(theta) * np.sin(phi)
        zp = np.cos(theta)

        z = np.array([xp, yp, zp])
        z = z / np.linalg.norm(z)
        x = np.array([-z[1], z[0], 0.0])
        if np.linalg.norm(x) == 0:
            x = np.array([1.0, 0.0, 0.0])
        x = x / np.linalg.norm(x)
        y = np.cross(z, x)

        matrix = np.eye(4)
        matrix[:3, :3] = np.c_[x, y, z]
        nodes.append(pyrender.Node(
            light=pyrender.DirectionalLight(color=np.ones(3), intensity=1.0),
            matrix=matrix
        ))

    return nodes

class MeshRenderer:

    def __init__(self, cfg, faces=None):
        self.cfg = cfg
        self.focal_length = cfg.EXTRA.FOCAL_LENGTH
        self.img_res = cfg.MODEL.IMAGE_SIZE
        self.renderer = pyrender.OffscreenRenderer(viewport_width=self.img_res,
                                                   viewport_height=self.img_res,
                                                   point_size=1.0)

        self.camera_center = [self.img_res // 2, self.img_res // 2]
        self.faces = faces

    def visualize(self, vertices, camera_translation, images, focal_length=None, nrow=3, padding=2):
        images_np = np.transpose(images, (0, 2, 3, 1))
        rend_imgs = []
        for i in range(vertices.shape[0]):
            fl = self.focal_length
            rend_img = torch.from_numpy(np.transpose(self.__call__(vertices[i], camera_translation[i], images_np[i], focal_length=fl, side_view=False), (2, 0, 1))).float()
            rend_img_side = torch.from_numpy(np.transpose(self.__call__(vertices[i], camera_translation[i], images_np[i], focal_length=fl, side_view=True), (2, 0, 1))).float()
            rend_imgs.append(torch.from_numpy(images[i]))
            rend_imgs.append(rend_img)
            rend_imgs.append(rend_img_side)
        rend_imgs = make_grid(rend_imgs, nrow=nrow, padding=padding)
        return rend_imgs

    def visualize_tensorboard(self, vertices, camera_translation, images, pred_keypoints, gt_keypoints, focal_length=None, nrow=5, padding=2):
        images_np = np.transpose(images, (0, 2, 3, 1))
        rend_imgs = []
        pred_keypoints = np.concatenate((pred_keypoints, np.ones_like(pred_keypoints)[:, :, [0]]), axis=-1)
        pred_keypoints = self.img_res * (pred_keypoints + 0.5)
        gt_keypoints[:, :, :-1] = self.img_res * (gt_keypoints[:, :, :-1] + 0.5)
        keypoint_matches = [(1, 12), (2, 8), (3, 7), (4, 6), (5, 9), (6, 10), (7, 11), (8, 14), (9, 2), (10, 1), (11, 0), (12, 3), (13, 4), (14, 5)]
        for i in range(vertices.shape[0]):
            fl = self.focal_length
            rend_img = torch.from_numpy(np.transpose(self.__call__(vertices[i], camera_translation[i], images_np[i], focal_length=fl, side_view=False), (2, 0, 1))).float()
            rend_img_side = torch.from_numpy(np.transpose(self.__call__(vertices[i], camera_translation[i], images_np[i], focal_length=fl, side_view=True), (2, 0, 1))).float()
            body_keypoints = pred_keypoints[i, :25]
            extra_keypoints = pred_keypoints[i, -19:]
            for pair in keypoint_matches:
                body_keypoints[pair[0], :] = extra_keypoints[pair[1], :]
            pred_keypoints_img = render_openpose(255 * images_np[i].copy(), body_keypoints) / 255
            body_keypoints = gt_keypoints[i, :25]
            extra_keypoints = gt_keypoints[i, -19:]
            for pair in keypoint_matches:
                if extra_keypoints[pair[1], -1] > 0 and body_keypoints[pair[0], -1] == 0:
                    body_keypoints[pair[0], :] = extra_keypoints[pair[1], :]
            gt_keypoints_img = render_openpose(255 * images_np[i].copy(), body_keypoints) / 255
            rend_imgs.append(torch.from_numpy(images[i]))
            rend_imgs.append(rend_img)
            rend_imgs.append(rend_img_side)
            rend_imgs.append(torch.from_numpy(pred_keypoints_img).permute(2, 0, 1))
            rend_imgs.append(torch.from_numpy(gt_keypoints_img).permute(2, 0, 1))
        rend_imgs = make_grid(rend_imgs, nrow=nrow, padding=padding)
        return rend_imgs

    def __call__(self, vertices, camera_translation, image, focal_length=5000, text=None, resize=None, side_view=False, baseColorFactor=(1.0, 1.0, 0.9, 1.0), rot_angle=90):
        renderer = pyrender.OffscreenRenderer(viewport_width=image.shape[1],
                                              viewport_height=image.shape[0],
                                              point_size=1.0)
        material = pyrender.MetallicRoughnessMaterial(
            metallicFactor=0.0,
            alphaMode='OPAQUE',
            baseColorFactor=baseColorFactor)

        camera_translation[0] *= -1.

        mesh = trimesh.Trimesh(vertices.copy(), self.faces.copy())
        if side_view:
            rot = trimesh.transformations.rotation_matrix(
                np.radians(rot_angle), [0, 1, 0])
            mesh.apply_transform(rot)
        rot = trimesh.transformations.rotation_matrix(
            np.radians(180), [1, 0, 0])
        mesh.apply_transform(rot)
        mesh = pyrender.Mesh.from_trimesh(mesh, material=material)

        scene = pyrender.Scene(bg_color=[0.0, 0.0, 0.0, 0.0],
                               ambient_light=(0.3, 0.3, 0.3))
        scene.add(mesh, 'mesh')

        camera_pose = np.eye(4)
        camera_pose[:3, 3] = camera_translation
        camera_center = [image.shape[1] / 2., image.shape[0] / 2.]
        camera = pyrender.IntrinsicsCamera(fx=focal_length, fy=focal_length,
                                           cx=camera_center[0], cy=camera_center[1])
        scene.add(camera, pose=camera_pose)

        light_nodes = create_raymond_lights()
        for node in light_nodes:
            scene.add_node(node)

        color, rend_depth = renderer.render(scene, flags=pyrender.RenderFlags.RGBA)
        color = color.astype(np.float32) / 255.0
        valid_mask = (color[:, :, -1] > 0)[:, :, np.newaxis]
        if not side_view:
            output_img = (color[:, :, :3] * valid_mask +
                          (1 - valid_mask) * image)
        else:
            output_img = color[:, :, :3]
        if resize is not None:
            output_img = cv2.resize(output_img, resize)

        output_img = output_img.astype(np.float32)
        renderer.delete()
        return output_img
fourm/utils/hmr2_utils/hmr2/utils/render_openpose.py
ADDED
@@ -0,0 +1,155 @@
# --------------------------------------------------------
# Based on the 4DHumans and ProHMR code bases
# https://github.com/shubham-goel/4D-Humans
# https://github.com/nkolot/ProHMR
# --------------------------------------------------------

"""
Render OpenPose keypoints.
Code was ported to Python from the official C++ implementation https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/utilities/keypoint.cpp
"""
import cv2
import math
import numpy as np
from typing import List, Tuple

def get_keypoints_rectangle(keypoints: np.array, threshold: float) -> Tuple[float, float, float]:
    """
    Compute rectangle enclosing keypoints above the threshold.
    Args:
        keypoints (np.array): Keypoint array of shape (N, 3).
        threshold (float): Confidence visualization threshold.
    Returns:
        Tuple[float, float, float]: Rectangle width, height and area.
    """
    valid_ind = keypoints[:, -1] > threshold
    if valid_ind.sum() > 0:
        valid_keypoints = keypoints[valid_ind][:, :-1]
        max_x = valid_keypoints[:, 0].max()
        max_y = valid_keypoints[:, 1].max()
        min_x = valid_keypoints[:, 0].min()
        min_y = valid_keypoints[:, 1].min()
        width = max_x - min_x
        height = max_y - min_y
        area = width * height
        return width, height, area
    else:
        return 0, 0, 0

def render_keypoints(img: np.array,
                     keypoints: np.array,
                     pairs: List,
                     colors: List,
                     thickness_circle_ratio: float,
                     thickness_line_ratio_wrt_circle: float,
                     pose_scales: List,
                     threshold: float = 0.1) -> np.array:
    """
    Render keypoints on input image.
    Args:
        img (np.array): Input image of shape (H, W, 3) with pixel values in the [0,255] range.
        keypoints (np.array): Keypoint array of shape (N, 3).
        pairs (List): List of keypoint pairs per limb.
        colors (List): List of colors per keypoint.
        thickness_circle_ratio (float): Circle thickness ratio.
        thickness_line_ratio_wrt_circle (float): Line thickness ratio wrt the circle.
        pose_scales (List): List of pose scales.
        threshold (float): Only visualize keypoints with confidence above the threshold.
    Returns:
        (np.array): Image of shape (H, W, 3) with keypoints drawn on top of the original image.
    """
    img_orig = img.copy()
    width, height = img.shape[1], img.shape[2]
    area = width * height

    lineType = 8
    shift = 0
    numberColors = len(colors)
    thresholdRectangle = 0.1

    person_width, person_height, person_area = get_keypoints_rectangle(keypoints, thresholdRectangle)
    if person_area > 0:
        ratioAreas = min(1, max(person_width / width, person_height / height))
        thicknessRatio = np.maximum(np.round(math.sqrt(area) * thickness_circle_ratio * ratioAreas), 2)
        thicknessCircle = np.maximum(1, thicknessRatio if ratioAreas > 0.05 else -np.ones_like(thicknessRatio))
        thicknessLine = np.maximum(1, np.round(thicknessRatio * thickness_line_ratio_wrt_circle))
        radius = thicknessRatio / 2

        img = np.ascontiguousarray(img.copy())
        for i, pair in enumerate(pairs):
            index1, index2 = pair
            if keypoints[index1, -1] > threshold and keypoints[index2, -1] > threshold:
                thicknessLineScaled = int(round(min(thicknessLine[index1], thicknessLine[index2]) * pose_scales[0]))
                colorIndex = index2
                color = colors[colorIndex % numberColors]
                keypoint1 = keypoints[index1, :-1].astype(np.int)
                keypoint2 = keypoints[index2, :-1].astype(np.int)
                cv2.line(img, tuple(keypoint1.tolist()), tuple(keypoint2.tolist()), tuple(color.tolist()), thicknessLineScaled, lineType, shift)
        for part in range(len(keypoints)):
            faceIndex = part
            if keypoints[faceIndex, -1] > threshold:
                radiusScaled = int(round(radius[faceIndex] * pose_scales[0]))
                thicknessCircleScaled = int(round(thicknessCircle[faceIndex] * pose_scales[0]))
                colorIndex = part
                color = colors[colorIndex % numberColors]
                center = keypoints[faceIndex, :-1].astype(np.int)
                cv2.circle(img, tuple(center.tolist()), radiusScaled, tuple(color.tolist()), thicknessCircleScaled, lineType, shift)
    return img

def render_body_keypoints(img: np.array,
                          body_keypoints: np.array) -> np.array:
    """
    Render OpenPose body keypoints on input image.
    Args:
        img (np.array): Input image of shape (H, W, 3) with pixel values in the [0,255] range.
        body_keypoints (np.array): Keypoint array of shape (N, 3); 3 <====> (x, y, confidence).
    Returns:
        (np.array): Image of shape (H, W, 3) with keypoints drawn on top of the original image.
    """

    thickness_circle_ratio = 1./75. * np.ones(body_keypoints.shape[0])
    thickness_line_ratio_wrt_circle = 0.75
    pairs = []
    pairs = [1,8,1,2,1,5,2,3,3,4,5,6,6,7,8,9,9,10,10,11,8,12,12,13,13,14,1,0,0,15,15,17,0,16,16,18,14,19,19,20,14,21,11,22,22,23,11,24]
    pairs = np.array(pairs).reshape(-1, 2)
    colors = [255., 0., 85.,
              255., 0., 0.,
              255., 85., 0.,
              255., 170., 0.,
              255., 255., 0.,
              170., 255., 0.,
              85., 255., 0.,
              0., 255., 0.,
              255., 0., 0.,
              0., 255., 85.,
              0., 255., 170.,
              0., 255., 255.,
              0., 170., 255.,
              0., 85., 255.,
              0., 0., 255.,
              255., 0., 170.,
              170., 0., 255.,
              255., 0., 255.,
              85., 0., 255.,
              0., 0., 255.,
              0., 0., 255.,
              0., 0., 255.,
              0., 255., 255.,
              0., 255., 255.,
              0., 255., 255.]
    colors = np.array(colors).reshape(-1, 3)
    pose_scales = [1]
    return render_keypoints(img, body_keypoints, pairs, colors, thickness_circle_ratio, thickness_line_ratio_wrt_circle, pose_scales, 0.1)

def render_openpose(img: np.array,
                    body_keypoints: np.array) -> np.array:
    """
    Render keypoints in the OpenPose format on input image.
    Args:
        img (np.array): Input image of shape (H, W, 3) with pixel values in the [0,255] range.
        body_keypoints (np.array): Keypoint array of shape (N, 3); 3 <====> (x, y, confidence).
    Returns:
        (np.array): Image of shape (H, W, 3) with keypoints drawn on top of the original image.
    """
    img = render_body_keypoints(img, body_keypoints)
    return img
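A usage sketch for render_openpose with made-up keypoints given as (x, y, confidence) in pixel coordinates. Note that render_keypoints above uses the deprecated np.int alias, so this assumes a NumPy version where that alias still exists.

import numpy as np

img = np.zeros((256, 256, 3), dtype=np.float32)    # blank canvas; values are expected in [0, 255]
body_keypoints = np.zeros((44, 3))                 # 25 body + 19 extra keypoints; confidence 0 hides a joint
body_keypoints[0] = [120, 60, 0.9]                 # nose
body_keypoints[1] = [136, 90, 0.9]                 # neck
vis = render_openpose(img, body_keypoints)         # new (256, 256, 3) array with the nose-neck limb drawn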