mridulk committed
Commit 17191f4 · 1 Parent(s): 642d5e2

added data

ldm/data/__init__.py ADDED
File without changes
ldm/data/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (141 Bytes)
ldm/data/__pycache__/base.cpython-38.pyc ADDED
Binary file (3.62 kB)
ldm/data/__pycache__/constants.cpython-38.pyc ADDED
Binary file (1.24 kB)
ldm/data/__pycache__/custom.cpython-38.pyc ADDED
Binary file (3.16 kB)
ldm/data/__pycache__/custom_cub.cpython-38.pyc ADDED
Binary file (3.15 kB)
ldm/data/__pycache__/i2sb_dataloader.cpython-38.pyc ADDED
Binary file (4.52 kB)
ldm/data/__pycache__/imagenet.cpython-38.pyc ADDED
Binary file (14.5 kB)
ldm/data/__pycache__/phylogeny.cpython-38.pyc ADDED
Binary file (8.57 kB)
ldm/data/__pycache__/utils.cpython-38.pyc ADDED
Binary file (2.61 kB)
 
ldm/data/base.py ADDED
@@ -0,0 +1,85 @@
from abc import abstractmethod
from torch.utils.data import Dataset, ConcatDataset, ChainDataset, IterableDataset
import numpy as np
import albumentations
from PIL import Image


class Txt2ImgIterableBaseDataset(IterableDataset):
    '''
    Define an interface to make the IterableDatasets for text2img data chainable
    '''
    def __init__(self, num_records=0, valid_ids=None, size=256):
        super().__init__()
        self.num_records = num_records
        self.valid_ids = valid_ids
        self.sample_ids = valid_ids
        self.size = size

        print(f'{self.__class__.__name__} dataset contains {self.__len__()} examples.')

    def __len__(self):
        return self.num_records

    @abstractmethod
    def __iter__(self):
        pass


class ImagePaths(Dataset):
    def __init__(self, paths, size=None, random_crop=False, horizontalflip=False, random_contrast=False,
                 shiftrotate=False, labels=None, unique_skipped_labels=[]):
        self.size = size
        self.random_crop = random_crop

        self.labels = dict() if labels is None else labels
        self.labels["file_path_"] = paths
        self._length = len(paths)

        # Optionally drop every example whose class is in unique_skipped_labels.
        self.labels_without_skipped = None
        if len(unique_skipped_labels) != 0:
            self.labels_without_skipped = dict()
            for i in self.labels.keys():
                self.labels_without_skipped[i] = [a for indx, a in enumerate(labels[i]) if labels['class'][indx] not in unique_skipped_labels]
            self._length = len(self.labels_without_skipped['class'])

        if self.size is not None and self.size > 0:
            self.rescaler = albumentations.SmallestMaxSize(max_size=self.size)
            l = [self.rescaler]
            if not self.random_crop:
                self.cropper = albumentations.CenterCrop(height=self.size, width=self.size)
            else:
                self.cropper = albumentations.RandomCrop(height=self.size, width=self.size)
            l.append(self.cropper)
            if horizontalflip:
                l.append(albumentations.HorizontalFlip(p=0.2))
            if shiftrotate:
                l.append(albumentations.ShiftScaleRotate(shift_limit=0.2, scale_limit=0.2, rotate_limit=45, border_mode=0,
                                                         value=(int(0.485*255), int(0.456*255), int(0.406*255)), p=0.3))
            if random_contrast:
                l.append(albumentations.RandomBrightnessContrast(p=0.3))
            self.preprocessor = albumentations.Compose(l)
        else:
            self.preprocessor = lambda **kwargs: kwargs

    def __len__(self):
        return self._length

    def preprocess_image(self, image_path):
        image = Image.open(image_path)
        if not image.mode == "RGB":
            image = image.convert("RGB")
        image = np.array(image).astype(np.uint8)
        image = self.preprocessor(image=image)["image"]
        image = (image/127.5 - 1.0).astype(np.float32)
        return image

    def __getitem__(self, i):
        labels = self.labels if self.labels_without_skipped is None else self.labels_without_skipped
        example = dict()
        example["image"] = self.preprocess_image(labels["file_path_"][i])
        for k in labels:
            example[k] = labels[k][i]
        return example
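Usage note (not part of the diff): a minimal sketch of driving ImagePaths directly, assuming a couple of RGB images on disk; the paths and the 'class' labels below are placeholders.

from ldm.data.base import ImagePaths

# hypothetical image files; class ids are supplied explicitly here
paths = ["/tmp/images/fish_001.png", "/tmp/images/fish_002.png"]
dataset = ImagePaths(paths=paths, size=256, random_crop=False, labels={"class": [0, 1]})
example = dataset[0]
print(example["image"].shape, example["class"])  # (256, 256, 3) in [-1, 1], plus the label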
ldm/data/constants.py ADDED
@@ -0,0 +1,32 @@
DISENTANGLER_DECODER_OUTPUT = 'output'
DISENTANGLER_ENCODER_INPUT = 'in'
QUANTIZED_PHYLO_OUTPUT = 'zq_phylo'
DISENTANGLER_CLASS_OUTPUT = 'class'
QUANTIZED_PHYLO_NONATTRIBUTE_OUTPUT = 'zq_phylo_nonattribute'
DISENTANGLER_NON_ATTRIBUTE_TO_ATTRIBUTE_OUTPUT = 'nonattribate_to_attribute'
DISENTANGLER_NON_ATTRIBUTE_CLASS_OUTPUT = 'adversarial_classifier_output'
DISENTANGLER_ADV_MAPPING_OUTPUT = 'adversarial_mapping_output'
DISENTANGLER_ADV_LEARNING_OUTPUT = 'adversarial_learning_output'
NON_CLASS_TENSORS = [DISENTANGLER_ADV_LEARNING_OUTPUT, DISENTANGLER_ADV_MAPPING_OUTPUT, DISENTANGLER_ENCODER_INPUT, DISENTANGLER_DECODER_OUTPUT, QUANTIZED_PHYLO_OUTPUT, DISENTANGLER_NON_ATTRIBUTE_TO_ATTRIBUTE_OUTPUT, QUANTIZED_PHYLO_NONATTRIBUTE_OUTPUT, DISENTANGLER_NON_ATTRIBUTE_CLASS_OUTPUT]

CLASS_TENSORS = [DISENTANGLER_CLASS_OUTPUT]

DATASET_CLASSNAME = 'class_name'

PHYLOCONFIG_KEY = "phylomodel_params"
LRFACTOR_KEY = "lr_factor"
LRCYCLE = "lr_cycle"
DISENTANGLERTYPE_KEY = 'disentangler_type'
COMPLETE_CKPT_KEY = "posttraining_ckpt"

HISTOGRAMS_FOLDER = 'code_histograms'
HISTOGRAMS_FILE = "histograms.pkl"

DISENTANGLER_PHYLO_LOSS = "/disentangler_phylo_loss"
TRANSFORMER_LOSS = "/loss"
RECLOSS = "/rec_loss"
BASERECLOSS = "/base_true_rec_loss"

TEST_DIR = "results_summary"

TSNE_FOLDER = 'tsne'
ldm/data/custom.py ADDED
@@ -0,0 +1,62 @@
# based on https://github.com/CompVis/taming-transformers

import pickle
from torch.utils.data import Dataset
from ldm.data.base import ImagePaths
import ldm.data.constants as CONSTANTS


class CustomBase(Dataset):
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.data = None

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        example = self.data[i]
        return example


class CustomTrain(CustomBase):
    def __init__(self, size, training_images_list_file, horizontalflip=False, random_contrast=False,
                 shiftrotate=False, add_labels=False, unique_skipped_labels=[], class_to_node=None):
        super().__init__()
        with open(training_images_list_file, "r") as f:
            paths = sorted(f.read().splitlines())

        labels = None
        if add_labels:
            # The class name is taken from the parent directory of each image path.
            labels_per_file = list(map(lambda path: path.split('/')[-2], paths))
            labels_set = sorted(list(set(labels_per_file)))
            self.labels_to_idx = {label_name: i for i, label_name in enumerate(labels_set)}

            if class_to_node:
                with open(class_to_node, 'rb') as pickle_file:
                    class_to_node_dict = pickle.load(pickle_file)
                labels = {
                    CONSTANTS.DISENTANGLER_CLASS_OUTPUT: [self.labels_to_idx[label_name] for label_name in labels_per_file],
                    CONSTANTS.DATASET_CLASSNAME: labels_per_file,
                    'class_to_node': [class_to_node_dict[label_name] for label_name in labels_per_file]
                }
            else:
                labels = {
                    CONSTANTS.DISENTANGLER_CLASS_OUTPUT: [self.labels_to_idx[label_name] for label_name in labels_per_file],
                    CONSTANTS.DATASET_CLASSNAME: labels_per_file
                }

            self.indx_to_label = {v: k for k, v in self.labels_to_idx.items()}

        self.data = ImagePaths(paths=paths, size=size, random_crop=False, horizontalflip=horizontalflip,
                               random_contrast=random_contrast, shiftrotate=shiftrotate, labels=labels,
                               unique_skipped_labels=unique_skipped_labels)


class CustomTest(CustomTrain):
    def __init__(self, size, test_images_list_file, add_labels=False, unique_skipped_labels=[], class_to_node=None):
        super().__init__(size, test_images_list_file, add_labels=add_labels,
                         unique_skipped_labels=unique_skipped_labels, class_to_node=class_to_node)
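Usage note (not part of the diff): a minimal sketch of instantiating CustomTrain/CustomTest from plain-text file lists (one absolute image path per line, class name taken from the parent folder); the list-file names below are placeholders.

from ldm.data.custom import CustomTrain, CustomTest

train_data = CustomTrain(size=256, training_images_list_file="data/train_images.txt",
                         add_labels=True, horizontalflip=True)
test_data = CustomTest(size=256, test_images_list_file="data/test_images.txt", add_labels=True)
sample = train_data[0]
print(sample["image"].shape, sample["class"], sample["class_name"])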
ldm/data/custom_cub.py ADDED
@@ -0,0 +1,62 @@
# based on https://github.com/CompVis/taming-transformers

import pickle
from torch.utils.data import Dataset
from ldm.data.base import ImagePaths
import ldm.data.constants as CONSTANTS


class CustomBase(Dataset):
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.data = None

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        example = self.data[i]
        return example


class CustomTrain(CustomBase):
    def __init__(self, size, training_images_list_file, horizontalflip=False, random_contrast=False,
                 shiftrotate=False, add_labels=False, unique_skipped_labels=[], class_to_node=None):
        super().__init__()
        with open(training_images_list_file, "r") as f:
            paths = sorted(f.read().splitlines())

        labels = None
        if add_labels:
            labels_per_file = list(map(lambda path: path.split('/')[-2], paths))
            # labels_per_file = [i.split('.')[1].replace('_', ' ') for i in labels_per_file]
            labels_set = sorted(list(set(labels_per_file)))
            self.labels_to_idx = {label_name: i for i, label_name in enumerate(labels_set)}

            if class_to_node:
                with open(class_to_node, 'rb') as pickle_file:
                    class_to_node_dict = pickle.load(pickle_file)
                labels = {
                    CONSTANTS.DISENTANGLER_CLASS_OUTPUT: [self.labels_to_idx[label_name] for label_name in labels_per_file],
                    CONSTANTS.DATASET_CLASSNAME: [class_to_node_dict[label_name] for label_name in labels_per_file],
                    # 'class_to_node': [class_to_node_dict[label_name] for label_name in labels_per_file]
                }
            else:
                labels = {
                    CONSTANTS.DISENTANGLER_CLASS_OUTPUT: [self.labels_to_idx[label_name] for label_name in labels_per_file],
                    CONSTANTS.DATASET_CLASSNAME: labels_per_file
                }
            self.indx_to_label = {v: k for k, v in self.labels_to_idx.items()}

        self.data = ImagePaths(paths=paths, size=size, random_crop=False, horizontalflip=horizontalflip,
                               random_contrast=random_contrast, shiftrotate=shiftrotate, labels=labels,
                               unique_skipped_labels=unique_skipped_labels)


class CustomTest(CustomTrain):
    def __init__(self, size, test_images_list_file, add_labels=False, unique_skipped_labels=[], class_to_node=None):
        super().__init__(size, test_images_list_file, add_labels=add_labels,
                         unique_skipped_labels=unique_skipped_labels, class_to_node=class_to_node)
ldm/data/i2sb_dataloader.py ADDED
@@ -0,0 +1,134 @@
import os
import numpy as np
import torch
from torch.utils.data import Dataset
from torchvision.transforms import Compose, Resize, ToTensor
import imageio
from tqdm import tqdm


class pix2pixDataset(Dataset):
    def __init__(self, dataset="maps", data_dir="/projects/ml4science/datasets_pix2pix/", split="train",
                 normalize=True, transforms=None, preload=False, image_size=256, direction="BtoA"):
        self.datadir = os.path.join(data_dir, dataset)
        self.img_name_list_path = os.path.join(data_dir, dataset, split)
        if not os.path.exists(self.datadir):
            print(f'Dataset directory {self.datadir} does not exist')

        self.normalize = normalize
        self.image_name_list = os.listdir(self.img_name_list_path)
        self.preload = preload
        self.direction = direction
        if transforms is None:
            self.transforms = Compose([
                ToTensor(),                                          # convert to torch tensor
                Resize((image_size, image_size), antialias=False),   # resize to image_size x image_size
            ])
        else:
            self.transforms = transforms

        if self.preload:
            self.x_list, self.y_list = (), ()
            for name in tqdm(self.image_name_list):
                x, y = self.load_every(name)
                self.x_list = self.x_list + (x,)
                self.y_list = self.y_list + (y,)
            self.x_list = torch.stack(self.x_list, 0)
            self.y_list = torch.stack(self.y_list, 0)
            print(f"{split} dataset preloaded!")

    def load_every(self, name):
        # Each pix2pix image stores the two domains side by side; split it down the middle.
        img_array = np.asarray(imageio.imread(os.path.join(self.img_name_list_path, name)))
        img_H, img_W = img_array.shape[0], img_array.shape[1]
        if self.normalize:
            img_array = self.normalize_fn(img_array)
        x_img, y_img = img_array[:, :img_W//2, :], img_array[:, img_W//2:, :]
        x_img, y_img = self.transforms(x_img), self.transforms(y_img)  # apply the resize transform
        return x_img.float(), y_img.float()

    def normalize_fn(self, x):
        return (x/255. - 0.5)*2

    def unnormalize_fn(self, x):
        return ((x/2 + 0.5) * 255).int().clamp(0, 255)  # since these are images

    def __getitem__(self, index):
        # getitem should return x0, x1, y (where y is the class label for class-conditional generation)
        class_cond = None
        if self.preload:
            x_img, y_img = self.x_list[index], self.y_list[index]
        else:
            name = self.image_name_list[index]
            x_img, y_img = self.load_every(name)
        # if self.direction == "BtoA":
        #     return x_img, y_img, class_cond
        # elif self.direction == "AtoB":
        #     return y_img, x_img, class_cond
        batch = {
            "image1": x_img,
            "image2": y_img,
        }
        return batch

    def __len__(self):
        return len(self.image_name_list)


class FishDataset(Dataset):
    def __init__(self, data_dir="/projects/ml4science/FishDiffusion/", split="train", normalize=True,
                 transforms=None, preload=False, image_size=128):
        self.datadir = os.path.join(data_dir)
        self.img_name_list_path = os.path.join(data_dir, split)

        if not os.path.exists(self.datadir):
            print(f'Dataset directory {self.datadir} does not exist')

        self.normalize = normalize
        self.image_name_list = os.listdir(self.img_name_list_path)
        self.preload = preload

        if transforms is None:
            # self.transforms = Compose([
            #     ToTensor(),
            #     Resize((image_size, image_size), antialias=False),
            # ])
            self.transforms = Compose([
                ToTensor(),  # convert to torch tensor
            ])
        else:
            self.transforms = transforms

        if self.preload:
            self.x_list, self.y_list, self.class_id = (), (), []
            for name in tqdm(self.image_name_list):
                x, y = self.load_every(name)
                cls_id = int(name.split("_")[-1][:-4])
                self.x_list = self.x_list + (x,)
                self.y_list = self.y_list + (y,)
                self.class_id.append(cls_id)
            self.x_list = torch.stack(self.x_list, 0)
            self.y_list = torch.stack(self.y_list, 0)
            self.class_id = torch.tensor(self.class_id)
            print(f"{split} dataset preloaded!")

    def load_every(self, name):
        img_array = np.asarray(imageio.imread(os.path.join(self.img_name_list_path, name)))
        img_H, img_W = img_array.shape[0], img_array.shape[1]
        if self.normalize:
            img_array = self.normalize_fn(img_array)
        x_img, y_img = img_array[:, :img_W//2, :], img_array[:, img_W//2:, :]
        x_img, y_img = self.transforms(x_img), self.transforms(y_img)  # apply the transforms
        return x_img.float(), y_img.float()

    def normalize_fn(self, x):
        return (x/255. - 0.5)*2

    def unnormalize_fn(self, x):
        return ((x/2 + 0.5) * 255).int().clamp(0, 255)  # since these are images

    def __getitem__(self, index):
        # returns (x0, x1, class_id); the class id is parsed from the file name suffix
        if self.preload:
            x_img, y_img, class_id = self.x_list[index], self.y_list[index], self.class_id[index]
        else:
            name = self.image_name_list[index]
            class_id = torch.tensor(int(name.split("_")[-1][:-4]))
            x_img, y_img = self.load_every(name)
        return x_img, y_img, class_id

    def __len__(self):
        return len(self.image_name_list)
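Usage note (not part of the diff): a minimal sketch of batching pix2pixDataset with a standard DataLoader; the data_dir is a placeholder for a local pix2pix-style dataset where each image stores the two domains side by side.

from torch.utils.data import DataLoader
from ldm.data.i2sb_dataloader import pix2pixDataset

dataset = pix2pixDataset(dataset="maps", data_dir="/tmp/datasets_pix2pix/", split="train", image_size=256)
loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2)
batch = next(iter(loader))
print(batch["image1"].shape, batch["image2"].shape)  # [4, 3, 256, 256] tensors in [-1, 1]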
ldm/data/imagenet.py ADDED
@@ -0,0 +1,394 @@
import os, yaml, pickle, shutil, tarfile, glob
import cv2
import albumentations
import PIL
import numpy as np
import torchvision.transforms.functional as TF
from omegaconf import OmegaConf
from functools import partial
from PIL import Image
from tqdm import tqdm
from torch.utils.data import Dataset, Subset

import taming.data.utils as tdu
from taming.data.imagenet import str_to_indices, give_synsets_from_indices, download, retrieve
from taming.data.imagenet import ImagePaths

from ldm.modules.image_degradation import degradation_fn_bsr, degradation_fn_bsr_light


def synset2idx(path_to_yaml="data/index_synset.yaml"):
    with open(path_to_yaml) as f:
        di2s = yaml.load(f)
    return dict((v, k) for k, v in di2s.items())


class ImageNetBase(Dataset):
    def __init__(self, config=None):
        self.config = config or OmegaConf.create()
        if not type(self.config)==dict:
            self.config = OmegaConf.to_container(self.config)
        self.keep_orig_class_label = self.config.get("keep_orig_class_label", False)
        self.process_images = True  # if False we skip loading & processing images and self.data contains filepaths
        self._prepare()
        self._prepare_synset_to_human()
        self._prepare_idx_to_synset()
        self._prepare_human_to_integer_label()
        self._load()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]

    def _prepare(self):
        raise NotImplementedError()

    def _filter_relpaths(self, relpaths):
        ignore = set([
            "n06596364_9591.JPEG",
        ])
        relpaths = [rpath for rpath in relpaths if not rpath.split("/")[-1] in ignore]
        if "sub_indices" in self.config:
            indices = str_to_indices(self.config["sub_indices"])
            synsets = give_synsets_from_indices(indices, path_to_yaml=self.idx2syn)  # returns a list of strings
            self.synset2idx = synset2idx(path_to_yaml=self.idx2syn)
            files = []
            for rpath in relpaths:
                syn = rpath.split("/")[0]
                if syn in synsets:
                    files.append(rpath)
            return files
        else:
            return relpaths

    def _prepare_synset_to_human(self):
        SIZE = 2655750
        URL = "https://heibox.uni-heidelberg.de/f/9f28e956cd304264bb82/?dl=1"
        self.human_dict = os.path.join(self.root, "synset_human.txt")
        if (not os.path.exists(self.human_dict) or
                not os.path.getsize(self.human_dict)==SIZE):
            download(URL, self.human_dict)

    def _prepare_idx_to_synset(self):
        URL = "https://heibox.uni-heidelberg.de/f/d835d5b6ceda4d3aa910/?dl=1"
        self.idx2syn = os.path.join(self.root, "index_synset.yaml")
        if (not os.path.exists(self.idx2syn)):
            download(URL, self.idx2syn)

    def _prepare_human_to_integer_label(self):
        URL = "https://heibox.uni-heidelberg.de/f/2362b797d5be43b883f6/?dl=1"
        self.human2integer = os.path.join(self.root, "imagenet1000_clsidx_to_labels.txt")
        if (not os.path.exists(self.human2integer)):
            download(URL, self.human2integer)
        with open(self.human2integer, "r") as f:
            lines = f.read().splitlines()
            assert len(lines) == 1000
            self.human2integer_dict = dict()
            for line in lines:
                value, key = line.split(":")
                self.human2integer_dict[key] = int(value)

    def _load(self):
        with open(self.txt_filelist, "r") as f:
            self.relpaths = f.read().splitlines()
            l1 = len(self.relpaths)
            self.relpaths = self._filter_relpaths(self.relpaths)
            print("Removed {} files from filelist during filtering.".format(l1 - len(self.relpaths)))

        self.synsets = [p.split("/")[0] for p in self.relpaths]
        self.abspaths = [os.path.join(self.datadir, p) for p in self.relpaths]

        unique_synsets = np.unique(self.synsets)
        class_dict = dict((synset, i) for i, synset in enumerate(unique_synsets))
        if not self.keep_orig_class_label:
            self.class_labels = [class_dict[s] for s in self.synsets]
        else:
            self.class_labels = [self.synset2idx[s] for s in self.synsets]

        with open(self.human_dict, "r") as f:
            human_dict = f.read().splitlines()
            human_dict = dict(line.split(maxsplit=1) for line in human_dict)

        self.human_labels = [human_dict[s] for s in self.synsets]

        labels = {
            "relpath": np.array(self.relpaths),
            "synsets": np.array(self.synsets),
            "class_label": np.array(self.class_labels),
            "human_label": np.array(self.human_labels),
        }

        if self.process_images:
            self.size = retrieve(self.config, "size", default=256)
            self.data = ImagePaths(self.abspaths,
                                   labels=labels,
                                   size=self.size,
                                   random_crop=self.random_crop,
                                   )
        else:
            self.data = self.abspaths


class ImageNetTrain(ImageNetBase):
    NAME = "ILSVRC2012_train"
    URL = "http://www.image-net.org/challenges/LSVRC/2012/"
    AT_HASH = "a306397ccf9c2ead27155983c254227c0fd938e2"
    FILES = [
        "ILSVRC2012_img_train.tar",
    ]
    SIZES = [
        147897477120,
    ]

    def __init__(self, process_images=True, data_root=None, **kwargs):
        self.process_images = process_images
        self.data_root = data_root
        super().__init__(**kwargs)

    def _prepare(self):
        if self.data_root:
            self.root = os.path.join(self.data_root, self.NAME)
        else:
            cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
            self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)

        self.datadir = os.path.join(self.root, "data")
        self.txt_filelist = os.path.join(self.root, "filelist.txt")
        self.expected_length = 1281167
        self.random_crop = retrieve(self.config, "ImageNetTrain/random_crop",
                                    default=True)
        if not tdu.is_prepared(self.root):
            # prep
            print("Preparing dataset {} in {}".format(self.NAME, self.root))

            datadir = self.datadir
            if not os.path.exists(datadir):
                path = os.path.join(self.root, self.FILES[0])
                if not os.path.exists(path) or not os.path.getsize(path)==self.SIZES[0]:
                    import academictorrents as at
                    atpath = at.get(self.AT_HASH, datastore=self.root)
                    assert atpath == path

                print("Extracting {} to {}".format(path, datadir))
                os.makedirs(datadir, exist_ok=True)
                with tarfile.open(path, "r:") as tar:
                    tar.extractall(path=datadir)

                print("Extracting sub-tars.")
                subpaths = sorted(glob.glob(os.path.join(datadir, "*.tar")))
                for subpath in tqdm(subpaths):
                    subdir = subpath[:-len(".tar")]
                    os.makedirs(subdir, exist_ok=True)
                    with tarfile.open(subpath, "r:") as tar:
                        tar.extractall(path=subdir)

            filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
            filelist = [os.path.relpath(p, start=datadir) for p in filelist]
            filelist = sorted(filelist)
            filelist = "\n".join(filelist)+"\n"
            with open(self.txt_filelist, "w") as f:
                f.write(filelist)

            tdu.mark_prepared(self.root)


class ImageNetValidation(ImageNetBase):
    NAME = "ILSVRC2012_validation"
    URL = "http://www.image-net.org/challenges/LSVRC/2012/"
    AT_HASH = "5d6d0df7ed81efd49ca99ea4737e0ae5e3a5f2e5"
    VS_URL = "https://heibox.uni-heidelberg.de/f/3e0f6e9c624e45f2bd73/?dl=1"
    FILES = [
        "ILSVRC2012_img_val.tar",
        "validation_synset.txt",
    ]
    SIZES = [
        6744924160,
        1950000,
    ]

    def __init__(self, process_images=True, data_root=None, **kwargs):
        self.data_root = data_root
        self.process_images = process_images
        super().__init__(**kwargs)

    def _prepare(self):
        if self.data_root:
            self.root = os.path.join(self.data_root, self.NAME)
        else:
            cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
            self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
        self.datadir = os.path.join(self.root, "data")
        self.txt_filelist = os.path.join(self.root, "filelist.txt")
        self.expected_length = 50000
        self.random_crop = retrieve(self.config, "ImageNetValidation/random_crop",
                                    default=False)
        if not tdu.is_prepared(self.root):
            # prep
            print("Preparing dataset {} in {}".format(self.NAME, self.root))

            datadir = self.datadir
            if not os.path.exists(datadir):
                path = os.path.join(self.root, self.FILES[0])
                if not os.path.exists(path) or not os.path.getsize(path)==self.SIZES[0]:
                    import academictorrents as at
                    atpath = at.get(self.AT_HASH, datastore=self.root)
                    assert atpath == path

                print("Extracting {} to {}".format(path, datadir))
                os.makedirs(datadir, exist_ok=True)
                with tarfile.open(path, "r:") as tar:
                    tar.extractall(path=datadir)

                vspath = os.path.join(self.root, self.FILES[1])
                if not os.path.exists(vspath) or not os.path.getsize(vspath)==self.SIZES[1]:
                    download(self.VS_URL, vspath)

                with open(vspath, "r") as f:
                    synset_dict = f.read().splitlines()
                    synset_dict = dict(line.split() for line in synset_dict)

                print("Reorganizing into synset folders")
                synsets = np.unique(list(synset_dict.values()))
                for s in synsets:
                    os.makedirs(os.path.join(datadir, s), exist_ok=True)
                for k, v in synset_dict.items():
                    src = os.path.join(datadir, k)
                    dst = os.path.join(datadir, v)
                    shutil.move(src, dst)

            filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
            filelist = [os.path.relpath(p, start=datadir) for p in filelist]
            filelist = sorted(filelist)
            filelist = "\n".join(filelist)+"\n"
            with open(self.txt_filelist, "w") as f:
                f.write(filelist)

            tdu.mark_prepared(self.root)


class ImageNetSR(Dataset):
    def __init__(self, size=None,
                 degradation=None, downscale_f=4, min_crop_f=0.5, max_crop_f=1.,
                 random_crop=True):
        """
        Imagenet Superresolution Dataloader
        Performs following ops in order:
        1. crops a crop of size s from image either as random or center crop
        2. resizes crop to size with cv2.area_interpolation
        3. degrades resized crop with degradation_fn

        :param size: resizing to size after cropping
        :param degradation: degradation_fn, e.g. cv_bicubic or bsrgan_light
        :param downscale_f: Low Resolution Downsample factor
        :param min_crop_f: determines crop size s,
            where s = c * min_img_side_len with c sampled from interval (min_crop_f, max_crop_f)
        :param max_crop_f: ""
        :param data_root:
        :param random_crop:
        """
        self.base = self.get_base()
        assert size
        assert (size / downscale_f).is_integer()
        self.size = size
        self.LR_size = int(size / downscale_f)
        self.min_crop_f = min_crop_f
        self.max_crop_f = max_crop_f
        assert(max_crop_f <= 1.)
        self.center_crop = not random_crop

        self.image_rescaler = albumentations.SmallestMaxSize(max_size=size, interpolation=cv2.INTER_AREA)

        self.pil_interpolation = False  # gets reset later in case interp_op is from pillow

        if degradation == "bsrgan":
            self.degradation_process = partial(degradation_fn_bsr, sf=downscale_f)

        elif degradation == "bsrgan_light":
            self.degradation_process = partial(degradation_fn_bsr_light, sf=downscale_f)

        else:
            interpolation_fn = {
                "cv_nearest": cv2.INTER_NEAREST,
                "cv_bilinear": cv2.INTER_LINEAR,
                "cv_bicubic": cv2.INTER_CUBIC,
                "cv_area": cv2.INTER_AREA,
                "cv_lanczos": cv2.INTER_LANCZOS4,
                "pil_nearest": PIL.Image.NEAREST,
                "pil_bilinear": PIL.Image.BILINEAR,
                "pil_bicubic": PIL.Image.BICUBIC,
                "pil_box": PIL.Image.BOX,
                "pil_hamming": PIL.Image.HAMMING,
                "pil_lanczos": PIL.Image.LANCZOS,
            }[degradation]

            self.pil_interpolation = degradation.startswith("pil_")

            if self.pil_interpolation:
                self.degradation_process = partial(TF.resize, size=self.LR_size, interpolation=interpolation_fn)

            else:
                self.degradation_process = albumentations.SmallestMaxSize(max_size=self.LR_size,
                                                                          interpolation=interpolation_fn)

    def __len__(self):
        return len(self.base)

    def __getitem__(self, i):
        example = self.base[i]
        image = Image.open(example["file_path_"])

        if not image.mode == "RGB":
            image = image.convert("RGB")

        image = np.array(image).astype(np.uint8)

        min_side_len = min(image.shape[:2])
        crop_side_len = min_side_len * np.random.uniform(self.min_crop_f, self.max_crop_f, size=None)
        crop_side_len = int(crop_side_len)

        if self.center_crop:
            self.cropper = albumentations.CenterCrop(height=crop_side_len, width=crop_side_len)

        else:
            self.cropper = albumentations.RandomCrop(height=crop_side_len, width=crop_side_len)

        image = self.cropper(image=image)["image"]
        image = self.image_rescaler(image=image)["image"]

        if self.pil_interpolation:
            image_pil = PIL.Image.fromarray(image)
            LR_image = self.degradation_process(image_pil)
            LR_image = np.array(LR_image).astype(np.uint8)

        else:
            LR_image = self.degradation_process(image=image)["image"]

        example["image"] = (image/127.5 - 1.0).astype(np.float32)
        example["LR_image"] = (LR_image/127.5 - 1.0).astype(np.float32)

        return example


class ImageNetSRTrain(ImageNetSR):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def get_base(self):
        with open("data/imagenet_train_hr_indices.p", "rb") as f:
            indices = pickle.load(f)
        dset = ImageNetTrain(process_images=False,)
        return Subset(dset, indices)


class ImageNetSRValidation(ImageNetSR):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def get_base(self):
        with open("data/imagenet_val_hr_indices.p", "rb") as f:
            indices = pickle.load(f)
        dset = ImageNetValidation(process_images=False,)
        return Subset(dset, indices)
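Usage note (not part of the diff): a minimal sketch for ImageNetValidation, assuming a local copy of ILSVRC2012; on first use the class downloads index/synset files and builds a filelist under data_root (the path below is a placeholder).

from ldm.data.imagenet import ImageNetValidation

val_data = ImageNetValidation(data_root="/tmp/ILSVRC2012", config={"size": 256})
example = val_data[0]
print(example["image"].shape, example["class_label"], example["human_label"])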
ldm/data/lsun.py ADDED
@@ -0,0 +1,92 @@
import os
import numpy as np
import PIL
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms


class LSUNBase(Dataset):
    def __init__(self,
                 txt_file,
                 data_root,
                 size=None,
                 interpolation="bicubic",
                 flip_p=0.5
                 ):
        self.data_paths = txt_file
        self.data_root = data_root
        with open(self.data_paths, "r") as f:
            self.image_paths = f.read().splitlines()
        self._length = len(self.image_paths)
        self.labels = {
            "relative_file_path_": [l for l in self.image_paths],
            "file_path_": [os.path.join(self.data_root, l)
                           for l in self.image_paths],
        }

        self.size = size
        self.interpolation = {"linear": PIL.Image.LINEAR,
                              "bilinear": PIL.Image.BILINEAR,
                              "bicubic": PIL.Image.BICUBIC,
                              "lanczos": PIL.Image.LANCZOS,
                              }[interpolation]
        self.flip = transforms.RandomHorizontalFlip(p=flip_p)

    def __len__(self):
        return self._length

    def __getitem__(self, i):
        example = dict((k, self.labels[k][i]) for k in self.labels)
        image = Image.open(example["file_path_"])
        if not image.mode == "RGB":
            image = image.convert("RGB")

        # default to score-sde preprocessing
        img = np.array(image).astype(np.uint8)
        crop = min(img.shape[0], img.shape[1])
        h, w, = img.shape[0], img.shape[1]
        img = img[(h - crop) // 2:(h + crop) // 2,
                  (w - crop) // 2:(w + crop) // 2]

        image = Image.fromarray(img)
        if self.size is not None:
            image = image.resize((self.size, self.size), resample=self.interpolation)

        image = self.flip(image)
        image = np.array(image).astype(np.uint8)
        example["image"] = (image / 127.5 - 1.0).astype(np.float32)
        return example


class LSUNChurchesTrain(LSUNBase):
    def __init__(self, **kwargs):
        super().__init__(txt_file="data/lsun/church_outdoor_train.txt", data_root="data/lsun/churches", **kwargs)


class LSUNChurchesValidation(LSUNBase):
    def __init__(self, flip_p=0., **kwargs):
        super().__init__(txt_file="data/lsun/church_outdoor_val.txt", data_root="data/lsun/churches",
                         flip_p=flip_p, **kwargs)


class LSUNBedroomsTrain(LSUNBase):
    def __init__(self, **kwargs):
        super().__init__(txt_file="data/lsun/bedrooms_train.txt", data_root="data/lsun/bedrooms", **kwargs)


class LSUNBedroomsValidation(LSUNBase):
    def __init__(self, flip_p=0.0, **kwargs):
        super().__init__(txt_file="data/lsun/bedrooms_val.txt", data_root="data/lsun/bedrooms",
                         flip_p=flip_p, **kwargs)


class LSUNCatsTrain(LSUNBase):
    def __init__(self, **kwargs):
        super().__init__(txt_file="data/lsun/cat_train.txt", data_root="data/lsun/cats", **kwargs)


class LSUNCatsValidation(LSUNBase):
    def __init__(self, flip_p=0., **kwargs):
        super().__init__(txt_file="data/lsun/cat_val.txt", data_root="data/lsun/cats",
                         flip_p=flip_p, **kwargs)
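Usage note (not part of the diff): a minimal sketch for the LSUN loaders, assuming the data/lsun filelists and image folders from the standard latent-diffusion data setup are in place.

from ldm.data.lsun import LSUNChurchesValidation

val_data = LSUNChurchesValidation(size=256)
example = val_data[0]
print(example["image"].shape)  # (256, 256, 3), values in [-1, 1]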
ldm/data/phylogeny.py ADDED
@@ -0,0 +1,333 @@
 
import os
import pandas as pd
import math
import pickle
import pprint
pp = pprint.PrettyPrinter(indent=4)

# For phylogeny parsing
# !pip install opentree
from opentree import OT
# !pip install ete3
from ete3 import Tree, PhyloTree

# Constants
Fix_Tree = True
format_ = 1  # 8


class Phylogeny:
    # Phylogeny class for Fish dataset
    # If node_ids is None, it assumes that the tree already exists. Otherwise, you have to pass node_ids (i.e., a list of species names).
    def __init__(self, filePath, node_ids=None, verbose=False):
        # filenames for phylo tree and cached mapping ottid-speciesname
        cleaned_fine_tree_fileName = "cleaned_metadata.tre"
        name_conversion_file = "name_conversion.pkl"
        self.ott_ids = []
        self.ott_id_dict = {}
        self.node_ids = node_ids
        self.treeFileNameAndPath = os.path.join(filePath, cleaned_fine_tree_fileName)
        self.conversionFileNameAndPath = os.path.join(filePath, name_conversion_file)
        self.total_distance = -1  # -1 means we never calculated it before.

        self.distance_matrix = {}
        self.species_groups_within_relative_distance = {}

        self.get_ott_ids(node_ids, verbose=verbose)
        self.get_tree(self.treeFileNameAndPath)
        self.get_total_distance()

    # Given two species names, get the phylo distance between them
    def get_distance(self, species1, species2):
        d = None
        if self.distance_matrix[species1][species2] == -1:
            if species1 == species2:
                return 0

            ott_id1 = 'ott' + str(self.ott_id_dict[species1])
            ott_id2 = 'ott' + str(self.ott_id_dict[species2])
            d = self.tree.get_distance(ott_id1, ott_id2)

            self.distance_matrix[species1][species2] = d
        else:
            d = self.distance_matrix[species1][species2]

        return d

    # relative_distance = 0 => species node itself
    # relative_distance = 1 => all species
    def get_siblings_by_name(self, species, relative_distance, verbose=False):
        self.get_species_groups(relative_distance, verbose)
        for species_group in self.species_groups_within_relative_distance[relative_distance]:
            if species in species_group:
                return species_group

        raise ValueError(species + " was not found in " + str(self.species_groups_within_relative_distance[relative_distance]))

    def get_parent_by_name(self, species, relative_distance, verbose=False):
        ott_id = 'ott' + str(self.ott_id_dict[species])
        parent = self.get_parent_by_ottid(ott_id, relative_distance, verbose)
        return parent

    def get_distance_between_parents(self, species1, species2, relative_distance):
        parent1 = self.get_parent_by_name(species1, relative_distance)
        parent2 = self.get_parent_by_name(species2, relative_distance)
        return self.tree.get_distance(parent1, parent2)

    def get_species_groups(self, relative_distance, verbose=False):
        if relative_distance not in self.species_groups_within_relative_distance.keys():
            groups = {}

            for species in self.getLabelList():
                parent_node = self.get_parent_by_name(species, relative_distance, verbose)
                parent = parent_node.name
                if parent not in groups.keys():
                    groups[parent] = [species]
                else:
                    groups[parent].append(species)

            self.species_groups_within_relative_distance[relative_distance] = groups.values()

            if verbose:
                print("At relative_distance", relative_distance, ", the groups are:", groups.values())

        return self.species_groups_within_relative_distance[relative_distance]

    def getLabelList(self):
        return list(self.node_ids)

    # ------- private functions

    def get_total_distance(self):
        if self.node_ids is None:
            self.node_ids = self.ott_id_dict.keys()

        self.init_distance_matrix()

        # For one time, measure distance from all leaves down to root. They all should be equal.
        # Save the value and reuse it.
        if self.total_distance == -1:
            for leaf in self.tree.iter_leaves():
                total_distance = self.tree.get_distance(leaf)  # gets distance to root
                assert math.isclose(self.total_distance, total_distance) or self.total_distance == -1
                self.total_distance = total_distance

        return self.total_distance

    def init_distance_matrix(self):
        for i in self.node_ids:
            self.distance_matrix[i] = {}
            for j in self.node_ids:
                self.distance_matrix[i][j] = -1

    def get_parent_by_ottid(self, ott_id, relative_distance, verbose=False):
        abs_distance = relative_distance*self.total_distance
        species_node = self.tree.search_nodes(name=ott_id)[0]
        if verbose:
            print('distance to ancestor: ', abs_distance, ". relative distance: ", relative_distance)

        # keep going up till distance exceeds abs_distance
        distance = 0
        parent = species_node
        while distance < abs_distance:
            if parent.up is None:
                break
            parent = parent.up
            distance = self.tree.get_distance(parent, species_node)

        return parent

    # node_ids: list of taxa
    # returns: corresponding list of ott_ids
    def get_ott_ids(self, node_ids, verbose=False):
        if not os.path.exists(self.conversionFileNameAndPath):
            if node_ids is None:
                raise TypeError('No existing ottid-speciesnames found. node_ids should be a list of species names.')
            if verbose:
                print('Included taxonomy: ', node_ids, len(node_ids))
            df2 = pd.DataFrame(columns=['in csv', 'in response', 'Same?'])

            # Get the matches
            resp = OT.tnrs_match(node_ids, do_approximate_matching=True)
            matches = resp.response_dict['results']
            unmatched_names = resp.response_dict['unmatched_names']

            # Get the corresponding ott_ids
            ott_ids = set()
            ott_id_dict = {}
            assert len(unmatched_names) == 0  # everything is matched!
            for match_array in matches:
                match_array_matches = match_array['matches']
                assert len(match_array_matches) == 1, match_array['name'] + " has too many matches" + str(list(map(lambda x: x['matched_name'], match_array_matches)))  # we have a single unambiguous match!
                first_match = match_array_matches[0]
                ott_id = first_match['taxon']['ott_id']
                ott_ids.add(ott_id)
                if verbose:
                    # some original and matched names are not exactly the same. Not a bug
                    df2 = df2.append({'in csv': match_array['name'], 'in response': first_match['matched_name'], 'Same?': match_array['name'] == first_match['matched_name']}, ignore_index=True)
                ott_id_dict[match_array['name']] = ott_id
            ott_ids = list(ott_ids)

            if verbose:
                print(df2[df2['Same?'] == False])
                pp.pprint(ott_id_dict)

            with open(self.conversionFileNameAndPath, 'wb') as f:
                pickle.dump([ott_ids, ott_id_dict], f)
        else:
            with open(self.conversionFileNameAndPath, 'rb') as f:
                ott_ids, ott_id_dict = pickle.load(f)

        self.ott_ids = ott_ids
        self.ott_id_dict = ott_id_dict
        print(self.ott_id_dict)

    def fix_tree(self, treeFileNameAndPath):
        tree = PhyloTree(treeFileNameAndPath, format=format_)

        # Special case for Fish dataset: Fix Esox Americanus.
        D = tree.search_nodes(name="mrcaott47023ott496121")[0]
        D.name = "ott496115"
        tree.write(format=format_, outfile=treeFileNameAndPath)

    def get_tree(self, treeFileNameAndPath):
        if not os.path.exists(treeFileNameAndPath):
            output = OT.synth_induced_tree(ott_ids=self.ott_ids, ignore_unknown_ids=False, label_format='id')
            output.tree.write(path=treeFileNameAndPath, schema="newick")

            if Fix_Tree:
                self.fix_tree(treeFileNameAndPath)

        self.tree = PhyloTree(treeFileNameAndPath, format=format_)


class PhylogenyCUB:
    # Phylogeny class for CUB dataset
    def __init__(self, filePath, node_ids=None, verbose=False):
        # cleaned_fine_tree_fileName = "1_tree-consensus-Hacket-AllSpecies.phy"
        # cleaned_fine_tree_fileName = "1_tree-consensus-Hacket-AllSpecies-cub-names.phy"
        cleaned_fine_tree_fileName = "1_tree-consensus-Hacket-27Species-cub-names.phy"
        self.node_ids = node_ids
        self.treeFileNameAndPath = os.path.join(filePath, cleaned_fine_tree_fileName)
        self.total_distance = -1  # -1 means we never calculated it before.

        self.distance_matrix = {}
        self.species_groups_within_relative_distance = {}

        self.get_tree(self.treeFileNameAndPath)
        self.get_total_distance()

    # Given two species names, get the phylo distance between them
    def get_distance(self, species1, species2):
        d = None
        if self.distance_matrix[species1][species2] == -1:
            if species1 == species2:
                return 0
            d = self.tree.get_distance(species1, species2)

            self.distance_matrix[species1][species2] = d
        else:
            d = self.distance_matrix[species1][species2]

        return d

    # relative_distance = 0 => species node itself
    # relative_distance = 1 => all species
    def get_siblings_by_name(self, species, relative_distance, verbose=False):
        # NOTE: This implementation was causing inconsistencies since finding the parent.get_leaves() was not equivalent to get_species_groups
        # ott_id = 'ott' + str(self.ott_id_dict[species])
        # return self.get_siblings_by_ottid(ott_id, relative_distance, get_ottids, verbose)

        self.get_species_groups(relative_distance, verbose)
        for species_group in self.species_groups_within_relative_distance[relative_distance]:
            if species in species_group:
                return species_group

        raise ValueError(species + " was not found in " + str(self.species_groups_within_relative_distance[relative_distance]))

    def get_parent_by_name(self, species, relative_distance, verbose=False):
        abs_distance = relative_distance*self.total_distance
        species_node = self.tree.search_nodes(name=species)[0]
        if verbose:
            print('distance to ancestor: ', abs_distance, ". relative distance: ", relative_distance)

        # keep going up till distance exceeds abs_distance
        distance = 0
        parent = species_node
        while distance < abs_distance:
            if parent.up is None:
                break
            parent = parent.up
            distance = self.tree.get_distance(parent, species_node)

        return parent

    def get_distance_between_parents(self, species1, species2, relative_distance):
        parent1 = self.get_parent_by_name(species1, relative_distance)
        parent2 = self.get_parent_by_name(species2, relative_distance)
        return self.tree.get_distance(parent1, parent2)

    def get_species_groups(self, relative_distance, verbose=False):
        if relative_distance not in self.species_groups_within_relative_distance.keys():
            groups = {}

            for species in self.getLabelList():
                parent_node = self.get_parent_by_name(species, relative_distance, verbose)
                parent = parent_node.name
                if parent not in groups.keys():
                    groups[parent] = [species]
                else:
                    groups[parent].append(species)

            self.species_groups_within_relative_distance[relative_distance] = groups.values()

            if verbose:
                print("At relative_distance", relative_distance, ", the groups are:", groups.values())

        return self.species_groups_within_relative_distance[relative_distance]

    def getLabelList(self):
        return list(self.node_ids)

    # ------- private functions

    def get_total_distance(self):
        if self.node_ids is None:
            self.node_ids = sorted([leaf.name for leaf in self.tree.iter_leaves()])

        self.init_distance_matrix()

        # maximum distance between root and leaf node taken as total distance
        leaf_to_root_distances = [self.tree.get_distance(leaf) for leaf in self.tree.iter_leaves()]
        self.total_distance = max(leaf_to_root_distances)

        return self.total_distance

    def init_distance_matrix(self):
        for i in self.node_ids:
            self.distance_matrix[i] = {}
            for j in self.node_ids:
                self.distance_matrix[i][j] = -1

    def get_tree(self, treeFileNameAndPath):
        # if not os.path.exists(treeFileNameAndPath):
        #     output = OT.synth_induced_tree(ott_ids=self.ott_ids, ignore_unknown_ids=False, label_format='id')
        #     output.tree.write(path=treeFileNameAndPath, schema="newick")

        self.tree = PhyloTree(treeFileNameAndPath, format=format_)

        # setting a dummy name to the internal nodes if it is unnamed
        for i, node in enumerate(self.tree.traverse("postorder")):
            if not len(node.name) > 0:
                node.name = str(i)
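Usage note (not part of the diff): a minimal sketch of querying the CUB phylogeny helper, assuming a folder (placeholder path below) that contains the referenced .phy consensus tree.

from ldm.data.phylogeny import PhylogenyCUB

phylo = PhylogenyCUB("/tmp/cub_phylogeny")
species = phylo.getLabelList()
print(phylo.get_distance(species[0], species[1]))       # patristic distance between two species
print(phylo.get_species_groups(relative_distance=0.5))  # species grouped by ancestors at half the tree depth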
ldm/data/utils.py ADDED
@@ -0,0 +1,56 @@
# based on https://github.com/CompVis/taming-transformers

import collections

import torch
from ldm.data.helper_types import Annotation
from torch._six import string_classes
from torch.utils.data._utils.collate import np_str_obj_array_pattern, default_collate_err_msg_format


def custom_collate(batch):
    r"""source: pytorch 1.9.0, only one modification to original code """

    elem = batch[0]
    elem_type = type(elem)
    if isinstance(elem, torch.Tensor):
        out = None
        if torch.utils.data.get_worker_info() is not None:
            # If we're in a background process, concatenate directly into a
            # shared memory tensor to avoid an extra copy
            numel = sum([x.numel() for x in batch])
            storage = elem.storage()._new_shared(numel)
            out = elem.new(storage)
        return torch.stack(batch, 0, out=out)
    elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
            and elem_type.__name__ != 'string_':
        if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap':
            # array of string classes and object
            if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
                raise TypeError(default_collate_err_msg_format.format(elem.dtype))

            return custom_collate([torch.as_tensor(b) for b in batch])
        elif elem.shape == ():  # scalars
            return torch.as_tensor(batch)
    elif isinstance(elem, float):
        return torch.tensor(batch, dtype=torch.float64)
    elif isinstance(elem, int):
        return torch.tensor(batch)
    elif isinstance(elem, string_classes):
        return batch
    elif isinstance(elem, collections.abc.Mapping):
        return {key: custom_collate([d[key] for d in batch]) for key in elem}
    elif isinstance(elem, tuple) and hasattr(elem, '_fields'):  # namedtuple
        return elem_type(*(custom_collate(samples) for samples in zip(*batch)))
    if isinstance(elem, collections.abc.Sequence) and isinstance(elem[0], Annotation):  # added
        return batch  # added
    elif isinstance(elem, collections.abc.Sequence):
        # check to make sure that the elements in batch have consistent size
        it = iter(batch)
        elem_size = len(next(it))
        if not all(len(elem) == elem_size for elem in it):
            raise RuntimeError('each element in list of batch should be of equal size')
        transposed = zip(*batch)
        return [custom_collate(samples) for samples in transposed]

    raise TypeError(default_collate_err_msg_format.format(elem_type))
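Usage note (not part of the diff): custom_collate is meant as a drop-in collate_fn; it only differs from the PyTorch 1.9.0 default in passing lists of Annotation objects through un-collated. A toy sketch, assuming an older PyTorch where torch._six still exists and ldm.data.helper_types is importable:

import torch
from torch.utils.data import DataLoader, TensorDataset
from ldm.data.utils import custom_collate

dataset = TensorDataset(torch.randn(16, 3, 32, 32))  # toy data just to exercise the collate hook
loader = DataLoader(dataset, batch_size=8, collate_fn=custom_collate)
batch = next(iter(loader))
print(batch[0].shape)  # torch.Size([8, 3, 32, 32])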