aki-0421 committed
Commit a3a3ae4 · unverified · 0 Parent(s)
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +35 -0
  2. .gitignore +1 -0
  3. README.md +14 -0
  4. annotator/base_annotator.py +57 -0
  5. annotator/canny.py +63 -0
  6. annotator/color.py +59 -0
  7. annotator/hed.py +155 -0
  8. annotator/identity.py +25 -0
  9. annotator/invert.py +25 -0
  10. annotator/midas/__init__.py +0 -0
  11. annotator/midas/api.py +165 -0
  12. annotator/midas/base_model.py +17 -0
  13. annotator/midas/blocks.py +390 -0
  14. annotator/midas/dpt_depth.py +106 -0
  15. annotator/midas/midas_net.py +79 -0
  16. annotator/midas/midas_net_custom.py +166 -0
  17. annotator/midas/transforms.py +230 -0
  18. annotator/midas/utils.py +192 -0
  19. annotator/midas/vit.py +509 -0
  20. annotator/midas_op.py +79 -0
  21. annotator/mlsd/__init__.py +0 -0
  22. annotator/mlsd/mbv2_mlsd_large.py +303 -0
  23. annotator/mlsd/mbv2_mlsd_tiny.py +287 -0
  24. annotator/mlsd/utils.py +638 -0
  25. annotator/mlsd_op.py +74 -0
  26. annotator/openpose.py +812 -0
  27. annotator/registry.py +30 -0
  28. annotator/utils.py +114 -0
  29. app.py +32 -0
  30. dataset/.gitignore +1 -0
  31. dataset/opencv_transforms/__init__.py +0 -0
  32. dataset/opencv_transforms/functional.py +598 -0
  33. dataset/opencv_transforms/transforms.py +1044 -0
  34. dataset/setup.py +23 -0
  35. dataset/tests/compare_to_pil_for_testing.ipynb +241 -0
  36. dataset/tests/setup_testing_directory.py +50 -0
  37. dataset/tests/test_color.py +68 -0
  38. dataset/tests/test_spatial.py +52 -0
  39. dataset/tests/utils.py +8 -0
  40. inference.yaml +166 -0
  41. packages.txt +2 -0
  42. pipeline.py +168 -0
  43. requirements.txt +49 -0
  44. sgm/__init__.py +4 -0
  45. sgm/data/__init__.py +1 -0
  46. sgm/data/dataset.py +80 -0
  47. sgm/data/video_dataset.py +191 -0
  48. sgm/data/video_dataset_stage2_degradeImages.py +303 -0
  49. sgm/inference/api.py +385 -0
  50. sgm/inference/helpers.py +305 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
1
+ .idea
README.md ADDED
@@ -0,0 +1,14 @@
1
+ ---
2
+ title: Character 360
3
+ emoji: 🏆
4
+ colorFrom: green
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 5.9.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: unknown
11
+ short_description: Would you like to see your character in 360°?
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
annotator/base_annotator.py ADDED
@@ -0,0 +1,57 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ from abc import ABCMeta
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+ from scepter.modules.annotator.registry import ANNOTATORS
9
+ from scepter.modules.model.base_model import BaseModel
10
+ from scepter.modules.utils.config import dict_to_yaml
11
+
12
+
13
+ @ANNOTATORS.register_class()
14
+ class BaseAnnotator(BaseModel, metaclass=ABCMeta):
15
+ para_dict = {}
16
+
17
+ def __init__(self, cfg, logger=None):
18
+ super().__init__(cfg, logger=logger)
19
+
20
+ @torch.no_grad()
21
+ @torch.inference_mode()
22
+ def forward(self, *args, **kwargs):
23
+ raise NotImplementedError
24
+
25
+ @staticmethod
26
+ def get_config_template():
27
+ return dict_to_yaml('ANNOTATORS',
28
+ __class__.__name__,
29
+ BaseAnnotator.para_dict,
30
+ set_name=True)
31
+
32
+
33
+ @ANNOTATORS.register_class()
34
+ class GeneralAnnotator(BaseAnnotator, metaclass=ABCMeta):
35
+ def __init__(self, cfg, logger=None):
36
+ super().__init__(cfg, logger=logger)
37
+ anno_models = cfg.get('ANNOTATORS', [])
38
+ self.annotators = nn.ModuleList()
39
+ for n, anno_config in enumerate(anno_models):
40
+ annotator = ANNOTATORS.build(anno_config, logger=logger)
41
+ annotator.input_keys = anno_config.get('INPUT_KEYS', [])
42
+ if isinstance(annotator.input_keys, str):
43
+ annotator.input_keys = [annotator.input_keys]
44
+ annotator.output_keys = anno_config.get('OUTPUT_KEYS', [])
45
+ if isinstance(annotator.output_keys, str):
46
+ annotator.output_keys = [annotator.output_keys]
47
+ assert len(annotator.input_keys) == len(annotator.output_keys)
48
+ self.annotators.append(annotator)
49
+
50
+ def forward(self, input_dict):
51
+ output_dict = {}
52
+ for annotator in self.annotators:
53
+ for idx, in_key in enumerate(annotator.input_keys):
54
+ if in_key in input_dict:
55
+ image = annotator(input_dict[in_key])
56
+ output_dict[annotator.output_keys[idx]] = image
57
+ return output_dict
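
The GeneralAnnotator above simply routes each configured input key to its annotator and writes the result under the paired output key. A minimal sketch of that routing logic using plain callables — the fake_canny/fake_depth names and the routes list are hypothetical stand-ins, not part of scepter's API:

import numpy as np

# Hypothetical stand-ins for registered annotators; any image -> image callable works.
def fake_canny(img):
    return (img > 127).astype(np.uint8) * 255

def fake_depth(img):
    return img[..., ::-1]

# (input_key, output_key, annotator) triples mirror INPUT_KEYS / OUTPUT_KEYS.
routes = [('image', 'canny_map', fake_canny), ('image', 'depth_map', fake_depth)]

def route(input_dict):
    output_dict = {}
    for in_key, out_key, fn in routes:
        if in_key in input_dict:
            output_dict[out_key] = fn(input_dict[in_key])
    return output_dict

print(route({'image': np.zeros((4, 4, 3), dtype=np.uint8)}).keys())
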
annotator/canny.py ADDED
@@ -0,0 +1,63 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ from abc import ABCMeta
4
+
5
+ import cv2
6
+ import numpy as np
7
+ import torch
8
+ from PIL import Image
9
+
10
+ from scepter.modules.annotator.base_annotator import BaseAnnotator
11
+ from scepter.modules.annotator.registry import ANNOTATORS
12
+ from scepter.modules.utils.config import dict_to_yaml
13
+
14
+
15
+ @ANNOTATORS.register_class()
16
+ class CannyAnnotator(BaseAnnotator, metaclass=ABCMeta):
17
+ para_dict = {}
18
+
19
+ def __init__(self, cfg, logger=None):
20
+ super().__init__(cfg, logger=logger)
21
+ self.low_threshold = cfg.get('LOW_THRESHOLD', 100)
22
+ self.high_threshold = cfg.get('HIGH_THRESHOLD', 200)
23
+ self.random_cfg = cfg.get('RANDOM_CFG', None)
24
+
25
+ def forward(self, image):
26
+ if isinstance(image, Image.Image):
27
+ image = np.array(image)
28
+ elif isinstance(image, torch.Tensor):
29
+ image = image.detach().cpu().numpy()
30
+ elif isinstance(image, np.ndarray):
31
+ image = image.copy()
32
+ else:
33
+ raise TypeError(f'Unsupported datatype {type(image)}, only support np.ndarray, torch.Tensor, Pillow Image.')
34
+ assert len(image.shape) < 4
35
+
36
+ if self.random_cfg is None:
37
+ image = cv2.Canny(image, self.low_threshold, self.high_threshold)
38
+ else:
39
+ proba = self.random_cfg.get('PROBA', 1.0)
40
+ if np.random.random() < proba:
41
+ min_low_threshold = self.random_cfg.get(
42
+ 'MIN_LOW_THRESHOLD', 50)
43
+ max_low_threshold = self.random_cfg.get(
44
+ 'MAX_LOW_THRESHOLD', 100)
45
+ min_high_threshold = self.random_cfg.get(
46
+ 'MIN_HIGH_THRESHOLD', 200)
47
+ max_high_threshold = self.random_cfg.get(
48
+ 'MAX_HIGH_THRESHOLD', 350)
49
+ low_th = np.random.randint(min_low_threshold,
50
+ max_low_threshold)
51
+ high_th = np.random.randint(min_high_threshold,
52
+ max_high_threshold)
53
+ else:
54
+ low_th, high_th = self.low_threshold, self.high_threshold
55
+ image = cv2.Canny(image, low_th, high_th)
56
+ return image[..., None].repeat(3, 2)
57
+
58
+ @staticmethod
59
+ def get_config_template():
60
+ return dict_to_yaml('ANNOTATORS',
61
+ __class__.__name__,
62
+ CannyAnnotator.para_dict,
63
+ set_name=True)
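
For reference, the randomized-threshold branch of CannyAnnotator boils down to the following standalone OpenCV sketch (the default ranges are taken from the config keys above; no scepter config involved):

import cv2
import numpy as np

def random_canny(image, proba=1.0, low_range=(50, 100), high_range=(200, 350), defaults=(100, 200)):
    # With probability `proba`, sample the thresholds; otherwise fall back to the defaults.
    if np.random.random() < proba:
        low_th = np.random.randint(*low_range)
        high_th = np.random.randint(*high_range)
    else:
        low_th, high_th = defaults
    edge = cv2.Canny(image, low_th, high_th)
    return edge[..., None].repeat(3, 2)  # replicate to 3 channels, as the annotator does

edges = random_canny(np.random.randint(0, 255, (64, 64, 3), dtype=np.uint8))
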
annotator/color.py ADDED
@@ -0,0 +1,59 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ from abc import ABCMeta
4
+
5
+ import cv2
6
+ import numpy as np
7
+ import torch
8
+ from PIL import Image
9
+
10
+ from scepter.modules.annotator.base_annotator import BaseAnnotator
11
+ from scepter.modules.annotator.registry import ANNOTATORS
12
+ from scepter.modules.utils.config import dict_to_yaml
13
+
14
+
15
+ @ANNOTATORS.register_class()
16
+ class ColorAnnotator(BaseAnnotator, metaclass=ABCMeta):
17
+ para_dict = {}
18
+
19
+ def __init__(self, cfg, logger=None):
20
+ super().__init__(cfg, logger=logger)
21
+ self.ratio = cfg.get('RATIO', 64)
22
+ self.random_cfg = cfg.get('RANDOM_CFG', None)
23
+
24
+ def forward(self, image):
25
+ if isinstance(image, Image.Image):
26
+ image = np.array(image)
27
+ elif isinstance(image, torch.Tensor):
28
+ image = image.detach().cpu().numpy()
29
+ elif isinstance(image, np.ndarray):
30
+ image = image.copy()
31
+ else:
32
+ raise TypeError(f'Unsupported datatype {type(image)}, only support np.ndarray, torch.Tensor, Pillow Image.')
33
+ h, w = image.shape[:2]
34
+
35
+ if self.random_cfg is None:
36
+ ratio = self.ratio
37
+ else:
38
+ proba = self.random_cfg.get('PROBA', 1.0)
39
+ if np.random.random() < proba:
40
+ if 'CHOICE_RATIO' in self.random_cfg:
41
+ ratio = np.random.choice(self.random_cfg['CHOICE_RATIO'])
42
+ else:
43
+ min_ratio = self.random_cfg.get('MIN_RATIO', 48)
44
+ max_ratio = self.random_cfg.get('MAX_RATIO', 96)
45
+ ratio = np.random.randint(min_ratio, max_ratio)
46
+ else:
47
+ ratio = self.ratio
48
+ image = cv2.resize(image, (int(w // ratio), int(h // ratio)),
49
+ interpolation=cv2.INTER_CUBIC)
50
+ image = cv2.resize(image, (w, h), interpolation=cv2.INTER_NEAREST)
51
+ assert len(image.shape) < 4
52
+ return image
53
+
54
+ @staticmethod
55
+ def get_config_template():
56
+ return dict_to_yaml('ANNOTATORS',
57
+ __class__.__name__,
58
+ ColorAnnotator.para_dict,
59
+ set_name=True)
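
The ColorAnnotator above is effectively a mosaic filter: shrink the image by `ratio`, then scale it back up with nearest-neighbor interpolation so each block carries a single color. A minimal equivalent, assuming a fixed ratio of 64:

import cv2
import numpy as np

def color_blocks(image, ratio=64):
    h, w = image.shape[:2]
    small = cv2.resize(image, (w // ratio, h // ratio), interpolation=cv2.INTER_CUBIC)
    return cv2.resize(small, (w, h), interpolation=cv2.INTER_NEAREST)

out = color_blocks(np.random.randint(0, 255, (512, 512, 3), dtype=np.uint8))
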
annotator/hed.py ADDED
@@ -0,0 +1,155 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ # Please use this implementation in your products
4
+ # This implementation may produce slightly different results from Saining Xie's official implementations,
5
+ # but it generates smoother edges and is more suitable for ControlNet as well as other image-to-image translations.
6
+ # Different from official models and other implementations, this is an RGB-input model (rather than BGR)
7
+ # and in this way it works better for gradio's RGB protocol
8
+
9
+ from abc import ABCMeta
10
+
11
+ import cv2
12
+ import numpy as np
13
+ import torch
14
+ from einops import rearrange
15
+
16
+ from scepter.modules.annotator.base_annotator import BaseAnnotator
17
+ from scepter.modules.annotator.registry import ANNOTATORS
18
+ from scepter.modules.utils.config import dict_to_yaml
19
+ from scepter.modules.utils.distribute import we
20
+ from scepter.modules.utils.file_system import FS
21
+
22
+
23
+ def nms(x, t, s):
24
+ x = cv2.GaussianBlur(x.astype(np.float32), (0, 0), s)
25
+
26
+ f1 = np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]], dtype=np.uint8)
27
+ f2 = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=np.uint8)
28
+ f3 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.uint8)
29
+ f4 = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=np.uint8)
30
+
31
+ y = np.zeros_like(x)
32
+
33
+ for f in [f1, f2, f3, f4]:
34
+ np.putmask(y, cv2.dilate(x, kernel=f) == x, x)
35
+
36
+ z = np.zeros_like(y, dtype=np.uint8)
37
+ z[y > t] = 255
38
+ return z
39
+
40
+
41
+ class DoubleConvBlock(torch.nn.Module):
42
+ def __init__(self, input_channel, output_channel, layer_number):
43
+ super().__init__()
44
+ self.convs = torch.nn.Sequential()
45
+ self.convs.append(
46
+ torch.nn.Conv2d(in_channels=input_channel,
47
+ out_channels=output_channel,
48
+ kernel_size=(3, 3),
49
+ stride=(1, 1),
50
+ padding=1))
51
+ for i in range(1, layer_number):
52
+ self.convs.append(
53
+ torch.nn.Conv2d(in_channels=output_channel,
54
+ out_channels=output_channel,
55
+ kernel_size=(3, 3),
56
+ stride=(1, 1),
57
+ padding=1))
58
+ self.projection = torch.nn.Conv2d(in_channels=output_channel,
59
+ out_channels=1,
60
+ kernel_size=(1, 1),
61
+ stride=(1, 1),
62
+ padding=0)
63
+
64
+ def __call__(self, x, down_sampling=False):
65
+ h = x
66
+ if down_sampling:
67
+ h = torch.nn.functional.max_pool2d(h,
68
+ kernel_size=(2, 2),
69
+ stride=(2, 2))
70
+ for conv in self.convs:
71
+ h = conv(h)
72
+ h = torch.nn.functional.relu(h)
73
+ return h, self.projection(h)
74
+
75
+
76
+ class ControlNetHED_Apache2(torch.nn.Module):
77
+ def __init__(self):
78
+ super().__init__()
79
+ self.norm = torch.nn.Parameter(torch.zeros(size=(1, 3, 1, 1)))
80
+ self.block1 = DoubleConvBlock(input_channel=3,
81
+ output_channel=64,
82
+ layer_number=2)
83
+ self.block2 = DoubleConvBlock(input_channel=64,
84
+ output_channel=128,
85
+ layer_number=2)
86
+ self.block3 = DoubleConvBlock(input_channel=128,
87
+ output_channel=256,
88
+ layer_number=3)
89
+ self.block4 = DoubleConvBlock(input_channel=256,
90
+ output_channel=512,
91
+ layer_number=3)
92
+ self.block5 = DoubleConvBlock(input_channel=512,
93
+ output_channel=512,
94
+ layer_number=3)
95
+
96
+ def __call__(self, x):
97
+ h = x - self.norm
98
+ h, projection1 = self.block1(h)
99
+ h, projection2 = self.block2(h, down_sampling=True)
100
+ h, projection3 = self.block3(h, down_sampling=True)
101
+ h, projection4 = self.block4(h, down_sampling=True)
102
+ h, projection5 = self.block5(h, down_sampling=True)
103
+ return projection1, projection2, projection3, projection4, projection5
104
+
105
+
106
+ @ANNOTATORS.register_class()
107
+ class HedAnnotator(BaseAnnotator, metaclass=ABCMeta):
108
+ para_dict = {}
109
+
110
+ def __init__(self, cfg, logger=None):
111
+ super().__init__(cfg, logger=logger)
112
+ self.netNetwork = ControlNetHED_Apache2().float().eval()
113
+ pretrained_model = cfg.get('PRETRAINED_MODEL', None)
114
+ if pretrained_model:
115
+ with FS.get_from(pretrained_model, wait_finish=True) as local_path:
116
+ self.netNetwork.load_state_dict(torch.load(local_path))
117
+
118
+ @torch.no_grad()
119
+ @torch.inference_mode()
120
+ @torch.autocast('cuda', enabled=False)
121
+ def forward(self, image):
122
+ if isinstance(image, torch.Tensor):
123
+ if len(image.shape) == 3:
124
+ image = rearrange(image, 'h w c -> 1 c h w')
125
+ B, C, H, W = image.shape
126
+ else:
127
+ raise ValueError("Unsupported input image shape")
128
+ elif isinstance(image, np.ndarray):
129
+ image = torch.from_numpy(image.copy()).float()
130
+ if len(image.shape) == 3:
131
+ image = rearrange(image, 'h w c -> 1 c h w')
132
+ B, C, H, W = image.shape
133
+ else:
134
+ raise ValueError("Unsupported input image shape")
135
+ else:
136
+ raise TypeError("Unsupported input image type")
137
+ edges = self.netNetwork(image.to(we.device_id))
138
+ edges = [
139
+ e.detach().cpu().numpy().astype(np.float32)[0, 0] for e in edges
140
+ ]
141
+ edges = [
142
+ cv2.resize(e, (W, H), interpolation=cv2.INTER_LINEAR)
143
+ for e in edges
144
+ ]
145
+ edges = np.stack(edges, axis=2)
146
+ edge = 1 / (1 + np.exp(-np.mean(edges, axis=2).astype(np.float64)))
147
+ edge = 255 - (edge * 255.0).clip(0, 255).astype(np.uint8)
148
+ return edge[..., None].repeat(3, 2)
149
+
150
+ @staticmethod
151
+ def get_config_template():
152
+ return dict_to_yaml('ANNOTATORS',
153
+ __class__.__name__,
154
+ HedAnnotator.para_dict,
155
+ set_name=True)
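
The post-processing at the end of HedAnnotator.forward averages the five side outputs, squashes them with a sigmoid, inverts to a white background, and replicates to three channels. A standalone NumPy sketch of that step (the edge logits below are random placeholders):

import numpy as np

H, W = 64, 64
edges = np.stack([np.random.randn(H, W) for _ in range(5)], axis=2)  # five per-scale edge logits

edge = 1 / (1 + np.exp(-np.mean(edges, axis=2).astype(np.float64)))  # sigmoid of the mean
edge = 255 - (edge * 255.0).clip(0, 255).astype(np.uint8)            # invert: edges dark, background white
edge_rgb = edge[..., None].repeat(3, 2)                              # H x W x 3
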
annotator/identity.py ADDED
@@ -0,0 +1,25 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ from abc import ABCMeta
4
+
5
+ from scepter.modules.annotator.base_annotator import BaseAnnotator
6
+ from scepter.modules.annotator.registry import ANNOTATORS
7
+ from scepter.modules.utils.config import dict_to_yaml
8
+
9
+
10
+ @ANNOTATORS.register_class()
11
+ class IdentityAnnotator(BaseAnnotator, metaclass=ABCMeta):
12
+ para_dict = {}
13
+
14
+ def __init__(self, cfg, logger=None):
15
+ super().__init__(cfg, logger=logger)
16
+
17
+ def forward(self, image):
18
+ return image
19
+
20
+ @staticmethod
21
+ def get_config_template():
22
+ return dict_to_yaml('ANNOTATORS',
23
+ __class__.__name__,
24
+ IdentityAnnotator.para_dict,
25
+ set_name=True)
annotator/invert.py ADDED
@@ -0,0 +1,25 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ from abc import ABCMeta
4
+
5
+ from scepter.modules.annotator.base_annotator import BaseAnnotator
6
+ from scepter.modules.annotator.registry import ANNOTATORS
7
+ from scepter.modules.utils.config import dict_to_yaml
8
+
9
+
10
+ @ANNOTATORS.register_class()
11
+ class InvertAnnotator(BaseAnnotator, metaclass=ABCMeta):
12
+ para_dict = {}
13
+
14
+ def __init__(self, cfg, logger=None):
15
+ super().__init__(cfg, logger=logger)
16
+
17
+ def forward(self, image):
18
+ return 255 - image
19
+
20
+ @staticmethod
21
+ def get_config_template():
22
+ return dict_to_yaml('ANNOTATORS',
23
+ __class__.__name__,
24
+ InvertAnnotator.para_dict,
25
+ set_name=True)
annotator/midas/__init__.py ADDED
File without changes
annotator/midas/api.py ADDED
@@ -0,0 +1,165 @@
1
+ # -*- coding: utf-8 -*-
2
+ # based on https://github.com/isl-org/MiDaS
3
+
4
+ import cv2
5
+ import torch
6
+ import torch.nn as nn
7
+ from torchvision.transforms import Compose
8
+
9
+ from .dpt_depth import DPTDepthModel
10
+ from .midas_net import MidasNet
11
+ from .midas_net_custom import MidasNet_small
12
+ from .transforms import NormalizeImage, PrepareForNet, Resize
13
+
14
+ # ISL_PATHS = {
15
+ # "dpt_large": "dpt_large-midas-2f21e586.pt",
16
+ # "dpt_hybrid": "dpt_hybrid-midas-501f0c75.pt",
17
+ # "midas_v21": "",
18
+ # "midas_v21_small": "",
19
+ # }
20
+
21
+ # remote_model_path =
22
+ # "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt"
23
+
24
+
25
+ def disabled_train(self, mode=True):
26
+ """Overwrite model.train with this function to make sure train/eval mode
27
+ does not change anymore."""
28
+ return self
29
+
30
+
31
+ def load_midas_transform(model_type):
32
+ # https://github.com/isl-org/MiDaS/blob/master/run.py
33
+ # load transform only
34
+ if model_type == 'dpt_large': # DPT-Large
35
+ net_w, net_h = 384, 384
36
+ resize_mode = 'minimal'
37
+ normalization = NormalizeImage(mean=[0.5, 0.5, 0.5],
38
+ std=[0.5, 0.5, 0.5])
39
+
40
+ elif model_type == 'dpt_hybrid': # DPT-Hybrid
41
+ net_w, net_h = 384, 384
42
+ resize_mode = 'minimal'
43
+ normalization = NormalizeImage(mean=[0.5, 0.5, 0.5],
44
+ std=[0.5, 0.5, 0.5])
45
+
46
+ elif model_type == 'midas_v21':
47
+ net_w, net_h = 384, 384
48
+ resize_mode = 'upper_bound'
49
+ normalization = NormalizeImage(mean=[0.485, 0.456, 0.406],
50
+ std=[0.229, 0.224, 0.225])
51
+
52
+ elif model_type == 'midas_v21_small':
53
+ net_w, net_h = 256, 256
54
+ resize_mode = 'upper_bound'
55
+ normalization = NormalizeImage(mean=[0.485, 0.456, 0.406],
56
+ std=[0.229, 0.224, 0.225])
57
+
58
+ else:
59
+ assert False, f"model_type '{model_type}' not implemented, use: --model_type large"
60
+
61
+ transform = Compose([
62
+ Resize(
63
+ net_w,
64
+ net_h,
65
+ resize_target=None,
66
+ keep_aspect_ratio=True,
67
+ ensure_multiple_of=32,
68
+ resize_method=resize_mode,
69
+ image_interpolation_method=cv2.INTER_CUBIC,
70
+ ),
71
+ normalization,
72
+ PrepareForNet(),
73
+ ])
74
+
75
+ return transform
76
+
77
+
78
+ def load_model(model_type, model_path):
79
+ # https://github.com/isl-org/MiDaS/blob/master/run.py
80
+ # load network
81
+ # model_path = ISL_PATHS[model_type]
82
+ if model_type == 'dpt_large': # DPT-Large
83
+ model = DPTDepthModel(
84
+ path=model_path,
85
+ backbone='vitl16_384',
86
+ non_negative=True,
87
+ )
88
+ net_w, net_h = 384, 384
89
+ resize_mode = 'minimal'
90
+ normalization = NormalizeImage(mean=[0.5, 0.5, 0.5],
91
+ std=[0.5, 0.5, 0.5])
92
+
93
+ elif model_type == 'dpt_hybrid': # DPT-Hybrid
94
+ model = DPTDepthModel(
95
+ path=model_path,
96
+ backbone='vitb_rn50_384',
97
+ non_negative=True,
98
+ )
99
+ net_w, net_h = 384, 384
100
+ resize_mode = 'minimal'
101
+ normalization = NormalizeImage(mean=[0.5, 0.5, 0.5],
102
+ std=[0.5, 0.5, 0.5])
103
+
104
+ elif model_type == 'midas_v21':
105
+ model = MidasNet(model_path, non_negative=True)
106
+ net_w, net_h = 384, 384
107
+ resize_mode = 'upper_bound'
108
+ normalization = NormalizeImage(mean=[0.485, 0.456, 0.406],
109
+ std=[0.229, 0.224, 0.225])
110
+
111
+ elif model_type == 'midas_v21_small':
112
+ model = MidasNet_small(model_path,
113
+ features=64,
114
+ backbone='efficientnet_lite3',
115
+ exportable=True,
116
+ non_negative=True,
117
+ blocks={'expand': True})
118
+ net_w, net_h = 256, 256
119
+ resize_mode = 'upper_bound'
120
+ normalization = NormalizeImage(mean=[0.485, 0.456, 0.406],
121
+ std=[0.229, 0.224, 0.225])
122
+
123
+ else:
124
+ print(
125
+ f"model_type '{model_type}' not implemented, use: --model_type large"
126
+ )
127
+ assert False
128
+
129
+ transform = Compose([
130
+ Resize(
131
+ net_w,
132
+ net_h,
133
+ resize_target=None,
134
+ keep_aspect_ratio=True,
135
+ ensure_multiple_of=32,
136
+ resize_method=resize_mode,
137
+ image_interpolation_method=cv2.INTER_CUBIC,
138
+ ),
139
+ normalization,
140
+ PrepareForNet(),
141
+ ])
142
+
143
+ return model.eval(), transform
144
+
145
+
146
+ class MiDaSInference(nn.Module):
147
+ MODEL_TYPES_TORCH_HUB = ['DPT_Large', 'DPT_Hybrid', 'MiDaS_small']
148
+ MODEL_TYPES_ISL = [
149
+ 'dpt_large',
150
+ 'dpt_hybrid',
151
+ 'midas_v21',
152
+ 'midas_v21_small',
153
+ ]
154
+
155
+ def __init__(self, model_type, model_path):
156
+ super().__init__()
157
+ assert (model_type in self.MODEL_TYPES_ISL)
158
+ model, _ = load_model(model_type, model_path)
159
+ self.model = model
160
+ self.model.train = disabled_train
161
+
162
+ def forward(self, x):
163
+ with torch.no_grad():
164
+ prediction = self.model(x)
165
+ return prediction
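
A hedged usage sketch for the API above; the checkpoint filename is hypothetical and must point at a real dpt_hybrid weight file:

import numpy as np
import torch
# from annotator.midas.api import MiDaSInference, load_midas_transform

model = MiDaSInference('dpt_hybrid', 'dpt_hybrid-midas-501f0c75.pt')  # hypothetical local path
transform = load_midas_transform('dpt_hybrid')

img = np.random.rand(480, 640, 3).astype(np.float32)   # RGB image scaled to [0, 1]
sample = transform({'image': img})                      # resize + normalize + HWC -> CHW
x = torch.from_numpy(sample['image']).unsqueeze(0)      # 1 x 3 x H' x W'
depth = model(x)                                        # 1 x H' x W' relative (inverse) depth
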
annotator/midas/base_model.py ADDED
@@ -0,0 +1,17 @@
1
+ # -*- coding: utf-8 -*-
2
+ import torch
3
+
4
+
5
+ class BaseModel(torch.nn.Module):
6
+ def load(self, path):
7
+ """Load model from file.
8
+
9
+ Args:
10
+ path (str): file path
11
+ """
12
+ parameters = torch.load(path, map_location=torch.device('cpu'))
13
+
14
+ if 'optimizer' in parameters:
15
+ parameters = parameters['model']
16
+
17
+ self.load_state_dict(parameters)
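
The `'optimizer' in parameters` check above exists because training checkpoints are commonly saved as a dict holding both model and optimizer state, while inference checkpoints store the state_dict directly. A sketch of the two layouts the loader accepts (keys are illustrative):

import torch

state_dict = {'layer.weight': torch.zeros(1)}                      # illustrative
torch.save(state_dict, 'inference.pt')                             # layout 1: plain state_dict
torch.save({'model': state_dict, 'optimizer': {}}, 'training.pt')  # layout 2: loader picks out 'model'
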
annotator/midas/blocks.py ADDED
@@ -0,0 +1,390 @@
1
+ # -*- coding: utf-8 -*-
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+ from .vit import (_make_pretrained_vitb16_384, _make_pretrained_vitb_rn50_384,
6
+ _make_pretrained_vitl16_384)
7
+
8
+
9
+ def _make_encoder(
10
+ backbone,
11
+ features,
12
+ use_pretrained,
13
+ groups=1,
14
+ expand=False,
15
+ exportable=True,
16
+ hooks=None,
17
+ use_vit_only=False,
18
+ use_readout='ignore',
19
+ ):
20
+ if backbone == 'vitl16_384':
21
+ pretrained = _make_pretrained_vitl16_384(use_pretrained,
22
+ hooks=hooks,
23
+ use_readout=use_readout)
24
+ scratch = _make_scratch(
25
+ [256, 512, 1024, 1024], features, groups=groups,
26
+ expand=expand) # ViT-L/16 - 85.0% Top1 (backbone)
27
+ elif backbone == 'vitb_rn50_384':
28
+ pretrained = _make_pretrained_vitb_rn50_384(
29
+ use_pretrained,
30
+ hooks=hooks,
31
+ use_vit_only=use_vit_only,
32
+ use_readout=use_readout,
33
+ )
34
+ scratch = _make_scratch(
35
+ [256, 512, 768, 768], features, groups=groups,
36
+ expand=expand) # ViT-H/16 - 85.0% Top1 (backbone)
37
+ elif backbone == 'vitb16_384':
38
+ pretrained = _make_pretrained_vitb16_384(use_pretrained,
39
+ hooks=hooks,
40
+ use_readout=use_readout)
41
+ scratch = _make_scratch(
42
+ [96, 192, 384, 768], features, groups=groups,
43
+ expand=expand) # ViT-B/16 - 84.6% Top1 (backbone)
44
+ elif backbone == 'resnext101_wsl':
45
+ pretrained = _make_pretrained_resnext101_wsl(use_pretrained)
46
+ scratch = _make_scratch([256, 512, 1024, 2048],
47
+ features,
48
+ groups=groups,
49
+ expand=expand) # efficientnet_lite3
50
+ elif backbone == 'efficientnet_lite3':
51
+ pretrained = _make_pretrained_efficientnet_lite3(use_pretrained,
52
+ exportable=exportable)
53
+ scratch = _make_scratch([32, 48, 136, 384],
54
+ features,
55
+ groups=groups,
56
+ expand=expand) # efficientnet_lite3
57
+ else:
58
+ print(f"Backbone '{backbone}' not implemented")
59
+ assert False
60
+
61
+ return pretrained, scratch
62
+
63
+
64
+ def _make_scratch(in_shape, out_shape, groups=1, expand=False):
65
+ scratch = nn.Module()
66
+
67
+ out_shape1 = out_shape
68
+ out_shape2 = out_shape
69
+ out_shape3 = out_shape
70
+ out_shape4 = out_shape
71
+ if expand is True:
72
+ out_shape1 = out_shape
73
+ out_shape2 = out_shape * 2
74
+ out_shape3 = out_shape * 4
75
+ out_shape4 = out_shape * 8
76
+
77
+ scratch.layer1_rn = nn.Conv2d(in_shape[0],
78
+ out_shape1,
79
+ kernel_size=3,
80
+ stride=1,
81
+ padding=1,
82
+ bias=False,
83
+ groups=groups)
84
+ scratch.layer2_rn = nn.Conv2d(in_shape[1],
85
+ out_shape2,
86
+ kernel_size=3,
87
+ stride=1,
88
+ padding=1,
89
+ bias=False,
90
+ groups=groups)
91
+ scratch.layer3_rn = nn.Conv2d(in_shape[2],
92
+ out_shape3,
93
+ kernel_size=3,
94
+ stride=1,
95
+ padding=1,
96
+ bias=False,
97
+ groups=groups)
98
+ scratch.layer4_rn = nn.Conv2d(in_shape[3],
99
+ out_shape4,
100
+ kernel_size=3,
101
+ stride=1,
102
+ padding=1,
103
+ bias=False,
104
+ groups=groups)
105
+
106
+ return scratch
107
+
108
+
109
+ def _make_pretrained_efficientnet_lite3(use_pretrained, exportable=False):
110
+ efficientnet = torch.hub.load('rwightman/gen-efficientnet-pytorch',
111
+ 'tf_efficientnet_lite3',
112
+ pretrained=use_pretrained,
113
+ exportable=exportable)
114
+ return _make_efficientnet_backbone(efficientnet)
115
+
116
+
117
+ def _make_efficientnet_backbone(effnet):
118
+ pretrained = nn.Module()
119
+
120
+ pretrained.layer1 = nn.Sequential(effnet.conv_stem, effnet.bn1,
121
+ effnet.act1, *effnet.blocks[0:2])
122
+ pretrained.layer2 = nn.Sequential(*effnet.blocks[2:3])
123
+ pretrained.layer3 = nn.Sequential(*effnet.blocks[3:5])
124
+ pretrained.layer4 = nn.Sequential(*effnet.blocks[5:9])
125
+
126
+ return pretrained
127
+
128
+
129
+ def _make_resnet_backbone(resnet):
130
+ pretrained = nn.Module()
131
+ pretrained.layer1 = nn.Sequential(resnet.conv1, resnet.bn1, resnet.relu,
132
+ resnet.maxpool, resnet.layer1)
133
+
134
+ pretrained.layer2 = resnet.layer2
135
+ pretrained.layer3 = resnet.layer3
136
+ pretrained.layer4 = resnet.layer4
137
+
138
+ return pretrained
139
+
140
+
141
+ def _make_pretrained_resnext101_wsl(use_pretrained):
142
+ resnet = torch.hub.load('facebookresearch/WSL-Images',
143
+ 'resnext101_32x8d_wsl')
144
+ return _make_resnet_backbone(resnet)
145
+
146
+
147
+ class Interpolate(nn.Module):
148
+ """Interpolation module.
149
+ """
150
+ def __init__(self, scale_factor, mode, align_corners=False):
151
+ """Init.
152
+
153
+ Args:
154
+ scale_factor (float): scaling
155
+ mode (str): interpolation mode
156
+ """
157
+ super(Interpolate, self).__init__()
158
+
159
+ self.interp = nn.functional.interpolate
160
+ self.scale_factor = scale_factor
161
+ self.mode = mode
162
+ self.align_corners = align_corners
163
+
164
+ def forward(self, x):
165
+ """Forward pass.
166
+
167
+ Args:
168
+ x (tensor): input
169
+
170
+ Returns:
171
+ tensor: interpolated data
172
+ """
173
+
174
+ x = self.interp(x,
175
+ scale_factor=self.scale_factor,
176
+ mode=self.mode,
177
+ align_corners=self.align_corners)
178
+
179
+ return x
180
+
181
+
182
+ class ResidualConvUnit(nn.Module):
183
+ """Residual convolution module.
184
+ """
185
+ def __init__(self, features):
186
+ """Init.
187
+
188
+ Args:
189
+ features (int): number of features
190
+ """
191
+ super().__init__()
192
+
193
+ self.conv1 = nn.Conv2d(features,
194
+ features,
195
+ kernel_size=3,
196
+ stride=1,
197
+ padding=1,
198
+ bias=True)
199
+
200
+ self.conv2 = nn.Conv2d(features,
201
+ features,
202
+ kernel_size=3,
203
+ stride=1,
204
+ padding=1,
205
+ bias=True)
206
+
207
+ self.relu = nn.ReLU(inplace=True)
208
+
209
+ def forward(self, x):
210
+ """Forward pass.
211
+
212
+ Args:
213
+ x (tensor): input
214
+
215
+ Returns:
216
+ tensor: output
217
+ """
218
+ out = self.relu(x)
219
+ out = self.conv1(out)
220
+ out = self.relu(out)
221
+ out = self.conv2(out)
222
+
223
+ return out + x
224
+
225
+
226
+ class FeatureFusionBlock(nn.Module):
227
+ """Feature fusion block.
228
+ """
229
+ def __init__(self, features):
230
+ """Init.
231
+
232
+ Args:
233
+ features (int): number of features
234
+ """
235
+ super(FeatureFusionBlock, self).__init__()
236
+
237
+ self.resConfUnit1 = ResidualConvUnit(features)
238
+ self.resConfUnit2 = ResidualConvUnit(features)
239
+
240
+ def forward(self, *xs):
241
+ """Forward pass.
242
+
243
+ Returns:
244
+ tensor: output
245
+ """
246
+ output = xs[0]
247
+
248
+ if len(xs) == 2:
249
+ output += self.resConfUnit1(xs[1])
250
+
251
+ output = self.resConfUnit2(output)
252
+
253
+ output = nn.functional.interpolate(output,
254
+ scale_factor=2,
255
+ mode='bilinear',
256
+ align_corners=True)
257
+
258
+ return output
259
+
260
+
261
+ class ResidualConvUnit_custom(nn.Module):
262
+ """Residual convolution module.
263
+ """
264
+ def __init__(self, features, activation, bn):
265
+ """Init.
266
+
267
+ Args:
268
+ features (int): number of features
269
+ """
270
+ super().__init__()
271
+
272
+ self.bn = bn
273
+
274
+ self.groups = 1
275
+
276
+ self.conv1 = nn.Conv2d(features,
277
+ features,
278
+ kernel_size=3,
279
+ stride=1,
280
+ padding=1,
281
+ bias=True,
282
+ groups=self.groups)
283
+
284
+ self.conv2 = nn.Conv2d(features,
285
+ features,
286
+ kernel_size=3,
287
+ stride=1,
288
+ padding=1,
289
+ bias=True,
290
+ groups=self.groups)
291
+
292
+ if self.bn is True:
293
+ self.bn1 = nn.BatchNorm2d(features)
294
+ self.bn2 = nn.BatchNorm2d(features)
295
+
296
+ self.activation = activation
297
+
298
+ self.skip_add = nn.quantized.FloatFunctional()
299
+
300
+ def forward(self, x):
301
+ """Forward pass.
302
+
303
+ Args:
304
+ x (tensor): input
305
+
306
+ Returns:
307
+ tensor: output
308
+ """
309
+
310
+ out = self.activation(x)
311
+ out = self.conv1(out)
312
+ if self.bn is True:
313
+ out = self.bn1(out)
314
+
315
+ out = self.activation(out)
316
+ out = self.conv2(out)
317
+ if self.bn is True:
318
+ out = self.bn2(out)
319
+
320
+ if self.groups > 1:
321
+ out = self.conv_merge(out)
322
+
323
+ return self.skip_add.add(out, x)
324
+
325
+ # return out + x
326
+
327
+
328
+ class FeatureFusionBlock_custom(nn.Module):
329
+ """Feature fusion block.
330
+ """
331
+ def __init__(self,
332
+ features,
333
+ activation,
334
+ deconv=False,
335
+ bn=False,
336
+ expand=False,
337
+ align_corners=True):
338
+ """Init.
339
+
340
+ Args:
341
+ features (int): number of features
342
+ """
343
+ super(FeatureFusionBlock_custom, self).__init__()
344
+
345
+ self.deconv = deconv
346
+ self.align_corners = align_corners
347
+
348
+ self.groups = 1
349
+
350
+ self.expand = expand
351
+ out_features = features
352
+ if self.expand is True:
353
+ out_features = features // 2
354
+
355
+ self.out_conv = nn.Conv2d(features,
356
+ out_features,
357
+ kernel_size=1,
358
+ stride=1,
359
+ padding=0,
360
+ bias=True,
361
+ groups=1)
362
+
363
+ self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
364
+ self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)
365
+
366
+ self.skip_add = nn.quantized.FloatFunctional()
367
+
368
+ def forward(self, *xs):
369
+ """Forward pass.
370
+
371
+ Returns:
372
+ tensor: output
373
+ """
374
+ output = xs[0]
375
+
376
+ if len(xs) == 2:
377
+ res = self.resConfUnit1(xs[1])
378
+ output = self.skip_add.add(output, res)
379
+ # output += res
380
+
381
+ output = self.resConfUnit2(output)
382
+
383
+ output = nn.functional.interpolate(output,
384
+ scale_factor=2,
385
+ mode='bilinear',
386
+ align_corners=self.align_corners)
387
+
388
+ output = self.out_conv(output)
389
+
390
+ return output
annotator/midas/dpt_depth.py ADDED
@@ -0,0 +1,106 @@
1
+ # -*- coding: utf-8 -*-
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+ from .base_model import BaseModel
6
+ from .blocks import FeatureFusionBlock_custom, Interpolate, _make_encoder
7
+ from .vit import forward_vit
8
+
9
+
10
+ def _make_fusion_block(features, use_bn):
11
+ return FeatureFusionBlock_custom(
12
+ features,
13
+ nn.ReLU(False),
14
+ deconv=False,
15
+ bn=use_bn,
16
+ expand=False,
17
+ align_corners=True,
18
+ )
19
+
20
+
21
+ class DPT(BaseModel):
22
+ def __init__(
23
+ self,
24
+ head,
25
+ features=256,
26
+ backbone='vitb_rn50_384',
27
+ readout='project',
28
+ channels_last=False,
29
+ use_bn=False,
30
+ ):
31
+
32
+ super(DPT, self).__init__()
33
+
34
+ self.channels_last = channels_last
35
+
36
+ hooks = {
37
+ 'vitb_rn50_384': [0, 1, 8, 11],
38
+ 'vitb16_384': [2, 5, 8, 11],
39
+ 'vitl16_384': [5, 11, 17, 23],
40
+ }
41
+
42
+ # Instantiate backbone and reassemble blocks
43
+ self.pretrained, self.scratch = _make_encoder(
44
+ backbone,
45
+ features,
46
+ False, # Set to True if you want to train from scratch (it then uses ImageNet weights)
47
+ groups=1,
48
+ expand=False,
49
+ exportable=False,
50
+ hooks=hooks[backbone],
51
+ use_readout=readout,
52
+ )
53
+
54
+ self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
55
+ self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
56
+ self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
57
+ self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
58
+
59
+ self.scratch.output_conv = head
60
+
61
+ def forward(self, x):
62
+ if self.channels_last is True:
63
+ x.contiguous(memory_format=torch.channels_last)
64
+
65
+ layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x)
66
+
67
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
68
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
69
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
70
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
71
+
72
+ path_4 = self.scratch.refinenet4(layer_4_rn)
73
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
74
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
75
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
76
+
77
+ out = self.scratch.output_conv(path_1)
78
+
79
+ return out
80
+
81
+
82
+ class DPTDepthModel(DPT):
83
+ def __init__(self, path=None, non_negative=True, **kwargs):
84
+ features = kwargs['features'] if 'features' in kwargs else 256
85
+
86
+ head = nn.Sequential(
87
+ nn.Conv2d(features,
88
+ features // 2,
89
+ kernel_size=3,
90
+ stride=1,
91
+ padding=1),
92
+ Interpolate(scale_factor=2, mode='bilinear', align_corners=True),
93
+ nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
94
+ nn.ReLU(True),
95
+ nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
96
+ nn.ReLU(True) if non_negative else nn.Identity(),
97
+ nn.Identity(),
98
+ )
99
+
100
+ super().__init__(head, **kwargs)
101
+
102
+ if path is not None:
103
+ self.load(path)
104
+
105
+ def forward(self, x):
106
+ return super().forward(x).squeeze(dim=1)
annotator/midas/midas_net.py ADDED
@@ -0,0 +1,79 @@
1
+ # -*- coding: utf-8 -*-
2
+ """MidasNet: Network for monocular depth estimation trained by mixing several datasets.
3
+ This file contains code that is adapted from
4
+ https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py
5
+ """
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+ from .base_model import BaseModel
10
+ from .blocks import FeatureFusionBlock, Interpolate, _make_encoder
11
+
12
+
13
+ class MidasNet(BaseModel):
14
+ """Network for monocular depth estimation.
15
+ """
16
+ def __init__(self, path=None, features=256, non_negative=True):
17
+ """Init.
18
+
19
+ Args:
20
+ path (str, optional): Path to saved model. Defaults to None.
21
+ features (int, optional): Number of features. Defaults to 256.
22
+ backbone (str, optional): Backbone network for encoder. Defaults to resnet50
23
+ """
24
+ print('Loading weights: ', path)
25
+
26
+ super(MidasNet, self).__init__()
27
+
28
+ use_pretrained = False if path is None else True
29
+
30
+ self.pretrained, self.scratch = _make_encoder(
31
+ backbone='resnext101_wsl',
32
+ features=features,
33
+ use_pretrained=use_pretrained)
34
+
35
+ self.scratch.refinenet4 = FeatureFusionBlock(features)
36
+ self.scratch.refinenet3 = FeatureFusionBlock(features)
37
+ self.scratch.refinenet2 = FeatureFusionBlock(features)
38
+ self.scratch.refinenet1 = FeatureFusionBlock(features)
39
+
40
+ self.scratch.output_conv = nn.Sequential(
41
+ nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1),
42
+ Interpolate(scale_factor=2, mode='bilinear'),
43
+ nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1),
44
+ nn.ReLU(True),
45
+ nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
46
+ nn.ReLU(True) if non_negative else nn.Identity(),
47
+ )
48
+
49
+ if path:
50
+ self.load(path)
51
+
52
+ def forward(self, x):
53
+ """Forward pass.
54
+
55
+ Args:
56
+ x (tensor): input data (image)
57
+
58
+ Returns:
59
+ tensor: depth
60
+ """
61
+
62
+ layer_1 = self.pretrained.layer1(x)
63
+ layer_2 = self.pretrained.layer2(layer_1)
64
+ layer_3 = self.pretrained.layer3(layer_2)
65
+ layer_4 = self.pretrained.layer4(layer_3)
66
+
67
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
68
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
69
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
70
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
71
+
72
+ path_4 = self.scratch.refinenet4(layer_4_rn)
73
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
74
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
75
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
76
+
77
+ out = self.scratch.output_conv(path_1)
78
+
79
+ return torch.squeeze(out, dim=1)
annotator/midas/midas_net_custom.py ADDED
@@ -0,0 +1,166 @@
1
+ # -*- coding: utf-8 -*-
2
+ """MidasNet: Network for monocular depth estimation trained by mixing several datasets.
3
+ This file contains code that is adapted from
4
+ https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py
5
+ """
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+ from .base_model import BaseModel
10
+ from .blocks import FeatureFusionBlock_custom, Interpolate, _make_encoder
11
+
12
+
13
+ class MidasNet_small(BaseModel):
14
+ """Network for monocular depth estimation.
15
+ """
16
+ def __init__(self,
17
+ path=None,
18
+ features=64,
19
+ backbone='efficientnet_lite3',
20
+ non_negative=True,
21
+ exportable=True,
22
+ channels_last=False,
23
+ align_corners=True,
24
+ blocks={'expand': True}):
25
+ """Init.
26
+
27
+ Args:
28
+ path (str, optional): Path to saved model. Defaults to None.
29
+ features (int, optional): Number of features. Defaults to 256.
30
+ backbone (str, optional): Backbone network for encoder. Defaults to resnet50
31
+ """
32
+ print('Loading weights: ', path)
33
+
34
+ super(MidasNet_small, self).__init__()
35
+
36
+ use_pretrained = False if path else True
37
+
38
+ self.channels_last = channels_last
39
+ self.blocks = blocks
40
+ self.backbone = backbone
41
+
42
+ self.groups = 1
43
+
44
+ features1 = features
45
+ features2 = features
46
+ features3 = features
47
+ features4 = features
48
+ self.expand = False
49
+ if 'expand' in self.blocks and self.blocks['expand'] is True:
50
+ self.expand = True
51
+ features1 = features
52
+ features2 = features * 2
53
+ features3 = features * 4
54
+ features4 = features * 8
55
+
56
+ self.pretrained, self.scratch = _make_encoder(self.backbone,
57
+ features,
58
+ use_pretrained,
59
+ groups=self.groups,
60
+ expand=self.expand,
61
+ exportable=exportable)
62
+
63
+ self.scratch.activation = nn.ReLU(False)
64
+
65
+ self.scratch.refinenet4 = FeatureFusionBlock_custom(
66
+ features4,
67
+ self.scratch.activation,
68
+ deconv=False,
69
+ bn=False,
70
+ expand=self.expand,
71
+ align_corners=align_corners)
72
+ self.scratch.refinenet3 = FeatureFusionBlock_custom(
73
+ features3,
74
+ self.scratch.activation,
75
+ deconv=False,
76
+ bn=False,
77
+ expand=self.expand,
78
+ align_corners=align_corners)
79
+ self.scratch.refinenet2 = FeatureFusionBlock_custom(
80
+ features2,
81
+ self.scratch.activation,
82
+ deconv=False,
83
+ bn=False,
84
+ expand=self.expand,
85
+ align_corners=align_corners)
86
+ self.scratch.refinenet1 = FeatureFusionBlock_custom(
87
+ features1,
88
+ self.scratch.activation,
89
+ deconv=False,
90
+ bn=False,
91
+ align_corners=align_corners)
92
+
93
+ self.scratch.output_conv = nn.Sequential(
94
+ nn.Conv2d(features,
95
+ features // 2,
96
+ kernel_size=3,
97
+ stride=1,
98
+ padding=1,
99
+ groups=self.groups),
100
+ Interpolate(scale_factor=2, mode='bilinear'),
101
+ nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
102
+ self.scratch.activation,
103
+ nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
104
+ nn.ReLU(True) if non_negative else nn.Identity(),
105
+ nn.Identity(),
106
+ )
107
+
108
+ if path:
109
+ self.load(path)
110
+
111
+ def forward(self, x):
112
+ """Forward pass.
113
+
114
+ Args:
115
+ x (tensor): input data (image)
116
+
117
+ Returns:
118
+ tensor: depth
119
+ """
120
+ if self.channels_last is True:
121
+ print('self.channels_last = ', self.channels_last)
122
+ x.contiguous(memory_format=torch.channels_last)
123
+
124
+ layer_1 = self.pretrained.layer1(x)
125
+ layer_2 = self.pretrained.layer2(layer_1)
126
+ layer_3 = self.pretrained.layer3(layer_2)
127
+ layer_4 = self.pretrained.layer4(layer_3)
128
+
129
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
130
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
131
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
132
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
133
+
134
+ path_4 = self.scratch.refinenet4(layer_4_rn)
135
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
136
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
137
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
138
+
139
+ out = self.scratch.output_conv(path_1)
140
+
141
+ return torch.squeeze(out, dim=1)
142
+
143
+
144
+ def fuse_model(m):
145
+ prev_previous_type = nn.Identity()
146
+ prev_previous_name = ''
147
+ previous_type = nn.Identity()
148
+ previous_name = ''
149
+ for name, module in m.named_modules():
150
+ if prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d and type(
151
+ module) == nn.ReLU:
152
+ # print("FUSED ", prev_previous_name, previous_name, name)
153
+ torch.quantization.fuse_modules(
154
+ m, [prev_previous_name, previous_name, name], inplace=True)
155
+ elif prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d:
156
+ # print("FUSED ", prev_previous_name, previous_name)
157
+ torch.quantization.fuse_modules(
158
+ m, [prev_previous_name, previous_name], inplace=True)
159
+ # elif previous_type == nn.Conv2d and type(module) == nn.ReLU:
160
+ # print("FUSED ", previous_name, name)
161
+ # torch.quantization.fuse_modules(m, [previous_name, name], inplace=True)
162
+
163
+ prev_previous_type = previous_type
164
+ prev_previous_name = previous_name
165
+ previous_type = type(module)
166
+ previous_name = name
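
fuse_model above relies on torch.quantization.fuse_modules to merge Conv2d + BatchNorm2d (+ ReLU) runs in place. A hedged sketch on a toy module (module names are illustrative; fusing with folded batch norm requires eval mode):

import torch
import torch.nn as nn

class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3, 8, 3, padding=1)
        self.bn = nn.BatchNorm2d(8)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.relu(self.bn(self.conv(x)))

m = Toy().eval()
torch.quantization.fuse_modules(m, [['conv', 'bn', 'relu']], inplace=True)
print(m.conv)  # fused ConvReLU2d; m.bn and m.relu are replaced by nn.Identity
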
annotator/midas/transforms.py ADDED
@@ -0,0 +1,230 @@
1
+ # -*- coding: utf-8 -*-
2
+ import math
3
+
4
+ import cv2
5
+ import numpy as np
6
+
7
+
8
+ def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA):
9
+ """Resize the sample to ensure the given size. Keeps aspect ratio.
10
+
11
+ Args:
12
+ sample (dict): sample
13
+ size (tuple): image size
14
+
15
+ Returns:
16
+ tuple: new size
17
+ """
18
+ shape = list(sample['disparity'].shape)
19
+
20
+ if shape[0] >= size[0] and shape[1] >= size[1]:
21
+ return sample
22
+
23
+ scale = [0, 0]
24
+ scale[0] = size[0] / shape[0]
25
+ scale[1] = size[1] / shape[1]
26
+
27
+ scale = max(scale)
28
+
29
+ shape[0] = math.ceil(scale * shape[0])
30
+ shape[1] = math.ceil(scale * shape[1])
31
+
32
+ # resize
33
+ sample['image'] = cv2.resize(sample['image'],
34
+ tuple(shape[::-1]),
35
+ interpolation=image_interpolation_method)
36
+
37
+ sample['disparity'] = cv2.resize(sample['disparity'],
38
+ tuple(shape[::-1]),
39
+ interpolation=cv2.INTER_NEAREST)
40
+ sample['mask'] = cv2.resize(
41
+ sample['mask'].astype(np.float32),
42
+ tuple(shape[::-1]),
43
+ interpolation=cv2.INTER_NEAREST,
44
+ )
45
+ sample['mask'] = sample['mask'].astype(bool)
46
+
47
+ return tuple(shape)
48
+
49
+
50
+ class Resize(object):
51
+ """Resize sample to given size (width, height).
52
+ """
53
+ def __init__(
54
+ self,
55
+ width,
56
+ height,
57
+ resize_target=True,
58
+ keep_aspect_ratio=False,
59
+ ensure_multiple_of=1,
60
+ resize_method='lower_bound',
61
+ image_interpolation_method=cv2.INTER_AREA,
62
+ ):
63
+ """Init.
64
+
65
+ Args:
66
+ width (int): desired output width
67
+ height (int): desired output height
68
+ resize_target (bool, optional):
69
+ True: Resize the full sample (image, mask, target).
70
+ False: Resize image only.
71
+ Defaults to True.
72
+ keep_aspect_ratio (bool, optional):
73
+ True: Keep the aspect ratio of the input sample.
74
+ Output sample might not have the given width and height, and
75
+ resize behaviour depends on the parameter 'resize_method'.
76
+ Defaults to False.
77
+ ensure_multiple_of (int, optional):
78
+ Output width and height is constrained to be multiple of this parameter.
79
+ Defaults to 1.
80
+ resize_method (str, optional):
81
+ "lower_bound": Output will be at least as large as the given size.
82
+ "upper_bound": Output will be at most as large as the given size.
83
+ (Output size might be smaller than given size.)
84
+ "minimal": Scale as little as possible. (Output size might be smaller than given size.)
85
+ Defaults to "lower_bound".
86
+ """
87
+ self.__width = width
88
+ self.__height = height
89
+
90
+ self.__resize_target = resize_target
91
+ self.__keep_aspect_ratio = keep_aspect_ratio
92
+ self.__multiple_of = ensure_multiple_of
93
+ self.__resize_method = resize_method
94
+ self.__image_interpolation_method = image_interpolation_method
95
+
96
+ def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
97
+ y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
98
+
99
+ if max_val is not None and y > max_val:
100
+ y = (np.floor(x / self.__multiple_of) *
101
+ self.__multiple_of).astype(int)
102
+
103
+ if y < min_val:
104
+ y = (np.ceil(x / self.__multiple_of) *
105
+ self.__multiple_of).astype(int)
106
+
107
+ return y
108
+
109
+ def get_size(self, width, height):
110
+ # determine new height and width
111
+ scale_height = self.__height / height
112
+ scale_width = self.__width / width
113
+
114
+ if self.__keep_aspect_ratio:
115
+ if self.__resize_method == 'lower_bound':
116
+ # scale such that output size is lower bound
117
+ if scale_width > scale_height:
118
+ # fit width
119
+ scale_height = scale_width
120
+ else:
121
+ # fit height
122
+ scale_width = scale_height
123
+ elif self.__resize_method == 'upper_bound':
124
+ # scale such that output size is upper bound
125
+ if scale_width < scale_height:
126
+ # fit width
127
+ scale_height = scale_width
128
+ else:
129
+ # fit height
130
+ scale_width = scale_height
131
+ elif self.__resize_method == 'minimal':
132
+ # scale as little as possible
133
+ if abs(1 - scale_width) < abs(1 - scale_height):
134
+ # fit width
135
+ scale_height = scale_width
136
+ else:
137
+ # fit height
138
+ scale_width = scale_height
139
+ else:
140
+ raise ValueError(
141
+ f'resize_method {self.__resize_method} not implemented')
142
+
143
+ if self.__resize_method == 'lower_bound':
144
+ new_height = self.constrain_to_multiple_of(scale_height * height,
145
+ min_val=self.__height)
146
+ new_width = self.constrain_to_multiple_of(scale_width * width,
147
+ min_val=self.__width)
148
+ elif self.__resize_method == 'upper_bound':
149
+ new_height = self.constrain_to_multiple_of(scale_height * height,
150
+ max_val=self.__height)
151
+ new_width = self.constrain_to_multiple_of(scale_width * width,
152
+ max_val=self.__width)
153
+ elif self.__resize_method == 'minimal':
154
+ new_height = self.constrain_to_multiple_of(scale_height * height)
155
+ new_width = self.constrain_to_multiple_of(scale_width * width)
156
+ else:
157
+ raise ValueError(
158
+ f'resize_method {self.__resize_method} not implemented')
159
+
160
+ return (new_width, new_height)
161
+
162
+ def __call__(self, sample):
163
+ width, height = self.get_size(sample['image'].shape[1],
164
+ sample['image'].shape[0])
165
+
166
+ # resize sample
167
+ sample['image'] = cv2.resize(
168
+ sample['image'],
169
+ (width, height),
170
+ interpolation=self.__image_interpolation_method,
171
+ )
172
+
173
+ if self.__resize_target:
174
+ if 'disparity' in sample:
175
+ sample['disparity'] = cv2.resize(
176
+ sample['disparity'],
177
+ (width, height),
178
+ interpolation=cv2.INTER_NEAREST,
179
+ )
180
+
181
+ if 'depth' in sample:
182
+ sample['depth'] = cv2.resize(sample['depth'], (width, height),
183
+ interpolation=cv2.INTER_NEAREST)
184
+
185
+ sample['mask'] = cv2.resize(
186
+ sample['mask'].astype(np.float32),
187
+ (width, height),
188
+ interpolation=cv2.INTER_NEAREST,
189
+ )
190
+ sample['mask'] = sample['mask'].astype(bool)
191
+
192
+ return sample
193
+
194
+
195
+ class NormalizeImage(object):
196
+ """Normlize image by given mean and std.
197
+ """
198
+ def __init__(self, mean, std):
199
+ self.__mean = mean
200
+ self.__std = std
201
+
202
+ def __call__(self, sample):
203
+ sample['image'] = (sample['image'] - self.__mean) / self.__std
204
+
205
+ return sample
206
+
207
+
208
+ class PrepareForNet(object):
209
+ """Prepare sample for usage as network input.
210
+ """
211
+ def __init__(self):
212
+ pass
213
+
214
+ def __call__(self, sample):
215
+ image = np.transpose(sample['image'], (2, 0, 1))
216
+ sample['image'] = np.ascontiguousarray(image).astype(np.float32)
217
+
218
+ if 'mask' in sample:
219
+ sample['mask'] = sample['mask'].astype(np.float32)
220
+ sample['mask'] = np.ascontiguousarray(sample['mask'])
221
+
222
+ if 'disparity' in sample:
223
+ disparity = sample['disparity'].astype(np.float32)
224
+ sample['disparity'] = np.ascontiguousarray(disparity)
225
+
226
+ if 'depth' in sample:
227
+ depth = sample['depth'].astype(np.float32)
228
+ sample['depth'] = np.ascontiguousarray(depth)
229
+
230
+ return sample
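
A worked example of the sizing rules above, assuming the MiDaS 'upper_bound' configuration (384 x 384 target, keep_aspect_ratio=True, ensure_multiple_of=32):

import cv2
# from annotator.midas.transforms import Resize  (defined above)

resize = Resize(384, 384, resize_target=False, keep_aspect_ratio=True,
                ensure_multiple_of=32, resize_method='upper_bound',
                image_interpolation_method=cv2.INTER_CUBIC)

# 640 x 480 input: scale_width = 384/640 = 0.6 < scale_height = 384/480 = 0.8, so width is fitted.
# new_width  = round(0.6 * 640 / 32) * 32 = 384
# new_height = round(0.6 * 480 / 32) * 32 = 288
print(resize.get_size(640, 480))  # (384, 288)
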
annotator/midas/utils.py ADDED
@@ -0,0 +1,192 @@
1
+ # -*- coding: utf-8 -*-
2
+ """Utils for monoDepth."""
3
+ import re
4
+ import sys
5
+
6
+ import cv2
7
+ import numpy as np
8
+ import torch
9
+
10
+
11
+ def read_pfm(path):
12
+ """Read pfm file.
13
+
14
+ Args:
15
+ path (str): path to file
16
+
17
+ Returns:
18
+ tuple: (data, scale)
19
+ """
20
+ with open(path, 'rb') as file:
21
+
22
+ color = None
23
+ width = None
24
+ height = None
25
+ scale = None
26
+ endian = None
27
+
28
+ header = file.readline().rstrip()
29
+ if header.decode('ascii') == 'PF':
30
+ color = True
31
+ elif header.decode('ascii') == 'Pf':
32
+ color = False
33
+ else:
34
+ raise Exception('Not a PFM file: ' + path)
35
+
36
+ dim_match = re.match(r'^(\d+)\s(\d+)\s$',
37
+ file.readline().decode('ascii'))
38
+ if dim_match:
39
+ width, height = list(map(int, dim_match.groups()))
40
+ else:
41
+ raise Exception('Malformed PFM header.')
42
+
43
+ scale = float(file.readline().decode('ascii').rstrip())
44
+ if scale < 0:
45
+ # little-endian
46
+ endian = '<'
47
+ scale = -scale
48
+ else:
49
+ # big-endian
50
+ endian = '>'
51
+
52
+ data = np.fromfile(file, endian + 'f')
53
+ shape = (height, width, 3) if color else (height, width)
54
+
55
+ data = np.reshape(data, shape)
56
+ data = np.flipud(data)
57
+
58
+ return data, scale
59
+
60
+
61
+ def write_pfm(path, image, scale=1):
62
+ """Write pfm file.
63
+
64
+ Args:
65
+ path (str): path to file
66
+ image (array): data
67
+ scale (int, optional): Scale. Defaults to 1.
68
+ """
69
+
70
+ with open(path, 'wb') as file:
71
+ color = None
72
+
73
+ if image.dtype.name != 'float32':
74
+ raise Exception('Image dtype must be float32.')
75
+
76
+ image = np.flipud(image)
77
+
78
+ if len(image.shape) == 3 and image.shape[2] == 3: # color image
79
+ color = True
80
+ elif (len(image.shape) == 2
81
+ or len(image.shape) == 3 and image.shape[2] == 1): # greyscale
82
+ color = False
83
+ else:
84
+ raise Exception(
85
+ 'Image must have H x W x 3, H x W x 1 or H x W dimensions.')
86
+
87
+ file.write('PF\n' if color else 'Pf\n'.encode())
88
+ file.write('%d %d\n'.encode() % (image.shape[1], image.shape[0]))
89
+
90
+ endian = image.dtype.byteorder
91
+
92
+ if endian == '<' or endian == '=' and sys.byteorder == 'little':
93
+ scale = -scale
94
+
95
+ file.write('%f\n'.encode() % scale)
96
+
97
+ image.tofile(file)
98
+
99
+
100
+ def read_image(path):
101
+ """Read image and output RGB image (0-1).
102
+
103
+ Args:
104
+ path (str): path to file
105
+
106
+ Returns:
107
+ array: RGB image (0-1)
108
+ """
109
+ img = cv2.imread(path)
110
+
111
+ if img.ndim == 2:
112
+ img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
113
+
114
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0
115
+
116
+ return img
117
+
118
+
119
+ def resize_image(img):
120
+ """Resize image and make it fit for network.
121
+
122
+ Args:
123
+ img (array): image
124
+
125
+ Returns:
126
+ tensor: data ready for network
127
+ """
128
+ height_orig = img.shape[0]
129
+ width_orig = img.shape[1]
130
+
131
+ if width_orig > height_orig:
132
+ scale = width_orig / 384
133
+ else:
134
+ scale = height_orig / 384
135
+
136
+ height = (np.ceil(height_orig / scale / 32) * 32).astype(int)
137
+ width = (np.ceil(width_orig / scale / 32) * 32).astype(int)
138
+
139
+ img_resized = cv2.resize(img, (width, height),
140
+ interpolation=cv2.INTER_AREA)
141
+
142
+ img_resized = (torch.from_numpy(np.transpose(
143
+ img_resized, (2, 0, 1))).contiguous().float())
144
+ img_resized = img_resized.unsqueeze(0)
145
+
146
+ return img_resized
147
+
148
+
149
+ def resize_depth(depth, width, height):
150
+ """Resize depth map and bring to CPU (numpy).
151
+
152
+ Args:
153
+ depth (tensor): depth
154
+ width (int): image width
155
+ height (int): image height
156
+
157
+ Returns:
158
+ array: processed depth
159
+ """
160
+ depth = torch.squeeze(depth[0, :, :, :]).to('cpu')
161
+
162
+ depth_resized = cv2.resize(depth.numpy(), (width, height),
163
+ interpolation=cv2.INTER_CUBIC)
164
+
165
+ return depth_resized
166
+
167
+
168
+ def write_depth(path, depth, bits=1):
169
+ """Write depth map to pfm and png file.
170
+
171
+ Args:
172
+ path (str): filepath without extension
173
+ depth (array): depth
174
+ """
175
+ write_pfm(path + '.pfm', depth.astype(np.float32))
176
+
177
+ depth_min = depth.min()
178
+ depth_max = depth.max()
179
+
180
+ max_val = (2**(8 * bits)) - 1
181
+
182
+ if depth_max - depth_min > np.finfo('float').eps:
183
+ out = max_val * (depth - depth_min) / (depth_max - depth_min)
184
+ else:
185
+             out = np.zeros(depth.shape, dtype=depth.dtype)
186
+
187
+ if bits == 1:
188
+ cv2.imwrite(path + '.png', out.astype('uint8'))
189
+ elif bits == 2:
190
+ cv2.imwrite(path + '.png', out.astype('uint16'))
191
+
192
+ return
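As a reference for how these helpers fit together, here is a small sketch (not part of this commit) of the same min-max normalization write_depth() applies before quantizing a depth map, written out for the 16-bit PNG case (bits=2) with a random stand-in prediction.

    import cv2
    import numpy as np

    depth = np.random.rand(240, 320).astype(np.float32)  # stand-in for a MiDaS prediction

    depth_min, depth_max = depth.min(), depth.max()
    max_val = (2**(8 * 2)) - 1  # 65535 for 16-bit output

    if depth_max - depth_min > np.finfo('float').eps:
        out = max_val * (depth - depth_min) / (depth_max - depth_min)
    else:
        out = np.zeros(depth.shape, dtype=depth.dtype)

    cv2.imwrite('depth_sketch.png', out.astype('uint16'))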
annotator/midas/vit.py ADDED
@@ -0,0 +1,509 @@
1
+ # -*- coding: utf-8 -*-
2
+ import math
3
+ import types
4
+
5
+ import timm
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+
11
+ class Slice(nn.Module):
12
+ def __init__(self, start_index=1):
13
+ super(Slice, self).__init__()
14
+ self.start_index = start_index
15
+
16
+ def forward(self, x):
17
+ return x[:, self.start_index:]
18
+
19
+
20
+ class AddReadout(nn.Module):
21
+ def __init__(self, start_index=1):
22
+ super(AddReadout, self).__init__()
23
+ self.start_index = start_index
24
+
25
+ def forward(self, x):
26
+ if self.start_index == 2:
27
+ readout = (x[:, 0] + x[:, 1]) / 2
28
+ else:
29
+ readout = x[:, 0]
30
+ return x[:, self.start_index:] + readout.unsqueeze(1)
31
+
32
+
33
+ class ProjectReadout(nn.Module):
34
+ def __init__(self, in_features, start_index=1):
35
+ super(ProjectReadout, self).__init__()
36
+ self.start_index = start_index
37
+
38
+ self.project = nn.Sequential(nn.Linear(2 * in_features, in_features),
39
+ nn.GELU())
40
+
41
+ def forward(self, x):
42
+ readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index:])
43
+ features = torch.cat((x[:, self.start_index:], readout), -1)
44
+
45
+ return self.project(features)
46
+
47
+
48
+ class Transpose(nn.Module):
49
+ def __init__(self, dim0, dim1):
50
+ super(Transpose, self).__init__()
51
+ self.dim0 = dim0
52
+ self.dim1 = dim1
53
+
54
+ def forward(self, x):
55
+ x = x.transpose(self.dim0, self.dim1)
56
+ return x
57
+
58
+
59
+ def forward_vit(pretrained, x):
60
+ b, c, h, w = x.shape
61
+
62
+ _ = pretrained.model.forward_flex(x)
63
+
64
+ layer_1 = pretrained.activations['1']
65
+ layer_2 = pretrained.activations['2']
66
+ layer_3 = pretrained.activations['3']
67
+ layer_4 = pretrained.activations['4']
68
+
69
+ layer_1 = pretrained.act_postprocess1[0:2](layer_1)
70
+ layer_2 = pretrained.act_postprocess2[0:2](layer_2)
71
+ layer_3 = pretrained.act_postprocess3[0:2](layer_3)
72
+ layer_4 = pretrained.act_postprocess4[0:2](layer_4)
73
+
74
+ unflatten = nn.Sequential(
75
+ nn.Unflatten(
76
+ 2,
77
+ torch.Size([
78
+ h // pretrained.model.patch_size[1],
79
+ w // pretrained.model.patch_size[0],
80
+ ]),
81
+ ))
82
+
83
+ if layer_1.ndim == 3:
84
+ layer_1 = unflatten(layer_1)
85
+ if layer_2.ndim == 3:
86
+ layer_2 = unflatten(layer_2)
87
+ if layer_3.ndim == 3:
88
+ layer_3 = unflatten(layer_3)
89
+ if layer_4.ndim == 3:
90
+ layer_4 = unflatten(layer_4)
91
+
92
+ layer_1 = pretrained.act_postprocess1[3:len(pretrained.act_postprocess1)](
93
+ layer_1)
94
+ layer_2 = pretrained.act_postprocess2[3:len(pretrained.act_postprocess2)](
95
+ layer_2)
96
+ layer_3 = pretrained.act_postprocess3[3:len(pretrained.act_postprocess3)](
97
+ layer_3)
98
+ layer_4 = pretrained.act_postprocess4[3:len(pretrained.act_postprocess4)](
99
+ layer_4)
100
+
101
+ return layer_1, layer_2, layer_3, layer_4
102
+
103
+
104
+ def _resize_pos_embed(self, posemb, gs_h, gs_w):
105
+ posemb_tok, posemb_grid = (
106
+ posemb[:, :self.start_index],
107
+ posemb[0, self.start_index:],
108
+ )
109
+
110
+ gs_old = int(math.sqrt(len(posemb_grid)))
111
+
112
+ posemb_grid = posemb_grid.reshape(1, gs_old, gs_old,
113
+ -1).permute(0, 3, 1, 2)
114
+ posemb_grid = F.interpolate(posemb_grid,
115
+ size=(gs_h, gs_w),
116
+ mode='bilinear')
117
+ posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)
118
+
119
+ posemb = torch.cat([posemb_tok, posemb_grid], dim=1)
120
+
121
+ return posemb
122
+
123
+
124
+ def forward_flex(self, x):
125
+ b, c, h, w = x.shape
126
+
127
+ pos_embed = self._resize_pos_embed(self.pos_embed, h // self.patch_size[1],
128
+ w // self.patch_size[0])
129
+
130
+ B = x.shape[0]
131
+
132
+ if hasattr(self.patch_embed, 'backbone'):
133
+ x = self.patch_embed.backbone(x)
134
+ if isinstance(x, (list, tuple)):
135
+ x = x[
136
+ -1] # last feature if backbone outputs list/tuple of features
137
+
138
+ x = self.patch_embed.proj(x).flatten(2).transpose(1, 2)
139
+
140
+ if getattr(self, 'dist_token', None) is not None:
141
+ cls_tokens = self.cls_token.expand(
142
+ B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks
143
+ dist_token = self.dist_token.expand(B, -1, -1)
144
+ x = torch.cat((cls_tokens, dist_token, x), dim=1)
145
+ else:
146
+ cls_tokens = self.cls_token.expand(
147
+ B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks
148
+ x = torch.cat((cls_tokens, x), dim=1)
149
+
150
+ x = x + pos_embed
151
+ x = self.pos_drop(x)
152
+
153
+ for blk in self.blocks:
154
+ x = blk(x)
155
+
156
+ x = self.norm(x)
157
+
158
+ return x
159
+
160
+
161
+ activations = {}
162
+
163
+
164
+ def get_activation(name):
165
+ def hook(model, input, output):
166
+ activations[name] = output
167
+
168
+ return hook
169
+
170
+
171
+ def get_readout_oper(vit_features, features, use_readout, start_index=1):
172
+ if use_readout == 'ignore':
173
+ readout_oper = [Slice(start_index)] * len(features)
174
+ elif use_readout == 'add':
175
+ readout_oper = [AddReadout(start_index)] * len(features)
176
+ elif use_readout == 'project':
177
+ readout_oper = [
178
+ ProjectReadout(vit_features, start_index) for out_feat in features
179
+ ]
180
+ else:
181
+ assert (
182
+ False
183
+ ), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'"
184
+
185
+ return readout_oper
186
+
187
+
188
+ def _make_vit_b16_backbone(
189
+ model,
190
+ features=[96, 192, 384, 768],
191
+ size=[384, 384],
192
+ hooks=[2, 5, 8, 11],
193
+ vit_features=768,
194
+ use_readout='ignore',
195
+ start_index=1,
196
+ ):
197
+ pretrained = nn.Module()
198
+
199
+ pretrained.model = model
200
+ pretrained.model.blocks[hooks[0]].register_forward_hook(
201
+ get_activation('1'))
202
+ pretrained.model.blocks[hooks[1]].register_forward_hook(
203
+ get_activation('2'))
204
+ pretrained.model.blocks[hooks[2]].register_forward_hook(
205
+ get_activation('3'))
206
+ pretrained.model.blocks[hooks[3]].register_forward_hook(
207
+ get_activation('4'))
208
+
209
+ pretrained.activations = activations
210
+
211
+ readout_oper = get_readout_oper(vit_features, features, use_readout,
212
+ start_index)
213
+
214
+ # 32, 48, 136, 384
215
+ pretrained.act_postprocess1 = nn.Sequential(
216
+ readout_oper[0],
217
+ Transpose(1, 2),
218
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
219
+ nn.Conv2d(
220
+ in_channels=vit_features,
221
+ out_channels=features[0],
222
+ kernel_size=1,
223
+ stride=1,
224
+ padding=0,
225
+ ),
226
+ nn.ConvTranspose2d(
227
+ in_channels=features[0],
228
+ out_channels=features[0],
229
+ kernel_size=4,
230
+ stride=4,
231
+ padding=0,
232
+ bias=True,
233
+ dilation=1,
234
+ groups=1,
235
+ ),
236
+ )
237
+
238
+ pretrained.act_postprocess2 = nn.Sequential(
239
+ readout_oper[1],
240
+ Transpose(1, 2),
241
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
242
+ nn.Conv2d(
243
+ in_channels=vit_features,
244
+ out_channels=features[1],
245
+ kernel_size=1,
246
+ stride=1,
247
+ padding=0,
248
+ ),
249
+ nn.ConvTranspose2d(
250
+ in_channels=features[1],
251
+ out_channels=features[1],
252
+ kernel_size=2,
253
+ stride=2,
254
+ padding=0,
255
+ bias=True,
256
+ dilation=1,
257
+ groups=1,
258
+ ),
259
+ )
260
+
261
+ pretrained.act_postprocess3 = nn.Sequential(
262
+ readout_oper[2],
263
+ Transpose(1, 2),
264
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
265
+ nn.Conv2d(
266
+ in_channels=vit_features,
267
+ out_channels=features[2],
268
+ kernel_size=1,
269
+ stride=1,
270
+ padding=0,
271
+ ),
272
+ )
273
+
274
+ pretrained.act_postprocess4 = nn.Sequential(
275
+ readout_oper[3],
276
+ Transpose(1, 2),
277
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
278
+ nn.Conv2d(
279
+ in_channels=vit_features,
280
+ out_channels=features[3],
281
+ kernel_size=1,
282
+ stride=1,
283
+ padding=0,
284
+ ),
285
+ nn.Conv2d(
286
+ in_channels=features[3],
287
+ out_channels=features[3],
288
+ kernel_size=3,
289
+ stride=2,
290
+ padding=1,
291
+ ),
292
+ )
293
+
294
+ pretrained.model.start_index = start_index
295
+ pretrained.model.patch_size = [16, 16]
296
+
297
+ # We inject this function into the VisionTransformer instances so that
298
+ # we can use it with interpolated position embeddings without modifying the library source.
299
+ pretrained.model.forward_flex = types.MethodType(forward_flex,
300
+ pretrained.model)
301
+ pretrained.model._resize_pos_embed = types.MethodType(
302
+ _resize_pos_embed, pretrained.model)
303
+
304
+ return pretrained
305
+
306
+
307
+ def _make_pretrained_vitl16_384(pretrained, use_readout='ignore', hooks=None):
308
+ model = timm.create_model('vit_large_patch16_384', pretrained=pretrained)
309
+
310
+ hooks = [5, 11, 17, 23] if hooks is None else hooks
311
+ return _make_vit_b16_backbone(
312
+ model,
313
+ features=[256, 512, 1024, 1024],
314
+ hooks=hooks,
315
+ vit_features=1024,
316
+ use_readout=use_readout,
317
+ )
318
+
319
+
320
+ def _make_pretrained_vitb16_384(pretrained, use_readout='ignore', hooks=None):
321
+ model = timm.create_model('vit_base_patch16_384', pretrained=pretrained)
322
+
323
+ hooks = [2, 5, 8, 11] if hooks is None else hooks
324
+ return _make_vit_b16_backbone(model,
325
+ features=[96, 192, 384, 768],
326
+ hooks=hooks,
327
+ use_readout=use_readout)
328
+
329
+
330
+ def _make_pretrained_deitb16_384(pretrained, use_readout='ignore', hooks=None):
331
+ model = timm.create_model('vit_deit_base_patch16_384',
332
+ pretrained=pretrained)
333
+
334
+ hooks = [2, 5, 8, 11] if hooks is None else hooks
335
+ return _make_vit_b16_backbone(model,
336
+ features=[96, 192, 384, 768],
337
+ hooks=hooks,
338
+ use_readout=use_readout)
339
+
340
+
341
+ def _make_pretrained_deitb16_distil_384(pretrained,
342
+ use_readout='ignore',
343
+ hooks=None):
344
+ model = timm.create_model('vit_deit_base_distilled_patch16_384',
345
+ pretrained=pretrained)
346
+
347
+ hooks = [2, 5, 8, 11] if hooks is None else hooks
348
+ return _make_vit_b16_backbone(
349
+ model,
350
+ features=[96, 192, 384, 768],
351
+ hooks=hooks,
352
+ use_readout=use_readout,
353
+ start_index=2,
354
+ )
355
+
356
+
357
+ def _make_vit_b_rn50_backbone(
358
+ model,
359
+ features=[256, 512, 768, 768],
360
+ size=[384, 384],
361
+ hooks=[0, 1, 8, 11],
362
+ vit_features=768,
363
+ use_vit_only=False,
364
+ use_readout='ignore',
365
+ start_index=1,
366
+ ):
367
+ pretrained = nn.Module()
368
+
369
+ pretrained.model = model
370
+
371
+ if use_vit_only is True:
372
+ pretrained.model.blocks[hooks[0]].register_forward_hook(
373
+ get_activation('1'))
374
+ pretrained.model.blocks[hooks[1]].register_forward_hook(
375
+ get_activation('2'))
376
+ else:
377
+ pretrained.model.patch_embed.backbone.stages[0].register_forward_hook(
378
+ get_activation('1'))
379
+ pretrained.model.patch_embed.backbone.stages[1].register_forward_hook(
380
+ get_activation('2'))
381
+
382
+ pretrained.model.blocks[hooks[2]].register_forward_hook(
383
+ get_activation('3'))
384
+ pretrained.model.blocks[hooks[3]].register_forward_hook(
385
+ get_activation('4'))
386
+
387
+ pretrained.activations = activations
388
+
389
+ readout_oper = get_readout_oper(vit_features, features, use_readout,
390
+ start_index)
391
+
392
+ if use_vit_only is True:
393
+ pretrained.act_postprocess1 = nn.Sequential(
394
+ readout_oper[0],
395
+ Transpose(1, 2),
396
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
397
+ nn.Conv2d(
398
+ in_channels=vit_features,
399
+ out_channels=features[0],
400
+ kernel_size=1,
401
+ stride=1,
402
+ padding=0,
403
+ ),
404
+ nn.ConvTranspose2d(
405
+ in_channels=features[0],
406
+ out_channels=features[0],
407
+ kernel_size=4,
408
+ stride=4,
409
+ padding=0,
410
+ bias=True,
411
+ dilation=1,
412
+ groups=1,
413
+ ),
414
+ )
415
+
416
+ pretrained.act_postprocess2 = nn.Sequential(
417
+ readout_oper[1],
418
+ Transpose(1, 2),
419
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
420
+ nn.Conv2d(
421
+ in_channels=vit_features,
422
+ out_channels=features[1],
423
+ kernel_size=1,
424
+ stride=1,
425
+ padding=0,
426
+ ),
427
+ nn.ConvTranspose2d(
428
+ in_channels=features[1],
429
+ out_channels=features[1],
430
+ kernel_size=2,
431
+ stride=2,
432
+ padding=0,
433
+ bias=True,
434
+ dilation=1,
435
+ groups=1,
436
+ ),
437
+ )
438
+ else:
439
+ pretrained.act_postprocess1 = nn.Sequential(nn.Identity(),
440
+ nn.Identity(),
441
+ nn.Identity())
442
+ pretrained.act_postprocess2 = nn.Sequential(nn.Identity(),
443
+ nn.Identity(),
444
+ nn.Identity())
445
+
446
+ pretrained.act_postprocess3 = nn.Sequential(
447
+ readout_oper[2],
448
+ Transpose(1, 2),
449
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
450
+ nn.Conv2d(
451
+ in_channels=vit_features,
452
+ out_channels=features[2],
453
+ kernel_size=1,
454
+ stride=1,
455
+ padding=0,
456
+ ),
457
+ )
458
+
459
+ pretrained.act_postprocess4 = nn.Sequential(
460
+ readout_oper[3],
461
+ Transpose(1, 2),
462
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
463
+ nn.Conv2d(
464
+ in_channels=vit_features,
465
+ out_channels=features[3],
466
+ kernel_size=1,
467
+ stride=1,
468
+ padding=0,
469
+ ),
470
+ nn.Conv2d(
471
+ in_channels=features[3],
472
+ out_channels=features[3],
473
+ kernel_size=3,
474
+ stride=2,
475
+ padding=1,
476
+ ),
477
+ )
478
+
479
+ pretrained.model.start_index = start_index
480
+ pretrained.model.patch_size = [16, 16]
481
+
482
+ # We inject this function into the VisionTransformer instances so that
483
+ # we can use it with interpolated position embeddings without modifying the library source.
484
+ pretrained.model.forward_flex = types.MethodType(forward_flex,
485
+ pretrained.model)
486
+
487
+ # We inject this function into the VisionTransformer instances so that
488
+ # we can use it with interpolated position embeddings without modifying the library source.
489
+ pretrained.model._resize_pos_embed = types.MethodType(
490
+ _resize_pos_embed, pretrained.model)
491
+
492
+ return pretrained
493
+
494
+
495
+ def _make_pretrained_vitb_rn50_384(pretrained,
496
+ use_readout='ignore',
497
+ hooks=None,
498
+ use_vit_only=False):
499
+ model = timm.create_model('vit_base_resnet50_384', pretrained=pretrained)
500
+
501
+ hooks = [0, 1, 8, 11] if hooks is None else hooks
502
+ return _make_vit_b_rn50_backbone(
503
+ model,
504
+ features=[256, 512, 768, 768],
505
+ size=[384, 384],
506
+ hooks=hooks,
507
+ use_vit_only=use_vit_only,
508
+ use_readout=use_readout,
509
+ )
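The forward_flex/_resize_pos_embed functions above are attached to the timm model at runtime with types.MethodType rather than by subclassing. A minimal sketch of that injection pattern (not part of this commit), shown on a toy module:

    import types

    import torch
    import torch.nn as nn


    def scaled_forward(self, x, scale=2.0):
        # Bound at runtime, so it behaves like a regular method with access to self.
        return self.linear(x) * scale


    model = nn.Module()
    model.linear = nn.Linear(4, 4)
    model.scaled_forward = types.MethodType(scaled_forward, model)

    print(model.scaled_forward(torch.randn(1, 4)).shape)  # torch.Size([1, 4])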
annotator/midas_op.py ADDED
@@ -0,0 +1,79 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ # MiDaS Depth Estimation
4
+ # From https://github.com/isl-org/MiDaS
5
+ # MIT LICENSE
6
+ from abc import ABCMeta
7
+
8
+ import numpy as np
9
+ import torch
10
+ from einops import rearrange
11
+ from PIL import Image
12
+
13
+ from scepter.modules.annotator.base_annotator import BaseAnnotator
14
+ from scepter.modules.annotator.midas.api import MiDaSInference
15
+ from scepter.modules.annotator.registry import ANNOTATORS
16
+ from scepter.modules.annotator.utils import resize_image, resize_image_ori
17
+ from scepter.modules.utils.config import dict_to_yaml
18
+ from scepter.modules.utils.distribute import we
19
+ from scepter.modules.utils.file_system import FS
20
+
21
+
22
+ @ANNOTATORS.register_class()
23
+ class MidasDetector(BaseAnnotator, metaclass=ABCMeta):
24
+ def __init__(self, cfg, logger=None):
25
+ super().__init__(cfg, logger=logger)
26
+ pretrained_model = cfg.get('PRETRAINED_MODEL', None)
27
+ if pretrained_model:
28
+ with FS.get_from(pretrained_model, wait_finish=True) as local_path:
29
+ self.model = MiDaSInference(model_type='dpt_hybrid',
30
+ model_path=local_path)
31
+ self.a = cfg.get('A', np.pi * 2.0)
32
+ self.bg_th = cfg.get('BG_TH', 0.1)
33
+
34
+ @torch.no_grad()
35
+ @torch.inference_mode()
36
+ @torch.autocast('cuda', enabled=False)
37
+ def forward(self, image):
38
+ if isinstance(image, Image.Image):
39
+ image = np.array(image)
40
+ elif isinstance(image, torch.Tensor):
41
+ image = image.detach().cpu().numpy()
42
+ elif isinstance(image, np.ndarray):
43
+ image = image.copy()
44
+ else:
45
+             raise TypeError(f'Unsupported datatype {type(image)}; only np.ndarray, torch.Tensor and PIL Image are supported.')
46
+ image_depth = image
47
+ h, w, c = image.shape
48
+ image_depth, k = resize_image(image_depth,
49
+ 1024 if min(h, w) > 1024 else min(h, w))
50
+ image_depth = torch.from_numpy(image_depth).float().to(we.device_id)
51
+ image_depth = image_depth / 127.5 - 1.0
52
+ image_depth = rearrange(image_depth, 'h w c -> 1 c h w')
53
+ depth = self.model(image_depth)[0]
54
+
55
+ depth_pt = depth.clone()
56
+ depth_pt -= torch.min(depth_pt)
57
+ depth_pt /= torch.max(depth_pt)
58
+ depth_pt = depth_pt.cpu().numpy()
59
+ depth_image = (depth_pt * 255.0).clip(0, 255).astype(np.uint8)
60
+ depth_image = depth_image[..., None].repeat(3, 2)
61
+
62
+ # depth_np = depth.cpu().numpy() # float16 error
63
+ # x = cv2.Sobel(depth_np, cv2.CV_32F, 1, 0, ksize=3)
64
+ # y = cv2.Sobel(depth_np, cv2.CV_32F, 0, 1, ksize=3)
65
+ # z = np.ones_like(x) * self.a
66
+ # x[depth_pt < self.bg_th] = 0
67
+ # y[depth_pt < self.bg_th] = 0
68
+ # normal = np.stack([x, y, z], axis=2)
69
+ # normal /= np.sum(normal**2.0, axis=2, keepdims=True)**0.5
70
+ # normal_image = (normal * 127.5 + 127.5).clip(0, 255).astype(np.uint8)
71
+ depth_image = resize_image_ori(h, w, depth_image, k)
72
+ return depth_image
73
+
74
+ @staticmethod
75
+ def get_config_template():
76
+ return dict_to_yaml('ANNOTATORS',
77
+ __class__.__name__,
78
+ MidasDetector.para_dict,
79
+ set_name=True)
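A short sketch (not part of this commit) of the post-processing step in MidasDetector.forward: the raw depth prediction is min-max normalized, scaled to uint8 and replicated to three channels. A random tensor stands in for the model output.

    import numpy as np
    import torch

    depth = torch.rand(480, 640)  # stand-in for self.model(image_depth)[0]

    depth_pt = depth.clone()
    depth_pt -= torch.min(depth_pt)
    depth_pt /= torch.max(depth_pt)
    depth_np = depth_pt.cpu().numpy()

    depth_image = (depth_np * 255.0).clip(0, 255).astype(np.uint8)
    depth_image = depth_image[..., None].repeat(3, 2)  # H x W x 3

    print(depth_image.shape, depth_image.dtype)  # (480, 640, 3) uint8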
annotator/mlsd/__init__.py ADDED
File without changes
annotator/mlsd/mbv2_mlsd_large.py ADDED
@@ -0,0 +1,303 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.utils.model_zoo as model_zoo
6
+ from torch.nn import functional as F
7
+
8
+
9
+ class BlockTypeA(nn.Module):
10
+ def __init__(self, in_c1, in_c2, out_c1, out_c2, upscale=True):
11
+ super(BlockTypeA, self).__init__()
12
+ self.conv1 = nn.Sequential(nn.Conv2d(in_c2, out_c2, kernel_size=1),
13
+ nn.BatchNorm2d(out_c2),
14
+ nn.ReLU(inplace=True))
15
+ self.conv2 = nn.Sequential(nn.Conv2d(in_c1, out_c1, kernel_size=1),
16
+ nn.BatchNorm2d(out_c1),
17
+ nn.ReLU(inplace=True))
18
+ self.upscale = upscale
19
+
20
+ def forward(self, a, b):
21
+ b = self.conv1(b)
22
+ a = self.conv2(a)
23
+ if self.upscale:
24
+ b = F.interpolate(b,
25
+ scale_factor=2.0,
26
+ mode='bilinear',
27
+ align_corners=True)
28
+ return torch.cat((a, b), dim=1)
29
+
30
+
31
+ class BlockTypeB(nn.Module):
32
+ def __init__(self, in_c, out_c):
33
+ super(BlockTypeB, self).__init__()
34
+ self.conv1 = nn.Sequential(
35
+ nn.Conv2d(in_c, in_c, kernel_size=3, padding=1),
36
+ nn.BatchNorm2d(in_c), nn.ReLU())
37
+ self.conv2 = nn.Sequential(
38
+ nn.Conv2d(in_c, out_c, kernel_size=3, padding=1),
39
+ nn.BatchNorm2d(out_c), nn.ReLU())
40
+
41
+ def forward(self, x):
42
+ x = self.conv1(x) + x
43
+ x = self.conv2(x)
44
+ return x
45
+
46
+
47
+ class BlockTypeC(nn.Module):
48
+ def __init__(self, in_c, out_c):
49
+ super(BlockTypeC, self).__init__()
50
+ self.conv1 = nn.Sequential(
51
+ nn.Conv2d(in_c, in_c, kernel_size=3, padding=5, dilation=5),
52
+ nn.BatchNorm2d(in_c), nn.ReLU())
53
+ self.conv2 = nn.Sequential(
54
+ nn.Conv2d(in_c, in_c, kernel_size=3, padding=1),
55
+ nn.BatchNorm2d(in_c), nn.ReLU())
56
+ self.conv3 = nn.Conv2d(in_c, out_c, kernel_size=1)
57
+
58
+ def forward(self, x):
59
+ x = self.conv1(x)
60
+ x = self.conv2(x)
61
+ x = self.conv3(x)
62
+ return x
63
+
64
+
65
+ def _make_divisible(v, divisor, min_value=None):
66
+ """
67
+ This function is taken from the original tf repo.
68
+ It ensures that all layers have a channel number that is divisible by 8
69
+ It can be seen here:
70
+ https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
71
+ :param v:
72
+ :param divisor:
73
+ :param min_value:
74
+ :return:
75
+ """
76
+ if min_value is None:
77
+ min_value = divisor
78
+ new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
79
+ # Make sure that round down does not go down by more than 10%.
80
+ if new_v < 0.9 * v:
81
+ new_v += divisor
82
+ return new_v
83
+
84
+
85
+ class ConvBNReLU(nn.Sequential):
86
+ def __init__(self,
87
+ in_planes,
88
+ out_planes,
89
+ kernel_size=3,
90
+ stride=1,
91
+ groups=1):
92
+ self.channel_pad = out_planes - in_planes
93
+ self.stride = stride
94
+ # padding = (kernel_size - 1) // 2
95
+
96
+ # TFLite uses slightly different padding than PyTorch
97
+ if stride == 2:
98
+ padding = 0
99
+ else:
100
+ padding = (kernel_size - 1) // 2
101
+
102
+ super(ConvBNReLU, self).__init__(
103
+ nn.Conv2d(in_planes,
104
+ out_planes,
105
+ kernel_size,
106
+ stride,
107
+ padding,
108
+ groups=groups,
109
+ bias=False), nn.BatchNorm2d(out_planes),
110
+ nn.ReLU6(inplace=True))
111
+ self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
112
+
113
+ def forward(self, x):
114
+ # TFLite uses different padding
115
+ if self.stride == 2:
116
+ x = F.pad(x, (0, 1, 0, 1), 'constant', 0)
117
+ # print(x.shape)
118
+
119
+ for module in self:
120
+ if not isinstance(module, nn.MaxPool2d):
121
+ x = module(x)
122
+ return x
123
+
124
+
125
+ class InvertedResidual(nn.Module):
126
+ def __init__(self, inp, oup, stride, expand_ratio):
127
+ super(InvertedResidual, self).__init__()
128
+ self.stride = stride
129
+ assert stride in [1, 2]
130
+
131
+ hidden_dim = int(round(inp * expand_ratio))
132
+ self.use_res_connect = self.stride == 1 and inp == oup
133
+
134
+ layers = []
135
+ if expand_ratio != 1:
136
+ # pw
137
+ layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
138
+ layers.extend([
139
+ # dw
140
+ ConvBNReLU(hidden_dim,
141
+ hidden_dim,
142
+ stride=stride,
143
+ groups=hidden_dim),
144
+ # pw-linear
145
+ nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
146
+ nn.BatchNorm2d(oup),
147
+ ])
148
+ self.conv = nn.Sequential(*layers)
149
+
150
+ def forward(self, x):
151
+ if self.use_res_connect:
152
+ return x + self.conv(x)
153
+ else:
154
+ return self.conv(x)
155
+
156
+
157
+ class MobileNetV2(nn.Module):
158
+ def __init__(self, pretrained=True):
159
+ """
160
+ MobileNet V2 main class
161
+ Args:
162
+ num_classes (int): Number of classes
163
+ width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount
164
+ inverted_residual_setting: Network structure
165
+ round_nearest (int): Round the number of channels in each layer to be a multiple of this number
166
+ Set to 1 to turn off rounding
167
+ block: Module specifying inverted residual building block for mobilenet
168
+ """
169
+ super(MobileNetV2, self).__init__()
170
+
171
+ block = InvertedResidual
172
+ input_channel = 32
173
+ last_channel = 1280
174
+ width_mult = 1.0
175
+ round_nearest = 8
176
+
177
+ inverted_residual_setting = [
178
+ # t, c, n, s
179
+ [1, 16, 1, 1],
180
+ [6, 24, 2, 2],
181
+ [6, 32, 3, 2],
182
+ [6, 64, 4, 2],
183
+ [6, 96, 3, 1],
184
+ # [6, 160, 3, 2],
185
+ # [6, 320, 1, 1],
186
+ ]
187
+
188
+ # only check the first element, assuming user knows t,c,n,s are required
189
+ if len(inverted_residual_setting) == 0 or len(
190
+ inverted_residual_setting[0]) != 4:
191
+             raise ValueError('inverted_residual_setting should be non-empty '
192
+                              'and each element must be a 4-element list, '
193
+                              'got {}'.format(inverted_residual_setting))
194
+
195
+ # building first layer
196
+ input_channel = _make_divisible(input_channel * width_mult,
197
+ round_nearest)
198
+ self.last_channel = _make_divisible(
199
+ last_channel * max(1.0, width_mult), round_nearest)
200
+ features = [ConvBNReLU(4, input_channel, stride=2)]
201
+ # building inverted residual blocks
202
+ for t, c, n, s in inverted_residual_setting:
203
+ output_channel = _make_divisible(c * width_mult, round_nearest)
204
+ for i in range(n):
205
+ stride = s if i == 0 else 1
206
+ features.append(
207
+ block(input_channel,
208
+ output_channel,
209
+ stride,
210
+ expand_ratio=t))
211
+ input_channel = output_channel
212
+
213
+ self.features = nn.Sequential(*features)
214
+ self.fpn_selected = [1, 3, 6, 10, 13]
215
+ # weight initialization
216
+ for m in self.modules():
217
+ if isinstance(m, nn.Conv2d):
218
+ nn.init.kaiming_normal_(m.weight, mode='fan_out')
219
+ if m.bias is not None:
220
+ nn.init.zeros_(m.bias)
221
+ elif isinstance(m, nn.BatchNorm2d):
222
+ nn.init.ones_(m.weight)
223
+ nn.init.zeros_(m.bias)
224
+ elif isinstance(m, nn.Linear):
225
+ nn.init.normal_(m.weight, 0, 0.01)
226
+ nn.init.zeros_(m.bias)
227
+ if pretrained:
228
+ self._load_pretrained_model()
229
+
230
+ def _forward_impl(self, x):
231
+ # This exists since TorchScript doesn't support inheritance, so the superclass method
232
+ # (this one) needs to have a name other than `forward` that can be accessed in a subclass
233
+ fpn_features = []
234
+ for i, f in enumerate(self.features):
235
+ if i > self.fpn_selected[-1]:
236
+ break
237
+ x = f(x)
238
+ if i in self.fpn_selected:
239
+ fpn_features.append(x)
240
+
241
+ c1, c2, c3, c4, c5 = fpn_features
242
+ return c1, c2, c3, c4, c5
243
+
244
+ def forward(self, x):
245
+ return self._forward_impl(x)
246
+
247
+ def _load_pretrained_model(self):
248
+ pretrain_dict = model_zoo.load_url(
249
+ 'https://download.pytorch.org/models/mobilenet_v2-b0353104.pth')
250
+ model_dict = {}
251
+ state_dict = self.state_dict()
252
+ for k, v in pretrain_dict.items():
253
+ if k in state_dict:
254
+ model_dict[k] = v
255
+ state_dict.update(model_dict)
256
+ self.load_state_dict(state_dict)
257
+
258
+
259
+ class MobileV2_MLSD_Large(nn.Module):
260
+ def __init__(self):
261
+ super(MobileV2_MLSD_Large, self).__init__()
262
+
263
+ self.backbone = MobileNetV2(pretrained=False)
264
+ # A, B
265
+ self.block15 = BlockTypeA(in_c1=64,
266
+ in_c2=96,
267
+ out_c1=64,
268
+ out_c2=64,
269
+ upscale=False)
270
+ self.block16 = BlockTypeB(128, 64)
271
+
272
+ # A, B
273
+ self.block17 = BlockTypeA(in_c1=32, in_c2=64, out_c1=64, out_c2=64)
274
+ self.block18 = BlockTypeB(128, 64)
275
+
276
+ # A, B
277
+ self.block19 = BlockTypeA(in_c1=24, in_c2=64, out_c1=64, out_c2=64)
278
+ self.block20 = BlockTypeB(128, 64)
279
+
280
+ # A, B, C
281
+ self.block21 = BlockTypeA(in_c1=16, in_c2=64, out_c1=64, out_c2=64)
282
+ self.block22 = BlockTypeB(128, 64)
283
+
284
+ self.block23 = BlockTypeC(64, 16)
285
+
286
+ def forward(self, x):
287
+ c1, c2, c3, c4, c5 = self.backbone(x)
288
+
289
+ x = self.block15(c4, c5)
290
+ x = self.block16(x)
291
+
292
+ x = self.block17(c3, x)
293
+ x = self.block18(x)
294
+
295
+ x = self.block19(c2, x)
296
+ x = self.block20(x)
297
+
298
+ x = self.block21(c1, x)
299
+ x = self.block22(x)
300
+ x = self.block23(x)
301
+ x = x[:, 7:, :, :]
302
+
303
+ return x
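A few worked values (not part of this commit) for the _make_divisible helper above, which rounds channel counts to a multiple of the divisor while never losing more than 10% of the original width; the import path assumes the repository root is on PYTHONPATH.

    from annotator.mlsd.mbv2_mlsd_large import _make_divisible

    print(_make_divisible(32, 8))         # 32: already a multiple of 8
    print(_make_divisible(21, 8))         # 24: rounds up to the nearest multiple
    print(_make_divisible(35, 8))         # 32: rounds down, still within 10% of 35
    print(_make_divisible(16 * 0.65, 8))  # 16: 8 would lose more than 10%, so it is bumped up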
annotator/mlsd/mbv2_mlsd_tiny.py ADDED
@@ -0,0 +1,287 @@
1
+ # -*- coding: utf-8 -*-
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.utils.model_zoo as model_zoo
5
+ from torch.nn import functional as F
6
+
7
+
8
+ class BlockTypeA(nn.Module):
9
+ def __init__(self, in_c1, in_c2, out_c1, out_c2, upscale=True):
10
+ super(BlockTypeA, self).__init__()
11
+ self.conv1 = nn.Sequential(nn.Conv2d(in_c2, out_c2, kernel_size=1),
12
+ nn.BatchNorm2d(out_c2),
13
+ nn.ReLU(inplace=True))
14
+ self.conv2 = nn.Sequential(nn.Conv2d(in_c1, out_c1, kernel_size=1),
15
+ nn.BatchNorm2d(out_c1),
16
+ nn.ReLU(inplace=True))
17
+ self.upscale = upscale
18
+
19
+ def forward(self, a, b):
20
+ b = self.conv1(b)
21
+ a = self.conv2(a)
22
+ b = F.interpolate(b,
23
+ scale_factor=2.0,
24
+ mode='bilinear',
25
+ align_corners=True)
26
+ return torch.cat((a, b), dim=1)
27
+
28
+
29
+ class BlockTypeB(nn.Module):
30
+ def __init__(self, in_c, out_c):
31
+ super(BlockTypeB, self).__init__()
32
+ self.conv1 = nn.Sequential(
33
+ nn.Conv2d(in_c, in_c, kernel_size=3, padding=1),
34
+ nn.BatchNorm2d(in_c), nn.ReLU())
35
+ self.conv2 = nn.Sequential(
36
+ nn.Conv2d(in_c, out_c, kernel_size=3, padding=1),
37
+ nn.BatchNorm2d(out_c), nn.ReLU())
38
+
39
+ def forward(self, x):
40
+ x = self.conv1(x) + x
41
+ x = self.conv2(x)
42
+ return x
43
+
44
+
45
+ class BlockTypeC(nn.Module):
46
+ def __init__(self, in_c, out_c):
47
+ super(BlockTypeC, self).__init__()
48
+ self.conv1 = nn.Sequential(
49
+ nn.Conv2d(in_c, in_c, kernel_size=3, padding=5, dilation=5),
50
+ nn.BatchNorm2d(in_c), nn.ReLU())
51
+ self.conv2 = nn.Sequential(
52
+ nn.Conv2d(in_c, in_c, kernel_size=3, padding=1),
53
+ nn.BatchNorm2d(in_c), nn.ReLU())
54
+ self.conv3 = nn.Conv2d(in_c, out_c, kernel_size=1)
55
+
56
+ def forward(self, x):
57
+ x = self.conv1(x)
58
+ x = self.conv2(x)
59
+ x = self.conv3(x)
60
+ return x
61
+
62
+
63
+ def _make_divisible(v, divisor, min_value=None):
64
+ """
65
+ This function is taken from the original tf repo.
66
+ It ensures that all layers have a channel number that is divisible by 8
67
+ It can be seen here:
68
+ https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
69
+ :param v:
70
+ :param divisor:
71
+ :param min_value:
72
+ :return:
73
+ """
74
+ if min_value is None:
75
+ min_value = divisor
76
+ new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
77
+ # Make sure that round down does not go down by more than 10%.
78
+ if new_v < 0.9 * v:
79
+ new_v += divisor
80
+ return new_v
81
+
82
+
83
+ class ConvBNReLU(nn.Sequential):
84
+ def __init__(self,
85
+ in_planes,
86
+ out_planes,
87
+ kernel_size=3,
88
+ stride=1,
89
+ groups=1):
90
+ self.channel_pad = out_planes - in_planes
91
+ self.stride = stride
92
+ # padding = (kernel_size - 1) // 2
93
+
94
+ # TFLite uses slightly different padding than PyTorch
95
+ if stride == 2:
96
+ padding = 0
97
+ else:
98
+ padding = (kernel_size - 1) // 2
99
+
100
+ super(ConvBNReLU, self).__init__(
101
+ nn.Conv2d(in_planes,
102
+ out_planes,
103
+ kernel_size,
104
+ stride,
105
+ padding,
106
+ groups=groups,
107
+ bias=False), nn.BatchNorm2d(out_planes),
108
+ nn.ReLU6(inplace=True))
109
+ self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
110
+
111
+ def forward(self, x):
112
+ # TFLite uses different padding
113
+ if self.stride == 2:
114
+ x = F.pad(x, (0, 1, 0, 1), 'constant', 0)
115
+ # print(x.shape)
116
+
117
+ for module in self:
118
+ if not isinstance(module, nn.MaxPool2d):
119
+ x = module(x)
120
+ return x
121
+
122
+
123
+ class InvertedResidual(nn.Module):
124
+ def __init__(self, inp, oup, stride, expand_ratio):
125
+ super(InvertedResidual, self).__init__()
126
+ self.stride = stride
127
+ assert stride in [1, 2]
128
+
129
+ hidden_dim = int(round(inp * expand_ratio))
130
+ self.use_res_connect = self.stride == 1 and inp == oup
131
+
132
+ layers = []
133
+ if expand_ratio != 1:
134
+ # pw
135
+ layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
136
+ layers.extend([
137
+ # dw
138
+ ConvBNReLU(hidden_dim,
139
+ hidden_dim,
140
+ stride=stride,
141
+ groups=hidden_dim),
142
+ # pw-linear
143
+ nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
144
+ nn.BatchNorm2d(oup),
145
+ ])
146
+ self.conv = nn.Sequential(*layers)
147
+
148
+ def forward(self, x):
149
+ if self.use_res_connect:
150
+ return x + self.conv(x)
151
+ else:
152
+ return self.conv(x)
153
+
154
+
155
+ class MobileNetV2(nn.Module):
156
+ def __init__(self, pretrained=True):
157
+ """
158
+ MobileNet V2 main class
159
+ Args:
160
+ num_classes (int): Number of classes
161
+ width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount
162
+ inverted_residual_setting: Network structure
163
+ round_nearest (int): Round the number of channels in each layer to be a multiple of this number
164
+ Set to 1 to turn off rounding
165
+ block: Module specifying inverted residual building block for mobilenet
166
+ """
167
+ super(MobileNetV2, self).__init__()
168
+
169
+ block = InvertedResidual
170
+ input_channel = 32
171
+ last_channel = 1280
172
+ width_mult = 1.0
173
+ round_nearest = 8
174
+
175
+ inverted_residual_setting = [
176
+ # t, c, n, s
177
+ [1, 16, 1, 1],
178
+ [6, 24, 2, 2],
179
+ [6, 32, 3, 2],
180
+ [6, 64, 4, 2],
181
+ # [6, 96, 3, 1],
182
+ # [6, 160, 3, 2],
183
+ # [6, 320, 1, 1],
184
+ ]
185
+
186
+ # only check the first element, assuming user knows t,c,n,s are required
187
+ if len(inverted_residual_setting) == 0 or len(
188
+ inverted_residual_setting[0]) != 4:
189
+             raise ValueError('inverted_residual_setting should be non-empty '
190
+                              'and each element must be a 4-element list, '
191
+                              'got {}'.format(inverted_residual_setting))
192
+
193
+ # building first layer
194
+ input_channel = _make_divisible(input_channel * width_mult,
195
+ round_nearest)
196
+ self.last_channel = _make_divisible(
197
+ last_channel * max(1.0, width_mult), round_nearest)
198
+ features = [ConvBNReLU(4, input_channel, stride=2)]
199
+ # building inverted residual blocks
200
+ for t, c, n, s in inverted_residual_setting:
201
+ output_channel = _make_divisible(c * width_mult, round_nearest)
202
+ for i in range(n):
203
+ stride = s if i == 0 else 1
204
+ features.append(
205
+ block(input_channel,
206
+ output_channel,
207
+ stride,
208
+ expand_ratio=t))
209
+ input_channel = output_channel
210
+ self.features = nn.Sequential(*features)
211
+
212
+ self.fpn_selected = [3, 6, 10]
213
+ # weight initialization
214
+ for m in self.modules():
215
+ if isinstance(m, nn.Conv2d):
216
+ nn.init.kaiming_normal_(m.weight, mode='fan_out')
217
+ if m.bias is not None:
218
+ nn.init.zeros_(m.bias)
219
+ elif isinstance(m, nn.BatchNorm2d):
220
+ nn.init.ones_(m.weight)
221
+ nn.init.zeros_(m.bias)
222
+ elif isinstance(m, nn.Linear):
223
+ nn.init.normal_(m.weight, 0, 0.01)
224
+ nn.init.zeros_(m.bias)
225
+
226
+ # if pretrained:
227
+ # self._load_pretrained_model()
228
+
229
+ def _forward_impl(self, x):
230
+ # This exists since TorchScript doesn't support inheritance, so the superclass method
231
+ # (this one) needs to have a name other than `forward` that can be accessed in a subclass
232
+ fpn_features = []
233
+ for i, f in enumerate(self.features):
234
+ if i > self.fpn_selected[-1]:
235
+ break
236
+ x = f(x)
237
+ if i in self.fpn_selected:
238
+ fpn_features.append(x)
239
+
240
+ c2, c3, c4 = fpn_features
241
+ return c2, c3, c4
242
+
243
+ def forward(self, x):
244
+ return self._forward_impl(x)
245
+
246
+ def _load_pretrained_model(self):
247
+ pretrain_dict = model_zoo.load_url(
248
+ 'https://download.pytorch.org/models/mobilenet_v2-b0353104.pth')
249
+ model_dict = {}
250
+ state_dict = self.state_dict()
251
+ for k, v in pretrain_dict.items():
252
+ if k in state_dict:
253
+ model_dict[k] = v
254
+ state_dict.update(model_dict)
255
+ self.load_state_dict(state_dict)
256
+
257
+
258
+ class MobileV2_MLSD_Tiny(nn.Module):
259
+ def __init__(self):
260
+ super(MobileV2_MLSD_Tiny, self).__init__()
261
+
262
+ self.backbone = MobileNetV2(pretrained=True)
263
+
264
+ self.block12 = BlockTypeA(in_c1=32, in_c2=64, out_c1=64, out_c2=64)
265
+ self.block13 = BlockTypeB(128, 64)
266
+
267
+ self.block14 = BlockTypeA(in_c1=24, in_c2=64, out_c1=32, out_c2=32)
268
+ self.block15 = BlockTypeB(64, 64)
269
+
270
+ self.block16 = BlockTypeC(64, 16)
271
+
272
+ def forward(self, x):
273
+ c2, c3, c4 = self.backbone(x)
274
+
275
+ x = self.block12(c3, c4)
276
+ x = self.block13(x)
277
+ x = self.block14(c2, x)
278
+ x = self.block15(x)
279
+ x = self.block16(x)
280
+ x = x[:, 7:, :, :]
281
+ # print(x.shape)
282
+ x = F.interpolate(x,
283
+ scale_factor=2.0,
284
+ mode='bilinear',
285
+ align_corners=True)
286
+
287
+ return x
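A quick shape check (not part of this commit) for the tiny model: MLSD expects a 4-channel input (RGB plus the constant channel that pred_lines/pred_squares concatenate), and the head returns the multi-channel tpMap consumed by the decoder in annotator/mlsd/utils.py. The import path assumes the repository root is on PYTHONPATH.

    import torch

    from annotator.mlsd.mbv2_mlsd_tiny import MobileV2_MLSD_Tiny

    model = MobileV2_MLSD_Tiny().eval()
    with torch.no_grad():
        tpMap = model(torch.randn(1, 4, 512, 512))

    print(tpMap.shape)  # expected: torch.Size([1, 9, 256, 256]) for a 512x512 input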
annotator/mlsd/utils.py ADDED
@@ -0,0 +1,638 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # modified by lihaoweicv
4
+ # pytorch version
5
+ #
6
+ # M-LSD
7
+ # Copyright 2021-present NAVER Corp.
8
+ # Apache License v2.0
9
+
10
+ import cv2
11
+ import numpy as np
12
+ import torch
13
+ from torch.nn import functional as F
14
+
15
+
16
+ def deccode_output_score_and_ptss(tpMap, topk_n=200, ksize=5):
17
+ '''
18
+ tpMap:
19
+         center: tpMap[:, 0, :, :]
20
+         displacement: tpMap[:, 1:5, :, :]
21
+ '''
22
+ b, c, h, w = tpMap.shape
23
+ assert b == 1, 'only support bsize==1'
24
+ displacement = tpMap[:, 1:5, :, :][0]
25
+ center = tpMap[:, 0, :, :]
26
+ heat = torch.sigmoid(center)
27
+ hmax = F.max_pool2d(heat, (ksize, ksize),
28
+ stride=1,
29
+ padding=(ksize - 1) // 2)
30
+ keep = (hmax == heat).float()
31
+ heat = heat * keep
32
+ heat = heat.reshape(-1, )
33
+
34
+ scores, indices = torch.topk(heat, topk_n, dim=-1, largest=True)
35
+ yy = torch.floor_divide(indices, w).unsqueeze(-1)
36
+ xx = torch.fmod(indices, w).unsqueeze(-1)
37
+ ptss = torch.cat((yy, xx), dim=-1)
38
+
39
+ ptss = ptss.detach().cpu().numpy()
40
+ scores = scores.detach().cpu().numpy()
41
+ displacement = displacement.detach().cpu().numpy()
42
+ displacement = displacement.transpose((1, 2, 0))
43
+ return ptss, scores, displacement
44
+
45
+
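A standalone sketch (not part of this commit) of the decoding idea above: sigmoid the center map, suppress non-maxima with a max-pool equality mask, then take the top-k responses of the flattened heatmap. A random tensor stands in for tpMap[:, 0, :, :].

    import torch
    import torch.nn.functional as F

    center = torch.randn(1, 256, 256)      # stand-in for tpMap[:, 0, :, :]
    heat = torch.sigmoid(center)
    hmax = F.max_pool2d(heat, (5, 5), stride=1, padding=2)
    heat = heat * (hmax == heat).float()   # keep only local maxima
    heat = heat.reshape(-1)

    scores, indices = torch.topk(heat, 200, dim=-1, largest=True)
    yy = torch.div(indices, 256, rounding_mode='floor')
    xx = torch.fmod(indices, 256)
    ptss = torch.stack((yy, xx), dim=-1)   # (200, 2) candidate (y, x) locations

    print(ptss.shape, scores.shape)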
46
+ def pred_lines(image,
47
+ model,
48
+ input_shape=[512, 512],
49
+ score_thr=0.10,
50
+ dist_thr=20.0,
51
+ device='cuda'):
52
+ h, w, _ = image.shape
53
+ h_ratio, w_ratio = [h / input_shape[0], w / input_shape[1]]
54
+
55
+ resized_image = np.concatenate([
56
+ cv2.resize(image, (input_shape[1], input_shape[0]),
57
+ interpolation=cv2.INTER_AREA),
58
+ np.ones([input_shape[0], input_shape[1], 1])
59
+ ],
60
+ axis=-1)
61
+
62
+ resized_image = resized_image.transpose((2, 0, 1))
63
+ batch_image = np.expand_dims(resized_image, axis=0).astype('float32')
64
+ batch_image = (batch_image / 127.5) - 1.0
65
+
66
+ batch_image = torch.from_numpy(batch_image).float().to(device)
67
+ outputs = model(batch_image)
68
+ pts, pts_score, vmap = deccode_output_score_and_ptss(outputs, 200, 3)
69
+ start = vmap[:, :, :2]
70
+ end = vmap[:, :, 2:]
71
+ dist_map = np.sqrt(np.sum((start - end)**2, axis=-1))
72
+
73
+ segments_list = []
74
+ for center, score in zip(pts, pts_score):
75
+ y, x = center
76
+ distance = dist_map[y, x]
77
+ if score > score_thr and distance > dist_thr:
78
+ disp_x_start, disp_y_start, disp_x_end, disp_y_end = vmap[y, x, :]
79
+ x_start = x + disp_x_start
80
+ y_start = y + disp_y_start
81
+ x_end = x + disp_x_end
82
+ y_end = y + disp_y_end
83
+ segments_list.append([x_start, y_start, x_end, y_end])
84
+
85
+ lines = 2 * np.array(segments_list) # 256 > 512
86
+ lines[:, 0] = lines[:, 0] * w_ratio
87
+ lines[:, 1] = lines[:, 1] * h_ratio
88
+ lines[:, 2] = lines[:, 2] * w_ratio
89
+ lines[:, 3] = lines[:, 3] * h_ratio
90
+
91
+ return lines
92
+
93
+
94
+ def pred_squares(
95
+ image,
96
+ model,
97
+ input_shape=[512, 512],
98
+ device='cuda',
99
+ params={
100
+ 'score': 0.06,
101
+ 'outside_ratio': 0.28,
102
+ 'inside_ratio': 0.45,
103
+ 'w_overlap': 0.0,
104
+ 'w_degree': 1.95,
105
+ 'w_length': 0.0,
106
+ 'w_area': 1.86,
107
+ 'w_center': 0.14
108
+ }): # noqa
109
+ # shape = [height, width]
110
+ h, w, _ = image.shape
111
+ original_shape = [h, w]
112
+
113
+ resized_image = np.concatenate([
114
+ cv2.resize(image, (input_shape[0], input_shape[1]),
115
+ interpolation=cv2.INTER_AREA),
116
+ np.ones([input_shape[0], input_shape[1], 1])
117
+ ],
118
+ axis=-1)
119
+ resized_image = resized_image.transpose((2, 0, 1))
120
+ batch_image = np.expand_dims(resized_image, axis=0).astype('float32')
121
+ batch_image = (batch_image / 127.5) - 1.0
122
+
123
+ batch_image = torch.from_numpy(batch_image).float().to(device)
124
+ outputs = model(batch_image)
125
+
126
+ pts, pts_score, vmap = deccode_output_score_and_ptss(outputs, 200, 3)
127
+ start = vmap[:, :, :2] # (x, y)
128
+ end = vmap[:, :, 2:] # (x, y)
129
+ dist_map = np.sqrt(np.sum((start - end)**2, axis=-1))
130
+
131
+ junc_list = []
132
+ segments_list = []
133
+ for junc, score in zip(pts, pts_score):
134
+ y, x = junc
135
+ distance = dist_map[y, x]
136
+ if score > params['score'] and distance > 20.0:
137
+ junc_list.append([x, y])
138
+ disp_x_start, disp_y_start, disp_x_end, disp_y_end = vmap[y, x, :]
139
+ d_arrow = 1.0
140
+ x_start = x + d_arrow * disp_x_start
141
+ y_start = y + d_arrow * disp_y_start
142
+ x_end = x + d_arrow * disp_x_end
143
+ y_end = y + d_arrow * disp_y_end
144
+ segments_list.append([x_start, y_start, x_end, y_end])
145
+
146
+ segments = np.array(segments_list)
147
+
148
+ # post processing for squares
149
+ # 1. get unique lines
150
+ point = np.array([[0, 0]])
151
+ point = point[0]
152
+ start = segments[:, :2]
153
+ end = segments[:, 2:]
154
+ diff = start - end
155
+ a = diff[:, 1]
156
+ b = -diff[:, 0]
157
+ c = a * start[:, 0] + b * start[:, 1]
158
+
159
+ d = np.abs(a * point[0] + b * point[1] - c) / np.sqrt(a**2 + b**2 + 1e-10)
160
+ theta = np.arctan2(diff[:, 0], diff[:, 1]) * 180 / np.pi
161
+ theta[theta < 0.0] += 180
162
+ hough = np.concatenate([d[:, None], theta[:, None]], axis=-1)
163
+
164
+ d_quant = 1
165
+ theta_quant = 2
166
+ hough[:, 0] //= d_quant
167
+ hough[:, 1] //= theta_quant
168
+ _, indices, counts = np.unique(hough,
169
+ axis=0,
170
+ return_index=True,
171
+ return_counts=True)
172
+
173
+ acc_map = np.zeros([512 // d_quant + 1, 360 // theta_quant + 1],
174
+ dtype='float32')
175
+ idx_map = np.zeros([512 // d_quant + 1, 360 // theta_quant + 1],
176
+ dtype='int32') - 1
177
+ yx_indices = hough[indices, :].astype('int32')
178
+ acc_map[yx_indices[:, 0], yx_indices[:, 1]] = counts
179
+ idx_map[yx_indices[:, 0], yx_indices[:, 1]] = indices
180
+
181
+ acc_map_np = acc_map
182
+ # acc_map = acc_map[None, :, :, None]
183
+ #
184
+ # ### fast suppression using tensorflow op
185
+ # acc_map = tf.constant(acc_map, dtype=tf.float32)
186
+ # max_acc_map = tf.keras.layers.MaxPool2D(pool_size=(5, 5), strides=1, padding='same')(acc_map)
187
+ # acc_map = acc_map * tf.cast(tf.math.equal(acc_map, max_acc_map), tf.float32)
188
+ # flatten_acc_map = tf.reshape(acc_map, [1, -1])
189
+ # topk_values, topk_indices = tf.math.top_k(flatten_acc_map, k=len(pts))
190
+ # _, h, w, _ = acc_map.shape
191
+ # y = tf.expand_dims(topk_indices // w, axis=-1)
192
+ # x = tf.expand_dims(topk_indices % w, axis=-1)
193
+ # yx = tf.concat([y, x], axis=-1)
194
+
195
+ # fast suppression using pytorch op
196
+ acc_map = torch.from_numpy(acc_map_np).unsqueeze(0).unsqueeze(0)
197
+ _, _, h, w = acc_map.shape
198
+ max_acc_map = F.max_pool2d(acc_map, kernel_size=5, stride=1, padding=2)
199
+ acc_map = acc_map * ((acc_map == max_acc_map).float())
200
+ flatten_acc_map = acc_map.reshape([
201
+ -1,
202
+ ])
203
+
204
+ scores, indices = torch.topk(flatten_acc_map,
205
+ len(pts),
206
+ dim=-1,
207
+ largest=True)
208
+ yy = torch.div(indices, w, rounding_mode='floor').unsqueeze(-1)
209
+ xx = torch.fmod(indices, w).unsqueeze(-1)
210
+ yx = torch.cat((yy, xx), dim=-1)
211
+
212
+ yx = yx.detach().cpu().numpy()
213
+
214
+ topk_values = scores.detach().cpu().numpy()
215
+ indices = idx_map[yx[:, 0], yx[:, 1]]
216
+ basis = 5 // 2
217
+
218
+ merged_segments = []
219
+ for yx_pt, max_indice, value in zip(yx, indices, topk_values):
220
+ y, x = yx_pt
221
+ if max_indice == -1 or value == 0:
222
+ continue
223
+ segment_list = []
224
+ for y_offset in range(-basis, basis + 1):
225
+ for x_offset in range(-basis, basis + 1):
226
+ indice = idx_map[y + y_offset, x + x_offset]
227
+ cnt = int(acc_map_np[y + y_offset, x + x_offset])
228
+ if indice != -1:
229
+ segment_list.append(segments[indice])
230
+ if cnt > 1:
231
+ check_cnt = 1
232
+ current_hough = hough[indice]
233
+ for new_indice, new_hough in enumerate(hough):
234
+ if (current_hough
235
+ == new_hough).all() and indice != new_indice:
236
+ segment_list.append(segments[new_indice])
237
+ check_cnt += 1
238
+ if check_cnt == cnt:
239
+ break
240
+ group_segments = np.array(segment_list).reshape([-1, 2])
241
+ sorted_group_segments = np.sort(group_segments, axis=0)
242
+ x_min, y_min = sorted_group_segments[0, :]
243
+ x_max, y_max = sorted_group_segments[-1, :]
244
+
245
+ deg = theta[max_indice]
246
+ if deg >= 90:
247
+ merged_segments.append([x_min, y_max, x_max, y_min])
248
+ else:
249
+ merged_segments.append([x_min, y_min, x_max, y_max])
250
+
251
+ # 2. get intersections
252
+ new_segments = np.array(merged_segments) # (x1, y1, x2, y2)
253
+ start = new_segments[:, :2] # (x1, y1)
254
+ end = new_segments[:, 2:] # (x2, y2)
255
+ new_centers = (start + end) / 2.0
256
+ diff = start - end
257
+ dist_segments = np.sqrt(np.sum(diff**2, axis=-1))
258
+
259
+ # ax + by = c
260
+ a = diff[:, 1]
261
+ b = -diff[:, 0]
262
+ c = a * start[:, 0] + b * start[:, 1]
263
+ pre_det = a[:, None] * b[None, :]
264
+ det = pre_det - np.transpose(pre_det)
265
+
266
+ pre_inter_y = a[:, None] * c[None, :]
267
+ inter_y = (pre_inter_y - np.transpose(pre_inter_y)) / (det + 1e-10)
268
+ pre_inter_x = c[:, None] * b[None, :]
269
+ inter_x = (pre_inter_x - np.transpose(pre_inter_x)) / (det + 1e-10)
270
+ inter_pts = np.concatenate([inter_x[:, :, None], inter_y[:, :, None]],
271
+ axis=-1).astype('int32')
272
+
273
+ # 3. get corner information
274
+ # 3.1 get distance
275
+ '''
276
+ dist_segments:
277
+ | dist(0), dist(1), dist(2), ...|
278
+ dist_inter_to_segment1:
279
+ | dist(inter,0), dist(inter,0), dist(inter,0), ... |
280
+ | dist(inter,1), dist(inter,1), dist(inter,1), ... |
281
+ ...
282
+ dist_inter_to_semgnet2:
283
+ | dist(inter,0), dist(inter,1), dist(inter,2), ... |
284
+ | dist(inter,0), dist(inter,1), dist(inter,2), ... |
285
+ ...
286
+ '''
287
+
288
+ dist_inter_to_segment1_start = np.sqrt(
289
+ np.sum(((inter_pts - start[:, None, :])**2), axis=-1,
290
+ keepdims=True)) # [n_batch, n_batch, 1]
291
+ dist_inter_to_segment1_end = np.sqrt(
292
+ np.sum(((inter_pts - end[:, None, :])**2), axis=-1,
293
+ keepdims=True)) # [n_batch, n_batch, 1]
294
+ dist_inter_to_segment2_start = np.sqrt(
295
+ np.sum(((inter_pts - start[None, :, :])**2), axis=-1,
296
+ keepdims=True)) # [n_batch, n_batch, 1]
297
+ dist_inter_to_segment2_end = np.sqrt(
298
+ np.sum(((inter_pts - end[None, :, :])**2), axis=-1,
299
+ keepdims=True)) # [n_batch, n_batch, 1]
300
+
301
+ # sort ascending
302
+ dist_inter_to_segment1 = np.sort(np.concatenate(
303
+ [dist_inter_to_segment1_start, dist_inter_to_segment1_end], axis=-1),
304
+ axis=-1) # [n_batch, n_batch, 2]
305
+ dist_inter_to_segment2 = np.sort(np.concatenate(
306
+ [dist_inter_to_segment2_start, dist_inter_to_segment2_end], axis=-1),
307
+ axis=-1) # [n_batch, n_batch, 2]
308
+
309
+ # 3.2 get degree
310
+ inter_to_start = new_centers[:, None, :] - inter_pts
311
+ deg_inter_to_start = np.arctan2(inter_to_start[:, :, 1],
312
+ inter_to_start[:, :, 0]) * 180 / np.pi
313
+ deg_inter_to_start[deg_inter_to_start < 0.0] += 360
314
+ inter_to_end = new_centers[None, :, :] - inter_pts
315
+ deg_inter_to_end = np.arctan2(inter_to_end[:, :, 1],
316
+ inter_to_end[:, :, 0]) * 180 / np.pi
317
+ deg_inter_to_end[deg_inter_to_end < 0.0] += 360
318
+ '''
319
+ B -- G
320
+ | |
321
+ C -- R
322
+ B : blue / G: green / C: cyan / R: red
323
+
324
+ 0 -- 1
325
+ | |
326
+ 3 -- 2
327
+ '''
328
+ # rename variables
329
+ deg1_map, deg2_map = deg_inter_to_start, deg_inter_to_end
330
+ # sort deg ascending
331
+ deg_sort = np.sort(np.concatenate(
332
+ [deg1_map[:, :, None], deg2_map[:, :, None]], axis=-1),
333
+ axis=-1)
334
+
335
+ deg_diff_map = np.abs(deg1_map - deg2_map)
336
+ # we only consider the smallest degree of intersect
337
+ deg_diff_map[deg_diff_map > 180] = 360 - deg_diff_map[deg_diff_map > 180]
338
+
339
+ # define available degree range
340
+ deg_range = [60, 120]
341
+
342
+ corner_dict = {corner_info: [] for corner_info in range(4)}
343
+ inter_points = []
344
+ for i in range(inter_pts.shape[0]):
345
+ for j in range(i + 1, inter_pts.shape[1]):
346
+ # i, j > line index, always i < j
347
+ x, y = inter_pts[i, j, :]
348
+ deg1, deg2 = deg_sort[i, j, :]
349
+ deg_diff = deg_diff_map[i, j]
350
+
351
+ check_degree = deg_diff > deg_range[0] and deg_diff < deg_range[1]
352
+
353
+ outside_ratio = params['outside_ratio'] # over ratio >>> drop it!
354
+ inside_ratio = params['inside_ratio'] # over ratio >>> drop it!
355
+ check_distance = ((dist_inter_to_segment1[i, j, 1] >= dist_segments[i] and
356
+ dist_inter_to_segment1[i, j, 0] <= dist_segments[i] * outside_ratio) or
357
+ (dist_inter_to_segment1[i, j, 1] <= dist_segments[i] and
358
+ dist_inter_to_segment1[i, j, 0] <= dist_segments[i] * inside_ratio)) and \
359
+ ((dist_inter_to_segment2[i, j, 1] >= dist_segments[j] and
360
+ dist_inter_to_segment2[i, j, 0] <= dist_segments[j] * outside_ratio) or
361
+ (dist_inter_to_segment2[i, j, 1] <= dist_segments[j] and
362
+ dist_inter_to_segment2[i, j, 0] <= dist_segments[j] * inside_ratio))
363
+
364
+ if check_degree and check_distance:
365
+ corner_info = None # noqa
366
+
367
+ if (deg1 >= 0 and deg1 <= 45 and deg2 >= 45 and deg2 <= 120) or \
368
+ (deg2 >= 315 and deg1 >= 45 and deg1 <= 120):
369
+ corner_info, color_info = 0, 'blue'
370
+ elif (deg1 >= 45 and deg1 <= 125 and deg2 >= 125
371
+ and deg2 <= 225):
372
+ corner_info, color_info = 1, 'green'
373
+ elif (deg1 >= 125 and deg1 <= 225 and deg2 >= 225
374
+ and deg2 <= 315):
375
+ corner_info, color_info = 2, 'black'
376
+ elif (deg1 >= 0 and deg1 <= 45 and deg2 >= 225 and deg2 <= 315) or \
377
+ (deg2 >= 315 and deg1 >= 225 and deg1 <= 315):
378
+ corner_info, color_info = 3, 'cyan'
379
+ else:
380
+ corner_info, color_info = 4, 'red' # we don't use it # noqa
381
+ continue
382
+
383
+ corner_dict[corner_info].append([x, y, i, j])
384
+ inter_points.append([x, y])
385
+
386
+ square_list = []
387
+ connect_list = []
388
+ segments_list = []
389
+ for corner0 in corner_dict[0]:
390
+ for corner1 in corner_dict[1]:
391
+ connect01 = False
392
+ for corner0_line in corner0[2:]:
393
+ if corner0_line in corner1[2:]:
394
+ connect01 = True
395
+ break
396
+ if connect01:
397
+ for corner2 in corner_dict[2]:
398
+ connect12 = False
399
+ for corner1_line in corner1[2:]:
400
+ if corner1_line in corner2[2:]:
401
+ connect12 = True
402
+ break
403
+ if connect12:
404
+ for corner3 in corner_dict[3]:
405
+ connect23 = False
406
+ for corner2_line in corner2[2:]:
407
+ if corner2_line in corner3[2:]:
408
+ connect23 = True
409
+ break
410
+ if connect23:
411
+ for corner3_line in corner3[2:]:
412
+ if corner3_line in corner0[2:]:
413
+ # SQUARE!!!
414
+ '''
415
+ 0 -- 1
416
+ | |
417
+ 3 -- 2
418
+ square_list:
419
+ order: 0 > 1 > 2 > 3
420
+ | x0, y0, x1, y1, x2, y2, x3, y3 |
421
+ | x0, y0, x1, y1, x2, y2, x3, y3 |
422
+ ...
423
+ connect_list:
424
+ order: 01 > 12 > 23 > 30
425
+ | line_idx01, line_idx12, line_idx23, line_idx30 |
426
+ | line_idx01, line_idx12, line_idx23, line_idx30 |
427
+ ...
428
+ segments_list:
429
+ order: 0 > 1 > 2 > 3
430
+ | line_idx0_i, line_idx0_j, line_idx1_i, line_idx1_j, line_idx2_i,
431
+ line_idx2_j, line_idx3_i, line_idx3_j |
432
+ | line_idx0_i, line_idx0_j, line_idx1_i, line_idx1_j, line_idx2_i,
433
+ line_idx2_j, line_idx3_i, line_idx3_j |
434
+ ...
435
+ '''
436
+ square_list.append(corner0[:2] +
437
+ corner1[:2] +
438
+ corner2[:2] +
439
+ corner3[:2])
440
+ connect_list.append([
441
+ corner0_line, corner1_line,
442
+ corner2_line, corner3_line
443
+ ])
444
+ segments_list.append(corner0[2:] +
445
+ corner1[2:] +
446
+ corner2[2:] +
447
+ corner3[2:])
448
+
449
+ def check_outside_inside(segments_info, connect_idx):
450
+ # return 'outside or inside', min distance, cover_param, peri_param
451
+ if connect_idx == segments_info[0]:
452
+ check_dist_mat = dist_inter_to_segment1
453
+ else:
454
+ check_dist_mat = dist_inter_to_segment2
455
+
456
+ i, j = segments_info
457
+ min_dist, max_dist = check_dist_mat[i, j, :]
458
+ connect_dist = dist_segments[connect_idx]
459
+ if max_dist > connect_dist:
460
+ return 'outside', min_dist, 0, 1
461
+ else:
462
+ return 'inside', min_dist, -1, -1
463
+
464
+ top_square = None # noqa
465
+
466
+ try:
467
+ map_size = input_shape[0] / 2
468
+ squares = np.array(square_list).reshape([-1, 4, 2])
469
+ score_array = []
470
+ connect_array = np.array(connect_list)
471
+ segments_array = np.array(segments_list).reshape([-1, 4, 2])
472
+
473
+ # get degree of corners:
474
+ squares_rollup = np.roll(squares, 1, axis=1)
475
+ squares_rolldown = np.roll(squares, -1, axis=1)
476
+ vec1 = squares_rollup - squares
477
+ normalized_vec1 = vec1 / (
478
+ np.linalg.norm(vec1, axis=-1, keepdims=True) + 1e-10)
479
+ vec2 = squares_rolldown - squares
480
+ normalized_vec2 = vec2 / (
481
+ np.linalg.norm(vec2, axis=-1, keepdims=True) + 1e-10)
482
+ inner_products = np.sum(normalized_vec1 * normalized_vec2,
483
+ axis=-1) # [n_squares, 4]
484
+ squares_degree = np.arccos(
485
+ inner_products) * 180 / np.pi # [n_squares, 4]
486
+
487
+ # get square score
488
+ overlap_scores = []
489
+ degree_scores = []
490
+ length_scores = []
491
+
492
+ for connects, segments, square, degree in zip(connect_array,
493
+ segments_array, squares,
494
+ squares_degree):
495
+ '''
496
+ 0 -- 1
497
+ | |
498
+ 3 -- 2
499
+
500
+ # segments: [4, 2]
501
+ # connects: [4]
502
+ '''
503
+
504
+ # OVERLAP SCORES
505
+ cover = 0
506
+ perimeter = 0
507
+ # check 0 > 1 > 2 > 3
508
+ square_length = []
509
+
510
+ for start_idx in range(4):
511
+ end_idx = (start_idx + 1) % 4
512
+
513
+ connect_idx = connects[start_idx] # segment idx of segment01
514
+ start_segments = segments[start_idx]
515
+ end_segments = segments[end_idx]
516
+
517
+ start_point = square[start_idx] # noqa
518
+ end_point = square[end_idx] # noqa
519
+
520
+ # check whether outside or inside
521
+ start_position, start_min, start_cover_param, start_peri_param = check_outside_inside(
522
+ start_segments, connect_idx)
523
+ end_position, end_min, end_cover_param, end_peri_param = check_outside_inside(
524
+ end_segments, connect_idx)
525
+
526
+ cover += dist_segments[
527
+ connect_idx] + start_cover_param * start_min + end_cover_param * end_min
528
+ perimeter += dist_segments[
529
+ connect_idx] + start_peri_param * start_min + end_peri_param * end_min
530
+
531
+ square_length.append(dist_segments[connect_idx] +
532
+ start_peri_param * start_min +
533
+ end_peri_param * end_min)
534
+
535
+ overlap_scores.append(cover / perimeter)
536
+ # DEGREE SCORES
537
+ '''
538
+ deg0 vs deg2
539
+ deg1 vs deg3
540
+ '''
541
+ deg0, deg1, deg2, deg3 = degree
542
+ deg_ratio1 = deg0 / deg2
543
+ if deg_ratio1 > 1.0:
544
+ deg_ratio1 = 1 / deg_ratio1
545
+ deg_ratio2 = deg1 / deg3
546
+ if deg_ratio2 > 1.0:
547
+ deg_ratio2 = 1 / deg_ratio2
548
+ degree_scores.append((deg_ratio1 + deg_ratio2) / 2)
549
+ # LENGTH SCORES
550
+ '''
551
+ len0 vs len2
552
+ len1 vs len3
553
+ '''
554
+ len0, len1, len2, len3 = square_length
555
+ len_ratio1 = len0 / len2 if len2 > len0 else len2 / len0
556
+ len_ratio2 = len1 / len3 if len3 > len1 else len3 / len1
557
+ length_scores.append((len_ratio1 + len_ratio2) / 2)
558
+
559
+ ######################################
560
+
561
+ overlap_scores = np.array(overlap_scores)
562
+ overlap_scores /= np.max(overlap_scores)
563
+
564
+ degree_scores = np.array(degree_scores)
565
+ # degree_scores /= np.max(degree_scores)
566
+
567
+ length_scores = np.array(length_scores)
568
+
569
+ # AREA SCORES
570
+ area_scores = np.reshape(squares, [-1, 4, 2])
571
+ area_x = area_scores[:, :, 0]
572
+ area_y = area_scores[:, :, 1]
573
+ correction = area_x[:, -1] * area_y[:, 0] - area_y[:, -1] * area_x[:,
574
+ 0]
575
+ area_scores = np.sum(area_x[:, :-1] * area_y[:, 1:], axis=-1) - np.sum(
576
+ area_y[:, :-1] * area_x[:, 1:], axis=-1)
577
+ area_scores = 0.5 * np.abs(area_scores + correction)
578
+ area_scores /= (map_size * map_size) # np.max(area_scores)
579
+
580
+ # CENTER SCORES
581
+ centers = np.array([[256 // 2, 256 // 2]], dtype='float32') # [1, 2]
582
+ # squares: [n, 4, 2]
583
+ square_centers = np.mean(squares, axis=1) # [n, 2]
584
+ center2center = np.sqrt(np.sum((centers - square_centers)**2))
585
+ center_scores = center2center / (map_size / np.sqrt(2.0))
586
+ '''
587
+ score_w = [overlap, degree, area, center, length]
588
+ '''
589
+ score_w = [0.0, 1.0, 10.0, 0.5, 1.0] # noqa
590
+ score_array = (params['w_overlap'] * overlap_scores +
591
+ params['w_degree'] * degree_scores +
592
+ params['w_area'] * area_scores -
593
+ params['w_center'] * center_scores +
594
+ params['w_length'] * length_scores)
595
+
596
+ best_square = [] # noqa
597
+
598
+ sorted_idx = np.argsort(score_array)[::-1]
599
+ score_array = score_array[sorted_idx]
600
+ squares = squares[sorted_idx]
601
+
602
+ except Exception:
603
+ pass
604
+ '''return list
605
+ merged_lines, squares, scores
606
+ '''
607
+
608
+ try:
609
+ new_segments[:, 0] = new_segments[:, 0] * 2 / input_shape[
610
+ 1] * original_shape[1]
611
+ new_segments[:, 1] = new_segments[:, 1] * 2 / input_shape[
612
+ 0] * original_shape[0]
613
+ new_segments[:, 2] = new_segments[:, 2] * 2 / input_shape[
614
+ 1] * original_shape[1]
615
+ new_segments[:, 3] = new_segments[:, 3] * 2 / input_shape[
616
+ 0] * original_shape[0]
617
+ except Exception:
618
+ new_segments = []
619
+
620
+ try:
621
+ squares[:, :,
622
+ 0] = squares[:, :, 0] * 2 / input_shape[1] * original_shape[1]
623
+ squares[:, :,
624
+ 1] = squares[:, :, 1] * 2 / input_shape[0] * original_shape[0]
625
+ except Exception:
626
+ squares = []
627
+ score_array = []
628
+
629
+ try:
630
+ inter_points = np.array(inter_points)
631
+ inter_points[:, 0] = inter_points[:, 0] * 2 / input_shape[
632
+ 1] * original_shape[1]
633
+ inter_points[:, 1] = inter_points[:, 1] * 2 / input_shape[
634
+ 0] * original_shape[0]
635
+ except Exception:
636
+ inter_points = []
637
+
638
+ return new_segments, squares, score_array, inter_points
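Note: the final score above is a weighted sum of the overlap, degree, area, center and length terms (`params['w_overlap'] * overlap + ... - params['w_center'] * center`), and `squares` is sorted by descending score before being rescaled to the original resolution. A minimal, hypothetical sketch of consuming those outputs to visualize the top-ranked quadrilaterals (the helper name and `top_k` argument are illustrative, not part of this file):

```python
import cv2
import numpy as np


def draw_top_squares(image_bgr, squares, top_k=1):
    """Overlay the top-k quadrilaterals returned by pred_squares onto a copy of the image.

    `squares` is assumed to be the (already score-sorted) [n, 4, 2] array returned above.
    """
    canvas = image_bgr.copy()
    for quad in np.asarray(squares)[:top_k].astype(int):
        # quad is ordered corner 0 -> 1 -> 2 -> 3, matching the diagram in the code above
        cv2.polylines(canvas, [quad.reshape(-1, 1, 2)], isClosed=True,
                      color=(0, 255, 0), thickness=2)
    return canvas
```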
annotator/mlsd_op.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ # MLSD Line Detection
4
+ # From https://github.com/navervision/mlsd
5
+ # Apache-2.0 license
6
+
7
+ import warnings
8
+ from abc import ABCMeta
9
+
10
+ import cv2
11
+ import numpy as np
12
+ import torch
13
+ from PIL import Image
14
+
15
+ from scepter.modules.annotator.base_annotator import BaseAnnotator
16
+ from scepter.modules.annotator.mlsd.mbv2_mlsd_large import MobileV2_MLSD_Large
17
+ from scepter.modules.annotator.mlsd.utils import pred_lines
18
+ from scepter.modules.annotator.registry import ANNOTATORS
19
+ from scepter.modules.annotator.utils import resize_image, resize_image_ori
20
+ from scepter.modules.utils.config import dict_to_yaml
21
+ from scepter.modules.utils.distribute import we
22
+ from scepter.modules.utils.file_system import FS
23
+
24
+
25
+ @ANNOTATORS.register_class()
26
+ class MLSDdetector(BaseAnnotator, metaclass=ABCMeta):
27
+ def __init__(self, cfg, logger=None):
28
+ super().__init__(cfg, logger=logger)
29
+ model = MobileV2_MLSD_Large()
30
+ pretrained_model = cfg.get('PRETRAINED_MODEL', None)
31
+ if pretrained_model:
32
+ with FS.get_from(pretrained_model, wait_finish=True) as local_path:
33
+ model.load_state_dict(torch.load(local_path), strict=True)
34
+ self.model = model.eval()
35
+ self.thr_v = cfg.get('THR_V', 0.1)
36
+ self.thr_d = cfg.get('THR_D', 0.1)
37
+
38
+ @torch.no_grad()
39
+ @torch.inference_mode()
40
+ @torch.autocast('cuda', enabled=False)
41
+ def forward(self, image):
42
+ if isinstance(image, Image.Image):
43
+ image = np.array(image)
44
+ elif isinstance(image, torch.Tensor):
45
+ image = image.detach().cpu().numpy()
46
+ elif isinstance(image, np.ndarray):
47
+ image = image.copy()
48
+ else:
49
+ raise TypeError(f'Unsupported datatype {type(image)}, only support np.ndarray, torch.Tensor, Pillow Image.')
50
+ h, w, c = image.shape
51
+ image, k = resize_image(image, 1024 if min(h, w) > 1024 else min(h, w))
52
+ img_output = np.zeros_like(image)
53
+ try:
54
+ lines = pred_lines(image,
55
+ self.model, [image.shape[0], image.shape[1]],
56
+ self.thr_v,
57
+ self.thr_d,
58
+ device=we.device_id)
59
+ for line in lines:
60
+ x_start, y_start, x_end, y_end = [int(val) for val in line]
61
+ cv2.line(img_output, (x_start, y_start), (x_end, y_end),
62
+ [255, 255, 255], 1)
63
+ except Exception as e:
64
+ warnings.warn(f'{e}')
65
+ return None
66
+ img_output = resize_image_ori(h, w, img_output, k)
67
+ return img_output[:, :, 0]
68
+
69
+ @staticmethod
70
+ def get_config_template():
71
+ return dict_to_yaml('ANNOTATORS',
72
+ __class__.__name__,
73
+ MLSDdetector.para_dict,
74
+ set_name=True)
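As a usage sketch (not part of this commit), the detector above would typically be constructed through the scepter registry with a small config; the checkpoint URI below is the one listed for MLSD in `annotator/utils.py`, and the thresholds mirror the `THR_V`/`THR_D` defaults in `__init__`:

```python
import numpy as np

from scepter.modules.annotator.registry import ANNOTATORS
from scepter.modules.utils.config import Config

# Build the MLSD annotator from a plain dict config; Config(cfg_dict=..., load=False)
# follows the pattern used by AnnotatorProcessor in annotator/utils.py.
mlsd_cfg = Config(cfg_dict={
    'NAME': 'MLSDdetector',
    'PRETRAINED_MODEL': 'ms://damo/scepter_scedit@annotator/ckpts/mlsd_large_512_fp32.pth',
    'THR_V': 0.1,  # line score threshold (default)
    'THR_D': 0.1,  # distance threshold (default)
}, load=False)

detector = ANNOTATORS.build(mlsd_cfg)
line_map = detector.forward(np.zeros((512, 512, 3), dtype=np.uint8))  # HxW uint8 line drawing
```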
annotator/openpose.py ADDED
@@ -0,0 +1,812 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ # Openpose
4
+ # Original from CMU https://github.com/CMU-Perceptual-Computing-Lab/openpose
5
+ # 2nd Edited by https://github.com/Hzzone/pytorch-openpose
6
+ # The implementation is modified from 3rd Edited Version by ControlNet
7
+ import math
8
+ import os
9
+ from abc import ABCMeta
10
+ from collections import OrderedDict
11
+
12
+ import cv2
13
+ import matplotlib
14
+ import numpy as np
15
+ import torch
16
+ import torch.nn as nn
17
+ from PIL import Image
18
+ from scipy.ndimage import gaussian_filter
19
+ from skimage.measure import label
20
+
21
+ from scepter.modules.annotator.base_annotator import BaseAnnotator
22
+ from scepter.modules.annotator.registry import ANNOTATORS
23
+ from scepter.modules.utils.config import dict_to_yaml
24
+ from scepter.modules.utils.file_system import FS
25
+
26
+ os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
27
+
28
+
29
+ def padRightDownCorner(img, stride, padValue):
30
+ h = img.shape[0]
31
+ w = img.shape[1]
32
+
33
+ pad = 4 * [None]
34
+ pad[0] = 0 # up
35
+ pad[1] = 0 # left
36
+ pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down
37
+ pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right
38
+
39
+ img_padded = img
40
+ pad_up = np.tile(img_padded[0:1, :, :] * 0 + padValue, (pad[0], 1, 1))
41
+ img_padded = np.concatenate((pad_up, img_padded), axis=0)
42
+ pad_left = np.tile(img_padded[:, 0:1, :] * 0 + padValue, (1, pad[1], 1))
43
+ img_padded = np.concatenate((pad_left, img_padded), axis=1)
44
+ pad_down = np.tile(img_padded[-2:-1, :, :] * 0 + padValue, (pad[2], 1, 1))
45
+ img_padded = np.concatenate((img_padded, pad_down), axis=0)
46
+ pad_right = np.tile(img_padded[:, -2:-1, :] * 0 + padValue, (1, pad[3], 1))
47
+ img_padded = np.concatenate((img_padded, pad_right), axis=1)
48
+
49
+ return img_padded, pad
50
+
51
+
52
+ # transfer caffe model weights to pytorch, matching the layer names
53
+ def transfer(model, model_weights):
54
+ transfered_model_weights = {}
55
+ for weights_name in model.state_dict().keys():
56
+ transfered_model_weights[weights_name] = model_weights['.'.join(
57
+ weights_name.split('.')[1:])]
58
+ return transfered_model_weights
59
+
60
+
61
+ # draw the body keypoints and limbs
62
+ def draw_bodypose(canvas, candidate, subset):
63
+ stickwidth = 4
64
+ limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10],
65
+ [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15],
66
+ [15, 17], [1, 16], [16, 18], [3, 17], [6, 18]]
67
+
68
+ colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0],
69
+ [170, 255, 0], [85, 255, 0], [0, 255, 0], [0, 255, 85],
70
+ [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255],
71
+ [0, 0, 255], [85, 0, 255], [170, 0, 255], [255, 0, 255],
72
+ [255, 0, 170], [255, 0, 85]]
73
+ for i in range(18):
74
+ for n in range(len(subset)):
75
+ index = int(subset[n][i])
76
+ if index == -1:
77
+ continue
78
+ x, y = candidate[index][0:2]
79
+ cv2.circle(canvas, (int(x), int(y)), 4, colors[i], thickness=-1)
80
+ for i in range(17):
81
+ for n in range(len(subset)):
82
+ index = subset[n][np.array(limbSeq[i]) - 1]
83
+ if -1 in index:
84
+ continue
85
+ cur_canvas = canvas.copy()
86
+ Y = candidate[index.astype(int), 0]
87
+ X = candidate[index.astype(int), 1]
88
+ mX = np.mean(X)
89
+ mY = np.mean(Y)
90
+ length = ((X[0] - X[1])**2 + (Y[0] - Y[1])**2)**0.5
91
+ angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
92
+ polygon = cv2.ellipse2Poly(
93
+ (int(mY), int(mX)), (int(length / 2), stickwidth), int(angle),
94
+ 0, 360, 1)
95
+ cv2.fillConvexPoly(cur_canvas, polygon, colors[i])
96
+ canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0)
97
+ # plt.imsave("preview.jpg", canvas[:, :, [2, 1, 0]])
98
+ # plt.imshow(canvas[:, :, [2, 1, 0]])
99
+ return canvas
100
+
101
+
102
+ # images drawn by opencv do not look good.
103
+ def draw_handpose(canvas, all_hand_peaks, show_number=False):
104
+ edges = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8],
105
+ [0, 9], [9, 10], [10, 11], [11, 12], [0, 13], [13, 14], [14, 15],
106
+ [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]]
107
+
108
+ for peaks in all_hand_peaks:
109
+ for ie, e in enumerate(edges):
110
+ if np.sum(np.all(peaks[e], axis=1) == 0) == 0:
111
+ x1, y1 = peaks[e[0]]
112
+ x2, y2 = peaks[e[1]]
113
+ cv2.line(canvas, (x1, y1), (x2, y2),
114
+ matplotlib.colors.hsv_to_rgb(
115
+ [ie / float(len(edges)), 1.0, 1.0]) * 255,
116
+ thickness=2)
117
+
118
+ for i, keypoint in enumerate(peaks):
119
+ x, y = keypoint
120
+ cv2.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1)
121
+ if show_number:
122
+ cv2.putText(canvas,
123
+ str(i), (x, y),
124
+ cv2.FONT_HERSHEY_SIMPLEX,
125
+ 0.3, (0, 0, 0),
126
+ lineType=cv2.LINE_AA)
127
+ return canvas
128
+
129
+
130
+ # detect hand according to body pose keypoints
131
+ # please refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/
132
+ # master/src/openpose/hand/handDetector.cpp
133
+ def handDetect(candidate, subset, oriImg):
134
+ # right hand: wrist 4, elbow 3, shoulder 2
135
+ # left hand: wrist 7, elbow 6, shoulder 5
136
+ ratioWristElbow = 0.33
137
+ detect_result = []
138
+ image_height, image_width = oriImg.shape[0:2]
139
+ for person in subset.astype(int):
140
+ # if any of three not detected
141
+ has_left = np.sum(person[[5, 6, 7]] == -1) == 0
142
+ has_right = np.sum(person[[2, 3, 4]] == -1) == 0
143
+ if not (has_left or has_right):
144
+ continue
145
+ hands = []
146
+ # left hand
147
+ if has_left:
148
+ left_shoulder_index, left_elbow_index, left_wrist_index = person[[
149
+ 5, 6, 7
150
+ ]]
151
+ x1, y1 = candidate[left_shoulder_index][:2]
152
+ x2, y2 = candidate[left_elbow_index][:2]
153
+ x3, y3 = candidate[left_wrist_index][:2]
154
+ hands.append([x1, y1, x2, y2, x3, y3, True])
155
+ # right hand
156
+ if has_right:
157
+ right_shoulder_index, right_elbow_index, right_wrist_index = person[
158
+ [2, 3, 4]]
159
+ x1, y1 = candidate[right_shoulder_index][:2]
160
+ x2, y2 = candidate[right_elbow_index][:2]
161
+ x3, y3 = candidate[right_wrist_index][:2]
162
+ hands.append([x1, y1, x2, y2, x3, y3, False])
163
+
164
+ for x1, y1, x2, y2, x3, y3, is_left in hands:
165
+ # pos_hand = pos_wrist + ratio * (pos_wrist - pos_elbox) = (1 + ratio) * pos_wrist - ratio * pos_elbox
166
+ # handRectangle.x = posePtr[wrist*3] + ratioWristElbow * (posePtr[wrist*3] - posePtr[elbow*3]);
167
+ # handRectangle.y = posePtr[wrist*3+1] + ratioWristElbow * (posePtr[wrist*3+1] - posePtr[elbow*3+1]);
168
+ # const auto distanceWristElbow = getDistance(poseKeypoints, person, wrist, elbow);
169
+ # const auto distanceElbowShoulder = getDistance(poseKeypoints, person, elbow, shoulder);
170
+ # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder);
171
+ x = x3 + ratioWristElbow * (x3 - x2)
172
+ y = y3 + ratioWristElbow * (y3 - y2)
173
+ distanceWristElbow = math.sqrt((x3 - x2)**2 + (y3 - y2)**2)
174
+ distanceElbowShoulder = math.sqrt((x2 - x1)**2 + (y2 - y1)**2)
175
+ width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder)
176
+ # x-y refers to the center --> offset to topLeft point
177
+ # handRectangle.x -= handRectangle.width / 2.f;
178
+ # handRectangle.y -= handRectangle.height / 2.f;
179
+ x -= width / 2
180
+ y -= width / 2 # width = height
181
+ # overflow the image
182
+ if x < 0:
183
+ x = 0
184
+ if y < 0:
185
+ y = 0
186
+ width1 = width
187
+ width2 = width
188
+ if x + width > image_width:
189
+ width1 = image_width - x
190
+ if y + width > image_height:
191
+ width2 = image_height - y
192
+ width = min(width1, width2)
193
+ # keep only hand boxes that are at least 20 pixels wide
194
+ if width >= 20:
195
+ detect_result.append([int(x), int(y), int(width), is_left])
196
+ '''
197
+ return value: [[x, y, w, True if left hand else False]].
198
+ width=height since the network requires a square input.
199
+ x, y is the coordinate of top left
200
+ '''
201
+ return detect_result
202
+
203
+
204
+ # get max index of 2d array
205
+ def npmax(array):
206
+ arrayindex = array.argmax(1)
207
+ arrayvalue = array.max(1)
208
+ i = arrayvalue.argmax()
209
+ j = arrayindex[i]
210
+ return i, j
211
+
212
+
213
+ def make_layers(block, no_relu_layers):
214
+ layers = []
215
+ for layer_name, v in block.items():
216
+ if 'pool' in layer_name:
217
+ layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1], padding=v[2])
218
+ layers.append((layer_name, layer))
219
+ else:
220
+ conv2d = nn.Conv2d(in_channels=v[0],
221
+ out_channels=v[1],
222
+ kernel_size=v[2],
223
+ stride=v[3],
224
+ padding=v[4])
225
+ layers.append((layer_name, conv2d))
226
+ if layer_name not in no_relu_layers:
227
+ layers.append(('relu_' + layer_name, nn.ReLU(inplace=True)))
228
+
229
+ return nn.Sequential(OrderedDict(layers))
230
+
231
+
232
+ class bodypose_model(nn.Module):
233
+ def __init__(self):
234
+ super(bodypose_model, self).__init__()
235
+
236
+ # these layers have no relu layer
237
+ no_relu_layers = [
238
+ 'conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1',
239
+ 'Mconv7_stage2_L2', 'Mconv7_stage3_L1', 'Mconv7_stage3_L2',
240
+ 'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1',
241
+ 'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L1'
242
+ ]
243
+ blocks = {}
244
+ block0 = OrderedDict([('conv1_1', [3, 64, 3, 1, 1]),
245
+ ('conv1_2', [64, 64, 3, 1, 1]),
246
+ ('pool1_stage1', [2, 2, 0]),
247
+ ('conv2_1', [64, 128, 3, 1, 1]),
248
+ ('conv2_2', [128, 128, 3, 1, 1]),
249
+ ('pool2_stage1', [2, 2, 0]),
250
+ ('conv3_1', [128, 256, 3, 1, 1]),
251
+ ('conv3_2', [256, 256, 3, 1, 1]),
252
+ ('conv3_3', [256, 256, 3, 1, 1]),
253
+ ('conv3_4', [256, 256, 3, 1, 1]),
254
+ ('pool3_stage1', [2, 2, 0]),
255
+ ('conv4_1', [256, 512, 3, 1, 1]),
256
+ ('conv4_2', [512, 512, 3, 1, 1]),
257
+ ('conv4_3_CPM', [512, 256, 3, 1, 1]),
258
+ ('conv4_4_CPM', [256, 128, 3, 1, 1])])
259
+
260
+ # Stage 1
261
+ block1_1 = OrderedDict([('conv5_1_CPM_L1', [128, 128, 3, 1, 1]),
262
+ ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]),
263
+ ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]),
264
+ ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]),
265
+ ('conv5_5_CPM_L1', [512, 38, 1, 1, 0])])
266
+
267
+ block1_2 = OrderedDict([('conv5_1_CPM_L2', [128, 128, 3, 1, 1]),
268
+ ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]),
269
+ ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]),
270
+ ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]),
271
+ ('conv5_5_CPM_L2', [512, 19, 1, 1, 0])])
272
+ blocks['block1_1'] = block1_1
273
+ blocks['block1_2'] = block1_2
274
+
275
+ self.model0 = make_layers(block0, no_relu_layers)
276
+
277
+ # Stages 2 - 6
278
+ for i in range(2, 7):
279
+ blocks['block%d_1' % i] = OrderedDict([
280
+ ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]),
281
+ ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]),
282
+ ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]),
283
+ ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]),
284
+ ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]),
285
+ ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]),
286
+ ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0])
287
+ ])
288
+
289
+ blocks['block%d_2' % i] = OrderedDict([
290
+ ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]),
291
+ ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]),
292
+ ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]),
293
+ ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]),
294
+ ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]),
295
+ ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]),
296
+ ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0])
297
+ ])
298
+
299
+ for k in blocks.keys():
300
+ blocks[k] = make_layers(blocks[k], no_relu_layers)
301
+
302
+ self.model1_1 = blocks['block1_1']
303
+ self.model2_1 = blocks['block2_1']
304
+ self.model3_1 = blocks['block3_1']
305
+ self.model4_1 = blocks['block4_1']
306
+ self.model5_1 = blocks['block5_1']
307
+ self.model6_1 = blocks['block6_1']
308
+
309
+ self.model1_2 = blocks['block1_2']
310
+ self.model2_2 = blocks['block2_2']
311
+ self.model3_2 = blocks['block3_2']
312
+ self.model4_2 = blocks['block4_2']
313
+ self.model5_2 = blocks['block5_2']
314
+ self.model6_2 = blocks['block6_2']
315
+
316
+ def forward(self, x):
317
+
318
+ out1 = self.model0(x)
319
+
320
+ out1_1 = self.model1_1(out1)
321
+ out1_2 = self.model1_2(out1)
322
+ out2 = torch.cat([out1_1, out1_2, out1], 1)
323
+
324
+ out2_1 = self.model2_1(out2)
325
+ out2_2 = self.model2_2(out2)
326
+ out3 = torch.cat([out2_1, out2_2, out1], 1)
327
+
328
+ out3_1 = self.model3_1(out3)
329
+ out3_2 = self.model3_2(out3)
330
+ out4 = torch.cat([out3_1, out3_2, out1], 1)
331
+
332
+ out4_1 = self.model4_1(out4)
333
+ out4_2 = self.model4_2(out4)
334
+ out5 = torch.cat([out4_1, out4_2, out1], 1)
335
+
336
+ out5_1 = self.model5_1(out5)
337
+ out5_2 = self.model5_2(out5)
338
+ out6 = torch.cat([out5_1, out5_2, out1], 1)
339
+
340
+ out6_1 = self.model6_1(out6)
341
+ out6_2 = self.model6_2(out6)
342
+
343
+ return out6_1, out6_2
344
+
345
+
346
+ class handpose_model(nn.Module):
347
+ def __init__(self):
348
+ super(handpose_model, self).__init__()
349
+
350
+ # these layers have no relu layer
351
+ no_relu_layers = [
352
+ 'conv6_2_CPM', 'Mconv7_stage2', 'Mconv7_stage3', 'Mconv7_stage4',
353
+ 'Mconv7_stage5', 'Mconv7_stage6'
354
+ ]
355
+ # stage 1
356
+ block1_0 = OrderedDict([('conv1_1', [3, 64, 3, 1, 1]),
357
+ ('conv1_2', [64, 64, 3, 1, 1]),
358
+ ('pool1_stage1', [2, 2, 0]),
359
+ ('conv2_1', [64, 128, 3, 1, 1]),
360
+ ('conv2_2', [128, 128, 3, 1, 1]),
361
+ ('pool2_stage1', [2, 2, 0]),
362
+ ('conv3_1', [128, 256, 3, 1, 1]),
363
+ ('conv3_2', [256, 256, 3, 1, 1]),
364
+ ('conv3_3', [256, 256, 3, 1, 1]),
365
+ ('conv3_4', [256, 256, 3, 1, 1]),
366
+ ('pool3_stage1', [2, 2, 0]),
367
+ ('conv4_1', [256, 512, 3, 1, 1]),
368
+ ('conv4_2', [512, 512, 3, 1, 1]),
369
+ ('conv4_3', [512, 512, 3, 1, 1]),
370
+ ('conv4_4', [512, 512, 3, 1, 1]),
371
+ ('conv5_1', [512, 512, 3, 1, 1]),
372
+ ('conv5_2', [512, 512, 3, 1, 1]),
373
+ ('conv5_3_CPM', [512, 128, 3, 1, 1])])
374
+
375
+ block1_1 = OrderedDict([('conv6_1_CPM', [128, 512, 1, 1, 0]),
376
+ ('conv6_2_CPM', [512, 22, 1, 1, 0])])
377
+
378
+ blocks = {}
379
+ blocks['block1_0'] = block1_0
380
+ blocks['block1_1'] = block1_1
381
+
382
+ # stage 2-6
383
+ for i in range(2, 7):
384
+ blocks['block%d' % i] = OrderedDict([
385
+ ('Mconv1_stage%d' % i, [150, 128, 7, 1, 3]),
386
+ ('Mconv2_stage%d' % i, [128, 128, 7, 1, 3]),
387
+ ('Mconv3_stage%d' % i, [128, 128, 7, 1, 3]),
388
+ ('Mconv4_stage%d' % i, [128, 128, 7, 1, 3]),
389
+ ('Mconv5_stage%d' % i, [128, 128, 7, 1, 3]),
390
+ ('Mconv6_stage%d' % i, [128, 128, 1, 1, 0]),
391
+ ('Mconv7_stage%d' % i, [128, 22, 1, 1, 0])
392
+ ])
393
+
394
+ for k in blocks.keys():
395
+ blocks[k] = make_layers(blocks[k], no_relu_layers)
396
+
397
+ self.model1_0 = blocks['block1_0']
398
+ self.model1_1 = blocks['block1_1']
399
+ self.model2 = blocks['block2']
400
+ self.model3 = blocks['block3']
401
+ self.model4 = blocks['block4']
402
+ self.model5 = blocks['block5']
403
+ self.model6 = blocks['block6']
404
+
405
+ def forward(self, x):
406
+ out1_0 = self.model1_0(x)
407
+ out1_1 = self.model1_1(out1_0)
408
+ concat_stage2 = torch.cat([out1_1, out1_0], 1)
409
+ out_stage2 = self.model2(concat_stage2)
410
+ concat_stage3 = torch.cat([out_stage2, out1_0], 1)
411
+ out_stage3 = self.model3(concat_stage3)
412
+ concat_stage4 = torch.cat([out_stage3, out1_0], 1)
413
+ out_stage4 = self.model4(concat_stage4)
414
+ concat_stage5 = torch.cat([out_stage4, out1_0], 1)
415
+ out_stage5 = self.model5(concat_stage5)
416
+ concat_stage6 = torch.cat([out_stage5, out1_0], 1)
417
+ out_stage6 = self.model6(concat_stage6)
418
+ return out_stage6
419
+
420
+
421
+ class Hand(object):
422
+ def __init__(self, model_path, device='cuda'):
423
+ self.model = handpose_model()
424
+ if torch.cuda.is_available():
425
+ self.model = self.model.to(device)
426
+ model_dict = transfer(self.model, torch.load(model_path))
427
+ self.model.load_state_dict(model_dict)
428
+ self.model.eval()
429
+ self.device = device
430
+
431
+ def __call__(self, oriImg):
432
+ scale_search = [0.5, 1.0, 1.5, 2.0]
433
+ # scale_search = [0.5]
434
+ boxsize = 368
435
+ stride = 8
436
+ padValue = 128
437
+ thre = 0.05
438
+ multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
439
+ heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 22))
440
+ # paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))
441
+
442
+ for m in range(len(multiplier)):
443
+ scale = multiplier[m]
444
+ imageToTest = cv2.resize(oriImg, (0, 0),
445
+ fx=scale,
446
+ fy=scale,
447
+ interpolation=cv2.INTER_CUBIC)
448
+ imageToTest_padded, pad = padRightDownCorner(
449
+ imageToTest, stride, padValue)
450
+ im = np.transpose(
451
+ np.float32(imageToTest_padded[:, :, :, np.newaxis]),
452
+ (3, 2, 0, 1)) / 256 - 0.5
453
+ im = np.ascontiguousarray(im)
454
+
455
+ data = torch.from_numpy(im).float()
456
+ if torch.cuda.is_available():
457
+ data = data.to(self.device)
458
+ # data = data.permute([2, 0, 1]).unsqueeze(0).float()
459
+ with torch.no_grad():
460
+ output = self.model(data).cpu().numpy()
461
+ # output = self.model(data).numpy()
462
+
463
+ # extract outputs, resize, and remove padding
464
+ heatmap = np.transpose(np.squeeze(output),
465
+ (1, 2, 0)) # output 1 is heatmaps
466
+ heatmap = cv2.resize(heatmap, (0, 0),
467
+ fx=stride,
468
+ fy=stride,
469
+ interpolation=cv2.INTER_CUBIC)
470
+ heatmap = heatmap[:imageToTest_padded.shape[0] -
471
+ pad[2], :imageToTest_padded.shape[1] - pad[3], :]
472
+ heatmap = cv2.resize(heatmap, (oriImg.shape[1], oriImg.shape[0]),
473
+ interpolation=cv2.INTER_CUBIC)
474
+
475
+ heatmap_avg += heatmap / len(multiplier)
476
+
477
+ all_peaks = []
478
+ for part in range(21):
479
+ map_ori = heatmap_avg[:, :, part]
480
+ one_heatmap = gaussian_filter(map_ori, sigma=3)
481
+ binary = np.ascontiguousarray(one_heatmap > thre, dtype=np.uint8)
482
+ # all values are below the threshold
483
+ if np.sum(binary) == 0:
484
+ all_peaks.append([0, 0])
485
+ continue
486
+ label_img, label_numbers = label(binary,
487
+ return_num=True,
488
+ connectivity=binary.ndim)
489
+ max_index = np.argmax([
490
+ np.sum(map_ori[label_img == i])
491
+ for i in range(1, label_numbers + 1)
492
+ ]) + 1
493
+ label_img[label_img != max_index] = 0
494
+ map_ori[label_img == 0] = 0
495
+
496
+ y, x = npmax(map_ori)
497
+ all_peaks.append([x, y])
498
+ return np.array(all_peaks)
499
+
500
+
501
+ class Body(object):
502
+ def __init__(self, model_path, device='cuda'):
503
+ self.model = bodypose_model()
504
+ if torch.cuda.is_available():
505
+ self.model = self.model.to(device)
506
+ model_dict = transfer(self.model, torch.load(model_path))
507
+ self.model.load_state_dict(model_dict)
508
+ self.model.eval()
509
+ self.device = device
510
+
511
+ def __call__(self, oriImg):
512
+ # scale_search = [0.5, 1.0, 1.5, 2.0]
513
+ scale_search = [0.5]
514
+ boxsize = 368
515
+ stride = 8
516
+ padValue = 128
517
+ thre1 = 0.1
518
+ thre2 = 0.05
519
+ multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
520
+ heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19))
521
+ paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))
522
+
523
+ for m in range(len(multiplier)):
524
+ scale = multiplier[m]
525
+ imageToTest = cv2.resize(oriImg, (0, 0),
526
+ fx=scale,
527
+ fy=scale,
528
+ interpolation=cv2.INTER_CUBIC)
529
+ imageToTest_padded, pad = padRightDownCorner(
530
+ imageToTest, stride, padValue)
531
+ im = np.transpose(
532
+ np.float32(imageToTest_padded[:, :, :, np.newaxis]),
533
+ (3, 2, 0, 1)) / 256 - 0.5
534
+ im = np.ascontiguousarray(im)
535
+
536
+ data = torch.from_numpy(im).float()
537
+ if torch.cuda.is_available():
538
+ data = data.to(self.device)
539
+ # data = data.permute([2, 0, 1]).unsqueeze(0).float()
540
+ with torch.no_grad():
541
+ Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data)
542
+ Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy()
543
+ Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy()
544
+
545
+ # extract outputs, resize, and remove padding
546
+ # heatmap = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[1]].data), (1, 2, 0))
547
+ # output 1 is heatmaps
548
+ heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2),
549
+ (1, 2, 0)) # output 1 is heatmaps
550
+ heatmap = cv2.resize(heatmap, (0, 0),
551
+ fx=stride,
552
+ fy=stride,
553
+ interpolation=cv2.INTER_CUBIC)
554
+ heatmap = heatmap[:imageToTest_padded.shape[0] -
555
+ pad[2], :imageToTest_padded.shape[1] - pad[3], :]
556
+ heatmap = cv2.resize(heatmap, (oriImg.shape[1], oriImg.shape[0]),
557
+ interpolation=cv2.INTER_CUBIC)
558
+
559
+ # paf = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[0]].data), (1, 2, 0)) # output 0 is PAFs
560
+ paf = np.transpose(np.squeeze(Mconv7_stage6_L1),
561
+ (1, 2, 0)) # output 0 is PAFs
562
+ paf = cv2.resize(paf, (0, 0),
563
+ fx=stride,
564
+ fy=stride,
565
+ interpolation=cv2.INTER_CUBIC)
566
+ paf = paf[:imageToTest_padded.shape[0] -
567
+ pad[2], :imageToTest_padded.shape[1] - pad[3], :]
568
+ paf = cv2.resize(paf, (oriImg.shape[1], oriImg.shape[0]),
569
+ interpolation=cv2.INTER_CUBIC)
570
+
571
+ heatmap_avg += heatmap / len(multiplier)
572
+ paf_avg += paf / len(multiplier)
573
+
574
+ all_peaks = []
575
+ peak_counter = 0
576
+
577
+ for part in range(18):
578
+ map_ori = heatmap_avg[:, :, part]
579
+ one_heatmap = gaussian_filter(map_ori, sigma=3)
580
+
581
+ map_left = np.zeros(one_heatmap.shape)
582
+ map_left[1:, :] = one_heatmap[:-1, :]
583
+ map_right = np.zeros(one_heatmap.shape)
584
+ map_right[:-1, :] = one_heatmap[1:, :]
585
+ map_up = np.zeros(one_heatmap.shape)
586
+ map_up[:, 1:] = one_heatmap[:, :-1]
587
+ map_down = np.zeros(one_heatmap.shape)
588
+ map_down[:, :-1] = one_heatmap[:, 1:]
589
+
590
+ peaks_binary = np.logical_and.reduce(
591
+ (one_heatmap >= map_left, one_heatmap >= map_right,
592
+ one_heatmap >= map_up, one_heatmap >= map_down,
593
+ one_heatmap > thre1))
594
+ peaks = list(
595
+ zip(np.nonzero(peaks_binary)[1],
596
+ np.nonzero(peaks_binary)[0])) # note reverse
597
+ peaks_with_score = [x + (map_ori[x[1], x[0]], ) for x in peaks]
598
+ peak_id = range(peak_counter, peak_counter + len(peaks))
599
+ peaks_with_score_and_id = [
600
+ peaks_with_score[i] + (peak_id[i], )
601
+ for i in range(len(peak_id))
602
+ ]
603
+
604
+ all_peaks.append(peaks_with_score_and_id)
605
+ peak_counter += len(peaks)
606
+
607
+ # find connection in the specified sequence, center 29 is in the position 15
608
+ limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9],
609
+ [9, 10], [10, 11], [2, 12], [12, 13], [13, 14], [2, 1],
610
+ [1, 15], [15, 17], [1, 16], [16, 18], [3, 17], [6, 18]]
611
+ # the middle joints heatmap correspondence
612
+ mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44],
613
+ [19, 20], [21, 22], [23, 24], [25, 26], [27, 28], [29, 30],
614
+ [47, 48], [49, 50], [53, 54], [51, 52], [55, 56], [37, 38],
615
+ [45, 46]]
616
+
617
+ connection_all = []
618
+ special_k = []
619
+ mid_num = 10
620
+
621
+ for k in range(len(mapIdx)):
622
+ score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]]
623
+ candA = all_peaks[limbSeq[k][0] - 1]
624
+ candB = all_peaks[limbSeq[k][1] - 1]
625
+ nA = len(candA)
626
+ nB = len(candB)
627
+ indexA, indexB = limbSeq[k]
628
+ if (nA != 0 and nB != 0):
629
+ connection_candidate = []
630
+ for i in range(nA):
631
+ for j in range(nB):
632
+ vec = np.subtract(candB[j][:2], candA[i][:2])
633
+ norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1])
634
+ norm = max(0.001, norm)
635
+ vec = np.divide(vec, norm)
636
+
637
+ startend = list(
638
+ zip(
639
+ np.linspace(candA[i][0],
640
+ candB[j][0],
641
+ num=mid_num),
642
+ np.linspace(candA[i][1],
643
+ candB[j][1],
644
+ num=mid_num)))
645
+
646
+ vec_x = np.array([
647
+ score_mid[int(round(startend[ii][1])),
648
+ int(round(startend[ii][0])), 0]
649
+ for ii in range(len(startend))
650
+ ])
651
+ vec_y = np.array([
652
+ score_mid[int(round(startend[ii][1])),
653
+ int(round(startend[ii][0])), 1]
654
+ for ii in range(len(startend))
655
+ ])
656
+
657
+ score_midpts = np.multiply(
658
+ vec_x, vec[0]) + np.multiply(vec_y, vec[1])
659
+ score_with_dist_prior = sum(score_midpts) / len(
660
+ score_midpts) + min(
661
+ 0.5 * oriImg.shape[0] / norm - 1, 0)
662
+ criterion1 = len(np.nonzero(
663
+ score_midpts > thre2)[0]) > 0.8 * len(score_midpts)
664
+ criterion2 = score_with_dist_prior > 0
665
+ if criterion1 and criterion2:
666
+ connection_candidate.append([
667
+ i, j, score_with_dist_prior,
668
+ score_with_dist_prior + candA[i][2] +
669
+ candB[j][2]
670
+ ])
671
+
672
+ connection_candidate = sorted(connection_candidate,
673
+ key=lambda x: x[2],
674
+ reverse=True)
675
+ connection = np.zeros((0, 5))
676
+ for c in range(len(connection_candidate)):
677
+ i, j, s = connection_candidate[c][0:3]
678
+ if (i not in connection[:, 3]
679
+ and j not in connection[:, 4]):
680
+ connection = np.vstack(
681
+ [connection, [candA[i][3], candB[j][3], s, i, j]])
682
+ if (len(connection) >= min(nA, nB)):
683
+ break
684
+
685
+ connection_all.append(connection)
686
+ else:
687
+ special_k.append(k)
688
+ connection_all.append([])
689
+
690
+ # last number in each row is the total parts number of that person
691
+ # the second last number in each row is the score of the overall configuration
692
+ subset = -1 * np.ones((0, 20))
693
+ candidate = np.array(
694
+ [item for sublist in all_peaks for item in sublist])
695
+
696
+ for k in range(len(mapIdx)):
697
+ if k not in special_k:
698
+ partAs = connection_all[k][:, 0]
699
+ partBs = connection_all[k][:, 1]
700
+ indexA, indexB = np.array(limbSeq[k]) - 1
701
+
702
+ for i in range(len(connection_all[k])): # = 1:size(temp,1)
703
+ found = 0
704
+ subset_idx = [-1, -1]
705
+ for j in range(len(subset)): # 1:size(subset,1):
706
+ if subset[j][indexA] == partAs[i] or subset[j][
707
+ indexB] == partBs[i]:
708
+ subset_idx[found] = j
709
+ found += 1
710
+
711
+ if found == 1:
712
+ j = subset_idx[0]
713
+ if subset[j][indexB] != partBs[i]:
714
+ subset[j][indexB] = partBs[i]
715
+ subset[j][-1] += 1
716
+ subset[j][-2] += candidate[
717
+ partBs[i].astype(int),
718
+ 2] + connection_all[k][i][2]
719
+ elif found == 2: # if found 2 and disjoint, merge them
720
+ j1, j2 = subset_idx
721
+ membership = ((subset[j1] >= 0).astype(int) +
722
+ (subset[j2] >= 0).astype(int))[:-2]
723
+ if len(np.nonzero(membership == 2)[0]) == 0: # merge
724
+ subset[j1][:-2] += (subset[j2][:-2] + 1)
725
+ subset[j1][-2:] += subset[j2][-2:]
726
+ subset[j1][-2] += connection_all[k][i][2]
727
+ subset = np.delete(subset, j2, 0)
728
+ else: # as like found == 1
729
+ subset[j1][indexB] = partBs[i]
730
+ subset[j1][-1] += 1
731
+ subset[j1][-2] += candidate[
732
+ partBs[i].astype(int),
733
+ 2] + connection_all[k][i][2]
734
+
735
+ # if find no partA in the subset, create a new subset
736
+ elif not found and k < 17:
737
+ row = -1 * np.ones(20)
738
+ row[indexA] = partAs[i]
739
+ row[indexB] = partBs[i]
740
+ row[-1] = 2
741
+ row[-2] = sum(
742
+ candidate[connection_all[k][i, :2].astype(int),
743
+ 2]) + connection_all[k][i][2]
744
+ subset = np.vstack([subset, row])
745
+ # delete rows of subset that have too few detected parts
746
+ deleteIdx = []
747
+ for i in range(len(subset)):
748
+ if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4:
749
+ deleteIdx.append(i)
750
+ subset = np.delete(subset, deleteIdx, axis=0)
751
+
752
+ # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts
753
+ # candidate: x, y, score, id
754
+ return candidate, subset
755
+
756
+
757
+ @ANNOTATORS.register_class()
758
+ class OpenposeAnnotator(BaseAnnotator, metaclass=ABCMeta):
759
+ para_dict = {}
760
+
761
+ def __init__(self, cfg, logger=None):
762
+ super().__init__(cfg, logger=logger)
763
+ with FS.get_from(cfg.BODY_MODEL_PATH,
764
+ wait_finish=True) as body_model_path:
765
+ self.body_estimation = Body(body_model_path, device='cpu')
766
+ with FS.get_from(cfg.HAND_MODEL_PATH,
767
+ wait_finish=True) as hand_model_path:
768
+ self.hand_estimation = Hand(hand_model_path, device='cpu')
769
+ self.use_hand = cfg.get('USE_HAND', False)
770
+
771
+ def to(self, device):
772
+ self.body_estimation.model = self.body_estimation.model.to(device)
773
+ self.body_estimation.device = device
774
+ self.hand_estimation.model = self.hand_estimation.model.to(device)
775
+ self.hand_estimation.device = device
776
+ return self
777
+
778
+ @torch.no_grad()
779
+ @torch.inference_mode()
780
+ @torch.autocast('cuda', enabled=False)
781
+ def forward(self, image):
782
+ if isinstance(image, Image.Image):
783
+ image = np.array(image)
784
+ elif isinstance(image, torch.Tensor):
785
+ image = image.detach().cpu().numpy()
786
+ elif isinstance(image, np.ndarray):
787
+ image = image.copy()
788
+ else:
789
+ raise TypeError(f'Unsupported datatype {type(image)}, only support np.ndarray, torch.Tensor, Pillow Image.')
790
+ image = image[:, :, ::-1]
791
+ candidate, subset = self.body_estimation(image)
792
+ canvas = np.zeros_like(image)
793
+ canvas = draw_bodypose(canvas, candidate, subset)
794
+ if self.use_hand:
795
+ hands_list = handDetect(candidate, subset, image)
796
+ all_hand_peaks = []
797
+ for x, y, w, is_left in hands_list:
798
+ peaks = self.hand_estimation(image[y:y + w, x:x + w, :])
799
+ peaks[:, 0] = np.where(peaks[:, 0] == 0, peaks[:, 0],
800
+ peaks[:, 0] + x)
801
+ peaks[:, 1] = np.where(peaks[:, 1] == 0, peaks[:, 1],
802
+ peaks[:, 1] + y)
803
+ all_hand_peaks.append(peaks)
804
+ canvas = draw_handpose(canvas, all_hand_peaks)
805
+ return canvas
806
+
807
+ @staticmethod
808
+ def get_config_template():
809
+ return dict_to_yaml('ANNOTATORS',
810
+ __class__.__name__,
811
+ OpenposeAnnotator.para_dict,
812
+ set_name=True)
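For reference, a small helper sketch for unpacking `Body()`'s outputs, following the comment above (`candidate` rows are `x, y, score, id`; each `subset` row stores 18 candidate indices or -1, then the total score and part count). The helper name is illustrative only:

```python
import numpy as np


def extract_people(candidate, subset):
    """Convert (candidate, subset) into a list of {part_index: (x, y, score)} dicts, one per person."""
    people = []
    for person in np.asarray(subset):
        keypoints = {}
        for part in range(18):
            idx = int(person[part])
            if idx == -1:
                continue  # this body part was not found for this person
            x, y, score = candidate[idx][:3]
            keypoints[part] = (float(x), float(y), float(score))
        people.append(keypoints)
    return people
```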
annotator/registry.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ from scepter.modules.utils.config import Config
4
+ from scepter.modules.utils.registry import Registry, build_from_config
5
+
6
+
7
+ def build_annotator(cfg, registry, logger=None, *args, **kwargs):
8
+ """ After build model, load pretrained model if exists key `pretrain`.
9
+
10
+ pretrain (str, dict): Describes how to load pretrained model.
11
+ str: treat pretrain as the model path;
12
+ dict: should contain key `path`, plus the other parameters taken by load_pretrained();
13
+ """
14
+ if not isinstance(cfg, Config):
15
+ raise TypeError(f'cfg must be of type Config, got {type(cfg)}')
16
+ if cfg.have('PRETRAINED_MODEL'):
17
+ pretrain_cfg = cfg.PRETRAINED_MODEL
18
+ if pretrain_cfg is not None and not isinstance(pretrain_cfg, (str)):
19
+ raise TypeError('Pretrain parameter must be a string')
20
+ else:
21
+ pretrain_cfg = None
22
+
23
+ model = build_from_config(cfg, registry, logger=logger, *args, **kwargs)
24
+ if pretrain_cfg is not None:
25
+ if hasattr(model, 'load_pretrained_model'):
26
+ model.load_pretrained_model(pretrain_cfg)
27
+ return model
28
+
29
+
30
+ ANNOTATORS = Registry('ANNOTATORS', build_func=build_annotator)
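A minimal sketch of the intended registration/build flow (the `NoOpAnnotator` below is illustrative only and does not exist in this repo; it assumes `build_from_config` dispatches on the `NAME` key, as the configs elsewhere in this commit do):

```python
from scepter.modules.annotator.registry import ANNOTATORS
from scepter.modules.utils.config import Config


@ANNOTATORS.register_class()
class NoOpAnnotator:
    """Toy annotator that returns its input unchanged (illustration only)."""
    def __init__(self, cfg, logger=None):
        self.cfg = cfg

    def forward(self, image):
        return image


cfg = Config(cfg_dict={'NAME': 'NoOpAnnotator'}, load=False)
annotator = ANNOTATORS.build(cfg)  # goes through build_annotator() above
```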
annotator/utils.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ import cv2
4
+ import numpy as np
5
+
6
+
7
+ def resize_image(input_image, resolution):
8
+ H, W, C = input_image.shape
9
+ H = float(H)
10
+ W = float(W)
11
+ k = float(resolution) / min(H, W)
12
+ H *= k
13
+ W *= k
14
+ H = int(np.round(H / 64.0)) * 64
15
+ W = int(np.round(W / 64.0)) * 64
16
+ img = cv2.resize(
17
+ input_image, (W, H),
18
+ interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA)
19
+ return img, k
20
+
21
+
22
+ def resize_image_ori(h, w, image, k):
23
+ img = cv2.resize(
24
+ image, (w, h),
25
+ interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA)
26
+ return img
27
+
28
+
29
+ class AnnotatorProcessor():
30
+ canny_cfg = {
31
+ 'NAME': 'CannyAnnotator',
32
+ 'LOW_THRESHOLD': 100,
33
+ 'HIGH_THRESHOLD': 200,
34
+ 'INPUT_KEYS': ['img'],
35
+ 'OUTPUT_KEYS': ['canny']
36
+ }
37
+ hed_cfg = {
38
+ 'NAME': 'HedAnnotator',
39
+ 'PRETRAINED_MODEL':
40
+ 'ms://damo/scepter_scedit@annotator/ckpts/ControlNetHED.pth',
41
+ 'INPUT_KEYS': ['img'],
42
+ 'OUTPUT_KEYS': ['hed']
43
+ }
44
+ openpose_cfg = {
45
+ 'NAME': 'OpenposeAnnotator',
46
+ 'BODY_MODEL_PATH':
47
+ 'ms://damo/scepter_scedit@annotator/ckpts/body_pose_model.pth',
48
+ 'HAND_MODEL_PATH':
49
+ 'ms://damo/scepter_scedit@annotator/ckpts/hand_pose_model.pth',
50
+ 'INPUT_KEYS': ['img'],
51
+ 'OUTPUT_KEYS': ['openpose']
52
+ }
53
+ midas_cfg = {
54
+ 'NAME': 'MidasDetector',
55
+ 'PRETRAINED_MODEL':
56
+ 'ms://damo/scepter_scedit@annotator/ckpts/dpt_hybrid-midas-501f0c75.pt',
57
+ 'INPUT_KEYS': ['img'],
58
+ 'OUTPUT_KEYS': ['depth']
59
+ }
60
+ mlsd_cfg = {
61
+ 'NAME': 'MLSDdetector',
62
+ 'PRETRAINED_MODEL':
63
+ 'ms://damo/scepter_scedit@annotator/ckpts/mlsd_large_512_fp32.pth',
64
+ 'INPUT_KEYS': ['img'],
65
+ 'OUTPUT_KEYS': ['mlsd']
66
+ }
67
+ color_cfg = {
68
+ 'NAME': 'ColorAnnotator',
69
+ 'RATIO': 64,
70
+ 'INPUT_KEYS': ['img'],
71
+ 'OUTPUT_KEYS': ['color']
72
+ }
73
+
74
+ anno_type_map = {
75
+ 'canny': canny_cfg,
76
+ 'hed': hed_cfg,
77
+ 'pose': openpose_cfg,
78
+ 'depth': midas_cfg,
79
+ 'mlsd': mlsd_cfg,
80
+ 'color': color_cfg
81
+ }
82
+
83
+ def __init__(self, anno_type):
84
+ from scepter.modules.annotator.registry import ANNOTATORS
85
+ from scepter.modules.utils.config import Config
86
+ from scepter.modules.utils.distribute import we
87
+
88
+ if isinstance(anno_type, str):
89
+ assert anno_type in self.anno_type_map.keys()
90
+ anno_type = [anno_type]
91
+ elif isinstance(anno_type, (list, tuple)):
92
+ assert all(tp in self.anno_type_map.keys() for tp in anno_type)
93
+ else:
94
+ raise Exception(f'Error anno_type: {anno_type}')
95
+
96
+ general_dict = {
97
+ 'NAME': 'GeneralAnnotator',
98
+ 'ANNOTATORS': [self.anno_type_map[tp] for tp in anno_type]
99
+ }
100
+ general_anno = Config(cfg_dict=general_dict, load=False)
101
+ self.general_ins = ANNOTATORS.build(general_anno).to(we.device_id)
102
+
103
+ def run(self, image, anno_type=None):
104
+ output_image = self.general_ins({'img': image})
105
+ if anno_type is not None:
106
+ if isinstance(anno_type, str) and anno_type in output_image:
107
+ return output_image[anno_type]
108
+ else:
109
+ return {
110
+ tp: output_image[tp]
111
+ for tp in anno_type if tp in output_image
112
+ }
113
+ else:
114
+ return output_image
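Usage sketch for `AnnotatorProcessor` (the dummy image below is a placeholder; the `ms://...` checkpoints in the config dicts above are fetched through scepter's `FS` on first use):

```python
import numpy as np

# Request the canny and color annotators defined in the config dicts above.
processor = AnnotatorProcessor(anno_type=['canny', 'color'])

dummy = np.random.randint(0, 255, size=(512, 512, 3), dtype=np.uint8)
outputs = processor.run(dummy)             # dict expected to contain 'canny' and 'color'
canny_map = processor.run(dummy, 'canny')  # single annotation map
```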
app.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import spaces
3
+ from PIL import Image
4
+ from typing import List
5
+
6
+ from pipeline import prepare_white_image, MultiViewGenerator
7
+ from util import download_file, unzip_file
8
+
9
+ download_file("https://huggingface.co/aki-0421/character-360/resolve/main/v2.ckpt", "v2.ckpt")
10
+ download_file("https://huggingface.co/hbyang/Hi3D/resolve/main/ckpts.zip", "ckpts.zip")
11
+
12
+ unzip_file("ckpts.zip", ".")
13
+
14
+ multi_view_generator = MultiViewGenerator(checkpoint_path="v2.ckpt")
15
+
16
+ @spaces.GPU(duration=120)
17
+ def generate_images(input_image: Image.Image) -> List[Image.Image]:
18
+ white_image = prepare_white_image(input_image=input_image)
19
+
20
+ return multi_view_generator.infer(white_image=white_image)
21
+
22
+
23
+ with gr.Blocks() as demo:
24
+ gr.Markdown("# GPU-accelerated Image Processing")
25
+ with gr.Row():
26
+ input_image = gr.Image(label="Input Image", type="pil") # 入力はPIL形式
27
+ output_gallery = gr.Gallery(label="Output Images (25 Variations)").style(grid=(5, 5))
28
+ submit_button = gr.Button("Generate")
29
+
30
+ submit_button.click(generate_images, inputs=input_image, outputs=output_gallery)
31
+
32
+ demo.launch()
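A hedged local smoke test for the app above (file name and output paths are placeholders; it assumes the `spaces.GPU` decorator is a no-op outside of Hugging Face Spaces, and note that importing `app` triggers the checkpoint downloads):

```python
from PIL import Image

from app import generate_images  # importing app downloads v2.ckpt and ckpts.zip

sample = Image.open("sample_input.png")  # placeholder input path
views = generate_images(sample)
for idx, view in enumerate(views):
    view.save(f"view_{idx:02d}.png")
```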
dataset/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ benchmarks/benchmarking_Random_grayscale.png
dataset/opencv_transforms/__init__.py ADDED
File without changes
dataset/opencv_transforms/functional.py ADDED
@@ -0,0 +1,598 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import random
3
+
4
+ import torch
5
+ from PIL import Image, ImageEnhance, ImageOps
6
+
7
+ try:
8
+ import accimage
9
+ except ImportError:
10
+ accimage = None
11
+ import collections
12
+ import numbers
13
+ import types
14
+ import warnings
15
+
16
+ import cv2
17
+ import numpy as np
18
+ from PIL import Image
19
+
20
+ _cv2_pad_to_str = {
21
+ 'constant': cv2.BORDER_CONSTANT,
22
+ 'edge': cv2.BORDER_REPLICATE,
23
+ 'reflect': cv2.BORDER_REFLECT_101,
24
+ 'symmetric': cv2.BORDER_REFLECT
25
+ }
26
+ _cv2_interpolation_to_str = {
27
+ 'nearest': cv2.INTER_NEAREST,
28
+ 'bilinear': cv2.INTER_LINEAR,
29
+ 'area': cv2.INTER_AREA,
30
+ 'bicubic': cv2.INTER_CUBIC,
31
+ 'lanczos': cv2.INTER_LANCZOS4
32
+ }
33
+ _cv2_interpolation_from_str = {v: k for k, v in _cv2_interpolation_to_str.items()}
34
+
35
+
36
+ def _is_pil_image(img):
37
+ if accimage is not None:
38
+ return isinstance(img, (Image.Image, accimage.Image))
39
+ else:
40
+ return isinstance(img, Image.Image)
41
+
42
+
43
+ def _is_tensor_image(img):
44
+ return torch.is_tensor(img) and img.ndimension() == 3
45
+
46
+
47
+ def _is_numpy_image(img):
48
+ return isinstance(img, np.ndarray) and (img.ndim in {2, 3})
49
+
50
+
51
+ def to_tensor(pic):
52
+ """Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor.
53
+ See ``ToTensor`` for more details.
54
+ Args:
55
+ pic (PIL Image or numpy.ndarray): Image to be converted to tensor.
56
+ Returns:
57
+ Tensor: Converted image.
58
+ """
59
+ if not (_is_numpy_image(pic)):
60
+ raise TypeError('pic should be ndarray. Got {}'.format(type(pic)))
61
+
62
+ # handle numpy array
63
+ img = torch.from_numpy(pic.transpose((2, 0, 1)))
64
+ # backward compatibility
65
+ if isinstance(img, torch.ByteTensor) or img.dtype == torch.uint8:
66
+ return img.float().div(255)
67
+ else:
68
+ return img
69
+
70
+
71
+ def normalize(tensor, mean, std):
72
+ """Normalize a tensor image with mean and standard deviation.
73
+ .. note::
74
+ This transform acts in-place, i.e., it mutates the input tensor.
75
+ See :class:`~torchvision.transforms.Normalize` for more details.
76
+ Args:
77
+ tensor (Tensor): Tensor image of size (C, H, W) to be normalized.
78
+ mean (sequence): Sequence of means for each channel.
79
+ std (sequence): Sequence of standard deviations for each channely.
80
+ Returns:
81
+ Tensor: Normalized Tensor image.
82
+ """
83
+ if not _is_tensor_image(tensor):
84
+ raise TypeError('tensor is not a torch image.')
85
+
86
+ # This is faster than using broadcasting, don't change without benchmarking
87
+ for t, m, s in zip(tensor, mean, std):
88
+ t.sub_(m).div_(s)
89
+ return tensor
90
+
91
+
92
+ def resize(img, size, interpolation=cv2.INTER_LINEAR):
93
+ r"""Resize the input numpy ndarray to the given size.
94
+ Args:
95
+ img (numpy ndarray): Image to be resized.
96
+ size (sequence or int): Desired output size. If size is a sequence like
97
+ (h, w), the output size will be matched to this. If size is an int,
98
+ the smaller edge of the image will be matched to this number maintaining
99
+ the aspect ratio. i.e, if height > width, then image will be rescaled to
100
+ :math:`\left(\text{size} \times \frac{\text{height}}{\text{width}}, \text{size}\right)`
101
+ interpolation (int, optional): Desired interpolation. Default is
102
+ ``cv2.INTER_LINEAR``
103
+ Returns:
104
+ numpy ndarray: Resized image.
105
+ """
106
+ if not _is_numpy_image(img):
107
+ raise TypeError('img should be numpy image. Got {}'.format(type(img)))
108
+ if not (isinstance(size, int) or (isinstance(size, collections.abc.Iterable) and len(size) == 2)):
109
+ raise TypeError('Got inappropriate size arg: {}'.format(size))
110
+ h, w = img.shape[0], img.shape[1]
111
+
112
+ if isinstance(size, int):
113
+ if (w <= h and w == size) or (h <= w and h == size):
114
+ return img
115
+ if w < h:
116
+ ow = size
117
+ oh = int(size * h / w)
118
+ else:
119
+ oh = size
120
+ ow = int(size * w / h)
121
+ else:
122
+ ow, oh = size[1], size[0]
123
+ output = cv2.resize(img, dsize=(ow, oh), interpolation=interpolation)
124
+ if img.shape[2] == 1:
125
+ return output[:, :, np.newaxis]
126
+ else:
127
+ return output
128
+
129
+
130
+ def scale(*args, **kwargs):
131
+ warnings.warn("The use of the transforms.Scale transform is deprecated, " + "please use transforms.Resize instead.")
132
+ return resize(*args, **kwargs)
133
+
134
+
135
+ def pad(img, padding, fill=0, padding_mode='constant'):
136
+ r"""Pad the given numpy ndarray on all sides with specified padding mode and fill value.
137
+ Args:
138
+ img (numpy ndarray): image to be padded.
139
+ padding (int or tuple): Padding on each border. If a single int is provided this
140
+ is used to pad all borders. If tuple of length 2 is provided this is the padding
141
+ on left/right and top/bottom respectively. If a tuple of length 4 is provided
142
+ this is the padding for the left, top, right and bottom borders
143
+ respectively.
144
+ fill: Pixel fill value for constant fill. Default is 0. If a tuple of
145
+ length 3, it is used to fill R, G, B channels respectively.
146
+ This value is only used when the padding_mode is constant
147
+ padding_mode: Type of padding. Should be: constant, edge, reflect or symmetric. Default is constant.
148
+ - constant: pads with a constant value, this value is specified with fill
149
+ - edge: pads with the last value on the edge of the image
150
+ - reflect: pads with reflection of image (without repeating the last value on the edge)
151
+ padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode
152
+ will result in [3, 2, 1, 2, 3, 4, 3, 2]
153
+ - symmetric: pads with reflection of image (repeating the last value on the edge)
154
+ padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode
155
+ will result in [2, 1, 1, 2, 3, 4, 4, 3]
156
+ Returns:
157
+ Numpy image: padded image.
158
+ """
159
+ if not _is_numpy_image(img):
160
+ raise TypeError('img should be numpy ndarray. Got {}'.format(type(img)))
161
+ if not isinstance(padding, (numbers.Number, tuple, list)):
162
+ raise TypeError('Got inappropriate padding arg')
163
+ if not isinstance(fill, (numbers.Number, str, tuple)):
164
+ raise TypeError('Got inappropriate fill arg')
165
+ if not isinstance(padding_mode, str):
166
+ raise TypeError('Got inappropriate padding_mode arg')
167
+ if isinstance(padding, collections.abc.Sequence) and len(padding) not in [2, 4]:
168
+ raise ValueError("Padding must be an int or a 2, or 4 element tuple, not a " +
169
+ "{} element tuple".format(len(padding)))
170
+
171
+ assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'], \
172
+ 'Padding mode should be either constant, edge, reflect or symmetric'
173
+
174
+ if isinstance(padding, int):
175
+ pad_left = pad_right = pad_top = pad_bottom = padding
176
+ if isinstance(padding, collections.abc.Sequence) and len(padding) == 2:
177
+ pad_left = pad_right = padding[0]
178
+ pad_top = pad_bottom = padding[1]
179
+ if isinstance(padding, collections.abc.Sequence) and len(padding) == 4:
180
+ pad_left = padding[0]
181
+ pad_top = padding[1]
182
+ pad_right = padding[2]
183
+ pad_bottom = padding[3]
184
+ if img.shape[2] == 1:
185
+ return cv2.copyMakeBorder(img,
186
+ top=pad_top,
187
+ bottom=pad_bottom,
188
+ left=pad_left,
189
+ right=pad_right,
190
+ borderType=_cv2_pad_to_str[padding_mode],
191
+ value=fill)[:, :, np.newaxis]
192
+ else:
193
+ return cv2.copyMakeBorder(img,
194
+ top=pad_top,
195
+ bottom=pad_bottom,
196
+ left=pad_left,
197
+ right=pad_right,
198
+ borderType=_cv2_pad_to_str[padding_mode],
199
+ value=fill)
200
+
201
+
202
+ def crop(img, i, j, h, w):
203
+ """Crop the given PIL Image.
204
+ Args:
205
+ img (numpy ndarray): Image to be cropped.
206
+ i: Upper pixel coordinate.
207
+ j: Left pixel coordinate.
208
+ h: Height of the cropped image.
209
+ w: Width of the cropped image.
210
+ Returns:
211
+ numpy ndarray: Cropped image.
212
+ """
213
+ if not _is_numpy_image(img):
214
+ raise TypeError('img should be numpy image. Got {}'.format(type(img)))
215
+
216
+ return img[i:i + h, j:j + w, :]
217
+
218
+
219
+ def center_crop(img, output_size):
220
+ if isinstance(output_size, numbers.Number):
221
+ output_size = (int(output_size), int(output_size))
222
+ h, w = img.shape[0:2]
223
+ th, tw = output_size
224
+ i = int(round((h - th) / 2.))
225
+ j = int(round((w - tw) / 2.))
226
+ return crop(img, i, j, th, tw)
227
+
228
+
229
+ def resized_crop(img, i, j, h, w, size, interpolation=cv2.INTER_LINEAR):
230
+ """Crop the given numpy ndarray and resize it to desired size.
231
+ Notably used in :class:`~torchvision.transforms.RandomResizedCrop`.
232
+ Args:
233
+ img (numpy ndarray): Image to be cropped.
234
+ i: Upper pixel coordinate.
235
+ j: Left pixel coordinate.
236
+ h: Height of the cropped image.
237
+ w: Width of the cropped image.
238
+ size (sequence or int): Desired output size. Same semantics as ``scale``.
239
+ interpolation (int, optional): Desired interpolation. Default is
240
+ ``cv2.INTER_LINEAR``.
241
+ Returns:
242
+ numpy ndarray: Cropped and resized image.
243
+ """
244
+ assert _is_numpy_image(img), 'img should be numpy image'
245
+ img = crop(img, i, j, h, w)
246
+ img = resize(img, size, interpolation=interpolation)
247
+ return img
248
+
249
+
250
+ def hflip(img):
251
+ """Horizontally flip the given numpy ndarray.
252
+ Args:
253
+ img (numpy ndarray): image to be flipped.
254
+ Returns:
255
+ numpy ndarray: Horizontally flipped image.
256
+ """
257
+ if not _is_numpy_image(img):
258
+ raise TypeError('img should be numpy image. Got {}'.format(type(img)))
259
+ # img[:,::-1] is much faster, but doesn't work with torch.from_numpy()!
260
+ if img.shape[2] == 1:
261
+ return cv2.flip(img, 1)[:, :, np.newaxis]
262
+ else:
263
+ return cv2.flip(img, 1)
264
+
265
+
266
+ def vflip(img):
267
+ """Vertically flip the given numpy ndarray.
268
+ Args:
269
+ img (numpy ndarray): Image to be flipped.
270
+ Returns:
271
+ numpy ndarray: Vertically flipped image.
272
+ """
273
+ if not _is_numpy_image(img):
274
+ raise TypeError('img should be numpy Image. Got {}'.format(type(img)))
275
+ if img.shape[2] == 1:
276
+ return cv2.flip(img, 0)[:, :, np.newaxis]
277
+ else:
278
+ return cv2.flip(img, 0)
279
+ # img[::-1] is much faster, but doesn't work with torch.from_numpy()!
280
+
281
+
282
+ def five_crop(img, size):
283
+ """Crop the given numpy ndarray into four corners and the central crop.
284
+ .. Note::
285
+ This transform returns a tuple of images and there may be a
286
+ mismatch in the number of inputs and targets your ``Dataset`` returns.
287
+ Args:
288
+ size (sequence or int): Desired output size of the crop. If size is an
289
+ int instead of sequence like (h, w), a square crop (size, size) is
290
+ made.
291
+ Returns:
292
+ tuple: tuple (tl, tr, bl, br, center)
293
+ Corresponding top left, top right, bottom left, bottom right and center crop.
294
+ """
295
+ if isinstance(size, numbers.Number):
296
+ size = (int(size), int(size))
297
+ else:
298
+ assert len(size) == 2, "Please provide only two dimensions (h, w) for size."
299
+
300
+ h, w = img.shape[0:2]
301
+ crop_h, crop_w = size
302
+ if crop_w > w or crop_h > h:
303
+ raise ValueError("Requested crop size {} is bigger than input size {}".format(size, (h, w)))
304
+ tl = crop(img, 0, 0, crop_h, crop_w)
305
+ tr = crop(img, 0, w - crop_w, crop_h, crop_w)
306
+ bl = crop(img, h - crop_h, 0, crop_h, crop_w)
307
+ br = crop(img, h - crop_h, w - crop_w, crop_h, crop_w)
308
+ center = center_crop(img, (crop_h, crop_w))
309
+ return tl, tr, bl, br, center
310
+
311
+
312
+ def ten_crop(img, size, vertical_flip=False):
313
+ r"""Crop the given numpy ndarray into four corners and the central crop plus the
314
+ flipped version of these (horizontal flipping is used by default).
315
+ .. Note::
316
+ This transform returns a tuple of images and there may be a
317
+ mismatch in the number of inputs and targets your ``Dataset`` returns.
318
+ Args:
319
+ size (sequence or int): Desired output size of the crop. If size is an
320
+ int instead of sequence like (h, w), a square crop (size, size) is
321
+ made.
322
+ vertical_flip (bool): Use vertical flipping instead of horizontal
323
+ Returns:
324
+ tuple: tuple (tl, tr, bl, br, center, tl_flip, tr_flip, bl_flip, br_flip, center_flip)
325
+ Corresponding top left, top right, bottom left, bottom right and center crop
326
+ and same for the flipped image.
327
+ """
328
+ if isinstance(size, numbers.Number):
329
+ size = (int(size), int(size))
330
+ else:
331
+ assert len(size) == 2, "Please provide only two dimensions (h, w) for size."
332
+
333
+ first_five = five_crop(img, size)
334
+
335
+ if vertical_flip:
336
+ img = vflip(img)
337
+ else:
338
+ img = hflip(img)
339
+
340
+ second_five = five_crop(img, size)
341
+ return first_five + second_five
342
+
343
+
344
+ def adjust_brightness(img, brightness_factor):
345
+ """Adjust brightness of an Image.
346
+ Args:
347
+ img (numpy ndarray): numpy ndarray to be adjusted.
348
+ brightness_factor (float): How much to adjust the brightness. Can be
349
+ any non negative number. 0 gives a black image, 1 gives the
350
+ original image while 2 increases the brightness by a factor of 2.
351
+ Returns:
352
+ numpy ndarray: Brightness adjusted image.
353
+ """
354
+ if not _is_numpy_image(img):
355
+ raise TypeError('img should be numpy Image. Got {}'.format(type(img)))
356
+ table = np.array([i * brightness_factor for i in range(0, 256)]).clip(0, 255).astype('uint8')
357
+ # same thing but a bit slower
358
+ # cv2.convertScaleAbs(img, alpha=brightness_factor, beta=0)
359
+ if img.shape[2] == 1:
360
+ return cv2.LUT(img, table)[:, :, np.newaxis]
361
+ else:
362
+ return cv2.LUT(img, table)
363
+
364
+
365
+ def adjust_contrast(img, contrast_factor):
366
+ """Adjust contrast of an mage.
367
+ Args:
368
+ img (numpy ndarray): numpy ndarray to be adjusted.
369
+ contrast_factor (float): How much to adjust the contrast. Can be any
370
+ non negative number. 0 gives a solid gray image, 1 gives the
371
+ original image while 2 increases the contrast by a factor of 2.
372
+ Returns:
373
+ numpy ndarray: Contrast adjusted image.
374
+ """
375
+ # much faster to use the LUT construction than anything else I've tried
376
+ # it's because you have to change dtypes multiple times
377
+ if not _is_numpy_image(img):
378
+ raise TypeError('img should be numpy Image. Got {}'.format(type(img)))
379
+
380
+ # input is RGB
381
+ if img.ndim > 2 and img.shape[2] == 3:
382
+ mean_value = round(cv2.mean(cv2.cvtColor(img, cv2.COLOR_RGB2GRAY))[0])
383
+ elif img.ndim == 2:
384
+ # grayscale input
385
+ mean_value = round(cv2.mean(img)[0])
386
+ else:
387
+ # multichannel input
388
+ mean_value = round(np.mean(img))
389
+
390
+ table = np.array([(i - mean_value) * contrast_factor + mean_value for i in range(0, 256)]).clip(0,
391
+ 255).astype('uint8')
392
+ # enhancer = ImageEnhance.Contrast(img)
393
+ # img = enhancer.enhance(contrast_factor)
394
+ if img.ndim == 2 or img.shape[2] == 1:
395
+ return cv2.LUT(img, table)[:, :, np.newaxis]
396
+ else:
397
+ return cv2.LUT(img, table)
398
+
399
+
400
+ def adjust_saturation(img, saturation_factor):
401
+ """Adjust color saturation of an image.
402
+ Args:
403
+ img (numpy ndarray): numpy ndarray to be adjusted.
404
+ saturation_factor (float): How much to adjust the saturation. 0 will
405
+ give a black and white image, 1 will give the original image while
406
+ 2 will enhance the saturation by a factor of 2.
407
+ Returns:
408
+ numpy ndarray: Saturation adjusted image.
409
+ """
410
+ # ~10ms slower than PIL!
411
+ if not _is_numpy_image(img):
412
+ raise TypeError('img should be numpy Image. Got {}'.format(type(img)))
413
+ img = Image.fromarray(img)
414
+ enhancer = ImageEnhance.Color(img)
415
+ img = enhancer.enhance(saturation_factor)
416
+ return np.array(img)
417
+
418
+
419
+ def adjust_hue(img, hue_factor):
420
+ """Adjust hue of an image.
421
+ The image hue is adjusted by converting the image to HSV and
422
+ cyclically shifting the intensities in the hue channel (H).
423
+ The image is then converted back to original image mode.
424
+ `hue_factor` is the amount of shift in H channel and must be in the
425
+ interval `[-0.5, 0.5]`.
426
+ See `Hue`_ for more details.
427
+ .. _Hue: https://en.wikipedia.org/wiki/Hue
428
+ Args:
429
+ img (numpy ndarray): numpy ndarray to be adjusted.
430
+ hue_factor (float): How much to shift the hue channel. Should be in
431
+ [-0.5, 0.5]. 0.5 and -0.5 give complete reversal of hue channel in
432
+ HSV space in positive and negative direction respectively.
433
+ 0 means no shift. Therefore, both -0.5 and 0.5 will give an image
434
+ with complementary colors while 0 gives the original image.
435
+ Returns:
436
+ numpy ndarray: Hue adjusted image.
437
+ """
438
+ # After testing, found that OpenCV calculates the Hue in a call to
439
+ # cv2.cvtColor(..., cv2.COLOR_BGR2HSV) differently from PIL
440
+
441
+ # This function takes 160ms! should be avoided
442
+ if not (-0.5 <= hue_factor <= 0.5):
443
+ raise ValueError('hue_factor ({}) is not in [-0.5, 0.5].'.format(hue_factor))
444
+ if not _is_numpy_image(img):
445
+ raise TypeError('img should be numpy Image. Got {}'.format(type(img)))
446
+ img = Image.fromarray(img)
447
+ input_mode = img.mode
448
+ if input_mode in {'L', '1', 'I', 'F'}:
449
+ return np.array(img)
450
+
451
+ h, s, v = img.convert('HSV').split()
452
+
453
+ np_h = np.array(h, dtype=np.uint8)
454
+ # uint8 addition take cares of rotation across boundaries
455
+ with np.errstate(over='ignore'):
456
+ np_h += np.uint8(hue_factor * 255)
457
+ h = Image.fromarray(np_h, 'L')
458
+
459
+ img = Image.merge('HSV', (h, s, v)).convert(input_mode)
460
+ return np.array(img)
461
+
462
+
463
+ def adjust_gamma(img, gamma, gain=1):
464
+ r"""Perform gamma correction on an image.
465
+ Also known as Power Law Transform. Intensities in RGB mode are adjusted
466
+ based on the following equation:
467
+ .. math::
468
+ I_{\text{out}} = 255 \times \text{gain} \times \left(\frac{I_{\text{in}}}{255}\right)^{\gamma}
469
+ See `Gamma Correction`_ for more details.
470
+ .. _Gamma Correction: https://en.wikipedia.org/wiki/Gamma_correction
471
+ Args:
472
+ img (numpy ndarray): numpy ndarray to be adjusted.
473
+ gamma (float): Non negative real number, same as :math:`\gamma` in the equation.
474
+ gamma larger than 1 makes the shadows darker,
475
+ while gamma smaller than 1 makes dark regions lighter.
476
+ gain (float): The constant multiplier.
477
+ """
478
+ if not _is_numpy_image(img):
479
+ raise TypeError('img should be numpy Image. Got {}'.format(type(img)))
480
+
481
+ if gamma < 0:
482
+ raise ValueError('Gamma should be a non-negative real number')
483
+ # from here
484
+ # https://stackoverflow.com/questions/33322488/how-to-change-image-illumination-in-opencv-python/41061351
485
+ table = np.array([((i / 255.0)**gamma) * 255 * gain for i in np.arange(0, 256)]).astype('uint8')
486
+ if img.shape[2] == 1:
487
+ return cv2.LUT(img, table)[:, :, np.newaxis]
488
+ else:
489
+ return cv2.LUT(img, table)
490
+
491
+
492
+ def rotate(img, angle, resample=False, expand=False, center=None):
493
+ """Rotate the image by angle.
494
+ Args:
495
+ img (numpy ndarray): numpy ndarray to be rotated.
496
+ angle (float or int): Rotation angle in degrees, counter-clockwise.
497
+ resample (``PIL.Image.NEAREST`` or ``PIL.Image.BILINEAR`` or ``PIL.Image.BICUBIC``, optional):
498
+ An optional resampling filter. See `filters`_ for more information.
499
+ If omitted, or if the image has mode "1" or "P", it is set to ``PIL.Image.NEAREST``.
500
+ expand (bool, optional): Optional expansion flag.
501
+ If true, expands the output image to make it large enough to hold the entire rotated image.
502
+ If false or omitted, make the output image the same size as the input image.
503
+ Note that the expand flag assumes rotation around the center and no translation.
504
+ center (2-tuple, optional): Optional center of rotation.
505
+ Origin is the upper left corner.
506
+ Default is the center of the image.
507
+ .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters
508
+ """
509
+ if not _is_numpy_image(img):
510
+ raise TypeError('img should be numpy Image. Got {}'.format(type(img)))
511
+ rows, cols = img.shape[0:2]
512
+ if center is None:
513
+ center = (cols / 2, rows / 2)
514
+ M = cv2.getRotationMatrix2D(center, angle, 1)
515
+ if img.shape[2] == 1:
516
+ return cv2.warpAffine(img, M, (cols, rows))[:, :, np.newaxis]
517
+ else:
518
+ return cv2.warpAffine(img, M, (cols, rows))
519
+
520
+
521
+ def _get_affine_matrix(center, angle, translate, scale, shear):
522
+ # Helper method to compute matrix for affine transformation
523
+ # We need compute affine transformation matrix: M = T * C * RSS * C^-1
524
+ # where T is translation matrix: [1, 0, tx | 0, 1, ty | 0, 0, 1]
525
+ # C is translation matrix to keep center: [1, 0, cx | 0, 1, cy | 0, 0, 1]
526
+ # RSS is rotation with scale and shear matrix
527
+ # RSS(a, scale, shear) = [ cos(a)*scale -sin(a + shear)*scale 0]
528
+ # [ sin(a)*scale cos(a + shear)*scale 0]
529
+ # [ 0 0 1]
530
+
531
+ angle = math.radians(angle)
532
+ shear = math.radians(shear)
533
+ # scale = 1.0 / scale
534
+
535
+ T = np.array([[1, 0, translate[0]], [0, 1, translate[1]], [0, 0, 1]])
536
+ C = np.array([[1, 0, center[0]], [0, 1, center[1]], [0, 0, 1]])
537
+ RSS = np.array([[math.cos(angle) * scale, -math.sin(angle + shear) * scale, 0],
538
+ [math.sin(angle) * scale, math.cos(angle + shear) * scale, 0], [0, 0, 1]])
539
+ matrix = T @ C @ RSS @ np.linalg.inv(C)
540
+
541
+ return matrix[:2, :]
542
+
543
+
544
+ def affine(img, angle, translate, scale, shear, interpolation=cv2.INTER_LINEAR, mode=cv2.BORDER_CONSTANT, fillcolor=0):
545
+ """Apply affine transformation on the image keeping image center invariant
546
+ Args:
547
+ img (numpy ndarray): numpy ndarray to be transformed.
548
+ angle (float or int): rotation angle in degrees between -180 and 180, clockwise direction.
549
+ translate (list or tuple of integers): horizontal and vertical translations (post-rotation translation)
550
+ scale (float): overall scale
551
+ shear (float): shear angle value in degrees between -180 to 180, clockwise direction.
552
+ interpolation (``cv2.INTER_NEAREST` or ``cv2.INTER_LINEAR`` or ``cv2.INTER_AREA``, ``cv2.INTER_CUBIC``):
553
+ An optional resampling filter.
554
+ See `filters`_ for more information.
555
+ If omitted, it is set to ``cv2.INTER_LINEAR``, for bilinear interpolation.
556
+ mode (``cv2.BORDER_CONSTANT`` or ``cv2.BORDER_REPLICATE`` or ``cv2.BORDER_REFLECT`` or ``cv2.BORDER_REFLECT_101``)
557
+ Method for filling in border regions.
558
+ Defaults to cv2.BORDER_CONSTANT, meaning areas outside the image are filled with a value (val, default 0)
559
+ fillcolor (int): Optional fill color for the area outside the transform in the output image. Default: 0
560
+ """
561
+ if not _is_numpy_image(img):
562
+ raise TypeError('img should be numpy Image. Got {}'.format(type(img)))
563
+
564
+ assert isinstance(translate, (tuple, list)) and len(translate) == 2, \
565
+ "Argument translate should be a list or tuple of length 2"
566
+
567
+ assert scale > 0.0, "Argument scale should be positive"
568
+
569
+ output_size = img.shape[0:2]
570
+ center = (img.shape[1] * 0.5 + 0.5, img.shape[0] * 0.5 + 0.5)
571
+ matrix = _get_affine_matrix(center, angle, translate, scale, shear)
572
+
573
+ if img.shape[2] == 1:
574
+ return cv2.warpAffine(img, matrix, output_size[::-1], interpolation, borderMode=mode,
575
+ borderValue=fillcolor)[:, :, np.newaxis]
576
+ else:
577
+ return cv2.warpAffine(img, matrix, output_size[::-1], interpolation, borderMode=mode, borderValue=fillcolor)
578
+
579
+
580
+ def to_grayscale(img, num_output_channels: int = 1):
581
+ """Convert image to grayscale version of image.
582
+ Args:
583
+ img (numpy ndarray): Image to be converted to grayscale.
584
+ num_output_channels: int
585
+ if 1 : returned image is single channel
586
+ if 3 : returned image is 3 channel with r = g = b
587
+ Returns:
588
+ numpy ndarray: Grayscale version of the image.
589
+ """
590
+ if not _is_numpy_image(img):
591
+ raise TypeError('img should be numpy ndarray. Got {}'.format(type(img)))
592
+
593
+ if num_output_channels == 1:
594
+ img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)[:, :, np.newaxis]
595
+ elif num_output_channels == 3:
596
+ # much faster than doing cvtColor to go back to gray
597
+ img = np.broadcast_to(cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)[:, :, np.newaxis], img.shape)
598
+ return img
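The functional API above mirrors `torchvision.transforms.functional` but operates on numpy ndarrays via OpenCV. Below is a minimal usage sketch, not part of the commit: it assumes the module imports as `opencv_transforms.functional` and that `resize` (defined earlier in this file, outside this excerpt) accepts an `(h, w)` tuple; the input image is synthetic.

import numpy as np

from opencv_transforms import functional as F

# A fake H x W x C uint8 image, the layout every function above expects.
img = np.random.randint(0, 256, size=(64, 80, 3), dtype=np.uint8)

# 'reflect' mirrors without repeating the edge pixel; 'symmetric' repeats it.
reflected = F.pad(img, padding=4, padding_mode='reflect')
symmetric = F.pad(img, padding=4, padding_mode='symmetric')
assert reflected.shape == symmetric.shape == (72, 88, 3)

# Crop a 32x32 window at (row=10, col=20), then resize the patch to 48x48.
patch = F.resized_crop(img, i=10, j=20, h=32, w=32, size=(48, 48))
assert patch.shape[:2] == (48, 48)

# Gamma > 1 darkens shadows; gamma < 1 lifts dark regions (see adjust_gamma).
darker = F.adjust_gamma(img, gamma=2.0)
lighter = F.adjust_gamma(img, gamma=0.5)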
dataset/opencv_transforms/transforms.py ADDED
@@ -0,0 +1,1044 @@
1
+ from __future__ import division
2
+
3
+ import collections
4
+ import math
5
+ import numbers
6
+ import random
7
+ import types
8
+ import warnings
9
+
10
+ # from PIL import Image, ImageOps, ImageEnhance
11
+ try:
12
+ import accimage
13
+ except ImportError:
14
+ accimage = None
15
+ import cv2
16
+ import numpy as np
17
+ import torch
18
+
19
+ from . import functional as F
20
+
21
+ __all__ = [
22
+ "Compose", "ToTensor", "Normalize", "Resize", "Scale",
23
+ "CenterCrop", "Pad", "Lambda", "RandomApply", "RandomChoice",
24
+ "RandomOrder", "RandomCrop", "RandomHorizontalFlip", "RandomVerticalFlip",
25
+ "RandomResizedCrop", "RandomSizedCrop", "FiveCrop", "TenCrop",
26
+ "LinearTransformation", "ColorJitter", "RandomRotation", "RandomAffine",
27
+ "Grayscale", "RandomGrayscale"
28
+ ]
29
+
30
+ _cv2_pad_to_str = {
31
+ 'constant': cv2.BORDER_CONSTANT,
32
+ 'edge': cv2.BORDER_REPLICATE,
33
+ 'reflect': cv2.BORDER_REFLECT_101,
34
+ 'symmetric': cv2.BORDER_REFLECT
35
+ }
36
+ _cv2_interpolation_to_str = {
37
+ 'nearest': cv2.INTER_NEAREST,
38
+ 'bilinear': cv2.INTER_LINEAR,
39
+ 'area': cv2.INTER_AREA,
40
+ 'bicubic': cv2.INTER_CUBIC,
41
+ 'lanczos': cv2.INTER_LANCZOS4
42
+ }
43
+ _cv2_interpolation_from_str = {
44
+ v: k
45
+ for k, v in _cv2_interpolation_to_str.items()
46
+ }
47
+
48
+
49
+ class Compose(object):
50
+ """Composes several transforms together.
51
+ Args:
52
+ transforms (list of ``Transform`` objects): list of transforms to compose.
53
+ Example:
54
+ >>> transforms.Compose([
55
+ >>> transforms.CenterCrop(10),
56
+ >>> transforms.ToTensor(),
57
+ >>> ])
58
+ """
59
+ def __init__(self, transforms):
60
+ self.transforms = transforms
61
+
62
+ def __call__(self, img):
63
+ for t in self.transforms:
64
+ img = t(img)
65
+ return img
66
+
67
+ def __repr__(self):
68
+ format_string = self.__class__.__name__ + '('
69
+ for t in self.transforms:
70
+ format_string += '\n'
71
+ format_string += ' {0}'.format(t)
72
+ format_string += '\n)'
73
+ return format_string
74
+
75
+
76
+ class ToTensor(object):
77
+ """Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor.
78
+ Converts a PIL Image or numpy.ndarray (H x W x C) in the range
79
+ [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0].
80
+ """
81
+ def __call__(self, pic):
82
+ """
83
+ Args:
84
+ pic (PIL Image or numpy.ndarray): Image to be converted to tensor.
85
+ Returns:
86
+ Tensor: Converted image.
87
+ """
88
+ return F.to_tensor(pic)
89
+
90
+ def __repr__(self):
91
+ return self.__class__.__name__ + '()'
92
+
93
+
94
+ class Normalize(object):
95
+ """Normalize a tensor image with mean and standard deviation.
96
+ Given mean: ``(M1,...,Mn)`` and std: ``(S1,..,Sn)`` for ``n`` channels, this transform
97
+ will normalize each channel of the input ``torch.*Tensor`` i.e.
98
+ ``input[channel] = (input[channel] - mean[channel]) / std[channel]``
99
+ .. note::
100
+ This transform acts in-place, i.e., it mutates the input tensor.
101
+ Args:
102
+ mean (sequence): Sequence of means for each channel.
103
+ std (sequence): Sequence of standard deviations for each channel.
104
+ """
105
+ def __init__(self, mean, std):
106
+ self.mean = mean
107
+ self.std = std
108
+
109
+ def __call__(self, tensor):
110
+ """
111
+ Args:
112
+ tensor (Tensor): Tensor image of size (C, H, W) to be normalized.
113
+ Returns:
114
+ Tensor: Normalized Tensor image.
115
+ """
116
+ return F.normalize(tensor, self.mean, self.std)
117
+
118
+ def __repr__(self):
119
+ return self.__class__.__name__ + '(mean={0}, std={1})'.format(
120
+ self.mean, self.std)
121
+
122
+
123
+ class Resize(object):
124
+ """Resize the input numpy ndarray to the given size.
125
+ Args:
126
+ size (sequence or int): Desired output size. If size is a sequence like
127
+ (h, w), output size will be matched to this. If size is an int,
128
+ smaller edge of the image will be matched to this number.
129
+ i.e, if height > width, then image will be rescaled to
130
+ (size * height / width, size)
131
+ interpolation (int, optional): Desired interpolation. Default is
132
+ ``cv2.INTER_CUBIC``, bicubic interpolation
133
+ """
134
+
135
+ def __init__(self, size, interpolation=cv2.INTER_LINEAR):
136
+ # assert isinstance(size, int) or (isinstance(size, collections.Iterable) and len(size) == 2)
137
+ if isinstance(size, int):
138
+ self.size = size
139
+ elif isinstance(size, collections.abc.Iterable) and len(size) == 2:
140
+ if type(size) == list:
141
+ size = tuple(size)
142
+ self.size = size
143
+ else:
144
+ raise ValueError('Unknown inputs for size: {}'.format(size))
145
+ self.interpolation = interpolation
146
+
147
+ def __call__(self, img):
148
+ """
149
+ Args:
150
+ img (numpy ndarray): Image to be scaled.
151
+ Returns:
152
+ numpy ndarray: Rescaled image.
153
+ """
154
+ return F.resize(img, self.size, self.interpolation)
155
+
156
+ def __repr__(self):
157
+ interpolate_str = _cv2_interpolation_from_str[self.interpolation]
158
+ return self.__class__.__name__ + '(size={0}, interpolation={1})'.format(
159
+ self.size, interpolate_str)
160
+
161
+
162
+ class Scale(Resize):
163
+ """
164
+ Note: This transform is deprecated in favor of Resize.
165
+ """
166
+ def __init__(self, *args, **kwargs):
167
+ warnings.warn(
168
+ "The use of the transforms.Scale transform is deprecated, " +
169
+ "please use transforms.Resize instead.")
170
+ super(Scale, self).__init__(*args, **kwargs)
171
+
172
+
173
+ class CenterCrop(object):
174
+ """Crops the given numpy ndarray at the center.
175
+ Args:
176
+ size (sequence or int): Desired output size of the crop. If size is an
177
+ int instead of sequence like (h, w), a square crop (size, size) is
178
+ made.
179
+ """
180
+ def __init__(self, size):
181
+ if isinstance(size, numbers.Number):
182
+ self.size = (int(size), int(size))
183
+ else:
184
+ self.size = size
185
+
186
+ def __call__(self, img):
187
+ """
188
+ Args:
189
+ img (numpy ndarray): Image to be cropped.
190
+ Returns:
191
+ numpy ndarray: Cropped image.
192
+ """
193
+ return F.center_crop(img, self.size)
194
+
195
+ def __repr__(self):
196
+ return self.__class__.__name__ + '(size={0})'.format(self.size)
197
+
198
+
199
+ class Pad(object):
200
+ """Pad the given numpy ndarray on all sides with the given "pad" value.
201
+ Args:
202
+ padding (int or tuple): Padding on each border. If a single int is provided this
203
+ is used to pad all borders. If tuple of length 2 is provided this is the padding
204
+ on left/right and top/bottom respectively. If a tuple of length 4 is provided
205
+ this is the padding for the left, top, right and bottom borders
206
+ respectively.
207
+ fill (int or tuple): Pixel fill value for constant fill. Default is 0. If a tuple of
208
+ length 3, it is used to fill R, G, B channels respectively.
209
+ This value is only used when the padding_mode is constant
210
+ padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric.
211
+ Default is constant.
212
+ - constant: pads with a constant value, this value is specified with fill
213
+ - edge: pads with the last value at the edge of the image
214
+ - reflect: pads with reflection of image without repeating the last value on the edge
215
+ For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode
216
+ will result in [3, 2, 1, 2, 3, 4, 3, 2]
217
+ - symmetric: pads with reflection of image repeating the last value on the edge
218
+ For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode
219
+ will result in [2, 1, 1, 2, 3, 4, 4, 3]
220
+ """
221
+ def __init__(self, padding, fill=0, padding_mode='constant'):
222
+ assert isinstance(padding, (numbers.Number, tuple, list))
223
+ assert isinstance(fill, (numbers.Number, str, tuple))
224
+ assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric']
225
+ if isinstance(padding,
226
+ collections.Sequence) and len(padding) not in [2, 4]:
227
+ raise ValueError(
228
+ "Padding must be an int or a 2, or 4 element tuple, not a " +
229
+ "{} element tuple".format(len(padding)))
230
+
231
+ self.padding = padding
232
+ self.fill = fill
233
+ self.padding_mode = padding_mode
234
+
235
+ def __call__(self, img):
236
+ """
237
+ Args:
238
+ img (numpy ndarray): Image to be padded.
239
+ Returns:
240
+ numpy ndarray: Padded image.
241
+ """
242
+ return F.pad(img, self.padding, self.fill, self.padding_mode)
243
+
244
+ def __repr__(self):
245
+ return self.__class__.__name__ + '(padding={0}, fill={1}, padding_mode={2})'.\
246
+ format(self.padding, self.fill, self.padding_mode)
247
+
248
+
249
+ class Lambda(object):
250
+ """Apply a user-defined lambda as a transform.
251
+ Args:
252
+ lambd (function): Lambda/function to be used for transform.
253
+ """
254
+ def __init__(self, lambd):
255
+ assert isinstance(lambd, types.LambdaType)
256
+ self.lambd = lambd
257
+
258
+ def __call__(self, img):
259
+ return self.lambd(img)
260
+
261
+ def __repr__(self):
262
+ return self.__class__.__name__ + '()'
263
+
264
+
265
+ class RandomTransforms(object):
266
+ """Base class for a list of transformations with randomness
267
+ Args:
268
+ transforms (list or tuple): list of transformations
269
+ """
270
+ def __init__(self, transforms):
271
+ assert isinstance(transforms, (list, tuple))
272
+ self.transforms = transforms
273
+
274
+ def __call__(self, *args, **kwargs):
275
+ raise NotImplementedError()
276
+
277
+ def __repr__(self):
278
+ format_string = self.__class__.__name__ + '('
279
+ for t in self.transforms:
280
+ format_string += '\n'
281
+ format_string += ' {0}'.format(t)
282
+ format_string += '\n)'
283
+ return format_string
284
+
285
+
286
+ class RandomApply(RandomTransforms):
287
+ """Apply randomly a list of transformations with a given probability
288
+ Args:
289
+ transforms (list or tuple): list of transformations
290
+ p (float): probability
291
+ """
292
+ def __init__(self, transforms, p=0.5):
293
+ super(RandomApply, self).__init__(transforms)
294
+ self.p = p
295
+
296
+ def __call__(self, img):
297
+ if self.p < random.random():
298
+ return img
299
+ for t in self.transforms:
300
+ img = t(img)
301
+ return img
302
+
303
+ def __repr__(self):
304
+ format_string = self.__class__.__name__ + '('
305
+ format_string += '\n p={}'.format(self.p)
306
+ for t in self.transforms:
307
+ format_string += '\n'
308
+ format_string += ' {0}'.format(t)
309
+ format_string += '\n)'
310
+ return format_string
311
+
312
+
313
+ class RandomOrder(RandomTransforms):
314
+ """Apply a list of transformations in a random order
315
+ """
316
+ def __call__(self, img):
317
+ order = list(range(len(self.transforms)))
318
+ random.shuffle(order)
319
+ for i in order:
320
+ img = self.transforms[i](img)
321
+ return img
322
+
323
+
324
+ class RandomChoice(RandomTransforms):
325
+ """Apply single transformation randomly picked from a list
326
+ """
327
+ def __call__(self, img):
328
+ t = random.choice(self.transforms)
329
+ return t(img)
330
+
331
+
332
+ class RandomCrop(object):
333
+ """Crop the given numpy ndarray at a random location.
334
+ Args:
335
+ size (sequence or int): Desired output size of the crop. If size is an
336
+ int instead of sequence like (h, w), a square crop (size, size) is
337
+ made.
338
+ padding (int or sequence, optional): Optional padding on each border
339
+ of the image. Default is None, i.e no padding. If a sequence of length
340
+ 4 is provided, it is used to pad left, top, right, bottom borders
341
+ respectively. If a sequence of length 2 is provided, it is used to
342
+ pad left/right, top/bottom borders, respectively.
343
+ pad_if_needed (boolean): It will pad the image if smaller than the
344
+ desired size to avoid raising an exception.
345
+ fill: Pixel fill value for constant fill. Default is 0. If a tuple of
346
+ length 3, it is used to fill R, G, B channels respectively.
347
+ This value is only used when the padding_mode is constant
348
+ padding_mode: Type of padding. Should be: constant, edge, reflect or symmetric. Default is constant.
349
+ - constant: pads with a constant value, this value is specified with fill
350
+ - edge: pads with the last value on the edge of the image
351
+ - reflect: pads with reflection of image (without repeating the last value on the edge)
352
+ padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode
353
+ will result in [3, 2, 1, 2, 3, 4, 3, 2]
354
+ - symmetric: pads with reflection of image (repeating the last value on the edge)
355
+ padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode
356
+ will result in [2, 1, 1, 2, 3, 4, 4, 3]
357
+ """
358
+ def __init__(self,
359
+ size,
360
+ padding=None,
361
+ pad_if_needed=False,
362
+ fill=0,
363
+ padding_mode='constant'):
364
+ if isinstance(size, numbers.Number):
365
+ self.size = (int(size), int(size))
366
+ else:
367
+ self.size = size
368
+ self.padding = padding
369
+ self.pad_if_needed = pad_if_needed
370
+ self.fill = fill
371
+ self.padding_mode = padding_mode
372
+
373
+ @staticmethod
374
+ def get_params(img, output_size):
375
+ """Get parameters for ``crop`` for a random crop.
376
+ Args:
377
+ img (numpy ndarray): Image to be cropped.
378
+ output_size (tuple): Expected output size of the crop.
379
+ Returns:
380
+ tuple: params (i, j, h, w) to be passed to ``crop`` for random crop.
381
+ """
382
+ h, w = img.shape[0:2]
383
+ th, tw = output_size
384
+ if w == tw and h == th:
385
+ return 0, 0, h, w
386
+
387
+ i = random.randint(0, h - th)
388
+ j = random.randint(0, w - tw)
389
+ return i, j, th, tw
390
+
391
+ def __call__(self, img):
392
+ """
393
+ Args:
394
+ img (numpy ndarray): Image to be cropped.
395
+ Returns:
396
+ numpy ndarray: Cropped image.
397
+ """
398
+ if self.padding is not None:
399
+ img = F.pad(img, self.padding, self.fill, self.padding_mode)
400
+
401
+ # pad the width if needed
402
+ if self.pad_if_needed and img.shape[1] < self.size[1]:
403
+ img = F.pad(img, (self.size[1] - img.shape[1], 0), self.fill,
404
+ self.padding_mode)
405
+ # pad the height if needed
406
+ if self.pad_if_needed and img.shape[0] < self.size[0]:
407
+ img = F.pad(img, (0, self.size[0] - img.shape[0]), self.fill,
408
+ self.padding_mode)
409
+
410
+ i, j, h, w = self.get_params(img, self.size)
411
+
412
+ return F.crop(img, i, j, h, w)
413
+
414
+ def __repr__(self):
415
+ return self.__class__.__name__ + '(size={0}, padding={1})'.format(
416
+ self.size, self.padding)
417
+
418
+
419
+ class RandomHorizontalFlip(object):
420
+ """Horizontally flip the given PIL Image randomly with a given probability.
421
+ Args:
422
+ p (float): probability of the image being flipped. Default value is 0.5
423
+ """
424
+ def __init__(self, p=0.5):
425
+ self.p = p
426
+
427
+ def __call__(self, img):
428
+ """random
429
+ Args:
430
+ img (numpy ndarray): Image to be flipped.
431
+ Returns:
432
+ numpy ndarray: Randomly flipped image.
433
+ """
434
+ # if random.random() < self.p:
435
+ # print('flip')
436
+ # return F.hflip(img)
437
+ if random.random() < self.p:
438
+ return F.hflip(img)
439
+ return img
440
+
441
+ def __repr__(self):
442
+ return self.__class__.__name__ + '(p={})'.format(self.p)
443
+
444
+
445
+ class RandomVerticalFlip(object):
446
+ """Vertically flip the given PIL Image randomly with a given probability.
447
+ Args:
448
+ p (float): probability of the image being flipped. Default value is 0.5
449
+ """
450
+ def __init__(self, p=0.5):
451
+ self.p = p
452
+
453
+ def __call__(self, img):
454
+ """
455
+ Args:
456
+ img (numpy ndarray): Image to be flipped.
457
+ Returns:
458
+ numpy ndarray: Randomly flipped image.
459
+ """
460
+ if random.random() < self.p:
461
+ return F.vflip(img)
462
+ return img
463
+
464
+ def __repr__(self):
465
+ return self.__class__.__name__ + '(p={})'.format(self.p)
466
+
467
+
468
+ class RandomResizedCrop(object):
469
+ """Crop the given numpy ndarray to random size and aspect ratio.
470
+ A crop of random size (default: of 0.08 to 1.0) of the original size and a random
471
+ aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This crop
472
+ is finally resized to given size.
473
+ This is popularly used to train the Inception networks.
474
+ Args:
475
+ size: expected output size of each edge
476
+ scale: range of size of the origin size cropped
477
+ ratio: range of aspect ratio of the origin aspect ratio cropped
478
+ interpolation: Default: cv2.INTER_CUBIC
479
+ """
480
+ def __init__(self,
481
+ size,
482
+ scale=(0.08, 1.0),
483
+ ratio=(3. / 4., 4. / 3.),
484
+ interpolation=cv2.INTER_LINEAR):
485
+ self.size = (size, size)
486
+ self.interpolation = interpolation
487
+ self.scale = scale
488
+ self.ratio = ratio
489
+
490
+ @staticmethod
491
+ def get_params(img, scale, ratio):
492
+ """Get parameters for ``crop`` for a random sized crop.
493
+ Args:
494
+ img (numpy ndarray): Image to be cropped.
495
+ scale (tuple): range of size of the origin size cropped
496
+ ratio (tuple): range of aspect ratio of the origin aspect ratio cropped
497
+ Returns:
498
+ tuple: params (i, j, h, w) to be passed to ``crop`` for a random
499
+ sized crop.
500
+ """
501
+ for attempt in range(10):
502
+ area = img.shape[0] * img.shape[1]
503
+ target_area = random.uniform(*scale) * area
504
+ aspect_ratio = random.uniform(*ratio)
505
+
506
+ w = int(round(math.sqrt(target_area * aspect_ratio)))
507
+ h = int(round(math.sqrt(target_area / aspect_ratio)))
508
+
509
+ if random.random() < 0.5:
510
+ w, h = h, w
511
+
512
+ if w <= img.shape[1] and h <= img.shape[0]:
513
+ i = random.randint(0, img.shape[0] - h)
514
+ j = random.randint(0, img.shape[1] - w)
515
+ return i, j, h, w
516
+
517
+ # Fallback
518
+ w = min(img.shape[0], img.shape[1])
519
+ i = (img.shape[0] - w) // 2
520
+ j = (img.shape[1] - w) // 2
521
+ return i, j, w, w
522
+
523
+ def __call__(self, img):
524
+ """
525
+ Args:
526
+ img (numpy ndarray): Image to be cropped and resized.
527
+ Returns:
528
+ numpy ndarray: Randomly cropped and resized image.
529
+ """
530
+ i, j, h, w = self.get_params(img, self.scale, self.ratio)
531
+ return F.resized_crop(img, i, j, h, w, self.size, self.interpolation)
532
+
533
+ def __repr__(self):
534
+ interpolate_str = _cv2_interpolation_from_str[self.interpolation]
535
+ format_string = self.__class__.__name__ + '(size={0}'.format(self.size)
536
+ format_string += ', scale={0}'.format(
537
+ tuple(round(s, 4) for s in self.scale))
538
+ format_string += ', ratio={0}'.format(
539
+ tuple(round(r, 4) for r in self.ratio))
540
+ format_string += ', interpolation={0})'.format(interpolate_str)
541
+ return format_string
542
+
543
+
544
+ class RandomSizedCrop(RandomResizedCrop):
545
+ """
546
+ Note: This transform is deprecated in favor of RandomResizedCrop.
547
+ """
548
+ def __init__(self, *args, **kwargs):
549
+ warnings.warn(
550
+ "The use of the transforms.RandomSizedCrop transform is deprecated, "
551
+ + "please use transforms.RandomResizedCrop instead.")
552
+ super(RandomSizedCrop, self).__init__(*args, **kwargs)
553
+
554
+
555
+ class FiveCrop(object):
556
+ """Crop the given numpy ndarray into four corners and the central crop
557
+ .. Note::
558
+ This transform returns a tuple of images and there may be a mismatch in the number of
559
+ inputs and targets your Dataset returns. See below for an example of how to deal with
560
+ this.
561
+ Args:
562
+ size (sequence or int): Desired output size of the crop. If size is an ``int``
563
+ instead of sequence like (h, w), a square crop of size (size, size) is made.
564
+ Example:
565
+ >>> transform = Compose([
566
+ >>> FiveCrop(size), # this is a list of numpy ndarrays
567
+ >>> Lambda(lambda crops: torch.stack([ToTensor()(crop) for crop in crops])) # returns a 4D tensor
568
+ >>> ])
569
+ >>> #In your test loop you can do the following:
570
+ >>> input, target = batch # input is a 5d tensor, target is 2d
571
+ >>> bs, ncrops, c, h, w = input.size()
572
+ >>> result = model(input.view(-1, c, h, w)) # fuse batch size and ncrops
573
+ >>> result_avg = result.view(bs, ncrops, -1).mean(1) # avg over crops
574
+ """
575
+ def __init__(self, size):
576
+ self.size = size
577
+ if isinstance(size, numbers.Number):
578
+ self.size = (int(size), int(size))
579
+ else:
580
+ assert len(
581
+ size
582
+ ) == 2, "Please provide only two dimensions (h, w) for size."
583
+ self.size = size
584
+
585
+ def __call__(self, img):
586
+ return F.five_crop(img, self.size)
587
+
588
+ def __repr__(self):
589
+ return self.__class__.__name__ + '(size={0})'.format(self.size)
590
+
591
+
592
+ class TenCrop(object):
593
+ """Crop the given numpy ndarray into four corners and the central crop plus the flipped version of
594
+ these (horizontal flipping is used by default)
595
+ .. Note::
596
+ This transform returns a tuple of images and there may be a mismatch in the number of
597
+ inputs and targets your Dataset returns. See below for an example of how to deal with
598
+ this.
599
+ Args:
600
+ size (sequence or int): Desired output size of the crop. If size is an
601
+ int instead of sequence like (h, w), a square crop (size, size) is
602
+ made.
603
+ vertical_flip(bool): Use vertical flipping instead of horizontal
604
+ Example:
605
+ >>> transform = Compose([
606
+ >>> TenCrop(size), # this is a list of PIL Images
607
+ >>> Lambda(lambda crops: torch.stack([ToTensor()(crop) for crop in crops])) # returns a 4D tensor
608
+ >>> ])
609
+ >>> #In your test loop you can do the following:
610
+ >>> input, target = batch # input is a 5d tensor, target is 2d
611
+ >>> bs, ncrops, c, h, w = input.size()
612
+ >>> result = model(input.view(-1, c, h, w)) # fuse batch size and ncrops
613
+ >>> result_avg = result.view(bs, ncrops, -1).mean(1) # avg over crops
614
+ """
615
+ def __init__(self, size, vertical_flip=False):
616
+ self.size = size
617
+ if isinstance(size, numbers.Number):
618
+ self.size = (int(size), int(size))
619
+ else:
620
+ assert len(
621
+ size
622
+ ) == 2, "Please provide only two dimensions (h, w) for size."
623
+ self.size = size
624
+ self.vertical_flip = vertical_flip
625
+
626
+ def __call__(self, img):
627
+ return F.ten_crop(img, self.size, self.vertical_flip)
628
+
629
+ def __repr__(self):
630
+ return self.__class__.__name__ + '(size={0}, vertical_flip={1})'.format(
631
+ self.size, self.vertical_flip)
632
+
633
+
634
+ class LinearTransformation(object):
635
+ """Transform a tensor image with a square transformation matrix computed
636
+ offline.
637
+ Given transformation_matrix, will flatten the torch.*Tensor, compute the dot
638
+ product with the transformation matrix and reshape the tensor to its
639
+ original shape.
640
+ Applications:
641
+ - whitening: zero-center the data, compute the data covariance matrix
642
+ [D x D] with np.dot(X.T, X), perform SVD on this matrix and
643
+ pass it as transformation_matrix.
644
+ Args:
645
+ transformation_matrix (Tensor): tensor [D x D], D = C x H x W
646
+ """
647
+ def __init__(self, transformation_matrix):
648
+ if transformation_matrix.size(0) != transformation_matrix.size(1):
649
+ raise ValueError("transformation_matrix should be square. Got " +
650
+ "[{} x {}] rectangular matrix.".format(
651
+ *transformation_matrix.size()))
652
+ self.transformation_matrix = transformation_matrix
653
+
654
+ def __call__(self, tensor):
655
+ """
656
+ Args:
657
+ tensor (Tensor): Tensor image of size (C, H, W) to be whitened.
658
+ Returns:
659
+ Tensor: Transformed image.
660
+ """
661
+ if tensor.size(0) * tensor.size(1) * tensor.size(
662
+ 2) != self.transformation_matrix.size(0):
663
+ raise ValueError(
664
+ "tensor and transformation matrix have incompatible shape." +
665
+ "[{} x {} x {}] != ".format(*tensor.size()) +
666
+ "{}".format(self.transformation_matrix.size(0)))
667
+ flat_tensor = tensor.view(1, -1)
668
+ transformed_tensor = torch.mm(flat_tensor, self.transformation_matrix)
669
+ tensor = transformed_tensor.view(tensor.size())
670
+ return tensor
671
+
672
+ def __repr__(self):
673
+ format_string = self.__class__.__name__ + '('
674
+ format_string += (str(self.transformation_matrix.numpy().tolist()) +
675
+ ')')
676
+ return format_string
677
+
678
+
679
+ class ColorJitter(object):
680
+ """Randomly change the brightness, contrast and saturation of an image.
681
+ Args:
682
+ brightness (float or tuple of float (min, max)): How much to jitter brightness.
683
+ brightness_factor is chosen uniformly from [max(0, 1 - brightness), 1 + brightness]
684
+ or the given [min, max]. Should be non negative numbers.
685
+ contrast (float or tuple of float (min, max)): How much to jitter contrast.
686
+ contrast_factor is chosen uniformly from [max(0, 1 - contrast), 1 + contrast]
687
+ or the given [min, max]. Should be non negative numbers.
688
+ saturation (float or tuple of float (min, max)): How much to jitter saturation.
689
+ saturation_factor is chosen uniformly from [max(0, 1 - saturation), 1 + saturation]
690
+ or the given [min, max]. Should be non negative numbers.
691
+ hue (float or tuple of float (min, max)): How much to jitter hue.
692
+ hue_factor is chosen uniformly from [-hue, hue] or the given [min, max].
693
+ Should have 0<= hue <= 0.5 or -0.5 <= min <= max <= 0.5.
694
+ """
695
+ def __init__(self, brightness=0, contrast=0, saturation=0, hue=0):
696
+ self.brightness = self._check_input(brightness, 'brightness')
697
+ self.contrast = self._check_input(contrast, 'contrast')
698
+ self.saturation = self._check_input(saturation, 'saturation')
699
+ self.hue = self._check_input(hue,
700
+ 'hue',
701
+ center=0,
702
+ bound=(-0.5, 0.5),
703
+ clip_first_on_zero=False)
704
+ if self.saturation is not None:
705
+ warnings.warn(
706
+ 'Saturation jitter enabled. Will slow down loading immensely.')
707
+ if self.hue is not None:
708
+ warnings.warn(
709
+ 'Hue jitter enabled. Will slow down loading immensely.')
710
+
711
+ def _check_input(self,
712
+ value,
713
+ name,
714
+ center=1,
715
+ bound=(0, float('inf')),
716
+ clip_first_on_zero=True):
717
+ if isinstance(value, numbers.Number):
718
+ if value < 0:
719
+ raise ValueError(
720
+ "If {} is a single number, it must be non negative.".
721
+ format(name))
722
+ value = [center - value, center + value]
723
+ if clip_first_on_zero:
724
+ value[0] = max(value[0], 0)
725
+ elif isinstance(value, (tuple, list)) and len(value) == 2:
726
+ if not bound[0] <= value[0] <= value[1] <= bound[1]:
727
+ raise ValueError("{} values should be between {}".format(
728
+ name, bound))
729
+ else:
730
+ raise TypeError(
731
+ "{} should be a single number or a list/tuple with length 2.".
732
+ format(name))
733
+
734
+ # if value is 0 or (1., 1.) for brightness/contrast/saturation
735
+ # or (0., 0.) for hue, do nothing
736
+ if value[0] == value[1] == center:
737
+ value = None
738
+ return value
739
+
740
+ @staticmethod
741
+ def get_params(brightness, contrast, saturation, hue):
742
+ """Get a randomized transform to be applied on image.
743
+ Arguments are same as that of __init__.
744
+ Returns:
745
+ Transform which randomly adjusts brightness, contrast and
746
+ saturation in a random order.
747
+ """
748
+ transforms = []
749
+
750
+ if brightness is not None:
751
+ brightness_factor = random.uniform(brightness[0], brightness[1])
752
+ transforms.append(
753
+ Lambda(
754
+ lambda img: F.adjust_brightness(img, brightness_factor)))
755
+
756
+ if contrast is not None:
757
+ contrast_factor = random.uniform(contrast[0], contrast[1])
758
+ transforms.append(
759
+ Lambda(lambda img: F.adjust_contrast(img, contrast_factor)))
760
+
761
+ if saturation is not None:
762
+ saturation_factor = random.uniform(saturation[0], saturation[1])
763
+ transforms.append(
764
+ Lambda(
765
+ lambda img: F.adjust_saturation(img, saturation_factor)))
766
+
767
+ if hue is not None:
768
+ hue_factor = random.uniform(hue[0], hue[1])
769
+ transforms.append(
770
+ Lambda(lambda img: F.adjust_hue(img, hue_factor)))
771
+
772
+ random.shuffle(transforms)
773
+ transform = Compose(transforms)
774
+
775
+ return transform
776
+
777
+ def __call__(self, img):
778
+ """
779
+ Args:
780
+ img (numpy ndarray): Input image.
781
+ Returns:
782
+ numpy ndarray: Color jittered image.
783
+ """
784
+ transform = self.get_params(self.brightness, self.contrast,
785
+ self.saturation, self.hue)
786
+ return transform(img)
787
+
788
+ def __repr__(self):
789
+ format_string = self.__class__.__name__ + '('
790
+ format_string += 'brightness={0}'.format(self.brightness)
791
+ format_string += ', contrast={0}'.format(self.contrast)
792
+ format_string += ', saturation={0}'.format(self.saturation)
793
+ format_string += ', hue={0})'.format(self.hue)
794
+ return format_string
795
+
796
+
797
+ class RandomRotation(object):
798
+ """Rotate the image by angle.
799
+ Args:
800
+ degrees (sequence or float or int): Range of degrees to select from.
801
+ If degrees is a number instead of sequence like (min, max), the range of degrees
802
+ will be (-degrees, +degrees).
803
+ resample ({cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_LANCZOS4}, optional):
804
+ An optional resampling filter. See `filters`_ for more information.
805
+ If omitted, or if the image has mode "1" or "P", it is set to PIL.Image.NEAREST.
806
+ expand (bool, optional): Optional expansion flag.
807
+ If true, expands the output to make it large enough to hold the entire rotated image.
808
+ If false or omitted, make the output image the same size as the input image.
809
+ Note that the expand flag assumes rotation around the center and no translation.
810
+ center (2-tuple, optional): Optional center of rotation.
811
+ Origin is the upper left corner.
812
+ Default is the center of the image.
813
+ """
814
+ def __init__(self, degrees, resample=False, expand=False, center=None):
815
+ if isinstance(degrees, numbers.Number):
816
+ if degrees < 0:
817
+ raise ValueError(
818
+ "If degrees is a single number, it must be positive.")
819
+ self.degrees = (-degrees, degrees)
820
+ else:
821
+ if len(degrees) != 2:
822
+ raise ValueError(
823
+ "If degrees is a sequence, it must be of len 2.")
824
+ self.degrees = degrees
825
+
826
+ self.resample = resample
827
+ self.expand = expand
828
+ self.center = center
829
+
830
+ @staticmethod
831
+ def get_params(degrees):
832
+ """Get parameters for ``rotate`` for a random rotation.
833
+ Returns:
834
+ sequence: params to be passed to ``rotate`` for random rotation.
835
+ """
836
+ angle = random.uniform(degrees[0], degrees[1])
837
+
838
+ return angle
839
+
840
+ def __call__(self, img):
841
+ """
842
+ img (numpy ndarray): Image to be rotated.
843
+ Returns:
844
+ numpy ndarray: Rotated image.
845
+ """
846
+
847
+ angle = self.get_params(self.degrees)
848
+
849
+ return F.rotate(img, angle, self.resample, self.expand, self.center)
850
+
851
+ def __repr__(self):
852
+ format_string = self.__class__.__name__ + '(degrees={0}'.format(
853
+ self.degrees)
854
+ format_string += ', resample={0}'.format(self.resample)
855
+ format_string += ', expand={0}'.format(self.expand)
856
+ if self.center is not None:
857
+ format_string += ', center={0}'.format(self.center)
858
+ format_string += ')'
859
+ return format_string
860
+
861
+
862
+ class RandomAffine(object):
863
+ """Random affine transformation of the image keeping center invariant
864
+ Args:
865
+ degrees (sequence or float or int): Range of degrees to select from.
866
+ If degrees is a number instead of sequence like (min, max), the range of degrees
867
+ will be (-degrees, +degrees). Set to 0 to deactivate rotations.
868
+ translate (tuple, optional): tuple of maximum absolute fraction for horizontal
869
+ and vertical translations. For example translate=(a, b), then horizontal shift
870
+ is randomly sampled in the range -img_width * a < dx < img_width * a and vertical shift is
871
+ randomly sampled in the range -img_height * b < dy < img_height * b. Will not translate by default.
872
+ scale (tuple, optional): scaling factor interval, e.g (a, b), then scale is
873
+ randomly sampled from the range a <= scale <= b. Will keep original scale by default.
874
+ shear (sequence or float or int, optional): Range of degrees to select from.
875
+ If degrees is a number instead of sequence like (min, max), the range of degrees
876
+ will be (-degrees, +degrees). Will not apply shear by default
877
+ resample ({cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_LANCZOS4}, optional):
878
+ An optional resampling filter. See `filters`_ for more information.
879
+ If omitted, or if the image has mode "1" or "P", it is set to PIL.Image.NEAREST.
880
+ fillcolor (int): Optional fill color for the area outside the transform in the output image.
881
+ """
882
+ def __init__(self,
883
+ degrees,
884
+ translate=None,
885
+ scale=None,
886
+ shear=None,
887
+ interpolation=cv2.INTER_LINEAR,
888
+ fillcolor=0):
889
+ if isinstance(degrees, numbers.Number):
890
+ if degrees < 0:
891
+ raise ValueError(
892
+ "If degrees is a single number, it must be positive.")
893
+ self.degrees = (-degrees, degrees)
894
+ else:
895
+ assert isinstance(degrees, (tuple, list)) and len(degrees) == 2, \
896
+ "degrees should be a list or tuple and it must be of length 2."
897
+ self.degrees = degrees
898
+
899
+ if translate is not None:
900
+ assert isinstance(translate, (tuple, list)) and len(translate) == 2, \
901
+ "translate should be a list or tuple and it must be of length 2."
902
+ for t in translate:
903
+ if not (0.0 <= t <= 1.0):
904
+ raise ValueError(
905
+ "translation values should be between 0 and 1")
906
+ self.translate = translate
907
+
908
+ if scale is not None:
909
+ assert isinstance(scale, (tuple, list)) and len(scale) == 2, \
910
+ "scale should be a list or tuple and it must be of length 2."
911
+ for s in scale:
912
+ if s <= 0:
913
+ raise ValueError("scale values should be positive")
914
+ self.scale = scale
915
+
916
+ if shear is not None:
917
+ if isinstance(shear, numbers.Number):
918
+ if shear < 0:
919
+ raise ValueError(
920
+ "If shear is a single number, it must be positive.")
921
+ self.shear = (-shear, shear)
922
+ else:
923
+ assert isinstance(shear, (tuple, list)) and len(shear) == 2, \
924
+ "shear should be a list or tuple and it must be of length 2."
925
+ self.shear = shear
926
+ else:
927
+ self.shear = shear
928
+
929
+ # self.resample = resample
930
+ self.interpolation = interpolation
931
+ self.fillcolor = fillcolor
932
+
933
+ @staticmethod
934
+ def get_params(degrees, translate, scale_ranges, shears, img_size):
935
+ """Get parameters for affine transformation
936
+ Returns:
937
+ sequence: params to be passed to the affine transformation
938
+ """
939
+ angle = random.uniform(degrees[0], degrees[1])
940
+ if translate is not None:
941
+ max_dx = translate[0] * img_size[0]
942
+ max_dy = translate[1] * img_size[1]
943
+ translations = (np.round(random.uniform(-max_dx, max_dx)),
944
+ np.round(random.uniform(-max_dy, max_dy)))
945
+ else:
946
+ translations = (0, 0)
947
+
948
+ if scale_ranges is not None:
949
+ scale = random.uniform(scale_ranges[0], scale_ranges[1])
950
+ else:
951
+ scale = 1.0
952
+
953
+ if shears is not None:
954
+ shear = random.uniform(shears[0], shears[1])
955
+ else:
956
+ shear = 0.0
957
+
958
+ return angle, translations, scale, shear
959
+
960
+ def __call__(self, img):
961
+ """
962
+ img (numpy ndarray): Image to be transformed.
963
+ Returns:
964
+ numpy ndarray: Affine transformed image.
965
+ """
966
+ ret = self.get_params(self.degrees, self.translate, self.scale,
967
+ self.shear, (img.shape[1], img.shape[0]))
968
+ return F.affine(img,
969
+ *ret,
970
+ interpolation=self.interpolation,
971
+ fillcolor=self.fillcolor)
972
+
973
+ def __repr__(self):
974
+ s = '{name}(degrees={degrees}'
975
+ if self.translate is not None:
976
+ s += ', translate={translate}'
977
+ if self.scale is not None:
978
+ s += ', scale={scale}'
979
+ if self.shear is not None:
980
+ s += ', shear={shear}'
981
+ if self.resample > 0:
982
+ s += ', resample={resample}'
983
+ if self.fillcolor != 0:
984
+ s += ', fillcolor={fillcolor}'
985
+ s += ')'
986
+ d = dict(self.__dict__)
987
+ d['resample'] = _cv2_interpolation_to_str[d['resample']]
988
+ return s.format(name=self.__class__.__name__, **d)
989
+
990
+
991
+ class Grayscale(object):
992
+ """Convert image to grayscale.
993
+ Args:
994
+ num_output_channels (int): (1 or 3) number of channels desired for output image
995
+ Returns:
996
+ numpy ndarray: Grayscale version of the input.
997
+ - If num_output_channels == 1 : returned image is single channel
998
+ - If num_output_channels == 3 : returned image is 3 channel with r == g == b
999
+ """
1000
+ def __init__(self, num_output_channels=1):
1001
+ self.num_output_channels = num_output_channels
1002
+
1003
+ def __call__(self, img):
1004
+ """
1005
+ Args:
1006
+ img (numpy ndarray): Image to be converted to grayscale.
1007
+ Returns:
1008
+ numpy ndarray: Randomly grayscaled image.
1009
+ """
1010
+ return F.to_grayscale(img,
1011
+ num_output_channels=self.num_output_channels)
1012
+
1013
+ def __repr__(self):
1014
+ return self.__class__.__name__ + '(num_output_channels={0})'.format(
1015
+ self.num_output_channels)
1016
+
1017
+
1018
+ class RandomGrayscale(object):
1019
+ """Randomly convert image to grayscale with a probability of p (default 0.1).
1020
+ Args:
1021
+ p (float): probability that image should be converted to grayscale.
1022
+ Returns:
1023
+ numpy ndarray: Grayscale version of the input image with probability p and unchanged
1024
+ with probability (1-p).
1025
+ - If input image is 1 channel: grayscale version is 1 channel
1026
+ - If input image is 3 channel: grayscale version is 3 channel with r == g == b
1027
+ """
1028
+ def __init__(self, p=0.1):
1029
+ self.p = p
1030
+
1031
+ def __call__(self, img):
1032
+ """
1033
+ Args:
1034
+ img (numpy ndarray): Image to be converted to grayscale.
1035
+ Returns:
1036
+ numpy ndarray: Randomly grayscaled image.
1037
+ """
1038
+ num_output_channels = 3
1039
+ if random.random() < self.p:
1040
+ return F.to_grayscale(img, num_output_channels=num_output_channels)
1041
+ return img
1042
+
1043
+ def __repr__(self):
1044
+ return self.__class__.__name__ + '(p={0})'.format(self.p)
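Taken together, these classes are intended as a drop-in for a torchvision transform pipeline, only operating on numpy ndarrays. A hedged end-to-end sketch follows (not part of the commit): `ToTensor` and `Normalize` rely on `to_tensor`/`normalize` from functional.py, which sit outside this excerpt, and the ImageNet mean/std values are only illustrative.

import numpy as np

from opencv_transforms import transforms

train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),        # random crop, resized to 224x224
    transforms.RandomHorizontalFlip(p=0.5),   # flip half of the samples
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),                    # HWC uint8 -> CHW float in [0, 1]
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

img = np.random.randint(0, 256, size=(256, 320, 3), dtype=np.uint8)
tensor = train_transform(img)
print(tensor.shape)  # expected: torch.Size([3, 224, 224])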
dataset/setup.py ADDED
@@ -0,0 +1,23 @@
1
+ import setuptools
2
+
3
+ with open('README.md', 'r') as fh:
4
+ long_description = fh.read()
5
+
6
+ setuptools.setup(
7
+ name='opencv_transforms',
8
+ version='0.0.6',
9
+ author='Jim Bohnslav',
10
+ author_email='[email protected]',
11
+ description='A drop-in replacement for Torchvision Transforms using OpenCV',
12
+ keywords='pytorch image augmentations',
13
+ long_description=long_description,
14
+ long_description_content_type='text/markdown',
15
+ url='https://github.com/jbohnslav/opencv_transforms',
16
+ packages=setuptools.find_packages(),
17
+ classifiers=[
18
+ "Programming Language :: Python :: 3",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Operating System :: OS Independent",
21
+ ],
22
+ python_requires='>=3.6',
23
+ )
dataset/tests/compare_to_pil_for_testing.ipynb ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import glob\n",
10
+ "import numpy as np\n",
11
+ "import random\n",
12
+ "\n",
13
+ "import cv2\n",
14
+ "import matplotlib.pyplot as plt\n",
15
+ "from PIL import Image\n",
16
+ "\n",
17
+ "from torchvision import transforms as pil_transforms\n",
18
+ "from torchvision.transforms import functional as F_pil\n",
19
+ "\n",
20
+ "import sys\n",
21
+ "sys.path.insert(0, '..')\n",
22
+ "from opencv_transforms import transforms\n",
23
+ "from opencv_transforms import functional as F\n",
24
+ "\n",
25
+ "from setup_testing_directory import get_testing_directory"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": null,
31
+ "metadata": {},
32
+ "outputs": [],
33
+ "source": [
34
+ "datadir = get_testing_directory()\n",
35
+ "print(datadir)"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": null,
41
+ "metadata": {},
42
+ "outputs": [],
43
+ "source": [
44
+ "train_images = glob.glob(datadir + '/**/*.JPEG', recursive=True)\n",
45
+ "train_images.sort()\n",
46
+ "print('Number of training images: {:,}'.format(len(train_images)))"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": null,
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "random.seed(1)\n",
56
+ "imfile = random.choice(train_images)"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": null,
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": [
65
+ "def plot_pil_and_opencv(pil_image, opencv_image, orientation='row'):\n",
66
+ " if orientation == 'row':\n",
67
+ " rows, cols = 1,3\n",
68
+ " size = (8, 4)\n",
69
+ " else: \n",
70
+ " rows, cols = 3,1\n",
71
+ " size = (12, 6)\n",
72
+ " fig, axes = plt.subplots(rows, cols,figsize=size)\n",
73
+ " ax = axes[0]\n",
74
+ " ax.imshow(pil_image)\n",
75
+ " ax.set_title('PIL')\n",
76
+ "\n",
77
+ " ax = axes[1]\n",
78
+ " ax.imshow(opencv_image)\n",
79
+ " ax.set_title('opencv')\n",
80
+ "\n",
81
+ " ax = axes[2]\n",
82
+ " l1 = np.abs(pil_image - opencv_image).mean(axis=2)\n",
83
+ " ax.imshow(l1)\n",
84
+ " ax.set_title('| PIL - opencv|\\nMAE:{:.4f}'.format(l1.mean()))\n",
85
+ " plt.tight_layout()\n",
86
+ " plt.show()"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": null,
92
+ "metadata": {},
93
+ "outputs": [],
94
+ "source": [
95
+ "pil_image = Image.open(imfile)\n",
96
+ "image = cv2.cvtColor(cv2.imread(imfile, 1), cv2.COLOR_BGR2RGB)"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": null,
102
+ "metadata": {},
103
+ "outputs": [],
104
+ "source": [
105
+ "plot_pil_and_opencv(pil_image, image)"
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "code",
110
+ "execution_count": null,
111
+ "metadata": {},
112
+ "outputs": [],
113
+ "source": [
114
+ "pil_resized = pil_transforms.Resize((224, 224))(pil_image)"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": null,
120
+ "metadata": {},
121
+ "outputs": [],
122
+ "source": [
123
+ "resized = transforms.Resize(224)(image)"
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "code",
128
+ "execution_count": null,
129
+ "metadata": {},
130
+ "outputs": [],
131
+ "source": [
132
+ "plot_pil_and_opencv(pil_resized, resized)\n",
133
+ "plt.show()"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "execution_count": null,
139
+ "metadata": {},
140
+ "outputs": [],
141
+ "source": [
142
+ "def L1(pil: Image, image: np.ndarray) -> float:\n",
143
+ " return np.mean(np.abs(np.asarray(pil) - image))"
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "code",
148
+ "execution_count": null,
149
+ "metadata": {},
150
+ "outputs": [],
151
+ "source": [
152
+ "TOL = 1e-4\n",
153
+ "\n",
154
+ "l1 = L1(pil_resized, resized)\n",
155
+ "assert l1 - 88.9559 < TOL"
156
+ ]
157
+ },
158
+ {
159
+ "cell_type": "code",
160
+ "execution_count": null,
161
+ "metadata": {},
162
+ "outputs": [],
163
+ "source": [
164
+ "random.seed(1)\n",
165
+ "pil = pil_transforms.RandomRotation(10)(pil_image)\n",
166
+ "random.seed(1)\n",
167
+ "np_img = transforms.RandomRotation(10)(image)"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": null,
173
+ "metadata": {},
174
+ "outputs": [],
175
+ "source": [
176
+ "plot_pil_and_opencv(pil, np_img)"
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "code",
181
+ "execution_count": null,
182
+ "metadata": {},
183
+ "outputs": [],
184
+ "source": [
185
+ "pil = pil_transforms.FiveCrop((224, 224))(pil_image)\n",
186
+ "cv = transforms.FiveCrop((224,224))(image)"
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": null,
192
+ "metadata": {},
193
+ "outputs": [],
194
+ "source": [
195
+ "pil_stacked = np.hstack([np.asarray(i) for i in pil])\n",
196
+ "cv_stacked = np.hstack(cv)\n",
197
+ "\n",
198
+ "plot_pil_and_opencv(pil_stacked, cv_stacked, orientation='col')"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": null,
204
+ "metadata": {},
205
+ "outputs": [],
206
+ "source": [
207
+ "pil_stacked.shape"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "execution_count": null,
213
+ "metadata": {},
214
+ "outputs": [],
215
+ "source": [
216
+ "l1"
217
+ ]
218
+ }
219
+ ],
220
+ "metadata": {
221
+ "kernelspec": {
222
+ "display_name": "opencv_transforms",
223
+ "language": "python",
224
+ "name": "opencv_transforms"
225
+ },
226
+ "language_info": {
227
+ "codemirror_mode": {
228
+ "name": "ipython",
229
+ "version": 3
230
+ },
231
+ "file_extension": ".py",
232
+ "mimetype": "text/x-python",
233
+ "name": "python",
234
+ "nbconvert_exporter": "python",
235
+ "pygments_lexer": "ipython3",
236
+ "version": "3.7.9"
237
+ }
238
+ },
239
+ "nbformat": 4,
240
+ "nbformat_minor": 4
241
+ }
dataset/tests/setup_testing_directory.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ from typing import Union
4
+ import warnings
5
+
6
+
7
+ def get_testing_directory() -> str:
8
+ directory_file = 'testing_directory.txt'
9
+ directory_files = [directory_file, os.path.join('tests', directory_file)]
10
+
11
+ for directory_file in directory_files:
12
+ if os.path.isfile(directory_file):
13
+ with open(directory_file, 'r') as f:
14
+ testing_directory = f.read()
15
+ return testing_directory
16
+ raise ValueError('please run setup_testing_directory.py before attempting to run unit tests')
17
+
18
+
19
+ def setup_testing_directory(datadir: Union[str, os.PathLike], overwrite: bool = False) -> str:
20
+ testing_path_file = 'testing_directory.txt'
21
+
22
+ should_setup = True
23
+ if os.path.isfile(testing_path_file):
24
+ with open(testing_path_file, 'r') as f:
25
+ testing_directory = f.read()
26
+ if not os.path.isdir(testing_directory):
27
+ # saved path is stale: warn, keep should_setup True and redo the setup below
28
+ warnings.warn(
29
+ 'Saved testing directory {} does not exist, re-running setup'.format(testing_directory))
30
+ else:
31
+ should_setup = False
32
+ if not should_setup:
33
+ return testing_directory
34
+
35
+ testing_directory = datadir
36
+ assert os.path.isdir(testing_directory)
37
+ assert os.path.isdir(os.path.join(testing_directory, 'train'))
38
+ assert os.path.isdir(os.path.join(testing_directory, 'val'))
39
+ with open('testing_directory.txt', 'w') as f:
40
+ f.write(testing_directory)
41
+ return testing_directory
42
+
43
+
44
+ if __name__ == '__main__':
45
+ parser = argparse.ArgumentParser('Setting up image directory for opencv transforms testing')
46
+ parser.add_argument('-d', '--datadir', default=os.getcwd(), help='Imagenet directory')
47
+
48
+ args = parser.parse_args()
49
+
50
+ setup_testing_directory(args.datadir)
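A short sketch of how the two helpers above are meant to be used together; the directory path is a placeholder and must contain `train/` and `val/` sub-folders of JPEG images:

```python
# run from dataset/tests so testing_directory.txt lands where the tests look for it
from setup_testing_directory import setup_testing_directory, get_testing_directory

setup_testing_directory('/data/imagenet_subset')   # writes testing_directory.txt
print(get_testing_directory())                     # -> /data/imagenet_subset
```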
dataset/tests/test_color.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import numpy as np
3
+ import random
4
+ from typing import Union
5
+
6
+ import cv2
7
+ import matplotlib.pyplot as plt
8
+ from PIL import Image
9
+ from PIL.Image import Image as PIL_image # for typing
10
+ import pytest
11
+ from torchvision import transforms as pil_transforms
12
+ from torchvision.transforms import functional as F_pil
13
+
14
+ from opencv_transforms import transforms
15
+ from opencv_transforms import functional as F
16
+ from setup_testing_directory import get_testing_directory
17
+
18
+ TOL = 1e-4
19
+
20
+ datadir = get_testing_directory()
21
+ train_images = glob.glob(datadir + '/**/*.JPEG', recursive=True)
22
+ train_images.sort()
23
+ print('Number of training images: {:,}'.format(len(train_images)))
24
+
25
+ random.seed(1)
26
+ imfile = random.choice(train_images)
27
+ pil_image = Image.open(imfile)
28
+ image = cv2.cvtColor(cv2.imread(imfile, 1), cv2.COLOR_BGR2RGB)
29
+
30
+
31
+ class TestContrast:
32
+ @pytest.mark.parametrize('random_seed', [1, 2, 3, 4])
33
+ @pytest.mark.parametrize('contrast_factor', [0.0, 0.5, 1.0, 2.0])
34
+ def test_contrast(self, contrast_factor, random_seed):
35
+ random.seed(random_seed)
36
+ imfile = random.choice(train_images)
37
+ pil_image = Image.open(imfile)
38
+ image = np.array(pil_image).copy()
39
+
40
+ pil_enhanced = F_pil.adjust_contrast(pil_image, contrast_factor)
41
+ np_enhanced = F.adjust_contrast(image, contrast_factor)
42
+ assert np.array_equal(np.array(pil_enhanced), np_enhanced.squeeze())
43
+
44
+ @pytest.mark.parametrize('n_images', [1, 11])
45
+ def test_multichannel_contrast(self, n_images, contrast_factor=0.1):
46
+ imfile = random.choice(train_images)
47
+
48
+ pil_image = Image.open(imfile)
49
+ image = np.array(pil_image).copy()
50
+
51
+ multichannel_image = np.concatenate([image for _ in range(n_images)], axis=-1)
52
+ # this will raise an exception in version 0.0.5
53
+ np_enhanced = F.adjust_contrast(multichannel_image, contrast_factor)
54
+
55
+ @pytest.mark.parametrize('contrast_factor', [0, 0.5, 1.0])
56
+ def test_grayscale_contrast(self, contrast_factor):
57
+ imfile = random.choice(train_images)
58
+
59
+ pil_image = Image.open(imfile)
60
+ image = np.array(pil_image).copy()
61
+ image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
62
+
63
+ # make sure grayscale images work
64
+ pil_image = pil_image.convert('L')
65
+
66
+ pil_enhanced = F_pil.adjust_contrast(pil_image, contrast_factor)
67
+ np_enhanced = F.adjust_contrast(image, contrast_factor)
68
+ assert np.array_equal(np.array(pil_enhanced), np_enhanced.squeeze())
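These tests assume `testing_directory.txt` has already been written by the setup script above. A hedged sketch of invoking just this file programmatically, equivalent to running `pytest test_color.py -q` from `dataset/tests`:

```python
import pytest

# run only the color tests (working directory: dataset/tests)
pytest.main(['test_color.py', '-q'])
```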
dataset/tests/test_spatial.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import numpy as np
3
+ import random
4
+ from typing import Union
5
+
6
+ import cv2
7
+ import matplotlib.pyplot as plt
8
+ from PIL import Image
9
+ from PIL.Image import Image as PIL_image # for typing
10
+
11
+ from torchvision import transforms as pil_transforms
12
+ from torchvision.transforms import functional as F_pil
13
+ from opencv_transforms import transforms
14
+ from opencv_transforms import functional as F
15
+
16
+ from setup_testing_directory import get_testing_directory
17
+ from utils import L1
18
+
19
+ TOL = 1e-4
20
+
21
+ datadir = get_testing_directory()
22
+ train_images = glob.glob(datadir + '/**/*.JPEG', recursive=True)
23
+ train_images.sort()
24
+ print('Number of training images: {:,}'.format(len(train_images)))
25
+
26
+ random.seed(1)
27
+ imfile = random.choice(train_images)
28
+ pil_image = Image.open(imfile)
29
+ image = cv2.cvtColor(cv2.imread(imfile, 1), cv2.COLOR_BGR2RGB)
30
+
31
+
32
+ def test_resize():
33
+ pil_resized = pil_transforms.Resize((224, 224))(pil_image)
34
+ resized = transforms.Resize((224, 224))(image)
35
+ l1 = L1(pil_resized, resized)
36
+ assert l1 - 88.9559 < TOL
37
+
38
+ def test_rotation():
39
+ random.seed(1)
40
+ pil = pil_transforms.RandomRotation(10)(pil_image)
41
+ random.seed(1)
42
+ np_img = transforms.RandomRotation(10)(image)
43
+ l1 = L1(pil, np_img)
44
+ assert l1 - 86.7955 < TOL
45
+
46
+ def test_five_crop():
47
+ pil = pil_transforms.FiveCrop((224, 224))(pil_image)
48
+ cv = transforms.FiveCrop((224, 224))(image)
49
+ pil_stacked = np.hstack([np.asarray(i) for i in pil])
50
+ cv_stacked = np.hstack(cv)
51
+ l1 = L1(pil_stacked, cv_stacked)
52
+ assert l1 - 22.0444 < TOL
dataset/tests/utils.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from typing import Union
2
+
3
+ import numpy as np
4
+ from PIL.Image import Image as PIL_image # for typing
5
+
6
+
7
+ def L1(pil: Union[PIL_image, np.ndarray], np_image: np.ndarray) -> float:
8
+ return np.abs(np.asarray(pil) - np_image).mean()
inference.yaml ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ target: vtdm.vtdm_gen_v01.VideoLDM
3
+ base_learning_rate: 1.0e-05
4
+ params:
5
+ input_key: video
6
+ scale_factor: 0.18215
7
+ log_keys: caption
8
+ num_samples: 25 #frame_rate
9
+ trained_param_keys:
10
+ - diffusion_model.label_emb.0.0.weight
11
+ - .emb_layers.
12
+ - .time_stack.
13
+ en_and_decode_n_samples_a_time: 25 #frame_rate
14
+ disable_first_stage_autocast: true
15
+ denoiser_config:
16
+ target: sgm.modules.diffusionmodules.denoiser.Denoiser
17
+ params:
18
+ scaling_config:
19
+ target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
20
+ network_config:
21
+ target: sgm.modules.diffusionmodules.video_model.VideoUNet
22
+ params:
23
+ adm_in_channels: 768
24
+ num_classes: sequential
25
+ use_checkpoint: true
26
+ in_channels: 8
27
+ out_channels: 4
28
+ model_channels: 320
29
+ attention_resolutions:
30
+ - 4
31
+ - 2
32
+ - 1
33
+ num_res_blocks: 2
34
+ channel_mult:
35
+ - 1
36
+ - 2
37
+ - 4
38
+ - 4
39
+ num_head_channels: 64
40
+ use_linear_in_transformer: true
41
+ transformer_depth: 1
42
+ context_dim: 1024
43
+ spatial_transformer_attn_type: softmax-xformers
44
+ extra_ff_mix_layer: true
45
+ use_spatial_context: true
46
+ merge_strategy: learned_with_images
47
+ video_kernel_size:
48
+ - 3
49
+ - 1
50
+ - 1
51
+ conditioner_config:
52
+ target: sgm.modules.GeneralConditioner
53
+ params:
54
+ emb_models:
55
+ - is_trainable: false
56
+ input_key: cond_frames_without_noise
57
+ ucg_rate: 0.1
58
+ target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
59
+ params:
60
+ n_cond_frames: 1
61
+ n_copies: 1
62
+ open_clip_embedding_config:
63
+ target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
64
+ params:
65
+ version: ckpts/open_clip_pytorch_model.bin
66
+ freeze: true
67
+ - is_trainable: false
68
+ input_key: video
69
+ ucg_rate: 0.0
70
+ target: vtdm.encoders.AesEmbedder
71
+ - is_trainable: false
72
+ input_key: elevation
73
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
74
+ params:
75
+ outdim: 256
76
+ - input_key: cond_frames
77
+ is_trainable: false
78
+ ucg_rate: 0.1
79
+ target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
80
+ params:
81
+ disable_encoder_autocast: true
82
+ n_cond_frames: 1
83
+ n_copies: 25 #frame_rate
84
+ is_ae: true
85
+ encoder_config:
86
+ target: sgm.models.autoencoder.AutoencoderKLModeOnly
87
+ params:
88
+ embed_dim: 4
89
+ monitor: val/rec_loss
90
+ ddconfig:
91
+ attn_type: vanilla-xformers
92
+ double_z: true
93
+ z_channels: 4
94
+ resolution: 256
95
+ in_channels: 3
96
+ out_ch: 3
97
+ ch: 128
98
+ ch_mult:
99
+ - 1
100
+ - 2
101
+ - 4
102
+ - 4
103
+ num_res_blocks: 2
104
+ attn_resolutions: []
105
+ dropout: 0.0
106
+ lossconfig:
107
+ target: torch.nn.Identity
108
+ - input_key: cond_aug
109
+ is_trainable: false
110
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
111
+ params:
112
+ outdim: 256
113
+ first_stage_config:
114
+ target: sgm.models.autoencoder.AutoencoderKL
115
+ params:
116
+ embed_dim: 4
117
+ monitor: val/rec_loss
118
+ ddconfig:
119
+ attn_type: vanilla-xformers
120
+ double_z: true
121
+ z_channels: 4
122
+ resolution: 256
123
+ in_channels: 3
124
+ out_ch: 3
125
+ ch: 128
126
+ ch_mult:
127
+ - 1
128
+ - 2
129
+ - 4
130
+ - 4
131
+ num_res_blocks: 2
132
+ attn_resolutions: []
133
+ dropout: 0.0
134
+ lossconfig:
135
+ target: torch.nn.Identity
136
+ loss_fn_config:
137
+ target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
138
+ params:
139
+ num_frames: 25 #frame_rate
140
+ batch2model_keys:
141
+ - num_video_frames
142
+ - image_only_indicator
143
+ sigma_sampler_config:
144
+ target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling
145
+ params:
146
+ p_mean: 1.0
147
+ p_std: 1.6
148
+ loss_weighting_config:
149
+ target: sgm.modules.diffusionmodules.loss_weighting.VWeighting
150
+ sampler_config:
151
+ target: sgm.modules.diffusionmodules.sampling.LinearMultistepSampler
152
+ params:
153
+ num_steps: 50
154
+ verbose: True
155
+
156
+ discretization_config:
157
+ target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
158
+ params:
159
+ sigma_max: 700.0
160
+
161
+ guider_config:
162
+ target: sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
163
+ params:
164
+ num_frames: 25 #frame_rate
165
+ max_scale: 2.5
166
+ min_scale: 1.0
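This config is consumed by `pipeline.py` further below. A hedged sketch of how it becomes a model, mirroring `MultiViewGenerator.__init__`; `create_model` comes from this repo's `vtdm` package and the checkpoint path is a placeholder:

```python
from vtdm.model import create_model

# instantiate the VideoLDM described by inference.yaml, then load weights onto the GPU
model = create_model('inference.yaml').cpu()
model.init_from_ckpt('ckpts/model.ckpt')   # placeholder checkpoint path
model = model.cuda().half()
```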
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ libopencv-dev
2
+ build-essential
pipeline.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ from PIL import Image
3
+ import numpy as np
4
+ import math
5
+ import random
6
+ import cv2
7
+ from typing import List
8
+
9
+ import torch
10
+ import einops
11
+ from pytorch_lightning import seed_everything
12
+ from transparent_background import Remover
13
+
14
+ from dataset.opencv_transforms.functional import to_tensor, center_crop
15
+ from vtdm.model import create_model
16
+ from vtdm.util import tensor2vid
17
+
18
+ remover = Remover(jit=False)
19
+
20
+ def cv2_to_pil(cv_image: np.ndarray) -> Image.Image:
21
+ return Image.fromarray(cv2.cvtColor(cv_image, cv2.COLOR_BGR2RGB))
22
+
23
+
24
+ def pil_to_cv2(pil_image: Image.Image) -> np.ndarray:
25
+ cv_image = np.array(pil_image)
26
+ cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR)
27
+ return cv_image
28
+
29
+ def prepare_white_image(input_image: Image.Image) -> Image.Image:
30
+ # remove bg
31
+ output = remover.process(input_image, type='rgba')
32
+
33
+ # expand image
34
+ width, height = output.size
35
+ max_side = max(width, height)
36
+ white_image = Image.new('RGBA', (max_side, max_side), (0, 0, 0, 0))
37
+ x_offset = (max_side - width) // 2
38
+ y_offset = (max_side - height) // 2
39
+ white_image.paste(output, (x_offset, y_offset))
40
+
41
+ return white_image
42
+
43
+
44
+ class MultiViewGenerator:
45
+ def __init__(self, checkpoint_path, config_path="inference.yaml"):
46
+ self.models = {}
47
+ denoising_model = create_model(config_path).cpu()
48
+ denoising_model.init_from_ckpt(checkpoint_path)
49
+ denoising_model = denoising_model.cuda().half()
50
+ self.models["denoising_model"] = denoising_model
51
+
52
+ def denoising(self, frames, args):
53
+ with torch.no_grad():
54
+ C, T, H, W = frames.shape
55
+ batch = {"video": frames.unsqueeze(0)}
56
+ batch["elevation"] = (
57
+ torch.Tensor([args["elevation"]]).to(torch.int64).to(frames.device)
58
+ )
59
+ batch["fps_id"] = torch.Tensor([7]).to(torch.int64).to(frames.device)
60
+ batch["motion_bucket_id"] = (
61
+ torch.Tensor([127]).to(torch.int64).to(frames.device)
62
+ )
63
+ batch = self.models["denoising_model"].add_custom_cond(batch, infer=True)
64
+
65
+ with torch.autocast(device_type="cuda", dtype=torch.float16):
66
+ c, uc = self.models[
67
+ "denoising_model"
68
+ ].conditioner.get_unconditional_conditioning(
69
+ batch,
70
+ force_uc_zero_embeddings=["cond_frames", "cond_frames_without_noise"],
71
+ )
72
+
73
+ additional_model_inputs = {
74
+ "image_only_indicator": torch.zeros(2, T).to(
75
+ self.models["denoising_model"].device
76
+ ),
77
+ "num_video_frames": batch["num_video_frames"],
78
+ }
79
+
80
+ def denoiser(input, sigma, c):
81
+ return self.models["denoising_model"].denoiser(
82
+ self.models["denoising_model"].model,
83
+ input,
84
+ sigma,
85
+ c,
86
+ **additional_model_inputs
87
+ )
88
+
89
+ with torch.autocast(device_type="cuda", dtype=torch.float16):
90
+ randn = torch.randn(
91
+ [T, 4, H // 8, W // 8], device=self.models["denoising_model"].device
92
+ )
93
+ samples = self.models["denoising_model"].sampler(denoiser, randn, cond=c, uc=uc)
94
+
95
+ samples = self.models["denoising_model"].decode_first_stage(samples.half())
96
+ samples = einops.rearrange(samples, "(b t) c h w -> b c t h w", t=T)
97
+
98
+ return tensor2vid(samples)
99
+
100
+ def video_pipeline(self, frames, args) -> List[Image.Image]:
101
+ num_iter = args["num_iter"]
102
+ out_list = []
103
+
104
+ for _ in range(num_iter):
105
+ with torch.no_grad():
106
+ results = self.denoising(frames, args)
107
+
108
+ if len(out_list) == 0:
109
+ out_list = out_list + results
110
+ else:
111
+ out_list = out_list + results[1:]
112
+
113
+ img = out_list[-1]
114
+ img = to_tensor(img)
115
+ img = (img - 0.5) * 2.0
116
+ frames[:, 0] = img
117
+
118
+ result = []
119
+
120
+ for i, frame in enumerate(out_list):
121
+ input_image = cv2_to_pil(frame)
122
+ output_image = remover.process(input_image, type='rgba')
123
+ result.append(output_image)
124
+
125
+ return result
126
+
127
+ def process(self, white_image: Image.Image, args) -> List[Image.Image]:
128
+ img = pil_to_cv2(white_image)
129
+ frame_list = [img] * args["clip_size"]
130
+
131
+ h, w = frame_list[0].shape[0:2]
132
+ rate = max(
133
+ args["input_resolution"][0] * 1.0 / h, args["input_resolution"][1] * 1.0 / w
134
+ )
135
+ frame_list = [
136
+ cv2.resize(f, [math.ceil(w * rate), math.ceil(h * rate)]) for f in frame_list
137
+ ]
138
+ frame_list = [
139
+ center_crop(f, [args["input_resolution"][0], args["input_resolution"][1]])
140
+ for f in frame_list
141
+ ]
142
+ frame_list = [cv2.cvtColor(f, cv2.COLOR_BGR2RGB) for f in frame_list]
143
+
144
+ frame_list = [to_tensor(f) for f in frame_list]
145
+ frame_list = [(f - 0.5) * 2.0 for f in frame_list]
146
+ frames = torch.stack(frame_list, 1)
147
+ frames = frames.cuda()
148
+
149
+ self.models["denoising_model"].num_samples = args["clip_size"]
150
+ self.models["denoising_model"].image_size = args["input_resolution"]
151
+
152
+ return self.video_pipeline(frames, args)
153
+
154
+ def infer(self, white_image: Image.Image) -> List[Image.Image]:
155
+ seed = random.randint(0, 65535)
156
+ seed_everything(seed)
157
+
158
+ params = {
159
+ "clip_size": 25,
160
+ "input_resolution": [512, 512],
161
+ "num_iter": 1,
162
+ "aes": 6.0,
163
+ "mv": [0.0, 0.0, 0.0, 10.0],
164
+ "elevation": 0,
165
+ }
166
+
167
+ return self.process(white_image, params)
168
+
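A hedged end-to-end sketch of the classes above; the checkpoint and image paths are placeholders, a CUDA GPU is required, and `infer()` returns the generated views as RGBA PIL images:

```python
from PIL import Image
from pipeline import MultiViewGenerator, prepare_white_image

generator = MultiViewGenerator(checkpoint_path='ckpts/vtdm.ckpt')   # placeholder checkpoint
white_image = prepare_white_image(Image.open('input.png'))          # background removed, padded to square
views = generator.infer(white_image)                                # list of PIL.Image frames
views[0].save('view_000.png')
```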
requirements.txt ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ av
2
+ black==23.7.0
3
+ chardet==5.1.0
4
+ clip @ git+https://github.com/openai/CLIP.git@a1d071733d7111c9c014f024669f959182114e33
5
+ cupy-cuda113
6
+ einops>=0.6.1
7
+ fairscale>=0.4.13
8
+ fire>=0.5.0
9
+ fsspec>=2023.6.0
10
+ invisible-watermark>=0.2.0
11
+ kornia==0.6.9
12
+ matplotlib>=3.7.2
13
+ natsort>=8.4.0
14
+ ninja>=1.11.1
15
+ numpy==1.26.4
16
+ omegaconf>=2.3.0
17
+ open-clip-torch>=2.20.0
18
+ opencv-python==4.6.0.66
19
+ pandas>=2.0.3
20
+ pillow>=9.5.0
21
+ pudb>=2022.1.3
22
+ pytorch-lightning==1.9
23
+ pyyaml>=6.0.1
24
+ transparent_background
25
+ scipy>=1.10.1
26
+ streamlit>=0.73.1
27
+ tensorboardx==2.6
28
+ timm>=0.9.2
29
+ tokenizers==0.12.1
30
+ torch>=2.1.0
31
+ torchaudio>=2.1.0
32
+ torchdata>=0.6.1
33
+ torchmetrics>=1.0.1
34
+ torchvision>=0.16.0
35
+ tqdm>=4.65.0
36
+ transformers==4.19.1
37
+ triton>=2.0.0
38
+ urllib3<1.27,>=1.25.4
39
+ wandb>=0.15.6
40
+ webdataset>=0.2.33
41
+ wheel>=0.41.0
42
+ xformers>=0.0.20
43
+ gradio
44
+ streamlit-keyup==0.2.0
45
+ deepspeed==0.14.5
46
+ test-tube
47
+ -e git+https://github.com/Stability-AI/datapipelines.git@8bce77d147033b3a5285b6d45ee85f33866964fc#egg=sdata
48
+ basicsr
49
+ pillow-heif
sgm/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .models import AutoencodingEngine, DiffusionEngine
2
+ from .util import get_configs_path, instantiate_from_config
3
+
4
+ __version__ = "0.1.0"
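The `target:`/`params:` blocks used throughout `inference.yaml` are resolved by `instantiate_from_config`, re-exported here. A hedged sketch; the choice of block is illustrative only:

```python
from omegaconf import OmegaConf
from sgm.util import instantiate_from_config

cfg = OmegaConf.load('inference.yaml')
# build the sampler object described by model.params.sampler_config
sampler = instantiate_from_config(cfg.model.params.sampler_config)
```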
sgm/data/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .dataset import StableDataModuleFromConfig
sgm/data/dataset.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+
3
+ import torchdata.datapipes.iter
4
+ import webdataset as wds
5
+ from omegaconf import DictConfig
6
+ from pytorch_lightning import LightningDataModule
7
+
8
+ try:
9
+ from sdata import create_dataset, create_dummy_dataset, create_loader
10
+ except ImportError as e:
11
+ print("#" * 100)
12
+ print("Datasets not yet available")
13
+ print("to enable, we need to add stable-datasets as a submodule")
14
+ print("please use ``git submodule update --init --recursive``")
15
+ print("and do ``pip install -e stable-datasets/`` from the root of this repo")
16
+ print("#" * 100)
17
+ exit(1)
18
+
19
+
20
+ class StableDataModuleFromConfig(LightningDataModule):
21
+ def __init__(
22
+ self,
23
+ train: DictConfig,
24
+ validation: Optional[DictConfig] = None,
25
+ test: Optional[DictConfig] = None,
26
+ skip_val_loader: bool = False,
27
+ dummy: bool = False,
28
+ ):
29
+ super().__init__()
30
+ self.train_config = train
31
+ assert (
32
+ "datapipeline" in self.train_config and "loader" in self.train_config
33
+ ), "train config requires the fields `datapipeline` and `loader`"
34
+
35
+ self.val_config = validation
36
+ if not skip_val_loader:
37
+ if self.val_config is not None:
38
+ assert (
39
+ "datapipeline" in self.val_config and "loader" in self.val_config
40
+ ), "validation config requires the fields `datapipeline` and `loader`"
41
+ else:
42
+ print(
43
+ "Warning: No Validation datapipeline defined, using that one from training"
44
+ )
45
+ self.val_config = train
46
+
47
+ self.test_config = test
48
+ if self.test_config is not None:
49
+ assert (
50
+ "datapipeline" in self.test_config and "loader" in self.test_config
51
+ ), "test config requires the fields `datapipeline` and `loader`"
52
+
53
+ self.dummy = dummy
54
+ if self.dummy:
55
+ print("#" * 100)
56
+ print("USING DUMMY DATASET: HOPE YOU'RE DEBUGGING ;)")
57
+ print("#" * 100)
58
+
59
+ def setup(self, stage: str) -> None:
60
+ print("Preparing datasets")
61
+ if self.dummy:
62
+ data_fn = create_dummy_dataset
63
+ else:
64
+ data_fn = create_dataset
65
+
66
+ self.train_datapipeline = data_fn(**self.train_config.datapipeline)
67
+ if self.val_config:
68
+ self.val_datapipeline = data_fn(**self.val_config.datapipeline)
69
+ if self.test_config:
70
+ self.test_datapipeline = data_fn(**self.test_config.datapipeline)
71
+
72
+ def train_dataloader(self) -> torchdata.datapipes.iter.IterDataPipe:
73
+ loader = create_loader(self.train_datapipeline, **self.train_config.loader)
74
+ return loader
75
+
76
+ def val_dataloader(self) -> wds.DataPipeline:
77
+ return create_loader(self.val_datapipeline, **self.val_config.loader)
78
+
79
+ def test_dataloader(self) -> wds.DataPipeline:
80
+ return create_loader(self.test_datapipeline, **self.test_config.loader)
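A hedged sketch of the config shape this data module expects; the keys mirror the asserts above, while the `datapipeline`/`loader` contents depend on the optional `sdata` package and are placeholders:

```python
from omegaconf import OmegaConf
from sgm.data.dataset import StableDataModuleFromConfig

train_cfg = OmegaConf.create({
    'datapipeline': {'urls': ['data/shard-{000000..000009}.tar']},  # placeholder sdata settings
    'loader': {'batch_size': 4, 'num_workers': 2},
})
datamodule = StableDataModuleFromConfig(train=train_cfg, dummy=True)
# datamodule.setup('fit') would then build the (dummy) datapipelines via sdata
```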
sgm/data/video_dataset.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytorch_lightning as pl
2
+ import numpy as np
3
+ import torch
4
+ import PIL
5
+ import os
6
+ import random
7
+ from skimage.io import imread
8
+ import webdataset as wds
9
+ import PIL.Image as Image
10
+ from torch.utils.data import Dataset
11
+ from torch.utils.data.distributed import DistributedSampler
12
+ from pathlib import Path
13
+
14
+ # from ldm.base_utils import read_pickle, pose_inverse
15
+ import torchvision.transforms as transforms
16
+ import torchvision
17
+ from einops import rearrange
18
+
19
+ def add_margin(pil_img, color=0, size=256):
20
+ width, height = pil_img.size
21
+ result = Image.new(pil_img.mode, (size, size), color)
22
+ result.paste(pil_img, ((size - width) // 2, (size - height) // 2))
23
+ return result
24
+
25
+ def prepare_inputs(image_path, elevation_input, crop_size=-1, image_size=256):
26
+ image_input = Image.open(image_path)
27
+
28
+ if crop_size!=-1:
29
+ alpha_np = np.asarray(image_input)[:, :, 3]
30
+ coords = np.stack(np.nonzero(alpha_np), 1)[:, (1, 0)]
31
+ min_x, min_y = np.min(coords, 0)
32
+ max_x, max_y = np.max(coords, 0)
33
+ ref_img_ = image_input.crop((min_x, min_y, max_x, max_y))
34
+ h, w = ref_img_.height, ref_img_.width
35
+ scale = crop_size / max(h, w)
36
+ h_, w_ = int(scale * h), int(scale * w)
37
+ ref_img_ = ref_img_.resize((w_, h_), resample=Image.BICUBIC)
38
+ image_input = add_margin(ref_img_, size=image_size)
39
+ else:
40
+ image_input = add_margin(image_input, size=max(image_input.height, image_input.width))
41
+ image_input = image_input.resize((image_size, image_size), resample=Image.BICUBIC)
42
+
43
+ image_input = np.asarray(image_input)
44
+ image_input = image_input.astype(np.float32) / 255.0
45
+ ref_mask = image_input[:, :, 3:]
46
+ image_input[:, :, :3] = image_input[:, :, :3] * ref_mask + 1 - ref_mask # white background
47
+ image_input = image_input[:, :, :3] * 2.0 - 1.0
48
+ image_input = torch.from_numpy(image_input.astype(np.float32))
49
+ elevation_input = torch.from_numpy(np.asarray([np.deg2rad(elevation_input)], np.float32))
50
+ return {"input_image": image_input, "input_elevation": elevation_input}
51
+
52
+
53
+ class VideoTrainDataset(Dataset):
54
+ def __init__(self, base_folder='/data/yanghaibo/datas/OBJAVERSE-LVIS/images', width=1024, height=576, sample_frames=25):
55
+ """
56
+ Args:
57
+ base_folder (str): directory of per-object frame folders; width / height (int): output frame size.
58
+ sample_frames (int): number of frames per sampled clip; channels is fixed to 3 (RGB).
59
+ """
60
+ # Define the path to the folder containing video frames
61
+ self.base_folder = base_folder
62
+ self.folders = os.listdir(self.base_folder)
63
+ self.num_samples = len(self.folders)
64
+ self.channels = 3
65
+ self.width = width
66
+ self.height = height
67
+ self.sample_frames = sample_frames
68
+ self.elevations = [-10, 0, 10, 20, 30, 40]
69
+
70
+ def __len__(self):
71
+ return self.num_samples
72
+
73
+ def load_im(self, path):
74
+ img = imread(path)
75
+ img = img.astype(np.float32) / 255.0
76
+ mask = img[:,:,3:]
77
+ img[:,:,:3] = img[:,:,:3] * mask + 1 - mask # white background
78
+ img = Image.fromarray(np.uint8(img[:, :, :3] * 255.))
79
+ return img, mask
80
+
81
+ def __getitem__(self, idx):
82
+ """
83
+ Args:
84
+ idx (int): Index of the sample to return.
85
+
86
+ Returns:
87
+ dict: the 'video' tensor of shape (channels, sample_frames, height, width) plus 'elevation', 'caption', 'fps_id' and 'motion_bucket_id'.
88
+ """
89
+ # Randomly select a folder (representing a video) from the base folder
90
+ chosen_folder = random.choice(self.folders)
91
+ folder_path = os.path.join(self.base_folder, chosen_folder)
92
+ frames = os.listdir(folder_path)
93
+ # Sort the frames by name
94
+ frames.sort()
95
+
96
+ # Ensure the selected folder has at least `sample_frames` frames
97
+ if len(frames) < self.sample_frames:
98
+ raise ValueError(
99
+ f"The selected folder '{chosen_folder}' contains fewer than `{self.sample_frames}` frames.")
100
+
101
+ # Randomly select a start index for frame sequence. Fixed elevation
102
+ start_idx = random.randint(0, len(frames) - 1)
103
+ range_id = int(start_idx / 16) # 0, 1, 2, 3, 4, 5
104
+ elevation = self.elevations[range_id]
105
+ selected_frames = []
106
+
107
+ for frame_idx in range(start_idx, (range_id + 1) * 16):
108
+ selected_frames.append(frames[frame_idx])
109
+ for frame_idx in range((range_id) * 16, start_idx):
110
+ selected_frames.append(frames[frame_idx])
111
+
112
+ # Initialize a tensor to store the pixel values
113
+ pixel_values = torch.empty((self.sample_frames, self.channels, self.height, self.width))
114
+
115
+ # Load and process each frame
116
+ for i, frame_name in enumerate(selected_frames):
117
+ frame_path = os.path.join(folder_path, frame_name)
118
+ img, mask = self.load_im(frame_path)
119
+ # Resize the image and convert it to a tensor
120
+ img_resized = img.resize((self.width, self.height))
121
+ img_tensor = torch.from_numpy(np.array(img_resized)).float()
122
+
123
+ # Normalize the image by scaling pixel values to [-1, 1]
124
+ img_normalized = img_tensor / 127.5 - 1
125
+
126
+ # Rearrange channels if necessary
127
+ if self.channels == 3:
128
+ img_normalized = img_normalized.permute(
129
+ 2, 0, 1) # For RGB images
130
+ elif self.channels == 1:
131
+ img_normalized = img_normalized.mean(
132
+ dim=2, keepdim=True) # For grayscale images
133
+
134
+ pixel_values[i] = img_normalized
135
+
136
+ pixel_values = rearrange(pixel_values, 't c h w -> c t h w')
137
+
138
+ caption = chosen_folder + "_" + str(start_idx)
139
+
140
+ return {'video': pixel_values, 'elevation': elevation, 'caption': caption, "fps_id": 7, "motion_bucket_id": 127}
141
+
142
+ class SyncDreamerEvalData(Dataset):
143
+ def __init__(self, image_dir):
144
+ self.image_size = 512
145
+ self.image_dir = Path(image_dir)
146
+ self.crop_size = 20
147
+
148
+ self.fns = []
149
+ for fn in Path(image_dir).iterdir():
150
+ if fn.suffix=='.png':
151
+ self.fns.append(fn)
152
+ print('============= length of dataset %d =============' % len(self.fns))
153
+
154
+ def __len__(self):
155
+ return len(self.fns)
156
+
157
+ def get_data_for_index(self, index):
158
+ input_img_fn = self.fns[index]
159
+ elevation = 0
160
+ return prepare_inputs(input_img_fn, elevation, 512)
161
+
162
+ def __getitem__(self, index):
163
+ return self.get_data_for_index(index)
164
+
165
+ class VideoDataset(pl.LightningDataModule):
166
+ def __init__(self, base_folder, eval_folder, width, height, sample_frames, batch_size, num_workers=4, seed=0, **kwargs):
167
+ super().__init__()
168
+ self.base_folder = base_folder
169
+ self.eval_folder = eval_folder
170
+ self.width = width
171
+ self.height = height
172
+ self.sample_frames = sample_frames
173
+ self.batch_size = batch_size
174
+ self.num_workers = num_workers
175
+ self.seed = seed
176
+ self.additional_args = kwargs
177
+
178
+ def setup(self):
179
+ self.train_dataset = VideoTrainDataset(self.base_folder, self.width, self.height, self.sample_frames)
180
+ self.val_dataset = SyncDreamerEvalData(image_dir=self.eval_folder)
181
+
182
+ def train_dataloader(self):
183
+ sampler = DistributedSampler(self.train_dataset, seed=self.seed)
184
+ return wds.WebLoader(self.train_dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False, sampler=sampler)
185
+
186
+ def val_dataloader(self):
187
+ loader = wds.WebLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)
188
+ return loader
189
+
190
+ def test_dataloader(self):
191
+ return wds.WebLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)
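A hedged sketch of pulling a single training sample from the dataset above; folder paths are placeholders (one sub-folder of at least 25 RGBA frames per object), importing `sgm.data` requires the `sdata` dependency noted in `dataset.py`, and the dataloaders additionally need `torch.distributed` to be initialised because of the `DistributedSampler`:

```python
from sgm.data.video_dataset import VideoDataset

dm = VideoDataset(base_folder='/data/objaverse/images',  # placeholder paths
                  eval_folder='/data/eval_pngs',
                  width=512, height=512, sample_frames=25, batch_size=1)
dm.setup()
sample = dm.train_dataset[0]
print(sample['video'].shape, sample['elevation'], sample['caption'])
# torch.Size([3, 25, 512, 512])  <elevation in degrees>  <folder>_<start_idx>
```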
sgm/data/video_dataset_stage2_degradeImages.py ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytorch_lightning as pl
2
+ import numpy as np
3
+ import torch
4
+ import PIL
5
+ import os
6
+ import random
7
+ from skimage.io import imread
8
+ import webdataset as wds
9
+ import PIL.Image as Image
10
+ from torch.utils.data import Dataset
11
+ from torch.utils.data.distributed import DistributedSampler
12
+ from pathlib import Path
13
+
14
+ # from ldm.base_utils import read_pickle, pose_inverse
15
+ import torchvision.transforms as transforms
16
+ import torchvision
17
+ from einops import rearrange
18
+
19
+ # for the degraded images
20
+ import yaml
21
+ from basicsr.data.degradations import circular_lowpass_kernel, random_mixed_kernels
22
+ import math
23
+
24
+ def add_margin(pil_img, color=0, size=256):
25
+ width, height = pil_img.size
26
+ result = Image.new(pil_img.mode, (size, size), color)
27
+ result.paste(pil_img, ((size - width) // 2, (size - height) // 2))
28
+ return result
29
+
30
+ def prepare_inputs(image_path, elevation_input, crop_size=-1, image_size=256):
31
+ image_input = Image.open(image_path)
32
+
33
+ if crop_size!=-1:
34
+ alpha_np = np.asarray(image_input)[:, :, 3]
35
+ coords = np.stack(np.nonzero(alpha_np), 1)[:, (1, 0)]
36
+ min_x, min_y = np.min(coords, 0)
37
+ max_x, max_y = np.max(coords, 0)
38
+ ref_img_ = image_input.crop((min_x, min_y, max_x, max_y))
39
+ h, w = ref_img_.height, ref_img_.width
40
+ scale = crop_size / max(h, w)
41
+ h_, w_ = int(scale * h), int(scale * w)
42
+ ref_img_ = ref_img_.resize((w_, h_), resample=Image.BICUBIC)
43
+ image_input = add_margin(ref_img_, size=image_size)
44
+ else:
45
+ image_input = add_margin(image_input, size=max(image_input.height, image_input.width))
46
+ image_input = image_input.resize((image_size, image_size), resample=Image.BICUBIC)
47
+
48
+ image_input = np.asarray(image_input)
49
+ image_input = image_input.astype(np.float32) / 255.0
50
+ ref_mask = image_input[:, :, 3:]
51
+ image_input[:, :, :3] = image_input[:, :, :3] * ref_mask + 1 - ref_mask # white background
52
+ image_input = image_input[:, :, :3] * 2.0 - 1.0
53
+ image_input = torch.from_numpy(image_input.astype(np.float32))
54
+ elevation_input = torch.from_numpy(np.asarray([np.deg2rad(elevation_input)], np.float32))
55
+ return {"input_image": image_input, "input_elevation": elevation_input}
56
+
57
+
58
+ class VideoTrainDataset(Dataset):
59
+ def __init__(self, base_folder='/data/yanghaibo/datas/OBJAVERSE-LVIS/images', depth_folder="/mnt/drive2/3d/OBJAVERSE-DEPTH/depth256", width=1024, height=576, sample_frames=25):
60
+ """
61
+ Args:
62
+ base_folder / depth_folder (str): directories of frame and depth folders; width / height (int): output frame size.
63
+ sample_frames (int): number of frames per sampled clip; channels is fixed to 3 (RGB).
64
+ """
65
+ # Define the path to the folder containing video frames
66
+ self.base_folder = base_folder
67
+ self.depth_folder = depth_folder
68
+ # self.folders1 = os.listdir(self.base_folder)
69
+ # self.folders2 = os.listdir(self.depth_folder)
70
+ # self.folders = list(set(self.folders1).intersection(set(self.folders2)))
71
+ self.folders = os.listdir(self.base_folder)
72
+ self.num_samples = len(self.folders)
73
+ self.channels = 3
74
+ self.width = width
75
+ self.height = height
76
+ self.sample_frames = sample_frames
77
+ self.elevations = [-10, 0, 10, 20, 30, 40]
78
+
79
+ # for degraded images
80
+ with open('configs/train_realesrnet_x4plus.yml', mode='r') as f:
81
+ opt = yaml.load(f, Loader=yaml.FullLoader)
82
+ self.opt = opt
83
+ # blur settings for the first degradation
84
+ self.blur_kernel_size = opt['blur_kernel_size']
85
+ self.kernel_list = opt['kernel_list']
86
+ self.kernel_prob = opt['kernel_prob'] # a list for each kernel probability
87
+ self.blur_sigma = opt['blur_sigma']
88
+ self.betag_range = opt['betag_range'] # betag used in generalized Gaussian blur kernels
89
+ self.betap_range = opt['betap_range'] # betap used in plateau blur kernels
90
+ self.sinc_prob = opt['sinc_prob'] # the probability for sinc filters
91
+
92
+ # blur settings for the second degradation
93
+ self.blur_kernel_size2 = opt['blur_kernel_size2']
94
+ self.kernel_list2 = opt['kernel_list2']
95
+ self.kernel_prob2 = opt['kernel_prob2']
96
+ self.blur_sigma2 = opt['blur_sigma2']
97
+ self.betag_range2 = opt['betag_range2']
98
+ self.betap_range2 = opt['betap_range2']
99
+ self.sinc_prob2 = opt['sinc_prob2']
100
+
101
+ # a final sinc filter
102
+ self.final_sinc_prob = opt['final_sinc_prob']
103
+
104
+ self.kernel_range = [2 * v + 1 for v in range(3, 11)] # kernel size ranges from 7 to 21
105
+ # TODO: kernel range is now hard-coded, should be in the configure file
106
+ self.pulse_tensor = torch.zeros(21, 21).float() # convolving with pulse tensor brings no blurry effect
107
+ self.pulse_tensor[10, 10] = 1
108
+
109
+
110
+ def __len__(self):
111
+ return self.num_samples
112
+
113
+ def load_im(self, path):
114
+ img = imread(path)
115
+ img = img.astype(np.float32) / 255.0
116
+ mask = img[:,:,3:]
117
+ img[:,:,:3] = img[:,:,:3] * mask + 1 - mask # white background
118
+ img = Image.fromarray(np.uint8(img[:, :, :3] * 255.))
119
+ return img, mask
120
+
121
+ def __getitem__(self, idx):
122
+ """
123
+ Args:
124
+ idx (int): Index of the sample to return.
125
+
126
+ Returns:
127
+ dict: the 'video' tensor of shape (channels, sample_frames, height, width) plus 'masks', 'elevation', 'caption' and the degradation kernels.
128
+ """
129
+ # Randomly select a folder (representing a video) from the base folder
130
+ chosen_folder = random.choice(self.folders)
131
+ folder_path = os.path.join(self.base_folder, chosen_folder)
132
+ frames = os.listdir(folder_path)
133
+ # Sort the frames by name
134
+ frames.sort()
135
+
136
+ # Ensure the selected folder has at least `sample_frames` frames
137
+ if len(frames) < self.sample_frames:
138
+ raise ValueError(
139
+ f"The selected folder '{chosen_folder}' contains fewer than `{self.sample_frames}` frames.")
140
+
141
+ # Randomly select a start index for frame sequence. Fixed elevation
142
+ start_idx = random.randint(0, len(frames) - 1)
143
+ # start_idx = random.choice([0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92])
144
+ range_id = int(start_idx / 16) # 0, 1, 2, 3, 4, 5
145
+ elevation = self.elevations[range_id]
146
+ selected_frames = []
147
+
148
+ for frame_idx in range(start_idx, (range_id + 1) * 16):
149
+ selected_frames.append(frames[frame_idx])
150
+ for frame_idx in range((range_id) * 16, start_idx):
151
+ selected_frames.append(frames[frame_idx])
152
+
153
+ # Initialize a tensor to store the pixel values
154
+ pixel_values = torch.empty((self.sample_frames, self.channels, self.height, self.width))
155
+ masks = []
156
+
157
+ # Load and process each frame
158
+ for i, frame_name in enumerate(selected_frames):
159
+ frame_path = os.path.join(folder_path, frame_name)
160
+ img, mask = self.load_im(frame_path)
161
+ mask = mask.squeeze(-1)
162
+ masks.append(mask)
163
+ # Resize the image and convert it to a tensor
164
+ img_resized = img.resize((self.width, self.height))
165
+ img_tensor = torch.from_numpy(np.array(img_resized)).float()
166
+
167
+ # Normalize the image by scaling pixel values to [-1, 1]
168
+ img_normalized = img_tensor / 127.5 - 1
169
+
170
+ # Rearrange channels if necessary
171
+ if self.channels == 3:
172
+ img_normalized = img_normalized.permute(
173
+ 2, 0, 1) # For RGB images
174
+ elif self.channels == 1:
175
+ img_normalized = img_normalized.mean(
176
+ dim=2, keepdim=True) # For grayscale images
177
+
178
+ pixel_values[i] = img_normalized
179
+
180
+ pixel_values = rearrange(pixel_values, 't c h w -> c t h w')
181
+ masks = torch.from_numpy(np.array(masks))
182
+ caption = chosen_folder
183
+
184
+ # Generate the kernels used to degrade the images (first/second blur kernels and a final sinc filter)
185
+ # ------------------------ Generate kernels (used in the first degradation) ------------------------ #
186
+ kernels = []
187
+ kernel2s = []
188
+ sinc_kernels = []
189
+ for i in range(16):
190
+ kernel_size = random.choice(self.kernel_range)
191
+ if np.random.uniform() < self.opt['sinc_prob']:
192
+ # this sinc filter setting is for kernels ranging from [7, 21]
193
+ if kernel_size < 13:
194
+ omega_c = np.random.uniform(np.pi / 3, np.pi)
195
+ else:
196
+ omega_c = np.random.uniform(np.pi / 5, np.pi)
197
+ kernel = circular_lowpass_kernel(omega_c, kernel_size, pad_to=False)
198
+ else:
199
+ kernel = random_mixed_kernels(
200
+ self.kernel_list,
201
+ self.kernel_prob,
202
+ kernel_size,
203
+ self.blur_sigma,
204
+ self.blur_sigma, [-math.pi, math.pi],
205
+ self.betag_range,
206
+ self.betap_range,
207
+ noise_range=None)
208
+ # pad kernel
209
+ pad_size = (21 - kernel_size) // 2
210
+ kernel = np.pad(kernel, ((pad_size, pad_size), (pad_size, pad_size)))
211
+ kernels.append(kernel)
212
+
213
+ # ------------------------ Generate kernels (used in the second degradation) ------------------------ #
214
+ kernel_size = random.choice(self.kernel_range)
215
+ if np.random.uniform() < self.opt['sinc_prob2']:
216
+ if kernel_size < 13:
217
+ omega_c = np.random.uniform(np.pi / 3, np.pi)
218
+ else:
219
+ omega_c = np.random.uniform(np.pi / 5, np.pi)
220
+ kernel2 = circular_lowpass_kernel(omega_c, kernel_size, pad_to=False)
221
+ else:
222
+ kernel2 = random_mixed_kernels(
223
+ self.kernel_list2,
224
+ self.kernel_prob2,
225
+ kernel_size,
226
+ self.blur_sigma2,
227
+ self.blur_sigma2, [-math.pi, math.pi],
228
+ self.betag_range2,
229
+ self.betap_range2,
230
+ noise_range=None)
231
+
232
+ # pad kernel
233
+ pad_size = (21 - kernel_size) // 2
234
+ kernel2 = np.pad(kernel2, ((pad_size, pad_size), (pad_size, pad_size)))
235
+ kernel2s.append(kernel2)
236
+
237
+ # ------------------------------------- the final sinc kernel ------------------------------------- #
238
+ if np.random.uniform() < self.opt['final_sinc_prob']:
239
+ kernel_size = random.choice(self.kernel_range)
240
+ omega_c = np.random.uniform(np.pi / 3, np.pi)
241
+ sinc_kernel = circular_lowpass_kernel(omega_c, kernel_size, pad_to=21)
242
+ sinc_kernel = torch.FloatTensor(sinc_kernel)
243
+ else:
244
+ sinc_kernel = self.pulse_tensor
245
+ sinc_kernels.append(sinc_kernel)
246
+ kernels = np.array(kernels)
247
+ kernel2s = np.array(kernel2s)
248
+ sinc_kernels = torch.stack(sinc_kernels, 0)
249
+ kernels = torch.FloatTensor(kernels)
250
+ kernel2s = torch.FloatTensor(kernel2s)
251
+ return {'video': pixel_values, 'masks': masks, 'elevation': elevation, 'caption': caption, 'kernel1s': kernels, 'kernel2s': kernel2s, 'sinc_kernels': sinc_kernels} # (16, 3, 512, 512)-> (3, 16, 512, 512)
252
+
253
+ class SyncDreamerEvalData(Dataset):
254
+ def __init__(self, image_dir):
255
+ self.image_size = 512
256
+ self.image_dir = Path(image_dir)
257
+ self.crop_size = 20
258
+
259
+ self.fns = []
260
+ for fn in Path(image_dir).iterdir():
261
+ if fn.suffix=='.png':
262
+ self.fns.append(fn)
263
+ print('============= length of dataset %d =============' % len(self.fns))
264
+
265
+ def __len__(self):
266
+ return len(self.fns)
267
+
268
+ def get_data_for_index(self, index):
269
+ input_img_fn = self.fns[index]
270
+ elevation = 0
271
+ return prepare_inputs(input_img_fn, elevation, 512)
272
+
273
+ def __getitem__(self, index):
274
+ return self.get_data_for_index(index)
275
+
276
+ class VideoDataset(pl.LightningDataModule):
277
+ def __init__(self, base_folder, depth_folder, eval_folder, width, height, sample_frames, batch_size, num_workers=4, seed=0, **kwargs):
278
+ super().__init__()
279
+ self.base_folder = base_folder
280
+ self.depth_folder = depth_folder
281
+ self.eval_folder = eval_folder
282
+ self.width = width
283
+ self.height = height
284
+ self.sample_frames = sample_frames
285
+ self.batch_size = batch_size
286
+ self.num_workers = num_workers
287
+ self.seed = seed
288
+ self.additional_args = kwargs
289
+
290
+ def setup(self):
291
+ self.train_dataset = VideoTrainDataset(self.base_folder, self.depth_folder, self.width, self.height, self.sample_frames)
292
+ self.val_dataset = SyncDreamerEvalData(image_dir=self.eval_folder)
293
+
294
+ def train_dataloader(self):
295
+ sampler = DistributedSampler(self.train_dataset, seed=self.seed)
296
+ return wds.WebLoader(self.train_dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False, sampler=sampler)
297
+
298
+ def val_dataloader(self):
299
+ loader = wds.WebLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)
300
+ return loader
301
+
302
+ def test_dataloader(self):
303
+ return wds.WebLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)
sgm/inference/api.py ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pathlib
2
+ from dataclasses import asdict, dataclass
3
+ from enum import Enum
4
+ from typing import Optional
5
+
6
+ from omegaconf import OmegaConf
7
+
8
+ from sgm.inference.helpers import (Img2ImgDiscretizationWrapper, do_img2img,
9
+ do_sample)
10
+ from sgm.modules.diffusionmodules.sampling import (DPMPP2MSampler,
11
+ DPMPP2SAncestralSampler,
12
+ EulerAncestralSampler,
13
+ EulerEDMSampler,
14
+ HeunEDMSampler,
15
+ LinearMultistepSampler)
16
+ from sgm.util import load_model_from_config
17
+
18
+
19
+ class ModelArchitecture(str, Enum):
20
+ SD_2_1 = "stable-diffusion-v2-1"
21
+ SD_2_1_768 = "stable-diffusion-v2-1-768"
22
+ SDXL_V0_9_BASE = "stable-diffusion-xl-v0-9-base"
23
+ SDXL_V0_9_REFINER = "stable-diffusion-xl-v0-9-refiner"
24
+ SDXL_V1_BASE = "stable-diffusion-xl-v1-base"
25
+ SDXL_V1_REFINER = "stable-diffusion-xl-v1-refiner"
26
+
27
+
28
+ class Sampler(str, Enum):
29
+ EULER_EDM = "EulerEDMSampler"
30
+ HEUN_EDM = "HeunEDMSampler"
31
+ EULER_ANCESTRAL = "EulerAncestralSampler"
32
+ DPMPP2S_ANCESTRAL = "DPMPP2SAncestralSampler"
33
+ DPMPP2M = "DPMPP2MSampler"
34
+ LINEAR_MULTISTEP = "LinearMultistepSampler"
35
+
36
+
37
+ class Discretization(str, Enum):
38
+ LEGACY_DDPM = "LegacyDDPMDiscretization"
39
+ EDM = "EDMDiscretization"
40
+
41
+
42
+ class Guider(str, Enum):
43
+ VANILLA = "VanillaCFG"
44
+ IDENTITY = "IdentityGuider"
45
+
46
+
47
+ class Thresholder(str, Enum):
48
+ NONE = "None"
49
+
50
+
51
+ @dataclass
52
+ class SamplingParams:
53
+ width: int = 1024
54
+ height: int = 1024
55
+ steps: int = 50
56
+ sampler: Sampler = Sampler.DPMPP2M
57
+ discretization: Discretization = Discretization.LEGACY_DDPM
58
+ guider: Guider = Guider.VANILLA
59
+ thresholder: Thresholder = Thresholder.NONE
60
+ scale: float = 6.0
61
+ aesthetic_score: float = 5.0
62
+ negative_aesthetic_score: float = 5.0
63
+ img2img_strength: float = 1.0
64
+ orig_width: int = 1024
65
+ orig_height: int = 1024
66
+ crop_coords_top: int = 0
67
+ crop_coords_left: int = 0
68
+ sigma_min: float = 0.0292
69
+ sigma_max: float = 14.6146
70
+ rho: float = 3.0
71
+ s_churn: float = 0.0
72
+ s_tmin: float = 0.0
73
+ s_tmax: float = 999.0
74
+ s_noise: float = 1.0
75
+ eta: float = 1.0
76
+ order: int = 4
77
+
78
+
79
+ @dataclass
80
+ class SamplingSpec:
81
+ width: int
82
+ height: int
83
+ channels: int
84
+ factor: int
85
+ is_legacy: bool
86
+ config: str
87
+ ckpt: str
88
+ is_guided: bool
89
+
90
+
91
+ model_specs = {
92
+ ModelArchitecture.SD_2_1: SamplingSpec(
93
+ height=512,
94
+ width=512,
95
+ channels=4,
96
+ factor=8,
97
+ is_legacy=True,
98
+ config="sd_2_1.yaml",
99
+ ckpt="v2-1_512-ema-pruned.safetensors",
100
+ is_guided=True,
101
+ ),
102
+ ModelArchitecture.SD_2_1_768: SamplingSpec(
103
+ height=768,
104
+ width=768,
105
+ channels=4,
106
+ factor=8,
107
+ is_legacy=True,
108
+ config="sd_2_1_768.yaml",
109
+ ckpt="v2-1_768-ema-pruned.safetensors",
110
+ is_guided=True,
111
+ ),
112
+ ModelArchitecture.SDXL_V0_9_BASE: SamplingSpec(
113
+ height=1024,
114
+ width=1024,
115
+ channels=4,
116
+ factor=8,
117
+ is_legacy=False,
118
+ config="sd_xl_base.yaml",
119
+ ckpt="sd_xl_base_0.9.safetensors",
120
+ is_guided=True,
121
+ ),
122
+ ModelArchitecture.SDXL_V0_9_REFINER: SamplingSpec(
123
+ height=1024,
124
+ width=1024,
125
+ channels=4,
126
+ factor=8,
127
+ is_legacy=True,
128
+ config="sd_xl_refiner.yaml",
129
+ ckpt="sd_xl_refiner_0.9.safetensors",
130
+ is_guided=True,
131
+ ),
132
+ ModelArchitecture.SDXL_V1_BASE: SamplingSpec(
133
+ height=1024,
134
+ width=1024,
135
+ channels=4,
136
+ factor=8,
137
+ is_legacy=False,
138
+ config="sd_xl_base.yaml",
139
+ ckpt="sd_xl_base_1.0.safetensors",
140
+ is_guided=True,
141
+ ),
142
+ ModelArchitecture.SDXL_V1_REFINER: SamplingSpec(
143
+ height=1024,
144
+ width=1024,
145
+ channels=4,
146
+ factor=8,
147
+ is_legacy=True,
148
+ config="sd_xl_refiner.yaml",
149
+ ckpt="sd_xl_refiner_1.0.safetensors",
150
+ is_guided=True,
151
+ ),
152
+ }
153
+
154
+
155
+ class SamplingPipeline:
156
+ def __init__(
157
+ self,
158
+ model_id: ModelArchitecture,
159
+ model_path="checkpoints",
160
+ config_path="configs/inference",
161
+ device="cuda",
162
+ use_fp16=True,
163
+ ) -> None:
164
+ if model_id not in model_specs:
165
+ raise ValueError(f"Model {model_id} not supported")
166
+ self.model_id = model_id
167
+ self.specs = model_specs[self.model_id]
168
+ self.config = str(pathlib.Path(config_path, self.specs.config))
169
+ self.ckpt = str(pathlib.Path(model_path, self.specs.ckpt))
170
+ self.device = device
171
+ self.model = self._load_model(device=device, use_fp16=use_fp16)
172
+
173
+ def _load_model(self, device="cuda", use_fp16=True):
174
+ config = OmegaConf.load(self.config)
175
+ model = load_model_from_config(config, self.ckpt)
176
+ if model is None:
177
+ raise ValueError(f"Model {self.model_id} could not be loaded")
178
+ model.to(device)
179
+ if use_fp16:
180
+ model.conditioner.half()
181
+ model.model.half()
182
+ return model
183
+
184
+ def text_to_image(
185
+ self,
186
+ params: SamplingParams,
187
+ prompt: str,
188
+ negative_prompt: str = "",
189
+ samples: int = 1,
190
+ return_latents: bool = False,
191
+ ):
192
+ sampler = get_sampler_config(params)
193
+ value_dict = asdict(params)
194
+ value_dict["prompt"] = prompt
195
+ value_dict["negative_prompt"] = negative_prompt
196
+ value_dict["target_width"] = params.width
197
+ value_dict["target_height"] = params.height
198
+ return do_sample(
199
+ self.model,
200
+ sampler,
201
+ value_dict,
202
+ samples,
203
+ params.height,
204
+ params.width,
205
+ self.specs.channels,
206
+ self.specs.factor,
207
+ force_uc_zero_embeddings=["txt"] if not self.specs.is_legacy else [],
208
+ return_latents=return_latents,
209
+ filter=None,
210
+ )
211
+
212
+ def image_to_image(
213
+ self,
214
+ params: SamplingParams,
215
+ image,
216
+ prompt: str,
217
+ negative_prompt: str = "",
218
+ samples: int = 1,
219
+ return_latents: bool = False,
220
+ ):
221
+ sampler = get_sampler_config(params)
222
+
223
+ if params.img2img_strength < 1.0:
224
+ sampler.discretization = Img2ImgDiscretizationWrapper(
225
+ sampler.discretization,
226
+ strength=params.img2img_strength,
227
+ )
228
+ height, width = image.shape[2], image.shape[3]
229
+ value_dict = asdict(params)
230
+ value_dict["prompt"] = prompt
231
+ value_dict["negative_prompt"] = negative_prompt
232
+ value_dict["target_width"] = width
233
+ value_dict["target_height"] = height
234
+ return do_img2img(
235
+ image,
236
+ self.model,
237
+ sampler,
238
+ value_dict,
239
+ samples,
240
+ force_uc_zero_embeddings=["txt"] if not self.specs.is_legacy else [],
241
+ return_latents=return_latents,
242
+ filter=None,
243
+ )
244
+
245
+ def refiner(
246
+ self,
247
+ params: SamplingParams,
248
+ image,
249
+ prompt: str,
250
+ negative_prompt: Optional[str] = None,
251
+ samples: int = 1,
252
+ return_latents: bool = False,
253
+ ):
254
+ sampler = get_sampler_config(params)
255
+ value_dict = {
256
+ "orig_width": image.shape[3] * 8,
257
+ "orig_height": image.shape[2] * 8,
258
+ "target_width": image.shape[3] * 8,
259
+ "target_height": image.shape[2] * 8,
260
+ "prompt": prompt,
261
+ "negative_prompt": negative_prompt,
262
+ "crop_coords_top": 0,
263
+ "crop_coords_left": 0,
264
+ "aesthetic_score": 6.0,
265
+ "negative_aesthetic_score": 2.5,
266
+ }
267
+
268
+ return do_img2img(
269
+ image,
270
+ self.model,
271
+ sampler,
272
+ value_dict,
273
+ samples,
274
+ skip_encode=True,
275
+ return_latents=return_latents,
276
+ filter=None,
277
+ )
278
+
279
+
280
+ def get_guider_config(params: SamplingParams):
281
+ if params.guider == Guider.IDENTITY:
282
+ guider_config = {
283
+ "target": "sgm.modules.diffusionmodules.guiders.IdentityGuider"
284
+ }
285
+ elif params.guider == Guider.VANILLA:
286
+ scale = params.scale
287
+
288
+ thresholder = params.thresholder
289
+
290
+ if thresholder == Thresholder.NONE:
291
+ dyn_thresh_config = {
292
+ "target": "sgm.modules.diffusionmodules.sampling_utils.NoDynamicThresholding"
293
+ }
294
+ else:
295
+ raise NotImplementedError
296
+
297
+ guider_config = {
298
+ "target": "sgm.modules.diffusionmodules.guiders.VanillaCFG",
299
+ "params": {"scale": scale, "dyn_thresh_config": dyn_thresh_config},
300
+ }
301
+ else:
302
+ raise NotImplementedError
303
+ return guider_config
304
+
305
+
306
+ def get_discretization_config(params: SamplingParams):
307
+ if params.discretization == Discretization.LEGACY_DDPM:
308
+ discretization_config = {
309
+ "target": "sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization",
310
+ }
311
+ elif params.discretization == Discretization.EDM:
312
+ discretization_config = {
313
+ "target": "sgm.modules.diffusionmodules.discretizer.EDMDiscretization",
314
+ "params": {
315
+ "sigma_min": params.sigma_min,
316
+ "sigma_max": params.sigma_max,
317
+ "rho": params.rho,
318
+ },
319
+ }
320
+ else:
321
+ raise ValueError(f"unknown discretization {params.discretization}")
322
+ return discretization_config
323
+
324
+
325
+ def get_sampler_config(params: SamplingParams):
326
+ discretization_config = get_discretization_config(params)
327
+ guider_config = get_guider_config(params)
329
+ if params.sampler == Sampler.EULER_EDM:
330
+ return EulerEDMSampler(
331
+ num_steps=params.steps,
332
+ discretization_config=discretization_config,
333
+ guider_config=guider_config,
334
+ s_churn=params.s_churn,
335
+ s_tmin=params.s_tmin,
336
+ s_tmax=params.s_tmax,
337
+ s_noise=params.s_noise,
338
+ verbose=True,
339
+ )
340
+ if params.sampler == Sampler.HEUN_EDM:
341
+ return HeunEDMSampler(
342
+ num_steps=params.steps,
343
+ discretization_config=discretization_config,
344
+ guider_config=guider_config,
345
+ s_churn=params.s_churn,
346
+ s_tmin=params.s_tmin,
347
+ s_tmax=params.s_tmax,
348
+ s_noise=params.s_noise,
349
+ verbose=True,
350
+ )
351
+ if params.sampler == Sampler.EULER_ANCESTRAL:
352
+ return EulerAncestralSampler(
353
+ num_steps=params.steps,
354
+ discretization_config=discretization_config,
355
+ guider_config=guider_config,
356
+ eta=params.eta,
357
+ s_noise=params.s_noise,
358
+ verbose=True,
359
+ )
360
+ if params.sampler == Sampler.DPMPP2S_ANCESTRAL:
361
+ return DPMPP2SAncestralSampler(
362
+ num_steps=params.steps,
363
+ discretization_config=discretization_config,
364
+ guider_config=guider_config,
365
+ eta=params.eta,
366
+ s_noise=params.s_noise,
367
+ verbose=True,
368
+ )
369
+ if params.sampler == Sampler.DPMPP2M:
370
+ return DPMPP2MSampler(
371
+ num_steps=params.steps,
372
+ discretization_config=discretization_config,
373
+ guider_config=guider_config,
374
+ verbose=True,
375
+ )
376
+ if params.sampler == Sampler.LINEAR_MULTISTEP:
377
+ return LinearMultistepSampler(
378
+ num_steps=params.steps,
379
+ discretization_config=discretization_config,
380
+ guider_config=guider_config,
381
+ order=params.order,
382
+ verbose=True,
383
+ )
384
+
385
+ raise ValueError(f"unknown sampler {params.sampler}!")
sgm/inference/helpers.py ADDED
@@ -0,0 +1,305 @@
1
+ import math
2
+ import os
3
+ from typing import List, Optional, Union
4
+
5
+ import numpy as np
6
+ import torch
7
+ from einops import rearrange
8
+ from imwatermark import WatermarkEncoder
9
+ from omegaconf import ListConfig
10
+ from PIL import Image
11
+ from torch import autocast
12
+
13
+ from sgm.util import append_dims
14
+
15
+
16
+ class WatermarkEmbedder:
17
+ def __init__(self, watermark):
18
+ self.watermark = watermark
19
+ self.num_bits = len(WATERMARK_BITS)
20
+ self.encoder = WatermarkEncoder()
21
+ self.encoder.set_watermark("bits", self.watermark)
22
+
23
+ def __call__(self, image: torch.Tensor) -> torch.Tensor:
24
+ """
25
+ Adds a predefined watermark to the input image
26
+
27
+ Args:
28
+ image: ([N,] B, RGB, H, W) in range [0, 1]
29
+
30
+ Returns:
31
+ same as input but watermarked
32
+ """
33
+ squeeze = len(image.shape) == 4
34
+ if squeeze:
35
+ image = image[None, ...]
36
+ n = image.shape[0]
37
+ image_np = rearrange(
38
+ (255 * image).detach().cpu(), "n b c h w -> (n b) h w c"
39
+ ).numpy()[:, :, :, ::-1]
40
+ # torch (b, c, h, w) in [0, 1] -> numpy (b, h, w, c) [0, 255]
41
+ # the watermarking library expects input in cv2 BGR format
42
+ for k in range(image_np.shape[0]):
43
+ image_np[k] = self.encoder.encode(image_np[k], "dwtDct")
44
+ image = torch.from_numpy(
45
+ rearrange(image_np[:, :, :, ::-1], "(n b) h w c -> n b c h w", n=n)
46
+ ).to(image.device)
47
+ image = torch.clamp(image / 255, min=0.0, max=1.0)
48
+ if squeeze:
49
+ image = image[0]
50
+ return image
51
+
52
+
53
+ # A fixed 48-bit message that was chosen at random
54
+ # WATERMARK_MESSAGE = 0xB3EC907BB19E
55
+ WATERMARK_MESSAGE = 0b101100111110110010010000011110111011000110011110
56
+ # bin(x)[2:] gives bits of x as str, use int to convert them to 0/1
57
+ WATERMARK_BITS = [int(bit) for bit in bin(WATERMARK_MESSAGE)[2:]]
58
+ embed_watermark = WatermarkEmbedder(WATERMARK_BITS)
59
+
60
+
61
+ def get_unique_embedder_keys_from_conditioner(conditioner):
62
+ return list({x.input_key for x in conditioner.embedders})
63
+
64
+
65
+ def perform_save_locally(save_path, samples):
66
+ os.makedirs(os.path.join(save_path), exist_ok=True)
67
+ base_count = len(os.listdir(os.path.join(save_path)))
68
+ samples = embed_watermark(samples)
69
+ for sample in samples:
70
+ sample = 255.0 * rearrange(sample.cpu().numpy(), "c h w -> h w c")
71
+ Image.fromarray(sample.astype(np.uint8)).save(
72
+ os.path.join(save_path, f"{base_count:09}.png")
73
+ )
74
+ base_count += 1
75
+
76
+
77
+ class Img2ImgDiscretizationWrapper:
78
+ """
79
+ Wraps a discretization and prunes the sigmas according to the img2img strength.
80
+ params:
81
+ strength: float between 0.0 and 1.0. 1.0 means full sampling (all sigmas are returned)
82
+ """
83
+
84
+ def __init__(self, discretization, strength: float = 1.0):
85
+ self.discretization = discretization
86
+ self.strength = strength
87
+ assert 0.0 <= self.strength <= 1.0
88
+
89
+ def __call__(self, *args, **kwargs):
90
+ # sigmas start at the largest value and decrease monotonically
91
+ sigmas = self.discretization(*args, **kwargs)
92
+ print(f"sigmas after discretization, before pruning img2img: ", sigmas)
93
+ sigmas = torch.flip(sigmas, (0,))
94
+ prune_index = max(int(self.strength * len(sigmas)), 1)
+ sigmas = sigmas[:prune_index]
95
+ print("prune index:", max(int(self.strength * len(sigmas)), 1))
96
+ sigmas = torch.flip(sigmas, (0,))
97
+ print(f"sigmas after pruning: ", sigmas)
98
+ return sigmas
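A small self-contained check of the pruning arithmetic above; the 40-sigma linear schedule is a stand-in for illustration, not what the discretizations in this repository actually return.

import torch

sigmas = torch.linspace(14.6146, 0.0292, steps=40)  # stand-in schedule, largest sigma first
strength = 0.75
keep = max(int(strength * len(sigmas)), 1)           # 30 of the 40 sigmas survive
pruned = torch.flip(torch.flip(sigmas, (0,))[:keep], (0,))  # same flip-slice-flip as __call__
assert len(pruned) == keep
assert pruned[0] < sigmas[0]  # img2img starts below the maximum noise level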
99
+
100
+
101
+ def do_sample(
102
+ model,
103
+ sampler,
104
+ value_dict,
105
+ num_samples,
106
+ H,
107
+ W,
108
+ C,
109
+ F,
110
+ force_uc_zero_embeddings: Optional[List] = None,
111
+ batch2model_input: Optional[List] = None,
112
+ return_latents=False,
113
+ filter=None,
114
+ device="cuda",
115
+ ):
116
+ if force_uc_zero_embeddings is None:
117
+ force_uc_zero_embeddings = []
118
+ if batch2model_input is None:
119
+ batch2model_input = []
120
+
121
+ with torch.no_grad():
122
+ with autocast(device) as precision_scope:
123
+ with model.ema_scope():
124
+ num_samples = [num_samples]
125
+ batch, batch_uc = get_batch(
126
+ get_unique_embedder_keys_from_conditioner(model.conditioner),
127
+ value_dict,
128
+ num_samples,
129
+ )
130
+ for key in batch:
131
+ if isinstance(batch[key], torch.Tensor):
132
+ print(key, batch[key].shape)
133
+ elif isinstance(batch[key], list):
134
+ print(key, [len(l) for l in batch[key]])
135
+ else:
136
+ print(key, batch[key])
137
+ c, uc = model.conditioner.get_unconditional_conditioning(
138
+ batch,
139
+ batch_uc=batch_uc,
140
+ force_uc_zero_embeddings=force_uc_zero_embeddings,
141
+ )
142
+
143
+ for k in c:
144
+ if k != "crossattn":
145
+ c[k], uc[k] = map(
146
+ lambda y: y[k][: math.prod(num_samples)].to(device), (c, uc)
147
+ )
148
+
149
+ additional_model_inputs = {}
150
+ for k in batch2model_input:
151
+ additional_model_inputs[k] = batch[k]
152
+
153
+ shape = (math.prod(num_samples), C, H // F, W // F)
154
+ randn = torch.randn(shape).to(device)
155
+
156
+ def denoiser(input, sigma, c):
157
+ return model.denoiser(
158
+ model.model, input, sigma, c, **additional_model_inputs
159
+ )
160
+
161
+ samples_z = sampler(denoiser, randn, cond=c, uc=uc)
162
+ samples_x = model.decode_first_stage(samples_z)
163
+ samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
164
+
165
+ if filter is not None:
166
+ samples = filter(samples)
167
+
168
+ if return_latents:
169
+ return samples, samples_z
170
+ return samples
171
+
172
+
173
+ def get_batch(keys, value_dict, N: Union[List, ListConfig], device="cuda"):
174
+ # Hardcoded demo setups; might undergo some changes in the future
175
+
176
+ batch = {}
177
+ batch_uc = {}
178
+
179
+ for key in keys:
180
+ if key == "txt":
181
+ batch["txt"] = (
182
+ np.repeat([value_dict["prompt"]], repeats=math.prod(N))
183
+ .reshape(N)
184
+ .tolist()
185
+ )
186
+ batch_uc["txt"] = (
187
+ np.repeat([value_dict["negative_prompt"]], repeats=math.prod(N))
188
+ .reshape(N)
189
+ .tolist()
190
+ )
191
+ elif key == "original_size_as_tuple":
192
+ batch["original_size_as_tuple"] = (
193
+ torch.tensor([value_dict["orig_height"], value_dict["orig_width"]])
194
+ .to(device)
195
+ .repeat(*N, 1)
196
+ )
197
+ elif key == "crop_coords_top_left":
198
+ batch["crop_coords_top_left"] = (
199
+ torch.tensor(
200
+ [value_dict["crop_coords_top"], value_dict["crop_coords_left"]]
201
+ )
202
+ .to(device)
203
+ .repeat(*N, 1)
204
+ )
205
+ elif key == "aesthetic_score":
206
+ batch["aesthetic_score"] = (
207
+ torch.tensor([value_dict["aesthetic_score"]]).to(device).repeat(*N, 1)
208
+ )
209
+ batch_uc["aesthetic_score"] = (
210
+ torch.tensor([value_dict["negative_aesthetic_score"]])
211
+ .to(device)
212
+ .repeat(*N, 1)
213
+ )
214
+
215
+ elif key == "target_size_as_tuple":
216
+ batch["target_size_as_tuple"] = (
217
+ torch.tensor([value_dict["target_height"], value_dict["target_width"]])
218
+ .to(device)
219
+ .repeat(*N, 1)
220
+ )
221
+ else:
222
+ batch[key] = value_dict[key]
223
+
224
+ for key in batch.keys():
225
+ if key not in batch_uc and isinstance(batch[key], torch.Tensor):
226
+ batch_uc[key] = torch.clone(batch[key])
227
+ return batch, batch_uc
228
+
229
+
230
+ def get_input_image_tensor(image: Image.Image, device="cuda"):
231
+ w, h = image.size
232
+ print(f"loaded input image of size ({w}, {h})")
233
+ width, height = map(
234
+ lambda x: x - x % 64, (w, h)
235
+ ) # resize to integer multiple of 64
236
+ image = image.resize((width, height))
237
+ image_array = np.array(image.convert("RGB"))
238
+ image_array = image_array[None].transpose(0, 3, 1, 2)
239
+ image_tensor = torch.from_numpy(image_array).to(dtype=torch.float32) / 127.5 - 1.0
240
+ return image_tensor.to(device)
241
+
242
+
243
+ def do_img2img(
244
+ img,
245
+ model,
246
+ sampler,
247
+ value_dict,
248
+ num_samples,
249
+ force_uc_zero_embeddings=[],
250
+ additional_kwargs={},
251
+ offset_noise_level: float = 0.0,
252
+ return_latents=False,
253
+ skip_encode=False,
254
+ filter=None,
255
+ device="cuda",
256
+ ):
257
+ with torch.no_grad():
258
+ with autocast(device) as precision_scope:
259
+ with model.ema_scope():
260
+ batch, batch_uc = get_batch(
261
+ get_unique_embedder_keys_from_conditioner(model.conditioner),
262
+ value_dict,
263
+ [num_samples],
264
+ )
265
+ c, uc = model.conditioner.get_unconditional_conditioning(
266
+ batch,
267
+ batch_uc=batch_uc,
268
+ force_uc_zero_embeddings=force_uc_zero_embeddings,
269
+ )
270
+
271
+ for k in c:
272
+ c[k], uc[k] = map(lambda y: y[k][:num_samples].to(device), (c, uc))
273
+
274
+ for k in additional_kwargs:
275
+ c[k] = uc[k] = additional_kwargs[k]
276
+ if skip_encode:
277
+ z = img
278
+ else:
279
+ z = model.encode_first_stage(img)
280
+ noise = torch.randn_like(z)
281
+ sigmas = sampler.discretization(sampler.num_steps)
282
+ sigma = sigmas[0].to(z.device)
283
+
284
+ if offset_noise_level > 0.0:
285
+ noise = noise + offset_noise_level * append_dims(
286
+ torch.randn(z.shape[0], device=z.device), z.ndim
287
+ )
288
+ noised_z = z + noise * append_dims(sigma, z.ndim)
289
+ noised_z = noised_z / torch.sqrt(
290
+ 1.0 + sigmas[0] ** 2.0
291
+ ) # Note: hardcoded to DDPM-like scaling. need to generalize later.
292
+
293
+ def denoiser(x, sigma, c):
294
+ return model.denoiser(model.model, x, sigma, c)
295
+
296
+ samples_z = sampler(denoiser, noised_z, cond=c, uc=uc)
297
+ samples_x = model.decode_first_stage(samples_z)
298
+ samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
299
+
300
+ if filter is not None:
301
+ samples = filter(samples)
302
+
303
+ if return_latents:
304
+ return samples, samples_z
305
+ return samples
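To close the loop, a minimal img2img sketch combining the helpers in this file with the SamplingPipeline from sgm/inference/api.py. It assumes a local input.png, the default SDXL checkpoint paths, and that SamplingParams exposes the img2img_strength field that image_to_image reads; the prompt is illustrative.

from PIL import Image

from sgm.inference.api import ModelArchitecture, SamplingParams, SamplingPipeline
from sgm.inference.helpers import get_input_image_tensor

pipeline = SamplingPipeline(ModelArchitecture.SDXL_V1_BASE)

# Resizes to a multiple of 64, rescales to [-1, 1], and moves the tensor to the default device.
image = get_input_image_tensor(Image.open("input.png"))

# strength=0.75 keeps roughly three quarters of the sigma schedule (see Img2ImgDiscretizationWrapper).
params = SamplingParams(img2img_strength=0.75)

samples = pipeline.image_to_image(
    params,
    image,
    prompt="the same scene as an oil painting",
)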