guangkaixu committed
Commit 10e02f0 · 1 Parent(s): c83d507
app.py CHANGED
@@ -34,14 +34,17 @@ from PIL import Image
34
 
35
  from gradio_imageslider import ImageSlider
36
  from gradio_patches.examples import Examples
37
- from pipeline_genpercept import GenPerceptPipeline
38
 
39
  from diffusers import (
40
  DiffusionPipeline,
41
- UNet2DConditionModel,
42
  AutoencoderKL,
43
  )
44
 
45
  warnings.filterwarnings(
46
  "ignore", message=".*LoginButton created outside of a Blocks context.*"
47
  )
@@ -194,11 +197,13 @@ def process_matting(
194
  )
195
 
196
 
197
- def run_demo_server(pipe_depth, pipe_normal, pipe_dis, pipe_matting):
198
  process_pipe_depth = spaces.GPU(functools.partial(process_depth, pipe_depth))
199
  process_pipe_normal = spaces.GPU(functools.partial(process_normal, pipe_normal))
200
  process_pipe_dis = spaces.GPU(functools.partial(process_dis, pipe_dis))
201
  process_pipe_matting = spaces.GPU(functools.partial(process_matting, pipe_matting))
202
  gradio_theme = gr.themes.Default()
203
 
204
  with gr.Blocks(
@@ -485,7 +490,7 @@ def run_demo_server(pipe_depth, pipe_normal, pipe_dis, pipe_matting):
485
  example_folder = os.path.join(os.path.dirname(__file__), "matting_images")
486
  # print(example_folder)
487
  Examples(
488
- fn=process_pipe_dis,
489
  examples=[
490
  os.path.join(example_folder, name)
491
  for name in filenames
@@ -496,6 +501,120 @@ def run_demo_server(pipe_depth, pipe_normal, pipe_dis, pipe_matting):
496
  directory_name="images_cache",
497
  cache_examples=False,
498
  )
499
 
500
 
501
  ### Image tab
@@ -630,6 +749,72 @@ def run_demo_server(pipe_depth, pipe_normal, pipe_dis, pipe_matting):
630
  ],
631
  queue=False,
632
  )
633
  ### Server launch
634
 
635
  demo.queue(
@@ -645,37 +830,61 @@ def main():
645
 
646
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
647
 
648
- dtype = torch.float16
649
-
650
- vae = AutoencoderKL.from_pretrained("guangkaixu/GenPercept", subfolder='vae').to(dtype)
651
- unet_depth_v1 = UNet2DConditionModel.from_pretrained(
652
- 'guangkaixu/genpercept-depth',
653
- subfolder="unet",
654
- use_safetensors=True).to(dtype)
655
- unet_normal_v1 = UNet2DConditionModel.from_pretrained('guangkaixu/GenPercept', subfolder="unet_normal_v1", use_safetensors=True).to(dtype)
656
- unet_dis_v1 = UNet2DConditionModel.from_pretrained('guangkaixu/GenPercept', subfolder="unet_dis_v1", use_safetensors=True).to(dtype)
657
- unet_matting_v1 = UNet2DConditionModel.from_pretrained('guangkaixu/genpercept-matting', subfolder="unet", use_safetensors=True).to(dtype)
658
-
659
- empty_text_embed = torch.from_numpy(np.load("./empty_text_embed.npy")).to(device, dtype)[None] # [1, 77, 1024]
660
 
661
- pipe_depth = GenPerceptPipeline(vae=vae,
662
- unet=unet_depth_v1,
663
- empty_text_embed=empty_text_embed)
664
- pipe_normal = GenPerceptPipeline(vae=vae,
665
- unet=unet_normal_v1,
666
- empty_text_embed=empty_text_embed)
667
- pipe_dis = GenPerceptPipeline(vae=vae,
668
- unet=unet_dis_v1,
669
- empty_text_embed=empty_text_embed)
670
- pipe_matting = GenPerceptPipeline(vae=vae,
671
- unet=unet_matting_v1,
672
- empty_text_embed=empty_text_embed)
673
  try:
674
  import xformers
675
  pipe_depth.enable_xformers_memory_efficient_attention()
676
  pipe_normal.enable_xformers_memory_efficient_attention()
677
  pipe_dis.enable_xformers_memory_efficient_attention()
678
  pipe_matting.enable_xformers_memory_efficient_attention()
679
  except:
680
  pass # run without xformers
681
 
@@ -683,8 +892,10 @@ def main():
683
  pipe_normal = pipe_normal.to(device)
684
  pipe_dis = pipe_dis.to(device)
685
  pipe_matting = pipe_matting.to(device)
 
 
686
 
687
- run_demo_server(pipe_depth, pipe_normal, pipe_dis, pipe_matting)
688
 
689
 
690
  if __name__ == "__main__":
 
34
 
35
  from gradio_imageslider import ImageSlider
36
  from gradio_patches.examples import Examples
37
+ from genpercept.genpercept_pipeline import GenPerceptPipeline
38
 
39
  from diffusers import (
40
  DiffusionPipeline,
41
+ # UNet2DConditionModel,
42
  AutoencoderKL,
43
  )
44
 
45
+ from genpercept.models.custom_unet import CustomUNet2DConditionModel
46
+ from genpercept.customized_modules.ddim import DDIMSchedulerCustomized
47
+
48
  warnings.filterwarnings(
49
  "ignore", message=".*LoginButton created outside of a Blocks context.*"
50
  )
 
197
  )
198
 
199
 
200
+ def run_demo_server(pipe_depth, pipe_normal, pipe_dis, pipe_matting, pipe_seg, pipe_disparity):
201
  process_pipe_depth = spaces.GPU(functools.partial(process_depth, pipe_depth))
202
  process_pipe_normal = spaces.GPU(functools.partial(process_normal, pipe_normal))
203
  process_pipe_dis = spaces.GPU(functools.partial(process_dis, pipe_dis))
204
  process_pipe_matting = spaces.GPU(functools.partial(process_matting, pipe_matting))
205
+ process_pipe_seg = spaces.GPU(functools.partial(process_matting, pipe_seg))
206
+ process_pipe_disparity = spaces.GPU(functools.partial(process_matting, pipe_disparity))
207
  gradio_theme = gr.themes.Default()
208
 
209
  with gr.Blocks(
 
490
  example_folder = os.path.join(os.path.dirname(__file__), "matting_images")
491
  # print(example_folder)
492
  Examples(
493
+ fn=process_pipe_matting,
494
  examples=[
495
  os.path.join(example_folder, name)
496
  for name in filenames
 
501
  directory_name="images_cache",
502
  cache_examples=False,
503
  )
504
+
505
+ with gr.Tab("Seg"):
506
+ with gr.Row():
507
+ with gr.Column():
508
+ seg_image_input = gr.Image(
509
+ label="Input Image",
510
+ type="filepath",
511
+ # type="pil",
512
+ )
513
+ with gr.Row():
514
+ seg_image_submit_btn = gr.Button(
515
+ value="Estimate Segmentation", variant="primary"
516
+ )
517
+ seg_image_reset_btn = gr.Button(value="Reset")
518
+ with gr.Accordion("Advanced options", open=False):
519
+ image_processing_res = gr.Radio(
520
+ [
521
+ ("Native", 0),
522
+ ("Recommended", 768),
523
+ ],
524
+ label="Processing resolution",
525
+ value=default_image_processing_res,
526
+ )
527
+ with gr.Column():
528
+ seg_image_output_slider = ImageSlider(
529
+ label="Predicted segmentation results",
530
+ type="filepath",
531
+ show_download_button=True,
532
+ show_share_button=True,
533
+ interactive=False,
534
+ elem_classes="slider",
535
+ position=0.25,
536
+ )
537
+ seg_image_output_files = gr.Files(
538
+ label="Seg outputs",
539
+ elem_id="download",
540
+ interactive=False,
541
+ )
542
+
543
+ filenames = []
544
+ filenames.extend(["seg_anime_%d.jpg" %(i+1) for i in range(7)])
545
+ filenames.extend(["seg_line_%d.jpg" %(i+1) for i in range(6)])
546
+ filenames.extend(["seg_real_%d.jpg" %(i+1) for i in range(24)])
547
+
548
+ example_folder = os.path.join(os.path.dirname(__file__), "seg_images")
549
+ Examples(
550
+ fn=process_pipe_seg,
551
+ examples=[
552
+ os.path.join(example_folder, name)
553
+ for name in filenames
554
+ ],
555
+ inputs=[seg_image_input],
556
+ outputs=[seg_image_output_slider, seg_image_output_files],
557
+ cache_examples=False,
558
+ # directory_name="examples_depth",
559
+ # cache_examples=False,
560
+ )
561
+
562
+ with gr.Tab("Disparity"):
563
+ with gr.Row():
564
+ with gr.Column():
565
+ disparity_image_input = gr.Image(
566
+ label="Input Image",
567
+ type="filepath",
568
+ # type="pil",
569
+ )
570
+ with gr.Row():
571
+ disparity_image_submit_btn = gr.Button(
572
+ value="Estimate Disparity", variant="primary"
573
+ )
574
+ disparity_image_reset_btn = gr.Button(value="Reset")
575
+ with gr.Accordion("Advanced options", open=False):
576
+ image_processing_res = gr.Radio(
577
+ [
578
+ ("Native", 0),
579
+ ("Recommended", 768),
580
+ ],
581
+ label="Processing resolution",
582
+ value=default_image_processing_res,
583
+ )
584
+ with gr.Column():
585
+ disparity_image_output_slider = ImageSlider(
586
+ label="Predicted disparity results",
587
+ type="filepath",
588
+ show_download_button=True,
589
+ show_share_button=True,
590
+ interactive=False,
591
+ elem_classes="slider",
592
+ position=0.25,
593
+ )
594
+ disparity_image_output_files = gr.Files(
595
+ label="Disparity outputs",
596
+ elem_id="download",
597
+ interactive=False,
598
+ )
599
+
600
+ filenames = []
601
+ filenames.extend(["disparity_anime_%d.jpg" %(i+1) for i in range(7)])
602
+ filenames.extend(["disparity_line_%d.jpg" %(i+1) for i in range(6)])
603
+ filenames.extend(["disparity_real_%d.jpg" %(i+1) for i in range(24)])
604
+
605
+ example_folder = os.path.join(os.path.dirname(__file__), "depth_images")
606
+ Examples(
607
+ fn=process_pipe_disparity,
608
+ examples=[
609
+ os.path.join(example_folder, name)
610
+ for name in filenames
611
+ ],
612
+ inputs=[disparity_image_input],
613
+ outputs=[disparity_image_output_slider, disparity_image_output_files],
614
+ cache_examples=False,
615
+ # directory_name="examples_depth",
616
+ # cache_examples=False,
617
+ )
618
 
619
 
620
  ### Image tab
 
749
  ],
750
  queue=False,
751
  )
752
+
753
+ seg_image_submit_btn.click(
754
+ fn=process_image_check,
755
+ inputs=seg_image_input,
756
+ outputs=None,
757
+ preprocess=False,
758
+ queue=False,
759
+ ).success(
760
+ fn=process_pipe_seg,
761
+ inputs=[
762
+ seg_image_input,
763
+ image_processing_res,
764
+ ],
765
+ outputs=[seg_image_output_slider, seg_image_output_files],
766
+ concurrency_limit=1,
767
+ )
768
+
769
+ seg_image_reset_btn.click(
770
+ fn=lambda: (
771
+ None,
772
+ None,
773
+ None,
774
+ default_image_processing_res,
775
+ ),
776
+ inputs=[],
777
+ outputs=[
778
+ seg_image_input,
779
+ seg_image_output_slider,
780
+ seg_image_output_files,
781
+ image_processing_res,
782
+ ],
783
+ queue=False,
784
+ )
785
+
786
+ disparity_image_submit_btn.click(
787
+ fn=process_image_check,
788
+ inputs=disparity_image_input,
789
+ outputs=None,
790
+ preprocess=False,
791
+ queue=False,
792
+ ).success(
793
+ fn=process_pipe_disparity,
794
+ inputs=[
795
+ disparity_image_input,
796
+ image_processing_res,
797
+ ],
798
+ outputs=[disparity_image_output_slider, disparity_image_output_files],
799
+ concurrency_limit=1,
800
+ )
801
+
802
+ disparity_image_reset_btn.click(
803
+ fn=lambda: (
804
+ None,
805
+ None,
806
+ None,
807
+ default_image_processing_res,
808
+ ),
809
+ inputs=[],
810
+ outputs=[
811
+ disparity_image_input,
812
+ disparity_image_output_slider,
813
+ disparity_image_output_files,
814
+ image_processing_res,
815
+ ],
816
+ queue=False,
817
+ )
818
  ### Server launch
819
 
820
  demo.queue(
 
830
 
831
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
832
 
833
+ # dtype = torch.float16
834
+ # variant = "fp16"
835
+
836
+ dtype = torch.float32
837
+ variant = None
838
 
839
+ unet_depth_v2 = CustomUNet2DConditionModel.from_pretrained('guangkaixu/GenPercept-models', subfolder="unet_depth_v2", use_safetensors=True).to(dtype)
840
+ unet_normal_v2 = CustomUNet2DConditionModel.from_pretrained('guangkaixu/GenPercept-models', subfolder="unet_normal_v2", use_safetensors=True).to(dtype)
841
+ unet_dis_v2 = CustomUNet2DConditionModel.from_pretrained('guangkaixu/GenPercept-models', subfolder="unet_dis_v2", use_safetensors=True).to(dtype)
842
+ unet_matting_v2 = CustomUNet2DConditionModel.from_pretrained('guangkaixu/GenPercept-models', subfolder="unet_matting_v2", use_safetensors=True).to(dtype)
843
+ unet_disparity_v2 = CustomUNet2DConditionModel.from_pretrained('guangkaixu/GenPercept-models', subfolder="unet_disparity_v2", use_safetensors=True).to(dtype)
844
+ unet_seg_v2 = CustomUNet2DConditionModel.from_pretrained('guangkaixu/GenPercept-models', subfolder="unet_seg_v2", use_safetensors=True).to(dtype)
845
+
846
+ scheduler = DDIMSchedulerCustomized.from_pretrained("hf_configs/scheduler_beta_1.0_1.0", subfolder='scheduler')
847
+ genpercept_pipeline = True
848
+
849
+ pre_loaded_dict = dict(
850
+ scheduler=scheduler,
851
+ genpercept_pipeline=genpercept_pipeline,
852
+ torch_dtype=dtype,
853
+ variant=variant,
854
+ )
855
+
856
+ pipe_depth = GenPerceptPipeline.from_pretrained(
857
+ "stabilityai/stable-diffusion-2-1", unet=unet_depth_v2, **pre_loaded_dict,
858
+ )
859
+
860
+ pipe_normal = GenPerceptPipeline.from_pretrained(
861
+ "stabilityai/stable-diffusion-2-1", unet=unet_normal_v2, **pre_loaded_dict,
862
+ )
863
+
864
+ pipe_dis = GenPerceptPipeline.from_pretrained(
865
+ "stabilityai/stable-diffusion-2-1", unet=unet_dis_v2, **pre_loaded_dict,
866
+ )
867
+
868
+ pipe_matting = GenPerceptPipeline.from_pretrained(
869
+ "stabilityai/stable-diffusion-2-1", unet=unet_matting_v2, **pre_loaded_dict,
870
+ )
871
+
872
+ pipe_seg = GenPerceptPipeline.from_pretrained(
873
+ "stabilityai/stable-diffusion-2-1", unet=unet_seg_v2, **pre_loaded_dict,
874
+ )
875
+
876
+ pipe_disparity = GenPerceptPipeline.from_pretrained(
877
+ "stabilityai/stable-diffusion-2-1", unet=unet_disparity_v2, **pre_loaded_dict,
878
+ )
879
+
880
  try:
881
  import xformers
882
  pipe_depth.enable_xformers_memory_efficient_attention()
883
  pipe_normal.enable_xformers_memory_efficient_attention()
884
  pipe_dis.enable_xformers_memory_efficient_attention()
885
  pipe_matting.enable_xformers_memory_efficient_attention()
886
+ pipe_seg.enable_xformers_memory_efficient_attention()
887
+ pipe_disparity.enable_xformers_memory_efficient_attention()
888
  except:
889
  pass # run without xformers
890
 
 
892
  pipe_normal = pipe_normal.to(device)
893
  pipe_dis = pipe_dis.to(device)
894
  pipe_matting = pipe_matting.to(device)
895
+ pipe_seg = pipe_seg.to(device)
896
+ pipe_disparity = pipe_disparity.to(device)
897
 
898
+ run_demo_server(pipe_depth, pipe_normal, pipe_dis, pipe_matting, pipe_seg, pipe_disparity)
899
 
900
 
901
  if __name__ == "__main__":
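
For reference, a condensed sketch of the loading path that the updated main() uses: a task-specific CustomUNet2DConditionModel is dropped into a GenPerceptPipeline built on top of stabilityai/stable-diffusion-2-1, together with the customized one-step DDIM schedule. Repo IDs and the scheduler config path are taken from the diff above; the full script builds six such pipelines (depth, normal, dis, matting, seg, disparity) in the same way.

import torch
from genpercept.genpercept_pipeline import GenPerceptPipeline
from genpercept.models.custom_unet import CustomUNet2DConditionModel
from genpercept.customized_modules.ddim import DDIMSchedulerCustomized

dtype = torch.float32  # the demo switches from fp16 to fp32 in this commit

# Task-specific UNet weights (here: depth), as referenced in main() above
unet = CustomUNet2DConditionModel.from_pretrained(
    "guangkaixu/GenPercept-models", subfolder="unet_depth_v2", use_safetensors=True
).to(dtype)

# Customized DDIM schedule with beta_start = beta_end = 1.0 (single-step inference)
scheduler = DDIMSchedulerCustomized.from_pretrained(
    "hf_configs/scheduler_beta_1.0_1.0", subfolder="scheduler"
)

pipe_depth = GenPerceptPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1",
    unet=unet,
    scheduler=scheduler,
    genpercept_pipeline=True,  # forces one-step, RGB-latent-initialized inference
    torch_dtype=dtype,
)
pipe_depth = pipe_depth.to("cuda" if torch.cuda.is_available() else "cpu")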
empty_text_embed.npy DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:677e5e752b1d428a2e5f6f87a62c3a6c726343d264351dc1c433763ddc9b7182
- size 157824
genpercept/__init__.py ADDED
@@ -0,0 +1,13 @@
+ # --------------------------------------------------------
+ # What Matters When Repurposing Diffusion Models for General Dense Perception Tasks? (https://arxiv.org/abs/2403.06090)
+ # Github source: https://github.com/aim-uofa/GenPercept
+ # Copyright (c) 2024, Advanced Intelligent Machines (AIM)
+ # Licensed under The BSD 2-Clause License [see LICENSE for details]
+ # By Guangkai Xu
+ # Based on Marigold, diffusers codebases
+ # https://github.com/prs-eth/marigold
+ # https://github.com/huggingface/diffusers
+ # --------------------------------------------------------
+
+
+ from .genpercept_pipeline import GenPerceptPipeline, GenPerceptOutput
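
With this package init in place, the pipeline and its output class can also be imported at package level (the demo above imports the module path directly; both work):

from genpercept import GenPerceptPipeline, GenPerceptOutput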
genpercept/customized_modules/ddim.py ADDED
@@ -0,0 +1,213 @@
1
+ # --------------------------------------------------------
2
+ # What Matters When Repurposing Diffusion Models for General Dense Perception Tasks? (https://arxiv.org/abs/2403.06090)
3
+ # Github source: https://github.com/aim-uofa/GenPercept
4
+ # Copyright (c) 2024, Advanced Intelligent Machines (AIM)
5
+ # Licensed under The BSD 2-Clause License [see LICENSE for details]
6
+ # By Guangkai Xu
7
+ # Based on Marigold, diffusers codebases
8
+ # https://github.com/prs-eth/marigold
9
+ # https://github.com/huggingface/diffusers
10
+ # --------------------------------------------------------
11
+
12
+
13
+ import torch
14
+ from typing import List, Optional, Tuple, Union
15
+ import numpy as np
16
+ from diffusers import DDIMScheduler, DDPMScheduler
+ from diffusers.schedulers.scheduling_ddpm import betas_for_alpha_bar
17
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
18
+
19
+
20
+ def rescale_zero_terminal_snr(betas):
21
+ """
22
+ Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
23
+
24
+
25
+ Args:
26
+ betas (`torch.FloatTensor`):
27
+ the betas that the scheduler is being initialized with.
28
+
29
+ Returns:
30
+ `torch.FloatTensor`: rescaled betas with zero terminal SNR
31
+ """
32
+ # Convert betas to alphas_bar_sqrt
33
+ alphas = 1.0 - betas
34
+ alphas_cumprod = torch.cumprod(alphas, dim=0)
35
+ alphas_bar_sqrt = alphas_cumprod.sqrt()
36
+
37
+ # Store old values.
38
+ alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
39
+ alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
40
+
41
+ # Shift so the last timestep is zero.
42
+ alphas_bar_sqrt -= alphas_bar_sqrt_T
43
+
44
+ # Scale so the first timestep is back to the old value.
45
+ alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
46
+
47
+ # Convert alphas_bar_sqrt to betas
48
+ alphas_bar = alphas_bar_sqrt**2 # Revert sqrt
49
+ alphas = alphas_bar[1:] / alphas_bar[:-1] # Revert cumprod
50
+ alphas = torch.cat([alphas_bar[0:1], alphas])
51
+ betas = 1 - alphas
52
+
53
+ return betas
54
+
55
+
56
+ class DDPMSchedulerCustomized(DDPMScheduler):
57
+
58
+ @register_to_config
59
+ def __init__(
60
+ self,
61
+ num_train_timesteps: int = 1000,
62
+ beta_start: float = 0.0001,
63
+ beta_end: float = 0.02,
64
+ beta_schedule: str = "linear",
65
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
66
+ variance_type: str = "fixed_small",
67
+ clip_sample: bool = True,
68
+ prediction_type: str = "epsilon",
69
+ thresholding: bool = False,
70
+ dynamic_thresholding_ratio: float = 0.995,
71
+ clip_sample_range: float = 1.0,
72
+ sample_max_value: float = 1.0,
73
+ timestep_spacing: str = "leading",
74
+ steps_offset: int = 0,
75
+ rescale_betas_zero_snr: int = False,
76
+ power_beta_curve = 1.0,
77
+ ):
78
+
79
+ if trained_betas is not None:
80
+ self.betas = torch.tensor(trained_betas, dtype=torch.float32)
81
+ elif beta_schedule == "linear":
82
+ self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
83
+ elif beta_schedule == "scaled_linear":
84
+ # this schedule is very specific to the latent diffusion model.
85
+ self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
86
+ elif beta_schedule == "scaled_linear_power":
87
+ self.betas = torch.linspace(beta_start**(1/power_beta_curve), beta_end**(1/power_beta_curve), num_train_timesteps, dtype=torch.float32) ** power_beta_curve
88
+ elif beta_schedule == "squaredcos_cap_v2":
89
+ # Glide cosine schedule
90
+ self.betas = betas_for_alpha_bar(num_train_timesteps)
91
+ elif beta_schedule == "sigmoid":
92
+ # GeoDiff sigmoid schedule
93
+ betas = torch.linspace(-6, 6, num_train_timesteps)
94
+ self.betas = torch.sigmoid(betas) * (beta_end - beta_start) + beta_start
95
+ else:
96
+ raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
97
+
98
+ # Rescale for zero SNR
99
+ if rescale_betas_zero_snr:
100
+ self.betas = rescale_zero_terminal_snr(self.betas)
101
+
102
+ self.alphas = 1.0 - self.betas
103
+ self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
104
+ self.one = torch.tensor(1.0)
105
+
106
+ # standard deviation of the initial noise distribution
107
+ self.init_noise_sigma = 1.0
108
+
109
+ # setable values
110
+ self.custom_timesteps = False
111
+ self.num_inference_steps = None
112
+ self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy())
113
+
114
+ self.variance_type = variance_type
115
+
116
+ def get_velocity(
117
+ self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor
118
+ ) -> torch.FloatTensor:
119
+ # Make sure alphas_cumprod and timestep have same device and dtype as sample
120
+ self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
121
+ alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
122
+ timesteps = timesteps.to(sample.device)
123
+
124
+ sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
125
+ sqrt_alpha_prod = sqrt_alpha_prod.flatten()
126
+ while len(sqrt_alpha_prod.shape) < len(sample.shape):
127
+ sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
128
+
129
+ sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
130
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
131
+ while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
132
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
133
+
134
+ # import pdb
135
+ # pdb.set_trace()
136
+ velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
137
+ return velocity
138
+
139
+ class DDIMSchedulerCustomized(DDIMScheduler):
140
+
141
+ @register_to_config
142
+ def __init__(
143
+ self,
144
+ num_train_timesteps: int = 1000,
145
+ beta_start: float = 0.0001,
146
+ beta_end: float = 0.02,
147
+ beta_schedule: str = "linear",
148
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
149
+ clip_sample: bool = True,
150
+ set_alpha_to_one: bool = True,
151
+ steps_offset: int = 0,
152
+ prediction_type: str = "epsilon",
153
+ thresholding: bool = False,
154
+ dynamic_thresholding_ratio: float = 0.995,
155
+ clip_sample_range: float = 1.0,
156
+ sample_max_value: float = 1.0,
157
+ timestep_spacing: str = "leading",
158
+ rescale_betas_zero_snr: bool = False,
159
+ power_beta_curve = 1.0,
160
+ ):
161
+ if trained_betas is not None:
162
+ self.betas = torch.tensor(trained_betas, dtype=torch.float32)
163
+ elif beta_schedule == "linear":
164
+ self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
165
+ elif beta_schedule == "scaled_linear":
166
+ # this schedule is very specific to the latent diffusion model.
167
+ self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
168
+ elif beta_schedule == "scaled_linear_power":
169
+ self.betas = torch.linspace(beta_start**(1/power_beta_curve), beta_end**(1/power_beta_curve), num_train_timesteps, dtype=torch.float32) ** power_beta_curve
170
+ self.power_beta_curve = power_beta_curve
171
+ elif beta_schedule == "squaredcos_cap_v2":
172
+ # Glide cosine schedule
173
+ self.betas = betas_for_alpha_bar(num_train_timesteps)
174
+ else:
175
+ raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
176
+
177
+ # Rescale for zero SNR
178
+ if rescale_betas_zero_snr:
179
+ self.betas = rescale_zero_terminal_snr(self.betas)
180
+
181
+ # self.betas = self.betas.double()
182
+
183
+ self.alphas = 1.0 - self.betas
184
+ self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
185
+
186
+ # At every step in ddim, we are looking into the previous alphas_cumprod
187
+ # For the final step, there is no previous alphas_cumprod because we are already at 0
188
+ # `set_alpha_to_one` decides whether we set this parameter simply to one or
189
+ # whether we use the final alpha of the "non-previous" one.
190
+ self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0]
191
+
192
+ # standard deviation of the initial noise distribution
193
+ self.init_noise_sigma = 1.0
194
+
195
+ # setable values
196
+ self.num_inference_steps = None
197
+ self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64))
198
+
199
+ self.beta_schedule = beta_schedule
200
+
201
+ def _get_variance(self, timestep, prev_timestep):
202
+ alpha_prod_t = self.alphas_cumprod[timestep]
203
+ alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
204
+ beta_prod_t = 1 - alpha_prod_t
205
+ beta_prod_t_prev = 1 - alpha_prod_t_prev
206
+
207
+ alpha_t_prev_to_t = self.alphas[(prev_timestep+1):(timestep+1)]
208
+ alpha_t_prev_to_t = torch.prod(alpha_t_prev_to_t)
209
+
210
+ variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_t_prev_to_t)
211
+
212
+ return variance
213
+
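
A quick way to see what the demo's "scheduler_beta_1.0_1.0" config does with this class: with beta_start = beta_end = 1.0 on the linear schedule every beta equals 1, so alphas_cumprod collapses to zero and a single DDIM step already produces the final prediction. The prediction_type="v_prediction" below is an assumption (the SD 2.1 convention); under it, pred_original_sample reduces to -model_output, which matches the NOTE in GenPerceptPipeline.single_infer.

from genpercept.customized_modules.ddim import DDIMSchedulerCustomized

scheduler = DDIMSchedulerCustomized(
    num_train_timesteps=1000,
    beta_start=1.0,
    beta_end=1.0,
    beta_schedule="linear",
    prediction_type="v_prediction",  # assumption; not pinned down by this file alone
)
print(scheduler.betas.unique())           # tensor([1.])
print(scheduler.alphas_cumprod.unique())  # tensor([0.]) -> one step fully "denoises"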
genpercept/genpercept_pipeline.py ADDED
@@ -0,0 +1,519 @@
1
+ # --------------------------------------------------------
2
+ # What Matters When Repurposing Diffusion Models for General Dense Perception Tasks? (https://arxiv.org/abs/2403.06090)
3
+ # Github source: https://github.com/aim-uofa/GenPercept
4
+ # Copyright (c) 2024, Advanced Intelligent Machines (AIM)
5
+ # Licensed under The BSD 2-Clause License [see LICENSE for details]
6
+ # By Guangkai Xu
7
+ # Based on Marigold, diffusers codebases
8
+ # https://github.com/prs-eth/marigold
9
+ # https://github.com/huggingface/diffusers
10
+ # --------------------------------------------------------
11
+
12
+
13
+ import logging
14
+ from typing import Dict, Optional, Union
15
+
16
+ import numpy as np
17
+ import torch
18
+ from diffusers import (
19
+ AutoencoderKL,
20
+ DDIMScheduler,
21
+ DiffusionPipeline,
22
+ LCMScheduler,
23
+ UNet2DConditionModel,
24
+ )
25
+ from diffusers.utils import BaseOutput
26
+ from PIL import Image
27
+ from torch.utils.data import DataLoader, TensorDataset
28
+ from torchvision.transforms import InterpolationMode
29
+ from torchvision.transforms.functional import pil_to_tensor, resize
30
+ from tqdm.auto import tqdm
31
+ from transformers import CLIPTextModel, CLIPTokenizer
32
+
33
+ from .util.batchsize import find_batch_size
34
+ from .util.ensemble import ensemble_depth
35
+ from .util.image_util import (
36
+ chw2hwc,
37
+ colorize_depth_maps,
38
+ get_tv_resample_method,
39
+ resize_max_res,
40
+ )
41
+
42
+ import matplotlib.pyplot as plt
43
+ from genpercept.models.dpt_head import DPTNeckHeadForUnetAfterUpsampleIdentity
44
+
45
+
46
+ class GenPerceptOutput(BaseOutput):
47
+ """
48
+ Output class for GenPercept general perception pipeline.
49
+
50
+ Args:
51
+ pred_np (`np.ndarray`):
52
+ Predicted result, with values in the range of [0, 1].
53
+ pred_colored (`PIL.Image.Image`):
54
+ Colorized result, with the shape of [3, H, W] and values in [0, 1].
55
+ """
56
+
57
+ pred_np: np.ndarray
58
+ pred_colored: Union[None, Image.Image]
59
+
60
+ class GenPerceptPipeline(DiffusionPipeline):
61
+ """
62
+ Pipeline for general perception using GenPercept: https://github.com/aim-uofa/GenPercept.
63
+
64
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
65
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
66
+
67
+ Args:
68
+ unet (`UNet2DConditionModel`):
69
+ Conditional U-Net to denoise the perception latent, conditioned on image latent.
70
+ vae (`AutoencoderKL`):
71
+ Variational Auto-Encoder (VAE) Model to encode and decode images and results
72
+ to and from latent representations.
73
+ scheduler (`DDIMScheduler`):
74
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents.
75
+ text_encoder (`CLIPTextModel`):
76
+ Text-encoder, for empty text embedding.
77
+ tokenizer (`CLIPTokenizer`):
78
+ CLIP tokenizer.
79
+ default_denoising_steps (`int`, *optional*):
80
+ The minimum number of denoising diffusion steps that are required to produce a prediction of reasonable
81
+ quality with the given model. This value must be set in the model config. When the pipeline is called
82
+ without explicitly setting `num_inference_steps`, the default value is used. This is required to ensure
83
+ reasonable results with various model flavors compatible with the pipeline, such as those relying on very
84
+ short denoising schedules (`LCMScheduler`) and those with full diffusion schedules (`DDIMScheduler`).
85
+ default_processing_resolution (`int`, *optional*):
86
+ The recommended value of the `processing_resolution` parameter of the pipeline. This value must be set in
87
+ the model config. When the pipeline is called without explicitly setting `processing_resolution`, the
88
+ default value is used. This is required to ensure reasonable results with various model flavors trained
89
+ with varying optimal processing resolution values.
90
+ """
91
+
92
+ latent_scale_factor = 0.18215
93
+
94
+ def __init__(
95
+ self,
96
+ unet: UNet2DConditionModel,
97
+ vae: AutoencoderKL,
98
+ scheduler: Union[DDIMScheduler, LCMScheduler],
99
+ text_encoder: CLIPTextModel,
100
+ tokenizer: CLIPTokenizer,
101
+ default_denoising_steps: Optional[int] = 10,
102
+ default_processing_resolution: Optional[int] = 768,
103
+ rgb_blending = False,
104
+ customized_head = None,
105
+ genpercept_pipeline = True,
106
+ ):
107
+ super().__init__()
108
+
109
+ self.genpercept_pipeline = genpercept_pipeline
110
+
111
+ if self.genpercept_pipeline:
112
+ default_denoising_steps = 1
113
+ rgb_blending = True
114
+
115
+ self.register_modules(
116
+ unet=unet,
117
+ customized_head=customized_head,
118
+ vae=vae,
119
+ scheduler=scheduler,
120
+ text_encoder=text_encoder,
121
+ tokenizer=tokenizer,
122
+ )
123
+ self.register_to_config(
124
+ default_denoising_steps=default_denoising_steps,
125
+ default_processing_resolution=default_processing_resolution,
126
+ rgb_blending=rgb_blending,
127
+ )
128
+
129
+ self.default_denoising_steps = default_denoising_steps
130
+ self.default_processing_resolution = default_processing_resolution
131
+ self.rgb_blending = rgb_blending
132
+
133
+ self.text_embed = None
134
+
135
+ self.customized_head = customized_head
136
+
137
+ if self.customized_head:
138
+ assert self.rgb_blending and self.scheduler.beta_start == 1 and self.scheduler.beta_end == 1
139
+ assert self.genpercept_pipeline
140
+
141
+ @torch.no_grad()
142
+ def __call__(
143
+ self,
144
+ input_image: Union[Image.Image, torch.Tensor],
145
+ denoising_steps: Optional[int] = None,
146
+ ensemble_size: int = 1,
147
+ processing_res: Optional[int] = None,
148
+ match_input_res: bool = True,
149
+ resample_method: str = "bilinear",
150
+ batch_size: int = 0,
151
+ generator: Union[torch.Generator, None] = None,
152
+ color_map: str = "Spectral",
153
+ show_progress_bar: bool = True,
154
+ ensemble_kwargs: Dict = None,
155
+ mode = None,
156
+ fix_timesteps = None,
157
+ prompt = "",
158
+ ) -> GenPerceptOutput:
159
+ """
160
+ Function invoked when calling the pipeline.
161
+
162
+ Args:
163
+ input_image (`Image`):
164
+ Input RGB (or gray-scale) image.
165
+ denoising_steps (`int`, *optional*, defaults to `None`):
166
+ Number of denoising diffusion steps during inference. The default value `None` results in automatic
167
+ selection.
168
+ ensemble_size (`int`, *optional*, defaults to `1`):
169
+ Number of predictions to be ensembled.
170
+ processing_res (`int`, *optional*, defaults to `None`):
171
+ Effective processing resolution. When set to `0`, processes at the original image resolution. This
172
+ produces crisper predictions, but may also lead to the overall loss of global context. The default
173
+ value `None` resolves to the optimal value from the model config.
174
+ match_input_res (`bool`, *optional*, defaults to `True`):
175
+ Resize perception result to match input resolution.
176
+ Only valid if `processing_res` > 0.
177
+ resample_method: (`str`, *optional*, defaults to `bilinear`):
178
+ Resampling method used to resize images and perception results. This can be one of `bilinear`, `bicubic` or `nearest`, defaults to: `bilinear`.
179
+ batch_size (`int`, *optional*, defaults to `0`):
180
+ Inference batch size, no bigger than `num_ensemble`.
181
+ If set to 0, the script will automatically decide the proper batch size.
182
+ generator (`torch.Generator`, *optional*, defaults to `None`)
183
+ Random generator for initial noise generation.
184
+ show_progress_bar (`bool`, *optional*, defaults to `True`):
185
+ Display a progress bar of diffusion denoising.
186
+ color_map (`str`, *optional*, defaults to `"Spectral"`, pass `None` to skip colorized result generation):
187
+ Colormap used to colorize the result.
188
+ ensemble_kwargs (`dict`, *optional*, defaults to `None`):
189
+ Arguments for detailed ensembling settings.
190
+ Returns:
191
+ `GenPerceptOutput`: Output class for GenPercept general perception pipeline, including:
192
+ - **pred_np** (`np.ndarray`) Predicted result, with values in the range of [0, 1]
193
+ - **pred_colored** (`PIL.Image.Image`) Colorized result, with the shape of [3, H, W] and values in [0, 1], None if `color_map` is `None`
194
+ """
195
+ assert mode is not None, "mode of GenPerceptPipeline can be chosen from ['depth', 'normal', 'seg', 'matting', 'dis']."
196
+ self.mode = mode
197
+
198
+ # Model-specific optimal default values leading to fast and reasonable results.
199
+ if denoising_steps is None:
200
+ denoising_steps = self.default_denoising_steps
201
+ if processing_res is None:
202
+ processing_res = self.default_processing_resolution
203
+
204
+ assert processing_res >= 0
205
+ assert ensemble_size >= 1
206
+
207
+ if self.genpercept_pipeline:
208
+ assert ensemble_size == 1
209
+ assert denoising_steps == 1
210
+ else:
211
+ # Check if denoising step is reasonable
212
+ self._check_inference_step(denoising_steps)
213
+
214
+ resample_method: InterpolationMode = get_tv_resample_method(resample_method)
215
+
216
+ # ----------------- Image Preprocess -----------------
217
+ # Convert to torch tensor
218
+ if isinstance(input_image, Image.Image):
219
+ input_image = input_image.convert("RGB")
220
+ # convert to torch tensor [H, W, rgb] -> [rgb, H, W]
221
+ rgb = pil_to_tensor(input_image)
222
+ rgb = rgb.unsqueeze(0) # [1, rgb, H, W]
223
+ elif isinstance(input_image, torch.Tensor):
224
+ rgb = input_image
225
+ else:
226
+ raise TypeError(f"Unknown input type: {type(input_image) = }")
227
+ input_size = rgb.shape
228
+ assert (
229
+ 4 == rgb.dim() and 3 == input_size[-3]
230
+ ), f"Wrong input shape {input_size}, expected [1, rgb, H, W]"
231
+
232
+ # Resize image
233
+ if processing_res > 0:
234
+ rgb = resize_max_res(
235
+ rgb,
236
+ max_edge_resolution=processing_res,
237
+ resample_method=resample_method,
238
+ )
239
+
240
+ # Normalize rgb values
241
+ rgb_norm: torch.Tensor = rgb / 255.0 * 2.0 - 1.0 # [0, 255] -> [-1, 1]
242
+ rgb_norm = rgb_norm.to(self.dtype)
243
+ assert rgb_norm.min() >= -1.0 and rgb_norm.max() <= 1.0
244
+
245
+ # ----------------- Perception Inference -----------------
246
+ # Batch repeated input image
247
+ duplicated_rgb = rgb_norm.expand(ensemble_size, -1, -1, -1)
248
+ single_rgb_dataset = TensorDataset(duplicated_rgb)
249
+ if batch_size > 0:
250
+ _bs = batch_size
251
+ else:
252
+ _bs = find_batch_size(
253
+ ensemble_size=ensemble_size,
254
+ input_res=max(rgb_norm.shape[1:]),
255
+ dtype=self.dtype,
256
+ )
257
+
258
+ single_rgb_loader = DataLoader(
259
+ single_rgb_dataset, batch_size=_bs, shuffle=False
260
+ )
261
+
262
+ # Predict results (batched)
263
+ pipe_pred_ls = []
264
+ if show_progress_bar:
265
+ iterable = tqdm(
266
+ single_rgb_loader, desc=" " * 2 + "Inference batches", leave=False
267
+ )
268
+ else:
269
+ iterable = single_rgb_loader
270
+ for batch in iterable:
271
+ (batched_img,) = batch
272
+ pipe_pred_raw = self.single_infer(
273
+ rgb_in=batched_img,
274
+ num_inference_steps=denoising_steps,
275
+ show_pbar=show_progress_bar,
276
+ generator=generator,
277
+ fix_timesteps=fix_timesteps,
278
+ prompt=prompt,
279
+ )
280
+ pipe_pred_ls.append(pipe_pred_raw.detach())
281
+ pipe_preds = torch.concat(pipe_pred_ls, dim=0)
282
+ torch.cuda.empty_cache() # clear vram cache for ensembling
283
+
284
+ # ----------------- Test-time ensembling -----------------
285
+ if ensemble_size > 1:
286
+ pipe_pred, _ = ensemble_depth(
287
+ pipe_preds,
288
+ scale_invariant=True,
289
+ shift_invariant=True,
290
+ max_res=50,
291
+ **(ensemble_kwargs or {}),
292
+ )
293
+ else:
294
+ pipe_pred = pipe_preds
295
+
296
+ # Resize back to original resolution
297
+ if match_input_res:
298
+ pipe_pred = resize(
299
+ pipe_pred,
300
+ input_size[-2:],
301
+ interpolation=resample_method,
302
+ antialias=True,
303
+ )
304
+
305
+ # Convert to numpy
306
+ pipe_pred = pipe_pred.squeeze()
307
+ pipe_pred = pipe_pred.cpu().numpy()
308
+
309
+ # Clip output range
310
+ pipe_pred = pipe_pred.clip(0, 1)
311
+
312
+ # Colorize
313
+ if color_map is not None:
314
+ assert self.mode == 'depth'
315
+ pred_colored = colorize_depth_maps(
316
+ pipe_pred, 0, 1, cmap=color_map
317
+ ).squeeze() # [3, H, W], value in (0, 1)
318
+ pred_colored = (pred_colored * 255).astype(np.uint8)
319
+ pred_colored_hwc = chw2hwc(pred_colored)
320
+ pred_colored_img = Image.fromarray(pred_colored_hwc)
321
+ else:
322
+ pred_colored_img = None
323
+
324
+ if len(pipe_pred.shape) == 3 and pipe_pred.shape[0] == 3:
325
+ pipe_pred = np.transpose(pipe_pred, (1, 2, 0))
326
+
327
+ return GenPerceptOutput(
328
+ pred_np=pipe_pred,
329
+ pred_colored=pred_colored_img,
330
+ )
331
+
332
+ def _check_inference_step(self, n_step: int) -> None:
333
+ """
334
+ Check if denoising step is reasonable
335
+ Args:
336
+ n_step (`int`): denoising steps
337
+ """
338
+ assert n_step >= 1
339
+
340
+ if isinstance(self.scheduler, DDIMScheduler):
341
+ if n_step < 10:
342
+ logging.warning(
343
+ f"Too few denoising steps: {n_step}. Recommended to use the LCM checkpoint for few-step inference."
344
+ )
345
+ elif isinstance(self.scheduler, LCMScheduler):
346
+ if not 1 <= n_step <= 4:
347
+ logging.warning(
348
+ f"Non-optimal setting of denoising steps: {n_step}. Recommended setting is 1-4 steps."
349
+ )
350
+ else:
351
+ raise RuntimeError(f"Unsupported scheduler type: {type(self.scheduler)}")
352
+
353
+ def encode_text(self, prompt):
354
+ """
355
+ Encode text embedding for empty prompt
356
+ """
357
+ text_inputs = self.tokenizer(
358
+ prompt,
359
+ padding="do_not_pad",
360
+ max_length=self.tokenizer.model_max_length,
361
+ truncation=True,
362
+ return_tensors="pt",
363
+ )
364
+ text_input_ids = text_inputs.input_ids.to(self.text_encoder.device)
365
+ self.text_embed = self.text_encoder(text_input_ids)[0].to(self.dtype)
366
+
367
+ @torch.no_grad()
368
+ def single_infer(
369
+ self,
370
+ rgb_in: torch.Tensor,
371
+ num_inference_steps: int,
372
+ generator: Union[torch.Generator, None],
373
+ show_pbar: bool,
374
+ fix_timesteps = None,
375
+ prompt = "",
376
+ ) -> torch.Tensor:
377
+ """
378
+ Perform a single perception inference without ensembling.
379
+
380
+ Args:
381
+ rgb_in (`torch.Tensor`):
382
+ Input RGB image.
383
+ num_inference_steps (`int`):
384
+ Number of diffusion denoising steps (DDIM) during inference.
385
+ show_pbar (`bool`):
386
+ Display a progress bar of diffusion denoising.
387
+ generator (`torch.Generator`)
388
+ Random generator for initial noise generation.
389
+ Returns:
390
+ `torch.Tensor`: Predicted result.
391
+ """
392
+ device = self.device
393
+ rgb_in = rgb_in.to(device)
394
+
395
+ # Set timesteps
396
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
397
+
398
+ if fix_timesteps:
399
+ timesteps = torch.tensor([fix_timesteps]).long().repeat(self.scheduler.timesteps.shape[0]).to(device)
400
+ else:
401
+ timesteps = self.scheduler.timesteps # [T]
402
+
403
+ # Encode image
404
+ rgb_latent = self.encode_rgb(rgb_in)
405
+
406
+ if not (self.rgb_blending or self.genpercept_pipeline):
407
+ # Initial result (noise)
408
+ pred_latent = torch.randn(
409
+ rgb_latent.shape,
410
+ device=device,
411
+ dtype=self.dtype,
412
+ generator=generator,
413
+ ) # [B, 4, h, w]
414
+ else:
415
+ pred_latent = rgb_latent
416
+
417
+ # Batched empty text embedding
418
+ if self.text_embed is None:
419
+ self.encode_text(prompt)
420
+ batch_text_embed = self.text_embed.repeat(
421
+ (rgb_latent.shape[0], 1, 1)
422
+ ).to(device) # [B, 2, 1024]
423
+
424
+ # Denoising loop
425
+ if show_pbar:
426
+ iterable = tqdm(
427
+ enumerate(timesteps),
428
+ total=len(timesteps),
429
+ leave=False,
430
+ desc=" " * 4 + "Diffusion denoising",
431
+ )
432
+ else:
433
+ iterable = enumerate(timesteps)
434
+
435
+ if not self.customized_head:
436
+ for i, t in iterable:
437
+ if self.genpercept_pipeline and i > 0:
438
+ raise ValueError("GenPercept only forwards once.")
439
+
440
+ if not (self.rgb_blending or self.genpercept_pipeline):
441
+ unet_input = torch.cat(
442
+ [rgb_latent, pred_latent], dim=1
443
+ ) # this order is important
444
+ else:
445
+ unet_input = pred_latent
446
+
447
+ # predict the noise residual
448
+ noise_pred = self.unet(
449
+ unet_input, t, encoder_hidden_states=batch_text_embed
450
+ ).sample # [B, 4, h, w]
451
+
452
+ # compute the previous noisy sample x_t -> x_t-1
453
+ step_output = self.scheduler.step(
454
+ noise_pred, t, pred_latent, generator=generator
455
+ )
456
+ pred_latent = step_output.prev_sample
457
+
458
+ pred_latent = step_output.pred_original_sample # NOTE: for GenPercept, it is equivalent to "pred_latent = - noise_pred"
459
+
460
+ pred = self.decode_pred(pred_latent)
461
+
462
+ # clip prediction
463
+ pred = torch.clip(pred, -1.0, 1.0)
464
+ # shift to [0, 1]
465
+ pred = (pred + 1.0) / 2.0
466
+
467
+ elif isinstance(self.customized_head, DPTNeckHeadForUnetAfterUpsampleIdentity):
468
+ unet_input = pred_latent
469
+ model_pred_output = self.unet(
470
+ unet_input, timesteps, encoder_hidden_states=batch_text_embed, return_feature=True
471
+ ) # [B, 4, h, w]
472
+ unet_features = model_pred_output.multi_level_feats[::-1]
473
+ pred = self.customized_head(hidden_states=unet_features).prediction[:, None]
474
+ # shift to [0, 1]
475
+ pred = (pred - pred.min()) / (pred.max() - pred.min())
476
+ else:
477
+ raise ValueError
478
+
479
+ return pred
480
+
481
+ def encode_rgb(self, rgb_in: torch.Tensor) -> torch.Tensor:
482
+ """
483
+ Encode RGB image into latent.
484
+
485
+ Args:
486
+ rgb_in (`torch.Tensor`):
487
+ Input RGB image to be encoded.
488
+
489
+ Returns:
490
+ `torch.Tensor`: Image latent.
491
+ """
492
+ # encode
493
+ h = self.vae.encoder(rgb_in)
494
+ moments = self.vae.quant_conv(h)
495
+ mean, logvar = torch.chunk(moments, 2, dim=1)
496
+ # scale latent
497
+ rgb_latent = mean * self.latent_scale_factor
498
+ return rgb_latent
499
+
500
+ def decode_pred(self, pred_latent: torch.Tensor) -> torch.Tensor:
501
+ """
502
+ Decode pred latent into result.
503
+
504
+ Args:
505
+ pred_latent (`torch.Tensor`):
506
+ pred latent to be decoded.
507
+
508
+ Returns:
509
+ `torch.Tensor`: Decoded result.
510
+ """
511
+ # scale latent
512
+ pred_latent = pred_latent / self.latent_scale_factor
513
+ # decode
514
+ z = self.vae.post_quant_conv(pred_latent)
515
+ stacked = self.vae.decoder(z)
516
+ if self.mode in ['depth', 'matting', 'dis']:
517
+ # mean of output channels
518
+ stacked = stacked.mean(dim=1, keepdim=True)
519
+ return stacked
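
A hedged usage sketch of the pipeline defined above. Here pipe_depth stands for a pipeline built as in app.py's main(), and "input.jpg" is a placeholder; argument names and defaults follow the __call__ docstring, and colorization is only produced for the depth mode.

from PIL import Image

image = Image.open("input.jpg").convert("RGB")
out = pipe_depth(
    image,
    mode="depth",          # required: 'depth', 'normal', 'seg', 'matting' or 'dis'
    processing_res=768,    # 0 keeps the native resolution
    match_input_res=True,
    color_map="Spectral",  # must be None for non-depth modes
    show_progress_bar=False,
)
pred = out.pred_np            # np.ndarray with values in [0, 1]
colored = out.pred_colored    # PIL.Image for depth, None when color_map is None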
genpercept/models/custom_unet.py ADDED
@@ -0,0 +1,427 @@
1
+ # --------------------------------------------------------
2
+ # What Matters When Repurposing Diffusion Models for General Dense Perception Tasks? (https://arxiv.org/abs/2403.06090)
3
+ # Github source: https://github.com/aim-uofa/GenPercept
4
+ # Copyright (c) 2024, Advanced Intelligent Machines (AIM)
5
+ # Licensed under The BSD 2-Clause License [see LICENSE for details]
6
+ # By Guangkai Xu
7
+ # Based on diffusers codebases
8
+ # https://github.com/huggingface/diffusers
9
+ # --------------------------------------------------------
10
+
11
+ from diffusers import UNet2DConditionModel
12
+ from diffusers.models.unets.unet_2d_condition import UNet2DConditionOutput
13
+ from typing import Any, Dict, List, Optional, Tuple, Union
14
+ import torch
15
+ import torch.utils.checkpoint
16
+ from dataclasses import dataclass
17
+ from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate, logging, scale_lora_layers, unscale_lora_layers
18
+
19
+ @dataclass
20
+ class CustomUNet2DConditionOutput(BaseOutput):
21
+ """
22
+ The output of [`UNet2DConditionModel`].
23
+
24
+ Args:
25
+ sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
26
+ The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
27
+ """
28
+
29
+ sample: torch.FloatTensor = None
30
+ multi_level_feats: Optional[List[torch.FloatTensor]] = None
31
+
32
+ class CustomUNet2DConditionModel(UNet2DConditionModel):
33
+
34
+ def forward(
35
+ self,
36
+ sample: torch.FloatTensor,
37
+ timestep: Union[torch.Tensor, float, int],
38
+ encoder_hidden_states: torch.Tensor,
39
+ class_labels: Optional[torch.Tensor] = None,
40
+ timestep_cond: Optional[torch.Tensor] = None,
41
+ attention_mask: Optional[torch.Tensor] = None,
42
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
43
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
44
+ down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
45
+ mid_block_additional_residual: Optional[torch.Tensor] = None,
46
+ down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
47
+ encoder_attention_mask: Optional[torch.Tensor] = None,
48
+ return_feature: bool = False,
49
+ return_dict: bool = True,
50
+ ) -> Union[UNet2DConditionOutput, Tuple]:
51
+ r"""
52
+ The [`UNet2DConditionModel`] forward method.
53
+
54
+ Args:
55
+ sample (`torch.FloatTensor`):
56
+ The noisy input tensor with the following shape `(batch, channel, height, width)`.
57
+ timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
58
+ encoder_hidden_states (`torch.FloatTensor`):
59
+ The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
60
+ class_labels (`torch.Tensor`, *optional*, defaults to `None`):
61
+ Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
62
+ timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
63
+ Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
64
+ through the `self.time_embedding` layer to obtain the timestep embeddings.
65
+ attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
66
+ An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
67
+ is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
68
+ negative values to the attention scores corresponding to "discard" tokens.
69
+ cross_attention_kwargs (`dict`, *optional*):
70
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
71
+ `self.processor` in
72
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
73
+ added_cond_kwargs: (`dict`, *optional*):
74
+ A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
75
+ are passed along to the UNet blocks.
76
+ down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
77
+ A tuple of tensors that if specified are added to the residuals of down unet blocks.
78
+ mid_block_additional_residual: (`torch.Tensor`, *optional*):
79
+ A tensor that if specified is added to the residual of the middle unet block.
80
+ encoder_attention_mask (`torch.Tensor`):
81
+ A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
82
+ `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
83
+ which adds large negative values to the attention scores corresponding to "discard" tokens.
84
+ return_dict (`bool`, *optional*, defaults to `True`):
85
+ Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
86
+ tuple.
87
+ cross_attention_kwargs (`dict`, *optional*):
88
+ A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
89
+ added_cond_kwargs: (`dict`, *optional*):
90
+ A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
91
+ are passed along to the UNet blocks.
92
+ down_block_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
93
+ additional residuals to be added to UNet long skip connections from down blocks to up blocks for
94
+ example from ControlNet side model(s)
95
+ mid_block_additional_residual (`torch.Tensor`, *optional*):
96
+ additional residual to be added to UNet mid block output, for example from ControlNet side model
97
+ down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
98
+ additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s)
99
+
100
+ Returns:
101
+ [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
102
+ If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
103
+ a `tuple` is returned where the first element is the sample tensor.
104
+ """
105
+ # By default samples have to be AT least a multiple of the overall upsampling factor.
106
+ # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
107
+ # However, the upsampling interpolation output size can be forced to fit any upsampling size
108
+ # on the fly if necessary.
109
+ default_overall_up_factor = 2**self.num_upsamplers
110
+
111
+ # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
112
+ forward_upsample_size = False
113
+ upsample_size = None
114
+
115
+ for dim in sample.shape[-2:]:
116
+ if dim % default_overall_up_factor != 0:
117
+ # Forward upsample size to force interpolation output size.
118
+ forward_upsample_size = True
119
+ break
120
+
121
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
122
+ # expects mask of shape:
123
+ # [batch, key_tokens]
124
+ # adds singleton query_tokens dimension:
125
+ # [batch, 1, key_tokens]
126
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
127
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
128
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
129
+ if attention_mask is not None:
130
+ # assume that mask is expressed as:
131
+ # (1 = keep, 0 = discard)
132
+ # convert mask into a bias that can be added to attention scores:
133
+ # (keep = +0, discard = -10000.0)
134
+ attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
135
+ attention_mask = attention_mask.unsqueeze(1)
136
+
137
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
138
+ if encoder_attention_mask is not None:
139
+ encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
140
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
141
+
142
+ # 0. center input if necessary
143
+ if self.config.center_input_sample:
144
+ sample = 2 * sample - 1.0
145
+
146
+ # 1. time
147
+ timesteps = timestep
148
+ if not torch.is_tensor(timesteps):
149
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
150
+ # This would be a good case for the `match` statement (Python 3.10+)
151
+ is_mps = sample.device.type == "mps"
152
+ if isinstance(timestep, float):
153
+ dtype = torch.float32 if is_mps else torch.float64
154
+ else:
155
+ dtype = torch.int32 if is_mps else torch.int64
156
+ timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
157
+ elif len(timesteps.shape) == 0:
158
+ timesteps = timesteps[None].to(sample.device)
159
+
160
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
161
+ timesteps = timesteps.expand(sample.shape[0])
162
+
163
+ t_emb = self.time_proj(timesteps)
164
+
165
+ # `Timesteps` does not contain any weights and will always return f32 tensors
166
+ # but time_embedding might actually be running in fp16. so we need to cast here.
167
+ # there might be better ways to encapsulate this.
168
+ t_emb = t_emb.to(dtype=sample.dtype)
169
+
170
+ emb = self.time_embedding(t_emb, timestep_cond)
171
+ aug_emb = None
172
+
173
+ if self.class_embedding is not None:
174
+ if class_labels is None:
175
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
176
+
177
+ if self.config.class_embed_type == "timestep":
178
+ class_labels = self.time_proj(class_labels)
179
+
180
+ # `Timesteps` does not contain any weights and will always return f32 tensors
181
+ # there might be better ways to encapsulate this.
182
+ class_labels = class_labels.to(dtype=sample.dtype)
183
+
184
+ class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
185
+
186
+ if self.config.class_embeddings_concat:
187
+ emb = torch.cat([emb, class_emb], dim=-1)
188
+ else:
189
+ emb = emb + class_emb
190
+
191
+ if self.config.addition_embed_type == "text":
192
+ aug_emb = self.add_embedding(encoder_hidden_states)
193
+ elif self.config.addition_embed_type == "text_image":
194
+ # Kandinsky 2.1 - style
195
+ if "image_embeds" not in added_cond_kwargs:
196
+ raise ValueError(
197
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
198
+ )
199
+
200
+ image_embs = added_cond_kwargs.get("image_embeds")
201
+ text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
202
+ aug_emb = self.add_embedding(text_embs, image_embs)
203
+ elif self.config.addition_embed_type == "text_time":
204
+ # SDXL - style
205
+ if "text_embeds" not in added_cond_kwargs:
206
+ raise ValueError(
207
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
208
+ )
209
+ text_embeds = added_cond_kwargs.get("text_embeds")
210
+ if "time_ids" not in added_cond_kwargs:
211
+ raise ValueError(
212
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
213
+ )
214
+ time_ids = added_cond_kwargs.get("time_ids")
215
+ time_embeds = self.add_time_proj(time_ids.flatten())
216
+ time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
217
+ add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
218
+ add_embeds = add_embeds.to(emb.dtype)
219
+ aug_emb = self.add_embedding(add_embeds)
220
+ elif self.config.addition_embed_type == "image":
221
+ # Kandinsky 2.2 - style
222
+ if "image_embeds" not in added_cond_kwargs:
223
+ raise ValueError(
224
+ f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
225
+ )
226
+ image_embs = added_cond_kwargs.get("image_embeds")
227
+ aug_emb = self.add_embedding(image_embs)
228
+ elif self.config.addition_embed_type == "image_hint":
229
+ # Kandinsky 2.2 - style
230
+ if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
231
+ raise ValueError(
232
+ f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
233
+ )
234
+ image_embs = added_cond_kwargs.get("image_embeds")
235
+ hint = added_cond_kwargs.get("hint")
236
+ aug_emb, hint = self.add_embedding(image_embs, hint)
237
+ sample = torch.cat([sample, hint], dim=1)
238
+
239
+ emb = emb + aug_emb if aug_emb is not None else emb
240
+
241
+ if self.time_embed_act is not None:
242
+ emb = self.time_embed_act(emb)
243
+
244
+ if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
245
+ encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
246
+ elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
247
+ # Kandinsky 2.1 - style
248
+ if "image_embeds" not in added_cond_kwargs:
249
+ raise ValueError(
250
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
251
+ )
252
+
253
+ image_embeds = added_cond_kwargs.get("image_embeds")
254
+ encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
255
+ elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
256
+ # Kandinsky 2.2 - style
257
+ if "image_embeds" not in added_cond_kwargs:
258
+ raise ValueError(
259
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
260
+ )
261
+ image_embeds = added_cond_kwargs.get("image_embeds")
262
+ encoder_hidden_states = self.encoder_hid_proj(image_embeds)
263
+ elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj":
264
+ if "image_embeds" not in added_cond_kwargs:
265
+ raise ValueError(
266
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
267
+ )
268
+ image_embeds = added_cond_kwargs.get("image_embeds")
269
+ image_embeds = self.encoder_hid_proj(image_embeds).to(encoder_hidden_states.dtype)
270
+ encoder_hidden_states = torch.cat([encoder_hidden_states, image_embeds], dim=1)
271
+
272
+ # 2. pre-process
273
+ sample = self.conv_in(sample)
274
+
275
+ # 2.5 GLIGEN position net
276
+ if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
277
+ cross_attention_kwargs = cross_attention_kwargs.copy()
278
+ gligen_args = cross_attention_kwargs.pop("gligen")
279
+ cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}
280
+
281
+ # 3. down
282
+ lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
283
+ if USE_PEFT_BACKEND:
284
+ # weight the lora layers by setting `lora_scale` for each PEFT layer
285
+ scale_lora_layers(self, lora_scale)
286
+
287
+ is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
288
+ # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
289
+ is_adapter = down_intrablock_additional_residuals is not None
290
+ # maintain backward compatibility for legacy usage, where
291
+ # T2I-Adapter and ControlNet both use down_block_additional_residuals arg
292
+ # but can only use one or the other
293
+ if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None:
294
+ deprecate(
295
+ "T2I should not use down_block_additional_residuals",
296
+ "1.3.0",
297
+ "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \
298
+ and will be removed in diffusers 1.3.0. `down_block_additional_residuals` should only be used \
299
+ for ControlNet. Please make sure to use `down_intrablock_additional_residuals` instead. ",
300
+ standard_warn=False,
301
+ )
302
+ down_intrablock_additional_residuals = down_block_additional_residuals
303
+ is_adapter = True
304
+
305
+ down_block_res_samples = (sample,)
306
+ for downsample_block in self.down_blocks:
307
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
308
+ # For t2i-adapter CrossAttnDownBlock2D
309
+ additional_residuals = {}
310
+ if is_adapter and len(down_intrablock_additional_residuals) > 0:
311
+ additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)
312
+
313
+ sample, res_samples = downsample_block(
314
+ hidden_states=sample,
315
+ temb=emb,
316
+ encoder_hidden_states=encoder_hidden_states,
317
+ attention_mask=attention_mask,
318
+ cross_attention_kwargs=cross_attention_kwargs,
319
+ encoder_attention_mask=encoder_attention_mask,
320
+ **additional_residuals,
321
+ )
322
+ else:
323
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb, scale=lora_scale)
324
+ if is_adapter and len(down_intrablock_additional_residuals) > 0:
325
+ sample += down_intrablock_additional_residuals.pop(0)
326
+
327
+ down_block_res_samples += res_samples
328
+
329
+ if is_controlnet:
330
+ new_down_block_res_samples = ()
331
+
332
+ for down_block_res_sample, down_block_additional_residual in zip(
333
+ down_block_res_samples, down_block_additional_residuals
334
+ ):
335
+ down_block_res_sample = down_block_res_sample + down_block_additional_residual
336
+ new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
337
+
338
+ down_block_res_samples = new_down_block_res_samples
339
+
340
+ # 4. mid
341
+ if self.mid_block is not None:
342
+ if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
343
+ sample = self.mid_block(
344
+ sample,
345
+ emb,
346
+ encoder_hidden_states=encoder_hidden_states,
347
+ attention_mask=attention_mask,
348
+ cross_attention_kwargs=cross_attention_kwargs,
349
+ encoder_attention_mask=encoder_attention_mask,
350
+ )
351
+ else:
352
+ sample = self.mid_block(sample, emb)
353
+
354
+ # To support T2I-Adapter-XL
355
+ if (
356
+ is_adapter
357
+ and len(down_intrablock_additional_residuals) > 0
358
+ and sample.shape == down_intrablock_additional_residuals[0].shape
359
+ ):
360
+ sample += down_intrablock_additional_residuals.pop(0)
361
+
362
+ if is_controlnet:
363
+ sample = sample + mid_block_additional_residual
364
+
365
+ multi_level_feats = []
366
+ # 1, 1280, 24, 24
367
+ # multi_level_feats.append(sample) # 1/64
368
+ # 5. up
369
+ for i, upsample_block in enumerate(self.up_blocks):
370
+ is_final_block = i == len(self.up_blocks) - 1
371
+
372
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
373
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
374
+
375
+ # if we have not reached the final block and need to forward the
376
+ # upsample size, we do it here
377
+ if not is_final_block and forward_upsample_size:
378
+ upsample_size = down_block_res_samples[-1].shape[2:]
379
+
380
+ if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
381
+ sample = upsample_block(
382
+ hidden_states=sample,
383
+ temb=emb,
384
+ res_hidden_states_tuple=res_samples,
385
+ encoder_hidden_states=encoder_hidden_states,
386
+ cross_attention_kwargs=cross_attention_kwargs,
387
+ upsample_size=upsample_size,
388
+ attention_mask=attention_mask,
389
+ encoder_attention_mask=encoder_attention_mask,
390
+ )
391
+ else:
392
+ sample = upsample_block(
393
+ hidden_states=sample,
394
+ temb=emb,
395
+ res_hidden_states_tuple=res_samples,
396
+ upsample_size=upsample_size,
397
+ scale=lora_scale,
398
+ )
399
+ # if not is_final_block:
400
+ multi_level_feats.append(sample)
401
+
402
+ if return_feature:
403
+ if USE_PEFT_BACKEND:
404
+ # remove `lora_scale` from each PEFT layer
405
+ unscale_lora_layers(self, lora_scale)
406
+ return CustomUNet2DConditionOutput(
407
+ multi_level_feats=multi_level_feats,
408
+ )
409
+
410
+ # 6. post-process
411
+ if self.conv_norm_out:
412
+ sample = self.conv_norm_out(sample)
413
+ sample = self.conv_act(sample)
414
+
415
+ sample = self.conv_out(sample)
416
+
417
+ if USE_PEFT_BACKEND:
418
+ # remove `lora_scale` from each PEFT layer
419
+ unscale_lora_layers(self, lora_scale)
420
+
421
+ if not return_dict:
422
+ return (sample,)
423
+
424
+ return CustomUNet2DConditionOutput(
425
+ sample=sample,
426
+ multi_level_feats=multi_level_feats,
427
+ )
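For orientation, the sketch below shows how this modified forward pass could be driven to collect the decoder features. It is a minimal, hypothetical usage sketch: the helper name and tensor shapes are assumptions; only the `return_feature` flag and the `multi_level_feats` output field come from the code above.

```python
import torch


def extract_unet_features(unet, rgb_latent, empty_text_embed, timestep=1):
    """Hypothetical helper: run the customized UNet once and collect its up-block features.

    Assumptions: `unet` is an instance of the modified UNet defined in this file,
    `rgb_latent` is a VAE-encoded image of shape [B, 4, h, w], and `empty_text_embed`
    is the cached empty-prompt embedding of shape [B, 77, 1024].
    """
    t = torch.tensor([timestep], device=rgb_latent.device)
    with torch.no_grad():
        out = unet(
            rgb_latent,
            t,
            encoder_hidden_states=empty_text_embed,
            return_feature=True,  # skip conv_out and return the decoder feature pyramid
        )
    # One feature map per up block, appended in the order of the loop above.
    return out.multi_level_feats
```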
genpercept/models/dpt_head.py ADDED
@@ -0,0 +1,593 @@
1
+ # --------------------------------------------------------
2
+ # What Matters When Repurposing Diffusion Models for General Dense Perception Tasks? (https://arxiv.org/abs/2403.06090)
3
+ # Github source: https://github.com/aim-uofa/GenPercept
4
+ # Copyright (c) 2024, Advanced Intelligent Machines (AIM)
5
+ # Licensed under The BSD 2-Clause License [see LICENSE for details]
6
+ # By Guangkai Xu
7
+ # Based on diffusers codebases
8
+ # https://github.com/huggingface/diffusers
9
+ # --------------------------------------------------------
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ from typing import List, Optional, Tuple, Union
14
+ from transformers import DPTPreTrainedModel
15
+
16
+ from transformers.utils import ModelOutput
17
+ from transformers.file_utils import replace_return_docstrings, add_start_docstrings_to_model_forward
18
+ from transformers.models.dpt.modeling_dpt import DPTReassembleStage
19
+
20
+ from diffusers.models.lora import LoRACompatibleConv
21
+ from diffusers.utils import USE_PEFT_BACKEND
22
+ import torch.nn.functional as F
23
+
24
+ class DepthEstimatorOutput(ModelOutput):
25
+ """
26
+ Base class for outputs of depth estimation models.
27
+
28
+ Args:
29
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
30
+ Classification (or regression if config.num_labels==1) loss.
31
+ prediction (`torch.FloatTensor` of shape `(batch_size, height, width)`):
32
+ Predicted depth for each pixel.
33
+
34
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
35
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
36
+ one for the output of each layer) of shape `(batch_size, num_channels, height, width)`.
37
+
38
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
39
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
40
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
41
+ sequence_length)`.
42
+
43
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
44
+ heads.
45
+ """
46
+
47
+ loss: Optional[torch.FloatTensor] = None
48
+ prediction: torch.FloatTensor = None
49
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
50
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
51
+
52
+ class DPTDepthEstimationHead(nn.Module):
53
+ """
54
+ Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
55
+ the predictions to the input resolution after the first convolutional layer (details can be found in the paper's
56
+ supplementary material).
57
+ """
58
+
59
+ def __init__(self, config):
60
+ super().__init__()
61
+
62
+ self.config = config
63
+
64
+ self.projection = None
65
+ features = config.fusion_hidden_size
66
+ if config.add_projection:
67
+ self.projection = nn.Conv2d(features, features, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
68
+
69
+ self.head = nn.Sequential(
70
+ nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1),
71
+ nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True),
72
+ nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
73
+ nn.ReLU(),
74
+ nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
75
+ nn.ReLU(),
76
+ )
77
+
78
+ def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor:
79
+ # use last features
80
+ hidden_states = hidden_states[self.config.head_in_index]
81
+
82
+ if self.projection is not None:
83
+ hidden_states = self.projection(hidden_states)
84
+ hidden_states = nn.ReLU()(hidden_states)
85
+
86
+ predicted_depth = self.head(hidden_states)
87
+
88
+ predicted_depth = predicted_depth.squeeze(dim=1)
89
+
90
+ return predicted_depth
91
+
92
+ class Upsample2D(nn.Module):
93
+ """A 2D upsampling layer with an optional convolution.
94
+
95
+ Parameters:
96
+ channels (`int`):
97
+ number of channels in the inputs and outputs.
98
+ use_conv (`bool`, default `False`):
99
+ option to use a convolution.
100
+ use_conv_transpose (`bool`, default `False`):
101
+ option to use a convolution transpose.
102
+ out_channels (`int`, optional):
103
+ number of output channels. Defaults to `channels`.
104
+ name (`str`, default `conv`):
105
+ name of the upsampling 2D layer.
106
+ """
107
+
108
+ def __init__(
109
+ self,
110
+ channels: int,
111
+ use_conv: bool = False,
112
+ use_conv_transpose: bool = False,
113
+ out_channels: Optional[int] = None,
114
+ name: str = "conv",
115
+ kernel_size: Optional[int] = None,
116
+ padding=1,
117
+ norm_type=None,
118
+ eps=None,
119
+ elementwise_affine=None,
120
+ bias=True,
121
+ interpolate=True,
122
+ ):
123
+ super().__init__()
124
+ self.channels = channels
125
+ self.out_channels = out_channels or channels
126
+ self.use_conv = use_conv
127
+ self.use_conv_transpose = use_conv_transpose
128
+ self.name = name
129
+ self.interpolate = interpolate
130
+ conv_cls = nn.Conv2d if USE_PEFT_BACKEND else LoRACompatibleConv
131
+
132
+ if norm_type == "ln_norm":
133
+ self.norm = nn.LayerNorm(channels, eps, elementwise_affine)
134
+ elif norm_type == "rms_norm":
135
+ # self.norm = RMSNorm(channels, eps, elementwise_affine)
136
+ raise NotImplementedError
137
+ elif norm_type is None:
138
+ self.norm = None
139
+ else:
140
+ raise ValueError(f"unknown norm_type: {norm_type}")
141
+
142
+ conv = None
143
+ if use_conv_transpose:
144
+ if kernel_size is None:
145
+ kernel_size = 4
146
+ conv = nn.ConvTranspose2d(
147
+ channels, self.out_channels, kernel_size=kernel_size, stride=2, padding=padding, bias=bias
148
+ )
149
+ elif use_conv:
150
+ if kernel_size is None:
151
+ kernel_size = 3
152
+ conv = conv_cls(self.channels, self.out_channels, kernel_size=kernel_size, padding=padding, bias=bias)
153
+
154
+ # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
155
+ if name == "conv":
156
+ self.conv = conv
157
+ else:
158
+ self.Conv2d_0 = conv
159
+
160
+ def forward(
161
+ self,
162
+ hidden_states: torch.FloatTensor,
163
+ output_size: Optional[int] = None,
164
+ scale: float = 1.0,
165
+ ) -> torch.FloatTensor:
166
+ assert hidden_states.shape[1] == self.channels
167
+
168
+ if self.norm is not None:
169
+ hidden_states = self.norm(hidden_states.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
170
+
171
+ if self.use_conv_transpose:
172
+ return self.conv(hidden_states)
173
+
174
+ # Cast to float32 as the 'upsample_nearest2d_out_frame' op does not support bfloat16
175
+ # TODO(Suraj): Remove this cast once the issue is fixed in PyTorch
176
+ # https://github.com/pytorch/pytorch/issues/86679
177
+ dtype = hidden_states.dtype
178
+ if dtype == torch.bfloat16:
179
+ hidden_states = hidden_states.to(torch.float32)
180
+
181
+ # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
182
+ if hidden_states.shape[0] >= 64:
183
+ hidden_states = hidden_states.contiguous()
184
+
185
+ # if `output_size` is passed we force the interpolation output
186
+ # size and do not make use of `scale_factor=2`
187
+ if self.interpolate:
188
+ if output_size is None:
189
+ hidden_states = F.interpolate(hidden_states, scale_factor=2.0, mode="nearest")
190
+ else:
191
+ hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest")
192
+
193
+ # If the input is bfloat16, we cast back to bfloat16
194
+ if dtype == torch.bfloat16:
195
+ hidden_states = hidden_states.to(dtype)
196
+
197
+ # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
198
+ if self.use_conv:
199
+ if self.name == "conv":
200
+ if isinstance(self.conv, LoRACompatibleConv) and not USE_PEFT_BACKEND:
201
+ hidden_states = self.conv(hidden_states, scale)
202
+ else:
203
+ hidden_states = self.conv(hidden_states)
204
+ else:
205
+ if isinstance(self.Conv2d_0, LoRACompatibleConv) and not USE_PEFT_BACKEND:
206
+ hidden_states = self.Conv2d_0(hidden_states, scale)
207
+ else:
208
+ hidden_states = self.Conv2d_0(hidden_states)
209
+
210
+ return hidden_states
211
+
212
+
213
+ class DPTPreActResidualLayer(nn.Module):
214
+ """
215
+ ResidualConvUnit, pre-activate residual unit.
216
+
217
+ Args:
218
+ config (`[DPTConfig]`):
219
+ Model configuration class defining the model architecture.
220
+ """
221
+
222
+ def __init__(self, config):
223
+ super().__init__()
224
+
225
+ self.use_batch_norm = config.use_batch_norm_in_fusion_residual
226
+ use_bias_in_fusion_residual = (
227
+ config.use_bias_in_fusion_residual
228
+ if config.use_bias_in_fusion_residual is not None
229
+ else not self.use_batch_norm
230
+ )
231
+
232
+ self.activation1 = nn.ReLU()
233
+ self.convolution1 = nn.Conv2d(
234
+ config.fusion_hidden_size,
235
+ config.fusion_hidden_size,
236
+ kernel_size=3,
237
+ stride=1,
238
+ padding=1,
239
+ bias=use_bias_in_fusion_residual,
240
+ )
241
+
242
+ self.activation2 = nn.ReLU()
243
+ self.convolution2 = nn.Conv2d(
244
+ config.fusion_hidden_size,
245
+ config.fusion_hidden_size,
246
+ kernel_size=3,
247
+ stride=1,
248
+ padding=1,
249
+ bias=use_bias_in_fusion_residual,
250
+ )
251
+
252
+ if self.use_batch_norm:
253
+ self.batch_norm1 = nn.BatchNorm2d(config.fusion_hidden_size)
254
+ self.batch_norm2 = nn.BatchNorm2d(config.fusion_hidden_size)
255
+
256
+ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
257
+ residual = hidden_state.clone()
258
+ hidden_state = self.activation1(hidden_state)
259
+
260
+ hidden_state = self.convolution1(hidden_state)
261
+
262
+ if self.use_batch_norm:
263
+ hidden_state = self.batch_norm1(hidden_state)
264
+
265
+ hidden_state = self.activation2(hidden_state)
266
+ hidden_state = self.convolution2(hidden_state)
267
+
268
+ if self.use_batch_norm:
269
+ hidden_state = self.batch_norm2(hidden_state)
270
+
271
+ return hidden_state + residual
272
+
273
+
274
+ class DPTFeatureFusionLayer(nn.Module):
275
+ """Feature fusion layer, merges feature maps from different stages.
276
+
277
+ Args:
278
+ config (`[DPTConfig]`):
279
+ Model configuration class defining the model architecture.
280
+ align_corners (`bool`, *optional*, defaults to `True`):
281
+ The align_corners setting for bilinear upsampling.
282
+ """
283
+
284
+ def __init__(self, config, align_corners=True, with_residual_1=True):
285
+ super().__init__()
286
+
287
+ self.align_corners = align_corners
288
+
289
+ self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True)
290
+
291
+ if with_residual_1:
292
+ self.residual_layer1 = DPTPreActResidualLayer(config)
293
+ self.residual_layer2 = DPTPreActResidualLayer(config)
294
+
295
+ def forward(self, hidden_state, residual=None):
296
+ if residual is not None:
297
+ if hidden_state.shape != residual.shape:
298
+ residual = nn.functional.interpolate(
299
+ residual, size=(hidden_state.shape[2], hidden_state.shape[3]), mode="bilinear", align_corners=False
300
+ )
301
+ hidden_state = hidden_state + self.residual_layer1(residual)
302
+
303
+ hidden_state = self.residual_layer2(hidden_state)
304
+ hidden_state = nn.functional.interpolate(
305
+ hidden_state, scale_factor=2, mode="bilinear", align_corners=self.align_corners
306
+ )
307
+ hidden_state = self.projection(hidden_state)
308
+
309
+ return hidden_state
310
+
311
+
312
+ class DPTFeatureFusionStage(nn.Module):
313
+ def __init__(self, config):
314
+ super().__init__()
315
+ self.layers = nn.ModuleList()
316
+ for i in range(len(config.neck_hidden_sizes)):
317
+ if i == 0:
318
+ self.layers.append(DPTFeatureFusionLayer(config, with_residual_1=False))
319
+ else:
320
+ self.layers.append(DPTFeatureFusionLayer(config))
321
+
322
+ def forward(self, hidden_states):
323
+ # reversing the hidden_states, we start from the last
324
+ hidden_states = hidden_states[::-1]
325
+
326
+ fused_hidden_states = []
327
+ # first layer only uses the last hidden_state
328
+ fused_hidden_state = self.layers[0](hidden_states[0])
329
+ fused_hidden_states.append(fused_hidden_state)
330
+ # looping from the last layer to the second
331
+ for hidden_state, layer in zip(hidden_states[1:], self.layers[1:]):
332
+ fused_hidden_state = layer(fused_hidden_state, hidden_state)
333
+ fused_hidden_states.append(fused_hidden_state)
334
+
335
+ return fused_hidden_states
336
+
337
+
338
+ class DPTNeck(nn.Module):
339
+ """
340
+ DPTNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as
341
+ input and produces another list of tensors as output. For DPT, it includes 2 stages:
342
+
343
+ * DPTReassembleStage
344
+ * DPTFeatureFusionStage.
345
+
346
+ Args:
347
+ config (dict): config dict.
348
+ """
349
+
350
+ def __init__(self, config):
351
+ super().__init__()
352
+ self.config = config
353
+
354
+ # postprocessing: only required in case of a non-hierarchical backbone (e.g. ViT, BEiT)
355
+ if config.backbone_config is not None and config.backbone_config.model_type in ["swinv2"]:
356
+ self.reassemble_stage = None
357
+ else:
358
+ self.reassemble_stage = DPTReassembleStage(config)
359
+
360
+ self.convs = nn.ModuleList()
361
+ for channel in config.neck_hidden_sizes:
362
+ self.convs.append(nn.Conv2d(channel, config.fusion_hidden_size, kernel_size=3, padding=1, bias=False))
363
+
364
+ # fusion
365
+ self.fusion_stage = DPTFeatureFusionStage(config)
366
+
367
+ def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_width=None) -> List[torch.Tensor]:
368
+ """
369
+ Args:
370
+ hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`):
371
+ List of hidden states from the backbone.
372
+ """
373
+ if not isinstance(hidden_states, (tuple, list)):
374
+ raise TypeError("hidden_states should be a tuple or list of tensors")
375
+
376
+ if len(hidden_states) != len(self.config.neck_hidden_sizes):
377
+ raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.")
378
+
379
+ # postprocess hidden states
380
+ if self.reassemble_stage is not None:
381
+ hidden_states = self.reassemble_stage(hidden_states, patch_height, patch_width)
382
+
383
+ features = [self.convs[i](feature) for i, feature in enumerate(hidden_states)]
384
+
385
+ # fusion blocks
386
+ output = self.fusion_stage(features)
387
+
388
+ return output
389
+
390
+
391
+ DPT_INPUTS_DOCSTRING = r"""
392
+ Args:
393
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
394
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`]
395
+ for details.
396
+
397
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
398
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
399
+
400
+ - 1 indicates the head is **not masked**,
401
+ - 0 indicates the head is **masked**.
402
+
403
+ output_attentions (`bool`, *optional*):
404
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
405
+ tensors for more detail.
406
+ output_hidden_states (`bool`, *optional*):
407
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
408
+ more detail.
409
+ return_dict (`bool`, *optional*):
410
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
411
+ """
412
+
413
+ _CONFIG_FOR_DOC = "DPTConfig"
414
+
415
+
416
+ class DPTNeckHeadForUnetAfterUpsample(DPTPreTrainedModel):
417
+ def __init__(self, config):
418
+ super().__init__(config)
419
+
420
+ # self.backbone = None
421
+ # if config.backbone_config is not None and config.is_hybrid is False:
422
+ # self.backbone = load_backbone(config)
423
+ # else:
424
+ # self.dpt = DPTModel(config, add_pooling_layer=False)
425
+
426
+ self.feature_upsample_0 = Upsample2D(channels=config.neck_hidden_sizes[0], use_conv=True)
427
+ # self.feature_upsample_1 = Upsample2D(channels=config.neck_hidden_sizes[1], use_conv=True)
428
+ # self.feature_upsample_2 = Upsample2D(channels=config.neck_hidden_sizes[2], use_conv=True)
429
+ # self.feature_upsample_3 = Upsample2D(channels=config.neck_hidden_sizes[3], use_conv=True)
430
+
431
+ # Neck
432
+ self.neck = DPTNeck(config)
433
+ self.neck.reassemble_stage = None
434
+
435
+ # Depth estimation head
436
+ self.head = DPTDepthEstimationHead(config)
437
+
438
+ # Initialize weights and apply final processing
439
+ self.post_init()
440
+
441
+ @add_start_docstrings_to_model_forward(DPT_INPUTS_DOCSTRING)
442
+ @replace_return_docstrings(output_type=DepthEstimatorOutput, config_class=_CONFIG_FOR_DOC)
443
+ def forward(
444
+ self,
445
+ hidden_states,
446
+ head_mask: Optional[torch.FloatTensor] = None,
447
+ labels: Optional[torch.LongTensor] = None,
448
+ output_attentions: Optional[bool] = None,
449
+ output_hidden_states: Optional[bool] = None,
450
+ return_depth_only: bool = False,
451
+ return_dict: Optional[bool] = None,
452
+ ) -> Union[Tuple[torch.Tensor], DepthEstimatorOutput]:
453
+ r"""
454
+ labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
455
+ Ground truth depth estimation maps for computing the loss.
456
+
457
+ Returns:
458
+
459
+ Examples:
460
+ ```python
461
+ >>> from transformers import AutoImageProcessor, DPTForDepthEstimation
462
+ >>> import torch
463
+ >>> import numpy as np
464
+ >>> from PIL import Image
465
+ >>> import requests
466
+
467
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
468
+ >>> image = Image.open(requests.get(url, stream=True).raw)
469
+
470
+ >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large")
471
+ >>> model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")
472
+
473
+ >>> # prepare image for the model
474
+ >>> inputs = image_processor(images=image, return_tensors="pt")
475
+
476
+ >>> with torch.no_grad():
477
+ ... outputs = model(**inputs)
478
+ ... predicted_depth = outputs.predicted_depth
479
+
480
+ >>> # interpolate to original size
481
+ >>> prediction = torch.nn.functional.interpolate(
482
+ ... predicted_depth.unsqueeze(1),
483
+ ... size=image.size[::-1],
484
+ ... mode="bicubic",
485
+ ... align_corners=False,
486
+ ... )
487
+
488
+ >>> # visualize the prediction
489
+ >>> output = prediction.squeeze().cpu().numpy()
490
+ >>> formatted = (output * 255 / np.max(output)).astype("uint8")
491
+ >>> depth = Image.fromarray(formatted)
492
+ ```"""
493
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
494
+ output_hidden_states = (
495
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
496
+ )
497
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
498
+
499
+ # if self.backbone is not None:
500
+ # outputs = self.backbone.forward_with_filtered_kwargs(
501
+ # pixel_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions
502
+ # )
503
+ # hidden_states = outputs.feature_maps
504
+ # else:
505
+ # outputs = self.dpt(
506
+ # pixel_values,
507
+ # head_mask=head_mask,
508
+ # output_attentions=output_attentions,
509
+ # output_hidden_states=True, # we need the intermediate hidden states
510
+ # return_dict=return_dict,
511
+ # )
512
+ # hidden_states = outputs.hidden_states if return_dict else outputs[1]
513
+ # # only keep certain features based on config.backbone_out_indices
514
+ # # note that the hidden_states also include the initial embeddings
515
+ # if not self.config.is_hybrid:
516
+ # hidden_states = [
517
+ # feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices
518
+ # ]
519
+ # else:
520
+ # backbone_hidden_states = outputs.intermediate_activations if return_dict else list(outputs[-1])
521
+ # backbone_hidden_states.extend(
522
+ # feature
523
+ # for idx, feature in enumerate(hidden_states[1:])
524
+ # if idx in self.config.backbone_out_indices[2:]
525
+ # )
526
+
527
+ # hidden_states = backbone_hidden_states
528
+
529
+
530
+ assert len(hidden_states) == 4
531
+
532
+ # upsample hidden_states for unet
533
+ # hidden_states = [getattr(self, "feature_upsample_%s" %i)(hidden_states[i]) for i in range(len(hidden_states))]
534
+ hidden_states[0] = self.feature_upsample_0(hidden_states[0])
535
+
536
+ patch_height, patch_width = None, None
537
+ if self.config.backbone_config is not None and self.config.is_hybrid is False:
538
+ _, _, height, width = hidden_states[3].shape
539
+ height *= 8; width *= 8
540
+ patch_size = self.config.backbone_config.patch_size
541
+ patch_height = height // patch_size
542
+ patch_width = width // patch_size
543
+
544
+ hidden_states = self.neck(hidden_states, patch_height, patch_width)
545
+
546
+ predicted_depth = self.head(hidden_states)
547
+
548
+ loss = None
549
+ if labels is not None:
550
+ raise NotImplementedError("Training is not implemented yet")
551
+
552
+ if return_depth_only:
553
+ return predicted_depth
554
+
555
+ return DepthEstimatorOutput(
556
+ loss=loss,
557
+ prediction=predicted_depth,
558
+ hidden_states=None,
559
+ attentions=None,
560
+ )
561
+
562
+
563
+
564
+ class DPTDepthEstimationHeadIdentity(DPTDepthEstimationHead):
565
+ """
566
+ Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
567
+ the predictions to the input resolution after the first convolutional layer (details can be found in the paper's
568
+ supplementary material).
569
+ """
570
+
571
+ def __init__(self, config):
572
+ super().__init__(config)
573
+
574
+ features = config.fusion_hidden_size
575
+ self.head = nn.Sequential(
576
+ nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1),
577
+ nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True),
578
+ nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
579
+ nn.ReLU(),
580
+ nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
581
+ nn.Identity(),
582
+ )
583
+
584
+
585
+ class DPTNeckHeadForUnetAfterUpsampleIdentity(DPTNeckHeadForUnetAfterUpsample):
586
+ def __init__(self, config):
587
+ super().__init__(config)
588
+
589
+ # Depth estimation head
590
+ self.head = DPTDepthEstimationHeadIdentity(config)
591
+
592
+ # Initialize weights and apply final processing
593
+ self.post_init()
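A hedged usage sketch for the head defined above. The config directory refers to the `hf_configs` folder added later in this commit, the import path mirrors this new file's location, and the feature shapes are illustrative stand-ins for the four UNet decoder maps (channel counts follow `neck_hidden_sizes`).

```python
import torch
from transformers import DPTConfig

from genpercept.models.dpt_head import DPTNeckHeadForUnetAfterUpsample

# Config added later in this commit; neck_hidden_sizes = [320, 640, 1280, 1280].
config = DPTConfig.from_pretrained("hf_configs/dpt-sd2.1-unet-after-upsample-general")
head = DPTNeckHeadForUnetAfterUpsample(config)

# Illustrative decoder features, ordered finest to coarsest to match the neck convolutions.
feats = [
    torch.randn(1, 320, 96, 96),
    torch.randn(1, 640, 48, 48),
    torch.randn(1, 1280, 24, 24),
    torch.randn(1, 1280, 12, 12),
]
with torch.no_grad():
    depth = head(feats, return_depth_only=True)
print(depth.shape)  # e.g. torch.Size([1, 384, 384])
```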
{util → genpercept/util}/batchsize.py RENAMED
@@ -1,3 +1,23 @@
1
  import torch
2
  import math
3
 
@@ -33,11 +53,13 @@ def find_batch_size(ensemble_size: int, input_res: int, dtype: torch.dtype) -> i
33
  Automatically search for suitable operating batch size.
34
 
35
  Args:
36
- ensemble_size (int): Number of predictions to be ensembled
37
- input_res (int): Operating resolution of the input image.
 
 
38
 
39
  Returns:
40
- int: Operating batch size
41
  """
42
  if not torch.cuda.is_available():
43
  return 1
@@ -56,4 +78,4 @@ def find_batch_size(ensemble_size: int, input_res: int, dtype: torch.dtype) -> i
56
  bs = math.ceil(ensemble_size / 2)
57
  return bs
58
 
59
- return 1
 
1
+ # Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # --------------------------------------------------------------------------
15
+ # If you find this code useful, we kindly ask you to cite our paper in your work.
16
+ # Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
17
+ # More information about the method can be found at https://marigoldmonodepth.github.io
18
+ # --------------------------------------------------------------------------
19
+
20
+
21
  import torch
22
  import math
23
 
 
53
  Automatically search for suitable operating batch size.
54
 
55
  Args:
56
+ ensemble_size (`int`):
57
+ Number of predictions to be ensembled.
58
+ input_res (`int`):
59
+ Operating resolution of the input image.
60
 
61
  Returns:
62
+ `int`: Operating batch size.
63
  """
64
  if not torch.cuda.is_available():
65
  return 1
 
78
  bs = math.ceil(ensemble_size / 2)
79
  return bs
80
 
81
+ return 1
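A small usage sketch for the helper above; the import path follows the `util/` → `genpercept/util/` rename in this commit.

```python
import torch

from genpercept.util.batchsize import find_batch_size

# Pick an operating batch size for 10 ensemble members at a 768 px processing resolution.
bs = find_batch_size(ensemble_size=10, input_res=768, dtype=torch.float16)
print(f"chunking the ensemble into batches of {bs}")
```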
genpercept/util/ensemble.py ADDED
@@ -0,0 +1,205 @@
1
+ # Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # --------------------------------------------------------------------------
15
+ # If you find this code useful, we kindly ask you to cite our paper in your work.
16
+ # Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
17
+ # More information about the method can be found at https://marigoldmonodepth.github.io
18
+ # --------------------------------------------------------------------------
19
+
20
+
21
+ from functools import partial
22
+ from typing import Optional, Tuple
23
+
24
+ import numpy as np
25
+ import torch
26
+
27
+ from .image_util import get_tv_resample_method, resize_max_res
28
+
29
+
30
+ def inter_distances(tensors: torch.Tensor):
31
+ """
32
+ To calculate the distance between each two depth maps.
33
+ """
34
+ distances = []
35
+ for i, j in torch.combinations(torch.arange(tensors.shape[0])):
36
+ arr1 = tensors[i : i + 1]
37
+ arr2 = tensors[j : j + 1]
38
+ distances.append(arr1 - arr2)
39
+ dist = torch.concatenate(distances, dim=0)
40
+ return dist
41
+
42
+
43
+ def ensemble_depth(
44
+ depth: torch.Tensor,
45
+ scale_invariant: bool = True,
46
+ shift_invariant: bool = True,
47
+ output_uncertainty: bool = False,
48
+ reduction: str = "median",
49
+ regularizer_strength: float = 0.02,
50
+ max_iter: int = 2,
51
+ tol: float = 1e-3,
52
+ max_res: int = 1024,
53
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
54
+ """
55
+ Ensembles depth maps represented by the `depth` tensor with expected shape `(B, 1, H, W)`, where B is the
56
+ number of ensemble members for a given prediction of size `(H x W)`. Even though the function is designed for
57
+ depth maps, it can also be used with disparity maps as long as the input tensor values are non-negative. The
58
+ alignment happens when the predictions have one or more degrees of freedom, that is when they are either
59
+ affine-invariant (`scale_invariant=True` and `shift_invariant=True`), or just scale-invariant (only
60
+ `scale_invariant=True`). For absolute predictions (`scale_invariant=False` and `shift_invariant=False`)
61
+ alignment is skipped and only ensembling is performed.
62
+
63
+ Args:
64
+ depth (`torch.Tensor`):
65
+ Input ensemble depth maps.
66
+ scale_invariant (`bool`, *optional*, defaults to `True`):
67
+ Whether to treat predictions as scale-invariant.
68
+ shift_invariant (`bool`, *optional*, defaults to `True`):
69
+ Whether to treat predictions as shift-invariant.
70
+ output_uncertainty (`bool`, *optional*, defaults to `False`):
71
+ Whether to output uncertainty map.
72
+ reduction (`str`, *optional*, defaults to `"median"`):
73
+ Reduction method used to ensemble aligned predictions. The accepted values are: `"mean"` and
74
+ `"median"`.
75
+ regularizer_strength (`float`, *optional*, defaults to `0.02`):
76
+ Strength of the regularizer that pulls the aligned predictions to the unit range from 0 to 1.
77
+ max_iter (`int`, *optional*, defaults to `2`):
78
+ Maximum number of the alignment solver steps. Refer to `scipy.optimize.minimize` function, `options`
79
+ argument.
80
+ tol (`float`, *optional*, defaults to `1e-3`):
81
+ Alignment solver tolerance. The solver stops when the tolerance is reached.
82
+ max_res (`int`, *optional*, defaults to `1024`):
83
+ Resolution at which the alignment is performed; `None` matches the `processing_resolution`.
84
+ Returns:
85
+ A tensor of aligned and ensembled depth maps and optionally a tensor of uncertainties of the same shape:
86
+ `(1, 1, H, W)`.
87
+ """
88
+ if depth.dim() != 4 or depth.shape[1] != 1:
89
+ raise ValueError(f"Expecting 4D tensor of shape [B,1,H,W]; got {depth.shape}.")
90
+ if reduction not in ("mean", "median"):
91
+ raise ValueError(f"Unrecognized reduction method: {reduction}.")
92
+ if not scale_invariant and shift_invariant:
93
+ raise ValueError("Pure shift-invariant ensembling is not supported.")
94
+
95
+ def init_param(depth: torch.Tensor):
96
+ init_min = depth.reshape(ensemble_size, -1).min(dim=1).values
97
+ init_max = depth.reshape(ensemble_size, -1).max(dim=1).values
98
+
99
+ if scale_invariant and shift_invariant:
100
+ init_s = 1.0 / (init_max - init_min).clamp(min=1e-6)
101
+ init_t = -init_s * init_min
102
+ param = torch.cat((init_s, init_t)).cpu().numpy()
103
+ elif scale_invariant:
104
+ init_s = 1.0 / init_max.clamp(min=1e-6)
105
+ param = init_s.cpu().numpy()
106
+ else:
107
+ raise ValueError("Unrecognized alignment.")
108
+
109
+ return param
110
+
111
+ def align(depth: torch.Tensor, param: np.ndarray) -> torch.Tensor:
112
+ if scale_invariant and shift_invariant:
113
+ s, t = np.split(param, 2)
114
+ s = torch.from_numpy(s).to(depth).view(ensemble_size, 1, 1, 1)
115
+ t = torch.from_numpy(t).to(depth).view(ensemble_size, 1, 1, 1)
116
+ out = depth * s + t
117
+ elif scale_invariant:
118
+ s = torch.from_numpy(param).to(depth).view(ensemble_size, 1, 1, 1)
119
+ out = depth * s
120
+ else:
121
+ raise ValueError("Unrecognized alignment.")
122
+ return out
123
+
124
+ def ensemble(
125
+ depth_aligned: torch.Tensor, return_uncertainty: bool = False
126
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
127
+ uncertainty = None
128
+ if reduction == "mean":
129
+ prediction = torch.mean(depth_aligned, dim=0, keepdim=True)
130
+ if return_uncertainty:
131
+ uncertainty = torch.std(depth_aligned, dim=0, keepdim=True)
132
+ elif reduction == "median":
133
+ prediction = torch.median(depth_aligned, dim=0, keepdim=True).values
134
+ if return_uncertainty:
135
+ uncertainty = torch.median(
136
+ torch.abs(depth_aligned - prediction), dim=0, keepdim=True
137
+ ).values
138
+ else:
139
+ raise ValueError(f"Unrecognized reduction method: {reduction}.")
140
+ return prediction, uncertainty
141
+
142
+ def cost_fn(param: np.ndarray, depth: torch.Tensor) -> float:
143
+ cost = 0.0
144
+ depth_aligned = align(depth, param)
145
+
146
+ for i, j in torch.combinations(torch.arange(ensemble_size)):
147
+ diff = depth_aligned[i] - depth_aligned[j]
148
+ cost += (diff**2).mean().sqrt().item()
149
+
150
+ if regularizer_strength > 0:
151
+ prediction, _ = ensemble(depth_aligned, return_uncertainty=False)
152
+ err_near = (0.0 - prediction.min()).abs().item()
153
+ err_far = (1.0 - prediction.max()).abs().item()
154
+ cost += (err_near + err_far) * regularizer_strength
155
+
156
+ return cost
157
+
158
+ def compute_param(depth: torch.Tensor):
159
+ import scipy
160
+
161
+ depth_to_align = depth.to(torch.float32)
162
+ if max_res is not None and max(depth_to_align.shape[2:]) > max_res:
163
+ try:
164
+ depth_to_align = resize_max_res(
165
+ depth_to_align, max_res, get_tv_resample_method("nearest-exact")
166
+ )
167
+ except:
168
+ depth_to_align = resize_max_res(
169
+ depth_to_align, max_res, get_tv_resample_method("bilinear")
170
+ )
171
+
172
+ param = init_param(depth_to_align)
173
+
174
+ res = scipy.optimize.minimize(
175
+ partial(cost_fn, depth=depth_to_align),
176
+ param,
177
+ method="BFGS",
178
+ tol=tol,
179
+ options={"maxiter": max_iter, "disp": False},
180
+ )
181
+
182
+ return res.x
183
+
184
+ requires_aligning = scale_invariant or shift_invariant
185
+ ensemble_size = depth.shape[0]
186
+
187
+ if requires_aligning:
188
+ param = compute_param(depth)
189
+ depth = align(depth, param)
190
+
191
+ depth, uncertainty = ensemble(depth, return_uncertainty=output_uncertainty)
192
+
193
+ depth_max = depth.max()
194
+ if scale_invariant and shift_invariant:
195
+ depth_min = depth.min()
196
+ elif scale_invariant:
197
+ depth_min = 0
198
+ else:
199
+ raise ValueError("Unrecognized alignment.")
200
+ depth_range = (depth_max - depth_min).clamp(min=1e-6)
201
+ depth = (depth - depth_min) / depth_range
202
+ if output_uncertainty:
203
+ uncertainty /= depth_range
204
+
205
+ return depth, uncertainty # [1,1,H,W], [1,1,H,W]
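A hedged sketch of calling `ensemble_depth` on a stack of affine-invariant predictions; the random tensors stand in for real pipeline outputs, and SciPy must be available for the alignment solver.

```python
import torch

from genpercept.util.ensemble import ensemble_depth

# Ten affine-invariant depth predictions of the same image, shape [E, 1, H, W].
preds = torch.rand(10, 1, 480, 640)

depth, uncertainty = ensemble_depth(
    preds,
    scale_invariant=True,
    shift_invariant=True,
    output_uncertainty=True,
    reduction="median",
)
print(depth.shape, uncertainty.shape)  # both [1, 1, 480, 640]; depth is rescaled to [0, 1]
```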
{util → genpercept/util}/image_util.py RENAMED
@@ -1,15 +1,30 @@
1
  import matplotlib
2
  import numpy as np
3
  import torch
4
- from PIL import Image
5
- from torchvision import transforms
6
 
7
- def norm_to_rgb(norm):
8
- # norm: (3, H, W), range from [-1, 1]
9
- norm_rgb = ((norm + 1) * 0.5) * 255
10
- norm_rgb = np.clip(norm_rgb, a_min=0, a_max=255)
11
- norm_rgb = norm_rgb.astype(np.uint8)
12
- return norm_rgb
13
 
14
  def colorize_depth_maps(
15
  depth_map, min_depth, max_depth, cmap="Spectral", valid_mask=None
@@ -20,9 +35,9 @@ def colorize_depth_maps(
20
  assert len(depth_map.shape) >= 2, "Invalid dimension"
21
 
22
  if isinstance(depth_map, torch.Tensor):
23
- depth = depth_map.detach().clone().squeeze().numpy()
24
  elif isinstance(depth_map, np.ndarray):
25
- depth = np.squeeze(depth_map.copy())
26
  # reshape to [ (B,) H, W ]
27
  if depth.ndim < 3:
28
  depth = depth[np.newaxis, :, :]
@@ -36,7 +51,7 @@ def colorize_depth_maps(
36
  if valid_mask is not None:
37
  if isinstance(depth_map, torch.Tensor):
38
  valid_mask = valid_mask.detach().numpy()
39
- valid_mask = np.squeeze(valid_mask) # [H, W] or [B, H, W]
40
  if valid_mask.ndim < 3:
41
  valid_mask = valid_mask[np.newaxis, np.newaxis, :, :]
42
  else:
@@ -61,18 +76,28 @@ def chw2hwc(chw):
61
  return hwc
62
 
63
 
64
- def resize_max_res(img: Image.Image, max_edge_resolution: int) -> Image.Image:
 
 
 
 
65
  """
66
- Resize image to limit maximum edge length while keeping aspect ratio
67
 
68
  Args:
69
- img (Image.Image): Image to be resized
70
- max_edge_resolution (int): Maximum edge length (px).
 
 
 
 
71
 
72
  Returns:
73
- Image.Image: Resized image.
74
  """
75
- original_width, original_height = img.size
 
 
76
  downscale_factor = min(
77
  max_edge_resolution / original_width, max_edge_resolution / original_height
78
  )
@@ -80,93 +105,26 @@ def resize_max_res(img: Image.Image, max_edge_resolution: int) -> Image.Image:
80
  new_width = int(original_width * downscale_factor)
81
  new_height = int(original_height * downscale_factor)
82
 
83
- resized_img = img.resize((new_width, new_height))
84
- return resized_img
85
-
86
- def resize_max_res_integer_16(img: Image.Image, max_edge_resolution: int) -> Image.Image:
87
- """
88
- Resize image to limit maximum edge length while keeping aspect ratio
89
-
90
- Args:
91
- img (Image.Image): Image to be resized
92
- max_edge_resolution (int): Maximum edge length (px).
93
-
94
- Returns:
95
- Image.Image: Resized image.
96
- """
97
- original_width, original_height = img.size
98
- downscale_factor = min(
99
- max_edge_resolution / original_width, max_edge_resolution / original_height
100
- )
101
-
102
- new_width = int(original_width * downscale_factor) // 16 * 16 # make sure it is integer multiples of 16, used for pixart
103
- new_height = int(original_height * downscale_factor) // 16 * 16 # make sure it is integer multiples of 16, used for pixart
104
-
105
- resized_img = img.resize((new_width, new_height))
106
- return resized_img
107
-
108
- def resize_res(img: Image.Image, max_edge_resolution: int) -> Image.Image:
109
- """
110
- Resize image to limit maximum edge length while keeping aspect ratio
111
-
112
- Args:
113
- img (Image.Image): Image to be resized
114
- max_edge_resolution (int): Maximum edge length (px).
115
-
116
- Returns:
117
- Image.Image: Resized image.
118
- """
119
-
120
- resized_img = img.resize((max_edge_resolution, max_edge_resolution))
121
  return resized_img
122
 
123
- class ResizeLongestEdge:
124
- def __init__(self, max_size, interpolation=transforms.InterpolationMode.BILINEAR):
125
- self.max_size = max_size
126
- self.interpolation = interpolation
127
-
128
- def __call__(self, img):
129
-
130
- scale = self.max_size / max(img.width, img.height)
131
- new_size = (int(img.height * scale), int(img.width * scale))
132
-
133
- return transforms.functional.resize(img, new_size, self.interpolation)
134
-
135
- class ResizeShortestEdge:
136
- def __init__(self, min_size, interpolation=transforms.InterpolationMode.BILINEAR):
137
- self.min_size = min_size
138
- self.interpolation = interpolation
139
-
140
- def __call__(self, img):
141
-
142
- scale = self.min_size / min(img.width, img.height)
143
- new_size = (int(img.height * scale), int(img.width * scale))
144
-
145
- return transforms.functional.resize(img, new_size, self.interpolation)
146
-
147
- class ResizeHard:
148
- def __init__(self, size, interpolation=transforms.InterpolationMode.BILINEAR):
149
- self.size = size
150
- self.interpolation = interpolation
151
-
152
- def __call__(self, img):
153
-
154
- new_size = (int(self.size), int(self.size))
155
-
156
- return transforms.functional.resize(img, new_size, self.interpolation)
157
-
158
-
159
- class ResizeLongestEdgeInteger:
160
- def __init__(self, max_size, interpolation=transforms.InterpolationMode.BILINEAR, integer=16):
161
- self.max_size = max_size
162
- self.interpolation = interpolation
163
- self.integer = integer
164
-
165
- def __call__(self, img):
166
-
167
- scale = self.max_size / max(img.width, img.height)
168
- new_size_h = int(img.height * scale) // self.integer * self.integer
169
- new_size_w = int(img.width * scale) // self.integer * self.integer
170
- new_size = (new_size_h, new_size_w)
171
 
172
- return transforms.functional.resize(img, new_size, self.interpolation)
1
+ # Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
2
+ # Last modified: 2024-05-24
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # --------------------------------------------------------------------------
16
+ # If you find this code useful, we kindly ask you to cite our paper in your work.
17
+ # Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
18
+ # More information about the method can be found at https://marigoldmonodepth.github.io
19
+ # --------------------------------------------------------------------------
20
+
21
+
22
  import matplotlib
23
  import numpy as np
24
  import torch
25
+ from torchvision.transforms import InterpolationMode
26
+ from torchvision.transforms.functional import resize
27
 
 
 
 
 
 
 
28
 
29
  def colorize_depth_maps(
30
  depth_map, min_depth, max_depth, cmap="Spectral", valid_mask=None
 
35
  assert len(depth_map.shape) >= 2, "Invalid dimension"
36
 
37
  if isinstance(depth_map, torch.Tensor):
38
+ depth = depth_map.detach().squeeze().numpy()
39
  elif isinstance(depth_map, np.ndarray):
40
+ depth = depth_map.copy().squeeze()
41
  # reshape to [ (B,) H, W ]
42
  if depth.ndim < 3:
43
  depth = depth[np.newaxis, :, :]
 
51
  if valid_mask is not None:
52
  if isinstance(depth_map, torch.Tensor):
53
  valid_mask = valid_mask.detach().numpy()
54
+ valid_mask = valid_mask.squeeze() # [H, W] or [B, H, W]
55
  if valid_mask.ndim < 3:
56
  valid_mask = valid_mask[np.newaxis, np.newaxis, :, :]
57
  else:
 
76
  return hwc
77
 
78
 
79
+ def resize_max_res(
80
+ img: torch.Tensor,
81
+ max_edge_resolution: int,
82
+ resample_method: InterpolationMode = InterpolationMode.BILINEAR,
83
+ ) -> torch.Tensor:
84
  """
85
+ Resize image to limit maximum edge length while keeping aspect ratio.
86
 
87
  Args:
88
+ img (`torch.Tensor`):
89
+ Image tensor to be resized. Expected shape: [B, C, H, W]
90
+ max_edge_resolution (`int`):
91
+ Maximum edge length (pixel).
92
+ resample_method (`PIL.Image.Resampling`):
93
+ Resampling method used to resize images.
94
 
95
  Returns:
96
+ `torch.Tensor`: Resized image.
97
  """
98
+ assert 4 == img.dim(), f"Invalid input shape {img.shape}"
99
+
100
+ original_height, original_width = img.shape[-2:]
101
  downscale_factor = min(
102
  max_edge_resolution / original_width, max_edge_resolution / original_height
103
  )
 
105
  new_width = int(original_width * downscale_factor)
106
  new_height = int(original_height * downscale_factor)
107
 
108
+ resized_img = resize(img, (new_height, new_width), resample_method, antialias=True)
 
109
  return resized_img
110
 
111
 
112
+ def get_tv_resample_method(method_str: str) -> InterpolationMode:
113
+ try:
114
+ resample_method_dict = {
115
+ "bilinear": InterpolationMode.BILINEAR,
116
+ "bicubic": InterpolationMode.BICUBIC,
117
+ "nearest": InterpolationMode.NEAREST_EXACT,
118
+ "nearest-exact": InterpolationMode.NEAREST_EXACT,
119
+ }
120
+ except:
121
+ resample_method_dict = {
122
+ "bilinear": InterpolationMode.BILINEAR,
123
+ "bicubic": InterpolationMode.BICUBIC,
124
+ "nearest": InterpolationMode.NEAREST,
125
+ }
126
+ resample_method = resample_method_dict.get(method_str, None)
127
+ if resample_method is None:
128
+ raise ValueError(f"Unknown resampling method: {resample_method}")
129
+ else:
130
+ return resample_method
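For completeness, a short sketch exercising the two helpers above on a dummy tensor.

```python
import torch

from genpercept.util.image_util import get_tv_resample_method, resize_max_res

rgb = torch.rand(1, 3, 1080, 1920)  # [B, C, H, W]
resample = get_tv_resample_method("bilinear")

resized = resize_max_res(rgb, max_edge_resolution=768, resample_method=resample)
print(resized.shape)  # torch.Size([1, 3, 432, 768]); aspect ratio preserved
```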
hf_configs/dpt-sd2.1-unet-after-upsample-general/config.json ADDED
@@ -0,0 +1,48 @@
1
+ {
2
+ "_commit_hash": null,
3
+ "add_projection": true,
4
+ "architectures": [
5
+ "DPTForDepthEstimation"
6
+ ],
7
+ "attention_probs_dropout_prob": null,
8
+ "auxiliary_loss_weight": 0.4,
9
+ "backbone_featmap_shape": null,
10
+ "backbone_out_indices": null,
11
+ "fusion_hidden_size": 256,
12
+ "head_in_index": -1,
13
+ "hidden_act": "gelu",
14
+ "hidden_dropout_prob": null,
15
+ "hidden_size": 768,
16
+ "image_size": null,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": null,
19
+ "is_hybrid": false,
20
+ "layer_norm_eps": null,
21
+ "model_type": "dpt",
22
+ "neck_hidden_sizes": [
23
+ 320,
24
+ 640,
25
+ 1280,
26
+ 1280
27
+ ],
28
+ "neck_ignore_stages": [],
29
+ "num_attention_heads": null,
30
+ "num_channels": null,
31
+ "num_hidden_layers": null,
32
+ "patch_size": null,
33
+ "qkv_bias": null,
34
+ "readout_type": "project",
35
+ "reassemble_factors": [
36
+ 4,
37
+ 2,
38
+ 1,
39
+ 0.5
40
+ ],
41
+ "semantic_classifier_dropout": 0.1,
42
+ "semantic_loss_ignore_index": 255,
43
+ "torch_dtype": "float32",
44
+ "transformers_version": null,
45
+ "use_auxiliary_head": true,
46
+ "use_batch_norm_in_fusion_residual": false,
47
+ "use_bias_in_fusion_residual": false
48
+ }
hf_configs/dpt-sd2.1-unet-after-upsample-general/preprocessor_config.json ADDED
@@ -0,0 +1,27 @@
1
+ {
2
+ "do_normalize": true,
3
+ "do_pad": true,
4
+ "do_rescale": false,
5
+ "do_resize": true,
6
+ "ensure_multiple_of": 1,
7
+ "image_mean": [
8
+ 123.675,
9
+ 116.28,
10
+ 103.53
11
+ ],
12
+ "image_processor_type": "DPTImageProcessor",
13
+ "image_std": [
14
+ 58.395,
15
+ 57.12,
16
+ 57.375
17
+ ],
18
+ "keep_aspect_ratio": false,
19
+ "resample": 2,
20
+ "rescale_factor": 0.00392156862745098,
21
+ "size": {
22
+ "height": 392,
23
+ "width": 392
24
+ },
25
+ "size_divisor": 14
26
+ }
27
+
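A quick, hypothetical sanity check that this processor config loads with `transformers` (the all-zero image is only illustrative).

```python
import numpy as np
from PIL import Image
from transformers import DPTImageProcessor

processor = DPTImageProcessor.from_pretrained(
    "hf_configs/dpt-sd2.1-unet-after-upsample-general"
)
dummy = Image.fromarray(np.zeros((480, 640, 3), dtype=np.uint8))
batch = processor(images=dummy, return_tensors="pt")
print(batch["pixel_values"].shape)  # torch.Size([1, 3, 392, 392])
```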
hf_configs/scheduler_beta_1.0_1.0/scheduler_config.json ADDED
@@ -0,0 +1,20 @@
1
+ {
2
+ "_class_name": "DDIMScheduler",
3
+ "_diffusers_version": "0.29.2",
4
+ "beta_end": 1.0,
5
+ "beta_schedule": "scaled_linear",
6
+ "beta_start": 1.0,
7
+ "clip_sample": false,
8
+ "clip_sample_range": 1.0,
9
+ "dynamic_thresholding_ratio": 0.995,
10
+ "num_train_timesteps": 1000,
11
+ "prediction_type": "v_prediction",
12
+ "rescale_betas_zero_snr": false,
13
+ "sample_max_value": 1.0,
14
+ "set_alpha_to_one": false,
15
+ "skip_prk_steps": true,
16
+ "steps_offset": 1,
17
+ "thresholding": false,
18
+ "timestep_spacing": "leading",
19
+ "trained_betas": null
20
+ }
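Loading this schedule with diffusers is a one-liner; the degenerate `beta_start = beta_end = 1.0` setting presumably collapses the noise schedule so that a single v-prediction step maps directly to the target.

```python
from diffusers import DDIMScheduler

# Path relative to this repository.
scheduler = DDIMScheduler.from_pretrained("hf_configs/scheduler_beta_1.0_1.0")
print(scheduler.config.beta_start, scheduler.config.beta_end)  # 1.0 1.0
print(scheduler.config.prediction_type)                        # v_prediction
```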
pipeline_genpercept.py DELETED
@@ -1,355 +0,0 @@
1
- # --------------------------------------------------------
2
- # Diffusion Models Trained with Large Data Are Transferable Visual Models (https://arxiv.org/abs/2403.06090)
3
- # Github source: https://github.com/aim-uofa/GenPercept
4
- # Copyright (c) 2024 Zhejiang University
5
- # Licensed under The CC0 1.0 License [see LICENSE for details]
6
- # By Guangkai Xu
7
- # Based on Marigold, diffusers codebases
8
- # https://github.com/prs-eth/marigold
9
- # https://github.com/huggingface/diffusers
10
- # --------------------------------------------------------
11
-
12
- import torch
13
- import numpy as np
14
- import torch.nn.functional as F
15
- import matplotlib.pyplot as plt
16
-
17
- from tqdm.auto import tqdm
18
- from PIL import Image
19
- from typing import List, Dict, Union
20
- from torch.utils.data import DataLoader, TensorDataset
21
-
22
- from diffusers import (
23
- DiffusionPipeline,
24
- UNet2DConditionModel,
25
- AutoencoderKL,
26
- )
27
- from diffusers.utils import BaseOutput
28
-
29
- from util.image_util import chw2hwc, colorize_depth_maps, resize_max_res, norm_to_rgb, resize_res
30
- from util.batchsize import find_batch_size
31
-
32
- class GenPerceptOutput(BaseOutput):
33
-
34
- pred_np: np.ndarray
35
- pred_colored: Image.Image
36
-
37
- class GenPerceptPipeline(DiffusionPipeline):
38
-
39
- vae_scale_factor = 0.18215
40
- task_infos = {
41
- 'depth': dict(task_channel_num=1, interpolate='bilinear', ),
42
- 'seg': dict(task_channel_num=3, interpolate='nearest', ),
43
- 'sr': dict(task_channel_num=3, interpolate='nearest', ),
44
- 'normal': dict(task_channel_num=3, interpolate='bilinear', ),
45
- }
46
-
47
- def __init__(
48
- self,
49
- unet: UNet2DConditionModel,
50
- vae: AutoencoderKL,
51
- customized_head=None,
52
- empty_text_embed=None,
53
- ):
54
- super().__init__()
55
-
56
- self.empty_text_embed = empty_text_embed
57
-
58
- # register
59
- register_dict = dict(
60
- unet=unet,
61
- vae=vae,
62
- customized_head=customized_head,
63
- )
64
- self.register_modules(**register_dict)
65
-
66
- @torch.no_grad()
67
- def __call__(
68
- self,
69
- input_image: Union[Image.Image, torch.Tensor],
70
- mode: str = 'depth',
71
- resize_hard = False,
72
- processing_res: int = 768,
73
- match_input_res: bool = False,
74
- batch_size: int = 0,
75
- color_map: str = "Spectral",
76
- show_progress_bar: bool = True,
77
- ) -> GenPerceptOutput:
78
- """
79
- Function invoked when calling the pipeline.
80
-
81
- Args:
82
- input_image (Image):
83
- Input RGB (or gray-scale) image.
84
- processing_res (int, optional):
85
- Maximum resolution of processing.
86
- If set to 0: will not resize at all.
87
- Defaults to 768.
88
- match_input_res (bool, optional):
89
- Resize depth prediction to match input resolution.
90
- Only valid if `limit_input_res` is not None.
91
- Defaults to True.
92
- batch_size (int, optional):
93
- Inference batch size.
94
- If set to 0, the script will automatically decide the proper batch size.
95
- Defaults to 0.
96
- show_progress_bar (bool, optional):
97
- Display a progress bar of diffusion denoising.
98
- Defaults to True.
99
- color_map (str, optional):
100
- Colormap used to colorize the depth map.
101
- Defaults to "Spectral".
102
- Returns:
103
- `GenPerceptOutput`
104
- """
105
-
106
- device = self.device
107
-
108
- task_channel_num = self.task_infos[mode]['task_channel_num']
109
-
110
- if not match_input_res:
111
- assert (
112
- processing_res is not None
113
- ), "Value error: `resize_output_back` is only valid with "
114
- assert processing_res >= 0
115
-
116
- # ----------------- Image Preprocess -----------------
117
-
118
- if type(input_image) == torch.Tensor: # [B, 3, H, W]
119
- rgb_norm = input_image.to(device)
120
- input_size = input_image.shape[2:]
121
- bs_imgs = rgb_norm.shape[0]
122
- assert rgb_norm.min() >= -1.0 and rgb_norm.max() <= 1.0
123
- rgb_norm = rgb_norm.to(self.dtype)
124
- else:
125
- # if len(rgb_paths) > 0 and 'kitti' in rgb_paths[0]:
126
- # # kb crop
127
- # height = input_image.size[1]
128
- # width = input_image.size[0]
129
- # top_margin = int(height - 352)
130
- # left_margin = int((width - 1216) / 2)
131
- # input_image = input_image.crop((left_margin, top_margin, left_margin + 1216, top_margin + 352))
132
-
133
- # TODO: check the kitti evaluation resolution here.
134
- input_size = (input_image.size[1], input_image.size[0])
135
- # Resize image
136
- if processing_res > 0:
137
- if resize_hard:
138
- input_image = resize_res(
139
- input_image, max_edge_resolution=processing_res
140
- )
141
- else:
142
- input_image = resize_max_res(
143
- input_image, max_edge_resolution=processing_res
144
- )
145
- input_image = input_image.convert("RGB")
146
- image = np.asarray(input_image)
147
-
148
- # Normalize rgb values
149
- rgb = np.transpose(image, (2, 0, 1)) # [H, W, rgb] -> [rgb, H, W]
150
- rgb_norm = rgb / 255.0 * 2.0 - 1.0
151
- rgb_norm = torch.from_numpy(rgb_norm).to(self.unet.dtype)
152
- rgb_norm = rgb_norm[None].to(device)
153
- assert rgb_norm.min() >= -1.0 and rgb_norm.max() <= 1.0
154
- bs_imgs = 1
155
-
156
- # ----------------- Predicting depth -----------------
157
-
158
- single_rgb_dataset = TensorDataset(rgb_norm)
159
- if batch_size > 0:
160
- _bs = batch_size
161
- else:
162
- _bs = find_batch_size(
163
- ensemble_size=1,
164
- input_res=max(rgb_norm.shape[1:]),
165
- dtype=self.dtype,
166
- )
167
-
168
- single_rgb_loader = DataLoader(
169
- single_rgb_dataset, batch_size=_bs, shuffle=False
170
- )
171
-
172
- # Predict depth maps (batched)
173
- pred_list = []
174
- if show_progress_bar:
175
- iterable = tqdm(
176
- single_rgb_loader, desc=" " * 2 + "Inference batches", leave=False
177
- )
178
- else:
179
- iterable = single_rgb_loader
180
-
181
- for batch in iterable:
182
- (batched_img, ) = batch
183
- pred = self.single_infer(
184
- rgb_in=batched_img,
185
- mode=mode,
186
- )
187
- pred_list.append(pred.detach().clone())
188
- preds = torch.concat(pred_list, axis=0).squeeze() # [bs_imgs, task_channel_num, H, W]
189
- preds = preds.view(bs_imgs, task_channel_num, preds.shape[-2], preds.shape[-1])
190
-
191
- if match_input_res:
192
- preds = F.interpolate(preds, input_size, mode=self.task_infos[mode]['interpolate'])
193
-
194
- # ----------------- Post processing -----------------
195
- if mode == 'depth':
196
- if len(preds.shape) == 4:
197
- preds = preds[:, 0] # [bs_imgs, H, W]
198
- # Scale prediction to [0, 1]
199
- min_d = preds.view(bs_imgs, -1).min(dim=1)[0]
200
- max_d = preds.view(bs_imgs, -1).max(dim=1)[0]
201
- preds = (preds - min_d[:, None, None]) / (max_d[:, None, None] - min_d[:, None, None])
202
- preds = preds.cpu().numpy().astype(np.float32)
203
- # Colorize
204
- pred_colored_img_list = []
205
- for i in range(bs_imgs):
206
- pred_colored_chw = colorize_depth_maps(
207
- preds[i], 0, 1, cmap=color_map
208
- ).squeeze() # [3, H, W], value in (0, 1)
209
- pred_colored_chw = (pred_colored_chw * 255).astype(np.uint8)
210
- pred_colored_hwc = chw2hwc(pred_colored_chw)
211
- pred_colored_img = Image.fromarray(pred_colored_hwc)
212
- pred_colored_img_list.append(pred_colored_img)
213
-
214
- return GenPerceptOutput(
215
- pred_np=np.squeeze(preds),
216
- pred_colored=pred_colored_img_list[0] if len(pred_colored_img_list) == 1 else pred_colored_img_list,
217
- )
218
-
219
- elif mode == 'seg' or mode == 'sr':
220
- if not self.customized_head:
221
- # shift to [0, 1]
222
- preds = (preds + 1.0) / 2.0
223
- # shift to [0, 255]
224
- preds = preds * 255
225
- # Clip output range
226
- preds = preds.clip(0, 255).cpu().numpy().astype(np.uint8)
227
- else:
228
- raise NotImplementedError
229
-
230
- pred_colored_img_list = []
231
- for i in range(preds.shape[0]):
232
- pred_colored_hwc = chw2hwc(preds[i])
233
- pred_colored_img = Image.fromarray(pred_colored_hwc)
234
- pred_colored_img_list.append(pred_colored_img)
235
-
236
- return GenPerceptOutput(
237
- pred_np=np.squeeze(preds),
238
- pred_colored=pred_colored_img_list[0] if len(pred_colored_img_list) == 1 else pred_colored_img_list,
239
- )
240
-
241
- elif mode == 'normal':
242
- if not self.customized_head:
243
- preds = preds.clip(-1, 1).cpu().numpy() # [-1, 1]
244
- else:
245
- raise NotImplementedError
246
-
247
- pred_colored_img_list = []
248
- for i in range(preds.shape[0]):
249
- pred_colored_chw = norm_to_rgb(preds[i])
250
- pred_colored_hwc = chw2hwc(pred_colored_chw)
251
- normal_colored_img_i = Image.fromarray(pred_colored_hwc)
252
- pred_colored_img_list.append(normal_colored_img_i)
253
-
254
- return GenPerceptOutput(
255
- pred_np=np.squeeze(preds),
256
- pred_colored=pred_colored_img_list[0] if len(pred_colored_img_list) == 1 else pred_colored_img_list,
257
- )
258
-
259
- else:
260
- raise NotImplementedError
261
-
262
- @torch.no_grad()
263
- def single_infer(
264
- self,
265
- rgb_in: torch.Tensor,
266
- mode: str = 'depth',
267
- ) -> torch.Tensor:
268
- """
269
- Perform an individual depth prediction without ensembling.
270
-
271
- Args:
272
- rgb_in (torch.Tensor):
273
- Input RGB image.
274
- num_inference_steps (int):
275
- Number of diffusion denoising steps (DDIM) during inference.
276
- show_pbar (bool):
277
- Display a progress bar of diffusion denoising.
278
-
279
- Returns:
280
- torch.Tensor: Predicted depth map.
281
- """
282
- device = rgb_in.device
283
- bs_imgs = rgb_in.shape[0]
284
- timesteps = torch.tensor([1]).long().repeat(bs_imgs).to(device)
285
-
286
- # Encode image
287
- rgb_latent = self.encode_rgb(rgb_in)
288
-
289
- batch_embed = self.empty_text_embed
290
- batch_embed = batch_embed.repeat((rgb_latent.shape[0], 1, 1)).to(device) # [bs_imgs, 77, 1024]
291
-
292
- # Forward!
293
- if self.customized_head:
294
- unet_features = self.unet(rgb_latent, timesteps, encoder_hidden_states=batch_embed, return_feature_only=True)[0][::-1]
295
- pred = self.customized_head(unet_features)
296
- else:
297
- unet_output = self.unet(
298
- rgb_latent, timesteps, encoder_hidden_states=batch_embed
299
- ) # [bs_imgs, 4, h, w]
300
- unet_pred = unet_output.sample
301
- pred_latent = - unet_pred
302
- pred_latent.to(device)
303
- pred = self.decode_pred(pred_latent)
304
- if mode == 'depth':
305
- # mean of output channels
306
- pred = pred.mean(dim=1, keepdim=True)
307
- # clip prediction
308
- pred = torch.clip(pred, -1.0, 1.0)
309
- return pred
310
-
311
-
312
- def encode_rgb(self, rgb_in: torch.Tensor) -> torch.Tensor:
313
- """
314
- Encode RGB image into latent.
315
-
316
- Args:
317
- rgb_in (torch.Tensor):
318
- Input RGB image to be encoded.
319
-
320
- Returns:
321
- torch.Tensor: Image latent
322
- """
323
- try:
324
- # encode
325
- h_temp = self.vae.encoder(rgb_in)
326
- moments = self.vae.quant_conv(h_temp)
327
- except:
328
- # encode
329
- h_temp = self.vae.encoder(rgb_in.float())
330
- moments = self.vae.quant_conv(h_temp.float())
331
-
332
- mean, logvar = torch.chunk(moments, 2, dim=1)
333
- # scale latent
334
- rgb_latent = mean * self.vae_scale_factor
335
- return rgb_latent
336
-
337
- def decode_pred(self, pred_latent: torch.Tensor) -> torch.Tensor:
338
- """
339
- Decode pred latent into pred label.
340
-
341
- Args:
342
- pred_latent (torch.Tensor):
343
- prediction latent to be decoded.
344
-
345
- Returns:
346
- torch.Tensor: Decoded prediction label.
347
- """
348
- # scale latent
349
- pred_latent = pred_latent / self.vae_scale_factor
350
- # decode
351
- z = self.vae.post_quant_conv(pred_latent)
352
- pred = self.vae.decoder(z)
353
-
354
- return pred
355
-
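The deleted pipeline_genpercept.py above implemented the standalone GenPerceptPipeline: a single UNet forward pass at a fixed timestep, with the negated output decoded by the VAE into a depth, normal, segmentation, or matting map. For reference, a hedged usage sketch based only on the deleted __call__ signature; `pipe` stands for an already-assembled pipeline instance and the image path is illustrative.

# Usage sketch of the removed GenPerceptPipeline, inferred from its __call__
# signature above; `pipe` is an assumed, already-constructed pipeline instance.
from PIL import Image

image = Image.open("path/to/image.jpg")  # illustrative path
out = pipe(
    image,
    mode="depth",          # one of 'depth', 'normal', 'seg', 'sr'
    processing_res=768,    # longer edge used during inference
    match_input_res=True,  # resize the prediction back to the input size
    color_map="Spectral",  # colormap applied to depth predictions
)
out.pred_colored.save("pred_colored.png")  # colorized PIL image
print(out.pred_np.shape)                   # raw prediction as a numpy array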
requirements.txt CHANGED
@@ -21,3 +21,7 @@ spaces
21
  gradio>=4.32.2
22
  gradio_client>=0.17.0
23
  gradio_imageslider>=0.0.20
24
+ omegaconf
25
+ tabulate
26
+ wandb
27
+ pandas
seg_images/seg_1.jpg ADDED
seg_images/seg_2.jpg ADDED
seg_images/seg_3.jpg ADDED
seg_images/seg_4.jpg ADDED
seg_images/seg_5.jpg ADDED
util/__init__.py DELETED
File without changes
util/seed_all.py DELETED
@@ -1,13 +0,0 @@
1
- import numpy as np
2
- import random
3
- import torch
4
-
5
-
6
- def seed_all(seed: int = 0):
7
- """
8
- Set random seeds of all components.
9
- """
10
- random.seed(seed)
11
- np.random.seed(seed)
12
- torch.manual_seed(seed)
13
- torch.cuda.manual_seed_all(seed)