sjtu-deepvision committed on
Commit 311419e · verified · 1 Parent(s): c360cac

Upload 9 files

DAI/__init__.py ADDED
File without changes
DAI/controlnetvae.py ADDED
@@ -0,0 +1,250 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from dataclasses import dataclass
15
+ from typing import Any, Dict, List, Optional, Tuple, Union
16
+
17
+ import torch
18
+ from torch import nn
19
+ from torch.nn import functional as F
20
+
21
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
22
+ from diffusers.loaders.single_file_model import FromOriginalModelMixin
23
+ from diffusers.utils import BaseOutput, logging
24
+ from diffusers.models.attention_processor import (
25
+ ADDED_KV_ATTENTION_PROCESSORS,
26
+ CROSS_ATTENTION_PROCESSORS,
27
+ AttentionProcessor,
28
+ AttnAddedKVProcessor,
29
+ AttnProcessor,
30
+ )
31
+ from diffusers.models.embeddings import TextImageProjection, TextImageTimeEmbedding, TextTimeEmbedding, TimestepEmbedding, Timesteps
32
+ from diffusers.models.modeling_utils import ModelMixin
33
+ from diffusers.models.unets.unet_2d_blocks import (
34
+ CrossAttnDownBlock2D,
35
+ DownBlock2D,
36
+ UNetMidBlock2D,
37
+ UNetMidBlock2DCrossAttn,
38
+ get_down_block,
39
+ )
40
+ from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel
41
+ from diffusers.models.controlnet import ControlNetOutput
42
+ from diffusers.models import ControlNetModel
43
+
44
+ import pdb
45
+
46
+
47
+ class ControlNetVAEModel(ControlNetModel):
48
+ def forward(
49
+ self,
50
+ sample: torch.Tensor,
51
+ timestep: Union[torch.Tensor, float, int],
52
+ encoder_hidden_states: torch.Tensor,
53
+ controlnet_cond: torch.Tensor = None,
54
+ conditioning_scale: float = 1.0,
55
+ class_labels: Optional[torch.Tensor] = None,
56
+ timestep_cond: Optional[torch.Tensor] = None,
57
+ attention_mask: Optional[torch.Tensor] = None,
58
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
59
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
60
+ guess_mode: bool = False,
61
+ return_dict: bool = True,
62
+ ) -> Union[ControlNetOutput, Tuple[Tuple[torch.Tensor, ...], torch.Tensor]]:
63
+ """
64
+ The [`ControlNetVAEModel`] forward method.
65
+
66
+ Args:
67
+ sample (`torch.Tensor`):
68
+ The noisy input tensor.
69
+ timestep (`Union[torch.Tensor, float, int]`):
70
+ The number of timesteps to denoise an input.
71
+ encoder_hidden_states (`torch.Tensor`):
72
+ The encoder hidden states.
73
+ controlnet_cond (`torch.Tensor`):
74
+ The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
75
+ conditioning_scale (`float`, defaults to `1.0`):
76
+ The scale factor for ControlNet outputs.
77
+ class_labels (`torch.Tensor`, *optional*, defaults to `None`):
78
+ Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
79
+ timestep_cond (`torch.Tensor`, *optional*, defaults to `None`):
80
+ Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the
81
+ timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep
82
+ embeddings.
83
+ attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
84
+ An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
85
+ is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
86
+ negative values to the attention scores corresponding to "discard" tokens.
87
+ added_cond_kwargs (`dict`):
88
+ Additional conditions for the Stable Diffusion XL UNet.
89
+ cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`):
90
+ A kwargs dictionary that if specified is passed along to the `AttnProcessor`.
91
+ guess_mode (`bool`, defaults to `False`):
92
+ In this mode, the ControlNet encoder tries its best to recognize the content of the input even if
93
+ you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
94
+ return_dict (`bool`, defaults to `True`):
95
+ Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple.
96
+
97
+ Returns:
98
+ [`~models.controlnet.ControlNetOutput`] **or** `tuple`:
99
+ If `return_dict` is `True`, a [`~models.controlnet.ControlNetOutput`] is returned, otherwise a tuple is
100
+ returned where the first element is the sample tensor.
101
+ """
102
+ # check channel order
103
+
104
+
105
+ channel_order = self.config.controlnet_conditioning_channel_order
106
+
107
+ if channel_order == "rgb":
108
+ # in rgb order by default
109
+ ...
110
+ elif channel_order == "bgr":
111
+ controlnet_cond = torch.flip(controlnet_cond, dims=[1])
112
+ else:
113
+ raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}")
114
+
115
+ # prepare attention_mask
116
+ if attention_mask is not None:
117
+ attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
118
+ attention_mask = attention_mask.unsqueeze(1)
119
+
120
+ # 1. time
121
+ timesteps = timestep
122
+ if not torch.is_tensor(timesteps):
123
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
124
+ # This would be a good case for the `match` statement (Python 3.10+)
125
+ is_mps = sample.device.type == "mps"
126
+ if isinstance(timestep, float):
127
+ dtype = torch.float32 if is_mps else torch.float64
128
+ else:
129
+ dtype = torch.int32 if is_mps else torch.int64
130
+ timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
131
+ elif len(timesteps.shape) == 0:
132
+ timesteps = timesteps[None].to(sample.device)
133
+
134
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
135
+ timesteps = timesteps.expand(sample.shape[0])
136
+
137
+ t_emb = self.time_proj(timesteps)
138
+
139
+ # timesteps does not contain any weights and will always return f32 tensors
140
+ # but time_embedding might actually be running in fp16. so we need to cast here.
141
+ # there might be better ways to encapsulate this.
142
+ t_emb = t_emb.to(dtype=sample.dtype)
143
+
144
+ emb = self.time_embedding(t_emb, timestep_cond)
145
+ aug_emb = None
146
+
147
+ if self.class_embedding is not None:
148
+ if class_labels is None:
149
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
150
+
151
+ if self.config.class_embed_type == "timestep":
152
+ class_labels = self.time_proj(class_labels)
153
+
154
+ class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
155
+ emb = emb + class_emb
156
+
157
+ if self.config.addition_embed_type is not None:
158
+ if self.config.addition_embed_type == "text":
159
+ aug_emb = self.add_embedding(encoder_hidden_states)
160
+
161
+ elif self.config.addition_embed_type == "text_time":
162
+ if "text_embeds" not in added_cond_kwargs:
163
+ raise ValueError(
164
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
165
+ )
166
+ text_embeds = added_cond_kwargs.get("text_embeds")
167
+ if "time_ids" not in added_cond_kwargs:
168
+ raise ValueError(
169
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
170
+ )
171
+ time_ids = added_cond_kwargs.get("time_ids")
172
+ time_embeds = self.add_time_proj(time_ids.flatten())
173
+ time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
174
+
175
+ add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
176
+ add_embeds = add_embeds.to(emb.dtype)
177
+ aug_emb = self.add_embedding(add_embeds)
178
+
179
+
180
+ emb = emb + aug_emb if aug_emb is not None else emb
181
+ # 2. pre-process
182
+ sample = self.conv_in(sample)
183
+
184
+ # 3. down
185
+ down_block_res_samples = (sample,)
186
+ for downsample_block in self.down_blocks:
187
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
188
+ sample, res_samples = downsample_block(
189
+ hidden_states=sample,
190
+ temb=emb,
191
+ encoder_hidden_states=encoder_hidden_states,
192
+ attention_mask=attention_mask,
193
+ cross_attention_kwargs=cross_attention_kwargs,
194
+ )
195
+ else:
196
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
197
+
198
+ down_block_res_samples += res_samples
199
+
200
+ # 4. mid
201
+ if self.mid_block is not None:
202
+ if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
203
+ sample = self.mid_block(
204
+ sample,
205
+ emb,
206
+ encoder_hidden_states=encoder_hidden_states,
207
+ attention_mask=attention_mask,
208
+ cross_attention_kwargs=cross_attention_kwargs,
209
+ )
210
+ else:
211
+ sample = self.mid_block(sample, emb)
212
+
213
+ # 5. Control net blocks
214
+
215
+ controlnet_down_block_res_samples = ()
216
+
217
+ # NOTE: the ControlNet down blocks are zero convolutions; we skip them here and pass the
+ # residuals through unchanged.
218
+ for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks):
219
+ down_block_res_sample = down_block_res_sample
220
+ controlnet_down_block_res_samples = controlnet_down_block_res_samples + (down_block_res_sample,)
221
+
222
+ down_block_res_samples = controlnet_down_block_res_samples
223
+
224
+ mid_block_res_sample = sample
225
+
226
+ # 6. scaling
227
+ if guess_mode and not self.config.global_pool_conditions:
228
+ scales = torch.logspace(-1, 0, len(down_block_res_samples) + 1, device=sample.device) # 0.1 to 1.0
229
+ scales = scales * conditioning_scale
230
+ down_block_res_samples = [sample * scale for sample, scale in zip(down_block_res_samples, scales)]
231
+ mid_block_res_sample = mid_block_res_sample * scales[-1] # last one
232
+ else:
233
+ down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples]
234
+ mid_block_res_sample = mid_block_res_sample * conditioning_scale
235
+
236
+ if self.config.global_pool_conditions:
237
+ down_block_res_samples = [
238
+ torch.mean(sample, dim=(2, 3), keepdim=True) for sample in down_block_res_samples
239
+ ]
240
+ mid_block_res_sample = torch.mean(mid_block_res_sample, dim=(2, 3), keepdim=True)
241
+
242
+ if not return_dict:
243
+ return (down_block_res_samples, mid_block_res_sample)
244
+
245
+ return ControlNetOutput(
246
+ down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample
247
+ )
248
+
249
+
250
+
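For orientation, here is a minimal, hypothetical sketch of exercising `ControlNetVAEModel` on its own. The checkpoint path and tensor shapes are placeholders and not part of this commit; the subclass only overrides `forward`, which skips the conditioning-embedding path and the zero-conv down blocks and returns the residuals directly.

import torch
from DAI.controlnetvae import ControlNetVAEModel

# Placeholder path; load whichever ControlNet weights accompany this repository.
controlnet = ControlNetVAEModel.from_pretrained("path/to/controlnet", torch_dtype=torch.float32)

sample = torch.randn(1, 4, 96, 96)                # image latent fed as the ControlNet input
encoder_hidden_states = torch.randn(1, 77, 1024)  # text embeddings; width depends on the text encoder

down_res, mid_res = controlnet(
    sample,
    timestep=0,                                   # the single-step setting uses t = 0
    encoder_hidden_states=encoder_hidden_states,
    return_dict=False,
)
print(len(down_res), mid_res.shape)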
DAI/decoder.py ADDED
@@ -0,0 +1,313 @@
1
+
2
+ import torch
3
+ import torch.nn as nn
4
+ from typing import Dict, List, Optional, Tuple, Union
5
+ from diffusers import AutoencoderKL
6
+ from diffusers.models.autoencoders.vae import DecoderOutput, DiagonalGaussianDistribution, Encoder, Decoder
7
+ from diffusers.models.attention_processor import Attention, AttentionProcessor
8
+ from diffusers.models.modeling_outputs import AutoencoderKLOutput
9
+ from diffusers.models.unets.unet_2d_blocks import (
10
+ AutoencoderTinyBlock,
11
+ UNetMidBlock2D,
12
+ get_down_block,
13
+ get_up_block,
14
+ )
15
+ from diffusers.utils import is_torch_version
+ from diffusers.utils.accelerate_utils import apply_forward_hook
16
+
17
+ class ZeroConv2d(nn.Module):
18
+ """
19
+ Zero Convolution layer, similar to the one used in ControlNet.
20
+ """
21
+ def __init__(self, in_channels, out_channels):
22
+ super().__init__()
23
+ self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
24
+ self.conv.weight.data.zero_()
25
+ self.conv.bias.data.zero_()
26
+
27
+ def forward(self, x):
28
+ return self.conv(x)
29
+
30
+ class CustomAutoencoderKL(AutoencoderKL):
31
+ def __init__(
32
+ self,
33
+ in_channels: int = 3,
34
+ out_channels: int = 3,
35
+ down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
36
+ up_block_types: Tuple[str] = ("UpDecoderBlock2D",),
37
+ block_out_channels: Tuple[int] = (64,),
38
+ layers_per_block: int = 1,
39
+ act_fn: str = "silu",
40
+ latent_channels: int = 4,
41
+ norm_num_groups: int = 32,
42
+ sample_size: int = 32,
43
+ scaling_factor: float = 0.18215,
44
+ force_upcast: float = True,
45
+ use_quant_conv: bool = True,
46
+ use_post_quant_conv: bool = True,
47
+ mid_block_add_attention: bool = True,
48
+ ):
49
+ super().__init__(
50
+ in_channels=in_channels,
51
+ out_channels=out_channels,
52
+ down_block_types=down_block_types,
53
+ up_block_types=up_block_types,
54
+ block_out_channels=block_out_channels,
55
+ layers_per_block=layers_per_block,
56
+ act_fn=act_fn,
57
+ latent_channels=latent_channels,
58
+ norm_num_groups=norm_num_groups,
59
+ sample_size=sample_size,
60
+ scaling_factor=scaling_factor,
61
+ force_upcast=force_upcast,
62
+ use_quant_conv=use_quant_conv,
63
+ use_post_quant_conv=use_post_quant_conv,
64
+ mid_block_add_attention=mid_block_add_attention,
65
+ )
66
+
67
+ # Add Zero Convolution layers to the encoder
68
+ # self.zero_convs = nn.ModuleList()
69
+ # for i, out_channels_ in enumerate(block_out_channels):
70
+ # self.zero_convs.append(ZeroConv2d(out_channels_, out_channels_))
71
+
72
+ # Modify the decoder to accept skip connections
73
+ self.decoder = CustomDecoder(
74
+ in_channels=latent_channels,
75
+ out_channels=out_channels,
76
+ up_block_types=up_block_types,
77
+ block_out_channels=block_out_channels,
78
+ layers_per_block=layers_per_block,
79
+ norm_num_groups=norm_num_groups,
80
+ act_fn=act_fn,
81
+ mid_block_add_attention=mid_block_add_attention,
82
+ )
83
+ self.encoder = CustomEncoder(
84
+ in_channels=in_channels,
85
+ out_channels=latent_channels,
86
+ down_block_types=down_block_types,
87
+ block_out_channels=block_out_channels,
88
+ layers_per_block=layers_per_block,
89
+ norm_num_groups=norm_num_groups,
90
+ act_fn=act_fn,
91
+ mid_block_add_attention=mid_block_add_attention,
92
+ )
93
+
94
+ def encode(self, x: torch.Tensor, return_dict: bool = True):
95
+ # Unlike `AutoencoderKL.encode`, this override returns only the skip connections produced by
+ # the custom encoder; the latent posterior is not computed here.
96
+ _, skip_connections = self.encoder(x)
97
+
98
+ return skip_connections
99
+
100
+ def decode(self, z: torch.Tensor, skip_connections: list, return_dict: bool = True):
101
+ if self.post_quant_conv is not None:
102
+ z = self.post_quant_conv(z)
103
+ # Decode the latent representation with skip connections
104
+ dec = self.decoder(z, skip_connections)
105
+
106
+ if not return_dict:
107
+ return (dec,)
108
+
109
+ return DecoderOutput(sample=dec)
110
+
111
+ def forward(
112
+ self,
113
+ sample: torch.Tensor,
114
+ sample_posterior: bool = False,
115
+ return_dict: bool = True,
116
+ generator: Optional[torch.Generator] = None,
117
+ ):
118
+ # Encode the input and get the skip connections
119
+ posterior, skip_connections = self.encode(sample, return_dict=True)
120
+
121
+ # Sample from the posterior
122
+ if sample_posterior:
123
+ z = posterior.sample(generator=generator)
124
+ else:
125
+ z = posterior.mode()
126
+
127
+ # Decode the latent representation with skip connections
128
+ dec = self.decode(z, skip_connections, return_dict=return_dict)
129
+
130
+ if not return_dict:
131
+ return (dec,)
132
+
133
+ return DecoderOutput(sample=dec)
134
+
135
+
136
+ class CustomDecoder(Decoder):
137
+ def __init__(
138
+ self,
139
+ in_channels: int,
140
+ out_channels: int,
141
+ up_block_types: Tuple[str, ...],
142
+ block_out_channels: Tuple[int, ...],
143
+ layers_per_block: int,
144
+ norm_num_groups: int,
145
+ act_fn: str,
146
+ mid_block_add_attention: bool,
147
+ ):
148
+ super().__init__(
149
+ in_channels=in_channels,
150
+ out_channels=out_channels,
151
+ up_block_types=up_block_types,
152
+ block_out_channels=block_out_channels,
153
+ layers_per_block=layers_per_block,
154
+ norm_num_groups=norm_num_groups,
155
+ act_fn=act_fn,
156
+ mid_block_add_attention=mid_block_add_attention,
157
+ )
158
+
159
+ def forward(
160
+ self,
161
+ sample: torch.Tensor,
162
+ skip_connections: list,
163
+ latent_embeds: Optional[torch.Tensor] = None,
164
+ ) -> torch.Tensor:
165
+ r"""The forward method of the `Decoder` class."""
166
+
167
+ sample = self.conv_in(sample)
168
+
169
+ upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
170
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
171
+
172
+ def create_custom_forward(module):
173
+ def custom_forward(*inputs):
174
+ return module(*inputs)
175
+
176
+ return custom_forward
177
+
178
+ if is_torch_version(">=", "1.11.0"):
179
+ # middle
180
+ sample = torch.utils.checkpoint.checkpoint(
181
+ create_custom_forward(self.mid_block),
182
+ sample,
183
+ latent_embeds,
184
+ use_reentrant=False,
185
+ )
186
+ sample = sample.to(upscale_dtype)
187
+
188
+ # up
189
+ for up_block in self.up_blocks:
190
+ sample = torch.utils.checkpoint.checkpoint(
191
+ create_custom_forward(up_block),
192
+ sample,
193
+ latent_embeds,
194
+ use_reentrant=False,
195
+ )
196
+ else:
197
+ # middle
198
+ sample = torch.utils.checkpoint.checkpoint(
199
+ create_custom_forward(self.mid_block), sample, latent_embeds
200
+ )
201
+ sample = sample.to(upscale_dtype)
202
+
203
+ # up
204
+ for up_block in self.up_blocks:
205
+ sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, latent_embeds)
206
+ else:
207
+ # middle
208
+ sample = self.mid_block(sample, latent_embeds)
209
+ sample = sample.to(upscale_dtype)
210
+
211
+ # up
212
+ # for up_block in self.up_blocks:
213
+ # sample = up_block(sample, latent_embeds)
214
+ for i, up_block in enumerate(self.up_blocks):
215
+ # Add skip connections directly
216
+ if i < len(skip_connections):
217
+ skip_connection = skip_connections[-(i + 1)]
218
+ # import pdb; pdb.set_trace()
219
+ sample = sample + skip_connection
220
+ # import pdb; pdb.set_trace() #torch.Size([1, 512, 96, 96]
221
+ sample = up_block(sample)
222
+
223
+ # post-process
224
+ if latent_embeds is None:
225
+ sample = self.conv_norm_out(sample)
226
+ else:
227
+ sample = self.conv_norm_out(sample, latent_embeds)
228
+ sample = self.conv_act(sample)
229
+ sample = self.conv_out(sample)
230
+
231
+ return sample
232
+
233
+ class CustomEncoder(Encoder):
234
+ r"""
235
+ Custom Encoder that adds Zero Convolution layers to each block's output
236
+ to generate skip connections.
237
+ """
238
+ def __init__(
239
+ self,
240
+ in_channels: int = 3,
241
+ out_channels: int = 3,
242
+ down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",),
243
+ block_out_channels: Tuple[int, ...] = (64,),
244
+ layers_per_block: int = 2,
245
+ norm_num_groups: int = 32,
246
+ act_fn: str = "silu",
247
+ double_z: bool = True,
248
+ mid_block_add_attention: bool = True,
249
+ ):
250
+ super().__init__(
251
+ in_channels=in_channels,
252
+ out_channels=out_channels,
253
+ down_block_types=down_block_types,
254
+ block_out_channels=block_out_channels,
255
+ layers_per_block=layers_per_block,
256
+ norm_num_groups=norm_num_groups,
257
+ act_fn=act_fn,
258
+ double_z=double_z,
259
+ mid_block_add_attention=mid_block_add_attention,
260
+ )
261
+
262
+ # Add Zero Convolution layers to each block's output
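+ # For the first two blocks the zero convolution also doubles the channel count so that, once
+ # reversed, each skip connection matches the width of the corresponding decoder up block
+ # (with the standard (128, 256, 512, 512) VAE layout the skips become 256/512/512/512 channels).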
263
+ self.zero_convs = nn.ModuleList()
264
+ for i, out_channels in enumerate(block_out_channels):
265
+ if i < 2:
266
+ self.zero_convs.append(ZeroConv2d(out_channels, out_channels * 2))
267
+ else:
268
+ self.zero_convs.append(ZeroConv2d(out_channels, out_channels))
269
+
270
+ def forward(self, sample: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]:
271
+ r"""
272
+ Forward pass of the CustomEncoder.
273
+
274
+ Args:
275
+ sample (`torch.Tensor`): Input tensor.
276
+
277
+ Returns:
278
+ `Tuple[torch.Tensor, List[torch.Tensor]]`:
279
+ - The final latent representation.
280
+ - A list of skip connections from each block.
281
+ """
282
+ skip_connections = []
283
+
284
+ # Initial convolution
285
+ sample = self.conv_in(sample)
286
+
287
+ # Down blocks
288
+ for i, (down_block, zero_conv) in enumerate(zip(self.down_blocks, self.zero_convs)):
289
+ # import pdb; pdb.set_trace()
290
+ sample = down_block(sample)
291
+ if i != len(self.down_blocks) - 1:
292
+ sample_out = nn.functional.interpolate(zero_conv(sample), scale_factor=2, mode='bilinear', align_corners=False)
293
+ else:
294
+ sample_out = zero_conv(sample)
295
+ skip_connections.append(sample_out)
296
+
297
+
298
+ # import pdb; pdb.set_trace()
299
+ # torch.Size([1, 128, 768, 768])
300
+ # torch.Size([1, 128, 384, 384])
301
+ # torch.Size([1, 256, 192, 192])
302
+ # torch.Size([1, 512, 96, 96])
303
+ # torch.Size([1, 512, 96, 96])
304
+
305
+ # # Middle block
306
+ # sample = self.mid_block(sample)
307
+
308
+ # # Post-process
309
+ # sample = self.conv_norm_out(sample)
310
+ # sample = self.conv_act(sample)
311
+ # sample = self.conv_out(sample)
312
+
313
+ return sample, skip_connections
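A small sketch of how the custom VAE's encode/decode pair fits together (mirroring how the DAI pipeline below uses it). The constructor arguments follow the standard Stable Diffusion VAE layout and the model is randomly initialized purely for illustration; a real run would load trained weights instead.

import torch
from DAI.decoder import CustomAutoencoderKL

# Assumed SD-style VAE layout, for illustration only.
vae_2 = CustomAutoencoderKL(
    down_block_types=("DownEncoderBlock2D",) * 4,
    up_block_types=("UpDecoderBlock2D",) * 4,
    block_out_channels=(128, 256, 512, 512),
    layers_per_block=2,
    latent_channels=4,
).eval()

image = torch.randn(1, 3, 768, 768)   # preprocessed input image
latent = torch.randn(1, 4, 96, 96)    # latent predicted by the UNet (768 / 8 = 96)

with torch.no_grad():
    # `encode` returns only the skip connections tapped off the zero convolutions in the encoder.
    skip_connections = vae_2.encode(image)
    # `decode` adds those skip connections while upsampling the latent back to image space.
    restored = vae_2.decode(latent / vae_2.config.scaling_factor, skip_connections, return_dict=False)[0]
print(restored.shape)  # torch.Size([1, 3, 768, 768])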
DAI/pipeline_all.py ADDED
@@ -0,0 +1,733 @@
1
+ # Copyright 2024 Marigold authors, PRS ETH Zurich. All rights reserved.
2
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # --------------------------------------------------------------------------
16
+ # More information and citation instructions are available on the
+ # Marigold project website: https://marigoldmonodepth.github.io
17
+ # --------------------------------------------------------------------------
18
+ import inspect  # used by `retrieve_timesteps` below
+ from dataclasses import dataclass
19
+ from typing import Any, Dict, List, Optional, Tuple, Union
20
+
21
+ import numpy as np
22
+ import torch
23
+ from PIL import Image
24
+ from tqdm.auto import tqdm
25
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
26
+
27
+
28
+ from diffusers.image_processor import PipelineImageInput
29
+ from diffusers.models import (
30
+ AutoencoderKL,
31
+ UNet2DConditionModel,
32
+ ControlNetModel,
33
+ )
34
+ from diffusers.schedulers import (
35
+ DDIMScheduler
36
+ )
37
+
38
+ from diffusers.utils import (
39
+ BaseOutput,
40
+ logging,
41
+ replace_example_docstring,
42
+ )
43
+
44
+
45
+ from diffusers.utils.torch_utils import randn_tensor
46
+ from diffusers.pipelines.controlnet import StableDiffusionControlNetPipeline
47
+ from diffusers.pipelines.marigold.marigold_image_processing import MarigoldImageProcessor
48
+ from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
49
+
50
+ from DAI.decoder import CustomAutoencoderKL
51
+
52
+ import pdb
53
+
54
+
55
+
56
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
57
+
58
+
59
+ EXAMPLE_DOC_STRING = """
60
+ Examples:
61
+ ```py
62
+ >>> import diffusers
63
+ >>> import torch
64
+
65
+ >>> pipe = diffusers.MarigoldNormalsPipeline.from_pretrained(
66
+ ... "prs-eth/marigold-normals-lcm-v0-1", variant="fp16", torch_dtype=torch.float16
67
+ ... ).to("cuda")
68
+
69
+ >>> image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
70
+ >>> normals = pipe(image)
71
+
72
+ >>> vis = pipe.image_processor.visualize_normals(normals.prediction)
73
+ >>> vis[0].save("einstein_normals.png")
74
+ ```
75
+ """
76
+
77
+
78
+ @dataclass
79
+ class DAIOutput(BaseOutput):
80
+ """
81
+ Output class for the DAI prediction pipeline.
+
+ Args:
+ prediction (`np.ndarray`, `torch.Tensor`):
+ Predicted image. The shape is always $numimages \times 3 \times height \times width$, regardless of
+ whether the images were passed as a 4D array or a list.
+ latent (`None`, `torch.Tensor`):
+ Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
+ The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
+ gaus_noise (`None`, `torch.Tensor`):
+ The initial latent captured before the UNet forward pass, returned for potential reuse.
93
+ """
94
+
95
+ prediction: Union[np.ndarray, torch.Tensor]
96
+ latent: Union[None, torch.Tensor]
97
+ gaus_noise: Union[None, torch.Tensor]
98
+
99
+
100
+ class DAIPipeline(StableDiffusionControlNetPipeline):
101
+ """ Pipeline for monocular normals estimation using the Marigold method: https://marigoldmonodepth.github.io.
102
+ Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance.
103
+
104
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
105
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
106
+
107
+ The pipeline also inherits the following loading methods:
108
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
109
+ - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
110
+ - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
111
+ - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
112
+ - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
113
+
114
+ Args:
115
+ vae ([`AutoencoderKL`]):
116
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
117
+ text_encoder ([`~transformers.CLIPTextModel`]):
118
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
119
+ tokenizer ([`~transformers.CLIPTokenizer`]):
120
+ A `CLIPTokenizer` to tokenize text.
121
+ unet ([`UNet2DConditionModel`]):
122
+ A `UNet2DConditionModel` to denoise the encoded image latents.
123
+ controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
124
+ Provides additional conditioning to the `unet` during the denoising process. If you set multiple
125
+ ControlNets as a list, the outputs from each ControlNet are added together to create one combined
126
+ additional conditioning.
127
+ scheduler ([`SchedulerMixin`]):
128
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
129
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
130
+ safety_checker ([`StableDiffusionSafetyChecker`]):
131
+ Classification module that estimates whether generated images could be considered offensive or harmful.
132
+ Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
133
+ about a model's potential harms.
134
+ feature_extractor ([`~transformers.CLIPImageProcessor`]):
135
+ A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
136
+ """
137
+
138
+ model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
139
+ _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
140
+ _exclude_from_cpu_offload = ["safety_checker"]
141
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
142
+
143
+
144
+
145
+ def __init__(
146
+ self,
147
+ # vae_2: CustomAutoencoderKL,
148
+ vae: AutoencoderKL,
149
+ text_encoder: CLIPTextModel,
150
+ tokenizer: CLIPTokenizer,
151
+ unet: UNet2DConditionModel,
152
+ controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel]],
153
+ scheduler: Union[DDIMScheduler],
154
+ safety_checker: StableDiffusionSafetyChecker,
155
+ feature_extractor: CLIPImageProcessor,
156
+ image_encoder: CLIPVisionModelWithProjection = None,
157
+ requires_safety_checker: bool = True,
158
+ default_denoising_steps: Optional[int] = 1,
159
+ default_processing_resolution: Optional[int] = 768,
160
+ prompt="remove glass reflection",
161
+ empty_text_embedding=None,
162
+ t_start: Optional[int] = 0,
163
+ ):
164
+ super().__init__(
165
+ vae,
166
+ text_encoder,
167
+ tokenizer,
168
+ unet,
169
+ controlnet,
170
+ scheduler,
171
+ safety_checker,
172
+ feature_extractor,
173
+ image_encoder,
174
+ requires_safety_checker,
175
+ )
176
+ # self.vae_2 = vae_2
177
+
178
+ self.image_processor = MarigoldImageProcessor(vae_scale_factor=self.vae_scale_factor)
179
+ self.control_image_processor = MarigoldImageProcessor(vae_scale_factor=self.vae_scale_factor)
180
+ self.default_denoising_steps = default_denoising_steps
181
+ self.default_processing_resolution = default_processing_resolution
182
+ self.prompt = prompt
183
+ self.prompt_embeds = None
184
+ self.empty_text_embedding = empty_text_embedding
185
+ self.t_start = t_start  # timestep at which the single-step prediction is performed
186
+
187
+
188
+ def check_inputs(
189
+ self,
190
+ image: PipelineImageInput,
191
+ num_inference_steps: int,
192
+ ensemble_size: int,
193
+ processing_resolution: int,
194
+ resample_method_input: str,
195
+ resample_method_output: str,
196
+ batch_size: int,
197
+ ensembling_kwargs: Optional[Dict[str, Any]],
198
+ latents: Optional[torch.Tensor],
199
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]],
200
+ output_type: str,
201
+ output_uncertainty: bool,
202
+ ) -> int:
203
+ if num_inference_steps is None:
204
+ raise ValueError("`num_inference_steps` is not specified and could not be resolved from the model config.")
205
+ if num_inference_steps < 1:
206
+ raise ValueError("`num_inference_steps` must be positive.")
207
+ if ensemble_size < 1:
208
+ raise ValueError("`ensemble_size` must be positive.")
209
+ if ensemble_size == 2:
210
+ logger.warning(
211
+ "`ensemble_size` == 2 results are similar to no ensembling (1); "
212
+ "consider increasing the value to at least 3."
213
+ )
214
+ if ensemble_size == 1 and output_uncertainty:
215
+ raise ValueError(
216
+ "Computing uncertainty by setting `output_uncertainty=True` also requires setting `ensemble_size` "
217
+ "greater than 1."
218
+ )
219
+ if processing_resolution is None:
220
+ raise ValueError(
221
+ "`processing_resolution` is not specified and could not be resolved from the model config."
222
+ )
223
+ if processing_resolution < 0:
224
+ raise ValueError(
225
+ "`processing_resolution` must be non-negative: 0 for native resolution, or any positive value for "
226
+ "downsampled processing."
227
+ )
228
+ if processing_resolution % self.vae_scale_factor != 0:
229
+ raise ValueError(f"`processing_resolution` must be a multiple of {self.vae_scale_factor}.")
230
+ if resample_method_input not in ("nearest", "nearest-exact", "bilinear", "bicubic", "area"):
231
+ raise ValueError(
232
+ "`resample_method_input` takes string values compatible with PIL library: "
233
+ "nearest, nearest-exact, bilinear, bicubic, area."
234
+ )
235
+ if resample_method_output not in ("nearest", "nearest-exact", "bilinear", "bicubic", "area"):
236
+ raise ValueError(
237
+ "`resample_method_output` takes string values compatible with PIL library: "
238
+ "nearest, nearest-exact, bilinear, bicubic, area."
239
+ )
240
+ if batch_size < 1:
241
+ raise ValueError("`batch_size` must be positive.")
242
+ if output_type not in ["pt", "np"]:
243
+ raise ValueError("`output_type` must be one of `pt` or `np`.")
244
+ if latents is not None and generator is not None:
245
+ raise ValueError("`latents` and `generator` cannot be used together.")
246
+ if ensembling_kwargs is not None:
247
+ if not isinstance(ensembling_kwargs, dict):
248
+ raise ValueError("`ensembling_kwargs` must be a dictionary.")
249
+ if "reduction" in ensembling_kwargs and ensembling_kwargs["reduction"] not in ("closest", "mean"):
250
+ raise ValueError("`ensembling_kwargs['reduction']` can be either `'closest'` or `'mean'`.")
251
+
252
+ # image checks
253
+ num_images = 0
254
+ W, H = None, None
255
+ if not isinstance(image, list):
256
+ image = [image]
257
+ for i, img in enumerate(image):
258
+ if isinstance(img, np.ndarray) or torch.is_tensor(img):
259
+ if img.ndim not in (2, 3, 4):
260
+ raise ValueError(f"`image[{i}]` has unsupported dimensions or shape: {img.shape}.")
261
+ H_i, W_i = img.shape[-2:]
262
+ N_i = 1
263
+ if img.ndim == 4:
264
+ N_i = img.shape[0]
265
+ elif isinstance(img, Image.Image):
266
+ W_i, H_i = img.size
267
+ N_i = 1
268
+ else:
269
+ raise ValueError(f"Unsupported `image[{i}]` type: {type(img)}.")
270
+ if W is None:
271
+ W, H = W_i, H_i
272
+ elif (W, H) != (W_i, H_i):
273
+ raise ValueError(
274
+ f"Input `image[{i}]` has incompatible dimensions {(W_i, H_i)} with the previous images {(W, H)}"
275
+ )
276
+ num_images += N_i
277
+
278
+ # latents checks
279
+ if latents is not None:
280
+ if not torch.is_tensor(latents):
281
+ raise ValueError("`latents` must be a torch.Tensor.")
282
+ if latents.dim() != 4:
283
+ raise ValueError(f"`latents` has unsupported dimensions or shape: {latents.shape}.")
284
+
285
+ if processing_resolution > 0:
286
+ max_orig = max(H, W)
287
+ new_H = H * processing_resolution // max_orig
288
+ new_W = W * processing_resolution // max_orig
289
+ if new_H == 0 or new_W == 0:
290
+ raise ValueError(f"Extreme aspect ratio of the input image: [{W} x {H}]")
291
+ W, H = new_W, new_H
292
+ w = (W + self.vae_scale_factor - 1) // self.vae_scale_factor
293
+ h = (H + self.vae_scale_factor - 1) // self.vae_scale_factor
294
+ shape_expected = (num_images * ensemble_size, self.vae.config.latent_channels, h, w)
295
+
296
+ if latents.shape != shape_expected:
297
+ raise ValueError(f"`latents` has unexpected shape={latents.shape} expected={shape_expected}.")
298
+
299
+ # generator checks
300
+ if generator is not None:
301
+ if isinstance(generator, list):
302
+ if len(generator) != num_images * ensemble_size:
303
+ raise ValueError(
304
+ "The number of generators must match the total number of ensemble members for all input images."
305
+ )
306
+ if not all(g.device.type == generator[0].device.type for g in generator):
307
+ raise ValueError("`generator` device placement is not consistent in the list.")
308
+ elif not isinstance(generator, torch.Generator):
309
+ raise ValueError(f"Unsupported generator type: {type(generator)}.")
310
+
311
+ return num_images
312
+
313
+ def progress_bar(self, iterable=None, total=None, desc=None, leave=True):
314
+ if not hasattr(self, "_progress_bar_config"):
315
+ self._progress_bar_config = {}
316
+ elif not isinstance(self._progress_bar_config, dict):
317
+ raise ValueError(
318
+ f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}."
319
+ )
320
+
321
+ progress_bar_config = dict(**self._progress_bar_config)
322
+ progress_bar_config["desc"] = progress_bar_config.get("desc", desc)
323
+ progress_bar_config["leave"] = progress_bar_config.get("leave", leave)
324
+ if iterable is not None:
325
+ return tqdm(iterable, **progress_bar_config)
326
+ elif total is not None:
327
+ return tqdm(total=total, **progress_bar_config)
328
+ else:
329
+ raise ValueError("Either `total` or `iterable` has to be defined.")
330
+
331
+ @torch.no_grad()
332
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
333
+ def __call__(
334
+ self,
335
+ image: PipelineImageInput,
336
+ vae_2: CustomAutoencoderKL,
337
+ prompt: Union[str, List[str]] = None,
338
+ negative_prompt: Optional[Union[str, List[str]]] = None,
339
+ num_inference_steps: Optional[int] = None,
340
+ ensemble_size: int = 1,
341
+ processing_resolution: Optional[int] = None,
342
+ match_input_resolution: bool = True,
343
+ resample_method_input: str = "bilinear",
344
+ resample_method_output: str = "bilinear",
345
+ batch_size: int = 1,
346
+ ensembling_kwargs: Optional[Dict[str, Any]] = None,
347
+ latents: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
348
+ prompt_embeds: Optional[torch.Tensor] = None,
349
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
350
+ num_images_per_prompt: Optional[int] = 1,
351
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
352
+ controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
353
+ output_type: str = "np",
354
+ output_uncertainty: bool = False,
355
+ output_latent: bool = False,
356
+ skip_preprocess: bool = False,
357
+ return_dict: bool = True,
358
+ **kwargs,
359
+ ):
360
+ """
361
+ Function invoked when calling the pipeline.
362
+
363
+ Args:
364
+ image (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`),
365
+ `List[torch.Tensor]`: An input image or images used as an input for the prediction task. For
366
+ arrays and tensors, the expected value range is between `[0, 1]`. Passing a batch of images is possible
367
+ by providing a four-dimensional array or a tensor. Additionally, a list of images of two- or
368
+ three-dimensional arrays or tensors can be passed. In the latter case, all list elements must have the
369
+ same width and height.
370
+ num_inference_steps (`int`, *optional*, defaults to `None`):
371
+ Number of denoising diffusion steps during inference. The default value `None` results in automatic
372
+ selection. The number of steps should be at least 10 with the full Marigold models, and between 1 and 4
373
+ for Marigold-LCM models.
374
+ ensemble_size (`int`, defaults to `1`):
375
+ Number of ensemble predictions. Recommended values are 5 and higher for better precision, or 1 for
376
+ faster inference.
377
+ processing_resolution (`int`, *optional*, defaults to `None`):
378
+ Effective processing resolution. When set to `0`, matches the larger input image dimension. This
379
+ produces crisper predictions, but may also lead to the overall loss of global context. The default
380
+ value `None` resolves to the optimal value from the model config.
381
+ match_input_resolution (`bool`, *optional*, defaults to `True`):
382
+ When enabled, the output prediction is resized to match the input dimensions. When disabled, the longer
383
+ side of the output will equal to `processing_resolution`.
384
+ resample_method_input (`str`, *optional*, defaults to `"bilinear"`):
385
+ Resampling method used to resize input images to `processing_resolution`. The accepted values are:
386
+ `"nearest"`, `"nearest-exact"`, `"bilinear"`, `"bicubic"`, or `"area"`.
387
+ resample_method_output (`str`, *optional*, defaults to `"bilinear"`):
388
+ Resampling method used to resize output predictions to match the input resolution. The accepted values
389
+ are `"nearest"`, `"nearest-exact"`, `"bilinear"`, `"bicubic"`, or `"area"`.
390
+ batch_size (`int`, *optional*, defaults to `1`):
391
+ Batch size; only matters when setting `ensemble_size` or passing a tensor of images.
392
+ ensembling_kwargs (`dict`, *optional*, defaults to `None`)
393
+ Extra dictionary with arguments for precise ensembling control. The following options are available:
394
+ - reduction (`str`, *optional*, defaults to `"closest"`): Defines the ensembling function applied in
395
+ every pixel location, can be either `"closest"` or `"mean"`.
396
+ latents (`torch.Tensor`, *optional*, defaults to `None`):
397
+ Latent noise tensors to replace the random initialization. These can be taken from the previous
398
+ function call's output.
399
+ generator (`torch.Generator`, or `List[torch.Generator]`, *optional*, defaults to `None`):
400
+ Random number generator object to ensure reproducibility.
401
+ output_type (`str`, *optional*, defaults to `"np"`):
402
+ Preferred format of the output's `prediction` and the optional `uncertainty` fields. The accepted
403
+ values are: `"np"` (numpy array) or `"pt"` (torch tensor).
404
+ output_uncertainty (`bool`, *optional*, defaults to `False`):
405
+ When enabled, the output's `uncertainty` field contains the predictive uncertainty map, provided that
406
+ the `ensemble_size` argument is set to a value above 2.
407
+ output_latent (`bool`, *optional*, defaults to `False`):
408
+ When enabled, the output's `latent` field contains the latent codes corresponding to the predictions
409
+ within the ensemble. These codes can be saved, modified, and used for subsequent calls with the
410
+ `latents` argument.
411
+ return_dict (`bool`, *optional*, defaults to `True`):
412
+ Whether or not to return a [`DAIOutput`] instead of a plain tuple.
413
+
414
+ Examples:
415
+
416
+ Returns:
417
+ [`DAIOutput`] or `tuple`:
+ If `return_dict` is `True`, a [`DAIOutput`] is returned, otherwise a `tuple` is returned where the
+ first element is the prediction, the second element is the latent (or `None`), and the third is the
+ initial latent noise (or `None`).
421
+ """
422
+
423
+ # 0. Resolving variables.
424
+ device = self._execution_device
425
+ dtype = self.dtype
426
+
427
+ # Model-specific optimal default values leading to fast and reasonable results.
428
+ if num_inference_steps is None:
429
+ num_inference_steps = self.default_denoising_steps
430
+ if processing_resolution is None:
431
+ processing_resolution = self.default_processing_resolution
432
+
433
+
434
+ # 1. Check inputs.
435
+ num_images = self.check_inputs(
436
+ image,
437
+ num_inference_steps,
438
+ ensemble_size,
439
+ processing_resolution,
440
+ resample_method_input,
441
+ resample_method_output,
442
+ batch_size,
443
+ ensembling_kwargs,
444
+ latents,
445
+ generator,
446
+ output_type,
447
+ output_uncertainty,
448
+ )
449
+
450
+
451
+ # 2. Prepare empty text conditioning.
452
+ # Model invocation: self.tokenizer, self.text_encoder.
453
+ if self.empty_text_embedding is None:
454
+ prompt = ""
455
+ text_inputs = self.tokenizer(
456
+ prompt,
457
+ padding="do_not_pad",
458
+ max_length=self.tokenizer.model_max_length,
459
+ truncation=True,
460
+ return_tensors="pt",
461
+ )
462
+ text_input_ids = text_inputs.input_ids.to(device)
463
+ self.empty_text_embedding = self.text_encoder(text_input_ids)[0] # [1,2,1024]
464
+
465
+
466
+
467
+ # 3. prepare prompt
468
+ if self.prompt_embeds is None:
469
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
470
+ self.prompt,
471
+ device,
472
+ num_images_per_prompt,
473
+ False,
474
+ negative_prompt,
475
+ prompt_embeds=prompt_embeds,
476
+ negative_prompt_embeds=None,
477
+ lora_scale=None,
478
+ clip_skip=None,
479
+ )
480
+ self.prompt_embeds = prompt_embeds
481
+ self.negative_prompt_embeds = negative_prompt_embeds
482
+
483
+
484
+
485
+ # 4. Preprocess input images. This function loads input image or images of compatible dimensions `(H, W)`,
486
+ # optionally downsamples them to the `processing_resolution` `(PH, PW)`, where
487
+ # `max(PH, PW) == processing_resolution`, and pads the dimensions to `(PPH, PPW)` such that these values are
488
+ # divisible by the latent space downscaling factor (typically 8 in Stable Diffusion). The default value `None`
489
+ # of `processing_resolution` resolves to the optimal value from the model config. It is a recommended mode of
490
+ # operation and leads to the most reasonable results. Using the native image resolution or any other processing
491
+ # resolution can lead to loss of either fine details or global context in the output predictions.
492
+ if not skip_preprocess:
493
+ image, padding, original_resolution = self.image_processor.preprocess(
494
+ image, processing_resolution, resample_method_input, device, dtype
495
+ ) # [N,3,PPH,PPW]
496
+ else:
497
+ padding = (0, 0)
498
+ original_resolution = image.shape[2:]
499
+ # 5. Encode input image into latent space. At this step, each of the `N` input images is represented with `E`
500
+ # ensemble members. Each ensemble member is an independent diffused prediction, just initialized independently.
501
+ # Latents of each such predictions across all input images and all ensemble members are represented in the
502
+ # `pred_latent` variable. The variable `image_latent` is of the same shape: it contains each input image encoded
503
+ # into latent space and replicated `E` times. The latents can be either generated (see `generator` to ensure
504
+ # reproducibility), or passed explicitly via the `latents` argument. The latter can be set outside the pipeline
505
+ # code. For example, in the Marigold-LCM video processing demo, the latents initialization of a frame is taken
506
+ # as a convex combination of the latents output of the pipeline for the previous frame and a newly-sampled
507
+ # noise. This behavior can be achieved by setting the `output_latent` argument to `True`. The latent space
508
+ # dimensions are `(h, w)`. Encoding into latent space happens in batches of size `batch_size`.
509
+ # Model invocation: self.vae.encoder.
510
+ image_latent, pred_latent = self.prepare_latents(
511
+ image, latents, generator, ensemble_size, batch_size
512
+ ) # [N*E,4,h,w], [N*E,4,h,w]
513
+
514
+ gaus_noise = pred_latent.detach().clone()
515
+ # del image
516
+
517
+
518
+ # 6. obtain control_output
519
+
520
+ cond_scale = controlnet_conditioning_scale
521
+ down_block_res_samples, mid_block_res_sample = self.controlnet(
522
+ image_latent.detach(),
523
+ self.t_start,
524
+ encoder_hidden_states=self.prompt_embeds,
525
+ conditioning_scale=cond_scale,
526
+ guess_mode=False,
527
+ return_dict=False,
528
+ )
529
+
530
+ # 7. Onestep sampling
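+ # A single UNet forward pass at timestep `t_start` (0 by default) produces the prediction latent
+ # directly, with the ControlNet residuals injected; no iterative denoising loop is run.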
531
+ latent_x_t = self.unet(
532
+ pred_latent,
533
+ self.t_start,
534
+ encoder_hidden_states=self.prompt_embeds,
535
+ down_block_additional_residuals=down_block_res_samples,
536
+ mid_block_additional_residual=mid_block_res_sample,
537
+ return_dict=False,
538
+ )[0]
539
+
540
+
541
+ del (
542
+ pred_latent,
543
+ image_latent,
544
+ )
545
+
546
+ # 8. Decode. The second VAE encodes the original image into skip connections, which the custom
+ # decoder consumes while decoding the predicted latent back to image space.
+ skip_connections = vae_2.encode(image)
+ prediction = self.decode_prediction(latent_x_t, skip_connections, vae_2)
550
+
551
+
552
+ prediction = self.image_processor.unpad_image(prediction, padding) # [N*E,3,PH,PW]
553
+
554
+ prediction = self.image_processor.resize_antialias(
555
+ prediction, original_resolution, resample_method_output, is_aa=False
556
+ ) # [N,3,H,W]
557
+
558
+ if output_type == "np":
559
+ prediction = self.image_processor.pt_to_numpy(prediction) # [N,H,W,3]
560
+
561
+ # 11. Offload all models
562
+ self.maybe_free_model_hooks()
563
+
564
+ return DAIOutput(
565
+ prediction=prediction,
566
+ latent=latent_x_t,
567
+ gaus_noise=gaus_noise,
568
+ )
569
+
570
+ # Copied from diffusers.pipelines.marigold.pipeline_marigold_depth.MarigoldDepthPipeline.prepare_latents
571
+ def prepare_latents(
572
+ self,
573
+ image: torch.Tensor,
574
+ latents: Optional[torch.Tensor],
575
+ generator: Optional[torch.Generator],
576
+ ensemble_size: int,
577
+ batch_size: int,
578
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
579
+ def retrieve_latents(encoder_output):
580
+ if hasattr(encoder_output, "latent_dist"):
581
+ return encoder_output.latent_dist.mode()
582
+ elif hasattr(encoder_output, "latents"):
583
+ return encoder_output.latents
584
+ else:
585
+ raise AttributeError("Could not access latents of provided encoder_output")
586
+
587
+
588
+
589
+ image_latent = torch.cat(
590
+ [
591
+ retrieve_latents(self.vae.encode(image[i : i + batch_size]))
592
+ for i in range(0, image.shape[0], batch_size)
593
+ ],
594
+ dim=0,
595
+ ) # [N,4,h,w]
596
+ image_latent = image_latent * self.vae.config.scaling_factor
597
+ image_latent = image_latent.repeat_interleave(ensemble_size, dim=0) # [N*E,4,h,w]
598
+
599
+ pred_latent = latents
+ if pred_latent is None:
+ # Deterministic single-step prediction: initialize the latent with zeros rather than Gaussian noise.
+ pred_latent = torch.zeros_like(image_latent) # [N*E,4,h,w]
607
+
608
+ return image_latent, pred_latent
609
+
610
+ def decode_prediction(self, pred_latent: torch.Tensor, skip_connections: list, vae_2: CustomAutoencoderKL) -> torch.Tensor:
611
+ if pred_latent.dim() != 4 or pred_latent.shape[1] != vae_2.config.latent_channels:
612
+ raise ValueError(
613
+ f"Expecting 4D tensor of shape [B,{vae_2.config.latent_channels},H,W]; got {pred_latent.shape}."
614
+ )
615
+
616
+ prediction = vae_2.decode(pred_latent / vae_2.config.scaling_factor, skip_connections, return_dict=False)[0] # [B,3,H,W]
617
+
618
+ return prediction # [B,3,H,W]
619
+
620
+ @staticmethod
621
+ def normalize_normals(normals: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
622
+ if normals.dim() != 4 or normals.shape[1] != 3:
623
+ raise ValueError(f"Expecting 4D tensor of shape [B,3,H,W]; got {normals.shape}.")
624
+
625
+ norm = torch.norm(normals, dim=1, keepdim=True)
626
+ normals /= norm.clamp(min=eps)
627
+
628
+ return normals
629
+
630
+ @staticmethod
631
+ def ensemble_normals(
632
+ normals: torch.Tensor, output_uncertainty: bool, reduction: str = "closest"
633
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
634
+ """
635
+ Ensembles the normals maps represented by the `normals` tensor with expected shape `(B, 3, H, W)`, where B is
636
+ the number of ensemble members for a given prediction of size `(H x W)`.
637
+
638
+ Args:
639
+ normals (`torch.Tensor`):
640
+ Input ensemble normals maps.
641
+ output_uncertainty (`bool`, *optional*, defaults to `False`):
642
+ Whether to output uncertainty map.
643
+ reduction (`str`, *optional*, defaults to `"closest"`):
644
+ Reduction method used to ensemble aligned predictions. The accepted values are: `"closest"` and
645
+ `"mean"`.
646
+
647
+ Returns:
648
+ A tensor of aligned and ensembled normals maps with shape `(1, 3, H, W)` and optionally a tensor of
649
+ uncertainties of shape `(1, 1, H, W)`.
650
+ """
651
+ if normals.dim() != 4 or normals.shape[1] != 3:
652
+ raise ValueError(f"Expecting 4D tensor of shape [B,3,H,W]; got {normals.shape}.")
653
+ if reduction not in ("closest", "mean"):
654
+ raise ValueError(f"Unrecognized reduction method: {reduction}.")
655
+
656
+ mean_normals = normals.mean(dim=0, keepdim=True) # [1,3,H,W]
657
+ mean_normals = DAIPipeline.normalize_normals(mean_normals) # [1,3,H,W]
658
+
659
+ sim_cos = (mean_normals * normals).sum(dim=1, keepdim=True) # [E,1,H,W]
660
+ sim_cos = sim_cos.clamp(-1, 1) # required to avoid NaN in uncertainty with fp16
661
+
662
+ uncertainty = None
663
+ if output_uncertainty:
664
+ uncertainty = sim_cos.arccos() # [E,1,H,W]
665
+ uncertainty = uncertainty.mean(dim=0, keepdim=True) / np.pi # [1,1,H,W]
666
+
667
+ if reduction == "mean":
668
+ return mean_normals, uncertainty # [1,3,H,W], [1,1,H,W]
669
+
670
+ closest_indices = sim_cos.argmax(dim=0, keepdim=True) # [1,1,H,W]
671
+ closest_indices = closest_indices.repeat(1, 3, 1, 1) # [1,3,H,W]
672
+ closest_normals = torch.gather(normals, 0, closest_indices) # [1,3,H,W]
673
+
674
+ return closest_normals, uncertainty # [1,3,H,W], [1,1,H,W]
675
+
676
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
677
+ def retrieve_timesteps(
678
+ scheduler,
679
+ num_inference_steps: Optional[int] = None,
680
+ device: Optional[Union[str, torch.device]] = None,
681
+ timesteps: Optional[List[int]] = None,
682
+ sigmas: Optional[List[float]] = None,
683
+ **kwargs,
684
+ ):
685
+ """
686
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
687
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
688
+
689
+ Args:
690
+ scheduler (`SchedulerMixin`):
691
+ The scheduler to get timesteps from.
692
+ num_inference_steps (`int`):
693
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
694
+ must be `None`.
695
+ device (`str` or `torch.device`, *optional*):
696
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
697
+ timesteps (`List[int]`, *optional*):
698
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
699
+ `num_inference_steps` and `sigmas` must be `None`.
700
+ sigmas (`List[float]`, *optional*):
701
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
702
+ `num_inference_steps` and `timesteps` must be `None`.
703
+
704
+ Returns:
705
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
706
+ second element is the number of inference steps.
707
+ """
708
+ if timesteps is not None and sigmas is not None:
709
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
710
+ if timesteps is not None:
711
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
712
+ if not accepts_timesteps:
713
+ raise ValueError(
714
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
715
+ f" timestep schedules. Please check whether you are using the correct scheduler."
716
+ )
717
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
718
+ timesteps = scheduler.timesteps
719
+ num_inference_steps = len(timesteps)
720
+ elif sigmas is not None:
721
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
722
+ if not accept_sigmas:
723
+ raise ValueError(
724
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
725
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
726
+ )
727
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
728
+ timesteps = scheduler.timesteps
729
+ num_inference_steps = len(timesteps)
730
+ else:
731
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
732
+ timesteps = scheduler.timesteps
733
+ return timesteps, num_inference_steps
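The helper above only delegates to `scheduler.set_timesteps` and echoes the resulting schedule back, so it can be exercised in isolation. A minimal sketch, assuming `retrieve_timesteps` is in scope (imported from this pipeline module) and using a default-configured `DDIMScheduler` purely for illustration:

```python
from diffusers import DDIMScheduler

scheduler = DDIMScheduler()  # default config: 1000 training timesteps

# Default path: the scheduler derives its own spacing for the requested step count.
timesteps, num_steps = retrieve_timesteps(scheduler, num_inference_steps=10, device="cpu")
print(num_steps, timesteps.shape)  # 10, torch.Size([10])

# Custom `timesteps`/`sigmas` are honored only when the scheduler's `set_timesteps`
# accepts them; otherwise the helper raises a ValueError, as coded above.
```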
DAI/pipeline_onestep.py ADDED
@@ -0,0 +1,723 @@
1
+ # Copyright 2024 Marigold authors, PRS ETH Zurich. All rights reserved.
2
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # --------------------------------------------------------------------------
16
+ # More information and citation instructions are available on the Marigold project website: https://marigoldmonodepth.github.io
17
+ # --------------------------------------------------------------------------
18
+ from dataclasses import dataclass
19
+ from typing import Any, Dict, List, Optional, Tuple, Union
20
+
21
+ import inspect
+ import numpy as np
22
+ import torch
23
+ from PIL import Image
24
+ from tqdm.auto import tqdm
25
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
26
+
27
+
28
+ from diffusers.image_processor import PipelineImageInput
29
+ from diffusers.models import (
30
+ AutoencoderKL,
31
+ UNet2DConditionModel,
32
+ ControlNetModel,
33
+ )
34
+ from diffusers.schedulers import (
35
+ DDIMScheduler
36
+ )
37
+
38
+ from diffusers.utils import (
39
+ BaseOutput,
40
+ logging,
41
+ replace_example_docstring,
42
+ )
43
+
44
+
45
+ from diffusers.utils.torch_utils import randn_tensor
46
+ from diffusers.pipelines.controlnet import StableDiffusionControlNetPipeline
47
+ from diffusers.pipelines.marigold.marigold_image_processing import MarigoldImageProcessor
48
+ from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
49
+
50
+ import pdb
51
+
52
+
53
+
54
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
55
+
56
+
57
+ EXAMPLE_DOC_STRING = """
58
+ Examples:
59
+ ```py
60
+ >>> import diffusers
61
+ >>> import torch
62
+
63
+ >>> pipe = diffusers.MarigoldNormalsPipeline.from_pretrained(
64
+ ... "prs-eth/marigold-normals-lcm-v0-1", variant="fp16", torch_dtype=torch.float16
65
+ ... ).to("cuda")
66
+
67
+ >>> image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
68
+ >>> normals = pipe(image)
69
+
70
+ >>> vis = pipe.image_processor.visualize_normals(normals.prediction)
71
+ >>> vis[0].save("einstein_normals.png")
72
+ ```
73
+ """
74
+
75
+
76
+ @dataclass
77
+ class DAIOutput(BaseOutput):
78
+ """
79
+ Output class for the one-step DAI (Dereflection Any Image) prediction pipeline.
+
+ Args:
+ prediction (`np.ndarray`, `torch.Tensor`):
+ Predicted (reflection-removed) image with values in the range [-1, 1]. The shape is always $numimages
+ \times 3 \times height \times width$, regardless of whether the images were passed as a 4D array or a list.
+ latent (`None`, `torch.Tensor`):
+ Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
+ The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
+ gaus_noise (`None`, `torch.Tensor`):
+ The initial prediction latent recorded before the one-step UNet pass; same shape as `latent`.
91
+ """
92
+
93
+ prediction: Union[np.ndarray, torch.Tensor]
94
+ latent: Union[None, torch.Tensor]
95
+ gaus_noise: Union[None, torch.Tensor]
96
+
97
+
98
+ class OneStepPipeline(StableDiffusionControlNetPipeline):
99
+ """ One-step pipeline for image dereflection, built on Stable Diffusion with ControlNet guidance
+ and adapted from the Marigold pipelines: https://marigoldmonodepth.github.io.
101
+
102
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
103
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
104
+
105
+ The pipeline also inherits the following loading methods:
106
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
107
+ - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
108
+ - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
109
+ - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
110
+ - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
111
+
112
+ Args:
113
+ vae ([`AutoencoderKL`]):
114
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
115
+ text_encoder ([`~transformers.CLIPTextModel`]):
116
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
117
+ tokenizer ([`~transformers.CLIPTokenizer`]):
118
+ A `CLIPTokenizer` to tokenize text.
119
+ unet ([`UNet2DConditionModel`]):
120
+ A `UNet2DConditionModel` to denoise the encoded image latents.
121
+ controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
122
+ Provides additional conditioning to the `unet` during the denoising process. If you set multiple
123
+ ControlNets as a list, the outputs from each ControlNet are added together to create one combined
124
+ additional conditioning.
125
+ scheduler ([`SchedulerMixin`]):
126
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
127
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
128
+ safety_checker ([`StableDiffusionSafetyChecker`]):
129
+ Classification module that estimates whether generated images could be considered offensive or harmful.
130
+ Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
131
+ about a model's potential harms.
132
+ feature_extractor ([`~transformers.CLIPImageProcessor`]):
133
+ A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
134
+ """
135
+
136
+ model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
137
+ _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
138
+ _exclude_from_cpu_offload = ["safety_checker"]
139
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
140
+
141
+
142
+
143
+ def __init__(
144
+ self,
145
+ vae: AutoencoderKL,
146
+ text_encoder: CLIPTextModel,
147
+ tokenizer: CLIPTokenizer,
148
+ unet: UNet2DConditionModel,
149
+ controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel]],
150
+ scheduler: Union[DDIMScheduler],
151
+ safety_checker: StableDiffusionSafetyChecker,
152
+ feature_extractor: CLIPImageProcessor,
153
+ image_encoder: CLIPVisionModelWithProjection = None,
154
+ requires_safety_checker: bool = True,
155
+ default_denoising_steps: Optional[int] = 1,
156
+ default_processing_resolution: Optional[int] = 768,
157
+ prompt="remove glass reflection",
158
+ empty_text_embedding=None,
159
+ t_start: Optional[int] = 401,
160
+ ):
161
+ super().__init__(
162
+ vae,
163
+ text_encoder,
164
+ tokenizer,
165
+ unet,
166
+ controlnet,
167
+ scheduler,
168
+ safety_checker,
169
+ feature_extractor,
170
+ image_encoder,
171
+ requires_safety_checker,
172
+ )
173
+
174
+ self.image_processor = MarigoldImageProcessor(vae_scale_factor=self.vae_scale_factor)
175
+ self.control_image_processor = MarigoldImageProcessor(vae_scale_factor=self.vae_scale_factor)
176
+ self.default_denoising_steps = default_denoising_steps
177
+ self.default_processing_resolution = default_processing_resolution
178
+ self.prompt = prompt
179
+ self.prompt_embeds = None
180
+ self.empty_text_embedding = empty_text_embedding
181
+ self.t_start = t_start # timestep used for the single denoising step
182
+
183
+ def check_inputs(
184
+ self,
185
+ image: PipelineImageInput,
186
+ num_inference_steps: int,
187
+ ensemble_size: int,
188
+ processing_resolution: int,
189
+ resample_method_input: str,
190
+ resample_method_output: str,
191
+ batch_size: int,
192
+ ensembling_kwargs: Optional[Dict[str, Any]],
193
+ latents: Optional[torch.Tensor],
194
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]],
195
+ output_type: str,
196
+ output_uncertainty: bool,
197
+ ) -> int:
198
+ if num_inference_steps is None:
199
+ raise ValueError("`num_inference_steps` is not specified and could not be resolved from the model config.")
200
+ if num_inference_steps < 1:
201
+ raise ValueError("`num_inference_steps` must be positive.")
202
+ if ensemble_size < 1:
203
+ raise ValueError("`ensemble_size` must be positive.")
204
+ if ensemble_size == 2:
205
+ logger.warning(
206
+ "`ensemble_size` == 2 results are similar to no ensembling (1); "
207
+ "consider increasing the value to at least 3."
208
+ )
209
+ if ensemble_size == 1 and output_uncertainty:
210
+ raise ValueError(
211
+ "Computing uncertainty by setting `output_uncertainty=True` also requires setting `ensemble_size` "
212
+ "greater than 1."
213
+ )
214
+ if processing_resolution is None:
215
+ raise ValueError(
216
+ "`processing_resolution` is not specified and could not be resolved from the model config."
217
+ )
218
+ if processing_resolution < 0:
219
+ raise ValueError(
220
+ "`processing_resolution` must be non-negative: 0 for native resolution, or any positive value for "
221
+ "downsampled processing."
222
+ )
223
+ if processing_resolution % self.vae_scale_factor != 0:
224
+ raise ValueError(f"`processing_resolution` must be a multiple of {self.vae_scale_factor}.")
225
+ if resample_method_input not in ("nearest", "nearest-exact", "bilinear", "bicubic", "area"):
226
+ raise ValueError(
227
+ "`resample_method_input` takes string values compatible with PIL library: "
228
+ "nearest, nearest-exact, bilinear, bicubic, area."
229
+ )
230
+ if resample_method_output not in ("nearest", "nearest-exact", "bilinear", "bicubic", "area"):
231
+ raise ValueError(
232
+ "`resample_method_output` takes string values compatible with PIL library: "
233
+ "nearest, nearest-exact, bilinear, bicubic, area."
234
+ )
235
+ if batch_size < 1:
236
+ raise ValueError("`batch_size` must be positive.")
237
+ if output_type not in ["pt", "np"]:
238
+ raise ValueError("`output_type` must be one of `pt` or `np`.")
239
+ if latents is not None and generator is not None:
240
+ raise ValueError("`latents` and `generator` cannot be used together.")
241
+ if ensembling_kwargs is not None:
242
+ if not isinstance(ensembling_kwargs, dict):
243
+ raise ValueError("`ensembling_kwargs` must be a dictionary.")
244
+ if "reduction" in ensembling_kwargs and ensembling_kwargs["reduction"] not in ("closest", "mean"):
245
+ raise ValueError("`ensembling_kwargs['reduction']` can be either `'closest'` or `'mean'`.")
246
+
247
+ # image checks
248
+ num_images = 0
249
+ W, H = None, None
250
+ if not isinstance(image, list):
251
+ image = [image]
252
+ for i, img in enumerate(image):
253
+ if isinstance(img, np.ndarray) or torch.is_tensor(img):
254
+ if img.ndim not in (2, 3, 4):
255
+ raise ValueError(f"`image[{i}]` has unsupported dimensions or shape: {img.shape}.")
256
+ H_i, W_i = img.shape[-2:]
257
+ N_i = 1
258
+ if img.ndim == 4:
259
+ N_i = img.shape[0]
260
+ elif isinstance(img, Image.Image):
261
+ W_i, H_i = img.size
262
+ N_i = 1
263
+ else:
264
+ raise ValueError(f"Unsupported `image[{i}]` type: {type(img)}.")
265
+ if W is None:
266
+ W, H = W_i, H_i
267
+ elif (W, H) != (W_i, H_i):
268
+ raise ValueError(
269
+ f"Input `image[{i}]` has incompatible dimensions {(W_i, H_i)} with the previous images {(W, H)}"
270
+ )
271
+ num_images += N_i
272
+
273
+ # latents checks
274
+ if latents is not None:
275
+ if not torch.is_tensor(latents):
276
+ raise ValueError("`latents` must be a torch.Tensor.")
277
+ if latents.dim() != 4:
278
+ raise ValueError(f"`latents` has unsupported dimensions or shape: {latents.shape}.")
279
+
280
+ if processing_resolution > 0:
281
+ max_orig = max(H, W)
282
+ new_H = H * processing_resolution // max_orig
283
+ new_W = W * processing_resolution // max_orig
284
+ if new_H == 0 or new_W == 0:
285
+ raise ValueError(f"Extreme aspect ratio of the input image: [{W} x {H}]")
286
+ W, H = new_W, new_H
287
+ w = (W + self.vae_scale_factor - 1) // self.vae_scale_factor
288
+ h = (H + self.vae_scale_factor - 1) // self.vae_scale_factor
289
+ shape_expected = (num_images * ensemble_size, self.vae.config.latent_channels, h, w)
290
+
291
+ if latents.shape != shape_expected:
292
+ raise ValueError(f"`latents` has unexpected shape={latents.shape} expected={shape_expected}.")
293
+
294
+ # generator checks
295
+ if generator is not None:
296
+ if isinstance(generator, list):
297
+ if len(generator) != num_images * ensemble_size:
298
+ raise ValueError(
299
+ "The number of generators must match the total number of ensemble members for all input images."
300
+ )
301
+ if not all(g.device.type == generator[0].device.type for g in generator):
302
+ raise ValueError("`generator` device placement is not consistent in the list.")
303
+ elif not isinstance(generator, torch.Generator):
304
+ raise ValueError(f"Unsupported generator type: {type(generator)}.")
305
+
306
+ return num_images
307
+
308
+ def progress_bar(self, iterable=None, total=None, desc=None, leave=True):
309
+ if not hasattr(self, "_progress_bar_config"):
310
+ self._progress_bar_config = {}
311
+ elif not isinstance(self._progress_bar_config, dict):
312
+ raise ValueError(
313
+ f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}."
314
+ )
315
+
316
+ progress_bar_config = dict(**self._progress_bar_config)
317
+ progress_bar_config["desc"] = progress_bar_config.get("desc", desc)
318
+ progress_bar_config["leave"] = progress_bar_config.get("leave", leave)
319
+ if iterable is not None:
320
+ return tqdm(iterable, **progress_bar_config)
321
+ elif total is not None:
322
+ return tqdm(total=total, **progress_bar_config)
323
+ else:
324
+ raise ValueError("Either `total` or `iterable` has to be defined.")
325
+
326
+ @torch.no_grad()
327
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
328
+ def __call__(
329
+ self,
330
+ image: PipelineImageInput,
331
+ prompt: Union[str, List[str]] = None,
332
+ negative_prompt: Optional[Union[str, List[str]]] = None,
333
+ num_inference_steps: Optional[int] = None,
334
+ ensemble_size: int = 1,
335
+ processing_resolution: Optional[int] = None,
336
+ match_input_resolution: bool = True,
337
+ resample_method_input: str = "bilinear",
338
+ resample_method_output: str = "bilinear",
339
+ batch_size: int = 1,
340
+ ensembling_kwargs: Optional[Dict[str, Any]] = None,
341
+ latents: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
342
+ prompt_embeds: Optional[torch.Tensor] = None,
343
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
344
+ num_images_per_prompt: Optional[int] = 1,
345
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
346
+ controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
347
+ output_type: str = "np",
348
+ output_uncertainty: bool = False,
349
+ output_latent: bool = False,
350
+ skip_preprocess: bool = False,
351
+ return_dict: bool = True,
352
+ **kwargs,
353
+ ):
354
+ """
355
+ Function invoked when calling the pipeline.
356
+
357
+ Args:
358
+ image (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`),
359
+ `List[torch.Tensor]`: An input image or images used as an input for the normals estimation task. For
360
+ arrays and tensors, the expected value range is between `[0, 1]`. Passing a batch of images is possible
361
+ by providing a four-dimensional array or a tensor. Additionally, a list of images of two- or
362
+ three-dimensional arrays or tensors can be passed. In the latter case, all list elements must have the
363
+ same width and height.
364
+ num_inference_steps (`int`, *optional*, defaults to `None`):
365
+ Number of denoising diffusion steps during inference. The default value `None` results in automatic
366
+ selection. The number of steps should be at least 10 with the full Marigold models, and between 1 and 4
367
+ for Marigold-LCM models.
368
+ ensemble_size (`int`, defaults to `1`):
369
+ Number of ensemble predictions. Recommended values are 5 and higher for better precision, or 1 for
370
+ faster inference.
371
+ processing_resolution (`int`, *optional*, defaults to `None`):
372
+ Effective processing resolution. When set to `0`, matches the larger input image dimension. This
373
+ produces crisper predictions, but may also lead to the overall loss of global context. The default
374
+ value `None` resolves to the optimal value from the model config.
375
+ match_input_resolution (`bool`, *optional*, defaults to `True`):
376
+ When enabled, the output prediction is resized to match the input dimensions. When disabled, the longer
377
+ side of the output will equal `processing_resolution`.
378
+ resample_method_input (`str`, *optional*, defaults to `"bilinear"`):
379
+ Resampling method used to resize input images to `processing_resolution`. The accepted values are:
380
+ `"nearest"`, `"nearest-exact"`, `"bilinear"`, `"bicubic"`, or `"area"`.
381
+ resample_method_output (`str`, *optional*, defaults to `"bilinear"`):
382
+ Resampling method used to resize output predictions to match the input resolution. The accepted values
383
+ are `"nearest"`, `"nearest-exact"`, `"bilinear"`, `"bicubic"`, or `"area"`.
384
+ batch_size (`int`, *optional*, defaults to `1`):
385
+ Batch size; only matters when setting `ensemble_size` or passing a tensor of images.
386
+ ensembling_kwargs (`dict`, *optional*, defaults to `None`)
387
+ Extra dictionary with arguments for precise ensembling control. The following options are available:
388
+ - reduction (`str`, *optional*, defaults to `"closest"`): Defines the ensembling function applied in
389
+ every pixel location, can be either `"closest"` or `"mean"`.
390
+ latents (`torch.Tensor`, *optional*, defaults to `None`):
391
+ Latent noise tensors to replace the random initialization. These can be taken from the previous
392
+ function call's output.
393
+ generator (`torch.Generator`, or `List[torch.Generator]`, *optional*, defaults to `None`):
394
+ Random number generator object to ensure reproducibility.
395
+ output_type (`str`, *optional*, defaults to `"np"`):
396
+ Preferred format of the output's `prediction` and the optional `uncertainty` fields. The accepted
397
+ values are: `"np"` (numpy array) or `"pt"` (torch tensor).
398
+ output_uncertainty (`bool`, *optional*, defaults to `False`):
399
+ When enabled, the output's `uncertainty` field contains the predictive uncertainty map, provided that
400
+ the `ensemble_size` argument is set to a value above 2.
401
+ output_latent (`bool`, *optional*, defaults to `False`):
402
+ When enabled, the output's `latent` field contains the latent codes corresponding to the predictions
403
+ within the ensemble. These codes can be saved, modified, and used for subsequent calls with the
404
+ `latents` argument.
405
+ return_dict (`bool`, *optional*, defaults to `True`):
406
+ Whether or not to return a [`~pipelines.marigold.MarigoldDepthOutput`] instead of a plain tuple.
407
+
408
+ Examples:
409
+
410
+ Returns:
411
+ [`~pipelines.marigold.MarigoldNormalsOutput`] or `tuple`:
412
+ If `return_dict` is `True`, [`~pipelines.marigold.MarigoldNormalsOutput`] is returned, otherwise a
413
+ `tuple` is returned where the first element is the prediction, the second element is the uncertainty
414
+ (or `None`), and the third is the latent (or `None`).
415
+ """
416
+
417
+ # 0. Resolving variables.
418
+ device = self._execution_device
419
+ dtype = self.dtype
420
+
421
+ # Model-specific optimal default values leading to fast and reasonable results.
422
+ if num_inference_steps is None:
423
+ num_inference_steps = self.default_denoising_steps
424
+ if processing_resolution is None:
425
+ processing_resolution = self.default_processing_resolution
426
+
427
+ # 1. Check inputs.
428
+ num_images = self.check_inputs(
429
+ image,
430
+ num_inference_steps,
431
+ ensemble_size,
432
+ processing_resolution,
433
+ resample_method_input,
434
+ resample_method_output,
435
+ batch_size,
436
+ ensembling_kwargs,
437
+ latents,
438
+ generator,
439
+ output_type,
440
+ output_uncertainty,
441
+ )
442
+
443
+
444
+ # 2. Prepare empty text conditioning.
445
+ # Model invocation: self.tokenizer, self.text_encoder.
446
+ if self.empty_text_embedding is None:
447
+ prompt = ""
448
+ text_inputs = self.tokenizer(
449
+ prompt,
450
+ padding="do_not_pad",
451
+ max_length=self.tokenizer.model_max_length,
452
+ truncation=True,
453
+ return_tensors="pt",
454
+ )
455
+ text_input_ids = text_inputs.input_ids.to(device)
456
+ self.empty_text_embedding = self.text_encoder(text_input_ids)[0] # [1,2,1024]
457
+
458
+
459
+
460
+ # 3. prepare prompt
461
+ if self.prompt_embeds is None:
462
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
463
+ self.prompt,
464
+ device,
465
+ num_images_per_prompt,
466
+ False,
467
+ negative_prompt,
468
+ prompt_embeds=prompt_embeds,
469
+ negative_prompt_embeds=None,
470
+ lora_scale=None,
471
+ clip_skip=None,
472
+ )
473
+ self.prompt_embeds = prompt_embeds
474
+ self.negative_prompt_embeds = negative_prompt_embeds
475
+
476
+
477
+
478
+ # 4. Preprocess input images. This function loads input image or images of compatible dimensions `(H, W)`,
479
+ # optionally downsamples them to the `processing_resolution` `(PH, PW)`, where
480
+ # `max(PH, PW) == processing_resolution`, and pads the dimensions to `(PPH, PPW)` such that these values are
481
+ # divisible by the latent space downscaling factor (typically 8 in Stable Diffusion). The default value `None`
482
+ # of `processing_resolution` resolves to the optimal value from the model config. It is a recommended mode of
483
+ # operation and leads to the most reasonable results. Using the native image resolution or any other processing
484
+ # resolution can lead to loss of either fine details or global context in the output predictions.
485
+ if not skip_preprocess:
486
+ image, padding, original_resolution = self.image_processor.preprocess(
487
+ image, processing_resolution, resample_method_input, device, dtype
488
+ ) # [N,3,PPH,PPW]
489
+ else:
490
+ padding = (0, 0)
491
+ original_resolution = image.shape[2:]
492
+ # 5. Encode input image into latent space. At this step, each of the `N` input images is represented with `E`
493
+ # ensemble members. Each ensemble member is an independent diffused prediction, just initialized independently.
494
+ # Latents of each such predictions across all input images and all ensemble members are represented in the
495
+ # `pred_latent` variable. The variable `image_latent` is of the same shape: it contains each input image encoded
496
+ # into latent space and replicated `E` times. The latents can be either generated (see `generator` to ensure
497
+ # reproducibility), or passed explicitly via the `latents` argument. The latter can be set outside the pipeline
498
+ # code. For example, in the Marigold-LCM video processing demo, the latents initialization of a frame is taken
499
+ # as a convex combination of the latents output of the pipeline for the previous frame and a newly-sampled
500
+ # noise. This behavior can be achieved by setting the `output_latent` argument to `True`. The latent space
501
+ # dimensions are `(h, w)`. Encoding into latent space happens in batches of size `batch_size`.
502
+ # Model invocation: self.vae.encoder.
503
+ image_latent, pred_latent = self.prepare_latents(
504
+ image, latents, generator, ensemble_size, batch_size
505
+ ) # [N*E,4,h,w], [N*E,4,h,w]
506
+
507
+ gaus_noise = pred_latent.detach().clone()
508
+ del image
509
+
510
+
511
+ # 6. obtain control_output
512
+
513
+ cond_scale = controlnet_conditioning_scale
514
+ down_block_res_samples, mid_block_res_sample = self.controlnet(
515
+ image_latent.detach(),
516
+ self.t_start,
517
+ encoder_hidden_states=self.prompt_embeds,
518
+ conditioning_scale=cond_scale,
519
+ guess_mode=False,
520
+ return_dict=False,
521
+ )
522
+
523
+ # 7. Onestep sampling
524
+ latent_x_t = self.unet(
525
+ pred_latent,
526
+ self.t_start,
527
+ encoder_hidden_states=self.prompt_embeds,
528
+ down_block_additional_residuals=down_block_res_samples,
529
+ mid_block_additional_residual=mid_block_res_sample,
530
+ return_dict=False,
531
+ )[0]
532
+
533
+
534
+ del (
535
+ pred_latent,
536
+ image_latent,
537
+ )
538
+
539
+ # decoder
540
+ prediction = self.decode_prediction(latent_x_t)
541
+
542
+ prediction = self.image_processor.unpad_image(prediction, padding) # [N*E,3,PH,PW]
543
+
544
+ prediction = self.image_processor.resize_antialias(
545
+ prediction, original_resolution, resample_method_output, is_aa=False
546
+ ) # [N,3,H,W]
547
+
548
+ if output_type == "np":
549
+ prediction = self.image_processor.pt_to_numpy(prediction) # [N,H,W,3]
550
+
551
+ # 11. Offload all models
552
+ self.maybe_free_model_hooks()
553
+
554
+ return DAIOutput(
555
+ prediction=prediction,
556
+ latent=latent_x_t,
557
+ gaus_noise=gaus_noise,
558
+ )
559
+
560
+ # Copied from diffusers.pipelines.marigold.pipeline_marigold_depth.MarigoldDepthPipeline.prepare_latents
561
+ def prepare_latents(
562
+ self,
563
+ image: torch.Tensor,
564
+ latents: Optional[torch.Tensor],
565
+ generator: Optional[torch.Generator],
566
+ ensemble_size: int,
567
+ batch_size: int,
568
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
569
+ def retrieve_latents(encoder_output):
570
+ if hasattr(encoder_output, "latent_dist"):
571
+ return encoder_output.latent_dist.mode()
572
+ elif hasattr(encoder_output, "latents"):
573
+ return encoder_output.latents
574
+ else:
575
+ raise AttributeError("Could not access latents of provided encoder_output")
576
+
577
+
578
+
579
+ image_latent = torch.cat(
580
+ [
581
+ retrieve_latents(self.vae.encode(image[i : i + batch_size]))
582
+ for i in range(0, image.shape[0], batch_size)
583
+ ],
584
+ dim=0,
585
+ ) # [N,4,h,w]
586
+ image_latent = image_latent * self.vae.config.scaling_factor
587
+ image_latent = image_latent.repeat_interleave(ensemble_size, dim=0) # [N*E,4,h,w]
588
+
589
+ pred_latent = latents
+ if pred_latent is None:
+ # the one-step prediction starts from an all-zero latent rather than Gaussian noise
+ pred_latent = torch.zeros_like(image_latent) # [N*E,4,h,w]
597
+
598
+ return image_latent, pred_latent
599
+
600
+ def decode_prediction(self, pred_latent: torch.Tensor) -> torch.Tensor:
601
+ if pred_latent.dim() != 4 or pred_latent.shape[1] != self.vae.config.latent_channels:
602
+ raise ValueError(
603
+ f"Expecting 4D tensor of shape [B,{self.vae.config.latent_channels},H,W]; got {pred_latent.shape}."
604
+ )
605
+
606
+ prediction = self.vae.decode(pred_latent / self.vae.config.scaling_factor, return_dict=False)[0] # [B,3,H,W]
607
+
608
+ return prediction # [B,3,H,W]
609
+
610
+ @staticmethod
611
+ def normalize_normals(normals: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
612
+ if normals.dim() != 4 or normals.shape[1] != 3:
613
+ raise ValueError(f"Expecting 4D tensor of shape [B,3,H,W]; got {normals.shape}.")
614
+
615
+ norm = torch.norm(normals, dim=1, keepdim=True)
616
+ normals /= norm.clamp(min=eps)
617
+
618
+ return normals
619
+
620
+ @staticmethod
621
+ def ensemble_normals(
622
+ normals: torch.Tensor, output_uncertainty: bool, reduction: str = "closest"
623
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
624
+ """
625
+ Ensembles the normals maps represented by the `normals` tensor with expected shape `(B, 3, H, W)`, where B is
626
+ the number of ensemble members for a given prediction of size `(H x W)`.
627
+
628
+ Args:
629
+ normals (`torch.Tensor`):
630
+ Input ensemble normals maps.
631
+ output_uncertainty (`bool`, *optional*, defaults to `False`):
632
+ Whether to output uncertainty map.
633
+ reduction (`str`, *optional*, defaults to `"closest"`):
634
+ Reduction method used to ensemble aligned predictions. The accepted values are: `"closest"` and
635
+ `"mean"`.
636
+
637
+ Returns:
638
+ A tensor of aligned and ensembled normals maps with shape `(1, 3, H, W)` and optionally a tensor of
639
+ uncertainties of shape `(1, 1, H, W)`.
640
+ """
641
+ if normals.dim() != 4 or normals.shape[1] != 3:
642
+ raise ValueError(f"Expecting 4D tensor of shape [B,3,H,W]; got {normals.shape}.")
643
+ if reduction not in ("closest", "mean"):
644
+ raise ValueError(f"Unrecognized reduction method: {reduction}.")
645
+
646
+ mean_normals = normals.mean(dim=0, keepdim=True) # [1,3,H,W]
647
+ mean_normals = OneStepPipeline.normalize_normals(mean_normals) # [1,3,H,W]
648
+
649
+ sim_cos = (mean_normals * normals).sum(dim=1, keepdim=True) # [E,1,H,W]
650
+ sim_cos = sim_cos.clamp(-1, 1) # required to avoid NaN in uncertainty with fp16
651
+
652
+ uncertainty = None
653
+ if output_uncertainty:
654
+ uncertainty = sim_cos.arccos() # [E,1,H,W]
655
+ uncertainty = uncertainty.mean(dim=0, keepdim=True) / np.pi # [1,1,H,W]
656
+
657
+ if reduction == "mean":
658
+ return mean_normals, uncertainty # [1,3,H,W], [1,1,H,W]
659
+
660
+ closest_indices = sim_cos.argmax(dim=0, keepdim=True) # [1,1,H,W]
661
+ closest_indices = closest_indices.repeat(1, 3, 1, 1) # [1,3,H,W]
662
+ closest_normals = torch.gather(normals, 0, closest_indices) # [1,3,H,W]
663
+
664
+ return closest_normals, uncertainty # [1,3,H,W], [1,1,H,W]
665
+
666
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
667
+ def retrieve_timesteps(
668
+ scheduler,
669
+ num_inference_steps: Optional[int] = None,
670
+ device: Optional[Union[str, torch.device]] = None,
671
+ timesteps: Optional[List[int]] = None,
672
+ sigmas: Optional[List[float]] = None,
673
+ **kwargs,
674
+ ):
675
+ """
676
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
677
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
678
+
679
+ Args:
680
+ scheduler (`SchedulerMixin`):
681
+ The scheduler to get timesteps from.
682
+ num_inference_steps (`int`):
683
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
684
+ must be `None`.
685
+ device (`str` or `torch.device`, *optional*):
686
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
687
+ timesteps (`List[int]`, *optional*):
688
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
689
+ `num_inference_steps` and `sigmas` must be `None`.
690
+ sigmas (`List[float]`, *optional*):
691
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
692
+ `num_inference_steps` and `timesteps` must be `None`.
693
+
694
+ Returns:
695
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
696
+ second element is the number of inference steps.
697
+ """
698
+ if timesteps is not None and sigmas is not None:
699
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
700
+ if timesteps is not None:
701
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
702
+ if not accepts_timesteps:
703
+ raise ValueError(
704
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
705
+ f" timestep schedules. Please check whether you are using the correct scheduler."
706
+ )
707
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
708
+ timesteps = scheduler.timesteps
709
+ num_inference_steps = len(timesteps)
710
+ elif sigmas is not None:
711
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
712
+ if not accept_sigmas:
713
+ raise ValueError(
714
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
715
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
716
+ )
717
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
718
+ timesteps = scheduler.timesteps
719
+ num_inference_steps = len(timesteps)
720
+ else:
721
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
722
+ timesteps = scheduler.timesteps
723
+ return timesteps, num_inference_steps
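To make the call signature documented above concrete, here is a hedged usage sketch. It assumes `pipe` is an already assembled `OneStepPipeline` whose components (VAE, text encoder, tokenizer, UNet, ControlNet) were loaded in the same way `app.py` below loads them for `DAIPipeline`; the input path is illustrative.

```python
from PIL import Image

image = Image.open("files/image/example.png").convert("RGB")  # illustrative path

out = pipe(image=image, processing_resolution=768, output_type="np")

prediction = out.prediction  # [N, H, W, 3] array with values in [-1, 1]
latent = out.latent          # latent features, reusable via the `latents` argument
gaus_noise = out.gaus_noise  # initial prediction latent recorded by the pipeline
```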
app.py CHANGED
@@ -1,7 +1,309 @@
1
  import gradio as gr
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
1
+ # Copyright 2024 Anton Obukhov, ETH Zurich. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # --------------------------------------------------------------------------
15
+ # If you find this code useful, we kindly ask you to cite our paper in your work.
16
+ # Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
17
+ # More information about the method can be found at https://marigoldmonodepth.github.io
18
+ # --------------------------------------------------------------------------
19
+ from __future__ import annotations
20
+
21
+ import functools
22
+ import os
23
+ import tempfile
24
+
25
  import gradio as gr
26
+ import imageio as imageio
27
+ import numpy as np
28
+ import spaces
29
+ import torch as torch
30
+ torch.backends.cuda.matmul.allow_tf32 = True
31
+ from PIL import Image
32
+ from gradio_imageslider import ImageSlider
33
+ from tqdm import tqdm
34
+
35
+ from pathlib import Path
36
+ import gradio
37
+ from gradio.utils import get_cache_folder
38
+ from DAI.pipeline_all import DAIPipeline
39
+
40
+ from diffusers import (
41
+ AutoencoderKL,
42
+ UNet2DConditionModel,
43
+ )
44
+
45
+ from transformers import CLIPTextModel, AutoTokenizer
46
+
47
+ from DAI.controlnetvae import ControlNetVAEModel
48
+
49
+ from DAI.decoder import CustomAutoencoderKL
50
+
51
+
52
+ class Examples(gradio.helpers.Examples):
53
+ def __init__(self, *args, directory_name=None, **kwargs):
54
+ super().__init__(*args, **kwargs, _initiated_directly=False)
55
+ if directory_name is not None:
56
+ self.cached_folder = get_cache_folder() / directory_name
57
+ self.cached_file = Path(self.cached_folder) / "log.csv"
58
+ self.create()
59
+
60
+
61
+ default_seed = 2024
62
+ default_batch_size = 1
63
+
64
+ def process_image_check(path_input):
65
+ if path_input is None:
66
+ raise gr.Error(
67
+ "Missing image in the first pane: upload a file or use one from the gallery below."
68
+ )
69
+
70
+ def resize_image(input_image, resolution):
71
+ # Ensure input_image is a PIL Image object
72
+ if not isinstance(input_image, Image.Image):
73
+ raise ValueError("input_image should be a PIL Image object")
74
+
75
+ # Convert image to numpy array
76
+ input_image_np = np.asarray(input_image)
77
+
78
+ # Get image dimensions
79
+ H, W, C = input_image_np.shape
80
+ H = float(H)
81
+ W = float(W)
82
+
83
+ # Calculate the scaling factor
84
+ k = float(resolution) / min(H, W)
85
+
86
+ # Determine new dimensions
87
+ H *= k
88
+ W *= k
89
+ H = int(np.round(H / 64.0)) * 64
90
+ W = int(np.round(W / 64.0)) * 64
91
+
92
+ # Resize the image using PIL's resize method
93
+ img = input_image.resize((W, H), Image.Resampling.LANCZOS)
94
+
95
+ return img
96
+
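A short sketch of how `resize_image` above behaves: the shorter side is scaled toward the requested resolution and both sides are snapped to multiples of 64, which keeps the dimensions compatible with the 8x latent downsampling. The input path is illustrative.

```python
from PIL import Image

img = Image.open("files/image/example.png").convert("RGB")  # illustrative path
img_768 = resize_image(img, 768)

print(img_768.size)                             # both sides are multiples of 64
print(img_768.width % 64, img_768.height % 64)  # 0 0
```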
97
+ def process_image(
98
+ pipe,
99
+ vae_2,
100
+ path_input,
101
+ ):
102
+ name_base, name_ext = os.path.splitext(os.path.basename(path_input))
103
+ print(f"Processing image {name_base}{name_ext}")
104
+
105
+ path_output_dir = tempfile.mkdtemp()
106
+ path_out_png = os.path.join(path_output_dir, f"{name_base}_delight.png")
107
+ input_image = Image.open(path_input)
108
+ resolution = None
109
+
110
+ pipe_out = pipe(
111
+ image=input_image,
112
+ prompt="remove glass reflection",
113
+ vae_2=vae_2,
114
+ processing_resolution=resolution,
115
+ )
116
+
117
+ processed_frame = (pipe_out.prediction.clip(-1, 1) + 1) / 2
118
+ processed_frame = (processed_frame[0] * 255).astype(np.uint8)
119
+ processed_frame = Image.fromarray(processed_frame)
120
+ processed_frame.save(path_out_png)
121
+ yield [input_image, path_out_png]
122
+
123
+ def run_demo_server(pipe, vae_2):
124
+ process_pipe_image = spaces.GPU(functools.partial(process_image, pipe, vae_2))
125
+
126
+ gradio_theme = gr.themes.Default()
127
+
128
+ with gr.Blocks(
129
+ theme=gradio_theme,
130
+ title="Dereflection Any Image",
131
+ css="""
132
+ #download {
133
+ height: 118px;
134
+ }
135
+ .slider .inner {
136
+ width: 5px;
137
+ background: #FFF;
138
+ }
139
+ .viewport {
140
+ aspect-ratio: 4/3;
141
+ }
142
+ .tabs button.selected {
143
+ font-size: 20px !important;
144
+ color: crimson !important;
145
+ }
146
+ h1 {
147
+ text-align: center;
148
+ display: block;
149
+ }
150
+ h2 {
151
+ text-align: center;
152
+ display: block;
153
+ }
154
+ h3 {
155
+ text-align: center;
156
+ display: block;
157
+ }
158
+ .md_feedback li {
159
+ margin-bottom: 0px !important;
160
+ }
161
+ """,
162
+ head="""
163
+ <script async src="https://www.googletagmanager.com/gtag/js?id=G-1FWSVCGZTG"></script>
164
+ <script>
165
+ window.dataLayer = window.dataLayer || [];
166
+ function gtag() {dataLayer.push(arguments);}
167
+ gtag('js', new Date());
168
+ gtag('config', 'G-1FWSVCGZTG');
169
+ </script>
170
+ """,
171
+ ) as demo:
172
+ gr.Markdown(
173
+ """
174
+ # Dereflection Any Image
175
+ <p align="center">
176
+ """
177
+ )
178
+
179
+ with gr.Tabs(elem_classes=["tabs"]):
180
+ with gr.Tab("Image"):
181
+ with gr.Row():
182
+ with gr.Column():
183
+ image_input = gr.Image(
184
+ label="Input Image",
185
+ type="filepath",
186
+ )
187
+ with gr.Row():
188
+ image_submit_btn = gr.Button(
189
+ value="remove reflection", variant="primary"
190
+ )
191
+ image_reset_btn = gr.Button(value="Reset")
192
+ with gr.Column():
193
+ image_output_slider = ImageSlider(
194
+ label="outputs",
195
+ type="filepath",
196
+ show_download_button=True,
197
+ show_share_button=True,
198
+ interactive=False,
199
+ elem_classes="slider",
200
+ # position=0.25,
201
+ )
202
+
203
+ Examples(
204
+ fn=process_pipe_image,
205
+ examples=sorted([
206
+ os.path.join("files", "image", name)
207
+ for name in os.listdir(os.path.join("files", "image"))
208
+ ]),
209
+ inputs=[image_input],
210
+ outputs=[image_output_slider],
211
+ cache_examples=False,
212
+ directory_name="examples_image",
213
+ )
214
+
215
+
216
+ ### Image tab
217
+ image_submit_btn.click(
218
+ fn=process_image_check,
219
+ inputs=image_input,
220
+ outputs=None,
221
+ preprocess=False,
222
+ queue=False,
223
+ ).success(
224
+ fn=process_pipe_image,
225
+ inputs=[
226
+ image_input,
227
+ ],
228
+ outputs=[image_output_slider],
229
+ concurrency_limit=1,
230
+ )
231
+
232
+ image_reset_btn.click(
233
+ fn=lambda: (
234
+ None,
235
+ None,
236
+ None,
237
+ ),
238
+ inputs=[],
239
+ outputs=[
240
+ image_input,
241
+ image_output_slider,
242
+ ],
243
+ queue=False,
244
+ )
245
+
246
+
247
+ ### Server launch
248
+
249
+ demo.queue(
250
+ api_open=False,
251
+ ).launch(
252
+ server_name="0.0.0.0",
253
+ server_port=7860,
254
+ )
255
+
256
+
257
+ def main():
258
+ os.system("pip freeze")
259
+
260
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
261
+
262
+ weight_dtype = torch.float32
263
+ model_dir = "./weights"
264
+ pretrained_model_name_or_path = "JichenHu/dereflection-any-image-v0"
265
+ pretrained_model_name_or_path2 = "stabilityai/stable-diffusion-2-1"
266
+ revision = None
267
+ variant = None
268
+ # Load the model
269
+ controlnet = ControlNetVAEModel.from_pretrained(pretrained_model_name_or_path, subfolder="controlnet", torch_dtype=weight_dtype).to(device)
270
+ unet = UNet2DConditionModel.from_pretrained(pretrained_model_name_or_path, subfolder="unet", torch_dtype=weight_dtype).to(device)
271
+ vae_2 = CustomAutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae_2", torch_dtype=weight_dtype).to(device)
272
+
273
+ # Load other components of the pipeline
274
+ vae = AutoencoderKL.from_pretrained(
275
+ pretrained_model_name_or_path2, subfolder="vae", revision=revision, variant=variant
276
+ ).to(device)
277
+
278
+ text_encoder = CLIPTextModel.from_pretrained(
279
+ pretrained_model_name_or_path2, subfolder="text_encoder", revision=revision, variant=variant
280
+ ).to(device)
281
+ tokenizer = AutoTokenizer.from_pretrained(
282
+ pretrained_model_name_or_path2,
283
+ subfolder="tokenizer",
284
+ revision=revision,
285
+ use_fast=False,
286
+ )
287
+ pipe = DAIPipeline(
288
+ vae=vae,
289
+ text_encoder=text_encoder,
290
+ tokenizer=tokenizer,
291
+ unet=unet,
292
+ controlnet=controlnet,
293
+ safety_checker=None,
294
+ scheduler=None,
295
+ feature_extractor=None,
296
+ t_start=0,
297
+ ).to(device)
298
+
299
+ try:
300
+ import xformers
301
+ pipe.enable_xformers_memory_efficient_attention()
302
+ except Exception:
303
+ pass # run without xformers
304
+
305
+ run_demo_server(pipe, vae_2)
306
 
 
 
307
 
308
+ if __name__ == "__main__":
309
+ main()
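For completeness, a headless sketch of the same inference path the Gradio handler wires up, assuming `pipe` (a `DAIPipeline`) and `vae_2` were constructed exactly as in `main()` above; input and output paths are illustrative.

```python
import numpy as np
from PIL import Image

image = Image.open("input.png").convert("RGB")  # illustrative path

out = pipe(
    image=image,
    prompt="remove glass reflection",
    vae_2=vae_2,
    processing_resolution=None,
)

# Predictions come back in [-1, 1]; map them to uint8 exactly as process_image does.
pred = (out.prediction.clip(-1, 1) + 1) / 2
Image.fromarray((pred[0] * 255).astype(np.uint8)).save("output_dereflected.png")
```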
requirements.txt ADDED
@@ -0,0 +1,8 @@
1
+ diffusers
2
+ gradio
3
+ gradio_imageslider
4
+ torch
5
+ transformers
6
+ pillow
7
+ numpy
8
+ xformers
utils/image_utils.py ADDED
@@ -0,0 +1,21 @@
1
+ #
2
+ # Copyright (C) 2023, Inria
3
+ # GRAPHDECO research group, https://team.inria.fr/graphdeco
4
+ # All rights reserved.
5
+ #
6
+ # This software is free for non-commercial, research and evaluation use
7
+ # under the terms of the LICENSE.md file.
8
+ #
9
+ # For inquiries contact [email protected]
10
+ #
11
+
12
+ import torch
13
+
14
+ def mse(img1, img2):
15
+ return (((img1 - img2)) ** 2).view(img1.shape[0], -1).mean(1, keepdim=True)
16
+
17
+ def psnr(img1, img2):
+ return 20 * torch.log10(1.0 / torch.sqrt(mse(img1, img2)))
20
+
21
+ # torchmetrics
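A quick sanity check for the metrics above, assuming the module is importable as `utils.image_utils`; both helpers expect batched tensors with values in [0, 1].

```python
import torch
from utils.image_utils import mse, psnr  # path as laid out in this repo

img = torch.rand(1, 3, 64, 64)
noisy = (img + 0.05 * torch.randn_like(img)).clamp(0, 1)

print(mse(img, noisy))   # [1, 1] tensor, roughly 0.0025 for 0.05-sigma noise
print(psnr(img, noisy))  # roughly 26 dB
```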
utils/loss_utils.py ADDED
@@ -0,0 +1,64 @@
1
+ #
2
+ # Copyright (C) 2023, Inria
3
+ # GRAPHDECO research group, https://team.inria.fr/graphdeco
4
+ # All rights reserved.
5
+ #
6
+ # This software is free for non-commercial, research and evaluation use
7
+ # under the terms of the LICENSE.md file.
8
+ #
9
+ # For inquiries contact [email protected]
10
+ #
11
+
12
+ import torch
13
+ import torch.nn.functional as F
14
+ from torch.autograd import Variable
15
+ from math import exp
16
+
17
+ def l1_loss(network_output, gt):
18
+ return torch.abs((network_output - gt)).mean()
19
+
20
+ def l2_loss(network_output, gt):
21
+ return ((network_output - gt) ** 2).mean()
22
+
23
+ def gaussian(window_size, sigma):
24
+ gauss = torch.Tensor([exp(-(x - window_size // 2) ** 2 / float(2 * sigma ** 2)) for x in range(window_size)])
25
+ return gauss / gauss.sum()
26
+
27
+ def create_window(window_size, channel):
28
+ _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
29
+ _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0)
30
+ window = Variable(_2D_window.expand(channel, 1, window_size, window_size).contiguous())
31
+ return window
32
+
33
+ def ssim(img1, img2, window_size=11, size_average=True):
34
+ channel = img1.size(-3)
35
+ window = create_window(window_size, channel)
36
+
37
+ if img1.is_cuda:
38
+ window = window.cuda(img1.get_device())
39
+ window = window.type_as(img1)
40
+
41
+ return _ssim(img1, img2, window, window_size, channel, size_average)
42
+
43
+ def _ssim(img1, img2, window, window_size, channel, size_average=True):
44
+ mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
45
+ mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)
46
+
47
+ mu1_sq = mu1.pow(2)
48
+ mu2_sq = mu2.pow(2)
49
+ mu1_mu2 = mu1 * mu2
50
+
51
+ sigma1_sq = F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
52
+ sigma2_sq = F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
53
+ sigma12 = F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2
54
+
55
+ C1 = 0.01 ** 2
56
+ C2 = 0.03 ** 2
57
+
58
+ ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))
59
+
60
+ if size_average:
61
+ return ssim_map.mean()
62
+ else:
63
+ return ssim_map.mean(1).mean(1).mean(1)
64
+
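Finally, a minimal sketch exercising the losses above, assuming the module is importable as `utils.loss_utils`; SSIM of an image with itself is 1 and decreases as the inputs diverge. Shapes follow the `[B, C, H, W]` convention used throughout.

```python
import torch
from utils.loss_utils import l1_loss, ssim  # path as laid out in this repo

a = torch.rand(1, 3, 64, 64)
b = (a + 0.1 * torch.randn_like(a)).clamp(0, 1)

print(l1_loss(a, b).item())  # mean absolute difference
print(ssim(a, a).item())     # ~1.0
print(ssim(a, b).item())     # below 1.0
```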