MultiMatrix committed on
Commit c46568a · verified · 1 Parent(s): 5d60839

Upload 6 files

utils/common.py ADDED
@@ -0,0 +1,159 @@
1
+ from typing import Mapping, Any, List, Tuple, Callable
2
+ import importlib
3
+ import os
4
+ from urllib.parse import urlparse
5
+
6
+ import torch
7
+ from torch import Tensor
8
+ from torch.nn import functional as F
9
+ import numpy as np
10
+
11
+ from torch.hub import download_url_to_file, get_dir
12
+
13
+
14
+ def get_obj_from_str(string: str, reload: bool=False) -> Any:
15
+ module, cls = string.rsplit(".", 1)
16
+ if reload:
17
+ module_imp = importlib.import_module(module)
18
+ importlib.reload(module_imp)
19
+ return getattr(importlib.import_module(module, package=None), cls)
20
+
21
+
22
+ def instantiate_from_config(config: Mapping[str, Any]) -> Any:
23
+ if "target" not in config:
24
+ raise KeyError("Expected key `target` to instantiate.")
25
+ return get_obj_from_str(config["target"])(**config.get("params", dict()))
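A quick usage sketch of the two loaders above; the config literal is illustrative, and any importable class with matching constructor kwargs works the same way:

from utils.common import instantiate_from_config

# `target` is a dotted import path; `params` are passed as constructor kwargs.
conv = instantiate_from_config({
    "target": "torch.nn.Conv2d",
    "params": {"in_channels": 3, "out_channels": 16, "kernel_size": 3},
})
print(type(conv).__name__)  # Conv2d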
26
+
27
+
28
+ def wavelet_blur(image: Tensor, radius: int):
29
+ """
30
+ Apply wavelet blur to the input tensor.
31
+ """
32
+ # input shape: (1, 3, H, W)
33
+ # convolution kernel
34
+ kernel_vals = [
35
+ [0.0625, 0.125, 0.0625],
36
+ [0.125, 0.25, 0.125],
37
+ [0.0625, 0.125, 0.0625],
38
+ ]
39
+ kernel = torch.tensor(kernel_vals, dtype=image.dtype, device=image.device)
40
+ # add channel dimensions to the kernel to make it a 4D tensor
41
+ kernel = kernel[None, None]
42
+ # repeat the kernel across all input channels
43
+ kernel = kernel.repeat(3, 1, 1, 1)
44
+ image = F.pad(image, (radius, radius, radius, radius), mode='replicate')
45
+ # apply convolution
46
+ output = F.conv2d(image, kernel, groups=3, dilation=radius)
47
+ return output
48
+
49
+
50
+ def wavelet_decomposition(image: Tensor, levels=5):
51
+ """
52
+ Apply wavelet decomposition to the input tensor.
53
+ This function only returns the low frequency & the high frequency.
54
+ """
55
+ high_freq = torch.zeros_like(image)
56
+ for i in range(levels):
57
+ radius = 2 ** i
58
+ low_freq = wavelet_blur(image, radius)
59
+ high_freq += (image - low_freq)
60
+ image = low_freq
61
+
62
+ return high_freq, low_freq
63
+
64
+
65
+ def wavelet_reconstruction(content_feat: Tensor, style_feat: Tensor):
66
+ """
67
+ Recombine the content's high-frequency part with the style's low-frequency part, so the content keeps its detail but takes on the style's colors.
68
+ """
69
+ # calculate the wavelet decomposition of the content feature
70
+ content_high_freq, content_low_freq = wavelet_decomposition(content_feat)
71
+ del content_low_freq
72
+ # calculate the wavelet decomposition of the style feature
73
+ style_high_freq, style_low_freq = wavelet_decomposition(style_feat)
74
+ del style_high_freq
75
+ # reconstruct the content feature with the style's high frequency
76
+ return content_high_freq + style_low_freq
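A small sanity-check sketch of the three wavelet helpers above: the decomposition telescopes, so high + low reproduces the input, and the reconstruction keeps the content's detail while borrowing the style's low-frequency colors (the tensor sizes here are illustrative):

import torch
from utils.common import wavelet_decomposition, wavelet_reconstruction

content = torch.rand(1, 3, 64, 64)   # e.g. the diffusion output
style = torch.rand(1, 3, 64, 64)     # e.g. the stage-1 (color-faithful) image
high, low = wavelet_decomposition(content)
print(torch.allclose(high + low, content, atol=1e-5))  # True: the split is lossless
fixed = wavelet_reconstruction(content, style)          # content detail + style color
print(fixed.shape)                                       # torch.Size([1, 3, 64, 64])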
77
+
78
+
79
+ # https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/utils/download_util.py/
80
+ def load_file_from_url(url, model_dir=None, progress=True, file_name=None):
81
+ """Load a file from an HTTP URL, downloading the model if necessary.
82
+
83
+ Ref:https://github.com/1adrianb/face-alignment/blob/master/face_alignment/utils.py
84
+
85
+ Args:
86
+ url (str): URL to be downloaded.
87
+ model_dir (str): The path to save the downloaded model. Should be a full path. If None, use pytorch hub_dir.
88
+ Default: None.
89
+ progress (bool): Whether to show the download progress. Default: True.
90
+ file_name (str): The downloaded file name. If None, use the file name in the url. Default: None.
91
+
92
+ Returns:
93
+ str: The path to the downloaded file.
94
+ """
95
+ if model_dir is None: # use the pytorch hub_dir
96
+ hub_dir = get_dir()
97
+ model_dir = os.path.join(hub_dir, 'checkpoints')
98
+
99
+ os.makedirs(model_dir, exist_ok=True)
100
+
101
+ parts = urlparse(url)
102
+ filename = os.path.basename(parts.path)
103
+ if file_name is not None:
104
+ filename = file_name
105
+ cached_file = os.path.abspath(os.path.join(model_dir, filename))
106
+ if not os.path.exists(cached_file):
107
+ print(f'Downloading: "{url}" to {cached_file}\n')
108
+ download_url_to_file(url, cached_file, hash_prefix=None, progress=progress)
109
+ return cached_file
110
+
111
+
112
+ def sliding_windows(h: int, w: int, tile_size: int, tile_stride: int) -> List[Tuple[int, int, int, int]]:
113
+ hi_list = list(range(0, h - tile_size + 1, tile_stride))
114
+ if (h - tile_size) % tile_stride != 0:
115
+ hi_list.append(h - tile_size)
116
+
117
+ wi_list = list(range(0, w - tile_size + 1, tile_stride))
118
+ if (w - tile_size) % tile_stride != 0:
119
+ wi_list.append(w - tile_size)
120
+
121
+ coords = []
122
+ for hi in hi_list:
123
+ for wi in wi_list:
124
+ coords.append((hi, hi + tile_size, wi, wi + tile_size))
125
+ return coords
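A worked example of the tiling helper; each returned tuple is (top, bottom, left, right) in pixels:

from utils.common import sliding_windows

coords = sliding_windows(h=768, w=768, tile_size=512, tile_stride=256)
print(coords)
# [(0, 512, 0, 512), (0, 512, 256, 768), (256, 768, 0, 512), (256, 768, 256, 768)]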
126
+
127
+
128
+ # https://github.com/csslc/CCSR/blob/main/model/q_sampler.py#L503
129
+ def gaussian_weights(tile_width: int, tile_height: int) -> np.ndarray:
130
+ """Generates a gaussian mask of weights for tile contributions"""
131
+ latent_width = tile_width
132
+ latent_height = tile_height
133
+ var = 0.01
134
+ midpoint = (latent_width - 1) / 2 # -1 because index goes from 0 to latent_width - 1
135
+ x_probs = [
136
+ np.exp(-(x - midpoint) * (x - midpoint) / (latent_width * latent_width) / (2 * var)) / np.sqrt(2 * np.pi * var)
137
+ for x in range(latent_width)]
138
+ midpoint = latent_height / 2
139
+ y_probs = [
140
+ np.exp(-(y - midpoint) * (y - midpoint) / (latent_height * latent_height) / (2 * var)) / np.sqrt(2 * np.pi * var)
141
+ for y in range(latent_height)]
142
+ weights = np.outer(y_probs, x_probs)
143
+ return weights
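A hedged sketch of how these weights are typically combined with the sliding windows above to blend overlapping tile outputs; the per-tile model call is a stand-in:

import numpy as np
from utils.common import gaussian_weights, sliding_windows

h = w = 768
tile_size, tile_stride = 512, 256
acc = np.zeros((h, w))
weight_sum = np.zeros((h, w))
tile_weights = gaussian_weights(tile_width=tile_size, tile_height=tile_size)
for top, bottom, left, right in sliding_windows(h, w, tile_size, tile_stride):
    tile_pred = np.random.rand(tile_size, tile_size)  # stand-in for a per-tile prediction
    acc[top:bottom, left:right] += tile_pred * tile_weights
    weight_sum[top:bottom, left:right] += tile_weights
blended = acc / weight_sum  # every pixel is covered by at least one tile here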
144
+
145
+
146
+ COUNT_VRAM = bool(os.environ.get("COUNT_VRAM", False))
147
+
148
+ def count_vram_usage(func: Callable) -> Callable:
149
+ if not COUNT_VRAM:
150
+ return func
151
+
152
+ def wrapper(*args, **kwargs):
153
+ peak_before = torch.cuda.max_memory_allocated() / (1024 ** 3)
154
+ ret = func(*args, **kwargs)
155
+ torch.cuda.synchronize()
156
+ peak_after = torch.cuda.max_memory_allocated() / (1024 ** 3)
157
+ print(f"VRAM peak before {func.__name__}: {peak_before:.5f} GB, after: {peak_after:.5f} GB")
158
+ return ret
159
+ return wrapper
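The decorator above is a no-op unless the COUNT_VRAM environment variable is set before utils.common is imported; a minimal sketch (requires a CUDA device, and the function body is illustrative):

import torch
from utils.common import count_vram_usage

@count_vram_usage
def allocate() -> torch.Tensor:
    # roughly 1 GiB of float32 on the default CUDA device
    return torch.zeros(1024, 1024, 256, device="cuda")

allocate()  # run with COUNT_VRAM=1 to get the before/after peak-VRAM print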
utils/cond_fn.py ADDED
@@ -0,0 +1,98 @@
1
+ from typing import overload, Tuple
2
+ import torch
3
+ from torch.nn import functional as F
4
+
5
+
6
+ class Guidance:
7
+
8
+ def __init__(self, scale: float, t_start: int, t_stop: int, space: str, repeat: int) -> None:
9
+ """
10
+ Initialize restoration guidance.
11
+
12
+ Args:
13
+ scale (float): Gradient scale (denoted as `s` in our paper). The larger the gradient scale,
14
+ the closer the final result will be to the output of the first stage model.
15
+ t_start (int), t_stop (int): The timestep to start or stop guidance. Note that the sampling
16
+ process runs from t=1000 down to t=0, so `t_start` should be larger than `t_stop`.
17
+ space (str): The data space for computing loss function (rgb or latent).
18
+
19
+ Our restoration guidance is based on [GDP](https://github.com/Fayeben/GenerativeDiffusionPrior).
20
+ Thanks for their work!
21
+ """
22
+ self.scale = scale * 3000
23
+ self.t_start = t_start
24
+ self.t_stop = t_stop
25
+ self.target = None
26
+ self.space = space
27
+ self.repeat = repeat
28
+
29
+ def load_target(self, target: torch.Tensor) -> None:
30
+ self.target = target
31
+
32
+ def __call__(self, target_x0: torch.Tensor, pred_x0: torch.Tensor, t: int) -> Tuple[torch.Tensor, float]:
33
+ # avoid propagating gradient out of this scope
34
+ pred_x0 = pred_x0.detach().clone()
35
+ target_x0 = target_x0.detach().clone()
36
+ return self._forward(target_x0, pred_x0, t)
37
+
38
+ @overload
39
+ def _forward(self, target_x0: torch.Tensor, pred_x0: torch.Tensor, t: int) -> Tuple[torch.Tensor, float]:
40
+ ...
41
+
42
+
43
+ class MSEGuidance(Guidance):
44
+
45
+ def _forward(self, target_x0: torch.Tensor, pred_x0: torch.Tensor, t: int) -> Tuple[torch.Tensor, float]:
46
+ # inputs: [-1, 1], nchw, rgb
47
+ with torch.enable_grad():
48
+ pred_x0.requires_grad_(True)
49
+ loss = (pred_x0 - target_x0).pow(2).mean((1, 2, 3)).sum()
50
+ scale = self.scale
51
+ g = -torch.autograd.grad(loss, pred_x0)[0] * scale
52
+ return g, loss.item()
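A minimal sketch of calling the guidance on dummy data; the constructor values are illustrative, not the project's documented defaults. Inputs are expected in [-1, 1], NCHW, RGB (see the comment above), and the returned gradient has the shape of pred_x0:

import torch
from utils.cond_fn import MSEGuidance

cond_fn = MSEGuidance(scale=0.1, t_start=601, t_stop=-1, space="rgb", repeat=1)
target_x0 = torch.rand(1, 3, 64, 64) * 2 - 1   # stage-1 output mapped to [-1, 1]
pred_x0 = torch.rand(1, 3, 64, 64) * 2 - 1     # current x0 estimate from the sampler
grad, loss = cond_fn(target_x0, pred_x0, t=500)
print(grad.shape, loss)                         # torch.Size([1, 3, 64, 64]) and a python float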
53
+
54
+
55
+ class WeightedMSEGuidance(Guidance):
56
+
57
+ def _get_weight(self, target: torch.Tensor) -> torch.Tensor:
58
+ # convert RGB to G
59
+ rgb_to_gray_kernel = torch.tensor([0.2989, 0.5870, 0.1140]).view(1, 3, 1, 1)
60
+ target = torch.sum(target * rgb_to_gray_kernel.to(target.device), dim=1, keepdim=True)
61
+ # initialize sobel kernel in x and y axis
62
+ G_x = [
63
+ [1, 0, -1],
64
+ [2, 0, -2],
65
+ [1, 0, -1]
66
+ ]
67
+ G_y = [
68
+ [1, 2, 1],
69
+ [0, 0, 0],
70
+ [-1, -2, -1]
71
+ ]
72
+ G_x = torch.tensor(G_x, dtype=target.dtype, device=target.device)[None]
73
+ G_y = torch.tensor(G_y, dtype=target.dtype, device=target.device)[None]
74
+ G = torch.stack((G_x, G_y))
75
+
76
+ target = F.pad(target, (1, 1, 1, 1), mode='replicate') # padding = 1
77
+ grad = F.conv2d(target, G, stride=1)
78
+ mag = grad.pow(2).sum(dim=1, keepdim=True).sqrt()
79
+
80
+ n, c, h, w = mag.size()
81
+ block_size = 2
82
+ blocks = mag.view(n, c, h // block_size, block_size, w // block_size, block_size).permute(0, 1, 2, 4, 3, 5).contiguous()
83
+ block_mean = blocks.sum(dim=(-2, -1), keepdim=True).tanh().repeat(1, 1, 1, 1, block_size, block_size).permute(0, 1, 2, 4, 3, 5).contiguous()
84
+ block_mean = block_mean.view(n, c, h, w)
85
+ weight_map = 1 - block_mean
86
+
87
+ return weight_map
88
+
89
+ def _forward(self, target_x0: torch.Tensor, pred_x0: torch.Tensor, t: int) -> Tuple[torch.Tensor, float]:
90
+ # inputs: [-1, 1], nchw, rgb
91
+ with torch.no_grad():
92
+ w = self._get_weight((target_x0 + 1) / 2)
93
+ with torch.enable_grad():
94
+ pred_x0.requires_grad_(True)
95
+ loss = ((pred_x0 - target_x0).pow(2) * w).mean((1, 2, 3)).sum()
96
+ scale = self.scale
97
+ g = -torch.autograd.grad(loss, pred_x0)[0] * scale
98
+ return g, loss.item()
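A small sketch of the weight map used above: it is roughly 1 in flat regions and roughly 0 around strong edges, so the weighted guidance constrains smooth areas toward the reference while leaving edges, where the diffusion model should add detail, unconstrained. The input to _get_weight is expected in [0, 1]:

import torch
from utils.cond_fn import WeightedMSEGuidance

g = WeightedMSEGuidance(scale=0.1, t_start=601, t_stop=-1, space="rgb", repeat=1)
img = torch.zeros(1, 3, 64, 64)
img[..., 32:] = 1.0                       # a hard vertical edge down the middle
w = g._get_weight(img)
print(w.shape)                             # torch.Size([1, 1, 64, 64])
print(round(w[0, 0, 0, 0].item(), 3),      # ~1.0 far from the edge
      round(w[0, 0, 0, 32].item(), 3))     # ~0.0 on the edge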
utils/face_restoration_helper.py ADDED
@@ -0,0 +1,517 @@
1
+ import cv2
2
+ import numpy as np
3
+ import os
4
+ import torch
5
+ from torchvision.transforms.functional import normalize
6
+
7
+ from facexlib.detection import init_detection_model
8
+ from facexlib.parsing import init_parsing_model
9
+ from facexlib.utils.misc import img2tensor, imwrite
10
+
11
+ from utils.common import load_file_from_url
12
+
13
+ def get_largest_face(det_faces, h, w):
14
+
15
+ def get_location(val, length):
16
+ if val < 0:
17
+ return 0
18
+ elif val > length:
19
+ return length
20
+ else:
21
+ return val
22
+
23
+ face_areas = []
24
+ for det_face in det_faces:
25
+ left = get_location(det_face[0], w)
26
+ right = get_location(det_face[2], w)
27
+ top = get_location(det_face[1], h)
28
+ bottom = get_location(det_face[3], h)
29
+ face_area = (right - left) * (bottom - top)
30
+ face_areas.append(face_area)
31
+ largest_idx = face_areas.index(max(face_areas))
32
+ return det_faces[largest_idx], largest_idx
33
+
34
+
35
+ def get_center_face(det_faces, h=0, w=0, center=None):
36
+ if center is not None:
37
+ center = np.array(center)
38
+ else:
39
+ center = np.array([w / 2, h / 2])
40
+ center_dist = []
41
+ for det_face in det_faces:
42
+ face_center = np.array([(det_face[0] + det_face[2]) / 2, (det_face[1] + det_face[3]) / 2])
43
+ dist = np.linalg.norm(face_center - center)
44
+ center_dist.append(dist)
45
+ center_idx = center_dist.index(min(center_dist))
46
+ return det_faces[center_idx], center_idx
47
+
48
+
49
+ class FaceRestoreHelper(object):
50
+ """Helper for the face restoration pipeline (base class)."""
51
+
52
+ def __init__(self,
53
+ upscale_factor,
54
+ face_size=512,
55
+ crop_ratio=(1, 1),
56
+ det_model='retinaface_resnet50',
57
+ save_ext='png',
58
+ template_3points=False,
59
+ pad_blur=False,
60
+ use_parse=False,
61
+ device=None):
62
+ self.template_3points = template_3points # improve robustness
63
+ self.upscale_factor = int(upscale_factor)
64
+ # the cropped face ratio based on the square face
65
+ self.crop_ratio = crop_ratio # (h, w)
66
+ assert (self.crop_ratio[0] >= 1 and self.crop_ratio[1] >= 1), 'crop ratio only supports >= 1'
67
+ self.face_size = (int(face_size * self.crop_ratio[1]), int(face_size * self.crop_ratio[0]))
68
+ self.det_model = det_model
69
+
70
+ if self.det_model == 'dlib':
71
+ # standard 5 landmarks for FFHQ faces with 1024 x 1024
72
+ self.face_template = np.array([[686.77227723, 488.62376238], [586.77227723, 493.59405941],
73
+ [337.91089109, 488.38613861], [437.95049505, 493.51485149],
74
+ [513.58415842, 678.5049505]])
75
+ self.face_template = self.face_template / (1024 // face_size)
76
+ elif self.template_3points:
77
+ self.face_template = np.array([[192, 240], [319, 240], [257, 371]])
78
+ else:
79
+ # standard 5 landmarks for FFHQ faces with 512 x 512
80
+ # facexlib
81
+ self.face_template = np.array([[192.98138, 239.94708], [318.90277, 240.1936], [256.63416, 314.01935],
82
+ [201.26117, 371.41043], [313.08905, 371.15118]])
83
+
84
+ # dlib: left_eye: 36:41 right_eye: 42:47 nose: 30,32,33,34 left mouth corner: 48 right mouth corner: 54
85
+ # self.face_template = np.array([[193.65928, 242.98541], [318.32558, 243.06108], [255.67984, 328.82894],
86
+ # [198.22603, 372.82502], [313.91018, 372.75659]])
87
+
88
+ self.face_template = self.face_template * (face_size / 512.0)
89
+ if self.crop_ratio[0] > 1:
90
+ self.face_template[:, 1] += face_size * (self.crop_ratio[0] - 1) / 2
91
+ if self.crop_ratio[1] > 1:
92
+ self.face_template[:, 0] += face_size * (self.crop_ratio[1] - 1) / 2
93
+ self.save_ext = save_ext
94
+ self.pad_blur = pad_blur
95
+ if self.pad_blur is True:
96
+ self.template_3points = False
97
+
98
+ self.all_landmarks_5 = []
99
+ self.det_faces = []
100
+ self.affine_matrices = []
101
+ self.inverse_affine_matrices = []
102
+ self.cropped_faces = []
103
+ self.restored_faces = []
104
+ self.pad_input_imgs = []
105
+
106
+ if device is None:
107
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
108
+ # self.device = get_device()
109
+ else:
110
+ self.device = device
111
+
112
+ # init face detection model
113
+ self.face_detector = init_detection_model(det_model, half=False, device=self.device)
114
+
115
+ # init face parsing model
116
+ self.use_parse = use_parse
117
+ self.face_parse = init_parsing_model(model_name='parsenet', device=self.device)
118
+
119
+ def set_upscale_factor(self, upscale_factor):
120
+ self.upscale_factor = upscale_factor
121
+
122
+ def read_image(self, img):
123
+ """img can be image path or cv2 loaded image."""
124
+ # self.input_img is Numpy array, (h, w, c), BGR, uint8, [0, 255]
125
+ if isinstance(img, str):
126
+ img = cv2.imread(img)
127
+
128
+ if np.max(img) > 256: # 16-bit image
129
+ img = img / 65535 * 255
130
+ if len(img.shape) == 2: # gray image
131
+ img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
132
+ elif img.shape[2] == 4: # BGRA image with alpha channel
133
+ img = img[:, :, 0:3]
134
+
135
+ self.input_img = img
136
+ # self.is_gray = is_gray(img, threshold=10)
137
+ # if self.is_gray:
138
+ # print('Grayscale input: True')
139
+
140
+ if min(self.input_img.shape[:2])<512:
141
+ f = 512.0/min(self.input_img.shape[:2])
142
+ self.input_img = cv2.resize(self.input_img, (0,0), fx=f, fy=f, interpolation=cv2.INTER_LINEAR)
143
+
144
+ def init_dlib(self, detection_path, landmark5_path):
145
+ """Initialize the dlib detectors and predictors."""
146
+ try:
147
+ import dlib
148
+ except ImportError:
149
+ print('Please install dlib by running: conda install -c conda-forge dlib')
150
+ detection_path = load_file_from_url(url=detection_path, model_dir='weights/dlib', progress=True, file_name=None)
151
+ landmark5_path = load_file_from_url(url=landmark5_path, model_dir='weights/dlib', progress=True, file_name=None)
152
+ face_detector = dlib.cnn_face_detection_model_v1(detection_path)
153
+ shape_predictor_5 = dlib.shape_predictor(landmark5_path)
154
+ return face_detector, shape_predictor_5
155
+
156
+ def get_face_landmarks_5_dlib(self,
157
+ only_keep_largest=False,
158
+ scale=1):
159
+ det_faces = self.face_detector(self.input_img, scale)
160
+
161
+ if len(det_faces) == 0:
162
+ print('No face detected. Try to increase upsample_num_times.')
163
+ return 0
164
+ else:
165
+ if only_keep_largest:
166
+ print('Detect several faces and only keep the largest.')
167
+ face_areas = []
168
+ for i in range(len(det_faces)):
169
+ face_area = (det_faces[i].rect.right() - det_faces[i].rect.left()) * (
170
+ det_faces[i].rect.bottom() - det_faces[i].rect.top())
171
+ face_areas.append(face_area)
172
+ largest_idx = face_areas.index(max(face_areas))
173
+ self.det_faces = [det_faces[largest_idx]]
174
+ else:
175
+ self.det_faces = det_faces
176
+
177
+ if len(self.det_faces) == 0:
178
+ return 0
179
+
180
+ for face in self.det_faces:
181
+ shape = self.shape_predictor_5(self.input_img, face.rect)
182
+ landmark = np.array([[part.x, part.y] for part in shape.parts()])
183
+ self.all_landmarks_5.append(landmark)
184
+
185
+ return len(self.all_landmarks_5)
186
+
187
+
188
+ def get_face_landmarks_5(self,
189
+ only_keep_largest=False,
190
+ only_center_face=False,
191
+ resize=None,
192
+ blur_ratio=0.01,
193
+ eye_dist_threshold=None):
194
+ if self.det_model == 'dlib':
195
+ return self.get_face_landmarks_5_dlib(only_keep_largest)
196
+
197
+ if resize is None:
198
+ scale = 1
199
+ input_img = self.input_img
200
+ else:
201
+ h, w = self.input_img.shape[0:2]
202
+ scale = resize / min(h, w)
203
+ scale = max(1, scale) # always scale up
204
+ h, w = int(h * scale), int(w * scale)
205
+ interp = cv2.INTER_AREA if scale < 1 else cv2.INTER_LINEAR
206
+ input_img = cv2.resize(self.input_img, (w, h), interpolation=interp)
207
+
208
+ with torch.no_grad():
209
+ bboxes = self.face_detector.detect_faces(input_img)
210
+
211
+ if bboxes is None or bboxes.shape[0] == 0:
212
+ return 0
213
+ else:
214
+ bboxes = bboxes / scale
215
+
216
+ for bbox in bboxes:
217
+ # remove faces with too small eye distance: side faces or too small faces
218
+ eye_dist = np.linalg.norm([bbox[6] - bbox[8], bbox[7] - bbox[9]])
219
+ if eye_dist_threshold is not None and (eye_dist < eye_dist_threshold):
220
+ continue
221
+
222
+ if self.template_3points:
223
+ landmark = np.array([[bbox[i], bbox[i + 1]] for i in range(5, 11, 2)])
224
+ else:
225
+ landmark = np.array([[bbox[i], bbox[i + 1]] for i in range(5, 15, 2)])
226
+ self.all_landmarks_5.append(landmark)
227
+ self.det_faces.append(bbox[0:5])
228
+
229
+ if len(self.det_faces) == 0:
230
+ return 0
231
+ if only_keep_largest:
232
+ h, w, _ = self.input_img.shape
233
+ self.det_faces, largest_idx = get_largest_face(self.det_faces, h, w)
234
+ self.all_landmarks_5 = [self.all_landmarks_5[largest_idx]]
235
+ elif only_center_face:
236
+ h, w, _ = self.input_img.shape
237
+ self.det_faces, center_idx = get_center_face(self.det_faces, h, w)
238
+ self.all_landmarks_5 = [self.all_landmarks_5[center_idx]]
239
+
240
+ # pad blurry images
241
+ if self.pad_blur:
242
+ self.pad_input_imgs = []
243
+ for landmarks in self.all_landmarks_5:
244
+ # get landmarks
245
+ eye_left = landmarks[0, :]
246
+ eye_right = landmarks[1, :]
247
+ eye_avg = (eye_left + eye_right) * 0.5
248
+ mouth_avg = (landmarks[3, :] + landmarks[4, :]) * 0.5
249
+ eye_to_eye = eye_right - eye_left
250
+ eye_to_mouth = mouth_avg - eye_avg
251
+
252
+ # Get the oriented crop rectangle
253
+ # x: half width of the oriented crop rectangle
254
+ x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1]
255
+ # - np.flipud(eye_to_mouth) * [-1, 1]: rotate 90 clockwise
256
+ # norm with the hypotenuse: get the direction
257
+ x /= np.hypot(*x) # get the hypotenuse of a right triangle
258
+ rect_scale = 1.5
259
+ x *= max(np.hypot(*eye_to_eye) * 2.0 * rect_scale, np.hypot(*eye_to_mouth) * 1.8 * rect_scale)
260
+ # y: half height of the oriented crop rectangle
261
+ y = np.flipud(x) * [-1, 1]
262
+
263
+ # c: center
264
+ c = eye_avg + eye_to_mouth * 0.1
265
+ # quad: (left_top, left_bottom, right_bottom, right_top)
266
+ quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y])
267
+ # qsize: side length of the square
268
+ qsize = np.hypot(*x) * 2
269
+ border = max(int(np.rint(qsize * 0.1)), 3)
270
+
271
+ # get pad
272
+ # pad: (width_left, height_top, width_right, height_bottom)
273
+ pad = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))), int(np.ceil(max(quad[:, 0]))),
274
+ int(np.ceil(max(quad[:, 1]))))
275
+ pad = [
276
+ max(-pad[0] + border, 1),
277
+ max(-pad[1] + border, 1),
278
+ max(pad[2] - self.input_img.shape[0] + border, 1),
279
+ max(pad[3] - self.input_img.shape[1] + border, 1)
280
+ ]
281
+
282
+ if max(pad) > 1:
283
+ # pad image
284
+ pad_img = np.pad(self.input_img, ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)), 'reflect')
285
+ # modify landmark coords
286
+ landmarks[:, 0] += pad[0]
287
+ landmarks[:, 1] += pad[1]
288
+ # blur pad images
289
+ h, w, _ = pad_img.shape
290
+ y, x, _ = np.ogrid[:h, :w, :1]
291
+ mask = np.maximum(1.0 - np.minimum(np.float32(x) / pad[0],
292
+ np.float32(w - 1 - x) / pad[2]),
293
+ 1.0 - np.minimum(np.float32(y) / pad[1],
294
+ np.float32(h - 1 - y) / pad[3]))
295
+ blur = int(qsize * blur_ratio)
296
+ if blur % 2 == 0:
297
+ blur += 1
298
+ blur_img = cv2.boxFilter(pad_img, 0, ksize=(blur, blur))
299
+ # blur_img = cv2.GaussianBlur(pad_img, (blur, blur), 0)
300
+
301
+ pad_img = pad_img.astype('float32')
302
+ pad_img += (blur_img - pad_img) * np.clip(mask * 3.0 + 1.0, 0.0, 1.0)
303
+ pad_img += (np.median(pad_img, axis=(0, 1)) - pad_img) * np.clip(mask, 0.0, 1.0)
304
+ pad_img = np.clip(pad_img, 0, 255) # float32, [0, 255]
305
+ self.pad_input_imgs.append(pad_img)
306
+ else:
307
+ self.pad_input_imgs.append(np.copy(self.input_img))
308
+
309
+ return len(self.all_landmarks_5)
310
+
311
+ def align_warp_face(self, save_cropped_path=None, border_mode='constant'):
312
+ """Align and warp faces with face template.
313
+ """
314
+ if self.pad_blur:
315
+ assert len(self.pad_input_imgs) == len(
316
+ self.all_landmarks_5), f'Mismatched samples: {len(self.pad_input_imgs)} and {len(self.all_landmarks_5)}'
317
+ for idx, landmark in enumerate(self.all_landmarks_5):
318
+ # use 5 landmarks to get affine matrix
319
+ # use cv2.LMEDS method for the equivalence to skimage transform
320
+ # ref: https://blog.csdn.net/yichxi/article/details/115827338
321
+ affine_matrix = cv2.estimateAffinePartial2D(landmark, self.face_template, method=cv2.LMEDS)[0]
322
+ self.affine_matrices.append(affine_matrix)
323
+ # warp and crop faces
324
+ if border_mode == 'constant':
325
+ border_mode = cv2.BORDER_CONSTANT
326
+ elif border_mode == 'reflect101':
327
+ border_mode = cv2.BORDER_REFLECT101
328
+ elif border_mode == 'reflect':
329
+ border_mode = cv2.BORDER_REFLECT
330
+ if self.pad_blur:
331
+ input_img = self.pad_input_imgs[idx]
332
+ else:
333
+ input_img = self.input_img
334
+ cropped_face = cv2.warpAffine(
335
+ input_img, affine_matrix, self.face_size, borderMode=border_mode, borderValue=(135, 133, 132)) # gray
336
+ self.cropped_faces.append(cropped_face)
337
+ # save the cropped face
338
+ if save_cropped_path is not None:
339
+ path = os.path.splitext(save_cropped_path)[0]
340
+ save_path = f'{path}_{idx:02d}.{self.save_ext}'
341
+ imwrite(cropped_face, save_path)
342
+
343
+ def get_inverse_affine(self, save_inverse_affine_path=None):
344
+ """Get inverse affine matrix."""
345
+ for idx, affine_matrix in enumerate(self.affine_matrices):
346
+ inverse_affine = cv2.invertAffineTransform(affine_matrix)
347
+ inverse_affine *= self.upscale_factor
348
+ self.inverse_affine_matrices.append(inverse_affine)
349
+ # save inverse affine matrices
350
+ if save_inverse_affine_path is not None:
351
+ path, _ = os.path.splitext(save_inverse_affine_path)
352
+ save_path = f'{path}_{idx:02d}.pth'
353
+ torch.save(inverse_affine, save_path)
354
+
355
+
356
+ def add_restored_face(self, restored_face, input_face=None):
357
+ # if self.is_gray:
358
+ # restored_face = bgr2gray(restored_face) # convert img into grayscale
359
+ # if input_face is not None:
360
+ # restored_face = adain_npy(restored_face, input_face) # transfer the color
361
+ self.restored_faces.append(restored_face)
362
+
363
+
364
+ def paste_faces_to_input_image(self, save_path=None, upsample_img=None, draw_box=False, face_upsampler=None):
365
+ h, w, _ = self.input_img.shape
366
+ h_up, w_up = int(h * self.upscale_factor), int(w * self.upscale_factor)
367
+
368
+ if upsample_img is None:
369
+ # simply resize the background
370
+ # upsample_img = cv2.resize(self.input_img, (w_up, h_up), interpolation=cv2.INTER_LANCZOS4)
371
+ upsample_img = cv2.resize(self.input_img, (w_up, h_up), interpolation=cv2.INTER_LINEAR)
372
+ else:
373
+ upsample_img = cv2.resize(upsample_img, (w_up, h_up), interpolation=cv2.INTER_LANCZOS4)
374
+
375
+ assert len(self.restored_faces) == len(
376
+ self.inverse_affine_matrices), ('lengths of restored_faces and affine_matrices differ.')
377
+
378
+ inv_mask_borders = []
379
+ for restored_face, inverse_affine in zip(self.restored_faces, self.inverse_affine_matrices):
380
+ if face_upsampler is not None:
381
+ restored_face = face_upsampler.enhance(restored_face, outscale=self.upscale_factor)[0]
382
+ inverse_affine /= self.upscale_factor
383
+ inverse_affine[:, 2] *= self.upscale_factor
384
+ face_size = (self.face_size[0]*self.upscale_factor, self.face_size[1]*self.upscale_factor)
385
+ else:
386
+ # Add an offset to inverse affine matrix, for more precise back alignment
387
+ if self.upscale_factor > 1:
388
+ extra_offset = 0.5 * self.upscale_factor
389
+ else:
390
+ extra_offset = 0
391
+ inverse_affine[:, 2] += extra_offset
392
+ face_size = self.face_size
393
+ inv_restored = cv2.warpAffine(restored_face, inverse_affine, (w_up, h_up))
394
+
395
+ # if draw_box or not self.use_parse: # use square parse maps
396
+ # mask = np.ones(face_size, dtype=np.float32)
397
+ # inv_mask = cv2.warpAffine(mask, inverse_affine, (w_up, h_up))
398
+ # # remove the black borders
399
+ # inv_mask_erosion = cv2.erode(
400
+ # inv_mask, np.ones((int(2 * self.upscale_factor), int(2 * self.upscale_factor)), np.uint8))
401
+ # pasted_face = inv_mask_erosion[:, :, None] * inv_restored
402
+ # total_face_area = np.sum(inv_mask_erosion) # // 3
403
+ # # add border
404
+ # if draw_box:
405
+ # h, w = face_size
406
+ # mask_border = np.ones((h, w, 3), dtype=np.float32)
407
+ # border = int(1400/np.sqrt(total_face_area))
408
+ # mask_border[border:h-border, border:w-border,:] = 0
409
+ # inv_mask_border = cv2.warpAffine(mask_border, inverse_affine, (w_up, h_up))
410
+ # inv_mask_borders.append(inv_mask_border)
411
+ # if not self.use_parse:
412
+ # # compute the fusion edge based on the area of face
413
+ # w_edge = int(total_face_area**0.5) // 20
414
+ # erosion_radius = w_edge * 2
415
+ # inv_mask_center = cv2.erode(inv_mask_erosion, np.ones((erosion_radius, erosion_radius), np.uint8))
416
+ # blur_size = w_edge * 2
417
+ # inv_soft_mask = cv2.GaussianBlur(inv_mask_center, (blur_size + 1, blur_size + 1), 0)
418
+ # if len(upsample_img.shape) == 2: # upsample_img is gray image
419
+ # upsample_img = upsample_img[:, :, None]
420
+ # inv_soft_mask = inv_soft_mask[:, :, None]
421
+
422
+ # always use square mask
423
+ mask = np.ones(face_size, dtype=np.float32)
424
+ inv_mask = cv2.warpAffine(mask, inverse_affine, (w_up, h_up))
425
+ # remove the black borders
426
+ inv_mask_erosion = cv2.erode(
427
+ inv_mask, np.ones((int(2 * self.upscale_factor), int(2 * self.upscale_factor)), np.uint8))
428
+ pasted_face = inv_mask_erosion[:, :, None] * inv_restored
429
+ total_face_area = np.sum(inv_mask_erosion) # // 3
430
+ # add border
431
+ if draw_box:
432
+ h, w = face_size
433
+ mask_border = np.ones((h, w, 3), dtype=np.float32)
434
+ border = int(1400/np.sqrt(total_face_area))
435
+ mask_border[border:h-border, border:w-border,:] = 0
436
+ inv_mask_border = cv2.warpAffine(mask_border, inverse_affine, (w_up, h_up))
437
+ inv_mask_borders.append(inv_mask_border)
438
+ # compute the fusion edge based on the area of face
439
+ w_edge = int(total_face_area**0.5) // 20
440
+ erosion_radius = w_edge * 2
441
+ inv_mask_center = cv2.erode(inv_mask_erosion, np.ones((erosion_radius, erosion_radius), np.uint8))
442
+ blur_size = w_edge * 2
443
+ inv_soft_mask = cv2.GaussianBlur(inv_mask_center, (blur_size + 1, blur_size + 1), 0)
444
+ if len(upsample_img.shape) == 2: # upsample_img is gray image
445
+ upsample_img = upsample_img[:, :, None]
446
+ inv_soft_mask = inv_soft_mask[:, :, None]
447
+
448
+ # parse mask
449
+ if self.use_parse:
450
+ # inference
451
+ face_input = cv2.resize(restored_face, (512, 512), interpolation=cv2.INTER_LINEAR)
452
+ face_input = img2tensor(face_input.astype('float32') / 255., bgr2rgb=True, float32=True)
453
+ normalize(face_input, (0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True)
454
+ face_input = torch.unsqueeze(face_input, 0).to(self.device)
455
+ with torch.no_grad():
456
+ out = self.face_parse(face_input)[0]
457
+ out = out.argmax(dim=1).squeeze().cpu().numpy()
458
+
459
+ parse_mask = np.zeros(out.shape)
460
+ MASK_COLORMAP = [0, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 255, 0, 0, 0]
461
+ for idx, color in enumerate(MASK_COLORMAP):
462
+ parse_mask[out == idx] = color
463
+ # blur the mask
464
+ parse_mask = cv2.GaussianBlur(parse_mask, (101, 101), 11)
465
+ parse_mask = cv2.GaussianBlur(parse_mask, (101, 101), 11)
466
+ # remove the black borders
467
+ thres = 10
468
+ parse_mask[:thres, :] = 0
469
+ parse_mask[-thres:, :] = 0
470
+ parse_mask[:, :thres] = 0
471
+ parse_mask[:, -thres:] = 0
472
+ parse_mask = parse_mask / 255.
473
+
474
+ parse_mask = cv2.resize(parse_mask, face_size)
475
+ parse_mask = cv2.warpAffine(parse_mask, inverse_affine, (w_up, h_up), flags=3)
476
+ inv_soft_parse_mask = parse_mask[:, :, None]
477
+ # pasted_face = inv_restored
478
+ fuse_mask = (inv_soft_parse_mask<inv_soft_mask).astype('int')
479
+ inv_soft_mask = inv_soft_parse_mask*fuse_mask + inv_soft_mask*(1-fuse_mask)
480
+
481
+ if len(upsample_img.shape) == 3 and upsample_img.shape[2] == 4: # alpha channel
482
+ alpha = upsample_img[:, :, 3:]
483
+ upsample_img = inv_soft_mask * pasted_face + (1 - inv_soft_mask) * upsample_img[:, :, 0:3]
484
+ upsample_img = np.concatenate((upsample_img, alpha), axis=2)
485
+ else:
486
+ upsample_img = inv_soft_mask * pasted_face + (1 - inv_soft_mask) * upsample_img
487
+
488
+ if np.max(upsample_img) > 256: # 16-bit image
489
+ upsample_img = upsample_img.astype(np.uint16)
490
+ else:
491
+ upsample_img = upsample_img.astype(np.uint8)
492
+
493
+ # draw bounding box
494
+ if draw_box:
495
+ # upsample_input_img = cv2.resize(input_img, (w_up, h_up))
496
+ img_color = np.ones([*upsample_img.shape], dtype=np.float32)
497
+ img_color[:,:,0] = 0
498
+ img_color[:,:,1] = 255
499
+ img_color[:,:,2] = 0
500
+ for inv_mask_border in inv_mask_borders:
501
+ upsample_img = inv_mask_border * img_color + (1 - inv_mask_border) * upsample_img
502
+ # upsample_input_img = inv_mask_border * img_color + (1 - inv_mask_border) * upsample_input_img
503
+
504
+ if save_path is not None:
505
+ path = os.path.splitext(save_path)[0]
506
+ save_path = f'{path}.{self.save_ext}'
507
+ imwrite(upsample_img, save_path)
508
+ return upsample_img
509
+
510
+ def clean_all(self):
511
+ self.all_landmarks_5 = []
512
+ self.restored_faces = []
513
+ self.affine_matrices = []
514
+ self.cropped_faces = []
515
+ self.inverse_affine_matrices = []
516
+ self.det_faces = []
517
+ self.pad_input_imgs = []
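A hedged end-to-end sketch of how this helper is meant to be driven, mirroring UnAlignedBFRInferenceLoop in utils/inference.py; the input path is illustrative, and the cropped faces are passed through unchanged where a real face restorer's output would go:

from utils.face_restoration_helper import FaceRestoreHelper

helper = FaceRestoreHelper(upscale_factor=1, face_size=512, use_parse=True,
                           det_model="retinaface_resnet50")
helper.clean_all()
helper.read_image("inputs/whole_image.png")             # path or BGR uint8 array
helper.get_face_landmarks_5(resize=640, eye_dist_threshold=5)
helper.align_warp_face()                                 # fills helper.cropped_faces
for cropped_face in helper.cropped_faces:
    helper.add_restored_face(cropped_face)               # plug a restored face in here
helper.get_inverse_affine()
result = helper.paste_faces_to_input_image()             # BGR uint8, faces pasted back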
utils/helpers.py ADDED
@@ -0,0 +1,216 @@
1
+ from typing import overload, Tuple, Optional
2
+
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+ import numpy as np
7
+ from PIL import Image
8
+ from einops import rearrange
9
+
10
+ from model.cldm import ControlLDM
11
+ from model.gaussian_diffusion import Diffusion
12
+ from model.bsrnet import RRDBNet
13
+ from model.swinir import SwinIR
14
+ from model.scunet import SCUNet
15
+ from utils.sampler import SpacedSampler
16
+ from utils.cond_fn import Guidance
17
+ from utils.common import wavelet_decomposition, wavelet_reconstruction, count_vram_usage
18
+
19
+
20
+ def bicubic_resize(img: np.ndarray, scale: float) -> np.ndarray:
21
+ pil = Image.fromarray(img)
22
+ res = pil.resize(tuple(int(x * scale) for x in pil.size), Image.BICUBIC)
23
+ return np.array(res)
24
+
25
+
26
+ def resize_short_edge_to(imgs: torch.Tensor, size: int) -> torch.Tensor:
27
+ _, _, h, w = imgs.size()
28
+ if h == w:
29
+ new_h, new_w = size, size
30
+ elif h < w:
31
+ new_h, new_w = size, int(w * (size / h))
32
+ else:
33
+ new_h, new_w = int(h * (size / w)), size
34
+ return F.interpolate(imgs, size=(new_h, new_w), mode="bicubic", antialias=True)
35
+
36
+
37
+ def pad_to_multiples_of(imgs: torch.Tensor, multiple: int) -> torch.Tensor:
38
+ _, _, h, w = imgs.size()
39
+ if h % multiple == 0 and w % multiple == 0:
40
+ return imgs.clone()
41
+ # get_pad = lambda x: (x // multiple + 1) * multiple - x
42
+ get_pad = lambda x: (x // multiple + int(x % multiple != 0)) * multiple - x
43
+ ph, pw = get_pad(h), get_pad(w)
44
+ return F.pad(imgs, pad=(0, pw, 0, ph), mode="constant", value=0)
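Worked examples for the three small preprocessing helpers above; bicubic_resize works on HWC uint8 arrays, while the other two operate on NCHW tensors, with padding applied only on the bottom and right:

import numpy as np
import torch
from utils.helpers import bicubic_resize, resize_short_edge_to, pad_to_multiples_of

img = (np.random.rand(100, 80, 3) * 255).astype(np.uint8)   # HWC uint8, as loaded by PIL
print(bicubic_resize(img, 4).shape)               # (400, 320, 3)

x = torch.rand(1, 3, 500, 700)                    # NCHW float tensors from here on
print(resize_short_edge_to(x, size=512).shape)    # torch.Size([1, 3, 512, 716])
print(pad_to_multiples_of(x, multiple=64).shape)  # torch.Size([1, 3, 512, 704])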
45
+
46
+
47
+ class Pipeline:
48
+
49
+ def __init__(self, stage1_model: nn.Module, cldm: ControlLDM, diffusion: Diffusion, cond_fn: Optional[Guidance], device: str) -> None:
50
+ self.stage1_model = stage1_model
51
+ self.cldm = cldm
52
+ self.diffusion = diffusion
53
+ self.cond_fn = cond_fn
54
+ self.device = device
55
+ self.final_size: Optional[Tuple[int, int]] = None
56
+
57
+ def set_final_size(self, lq: torch.Tensor) -> None:
58
+ h, w = lq.shape[2:]
59
+ self.final_size = (h, w)
60
+
61
+ @overload
62
+ def run_stage1(self, lq: torch.Tensor) -> torch.Tensor:
63
+ ...
64
+
65
+ @count_vram_usage
66
+ def run_stage2(
67
+ self,
68
+ clean: torch.Tensor,
69
+ steps: int,
70
+ strength: float,
71
+ tiled: bool,
72
+ tile_size: int,
73
+ tile_stride: int,
74
+ pos_prompt: str,
75
+ neg_prompt: str,
76
+ cfg_scale: float,
77
+ better_start: bool
78
+ ) -> torch.Tensor:
79
+ ### preprocess
80
+ bs, _, ori_h, ori_w = clean.shape
81
+ # pad: ensure that height & width are multiples of 64
82
+ pad_clean = pad_to_multiples_of(clean, multiple=64)
83
+ h, w = pad_clean.shape[2:]
84
+ # prepare condition
85
+ if not tiled:
86
+ cond = self.cldm.prepare_condition(pad_clean, [pos_prompt] * bs)
87
+ uncond = self.cldm.prepare_condition(pad_clean, [neg_prompt] * bs)
88
+ else:
89
+ cond = self.cldm.prepare_condition_tiled(pad_clean, [pos_prompt] * bs, tile_size, tile_stride)
90
+ uncond = self.cldm.prepare_condition_tiled(pad_clean, [neg_prompt] * bs, tile_size, tile_stride)
91
+ if self.cond_fn:
92
+ self.cond_fn.load_target(pad_clean * 2 - 1)
93
+ old_control_scales = self.cldm.control_scales
94
+ self.cldm.control_scales = [strength] * 13
95
+ if better_start:
96
+ # using noised low frequency part of condition as a better start point of
97
+ # reverse sampling, which can prevent our model from generating noise in
98
+ # image background.
99
+ _, low_freq = wavelet_decomposition(pad_clean)
100
+ if not tiled:
101
+ x_0 = self.cldm.vae_encode(low_freq)
102
+ else:
103
+ x_0 = self.cldm.vae_encode_tiled(low_freq, tile_size, tile_stride)
104
+ x_T = self.diffusion.q_sample(
105
+ x_0,
106
+ torch.full((bs, ), self.diffusion.num_timesteps - 1, dtype=torch.long, device=self.device),
107
+ torch.randn(x_0.shape, dtype=torch.float32, device=self.device)
108
+ )
109
+ # print(f"diffusion sqrt_alphas_cumprod: {self.diffusion.sqrt_alphas_cumprod[-1]}")
110
+ else:
111
+ x_T = torch.randn((bs, 4, h // 8, w // 8), dtype=torch.float32, device=self.device)
112
+ ### run sampler
113
+ sampler = SpacedSampler(self.diffusion.betas)
114
+ z = sampler.sample(
115
+ model=self.cldm, device=self.device, steps=steps, batch_size=bs, x_size=(4, h // 8, w // 8),
116
+ cond=cond, uncond=uncond, cfg_scale=cfg_scale, x_T=x_T, progress=True,
117
+ progress_leave=True, cond_fn=self.cond_fn, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride
118
+ )
119
+ if not tiled:
120
+ x = self.cldm.vae_decode(z)
121
+ else:
122
+ x = self.cldm.vae_decode_tiled(z, tile_size // 8, tile_stride // 8)
123
+ ### postprocess
124
+ self.cldm.control_scales = old_control_scales
125
+ sample = x[:, :, :ori_h, :ori_w]
126
+ return sample
127
+
128
+ @torch.no_grad()
129
+ def run(
130
+ self,
131
+ lq: np.ndarray,
132
+ steps: int,
133
+ strength: float,
134
+ tiled: bool,
135
+ tile_size: int,
136
+ tile_stride: int,
137
+ pos_prompt: str,
138
+ neg_prompt: str,
139
+ cfg_scale: float,
140
+ better_start: bool
141
+ ) -> np.ndarray:
142
+ # image to tensor
143
+ lq = torch.tensor((lq / 255.).clip(0, 1), dtype=torch.float32, device=self.device)
144
+ lq = rearrange(lq, "n h w c -> n c h w").contiguous()
145
+ # set pipeline output size
146
+ self.set_final_size(lq)
147
+ clean = self.run_stage1(lq)
148
+ sample = self.run_stage2(
149
+ clean, steps, strength, tiled, tile_size, tile_stride,
150
+ pos_prompt, neg_prompt, cfg_scale, better_start
151
+ )
152
+ # colorfix (borrowed from StableSR, thanks for their work)
153
+ sample = (sample + 1) / 2
154
+ sample = wavelet_reconstruction(sample, clean)
155
+ # resize to desired output size
156
+ sample = F.interpolate(sample, size=self.final_size, mode="bicubic", antialias=True)
157
+ # tensor to image
158
+ sample = rearrange(sample * 255., "n c h w -> n h w c")
159
+ sample = sample.contiguous().clamp(0, 255).to(torch.uint8).cpu().numpy()
160
+ return sample
161
+
162
+
163
+ class BSRNetPipeline(Pipeline):
164
+
165
+ def __init__(self, bsrnet: RRDBNet, cldm: ControlLDM, diffusion: Diffusion, cond_fn: Optional[Guidance], device: str, upscale: float) -> None:
166
+ super().__init__(bsrnet, cldm, diffusion, cond_fn, device)
167
+ self.upscale = upscale
168
+
169
+ def set_final_size(self, lq: torch.Tensor) -> None:
170
+ h, w = lq.shape[2:]
171
+ self.final_size = (int(h * self.upscale), int(w * self.upscale))
172
+
173
+ @count_vram_usage
174
+ def run_stage1(self, lq: torch.Tensor) -> torch.Tensor:
175
+ # NOTE: upscale is always set to 4 in our experiments
176
+ clean = self.stage1_model(lq)
177
+ # if self.final_size[0] < 512 and self.final_size[1] < 512:
178
+ if min(self.final_size) < 512:
179
+ clean = resize_short_edge_to(clean, size=512)
180
+ else:
181
+ clean = F.interpolate(clean, size=self.final_size, mode="bicubic", antialias=True)
182
+ return clean
183
+
184
+
185
+ class SwinIRPipeline(Pipeline):
186
+
187
+ def __init__(self, swinir: SwinIR, cldm: ControlLDM, diffusion: Diffusion, cond_fn: Optional[Guidance], device: str) -> None:
188
+ super().__init__(swinir, cldm, diffusion, cond_fn, device)
189
+
190
+ @count_vram_usage
191
+ def run_stage1(self, lq: torch.Tensor) -> torch.Tensor:
192
+ # NOTE: lq size is always equal to 512 in our experiments
193
+ # resize: ensure the input lq size is at least 512, since SwinIR is trained on 512 resolution
194
+ if min(lq.shape[2:]) < 512:
195
+ lq = resize_short_edge_to(lq, size=512)
196
+ ori_h, ori_w = lq.shape[2:]
197
+ # pad: ensure that height & width are multiples of 64
198
+ pad_lq = pad_to_multiples_of(lq, multiple=64)
199
+ # run
200
+ clean = self.stage1_model(pad_lq)
201
+ # remove padding
202
+ clean = clean[:, :, :ori_h, :ori_w]
203
+ return clean
204
+
205
+
206
+ class SCUNetPipeline(Pipeline):
207
+
208
+ def __init__(self, scunet: SCUNet, cldm: ControlLDM, diffusion: Diffusion, cond_fn: Optional[Guidance], device: str) -> None:
209
+ super().__init__(scunet, cldm, diffusion, cond_fn, device)
210
+
211
+ @count_vram_usage
212
+ def run_stage1(self, lq: torch.Tensor) -> torch.Tensor:
213
+ clean = self.stage1_model(lq)
214
+ if min(clean.shape[2:]) < 512:
215
+ clean = resize_short_edge_to(clean, size=512)
216
+ return clean
utils/inference.py ADDED
@@ -0,0 +1,320 @@
1
+ import os
2
+ from typing import overload, Generator, Dict
3
+ from argparse import Namespace
4
+
5
+ import numpy as np
6
+ import torch
7
+ from PIL import Image
8
+ from omegaconf import OmegaConf
9
+
10
+ from model.cldm import ControlLDM
11
+ from model.gaussian_diffusion import Diffusion
12
+ from model.bsrnet import RRDBNet
13
+ from model.scunet import SCUNet
14
+ from model.swinir import SwinIR
15
+ from utils.common import instantiate_from_config, load_file_from_url, count_vram_usage
16
+ from utils.face_restoration_helper import FaceRestoreHelper
17
+ from utils.helpers import (
18
+ Pipeline,
19
+ BSRNetPipeline, SwinIRPipeline, SCUNetPipeline,
20
+ bicubic_resize
21
+ )
22
+ from utils.cond_fn import MSEGuidance, WeightedMSEGuidance
23
+
24
+
25
+ MODELS = {
26
+ ### stage_1 model weights
27
+ "bsrnet": "https://github.com/cszn/KAIR/releases/download/v1.0/BSRNet.pth",
28
+ # the following checkpoint is up-to-date, but we use the old version in our paper
29
+ # "swinir_face": "https://github.com/zsyOAOA/DifFace/releases/download/V1.0/General_Face_ffhq512.pth",
30
+ "swinir_face": "https://huggingface.co/lxq007/DiffBIR/resolve/main/face_swinir_v1.ckpt",
31
+ "scunet_psnr": "https://github.com/cszn/KAIR/releases/download/v1.0/scunet_color_real_psnr.pth",
32
+ "swinir_general": "https://huggingface.co/lxq007/DiffBIR/resolve/main/general_swinir_v1.ckpt",
33
+ ### stage_2 model weights
34
+ "sd_v21": "https://huggingface.co/stabilityai/stable-diffusion-2-1-base/resolve/main/v2-1_512-ema-pruned.ckpt",
35
+ "v1_face": "https://huggingface.co/lxq007/DiffBIR-v2/resolve/main/v1_face.pth",
36
+ "v1_general": "https://huggingface.co/lxq007/DiffBIR-v2/resolve/main/v1_general.pth",
37
+ "v2": "https://huggingface.co/lxq007/DiffBIR-v2/resolve/main/v2.pth"
38
+ }
39
+
40
+
41
+ def load_model_from_url(url: str) -> Dict[str, torch.Tensor]:
42
+ sd_path = load_file_from_url(url, model_dir="weights")
43
+ sd = torch.load(sd_path, map_location="cpu")
44
+ if "state_dict" in sd:
45
+ sd = sd["state_dict"]
46
+ if list(sd.keys())[0].startswith("module"):
47
+ sd = {k[len("module."):]: v for k, v in sd.items()}
48
+ return sd
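For example, fetching the stage-1 BSRNet weights listed in MODELS below; the checkpoint is cached under ./weights by load_file_from_url, so repeated calls do not re-download:

from utils.inference import MODELS, load_model_from_url

sd = load_model_from_url(MODELS["bsrnet"])
print(f"{len(sd)} tensors in the state dict")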
49
+
50
+
51
+ class InferenceLoop:
52
+
53
+ def __init__(self, args: Namespace) -> None:
54
+ self.args = args
55
+ self.loop_ctx = {}
56
+ self.pipeline: Pipeline = None
57
+ self.init_stage1_model()
58
+ self.init_stage2_model()
59
+ self.init_cond_fn()
60
+ self.init_pipeline()
61
+
62
+ @overload
63
+ def init_stage1_model(self) -> None:
64
+ ...
65
+
66
+ @count_vram_usage
67
+ def init_stage2_model(self) -> None:
68
+ ### load unet, vae, clip
69
+ self.cldm: ControlLDM = instantiate_from_config(OmegaConf.load("configs/inference/cldm.yaml"))
70
+ sd = load_model_from_url(MODELS["sd_v21"])
71
+ unused = self.cldm.load_pretrained_sd(sd)
72
+ print(f"strictly load pretrained sd_v2.1, unused weights: {unused}")
73
+ ### load controlnet
74
+ if self.args.version == "v1":
75
+ if self.args.task == "fr":
76
+ control_sd = load_model_from_url(MODELS["v1_face"])
77
+ elif self.args.task == "sr":
78
+ control_sd = load_model_from_url(MODELS["v1_general"])
79
+ else:
80
+ raise ValueError(f"DiffBIR v1 doesn't support task: {self.args.task}, please use v2 by passing '--version v2'")
81
+ else:
82
+ control_sd = load_model_from_url(MODELS["v2"])
83
+ self.cldm.load_controlnet_from_ckpt(control_sd)
84
+ print(f"strictly load controlnet weight")
85
+ self.cldm.eval().to(self.args.device)
86
+ ### load diffusion
87
+ self.diffusion: Diffusion = instantiate_from_config(OmegaConf.load("configs/inference/diffusion.yaml"))
88
+ self.diffusion.to(self.args.device)
89
+
90
+ def init_cond_fn(self) -> None:
91
+ if not self.args.guidance:
92
+ self.cond_fn = None
93
+ return
94
+ if self.args.g_loss == "mse":
95
+ cond_fn_cls = MSEGuidance
96
+ elif self.args.g_loss == "w_mse":
97
+ cond_fn_cls = WeightedMSEGuidance
98
+ else:
99
+ raise ValueError(self.args.g_loss)
100
+ self.cond_fn = cond_fn_cls(
101
+ scale=self.args.g_scale, t_start=self.args.g_start, t_stop=self.args.g_stop,
102
+ space=self.args.g_space, repeat=self.args.g_repeat
103
+ )
104
+
105
+ @overload
106
+ def init_pipeline(self) -> None:
107
+ ...
108
+
109
+ def setup(self) -> None:
110
+ self.output_dir = self.args.output
111
+ os.makedirs(self.output_dir, exist_ok=True)
112
+
113
+ def lq_loader(self) -> Generator[np.ndarray, None, None]:
114
+ img_exts = [".png", ".jpg", ".jpeg"]
115
+ if os.path.isdir(self.args.input):
116
+ file_names = sorted([
117
+ file_name for file_name in os.listdir(self.args.input) if os.path.splitext(file_name)[-1] in img_exts
118
+ ])
119
+ file_paths = [os.path.join(self.args.input, file_name) for file_name in file_names]
120
+ else:
121
+ assert os.path.splitext(self.args.input)[-1] in img_exts
122
+ file_paths = [self.args.input]
123
+
124
+ def _loader() -> Generator[np.ndarray, None, None]:
125
+ for file_path in file_paths:
126
+ ### load lq
127
+ lq = np.array(Image.open(file_path).convert("RGB"))
128
+ print(f"load lq: {file_path}")
129
+ ### set context for saving results
130
+ self.loop_ctx["file_stem"] = os.path.splitext(os.path.basename(file_path))[0]
131
+ for i in range(self.args.n_samples):
132
+ self.loop_ctx["repeat_idx"] = i
133
+ yield lq
134
+
135
+ return _loader
136
+
137
+ def after_load_lq(self, lq: np.ndarray) -> np.ndarray:
138
+ return lq
139
+
140
+ @torch.no_grad()
141
+ def run(self) -> None:
142
+ self.setup()
143
+ # We don't support batch processing since input images may have different sizes
144
+ loader = self.lq_loader()
145
+ for lq in loader():
146
+ lq = self.after_load_lq(lq)
147
+ sample = self.pipeline.run(
148
+ lq[None], self.args.steps, 1.0, self.args.tiled,
149
+ self.args.tile_size, self.args.tile_stride,
150
+ self.args.pos_prompt, self.args.neg_prompt, self.args.cfg_scale,
151
+ self.args.better_start
152
+ )[0]
153
+ self.save(sample)
154
+
155
+ def save(self, sample: np.ndarray) -> None:
156
+ file_stem, repeat_idx = self.loop_ctx["file_stem"], self.loop_ctx["repeat_idx"]
157
+ file_name = f"{file_stem}_{repeat_idx}.png" if self.args.n_samples > 1 else f"{file_stem}.png"
158
+ save_path = os.path.join(self.args.output, file_name)
159
+ Image.fromarray(sample).save(save_path)
160
+ print(f"save result to {save_path}")
161
+
162
+
163
+ class BSRInferenceLoop(InferenceLoop):
164
+
165
+ @count_vram_usage
166
+ def init_stage1_model(self) -> None:
167
+ self.bsrnet: RRDBNet = instantiate_from_config(OmegaConf.load("configs/inference/bsrnet.yaml"))
168
+ sd = load_model_from_url(MODELS["bsrnet"])
169
+ self.bsrnet.load_state_dict(sd, strict=True)
170
+ self.bsrnet.eval().to(self.args.device)
171
+
172
+ def init_pipeline(self) -> None:
173
+ self.pipeline = BSRNetPipeline(self.bsrnet, self.cldm, self.diffusion, self.cond_fn, self.args.device, self.args.upscale)
174
+
175
+
176
+ class BFRInferenceLoop(InferenceLoop):
177
+
178
+ @count_vram_usage
179
+ def init_stage1_model(self) -> None:
180
+ self.swinir_face: SwinIR = instantiate_from_config(OmegaConf.load("configs/inference/swinir.yaml"))
181
+ sd = load_model_from_url(MODELS["swinir_face"])
182
+ self.swinir_face.load_state_dict(sd, strict=True)
183
+ self.swinir_face.eval().to(self.args.device)
184
+
185
+ def init_pipeline(self) -> None:
186
+ self.pipeline = SwinIRPipeline(self.swinir_face, self.cldm, self.diffusion, self.cond_fn, self.args.device)
187
+
188
+ def after_load_lq(self, lq: np.ndarray) -> np.ndarray:
189
+ # For BFR task, super resolution is achieved by directly upscaling lq
190
+ return bicubic_resize(lq, self.args.upscale)
191
+
192
+
193
+ class BIDInferenceLoop(InferenceLoop):
194
+
195
+ @count_vram_usage
196
+ def init_stage1_model(self) -> None:
197
+ self.scunet_psnr: SCUNet = instantiate_from_config(OmegaConf.load("configs/inference/scunet.yaml"))
198
+ sd = load_model_from_url(MODELS["scunet_psnr"])
199
+ self.scunet_psnr.load_state_dict(sd, strict=True)
200
+ self.scunet_psnr.eval().to(self.args.device)
201
+
202
+ def init_pipeline(self) -> None:
203
+ self.pipeline = SCUNetPipeline(self.scunet_psnr, self.cldm, self.diffusion, self.cond_fn, self.args.device)
204
+
205
+ def after_load_lq(self, lq: np.ndarray) -> np.ndarray:
206
+ # For BID task, super resolution is achieved by directly upscaling lq
207
+ return bicubic_resize(lq, self.args.upscale)
208
+
209
+
210
+ class V1InferenceLoop(InferenceLoop):
211
+
212
+ @count_vram_usage
213
+ def init_stage1_model(self) -> None:
214
+ self.swinir: SwinIR = instantiate_from_config(OmegaConf.load("configs/inference/swinir.yaml"))
215
+ if self.args.task == "fr":
216
+ sd = load_model_from_url(MODELS["swinir_face"])
217
+ elif self.args.task == "sr":
218
+ sd = load_model_from_url(MODELS["swinir_general"])
219
+ else:
220
+ raise ValueError(f"DiffBIR v1 doesn't support task: {self.args.task}, please use v2 by passing '--version v2'")
221
+ self.swinir.load_state_dict(sd, strict=True)
222
+ self.swinir.eval().to(self.args.device)
223
+
224
+ def init_pipeline(self) -> None:
225
+ self.pipeline = SwinIRPipeline(self.swinir, self.cldm, self.diffusion, self.cond_fn, self.args.device)
226
+
227
+ def after_load_lq(self, lq: np.ndarray) -> np.ndarray:
228
+ # For BFR task, super resolution is achieved by directly upscaling lq
229
+ return bicubic_resize(lq, self.args.upscale)
230
+
231
+
232
+ class UnAlignedBFRInferenceLoop(InferenceLoop):
233
+
234
+ @count_vram_usage
235
+ def init_stage1_model(self) -> None:
236
+ self.bsrnet: RRDBNet = instantiate_from_config(OmegaConf.load("configs/inference/bsrnet.yaml"))
237
+ sd = load_model_from_url(MODELS["bsrnet"])
238
+ self.bsrnet.load_state_dict(sd, strict=True)
239
+ self.bsrnet.eval().to(self.args.device)
240
+
241
+ self.swinir_face: SwinIR = instantiate_from_config(OmegaConf.load("configs/inference/swinir.yaml"))
242
+ sd = load_model_from_url(MODELS["swinir_face"])
243
+ self.swinir_face.load_state_dict(sd, strict=True)
244
+ self.swinir_face.eval().to(self.args.device)
245
+
246
+ def init_pipeline(self) -> None:
247
+ self.pipes = {
248
+ "bg": BSRNetPipeline(self.bsrnet, self.cldm, self.diffusion, self.cond_fn, self.args.device, self.args.upscale),
249
+ "face": SwinIRPipeline(self.swinir_face, self.cldm, self.diffusion, self.cond_fn, self.args.device)
250
+ }
251
+ self.pipeline = self.pipes["face"]
252
+
253
+ def setup(self) -> None:
254
+ super().setup()
255
+ self.cropped_face_dir = os.path.join(self.args.output, "cropped_faces")
256
+ os.makedirs(self.cropped_face_dir, exist_ok=True)
257
+ self.restored_face_dir = os.path.join(self.args.output, "restored_faces")
258
+ os.makedirs(self.restored_face_dir, exist_ok=True)
259
+ self.restored_bg_dir = os.path.join(self.args.output, "restored_backgrounds")
260
+ os.makedirs(self.restored_bg_dir, exist_ok=True)
261
+
262
+ def lq_loader(self) -> Generator[np.ndarray, None, None]:
263
+ base_loader = super().lq_loader()
264
+ self.face_helper = FaceRestoreHelper(
265
+ device=self.args.device,
266
+ upscale_factor=1,
267
+ face_size=512,
268
+ use_parse=True,
269
+ det_model="retinaface_resnet50"
270
+ )
271
+
272
+ def _loader() -> Generator[np.ndarray, None, None]:
273
+ for lq in base_loader():
274
+ ### set input image
275
+ self.face_helper.clean_all()
276
+ upscaled_bg = bicubic_resize(lq, self.args.upscale)
277
+ self.face_helper.read_image(upscaled_bg)
278
+ ### get face landmarks for each face
279
+ self.face_helper.get_face_landmarks_5(resize=640, eye_dist_threshold=5)
280
+ self.face_helper.align_warp_face()
281
+ print(f"detect {len(self.face_helper.cropped_faces)} faces")
282
+ ### restore each face (has been upscaled)
283
+ for i, lq_face in enumerate(self.face_helper.cropped_faces):
284
+ self.loop_ctx["is_face"] = True
285
+ self.loop_ctx["face_idx"] = i
286
+ self.loop_ctx["cropped_face"] = lq_face
287
+ yield lq_face
288
+ ### restore background (hasn't been upscaled)
289
+ self.loop_ctx["is_face"] = False
290
+ yield lq
291
+
292
+ return _loader
293
+
294
+ def after_load_lq(self, lq: np.ndarray) -> np.ndarray:
295
+ if self.loop_ctx["is_face"]:
296
+ self.pipeline = self.pipes["face"]
297
+ else:
298
+ self.pipeline = self.pipes["bg"]
299
+ return lq
300
+
301
+ def save(self, sample: np.ndarray) -> None:
302
+ file_stem, repeat_idx = self.loop_ctx["file_stem"], self.loop_ctx["repeat_idx"]
303
+ if self.loop_ctx["is_face"]:
304
+ face_idx = self.loop_ctx["face_idx"]
305
+ file_name = f"{file_stem}_{repeat_idx}_face_{face_idx}.png"
306
+ Image.fromarray(sample).save(os.path.join(self.restored_face_dir, file_name))
307
+
308
+ cropped_face = self.loop_ctx["cropped_face"]
309
+ Image.fromarray(cropped_face).save(os.path.join(self.cropped_face_dir, file_name))
310
+
311
+ self.face_helper.add_restored_face(sample)
312
+ else:
313
+ self.face_helper.get_inverse_affine()
314
+ # paste each restored face to the input image
315
+ restored_img = self.face_helper.paste_faces_to_input_image(
316
+ upsample_img=sample
317
+ )
318
+ file_name = f"{file_stem}_{repeat_idx}.png"
319
+ Image.fromarray(sample).save(os.path.join(self.restored_bg_dir, file_name))
320
+ Image.fromarray(restored_img).save(os.path.join(self.output_dir, file_name))
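A hedged sketch of driving one of the loops programmatically instead of via the project's CLI; every attribute below is read somewhere in this file or in utils/helpers.py, but the concrete values are illustrative rather than documented defaults:

from argparse import Namespace
from utils.inference import BSRInferenceLoop

args = Namespace(
    version="v2", task="sr", upscale=4, device="cuda",
    input="inputs/", output="results/", n_samples=1,
    steps=50, better_start=True, tiled=False, tile_size=512, tile_stride=256,
    pos_prompt="", neg_prompt="low quality, blurry, low resolution", cfg_scale=4.0,
    guidance=False, g_loss="w_mse", g_scale=0.0, g_start=1001, g_stop=-1,
    g_space="latent", g_repeat=1,
)
BSRInferenceLoop(args).run()   # restored images are written under args.output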
utils/sampler.py ADDED
@@ -0,0 +1,341 @@
1
+ from typing import Optional, Tuple, Dict
+
+ import torch
+ from torch import nn
+ import numpy as np
+ from tqdm import tqdm
+
+ from model.gaussian_diffusion import extract_into_tensor
+ from model.cldm import ControlLDM
+ from utils.cond_fn import Guidance
+ from utils.common import sliding_windows, gaussian_weights
+
+
+ # https://github.com/openai/guided-diffusion/blob/main/guided_diffusion/respace.py
+ def space_timesteps(num_timesteps, section_counts):
+     """
+     Create a list of timesteps to use from an original diffusion process,
+     given the number of timesteps we want to take from equally-sized portions
+     of the original process.
+     For example, if there's 300 timesteps and the section counts are [10,15,20]
+     then the first 100 timesteps are strided to be 10 timesteps, the second 100
+     are strided to be 15 timesteps, and the final 100 are strided to be 20.
+     If the stride is a string starting with "ddim", then the fixed striding
+     from the DDIM paper is used, and only one section is allowed.
+     :param num_timesteps: the number of diffusion steps in the original
+                           process to divide up.
+     :param section_counts: either a list of numbers, or a string containing
+                            comma-separated numbers, indicating the step count
+                            per section. As a special case, use "ddimN" where N
+                            is a number of steps to use the striding from the
+                            DDIM paper.
+     :return: a set of diffusion steps from the original process to use.
+     """
+     if isinstance(section_counts, str):
+         if section_counts.startswith("ddim"):
+             desired_count = int(section_counts[len("ddim"):])
+             for i in range(1, num_timesteps):
+                 if len(range(0, num_timesteps, i)) == desired_count:
+                     return set(range(0, num_timesteps, i))
+             raise ValueError(
+                 f"cannot create exactly {num_timesteps} steps with an integer stride"
+             )
+         section_counts = [int(x) for x in section_counts.split(",")]
+     size_per = num_timesteps // len(section_counts)
+     extra = num_timesteps % len(section_counts)
+     start_idx = 0
+     all_steps = []
+     for i, section_count in enumerate(section_counts):
+         size = size_per + (1 if i < extra else 0)
+         if size < section_count:
+             raise ValueError(
+                 f"cannot divide section of {size} steps into {section_count}"
+             )
+         if section_count <= 1:
+             frac_stride = 1
+         else:
+             frac_stride = (size - 1) / (section_count - 1)
+         cur_idx = 0.0
+         taken_steps = []
+         for _ in range(section_count):
+             taken_steps.append(start_idx + round(cur_idx))
+             cur_idx += frac_stride
+         all_steps += taken_steps
+         start_idx += size
+     return set(all_steps)
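+
+ # Worked examples (illustrative, values follow from the logic above):
+ #   space_timesteps(10, [3])        -> {0, 4, 9}
+ #       one section of 10 steps, 3 samples, fractional stride (10-1)/(3-1) = 4.5
+ #   space_timesteps(1000, "50")     -> 50 roughly evenly spaced steps in [0, 999]
+ #   space_timesteps(1000, "ddim50") -> set(range(0, 1000, 20))  # fixed DDIM stride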
+
+
+ class SpacedSampler(nn.Module):
+     """
+     Implementation of the spaced sampling schedule proposed in IDDPM. This class is
+     designed for sampling ControlLDM.
+
+     https://arxiv.org/pdf/2102.09672.pdf
+     """
+
+     def __init__(self, betas: np.ndarray) -> None:
+         super().__init__()
+         self.num_timesteps = len(betas)
+         self.original_betas = betas
+         self.original_alphas_cumprod = np.cumprod(1.0 - betas, axis=0)
+         self.context = {}
+
+     def register(self, name: str, value: np.ndarray) -> None:
+         self.register_buffer(name, torch.tensor(value, dtype=torch.float32))
+
+     def make_schedule(self, num_steps: int) -> None:
+         # calculate betas for spaced sampling
+         # https://github.com/openai/guided-diffusion/blob/main/guided_diffusion/respace.py
+         used_timesteps = space_timesteps(self.num_timesteps, str(num_steps))
+         betas = []
+         last_alpha_cumprod = 1.0
+         for i, alpha_cumprod in enumerate(self.original_alphas_cumprod):
+             if i in used_timesteps:
+                 # marginal distribution is the same as q(x_{S_t}|x_0)
+                 betas.append(1 - alpha_cumprod / last_alpha_cumprod)
+                 last_alpha_cumprod = alpha_cumprod
+         assert len(betas) == num_steps
+         self.timesteps = np.array(sorted(list(used_timesteps)), dtype=np.int32)  # e.g. [0, 10, 20, ...]
+
+         betas = np.array(betas, dtype=np.float64)
+         alphas = 1.0 - betas
+         alphas_cumprod = np.cumprod(alphas, axis=0)
+         # print(f"sampler sqrt_alphas_cumprod: {np.sqrt(alphas_cumprod)[-1]}")
+         alphas_cumprod_prev = np.append(1.0, alphas_cumprod[:-1])
+         sqrt_recip_alphas_cumprod = np.sqrt(1.0 / alphas_cumprod)
+         sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / alphas_cumprod - 1)
+         # calculations for posterior q(x_{t-1} | x_t, x_0)
+         posterior_variance = (
+             betas * (1.0 - alphas_cumprod_prev) / (1.0 - alphas_cumprod)
+         )
+         # log calculation clipped because the posterior variance is 0 at the
+         # beginning of the diffusion chain.
+         posterior_log_variance_clipped = np.log(
+             np.append(posterior_variance[1], posterior_variance[1:])
+         )
+         posterior_mean_coef1 = (
+             betas * np.sqrt(alphas_cumprod_prev) / (1.0 - alphas_cumprod)
+         )
+         posterior_mean_coef2 = (
+             (1.0 - alphas_cumprod_prev)
+             * np.sqrt(alphas)
+             / (1.0 - alphas_cumprod)
+         )
+
+         self.register("sqrt_recip_alphas_cumprod", sqrt_recip_alphas_cumprod)
+         self.register("sqrt_recipm1_alphas_cumprod", sqrt_recipm1_alphas_cumprod)
+         self.register("posterior_variance", posterior_variance)
+         self.register("posterior_log_variance_clipped", posterior_log_variance_clipped)
+         self.register("posterior_mean_coef1", posterior_mean_coef1)
+         self.register("posterior_mean_coef2", posterior_mean_coef2)
+
+     def q_posterior_mean_variance(self, x_start: torch.Tensor, x_t: torch.Tensor, t: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+         """
+         Implement the posterior distribution q(x_{t-1}|x_t, x_0).
+
+         Args:
+             x_start (torch.Tensor): The predicted images (NCHW) in timestep `t`.
+             x_t (torch.Tensor): The sampled intermediate variables (NCHW) of timestep `t`.
+             t (torch.Tensor): Timestep (N) of `x_t`. `t` serves as an index to get
+                 parameters for each timestep.
+
+         Returns:
+             posterior_mean (torch.Tensor): Mean of the posterior distribution.
+             posterior_variance (torch.Tensor): Variance of the posterior distribution.
+             posterior_log_variance_clipped (torch.Tensor): Log variance of the posterior distribution.
+         """
+         posterior_mean = (
+             extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
+             + extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
+         )
+         posterior_variance = extract_into_tensor(self.posterior_variance, t, x_t.shape)
+         posterior_log_variance_clipped = extract_into_tensor(
+             self.posterior_log_variance_clipped, t, x_t.shape
+         )
+         return posterior_mean, posterior_variance, posterior_log_variance_clipped
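+
+     # For reference, the coefficients registered in `make_schedule` implement
+     #     mu_t(x_t, x_0) = coef1 * x_0 + coef2 * x_t
+     #                    = beta_t*sqrt(abar_{t-1})/(1-abar_t) * x_0
+     #                      + (1-abar_{t-1})*sqrt(alpha_t)/(1-abar_t) * x_t
+     #     sigma_t^2      = beta_t * (1-abar_{t-1}) / (1-abar_t)
+     # i.e. the standard DDPM posterior q(x_{t-1} | x_t, x_0).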
+
+     def _predict_xstart_from_eps(self, x_t: torch.Tensor, t: torch.Tensor, eps: torch.Tensor) -> torch.Tensor:
+         return (
+             extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
+             - extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
+         )
+
+     def apply_cond_fn(
+         self,
+         model: ControlLDM,
+         pred_x0: torch.Tensor,
+         t: torch.Tensor,
+         index: torch.Tensor,
+         cond_fn: Guidance
+     ) -> torch.Tensor:
+         t_now = int(t[0].item()) + 1
+         if not (cond_fn.t_stop < t_now < cond_fn.t_start):
+             # stop guidance
+             self.context["g_apply"] = False
+             return pred_x0
+         grad_rescale = 1 / extract_into_tensor(self.posterior_mean_coef1, index, pred_x0.shape)
+         # apply guidance multiple times
+         loss_vals = []
+         for _ in range(cond_fn.repeat):
+             # set target and pred for gradient computation
+             target, pred = None, None
+             if cond_fn.space == "latent":
+                 target = model.vae_encode(cond_fn.target)
+                 pred = pred_x0
+             elif cond_fn.space == "rgb":
+                 # We need to backpropagate the gradient to x0 in latent space, so it's
+                 # required to trace the computation graph while decoding the latent.
+                 with torch.enable_grad():
+                     target = cond_fn.target
+                     pred_x0_rg = pred_x0.detach().clone().requires_grad_(True)
+                     pred = model.vae_decode(pred_x0_rg)
+                     assert pred.requires_grad
+             else:
+                 raise NotImplementedError(cond_fn.space)
+             # compute gradient
+             delta_pred, loss_val = cond_fn(target, pred, t_now)
+             loss_vals.append(loss_val)
+             # update pred_x0 w.r.t. the gradient
+             if cond_fn.space == "latent":
+                 delta_pred_x0 = delta_pred
+                 pred_x0 = pred_x0 + delta_pred_x0 * grad_rescale
+             elif cond_fn.space == "rgb":
+                 pred.backward(delta_pred)
+                 delta_pred_x0 = pred_x0_rg.grad
+                 pred_x0 = pred_x0 + delta_pred_x0 * grad_rescale
+             else:
+                 raise NotImplementedError(cond_fn.space)
+         self.context["g_apply"] = True
+         self.context["g_loss"] = float(np.mean(loss_vals))
+         return pred_x0
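+
+     # Note on `grad_rescale` above: the posterior mean is coef1 * x_0 + coef2 * x_t,
+     # so scaling the update on x_0 by 1/coef1 makes the resulting shift of the
+     # posterior mean (coef1 * delta / coef1 = delta) independent of the
+     # timestep-dependent coefficient.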
+
+     def predict_noise(
+         self,
+         model: ControlLDM,
+         x: torch.Tensor,
+         t: torch.Tensor,
+         cond: Dict[str, torch.Tensor],
+         uncond: Optional[Dict[str, torch.Tensor]],
+         cfg_scale: float
+     ) -> torch.Tensor:
+         if uncond is None or cfg_scale == 1.:
+             model_output = model(x, t, cond)
+         else:
+             # apply classifier-free guidance
+             model_cond = model(x, t, cond)
+             model_uncond = model(x, t, uncond)
+             model_output = model_uncond + cfg_scale * (model_cond - model_uncond)
+         return model_output
+
+     @torch.no_grad()
+     def predict_noise_tiled(
+         self,
+         model: ControlLDM,
+         x: torch.Tensor,
+         t: torch.Tensor,
+         cond: Dict[str, torch.Tensor],
+         uncond: Optional[Dict[str, torch.Tensor]],
+         cfg_scale: float,
+         tile_size: int,
+         tile_stride: int
+     ) -> torch.Tensor:
+         _, _, h, w = x.shape
+         tiles = tqdm(sliding_windows(h, w, tile_size // 8, tile_stride // 8), unit="tile", leave=False)
+         eps = torch.zeros_like(x)
+         count = torch.zeros_like(x, dtype=torch.float32)
+         weights = gaussian_weights(tile_size // 8, tile_size // 8)[None, None]
+         weights = torch.tensor(weights, dtype=torch.float32, device=x.device)
+         for hi, hi_end, wi, wi_end in tiles:
+             tiles.set_description(f"Process tile ({hi} {hi_end}), ({wi} {wi_end})")
+             tile_x = x[:, :, hi:hi_end, wi:wi_end]
+             tile_cond = {
+                 "c_img": cond["c_img"][:, :, hi:hi_end, wi:wi_end],
+                 "c_txt": cond["c_txt"]
+             }
+             if uncond:
+                 tile_uncond = {
+                     "c_img": uncond["c_img"][:, :, hi:hi_end, wi:wi_end],
+                     "c_txt": uncond["c_txt"]
+                 }
+             else:
+                 tile_uncond = None
+             tile_eps = self.predict_noise(model, tile_x, t, tile_cond, tile_uncond, cfg_scale)
+             # accumulate noise
+             eps[:, :, hi:hi_end, wi:wi_end] += tile_eps * weights
+             count[:, :, hi:hi_end, wi:wi_end] += weights
+         # average the noise (score) over overlapping tiles
+         eps.div_(count)
+         return eps
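+
+     # Where tiles overlap, each latent position receives a weighted average of the
+     # per-tile noise predictions: `tile_eps * weights` is accumulated and then
+     # divided by the accumulated `weights` in `count`. The Gaussian weighting
+     # emphasizes tile centers over borders, which helps avoid visible seams.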
+
+     @torch.no_grad()
+     def p_sample(
+         self,
+         model: ControlLDM,
+         x: torch.Tensor,
+         t: torch.Tensor,
+         index: torch.Tensor,
+         cond: Dict[str, torch.Tensor],
+         uncond: Optional[Dict[str, torch.Tensor]],
+         cfg_scale: float,
+         cond_fn: Optional[Guidance],
+         tiled: bool,
+         tile_size: int,
+         tile_stride: int
+     ) -> torch.Tensor:
+         if tiled:
+             eps = self.predict_noise_tiled(model, x, t, cond, uncond, cfg_scale, tile_size, tile_stride)
+         else:
+             eps = self.predict_noise(model, x, t, cond, uncond, cfg_scale)
+         pred_x0 = self._predict_xstart_from_eps(x, index, eps)
+         if cond_fn:
+             assert not tiled, "tiled sampling currently doesn't support guidance"
+             pred_x0 = self.apply_cond_fn(model, pred_x0, t, index, cond_fn)
+         model_mean, model_variance, _ = self.q_posterior_mean_variance(pred_x0, x, index)
+         noise = torch.randn_like(x)
+         nonzero_mask = (
+             (index != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
+         )
+         x_prev = model_mean + nonzero_mask * torch.sqrt(model_variance) * noise
+         return x_prev
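+
+     # One ancestral step: x_{t-1} = mu_t(x_t, x0_hat) + sigma_t * z with z ~ N(0, I);
+     # `nonzero_mask` zeroes the noise term on the final step (index == 0), so the
+     # last update returns the posterior mean directly.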
+
+     @torch.no_grad()
+     def sample(
+         self,
+         model: ControlLDM,
+         device: str,
+         steps: int,
+         batch_size: int,
+         x_size: Tuple[int, ...],
+         cond: Dict[str, torch.Tensor],
+         uncond: Dict[str, torch.Tensor],
+         cfg_scale: float,
+         cond_fn: Optional[Guidance]=None,
+         tiled: bool=False,
+         tile_size: int=-1,
+         tile_stride: int=-1,
+         x_T: Optional[torch.Tensor]=None,
+         progress: bool=True,
+         progress_leave: bool=True,
+     ) -> torch.Tensor:
+         self.make_schedule(steps)
+         self.to(device)
+         if x_T is None:
+             # TODO: not converting the noise to float32 may trigger an error
+             img = torch.randn((batch_size, *x_size), device=device)
+         else:
+             img = x_T
+         timesteps = np.flip(self.timesteps)  # descending, e.g. [999, 979, ..., 0]
+         total_steps = len(self.timesteps)
+         iterator = tqdm(timesteps, total=total_steps, leave=progress_leave, disable=not progress)
+         for i, step in enumerate(iterator):
+             ts = torch.full((batch_size,), step, device=device, dtype=torch.long)
+             index = torch.full_like(ts, fill_value=total_steps - i - 1)
+             img = self.p_sample(
+                 model, img, ts, index, cond, uncond, cfg_scale, cond_fn,
+                 tiled, tile_size, tile_stride
+             )
+             if cond_fn and self.context["g_apply"]:
+                 loss_val = self.context["g_loss"]
+                 desc = f"Spaced Sampler With Guidance, Loss: {loss_val:.6f}"
+             else:
+                 desc = "Spaced Sampler"
+             iterator.set_description(desc)
+         return img
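+
+ # Minimal usage sketch (illustrative; everything other than the SpacedSampler API
+ # shown above is an assumption about the surrounding project, e.g. how the
+ # ControlLDM and its conditioning dict are built):
+ #
+ #   betas = model.betas.cpu().numpy()      # noise schedule of the trained ControlLDM (assumed attribute)
+ #   sampler = SpacedSampler(betas)
+ #   cond = {"c_img": cond_img_latents, "c_txt": text_embeddings}
+ #   uncond = {"c_img": cond_img_latents, "c_txt": negative_text_embeddings}
+ #   z = sampler.sample(
+ #       model, device="cuda", steps=50, batch_size=1,
+ #       x_size=(4, 64, 64),                # latent shape (channels, h, w), assumed
+ #       cond=cond, uncond=uncond, cfg_scale=4.0,
+ #   )
+ #   restored = model.vae_decode(z)         # decode latents back to image space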