Upload 14 files
- KandiSuperRes/__init__.py +157 -0
- KandiSuperRes/model/__init__.py +0 -0
- KandiSuperRes/model/diffusion_refine.py +131 -0
- KandiSuperRes/model/diffusion_sr.py +146 -0
- KandiSuperRes/model/diffusion_sr_turbo.py +87 -0
- KandiSuperRes/model/nn.py +122 -0
- KandiSuperRes/model/unet.py +284 -0
- KandiSuperRes/model/unet_sr.py +260 -0
- KandiSuperRes/model/utils.py +62 -0
- KandiSuperRes/movq.py +541 -0
- KandiSuperRes/sr_pipeline.py +116 -0
- KandiSuperRes/utils.py +9 -0
- weights/context.pt +3 -0
- weights/context_mask.pt +3 -0
KandiSuperRes/__init__.py
ADDED
@@ -0,0 +1,157 @@
import torch
from typing import Optional, Union
from huggingface_hub import hf_hub_download

from .sr_pipeline import KandiSuperResPipeline
from KandiSuperRes.model.unet import UNet
from KandiSuperRes.model.unet_sr import UNet as UNet_sr
from KandiSuperRes.movq import MoVQ


def get_sr_model(
    device: Union[str, torch.device],
    weights_path: Optional[str] = None,
    dtype: Union[str, torch.dtype] = torch.float16
) -> (UNet_sr, Optional[dict], Optional[torch.Tensor]):
    unet = UNet_sr(
        init_channels=128,
        model_channels=128,
        num_channels=3,
        time_embed_dim=512,
        groups=32,
        dim_mult=(1, 2, 4, 8),
        num_resnet_blocks=(2, 4, 8, 8),
        add_cross_attention=(False, False, False, False),
        add_self_attention=(False, False, False, False),
        feature_pooling_type='attention',
        lowres_cond=True
    )

    if weights_path:
        state_dict = torch.load(weights_path, map_location=torch.device('cpu'))
        try:
            unet.load_state_dict(state_dict['unet'])
        except:
            unet.load_state_dict(state_dict)
    unet.to(device=device, dtype=dtype).eval()
    return unet


def get_T2I_unet(
    device: Union[str, torch.device],
    weights_path: Optional[str] = None,
    dtype: Union[str, torch.dtype] = torch.float32,
) -> (UNet, Optional[torch.Tensor], Optional[dict]):
    unet = UNet(
        model_channels=384,
        num_channels=4,
        init_channels=192,
        time_embed_dim=1536,
        context_dim=4096,
        groups=32,
        head_dim=64,
        expansion_ratio=4,
        compression_ratio=2,
        dim_mult=(1, 2, 4, 8),
        num_blocks=(3, 3, 3, 3),
        add_cross_attention=(False, True, True, True),
        add_self_attention=(False, True, True, True),
    )

    null_embedding = None
    if weights_path:
        state_dict = torch.load(weights_path, map_location=torch.device('cpu'))
        null_embedding = state_dict['null_embedding']
        unet.load_state_dict(state_dict['unet'])

    unet.to(device=device, dtype=dtype).eval()
    return unet, null_embedding


def get_movq(
    device: Union[str, torch.device],
    weights_path: Optional[str] = None,
    dtype: Union[str, torch.dtype] = torch.float32,
) -> MoVQ:
    generator_config = {
        'double_z': False,
        'z_channels': 4,
        'resolution': 256,
        'in_channels': 3,
        'out_ch': 3,
        'ch': 256,
        'ch_mult': [1, 2, 2, 4],
        'num_res_blocks': 2,
        'attn_resolutions': [32],
        'dropout': 0.0,
        'tile_sample_min_size': 1024,
        'tile_overlap_factor_enc': 0.0,
        'tile_overlap_factor_dec': 0.25,
        'use_tiling': True
    }
    movq = MoVQ(generator_config)

    if weights_path:
        state_dict = torch.load(weights_path, map_location=torch.device('cpu'))
        movq.load_state_dict(state_dict)

    movq.to(device=device, dtype=dtype).eval()
    return movq


def get_SR_pipeline(
    device: Union[str, torch.device],
    fp16: bool = True,
    flash: bool = True,
    scale: int = 2,
    cache_dir: str = '/tmp/KandiSuperRes/',
    movq_path: str = None,
    refiner_path: str = None,
    unet_sr_path: str = None,
) -> KandiSuperResPipeline:

    if flash:
        if scale == 2:
            device_map = {
                'movq': device, 'refiner': device, 'sr_model': device
            }
            dtype = torch.float16 if fp16 else torch.float32
            dtype_map = {
                'movq': torch.float32, 'refiner': dtype, 'sr_model': dtype
            }
            if movq_path is None:
                print('Download movq weights')
                movq_path = hf_hub_download(
                    repo_id="ai-forever/Kandinsky3.1", filename='weights/movq.pt', cache_dir=cache_dir
                )
            if refiner_path is None:
                print('Download refiner weights')
                refiner_path = hf_hub_download(
                    repo_id="ai-forever/Kandinsky3.1", filename='weights/kandinsky3_flash.pt', cache_dir=cache_dir
                )
            if unet_sr_path is None:
                print('Download KandiSuperRes Flash weights')
                unet_sr_path = hf_hub_download(
                    repo_id="ai-forever/KandiSuperRes", filename='KandiSuperRes_flash_x2.pt', cache_dir=cache_dir
                )
            sr_model = get_sr_model(device_map['sr_model'], unet_sr_path, dtype=dtype_map['sr_model'])
            movq = get_movq(device_map['movq'], movq_path, dtype=dtype_map['movq'])
            refiner, _ = get_T2I_unet(device_map['refiner'], refiner_path, dtype=dtype_map['refiner'])
            return KandiSuperResPipeline(
                scale, device_map, dtype_map, flash, sr_model, movq, refiner
            )
        else:
            print('Flash model for x4 scale is not implemented.')
    else:
        if unet_sr_path is None:
            if scale == 4:
                unet_sr_path = hf_hub_download(
                    repo_id="ai-forever/KandiSuperRes", filename='KandiSuperRes.ckpt', cache_dir=cache_dir
                )
            elif scale == 2:
                unet_sr_path = hf_hub_download(
                    repo_id="ai-forever/KandiSuperRes", filename='KandiSuperRes_x2.ckpt', cache_dir=cache_dir
                )
        dtype = torch.float16 if fp16 else torch.float32
        sr_model = get_sr_model(device, unet_sr_path, dtype=dtype)
        return KandiSuperResPipeline(scale, device, dtype, flash, sr_model)
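For reference, a minimal usage sketch of the loader above, not part of the uploaded files. get_SR_pipeline and its defaults come from KandiSuperRes/__init__.py as shown; the call signature of the returned KandiSuperResPipeline (taking a PIL image and returning the upscaled image) is an assumption based on sr_pipeline.py, which is listed in this upload but not reproduced here.

import torch
from PIL import Image
from KandiSuperRes import get_SR_pipeline

device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Flash x2 pipeline; weights are fetched from the Hugging Face Hub on first use.
sr_pipe = get_SR_pipeline(device=device, fp16=True, flash=True, scale=2)
lr_image = Image.open('low_res.png')  # hypothetical input file
sr_image = sr_pipe(lr_image)          # assumed __call__ signature of KandiSuperResPipeline
sr_image.save('high_res.png')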
KandiSuperRes/model/__init__.py
ADDED
File without changes
KandiSuperRes/model/diffusion_refine.py
ADDED
@@ -0,0 +1,131 @@
import math
import torch
from tqdm import tqdm
from .utils import get_tensor_items
import torch.nn.functional as F


def get_named_beta_schedule(schedule_name, timesteps):
    if schedule_name == "linear":
        scale = 1000 / timesteps
        beta_start = scale * 0.0001
        beta_end = scale * 0.02
        return torch.linspace(
            beta_start, beta_end, timesteps, dtype=torch.float32
        )
    elif schedule_name == "cosine":
        alpha_bar = lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
        betas = []
        for i in range(timesteps):
            t1 = i / timesteps
            t2 = (i + 1) / timesteps
            betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), 0.999))
        return torch.tensor(betas, dtype=torch.float32)


class BaseDiffusion:

    def __init__(self, betas, percentile=None, gen_noise=torch.randn_like):
        self.betas = betas
        self.num_timesteps = betas.shape[0]

        alphas = 1. - betas
        self.alphas_cumprod = torch.cumprod(alphas, dim=0)
        self.alphas_cumprod_prev = torch.cat([torch.ones(1, dtype=betas.dtype), self.alphas_cumprod[:-1]])

        # calculate q(x_t | x_{t-1})
        self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod)
        self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1. - self.alphas_cumprod)

        self.time_scale = 1000 // self.num_timesteps
        self.gen_noise = gen_noise

    def get_x_start(self, x, t, noise):
        sqrt_one_minus_alphas_cumprod = get_tensor_items(self.sqrt_one_minus_alphas_cumprod, t, noise.shape)
        sqrt_alphas_cumprod = get_tensor_items(self.sqrt_alphas_cumprod, t, noise.shape)
        pred_x_start = (x - sqrt_one_minus_alphas_cumprod * noise) / sqrt_alphas_cumprod
        return pred_x_start

    def q_sample(self, x_start, t, noise=None):
        if noise is None:
            noise = self.gen_noise(x_start)
        sqrt_alphas_cumprod = get_tensor_items(self.sqrt_alphas_cumprod, t, x_start.shape)
        sqrt_one_minus_alphas_cumprod = get_tensor_items(self.sqrt_one_minus_alphas_cumprod, t, noise.shape)
        x_t = sqrt_alphas_cumprod * x_start + sqrt_one_minus_alphas_cumprod * noise
        return x_t

    @torch.no_grad()
    def refine(self, model, img, context, context_mask):
        # for time in tqdm([479, 229]):
        for time in [229]:
            time = torch.tensor([time,] * img.shape[0], device=img.device)
            x_t = self.q_sample(img, time)
            pred_noise = model(x_t, time.type(x_t.dtype), context, context_mask.bool())
            img = self.get_x_start(x_t, time, pred_noise)
        return img

    def blend_v(
        self, a: torch.Tensor, b: torch.Tensor, blend_extent: int
    ) -> torch.Tensor:
        blend_extent = min(a.shape[2], b.shape[2], blend_extent)
        for y in range(blend_extent):
            b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (
                1 - y / blend_extent
            ) + b[:, :, y, :] * (y / blend_extent)
        return b

    def blend_h(
        self, a: torch.Tensor, b: torch.Tensor, blend_extent: int
    ) -> torch.Tensor:
        blend_extent = min(a.shape[3], b.shape[3], blend_extent)
        for x in range(blend_extent):
            b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (
                1 - x / blend_extent
            ) + b[:, :, :, x] * (x / blend_extent)
        return b


    def refine_tiled(self, model, img, context, context_mask):
        tile_sample_min_size = 352
        tile_overlap_factor = 0.25

        overlap_size = int(tile_sample_min_size * (1 - tile_overlap_factor))
        tile_latent_min_size = int(tile_sample_min_size)
        blend_extent = int(tile_latent_min_size * tile_overlap_factor)
        row_limit = tile_latent_min_size - blend_extent

        # Split the image into tiles and encode them separately.
        rows = []
        for i in tqdm(range(0, img.shape[2], overlap_size)):
            row = []
            for j in range(0, img.shape[3], overlap_size):
                tile = img[
                    :,
                    :,
                    i : i + tile_sample_min_size,
                    j : j + tile_sample_min_size,
                ]
                tile = self.refine(model, tile, context, context_mask)
                row.append(tile)
            rows.append(row)
        result_rows = []
        for i, row in enumerate(rows):
            result_row = []
            for j, tile in enumerate(row):
                # blend the above tile and the left tile
                # to the current tile and add the current tile to the result row
                if i > 0:
                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
                if j > 0:
                    tile = self.blend_h(row[j - 1], tile, blend_extent)
                result_row.append(tile[:, :, :row_limit, :row_limit])
            result_rows.append(torch.cat(result_row, dim=3))

        refine_img = torch.cat(result_rows, dim=2)
        return refine_img


def get_diffusion(conf):
    betas = get_named_beta_schedule(**conf.schedule_params)
    base_diffusion = BaseDiffusion(betas, **conf.diffusion_params)
    return base_diffusion
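A standalone sketch, not part of the upload, of the linear cross-fade that blend_h above applies to two horizontally adjacent tiles; blend_v does the same along the height axis.

import torch

def blend_h(a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
    # Same arithmetic as BaseDiffusion.blend_h: column x of the right tile becomes a
    # weighted mix of the last blend_extent columns of the left tile and its own values.
    blend_extent = min(a.shape[3], b.shape[3], blend_extent)
    for x in range(blend_extent):
        b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
    return b

left = torch.zeros(1, 1, 2, 6)   # left tile, all zeros
right = torch.ones(1, 1, 2, 6)   # right tile, all ones
blended = blend_h(left, right, blend_extent=4)
print(blended[0, 0, 0])  # first 4 columns ramp 0.00, 0.25, 0.50, 0.75; the rest stay 1.0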
KandiSuperRes/model/diffusion_sr.py
ADDED
@@ -0,0 +1,146 @@
from diffusers import DDIMScheduler, DPMSolverMultistepScheduler
from einops import repeat
import copy
import inspect
import math
import torch
import torch.nn.functional as F
from tqdm import tqdm


class DPMSolver:

    def __init__(self, num_timesteps):
        self.dpm_solver = DPMSolverMultistepScheduler(
            beta_schedule="linear",
            prediction_type="sample",
            # algorithm_type="sde-dpmsolver++",
            thresholding=False
        )
        self.dpm_solver.set_timesteps(num_timesteps)


    @torch.no_grad()
    def pred_noise(self, model, x, t, lowres_img, dtype):
        pred_noise = model(x.to(dtype), t.to(dtype), lowres_img=lowres_img.to(dtype))
        pred_noise = pred_noise.to(dtype=torch.float32)
        return pred_noise


    def prepare_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
        # and should be between [0, 1]

        accepts_eta = "eta" in set(inspect.signature(self.dpm_solver.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        # check if the scheduler accepts generator
        accepts_generator = "generator" in set(inspect.signature(self.dpm_solver.step).parameters.keys())
        if accepts_generator:
            extra_step_kwargs["generator"] = generator
        return extra_step_kwargs


    def get_views(self, panorama_height, panorama_width, window_size=1024, stride=800):
        # Here, we define the mappings F_i (see Eq. 7 in the MultiDiffusion paper https://arxiv.org/abs/2302.08113)
        # if panorama's height/width < window_size, num_blocks of height/width should return 1
        num_blocks_height = round(math.ceil((panorama_height - window_size) / stride)) + 1 if panorama_height > window_size else 1
        num_blocks_width = round(math.ceil((panorama_width - window_size) / stride)) + 1 if panorama_width > window_size else 1
        total_num_blocks = int(num_blocks_height * num_blocks_width)
        views = []
        for i in range(total_num_blocks):
            h_start = int((i // num_blocks_width) * stride)
            h_end = h_start + window_size
            if h_end > panorama_height and num_blocks_height > 1:
                h_end = panorama_height
                h_start = panorama_height - window_size
            w_start = int((i % num_blocks_width) * stride)
            w_end = w_start + window_size
            if w_end > panorama_width and num_blocks_width > 1:
                w_end = panorama_width
                w_start = panorama_width - window_size
            views.append((h_start, h_end, w_start, w_end))
        return views


    def generate_panorama(self, height, width, device, dtype, num_inference_steps,
                          unet, lowres_img, view_batch_size=15, eta=0, seed=0):
        # 6. Define panorama grid and initialize views for synthesis.
        # prepare batch grid
        views = self.get_views(height, width)
        views_batch = [views[i : i + view_batch_size] for i in range(0, len(views), view_batch_size)]
        views_scheduler_status = [copy.deepcopy(self.dpm_solver.__dict__)] * len(views_batch)

        shape = (1, 3, height, width)
        count = torch.zeros(*shape, device=device)
        value = torch.zeros(*shape, device=device)

        generator = torch.Generator(device=device)
        if seed is not None:
            generator = generator.manual_seed(seed)

        img = torch.randn(*shape, device=device, generator=generator)
        up_lowres_img = F.interpolate(lowres_img, (shape[2], shape[3]), mode="bilinear")

        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 8. Denoising loop
        # Each denoising step also includes refinement of the latents with respect to the
        # views.
        timesteps = self.dpm_solver.timesteps
        num_warmup_steps = len(timesteps) - num_inference_steps * self.dpm_solver.order

        for i, time in tqdm(enumerate(self.dpm_solver.timesteps)):
            count.zero_()
            value.zero_()

            # generate views
            # Here, we iterate through different spatial crops of the latents and denoise them. These
            # denoised (latent) crops are then averaged to produce the final latent
            # for the current timestep via MultiDiffusion. Please see Sec. 4.1 in the
            # MultiDiffusion paper for more details: https://arxiv.org/abs/2302.08113
            # Batch views denoise
            for j, batch_view in enumerate(views_batch):
                vb_size = len(batch_view)
                # get the latents corresponding to the current view coordinates
                img_for_view = torch.cat(
                    [
                        img[:, :, h_start:h_end, w_start:w_end]
                        for h_start, h_end, w_start, w_end in batch_view
                    ]
                )
                lowres_img_for_view = torch.cat(
                    [
                        up_lowres_img[:, :, h_start:h_end, w_start:w_end]
                        for h_start, h_end, w_start, w_end in batch_view
                    ]
                )

                # rematch block's scheduler status
                self.dpm_solver.__dict__.update(views_scheduler_status[j])

                t = torch.tensor([time] * img_for_view.shape[0], device=device)
                pred_noise = self.pred_noise(
                    unet, img_for_view, t, lowres_img_for_view, dtype
                )
                img_denoised_batch = self.dpm_solver.step(pred_noise, time, img_for_view, **extra_step_kwargs).prev_sample

                # save views scheduler status after sample
                views_scheduler_status[j] = copy.deepcopy(self.dpm_solver.__dict__)

                # extract value from batch
                for img_view_denoised, (h_start, h_end, w_start, w_end) in zip(
                    img_denoised_batch.chunk(vb_size), batch_view
                ):
                    value[:, :, h_start:h_end, w_start:w_end] += img_view_denoised
                    count[:, :, h_start:h_end, w_start:w_end] += 1

            # take the MultiDiffusion step. Eq. 5 in MultiDiffusion paper: https://arxiv.org/abs/2302.08113
            img = torch.where(count > 0, value / count, value)

        return img
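A standalone sketch, not part of the upload, reproducing the window arithmetic of DPMSolver.get_views on a hypothetical 2048x1824 target, so the overlapping 1024-pixel windows with stride 800 are visible; the last row and column are shifted back so they never run past the image border.

import math

def get_views(panorama_height, panorama_width, window_size=1024, stride=800):
    # Same index arithmetic as DPMSolver.get_views above.
    num_blocks_height = round(math.ceil((panorama_height - window_size) / stride)) + 1 if panorama_height > window_size else 1
    num_blocks_width = round(math.ceil((panorama_width - window_size) / stride)) + 1 if panorama_width > window_size else 1
    views = []
    for i in range(int(num_blocks_height * num_blocks_width)):
        h_start = int((i // num_blocks_width) * stride)
        h_end = h_start + window_size
        if h_end > panorama_height and num_blocks_height > 1:
            h_end, h_start = panorama_height, panorama_height - window_size
        w_start = int((i % num_blocks_width) * stride)
        w_end = w_start + window_size
        if w_end > panorama_width and num_blocks_width > 1:
            w_end, w_start = panorama_width, panorama_width - window_size
        views.append((h_start, h_end, w_start, w_end))
    return views

for view in get_views(2048, 1824):
    print(view)  # a 3x2 grid of (h_start, h_end, w_start, w_end) crops, each 1024x1024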
KandiSuperRes/model/diffusion_sr_turbo.py
ADDED
@@ -0,0 +1,87 @@
import math

import torch
from einops import rearrange
from tqdm import tqdm

from .utils import get_tensor_items, exist
import numpy as np


def get_named_beta_schedule(schedule_name, timesteps):
    if schedule_name == "linear":
        scale = 1000 / timesteps
        beta_start = scale * 0.0001
        beta_end = scale * 0.02
        return torch.linspace(
            beta_start, beta_end, timesteps, dtype=torch.float32
        )
    elif schedule_name == "cosine":
        alpha_bar = lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
        betas = []
        for i in range(timesteps):
            t1 = i / timesteps
            t2 = (i + 1) / timesteps
            betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), 0.999))
        return torch.tensor(betas, dtype=torch.float32)


class BaseDiffusion:

    def __init__(self, betas, percentile=None, gen_noise=torch.randn_like):
        self.betas = betas
        self.num_timesteps = betas.shape[0]

        alphas = 1. - betas
        self.alphas_cumprod = torch.cumprod(alphas, dim=0)
        self.alphas_cumprod_prev = torch.cat([torch.ones(1, dtype=betas.dtype), self.alphas_cumprod[:-1]])

        # calculate q(x_t | x_{t-1})
        self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod)
        self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1. - self.alphas_cumprod)

        # calculate q(x_{t-1} | x_t, x_0)
        self.posterior_mean_coef_1 = (torch.sqrt(self.alphas_cumprod_prev) * betas / (1. - self.alphas_cumprod))
        self.posterior_mean_coef_2 = (torch.sqrt(alphas) * (1. - self.alphas_cumprod_prev) / (1. - self.alphas_cumprod))
        self.posterior_variance = betas * (1. - self.alphas_cumprod_prev) / (1. - self.alphas_cumprod)
        self.posterior_log_variance = (torch.log(
            torch.cat([self.posterior_variance[1].unsqueeze(0), self.posterior_variance[1:]])
        ))

        self.percentile = percentile
        self.time_scale = 1000 // self.num_timesteps
        self.gen_noise = gen_noise

    def q_sample(self, x_start, t, noise=None):
        if noise is None:
            noise = self.gen_noise(x_start)
        sqrt_alphas_cumprod = get_tensor_items(self.sqrt_alphas_cumprod, t, x_start.shape)
        sqrt_one_minus_alphas_cumprod = get_tensor_items(self.sqrt_one_minus_alphas_cumprod, t, noise.shape)
        x_t = sqrt_alphas_cumprod * x_start + sqrt_one_minus_alphas_cumprod * noise
        return x_t

    @torch.no_grad()
    def p_sample_loop(
        self, model, shape, device, dtype, lowres_img, times=[979, 729, 479, 229]
    ):
        img = torch.randn(*shape, device=device).to(dtype=dtype)
        times = times + [0,]
        times = list(zip(times[:-1], times[1:]))
        for time, prev_time in tqdm(times):
            time = torch.tensor([time] * shape[0], device=device)
            x_t = self.q_sample(img, time)
            img = model(x_t.to(dtype), time.to(dtype), lowres_img=lowres_img.to(dtype))
        return img

    @torch.no_grad()
    def refine(self, model, img, **large_model_kwargs):
        for time in tqdm([729, 479, 229]):
            time = torch.tensor([time,] * img.shape[0], device=img.device)
            x_t = self.q_sample(img, time)
            img = model(x_t, time.type(x_t.dtype), **large_model_kwargs)
        return img


def get_diffusion(conf):
    betas = get_named_beta_schedule(**conf.schedule_params)
    base_diffusion = BaseDiffusion(betas, **conf.diffusion_params)
    return base_diffusion
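A tiny sketch, not part of the upload, of the fixed step schedule that p_sample_loop above walks: each timestep is paired with the next lower one, and the current estimate is re-noised with q_sample before every model call.

# The turbo sampler uses only four model evaluations.
times = [979, 729, 479, 229]
times = times + [0]
print(list(zip(times[:-1], times[1:])))
# [(979, 729), (729, 479), (479, 229), (229, 0)]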
KandiSuperRes/model/nn.py
ADDED
@@ -0,0 +1,122 @@
import math

import torch
from torch import nn, einsum
from einops import rearrange, repeat

from .utils import exist, set_default_layer


class Identity(nn.Module):
    def __init__(self, *args, **kwargs):
        super().__init__()

    @staticmethod
    def forward(x, *args, **kwargs):
        return x


class SinusoidalPosEmb_sr(nn.Module):

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        half_dim = self.dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device=x.device) * -emb)
        emb = rearrange(x, 'i -> i 1') * rearrange(emb, 'j -> 1 j').to(dtype=x.dtype)
        return torch.cat((emb.sin(), emb.cos()), dim=-1)


class UpDownResolution(nn.Module):

    def __init__(self, num_channels, up_resolution, change_type='conv'):
        super().__init__()
        if change_type == 'pooling':
            self.change_resolution = set_default_layer(
                up_resolution,
                layer_1=nn.Upsample, kwargs_1={'scale_factor': 2., 'mode': 'nearest'},
                layer_2=nn.AvgPool2d, kwargs_2={'kernel_size': 2, 'stride': 2}
            )

        elif change_type == 'conv':
            self.change_resolution = set_default_layer(
                up_resolution,
                nn.ConvTranspose2d, (num_channels, num_channels), {'kernel_size': 4, 'stride': 2, 'padding': 1},
                nn.Conv2d, (num_channels, num_channels), {'kernel_size': 4, 'stride': 2, 'padding': 1},
            )
        else:
            raise NotImplementedError

    def forward(self, x):
        x = self.change_resolution(x)
        return x

class SinusoidalPosEmb(nn.Module):

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        half_dim = self.dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device=x.device, dtype=x.dtype) * -emb)
        emb = rearrange(x, 'i -> i 1') * rearrange(emb, 'j -> 1 j')
        return torch.cat((emb.sin(), emb.cos()), dim=-1)


class ConditionalGroupNorm(nn.Module):

    def __init__(self, groups, normalized_shape, context_dim):
        super().__init__()
        self.norm = nn.GroupNorm(groups, normalized_shape, affine=False)
        self.context_mlp = nn.Sequential(
            nn.SiLU(),
            nn.Linear(context_dim, 2 * normalized_shape)
        )
        self.context_mlp[1].weight.data.zero_()
        self.context_mlp[1].bias.data.zero_()

    def forward(self, x, context):
        context = self.context_mlp(context)
        ndims = ' 1' * len(x.shape[2:])
        context = rearrange(context, f'b c -> b c{ndims}')

        scale, shift = context.chunk(2, dim=1)
        x = self.norm(x) * (scale + 1.) + shift
        return x


class Attention(nn.Module):

    def __init__(self, in_channels, out_channels, context_dim, head_dim=64):
        super().__init__()
        assert out_channels % head_dim == 0
        self.num_heads = out_channels // head_dim
        self.scale = head_dim ** -0.5

        self.to_query = nn.Linear(in_channels, out_channels, bias=False)
        self.to_key = nn.Linear(context_dim, out_channels, bias=False)
        self.to_value = nn.Linear(context_dim, out_channels, bias=False)

        self.output_layer = nn.Linear(out_channels, out_channels, bias=False)

    def forward(self, x, context, context_mask=None):
        query = rearrange(self.to_query(x), 'b n (h d) -> b h n d', h=self.num_heads)
        key = rearrange(self.to_key(context), 'b n (h d) -> b h n d', h=self.num_heads)
        value = rearrange(self.to_value(context), 'b n (h d) -> b h n d', h=self.num_heads)

        attention_matrix = einsum('b h i d, b h j d -> b h i j', query, key) * self.scale
        if exist(context_mask):
            max_neg_value = -torch.finfo(attention_matrix.dtype).max
            context_mask = rearrange(context_mask, 'b j -> b 1 1 j')
            attention_matrix = attention_matrix.masked_fill(~context_mask, max_neg_value)
        attention_matrix = attention_matrix.softmax(dim=-1)

        out = einsum('b h i j, b h j d -> b h i d', attention_matrix, value)
        out = rearrange(out, 'b h n d -> b n (h d)')
        out = self.output_layer(out)
        return out
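A quick shape check, not part of the upload, for SinusoidalPosEmb as defined above; it assumes the KandiSuperRes package from this upload is importable.

import torch
from KandiSuperRes.model.nn import SinusoidalPosEmb

# A batch of timesteps of shape (b,) becomes an embedding of shape (b, dim):
# the first half sine features, the second half cosine features.
emb = SinusoidalPosEmb(dim=128)
t = torch.tensor([229., 479., 729.])
print(emb(t).shape)  # torch.Size([3, 128])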
KandiSuperRes/model/unet.py
ADDED
@@ -0,0 +1,284 @@
import torch
from torch import nn, einsum
from einops import rearrange

from .nn import Identity, Attention, SinusoidalPosEmb, ConditionalGroupNorm
from .utils import exist, set_default_item, set_default_layer
import torch.nn.functional as F


class Block(nn.Module):

    def __init__(self, in_channels, out_channels, time_embed_dim, kernel_size=3, norm_groups=32, up_resolution=None):
        super().__init__()
        self.group_norm = ConditionalGroupNorm(norm_groups, in_channels, time_embed_dim)
        self.activation = nn.SiLU()
        self.up_sample = set_default_layer(
            exist(up_resolution) and up_resolution,
            nn.ConvTranspose2d, (in_channels, in_channels), {'kernel_size': 2, 'stride': 2}
        )
        padding = set_default_item(kernel_size == 1, 0, 1)
        self.projection = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, padding=padding)
        self.down_sample = set_default_layer(
            exist(up_resolution) and not up_resolution,
            nn.Conv2d, (out_channels, out_channels), {'kernel_size': 2, 'stride': 2}
        )

    def forward(self, x, time_embed):
        x = self.group_norm(x, time_embed)
        x = self.activation(x)
        x = self.up_sample(x)
        x = self.projection(x)
        x = self.down_sample(x)
        return x


class ResNetBlock(nn.Module):

    def __init__(
        self, in_channels, out_channels, time_embed_dim, norm_groups=32, compression_ratio=2, up_resolutions=4*[None]
    ):
        super().__init__()
        kernel_sizes = [1, 3, 3, 1]
        hidden_channel = max(in_channels, out_channels) // compression_ratio
        hidden_channels = [(in_channels, hidden_channel)] + [(hidden_channel, hidden_channel)] * 2 + [(hidden_channel, out_channels)]
        self.resnet_blocks = nn.ModuleList([
            Block(in_channel, out_channel, time_embed_dim, kernel_size, norm_groups, up_resolution)
            for (in_channel, out_channel), kernel_size, up_resolution in zip(hidden_channels, kernel_sizes, up_resolutions)
        ])

        self.shortcut_up_sample = set_default_layer(
            True in up_resolutions,
            nn.ConvTranspose2d, (in_channels, in_channels), {'kernel_size': 2, 'stride': 2}
        )
        self.shortcut_projection = set_default_layer(
            in_channels != out_channels,
            nn.Conv2d, (in_channels, out_channels), {'kernel_size': 1}
        )
        self.shortcut_down_sample = set_default_layer(
            False in up_resolutions,
            nn.Conv2d, (out_channels, out_channels), {'kernel_size': 2, 'stride': 2}
        )

    def forward(self, x, time_embed):
        out = x
        for resnet_block in self.resnet_blocks:
            out = resnet_block(out, time_embed)

        x = self.shortcut_up_sample(x)
        x = self.shortcut_projection(x)
        x = self.shortcut_down_sample(x)
        x = x + out
        return x


class AttentionPolling(nn.Module):

    def __init__(self, num_channels, context_dim, head_dim=64):
        super().__init__()
        self.attention = Attention(context_dim, num_channels, context_dim, head_dim)

    def forward(self, x, context, context_mask=None):
        context = self.attention(context.mean(dim=1, keepdim=True), context, context_mask)
        return x + context.squeeze(1)


class AttentionBlock(nn.Module):

    def __init__(self, num_channels, time_embed_dim, context_dim=None, norm_groups=32, head_dim=64, expansion_ratio=4):
        super().__init__()
        self.in_norm = ConditionalGroupNorm(norm_groups, num_channels, time_embed_dim)
        self.attention = Attention(num_channels, num_channels, context_dim or num_channels, head_dim)

        hidden_channels = expansion_ratio * num_channels
        self.out_norm = ConditionalGroupNorm(norm_groups, num_channels, time_embed_dim)
        self.feed_forward = nn.Sequential(
            nn.Conv2d(num_channels, hidden_channels, kernel_size=1, bias=False),
            nn.SiLU(),
            nn.Conv2d(hidden_channels, num_channels, kernel_size=1, bias=False),
        )

    def forward(self, x, time_embed, context=None, context_mask=None):
        height, width = x.shape[-2:]
        out = self.in_norm(x, time_embed)
        out = rearrange(out, 'b c h w -> b (h w) c', h=height, w=width)
        context = set_default_item(exist(context), context, out)
        out = self.attention(out, context, context_mask)
        out = rearrange(out, 'b (h w) c -> b c h w', h=height, w=width)
        x = x + out

        out = self.out_norm(x, time_embed)
        out = self.feed_forward(out)
        x = x + out
        return x


class DownSampleBlock(nn.Module):

    def __init__(
        self, in_channels, out_channels, time_embed_dim, context_dim=None,
        num_blocks=3, groups=32, head_dim=64, expansion_ratio=4, compression_ratio=2,
        down_sample=True, self_attention=True
    ):
        super().__init__()
        self.self_attention_block = set_default_layer(
            self_attention,
            AttentionBlock,
            (in_channels, time_embed_dim, None, groups, head_dim, expansion_ratio),
            layer_2=Identity
        )

        up_resolutions = [[None] * 4] * (num_blocks - 1) + [[None, None, set_default_item(down_sample, False), None]]
        hidden_channels = [(in_channels, out_channels)] + [(out_channels, out_channels)] * (num_blocks - 1)
        self.resnet_attn_blocks = nn.ModuleList([
            nn.ModuleList([
                ResNetBlock(in_channel, out_channel, time_embed_dim, groups, compression_ratio),
                set_default_layer(
                    exist(context_dim),
                    AttentionBlock,
                    (out_channel, time_embed_dim, context_dim, groups, head_dim, expansion_ratio),
                    layer_2=Identity
                ),
                ResNetBlock(out_channel, out_channel, time_embed_dim, groups, compression_ratio, up_resolution),
            ]) for (in_channel, out_channel), up_resolution in zip(hidden_channels, up_resolutions)
        ])

    def forward(self, x, time_embed, context=None, context_mask=None, control_net_residual=None):
        x = self.self_attention_block(x, time_embed)
        for in_resnet_block, attention, out_resnet_block in self.resnet_attn_blocks:
            x = in_resnet_block(x, time_embed)
            x = attention(x, time_embed, context, context_mask)
            x = out_resnet_block(x, time_embed)
        return x


class UpSampleBlock(nn.Module):

    def __init__(
        self, in_channels, cat_dim, out_channels, time_embed_dim, context_dim=None,
        num_blocks=3, groups=32, head_dim=64, expansion_ratio=4, compression_ratio=2,
        up_sample=True, self_attention=True
    ):
        super().__init__()
        up_resolutions = [[None, set_default_item(up_sample, True), None, None]] + [[None] * 4] * (num_blocks - 1)
        hidden_channels = [(in_channels + cat_dim, in_channels)] + [(in_channels, in_channels)] * (num_blocks - 2) + [(in_channels, out_channels)]
        self.resnet_attn_blocks = nn.ModuleList([
            nn.ModuleList([
                ResNetBlock(in_channel, in_channel, time_embed_dim, groups, compression_ratio, up_resolution),
                set_default_layer(
                    exist(context_dim),
                    AttentionBlock,
                    (in_channel, time_embed_dim, context_dim, groups, head_dim, expansion_ratio),
                    layer_2=Identity
                ),
                ResNetBlock(in_channel, out_channel, time_embed_dim, groups, compression_ratio),
            ]) for (in_channel, out_channel), up_resolution in zip(hidden_channels, up_resolutions)
        ])

        self.self_attention_block = set_default_layer(
            self_attention,
            AttentionBlock,
            (out_channels, time_embed_dim, None, groups, head_dim, expansion_ratio),
            layer_2=Identity
        )

    def forward(self, x, time_embed, context=None, context_mask=None):
        for in_resnet_block, attention, out_resnet_block in self.resnet_attn_blocks:
            x = in_resnet_block(x, time_embed)
            x = attention(x, time_embed, context, context_mask)
            x = out_resnet_block(x, time_embed)
        x = self.self_attention_block(x, time_embed)
        return x


class UNet(nn.Module):

    def __init__(self,
                 model_channels,
                 init_channels=None,
                 num_channels=3,
                 out_channels=4,
                 time_embed_dim=None,
                 context_dim=None,
                 groups=32,
                 head_dim=64,
                 expansion_ratio=4,
                 compression_ratio=2,
                 dim_mult=(1, 2, 4, 8),
                 num_blocks=(3, 3, 3, 3),
                 add_cross_attention=(False, True, True, True),
                 add_self_attention=(False, True, True, True),
                 *args,
                 **kwargs,
                 ):
        super().__init__()
        init_channels = init_channels or model_channels

        self.to_time_embed = nn.Sequential(
            SinusoidalPosEmb(init_channels),
            nn.Linear(init_channels, time_embed_dim),
            nn.SiLU(),
            nn.Linear(time_embed_dim, time_embed_dim)
        )
        self.feature_pooling = AttentionPolling(time_embed_dim, context_dim, head_dim)

        self.in_layer = nn.Conv2d(num_channels, init_channels, kernel_size=3, padding=1)

        hidden_dims = [init_channels, *map(lambda mult: model_channels * mult, dim_mult)]
        in_out_dims = list(zip(hidden_dims[:-1], hidden_dims[1:]))
        text_dims = [set_default_item(is_exist, context_dim) for is_exist in add_cross_attention]
        layer_params = [num_blocks, text_dims, add_self_attention]
        rev_layer_params = map(reversed, layer_params)

        cat_dims = []
        self.num_levels = len(in_out_dims)
        self.down_samples = nn.ModuleList([])
        for level, ((in_dim, out_dim), res_block_num, text_dim, self_attention) in enumerate(zip(in_out_dims, *layer_params)):
            down_sample = level != (self.num_levels - 1)
            cat_dims.append(set_default_item(level != (self.num_levels - 1), out_dim, 0))
            self.down_samples.append(
                DownSampleBlock(
                    in_dim, out_dim, time_embed_dim, text_dim, res_block_num, groups, head_dim, expansion_ratio,
                    compression_ratio, down_sample, self_attention
                )
            )

        self.up_samples = nn.ModuleList([])
        for level, ((out_dim, in_dim), res_block_num, text_dim, self_attention) in enumerate(zip(reversed(in_out_dims), *rev_layer_params)):
            up_sample = level != 0
            self.up_samples.append(
                UpSampleBlock(
                    in_dim, cat_dims.pop(), out_dim, time_embed_dim, text_dim, res_block_num, groups, head_dim,
                    expansion_ratio, compression_ratio, up_sample, self_attention
                )
            )

        self.out_layer = nn.Sequential(
            nn.GroupNorm(groups, init_channels),
            nn.SiLU(),
            nn.Conv2d(init_channels, out_channels, kernel_size=3, padding=1)
        )

    def forward(self, x, time, context=None, context_mask=None, is_text=None, null_embedding=None, control_net_residual=None):

        time_embed = self.to_time_embed(time)
        if exist(context):
            time_embed = self.feature_pooling(time_embed, context, context_mask)

        hidden_states = []
        x = self.in_layer(x)
        for level, down_sample in enumerate(self.down_samples):
            x = down_sample(x, time_embed, context, context_mask, control_net_residual)
            if level != self.num_levels - 1:
                hidden_states.append(x)
        for level, up_sample in enumerate(self.up_samples):
            if level != 0:
                x = torch.cat([x, hidden_states.pop()], dim=1)
            x = up_sample(x, time_embed, context, context_mask)
        x = self.out_layer(x)
        return x


def get_unet(conf):
    unet = UNet(**conf)
    return unet
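A small sketch, not part of the upload, of the channel plan that UNet.__init__ above derives for the refiner configuration used in KandiSuperRes/__init__.py (model_channels=384, init_channels=192, dim_mult=(1, 2, 4, 8)).

# Same arithmetic as the hidden_dims / in_out_dims lines in UNet.__init__.
init_channels, model_channels = 192, 384
dim_mult = (1, 2, 4, 8)
hidden_dims = [init_channels, *map(lambda mult: model_channels * mult, dim_mult)]
in_out_dims = list(zip(hidden_dims[:-1], hidden_dims[1:]))
print(hidden_dims)  # [192, 384, 768, 1536, 3072]
print(in_out_dims)  # [(192, 384), (384, 768), (768, 1536), (1536, 3072)]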
KandiSuperRes/model/unet_sr.py
ADDED
@@ -0,0 +1,260 @@
import torch
from torch import nn
from einops import rearrange
from .nn import Identity, Attention, SinusoidalPosEmb, UpDownResolution
from .utils import exist, set_default_item, set_default_layer
import torch.nn.functional as F


class Block(nn.Module):

    def __init__(self, in_channels, out_channels, time_embed_dim=None, groups=32, activation=None, up_resolution=None, dropout=None):
        super().__init__()
        self.group_norm = nn.GroupNorm(groups, in_channels)
        self.activation = set_default_layer(
            exist(activation),
            nn.SiLU
        )
        self.change_resolution = set_default_layer(
            exist(up_resolution),
            UpDownResolution, (in_channels, up_resolution)
        )
        self.dropout = set_default_layer(
            exist(dropout),
            nn.Dropout, (), {'p': 0.1}
        )
        self.projection = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)

    def forward(self, x, scale_shift=None):
        x = self.group_norm(x)
        if exist(scale_shift):
            scale, shift = scale_shift
            x = x * (scale + 1) + shift
        x = self.activation(x)
        x = self.dropout(x)
        x = self.change_resolution(x)
        x = self.projection(x)
        return x


class ResNetBlock(nn.Module):

    def __init__(self, in_channels, out_channels, time_embed_dim=None, groups=32, up_resolution=None):
        super().__init__()
        self.time_mlp = set_default_item(
            exist(time_embed_dim),
            nn.Sequential(
                nn.SiLU(),
                nn.Linear(time_embed_dim, 2 * out_channels)
            )
        )
        self.in_block = Block(in_channels, out_channels, time_embed_dim, groups, up_resolution=up_resolution)
        self.out_block = Block(out_channels, out_channels, time_embed_dim, groups, activation=True, up_resolution=None, dropout=True)

        self.change_resolution = set_default_layer(
            exist(up_resolution),
            UpDownResolution, (in_channels, up_resolution)
        )
        self.res_block = set_default_layer(
            in_channels != out_channels or exist(up_resolution),
            nn.Conv2d, (in_channels, out_channels), {'kernel_size': 1}
        )

    def forward(self, x, time_embed=None):
        scale_shift = None
        if exist(time_embed) and exist(self.time_mlp):
            time_embed = self.time_mlp(time_embed)
            time_embed = rearrange(time_embed, 'b c -> b c 1 1')
            scale_shift = time_embed.chunk(2, dim=1)
        out = self.in_block(x)
        out = self.out_block(out, scale_shift=scale_shift)
        x = self.change_resolution(x)
        out = out + self.res_block(x)
        return out


class AttentionBlock(nn.Module):

    def __init__(
        self, dim, context_dim=None, groups=32, num_heads=8, num_conditions=1, feed_forward_mult=2
    ):
        super().__init__()
        self.in_norm = nn.GroupNorm(groups, dim)
        self.attention = Attention(
            dim, context_dim or dim, num_heads, num_conditions=num_conditions
        )

        hidden_dim = feed_forward_mult * dim
        self.out_norm = nn.GroupNorm(groups, dim)
        self.feed_forward = nn.Sequential(
            nn.Conv2d(dim, hidden_dim, kernel_size=1, bias=False),
            nn.SiLU(),
            nn.Conv2d(hidden_dim, dim, kernel_size=1, bias=False),
        )

    def forward(self, x, context=None, context_mask=None, context_idx=None):
        width = x.shape[-1]
        out = self.in_norm(x)
        out = rearrange(out, 'b c h w -> b (h w) c')
        context = set_default_item(exist(context), context, out)
        out = self.attention(out, context, context_mask, context_idx)
        out = rearrange(out, 'b (h w) c -> b c h w', w=width)
        x = x + out

        out = self.out_norm(x)
        out = self.feed_forward(out)
        x = x + out
        return x


class DownSampleBlock(nn.Module):

    def __init__(
        self, in_channels, out_channels, time_embed_dim,
        num_resnet_blocks=3, groups=32, down_sample=True, context_dim=None, self_attention=True, num_conditions=1):
        super().__init__()
        up_resolutions = [set_default_item(down_sample, False)] + [None] * (num_resnet_blocks - 1)
        hidden_channels = [(in_channels, out_channels)] + [(out_channels, out_channels)] * (num_resnet_blocks - 1)
        self.resnet_attn_blocks = nn.ModuleList([
            nn.ModuleList([
                ResNetBlock(in_channel, out_channel, time_embed_dim, groups, up_resolution),
                set_default_layer(
                    exist(context_dim),
                    AttentionBlock, (out_channel, context_dim), {'num_conditions': num_conditions, 'groups': groups},
                    layer_2=Identity
                )
            ]) for (in_channel, out_channel), up_resolution in zip(hidden_channels, up_resolutions)
        ])

        self.self_attention_block = set_default_layer(
            self_attention,
            AttentionBlock, (out_channels,), {'feed_forward_mult': 4, 'groups': groups},
            layer_2=Identity
        )

    def forward(self, x, time_embed, context=None, context_mask=None, context_idx=None):
        for resnet_block, attention in self.resnet_attn_blocks:
            x = resnet_block(x, time_embed)
            x = attention(x, context, context_mask, context_idx)
        x = self.self_attention_block(x)
        return x


class UpSampleBlock(nn.Module):

    def __init__(
        self, in_channels, cat_dim, out_channels, time_embed_dim,
        num_resnet_blocks=3, groups=32, up_sample=True, context_dim=None, self_attention=True, num_conditions=1):
        super().__init__()
        up_resolutions = [None] * (num_resnet_blocks - 1) + [set_default_item(up_sample, True)]
        hidden_channels = [(in_channels + cat_dim, in_channels)] + [(in_channels, in_channels)] * (num_resnet_blocks - 2) + [(in_channels, out_channels)]
        self.resnet_attn_blocks = nn.ModuleList([
            nn.ModuleList([
                ResNetBlock(in_channel, out_channel, time_embed_dim, groups, up_resolution),
                set_default_layer(
                    exist(context_dim),
                    AttentionBlock, (out_channel, context_dim), {'num_conditions': num_conditions, 'groups': groups, 'feed_forward_mult': 4},
                    layer_2=Identity
                )
            ]) for (in_channel, out_channel), up_resolution in zip(hidden_channels, up_resolutions)
        ])

        self.self_attention_block = set_default_layer(
            self_attention,
            AttentionBlock, (out_channels,), {'feed_forward_mult': 4, 'groups': groups},
            layer_2=Identity
        )

    def forward(self, x, time_embed, context=None, context_mask=None, context_idx=None):
        for resnet_block, attention in self.resnet_attn_blocks:
            x = resnet_block(x, time_embed)
            x = attention(x, context, context_mask, context_idx)
        x = self.self_attention_block(x)
        return x


class UNet(nn.Module):

    def __init__(self,
                 model_channels,
                 init_channels=128,
                 num_channels=3,
                 time_embed_dim=512,
                 context_dim=None,
                 groups=32,
                 feature_pooling_type='attention',
                 dim_mult=(1, 2, 4, 8),
                 num_resnet_blocks=(2, 4, 8, 8),
                 num_conditions=1,
                 skip_connect_scale=1.,
                 add_cross_attention=(False, False, False, False),
                 add_self_attention=(False, False, False, False),
                 lowres_cond=True,
                 ):
        super().__init__()
        out_channels = num_channels
        num_channels = set_default_item(lowres_cond, num_channels * 2, num_channels)
        init_channels = init_channels or model_channels
        self.num_conditions = num_conditions
        self.skip_connect_scale = skip_connect_scale
        self.to_time_embed = nn.Sequential(
            SinusoidalPosEmb(init_channels),
            nn.Linear(init_channels, time_embed_dim),
            nn.SiLU(),
            nn.Linear(time_embed_dim, time_embed_dim)
        )

        self.init_conv = nn.Conv2d(num_channels, init_channels, kernel_size=3, padding=1)

        hidden_dims = [init_channels, *map(lambda mult: model_channels * mult, dim_mult)]
        in_out_dims = list(zip(hidden_dims[:-1], hidden_dims[1:]))
        text_dims = [set_default_item(is_exist, context_dim) for is_exist in add_cross_attention]
        layer_params = [num_resnet_blocks, text_dims, add_self_attention]
        rev_layer_params = map(reversed, layer_params)

        cat_dims = []
        self.num_levels = len(in_out_dims)
        self.down_samples = nn.ModuleList([])
        for level, ((in_dim, out_dim), res_block_num, text_dim, self_attention) in enumerate(zip(in_out_dims, *layer_params)):
            down_sample = level != (self.num_levels - 1)
            cat_dims.append(set_default_item(level != (self.num_levels - 1), out_dim, 0))
            self.down_samples.append(
                DownSampleBlock(
                    in_dim, out_dim, time_embed_dim, res_block_num, groups, down_sample, text_dim, self_attention, num_conditions
                )
            )

        self.up_samples = nn.ModuleList([])
        for level, ((out_dim, in_dim), res_block_num, text_dim, self_attention) in enumerate(zip(reversed(in_out_dims), *rev_layer_params)):
            up_sample = level != 0
            self.up_samples.append(
                UpSampleBlock(
                    in_dim, cat_dims.pop(), out_dim, time_embed_dim, res_block_num, groups, up_sample, text_dim, self_attention, num_conditions
                )
            )

        self.norm = nn.GroupNorm(groups, init_channels)
        self.activation = nn.SiLU()
        self.out_conv = nn.Conv2d(init_channels, out_channels, kernel_size=3, padding=1)

    def forward(self, x, time, context=None, context_mask=None, context_idx=None, lowres_img=None):
        if exist(lowres_img):
            _, _, new_height, new_width = x.shape
            upsampled = F.interpolate(lowres_img, (new_height, new_width), mode="bilinear")
            x = torch.cat([x, upsampled], dim=1)
        time_embed = self.to_time_embed(time)

        hidden_states = []
        x = self.init_conv(x)
        for level, down_sample in enumerate(self.down_samples):
            x = down_sample(x, time_embed, context, context_mask, context_idx)
            if level != self.num_levels - 1:
                hidden_states.append(x)
        for level, up_sample in enumerate(self.up_samples):
            if level != 0:
                x = torch.cat([x, hidden_states.pop() / self.skip_connect_scale], dim=1)
            x = up_sample(x, time_embed, context, context_mask, context_idx)
        x = self.norm(x)
        x = self.activation(x)
        x = self.out_conv(x)
        return x
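A small sketch, not part of the upload, of the low-resolution conditioning path at the top of UNet.forward above: with lowres_cond=True the input convolution expects 2 * num_channels = 6 channels, the noisy target concatenated with the bilinearly upsampled low-res image. The tensor sizes below are hypothetical.

import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 512, 512)           # noisy high-res estimate
lowres_img = torch.randn(1, 3, 256, 256)  # low-res input to be upscaled x2
upsampled = F.interpolate(lowres_img, (512, 512), mode="bilinear")
x = torch.cat([x, upsampled], dim=1)
print(x.shape)  # torch.Size([1, 6, 512, 512]) -> fed to init_conv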
KandiSuperRes/model/utils.py
ADDED
@@ -0,0 +1,62 @@
from torch.nn import Identity
from einops import rearrange


def exist(item):
    return item is not None


def set_default_item(condition, item_1, item_2=None):
    if condition:
        return item_1
    else:
        return item_2


def set_default_layer(condition, layer_1, args_1=[], kwargs_1={}, layer_2=Identity, args_2=[], kwargs_2={}):
    if condition:
        return layer_1(*args_1, **kwargs_1)
    else:
        return layer_2(*args_2, **kwargs_2)


def get_tensor_items(x, pos, broadcast_shape):
    device = pos.device
    bs = pos.shape[0]
    ndims = len(broadcast_shape[1:])
    x = x.cpu()[pos.cpu()]
    return x.reshape(bs, *((1,) * ndims)).to(device)


def local_patching(x, height, width, group_size):
    if group_size > 0:
        x = rearrange(
            x, 'b c (h g1) (w g2) -> b (h w) (g1 g2) c',
            h=height//group_size, w=width//group_size, g1=group_size, g2=group_size
        )
    else:
        x = rearrange(x, 'b c h w -> b (h w) c', h=height, w=width)
    return x


def local_merge(x, height, width, group_size):
    if group_size > 0:
        x = rearrange(
            x, 'b (h w) (g1 g2) c -> b c (h g1) (w g2)',
            h=height//group_size, w=width//group_size, g1=group_size, g2=group_size
        )
    else:
        x = rearrange(x, 'b (h w) c -> b c h w', h=height, w=width)
    return x


def global_patching(x, height, width, group_size):
    x = local_patching(x, height, width, height//group_size)
    x = x.transpose(-2, -3)
    return x


def global_merge(x, height, width, group_size):
    x = x.transpose(-2, -3)
    x = local_merge(x, height, width, height//group_size)
    return x
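The patching helpers above are pure einops rearrangements, so local_merge exactly inverts local_patching. A small sanity-check sketch, assuming the package is importable and using arbitrary shapes:

import torch
from KandiSuperRes.model.utils import local_patching, local_merge

x = torch.randn(2, 8, 16, 16)               # b c h w
patches = local_patching(x, 16, 16, 4)      # -> (2, 16, 16, 8): 16 patches of 4x4 positions each
restored = local_merge(patches, 16, 16, 4)  # exact inverse rearrangement
assert torch.equal(x, restored)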
KandiSuperRes/movq.py
ADDED
@@ -0,0 +1,541 @@
import math
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F

from .utils import freeze
from tqdm import tqdm

import time


def nonlinearity(x):
    return x * torch.sigmoid(x)


class SpatialNorm(nn.Module):
    def __init__(
        self, f_channels, zq_channels=None, norm_layer=nn.GroupNorm, freeze_norm_layer=False, add_conv=False, **norm_layer_params
    ):
        super().__init__()
        self.norm_layer = norm_layer(num_channels=f_channels, **norm_layer_params)
        if zq_channels is not None:
            if freeze_norm_layer:
                for p in self.norm_layer.parameters():
                    p.requires_grad = False
            self.add_conv = add_conv
            if self.add_conv:
                self.conv = nn.Conv2d(zq_channels, zq_channels, kernel_size=3, stride=1, padding=1)
            self.conv_y = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0)
            self.conv_b = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, f, zq=None):
        norm_f = self.norm_layer(f)
        if zq is not None:
            f_size = f.shape[-2:]
            zq = torch.nn.functional.interpolate(zq, size=f_size, mode="nearest")
            if self.add_conv:
                zq = self.conv(zq)
            norm_f = norm_f * self.conv_y(zq) + self.conv_b(zq)
        return norm_f


def Normalize(in_channels, zq_ch=None, add_conv=None):
    return SpatialNorm(
        in_channels, zq_ch, norm_layer=nn.GroupNorm,
        freeze_norm_layer=False, add_conv=add_conv, num_groups=32, eps=1e-6, affine=True
    )


class Upsample(nn.Module):
    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if self.with_conv:
            self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
        if self.with_conv:
            x = self.conv(x)
        return x


class Downsample(nn.Module):
    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if self.with_conv:
            self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)

    def forward(self, x):
        if self.with_conv:
            pad = (0, 1, 0, 1)
            x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
            x = self.conv(x)
        else:
            x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
        return x


class ResnetBlock(nn.Module):
    def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
                 dropout, temb_channels=512, zq_ch=None, add_conv=False):
        super().__init__()
        self.in_channels = in_channels
        out_channels = in_channels if out_channels is None else out_channels
        self.out_channels = out_channels
        self.use_conv_shortcut = conv_shortcut

        self.norm1 = Normalize(in_channels, zq_ch, add_conv=add_conv)
        self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
        if temb_channels > 0:
            self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
        self.norm2 = Normalize(out_channels, zq_ch, add_conv=add_conv)
        self.dropout = torch.nn.Dropout(dropout)
        self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        if self.in_channels != self.out_channels:
            if self.use_conv_shortcut:
                self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
            else:
                self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, x, temb, zq=None):
        h = x
        h = self.norm1(h, zq)
        h = nonlinearity(h)
        h = self.conv1(h)

        if temb is not None:
            h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None]

        h = self.norm2(h, zq)
        h = nonlinearity(h)
        h = self.dropout(h)
        h = self.conv2(h)

        if self.in_channels != self.out_channels:
            if self.use_conv_shortcut:
                x = self.conv_shortcut(x)
            else:
                x = self.nin_shortcut(x)
        return x + h


class AttnBlock(nn.Module):
    def __init__(self, in_channels, zq_ch=None, add_conv=False):
        super().__init__()
        self.in_channels = in_channels

        self.norm = Normalize(in_channels, zq_ch, add_conv=add_conv)
        self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
        self.k = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
        self.v = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
        self.proj_out = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, x, zq=None):
        h_ = x
        h_ = self.norm(h_, zq)
        q = self.q(h_)
        k = self.k(h_)
        v = self.v(h_)

        # compute attention
        b, c, h, w = q.shape
        q = q.reshape(b, c, h * w)
        q = q.permute(0, 2, 1)      # b,hw,c
        k = k.reshape(b, c, h * w)  # b,c,hw
        w_ = torch.bmm(q, k)        # b,hw,hw    w[b,i,j] = sum_c q[b,i,c] k[b,c,j]
        w_ = w_ * (int(c) ** (-0.5))
        w_ = torch.nn.functional.softmax(w_, dim=2)

        # attend to values
        v = v.reshape(b, c, h * w)
        w_ = w_.permute(0, 2, 1)    # b,hw,hw (first hw of k, second of q)
        h_ = torch.bmm(v, w_)       # b,c,hw (hw of q)   h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
        h_ = h_.reshape(b, c, h, w)

        h_ = self.proj_out(h_)
        return x + h_


class Encoder(nn.Module):
    def __init__(self, *, ch, out_ch, ch_mult=(1, 2, 4, 8), num_res_blocks,
                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
                 resolution, z_channels, double_z=True, **ignore_kwargs):
        super().__init__()
        self.ch = ch
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels

        # downsampling
        self.conv_in = torch.nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)

        curr_res = resolution
        in_ch_mult = (1,) + tuple(ch_mult)
        self.down = nn.ModuleList()
        for i_level in range(self.num_resolutions):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_in = ch * in_ch_mult[i_level]
            block_out = ch * ch_mult[i_level]
            for i_block in range(self.num_res_blocks):
                block.append(ResnetBlock(in_channels=block_in,
                                         out_channels=block_out,
                                         temb_channels=self.temb_ch,
                                         dropout=dropout))
                block_in = block_out
                if curr_res in attn_resolutions:
                    attn.append(AttnBlock(block_in))
            down = nn.Module()
            down.block = block
            down.attn = attn
            if i_level != self.num_resolutions - 1:
                down.downsample = Downsample(block_in, resamp_with_conv)
                curr_res = curr_res // 2
            self.down.append(down)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in,
                                       temb_channels=self.temb_ch, dropout=dropout)
        self.mid.attn_1 = AttnBlock(block_in)
        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in,
                                       temb_channels=self.temb_ch, dropout=dropout)

        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(block_in, 2 * z_channels if double_z else z_channels,
                                        kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        temb = None

        # downsampling
        hs = [self.conv_in(x)]
        for i_level in range(self.num_resolutions):
            for i_block in range(self.num_res_blocks):
                h = self.down[i_level].block[i_block](hs[-1], temb)
                if len(self.down[i_level].attn) > 0:
                    h = self.down[i_level].attn[i_block](h)
                hs.append(h)
            if i_level != self.num_resolutions - 1:
                hs.append(self.down[i_level].downsample(hs[-1]))

        # middle
        h = hs[-1]
        h = self.mid.block_1(h, temb)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h, temb)

        # end
        h = self.norm_out(h)
        h = nonlinearity(h)
        h = self.conv_out(h)
        return h


class Decoder(nn.Module):
    def __init__(self, *, ch, out_ch, ch_mult=(1, 2, 4, 8), num_res_blocks,
                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
                 resolution, z_channels, give_pre_end=False, zq_ch=None, add_conv=False, **ignorekwargs):
        super().__init__()
        self.ch = ch
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels
        self.give_pre_end = give_pre_end

        # compute in_ch_mult, block_in and curr_res at lowest res
        in_ch_mult = (1,) + tuple(ch_mult)
        block_in = ch * ch_mult[self.num_resolutions - 1]
        curr_res = resolution // 2 ** (self.num_resolutions - 1)
        self.z_shape = (1, z_channels, curr_res, curr_res)

        # z to block_in
        self.conv_in = torch.nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in,
                                       temb_channels=self.temb_ch, dropout=dropout,
                                       zq_ch=zq_ch, add_conv=add_conv)
        self.mid.attn_1 = AttnBlock(block_in, zq_ch, add_conv=add_conv)
        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in,
                                       temb_channels=self.temb_ch, dropout=dropout,
                                       zq_ch=zq_ch, add_conv=add_conv)

        # upsampling
        self.up = nn.ModuleList()
        for i_level in reversed(range(self.num_resolutions)):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_out = ch * ch_mult[i_level]
            for i_block in range(self.num_res_blocks + 1):
                block.append(ResnetBlock(in_channels=block_in,
                                         out_channels=block_out,
                                         temb_channels=self.temb_ch,
                                         dropout=dropout,
                                         zq_ch=zq_ch,
                                         add_conv=add_conv))
                block_in = block_out
                if curr_res in attn_resolutions:
                    attn.append(AttnBlock(block_in, zq_ch, add_conv=add_conv))
            up = nn.Module()
            up.block = block
            up.attn = attn
            if i_level != 0:
                up.upsample = Upsample(block_in, resamp_with_conv)
                curr_res = curr_res * 2
            self.up.insert(0, up)  # prepend to get consistent order

        # end
        self.norm_out = Normalize(block_in, zq_ch, add_conv=add_conv)
        self.conv_out = torch.nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)

    def forward(self, z, zq):
        # assert z.shape[1:] == self.z_shape[1:]
        self.last_z_shape = z.shape

        # timestep embedding
        temb = None

        # z to block_in
        h = self.conv_in(z)

        # middle
        h = self.mid.block_1(h, temb, zq)
        h = self.mid.attn_1(h, zq)
        h = self.mid.block_2(h, temb, zq)

        # upsampling
        for i_level in reversed(range(self.num_resolutions)):
            for i_block in range(self.num_res_blocks + 1):
                h = self.up[i_level].block[i_block](h, temb, zq)
                if len(self.up[i_level].attn) > 0:
                    h = self.up[i_level].attn[i_block](h, zq)
            if i_level != 0:
                h = self.up[i_level].upsample(h)

        # end
        if self.give_pre_end:
            return h

        h = self.norm_out(h, zq)
        h = nonlinearity(h)
        h = self.conv_out(h)
        return h


class MoVQ(nn.Module):

    def __init__(self, generator_params):
        super().__init__()
        z_channels = generator_params["z_channels"]
        self.encoder = Encoder(**generator_params)
        self.quant_conv = torch.nn.Conv2d(z_channels, z_channels, 1)
        self.post_quant_conv = torch.nn.Conv2d(z_channels, z_channels, 1)
        self.decoder = Decoder(zq_ch=z_channels, **generator_params)

        self.tile_sample_min_size = generator_params["tile_sample_min_size"]
        self.scale_factor = 8
        self.tile_latent_min_size = int(self.tile_sample_min_size / self.scale_factor)
        self.tile_overlap_factor_enc = generator_params["tile_overlap_factor_enc"]
        self.tile_overlap_factor_dec = generator_params["tile_overlap_factor_dec"]
        self.use_tiling = generator_params["use_tiling"]

    @torch.no_grad()
    def encode(self, x):
        if self.use_tiling and (
            x.shape[-1] > self.tile_sample_min_size
            or x.shape[-2] > self.tile_sample_min_size
        ):
            print('tiled_encode')
            return self.tiled_encode(x)
        h = self.encoder(x)
        h = self.quant_conv(h)
        return h

    @torch.no_grad()
    def decode(self, quant):
        if self.use_tiling and (
            quant.shape[-1] > self.tile_latent_min_size
            or quant.shape[-2] > self.tile_latent_min_size
        ):
            print('tiled_decode')
            return self.tiled_decode(quant)
        decoder_input = self.post_quant_conv(quant)
        decoded = self.decoder(decoder_input, quant)
        return decoded

    def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
        blend_extent = min(a.shape[2], b.shape[2], blend_extent)
        for y in range(blend_extent):
            b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (
                1 - y / blend_extent
            ) + b[:, :, y, :] * (y / blend_extent)
        return b

    def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
        blend_extent = min(a.shape[3], b.shape[3], blend_extent)
        for x in range(blend_extent):
            b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (
                1 - x / blend_extent
            ) + b[:, :, :, x] * (x / blend_extent)
        return b

    def tiled_encode(self, x):
        overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor_enc))
        blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor_enc)
        row_limit = self.tile_latent_min_size - blend_extent

        # Split the image into tiles and encode them separately.
        rows = []
        for i in tqdm(range(0, x.shape[2], overlap_size)):
            row = []
            for j in range(0, x.shape[3], overlap_size):
                tile = x[
                    :,
                    :,
                    i : i + self.tile_sample_min_size,
                    j : j + self.tile_sample_min_size,
                ]
                tile = self.encode(tile)
                row.append(tile)
            rows.append(row)
        result_rows = []
        for i, row in enumerate(rows):
            result_row = []
            for j, tile in enumerate(row):
                # blend the above tile and the left tile
                # to the current tile and add the current tile to the result row
                if i > 0:
                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
                if j > 0:
                    tile = self.blend_h(row[j - 1], tile, blend_extent)
                result_row.append(tile[:, :, :row_limit, :row_limit])
            result_rows.append(torch.cat(result_row, dim=3))

        h = torch.cat(result_rows, dim=2)
        return h

    def tiled_decode(self, z):
        overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor_dec))
        blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor_dec)
        row_limit = self.tile_sample_min_size - blend_extent

        # Split z into overlapping tiles and decode them separately.
        # The tiles have an overlap to avoid seams between tiles.
        rows = []
        for i in tqdm(range(0, z.shape[2], overlap_size)):
            row = []
            for j in range(0, z.shape[3], overlap_size):
                tile = z[
                    :,
                    :,
                    i : i + self.tile_latent_min_size,
                    j : j + self.tile_latent_min_size,
                ]
                decoded = self.decode(tile)
                row.append(decoded)
            rows.append(row)
        result_rows = []
        for i, row in enumerate(rows):
            result_row = []
            for j, tile in enumerate(row):
                # blend the above tile and the left tile
                # to the current tile and add the current tile to the result row
                if i > 0:
                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
                if j > 0:
                    tile = self.blend_h(row[j - 1], tile, blend_extent)
                result_row.append(tile[:, :, :row_limit, :row_limit])
            result_rows.append(torch.cat(result_row, dim=3))

        dec = torch.cat(result_rows, dim=2)
        return dec


def get_vae(conf):
    movq = MoVQ(conf.params)
    if conf.checkpoint is not None:
        movq_state_dict = torch.load(conf.checkpoint)
        movq.load_state_dict(movq_state_dict)
    movq = freeze(movq)
    return movq
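The seam removal in tiled_encode and tiled_decode comes entirely from blend_v and blend_h, which linearly cross-fade the overlapping border rows or columns of neighbouring tiles. A minimal numeric sketch of the same blending rule in isolation (the tensors and blend width are illustrative):

import torch

def blend_h(a, b, blend_extent):
    # Same rule as MoVQ.blend_h: fade from tile `a`'s right edge into tile `b`'s left edge.
    blend_extent = min(a.shape[3], b.shape[3], blend_extent)
    for x in range(blend_extent):
        b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
    return b

a = torch.zeros(1, 1, 1, 8)   # left tile
b = torch.ones(1, 1, 1, 8)    # right tile
print(blend_h(a, b, 4)[0, 0, 0])  # tensor([0.0000, 0.2500, 0.5000, 0.7500, 1.0000, 1.0000, 1.0000, 1.0000])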
KandiSuperRes/sr_pipeline.py
ADDED
@@ -0,0 +1,116 @@
import torch
import numpy as np
import PIL
import torchvision.transforms as T
import torch.nn.functional as F
from KandiSuperRes.model.unet import UNet
from KandiSuperRes.model.unet_sr import UNet as UNet_sr
from KandiSuperRes.movq import MoVQ
from KandiSuperRes.model.diffusion_sr import DPMSolver
from KandiSuperRes.model.diffusion_refine import BaseDiffusion, get_named_beta_schedule
from KandiSuperRes.model.diffusion_sr_turbo import BaseDiffusion as BaseDiffusion_turbo


class KandiSuperResPipeline:

    def __init__(
        self,
        scale: int,
        device: str,
        dtype: str,
        flash: bool,
        sr_model: UNet_sr,
        movq: MoVQ = None,
        refiner: UNet = None,
    ):
        self.device = device
        self.dtype = dtype
        self.scale = scale
        self.flash = flash
        self.to_pil = T.ToPILImage()
        self.image_transform = T.Compose([
            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
            T.ToTensor(),
            T.Lambda(lambda img: 2. * img - 1.),
        ])

        self.sr_model = sr_model
        self.movq = movq
        self.refiner = refiner

    def __call__(
        self,
        pil_image: PIL.Image.Image = None,
        steps: int = 5,
        view_batch_size: int = 15,
        seed: int = 0,
        refine=True
    ) -> PIL.Image.Image:

        if self.flash:
            betas_turbo = get_named_beta_schedule('linear', 1000)
            base_diffusion_sr = BaseDiffusion_turbo(betas_turbo)

            old_height = pil_image.size[1]
            old_width = pil_image.size[0]
            height = int(old_height - np.mod(old_height, 32))
            width = int(old_width - np.mod(old_width, 32))

            pil_image = pil_image.resize((width, height))
            lr_image = self.image_transform(pil_image).unsqueeze(0).to(self.device['sr_model'])

            sr_image = base_diffusion_sr.p_sample_loop(
                self.sr_model, (1, 3, height*self.scale, width*self.scale), self.device['sr_model'], self.dtype['sr_model'], lowres_img=lr_image
            )

            if refine:
                betas = get_named_beta_schedule('cosine', 1000)
                base_diffusion = BaseDiffusion(betas, 0.99)

                with torch.cuda.amp.autocast(dtype=self.dtype['movq']):
                    lr_image_latent = self.movq.encode(sr_image)

                pil_images = []
                context = torch.load('weights/context.pt').to(self.dtype['refiner'])
                context_mask = torch.load('weights/context_mask.pt').to(self.dtype['refiner'])

                with torch.no_grad():
                    with torch.cuda.amp.autocast(dtype=self.dtype['refiner']):
                        refiner_image = base_diffusion.refine_tiled(self.refiner, lr_image_latent, context, context_mask)

                with torch.cuda.amp.autocast(dtype=self.dtype['movq']):
                    refiner_image = self.movq.decode(refiner_image)
                refiner_image = torch.clip((refiner_image + 1.) / 2., 0., 1.)

                if old_height*self.scale != refiner_image.shape[2] or old_width*self.scale != refiner_image.shape[3]:
                    refiner_image = F.interpolate(refiner_image, [old_height*self.scale, old_width*self.scale], mode='bilinear', align_corners=True)
                refined_pil_image = self.to_pil(refiner_image[0])
                return refined_pil_image

            sr_image = torch.clip((sr_image + 1.) / 2., 0., 1.)
            if old_height*self.scale != sr_image.shape[2] or old_width*self.scale != sr_image.shape[3]:
                sr_image = F.interpolate(sr_image, [old_height*self.scale, old_width*self.scale], mode='bilinear', align_corners=True)
            pil_sr_image = self.to_pil(sr_image[0])
            return pil_sr_image

        else:
            base_diffusion = DPMSolver(steps)

            lr_image = self.image_transform(pil_image).unsqueeze(0).to(self.device)

            old_height = pil_image.size[1]
            old_width = pil_image.size[0]

            height = int(old_height + np.mod(old_height, 2)) * self.scale
            width = int(old_width + np.mod(old_width, 2)) * self.scale

            sr_image = base_diffusion.generate_panorama(height, width, self.device, self.dtype, steps,
                                                        self.sr_model, lowres_img=lr_image,
                                                        view_batch_size=view_batch_size, eta=0.0, seed=seed)

            sr_image = torch.clip((sr_image + 1.) / 2., 0., 1.)
            if old_height*self.scale != height or old_width*self.scale != width:
                sr_image = F.interpolate(sr_image, [old_height*self.scale, old_width*self.scale], mode='bilinear', align_corners=True)

            pil_sr_image = self.to_pil(sr_image[0])
            return pil_sr_image
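One detail of the flash path worth noting: the input is first snapped down to a multiple of 32 so the diffusion model sees a compatible size, and the result is later interpolated back so the caller always receives exactly (old_width*scale, old_height*scale). A small sketch of that size bookkeeping (the scale and input size are arbitrary assumptions):

import numpy as np
import PIL.Image

scale = 2                                      # illustrative upscale factor
pil_image = PIL.Image.new('RGB', (517, 389))   # arbitrary input size (width, height)

old_width, old_height = pil_image.size
height = int(old_height - np.mod(old_height, 32))   # 389 -> 384
width = int(old_width - np.mod(old_width, 32))      # 517 -> 512
pil_image = pil_image.resize((width, height))

# The SR result has shape (height*scale, width*scale); the pipeline then interpolates it
# back to (old_height*scale, old_width*scale), so the output size always matches the input.
print((width * scale, height * scale), '->', (old_width * scale, old_height * scale))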
KandiSuperRes/utils.py
ADDED
@@ -0,0 +1,9 @@
def freeze(model):
    for p in model.parameters():
        p.requires_grad = False
    return model


def unfreeze(model):
    for p in model.parameters():
        p.requires_grad = True
    return model
weights/context.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:92cc62ca3e341bd4ea03df187c06aceb43505e79107b6a2fef717a86051a6296
size 1049756
weights/context_mask.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6ff4ea52d9deb41f4732dd422fef48b1382247d9dbe0493d0a274e0d2e13591f
size 2229