diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/README.md b/README.md
index 933c45b636500fcf8e6c5be4ae2d9e1f28241504..6ad98b62c3f71b74834c4d6e4b74b4868b7ce237 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,13 @@
---
title: Stable Diffusion 3
-emoji: 😻
-colorFrom: yellow
-colorTo: red
+emoji: ⚡
+colorFrom: green
+colorTo: green
sdk: gradio
-sdk_version: 3.4.1
-app_file: app.py
+sdk_version: 3.1.7
+app_file: start.py
+datasets: [emotion]
+license: mit
pinned: false
---
diff --git a/app-bckp.py b/app-bckp.py
new file mode 100644
index 0000000000000000000000000000000000000000..06758de18805695e1312be35d35df709b6354bc7
--- /dev/null
+++ b/app-bckp.py
@@ -0,0 +1,872 @@
+import json
+import os, re
+import traceback
+import torch
+import numpy as np
+from omegaconf import OmegaConf
+from PIL import Image, ImageOps
+from tqdm import tqdm, trange
+from itertools import islice
+from einops import rearrange
+import time
+from pytorch_lightning import seed_everything
+from torch import autocast
+from contextlib import nullcontext
+from einops import rearrange, repeat
+from ldmlib.util import instantiate_from_config
+from optimizedSD.optimUtils import split_weighted_subprompts
+from transformers import logging
+
+from gfpgan import GFPGANer
+from basicsr.archs.rrdbnet_arch import RRDBNet
+from realesrgan import RealESRGANer
+
+import uuid
+
+AUTH_TOKEN = os.environ.get('AUTH_TOKEN')
+if not AUTH_TOKEN:
+ with open('/root/.huggingface/token') as f:
+        AUTH_TOKEN = f.readline().strip()  # strip the trailing newline from the token
+
+
+
+logging.set_verbosity_error()
+
+# consts
+config_yaml = "optimizedSD/v1-inference.yaml"
+filename_regex = re.compile('[^a-zA-Z0-9]')
+
+# api stuff
+from sd_internal import Request, Response, Image as ResponseImage
+import base64
+from io import BytesIO
+#from colorama import Fore
+
+# local
+stop_processing = False
+temp_images = {}
+
+ckpt_file = None
+gfpgan_file = None
+real_esrgan_file = None
+
+model = None
+modelCS = None
+modelFS = None
+model_gfpgan = None
+model_real_esrgan = None
+
+model_is_half = False
+model_fs_is_half = False
+device = None
+unet_bs = 1
+precision = 'autocast'
+sampler_plms = None
+sampler_ddim = None
+
+has_valid_gpu = False
+force_full_precision = False
+try:
+ gpu = torch.cuda.current_device()
+ gpu_name = torch.cuda.get_device_name(gpu)
+ print('GPU detected: ', gpu_name)
+
+ force_full_precision = ('nvidia' in gpu_name.lower() or 'geforce' in gpu_name.lower()) and (' 1660' in gpu_name or ' 1650' in gpu_name) # otherwise these NVIDIA cards create green images
+ if force_full_precision:
+ print('forcing full precision on NVIDIA 16xx cards, to avoid green images. GPU detected: ', gpu_name)
+
+ mem_free, mem_total = torch.cuda.mem_get_info(gpu)
+ mem_total /= float(10**9)
+ if mem_total < 3.0:
+ print("GPUs with less than 3 GB of VRAM are not compatible with Stable Diffusion")
+ raise Exception()
+
+ has_valid_gpu = True
+except:
+ print('WARNING: No compatible GPU found. Using the CPU, but this will be very slow!')
+ pass
+
+def load_model_ckpt(ckpt_to_use, device_to_use='cuda', turbo=False, unet_bs_to_use=1, precision_to_use='autocast'):
+ global ckpt_file, model, modelCS, modelFS, model_is_half, device, unet_bs, precision, model_fs_is_half
+
+ device = device_to_use if has_valid_gpu else 'cpu'
+ precision = precision_to_use if not force_full_precision else 'full'
+ unet_bs = unet_bs_to_use
+
+ unload_model()
+
+ if device == 'cpu':
+ precision = 'full'
+
+ sd = load_model_from_config(f"{ckpt_to_use}.ckpt")
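+    # split the checkpoint between the two halves of the optimizedSD UNet:
+    # keys under input_blocks / middle_block / time_embed go to "model1", the
+    # remaining "model" keys to "model2", so the two halves can be moved between
+    # CPU and GPU independently (see model.model1.to("cpu") further below)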
+ li, lo = [], []
+ for key, value in sd.items():
+ sp = key.split(".")
+ if (sp[0]) == "model":
+ if "input_blocks" in sp:
+ li.append(key)
+ elif "middle_block" in sp:
+ li.append(key)
+ elif "time_embed" in sp:
+ li.append(key)
+ else:
+ lo.append(key)
+ for key in li:
+ sd["model1." + key[6:]] = sd.pop(key)
+ for key in lo:
+ sd["model2." + key[6:]] = sd.pop(key)
+
+ config = OmegaConf.load(f"{config_yaml}")
+
+ model = instantiate_from_config(config.modelUNet)
+ _, _ = model.load_state_dict(sd, strict=False)
+ model.eval()
+ model.cdevice = device
+ model.unet_bs = unet_bs
+ model.turbo = turbo
+
+ modelCS = instantiate_from_config(config.modelCondStage)
+ _, _ = modelCS.load_state_dict(sd, strict=False)
+ modelCS.eval()
+ modelCS.cond_stage_model.device = device
+
+ modelFS = instantiate_from_config(config.modelFirstStage)
+ _, _ = modelFS.load_state_dict(sd, strict=False)
+ modelFS.eval()
+ del sd
+
+ if device != "cpu" and precision == "autocast":
+ model.half()
+ modelCS.half()
+ modelFS.half()
+ model_is_half = True
+ model_fs_is_half = True
+ else:
+ model_is_half = False
+ model_fs_is_half = False
+
+ ckpt_file = ckpt_to_use
+
+ print('loaded ', ckpt_file, 'to', device, 'precision', precision)
+
+def unload_model():
+ global model, modelCS, modelFS
+
+ if model is not None:
+ del model
+ del modelCS
+ del modelFS
+
+ model = None
+ modelCS = None
+ modelFS = None
+
+def load_model_gfpgan(gfpgan_to_use):
+ global gfpgan_file, model_gfpgan
+
+ if gfpgan_to_use is None:
+ return
+
+ gfpgan_file = gfpgan_to_use
+ model_path = gfpgan_to_use + ".pth"
+
+ if device == 'cpu':
+ model_gfpgan = GFPGANer(model_path=model_path, upscale=1, arch='clean', channel_multiplier=2, bg_upsampler=None, device=torch.device('cpu'))
+ else:
+ model_gfpgan = GFPGANer(model_path=model_path, upscale=1, arch='clean', channel_multiplier=2, bg_upsampler=None, device=torch.device('cuda'))
+
+ print('loaded ', gfpgan_to_use, 'to', device, 'precision', precision)
+
+def load_model_real_esrgan(real_esrgan_to_use):
+ global real_esrgan_file, model_real_esrgan
+
+ if real_esrgan_to_use is None:
+ return
+
+ real_esrgan_file = real_esrgan_to_use
+ model_path = real_esrgan_to_use + ".pth"
+
+ RealESRGAN_models = {
+ 'RealESRGAN_x4plus': RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4),
+ 'RealESRGAN_x4plus_anime_6B': RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=6, num_grow_ch=32, scale=4)
+ }
+
+ model_to_use = RealESRGAN_models[real_esrgan_to_use]
+
+ if device == 'cpu':
+ model_real_esrgan = RealESRGANer(scale=2, model_path=model_path, model=model_to_use, pre_pad=0, half=False) # cpu does not support half
+ model_real_esrgan.device = torch.device('cpu')
+ model_real_esrgan.model.to('cpu')
+ else:
+ model_real_esrgan = RealESRGANer(scale=2, model_path=model_path, model=model_to_use, pre_pad=0, half=model_is_half)
+
+ model_real_esrgan.model.name = real_esrgan_to_use
+
+ print('loaded ', real_esrgan_to_use, 'to', device, 'precision', precision)
+
+def mk_img(req: Request):
+ try:
+ yield from do_mk_img(req)
+ except Exception as e:
+ print(traceback.format_exc())
+
+ gc()
+
+ if device != "cpu":
+ modelFS.to("cpu")
+ modelCS.to("cpu")
+
+ model.model1.to("cpu")
+ model.model2.to("cpu")
+
+ gc()
+
+ yield json.dumps({
+ "status": 'failed',
+ "detail": str(e)
+ })
+
+def do_mk_img(req: Request):
+ global ckpt_file
+ global model, modelCS, modelFS, device
+ global model_gfpgan, model_real_esrgan
+ global stop_processing
+
+ stop_processing = False
+
+ res = Response()
+ res.request = req
+ res.images = []
+
+ temp_images.clear()
+
+ # custom model support:
+ # the req.use_stable_diffusion_model needs to be a valid path
+ # to the ckpt file (without the extension).
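+    # e.g. "<MODELS_DIR>/stable-diffusion/sd-v1-4" -> loads "sd-v1-4.ckpt";
+    # resolve_model_to_use() below builds such a path from a plain model name.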
+
+ needs_model_reload = False
+ ckpt_to_use = ckpt_file
+ if ckpt_to_use != req.use_stable_diffusion_model:
+ ckpt_to_use = req.use_stable_diffusion_model
+ needs_model_reload = True
+
+ model.turbo = req.turbo
+ if req.use_cpu:
+ if device != 'cpu':
+ device = 'cpu'
+
+ if model_is_half:
+ load_model_ckpt(ckpt_to_use, device)
+ needs_model_reload = False
+
+ load_model_gfpgan(gfpgan_file)
+ load_model_real_esrgan(real_esrgan_file)
+ else:
+ if has_valid_gpu:
+ prev_device = device
+ device = 'cuda'
+
+ if (precision == 'autocast' and (req.use_full_precision or not model_is_half)) or \
+ (precision == 'full' and not req.use_full_precision and not force_full_precision):
+
+ load_model_ckpt(ckpt_to_use, device, req.turbo, unet_bs, ('full' if req.use_full_precision else 'autocast'))
+ needs_model_reload = False
+
+ if prev_device != device:
+ load_model_gfpgan(gfpgan_file)
+ load_model_real_esrgan(real_esrgan_file)
+
+ if needs_model_reload:
+ load_model_ckpt(ckpt_to_use, device, req.turbo, unet_bs, precision)
+
+ if req.use_face_correction != gfpgan_file:
+ load_model_gfpgan(req.use_face_correction)
+
+ if req.use_upscale != real_esrgan_file:
+ load_model_real_esrgan(req.use_upscale)
+
+ model.cdevice = device
+ modelCS.cond_stage_model.device = device
+
+ opt_prompt = req.prompt
+ opt_seed = req.seed
+ opt_n_samples = req.num_outputs
+ opt_n_iter = 1
+ opt_scale = req.guidance_scale
+ opt_C = 4
+ opt_H = req.height
+ opt_W = req.width
+ opt_f = 8
+ opt_ddim_steps = req.num_inference_steps
+ opt_ddim_eta = 0.0
+ opt_strength = req.prompt_strength
+ opt_save_to_disk_path = req.save_to_disk_path
+ opt_init_img = req.init_image
+ opt_use_face_correction = req.use_face_correction
+ opt_use_upscale = req.use_upscale
+ opt_show_only_filtered = req.show_only_filtered_image
+ opt_format = req.output_format
+ opt_sampler_name = req.sampler
+
+ print(req.to_string(), '\n device', device)
+
+ print('\n\n Using precision:', precision)
+
+ seed_everything(opt_seed)
+
+ batch_size = opt_n_samples
+ prompt = opt_prompt
+ assert prompt is not None
+ data = [batch_size * [prompt]]
+
+ if precision == "autocast" and device != "cpu":
+ precision_scope = autocast
+ else:
+ precision_scope = nullcontext
+
+ mask = None
+
+ if req.init_image is None:
+ handler = _txt2img
+
+ init_latent = None
+ t_enc = None
+ else:
+ handler = _img2img
+
+ init_image = load_img(req.init_image, opt_W, opt_H)
+ init_image = init_image.to(device)
+
+ if device != "cpu" and precision == "autocast":
+ init_image = init_image.half()
+
+ modelFS.to(device)
+
+ init_image = repeat(init_image, '1 ... -> b ...', b=batch_size)
+ init_latent = modelFS.get_first_stage_encoding(modelFS.encode_first_stage(init_image)) # move to latent space
+
+ if req.mask is not None:
+ mask = load_mask(req.mask, opt_W, opt_H, init_latent.shape[2], init_latent.shape[3], True).to(device)
+ mask = mask[0][0].unsqueeze(0).repeat(4, 1, 1).unsqueeze(0)
+ mask = repeat(mask, '1 ... -> b ...', b=batch_size)
+
+ if device != "cpu" and precision == "autocast":
+ mask = mask.half()
+
+ move_fs_to_cpu()
+
+ assert 0. <= opt_strength <= 1., 'can only work with strength in [0.0, 1.0]'
+ t_enc = int(opt_strength * opt_ddim_steps)
+ print(f"target t_enc is {t_enc} steps")
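+        # e.g. prompt_strength 0.8 with 50 steps -> t_enc = 40: the init image is
+        # noised up to step 40 and then denoised, so a higher strength keeps less
+        # of the original image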
+
+ if opt_save_to_disk_path is not None:
+ session_out_path = os.path.join(opt_save_to_disk_path, req.session_id)
+ os.makedirs(session_out_path, exist_ok=True)
+ else:
+ session_out_path = None
+
+ seeds = ""
+ with torch.no_grad():
+ for n in trange(opt_n_iter, desc="Sampling"):
+ for prompts in tqdm(data, desc="data"):
+
+ with precision_scope("cuda"):
+ modelCS.to(device)
+ uc = None
+ if opt_scale != 1.0:
+ uc = modelCS.get_learned_conditioning(batch_size * [req.negative_prompt])
+ if isinstance(prompts, tuple):
+ prompts = list(prompts)
+
+ subprompts, weights = split_weighted_subprompts(prompts[0])
+ if len(subprompts) > 1:
+ c = torch.zeros_like(uc)
+ totalWeight = sum(weights)
+ # normalize each "sub prompt" and add it
+ for i in range(len(subprompts)):
+ weight = weights[i]
+ # if not skip_normalize:
+ weight = weight / totalWeight
+ c = torch.add(c, modelCS.get_learned_conditioning(subprompts[i]), alpha=weight)
+ else:
+ c = modelCS.get_learned_conditioning(prompts)
+
+ modelFS.to(device)
+
+ partial_x_samples = None
+ def img_callback(x_samples, i):
+ nonlocal partial_x_samples
+
+ partial_x_samples = x_samples
+
+ if req.stream_progress_updates:
+ n_steps = opt_ddim_steps if req.init_image is None else t_enc
+ progress = {"step": i, "total_steps": n_steps}
+
+ if req.stream_image_progress and i % 5 == 0:
+ partial_images = []
+
+ for i in range(batch_size):
+ x_samples_ddim = modelFS.decode_first_stage(x_samples[i].unsqueeze(0))
+ x_sample = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
+ x_sample = 255.0 * rearrange(x_sample[0].cpu().numpy(), "c h w -> h w c")
+ x_sample = x_sample.astype(np.uint8)
+ img = Image.fromarray(x_sample)
+ buf = BytesIO()
+ img.save(buf, format='JPEG')
+ buf.seek(0)
+
+ del img, x_sample, x_samples_ddim
+ # don't delete x_samples, it is used in the code that called this callback
+
+ temp_images[str(req.session_id) + '/' + str(i)] = buf
+ partial_images.append({'path': f'/image/tmp/{req.session_id}/{i}'})
+
+ progress['output'] = partial_images
+
+ yield json.dumps(progress)
+
+ if stop_processing:
+ raise UserInitiatedStop("User requested that we stop processing")
+
+ # run the handler
+ try:
+ if handler == _txt2img:
+ x_samples = _txt2img(opt_W, opt_H, opt_n_samples, opt_ddim_steps, opt_scale, None, opt_C, opt_f, opt_ddim_eta, c, uc, opt_seed, img_callback, mask, opt_sampler_name)
+ else:
+ x_samples = _img2img(init_latent, t_enc, batch_size, opt_scale, c, uc, opt_ddim_steps, opt_ddim_eta, opt_seed, img_callback, mask)
+
+ yield from x_samples
+
+ x_samples = partial_x_samples
+ except UserInitiatedStop:
+ if partial_x_samples is None:
+ continue
+
+ x_samples = partial_x_samples
+
+ print("saving images")
+ for i in range(batch_size):
+
+ x_samples_ddim = modelFS.decode_first_stage(x_samples[i].unsqueeze(0))
+ x_sample = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
+ x_sample = 255.0 * rearrange(x_sample[0].cpu().numpy(), "c h w -> h w c")
+ x_sample = x_sample.astype(np.uint8)
+ img = Image.fromarray(x_sample)
+
+ has_filters = (opt_use_face_correction is not None and opt_use_face_correction.startswith('GFPGAN')) or \
+ (opt_use_upscale is not None and opt_use_upscale.startswith('RealESRGAN'))
+
+ return_orig_img = not has_filters or not opt_show_only_filtered
+
+ if stop_processing:
+ return_orig_img = True
+
+ if opt_save_to_disk_path is not None:
+ prompt_flattened = filename_regex.sub('_', prompts[0])
+ prompt_flattened = prompt_flattened[:50]
+
+ img_id = str(uuid.uuid4())[-8:]
+
+ file_path = f"{prompt_flattened}_{img_id}"
+ img_out_path = os.path.join(session_out_path, f"{file_path}.{opt_format}")
+ meta_out_path = os.path.join(session_out_path, f"{file_path}.txt")
+
+ if return_orig_img:
+ save_image(img, img_out_path)
+
+ save_metadata(meta_out_path, prompts, opt_seed, opt_W, opt_H, opt_ddim_steps, opt_scale, opt_strength, opt_use_face_correction, opt_use_upscale, opt_sampler_name, req.negative_prompt, ckpt_file)
+
+ if return_orig_img:
+ img_data = img_to_base64_str(img, opt_format)
+ res_image_orig = ResponseImage(data=img_data, seed=opt_seed)
+ res.images.append(res_image_orig)
+
+ if opt_save_to_disk_path is not None:
+ res_image_orig.path_abs = img_out_path
+
+ del img
+
+ if has_filters and not stop_processing:
+ print('Applying filters..')
+
+ gc()
+ filters_applied = []
+
+ if opt_use_face_correction:
+ _, _, output = model_gfpgan.enhance(x_sample[:,:,::-1], has_aligned=False, only_center_face=False, paste_back=True)
+ x_sample = output[:,:,::-1]
+ filters_applied.append(opt_use_face_correction)
+
+ if opt_use_upscale:
+ output, _ = model_real_esrgan.enhance(x_sample[:,:,::-1])
+ x_sample = output[:,:,::-1]
+ filters_applied.append(opt_use_upscale)
+
+ filtered_image = Image.fromarray(x_sample)
+
+ filtered_img_data = img_to_base64_str(filtered_image, opt_format)
+ res_image_filtered = ResponseImage(data=filtered_img_data, seed=opt_seed)
+ res.images.append(res_image_filtered)
+
+ filters_applied = "_".join(filters_applied)
+
+ if opt_save_to_disk_path is not None:
+ filtered_img_out_path = os.path.join(session_out_path, f"{file_path}_{filters_applied}.{opt_format}")
+ save_image(filtered_image, filtered_img_out_path)
+ res_image_filtered.path_abs = filtered_img_out_path
+
+ del filtered_image
+
+ seeds += str(opt_seed) + ","
+ opt_seed += 1
+
+ move_fs_to_cpu()
+ gc()
+ del x_samples, x_samples_ddim, x_sample
+ print("memory_final = ", torch.cuda.memory_allocated() / 1e6)
+
+ print('Task completed')
+
+ yield json.dumps(res.json())
+
+def save_image(img, img_out_path):
+ try:
+ img.save(img_out_path)
+ except:
+ print('could not save the file', traceback.format_exc())
+
+def save_metadata(meta_out_path, prompts, opt_seed, opt_W, opt_H, opt_ddim_steps, opt_scale, opt_prompt_strength, opt_correct_face, opt_upscale, sampler_name, negative_prompt, ckpt_file):
+ metadata = f"{prompts[0]}\nWidth: {opt_W}\nHeight: {opt_H}\nSeed: {opt_seed}\nSteps: {opt_ddim_steps}\nGuidance Scale: {opt_scale}\nPrompt Strength: {opt_prompt_strength}\nUse Face Correction: {opt_correct_face}\nUse Upscaling: {opt_upscale}\nSampler: {sampler_name}\nNegative Prompt: {negative_prompt}\nStable Diffusion Model: {ckpt_file + '.ckpt'}"
+
+ try:
+ with open(meta_out_path, 'w') as f:
+ f.write(metadata)
+ except:
+ print('could not save the file', traceback.format_exc())
+
+def _txt2img(opt_W, opt_H, opt_n_samples, opt_ddim_steps, opt_scale, start_code, opt_C, opt_f, opt_ddim_eta, c, uc, opt_seed, img_callback, mask, sampler_name):
+ shape = [opt_n_samples, opt_C, opt_H // opt_f, opt_W // opt_f]
+
+ if device != "cpu":
+ mem = torch.cuda.memory_allocated() / 1e6
+ modelCS.to("cpu")
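+        # block until VRAM usage actually drops below the level measured before the move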
+ while torch.cuda.memory_allocated() / 1e6 >= mem:
+ time.sleep(1)
+
+ if sampler_name == 'ddim':
+ model.make_schedule(ddim_num_steps=opt_ddim_steps, ddim_eta=opt_ddim_eta, verbose=False)
+
+ samples_ddim = model.sample(
+ S=opt_ddim_steps,
+ conditioning=c,
+ seed=opt_seed,
+ shape=shape,
+ verbose=False,
+ unconditional_guidance_scale=opt_scale,
+ unconditional_conditioning=uc,
+ eta=opt_ddim_eta,
+ x_T=start_code,
+ img_callback=img_callback,
+ mask=mask,
+ sampler = sampler_name,
+ )
+
+ yield from samples_ddim
+
+def _img2img(init_latent, t_enc, batch_size, opt_scale, c, uc, opt_ddim_steps, opt_ddim_eta, opt_seed, img_callback, mask):
+ # encode (scaled latent)
+ z_enc = model.stochastic_encode(
+ init_latent,
+ torch.tensor([t_enc] * batch_size).to(device),
+ opt_seed,
+ opt_ddim_eta,
+ opt_ddim_steps,
+ )
+ x_T = None if mask is None else init_latent
+
+ # decode it
+ samples_ddim = model.sample(
+ t_enc,
+ c,
+ z_enc,
+ unconditional_guidance_scale=opt_scale,
+ unconditional_conditioning=uc,
+ img_callback=img_callback,
+ mask=mask,
+ x_T=x_T,
+ sampler = 'ddim'
+ )
+
+ yield from samples_ddim
+
+def move_fs_to_cpu():
+ if device != "cpu":
+ mem = torch.cuda.memory_allocated() / 1e6
+ modelFS.to("cpu")
+ while torch.cuda.memory_allocated() / 1e6 >= mem:
+ time.sleep(1)
+
+def gc():
+ if device == 'cpu':
+ return
+
+ torch.cuda.empty_cache()
+ torch.cuda.ipc_collect()
+
+# internal
+
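+# chunk(it, size): lazily split an iterable into size-length tuples,
+# e.g. chunk([1, 2, 3, 4, 5], 2) -> (1, 2), (3, 4), (5,)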
+def chunk(it, size):
+ it = iter(it)
+ return iter(lambda: tuple(islice(it, size)), ())
+
+
+def load_model_from_config(ckpt, verbose=False):
+ print(f"Loading model from {ckpt}")
+ pl_sd = torch.load(ckpt, map_location="cpu")
+ if "global_step" in pl_sd:
+ print(f"Global Step: {pl_sd['global_step']}")
+ sd = pl_sd["state_dict"]
+ return sd
+
+# utils
+class UserInitiatedStop(Exception):
+ pass
+
+def load_img(img_str, w0, h0):
+ image = base64_str_to_img(img_str).convert("RGB")
+ w, h = image.size
+ print(f"loaded input image of size ({w}, {h}) from base64")
+ if h0 is not None and w0 is not None:
+ h, w = h0, w0
+
+ w, h = map(lambda x: x - x % 64, (w, h)) # resize to integer multiple of 64
+ image = image.resize((w, h), resample=Image.Resampling.LANCZOS)
+ image = np.array(image).astype(np.float32) / 255.0
+ image = image[None].transpose(0, 3, 1, 2)
+ image = torch.from_numpy(image)
+ return 2.*image - 1.
+
+def load_mask(mask_str, h0, w0, newH, newW, invert=False):
+ image = base64_str_to_img(mask_str).convert("RGB")
+ w, h = image.size
+ print(f"loaded input mask of size ({w}, {h})")
+
+ if invert:
+ print("inverted")
+ image = ImageOps.invert(image)
+ # where_0, where_1 = np.where(image == 0), np.where(image == 255)
+ # image[where_0], image[where_1] = 255, 0
+
+ if h0 is not None and w0 is not None:
+ h, w = h0, w0
+
+ w, h = map(lambda x: x - x % 64, (w, h)) # resize to integer multiple of 64
+
+ print(f"New mask size ({w}, {h})")
+ image = image.resize((newW, newH), resample=Image.Resampling.LANCZOS)
+ image = np.array(image)
+
+ image = image.astype(np.float32) / 255.0
+ image = image[None].transpose(0, 3, 1, 2)
+ image = torch.from_numpy(image)
+ return image
+
+# https://stackoverflow.com/a/61114178
+def img_to_base64_str(img, output_format="PNG"):
+ buffered = BytesIO()
+ img.save(buffered, format=output_format)
+ buffered.seek(0)
+ img_byte = buffered.getvalue()
+    mime_type = "image/png" if output_format.lower() == "png" else "image/jpeg"
+    img_str = f"data:{mime_type};base64," + base64.b64encode(img_byte).decode()
+ return img_str
+
+def base64_str_to_img(img_str):
+    # strip the data-URL prefix regardless of the mime type (png or jpeg)
+    if ',' in img_str:
+        img_str = img_str.split(',', 1)[1]
+ data = base64.b64decode(img_str)
+ buffered = BytesIO(data)
+ img = Image.open(buffered)
+ return img
+
+
+
+
+from fastapi import FastAPI, HTTPException
+from fastapi.staticfiles import StaticFiles
+from starlette.responses import FileResponse, StreamingResponse
+from pydantic import BaseModel
+import logging
+
+from sd_internal import Request, Response
+
+import json
+import traceback
+
+import sys
+import os
+
+SD_DIR = os.getcwd()
+print('started in ', SD_DIR)
+
+#SD_UI_DIR = os.getenv('SD_UI_PATH', None)
+#sys.path.append(os.path.dirname(SD_UI_DIR))
+
+#CONFIG_DIR = os.path.abspath(os.path.join(SD_UI_DIR, '..', 'scripts'))
+MODELS_DIR = os.path.abspath(os.path.join(SD_DIR, '..', 'models'))
+
+OUTPUT_DIRNAME = "Stable Diffusion UI" # in the user's home folder
+
+app = FastAPI()
+
+model_loaded = False
+model_is_loading = False
+
+modifiers_cache = None
+outpath = os.path.join(os.path.expanduser("~"), OUTPUT_DIRNAME)
+
+# defaults from https://huggingface.co/blog/stable_diffusion
+class ImageRequest(BaseModel):
+ session_id: str = "session"
+ prompt: str = ""
+ negative_prompt: str = ""
+ init_image: str = None # base64
+ mask: str = None # base64
+ num_outputs: int = 1
+ num_inference_steps: int = 50
+ guidance_scale: float = 7.5
+ width: int = 512
+ height: int = 512
+ seed: int = 42
+ prompt_strength: float = 0.8
+ sampler: str = None # "ddim", "plms", "heun", "euler", "euler_a", "dpm2", "dpm2_a", "lms"
+ # allow_nsfw: bool = False
+ save_to_disk_path: str = None
+ turbo: bool = True
+ use_cpu: bool = False
+ use_full_precision: bool = False
+ use_face_correction: str = None # or "GFPGANv1.3"
+ use_upscale: str = None # or "RealESRGAN_x4plus" or "RealESRGAN_x4plus_anime_6B"
+ use_stable_diffusion_model: str = "sd-v1-4"
+ show_only_filtered_image: bool = False
+ output_format: str = "jpeg" # or "png"
+
+ stream_progress_updates: bool = False
+ stream_image_progress: bool = False
+
+from starlette.responses import FileResponse, StreamingResponse
+
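+# e.g. resolve_model_to_use('sd-v1-4') -> '<MODELS_DIR>/stable-diffusion/sd-v1-4',
+# falling back to '<SD_DIR>/sd-v1-4' for legacy installs that keep the ckpt next to the code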
+def resolve_model_to_use(model_name):
+ if model_name in ('sd-v1-4', 'custom-model'):
+ model_path = os.path.join(MODELS_DIR, 'stable-diffusion', model_name)
+
+ legacy_model_path = os.path.join(SD_DIR, model_name)
+ if not os.path.exists(model_path + '.ckpt') and os.path.exists(legacy_model_path + '.ckpt'):
+ model_path = legacy_model_path
+ else:
+ model_path = os.path.join(MODELS_DIR, 'stable-diffusion', model_name)
+
+ return model_path
+
+def image(req : ImageRequest):
+ r = Request()
+ r.session_id = req.session_id
+ r.prompt = req.prompt
+ r.negative_prompt = req.negative_prompt
+ r.init_image = req.init_image
+ r.mask = req.mask
+ r.num_outputs = req.num_outputs
+ r.num_inference_steps = req.num_inference_steps
+ r.guidance_scale = req.guidance_scale
+ r.width = req.width
+ r.height = req.height
+ r.seed = req.seed
+ r.prompt_strength = req.prompt_strength
+ r.sampler = req.sampler
+ # r.allow_nsfw = req.allow_nsfw
+ r.turbo = req.turbo
+ r.use_cpu = req.use_cpu
+ r.use_full_precision = req.use_full_precision
+ r.save_to_disk_path = req.save_to_disk_path
+    r.use_upscale = req.use_upscale
+ r.use_face_correction = req.use_face_correction
+ r.show_only_filtered_image = req.show_only_filtered_image
+ r.output_format = req.output_format
+
+ r.stream_progress_updates = True # the underlying implementation only supports streaming
+ r.stream_image_progress = req.stream_image_progress
+
+ r.use_stable_diffusion_model = resolve_model_to_use(req.use_stable_diffusion_model)
+
+ save_model_to_config(req.use_stable_diffusion_model)
+
+ try:
+ if not req.stream_progress_updates:
+ r.stream_image_progress = False
+
+ res = mk_img(r)
+
+ if req.stream_progress_updates:
+ return StreamingResponse(res, media_type='application/json')
+ else: # compatibility mode: buffer the streaming responses, and return the last one
+ last_result = None
+
+ for result in res:
+ last_result = result
+
+ return json.loads(last_result)
+ except Exception as e:
+ print(traceback.format_exc())
+ return HTTPException(status_code=500, detail=str(e))
+
+
+def getConfig():
+ try:
+ config_json_path = os.path.join(CONFIG_DIR, 'config.json')
+
+ if not os.path.exists(config_json_path):
+ return {}
+
+ with open(config_json_path, 'r') as f:
+ return json.load(f)
+ except Exception as e:
+ return {}
+
+# needs to support the legacy installations
+def get_initial_model_to_load():
+ custom_weight_path = os.path.join(SD_DIR, 'custom-model.ckpt')
+ ckpt_to_use = "sd-v1-4" if not os.path.exists(custom_weight_path) else "custom-model"
+
+ ckpt_to_use = os.path.join(SD_DIR, ckpt_to_use)
+
+ config = getConfig()
+ if 'model' in config and 'stable-diffusion' in config['model']:
+ model_name = config['model']['stable-diffusion']
+ model_path = resolve_model_to_use(model_name)
+
+ if os.path.exists(model_path + '.ckpt'):
+ ckpt_to_use = model_path
+ else:
+ print('Could not find the configured custom model at:', model_path + '.ckpt', '. Using the default one:', ckpt_to_use + '.ckpt')
+
+ return ckpt_to_use
+
+
+#model_is_loading = True
+#load_model_ckpt(get_initial_model_to_load(), "cuda")
+#model_loaded = True
+#model_is_loading = False
+
+#mk_img(ImageRequest)
diff --git a/app.bckp2.py b/app.bckp2.py
new file mode 100644
index 0000000000000000000000000000000000000000..20ad607b4447558923ce3dad198bf2782bb92112
--- /dev/null
+++ b/app.bckp2.py
@@ -0,0 +1,330 @@
+# app.py
+import uvicorn
+
+import json
+import traceback
+
+import sys
+import os
+
+SD_DIR = os.getcwd()
+print('started in ', SD_DIR)
+
+SD_UI_DIR = './ui'
+#sys.path.append(os.path.dirname(SD_UI_DIR))
+
+#CONFIG_DIR = os.path.abspath(os.path.join(SD_UI_DIR, '..', 'scripts'))
+#MODELS_DIR = os.path.abspath(os.path.join(SD_DIR, '..', 'models'))
+
+OUTPUT_DIRNAME = "Stable Diffusion UI" # in the user's home folder
+
+from fastapi import FastAPI, HTTPException
+from fastapi.staticfiles import StaticFiles
+from starlette.responses import FileResponse, StreamingResponse
+from pydantic import BaseModel
+import logging
+
+from sd_internal import Request, Response
+
+app = FastAPI()
+
+model_loaded = False
+model_is_loading = False
+
+modifiers_cache = None
+outpath = os.path.join(os.path.expanduser("~"), OUTPUT_DIRNAME)
+
+# don't show access log entries for URLs that start with the given prefix
+ACCESS_LOG_SUPPRESS_PATH_PREFIXES = ['/ping', '/modifier-thumbnails']
+
+app.mount('/media', StaticFiles(directory=os.path.join(SD_UI_DIR, 'media/')), name="media")
+
+# defaults from https://huggingface.co/blog/stable_diffusion
+class ImageRequest(BaseModel):
+ session_id: str = "session"
+ prompt: str = ""
+ negative_prompt: str = ""
+ init_image: str = None # base64
+ mask: str = None # base64
+ num_outputs: int = 1
+ num_inference_steps: int = 50
+ guidance_scale: float = 7.5
+ width: int = 512
+ height: int = 512
+ seed: int = 42
+ prompt_strength: float = 0.8
+ sampler: str = None # "ddim", "plms", "heun", "euler", "euler_a", "dpm2", "dpm2_a", "lms"
+ # allow_nsfw: bool = False
+ save_to_disk_path: str = None
+ turbo: bool = True
+ use_cpu: bool = False
+ use_full_precision: bool = False
+ use_face_correction: str = None # or "GFPGANv1.3"
+ use_upscale: str = None # or "RealESRGAN_x4plus" or "RealESRGAN_x4plus_anime_6B"
+ use_stable_diffusion_model: str = "sd-v1-4"
+ show_only_filtered_image: bool = False
+ output_format: str = "jpeg" # or "png"
+
+ stream_progress_updates: bool = False
+ stream_image_progress: bool = False
+
+class SetAppConfigRequest(BaseModel):
+ update_branch: str = "main"
+
+@app.get('/')
+def read_root():
+ headers = {"Cache-Control": "no-cache, no-store, must-revalidate", "Pragma": "no-cache", "Expires": "0"}
+ return FileResponse(os.path.join(SD_UI_DIR, 'index.html'), headers=headers)
+
+@app.get('/ping')
+async def ping():
+ global model_loaded, model_is_loading
+
+ try:
+ if model_loaded:
+ return {'OK'}
+
+ if model_is_loading:
+ return {'ERROR'}
+
+ model_is_loading = True
+
+ from sd_internal import runtime
+
+ runtime.load_model_ckpt(ckpt_to_use=get_initial_model_to_load())
+
+ model_loaded = True
+ model_is_loading = False
+
+ return {'OK'}
+ except Exception as e:
+ print(traceback.format_exc())
+ return HTTPException(status_code=500, detail=str(e))
+
+# needs to support the legacy installations
+def get_initial_model_to_load():
+ custom_weight_path = os.path.join(SD_DIR, 'custom-model.ckpt')
+ ckpt_to_use = "sd-v1-4" if not os.path.exists(custom_weight_path) else "custom-model"
+
+ ckpt_to_use = os.path.join(SD_DIR, ckpt_to_use)
+
+ config = getConfig()
+ if 'model' in config and 'stable-diffusion' in config['model']:
+ model_name = config['model']['stable-diffusion']
+ model_path = resolve_model_to_use(model_name)
+
+ if os.path.exists(model_path + '.ckpt'):
+ ckpt_to_use = model_path
+ else:
+ print('Could not find the configured custom model at:', model_path + '.ckpt', '. Using the default one:', ckpt_to_use + '.ckpt')
+
+ return ckpt_to_use
+
+def resolve_model_to_use(model_name):
+ if model_name in ('sd-v1-4', 'custom-model'):
+ model_path = os.path.join(MODELS_DIR, 'stable-diffusion', model_name)
+
+ legacy_model_path = os.path.join(SD_DIR, model_name)
+ if not os.path.exists(model_path + '.ckpt') and os.path.exists(legacy_model_path + '.ckpt'):
+ model_path = legacy_model_path
+ else:
+ model_path = os.path.join(MODELS_DIR, 'stable-diffusion', model_name)
+
+ return model_path
+
+def save_model_to_config(model_name):
+ config = getConfig()
+ if 'model' not in config:
+ config['model'] = {}
+
+ config['model']['stable-diffusion'] = model_name
+
+ setConfig(config)
+
+@app.post('/image')
+def image(req : ImageRequest):
+ from sd_internal import runtime
+
+ r = Request()
+ r.session_id = req.session_id
+ r.prompt = req.prompt
+ r.negative_prompt = req.negative_prompt
+ r.init_image = req.init_image
+ r.mask = req.mask
+ r.num_outputs = req.num_outputs
+ r.num_inference_steps = req.num_inference_steps
+ r.guidance_scale = req.guidance_scale
+ r.width = req.width
+ r.height = req.height
+ r.seed = req.seed
+ r.prompt_strength = req.prompt_strength
+ r.sampler = req.sampler
+ # r.allow_nsfw = req.allow_nsfw
+ r.turbo = req.turbo
+ r.use_cpu = req.use_cpu
+ r.use_full_precision = req.use_full_precision
+ r.save_to_disk_path = req.save_to_disk_path
+    r.use_upscale = req.use_upscale
+ r.use_face_correction = req.use_face_correction
+ r.show_only_filtered_image = req.show_only_filtered_image
+ r.output_format = req.output_format
+
+ r.stream_progress_updates = True # the underlying implementation only supports streaming
+ r.stream_image_progress = req.stream_image_progress
+
+ r.use_stable_diffusion_model = resolve_model_to_use(req.use_stable_diffusion_model)
+
+ save_model_to_config(req.use_stable_diffusion_model)
+
+ try:
+ if not req.stream_progress_updates:
+ r.stream_image_progress = False
+
+ res = runtime.mk_img(r)
+
+ if req.stream_progress_updates:
+ return StreamingResponse(res, media_type='application/json')
+ else: # compatibility mode: buffer the streaming responses, and return the last one
+ last_result = None
+
+ for result in res:
+ last_result = result
+
+ return json.loads(last_result)
+ except Exception as e:
+ print(traceback.format_exc())
+ return HTTPException(status_code=500, detail=str(e))
+
+@app.get('/image/stop')
+def stop():
+ try:
+ if model_is_loading:
+ return {'ERROR'}
+
+ from sd_internal import runtime
+ runtime.stop_processing = True
+
+ return {'OK'}
+ except Exception as e:
+ print(traceback.format_exc())
+ return HTTPException(status_code=500, detail=str(e))
+
+@app.get('/image/tmp/{session_id}/{img_id}')
+def get_image(session_id, img_id):
+ from sd_internal import runtime
+ buf = runtime.temp_images[session_id + '/' + img_id]
+ buf.seek(0)
+ return StreamingResponse(buf, media_type='image/jpeg')
+
+@app.post('/app_config')
+async def setAppConfig(req : SetAppConfigRequest):
+ try:
+ config = {
+ 'update_branch': req.update_branch
+ }
+
+ config_json_str = json.dumps(config)
+ config_bat_str = f'@set update_branch={req.update_branch}'
+ config_sh_str = f'export update_branch={req.update_branch}'
+
+ config_json_path = os.path.join(CONFIG_DIR, 'config.json')
+ config_bat_path = os.path.join(CONFIG_DIR, 'config.bat')
+ config_sh_path = os.path.join(CONFIG_DIR, 'config.sh')
+
+ with open(config_json_path, 'w') as f:
+ f.write(config_json_str)
+
+ with open(config_bat_path, 'w') as f:
+ f.write(config_bat_str)
+
+ with open(config_sh_path, 'w') as f:
+ f.write(config_sh_str)
+
+ return {'OK'}
+ except Exception as e:
+ print(traceback.format_exc())
+ return HTTPException(status_code=500, detail=str(e))
+
+@app.get('/app_config')
+def getAppConfig():
+ try:
+ config_json_path = os.path.join(CONFIG_DIR, 'config.json')
+
+ if not os.path.exists(config_json_path):
+ return HTTPException(status_code=500, detail="No config file")
+
+ with open(config_json_path, 'r') as f:
+ return json.load(f)
+ except Exception as e:
+ print(traceback.format_exc())
+ return HTTPException(status_code=500, detail=str(e))
+
+def getConfig():
+ try:
+ config_json_path = os.path.join(CONFIG_DIR, 'config.json')
+
+ if not os.path.exists(config_json_path):
+ return {}
+
+ with open(config_json_path, 'r') as f:
+ return json.load(f)
+ except Exception as e:
+ return {}
+
+def setConfig(config):
+ try:
+ config_json_path = os.path.join(CONFIG_DIR, 'config.json')
+
+ with open(config_json_path, 'w') as f:
+ return json.dump(config, f)
+ except:
+ print(traceback.format_exc())
+
+@app.get('/models')
+def getModels():
+ models = {
+ 'active': {
+ 'stable-diffusion': 'sd-v1-4',
+ },
+ 'options': {
+ 'stable-diffusion': ['sd-v1-4'],
+ },
+ }
+
+ # custom models
+ sd_models_dir = os.path.join(MODELS_DIR, 'stable-diffusion')
+ for file in os.listdir(sd_models_dir):
+ if file.endswith('.ckpt'):
+ model_name = os.path.splitext(file)[0]
+ models['options']['stable-diffusion'].append(model_name)
+
+ # legacy
+ custom_weight_path = os.path.join(SD_DIR, 'custom-model.ckpt')
+ if os.path.exists(custom_weight_path):
+ models['active']['stable-diffusion'] = 'custom-model'
+ models['options']['stable-diffusion'].append('custom-model')
+
+ config = getConfig()
+ if 'model' in config and 'stable-diffusion' in config['model']:
+ models['active']['stable-diffusion'] = config['model']['stable-diffusion']
+
+ return models
+
+@app.get('/modifiers.json')
+def read_modifiers():
+ headers = {"Cache-Control": "no-cache, no-store, must-revalidate", "Pragma": "no-cache", "Expires": "0"}
+ return FileResponse(os.path.join(SD_UI_DIR, 'modifiers.json'), headers=headers)
+
+@app.get('/output_dir')
+def read_home_dir():
+ return {outpath}
+
+# don't log certain requests
+class LogSuppressFilter(logging.Filter):
+ def filter(self, record: logging.LogRecord) -> bool:
+ path = record.getMessage()
+ for prefix in ACCESS_LOG_SUPPRESS_PATH_PREFIXES:
+ if path.find(prefix) != -1:
+ return False
+
+ return True
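+
+# Note: the filter above only takes effect once it is attached to uvicorn's access
+# logger, e.g. (assuming the standard uvicorn logger name):
+#   logging.getLogger('uvicorn.access').addFilter(LogSuppressFilter())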
diff --git a/ldmlib/__pycache__/util.cpython-38.pyc b/ldmlib/__pycache__/util.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..809aa03232f65c2bd9e9862e2ba8b9c32c43d504
Binary files /dev/null and b/ldmlib/__pycache__/util.cpython-38.pyc differ
diff --git a/ldmlib/data/__init__.py b/ldmlib/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ldmlib/data/base.py b/ldmlib/data/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..b196c2f7aa583a3e8bc4aad9f943df0c4dae0da7
--- /dev/null
+++ b/ldmlib/data/base.py
@@ -0,0 +1,23 @@
+from abc import abstractmethod
+from torch.utils.data import Dataset, ConcatDataset, ChainDataset, IterableDataset
+
+
+class Txt2ImgIterableBaseDataset(IterableDataset):
+ '''
+ Define an interface to make the IterableDatasets for text2img data chainable
+ '''
+ def __init__(self, num_records=0, valid_ids=None, size=256):
+ super().__init__()
+ self.num_records = num_records
+ self.valid_ids = valid_ids
+ self.sample_ids = valid_ids
+ self.size = size
+
+ print(f'{self.__class__.__name__} dataset contains {self.__len__()} examples.')
+
+ def __len__(self):
+ return self.num_records
+
+ @abstractmethod
+ def __iter__(self):
+ pass
\ No newline at end of file
diff --git a/ldmlib/data/imagenet.py b/ldmlib/data/imagenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..27ab0d4791ef2cc909fb335f061a5cccffff6f82
--- /dev/null
+++ b/ldmlib/data/imagenet.py
@@ -0,0 +1,394 @@
+import os, yaml, pickle, shutil, tarfile, glob
+import cv2
+import albumentations
+import PIL
+import numpy as np
+import torchvision.transforms.functional as TF
+from omegaconf import OmegaConf
+from functools import partial
+from PIL import Image
+from tqdm import tqdm
+from torch.utils.data import Dataset, Subset
+
+import taming.data.utils as tdu
+from taming.data.imagenet import str_to_indices, give_synsets_from_indices, download, retrieve
+from taming.data.imagenet import ImagePaths
+
+from ldmlib.modules.image_degradation import degradation_fn_bsr, degradation_fn_bsr_light
+
+
+def synset2idx(path_to_yaml="data/index_synset.yaml"):
+ with open(path_to_yaml) as f:
+        di2s = yaml.safe_load(f)  # safe_load: PyYAML >= 6 requires an explicit Loader
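+    # invert the yaml mapping {class_index: synset_id} into {synset_id: class_index}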
+ return dict((v,k) for k,v in di2s.items())
+
+
+class ImageNetBase(Dataset):
+ def __init__(self, config=None):
+ self.config = config or OmegaConf.create()
+ if not type(self.config)==dict:
+ self.config = OmegaConf.to_container(self.config)
+ self.keep_orig_class_label = self.config.get("keep_orig_class_label", False)
+ self.process_images = True # if False we skip loading & processing images and self.data contains filepaths
+ self._prepare()
+ self._prepare_synset_to_human()
+ self._prepare_idx_to_synset()
+ self._prepare_human_to_integer_label()
+ self._load()
+
+ def __len__(self):
+ return len(self.data)
+
+ def __getitem__(self, i):
+ return self.data[i]
+
+ def _prepare(self):
+ raise NotImplementedError()
+
+ def _filter_relpaths(self, relpaths):
+ ignore = set([
+ "n06596364_9591.JPEG",
+ ])
+ relpaths = [rpath for rpath in relpaths if not rpath.split("/")[-1] in ignore]
+ if "sub_indices" in self.config:
+ indices = str_to_indices(self.config["sub_indices"])
+ synsets = give_synsets_from_indices(indices, path_to_yaml=self.idx2syn) # returns a list of strings
+ self.synset2idx = synset2idx(path_to_yaml=self.idx2syn)
+ files = []
+ for rpath in relpaths:
+ syn = rpath.split("/")[0]
+ if syn in synsets:
+ files.append(rpath)
+ return files
+ else:
+ return relpaths
+
+ def _prepare_synset_to_human(self):
+ SIZE = 2655750
+ URL = "https://heibox.uni-heidelberg.de/f/9f28e956cd304264bb82/?dl=1"
+ self.human_dict = os.path.join(self.root, "synset_human.txt")
+ if (not os.path.exists(self.human_dict) or
+ not os.path.getsize(self.human_dict)==SIZE):
+ download(URL, self.human_dict)
+
+ def _prepare_idx_to_synset(self):
+ URL = "https://heibox.uni-heidelberg.de/f/d835d5b6ceda4d3aa910/?dl=1"
+ self.idx2syn = os.path.join(self.root, "index_synset.yaml")
+ if (not os.path.exists(self.idx2syn)):
+ download(URL, self.idx2syn)
+
+ def _prepare_human_to_integer_label(self):
+ URL = "https://heibox.uni-heidelberg.de/f/2362b797d5be43b883f6/?dl=1"
+ self.human2integer = os.path.join(self.root, "imagenet1000_clsidx_to_labels.txt")
+ if (not os.path.exists(self.human2integer)):
+ download(URL, self.human2integer)
+ with open(self.human2integer, "r") as f:
+ lines = f.read().splitlines()
+ assert len(lines) == 1000
+ self.human2integer_dict = dict()
+ for line in lines:
+ value, key = line.split(":")
+ self.human2integer_dict[key] = int(value)
+
+ def _load(self):
+ with open(self.txt_filelist, "r") as f:
+ self.relpaths = f.read().splitlines()
+ l1 = len(self.relpaths)
+ self.relpaths = self._filter_relpaths(self.relpaths)
+ print("Removed {} files from filelist during filtering.".format(l1 - len(self.relpaths)))
+
+ self.synsets = [p.split("/")[0] for p in self.relpaths]
+ self.abspaths = [os.path.join(self.datadir, p) for p in self.relpaths]
+
+ unique_synsets = np.unique(self.synsets)
+ class_dict = dict((synset, i) for i, synset in enumerate(unique_synsets))
+ if not self.keep_orig_class_label:
+ self.class_labels = [class_dict[s] for s in self.synsets]
+ else:
+ self.class_labels = [self.synset2idx[s] for s in self.synsets]
+
+ with open(self.human_dict, "r") as f:
+ human_dict = f.read().splitlines()
+ human_dict = dict(line.split(maxsplit=1) for line in human_dict)
+
+ self.human_labels = [human_dict[s] for s in self.synsets]
+
+ labels = {
+ "relpath": np.array(self.relpaths),
+ "synsets": np.array(self.synsets),
+ "class_label": np.array(self.class_labels),
+ "human_label": np.array(self.human_labels),
+ }
+
+ if self.process_images:
+ self.size = retrieve(self.config, "size", default=256)
+ self.data = ImagePaths(self.abspaths,
+ labels=labels,
+ size=self.size,
+ random_crop=self.random_crop,
+ )
+ else:
+ self.data = self.abspaths
+
+
+class ImageNetTrain(ImageNetBase):
+ NAME = "ILSVRC2012_train"
+ URL = "http://www.image-net.org/challenges/LSVRC/2012/"
+ AT_HASH = "a306397ccf9c2ead27155983c254227c0fd938e2"
+ FILES = [
+ "ILSVRC2012_img_train.tar",
+ ]
+ SIZES = [
+ 147897477120,
+ ]
+
+ def __init__(self, process_images=True, data_root=None, **kwargs):
+ self.process_images = process_images
+ self.data_root = data_root
+ super().__init__(**kwargs)
+
+ def _prepare(self):
+ if self.data_root:
+ self.root = os.path.join(self.data_root, self.NAME)
+ else:
+ cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
+ self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
+
+ self.datadir = os.path.join(self.root, "data")
+ self.txt_filelist = os.path.join(self.root, "filelist.txt")
+ self.expected_length = 1281167
+ self.random_crop = retrieve(self.config, "ImageNetTrain/random_crop",
+ default=True)
+ if not tdu.is_prepared(self.root):
+ # prep
+ print("Preparing dataset {} in {}".format(self.NAME, self.root))
+
+ datadir = self.datadir
+ if not os.path.exists(datadir):
+ path = os.path.join(self.root, self.FILES[0])
+ if not os.path.exists(path) or not os.path.getsize(path)==self.SIZES[0]:
+ import academictorrents as at
+ atpath = at.get(self.AT_HASH, datastore=self.root)
+ assert atpath == path
+
+ print("Extracting {} to {}".format(path, datadir))
+ os.makedirs(datadir, exist_ok=True)
+ with tarfile.open(path, "r:") as tar:
+ tar.extractall(path=datadir)
+
+ print("Extracting sub-tars.")
+ subpaths = sorted(glob.glob(os.path.join(datadir, "*.tar")))
+ for subpath in tqdm(subpaths):
+ subdir = subpath[:-len(".tar")]
+ os.makedirs(subdir, exist_ok=True)
+ with tarfile.open(subpath, "r:") as tar:
+ tar.extractall(path=subdir)
+
+ filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
+ filelist = [os.path.relpath(p, start=datadir) for p in filelist]
+ filelist = sorted(filelist)
+ filelist = "\n".join(filelist)+"\n"
+ with open(self.txt_filelist, "w") as f:
+ f.write(filelist)
+
+ tdu.mark_prepared(self.root)
+
+
+class ImageNetValidation(ImageNetBase):
+ NAME = "ILSVRC2012_validation"
+ URL = "http://www.image-net.org/challenges/LSVRC/2012/"
+ AT_HASH = "5d6d0df7ed81efd49ca99ea4737e0ae5e3a5f2e5"
+ VS_URL = "https://heibox.uni-heidelberg.de/f/3e0f6e9c624e45f2bd73/?dl=1"
+ FILES = [
+ "ILSVRC2012_img_val.tar",
+ "validation_synset.txt",
+ ]
+ SIZES = [
+ 6744924160,
+ 1950000,
+ ]
+
+ def __init__(self, process_images=True, data_root=None, **kwargs):
+ self.data_root = data_root
+ self.process_images = process_images
+ super().__init__(**kwargs)
+
+ def _prepare(self):
+ if self.data_root:
+ self.root = os.path.join(self.data_root, self.NAME)
+ else:
+ cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
+ self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
+ self.datadir = os.path.join(self.root, "data")
+ self.txt_filelist = os.path.join(self.root, "filelist.txt")
+ self.expected_length = 50000
+ self.random_crop = retrieve(self.config, "ImageNetValidation/random_crop",
+ default=False)
+ if not tdu.is_prepared(self.root):
+ # prep
+ print("Preparing dataset {} in {}".format(self.NAME, self.root))
+
+ datadir = self.datadir
+ if not os.path.exists(datadir):
+ path = os.path.join(self.root, self.FILES[0])
+ if not os.path.exists(path) or not os.path.getsize(path)==self.SIZES[0]:
+ import academictorrents as at
+ atpath = at.get(self.AT_HASH, datastore=self.root)
+ assert atpath == path
+
+ print("Extracting {} to {}".format(path, datadir))
+ os.makedirs(datadir, exist_ok=True)
+ with tarfile.open(path, "r:") as tar:
+ tar.extractall(path=datadir)
+
+ vspath = os.path.join(self.root, self.FILES[1])
+ if not os.path.exists(vspath) or not os.path.getsize(vspath)==self.SIZES[1]:
+ download(self.VS_URL, vspath)
+
+ with open(vspath, "r") as f:
+ synset_dict = f.read().splitlines()
+ synset_dict = dict(line.split() for line in synset_dict)
+
+ print("Reorganizing into synset folders")
+ synsets = np.unique(list(synset_dict.values()))
+ for s in synsets:
+ os.makedirs(os.path.join(datadir, s), exist_ok=True)
+ for k, v in synset_dict.items():
+ src = os.path.join(datadir, k)
+ dst = os.path.join(datadir, v)
+ shutil.move(src, dst)
+
+ filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
+ filelist = [os.path.relpath(p, start=datadir) for p in filelist]
+ filelist = sorted(filelist)
+ filelist = "\n".join(filelist)+"\n"
+ with open(self.txt_filelist, "w") as f:
+ f.write(filelist)
+
+ tdu.mark_prepared(self.root)
+
+
+
+class ImageNetSR(Dataset):
+ def __init__(self, size=None,
+ degradation=None, downscale_f=4, min_crop_f=0.5, max_crop_f=1.,
+ random_crop=True):
+ """
+        ImageNet Super-resolution Dataloader
+        Performs the following ops in order:
+        1. crops a patch of size s from the image (random or center crop)
+        2. resizes the crop to `size` with cv2 area interpolation
+        3. degrades the resized crop with degradation_fn
+
+ :param size: resizing to size after cropping
+ :param degradation: degradation_fn, e.g. cv_bicubic or bsrgan_light
+ :param downscale_f: Low Resolution Downsample factor
+ :param min_crop_f: determines crop size s,
+ where s = c * min_img_side_len with c sampled from interval (min_crop_f, max_crop_f)
+ :param max_crop_f: ""
+ :param data_root:
+ :param random_crop:
+ """
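+        # e.g. size=256, downscale_f=4 -> "image" is a 256x256 HR crop and "LR_image"
+        # its 64x64 degraded counterpart (both returned as float32 in [-1, 1])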
+ self.base = self.get_base()
+ assert size
+ assert (size / downscale_f).is_integer()
+ self.size = size
+ self.LR_size = int(size / downscale_f)
+ self.min_crop_f = min_crop_f
+ self.max_crop_f = max_crop_f
+ assert(max_crop_f <= 1.)
+ self.center_crop = not random_crop
+
+ self.image_rescaler = albumentations.SmallestMaxSize(max_size=size, interpolation=cv2.INTER_AREA)
+
+        self.pil_interpolation = False # gets reset later if the interpolation op comes from Pillow
+
+ if degradation == "bsrgan":
+ self.degradation_process = partial(degradation_fn_bsr, sf=downscale_f)
+
+ elif degradation == "bsrgan_light":
+ self.degradation_process = partial(degradation_fn_bsr_light, sf=downscale_f)
+
+ else:
+ interpolation_fn = {
+ "cv_nearest": cv2.INTER_NEAREST,
+ "cv_bilinear": cv2.INTER_LINEAR,
+ "cv_bicubic": cv2.INTER_CUBIC,
+ "cv_area": cv2.INTER_AREA,
+ "cv_lanczos": cv2.INTER_LANCZOS4,
+ "pil_nearest": PIL.Image.NEAREST,
+ "pil_bilinear": PIL.Image.BILINEAR,
+ "pil_bicubic": PIL.Image.BICUBIC,
+ "pil_box": PIL.Image.BOX,
+ "pil_hamming": PIL.Image.HAMMING,
+ "pil_lanczos": PIL.Image.LANCZOS,
+ }[degradation]
+
+ self.pil_interpolation = degradation.startswith("pil_")
+
+ if self.pil_interpolation:
+ self.degradation_process = partial(TF.resize, size=self.LR_size, interpolation=interpolation_fn)
+
+ else:
+ self.degradation_process = albumentations.SmallestMaxSize(max_size=self.LR_size,
+ interpolation=interpolation_fn)
+
+ def __len__(self):
+ return len(self.base)
+
+ def __getitem__(self, i):
+ example = self.base[i]
+ image = Image.open(example["file_path_"])
+
+ if not image.mode == "RGB":
+ image = image.convert("RGB")
+
+ image = np.array(image).astype(np.uint8)
+
+ min_side_len = min(image.shape[:2])
+ crop_side_len = min_side_len * np.random.uniform(self.min_crop_f, self.max_crop_f, size=None)
+ crop_side_len = int(crop_side_len)
+
+ if self.center_crop:
+ self.cropper = albumentations.CenterCrop(height=crop_side_len, width=crop_side_len)
+
+ else:
+ self.cropper = albumentations.RandomCrop(height=crop_side_len, width=crop_side_len)
+
+ image = self.cropper(image=image)["image"]
+ image = self.image_rescaler(image=image)["image"]
+
+ if self.pil_interpolation:
+ image_pil = PIL.Image.fromarray(image)
+ LR_image = self.degradation_process(image_pil)
+ LR_image = np.array(LR_image).astype(np.uint8)
+
+ else:
+ LR_image = self.degradation_process(image=image)["image"]
+
+ example["image"] = (image/127.5 - 1.0).astype(np.float32)
+ example["LR_image"] = (LR_image/127.5 - 1.0).astype(np.float32)
+
+ return example
+
+
+class ImageNetSRTrain(ImageNetSR):
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ def get_base(self):
+ with open("data/imagenet_train_hr_indices.p", "rb") as f:
+ indices = pickle.load(f)
+ dset = ImageNetTrain(process_images=False,)
+ return Subset(dset, indices)
+
+
+class ImageNetSRValidation(ImageNetSR):
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ def get_base(self):
+ with open("data/imagenet_val_hr_indices.p", "rb") as f:
+ indices = pickle.load(f)
+ dset = ImageNetValidation(process_images=False,)
+ return Subset(dset, indices)
diff --git a/ldmlib/data/lsun.py b/ldmlib/data/lsun.py
new file mode 100644
index 0000000000000000000000000000000000000000..6256e45715ff0b57c53f985594d27cbbbff0e68e
--- /dev/null
+++ b/ldmlib/data/lsun.py
@@ -0,0 +1,92 @@
+import os
+import numpy as np
+import PIL
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision import transforms
+
+
+class LSUNBase(Dataset):
+ def __init__(self,
+ txt_file,
+ data_root,
+ size=None,
+ interpolation="bicubic",
+ flip_p=0.5
+ ):
+ self.data_paths = txt_file
+ self.data_root = data_root
+ with open(self.data_paths, "r") as f:
+ self.image_paths = f.read().splitlines()
+ self._length = len(self.image_paths)
+ self.labels = {
+ "relative_file_path_": [l for l in self.image_paths],
+ "file_path_": [os.path.join(self.data_root, l)
+ for l in self.image_paths],
+ }
+
+ self.size = size
+ self.interpolation = {"linear": PIL.Image.LINEAR,
+ "bilinear": PIL.Image.BILINEAR,
+ "bicubic": PIL.Image.BICUBIC,
+ "lanczos": PIL.Image.LANCZOS,
+ }[interpolation]
+ self.flip = transforms.RandomHorizontalFlip(p=flip_p)
+
+ def __len__(self):
+ return self._length
+
+ def __getitem__(self, i):
+ example = dict((k, self.labels[k][i]) for k in self.labels)
+ image = Image.open(example["file_path_"])
+ if not image.mode == "RGB":
+ image = image.convert("RGB")
+
+ # default to score-sde preprocessing
+ img = np.array(image).astype(np.uint8)
+ crop = min(img.shape[0], img.shape[1])
+ h, w, = img.shape[0], img.shape[1]
+ img = img[(h - crop) // 2:(h + crop) // 2,
+ (w - crop) // 2:(w + crop) // 2]
+
+ image = Image.fromarray(img)
+ if self.size is not None:
+ image = image.resize((self.size, self.size), resample=self.interpolation)
+
+ image = self.flip(image)
+ image = np.array(image).astype(np.uint8)
+ example["image"] = (image / 127.5 - 1.0).astype(np.float32)
+ return example
+
+
+class LSUNChurchesTrain(LSUNBase):
+ def __init__(self, **kwargs):
+ super().__init__(txt_file="data/lsun/church_outdoor_train.txt", data_root="data/lsun/churches", **kwargs)
+
+
+class LSUNChurchesValidation(LSUNBase):
+ def __init__(self, flip_p=0., **kwargs):
+ super().__init__(txt_file="data/lsun/church_outdoor_val.txt", data_root="data/lsun/churches",
+ flip_p=flip_p, **kwargs)
+
+
+class LSUNBedroomsTrain(LSUNBase):
+ def __init__(self, **kwargs):
+ super().__init__(txt_file="data/lsun/bedrooms_train.txt", data_root="data/lsun/bedrooms", **kwargs)
+
+
+class LSUNBedroomsValidation(LSUNBase):
+ def __init__(self, flip_p=0.0, **kwargs):
+ super().__init__(txt_file="data/lsun/bedrooms_val.txt", data_root="data/lsun/bedrooms",
+ flip_p=flip_p, **kwargs)
+
+
+class LSUNCatsTrain(LSUNBase):
+ def __init__(self, **kwargs):
+ super().__init__(txt_file="data/lsun/cat_train.txt", data_root="data/lsun/cats", **kwargs)
+
+
+class LSUNCatsValidation(LSUNBase):
+ def __init__(self, flip_p=0., **kwargs):
+ super().__init__(txt_file="data/lsun/cat_val.txt", data_root="data/lsun/cats",
+ flip_p=flip_p, **kwargs)
diff --git a/ldmlib/lr_scheduler.py b/ldmlib/lr_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..be39da9ca6dacc22bf3df9c7389bbb403a4a3ade
--- /dev/null
+++ b/ldmlib/lr_scheduler.py
@@ -0,0 +1,98 @@
+import numpy as np
+
+
+class LambdaWarmUpCosineScheduler:
+ """
+ note: use with a base_lr of 1.0
+ """
+ def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_steps, verbosity_interval=0):
+ self.lr_warm_up_steps = warm_up_steps
+ self.lr_start = lr_start
+ self.lr_min = lr_min
+ self.lr_max = lr_max
+ self.lr_max_decay_steps = max_decay_steps
+ self.last_lr = 0.
+ self.verbosity_interval = verbosity_interval
+
+ def schedule(self, n, **kwargs):
+ if self.verbosity_interval > 0:
+ if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_lr}")
+ if n < self.lr_warm_up_steps:
+ lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * n + self.lr_start
+ self.last_lr = lr
+ return lr
+ else:
+ t = (n - self.lr_warm_up_steps) / (self.lr_max_decay_steps - self.lr_warm_up_steps)
+ t = min(t, 1.0)
+ lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * (
+ 1 + np.cos(t * np.pi))
+ self.last_lr = lr
+ return lr
+
+ def __call__(self, n, **kwargs):
+ return self.schedule(n,**kwargs)
+
+
+class LambdaWarmUpCosineScheduler2:
+ """
+ supports repeated iterations, configurable via lists
+ note: use with a base_lr of 1.0.
+ """
+ def __init__(self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0):
+ assert len(warm_up_steps) == len(f_min) == len(f_max) == len(f_start) == len(cycle_lengths)
+ self.lr_warm_up_steps = warm_up_steps
+ self.f_start = f_start
+ self.f_min = f_min
+ self.f_max = f_max
+ self.cycle_lengths = cycle_lengths
+ self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths))
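+        # e.g. cycle_lengths=[10000, 5000] -> cum_cycles=[0, 10000, 15000]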
+ self.last_f = 0.
+ self.verbosity_interval = verbosity_interval
+
+ def find_in_interval(self, n):
+ interval = 0
+ for cl in self.cum_cycles[1:]:
+ if n <= cl:
+ return interval
+ interval += 1
+
+ def schedule(self, n, **kwargs):
+ cycle = self.find_in_interval(n)
+ n = n - self.cum_cycles[cycle]
+ if self.verbosity_interval > 0:
+ if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, "
+ f"current cycle {cycle}")
+ if n < self.lr_warm_up_steps[cycle]:
+ f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
+ self.last_f = f
+ return f
+ else:
+ t = (n - self.lr_warm_up_steps[cycle]) / (self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle])
+ t = min(t, 1.0)
+ f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * (
+ 1 + np.cos(t * np.pi))
+ self.last_f = f
+ return f
+
+ def __call__(self, n, **kwargs):
+ return self.schedule(n, **kwargs)
+
+
+class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2):
+
+ def schedule(self, n, **kwargs):
+ cycle = self.find_in_interval(n)
+ n = n - self.cum_cycles[cycle]
+ if self.verbosity_interval > 0:
+ if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, "
+ f"current cycle {cycle}")
+
+ if n < self.lr_warm_up_steps[cycle]:
+ f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
+ self.last_f = f
+ return f
+ else:
+ f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (self.cycle_lengths[cycle] - n) / (self.cycle_lengths[cycle])
+ self.last_f = f
+ return f
+
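+
+# usage sketch (not part of this diff): these schedulers return a learning-rate
+# *multiplier*, so they are typically plugged into torch.optim.lr_scheduler.LambdaLR
+# with a base learning rate of 1.0, e.g.
+#   scheduler = LambdaLR(optimizer, lr_lambda=LambdaWarmUpCosineScheduler(...).schedule)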
diff --git a/ldmlib/models/__pycache__/autoencoder.cpython-38.pyc b/ldmlib/models/__pycache__/autoencoder.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6e028e0fa6791aaf0aeea5f2f02a00e3ba5dcf84
Binary files /dev/null and b/ldmlib/models/__pycache__/autoencoder.cpython-38.pyc differ
diff --git a/ldmlib/models/autoencoder.py b/ldmlib/models/autoencoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..21d0aed9ba142b71017c515718d431d85049a87f
--- /dev/null
+++ b/ldmlib/models/autoencoder.py
@@ -0,0 +1,443 @@
+import torch
+import numpy as np  # used by the per-batch resizing in VQModel.get_input
+import pytorch_lightning as pl
+import torch.nn.functional as F
+from contextlib import contextmanager
+from packaging import version  # used by the pytorch-lightning version check in _validation_step
+from torch.optim.lr_scheduler import LambdaLR  # used when a scheduler_config is given
+
+from taming.modules.vqvae.quantize import VectorQuantizer2 as VectorQuantizer
+
+from ldmlib.modules.diffusionmodules.model import Encoder, Decoder
+from ldmlib.modules.distributions.distributions import DiagonalGaussianDistribution
+from ldmlib.modules.ema import LitEma  # used when use_ema=True
+
+from ldmlib.util import instantiate_from_config
+
+
+class VQModel(pl.LightningModule):
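+    """VQ autoencoder (taming-transformers style): images are encoded, the latents are
+    vector-quantized against a learned codebook of `n_embed` vectors of dimension
+    `embed_dim`, and decoded back; `lossconfig` supplies the reconstruction/adversarial loss."""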
+ def __init__(self,
+ ddconfig,
+ lossconfig,
+ n_embed,
+ embed_dim,
+ ckpt_path=None,
+ ignore_keys=[],
+ image_key="image",
+ colorize_nlabels=None,
+ monitor=None,
+ batch_resize_range=None,
+ scheduler_config=None,
+ lr_g_factor=1.0,
+ remap=None,
+ sane_index_shape=False, # tell vector quantizer to return indices as bhw
+ use_ema=False
+ ):
+ super().__init__()
+ self.embed_dim = embed_dim
+ self.n_embed = n_embed
+ self.image_key = image_key
+ self.encoder = Encoder(**ddconfig)
+ self.decoder = Decoder(**ddconfig)
+ self.loss = instantiate_from_config(lossconfig)
+ self.quantize = VectorQuantizer(n_embed, embed_dim, beta=0.25,
+ remap=remap,
+ sane_index_shape=sane_index_shape)
+ self.quant_conv = torch.nn.Conv2d(ddconfig["z_channels"], embed_dim, 1)
+ self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
+ if colorize_nlabels is not None:
+ assert type(colorize_nlabels)==int
+ self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
+ if monitor is not None:
+ self.monitor = monitor
+ self.batch_resize_range = batch_resize_range
+ if self.batch_resize_range is not None:
+ print(f"{self.__class__.__name__}: Using per-batch resizing in range {batch_resize_range}.")
+
+ self.use_ema = use_ema
+ if self.use_ema:
+ self.model_ema = LitEma(self)
+ print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
+
+ if ckpt_path is not None:
+ self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
+ self.scheduler_config = scheduler_config
+ self.lr_g_factor = lr_g_factor
+
+ @contextmanager
+ def ema_scope(self, context=None):
+ if self.use_ema:
+ self.model_ema.store(self.parameters())
+ self.model_ema.copy_to(self)
+ if context is not None:
+ print(f"{context}: Switched to EMA weights")
+ try:
+ yield None
+ finally:
+ if self.use_ema:
+ self.model_ema.restore(self.parameters())
+ if context is not None:
+ print(f"{context}: Restored training weights")
+
+ def init_from_ckpt(self, path, ignore_keys=list()):
+ sd = torch.load(path, map_location="cpu")["state_dict"]
+ keys = list(sd.keys())
+ for k in keys:
+ for ik in ignore_keys:
+ if k.startswith(ik):
+ print("Deleting key {} from state_dict.".format(k))
+ del sd[k]
+ missing, unexpected = self.load_state_dict(sd, strict=False)
+ print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
+ if len(missing) > 0:
+ print(f"Missing Keys: {missing}")
+ print(f"Unexpected Keys: {unexpected}")
+
+ def on_train_batch_end(self, *args, **kwargs):
+ if self.use_ema:
+ self.model_ema(self)
+
+ def encode(self, x):
+ h = self.encoder(x)
+ h = self.quant_conv(h)
+ quant, emb_loss, info = self.quantize(h)
+ return quant, emb_loss, info
+
+ def encode_to_prequant(self, x):
+ h = self.encoder(x)
+ h = self.quant_conv(h)
+ return h
+
+ def decode(self, quant):
+ quant = self.post_quant_conv(quant)
+ dec = self.decoder(quant)
+ return dec
+
+ def decode_code(self, code_b):
+ quant_b = self.quantize.embed_code(code_b)
+ dec = self.decode(quant_b)
+ return dec
+
+ def forward(self, input, return_pred_indices=False):
+ quant, diff, (_,_,ind) = self.encode(input)
+ dec = self.decode(quant)
+ if return_pred_indices:
+ return dec, diff, ind
+ return dec, diff
+
+ def get_input(self, batch, k):
+ x = batch[k]
+ if len(x.shape) == 3:
+ x = x[..., None]
+ x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
+ if self.batch_resize_range is not None:
+ lower_size = self.batch_resize_range[0]
+ upper_size = self.batch_resize_range[1]
+ if self.global_step <= 4:
+ # do the first few batches with max size to avoid later oom
+ new_resize = upper_size
+ else:
+ new_resize = np.random.choice(np.arange(lower_size, upper_size+16, 16))
+ if new_resize != x.shape[2]:
+ x = F.interpolate(x, size=new_resize, mode="bicubic")
+ x = x.detach()
+ return x
+
+ def training_step(self, batch, batch_idx, optimizer_idx):
+ # https://github.com/pytorch/pytorch/issues/37142
+ # try not to fool the heuristics
+ x = self.get_input(batch, self.image_key)
+ xrec, qloss, ind = self(x, return_pred_indices=True)
+
+ if optimizer_idx == 0:
+ # autoencode
+ aeloss, log_dict_ae = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
+ last_layer=self.get_last_layer(), split="train",
+ predicted_indices=ind)
+
+ self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True)
+ return aeloss
+
+ if optimizer_idx == 1:
+ # discriminator
+ discloss, log_dict_disc = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
+ last_layer=self.get_last_layer(), split="train")
+ self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=True)
+ return discloss
+
+ def validation_step(self, batch, batch_idx):
+ log_dict = self._validation_step(batch, batch_idx)
+ with self.ema_scope():
+ log_dict_ema = self._validation_step(batch, batch_idx, suffix="_ema")
+ return log_dict
+
+ def _validation_step(self, batch, batch_idx, suffix=""):
+ x = self.get_input(batch, self.image_key)
+ xrec, qloss, ind = self(x, return_pred_indices=True)
+ aeloss, log_dict_ae = self.loss(qloss, x, xrec, 0,
+ self.global_step,
+ last_layer=self.get_last_layer(),
+ split="val"+suffix,
+ predicted_indices=ind
+ )
+
+ discloss, log_dict_disc = self.loss(qloss, x, xrec, 1,
+ self.global_step,
+ last_layer=self.get_last_layer(),
+ split="val"+suffix,
+ predicted_indices=ind
+ )
+ rec_loss = log_dict_ae[f"val{suffix}/rec_loss"]
+ self.log(f"val{suffix}/rec_loss", rec_loss,
+ prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
+ self.log(f"val{suffix}/aeloss", aeloss,
+ prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
+ if version.parse(pl.__version__) >= version.parse('1.4.0'):
+ del log_dict_ae[f"val{suffix}/rec_loss"]
+ self.log_dict(log_dict_ae)
+ self.log_dict(log_dict_disc)
+ return self.log_dict
+
+ def configure_optimizers(self):
+ lr_d = self.learning_rate
+ lr_g = self.lr_g_factor*self.learning_rate
+ print("lr_d", lr_d)
+ print("lr_g", lr_g)
+ opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
+ list(self.decoder.parameters())+
+ list(self.quantize.parameters())+
+ list(self.quant_conv.parameters())+
+ list(self.post_quant_conv.parameters()),
+ lr=lr_g, betas=(0.5, 0.9))
+ opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
+ lr=lr_d, betas=(0.5, 0.9))
+
+ if self.scheduler_config is not None:
+ scheduler = instantiate_from_config(self.scheduler_config)
+
+ print("Setting up LambdaLR scheduler...")
+ scheduler = [
+ {
+ 'scheduler': LambdaLR(opt_ae, lr_lambda=scheduler.schedule),
+ 'interval': 'step',
+ 'frequency': 1
+ },
+ {
+ 'scheduler': LambdaLR(opt_disc, lr_lambda=scheduler.schedule),
+ 'interval': 'step',
+ 'frequency': 1
+ },
+ ]
+ return [opt_ae, opt_disc], scheduler
+ return [opt_ae, opt_disc], []
+
+ def get_last_layer(self):
+ return self.decoder.conv_out.weight
+
+ def log_images(self, batch, only_inputs=False, plot_ema=False, **kwargs):
+ log = dict()
+ x = self.get_input(batch, self.image_key)
+ x = x.to(self.device)
+ if only_inputs:
+ log["inputs"] = x
+ return log
+ xrec, _ = self(x)
+ if x.shape[1] > 3:
+ # colorize with random projection
+ assert xrec.shape[1] > 3
+ x = self.to_rgb(x)
+ xrec = self.to_rgb(xrec)
+ log["inputs"] = x
+ log["reconstructions"] = xrec
+ if plot_ema:
+ with self.ema_scope():
+ xrec_ema, _ = self(x)
+ if x.shape[1] > 3: xrec_ema = self.to_rgb(xrec_ema)
+ log["reconstructions_ema"] = xrec_ema
+ return log
+
+ def to_rgb(self, x):
+ assert self.image_key == "segmentation"
+ if not hasattr(self, "colorize"):
+ self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
+ x = F.conv2d(x, weight=self.colorize)
+ x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
+ return x
+
+
+class VQModelInterface(VQModel):
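+    """VQModel variant whose `encode` returns pre-quantization features and whose `decode`
+    can optionally skip quantization (`force_not_quantize`); used as the VQ first stage by
+    the diffusion models in ldmlib.models.diffusion.ddpm."""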
+ def __init__(self, embed_dim, *args, **kwargs):
+ super().__init__(embed_dim=embed_dim, *args, **kwargs)
+ self.embed_dim = embed_dim
+
+ def encode(self, x):
+ h = self.encoder(x)
+ h = self.quant_conv(h)
+ return h
+
+ def decode(self, h, force_not_quantize=False):
+ # also go through quantization layer
+ if not force_not_quantize:
+ quant, emb_loss, info = self.quantize(h)
+ else:
+ quant = h
+ quant = self.post_quant_conv(quant)
+ dec = self.decoder(quant)
+ return dec
+
+
+class AutoencoderKL(pl.LightningModule):
+ def __init__(self,
+ ddconfig,
+ lossconfig,
+ embed_dim,
+ ckpt_path=None,
+ ignore_keys=[],
+ image_key="image",
+ colorize_nlabels=None,
+ monitor=None,
+ ):
+ super().__init__()
+ self.image_key = image_key
+ self.encoder = Encoder(**ddconfig)
+ self.decoder = Decoder(**ddconfig)
+ self.loss = instantiate_from_config(lossconfig)
+ assert ddconfig["double_z"]
+ self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
+ self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
+ self.embed_dim = embed_dim
+ if colorize_nlabels is not None:
+ assert type(colorize_nlabels)==int
+ self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
+ if monitor is not None:
+ self.monitor = monitor
+ if ckpt_path is not None:
+ self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
+
+ def init_from_ckpt(self, path, ignore_keys=list()):
+ sd = torch.load(path, map_location="cpu")["state_dict"]
+ keys = list(sd.keys())
+ for k in keys:
+ for ik in ignore_keys:
+ if k.startswith(ik):
+ print("Deleting key {} from state_dict.".format(k))
+ del sd[k]
+ self.load_state_dict(sd, strict=False)
+ print(f"Restored from {path}")
+
+ def encode(self, x):
+ h = self.encoder(x)
+ moments = self.quant_conv(h)
+ posterior = DiagonalGaussianDistribution(moments)
+ return posterior
+
+ def decode(self, z):
+ z = self.post_quant_conv(z)
+ dec = self.decoder(z)
+ return dec
+
+ def forward(self, input, sample_posterior=True):
+ posterior = self.encode(input)
+ if sample_posterior:
+ z = posterior.sample()
+ else:
+ z = posterior.mode()
+ dec = self.decode(z)
+ return dec, posterior
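+
+    # Roundtrip sketch (illustrative; `ae` stands for a configured AutoencoderKL instance
+    # and `x` for an image batch of shape (b, c, h, w)):
+    #     posterior = ae.encode(x)   # DiagonalGaussianDistribution over the latent code
+    #     z = posterior.sample()     # or posterior.mode() for a deterministic encoding
+    #     x_rec = ae.decode(z)       # decoded image with the same layout as x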
+
+ def get_input(self, batch, k):
+ x = batch[k]
+ if len(x.shape) == 3:
+ x = x[..., None]
+ x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
+ return x
+
+ def training_step(self, batch, batch_idx, optimizer_idx):
+ inputs = self.get_input(batch, self.image_key)
+ reconstructions, posterior = self(inputs)
+
+ if optimizer_idx == 0:
+ # train encoder+decoder+logvar
+ aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
+ last_layer=self.get_last_layer(), split="train")
+ self.log("aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+ self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=False)
+ return aeloss
+
+ if optimizer_idx == 1:
+ # train the discriminator
+ discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
+ last_layer=self.get_last_layer(), split="train")
+
+ self.log("discloss", discloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+ self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=False)
+ return discloss
+
+ def validation_step(self, batch, batch_idx):
+ inputs = self.get_input(batch, self.image_key)
+ reconstructions, posterior = self(inputs)
+ aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, 0, self.global_step,
+ last_layer=self.get_last_layer(), split="val")
+
+ discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, 1, self.global_step,
+ last_layer=self.get_last_layer(), split="val")
+
+ self.log("val/rec_loss", log_dict_ae["val/rec_loss"])
+ self.log_dict(log_dict_ae)
+ self.log_dict(log_dict_disc)
+ return self.log_dict
+
+ def configure_optimizers(self):
+ lr = self.learning_rate
+ opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
+ list(self.decoder.parameters())+
+ list(self.quant_conv.parameters())+
+ list(self.post_quant_conv.parameters()),
+ lr=lr, betas=(0.5, 0.9))
+ opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
+ lr=lr, betas=(0.5, 0.9))
+ return [opt_ae, opt_disc], []
+
+ def get_last_layer(self):
+ return self.decoder.conv_out.weight
+
+ @torch.no_grad()
+ def log_images(self, batch, only_inputs=False, **kwargs):
+ log = dict()
+ x = self.get_input(batch, self.image_key)
+ x = x.to(self.device)
+ if not only_inputs:
+ xrec, posterior = self(x)
+ if x.shape[1] > 3:
+ # colorize with random projection
+ assert xrec.shape[1] > 3
+ x = self.to_rgb(x)
+ xrec = self.to_rgb(xrec)
+ log["samples"] = self.decode(torch.randn_like(posterior.sample()))
+ log["reconstructions"] = xrec
+ log["inputs"] = x
+ return log
+
+ def to_rgb(self, x):
+ assert self.image_key == "segmentation"
+ if not hasattr(self, "colorize"):
+ self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
+ x = F.conv2d(x, weight=self.colorize)
+ x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
+ return x
+
+
+class IdentityFirstStage(torch.nn.Module):
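+    """No-op first stage: `encode`, `decode` and `forward` all return their input unchanged,
+    so the diffusion model effectively operates directly on its inputs."""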
+ def __init__(self, *args, vq_interface=False, **kwargs):
+ self.vq_interface = vq_interface # TODO: Should be true by default but check to not break older stuff
+ super().__init__()
+
+ def encode(self, x, *args, **kwargs):
+ return x
+
+ def decode(self, x, *args, **kwargs):
+ return x
+
+ def quantize(self, x, *args, **kwargs):
+ if self.vq_interface:
+ return x, None, [None, None, None]
+ return x
+
+ def forward(self, x, *args, **kwargs):
+ return x
diff --git a/ldmlib/models/diffusion/__init__.py b/ldmlib/models/diffusion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ldmlib/models/diffusion/classifier.py b/ldmlib/models/diffusion/classifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..363ad8cf6071a52c573cd84acf7fe05d3e340bd2
--- /dev/null
+++ b/ldmlib/models/diffusion/classifier.py
@@ -0,0 +1,267 @@
+import os
+import torch
+import pytorch_lightning as pl
+from omegaconf import OmegaConf
+from torch.nn import functional as F
+from torch.optim import AdamW
+from torch.optim.lr_scheduler import LambdaLR
+from copy import deepcopy
+from einops import rearrange
+from glob import glob
+from natsort import natsorted
+
+from ldmlib.modules.diffusionmodules.openaimodel import EncoderUNetModel, UNetModel
+from ldmlib.util import log_txt_as_img, default, ismap, instantiate_from_config
+
+__models__ = {
+ 'class_label': EncoderUNetModel,
+ 'segmentation': UNetModel
+}
+
+
+def disabled_train(self, mode=True):
+ """Overwrite model.train with this function to make sure train/eval mode
+ does not change anymore."""
+ return self
+
+
+class NoisyLatentImageClassifier(pl.LightningModule):
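+    """Classifier trained on noised diffusion inputs: each batch is pushed through the frozen
+    diffusion model's `q_sample` at a (random) timestep `t`, and the classifier predicts the
+    label from the noisy input with a cross-entropy loss."""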
+
+ def __init__(self,
+ diffusion_path,
+ num_classes,
+ ckpt_path=None,
+ pool='attention',
+ label_key=None,
+ diffusion_ckpt_path=None,
+ scheduler_config=None,
+ weight_decay=1.e-2,
+ log_steps=10,
+ monitor='val/loss',
+ *args,
+ **kwargs):
+ super().__init__(*args, **kwargs)
+ self.num_classes = num_classes
+ # get latest config of diffusion model
+ diffusion_config = natsorted(glob(os.path.join(diffusion_path, 'configs', '*-project.yaml')))[-1]
+ self.diffusion_config = OmegaConf.load(diffusion_config).model
+ self.diffusion_config.params.ckpt_path = diffusion_ckpt_path
+ self.load_diffusion()
+
+ self.monitor = monitor
+ self.numd = self.diffusion_model.first_stage_model.encoder.num_resolutions - 1
+ self.log_time_interval = self.diffusion_model.num_timesteps // log_steps
+ self.log_steps = log_steps
+
+ self.label_key = label_key if not hasattr(self.diffusion_model, 'cond_stage_key') \
+ else self.diffusion_model.cond_stage_key
+
+ assert self.label_key is not None, 'label_key neither in diffusion model nor in model.params'
+
+ if self.label_key not in __models__:
+ raise NotImplementedError()
+
+ self.load_classifier(ckpt_path, pool)
+
+ self.scheduler_config = scheduler_config
+ self.use_scheduler = self.scheduler_config is not None
+ self.weight_decay = weight_decay
+
+ def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
+ sd = torch.load(path, map_location="cpu")
+ if "state_dict" in list(sd.keys()):
+ sd = sd["state_dict"]
+ keys = list(sd.keys())
+ for k in keys:
+ for ik in ignore_keys:
+ if k.startswith(ik):
+ print("Deleting key {} from state_dict.".format(k))
+ del sd[k]
+ missing, unexpected = self.load_state_dict(sd, strict=False) if not only_model else self.model.load_state_dict(
+ sd, strict=False)
+ print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
+ if len(missing) > 0:
+ print(f"Missing Keys: {missing}")
+ if len(unexpected) > 0:
+ print(f"Unexpected Keys: {unexpected}")
+
+ def load_diffusion(self):
+ model = instantiate_from_config(self.diffusion_config)
+ self.diffusion_model = model.eval()
+ self.diffusion_model.train = disabled_train
+ for param in self.diffusion_model.parameters():
+ param.requires_grad = False
+
+ def load_classifier(self, ckpt_path, pool):
+ model_config = deepcopy(self.diffusion_config.params.unet_config.params)
+ model_config.in_channels = self.diffusion_config.params.unet_config.params.out_channels
+ model_config.out_channels = self.num_classes
+ if self.label_key == 'class_label':
+ model_config.pool = pool
+
+ self.model = __models__[self.label_key](**model_config)
+ if ckpt_path is not None:
+ print('#####################################################################')
+ print(f'load from ckpt "{ckpt_path}"')
+ print('#####################################################################')
+ self.init_from_ckpt(ckpt_path)
+
+ @torch.no_grad()
+ def get_x_noisy(self, x, t, noise=None):
+ noise = default(noise, lambda: torch.randn_like(x))
+ continuous_sqrt_alpha_cumprod = None
+ if self.diffusion_model.use_continuous_noise:
+ continuous_sqrt_alpha_cumprod = self.diffusion_model.sample_continuous_noise_level(x.shape[0], t + 1)
+ # todo: make sure t+1 is correct here
+
+ return self.diffusion_model.q_sample(x_start=x, t=t, noise=noise,
+ continuous_sqrt_alpha_cumprod=continuous_sqrt_alpha_cumprod)
+
+ def forward(self, x_noisy, t, *args, **kwargs):
+ return self.model(x_noisy, t)
+
+ @torch.no_grad()
+ def get_input(self, batch, k):
+ x = batch[k]
+ if len(x.shape) == 3:
+ x = x[..., None]
+ x = rearrange(x, 'b h w c -> b c h w')
+ x = x.to(memory_format=torch.contiguous_format).float()
+ return x
+
+ @torch.no_grad()
+ def get_conditioning(self, batch, k=None):
+ if k is None:
+ k = self.label_key
+ assert k is not None, 'Needs to provide label key'
+
+ targets = batch[k].to(self.device)
+
+ if self.label_key == 'segmentation':
+ targets = rearrange(targets, 'b h w c -> b c h w')
+ for down in range(self.numd):
+ h, w = targets.shape[-2:]
+ targets = F.interpolate(targets, size=(h // 2, w // 2), mode='nearest')
+
+ # targets = rearrange(targets,'b c h w -> b h w c')
+
+ return targets
+
+ def compute_top_k(self, logits, labels, k, reduction="mean"):
+ _, top_ks = torch.topk(logits, k, dim=1)
+ if reduction == "mean":
+ return (top_ks == labels[:, None]).float().sum(dim=-1).mean().item()
+ elif reduction == "none":
+ return (top_ks == labels[:, None]).float().sum(dim=-1)
+
+ def on_train_epoch_start(self):
+ # save some memory
+ self.diffusion_model.model.to('cpu')
+
+ @torch.no_grad()
+ def write_logs(self, loss, logits, targets):
+ log_prefix = 'train' if self.training else 'val'
+ log = {}
+ log[f"{log_prefix}/loss"] = loss.mean()
+ log[f"{log_prefix}/acc@1"] = self.compute_top_k(
+ logits, targets, k=1, reduction="mean"
+ )
+ log[f"{log_prefix}/acc@5"] = self.compute_top_k(
+ logits, targets, k=5, reduction="mean"
+ )
+
+ self.log_dict(log, prog_bar=False, logger=True, on_step=self.training, on_epoch=True)
+ self.log('loss', log[f"{log_prefix}/loss"], prog_bar=True, logger=False)
+ self.log('global_step', self.global_step, logger=False, on_epoch=False, prog_bar=True)
+ lr = self.optimizers().param_groups[0]['lr']
+ self.log('lr_abs', lr, on_step=True, logger=True, on_epoch=False, prog_bar=True)
+
+ def shared_step(self, batch, t=None):
+ x, *_ = self.diffusion_model.get_input(batch, k=self.diffusion_model.first_stage_key)
+ targets = self.get_conditioning(batch)
+ if targets.dim() == 4:
+ targets = targets.argmax(dim=1)
+ if t is None:
+ t = torch.randint(0, self.diffusion_model.num_timesteps, (x.shape[0],), device=self.device).long()
+ else:
+ t = torch.full(size=(x.shape[0],), fill_value=t, device=self.device).long()
+ x_noisy = self.get_x_noisy(x, t)
+ logits = self(x_noisy, t)
+
+ loss = F.cross_entropy(logits, targets, reduction='none')
+
+ self.write_logs(loss.detach(), logits.detach(), targets.detach())
+
+ loss = loss.mean()
+ return loss, logits, x_noisy, targets
+
+ def training_step(self, batch, batch_idx):
+ loss, *_ = self.shared_step(batch)
+ return loss
+
+ def reset_noise_accs(self):
+ self.noisy_acc = {t: {'acc@1': [], 'acc@5': []} for t in
+ range(0, self.diffusion_model.num_timesteps, self.diffusion_model.log_every_t)}
+
+ def on_validation_start(self):
+ self.reset_noise_accs()
+
+ @torch.no_grad()
+ def validation_step(self, batch, batch_idx):
+ loss, *_ = self.shared_step(batch)
+
+ for t in self.noisy_acc:
+ _, logits, _, targets = self.shared_step(batch, t)
+ self.noisy_acc[t]['acc@1'].append(self.compute_top_k(logits, targets, k=1, reduction='mean'))
+ self.noisy_acc[t]['acc@5'].append(self.compute_top_k(logits, targets, k=5, reduction='mean'))
+
+ return loss
+
+ def configure_optimizers(self):
+ optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)
+
+ if self.use_scheduler:
+ scheduler = instantiate_from_config(self.scheduler_config)
+
+ print("Setting up LambdaLR scheduler...")
+ scheduler = [
+ {
+ 'scheduler': LambdaLR(optimizer, lr_lambda=scheduler.schedule),
+ 'interval': 'step',
+ 'frequency': 1
+ }]
+ return [optimizer], scheduler
+
+ return optimizer
+
+ @torch.no_grad()
+ def log_images(self, batch, N=8, *args, **kwargs):
+ log = dict()
+ x = self.get_input(batch, self.diffusion_model.first_stage_key)
+ log['inputs'] = x
+
+ y = self.get_conditioning(batch)
+
+ if self.label_key == 'class_label':
+ y = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"])
+ log['labels'] = y
+
+ if ismap(y):
+ log['labels'] = self.diffusion_model.to_rgb(y)
+
+ for step in range(self.log_steps):
+ current_time = step * self.log_time_interval
+
+ _, logits, x_noisy, _ = self.shared_step(batch, t=current_time)
+
+ log[f'inputs@t{current_time}'] = x_noisy
+
+ pred = F.one_hot(logits.argmax(dim=1), num_classes=self.num_classes)
+ pred = rearrange(pred, 'b h w c -> b c h w')
+
+ log[f'pred@t{current_time}'] = self.diffusion_model.to_rgb(pred)
+
+ for key in log:
+ log[key] = log[key][:N]
+
+ return log
diff --git a/ldmlib/models/diffusion/ddim.py b/ldmlib/models/diffusion/ddim.py
new file mode 100644
index 0000000000000000000000000000000000000000..844cb10346f94b03859b263ae601bd181b24bbe1
--- /dev/null
+++ b/ldmlib/models/diffusion/ddim.py
@@ -0,0 +1,241 @@
+"""SAMPLING ONLY."""
+
+import torch
+import numpy as np
+from tqdm import tqdm
+from functools import partial
+
+from ldmlib.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like, \
+ extract_into_tensor
+
+
+class DDIMSampler(object):
+ def __init__(self, model, schedule="linear", **kwargs):
+ super().__init__()
+ self.model = model
+ self.ddpm_num_timesteps = model.num_timesteps
+ self.schedule = schedule
+
+ def register_buffer(self, name, attr):
+ if type(attr) == torch.Tensor:
+ if attr.device != torch.device("cuda"):
+ attr = attr.to(torch.device("cuda"))
+ setattr(self, name, attr)
+
+ def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
+ self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
+ num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose)
+ alphas_cumprod = self.model.alphas_cumprod
+ assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
+ to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
+
+ self.register_buffer('betas', to_torch(self.model.betas))
+ self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
+ self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))
+
+ # calculations for diffusion q(x_t | x_{t-1}) and others
+ self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
+ self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
+ self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
+ self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
+ self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))
+
+ # ddim sampling parameters
+ ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
+ ddim_timesteps=self.ddim_timesteps,
+ eta=ddim_eta,verbose=verbose)
+ self.register_buffer('ddim_sigmas', ddim_sigmas)
+ self.register_buffer('ddim_alphas', ddim_alphas)
+ self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
+ self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
+ sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
+ (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
+ 1 - self.alphas_cumprod / self.alphas_cumprod_prev))
+ self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
+
+ @torch.no_grad()
+ def sample(self,
+ S,
+ batch_size,
+ shape,
+ conditioning=None,
+ callback=None,
+ normals_sequence=None,
+ img_callback=None,
+ quantize_x0=False,
+ eta=0.,
+ mask=None,
+ x0=None,
+ temperature=1.,
+ noise_dropout=0.,
+ score_corrector=None,
+ corrector_kwargs=None,
+ verbose=True,
+ x_T=None,
+ log_every_t=100,
+ unconditional_guidance_scale=1.,
+ unconditional_conditioning=None,
+               # this has to come in the same format as the conditioning, e.g. as encoded tokens, ...
+ **kwargs
+ ):
+ if conditioning is not None:
+ if isinstance(conditioning, dict):
+ cbs = conditioning[list(conditioning.keys())[0]].shape[0]
+ if cbs != batch_size:
+ print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
+ else:
+ if conditioning.shape[0] != batch_size:
+ print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
+
+ self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
+ # sampling
+ C, H, W = shape
+ size = (batch_size, C, H, W)
+ print(f'Data shape for DDIM sampling is {size}, eta {eta}')
+
+ samples, intermediates = self.ddim_sampling(conditioning, size,
+ callback=callback,
+ img_callback=img_callback,
+ quantize_denoised=quantize_x0,
+ mask=mask, x0=x0,
+ ddim_use_original_steps=False,
+ noise_dropout=noise_dropout,
+ temperature=temperature,
+ score_corrector=score_corrector,
+ corrector_kwargs=corrector_kwargs,
+ x_T=x_T,
+ log_every_t=log_every_t,
+ unconditional_guidance_scale=unconditional_guidance_scale,
+ unconditional_conditioning=unconditional_conditioning,
+ )
+ return samples, intermediates
+
+ @torch.no_grad()
+ def ddim_sampling(self, cond, shape,
+ x_T=None, ddim_use_original_steps=False,
+ callback=None, timesteps=None, quantize_denoised=False,
+ mask=None, x0=None, img_callback=None, log_every_t=100,
+ temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
+ unconditional_guidance_scale=1., unconditional_conditioning=None,):
+ device = self.model.betas.device
+ b = shape[0]
+ if x_T is None:
+ img = torch.randn(shape, device=device)
+ else:
+ img = x_T
+
+ if timesteps is None:
+ timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
+ elif timesteps is not None and not ddim_use_original_steps:
+ subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
+ timesteps = self.ddim_timesteps[:subset_end]
+
+ intermediates = {'x_inter': [img], 'pred_x0': [img]}
+ time_range = reversed(range(0,timesteps)) if ddim_use_original_steps else np.flip(timesteps)
+ total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
+ print(f"Running DDIM Sampling with {total_steps} timesteps")
+
+ iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps)
+
+ for i, step in enumerate(iterator):
+ index = total_steps - i - 1
+ ts = torch.full((b,), step, device=device, dtype=torch.long)
+
+ if mask is not None:
+ assert x0 is not None
+ img_orig = self.model.q_sample(x0, ts) # TODO: deterministic forward pass?
+ img = img_orig * mask + (1. - mask) * img
+
+ outs = self.p_sample_ddim(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
+ quantize_denoised=quantize_denoised, temperature=temperature,
+ noise_dropout=noise_dropout, score_corrector=score_corrector,
+ corrector_kwargs=corrector_kwargs,
+ unconditional_guidance_scale=unconditional_guidance_scale,
+ unconditional_conditioning=unconditional_conditioning)
+ img, pred_x0 = outs
+ if callback: callback(i)
+ if img_callback: img_callback(pred_x0, i)
+
+ if index % log_every_t == 0 or index == total_steps - 1:
+ intermediates['x_inter'].append(img)
+ intermediates['pred_x0'].append(pred_x0)
+
+ return img, intermediates
+
+ @torch.no_grad()
+ def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
+ temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
+ unconditional_guidance_scale=1., unconditional_conditioning=None):
+ b, *_, device = *x.shape, x.device
+
+ if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
+ e_t = self.model.apply_model(x, t, c)
+ else:
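+            # classifier-free guidance: evaluate the model on an unconditional and a
+            # conditional copy of the batch in a single pass, then push the prediction
+            # away from the unconditional estimate by `unconditional_guidance_scale`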
+ x_in = torch.cat([x] * 2)
+ t_in = torch.cat([t] * 2)
+ c_in = torch.cat([unconditional_conditioning, c])
+ e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
+ e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
+
+ if score_corrector is not None:
+ assert self.model.parameterization == "eps"
+ e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
+
+ alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
+ alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
+ sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
+ sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
+ # select parameters corresponding to the currently considered timestep
+ a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
+ a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
+ sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
+ sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
+
+ # current prediction for x_0
+ pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
+ if quantize_denoised:
+ pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
+ # direction pointing to x_t
+ dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
+ noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
+ if noise_dropout > 0.:
+ noise = torch.nn.functional.dropout(noise, p=noise_dropout)
+ x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
+ return x_prev, pred_x0
+
+ @torch.no_grad()
+ def stochastic_encode(self, x0, t, use_original_steps=False, noise=None):
+ # fast, but does not allow for exact reconstruction
+ # t serves as an index to gather the correct alphas
+ if use_original_steps:
+ sqrt_alphas_cumprod = self.sqrt_alphas_cumprod
+ sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod
+ else:
+ sqrt_alphas_cumprod = torch.sqrt(self.ddim_alphas)
+ sqrt_one_minus_alphas_cumprod = self.ddim_sqrt_one_minus_alphas
+
+ if noise is None:
+ noise = torch.randn_like(x0)
+ return (extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0 +
+ extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, x0.shape) * noise)
+
+ @torch.no_grad()
+ def decode(self, x_latent, cond, t_start, unconditional_guidance_scale=1.0, unconditional_conditioning=None,
+ use_original_steps=False):
+
+ timesteps = np.arange(self.ddpm_num_timesteps) if use_original_steps else self.ddim_timesteps
+ timesteps = timesteps[:t_start]
+
+ time_range = np.flip(timesteps)
+ total_steps = timesteps.shape[0]
+ print(f"Running DDIM Sampling with {total_steps} timesteps")
+
+ iterator = tqdm(time_range, desc='Decoding image', total=total_steps)
+ x_dec = x_latent
+ for i, step in enumerate(iterator):
+ index = total_steps - i - 1
+ ts = torch.full((x_latent.shape[0],), step, device=x_latent.device, dtype=torch.long)
+ x_dec, _ = self.p_sample_ddim(x_dec, cond, ts, index=index, use_original_steps=use_original_steps,
+ unconditional_guidance_scale=unconditional_guidance_scale,
+ unconditional_conditioning=unconditional_conditioning)
+ return x_dec
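+
+
+# Usage sketch (illustrative; `model`, `c`, `uc`, `z0` and `t_enc` are placeholders for a
+# LatentDiffusion instance, conditioning tensors, a start latent and a DDIM step index):
+#
+#     sampler = DDIMSampler(model)
+#     # text-to-image style sampling in latent space
+#     samples, _ = sampler.sample(S=50, batch_size=4, shape=(4, 64, 64), conditioning=c,
+#                                 unconditional_guidance_scale=7.5,
+#                                 unconditional_conditioning=uc, eta=0.0)
+#     # image-to-image style: noise a latent z0 up to step t_enc, then denoise it back
+#     sampler.make_schedule(ddim_num_steps=50, ddim_eta=0.0, verbose=False)
+#     t = torch.full((z0.shape[0],), t_enc, dtype=torch.long, device=z0.device)
+#     z_enc = sampler.stochastic_encode(z0, t)
+#     z_dec = sampler.decode(z_enc, c, t_enc, unconditional_guidance_scale=7.5,
+#                            unconditional_conditioning=uc)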
diff --git a/ldmlib/models/diffusion/ddpm.py b/ldmlib/models/diffusion/ddpm.py
new file mode 100644
index 0000000000000000000000000000000000000000..498c78353bc2fd32de7e8e47320e6d8708d1a5ae
--- /dev/null
+++ b/ldmlib/models/diffusion/ddpm.py
@@ -0,0 +1,1445 @@
+"""
+wild mixture of
+https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
+https://github.com/openai/improved-diffusion/blob/e94489283bb876ac1477d5dd7709bbbd2d9902ce/improved_diffusion/gaussian_diffusion.py
+https://github.com/CompVis/taming-transformers
+-- merci
+"""
+
+import torch
+import torch.nn as nn
+import numpy as np
+import pytorch_lightning as pl
+from torch.optim.lr_scheduler import LambdaLR
+from einops import rearrange, repeat
+from contextlib import contextmanager
+from functools import partial
+from tqdm import tqdm
+from torchvision.utils import make_grid
+from pytorch_lightning.utilities.distributed import rank_zero_only
+
+from ldmlib.util import log_txt_as_img, exists, default, ismap, isimage, mean_flat, count_params, instantiate_from_config
+from ldmlib.modules.ema import LitEma
+from ldmlib.modules.distributions.distributions import normal_kl, DiagonalGaussianDistribution
+from ldmlib.models.autoencoder import VQModelInterface, IdentityFirstStage, AutoencoderKL
+from ldmlib.modules.diffusionmodules.util import make_beta_schedule, extract_into_tensor, noise_like
+from ldmlib.models.diffusion.ddim import DDIMSampler
+
+
+__conditioning_keys__ = {'concat': 'c_concat',
+ 'crossattn': 'c_crossattn',
+ 'adm': 'y'}
+
+
+def disabled_train(self, mode=True):
+ """Overwrite model.train with this function to make sure train/eval mode
+ does not change anymore."""
+ return self
+
+
+def uniform_on_device(r1, r2, shape, device):
+ return (r1 - r2) * torch.rand(*shape, device=device) + r2
+
+
+class DDPM(pl.LightningModule):
+ # classic DDPM with Gaussian diffusion, in image space
+ def __init__(self,
+ unet_config,
+ timesteps=1000,
+ beta_schedule="linear",
+ loss_type="l2",
+ ckpt_path=None,
+ ignore_keys=[],
+ load_only_unet=False,
+ monitor="val/loss",
+ use_ema=True,
+ first_stage_key="image",
+ image_size=256,
+ channels=3,
+ log_every_t=100,
+ clip_denoised=True,
+ linear_start=1e-4,
+ linear_end=2e-2,
+ cosine_s=8e-3,
+ given_betas=None,
+ original_elbo_weight=0.,
+ v_posterior=0., # weight for choosing posterior variance as sigma = (1-v) * beta_tilde + v * beta
+ l_simple_weight=1.,
+ conditioning_key=None,
+ parameterization="eps", # all assuming fixed variance schedules
+ scheduler_config=None,
+ use_positional_encodings=False,
+ learn_logvar=False,
+ logvar_init=0.,
+ ):
+ super().__init__()
+ assert parameterization in ["eps", "x0"], 'currently only supporting "eps" and "x0"'
+ self.parameterization = parameterization
+ print(f"{self.__class__.__name__}: Running in {self.parameterization}-prediction mode")
+ self.cond_stage_model = None
+ self.clip_denoised = clip_denoised
+ self.log_every_t = log_every_t
+ self.first_stage_key = first_stage_key
+ self.image_size = image_size # try conv?
+ self.channels = channels
+ self.use_positional_encodings = use_positional_encodings
+ self.model = DiffusionWrapper(unet_config, conditioning_key)
+ count_params(self.model, verbose=True)
+ self.use_ema = use_ema
+ if self.use_ema:
+ self.model_ema = LitEma(self.model)
+ print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
+
+ self.use_scheduler = scheduler_config is not None
+ if self.use_scheduler:
+ self.scheduler_config = scheduler_config
+
+ self.v_posterior = v_posterior
+ self.original_elbo_weight = original_elbo_weight
+ self.l_simple_weight = l_simple_weight
+
+ if monitor is not None:
+ self.monitor = monitor
+ if ckpt_path is not None:
+ self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys, only_model=load_only_unet)
+
+ self.register_schedule(given_betas=given_betas, beta_schedule=beta_schedule, timesteps=timesteps,
+ linear_start=linear_start, linear_end=linear_end, cosine_s=cosine_s)
+
+ self.loss_type = loss_type
+
+ self.learn_logvar = learn_logvar
+ self.logvar = torch.full(fill_value=logvar_init, size=(self.num_timesteps,))
+ if self.learn_logvar:
+ self.logvar = nn.Parameter(self.logvar, requires_grad=True)
+
+
+ def register_schedule(self, given_betas=None, beta_schedule="linear", timesteps=1000,
+ linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
+ if exists(given_betas):
+ betas = given_betas
+ else:
+ betas = make_beta_schedule(beta_schedule, timesteps, linear_start=linear_start, linear_end=linear_end,
+ cosine_s=cosine_s)
+ alphas = 1. - betas
+ alphas_cumprod = np.cumprod(alphas, axis=0)
+ alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])
+
+ timesteps, = betas.shape
+ self.num_timesteps = int(timesteps)
+ self.linear_start = linear_start
+ self.linear_end = linear_end
+ assert alphas_cumprod.shape[0] == self.num_timesteps, 'alphas have to be defined for each timestep'
+
+ to_torch = partial(torch.tensor, dtype=torch.float32)
+
+ self.register_buffer('betas', to_torch(betas))
+ self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
+ self.register_buffer('alphas_cumprod_prev', to_torch(alphas_cumprod_prev))
+
+ # calculations for diffusion q(x_t | x_{t-1}) and others
+ self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod)))
+ self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod)))
+ self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod)))
+ self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod)))
+ self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1)))
+
+ # calculations for posterior q(x_{t-1} | x_t, x_0)
+ posterior_variance = (1 - self.v_posterior) * betas * (1. - alphas_cumprod_prev) / (
+ 1. - alphas_cumprod) + self.v_posterior * betas
+ # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
+ self.register_buffer('posterior_variance', to_torch(posterior_variance))
+ # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
+ self.register_buffer('posterior_log_variance_clipped', to_torch(np.log(np.maximum(posterior_variance, 1e-20))))
+ self.register_buffer('posterior_mean_coef1', to_torch(
+ betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod)))
+ self.register_buffer('posterior_mean_coef2', to_torch(
+ (1. - alphas_cumprod_prev) * np.sqrt(alphas) / (1. - alphas_cumprod)))
+
+ if self.parameterization == "eps":
+ lvlb_weights = self.betas ** 2 / (
+ 2 * self.posterior_variance * to_torch(alphas) * (1 - self.alphas_cumprod))
+ elif self.parameterization == "x0":
+ lvlb_weights = 0.5 * np.sqrt(torch.Tensor(alphas_cumprod)) / (2. * 1 - torch.Tensor(alphas_cumprod))
+ else:
+ raise NotImplementedError("mu not supported")
+ # TODO how to choose this term
+ lvlb_weights[0] = lvlb_weights[1]
+ self.register_buffer('lvlb_weights', lvlb_weights, persistent=False)
+ assert not torch.isnan(self.lvlb_weights).all()
+
+ @contextmanager
+ def ema_scope(self, context=None):
+ if self.use_ema:
+ self.model_ema.store(self.model.parameters())
+ self.model_ema.copy_to(self.model)
+ if context is not None:
+ print(f"{context}: Switched to EMA weights")
+ try:
+ yield None
+ finally:
+ if self.use_ema:
+ self.model_ema.restore(self.model.parameters())
+ if context is not None:
+ print(f"{context}: Restored training weights")
+
+ def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
+ sd = torch.load(path, map_location="cpu")
+ if "state_dict" in list(sd.keys()):
+ sd = sd["state_dict"]
+ keys = list(sd.keys())
+ for k in keys:
+ for ik in ignore_keys:
+ if k.startswith(ik):
+ print("Deleting key {} from state_dict.".format(k))
+ del sd[k]
+ missing, unexpected = self.load_state_dict(sd, strict=False) if not only_model else self.model.load_state_dict(
+ sd, strict=False)
+ print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
+ if len(missing) > 0:
+ print(f"Missing Keys: {missing}")
+ if len(unexpected) > 0:
+ print(f"Unexpected Keys: {unexpected}")
+
+ def q_mean_variance(self, x_start, t):
+ """
+ Get the distribution q(x_t | x_0).
+ :param x_start: the [N x C x ...] tensor of noiseless inputs.
+ :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
+ :return: A tuple (mean, variance, log_variance), all of x_start's shape.
+ """
+ mean = (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start)
+ variance = extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
+ log_variance = extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape)
+ return mean, variance, log_variance
+
+ def predict_start_from_noise(self, x_t, t, noise):
+ return (
+ extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t -
+ extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise
+ )
+
+ def q_posterior(self, x_start, x_t, t):
+ posterior_mean = (
+ extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start +
+ extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
+ )
+ posterior_variance = extract_into_tensor(self.posterior_variance, t, x_t.shape)
+ posterior_log_variance_clipped = extract_into_tensor(self.posterior_log_variance_clipped, t, x_t.shape)
+ return posterior_mean, posterior_variance, posterior_log_variance_clipped
+
+ def p_mean_variance(self, x, t, clip_denoised: bool):
+ model_out = self.model(x, t)
+ if self.parameterization == "eps":
+ x_recon = self.predict_start_from_noise(x, t=t, noise=model_out)
+ elif self.parameterization == "x0":
+ x_recon = model_out
+ if clip_denoised:
+ x_recon.clamp_(-1., 1.)
+
+ model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
+ return model_mean, posterior_variance, posterior_log_variance
+
+ @torch.no_grad()
+ def p_sample(self, x, t, clip_denoised=True, repeat_noise=False):
+ b, *_, device = *x.shape, x.device
+ model_mean, _, model_log_variance = self.p_mean_variance(x=x, t=t, clip_denoised=clip_denoised)
+ noise = noise_like(x.shape, device, repeat_noise)
+ # no noise when t == 0
+ nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1)))
+ return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
+
+ @torch.no_grad()
+ def p_sample_loop(self, shape, return_intermediates=False):
+ device = self.betas.device
+ b = shape[0]
+ img = torch.randn(shape, device=device)
+ intermediates = [img]
+ for i in tqdm(reversed(range(0, self.num_timesteps)), desc='Sampling t', total=self.num_timesteps):
+ img = self.p_sample(img, torch.full((b,), i, device=device, dtype=torch.long),
+ clip_denoised=self.clip_denoised)
+ if i % self.log_every_t == 0 or i == self.num_timesteps - 1:
+ intermediates.append(img)
+ if return_intermediates:
+ return img, intermediates
+ return img
+
+ @torch.no_grad()
+ def sample(self, batch_size=16, return_intermediates=False):
+ image_size = self.image_size
+ channels = self.channels
+ return self.p_sample_loop((batch_size, channels, image_size, image_size),
+ return_intermediates=return_intermediates)
+
+ def q_sample(self, x_start, t, noise=None):
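+        # closed-form forward diffusion: x_t = sqrt(alphas_cumprod[t]) * x_0 + sqrt(1 - alphas_cumprod[t]) * noise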
+ noise = default(noise, lambda: torch.randn_like(x_start))
+ return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start +
+ extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise)
+
+ def get_loss(self, pred, target, mean=True):
+ if self.loss_type == 'l1':
+ loss = (target - pred).abs()
+ if mean:
+ loss = loss.mean()
+ elif self.loss_type == 'l2':
+ if mean:
+ loss = torch.nn.functional.mse_loss(target, pred)
+ else:
+ loss = torch.nn.functional.mse_loss(target, pred, reduction='none')
+ else:
+ raise NotImplementedError("unknown loss type '{loss_type}'")
+
+ return loss
+
+ def p_losses(self, x_start, t, noise=None):
+ noise = default(noise, lambda: torch.randn_like(x_start))
+ x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
+ model_out = self.model(x_noisy, t)
+
+ loss_dict = {}
+ if self.parameterization == "eps":
+ target = noise
+ elif self.parameterization == "x0":
+ target = x_start
+ else:
+ raise NotImplementedError(f"Paramterization {self.parameterization} not yet supported")
+
+ loss = self.get_loss(model_out, target, mean=False).mean(dim=[1, 2, 3])
+
+ log_prefix = 'train' if self.training else 'val'
+
+ loss_dict.update({f'{log_prefix}/loss_simple': loss.mean()})
+ loss_simple = loss.mean() * self.l_simple_weight
+
+ loss_vlb = (self.lvlb_weights[t] * loss).mean()
+ loss_dict.update({f'{log_prefix}/loss_vlb': loss_vlb})
+
+ loss = loss_simple + self.original_elbo_weight * loss_vlb
+
+ loss_dict.update({f'{log_prefix}/loss': loss})
+
+ return loss, loss_dict
+
+ def forward(self, x, *args, **kwargs):
+ # b, c, h, w, device, img_size, = *x.shape, x.device, self.image_size
+ # assert h == img_size and w == img_size, f'height and width of image must be {img_size}'
+ t = torch.randint(0, self.num_timesteps, (x.shape[0],), device=self.device).long()
+ return self.p_losses(x, t, *args, **kwargs)
+
+ def get_input(self, batch, k):
+ x = batch[k]
+ if len(x.shape) == 3:
+ x = x[..., None]
+ x = rearrange(x, 'b h w c -> b c h w')
+ x = x.to(memory_format=torch.contiguous_format).float()
+ return x
+
+ def shared_step(self, batch):
+ x = self.get_input(batch, self.first_stage_key)
+ loss, loss_dict = self(x)
+ return loss, loss_dict
+
+ def training_step(self, batch, batch_idx):
+ loss, loss_dict = self.shared_step(batch)
+
+ self.log_dict(loss_dict, prog_bar=True,
+ logger=True, on_step=True, on_epoch=True)
+
+ self.log("global_step", self.global_step,
+ prog_bar=True, logger=True, on_step=True, on_epoch=False)
+
+ if self.use_scheduler:
+ lr = self.optimizers().param_groups[0]['lr']
+ self.log('lr_abs', lr, prog_bar=True, logger=True, on_step=True, on_epoch=False)
+
+ return loss
+
+ @torch.no_grad()
+ def validation_step(self, batch, batch_idx):
+ _, loss_dict_no_ema = self.shared_step(batch)
+ with self.ema_scope():
+ _, loss_dict_ema = self.shared_step(batch)
+ loss_dict_ema = {key + '_ema': loss_dict_ema[key] for key in loss_dict_ema}
+ self.log_dict(loss_dict_no_ema, prog_bar=False, logger=True, on_step=False, on_epoch=True)
+ self.log_dict(loss_dict_ema, prog_bar=False, logger=True, on_step=False, on_epoch=True)
+
+ def on_train_batch_end(self, *args, **kwargs):
+ if self.use_ema:
+ self.model_ema(self.model)
+
+ def _get_rows_from_list(self, samples):
+ n_imgs_per_row = len(samples)
+ denoise_grid = rearrange(samples, 'n b c h w -> b n c h w')
+ denoise_grid = rearrange(denoise_grid, 'b n c h w -> (b n) c h w')
+ denoise_grid = make_grid(denoise_grid, nrow=n_imgs_per_row)
+ return denoise_grid
+
+ @torch.no_grad()
+ def log_images(self, batch, N=8, n_row=2, sample=True, return_keys=None, **kwargs):
+ log = dict()
+ x = self.get_input(batch, self.first_stage_key)
+ N = min(x.shape[0], N)
+ n_row = min(x.shape[0], n_row)
+ x = x.to(self.device)[:N]
+ log["inputs"] = x
+
+ # get diffusion row
+ diffusion_row = list()
+ x_start = x[:n_row]
+
+ for t in range(self.num_timesteps):
+ if t % self.log_every_t == 0 or t == self.num_timesteps - 1:
+ t = repeat(torch.tensor([t]), '1 -> b', b=n_row)
+ t = t.to(self.device).long()
+ noise = torch.randn_like(x_start)
+ x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
+ diffusion_row.append(x_noisy)
+
+ log["diffusion_row"] = self._get_rows_from_list(diffusion_row)
+
+ if sample:
+ # get denoise row
+ with self.ema_scope("Plotting"):
+ samples, denoise_row = self.sample(batch_size=N, return_intermediates=True)
+
+ log["samples"] = samples
+ log["denoise_row"] = self._get_rows_from_list(denoise_row)
+
+ if return_keys:
+ if np.intersect1d(list(log.keys()), return_keys).shape[0] == 0:
+ return log
+ else:
+ return {key: log[key] for key in return_keys}
+ return log
+
+ def configure_optimizers(self):
+ lr = self.learning_rate
+ params = list(self.model.parameters())
+ if self.learn_logvar:
+ params = params + [self.logvar]
+ opt = torch.optim.AdamW(params, lr=lr)
+ return opt
+
+
+class LatentDiffusion(DDPM):
+ """main class"""
+ def __init__(self,
+ first_stage_config,
+ cond_stage_config,
+ num_timesteps_cond=None,
+ cond_stage_key="image",
+ cond_stage_trainable=False,
+ concat_mode=True,
+ cond_stage_forward=None,
+ conditioning_key=None,
+ scale_factor=1.0,
+ scale_by_std=False,
+ *args, **kwargs):
+ self.num_timesteps_cond = default(num_timesteps_cond, 1)
+ self.scale_by_std = scale_by_std
+ assert self.num_timesteps_cond <= kwargs['timesteps']
+ # for backwards compatibility after implementation of DiffusionWrapper
+ if conditioning_key is None:
+ conditioning_key = 'concat' if concat_mode else 'crossattn'
+ if cond_stage_config == '__is_unconditional__':
+ conditioning_key = None
+ ckpt_path = kwargs.pop("ckpt_path", None)
+ ignore_keys = kwargs.pop("ignore_keys", [])
+ super().__init__(conditioning_key=conditioning_key, *args, **kwargs)
+ self.concat_mode = concat_mode
+ self.cond_stage_trainable = cond_stage_trainable
+ self.cond_stage_key = cond_stage_key
+ try:
+ self.num_downs = len(first_stage_config.params.ddconfig.ch_mult) - 1
+ except:
+ self.num_downs = 0
+ if not scale_by_std:
+ self.scale_factor = scale_factor
+ else:
+ self.register_buffer('scale_factor', torch.tensor(scale_factor))
+ self.instantiate_first_stage(first_stage_config)
+ self.instantiate_cond_stage(cond_stage_config)
+ self.cond_stage_forward = cond_stage_forward
+ self.clip_denoised = False
+ self.bbox_tokenizer = None
+
+ self.restarted_from_ckpt = False
+ if ckpt_path is not None:
+ self.init_from_ckpt(ckpt_path, ignore_keys)
+ self.restarted_from_ckpt = True
+
+ def make_cond_schedule(self, ):
+ self.cond_ids = torch.full(size=(self.num_timesteps,), fill_value=self.num_timesteps - 1, dtype=torch.long)
+ ids = torch.round(torch.linspace(0, self.num_timesteps - 1, self.num_timesteps_cond)).long()
+ self.cond_ids[:self.num_timesteps_cond] = ids
+
+ @rank_zero_only
+ @torch.no_grad()
+ def on_train_batch_start(self, batch, batch_idx, dataloader_idx):
+ # only for very first batch
+ if self.scale_by_std and self.current_epoch == 0 and self.global_step == 0 and batch_idx == 0 and not self.restarted_from_ckpt:
+ assert self.scale_factor == 1., 'rather not use custom rescaling and std-rescaling simultaneously'
+ # set rescale weight to 1./std of encodings
+ print("### USING STD-RESCALING ###")
+ x = super().get_input(batch, self.first_stage_key)
+ x = x.to(self.device)
+ encoder_posterior = self.encode_first_stage(x)
+ z = self.get_first_stage_encoding(encoder_posterior).detach()
+ del self.scale_factor
+ self.register_buffer('scale_factor', 1. / z.flatten().std())
+ print(f"setting self.scale_factor to {self.scale_factor}")
+ print("### USING STD-RESCALING ###")
+
+ def register_schedule(self,
+ given_betas=None, beta_schedule="linear", timesteps=1000,
+ linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
+ super().register_schedule(given_betas, beta_schedule, timesteps, linear_start, linear_end, cosine_s)
+
+ self.shorten_cond_schedule = self.num_timesteps_cond > 1
+ if self.shorten_cond_schedule:
+ self.make_cond_schedule()
+
+ def instantiate_first_stage(self, config):
+ model = instantiate_from_config(config)
+ self.first_stage_model = model.eval()
+ self.first_stage_model.train = disabled_train
+ for param in self.first_stage_model.parameters():
+ param.requires_grad = False
+
+ def instantiate_cond_stage(self, config):
+ if not self.cond_stage_trainable:
+ if config == "__is_first_stage__":
+ print("Using first stage also as cond stage.")
+ self.cond_stage_model = self.first_stage_model
+ elif config == "__is_unconditional__":
+ print(f"Training {self.__class__.__name__} as an unconditional model.")
+ self.cond_stage_model = None
+ # self.be_unconditional = True
+ else:
+ model = instantiate_from_config(config)
+ self.cond_stage_model = model.eval()
+ self.cond_stage_model.train = disabled_train
+ for param in self.cond_stage_model.parameters():
+ param.requires_grad = False
+ else:
+ assert config != '__is_first_stage__'
+ assert config != '__is_unconditional__'
+ model = instantiate_from_config(config)
+ self.cond_stage_model = model
+
+ def _get_denoise_row_from_list(self, samples, desc='', force_no_decoder_quantization=False):
+ denoise_row = []
+ for zd in tqdm(samples, desc=desc):
+ denoise_row.append(self.decode_first_stage(zd.to(self.device),
+ force_not_quantize=force_no_decoder_quantization))
+ n_imgs_per_row = len(denoise_row)
+ denoise_row = torch.stack(denoise_row) # n_log_step, n_row, C, H, W
+ denoise_grid = rearrange(denoise_row, 'n b c h w -> b n c h w')
+ denoise_grid = rearrange(denoise_grid, 'b n c h w -> (b n) c h w')
+ denoise_grid = make_grid(denoise_grid, nrow=n_imgs_per_row)
+ return denoise_grid
+
+ def get_first_stage_encoding(self, encoder_posterior):
+ if isinstance(encoder_posterior, DiagonalGaussianDistribution):
+ z = encoder_posterior.sample()
+ elif isinstance(encoder_posterior, torch.Tensor):
+ z = encoder_posterior
+ else:
+ raise NotImplementedError(f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented")
+ return self.scale_factor * z
+
+ def get_learned_conditioning(self, c):
+ if self.cond_stage_forward is None:
+ if hasattr(self.cond_stage_model, 'encode') and callable(self.cond_stage_model.encode):
+ c = self.cond_stage_model.encode(c)
+ if isinstance(c, DiagonalGaussianDistribution):
+ c = c.mode()
+ else:
+ c = self.cond_stage_model(c)
+ else:
+ assert hasattr(self.cond_stage_model, self.cond_stage_forward)
+ c = getattr(self.cond_stage_model, self.cond_stage_forward)(c)
+ return c
+
+ def meshgrid(self, h, w):
+ y = torch.arange(0, h).view(h, 1, 1).repeat(1, w, 1)
+ x = torch.arange(0, w).view(1, w, 1).repeat(h, 1, 1)
+
+ arr = torch.cat([y, x], dim=-1)
+ return arr
+
+ def delta_border(self, h, w):
+ """
+ :param h: height
+ :param w: width
+        :return: normalized distance to the image border,
+         with min distance = 0 at the border and max distance = 0.5 at the image center
+ """
+ lower_right_corner = torch.tensor([h - 1, w - 1]).view(1, 1, 2)
+ arr = self.meshgrid(h, w) / lower_right_corner
+ dist_left_up = torch.min(arr, dim=-1, keepdims=True)[0]
+ dist_right_down = torch.min(1 - arr, dim=-1, keepdims=True)[0]
+ edge_dist = torch.min(torch.cat([dist_left_up, dist_right_down], dim=-1), dim=-1)[0]
+ return edge_dist
+
+ def get_weighting(self, h, w, Ly, Lx, device):
+ weighting = self.delta_border(h, w)
+ weighting = torch.clip(weighting, self.split_input_params["clip_min_weight"],
+ self.split_input_params["clip_max_weight"], )
+ weighting = weighting.view(1, h * w, 1).repeat(1, 1, Ly * Lx).to(device)
+
+ if self.split_input_params["tie_braker"]:
+ L_weighting = self.delta_border(Ly, Lx)
+ L_weighting = torch.clip(L_weighting,
+ self.split_input_params["clip_min_tie_weight"],
+ self.split_input_params["clip_max_tie_weight"])
+
+ L_weighting = L_weighting.view(1, 1, Ly * Lx).to(device)
+ weighting = weighting * L_weighting
+ return weighting
+
+ def get_fold_unfold(self, x, kernel_size, stride, uf=1, df=1): # todo load once not every time, shorten code
+ """
+ :param x: img of size (bs, c, h, w)
+ :return: fold and unfold operators, plus the normalization map and per-crop weighting used to
+ stitch overlapping crops of size kernel_size back together
+ """
+ bs, nc, h, w = x.shape
+
+ # number of crops in image
+ Ly = (h - kernel_size[0]) // stride[0] + 1
+ Lx = (w - kernel_size[1]) // stride[1] + 1
+
+ if uf == 1 and df == 1:
+ fold_params = dict(kernel_size=kernel_size, dilation=1, padding=0, stride=stride)
+ unfold = torch.nn.Unfold(**fold_params)
+
+ fold = torch.nn.Fold(output_size=x.shape[2:], **fold_params)
+
+ weighting = self.get_weighting(kernel_size[0], kernel_size[1], Ly, Lx, x.device).to(x.dtype)
+ normalization = fold(weighting).view(1, 1, h, w) # normalizes the overlap
+ weighting = weighting.view((1, 1, kernel_size[0], kernel_size[1], Ly * Lx))
+
+ elif uf > 1 and df == 1:
+ fold_params = dict(kernel_size=kernel_size, dilation=1, padding=0, stride=stride)
+ unfold = torch.nn.Unfold(**fold_params)
+
+ fold_params2 = dict(kernel_size=(kernel_size[0] * uf, kernel_size[1] * uf),
+ dilation=1, padding=0,
+ stride=(stride[0] * uf, stride[1] * uf))
+ fold = torch.nn.Fold(output_size=(x.shape[2] * uf, x.shape[3] * uf), **fold_params2)
+
+ weighting = self.get_weighting(kernel_size[0] * uf, kernel_size[1] * uf, Ly, Lx, x.device).to(x.dtype)
+ normalization = fold(weighting).view(1, 1, h * uf, w * uf) # normalizes the overlap
+ weighting = weighting.view((1, 1, kernel_size[0] * uf, kernel_size[1] * uf, Ly * Lx))
+
+ elif df > 1 and uf == 1:
+ fold_params = dict(kernel_size=kernel_size, dilation=1, padding=0, stride=stride)
+ unfold = torch.nn.Unfold(**fold_params)
+
+ fold_params2 = dict(kernel_size=(kernel_size[0] // df, kernel_size[1] // df),
+ dilation=1, padding=0,
+ stride=(stride[0] // df, stride[1] // df))
+ fold = torch.nn.Fold(output_size=(x.shape[2] // df, x.shape[3] // df), **fold_params2)
+
+ weighting = self.get_weighting(kernel_size[0] // df, kernel_size[1] // df, Ly, Lx, x.device).to(x.dtype)
+ normalization = fold(weighting).view(1, 1, h // df, w // df) # normalizes the overlap
+ weighting = weighting.view((1, 1, kernel_size[0] // df, kernel_size[1] // df, Ly * Lx))
+
+ else:
+ raise NotImplementedError
+
+ return fold, unfold, normalization, weighting
+
+ @torch.no_grad()
+ def get_input(self, batch, k, return_first_stage_outputs=False, force_c_encode=False,
+ cond_key=None, return_original_cond=False, bs=None):
+ x = super().get_input(batch, k)
+ if bs is not None:
+ x = x[:bs]
+ x = x.to(self.device)
+ encoder_posterior = self.encode_first_stage(x)
+ z = self.get_first_stage_encoding(encoder_posterior).detach()
+
+ if self.model.conditioning_key is not None:
+ if cond_key is None:
+ cond_key = self.cond_stage_key
+ if cond_key != self.first_stage_key:
+ if cond_key in ['caption', 'coordinates_bbox']:
+ xc = batch[cond_key]
+ elif cond_key == 'class_label':
+ xc = batch
+ else:
+ xc = super().get_input(batch, cond_key).to(self.device)
+ else:
+ xc = x
+ if not self.cond_stage_trainable or force_c_encode:
+ if isinstance(xc, dict) or isinstance(xc, list):
+ # import pudb; pudb.set_trace()
+ c = self.get_learned_conditioning(xc)
+ else:
+ c = self.get_learned_conditioning(xc.to(self.device))
+ else:
+ c = xc
+ if bs is not None:
+ c = c[:bs]
+
+ if self.use_positional_encodings:
+ pos_x, pos_y = self.compute_latent_shifts(batch)
+ ckey = __conditioning_keys__[self.model.conditioning_key]
+ c = {ckey: c, 'pos_x': pos_x, 'pos_y': pos_y}
+
+ else:
+ c = None
+ xc = None
+ if self.use_positional_encodings:
+ pos_x, pos_y = self.compute_latent_shifts(batch)
+ c = {'pos_x': pos_x, 'pos_y': pos_y}
+ out = [z, c]
+ if return_first_stage_outputs:
+ xrec = self.decode_first_stage(z)
+ out.extend([x, xrec])
+ if return_original_cond:
+ out.append(xc)
+ return out
+
+ @torch.no_grad()
+ def decode_first_stage(self, z, predict_cids=False, force_not_quantize=False):
+ if predict_cids:
+ if z.dim() == 4:
+ z = torch.argmax(z.exp(), dim=1).long()
+ z = self.first_stage_model.quantize.get_codebook_entry(z, shape=None)
+ z = rearrange(z, 'b h w c -> b c h w').contiguous()
+
+ z = 1. / self.scale_factor * z
+
+ if hasattr(self, "split_input_params"):
+ if self.split_input_params["patch_distributed_vq"]:
+ ks = self.split_input_params["ks"] # eg. (128, 128)
+ stride = self.split_input_params["stride"] # eg. (64, 64)
+ uf = self.split_input_params["vqf"]
+ bs, nc, h, w = z.shape
+ if ks[0] > h or ks[1] > w:
+ ks = (min(ks[0], h), min(ks[1], w))
+ print("reducing Kernel")
+
+ if stride[0] > h or stride[1] > w:
+ stride = (min(stride[0], h), min(stride[1], w))
+ print("reducing stride")
+
+ fold, unfold, normalization, weighting = self.get_fold_unfold(z, ks, stride, uf=uf)
+
+ z = unfold(z) # (bn, nc * prod(**ks), L)
+ # 1. Reshape to img shape
+ z = z.view((z.shape[0], -1, ks[0], ks[1], z.shape[-1])) # (bn, nc, ks[0], ks[1], L )
+
+ # 2. apply model loop over last dim
+ if isinstance(self.first_stage_model, VQModelInterface):
+ output_list = [self.first_stage_model.decode(z[:, :, :, :, i],
+ force_not_quantize=predict_cids or force_not_quantize)
+ for i in range(z.shape[-1])]
+ else:
+
+ output_list = [self.first_stage_model.decode(z[:, :, :, :, i])
+ for i in range(z.shape[-1])]
+
+ o = torch.stack(output_list, axis=-1) # (bn, nc, ks[0], ks[1], L)
+ o = o * weighting
+ # Reverse 1. reshape to img shape
+ o = o.view((o.shape[0], -1, o.shape[-1])) # (bn, nc * ks[0] * ks[1], L)
+ # stitch crops together
+ decoded = fold(o)
+ decoded = decoded / normalization # norm is shape (1, 1, h, w)
+ return decoded
+ else:
+ if isinstance(self.first_stage_model, VQModelInterface):
+ return self.first_stage_model.decode(z, force_not_quantize=predict_cids or force_not_quantize)
+ else:
+ return self.first_stage_model.decode(z)
+
+ else:
+ if isinstance(self.first_stage_model, VQModelInterface):
+ return self.first_stage_model.decode(z, force_not_quantize=predict_cids or force_not_quantize)
+ else:
+ return self.first_stage_model.decode(z)
+
+ # same as above but without decorator
+ def differentiable_decode_first_stage(self, z, predict_cids=False, force_not_quantize=False):
+ if predict_cids:
+ if z.dim() == 4:
+ z = torch.argmax(z.exp(), dim=1).long()
+ z = self.first_stage_model.quantize.get_codebook_entry(z, shape=None)
+ z = rearrange(z, 'b h w c -> b c h w').contiguous()
+
+ z = 1. / self.scale_factor * z
+
+ if hasattr(self, "split_input_params"):
+ if self.split_input_params["patch_distributed_vq"]:
+ ks = self.split_input_params["ks"] # eg. (128, 128)
+ stride = self.split_input_params["stride"] # eg. (64, 64)
+ uf = self.split_input_params["vqf"]
+ bs, nc, h, w = z.shape
+ if ks[0] > h or ks[1] > w:
+ ks = (min(ks[0], h), min(ks[1], w))
+ print("reducing Kernel")
+
+ if stride[0] > h or stride[1] > w:
+ stride = (min(stride[0], h), min(stride[1], w))
+ print("reducing stride")
+
+ fold, unfold, normalization, weighting = self.get_fold_unfold(z, ks, stride, uf=uf)
+
+ z = unfold(z) # (bn, nc * prod(**ks), L)
+ # 1. Reshape to img shape
+ z = z.view((z.shape[0], -1, ks[0], ks[1], z.shape[-1])) # (bn, nc, ks[0], ks[1], L )
+
+ # 2. apply model loop over last dim
+ if isinstance(self.first_stage_model, VQModelInterface):
+ output_list = [self.first_stage_model.decode(z[:, :, :, :, i],
+ force_not_quantize=predict_cids or force_not_quantize)
+ for i in range(z.shape[-1])]
+ else:
+
+ output_list = [self.first_stage_model.decode(z[:, :, :, :, i])
+ for i in range(z.shape[-1])]
+
+ o = torch.stack(output_list, axis=-1) # (bn, nc, ks[0], ks[1], L)
+ o = o * weighting
+ # Reverse 1. reshape to img shape
+ o = o.view((o.shape[0], -1, o.shape[-1])) # (bn, nc * ks[0] * ks[1], L)
+ # stitch crops together
+ decoded = fold(o)
+ decoded = decoded / normalization # norm is shape (1, 1, h, w)
+ return decoded
+ else:
+ if isinstance(self.first_stage_model, VQModelInterface):
+ return self.first_stage_model.decode(z, force_not_quantize=predict_cids or force_not_quantize)
+ else:
+ return self.first_stage_model.decode(z)
+
+ else:
+ if isinstance(self.first_stage_model, VQModelInterface):
+ return self.first_stage_model.decode(z, force_not_quantize=predict_cids or force_not_quantize)
+ else:
+ return self.first_stage_model.decode(z)
+
+ @torch.no_grad()
+ def encode_first_stage(self, x):
+ if hasattr(self, "split_input_params"):
+ if self.split_input_params["patch_distributed_vq"]:
+ ks = self.split_input_params["ks"] # eg. (128, 128)
+ stride = self.split_input_params["stride"] # eg. (64, 64)
+ df = self.split_input_params["vqf"]
+ self.split_input_params['original_image_size'] = x.shape[-2:]
+ bs, nc, h, w = x.shape
+ if ks[0] > h or ks[1] > w:
+ ks = (min(ks[0], h), min(ks[1], w))
+ print("reducing Kernel")
+
+ if stride[0] > h or stride[1] > w:
+ stride = (min(stride[0], h), min(stride[1], w))
+ print("reducing stride")
+
+ fold, unfold, normalization, weighting = self.get_fold_unfold(x, ks, stride, df=df)
+ z = unfold(x) # (bn, nc * prod(**ks), L)
+ # Reshape to img shape
+ z = z.view((z.shape[0], -1, ks[0], ks[1], z.shape[-1])) # (bn, nc, ks[0], ks[1], L )
+
+ output_list = [self.first_stage_model.encode(z[:, :, :, :, i])
+ for i in range(z.shape[-1])]
+
+ o = torch.stack(output_list, axis=-1)
+ o = o * weighting
+
+ # Reverse reshape to img shape
+ o = o.view((o.shape[0], -1, o.shape[-1])) # (bn, nc * ks[0] * ks[1], L)
+ # stitch crops together
+ decoded = fold(o)
+ decoded = decoded / normalization
+ return decoded
+
+ else:
+ return self.first_stage_model.encode(x)
+ else:
+ return self.first_stage_model.encode(x)
+
+ def shared_step(self, batch, **kwargs):
+ x, c = self.get_input(batch, self.first_stage_key)
+ loss = self(x, c)
+ return loss
+
+ def forward(self, x, c, *args, **kwargs):
+ t = torch.randint(0, self.num_timesteps, (x.shape[0],), device=self.device).long()
+ if self.model.conditioning_key is not None:
+ assert c is not None
+ if self.cond_stage_trainable:
+ c = self.get_learned_conditioning(c)
+ if self.shorten_cond_schedule: # TODO: drop this option
+ tc = self.cond_ids[t].to(self.device)
+ c = self.q_sample(x_start=c, t=tc, noise=torch.randn_like(c.float()))
+ return self.p_losses(x, c, t, *args, **kwargs)
+
+ def _rescale_annotations(self, bboxes, crop_coordinates): # TODO: move to dataset
+ def rescale_bbox(bbox):
+ x0 = clamp((bbox[0] - crop_coordinates[0]) / crop_coordinates[2])
+ y0 = clamp((bbox[1] - crop_coordinates[1]) / crop_coordinates[3])
+ w = min(bbox[2] / crop_coordinates[2], 1 - x0)
+ h = min(bbox[3] / crop_coordinates[3], 1 - y0)
+ return x0, y0, w, h
+
+ return [rescale_bbox(b) for b in bboxes]
+
+ def apply_model(self, x_noisy, t, cond, return_ids=False):
+
+ if isinstance(cond, dict):
+ # hybrid case, cond is expected to be a dict
+ pass
+ else:
+ if not isinstance(cond, list):
+ cond = [cond]
+ key = 'c_concat' if self.model.conditioning_key == 'concat' else 'c_crossattn'
+ cond = {key: cond}
+
+ if hasattr(self, "split_input_params"):
+ assert len(cond) == 1 # todo can only deal with one conditioning atm
+ assert not return_ids
+ ks = self.split_input_params["ks"] # eg. (128, 128)
+ stride = self.split_input_params["stride"] # eg. (64, 64)
+
+ h, w = x_noisy.shape[-2:]
+
+ fold, unfold, normalization, weighting = self.get_fold_unfold(x_noisy, ks, stride)
+
+ z = unfold(x_noisy) # (bn, nc * prod(**ks), L)
+ # Reshape to img shape
+ z = z.view((z.shape[0], -1, ks[0], ks[1], z.shape[-1])) # (bn, nc, ks[0], ks[1], L )
+ z_list = [z[:, :, :, :, i] for i in range(z.shape[-1])]
+
+ if self.cond_stage_key in ["image", "LR_image", "segmentation",
+ 'bbox_img'] and self.model.conditioning_key: # todo check for completeness
+ c_key = next(iter(cond.keys())) # get key
+ c = next(iter(cond.values())) # get value
+ assert (len(c) == 1) # todo extend to list with more than one elem
+ c = c[0] # get element
+
+ c = unfold(c)
+ c = c.view((c.shape[0], -1, ks[0], ks[1], c.shape[-1])) # (bn, nc, ks[0], ks[1], L )
+
+ cond_list = [{c_key: [c[:, :, :, :, i]]} for i in range(c.shape[-1])]
+
+ elif self.cond_stage_key == 'coordinates_bbox':
+ assert 'original_image_size' in self.split_input_params, 'BoundingBoxRescaling is missing original_image_size'
+
+ # assuming padding of unfold is always 0 and its dilation is always 1
+ n_patches_per_row = int((w - ks[0]) / stride[0] + 1)
+ full_img_h, full_img_w = self.split_input_params['original_image_size']
+ # as we are operating on latents, we need the factor from the original image size to the
+ # spatial latent size to properly rescale the crops for regenerating the bbox annotations
+ num_downs = self.first_stage_model.encoder.num_resolutions - 1
+ rescale_latent = 2 ** (num_downs)
+
+ # get top-left positions of the patches in the format expected by the bbox tokenizer; therefore we
+ # need to rescale the top-left patch coordinates to lie in (0, 1)
+ tl_patch_coordinates = [(rescale_latent * stride[0] * (patch_nr % n_patches_per_row) / full_img_w,
+ rescale_latent * stride[1] * (patch_nr // n_patches_per_row) / full_img_h)
+ for patch_nr in range(z.shape[-1])]
+
+ # patch_limits are tl_coord, width and height coordinates as (x_tl, y_tl, h, w)
+ patch_limits = [(x_tl, y_tl,
+ rescale_latent * ks[0] / full_img_w,
+ rescale_latent * ks[1] / full_img_h) for x_tl, y_tl in tl_patch_coordinates]
+ # patch_values = [(np.arange(x_tl,min(x_tl+ks, 1.)),np.arange(y_tl,min(y_tl+ks, 1.))) for x_tl, y_tl in tl_patch_coordinates]
+
+ # tokenize crop coordinates for the bounding boxes of the respective patches
+ patch_limits_tknzd = [torch.LongTensor(self.bbox_tokenizer._crop_encoder(bbox))[None].to(self.device)
+ for bbox in patch_limits] # list of length l with tensors of shape (1, 2)
+ print(patch_limits_tknzd[0].shape)
+ # cut tknzd crop position from conditioning
+ assert isinstance(cond, dict), 'cond must be dict to be fed into model'
+ cut_cond = cond['c_crossattn'][0][..., :-2].to(self.device)
+ print(cut_cond.shape)
+
+ adapted_cond = torch.stack([torch.cat([cut_cond, p], dim=1) for p in patch_limits_tknzd])
+ adapted_cond = rearrange(adapted_cond, 'l b n -> (l b) n')
+ print(adapted_cond.shape)
+ adapted_cond = self.get_learned_conditioning(adapted_cond)
+ print(adapted_cond.shape)
+ adapted_cond = rearrange(adapted_cond, '(l b) n d -> l b n d', l=z.shape[-1])
+ print(adapted_cond.shape)
+
+ cond_list = [{'c_crossattn': [e]} for e in adapted_cond]
+
+ else:
+ cond_list = [cond for i in range(z.shape[-1])] # Todo make this more efficient
+
+ # apply model by loop over crops
+ output_list = [self.model(z_list[i], t, **cond_list[i]) for i in range(z.shape[-1])]
+ assert not isinstance(output_list[0],
+ tuple) # todo cant deal with multiple model outputs check this never happens
+
+ o = torch.stack(output_list, axis=-1)
+ o = o * weighting
+ # Reverse reshape to img shape
+ o = o.view((o.shape[0], -1, o.shape[-1])) # (bn, nc * ks[0] * ks[1], L)
+ # stitch crops together
+ x_recon = fold(o) / normalization
+
+ else:
+ x_recon = self.model(x_noisy, t, **cond)
+
+ if isinstance(x_recon, tuple) and not return_ids:
+ return x_recon[0]
+ else:
+ return x_recon
+
+ def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
+ return (extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart) / \
+ extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
+
+ def _prior_bpd(self, x_start):
+ """
+ Get the prior KL term for the variational lower-bound, measured in
+ bits-per-dim.
+ This term can't be optimized, as it only depends on the encoder.
+ :param x_start: the [N x C x ...] tensor of inputs.
+ :return: a batch of [N] KL values (in bits), one per batch element.
+ """
+ batch_size = x_start.shape[0]
+ t = torch.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
+ qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
+ kl_prior = normal_kl(mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0)
+ return mean_flat(kl_prior) / np.log(2.0)
+
+ def p_losses(self, x_start, cond, t, noise=None):
+ noise = default(noise, lambda: torch.randn_like(x_start))
+ x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
+ model_output = self.apply_model(x_noisy, t, cond)
+
+ loss_dict = {}
+ prefix = 'train' if self.training else 'val'
+
+ if self.parameterization == "x0":
+ target = x_start
+ elif self.parameterization == "eps":
+ target = noise
+ else:
+ raise NotImplementedError()
+
+ loss_simple = self.get_loss(model_output, target, mean=False).mean([1, 2, 3])
+ loss_dict.update({f'{prefix}/loss_simple': loss_simple.mean()})
+
+ logvar_t = self.logvar[t].to(self.device)
+ loss = loss_simple / torch.exp(logvar_t) + logvar_t
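+ # Gaussian-NLL-style weighting: scaling by exp(-logvar_t) and adding logvar_t lets an (optionally learned)
+ # per-timestep log-variance trade off against the simple reconstruction loss.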
+ # loss = loss_simple / torch.exp(self.logvar) + self.logvar
+ if self.learn_logvar:
+ loss_dict.update({f'{prefix}/loss_gamma': loss.mean()})
+ loss_dict.update({'logvar': self.logvar.data.mean()})
+
+ loss = self.l_simple_weight * loss.mean()
+
+ loss_vlb = self.get_loss(model_output, target, mean=False).mean(dim=(1, 2, 3))
+ loss_vlb = (self.lvlb_weights[t] * loss_vlb).mean()
+ loss_dict.update({f'{prefix}/loss_vlb': loss_vlb})
+ loss += (self.original_elbo_weight * loss_vlb)
+ loss_dict.update({f'{prefix}/loss': loss})
+
+ return loss, loss_dict
+
+ def p_mean_variance(self, x, c, t, clip_denoised: bool, return_codebook_ids=False, quantize_denoised=False,
+ return_x0=False, score_corrector=None, corrector_kwargs=None):
+ t_in = t
+ model_out = self.apply_model(x, t_in, c, return_ids=return_codebook_ids)
+
+ if score_corrector is not None:
+ assert self.parameterization == "eps"
+ model_out = score_corrector.modify_score(self, model_out, x, t, c, **corrector_kwargs)
+
+ if return_codebook_ids:
+ model_out, logits = model_out
+
+ if self.parameterization == "eps":
+ x_recon = self.predict_start_from_noise(x, t=t, noise=model_out)
+ elif self.parameterization == "x0":
+ x_recon = model_out
+ else:
+ raise NotImplementedError()
+
+ if clip_denoised:
+ x_recon.clamp_(-1., 1.)
+ if quantize_denoised:
+ x_recon, _, [_, _, indices] = self.first_stage_model.quantize(x_recon)
+ model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
+ if return_codebook_ids:
+ return model_mean, posterior_variance, posterior_log_variance, logits
+ elif return_x0:
+ return model_mean, posterior_variance, posterior_log_variance, x_recon
+ else:
+ return model_mean, posterior_variance, posterior_log_variance
+
+ @torch.no_grad()
+ def p_sample(self, x, c, t, clip_denoised=False, repeat_noise=False,
+ return_codebook_ids=False, quantize_denoised=False, return_x0=False,
+ temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None):
+ b, *_, device = *x.shape, x.device
+ outputs = self.p_mean_variance(x=x, c=c, t=t, clip_denoised=clip_denoised,
+ return_codebook_ids=return_codebook_ids,
+ quantize_denoised=quantize_denoised,
+ return_x0=return_x0,
+ score_corrector=score_corrector, corrector_kwargs=corrector_kwargs)
+ if return_codebook_ids:
+ raise DeprecationWarning("Support dropped.")
+ model_mean, _, model_log_variance, logits = outputs
+ elif return_x0:
+ model_mean, _, model_log_variance, x0 = outputs
+ else:
+ model_mean, _, model_log_variance = outputs
+
+ noise = noise_like(x.shape, device, repeat_noise) * temperature
+ if noise_dropout > 0.:
+ noise = torch.nn.functional.dropout(noise, p=noise_dropout)
+ # no noise when t == 0
+ nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1)))
+
+ if return_codebook_ids:
+ return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise, logits.argmax(dim=1)
+ if return_x0:
+ return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise, x0
+ else:
+ return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
+
+ @torch.no_grad()
+ def progressive_denoising(self, cond, shape, verbose=True, callback=None, quantize_denoised=False,
+ img_callback=None, mask=None, x0=None, temperature=1., noise_dropout=0.,
+ score_corrector=None, corrector_kwargs=None, batch_size=None, x_T=None, start_T=None,
+ log_every_t=None):
+ if not log_every_t:
+ log_every_t = self.log_every_t
+ timesteps = self.num_timesteps
+ if batch_size is not None:
+ b = batch_size if batch_size is not None else shape[0]
+ shape = [batch_size] + list(shape)
+ else:
+ b = batch_size = shape[0]
+ if x_T is None:
+ img = torch.randn(shape, device=self.device)
+ else:
+ img = x_T
+ intermediates = []
+ if cond is not None:
+ if isinstance(cond, dict):
+ cond = {key: cond[key][:batch_size] if not isinstance(cond[key], list) else
+ list(map(lambda x: x[:batch_size], cond[key])) for key in cond}
+ else:
+ cond = [c[:batch_size] for c in cond] if isinstance(cond, list) else cond[:batch_size]
+
+ if start_T is not None:
+ timesteps = min(timesteps, start_T)
+ iterator = tqdm(reversed(range(0, timesteps)), desc='Progressive Generation',
+ total=timesteps) if verbose else reversed(
+ range(0, timesteps))
+ if type(temperature) == float:
+ temperature = [temperature] * timesteps
+
+ for i in iterator:
+ ts = torch.full((b,), i, device=self.device, dtype=torch.long)
+ if self.shorten_cond_schedule:
+ assert self.model.conditioning_key != 'hybrid'
+ tc = self.cond_ids[ts].to(cond.device)
+ cond = self.q_sample(x_start=cond, t=tc, noise=torch.randn_like(cond))
+
+ img, x0_partial = self.p_sample(img, cond, ts,
+ clip_denoised=self.clip_denoised,
+ quantize_denoised=quantize_denoised, return_x0=True,
+ temperature=temperature[i], noise_dropout=noise_dropout,
+ score_corrector=score_corrector, corrector_kwargs=corrector_kwargs)
+ if mask is not None:
+ assert x0 is not None
+ img_orig = self.q_sample(x0, ts)
+ img = img_orig * mask + (1. - mask) * img
+
+ if i % log_every_t == 0 or i == timesteps - 1:
+ intermediates.append(x0_partial)
+ if callback: callback(i)
+ if img_callback: img_callback(img, i)
+ return img, intermediates
+
+ @torch.no_grad()
+ def p_sample_loop(self, cond, shape, return_intermediates=False,
+ x_T=None, verbose=True, callback=None, timesteps=None, quantize_denoised=False,
+ mask=None, x0=None, img_callback=None, start_T=None,
+ log_every_t=None):
+
+ if not log_every_t:
+ log_every_t = self.log_every_t
+ device = self.betas.device
+ b = shape[0]
+ if x_T is None:
+ img = torch.randn(shape, device=device)
+ else:
+ img = x_T
+
+ intermediates = [img]
+ if timesteps is None:
+ timesteps = self.num_timesteps
+
+ if start_T is not None:
+ timesteps = min(timesteps, start_T)
+ iterator = tqdm(reversed(range(0, timesteps)), desc='Sampling t', total=timesteps) if verbose else reversed(
+ range(0, timesteps))
+
+ if mask is not None:
+ assert x0 is not None
+ assert x0.shape[2:3] == mask.shape[2:3] # spatial size has to match
+
+ for i in iterator:
+ ts = torch.full((b,), i, device=device, dtype=torch.long)
+ if self.shorten_cond_schedule:
+ assert self.model.conditioning_key != 'hybrid'
+ tc = self.cond_ids[ts].to(cond.device)
+ cond = self.q_sample(x_start=cond, t=tc, noise=torch.randn_like(cond))
+
+ img = self.p_sample(img, cond, ts,
+ clip_denoised=self.clip_denoised,
+ quantize_denoised=quantize_denoised)
+ if mask is not None:
+ img_orig = self.q_sample(x0, ts)
+ img = img_orig * mask + (1. - mask) * img
+
+ if i % log_every_t == 0 or i == timesteps - 1:
+ intermediates.append(img)
+ if callback: callback(i)
+ if img_callback: img_callback(img, i)
+
+ if return_intermediates:
+ return img, intermediates
+ return img
+
+ @torch.no_grad()
+ def sample(self, cond, batch_size=16, return_intermediates=False, x_T=None,
+ verbose=True, timesteps=None, quantize_denoised=False,
+ mask=None, x0=None, shape=None,**kwargs):
+ if shape is None:
+ shape = (batch_size, self.channels, self.image_size, self.image_size)
+ if cond is not None:
+ if isinstance(cond, dict):
+ cond = {key: cond[key][:batch_size] if not isinstance(cond[key], list) else
+ list(map(lambda x: x[:batch_size], cond[key])) for key in cond}
+ else:
+ cond = [c[:batch_size] for c in cond] if isinstance(cond, list) else cond[:batch_size]
+ return self.p_sample_loop(cond,
+ shape,
+ return_intermediates=return_intermediates, x_T=x_T,
+ verbose=verbose, timesteps=timesteps, quantize_denoised=quantize_denoised,
+ mask=mask, x0=x0)
+
+ @torch.no_grad()
+ def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs):
+ if ddim:
+ ddim_sampler = DDIMSampler(self)
+ shape = (self.channels, self.image_size, self.image_size)
+ samples, intermediates = ddim_sampler.sample(ddim_steps, batch_size, shape, cond, verbose=False, **kwargs)
+ else:
+ samples, intermediates = self.sample(cond=cond, batch_size=batch_size, return_intermediates=True, **kwargs)
+ return samples, intermediates
+
+
+ @torch.no_grad()
+ def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=1., return_keys=None,
+ quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True,
+ plot_diffusion_rows=True, **kwargs):
+
+ use_ddim = ddim_steps is not None
+
+ log = dict()
+ z, c, x, xrec, xc = self.get_input(batch, self.first_stage_key,
+ return_first_stage_outputs=True,
+ force_c_encode=True,
+ return_original_cond=True,
+ bs=N)
+ N = min(x.shape[0], N)
+ n_row = min(x.shape[0], n_row)
+ log["inputs"] = x
+ log["reconstruction"] = xrec
+ if self.model.conditioning_key is not None:
+ if hasattr(self.cond_stage_model, "decode"):
+ xc = self.cond_stage_model.decode(c)
+ log["conditioning"] = xc
+ elif self.cond_stage_key in ["caption"]:
+ xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["caption"])
+ log["conditioning"] = xc
+ elif self.cond_stage_key == 'class_label':
+ xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"])
+ log['conditioning'] = xc
+ elif isimage(xc):
+ log["conditioning"] = xc
+ if ismap(xc):
+ log["original_conditioning"] = self.to_rgb(xc)
+
+ if plot_diffusion_rows:
+ # get diffusion row
+ diffusion_row = list()
+ z_start = z[:n_row]
+ for t in range(self.num_timesteps):
+ if t % self.log_every_t == 0 or t == self.num_timesteps - 1:
+ t = repeat(torch.tensor([t]), '1 -> b', b=n_row)
+ t = t.to(self.device).long()
+ noise = torch.randn_like(z_start)
+ z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise)
+ diffusion_row.append(self.decode_first_stage(z_noisy))
+
+ diffusion_row = torch.stack(diffusion_row) # n_log_step, n_row, C, H, W
+ diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w')
+ diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w')
+ diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0])
+ log["diffusion_row"] = diffusion_grid
+
+ if sample:
+ # get denoise row
+ with self.ema_scope("Plotting"):
+ samples, z_denoise_row = self.sample_log(cond=c,batch_size=N,ddim=use_ddim,
+ ddim_steps=ddim_steps,eta=ddim_eta)
+ # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True)
+ x_samples = self.decode_first_stage(samples)
+ log["samples"] = x_samples
+ if plot_denoise_rows:
+ denoise_grid = self._get_denoise_row_from_list(z_denoise_row)
+ log["denoise_row"] = denoise_grid
+
+ if quantize_denoised and not isinstance(self.first_stage_model, AutoencoderKL) and not isinstance(
+ self.first_stage_model, IdentityFirstStage):
+ # also display when quantizing x0 while sampling
+ with self.ema_scope("Plotting Quantized Denoised"):
+ samples, z_denoise_row = self.sample_log(cond=c,batch_size=N,ddim=use_ddim,
+ ddim_steps=ddim_steps,eta=ddim_eta,
+ quantize_denoised=True)
+ # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True,
+ # quantize_denoised=True)
+ x_samples = self.decode_first_stage(samples.to(self.device))
+ log["samples_x0_quantized"] = x_samples
+
+ if inpaint:
+ # make a simple center square
+ b, h, w = z.shape[0], z.shape[2], z.shape[3]
+ mask = torch.ones(N, h, w).to(self.device)
+ # zeros will be filled in
+ mask[:, h // 4:3 * h // 4, w // 4:3 * w // 4] = 0.
+ mask = mask[:, None, ...]
+ with self.ema_scope("Plotting Inpaint"):
+
+ samples, _ = self.sample_log(cond=c,batch_size=N,ddim=use_ddim, eta=ddim_eta,
+ ddim_steps=ddim_steps, x0=z[:N], mask=mask)
+ x_samples = self.decode_first_stage(samples.to(self.device))
+ log["samples_inpainting"] = x_samples
+ log["mask"] = mask
+
+ # outpaint
+ with self.ema_scope("Plotting Outpaint"):
+ samples, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,eta=ddim_eta,
+ ddim_steps=ddim_steps, x0=z[:N], mask=mask)
+ x_samples = self.decode_first_stage(samples.to(self.device))
+ log["samples_outpainting"] = x_samples
+
+ if plot_progressive_rows:
+ with self.ema_scope("Plotting Progressives"):
+ img, progressives = self.progressive_denoising(c,
+ shape=(self.channels, self.image_size, self.image_size),
+ batch_size=N)
+ prog_row = self._get_denoise_row_from_list(progressives, desc="Progressive Generation")
+ log["progressive_row"] = prog_row
+
+ if return_keys:
+ if np.intersect1d(list(log.keys()), return_keys).shape[0] == 0:
+ return log
+ else:
+ return {key: log[key] for key in return_keys}
+ return log
+
+ def configure_optimizers(self):
+ lr = self.learning_rate
+ params = list(self.model.parameters())
+ if self.cond_stage_trainable:
+ print(f"{self.__class__.__name__}: Also optimizing conditioner params!")
+ params = params + list(self.cond_stage_model.parameters())
+ if self.learn_logvar:
+ print('Diffusion model optimizing logvar')
+ params.append(self.logvar)
+ opt = torch.optim.AdamW(params, lr=lr)
+ if self.use_scheduler:
+ assert 'target' in self.scheduler_config
+ scheduler = instantiate_from_config(self.scheduler_config)
+
+ print("Setting up LambdaLR scheduler...")
+ scheduler = [
+ {
+ 'scheduler': LambdaLR(opt, lr_lambda=scheduler.schedule),
+ 'interval': 'step',
+ 'frequency': 1
+ }]
+ return [opt], scheduler
+ return opt
+
+ @torch.no_grad()
+ def to_rgb(self, x):
+ x = x.float()
+ if not hasattr(self, "colorize"):
+ self.colorize = torch.randn(3, x.shape[1], 1, 1).to(x)
+ x = nn.functional.conv2d(x, weight=self.colorize)
+ x = 2. * (x - x.min()) / (x.max() - x.min()) - 1.
+ return x
+
+
+class DiffusionWrapper(pl.LightningModule):
+ def __init__(self, diff_model_config, conditioning_key):
+ super().__init__()
+ self.diffusion_model = instantiate_from_config(diff_model_config)
+ self.conditioning_key = conditioning_key
+ assert self.conditioning_key in [None, 'concat', 'crossattn', 'hybrid', 'adm']
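+ # 'concat' conditions by channel-wise concatenation with the input, 'crossattn' feeds the conditioning as
+ # cross-attention context, 'hybrid' does both, and 'adm' passes it as a class-style label `y`.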
+
+ def forward(self, x, t, c_concat: list = None, c_crossattn: list = None):
+ if self.conditioning_key is None:
+ out = self.diffusion_model(x, t)
+ elif self.conditioning_key == 'concat':
+ xc = torch.cat([x] + c_concat, dim=1)
+ out = self.diffusion_model(xc, t)
+ elif self.conditioning_key == 'crossattn':
+ cc = torch.cat(c_crossattn, 1)
+ out = self.diffusion_model(x, t, context=cc)
+ elif self.conditioning_key == 'hybrid':
+ xc = torch.cat([x] + c_concat, dim=1)
+ cc = torch.cat(c_crossattn, 1)
+ out = self.diffusion_model(xc, t, context=cc)
+ elif self.conditioning_key == 'adm':
+ cc = c_crossattn[0]
+ out = self.diffusion_model(x, t, y=cc)
+ else:
+ raise NotImplementedError()
+
+ return out
+
+
+class Layout2ImgDiffusion(LatentDiffusion):
+ # TODO: move all layout-specific hacks to this class
+ def __init__(self, cond_stage_key, *args, **kwargs):
+ assert cond_stage_key == 'coordinates_bbox', 'Layout2ImgDiffusion only for cond_stage_key="coordinates_bbox"'
+ super().__init__(cond_stage_key=cond_stage_key, *args, **kwargs)
+
+ def log_images(self, batch, N=8, *args, **kwargs):
+ logs = super().log_images(batch=batch, N=N, *args, **kwargs)
+
+ key = 'train' if self.training else 'validation'
+ dset = self.trainer.datamodule.datasets[key]
+ mapper = dset.conditional_builders[self.cond_stage_key]
+
+ bbox_imgs = []
+ map_fn = lambda catno: dset.get_textual_label(dset.get_category_id(catno))
+ for tknzd_bbox in batch[self.cond_stage_key][:N]:
+ bboximg = mapper.plot(tknzd_bbox.detach().cpu(), map_fn, (256, 256))
+ bbox_imgs.append(bboximg)
+
+ cond_img = torch.stack(bbox_imgs, dim=0)
+ logs['bbox_image'] = cond_img
+ return logs
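+
+
+# Example usage (sketch; `ldm` and `c` are assumed to be a fully configured LatentDiffusion instance
+# and its prompt conditioning, not objects defined in this file):
+# samples, _ = ldm.sample_log(cond=c, batch_size=4, ddim=True, ddim_steps=50, eta=0.0)
+# images = ldm.decode_first_stage(samples)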
diff --git a/ldmlib/models/diffusion/plms.py b/ldmlib/models/diffusion/plms.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f7297a1e71e5dffb3008b0ff1cca57569777ada
--- /dev/null
+++ b/ldmlib/models/diffusion/plms.py
@@ -0,0 +1,236 @@
+"""SAMPLING ONLY."""
+
+import torch
+import numpy as np
+from tqdm import tqdm
+from functools import partial
+
+from ldmlib.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like
+
+
+class PLMSSampler(object):
+ def __init__(self, model, schedule="linear", **kwargs):
+ super().__init__()
+ self.model = model
+ self.ddpm_num_timesteps = model.num_timesteps
+ self.schedule = schedule
+
+ def register_buffer(self, name, attr):
+ if type(attr) == torch.Tensor:
+ if attr.device != torch.device("cuda"):
+ attr = attr.to(torch.device("cuda"))
+ setattr(self, name, attr)
+
+ def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
+ if ddim_eta != 0:
+ raise ValueError('ddim_eta must be 0 for PLMS')
+ self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
+ num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose)
+ alphas_cumprod = self.model.alphas_cumprod
+ assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
+ to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
+
+ self.register_buffer('betas', to_torch(self.model.betas))
+ self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
+ self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))
+
+ # calculations for diffusion q(x_t | x_{t-1}) and others
+ self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
+ self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
+ self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
+ self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
+ self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))
+
+ # ddim sampling parameters
+ ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
+ ddim_timesteps=self.ddim_timesteps,
+ eta=ddim_eta,verbose=verbose)
+ self.register_buffer('ddim_sigmas', ddim_sigmas)
+ self.register_buffer('ddim_alphas', ddim_alphas)
+ self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
+ self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
+ sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
+ (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
+ 1 - self.alphas_cumprod / self.alphas_cumprod_prev))
+ self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
+
+ @torch.no_grad()
+ def sample(self,
+ S,
+ batch_size,
+ shape,
+ conditioning=None,
+ callback=None,
+ normals_sequence=None,
+ img_callback=None,
+ quantize_x0=False,
+ eta=0.,
+ mask=None,
+ x0=None,
+ temperature=1.,
+ noise_dropout=0.,
+ score_corrector=None,
+ corrector_kwargs=None,
+ verbose=True,
+ x_T=None,
+ log_every_t=100,
+ unconditional_guidance_scale=1.,
+ unconditional_conditioning=None,
+ # this has to come in the same format as the conditioning, e.g. as encoded tokens, ...
+ **kwargs
+ ):
+ if conditioning is not None:
+ if isinstance(conditioning, dict):
+ cbs = conditioning[list(conditioning.keys())[0]].shape[0]
+ if cbs != batch_size:
+ print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
+ else:
+ if conditioning.shape[0] != batch_size:
+ print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
+
+ self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
+ # sampling
+ C, H, W = shape
+ size = (batch_size, C, H, W)
+ print(f'Data shape for PLMS sampling is {size}')
+
+ samples, intermediates = self.plms_sampling(conditioning, size,
+ callback=callback,
+ img_callback=img_callback,
+ quantize_denoised=quantize_x0,
+ mask=mask, x0=x0,
+ ddim_use_original_steps=False,
+ noise_dropout=noise_dropout,
+ temperature=temperature,
+ score_corrector=score_corrector,
+ corrector_kwargs=corrector_kwargs,
+ x_T=x_T,
+ log_every_t=log_every_t,
+ unconditional_guidance_scale=unconditional_guidance_scale,
+ unconditional_conditioning=unconditional_conditioning,
+ )
+ return samples, intermediates
+
+ @torch.no_grad()
+ def plms_sampling(self, cond, shape,
+ x_T=None, ddim_use_original_steps=False,
+ callback=None, timesteps=None, quantize_denoised=False,
+ mask=None, x0=None, img_callback=None, log_every_t=100,
+ temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
+ unconditional_guidance_scale=1., unconditional_conditioning=None,):
+ device = self.model.betas.device
+ b = shape[0]
+ if x_T is None:
+ img = torch.randn(shape, device=device)
+ else:
+ img = x_T
+
+ if timesteps is None:
+ timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
+ elif timesteps is not None and not ddim_use_original_steps:
+ subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
+ timesteps = self.ddim_timesteps[:subset_end]
+
+ intermediates = {'x_inter': [img], 'pred_x0': [img]}
+ time_range = list(reversed(range(0,timesteps))) if ddim_use_original_steps else np.flip(timesteps)
+ total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
+ print(f"Running PLMS Sampling with {total_steps} timesteps")
+
+ iterator = tqdm(time_range, desc='PLMS Sampler', total=total_steps)
+ old_eps = []
+
+ for i, step in enumerate(iterator):
+ index = total_steps - i - 1
+ ts = torch.full((b,), step, device=device, dtype=torch.long)
+ ts_next = torch.full((b,), time_range[min(i + 1, len(time_range) - 1)], device=device, dtype=torch.long)
+
+ if mask is not None:
+ assert x0 is not None
+ img_orig = self.model.q_sample(x0, ts) # TODO: deterministic forward pass?
+ img = img_orig * mask + (1. - mask) * img
+
+ outs = self.p_sample_plms(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
+ quantize_denoised=quantize_denoised, temperature=temperature,
+ noise_dropout=noise_dropout, score_corrector=score_corrector,
+ corrector_kwargs=corrector_kwargs,
+ unconditional_guidance_scale=unconditional_guidance_scale,
+ unconditional_conditioning=unconditional_conditioning,
+ old_eps=old_eps, t_next=ts_next)
+ img, pred_x0, e_t = outs
+ old_eps.append(e_t)
+ if len(old_eps) >= 4:
+ old_eps.pop(0)
+ if callback: callback(i)
+ if img_callback: img_callback(pred_x0, i)
+
+ if index % log_every_t == 0 or index == total_steps - 1:
+ intermediates['x_inter'].append(img)
+ intermediates['pred_x0'].append(pred_x0)
+
+ return img, intermediates
+
+ @torch.no_grad()
+ def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
+ temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
+ unconditional_guidance_scale=1., unconditional_conditioning=None, old_eps=None, t_next=None):
+ b, *_, device = *x.shape, x.device
+
+ def get_model_output(x, t):
+ if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
+ e_t = self.model.apply_model(x, t, c)
+ else:
+ x_in = torch.cat([x] * 2)
+ t_in = torch.cat([t] * 2)
+ c_in = torch.cat([unconditional_conditioning, c])
+ e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
+ e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
+
+ if score_corrector is not None:
+ assert self.model.parameterization == "eps"
+ e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
+
+ return e_t
+
+ alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
+ alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
+ sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
+ sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
+
+ def get_x_prev_and_pred_x0(e_t, index):
+ # select parameters corresponding to the currently considered timestep
+ a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
+ a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
+ sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
+ sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
+
+ # current prediction for x_0
+ pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
+ if quantize_denoised:
+ pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
+ # direction pointing to x_t
+ dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
+ noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
+ if noise_dropout > 0.:
+ noise = torch.nn.functional.dropout(noise, p=noise_dropout)
+ x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
+ return x_prev, pred_x0
+
+ e_t = get_model_output(x, t)
+ if len(old_eps) == 0:
+ # Pseudo Improved Euler (2nd order)
+ x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index)
+ e_t_next = get_model_output(x_prev, t_next)
+ e_t_prime = (e_t + e_t_next) / 2
+ elif len(old_eps) == 1:
+ # 2nd order Pseudo Linear Multistep (Adams-Bashforth)
+ e_t_prime = (3 * e_t - old_eps[-1]) / 2
+ elif len(old_eps) == 2:
+ # 3rd order Pseudo Linear Multistep (Adams-Bashforth)
+ e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12
+ elif len(old_eps) >= 3:
+ # 4th order Pseudo Linear Multistep (Adams-Bashforth)
+ e_t_prime = (55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]) / 24
+
+ x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index)
+
+ return x_prev, pred_x0, e_t
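+
+
+# Example usage (sketch; `model`, `c` and `uc` are assumed to be a loaded LatentDiffusion and its
+# conditional / unconditional text embeddings, not objects defined in this file):
+# sampler = PLMSSampler(model)
+# samples, _ = sampler.sample(S=50, batch_size=1, shape=(4, 64, 64), conditioning=c, verbose=False,
+# unconditional_guidance_scale=7.5, unconditional_conditioning=uc)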
diff --git a/ldmlib/modules/__pycache__/attention.cpython-38.pyc b/ldmlib/modules/__pycache__/attention.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3c5416fcb362dd79770630d2ca33b18792c04308
Binary files /dev/null and b/ldmlib/modules/__pycache__/attention.cpython-38.pyc differ
diff --git a/ldmlib/modules/__pycache__/x_transformer.cpython-38.pyc b/ldmlib/modules/__pycache__/x_transformer.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a678262c0dc1f09db9dfb209568d1cdb942e840f
Binary files /dev/null and b/ldmlib/modules/__pycache__/x_transformer.cpython-38.pyc differ
diff --git a/ldmlib/modules/attention.py b/ldmlib/modules/attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..d610df2ffd887c7e70cc34e080ff9424d480f20a
--- /dev/null
+++ b/ldmlib/modules/attention.py
@@ -0,0 +1,261 @@
+from inspect import isfunction
+import math
+import torch
+import torch.nn.functional as F
+from torch import nn, einsum
+from einops import rearrange, repeat
+
+from ldmlib.modules.diffusionmodules.util import checkpoint
+
+
+def exists(val):
+ return val is not None
+
+
+def uniq(arr):
+ return {el: True for el in arr}.keys()
+
+
+def default(val, d):
+ if exists(val):
+ return val
+ return d() if isfunction(d) else d
+
+
+def max_neg_value(t):
+ return -torch.finfo(t.dtype).max
+
+
+def init_(tensor):
+ dim = tensor.shape[-1]
+ std = 1 / math.sqrt(dim)
+ tensor.uniform_(-std, std)
+ return tensor
+
+
+# feedforward
+class GEGLU(nn.Module):
+ def __init__(self, dim_in, dim_out):
+ super().__init__()
+ self.proj = nn.Linear(dim_in, dim_out * 2)
+
+ def forward(self, x):
+ x, gate = self.proj(x).chunk(2, dim=-1)
+ return x * F.gelu(gate)
+
+
+class FeedForward(nn.Module):
+ def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
+ super().__init__()
+ inner_dim = int(dim * mult)
+ dim_out = default(dim_out, dim)
+ project_in = nn.Sequential(
+ nn.Linear(dim, inner_dim),
+ nn.GELU()
+ ) if not glu else GEGLU(dim, inner_dim)
+
+ self.net = nn.Sequential(
+ project_in,
+ nn.Dropout(dropout),
+ nn.Linear(inner_dim, dim_out)
+ )
+
+ def forward(self, x):
+ return self.net(x)
+
+
+def zero_module(module):
+ """
+ Zero out the parameters of a module and return it.
+ """
+ for p in module.parameters():
+ p.detach().zero_()
+ return module
+
+
+def Normalize(in_channels):
+ return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+
+
+class LinearAttention(nn.Module):
+ def __init__(self, dim, heads=4, dim_head=32):
+ super().__init__()
+ self.heads = heads
+ hidden_dim = dim_head * heads
+ self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias = False)
+ self.to_out = nn.Conv2d(hidden_dim, dim, 1)
+
+ def forward(self, x):
+ b, c, h, w = x.shape
+ qkv = self.to_qkv(x)
+ q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads = self.heads, qkv=3)
+ k = k.softmax(dim=-1)
+ context = torch.einsum('bhdn,bhen->bhde', k, v)
+ out = torch.einsum('bhde,bhdn->bhen', context, q)
+ out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=h, w=w)
+ return self.to_out(out)
+
+
+class SpatialSelfAttention(nn.Module):
+ def __init__(self, in_channels):
+ super().__init__()
+ self.in_channels = in_channels
+
+ self.norm = Normalize(in_channels)
+ self.q = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+ self.k = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+ self.v = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+ self.proj_out = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+
+ def forward(self, x):
+ h_ = x
+ h_ = self.norm(h_)
+ q = self.q(h_)
+ k = self.k(h_)
+ v = self.v(h_)
+
+ # compute attention
+ b,c,h,w = q.shape
+ q = rearrange(q, 'b c h w -> b (h w) c')
+ k = rearrange(k, 'b c h w -> b c (h w)')
+ w_ = torch.einsum('bij,bjk->bik', q, k)
+
+ w_ = w_ * (int(c)**(-0.5))
+ w_ = torch.nn.functional.softmax(w_, dim=2)
+
+ # attend to values
+ v = rearrange(v, 'b c h w -> b c (h w)')
+ w_ = rearrange(w_, 'b i j -> b j i')
+ h_ = torch.einsum('bij,bjk->bik', v, w_)
+ h_ = rearrange(h_, 'b c (h w) -> b c h w', h=h)
+ h_ = self.proj_out(h_)
+
+ return x+h_
+
+
+class CrossAttention(nn.Module):
+ def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.):
+ super().__init__()
+ inner_dim = dim_head * heads
+ context_dim = default(context_dim, query_dim)
+
+ self.scale = dim_head ** -0.5
+ self.heads = heads
+
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
+ self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
+ self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
+
+ self.to_out = nn.Sequential(
+ nn.Linear(inner_dim, query_dim),
+ nn.Dropout(dropout)
+ )
+
+ def forward(self, x, context=None, mask=None):
+ h = self.heads
+
+ q = self.to_q(x)
+ context = default(context, x)
+ k = self.to_k(context)
+ v = self.to_v(context)
+
+ q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
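+ # q, k, v now have shape (batch * heads, n, dim_head); the heads are folded into the batch dimension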
+
+ sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
+
+ if exists(mask):
+ mask = rearrange(mask, 'b ... -> b (...)')
+ max_neg_value = -torch.finfo(sim.dtype).max
+ mask = repeat(mask, 'b j -> (b h) () j', h=h)
+ sim.masked_fill_(~mask, max_neg_value)
+
+ # attention, what we cannot get enough of
+ attn = sim.softmax(dim=-1)
+
+ out = einsum('b i j, b j d -> b i d', attn, v)
+ out = rearrange(out, '(b h) n d -> b n (h d)', h=h)
+ return self.to_out(out)
+
+
+class BasicTransformerBlock(nn.Module):
+ def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True):
+ super().__init__()
+ self.attn1 = CrossAttention(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout) # is a self-attention
+ self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
+ self.attn2 = CrossAttention(query_dim=dim, context_dim=context_dim,
+ heads=n_heads, dim_head=d_head, dropout=dropout) # is self-attn if context is none
+ self.norm1 = nn.LayerNorm(dim)
+ self.norm2 = nn.LayerNorm(dim)
+ self.norm3 = nn.LayerNorm(dim)
+ self.checkpoint = checkpoint
+
+ def forward(self, x, context=None):
+ return checkpoint(self._forward, (x, context), self.parameters(), self.checkpoint)
+
+ def _forward(self, x, context=None):
+ x = self.attn1(self.norm1(x)) + x
+ x = self.attn2(self.norm2(x), context=context) + x
+ x = self.ff(self.norm3(x)) + x
+ return x
+
+
+class SpatialTransformer(nn.Module):
+ """
+ Transformer block for image-like data.
+ First, project the input (aka embedding)
+ and reshape to b, t, d.
+ Then apply standard transformer action.
+ Finally, reshape to image
+ """
+ def __init__(self, in_channels, n_heads, d_head,
+ depth=1, dropout=0., context_dim=None):
+ super().__init__()
+ self.in_channels = in_channels
+ inner_dim = n_heads * d_head
+ self.norm = Normalize(in_channels)
+
+ self.proj_in = nn.Conv2d(in_channels,
+ inner_dim,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+
+ self.transformer_blocks = nn.ModuleList(
+ [BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim)
+ for d in range(depth)]
+ )
+
+ self.proj_out = zero_module(nn.Conv2d(inner_dim,
+ in_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0))
+
+ def forward(self, x, context=None):
+ # note: if no context is given, cross-attention defaults to self-attention
+ b, c, h, w = x.shape
+ x_in = x
+ x = self.norm(x)
+ x = self.proj_in(x)
+ x = rearrange(x, 'b c h w -> b (h w) c')
+ for block in self.transformer_blocks:
+ x = block(x, context=context)
+ x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w)
+ x = self.proj_out(x)
+ return x + x_in
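+
+
+# Quick shape check (sketch; the sizes below are illustrative, not taken from any config in this repo):
+# st = SpatialTransformer(in_channels=64, n_heads=4, d_head=16, context_dim=768)
+# x, ctx = torch.randn(2, 64, 32, 32), torch.randn(2, 77, 768)
+# assert st(x, context=ctx).shape == x.shape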
diff --git a/ldmlib/modules/diffusionmodules/__init__.py b/ldmlib/modules/diffusionmodules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ldmlib/modules/diffusionmodules/__pycache__/__init__.cpython-38.pyc b/ldmlib/modules/diffusionmodules/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8382f8fbbe1fc1d1a37b34ddb7e3777e1730c03b
Binary files /dev/null and b/ldmlib/modules/diffusionmodules/__pycache__/__init__.cpython-38.pyc differ
diff --git a/ldmlib/modules/diffusionmodules/__pycache__/model.cpython-38.pyc b/ldmlib/modules/diffusionmodules/__pycache__/model.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..65ac1c6a3b6f584bb98a3d01355ab84508c7be03
Binary files /dev/null and b/ldmlib/modules/diffusionmodules/__pycache__/model.cpython-38.pyc differ
diff --git a/ldmlib/modules/diffusionmodules/__pycache__/util.cpython-38.pyc b/ldmlib/modules/diffusionmodules/__pycache__/util.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..52903a63d538e812ff48b40856be62d593728622
Binary files /dev/null and b/ldmlib/modules/diffusionmodules/__pycache__/util.cpython-38.pyc differ
diff --git a/ldmlib/modules/diffusionmodules/model.py b/ldmlib/modules/diffusionmodules/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..2585031674bc841a337c0c09abf33f21a8a62ec3
--- /dev/null
+++ b/ldmlib/modules/diffusionmodules/model.py
@@ -0,0 +1,830 @@
+# pytorch_diffusion + derived encoder decoder
+import math
+import torch
+import torch.nn as nn
+from torch.nn.functional import silu
+import numpy as np
+from einops import rearrange
+
+from ldmlib.util import instantiate_from_config
+from ldmlib.modules.attention import LinearAttention
+
+
+def get_timestep_embedding(timesteps, embedding_dim):
+ """
+ Build sinusoidal timestep embeddings (taken from Fairseq).
+ This matches the implementation in Denoising Diffusion Probabilistic Models and in tensor2tensor,
+ but differs slightly from the description in Section 3.5 of "Attention Is All You Need".
+ """
+ assert len(timesteps.shape) == 1
+
+ half_dim = embedding_dim // 2
+ emb = math.log(10000) / (half_dim - 1)
+ emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb)
+ emb = emb.to(device=timesteps.device)
+ emb = timesteps.float()[:, None] * emb[None, :]
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+ if embedding_dim % 2 == 1: # zero pad
+ emb = torch.nn.functional.pad(emb, (0,1,0,0))
+ return emb
+
+
+def Normalize(in_channels, num_groups=32):
+ return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)
+
+
+class Upsample(nn.Module):
+ def __init__(self, in_channels, with_conv):
+ super().__init__()
+ self.with_conv = with_conv
+ if self.with_conv:
+ self.conv = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ def forward(self, x):
+ x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
+ if self.with_conv:
+ x = self.conv(x)
+ return x
+
+
+class Downsample(nn.Module):
+ def __init__(self, in_channels, with_conv):
+ super().__init__()
+ self.with_conv = with_conv
+ if self.with_conv:
+ # no asymmetric padding in torch conv, must do it ourselves
+ self.conv = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=3,
+ stride=2,
+ padding=0)
+
+ def forward(self, x):
+ if self.with_conv:
+ pad = (0,1,0,1)
+ x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
+ x = self.conv(x)
+ else:
+ x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
+ return x
+
+
+class ResnetBlock(nn.Module):
+ def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
+ dropout, temb_channels=512):
+ super().__init__()
+ self.in_channels = in_channels
+ out_channels = in_channels if out_channels is None else out_channels
+ self.out_channels = out_channels
+ self.use_conv_shortcut = conv_shortcut
+
+ self.norm1 = Normalize(in_channels)
+ self.conv1 = torch.nn.Conv2d(in_channels,
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ if temb_channels > 0:
+ self.temb_proj = torch.nn.Linear(temb_channels,
+ out_channels)
+ self.norm2 = Normalize(out_channels)
+ self.dropout = torch.nn.Dropout(dropout)
+ self.conv2 = torch.nn.Conv2d(out_channels,
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ if self.in_channels != self.out_channels:
+ if self.use_conv_shortcut:
+ self.conv_shortcut = torch.nn.Conv2d(in_channels,
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ else:
+ self.nin_shortcut = torch.nn.Conv2d(in_channels,
+ out_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+
+ def forward(self, x, temb):
+ h = x
+ h = self.norm1(h)
+ h = silu(h)
+ h = self.conv1(h)
+
+ if temb is not None:
+ h = h + self.temb_proj(silu(temb))[:,:,None,None]
+
+ h = self.norm2(h)
+ h = silu(h)
+ h = self.dropout(h)
+ h = self.conv2(h)
+
+ if self.in_channels != self.out_channels:
+ if self.use_conv_shortcut:
+ x = self.conv_shortcut(x)
+ else:
+ x = self.nin_shortcut(x)
+
+ return x+h
+
+
+class LinAttnBlock(LinearAttention):
+ """to match AttnBlock usage"""
+ def __init__(self, in_channels):
+ super().__init__(dim=in_channels, heads=1, dim_head=in_channels)
+
+
+class AttnBlock(nn.Module):
+ def __init__(self, in_channels):
+ super().__init__()
+ self.in_channels = in_channels
+
+ self.norm = Normalize(in_channels)
+ self.q = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+ self.k = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+ self.v = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+ self.proj_out = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+
+
+ def forward(self, x):
+ h_ = x
+ h_ = self.norm(h_)
+ q = self.q(h_)
+ k = self.k(h_)
+ v = self.v(h_)
+
+ # compute attention
+ b,c,h,w = q.shape
+ q = q.reshape(b,c,h*w)
+ q = q.permute(0,2,1) # b,hw,c
+ k = k.reshape(b,c,h*w) # b,c,hw
+ w_ = torch.bmm(q,k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
+ w_ = w_ * (int(c)**(-0.5))
+ w_ = torch.nn.functional.softmax(w_, dim=2)
+
+ # attend to values
+ v = v.reshape(b,c,h*w)
+ w_ = w_.permute(0,2,1) # b,hw,hw (first hw of k, second of q)
+ h_ = torch.bmm(v,w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
+ h_ = h_.reshape(b,c,h,w)
+
+ h_ = self.proj_out(h_)
+
+ return x+h_
+
+
+def make_attn(in_channels, attn_type="vanilla"):
+ assert attn_type in ["vanilla", "linear", "none"], f'attn_type {attn_type} unknown'
+ print(f"making attention of type '{attn_type}' with {in_channels} in_channels")
+ if attn_type == "vanilla":
+ return AttnBlock(in_channels)
+ elif attn_type == "none":
+ return nn.Identity(in_channels)
+ else:
+ return LinAttnBlock(in_channels)
+
+
+class Model(nn.Module):
+ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
+ attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
+ resolution, use_timestep=True, use_linear_attn=False, attn_type="vanilla"):
+ super().__init__()
+ if use_linear_attn: attn_type = "linear"
+ self.ch = ch
+ self.temb_ch = self.ch*4
+ self.num_resolutions = len(ch_mult)
+ self.num_res_blocks = num_res_blocks
+ self.resolution = resolution
+ self.in_channels = in_channels
+
+ self.use_timestep = use_timestep
+ if self.use_timestep:
+ # timestep embedding
+ self.temb = nn.Module()
+ self.temb.dense = nn.ModuleList([
+ torch.nn.Linear(self.ch,
+ self.temb_ch),
+ torch.nn.Linear(self.temb_ch,
+ self.temb_ch),
+ ])
+
+ # downsampling
+ self.conv_in = torch.nn.Conv2d(in_channels,
+ self.ch,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ curr_res = resolution
+ in_ch_mult = (1,)+tuple(ch_mult)
+ self.down = nn.ModuleList()
+ for i_level in range(self.num_resolutions):
+ block = nn.ModuleList()
+ attn = nn.ModuleList()
+ block_in = ch*in_ch_mult[i_level]
+ block_out = ch*ch_mult[i_level]
+ for i_block in range(self.num_res_blocks):
+ block.append(ResnetBlock(in_channels=block_in,
+ out_channels=block_out,
+ temb_channels=self.temb_ch,
+ dropout=dropout))
+ block_in = block_out
+ if curr_res in attn_resolutions:
+ attn.append(make_attn(block_in, attn_type=attn_type))
+ down = nn.Module()
+ down.block = block
+ down.attn = attn
+ if i_level != self.num_resolutions-1:
+ down.downsample = Downsample(block_in, resamp_with_conv)
+ curr_res = curr_res // 2
+ self.down.append(down)
+
+ # middle
+ self.mid = nn.Module()
+ self.mid.block_1 = ResnetBlock(in_channels=block_in,
+ out_channels=block_in,
+ temb_channels=self.temb_ch,
+ dropout=dropout)
+ self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
+ self.mid.block_2 = ResnetBlock(in_channels=block_in,
+ out_channels=block_in,
+ temb_channels=self.temb_ch,
+ dropout=dropout)
+
+ # upsampling
+ self.up = nn.ModuleList()
+ for i_level in reversed(range(self.num_resolutions)):
+ block = nn.ModuleList()
+ attn = nn.ModuleList()
+ block_out = ch*ch_mult[i_level]
+ skip_in = ch*ch_mult[i_level]
+ for i_block in range(self.num_res_blocks+1):
+ if i_block == self.num_res_blocks:
+ skip_in = ch*in_ch_mult[i_level]
+ block.append(ResnetBlock(in_channels=block_in+skip_in,
+ out_channels=block_out,
+ temb_channels=self.temb_ch,
+ dropout=dropout))
+ block_in = block_out
+ if curr_res in attn_resolutions:
+ attn.append(make_attn(block_in, attn_type=attn_type))
+ up = nn.Module()
+ up.block = block
+ up.attn = attn
+ if i_level != 0:
+ up.upsample = Upsample(block_in, resamp_with_conv)
+ curr_res = curr_res * 2
+ self.up.insert(0, up) # prepend to get consistent order
+
+ # end
+ self.norm_out = Normalize(block_in)
+ self.conv_out = torch.nn.Conv2d(block_in,
+ out_ch,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ def forward(self, x, t=None, context=None):
+ #assert x.shape[2] == x.shape[3] == self.resolution
+ if context is not None:
+ # assume aligned context, cat along channel axis
+ x = torch.cat((x, context), dim=1)
+ if self.use_timestep:
+ # timestep embedding
+ assert t is not None
+ temb = get_timestep_embedding(t, self.ch)
+ temb = self.temb.dense[0](temb)
+ temb = silu(temb)
+ temb = self.temb.dense[1](temb)
+ else:
+ temb = None
+
+ # downsampling
+ hs = [self.conv_in(x)]
+ for i_level in range(self.num_resolutions):
+ for i_block in range(self.num_res_blocks):
+ h = self.down[i_level].block[i_block](hs[-1], temb)
+ if len(self.down[i_level].attn) > 0:
+ h = self.down[i_level].attn[i_block](h)
+ hs.append(h)
+ if i_level != self.num_resolutions-1:
+ hs.append(self.down[i_level].downsample(hs[-1]))
+
+ # middle
+ h = hs[-1]
+ h = self.mid.block_1(h, temb)
+ h = self.mid.attn_1(h)
+ h = self.mid.block_2(h, temb)
+
+ # upsampling
+ for i_level in reversed(range(self.num_resolutions)):
+ for i_block in range(self.num_res_blocks+1):
+ h = self.up[i_level].block[i_block](
+ torch.cat([h, hs.pop()], dim=1), temb)
+ if len(self.up[i_level].attn) > 0:
+ h = self.up[i_level].attn[i_block](h)
+ if i_level != 0:
+ h = self.up[i_level].upsample(h)
+
+ # end
+ h = self.norm_out(h)
+ h = silu(h)
+ h = self.conv_out(h)
+ return h
+
+ def get_last_layer(self):
+ return self.conv_out.weight
+
+
+class Encoder(nn.Module):
+ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
+ attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
+ resolution, z_channels, double_z=True, use_linear_attn=False, attn_type="vanilla",
+ **ignore_kwargs):
+ super().__init__()
+ if use_linear_attn: attn_type = "linear"
+ self.ch = ch
+ self.temb_ch = 0
+ self.num_resolutions = len(ch_mult)
+ self.num_res_blocks = num_res_blocks
+ self.resolution = resolution
+ self.in_channels = in_channels
+
+ # downsampling
+ self.conv_in = torch.nn.Conv2d(in_channels,
+ self.ch,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ curr_res = resolution
+ in_ch_mult = (1,)+tuple(ch_mult)
+ self.in_ch_mult = in_ch_mult
+ self.down = nn.ModuleList()
+ for i_level in range(self.num_resolutions):
+ block = nn.ModuleList()
+ attn = nn.ModuleList()
+ block_in = ch*in_ch_mult[i_level]
+ block_out = ch*ch_mult[i_level]
+ for i_block in range(self.num_res_blocks):
+ block.append(ResnetBlock(in_channels=block_in,
+ out_channels=block_out,
+ temb_channels=self.temb_ch,
+ dropout=dropout))
+ block_in = block_out
+ if curr_res in attn_resolutions:
+ attn.append(make_attn(block_in, attn_type=attn_type))
+ down = nn.Module()
+ down.block = block
+ down.attn = attn
+ if i_level != self.num_resolutions-1:
+ down.downsample = Downsample(block_in, resamp_with_conv)
+ curr_res = curr_res // 2
+ self.down.append(down)
+
+ # middle
+ self.mid = nn.Module()
+ self.mid.block_1 = ResnetBlock(in_channels=block_in,
+ out_channels=block_in,
+ temb_channels=self.temb_ch,
+ dropout=dropout)
+ self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
+ self.mid.block_2 = ResnetBlock(in_channels=block_in,
+ out_channels=block_in,
+ temb_channels=self.temb_ch,
+ dropout=dropout)
+
+ # end
+ self.norm_out = Normalize(block_in)
+ self.conv_out = torch.nn.Conv2d(block_in,
+ 2*z_channels if double_z else z_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ def forward(self, x):
+ # timestep embedding
+ temb = None
+
+ # downsampling
+ hs = [self.conv_in(x)]
+ for i_level in range(self.num_resolutions):
+ for i_block in range(self.num_res_blocks):
+ h = self.down[i_level].block[i_block](hs[-1], temb)
+ if len(self.down[i_level].attn) > 0:
+ h = self.down[i_level].attn[i_block](h)
+ hs.append(h)
+ if i_level != self.num_resolutions-1:
+ hs.append(self.down[i_level].downsample(hs[-1]))
+
+ # middle
+ h = hs[-1]
+ h = self.mid.block_1(h, temb)
+ h = self.mid.attn_1(h)
+ h = self.mid.block_2(h, temb)
+
+ # end
+ h = self.norm_out(h)
+ h = silu(h)
+ h = self.conv_out(h)
+ return h
+
+
+class Decoder(nn.Module):
+ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
+ attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
+ resolution, z_channels, give_pre_end=False, tanh_out=False, use_linear_attn=False,
+ attn_type="vanilla", **ignorekwargs):
+ super().__init__()
+ if use_linear_attn: attn_type = "linear"
+ self.ch = ch
+ self.temb_ch = 0
+ self.num_resolutions = len(ch_mult)
+ self.num_res_blocks = num_res_blocks
+ self.resolution = resolution
+ self.in_channels = in_channels
+ self.give_pre_end = give_pre_end
+ self.tanh_out = tanh_out
+
+ # compute in_ch_mult, block_in and curr_res at lowest res
+ in_ch_mult = (1,)+tuple(ch_mult)
+ block_in = ch*ch_mult[self.num_resolutions-1]
+ curr_res = resolution // 2**(self.num_resolutions-1)
+ self.z_shape = (1,z_channels,curr_res,curr_res)
+ print("Working with z of shape {} = {} dimensions.".format(
+ self.z_shape, np.prod(self.z_shape)))
+
+ # z to block_in
+ self.conv_in = torch.nn.Conv2d(z_channels,
+ block_in,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ # middle
+ self.mid = nn.Module()
+ self.mid.block_1 = ResnetBlock(in_channels=block_in,
+ out_channels=block_in,
+ temb_channels=self.temb_ch,
+ dropout=dropout)
+ self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
+ self.mid.block_2 = ResnetBlock(in_channels=block_in,
+ out_channels=block_in,
+ temb_channels=self.temb_ch,
+ dropout=dropout)
+
+ # upsampling
+ self.up = nn.ModuleList()
+ for i_level in reversed(range(self.num_resolutions)):
+ block = nn.ModuleList()
+ attn = nn.ModuleList()
+ block_out = ch*ch_mult[i_level]
+ for i_block in range(self.num_res_blocks+1):
+ block.append(ResnetBlock(in_channels=block_in,
+ out_channels=block_out,
+ temb_channels=self.temb_ch,
+ dropout=dropout))
+ block_in = block_out
+ if curr_res in attn_resolutions:
+ attn.append(make_attn(block_in, attn_type=attn_type))
+ up = nn.Module()
+ up.block = block
+ up.attn = attn
+ if i_level != 0:
+ up.upsample = Upsample(block_in, resamp_with_conv)
+ curr_res = curr_res * 2
+ self.up.insert(0, up) # prepend to get consistent order
+
+ # end
+ self.norm_out = Normalize(block_in)
+ self.conv_out = torch.nn.Conv2d(block_in,
+ out_ch,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ def forward(self, z):
+ #assert z.shape[1:] == self.z_shape[1:]
+ self.last_z_shape = z.shape
+
+ # timestep embedding
+ temb = None
+
+ # z to block_in
+ h = self.conv_in(z)
+
+ # middle
+ h = self.mid.block_1(h, temb)
+ h = self.mid.attn_1(h)
+ h = self.mid.block_2(h, temb)
+
+ # upsampling
+ for i_level in reversed(range(self.num_resolutions)):
+ for i_block in range(self.num_res_blocks+1):
+ h = self.up[i_level].block[i_block](h, temb)
+ if len(self.up[i_level].attn) > 0:
+ h = self.up[i_level].attn[i_block](h)
+ if i_level != 0:
+ h = self.up[i_level].upsample(h)
+
+ # end
+ if self.give_pre_end:
+ return h
+
+ h = self.norm_out(h)
+ h = silu(h)
+ h = self.conv_out(h)
+ if self.tanh_out:
+ h = torch.tanh(h)
+ return h
+
+
+class SimpleDecoder(nn.Module):
+ def __init__(self, in_channels, out_channels, *args, **kwargs):
+ super().__init__()
+ self.model = nn.ModuleList([nn.Conv2d(in_channels, in_channels, 1),
+ ResnetBlock(in_channels=in_channels,
+ out_channels=2 * in_channels,
+ temb_channels=0, dropout=0.0),
+ ResnetBlock(in_channels=2 * in_channels,
+ out_channels=4 * in_channels,
+ temb_channels=0, dropout=0.0),
+ ResnetBlock(in_channels=4 * in_channels,
+ out_channels=2 * in_channels,
+ temb_channels=0, dropout=0.0),
+ nn.Conv2d(2*in_channels, in_channels, 1),
+ Upsample(in_channels, with_conv=True)])
+ # end
+ self.norm_out = Normalize(in_channels)
+ self.conv_out = torch.nn.Conv2d(in_channels,
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ def forward(self, x):
+ for i, layer in enumerate(self.model):
+ if i in [1,2,3]:
+ x = layer(x, None)
+ else:
+ x = layer(x)
+
+ h = self.norm_out(x)
+ h = silu(h)
+ x = self.conv_out(h)
+ return x
+
+
+class UpsampleDecoder(nn.Module):
+ def __init__(self, in_channels, out_channels, ch, num_res_blocks, resolution,
+ ch_mult=(2,2), dropout=0.0):
+ super().__init__()
+ # upsampling
+ self.temb_ch = 0
+ self.num_resolutions = len(ch_mult)
+ self.num_res_blocks = num_res_blocks
+ block_in = in_channels
+ curr_res = resolution // 2 ** (self.num_resolutions - 1)
+ self.res_blocks = nn.ModuleList()
+ self.upsample_blocks = nn.ModuleList()
+ for i_level in range(self.num_resolutions):
+ res_block = []
+ block_out = ch * ch_mult[i_level]
+ for i_block in range(self.num_res_blocks + 1):
+ res_block.append(ResnetBlock(in_channels=block_in,
+ out_channels=block_out,
+ temb_channels=self.temb_ch,
+ dropout=dropout))
+ block_in = block_out
+ self.res_blocks.append(nn.ModuleList(res_block))
+ if i_level != self.num_resolutions - 1:
+ self.upsample_blocks.append(Upsample(block_in, True))
+ curr_res = curr_res * 2
+
+ # end
+ self.norm_out = Normalize(block_in)
+ self.conv_out = torch.nn.Conv2d(block_in,
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ def forward(self, x):
+ # upsampling
+ h = x
+ for k, i_level in enumerate(range(self.num_resolutions)):
+ for i_block in range(self.num_res_blocks + 1):
+ h = self.res_blocks[i_level][i_block](h, None)
+ if i_level != self.num_resolutions - 1:
+ h = self.upsample_blocks[k](h)
+ h = self.norm_out(h)
+ h = silu(h)
+ h = self.conv_out(h)
+ return h
+
+
+class LatentRescaler(nn.Module):
+ def __init__(self, factor, in_channels, mid_channels, out_channels, depth=2):
+ super().__init__()
+ # residual block, interpolate, residual block
+ self.factor = factor
+ self.conv_in = nn.Conv2d(in_channels,
+ mid_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ self.res_block1 = nn.ModuleList([ResnetBlock(in_channels=mid_channels,
+ out_channels=mid_channels,
+ temb_channels=0,
+ dropout=0.0) for _ in range(depth)])
+ self.attn = AttnBlock(mid_channels)
+ self.res_block2 = nn.ModuleList([ResnetBlock(in_channels=mid_channels,
+ out_channels=mid_channels,
+ temb_channels=0,
+ dropout=0.0) for _ in range(depth)])
+
+ self.conv_out = nn.Conv2d(mid_channels,
+ out_channels,
+ kernel_size=1,
+ )
+
+ def forward(self, x):
+ x = self.conv_in(x)
+ for block in self.res_block1:
+ x = block(x, None)
+ x = torch.nn.functional.interpolate(x, size=(int(round(x.shape[2]*self.factor)), int(round(x.shape[3]*self.factor))))
+ x = self.attn(x)
+ for block in self.res_block2:
+ x = block(x, None)
+ x = self.conv_out(x)
+ return x
+
+
+class MergedRescaleEncoder(nn.Module):
+ def __init__(self, in_channels, ch, resolution, out_ch, num_res_blocks,
+ attn_resolutions, dropout=0.0, resamp_with_conv=True,
+ ch_mult=(1,2,4,8), rescale_factor=1.0, rescale_module_depth=1):
+ super().__init__()
+ intermediate_chn = ch * ch_mult[-1]
+ self.encoder = Encoder(in_channels=in_channels, num_res_blocks=num_res_blocks, ch=ch, ch_mult=ch_mult,
+ z_channels=intermediate_chn, double_z=False, resolution=resolution,
+ attn_resolutions=attn_resolutions, dropout=dropout, resamp_with_conv=resamp_with_conv,
+ out_ch=None)
+ self.rescaler = LatentRescaler(factor=rescale_factor, in_channels=intermediate_chn,
+ mid_channels=intermediate_chn, out_channels=out_ch, depth=rescale_module_depth)
+
+ def forward(self, x):
+ x = self.encoder(x)
+ x = self.rescaler(x)
+ return x
+
+
+class MergedRescaleDecoder(nn.Module):
+ def __init__(self, z_channels, out_ch, resolution, num_res_blocks, attn_resolutions, ch, ch_mult=(1,2,4,8),
+ dropout=0.0, resamp_with_conv=True, rescale_factor=1.0, rescale_module_depth=1):
+ super().__init__()
+ tmp_chn = z_channels*ch_mult[-1]
+ self.decoder = Decoder(out_ch=out_ch, z_channels=tmp_chn, attn_resolutions=attn_resolutions, dropout=dropout,
+ resamp_with_conv=resamp_with_conv, in_channels=None, num_res_blocks=num_res_blocks,
+ ch_mult=ch_mult, resolution=resolution, ch=ch)
+ self.rescaler = LatentRescaler(factor=rescale_factor, in_channels=z_channels, mid_channels=tmp_chn,
+ out_channels=tmp_chn, depth=rescale_module_depth)
+
+ def forward(self, x):
+ x = self.rescaler(x)
+ x = self.decoder(x)
+ return x
+
+
+class Upsampler(nn.Module):
+ def __init__(self, in_size, out_size, in_channels, out_channels, ch_mult=2):
+ super().__init__()
+ assert out_size >= in_size
+ num_blocks = int(np.log2(out_size//in_size))+1
+ factor_up = 1.+ (out_size % in_size)
+ print(f"Building {self.__class__.__name__} with in_size: {in_size} --> out_size {out_size} and factor {factor_up}")
+ self.rescaler = LatentRescaler(factor=factor_up, in_channels=in_channels, mid_channels=2*in_channels,
+ out_channels=in_channels)
+ self.decoder = Decoder(out_ch=out_channels, resolution=out_size, z_channels=in_channels, num_res_blocks=2,
+ attn_resolutions=[], in_channels=None, ch=in_channels,
+ ch_mult=[ch_mult for _ in range(num_blocks)])
+
+ def forward(self, x):
+ x = self.rescaler(x)
+ x = self.decoder(x)
+ return x
+
+
+class Resize(nn.Module):
+ def __init__(self, in_channels=None, learned=False, mode="bilinear"):
+ super().__init__()
+ self.with_conv = learned
+ self.mode = mode
+ if self.with_conv:
+ print(f"Note: {self.__class__.__name} uses learned downsampling and will ignore the fixed {mode} mode")
+ raise NotImplementedError()
+ assert in_channels is not None
+ # no asymmetric padding in torch conv, must do it ourselves
+ self.conv = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=4,
+ stride=2,
+ padding=1)
+
+ def forward(self, x, scale_factor=1.0):
+ if scale_factor==1.0:
+ return x
+ else:
+ x = torch.nn.functional.interpolate(x, mode=self.mode, align_corners=False, scale_factor=scale_factor)
+ return x
+
+class FirstStagePostProcessor(nn.Module):
+
+ def __init__(self, ch_mult:list, in_channels,
+ pretrained_model:nn.Module=None,
+ reshape=False,
+ n_channels=None,
+ dropout=0.,
+ pretrained_config=None):
+ super().__init__()
+ if pretrained_config is None:
+ assert pretrained_model is not None, 'Either "pretrained_model" or "pretrained_config" must not be None'
+ self.pretrained_model = pretrained_model
+ else:
+ assert pretrained_config is not None, 'Either "pretrained_model" or "pretrained_config" must not be None'
+ self.instantiate_pretrained(pretrained_config)
+
+ self.do_reshape = reshape
+
+ if n_channels is None:
+ n_channels = self.pretrained_model.encoder.ch
+
+ self.proj_norm = Normalize(in_channels,num_groups=in_channels//2)
+ self.proj = nn.Conv2d(in_channels,n_channels,kernel_size=3,
+ stride=1,padding=1)
+
+ blocks = []
+ downs = []
+ ch_in = n_channels
+ for m in ch_mult:
+ blocks.append(ResnetBlock(in_channels=ch_in,out_channels=m*n_channels,dropout=dropout))
+ ch_in = m * n_channels
+ downs.append(Downsample(ch_in, with_conv=False))
+
+ self.model = nn.ModuleList(blocks)
+ self.downsampler = nn.ModuleList(downs)
+
+
+ def instantiate_pretrained(self, config):
+ model = instantiate_from_config(config)
+ self.pretrained_model = model.eval()
+ # self.pretrained_model.train = False
+ for param in self.pretrained_model.parameters():
+ param.requires_grad = False
+
+
+ @torch.no_grad()
+ def encode_with_pretrained(self,x):
+ c = self.pretrained_model.encode(x)
+ if isinstance(c, DiagonalGaussianDistribution):
+ c = c.mode()
+ return c
+
+ def forward(self,x):
+ z_fs = self.encode_with_pretrained(x)
+ z = self.proj_norm(z_fs)
+ z = self.proj(z)
+ z = silu(z)
+
+ for submodel, downmodel in zip(self.model,self.downsampler):
+ z = submodel(z,temb=None)
+ z = downmodel(z)
+
+ if self.do_reshape:
+ z = rearrange(z,'b c h w -> b (h w) c')
+ return z
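To make the wiring concrete, here is a hedged sketch of the first-stage roundtrip using the Encoder/Decoder classes above. The config values mirror the usual f=8, z=4 autoencoder and are assumptions for illustration only; the import path is inferred from the sibling modules in this diff.

```python
import torch
from ldmlib.modules.diffusionmodules.model import Encoder, Decoder  # path assumed from this diff

enc = Encoder(ch=128, out_ch=3, ch_mult=(1, 2, 4, 4), num_res_blocks=2,
              attn_resolutions=[], dropout=0.0, in_channels=3,
              resolution=256, z_channels=4, double_z=True)
dec = Decoder(ch=128, out_ch=3, ch_mult=(1, 2, 4, 4), num_res_blocks=2,
              attn_resolutions=[], dropout=0.0, in_channels=3,
              resolution=256, z_channels=4)

x = torch.randn(1, 3, 256, 256)
moments = enc(x)        # (1, 8, 32, 32): mean and logvar stacked because double_z=True
z = moments[:, :4]      # taking the mean half stands in for proper posterior sampling
x_rec = dec(z)          # (1, 3, 256, 256)
# Spatial size shrinks by 2 ** (len(ch_mult) - 1) == 8, matching curr_res in Decoder.
```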
diff --git a/ldmlib/modules/diffusionmodules/openaimodel.py b/ldmlib/modules/diffusionmodules/openaimodel.py
new file mode 100644
index 0000000000000000000000000000000000000000..84b01c5e473605cc592dfdbeb03279a2103effae
--- /dev/null
+++ b/ldmlib/modules/diffusionmodules/openaimodel.py
@@ -0,0 +1,960 @@
+from abc import abstractmethod
+from functools import partial
+import math
+from typing import Iterable
+
+import numpy as np
+import torch as th
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ldmlib.modules.diffusionmodules.util import (
+ checkpoint,
+ conv_nd,
+ linear,
+ avg_pool_nd,
+ zero_module,
+ normalization,
+ timestep_embedding,
+)
+from ldmlib.modules.attention import SpatialTransformer
+
+
+# dummy replace
+def convert_module_to_f16(x):
+ pass
+
+def convert_module_to_f32(x):
+ pass
+
+
+## go
+class AttentionPool2d(nn.Module):
+ """
+ Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py
+ """
+
+ def __init__(
+ self,
+ spacial_dim: int,
+ embed_dim: int,
+ num_heads_channels: int,
+ output_dim: int = None,
+ ):
+ super().__init__()
+ self.positional_embedding = nn.Parameter(th.randn(embed_dim, spacial_dim ** 2 + 1) / embed_dim ** 0.5)
+ self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
+ self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
+ self.num_heads = embed_dim // num_heads_channels
+ self.attention = QKVAttention(self.num_heads)
+
+ def forward(self, x):
+ b, c, *_spatial = x.shape
+ x = x.reshape(b, c, -1) # NC(HW)
+ x = th.cat([x.mean(dim=-1, keepdim=True), x], dim=-1) # NC(HW+1)
+ x = x + self.positional_embedding[None, :, :].to(x.dtype) # NC(HW+1)
+ x = self.qkv_proj(x)
+ x = self.attention(x)
+ x = self.c_proj(x)
+ return x[:, :, 0]
+
+
+class TimestepBlock(nn.Module):
+ """
+ Any module where forward() takes timestep embeddings as a second argument.
+ """
+
+ @abstractmethod
+ def forward(self, x, emb):
+ """
+ Apply the module to `x` given `emb` timestep embeddings.
+ """
+
+
+class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
+ """
+ A sequential module that passes timestep embeddings to the children that
+ support it as an extra input.
+ """
+
+ def forward(self, x, emb, context=None):
+ for layer in self:
+ if isinstance(layer, TimestepBlock):
+ x = layer(x, emb)
+ elif isinstance(layer, SpatialTransformer):
+ x = layer(x, context)
+ else:
+ x = layer(x)
+ return x
+
+
+class Upsample(nn.Module):
+ """
+ An upsampling layer with an optional convolution.
+ :param channels: channels in the inputs and outputs.
+ :param use_conv: a bool determining if a convolution is applied.
+ :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
+ upsampling occurs in the inner-two dimensions.
+ """
+
+ def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
+ super().__init__()
+ self.channels = channels
+ self.out_channels = out_channels or channels
+ self.use_conv = use_conv
+ self.dims = dims
+ if use_conv:
+ self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=padding)
+
+ def forward(self, x):
+ assert x.shape[1] == self.channels
+ if self.dims == 3:
+ x = F.interpolate(
+ x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest"
+ )
+ else:
+ x = F.interpolate(x, scale_factor=2, mode="nearest")
+ if self.use_conv:
+ x = self.conv(x)
+ return x
+
+class TransposedUpsample(nn.Module):
+ 'Learned 2x upsampling without padding'
+ def __init__(self, channels, out_channels=None, ks=5):
+ super().__init__()
+ self.channels = channels
+ self.out_channels = out_channels or channels
+
+ self.up = nn.ConvTranspose2d(self.channels,self.out_channels,kernel_size=ks,stride=2)
+
+ def forward(self,x):
+ return self.up(x)
+
+
+class Downsample(nn.Module):
+ """
+ A downsampling layer with an optional convolution.
+ :param channels: channels in the inputs and outputs.
+ :param use_conv: a bool determining if a convolution is applied.
+ :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
+ downsampling occurs in the inner-two dimensions.
+ """
+
+ def __init__(self, channels, use_conv, dims=2, out_channels=None,padding=1):
+ super().__init__()
+ self.channels = channels
+ self.out_channels = out_channels or channels
+ self.use_conv = use_conv
+ self.dims = dims
+ stride = 2 if dims != 3 else (1, 2, 2)
+ if use_conv:
+ self.op = conv_nd(
+ dims, self.channels, self.out_channels, 3, stride=stride, padding=padding
+ )
+ else:
+ assert self.channels == self.out_channels
+ self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
+
+ def forward(self, x):
+ assert x.shape[1] == self.channels
+ return self.op(x)
+
+
+class ResBlock(TimestepBlock):
+ """
+ A residual block that can optionally change the number of channels.
+ :param channels: the number of input channels.
+ :param emb_channels: the number of timestep embedding channels.
+ :param dropout: the rate of dropout.
+ :param out_channels: if specified, the number of out channels.
+ :param use_conv: if True and out_channels is specified, use a spatial
+ convolution instead of a smaller 1x1 convolution to change the
+ channels in the skip connection.
+ :param dims: determines if the signal is 1D, 2D, or 3D.
+ :param use_checkpoint: if True, use gradient checkpointing on this module.
+ :param up: if True, use this block for upsampling.
+ :param down: if True, use this block for downsampling.
+ """
+
+ def __init__(
+ self,
+ channels,
+ emb_channels,
+ dropout,
+ out_channels=None,
+ use_conv=False,
+ use_scale_shift_norm=False,
+ dims=2,
+ use_checkpoint=False,
+ up=False,
+ down=False,
+ ):
+ super().__init__()
+ self.channels = channels
+ self.emb_channels = emb_channels
+ self.dropout = dropout
+ self.out_channels = out_channels or channels
+ self.use_conv = use_conv
+ self.use_checkpoint = use_checkpoint
+ self.use_scale_shift_norm = use_scale_shift_norm
+
+ self.in_layers = nn.Sequential(
+ normalization(channels),
+ nn.SiLU(),
+ conv_nd(dims, channels, self.out_channels, 3, padding=1),
+ )
+
+ self.updown = up or down
+
+ if up:
+ self.h_upd = Upsample(channels, False, dims)
+ self.x_upd = Upsample(channels, False, dims)
+ elif down:
+ self.h_upd = Downsample(channels, False, dims)
+ self.x_upd = Downsample(channels, False, dims)
+ else:
+ self.h_upd = self.x_upd = nn.Identity()
+
+ self.emb_layers = nn.Sequential(
+ nn.SiLU(),
+ linear(
+ emb_channels,
+ 2 * self.out_channels if use_scale_shift_norm else self.out_channels,
+ ),
+ )
+ self.out_layers = nn.Sequential(
+ normalization(self.out_channels),
+ nn.SiLU(),
+ nn.Dropout(p=dropout),
+ zero_module(
+ conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)
+ ),
+ )
+
+ if self.out_channels == channels:
+ self.skip_connection = nn.Identity()
+ elif use_conv:
+ self.skip_connection = conv_nd(
+ dims, channels, self.out_channels, 3, padding=1
+ )
+ else:
+ self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
+
+ def forward(self, x, emb):
+ """
+ Apply the block to a Tensor, conditioned on a timestep embedding.
+ :param x: an [N x C x ...] Tensor of features.
+ :param emb: an [N x emb_channels] Tensor of timestep embeddings.
+ :return: an [N x C x ...] Tensor of outputs.
+ """
+ return checkpoint(
+ self._forward, (x, emb), self.parameters(), self.use_checkpoint
+ )
+
+
+ def _forward(self, x, emb):
+ if self.updown:
+ in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
+ h = in_rest(x)
+ h = self.h_upd(h)
+ x = self.x_upd(x)
+ h = in_conv(h)
+ else:
+ h = self.in_layers(x)
+ emb_out = self.emb_layers(emb).type(h.dtype)
+ while len(emb_out.shape) < len(h.shape):
+ emb_out = emb_out[..., None]
+ if self.use_scale_shift_norm:
+ out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
+ scale, shift = th.chunk(emb_out, 2, dim=1)
+ h = out_norm(h) * (1 + scale) + shift
+ h = out_rest(h)
+ else:
+ h = h + emb_out
+ h = self.out_layers(h)
+ return self.skip_connection(x) + h
+
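A short, illustrative usage sketch for ResBlock; shapes and channel counts are arbitrary, and the import path is the file added in this diff.

```python
import torch as th
from ldmlib.modules.diffusionmodules.openaimodel import ResBlock

block = ResBlock(channels=64, emb_channels=256, dropout=0.0, out_channels=128)
x = th.randn(2, 64, 32, 32)
emb = th.randn(2, 256)     # timestep (or class) embedding vector
y = block(x, emb)          # -> (2, 128, 32, 32); the skip path is matched by a 1x1 conv
```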
+
+class AttentionBlock(nn.Module):
+ """
+ An attention block that allows spatial positions to attend to each other.
+ Originally ported from here, but adapted to the N-d case.
+ https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
+ """
+
+ def __init__(
+ self,
+ channels,
+ num_heads=1,
+ num_head_channels=-1,
+ use_checkpoint=False,
+ use_new_attention_order=False,
+ ):
+ super().__init__()
+ self.channels = channels
+ if num_head_channels == -1:
+ self.num_heads = num_heads
+ else:
+ assert (
+ channels % num_head_channels == 0
+ ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
+ self.num_heads = channels // num_head_channels
+ self.use_checkpoint = use_checkpoint
+ self.norm = normalization(channels)
+ self.qkv = conv_nd(1, channels, channels * 3, 1)
+ if use_new_attention_order:
+ # split qkv before split heads
+ self.attention = QKVAttention(self.num_heads)
+ else:
+ # split heads before split qkv
+ self.attention = QKVAttentionLegacy(self.num_heads)
+
+ self.proj_out = zero_module(conv_nd(1, channels, channels, 1))
+
+ def forward(self, x):
+ return checkpoint(self._forward, (x,), self.parameters(), True) # TODO: checkpointing is hardcoded to True here; revisit this and fix the .half call
+ #return pt_checkpoint(self._forward, x) # pytorch
+
+ def _forward(self, x):
+ b, c, *spatial = x.shape
+ x = x.reshape(b, c, -1)
+ qkv = self.qkv(self.norm(x))
+ h = self.attention(qkv)
+ h = self.proj_out(h)
+ return (x + h).reshape(b, c, *spatial)
+
+
+def count_flops_attn(model, _x, y):
+ """
+ A counter for the `thop` package to count the operations in an
+ attention operation.
+ Meant to be used like:
+ macs, params = thop.profile(
+ model,
+ inputs=(inputs, timesteps),
+ custom_ops={QKVAttention: QKVAttention.count_flops},
+ )
+ """
+ b, c, *spatial = y[0].shape
+ num_spatial = int(np.prod(spatial))
+ # We perform two matmuls with the same number of ops.
+ # The first computes the weight matrix, the second computes
+ # the combination of the value vectors.
+ matmul_ops = 2 * b * (num_spatial ** 2) * c
+ model.total_ops += th.DoubleTensor([matmul_ops])
+
+
+class QKVAttentionLegacy(nn.Module):
+ """
+ A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping
+ """
+
+ def __init__(self, n_heads):
+ super().__init__()
+ self.n_heads = n_heads
+
+ def forward(self, qkv):
+ """
+ Apply QKV attention.
+ :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
+ :return: an [N x (H * C) x T] tensor after attention.
+ """
+ bs, width, length = qkv.shape
+ assert width % (3 * self.n_heads) == 0
+ ch = width // (3 * self.n_heads)
+ q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
+ scale = 1 / math.sqrt(math.sqrt(ch))
+ weight = th.einsum(
+ "bct,bcs->bts", q * scale, k * scale
+ ) # More stable with f16 than dividing afterwards
+ weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
+ a = th.einsum("bts,bcs->bct", weight, v)
+ return a.reshape(bs, -1, length)
+
+ @staticmethod
+ def count_flops(model, _x, y):
+ return count_flops_attn(model, _x, y)
+
+
+class QKVAttention(nn.Module):
+ """
+ A module which performs QKV attention and splits in a different order.
+ """
+
+ def __init__(self, n_heads):
+ super().__init__()
+ self.n_heads = n_heads
+
+ def forward(self, qkv):
+ """
+ Apply QKV attention.
+ :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
+ :return: an [N x (H * C) x T] tensor after attention.
+ """
+ bs, width, length = qkv.shape
+ assert width % (3 * self.n_heads) == 0
+ ch = width // (3 * self.n_heads)
+ q, k, v = qkv.chunk(3, dim=1)
+ scale = 1 / math.sqrt(math.sqrt(ch))
+ weight = th.einsum(
+ "bct,bcs->bts",
+ (q * scale).view(bs * self.n_heads, ch, length),
+ (k * scale).view(bs * self.n_heads, ch, length),
+ ) # More stable with f16 than dividing afterwards
+ weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
+ a = th.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length))
+ return a.reshape(bs, -1, length)
+
+ @staticmethod
+ def count_flops(model, _x, y):
+ return count_flops_attn(model, _x, y)
+
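The two attention modules compute the same thing and differ only in how the fused qkv tensor is packed along the channel dim ((3, heads, ch) vs (heads, 3, ch)). A small equivalence sketch, with illustrative sizes:

```python
import torch as th
from einops import rearrange
from ldmlib.modules.diffusionmodules.openaimodel import QKVAttention, QKVAttentionLegacy

bs, heads, ch, length = 2, 4, 8, 16
qkv_new = th.randn(bs, 3 * heads * ch, length)                 # [Q | K | V], each heads*ch wide
qkv_legacy = rearrange(qkv_new, 'b (three h c) t -> b (h three c) t',
                       three=3, h=heads, c=ch)                 # per-head [q | k | v] blocks

out_new = QKVAttention(heads)(qkv_new)
out_legacy = QKVAttentionLegacy(heads)(qkv_legacy)
assert th.allclose(out_new, out_legacy, atol=1e-5)
```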
+
+class UNetModel(nn.Module):
+ """
+ The full UNet model with attention and timestep embedding.
+ :param in_channels: channels in the input Tensor.
+ :param model_channels: base channel count for the model.
+ :param out_channels: channels in the output Tensor.
+ :param num_res_blocks: number of residual blocks per downsample.
+ :param attention_resolutions: a collection of downsample rates at which
+ attention will take place. May be a set, list, or tuple.
+ For example, if this contains 4, then at 4x downsampling, attention
+ will be used.
+ :param dropout: the dropout probability.
+ :param channel_mult: channel multiplier for each level of the UNet.
+ :param conv_resample: if True, use learned convolutions for upsampling and
+ downsampling.
+ :param dims: determines if the signal is 1D, 2D, or 3D.
+ :param num_classes: if specified (as an int), then this model will be
+ class-conditional with `num_classes` classes.
+ :param use_checkpoint: use gradient checkpointing to reduce memory usage.
+ :param num_heads: the number of attention heads in each attention layer.
+ :param num_head_channels: if specified, ignore num_heads and instead use
+ a fixed channel width per attention head.
+ :param num_heads_upsample: works with num_heads to set a different number
+ of heads for upsampling. Deprecated.
+ :param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
+ :param resblock_updown: use residual blocks for up/downsampling.
+ :param use_new_attention_order: use a different attention pattern for potentially
+ increased efficiency.
+ """
+
+ def __init__(
+ self,
+ image_size,
+ in_channels,
+ model_channels,
+ out_channels,
+ num_res_blocks,
+ attention_resolutions,
+ dropout=0,
+ channel_mult=(1, 2, 4, 8),
+ conv_resample=True,
+ dims=2,
+ num_classes=None,
+ use_checkpoint=False,
+ use_fp16=False,
+ num_heads=-1,
+ num_head_channels=-1,
+ num_heads_upsample=-1,
+ use_scale_shift_norm=False,
+ resblock_updown=False,
+ use_new_attention_order=False,
+ use_spatial_transformer=False, # custom transformer support
+ transformer_depth=1, # custom transformer support
+ context_dim=None, # custom transformer support
+ n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model
+ legacy=True,
+ ):
+ super().__init__()
+ if use_spatial_transformer:
+ assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'
+
+ if context_dim is not None:
+ assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
+ from omegaconf.listconfig import ListConfig
+ if type(context_dim) == ListConfig:
+ context_dim = list(context_dim)
+
+ if num_heads_upsample == -1:
+ num_heads_upsample = num_heads
+
+ if num_heads == -1:
+ assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'
+
+ if num_head_channels == -1:
+ assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'
+
+ self.image_size = image_size
+ self.in_channels = in_channels
+ self.model_channels = model_channels
+ self.out_channels = out_channels
+ self.num_res_blocks = num_res_blocks
+ self.attention_resolutions = attention_resolutions
+ self.dropout = dropout
+ self.channel_mult = channel_mult
+ self.conv_resample = conv_resample
+ self.num_classes = num_classes
+ self.use_checkpoint = use_checkpoint
+ self.dtype = th.float16 if use_fp16 else th.float32
+ self.num_heads = num_heads
+ self.num_head_channels = num_head_channels
+ self.num_heads_upsample = num_heads_upsample
+ self.predict_codebook_ids = n_embed is not None
+
+ time_embed_dim = model_channels * 4
+ self.time_embed = nn.Sequential(
+ linear(model_channels, time_embed_dim),
+ nn.SiLU(),
+ linear(time_embed_dim, time_embed_dim),
+ )
+
+ if self.num_classes is not None:
+ self.label_emb = nn.Embedding(num_classes, time_embed_dim)
+
+ self.input_blocks = nn.ModuleList(
+ [
+ TimestepEmbedSequential(
+ conv_nd(dims, in_channels, model_channels, 3, padding=1)
+ )
+ ]
+ )
+ self._feature_size = model_channels
+ input_block_chans = [model_channels]
+ ch = model_channels
+ ds = 1
+ for level, mult in enumerate(channel_mult):
+ for _ in range(num_res_blocks):
+ layers = [
+ ResBlock(
+ ch,
+ time_embed_dim,
+ dropout,
+ out_channels=mult * model_channels,
+ dims=dims,
+ use_checkpoint=use_checkpoint,
+ use_scale_shift_norm=use_scale_shift_norm,
+ )
+ ]
+ ch = mult * model_channels
+ if ds in attention_resolutions:
+ if num_head_channels == -1:
+ dim_head = ch // num_heads
+ else:
+ num_heads = ch // num_head_channels
+ dim_head = num_head_channels
+ if legacy:
+ #num_heads = 1
+ dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
+ layers.append(
+ AttentionBlock(
+ ch,
+ use_checkpoint=use_checkpoint,
+ num_heads=num_heads,
+ num_head_channels=dim_head,
+ use_new_attention_order=use_new_attention_order,
+ ) if not use_spatial_transformer else SpatialTransformer(
+ ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim
+ )
+ )
+ self.input_blocks.append(TimestepEmbedSequential(*layers))
+ self._feature_size += ch
+ input_block_chans.append(ch)
+ if level != len(channel_mult) - 1:
+ out_ch = ch
+ self.input_blocks.append(
+ TimestepEmbedSequential(
+ ResBlock(
+ ch,
+ time_embed_dim,
+ dropout,
+ out_channels=out_ch,
+ dims=dims,
+ use_checkpoint=use_checkpoint,
+ use_scale_shift_norm=use_scale_shift_norm,
+ down=True,
+ )
+ if resblock_updown
+ else Downsample(
+ ch, conv_resample, dims=dims, out_channels=out_ch
+ )
+ )
+ )
+ ch = out_ch
+ input_block_chans.append(ch)
+ ds *= 2
+ self._feature_size += ch
+
+ if num_head_channels == -1:
+ dim_head = ch // num_heads
+ else:
+ num_heads = ch // num_head_channels
+ dim_head = num_head_channels
+ if legacy:
+ #num_heads = 1
+ dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
+ self.middle_block = TimestepEmbedSequential(
+ ResBlock(
+ ch,
+ time_embed_dim,
+ dropout,
+ dims=dims,
+ use_checkpoint=use_checkpoint,
+ use_scale_shift_norm=use_scale_shift_norm,
+ ),
+ AttentionBlock(
+ ch,
+ use_checkpoint=use_checkpoint,
+ num_heads=num_heads,
+ num_head_channels=dim_head,
+ use_new_attention_order=use_new_attention_order,
+ ) if not use_spatial_transformer else SpatialTransformer(
+ ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim
+ ),
+ ResBlock(
+ ch,
+ time_embed_dim,
+ dropout,
+ dims=dims,
+ use_checkpoint=use_checkpoint,
+ use_scale_shift_norm=use_scale_shift_norm,
+ ),
+ )
+ self._feature_size += ch
+
+ self.output_blocks = nn.ModuleList([])
+ for level, mult in list(enumerate(channel_mult))[::-1]:
+ for i in range(num_res_blocks + 1):
+ ich = input_block_chans.pop()
+ layers = [
+ ResBlock(
+ ch + ich,
+ time_embed_dim,
+ dropout,
+ out_channels=model_channels * mult,
+ dims=dims,
+ use_checkpoint=use_checkpoint,
+ use_scale_shift_norm=use_scale_shift_norm,
+ )
+ ]
+ ch = model_channels * mult
+ if ds in attention_resolutions:
+ if num_head_channels == -1:
+ dim_head = ch // num_heads
+ else:
+ num_heads = ch // num_head_channels
+ dim_head = num_head_channels
+ if legacy:
+ #num_heads = 1
+ dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
+ layers.append(
+ AttentionBlock(
+ ch,
+ use_checkpoint=use_checkpoint,
+ num_heads=num_heads_upsample,
+ num_head_channels=dim_head,
+ use_new_attention_order=use_new_attention_order,
+ ) if not use_spatial_transformer else SpatialTransformer(
+ ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim
+ )
+ )
+ if level and i == num_res_blocks:
+ out_ch = ch
+ layers.append(
+ ResBlock(
+ ch,
+ time_embed_dim,
+ dropout,
+ out_channels=out_ch,
+ dims=dims,
+ use_checkpoint=use_checkpoint,
+ use_scale_shift_norm=use_scale_shift_norm,
+ up=True,
+ )
+ if resblock_updown
+ else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
+ )
+ ds //= 2
+ self.output_blocks.append(TimestepEmbedSequential(*layers))
+ self._feature_size += ch
+
+ self.out = nn.Sequential(
+ normalization(ch),
+ nn.SiLU(),
+ zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)),
+ )
+ if self.predict_codebook_ids:
+ self.id_predictor = nn.Sequential(
+ normalization(ch),
+ conv_nd(dims, model_channels, n_embed, 1),
+ #nn.LogSoftmax(dim=1) # change to cross_entropy and produce non-normalized logits
+ )
+
+ def convert_to_fp16(self):
+ """
+ Convert the torso of the model to float16.
+ """
+ self.input_blocks.apply(convert_module_to_f16)
+ self.middle_block.apply(convert_module_to_f16)
+ self.output_blocks.apply(convert_module_to_f16)
+
+ def convert_to_fp32(self):
+ """
+ Convert the torso of the model to float32.
+ """
+ self.input_blocks.apply(convert_module_to_f32)
+ self.middle_block.apply(convert_module_to_f32)
+ self.output_blocks.apply(convert_module_to_f32)
+
+ def forward(self, x, timesteps=None, context=None, y=None,**kwargs):
+ """
+ Apply the model to an input batch.
+ :param x: an [N x C x ...] Tensor of inputs.
+ :param timesteps: a 1-D batch of timesteps.
+ :param context: conditioning plugged in via crossattn
+ :param y: an [N] Tensor of labels, if class-conditional.
+ :return: an [N x C x ...] Tensor of outputs.
+ """
+ assert (y is not None) == (
+ self.num_classes is not None
+ ), "must specify y if and only if the model is class-conditional"
+ hs = []
+ t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
+ emb = self.time_embed(t_emb)
+
+ if self.num_classes is not None:
+ assert y.shape == (x.shape[0],)
+ emb = emb + self.label_emb(y)
+
+ h = x.type(self.dtype)
+ for module in self.input_blocks:
+ h = module(h, emb, context)
+ hs.append(h)
+ h = self.middle_block(h, emb, context)
+ for module in self.output_blocks:
+ h = th.cat([h, hs.pop()], dim=1)
+ h = module(h, emb, context)
+ h = h.type(x.dtype)
+ if self.predict_codebook_ids:
+ return self.id_predictor(h)
+ else:
+ return self.out(h)
+
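A minimal smoke test for UNetModel with deliberately tiny, made-up hyperparameters (the real SD v1 UNet uses model_channels=320, channel_mult=(1, 2, 4, 4), context_dim=768). It assumes the SpatialTransformer added elsewhere in this diff matches the upstream latent-diffusion signature.

```python
import torch as th
from ldmlib.modules.diffusionmodules.openaimodel import UNetModel

unet = UNetModel(
    image_size=32, in_channels=4, model_channels=32, out_channels=4,
    num_res_blocks=1, attention_resolutions=(2,), channel_mult=(1, 2),
    num_heads=4, use_spatial_transformer=True, transformer_depth=1,
    context_dim=64,
)
x = th.randn(1, 4, 32, 32)     # latent to denoise
t = th.tensor([500])           # one diffusion timestep per batch element
ctx = th.randn(1, 77, 64)      # stand-in for text-encoder token embeddings
eps = unet(x, timesteps=t, context=ctx)
assert eps.shape == x.shape    # the model predicts noise of the same shape as its input
```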
+
+class EncoderUNetModel(nn.Module):
+ """
+ The half UNet model with attention and timestep embedding.
+ For usage, see UNet.
+ """
+
+ def __init__(
+ self,
+ image_size,
+ in_channels,
+ model_channels,
+ out_channels,
+ num_res_blocks,
+ attention_resolutions,
+ dropout=0,
+ channel_mult=(1, 2, 4, 8),
+ conv_resample=True,
+ dims=2,
+ use_checkpoint=False,
+ use_fp16=False,
+ num_heads=1,
+ num_head_channels=-1,
+ num_heads_upsample=-1,
+ use_scale_shift_norm=False,
+ resblock_updown=False,
+ use_new_attention_order=False,
+ pool="adaptive",
+ *args,
+ **kwargs
+ ):
+ super().__init__()
+
+ if num_heads_upsample == -1:
+ num_heads_upsample = num_heads
+
+ self.in_channels = in_channels
+ self.model_channels = model_channels
+ self.out_channels = out_channels
+ self.num_res_blocks = num_res_blocks
+ self.attention_resolutions = attention_resolutions
+ self.dropout = dropout
+ self.channel_mult = channel_mult
+ self.conv_resample = conv_resample
+ self.use_checkpoint = use_checkpoint
+ self.dtype = th.float16 if use_fp16 else th.float32
+ self.num_heads = num_heads
+ self.num_head_channels = num_head_channels
+ self.num_heads_upsample = num_heads_upsample
+
+ time_embed_dim = model_channels * 4
+ self.time_embed = nn.Sequential(
+ linear(model_channels, time_embed_dim),
+ nn.SiLU(),
+ linear(time_embed_dim, time_embed_dim),
+ )
+
+ self.input_blocks = nn.ModuleList(
+ [
+ TimestepEmbedSequential(
+ conv_nd(dims, in_channels, model_channels, 3, padding=1)
+ )
+ ]
+ )
+ self._feature_size = model_channels
+ input_block_chans = [model_channels]
+ ch = model_channels
+ ds = 1
+ for level, mult in enumerate(channel_mult):
+ for _ in range(num_res_blocks):
+ layers = [
+ ResBlock(
+ ch,
+ time_embed_dim,
+ dropout,
+ out_channels=mult * model_channels,
+ dims=dims,
+ use_checkpoint=use_checkpoint,
+ use_scale_shift_norm=use_scale_shift_norm,
+ )
+ ]
+ ch = mult * model_channels
+ if ds in attention_resolutions:
+ layers.append(
+ AttentionBlock(
+ ch,
+ use_checkpoint=use_checkpoint,
+ num_heads=num_heads,
+ num_head_channels=num_head_channels,
+ use_new_attention_order=use_new_attention_order,
+ )
+ )
+ self.input_blocks.append(TimestepEmbedSequential(*layers))
+ self._feature_size += ch
+ input_block_chans.append(ch)
+ if level != len(channel_mult) - 1:
+ out_ch = ch
+ self.input_blocks.append(
+ TimestepEmbedSequential(
+ ResBlock(
+ ch,
+ time_embed_dim,
+ dropout,
+ out_channels=out_ch,
+ dims=dims,
+ use_checkpoint=use_checkpoint,
+ use_scale_shift_norm=use_scale_shift_norm,
+ down=True,
+ )
+ if resblock_updown
+ else Downsample(
+ ch, conv_resample, dims=dims, out_channels=out_ch
+ )
+ )
+ )
+ ch = out_ch
+ input_block_chans.append(ch)
+ ds *= 2
+ self._feature_size += ch
+
+ self.middle_block = TimestepEmbedSequential(
+ ResBlock(
+ ch,
+ time_embed_dim,
+ dropout,
+ dims=dims,
+ use_checkpoint=use_checkpoint,
+ use_scale_shift_norm=use_scale_shift_norm,
+ ),
+ AttentionBlock(
+ ch,
+ use_checkpoint=use_checkpoint,
+ num_heads=num_heads,
+ num_head_channels=num_head_channels,
+ use_new_attention_order=use_new_attention_order,
+ ),
+ ResBlock(
+ ch,
+ time_embed_dim,
+ dropout,
+ dims=dims,
+ use_checkpoint=use_checkpoint,
+ use_scale_shift_norm=use_scale_shift_norm,
+ ),
+ )
+ self._feature_size += ch
+ self.pool = pool
+ if pool == "adaptive":
+ self.out = nn.Sequential(
+ normalization(ch),
+ nn.SiLU(),
+ nn.AdaptiveAvgPool2d((1, 1)),
+ zero_module(conv_nd(dims, ch, out_channels, 1)),
+ nn.Flatten(),
+ )
+ elif pool == "attention":
+ assert num_head_channels != -1
+ self.out = nn.Sequential(
+ normalization(ch),
+ nn.SiLU(),
+ AttentionPool2d(
+ (image_size // ds), ch, num_head_channels, out_channels
+ ),
+ )
+ elif pool == "spatial":
+ self.out = nn.Sequential(
+ nn.Linear(self._feature_size, 2048),
+ nn.ReLU(),
+ nn.Linear(2048, self.out_channels),
+ )
+ elif pool == "spatial_v2":
+ self.out = nn.Sequential(
+ nn.Linear(self._feature_size, 2048),
+ normalization(2048),
+ nn.SiLU(),
+ nn.Linear(2048, self.out_channels),
+ )
+ else:
+ raise NotImplementedError(f"Unexpected {pool} pooling")
+
+ def convert_to_fp16(self):
+ """
+ Convert the torso of the model to float16.
+ """
+ self.input_blocks.apply(convert_module_to_f16)
+ self.middle_block.apply(convert_module_to_f16)
+
+ def convert_to_fp32(self):
+ """
+ Convert the torso of the model to float32.
+ """
+ self.input_blocks.apply(convert_module_to_f32)
+ self.middle_block.apply(convert_module_to_f32)
+
+ def forward(self, x, timesteps):
+ """
+ Apply the model to an input batch.
+ :param x: an [N x C x ...] Tensor of inputs.
+ :param timesteps: a 1-D batch of timesteps.
+ :return: an [N x K] Tensor of outputs.
+ """
+ emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
+
+ results = []
+ h = x.type(self.dtype)
+ for module in self.input_blocks:
+ h = module(h, emb)
+ if self.pool.startswith("spatial"):
+ results.append(h.type(x.dtype).mean(dim=(2, 3)))
+ h = self.middle_block(h, emb)
+ if self.pool.startswith("spatial"):
+ results.append(h.type(x.dtype).mean(dim=(2, 3)))
+ h = th.cat(results, axis=-1)
+ return self.out(h)
+ else:
+ h = h.type(x.dtype)
+ return self.out(h)
diff --git a/ldmlib/modules/diffusionmodules/util.py b/ldmlib/modules/diffusionmodules/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1dc1d424015d2c6c92342b85a992f931e5a1dc1
--- /dev/null
+++ b/ldmlib/modules/diffusionmodules/util.py
@@ -0,0 +1,267 @@
+# adapted from
+# https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
+# and
+# https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
+# and
+# https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py
+#
+# thanks!
+
+
+import os
+import math
+import torch
+import torch.nn as nn
+import numpy as np
+from einops import repeat
+
+from ldmlib.util import instantiate_from_config
+
+
+def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
+ if schedule == "linear":
+ betas = (
+ torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2
+ )
+
+ elif schedule == "cosine":
+ timesteps = (
+ torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s
+ )
+ alphas = timesteps / (1 + cosine_s) * np.pi / 2
+ alphas = torch.cos(alphas).pow(2)
+ alphas = alphas / alphas[0]
+ betas = 1 - alphas[1:] / alphas[:-1]
+ betas = np.clip(betas, a_min=0, a_max=0.999)
+
+ elif schedule == "sqrt_linear":
+ betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64)
+ elif schedule == "sqrt":
+ betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) ** 0.5
+ else:
+ raise ValueError(f"schedule '{schedule}' unknown.")
+ return betas.numpy()
+
+
+def make_ddim_timesteps(ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True):
+ if ddim_discr_method == 'uniform':
+ c = num_ddpm_timesteps // num_ddim_timesteps
+ ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c)))
+ elif ddim_discr_method == 'quad':
+ ddim_timesteps = ((np.linspace(0, np.sqrt(num_ddpm_timesteps * .8), num_ddim_timesteps)) ** 2).astype(int)
+ else:
+ raise NotImplementedError(f'There is no ddim discretization method called "{ddim_discr_method}"')
+
+ # assert ddim_timesteps.shape[0] == num_ddim_timesteps
+ # add one to get the final alpha values right (the ones from first scale to data during sampling)
+ steps_out = ddim_timesteps + 1
+ if verbose:
+ print(f'Selected timesteps for ddim sampler: {steps_out}')
+ return steps_out
+
+
+def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True):
+ # select alphas for computing the variance schedule
+ alphas = alphacums[ddim_timesteps]
+ alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist())
+
+ # according to the formula provided in https://arxiv.org/abs/2010.02502
+ sigmas = eta * np.sqrt((1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev))
+ if verbose:
+ print(f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}')
+ print(f'For the chosen value of eta, which is {eta}, '
+ f'this results in the following sigma_t schedule for ddim sampler {sigmas}')
+ return sigmas, alphas, alphas_prev
+
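A brief sketch of how these schedule helpers fit together. The linear_start/linear_end values are the ones typically found in the v1 configs and are assumptions here, not read from this repository.

```python
import numpy as np
from ldmlib.modules.diffusionmodules.util import (
    make_beta_schedule, make_ddim_timesteps, make_ddim_sampling_parameters)

# "linear" squares a linspace taken in sqrt-beta space over 1000 training steps.
betas = make_beta_schedule("linear", 1000, linear_start=0.00085, linear_end=0.012)
alphas_cumprod = np.cumprod(1.0 - betas)

# Pick 50 uniformly spaced DDIM steps out of the 1000 DDPM steps; with eta=0
# every sigma is zero, so DDIM sampling becomes deterministic.
ddim_ts = make_ddim_timesteps("uniform", 50, 1000, verbose=False)   # 1, 21, ..., 981
sigmas, a_t, a_prev = make_ddim_sampling_parameters(
    alphas_cumprod, ddim_ts, eta=0.0, verbose=False)
assert np.all(sigmas == 0.0)
```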
+
+def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
+ """
+ Create a beta schedule that discretizes the given alpha_t_bar function,
+ which defines the cumulative product of (1-beta) over time from t = [0,1].
+ :param num_diffusion_timesteps: the number of betas to produce.
+ :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
+ produces the cumulative product of (1-beta) up to that
+ part of the diffusion process.
+ :param max_beta: the maximum beta to use; use values lower than 1 to
+ prevent singularities.
+ """
+ betas = []
+ for i in range(num_diffusion_timesteps):
+ t1 = i / num_diffusion_timesteps
+ t2 = (i + 1) / num_diffusion_timesteps
+ betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+ return np.array(betas)
+
+
+def extract_into_tensor(a, t, x_shape):
+ b, *_ = t.shape
+ out = a.gather(-1, t)
+ return out.reshape(b, *((1,) * (len(x_shape) - 1)))
+
+
+def checkpoint(func, inputs, params, flag):
+ """
+ Evaluate a function without caching intermediate activations, allowing for
+ reduced memory at the expense of extra compute in the backward pass.
+ :param func: the function to evaluate.
+ :param inputs: the argument sequence to pass to `func`.
+ :param params: a sequence of parameters `func` depends on but does not
+ explicitly take as arguments.
+ :param flag: if False, disable gradient checkpointing.
+ """
+ if flag:
+ args = tuple(inputs) + tuple(params)
+ return CheckpointFunction.apply(func, len(inputs), *args)
+ else:
+ return func(*inputs)
+
+
+class CheckpointFunction(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, run_function, length, *args):
+ ctx.run_function = run_function
+ ctx.input_tensors = list(args[:length])
+ ctx.input_params = list(args[length:])
+
+ with torch.no_grad():
+ output_tensors = ctx.run_function(*ctx.input_tensors)
+ return output_tensors
+
+ @staticmethod
+ def backward(ctx, *output_grads):
+ ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
+ with torch.enable_grad():
+ # Fixes a bug where the first op in run_function modifies the
+ # Tensor storage in place, which is not allowed for detach()'d
+ # Tensors.
+ shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
+ output_tensors = ctx.run_function(*shallow_copies)
+ input_grads = torch.autograd.grad(
+ output_tensors,
+ ctx.input_tensors + ctx.input_params,
+ output_grads,
+ allow_unused=True,
+ )
+ del ctx.input_tensors
+ del ctx.input_params
+ del output_tensors
+ return (None, None) + input_grads
+
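To make the calling convention concrete, here is a hedged usage sketch; the toy module and all names below are made up.

```python
import torch

lin = torch.nn.Linear(16, 16)

def expensive(x):
    # stand-in for a block whose activations we would rather recompute in backward
    return torch.relu(lin(x)) ** 2

x = torch.randn(4, 16, requires_grad=True)
# `params` lists what `expensive` touches without taking it as an argument,
# so CheckpointFunction can route gradients to those tensors in backward().
y = checkpoint(expensive, (x,), tuple(lin.parameters()), flag=True)
y.sum().backward()
assert x.grad is not None and lin.weight.grad is not None
```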
+
+def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
+ """
+ Create sinusoidal timestep embeddings.
+ :param timesteps: a 1-D Tensor of N indices, one per batch element.
+ These may be fractional.
+ :param dim: the dimension of the output.
+ :param max_period: controls the minimum frequency of the embeddings.
+ :return: an [N x dim] Tensor of positional embeddings.
+ """
+ if not repeat_only:
+ half = dim // 2
+ freqs = torch.exp(
+ -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
+ ).to(device=timesteps.device)
+ args = timesteps[:, None].float() * freqs[None]
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+ if dim % 2:
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+ else:
+ embedding = repeat(timesteps, 'b -> b d', d=dim)
+ return embedding
+
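A quick illustration of the embedding this produces:

```python
import torch

t = torch.tensor([0, 10, 999])
emb = timestep_embedding(t, dim=320)     # -> torch.Size([3, 320])
# Each row is [cos(t * f_0..f_159), sin(t * f_0..f_159)], with frequencies
# log-spaced from 1 down to roughly 1/max_period.
```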
+
+def zero_module(module):
+ """
+ Zero out the parameters of a module and return it.
+ """
+ for p in module.parameters():
+ p.detach().zero_()
+ return module
+
+
+def scale_module(module, scale):
+ """
+ Scale the parameters of a module and return it.
+ """
+ for p in module.parameters():
+ p.detach().mul_(scale)
+ return module
+
+
+def mean_flat(tensor):
+ """
+ Take the mean over all non-batch dimensions.
+ """
+ return tensor.mean(dim=list(range(1, len(tensor.shape))))
+
+
+def normalization(channels):
+ """
+ Make a standard normalization layer.
+ :param channels: number of input channels.
+ :return: an nn.Module for normalization.
+ """
+ return GroupNorm32(32, channels)
+
+
+# PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
+class SiLU(nn.Module):
+ def forward(self, x):
+ return x * torch.sigmoid(x)
+
+
+class GroupNorm32(nn.GroupNorm):
+ def forward(self, x):
+ return super().forward(x.float()).type(x.dtype)
+
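Worth noting: `normalization()` above builds a 32-group GroupNorm that upcasts to float32 and casts back, which keeps half-precision activations numerically stable and is why channel counts in these models stay divisible by 32. A tiny illustration:

```python
import torch

norm = normalization(320)              # GroupNorm32(32, 320)
h = torch.randn(2, 320, 8, 8).half()
out = norm(h)                          # normalized in float32 internally, returned as float16
assert out.dtype == torch.float16
```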
+def conv_nd(dims, *args, **kwargs):
+ """
+ Create a 1D, 2D, or 3D convolution module.
+ """
+ if dims == 1:
+ return nn.Conv1d(*args, **kwargs)
+ elif dims == 2:
+ return nn.Conv2d(*args, **kwargs)
+ elif dims == 3:
+ return nn.Conv3d(*args, **kwargs)
+ raise ValueError(f"unsupported dimensions: {dims}")
+
+
+def linear(*args, **kwargs):
+ """
+ Create a linear module.
+ """
+ return nn.Linear(*args, **kwargs)
+
+
+def avg_pool_nd(dims, *args, **kwargs):
+ """
+ Create a 1D, 2D, or 3D average pooling module.
+ """
+ if dims == 1:
+ return nn.AvgPool1d(*args, **kwargs)
+ elif dims == 2:
+ return nn.AvgPool2d(*args, **kwargs)
+ elif dims == 3:
+ return nn.AvgPool3d(*args, **kwargs)
+ raise ValueError(f"unsupported dimensions: {dims}")
+
+
+class HybridConditioner(nn.Module):
+
+ def __init__(self, c_concat_config, c_crossattn_config):
+ super().__init__()
+ self.concat_conditioner = instantiate_from_config(c_concat_config)
+ self.crossattn_conditioner = instantiate_from_config(c_crossattn_config)
+
+ def forward(self, c_concat, c_crossattn):
+ c_concat = self.concat_conditioner(c_concat)
+ c_crossattn = self.crossattn_conditioner(c_crossattn)
+ return {'c_concat': [c_concat], 'c_crossattn': [c_crossattn]}
+
+
+def noise_like(shape, device, repeat=False):
+ repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
+ noise = lambda: torch.randn(shape, device=device)
+ return repeat_noise() if repeat else noise()
diff --git a/ldmlib/modules/distributions/__init__.py b/ldmlib/modules/distributions/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ldmlib/modules/distributions/__pycache__/__init__.cpython-38.pyc b/ldmlib/modules/distributions/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..816b596097fa279470d65a5a15b0eeae5022518b
Binary files /dev/null and b/ldmlib/modules/distributions/__pycache__/__init__.cpython-38.pyc differ
diff --git a/ldmlib/modules/distributions/__pycache__/distributions.cpython-38.pyc b/ldmlib/modules/distributions/__pycache__/distributions.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..296ac9305078f34e4b03579bea0dca5d4a30a1a3
Binary files /dev/null and b/ldmlib/modules/distributions/__pycache__/distributions.cpython-38.pyc differ
diff --git a/ldmlib/modules/distributions/distributions.py b/ldmlib/modules/distributions/distributions.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2b8ef901130efc171aa69742ca0244d94d3f2e9
--- /dev/null
+++ b/ldmlib/modules/distributions/distributions.py
@@ -0,0 +1,92 @@
+import torch
+import numpy as np
+
+
+class AbstractDistribution:
+ def sample(self):
+ raise NotImplementedError()
+
+ def mode(self):
+ raise NotImplementedError()
+
+
+class DiracDistribution(AbstractDistribution):
+ def __init__(self, value):
+ self.value = value
+
+ def sample(self):
+ return self.value
+
+ def mode(self):
+ return self.value
+
+
+class DiagonalGaussianDistribution(object):
+ def __init__(self, parameters, deterministic=False):
+ self.parameters = parameters
+ self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
+ self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
+ self.deterministic = deterministic
+ self.std = torch.exp(0.5 * self.logvar)
+ self.var = torch.exp(self.logvar)
+ if self.deterministic:
+ self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device)
+
+ def sample(self):
+ x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device)
+ return x
+
+ def kl(self, other=None):
+ if self.deterministic:
+ return torch.Tensor([0.])
+ else:
+ if other is None:
+ return 0.5 * torch.sum(torch.pow(self.mean, 2)
+ + self.var - 1.0 - self.logvar,
+ dim=[1, 2, 3])
+ else:
+ return 0.5 * torch.sum(
+ torch.pow(self.mean - other.mean, 2) / other.var
+ + self.var / other.var - 1.0 - self.logvar + other.logvar,
+ dim=[1, 2, 3])
+
+ def nll(self, sample, dims=[1,2,3]):
+ if self.deterministic:
+ return torch.Tensor([0.])
+ logtwopi = np.log(2.0 * np.pi)
+ return 0.5 * torch.sum(
+ logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
+ dim=dims)
+
+ def mode(self):
+ return self.mean
+
+
+def normal_kl(mean1, logvar1, mean2, logvar2):
+ """
+ source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12
+ Compute the KL divergence between two gaussians.
+ Shapes are automatically broadcasted, so batches can be compared to
+ scalars, among other use cases.
+ """
+ tensor = None
+ for obj in (mean1, logvar1, mean2, logvar2):
+ if isinstance(obj, torch.Tensor):
+ tensor = obj
+ break
+ assert tensor is not None, "at least one argument must be a Tensor"
+
+ # Force variances to be Tensors. Broadcasting helps convert scalars to
+ # Tensors, but it does not work for torch.exp().
+ logvar1, logvar2 = [
+ x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor)
+ for x in (logvar1, logvar2)
+ ]
+
+ return 0.5 * (
+ -1.0
+ + logvar2
+ - logvar1
+ + torch.exp(logvar1 - logvar2)
+ + ((mean1 - mean2) ** 2) * torch.exp(-logvar2)
+ )
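+# Sketch of typical use (an assumption about the calling code, which lives elsewhere):
+# a VAE-style encoder predicts `parameters` of shape (B, 2*C, H, W); then
+#   posterior = DiagonalGaussianDistribution(parameters)
+#   z = posterior.sample()      # (B, C, H, W) latent draw
+#   kl = posterior.kl()         # per-sample KL to a standard normal, summed over (C, H, W)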
diff --git a/ldmlib/modules/ema.py b/ldmlib/modules/ema.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8c75af43565f6e140287644aaaefa97dd6e67c5
--- /dev/null
+++ b/ldmlib/modules/ema.py
@@ -0,0 +1,76 @@
+import torch
+from torch import nn
+
+
+class LitEma(nn.Module):
+ def __init__(self, model, decay=0.9999, use_num_upates=True):
+ super().__init__()
+ if decay < 0.0 or decay > 1.0:
+ raise ValueError('Decay must be between 0 and 1')
+
+ self.m_name2s_name = {}
+ self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32))
+ self.register_buffer('num_updates', torch.tensor(0,dtype=torch.int) if use_num_upates
+ else torch.tensor(-1,dtype=torch.int))
+
+ for name, p in model.named_parameters():
+ if p.requires_grad:
+                # remove '.' since it is not allowed in buffer names
+ s_name = name.replace('.','')
+ self.m_name2s_name.update({name:s_name})
+ self.register_buffer(s_name,p.clone().detach().data)
+
+ self.collected_params = []
+
+ def forward(self,model):
+ decay = self.decay
+
+ if self.num_updates >= 0:
+ self.num_updates += 1
+ decay = min(self.decay,(1 + self.num_updates) / (10 + self.num_updates))
+
+ one_minus_decay = 1.0 - decay
+
+ with torch.no_grad():
+ m_param = dict(model.named_parameters())
+ shadow_params = dict(self.named_buffers())
+
+ for key in m_param:
+ if m_param[key].requires_grad:
+ sname = self.m_name2s_name[key]
+ shadow_params[sname] = shadow_params[sname].type_as(m_param[key])
+ shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key]))
+ else:
+ assert not key in self.m_name2s_name
+
+ def copy_to(self, model):
+ m_param = dict(model.named_parameters())
+ shadow_params = dict(self.named_buffers())
+ for key in m_param:
+ if m_param[key].requires_grad:
+ m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data)
+ else:
+ assert not key in self.m_name2s_name
+
+ def store(self, parameters):
+ """
+ Save the current parameters for restoring later.
+ Args:
+ parameters: Iterable of `torch.nn.Parameter`; the parameters to be
+ temporarily stored.
+ """
+ self.collected_params = [param.clone() for param in parameters]
+
+ def restore(self, parameters):
+ """
+ Restore the parameters stored with the `store` method.
+ Useful to validate the model with EMA parameters without affecting the
+ original optimization process. Store the parameters before the
+ `copy_to` method. After validation (or model saving), use this to
+ restore the former parameters.
+ Args:
+ parameters: Iterable of `torch.nn.Parameter`; the parameters to be
+ updated with the stored parameters.
+ """
+ for c_param, param in zip(self.collected_params, parameters):
+ param.data.copy_(c_param.data)
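+# Usage sketch (hedged; the actual training loop lives in the model code):
+#   ema = LitEma(model)
+#   ema(model)                        # after each optimizer step: update the shadow weights
+#   ema.store(model.parameters())     # before evaluation
+#   ema.copy_to(model)                # run validation / save with EMA weights
+#   ema.restore(model.parameters())   # then switch back to the raw training weights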
diff --git a/ldmlib/modules/encoders/__init__.py b/ldmlib/modules/encoders/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ldmlib/modules/encoders/__pycache__/__init__.cpython-38.pyc b/ldmlib/modules/encoders/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b76187df760ced3227c211baad4fc894ce47467
Binary files /dev/null and b/ldmlib/modules/encoders/__pycache__/__init__.cpython-38.pyc differ
diff --git a/ldmlib/modules/encoders/__pycache__/modules.cpython-38.pyc b/ldmlib/modules/encoders/__pycache__/modules.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9d8530157708745338e93ec53f6fafd15bb25b69
Binary files /dev/null and b/ldmlib/modules/encoders/__pycache__/modules.cpython-38.pyc differ
diff --git a/ldmlib/modules/encoders/modules.py b/ldmlib/modules/encoders/modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..a44b674adfb114600fae943dd12ca9169c259c72
--- /dev/null
+++ b/ldmlib/modules/encoders/modules.py
@@ -0,0 +1,234 @@
+import torch
+import torch.nn as nn
+from functools import partial
+import clip
+from einops import rearrange, repeat
+from transformers import CLIPTokenizer, CLIPTextModel
+import kornia
+
+from ldmlib.modules.x_transformer import Encoder, TransformerWrapper  # TODO: can we directly rely on lucidrains code and simply add this as a requirement? --> test
+
+
+class AbstractEncoder(nn.Module):
+ def __init__(self):
+ super().__init__()
+
+ def encode(self, *args, **kwargs):
+ raise NotImplementedError
+
+
+
+class ClassEmbedder(nn.Module):
+ def __init__(self, embed_dim, n_classes=1000, key='class'):
+ super().__init__()
+ self.key = key
+ self.embedding = nn.Embedding(n_classes, embed_dim)
+
+ def forward(self, batch, key=None):
+ if key is None:
+ key = self.key
+ # this is for use in crossattn
+ c = batch[key][:, None]
+ c = self.embedding(c)
+ return c
+
+
+class TransformerEmbedder(AbstractEncoder):
+ """Some transformer encoder layers"""
+ def __init__(self, n_embed, n_layer, vocab_size, max_seq_len=77, device="cuda"):
+ super().__init__()
+ self.device = device
+ self.transformer = TransformerWrapper(num_tokens=vocab_size, max_seq_len=max_seq_len,
+ attn_layers=Encoder(dim=n_embed, depth=n_layer))
+
+ def forward(self, tokens):
+ tokens = tokens.to(self.device) # meh
+ z = self.transformer(tokens, return_embeddings=True)
+ return z
+
+ def encode(self, x):
+ return self(x)
+
+
+class BERTTokenizer(AbstractEncoder):
+ """ Uses a pretrained BERT tokenizer by huggingface. Vocab size: 30522 (?)"""
+ def __init__(self, device="cuda", vq_interface=True, max_length=77):
+ super().__init__()
+        from transformers import BertTokenizerFast  # TODO: add to requirements
+ self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
+ self.device = device
+ self.vq_interface = vq_interface
+ self.max_length = max_length
+
+ def forward(self, text):
+ batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
+ return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
+ tokens = batch_encoding["input_ids"].to(self.device)
+ return tokens
+
+ @torch.no_grad()
+ def encode(self, text):
+ tokens = self(text)
+ if not self.vq_interface:
+ return tokens
+ return None, None, [None, None, tokens]
+
+ def decode(self, text):
+ return text
+
+
+class BERTEmbedder(AbstractEncoder):
+    """Uses the BERT tokenizer and adds some transformer encoder layers"""
+ def __init__(self, n_embed, n_layer, vocab_size=30522, max_seq_len=77,
+ device="cuda",use_tokenizer=True, embedding_dropout=0.0):
+ super().__init__()
+ self.use_tknz_fn = use_tokenizer
+ if self.use_tknz_fn:
+ self.tknz_fn = BERTTokenizer(vq_interface=False, max_length=max_seq_len)
+ self.device = device
+ self.transformer = TransformerWrapper(num_tokens=vocab_size, max_seq_len=max_seq_len,
+ attn_layers=Encoder(dim=n_embed, depth=n_layer),
+ emb_dropout=embedding_dropout)
+
+ def forward(self, text):
+ if self.use_tknz_fn:
+ tokens = self.tknz_fn(text)#.to(self.device)
+ else:
+ tokens = text
+ z = self.transformer(tokens, return_embeddings=True)
+ return z
+
+ def encode(self, text):
+ # output of length 77
+ return self(text)
+
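+# Note (descriptive): with the defaults, encode() tokenises to max_seq_len=77
+# positions and returns per-token embeddings of shape (batch, 77, n_embed).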
+
+class SpatialRescaler(nn.Module):
+ def __init__(self,
+ n_stages=1,
+ method='bilinear',
+ multiplier=0.5,
+ in_channels=3,
+ out_channels=None,
+ bias=False):
+ super().__init__()
+ self.n_stages = n_stages
+ assert self.n_stages >= 0
+ assert method in ['nearest','linear','bilinear','trilinear','bicubic','area']
+ self.multiplier = multiplier
+ self.interpolator = partial(torch.nn.functional.interpolate, mode=method)
+ self.remap_output = out_channels is not None
+ if self.remap_output:
+ print(f'Spatial Rescaler mapping from {in_channels} to {out_channels} channels after resizing.')
+ self.channel_mapper = nn.Conv2d(in_channels,out_channels,1,bias=bias)
+
+ def forward(self,x):
+ for stage in range(self.n_stages):
+ x = self.interpolator(x, scale_factor=self.multiplier)
+
+
+ if self.remap_output:
+ x = self.channel_mapper(x)
+ return x
+
+ def encode(self, x):
+ return self(x)
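+# Example (illustrative): SpatialRescaler(n_stages=2, multiplier=0.5) halves the
+# spatial resolution twice (an overall 4x downscale); if out_channels is set, a
+# 1x1 convolution additionally remaps the channel count after resizing.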
+
+class FrozenCLIPEmbedder(AbstractEncoder):
+ """Uses the CLIP transformer encoder for text (from Hugging Face)"""
+ def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77):
+ super().__init__()
+ self.tokenizer = CLIPTokenizer.from_pretrained(version)
+ self.transformer = CLIPTextModel.from_pretrained(version)
+ self.device = device
+ self.max_length = max_length
+ self.freeze()
+
+ def freeze(self):
+ self.transformer = self.transformer.eval()
+ for param in self.parameters():
+ param.requires_grad = False
+
+ def forward(self, text):
+ batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
+ return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
+ tokens = batch_encoding["input_ids"].to(self.device)
+ outputs = self.transformer(input_ids=tokens)
+
+ z = outputs.last_hidden_state
+ return z
+
+ def encode(self, text):
+ return self(text)
+
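+# Example (sketch; assumes a CUDA device since `device` defaults to "cuda"):
+#   clip_text = FrozenCLIPEmbedder().to("cuda")
+#   z = clip_text.encode(["a photograph of an astronaut riding a horse"])
+# For clip-vit-large-patch14 this yields token embeddings of shape (1, 77, 768).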
+
+class FrozenCLIPTextEmbedder(nn.Module):
+ """
+ Uses the CLIP transformer encoder for text.
+ """
+ def __init__(self, version='ViT-L/14', device="cuda", max_length=77, n_repeat=1, normalize=True):
+ super().__init__()
+ self.model, _ = clip.load(version, jit=False, device="cpu")
+ self.device = device
+ self.max_length = max_length
+ self.n_repeat = n_repeat
+ self.normalize = normalize
+
+ def freeze(self):
+ self.model = self.model.eval()
+ for param in self.parameters():
+ param.requires_grad = False
+
+ def forward(self, text):
+ tokens = clip.tokenize(text).to(self.device)
+ z = self.model.encode_text(tokens)
+ if self.normalize:
+ z = z / torch.linalg.norm(z, dim=1, keepdim=True)
+ return z
+
+ def encode(self, text):
+ z = self(text)
+ if z.ndim==2:
+ z = z[:, None, :]
+ z = repeat(z, 'b 1 d -> b k d', k=self.n_repeat)
+ return z
+
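+# Note (descriptive): unlike FrozenCLIPEmbedder above, this class returns the
+# pooled (and optionally L2-normalised) CLIP text embedding, repeated n_repeat
+# times along the sequence axis, i.e. a tensor of shape (B, n_repeat, D).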
+
+class FrozenClipImageEmbedder(nn.Module):
+ """
+ Uses the CLIP image encoder.
+ """
+ def __init__(
+ self,
+ model,
+ jit=False,
+ device='cuda' if torch.cuda.is_available() else 'cpu',
+ antialias=False,
+ ):
+ super().__init__()
+ self.model, _ = clip.load(name=model, device=device, jit=jit)
+
+ self.antialias = antialias
+
+ self.register_buffer('mean', torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False)
+ self.register_buffer('std', torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False)
+
+ def preprocess(self, x):
+ # normalize to [0,1]
+ x = kornia.geometry.resize(x, (224, 224),
+ interpolation='bicubic',align_corners=True,
+ antialias=self.antialias)
+ x = (x + 1.) / 2.
+ # renormalize according to clip
+ x = kornia.enhance.normalize(x, self.mean, self.std)
+ return x
+
+ def forward(self, x):
+ # x is assumed to be in range [-1,1]
+ return self.model.encode_image(self.preprocess(x))
+
+
+if __name__ == "__main__":
+ from ldmlib.util import count_params
+ model = FrozenCLIPEmbedder()
+ count_params(model, verbose=True)
diff --git a/ldmlib/modules/image_degradation/__init__.py b/ldmlib/modules/image_degradation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ef6d15c57d130b3e449a0c62586f32edd8392c6
--- /dev/null
+++ b/ldmlib/modules/image_degradation/__init__.py
@@ -0,0 +1,2 @@
+from ldmlib.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr
+from ldmlib.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light
diff --git a/ldmlib/modules/image_degradation/bsrgan.py b/ldmlib/modules/image_degradation/bsrgan.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e4d39ee7c7d80e58f876f7e0c278c2e190b3456
--- /dev/null
+++ b/ldmlib/modules/image_degradation/bsrgan.py
@@ -0,0 +1,728 @@
+# -*- coding: utf-8 -*-
+"""
+# --------------------------------------------
+# Super-Resolution
+# --------------------------------------------
+#
+# Kai Zhang (cskaizhang@gmail.com)
+# https://github.com/cszn
+# From 2019/03--2021/08
+# --------------------------------------------
+"""
+
+import numpy as np
+import cv2
+import torch
+
+from functools import partial
+import random
+from scipy import ndimage
+import scipy
+import scipy.stats as ss
+from scipy.interpolate import interp2d
+from scipy.linalg import orth
+import albumentations
+
+import ldmlib.modules.image_degradation.utils_image as util
+
+
+def modcrop_np(img, sf):
+ '''
+ Args:
+ img: numpy image, WxH or WxHxC
+ sf: scale factor
+ Return:
+ cropped image
+ '''
+ w, h = img.shape[:2]
+ im = np.copy(img)
+ return im[:w - w % sf, :h - h % sf, ...]
+
+
+"""
+# --------------------------------------------
+# anisotropic Gaussian kernels
+# --------------------------------------------
+"""
+
+
+def analytic_kernel(k):
+ """Calculate the X4 kernel from the X2 kernel (for proof see appendix in paper)"""
+ k_size = k.shape[0]
+    # Calculate the big kernel's size
+ big_k = np.zeros((3 * k_size - 2, 3 * k_size - 2))
+ # Loop over the small kernel to fill the big one
+ for r in range(k_size):
+ for c in range(k_size):
+ big_k[2 * r:2 * r + k_size, 2 * c:2 * c + k_size] += k[r, c] * k
+ # Crop the edges of the big kernel to ignore very small values and increase run time of SR
+ crop = k_size // 2
+ cropped_big_k = big_k[crop:-crop, crop:-crop]
+ # Normalize to 1
+ return cropped_big_k / cropped_big_k.sum()
+
+
+def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6):
+ """ generate an anisotropic Gaussian kernel
+ Args:
+ ksize : e.g., 15, kernel size
+ theta : [0, pi], rotation angle range
+ l1 : [0.1,50], scaling of eigenvalues
+ l2 : [0.1,l1], scaling of eigenvalues
+ If l1 = l2, will get an isotropic Gaussian kernel.
+ Returns:
+ k : kernel
+ """
+
+ v = np.dot(np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]), np.array([1., 0.]))
+ V = np.array([[v[0], v[1]], [v[1], -v[0]]])
+ D = np.array([[l1, 0], [0, l2]])
+ Sigma = np.dot(np.dot(V, D), np.linalg.inv(V))
+ k = gm_blur_kernel(mean=[0, 0], cov=Sigma, size=ksize)
+
+ return k
+
+
+def gm_blur_kernel(mean, cov, size=15):
+ center = size / 2.0 + 0.5
+ k = np.zeros([size, size])
+ for y in range(size):
+ for x in range(size):
+ cy = y - center + 1
+ cx = x - center + 1
+ k[y, x] = ss.multivariate_normal.pdf([cx, cy], mean=mean, cov=cov)
+
+ k = k / np.sum(k)
+ return k
+
+
+def shift_pixel(x, sf, upper_left=True):
+ """shift pixel for super-resolution with different scale factors
+ Args:
+ x: WxHxC or WxH
+ sf: scale factor
+ upper_left: shift direction
+ """
+ h, w = x.shape[:2]
+ shift = (sf - 1) * 0.5
+ xv, yv = np.arange(0, w, 1.0), np.arange(0, h, 1.0)
+ if upper_left:
+ x1 = xv + shift
+ y1 = yv + shift
+ else:
+ x1 = xv - shift
+ y1 = yv - shift
+
+ x1 = np.clip(x1, 0, w - 1)
+ y1 = np.clip(y1, 0, h - 1)
+
+ if x.ndim == 2:
+ x = interp2d(xv, yv, x)(x1, y1)
+ if x.ndim == 3:
+ for i in range(x.shape[-1]):
+ x[:, :, i] = interp2d(xv, yv, x[:, :, i])(x1, y1)
+
+ return x
+
+
+def blur(x, k):
+ '''
+ x: image, NxcxHxW
+ k: kernel, Nx1xhxw
+ '''
+ n, c = x.shape[:2]
+ p1, p2 = (k.shape[-2] - 1) // 2, (k.shape[-1] - 1) // 2
+ x = torch.nn.functional.pad(x, pad=(p1, p2, p1, p2), mode='replicate')
+ k = k.repeat(1, c, 1, 1)
+ k = k.view(-1, 1, k.shape[2], k.shape[3])
+ x = x.view(1, -1, x.shape[2], x.shape[3])
+ x = torch.nn.functional.conv2d(x, k, bias=None, stride=1, padding=0, groups=n * c)
+ x = x.view(n, c, x.shape[2], x.shape[3])
+
+ return x
+
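+# Note (descriptive): `blur` applies a different kernel to every image in the
+# batch by folding batch and channels into a single grouped convolution with
+# groups = N * C, so each (image, channel) pair gets its own 2-D kernel.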
+
+def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var=0.6, max_var=10., noise_level=0):
+ """"
+ # modified version of https://github.com/assafshocher/BlindSR_dataset_generator
+ # Kai Zhang
+ # min_var = 0.175 * sf # variance of the gaussian kernel will be sampled between min_var and max_var
+ # max_var = 2.5 * sf
+ """
+ # Set random eigen-vals (lambdas) and angle (theta) for COV matrix
+ lambda_1 = min_var + np.random.rand() * (max_var - min_var)
+ lambda_2 = min_var + np.random.rand() * (max_var - min_var)
+ theta = np.random.rand() * np.pi # random theta
+ noise = -noise_level + np.random.rand(*k_size) * noise_level * 2
+
+ # Set COV matrix using Lambdas and Theta
+ LAMBDA = np.diag([lambda_1, lambda_2])
+ Q = np.array([[np.cos(theta), -np.sin(theta)],
+ [np.sin(theta), np.cos(theta)]])
+ SIGMA = Q @ LAMBDA @ Q.T
+ INV_SIGMA = np.linalg.inv(SIGMA)[None, None, :, :]
+
+ # Set expectation position (shifting kernel for aligned image)
+ MU = k_size // 2 - 0.5 * (scale_factor - 1) # - 0.5 * (scale_factor - k_size % 2)
+ MU = MU[None, None, :, None]
+
+ # Create meshgrid for Gaussian
+ [X, Y] = np.meshgrid(range(k_size[0]), range(k_size[1]))
+ Z = np.stack([X, Y], 2)[:, :, :, None]
+
+    # Calculate Gaussian for every pixel of the kernel
+ ZZ = Z - MU
+ ZZ_t = ZZ.transpose(0, 1, 3, 2)
+ raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @ INV_SIGMA @ ZZ)) * (1 + noise)
+
+ # shift the kernel so it will be centered
+ # raw_kernel_centered = kernel_shift(raw_kernel, scale_factor)
+
+ # Normalize the kernel and return
+ # kernel = raw_kernel_centered / np.sum(raw_kernel_centered)
+ kernel = raw_kernel / np.sum(raw_kernel)
+ return kernel
+
+
+def fspecial_gaussian(hsize, sigma):
+ hsize = [hsize, hsize]
+ siz = [(hsize[0] - 1.0) / 2.0, (hsize[1] - 1.0) / 2.0]
+ std = sigma
+ [x, y] = np.meshgrid(np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1))
+ arg = -(x * x + y * y) / (2 * std * std)
+ h = np.exp(arg)
+    h[h < np.finfo(float).eps * h.max()] = 0  # np.finfo: scipy no longer re-exports finfo
+ sumh = h.sum()
+ if sumh != 0:
+ h = h / sumh
+ return h
+
+
+def fspecial_laplacian(alpha):
+ alpha = max([0, min([alpha, 1])])
+ h1 = alpha / (alpha + 1)
+ h2 = (1 - alpha) / (alpha + 1)
+ h = [[h1, h2, h1], [h2, -4 / (alpha + 1), h2], [h1, h2, h1]]
+ h = np.array(h)
+ return h
+
+
+def fspecial(filter_type, *args, **kwargs):
+ '''
+ python code from:
+ https://github.com/ronaldosena/imagens-medicas-2/blob/40171a6c259edec7827a6693a93955de2bd39e76/Aulas/aula_2_-_uniform_filter/matlab_fspecial.py
+ '''
+ if filter_type == 'gaussian':
+ return fspecial_gaussian(*args, **kwargs)
+ if filter_type == 'laplacian':
+ return fspecial_laplacian(*args, **kwargs)
+
+
+"""
+# --------------------------------------------
+# degradation models
+# --------------------------------------------
+"""
+
+
+def bicubic_degradation(x, sf=3):
+ '''
+ Args:
+ x: HxWxC image, [0, 1]
+ sf: down-scale factor
+ Return:
+ bicubicly downsampled LR image
+ '''
+ x = util.imresize_np(x, scale=1 / sf)
+ return x
+
+
+def srmd_degradation(x, k, sf=3):
+ ''' blur + bicubic downsampling
+ Args:
+ x: HxWxC image, [0, 1]
+ k: hxw, double
+ sf: down-scale factor
+ Return:
+ downsampled LR image
+ Reference:
+ @inproceedings{zhang2018learning,
+ title={Learning a single convolutional super-resolution network for multiple degradations},
+ author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei},
+ booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
+ pages={3262--3271},
+ year={2018}
+ }
+ '''
+ x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap') # 'nearest' | 'mirror'
+ x = bicubic_degradation(x, sf=sf)
+ return x
+
+
+def dpsr_degradation(x, k, sf=3):
+ ''' bicubic downsampling + blur
+ Args:
+ x: HxWxC image, [0, 1]
+ k: hxw, double
+ sf: down-scale factor
+ Return:
+ downsampled LR image
+ Reference:
+ @inproceedings{zhang2019deep,
+ title={Deep Plug-and-Play Super-Resolution for Arbitrary Blur Kernels},
+ author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei},
+ booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
+ pages={1671--1681},
+ year={2019}
+ }
+ '''
+ x = bicubic_degradation(x, sf=sf)
+ x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
+ return x
+
+
+def classical_degradation(x, k, sf=3):
+ ''' blur + downsampling
+ Args:
+ x: HxWxC image, [0, 1]/[0, 255]
+ k: hxw, double
+ sf: down-scale factor
+ Return:
+ downsampled LR image
+ '''
+ x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
+ # x = filters.correlate(x, np.expand_dims(np.flip(k), axis=2))
+ st = 0
+ return x[st::sf, st::sf, ...]
+
+
+def add_sharpening(img, weight=0.5, radius=50, threshold=10):
+ """USM sharpening. borrowed from real-ESRGAN
+ Input image: I; Blurry image: B.
+ 1. K = I + weight * (I - B)
+ 2. Mask = 1 if abs(I - B) > threshold, else: 0
+ 3. Blur mask:
+ 4. Out = Mask * K + (1 - Mask) * I
+ Args:
+ img (Numpy array): Input image, HWC, BGR; float32, [0, 1].
+        weight (float): Sharp weight. Default: 0.5.
+        radius (float): Kernel size of the Gaussian blur. Default: 50.
+        threshold (int): Mask threshold on abs(I - B), on the 0-255 scale. Default: 10.
+ """
+ if radius % 2 == 0:
+ radius += 1
+ blur = cv2.GaussianBlur(img, (radius, radius), 0)
+ residual = img - blur
+ mask = np.abs(residual) * 255 > threshold
+ mask = mask.astype('float32')
+ soft_mask = cv2.GaussianBlur(mask, (radius, radius), 0)
+
+ K = img + weight * residual
+ K = np.clip(K, 0, 1)
+ return soft_mask * K + (1 - soft_mask) * img
+
+
+def add_blur(img, sf=4):
+ wd2 = 4.0 + sf
+ wd = 2.0 + 0.2 * sf
+ if random.random() < 0.5:
+ l1 = wd2 * random.random()
+ l2 = wd2 * random.random()
+ k = anisotropic_Gaussian(ksize=2 * random.randint(2, 11) + 3, theta=random.random() * np.pi, l1=l1, l2=l2)
+ else:
+ k = fspecial('gaussian', 2 * random.randint(2, 11) + 3, wd * random.random())
+ img = ndimage.filters.convolve(img, np.expand_dims(k, axis=2), mode='mirror')
+
+ return img
+
+
+def add_resize(img, sf=4):
+ rnum = np.random.rand()
+ if rnum > 0.8: # up
+ sf1 = random.uniform(1, 2)
+ elif rnum < 0.7: # down
+ sf1 = random.uniform(0.5 / sf, 1)
+ else:
+ sf1 = 1.0
+ img = cv2.resize(img, (int(sf1 * img.shape[1]), int(sf1 * img.shape[0])), interpolation=random.choice([1, 2, 3]))
+ img = np.clip(img, 0.0, 1.0)
+
+ return img
+
+
+# def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
+# noise_level = random.randint(noise_level1, noise_level2)
+# rnum = np.random.rand()
+# if rnum > 0.6: # add color Gaussian noise
+# img += np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
+# elif rnum < 0.4: # add grayscale Gaussian noise
+# img += np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
+# else: # add noise
+# L = noise_level2 / 255.
+# D = np.diag(np.random.rand(3))
+# U = orth(np.random.rand(3, 3))
+# conv = np.dot(np.dot(np.transpose(U), D), U)
+# img += np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
+# img = np.clip(img, 0.0, 1.0)
+# return img
+
+def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
+ noise_level = random.randint(noise_level1, noise_level2)
+ rnum = np.random.rand()
+ if rnum > 0.6: # add color Gaussian noise
+ img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
+ elif rnum < 0.4: # add grayscale Gaussian noise
+ img = img + np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
+ else: # add noise
+ L = noise_level2 / 255.
+ D = np.diag(np.random.rand(3))
+ U = orth(np.random.rand(3, 3))
+ conv = np.dot(np.dot(np.transpose(U), D), U)
+ img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
+ img = np.clip(img, 0.0, 1.0)
+ return img
+
+
+def add_speckle_noise(img, noise_level1=2, noise_level2=25):
+ noise_level = random.randint(noise_level1, noise_level2)
+ img = np.clip(img, 0.0, 1.0)
+ rnum = random.random()
+ if rnum > 0.6:
+ img += img * np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
+ elif rnum < 0.4:
+ img += img * np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
+ else:
+ L = noise_level2 / 255.
+ D = np.diag(np.random.rand(3))
+ U = orth(np.random.rand(3, 3))
+ conv = np.dot(np.dot(np.transpose(U), D), U)
+ img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
+ img = np.clip(img, 0.0, 1.0)
+ return img
+
+
+def add_Poisson_noise(img):
+ img = np.clip((img * 255.0).round(), 0, 255) / 255.
+ vals = 10 ** (2 * random.random() + 2.0) # [2, 4]
+ if random.random() < 0.5:
+ img = np.random.poisson(img * vals).astype(np.float32) / vals
+ else:
+ img_gray = np.dot(img[..., :3], [0.299, 0.587, 0.114])
+ img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255.
+ noise_gray = np.random.poisson(img_gray * vals).astype(np.float32) / vals - img_gray
+ img += noise_gray[:, :, np.newaxis]
+ img = np.clip(img, 0.0, 1.0)
+ return img
+
+
+def add_JPEG_noise(img):
+ quality_factor = random.randint(30, 95)
+ img = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR)
+ result, encimg = cv2.imencode('.jpg', img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor])
+ img = cv2.imdecode(encimg, 1)
+ img = cv2.cvtColor(util.uint2single(img), cv2.COLOR_BGR2RGB)
+ return img
+
+
+def random_crop(lq, hq, sf=4, lq_patchsize=64):
+ h, w = lq.shape[:2]
+ rnd_h = random.randint(0, h - lq_patchsize)
+ rnd_w = random.randint(0, w - lq_patchsize)
+ lq = lq[rnd_h:rnd_h + lq_patchsize, rnd_w:rnd_w + lq_patchsize, :]
+
+ rnd_h_H, rnd_w_H = int(rnd_h * sf), int(rnd_w * sf)
+ hq = hq[rnd_h_H:rnd_h_H + lq_patchsize * sf, rnd_w_H:rnd_w_H + lq_patchsize * sf, :]
+ return lq, hq
+
+
+def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
+ """
+ This is the degradation model of BSRGAN from the paper
+ "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution"
+ ----------
+    img: HXWXC, [0, 1], its size should be larger than (lq_patchsizexsf)x(lq_patchsizexsf)
+ sf: scale factor
+ isp_model: camera ISP model
+ Returns
+ -------
+ img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1]
+ hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1]
+ """
+ isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25
+ sf_ori = sf
+
+ h1, w1 = img.shape[:2]
+ img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop
+ h, w = img.shape[:2]
+
+ if h < lq_patchsize * sf or w < lq_patchsize * sf:
+ raise ValueError(f'img size ({h1}X{w1}) is too small!')
+
+ hq = img.copy()
+
+ if sf == 4 and random.random() < scale2_prob: # downsample1
+ if np.random.rand() < 0.5:
+ img = cv2.resize(img, (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])),
+ interpolation=random.choice([1, 2, 3]))
+ else:
+ img = util.imresize_np(img, 1 / 2, True)
+ img = np.clip(img, 0.0, 1.0)
+ sf = 2
+
+ shuffle_order = random.sample(range(7), 7)
+ idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
+ if idx1 > idx2: # keep downsample3 last
+ shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]
+
+ for i in shuffle_order:
+
+ if i == 0:
+ img = add_blur(img, sf=sf)
+
+ elif i == 1:
+ img = add_blur(img, sf=sf)
+
+ elif i == 2:
+ a, b = img.shape[1], img.shape[0]
+ # downsample2
+ if random.random() < 0.75:
+ sf1 = random.uniform(1, 2 * sf)
+ img = cv2.resize(img, (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])),
+ interpolation=random.choice([1, 2, 3]))
+ else:
+ k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
+ k_shifted = shift_pixel(k, sf)
+ k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel
+ img = ndimage.filters.convolve(img, np.expand_dims(k_shifted, axis=2), mode='mirror')
+ img = img[0::sf, 0::sf, ...] # nearest downsampling
+ img = np.clip(img, 0.0, 1.0)
+
+ elif i == 3:
+ # downsample3
+ img = cv2.resize(img, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
+ img = np.clip(img, 0.0, 1.0)
+
+ elif i == 4:
+ # add Gaussian noise
+ img = add_Gaussian_noise(img, noise_level1=2, noise_level2=25)
+
+ elif i == 5:
+ # add JPEG noise
+ if random.random() < jpeg_prob:
+ img = add_JPEG_noise(img)
+
+ elif i == 6:
+ # add processed camera sensor noise
+ if random.random() < isp_prob and isp_model is not None:
+ with torch.no_grad():
+ img, hq = isp_model.forward(img.copy(), hq)
+
+ # add final JPEG compression noise
+ img = add_JPEG_noise(img)
+
+ # random crop
+ img, hq = random_crop(img, hq, sf_ori, lq_patchsize)
+
+ return img, hq
+
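+# Usage sketch (hedged; the real call sites are in the dataset code, not here):
+#   lq, hq = degradation_bsrgan(util.uint2single(img_uint8), sf=4, lq_patchsize=72)
+# where img_uint8 is an HxWxC RGB image of at least (72*4)x(72*4) pixels; both
+# returned patches are float arrays in [0, 1].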
+
+# todo no isp_model?
+def degradation_bsrgan_variant(image, sf=4, isp_model=None):
+ """
+ This is the degradation model of BSRGAN from the paper
+ "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution"
+ ----------
+ sf: scale factor
+ isp_model: camera ISP model
+ Returns
+ -------
+    example: dict with the single key "image", holding the degraded low-quality image as a uint8 array
+ """
+ image = util.uint2single(image)
+ isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25
+ sf_ori = sf
+
+ h1, w1 = image.shape[:2]
+ image = image.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop
+ h, w = image.shape[:2]
+
+ hq = image.copy()
+
+ if sf == 4 and random.random() < scale2_prob: # downsample1
+ if np.random.rand() < 0.5:
+ image = cv2.resize(image, (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])),
+ interpolation=random.choice([1, 2, 3]))
+ else:
+ image = util.imresize_np(image, 1 / 2, True)
+ image = np.clip(image, 0.0, 1.0)
+ sf = 2
+
+ shuffle_order = random.sample(range(7), 7)
+ idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
+ if idx1 > idx2: # keep downsample3 last
+ shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]
+
+ for i in shuffle_order:
+
+ if i == 0:
+ image = add_blur(image, sf=sf)
+
+ elif i == 1:
+ image = add_blur(image, sf=sf)
+
+ elif i == 2:
+ a, b = image.shape[1], image.shape[0]
+ # downsample2
+ if random.random() < 0.75:
+ sf1 = random.uniform(1, 2 * sf)
+ image = cv2.resize(image, (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])),
+ interpolation=random.choice([1, 2, 3]))
+ else:
+ k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
+ k_shifted = shift_pixel(k, sf)
+ k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel
+ image = ndimage.filters.convolve(image, np.expand_dims(k_shifted, axis=2), mode='mirror')
+ image = image[0::sf, 0::sf, ...] # nearest downsampling
+ image = np.clip(image, 0.0, 1.0)
+
+ elif i == 3:
+ # downsample3
+ image = cv2.resize(image, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
+ image = np.clip(image, 0.0, 1.0)
+
+ elif i == 4:
+ # add Gaussian noise
+ image = add_Gaussian_noise(image, noise_level1=2, noise_level2=25)
+
+ elif i == 5:
+ # add JPEG noise
+ if random.random() < jpeg_prob:
+ image = add_JPEG_noise(image)
+
+ # elif i == 6:
+ # # add processed camera sensor noise
+ # if random.random() < isp_prob and isp_model is not None:
+ # with torch.no_grad():
+ # img, hq = isp_model.forward(img.copy(), hq)
+
+ # add final JPEG compression noise
+ image = add_JPEG_noise(image)
+ image = util.single2uint(image)
+ example = {"image":image}
+ return example
+
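+# Note: this variant is what the package __init__ re-exports as
+# `degradation_fn_bsr`; it expects a uint8 RGB image and returns
+# {"image": <degraded uint8 image>} instead of an (lq, hq) pair.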
+
+# TODO: in case there is a pickle error, one needs to replace a += x with a = a + x in add_speckle_noise etc.
+def degradation_bsrgan_plus(img, sf=4, shuffle_prob=0.5, use_sharp=True, lq_patchsize=64, isp_model=None):
+ """
+ This is an extended degradation model by combining
+ the degradation models of BSRGAN and Real-ESRGAN
+ ----------
+    img: HXWXC, [0, 1], its size should be larger than (lq_patchsizexsf)x(lq_patchsizexsf)
+    sf: scale factor
+    shuffle_prob: probability of shuffling the degradation order
+    use_sharp: whether to sharpen the img first
+ Returns
+ -------
+ img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1]
+ hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1]
+ """
+
+ h1, w1 = img.shape[:2]
+ img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop
+ h, w = img.shape[:2]
+
+ if h < lq_patchsize * sf or w < lq_patchsize * sf:
+ raise ValueError(f'img size ({h1}X{w1}) is too small!')
+
+ if use_sharp:
+ img = add_sharpening(img)
+ hq = img.copy()
+
+ if random.random() < shuffle_prob:
+ shuffle_order = random.sample(range(13), 13)
+ else:
+ shuffle_order = list(range(13))
+ # local shuffle for noise, JPEG is always the last one
+ shuffle_order[2:6] = random.sample(shuffle_order[2:6], len(range(2, 6)))
+ shuffle_order[9:13] = random.sample(shuffle_order[9:13], len(range(9, 13)))
+
+ poisson_prob, speckle_prob, isp_prob = 0.1, 0.1, 0.1
+
+ for i in shuffle_order:
+ if i == 0:
+ img = add_blur(img, sf=sf)
+ elif i == 1:
+ img = add_resize(img, sf=sf)
+ elif i == 2:
+ img = add_Gaussian_noise(img, noise_level1=2, noise_level2=25)
+ elif i == 3:
+ if random.random() < poisson_prob:
+ img = add_Poisson_noise(img)
+ elif i == 4:
+ if random.random() < speckle_prob:
+ img = add_speckle_noise(img)
+ elif i == 5:
+ if random.random() < isp_prob and isp_model is not None:
+ with torch.no_grad():
+ img, hq = isp_model.forward(img.copy(), hq)
+ elif i == 6:
+ img = add_JPEG_noise(img)
+ elif i == 7:
+ img = add_blur(img, sf=sf)
+ elif i == 8:
+ img = add_resize(img, sf=sf)
+ elif i == 9:
+ img = add_Gaussian_noise(img, noise_level1=2, noise_level2=25)
+ elif i == 10:
+ if random.random() < poisson_prob:
+ img = add_Poisson_noise(img)
+ elif i == 11:
+ if random.random() < speckle_prob:
+ img = add_speckle_noise(img)
+ elif i == 12:
+ if random.random() < isp_prob and isp_model is not None:
+ with torch.no_grad():
+ img, hq = isp_model.forward(img.copy(), hq)
+ else:
+ print('check the shuffle!')
+
+ # resize to desired size
+ img = cv2.resize(img, (int(1 / sf * hq.shape[1]), int(1 / sf * hq.shape[0])),
+ interpolation=random.choice([1, 2, 3]))
+
+ # add final JPEG compression noise
+ img = add_JPEG_noise(img)
+
+ # random crop
+ img, hq = random_crop(img, hq, sf, lq_patchsize)
+
+ return img, hq
+
+
+if __name__ == '__main__':
+ print("hey")
+ img = util.imread_uint('utils/test.png', 3)
+ print(img)
+ img = img[:448, :448]
+ h = img.shape[0] // 4
+ print("resizing to", h)
+ sf = 4
+ deg_fn = partial(degradation_bsrgan_variant, sf=sf)
+ for i in range(20):
+        print(i)
+        img_hq = img
+        img_lq = deg_fn(img)["image"]
+        img_hq, img_lq = util.uint2single(img_hq), util.uint2single(img_lq)
+        print(img_lq)
+        img_lq_bicubic = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)(image=img_hq)["image"]
+ print(img_lq.shape)
+ print("bicubic", img_lq_bicubic.shape)
+ print(img_hq.shape)
+ lq_nearest = cv2.resize(util.single2uint(img_lq), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
+ interpolation=0)
+ lq_bicubic_nearest = cv2.resize(util.single2uint(img_lq_bicubic), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
+ interpolation=0)
+ img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1)
+ util.imsave(img_concat, str(i) + '.png')
diff --git a/ldmlib/modules/image_degradation/bsrgan_light.py b/ldmlib/modules/image_degradation/bsrgan_light.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec1200882368fe48194ed94b9a57c97276aa9e83
--- /dev/null
+++ b/ldmlib/modules/image_degradation/bsrgan_light.py
@@ -0,0 +1,650 @@
+# -*- coding: utf-8 -*-
+import numpy as np
+import cv2
+import torch
+
+from functools import partial
+import random
+from scipy import ndimage
+import scipy
+import scipy.stats as ss
+from scipy.interpolate import interp2d
+from scipy.linalg import orth
+import albumentations
+
+import ldmlib.modules.image_degradation.utils_image as util
+
+"""
+# --------------------------------------------
+# Super-Resolution
+# --------------------------------------------
+#
+# Kai Zhang (cskaizhang@gmail.com)
+# https://github.com/cszn
+# From 2019/03--2021/08
+# --------------------------------------------
+"""
+
+
+def modcrop_np(img, sf):
+ '''
+ Args:
+ img: numpy image, WxH or WxHxC
+ sf: scale factor
+ Return:
+ cropped image
+ '''
+ w, h = img.shape[:2]
+ im = np.copy(img)
+ return im[:w - w % sf, :h - h % sf, ...]
+
+
+"""
+# --------------------------------------------
+# anisotropic Gaussian kernels
+# --------------------------------------------
+"""
+
+
+def analytic_kernel(k):
+ """Calculate the X4 kernel from the X2 kernel (for proof see appendix in paper)"""
+ k_size = k.shape[0]
+    # Calculate the big kernel's size
+ big_k = np.zeros((3 * k_size - 2, 3 * k_size - 2))
+ # Loop over the small kernel to fill the big one
+ for r in range(k_size):
+ for c in range(k_size):
+ big_k[2 * r:2 * r + k_size, 2 * c:2 * c + k_size] += k[r, c] * k
+ # Crop the edges of the big kernel to ignore very small values and increase run time of SR
+ crop = k_size // 2
+ cropped_big_k = big_k[crop:-crop, crop:-crop]
+ # Normalize to 1
+ return cropped_big_k / cropped_big_k.sum()
+
+
+def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6):
+ """ generate an anisotropic Gaussian kernel
+ Args:
+ ksize : e.g., 15, kernel size
+ theta : [0, pi], rotation angle range
+ l1 : [0.1,50], scaling of eigenvalues
+ l2 : [0.1,l1], scaling of eigenvalues
+ If l1 = l2, will get an isotropic Gaussian kernel.
+ Returns:
+ k : kernel
+ """
+
+ v = np.dot(np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]), np.array([1., 0.]))
+ V = np.array([[v[0], v[1]], [v[1], -v[0]]])
+ D = np.array([[l1, 0], [0, l2]])
+ Sigma = np.dot(np.dot(V, D), np.linalg.inv(V))
+ k = gm_blur_kernel(mean=[0, 0], cov=Sigma, size=ksize)
+
+ return k
+
+
+def gm_blur_kernel(mean, cov, size=15):
+ center = size / 2.0 + 0.5
+ k = np.zeros([size, size])
+ for y in range(size):
+ for x in range(size):
+ cy = y - center + 1
+ cx = x - center + 1
+ k[y, x] = ss.multivariate_normal.pdf([cx, cy], mean=mean, cov=cov)
+
+ k = k / np.sum(k)
+ return k
+
+
+def shift_pixel(x, sf, upper_left=True):
+ """shift pixel for super-resolution with different scale factors
+ Args:
+ x: WxHxC or WxH
+ sf: scale factor
+ upper_left: shift direction
+ """
+ h, w = x.shape[:2]
+ shift = (sf - 1) * 0.5
+ xv, yv = np.arange(0, w, 1.0), np.arange(0, h, 1.0)
+ if upper_left:
+ x1 = xv + shift
+ y1 = yv + shift
+ else:
+ x1 = xv - shift
+ y1 = yv - shift
+
+ x1 = np.clip(x1, 0, w - 1)
+ y1 = np.clip(y1, 0, h - 1)
+
+ if x.ndim == 2:
+ x = interp2d(xv, yv, x)(x1, y1)
+ if x.ndim == 3:
+ for i in range(x.shape[-1]):
+ x[:, :, i] = interp2d(xv, yv, x[:, :, i])(x1, y1)
+
+ return x
+
+
+def blur(x, k):
+ '''
+ x: image, NxcxHxW
+ k: kernel, Nx1xhxw
+ '''
+ n, c = x.shape[:2]
+ p1, p2 = (k.shape[-2] - 1) // 2, (k.shape[-1] - 1) // 2
+ x = torch.nn.functional.pad(x, pad=(p1, p2, p1, p2), mode='replicate')
+ k = k.repeat(1, c, 1, 1)
+ k = k.view(-1, 1, k.shape[2], k.shape[3])
+ x = x.view(1, -1, x.shape[2], x.shape[3])
+ x = torch.nn.functional.conv2d(x, k, bias=None, stride=1, padding=0, groups=n * c)
+ x = x.view(n, c, x.shape[2], x.shape[3])
+
+ return x
+
+
+def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var=0.6, max_var=10., noise_level=0):
+ """"
+ # modified version of https://github.com/assafshocher/BlindSR_dataset_generator
+ # Kai Zhang
+ # min_var = 0.175 * sf # variance of the gaussian kernel will be sampled between min_var and max_var
+ # max_var = 2.5 * sf
+ """
+ # Set random eigen-vals (lambdas) and angle (theta) for COV matrix
+ lambda_1 = min_var + np.random.rand() * (max_var - min_var)
+ lambda_2 = min_var + np.random.rand() * (max_var - min_var)
+ theta = np.random.rand() * np.pi # random theta
+ noise = -noise_level + np.random.rand(*k_size) * noise_level * 2
+
+ # Set COV matrix using Lambdas and Theta
+ LAMBDA = np.diag([lambda_1, lambda_2])
+ Q = np.array([[np.cos(theta), -np.sin(theta)],
+ [np.sin(theta), np.cos(theta)]])
+ SIGMA = Q @ LAMBDA @ Q.T
+ INV_SIGMA = np.linalg.inv(SIGMA)[None, None, :, :]
+
+ # Set expectation position (shifting kernel for aligned image)
+ MU = k_size // 2 - 0.5 * (scale_factor - 1) # - 0.5 * (scale_factor - k_size % 2)
+ MU = MU[None, None, :, None]
+
+ # Create meshgrid for Gaussian
+ [X, Y] = np.meshgrid(range(k_size[0]), range(k_size[1]))
+ Z = np.stack([X, Y], 2)[:, :, :, None]
+
+    # Calculate Gaussian for every pixel of the kernel
+ ZZ = Z - MU
+ ZZ_t = ZZ.transpose(0, 1, 3, 2)
+ raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @ INV_SIGMA @ ZZ)) * (1 + noise)
+
+ # shift the kernel so it will be centered
+ # raw_kernel_centered = kernel_shift(raw_kernel, scale_factor)
+
+ # Normalize the kernel and return
+ # kernel = raw_kernel_centered / np.sum(raw_kernel_centered)
+ kernel = raw_kernel / np.sum(raw_kernel)
+ return kernel
+
+
+def fspecial_gaussian(hsize, sigma):
+ hsize = [hsize, hsize]
+ siz = [(hsize[0] - 1.0) / 2.0, (hsize[1] - 1.0) / 2.0]
+ std = sigma
+ [x, y] = np.meshgrid(np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1))
+ arg = -(x * x + y * y) / (2 * std * std)
+ h = np.exp(arg)
+    h[h < np.finfo(float).eps * h.max()] = 0  # np.finfo: scipy no longer re-exports finfo
+ sumh = h.sum()
+ if sumh != 0:
+ h = h / sumh
+ return h
+
+
+def fspecial_laplacian(alpha):
+ alpha = max([0, min([alpha, 1])])
+ h1 = alpha / (alpha + 1)
+ h2 = (1 - alpha) / (alpha + 1)
+ h = [[h1, h2, h1], [h2, -4 / (alpha + 1), h2], [h1, h2, h1]]
+ h = np.array(h)
+ return h
+
+
+def fspecial(filter_type, *args, **kwargs):
+ '''
+ python code from:
+ https://github.com/ronaldosena/imagens-medicas-2/blob/40171a6c259edec7827a6693a93955de2bd39e76/Aulas/aula_2_-_uniform_filter/matlab_fspecial.py
+ '''
+ if filter_type == 'gaussian':
+ return fspecial_gaussian(*args, **kwargs)
+ if filter_type == 'laplacian':
+ return fspecial_laplacian(*args, **kwargs)
+
+
+"""
+# --------------------------------------------
+# degradation models
+# --------------------------------------------
+"""
+
+
+def bicubic_degradation(x, sf=3):
+ '''
+ Args:
+ x: HxWxC image, [0, 1]
+ sf: down-scale factor
+ Return:
+ bicubicly downsampled LR image
+ '''
+ x = util.imresize_np(x, scale=1 / sf)
+ return x
+
+
+def srmd_degradation(x, k, sf=3):
+ ''' blur + bicubic downsampling
+ Args:
+ x: HxWxC image, [0, 1]
+ k: hxw, double
+ sf: down-scale factor
+ Return:
+ downsampled LR image
+ Reference:
+ @inproceedings{zhang2018learning,
+ title={Learning a single convolutional super-resolution network for multiple degradations},
+ author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei},
+ booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
+ pages={3262--3271},
+ year={2018}
+ }
+ '''
+ x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap') # 'nearest' | 'mirror'
+ x = bicubic_degradation(x, sf=sf)
+ return x
+
+
+def dpsr_degradation(x, k, sf=3):
+ ''' bicubic downsampling + blur
+ Args:
+ x: HxWxC image, [0, 1]
+ k: hxw, double
+ sf: down-scale factor
+ Return:
+ downsampled LR image
+ Reference:
+ @inproceedings{zhang2019deep,
+ title={Deep Plug-and-Play Super-Resolution for Arbitrary Blur Kernels},
+ author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei},
+ booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
+ pages={1671--1681},
+ year={2019}
+ }
+ '''
+ x = bicubic_degradation(x, sf=sf)
+ x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
+ return x
+
+
+def classical_degradation(x, k, sf=3):
+ ''' blur + downsampling
+ Args:
+ x: HxWxC image, [0, 1]/[0, 255]
+ k: hxw, double
+ sf: down-scale factor
+ Return:
+ downsampled LR image
+ '''
+ x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
+ # x = filters.correlate(x, np.expand_dims(np.flip(k), axis=2))
+ st = 0
+ return x[st::sf, st::sf, ...]
+
+
+def add_sharpening(img, weight=0.5, radius=50, threshold=10):
+ """USM sharpening. borrowed from real-ESRGAN
+ Input image: I; Blurry image: B.
+ 1. K = I + weight * (I - B)
+ 2. Mask = 1 if abs(I - B) > threshold, else: 0
+ 3. Blur mask:
+ 4. Out = Mask * K + (1 - Mask) * I
+ Args:
+ img (Numpy array): Input image, HWC, BGR; float32, [0, 1].
+        weight (float): Sharp weight. Default: 0.5.
+        radius (float): Kernel size of the Gaussian blur. Default: 50.
+        threshold (int): Mask threshold on abs(I - B), on the 0-255 scale. Default: 10.
+ """
+ if radius % 2 == 0:
+ radius += 1
+ blur = cv2.GaussianBlur(img, (radius, radius), 0)
+ residual = img - blur
+ mask = np.abs(residual) * 255 > threshold
+ mask = mask.astype('float32')
+ soft_mask = cv2.GaussianBlur(mask, (radius, radius), 0)
+
+ K = img + weight * residual
+ K = np.clip(K, 0, 1)
+ return soft_mask * K + (1 - soft_mask) * img
+
+
+def add_blur(img, sf=4):
+ wd2 = 4.0 + sf
+ wd = 2.0 + 0.2 * sf
+
+ wd2 = wd2/4
+ wd = wd/4
+
+ if random.random() < 0.5:
+ l1 = wd2 * random.random()
+ l2 = wd2 * random.random()
+ k = anisotropic_Gaussian(ksize=random.randint(2, 11) + 3, theta=random.random() * np.pi, l1=l1, l2=l2)
+ else:
+ k = fspecial('gaussian', random.randint(2, 4) + 3, wd * random.random())
+ img = ndimage.filters.convolve(img, np.expand_dims(k, axis=2), mode='mirror')
+
+ return img
+
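+# Note (descriptive): compared to the add_blur in bsrgan.py, this "light"
+# version divides the kernel widths by 4 and samples smaller kernel sizes,
+# giving a noticeably milder blur degradation.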
+
+def add_resize(img, sf=4):
+ rnum = np.random.rand()
+ if rnum > 0.8: # up
+ sf1 = random.uniform(1, 2)
+ elif rnum < 0.7: # down
+ sf1 = random.uniform(0.5 / sf, 1)
+ else:
+ sf1 = 1.0
+ img = cv2.resize(img, (int(sf1 * img.shape[1]), int(sf1 * img.shape[0])), interpolation=random.choice([1, 2, 3]))
+ img = np.clip(img, 0.0, 1.0)
+
+ return img
+
+
+# def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
+# noise_level = random.randint(noise_level1, noise_level2)
+# rnum = np.random.rand()
+# if rnum > 0.6: # add color Gaussian noise
+# img += np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
+# elif rnum < 0.4: # add grayscale Gaussian noise
+# img += np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
+# else: # add noise
+# L = noise_level2 / 255.
+# D = np.diag(np.random.rand(3))
+# U = orth(np.random.rand(3, 3))
+# conv = np.dot(np.dot(np.transpose(U), D), U)
+# img += np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
+# img = np.clip(img, 0.0, 1.0)
+# return img
+
+def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
+ noise_level = random.randint(noise_level1, noise_level2)
+ rnum = np.random.rand()
+ if rnum > 0.6: # add color Gaussian noise
+ img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
+ elif rnum < 0.4: # add grayscale Gaussian noise
+ img = img + np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
+ else: # add noise
+ L = noise_level2 / 255.
+ D = np.diag(np.random.rand(3))
+ U = orth(np.random.rand(3, 3))
+ conv = np.dot(np.dot(np.transpose(U), D), U)
+ img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
+ img = np.clip(img, 0.0, 1.0)
+ return img
+
+
+def add_speckle_noise(img, noise_level1=2, noise_level2=25):
+ noise_level = random.randint(noise_level1, noise_level2)
+ img = np.clip(img, 0.0, 1.0)
+ rnum = random.random()
+ if rnum > 0.6:
+ img += img * np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
+ elif rnum < 0.4:
+ img += img * np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
+ else:
+ L = noise_level2 / 255.
+ D = np.diag(np.random.rand(3))
+ U = orth(np.random.rand(3, 3))
+ conv = np.dot(np.dot(np.transpose(U), D), U)
+ img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
+ img = np.clip(img, 0.0, 1.0)
+ return img
+
+
+def add_Poisson_noise(img):
+ img = np.clip((img * 255.0).round(), 0, 255) / 255.
+ vals = 10 ** (2 * random.random() + 2.0) # [2, 4]
+ if random.random() < 0.5:
+ img = np.random.poisson(img * vals).astype(np.float32) / vals
+ else:
+ img_gray = np.dot(img[..., :3], [0.299, 0.587, 0.114])
+ img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255.
+ noise_gray = np.random.poisson(img_gray * vals).astype(np.float32) / vals - img_gray
+ img += noise_gray[:, :, np.newaxis]
+ img = np.clip(img, 0.0, 1.0)
+ return img
+
+
+def add_JPEG_noise(img):
+ quality_factor = random.randint(80, 95)
+ img = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR)
+ result, encimg = cv2.imencode('.jpg', img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor])
+ img = cv2.imdecode(encimg, 1)
+ img = cv2.cvtColor(util.uint2single(img), cv2.COLOR_BGR2RGB)
+ return img
+
+
+def random_crop(lq, hq, sf=4, lq_patchsize=64):
+ h, w = lq.shape[:2]
+ rnd_h = random.randint(0, h - lq_patchsize)
+ rnd_w = random.randint(0, w - lq_patchsize)
+ lq = lq[rnd_h:rnd_h + lq_patchsize, rnd_w:rnd_w + lq_patchsize, :]
+
+ rnd_h_H, rnd_w_H = int(rnd_h * sf), int(rnd_w * sf)
+ hq = hq[rnd_h_H:rnd_h_H + lq_patchsize * sf, rnd_w_H:rnd_w_H + lq_patchsize * sf, :]
+ return lq, hq
+
+
+def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
+ """
+ This is the degradation model of BSRGAN from the paper
+ "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution"
+ ----------
+    img: HXWXC, [0, 1], its size should be larger than (lq_patchsizexsf)x(lq_patchsizexsf)
+ sf: scale factor
+ isp_model: camera ISP model
+ Returns
+ -------
+ img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1]
+ hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1]
+ """
+ isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25
+ sf_ori = sf
+
+ h1, w1 = img.shape[:2]
+ img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop
+ h, w = img.shape[:2]
+
+ if h < lq_patchsize * sf or w < lq_patchsize * sf:
+ raise ValueError(f'img size ({h1}X{w1}) is too small!')
+
+ hq = img.copy()
+
+ if sf == 4 and random.random() < scale2_prob: # downsample1
+ if np.random.rand() < 0.5:
+ img = cv2.resize(img, (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])),
+ interpolation=random.choice([1, 2, 3]))
+ else:
+ img = util.imresize_np(img, 1 / 2, True)
+ img = np.clip(img, 0.0, 1.0)
+ sf = 2
+
+ shuffle_order = random.sample(range(7), 7)
+ idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
+ if idx1 > idx2: # keep downsample3 last
+ shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]
+
+ for i in shuffle_order:
+
+ if i == 0:
+ img = add_blur(img, sf=sf)
+
+ elif i == 1:
+ img = add_blur(img, sf=sf)
+
+ elif i == 2:
+ a, b = img.shape[1], img.shape[0]
+ # downsample2
+ if random.random() < 0.75:
+ sf1 = random.uniform(1, 2 * sf)
+ img = cv2.resize(img, (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])),
+ interpolation=random.choice([1, 2, 3]))
+ else:
+ k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
+ k_shifted = shift_pixel(k, sf)
+ k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel
+ img = ndimage.filters.convolve(img, np.expand_dims(k_shifted, axis=2), mode='mirror')
+ img = img[0::sf, 0::sf, ...] # nearest downsampling
+ img = np.clip(img, 0.0, 1.0)
+
+ elif i == 3:
+ # downsample3
+ img = cv2.resize(img, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
+ img = np.clip(img, 0.0, 1.0)
+
+ elif i == 4:
+ # add Gaussian noise
+ img = add_Gaussian_noise(img, noise_level1=2, noise_level2=8)
+
+ elif i == 5:
+ # add JPEG noise
+ if random.random() < jpeg_prob:
+ img = add_JPEG_noise(img)
+
+ elif i == 6:
+ # add processed camera sensor noise
+ if random.random() < isp_prob and isp_model is not None:
+ with torch.no_grad():
+ img, hq = isp_model.forward(img.copy(), hq)
+
+ # add final JPEG compression noise
+ img = add_JPEG_noise(img)
+
+ # random crop
+ img, hq = random_crop(img, hq, sf_ori, lq_patchsize)
+
+ return img, hq
+
+
+# todo no isp_model?
+def degradation_bsrgan_variant(image, sf=4, isp_model=None):
+ """
+ This is the degradation model of BSRGAN from the paper
+ "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution"
+ ----------
+ sf: scale factor
+ isp_model: camera ISP model
+ Returns
+ -------
+    example: dict with the single key "image", holding the degraded low-quality image as a uint8 array
+ """
+ image = util.uint2single(image)
+ isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25
+ sf_ori = sf
+
+ h1, w1 = image.shape[:2]
+ image = image.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop
+ h, w = image.shape[:2]
+
+ hq = image.copy()
+
+ if sf == 4 and random.random() < scale2_prob: # downsample1
+ if np.random.rand() < 0.5:
+ image = cv2.resize(image, (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])),
+ interpolation=random.choice([1, 2, 3]))
+ else:
+ image = util.imresize_np(image, 1 / 2, True)
+ image = np.clip(image, 0.0, 1.0)
+ sf = 2
+
+ shuffle_order = random.sample(range(7), 7)
+ idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
+ if idx1 > idx2: # keep downsample3 last
+ shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]
+
+ for i in shuffle_order:
+
+ if i == 0:
+ image = add_blur(image, sf=sf)
+
+ # elif i == 1:
+ # image = add_blur(image, sf=sf)
+
+ if i == 0:
+ pass
+
+ elif i == 2:
+ a, b = image.shape[1], image.shape[0]
+ # downsample2
+ if random.random() < 0.8:
+ sf1 = random.uniform(1, 2 * sf)
+ image = cv2.resize(image, (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])),
+ interpolation=random.choice([1, 2, 3]))
+ else:
+ k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
+ k_shifted = shift_pixel(k, sf)
+ k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel
+ image = ndimage.filters.convolve(image, np.expand_dims(k_shifted, axis=2), mode='mirror')
+ image = image[0::sf, 0::sf, ...] # nearest downsampling
+
+ image = np.clip(image, 0.0, 1.0)
+
+ elif i == 3:
+ # downsample3
+ image = cv2.resize(image, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
+ image = np.clip(image, 0.0, 1.0)
+
+ elif i == 4:
+ # add Gaussian noise
+ image = add_Gaussian_noise(image, noise_level1=1, noise_level2=2)
+
+ elif i == 5:
+ # add JPEG noise
+ if random.random() < jpeg_prob:
+ image = add_JPEG_noise(image)
+ #
+ # elif i == 6:
+ # # add processed camera sensor noise
+ # if random.random() < isp_prob and isp_model is not None:
+ # with torch.no_grad():
+ # img, hq = isp_model.forward(img.copy(), hq)
+
+ # add final JPEG compression noise
+ image = add_JPEG_noise(image)
+ image = util.single2uint(image)
+ example = {"image": image}
+ return example
+
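+# Note (descriptive): relative to bsrgan.py, this "light" variant also uses much
+# milder Gaussian noise (noise_level2=2 instead of 25), higher JPEG quality
+# factors (80-95 instead of 30-95), and skips the second blur stage.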
+
+
+
+if __name__ == '__main__':
+ print("hey")
+ img = util.imread_uint('utils/test.png', 3)
+ img = img[:448, :448]
+ h = img.shape[0] // 4
+ print("resizing to", h)
+ sf = 4
+ deg_fn = partial(degradation_bsrgan_variant, sf=sf)
+ for i in range(20):
+ print(i)
+ img_hq = img
+ img_lq = deg_fn(img)["image"]
+ img_hq, img_lq = util.uint2single(img_hq), util.uint2single(img_lq)
+ print(img_lq)
+ img_lq_bicubic = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)(image=img_hq)["image"]
+ print(img_lq.shape)
+ print("bicubic", img_lq_bicubic.shape)
+ print(img_hq.shape)
+ lq_nearest = cv2.resize(util.single2uint(img_lq), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
+ interpolation=0)
+ lq_bicubic_nearest = cv2.resize(util.single2uint(img_lq_bicubic),
+ (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
+ interpolation=0)
+ img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1)
+ util.imsave(img_concat, str(i) + '.png')
diff --git a/ldmlib/modules/image_degradation/utils/test.png b/ldmlib/modules/image_degradation/utils/test.png
new file mode 100644
index 0000000000000000000000000000000000000000..4249b43de0f22707758d13c240268a401642f6e6
Binary files /dev/null and b/ldmlib/modules/image_degradation/utils/test.png differ
diff --git a/ldmlib/modules/image_degradation/utils_image.py b/ldmlib/modules/image_degradation/utils_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..0175f155ad900ae33c3c46ed87f49b352e3faf98
--- /dev/null
+++ b/ldmlib/modules/image_degradation/utils_image.py
@@ -0,0 +1,916 @@
+import os
+import math
+import random
+import numpy as np
+import torch
+import cv2
+from torchvision.utils import make_grid
+from datetime import datetime
+#import matplotlib.pyplot as plt # TODO: check with Dominik, also bsrgan.py vs bsrgan_light.py
+
+
+os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
+
+
+'''
+# --------------------------------------------
+# Kai Zhang (github: https://github.com/cszn)
+# 03/Mar/2019
+# --------------------------------------------
+# https://github.com/twhui/SRGAN-pyTorch
+# https://github.com/xinntao/BasicSR
+# --------------------------------------------
+'''
+
+
+IMG_EXTENSIONS = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP', '.tif']
+
+
+def is_image_file(filename):
+ return any(filename.endswith(extension) for extension in IMG_EXTENSIONS)
+
+
+def get_timestamp():
+ return datetime.now().strftime('%y%m%d-%H%M%S')
+
+
+def imshow(x, title=None, cbar=False, figsize=None):
+    import matplotlib.pyplot as plt  # imported lazily; see the module-level TODO above
+    plt.figure(figsize=figsize)
+    plt.imshow(np.squeeze(x), interpolation='nearest', cmap='gray')
+    if title:
+        plt.title(title)
+    if cbar:
+        plt.colorbar()
+    plt.show()
+
+
+def surf(Z, cmap='rainbow', figsize=None):
+    import matplotlib.pyplot as plt  # imported lazily; see the module-level TODO above
+    plt.figure(figsize=figsize)
+    ax3 = plt.axes(projection='3d')
+
+    w, h = Z.shape[:2]
+    xx = np.arange(0, w, 1)
+    yy = np.arange(0, h, 1)
+    X, Y = np.meshgrid(xx, yy)
+    ax3.plot_surface(X, Y, Z, cmap=cmap)
+    # ax3.contour(X, Y, Z, zdim='z', offset=-2, cmap=cmap)
+    plt.show()
+
+
+'''
+# --------------------------------------------
+# get image paths
+# --------------------------------------------
+'''
+
+
+def get_image_paths(dataroot):
+ paths = None # return None if dataroot is None
+ if dataroot is not None:
+ paths = sorted(_get_paths_from_images(dataroot))
+ return paths
+
+
+def _get_paths_from_images(path):
+ assert os.path.isdir(path), '{:s} is not a valid directory'.format(path)
+ images = []
+ for dirpath, _, fnames in sorted(os.walk(path)):
+ for fname in sorted(fnames):
+ if is_image_file(fname):
+ img_path = os.path.join(dirpath, fname)
+ images.append(img_path)
+ assert images, '{:s} has no valid image file'.format(path)
+ return images
+
+
+'''
+# --------------------------------------------
+# split large images into small images
+# --------------------------------------------
+'''
+
+
+def patches_from_image(img, p_size=512, p_overlap=64, p_max=800):
+ w, h = img.shape[:2]
+ patches = []
+ if w > p_max and h > p_max:
+        w1 = list(np.arange(0, w-p_size, p_size-p_overlap, dtype=int))
+        h1 = list(np.arange(0, h-p_size, p_size-p_overlap, dtype=int))
+ w1.append(w-p_size)
+ h1.append(h-p_size)
+# print(w1)
+# print(h1)
+ for i in w1:
+ for j in h1:
+ patches.append(img[i:i+p_size, j:j+p_size,:])
+ else:
+ patches.append(img)
+
+ return patches
+
+
+def imssave(imgs, img_path):
+ """
+ imgs: list, N images of size WxHxC
+ """
+ img_name, ext = os.path.splitext(os.path.basename(img_path))
+
+ for i, img in enumerate(imgs):
+ if img.ndim == 3:
+ img = img[:, :, [2, 1, 0]]
+ new_path = os.path.join(os.path.dirname(img_path), img_name+str('_s{:04d}'.format(i))+'.png')
+ cv2.imwrite(new_path, img)
+
+
+def split_imageset(original_dataroot, taget_dataroot, n_channels=3, p_size=800, p_overlap=96, p_max=1000):
+ """
+ split the large images from original_dataroot into small overlapped images with size (p_size)x(p_size),
+ and save them into taget_dataroot; only the images with larger size than (p_max)x(p_max)
+ will be splitted.
+ Args:
+ original_dataroot:
+ taget_dataroot:
+ p_size: size of small images
+ p_overlap: patch size in training is a good choice
+ p_max: images with smaller size than (p_max)x(p_max) keep unchanged.
+ """
+ paths = get_image_paths(original_dataroot)
+ for img_path in paths:
+ # img_name, ext = os.path.splitext(os.path.basename(img_path))
+ img = imread_uint(img_path, n_channels=n_channels)
+ patches = patches_from_image(img, p_size, p_overlap, p_max)
+ imssave(patches, os.path.join(taget_dataroot,os.path.basename(img_path)))
+ #if original_dataroot == taget_dataroot:
+ #del img_path
+
+'''
+# --------------------------------------------
+# makedir
+# --------------------------------------------
+'''
+
+
+def mkdir(path):
+ if not os.path.exists(path):
+ os.makedirs(path)
+
+
+def mkdirs(paths):
+ if isinstance(paths, str):
+ mkdir(paths)
+ else:
+ for path in paths:
+ mkdir(path)
+
+
+def mkdir_and_rename(path):
+ if os.path.exists(path):
+ new_name = path + '_archived_' + get_timestamp()
+        print('Path already exists. Renaming it to [{:s}]'.format(new_name))
+ os.rename(path, new_name)
+ os.makedirs(path)
+
+
+'''
+# --------------------------------------------
+# read image from path
+# opencv is fast, but read BGR numpy image
+# --------------------------------------------
+'''
+
+
+# --------------------------------------------
+# get uint8 image of size HxWxn_channels (RGB)
+# --------------------------------------------
+def imread_uint(path, n_channels=3):
+ # input: path
+ # output: HxWx3(RGB or GGG), or HxWx1 (G)
+ if n_channels == 1:
+ img = cv2.imread(path, 0) # cv2.IMREAD_GRAYSCALE
+ img = np.expand_dims(img, axis=2) # HxWx1
+ elif n_channels == 3:
+ img = cv2.imread(path, cv2.IMREAD_UNCHANGED) # BGR or G
+ if img.ndim == 2:
+ img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB) # GGG
+ else:
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # RGB
+ return img
+
+
+# --------------------------------------------
+# matlab's imwrite
+# --------------------------------------------
+def imsave(img, img_path):
+ img = np.squeeze(img)
+ if img.ndim == 3:
+ img = img[:, :, [2, 1, 0]]
+ cv2.imwrite(img_path, img)
+
+def imwrite(img, img_path):
+ img = np.squeeze(img)
+ if img.ndim == 3:
+ img = img[:, :, [2, 1, 0]]
+ cv2.imwrite(img_path, img)
+
+
+
+# --------------------------------------------
+# get single image of size HxWxn_channels (BGR)
+# --------------------------------------------
+def read_img(path):
+ # read image by cv2
+ # return: Numpy float32, HWC, BGR, [0,1]
+ img = cv2.imread(path, cv2.IMREAD_UNCHANGED) # cv2.IMREAD_GRAYSCALE
+ img = img.astype(np.float32) / 255.
+ if img.ndim == 2:
+ img = np.expand_dims(img, axis=2)
+ # some images have 4 channels
+ if img.shape[2] > 3:
+ img = img[:, :, :3]
+ return img
+
+
+'''
+# --------------------------------------------
+# image format conversion
+# --------------------------------------------
+# numpy(single) <--->  numpy(uint)
+# numpy(single) <--->  tensor
+# numpy(uint) <--->  tensor
+# --------------------------------------------
+'''
+
+
+# --------------------------------------------
+# numpy(single) [0, 1] <--->  numpy(uint)
+# --------------------------------------------
+
+
+def uint2single(img):
+
+ return np.float32(img/255.)
+
+
+def single2uint(img):
+
+ return np.uint8((img.clip(0, 1)*255.).round())
+
+
+def uint162single(img):
+
+ return np.float32(img/65535.)
+
+
+def single2uint16(img):
+
+ return np.uint16((img.clip(0, 1)*65535.).round())
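+# Sketch of how the converters above compose ('test.png' is a placeholder path):
+#   x8 = imread_uint('test.png', 3)            # uint8, HxWx3, values in [0, 255]
+#   x  = uint2single(x8)                       # float32 in [0, 1]
+#   assert single2uint(x).dtype == np.uint8    # back to uint8, round-trip up to rounding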
+
+
+# --------------------------------------------
+# numpy(uint) (HxWxC or HxW) <--->  tensor
+# --------------------------------------------
+
+
+# convert uint to 4-dimensional torch tensor
+def uint2tensor4(img):
+ if img.ndim == 2:
+ img = np.expand_dims(img, axis=2)
+ return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float().div(255.).unsqueeze(0)
+
+
+# convert uint to 3-dimensional torch tensor
+def uint2tensor3(img):
+ if img.ndim == 2:
+ img = np.expand_dims(img, axis=2)
+ return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float().div(255.)
+
+
+# convert 2/3/4-dimensional torch tensor to uint
+def tensor2uint(img):
+ img = img.data.squeeze().float().clamp_(0, 1).cpu().numpy()
+ if img.ndim == 3:
+ img = np.transpose(img, (1, 2, 0))
+ return np.uint8((img*255.0).round())
+
+
+# --------------------------------------------
+# numpy(single) (HxWxC) <---> tensor
+# --------------------------------------------
+
+
+# convert single (HxWxC) to 3-dimensional torch tensor
+def single2tensor3(img):
+ return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float()
+
+
+# convert single (HxWxC) to 4-dimensional torch tensor
+def single2tensor4(img):
+ return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float().unsqueeze(0)
+
+
+# convert torch tensor to single
+def tensor2single(img):
+ img = img.data.squeeze().float().cpu().numpy()
+ if img.ndim == 3:
+ img = np.transpose(img, (1, 2, 0))
+
+ return img
+
+# convert torch tensor to single
+def tensor2single3(img):
+ img = img.data.squeeze().float().cpu().numpy()
+ if img.ndim == 3:
+ img = np.transpose(img, (1, 2, 0))
+ elif img.ndim == 2:
+ img = np.expand_dims(img, axis=2)
+ return img
+
+
+def single2tensor5(img):
+ return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1, 3).float().unsqueeze(0)
+
+
+def single32tensor5(img):
+ return torch.from_numpy(np.ascontiguousarray(img)).float().unsqueeze(0).unsqueeze(0)
+
+
+def single42tensor4(img):
+ return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1, 3).float()
+
+
+# from skimage.io import imread, imsave
+def tensor2img(tensor, out_type=np.uint8, min_max=(0, 1)):
+ '''
+ Converts a torch Tensor into an image Numpy array of BGR channel order
+ Input: 4D(B,(3/1),H,W), 3D(C,H,W), or 2D(H,W), any range, RGB channel order
+ Output: 3D(H,W,C) or 2D(H,W), [0,255], np.uint8 (default)
+ '''
+ tensor = tensor.squeeze().float().cpu().clamp_(*min_max) # squeeze first, then clamp
+ tensor = (tensor - min_max[0]) / (min_max[1] - min_max[0]) # to range [0,1]
+ n_dim = tensor.dim()
+ if n_dim == 4:
+ n_img = len(tensor)
+ img_np = make_grid(tensor, nrow=int(math.sqrt(n_img)), normalize=False).numpy()
+ img_np = np.transpose(img_np[[2, 1, 0], :, :], (1, 2, 0)) # HWC, BGR
+ elif n_dim == 3:
+ img_np = tensor.numpy()
+ img_np = np.transpose(img_np[[2, 1, 0], :, :], (1, 2, 0)) # HWC, BGR
+ elif n_dim == 2:
+ img_np = tensor.numpy()
+ else:
+ raise TypeError(
+ 'Only support 4D, 3D and 2D tensor. But received with dimension: {:d}'.format(n_dim))
+ if out_type == np.uint8:
+ img_np = (img_np * 255.0).round()
+        # Important. Unlike matlab, numpy.uint8() will NOT round by default.
+ return img_np.astype(out_type)
+
+
+'''
+# --------------------------------------------
+# Augmentation, flip and/or rotate
+# --------------------------------------------
+# The following two are enough.
+# (1) augment_img: numpy image of WxHxC or WxH
+# (2) augment_img_tensor4: tensor image 1xCxWxH
+# --------------------------------------------
+'''
+
+
+def augment_img(img, mode=0):
+ '''Kai Zhang (github: https://github.com/cszn)
+ '''
+ if mode == 0:
+ return img
+ elif mode == 1:
+ return np.flipud(np.rot90(img))
+ elif mode == 2:
+ return np.flipud(img)
+ elif mode == 3:
+ return np.rot90(img, k=3)
+ elif mode == 4:
+ return np.flipud(np.rot90(img, k=2))
+ elif mode == 5:
+ return np.rot90(img)
+ elif mode == 6:
+ return np.rot90(img, k=2)
+ elif mode == 7:
+ return np.flipud(np.rot90(img, k=3))
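+# Note: modes 0-7 enumerate the eight symmetries of the dihedral group D4
+# (the four 90-degree rotations, each with and without an up-down flip via
+# np.flipud), so a random mode covers every axis-aligned flip/rotation.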
+
+
+def augment_img_tensor4(img, mode=0):
+ '''Kai Zhang (github: https://github.com/cszn)
+ '''
+ if mode == 0:
+ return img
+ elif mode == 1:
+ return img.rot90(1, [2, 3]).flip([2])
+ elif mode == 2:
+ return img.flip([2])
+ elif mode == 3:
+ return img.rot90(3, [2, 3])
+ elif mode == 4:
+ return img.rot90(2, [2, 3]).flip([2])
+ elif mode == 5:
+ return img.rot90(1, [2, 3])
+ elif mode == 6:
+ return img.rot90(2, [2, 3])
+ elif mode == 7:
+ return img.rot90(3, [2, 3]).flip([2])
+
+
+def augment_img_tensor(img, mode=0):
+ '''Kai Zhang (github: https://github.com/cszn)
+ '''
+ img_size = img.size()
+ img_np = img.data.cpu().numpy()
+ if len(img_size) == 3:
+ img_np = np.transpose(img_np, (1, 2, 0))
+ elif len(img_size) == 4:
+ img_np = np.transpose(img_np, (2, 3, 1, 0))
+ img_np = augment_img(img_np, mode=mode)
+ img_tensor = torch.from_numpy(np.ascontiguousarray(img_np))
+ if len(img_size) == 3:
+ img_tensor = img_tensor.permute(2, 0, 1)
+ elif len(img_size) == 4:
+ img_tensor = img_tensor.permute(3, 2, 0, 1)
+
+ return img_tensor.type_as(img)
+
+
+def augment_img_np3(img, mode=0):
+ if mode == 0:
+ return img
+ elif mode == 1:
+ return img.transpose(1, 0, 2)
+ elif mode == 2:
+ return img[::-1, :, :]
+ elif mode == 3:
+ img = img[::-1, :, :]
+ img = img.transpose(1, 0, 2)
+ return img
+ elif mode == 4:
+ return img[:, ::-1, :]
+ elif mode == 5:
+ img = img[:, ::-1, :]
+ img = img.transpose(1, 0, 2)
+ return img
+ elif mode == 6:
+ img = img[:, ::-1, :]
+ img = img[::-1, :, :]
+ return img
+ elif mode == 7:
+ img = img[:, ::-1, :]
+ img = img[::-1, :, :]
+ img = img.transpose(1, 0, 2)
+ return img
+
+
+def augment_imgs(img_list, hflip=True, rot=True):
+ # horizontal flip OR rotate
+ hflip = hflip and random.random() < 0.5
+ vflip = rot and random.random() < 0.5
+ rot90 = rot and random.random() < 0.5
+
+ def _augment(img):
+ if hflip:
+ img = img[:, ::-1, :]
+ if vflip:
+ img = img[::-1, :, :]
+ if rot90:
+ img = img.transpose(1, 0, 2)
+ return img
+
+ return [_augment(img) for img in img_list]
+
+
+'''
+# --------------------------------------------
+# modcrop and shave
+# --------------------------------------------
+'''
+
+
+def modcrop(img_in, scale):
+ # img_in: Numpy, HWC or HW
+ img = np.copy(img_in)
+ if img.ndim == 2:
+ H, W = img.shape
+ H_r, W_r = H % scale, W % scale
+ img = img[:H - H_r, :W - W_r]
+ elif img.ndim == 3:
+ H, W, C = img.shape
+ H_r, W_r = H % scale, W % scale
+ img = img[:H - H_r, :W - W_r, :]
+ else:
+ raise ValueError('Wrong img ndim: [{:d}].'.format(img.ndim))
+ return img
+
+
+def shave(img_in, border=0):
+ # img_in: Numpy, HWC or HW
+ img = np.copy(img_in)
+ h, w = img.shape[:2]
+ img = img[border:h-border, border:w-border]
+ return img
+
+
+'''
+# --------------------------------------------
+# image processing process on numpy image
+# channel_convert(in_c, tar_type, img_list):
+# rgb2ycbcr(img, only_y=True):
+# bgr2ycbcr(img, only_y=True):
+# ycbcr2rgb(img):
+# --------------------------------------------
+'''
+
+
+def rgb2ycbcr(img, only_y=True):
+ '''same as matlab rgb2ycbcr
+ only_y: only return Y channel
+ Input:
+ uint8, [0, 255]
+ float, [0, 1]
+ '''
+ in_img_type = img.dtype
+    img = img.astype(np.float32)
+ if in_img_type != np.uint8:
+ img *= 255.
+ # convert
+ if only_y:
+ rlt = np.dot(img, [65.481, 128.553, 24.966]) / 255.0 + 16.0
+ else:
+ rlt = np.matmul(img, [[65.481, -37.797, 112.0], [128.553, -74.203, -93.786],
+ [24.966, 112.0, -18.214]]) / 255.0 + [16, 128, 128]
+ if in_img_type == np.uint8:
+ rlt = rlt.round()
+ else:
+ rlt /= 255.
+ return rlt.astype(in_img_type)
+
+
+def ycbcr2rgb(img):
+ '''same as matlab ycbcr2rgb
+ Input:
+ uint8, [0, 255]
+ float, [0, 1]
+ '''
+ in_img_type = img.dtype
+    img = img.astype(np.float32)
+ if in_img_type != np.uint8:
+ img *= 255.
+ # convert
+ rlt = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621], [0, -0.00153632, 0.00791071],
+ [0.00625893, -0.00318811, 0]]) * 255.0 + [-222.921, 135.576, -276.836]
+ if in_img_type == np.uint8:
+ rlt = rlt.round()
+ else:
+ rlt /= 255.
+ return rlt.astype(in_img_type)
+
+
+def bgr2ycbcr(img, only_y=True):
+ '''bgr version of rgb2ycbcr
+ only_y: only return Y channel
+ Input:
+ uint8, [0, 255]
+ float, [0, 1]
+ '''
+ in_img_type = img.dtype
+    img = img.astype(np.float32)
+ if in_img_type != np.uint8:
+ img *= 255.
+ # convert
+ if only_y:
+ rlt = np.dot(img, [24.966, 128.553, 65.481]) / 255.0 + 16.0
+ else:
+ rlt = np.matmul(img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786],
+ [65.481, -37.797, 112.0]]) / 255.0 + [16, 128, 128]
+ if in_img_type == np.uint8:
+ rlt = rlt.round()
+ else:
+ rlt /= 255.
+ return rlt.astype(in_img_type)
+
+
+def channel_convert(in_c, tar_type, img_list):
+ # conversion among BGR, gray and y
+ if in_c == 3 and tar_type == 'gray': # BGR to gray
+ gray_list = [cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) for img in img_list]
+ return [np.expand_dims(img, axis=2) for img in gray_list]
+ elif in_c == 3 and tar_type == 'y': # BGR to y
+ y_list = [bgr2ycbcr(img, only_y=True) for img in img_list]
+ return [np.expand_dims(img, axis=2) for img in y_list]
+ elif in_c == 1 and tar_type == 'RGB': # gray/y to BGR
+ return [cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) for img in img_list]
+ else:
+ return img_list
+
+
+'''
+# --------------------------------------------
+# metric, PSNR and SSIM
+# --------------------------------------------
+'''
+
+
+# --------------------------------------------
+# PSNR
+# --------------------------------------------
+def calculate_psnr(img1, img2, border=0):
+ # img1 and img2 have range [0, 255]
+ #img1 = img1.squeeze()
+ #img2 = img2.squeeze()
+ if not img1.shape == img2.shape:
+ raise ValueError('Input images must have the same dimensions.')
+ h, w = img1.shape[:2]
+ img1 = img1[border:h-border, border:w-border]
+ img2 = img2[border:h-border, border:w-border]
+
+ img1 = img1.astype(np.float64)
+ img2 = img2.astype(np.float64)
+ mse = np.mean((img1 - img2)**2)
+ if mse == 0:
+ return float('inf')
+ return 20 * math.log10(255.0 / math.sqrt(mse))
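+# calculate_psnr returns 20 * log10(255 / sqrt(MSE)) for 8-bit-range inputs:
+# identical images give inf, and an MSE of 1.0 gives roughly 48.13 dB.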
+
+
+# --------------------------------------------
+# SSIM
+# --------------------------------------------
+def calculate_ssim(img1, img2, border=0):
+ '''calculate SSIM
+ the same outputs as MATLAB's
+ img1, img2: [0, 255]
+ '''
+ #img1 = img1.squeeze()
+ #img2 = img2.squeeze()
+ if not img1.shape == img2.shape:
+ raise ValueError('Input images must have the same dimensions.')
+ h, w = img1.shape[:2]
+ img1 = img1[border:h-border, border:w-border]
+ img2 = img2[border:h-border, border:w-border]
+
+ if img1.ndim == 2:
+ return ssim(img1, img2)
+ elif img1.ndim == 3:
+ if img1.shape[2] == 3:
+ ssims = []
+ for i in range(3):
+ ssims.append(ssim(img1[:,:,i], img2[:,:,i]))
+ return np.array(ssims).mean()
+ elif img1.shape[2] == 1:
+ return ssim(np.squeeze(img1), np.squeeze(img2))
+ else:
+ raise ValueError('Wrong input image dimensions.')
+
+
+def ssim(img1, img2):
+ C1 = (0.01 * 255)**2
+ C2 = (0.03 * 255)**2
+
+ img1 = img1.astype(np.float64)
+ img2 = img2.astype(np.float64)
+ kernel = cv2.getGaussianKernel(11, 1.5)
+ window = np.outer(kernel, kernel.transpose())
+
+ mu1 = cv2.filter2D(img1, -1, window)[5:-5, 5:-5] # valid
+ mu2 = cv2.filter2D(img2, -1, window)[5:-5, 5:-5]
+ mu1_sq = mu1**2
+ mu2_sq = mu2**2
+ mu1_mu2 = mu1 * mu2
+ sigma1_sq = cv2.filter2D(img1**2, -1, window)[5:-5, 5:-5] - mu1_sq
+ sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq
+ sigma12 = cv2.filter2D(img1 * img2, -1, window)[5:-5, 5:-5] - mu1_mu2
+
+ ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) *
+ (sigma1_sq + sigma2_sq + C2))
+ return ssim_map.mean()
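+# The constants follow Wang et al. (2004): C1 = (K1*L)^2 and C2 = (K2*L)^2 with
+# K1=0.01, K2=0.03, L=255, plus an 11x11 Gaussian window with sigma 1.5, which
+# is what lets calculate_ssim above match MATLAB's reference implementation.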
+
+
+'''
+# --------------------------------------------
+# matlab's bicubic imresize (numpy and torch) [0, 1]
+# --------------------------------------------
+'''
+
+
+# matlab 'imresize' function; currently only 'bicubic' is supported
+def cubic(x):
+ absx = torch.abs(x)
+ absx2 = absx**2
+ absx3 = absx**3
+ return (1.5*absx3 - 2.5*absx2 + 1) * ((absx <= 1).type_as(absx)) + \
+ (-0.5*absx3 + 2.5*absx2 - 4*absx + 2) * (((absx > 1)*(absx <= 2)).type_as(absx))
+
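+# cubic() above is the Keys bicubic kernel with a = -0.5 (the kernel MATLAB's
+# imresize uses): (a+2)|x|^3 - (a+3)|x|^2 + 1 for |x| <= 1, and
+# a|x|^3 - 5a|x|^2 + 8a|x| - 4a for 1 < |x| <= 2, zero elsewhere.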
+
+def calculate_weights_indices(in_length, out_length, scale, kernel, kernel_width, antialiasing):
+ if (scale < 1) and (antialiasing):
+        # Use a modified kernel to simultaneously interpolate and antialias; this widens the kernel support
+ kernel_width = kernel_width / scale
+
+ # Output-space coordinates
+ x = torch.linspace(1, out_length, out_length)
+
+ # Input-space coordinates. Calculate the inverse mapping such that 0.5
+ # in output space maps to 0.5 in input space, and 0.5+scale in output
+ # space maps to 1.5 in input space.
+ u = x / scale + 0.5 * (1 - 1 / scale)
+
+ # What is the left-most pixel that can be involved in the computation?
+ left = torch.floor(u - kernel_width / 2)
+
+ # What is the maximum number of pixels that can be involved in the
+ # computation? Note: it's OK to use an extra pixel here; if the
+ # corresponding weights are all zero, it will be eliminated at the end
+ # of this function.
+ P = math.ceil(kernel_width) + 2
+
+ # The indices of the input pixels involved in computing the k-th output
+ # pixel are in row k of the indices matrix.
+ indices = left.view(out_length, 1).expand(out_length, P) + torch.linspace(0, P - 1, P).view(
+ 1, P).expand(out_length, P)
+
+ # The weights used to compute the k-th output pixel are in row k of the
+ # weights matrix.
+ distance_to_center = u.view(out_length, 1).expand(out_length, P) - indices
+ # apply cubic kernel
+ if (scale < 1) and (antialiasing):
+ weights = scale * cubic(distance_to_center * scale)
+ else:
+ weights = cubic(distance_to_center)
+ # Normalize the weights matrix so that each row sums to 1.
+ weights_sum = torch.sum(weights, 1).view(out_length, 1)
+ weights = weights / weights_sum.expand(out_length, P)
+
+ # If a column in weights is all zero, get rid of it. only consider the first and last column.
+ weights_zero_tmp = torch.sum((weights == 0), 0)
+ if not math.isclose(weights_zero_tmp[0], 0, rel_tol=1e-6):
+ indices = indices.narrow(1, 1, P - 2)
+ weights = weights.narrow(1, 1, P - 2)
+ if not math.isclose(weights_zero_tmp[-1], 0, rel_tol=1e-6):
+ indices = indices.narrow(1, 0, P - 2)
+ weights = weights.narrow(1, 0, P - 2)
+ weights = weights.contiguous()
+ indices = indices.contiguous()
+ sym_len_s = -indices.min() + 1
+ sym_len_e = indices.max() - in_length
+ indices = indices + sym_len_s - 1
+ return weights, indices, int(sym_len_s), int(sym_len_e)
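+# Antialiasing note: when downscaling (scale < 1) the kernel support is widened
+# to kernel_width / scale and the cubic weights are evaluated at scale * distance
+# (and multiplied by scale), i.e. the signal is low-passed before subsampling,
+# matching MATLAB's imresize behaviour.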
+
+
+# --------------------------------------------
+# imresize for tensor image [0, 1]
+# --------------------------------------------
+def imresize(img, scale, antialiasing=True):
+ # Now the scale should be the same for H and W
+ # input: img: pytorch tensor, CHW or HW [0,1]
+ # output: CHW or HW [0,1] w/o round
+    need_squeeze = img.dim() == 2
+ if need_squeeze:
+ img.unsqueeze_(0)
+ in_C, in_H, in_W = img.size()
+ out_C, out_H, out_W = in_C, math.ceil(in_H * scale), math.ceil(in_W * scale)
+ kernel_width = 4
+ kernel = 'cubic'
+
+ # Return the desired dimension order for performing the resize. The
+ # strategy is to perform the resize first along the dimension with the
+ # smallest scale factor.
+ # Now we do not support this.
+
+ # get weights and indices
+ weights_H, indices_H, sym_len_Hs, sym_len_He = calculate_weights_indices(
+ in_H, out_H, scale, kernel, kernel_width, antialiasing)
+ weights_W, indices_W, sym_len_Ws, sym_len_We = calculate_weights_indices(
+ in_W, out_W, scale, kernel, kernel_width, antialiasing)
+ # process H dimension
+ # symmetric copying
+ img_aug = torch.FloatTensor(in_C, in_H + sym_len_Hs + sym_len_He, in_W)
+ img_aug.narrow(1, sym_len_Hs, in_H).copy_(img)
+
+ sym_patch = img[:, :sym_len_Hs, :]
+ inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long()
+ sym_patch_inv = sym_patch.index_select(1, inv_idx)
+ img_aug.narrow(1, 0, sym_len_Hs).copy_(sym_patch_inv)
+
+ sym_patch = img[:, -sym_len_He:, :]
+ inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long()
+ sym_patch_inv = sym_patch.index_select(1, inv_idx)
+ img_aug.narrow(1, sym_len_Hs + in_H, sym_len_He).copy_(sym_patch_inv)
+
+ out_1 = torch.FloatTensor(in_C, out_H, in_W)
+ kernel_width = weights_H.size(1)
+ for i in range(out_H):
+ idx = int(indices_H[i][0])
+ for j in range(out_C):
+ out_1[j, i, :] = img_aug[j, idx:idx + kernel_width, :].transpose(0, 1).mv(weights_H[i])
+
+ # process W dimension
+ # symmetric copying
+ out_1_aug = torch.FloatTensor(in_C, out_H, in_W + sym_len_Ws + sym_len_We)
+ out_1_aug.narrow(2, sym_len_Ws, in_W).copy_(out_1)
+
+ sym_patch = out_1[:, :, :sym_len_Ws]
+ inv_idx = torch.arange(sym_patch.size(2) - 1, -1, -1).long()
+ sym_patch_inv = sym_patch.index_select(2, inv_idx)
+ out_1_aug.narrow(2, 0, sym_len_Ws).copy_(sym_patch_inv)
+
+ sym_patch = out_1[:, :, -sym_len_We:]
+ inv_idx = torch.arange(sym_patch.size(2) - 1, -1, -1).long()
+ sym_patch_inv = sym_patch.index_select(2, inv_idx)
+ out_1_aug.narrow(2, sym_len_Ws + in_W, sym_len_We).copy_(sym_patch_inv)
+
+ out_2 = torch.FloatTensor(in_C, out_H, out_W)
+ kernel_width = weights_W.size(1)
+ for i in range(out_W):
+ idx = int(indices_W[i][0])
+ for j in range(out_C):
+ out_2[j, :, i] = out_1_aug[j, :, idx:idx + kernel_width].mv(weights_W[i])
+ if need_squeeze:
+ out_2.squeeze_()
+ return out_2
+
+
+# --------------------------------------------
+# imresize for numpy image [0, 1]
+# --------------------------------------------
+def imresize_np(img, scale, antialiasing=True):
+ # Now the scale should be the same for H and W
+ # input: img: Numpy, HWC or HW [0,1]
+ # output: HWC or HW [0,1] w/o round
+ img = torch.from_numpy(img)
+    need_squeeze = img.dim() == 2
+ if need_squeeze:
+ img.unsqueeze_(2)
+
+ in_H, in_W, in_C = img.size()
+ out_C, out_H, out_W = in_C, math.ceil(in_H * scale), math.ceil(in_W * scale)
+ kernel_width = 4
+ kernel = 'cubic'
+
+ # Return the desired dimension order for performing the resize. The
+ # strategy is to perform the resize first along the dimension with the
+ # smallest scale factor.
+ # Now we do not support this.
+
+ # get weights and indices
+ weights_H, indices_H, sym_len_Hs, sym_len_He = calculate_weights_indices(
+ in_H, out_H, scale, kernel, kernel_width, antialiasing)
+ weights_W, indices_W, sym_len_Ws, sym_len_We = calculate_weights_indices(
+ in_W, out_W, scale, kernel, kernel_width, antialiasing)
+ # process H dimension
+ # symmetric copying
+ img_aug = torch.FloatTensor(in_H + sym_len_Hs + sym_len_He, in_W, in_C)
+ img_aug.narrow(0, sym_len_Hs, in_H).copy_(img)
+
+ sym_patch = img[:sym_len_Hs, :, :]
+ inv_idx = torch.arange(sym_patch.size(0) - 1, -1, -1).long()
+ sym_patch_inv = sym_patch.index_select(0, inv_idx)
+ img_aug.narrow(0, 0, sym_len_Hs).copy_(sym_patch_inv)
+
+ sym_patch = img[-sym_len_He:, :, :]
+ inv_idx = torch.arange(sym_patch.size(0) - 1, -1, -1).long()
+ sym_patch_inv = sym_patch.index_select(0, inv_idx)
+ img_aug.narrow(0, sym_len_Hs + in_H, sym_len_He).copy_(sym_patch_inv)
+
+ out_1 = torch.FloatTensor(out_H, in_W, in_C)
+ kernel_width = weights_H.size(1)
+ for i in range(out_H):
+ idx = int(indices_H[i][0])
+ for j in range(out_C):
+ out_1[i, :, j] = img_aug[idx:idx + kernel_width, :, j].transpose(0, 1).mv(weights_H[i])
+
+ # process W dimension
+ # symmetric copying
+ out_1_aug = torch.FloatTensor(out_H, in_W + sym_len_Ws + sym_len_We, in_C)
+ out_1_aug.narrow(1, sym_len_Ws, in_W).copy_(out_1)
+
+ sym_patch = out_1[:, :sym_len_Ws, :]
+ inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long()
+ sym_patch_inv = sym_patch.index_select(1, inv_idx)
+ out_1_aug.narrow(1, 0, sym_len_Ws).copy_(sym_patch_inv)
+
+ sym_patch = out_1[:, -sym_len_We:, :]
+ inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long()
+ sym_patch_inv = sym_patch.index_select(1, inv_idx)
+ out_1_aug.narrow(1, sym_len_Ws + in_W, sym_len_We).copy_(sym_patch_inv)
+
+ out_2 = torch.FloatTensor(out_H, out_W, in_C)
+ kernel_width = weights_W.size(1)
+ for i in range(out_W):
+ idx = int(indices_W[i][0])
+ for j in range(out_C):
+ out_2[:, i, j] = out_1_aug[:, idx:idx + kernel_width, j].mv(weights_W[i])
+ if need_squeeze:
+ out_2.squeeze_()
+
+ return out_2.numpy()
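+# Hedged usage sketch (the commented __main__ below shows the original form):
+#   img = uint2single(imread_uint('test.bmp', 3))   # float HxWx3 in [0, 1]
+#   img_bicubic = imresize_np(img, 1 / 4)           # 4x antialiased bicubic downscale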
+
+
+if __name__ == '__main__':
+ print('---')
+# img = imread_uint('test.bmp', 3)
+# img = uint2single(img)
+# img_bicubic = imresize_np(img, 1/4)
\ No newline at end of file
diff --git a/ldmlib/modules/losses/__init__.py b/ldmlib/modules/losses/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..636bbd4de126bb4de8d64f318ccceaed0788092f
--- /dev/null
+++ b/ldmlib/modules/losses/__init__.py
@@ -0,0 +1 @@
+from ldmlib.modules.losses.contperceptual import LPIPSWithDiscriminator
diff --git a/ldmlib/modules/losses/contperceptual.py b/ldmlib/modules/losses/contperceptual.py
new file mode 100644
index 0000000000000000000000000000000000000000..8150b9585b2cf892a088860e9dfc5cd6c9060ff4
--- /dev/null
+++ b/ldmlib/modules/losses/contperceptual.py
@@ -0,0 +1,110 @@
+import torch
+import torch.nn as nn
+
+from taming.modules.losses.vqperceptual import * # TODO: taming dependency yes/no?
+
+
+class LPIPSWithDiscriminator(nn.Module):
+ def __init__(self, disc_start, logvar_init=0.0, kl_weight=1.0, pixelloss_weight=1.0,
+ disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0,
+ perceptual_weight=1.0, use_actnorm=False, disc_conditional=False,
+ disc_loss="hinge"):
+
+ super().__init__()
+ assert disc_loss in ["hinge", "vanilla"]
+ self.kl_weight = kl_weight
+ self.pixel_weight = pixelloss_weight
+ self.perceptual_loss = LPIPS().eval()
+ self.perceptual_weight = perceptual_weight
+ # output log variance
+ self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init)
+
+ self.discriminator = NLayerDiscriminator(input_nc=disc_in_channels,
+ n_layers=disc_num_layers,
+ use_actnorm=use_actnorm
+ ).apply(weights_init)
+ self.discriminator_iter_start = disc_start
+ self.disc_loss = hinge_d_loss if disc_loss == "hinge" else vanilla_d_loss
+ self.disc_factor = disc_factor
+ self.discriminator_weight = disc_weight
+ self.disc_conditional = disc_conditional
+
+ def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
+ if last_layer is not None:
+ nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
+ g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
+ else:
+ nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0]
+ g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0]
+
+ d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
+ d_weight = torch.clamp(d_weight, 0.0, 1e4).detach()
+ d_weight = d_weight * self.discriminator_weight
+ return d_weight
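+    # Rationale (as in VQGAN / taming-transformers): the adaptive weight above,
+    #   d_weight = ||grad(nll_loss, last_layer)|| / (||grad(g_loss, last_layer)|| + 1e-4),
+    # keeps the adversarial gradient at the decoder's last layer on the same
+    # scale as the reconstruction gradient, and is then clamped to [0, 1e4] and
+    # scaled by discriminator_weight.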
+
+ def forward(self, inputs, reconstructions, posteriors, optimizer_idx,
+ global_step, last_layer=None, cond=None, split="train",
+ weights=None):
+ rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous())
+ if self.perceptual_weight > 0:
+ p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous())
+ rec_loss = rec_loss + self.perceptual_weight * p_loss
+
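+        # rec_loss becomes a Gaussian-style NLL with a single learned log-variance:
+        # dividing by exp(logvar) and adding logvar lets the model trade
+        # reconstruction sharpness against its confidence in the residuals.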
+ nll_loss = rec_loss / torch.exp(self.logvar) + self.logvar
+ weighted_nll_loss = nll_loss
+ if weights is not None:
+ weighted_nll_loss = weights*nll_loss
+ weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0]
+ nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
+ kl_loss = posteriors.kl()
+ kl_loss = torch.sum(kl_loss) / kl_loss.shape[0]
+
+ # now the GAN part
+ if optimizer_idx == 0:
+ # generator update
+ if cond is None:
+ assert not self.disc_conditional
+ logits_fake = self.discriminator(reconstructions.contiguous())
+ else:
+ assert self.disc_conditional
+ logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1))
+ g_loss = -torch.mean(logits_fake)
+
+ if self.disc_factor > 0.0:
+ try:
+ d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer)
+ except RuntimeError:
+ assert not self.training
+ d_weight = torch.tensor(0.0)
+ else:
+ d_weight = torch.tensor(0.0)
+
+ disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
+ loss = weighted_nll_loss + self.kl_weight * kl_loss + d_weight * disc_factor * g_loss
+
+ log = {"{}/total_loss".format(split): loss.clone().detach().mean(), "{}/logvar".format(split): self.logvar.detach(),
+ "{}/kl_loss".format(split): kl_loss.detach().mean(), "{}/nll_loss".format(split): nll_loss.detach().mean(),
+ "{}/rec_loss".format(split): rec_loss.detach().mean(),
+ "{}/d_weight".format(split): d_weight.detach(),
+ "{}/disc_factor".format(split): torch.tensor(disc_factor),
+ "{}/g_loss".format(split): g_loss.detach().mean(),
+ }
+ return loss, log
+
+ if optimizer_idx == 1:
+ # second pass for discriminator update
+ if cond is None:
+ logits_real = self.discriminator(inputs.contiguous().detach())
+ logits_fake = self.discriminator(reconstructions.contiguous().detach())
+ else:
+ logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1))
+ logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1))
+
+ disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
+ d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)
+
+ log = {"{}/disc_loss".format(split): d_loss.clone().detach().mean(),
+ "{}/logits_real".format(split): logits_real.detach().mean(),
+ "{}/logits_fake".format(split): logits_fake.detach().mean()
+ }
+ return d_loss, log
diff --git a/ldmlib/modules/losses/vqperceptual.py b/ldmlib/modules/losses/vqperceptual.py
new file mode 100644
index 0000000000000000000000000000000000000000..f69981769e4bd5462600458c4fcf26620f7e4306
--- /dev/null
+++ b/ldmlib/modules/losses/vqperceptual.py
@@ -0,0 +1,167 @@
+import torch
+from torch import nn
+import torch.nn.functional as F
+from einops import repeat
+
+from taming.modules.discriminator.model import NLayerDiscriminator, weights_init
+from taming.modules.losses.lpips import LPIPS
+from taming.modules.losses.vqperceptual import hinge_d_loss, vanilla_d_loss
+
+
+def hinge_d_loss_with_exemplar_weights(logits_real, logits_fake, weights):
+ assert weights.shape[0] == logits_real.shape[0] == logits_fake.shape[0]
+ loss_real = torch.mean(F.relu(1. - logits_real), dim=[1,2,3])
+ loss_fake = torch.mean(F.relu(1. + logits_fake), dim=[1,2,3])
+ loss_real = (weights * loss_real).sum() / weights.sum()
+ loss_fake = (weights * loss_fake).sum() / weights.sum()
+ d_loss = 0.5 * (loss_real + loss_fake)
+ return d_loss
+
+def adopt_weight(weight, global_step, threshold=0, value=0.):
+ if global_step < threshold:
+ weight = value
+ return weight
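+# adopt_weight implements a GAN warm-up: it returns `value` (0 by default) until
+# global_step reaches `threshold`, after which the configured weight is used.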
+
+
+def measure_perplexity(predicted_indices, n_embed):
+ # src: https://github.com/karpathy/deep-vector-quantization/blob/main/model.py
+ # eval cluster perplexity. when perplexity == num_embeddings then all clusters are used exactly equally
+ encodings = F.one_hot(predicted_indices, n_embed).float().reshape(-1, n_embed)
+ avg_probs = encodings.mean(0)
+ perplexity = (-(avg_probs * torch.log(avg_probs + 1e-10)).sum()).exp()
+ cluster_use = torch.sum(avg_probs > 0)
+ return perplexity, cluster_use
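+# Intuition: perplexity equals n_embed when all codebook entries are used
+# uniformly and approaches 1 when a single code dominates; cluster_use counts
+# how many codes were selected at least once in the batch.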
+
+def exists(val):
+    # used in VQLPIPSWithDiscriminator.forward below, where it was referenced
+    # without being defined or imported in this file
+    return val is not None
+
+
+def l1(x, y):
+    return torch.abs(x-y)
+
+
+def l2(x, y):
+ return torch.pow((x-y), 2)
+
+
+class VQLPIPSWithDiscriminator(nn.Module):
+ def __init__(self, disc_start, codebook_weight=1.0, pixelloss_weight=1.0,
+ disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0,
+ perceptual_weight=1.0, use_actnorm=False, disc_conditional=False,
+ disc_ndf=64, disc_loss="hinge", n_classes=None, perceptual_loss="lpips",
+ pixel_loss="l1"):
+ super().__init__()
+ assert disc_loss in ["hinge", "vanilla"]
+ assert perceptual_loss in ["lpips", "clips", "dists"]
+ assert pixel_loss in ["l1", "l2"]
+ self.codebook_weight = codebook_weight
+ self.pixel_weight = pixelloss_weight
+ if perceptual_loss == "lpips":
+ print(f"{self.__class__.__name__}: Running with LPIPS.")
+ self.perceptual_loss = LPIPS().eval()
+ else:
+ raise ValueError(f"Unknown perceptual loss: >> {perceptual_loss} <<")
+ self.perceptual_weight = perceptual_weight
+
+ if pixel_loss == "l1":
+ self.pixel_loss = l1
+ else:
+ self.pixel_loss = l2
+
+ self.discriminator = NLayerDiscriminator(input_nc=disc_in_channels,
+ n_layers=disc_num_layers,
+ use_actnorm=use_actnorm,
+ ndf=disc_ndf
+ ).apply(weights_init)
+ self.discriminator_iter_start = disc_start
+ if disc_loss == "hinge":
+ self.disc_loss = hinge_d_loss
+ elif disc_loss == "vanilla":
+ self.disc_loss = vanilla_d_loss
+ else:
+ raise ValueError(f"Unknown GAN loss '{disc_loss}'.")
+ print(f"VQLPIPSWithDiscriminator running with {disc_loss} loss.")
+ self.disc_factor = disc_factor
+ self.discriminator_weight = disc_weight
+ self.disc_conditional = disc_conditional
+ self.n_classes = n_classes
+
+ def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
+ if last_layer is not None:
+ nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
+ g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
+ else:
+ nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0]
+ g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0]
+
+ d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
+ d_weight = torch.clamp(d_weight, 0.0, 1e4).detach()
+ d_weight = d_weight * self.discriminator_weight
+ return d_weight
+
+ def forward(self, codebook_loss, inputs, reconstructions, optimizer_idx,
+ global_step, last_layer=None, cond=None, split="train", predicted_indices=None):
+ if not exists(codebook_loss):
+ codebook_loss = torch.tensor([0.]).to(inputs.device)
+ #rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous())
+ rec_loss = self.pixel_loss(inputs.contiguous(), reconstructions.contiguous())
+ if self.perceptual_weight > 0:
+ p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous())
+ rec_loss = rec_loss + self.perceptual_weight * p_loss
+ else:
+ p_loss = torch.tensor([0.0])
+
+ nll_loss = rec_loss
+ #nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
+ nll_loss = torch.mean(nll_loss)
+
+ # now the GAN part
+ if optimizer_idx == 0:
+ # generator update
+ if cond is None:
+ assert not self.disc_conditional
+ logits_fake = self.discriminator(reconstructions.contiguous())
+ else:
+ assert self.disc_conditional
+ logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1))
+ g_loss = -torch.mean(logits_fake)
+
+ try:
+ d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer)
+ except RuntimeError:
+ assert not self.training
+ d_weight = torch.tensor(0.0)
+
+ disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
+ loss = nll_loss + d_weight * disc_factor * g_loss + self.codebook_weight * codebook_loss.mean()
+
+ log = {"{}/total_loss".format(split): loss.clone().detach().mean(),
+ "{}/quant_loss".format(split): codebook_loss.detach().mean(),
+ "{}/nll_loss".format(split): nll_loss.detach().mean(),
+ "{}/rec_loss".format(split): rec_loss.detach().mean(),
+ "{}/p_loss".format(split): p_loss.detach().mean(),
+ "{}/d_weight".format(split): d_weight.detach(),
+ "{}/disc_factor".format(split): torch.tensor(disc_factor),
+ "{}/g_loss".format(split): g_loss.detach().mean(),
+ }
+ if predicted_indices is not None:
+ assert self.n_classes is not None
+ with torch.no_grad():
+ perplexity, cluster_usage = measure_perplexity(predicted_indices, self.n_classes)
+ log[f"{split}/perplexity"] = perplexity
+ log[f"{split}/cluster_usage"] = cluster_usage
+ return loss, log
+
+ if optimizer_idx == 1:
+ # second pass for discriminator update
+ if cond is None:
+ logits_real = self.discriminator(inputs.contiguous().detach())
+ logits_fake = self.discriminator(reconstructions.contiguous().detach())
+ else:
+ logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1))
+ logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1))
+
+ disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
+ d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)
+
+ log = {"{}/disc_loss".format(split): d_loss.clone().detach().mean(),
+ "{}/logits_real".format(split): logits_real.detach().mean(),
+ "{}/logits_fake".format(split): logits_fake.detach().mean()
+ }
+ return d_loss, log
diff --git a/ldmlib/modules/x_transformer.py b/ldmlib/modules/x_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fc15bf9cfe0111a910e7de33d04ffdec3877576
--- /dev/null
+++ b/ldmlib/modules/x_transformer.py
@@ -0,0 +1,641 @@
+"""shout-out to https://github.com/lucidrains/x-transformers/tree/main/x_transformers"""
+import torch
+from torch import nn, einsum
+import torch.nn.functional as F
+from functools import partial
+from inspect import isfunction
+from collections import namedtuple
+from einops import rearrange, repeat, reduce
+
+# constants
+
+DEFAULT_DIM_HEAD = 64
+
+Intermediates = namedtuple('Intermediates', [
+ 'pre_softmax_attn',
+ 'post_softmax_attn'
+])
+
+LayerIntermediates = namedtuple('LayerIntermediates', [
+ 'hiddens',
+ 'attn_intermediates'
+])
+
+
+class AbsolutePositionalEmbedding(nn.Module):
+ def __init__(self, dim, max_seq_len):
+ super().__init__()
+ self.emb = nn.Embedding(max_seq_len, dim)
+ self.init_()
+
+ def init_(self):
+ nn.init.normal_(self.emb.weight, std=0.02)
+
+ def forward(self, x):
+ n = torch.arange(x.shape[1], device=x.device)
+ return self.emb(n)[None, :, :]
+
+
+class FixedPositionalEmbedding(nn.Module):
+ def __init__(self, dim):
+ super().__init__()
+ inv_freq = 1. / (10000 ** (torch.arange(0, dim, 2).float() / dim))
+ self.register_buffer('inv_freq', inv_freq)
+
+ def forward(self, x, seq_dim=1, offset=0):
+ t = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq) + offset
+ sinusoid_inp = torch.einsum('i , j -> i j', t, self.inv_freq)
+ emb = torch.cat((sinusoid_inp.sin(), sinusoid_inp.cos()), dim=-1)
+ return emb[None, :, :]
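+# This is the standard sinusoidal positional encoding: frequencies
+# 1 / 10000^(2i/dim), with the sin and cos halves concatenated along the feature
+# axis and evaluated at (position + offset) so cached-memory offsets line up.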
+
+
+# helpers
+
+def exists(val):
+ return val is not None
+
+
+def default(val, d):
+ if exists(val):
+ return val
+ return d() if isfunction(d) else d
+
+
+def always(val):
+ def inner(*args, **kwargs):
+ return val
+ return inner
+
+
+def not_equals(val):
+ def inner(x):
+ return x != val
+ return inner
+
+
+def equals(val):
+ def inner(x):
+ return x == val
+ return inner
+
+
+def max_neg_value(tensor):
+ return -torch.finfo(tensor.dtype).max
+
+
+# keyword argument helpers
+
+def pick_and_pop(keys, d):
+ values = list(map(lambda key: d.pop(key), keys))
+ return dict(zip(keys, values))
+
+
+def group_dict_by_key(cond, d):
+ return_val = [dict(), dict()]
+ for key in d.keys():
+ match = bool(cond(key))
+ ind = int(not match)
+ return_val[ind][key] = d[key]
+ return (*return_val,)
+
+
+def string_begins_with(prefix, str):
+ return str.startswith(prefix)
+
+
+def group_by_key_prefix(prefix, d):
+ return group_dict_by_key(partial(string_begins_with, prefix), d)
+
+
+def groupby_prefix_and_trim(prefix, d):
+ kwargs_with_prefix, kwargs = group_dict_by_key(partial(string_begins_with, prefix), d)
+ kwargs_without_prefix = dict(map(lambda x: (x[0][len(prefix):], x[1]), tuple(kwargs_with_prefix.items())))
+ return kwargs_without_prefix, kwargs
+
+
+# classes
+class Scale(nn.Module):
+ def __init__(self, value, fn):
+ super().__init__()
+ self.value = value
+ self.fn = fn
+
+ def forward(self, x, **kwargs):
+ x, *rest = self.fn(x, **kwargs)
+ return (x * self.value, *rest)
+
+
+class Rezero(nn.Module):
+ def __init__(self, fn):
+ super().__init__()
+ self.fn = fn
+ self.g = nn.Parameter(torch.zeros(1))
+
+ def forward(self, x, **kwargs):
+ x, *rest = self.fn(x, **kwargs)
+ return (x * self.g, *rest)
+
+
+class ScaleNorm(nn.Module):
+ def __init__(self, dim, eps=1e-5):
+ super().__init__()
+ self.scale = dim ** -0.5
+ self.eps = eps
+ self.g = nn.Parameter(torch.ones(1))
+
+ def forward(self, x):
+ norm = torch.norm(x, dim=-1, keepdim=True) * self.scale
+ return x / norm.clamp(min=self.eps) * self.g
+
+
+class RMSNorm(nn.Module):
+ def __init__(self, dim, eps=1e-8):
+ super().__init__()
+ self.scale = dim ** -0.5
+ self.eps = eps
+ self.g = nn.Parameter(torch.ones(dim))
+
+ def forward(self, x):
+ norm = torch.norm(x, dim=-1, keepdim=True) * self.scale
+ return x / norm.clamp(min=self.eps) * self.g
+
+
+class Residual(nn.Module):
+ def forward(self, x, residual):
+ return x + residual
+
+
+class GRUGating(nn.Module):
+ def __init__(self, dim):
+ super().__init__()
+ self.gru = nn.GRUCell(dim, dim)
+
+ def forward(self, x, residual):
+ gated_output = self.gru(
+ rearrange(x, 'b n d -> (b n) d'),
+ rearrange(residual, 'b n d -> (b n) d')
+ )
+
+ return gated_output.reshape_as(x)
+
+
+# feedforward
+
+class GEGLU(nn.Module):
+ def __init__(self, dim_in, dim_out):
+ super().__init__()
+ self.proj = nn.Linear(dim_in, dim_out * 2)
+
+ def forward(self, x):
+ x, gate = self.proj(x).chunk(2, dim=-1)
+ return x * F.gelu(gate)
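+# GEGLU (Shazeer 2020, "GLU Variants Improve Transformer"): the projection is
+# split into a value half and a gate half, and the value is modulated by
+# GELU(gate); FeedForward below uses it when glu=True.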
+
+
+class FeedForward(nn.Module):
+ def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
+ super().__init__()
+ inner_dim = int(dim * mult)
+ dim_out = default(dim_out, dim)
+ project_in = nn.Sequential(
+ nn.Linear(dim, inner_dim),
+ nn.GELU()
+ ) if not glu else GEGLU(dim, inner_dim)
+
+ self.net = nn.Sequential(
+ project_in,
+ nn.Dropout(dropout),
+ nn.Linear(inner_dim, dim_out)
+ )
+
+ def forward(self, x):
+ return self.net(x)
+
+
+# attention.
+class Attention(nn.Module):
+ def __init__(
+ self,
+ dim,
+ dim_head=DEFAULT_DIM_HEAD,
+ heads=8,
+ causal=False,
+ mask=None,
+ talking_heads=False,
+ sparse_topk=None,
+ use_entmax15=False,
+ num_mem_kv=0,
+ dropout=0.,
+ on_attn=False
+ ):
+ super().__init__()
+ if use_entmax15:
+ raise NotImplementedError("Check out entmax activation instead of softmax activation!")
+ self.scale = dim_head ** -0.5
+ self.heads = heads
+ self.causal = causal
+ self.mask = mask
+
+ inner_dim = dim_head * heads
+
+ self.to_q = nn.Linear(dim, inner_dim, bias=False)
+ self.to_k = nn.Linear(dim, inner_dim, bias=False)
+ self.to_v = nn.Linear(dim, inner_dim, bias=False)
+ self.dropout = nn.Dropout(dropout)
+
+ # talking heads
+ self.talking_heads = talking_heads
+ if talking_heads:
+ self.pre_softmax_proj = nn.Parameter(torch.randn(heads, heads))
+ self.post_softmax_proj = nn.Parameter(torch.randn(heads, heads))
+
+ # explicit topk sparse attention
+ self.sparse_topk = sparse_topk
+
+ # entmax
+ #self.attn_fn = entmax15 if use_entmax15 else F.softmax
+ self.attn_fn = F.softmax
+
+ # add memory key / values
+ self.num_mem_kv = num_mem_kv
+ if num_mem_kv > 0:
+ self.mem_k = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head))
+ self.mem_v = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head))
+
+ # attention on attention
+ self.attn_on_attn = on_attn
+ self.to_out = nn.Sequential(nn.Linear(inner_dim, dim * 2), nn.GLU()) if on_attn else nn.Linear(inner_dim, dim)
+
+ def forward(
+ self,
+ x,
+ context=None,
+ mask=None,
+ context_mask=None,
+ rel_pos=None,
+ sinusoidal_emb=None,
+ prev_attn=None,
+ mem=None
+ ):
+ b, n, _, h, talking_heads, device = *x.shape, self.heads, self.talking_heads, x.device
+ kv_input = default(context, x)
+
+ q_input = x
+ k_input = kv_input
+ v_input = kv_input
+
+ if exists(mem):
+ k_input = torch.cat((mem, k_input), dim=-2)
+ v_input = torch.cat((mem, v_input), dim=-2)
+
+ if exists(sinusoidal_emb):
+ # in shortformer, the query would start at a position offset depending on the past cached memory
+ offset = k_input.shape[-2] - q_input.shape[-2]
+ q_input = q_input + sinusoidal_emb(q_input, offset=offset)
+ k_input = k_input + sinusoidal_emb(k_input)
+
+ q = self.to_q(q_input)
+ k = self.to_k(k_input)
+ v = self.to_v(v_input)
+
+ q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), (q, k, v))
+
+ input_mask = None
+ if any(map(exists, (mask, context_mask))):
+ q_mask = default(mask, lambda: torch.ones((b, n), device=device).bool())
+ k_mask = q_mask if not exists(context) else context_mask
+ k_mask = default(k_mask, lambda: torch.ones((b, k.shape[-2]), device=device).bool())
+ q_mask = rearrange(q_mask, 'b i -> b () i ()')
+ k_mask = rearrange(k_mask, 'b j -> b () () j')
+ input_mask = q_mask * k_mask
+
+ if self.num_mem_kv > 0:
+ mem_k, mem_v = map(lambda t: repeat(t, 'h n d -> b h n d', b=b), (self.mem_k, self.mem_v))
+ k = torch.cat((mem_k, k), dim=-2)
+ v = torch.cat((mem_v, v), dim=-2)
+ if exists(input_mask):
+ input_mask = F.pad(input_mask, (self.num_mem_kv, 0), value=True)
+
+ dots = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale
+ mask_value = max_neg_value(dots)
+
+ if exists(prev_attn):
+ dots = dots + prev_attn
+
+ pre_softmax_attn = dots
+
+ if talking_heads:
+ dots = einsum('b h i j, h k -> b k i j', dots, self.pre_softmax_proj).contiguous()
+
+ if exists(rel_pos):
+ dots = rel_pos(dots)
+
+ if exists(input_mask):
+ dots.masked_fill_(~input_mask, mask_value)
+ del input_mask
+
+ if self.causal:
+ i, j = dots.shape[-2:]
+ r = torch.arange(i, device=device)
+ mask = rearrange(r, 'i -> () () i ()') < rearrange(r, 'j -> () () () j')
+ mask = F.pad(mask, (j - i, 0), value=False)
+ dots.masked_fill_(mask, mask_value)
+ del mask
+
+ if exists(self.sparse_topk) and self.sparse_topk < dots.shape[-1]:
+ top, _ = dots.topk(self.sparse_topk, dim=-1)
+ vk = top[..., -1].unsqueeze(-1).expand_as(dots)
+ mask = dots < vk
+ dots.masked_fill_(mask, mask_value)
+ del mask
+
+ attn = self.attn_fn(dots, dim=-1)
+ post_softmax_attn = attn
+
+ attn = self.dropout(attn)
+
+ if talking_heads:
+ attn = einsum('b h i j, h k -> b k i j', attn, self.post_softmax_proj).contiguous()
+
+ out = einsum('b h i j, b h j d -> b h i d', attn, v)
+ out = rearrange(out, 'b h n d -> b n (h d)')
+
+ intermediates = Intermediates(
+ pre_softmax_attn=pre_softmax_attn,
+ post_softmax_attn=post_softmax_attn
+ )
+
+ return self.to_out(out), intermediates
+
+
+class AttentionLayers(nn.Module):
+ def __init__(
+ self,
+ dim,
+ depth,
+ heads=8,
+ causal=False,
+ cross_attend=False,
+ only_cross=False,
+ use_scalenorm=False,
+ use_rmsnorm=False,
+ use_rezero=False,
+ rel_pos_num_buckets=32,
+ rel_pos_max_distance=128,
+ position_infused_attn=False,
+ custom_layers=None,
+ sandwich_coef=None,
+ par_ratio=None,
+ residual_attn=False,
+ cross_residual_attn=False,
+ macaron=False,
+ pre_norm=True,
+ gate_residual=False,
+ **kwargs
+ ):
+ super().__init__()
+ ff_kwargs, kwargs = groupby_prefix_and_trim('ff_', kwargs)
+ attn_kwargs, _ = groupby_prefix_and_trim('attn_', kwargs)
+
+ dim_head = attn_kwargs.get('dim_head', DEFAULT_DIM_HEAD)
+
+ self.dim = dim
+ self.depth = depth
+ self.layers = nn.ModuleList([])
+
+ self.has_pos_emb = position_infused_attn
+ self.pia_pos_emb = FixedPositionalEmbedding(dim) if position_infused_attn else None
+ self.rotary_pos_emb = always(None)
+
+ assert rel_pos_num_buckets <= rel_pos_max_distance, 'number of relative position buckets must be less than the relative position max distance'
+ self.rel_pos = None
+
+ self.pre_norm = pre_norm
+
+ self.residual_attn = residual_attn
+ self.cross_residual_attn = cross_residual_attn
+
+ norm_class = ScaleNorm if use_scalenorm else nn.LayerNorm
+ norm_class = RMSNorm if use_rmsnorm else norm_class
+ norm_fn = partial(norm_class, dim)
+
+ norm_fn = nn.Identity if use_rezero else norm_fn
+ branch_fn = Rezero if use_rezero else None
+
+ if cross_attend and not only_cross:
+ default_block = ('a', 'c', 'f')
+ elif cross_attend and only_cross:
+ default_block = ('c', 'f')
+ else:
+ default_block = ('a', 'f')
+
+ if macaron:
+ default_block = ('f',) + default_block
+
+ if exists(custom_layers):
+ layer_types = custom_layers
+ elif exists(par_ratio):
+ par_depth = depth * len(default_block)
+ assert 1 < par_ratio <= par_depth, 'par ratio out of range'
+ default_block = tuple(filter(not_equals('f'), default_block))
+ par_attn = par_depth // par_ratio
+ depth_cut = par_depth * 2 // 3 # 2 / 3 attention layer cutoff suggested by PAR paper
+ par_width = (depth_cut + depth_cut // par_attn) // par_attn
+ assert len(default_block) <= par_width, 'default block is too large for par_ratio'
+ par_block = default_block + ('f',) * (par_width - len(default_block))
+ par_head = par_block * par_attn
+ layer_types = par_head + ('f',) * (par_depth - len(par_head))
+ elif exists(sandwich_coef):
+ assert sandwich_coef > 0 and sandwich_coef <= depth, 'sandwich coefficient should be less than the depth'
+ layer_types = ('a',) * sandwich_coef + default_block * (depth - sandwich_coef) + ('f',) * sandwich_coef
+ else:
+ layer_types = default_block * depth
+
+ self.layer_types = layer_types
+ self.num_attn_layers = len(list(filter(equals('a'), layer_types)))
+
+ for layer_type in self.layer_types:
+ if layer_type == 'a':
+ layer = Attention(dim, heads=heads, causal=causal, **attn_kwargs)
+ elif layer_type == 'c':
+ layer = Attention(dim, heads=heads, **attn_kwargs)
+ elif layer_type == 'f':
+ layer = FeedForward(dim, **ff_kwargs)
+ layer = layer if not macaron else Scale(0.5, layer)
+ else:
+ raise Exception(f'invalid layer type {layer_type}')
+
+ if isinstance(layer, Attention) and exists(branch_fn):
+ layer = branch_fn(layer)
+
+ if gate_residual:
+ residual_fn = GRUGating(dim)
+ else:
+ residual_fn = Residual()
+
+ self.layers.append(nn.ModuleList([
+ norm_fn(),
+ layer,
+ residual_fn
+ ]))
+
+ def forward(
+ self,
+ x,
+ context=None,
+ mask=None,
+ context_mask=None,
+ mems=None,
+ return_hiddens=False
+ ):
+ hiddens = []
+ intermediates = []
+ prev_attn = None
+ prev_cross_attn = None
+
+ mems = mems.copy() if exists(mems) else [None] * self.num_attn_layers
+
+ for ind, (layer_type, (norm, block, residual_fn)) in enumerate(zip(self.layer_types, self.layers)):
+ is_last = ind == (len(self.layers) - 1)
+
+ if layer_type == 'a':
+ hiddens.append(x)
+ layer_mem = mems.pop(0)
+
+ residual = x
+
+ if self.pre_norm:
+ x = norm(x)
+
+ if layer_type == 'a':
+ out, inter = block(x, mask=mask, sinusoidal_emb=self.pia_pos_emb, rel_pos=self.rel_pos,
+ prev_attn=prev_attn, mem=layer_mem)
+ elif layer_type == 'c':
+ out, inter = block(x, context=context, mask=mask, context_mask=context_mask, prev_attn=prev_cross_attn)
+ elif layer_type == 'f':
+ out = block(x)
+
+ x = residual_fn(out, residual)
+
+ if layer_type in ('a', 'c'):
+ intermediates.append(inter)
+
+ if layer_type == 'a' and self.residual_attn:
+ prev_attn = inter.pre_softmax_attn
+ elif layer_type == 'c' and self.cross_residual_attn:
+ prev_cross_attn = inter.pre_softmax_attn
+
+ if not self.pre_norm and not is_last:
+ x = norm(x)
+
+ if return_hiddens:
+ intermediates = LayerIntermediates(
+ hiddens=hiddens,
+ attn_intermediates=intermediates
+ )
+
+ return x, intermediates
+
+ return x
+
+
+class Encoder(AttentionLayers):
+ def __init__(self, **kwargs):
+ assert 'causal' not in kwargs, 'cannot set causality on encoder'
+ super().__init__(causal=False, **kwargs)
+
+
+
+class TransformerWrapper(nn.Module):
+ def __init__(
+ self,
+ *,
+ num_tokens,
+ max_seq_len,
+ attn_layers,
+ emb_dim=None,
+ max_mem_len=0.,
+ emb_dropout=0.,
+ num_memory_tokens=None,
+ tie_embedding=False,
+ use_pos_emb=True
+ ):
+ super().__init__()
+ assert isinstance(attn_layers, AttentionLayers), 'attention layers must be one of Encoder or Decoder'
+
+ dim = attn_layers.dim
+ emb_dim = default(emb_dim, dim)
+
+ self.max_seq_len = max_seq_len
+ self.max_mem_len = max_mem_len
+ self.num_tokens = num_tokens
+
+ self.token_emb = nn.Embedding(num_tokens, emb_dim)
+ self.pos_emb = AbsolutePositionalEmbedding(emb_dim, max_seq_len) if (
+ use_pos_emb and not attn_layers.has_pos_emb) else always(0)
+ self.emb_dropout = nn.Dropout(emb_dropout)
+
+ self.project_emb = nn.Linear(emb_dim, dim) if emb_dim != dim else nn.Identity()
+ self.attn_layers = attn_layers
+ self.norm = nn.LayerNorm(dim)
+
+ self.init_()
+
+ self.to_logits = nn.Linear(dim, num_tokens) if not tie_embedding else lambda t: t @ self.token_emb.weight.t()
+
+ # memory tokens (like [cls]) from Memory Transformers paper
+ num_memory_tokens = default(num_memory_tokens, 0)
+ self.num_memory_tokens = num_memory_tokens
+ if num_memory_tokens > 0:
+ self.memory_tokens = nn.Parameter(torch.randn(num_memory_tokens, dim))
+
+ # let funnel encoder know number of memory tokens, if specified
+ if hasattr(attn_layers, 'num_memory_tokens'):
+ attn_layers.num_memory_tokens = num_memory_tokens
+
+ def init_(self):
+ nn.init.normal_(self.token_emb.weight, std=0.02)
+
+ def forward(
+ self,
+ x,
+ return_embeddings=False,
+ mask=None,
+ return_mems=False,
+ return_attn=False,
+ mems=None,
+ **kwargs
+ ):
+ b, n, device, num_mem = *x.shape, x.device, self.num_memory_tokens
+ x = self.token_emb(x)
+ x += self.pos_emb(x)
+ x = self.emb_dropout(x)
+
+ x = self.project_emb(x)
+
+ if num_mem > 0:
+ mem = repeat(self.memory_tokens, 'n d -> b n d', b=b)
+ x = torch.cat((mem, x), dim=1)
+
+ # auto-handle masking after appending memory tokens
+ if exists(mask):
+ mask = F.pad(mask, (num_mem, 0), value=True)
+
+ x, intermediates = self.attn_layers(x, mask=mask, mems=mems, return_hiddens=True, **kwargs)
+ x = self.norm(x)
+
+ mem, x = x[:, :num_mem], x[:, num_mem:]
+
+ out = self.to_logits(x) if not return_embeddings else x
+
+ if return_mems:
+ hiddens = intermediates.hiddens
+ new_mems = list(map(lambda pair: torch.cat(pair, dim=-2), zip(mems, hiddens))) if exists(mems) else hiddens
+ new_mems = list(map(lambda t: t[..., -self.max_mem_len:, :].detach(), new_mems))
+ return out, new_mems
+
+ if return_attn:
+ attn_maps = list(map(lambda t: t.post_softmax_attn, intermediates.attn_intermediates))
+ return out, attn_maps
+
+ return out
+
diff --git a/ldmlib/util.py b/ldmlib/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ba38853e7a07228cc2c187742b5c45d7359b3f9
--- /dev/null
+++ b/ldmlib/util.py
@@ -0,0 +1,203 @@
+import importlib
+
+import torch
+import numpy as np
+from collections import abc
+from einops import rearrange
+from functools import partial
+
+import multiprocessing as mp
+from threading import Thread
+from queue import Queue
+
+from inspect import isfunction
+from PIL import Image, ImageDraw, ImageFont
+
+
+def log_txt_as_img(wh, xc, size=10):
+ # wh a tuple of (width, height)
+ # xc a list of captions to plot
+ b = len(xc)
+ txts = list()
+ for bi in range(b):
+ txt = Image.new("RGB", wh, color="white")
+ draw = ImageDraw.Draw(txt)
+ font = ImageFont.truetype('data/DejaVuSans.ttf', size=size)
+ nc = int(40 * (wh[0] / 256))
+ lines = "\n".join(xc[bi][start:start + nc] for start in range(0, len(xc[bi]), nc))
+
+ try:
+ draw.text((0, 0), lines, fill="black", font=font)
+ except UnicodeEncodeError:
+            print("Can't encode string for logging. Skipping.")
+
+ txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0
+ txts.append(txt)
+ txts = np.stack(txts)
+ txts = torch.tensor(txts)
+ return txts
+
+
+def ismap(x):
+ if not isinstance(x, torch.Tensor):
+ return False
+ return (len(x.shape) == 4) and (x.shape[1] > 3)
+
+
+def isimage(x):
+ if not isinstance(x, torch.Tensor):
+ return False
+ return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1)
+
+
+def exists(x):
+ return x is not None
+
+
+def default(val, d):
+ if exists(val):
+ return val
+ return d() if isfunction(d) else d
+
+
+def mean_flat(tensor):
+ """
+ https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86
+ Take the mean over all non-batch dimensions.
+ """
+ return tensor.mean(dim=list(range(1, len(tensor.shape))))
+
+
+def count_params(model, verbose=False):
+ total_params = sum(p.numel() for p in model.parameters())
+ if verbose:
+ print(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.")
+ return total_params
+
+
+def instantiate_from_config(config):
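+    # Looks up the dotted class path in config["target"] and constructs it with config["params"].
+    # e.g. (illustrative) instantiate_from_config({"target": "torch.nn.Identity"}) returns nn.Identity().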
+    if "target" not in config:
+ if config == '__is_first_stage__':
+ return None
+ elif config == "__is_unconditional__":
+ return None
+ raise KeyError("Expected key `target` to instantiate.")
+ return get_obj_from_str(config["target"])(**config.get("params", dict()))
+
+
+def get_obj_from_str(string, reload=False):
+ module, cls = string.rsplit(".", 1)
+ if reload:
+ module_imp = importlib.import_module(module)
+ importlib.reload(module_imp)
+ return getattr(importlib.import_module(module, package=None), cls)
+
+
+def _do_parallel_data_prefetch(func, Q, data, idx, idx_to_fn=False):
+ # create dummy dataset instance
+
+ # run prefetching
+ if idx_to_fn:
+ res = func(data, worker_id=idx)
+ else:
+ res = func(data)
+ Q.put([idx, res])
+ Q.put("Done")
+
+
+def parallel_data_prefetch(
+ func: callable, data, n_proc, target_data_type="ndarray", cpu_intensive=True, use_worker_id=False
+):
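+    """Split `data` into `n_proc` chunks and run `func` on each chunk in a separate
+    process (or thread when cpu_intensive=False), gathering the results in chunk order."""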
+ # if target_data_type not in ["ndarray", "list"]:
+ # raise ValueError(
+ # "Data, which is passed to parallel_data_prefetch has to be either of type list or ndarray."
+ # )
+ if isinstance(data, np.ndarray) and target_data_type == "list":
+ raise ValueError("list expected but function got ndarray.")
+ elif isinstance(data, abc.Iterable):
+ if isinstance(data, dict):
+            print(
+                'WARNING: "data" argument passed to parallel_data_prefetch is a dict: using only its values and disregarding keys.'
+            )
+ data = list(data.values())
+ if target_data_type == "ndarray":
+ data = np.asarray(data)
+ else:
+ data = list(data)
+ else:
+        raise TypeError(
+            f"The data to be processed in parallel has to be either an np.ndarray or an Iterable, but is actually {type(data)}."
+        )
+
+ if cpu_intensive:
+ Q = mp.Queue(1000)
+ proc = mp.Process
+ else:
+ Q = Queue(1000)
+ proc = Thread
+ # spawn processes
+ if target_data_type == "ndarray":
+ arguments = [
+ [func, Q, part, i, use_worker_id]
+ for i, part in enumerate(np.array_split(data, n_proc))
+ ]
+ else:
+ step = (
+ int(len(data) / n_proc + 1)
+ if len(data) % n_proc != 0
+ else int(len(data) / n_proc)
+ )
+ arguments = [
+ [func, Q, part, i, use_worker_id]
+ for i, part in enumerate(
+ [data[i: i + step] for i in range(0, len(data), step)]
+ )
+ ]
+ processes = []
+ for i in range(n_proc):
+ p = proc(target=_do_parallel_data_prefetch, args=arguments[i])
+ processes += [p]
+
+ # start processes
+    print("Start prefetching...")
+ import time
+
+ start = time.time()
+ gather_res = [[] for _ in range(n_proc)]
+ try:
+ for p in processes:
+ p.start()
+
+ k = 0
+ while k < n_proc:
+ # get result
+ res = Q.get()
+ if res == "Done":
+ k += 1
+ else:
+ gather_res[res[0]] = res[1]
+
+ except Exception as e:
+ print("Exception: ", e)
+ for p in processes:
+ p.terminate()
+
+ raise e
+ finally:
+ for p in processes:
+ p.join()
+ print(f"Prefetching complete. [{time.time() - start} sec.]")
+
+ if target_data_type == 'ndarray':
+ if not isinstance(gather_res[0], np.ndarray):
+ return np.concatenate([np.asarray(r) for r in gather_res], axis=0)
+
+ # order outputs
+ return np.concatenate(gather_res, axis=0)
+ elif target_data_type == 'list':
+ out = []
+ for r in gather_res:
+ out.extend(r)
+ return out
+ else:
+ return gather_res
diff --git a/modules/app.py b/modules/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..3dcd1f82e7d41fb8bbf128147fbf6e22df28ce3b
--- /dev/null
+++ b/modules/app.py
@@ -0,0 +1,55 @@
+import os
+import requests
+import json
+from io import BytesIO
+
+from fastapi import FastAPI
+from fastapi.staticfiles import StaticFiles
+from fastapi.responses import FileResponse, StreamingResponse
+
+from modules.inference import infer_t5
+from modules.dataset import query_emotion
+
+# https://huggingface.co/settings/tokens
+# https://huggingface.co/spaces/{username}/{space}/settings
+API_TOKEN = os.getenv("AUTH_TOKEN")
+if not API_TOKEN:
+ with open('/root/.huggingface/token') as f:
+ lines = f.readlines()
+    API_TOKEN = lines[0].strip()  # drop the trailing newline so the Authorization header stays valid
+
+app = FastAPI(docs_url=None, redoc_url=None)
+
+app.mount("/static", StaticFiles(directory="static"), name="static")
+
+
+@app.head("/")
+@app.get("/")
+def index() -> FileResponse:
+ return FileResponse(path="static/index.html", media_type="text/html")
+
+
+@app.get("/infer_biggan")
+def biggan(input):
+ output = requests.request(
+ "POST",
+ "https://api-inference.huggingface.co/models/osanseviero/BigGAN-deep-128",
+ headers={"Authorization": f"Bearer {API_TOKEN}"},
+ data=json.dumps(input),
+ )
+ #return json.dumps(output)
+ return StreamingResponse(BytesIO(output.content), media_type="image/png")
+
+
+@app.get("/infer_t5")
+def t5(input):
+ output = infer_t5(input)
+
+ return {"output": output}
+
+
+@app.get("/query_emotion")
+def emotion(start, end):
+ output = query_emotion(int(start), int(end))
+
+ return {"output": output}
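+
+
+# Local smoke test (illustrative, assumes uvicorn is installed):
+#   uvicorn modules.app:app --port 7860
+#   curl "http://localhost:7860/infer_t5?input=translate%20English%20to%20German:%20hello"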
diff --git a/modules/dataset.py b/modules/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..26d9108c537d6fbb2b054e23bc169e1c4fd2aa07
--- /dev/null
+++ b/modules/dataset.py
@@ -0,0 +1,19 @@
+from datasets import load_dataset
+
+dataset = load_dataset("emotion", split="train")
+
+emotions = dataset.info.features["label"].names
+
+def query_emotion(start, end):
+ rows = dataset[start:end]
+    texts, labels = rows["text"], rows["label"]
+
+ observations = []
+
+ for i, text in enumerate(texts):
+ observations.append({
+ "text": text,
+ "emotion": emotions[labels[i]],
+ })
+
+ return observations
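+
+
+# Example (illustrative): query_emotion(0, 2) returns two rows such as
+# [{"text": "...", "emotion": "sadness"}, {"text": "...", "emotion": "joy"}]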
diff --git a/modules/inference.py b/modules/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbf5cce09c4dd0844bb300e7afb161a15f7b0149
--- /dev/null
+++ b/modules/inference.py
@@ -0,0 +1,11 @@
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+tokenizer = T5Tokenizer.from_pretrained("t5-small")
+model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+
+def infer_t5(input):
+ input_ids = tokenizer(input, return_tensors="pt").input_ids
+ outputs = model.generate(input_ids)
+
+ return tokenizer.decode(outputs[0], skip_special_tokens=True)
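+
+
+# Example (illustrative): infer_t5("translate English to German: How old are you?")
+# returns the decoded generation from t5-small as a plain string.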
diff --git a/optimizedSD/LICENSE b/optimizedSD/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..a91cdd6520efc9310bef2f99b634d61370879970
--- /dev/null
+++ b/optimizedSD/LICENSE
@@ -0,0 +1,80 @@
+Copyright (c) 2022 Robin Rombach and Patrick Esser and contributors
+
+CreativeML Open RAIL-M
+dated August 22, 2022
+
+Section I: PREAMBLE
+
+Multimodal generative models are being widely adopted and used, and have the potential to transform the way artists, among other individuals, conceive and benefit from AI or ML technologies as a tool for content creation.
+
+Notwithstanding the current and potential benefits that these artifacts can bring to society at large, there are also concerns about potential misuses of them, either due to their technical limitations or ethical considerations.
+
+In short, this license strives for both the open and responsible downstream use of the accompanying model. When it comes to the open character, we took inspiration from open source permissive licenses regarding the grant of IP rights. Referring to the downstream responsible use, we added use-based restrictions not permitting the use of the Model in very specific scenarios, in order for the licensor to be able to enforce the license in case potential misuses of the Model may occur. At the same time, we strive to promote open and responsible research on generative models for art and content generation.
+
+Even though downstream derivative versions of the model could be released under different licensing terms, the latter will always have to include - at minimum - the same use-based restrictions as the ones in the original license (this license). We believe in the intersection between open and responsible AI development; thus, this License aims to strike a balance between both in order to enable responsible open-science in the field of AI.
+
+This License governs the use of the model (and its derivatives) and is informed by the model card associated with the model.
+
+NOW THEREFORE, You and Licensor agree as follows:
+
+1. Definitions
+
+- "License" means the terms and conditions for use, reproduction, and Distribution as defined in this document.
+- "Data" means a collection of information and/or content extracted from the dataset used with the Model, including to train, pretrain, or otherwise evaluate the Model. The Data is not licensed under this License.
+- "Output" means the results of operating a Model as embodied in informational content resulting therefrom.
+- "Model" means any accompanying machine-learning based assemblies (including checkpoints), consisting of learnt weights, parameters (including optimizer states), corresponding to the model architecture as embodied in the Complementary Material, that have been trained or tuned, in whole or in part on the Data, using the Complementary Material.
+- "Derivatives of the Model" means all modifications to the Model, works based on the Model, or any other model which is created or initialized by transfer of patterns of the weights, parameters, activations or output of the Model, to the other model, in order to cause the other model to perform similarly to the Model, including - but not limited to - distillation methods entailing the use of intermediate data representations or methods based on the generation of synthetic data by the Model for training the other model.
+- "Complementary Material" means the accompanying source code and scripts used to define, run, load, benchmark or evaluate the Model, and used to prepare data for training or evaluation, if any. This includes any accompanying documentation, tutorials, examples, etc, if any.
+- "Distribution" means any transmission, reproduction, publication or other sharing of the Model or Derivatives of the Model to a third party, including providing the Model as a hosted service made available by electronic or other remote means - e.g. API-based or web access.
+- "Licensor" means the copyright owner or entity authorized by the copyright owner that is granting the License, including the persons or entities that may have rights in the Model and/or distributing the Model.
+- "You" (or "Your") means an individual or Legal Entity exercising permissions granted by this License and/or making use of the Model for whichever purpose and in any field of use, including usage of the Model in an end-use application - e.g. chatbot, translator, image generator.
+- "Third Parties" means individuals or legal entities that are not under common control with Licensor or You.
+- "Contribution" means any work of authorship, including the original version of the Model and any modifications or additions to that Model or Derivatives of the Model thereof, that is intentionally submitted to Licensor for inclusion in the Model by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Model, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
+- "Contributor" means Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Model.
+
+Section II: INTELLECTUAL PROPERTY RIGHTS
+
+Both copyright and patent grants apply to the Model, Derivatives of the Model and Complementary Material. The Model and Derivatives of the Model are subject to additional terms as described in Section III.
+
+2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare, publicly display, publicly perform, sublicense, and distribute the Complementary Material, the Model, and Derivatives of the Model.
+3. Grant of Patent License. Subject to the terms and conditions of this License and where and as applicable, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this paragraph) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Model and the Complementary Material, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Model to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Model and/or Complementary Material or a Contribution incorporated within the Model and/or Complementary Material constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for the Model and/or Work shall terminate as of the date such litigation is asserted or filed.
+
+Section III: CONDITIONS OF USAGE, DISTRIBUTION AND REDISTRIBUTION
+
+4. Distribution and Redistribution. You may host for Third Party remote access purposes (e.g. software-as-a-service), reproduce and distribute copies of the Model or Derivatives of the Model thereof in any medium, with or without modifications, provided that You meet the following conditions:
+ Use-based restrictions as referenced in paragraph 5 MUST be included as an enforceable provision by You in any type of legal agreement (e.g. a license) governing the use and/or distribution of the Model or Derivatives of the Model, and You shall give notice to subsequent users You Distribute to, that the Model or Derivatives of the Model are subject to paragraph 5. This provision does not apply to the use of Complementary Material.
+ You must give any Third Party recipients of the Model or Derivatives of the Model a copy of this License;
+ You must cause any modified files to carry prominent notices stating that You changed the files;
+ You must retain all copyright, patent, trademark, and attribution notices excluding those notices that do not pertain to any part of the Model, Derivatives of the Model.
+ You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions - respecting paragraph 4.a. - for use, reproduction, or Distribution of Your modifications, or for any such Derivatives of the Model as a whole, provided Your use, reproduction, and Distribution of the Model otherwise complies with the conditions stated in this License.
+5. Use-based restrictions. The restrictions set forth in Attachment A are considered Use-based restrictions. Therefore You cannot use the Model and the Derivatives of the Model for the specified restricted uses. You may use the Model subject to this License, including only for lawful purposes and in accordance with the License. Use may include creating any content with, finetuning, updating, running, training, evaluating and/or reparametrizing the Model. You shall require all of Your users who use the Model or a Derivative of the Model to comply with the terms of this paragraph (paragraph 5).
+6. The Output You Generate. Except as set forth herein, Licensor claims no rights in the Output You generate using the Model. You are accountable for the Output you generate and its subsequent uses. No use of the output can contravene any provision as stated in the License.
+
+Section IV: OTHER PROVISIONS
+
+7. Updates and Runtime Restrictions. To the maximum extent permitted by law, Licensor reserves the right to restrict (remotely or otherwise) usage of the Model in violation of this License, update the Model through electronic means, or modify the Output of the Model based on updates. You shall undertake reasonable efforts to use the latest version of the Model.
+8. Trademarks and related. Nothing in this License permits You to make use of Licensors’ trademarks, trade names, logos or to otherwise suggest endorsement or misrepresent the relationship between the parties; and any rights not expressly granted herein are reserved by the Licensors.
+9. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Model and the Complementary Material (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Model, Derivatives of the Model, and the Complementary Material and assume any risks associated with Your exercise of permissions under this License.
+10. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Model and the Complementary Material (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
+11. Accepting Warranty or Additional Liability. While redistributing the Model, Derivatives of the Model and the Complementary Material thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
+12. If any provision of this License is held to be invalid, illegal or unenforceable, the remaining provisions shall be unaffected thereby and remain valid as if such provision had not been set forth herein.
+
+END OF TERMS AND CONDITIONS
+
+Attachment A
+
+Use Restrictions
+
+You agree not to use the Model or Derivatives of the Model:
+
+- In any way that violates any applicable national, federal, state, local or international law or regulation;
+- For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;
+- To generate or disseminate verifiably false information and/or content with the purpose of harming others;
+- To generate or disseminate personal identifiable information that can be used to harm an individual;
+- To defame, disparage or otherwise harass others;
+- For fully automated decision making that adversely impacts an individual’s legal rights or otherwise creates or modifies a binding, enforceable obligation;
+- For any use intended to or which has the effect of discriminating against or harming individuals or groups based on online or offline social behavior or known or predicted personal or personality characteristics;
+- To exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;
+- For any use intended to or which has the effect of discriminating against individuals or groups based on legally protected characteristics or categories;
+- To provide medical advice and medical results interpretation;
+- To generate or disseminate information for the purpose to be used for administration of justice, law enforcement, immigration or asylum processes, such as predicting an individual will commit fraud/crime commitment (e.g. by text profiling, drawing causal relationships between assertions made in documents, indiscriminate and arbitrarily-targeted use).
diff --git a/optimizedSD/__pycache__/ddpm.cpython-38.pyc b/optimizedSD/__pycache__/ddpm.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1fb1d7a97e0b495e21cf68fc5e67ffd207a098e4
Binary files /dev/null and b/optimizedSD/__pycache__/ddpm.cpython-38.pyc differ
diff --git a/optimizedSD/__pycache__/openaimodelSplit.cpython-38.pyc b/optimizedSD/__pycache__/openaimodelSplit.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d09e04cc2047f06342a27a683e027292e8e751b6
Binary files /dev/null and b/optimizedSD/__pycache__/openaimodelSplit.cpython-38.pyc differ
diff --git a/optimizedSD/__pycache__/optimUtils.cpython-38.pyc b/optimizedSD/__pycache__/optimUtils.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..243a5c73785d951244b1a149c93920b99ec4a997
Binary files /dev/null and b/optimizedSD/__pycache__/optimUtils.cpython-38.pyc differ
diff --git a/optimizedSD/__pycache__/samplers.cpython-38.pyc b/optimizedSD/__pycache__/samplers.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..35f90fd6b6ef6cc3f70a653db13fe11d9b2c602c
Binary files /dev/null and b/optimizedSD/__pycache__/samplers.cpython-38.pyc differ
diff --git a/optimizedSD/__pycache__/splitAttention.cpython-38.pyc b/optimizedSD/__pycache__/splitAttention.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1d44f880a680a0610c14009100f6f3fb3e40ad7e
Binary files /dev/null and b/optimizedSD/__pycache__/splitAttention.cpython-38.pyc differ
diff --git a/optimizedSD/ddpm.py b/optimizedSD/ddpm.py
new file mode 100644
index 0000000000000000000000000000000000000000..c770e99b5038baafc8497cbd54964e3a0d8ae0bc
--- /dev/null
+++ b/optimizedSD/ddpm.py
@@ -0,0 +1,1080 @@
+"""
+wild mixture of
+https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
+https://github.com/openai/improved-diffusion/blob/e94489283bb876ac1477d5dd7709bbbd2d9902ce/improved_diffusion/gaussian_diffusion.py
+https://github.com/CompVis/taming-transformers
+-- merci
+"""
+
+import time, math
+from tqdm.auto import trange, tqdm
+import torch
+from einops import rearrange
+from tqdm import tqdm
+from ldmlib.modules.distributions.distributions import DiagonalGaussianDistribution
+from ldmlib.models.autoencoder import VQModelInterface
+import torch.nn as nn
+import numpy as np
+import pytorch_lightning as pl
+from functools import partial
+from pytorch_lightning.utilities.distributed import rank_zero_only
+from ldmlib.util import exists, default, instantiate_from_config
+from ldmlib.modules.diffusionmodules.util import (
+    make_beta_schedule, make_ddim_sampling_parameters, make_ddim_timesteps,
+    extract_into_tensor, noise_like
+)
+from .samplers import CompVisDenoiser, get_ancestral_step, to_d, append_dims, linear_multistep_coeff
+
+def disabled_train(self):
+ """Overwrite model.train with this function to make sure train/eval mode
+ does not change anymore."""
+ return self
+
+
+class DDPM(pl.LightningModule):
+ # classic DDPM with Gaussian diffusion, in image space
+ def __init__(self,
+ timesteps=1000,
+ beta_schedule="linear",
+ ckpt_path=None,
+ ignore_keys=[],
+ load_only_unet=False,
+ monitor="val/loss",
+ use_ema=True,
+ first_stage_key="image",
+ image_size=256,
+ channels=3,
+ log_every_t=100,
+ clip_denoised=True,
+ linear_start=1e-4,
+ linear_end=2e-2,
+ cosine_s=8e-3,
+ given_betas=None,
+ original_elbo_weight=0.,
+ v_posterior=0., # weight for choosing posterior variance as sigma = (1-v) * beta_tilde + v * beta
+ l_simple_weight=1.,
+ conditioning_key=None,
+ parameterization="eps", # all assuming fixed variance schedules
+ scheduler_config=None,
+ use_positional_encodings=False,
+ ):
+ super().__init__()
+ assert parameterization in ["eps", "x0"], 'currently only supporting "eps" and "x0"'
+ self.parameterization = parameterization
+ print(f"{self.__class__.__name__}: Running in {self.parameterization}-prediction mode")
+ self.cond_stage_model = None
+ self.clip_denoised = clip_denoised
+ self.log_every_t = log_every_t
+ self.first_stage_key = first_stage_key
+ self.image_size = image_size # try conv?
+ self.channels = channels
+ self.use_positional_encodings = use_positional_encodings
+ self.use_scheduler = scheduler_config is not None
+ if self.use_scheduler:
+ self.scheduler_config = scheduler_config
+
+ self.v_posterior = v_posterior
+ self.original_elbo_weight = original_elbo_weight
+ self.l_simple_weight = l_simple_weight
+
+ if monitor is not None:
+ self.monitor = monitor
+ if ckpt_path is not None:
+ self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys, only_model=load_only_unet)
+ self.register_schedule(given_betas=given_betas, beta_schedule=beta_schedule, timesteps=timesteps,
+ linear_start=linear_start, linear_end=linear_end, cosine_s=cosine_s)
+
+
+ def register_schedule(self, given_betas=None, beta_schedule="linear", timesteps=1000,
+ linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
+ if exists(given_betas):
+ betas = given_betas
+ else:
+ betas = make_beta_schedule(beta_schedule, timesteps, linear_start=linear_start, linear_end=linear_end,
+ cosine_s=cosine_s)
+ alphas = 1. - betas
+ alphas_cumprod = np.cumprod(alphas, axis=0)
+
+ timesteps, = betas.shape
+ self.num_timesteps = int(timesteps)
+ self.linear_start = linear_start
+ self.linear_end = linear_end
+ assert alphas_cumprod.shape[0] == self.num_timesteps, 'alphas have to be defined for each timestep'
+
+ to_torch = partial(torch.tensor, dtype=torch.float32)
+
+ self.register_buffer('betas', to_torch(betas))
+ self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
+
+
+class FirstStage(DDPM):
+ """main class"""
+ def __init__(self,
+ first_stage_config,
+ num_timesteps_cond=None,
+ cond_stage_key="image",
+ cond_stage_trainable=False,
+ concat_mode=True,
+ cond_stage_forward=None,
+ conditioning_key=None,
+ scale_factor=1.0,
+ scale_by_std=False,
+ *args, **kwargs):
+ self.num_timesteps_cond = default(num_timesteps_cond, 1)
+ self.scale_by_std = scale_by_std
+ assert self.num_timesteps_cond <= kwargs['timesteps']
+ # for backwards compatibility after implementation of DiffusionWrapper
+ if conditioning_key is None:
+ conditioning_key = 'concat' if concat_mode else 'crossattn'
+ ckpt_path = kwargs.pop("ckpt_path", None)
+ ignore_keys = kwargs.pop("ignore_keys", [])
+ super().__init__()
+ self.concat_mode = concat_mode
+ self.cond_stage_trainable = cond_stage_trainable
+ self.cond_stage_key = cond_stage_key
+ try:
+ self.num_downs = len(first_stage_config.params.ddconfig.ch_mult) - 1
+        except Exception:
+ self.num_downs = 0
+ if not scale_by_std:
+ self.scale_factor = scale_factor
+ self.instantiate_first_stage(first_stage_config)
+ self.cond_stage_forward = cond_stage_forward
+ self.clip_denoised = False
+ self.bbox_tokenizer = None
+
+ self.restarted_from_ckpt = False
+ if ckpt_path is not None:
+ self.init_from_ckpt(ckpt_path, ignore_keys)
+ self.restarted_from_ckpt = True
+
+
+ def instantiate_first_stage(self, config):
+ model = instantiate_from_config(config)
+ self.first_stage_model = model.eval()
+ self.first_stage_model.train = disabled_train
+ for param in self.first_stage_model.parameters():
+ param.requires_grad = False
+
+ def get_first_stage_encoding(self, encoder_posterior):
+ if isinstance(encoder_posterior, DiagonalGaussianDistribution):
+ z = encoder_posterior.sample()
+ elif isinstance(encoder_posterior, torch.Tensor):
+ z = encoder_posterior
+ else:
+ raise NotImplementedError(f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented")
+ return self.scale_factor * z
+
+
+ @torch.no_grad()
+ def decode_first_stage(self, z, predict_cids=False, force_not_quantize=False):
+ if predict_cids:
+ if z.dim() == 4:
+ z = torch.argmax(z.exp(), dim=1).long()
+ z = self.first_stage_model.quantize.get_codebook_entry(z, shape=None)
+ z = rearrange(z, 'b h w c -> b c h w').contiguous()
+
+ z = 1. / self.scale_factor * z
+
+ if hasattr(self, "split_input_params"):
+ if isinstance(self.first_stage_model, VQModelInterface):
+ return self.first_stage_model.decode(z, force_not_quantize=predict_cids or force_not_quantize)
+ else:
+ return self.first_stage_model.decode(z)
+
+ else:
+ if isinstance(self.first_stage_model, VQModelInterface):
+ return self.first_stage_model.decode(z, force_not_quantize=predict_cids or force_not_quantize)
+ else:
+ return self.first_stage_model.decode(z)
+
+
+ @torch.no_grad()
+ def encode_first_stage(self, x):
+ if hasattr(self, "split_input_params"):
+ if self.split_input_params["patch_distributed_vq"]:
+ ks = self.split_input_params["ks"] # eg. (128, 128)
+ stride = self.split_input_params["stride"] # eg. (64, 64)
+ df = self.split_input_params["vqf"]
+ self.split_input_params['original_image_size'] = x.shape[-2:]
+ bs, nc, h, w = x.shape
+ if ks[0] > h or ks[1] > w:
+ ks = (min(ks[0], h), min(ks[1], w))
+ print("reducing Kernel")
+
+ if stride[0] > h or stride[1] > w:
+ stride = (min(stride[0], h), min(stride[1], w))
+ print("reducing stride")
+
+ fold, unfold, normalization, weighting = self.get_fold_unfold(x, ks, stride, df=df)
+ z = unfold(x) # (bn, nc * prod(**ks), L)
+ # Reshape to img shape
+ z = z.view((z.shape[0], -1, ks[0], ks[1], z.shape[-1])) # (bn, nc, ks[0], ks[1], L )
+
+ output_list = [self.first_stage_model.encode(z[:, :, :, :, i])
+ for i in range(z.shape[-1])]
+
+ o = torch.stack(output_list, axis=-1)
+ o = o * weighting
+
+ # Reverse reshape to img shape
+ o = o.view((o.shape[0], -1, o.shape[-1])) # (bn, nc * ks[0] * ks[1], L)
+ # stitch crops together
+ decoded = fold(o)
+ decoded = decoded / normalization
+ return decoded
+
+ else:
+ return self.first_stage_model.encode(x)
+ else:
+ return self.first_stage_model.encode(x)
+
+
+class CondStage(DDPM):
+ """main class"""
+ def __init__(self,
+ cond_stage_config,
+ num_timesteps_cond=None,
+ cond_stage_key="image",
+ cond_stage_trainable=False,
+ concat_mode=True,
+ cond_stage_forward=None,
+ conditioning_key=None,
+ scale_factor=1.0,
+ scale_by_std=False,
+ *args, **kwargs):
+ self.num_timesteps_cond = default(num_timesteps_cond, 1)
+ self.scale_by_std = scale_by_std
+ assert self.num_timesteps_cond <= kwargs['timesteps']
+ # for backwards compatibility after implementation of DiffusionWrapper
+ if conditioning_key is None:
+ conditioning_key = 'concat' if concat_mode else 'crossattn'
+ if cond_stage_config == '__is_unconditional__':
+ conditioning_key = None
+ ckpt_path = kwargs.pop("ckpt_path", None)
+ ignore_keys = kwargs.pop("ignore_keys", [])
+ super().__init__()
+ self.concat_mode = concat_mode
+ self.cond_stage_trainable = cond_stage_trainable
+ self.cond_stage_key = cond_stage_key
+ self.num_downs = 0
+ if not scale_by_std:
+ self.scale_factor = scale_factor
+ self.instantiate_cond_stage(cond_stage_config)
+ self.cond_stage_forward = cond_stage_forward
+ self.clip_denoised = False
+ self.bbox_tokenizer = None
+
+ self.restarted_from_ckpt = False
+ if ckpt_path is not None:
+ self.init_from_ckpt(ckpt_path, ignore_keys)
+ self.restarted_from_ckpt = True
+
+ def instantiate_cond_stage(self, config):
+ if not self.cond_stage_trainable:
+ if config == "__is_first_stage__":
+ print("Using first stage also as cond stage.")
+ self.cond_stage_model = self.first_stage_model
+ elif config == "__is_unconditional__":
+ print(f"Training {self.__class__.__name__} as an unconditional model.")
+ self.cond_stage_model = None
+ # self.be_unconditional = True
+ else:
+ model = instantiate_from_config(config)
+ self.cond_stage_model = model.eval()
+ self.cond_stage_model.train = disabled_train
+ for param in self.cond_stage_model.parameters():
+ param.requires_grad = False
+ else:
+ assert config != '__is_first_stage__'
+ assert config != '__is_unconditional__'
+ model = instantiate_from_config(config)
+ self.cond_stage_model = model
+
+ def get_learned_conditioning(self, c):
+ if self.cond_stage_forward is None:
+ if hasattr(self.cond_stage_model, 'encode') and callable(self.cond_stage_model.encode):
+ c = self.cond_stage_model.encode(c)
+ if isinstance(c, DiagonalGaussianDistribution):
+ c = c.mode()
+ else:
+ c = self.cond_stage_model(c)
+ else:
+ assert hasattr(self.cond_stage_model, self.cond_stage_forward)
+ c = getattr(self.cond_stage_model, self.cond_stage_forward)(c)
+ return c
+
+class DiffusionWrapper(pl.LightningModule):
+ def __init__(self, diff_model_config):
+ super().__init__()
+ self.diffusion_model = instantiate_from_config(diff_model_config)
+
+ def forward(self, x, t, cc):
+ out = self.diffusion_model(x, t, context=cc)
+ return out
+
+class DiffusionWrapperOut(pl.LightningModule):
+ def __init__(self, diff_model_config):
+ super().__init__()
+ self.diffusion_model = instantiate_from_config(diff_model_config)
+
+    def forward(self, h, emb, tp, hs, cc):
+        return self.diffusion_model(h, emb, tp, hs, context=cc)
+
+
+class UNet(DDPM):
+ """main class"""
+ def __init__(self,
+ unetConfigEncode,
+ unetConfigDecode,
+ num_timesteps_cond=None,
+ cond_stage_key="image",
+ cond_stage_trainable=False,
+ concat_mode=True,
+ cond_stage_forward=None,
+ conditioning_key=None,
+ scale_factor=1.0,
+ unet_bs = 1,
+ scale_by_std=False,
+ *args, **kwargs):
+ self.num_timesteps_cond = default(num_timesteps_cond, 1)
+ self.scale_by_std = scale_by_std
+ assert self.num_timesteps_cond <= kwargs['timesteps']
+ # for backwards compatibility after implementation of DiffusionWrapper
+ if conditioning_key is None:
+ conditioning_key = 'concat' if concat_mode else 'crossattn'
+ ckpt_path = kwargs.pop("ckpt_path", None)
+ ignore_keys = kwargs.pop("ignore_keys", [])
+ super().__init__(conditioning_key=conditioning_key, *args, **kwargs)
+ self.concat_mode = concat_mode
+ self.cond_stage_trainable = cond_stage_trainable
+ self.cond_stage_key = cond_stage_key
+ self.num_downs = 0
+ self.cdevice = "cuda"
+ self.unetConfigEncode = unetConfigEncode
+ self.unetConfigDecode = unetConfigDecode
+ if not scale_by_std:
+ self.scale_factor = scale_factor
+ else:
+ self.register_buffer('scale_factor', torch.tensor(scale_factor))
+ self.cond_stage_forward = cond_stage_forward
+ self.clip_denoised = False
+ self.bbox_tokenizer = None
+ self.model1 = DiffusionWrapper(self.unetConfigEncode)
+ self.model2 = DiffusionWrapperOut(self.unetConfigDecode)
+ self.model1.eval()
+ self.model2.eval()
+ self.turbo = False
+ self.unet_bs = unet_bs
+ self.restarted_from_ckpt = False
+ if ckpt_path is not None:
+ self.init_from_ckpt(ckpt_path, ignore_keys)
+ self.restarted_from_ckpt = True
+
+ def make_cond_schedule(self, ):
+ self.cond_ids = torch.full(size=(self.num_timesteps,), fill_value=self.num_timesteps - 1, dtype=torch.long)
+ ids = torch.round(torch.linspace(0, self.num_timesteps - 1, self.num_timesteps_cond)).long()
+ self.cond_ids[:self.num_timesteps_cond] = ids
+
+ @rank_zero_only
+ @torch.no_grad()
+ def on_train_batch_start(self, batch, batch_idx):
+ # only for very first batch
+ if self.scale_by_std and self.current_epoch == 0 and self.global_step == 0 and batch_idx == 0 and not self.restarted_from_ckpt:
+ assert self.scale_factor == 1., 'rather not use custom rescaling and std-rescaling simultaneously'
+ # set rescale weight to 1./std of encodings
+ print("### USING STD-RESCALING ###")
+ x = super().get_input(batch, self.first_stage_key)
+ x = x.to(self.cdevice)
+ encoder_posterior = self.encode_first_stage(x)
+ z = self.get_first_stage_encoding(encoder_posterior).detach()
+ del self.scale_factor
+ self.register_buffer('scale_factor', 1. / z.flatten().std())
+ print(f"setting self.scale_factor to {self.scale_factor}")
+ print("### USING STD-RESCALING ###")
+
+
+ def apply_model(self, x_noisy, t, cond, return_ids=False):
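+        # The UNet is split into an encoder (model1) and a decoder (model2); each half is run in
+        # micro-batches of `unet_bs` and, unless `turbo` is set, moved to the GPU only while needed.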
+
+        if not self.turbo:
+ self.model1.to(self.cdevice)
+
+ step = self.unet_bs
+ h,emb,hs = self.model1(x_noisy[0:step], t[:step], cond[:step])
+ bs = cond.shape[0]
+
+ # assert bs%2 == 0
+ lenhs = len(hs)
+
+ for i in range(step,bs,step):
+ h_temp,emb_temp,hs_temp = self.model1(x_noisy[i:i+step], t[i:i+step], cond[i:i+step])
+ h = torch.cat((h,h_temp))
+ emb = torch.cat((emb,emb_temp))
+ for j in range(lenhs):
+ hs[j] = torch.cat((hs[j], hs_temp[j]))
+
+
+        if not self.turbo:
+ self.model1.to("cpu")
+ self.model2.to(self.cdevice)
+
+ hs_temp = [hs[j][:step] for j in range(lenhs)]
+ x_recon = self.model2(h[:step],emb[:step],x_noisy.dtype,hs_temp,cond[:step])
+
+ for i in range(step,bs,step):
+
+ hs_temp = [hs[j][i:i+step] for j in range(lenhs)]
+ x_recon1 = self.model2(h[i:i+step],emb[i:i+step],x_noisy.dtype,hs_temp,cond[i:i+step])
+ x_recon = torch.cat((x_recon, x_recon1))
+
+        if not self.turbo:
+ self.model2.to("cpu")
+
+ if isinstance(x_recon, tuple) and not return_ids:
+ return x_recon[0]
+ else:
+ return x_recon
+
+ def register_buffer1(self, name, attr):
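+        # Unlike nn.Module.register_buffer, this moves the tensor to the sampling device and
+        # stores it as a plain attribute, so it is not tracked in the module's state_dict.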
+ if type(attr) == torch.Tensor:
+ if attr.device != torch.device(self.cdevice):
+ attr = attr.to(torch.device(self.cdevice))
+ setattr(self, name, attr)
+
+ def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
+
+
+ self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
+ num_ddpm_timesteps=self.num_timesteps,verbose=verbose)
+
+
+ assert self.alphas_cumprod.shape[0] == self.num_timesteps, 'alphas have to be defined for each timestep'
+
+
+ to_torch = lambda x: x.to(self.cdevice)
+ self.register_buffer1('betas', to_torch(self.betas))
+ self.register_buffer1('alphas_cumprod', to_torch(self.alphas_cumprod))
+ # ddim sampling parameters
+ ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=self.alphas_cumprod.cpu(),
+ ddim_timesteps=self.ddim_timesteps,
+ eta=ddim_eta,verbose=verbose)
+ self.register_buffer1('ddim_sigmas', ddim_sigmas)
+ self.register_buffer1('ddim_alphas', ddim_alphas)
+ self.register_buffer1('ddim_alphas_prev', ddim_alphas_prev)
+ self.register_buffer1('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
+
+
+ @torch.no_grad()
+ def sample(self,
+ S,
+ conditioning,
+ x0=None,
+ shape = None,
+ seed=1234,
+ callback=None,
+ img_callback=None,
+ quantize_x0=False,
+ eta=0.,
+ mask=None,
+ sampler = "plms",
+ temperature=1.,
+ noise_dropout=0.,
+ score_corrector=None,
+ corrector_kwargs=None,
+ verbose=True,
+ x_T=None,
+ log_every_t=100,
+ unconditional_guidance_scale=1.,
+ unconditional_conditioning=None,
+ ):
+
+
+        if self.turbo:
+ self.model1.to(self.cdevice)
+ self.model2.to(self.cdevice)
+
+ if x0 is None:
+ batch_size, b1, b2, b3 = shape
+ img_shape = (1, b1, b2, b3)
+ tens = []
+ print("seeds used = ", [seed+s for s in range(batch_size)])
+ for _ in range(batch_size):
+ torch.manual_seed(seed)
+ tens.append(torch.randn(img_shape, device=self.cdevice))
+ seed+=1
+ noise = torch.cat(tens)
+ del tens
+
+ x_latent = noise if x0 is None else x0
+ # sampling
+ if sampler in ('ddim', 'dpm2', 'heun', 'dpm2_a', 'lms') and not hasattr(self, 'ddim_timesteps'):
+ self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=False)
+
+ if sampler == "plms":
+ self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=False)
+ print(f'Data shape for PLMS sampling is {shape}')
+ samples = self.plms_sampling(conditioning, batch_size, x_latent,
+ callback=callback,
+ img_callback=img_callback,
+ quantize_denoised=quantize_x0,
+ mask=mask, x0=x0,
+ ddim_use_original_steps=False,
+ noise_dropout=noise_dropout,
+ temperature=temperature,
+ score_corrector=score_corrector,
+ corrector_kwargs=corrector_kwargs,
+ log_every_t=log_every_t,
+ unconditional_guidance_scale=unconditional_guidance_scale,
+ unconditional_conditioning=unconditional_conditioning,
+ )
+
+ elif sampler == "ddim":
+ samples = self.ddim_sampling(x_latent, conditioning, S, unconditional_guidance_scale=unconditional_guidance_scale,
+ unconditional_conditioning=unconditional_conditioning,
+ mask = mask,init_latent=x_T,use_original_steps=False,
+ callback=callback, img_callback=img_callback)
+
+ elif sampler == "euler":
+ self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=False)
+ samples = self.euler_sampling(self.alphas_cumprod,x_latent, S, conditioning, unconditional_conditioning=unconditional_conditioning,
+ unconditional_guidance_scale=unconditional_guidance_scale,
+ img_callback=img_callback)
+ elif sampler == "euler_a":
+ self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=False)
+ samples = self.euler_ancestral_sampling(self.alphas_cumprod,x_latent, S, conditioning, unconditional_conditioning=unconditional_conditioning,
+ unconditional_guidance_scale=unconditional_guidance_scale,
+ img_callback=img_callback)
+
+ elif sampler == "dpm2":
+ samples = self.dpm_2_sampling(self.alphas_cumprod,x_latent, S, conditioning, unconditional_conditioning=unconditional_conditioning,
+ unconditional_guidance_scale=unconditional_guidance_scale,
+ img_callback=img_callback)
+ elif sampler == "heun":
+ samples = self.heun_sampling(self.alphas_cumprod,x_latent, S, conditioning, unconditional_conditioning=unconditional_conditioning,
+ unconditional_guidance_scale=unconditional_guidance_scale,
+ img_callback=img_callback)
+
+ elif sampler == "dpm2_a":
+ samples = self.dpm_2_ancestral_sampling(self.alphas_cumprod,x_latent, S, conditioning, unconditional_conditioning=unconditional_conditioning,
+ unconditional_guidance_scale=unconditional_guidance_scale,
+ img_callback=img_callback)
+
+
+ elif sampler == "lms":
+ samples = self.lms_sampling(self.alphas_cumprod,x_latent, S, conditioning, unconditional_conditioning=unconditional_conditioning,
+ unconditional_guidance_scale=unconditional_guidance_scale,
+ img_callback=img_callback)
+
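+        # every sampler above is a generator that emits intermediate results through img_callback,
+        # so the caller can stream preview images while sampling runs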
+ yield from samples
+
+        if self.turbo:
+ self.model1.to("cpu")
+ self.model2.to("cpu")
+
+ @torch.no_grad()
+ def plms_sampling(self, cond,b, img,
+ ddim_use_original_steps=False,
+ callback=None, quantize_denoised=False,
+ mask=None, x0=None, img_callback=None, log_every_t=100,
+ temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
+ unconditional_guidance_scale=1., unconditional_conditioning=None,):
+
+ device = self.betas.device
+ timesteps = self.ddim_timesteps
+ time_range = np.flip(timesteps)
+ total_steps = timesteps.shape[0]
+ print(f"Running PLMS Sampling with {total_steps} timesteps")
+
+ iterator = tqdm(time_range, desc='PLMS Sampler', total=total_steps)
+ old_eps = []
+
+ for i, step in enumerate(iterator):
+ index = total_steps - i - 1
+ ts = torch.full((b,), step, device=device, dtype=torch.long)
+ ts_next = torch.full((b,), time_range[min(i + 1, len(time_range) - 1)], device=device, dtype=torch.long)
+
+ if mask is not None:
+ assert x0 is not None
+ img_orig = self.q_sample(x0, ts) # TODO: deterministic forward pass?
+ img = img_orig * mask + (1. - mask) * img
+
+ outs = self.p_sample_plms(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
+ quantize_denoised=quantize_denoised, temperature=temperature,
+ noise_dropout=noise_dropout, score_corrector=score_corrector,
+ corrector_kwargs=corrector_kwargs,
+ unconditional_guidance_scale=unconditional_guidance_scale,
+ unconditional_conditioning=unconditional_conditioning,
+ old_eps=old_eps, t_next=ts_next)
+ img, pred_x0, e_t = outs
+ old_eps.append(e_t)
+ if len(old_eps) >= 4:
+ old_eps.pop(0)
+ if callback: yield from callback(i)
+ if img_callback: yield from img_callback(pred_x0, i)
+
+ yield from img_callback(img, len(iterator)-1)
+
+ @torch.no_grad()
+ def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
+ temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
+ unconditional_guidance_scale=1., unconditional_conditioning=None, old_eps=None, t_next=None):
+ b, *_, device = *x.shape, x.device
+
+ def get_model_output(x, t):
+ if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
+ e_t = self.apply_model(x, t, c)
+ else:
+ x_in = torch.cat([x] * 2)
+ t_in = torch.cat([t] * 2)
+ c_in = torch.cat([unconditional_conditioning, c])
+ e_t_uncond, e_t = self.apply_model(x_in, t_in, c_in).chunk(2)
+ e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
+
+ if score_corrector is not None:
+ assert self.parameterization == "eps"
+ e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
+
+ return e_t
+
+ alphas = self.ddim_alphas
+ alphas_prev = self.ddim_alphas_prev
+ sqrt_one_minus_alphas = self.ddim_sqrt_one_minus_alphas
+ sigmas = self.ddim_sigmas
+
+ def get_x_prev_and_pred_x0(e_t, index):
+ # select parameters corresponding to the currently considered timestep
+ a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
+ a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
+ sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
+ sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
+
+ # current prediction for x_0
+ pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
+ if quantize_denoised:
+ pred_x0, _, *_ = self.first_stage_model.quantize(pred_x0)
+ # direction pointing to x_t
+ dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
+ noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
+ if noise_dropout > 0.:
+ noise = torch.nn.functional.dropout(noise, p=noise_dropout)
+ x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
+ return x_prev, pred_x0
+
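+        # Pseudo linear multistep: combine the current eps prediction with up to three previous
+        # ones (Adams-Bashforth weights) before taking a single DDIM-style update step.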
+ e_t = get_model_output(x, t)
+ if len(old_eps) == 0:
+ # Pseudo Improved Euler (2nd order)
+ x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index)
+ e_t_next = get_model_output(x_prev, t_next)
+ e_t_prime = (e_t + e_t_next) / 2
+ elif len(old_eps) == 1:
+ # 2nd order Pseudo Linear Multistep (Adams-Bashforth)
+ e_t_prime = (3 * e_t - old_eps[-1]) / 2
+ elif len(old_eps) == 2:
+            # 3rd order Pseudo Linear Multistep (Adams-Bashforth)
+ e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12
+ elif len(old_eps) >= 3:
+            # 4th order Pseudo Linear Multistep (Adams-Bashforth)
+ e_t_prime = (55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]) / 24
+
+ x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index)
+
+ return x_prev, pred_x0, e_t
+
+
+ @torch.no_grad()
+ def stochastic_encode(self, x0, t, seed, ddim_eta,ddim_steps,use_original_steps=False, noise=None):
+ # fast, but does not allow for exact reconstruction
+ # t serves as an index to gather the correct alphas
+ self.make_schedule(ddim_num_steps=ddim_steps, ddim_eta=ddim_eta, verbose=False)
+ sqrt_alphas_cumprod = torch.sqrt(self.ddim_alphas)
+
+ if noise is None:
+ b0, b1, b2, b3 = x0.shape
+ img_shape = (1, b1, b2, b3)
+ tens = []
+ print("seeds used = ", [seed+s for s in range(b0)])
+ for _ in range(b0):
+ torch.manual_seed(seed)
+ tens.append(torch.randn(img_shape, device=x0.device))
+ seed+=1
+ noise = torch.cat(tens)
+ del tens
+ return (extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0 +
+ extract_into_tensor(self.ddim_sqrt_one_minus_alphas, t, x0.shape) * noise)
+
+ @torch.no_grad()
+ def add_noise(self, x0, t):
+
+ sqrt_alphas_cumprod = torch.sqrt(self.ddim_alphas)
+ noise = torch.randn(x0.shape, device=x0.device)
+
+ # print(extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape),
+ # extract_into_tensor(self.ddim_sqrt_one_minus_alphas, t, x0.shape))
+ return (extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0 +
+ extract_into_tensor(self.ddim_sqrt_one_minus_alphas, t, x0.shape) * noise)
+
+
+ @torch.no_grad()
+ def ddim_sampling(self, x_latent, cond, t_start, unconditional_guidance_scale=1.0, unconditional_conditioning=None,
+ mask = None,init_latent=None,use_original_steps=False,
+ callback=None, img_callback=None):
+
+ timesteps = self.ddim_timesteps
+ timesteps = timesteps[:t_start]
+ time_range = np.flip(timesteps)
+ total_steps = timesteps.shape[0]
+ print(f"Running DDIM Sampling with {total_steps} timesteps")
+
+ iterator = tqdm(time_range, desc='Decoding image', total=total_steps)
+ x_dec = x_latent
+ x0 = init_latent
+ for i, step in enumerate(iterator):
+ index = total_steps - i - 1
+ ts = torch.full((x_latent.shape[0],), step, device=x_latent.device, dtype=torch.long)
+
+ if mask is not None:
+ # x0_noisy = self.add_noise(mask, torch.tensor([index] * x0.shape[0]).to(self.cdevice))
+ x0_noisy = x0
+                x_dec = x0_noisy * mask + (1. - mask) * x_dec
+
+ x_dec = self.p_sample_ddim(x_dec, cond, ts, index=index, use_original_steps=use_original_steps,
+ unconditional_guidance_scale=unconditional_guidance_scale,
+ unconditional_conditioning=unconditional_conditioning)
+
+ if callback: yield from callback(i)
+ if img_callback: yield from img_callback(x_dec, i)
+
+ if mask is not None:
+ x_dec = x0 * mask + (1. - mask) * x_dec
+
+ yield from img_callback(x_dec, len(iterator)-1)
+
+
+ @torch.no_grad()
+ def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
+ temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
+ unconditional_guidance_scale=1., unconditional_conditioning=None):
+ b, *_, device = *x.shape, x.device
+
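+        # Classifier-free guidance: evaluate the conditional and unconditional branches in one
+        # batched call, then push the noise estimate away from the unconditional prediction.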
+ if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
+ e_t = self.apply_model(x, t, c)
+ else:
+ x_in = torch.cat([x] * 2)
+ t_in = torch.cat([t] * 2)
+ c_in = torch.cat([unconditional_conditioning, c])
+ e_t_uncond, e_t = self.apply_model(x_in, t_in, c_in).chunk(2)
+ e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
+
+ if score_corrector is not None:
+ assert self.model.parameterization == "eps"
+ e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
+
+ alphas = self.ddim_alphas
+ alphas_prev = self.ddim_alphas_prev
+ sqrt_one_minus_alphas = self.ddim_sqrt_one_minus_alphas
+ sigmas = self.ddim_sigmas
+ # select parameters corresponding to the currently considered timestep
+ a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
+ a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
+ sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
+ sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
+
+ # current prediction for x_0
+ pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
+ if quantize_denoised:
+ pred_x0, _, *_ = self.first_stage_model.quantize(pred_x0)
+ # direction pointing to x_t
+ dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
+ noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
+ if noise_dropout > 0.:
+ noise = torch.nn.functional.dropout(noise, p=noise_dropout)
+ x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
+ return x_prev
+
+
+ @torch.no_grad()
+ def euler_sampling(self, ac, x, S, cond, unconditional_conditioning = None, unconditional_guidance_scale = 1,extra_args=None,callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.,
+ img_callback=None):
+ """Implements Algorithm 2 (Euler steps) from Karras et al. (2022)."""
+ extra_args = {} if extra_args is None else extra_args
+ cvd = CompVisDenoiser(ac)
+ sigmas = cvd.get_sigmas(S)
+ x = x*sigmas[0]
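+        # the latent starts as unit-variance noise, so scaling by the largest sigma places it
+        # at the start of the Karras noise schedule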
+
+ print(f"Running Euler Sampling with {len(sigmas) - 1} timesteps")
+
+ s_in = x.new_ones([x.shape[0]]).half()
+ for i in trange(len(sigmas) - 1, disable=disable):
+ gamma = min(s_churn / (len(sigmas) - 1), 2 ** 0.5 - 1) if s_tmin <= sigmas[i] <= s_tmax else 0.
+ eps = torch.randn_like(x) * s_noise
+ sigma_hat = (sigmas[i] * (gamma + 1)).half()
+ if gamma > 0:
+ x = x + eps * (sigma_hat ** 2 - sigmas[i] ** 2) ** 0.5
+
+ s_i = sigma_hat * s_in
+ x_in = torch.cat([x] * 2)
+ t_in = torch.cat([s_i] * 2)
+ cond_in = torch.cat([unconditional_conditioning, cond])
+ c_out, c_in = [append_dims(tmp, x_in.ndim) for tmp in cvd.get_scalings(t_in)]
+ eps = self.apply_model(x_in * c_in, cvd.sigma_to_t(t_in), cond_in)
+ e_t_uncond, e_t = (x_in + eps * c_out).chunk(2)
+ denoised = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
+
+
+ d = to_d(x, sigma_hat, denoised)
+ if callback is not None:
+ callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigma_hat, 'denoised': denoised})
+
+ if img_callback: yield from img_callback(x, i)
+
+ dt = sigmas[i + 1] - sigma_hat
+ # Euler method
+ x = x + d * dt
+
+ yield from img_callback(x, len(sigmas)-1)
+
+ @torch.no_grad()
+ def euler_ancestral_sampling(self,ac,x, S, cond, unconditional_conditioning = None, unconditional_guidance_scale = 1,extra_args=None, callback=None, disable=None,
+ img_callback=None):
+ """Ancestral sampling with Euler method steps."""
+ extra_args = {} if extra_args is None else extra_args
+
+
+ cvd = CompVisDenoiser(ac)
+ sigmas = cvd.get_sigmas(S)
+ x = x*sigmas[0]
+
+ print(f"Running Euler Ancestral Sampling with {len(sigmas) - 1} timesteps")
+
+ s_in = x.new_ones([x.shape[0]]).half()
+ for i in trange(len(sigmas) - 1, disable=disable):
+
+ s_i = sigmas[i] * s_in
+ x_in = torch.cat([x] * 2)
+ t_in = torch.cat([s_i] * 2)
+ cond_in = torch.cat([unconditional_conditioning, cond])
+ c_out, c_in = [append_dims(tmp, x_in.ndim) for tmp in cvd.get_scalings(t_in)]
+ eps = self.apply_model(x_in * c_in, cvd.sigma_to_t(t_in), cond_in)
+ e_t_uncond, e_t = (x_in + eps * c_out).chunk(2)
+ denoised = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
+
+ sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1])
+ if callback is not None:
+ callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
+
+ if img_callback: yield from img_callback(x, i)
+
+ d = to_d(x, sigmas[i], denoised)
+ # Euler method
+ dt = sigma_down - sigmas[i]
+ x = x + d * dt
+ x = x + torch.randn_like(x) * sigma_up
+
+ yield from img_callback(x, len(sigmas)-1)
+
+
+
+ @torch.no_grad()
+ def heun_sampling(self, ac, x, S, cond, unconditional_conditioning = None, unconditional_guidance_scale = 1, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.,
+ img_callback=None):
+ """Implements Algorithm 2 (Heun steps) from Karras et al. (2022)."""
+ extra_args = {} if extra_args is None else extra_args
+
+ cvd = CompVisDenoiser(alphas_cumprod=ac)
+ sigmas = cvd.get_sigmas(S)
+ x = x*sigmas[0]
+
+ print(f"Running Heun Sampling with {len(sigmas) - 1} timesteps")
+
+
+ s_in = x.new_ones([x.shape[0]]).half()
+ for i in trange(len(sigmas) - 1, disable=disable):
+ gamma = min(s_churn / (len(sigmas) - 1), 2 ** 0.5 - 1) if s_tmin <= sigmas[i] <= s_tmax else 0.
+ eps = torch.randn_like(x) * s_noise
+ sigma_hat = (sigmas[i] * (gamma + 1)).half()
+ if gamma > 0:
+ x = x + eps * (sigma_hat ** 2 - sigmas[i] ** 2) ** 0.5
+
+ s_i = sigma_hat * s_in
+ x_in = torch.cat([x] * 2)
+ t_in = torch.cat([s_i] * 2)
+ cond_in = torch.cat([unconditional_conditioning, cond])
+ c_out, c_in = [append_dims(tmp, x_in.ndim) for tmp in cvd.get_scalings(t_in)]
+ eps = self.apply_model(x_in * c_in, cvd.sigma_to_t(t_in), cond_in)
+ e_t_uncond, e_t = (x_in + eps * c_out).chunk(2)
+ denoised = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
+
+ d = to_d(x, sigma_hat, denoised)
+ if callback is not None:
+ callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigma_hat, 'denoised': denoised})
+
+ if img_callback: yield from img_callback(x, i)
+
+ dt = sigmas[i + 1] - sigma_hat
+ if sigmas[i + 1] == 0:
+ # Euler method
+ x = x + d * dt
+ else:
+ # Heun's method
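+ # Second-order correction: evaluate the slope again at the provisional endpoint and average it with the initial slope.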
+ x_2 = x + d * dt
+ s_i = sigmas[i + 1] * s_in
+ x_in = torch.cat([x_2] * 2)
+ t_in = torch.cat([s_i] * 2)
+ cond_in = torch.cat([unconditional_conditioning, cond])
+ c_out, c_in = [append_dims(tmp, x_in.ndim) for tmp in cvd.get_scalings(t_in)]
+ eps = self.apply_model(x_in * c_in, cvd.sigma_to_t(t_in), cond_in)
+ e_t_uncond, e_t = (x_in + eps * c_out).chunk(2)
+ denoised_2 = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
+
+ d_2 = to_d(x_2, sigmas[i + 1], denoised_2)
+ d_prime = (d + d_2) / 2
+ x = x + d_prime * dt
+
+ if img_callback: yield from img_callback(x, len(sigmas)-1)
+
+
+ @torch.no_grad()
+ def dpm_2_sampling(self,ac,x, S, cond, unconditional_conditioning = None, unconditional_guidance_scale = 1,extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.,
+ img_callback=None):
+ """A sampler inspired by DPM-Solver-2 and Algorithm 2 from Karras et al. (2022)."""
+ extra_args = {} if extra_args is None else extra_args
+
+ cvd = CompVisDenoiser(ac)
+ sigmas = cvd.get_sigmas(S)
+ x = x*sigmas[0]
+
+ print(f"Running DPM2 Sampling with {len(sigmas) - 1} timesteps")
+
+ s_in = x.new_ones([x.shape[0]]).half()
+ for i in trange(len(sigmas) - 1, disable=disable):
+ gamma = min(s_churn / (len(sigmas) - 1), 2 ** 0.5 - 1) if s_tmin <= sigmas[i] <= s_tmax else 0.
+ eps = torch.randn_like(x) * s_noise
+ sigma_hat = sigmas[i] * (gamma + 1)
+ if gamma > 0:
+ x = x + eps * (sigma_hat ** 2 - sigmas[i] ** 2) ** 0.5
+
+ s_i = sigma_hat * s_in
+ x_in = torch.cat([x] * 2)
+ t_in = torch.cat([s_i] * 2)
+ cond_in = torch.cat([unconditional_conditioning, cond])
+ c_out, c_in = [append_dims(tmp, x_in.ndim) for tmp in cvd.get_scalings(t_in)]
+ eps = self.apply_model(x_in * c_in, cvd.sigma_to_t(t_in), cond_in)
+ e_t_uncond, e_t = (x_in + eps * c_out).chunk(2)
+ denoised = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
+
+ if img_callback: yield from img_callback(x, i)
+
+ d = to_d(x, sigma_hat, denoised)
+ # Midpoint method, where the midpoint is chosen according to a rho=3 Karras schedule
+ sigma_mid = ((sigma_hat ** (1 / 3) + sigmas[i + 1] ** (1 / 3)) / 2) ** 3
+ dt_1 = sigma_mid - sigma_hat
+ dt_2 = sigmas[i + 1] - sigma_hat
+ x_2 = x + d * dt_1
+
+ s_i = sigma_mid * s_in
+ x_in = torch.cat([x_2] * 2)
+ t_in = torch.cat([s_i] * 2)
+ cond_in = torch.cat([unconditional_conditioning, cond])
+ c_out, c_in = [append_dims(tmp, x_in.ndim) for tmp in cvd.get_scalings(t_in)]
+ eps = self.apply_model(x_in * c_in, cvd.sigma_to_t(t_in), cond_in)
+ e_t_uncond, e_t = (x_in + eps * c_out).chunk(2)
+ denoised_2 = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
+
+
+ d_2 = to_d(x_2, sigma_mid, denoised_2)
+ x = x + d_2 * dt_2
+
+ if img_callback: yield from img_callback(x, len(sigmas)-1)
+
+
+ @torch.no_grad()
+ def dpm_2_ancestral_sampling(self,ac,x, S, cond, unconditional_conditioning = None, unconditional_guidance_scale = 1, extra_args=None, callback=None, disable=None,
+ img_callback=None):
+ """Ancestral sampling with DPM-Solver inspired second-order steps."""
+ extra_args = {} if extra_args is None else extra_args
+
+ cvd = CompVisDenoiser(ac)
+ sigmas = cvd.get_sigmas(S)
+ x = x*sigmas[0]
+
+ print(f"Running DPM2 Ancestral Sampling with {len(sigmas) - 1} timesteps")
+
+ s_in = x.new_ones([x.shape[0]]).half()
+ for i in trange(len(sigmas) - 1, disable=disable):
+
+ s_i = sigmas[i] * s_in
+ x_in = torch.cat([x] * 2)
+ t_in = torch.cat([s_i] * 2)
+ cond_in = torch.cat([unconditional_conditioning, cond])
+ c_out, c_in = [append_dims(tmp, x_in.ndim) for tmp in cvd.get_scalings(t_in)]
+ eps = self.apply_model(x_in * c_in, cvd.sigma_to_t(t_in), cond_in)
+ e_t_uncond, e_t = (x_in + eps * c_out).chunk(2)
+ denoised = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
+
+
+ sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1])
+ if callback is not None:
+ callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
+
+ if img_callback: yield from img_callback(x, i)
+
+ d = to_d(x, sigmas[i], denoised)
+ # Midpoint method, where the midpoint is chosen according to a rho=3 Karras schedule
+ sigma_mid = ((sigmas[i] ** (1 / 3) + sigma_down ** (1 / 3)) / 2) ** 3
+ dt_1 = sigma_mid - sigmas[i]
+ dt_2 = sigma_down - sigmas[i]
+ x_2 = x + d * dt_1
+
+ s_i = sigma_mid * s_in
+ x_in = torch.cat([x_2] * 2)
+ t_in = torch.cat([s_i] * 2)
+ cond_in = torch.cat([unconditional_conditioning, cond])
+ c_out, c_in = [append_dims(tmp, x_in.ndim) for tmp in cvd.get_scalings(t_in)]
+ eps = self.apply_model(x_in * c_in, cvd.sigma_to_t(t_in), cond_in)
+ e_t_uncond, e_t = (x_in + eps * c_out).chunk(2)
+ denoised_2 = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
+
+
+ d_2 = to_d(x_2, sigma_mid, denoised_2)
+ x = x + d_2 * dt_2
+ x = x + torch.randn_like(x) * sigma_up
+
+ if img_callback: yield from img_callback(x, len(sigmas)-1)
+
+
+ @torch.no_grad()
+ def lms_sampling(self,ac,x, S, cond, unconditional_conditioning = None, unconditional_guidance_scale = 1, extra_args=None, callback=None, disable=None, order=4,
+ img_callback=None):
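+ """Linear multistep (LMS) sampling: keeps the last 'order' derivative estimates and combines them with linear multistep coefficients derived from the sigma schedule."""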
+ extra_args = {} if extra_args is None else extra_args
+ s_in = x.new_ones([x.shape[0]])
+
+ cvd = CompVisDenoiser(ac)
+ sigmas = cvd.get_sigmas(S)
+ x = x*sigmas[0]
+
+ print(f"Running LMS Sampling with {len(sigmas) - 1} timesteps")
+
+ ds = []
+ for i in trange(len(sigmas) - 1, disable=disable):
+
+ s_i = sigmas[i] * s_in
+ x_in = torch.cat([x] * 2)
+ t_in = torch.cat([s_i] * 2)
+ cond_in = torch.cat([unconditional_conditioning, cond])
+ c_out, c_in = [append_dims(tmp, x_in.ndim) for tmp in cvd.get_scalings(t_in)]
+ eps = self.apply_model(x_in * c_in, cvd.sigma_to_t(t_in), cond_in)
+ e_t_uncond, e_t = (x_in + eps * c_out).chunk(2)
+ denoised = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
+
+ if img_callback: yield from img_callback(x, i)
+
+ d = to_d(x, sigmas[i], denoised)
+ ds.append(d)
+ if len(ds) > order:
+ ds.pop(0)
+ if callback is not None:
+ callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
+ cur_order = min(i + 1, order)
+ coeffs = [linear_multistep_coeff(cur_order, sigmas.cpu(), i, j) for j in range(cur_order)]
+ x = x + sum(coeff * d for coeff, d in zip(coeffs, reversed(ds)))
+
+ if img_callback: yield from img_callback(x, len(sigmas)-1)
diff --git a/optimizedSD/diffusers_txt2img.py b/optimizedSD/diffusers_txt2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..80fbb9723ef591e4c14aebf53386dac4fc3e3b66
--- /dev/null
+++ b/optimizedSD/diffusers_txt2img.py
@@ -0,0 +1,13 @@
+import torch
+from diffusers import LDMTextToImagePipeline
+
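+# Minimal text-to-image example using the diffusers LDM pipeline; use_auth_token=True assumes you are logged in to the Hugging Face Hub.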
+pipe = LDMTextToImagePipeline.from_pretrained("CompVis/stable-diffusion-v1-3-diffusers", use_auth_token=True)
+
+prompt = "19th Century wooden engraving of Elon musk"
+
+seed = torch.manual_seed(1024)
+images = pipe([prompt], batch_size=1, num_inference_steps=50, guidance_scale=7, generator=seed, torch_device="cpu")["sample"]
+
+# save images
+for idx, image in enumerate(images):
+ image.save(f"image-{idx}.png")
diff --git a/optimizedSD/img2img_gradio.py b/optimizedSD/img2img_gradio.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fd4678380cd957b7c4b2b0ab752db9006e95015
--- /dev/null
+++ b/optimizedSD/img2img_gradio.py
@@ -0,0 +1,283 @@
+import os, re
+import time
+from contextlib import nullcontext
+from itertools import islice
+from random import randint
+
+import gradio as gr
+import numpy as np
+import pandas as pd
+import torch
+from einops import rearrange, repeat
+from omegaconf import OmegaConf
+from PIL import Image
+from pytorch_lightning import seed_everything
+from torch import autocast
+from torchvision.utils import make_grid
+from tqdm import tqdm, trange
+from transformers import logging
+
+from ldmlib.util import instantiate_from_config
+from optimUtils import split_weighted_subprompts, logger
+logging.set_verbosity_error()
+import mimetypes
+mimetypes.init()
+mimetypes.add_type("application/javascript", ".js")
+
+
+def chunk(it, size):
+ it = iter(it)
+ return iter(lambda: tuple(islice(it, size)), ())
+
+
+def load_model_from_config(ckpt, verbose=False):
+ print(f"Loading model from {ckpt}")
+ pl_sd = torch.load(ckpt, map_location="cpu")
+ if "global_step" in pl_sd:
+ print(f"Global Step: {pl_sd['global_step']}")
+ sd = pl_sd["state_dict"]
+ return sd
+
+
+def load_img(image, h0, w0):
+
+ image = image.convert("RGB")
+ w, h = image.size
+ print(f"loaded input image of size ({w}, {h})")
+ if h0 is not None and w0 is not None:
+ h, w = h0, w0
+
+ w, h = map(lambda x: x - x % 64, (w, h)) # resize to integer multiple of 64
+
+ print(f"New image size ({w}, {h})")
+ image = image.resize((w, h), resample=Image.LANCZOS)
+ image = np.array(image).astype(np.float32) / 255.0
+ image = image[None].transpose(0, 3, 1, 2)
+ image = torch.from_numpy(image)
+ return 2.0 * image - 1.0
+
+config = "optimizedSD/v1-inference.yaml"
+ckpt = "models/ldm/stable-diffusion-v1/model.ckpt"
+sd = load_model_from_config(f"{ckpt}")
+li, lo = [], []
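+# Split the checkpoint between the two half-UNets used by the optimized pipeline: input_blocks, middle_block and time_embed weights go to "model1" (the encode half), the remaining UNet weights to "model2" (the decode half).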
+for key, v_ in sd.items():
+ sp = key.split(".")
+ if (sp[0]) == "model":
+ if "input_blocks" in sp:
+ li.append(key)
+ elif "middle_block" in sp:
+ li.append(key)
+ elif "time_embed" in sp:
+ li.append(key)
+ else:
+ lo.append(key)
+for key in li:
+ sd["model1." + key[6:]] = sd.pop(key)
+for key in lo:
+ sd["model2." + key[6:]] = sd.pop(key)
+
+config = OmegaConf.load(f"{config}")
+
+model = instantiate_from_config(config.modelUNet)
+_, _ = model.load_state_dict(sd, strict=False)
+model.eval()
+
+modelCS = instantiate_from_config(config.modelCondStage)
+_, _ = modelCS.load_state_dict(sd, strict=False)
+modelCS.eval()
+
+modelFS = instantiate_from_config(config.modelFirstStage)
+_, _ = modelFS.load_state_dict(sd, strict=False)
+modelFS.eval()
+del sd
+
+def generate(
+ image,
+ prompt,
+ strength,
+ ddim_steps,
+ n_iter,
+ batch_size,
+ Height,
+ Width,
+ scale,
+ ddim_eta,
+ unet_bs,
+ device,
+ seed,
+ outdir,
+ img_format,
+ turbo,
+ full_precision,
+):
+
+ if seed == "":
+ seed = randint(0, 1000000)
+ seed = int(seed)
+ seed_everything(seed)
+
+ # Logging
+ sampler = "ddim"
+ logger(locals(), log_csv = "logs/img2img_gradio_logs.csv")
+
+ init_image = load_img(image, Height, Width).to(device)
+ model.unet_bs = unet_bs
+ model.turbo = turbo
+ model.cdevice = device
+ modelCS.cond_stage_model.device = device
+
+ if device != "cpu" and full_precision == False:
+ model.half()
+ modelCS.half()
+ modelFS.half()
+ init_image = init_image.half()
+
+ tic = time.time()
+ os.makedirs(outdir, exist_ok=True)
+ outpath = outdir
+ sample_path = os.path.join(outpath, "_".join(re.split(":| ", prompt)))[:150]
+ os.makedirs(sample_path, exist_ok=True)
+ base_count = len(os.listdir(sample_path))
+
+ # n_rows = opt.n_rows if opt.n_rows > 0 else batch_size
+ assert prompt is not None
+ data = [batch_size * [prompt]]
+
+ modelFS.to(device)
+
+ init_image = repeat(init_image, "1 ... -> b ...", b=batch_size)
+ init_latent = modelFS.get_first_stage_encoding(modelFS.encode_first_stage(init_image)) # move to latent space
+
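+ # Free VRAM: move the first-stage model back to the CPU and wait until its GPU allocation has actually been released.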
+ if device != "cpu":
+ mem = torch.cuda.memory_allocated() / 1e6
+ modelFS.to("cpu")
+ while torch.cuda.memory_allocated() / 1e6 >= mem:
+ time.sleep(1)
+
+ assert 0.0 <= strength <= 1.0, "can only work with strength in [0.0, 1.0]"
+ t_enc = int(strength * ddim_steps)
+ print(f"target t_enc is {t_enc} steps")
+
+ if not full_precision and device != "cpu":
+ precision_scope = autocast
+ else:
+ precision_scope = nullcontext
+
+ all_samples = []
+ seeds = ""
+ with torch.no_grad():
+ all_samples = list()
+ for _ in trange(n_iter, desc="Sampling"):
+ for prompts in tqdm(data, desc="data"):
+ with precision_scope("cuda"):
+ modelCS.to(device)
+ uc = None
+ if scale != 1.0:
+ uc = modelCS.get_learned_conditioning(batch_size * [""])
+ if isinstance(prompts, tuple):
+ prompts = list(prompts)
+
+ subprompts, weights = split_weighted_subprompts(prompts[0])
+ if len(subprompts) > 1:
+ c = torch.zeros_like(uc)
+ totalWeight = sum(weights)
+ # normalize each "sub prompt" and add it
+ for i in range(len(subprompts)):
+ weight = weights[i]
+ # if not skip_normalize:
+ weight = weight / totalWeight
+ c = torch.add(c, modelCS.get_learned_conditioning(subprompts[i]), alpha=weight)
+ else:
+ c = modelCS.get_learned_conditioning(prompts)
+
+ if device != "cpu":
+ mem = torch.cuda.memory_allocated() / 1e6
+ modelCS.to("cpu")
+ while torch.cuda.memory_allocated() / 1e6 >= mem:
+ time.sleep(1)
+
+ # encode (scaled latent)
+ z_enc = model.stochastic_encode(
+ init_latent, torch.tensor([t_enc] * batch_size).to(device), seed, ddim_eta, ddim_steps
+ )
+ # decode it
+ samples_ddim = model.sample(
+ t_enc,
+ c,
+ z_enc,
+ unconditional_guidance_scale=scale,
+ unconditional_conditioning=uc,
+ sampler = sampler
+ )
+
+ modelFS.to(device)
+ print("saving images")
+ for i in range(batch_size):
+
+ x_samples_ddim = modelFS.decode_first_stage(samples_ddim[i].unsqueeze(0))
+ x_sample = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
+ all_samples.append(x_sample.to("cpu"))
+ x_sample = 255.0 * rearrange(x_sample[0].cpu().numpy(), "c h w -> h w c")
+ Image.fromarray(x_sample.astype(np.uint8)).save(
+ os.path.join(sample_path, "seed_" + str(seed) + "_" + f"{base_count:05}.{img_format}")
+ )
+ seeds += str(seed) + ","
+ seed += 1
+ base_count += 1
+
+ if device != "cpu":
+ mem = torch.cuda.memory_allocated() / 1e6
+ modelFS.to("cpu")
+ while torch.cuda.memory_allocated() / 1e6 >= mem:
+ time.sleep(1)
+
+ del samples_ddim
+ del x_sample
+ del x_samples_ddim
+ print("memory_final = ", torch.cuda.memory_allocated() / 1e6)
+
+ toc = time.time()
+
+ time_taken = (toc - tic) / 60.0
+ grid = torch.cat(all_samples, 0)
+ grid = make_grid(grid, nrow=n_iter)
+ grid = 255.0 * rearrange(grid, "c h w -> h w c").cpu().numpy()
+
+ txt = (
+ "Samples finished in "
+ + str(round(time_taken, 3))
+ + " minutes and exported to \n"
+ + sample_path
+ + "\nSeeds used = "
+ + seeds[:-1]
+ )
+ return Image.fromarray(grid.astype(np.uint8)), txt
+
+
+demo = gr.Interface(
+ fn=generate,
+ inputs=[
+ gr.Image(tool="editor", type="pil"),
+ "text",
+ gr.Slider(0, 1, value=0.75),
+ gr.Slider(1, 1000, value=50),
+ gr.Slider(1, 100, step=1),
+ gr.Slider(1, 100, step=1),
+ gr.Slider(64, 4096, value=512, step=64),
+ gr.Slider(64, 4096, value=512, step=64),
+ gr.Slider(0, 50, value=7.5, step=0.1),
+ gr.Slider(0, 1, step=0.01),
+ gr.Slider(1, 2, value=1, step=1),
+ gr.Text(value="cuda"),
+ "text",
+ gr.Text(value="outputs/img2img-samples"),
+ gr.Radio(["png", "jpg"], value='png'),
+ "checkbox",
+ "checkbox",
+ ],
+ outputs=["image", "text"],
+)
+demo.launch()
diff --git a/optimizedSD/inpaint_gradio.py b/optimizedSD/inpaint_gradio.py
new file mode 100644
index 0000000000000000000000000000000000000000..f547c0f9789bb29c1b016edb28426b18f78f259b
--- /dev/null
+++ b/optimizedSD/inpaint_gradio.py
@@ -0,0 +1,328 @@
+import argparse
+import os
+import re
+import time
+from contextlib import nullcontext
+from itertools import islice
+from random import randint
+
+import gradio as gr
+import numpy as np
+import torch
+from PIL import Image
+from einops import rearrange, repeat
+from omegaconf import OmegaConf
+from pytorch_lightning import seed_everything
+from torch import autocast
+from torchvision.utils import make_grid
+from tqdm import tqdm, trange
+from transformers import logging
+
+from ldmlib.util import instantiate_from_config
+from optimUtils import split_weighted_subprompts, logger
+
+logging.set_verbosity_error()
+import mimetypes
+
+mimetypes.init()
+mimetypes.add_type("application/javascript", ".js")
+
+
+def chunk(it, size):
+ it = iter(it)
+ return iter(lambda: tuple(islice(it, size)), ())
+
+
+def load_model_from_config(ckpt, verbose=False):
+ print(f"Loading model from {ckpt}")
+ pl_sd = torch.load(ckpt, map_location="cpu")
+ if "global_step" in pl_sd:
+ print(f"Global Step: {pl_sd['global_step']}")
+ sd = pl_sd["state_dict"]
+ return sd
+
+
+def load_img(image, h0, w0):
+ image = image.convert("RGB")
+ w, h = image.size
+ print(f"loaded input image of size ({w}, {h})")
+ if h0 is not None and w0 is not None:
+ h, w = h0, w0
+
+ w, h = map(lambda x: x - x % 64, (w, h)) # resize to integer multiple of 64
+
+ print(f"New image size ({w}, {h})")
+ image = image.resize((w, h), resample=Image.LANCZOS)
+ image = np.array(image).astype(np.float32) / 255.0
+ image = image[None].transpose(0, 3, 1, 2)
+ image = torch.from_numpy(image)
+ return 2.0 * image - 1.0
+
+
+def load_mask(mask, h0, w0, newH, newW, invert=False):
+ image = mask.convert("RGB")
+ w, h = image.size
+ print(f"loaded input mask of size ({w}, {h})")
+ if h0 is not None and w0 is not None:
+ h, w = h0, w0
+
+ w, h = map(lambda x: x - x % 64, (w, h)) # resize to integer multiple of 64
+
+ print(f"New mask size ({w}, {h})")
+ image = image.resize((newW, newH), resample=Image.LANCZOS)
+ # image = image.resize((64, 64), resample=Image.LANCZOS)
+ image = np.array(image)
+
+ if invert:
+ print("inverted")
+ where_0, where_1 = np.where(image == 0), np.where(image == 255)
+ image[where_0], image[where_1] = 255, 0
+ image = image.astype(np.float32) / 255.0
+ image = image[None].transpose(0, 3, 1, 2)
+ image = torch.from_numpy(image)
+ return image
+
+
+def generate(
+ image,
+ mask_image,
+ prompt,
+ strength,
+ ddim_steps,
+ n_iter,
+ batch_size,
+ Height,
+ Width,
+ scale,
+ ddim_eta,
+ unet_bs,
+ device,
+ seed,
+ outdir,
+ img_format,
+ turbo,
+ full_precision,
+):
+ if seed == "":
+ seed = randint(0, 1000000)
+ seed = int(seed)
+ seed_everything(seed)
+ sampler = "ddim"
+
+ # Logging
+ logger(locals(), log_csv="logs/inpaint_gradio_logs.csv")
+
+ init_image = load_img(image['image'], Height, Width).to(device)
+
+ model.unet_bs = unet_bs
+ model.turbo = turbo
+ model.cdevice = device
+ modelCS.cond_stage_model.device = device
+
+ if device != "cpu" and full_precision == False:
+ model.half()
+ modelCS.half()
+ modelFS.half()
+ init_image = init_image.half()
+ # mask.half()
+
+ tic = time.time()
+ os.makedirs(outdir, exist_ok=True)
+ outpath = outdir
+ sample_path = os.path.join(outpath, "_".join(re.split(":| ", prompt)))[:150]
+ os.makedirs(sample_path, exist_ok=True)
+ base_count = len(os.listdir(sample_path))
+
+ # n_rows = opt.n_rows if opt.n_rows > 0 else batch_size
+ assert prompt is not None
+ data = [batch_size * [prompt]]
+
+ modelFS.to(device)
+
+ init_latent = modelFS.get_first_stage_encoding(modelFS.encode_first_stage(init_image)) # move to latent space
+ init_latent = repeat(init_latent, "1 ... -> b ...", b=batch_size)
+ if mask_image is None:
+ mask = load_mask(image['mask'], Height, Width, init_latent.shape[2], init_latent.shape[3], True).to(device)
+ else:
+ image['mask']=mask_image
+ mask = load_mask(mask_image, Height, Width, init_latent.shape[2], init_latent.shape[3], True).to(device)
+
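+ # Broadcast the single-channel mask across the 4 latent channels and the batch so it matches the shape of init_latent.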
+ mask = mask[0][0].unsqueeze(0).repeat(4, 1, 1).unsqueeze(0)
+ mask = repeat(mask, '1 ... -> b ...', b=batch_size)
+
+ if device != "cpu":
+ mem = torch.cuda.memory_allocated() / 1e6
+ modelFS.to("cpu")
+ while torch.cuda.memory_allocated() / 1e6 >= mem:
+ time.sleep(1)
+
+ if strength == 1:
+ print("strength should be less than 1, setting it to 0.999")
+ strength = 0.999
+ assert 0.0 <= strength < 1.0, "can only work with strength in [0.0, 1.0]"
+ t_enc = int(strength * ddim_steps)
+ print(f"target t_enc is {t_enc} steps")
+
+ if not full_precision and device != "cpu":
+ precision_scope = autocast
+ else:
+ precision_scope = nullcontext
+
+ all_samples = []
+ seeds = ""
+ with torch.no_grad():
+ all_samples = list()
+ for _ in trange(n_iter, desc="Sampling"):
+ for prompts in tqdm(data, desc="data"):
+ with precision_scope("cuda"):
+ modelCS.to(device)
+ uc = None
+ if scale != 1.0:
+ uc = modelCS.get_learned_conditioning(batch_size * [""])
+ if isinstance(prompts, tuple):
+ prompts = list(prompts)
+
+ subprompts, weights = split_weighted_subprompts(prompts[0])
+ if len(subprompts) > 1:
+ c = torch.zeros_like(uc)
+ totalWeight = sum(weights)
+ # normalize each "sub prompt" and add it
+ for i in range(len(subprompts)):
+ weight = weights[i]
+ # if not skip_normalize:
+ weight = weight / totalWeight
+ c = torch.add(c, modelCS.get_learned_conditioning(subprompts[i]), alpha=weight)
+ else:
+ c = modelCS.get_learned_conditioning(prompts)
+
+ if device != "cpu":
+ mem = torch.cuda.memory_allocated() / 1e6
+ modelCS.to("cpu")
+ while torch.cuda.memory_allocated() / 1e6 >= mem:
+ time.sleep(1)
+
+ # encode (scaled latent)
+ z_enc = model.stochastic_encode(
+ init_latent, torch.tensor([t_enc] * batch_size).to(device),
+ seed, ddim_eta, ddim_steps)
+
+ # decode it
+ samples_ddim = model.sample(
+ t_enc,
+ c,
+ z_enc,
+ unconditional_guidance_scale=scale,
+ unconditional_conditioning=uc,
+ mask=mask,
+ x_T=init_latent,
+ sampler=sampler,
+ )
+
+ modelFS.to(device)
+ print("saving images")
+ for i in range(batch_size):
+ x_samples_ddim = modelFS.decode_first_stage(samples_ddim[i].unsqueeze(0))
+ x_sample = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
+ all_samples.append(x_sample.to("cpu"))
+ x_sample = 255.0 * rearrange(x_sample[0].cpu().numpy(), "c h w -> h w c")
+ Image.fromarray(x_sample.astype(np.uint8)).save(
+ os.path.join(sample_path, "seed_" + str(seed) + "_" + f"{base_count:05}.{img_format}")
+ )
+ seeds += str(seed) + ","
+ seed += 1
+ base_count += 1
+
+ if device != "cpu":
+ mem = torch.cuda.memory_allocated() / 1e6
+ modelFS.to("cpu")
+ while torch.cuda.memory_allocated() / 1e6 >= mem:
+ time.sleep(1)
+
+ del samples_ddim
+ del x_sample
+ del x_samples_ddim
+ print("memory_final = ", torch.cuda.memory_allocated() / 1e6)
+
+ toc = time.time()
+
+ time_taken = (toc - tic) / 60.0
+ grid = torch.cat(all_samples, 0)
+ grid = make_grid(grid, nrow=n_iter)
+ grid = 255.0 * rearrange(grid, "c h w -> h w c").cpu().numpy()
+
+ txt = (
+ "Samples finished in "
+ + str(round(time_taken, 3))
+ + " minutes and exported to \n"
+ + sample_path
+ + "\nSeeds used = "
+ + seeds[:-1]
+ )
+ return Image.fromarray(grid.astype(np.uint8)), image['mask'], txt
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='txt2img using gradio')
+ parser.add_argument('--config_path', default="optimizedSD/v1-inference.yaml", type=str, help='config path')
+ parser.add_argument('--ckpt_path', default="models/ldm/stable-diffusion-v1/model.ckpt", type=str, help='ckpt path')
+ args = parser.parse_args()
+ config = args.config_path
+ ckpt = args.ckpt_path
+ sd = load_model_from_config(f"{ckpt}")
+ li, lo = [], []
+ for key, v_ in sd.items():
+ sp = key.split(".")
+ if (sp[0]) == "model":
+ if "input_blocks" in sp:
+ li.append(key)
+ elif "middle_block" in sp:
+ li.append(key)
+ elif "time_embed" in sp:
+ li.append(key)
+ else:
+ lo.append(key)
+ for key in li:
+ sd["model1." + key[6:]] = sd.pop(key)
+ for key in lo:
+ sd["model2." + key[6:]] = sd.pop(key)
+
+ config = OmegaConf.load(f"{config}")
+
+ model = instantiate_from_config(config.modelUNet)
+ _, _ = model.load_state_dict(sd, strict=False)
+ model.eval()
+
+ modelCS = instantiate_from_config(config.modelCondStage)
+ _, _ = modelCS.load_state_dict(sd, strict=False)
+ modelCS.eval()
+
+ modelFS = instantiate_from_config(config.modelFirstStage)
+ _, _ = modelFS.load_state_dict(sd, strict=False)
+ modelFS.eval()
+ del sd
+
+ demo = gr.Interface(
+ fn=generate,
+ inputs=[
+ gr.Image(tool="sketch", type="pil"),
+ gr.Image(tool="editor", type="pil"),
+ "text",
+ gr.Slider(0, 0.99, value=0.99, step=0.01),
+ gr.Slider(1, 1000, value=50),
+ gr.Slider(1, 100, step=1),
+ gr.Slider(1, 100, step=1),
+ gr.Slider(64, 4096, value=512, step=64),
+ gr.Slider(64, 4096, value=512, step=64),
+ gr.Slider(0, 50, value=7.5, step=0.1),
+ gr.Slider(0, 1, step=0.01),
+ gr.Slider(1, 2, value=1, step=1),
+ gr.Text(value="cuda"),
+ "text",
+ gr.Text(value="outputs/inpaint-samples"),
+ gr.Radio(["png", "jpg"], value='png'),
+ "checkbox",
+ "checkbox",
+ ],
+ outputs=["image", "image", "text"],
+ )
+ demo.launch()
diff --git a/optimizedSD/openaimodelSplit.py b/optimizedSD/openaimodelSplit.py
new file mode 100644
index 0000000000000000000000000000000000000000..c79afe057d3831fba623c1076b6ee663791ce3cf
--- /dev/null
+++ b/optimizedSD/openaimodelSplit.py
@@ -0,0 +1,807 @@
+from abc import abstractmethod
+import math
+import numpy as np
+import torch as th
+import torch.nn as nn
+import torch.nn.functional as F
+from ldmlib.modules.diffusionmodules.util import (
+ checkpoint,
+ conv_nd,
+ linear,
+ avg_pool_nd,
+ zero_module,
+ normalization,
+ timestep_embedding,
+)
+from .splitAttention import SpatialTransformer
+
+
+class AttentionPool2d(nn.Module):
+ """
+ Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py
+ """
+
+ def __init__(
+ self,
+ spacial_dim: int,
+ embed_dim: int,
+ num_heads_channels: int,
+ output_dim: int = None,
+ ):
+ super().__init__()
+ self.positional_embedding = nn.Parameter(th.randn(embed_dim, spacial_dim ** 2 + 1) / embed_dim ** 0.5)
+ self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
+ self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
+ self.num_heads = embed_dim // num_heads_channels
+ self.attention = QKVAttention(self.num_heads)
+
+ def forward(self, x):
+ b, c, *_spatial = x.shape
+ x = x.reshape(b, c, -1) # NC(HW)
+ x = th.cat([x.mean(dim=-1, keepdim=True), x], dim=-1) # NC(HW+1)
+ x = x + self.positional_embedding[None, :, :].to(x.dtype) # NC(HW+1)
+ x = self.qkv_proj(x)
+ x = self.attention(x)
+ x = self.c_proj(x)
+ return x[:, :, 0]
+
+
+class TimestepBlock(nn.Module):
+ """
+ Any module where forward() takes timestep embeddings as a second argument.
+ """
+
+ @abstractmethod
+ def forward(self, x, emb):
+ """
+ Apply the module to `x` given `emb` timestep embeddings.
+ """
+
+
+class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
+ """
+ A sequential module that passes timestep embeddings to the children that
+ support it as an extra input.
+ """
+
+ def forward(self, x, emb, context=None):
+ for layer in self:
+ if isinstance(layer, TimestepBlock):
+ x = layer(x, emb)
+ elif isinstance(layer, SpatialTransformer):
+ x = layer(x, context)
+ else:
+ x = layer(x)
+ return x
+
+
+class Upsample(nn.Module):
+ """
+ An upsampling layer with an optional convolution.
+ :param channels: channels in the inputs and outputs.
+ :param use_conv: a bool determining if a convolution is applied.
+ :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
+ upsampling occurs in the inner-two dimensions.
+ """
+
+ def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
+ super().__init__()
+ self.channels = channels
+ self.out_channels = out_channels or channels
+ self.use_conv = use_conv
+ self.dims = dims
+ if use_conv:
+ self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=padding)
+
+ def forward(self, x):
+ assert x.shape[1] == self.channels
+ if self.dims == 3:
+ x = F.interpolate(
+ x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest"
+ )
+ else:
+ x = F.interpolate(x, scale_factor=2, mode="nearest")
+ if self.use_conv:
+ x = self.conv(x)
+ return x
+
+class TransposedUpsample(nn.Module):
+ 'Learned 2x upsampling without padding'
+ def __init__(self, channels, out_channels=None, ks=5):
+ super().__init__()
+ self.channels = channels
+ self.out_channels = out_channels or channels
+
+ self.up = nn.ConvTranspose2d(self.channels,self.out_channels,kernel_size=ks,stride=2)
+
+ def forward(self,x):
+ return self.up(x)
+
+
+class Downsample(nn.Module):
+ """
+ A downsampling layer with an optional convolution.
+ :param channels: channels in the inputs and outputs.
+ :param use_conv: a bool determining if a convolution is applied.
+ :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
+ downsampling occurs in the inner-two dimensions.
+ """
+
+ def __init__(self, channels, use_conv, dims=2, out_channels=None,padding=1):
+ super().__init__()
+ self.channels = channels
+ self.out_channels = out_channels or channels
+ self.use_conv = use_conv
+ self.dims = dims
+ stride = 2 if dims != 3 else (1, 2, 2)
+ if use_conv:
+ self.op = conv_nd(
+ dims, self.channels, self.out_channels, 3, stride=stride, padding=padding
+ )
+ else:
+ assert self.channels == self.out_channels
+ self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
+
+ def forward(self, x):
+ assert x.shape[1] == self.channels
+ return self.op(x)
+
+
+class ResBlock(TimestepBlock):
+ """
+ A residual block that can optionally change the number of channels.
+ :param channels: the number of input channels.
+ :param emb_channels: the number of timestep embedding channels.
+ :param dropout: the rate of dropout.
+ :param out_channels: if specified, the number of out channels.
+ :param use_conv: if True and out_channels is specified, use a spatial
+ convolution instead of a smaller 1x1 convolution to change the
+ channels in the skip connection.
+ :param dims: determines if the signal is 1D, 2D, or 3D.
+ :param use_checkpoint: if True, use gradient checkpointing on this module.
+ :param up: if True, use this block for upsampling.
+ :param down: if True, use this block for downsampling.
+ """
+
+ def __init__(
+ self,
+ channels,
+ emb_channels,
+ dropout,
+ out_channels=None,
+ use_conv=False,
+ use_scale_shift_norm=False,
+ dims=2,
+ use_checkpoint=False,
+ up=False,
+ down=False,
+ ):
+ super().__init__()
+ self.channels = channels
+ self.emb_channels = emb_channels
+ self.dropout = dropout
+ self.out_channels = out_channels or channels
+ self.use_conv = use_conv
+ self.use_checkpoint = use_checkpoint
+ self.use_scale_shift_norm = use_scale_shift_norm
+
+ self.in_layers = nn.Sequential(
+ normalization(channels),
+ nn.SiLU(),
+ conv_nd(dims, channels, self.out_channels, 3, padding=1),
+ )
+
+ self.updown = up or down
+
+ if up:
+ self.h_upd = Upsample(channels, False, dims)
+ self.x_upd = Upsample(channels, False, dims)
+ elif down:
+ self.h_upd = Downsample(channels, False, dims)
+ self.x_upd = Downsample(channels, False, dims)
+ else:
+ self.h_upd = self.x_upd = nn.Identity()
+
+ self.emb_layers = nn.Sequential(
+ nn.SiLU(),
+ linear(
+ emb_channels,
+ 2 * self.out_channels if use_scale_shift_norm else self.out_channels,
+ ),
+ )
+ self.out_layers = nn.Sequential(
+ normalization(self.out_channels),
+ nn.SiLU(),
+ nn.Dropout(p=dropout),
+ zero_module(
+ conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)
+ ),
+ )
+
+ if self.out_channels == channels:
+ self.skip_connection = nn.Identity()
+ elif use_conv:
+ self.skip_connection = conv_nd(
+ dims, channels, self.out_channels, 3, padding=1
+ )
+ else:
+ self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
+
+ def forward(self, x, emb):
+ """
+ Apply the block to a Tensor, conditioned on a timestep embedding.
+ :param x: an [N x C x ...] Tensor of features.
+ :param emb: an [N x emb_channels] Tensor of timestep embeddings.
+ :return: an [N x C x ...] Tensor of outputs.
+ """
+ return checkpoint(
+ self._forward, (x, emb), self.parameters(), self.use_checkpoint
+ )
+
+
+ def _forward(self, x, emb):
+ if self.updown:
+ in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
+ h = in_rest(x)
+ h = self.h_upd(h)
+ x = self.x_upd(x)
+ h = in_conv(h)
+ else:
+ h = self.in_layers(x)
+ emb_out = self.emb_layers(emb).type(h.dtype)
+ while len(emb_out.shape) < len(h.shape):
+ emb_out = emb_out[..., None]
+ if self.use_scale_shift_norm:
+ out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
+ scale, shift = th.chunk(emb_out, 2, dim=1)
+ h = out_norm(h) * (1 + scale) + shift
+ h = out_rest(h)
+ else:
+ h = h + emb_out
+ h = self.out_layers(h)
+ return self.skip_connection(x) + h
+
+
+class AttentionBlock(nn.Module):
+ """
+ An attention block that allows spatial positions to attend to each other.
+ Originally ported from here, but adapted to the N-d case.
+ https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
+ """
+
+ def __init__(
+ self,
+ channels,
+ num_heads=1,
+ num_head_channels=-1,
+ use_checkpoint=False,
+ use_new_attention_order=False,
+ ):
+ super().__init__()
+ self.channels = channels
+ if num_head_channels == -1:
+ self.num_heads = num_heads
+ else:
+ assert (
+ channels % num_head_channels == 0
+ ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
+ self.num_heads = channels // num_head_channels
+ self.use_checkpoint = use_checkpoint
+ self.norm = normalization(channels)
+ self.qkv = conv_nd(1, channels, channels * 3, 1)
+ if use_new_attention_order:
+ # split qkv before split heads
+ self.attention = QKVAttention(self.num_heads)
+ else:
+ # split heads before split qkv
+ self.attention = QKVAttentionLegacy(self.num_heads)
+
+ self.proj_out = zero_module(conv_nd(1, channels, channels, 1))
+
+ def forward(self, x):
+ return checkpoint(self._forward, (x,), self.parameters(), True) # TODO: check checkpoint usage, is True # TODO: fix the .half call!!!
+ #return pt_checkpoint(self._forward, x) # pytorch
+
+ def _forward(self, x):
+ b, c, *spatial = x.shape
+ x = x.reshape(b, c, -1)
+ qkv = self.qkv(self.norm(x))
+ h = self.attention(qkv)
+ h = self.proj_out(h)
+ return (x + h).reshape(b, c, *spatial)
+
+
+def count_flops_attn(model, _x, y):
+ """
+ A counter for the `thop` package to count the operations in an
+ attention operation.
+ Meant to be used like:
+ macs, params = thop.profile(
+ model,
+ inputs=(inputs, timestamps),
+ custom_ops={QKVAttention: QKVAttention.count_flops},
+ )
+ """
+ b, c, *spatial = y[0].shape
+ num_spatial = int(np.prod(spatial))
+ # We perform two matmuls with the same number of ops.
+ # The first computes the weight matrix, the second computes
+ # the combination of the value vectors.
+ matmul_ops = 2 * b * (num_spatial ** 2) * c
+ model.total_ops += th.DoubleTensor([matmul_ops])
+
+
+class QKVAttentionLegacy(nn.Module):
+ """
+ A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping
+ """
+
+ def __init__(self, n_heads):
+ super().__init__()
+ self.n_heads = n_heads
+
+ def forward(self, qkv):
+ """
+ Apply QKV attention.
+ :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
+ :return: an [N x (H * C) x T] tensor after attention.
+ """
+ bs, width, length = qkv.shape
+ assert width % (3 * self.n_heads) == 0
+ ch = width // (3 * self.n_heads)
+ q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
+ scale = 1 / math.sqrt(math.sqrt(ch))
+ weight = th.einsum(
+ "bct,bcs->bts", q * scale, k * scale
+ ) # More stable with f16 than dividing afterwards
+ weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
+ a = th.einsum("bts,bcs->bct", weight, v)
+ return a.reshape(bs, -1, length)
+
+ @staticmethod
+ def count_flops(model, _x, y):
+ return count_flops_attn(model, _x, y)
+
+
+class QKVAttention(nn.Module):
+ """
+ A module which performs QKV attention and splits in a different order.
+ """
+
+ def __init__(self, n_heads):
+ super().__init__()
+ self.n_heads = n_heads
+
+ def forward(self, qkv):
+ """
+ Apply QKV attention.
+ :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
+ :return: an [N x (H * C) x T] tensor after attention.
+ """
+ bs, width, length = qkv.shape
+ assert width % (3 * self.n_heads) == 0
+ ch = width // (3 * self.n_heads)
+ q, k, v = qkv.chunk(3, dim=1)
+ scale = 1 / math.sqrt(math.sqrt(ch))
+ weight = th.einsum(
+ "bct,bcs->bts",
+ (q * scale).view(bs * self.n_heads, ch, length),
+ (k * scale).view(bs * self.n_heads, ch, length),
+ ) # More stable with f16 than dividing afterwards
+ weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
+ a = th.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length))
+ return a.reshape(bs, -1, length)
+
+ @staticmethod
+ def count_flops(model, _x, y):
+ return count_flops_attn(model, _x, y)
+
+
+class UNetModelEncode(nn.Module):
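+ """Encode half of the split UNet: computes the timestep embedding and runs the input and middle blocks, returning (h, emb, hs) for the decode half."""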
+
+
+ def __init__(
+ self,
+ image_size,
+ in_channels,
+ model_channels,
+ out_channels,
+ num_res_blocks,
+ attention_resolutions,
+ dropout=0,
+ channel_mult=(1, 2, 4, 8),
+ conv_resample=True,
+ dims=2,
+ num_classes=None,
+ use_checkpoint=False,
+ use_fp16=False,
+ num_heads=-1,
+ num_head_channels=-1,
+ num_heads_upsample=-1,
+ use_scale_shift_norm=False,
+ resblock_updown=False,
+ use_new_attention_order=False,
+ use_spatial_transformer=False, # custom transformer support
+ transformer_depth=1, # custom transformer support
+ context_dim=None, # custom transformer support
+ n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model
+ legacy=True,
+ ):
+ super().__init__()
+ if use_spatial_transformer:
+ assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'
+
+ if context_dim is not None:
+ assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
+ from omegaconf.listconfig import ListConfig
+ if type(context_dim) == ListConfig:
+ context_dim = list(context_dim)
+
+ if num_heads_upsample == -1:
+ num_heads_upsample = num_heads
+
+ if num_heads == -1:
+ assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'
+
+ if num_head_channels == -1:
+ assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'
+
+ self.image_size = image_size
+ self.in_channels = in_channels
+ self.model_channels = model_channels
+ self.out_channels = out_channels
+ self.num_res_blocks = num_res_blocks
+ self.attention_resolutions = attention_resolutions
+ self.dropout = dropout
+ self.channel_mult = channel_mult
+ self.conv_resample = conv_resample
+ self.num_classes = num_classes
+ self.use_checkpoint = use_checkpoint
+ self.dtype = th.float16 if use_fp16 else th.float32
+ self.num_heads = num_heads
+ self.num_head_channels = num_head_channels
+ self.num_heads_upsample = num_heads_upsample
+ self.predict_codebook_ids = n_embed is not None
+
+ time_embed_dim = model_channels * 4
+ self.time_embed = nn.Sequential(
+ linear(model_channels, time_embed_dim),
+ nn.SiLU(),
+ linear(time_embed_dim, time_embed_dim),
+ )
+
+ if self.num_classes is not None:
+ self.label_emb = nn.Embedding(num_classes, time_embed_dim)
+
+ self.input_blocks = nn.ModuleList(
+ [
+ TimestepEmbedSequential(
+ conv_nd(dims, in_channels, model_channels, 3, padding=1)
+ )
+ ]
+ )
+ self._feature_size = model_channels
+ input_block_chans = [model_channels]
+ ch = model_channels
+ ds = 1
+ for level, mult in enumerate(channel_mult):
+ for _ in range(num_res_blocks):
+ layers = [
+ ResBlock(
+ ch,
+ time_embed_dim,
+ dropout,
+ out_channels=mult * model_channels,
+ dims=dims,
+ use_checkpoint=use_checkpoint,
+ use_scale_shift_norm=use_scale_shift_norm,
+ )
+ ]
+ ch = mult * model_channels
+ if ds in attention_resolutions:
+ if num_head_channels == -1:
+ dim_head = ch // num_heads
+ else:
+ num_heads = ch // num_head_channels
+ dim_head = num_head_channels
+ if legacy:
+ #num_heads = 1
+ dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
+ layers.append(
+ AttentionBlock(
+ ch,
+ use_checkpoint=use_checkpoint,
+ num_heads=num_heads,
+ num_head_channels=dim_head,
+ use_new_attention_order=use_new_attention_order,
+ ) if not use_spatial_transformer else SpatialTransformer(
+ ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim
+ )
+ )
+ self.input_blocks.append(TimestepEmbedSequential(*layers))
+ self._feature_size += ch
+ input_block_chans.append(ch)
+ if level != len(channel_mult) - 1:
+ out_ch = ch
+ self.input_blocks.append(
+ TimestepEmbedSequential(
+ ResBlock(
+ ch,
+ time_embed_dim,
+ dropout,
+ out_channels=out_ch,
+ dims=dims,
+ use_checkpoint=use_checkpoint,
+ use_scale_shift_norm=use_scale_shift_norm,
+ down=True,
+ )
+ if resblock_updown
+ else Downsample(
+ ch, conv_resample, dims=dims, out_channels=out_ch
+ )
+ )
+ )
+ ch = out_ch
+ input_block_chans.append(ch)
+ ds *= 2
+ self._feature_size += ch
+
+ if num_head_channels == -1:
+ dim_head = ch // num_heads
+ else:
+ num_heads = ch // num_head_channels
+ dim_head = num_head_channels
+ if legacy:
+ #num_heads = 1
+ dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
+ self.middle_block = TimestepEmbedSequential(
+ ResBlock(
+ ch,
+ time_embed_dim,
+ dropout,
+ dims=dims,
+ use_checkpoint=use_checkpoint,
+ use_scale_shift_norm=use_scale_shift_norm,
+ ),
+ AttentionBlock(
+ ch,
+ use_checkpoint=use_checkpoint,
+ num_heads=num_heads,
+ num_head_channels=dim_head,
+ use_new_attention_order=use_new_attention_order,
+ ) if not use_spatial_transformer else SpatialTransformer(
+ ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim
+ ),
+ ResBlock(
+ ch,
+ time_embed_dim,
+ dropout,
+ dims=dims,
+ use_checkpoint=use_checkpoint,
+ use_scale_shift_norm=use_scale_shift_norm,
+ ),
+ )
+ self._feature_size += ch
+
+ def forward(self, x, timesteps=None, context=None, y=None):
+ """
+ Apply the model to an input batch.
+ :param x: an [N x C x ...] Tensor of inputs.
+ :param timesteps: a 1-D batch of timesteps.
+ :param context: conditioning plugged in via crossattn
+ :param y: an [N] Tensor of labels, if class-conditional.
+ :return: an [N x C x ...] Tensor of outputs.
+ """
+ assert (y is not None) == (
+ self.num_classes is not None
+ ), "must specify y if and only if the model is class-conditional"
+ hs = []
+ t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
+ emb = self.time_embed(t_emb)
+
+ if self.num_classes is not None:
+ assert y.shape == (x.shape[0],)
+ emb = emb + self.label_emb(y)
+
+ h = x.type(self.dtype)
+ for module in self.input_blocks:
+ h = module(h, emb, context)
+ hs.append(h)
+ h = self.middle_block(h, emb, context)
+
+ return h, emb, hs
+
+
+class UNetModelDecode(nn.Module):
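+ """Decode half of the split UNet: consumes (h, emb, hs) from the encode half and runs the output blocks with skip connections to produce the final prediction."""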
+
+
+ def __init__(
+ self,
+ image_size,
+ in_channels,
+ model_channels,
+ out_channels,
+ num_res_blocks,
+ attention_resolutions,
+ dropout=0,
+ channel_mult=(1, 2, 4, 8),
+ conv_resample=True,
+ dims=2,
+ num_classes=None,
+ use_checkpoint=False,
+ use_fp16=False,
+ num_heads=-1,
+ num_head_channels=-1,
+ num_heads_upsample=-1,
+ use_scale_shift_norm=False,
+ resblock_updown=False,
+ use_new_attention_order=False,
+ use_spatial_transformer=False, # custom transformer support
+ transformer_depth=1, # custom transformer support
+ context_dim=None, # custom transformer support
+ n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model
+ legacy=True,
+ ):
+ super().__init__()
+ if use_spatial_transformer:
+ assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'
+
+ if context_dim is not None:
+ assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
+ from omegaconf.listconfig import ListConfig
+ if type(context_dim) == ListConfig:
+ context_dim = list(context_dim)
+
+ if num_heads_upsample == -1:
+ num_heads_upsample = num_heads
+
+ if num_heads == -1:
+ assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'
+
+ if num_head_channels == -1:
+ assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'
+
+ self.image_size = image_size
+ self.in_channels = in_channels
+ self.model_channels = model_channels
+ self.out_channels = out_channels
+ self.num_res_blocks = num_res_blocks
+ self.attention_resolutions = attention_resolutions
+ self.dropout = dropout
+ self.channel_mult = channel_mult
+ self.conv_resample = conv_resample
+ self.num_classes = num_classes
+ self.use_checkpoint = use_checkpoint
+ self.dtype = th.float16 if use_fp16 else th.float32
+ self.num_heads = num_heads
+ self.num_head_channels = num_head_channels
+ self.num_heads_upsample = num_heads_upsample
+ self.predict_codebook_ids = n_embed is not None
+
+ time_embed_dim = model_channels * 4
+
+ self._feature_size = model_channels
+ input_block_chans = [model_channels]
+ ch = model_channels
+ ds = 1
+ for level, mult in enumerate(channel_mult):
+ for _ in range(num_res_blocks):
+
+ ch = mult * model_channels
+ if ds in attention_resolutions:
+ if num_head_channels == -1:
+ dim_head = ch // num_heads
+ else:
+ num_heads = ch // num_head_channels
+ dim_head = num_head_channels
+ if legacy:
+ #num_heads = 1
+ dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
+
+ self._feature_size += ch
+ input_block_chans.append(ch)
+ if level != len(channel_mult) - 1:
+ out_ch = ch
+
+ ch = out_ch
+ input_block_chans.append(ch)
+ ds *= 2
+ self._feature_size += ch
+
+ if num_head_channels == -1:
+ dim_head = ch // num_heads
+ else:
+ num_heads = ch // num_head_channels
+ dim_head = num_head_channels
+ if legacy:
+ #num_heads = 1
+ dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
+
+ self._feature_size += ch
+
+ self.output_blocks = nn.ModuleList([])
+ for level, mult in list(enumerate(channel_mult))[::-1]:
+ for i in range(num_res_blocks + 1):
+ ich = input_block_chans.pop()
+ layers = [
+ ResBlock(
+ ch + ich,
+ time_embed_dim,
+ dropout,
+ out_channels=model_channels * mult,
+ dims=dims,
+ use_checkpoint=use_checkpoint,
+ use_scale_shift_norm=use_scale_shift_norm,
+ )
+ ]
+ ch = model_channels * mult
+ if ds in attention_resolutions:
+ if num_head_channels == -1:
+ dim_head = ch // num_heads
+ else:
+ num_heads = ch // num_head_channels
+ dim_head = num_head_channels
+ if legacy:
+ #num_heads = 1
+ dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
+ layers.append(
+ AttentionBlock(
+ ch,
+ use_checkpoint=use_checkpoint,
+ num_heads=num_heads_upsample,
+ num_head_channels=dim_head,
+ use_new_attention_order=use_new_attention_order,
+ ) if not use_spatial_transformer else SpatialTransformer(
+ ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim
+ )
+ )
+ if level and i == num_res_blocks:
+ out_ch = ch
+ layers.append(
+ ResBlock(
+ ch,
+ time_embed_dim,
+ dropout,
+ out_channels=out_ch,
+ dims=dims,
+ use_checkpoint=use_checkpoint,
+ use_scale_shift_norm=use_scale_shift_norm,
+ up=True,
+ )
+ if resblock_updown
+ else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
+ )
+ ds //= 2
+ self.output_blocks.append(TimestepEmbedSequential(*layers))
+ self._feature_size += ch
+
+ self.out = nn.Sequential(
+ normalization(ch),
+ nn.SiLU(),
+ zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)),
+ )
+ if self.predict_codebook_ids:
+ self.id_predictor = nn.Sequential(
+ normalization(ch),
+ conv_nd(dims, model_channels, n_embed, 1),
+ #nn.LogSoftmax(dim=1) # change to cross_entropy and produce non-normalized logits
+ )
+
+ def forward(self, h,emb,tp,hs, context=None, y=None):
+ """
+ Apply the model to an input batch.
+ :param x: an [N x C x ...] Tensor of inputs.
+ :param timesteps: a 1-D batch of timesteps.
+ :param context: conditioning plugged in via crossattn
+ :param y: an [N] Tensor of labels, if class-conditional.
+ :return: an [N x C x ...] Tensor of outputs.
+ """
+
+ for module in self.output_blocks:
+ h = th.cat([h, hs.pop()], dim=1)
+ h = module(h, emb, context)
+ h = h.type(tp)
+ if self.predict_codebook_ids:
+ return self.id_predictor(h)
+ else:
+ return self.out(h)
diff --git a/optimizedSD/optimUtils.py b/optimizedSD/optimUtils.py
new file mode 100644
index 0000000000000000000000000000000000000000..18b996792a27b6f628bda578bb2a3ec64f406f23
--- /dev/null
+++ b/optimizedSD/optimUtils.py
@@ -0,0 +1,73 @@
+import os
+import pandas as pd
+
+
+def split_weighted_subprompts(text):
+ """
+ grabs all text up to the first occurrence of ':'
+ uses the grabbed text as a sub-prompt, and takes the value following ':' as weight
+ if ':' has no value defined, defaults to 1.0
+ repeats until no text remaining
+ """
+ remaining = len(text)
+ prompts = []
+ weights = []
+ while remaining > 0:
+ if ":" in text:
+ idx = text.index(":") # first occurrence from start
+ # grab up to index as sub-prompt
+ prompt = text[:idx]
+ remaining -= idx
+ # remove from main text
+ text = text[idx+1:]
+ # find value for weight
+ if " " in text:
+ idx = text.index(" ") # first occurence
+ else: # no space, read to end
+ idx = len(text)
+ if idx != 0:
+ try:
+ weight = float(text[:idx])
+ except: # couldn't treat as float
+ print(f"Warning: '{text[:idx]}' is not a value, are you missing a space?")
+ weight = 1.0
+ else: # no value found
+ weight = 1.0
+ # remove from main text
+ remaining -= idx
+ text = text[idx+1:]
+ # append the sub-prompt and its weight
+ prompts.append(prompt)
+ weights.append(weight)
+ else: # no : found
+ if len(text) > 0: # there is still text though
+ # take remainder as weight 1
+ prompts.append(text)
+ weights.append(1.0)
+ remaining = 0
+ return prompts, weights
+
+def logger(params, log_csv):
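+ """Append one row per run to log_csv, creating the file and adding any missing columns on the fly."""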
+ os.makedirs('logs', exist_ok=True)
+ cols = [arg for arg, _ in params.items()]
+ if not os.path.exists(log_csv):
+ df = pd.DataFrame(columns=cols)
+ df.to_csv(log_csv, index=False)
+
+ df = pd.read_csv(log_csv)
+ for arg in cols:
+ if arg not in df.columns:
+ df[arg] = ""
+ df.to_csv(log_csv, index = False)
+
+ li = {}
+ cols = [col for col in df.columns]
+ data = {arg:value for arg, value in params.items()}
+ for col in cols:
+ if col in data:
+ li[col] = data[col]
+ else:
+ li[col] = ''
+
+ df = pd.DataFrame(li,index = [0])
+ df.to_csv(log_csv,index=False, mode='a', header=False)
\ No newline at end of file
diff --git a/optimizedSD/optimized_img2img.py b/optimizedSD/optimized_img2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..d76d4798fc9e3fe867c172704c15b4f8800c6e2a
--- /dev/null
+++ b/optimizedSD/optimized_img2img.py
@@ -0,0 +1,362 @@
+import argparse, os, re
+import torch
+import numpy as np
+from random import randint
+from omegaconf import OmegaConf
+from PIL import Image
+from tqdm import tqdm, trange
+from itertools import islice
+from einops import rearrange
+from torchvision.utils import make_grid
+import time
+from pytorch_lightning import seed_everything
+from torch import autocast
+from contextlib import contextmanager, nullcontext
+from einops import rearrange, repeat
+from ldmlib.util import instantiate_from_config
+from optimUtils import split_weighted_subprompts, logger
+from transformers import logging
+import pandas as pd
+logging.set_verbosity_error()
+
+
+def chunk(it, size):
+ it = iter(it)
+ return iter(lambda: tuple(islice(it, size)), ())
+
+
+def load_model_from_config(ckpt, verbose=False):
+ print(f"Loading model from {ckpt}")
+ pl_sd = torch.load(ckpt, map_location="cpu")
+ if "global_step" in pl_sd:
+ print(f"Global Step: {pl_sd['global_step']}")
+ sd = pl_sd["state_dict"]
+ return sd
+
+
+def load_img(path, h0, w0):
+
+ image = Image.open(path).convert("RGB")
+ w, h = image.size
+
+ print(f"loaded input image of size ({w}, {h}) from {path}")
+ if h0 is not None and w0 is not None:
+ h, w = h0, w0
+
+ w, h = map(lambda x: x - x % 64, (w, h)) # resize to integer multiple of 64
+
+ print(f"New image size ({w}, {h})")
+ image = image.resize((w, h), resample=Image.LANCZOS)
+ image = np.array(image).astype(np.float32) / 255.0
+ image = image[None].transpose(0, 3, 1, 2)
+ image = torch.from_numpy(image)
+ return 2.0 * image - 1.0
+
+
+config = "optimizedSD/v1-inference.yaml"
+ckpt = "models/ldm/stable-diffusion-v1/model.ckpt"
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument(
+ "--prompt", type=str, nargs="?", default="a painting of a virus monster playing guitar", help="the prompt to render"
+)
+parser.add_argument("--outdir", type=str, nargs="?", help="dir to write results to", default="outputs/img2img-samples")
+parser.add_argument("--init-img", type=str, nargs="?", help="path to the input image")
+
+parser.add_argument(
+ "--skip_grid",
+ action="store_true",
+ help="do not save a grid, only individual samples. Helpful when evaluating lots of samples",
+)
+parser.add_argument(
+ "--skip_save",
+ action="store_true",
+ help="do not save individual samples. For speed measurements.",
+)
+parser.add_argument(
+ "--ddim_steps",
+ type=int,
+ default=50,
+ help="number of ddim sampling steps",
+)
+
+parser.add_argument(
+ "--ddim_eta",
+ type=float,
+ default=0.0,
+ help="ddim eta (eta=0.0 corresponds to deterministic sampling",
+)
+parser.add_argument(
+ "--n_iter",
+ type=int,
+ default=1,
+ help="sample this often",
+)
+parser.add_argument(
+ "--H",
+ type=int,
+ default=None,
+ help="image height, in pixel space",
+)
+parser.add_argument(
+ "--W",
+ type=int,
+ default=None,
+ help="image width, in pixel space",
+)
+parser.add_argument(
+ "--strength",
+ type=float,
+ default=0.75,
+ help="strength for noising/unnoising. 1.0 corresponds to full destruction of information in init image",
+)
+parser.add_argument(
+ "--n_samples",
+ type=int,
+ default=5,
+ help="how many samples to produce for each given prompt. A.k.a. batch size",
+)
+parser.add_argument(
+ "--n_rows",
+ type=int,
+ default=0,
+ help="rows in the grid (default: n_samples)",
+)
+parser.add_argument(
+ "--scale",
+ type=float,
+ default=7.5,
+ help="unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))",
+)
+parser.add_argument(
+ "--from-file",
+ type=str,
+ help="if specified, load prompts from this file",
+)
+parser.add_argument(
+ "--seed",
+ type=int,
+ default=None,
+ help="the seed (for reproducible sampling)",
+)
+parser.add_argument(
+ "--device",
+ type=str,
+ default="cuda",
+ help="CPU or GPU (cuda/cuda:0/cuda:1/...)",
+)
+parser.add_argument(
+ "--unet_bs",
+ type=int,
+ default=1,
+ help="Slightly reduces inference time at the expense of high VRAM (value > 1 not recommended )",
+)
+parser.add_argument(
+ "--turbo",
+ action="store_true",
+ help="Reduces inference time on the expense of 1GB VRAM",
+)
+parser.add_argument(
+ "--precision", type=str, help="evaluate at this precision", choices=["full", "autocast"], default="autocast"
+)
+parser.add_argument(
+ "--format",
+ type=str,
+ help="output image format",
+ choices=["jpg", "png"],
+ default="png",
+)
+parser.add_argument(
+ "--sampler",
+ type=str,
+ help="sampler",
+ choices=["ddim"],
+ default="ddim",
+)
+opt = parser.parse_args()
+
+tic = time.time()
+os.makedirs(opt.outdir, exist_ok=True)
+outpath = opt.outdir
+grid_count = len(os.listdir(outpath)) - 1
+
+if opt.seed is None:
+ opt.seed = randint(0, 1000000)
+seed_everything(opt.seed)
+
+# Logging
+logger(vars(opt), log_csv = "logs/img2img_logs.csv")
+
+sd = load_model_from_config(f"{ckpt}")
+li, lo = [], []
+for key, value in sd.items():
+ sp = key.split(".")
+ if (sp[0]) == "model":
+ if "input_blocks" in sp:
+ li.append(key)
+ elif "middle_block" in sp:
+ li.append(key)
+ elif "time_embed" in sp:
+ li.append(key)
+ else:
+ lo.append(key)
+for key in li:
+ sd["model1." + key[6:]] = sd.pop(key)
+for key in lo:
+ sd["model2." + key[6:]] = sd.pop(key)
+
+config = OmegaConf.load(f"{config}")
+
+assert os.path.isfile(opt.init_img)
+init_image = load_img(opt.init_img, opt.H, opt.W).to(opt.device)
+
+model = instantiate_from_config(config.modelUNet)
+_, _ = model.load_state_dict(sd, strict=False)
+model.eval()
+model.cdevice = opt.device
+model.unet_bs = opt.unet_bs
+model.turbo = opt.turbo
+
+modelCS = instantiate_from_config(config.modelCondStage)
+_, _ = modelCS.load_state_dict(sd, strict=False)
+modelCS.eval()
+modelCS.cond_stage_model.device = opt.device
+
+modelFS = instantiate_from_config(config.modelFirstStage)
+_, _ = modelFS.load_state_dict(sd, strict=False)
+modelFS.eval()
+del sd
+if opt.device != "cpu" and opt.precision == "autocast":
+ model.half()
+ modelCS.half()
+ modelFS.half()
+ init_image = init_image.half()
+
+batch_size = opt.n_samples
+n_rows = opt.n_rows if opt.n_rows > 0 else batch_size
+if not opt.from_file:
+ assert opt.prompt is not None
+ prompt = opt.prompt
+ data = [batch_size * [prompt]]
+
+else:
+ print(f"reading prompts from {opt.from_file}")
+ with open(opt.from_file, "r") as f:
+ data = f.read().splitlines()
+ data = batch_size * list(data)
+ data = list(chunk(sorted(data), batch_size))
+
+modelFS.to(opt.device)
+
+init_image = repeat(init_image, "1 ... -> b ...", b=batch_size)
+init_latent = modelFS.get_first_stage_encoding(modelFS.encode_first_stage(init_image)) # move to latent space
+
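+# offload the first-stage (VAE) model back to the CPU and wait until its VRAM is actually released before sampling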
+if opt.device != "cpu":
+ mem = torch.cuda.memory_allocated(device=opt.device) / 1e6
+ modelFS.to("cpu")
+ while torch.cuda.memory_allocated(device=opt.device) / 1e6 >= mem:
+ time.sleep(1)
+
+
+assert 0.0 <= opt.strength <= 1.0, "can only work with strength in [0.0, 1.0]"
+t_enc = int(opt.strength * opt.ddim_steps)
+print(f"target t_enc is {t_enc} steps")
+
+
+if opt.precision == "autocast" and opt.device != "cpu":
+ precision_scope = autocast
+else:
+ precision_scope = nullcontext
+
+seeds = ""
+with torch.no_grad():
+
+ all_samples = list()
+ for n in trange(opt.n_iter, desc="Sampling"):
+ for prompts in tqdm(data, desc="data"):
+
+ sample_path = os.path.join(outpath, "_".join(re.split(":| ", prompts[0])))[:150]
+ os.makedirs(sample_path, exist_ok=True)
+ base_count = len(os.listdir(sample_path))
+
+ with precision_scope("cuda"):
+ modelCS.to(opt.device)
+ uc = None
+ if opt.scale != 1.0:
+ uc = modelCS.get_learned_conditioning(batch_size * [""])
+ if isinstance(prompts, tuple):
+ prompts = list(prompts)
+
+ subprompts, weights = split_weighted_subprompts(prompts[0])
+ if len(subprompts) > 1:
+ c = torch.zeros_like(uc)
+ totalWeight = sum(weights)
+ # normalize each "sub prompt" and add it
+ for i in range(len(subprompts)):
+ weight = weights[i]
+ # if not skip_normalize:
+ weight = weight / totalWeight
+ c = torch.add(c, modelCS.get_learned_conditioning(subprompts[i]), alpha=weight)
+ else:
+ c = modelCS.get_learned_conditioning(prompts)
+
+ if opt.device != "cpu":
+ mem = torch.cuda.memory_allocated(device=opt.device) / 1e6
+ modelCS.to("cpu")
+ while torch.cuda.memory_allocated(device=opt.device) / 1e6 >= mem:
+ time.sleep(1)
+
+ # encode (scaled latent)
+ z_enc = model.stochastic_encode(
+ init_latent,
+ torch.tensor([t_enc] * batch_size).to(opt.device),
+ opt.seed,
+ opt.ddim_eta,
+ opt.ddim_steps,
+ )
+ # decode it
+ samples_ddim = model.sample(
+ t_enc,
+ c,
+ z_enc,
+ unconditional_guidance_scale=opt.scale,
+ unconditional_conditioning=uc,
+ sampler = opt.sampler
+ )
+
+ modelFS.to(opt.device)
+ print("saving images")
+ for i in range(batch_size):
+
+ x_samples_ddim = modelFS.decode_first_stage(samples_ddim[i].unsqueeze(0))
+ x_sample = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
+ x_sample = 255.0 * rearrange(x_sample[0].cpu().numpy(), "c h w -> h w c")
+ Image.fromarray(x_sample.astype(np.uint8)).save(
+ os.path.join(sample_path, "seed_" + str(opt.seed) + "_" + f"{base_count:05}.{opt.format}")
+ )
+ seeds += str(opt.seed) + ","
+ opt.seed += 1
+ base_count += 1
+
+ if opt.device != "cpu":
+ mem = torch.cuda.memory_allocated(device=opt.device) / 1e6
+ modelFS.to("cpu")
+ while torch.cuda.memory_allocated(device=opt.device) / 1e6 >= mem:
+ time.sleep(1)
+
+ del samples_ddim
+ print("memory_final = ", torch.cuda.memory_allocated(device=opt.device) / 1e6)
+
+toc = time.time()
+
+time_taken = (toc - tic) / 60.0
+
+print(
+ (
+ "Samples finished in {0:.2f} minutes and exported to "
+ + sample_path
+ + "\n Seeds used = "
+ + seeds[:-1]
+ ).format(time_taken)
+)
diff --git a/optimizedSD/optimized_txt2img.py b/optimizedSD/optimized_txt2img.py
new file mode 100644
index 0000000000000000000000000000000000000000..5022cac9f59183556811526400d585d8e82831d4
--- /dev/null
+++ b/optimizedSD/optimized_txt2img.py
@@ -0,0 +1,347 @@
+import argparse, os, re
+import torch
+import numpy as np
+from random import randint
+from omegaconf import OmegaConf
+from PIL import Image
+from tqdm import tqdm, trange
+from itertools import islice
+from einops import rearrange
+from torchvision.utils import make_grid
+import time
+from pytorch_lightning import seed_everything
+from torch import autocast
+from contextlib import contextmanager, nullcontext
+from ldmlib.util import instantiate_from_config
+from optimUtils import split_weighted_subprompts, logger
+from transformers import logging
+# from samplers import CompVisDenoiser
+logging.set_verbosity_error()
+
+
+def chunk(it, size):
+ it = iter(it)
+ return iter(lambda: tuple(islice(it, size)), ())
+
+
+def load_model_from_config(ckpt, verbose=False):
+ print(f"Loading model from {ckpt}")
+ pl_sd = torch.load(ckpt, map_location="cpu")
+ if "global_step" in pl_sd:
+ print(f"Global Step: {pl_sd['global_step']}")
+ sd = pl_sd["state_dict"]
+ return sd
+
+
+config = "optimizedSD/v1-inference.yaml"
+DEFAULT_CKPT = "models/ldm/stable-diffusion-v1/model.ckpt"
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument(
+ "--prompt", type=str, nargs="?", default="a painting of a virus monster playing guitar", help="the prompt to render"
+)
+parser.add_argument("--outdir", type=str, nargs="?", help="dir to write results to", default="outputs/txt2img-samples")
+parser.add_argument(
+ "--skip_grid",
+ action="store_true",
+ help="do not save a grid, only individual samples. Helpful when evaluating lots of samples",
+)
+parser.add_argument(
+ "--skip_save",
+ action="store_true",
+ help="do not save individual samples. For speed measurements.",
+)
+parser.add_argument(
+ "--ddim_steps",
+ type=int,
+ default=50,
+ help="number of ddim sampling steps",
+)
+
+parser.add_argument(
+ "--fixed_code",
+ action="store_true",
+ help="if enabled, uses the same starting code across samples ",
+)
+parser.add_argument(
+ "--ddim_eta",
+ type=float,
+ default=0.0,
+    help="ddim eta (eta=0.0 corresponds to deterministic sampling)",
+)
+parser.add_argument(
+ "--n_iter",
+ type=int,
+ default=1,
+    help="how many times to repeat sampling (each iteration produces n_samples images)",
+)
+parser.add_argument(
+ "--H",
+ type=int,
+ default=512,
+ help="image height, in pixel space",
+)
+parser.add_argument(
+ "--W",
+ type=int,
+ default=512,
+ help="image width, in pixel space",
+)
+parser.add_argument(
+ "--C",
+ type=int,
+ default=4,
+ help="latent channels",
+)
+parser.add_argument(
+ "--f",
+ type=int,
+ default=8,
+ help="downsampling factor",
+)
+parser.add_argument(
+ "--n_samples",
+ type=int,
+ default=5,
+ help="how many samples to produce for each given prompt. A.k.a. batch size",
+)
+parser.add_argument(
+ "--n_rows",
+ type=int,
+ default=0,
+ help="rows in the grid (default: n_samples)",
+)
+parser.add_argument(
+ "--scale",
+ type=float,
+ default=7.5,
+ help="unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))",
+)
+parser.add_argument(
+ "--device",
+ type=str,
+ default="cuda",
+ help="specify GPU (cuda/cuda:0/cuda:1/...)",
+)
+parser.add_argument(
+ "--from-file",
+ type=str,
+ help="if specified, load prompts from this file",
+)
+parser.add_argument(
+ "--seed",
+ type=int,
+ default=None,
+ help="the seed (for reproducible sampling)",
+)
+parser.add_argument(
+ "--unet_bs",
+ type=int,
+ default=1,
+    help="Slightly reduces inference time at the expense of higher VRAM usage (values > 1 are not recommended)",
+)
+parser.add_argument(
+ "--turbo",
+ action="store_true",
+    help="Reduces inference time at the expense of about 1 GB of extra VRAM",
+)
+parser.add_argument(
+ "--precision",
+ type=str,
+ help="evaluate at this precision",
+ choices=["full", "autocast"],
+ default="autocast"
+)
+parser.add_argument(
+ "--format",
+ type=str,
+ help="output image format",
+ choices=["jpg", "png"],
+ default="png",
+)
+parser.add_argument(
+ "--sampler",
+ type=str,
+ help="sampler",
+ choices=["ddim", "plms","heun", "euler", "euler_a", "dpm2", "dpm2_a", "lms"],
+ default="plms",
+)
+parser.add_argument(
+ "--ckpt",
+ type=str,
+ help="path to checkpoint of model",
+ default=DEFAULT_CKPT,
+)
+opt = parser.parse_args()
+
+tic = time.time()
+os.makedirs(opt.outdir, exist_ok=True)
+outpath = opt.outdir
+grid_count = len(os.listdir(outpath)) - 1
+
+if opt.seed is None:
+ opt.seed = randint(0, 1000000)
+seed_everything(opt.seed)
+
+# Logging
+logger(vars(opt), log_csv="logs/txt2img_logs.csv")
+
+sd = load_model_from_config(f"{opt.ckpt}")
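+# remap the checkpoint's UNet keys onto the split model1/model2 halves expected by the optimized UNet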
+li, lo = [], []
+for key, value in sd.items():
+ sp = key.split(".")
+ if (sp[0]) == "model":
+ if "input_blocks" in sp:
+ li.append(key)
+ elif "middle_block" in sp:
+ li.append(key)
+ elif "time_embed" in sp:
+ li.append(key)
+ else:
+ lo.append(key)
+for key in li:
+ sd["model1." + key[6:]] = sd.pop(key)
+for key in lo:
+ sd["model2." + key[6:]] = sd.pop(key)
+
+config = OmegaConf.load(f"{config}")
+
+model = instantiate_from_config(config.modelUNet)
+_, _ = model.load_state_dict(sd, strict=False)
+model.eval()
+model.unet_bs = opt.unet_bs
+model.cdevice = opt.device
+model.turbo = opt.turbo
+
+modelCS = instantiate_from_config(config.modelCondStage)
+_, _ = modelCS.load_state_dict(sd, strict=False)
+modelCS.eval()
+modelCS.cond_stage_model.device = opt.device
+
+modelFS = instantiate_from_config(config.modelFirstStage)
+_, _ = modelFS.load_state_dict(sd, strict=False)
+modelFS.eval()
+del sd
+
+if opt.device != "cpu" and opt.precision == "autocast":
+ model.half()
+ modelCS.half()
+
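+# with --fixed_code every batch starts from the same initial latent noise; otherwise the sampler draws fresh noise per run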
+start_code = None
+if opt.fixed_code:
+ start_code = torch.randn([opt.n_samples, opt.C, opt.H // opt.f, opt.W // opt.f], device=opt.device)
+
+
+batch_size = opt.n_samples
+n_rows = opt.n_rows if opt.n_rows > 0 else batch_size
+if not opt.from_file:
+ assert opt.prompt is not None
+ prompt = opt.prompt
+ print(f"Using prompt: {prompt}")
+ data = [batch_size * [prompt]]
+
+else:
+ print(f"reading prompts from {opt.from_file}")
+ with open(opt.from_file, "r") as f:
+ text = f.read()
+ print(f"Using prompt: {text.strip()}")
+ data = text.splitlines()
+ data = batch_size * list(data)
+ data = list(chunk(sorted(data), batch_size))
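+    # each prompt is repeated batch_size times; after sorting, chunking yields batches that each contain a single prompt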
+
+
+if opt.precision == "autocast" and opt.device != "cpu":
+ precision_scope = autocast
+else:
+ precision_scope = nullcontext
+
+seeds = ""
+with torch.no_grad():
+
+ all_samples = list()
+ for n in trange(opt.n_iter, desc="Sampling"):
+ for prompts in tqdm(data, desc="data"):
+
+ sample_path = os.path.join(outpath, "_".join(re.split(":| ", prompts[0])))[:150]
+ os.makedirs(sample_path, exist_ok=True)
+ base_count = len(os.listdir(sample_path))
+
+ with precision_scope("cuda"):
+ modelCS.to(opt.device)
+ uc = None
+ if opt.scale != 1.0:
+ uc = modelCS.get_learned_conditioning(batch_size * [""])
+ if isinstance(prompts, tuple):
+ prompts = list(prompts)
+
+ subprompts, weights = split_weighted_subprompts(prompts[0])
+ if len(subprompts) > 1:
+ c = torch.zeros_like(uc)
+ totalWeight = sum(weights)
+ # normalize each "sub prompt" and add it
+ for i in range(len(subprompts)):
+ weight = weights[i]
+ # if not skip_normalize:
+ weight = weight / totalWeight
+ c = torch.add(c, modelCS.get_learned_conditioning(subprompts[i]), alpha=weight)
+ else:
+ c = modelCS.get_learned_conditioning(prompts)
+
+ shape = [opt.n_samples, opt.C, opt.H // opt.f, opt.W // opt.f]
+
+ if opt.device != "cpu":
+ mem = torch.cuda.memory_allocated() / 1e6
+ modelCS.to("cpu")
+ while torch.cuda.memory_allocated() / 1e6 >= mem:
+ time.sleep(1)
+
+ samples_ddim = model.sample(
+ S=opt.ddim_steps,
+ conditioning=c,
+ seed=opt.seed,
+ shape=shape,
+ verbose=False,
+ unconditional_guidance_scale=opt.scale,
+ unconditional_conditioning=uc,
+ eta=opt.ddim_eta,
+ x_T=start_code,
+ sampler = opt.sampler,
+ )
+
+ modelFS.to(opt.device)
+
+ print(samples_ddim.shape)
+ print("saving images")
+ for i in range(batch_size):
+
+ x_samples_ddim = modelFS.decode_first_stage(samples_ddim[i].unsqueeze(0))
+ x_sample = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
+ x_sample = 255.0 * rearrange(x_sample[0].cpu().numpy(), "c h w -> h w c")
+ Image.fromarray(x_sample.astype(np.uint8)).save(
+ os.path.join(sample_path, "seed_" + str(opt.seed) + "_" + f"{base_count:05}.{opt.format}")
+ )
+ seeds += str(opt.seed) + ","
+ opt.seed += 1
+ base_count += 1
+
+ if opt.device != "cpu":
+ mem = torch.cuda.memory_allocated() / 1e6
+ modelFS.to("cpu")
+ while torch.cuda.memory_allocated() / 1e6 >= mem:
+ time.sleep(1)
+ del samples_ddim
+ print("memory_final = ", torch.cuda.memory_allocated() / 1e6)
+
+toc = time.time()
+
+time_taken = (toc - tic) / 60.0
+
+print(
+ (
+ "Samples finished in {0:.2f} minutes and exported to "
+ + sample_path
+ + "\n Seeds used = "
+ + seeds[:-1]
+ ).format(time_taken)
+)
diff --git a/optimizedSD/samplers.py b/optimizedSD/samplers.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a68e8e1a1b3d8340b44b59fc6c3994c46de982a
--- /dev/null
+++ b/optimizedSD/samplers.py
@@ -0,0 +1,252 @@
+from scipy import integrate
+import torch
+from tqdm.auto import trange, tqdm
+import torch.nn as nn
+
+
+def append_zero(x):
+ return torch.cat([x, x.new_zeros([1])])
+
+
+def append_dims(x, target_dims):
+ """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
+ dims_to_append = target_dims - x.ndim
+ if dims_to_append < 0:
+ raise ValueError(f'input has {x.ndim} dims but target_dims is {target_dims}, which is less')
+ return x[(...,) + (None,) * dims_to_append]
+
+def get_ancestral_step(sigma_from, sigma_to):
+ """Calculates the noise level (sigma_down) to step down to and the amount
+ of noise to add (sigma_up) when doing an ancestral sampling step."""
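+    # sigma_up is chosen so that sigma_down ** 2 + sigma_up ** 2 == sigma_to ** 2, i.e. the step's total variance is preserved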
+ sigma_up = (sigma_to ** 2 * (sigma_from ** 2 - sigma_to ** 2) / sigma_from ** 2) ** 0.5
+ sigma_down = (sigma_to ** 2 - sigma_up ** 2) ** 0.5
+ return sigma_down, sigma_up
+
+
+class DiscreteSchedule(nn.Module):
+ """A mapping between continuous noise levels (sigmas) and a list of discrete noise
+ levels."""
+
+ def __init__(self, sigmas, quantize):
+ super().__init__()
+ self.register_buffer('sigmas', sigmas)
+ self.quantize = quantize
+
+ def get_sigmas(self, n=None):
+ if n is None:
+ return append_zero(self.sigmas.flip(0))
+ t_max = len(self.sigmas) - 1
+ t = torch.linspace(t_max, 0, n, device=self.sigmas.device)
+ return append_zero(self.t_to_sigma(t))
+
+ def sigma_to_t(self, sigma, quantize=None):
+ quantize = self.quantize if quantize is None else quantize
+ dists = torch.abs(sigma - self.sigmas[:, None])
+ if quantize:
+ return torch.argmin(dists, dim=0).view(sigma.shape)
+ low_idx, high_idx = torch.sort(torch.topk(dists, dim=0, k=2, largest=False).indices, dim=0)[0]
+ low, high = self.sigmas[low_idx], self.sigmas[high_idx]
+ w = (low - sigma) / (low - high)
+ w = w.clamp(0, 1)
+ t = (1 - w) * low_idx + w * high_idx
+ return t.view(sigma.shape)
+
+ def t_to_sigma(self, t):
+ t = t.float()
+ low_idx, high_idx, w = t.floor().long(), t.ceil().long(), t.frac()
+ # print(low_idx, high_idx, w )
+ return (1 - w) * self.sigmas[low_idx] + w * self.sigmas[high_idx]
+
+
+class DiscreteEpsDDPMDenoiser(DiscreteSchedule):
+ """A wrapper for discrete schedule DDPM models that output eps (the predicted
+ noise)."""
+
+ def __init__(self, alphas_cumprod, quantize):
+ super().__init__(((1 - alphas_cumprod) / alphas_cumprod) ** 0.5, quantize)
+ self.sigma_data = 1.
+
+ def get_scalings(self, sigma):
+ c_out = -sigma
+ c_in = 1 / (sigma ** 2 + self.sigma_data ** 2) ** 0.5
+ return c_out, c_in
+
+ def get_eps(self, *args, **kwargs):
+ return self.inner_model(*args, **kwargs)
+
+ def forward(self, input, sigma, **kwargs):
+ c_out, c_in = [append_dims(x, input.ndim) for x in self.get_scalings(sigma)]
+ eps = self.get_eps(input * c_in, self.sigma_to_t(sigma), **kwargs)
+ return input + eps * c_out
+
+class CompVisDenoiser(DiscreteEpsDDPMDenoiser):
+ """A wrapper for CompVis diffusion models."""
+
+ def __init__(self, alphas_cumprod, quantize=False, device='cpu'):
+ super().__init__(alphas_cumprod, quantize=quantize)
+
+ def get_eps(self, *args, **kwargs):
+ return self.inner_model.apply_model(*args, **kwargs)
+
+
+def to_d(x, sigma, denoised):
+ """Converts a denoiser output to a Karras ODE derivative."""
+ return (x - denoised) / append_dims(sigma, x.ndim)
+
+
+
+@torch.no_grad()
+def sample_euler(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.):
+ """Implements Algorithm 2 (Euler steps) from Karras et al. (2022)."""
+ extra_args = {} if extra_args is None else extra_args
+ s_in = x.new_ones([x.shape[0]])
+ for i in trange(len(sigmas) - 1, disable=disable):
+ gamma = min(s_churn / (len(sigmas) - 1), 2 ** 0.5 - 1) if s_tmin <= sigmas[i] <= s_tmax else 0.
+ eps = torch.randn_like(x) * s_noise
+ sigma_hat = sigmas[i] * (gamma + 1)
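+        # when gamma > 0, extra noise is injected to raise the noise level from sigmas[i] to sigma_hat ("churn") before the Euler step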
+ if gamma > 0:
+ x = x + eps * (sigma_hat ** 2 - sigmas[i] ** 2) ** 0.5
+ denoised = model(x, sigma_hat * s_in, **extra_args)
+ d = to_d(x, sigma_hat, denoised)
+ if callback is not None:
+ callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigma_hat, 'denoised': denoised})
+ dt = sigmas[i + 1] - sigma_hat
+ # Euler method
+ x = x + d * dt
+ return x
+
+
+
+@torch.no_grad()
+def sample_euler_ancestral(model, x, sigmas, extra_args=None, callback=None, disable=None):
+ """Ancestral sampling with Euler method steps."""
+ extra_args = {} if extra_args is None else extra_args
+ s_in = x.new_ones([x.shape[0]])
+ for i in trange(len(sigmas) - 1, disable=disable):
+ denoised = model(x, sigmas[i] * s_in, **extra_args)
+ sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1])
+ if callback is not None:
+ callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
+ d = to_d(x, sigmas[i], denoised)
+ # Euler method
+ dt = sigma_down - sigmas[i]
+ x = x + d * dt
+ x = x + torch.randn_like(x) * sigma_up
+ return x
+
+
+@torch.no_grad()
+def sample_heun(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.):
+ """Implements Algorithm 2 (Heun steps) from Karras et al. (2022)."""
+ extra_args = {} if extra_args is None else extra_args
+ s_in = x.new_ones([x.shape[0]])
+ for i in trange(len(sigmas) - 1, disable=disable):
+ gamma = min(s_churn / (len(sigmas) - 1), 2 ** 0.5 - 1) if s_tmin <= sigmas[i] <= s_tmax else 0.
+ eps = torch.randn_like(x) * s_noise
+ sigma_hat = sigmas[i] * (gamma + 1)
+ if gamma > 0:
+ x = x + eps * (sigma_hat ** 2 - sigmas[i] ** 2) ** 0.5
+ denoised = model(x, sigma_hat * s_in, **extra_args)
+ d = to_d(x, sigma_hat, denoised)
+ if callback is not None:
+ callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigma_hat, 'denoised': denoised})
+ dt = sigmas[i + 1] - sigma_hat
+ if sigmas[i + 1] == 0:
+ # Euler method
+ x = x + d * dt
+ else:
+ # Heun's method
+ x_2 = x + d * dt
+ denoised_2 = model(x_2, sigmas[i + 1] * s_in, **extra_args)
+ d_2 = to_d(x_2, sigmas[i + 1], denoised_2)
+ d_prime = (d + d_2) / 2
+ x = x + d_prime * dt
+ return x
+
+
+@torch.no_grad()
+def sample_dpm_2(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.):
+ """A sampler inspired by DPM-Solver-2 and Algorithm 2 from Karras et al. (2022)."""
+ extra_args = {} if extra_args is None else extra_args
+ s_in = x.new_ones([x.shape[0]])
+ for i in trange(len(sigmas) - 1, disable=disable):
+ gamma = min(s_churn / (len(sigmas) - 1), 2 ** 0.5 - 1) if s_tmin <= sigmas[i] <= s_tmax else 0.
+ eps = torch.randn_like(x) * s_noise
+ sigma_hat = sigmas[i] * (gamma + 1)
+ if gamma > 0:
+ x = x + eps * (sigma_hat ** 2 - sigmas[i] ** 2) ** 0.5
+ denoised = model(x, sigma_hat * s_in, **extra_args)
+ d = to_d(x, sigma_hat, denoised)
+ if callback is not None:
+ callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigma_hat, 'denoised': denoised})
+ # Midpoint method, where the midpoint is chosen according to a rho=3 Karras schedule
+ sigma_mid = ((sigma_hat ** (1 / 3) + sigmas[i + 1] ** (1 / 3)) / 2) ** 3
+ dt_1 = sigma_mid - sigma_hat
+ dt_2 = sigmas[i + 1] - sigma_hat
+ x_2 = x + d * dt_1
+ denoised_2 = model(x_2, sigma_mid * s_in, **extra_args)
+ d_2 = to_d(x_2, sigma_mid, denoised_2)
+ x = x + d_2 * dt_2
+ return x
+
+
+@torch.no_grad()
+def sample_dpm_2_ancestral(model, x, sigmas, extra_args=None, callback=None, disable=None):
+ """Ancestral sampling with DPM-Solver inspired second-order steps."""
+ extra_args = {} if extra_args is None else extra_args
+ s_in = x.new_ones([x.shape[0]])
+ for i in trange(len(sigmas) - 1, disable=disable):
+ denoised = model(x, sigmas[i] * s_in, **extra_args)
+ sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1])
+ if callback is not None:
+ callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
+ d = to_d(x, sigmas[i], denoised)
+ # Midpoint method, where the midpoint is chosen according to a rho=3 Karras schedule
+ sigma_mid = ((sigmas[i] ** (1 / 3) + sigma_down ** (1 / 3)) / 2) ** 3
+ dt_1 = sigma_mid - sigmas[i]
+ dt_2 = sigma_down - sigmas[i]
+ x_2 = x + d * dt_1
+ denoised_2 = model(x_2, sigma_mid * s_in, **extra_args)
+ d_2 = to_d(x_2, sigma_mid, denoised_2)
+ x = x + d_2 * dt_2
+ x = x + torch.randn_like(x) * sigma_up
+ return x
+
+
+def linear_multistep_coeff(order, t, i, j):
+ if order - 1 > i:
+ raise ValueError(f'Order {order} too high for step {i}')
+ def fn(tau):
+ prod = 1.
+ for k in range(order):
+ if j == k:
+ continue
+ prod *= (tau - t[i - k]) / (t[i - j] - t[i - k])
+ return prod
+ return integrate.quad(fn, t[i], t[i + 1], epsrel=1e-4)[0]
+
+
+@torch.no_grad()
+def sample_lms(model, x, sigmas, extra_args=None, callback=None, disable=None, order=4):
+ extra_args = {} if extra_args is None else extra_args
+ s_in = x.new_ones([x.shape[0]])
+ ds = []
+ for i in trange(len(sigmas) - 1, disable=disable):
+ denoised = model(x, sigmas[i] * s_in, **extra_args)
+ d = to_d(x, sigmas[i], denoised)
+ ds.append(d)
+ if len(ds) > order:
+ ds.pop(0)
+ if callback is not None:
+ callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
+ cur_order = min(i + 1, order)
+ coeffs = [linear_multistep_coeff(cur_order, sigmas.cpu(), i, j) for j in range(cur_order)]
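+        # Adams-Bashforth style update: combine the last cur_order derivatives using the integrated Lagrange-basis coefficients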
+ x = x + sum(coeff * d for coeff, d in zip(coeffs, reversed(ds)))
+ return x
diff --git a/optimizedSD/splitAttention.py b/optimizedSD/splitAttention.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9df37ba8dd2caeac62fea038946b4aa5a724b7e
--- /dev/null
+++ b/optimizedSD/splitAttention.py
@@ -0,0 +1,280 @@
+from inspect import isfunction
+import math
+import torch
+import torch.nn.functional as F
+from torch import nn, einsum
+from einops import rearrange, repeat
+
+from ldmlib.modules.diffusionmodules.util import checkpoint
+
+
+def exists(val):
+ return val is not None
+
+
+def uniq(arr):
+    return {el: True for el in arr}.keys()
+
+
+def default(val, d):
+ if exists(val):
+ return val
+ return d() if isfunction(d) else d
+
+
+def max_neg_value(t):
+ return -torch.finfo(t.dtype).max
+
+
+def init_(tensor):
+ dim = tensor.shape[-1]
+ std = 1 / math.sqrt(dim)
+ tensor.uniform_(-std, std)
+ return tensor
+
+
+# feedforward
+class GEGLU(nn.Module):
+ def __init__(self, dim_in, dim_out):
+ super().__init__()
+ self.proj = nn.Linear(dim_in, dim_out * 2)
+
+ def forward(self, x):
+ x, gate = self.proj(x).chunk(2, dim=-1)
+ return x * F.gelu(gate)
+
+
+class FeedForward(nn.Module):
+ def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
+ super().__init__()
+ inner_dim = int(dim * mult)
+ dim_out = default(dim_out, dim)
+ project_in = nn.Sequential(
+ nn.Linear(dim, inner_dim),
+ nn.GELU()
+ ) if not glu else GEGLU(dim, inner_dim)
+
+ self.net = nn.Sequential(
+ project_in,
+ nn.Dropout(dropout),
+ nn.Linear(inner_dim, dim_out)
+ )
+
+ def forward(self, x):
+ return self.net(x)
+
+
+def zero_module(module):
+ """
+ Zero out the parameters of a module and return it.
+ """
+ for p in module.parameters():
+ p.detach().zero_()
+ return module
+
+
+def Normalize(in_channels):
+ return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+
+
+class LinearAttention(nn.Module):
+ def __init__(self, dim, heads=4, dim_head=32):
+ super().__init__()
+ self.heads = heads
+ hidden_dim = dim_head * heads
+ self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias = False)
+ self.to_out = nn.Conv2d(hidden_dim, dim, 1)
+
+ def forward(self, x):
+ b, c, h, w = x.shape
+ qkv = self.to_qkv(x)
+ q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads = self.heads, qkv=3)
+ k = k.softmax(dim=-1)
+ context = torch.einsum('bhdn,bhen->bhde', k, v)
+ out = torch.einsum('bhde,bhdn->bhen', context, q)
+ out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=h, w=w)
+ return self.to_out(out)
+
+
+class SpatialSelfAttention(nn.Module):
+ def __init__(self, in_channels):
+ super().__init__()
+ self.in_channels = in_channels
+
+ self.norm = Normalize(in_channels)
+ self.q = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+ self.k = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+ self.v = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+ self.proj_out = torch.nn.Conv2d(in_channels,
+ in_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+
+ def forward(self, x):
+ h_ = x
+ h_ = self.norm(h_)
+ q = self.q(h_)
+ k = self.k(h_)
+ v = self.v(h_)
+
+ # compute attention
+ b,c,h,w = q.shape
+ q = rearrange(q, 'b c h w -> b (h w) c')
+ k = rearrange(k, 'b c h w -> b c (h w)')
+ w_ = torch.einsum('bij,bjk->bik', q, k)
+
+ w_ = w_ * (int(c)**(-0.5))
+ w_ = torch.nn.functional.softmax(w_, dim=2)
+
+ # attend to values
+ v = rearrange(v, 'b c h w -> b c (h w)')
+ w_ = rearrange(w_, 'b i j -> b j i')
+ h_ = torch.einsum('bij,bjk->bik', v, w_)
+ h_ = rearrange(h_, 'b c (h w) -> b c h w', h=h)
+ h_ = self.proj_out(h_)
+
+ return x+h_
+
+
+class CrossAttention(nn.Module):
+ def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0., att_step=1):
+ super().__init__()
+ inner_dim = dim_head * heads
+ context_dim = default(context_dim, query_dim)
+
+ self.scale = dim_head ** -0.5
+ self.heads = heads
+ self.att_step = att_step
+
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
+ self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
+ self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
+
+ self.to_out = nn.Sequential(
+ nn.Linear(inner_dim, query_dim),
+ nn.Dropout(dropout)
+ )
+
+ def forward(self, x, context=None, mask=None):
+ h = self.heads
+
+ q = self.to_q(x)
+ context = default(context, x)
+ k = self.to_k(context)
+ v = self.to_v(context)
+ del context, x
+
+ q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
+
+
+ limit = k.shape[0]
+ att_step = self.att_step
+ q_chunks = list(torch.tensor_split(q, limit//att_step, dim=0))
+ k_chunks = list(torch.tensor_split(k, limit//att_step, dim=0))
+ v_chunks = list(torch.tensor_split(v, limit//att_step, dim=0))
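+        # the attention matrix is built att_step slices of the (batch * heads) dimension at a time to cap peak memory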
+
+ q_chunks.reverse()
+ k_chunks.reverse()
+ v_chunks.reverse()
+ sim = torch.zeros(q.shape[0], q.shape[1], v.shape[2], device=q.device)
+ del k, q, v
+        for i in range(0, limit, att_step):
+
+ q_buffer = q_chunks.pop()
+ k_buffer = k_chunks.pop()
+ v_buffer = v_chunks.pop()
+ sim_buffer = einsum('b i d, b j d -> b i j', q_buffer, k_buffer) * self.scale
+
+ del k_buffer, q_buffer
+ # attention, what we cannot get enough of, by chunks
+
+ sim_buffer = sim_buffer.softmax(dim=-1)
+
+ sim_buffer = einsum('b i j, b j d -> b i d', sim_buffer, v_buffer)
+ del v_buffer
+ sim[i:i+att_step,:,:] = sim_buffer
+
+ del sim_buffer
+ sim = rearrange(sim, '(b h) n d -> b n (h d)', h=h)
+ return self.to_out(sim)
+
+
+class BasicTransformerBlock(nn.Module):
+ def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True):
+ super().__init__()
+ self.attn1 = CrossAttention(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout) # is a self-attention
+ self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
+ self.attn2 = CrossAttention(query_dim=dim, context_dim=context_dim,
+ heads=n_heads, dim_head=d_head, dropout=dropout) # is self-attn if context is none
+ self.norm1 = nn.LayerNorm(dim)
+ self.norm2 = nn.LayerNorm(dim)
+ self.norm3 = nn.LayerNorm(dim)
+ self.checkpoint = checkpoint
+
+ def forward(self, x, context=None):
+ return checkpoint(self._forward, (x, context), self.parameters(), self.checkpoint)
+
+ def _forward(self, x, context=None):
+ x = self.attn1(self.norm1(x)) + x
+ x = self.attn2(self.norm2(x), context=context) + x
+ x = self.ff(self.norm3(x)) + x
+ return x
+
+
+class SpatialTransformer(nn.Module):
+ """
+ Transformer block for image-like data.
+ First, project the input (aka embedding)
+ and reshape to b, t, d.
+ Then apply standard transformer action.
+ Finally, reshape to image
+ """
+ def __init__(self, in_channels, n_heads, d_head,
+ depth=1, dropout=0., context_dim=None):
+ super().__init__()
+ self.in_channels = in_channels
+ inner_dim = n_heads * d_head
+ self.norm = Normalize(in_channels)
+
+ self.proj_in = nn.Conv2d(in_channels,
+ inner_dim,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+
+ self.transformer_blocks = nn.ModuleList(
+ [BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim)
+ for d in range(depth)]
+ )
+
+ self.proj_out = zero_module(nn.Conv2d(inner_dim,
+ in_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0))
+
+ def forward(self, x, context=None):
+ # note: if no context is given, cross-attention defaults to self-attention
+ b, c, h, w = x.shape
+ x_in = x
+ x = self.norm(x)
+ x = self.proj_in(x)
+ x = rearrange(x, 'b c h w -> b (h w) c')
+ for block in self.transformer_blocks:
+ x = block(x, context=context)
+ x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w)
+ x = self.proj_out(x)
+ return x + x_in
diff --git a/optimizedSD/txt2img_gradio.py b/optimizedSD/txt2img_gradio.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8a420dc7f9246cd621ba133eed96ab518d47ccc
--- /dev/null
+++ b/optimizedSD/txt2img_gradio.py
@@ -0,0 +1,250 @@
+import gradio as gr
+import numpy as np
+import torch
+from torchvision.utils import make_grid
+from einops import rearrange
+import os, re
+from PIL import Image
+import pandas as pd
+from random import randint
+from omegaconf import OmegaConf
+from tqdm import tqdm, trange
+from itertools import islice
+import time
+from pytorch_lightning import seed_everything
+from torch import autocast
+from contextlib import nullcontext
+from ldmlib.util import instantiate_from_config
+from optimUtils import split_weighted_subprompts, logger
+from transformers import logging
+logging.set_verbosity_error()
+import mimetypes
+mimetypes.init()
+mimetypes.add_type("application/javascript", ".js")
+
+
+def chunk(it, size):
+ it = iter(it)
+ return iter(lambda: tuple(islice(it, size)), ())
+
+
+def load_model_from_config(ckpt, verbose=False):
+ print(f"Loading model from {ckpt}")
+ pl_sd = torch.load(ckpt, map_location="cpu")
+ if "global_step" in pl_sd:
+ print(f"Global Step: {pl_sd['global_step']}")
+ sd = pl_sd["state_dict"]
+ return sd
+
+config = "optimizedSD/v1-inference.yaml"
+ckpt = "models/ldm/stable-diffusion-v1/model.ckpt"
+sd = load_model_from_config(f"{ckpt}")
+li, lo = [], []
+for key, v_ in sd.items():
+ sp = key.split(".")
+ if (sp[0]) == "model":
+ if "input_blocks" in sp:
+ li.append(key)
+ elif "middle_block" in sp:
+ li.append(key)
+ elif "time_embed" in sp:
+ li.append(key)
+ else:
+ lo.append(key)
+for key in li:
+ sd["model1." + key[6:]] = sd.pop(key)
+for key in lo:
+ sd["model2." + key[6:]] = sd.pop(key)
+
+config = OmegaConf.load(f"{config}")
+
+model = instantiate_from_config(config.modelUNet)
+_, _ = model.load_state_dict(sd, strict=False)
+model.eval()
+
+modelCS = instantiate_from_config(config.modelCondStage)
+_, _ = modelCS.load_state_dict(sd, strict=False)
+modelCS.eval()
+
+modelFS = instantiate_from_config(config.modelFirstStage)
+_, _ = modelFS.load_state_dict(sd, strict=False)
+modelFS.eval()
+del sd
+
+
+def generate(
+ prompt,
+ ddim_steps,
+ n_iter,
+ batch_size,
+ Height,
+ Width,
+ scale,
+ ddim_eta,
+ unet_bs,
+ device,
+ seed,
+ outdir,
+ img_format,
+ turbo,
+ full_precision,
+ sampler,
+):
+
+ C = 4
+ f = 8
+ start_code = None
+ model.unet_bs = unet_bs
+ model.turbo = turbo
+ model.cdevice = device
+ modelCS.cond_stage_model.device = device
+
+ if seed == "":
+ seed = randint(0, 1000000)
+ seed = int(seed)
+ seed_everything(seed)
+ # Logging
+ logger(locals(), "logs/txt2img_gradio_logs.csv")
+
+    if device != "cpu" and not full_precision:
+ model.half()
+ modelFS.half()
+ modelCS.half()
+
+ tic = time.time()
+ os.makedirs(outdir, exist_ok=True)
+ outpath = outdir
+ sample_path = os.path.join(outpath, "_".join(re.split(":| ", prompt)))[:150]
+ os.makedirs(sample_path, exist_ok=True)
+ base_count = len(os.listdir(sample_path))
+
+ # n_rows = opt.n_rows if opt.n_rows > 0 else batch_size
+ assert prompt is not None
+ data = [batch_size * [prompt]]
+
+    if not full_precision and device != "cpu":
+ precision_scope = autocast
+ else:
+ precision_scope = nullcontext
+
+ all_samples = []
+ seeds = ""
+ with torch.no_grad():
+
+ all_samples = list()
+ for _ in trange(n_iter, desc="Sampling"):
+ for prompts in tqdm(data, desc="data"):
+ with precision_scope("cuda"):
+ modelCS.to(device)
+ uc = None
+ if scale != 1.0:
+ uc = modelCS.get_learned_conditioning(batch_size * [""])
+ if isinstance(prompts, tuple):
+ prompts = list(prompts)
+
+ subprompts, weights = split_weighted_subprompts(prompts[0])
+ if len(subprompts) > 1:
+ c = torch.zeros_like(uc)
+ totalWeight = sum(weights)
+ # normalize each "sub prompt" and add it
+ for i in range(len(subprompts)):
+ weight = weights[i]
+ # if not skip_normalize:
+ weight = weight / totalWeight
+ c = torch.add(c, modelCS.get_learned_conditioning(subprompts[i]), alpha=weight)
+ else:
+ c = modelCS.get_learned_conditioning(prompts)
+
+ shape = [batch_size, C, Height // f, Width // f]
+
+ if device != "cpu":
+ mem = torch.cuda.memory_allocated() / 1e6
+ modelCS.to("cpu")
+ while torch.cuda.memory_allocated() / 1e6 >= mem:
+ time.sleep(1)
+
+ samples_ddim = model.sample(
+ S=ddim_steps,
+ conditioning=c,
+ seed=seed,
+ shape=shape,
+ verbose=False,
+ unconditional_guidance_scale=scale,
+ unconditional_conditioning=uc,
+ eta=ddim_eta,
+ x_T=start_code,
+ sampler = sampler,
+ )
+
+ modelFS.to(device)
+ print("saving images")
+ for i in range(batch_size):
+
+ x_samples_ddim = modelFS.decode_first_stage(samples_ddim[i].unsqueeze(0))
+ x_sample = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
+ all_samples.append(x_sample.to("cpu"))
+ x_sample = 255.0 * rearrange(x_sample[0].cpu().numpy(), "c h w -> h w c")
+ Image.fromarray(x_sample.astype(np.uint8)).save(
+ os.path.join(sample_path, "seed_" + str(seed) + "_" + f"{base_count:05}.{img_format}")
+ )
+ seeds += str(seed) + ","
+ seed += 1
+ base_count += 1
+
+ if device != "cpu":
+ mem = torch.cuda.memory_allocated() / 1e6
+ modelFS.to("cpu")
+ while torch.cuda.memory_allocated() / 1e6 >= mem:
+ time.sleep(1)
+
+ del samples_ddim
+ del x_sample
+ del x_samples_ddim
+ print("memory_final = ", torch.cuda.memory_allocated() / 1e6)
+
+ toc = time.time()
+
+ time_taken = (toc - tic) / 60.0
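+    # stitch every generated sample into a single preview grid (n_iter images per row) for the Gradio image output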
+ grid = torch.cat(all_samples, 0)
+ grid = make_grid(grid, nrow=n_iter)
+ grid = 255.0 * rearrange(grid, "c h w -> h w c").cpu().numpy()
+
+ txt = (
+ "Samples finished in "
+ + str(round(time_taken, 3))
+ + " minutes and exported to "
+ + sample_path
+ + "\nSeeds used = "
+ + seeds[:-1]
+ )
+ return Image.fromarray(grid.astype(np.uint8)), txt
+
+
+demo = gr.Interface(
+ fn=generate,
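+    # the inputs below map positionally onto generate(prompt, ddim_steps, n_iter, batch_size, Height, Width,
+    # scale, ddim_eta, unet_bs, device, seed, outdir, img_format, turbo, full_precision, sampler)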
+ inputs=[
+ "text",
+ gr.Slider(1, 1000, value=50),
+ gr.Slider(1, 100, step=1),
+ gr.Slider(1, 100, step=1),
+ gr.Slider(64, 4096, value=512, step=64),
+ gr.Slider(64, 4096, value=512, step=64),
+ gr.Slider(0, 50, value=7.5, step=0.1),
+ gr.Slider(0, 1, step=0.01),
+ gr.Slider(1, 2, value=1, step=1),
+ gr.Text(value="cuda"),
+ "text",
+ gr.Text(value="outputs/txt2img-samples"),
+ gr.Radio(["png", "jpg"], value='png'),
+ "checkbox",
+ "checkbox",
+ gr.Radio(["ddim", "plms","heun", "euler", "euler_a", "dpm2", "dpm2_a", "lms"], value="plms"),
+ ],
+ outputs=["image", "text"],
+)
+demo.launch()
diff --git a/optimizedSD/v1-inference.yaml b/optimizedSD/v1-inference.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c00629fd0ad466117f214703beb28a3d34f62daf
--- /dev/null
+++ b/optimizedSD/v1-inference.yaml
@@ -0,0 +1,114 @@
+modelUNet:
+ base_learning_rate: 1.0e-04
+ target: optimizedSD.ddpm.UNet
+ params:
+ linear_start: 0.00085
+ linear_end: 0.0120
+ num_timesteps_cond: 1
+ log_every_t: 200
+ timesteps: 1000
+ first_stage_key: "jpg"
+ cond_stage_key: "txt"
+ image_size: 64
+ channels: 4
+ cond_stage_trainable: false # Note: different from the one we trained before
+ conditioning_key: crossattn
+ monitor: val/loss_simple_ema
+ scale_factor: 0.18215
+ use_ema: False
+
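+    # the UNet is split into an encoder half and a decoder half (optimizedSD.openaimodelSplit) so the
+    # optimized pipeline can move the two halves on and off the GPU independently to reduce peak VRAM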
+ unetConfigEncode:
+ target: optimizedSD.openaimodelSplit.UNetModelEncode
+ params:
+ image_size: 32 # unused
+ in_channels: 4
+ out_channels: 4
+ model_channels: 320
+ attention_resolutions: [4, 2, 1]
+ num_res_blocks: 2
+ channel_mult: [1, 2, 4, 4]
+ num_heads: 8
+ use_spatial_transformer: True
+ transformer_depth: 1
+ context_dim: 768
+ use_checkpoint: True
+ legacy: False
+
+ unetConfigDecode:
+ target: optimizedSD.openaimodelSplit.UNetModelDecode
+ params:
+ image_size: 32 # unused
+ in_channels: 4
+ out_channels: 4
+ model_channels: 320
+ attention_resolutions: [4, 2, 1]
+ num_res_blocks: 2
+ channel_mult: [1, 2, 4, 4]
+ num_heads: 8
+ use_spatial_transformer: True
+ transformer_depth: 1
+ context_dim: 768
+ use_checkpoint: True
+ legacy: False
+
+modelFirstStage:
+ target: optimizedSD.ddpm.FirstStage
+ params:
+ linear_start: 0.00085
+ linear_end: 0.0120
+ num_timesteps_cond: 1
+ log_every_t: 200
+ timesteps: 1000
+ first_stage_key: "jpg"
+ cond_stage_key: "txt"
+ image_size: 64
+ channels: 4
+ cond_stage_trainable: false # Note: different from the one we trained before
+ conditioning_key: crossattn
+ monitor: val/loss_simple_ema
+ scale_factor: 0.18215
+ use_ema: False
+ first_stage_config:
+ target: ldmlib.models.autoencoder.AutoencoderKL
+ params:
+ embed_dim: 4
+ monitor: val/rec_loss
+ ddconfig:
+ double_z: true
+ z_channels: 4
+ resolution: 256
+ in_channels: 3
+ out_ch: 3
+ ch: 128
+ ch_mult:
+ - 1
+ - 2
+ - 4
+ - 4
+ num_res_blocks: 2
+ attn_resolutions: []
+ dropout: 0.0
+ lossconfig:
+ target: torch.nn.Identity
+
+modelCondStage:
+ target: optimizedSD.ddpm.CondStage
+ params:
+ linear_start: 0.00085
+ linear_end: 0.0120
+ num_timesteps_cond: 1
+ log_every_t: 200
+ timesteps: 1000
+ first_stage_key: "jpg"
+ cond_stage_key: "txt"
+ image_size: 64
+ channels: 4
+ cond_stage_trainable: false # Note: different from the one we trained before
+ conditioning_key: crossattn
+ monitor: val/loss_simple_ema
+ scale_factor: 0.18215
+ use_ema: False
+ cond_stage_config:
+ target: ldmlib.modules.encoders.modules.FrozenCLIPEmbedder
+ params:
+ device: cpu
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2175846037977cd69f82f59a73f66af90f3c6c31
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,33 @@
+transformers==4.*
+pytorch-lightning
+einops
+pyngrok
+--only-binary numpy
+torch==1.12.1
+torchvision==0.13.1
+albumentations==0.4.3
+opencv-python==4.3.0.38
+pudb==2019.2
+imageio==2.9.0
+imageio-ffmpeg==0.4.2
+omegaconf==2.1.1
+test-tube>=0.7.5
+streamlit>=0.73.1
+torch-fidelity==0.3.0
+torchmetrics==0.6.0
+pywavelets==1.3.0
+pandas==1.3.5
+kornia==0.6
+starlette
+pydantic
+fastapi==0.74.*
+requests==2.27.*
+sentencepiece==0.1.*
+uvicorn[standard]==0.17.*
+colabcode
+asyncio
+datasets==2.*
+-e git+https://github.com/TencentARC/GFPGAN#egg=GFPGAN
+-e git+https://github.com/xinntao/Real-ESRGAN#egg=realesrgan
+-e git+https://github.com/CompVis/taming-transformers#egg=taming-transformers
+-e git+https://github.com/openai/CLIP#egg=clip
diff --git a/sd_internal/__init__.py b/sd_internal/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2a9901f080ef6a837dccaee1548ccf9b661dba3
--- /dev/null
+++ b/sd_internal/__init__.py
@@ -0,0 +1,107 @@
+import json
+
+class Request:
+ session_id: str = "session"
+ prompt: str = ""
+ negative_prompt: str = ""
+ init_image: str = None # base64
+ mask: str = None # base64
+ num_outputs: int = 1
+ num_inference_steps: int = 50
+ guidance_scale: float = 7.5
+ width: int = 512
+ height: int = 512
+ seed: int = 42
+ prompt_strength: float = 0.8
+ sampler: str = None # "ddim", "plms", "heun", "euler", "euler_a", "dpm2", "dpm2_a", "lms"
+ # allow_nsfw: bool = False
+ precision: str = "autocast" # or "full"
+ save_to_disk_path: str = None
+ turbo: bool = True
+ use_cpu: bool = False
+ use_full_precision: bool = False
+ use_face_correction: str = None # or "GFPGANv1.3"
+ use_upscale: str = None # or "RealESRGAN_x4plus" or "RealESRGAN_x4plus_anime_6B"
+ use_stable_diffusion_model: str = "sd-v1-4"
+ show_only_filtered_image: bool = False
+ output_format: str = "jpeg" # or "png"
+
+ stream_progress_updates: bool = False
+ stream_image_progress: bool = False
+
+ def json(self):
+ return {
+ "session_id": self.session_id,
+ "prompt": self.prompt,
+ "negative_prompt": self.negative_prompt,
+ "num_outputs": self.num_outputs,
+ "num_inference_steps": self.num_inference_steps,
+ "guidance_scale": self.guidance_scale,
+ "width": self.width,
+ "height": self.height,
+ "seed": self.seed,
+ "prompt_strength": self.prompt_strength,
+ "sampler": self.sampler,
+ "use_face_correction": self.use_face_correction,
+ "use_upscale": self.use_upscale,
+ "use_stable_diffusion_model": self.use_stable_diffusion_model,
+ "output_format": self.output_format,
+ }
+
+ def to_string(self):
+ return f'''
+ session_id: {self.session_id}
+ prompt: {self.prompt}
+ negative_prompt: {self.negative_prompt}
+ seed: {self.seed}
+ num_inference_steps: {self.num_inference_steps}
+ sampler: {self.sampler}
+ guidance_scale: {self.guidance_scale}
+ w: {self.width}
+ h: {self.height}
+ precision: {self.precision}
+ save_to_disk_path: {self.save_to_disk_path}
+ turbo: {self.turbo}
+ use_cpu: {self.use_cpu}
+ use_full_precision: {self.use_full_precision}
+ use_face_correction: {self.use_face_correction}
+ use_upscale: {self.use_upscale}
+ use_stable_diffusion_model: {self.use_stable_diffusion_model}
+ show_only_filtered_image: {self.show_only_filtered_image}
+ output_format: {self.output_format}
+
+ stream_progress_updates: {self.stream_progress_updates}
+ stream_image_progress: {self.stream_image_progress}'''
+
+class Image:
+ data: str # base64
+ seed: int
+ is_nsfw: bool
+ path_abs: str = None
+
+ def __init__(self, data, seed):
+ self.data = data
+ self.seed = seed
+
+ def json(self):
+ return {
+ "data": self.data,
+ "seed": self.seed,
+ "path_abs": self.path_abs,
+ }
+
+class Response:
+ request: Request
+ images: list
+
+ def json(self):
+ res = {
+ "status": 'succeeded',
+ "request": self.request.json(),
+ "output": [],
+ }
+
+ for image in self.images:
+ res["output"].append(image.json())
+
+ return res
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e1b0fdee19c2da2f2a77be0cf706362d322f02e
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,15 @@
+from setuptools import setup, find_packages
+import os
+
+def _read_reqs(relpath):
+ fullpath = os.path.join(os.path.dirname(__file__), relpath)
+ with open(fullpath) as f:
+ return [s.strip() for s in f.readlines() if (s.strip() and not s.startswith("#"))]
+
+setup(
+ name='stable-diffusion',
+ version='0.0.1',
+ description='',
+ packages=find_packages(),
+ install_requires=_read_reqs("requirements.txt"),
+)
diff --git a/start.py b/start.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed0a20a90735424ce2b4c81cf73e1b6379e4e5f3
--- /dev/null
+++ b/start.py
@@ -0,0 +1,2 @@
+import subprocess
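+# launch the FastAPI app (modules.app:app) with uvicorn, listening on 0.0.0.0:7860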
+subprocess.run("uvicorn modules.app:app --host 0.0.0.0 --port 7860", shell=True)
diff --git a/static/index.html b/static/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..a33806e93fbb159f8384e4e9becbb55e07599d64
--- /dev/null
+++ b/static/index.html
@@ -0,0 +1,1919 @@
+
+
+
+
+
+ Fast API 🤗 Space served with Uvicorn
+
+
+
+
+
+ Fast API 🤗 Space served with Uvicorn
+
+ Image generation from Inference API
+
+ Model:
+ osanseviero/BigGAN-deep-128
+
+ Text prompt
+
+ abacus
+ abaya
+ academic gown
+ academic robe
+ accordion
+ Acinonyx jubatus
+ acorn
+ acorn squash
+ acoustic guitar
+ admiral
+ aegis
+ Aepyceros melampus
+ affenpinscher
+ Afghan
+ Afghan hound
+ African chameleon
+ African crocodile
+ African elephant
+ African gray
+ African grey
+ African hunting dog
+ agama
+ agaric
+ ai
+ Ailuropoda melanoleuca
+ Ailurus fulgens
+ aircraft carrier
+ Airedale
+ Airedale terrier
+ airliner
+ airship
+ Alaska crab
+ Alaska king crab
+ Alaskan king crab
+ Alaskan malamute
+ albatross
+ all-terrain bike
+ alligator lizard
+ Alligator mississipiensis
+ Alopex lagopus
+ alp
+ alsatian
+ altar
+ ambulance
+ Ambystoma maculatum
+ Ambystoma mexicanum
+ American alligator
+ American black bear
+ American chameleon
+ American coot
+ American eagle
+ American egret
+ American lobster
+ American pit bull terrier
+ American robin
+ American Staffordshire terrier
+ amphibian
+ amphibious vehicle
+ analog clock
+ ananas
+ anemone
+ anemone fish
+ Angora
+ Angora rabbit
+ anole
+ Anolis carolinensis
+ ant
+ anteater
+ apiary
+ Appenzeller
+ apron
+ Aptenodytes patagonica
+ Arabian camel
+ Aramus pictus
+ Aranea diademata
+ Araneus cavaticus
+ Arctic fox
+ Arctic wolf
+ Arenaria interpres
+ Argiope aurantia
+ armadillo
+ armored combat vehicle
+ armoured combat vehicle
+ army tank
+ artichoke
+ articulated lorry
+ Ascaphus trui
+ ash bin
+ ash-bin
+ ashbin
+ ashcan
+ Asiatic buffalo
+ assault gun
+ assault rifle
+ Ateles geoffroyi
+ ATM
+ attack aircraft carrier
+ Australian terrier
+ automated teller
+ automated teller machine
+ automatic teller
+ automatic teller machine
+ automatic washer
+ axolotl
+ baboon
+ back pack
+ backpack
+ badger
+ bagel
+ bakehouse
+ bakery
+ bakeshop
+ balance beam
+ bald eagle
+ balloon
+ ballpen
+ ballplayer
+ ballpoint
+ ballpoint pen
+ balusters
+ balustrade
+ banana
+ Band Aid
+ bandeau
+ banded gecko
+ banister
+ banjo
+ bannister
+ barbell
+ barber chair
+ barbershop
+ barn
+ barn spider
+ barometer
+ barracouta
+ barrel
+ barrow
+ bars
+ baseball
+ baseball player
+ basenji
+ basketball
+ basset
+ basset hound
+ bassinet
+ bassoon
+ bath
+ bath towel
+ bathing cap
+ bathing trunks
+ bathing tub
+ bathroom tissue
+ bathtub
+ beach waggon
+ beach wagon
+ beacon
+ beacon light
+ beagle
+ beaker
+ beam
+ bear cat
+ bearskin
+ beaver
+ Bedlington terrier
+ bee
+ bee eater
+ bee house
+ beer bottle
+ beer glass
+ beigel
+ bell
+ bell cot
+ bell cote
+ bell pepper
+ bell toad
+ Bernese mountain dog
+ bib
+ bicycle-built-for-two
+ bighorn
+ bighorn sheep
+ bikini
+ billfish
+ billfold
+ billiard table
+ binder
+ binoculars
+ birdhouse
+ Biro
+ bison
+ bittern
+ black and gold garden spider
+ black bear
+ black grouse
+ black Maria
+ black stork
+ black swan
+ black widow
+ black-and-tan coonhound
+ black-footed ferret
+ Blenheim spaniel
+ bloodhound
+ blow drier
+ blow dryer
+ blower
+ blowfish
+ blue jack
+ blue jean
+ bluetick
+ boa
+ boa constrictor
+ boar
+ board
+ boat paddle
+ boathouse
+ bob
+ bobsled
+ bobsleigh
+ bobtail
+ bola
+ bola tie
+ bolete
+ bolo
+ bolo tie
+ Bonasa umbellus
+ bonnet
+ book jacket
+ bookcase
+ bookshop
+ bookstall
+ bookstore
+ Border collie
+ Border terrier
+ borzoi
+ Boston bull
+ Boston terrier
+ bottle screw
+ bottlecap
+ Bouvier des Flandres
+ Bouviers des Flandres
+ bow
+ bow tie
+ bow-tie
+ bowtie
+ box tortoise
+ box turtle
+ boxer
+ bra
+ Brabancon griffon
+ Bradypus tridactylus
+ brain coral
+ brambling
+ brass
+ brassiere
+ breakwater
+ breastplate
+ briard
+ bridegroom
+ Brittany spaniel
+ broccoli
+ broom
+ brown bear
+ bruin
+ brush kangaroo
+ brush wolf
+ Bubalus bubalis
+ bubble
+ bucket
+ buckeye
+ buckle
+ buckler
+ bulbul
+ bull mastiff
+ bullet
+ bullet train
+ bulletproof vest
+ bullfrog
+ bulwark
+ burrito
+ busby
+ bustard
+ butcher shop
+ butternut squash
+ cab
+ cabbage butterfly
+ Cacatua galerita
+ cairn
+ cairn terrier
+ caldron
+ Camelus dromedarius
+ can opener
+ Cancer irroratus
+ Cancer magister
+ candle
+ candy store
+ Canis dingo
+ Canis latrans
+ Canis lupus
+ Canis lupus tundrarum
+ Canis niger
+ Canis rufus
+ cannon
+ canoe
+ Cape hunting dog
+ capitulum
+ Capra ibex
+ capuchin
+ car mirror
+ car wheel
+ carabid beetle
+ Carassius auratus
+ carbonara
+ Carcharodon carcharias
+ cardigan
+ Cardigan
+ Cardigan Welsh corgi
+ cardoon
+ Carduelis carduelis
+ Caretta caretta
+ carousel
+ carpenter's kit
+ carpenter's plane
+ Carphophis amoenus
+ Carpodacus mexicanus
+ carriage
+ carriage dog
+ carrier
+ carrion fungus
+ carrousel
+ carton
+ cash dispenser
+ cash machine
+ cask
+ cassette
+ cassette player
+ castle
+ cat bear
+ catamaran
+ catamount
+ cathode-ray oscilloscope
+ cauldron
+ cauliflower
+ Cavia cobaya
+ CD player
+ Cebus capucinus
+ cell
+ cello
+ cellphone
+ cellular phone
+ cellular telephone
+ centipede
+ cerastes
+ Cerastes cornutus
+ chain
+ chain armor
+ chain armour
+ chain mail
+ chain saw
+ chainlink fence
+ chainsaw
+ Chamaeleo chamaeleon
+ chambered nautilus
+ cheeseburger
+ cheetah
+ Chesapeake Bay retriever
+ chest
+ chetah
+ chickadee
+ chiffonier
+ Chihuahua
+ chime
+ chimp
+ chimpanzee
+ china cabinet
+ china closet
+ chiton
+ Chlamydosaurus kingi
+ chocolate sauce
+ chocolate syrup
+ chopper
+ chow
+ chow chow
+ Christmas stocking
+ chrysanthemum dog
+ chrysomelid
+ church
+ church building
+ chute
+ cicada
+ cicala
+ Ciconia ciconia
+ Ciconia nigra
+ cimarron
+ cinema
+ claw
+ cleaver
+ cliff
+ cliff dwelling
+ cloak
+ clog
+ closet
+ clumber
+ clumber spaniel
+ coach
+ coach dog
+ coast
+ coat-of-mail shell
+ cock
+ cocker
+ cocker spaniel
+ cockroach
+ cocktail shaker
+ coffee mug
+ coffeepot
+ coho
+ coho salmon
+ cohoe
+ coil
+ collie
+ colobus
+ colobus monkey
+ combination lock
+ comfort
+ comforter
+ comic book
+ commode
+ common iguana
+ common newt
+ computer keyboard
+ computer mouse
+ conch
+ confectionary
+ confectionery
+ conker
+ consomme
+ Constrictor constrictor
+ container ship
+ container vessel
+ containership
+ convertible
+ coon bear
+ coral fungus
+ coral reef
+ corkscrew
+ corn
+ cornet
+ cot
+ cottontail
+ cottontail rabbit
+ coucal
+ cougar
+ courgette
+ cowboy boot
+ cowboy hat
+ coyote
+ cradle
+ crampfish
+ crane
+ crash helmet
+ crate
+ crawdad
+ crawdaddy
+ crawfish
+ crayfish
+ crib
+ cricket
+ crinoline
+ CRO
+ Crock Pot
+ Crocodylus niloticus
+ croquet ball
+ crossword
+ crossword puzzle
+ Crotalus adamanteus
+ Crotalus cerastes
+ CRT screen
+ crutch
+ cucumber
+ cuirass
+ cuke
+ Cuon alpinus
+ cup
+ curly-coated retriever
+ custard apple
+ Cygnus atratus
+ Cypripedium calceolus
+ Cypripedium parviflorum
+ daddy longlegs
+ daisy
+ dalmatian
+ dam
+ damselfly
+ Danaus plexippus
+ Dandie Dinmont
+ Dandie Dinmont terrier
+ dark glasses
+ darning needle
+ day bed
+ deerhound
+ denim
+ Dermochelys coriacea
+ desk
+ desktop computer
+ devil's darning needle
+ devilfish
+ dhole
+ dial phone
+ dial telephone
+ diamondback
+ diamondback rattlesnake
+ diaper
+ digital clock
+ digital watch
+ dike
+ dingo
+ dining table
+ dipper
+ dirigible
+ disc brake
+ dish washer
+ dishcloth
+ dishrag
+ dishwasher
+ dishwashing machine
+ disk brake
+ Doberman
+ Doberman pinscher
+ dock
+ dockage
+ docking facility
+ dog sled
+ dog sleigh
+ dogsled
+ dome
+ doormat
+ dough
+ dowitcher
+ dragon lizard
+ dragonfly
+ drake
+ drilling platform
+ dromedary
+ drop
+ drop-off
+ drum
+ drumstick
+ duck-billed platypus
+ duckbill
+ duckbilled platypus
+ dugong
+ Dugong dugon
+ dumbbell
+ dung beetle
+ Dungeness crab
+ dunlin
+ dust cover
+ dust jacket
+ dust wrapper
+ dustbin
+ dustcart
+ Dutch oven
+ dyke
+ ear
+ earthstar
+ eastern fox squirrel
+ eatery
+ eating house
+ eating place
+ echidna
+ eel
+ eft
+ eggnog
+ egis
+ Egretta albus
+ Egretta caerulea
+ Egyptian cat
+ electric fan
+ electric guitar
+ electric locomotive
+ electric ray
+ electric switch
+ electrical switch
+ Elephas maximus
+ elkhound
+ emmet
+ English cocker spaniel
+ English foxhound
+ English setter
+ English springer
+ English springer spaniel
+ entertainment center
+ EntleBucher
+ envelope
+ Erolia alpina
+ Erythrocebus patas
+ Eschrichtius gibbosus
+ Eschrichtius robustus
+ Eskimo dog
+ espresso
+ espresso maker
+ essence
+ estate car
+ Euarctos americanus
+ European fire salamander
+ European gallinule
+ ewer
+ face powder
+ feather boa
+ Felis concolor
+ Felis onca
+ ferret
+ fiddle
+ fiddler crab
+ field glasses
+ fig
+ file
+ file cabinet
+ filing cabinet
+ fire engine
+ fire screen
+ fire truck
+ fireboat
+ fireguard
+ fitch
+ fixed disk
+ flagpole
+ flagstaff
+ flamingo
+ flat-coated retriever
+ flattop
+ flatworm
+ flowerpot
+ flute
+ fly
+ folding chair
+ food market
+ football helmet
+ footstall
+ foreland
+ forklift
+ foulmart
+ foumart
+ fountain
+ fountain pen
+ four-poster
+ fox squirrel
+ freight car
+ French bulldog
+ French horn
+ French loaf
+ frilled lizard
+ Fringilla montifringilla
+ frying pan
+ frypan
+ Fulica americana
+ fur coat
+ Galeocerdo cuvieri
+ gar
+ garbage can
+ garbage truck
+ garden cart
+ garden spider
+ garfish
+ garpike
+ garter snake
+ gas helmet
+ gas pump
+ gasmask
+ gasoline pump
+ gazelle
+ gazelle hound
+ German police dog
+ German shepherd
+ German shepherd dog
+ German short-haired pointer
+ geta
+ geyser
+ giant lizard
+ giant panda
+ giant schnauzer
+ gibbon
+ Gila monster
+ glasshouse
+ globe artichoke
+ globefish
+ go-kart
+ goblet
+ golden retriever
+ goldfinch
+ goldfish
+ golf ball
+ golf cart
+ golfcart
+ gondola
+ gong
+ goose
+ Gordon setter
+ gorilla
+ Gorilla gorilla
+ gown
+ grampus
+ grand
+ grand piano
+ Granny Smith
+ grass snake
+ grasshopper
+ gray fox
+ gray whale
+ gray wolf
+ Great Dane
+ great gray owl
+ great grey owl
+ Great Pyrenees
+ great white heron
+ great white shark
+ Greater Swiss Mountain dog
+ green lizard
+ green mamba
+ green snake
+ greenhouse
+ grey fox
+ grey whale
+ grey wolf
+ Grifola frondosa
+ grille
+ grocery
+ grocery store
+ groenendael
+ groin
+ groom
+ ground beetle
+ groyne
+ grunter
+ guacamole
+ guenon
+ guenon monkey
+ guillotine
+ guinea pig
+ gyromitra
+ hack
+ hair drier
+ hair dryer
+ hair slide
+ hair spray
+ half track
+ Haliaeetus leucocephalus
+ hammer
+ hammerhead
+ hammerhead shark
+ hamper
+ hamster
+ hand blower
+ hand-held computer
+ hand-held microcomputer
+ handbasin
+ handkerchief
+ handrail
+ hankey
+ hankie
+ hanky
+ hard disc
+ hard disk
+ hare
+ harmonica
+ harp
+ hartebeest
+ harvester
+ harvestman
+ hatchet
+ hautbois
+ hautboy
+ haversack
+ hay
+ head
+ head cabbage
+ headland
+ hedgehog
+ helix
+ Heloderma suspectum
+ hen
+ hen of the woods
+ hen-of-the-woods
+ hermit crab
+ high bar
+ hip
+ hippo
+ hippopotamus
+ Hippopotamus amphibius
+ hockey puck
+ hodometer
+ hog
+ hognose snake
+ Holocanthus tricolor
+ holothurian
+ holster
+ Homarus americanus
+ home theater
+ home theatre
+ honeycomb
+ hook
+ hoopskirt
+ hopper
+ horizontal bar
+ horn
+ hornbill
+ horned asp
+ horned rattlesnake
+ horned viper
+ horse cart
+ horse chestnut
+ horse-cart
+ hot dog
+ hot pot
+ hotdog
+ hotpot
+ hourglass
+ house finch
+ howler
+ howler monkey
+ hummingbird
+ Hungarian pointer
+ hunting spider
+ husky
+ hussar monkey
+ hyaena
+ hyena
+ hyena dog
+ Hylobates lar
+ Hylobates syndactylus
+ Hypsiglena torquata
+ ibex
+ Ibizan hound
+ Ibizan Podenco
+ ice bear
+ ice cream
+ ice lolly
+ icebox
+ icecream
+ igniter
+ ignitor
+ iguana
+ Iguana iguana
+ impala
+ Indian cobra
+ Indian elephant
+ indigo bird
+ indigo bunting
+ indigo finch
+ indri
+ Indri brevicaudatus
+ Indri indri
+ indris
+ internet site
+ iPod
+ Irish setter
+ Irish terrier
+ Irish water spaniel
+ Irish wolfhound
+ iron
+ island dispenser
+ isopod
+ Italian greyhound
+ jacamar
+ jack
+ jack-o'-lantern
+ jackfruit
+ jaguar
+ jak
+ jammies
+ Japanese spaniel
+ jay
+ jean
+ jeep
+ jellyfish
+ jersey
+ jetty
+ jeweler's loupe
+ jigsaw puzzle
+ jinrikisha
+ joystick
+ judge's robe
+ junco
+ Kakatoe galerita
+ kangaroo bear
+ keeshond
+ kelpie
+ Kerry blue terrier
+ keypad
+ killer
+ killer whale
+ kimono
+ king crab
+ king of beasts
+ king penguin
+ king snake
+ kingsnake
+ kit fox
+ kite
+ knapsack
+ knee pad
+ knot
+ koala
+ koala bear
+ Komodo dragon
+ Komodo lizard
+ komondor
+ kuvasz
+ lab coat
+ laboratory coat
+ Labrador retriever
+ labyrinth
+ Lacerta viridis
+ lacewing
+ lacewing fly
+ ladle
+ lady beetle
+ ladybeetle
+ ladybird
+ ladybird beetle
+ ladybug
+ Lakeland terrier
+ lakeshore
+ lakeside
+ lamp shade
+ lampshade
+ landrover
+ langouste
+ langur
+ laptop
+ laptop computer
+ Latrodectus mactans
+ lavabo
+ lawn cart
+ lawn mower
+ leaf beetle
+ leafhopper
+ leatherback
+ leatherback turtle
+ leathery turtle
+ lemon
+ Lemur catta
+ lens cap
+ lens cover
+ Leonberg
+ leopard
+ Lepisosteus osseus
+ lesser panda
+ letter box
+ letter opener
+ Lhasa
+ Lhasa apso
+ library
+ lifeboat
+ light
+ lighter
+ lighthouse
+ limo
+ limousine
+ limpkin
+ liner
+ linnet
+ lion
+ lionfish
+ lip rouge
+ lipstick
+ little blue heron
+ llama
+ Loafer
+ loggerhead
+ loggerhead turtle
+ lollipop
+ lolly
+ long-horned beetle
+ longicorn
+ longicorn beetle
+ lorikeet
+ lotion
+ loudspeaker
+ loudspeaker system
+ loupe
+ Loxodonta africana
+ lumbermill
+ lycaenid
+ lycaenid butterfly
+ Lycaon pictus
+ lynx
+ macaque
+ macaw
+ Madagascar cat
+ magnetic compass
+ magpie
+ mail
+ mailbag
+ mailbox
+ maillot
+ Maine lobster
+ malamute
+ malemute
+ malinois
+ Maltese
+ Maltese dog
+ Maltese terrier
+ man-eater
+ man-eating shark
+ maned wolf
+ manhole cover
+ mantid
+ mantis
+ manufactured home
+ maraca
+ marimba
+ market
+ marmoset
+ marmot
+ marsh hen
+ mashed potato
+ mask
+ matchstick
+ maypole
+ maze
+ measuring cup
+ meat cleaver
+ meat loaf
+ meat market
+ meatloaf
+ medicine cabinet
+ medicine chest
+ meerkat
+ megalith
+ megalithic structure
+ Melursus ursinus
+ membranophone
+ memorial tablet
+ menu
+ Mergus serrator
+ merry-go-round
+ Mexican hairless
+ microphone
+ microwave
+ microwave oven
+ mierkat
+ mike
+ mileometer
+ military plane
+ military uniform
+ milk can
+ milkweed butterfly
+ milometer
+ mini
+ miniature pinscher
+ miniature poodle
+ miniature schnauzer
+ minibus
+ miniskirt
+ minivan
+ mink
+ missile
+ mitten
+ mixing bowl
+ mobile home
+ mobile phone
+ Model T
+ modem
+ mole
+ mollymawk
+ monarch
+ monarch butterfly
+ monastery
+ mongoose
+ monitor
+ monkey dog
+ monkey pinscher
+ monocycle
+ mop
+ moped
+ mortar
+ mortarboard
+ mosque
+ mosquito hawk
+ mosquito net
+ motor scooter
+ mountain bike
+ mountain lion
+ mountain tent
+ mouse
+ mousetrap
+ mouth harp
+ mouth organ
+ movie house
+ movie theater
+ movie theatre
+ moving van
+ mower
+ mud hen
+ mud puppy
+ mud turtle
+ mushroom
+ Mustela nigripes
+ Mustela putorius
+ muzzle
+ nail
+ Naja naja
+ napkin
+ nappy
+ Nasalis larvatus
+ native bear
+ nautilus
+ neck brace
+ necklace
+ nematode
+ nematode worm
+ Newfoundland
+ Newfoundland dog
+ night snake
+ Nile crocodile
+ nipple
+ Norfolk terrier
+ Northern lobster
+ Norwegian elkhound
+ Norwich terrier
+ notebook
+ notebook computer
+ notecase
+ nudibranch
+ numbfish
+ nursery
+ obelisk
+ oboe
+ ocarina
+ ocean liner
+ odometer
+ off-roader
+ offshore rig
+ oil filter
+ Old English sheepdog
+ Oncorhynchus kisutch
+ one-armed bandit
+ opera glasses
+ orang
+ orange
+ orangutan
+ orangutang
+ orca
+ Orcinus orca
+ organ
+ Ornithorhynchus anatinus
+ oscilloscope
+ ostrich
+ otter
+ otter hound
+ otterhound
+ ounce
+ overskirt
+ Ovis canadensis
+ ox
+ oxcart
+ oxygen mask
+ oyster catcher
+ oystercatcher
+ packet
+ packsack
+ paddle
+ paddle wheel
+ paddlewheel
+ paddy wagon
+ padlock
+ pail
+ paintbrush
+ painter
+ pajama
+ palace
+ paling
+ Pan troglodytes
+ panda
+ panda bear
+ pandean pipe
+ panpipe
+ panther
+ Panthera leo
+ Panthera onca
+ Panthera pardus
+ Panthera tigris
+ Panthera uncia
+ paper knife
+ paper towel
+ paperknife
+ papillon
+ parachute
+ Paralithodes camtschatica
+ parallel bars
+ park bench
+ parking meter
+ partridge
+ passenger car
+ Passerina cyanea
+ patas
+ patio
+ patrol wagon
+ patten
+ pay-phone
+ pay-station
+ peacock
+ pearly nautilus
+ pedestal
+ Peke
+ Pekinese
+ Pekingese
+ pelican
+ Pembroke
+ Pembroke Welsh corgi
+ pencil box
+ pencil case
+ pencil eraser
+ pencil sharpener
+ penny bank
+ perfume
+ Persian cat
+ Petri dish
+ petrol pump
+ Phalangium opilio
+ pharos
+ Phascolarctos cinereus
+ photocopier
+ piano accordion
+ pick
+ pickelhaube
+ picket fence
+ pickup
+ pickup truck
+ picture palace
+ pier
+ pig
+ pigboat
+ piggy bank
+ pill bottle
+ pillow
+ pineapple
+ ping-pong ball
+ pinwheel
+ pipe organ
+ pirate
+ pirate ship
+ pismire
+ pit bull terrier
+ pitcher
+ pizza
+ pizza pie
+ pj's
+ plane
+ planetarium
+ plaque
+ plastic bag
+ plate
+ plate rack
+ platyhelminth
+ platypus
+ plectron
+ plectrum
+ plinth
+ plough
+ plow
+ plumber's helper
+ plunger
+ pocketbook
+ poke bonnet
+ polar bear
+ Polaroid camera
+ Polaroid Land camera
+ pole
+ polecat
+ police van
+ police wagon
+ polyplacophore
+ Polyporus frondosus
+ pomegranate
+ Pomeranian
+ poncho
+ Pongo pygmaeus
+ pool table
+ pop bottle
+ popsicle
+ porcupine
+ Porphyrio porphyrio
+ postbag
+ pot
+ potpie
+ potter's wheel
+ power drill
+ prairie chicken
+ prairie fowl
+ prairie grouse
+ prairie wolf
+ prayer mat
+ prayer rug
+ press
+ pretzel
+ printer
+ prison
+ prison house
+ proboscis monkey
+ projectile
+ projector
+ promontory
+ Psittacus erithacus
+ ptarmigan
+ puck
+ puff
+ puff adder
+ puffer
+ pufferfish
+ pug
+ pug-dog
+ puma
+ punch bag
+ punchball
+ punching bag
+ punching ball
+ purse
+ pyjama
+ Python sebae
+ quail
+ quill
+ quill pen
+ quilt
+ R.V.
+ race car
+ racer
+ racing car
+ racket
+ racquet
+ radiator
+ radiator grille
+ radio
+ radio reflector
+ radio telescope
+ rain barrel
+ ram
+ Rana catesbeiana
+ rapeseed
+ reaper
+ recreational vehicle
+ red fox
+ red hot
+ red panda
+ red setter
+ red wine
+ red wolf
+ red-backed sandpiper
+ red-breasted merganser
+ redbone
+ redshank
+ reel
+ reflex camera
+ refrigerator
+ remote
+ remote control
+ respirator
+ restaurant
+ revolver
+ rhinoceros beetle
+ Rhodesian ridgeback
+ ribbed toad
+ ricksha
+ rickshaw
+ rifle
+ rig
+ ring armor
+ ring armour
+ ring mail
+ ring snake
+ ring-binder
+ ring-necked snake
+ ring-tailed lemur
+ ringlet
+ ringlet butterfly
+ ringneck snake
+ ringtail
+ river horse
+ roach
+ robin
+ rock beauty
+ rock crab
+ rock lobster
+ rock python
+ rock snake
+ rocker
+ rocking chair
+ Rocky Mountain bighorn
+ Rocky Mountain sheep
+ rose hip
+ rosehip
+ rotisserie
+ Rottweiler
+ roundabout
+ roundworm
+ rubber
+ rubber eraser
+ rucksack
+ ruddy turnstone
+ ruffed grouse
+ rugby ball
+ rule
+ ruler
+ running shoe
+ Russian wolfhound
+ RV
+ sabot
+ safe
+ safety pin
+ Saimiri sciureus
+ Saint Bernard
+ Salamandra salamandra
+ salt shaker
+ saltshaker
+ Saluki
+ Samoyed
+ Samoyede
+ sand bar
+ sand viper
+ sandal
+ sandbar
+ sarong
+ sawmill
+ sax
+ saxophone
+ scabbard
+ scale
+ schipperke
+ school bus
+ schooner
+ Sciurus niger
+ scooter
+ scope
+ scoreboard
+ scorpion
+ Scotch terrier
+ Scottie
+ Scottish deerhound
+ Scottish terrier
+ screen
+ screw
+ screwdriver
+ scuba diver
+ sea anemone
+ sea cradle
+ sea crawfish
+ sea cucumber
+ sea lion
+ sea slug
+ sea snake
+ sea star
+ sea urchin
+ sea wolf
+ sea-coast
+ seacoast
+ Sealyham
+ Sealyham terrier
+ seashore
+ seat belt
+ seatbelt
+ seawall
+ semi
+ sewing machine
+ sewing needle
+ shades
+ shako
+ Shetland
+ Shetland sheep dog
+ Shetland sheepdog
+ shield
+ Shih-Tzu
+ shoe shop
+ shoe store
+ shoe-shop
+ shoji
+ shopping basket
+ shopping cart
+ shovel
+ shower cap
+ shower curtain
+ siamang
+ Siamese
+ Siamese cat
+ Siberian husky
+ sidewinder
+ silky terrier
+ silver salmon
+ site
+ six-gun
+ six-shooter
+ skeeter hawk
+ ski
+ ski mask
+ skillet
+ skunk
+ sleeping bag
+ sleuthhound
+ slide rule
+ sliding door
+ slipstick
+ slot
+ sloth bear
+ slug
+ smoothing iron
+ snail
+ snake doctor
+ snake feeder
+ snake fence
+ snake-rail fence
+ snoek
+ snooker table
+ snorkel
+ snow leopard
+ snowbird
+ snowmobile
+ snowplough
+ snowplow
+ soap dispenser
+ soccer ball
+ sock
+ soda bottle
+ soft-coated wheaten terrier
+ solar collector
+ solar dish
+ solar furnace
+ sombrero
+ sorrel
+ soup bowl
+ space bar
+ space heater
+ space shuttle
+ spaghetti squash
+ spatula
+ speaker
+ speaker system
+ speaker unit
+ speedboat
+ spider monkey
+ spider web
+ spider's web
+ spike
+ spindle
+ spiny anteater
+ spiny lobster
+ spiral
+ spoonbill
+ sport car
+ sports car
+ spot
+ spotlight
+ spotted salamander
+ squealer
+ squeeze box
+ squirrel monkey
+ St Bernard
+ Staffordshire bull terrier
+ Staffordshire bullterrier
+ Staffordshire terrier
+ stage
+ standard poodle
+ standard schnauzer
+ starfish
+ station waggon
+ station wagon
+ steam locomotive
+ steel arch bridge
+ steel drum
+ stethoscope
+ stick insect
+ stingray
+ stinkhorn
+ stole
+ stone wall
+ stop watch
+ stoplight
+ stopwatch
+ stove
+ strainer
+ strawberry
+ street sign
+ streetcar
+ stretcher
+ Strix nebulosa
+ Struthio camelus
+ studio couch
+ stupa
+ sturgeon
+ sub
+ submarine
+ suit
+ suit of clothes
+ sulfur butterfly
+ sulphur butterfly
+ sulphur-crested cockatoo
+ sun blocker
+ sunblock
+ sundial
+ sunglass
+ sunglasses
+ sunscreen
+ Sus scrofa
+ suspension bridge
+ Sussex spaniel
+ swab
+ sweatshirt
+ sweet potato
+ swimming cap
+ swimming trunks
+ swing
+ switch
+ swob
+ Sydney silky
+ Symphalangus syndactylus
+ syringe
+ syrinx
+ T-shirt
+ tabby
+ tabby cat
+ table lamp
+ tailed frog
+ tailed toad
+ tam-tam
+ tandem
+ tandem bicycle
+ tank
+ tank suit
+ tape player
+ taper
+ tarantula
+ taxi
+ taxicab
+ teapot
+ teddy
+ teddy bear
+ tee shirt
+ television
+ television system
+ ten-gallon hat
+ tench
+ tennis ball
+ terrace
+ terrapin
+ Thalarctos maritimus
+ thatch
+ thatched roof
+ theater curtain
+ theatre curtain
+ thimble
+ thrasher
+ three-toed sloth
+ thresher
+ threshing machine
+ throne
+ thunder snake
+ Tibetan mastiff
+ Tibetan terrier
+ tick
+ tiger
+ tiger beetle
+ tiger cat
+ tiger shark
+ tile roof
+ timber wolf
+ tin opener
+ Tinca tinca
+ titi
+ titi monkey
+ toaster
+ tobacco shop
+ tobacconist
+ tobacconist shop
+ toilet paper
+ toilet seat
+ toilet tissue
+ tool kit
+ tope
+ torch
+ torpedo
+ totem pole
+ toucan
+ tow car
+ tow truck
+ toy poodle
+ toy terrier
+ toyshop
+ trackless trolley
+ tractor
+ tractor trailer
+ traffic light
+ traffic signal
+ trailer truck
+ tram
+ tramcar
+ transverse flute
+ trash barrel
+ trash bin
+ trash can
+ tray
+ tree frog
+ tree-frog
+ trench coat
+ triceratops
+ tricycle
+ trifle
+ trike
+ trilobite
+ trimaran
+ Tringa totanus
+ tripod
+ Triturus vulgaris
+ triumphal arch
+ trolley
+ trolley car
+ trolley coach
+ trolleybus
+ trombone
+ trucking rig
+ trump
+ trumpet
+ tub
+ tup
+ Turdus migratorius
+ turnstile
+ tusker
+ two-piece
+ tympan
+ typewriter keyboard
+ U-boat
+ umbrella
+ unicycle
+ upright
+ upright piano
+ Urocyon cinereoargenteus
+ Ursus americanus
+ Ursus arctos
+ Ursus Maritimus
+ Ursus ursinus
+ vacuum
+ vacuum cleaner
+ vale
+ valley
+ Varanus komodoensis
+ vase
+ vat
+ vault
+ velocipede
+ velvet
+ vending machine
+ vestment
+ viaduct
+ vine snake
+ violin
+ violoncello
+ Virginia fence
+ vizsla
+ volcano
+ volleyball
+ volute
+ Vulpes macrotis
+ Vulpes vulpes
+ vulture
+ waffle iron
+ waggon
+ wagon
+ Walker foxhound
+ Walker hound
+ walking stick
+ walkingstick
+ wall clock
+ wallaby
+ wallet
+ wardrobe
+ warplane
+ warragal
+ warrigal
+ warthog
+ wash-hand basin
+ washbasin
+ washbowl
+ washer
+ washing machine
+ wastebin
+ water bottle
+ water buffalo
+ water hen
+ water jug
+ water ouzel
+ water ox
+ water snake
+ water tower
+ wax light
+ weasel
+ web site
+ website
+ weevil
+ weighing machine
+ Weimaraner
+ welcome mat
+ Welsh springer spaniel
+ West Highland white terrier
+ wheelbarrow
+ whippet
+ whiptail
+ whiptail lizard
+ whirligig
+ whiskey jug
+ whistle
+ white fox
+ white shark
+ white stork
+ white wolf
+ whorl
+ wig
+ wild boar
+ window screen
+ window shade
+ Windsor tie
+ wine bottle
+ wing
+ wire-haired fox terrier
+ wireless
+ wok
+ wolf spider
+ wombat
+ wood pussy
+ wood rabbit
+ wooden spoon
+ woodworking plane
+ wool
+ woolen
+ woollen
+ worm fence
+ worm snake
+ wreck
+ wrecker
+ xylophone
+ yawl
+ yellow lady's slipper
+ yellow lady-slipper
+ Yorkshire terrier
+ yurt
+ zebra
+ zucchini
+
+
+
+
+ Text generation from transformers library
+
+ Model:
+ t5-small
+
+
+
+
+ Dataset from datasets library
+
+ Dataset:
+ emotion
+
+
+ Previous
+ Next
+
+
+
+
+
+
+
diff --git a/static/index.js b/static/index.js
new file mode 100644
index 0000000000000000000000000000000000000000..773780ddda768358f871b39f3774d6bce0715ff8
--- /dev/null
+++ b/static/index.js
@@ -0,0 +1,126 @@
+if (document.location.search.includes('dark-theme=true')) {
+ document.body.classList.add('dark-theme');
+}
+
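+// Table pagination state: show RANGE rows per page; LIMIT is the upper bound for the cursor.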
+let cursor = 0;
+const RANGE = 5;
+const LIMIT = 16_000;
+
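+// Ask the backend's infer_biggan endpoint for an image of the selected class and return an object URL for <img src>.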
+const textToImage = async (text) => {
+ const inferenceResponse = await fetch(`infer_biggan?input=${text}`);
+ const inferenceBlob = await inferenceResponse.blob();
+ const content = URL.createObjectURL(inferenceBlob);
+ return content;
+};
+
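+// Send the input text to the infer_t5 endpoint and return the generated output string.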
+const translateText = async (text) => {
+ const inferResponse = await fetch(`infer_t5?input=${text}`);
+ const inferJson = await inferResponse.json();
+
+ return inferJson.output;
+};
+
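+// Fetch rows from start to end of the emotion dataset via the query_emotion endpoint.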
+const queryDataset = async (start, end) => {
+ const queryResponse = await fetch(`query_emotion?start=${start}&end=${end}`);
+ const queryJson = await queryResponse.json();
+
+ return queryJson.output;
+};
+
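+// Rebuild the dataset table body from the fetched rows, then insert the text/emotion header row.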
+const updateTable = async (cursor, range = RANGE) => {
+ const table = document.querySelector('.dataset-output');
+
+ const fragment = new DocumentFragment();
+
+ const observations = await queryDataset(cursor, cursor + range);
+
+ for (const observation of observations) {
+ let row = document.createElement('tr');
+ let text = document.createElement('td');
+ let emotion = document.createElement('td');
+
+ text.textContent = observation.text;
+ emotion.textContent = observation.emotion;
+
+ row.appendChild(text);
+ row.appendChild(emotion);
+ fragment.appendChild(row);
+ }
+
+ table.innerHTML = '';
+
+ table.appendChild(fragment);
+
+ table.insertAdjacentHTML(
+ 'afterbegin',
+    `<thead>
+      <tr>
+        <td>text</td>
+        <td>emotion</td>
+      </tr>
+    </thead>`
+ );
+};
+
+const imageGenSelect = document.getElementById('image-gen-input');
+const imageGenImage = document.querySelector('.image-gen-output');
+const textGenForm = document.querySelector('.text-gen-form');
+const tableButtonPrev = document.querySelector('.table-previous');
+const tableButtonNext = document.querySelector('.table-next');
+
+imageGenSelect.addEventListener('change', async (event) => {
+ const value = event.target.value;
+
+ try {
+ imageGenImage.src = await textToImage(value);
+ imageGenImage.alt = value + ' generated from BigGAN AI model';
+ } catch (err) {
+ console.error(err);
+ }
+});
+
+textGenForm.addEventListener('submit', async (event) => {
+ event.preventDefault();
+
+ const textGenInput = document.getElementById('text-gen-input');
+ const textGenParagraph = document.querySelector('.text-gen-output');
+
+ try {
+ textGenParagraph.textContent = await translateText(textGenInput.value);
+ } catch (err) {
+ console.error(err);
+ }
+});
+
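+// "Previous": step the cursor back one page; hide this button at the start and re-show "Next" when leaving the end.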
+tableButtonPrev.addEventListener('click', () => {
+ cursor = cursor > RANGE ? cursor - RANGE : 0;
+
+ if (cursor < RANGE) {
+ tableButtonPrev.classList.add('hidden');
+ }
+ if (cursor < LIMIT - RANGE) {
+ tableButtonNext.classList.remove('hidden');
+ }
+
+ updateTable(cursor);
+});
+
+tableButtonNext.addEventListener('click', () => {
+ cursor = cursor < LIMIT - RANGE ? cursor + RANGE : cursor;
+
+ if (cursor >= RANGE) {
+ tableButtonPrev.classList.remove('hidden');
+ }
+ if (cursor >= LIMIT - RANGE) {
+ tableButtonNext.classList.add('hidden');
+ }
+
+ updateTable(cursor);
+});
+
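+// Initial page load: generate an image for the default selection and render the first page of the table.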
+textToImage(imageGenSelect.value)
+ .then((image) => (imageGenImage.src = image))
+ .catch(console.error);
+
+updateTable(cursor)
+ .catch(console.error);
diff --git a/static/style.css b/static/style.css
new file mode 100644
index 0000000000000000000000000000000000000000..6a3c98f8fab848caaaf7b844b24ce23c8c5c8dde
--- /dev/null
+++ b/static/style.css
@@ -0,0 +1,79 @@
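+/* Shared page styles; --text defaults to a dark color and is overridden by the dark theme below. */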
+body {
+ --text: hsl(0 0% 15%);
+ padding: 2.5rem;
+ font-family: sans-serif;
+ color: var(--text);
+}
+body.dark-theme {
+ --text: hsl(0 0% 90%);
+ background-color: hsl(223 39% 7%);
+}
+
+main {
+ max-width: 80rem;
+ text-align: center;
+}
+
+section {
+ display: flex;
+ flex-direction: column;
+ align-items: center;
+}
+
+a {
+ color: var(--text);
+}
+
+select, input, button, .text-gen-output {
+ padding: 0.5rem 1rem;
+}
+
+select, img, input {
+ margin: 0.5rem auto 1rem;
+}
+
+form {
+ width: 25rem;
+ margin: 0 auto;
+}
+
+input {
+ width: 70%;
+}
+
+button {
+ cursor: pointer;
+}
+
+.text-gen-output {
+ min-height: 1.2rem;
+ margin: 1rem;
+ border: 0.5px solid grey;
+}
+
+#dataset button {
+ width: 6rem;
+ margin: 0.5rem;
+}
+
+#dataset button.hidden {
+ visibility: hidden;
+}
+
+table {
+ max-width: 40rem;
+ text-align: left;
+ border-collapse: collapse;
+}
+
+thead {
+ font-weight: bold;
+}
+
+td {
+ padding: 0.5rem;
+}
+
+td:not(thead td) {
+ border: 0.5px solid grey;
+}
diff --git a/ui/index.html b/ui/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..8ec4fe61b348b0c4f149a787eaa9df3ecd9002b5
--- /dev/null
+++ b/ui/index.html
@@ -0,0 +1,245 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Stable Diffusion UI v2.21
+
+
+
+ Help & Community
+
+
+
+ Settings
+
+
+
+
+
+
+
+
+
+ ●
+
+ Stable Diffusion is starting..
+
+
+
+
+
+
+
+
+
+ Image Modifiers (art styles, tags etc)
+
+ Image Style:
+
+ Face
+ Landscape
+
+
+ Thumbnail Size:
+
+
+
+
+
+
+
+ Type a prompt and press the "Make Image" button. You can set an "Initial Image" if you want to guide the AI. You can also add modifiers like "Realistic", "Pencil Sketch", "ArtStation" etc by browsing through the "Image Modifiers" section and selecting the desired modifiers. Click "Advanced Settings" for additional settings like seed, image size, number of images to generate etc. Enjoy! :)
+
+
+ Clear All
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/ui/media/main.css b/ui/media/main.css
new file mode 100644
index 0000000000000000000000000000000000000000..8d7401e9e6a82bf043e755fa7967746496f45056
--- /dev/null
+++ b/ui/media/main.css
@@ -0,0 +1,416 @@
+body {
+ font-family: Arial, Helvetica, sans-serif;
+ font-size: 11pt;
+ background-color: rgb(32, 33, 36);
+ color: #eee;
+}
+a {
+ color: rgb(0, 102, 204);
+}
+a:visited {
+ color: rgb(0, 102, 204);
+}
+label {
+ font-size: 10pt;
+}
+#prompt {
+ width: 100%;
+ height: 65pt;
+ box-sizing: border-box;
+}
+@media screen and (max-width: 600px) {
+ #prompt {
+ width: 95%;
+ }
+}
+.image_preview_container {
+ /* display: none; */
+ margin-top: 10pt;
+}
+.image_clear_btn {
+ position: absolute;
+ transform: translateX(-50%) translateY(-35%);
+ background: black;
+ color: white;
+ border: 2pt solid #ccc;
+ padding: 0;
+ cursor: pointer;
+ outline: inherit;
+ border-radius: 8pt;
+ width: 16pt;
+ height: 16pt;
+ font-family: Verdana;
+ font-size: 8pt;
+}
+.settings-box ul {
+ font-size: 9pt;
+ margin-bottom: 5px;
+ padding-left: 10px;
+ list-style-type: none;
+}
+.settings-box li {
+ padding-bottom: 4pt;
+}
+.editor-slider {
+ vertical-align: middle;
+}
+.outputMsg {
+ font-size: small;
+ padding-bottom: 3pt;
+}
+#progressBar {
+ font-size: small;
+}
+#footer {
+ font-size: small;
+ padding-left: 10pt;
+ background: none;
+}
+#footer-legal {
+ font-size: 8pt;
+}
+.imgSeedLabel {
+ font-size: 0.8em;
+ background-color: rgb(44, 45, 48);
+ border-radius: 3px;
+ padding: 5px;
+}
+.imgItem {
+ display: inline-block;
+ margin-top: 1em;
+ margin-right: 1em;
+}
+.imgContainer {
+ display: flex;
+ justify-content: flex-end;
+}
+.imgItemInfo {
+ padding-bottom: 0.5em;
+ display: flex;
+ align-items: flex-end;
+ flex-direction: column;
+ position: absolute;
+ padding: 5px;
+ opacity: 0;
+ transition: 0.1s all;
+}
+.imgContainer:hover > .imgItemInfo {
+ opacity: 1;
+}
+.imgItemInfo * {
+ margin-bottom: 7px;
+}
+#container {
+ width: 90%;
+ margin-left: auto;
+ margin-right: auto;
+}
+@media screen and (max-width: 1800px) {
+ #container {
+ width: 100%;
+ }
+}
+#logo small {
+ font-size: 11pt;
+}
+#editor {
+ padding: 5px;
+}
+#editor label {
+ font-weight: normal;
+}
+.settings-box label small {
+ color: rgb(153, 153, 153);
+}
+#preview {
+ padding: 5px;
+}
+#editor-inputs {
+ margin-bottom: 20px;
+}
+#editor-inputs-prompt {
+ flex: 1;
+}
+#editor-inputs .row {
+ padding-bottom: 10px;
+}
+#makeImage {
+ border-radius: 6px;
+}
+#editor-modifiers h5 {
+ padding: 5pt 0;
+ margin: 0;
+}
+#makeImage {
+ flex: 0 0 70px;
+ background: rgb(80, 0, 185);
+ border: 2px solid rgb(40, 0, 78);
+ color: rgb(255, 221, 255);
+ width: 100%;
+ height: 30pt;
+}
+#makeImage:hover {
+ background: rgb(93, 0, 214);
+}
+#stopImage {
+ flex: 0 0 70px;
+ background: rgb(132, 8, 0);
+ border: 2px solid rgb(122, 29, 0);
+ color: rgb(255, 221, 255);
+ width: 100%;
+ height: 30pt;
+ border-radius: 6px;
+ display: none;
+}
+#stopImage:hover {
+ background: rgb(177, 27, 0);
+}
+.flex-container {
+ display: flex;
+}
+.col-50 {
+ flex: 50%;
+}
+.col-fixed-10 {
+ flex: 0 0 380pt;
+}
+.col-free {
+ flex: 1;
+}
+.collapsible {
+ cursor: pointer;
+}
+.collapsible-content {
+ display: none;
+ padding-left: 15px;
+}
+.collapsible-content h5 {
+ padding: 5pt 0pt;
+ margin: 0;
+ font-size: 10pt;
+}
+.collapsible-handle {
+ color: white;
+ padding-right: 5px;
+}
+.panel-box {
+ background: rgb(44, 45, 48);
+ border: 1px solid rgb(47, 49, 53);
+ border-radius: 7px;
+ padding: 5px;
+ margin-bottom: 15px;
+ box-shadow: 0 4px 8px 0 rgba(0, 0, 0, 0.15), 0 6px 20px 0 rgba(0, 0, 0, 0.15);
+}
+.panel-box h4 {
+ margin: 0;
+ padding: 2px 0;
+}
+#editor-modifiers .editor-modifiers-leaf {
+ padding-top: 10pt;
+ padding-bottom: 10pt;
+}
+#preview {
+ margin-left: 10pt;
+}
+img {
+ box-shadow: 0 4px 8px 0 rgba(0, 0, 0, 0.15), 0 6px 20px 0 rgba(0, 0, 0, 0.15);
+}
+.line-separator {
+ background: rgb(56, 56, 56);
+ height: 1pt;
+ margin: 15pt 0;
+}
+#editor-inputs-tags-container {
+ margin-top: 5pt;
+ display: none;
+}
+#server-status {
+ display: inline;
+ float: right;
+ transform: translateY(-5pt);
+}
+#server-status-color {
+ /* width: 8pt;
+ height: 8pt;
+ border-radius: 4pt; */
+ font-size: 14pt;
+ color: rgb(128, 87, 0);
+ /* background-color: rgb(197, 1, 1); */
+ /* transform: translateY(15%); */
+ display: inline;
+}
+#server-status-msg {
+ color: rgb(128, 87, 0);
+ padding-left: 2pt;
+ font-size: 10pt;
+}
+.preview-prompt {
+ font-size: 16pt;
+ margin-bottom: 10pt;
+}
+#coffeeButton {
+ height: 23px;
+ transform: translateY(25%);
+}
+
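+/* Inpainting mask editor and its drawing-board controls. */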
+#inpaintingEditor {
+ width: 300pt;
+ height: 300pt;
+ margin-top: 5pt;
+}
+.drawing-board-canvas-wrapper {
+ background-size: 100% 100%;
+}
+.drawing-board-control > button {
+ background-color: #eee;
+ border-radius: 3pt;
+}
+.drawing-board-control-inner {
+ background-color: #eee;
+ border-radius: 3pt;
+}
+#inpaintingEditor canvas {
+ opacity: 0.6;
+}
+#enable_mask {
+ margin-top: 8pt;
+}
+
+#top-nav {
+ padding-top: 3pt;
+ padding-bottom: 15pt;
+}
+#top-nav .icon {
+ padding-right: 4pt;
+ font-size: 14pt;
+ transform: translateY(1pt);
+}
+#logo {
+ display: inline;
+}
+#logo h1 {
+ display: inline;
+}
+#top-nav-items {
+ list-style-type: none;
+ display: inline;
+ float: right;
+}
+#top-nav-items > li {
+ float: left;
+ display: inline;
+ padding-left: 20pt;
+ cursor: default;
+}
+#initial-text {
+ padding-top: 15pt;
+ padding-left: 4pt;
+}
+.settings-subheader {
+ font-size: 10pt;
+ font-weight: bold;
+}
+.pl-5 {
+ padding-left: 5pt;
+}
+#system-settings {
+ width: 360pt;
+ transform: translateX(-100%) translateX(70pt);
+
+ padding-top: 10pt;
+ padding-bottom: 10pt;
+}
+#system-settings ul {
+ margin: 0;
+ padding: 0;
+}
+#system-settings li {
+ padding-left: 5pt;
+}
+#community-links {
+ list-style-type: none;
+ margin: 0;
+ padding: 12pt;
+ padding-bottom: 0pt;
+ transform: translateX(-15%);
+}
+#community-links li {
+ padding-bottom: 12pt;
+ display: block;
+ font-size: 10pt;
+}
+#community-links li .fa-fw {
+ padding-right: 2pt;
+}
+#community-links li a {
+ color: white;
+ text-decoration: none;
+}
+.dropdown {
+ overflow: hidden;
+}
+.dropdown-content {
+ display: none;
+ position: absolute;
+ z-index: 2;
+
+ background: rgb(18, 18, 19);
+ border: 2px solid rgb(37, 38, 41);
+ border-radius: 7px;
+ padding: 5px;
+ margin-bottom: 15px;
+ box-shadow: 0 20px 28px 0 rgba(0, 0, 0, 0.15), 0 6px 20px 0 rgba(0, 0, 0, 0.15);
+}
+.dropdown:hover .dropdown-content {
+ display: block;
+}
+
+.imageTaskContainer {
+ border: 1px solid #333;
+ margin-bottom: 10pt;
+ padding: 5pt;
+ border-radius: 5pt;
+ box-shadow: 0 20px 28px 0 rgba(0, 0, 0, 0.15), 0 6px 20px 0 rgba(0, 0, 0, 0.15);
+}
+.taskStatusLabel {
+ float: left;
+ font-size: 8pt;
+ background: rgb(44, 45, 48);
+ border: 1px solid rgb(61, 62, 66);
+ padding: 2pt 4pt;
+ border-radius: 2pt;
+ margin-right: 5pt;
+}
+.activeTaskLabel {
+ background: rgb(0, 90, 30);
+ border: 1px solid rgb(0, 75, 19);
+ color: rgb(204, 255, 217);
+}
+.secondaryButton {
+ background: rgb(132, 8, 0);
+ border: 1px solid rgb(122, 29, 0);
+ color: rgb(255, 221, 255);
+ padding: 3pt 6pt;
+ border-radius: 5px;
+}
+.secondaryButton:hover {
+ background: rgb(177, 27, 0);
+}
+.stopTask {
+ float: right;
+}
+#preview-tools {
+ display: none;
+ padding: 4pt;
+}
+.taskConfig {
+ font-size: 10pt;
+ color: #aaa;
+ margin-bottom: 5pt;
+}
+.img-batch {
+ display: inline;
+}
+#prompt_from_file {
+ display: none;
+}
\ No newline at end of file