# -*- coding: utf-8 -*-
"""Copy of demo.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/github/energy-based-model/Compositional-Visual-Generation-with-Composable-Diffusion-Models-PyTorch/blob/main/notebooks/demo.ipynb

Gradio front-end exposing two generators behind a single ``compose`` entry
point: a Stable Diffusion v1.4 pipeline and a CLEVR object-position
diffusion model.
"""

import os

import gradio as gr
import torch as th
from composable_diffusion.download import download_model
from composable_diffusion.model_creation import (
    create_model_and_diffusion as create_model_and_diffusion_for_clevr,
)
from composable_diffusion.model_creation import (
    model_and_diffusion_defaults as model_and_diffusion_defaults_for_clevr,
)
from torch import autocast
from diffusers import StableDiffusionPipeline

# This notebook supports both CPU and GPU.
# On CPU, generating one sample may take on the order of 20 minutes.
# On a GPU, it should be under a minute.
has_cuda = th.cuda.is_available()
device = th.device('cuda' if has_cuda else 'cpu')

# init stable diffusion model
# SECURITY: the access token must never be committed to source control. It is
# read from the HF_TOKEN environment variable; when that is unset we fall back
# to ``True``, which asks the hub client to use the token cached by
# ``huggingface-cli login``. Any token that was previously committed in this
# file should be revoked.
hf_auth_token = os.environ.get('HF_TOKEN') or True
pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    use_auth_token=hf_auth_token,
).to(device)

# create model for CLEVR Objects
clevr_options = model_and_diffusion_defaults_for_clevr()

flags = {
    "image_size": 128,
    "num_channels": 192,
    "num_res_blocks": 2,
    "learn_sigma": True,
    "use_scale_shift_norm": False,
    "raw_unet": True,
    "noise_schedule": "squaredcos_cap_v2",
    "rescale_learned_sigmas": False,
    "rescale_timesteps": False,
    "num_classes": '2',
    "dataset": "clevr_pos",
    "use_fp16": has_cuda,
    "timestep_respacing": '100',
}

for key, val in flags.items():
    clevr_options[key] = val

clevr_model, clevr_diffusion = create_model_and_diffusion_for_clevr(**clevr_options)
clevr_model.eval()
if has_cuda:
    clevr_model.convert_to_fp16()

clevr_model.to(device)
clevr_model.load_state_dict(th.load(download_model('clevr_pos'), device))
print('total clevr_pos parameters', sum(x.numel() for x in clevr_model.parameters()))


def _parse_coordinates(prompt):
    """Parse ``'x, y | x, y | ...'`` into a list of ``[x, y]`` float pairs.

    Only the first two comma-separated values of each ``|``-separated item
    are used. Raises ``IndexError`` when an item has fewer than two values
    and ``ValueError`` when a value is not a valid float.
    """
    coordinates = []
    for item in prompt.split('|'):
        parts = item.split(',')
        coordinates.append([float(parts[0].strip()), float(parts[1].strip())])
    return coordinates


def compose_clevr_objects(prompt, guidance_scale, steps):
    """Generate one CLEVR image conditioned on several object positions.

    Args:
        prompt: ``|``-separated list of ``x, y`` coordinate pairs.
        guidance_scale: weight applied to the (conditional - unconditional)
            noise difference.
        steps: number of sampling steps; truncated to ``int`` and used as
            the ``timestep_respacing`` option.

    Returns:
        A ``uint8`` NumPy array in height x width x channel layout.
    """
    coordinates = _parse_coordinates(prompt)
    coordinates.append([-1, -1])  # add unconditional score label
    batch_size = 1

    # Build a per-call copy of the options instead of mutating the shared
    # module-level dict, so overlapping requests cannot see each other's
    # step count.
    # NOTE(review): this constructs a full model just to obtain a diffusion
    # object with the requested respacing; the model itself is discarded.
    options = dict(clevr_options, timestep_respacing=str(int(steps)))
    _, diffusion = create_model_and_diffusion_for_clevr(**options)
    image_size = options["image_size"]

    def model_fn(x_t, ts, **kwargs):
        # Evaluate every label on the same noisy image: take the first
        # sample and replicate it once per row of ``y``.
        half = x_t[:1]
        combined = th.cat([half] * kwargs['y'].size(0), dim=0)
        model_out = clevr_model(combined, ts, **kwargs)
        # First three channels are treated as the noise prediction; the
        # remaining channels are passed through untouched.
        eps, rest = model_out[:, :3], model_out[:, 3:]
        masks = kwargs.get('masks')
        # Average the predictions of the conditional rows and of the
        # unconditional rows, then combine them with the guidance weight.
        cond_eps = eps[masks].mean(dim=0, keepdim=True)
        uncond_eps = eps[~masks].mean(dim=0, keepdim=True)
        half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
        eps = th.cat([half_eps] * x_t.size(0), dim=0)
        return th.cat([eps, rest], dim=1)

    def sample(coordinates):
        # Every entry is conditional except the trailing [-1, -1] label.
        masks = [True] * (len(coordinates) - 1) + [False]
        model_kwargs = dict(
            y=th.tensor(coordinates, dtype=th.float, device=device),
            masks=th.tensor(masks, dtype=th.bool, device=device),
        )
        samples = diffusion.p_sample_loop(
            model_fn,
            (len(coordinates), 3, image_size, image_size),
            device=device,
            clip_denoised=True,
            progress=True,
            model_kwargs=model_kwargs,
            cond_fn=None,
        )[:batch_size]
        return samples

    samples = sample(coordinates)
    out_img = samples[0].permute(1, 2, 0)
    # Map from [-1, 1] to [0, 1]; clamp so that any out-of-range value
    # cannot wrap around when converted to uint8.
    out_img = ((out_img + 1) / 2).clamp(0, 1)
    out_img = (out_img.detach().cpu() * 255.).to(th.uint8)
    out_img = out_img.numpy()
    return out_img


def stable_diffusion_compose(prompt, scale, steps):
    """Run the Stable Diffusion pipeline on ``prompt`` and return one image.

    Args:
        prompt: text prompt forwarded to the pipeline unchanged.
        scale: guidance scale forwarded to the pipeline.
        steps: number of inference steps; truncated to ``int`` because UI
            sliders may deliver floats (matches ``compose_clevr_objects``).
    """
    with autocast(device.type):
        image = pipe(
            prompt,
            guidance_scale=scale,
            num_inference_steps=int(steps),
        )["sample"][0]
    return image


def compose(prompt, version, guidance_scale, steps):
    """Dispatch a UI request to the selected generator.

    Returns the generated image, or ``None`` when generation fails; the
    error is printed rather than raised so the web UI keeps running.
    """
    try:
        with th.no_grad():
            if version == 'Stable_Diffusion_1v_4':
                return stable_diffusion_compose(prompt, guidance_scale, steps)
            return compose_clevr_objects(prompt, guidance_scale, steps)
    except Exception as e:  # top-level UI boundary: report and keep serving
        print(e)
        return None


examples_1 = 'a camel | a forest'
examples_2 = 'A blue sky | A mountain in the horizon | Cherry Blossoms in front of the mountain'
examples_3 = '0.1, 0.5 | 0.3, 0.5 | 0.5, 0.5 | 0.7, 0.5 | 0.9, 0.5'
examples_4 = 'a blue house | a desert'
examples_5 = 'a white church | lightning in the background'
examples_6 = 'a camel | arctic'
examples_7 = 'A lake | A mountain | Cherry Blossoms next to the lake'

# Each row: prompt, version, guidance scale, steps.
examples = [
    [examples_7, 'Stable_Diffusion_1v_4', 15, 50],
    [examples_5, 'Stable_Diffusion_1v_4', 15, 50],
    [examples_4, 'Stable_Diffusion_1v_4', 15, 50],
    [examples_6, 'Stable_Diffusion_1v_4', 15, 50],
    [examples_3, 'CLEVR Objects', 10, 100],
]

title = 'Compositional Visual Generation with Composable Diffusion Models'
description = '''
Demo for Composable Diffusion
See more information from our Project Page.
When composing multiple sentences, use `|` as the delimiter, see given examples below.
Note: When using Stable Diffusion, black images will be returned if the given prompt is detected as problematic. For composing GLIDE model, we recommend using the Colab demo in our Project Page.
'''

iface = gr.Interface(
    compose,
    inputs=[
        "text",
        gr.Radio(['Stable_Diffusion_1v_4', 'CLEVR Objects'], type="value", label='version'),
        gr.Slider(2, 30),
        gr.Slider(10, 200),
    ],
    outputs='image',
    title=title,
    description=description,
    examples=examples,
)

iface.launch(enable_queue=True)