Spaces:

padmanabhbosamia
/

Stable_Diffusion

Running

App Files Files Community

padmanabhbosamia committed on Oct 19, 2023

Commit

f6a2113

1 Parent(s): af5f9f5

Update app.py

Browse files

Files changed (1) hide show

app.py +178 -229

app.py CHANGED Viewed

@@ -1,16 +1,9 @@
-#!pip install -q --upgrade transformers diffusers ftfy
-#!pip install -q --upgrade transformers==4.25.1 diffusers ftfy
-#!pip install accelerate -q
 from base64 import b64encode
-import numpy
 import torch
 from diffusers import AutoencoderKL, LMSDiscreteScheduler, UNet2DConditionModel
-from huggingface_hub import notebook_login
-# For video display:
-from IPython.display import HTML
 from matplotlib import pyplot as plt
 from pathlib import Path
 from PIL import Image
@@ -18,57 +11,51 @@ from torch import autocast
 from torchvision import transforms as tfms
 from tqdm.auto import tqdm
 from transformers import CLIPTextModel, CLIPTokenizer, logging
-import gradio as gr
-torch.manual_seed(1)
-#if not (Path.home()/'.huggingface'/'token').exists(): notebook_login()
-# Supress some unnecessary warnings when loading the CLIPTextModel
 logging.set_verbosity_error()
-# Set device
 torch_device = "cuda" if torch.cuda.is_available() else "cpu"
-#import os
-#MY_TOKEN=os.environ.get('Learning')
-# Load the autoencoder model which will be used to decode the latents into image space.
-vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae") #,use_auth_token=MY_TOKEN)
-# Load the tokenizer and text encoder to tokenize and encode the text.
 tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
 text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
-# The UNet model for generating the latents.
-unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet")
-# The noise scheduler
 scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
-# To the GPU we go!
 vae = vae.to(torch_device)
 text_encoder = text_encoder.to(torch_device)
 unet = unet.to(torch_device)
-"""Functions"""
-def pil_to_latent(input_im):
-    # Single image -> single latent in a batch (so size 1, 4, 64, 64)
-    with torch.no_grad():
-        latent = vae.encode(tfms.ToTensor()(input_im).unsqueeze(0).to(torch_device)*2-1) # Note scaling
-    return 0.18215 * latent.latent_dist.sample()
-def latents_to_pil(latents):
-    # bath of latents -> list of images
-    latents = (1 / 0.18215) * latents
-    with torch.no_grad():
-        image = vae.decode(latents).sample
-    image = (image / 2 + 0.5).clamp(0, 1)
-    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
-    images = (image * 255).round().astype("uint8")
-    pil_images = [Image.fromarray(image) for image in images]
-    return pil_images
 def get_output_embeds(input_embeddings):
     # CLIP's text model uses causal mask, so we prepare it here:
@@ -95,177 +82,92 @@ def get_output_embeds(input_embeddings):
     # And now they're ready!
     return output
-#Generating an image with these modified embeddings
-def generate_with_embs(text_embeddings, text_input):
-    height = 512                        # default height of Stable Diffusion
-    width = 512                         # default width of Stable Diffusion
-    num_inference_steps = 7             # Number of denoising steps
-    guidance_scale = 7.5                # Scale for classifier-free guidance
-    generator = torch.manual_seed(64)   # Seed generator to create the inital latent noise
-    batch_size = 1
-    max_length = text_input.input_ids.shape[-1]
-    uncond_input = tokenizer(
-      [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
-    )
-    with torch.no_grad():
-        uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]
-    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-    # Prep Scheduler
-    scheduler.set_timesteps(num_inference_steps)
-    # Prep latents
-    latents = torch.randn(
-    (batch_size, unet.config.in_channels, height // 8, width // 8),
-    generator=generator,
-    )
-    latents = latents.to(torch_device)
-    latents = latents * scheduler.init_noise_sigma
-    # Loop
-    for i, t in tqdm(enumerate(scheduler.timesteps)):
-        # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
-        latent_model_input = torch.cat([latents] * 2)
-        sigma = scheduler.sigmas[i]
-        latent_model_input = scheduler.scale_model_input(latent_model_input, t)
-        # predict the noise residual
-        with torch.no_grad():
-            noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings)["sample"]
-        # perform guidance
-        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-        # compute the previous noisy sample x_t -> x_t-1
-        latents = scheduler.step(noise_pred, t, latents).prev_sample
-    return latents_to_pil(latents)[0]
-def ref_loss(images,ref_image):
-    # Reference image
-    error = torch.abs(images - ref_image).mean()
-    return error
-def inference(prompt, style_index):
-    styles = ['<snoopy>', '<boot-mjstyle>','<birb-style>','<pop_art>','<ronaldo>','<Thumps_up>']
-    embed = ['snoopy.bin','boot-mjstyle.bin', 'bird_style.bin',   'pop_art.bin','ronaldo.bin','Thumps_up.bin']
-    # Tokenize
-    text_input = tokenizer(prompt+" .", padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
-    # Access the embedding layer
-    token_emb_layer = text_encoder.text_model.embeddings.token_embedding
-    token_embeddings = token_emb_layer(text_input.input_ids.to(torch_device))
-    pos_emb_layer = text_encoder.text_model.embeddings.position_embedding
-    position_ids = text_encoder.text_model.embeddings.position_ids[:, :77]
-    position_embeddings = pos_emb_layer(position_ids)
-    ## Without any Textual Inversion
-    input_ids = text_input.input_ids.to(torch_device)
-    # Get token embeddings
-    token_embeddings = token_emb_layer(input_ids)
-    # Combine with pos embs
-    input_embeddings = token_embeddings + position_embeddings
-    #  Feed through to get final output embs
-    modified_output_embeddings = get_output_embeds(input_embeddings)
-    # And generate an image with this:
-    image1 = generate_with_embs(modified_output_embeddings,text_input)
-    replace_id=269  #replaced dot with Textual Inversion
-    ## midjourney-style
-    style = styles[style_index]
-    emb = embed[style_index]
-    x_embed = torch.load(emb)
-    # The new embedding - our special birb word
-    replacement_token_embedding = x_embed[style].to(torch_device)
-    # Insert this into the token embeddings
-    token_embeddings[0, torch.where(input_ids[0]==replace_id)] = replacement_token_embedding.to(torch_device)
-    # Combine with pos embs
-    input_embeddings = token_embeddings + position_embeddings
-    #  Feed through to get final output embs
-    modified_output_embeddings = get_output_embeds(input_embeddings)
-    # And generate an image with this:
-    image2 = generate_with_embs(modified_output_embeddings,text_input)
-    prompt1 = 'rainbow'
-    # Tokenize
-    text_input1 = tokenizer(prompt1, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
-    # Access the embedding layer
-    token_emb_layer = text_encoder.text_model.embeddings.token_embedding
-    pos_emb_layer = text_encoder.text_model.embeddings.position_embedding
-    position_ids = text_encoder.text_model.embeddings.position_ids[:, :77]
-    position_embeddings1 = pos_emb_layer(position_ids)
-    input_ids1 = text_input1.input_ids.to(torch_device)
-    # Get token embeddings
-    token_embeddings1 = token_emb_layer(input_ids1)
-    # Combine with pos embs
-    input_embeddings1 = token_embeddings1 + position_embeddings1
-    #  Feed through to get final output embs
-    modified_output_embeddings1 = get_output_embeds(input_embeddings1)
-    # And generate an image with this:
-    ref_image = generate_with_embs(modified_output_embeddings1, text_input1)
-    ref_latent = pil_to_latent(ref_image)
-    height = 512                        # default height of Stable Diffusion
-    width = 512                         # default width of Stable Diffusion
-    num_inference_steps = 7  #           # Number of denoising steps
-    guidance_scale = 8 #               # Scale for classifier-free guidance
-    generator = torch.manual_seed(64)   # Seed generator to create the inital latent noise
     batch_size = 1
-    blue_loss_scale = 200 #
-    # Prep text
-    text_input = tokenizer([prompt], padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
-    with torch.no_grad():
-        text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]
-    # And the uncond. input as before:
-    max_length = text_input.input_ids.shape[-1]
     uncond_input = tokenizer(
-        [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
     )
     with torch.no_grad():
         uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]
     text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
     # Prep Scheduler
-    scheduler.set_timesteps(num_inference_steps)
     # Prep latents
     latents = torch.randn(
-      (batch_size, unet.config.in_channels, height // 8, width // 8),
-      generator=generator,
     )
     latents = latents.to(torch_device)
     latents = latents * scheduler.init_noise_sigma
     # Loop
-    for i, t in tqdm(enumerate(scheduler.timesteps)):
         # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
         latent_model_input = torch.cat([latents] * 2)
         sigma = scheduler.sigmas[i]
@@ -275,63 +177,110 @@ def inference(prompt, style_index):
         with torch.no_grad():
             noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings)["sample"]
-        # perform CFG
         noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
         noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-        #### ADDITIONAL GUIDANCE ###
-        if i%5 == 0:
-            # Requires grad on the latents
-            latents = latents.detach().requires_grad_()
-            # Get the predicted x0:
-            # latents_x0 = latents - sigma * noise_pred
-            latents_x0 = scheduler.step(noise_pred, t, latents).pred_original_sample
-            # Decode to image space
-            denoised_images = vae.decode((1 / 0.18215) * latents_x0).sample / 2 + 0.5 # range (0, 1)
-            #ref image
-            with torch.no_grad():
-              ref_images = vae.decode((1 / 0.18215) * ref_latent).sample / 2 + 0.5 # range (0, 1)
-            # Calculate loss
-            loss = ref_loss(denoised_images,ref_images) * blue_loss_scale
-            # Occasionally print it out
-            # if i%10==0:
-            #     print(i, 'loss:', loss.item())
-            # Get gradient
-            cond_grad = torch.autograd.grad(loss, latents)[0]
-            # Modify the latents based on this gradient
-            latents = latents.detach() - cond_grad * sigma**2
-            scheduler._step_index = scheduler._step_index - 1
-        # Now step with scheduler
-        latents = scheduler.step(noise_pred, t, latents).prev_sample
-        #latents = scheduler.step(noise_pred, t, latents).pred_original_sample
-    image3 = latents_to_pil(latents)[0]
-    return (image1, 'Original Image'), (image2, 'Styled Image'), (image3, 'After Textual Inversion')
-# Gradio App with num_inference_steps=10
-title="Textual Inversion in Stable Diffusion"
-description="<p style='text-align: center;'>Textual Inversion in Stable Diffusion.</b></p>"
-gallery = gr.Gallery(label="Generated images", show_label=True, elem_id="gallery", columns=3).style(grid=[2], height="auto")
-gr.Interface(fn=inference, inputs=["text",
-    gr.Radio([('<snoopy>',0), ('<boot-mjstyle>',1),('<birb-style>',2),
-              ('<pop_art>',3),(' <ronaldo>',4),('<Thumps_up>',5)], value = 0, label = 'Style')],
-    outputs = gallery, title = title,
-examples = [['a girl playing in snow',0],
-                #['an oil painting of a goddess',6],
-                #['a rabbit on the moon', 5 ]
-           ],
-            ).launch(debug=True)

 from base64 import b64encode
+import gradio as gr
+import numpy as np
 import torch
 from diffusers import AutoencoderKL, LMSDiscreteScheduler, UNet2DConditionModel
 from matplotlib import pyplot as plt
 from pathlib import Path
 from PIL import Image
 from torchvision import transforms as tfms
 from tqdm.auto import tqdm
 from transformers import CLIPTextModel, CLIPTokenizer, logging
+import os
+import cv2
+import torchvision.transforms as T
+# Seed PyTorch's global random number generator
+torch.manual_seed(1)
+# Only let the transformers logger report errors
 logging.set_verbosity_error()
+# Run on CUDA when it is available, otherwise on the CPU
 torch_device = "cuda" if torch.cuda.is_available() else "cpu"
+# Load the autoencoder (VAE) from the Stable Diffusion v1-4 checkpoint
+vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder='vae')
+# Load tokenizer and text encoder to tokenize and encode the text
 tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
 text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
+# UNet model for generating latents
+unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder='unet')
+# Noise scheduler
 scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
+# Move every model to the selected device (a GPU only when CUDA is available)
 vae = vae.to(torch_device)
 text_encoder = text_encoder.to(torch_device)
 unet = unet.to(torch_device)
+# Embedding files, one per style; image_generator indexes this list and
+# seed_values with the same position, so the two must stay the same length.
+style_files = ['Thumps_up.bin', 'birb_style.bin',
+                     'snoopy.bin', 'pop_art.bin',
+                     'boot-mjstyle.bin']
+# NOTE(review): image_generator rebinds local lists with these two names, so
+# nothing visible in this file appends to the module-level ones.
+images_without_loss = []
+images_with_loss = []
+# Random seed used for the style at the same index in style_files
+seed_values = [8,16,50,80,128]
+height = 512                        # default height of Stable Diffusion
+width = 512                         # default width of Stable Diffusion
+num_inference_steps = 5            # Number of denoising steps
+guidance_scale = 7.5                # Scale for classifier-free guidance
+# Number of styles that image_generator loops over
+num_styles = len(style_files)
+# Prep Scheduler
+def set_timesteps(scheduler, num_inference_steps):
+    """Prepare ``scheduler`` for a run and store its timesteps as float32.
+
+    Args:
+        scheduler: Object exposing ``set_timesteps(n)`` and a ``timesteps`` tensor.
+        num_inference_steps: Number of denoising steps to configure.
+
+    The scheduler is modified in place; nothing is returned.
+    """
+    scheduler.set_timesteps(num_inference_steps)
+    # minor fix to ensure MPS compatibility, fixed in diffusers PR 3925
+    float_timesteps = scheduler.timesteps.to(torch.float32)
+    scheduler.timesteps = float_timesteps
 def get_output_embeds(input_embeddings):
     # CLIP's text model uses causal mask, so we prepare it here:
     # And now they're ready!
     return output
+def get_style_embeddings(style_file):
+    """Load a style embedding file and return the first embedding stored in it.
+
+    Args:
+        style_file: Path (or file-like object) passed to ``torch.load``; the
+            loaded object is used as a mapping whose first key names the style.
+
+    Returns:
+        The value stored under the first key of the loaded mapping.
+
+    Raises:
+        ValueError: If the loaded mapping contains no entries.
+    """
+    # NOTE(review): torch.load unpickles the file, so only embedding files
+    # from a trusted source should be listed in style_files.
+    # map_location keeps the load working when the file was saved from a GPU
+    # but this process only has a CPU.
+    style_embed = torch.load(style_file, map_location=torch_device)
+    style_name = next(iter(style_embed), None)
+    if style_name is None:
+        raise ValueError(f"No embeddings found in {style_file!r}")
+    return style_embed[style_name]
+import torch
+def vibrance_loss(image, scale_factor=100.0):
+    """Return a loss that gets smaller as the per-channel pixel spread grows.
+
+    Args:
+        image: Tensor with at least four dimensions. The standard deviation is
+            taken over dims 2 and 3 (height and width, assuming a
+            (batch, channel, H, W) layout — confirm against the caller).
+        scale_factor: Strength of the vibrance regularization. Defaults to
+            100.0, the value that used to be hard-coded.
+
+    Returns:
+        Scalar tensor equal to ``-scale_factor * mean(std)``.
+    """
+    # Standard deviation of every channel of every image over dims 2 and 3
+    std_dev = torch.std(image, dim=(2, 3))
+    # Negated so that minimizing the loss pushes the spread up
+    return -scale_factor * torch.mean(std_dev)
+from torchvision.transforms import ToTensor
+def pil_to_latent(input_im):
+    """Encode one image with the VAE and return a sampled latent scaled by 0.18215."""
+    # Single image -> single latent in a batch (so size 1, 4, 64, 64)
+    pixels = tfms.ToTensor()(input_im).unsqueeze(0).to(torch_device)
+    with torch.no_grad():
+        encoded = vae.encode(pixels * 2 - 1)  # Note scaling
+    sampled = encoded.latent_dist.sample()
+    return 0.18215 * sampled
+def latents_to_pil(latents):
+    """Decode a batch of latents with the VAE and return a list of PIL images."""
+    # batch of latents -> list of images
+    scaled = (1 / 0.18215) * latents
+    with torch.no_grad():
+        decoded = vae.decode(scaled).sample
+    # Shift by /2 + 0.5 and clamp to [0, 1] before converting to 8-bit
+    unit = (decoded / 2 + 0.5).clamp(0, 1)
+    # Channels move to the last axis for Image.fromarray
+    arrays = unit.detach().cpu().permute(0, 2, 3, 1).numpy()
+    arrays = (arrays * 255).round().astype("uint8")
+    return [Image.fromarray(arr) for arr in arrays]
+def additional_guidance(latents, scheduler, noise_pred, t, sigma, custom_loss_fn):
+    """Move ``latents`` against the gradient of ``custom_loss_fn`` on the decoded x0 estimate.
+
+    Args:
+        latents: Current latents; detached before use.
+        scheduler: Accepted but not used by this implementation.
+        noise_pred: Noise prediction used to form the x0 estimate.
+        t: Accepted but not used by this implementation.
+        sigma: Noise level; scales both the x0 estimate and the update step.
+        custom_loss_fn: Callable applied to the decoded images; returns the loss.
+
+    Returns:
+        Tuple ``(updated_latents, loss)``.
+    """
+    # Fresh leaf tensor so the loss can be differentiated w.r.t. the latents
+    grad_latents = latents.detach().requires_grad_()
+    # Predicted x0
+    x0_estimate = grad_latents - sigma * noise_pred
+    # Decode to image space and shift by /2 + 0.5 (range (0, 1) per the original note)
+    decoded = vae.decode((1 / 0.18215) * x0_estimate).sample / 2 + 0.5
+    loss = custom_loss_fn(decoded)
+    (cond_grad,) = torch.autograd.grad(loss, grad_latents, allow_unused=False)
+    # Step against the gradient, scaled by sigma squared
+    updated = grad_latents.detach() - cond_grad * sigma**2
+    return updated, loss
+def generate_with_embs(text_embeddings, max_length, random_seed, loss_fn = None):
+    generator = torch.manual_seed(random_seed)   # Seed generator to create the inital latent noise
     batch_size = 1
     uncond_input = tokenizer(
+      [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
     )
     with torch.no_grad():
         uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]
     text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
     # Prep Scheduler
+    set_timesteps(scheduler, num_inference_steps)
     # Prep latents
     latents = torch.randn(
+    (batch_size, unet.in_channels, height // 8, width // 8),
+    generator=generator,
     )
     latents = latents.to(torch_device)
     latents = latents * scheduler.init_noise_sigma
     # Loop
+    for i, t in tqdm(enumerate(scheduler.timesteps), total=len(scheduler.timesteps)):
         # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
         latent_model_input = torch.cat([latents] * 2)
         sigma = scheduler.sigmas[i]
         with torch.no_grad():
             noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings)["sample"]
+        # perform guidance
         noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
         noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+        if loss_fn is not None:
+            if i%2 == 0:
+                latents, custom_loss = additional_guidance(latents, scheduler, noise_pred, t, sigma, loss_fn)
+        # compute the previous noisy sample x_t -> x_t-1
+        latents = scheduler.step(noise_pred, t, latents).prev_sample
+    return latents_to_pil(latents)[0]
+def generate_images(prompt, style_num=None, random_seed=41, custom_loss_fn = None):
+    """Generate one image for ``prompt``, optionally injecting a style embedding.
+
+    Args:
+        prompt: Text prompt (a single string).
+        style_num: Index into ``style_files``; ``None`` means no style. Index 0
+            is a valid style.
+        random_seed: Seed forwarded to ``generate_with_embs``.
+        custom_loss_fn: Optional loss callable forwarded to ``generate_with_embs``.
+
+    Returns:
+        The image returned by ``generate_with_embs``.
+    """
+    style_token_embedding = None
+    # Compare against None: a plain truthiness test would skip style index 0
+    if style_num is not None:
+        style_token_embedding = get_style_embeddings(style_files[style_num])
+    # tokenize
+    text_input = tokenizer(prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
+    max_length = text_input.input_ids.shape[-1]
+    input_ids = text_input.input_ids.to(torch_device)
+    # get token embeddings
+    token_emb_layer = text_encoder.text_model.embeddings.token_embedding
+    token_embeddings = token_emb_layer(input_ids)
+    # Put the style token in the slot right after the prompt tokens
+    if style_token_embedding is not None:
+        # Locate that slot from the token ids: the first end-of-text id marks
+        # the end of the prompt. A whitespace word count picks the wrong slot
+        # when a word spans several tokens and can run past the sequence.
+        eos_positions = (input_ids[0] == tokenizer.eos_token_id).nonzero(as_tuple=True)[0]
+        if eos_positions.numel():
+            eos_pos = int(eos_positions[0])
+        else:
+            # Fall back to the word-count estimate, kept inside the sequence
+            eos_pos = min(len(prompt.split()) + 1, max_length - 1)
+        token_embeddings[-1, eos_pos, :] = style_token_embedding
+    # combine with pos embs
+    pos_emb_layer = text_encoder.text_model.embeddings.position_embedding
+    position_ids = text_encoder.text_model.embeddings.position_ids[:, :max_length]
+    position_embeddings = pos_emb_layer(position_ids)
+    input_embeddings = token_embeddings + position_embeddings
+    #  Feed through to get final output embs
+    modified_output_embeddings = get_output_embeds(input_embeddings)
+    # And generate an image with this:
+    generated_image = generate_with_embs(modified_output_embeddings, max_length, random_seed, custom_loss_fn)
+    return generated_image
+import matplotlib.pyplot as plt
+def display_images_in_rows(images_with_titles, titles):
+    """Arrange the images on a five-row matplotlib figure and return the figure.
+
+    Args:
+        images_with_titles: Sequence of ``(image, title)`` pairs; only the image
+            of each pair is drawn.
+        titles: Row labels; ``titles[r]`` is written in the first column of row ``r``.
+
+    Returns:
+        The created figure.
+    """
+    num_images = len(images_with_titles)
+    rows = 5  # Display 5 rows always
+    # One image column when there are exactly 5 images, otherwise two
+    columns = 1 if num_images == 5 else 2
+    # The extra leading column holds the row titles
+    fig, axes = plt.subplots(rows, columns + 1, figsize=(15, 5 * rows))
+    for r, row_axes in enumerate(axes):
+        # Row title, centred in the left-most cell
+        label_ax = row_axes[0]
+        label_ax.text(0.5, 0.5, titles[r], ha='center', va='center')
+        label_ax.axis('off')
+        # Column headings only make sense in the two-column layout
+        if columns == 2:
+            row_axes[1].set_title("Without Loss", pad=10)
+            row_axes[2].set_title("With Loss", pad=10)
+        for c in range(1, columns + 1):
+            index = r * columns + (c - 1)
+            if index >= num_images:
+                continue
+            image, _ = images_with_titles[index]
+            row_axes[c].imshow(image)
+            row_axes[c].axis('off')
+    return fig
+def image_generator(prompt = "dog", loss_function=None):
+    """Generate one image per style for ``prompt`` and return them as a figure.
+
+    Args:
+        prompt: Text prompt forwarded to ``generate_images``.
+        loss_function: ``"Yes"`` additionally generates each style with
+            ``vibrance_loss`` guidance; any other value disables it.
+
+    Returns:
+        The figure built by ``display_images_in_rows``.
+    """
+    loss_fn = vibrance_loss if loss_function == "Yes" else None
+    # Listed in the same order as style_files so each label matches the
+    # style that was actually used for that row.
+    titles = ["Thumpsup Style", "Bird_style", "Snoopy Style", "Pop Art Style", "Boot-mjstyle"]
+    generated_sd_images = []
+    for i in range(num_styles):
+        plain_img = generate_images(prompt, style_num=i, random_seed=seed_values[i], custom_loss_fn=None)
+        generated_sd_images.append((plain_img, titles[i]))
+        if loss_fn is not None:
+            # Same style and seed again, this time with the extra guidance
+            guided_img = generate_images(prompt, style_num=i, random_seed=seed_values[i], custom_loss_fn=loss_fn)
+            generated_sd_images.append((guided_img, titles[i]))
+    return display_images_in_rows(generated_sd_images, titles)
+# Text passed to the interface as its description
+description = "Generate an image with a prompt and apply vibrance loss if you wish to. Note that the app is hosted on a cpu and it takes atleast 15 minutes for generating images without loss. Please feel free to clone the space and use it with a GPU after increase the inference steps to more than 10 for better results"
+# Gradio UI: a prompt textbox and a Yes/No radio feed image_generator, whose
+# returned figure is shown in a Plot component.
+demo = gr.Interface(image_generator,
+                    inputs=[gr.Textbox(label="Enter prompt for generation", type="text", value="dog sitting on a bench"),
+                            gr.Radio(["Yes", "No"], value="No"  , label="Apply vibrance loss")],
+                    outputs=gr.Plot(label="Generated Images"), title = "Stable Diffusion using Textual Inversion", description=description)
+# Start the app as soon as the module is executed
+demo.launch()