Spaces:

anshuln
/

visualizing_diffusion_attention

Sleeping

File size: 5,350 Bytes

from pathlib import Path

import cv2
import sys
import gradio as gr
import os
import numpy as np
from gradio_utils import *

from transformers import CLIPTokenizer

def image_mod(image):
    return image.rotate(45)


sys.path.insert(1, os.path.join(sys.path[0], '..'))


NUM_POINTS = 3
NUM_FRAMES = 16
LARGE_BOX_SIZE = 176



data = {}
tokenizer = CLIPTokenizer.from_pretrained('openai/clip-vit-base-patch32')
def get_token_number(prompt, word):
    all_tokens = tokenizer(prompt).input_ids
    word_tokens = tokenizer(word).input_ids
    print(all_tokens, word_tokens, word)
    return all_tokens.index(word_tokens[1])  # Word_tokens start with cls

def overlay_mask(img, mask):
    mask_resized = cv2.resize(mask, (img.shape[1], img.shape[0]), interpolation=cv2.INTER_NEAREST)

    # Create a 3-channel version of the mask
    mask_3ch = cv2.cvtColor(mask_resized, cv2.COLOR_GRAY2BGR)

    # Set the opacity level
    opacity = 0.25  # Adjust as needed
    alpha_channel = np.ones_like(mask_resized) * 255  # Start with a fully opaque alpha channel

    # Set black pixels to be completely transparent
    alpha_channel[mask_resized < 5] = 0

    # Set the opacity level for non-black pixels
    opacity = 0.3  # Adjust this value as needed (0.0 to 1.0)
    alpha_channel[mask_resized != 0] = int(255 * opacity)

    # Create a 4-channel image (BGR + Alpha)
    b, g, r = cv2.split(img)
    rgba = [b, g, r, alpha_channel]
    result = cv2.merge(rgba, 4)
    # Overlay the mask on the image
    overlay = cv2.addWeighted(mask_3ch, opacity, img, 1 - opacity, 0)    
    return overlay

def fetch_proper_img(prompt, word, frame_num, diffusion_step, layer_num=0):
    
    
    frame_num = frame_num - 1
    if layer_num is None:
        layer_num = 0
    else:
        layer_num = 100 if layer_num == 3 else layer_num
   
    video_file_name = f"./data/videos/{prompt.replace(' ', '_')}/video/frame_{frame_num:04d}.png"
    img = cv2.imread(video_file_name)

    if word is None:
        overlaid_image = img
    else:
        mask_file_name = f'./data/final_masks/attention_probs_{prompt}/frame_{frame_num}_layer_{layer_num}_diffusionstep_{diffusion_step}_token_{get_token_number(prompt, word)}.png'
        mask = cv2.imread(mask_file_name, cv2.IMREAD_GRAYSCALE)
        overlaid_image = overlay_mask(img, mask)
        print(mask_file_name)

    return img, overlaid_image


def fetch_proper_img_and_change_prompt(prompt, word, frame_num, diffusion_step, layer_num=0):
    radio = change_text_prompt(prompt)
    video_1, video_2 = fetch_proper_img(prompt, word, frame_num, diffusion_step, layer_num)
    return [video_1, video_2, radio]

css = """
.word-btn {
    width: fit-content;
    padding: 3px;
}
.word-btns-container {
    flex-direction: row;
}
"""

registry = {
    'spider': 'mask_1',
    'descending': 'mask_2',
}

data_path = Path(
    'data'
)


available_prompts = ['a dog and a cat sitting','A fish swimming in the water', 'A spider descending from its web', 'An astronaut riding a horse']

with gr.Blocks(css=css) as demo:
    with gr.Row():
        video_1 = gr.Image(label="Image", )
        # video_1 = gr.Image(label="Image", width=256, height=256)
        video_2 = gr.Image(label="Image with Attention Mask", )
        # video_2 = gr.Image(label="Image with Attention Mask", width=256, height=256)
    

    def change_text_prompt(text):
        return gr.Radio(text.strip().split(' '), value=None, label='Choose a word to visualize its attention mask.')

    text = 'a dog and a cat sitting'
    gr.Markdown("""
                ## Visualizing Attention Masks
                * Select a prompt from the drop down
                * Click on "Get words" to get the words in the prompt
                * Select a radio button from the words to visualize the attention mask
                * Play around with the index of diffusion steps, layers to visualize different masks
                * Brighter mask corresponds to larger values of attention.
                """)

    with gr.Group("Video Selection"):
        txt_1 = gr.Dropdown(choices=available_prompts, label="Video Prompt", value=available_prompts[0])
        submit_btn = gr.Button('Get words')
    with gr.Group('Word Selection'):
        radio = gr.Radio(text.split(' '), value=None, label='Choose a word to visualize its attention mask.')
        range_slider = gr.Slider(1, 16, 1, step=2, label='Frame of the generated video to visualize the attention mask.')
        diffusion_slider = gr.Slider(0, 35, 0, step=5, label='Index of diffusion steps.')
        layer_num_slider = gr.Slider(0, 6, 0, step=1, label='Layer number for attention mask.')
    radio.change(fetch_proper_img, inputs=[txt_1, radio, range_slider, diffusion_slider, layer_num_slider], outputs=[video_1, video_2])
    range_slider.change(fetch_proper_img, inputs=[txt_1, radio, range_slider, diffusion_slider, layer_num_slider], outputs=[video_1, video_2])
    diffusion_slider.change(fetch_proper_img, inputs=[txt_1, radio, range_slider, diffusion_slider, layer_num_slider], outputs=[video_1, video_2])
    layer_num_slider.change(fetch_proper_img, inputs=[txt_1, radio, range_slider, diffusion_slider, layer_num_slider], outputs=[video_1, video_2])
    submit_btn.click(change_text_prompt, inputs=[txt_1], outputs=[radio])

if __name__ == "__main__":
    demo.launch(server_name='0.0.0.0')