Spaces:
Build error
Build error
Duplicate from songweig/rich-text-to-image
Browse filesCo-authored-by: Songwei Ge <[email protected]>
- .gitattributes +34 -0
- .gitignore +3 -0
- README.md +13 -0
- app.py +557 -0
- models/attention.py +904 -0
- models/region_diffusion.py +461 -0
- models/unet_2d_blocks.py +1855 -0
- models/unet_2d_condition.py +411 -0
- requirements.txt +9 -0
- rich-text-to-json-iframe.html +341 -0
- rich-text-to-json.js +349 -0
- share_btn.py +116 -0
- utils/.DS_Store +0 -0
- utils/attention_utils.py +318 -0
- utils/richtext_utils.py +234 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
venv
|
2 |
+
__pycache__/
|
3 |
+
*.pyc
|
README.md
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Rich Text To Image
|
3 |
+
emoji: 🌍
|
4 |
+
colorFrom: indigo
|
5 |
+
colorTo: pink
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 3.27.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
duplicated_from: songweig/rich-text-to-image
|
11 |
+
---
|
12 |
+
|
13 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,557 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
import random
|
3 |
+
import os
|
4 |
+
import json
|
5 |
+
import time
|
6 |
+
import argparse
|
7 |
+
import torch
|
8 |
+
import numpy as np
|
9 |
+
from torchvision import transforms
|
10 |
+
|
11 |
+
from models.region_diffusion import RegionDiffusion
|
12 |
+
from utils.attention_utils import get_token_maps
|
13 |
+
from utils.richtext_utils import seed_everything, parse_json, get_region_diffusion_input,\
|
14 |
+
get_attention_control_input, get_gradient_guidance_input
|
15 |
+
|
16 |
+
|
17 |
+
import gradio as gr
|
18 |
+
from PIL import Image, ImageOps
|
19 |
+
from share_btn import community_icon_html, loading_icon_html, share_js, css
|
20 |
+
|
21 |
+
|
22 |
+
help_text = """
|
23 |
+
If you are encountering an error or not achieving your desired outcome, here are some potential reasons and recommendations to consider:
|
24 |
+
1. If you format only a portion of a word rather than the complete word, an error may occur.
|
25 |
+
2. If you use font color and get completely corrupted results, you may consider decrease the color weight lambda.
|
26 |
+
3. Consider using a different seed.
|
27 |
+
"""
|
28 |
+
|
29 |
+
|
30 |
+
canvas_html = """<iframe id='rich-text-root' style='width:100%' height='360px' src='file=rich-text-to-json-iframe.html' frameborder='0' scrolling='no'></iframe>"""
|
31 |
+
get_js_data = """
|
32 |
+
async (text_input, negative_prompt, num_segments, segment_threshold, inject_interval, inject_background, seed, color_guidance_weight, rich_text_input, height, width, steps, guidance_weights) => {
|
33 |
+
const richEl = document.getElementById("rich-text-root");
|
34 |
+
const data = richEl? richEl.contentDocument.body._data : {};
|
35 |
+
return [text_input, negative_prompt, num_segments, segment_threshold, inject_interval, inject_background, seed, color_guidance_weight, JSON.stringify(data), height, width, steps, guidance_weights];
|
36 |
+
}
|
37 |
+
"""
|
38 |
+
set_js_data = """
|
39 |
+
async (text_input) => {
|
40 |
+
const richEl = document.getElementById("rich-text-root");
|
41 |
+
const data = text_input ? JSON.parse(text_input) : null;
|
42 |
+
if (richEl && data) richEl.contentDocument.body.setQuillContents(data);
|
43 |
+
}
|
44 |
+
"""
|
45 |
+
|
46 |
+
get_window_url_params = """
|
47 |
+
async (url_params) => {
|
48 |
+
const params = new URLSearchParams(window.location.search);
|
49 |
+
url_params = Object.fromEntries(params);
|
50 |
+
return [url_params];
|
51 |
+
}
|
52 |
+
"""
|
53 |
+
|
54 |
+
|
55 |
+
def load_url_params(url_params):
|
56 |
+
if 'prompt' in url_params:
|
57 |
+
return gr.update(visible=True), url_params
|
58 |
+
else:
|
59 |
+
return gr.update(visible=False), url_params
|
60 |
+
|
61 |
+
|
62 |
+
def main():
|
63 |
+
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
64 |
+
model = RegionDiffusion(device)
|
65 |
+
|
66 |
+
def generate(
|
67 |
+
text_input: str,
|
68 |
+
negative_text: str,
|
69 |
+
num_segments: int,
|
70 |
+
segment_threshold: float,
|
71 |
+
inject_interval: float,
|
72 |
+
inject_background: float,
|
73 |
+
seed: int,
|
74 |
+
color_guidance_weight: float,
|
75 |
+
rich_text_input: str,
|
76 |
+
height: int,
|
77 |
+
width: int,
|
78 |
+
steps: int,
|
79 |
+
guidance_weight: float,
|
80 |
+
):
|
81 |
+
run_dir = 'results/'
|
82 |
+
os.makedirs(run_dir, exist_ok=True)
|
83 |
+
# Load region diffusion model.
|
84 |
+
height = int(height) if height else 512
|
85 |
+
width = int(width) if width else 512
|
86 |
+
steps = 41 if not steps else steps
|
87 |
+
guidance_weight = 8.5 if not guidance_weight else guidance_weight
|
88 |
+
text_input = rich_text_input if rich_text_input != '' and rich_text_input != None else text_input
|
89 |
+
print('text_input', text_input, width, height, steps, guidance_weight, num_segments, segment_threshold, inject_interval, inject_background, color_guidance_weight, negative_text)
|
90 |
+
if (text_input == '' or rich_text_input == ''):
|
91 |
+
raise gr.Error("Please enter some text.")
|
92 |
+
# parse json to span attributes
|
93 |
+
base_text_prompt, style_text_prompts, footnote_text_prompts, footnote_target_tokens,\
|
94 |
+
color_text_prompts, color_names, color_rgbs, size_text_prompts_and_sizes, use_grad_guidance = parse_json(
|
95 |
+
json.loads(text_input))
|
96 |
+
|
97 |
+
# create control input for region diffusion
|
98 |
+
region_text_prompts, region_target_token_ids, base_tokens = get_region_diffusion_input(
|
99 |
+
model, base_text_prompt, style_text_prompts, footnote_text_prompts,
|
100 |
+
footnote_target_tokens, color_text_prompts, color_names)
|
101 |
+
|
102 |
+
# create control input for cross attention
|
103 |
+
text_format_dict = get_attention_control_input(
|
104 |
+
model, base_tokens, size_text_prompts_and_sizes)
|
105 |
+
|
106 |
+
# create control input for region guidance
|
107 |
+
text_format_dict, color_target_token_ids = get_gradient_guidance_input(
|
108 |
+
model, base_tokens, color_text_prompts, color_rgbs, text_format_dict, color_guidance_weight=color_guidance_weight)
|
109 |
+
|
110 |
+
seed_everything(seed)
|
111 |
+
|
112 |
+
# get token maps from plain text to image generation.
|
113 |
+
begin_time = time.time()
|
114 |
+
if model.selfattn_maps is None and model.crossattn_maps is None:
|
115 |
+
model.remove_tokenmap_hooks()
|
116 |
+
model.register_tokenmap_hooks()
|
117 |
+
else:
|
118 |
+
model.reset_attention_maps()
|
119 |
+
model.remove_tokenmap_hooks()
|
120 |
+
plain_img = model.produce_attn_maps([base_text_prompt], [negative_text],
|
121 |
+
height=height, width=width, num_inference_steps=steps,
|
122 |
+
guidance_scale=guidance_weight)
|
123 |
+
print('time lapses to get attention maps: %.4f' %
|
124 |
+
(time.time()-begin_time))
|
125 |
+
seed_everything(seed)
|
126 |
+
color_obj_masks, segments_vis, token_maps = get_token_maps(model.selfattn_maps, model.crossattn_maps, model.n_maps, run_dir,
|
127 |
+
512//8, 512//8, color_target_token_ids[:-1], seed,
|
128 |
+
base_tokens, segment_threshold=segment_threshold, num_segments=num_segments,
|
129 |
+
return_vis=True)
|
130 |
+
seed_everything(seed)
|
131 |
+
model.masks, segments_vis, token_maps = get_token_maps(model.selfattn_maps, model.crossattn_maps, model.n_maps, run_dir,
|
132 |
+
512//8, 512//8, region_target_token_ids[:-1], seed,
|
133 |
+
base_tokens, segment_threshold=segment_threshold, num_segments=num_segments,
|
134 |
+
return_vis=True)
|
135 |
+
color_obj_atten_all = torch.zeros_like(color_obj_masks[-1])
|
136 |
+
for obj_mask in color_obj_masks[:-1]:
|
137 |
+
color_obj_atten_all += obj_mask
|
138 |
+
color_obj_masks = [transforms.functional.resize(color_obj_mask, (height, width),
|
139 |
+
interpolation=transforms.InterpolationMode.BICUBIC,
|
140 |
+
antialias=True)
|
141 |
+
for color_obj_mask in color_obj_masks]
|
142 |
+
text_format_dict['color_obj_atten'] = color_obj_masks
|
143 |
+
text_format_dict['color_obj_atten_all'] = color_obj_atten_all
|
144 |
+
model.remove_tokenmap_hooks()
|
145 |
+
|
146 |
+
# generate image from rich text
|
147 |
+
begin_time = time.time()
|
148 |
+
seed_everything(seed)
|
149 |
+
rich_img = model.prompt_to_img(region_text_prompts, [negative_text],
|
150 |
+
height=height, width=width, num_inference_steps=steps,
|
151 |
+
guidance_scale=guidance_weight, use_guidance=use_grad_guidance,
|
152 |
+
text_format_dict=text_format_dict, inject_selfattn=inject_interval,
|
153 |
+
inject_background=inject_background)
|
154 |
+
print('time lapses to generate image from rich text: %.4f' %
|
155 |
+
(time.time()-begin_time))
|
156 |
+
return [plain_img[0], rich_img[0], segments_vis, token_maps]
|
157 |
+
|
158 |
+
with gr.Blocks(css=css) as demo:
|
159 |
+
url_params = gr.JSON({}, visible=False, label="URL Params")
|
160 |
+
gr.HTML("""<h1 style="font-weight: 900; margin-bottom: 7px;">Expressive Text-to-Image Generation with Rich Text</h1>
|
161 |
+
<p> <a href="https://songweige.github.io/">Songwei Ge</a>, <a href="https://taesung.me/">Taesung Park</a>, <a href="https://www.cs.cmu.edu/~junyanz/">Jun-Yan Zhu</a>, <a href="https://jbhuang0604.github.io/">Jia-Bin Huang</a> <p/>
|
162 |
+
<p> UMD, Adobe, CMU <p/>
|
163 |
+
<p> <a href="https://huggingface.co/spaces/songweig/rich-text-to-image?duplicate=true"><img src="https://bit.ly/3gLdBN6" style="display:inline;"alt="Duplicate Space"></a> | <a href="https://rich-text-to-image.github.io">[Website]</a> | <a href="https://github.com/SongweiGe/rich-text-to-image">[Code]</a> | <a href="https://arxiv.org/abs/2304.06720">[Paper]</a><p/>
|
164 |
+
<p> For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings.""")
|
165 |
+
with gr.Row():
|
166 |
+
with gr.Column():
|
167 |
+
rich_text_el = gr.HTML(canvas_html, elem_id="canvas_html")
|
168 |
+
rich_text_input = gr.Textbox(value="", visible=False)
|
169 |
+
text_input = gr.Textbox(
|
170 |
+
label='Rich-text JSON Input',
|
171 |
+
visible=False,
|
172 |
+
max_lines=1,
|
173 |
+
placeholder='Example: \'{"ops":[{"insert":"a Gothic "},{"attributes":{"color":"#b26b00"},"insert":"church"},{"insert":" in a the sunset with a beautiful landscape in the background.\n"}]}\'',
|
174 |
+
elem_id="text_input"
|
175 |
+
)
|
176 |
+
negative_prompt = gr.Textbox(
|
177 |
+
label='Negative Prompt',
|
178 |
+
max_lines=1,
|
179 |
+
placeholder='Example: poor quality, blurry, dark, low resolution, low quality, worst quality',
|
180 |
+
elem_id="negative_prompt"
|
181 |
+
)
|
182 |
+
segment_threshold = gr.Slider(label='Token map threshold',
|
183 |
+
info='(See less area in token maps? Decrease this. See too much area? Increase this.)',
|
184 |
+
minimum=0,
|
185 |
+
maximum=1,
|
186 |
+
step=0.01,
|
187 |
+
value=0.25)
|
188 |
+
inject_interval = gr.Slider(label='Detail preservation',
|
189 |
+
info='(To preserve more structure from plain-text generation, increase this. To see more rich-text attributes, decrease this.)',
|
190 |
+
minimum=0,
|
191 |
+
maximum=1,
|
192 |
+
step=0.01,
|
193 |
+
value=0.)
|
194 |
+
inject_background = gr.Slider(label='Unformatted token preservation',
|
195 |
+
info='(To affect less the tokens without any rich-text attributes, increase this.)',
|
196 |
+
minimum=0,
|
197 |
+
maximum=1,
|
198 |
+
step=0.01,
|
199 |
+
value=0.3)
|
200 |
+
color_guidance_weight = gr.Slider(label='Color weight',
|
201 |
+
info='(To obtain more precise color, increase this, while too large value may cause artifacts.)',
|
202 |
+
minimum=0,
|
203 |
+
maximum=2,
|
204 |
+
step=0.1,
|
205 |
+
value=0.5)
|
206 |
+
num_segments = gr.Slider(label='Number of segments',
|
207 |
+
minimum=2,
|
208 |
+
maximum=20,
|
209 |
+
step=1,
|
210 |
+
value=9)
|
211 |
+
seed = gr.Slider(label='Seed',
|
212 |
+
minimum=0,
|
213 |
+
maximum=100000,
|
214 |
+
step=1,
|
215 |
+
value=6,
|
216 |
+
elem_id="seed"
|
217 |
+
)
|
218 |
+
with gr.Accordion('Other Parameters', open=False):
|
219 |
+
steps = gr.Slider(label='Number of Steps',
|
220 |
+
minimum=0,
|
221 |
+
maximum=500,
|
222 |
+
step=1,
|
223 |
+
value=41)
|
224 |
+
guidance_weight = gr.Slider(label='CFG weight',
|
225 |
+
minimum=0,
|
226 |
+
maximum=50,
|
227 |
+
step=0.1,
|
228 |
+
value=8.5)
|
229 |
+
width = gr.Dropdown(choices=[512],
|
230 |
+
value=512,
|
231 |
+
label='Width',
|
232 |
+
visible=True)
|
233 |
+
height = gr.Dropdown(choices=[512],
|
234 |
+
value=512,
|
235 |
+
label='height',
|
236 |
+
visible=True)
|
237 |
+
|
238 |
+
with gr.Row():
|
239 |
+
with gr.Column(scale=1, min_width=100):
|
240 |
+
generate_button = gr.Button("Generate")
|
241 |
+
load_params_button = gr.Button(
|
242 |
+
"Load from URL Params", visible=True)
|
243 |
+
with gr.Column():
|
244 |
+
richtext_result = gr.Image(
|
245 |
+
label='Rich-text', elem_id="rich-text-image")
|
246 |
+
richtext_result.style(height=512)
|
247 |
+
with gr.Row():
|
248 |
+
plaintext_result = gr.Image(
|
249 |
+
label='Plain-text', elem_id="plain-text-image")
|
250 |
+
segments = gr.Image(label='Segmentation')
|
251 |
+
with gr.Row():
|
252 |
+
token_map = gr.Image(label='Token Maps')
|
253 |
+
with gr.Row(visible=False) as share_row:
|
254 |
+
with gr.Group(elem_id="share-btn-container"):
|
255 |
+
community_icon = gr.HTML(community_icon_html)
|
256 |
+
loading_icon = gr.HTML(loading_icon_html)
|
257 |
+
share_button = gr.Button(
|
258 |
+
"Share to community", elem_id="share-btn")
|
259 |
+
share_button.click(None, [], [], _js=share_js)
|
260 |
+
with gr.Row():
|
261 |
+
gr.Markdown(help_text)
|
262 |
+
|
263 |
+
with gr.Row():
|
264 |
+
footnote_examples = [
|
265 |
+
[
|
266 |
+
'{"ops":[{"insert":"A close-up 4k dslr photo of a "},{"attributes":{"link":"A cat wearing sunglasses and a bandana around its neck."},"insert":"cat"},{"insert":" riding a scooter. Palm trees in the background."}]}',
|
267 |
+
'',
|
268 |
+
5,
|
269 |
+
0.3,
|
270 |
+
0,
|
271 |
+
0.5,
|
272 |
+
6,
|
273 |
+
0,
|
274 |
+
None,
|
275 |
+
],
|
276 |
+
[
|
277 |
+
'{"ops":[{"insert":"A "},{"attributes":{"link":"Thor Kitchen 30 Inch Wide Freestanding Gas Range with Automatic Re-Ignition System"},"insert":"kitchen island"},{"insert":" next to a "},{"attributes":{"link":"an open refrigerator stocked with fresh produce, dairy products, and beverages. "},"insert":"refrigerator"},{"insert":", by James McDonald and Joarc Architects, home, interior, octane render, deviantart, cinematic, key art, hyperrealism, sun light, sunrays, canon eos c 300, ƒ 1.8, 35 mm, 8k, medium - format print"}]}',
|
278 |
+
'',
|
279 |
+
7,
|
280 |
+
0.5,
|
281 |
+
0,
|
282 |
+
0.5,
|
283 |
+
6,
|
284 |
+
0,
|
285 |
+
None,
|
286 |
+
],
|
287 |
+
[
|
288 |
+
'{"ops":[{"insert":"A "},{"attributes":{"link":"Happy Kung fu panda art, elder, asian art, volumetric lighting, dramatic scene, ultra detailed, realism, chinese"},"insert":"panda"},{"insert":" standing on a cliff by a waterfall, wildlife photography, photograph, high quality, wildlife, f 1.8, soft focus, 8k, national geographic, award - winning photograph by nick nichols"}]}',
|
289 |
+
'',
|
290 |
+
5,
|
291 |
+
0.3,
|
292 |
+
0,
|
293 |
+
0.1,
|
294 |
+
4,
|
295 |
+
0,
|
296 |
+
None,
|
297 |
+
],
|
298 |
+
]
|
299 |
+
|
300 |
+
gr.Examples(examples=footnote_examples,
|
301 |
+
label='Footnote examples',
|
302 |
+
inputs=[
|
303 |
+
text_input,
|
304 |
+
negative_prompt,
|
305 |
+
num_segments,
|
306 |
+
segment_threshold,
|
307 |
+
inject_interval,
|
308 |
+
inject_background,
|
309 |
+
seed,
|
310 |
+
color_guidance_weight,
|
311 |
+
rich_text_input,
|
312 |
+
],
|
313 |
+
outputs=[
|
314 |
+
plaintext_result,
|
315 |
+
richtext_result,
|
316 |
+
segments,
|
317 |
+
token_map,
|
318 |
+
],
|
319 |
+
fn=generate,
|
320 |
+
cache_examples=True,
|
321 |
+
examples_per_page=20)
|
322 |
+
with gr.Row():
|
323 |
+
color_examples = [
|
324 |
+
[
|
325 |
+
'{"ops":[{"insert":"a beautifule girl with big eye, skin, and long "},{"attributes":{"color":"#04a704"},"insert":"hair"},{"insert":", t-shirt, bursting with vivid color, intricate, elegant, highly detailed, photorealistic, digital painting, artstation, illustration, concept art."}]}',
|
326 |
+
'lowres, had anatomy, bad hands, cropped, worst quality',
|
327 |
+
11,
|
328 |
+
0.3,
|
329 |
+
0.3,
|
330 |
+
0.3,
|
331 |
+
6,
|
332 |
+
0.5,
|
333 |
+
None,
|
334 |
+
],
|
335 |
+
[
|
336 |
+
'{"ops":[{"insert":"a beautifule girl with big eye, skin, and long "},{"attributes":{"color":"#999999"},"insert":"hair"},{"insert":", t-shirt, bursting with vivid color, intricate, elegant, highly detailed, photorealistic, digital painting, artstation, illustration, concept art."}]}',
|
337 |
+
'lowres, had anatomy, bad hands, cropped, worst quality',
|
338 |
+
11,
|
339 |
+
0.3,
|
340 |
+
0.3,
|
341 |
+
0.3,
|
342 |
+
6,
|
343 |
+
0.5,
|
344 |
+
None,
|
345 |
+
],
|
346 |
+
[
|
347 |
+
'{"ops":[{"insert":"a Gothic "},{"attributes":{"color":"#FD6C9E"},"insert":"church"},{"insert":" in a the sunset with a beautiful landscape in the background."}]}',
|
348 |
+
'',
|
349 |
+
10,
|
350 |
+
0.4,
|
351 |
+
0.5,
|
352 |
+
0.3,
|
353 |
+
6,
|
354 |
+
0.5,
|
355 |
+
None,
|
356 |
+
],
|
357 |
+
[
|
358 |
+
'{"ops":[{"insert":"A mesmerizing sight that captures the beauty of a "},{"attributes":{"color":"#4775fc"},"insert":"rose"},{"insert":" blooming, close up"}]}',
|
359 |
+
'',
|
360 |
+
3,
|
361 |
+
0.3,
|
362 |
+
0,
|
363 |
+
0,
|
364 |
+
9,
|
365 |
+
1,
|
366 |
+
None,
|
367 |
+
],
|
368 |
+
[
|
369 |
+
'{"ops":[{"insert":"A "},{"attributes":{"color":"#FFD700"},"insert":"marble statue of a wolf\'s head and shoulder"},{"insert":", surrounded by colorful flowers michelangelo, detailed, intricate, full of color, led lighting, trending on artstation, 4 k, hyperrealistic, 3 5 mm, focused, extreme details, unreal engine 5, masterpiece "}]}',
|
370 |
+
'',
|
371 |
+
5,
|
372 |
+
0.4,
|
373 |
+
0.3,
|
374 |
+
0.3,
|
375 |
+
5,
|
376 |
+
0.6,
|
377 |
+
None,
|
378 |
+
],
|
379 |
+
]
|
380 |
+
gr.Examples(examples=color_examples,
|
381 |
+
label='Font color examples',
|
382 |
+
inputs=[
|
383 |
+
text_input,
|
384 |
+
negative_prompt,
|
385 |
+
num_segments,
|
386 |
+
segment_threshold,
|
387 |
+
inject_interval,
|
388 |
+
inject_background,
|
389 |
+
seed,
|
390 |
+
color_guidance_weight,
|
391 |
+
rich_text_input,
|
392 |
+
],
|
393 |
+
outputs=[
|
394 |
+
plaintext_result,
|
395 |
+
richtext_result,
|
396 |
+
segments,
|
397 |
+
token_map,
|
398 |
+
],
|
399 |
+
fn=generate,
|
400 |
+
cache_examples=True,
|
401 |
+
examples_per_page=20)
|
402 |
+
|
403 |
+
with gr.Row():
|
404 |
+
style_examples = [
|
405 |
+
[
|
406 |
+
'{"ops":[{"insert":"a "},{"attributes":{"font":"mirza"},"insert":"beautiful garden"},{"insert":" with a "},{"attributes":{"font":"roboto"},"insert":"snow mountain in the background"},{"insert":""}]}',
|
407 |
+
'',
|
408 |
+
10,
|
409 |
+
0.4,
|
410 |
+
0,
|
411 |
+
0.2,
|
412 |
+
3,
|
413 |
+
0,
|
414 |
+
None,
|
415 |
+
],
|
416 |
+
[
|
417 |
+
'{"ops":[{"attributes":{"link":"the awe-inspiring sky and ocean in the style of J.M.W. Turner"},"insert":"the awe-inspiring sky and sea"},{"insert":" by "},{"attributes":{"font":"mirza"},"insert":"a coast with flowers and grasses in spring"}]}',
|
418 |
+
'worst quality, dark, poor quality',
|
419 |
+
5,
|
420 |
+
0.3,
|
421 |
+
0,
|
422 |
+
0,
|
423 |
+
9,
|
424 |
+
0.5,
|
425 |
+
None,
|
426 |
+
],
|
427 |
+
[
|
428 |
+
'{"ops":[{"insert":"a "},{"attributes":{"font":"slabo"},"insert":"night sky filled with stars"},{"insert":" above a "},{"attributes":{"font":"roboto"},"insert":"turbulent sea with giant waves"}]}',
|
429 |
+
'',
|
430 |
+
2,
|
431 |
+
0.35,
|
432 |
+
0,
|
433 |
+
0,
|
434 |
+
6,
|
435 |
+
0.5,
|
436 |
+
None,
|
437 |
+
],
|
438 |
+
]
|
439 |
+
gr.Examples(examples=style_examples,
|
440 |
+
label='Font style examples',
|
441 |
+
inputs=[
|
442 |
+
text_input,
|
443 |
+
negative_prompt,
|
444 |
+
num_segments,
|
445 |
+
segment_threshold,
|
446 |
+
inject_interval,
|
447 |
+
inject_background,
|
448 |
+
seed,
|
449 |
+
color_guidance_weight,
|
450 |
+
rich_text_input,
|
451 |
+
],
|
452 |
+
outputs=[
|
453 |
+
plaintext_result,
|
454 |
+
richtext_result,
|
455 |
+
segments,
|
456 |
+
token_map,
|
457 |
+
],
|
458 |
+
fn=generate,
|
459 |
+
cache_examples=True,
|
460 |
+
examples_per_page=20)
|
461 |
+
|
462 |
+
with gr.Row():
|
463 |
+
size_examples = [
|
464 |
+
[
|
465 |
+
'{"ops": [{"insert": "A pizza with "}, {"attributes": {"size": "60px"}, "insert": "pineapple"}, {"insert": ", pepperoni, and mushroom on the top, 4k, photorealistic"}]}',
|
466 |
+
'blurry, art, painting, rendering, drawing, sketch, ugly, duplicate, morbid, mutilated, mutated, deformed, disfigured low quality, worst quality',
|
467 |
+
5,
|
468 |
+
0.3,
|
469 |
+
0,
|
470 |
+
0,
|
471 |
+
13,
|
472 |
+
1,
|
473 |
+
None,
|
474 |
+
],
|
475 |
+
[
|
476 |
+
'{"ops": [{"insert": "A pizza with pineapple, "}, {"attributes": {"size": "20px"}, "insert": "pepperoni"}, {"insert": ", and mushroom on the top, 4k, photorealistic"}]}',
|
477 |
+
'blurry, art, painting, rendering, drawing, sketch, ugly, duplicate, morbid, mutilated, mutated, deformed, disfigured low quality, worst quality',
|
478 |
+
5,
|
479 |
+
0.3,
|
480 |
+
0,
|
481 |
+
0,
|
482 |
+
13,
|
483 |
+
1,
|
484 |
+
None,
|
485 |
+
],
|
486 |
+
[
|
487 |
+
'{"ops": [{"insert": "A pizza with pineapple, pepperoni, and "}, {"attributes": {"size": "70px"}, "insert": "mushroom"}, {"insert": " on the top, 4k, photorealistic"}]}',
|
488 |
+
'blurry, art, painting, rendering, drawing, sketch, ugly, duplicate, morbid, mutilated, mutated, deformed, disfigured low quality, worst quality',
|
489 |
+
5,
|
490 |
+
0.3,
|
491 |
+
0,
|
492 |
+
0,
|
493 |
+
13,
|
494 |
+
1,
|
495 |
+
None,
|
496 |
+
],
|
497 |
+
]
|
498 |
+
gr.Examples(examples=size_examples,
|
499 |
+
label='Font size examples',
|
500 |
+
inputs=[
|
501 |
+
text_input,
|
502 |
+
negative_prompt,
|
503 |
+
num_segments,
|
504 |
+
segment_threshold,
|
505 |
+
inject_interval,
|
506 |
+
inject_background,
|
507 |
+
seed,
|
508 |
+
color_guidance_weight,
|
509 |
+
rich_text_input,
|
510 |
+
],
|
511 |
+
outputs=[
|
512 |
+
plaintext_result,
|
513 |
+
richtext_result,
|
514 |
+
segments,
|
515 |
+
token_map,
|
516 |
+
],
|
517 |
+
fn=generate,
|
518 |
+
cache_examples=True,
|
519 |
+
examples_per_page=20)
|
520 |
+
generate_button.click(fn=lambda: gr.update(visible=False), inputs=None, outputs=share_row, queue=False).then(
|
521 |
+
fn=generate,
|
522 |
+
inputs=[
|
523 |
+
text_input,
|
524 |
+
negative_prompt,
|
525 |
+
num_segments,
|
526 |
+
segment_threshold,
|
527 |
+
inject_interval,
|
528 |
+
inject_background,
|
529 |
+
seed,
|
530 |
+
color_guidance_weight,
|
531 |
+
rich_text_input,
|
532 |
+
height,
|
533 |
+
width,
|
534 |
+
steps,
|
535 |
+
guidance_weight,
|
536 |
+
],
|
537 |
+
outputs=[plaintext_result, richtext_result, segments, token_map],
|
538 |
+
_js=get_js_data
|
539 |
+
).then(
|
540 |
+
fn=lambda: gr.update(visible=True), inputs=None, outputs=share_row, queue=False)
|
541 |
+
text_input.change(
|
542 |
+
fn=None, inputs=[text_input], outputs=None, _js=set_js_data, queue=False)
|
543 |
+
# load url param prompt to textinput
|
544 |
+
load_params_button.click(fn=lambda x: x['prompt'], inputs=[
|
545 |
+
url_params], outputs=[text_input], queue=False)
|
546 |
+
demo.load(
|
547 |
+
fn=load_url_params,
|
548 |
+
inputs=[url_params],
|
549 |
+
outputs=[load_params_button, url_params],
|
550 |
+
_js=get_window_url_params
|
551 |
+
)
|
552 |
+
demo.queue(concurrency_count=1)
|
553 |
+
demo.launch(share=False)
|
554 |
+
|
555 |
+
|
556 |
+
if __name__ == "__main__":
|
557 |
+
main()
|
models/attention.py
ADDED
@@ -0,0 +1,904 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
import math
|
15 |
+
import warnings
|
16 |
+
from dataclasses import dataclass
|
17 |
+
from typing import Optional
|
18 |
+
|
19 |
+
import torch
|
20 |
+
import torch.nn.functional as F
|
21 |
+
from torch import nn
|
22 |
+
|
23 |
+
from diffusers.configuration_utils import ConfigMixin, register_to_config
|
24 |
+
from diffusers.models.modeling_utils import ModelMixin
|
25 |
+
from diffusers.models.embeddings import ImagePositionalEmbeddings
|
26 |
+
from diffusers.utils import BaseOutput
|
27 |
+
from diffusers.utils.import_utils import is_xformers_available
|
28 |
+
|
29 |
+
|
30 |
+
@dataclass
|
31 |
+
class Transformer2DModelOutput(BaseOutput):
|
32 |
+
"""
|
33 |
+
Args:
|
34 |
+
sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
|
35 |
+
Hidden states conditioned on `encoder_hidden_states` input. If discrete, returns probability distributions
|
36 |
+
for the unnoised latent pixels.
|
37 |
+
"""
|
38 |
+
|
39 |
+
sample: torch.FloatTensor
|
40 |
+
|
41 |
+
|
42 |
+
if is_xformers_available():
|
43 |
+
import xformers
|
44 |
+
import xformers.ops
|
45 |
+
else:
|
46 |
+
xformers = None
|
47 |
+
|
48 |
+
|
49 |
+
class Transformer2DModel(ModelMixin, ConfigMixin):
|
50 |
+
"""
|
51 |
+
Transformer model for image-like data. Takes either discrete (classes of vector embeddings) or continuous (actual
|
52 |
+
embeddings) inputs.
|
53 |
+
|
54 |
+
When input is continuous: First, project the input (aka embedding) and reshape to b, t, d. Then apply standard
|
55 |
+
transformer action. Finally, reshape to image.
|
56 |
+
|
57 |
+
When input is discrete: First, input (classes of latent pixels) is converted to embeddings and has positional
|
58 |
+
embeddings applied, see `ImagePositionalEmbeddings`. Then apply standard transformer action. Finally, predict
|
59 |
+
classes of unnoised image.
|
60 |
+
|
61 |
+
Note that it is assumed one of the input classes is the masked latent pixel. The predicted classes of the unnoised
|
62 |
+
image do not contain a prediction for the masked pixel as the unnoised image cannot be masked.
|
63 |
+
|
64 |
+
Parameters:
|
65 |
+
num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
|
66 |
+
attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
|
67 |
+
in_channels (`int`, *optional*):
|
68 |
+
Pass if the input is continuous. The number of channels in the input and output.
|
69 |
+
num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
|
70 |
+
dropout (`float`, *optional*, defaults to 0.1): The dropout probability to use.
|
71 |
+
cross_attention_dim (`int`, *optional*): The number of context dimensions to use.
|
72 |
+
sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
|
73 |
+
Note that this is fixed at training time as it is used for learning a number of position embeddings. See
|
74 |
+
`ImagePositionalEmbeddings`.
|
75 |
+
num_vector_embeds (`int`, *optional*):
|
76 |
+
Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
|
77 |
+
Includes the class for the masked latent pixel.
|
78 |
+
activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
|
79 |
+
num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`.
|
80 |
+
The number of diffusion steps used during training. Note that this is fixed at training time as it is used
|
81 |
+
to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
|
82 |
+
up to but not more than steps than `num_embeds_ada_norm`.
|
83 |
+
attention_bias (`bool`, *optional*):
|
84 |
+
Configure if the TransformerBlocks' attention should contain a bias parameter.
|
85 |
+
"""
|
86 |
+
|
87 |
+
@register_to_config
|
88 |
+
def __init__(
|
89 |
+
self,
|
90 |
+
num_attention_heads: int = 16,
|
91 |
+
attention_head_dim: int = 88,
|
92 |
+
in_channels: Optional[int] = None,
|
93 |
+
num_layers: int = 1,
|
94 |
+
dropout: float = 0.0,
|
95 |
+
norm_num_groups: int = 32,
|
96 |
+
cross_attention_dim: Optional[int] = None,
|
97 |
+
attention_bias: bool = False,
|
98 |
+
sample_size: Optional[int] = None,
|
99 |
+
num_vector_embeds: Optional[int] = None,
|
100 |
+
activation_fn: str = "geglu",
|
101 |
+
num_embeds_ada_norm: Optional[int] = None,
|
102 |
+
use_linear_projection: bool = False,
|
103 |
+
only_cross_attention: bool = False,
|
104 |
+
):
|
105 |
+
super().__init__()
|
106 |
+
self.use_linear_projection = use_linear_projection
|
107 |
+
self.num_attention_heads = num_attention_heads
|
108 |
+
self.attention_head_dim = attention_head_dim
|
109 |
+
inner_dim = num_attention_heads * attention_head_dim
|
110 |
+
|
111 |
+
# 1. Transformer2DModel can process both standard continous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
|
112 |
+
# Define whether input is continuous or discrete depending on configuration
|
113 |
+
self.is_input_continuous = in_channels is not None
|
114 |
+
self.is_input_vectorized = num_vector_embeds is not None
|
115 |
+
|
116 |
+
if self.is_input_continuous and self.is_input_vectorized:
|
117 |
+
raise ValueError(
|
118 |
+
f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
|
119 |
+
" sure that either `in_channels` or `num_vector_embeds` is None."
|
120 |
+
)
|
121 |
+
elif not self.is_input_continuous and not self.is_input_vectorized:
|
122 |
+
raise ValueError(
|
123 |
+
f"Has to define either `in_channels`: {in_channels} or `num_vector_embeds`: {num_vector_embeds}. Make"
|
124 |
+
" sure that either `in_channels` or `num_vector_embeds` is not None."
|
125 |
+
)
|
126 |
+
|
127 |
+
# 2. Define input layers
|
128 |
+
if self.is_input_continuous:
|
129 |
+
self.in_channels = in_channels
|
130 |
+
|
131 |
+
self.norm = torch.nn.GroupNorm(
|
132 |
+
num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
|
133 |
+
if use_linear_projection:
|
134 |
+
self.proj_in = nn.Linear(in_channels, inner_dim)
|
135 |
+
else:
|
136 |
+
self.proj_in = nn.Conv2d(
|
137 |
+
in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
|
138 |
+
elif self.is_input_vectorized:
|
139 |
+
assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
|
140 |
+
assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed"
|
141 |
+
|
142 |
+
self.height = sample_size
|
143 |
+
self.width = sample_size
|
144 |
+
self.num_vector_embeds = num_vector_embeds
|
145 |
+
self.num_latent_pixels = self.height * self.width
|
146 |
+
|
147 |
+
self.latent_image_embedding = ImagePositionalEmbeddings(
|
148 |
+
num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width
|
149 |
+
)
|
150 |
+
|
151 |
+
# 3. Define transformers blocks
|
152 |
+
self.transformer_blocks = nn.ModuleList(
|
153 |
+
[
|
154 |
+
BasicTransformerBlock(
|
155 |
+
inner_dim,
|
156 |
+
num_attention_heads,
|
157 |
+
attention_head_dim,
|
158 |
+
dropout=dropout,
|
159 |
+
cross_attention_dim=cross_attention_dim,
|
160 |
+
activation_fn=activation_fn,
|
161 |
+
num_embeds_ada_norm=num_embeds_ada_norm,
|
162 |
+
attention_bias=attention_bias,
|
163 |
+
only_cross_attention=only_cross_attention,
|
164 |
+
)
|
165 |
+
for d in range(num_layers)
|
166 |
+
]
|
167 |
+
)
|
168 |
+
|
169 |
+
# 4. Define output layers
|
170 |
+
if self.is_input_continuous:
|
171 |
+
if use_linear_projection:
|
172 |
+
self.proj_out = nn.Linear(in_channels, inner_dim)
|
173 |
+
else:
|
174 |
+
self.proj_out = nn.Conv2d(
|
175 |
+
inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
|
176 |
+
elif self.is_input_vectorized:
|
177 |
+
self.norm_out = nn.LayerNorm(inner_dim)
|
178 |
+
self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
|
179 |
+
|
180 |
+
def _set_attention_slice(self, slice_size):
|
181 |
+
for block in self.transformer_blocks:
|
182 |
+
block._set_attention_slice(slice_size)
|
183 |
+
|
184 |
+
def forward(self, hidden_states, encoder_hidden_states=None, timestep=None,
|
185 |
+
text_format_dict={}, return_dict: bool = True):
|
186 |
+
"""
|
187 |
+
Args:
|
188 |
+
hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
|
189 |
+
When continous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input
|
190 |
+
hidden_states
|
191 |
+
encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, context dim)`, *optional*):
|
192 |
+
Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
|
193 |
+
self-attention.
|
194 |
+
timestep ( `torch.long`, *optional*):
|
195 |
+
Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
|
196 |
+
return_dict (`bool`, *optional*, defaults to `True`):
|
197 |
+
Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
|
198 |
+
|
199 |
+
Returns:
|
200 |
+
[`~models.attention.Transformer2DModelOutput`] or `tuple`: [`~models.attention.Transformer2DModelOutput`]
|
201 |
+
if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample
|
202 |
+
tensor.
|
203 |
+
"""
|
204 |
+
# 1. Input
|
205 |
+
if self.is_input_continuous:
|
206 |
+
batch, channel, height, weight = hidden_states.shape
|
207 |
+
residual = hidden_states
|
208 |
+
|
209 |
+
hidden_states = self.norm(hidden_states)
|
210 |
+
if not self.use_linear_projection:
|
211 |
+
hidden_states = self.proj_in(hidden_states)
|
212 |
+
inner_dim = hidden_states.shape[1]
|
213 |
+
hidden_states = hidden_states.permute(
|
214 |
+
0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
|
215 |
+
else:
|
216 |
+
inner_dim = hidden_states.shape[1]
|
217 |
+
hidden_states = hidden_states.permute(
|
218 |
+
0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
|
219 |
+
hidden_states = self.proj_in(hidden_states)
|
220 |
+
elif self.is_input_vectorized:
|
221 |
+
hidden_states = self.latent_image_embedding(hidden_states)
|
222 |
+
|
223 |
+
# 2. Blocks
|
224 |
+
for block in self.transformer_blocks:
|
225 |
+
hidden_states = block(hidden_states, context=encoder_hidden_states, timestep=timestep,
|
226 |
+
text_format_dict=text_format_dict)
|
227 |
+
|
228 |
+
# 3. Output
|
229 |
+
if self.is_input_continuous:
|
230 |
+
if not self.use_linear_projection:
|
231 |
+
hidden_states = (
|
232 |
+
hidden_states.reshape(batch, height, weight, inner_dim).permute(
|
233 |
+
0, 3, 1, 2).contiguous()
|
234 |
+
)
|
235 |
+
hidden_states = self.proj_out(hidden_states)
|
236 |
+
else:
|
237 |
+
hidden_states = self.proj_out(hidden_states)
|
238 |
+
hidden_states = (
|
239 |
+
hidden_states.reshape(batch, height, weight, inner_dim).permute(
|
240 |
+
0, 3, 1, 2).contiguous()
|
241 |
+
)
|
242 |
+
|
243 |
+
output = hidden_states + residual
|
244 |
+
elif self.is_input_vectorized:
|
245 |
+
hidden_states = self.norm_out(hidden_states)
|
246 |
+
logits = self.out(hidden_states)
|
247 |
+
# (batch, self.num_vector_embeds - 1, self.num_latent_pixels)
|
248 |
+
logits = logits.permute(0, 2, 1)
|
249 |
+
|
250 |
+
# log(p(x_0))
|
251 |
+
output = F.log_softmax(logits.double(), dim=1).float()
|
252 |
+
|
253 |
+
if not return_dict:
|
254 |
+
return (output,)
|
255 |
+
|
256 |
+
return Transformer2DModelOutput(sample=output)
|
257 |
+
|
258 |
+
def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
|
259 |
+
for block in self.transformer_blocks:
|
260 |
+
block._set_use_memory_efficient_attention_xformers(
|
261 |
+
use_memory_efficient_attention_xformers)
|
262 |
+
|
263 |
+
|
264 |
+
class AttentionBlock(nn.Module):
|
265 |
+
"""
|
266 |
+
An attention block that allows spatial positions to attend to each other. Originally ported from here, but adapted
|
267 |
+
to the N-d case.
|
268 |
+
https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
|
269 |
+
Uses three q, k, v linear layers to compute attention.
|
270 |
+
|
271 |
+
Parameters:
|
272 |
+
channels (`int`): The number of channels in the input and output.
|
273 |
+
num_head_channels (`int`, *optional*):
|
274 |
+
The number of channels in each head. If None, then `num_heads` = 1.
|
275 |
+
norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for group norm.
|
276 |
+
rescale_output_factor (`float`, *optional*, defaults to 1.0): The factor to rescale the output by.
|
277 |
+
eps (`float`, *optional*, defaults to 1e-5): The epsilon value to use for group norm.
|
278 |
+
"""
|
279 |
+
|
280 |
+
def __init__(
|
281 |
+
self,
|
282 |
+
channels: int,
|
283 |
+
num_head_channels: Optional[int] = None,
|
284 |
+
norm_num_groups: int = 32,
|
285 |
+
rescale_output_factor: float = 1.0,
|
286 |
+
eps: float = 1e-5,
|
287 |
+
):
|
288 |
+
super().__init__()
|
289 |
+
self.channels = channels
|
290 |
+
|
291 |
+
self.num_heads = channels // num_head_channels if num_head_channels is not None else 1
|
292 |
+
self.num_head_size = num_head_channels
|
293 |
+
self.group_norm = nn.GroupNorm(
|
294 |
+
num_channels=channels, num_groups=norm_num_groups, eps=eps, affine=True)
|
295 |
+
|
296 |
+
# define q,k,v as linear layers
|
297 |
+
self.query = nn.Linear(channels, channels)
|
298 |
+
self.key = nn.Linear(channels, channels)
|
299 |
+
self.value = nn.Linear(channels, channels)
|
300 |
+
|
301 |
+
self.rescale_output_factor = rescale_output_factor
|
302 |
+
self.proj_attn = nn.Linear(channels, channels, 1)
|
303 |
+
|
304 |
+
def transpose_for_scores(self, projection: torch.Tensor) -> torch.Tensor:
|
305 |
+
new_projection_shape = projection.size()[:-1] + (self.num_heads, -1)
|
306 |
+
# move heads to 2nd position (B, T, H * D) -> (B, T, H, D) -> (B, H, T, D)
|
307 |
+
new_projection = projection.view(
|
308 |
+
new_projection_shape).permute(0, 2, 1, 3)
|
309 |
+
return new_projection
|
310 |
+
|
311 |
+
def forward(self, hidden_states):
|
312 |
+
residual = hidden_states
|
313 |
+
batch, channel, height, width = hidden_states.shape
|
314 |
+
|
315 |
+
# norm
|
316 |
+
hidden_states = self.group_norm(hidden_states)
|
317 |
+
|
318 |
+
hidden_states = hidden_states.view(
|
319 |
+
batch, channel, height * width).transpose(1, 2)
|
320 |
+
|
321 |
+
# proj to q, k, v
|
322 |
+
query_proj = self.query(hidden_states)
|
323 |
+
key_proj = self.key(hidden_states)
|
324 |
+
value_proj = self.value(hidden_states)
|
325 |
+
|
326 |
+
scale = 1 / math.sqrt(self.channels / self.num_heads)
|
327 |
+
|
328 |
+
# get scores
|
329 |
+
if self.num_heads > 1:
|
330 |
+
query_states = self.transpose_for_scores(query_proj)
|
331 |
+
key_states = self.transpose_for_scores(key_proj)
|
332 |
+
value_states = self.transpose_for_scores(value_proj)
|
333 |
+
|
334 |
+
# TODO: is there a way to perform batched matmul (e.g. baddbmm) on 4D tensors?
|
335 |
+
# or reformulate this into a 3D problem?
|
336 |
+
# TODO: measure whether on MPS device it would be faster to do this matmul via einsum
|
337 |
+
# as some matmuls can be 1.94x slower than an equivalent einsum on MPS
|
338 |
+
# https://gist.github.com/Birch-san/cba16789ec27bb20996a4b4831b13ce0
|
339 |
+
attention_scores = torch.matmul(
|
340 |
+
query_states, key_states.transpose(-1, -2)) * scale
|
341 |
+
else:
|
342 |
+
query_states, key_states, value_states = query_proj, key_proj, value_proj
|
343 |
+
|
344 |
+
attention_scores = torch.baddbmm(
|
345 |
+
torch.empty(
|
346 |
+
query_states.shape[0],
|
347 |
+
query_states.shape[1],
|
348 |
+
key_states.shape[1],
|
349 |
+
dtype=query_states.dtype,
|
350 |
+
device=query_states.device,
|
351 |
+
),
|
352 |
+
query_states,
|
353 |
+
key_states.transpose(-1, -2),
|
354 |
+
beta=0,
|
355 |
+
alpha=scale,
|
356 |
+
)
|
357 |
+
|
358 |
+
attention_probs = torch.softmax(
|
359 |
+
attention_scores.float(), dim=-1).type(attention_scores.dtype)
|
360 |
+
|
361 |
+
# compute attention output
|
362 |
+
if self.num_heads > 1:
|
363 |
+
# TODO: is there a way to perform batched matmul (e.g. bmm) on 4D tensors?
|
364 |
+
# or reformulate this into a 3D problem?
|
365 |
+
# TODO: measure whether on MPS device it would be faster to do this matmul via einsum
|
366 |
+
# as some matmuls can be 1.94x slower than an equivalent einsum on MPS
|
367 |
+
# https://gist.github.com/Birch-san/cba16789ec27bb20996a4b4831b13ce0
|
368 |
+
hidden_states = torch.matmul(attention_probs, value_states)
|
369 |
+
hidden_states = hidden_states.permute(0, 2, 1, 3).contiguous()
|
370 |
+
new_hidden_states_shape = hidden_states.size()[
|
371 |
+
:-2] + (self.channels,)
|
372 |
+
hidden_states = hidden_states.view(new_hidden_states_shape)
|
373 |
+
else:
|
374 |
+
hidden_states = torch.bmm(attention_probs, value_states)
|
375 |
+
|
376 |
+
# compute next hidden_states
|
377 |
+
hidden_states = self.proj_attn(hidden_states)
|
378 |
+
hidden_states = hidden_states.transpose(
|
379 |
+
-1, -2).reshape(batch, channel, height, width)
|
380 |
+
|
381 |
+
# res connect and rescale
|
382 |
+
hidden_states = (hidden_states + residual) / self.rescale_output_factor
|
383 |
+
return hidden_states
|
384 |
+
|
385 |
+
|
386 |
+
class BasicTransformerBlock(nn.Module):
|
387 |
+
r"""
|
388 |
+
A basic Transformer block.
|
389 |
+
|
390 |
+
Parameters:
|
391 |
+
dim (`int`): The number of channels in the input and output.
|
392 |
+
num_attention_heads (`int`): The number of heads to use for multi-head attention.
|
393 |
+
attention_head_dim (`int`): The number of channels in each head.
|
394 |
+
dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
|
395 |
+
cross_attention_dim (`int`, *optional*): The size of the context vector for cross attention.
|
396 |
+
activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
|
397 |
+
num_embeds_ada_norm (:
|
398 |
+
obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
|
399 |
+
attention_bias (:
|
400 |
+
obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
|
401 |
+
"""
|
402 |
+
|
403 |
+
def __init__(
|
404 |
+
self,
|
405 |
+
dim: int,
|
406 |
+
num_attention_heads: int,
|
407 |
+
attention_head_dim: int,
|
408 |
+
dropout=0.0,
|
409 |
+
cross_attention_dim: Optional[int] = None,
|
410 |
+
activation_fn: str = "geglu",
|
411 |
+
num_embeds_ada_norm: Optional[int] = None,
|
412 |
+
attention_bias: bool = False,
|
413 |
+
only_cross_attention: bool = False,
|
414 |
+
):
|
415 |
+
super().__init__()
|
416 |
+
self.only_cross_attention = only_cross_attention
|
417 |
+
self.attn1 = CrossAttention(
|
418 |
+
query_dim=dim,
|
419 |
+
heads=num_attention_heads,
|
420 |
+
dim_head=attention_head_dim,
|
421 |
+
dropout=dropout,
|
422 |
+
bias=attention_bias,
|
423 |
+
cross_attention_dim=cross_attention_dim if only_cross_attention else None,
|
424 |
+
) # is a self-attention
|
425 |
+
self.ff = FeedForward(dim, dropout=dropout,
|
426 |
+
activation_fn=activation_fn)
|
427 |
+
self.attn2 = CrossAttention(
|
428 |
+
query_dim=dim,
|
429 |
+
cross_attention_dim=cross_attention_dim,
|
430 |
+
heads=num_attention_heads,
|
431 |
+
dim_head=attention_head_dim,
|
432 |
+
dropout=dropout,
|
433 |
+
bias=attention_bias,
|
434 |
+
) # is self-attn if context is none
|
435 |
+
|
436 |
+
# layer norms
|
437 |
+
self.use_ada_layer_norm = num_embeds_ada_norm is not None
|
438 |
+
if self.use_ada_layer_norm:
|
439 |
+
self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
|
440 |
+
self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm)
|
441 |
+
else:
|
442 |
+
self.norm1 = nn.LayerNorm(dim)
|
443 |
+
self.norm2 = nn.LayerNorm(dim)
|
444 |
+
self.norm3 = nn.LayerNorm(dim)
|
445 |
+
|
446 |
+
# if xformers is installed try to use memory_efficient_attention by default
|
447 |
+
if is_xformers_available():
|
448 |
+
try:
|
449 |
+
self._set_use_memory_efficient_attention_xformers(True)
|
450 |
+
except Exception as e:
|
451 |
+
warnings.warn(
|
452 |
+
"Could not enable memory efficient attention. Make sure xformers is installed"
|
453 |
+
f" correctly and a GPU is available: {e}"
|
454 |
+
)
|
455 |
+
|
456 |
+
def _set_attention_slice(self, slice_size):
|
457 |
+
self.attn1._slice_size = slice_size
|
458 |
+
self.attn2._slice_size = slice_size
|
459 |
+
|
460 |
+
def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
|
461 |
+
if not is_xformers_available():
|
462 |
+
print("Here is how to install it")
|
463 |
+
raise ModuleNotFoundError(
|
464 |
+
"Refer to https://github.com/facebookresearch/xformers for more information on how to install"
|
465 |
+
" xformers",
|
466 |
+
name="xformers",
|
467 |
+
)
|
468 |
+
elif not torch.cuda.is_available():
|
469 |
+
raise ValueError(
|
470 |
+
"torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is only"
|
471 |
+
" available for GPU "
|
472 |
+
)
|
473 |
+
else:
|
474 |
+
try:
|
475 |
+
# Make sure we can run the memory efficient attention
|
476 |
+
_ = xformers.ops.memory_efficient_attention(
|
477 |
+
torch.randn((1, 2, 40), device="cuda"),
|
478 |
+
torch.randn((1, 2, 40), device="cuda"),
|
479 |
+
torch.randn((1, 2, 40), device="cuda"),
|
480 |
+
)
|
481 |
+
except Exception as e:
|
482 |
+
raise e
|
483 |
+
self.attn1._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
|
484 |
+
self.attn2._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
|
485 |
+
|
486 |
+
def forward(self, hidden_states, context=None, timestep=None, text_format_dict={}):
|
487 |
+
# 1. Self-Attention
|
488 |
+
norm_hidden_states = (
|
489 |
+
self.norm1(hidden_states, timestep) if self.use_ada_layer_norm else self.norm1(
|
490 |
+
hidden_states)
|
491 |
+
)
|
492 |
+
|
493 |
+
if self.only_cross_attention:
|
494 |
+
attn_out, _ = self.attn1(
|
495 |
+
norm_hidden_states, context=context, text_format_dict=text_format_dict) + hidden_states
|
496 |
+
hidden_states = attn_out + hidden_states
|
497 |
+
else:
|
498 |
+
attn_out, _ = self.attn1(norm_hidden_states)
|
499 |
+
hidden_states = attn_out + hidden_states
|
500 |
+
|
501 |
+
# 2. Cross-Attention
|
502 |
+
norm_hidden_states = (
|
503 |
+
self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(
|
504 |
+
hidden_states)
|
505 |
+
)
|
506 |
+
attn_out, _ = self.attn2(
|
507 |
+
norm_hidden_states, context=context, text_format_dict=text_format_dict)
|
508 |
+
hidden_states = attn_out + hidden_states
|
509 |
+
|
510 |
+
# 3. Feed-forward
|
511 |
+
hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states
|
512 |
+
|
513 |
+
return hidden_states
|
514 |
+
|
515 |
+
|
516 |
+
class CrossAttention(nn.Module):
|
517 |
+
r"""
|
518 |
+
A cross attention layer.
|
519 |
+
|
520 |
+
Parameters:
|
521 |
+
query_dim (`int`): The number of channels in the query.
|
522 |
+
cross_attention_dim (`int`, *optional*):
|
523 |
+
The number of channels in the context. If not given, defaults to `query_dim`.
|
524 |
+
heads (`int`, *optional*, defaults to 8): The number of heads to use for multi-head attention.
|
525 |
+
dim_head (`int`, *optional*, defaults to 64): The number of channels in each head.
|
526 |
+
dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
|
527 |
+
bias (`bool`, *optional*, defaults to False):
|
528 |
+
Set to `True` for the query, key, and value linear layers to contain a bias parameter.
|
529 |
+
"""
|
530 |
+
|
531 |
+
def __init__(
|
532 |
+
self,
|
533 |
+
query_dim: int,
|
534 |
+
cross_attention_dim: Optional[int] = None,
|
535 |
+
heads: int = 8,
|
536 |
+
dim_head: int = 64,
|
537 |
+
dropout: float = 0.0,
|
538 |
+
bias=False,
|
539 |
+
):
|
540 |
+
super().__init__()
|
541 |
+
inner_dim = dim_head * heads
|
542 |
+
self.is_cross_attn = cross_attention_dim is not None
|
543 |
+
cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
|
544 |
+
|
545 |
+
self.scale = dim_head**-0.5
|
546 |
+
self.heads = heads
|
547 |
+
# for slice_size > 0 the attention score computation
|
548 |
+
# is split across the batch axis to save memory
|
549 |
+
# You can set slice_size with `set_attention_slice`
|
550 |
+
self._slice_size = None
|
551 |
+
self._use_memory_efficient_attention_xformers = False
|
552 |
+
|
553 |
+
self.to_q = nn.Linear(query_dim, inner_dim, bias=bias)
|
554 |
+
self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
|
555 |
+
self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
|
556 |
+
|
557 |
+
self.to_out = nn.ModuleList([])
|
558 |
+
self.to_out.append(nn.Linear(inner_dim, query_dim))
|
559 |
+
self.to_out.append(nn.Dropout(dropout))
|
560 |
+
|
561 |
+
def reshape_heads_to_batch_dim(self, tensor):
|
562 |
+
batch_size, seq_len, dim = tensor.shape
|
563 |
+
head_size = self.heads
|
564 |
+
tensor = tensor.reshape(batch_size, seq_len,
|
565 |
+
head_size, dim // head_size)
|
566 |
+
tensor = tensor.permute(0, 2, 1, 3).reshape(
|
567 |
+
batch_size * head_size, seq_len, dim // head_size)
|
568 |
+
return tensor
|
569 |
+
|
570 |
+
def reshape_batch_dim_to_heads(self, tensor):
|
571 |
+
batch_size, seq_len, dim = tensor.shape
|
572 |
+
head_size = self.heads
|
573 |
+
tensor = tensor.reshape(batch_size // head_size,
|
574 |
+
head_size, seq_len, dim)
|
575 |
+
tensor = tensor.permute(0, 2, 1, 3).reshape(
|
576 |
+
batch_size // head_size, seq_len, dim * head_size)
|
577 |
+
return tensor
|
578 |
+
|
579 |
+
def reshape_batch_dim_to_heads_and_average(self, tensor):
|
580 |
+
batch_size, seq_len, seq_len2 = tensor.shape
|
581 |
+
head_size = self.heads
|
582 |
+
tensor = tensor.reshape(batch_size // head_size,
|
583 |
+
head_size, seq_len, seq_len2)
|
584 |
+
return tensor.mean(1)
|
585 |
+
|
586 |
+
def forward(self, hidden_states, real_attn_probs=None, context=None, mask=None, text_format_dict={}):
|
587 |
+
batch_size, sequence_length, _ = hidden_states.shape
|
588 |
+
|
589 |
+
query = self.to_q(hidden_states)
|
590 |
+
context = context if context is not None else hidden_states
|
591 |
+
key = self.to_k(context)
|
592 |
+
value = self.to_v(context)
|
593 |
+
|
594 |
+
dim = query.shape[-1]
|
595 |
+
|
596 |
+
query = self.reshape_heads_to_batch_dim(query)
|
597 |
+
key = self.reshape_heads_to_batch_dim(key)
|
598 |
+
value = self.reshape_heads_to_batch_dim(value)
|
599 |
+
|
600 |
+
# attention, what we cannot get enough of
|
601 |
+
if self._use_memory_efficient_attention_xformers:
|
602 |
+
hidden_states = self._memory_efficient_attention_xformers(
|
603 |
+
query, key, value)
|
604 |
+
# Some versions of xformers return output in fp32, cast it back to the dtype of the input
|
605 |
+
hidden_states = hidden_states.to(query.dtype)
|
606 |
+
else:
|
607 |
+
if self._slice_size is None or query.shape[0] // self._slice_size == 1:
|
608 |
+
# only this attention function is used
|
609 |
+
hidden_states, attn_probs = self._attention(
|
610 |
+
query, key, value, real_attn_probs, **text_format_dict)
|
611 |
+
|
612 |
+
# linear proj
|
613 |
+
hidden_states = self.to_out[0](hidden_states)
|
614 |
+
# dropout
|
615 |
+
hidden_states = self.to_out[1](hidden_states)
|
616 |
+
return hidden_states, attn_probs
|
617 |
+
|
618 |
+
def _qk(self, query, key):
|
619 |
+
return torch.baddbmm(
|
620 |
+
torch.empty(query.shape[0], query.shape[1], key.shape[1],
|
621 |
+
dtype=query.dtype, device=query.device),
|
622 |
+
query,
|
623 |
+
key.transpose(-1, -2),
|
624 |
+
beta=0,
|
625 |
+
alpha=self.scale,
|
626 |
+
)
|
627 |
+
|
628 |
+
def _attention(self, query, key, value, real_attn_probs=None, word_pos=None, font_size=None,
|
629 |
+
**kwargs):
|
630 |
+
attention_scores = self._qk(query, key)
|
631 |
+
|
632 |
+
# Font size V2:
|
633 |
+
if self.is_cross_attn and word_pos is not None and font_size is not None:
|
634 |
+
assert key.shape[1] == 77
|
635 |
+
attention_score_exp = attention_scores.exp()
|
636 |
+
font_size_abs, font_size_sign = font_size.abs(), font_size.sign()
|
637 |
+
attention_score_exp[:, :, word_pos] = attention_score_exp[:, :, word_pos].clone(
|
638 |
+
)*font_size_abs
|
639 |
+
attention_probs = attention_score_exp / \
|
640 |
+
attention_score_exp.sum(-1, True)
|
641 |
+
attention_probs[:, :, word_pos] *= font_size_sign
|
642 |
+
else:
|
643 |
+
attention_probs = attention_scores.softmax(dim=-1)
|
644 |
+
|
645 |
+
# compute attention output
|
646 |
+
if real_attn_probs is None:
|
647 |
+
hidden_states = torch.bmm(attention_probs, value)
|
648 |
+
else:
|
649 |
+
if isinstance(real_attn_probs, dict):
|
650 |
+
for pos1, pos2 in zip(real_attn_probs['inject_pos'][0], real_attn_probs['inject_pos'][1]):
|
651 |
+
attention_probs[:, :,
|
652 |
+
pos2] = real_attn_probs['reference'][:, :, pos1]
|
653 |
+
hidden_states = torch.bmm(attention_probs, value)
|
654 |
+
else:
|
655 |
+
hidden_states = torch.bmm(real_attn_probs, value)
|
656 |
+
|
657 |
+
# reshape hidden_states
|
658 |
+
hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
|
659 |
+
|
660 |
+
# we also return the map averaged over heads to save memory footprint
|
661 |
+
attention_probs_avg = self.reshape_batch_dim_to_heads_and_average(
|
662 |
+
attention_probs)
|
663 |
+
return hidden_states, [attention_probs_avg, attention_probs]
|
664 |
+
|
665 |
+
def _memory_efficient_attention_xformers(self, query, key, value):
|
666 |
+
query = query.contiguous()
|
667 |
+
key = key.contiguous()
|
668 |
+
value = value.contiguous()
|
669 |
+
hidden_states = xformers.ops.memory_efficient_attention(
|
670 |
+
query, key, value, attn_bias=None)
|
671 |
+
hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
|
672 |
+
return hidden_states
|
673 |
+
|
674 |
+
|
675 |
+
class FeedForward(nn.Module):
|
676 |
+
r"""
|
677 |
+
A feed-forward layer.
|
678 |
+
|
679 |
+
Parameters:
|
680 |
+
dim (`int`): The number of channels in the input.
|
681 |
+
dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
|
682 |
+
mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
|
683 |
+
dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
|
684 |
+
activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
|
685 |
+
"""
|
686 |
+
|
687 |
+
def __init__(
|
688 |
+
self,
|
689 |
+
dim: int,
|
690 |
+
dim_out: Optional[int] = None,
|
691 |
+
mult: int = 4,
|
692 |
+
dropout: float = 0.0,
|
693 |
+
activation_fn: str = "geglu",
|
694 |
+
):
|
695 |
+
super().__init__()
|
696 |
+
inner_dim = int(dim * mult)
|
697 |
+
dim_out = dim_out if dim_out is not None else dim
|
698 |
+
|
699 |
+
if activation_fn == "geglu":
|
700 |
+
geglu = GEGLU(dim, inner_dim)
|
701 |
+
elif activation_fn == "geglu-approximate":
|
702 |
+
geglu = ApproximateGELU(dim, inner_dim)
|
703 |
+
|
704 |
+
self.net = nn.ModuleList([])
|
705 |
+
# project in
|
706 |
+
self.net.append(geglu)
|
707 |
+
# project dropout
|
708 |
+
self.net.append(nn.Dropout(dropout))
|
709 |
+
# project out
|
710 |
+
self.net.append(nn.Linear(inner_dim, dim_out))
|
711 |
+
|
712 |
+
def forward(self, hidden_states):
|
713 |
+
for module in self.net:
|
714 |
+
hidden_states = module(hidden_states)
|
715 |
+
return hidden_states
|
716 |
+
|
717 |
+
|
718 |
+
# feedforward
|
719 |
+
class GEGLU(nn.Module):
|
720 |
+
r"""
|
721 |
+
A variant of the gated linear unit activation function from https://arxiv.org/abs/2002.05202.
|
722 |
+
|
723 |
+
Parameters:
|
724 |
+
dim_in (`int`): The number of channels in the input.
|
725 |
+
dim_out (`int`): The number of channels in the output.
|
726 |
+
"""
|
727 |
+
|
728 |
+
def __init__(self, dim_in: int, dim_out: int):
|
729 |
+
super().__init__()
|
730 |
+
self.proj = nn.Linear(dim_in, dim_out * 2)
|
731 |
+
|
732 |
+
def gelu(self, gate):
|
733 |
+
if gate.device.type != "mps":
|
734 |
+
return F.gelu(gate)
|
735 |
+
# mps: gelu is not implemented for float16
|
736 |
+
return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
|
737 |
+
|
738 |
+
def forward(self, hidden_states):
|
739 |
+
hidden_states, gate = self.proj(hidden_states).chunk(2, dim=-1)
|
740 |
+
return hidden_states * self.gelu(gate)
|
741 |
+
|
742 |
+
|
743 |
+
class ApproximateGELU(nn.Module):
|
744 |
+
"""
|
745 |
+
The approximate form of Gaussian Error Linear Unit (GELU)
|
746 |
+
|
747 |
+
For more details, see section 2: https://arxiv.org/abs/1606.08415
|
748 |
+
"""
|
749 |
+
|
750 |
+
def __init__(self, dim_in: int, dim_out: int):
|
751 |
+
super().__init__()
|
752 |
+
self.proj = nn.Linear(dim_in, dim_out)
|
753 |
+
|
754 |
+
def forward(self, x):
|
755 |
+
x = self.proj(x)
|
756 |
+
return x * torch.sigmoid(1.702 * x)
|
757 |
+
|
758 |
+
|
759 |
+
class AdaLayerNorm(nn.Module):
|
760 |
+
"""
|
761 |
+
Norm layer modified to incorporate timestep embeddings.
|
762 |
+
"""
|
763 |
+
|
764 |
+
def __init__(self, embedding_dim, num_embeddings):
|
765 |
+
super().__init__()
|
766 |
+
self.emb = nn.Embedding(num_embeddings, embedding_dim)
|
767 |
+
self.silu = nn.SiLU()
|
768 |
+
self.linear = nn.Linear(embedding_dim, embedding_dim * 2)
|
769 |
+
self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False)
|
770 |
+
|
771 |
+
def forward(self, x, timestep):
|
772 |
+
emb = self.linear(self.silu(self.emb(timestep)))
|
773 |
+
scale, shift = torch.chunk(emb, 2)
|
774 |
+
x = self.norm(x) * (1 + scale) + shift
|
775 |
+
return x
|
776 |
+
|
777 |
+
|
778 |
+
class DualTransformer2DModel(nn.Module):
|
779 |
+
"""
|
780 |
+
Dual transformer wrapper that combines two `Transformer2DModel`s for mixed inference.
|
781 |
+
|
782 |
+
Parameters:
|
783 |
+
num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
|
784 |
+
attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
|
785 |
+
in_channels (`int`, *optional*):
|
786 |
+
Pass if the input is continuous. The number of channels in the input and output.
|
787 |
+
num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
|
788 |
+
dropout (`float`, *optional*, defaults to 0.1): The dropout probability to use.
|
789 |
+
cross_attention_dim (`int`, *optional*): The number of context dimensions to use.
|
790 |
+
sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
|
791 |
+
Note that this is fixed at training time as it is used for learning a number of position embeddings. See
|
792 |
+
`ImagePositionalEmbeddings`.
|
793 |
+
num_vector_embeds (`int`, *optional*):
|
794 |
+
Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
|
795 |
+
Includes the class for the masked latent pixel.
|
796 |
+
activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
|
797 |
+
num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`.
|
798 |
+
The number of diffusion steps used during training. Note that this is fixed at training time as it is used
|
799 |
+
to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
|
800 |
+
up to but not more than steps than `num_embeds_ada_norm`.
|
801 |
+
attention_bias (`bool`, *optional*):
|
802 |
+
Configure if the TransformerBlocks' attention should contain a bias parameter.
|
803 |
+
"""
|
804 |
+
|
805 |
+
def __init__(
|
806 |
+
self,
|
807 |
+
num_attention_heads: int = 16,
|
808 |
+
attention_head_dim: int = 88,
|
809 |
+
in_channels: Optional[int] = None,
|
810 |
+
num_layers: int = 1,
|
811 |
+
dropout: float = 0.0,
|
812 |
+
norm_num_groups: int = 32,
|
813 |
+
cross_attention_dim: Optional[int] = None,
|
814 |
+
attention_bias: bool = False,
|
815 |
+
sample_size: Optional[int] = None,
|
816 |
+
num_vector_embeds: Optional[int] = None,
|
817 |
+
activation_fn: str = "geglu",
|
818 |
+
num_embeds_ada_norm: Optional[int] = None,
|
819 |
+
):
|
820 |
+
super().__init__()
|
821 |
+
self.transformers = nn.ModuleList(
|
822 |
+
[
|
823 |
+
Transformer2DModel(
|
824 |
+
num_attention_heads=num_attention_heads,
|
825 |
+
attention_head_dim=attention_head_dim,
|
826 |
+
in_channels=in_channels,
|
827 |
+
num_layers=num_layers,
|
828 |
+
dropout=dropout,
|
829 |
+
norm_num_groups=norm_num_groups,
|
830 |
+
cross_attention_dim=cross_attention_dim,
|
831 |
+
attention_bias=attention_bias,
|
832 |
+
sample_size=sample_size,
|
833 |
+
num_vector_embeds=num_vector_embeds,
|
834 |
+
activation_fn=activation_fn,
|
835 |
+
num_embeds_ada_norm=num_embeds_ada_norm,
|
836 |
+
)
|
837 |
+
for _ in range(2)
|
838 |
+
]
|
839 |
+
)
|
840 |
+
|
841 |
+
# Variables that can be set by a pipeline:
|
842 |
+
|
843 |
+
# The ratio of transformer1 to transformer2's output states to be combined during inference
|
844 |
+
self.mix_ratio = 0.5
|
845 |
+
|
846 |
+
# The shape of `encoder_hidden_states` is expected to be
|
847 |
+
# `(batch_size, condition_lengths[0]+condition_lengths[1], num_features)`
|
848 |
+
self.condition_lengths = [77, 257]
|
849 |
+
|
850 |
+
# Which transformer to use to encode which condition.
|
851 |
+
# E.g. `(1, 0)` means that we'll use `transformers[1](conditions[0])` and `transformers[0](conditions[1])`
|
852 |
+
self.transformer_index_for_condition = [1, 0]
|
853 |
+
|
854 |
+
def forward(self, hidden_states, encoder_hidden_states, timestep=None, return_dict: bool = True):
|
855 |
+
"""
|
856 |
+
Args:
|
857 |
+
hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
|
858 |
+
When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input
|
859 |
+
hidden_states
|
860 |
+
encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, context dim)`, *optional*):
|
861 |
+
Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
|
862 |
+
self-attention.
|
863 |
+
timestep ( `torch.long`, *optional*):
|
864 |
+
Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
|
865 |
+
return_dict (`bool`, *optional*, defaults to `True`):
|
866 |
+
Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
|
867 |
+
|
868 |
+
Returns:
|
869 |
+
[`~models.attention.Transformer2DModelOutput`] or `tuple`: [`~models.attention.Transformer2DModelOutput`]
|
870 |
+
if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample
|
871 |
+
tensor.
|
872 |
+
"""
|
873 |
+
input_states = hidden_states
|
874 |
+
|
875 |
+
encoded_states = []
|
876 |
+
tokens_start = 0
|
877 |
+
for i in range(2):
|
878 |
+
# for each of the two transformers, pass the corresponding condition tokens
|
879 |
+
condition_state = encoder_hidden_states[:,
|
880 |
+
tokens_start: tokens_start + self.condition_lengths[i]]
|
881 |
+
transformer_index = self.transformer_index_for_condition[i]
|
882 |
+
encoded_state = self.transformers[transformer_index](input_states, condition_state, timestep, return_dict)[
|
883 |
+
0
|
884 |
+
]
|
885 |
+
encoded_states.append(encoded_state - input_states)
|
886 |
+
tokens_start += self.condition_lengths[i]
|
887 |
+
|
888 |
+
output_states = encoded_states[0] * self.mix_ratio + \
|
889 |
+
encoded_states[1] * (1 - self.mix_ratio)
|
890 |
+
output_states = output_states + input_states
|
891 |
+
|
892 |
+
if not return_dict:
|
893 |
+
return (output_states,)
|
894 |
+
|
895 |
+
return Transformer2DModelOutput(sample=output_states)
|
896 |
+
|
897 |
+
def _set_attention_slice(self, slice_size):
|
898 |
+
for transformer in self.transformers:
|
899 |
+
transformer._set_attention_slice(slice_size)
|
900 |
+
|
901 |
+
def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
|
902 |
+
for transformer in self.transformers:
|
903 |
+
transformer._set_use_memory_efficient_attention_xformers(
|
904 |
+
use_memory_efficient_attention_xformers)
|
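Note (not part of the commit): the font-size handling in `CrossAttention._attention` above re-weights the cross-attention distribution rather than the raw scores: the QK^T logits are exponentiated, the columns of the selected prompt tokens are scaled by the absolute font-size weight, the result is renormalized, and the sign is applied afterwards. The snippet below is a minimal, self-contained sketch of just that step; the helper name `reweight_attention`, the tensor shapes, and the example token indices are illustrative assumptions, not code from this repository.

import torch

def reweight_attention(scores, word_pos, font_size):
    # scores: [batch*heads, queries, 77] raw QK^T logits (pre-softmax)
    # word_pos: indices of the prompt tokens whose weight should change
    # font_size: per-token multipliers; the sign is re-applied after normalization
    score_exp = scores.exp()
    font_abs, font_sign = font_size.abs(), font_size.sign()
    score_exp[:, :, word_pos] = score_exp[:, :, word_pos] * font_abs
    probs = score_exp / score_exp.sum(-1, keepdim=True)
    probs[:, :, word_pos] *= font_sign
    return probs

# illustrative shapes: 8 heads x 16 query positions over a 77-token prompt
scores = torch.randn(8, 16, 77)
probs = reweight_attention(scores,
                           word_pos=torch.tensor([5, 6]),
                           font_size=torch.tensor([1.5, 1.5]))
print(probs.shape)  # torch.Size([8, 16, 77])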
models/region_diffusion.py
ADDED
@@ -0,0 +1,461 @@
import os
import torch
import collections
import torch.nn as nn
from functools import partial
from transformers import CLIPTextModel, CLIPTokenizer, logging
from diffusers import AutoencoderKL, PNDMScheduler, EulerDiscreteScheduler, DPMSolverMultistepScheduler
from models.unet_2d_condition import UNet2DConditionModel
from utils.attention_utils import CrossAttentionLayers, SelfAttentionLayers

# suppress partial model loading warning
logging.set_verbosity_error()


class RegionDiffusion(nn.Module):
    def __init__(self, device):
        super().__init__()

        self.device = device
        self.num_train_timesteps = 1000
        self.clip_gradient = False

        print(f'[INFO] loading stable diffusion...')
        model_id = 'runwayml/stable-diffusion-v1-5'

        self.vae = AutoencoderKL.from_pretrained(
            model_id, subfolder="vae").to(self.device)
        self.tokenizer = CLIPTokenizer.from_pretrained(
            model_id, subfolder='tokenizer')
        self.text_encoder = CLIPTextModel.from_pretrained(
            model_id, subfolder='text_encoder').to(self.device)
        self.unet = UNet2DConditionModel.from_pretrained(
            model_id, subfolder="unet").to(self.device)

        self.scheduler = PNDMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear",
                                       num_train_timesteps=self.num_train_timesteps, skip_prk_steps=True, steps_offset=1)
        self.alphas_cumprod = self.scheduler.alphas_cumprod.to(self.device)

        self.masks = []
        self.attention_maps = None
        self.selfattn_maps = None
        self.crossattn_maps = None
        self.color_loss = torch.nn.functional.mse_loss
        self.forward_hooks = []
        self.forward_replacement_hooks = []

        print(f'[INFO] loaded stable diffusion!')

    def get_text_embeds(self, prompt, negative_prompt):
        # prompt, negative_prompt: [str]

        # Tokenize text and get embeddings
        text_input = self.tokenizer(
            prompt, padding='max_length', max_length=self.tokenizer.model_max_length, truncation=True, return_tensors='pt')

        with torch.no_grad():
            text_embeddings = self.text_encoder(
                text_input.input_ids.to(self.device))[0]

        # Do the same for unconditional embeddings
        uncond_input = self.tokenizer(negative_prompt, padding='max_length',
                                      max_length=self.tokenizer.model_max_length, return_tensors='pt')

        with torch.no_grad():
            uncond_embeddings = self.text_encoder(
                uncond_input.input_ids.to(self.device))[0]

        # Cat for final embeddings
        text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
        return text_embeddings

    def get_text_embeds_list(self, prompts):
        # prompts: [list]
        text_embeddings = []
        for prompt in prompts:
            # Tokenize text and get embeddings
            text_input = self.tokenizer(
                [prompt], padding='max_length', max_length=self.tokenizer.model_max_length, truncation=True, return_tensors='pt')

            with torch.no_grad():
                text_embeddings.append(self.text_encoder(
                    text_input.input_ids.to(self.device))[0])

        return text_embeddings

    def produce_latents(self, text_embeddings, height=512, width=512, num_inference_steps=50, guidance_scale=7.5,
                        latents=None, use_guidance=False, text_format_dict={}, inject_selfattn=0, inject_background=0):

        if latents is None:
            latents = torch.randn(
                (1, self.unet.in_channels, height // 8, width // 8), device=self.device)

        if inject_selfattn > 0 or inject_background > 0:
            latents_reference = latents.clone().detach()
        self.scheduler.set_timesteps(num_inference_steps)
        n_styles = text_embeddings.shape[0]-1
        assert n_styles == len(self.masks)
        with torch.autocast('cuda'):
            for i, t in enumerate(self.scheduler.timesteps):

                # predict the noise residual
                with torch.no_grad():
                    # tokens without any attributes
                    feat_inject_step = t > (1-inject_selfattn) * 1000
                    background_inject_step = i == int(inject_background * len(self.scheduler.timesteps)) and inject_background > 0
                    noise_pred_uncond_cur = self.unet(latents, t, encoder_hidden_states=text_embeddings[:1],
                                                      text_format_dict={})['sample']
                    noise_pred_text_cur = self.unet(latents, t, encoder_hidden_states=text_embeddings[-1:],
                                                    text_format_dict=text_format_dict)['sample']
                    if inject_selfattn > 0 or inject_background > 0:
                        noise_pred_uncond_refer = self.unet(latents_reference, t, encoder_hidden_states=text_embeddings[:1],
                                                            text_format_dict={})['sample']
                        self.register_selfattn_hooks(feat_inject_step)
                        noise_pred_text_refer = self.unet(latents_reference, t, encoder_hidden_states=text_embeddings[-1:],
                                                          text_format_dict={})['sample']
                        self.remove_selfattn_hooks()
                    noise_pred_uncond = noise_pred_uncond_cur * self.masks[-1]
                    noise_pred_text = noise_pred_text_cur * self.masks[-1]
                    # tokens with attributes
                    for style_i, mask in enumerate(self.masks[:-1]):
                        self.register_replacement_hooks(feat_inject_step)
                        noise_pred_text_cur = self.unet(latents, t, encoder_hidden_states=text_embeddings[style_i+1:style_i+2],
                                                        text_format_dict={})['sample']
                        self.remove_replacement_hooks()
                        noise_pred_uncond = noise_pred_uncond + noise_pred_uncond_cur*mask
                        noise_pred_text = noise_pred_text + noise_pred_text_cur*mask

                # perform classifier-free guidance
                noise_pred = noise_pred_uncond + guidance_scale * \
                    (noise_pred_text - noise_pred_uncond)

                if inject_selfattn > 0 or inject_background > 0:
                    noise_pred_refer = noise_pred_uncond_refer + guidance_scale * \
                        (noise_pred_text_refer - noise_pred_uncond_refer)

                    # compute the previous noisy sample x_t -> x_t-1
                    latents_reference = self.scheduler.step(torch.cat([noise_pred, noise_pred_refer]), t,
                                                            torch.cat([latents, latents_reference]))[
                        'prev_sample']
                    latents, latents_reference = torch.chunk(
                        latents_reference, 2, dim=0)

                else:
                    # compute the previous noisy sample x_t -> x_t-1
                    latents = self.scheduler.step(noise_pred, t, latents)[
                        'prev_sample']

                # apply guidance
                if use_guidance and t < text_format_dict['guidance_start_step']:
                    with torch.enable_grad():
                        if not latents.requires_grad:
                            latents.requires_grad = True
                        latents_0 = self.predict_x0(latents, noise_pred, t)
                        latents_inp = 1 / 0.18215 * latents_0
                        imgs = self.vae.decode(latents_inp).sample
                        imgs = (imgs / 2 + 0.5).clamp(0, 1)
                        loss_total = 0.
                        for attn_map, rgb_val in zip(text_format_dict['color_obj_atten'], text_format_dict['target_RGB']):
                            avg_rgb = (
                                imgs*attn_map[:, 0]).sum(2).sum(2)/attn_map[:, 0].sum()
                            loss = self.color_loss(
                                avg_rgb, rgb_val[:, :, 0, 0])*100
                            loss_total += loss
                        loss_total.backward()
                    latents = (
                        latents - latents.grad * text_format_dict['color_guidance_weight'] * text_format_dict['color_obj_atten_all']).detach().clone()

                # apply background injection
                if background_inject_step:
                    latents = latents_reference * self.masks[-1] + latents * \
                        (1-self.masks[-1])
        return latents

    def predict_x0(self, x_t, eps_t, t):
        alpha_t = self.scheduler.alphas_cumprod[t]
        return (x_t - eps_t * torch.sqrt(1-alpha_t)) / torch.sqrt(alpha_t)

    def produce_attn_maps(self, prompts, negative_prompts='', height=512, width=512, num_inference_steps=50,
                          guidance_scale=7.5, latents=None):

        if isinstance(prompts, str):
            prompts = [prompts]

        if isinstance(negative_prompts, str):
            negative_prompts = [negative_prompts]

        # Prompts -> text embeds
        text_embeddings = self.get_text_embeds(
            prompts, negative_prompts)  # [2, 77, 768]
        if latents is None:
            latents = torch.randn(
                (text_embeddings.shape[0] // 2, self.unet.in_channels, height // 8, width // 8), device=self.device)

        self.scheduler.set_timesteps(num_inference_steps)
        self.remove_replacement_hooks()

        with torch.autocast('cuda'):
            for i, t in enumerate(self.scheduler.timesteps):
                # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
                latent_model_input = torch.cat([latents] * 2)

                # predict the noise residual
                with torch.no_grad():
                    noise_pred = self.unet(
                        latent_model_input, t, encoder_hidden_states=text_embeddings)['sample']

                # perform guidance
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * \
                    (noise_pred_text - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents)[
                    'prev_sample']

        # Img latents -> imgs
        imgs = self.decode_latents(latents)  # [1, 3, 512, 512]

        # Img to Numpy
        imgs = imgs.detach().cpu().permute(0, 2, 3, 1).numpy()
        imgs = (imgs * 255).round().astype('uint8')

        return imgs

    def decode_latents(self, latents):

        latents = 1 / 0.18215 * latents

        with torch.no_grad():
            imgs = self.vae.decode(latents).sample

        imgs = (imgs / 2 + 0.5).clamp(0, 1)

        return imgs

    def encode_imgs(self, imgs):
        # imgs: [B, 3, H, W]

        imgs = 2 * imgs - 1

        posterior = self.vae.encode(imgs).latent_dist
        latents = posterior.sample() * 0.18215

        return latents

    def prompt_to_img(self, prompts, negative_prompts='', height=512, width=512, num_inference_steps=50,
                      guidance_scale=7.5, latents=None, text_format_dict={}, use_guidance=False, inject_selfattn=0, inject_background=0):

        if isinstance(prompts, str):
            prompts = [prompts]

        if isinstance(negative_prompts, str):
            negative_prompts = [negative_prompts]

        # Prompts -> text embeds
        text_embeds = self.get_text_embeds(
            prompts, negative_prompts)  # [2, 77, 768]

        # else:
        latents = self.produce_latents(text_embeds, height=height, width=width, latents=latents,
                                       num_inference_steps=num_inference_steps, guidance_scale=guidance_scale,
                                       use_guidance=use_guidance, text_format_dict=text_format_dict,
                                       inject_selfattn=inject_selfattn, inject_background=inject_background)  # [1, 4, 64, 64]
        # Img latents -> imgs
        imgs = self.decode_latents(latents)  # [1, 3, 512, 512]

        # Img to Numpy
        imgs = imgs.detach().cpu().permute(0, 2, 3, 1).numpy()
        imgs = (imgs * 255).round().astype('uint8')

        return imgs

    def reset_attention_maps(self):
        r"""Function to reset attention maps.
        We reset attention maps because we append them while getting hooks
        to visualize attention maps for every step.
        """
        for key in self.selfattn_maps:
            self.selfattn_maps[key] = []
        for key in self.crossattn_maps:
            self.crossattn_maps[key] = []

    def register_evaluation_hooks(self):
        r"""Function for registering hooks during evaluation.
        We mainly store activation maps averaged over queries.
        """
        self.forward_hooks = []

        def save_activations(activations, name, module, inp, out):
            r"""
            PyTorch Forward hook to save outputs at each forward pass.
            """
            # out[0] - final output of attention layer
            # out[1] - attention probability matrix
            if 'attn2' in name:
                assert out[1].shape[-1] == 77
                activations[name].append(out[1].detach().cpu())
            else:
                assert out[1].shape[-1] != 77
        attention_dict = collections.defaultdict(list)
        for name, module in self.unet.named_modules():
            leaf_name = name.split('.')[-1]
            if 'attn' in leaf_name:
                # Register hook to obtain outputs at every attention layer.
                self.forward_hooks.append(module.register_forward_hook(
                    partial(save_activations, attention_dict, name)
                ))
        # attention_dict is a dictionary containing attention maps for every attention layer
        self.attention_maps = attention_dict

    def register_selfattn_hooks(self, feat_inject_step=False):
        r"""Function for registering hooks during evaluation.
        We mainly store activation maps averaged over queries.
        """
        self.selfattn_forward_hooks = []

        def save_activations(activations, name, module, inp, out):
            r"""
            PyTorch Forward hook to save outputs at each forward pass.
            """
            # out[0] - final output of attention layer
            # out[1] - attention probability matrix
            if 'attn2' in name:
                assert out[1][1].shape[-1] == 77
                # cross attention injection
                # activations[name] = out[1][1].detach()
            else:
                assert out[1][1].shape[-1] != 77
                activations[name] = out[1][1].detach()

        def save_resnet_activations(activations, name, module, inp, out):
            r"""
            PyTorch Forward hook to save outputs at each forward pass.
            """
            # out[0] - final output of residual layer
            # out[1] - residual hidden feature
            assert out[1].shape[-1] == 16
            activations[name] = out[1].detach()
        attention_dict = collections.defaultdict(list)
        for name, module in self.unet.named_modules():
            leaf_name = name.split('.')[-1]
            if 'attn' in leaf_name and feat_inject_step:
                # Register hook to obtain outputs at every attention layer.
                self.selfattn_forward_hooks.append(module.register_forward_hook(
                    partial(save_activations, attention_dict, name)
                ))
            if name == 'up_blocks.1.resnets.1' and feat_inject_step:
                self.selfattn_forward_hooks.append(module.register_forward_hook(
                    partial(save_resnet_activations, attention_dict, name)
                ))
        # attention_dict is a dictionary containing attention maps for every attention layer
        self.self_attention_maps_cur = attention_dict

    def register_replacement_hooks(self, feat_inject_step=False):
        r"""Function for registering hooks to replace self attention.
        """
        self.forward_replacement_hooks = []

        def replace_activations(name, module, args):
            r"""
            PyTorch Forward hook to save outputs at each forward pass.
            """
            if 'attn1' in name:
                modified_args = (args[0], self.self_attention_maps_cur[name])
                return modified_args
            # cross attention injection
            # elif 'attn2' in name:
            #     modified_map = {
            #         'reference': self.self_attention_maps_cur[name],
            #         'inject_pos': self.inject_pos,
            #     }
            #     modified_args = (args[0], modified_map)
            #     return modified_args

        def replace_resnet_activations(name, module, args):
            r"""
            PyTorch Forward hook to save outputs at each forward pass.
            """
            modified_args = (args[0], args[1],
                             self.self_attention_maps_cur[name])
            return modified_args
        for name, module in self.unet.named_modules():
            leaf_name = name.split('.')[-1]
            if 'attn' in leaf_name and feat_inject_step:
                # Register hook to obtain outputs at every attention layer.
                self.forward_replacement_hooks.append(module.register_forward_pre_hook(
                    partial(replace_activations, name)
                ))
            if name == 'up_blocks.1.resnets.1' and feat_inject_step:
                # Register hook to obtain outputs at every attention layer.
                self.forward_replacement_hooks.append(module.register_forward_pre_hook(
                    partial(replace_resnet_activations, name)
                ))

    def register_tokenmap_hooks(self):
        r"""Function for registering hooks during evaluation.
        We mainly store activation maps averaged over queries.
        """
        self.forward_hooks = []

        def save_activations(selfattn_maps, crossattn_maps, n_maps, name, module, inp, out):
            r"""
            PyTorch Forward hook to save outputs at each forward pass.
            """
            # out[0] - final output of attention layer
            # out[1] - attention probability matrices
            if name in n_maps:
                n_maps[name] += 1
            else:
                n_maps[name] = 1
            if 'attn2' in name:
                assert out[1][0].shape[-1] == 77
                if name in CrossAttentionLayers and n_maps[name] > 10:
                    if name in crossattn_maps:
                        crossattn_maps[name] += out[1][0].detach().cpu()[1:2]
                    else:
                        crossattn_maps[name] = out[1][0].detach().cpu()[1:2]
            else:
                assert out[1][0].shape[-1] != 77
                if name in SelfAttentionLayers and n_maps[name] > 10:
                    if name in crossattn_maps:
                        selfattn_maps[name] += out[1][0].detach().cpu()[1:2]
                    else:
                        selfattn_maps[name] = out[1][0].detach().cpu()[1:2]

        selfattn_maps = collections.defaultdict(list)
        crossattn_maps = collections.defaultdict(list)
        n_maps = collections.defaultdict(list)

        for name, module in self.unet.named_modules():
            leaf_name = name.split('.')[-1]
            if 'attn' in leaf_name:
                # Register hook to obtain outputs at every attention layer.
                self.forward_hooks.append(module.register_forward_hook(
                    partial(save_activations, selfattn_maps,
                            crossattn_maps, n_maps, name)
                ))
        # attention_dict is a dictionary containing attention maps for every attention layer
        self.selfattn_maps = selfattn_maps
        self.crossattn_maps = crossattn_maps
        self.n_maps = n_maps

    def remove_tokenmap_hooks(self):
        for hook in self.forward_hooks:
            hook.remove()
        self.selfattn_maps = None
        self.crossattn_maps = None
        self.n_maps = None

    def remove_evaluation_hooks(self):
        for hook in self.forward_hooks:
            hook.remove()
        self.attention_maps = None

    def remove_replacement_hooks(self):
        for hook in self.forward_replacement_hooks:
            hook.remove()

    def remove_selfattn_hooks(self):
        for hook in self.selfattn_forward_hooks:
            hook.remove()
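Note (not part of the commit): a hedged usage sketch for the `RegionDiffusion` wrapper defined above. It only calls methods shown in this file; the prompt, the output filename, and the use of `imageio` for saving are illustrative assumptions (imageio is not necessarily listed in requirements.txt), and a CUDA device is assumed because `produce_latents` and `produce_attn_maps` run under `torch.autocast('cuda')`.

import imageio  # assumption: any image writer would do here
import torch

from models.region_diffusion import RegionDiffusion

device = torch.device('cuda')  # the autocast('cuda') blocks above assume a GPU
model = RegionDiffusion(device)

# Plain-text generation path: produce_attn_maps() needs no region masks,
# so it is the simplest entry point among the methods defined above.
imgs = model.produce_attn_maps('a cat sitting on a wooden table',
                               negative_prompts='',
                               height=512, width=512,
                               num_inference_steps=50,
                               guidance_scale=7.5)

# imgs is a uint8 numpy array of shape [1, 512, 512, 3]
# (see decode_latents and the numpy conversion above)
imageio.imwrite('sample.png', imgs[0])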
models/unet_2d_blocks.py
ADDED
@@ -0,0 +1,1855 @@
1 |
+
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
import numpy as np
|
15 |
+
import torch
|
16 |
+
from torch import nn
|
17 |
+
|
18 |
+
from .attention import AttentionBlock, DualTransformer2DModel, Transformer2DModel
|
19 |
+
from diffusers.models.resnet import Downsample2D, FirDownsample2D, FirUpsample2D, Upsample2D
|
20 |
+
|
21 |
+
|
22 |
+
def get_down_block(
|
23 |
+
down_block_type,
|
24 |
+
num_layers,
|
25 |
+
in_channels,
|
26 |
+
out_channels,
|
27 |
+
temb_channels,
|
28 |
+
add_downsample,
|
29 |
+
resnet_eps,
|
30 |
+
resnet_act_fn,
|
31 |
+
attn_num_head_channels,
|
32 |
+
resnet_groups=None,
|
33 |
+
cross_attention_dim=None,
|
34 |
+
downsample_padding=None,
|
35 |
+
dual_cross_attention=False,
|
36 |
+
use_linear_projection=False,
|
37 |
+
only_cross_attention=False,
|
38 |
+
):
|
39 |
+
down_block_type = down_block_type[7:] if down_block_type.startswith(
|
40 |
+
"UNetRes") else down_block_type
|
41 |
+
if down_block_type == "DownBlock2D":
|
42 |
+
return DownBlock2D(
|
43 |
+
num_layers=num_layers,
|
44 |
+
in_channels=in_channels,
|
45 |
+
out_channels=out_channels,
|
46 |
+
temb_channels=temb_channels,
|
47 |
+
add_downsample=add_downsample,
|
48 |
+
resnet_eps=resnet_eps,
|
49 |
+
resnet_act_fn=resnet_act_fn,
|
50 |
+
resnet_groups=resnet_groups,
|
51 |
+
downsample_padding=downsample_padding,
|
52 |
+
)
|
53 |
+
elif down_block_type == "AttnDownBlock2D":
|
54 |
+
return AttnDownBlock2D(
|
55 |
+
num_layers=num_layers,
|
56 |
+
in_channels=in_channels,
|
57 |
+
out_channels=out_channels,
|
58 |
+
temb_channels=temb_channels,
|
59 |
+
add_downsample=add_downsample,
|
60 |
+
resnet_eps=resnet_eps,
|
61 |
+
resnet_act_fn=resnet_act_fn,
|
62 |
+
resnet_groups=resnet_groups,
|
63 |
+
downsample_padding=downsample_padding,
|
64 |
+
attn_num_head_channels=attn_num_head_channels,
|
65 |
+
)
|
66 |
+
elif down_block_type == "CrossAttnDownBlock2D":
|
67 |
+
if cross_attention_dim is None:
|
68 |
+
raise ValueError(
|
69 |
+
"cross_attention_dim must be specified for CrossAttnDownBlock2D")
|
70 |
+
return CrossAttnDownBlock2D(
|
71 |
+
num_layers=num_layers,
|
72 |
+
in_channels=in_channels,
|
73 |
+
out_channels=out_channels,
|
74 |
+
temb_channels=temb_channels,
|
75 |
+
add_downsample=add_downsample,
|
76 |
+
resnet_eps=resnet_eps,
|
77 |
+
resnet_act_fn=resnet_act_fn,
|
78 |
+
resnet_groups=resnet_groups,
|
79 |
+
downsample_padding=downsample_padding,
|
80 |
+
cross_attention_dim=cross_attention_dim,
|
81 |
+
attn_num_head_channels=attn_num_head_channels,
|
82 |
+
dual_cross_attention=dual_cross_attention,
|
83 |
+
use_linear_projection=use_linear_projection,
|
84 |
+
only_cross_attention=only_cross_attention,
|
85 |
+
)
|
86 |
+
elif down_block_type == "SkipDownBlock2D":
|
87 |
+
return SkipDownBlock2D(
|
88 |
+
num_layers=num_layers,
|
89 |
+
in_channels=in_channels,
|
90 |
+
out_channels=out_channels,
|
91 |
+
temb_channels=temb_channels,
|
92 |
+
add_downsample=add_downsample,
|
93 |
+
resnet_eps=resnet_eps,
|
94 |
+
resnet_act_fn=resnet_act_fn,
|
95 |
+
downsample_padding=downsample_padding,
|
96 |
+
)
|
97 |
+
elif down_block_type == "AttnSkipDownBlock2D":
|
98 |
+
return AttnSkipDownBlock2D(
|
99 |
+
num_layers=num_layers,
|
100 |
+
in_channels=in_channels,
|
101 |
+
out_channels=out_channels,
|
102 |
+
temb_channels=temb_channels,
|
103 |
+
add_downsample=add_downsample,
|
104 |
+
resnet_eps=resnet_eps,
|
105 |
+
resnet_act_fn=resnet_act_fn,
|
106 |
+
downsample_padding=downsample_padding,
|
107 |
+
attn_num_head_channels=attn_num_head_channels,
|
108 |
+
)
|
109 |
+
elif down_block_type == "DownEncoderBlock2D":
|
110 |
+
return DownEncoderBlock2D(
|
111 |
+
num_layers=num_layers,
|
112 |
+
in_channels=in_channels,
|
113 |
+
out_channels=out_channels,
|
114 |
+
add_downsample=add_downsample,
|
115 |
+
resnet_eps=resnet_eps,
|
116 |
+
resnet_act_fn=resnet_act_fn,
|
117 |
+
resnet_groups=resnet_groups,
|
118 |
+
downsample_padding=downsample_padding,
|
119 |
+
)
|
120 |
+
elif down_block_type == "AttnDownEncoderBlock2D":
|
121 |
+
return AttnDownEncoderBlock2D(
|
122 |
+
num_layers=num_layers,
|
123 |
+
in_channels=in_channels,
|
124 |
+
out_channels=out_channels,
|
125 |
+
add_downsample=add_downsample,
|
126 |
+
resnet_eps=resnet_eps,
|
127 |
+
resnet_act_fn=resnet_act_fn,
|
128 |
+
resnet_groups=resnet_groups,
|
129 |
+
downsample_padding=downsample_padding,
|
130 |
+
attn_num_head_channels=attn_num_head_channels,
|
131 |
+
)
|
132 |
+
raise ValueError(f"{down_block_type} does not exist.")
|
133 |
+
|
134 |
+
|
135 |
+
def get_up_block(
|
136 |
+
up_block_type,
|
137 |
+
num_layers,
|
138 |
+
in_channels,
|
139 |
+
out_channels,
|
140 |
+
prev_output_channel,
|
141 |
+
temb_channels,
|
142 |
+
add_upsample,
|
143 |
+
resnet_eps,
|
144 |
+
resnet_act_fn,
|
145 |
+
attn_num_head_channels,
|
146 |
+
resnet_groups=None,
|
147 |
+
cross_attention_dim=None,
|
148 |
+
dual_cross_attention=False,
|
149 |
+
use_linear_projection=False,
|
150 |
+
only_cross_attention=False,
|
151 |
+
):
|
152 |
+
up_block_type = up_block_type[7:] if up_block_type.startswith(
|
153 |
+
"UNetRes") else up_block_type
|
154 |
+
if up_block_type == "UpBlock2D":
|
155 |
+
return UpBlock2D(
|
156 |
+
num_layers=num_layers,
|
157 |
+
in_channels=in_channels,
|
158 |
+
out_channels=out_channels,
|
159 |
+
prev_output_channel=prev_output_channel,
|
160 |
+
temb_channels=temb_channels,
|
161 |
+
add_upsample=add_upsample,
|
162 |
+
resnet_eps=resnet_eps,
|
163 |
+
resnet_act_fn=resnet_act_fn,
|
164 |
+
resnet_groups=resnet_groups,
|
165 |
+
)
|
166 |
+
elif up_block_type == "CrossAttnUpBlock2D":
|
167 |
+
if cross_attention_dim is None:
|
168 |
+
raise ValueError(
|
169 |
+
"cross_attention_dim must be specified for CrossAttnUpBlock2D")
|
170 |
+
return CrossAttnUpBlock2D(
|
171 |
+
num_layers=num_layers,
|
172 |
+
in_channels=in_channels,
|
173 |
+
out_channels=out_channels,
|
174 |
+
prev_output_channel=prev_output_channel,
|
175 |
+
temb_channels=temb_channels,
|
176 |
+
add_upsample=add_upsample,
|
177 |
+
resnet_eps=resnet_eps,
|
178 |
+
resnet_act_fn=resnet_act_fn,
|
179 |
+
resnet_groups=resnet_groups,
|
180 |
+
cross_attention_dim=cross_attention_dim,
|
181 |
+
attn_num_head_channels=attn_num_head_channels,
|
182 |
+
dual_cross_attention=dual_cross_attention,
|
183 |
+
use_linear_projection=use_linear_projection,
|
184 |
+
only_cross_attention=only_cross_attention,
|
185 |
+
)
|
186 |
+
elif up_block_type == "AttnUpBlock2D":
|
187 |
+
return AttnUpBlock2D(
|
188 |
+
num_layers=num_layers,
|
189 |
+
in_channels=in_channels,
|
190 |
+
out_channels=out_channels,
|
191 |
+
prev_output_channel=prev_output_channel,
|
192 |
+
temb_channels=temb_channels,
|
193 |
+
add_upsample=add_upsample,
|
194 |
+
resnet_eps=resnet_eps,
|
195 |
+
resnet_act_fn=resnet_act_fn,
|
196 |
+
resnet_groups=resnet_groups,
|
197 |
+
attn_num_head_channels=attn_num_head_channels,
|
198 |
+
)
|
199 |
+
elif up_block_type == "SkipUpBlock2D":
|
200 |
+
return SkipUpBlock2D(
|
201 |
+
num_layers=num_layers,
|
202 |
+
in_channels=in_channels,
|
203 |
+
out_channels=out_channels,
|
204 |
+
prev_output_channel=prev_output_channel,
|
205 |
+
temb_channels=temb_channels,
|
206 |
+
add_upsample=add_upsample,
|
207 |
+
resnet_eps=resnet_eps,
|
208 |
+
resnet_act_fn=resnet_act_fn,
|
209 |
+
)
|
210 |
+
elif up_block_type == "AttnSkipUpBlock2D":
|
211 |
+
return AttnSkipUpBlock2D(
|
212 |
+
num_layers=num_layers,
|
213 |
+
in_channels=in_channels,
|
214 |
+
out_channels=out_channels,
|
215 |
+
prev_output_channel=prev_output_channel,
|
216 |
+
temb_channels=temb_channels,
|
217 |
+
add_upsample=add_upsample,
|
218 |
+
resnet_eps=resnet_eps,
|
219 |
+
resnet_act_fn=resnet_act_fn,
|
220 |
+
attn_num_head_channels=attn_num_head_channels,
|
221 |
+
)
|
222 |
+
elif up_block_type == "UpDecoderBlock2D":
|
223 |
+
return UpDecoderBlock2D(
|
224 |
+
num_layers=num_layers,
|
225 |
+
in_channels=in_channels,
|
226 |
+
out_channels=out_channels,
|
227 |
+
add_upsample=add_upsample,
|
228 |
+
resnet_eps=resnet_eps,
|
229 |
+
resnet_act_fn=resnet_act_fn,
|
230 |
+
resnet_groups=resnet_groups,
|
231 |
+
)
|
232 |
+
elif up_block_type == "AttnUpDecoderBlock2D":
|
233 |
+
return AttnUpDecoderBlock2D(
|
234 |
+
num_layers=num_layers,
|
235 |
+
in_channels=in_channels,
|
236 |
+
out_channels=out_channels,
|
237 |
+
add_upsample=add_upsample,
|
238 |
+
resnet_eps=resnet_eps,
|
239 |
+
resnet_act_fn=resnet_act_fn,
|
240 |
+
resnet_groups=resnet_groups,
|
241 |
+
attn_num_head_channels=attn_num_head_channels,
|
242 |
+
)
|
243 |
+
raise ValueError(f"{up_block_type} does not exist.")
|
244 |
+
|
245 |
+
|
246 |
+
class UNetMidBlock2D(nn.Module):
|
247 |
+
def __init__(
|
248 |
+
self,
|
249 |
+
in_channels: int,
|
250 |
+
temb_channels: int,
|
251 |
+
dropout: float = 0.0,
|
252 |
+
num_layers: int = 1,
|
253 |
+
resnet_eps: float = 1e-6,
|
254 |
+
resnet_time_scale_shift: str = "default",
|
255 |
+
resnet_act_fn: str = "swish",
|
256 |
+
resnet_groups: int = 32,
|
257 |
+
resnet_pre_norm: bool = True,
|
258 |
+
attn_num_head_channels=1,
|
259 |
+
attention_type="default",
|
260 |
+
output_scale_factor=1.0,
|
261 |
+
):
|
262 |
+
super().__init__()
|
263 |
+
|
264 |
+
self.attention_type = attention_type
|
265 |
+
resnet_groups = resnet_groups if resnet_groups is not None else min(
|
266 |
+
in_channels // 4, 32)
|
267 |
+
|
268 |
+
# there is always at least one resnet
|
269 |
+
resnets = [
|
270 |
+
ResnetBlock2D(
|
271 |
+
in_channels=in_channels,
|
272 |
+
out_channels=in_channels,
|
273 |
+
temb_channels=temb_channels,
|
274 |
+
eps=resnet_eps,
|
275 |
+
groups=resnet_groups,
|
276 |
+
dropout=dropout,
|
277 |
+
time_embedding_norm=resnet_time_scale_shift,
|
278 |
+
non_linearity=resnet_act_fn,
|
279 |
+
output_scale_factor=output_scale_factor,
|
280 |
+
pre_norm=resnet_pre_norm,
|
281 |
+
)
|
282 |
+
]
|
283 |
+
attentions = []
|
284 |
+
|
285 |
+
for _ in range(num_layers):
|
286 |
+
attentions.append(
|
287 |
+
AttentionBlock(
|
288 |
+
in_channels,
|
289 |
+
num_head_channels=attn_num_head_channels,
|
290 |
+
rescale_output_factor=output_scale_factor,
|
291 |
+
eps=resnet_eps,
|
292 |
+
norm_num_groups=resnet_groups,
|
293 |
+
)
|
294 |
+
)
|
295 |
+
resnets.append(
|
296 |
+
ResnetBlock2D(
|
297 |
+
in_channels=in_channels,
|
298 |
+
out_channels=in_channels,
|
299 |
+
temb_channels=temb_channels,
|
300 |
+
eps=resnet_eps,
|
301 |
+
groups=resnet_groups,
|
302 |
+
dropout=dropout,
|
303 |
+
time_embedding_norm=resnet_time_scale_shift,
|
304 |
+
non_linearity=resnet_act_fn,
|
305 |
+
output_scale_factor=output_scale_factor,
|
306 |
+
pre_norm=resnet_pre_norm,
|
307 |
+
)
|
308 |
+
)
|
309 |
+
|
310 |
+
self.attentions = nn.ModuleList(attentions)
|
311 |
+
self.resnets = nn.ModuleList(resnets)
|
312 |
+
|
313 |
+
def forward(self, hidden_states, temb=None, encoder_states=None):
|
314 |
+
hidden_states = self.resnets[0](hidden_states, temb)
|
315 |
+
for attn, resnet in zip(self.attentions, self.resnets[1:]):
|
316 |
+
if self.attention_type == "default":
|
317 |
+
hidden_states = attn(hidden_states)
|
318 |
+
else:
|
319 |
+
hidden_states = attn(hidden_states, encoder_states)
|
320 |
+
hidden_states, _ = resnet(hidden_states, temb)
|
321 |
+
|
322 |
+
return hidden_states
|
323 |
+
|
324 |
+
|
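# ---------------------------------------------------------------------------
# Illustrative usage sketch added for clarity; it is not part of the original
# file, and the channel sizes are hypothetical.  It exercises the plain
# (self-attention) mid block defined above; note that in this modified copy
# of the diffusers blocks, ResnetBlock2D.forward returns an
# (output, hidden_states) tuple, which is why the forward passes unpack it.
def _example_unet_mid_block_2d():
    batch, channels, height, width = 2, 64, 8, 8
    temb_channels = 128
    block = UNetMidBlock2D(in_channels=channels, temb_channels=temb_channels)
    sample = torch.randn(batch, channels, height, width)
    temb = torch.randn(batch, temb_channels)
    out = block(sample, temb)   # spatial shape is preserved by the mid block
    assert out.shape == sample.shape
    return out
# ---------------------------------------------------------------------------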
325 |
+
class UNetMidBlock2DCrossAttn(nn.Module):
|
326 |
+
def __init__(
|
327 |
+
self,
|
328 |
+
in_channels: int,
|
329 |
+
temb_channels: int,
|
330 |
+
dropout: float = 0.0,
|
331 |
+
num_layers: int = 1,
|
332 |
+
resnet_eps: float = 1e-6,
|
333 |
+
resnet_time_scale_shift: str = "default",
|
334 |
+
resnet_act_fn: str = "swish",
|
335 |
+
resnet_groups: int = 32,
|
336 |
+
resnet_pre_norm: bool = True,
|
337 |
+
attn_num_head_channels=1,
|
338 |
+
attention_type="default",
|
339 |
+
output_scale_factor=1.0,
|
340 |
+
cross_attention_dim=1280,
|
341 |
+
dual_cross_attention=False,
|
342 |
+
use_linear_projection=False,
|
343 |
+
):
|
344 |
+
super().__init__()
|
345 |
+
|
346 |
+
self.attention_type = attention_type
|
347 |
+
self.attn_num_head_channels = attn_num_head_channels
|
348 |
+
resnet_groups = resnet_groups if resnet_groups is not None else min(
|
349 |
+
in_channels // 4, 32)
|
350 |
+
|
351 |
+
# there is always at least one resnet
|
352 |
+
resnets = [
|
353 |
+
ResnetBlock2D(
|
354 |
+
in_channels=in_channels,
|
355 |
+
out_channels=in_channels,
|
356 |
+
temb_channels=temb_channels,
|
357 |
+
eps=resnet_eps,
|
358 |
+
groups=resnet_groups,
|
359 |
+
dropout=dropout,
|
360 |
+
time_embedding_norm=resnet_time_scale_shift,
|
361 |
+
non_linearity=resnet_act_fn,
|
362 |
+
output_scale_factor=output_scale_factor,
|
363 |
+
pre_norm=resnet_pre_norm,
|
364 |
+
)
|
365 |
+
]
|
366 |
+
attentions = []
|
367 |
+
|
368 |
+
for _ in range(num_layers):
|
369 |
+
if not dual_cross_attention:
|
370 |
+
attentions.append(
|
371 |
+
Transformer2DModel(
|
372 |
+
attn_num_head_channels,
|
373 |
+
in_channels // attn_num_head_channels,
|
374 |
+
in_channels=in_channels,
|
375 |
+
num_layers=1,
|
376 |
+
cross_attention_dim=cross_attention_dim,
|
377 |
+
norm_num_groups=resnet_groups,
|
378 |
+
use_linear_projection=use_linear_projection,
|
379 |
+
)
|
380 |
+
)
|
381 |
+
else:
|
382 |
+
attentions.append(
|
383 |
+
DualTransformer2DModel(
|
384 |
+
attn_num_head_channels,
|
385 |
+
in_channels // attn_num_head_channels,
|
386 |
+
in_channels=in_channels,
|
387 |
+
num_layers=1,
|
388 |
+
cross_attention_dim=cross_attention_dim,
|
389 |
+
norm_num_groups=resnet_groups,
|
390 |
+
)
|
391 |
+
)
|
392 |
+
resnets.append(
|
393 |
+
ResnetBlock2D(
|
394 |
+
in_channels=in_channels,
|
395 |
+
out_channels=in_channels,
|
396 |
+
temb_channels=temb_channels,
|
397 |
+
eps=resnet_eps,
|
398 |
+
groups=resnet_groups,
|
399 |
+
dropout=dropout,
|
400 |
+
time_embedding_norm=resnet_time_scale_shift,
|
401 |
+
non_linearity=resnet_act_fn,
|
402 |
+
output_scale_factor=output_scale_factor,
|
403 |
+
pre_norm=resnet_pre_norm,
|
404 |
+
)
|
405 |
+
)
|
406 |
+
|
407 |
+
self.attentions = nn.ModuleList(attentions)
|
408 |
+
self.resnets = nn.ModuleList(resnets)
|
409 |
+
|
410 |
+
def set_attention_slice(self, slice_size):
|
411 |
+
head_dims = self.attn_num_head_channels
|
412 |
+
head_dims = [head_dims] if isinstance(head_dims, int) else head_dims
|
413 |
+
if slice_size is not None and any(dim % slice_size != 0 for dim in head_dims):
|
414 |
+
raise ValueError(
|
415 |
+
f"Make sure slice_size {slice_size} is a common divisor of "
|
416 |
+
f"the number of heads used in cross_attention: {head_dims}"
|
417 |
+
)
|
418 |
+
if slice_size is not None and slice_size > min(head_dims):
|
419 |
+
raise ValueError(
|
420 |
+
f"slice_size {slice_size} has to be smaller or equal to "
|
421 |
+
f"the lowest number of heads used in cross_attention: min({head_dims}) = {min(head_dims)}"
|
422 |
+
)
|
423 |
+
|
424 |
+
for attn in self.attentions:
|
425 |
+
attn._set_attention_slice(slice_size)
|
426 |
+
|
427 |
+
def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
|
428 |
+
for attn in self.attentions:
|
429 |
+
attn._set_use_memory_efficient_attention_xformers(
|
430 |
+
use_memory_efficient_attention_xformers)
|
431 |
+
|
432 |
+
def forward(self, hidden_states, temb=None, encoder_hidden_states=None,
|
433 |
+
text_format_dict={}):
|
434 |
+
hidden_states, _ = self.resnets[0](hidden_states, temb)
|
435 |
+
for attn, resnet in zip(self.attentions, self.resnets[1:]):
|
436 |
+
hidden_states = attn(hidden_states, encoder_hidden_states,
|
437 |
+
text_format_dict).sample
|
438 |
+
hidden_states, _ = resnet(hidden_states, temb)
|
439 |
+
|
440 |
+
return hidden_states
|
441 |
+
|
442 |
+
|
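# ---------------------------------------------------------------------------
# Illustrative usage sketch added for clarity; not part of the original file.
# The cross-attention mid block above threads the Space-specific
# `text_format_dict` (built elsewhere in this repo from the rich-text prompt)
# into the modified Transformer2DModel.  Sizes are hypothetical, and passing
# an empty dict is assumed to fall back to ordinary cross-attention.
def _example_unet_mid_block_2d_cross_attn():
    block = UNetMidBlock2DCrossAttn(
        in_channels=1280, temb_channels=1280, cross_attention_dim=768)
    sample = torch.randn(2, 1280, 8, 8)
    temb = torch.randn(2, 1280)
    encoder_hidden_states = torch.randn(2, 77, 768)  # e.g. CLIP text states
    return block(sample, temb, encoder_hidden_states, text_format_dict={})
# ---------------------------------------------------------------------------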
443 |
+
class AttnDownBlock2D(nn.Module):
|
444 |
+
def __init__(
|
445 |
+
self,
|
446 |
+
in_channels: int,
|
447 |
+
out_channels: int,
|
448 |
+
temb_channels: int,
|
449 |
+
dropout: float = 0.0,
|
450 |
+
num_layers: int = 1,
|
451 |
+
resnet_eps: float = 1e-6,
|
452 |
+
resnet_time_scale_shift: str = "default",
|
453 |
+
resnet_act_fn: str = "swish",
|
454 |
+
resnet_groups: int = 32,
|
455 |
+
resnet_pre_norm: bool = True,
|
456 |
+
attn_num_head_channels=1,
|
457 |
+
attention_type="default",
|
458 |
+
output_scale_factor=1.0,
|
459 |
+
downsample_padding=1,
|
460 |
+
add_downsample=True,
|
461 |
+
):
|
462 |
+
super().__init__()
|
463 |
+
resnets = []
|
464 |
+
attentions = []
|
465 |
+
|
466 |
+
self.attention_type = attention_type
|
467 |
+
|
468 |
+
for i in range(num_layers):
|
469 |
+
in_channels = in_channels if i == 0 else out_channels
|
470 |
+
resnets.append(
|
471 |
+
ResnetBlock2D(
|
472 |
+
in_channels=in_channels,
|
473 |
+
out_channels=out_channels,
|
474 |
+
temb_channels=temb_channels,
|
475 |
+
eps=resnet_eps,
|
476 |
+
groups=resnet_groups,
|
477 |
+
dropout=dropout,
|
478 |
+
time_embedding_norm=resnet_time_scale_shift,
|
479 |
+
non_linearity=resnet_act_fn,
|
480 |
+
output_scale_factor=output_scale_factor,
|
481 |
+
pre_norm=resnet_pre_norm,
|
482 |
+
)
|
483 |
+
)
|
484 |
+
attentions.append(
|
485 |
+
AttentionBlock(
|
486 |
+
out_channels,
|
487 |
+
num_head_channels=attn_num_head_channels,
|
488 |
+
rescale_output_factor=output_scale_factor,
|
489 |
+
eps=resnet_eps,
|
490 |
+
norm_num_groups=resnet_groups,
|
491 |
+
)
|
492 |
+
)
|
493 |
+
|
494 |
+
self.attentions = nn.ModuleList(attentions)
|
495 |
+
self.resnets = nn.ModuleList(resnets)
|
496 |
+
|
497 |
+
if add_downsample:
|
498 |
+
self.downsamplers = nn.ModuleList(
|
499 |
+
[
|
500 |
+
Downsample2D(
|
501 |
+
out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
|
502 |
+
)
|
503 |
+
]
|
504 |
+
)
|
505 |
+
else:
|
506 |
+
self.downsamplers = None
|
507 |
+
|
508 |
+
def forward(self, hidden_states, temb=None):
|
509 |
+
output_states = ()
|
510 |
+
|
511 |
+
for resnet, attn in zip(self.resnets, self.attentions):
|
512 |
+
hidden_states, _ = resnet(hidden_states, temb)
|
513 |
+
hidden_states = attn(hidden_states)
|
514 |
+
output_states += (hidden_states,)
|
515 |
+
|
516 |
+
if self.downsamplers is not None:
|
517 |
+
for downsampler in self.downsamplers:
|
518 |
+
hidden_states = downsampler(hidden_states)
|
519 |
+
|
520 |
+
output_states += (hidden_states,)
|
521 |
+
|
522 |
+
return hidden_states, output_states
|
523 |
+
|
524 |
+
|
525 |
+
class CrossAttnDownBlock2D(nn.Module):
|
526 |
+
def __init__(
|
527 |
+
self,
|
528 |
+
in_channels: int,
|
529 |
+
out_channels: int,
|
530 |
+
temb_channels: int,
|
531 |
+
dropout: float = 0.0,
|
532 |
+
num_layers: int = 1,
|
533 |
+
resnet_eps: float = 1e-6,
|
534 |
+
resnet_time_scale_shift: str = "default",
|
535 |
+
resnet_act_fn: str = "swish",
|
536 |
+
resnet_groups: int = 32,
|
537 |
+
resnet_pre_norm: bool = True,
|
538 |
+
attn_num_head_channels=1,
|
539 |
+
cross_attention_dim=1280,
|
540 |
+
attention_type="default",
|
541 |
+
output_scale_factor=1.0,
|
542 |
+
downsample_padding=1,
|
543 |
+
add_downsample=True,
|
544 |
+
dual_cross_attention=False,
|
545 |
+
use_linear_projection=False,
|
546 |
+
only_cross_attention=False,
|
547 |
+
):
|
548 |
+
super().__init__()
|
549 |
+
resnets = []
|
550 |
+
attentions = []
|
551 |
+
|
552 |
+
self.attention_type = attention_type
|
553 |
+
self.attn_num_head_channels = attn_num_head_channels
|
554 |
+
|
555 |
+
for i in range(num_layers):
|
556 |
+
in_channels = in_channels if i == 0 else out_channels
|
557 |
+
resnets.append(
|
558 |
+
ResnetBlock2D(
|
559 |
+
in_channels=in_channels,
|
560 |
+
out_channels=out_channels,
|
561 |
+
temb_channels=temb_channels,
|
562 |
+
eps=resnet_eps,
|
563 |
+
groups=resnet_groups,
|
564 |
+
dropout=dropout,
|
565 |
+
time_embedding_norm=resnet_time_scale_shift,
|
566 |
+
non_linearity=resnet_act_fn,
|
567 |
+
output_scale_factor=output_scale_factor,
|
568 |
+
pre_norm=resnet_pre_norm,
|
569 |
+
)
|
570 |
+
)
|
571 |
+
if not dual_cross_attention:
|
572 |
+
attentions.append(
|
573 |
+
Transformer2DModel(
|
574 |
+
attn_num_head_channels,
|
575 |
+
out_channels // attn_num_head_channels,
|
576 |
+
in_channels=out_channels,
|
577 |
+
num_layers=1,
|
578 |
+
cross_attention_dim=cross_attention_dim,
|
579 |
+
norm_num_groups=resnet_groups,
|
580 |
+
use_linear_projection=use_linear_projection,
|
581 |
+
only_cross_attention=only_cross_attention,
|
582 |
+
)
|
583 |
+
)
|
584 |
+
else:
|
585 |
+
attentions.append(
|
586 |
+
DualTransformer2DModel(
|
587 |
+
attn_num_head_channels,
|
588 |
+
out_channels // attn_num_head_channels,
|
589 |
+
in_channels=out_channels,
|
590 |
+
num_layers=1,
|
591 |
+
cross_attention_dim=cross_attention_dim,
|
592 |
+
norm_num_groups=resnet_groups,
|
593 |
+
)
|
594 |
+
)
|
595 |
+
self.attentions = nn.ModuleList(attentions)
|
596 |
+
self.resnets = nn.ModuleList(resnets)
|
597 |
+
|
598 |
+
if add_downsample:
|
599 |
+
self.downsamplers = nn.ModuleList(
|
600 |
+
[
|
601 |
+
Downsample2D(
|
602 |
+
out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
|
603 |
+
)
|
604 |
+
]
|
605 |
+
)
|
606 |
+
else:
|
607 |
+
self.downsamplers = None
|
608 |
+
|
609 |
+
self.gradient_checkpointing = False
|
610 |
+
|
611 |
+
def set_attention_slice(self, slice_size):
|
612 |
+
head_dims = self.attn_num_head_channels
|
613 |
+
head_dims = [head_dims] if isinstance(head_dims, int) else head_dims
|
614 |
+
if slice_size is not None and any(dim % slice_size != 0 for dim in head_dims):
|
615 |
+
raise ValueError(
|
616 |
+
f"Make sure slice_size {slice_size} is a common divisor of "
|
617 |
+
f"the number of heads used in cross_attention: {head_dims}"
|
618 |
+
)
|
619 |
+
if slice_size is not None and slice_size > min(head_dims):
|
620 |
+
raise ValueError(
|
621 |
+
f"slice_size {slice_size} has to be smaller or equal to "
|
622 |
+
f"the lowest number of heads used in cross_attention: min({head_dims}) = {min(head_dims)}"
|
623 |
+
)
|
624 |
+
|
625 |
+
for attn in self.attentions:
|
626 |
+
attn._set_attention_slice(slice_size)
|
627 |
+
|
628 |
+
def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
|
629 |
+
for attn in self.attentions:
|
630 |
+
attn._set_use_memory_efficient_attention_xformers(
|
631 |
+
use_memory_efficient_attention_xformers)
|
632 |
+
|
633 |
+
def forward(self, hidden_states, temb=None, encoder_hidden_states=None,
|
634 |
+
text_format_dict={}):
|
635 |
+
output_states = ()
|
636 |
+
|
637 |
+
for resnet, attn in zip(self.resnets, self.attentions):
|
638 |
+
if self.training and self.gradient_checkpointing:
|
639 |
+
|
640 |
+
def create_custom_forward(module, return_dict=None):
|
641 |
+
def custom_forward(*inputs):
|
642 |
+
if return_dict is not None:
|
643 |
+
return module(*inputs, return_dict=return_dict)
|
644 |
+
else:
|
645 |
+
return module(*inputs)
|
646 |
+
|
647 |
+
return custom_forward
|
648 |
+
|
649 |
+
hidden_states, _ = torch.utils.checkpoint.checkpoint(
|
650 |
+
create_custom_forward(resnet), hidden_states, temb)
|
651 |
+
hidden_states = torch.utils.checkpoint.checkpoint(
|
652 |
+
create_custom_forward(
|
653 |
+
attn, return_dict=False), hidden_states, encoder_hidden_states,
|
654 |
+
text_format_dict
|
655 |
+
)[0]
|
656 |
+
else:
|
657 |
+
hidden_states, _ = resnet(hidden_states, temb)
|
658 |
+
hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states,
|
659 |
+
text_format_dict=text_format_dict).sample
|
660 |
+
|
661 |
+
output_states += (hidden_states,)
|
662 |
+
|
663 |
+
if self.downsamplers is not None:
|
664 |
+
for downsampler in self.downsamplers:
|
665 |
+
hidden_states = downsampler(hidden_states)
|
666 |
+
|
667 |
+
output_states += (hidden_states,)
|
668 |
+
|
669 |
+
return hidden_states, output_states
|
670 |
+
|
671 |
+
|
672 |
+
class DownBlock2D(nn.Module):
|
673 |
+
def __init__(
|
674 |
+
self,
|
675 |
+
in_channels: int,
|
676 |
+
out_channels: int,
|
677 |
+
temb_channels: int,
|
678 |
+
dropout: float = 0.0,
|
679 |
+
num_layers: int = 1,
|
680 |
+
resnet_eps: float = 1e-6,
|
681 |
+
resnet_time_scale_shift: str = "default",
|
682 |
+
resnet_act_fn: str = "swish",
|
683 |
+
resnet_groups: int = 32,
|
684 |
+
resnet_pre_norm: bool = True,
|
685 |
+
output_scale_factor=1.0,
|
686 |
+
add_downsample=True,
|
687 |
+
downsample_padding=1,
|
688 |
+
):
|
689 |
+
super().__init__()
|
690 |
+
resnets = []
|
691 |
+
|
692 |
+
for i in range(num_layers):
|
693 |
+
in_channels = in_channels if i == 0 else out_channels
|
694 |
+
resnets.append(
|
695 |
+
ResnetBlock2D(
|
696 |
+
in_channels=in_channels,
|
697 |
+
out_channels=out_channels,
|
698 |
+
temb_channels=temb_channels,
|
699 |
+
eps=resnet_eps,
|
700 |
+
groups=resnet_groups,
|
701 |
+
dropout=dropout,
|
702 |
+
time_embedding_norm=resnet_time_scale_shift,
|
703 |
+
non_linearity=resnet_act_fn,
|
704 |
+
output_scale_factor=output_scale_factor,
|
705 |
+
pre_norm=resnet_pre_norm,
|
706 |
+
)
|
707 |
+
)
|
708 |
+
|
709 |
+
self.resnets = nn.ModuleList(resnets)
|
710 |
+
|
711 |
+
if add_downsample:
|
712 |
+
self.downsamplers = nn.ModuleList(
|
713 |
+
[
|
714 |
+
Downsample2D(
|
715 |
+
out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
|
716 |
+
)
|
717 |
+
]
|
718 |
+
)
|
719 |
+
else:
|
720 |
+
self.downsamplers = None
|
721 |
+
|
722 |
+
self.gradient_checkpointing = False
|
723 |
+
|
724 |
+
def forward(self, hidden_states, temb=None):
|
725 |
+
output_states = ()
|
726 |
+
|
727 |
+
for resnet in self.resnets:
|
728 |
+
if self.training and self.gradient_checkpointing:
|
729 |
+
|
730 |
+
def create_custom_forward(module):
|
731 |
+
def custom_forward(*inputs):
|
732 |
+
return module(*inputs)
|
733 |
+
|
734 |
+
return custom_forward
|
735 |
+
|
736 |
+
hidden_states, _ = torch.utils.checkpoint.checkpoint(
|
737 |
+
create_custom_forward(resnet), hidden_states, temb)
|
738 |
+
else:
|
739 |
+
hidden_states, _ = resnet(hidden_states, temb)
|
740 |
+
|
741 |
+
output_states += (hidden_states,)
|
742 |
+
|
743 |
+
if self.downsamplers is not None:
|
744 |
+
for downsampler in self.downsamplers:
|
745 |
+
hidden_states = downsampler(hidden_states)
|
746 |
+
|
747 |
+
output_states += (hidden_states,)
|
748 |
+
|
749 |
+
return hidden_states, output_states
|
750 |
+
|
751 |
+
|
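# ---------------------------------------------------------------------------
# Illustrative sketch added for clarity; not part of the original file.
# It shows the `gradient_checkpointing` flag of the plain DownBlock2D above:
# when the module is in training mode, each resnet is re-run during backward
# instead of keeping its activations.  Sizes are hypothetical.
def _example_down_block_2d_checkpointing():
    block = DownBlock2D(in_channels=320, out_channels=320,
                        temb_channels=1280, num_layers=2)
    block.gradient_checkpointing = True   # only takes effect in train() mode
    block.train()
    sample = torch.randn(2, 320, 32, 32, requires_grad=True)
    temb = torch.randn(2, 1280)
    hidden_states, output_states = block(sample, temb)
    hidden_states.sum().backward()        # resnets are recomputed here
    return len(output_states)             # num_layers states + 1 downsampled
# ---------------------------------------------------------------------------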
752 |
+
class DownEncoderBlock2D(nn.Module):
|
753 |
+
def __init__(
|
754 |
+
self,
|
755 |
+
in_channels: int,
|
756 |
+
out_channels: int,
|
757 |
+
dropout: float = 0.0,
|
758 |
+
num_layers: int = 1,
|
759 |
+
resnet_eps: float = 1e-6,
|
760 |
+
resnet_time_scale_shift: str = "default",
|
761 |
+
resnet_act_fn: str = "swish",
|
762 |
+
resnet_groups: int = 32,
|
763 |
+
resnet_pre_norm: bool = True,
|
764 |
+
output_scale_factor=1.0,
|
765 |
+
add_downsample=True,
|
766 |
+
downsample_padding=1,
|
767 |
+
):
|
768 |
+
super().__init__()
|
769 |
+
resnets = []
|
770 |
+
|
771 |
+
for i in range(num_layers):
|
772 |
+
in_channels = in_channels if i == 0 else out_channels
|
773 |
+
resnets.append(
|
774 |
+
ResnetBlock2D(
|
775 |
+
in_channels=in_channels,
|
776 |
+
out_channels=out_channels,
|
777 |
+
temb_channels=None,
|
778 |
+
eps=resnet_eps,
|
779 |
+
groups=resnet_groups,
|
780 |
+
dropout=dropout,
|
781 |
+
time_embedding_norm=resnet_time_scale_shift,
|
782 |
+
non_linearity=resnet_act_fn,
|
783 |
+
output_scale_factor=output_scale_factor,
|
784 |
+
pre_norm=resnet_pre_norm,
|
785 |
+
)
|
786 |
+
)
|
787 |
+
|
788 |
+
self.resnets = nn.ModuleList(resnets)
|
789 |
+
|
790 |
+
if add_downsample:
|
791 |
+
self.downsamplers = nn.ModuleList(
|
792 |
+
[
|
793 |
+
Downsample2D(
|
794 |
+
out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
|
795 |
+
)
|
796 |
+
]
|
797 |
+
)
|
798 |
+
else:
|
799 |
+
self.downsamplers = None
|
800 |
+
|
801 |
+
def forward(self, hidden_states):
|
802 |
+
for resnet in self.resnets:
|
803 |
+
hidden_states, _ = resnet(hidden_states, temb=None)
|
804 |
+
|
805 |
+
if self.downsamplers is not None:
|
806 |
+
for downsampler in self.downsamplers:
|
807 |
+
hidden_states = downsampler(hidden_states)
|
808 |
+
|
809 |
+
return hidden_states
|
810 |
+
|
811 |
+
|
812 |
+
class AttnDownEncoderBlock2D(nn.Module):
|
813 |
+
def __init__(
|
814 |
+
self,
|
815 |
+
in_channels: int,
|
816 |
+
out_channels: int,
|
817 |
+
dropout: float = 0.0,
|
818 |
+
num_layers: int = 1,
|
819 |
+
resnet_eps: float = 1e-6,
|
820 |
+
resnet_time_scale_shift: str = "default",
|
821 |
+
resnet_act_fn: str = "swish",
|
822 |
+
resnet_groups: int = 32,
|
823 |
+
resnet_pre_norm: bool = True,
|
824 |
+
attn_num_head_channels=1,
|
825 |
+
output_scale_factor=1.0,
|
826 |
+
add_downsample=True,
|
827 |
+
downsample_padding=1,
|
828 |
+
):
|
829 |
+
super().__init__()
|
830 |
+
resnets = []
|
831 |
+
attentions = []
|
832 |
+
|
833 |
+
for i in range(num_layers):
|
834 |
+
in_channels = in_channels if i == 0 else out_channels
|
835 |
+
resnets.append(
|
836 |
+
ResnetBlock2D(
|
837 |
+
in_channels=in_channels,
|
838 |
+
out_channels=out_channels,
|
839 |
+
temb_channels=None,
|
840 |
+
eps=resnet_eps,
|
841 |
+
groups=resnet_groups,
|
842 |
+
dropout=dropout,
|
843 |
+
time_embedding_norm=resnet_time_scale_shift,
|
844 |
+
non_linearity=resnet_act_fn,
|
845 |
+
output_scale_factor=output_scale_factor,
|
846 |
+
pre_norm=resnet_pre_norm,
|
847 |
+
)
|
848 |
+
)
|
849 |
+
attentions.append(
|
850 |
+
AttentionBlock(
|
851 |
+
out_channels,
|
852 |
+
num_head_channels=attn_num_head_channels,
|
853 |
+
rescale_output_factor=output_scale_factor,
|
854 |
+
eps=resnet_eps,
|
855 |
+
norm_num_groups=resnet_groups,
|
856 |
+
)
|
857 |
+
)
|
858 |
+
|
859 |
+
self.attentions = nn.ModuleList(attentions)
|
860 |
+
self.resnets = nn.ModuleList(resnets)
|
861 |
+
|
862 |
+
if add_downsample:
|
863 |
+
self.downsamplers = nn.ModuleList(
|
864 |
+
[
|
865 |
+
Downsample2D(
|
866 |
+
out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
|
867 |
+
)
|
868 |
+
]
|
869 |
+
)
|
870 |
+
else:
|
871 |
+
self.downsamplers = None
|
872 |
+
|
873 |
+
def forward(self, hidden_states):
|
874 |
+
for resnet, attn in zip(self.resnets, self.attentions):
|
875 |
+
hidden_states, _ = resnet(hidden_states, temb=None)
|
876 |
+
hidden_states = attn(hidden_states)
|
877 |
+
|
878 |
+
if self.downsamplers is not None:
|
879 |
+
for downsampler in self.downsamplers:
|
880 |
+
hidden_states = downsampler(hidden_states)
|
881 |
+
|
882 |
+
return hidden_states
|
883 |
+
|
884 |
+
|
885 |
+
class AttnSkipDownBlock2D(nn.Module):
|
886 |
+
def __init__(
|
887 |
+
self,
|
888 |
+
in_channels: int,
|
889 |
+
out_channels: int,
|
890 |
+
temb_channels: int,
|
891 |
+
dropout: float = 0.0,
|
892 |
+
num_layers: int = 1,
|
893 |
+
resnet_eps: float = 1e-6,
|
894 |
+
resnet_time_scale_shift: str = "default",
|
895 |
+
resnet_act_fn: str = "swish",
|
896 |
+
resnet_pre_norm: bool = True,
|
897 |
+
attn_num_head_channels=1,
|
898 |
+
attention_type="default",
|
899 |
+
output_scale_factor=np.sqrt(2.0),
|
900 |
+
downsample_padding=1,
|
901 |
+
add_downsample=True,
|
902 |
+
):
|
903 |
+
super().__init__()
|
904 |
+
self.attentions = nn.ModuleList([])
|
905 |
+
self.resnets = nn.ModuleList([])
|
906 |
+
|
907 |
+
self.attention_type = attention_type
|
908 |
+
|
909 |
+
for i in range(num_layers):
|
910 |
+
in_channels = in_channels if i == 0 else out_channels
|
911 |
+
self.resnets.append(
|
912 |
+
ResnetBlock2D(
|
913 |
+
in_channels=in_channels,
|
914 |
+
out_channels=out_channels,
|
915 |
+
temb_channels=temb_channels,
|
916 |
+
eps=resnet_eps,
|
917 |
+
groups=min(in_channels // 4, 32),
|
918 |
+
groups_out=min(out_channels // 4, 32),
|
919 |
+
dropout=dropout,
|
920 |
+
time_embedding_norm=resnet_time_scale_shift,
|
921 |
+
non_linearity=resnet_act_fn,
|
922 |
+
output_scale_factor=output_scale_factor,
|
923 |
+
pre_norm=resnet_pre_norm,
|
924 |
+
)
|
925 |
+
)
|
926 |
+
self.attentions.append(
|
927 |
+
AttentionBlock(
|
928 |
+
out_channels,
|
929 |
+
num_head_channels=attn_num_head_channels,
|
930 |
+
rescale_output_factor=output_scale_factor,
|
931 |
+
eps=resnet_eps,
|
932 |
+
)
|
933 |
+
)
|
934 |
+
|
935 |
+
if add_downsample:
|
936 |
+
self.resnet_down = ResnetBlock2D(
|
937 |
+
in_channels=out_channels,
|
938 |
+
out_channels=out_channels,
|
939 |
+
temb_channels=temb_channels,
|
940 |
+
eps=resnet_eps,
|
941 |
+
groups=min(out_channels // 4, 32),
|
942 |
+
dropout=dropout,
|
943 |
+
time_embedding_norm=resnet_time_scale_shift,
|
944 |
+
non_linearity=resnet_act_fn,
|
945 |
+
output_scale_factor=output_scale_factor,
|
946 |
+
pre_norm=resnet_pre_norm,
|
947 |
+
use_in_shortcut=True,
|
948 |
+
down=True,
|
949 |
+
kernel="fir",
|
950 |
+
)
|
951 |
+
self.downsamplers = nn.ModuleList(
|
952 |
+
[FirDownsample2D(out_channels, out_channels=out_channels)])
|
953 |
+
self.skip_conv = nn.Conv2d(
|
954 |
+
3, out_channels, kernel_size=(1, 1), stride=(1, 1))
|
955 |
+
else:
|
956 |
+
self.resnet_down = None
|
957 |
+
self.downsamplers = None
|
958 |
+
self.skip_conv = None
|
959 |
+
|
960 |
+
def forward(self, hidden_states, temb=None, skip_sample=None):
|
961 |
+
output_states = ()
|
962 |
+
|
963 |
+
for resnet, attn in zip(self.resnets, self.attentions):
|
964 |
+
hidden_states, _ = resnet(hidden_states, temb)
|
965 |
+
hidden_states = attn(hidden_states)
|
966 |
+
output_states += (hidden_states,)
|
967 |
+
|
968 |
+
if self.downsamplers is not None:
|
969 |
+
hidden_states, _ = self.resnet_down(hidden_states, temb)
|
970 |
+
for downsampler in self.downsamplers:
|
971 |
+
skip_sample = downsampler(skip_sample)
|
972 |
+
|
973 |
+
hidden_states = self.skip_conv(skip_sample) + hidden_states
|
974 |
+
|
975 |
+
output_states += (hidden_states,)
|
976 |
+
|
977 |
+
return hidden_states, output_states, skip_sample
|
978 |
+
|
979 |
+
|
980 |
+
class SkipDownBlock2D(nn.Module):
|
981 |
+
def __init__(
|
982 |
+
self,
|
983 |
+
in_channels: int,
|
984 |
+
out_channels: int,
|
985 |
+
temb_channels: int,
|
986 |
+
dropout: float = 0.0,
|
987 |
+
num_layers: int = 1,
|
988 |
+
resnet_eps: float = 1e-6,
|
989 |
+
resnet_time_scale_shift: str = "default",
|
990 |
+
resnet_act_fn: str = "swish",
|
991 |
+
resnet_pre_norm: bool = True,
|
992 |
+
output_scale_factor=np.sqrt(2.0),
|
993 |
+
add_downsample=True,
|
994 |
+
downsample_padding=1,
|
995 |
+
):
|
996 |
+
super().__init__()
|
997 |
+
self.resnets = nn.ModuleList([])
|
998 |
+
|
999 |
+
for i in range(num_layers):
|
1000 |
+
in_channels = in_channels if i == 0 else out_channels
|
1001 |
+
self.resnets.append(
|
1002 |
+
ResnetBlock2D(
|
1003 |
+
in_channels=in_channels,
|
1004 |
+
out_channels=out_channels,
|
1005 |
+
temb_channels=temb_channels,
|
1006 |
+
eps=resnet_eps,
|
1007 |
+
groups=min(in_channels // 4, 32),
|
1008 |
+
groups_out=min(out_channels // 4, 32),
|
1009 |
+
dropout=dropout,
|
1010 |
+
time_embedding_norm=resnet_time_scale_shift,
|
1011 |
+
non_linearity=resnet_act_fn,
|
1012 |
+
output_scale_factor=output_scale_factor,
|
1013 |
+
pre_norm=resnet_pre_norm,
|
1014 |
+
)
|
1015 |
+
)
|
1016 |
+
|
1017 |
+
if add_downsample:
|
1018 |
+
self.resnet_down = ResnetBlock2D(
|
1019 |
+
in_channels=out_channels,
|
1020 |
+
out_channels=out_channels,
|
1021 |
+
temb_channels=temb_channels,
|
1022 |
+
eps=resnet_eps,
|
1023 |
+
groups=min(out_channels // 4, 32),
|
1024 |
+
dropout=dropout,
|
1025 |
+
time_embedding_norm=resnet_time_scale_shift,
|
1026 |
+
non_linearity=resnet_act_fn,
|
1027 |
+
output_scale_factor=output_scale_factor,
|
1028 |
+
pre_norm=resnet_pre_norm,
|
1029 |
+
use_in_shortcut=True,
|
1030 |
+
down=True,
|
1031 |
+
kernel="fir",
|
1032 |
+
)
|
1033 |
+
self.downsamplers = nn.ModuleList(
|
1034 |
+
[FirDownsample2D(out_channels, out_channels=out_channels)])
|
1035 |
+
self.skip_conv = nn.Conv2d(
|
1036 |
+
3, out_channels, kernel_size=(1, 1), stride=(1, 1))
|
1037 |
+
else:
|
1038 |
+
self.resnet_down = None
|
1039 |
+
self.downsamplers = None
|
1040 |
+
self.skip_conv = None
|
1041 |
+
|
1042 |
+
def forward(self, hidden_states, temb=None, skip_sample=None):
|
1043 |
+
output_states = ()
|
1044 |
+
|
1045 |
+
for resnet in self.resnets:
|
1046 |
+
hidden_states, _ = resnet(hidden_states, temb)
|
1047 |
+
output_states += (hidden_states,)
|
1048 |
+
|
1049 |
+
if self.downsamplers is not None:
|
1050 |
+
hidden_states, _ = self.resnet_down(hidden_states, temb)
|
1051 |
+
for downsampler in self.downsamplers:
|
1052 |
+
skip_sample = downsampler(skip_sample)
|
1053 |
+
|
1054 |
+
hidden_states = self.skip_conv(skip_sample) + hidden_states
|
1055 |
+
|
1056 |
+
output_states += (hidden_states,)
|
1057 |
+
|
1058 |
+
return hidden_states, output_states, skip_sample
|
1059 |
+
|
1060 |
+
|
1061 |
+
class AttnUpBlock2D(nn.Module):
|
1062 |
+
def __init__(
|
1063 |
+
self,
|
1064 |
+
in_channels: int,
|
1065 |
+
prev_output_channel: int,
|
1066 |
+
out_channels: int,
|
1067 |
+
temb_channels: int,
|
1068 |
+
dropout: float = 0.0,
|
1069 |
+
num_layers: int = 1,
|
1070 |
+
resnet_eps: float = 1e-6,
|
1071 |
+
resnet_time_scale_shift: str = "default",
|
1072 |
+
resnet_act_fn: str = "swish",
|
1073 |
+
resnet_groups: int = 32,
|
1074 |
+
resnet_pre_norm: bool = True,
|
1075 |
+
attention_type="default",
|
1076 |
+
attn_num_head_channels=1,
|
1077 |
+
output_scale_factor=1.0,
|
1078 |
+
add_upsample=True,
|
1079 |
+
):
|
1080 |
+
super().__init__()
|
1081 |
+
resnets = []
|
1082 |
+
attentions = []
|
1083 |
+
|
1084 |
+
self.attention_type = attention_type
|
1085 |
+
|
1086 |
+
for i in range(num_layers):
|
1087 |
+
res_skip_channels = in_channels if (
|
1088 |
+
i == num_layers - 1) else out_channels
|
1089 |
+
resnet_in_channels = prev_output_channel if i == 0 else out_channels
|
1090 |
+
|
1091 |
+
resnets.append(
|
1092 |
+
ResnetBlock2D(
|
1093 |
+
in_channels=resnet_in_channels + res_skip_channels,
|
1094 |
+
out_channels=out_channels,
|
1095 |
+
temb_channels=temb_channels,
|
1096 |
+
eps=resnet_eps,
|
1097 |
+
groups=resnet_groups,
|
1098 |
+
dropout=dropout,
|
1099 |
+
time_embedding_norm=resnet_time_scale_shift,
|
1100 |
+
non_linearity=resnet_act_fn,
|
1101 |
+
output_scale_factor=output_scale_factor,
|
1102 |
+
pre_norm=resnet_pre_norm,
|
1103 |
+
)
|
1104 |
+
)
|
1105 |
+
attentions.append(
|
1106 |
+
AttentionBlock(
|
1107 |
+
out_channels,
|
1108 |
+
num_head_channels=attn_num_head_channels,
|
1109 |
+
rescale_output_factor=output_scale_factor,
|
1110 |
+
eps=resnet_eps,
|
1111 |
+
norm_num_groups=resnet_groups,
|
1112 |
+
)
|
1113 |
+
)
|
1114 |
+
|
1115 |
+
self.attentions = nn.ModuleList(attentions)
|
1116 |
+
self.resnets = nn.ModuleList(resnets)
|
1117 |
+
|
1118 |
+
if add_upsample:
|
1119 |
+
self.upsamplers = nn.ModuleList(
|
1120 |
+
[Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
|
1121 |
+
else:
|
1122 |
+
self.upsamplers = None
|
1123 |
+
|
1124 |
+
def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
|
1125 |
+
for resnet, attn in zip(self.resnets, self.attentions):
|
1126 |
+
# pop res hidden states
|
1127 |
+
res_hidden_states = res_hidden_states_tuple[-1]
|
1128 |
+
res_hidden_states_tuple = res_hidden_states_tuple[:-1]
|
1129 |
+
hidden_states = torch.cat(
|
1130 |
+
[hidden_states, res_hidden_states], dim=1)
|
1131 |
+
|
1132 |
+
hidden_states, _ = resnet(hidden_states, temb)
|
1133 |
+
hidden_states = attn(hidden_states)
|
1134 |
+
|
1135 |
+
if self.upsamplers is not None:
|
1136 |
+
for upsampler in self.upsamplers:
|
1137 |
+
hidden_states = upsampler(hidden_states)
|
1138 |
+
|
1139 |
+
return hidden_states
|
1140 |
+
|
1141 |
+
|
1142 |
+
class CrossAttnUpBlock2D(nn.Module):
|
1143 |
+
def __init__(
|
1144 |
+
self,
|
1145 |
+
in_channels: int,
|
1146 |
+
out_channels: int,
|
1147 |
+
prev_output_channel: int,
|
1148 |
+
temb_channels: int,
|
1149 |
+
dropout: float = 0.0,
|
1150 |
+
num_layers: int = 1,
|
1151 |
+
resnet_eps: float = 1e-6,
|
1152 |
+
resnet_time_scale_shift: str = "default",
|
1153 |
+
resnet_act_fn: str = "swish",
|
1154 |
+
resnet_groups: int = 32,
|
1155 |
+
resnet_pre_norm: bool = True,
|
1156 |
+
attn_num_head_channels=1,
|
1157 |
+
cross_attention_dim=1280,
|
1158 |
+
attention_type="default",
|
1159 |
+
output_scale_factor=1.0,
|
1160 |
+
add_upsample=True,
|
1161 |
+
dual_cross_attention=False,
|
1162 |
+
use_linear_projection=False,
|
1163 |
+
only_cross_attention=False,
|
1164 |
+
):
|
1165 |
+
super().__init__()
|
1166 |
+
resnets = []
|
1167 |
+
attentions = []
|
1168 |
+
|
1169 |
+
self.attention_type = attention_type
|
1170 |
+
self.attn_num_head_channels = attn_num_head_channels
|
1171 |
+
|
1172 |
+
for i in range(num_layers):
|
1173 |
+
res_skip_channels = in_channels if (
|
1174 |
+
i == num_layers - 1) else out_channels
|
1175 |
+
resnet_in_channels = prev_output_channel if i == 0 else out_channels
|
1176 |
+
|
1177 |
+
resnets.append(
|
1178 |
+
ResnetBlock2D(
|
1179 |
+
in_channels=resnet_in_channels + res_skip_channels,
|
1180 |
+
out_channels=out_channels,
|
1181 |
+
temb_channels=temb_channels,
|
1182 |
+
eps=resnet_eps,
|
1183 |
+
groups=resnet_groups,
|
1184 |
+
dropout=dropout,
|
1185 |
+
time_embedding_norm=resnet_time_scale_shift,
|
1186 |
+
non_linearity=resnet_act_fn,
|
1187 |
+
output_scale_factor=output_scale_factor,
|
1188 |
+
pre_norm=resnet_pre_norm,
|
1189 |
+
)
|
1190 |
+
)
|
1191 |
+
if not dual_cross_attention:
|
1192 |
+
attentions.append(
|
1193 |
+
Transformer2DModel(
|
1194 |
+
attn_num_head_channels,
|
1195 |
+
out_channels // attn_num_head_channels,
|
1196 |
+
in_channels=out_channels,
|
1197 |
+
num_layers=1,
|
1198 |
+
cross_attention_dim=cross_attention_dim,
|
1199 |
+
norm_num_groups=resnet_groups,
|
1200 |
+
use_linear_projection=use_linear_projection,
|
1201 |
+
only_cross_attention=only_cross_attention,
|
1202 |
+
)
|
1203 |
+
)
|
1204 |
+
else:
|
1205 |
+
attentions.append(
|
1206 |
+
DualTransformer2DModel(
|
1207 |
+
attn_num_head_channels,
|
1208 |
+
out_channels // attn_num_head_channels,
|
1209 |
+
in_channels=out_channels,
|
1210 |
+
num_layers=1,
|
1211 |
+
cross_attention_dim=cross_attention_dim,
|
1212 |
+
norm_num_groups=resnet_groups,
|
1213 |
+
)
|
1214 |
+
)
|
1215 |
+
self.attentions = nn.ModuleList(attentions)
|
1216 |
+
self.resnets = nn.ModuleList(resnets)
|
1217 |
+
|
1218 |
+
if add_upsample:
|
1219 |
+
self.upsamplers = nn.ModuleList(
|
1220 |
+
[Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
|
1221 |
+
else:
|
1222 |
+
self.upsamplers = None
|
1223 |
+
|
1224 |
+
self.gradient_checkpointing = False
|
1225 |
+
|
1226 |
+
def set_attention_slice(self, slice_size):
|
1227 |
+
head_dims = self.attn_num_head_channels
|
1228 |
+
head_dims = [head_dims] if isinstance(head_dims, int) else head_dims
|
1229 |
+
if slice_size is not None and any(dim % slice_size != 0 for dim in head_dims):
|
1230 |
+
raise ValueError(
|
1231 |
+
f"Make sure slice_size {slice_size} is a common divisor of "
|
1232 |
+
f"the number of heads used in cross_attention: {head_dims}"
|
1233 |
+
)
|
1234 |
+
if slice_size is not None and slice_size > min(head_dims):
|
1235 |
+
raise ValueError(
|
1236 |
+
f"slice_size {slice_size} has to be smaller or equal to "
|
1237 |
+
f"the lowest number of heads used in cross_attention: min({head_dims}) = {min(head_dims)}"
|
1238 |
+
)
|
1239 |
+
|
1240 |
+
for attn in self.attentions:
|
1241 |
+
attn._set_attention_slice(slice_size)
|
1242 |
+
|
1243 |
+
self.gradient_checkpointing = False
|
1244 |
+
|
1245 |
+
def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
|
1246 |
+
for attn in self.attentions:
|
1247 |
+
attn._set_use_memory_efficient_attention_xformers(
|
1248 |
+
use_memory_efficient_attention_xformers)
|
1249 |
+
|
1250 |
+
def forward(
|
1251 |
+
self,
|
1252 |
+
hidden_states,
|
1253 |
+
res_hidden_states_tuple,
|
1254 |
+
temb=None,
|
1255 |
+
encoder_hidden_states=None,
|
1256 |
+
upsample_size=None,
|
1257 |
+
text_format_dict={}
|
1258 |
+
):
|
1259 |
+
for resnet, attn in zip(self.resnets, self.attentions):
|
1260 |
+
# pop res hidden states
|
1261 |
+
res_hidden_states = res_hidden_states_tuple[-1]
|
1262 |
+
res_hidden_states_tuple = res_hidden_states_tuple[:-1]
|
1263 |
+
hidden_states = torch.cat(
|
1264 |
+
[hidden_states, res_hidden_states], dim=1)
|
1265 |
+
|
1266 |
+
if self.training and self.gradient_checkpointing:
|
1267 |
+
|
1268 |
+
def create_custom_forward(module, return_dict=None):
|
1269 |
+
def custom_forward(*inputs):
|
1270 |
+
if return_dict is not None:
|
1271 |
+
return module(*inputs, return_dict=return_dict)
|
1272 |
+
else:
|
1273 |
+
return module(*inputs)
|
1274 |
+
|
1275 |
+
return custom_forward
|
1276 |
+
|
1277 |
+
hidden_states, _ = torch.utils.checkpoint.checkpoint(
|
1278 |
+
create_custom_forward(resnet), hidden_states, temb)
|
1279 |
+
hidden_states = torch.utils.checkpoint.checkpoint(
|
1280 |
+
create_custom_forward(
|
1281 |
+
attn, return_dict=False), hidden_states, encoder_hidden_states,
|
1282 |
+
text_format_dict
|
1283 |
+
)[0]
|
1284 |
+
else:
|
1285 |
+
hidden_states, _ = resnet(hidden_states, temb)
|
1286 |
+
hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states,
|
1287 |
+
text_format_dict=text_format_dict).sample
|
1288 |
+
|
1289 |
+
if self.upsamplers is not None:
|
1290 |
+
for upsampler in self.upsamplers:
|
1291 |
+
hidden_states = upsampler(hidden_states, upsample_size)
|
1292 |
+
|
1293 |
+
return hidden_states
|
1294 |
+
|
1295 |
+
|
1296 |
+
class UpBlock2D(nn.Module):
|
1297 |
+
def __init__(
|
1298 |
+
self,
|
1299 |
+
in_channels: int,
|
1300 |
+
prev_output_channel: int,
|
1301 |
+
out_channels: int,
|
1302 |
+
temb_channels: int,
|
1303 |
+
dropout: float = 0.0,
|
1304 |
+
num_layers: int = 1,
|
1305 |
+
resnet_eps: float = 1e-6,
|
1306 |
+
resnet_time_scale_shift: str = "default",
|
1307 |
+
resnet_act_fn: str = "swish",
|
1308 |
+
resnet_groups: int = 32,
|
1309 |
+
resnet_pre_norm: bool = True,
|
1310 |
+
output_scale_factor=1.0,
|
1311 |
+
add_upsample=True,
|
1312 |
+
):
|
1313 |
+
super().__init__()
|
1314 |
+
resnets = []
|
1315 |
+
|
1316 |
+
for i in range(num_layers):
|
1317 |
+
res_skip_channels = in_channels if (
|
1318 |
+
i == num_layers - 1) else out_channels
|
1319 |
+
resnet_in_channels = prev_output_channel if i == 0 else out_channels
|
1320 |
+
|
1321 |
+
resnets.append(
|
1322 |
+
ResnetBlock2D(
|
1323 |
+
in_channels=resnet_in_channels + res_skip_channels,
|
1324 |
+
out_channels=out_channels,
|
1325 |
+
temb_channels=temb_channels,
|
1326 |
+
eps=resnet_eps,
|
1327 |
+
groups=resnet_groups,
|
1328 |
+
dropout=dropout,
|
1329 |
+
time_embedding_norm=resnet_time_scale_shift,
|
1330 |
+
non_linearity=resnet_act_fn,
|
1331 |
+
output_scale_factor=output_scale_factor,
|
1332 |
+
pre_norm=resnet_pre_norm,
|
1333 |
+
)
|
1334 |
+
)
|
1335 |
+
|
1336 |
+
self.resnets = nn.ModuleList(resnets)
|
1337 |
+
|
1338 |
+
if add_upsample:
|
1339 |
+
self.upsamplers = nn.ModuleList(
|
1340 |
+
[Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
|
1341 |
+
else:
|
1342 |
+
self.upsamplers = None
|
1343 |
+
|
1344 |
+
self.gradient_checkpointing = False
|
1345 |
+
|
1346 |
+
def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
|
1347 |
+
for resnet in self.resnets:
|
1348 |
+
# pop res hidden states
|
1349 |
+
res_hidden_states = res_hidden_states_tuple[-1]
|
1350 |
+
res_hidden_states_tuple = res_hidden_states_tuple[:-1]
|
1351 |
+
hidden_states = torch.cat(
|
1352 |
+
[hidden_states, res_hidden_states], dim=1)
|
1353 |
+
|
1354 |
+
if self.training and self.gradient_checkpointing:
|
1355 |
+
|
1356 |
+
def create_custom_forward(module):
|
1357 |
+
def custom_forward(*inputs):
|
1358 |
+
return module(*inputs)
|
1359 |
+
|
1360 |
+
return custom_forward
|
1361 |
+
|
1362 |
+
hidden_states, _ = torch.utils.checkpoint.checkpoint(
|
1363 |
+
create_custom_forward(resnet), hidden_states, temb)
|
1364 |
+
else:
|
1365 |
+
hidden_states, _ = resnet(hidden_states, temb)
|
1366 |
+
|
1367 |
+
if self.upsamplers is not None:
|
1368 |
+
for upsampler in self.upsamplers:
|
1369 |
+
hidden_states = upsampler(hidden_states, upsample_size)
|
1370 |
+
|
1371 |
+
return hidden_states
|
1372 |
+
|
1373 |
+
|
1374 |
+
class UpDecoderBlock2D(nn.Module):
|
1375 |
+
def __init__(
|
1376 |
+
self,
|
1377 |
+
in_channels: int,
|
1378 |
+
out_channels: int,
|
1379 |
+
dropout: float = 0.0,
|
1380 |
+
num_layers: int = 1,
|
1381 |
+
resnet_eps: float = 1e-6,
|
1382 |
+
resnet_time_scale_shift: str = "default",
|
1383 |
+
resnet_act_fn: str = "swish",
|
1384 |
+
resnet_groups: int = 32,
|
1385 |
+
resnet_pre_norm: bool = True,
|
1386 |
+
output_scale_factor=1.0,
|
1387 |
+
add_upsample=True,
|
1388 |
+
):
|
1389 |
+
super().__init__()
|
1390 |
+
resnets = []
|
1391 |
+
|
1392 |
+
for i in range(num_layers):
|
1393 |
+
input_channels = in_channels if i == 0 else out_channels
|
1394 |
+
|
1395 |
+
resnets.append(
|
1396 |
+
ResnetBlock2D(
|
1397 |
+
in_channels=input_channels,
|
1398 |
+
out_channels=out_channels,
|
1399 |
+
temb_channels=None,
|
1400 |
+
eps=resnet_eps,
|
1401 |
+
groups=resnet_groups,
|
1402 |
+
dropout=dropout,
|
1403 |
+
time_embedding_norm=resnet_time_scale_shift,
|
1404 |
+
non_linearity=resnet_act_fn,
|
1405 |
+
output_scale_factor=output_scale_factor,
|
1406 |
+
pre_norm=resnet_pre_norm,
|
1407 |
+
)
|
1408 |
+
)
|
1409 |
+
|
1410 |
+
self.resnets = nn.ModuleList(resnets)
|
1411 |
+
|
1412 |
+
if add_upsample:
|
1413 |
+
self.upsamplers = nn.ModuleList(
|
1414 |
+
[Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
|
1415 |
+
else:
|
1416 |
+
self.upsamplers = None
|
1417 |
+
|
1418 |
+
def forward(self, hidden_states):
|
1419 |
+
for resnet in self.resnets:
|
1420 |
+
hidden_states, _ = resnet(hidden_states, temb=None)
|
1421 |
+
|
1422 |
+
if self.upsamplers is not None:
|
1423 |
+
for upsampler in self.upsamplers:
|
1424 |
+
hidden_states = upsampler(hidden_states)
|
1425 |
+
|
1426 |
+
return hidden_states
|
1427 |
+
|
1428 |
+
|
1429 |
+
class AttnUpDecoderBlock2D(nn.Module):
|
1430 |
+
def __init__(
|
1431 |
+
self,
|
1432 |
+
in_channels: int,
|
1433 |
+
out_channels: int,
|
1434 |
+
dropout: float = 0.0,
|
1435 |
+
num_layers: int = 1,
|
1436 |
+
resnet_eps: float = 1e-6,
|
1437 |
+
resnet_time_scale_shift: str = "default",
|
1438 |
+
resnet_act_fn: str = "swish",
|
1439 |
+
resnet_groups: int = 32,
|
1440 |
+
resnet_pre_norm: bool = True,
|
1441 |
+
attn_num_head_channels=1,
|
1442 |
+
output_scale_factor=1.0,
|
1443 |
+
add_upsample=True,
|
1444 |
+
):
|
1445 |
+
super().__init__()
|
1446 |
+
resnets = []
|
1447 |
+
attentions = []
|
1448 |
+
|
1449 |
+
for i in range(num_layers):
|
1450 |
+
input_channels = in_channels if i == 0 else out_channels
|
1451 |
+
|
1452 |
+
resnets.append(
|
1453 |
+
ResnetBlock2D(
|
1454 |
+
in_channels=input_channels,
|
1455 |
+
out_channels=out_channels,
|
1456 |
+
temb_channels=None,
|
1457 |
+
eps=resnet_eps,
|
1458 |
+
groups=resnet_groups,
|
1459 |
+
dropout=dropout,
|
1460 |
+
time_embedding_norm=resnet_time_scale_shift,
|
1461 |
+
non_linearity=resnet_act_fn,
|
1462 |
+
output_scale_factor=output_scale_factor,
|
1463 |
+
pre_norm=resnet_pre_norm,
|
1464 |
+
)
|
1465 |
+
)
|
1466 |
+
attentions.append(
|
1467 |
+
AttentionBlock(
|
1468 |
+
out_channels,
|
1469 |
+
num_head_channels=attn_num_head_channels,
|
1470 |
+
rescale_output_factor=output_scale_factor,
|
1471 |
+
eps=resnet_eps,
|
1472 |
+
norm_num_groups=resnet_groups,
|
1473 |
+
)
|
1474 |
+
)
|
1475 |
+
|
1476 |
+
self.attentions = nn.ModuleList(attentions)
|
1477 |
+
self.resnets = nn.ModuleList(resnets)
|
1478 |
+
|
1479 |
+
if add_upsample:
|
1480 |
+
self.upsamplers = nn.ModuleList(
|
1481 |
+
[Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
|
1482 |
+
else:
|
1483 |
+
self.upsamplers = None
|
1484 |
+
|
1485 |
+
def forward(self, hidden_states):
|
1486 |
+
for resnet, attn in zip(self.resnets, self.attentions):
|
1487 |
+
hidden_states, _ = resnet(hidden_states, temb=None)
|
1488 |
+
hidden_states = attn(hidden_states)
|
1489 |
+
|
1490 |
+
if self.upsamplers is not None:
|
1491 |
+
for upsampler in self.upsamplers:
|
1492 |
+
hidden_states = upsampler(hidden_states)
|
1493 |
+
|
1494 |
+
return hidden_states
|
1495 |
+
|
1496 |
+
|
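# ---------------------------------------------------------------------------
# Illustrative usage sketch added for clarity; not part of the original file.
# The *Decoder* blocks above take no time embedding (their resnets are built
# with temb_channels=None) and are the variants the `get_up_block` factory at
# the top of this file builds, typically for VAE-style decoders.  Sizes are
# hypothetical.
def _example_up_decoder_block_2d():
    block = UpDecoderBlock2D(in_channels=256, out_channels=128, num_layers=2)
    sample = torch.randn(1, 256, 32, 32)
    out = block(sample)
    assert out.shape == (1, 128, 64, 64)   # upsampled 2x by default
    return out
# ---------------------------------------------------------------------------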
1497 |
+
class AttnSkipUpBlock2D(nn.Module):
|
1498 |
+
def __init__(
|
1499 |
+
self,
|
1500 |
+
in_channels: int,
|
1501 |
+
prev_output_channel: int,
|
1502 |
+
out_channels: int,
|
1503 |
+
temb_channels: int,
|
1504 |
+
dropout: float = 0.0,
|
1505 |
+
num_layers: int = 1,
|
1506 |
+
resnet_eps: float = 1e-6,
|
1507 |
+
resnet_time_scale_shift: str = "default",
|
1508 |
+
resnet_act_fn: str = "swish",
|
1509 |
+
resnet_pre_norm: bool = True,
|
1510 |
+
attn_num_head_channels=1,
|
1511 |
+
attention_type="default",
|
1512 |
+
output_scale_factor=np.sqrt(2.0),
|
1513 |
+
upsample_padding=1,
|
1514 |
+
add_upsample=True,
|
1515 |
+
):
|
1516 |
+
super().__init__()
|
1517 |
+
self.attentions = nn.ModuleList([])
|
1518 |
+
self.resnets = nn.ModuleList([])
|
1519 |
+
|
1520 |
+
self.attention_type = attention_type
|
1521 |
+
|
1522 |
+
for i in range(num_layers):
|
1523 |
+
res_skip_channels = in_channels if (
|
1524 |
+
i == num_layers - 1) else out_channels
|
1525 |
+
resnet_in_channels = prev_output_channel if i == 0 else out_channels
|
1526 |
+
|
1527 |
+
self.resnets.append(
|
1528 |
+
ResnetBlock2D(
|
1529 |
+
in_channels=resnet_in_channels + res_skip_channels,
|
1530 |
+
out_channels=out_channels,
|
1531 |
+
temb_channels=temb_channels,
|
1532 |
+
eps=resnet_eps,
|
1533 |
+
groups=min((resnet_in_channels +
|
1534 |
+
res_skip_channels) // 4, 32),
|
1535 |
+
groups_out=min(out_channels // 4, 32),
|
1536 |
+
dropout=dropout,
|
1537 |
+
time_embedding_norm=resnet_time_scale_shift,
|
1538 |
+
non_linearity=resnet_act_fn,
|
1539 |
+
output_scale_factor=output_scale_factor,
|
1540 |
+
pre_norm=resnet_pre_norm,
|
1541 |
+
)
|
1542 |
+
)
|
1543 |
+
|
1544 |
+
self.attentions.append(
|
1545 |
+
AttentionBlock(
|
1546 |
+
out_channels,
|
1547 |
+
num_head_channels=attn_num_head_channels,
|
1548 |
+
rescale_output_factor=output_scale_factor,
|
1549 |
+
eps=resnet_eps,
|
1550 |
+
)
|
1551 |
+
)
|
1552 |
+
|
1553 |
+
self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels)
|
1554 |
+
if add_upsample:
|
1555 |
+
self.resnet_up = ResnetBlock2D(
|
1556 |
+
in_channels=out_channels,
|
1557 |
+
out_channels=out_channels,
|
1558 |
+
temb_channels=temb_channels,
|
1559 |
+
eps=resnet_eps,
|
1560 |
+
groups=min(out_channels // 4, 32),
|
1561 |
+
groups_out=min(out_channels // 4, 32),
|
1562 |
+
dropout=dropout,
|
1563 |
+
time_embedding_norm=resnet_time_scale_shift,
|
1564 |
+
non_linearity=resnet_act_fn,
|
1565 |
+
output_scale_factor=output_scale_factor,
|
1566 |
+
pre_norm=resnet_pre_norm,
|
1567 |
+
use_in_shortcut=True,
|
1568 |
+
up=True,
|
1569 |
+
kernel="fir",
|
1570 |
+
)
|
1571 |
+
self.skip_conv = nn.Conv2d(out_channels, 3, kernel_size=(
|
1572 |
+
3, 3), stride=(1, 1), padding=(1, 1))
|
1573 |
+
self.skip_norm = torch.nn.GroupNorm(
|
1574 |
+
num_groups=min(out_channels // 4, 32), num_channels=out_channels, eps=resnet_eps, affine=True
|
1575 |
+
)
|
1576 |
+
self.act = nn.SiLU()
|
1577 |
+
else:
|
1578 |
+
self.resnet_up = None
|
1579 |
+
self.skip_conv = None
|
1580 |
+
self.skip_norm = None
|
1581 |
+
self.act = None
|
1582 |
+
|
1583 |
+
def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample=None):
|
1584 |
+
for resnet in self.resnets:
|
1585 |
+
# pop res hidden states
|
1586 |
+
res_hidden_states = res_hidden_states_tuple[-1]
|
1587 |
+
res_hidden_states_tuple = res_hidden_states_tuple[:-1]
|
1588 |
+
hidden_states = torch.cat(
|
1589 |
+
[hidden_states, res_hidden_states], dim=1)
|
1590 |
+
|
1591 |
+
hidden_states, _ = resnet(hidden_states, temb)
|
1592 |
+
|
1593 |
+
hidden_states = self.attentions[0](hidden_states)
|
1594 |
+
|
1595 |
+
if skip_sample is not None:
|
1596 |
+
skip_sample = self.upsampler(skip_sample)
|
1597 |
+
else:
|
1598 |
+
skip_sample = 0
|
1599 |
+
|
1600 |
+
if self.resnet_up is not None:
|
1601 |
+
skip_sample_states = self.skip_norm(hidden_states)
|
1602 |
+
skip_sample_states = self.act(skip_sample_states)
|
1603 |
+
skip_sample_states = self.skip_conv(skip_sample_states)
|
1604 |
+
|
1605 |
+
skip_sample = skip_sample + skip_sample_states
|
1606 |
+
|
1607 |
+
hidden_states, _ = self.resnet_up(hidden_states, temb)
|
1608 |
+
|
1609 |
+
return hidden_states, skip_sample
|
1610 |
+
|
1611 |
+
|
1612 |
+
class SkipUpBlock2D(nn.Module):
|
1613 |
+
def __init__(
|
1614 |
+
self,
|
1615 |
+
in_channels: int,
|
1616 |
+
prev_output_channel: int,
|
1617 |
+
out_channels: int,
|
1618 |
+
temb_channels: int,
|
1619 |
+
dropout: float = 0.0,
|
1620 |
+
num_layers: int = 1,
|
1621 |
+
resnet_eps: float = 1e-6,
|
1622 |
+
resnet_time_scale_shift: str = "default",
|
1623 |
+
resnet_act_fn: str = "swish",
|
1624 |
+
resnet_pre_norm: bool = True,
|
1625 |
+
output_scale_factor=np.sqrt(2.0),
|
1626 |
+
add_upsample=True,
|
1627 |
+
upsample_padding=1,
|
1628 |
+
):
|
1629 |
+
super().__init__()
|
1630 |
+
self.resnets = nn.ModuleList([])
|
1631 |
+
|
1632 |
+
for i in range(num_layers):
|
1633 |
+
res_skip_channels = in_channels if (
|
1634 |
+
i == num_layers - 1) else out_channels
|
1635 |
+
resnet_in_channels = prev_output_channel if i == 0 else out_channels
|
1636 |
+
|
1637 |
+
self.resnets.append(
|
1638 |
+
ResnetBlock2D(
|
1639 |
+
in_channels=resnet_in_channels + res_skip_channels,
|
1640 |
+
out_channels=out_channels,
|
1641 |
+
temb_channels=temb_channels,
|
1642 |
+
eps=resnet_eps,
|
1643 |
+
groups=min(
|
1644 |
+
(resnet_in_channels + res_skip_channels) // 4, 32),
|
1645 |
+
groups_out=min(out_channels // 4, 32),
|
1646 |
+
dropout=dropout,
|
1647 |
+
time_embedding_norm=resnet_time_scale_shift,
|
1648 |
+
non_linearity=resnet_act_fn,
|
1649 |
+
output_scale_factor=output_scale_factor,
|
1650 |
+
pre_norm=resnet_pre_norm,
|
1651 |
+
)
|
1652 |
+
)
|
1653 |
+
|
1654 |
+
self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels)
|
1655 |
+
if add_upsample:
|
1656 |
+
self.resnet_up = ResnetBlock2D(
|
1657 |
+
in_channels=out_channels,
|
1658 |
+
out_channels=out_channels,
|
1659 |
+
temb_channels=temb_channels,
|
1660 |
+
eps=resnet_eps,
|
1661 |
+
groups=min(out_channels // 4, 32),
|
1662 |
+
groups_out=min(out_channels // 4, 32),
|
1663 |
+
dropout=dropout,
|
1664 |
+
time_embedding_norm=resnet_time_scale_shift,
|
1665 |
+
non_linearity=resnet_act_fn,
|
1666 |
+
output_scale_factor=output_scale_factor,
|
1667 |
+
pre_norm=resnet_pre_norm,
|
1668 |
+
use_in_shortcut=True,
|
1669 |
+
up=True,
|
1670 |
+
kernel="fir",
|
1671 |
+
)
|
1672 |
+
self.skip_conv = nn.Conv2d(out_channels, 3, kernel_size=(
|
1673 |
+
3, 3), stride=(1, 1), padding=(1, 1))
|
1674 |
+
self.skip_norm = torch.nn.GroupNorm(
|
1675 |
+
num_groups=min(out_channels // 4, 32), num_channels=out_channels, eps=resnet_eps, affine=True
|
1676 |
+
)
|
1677 |
+
self.act = nn.SiLU()
|
1678 |
+
else:
|
1679 |
+
self.resnet_up = None
|
1680 |
+
self.skip_conv = None
|
1681 |
+
self.skip_norm = None
|
1682 |
+
self.act = None
|
1683 |
+
|
1684 |
+
def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample=None):
|
1685 |
+
for resnet in self.resnets:
|
1686 |
+
# pop res hidden states
|
1687 |
+
res_hidden_states = res_hidden_states_tuple[-1]
|
1688 |
+
res_hidden_states_tuple = res_hidden_states_tuple[:-1]
|
1689 |
+
hidden_states = torch.cat(
|
1690 |
+
[hidden_states, res_hidden_states], dim=1)
|
1691 |
+
|
1692 |
+
hidden_states, _ = resnet(hidden_states, temb)
|
1693 |
+
|
1694 |
+
if skip_sample is not None:
|
1695 |
+
skip_sample = self.upsampler(skip_sample)
|
1696 |
+
else:
|
1697 |
+
skip_sample = 0
|
1698 |
+
|
1699 |
+
if self.resnet_up is not None:
|
1700 |
+
skip_sample_states = self.skip_norm(hidden_states)
|
1701 |
+
skip_sample_states = self.act(skip_sample_states)
|
1702 |
+
skip_sample_states = self.skip_conv(skip_sample_states)
|
1703 |
+
|
1704 |
+
skip_sample = skip_sample + skip_sample_states
|
1705 |
+
|
1706 |
+
hidden_states, _ = self.resnet_up(hidden_states, temb)
|
1707 |
+
|
1708 |
+
return hidden_states, skip_sample
|
1709 |
+
|
1710 |
+
|
1711 |
+
class ResnetBlock2D(nn.Module):
|
1712 |
+
def __init__(
|
1713 |
+
self,
|
1714 |
+
*,
|
1715 |
+
in_channels,
|
1716 |
+
out_channels=None,
|
1717 |
+
conv_shortcut=False,
|
1718 |
+
dropout=0.0,
|
1719 |
+
temb_channels=512,
|
1720 |
+
groups=32,
|
1721 |
+
groups_out=None,
|
1722 |
+
pre_norm=True,
|
1723 |
+
eps=1e-6,
|
1724 |
+
non_linearity="swish",
|
1725 |
+
time_embedding_norm="default",
|
1726 |
+
kernel=None,
|
1727 |
+
output_scale_factor=1.0,
|
1728 |
+
use_in_shortcut=None,
|
1729 |
+
up=False,
|
1730 |
+
down=False,
|
1731 |
+
):
|
1732 |
+
super().__init__()
|
1733 |
+
self.pre_norm = pre_norm
|
1734 |
+
self.pre_norm = True
|
1735 |
+
self.in_channels = in_channels
|
1736 |
+
out_channels = in_channels if out_channels is None else out_channels
|
1737 |
+
self.out_channels = out_channels
|
1738 |
+
self.use_conv_shortcut = conv_shortcut
|
1739 |
+
self.time_embedding_norm = time_embedding_norm
|
1740 |
+
self.up = up
|
1741 |
+
self.down = down
|
1742 |
+
self.output_scale_factor = output_scale_factor
|
1743 |
+
|
1744 |
+
if groups_out is None:
|
1745 |
+
groups_out = groups
|
1746 |
+
|
1747 |
+
self.norm1 = torch.nn.GroupNorm(
|
1748 |
+
num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
|
1749 |
+
|
1750 |
+
self.conv1 = torch.nn.Conv2d(
|
1751 |
+
in_channels, out_channels, kernel_size=3, stride=1, padding=1)
|
1752 |
+
|
1753 |
+
if temb_channels is not None:
|
1754 |
+
if self.time_embedding_norm == "default":
|
1755 |
+
time_emb_proj_out_channels = out_channels
|
1756 |
+
elif self.time_embedding_norm == "scale_shift":
|
1757 |
+
time_emb_proj_out_channels = out_channels * 2
|
1758 |
+
else:
|
1759 |
+
raise ValueError(
|
1760 |
+
f"unknown time_embedding_norm : {self.time_embedding_norm} ")
|
1761 |
+
|
1762 |
+
self.time_emb_proj = torch.nn.Linear(
|
1763 |
+
temb_channels, time_emb_proj_out_channels)
|
1764 |
+
else:
|
1765 |
+
self.time_emb_proj = None
|
1766 |
+
|
1767 |
+
self.norm2 = torch.nn.GroupNorm(
|
1768 |
+
num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
|
1769 |
+
self.dropout = torch.nn.Dropout(dropout)
|
1770 |
+
self.conv2 = torch.nn.Conv2d(
|
1771 |
+
out_channels, out_channels, kernel_size=3, stride=1, padding=1)
|
1772 |
+
|
1773 |
+
if non_linearity == "swish":
|
1774 |
+
self.nonlinearity = lambda x: F.silu(x)
|
1775 |
+
elif non_linearity == "mish":
|
1776 |
+
self.nonlinearity = Mish()
|
1777 |
+
elif non_linearity == "silu":
|
1778 |
+
self.nonlinearity = nn.SiLU()
|
1779 |
+
|
1780 |
+
self.upsample = self.downsample = None
|
1781 |
+
if self.up:
|
1782 |
+
if kernel == "fir":
|
1783 |
+
fir_kernel = (1, 3, 3, 1)
|
1784 |
+
self.upsample = lambda x: upsample_2d(x, kernel=fir_kernel)
|
1785 |
+
elif kernel == "sde_vp":
|
1786 |
+
self.upsample = partial(
|
1787 |
+
F.interpolate, scale_factor=2.0, mode="nearest")
|
1788 |
+
else:
|
1789 |
+
self.upsample = Upsample2D(in_channels, use_conv=False)
|
1790 |
+
elif self.down:
|
1791 |
+
if kernel == "fir":
|
1792 |
+
fir_kernel = (1, 3, 3, 1)
|
1793 |
+
self.downsample = lambda x: downsample_2d(x, kernel=fir_kernel)
|
1794 |
+
elif kernel == "sde_vp":
|
1795 |
+
self.downsample = partial(
|
1796 |
+
F.avg_pool2d, kernel_size=2, stride=2)
|
1797 |
+
else:
|
1798 |
+
self.downsample = Downsample2D(
|
1799 |
+
in_channels, use_conv=False, padding=1, name="op")
|
1800 |
+
|
1801 |
+
self.use_in_shortcut = self.in_channels != self.out_channels if use_in_shortcut is None else use_in_shortcut
|
1802 |
+
|
1803 |
+
self.conv_shortcut = None
|
1804 |
+
if self.use_in_shortcut:
|
1805 |
+
self.conv_shortcut = torch.nn.Conv2d(
|
1806 |
+
in_channels, out_channels, kernel_size=1, stride=1, padding=0)
|
1807 |
+
|
1808 |
+
def forward(self, input_tensor, temb, inject_states=None):
|
1809 |
+
hidden_states = input_tensor
|
1810 |
+
|
1811 |
+
hidden_states = self.norm1(hidden_states)
|
1812 |
+
hidden_states = self.nonlinearity(hidden_states)
|
1813 |
+
|
1814 |
+
if self.upsample is not None:
|
1815 |
+
# upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
|
1816 |
+
if hidden_states.shape[0] >= 64:
|
1817 |
+
input_tensor = input_tensor.contiguous()
|
1818 |
+
hidden_states = hidden_states.contiguous()
|
1819 |
+
input_tensor = self.upsample(input_tensor)
|
1820 |
+
hidden_states = self.upsample(hidden_states)
|
1821 |
+
elif self.downsample is not None:
|
1822 |
+
input_tensor = self.downsample(input_tensor)
|
1823 |
+
hidden_states = self.downsample(hidden_states)
|
1824 |
+
|
1825 |
+
hidden_states = self.conv1(hidden_states)
|
1826 |
+
|
1827 |
+
if temb is not None:
|
1828 |
+
temb = self.time_emb_proj(self.nonlinearity(temb))[
|
1829 |
+
:, :, None, None]
|
1830 |
+
|
1831 |
+
if temb is not None and self.time_embedding_norm == "default":
|
1832 |
+
hidden_states = hidden_states + temb
|
1833 |
+
|
1834 |
+
hidden_states = self.norm2(hidden_states)
|
1835 |
+
|
1836 |
+
if temb is not None and self.time_embedding_norm == "scale_shift":
|
1837 |
+
scale, shift = torch.chunk(temb, 2, dim=1)
|
1838 |
+
hidden_states = hidden_states * (1 + scale) + shift
|
1839 |
+
|
1840 |
+
hidden_states = self.nonlinearity(hidden_states)
|
1841 |
+
|
1842 |
+
hidden_states = self.dropout(hidden_states)
|
1843 |
+
hidden_states = self.conv2(hidden_states)
|
1844 |
+
|
1845 |
+
if self.conv_shortcut is not None:
|
1846 |
+
input_tensor = self.conv_shortcut(input_tensor)
|
1847 |
+
|
1848 |
+
if inject_states is not None:
|
1849 |
+
output_tensor = (input_tensor + inject_states) / \
|
1850 |
+
self.output_scale_factor
|
1851 |
+
else:
|
1852 |
+
output_tensor = (input_tensor + hidden_states) / \
|
1853 |
+
self.output_scale_factor
|
1854 |
+
|
1855 |
+
return output_tensor, hidden_states
|
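# ---------------------------------------------------------------------------
# Illustrative sketch added for clarity; not part of the original file.
# Compared with the stock diffusers block, this ResnetBlock2D also returns
# its freshly computed pre-residual activations and accepts `inject_states`,
# which replaces those activations inside the residual sum.  With the default
# dropout of 0.0 the two calls below are equivalent.  Sizes are hypothetical.
def _example_resnet_block_2d_injection():
    block = ResnetBlock2D(in_channels=64, out_channels=64, temb_channels=128)
    sample = torch.randn(2, 64, 16, 16)
    temb = torch.randn(2, 128)
    out, hidden = block(sample, temb)                      # normal pass
    injected, _ = block(sample, temb, inject_states=hidden)
    assert torch.allclose(out, injected)
    return out
# ---------------------------------------------------------------------------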
models/unet_2d_condition.py
ADDED
@@ -0,0 +1,411 @@
|
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.utils.checkpoint

from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.models.modeling_utils import ModelMixin
from diffusers.utils import BaseOutput, logging
from diffusers.models.embeddings import TimestepEmbedding, Timesteps
from .unet_2d_blocks import (
    CrossAttnDownBlock2D,
    CrossAttnUpBlock2D,
    DownBlock2D,
    UNetMidBlock2DCrossAttn,
    UpBlock2D,
    get_down_block,
    get_up_block,
)


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


@dataclass
class UNet2DConditionOutput(BaseOutput):
    """
    Args:
        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Hidden states conditioned on `encoder_hidden_states` input. Output of last layer of model.
    """

    sample: torch.FloatTensor


class UNet2DConditionModel(ModelMixin, ConfigMixin):
    r"""
    UNet2DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a timestep
    and returns sample shaped output.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
    implements for all the models (such as downloading or saving, etc.)

    Parameters:
        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
            Height and width of input/output sample.
        in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
        out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
        center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
        flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
            Whether to flip the sin to cos in the time embedding.
        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
            The tuple of downsample blocks to use.
        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D",)`):
            The tuple of upsample blocks to use.
        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
            The tuple of output channels for each block.
        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
        downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
        mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
        cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features.
        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
    """

    _supports_gradient_checkpointing = True

    @register_to_config
    def __init__(
        self,
        sample_size: Optional[int] = None,
        in_channels: int = 4,
        out_channels: int = 4,
        center_input_sample: bool = False,
        flip_sin_to_cos: bool = True,
        freq_shift: int = 0,
        down_block_types: Tuple[str] = (
            "CrossAttnDownBlock2D",
            "CrossAttnDownBlock2D",
            "CrossAttnDownBlock2D",
            "DownBlock2D",
        ),
        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
        only_cross_attention: Union[bool, Tuple[bool]] = False,
        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
        layers_per_block: int = 2,
        downsample_padding: int = 1,
        mid_block_scale_factor: float = 1,
        act_fn: str = "silu",
        norm_num_groups: int = 32,
        norm_eps: float = 1e-5,
        cross_attention_dim: int = 1280,
        attention_head_dim: Union[int, Tuple[int]] = 8,
        dual_cross_attention: bool = False,
        use_linear_projection: bool = False,
        num_class_embeds: Optional[int] = None,
    ):
        super().__init__()

        self.sample_size = sample_size
        time_embed_dim = block_out_channels[0] * 4
        # import ipdb;ipdb.set_trace()

        # input
        self.conv_in = nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1))

        # time
        self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
        timestep_input_dim = block_out_channels[0]

        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)

        # class embedding
        if num_class_embeds is not None:
            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)

        self.down_blocks = nn.ModuleList([])
        self.mid_block = None
        self.up_blocks = nn.ModuleList([])

        if isinstance(only_cross_attention, bool):
            only_cross_attention = [only_cross_attention] * len(down_block_types)

        if isinstance(attention_head_dim, int):
            attention_head_dim = (attention_head_dim,) * len(down_block_types)

        # down
        output_channel = block_out_channels[0]
        for i, down_block_type in enumerate(down_block_types):
            input_channel = output_channel
            output_channel = block_out_channels[i]
            is_final_block = i == len(block_out_channels) - 1

            down_block = get_down_block(
                down_block_type,
                num_layers=layers_per_block,
                in_channels=input_channel,
                out_channels=output_channel,
                temb_channels=time_embed_dim,
                add_downsample=not is_final_block,
                resnet_eps=norm_eps,
                resnet_act_fn=act_fn,
                resnet_groups=norm_num_groups,
                cross_attention_dim=cross_attention_dim,
                attn_num_head_channels=attention_head_dim[i],
                downsample_padding=downsample_padding,
                dual_cross_attention=dual_cross_attention,
                use_linear_projection=use_linear_projection,
                only_cross_attention=only_cross_attention[i],
            )
            self.down_blocks.append(down_block)

        # mid
        self.mid_block = UNetMidBlock2DCrossAttn(
            in_channels=block_out_channels[-1],
            temb_channels=time_embed_dim,
            resnet_eps=norm_eps,
            resnet_act_fn=act_fn,
            output_scale_factor=mid_block_scale_factor,
            resnet_time_scale_shift="default",
            cross_attention_dim=cross_attention_dim,
            attn_num_head_channels=attention_head_dim[-1],
            resnet_groups=norm_num_groups,
            dual_cross_attention=dual_cross_attention,
            use_linear_projection=use_linear_projection,
        )

        # count how many layers upsample the images
        self.num_upsamplers = 0

        # up
        reversed_block_out_channels = list(reversed(block_out_channels))
        reversed_attention_head_dim = list(reversed(attention_head_dim))
        only_cross_attention = list(reversed(only_cross_attention))
        output_channel = reversed_block_out_channels[0]
        for i, up_block_type in enumerate(up_block_types):
            is_final_block = i == len(block_out_channels) - 1

            prev_output_channel = output_channel
            output_channel = reversed_block_out_channels[i]
            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]

            # add upsample block for all BUT final layer
            if not is_final_block:
                add_upsample = True
                self.num_upsamplers += 1
            else:
                add_upsample = False

            up_block = get_up_block(
                up_block_type,
                num_layers=layers_per_block + 1,
                in_channels=input_channel,
                out_channels=output_channel,
                prev_output_channel=prev_output_channel,
                temb_channels=time_embed_dim,
                add_upsample=add_upsample,
                resnet_eps=norm_eps,
                resnet_act_fn=act_fn,
                resnet_groups=norm_num_groups,
                cross_attention_dim=cross_attention_dim,
                attn_num_head_channels=reversed_attention_head_dim[i],
                dual_cross_attention=dual_cross_attention,
                use_linear_projection=use_linear_projection,
                only_cross_attention=only_cross_attention[i],
            )
            self.up_blocks.append(up_block)
            prev_output_channel = output_channel

        # out
        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps)
        self.conv_act = nn.SiLU()
        self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, kernel_size=3, padding=1)

    def set_attention_slice(self, slice_size):
        head_dims = self.config.attention_head_dim
        head_dims = [head_dims] if isinstance(head_dims, int) else head_dims
        if slice_size is not None and any(dim % slice_size != 0 for dim in head_dims):
            raise ValueError(
                f"Make sure slice_size {slice_size} is a common divisor of "
                f"the number of heads used in cross_attention: {head_dims}"
            )
        if slice_size is not None and slice_size > min(head_dims):
            raise ValueError(
                f"slice_size {slice_size} has to be smaller or equal to "
                f"the lowest number of heads used in cross_attention: min({head_dims}) = {min(head_dims)}"
            )

        for block in self.down_blocks:
            if hasattr(block, "attentions") and block.attentions is not None:
                block.set_attention_slice(slice_size)

        self.mid_block.set_attention_slice(slice_size)

        for block in self.up_blocks:
            if hasattr(block, "attentions") and block.attentions is not None:
                block.set_attention_slice(slice_size)

    def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
        for block in self.down_blocks:
            if hasattr(block, "attentions") and block.attentions is not None:
                block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)

        self.mid_block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)

        for block in self.up_blocks:
            if hasattr(block, "attentions") and block.attentions is not None:
                block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D, CrossAttnUpBlock2D, UpBlock2D)):
            module.gradient_checkpointing = value

    def forward(
        self,
        sample: torch.FloatTensor,
        timestep: Union[torch.Tensor, float, int],
        encoder_hidden_states: torch.Tensor,
        class_labels: Optional[torch.Tensor] = None,
        text_format_dict = {},
        return_dict: bool = True,
    ) -> Union[UNet2DConditionOutput, Tuple]:
        r"""
        Args:
            sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
            timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
            encoder_hidden_states (`torch.FloatTensor`): (batch, channel, height, width) encoder hidden states
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.

        Returns:
            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
            [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
            returning a tuple, the first element is the sample tensor.
        """
        # By default samples have to be at least a multiple of the overall upsampling factor.
        # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
        # However, the upsampling interpolation output size can be forced to fit any upsampling size
        # on the fly if necessary.
        default_overall_up_factor = 2**self.num_upsamplers

        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
        forward_upsample_size = False
        upsample_size = None

        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
            logger.info("Forward upsample size to force interpolation output size.")
            forward_upsample_size = True

        # 0. center input if necessary
        if self.config.center_input_sample:
            sample = 2 * sample - 1.0

        # 1. time
        timesteps = timestep
        if not torch.is_tensor(timesteps):
            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
            timesteps = torch.tensor([timesteps], dtype=torch.long, device=sample.device)
        elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
            timesteps = timesteps[None].to(sample.device)

        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
        timesteps = timesteps.expand(sample.shape[0])

        t_emb = self.time_proj(timesteps)

        # timesteps does not contain any weights and will always return f32 tensors
        # but time_embedding might actually be running in fp16. so we need to cast here.
        # there might be better ways to encapsulate this.
        t_emb = t_emb.to(dtype=self.dtype)
        emb = self.time_embedding(t_emb)

        if self.config.num_class_embeds is not None:
            if class_labels is None:
                raise ValueError("class_labels should be provided when num_class_embeds > 0")
            class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
            emb = emb + class_emb

        # 2. pre-process
        sample = self.conv_in(sample)

        # 3. down
        down_block_res_samples = (sample,)
        for downsample_block in self.down_blocks:
            if hasattr(downsample_block, "attentions") and downsample_block.attentions is not None:
                if isinstance(downsample_block, CrossAttnDownBlock2D):
                    sample, res_samples = downsample_block(
                        hidden_states=sample,
                        temb=emb,
                        encoder_hidden_states=encoder_hidden_states,
                        text_format_dict=text_format_dict
                    )
                else:
                    sample, res_samples = downsample_block(
                        hidden_states=sample,
                        temb=emb,
                        encoder_hidden_states=encoder_hidden_states,
                    )
            else:
                if isinstance(downsample_block, CrossAttnDownBlock2D):
                    import ipdb;ipdb.set_trace()
                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
            down_block_res_samples += res_samples

        # 4. mid
        sample = self.mid_block(sample, emb, encoder_hidden_states=encoder_hidden_states,
                                text_format_dict=text_format_dict)

        # 5. up
        for i, upsample_block in enumerate(self.up_blocks):
            is_final_block = i == len(self.up_blocks) - 1

            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]

            # if we have not reached the final block and need to forward the
            # upsample size, we do it here
            if not is_final_block and forward_upsample_size:
                upsample_size = down_block_res_samples[-1].shape[2:]

            if hasattr(upsample_block, "attentions") and upsample_block.attentions is not None:
                if isinstance(upsample_block, CrossAttnUpBlock2D):
                    sample = upsample_block(
                        hidden_states=sample,
                        temb=emb,
                        res_hidden_states_tuple=res_samples,
                        encoder_hidden_states=encoder_hidden_states,
                        upsample_size=upsample_size,
                        text_format_dict=text_format_dict
                    )
                else:
                    sample = upsample_block(
                        hidden_states=sample,
                        temb=emb,
                        res_hidden_states_tuple=res_samples,
                        encoder_hidden_states=encoder_hidden_states,
                        upsample_size=upsample_size,
                    )
            else:
                if isinstance(upsample_block, CrossAttnUpBlock2D):
                    upsample_block.attentions
                    import ipdb;ipdb.set_trace()
                sample = upsample_block(
                    hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
                )
        # 6. post-process
        sample = self.conv_norm_out(sample)
        sample = self.conv_act(sample)
        sample = self.conv_out(sample)

        if not return_dict:
            return (sample,)

        return UNet2DConditionOutput(sample=sample)
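Compared with the stock diffusers UNet, the only interface change above is the extra `text_format_dict` argument that is threaded through the cross-attention down, mid, and up blocks. A minimal usage sketch follows; it assumes the custom attention blocks tolerate the empty-dict default suggested by the signature (whether that holds depends on models/attention.py, which is not shown here), and the shapes simply follow the default config in this file. The dict is actually populated by the app / region-diffusion code, so treat this as an illustration rather than a tested call:

import torch
from models.unet_2d_condition import UNet2DConditionModel

# Default config: 4 latent channels, cross_attention_dim=1280.
unet = UNet2DConditionModel(sample_size=64)

latents = torch.randn(1, 4, 64, 64)          # noisy latent sample
timestep = torch.tensor([10])                # diffusion timestep
text_embeddings = torch.randn(1, 77, 1280)   # placeholder encoder hidden states

with torch.no_grad():
    out = unet(latents, timestep, text_embeddings, text_format_dict={})
print(out.sample.shape)  # torch.Size([1, 4, 64, 64])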
requirements.txt
ADDED
@@ -0,0 +1,9 @@
--extra-index-url https://download.pytorch.org/whl/cu117
torch==1.13.1
torchvision==0.14.1
diffusers==0.12.1
transformers==4.26.0
numpy==1.24.2
seaborn==0.12.2
accelerate==0.16.0
scikit-learn==0.24.1
rich-text-to-json-iframe.html
ADDED
@@ -0,0 +1,341 @@
<!DOCTYPE html>
<html lang="en">

<head>
  <title>Rich Text to JSON</title>
  <link rel="stylesheet" href="https://cdn.quilljs.com/1.3.6/quill.snow.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/css/bulma.min.css">
  <link rel="stylesheet" type="text/css"
    href="https://cdnjs.cloudflare.com/ajax/libs/spectrum/1.8.0/spectrum.min.css">
  <link rel="stylesheet"
    href='https://fonts.googleapis.com/css?family=Mirza|Roboto|Slabo+27px|Sofia|Inconsolata|Ubuntu|Akronim|Monoton&display=swap'>
  <style>
    html,
    body {
      background-color: white;
      margin: 0;
    }

    /* Set default font-family */
    .ql-snow .ql-tooltip::before {
      content: "Footnote";
      line-height: 26px;
      margin-right: 8px;
    }

    .ql-snow .ql-tooltip[data-mode=link]::before {
      content: "Enter footnote:";
    }

    .row {
      margin-top: 15px;
      margin-left: 0px;
      margin-bottom: 15px;
    }

    .btn-primary {
      color: #ffffff;
      background-color: #2780e3;
      border-color: #2780e3;
    }

    .btn-primary:hover {
      color: #ffffff;
      background-color: #1967be;
      border-color: #1862b5;
    }

    .btn {
      display: inline-block;
      margin-bottom: 0;
      font-weight: normal;
      text-align: center;
      vertical-align: middle;
      touch-action: manipulation;
      cursor: pointer;
      background-image: none;
      border: 1px solid transparent;
      white-space: nowrap;
      padding: 10px 18px;
      font-size: 15px;
      line-height: 1.42857143;
      border-radius: 0;
      user-select: none;
    }

    #standalone-container {
      width: 100%;
      background-color: #ffffff;
    }

    #editor-container {
      font-family: "Aref Ruqaa";
      font-size: 18px;
      height: 250px;
      width: 100%;
    }

    #toolbar-container {
      font-family: "Aref Ruqaa";
      display: flex;
      flex-wrap: wrap;
    }

    #json-container {
      max-width: 720px;
    }

    /* Set dropdown font-families */
    #toolbar-container .ql-font span[data-label="Base"]::before {
      font-family: "Aref Ruqaa";
    }

    #toolbar-container .ql-font span[data-label="Claude Monet"]::before {
      font-family: "Mirza";
    }

    #toolbar-container .ql-font span[data-label="Ukiyoe"]::before {
      font-family: "Roboto";
    }

    #toolbar-container .ql-font span[data-label="Cyber Punk"]::before {
      font-family: "Comic Sans MS";
    }

    #toolbar-container .ql-font span[data-label="Pop Art"]::before {
      font-family: "sofia";
    }

    #toolbar-container .ql-font span[data-label="Van Gogh"]::before {
      font-family: "slabo 27px";
    }

    #toolbar-container .ql-font span[data-label="Pixel Art"]::before {
      font-family: "inconsolata";
    }

    #toolbar-container .ql-font span[data-label="Rembrandt"]::before {
      font-family: "ubuntu";
    }

    #toolbar-container .ql-font span[data-label="Cubism"]::before {
      font-family: "Akronim";
    }

    #toolbar-container .ql-font span[data-label="Neon Art"]::before {
      font-family: "Monoton";
    }

    /* Set content font-families */
    .ql-font-mirza {
      font-family: "Mirza";
    }

    .ql-font-roboto {
      font-family: "Roboto";
    }

    .ql-font-cursive {
      font-family: "Comic Sans MS";
    }

    .ql-font-sofia {
      font-family: "sofia";
    }

    .ql-font-slabo {
      font-family: "slabo 27px";
    }

    .ql-font-inconsolata {
      font-family: "inconsolata";
    }

    .ql-font-ubuntu {
      font-family: "ubuntu";
    }

    .ql-font-Akronim {
      font-family: "Akronim";
    }

    .ql-font-Monoton {
      font-family: "Monoton";
    }

    .ql-color .ql-picker-options [data-value=Color-Picker] {
      background: none !important;
      width: 100% !important;
      height: 20px !important;
      text-align: center;
    }

    .ql-color .ql-picker-options [data-value=Color-Picker]:before {
      content: 'Color Picker';
    }

    .ql-color .ql-picker-options [data-value=Color-Picker]:hover {
      border-color: transparent !important;
    }
  </style>
</head>

<body>
  <div id="standalone-container">
    <div id="toolbar-container">
      <span class="ql-formats">
        <select class="ql-font">
          <option selected>Base</option>
          <option value="mirza">Claude Monet</option>
          <option value="roboto">Ukiyoe</option>
          <option value="cursive">Cyber Punk</option>
          <option value="sofia">Pop Art</option>
          <option value="slabo">Van Gogh</option>
          <option value="inconsolata">Pixel Art</option>
          <option value="ubuntu">Rembrandt</option>
          <option value="Akronim">Cubism</option>
          <option value="Monoton">Neon Art</option>
        </select>
        <select class="ql-size">
          <option value="18px">Small</option>
          <option selected>Normal</option>
          <option value="32px">Large</option>
          <option value="50px">Huge</option>
        </select>
      </span>
      <span class="ql-formats">
        <button class="ql-strike"></button>
      </span>
      <!-- <span class="ql-formats">
        <button class="ql-bold"></button>
        <button class="ql-italic"></button>
        <button class="ql-underline"></button>
      </span> -->
      <span class="ql-formats">
        <select class="ql-color">
          <option value="Color-Picker"></option>
        </select>
        <!-- <select class="ql-background"></select> -->
      </span>
      <!-- <span class="ql-formats">
        <button class="ql-script" value="sub"></button>
        <button class="ql-script" value="super"></button>
      </span>
      <span class="ql-formats">
        <button class="ql-header" value="1"></button>
        <button class="ql-header" value="2"></button>
        <button class="ql-blockquote"></button>
        <button class="ql-code-block"></button>
      </span>
      <span class="ql-formats">
        <button class="ql-list" value="ordered"></button>
        <button class="ql-list" value="bullet"></button>
        <button class="ql-indent" value="-1"></button>
        <button class="ql-indent" value="+1"></button>
      </span>
      <span class="ql-formats">
        <button class="ql-direction" value="rtl"></button>
        <select class="ql-align"></select>
      </span>
      <span class="ql-formats">
        <button class="ql-link"></button>
        <button class="ql-image"></button>
        <button class="ql-video"></button>
        <button class="ql-formula"></button>
      </span> -->
      <span class="ql-formats">
        <button class="ql-link"></button>
      </span>
      <span class="ql-formats">
        <button class="ql-clean"></button>
      </span>
    </div>
    <div id="editor-container" style="height:300px;"></div>
  </div>
  <script src="https://cdn.quilljs.com/1.3.6/quill.min.js"></script>
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.1.0/jquery.min.js"></script>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/spectrum/1.8.0/spectrum.min.js"></script>
  <script>

    // Register the custom formats with Quill
    const Font = Quill.import('formats/font');
    Font.whitelist = ['mirza', 'roboto', 'sofia', 'slabo', 'inconsolata', 'ubuntu', 'cursive', 'Akronim', 'Monoton'];
    const Link = Quill.import('formats/link');
    Link.sanitize = function (url) {
      // modify url if desired
      return url;
    }
    const SizeStyle = Quill.import('attributors/style/size');
    SizeStyle.whitelist = ['10px', '18px', '20px', '32px', '50px', '60px', '64px', '70px'];
    Quill.register(SizeStyle, true);
    Quill.register(Link, true);
    Quill.register(Font, true);
    const icons = Quill.import('ui/icons');
icons['link'] = `<svg xmlns="http://www.w3.org/2000/svg" width="17" viewBox="0 0 512 512" xml:space="preserve"><path fill="#010101" d="M276.75 1c4.51 3.23 9.2 6.04 12.97 9.77 29.7 29.45 59.15 59.14 88.85 88.6 4.98 4.93 7.13 10.37 7.12 17.32-.1 125.8-.09 251.6-.01 377.4 0 7.94-1.96 14.46-9.62 18.57-121.41.34-242.77.34-364.76.05A288.3 288.3 0 0 1 1 502c0-163.02 0-326.04.34-489.62C3.84 6.53 8.04 3.38 13 1c23.35 0 46.7 0 70.82.3 2.07.43 3.38.68 4.69.68h127.98c18.44.01 36.41.04 54.39-.03 1.7 0 3.41-.62 5.12-.95h.75M33.03 122.5v359.05h320.22V129.18h-76.18c-14.22-.01-19.8-5.68-19.8-20.09V33.31H33.02v89.19m256.29-27.36c.72.66 1.44 1.9 2.17 1.9 12.73.12 25.46.08 37.55.08L289.3 57.45v37.7z"/><path fill="#020202" d="M513 375.53c-4.68 7.99-11.52 10.51-20.21 10.25-13.15-.4-26.32-.1-39.48-.1h-5.58c5.49 8.28 10.7 15.74 15.46 23.47 6.06 9.82 1.14 21.65-9.96 24.27-6.7 1.59-12.45-.64-16.23-6.15a2608.6 2608.6 0 0 1-32.97-49.36c-3.57-5.48-3.39-11.54.17-16.98a3122.5 3122.5 0 0 1 32.39-48.56c5.22-7.65 14.67-9.35 21.95-4.45 7.63 5.12 9.6 14.26 4.5 22.33-4.75 7.54-9.8 14.9-15.11 22.95h33.64V225.19h-5.24c-19.49 0-38.97.11-58.46-.05-12.74-.1-20.12-13.15-13.84-24.14 3.12-5.46 8.14-7.71 14.18-7.73 26.15-.06 52.3-.04 78.45 0 7.1 0 12.47 3.05 16.01 9.64.33 57.44.33 114.8.33 172.62z"/><path fill="#111" d="M216.03 1.97C173.52 1.98 131 2 88.5 1.98a16 16 0 0 1-4.22-.68c43.4-.3 87.09-.3 131.24-.06.48.25.5.73.5.73z"/><path fill="#232323" d="M216.5 1.98c-.47 0-.5-.5-.5-.74C235.7 1 255.38 1 275.53 1c-1.24.33-2.94.95-4.65.95-17.98.07-35.95.04-54.39.03z"/><path fill="#040404" d="M148 321.42h153.5c14.25 0 19.96 5.71 19.96 19.97.01 19.17.03 38.33 0 57.5-.03 12.6-6.16 18.78-18.66 18.78H99.81c-12.42 0-18.75-6.34-18.76-18.73-.01-19.83-.02-39.66 0-59.5.02-11.47 6.4-17.93 17.95-18 16.17-.08 32.33-.02 49-.02m40.5 32.15h-75.16v31.84h175.7v-31.84H188.5z"/><path fill="#030303" d="m110 225.33 178.89-.03c11.98 0 19.25 9.95 15.74 21.44-2.05 6.71-7.5 10.57-15.14 10.57-63.63 0-127.25-.01-190.88-.07-12.03-.02-19.17-8.62-16.7-19.84 1.6-7.21 7.17-11.74 15.1-12.04 4.17-.16 8.33-.03 13-.03zm-24.12-36.19c-5.28-6.2-6.3-12.76-2.85-19.73 3.22-6.49 9.13-8.24 15.86-8.24 25.64.01 51.27-.06 76.91.04 13.07.04 20.66 10.44 16.33 22.08-2.25 6.06-6.63 9.76-13.08 9.8-27.97.18-55.94.2-83.9-.07-3.01-.03-6-2.36-9.27-3.88z"/></svg>`
    const quill = new Quill('#editor-container', {
      modules: {
        toolbar: {
          container: '#toolbar-container',
        },
      },
      theme: 'snow'
    });
    var toolbar = quill.getModule('toolbar');
    $(toolbar.container).find('.ql-color').spectrum({
      preferredFormat: "rgb",
      showInput: true,
      showInitial: true,
      showPalette: true,
      showSelectionPalette: true,
      palette: [
        ["#000", "#444", "#666", "#999", "#ccc", "#eee", "#f3f3f3", "#fff"],
        ["#f00", "#f90", "#ff0", "#0f0", "#0ff", "#00f", "#90f", "#f0f"],
        ["#ea9999", "#f9cb9c", "#ffe599", "#b6d7a8", "#a2c4c9", "#9fc5e8", "#b4a7d6", "#d5a6bd"],
        ["#e06666", "#f6b26b", "#ffd966", "#93c47d", "#76a5af", "#6fa8dc", "#8e7cc3", "#c27ba0"],
        ["#c00", "#e69138", "#f1c232", "#6aa84f", "#45818e", "#3d85c6", "#674ea7", "#a64d79"],
        ["#900", "#b45f06", "#bf9000", "#38761d", "#134f5c", "#0b5394", "#351c75", "#741b47"],
        ["#600", "#783f04", "#7f6000", "#274e13", "#0c343d", "#073763", "#20124d", "#4c1130"]
      ],
      change: function (color) {
        var value = color.toHexString();
        quill.format('color', value);
      }
    });

    quill.on('text-change', () => {
      // keep quill data inside _data to communicate with Gradio
      document.body._data = quill.getContents()
    })
    function setQuillContents(content) {
      quill.setContents(content);
      document.body._data = quill.getContents();
    }
    document.body.setQuillContents = setQuillContents
  </script>
  <script src="https://unpkg.com/@popperjs/core@2/dist/umd/popper.min.js"></script>
  <script src="https://unpkg.com/tippy.js@6/dist/tippy-bundle.umd.js"></script>
  <script>
    // With the above scripts loaded, you can call `tippy()` with a CSS
    // selector and a `content` prop:
    tippy('.ql-font', {
      content: 'Add a style to the token',
    });
    tippy('.ql-size', {
      content: 'Reweight the token',
    });
    tippy('.ql-color', {
      content: 'Pick a color for the token',
    });
    tippy('.ql-link', {
      content: 'Clarify the token',
    });
    tippy('.ql-strike', {
      content: 'Change the token weight to be negative',
    });
    tippy('.ql-clean', {
      content: 'Remove all the formats',
    });
  </script>
</body>

</html>
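The page above never posts HTML: on every edit it stores the Quill Delta returned by `quill.getContents()` in `document.body._data`, and the Gradio app reads that JSON. A Delta is an `ops` list in which each op carries an `insert` string and optional `attributes` (font, size, color, link, strike). A minimal Python sketch of reading such a payload is shown below; the real conversion lives in utils/richtext_utils.py, and the sample payload here is made up for illustration:

import json

delta_json = '''{"ops": [
    {"insert": "a beautiful "},
    {"insert": "garden", "attributes": {"font": "mirza", "color": "#2780e3"}},
    {"insert": " at sunset\\n"}
]}'''

delta = json.loads(delta_json)
plain_prompt = "".join(op["insert"] for op in delta["ops"])
styled_spans = [(op["insert"], op["attributes"]) for op in delta["ops"] if "attributes" in op]

print(plain_prompt.strip())  # a beautiful garden at sunset
print(styled_spans)          # [('garden', {'font': 'mirza', 'color': '#2780e3'})]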
rich-text-to-json.js
ADDED
@@ -0,0 +1,349 @@
class RichTextEditor extends HTMLElement {
  constructor() {
    super();
    this.loadExternalScripts();
    this.attachShadow({ mode: 'open' });
    this.shadowRoot.innerHTML = `
      ${RichTextEditor.header()}
      ${RichTextEditor.template()}
    `;
  }
  connectedCallback() {
    this.myQuill = this.mountQuill();
  }
  loadExternalScripts() {
    const links = ["https://cdn.quilljs.com/1.3.6/quill.snow.css", "https://cdn.jsdelivr.net/npm/[email protected]/css/bulma.min.css", "https://fonts.googleapis.com/css?family=Mirza|Roboto|Slabo+27px|Sofia|Inconsolata|Ubuntu|Akronim|Monoton&display=swap"]
    links.forEach(link => {
      const css = document.createElement("link");
      css.href = link;
      css.rel = "stylesheet"
      document.head.appendChild(css);
    })

  }
  static template() {
    return `
    <div id="standalone-container">
      <div id="toolbar-container">
        <span class="ql-formats">
          <select class="ql-font">
            <option selected>Base</option>
            <option value="mirza">Claude Monet</option>
            <option value="roboto">Ukiyoe</option>
            <option value="cursive">Cyber Punk</option>
            <option value="sofia">Pop Art</option>
            <option value="slabo">Van Gogh</option>
            <option value="inconsolata">Pixel Art</option>
            <option value="ubuntu">Rembrandt</option>
            <option value="Akronim">Cubism</option>
            <option value="Monoton">Neon Art</option>
          </select>
          <select class="ql-size">
            <option value="18px">Small</option>
            <option selected>Normal</option>
            <option value="32px">Large</option>
            <option value="50px">Huge</option>
          </select>
        </span>
        <span class="ql-formats">
          <button class="ql-strike"></button>
        </span>
        <!-- <span class="ql-formats">
          <button class="ql-bold"></button>
          <button class="ql-italic"></button>
          <button class="ql-underline"></button>
        </span> -->
        <span class="ql-formats">
          <select class="ql-color"></select>
          <!-- <select class="ql-background"></select> -->
        </span>
        <!-- <span class="ql-formats">
          <button class="ql-script" value="sub"></button>
          <button class="ql-script" value="super"></button>
        </span>
        <span class="ql-formats">
          <button class="ql-header" value="1"></button>
          <button class="ql-header" value="2"></button>
          <button class="ql-blockquote"></button>
          <button class="ql-code-block"></button>
        </span>
        <span class="ql-formats">
          <button class="ql-list" value="ordered"></button>
          <button class="ql-list" value="bullet"></button>
          <button class="ql-indent" value="-1"></button>
          <button class="ql-indent" value="+1"></button>
        </span>
        <span class="ql-formats">
          <button class="ql-direction" value="rtl"></button>
          <select class="ql-align"></select>
        </span>
        <span class="ql-formats">
          <button class="ql-link"></button>
          <button class="ql-image"></button>
          <button class="ql-video"></button>
          <button class="ql-formula"></button>
        </span> -->
        <span class="ql-formats">
          <button class="ql-link"></button>
        </span>
        <span class="ql-formats">
          <button class="ql-clean"></button>
        </span>
      </div>
      <div id="editor-container"></div>
    </div>
    `;
  }

  static header() {
    return `
    <link rel="stylesheet" href="https://cdn.quilljs.com/1.3.6/quill.snow.css">
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/css/bulma.min.css">
    <style>
      /* Set default font-family */
      .ql-snow .ql-tooltip::before {
        content: "Footnote";
        line-height: 26px;
        margin-right: 8px;
      }

      .ql-snow .ql-tooltip[data-mode=link]::before {
        content: "Enter footnote:";
      }

      .row {
        margin-top: 15px;
        margin-left: 0px;
        margin-bottom: 15px;
      }

      .btn-primary {
        color: #ffffff;
        background-color: #2780e3;
        border-color: #2780e3;
      }

      .btn-primary:hover {
        color: #ffffff;
        background-color: #1967be;
        border-color: #1862b5;
      }

      .btn {
        display: inline-block;
        margin-bottom: 0;
        font-weight: normal;
        text-align: center;
        vertical-align: middle;
        touch-action: manipulation;
        cursor: pointer;
        background-image: none;
        border: 1px solid transparent;
        white-space: nowrap;
        padding: 10px 18px;
        font-size: 15px;
        line-height: 1.42857143;
        border-radius: 0;
        user-select: none;
      }

      #standalone-container {
        position: relative;
        max-width: 720px;
        background-color: #ffffff;
        color: black !important;
        z-index: 1000;
      }

      #editor-container {
        font-family: "Aref Ruqaa";
        font-size: 18px;
        height: 250px;
      }

      #toolbar-container {
        font-family: "Aref Ruqaa";
        display: flex;
        flex-wrap: wrap;
      }

      #json-container {
        max-width: 720px;
      }

      /* Set dropdown font-families */
      #toolbar-container .ql-font span[data-label="Base"]::before {
        font-family: "Aref Ruqaa";
      }

      #toolbar-container .ql-font span[data-label="Claude Monet"]::before {
        font-family: "Mirza";
      }

      #toolbar-container .ql-font span[data-label="Ukiyoe"]::before {
        font-family: "Roboto";
      }

      #toolbar-container .ql-font span[data-label="Cyber Punk"]::before {
        font-family: "Comic Sans MS";
      }

      #toolbar-container .ql-font span[data-label="Pop Art"]::before {
        font-family: "sofia";
      }

      #toolbar-container .ql-font span[data-label="Van Gogh"]::before {
        font-family: "slabo 27px";
      }

      #toolbar-container .ql-font span[data-label="Pixel Art"]::before {
        font-family: "inconsolata";
      }

      #toolbar-container .ql-font span[data-label="Rembrandt"]::before {
        font-family: "ubuntu";
      }

      #toolbar-container .ql-font span[data-label="Cubism"]::before {
        font-family: "Akronim";
      }

      #toolbar-container .ql-font span[data-label="Neon Art"]::before {
        font-family: "Monoton";
      }

      /* Set content font-families */
      .ql-font-mirza {
        font-family: "Mirza";
      }

      .ql-font-roboto {
        font-family: "Roboto";
      }

      .ql-font-cursive {
        font-family: "Comic Sans MS";
      }

      .ql-font-sofia {
        font-family: "sofia";
      }

      .ql-font-slabo {
        font-family: "slabo 27px";
      }

      .ql-font-inconsolata {
        font-family: "inconsolata";
      }

      .ql-font-ubuntu {
        font-family: "ubuntu";
      }

      .ql-font-Akronim {
        font-family: "Akronim";
      }

      .ql-font-Monoton {
        font-family: "Monoton";
      }
    </style>
    `;
  }
  async mountQuill() {
    // Register the custom formats with Quill
    const lib = await import("https://cdn.jsdelivr.net/npm/shadow-selection-polyfill");
    const getRange = lib.getRange;

    const Font = Quill.import('formats/font');
    Font.whitelist = ['mirza', 'roboto', 'sofia', 'slabo', 'inconsolata', 'ubuntu', 'cursive', 'Akronim', 'Monoton'];
    const Link = Quill.import('formats/link');
    Link.sanitize = function (url) {
      // modify url if desired
      return url;
    }
    const SizeStyle = Quill.import('attributors/style/size');
    SizeStyle.whitelist = ['10px', '18px', '32px', '50px', '64px'];
    Quill.register(SizeStyle, true);
    Quill.register(Link, true);
    Quill.register(Font, true);
    const icons = Quill.import('ui/icons');
const icon = `<svg xmlns="http://www.w3.org/2000/svg" width="17" viewBox="0 0 512 512" xml:space="preserve"><path fill="#010101" d="M276.75 1c4.51 3.23 9.2 6.04 12.97 9.77 29.7 29.45 59.15 59.14 88.85 88.6 4.98 4.93 7.13 10.37 7.12 17.32-.1 125.8-.09 251.6-.01 377.4 0 7.94-1.96 14.46-9.62 18.57-121.41.34-242.77.34-364.76.05A288.3 288.3 0 0 1 1 502c0-163.02 0-326.04.34-489.62C3.84 6.53 8.04 3.38 13 1c23.35 0 46.7 0 70.82.3 2.07.43 3.38.68 4.69.68h127.98c18.44.01 36.41.04 54.39-.03 1.7 0 3.41-.62 5.12-.95h.75M33.03 122.5v359.05h320.22V129.18h-76.18c-14.22-.01-19.8-5.68-19.8-20.09V33.31H33.02v89.19m256.29-27.36c.72.66 1.44 1.9 2.17 1.9 12.73.12 25.46.08 37.55.08L289.3 57.45v37.7z"/><path fill="#020202" d="M513 375.53c-4.68 7.99-11.52 10.51-20.21 10.25-13.15-.4-26.32-.1-39.48-.1h-5.58c5.49 8.28 10.7 15.74 15.46 23.47 6.06 9.82 1.14 21.65-9.96 24.27-6.7 1.59-12.45-.64-16.23-6.15a2608.6 2608.6 0 0 1-32.97-49.36c-3.57-5.48-3.39-11.54.17-16.98a3122.5 3122.5 0 0 1 32.39-48.56c5.22-7.65 14.67-9.35 21.95-4.45 7.63 5.12 9.6 14.26 4.5 22.33-4.75 7.54-9.8 14.9-15.11 22.95h33.64V225.19h-5.24c-19.49 0-38.97.11-58.46-.05-12.74-.1-20.12-13.15-13.84-24.14 3.12-5.46 8.14-7.71 14.18-7.73 26.15-.06 52.3-.04 78.45 0 7.1 0 12.47 3.05 16.01 9.64.33 57.44.33 114.8.33 172.62z"/><path fill="#111" d="M216.03 1.97C173.52 1.98 131 2 88.5 1.98a16 16 0 0 1-4.22-.68c43.4-.3 87.09-.3 131.24-.06.48.25.5.73.5.73z"/><path fill="#232323" d="M216.5 1.98c-.47 0-.5-.5-.5-.74C235.7 1 255.38 1 275.53 1c-1.24.33-2.94.95-4.65.95-17.98.07-35.95.04-54.39.03z"/><path fill="#040404" d="M148 321.42h153.5c14.25 0 19.96 5.71 19.96 19.97.01 19.17.03 38.33 0 57.5-.03 12.6-6.16 18.78-18.66 18.78H99.81c-12.42 0-18.75-6.34-18.76-18.73-.01-19.83-.02-39.66 0-59.5.02-11.47 6.4-17.93 17.95-18 16.17-.08 32.33-.02 49-.02m40.5 32.15h-75.16v31.84h175.7v-31.84H188.5z"/><path fill="#030303" d="m110 225.33 178.89-.03c11.98 0 19.25 9.95 15.74 21.44-2.05 6.71-7.5 10.57-15.14 10.57-63.63 0-127.25-.01-190.88-.07-12.03-.02-19.17-8.62-16.7-19.84 1.6-7.21 7.17-11.74 15.1-12.04 4.17-.16 8.33-.03 13-.03zm-24.12-36.19c-5.28-6.2-6.3-12.76-2.85-19.73 3.22-6.49 9.13-8.24 15.86-8.24 25.64.01 51.27-.06 76.91.04 13.07.04 20.66 10.44 16.33 22.08-2.25 6.06-6.63 9.76-13.08 9.8-27.97.18-55.94.2-83.9-.07-3.01-.03-6-2.36-9.27-3.88z"/></svg>`
    icons['link'] = icon;
    const editorContainer = this.shadowRoot.querySelector('#editor-container')
    const toolbarContainer = this.shadowRoot.querySelector('#toolbar-container')
    const myQuill = new Quill(editorContainer, {
      modules: {
        toolbar: {
          container: toolbarContainer,
        },
      },
      theme: 'snow'
    });
    const normalizeNative = (nativeRange) => {

      if (nativeRange) {
        const range = nativeRange;

        if (range.baseNode) {
          range.startContainer = nativeRange.baseNode;
          range.endContainer = nativeRange.focusNode;
          range.startOffset = nativeRange.baseOffset;
          range.endOffset = nativeRange.focusOffset;

          if (range.endOffset < range.startOffset) {
            range.startContainer = nativeRange.focusNode;
            range.endContainer = nativeRange.baseNode;
            range.startOffset = nativeRange.focusOffset;
            range.endOffset = nativeRange.baseOffset;
          }
        }

        if (range.startContainer) {
          return {
            start: { node: range.startContainer, offset: range.startOffset },
            end: { node: range.endContainer, offset: range.endOffset },
            native: range
          };
        }
      }

      return null
    };

    myQuill.selection.getNativeRange = () => {

      const dom = myQuill.root.getRootNode();
      const selection = getRange(dom);
      const range = normalizeNative(selection);

      return range;
    };
    let fromEditor = false;
    editorContainer.addEventListener("pointerup", (e) => {
      fromEditor = false;
    });
    editorContainer.addEventListener("pointerout", (e) => {
      fromEditor = false;
    });
    editorContainer.addEventListener("pointerdown", (e) => {
      fromEditor = true;
    });

    document.addEventListener("selectionchange", () => {
      if (fromEditor) {
        myQuill.selection.update()
      }
    });


    myQuill.on('text-change', () => {
      // keep quill data inside _data to communicate with Gradio
      document.querySelector("#rich-text-root")._data = myQuill.getContents()
    })
    return myQuill
  }
}

customElements.define('rich-text-editor', RichTextEditor);
share_btn.py
ADDED
@@ -0,0 +1,116 @@
community_icon_html = """<svg id="share-btn-share-icon" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32">
<path d="M20.6081 3C21.7684 3 22.8053 3.49196 23.5284 4.38415C23.9756 4.93678 24.4428 5.82749 24.4808 7.16133C24.9674 7.01707 25.4353 6.93643 25.8725 6.93643C26.9833 6.93643 27.9865 7.37587 28.696 8.17411C29.6075 9.19872 30.0124 10.4579 29.8361 11.7177C29.7523 12.3177 29.5581 12.8555 29.2678 13.3534C29.8798 13.8646 30.3306 14.5763 30.5485 15.4322C30.719 16.1032 30.8939 17.5006 29.9808 18.9403C30.0389 19.0342 30.0934 19.1319 30.1442 19.2318C30.6932 20.3074 30.7283 21.5229 30.2439 22.6548C29.5093 24.3704 27.6841 25.7219 24.1397 27.1727C21.9347 28.0753 19.9174 28.6523 19.8994 28.6575C16.9842 29.4379 14.3477 29.8345 12.0653 29.8345C7.87017 29.8345 4.8668 28.508 3.13831 25.8921C0.356375 21.6797 0.754104 17.8269 4.35369 14.1131C6.34591 12.058 7.67023 9.02782 7.94613 8.36275C8.50224 6.39343 9.97271 4.20438 12.4172 4.20438H12.4179C12.6236 4.20438 12.8314 4.2214 13.0364 4.25468C14.107 4.42854 15.0428 5.06476 15.7115 6.02205C16.4331 5.09583 17.134 4.359 17.7682 3.94323C18.7242 3.31737 19.6794 3 20.6081 3ZM20.6081 5.95917C20.2427 5.95917 19.7963 6.1197 19.3039 6.44225C17.7754 7.44319 14.8258 12.6772 13.7458 14.7131C13.3839 15.3952 12.7655 15.6837 12.2086 15.6837C11.1036 15.6837 10.2408 14.5497 12.1076 13.1085C14.9146 10.9402 13.9299 7.39584 12.5898 7.1776C12.5311 7.16799 12.4731 7.16355 12.4172 7.16355C11.1989 7.16355 10.6615 9.33114 10.6615 9.33114C10.6615 9.33114 9.0863 13.4148 6.38031 16.206C3.67434 18.998 3.5346 21.2388 5.50675 24.2246C6.85185 26.2606 9.42666 26.8753 12.0653 26.8753C14.8021 26.8753 17.6077 26.2139 19.1799 25.793C19.2574 25.7723 28.8193 22.984 27.6081 20.6107C27.4046 20.212 27.0693 20.0522 26.6471 20.0522C24.9416 20.0522 21.8393 22.6726 20.5057 22.6726C20.2076 22.6726 19.9976 22.5416 19.9116 22.222C19.3433 20.1173 28.552 19.2325 27.7758 16.1839C27.639 15.6445 27.2677 15.4256 26.746 15.4263C24.4923 15.4263 19.4358 19.5181 18.3759 19.5181C18.2949 19.5181 18.2368 19.4937 18.2053 19.4419C17.6743 18.557 17.9653 17.9394 21.7082 15.6009C25.4511 13.2617 28.0783 11.8545 26.5841 10.1752C26.4121 9.98141 26.1684 9.8956 25.8725 9.8956C23.6001 9.89634 18.2311 14.9403 18.2311 14.9403C18.2311 14.9403 16.7821 16.496 15.9057 16.496C15.7043 16.496 15.533 16.4139 15.4169 16.2112C14.7956 15.1296 21.1879 10.1286 21.5484 8.06535C21.7928 6.66715 21.3771 5.95917 20.6081 5.95917Z" fill="#FF9D00"></path>
<path d="M5.50686 24.2246C3.53472 21.2387 3.67446 18.9979 6.38043 16.206C9.08641 13.4147 10.6615 9.33111 10.6615 9.33111C10.6615 9.33111 11.2499 6.95933 12.59 7.17757C13.93 7.39581 14.9139 10.9401 12.1069 13.1084C9.29997 15.276 12.6659 16.7489 13.7459 14.713C14.8258 12.6772 17.7747 7.44316 19.304 6.44221C20.8326 5.44128 21.9089 6.00204 21.5484 8.06532C21.188 10.1286 14.795 15.1295 15.4171 16.2118C16.0391 17.2934 18.2312 14.9402 18.2312 14.9402C18.2312 14.9402 25.0907 8.49588 26.5842 10.1752C28.0776 11.8545 25.4512 13.2616 21.7082 15.6008C17.9646 17.9393 17.6744 18.557 18.2054 19.4418C18.7372 20.3266 26.9998 13.1351 27.7759 16.1838C28.5513 19.2324 19.3434 20.1173 19.9117 22.2219C20.48 24.3274 26.3979 18.2382 27.6082 20.6107C28.8193 22.9839 19.2574 25.7722 19.18 25.7929C16.0914 26.62 8.24723 28.3726 5.50686 24.2246Z" fill="#FFD21E"></path>
</svg>"""

loading_icon_html = """<svg id="share-btn-loading-icon" style="display:none;" class="animate-spin" style="color: #ffffff;" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" fill="none" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 24 24"><circle style="opacity: 0.25;" cx="12" cy="12" r="10" stroke="white" stroke-width="4"></circle><path style="opacity: 0.75;" fill="white" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"></path></svg>"""
|
7 |
+
|
8 |
+
share_js = """async () => {
|
9 |
+
async function uploadFile(file){
|
10 |
+
const UPLOAD_URL = 'https://huggingface.co/uploads';
|
11 |
+
const response = await fetch(UPLOAD_URL, {
|
12 |
+
method: 'POST',
|
13 |
+
headers: {
|
14 |
+
'Content-Type': file.type,
|
15 |
+
'X-Requested-With': 'XMLHttpRequest',
|
16 |
+
},
|
17 |
+
body: file, /// <- File inherits from Blob
|
18 |
+
});
|
19 |
+
const url = await response.text();
|
20 |
+
return url;
|
21 |
+
}
|
22 |
+
async function getInputImageFile(imageEl){
|
23 |
+
const res = await fetch(imageEl.src);
|
24 |
+
const blob = await res.blob();
|
25 |
+
const imageId = Date.now();
|
26 |
+
const fileName = `rich-text-image-${{imageId}}.png`;
|
27 |
+
return new File([blob], fileName, { type: 'image/png'});
|
28 |
+
}
|
29 |
+
const gradioEl = document.querySelector("gradio-app").shadowRoot || document.querySelector('body > gradio-app');
|
30 |
+
const richEl = document.getElementById("rich-text-root");
|
31 |
+
const data = richEl? richEl.contentDocument.body._data : {};
|
32 |
+
const text_input = JSON.stringify(data);
|
33 |
+
const negative_prompt = gradioEl.querySelector('#negative_prompt input').value;
|
34 |
+
const seed = gradioEl.querySelector('#seed input').value;
|
35 |
+
const richTextImg = gradioEl.querySelector('#rich-text-image img');
|
36 |
+
const plainTextImg = gradioEl.querySelector('#plain-text-image img');
|
37 |
+
const text_input_obj = JSON.parse(text_input);
|
38 |
+
const plain_prompt = text_input_obj.ops.map(e=> e.insert).join('');
|
39 |
+
const linkSrc = `https://huggingface.co/spaces/songweig/rich-text-to-image?prompt=${encodeURIComponent(text_input)}`;
|
40 |
+
|
41 |
+
const titleTxt = `RT2I: ${plain_prompt.slice(0, 50)}...`;
|
42 |
+
const shareBtnEl = gradioEl.querySelector('#share-btn');
|
43 |
+
const shareIconEl = gradioEl.querySelector('#share-btn-share-icon');
|
44 |
+
const loadingIconEl = gradioEl.querySelector('#share-btn-loading-icon');
|
45 |
+
if(!richTextImg){
|
46 |
+
return;
|
47 |
+
};
|
48 |
+
shareBtnEl.style.pointerEvents = 'none';
|
49 |
+
shareIconEl.style.display = 'none';
|
50 |
+
loadingIconEl.style.removeProperty('display');
|
51 |
+
|
52 |
+
const richImgFile = await getInputImageFile(richTextImg);
|
53 |
+
const plainImgFile = await getInputImageFile(plainTextImg);
|
54 |
+
const richImgURL = await uploadFile(richImgFile);
|
55 |
+
const plainImgURL = await uploadFile(plainImgFile);
|
56 |
+
|
57 |
+
const descriptionMd = `
|
58 |
+
### Plain Prompt
|
59 |
+
${plain_prompt}
|
60 |
+
|
61 |
+
🔗 Shareable Link + Params: [here](${linkSrc})
|
62 |
+
|
63 |
+
### Rich Tech Image
|
64 |
+
<img src="${richImgURL}">
|
65 |
+
|
66 |
+
### Plain Text Image
|
67 |
+
<img src="${plainImgURL}">
|
68 |
+
|
69 |
+
`;
|
70 |
+
const params = new URLSearchParams({
|
71 |
+
title: titleTxt,
|
72 |
+
description: descriptionMd,
|
73 |
+
});
|
74 |
+
const paramsStr = params.toString();
|
75 |
+
window.open(`https://huggingface.co/spaces/songweig/rich-text-to-image/discussions/new?${paramsStr}`, '_blank');
|
76 |
+
shareBtnEl.style.removeProperty('pointer-events');
|
77 |
+
shareIconEl.style.removeProperty('display');
|
78 |
+
loadingIconEl.style.display = 'none';
|
79 |
+
}"""
|
80 |
+
|
81 |
+
css = """
|
82 |
+
#share-btn-container {
|
83 |
+
display: flex;
|
84 |
+
padding-left: 0.5rem !important;
|
85 |
+
padding-right: 0.5rem !important;
|
86 |
+
background-color: #000000;
|
87 |
+
justify-content: center;
|
88 |
+
align-items: center;
|
89 |
+
border-radius: 9999px !important;
|
90 |
+
width: 13rem;
|
91 |
+
margin-top: 10px;
|
92 |
+
margin-left: auto;
|
93 |
+
flex: unset !important;
|
94 |
+
}
|
95 |
+
#share-btn {
|
96 |
+
all: initial;
|
97 |
+
color: #ffffff;
|
98 |
+
font-weight: 600;
|
99 |
+
cursor: pointer;
|
100 |
+
font-family: 'IBM Plex Sans', sans-serif;
|
101 |
+
margin-left: 0.5rem !important;
|
102 |
+
padding-top: 0.25rem !important;
|
103 |
+
padding-bottom: 0.25rem !important;
|
104 |
+
right:0;
|
105 |
+
}
|
106 |
+
#share-btn * {
|
107 |
+
all: unset !important;
|
108 |
+
}
|
109 |
+
#share-btn-container div:nth-child(-n+2){
|
110 |
+
width: auto !important;
|
111 |
+
min-height: 0px !important;
|
112 |
+
}
|
113 |
+
#share-btn-container .wrap {
|
114 |
+
display: none !important;
|
115 |
+
}
|
116 |
+
"""
|
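share_btn.py only defines static assets: the two SVG icon strings, the client-side share_js handler (which uploads the two result images to https://huggingface.co/uploads and opens a pre-filled community discussion), and the css above. For context, below is a minimal sketch of how such a share button is typically wired into a Gradio 3.x Blocks app. The export name community_icon_html, the component layout, and the element ids are assumptions based on the common Spaces share-button template, not a transcription of this Space's app.py.

# Illustrative wiring only -- not a copy of this Space's app.py.
import gradio as gr

# assumed export names, following the usual share_btn.py template
from share_btn import community_icon_html, loading_icon_html, share_js, css

with gr.Blocks(css=css) as demo:
    # ... rich-text editor, generation controls, etc. would go here ...
    rich_text_image = gr.Image(elem_id="rich-text-image")
    plain_text_image = gr.Image(elem_id="plain-text-image")
    with gr.Group(elem_id="share-btn-container"):
        gr.HTML(community_icon_html)   # hand icon, shown while idle
        gr.HTML(loading_icon_html)     # spinner, toggled by share_js
        share_button = gr.Button("Share to community", elem_id="share-btn")
    # share_js runs entirely in the browser, so no Python callback is needed
    share_button.click(None, [], [], _js=share_js)

demo.launch()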
utils/.DS_Store
ADDED
Binary file (6.15 kB).
utils/attention_utils.py
ADDED
@@ -0,0 +1,318 @@
import numpy as np
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torchvision

from utils.richtext_utils import seed_everything
from sklearn.cluster import SpectralClustering

SelfAttentionLayers = [
    'down_blocks.0.attentions.0.transformer_blocks.0.attn1',
    'down_blocks.0.attentions.1.transformer_blocks.0.attn1',
    'down_blocks.1.attentions.0.transformer_blocks.0.attn1',
    'down_blocks.1.attentions.1.transformer_blocks.0.attn1',
    'down_blocks.2.attentions.0.transformer_blocks.0.attn1',
    'down_blocks.2.attentions.1.transformer_blocks.0.attn1',
    'mid_block.attentions.0.transformer_blocks.0.attn1',
    'up_blocks.1.attentions.0.transformer_blocks.0.attn1',
    'up_blocks.1.attentions.1.transformer_blocks.0.attn1',
    'up_blocks.1.attentions.2.transformer_blocks.0.attn1',
    'up_blocks.2.attentions.0.transformer_blocks.0.attn1',
    'up_blocks.2.attentions.1.transformer_blocks.0.attn1',
    'up_blocks.2.attentions.2.transformer_blocks.0.attn1',
    'up_blocks.3.attentions.0.transformer_blocks.0.attn1',
    'up_blocks.3.attentions.1.transformer_blocks.0.attn1',
    'up_blocks.3.attentions.2.transformer_blocks.0.attn1',
]


CrossAttentionLayers = [
    # 'down_blocks.0.attentions.0.transformer_blocks.0.attn2',
    # 'down_blocks.0.attentions.1.transformer_blocks.0.attn2',
    'down_blocks.1.attentions.0.transformer_blocks.0.attn2',
    # 'down_blocks.1.attentions.1.transformer_blocks.0.attn2',
    'down_blocks.2.attentions.0.transformer_blocks.0.attn2',
    'down_blocks.2.attentions.1.transformer_blocks.0.attn2',
    'mid_block.attentions.0.transformer_blocks.0.attn2',
    'up_blocks.1.attentions.0.transformer_blocks.0.attn2',
    'up_blocks.1.attentions.1.transformer_blocks.0.attn2',
    'up_blocks.1.attentions.2.transformer_blocks.0.attn2',
    # 'up_blocks.2.attentions.0.transformer_blocks.0.attn2',
    'up_blocks.2.attentions.1.transformer_blocks.0.attn2',
    # 'up_blocks.2.attentions.2.transformer_blocks.0.attn2',
    # 'up_blocks.3.attentions.0.transformer_blocks.0.attn2',
    # 'up_blocks.3.attentions.1.transformer_blocks.0.attn2',
    # 'up_blocks.3.attentions.2.transformer_blocks.0.attn2'
]


def split_attention_maps_over_steps(attention_maps):
    r"""Function for splitting attention maps over steps.
    Args:
        attention_maps (dict): Dictionary of attention maps.
        sampler_order (int): Order of the sampler.
    """
    # This function splits attention maps into unconditional and conditional score and over steps

    attention_maps_cond = dict()  # Maps corresponding to conditional score
    attention_maps_uncond = dict()  # Maps corresponding to unconditional score

    for layer in attention_maps.keys():

        for step_num in range(len(attention_maps[layer])):
            if step_num not in attention_maps_cond:
                attention_maps_cond[step_num] = dict()
                attention_maps_uncond[step_num] = dict()

            attention_maps_uncond[step_num].update(
                {layer: attention_maps[layer][step_num][:1]})
            attention_maps_cond[step_num].update(
                {layer: attention_maps[layer][step_num][1:2]})

    return attention_maps_cond, attention_maps_uncond


def plot_attention_maps(atten_map_list, obj_tokens, save_dir, seed, tokens_vis=None):
    atten_names = ['presoftmax', 'postsoftmax', 'postsoftmax_erosion']
    for i, attn_map in enumerate(atten_map_list):
        n_obj = len(attn_map)
        plt.figure()
        plt.clf()

        fig, axs = plt.subplots(
            ncols=n_obj+1, gridspec_kw=dict(width_ratios=[1 for _ in range(n_obj)]+[0.1]))

        fig.set_figheight(3)
        fig.set_figwidth(3*n_obj+0.1)

        cmap = plt.get_cmap('OrRd')

        vmax = 0
        vmin = 1
        for tid in range(n_obj):
            attention_map_cur = attn_map[tid]
            vmax = max(vmax, float(attention_map_cur.max()))
            vmin = min(vmin, float(attention_map_cur.min()))

        for tid in range(n_obj):
            sns.heatmap(
                attn_map[tid][0], annot=False, cbar=False, ax=axs[tid],
                cmap=cmap, vmin=vmin, vmax=vmax
            )
            axs[tid].set_axis_off()

            if tokens_vis is not None:
                if tid == n_obj-1:
                    axs_xlabel = 'other tokens'
                else:
                    axs_xlabel = ''
                    for token_id in obj_tokens[tid]:
                        axs_xlabel += ' ' + tokens_vis[token_id.item() -
                                                       1][:-len('</w>')]
                axs[tid].set_title(axs_xlabel)

        norm = mpl.colors.Normalize(vmin=vmin, vmax=vmax)
        sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
        fig.colorbar(sm, cax=axs[-1])
        canvas = fig.canvas
        canvas.draw()
        width, height = canvas.get_width_height()
        img = np.frombuffer(canvas.tostring_rgb(),
                            dtype='uint8').reshape((height, width, 3))

        fig.tight_layout()
        plt.close()
    return img


def get_token_maps_deprecated(attention_maps, save_dir, width, height, obj_tokens, seed=0, tokens_vis=None):
    r"""Function to visualize attention maps.
    Args:
        save_dir (str): Path to save attention maps
        batch_size (int): Batch size
        sampler_order (int): Sampler order
    """

    # Split attention maps over steps
    attention_maps_cond, _ = split_attention_maps_over_steps(
        attention_maps
    )

    nsteps = len(attention_maps_cond)
    hw_ori = width * height

    attention_maps = []
    for obj_token in obj_tokens:
        attention_maps.append([])

    for step_num in range(nsteps):
        attention_maps_cur = attention_maps_cond[step_num]

        for layer in attention_maps_cur.keys():
            if step_num < 10 or layer not in CrossAttentionLayers:
                continue

            attention_ind = attention_maps_cur[layer].cpu()

            # Attention maps are of shape [batch_size, nkeys, 77]
            # since they are averaged out while collecting from hooks to save memory.
            # Now split the heads from batch dimension
            bs, hw, nclip = attention_ind.shape
            down_ratio = np.sqrt(hw_ori // hw)
            width_cur = int(width // down_ratio)
            height_cur = int(height // down_ratio)
            attention_ind = attention_ind.reshape(
                bs, height_cur, width_cur, nclip)
            for obj_id, obj_token in enumerate(obj_tokens):
                if obj_token[0] == -1:
                    attention_map_prev = torch.stack(
                        [attention_maps[i][-1] for i in range(obj_id)]).sum(0)
                    attention_maps[obj_id].append(
                        attention_map_prev.max()-attention_map_prev)
                else:
                    obj_attention_map = attention_ind[:, :, :, obj_token].max(-1, True)[
                        0].permute([3, 0, 1, 2])
                    obj_attention_map = torchvision.transforms.functional.resize(obj_attention_map, (height, width),
                                                                                 interpolation=torchvision.transforms.InterpolationMode.BICUBIC, antialias=True)
                    attention_maps[obj_id].append(obj_attention_map)

    # average attention maps over steps
    attention_maps_averaged = []
    for obj_id, obj_token in enumerate(obj_tokens):
        if obj_id == len(obj_tokens) - 1:
            attention_maps_averaged.append(
                torch.cat(attention_maps[obj_id]).mean(0))
        else:
            attention_maps_averaged.append(
                torch.cat(attention_maps[obj_id]).mean(0))

    # normalize attention maps into [0, 1]
    attention_maps_averaged_normalized = []
    attention_maps_averaged_sum = torch.cat(attention_maps_averaged).sum(0)
    for obj_id, obj_token in enumerate(obj_tokens):
        attention_maps_averaged_normalized.append(
            attention_maps_averaged[obj_id]/attention_maps_averaged_sum)

    # softmax
    attention_maps_averaged_normalized = (
        torch.cat(attention_maps_averaged)/0.001).softmax(0)
    attention_maps_averaged_normalized = [
        attention_maps_averaged_normalized[i:i+1] for i in range(attention_maps_averaged_normalized.shape[0])]

    token_maps_vis = plot_attention_maps([attention_maps_averaged, attention_maps_averaged_normalized],
                                         obj_tokens, save_dir, seed, tokens_vis)
    attention_maps_averaged_normalized = [attn_mask.unsqueeze(1).repeat(
        [1, 4, 1, 1]).cuda() for attn_mask in attention_maps_averaged_normalized]
    return attention_maps_averaged_normalized, token_maps_vis


def get_token_maps(selfattn_maps, crossattn_maps, n_maps, save_dir, width, height, obj_tokens, seed=0, tokens_vis=None,
                   preprocess=False, segment_threshold=0.3, num_segments=5, return_vis=False, save_attn=False):
    r"""Function to visualize attention maps.
    Args:
        save_dir (str): Path to save attention maps
        batch_size (int): Batch size
        sampler_order (int): Sampler order
    """

    # create the segmentation mask using self-attention maps
    resolution = 32
    attn_maps_1024 = {8: [], 16: [], 32: [], 64: []}
    for attn_map in selfattn_maps.values():
        resolution_map = np.sqrt(attn_map.shape[1]).astype(int)
        if resolution_map != resolution:
            continue
        attn_map = attn_map.reshape(
            1, resolution_map, resolution_map, resolution_map**2).permute([3, 0, 1, 2])
        attn_map = torch.nn.functional.interpolate(attn_map, (resolution, resolution),
                                                   mode='bicubic', antialias=True)
        attn_maps_1024[resolution_map].append(attn_map.permute([1, 2, 3, 0]).reshape(
            1, resolution**2, resolution_map**2))
    attn_maps_1024 = torch.cat([torch.cat(v).mean(0).cpu()
                                for v in attn_maps_1024.values() if len(v) > 0], -1).numpy()
    if save_attn:
        print('saving self-attention maps...', attn_maps_1024.shape)
        torch.save(torch.from_numpy(attn_maps_1024),
                   'results/maps/selfattn_maps.pth')
    seed_everything(seed)
    sc = SpectralClustering(num_segments, affinity='precomputed', n_init=100,
                            assign_labels='kmeans')
    clusters = sc.fit_predict(attn_maps_1024)
    clusters = clusters.reshape(resolution, resolution)
    fig = plt.figure()
    plt.imshow(clusters)
    plt.axis('off')
    if return_vis:
        canvas = fig.canvas
        canvas.draw()
        cav_width, cav_height = canvas.get_width_height()
        segments_vis = np.frombuffer(canvas.tostring_rgb(),
                                     dtype='uint8').reshape((cav_height, cav_width, 3))

    plt.close()

    # label the segmentation mask using cross-attention maps
    cross_attn_maps_1024 = []
    for attn_map in crossattn_maps.values():
        resolution_map = np.sqrt(attn_map.shape[1]).astype(int)
        attn_map = attn_map.reshape(
            1, resolution_map, resolution_map, -1).permute([0, 3, 1, 2])
        attn_map = torch.nn.functional.interpolate(attn_map, (resolution, resolution),
                                                   mode='bicubic', antialias=True)
        cross_attn_maps_1024.append(attn_map.permute([0, 2, 3, 1]))

    cross_attn_maps_1024 = torch.cat(
        cross_attn_maps_1024).mean(0).cpu().numpy()
    if save_attn:
        print('saving cross-attention maps...', cross_attn_maps_1024.shape)
        torch.save(torch.from_numpy(cross_attn_maps_1024),
                   'results/maps/crossattn_maps.pth')
    normalized_span_maps = []
    for token_ids in obj_tokens:
        span_token_maps = cross_attn_maps_1024[:, :, token_ids.numpy()]
        normalized_span_map = np.zeros_like(span_token_maps)
        for i in range(span_token_maps.shape[-1]):
            curr_noun_map = span_token_maps[:, :, i]
            normalized_span_map[:, :, i] = (
                curr_noun_map - np.abs(curr_noun_map.min())) / curr_noun_map.max()
        normalized_span_maps.append(normalized_span_map)
    foreground_token_maps = [np.zeros([clusters.shape[0], clusters.shape[1]]).squeeze(
    ) for normalized_span_map in normalized_span_maps]
    background_map = np.zeros([clusters.shape[0], clusters.shape[1]]).squeeze()
    for c in range(num_segments):
        cluster_mask = np.zeros_like(clusters)
        cluster_mask[clusters == c] = 1.
        is_foreground = False
        for normalized_span_map, foreground_nouns_map, token_ids in zip(normalized_span_maps, foreground_token_maps, obj_tokens):
            score_maps = [cluster_mask * normalized_span_map[:, :, i]
                          for i in range(len(token_ids))]
            scores = [score_map.sum() / cluster_mask.sum()
                      for score_map in score_maps]
            if max(scores) > segment_threshold:
                foreground_nouns_map += cluster_mask
                is_foreground = True
        if not is_foreground:
            background_map += cluster_mask
    foreground_token_maps.append(background_map)

    # resize the token maps and visualization
    resized_token_maps = torch.cat([torch.nn.functional.interpolate(torch.from_numpy(token_map).unsqueeze(0).unsqueeze(
        0), (height, width), mode='bicubic', antialias=True)[0] for token_map in foreground_token_maps]).clamp(0, 1)

    resized_token_maps = resized_token_maps / \
        (resized_token_maps.sum(0, True)+1e-8)
    resized_token_maps = [token_map.unsqueeze(
        0) for token_map in resized_token_maps]
    foreground_token_maps = [token_map[None, :, :]
                             for token_map in foreground_token_maps]
    token_maps_vis = plot_attention_maps([foreground_token_maps, resized_token_maps], obj_tokens,
                                         save_dir, seed, tokens_vis)
    resized_token_maps = [token_map.unsqueeze(1).repeat(
        [1, 4, 1, 1]).to(attn_map.dtype).cuda() for token_map in resized_token_maps]
    if return_vis:
        return resized_token_maps, segments_vis, token_maps_vis
    else:
        return resized_token_maps
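get_token_maps is the core of the region segmentation: it spectrally clusters the averaged 32x32 self-attention affinities into num_segments regions, then assigns each region to whichever rich-text span scores highest under the cross-attention maps, with everything below segment_threshold collected into a background map. Below is a minimal sketch of a call. The random tensors, layer names, and 64x64 output size are placeholders for the maps that are presumably collected via hooks on the UNet attention layers (as the comments above suggest), and the call needs a CUDA device because the returned masks are moved to the GPU.

# Illustrative call only -- real maps come from the UNet attention hooks.
import torch
from utils.attention_utils import get_token_maps

hw = 32 * 32  # only 32x32 self-attention maps are used for clustering
selfattn_maps = {
    'up_blocks.2.attentions.1.transformer_blocks.0.attn1': torch.rand(1, hw, hw)}
crossattn_maps = {
    'up_blocks.1.attentions.0.transformer_blocks.0.attn2': torch.rand(1, 16 * 16, 77)}
# one tensor of (1-based) prompt token indices per rich-text span
obj_tokens = [torch.LongTensor([1, 2]), torch.LongTensor([5])]

token_maps = get_token_maps(
    selfattn_maps, crossattn_maps, n_maps=len(obj_tokens) + 1,
    save_dir='results', width=64, height=64,  # assumed latent resolution
    obj_tokens=obj_tokens, seed=0,
    segment_threshold=0.3, num_segments=5)  # tokens_vis omitted, so no token labels are drawn
# token_maps[i] is a [1, 4, 64, 64] soft mask on the GPU for span i;
# the last entry covers the clusters not claimed by any span (background).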
utils/richtext_utils.py
ADDED
@@ -0,0 +1,234 @@
import os
import json
import torch
import random
import numpy as np

COLORS = {
    'brown': [165, 42, 42],
    'red': [255, 0, 0],
    'pink': [253, 108, 158],
    'orange': [255, 165, 0],
    'yellow': [255, 255, 0],
    'purple': [128, 0, 128],
    'green': [0, 128, 0],
    'blue': [0, 0, 255],
    'white': [255, 255, 255],
    'gray': [128, 128, 128],
    'black': [0, 0, 0],
}


def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)


def hex_to_rgb(hex_string, return_nearest_color=False):
    r"""
    Convert a hex triplet to an RGB triplet.
    """
    # Remove '#' symbol if present
    hex_string = hex_string.lstrip('#')
    # Convert hex values to integers
    red = int(hex_string[0:2], 16)
    green = int(hex_string[2:4], 16)
    blue = int(hex_string[4:6], 16)
    rgb = torch.FloatTensor((red, green, blue))[None, :, None, None]/255.
    if return_nearest_color:
        nearest_color = find_nearest_color(rgb)
        return rgb.cuda(), nearest_color
    return rgb.cuda()


def find_nearest_color(rgb):
    r"""
    Find the nearest neighbor color given the RGB value.
    """
    if isinstance(rgb, list) or isinstance(rgb, tuple):
        rgb = torch.FloatTensor(rgb)[None, :, None, None]/255.
    color_distance = torch.FloatTensor([np.linalg.norm(
        rgb - torch.FloatTensor(COLORS[color])[None, :, None, None]/255.) for color in COLORS.keys()])
    nearest_color = list(COLORS.keys())[torch.argmin(color_distance).item()]
    return nearest_color


def font2style(font):
    r"""
    Convert the font name to the style name.
    """
    return {'mirza': 'Claud Monet, impressionism, oil on canvas',
            'roboto': 'Ukiyoe',
            'cursive': 'Cyber Punk, futuristic, blade runner, william gibson, trending on artstation hq',
            'sofia': 'Pop Art, masterpiece, andy warhol',
            'slabo': 'Vincent Van Gogh',
            'inconsolata': 'Pixel Art, 8 bits, 16 bits',
            'ubuntu': 'Rembrandt',
            'Monoton': 'neon art, colorful light, highly details, octane render',
            'Akronim': 'Abstract Cubism, Pablo Picasso', }[font]


def parse_json(json_str):
    r"""
    Convert the JSON string to attributes.
    """
    # initialize region-based attributes.
    base_text_prompt = ''
    style_text_prompts = []
    footnote_text_prompts = []
    footnote_target_tokens = []
    color_text_prompts = []
    color_rgbs = []
    color_names = []
    size_text_prompts_and_sizes = []

    # parse the attributes from JSON.
    prev_style = None
    prev_color_rgb = None
    use_grad_guidance = False
    for span in json_str['ops']:
        text_prompt = span['insert'].rstrip('\n')
        base_text_prompt += span['insert'].rstrip('\n')
        if text_prompt == ' ':
            continue
        if 'attributes' in span:
            if 'font' in span['attributes']:
                style = font2style(span['attributes']['font'])
                if prev_style == style:
                    prev_text_prompt = style_text_prompts[-1].split('in the style of')[
                        0]
                    style_text_prompts[-1] = prev_text_prompt + \
                        ' ' + text_prompt + f' in the style of {style}'
                else:
                    style_text_prompts.append(
                        text_prompt + f' in the style of {style}')
                prev_style = style
            else:
                prev_style = None
            if 'link' in span['attributes']:
                footnote_text_prompts.append(span['attributes']['link'])
                footnote_target_tokens.append(text_prompt)
            font_size = 1
            if 'size' in span['attributes'] and 'strike' not in span['attributes']:
                font_size = float(span['attributes']['size'][:-2])/3.
            elif 'size' in span['attributes'] and 'strike' in span['attributes']:
                font_size = -float(span['attributes']['size'][:-2])/3.
            elif 'size' not in span['attributes'] and 'strike' not in span['attributes']:
                font_size = 1
            if 'color' in span['attributes']:
                use_grad_guidance = True
                color_rgb, nearest_color = hex_to_rgb(
                    span['attributes']['color'], True)
                if prev_color_rgb == color_rgb:
                    prev_text_prompt = color_text_prompts[-1]
                    color_text_prompts[-1] = prev_text_prompt + \
                        ' ' + text_prompt
                else:
                    color_rgbs.append(color_rgb)
                    color_names.append(nearest_color)
                    color_text_prompts.append(text_prompt)
            if font_size != 1:
                size_text_prompts_and_sizes.append([text_prompt, font_size])
    return base_text_prompt, style_text_prompts, footnote_text_prompts, footnote_target_tokens,\
        color_text_prompts, color_names, color_rgbs, size_text_prompts_and_sizes, use_grad_guidance


def get_region_diffusion_input(model, base_text_prompt, style_text_prompts, footnote_text_prompts,
                               footnote_target_tokens, color_text_prompts, color_names):
    r"""
    Algorithm 1 in the paper.
    """
    region_text_prompts = []
    region_target_token_ids = []
    base_tokens = model.tokenizer._tokenize(base_text_prompt)
    # process the style text prompt
    for text_prompt in style_text_prompts:
        region_text_prompts.append(text_prompt)
        region_target_token_ids.append([])
        style_tokens = model.tokenizer._tokenize(
            text_prompt.split('in the style of')[0])
        for style_token in style_tokens:
            region_target_token_ids[-1].append(
                base_tokens.index(style_token)+1)

    # process the complementary text prompt
    for footnote_text_prompt, text_prompt in zip(footnote_text_prompts, footnote_target_tokens):
        region_target_token_ids.append([])
        region_text_prompts.append(footnote_text_prompt)
        style_tokens = model.tokenizer._tokenize(text_prompt)
        for style_token in style_tokens:
            region_target_token_ids[-1].append(
                base_tokens.index(style_token)+1)

    # process the color text prompt
    for color_text_prompt, color_name in zip(color_text_prompts, color_names):
        region_target_token_ids.append([])
        region_text_prompts.append(color_name+' '+color_text_prompt)
        style_tokens = model.tokenizer._tokenize(color_text_prompt)
        for style_token in style_tokens:
            region_target_token_ids[-1].append(
                base_tokens.index(style_token)+1)

    # process the remaining tokens without any attributes
    region_text_prompts.append(base_text_prompt)
    region_target_token_ids_all = [
        id for ids in region_target_token_ids for id in ids]
    target_token_ids_rest = [id for id in range(
        1, len(base_tokens)+1) if id not in region_target_token_ids_all]
    region_target_token_ids.append(target_token_ids_rest)

    region_target_token_ids = [torch.LongTensor(
        obj_token_id) for obj_token_id in region_target_token_ids]
    return region_text_prompts, region_target_token_ids, base_tokens


def get_attention_control_input(model, base_tokens, size_text_prompts_and_sizes):
    r"""
    Control the token impact using font sizes.
    """
    word_pos = []
    font_sizes = []
    for text_prompt, font_size in size_text_prompts_and_sizes:
        size_tokens = model.tokenizer._tokenize(text_prompt)
        for size_token in size_tokens:
            word_pos.append(base_tokens.index(size_token)+1)
            font_sizes.append(font_size)
    if len(word_pos) > 0:
        word_pos = torch.LongTensor(word_pos).cuda()
        font_sizes = torch.FloatTensor(font_sizes).cuda()
    else:
        word_pos = None
        font_sizes = None
    text_format_dict = {
        'word_pos': word_pos,
        'font_size': font_sizes,
    }
    return text_format_dict


def get_gradient_guidance_input(model, base_tokens, color_text_prompts, color_rgbs, text_format_dict,
                                guidance_start_step=999, color_guidance_weight=1):
    r"""
    Set up the target RGB values and token ids for color gradient guidance.
    """
    color_target_token_ids = []
    for text_prompt in color_text_prompts:
        color_target_token_ids.append([])
        color_tokens = model.tokenizer._tokenize(text_prompt)
        for color_token in color_tokens:
            color_target_token_ids[-1].append(base_tokens.index(color_token)+1)
    color_target_token_ids_all = [
        id for ids in color_target_token_ids for id in ids]
    color_target_token_ids_rest = [id for id in range(
        1, len(base_tokens)+1) if id not in color_target_token_ids_all]
    color_target_token_ids.append(color_target_token_ids_rest)
    color_target_token_ids = [torch.LongTensor(
        obj_token_id) for obj_token_id in color_target_token_ids]

    text_format_dict['target_RGB'] = color_rgbs
    text_format_dict['guidance_start_step'] = guidance_start_step
    text_format_dict['color_guidance_weight'] = color_guidance_weight
    return text_format_dict, color_target_token_ids
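richtext_utils.py expects the rich-text prompt as a Quill Delta object (a dict with an 'ops' list), which is what the editor in rich-text-to-json-iframe.html produces and what share_js serializes above. Below is a minimal sketch of the parsing step with a made-up prompt; the call needs a CUDA device because hex_to_rgb moves the target color tensor to the GPU.

# Illustrative input only -- the real JSON comes from the Quill editor in the web UI.
from utils.richtext_utils import parse_json

rich_text_json = {
    'ops': [
        {'insert': 'a garden with a '},
        # color attribute -> gradient guidance toward that RGB value
        {'insert': 'rose bush', 'attributes': {'color': '#fd6c9e'}},
        {'insert': ' next to a pond, '},
        # font attribute -> a regional style prompt via font2style()
        {'insert': 'impressionist sky', 'attributes': {'font': 'mirza'}},
    ]
}

(base_text_prompt, style_text_prompts, footnote_text_prompts,
 footnote_target_tokens, color_text_prompts, color_names, color_rgbs,
 size_text_prompts_and_sizes, use_grad_guidance) = parse_json(rich_text_json)

print(base_text_prompt)     # plain prompt with all formatting stripped
print(style_text_prompts)   # ['impressionist sky in the style of Claud Monet, ...']
print(color_text_prompts, color_names)   # ['rose bush'] ['pink']
print(use_grad_guidance)    # True, because a color attribute is present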