Tony Lian committed
Commit: 9668cda
Parent(s): 1f39cf9

Add examples, adjustable number of steps, and custom template
Files changed:
- app.py +41 -16
- baseline.py +1 -2
- examples.py +26 -0
- generation.py +3 -4
- models/pipelines.py +0 -21
app.py CHANGED

@@ -8,6 +8,7 @@ from utils.parse import filter_boxes
 from generation import run as run_ours
 from baseline import run as run_baseline
 import torch
+from examples import stage1_examples, stage2_examples
 
 print(f"Is CUDA available: {torch.cuda.is_available()}")
 if torch.cuda.is_available():
@@ -18,7 +19,7 @@ size = box_scale
 
 bg_prompt_text = "Background prompt: "
 
-
+default_template = """You are an intelligent bounding box generator. I will provide you with a caption for a photo, image, or painting. Your task is to generate the bounding boxes for the objects mentioned in the caption, along with a background prompt describing the scene. The images are of size 512x512, and the bounding boxes should not overlap or go beyond the image boundaries. Each bounding box should be in the format of (object name, [top-left x coordinate, top-left y coordinate, box width, box height]) and include exactly one object. Make the boxes larger if possible. Do not put objects that are already provided in the bounding boxes into the background prompt. If needed, you can make reasonable guesses. Generate the object descriptions and background prompts in English even if the caption might not be in English. Do not include non-existing or excluded objects in the background prompt. Please refer to the example below for the desired format.
 
 Caption: A realistic image of landscape scene depicting a green car parking on the left of a blue truck, with a red air balloon and a bird in the sky
 Objects: [('a green car', [21, 181, 211, 159]), ('a blue truck', [269, 181, 209, 160]), ('a red air balloon', [66, 8, 145, 135]), ('a bird', [296, 42, 143, 100])]
@@ -46,7 +47,9 @@ Background prompt: A realistic image of a park with flowers
 
 Caption: 一个客厅场景的油画,墙上挂着电视,电视下面是一个柜子,柜子上有一个花瓶。
 Objects: [('a tv', [88, 85, 335, 203]), ('a cabinet', [57, 308, 404, 201]), ('a flower vase', [166, 222, 92, 108])]
-Background prompt: An oil painting of a living room scene
+Background prompt: An oil painting of a living room scene"""
+
+simplified_prompt = """{template}
 
 Caption: {prompt}
 Objects: """
@@ -57,10 +60,12 @@ layout_placeholder = """Caption: A realistic photo of a gray cat and an orange d
 Objects: [('a gray cat', [67, 243, 120, 126]), ('an orange dog', [265, 193, 190, 210])]
 Background prompt: A realistic photo of a grassy area."""
 
-def get_lmd_prompt(prompt):
+def get_lmd_prompt(prompt, template=""):
     if prompt == "":
         prompt = prompt_placeholder
-
+    if template == "":
+        template = default_template
+    return simplified_prompt.format(template=template, prompt=prompt)
 
 def get_layout_image(response):
     if response == "":
@@ -82,7 +87,7 @@ def get_layout_image(response):
 def get_layout_image_gallery(response):
     return [get_layout_image(response)]
 
-def get_ours_image(response, seed, fg_seed_start, fg_blending_ratio=0.1, frozen_step_ratio=0.4, gligen_scheduled_sampling_beta=0.3, show_so_imgs=False, scale_boxes=False):
+def get_ours_image(response, seed, num_inference_steps, fg_seed_start, fg_blending_ratio=0.1, frozen_step_ratio=0.4, gligen_scheduled_sampling_beta=0.3, show_so_imgs=False, scale_boxes=False):
     if response == "":
         response = layout_placeholder
     gen_boxes, bg_prompt = parse_input(response)
@@ -96,7 +101,7 @@ def get_ours_image(response, seed, fg_seed_start, fg_blending_ratio=0.1, frozen_
     image_np, so_img_list = run_ours(
         spec, bg_seed=seed, fg_seed_start=fg_seed_start,
         fg_blending_ratio=fg_blending_ratio,frozen_step_ratio=frozen_step_ratio,
-        gligen_scheduled_sampling_beta=gligen_scheduled_sampling_beta)
+        gligen_scheduled_sampling_beta=gligen_scheduled_sampling_beta, num_inference_steps=num_inference_steps)
     images = [image_np]
     if show_so_imgs:
         images.extend([np.asarray(so_img) for so_img in so_img_list])
@@ -177,27 +182,37 @@ def show_boxes(gen_boxes, bg_prompt=None):
 
 duplicate_html = '<a style="display:inline-block" href="https://huggingface.co/spaces/longlian/llm-grounded-diffusion?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a>'
 
-with gr.Blocks(
-    title="LLM-grounded Diffusion: Enhancing Prompt Understanding of Text-to-Image Diffusion Models with Large Language Models"
-) as g:
-    gr.HTML(f"""<h1>LLM-grounded Diffusion: Enhancing Prompt Understanding of Text-to-Image Diffusion Models with Large Language Models</h1>
+html = f"""<h1>LLM-grounded Diffusion: Enhancing Prompt Understanding of Text-to-Image Diffusion Models with Large Language Models</h1>
 <h2>LLM + Stable Diffusion => better prompt understanding in text2image generation 🤩</h2>
 <h2><a href='https://llm-grounded-diffusion.github.io/'>Project Page</a> | <a href='https://bair.berkeley.edu/blog/2023/05/23/lmd/'>5-minute Blog Post</a> | <a href='https://arxiv.org/pdf/2305.13655.pdf'>ArXiv Paper</a> | <a href='https://github.com/TonyLianLong/LLM-groundedDiffusion'>Github</a> | <a href='https://llm-grounded-diffusion.github.io/#citation'>Cite our work</a> if our ideas inspire you.</h2>
 <p><b>Tips:</b><p>
 <p>1. If ChatGPT doesn't generate layout, add/remove the trailing space (added by default) and/or use GPT-4.</p>
 <p>2. You can perform multi-round specification by giving ChatGPT follow-up requests (e.g., make the object boxes bigger).</p>
-<p>3. You can also try prompts in Simplified Chinese. If you want to try prompts in another language, translate the first line of last example to your language
-<p>4.
+<p>3. You can also try prompts in Simplified Chinese. If you want to try prompts in another language, translate the first line of last example to your language.</p>
+<p>4. The diffusion model only runs 20 steps by default. You can make it run 50 steps to get higher quality images (or tweak frozen steps/guidance steps for better guidance and coherence).</p>
+<p>5. Duplicate this space and add GPU to skip the queue and run our model faster. {duplicate_html}</p>
 <br/>
-<p>Implementation note: In this demo, we replace the attention manipulation in our layout-guided Stable Diffusion described in our paper with GLIGEN due to much faster inference speed (<b>FlashAttention supported, no backprop needed</b> during inference). Compared to vanilla GLIGEN, we have better coherence. Other parts of text-to-image pipeline, including single object generation and SAM, remain the same. The settings and examples in the prompt are simplified in this demo.</p>"""
+<p>Implementation note: In this demo, we replace the attention manipulation in our layout-guided Stable Diffusion described in our paper with GLIGEN due to much faster inference speed (<b>FlashAttention supported, no backprop needed</b> during inference). Compared to vanilla GLIGEN, we have better coherence. Other parts of text-to-image pipeline, including single object generation and SAM, remain the same. The settings and examples in the prompt are simplified in this demo.</p>"""
+
+with gr.Blocks(
+    title="LLM-grounded Diffusion: Enhancing Prompt Understanding of Text-to-Image Diffusion Models with Large Language Models"
+) as g:
+    gr.HTML(html)
     with gr.Tab("Stage 1. Image Prompt to ChatGPT"):
         with gr.Row():
             with gr.Column(scale=1):
                 prompt = gr.Textbox(lines=2, label="Prompt for Layout Generation", placeholder=prompt_placeholder)
-                generate_btn = gr.Button("Generate Prompt")
+                generate_btn = gr.Button("Generate Prompt", variant='primary')
+                with gr.Accordion("Advanced options", open=False):
+                    template = gr.Textbox(lines=10, label="Custom Template", placeholder="Customized Template", value=default_template)
            with gr.Column(scale=1):
                 output = gr.Textbox(label="Paste this into ChatGPT (GPT-4 preferred; on Mac, click text and press Command+A and Command+C to copy all)")
-        generate_btn.click(fn=get_lmd_prompt, inputs=prompt, outputs=output, api_name="get_lmd_prompt")
+        generate_btn.click(fn=get_lmd_prompt, inputs=[prompt, template], outputs=output, api_name="get_lmd_prompt")
+
+        gr.Examples(
+            stage1_examples,
+            [prompt]
+        )
 
     # with gr.Tab("(Optional) Visualize ChatGPT-generated Layout"):
     #     with gr.Row():
@@ -216,6 +231,7 @@ with gr.Blocks(
                 generate_btn = gr.Button("Generate Image from Layout", variant='primary')
                 with gr.Accordion("Advanced options", open=False):
                     seed = gr.Slider(0, 10000, value=0, step=1, label="Seed")
+                    num_inference_steps = gr.Slider(1, 50, value=20, step=1, label="Number of inference steps")
                     fg_seed_start = gr.Slider(0, 10000, value=20, step=1, label="Seed for foreground variation")
                     fg_blending_ratio = gr.Slider(0, 1, value=0.1, step=0.01, label="Variations added to foreground for single object generation (0: no variation, 1: max variation)")
                     frozen_step_ratio = gr.Slider(0, 1, value=0.4, step=0.1, label="Foreground frozen steps ratio (higher: preserve object attributes; lower: higher coherence; set to 0: (almost) equivalent to vanilla GLIGEN except details)")
@@ -226,7 +242,12 @@
                     label="Generated image", show_label=False, elem_id="gallery"
                 ).style(columns=[1], rows=[1], object_fit="contain", preview=True)
         visualize_btn.click(fn=get_layout_image_gallery, inputs=response, outputs=gallery, api_name="visualize-layout")
-        generate_btn.click(fn=get_ours_image, inputs=[response, seed, fg_seed_start, fg_blending_ratio, frozen_step_ratio, gligen_scheduled_sampling_beta, show_so_imgs], outputs=gallery, api_name="layout-to-image")
+        generate_btn.click(fn=get_ours_image, inputs=[response, seed, num_inference_steps, fg_seed_start, fg_blending_ratio, frozen_step_ratio, gligen_scheduled_sampling_beta, show_so_imgs], outputs=gallery, api_name="layout-to-image")
+
+        gr.Examples(
+            stage2_examples,
+            [response, seed]
+        )
 
     with gr.Tab("Baseline: Stable Diffusion"):
         with gr.Row():
@@ -243,5 +264,9 @@ with gr.Blocks(
                 ).style(columns=[1], rows=[1], object_fit="contain", preview=True)
         generate_btn.click(fn=get_baseline_image, inputs=[sd_prompt, seed], outputs=gallery, api_name="baseline")
 
+        gr.Examples(
+            stage1_examples,
+            [sd_prompt]
+        )
 
 g.launch()
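The UI wiring above follows one pattern: each new control (the "Number of inference steps" slider, the custom-template textbox) is appended to the `inputs` list of an existing `Button.click` handler, and the handler gains a matching parameter in the same position. A minimal, self-contained sketch of that pattern with stand-in functions (not the repo's actual pipeline):

# Minimal sketch of the wiring pattern used above. `build_prompt` mirrors the
# new get_lmd_prompt(prompt, template); `fake_generate` is a stand-in for
# get_ours_image and only echoes the settings it would receive.
import gradio as gr

DEFAULT_TEMPLATE = "You are an intelligent bounding box generator. ..."  # truncated stand-in

def build_prompt(prompt, template=""):
    if template == "":
        template = DEFAULT_TEMPLATE  # fall back when the advanced option is left empty
    return f"{template}\n\nCaption: {prompt}\nObjects: "

def fake_generate(layout_text, seed, num_inference_steps):
    return f"Would run {num_inference_steps} steps with seed {seed} on:\n{layout_text}"

with gr.Blocks() as demo:
    prompt = gr.Textbox(lines=2, label="Prompt")
    template = gr.Textbox(lines=4, label="Custom Template", value=DEFAULT_TEMPLATE)
    layout = gr.Textbox(lines=4, label="Layout")
    seed = gr.Slider(0, 10000, value=0, step=1, label="Seed")
    steps = gr.Slider(1, 50, value=20, step=1, label="Number of inference steps")
    out = gr.Textbox(label="Output")
    # The order of `inputs` must match the handler's positional parameters,
    # which is why num_inference_steps sits right after seed in app.py as well.
    gr.Button("Build prompt").click(build_prompt, inputs=[prompt, template], outputs=out)
    gr.Button("Generate").click(fake_generate, inputs=[layout, seed, steps], outputs=out)

if __name__ == "__main__":
    demo.launch()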
baseline.py CHANGED

@@ -11,7 +11,6 @@ torch.set_grad_enabled(False)
 
 height = 512 # default height of Stable Diffusion
 width = 512 # default width of Stable Diffusion
-num_inference_steps = 20 # Number of denoising steps
 guidance_scale = 7.5 # Scale for classifier-free guidance
 batch_size = 1
 
@@ -20,7 +19,7 @@ image_scale = (512, 512)
 
 bg_negative = 'artifacts, blurry, smooth texture, bad quality, distortions, unrealistic, distorted image, bad proportions, duplicate'
 
-def run(prompt, bg_seed=1):
+def run(prompt, bg_seed=1, num_inference_steps=20):
     print(f"prompt: {prompt}")
     generator = torch.Generator(models.torch_device).manual_seed(bg_seed)
 
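With this change the baseline's step count is a `run()` argument instead of a module-level constant, so the value chosen in the UI reaches the sampler per call. A hedged sketch of how such a parameter typically drives a Stable Diffusion call, using the high-level diffusers pipeline as a stand-in for the repo's own `models` module (the pipeline object and model id below are assumptions, not taken from baseline.py):

# Illustrative only: num_inference_steps forwarded per call rather than read
# from a module constant. Assumes the diffusers StableDiffusionPipeline API.
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

def run(prompt, bg_seed=1, num_inference_steps=20):
    generator = torch.Generator(pipe.device).manual_seed(bg_seed)
    image = pipe(prompt, num_inference_steps=num_inference_steps,
                 guidance_scale=7.5, generator=generator).images[0]
    return image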
examples.py ADDED

@@ -0,0 +1,26 @@
+stage1_examples = [
+    ["""A realistic photo of a gray cat and an orange dog on the grass."""],
+    ["""In an indoor scene, a blue cube directly above a red cube with a vase on the left of them."""],
+    ["""A realistic photo of a wooden table without bananas in an indoor scene"""],
+    ["""A man in red is standing next to another woman in blue in the mountains."""],
+    ["""一个室内场景的水彩画,一个桌子上面放着一盘水果"""]
+]
+
+# Layout, seed
+stage2_examples = [
+    ["""Caption: A realistic photo of a gray cat and an orange dog on the grass.
+Objects: [('a gray cat', [67, 243, 120, 126]), ('an orange dog', [265, 193, 190, 210])]
+Background prompt: A realistic photo of a grassy area.""", 0],
+    ["""Caption: 一个室内场景的水彩画,一个桌子上面放着一盘水果
+Objects: [('a table', [81, 242, 350, 210]), ('a plate of fruits', [151, 287, 210, 117])]
+Background prompt: A watercolor painting of an indoor scene""", 1],
+    ["""Caption: In an indoor scene, a blue cube directly above a red cube with a vase on the left of them.
+Objects: [('a blue cube', [232, 116, 76, 76]), ('a red cube', [232, 212, 76, 76]), ('a vase', [100, 198, 62, 144])]
+Background prompt: An indoor scene""", 2],
+    ["""Caption: A realistic photo of a wooden table without bananas in an indoor scene
+Objects: [('a wooden table', [75, 256, 365, 156])]
+Background prompt: A realistic photo of an indoor scene""", 3],
+    ["""Caption: A man in red is standing next to another woman in blue in the mountains.
+Objects: [('a man in red', [100, 160, 111, 320]), ('a woman in blue', [230, 170, 102, 310])]
+Background prompt: A scenic image of the mountains""", 4],
+]
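Each stage-2 example is a [layout text, seed] pair, and the layout text uses the same Caption / Objects / Background prompt format that the Stage 1 template asks ChatGPT to produce. app.py parses it with `parse_input` (from `utils.parse`); the stand-alone parser below is only an illustration of the format, not the repo's implementation:

# Illustrative parser for the layout format used in stage2_examples.
from ast import literal_eval

def parse_layout(text):
    boxes, bg_prompt = [], None
    for line in text.splitlines():
        line = line.strip()
        if line.startswith("Objects:"):
            # e.g. [('a gray cat', [67, 243, 120, 126]), ('an orange dog', [265, 193, 190, 210])]
            boxes = literal_eval(line[len("Objects:"):].strip())
        elif line.startswith("Background prompt:"):
            bg_prompt = line[len("Background prompt:"):].strip()
    return boxes, bg_prompt

layout = """Caption: A realistic photo of a gray cat and an orange dog on the grass.
Objects: [('a gray cat', [67, 243, 120, 126]), ('an orange dog', [265, 193, 190, 210])]
Background prompt: A realistic photo of a grassy area."""

boxes, bg = parse_layout(layout)
print(boxes)  # [('a gray cat', [67, 243, 120, 126]), ('an orange dog', [265, 193, 190, 210])]
print(bg)     # A realistic photo of a grassy area.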
generation.py CHANGED

@@ -20,7 +20,6 @@ model_dict.update(sam_model_dict)
 height = 512 # default height of Stable Diffusion
 width = 512 # default width of Stable Diffusion
 H, W = height // 8, width // 8 # size of the latent
-num_inference_steps = 20 # Number of denoising steps
 guidance_scale = 7.5 # Scale for classifier-free guidance
 
 # batch size that is not 1 is not supported
@@ -37,7 +36,7 @@ run_ind = None
 
 
 def generate_single_object_with_box(prompt, box, phrase, word, input_latents, input_embeddings,
-                                    sam_refine_kwargs, gligen_scheduled_sampling_beta=0.3,
+                                    sam_refine_kwargs, num_inference_steps, gligen_scheduled_sampling_beta=0.3,
                                     verbose=False, visualize=True):
 
     bboxes, phrases, words = [box], [phrase], [word]
@@ -78,7 +77,7 @@ def get_masked_latents_all_list(so_prompt_phrase_word_box_list, input_latents_li
 # Note: need to keep the supervision, especially the box corrdinates, corresponds to each other in single object and overall.
 
 def run(
-    spec, bg_seed = 1, fg_seed_start = 20, frozen_step_ratio=0.4, gligen_scheduled_sampling_beta = 0.3,
+    spec, bg_seed = 1, fg_seed_start = 20, frozen_step_ratio=0.4, gligen_scheduled_sampling_beta = 0.3, num_inference_steps = 20,
     so_center_box = False, fg_blending_ratio = 0.1, so_horizontal_center_only = True,
     align_with_overall_bboxes = False, horizontal_shift_only = True
 ):
@@ -140,7 +139,7 @@ def run(
     latents_all_list, mask_tensor_list, so_img_list = get_masked_latents_all_list(
         so_prompt_phrase_word_box_list, input_latents_list,
         gligen_scheduled_sampling_beta=gligen_scheduled_sampling_beta,
-        sam_refine_kwargs=sam_refine_kwargs, so_input_embeddings=so_input_embeddings, verbose=verbose
+        sam_refine_kwargs=sam_refine_kwargs, so_input_embeddings=so_input_embeddings, num_inference_steps=num_inference_steps, verbose=verbose
     )
 
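Threading `num_inference_steps` through `run()` and `generate_single_object_with_box()` matters because other step counts in the UI are expressed as ratios of the total (`frozen_step_ratio`, `gligen_scheduled_sampling_beta`), so they have to be resolved against the per-call value rather than a module constant. The sketch below only illustrates that dependency; the exact rounding generation.py uses is not shown in this diff, so the formula here is an assumption:

# Hedged sketch: resolving ratio-style settings against a per-call step count.
def derive_step_counts(num_inference_steps=20, frozen_step_ratio=0.4,
                       gligen_scheduled_sampling_beta=0.3):
    frozen_steps = int(round(frozen_step_ratio * num_inference_steps))  # assumed formula
    num_grounding_steps = int(round(gligen_scheduled_sampling_beta * num_inference_steps))  # assumed formula
    return frozen_steps, num_grounding_steps

print(derive_step_counts(20))  # (8, 6) with the default ratios
print(derive_step_counts(50))  # (20, 15) when the slider is raised to 50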
models/pipelines.py CHANGED

@@ -93,7 +93,6 @@ def generate_gligen(model_dict, latents, input_embeddings, num_inference_steps,
                     frozen_steps=20, frozen_mask=None,
                     return_saved_cross_attn=False, saved_cross_attn_keys=None, return_cond_ca_only=False, return_token_ca_only=None,
                     offload_cross_attn_to_cpu=False, offload_latents_to_cpu=True,
-                    semantic_guidance=False, semantic_guidance_bboxes=None, semantic_guidance_object_positions=None, semantic_guidance_kwargs=None,
                     return_box_vis=False, show_progress=True, save_all_latents=False):
     """
     The `bboxes` should be a list, rather than a list of lists (one box per phrase, we can have multiple duplicated phrases).
@@ -151,23 +150,6 @@ def generate_gligen(model_dict, latents, input_embeddings, num_inference_steps,
     masks = masks.unsqueeze(0).expand(repeat_batch, -1).clone()
     masks[:repeat_batch // 2] = 0
 
-    if semantic_guidance_bboxes and semantic_guidance:
-        loss = torch.tensor(10000.)
-        # TODO: we can also save necessary tokens only to save memory.
-        # offload_guidance_cross_attn_to_cpu does not save too much since we only store attention map for each timestep.
-        guidance_cross_attention_kwargs = {
-            'offload_cross_attn_to_cpu': False,
-            'enable_flash_attn': False,
-            'gligen': {
-                'boxes': boxes[:repeat_batch // 2],
-                'positive_embeddings': phrase_embeddings[:repeat_batch // 2],
-                'masks': masks[:repeat_batch // 2],
-                'fuser_attn_kwargs': {
-                    'enable_flash_attn': False,
-                }
-            }
-        }
-
     if return_saved_cross_attn:
         saved_attns = []
 
@@ -193,9 +175,6 @@ def generate_gligen(model_dict, latents, input_embeddings, num_inference_steps,
         if index == num_grounding_steps:
             gligen_enable_fuser(unet, False)
 
-        if semantic_guidance_bboxes and semantic_guidance:
-            with torch.enable_grad():
-                latents, loss = latent_backward_guidance(scheduler, unet, cond_embeddings, index, semantic_guidance_bboxes, semantic_guidance_object_positions, t, latents, loss, cross_attention_kwargs=guidance_cross_attention_kwargs, **semantic_guidance_kwargs)
         # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
         latent_model_input = torch.cat([latents] * 2)
 