Tony Lian committed on
Commit
9668cda
1 Parent(s): 1f39cf9

Add examples, adjustable number of steps, and custom template

Browse files
Files changed (5) hide show
  1. app.py +41 -16
  2. baseline.py +1 -2
  3. examples.py +26 -0
  4. generation.py +3 -4
  5. models/pipelines.py +0 -21
app.py CHANGED
@@ -8,6 +8,7 @@ from utils.parse import filter_boxes
8
  from generation import run as run_ours
9
  from baseline import run as run_baseline
10
  import torch
 
11
 
12
  print(f"Is CUDA available: {torch.cuda.is_available()}")
13
  if torch.cuda.is_available():
@@ -18,7 +19,7 @@ size = box_scale
18
 
19
  bg_prompt_text = "Background prompt: "
20
 
21
- simplified_prompt = """You are an intelligent bounding box generator. I will provide you with a caption for a photo, image, or painting. Your task is to generate the bounding boxes for the objects mentioned in the caption, along with a background prompt describing the scene. The images are of size 512x512, and the bounding boxes should not overlap or go beyond the image boundaries. Each bounding box should be in the format of (object name, [top-left x coordinate, top-left y coordinate, box width, box height]) and include exactly one object. Do not put objects that are already provided in the bounding boxes into the background prompt. If needed, you can make reasonable guesses. Generate the object descriptions and background prompts in English even if the caption might not be in English. Please refer to the example below for the desired format.
22
 
23
  Caption: A realistic image of landscape scene depicting a green car parking on the left of a blue truck, with a red air balloon and a bird in the sky
24
  Objects: [('a green car', [21, 181, 211, 159]), ('a blue truck', [269, 181, 209, 160]), ('a red air balloon', [66, 8, 145, 135]), ('a bird', [296, 42, 143, 100])]
@@ -46,7 +47,9 @@ Background prompt: A realistic image of a park with flowers
46
 
47
  Caption: 一个客厅场景的油画,墙上挂着电视,电视下面是一个柜子,柜子上有一个花瓶。
48
  Objects: [('a tv', [88, 85, 335, 203]), ('a cabinet', [57, 308, 404, 201]), ('a flower vase', [166, 222, 92, 108])]
49
- Background prompt: An oil painting of a living room scene
 
 
50
 
51
  Caption: {prompt}
52
  Objects: """
@@ -57,10 +60,12 @@ layout_placeholder = """Caption: A realistic photo of a gray cat and an orange d
57
  Objects: [('a gray cat', [67, 243, 120, 126]), ('an orange dog', [265, 193, 190, 210])]
58
  Background prompt: A realistic photo of a grassy area."""
59
 
60
- def get_lmd_prompt(prompt):
61
  if prompt == "":
62
  prompt = prompt_placeholder
63
- return simplified_prompt.format(prompt=prompt)
 
 
64
 
65
  def get_layout_image(response):
66
  if response == "":
@@ -82,7 +87,7 @@ def get_layout_image(response):
82
  def get_layout_image_gallery(response):
83
  return [get_layout_image(response)]
84
 
85
- def get_ours_image(response, seed, fg_seed_start, fg_blending_ratio=0.1, frozen_step_ratio=0.4, gligen_scheduled_sampling_beta=0.3, show_so_imgs=False, scale_boxes=False, gallery=None):
86
  if response == "":
87
  response = layout_placeholder
88
  gen_boxes, bg_prompt = parse_input(response)
@@ -96,7 +101,7 @@ def get_ours_image(response, seed, fg_seed_start, fg_blending_ratio=0.1, frozen_
96
  image_np, so_img_list = run_ours(
97
  spec, bg_seed=seed, fg_seed_start=fg_seed_start,
98
  fg_blending_ratio=fg_blending_ratio,frozen_step_ratio=frozen_step_ratio,
99
- gligen_scheduled_sampling_beta=gligen_scheduled_sampling_beta)
100
  images = [image_np]
101
  if show_so_imgs:
102
  images.extend([np.asarray(so_img) for so_img in so_img_list])
@@ -177,27 +182,37 @@ def show_boxes(gen_boxes, bg_prompt=None):
177
 
178
  duplicate_html = '<a style="display:inline-block" href="https://huggingface.co/spaces/longlian/llm-grounded-diffusion?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=&logoWidth=14" alt="Duplicate Space"></a>'
179
 
180
- with gr.Blocks(
181
- title="LLM-grounded Diffusion: Enhancing Prompt Understanding of Text-to-Image Diffusion Models with Large Language Models"
182
- ) as g:
183
- gr.HTML(f"""<h1>LLM-grounded Diffusion: Enhancing Prompt Understanding of Text-to-Image Diffusion Models with Large Language Models</h1>
184
  <h2>LLM + Stable Diffusion => better prompt understanding in text2image generation 🤩</h2>
185
  <h2><a href='https://llm-grounded-diffusion.github.io/'>Project Page</a> | <a href='https://bair.berkeley.edu/blog/2023/05/23/lmd/'>5-minute Blog Post</a> | <a href='https://arxiv.org/pdf/2305.13655.pdf'>ArXiv Paper</a> | <a href='https://github.com/TonyLianLong/LLM-groundedDiffusion'>Github</a> | <a href='https://llm-grounded-diffusion.github.io/#citation'>Cite our work</a> if our ideas inspire you.</h2>
186
  <p><b>Tips:</b><p>
187
  <p>1. If ChatGPT doesn't generate layout, add/remove the trailing space (added by default) and/or use GPT-4.</p>
188
  <p>2. You can perform multi-round specification by giving ChatGPT follow-up requests (e.g., make the object boxes bigger).</p>
189
- <p>3. You can also try prompts in Simplified Chinese. If you want to try prompts in another language, translate the first line of last example to your language.<p>
190
- <p>4. Duplicate this space and add GPU to skip the queue and run our model faster. {duplicate_html}</p>
 
191
  <br/>
192
- <p>Implementation note: In this demo, we replace the attention manipulation in our layout-guided Stable Diffusion described in our paper with GLIGEN due to much faster inference speed (<b>FlashAttention supported, no backprop needed</b> during inference). Compared to vanilla GLIGEN, we have better coherence. Other parts of text-to-image pipeline, including single object generation and SAM, remain the same. The settings and examples in the prompt are simplified in this demo.</p>""")
 
 
 
 
 
193
  with gr.Tab("Stage 1. Image Prompt to ChatGPT"):
194
  with gr.Row():
195
  with gr.Column(scale=1):
196
  prompt = gr.Textbox(lines=2, label="Prompt for Layout Generation", placeholder=prompt_placeholder)
197
- generate_btn = gr.Button("Generate Prompt")
 
 
198
  with gr.Column(scale=1):
199
  output = gr.Textbox(label="Paste this into ChatGPT (GPT-4 preferred; on Mac, click text and press Command+A and Command+C to copy all)")
200
- generate_btn.click(fn=get_lmd_prompt, inputs=prompt, outputs=output, api_name="get_lmd_prompt")
 
 
 
 
 
201
 
202
  # with gr.Tab("(Optional) Visualize ChatGPT-generated Layout"):
203
  # with gr.Row():
@@ -216,6 +231,7 @@ with gr.Blocks(
216
  generate_btn = gr.Button("Generate Image from Layout", variant='primary')
217
  with gr.Accordion("Advanced options", open=False):
218
  seed = gr.Slider(0, 10000, value=0, step=1, label="Seed")
 
219
  fg_seed_start = gr.Slider(0, 10000, value=20, step=1, label="Seed for foreground variation")
220
  fg_blending_ratio = gr.Slider(0, 1, value=0.1, step=0.01, label="Variations added to foreground for single object generation (0: no variation, 1: max variation)")
221
  frozen_step_ratio = gr.Slider(0, 1, value=0.4, step=0.1, label="Foreground frozen steps ratio (higher: preserve object attributes; lower: higher coherence; set to 0: (almost) equivalent to vanilla GLIGEN except details)")
@@ -226,7 +242,12 @@ with gr.Blocks(
226
  label="Generated image", show_label=False, elem_id="gallery"
227
  ).style(columns=[1], rows=[1], object_fit="contain", preview=True)
228
  visualize_btn.click(fn=get_layout_image_gallery, inputs=response, outputs=gallery, api_name="visualize-layout")
229
- generate_btn.click(fn=get_ours_image, inputs=[response, seed, fg_seed_start, fg_blending_ratio, frozen_step_ratio, gligen_scheduled_sampling_beta, show_so_imgs], outputs=gallery, api_name="layout-to-image")
 
 
 
 
 
230
 
231
  with gr.Tab("Baseline: Stable Diffusion"):
232
  with gr.Row():
@@ -243,5 +264,9 @@ with gr.Blocks(
243
  ).style(columns=[1], rows=[1], object_fit="contain", preview=True)
244
  generate_btn.click(fn=get_baseline_image, inputs=[sd_prompt, seed], outputs=gallery, api_name="baseline")
245
 
 
 
 
 
246
 
247
  g.launch()
 
8
  from generation import run as run_ours
9
  from baseline import run as run_baseline
10
  import torch
11
+ from examples import stage1_examples, stage2_examples
12
 
13
  print(f"Is CUDA available: {torch.cuda.is_available()}")
14
  if torch.cuda.is_available():
 
19
 
20
  bg_prompt_text = "Background prompt: "
21
 
22
+ default_template = """You are an intelligent bounding box generator. I will provide you with a caption for a photo, image, or painting. Your task is to generate the bounding boxes for the objects mentioned in the caption, along with a background prompt describing the scene. The images are of size 512x512, and the bounding boxes should not overlap or go beyond the image boundaries. Each bounding box should be in the format of (object name, [top-left x coordinate, top-left y coordinate, box width, box height]) and include exactly one object. Make the boxes larger if possible. Do not put objects that are already provided in the bounding boxes into the background prompt. If needed, you can make reasonable guesses. Generate the object descriptions and background prompts in English even if the caption might not be in English. Do not include non-existing or excluded objects in the background prompt. Please refer to the example below for the desired format.
23
 
24
  Caption: A realistic image of landscape scene depicting a green car parking on the left of a blue truck, with a red air balloon and a bird in the sky
25
  Objects: [('a green car', [21, 181, 211, 159]), ('a blue truck', [269, 181, 209, 160]), ('a red air balloon', [66, 8, 145, 135]), ('a bird', [296, 42, 143, 100])]
 
47
 
48
  Caption: 一个客厅场景的油画,墙上挂着电视,电视下面是一个柜子,柜子上有一个花瓶。
49
  Objects: [('a tv', [88, 85, 335, 203]), ('a cabinet', [57, 308, 404, 201]), ('a flower vase', [166, 222, 92, 108])]
50
+ Background prompt: An oil painting of a living room scene"""
51
+
52
+ simplified_prompt = """{template}
53
 
54
  Caption: {prompt}
55
  Objects: """
 
60
  Objects: [('a gray cat', [67, 243, 120, 126]), ('an orange dog', [265, 193, 190, 210])]
61
  Background prompt: A realistic photo of a grassy area."""
62
 
63
+ def get_lmd_prompt(prompt, template=""):
64
  if prompt == "":
65
  prompt = prompt_placeholder
66
+ if template == "":
67
+ template = default_template
68
+ return simplified_prompt.format(template=template, prompt=prompt)
69
 
70
  def get_layout_image(response):
71
  if response == "":
 
87
  def get_layout_image_gallery(response):
88
  return [get_layout_image(response)]
89
 
90
+ def get_ours_image(response, seed, num_inference_steps, fg_seed_start, fg_blending_ratio=0.1, frozen_step_ratio=0.4, gligen_scheduled_sampling_beta=0.3, show_so_imgs=False, scale_boxes=False):
91
  if response == "":
92
  response = layout_placeholder
93
  gen_boxes, bg_prompt = parse_input(response)
 
101
  image_np, so_img_list = run_ours(
102
  spec, bg_seed=seed, fg_seed_start=fg_seed_start,
103
  fg_blending_ratio=fg_blending_ratio,frozen_step_ratio=frozen_step_ratio,
104
+ gligen_scheduled_sampling_beta=gligen_scheduled_sampling_beta, num_inference_steps=num_inference_steps)
105
  images = [image_np]
106
  if show_so_imgs:
107
  images.extend([np.asarray(so_img) for so_img in so_img_list])
 
182
 
183
  duplicate_html = '<a style="display:inline-block" href="https://huggingface.co/spaces/longlian/llm-grounded-diffusion?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=&logoWidth=14" alt="Duplicate Space"></a>'
184
 
185
+ html = f"""<h1>LLM-grounded Diffusion: Enhancing Prompt Understanding of Text-to-Image Diffusion Models with Large Language Models</h1>
 
 
 
186
  <h2>LLM + Stable Diffusion => better prompt understanding in text2image generation 🤩</h2>
187
  <h2><a href='https://llm-grounded-diffusion.github.io/'>Project Page</a> | <a href='https://bair.berkeley.edu/blog/2023/05/23/lmd/'>5-minute Blog Post</a> | <a href='https://arxiv.org/pdf/2305.13655.pdf'>ArXiv Paper</a> | <a href='https://github.com/TonyLianLong/LLM-groundedDiffusion'>Github</a> | <a href='https://llm-grounded-diffusion.github.io/#citation'>Cite our work</a> if our ideas inspire you.</h2>
188
  <p><b>Tips:</b><p>
189
  <p>1. If ChatGPT doesn't generate layout, add/remove the trailing space (added by default) and/or use GPT-4.</p>
190
  <p>2. You can perform multi-round specification by giving ChatGPT follow-up requests (e.g., make the object boxes bigger).</p>
191
+ <p>3. You can also try prompts in Simplified Chinese. If you want to try prompts in another language, translate the first line of last example to your language.</p>
192
+ <p>4. The diffusion model only runs 20 steps by default. You can make it run 50 steps to get higher quality images (or tweak frozen steps/guidance steps for better guidance and coherence).</p>
193
+ <p>5. Duplicate this space and add GPU to skip the queue and run our model faster. {duplicate_html}</p>
194
  <br/>
195
+ <p>Implementation note: In this demo, we replace the attention manipulation in our layout-guided Stable Diffusion described in our paper with GLIGEN due to much faster inference speed (<b>FlashAttention supported, no backprop needed</b> during inference). Compared to vanilla GLIGEN, we have better coherence. Other parts of text-to-image pipeline, including single object generation and SAM, remain the same. The settings and examples in the prompt are simplified in this demo.</p>"""
196
+
197
+ with gr.Blocks(
198
+ title="LLM-grounded Diffusion: Enhancing Prompt Understanding of Text-to-Image Diffusion Models with Large Language Models"
199
+ ) as g:
200
+ gr.HTML(html)
201
  with gr.Tab("Stage 1. Image Prompt to ChatGPT"):
202
  with gr.Row():
203
  with gr.Column(scale=1):
204
  prompt = gr.Textbox(lines=2, label="Prompt for Layout Generation", placeholder=prompt_placeholder)
205
+ generate_btn = gr.Button("Generate Prompt", variant='primary')
206
+ with gr.Accordion("Advanced options", open=False):
207
+ template = gr.Textbox(lines=10, label="Custom Template", placeholder="Customized Template", value=default_template)
208
  with gr.Column(scale=1):
209
  output = gr.Textbox(label="Paste this into ChatGPT (GPT-4 preferred; on Mac, click text and press Command+A and Command+C to copy all)")
210
+ generate_btn.click(fn=get_lmd_prompt, inputs=[prompt, template], outputs=output, api_name="get_lmd_prompt")
211
+
212
+ gr.Examples(
213
+ stage1_examples,
214
+ [prompt]
215
+ )
216
 
217
  # with gr.Tab("(Optional) Visualize ChatGPT-generated Layout"):
218
  # with gr.Row():
 
231
  generate_btn = gr.Button("Generate Image from Layout", variant='primary')
232
  with gr.Accordion("Advanced options", open=False):
233
  seed = gr.Slider(0, 10000, value=0, step=1, label="Seed")
234
+ num_inference_steps = gr.Slider(1, 50, value=20, step=1, label="Number of inference steps")
235
  fg_seed_start = gr.Slider(0, 10000, value=20, step=1, label="Seed for foreground variation")
236
  fg_blending_ratio = gr.Slider(0, 1, value=0.1, step=0.01, label="Variations added to foreground for single object generation (0: no variation, 1: max variation)")
237
  frozen_step_ratio = gr.Slider(0, 1, value=0.4, step=0.1, label="Foreground frozen steps ratio (higher: preserve object attributes; lower: higher coherence; set to 0: (almost) equivalent to vanilla GLIGEN except details)")
 
242
  label="Generated image", show_label=False, elem_id="gallery"
243
  ).style(columns=[1], rows=[1], object_fit="contain", preview=True)
244
  visualize_btn.click(fn=get_layout_image_gallery, inputs=response, outputs=gallery, api_name="visualize-layout")
245
+ generate_btn.click(fn=get_ours_image, inputs=[response, seed, num_inference_steps, fg_seed_start, fg_blending_ratio, frozen_step_ratio, gligen_scheduled_sampling_beta, show_so_imgs], outputs=gallery, api_name="layout-to-image")
246
+
247
+ gr.Examples(
248
+ stage2_examples,
249
+ [response, seed]
250
+ )
251
 
252
  with gr.Tab("Baseline: Stable Diffusion"):
253
  with gr.Row():
 
264
  ).style(columns=[1], rows=[1], object_fit="contain", preview=True)
265
  generate_btn.click(fn=get_baseline_image, inputs=[sd_prompt, seed], outputs=gallery, api_name="baseline")
266
 
267
+ gr.Examples(
268
+ stage1_examples,
269
+ [sd_prompt]
270
+ )
271
 
272
  g.launch()
baseline.py CHANGED
@@ -11,7 +11,6 @@ torch.set_grad_enabled(False)
11
 
12
  height = 512 # default height of Stable Diffusion
13
  width = 512 # default width of Stable Diffusion
14
- num_inference_steps = 20 # Number of denoising steps
15
  guidance_scale = 7.5 # Scale for classifier-free guidance
16
  batch_size = 1
17
 
@@ -20,7 +19,7 @@ image_scale = (512, 512)
20
 
21
  bg_negative = 'artifacts, blurry, smooth texture, bad quality, distortions, unrealistic, distorted image, bad proportions, duplicate'
22
 
23
- def run(prompt, bg_seed=1):
24
  print(f"prompt: {prompt}")
25
  generator = torch.Generator(models.torch_device).manual_seed(bg_seed)
26
 
 
11
 
12
  height = 512 # default height of Stable Diffusion
13
  width = 512 # default width of Stable Diffusion
 
14
  guidance_scale = 7.5 # Scale for classifier-free guidance
15
  batch_size = 1
16
 
 
19
 
20
  bg_negative = 'artifacts, blurry, smooth texture, bad quality, distortions, unrealistic, distorted image, bad proportions, duplicate'
21
 
22
+ def run(prompt, bg_seed=1, num_inference_steps=20):
23
  print(f"prompt: {prompt}")
24
  generator = torch.Generator(models.torch_device).manual_seed(bg_seed)
25
 
examples.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ stage1_examples = [
2
+ ["""A realistic photo of a gray cat and an orange dog on the grass."""],
3
+ ["""In an indoor scene, a blue cube directly above a red cube with a vase on the left of them."""],
4
+ ["""A realistic photo of a wooden table without bananas in an indoor scene"""],
5
+ ["""A man in red is standing next to another woman in blue in the mountains."""],
6
+ ["""一个室内场景的水彩画,一个桌子上面放着一盘水果"""]
7
+ ]
8
+
9
+ # Layout, seed
10
+ stage2_examples = [
11
+ ["""Caption: A realistic photo of a gray cat and an orange dog on the grass.
12
+ Objects: [('a gray cat', [67, 243, 120, 126]), ('an orange dog', [265, 193, 190, 210])]
13
+ Background prompt: A realistic photo of a grassy area.""", 0],
14
+ ["""Caption: 一个室内场景的水彩画,一个桌子上面放着一盘水果
15
+ Objects: [('a table', [81, 242, 350, 210]), ('a plate of fruits', [151, 287, 210, 117])]
16
+ Background prompt: A watercolor painting of an indoor scene""", 1],
17
+ ["""Caption: In an indoor scene, a blue cube directly above a red cube with a vase on the left of them.
18
+ Objects: [('a blue cube', [232, 116, 76, 76]), ('a red cube', [232, 212, 76, 76]), ('a vase', [100, 198, 62, 144])]
19
+ Background prompt: An indoor scene""", 2],
20
+ ["""Caption: A realistic photo of a wooden table without bananas in an indoor scene
21
+ Objects: [('a wooden table', [75, 256, 365, 156])]
22
+ Background prompt: A realistic photo of an indoor scene""", 3],
23
+ ["""Caption: A man in red is standing next to another woman in blue in the mountains.
24
+ Objects: [('a man in red', [100, 160, 111, 320]), ('a woman in blue', [230, 170, 102, 310])]
25
+ Background prompt: A scenic image of the mountains""", 4],
26
+ ]
generation.py CHANGED
@@ -20,7 +20,6 @@ model_dict.update(sam_model_dict)
20
  height = 512 # default height of Stable Diffusion
21
  width = 512 # default width of Stable Diffusion
22
  H, W = height // 8, width // 8 # size of the latent
23
- num_inference_steps = 20 # Number of denoising steps
24
  guidance_scale = 7.5 # Scale for classifier-free guidance
25
 
26
  # batch size that is not 1 is not supported
@@ -37,7 +36,7 @@ run_ind = None
37
 
38
 
39
  def generate_single_object_with_box(prompt, box, phrase, word, input_latents, input_embeddings,
40
- sam_refine_kwargs, gligen_scheduled_sampling_beta=0.3,
41
  verbose=False, visualize=True):
42
 
43
  bboxes, phrases, words = [box], [phrase], [word]
@@ -78,7 +77,7 @@ def get_masked_latents_all_list(so_prompt_phrase_word_box_list, input_latents_li
78
  # Note: the supervision, especially the box coordinates, needs to correspond between the single-object and overall cases.
79
 
80
  def run(
81
- spec, bg_seed = 1, fg_seed_start = 20, frozen_step_ratio=0.4, gligen_scheduled_sampling_beta = 0.3,
82
  so_center_box = False, fg_blending_ratio = 0.1, so_horizontal_center_only = True,
83
  align_with_overall_bboxes = False, horizontal_shift_only = True
84
  ):
@@ -140,7 +139,7 @@ def run(
140
  latents_all_list, mask_tensor_list, so_img_list = get_masked_latents_all_list(
141
  so_prompt_phrase_word_box_list, input_latents_list,
142
  gligen_scheduled_sampling_beta=gligen_scheduled_sampling_beta,
143
- sam_refine_kwargs=sam_refine_kwargs, so_input_embeddings=so_input_embeddings, verbose=verbose
144
  )
145
 
146
 
 
20
  height = 512 # default height of Stable Diffusion
21
  width = 512 # default width of Stable Diffusion
22
  H, W = height // 8, width // 8 # size of the latent
 
23
  guidance_scale = 7.5 # Scale for classifier-free guidance
24
 
25
  # batch size that is not 1 is not supported
 
36
 
37
 
38
  def generate_single_object_with_box(prompt, box, phrase, word, input_latents, input_embeddings,
39
+ sam_refine_kwargs, num_inference_steps, gligen_scheduled_sampling_beta=0.3,
40
  verbose=False, visualize=True):
41
 
42
  bboxes, phrases, words = [box], [phrase], [word]
 
77
  # Note: the supervision, especially the box coordinates, needs to correspond between the single-object and overall cases.
78
 
79
  def run(
80
+ spec, bg_seed = 1, fg_seed_start = 20, frozen_step_ratio=0.4, gligen_scheduled_sampling_beta = 0.3, num_inference_steps = 20,
81
  so_center_box = False, fg_blending_ratio = 0.1, so_horizontal_center_only = True,
82
  align_with_overall_bboxes = False, horizontal_shift_only = True
83
  ):
 
139
  latents_all_list, mask_tensor_list, so_img_list = get_masked_latents_all_list(
140
  so_prompt_phrase_word_box_list, input_latents_list,
141
  gligen_scheduled_sampling_beta=gligen_scheduled_sampling_beta,
142
+ sam_refine_kwargs=sam_refine_kwargs, so_input_embeddings=so_input_embeddings, num_inference_steps=num_inference_steps, verbose=verbose
143
  )
144
 
145
 
models/pipelines.py CHANGED
@@ -93,7 +93,6 @@ def generate_gligen(model_dict, latents, input_embeddings, num_inference_steps,
93
  frozen_steps=20, frozen_mask=None,
94
  return_saved_cross_attn=False, saved_cross_attn_keys=None, return_cond_ca_only=False, return_token_ca_only=None,
95
  offload_cross_attn_to_cpu=False, offload_latents_to_cpu=True,
96
- semantic_guidance=False, semantic_guidance_bboxes=None, semantic_guidance_object_positions=None, semantic_guidance_kwargs=None,
97
  return_box_vis=False, show_progress=True, save_all_latents=False):
98
  """
99
  The `bboxes` should be a list, rather than a list of lists (one box per phrase, we can have multiple duplicated phrases).
@@ -151,23 +150,6 @@ def generate_gligen(model_dict, latents, input_embeddings, num_inference_steps,
151
  masks = masks.unsqueeze(0).expand(repeat_batch, -1).clone()
152
  masks[:repeat_batch // 2] = 0
153
 
154
- if semantic_guidance_bboxes and semantic_guidance:
155
- loss = torch.tensor(10000.)
156
- # TODO: we can also save necessary tokens only to save memory.
157
- # offload_guidance_cross_attn_to_cpu does not save too much since we only store attention map for each timestep.
158
- guidance_cross_attention_kwargs = {
159
- 'offload_cross_attn_to_cpu': False,
160
- 'enable_flash_attn': False,
161
- 'gligen': {
162
- 'boxes': boxes[:repeat_batch // 2],
163
- 'positive_embeddings': phrase_embeddings[:repeat_batch // 2],
164
- 'masks': masks[:repeat_batch // 2],
165
- 'fuser_attn_kwargs': {
166
- 'enable_flash_attn': False,
167
- }
168
- }
169
- }
170
-
171
  if return_saved_cross_attn:
172
  saved_attns = []
173
 
@@ -193,9 +175,6 @@ def generate_gligen(model_dict, latents, input_embeddings, num_inference_steps,
193
  if index == num_grounding_steps:
194
  gligen_enable_fuser(unet, False)
195
 
196
- if semantic_guidance_bboxes and semantic_guidance:
197
- with torch.enable_grad():
198
- latents, loss = latent_backward_guidance(scheduler, unet, cond_embeddings, index, semantic_guidance_bboxes, semantic_guidance_object_positions, t, latents, loss, cross_attention_kwargs=guidance_cross_attention_kwargs, **semantic_guidance_kwargs)
199
  # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
200
  latent_model_input = torch.cat([latents] * 2)
201
 
 
93
  frozen_steps=20, frozen_mask=None,
94
  return_saved_cross_attn=False, saved_cross_attn_keys=None, return_cond_ca_only=False, return_token_ca_only=None,
95
  offload_cross_attn_to_cpu=False, offload_latents_to_cpu=True,
 
96
  return_box_vis=False, show_progress=True, save_all_latents=False):
97
  """
98
  The `bboxes` should be a list, rather than a list of lists (one box per phrase, we can have multiple duplicated phrases).
 
150
  masks = masks.unsqueeze(0).expand(repeat_batch, -1).clone()
151
  masks[:repeat_batch // 2] = 0
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  if return_saved_cross_attn:
154
  saved_attns = []
155
 
 
175
  if index == num_grounding_steps:
176
  gligen_enable_fuser(unet, False)
177
 
 
 
 
178
  # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
179
  latent_model_input = torch.cat([latents] * 2)
180