Commit 61ac46b by Tony Lian
1 Parent(s): 7b0a7ad

Add batched single object generation

Files changed:
- app.py +4 -4
- generation.py +53 -24
- models/models.py +19 -17
- models/pipelines.py +47 -29
- models/sam.py +50 -29
app.py CHANGED

@@ -109,7 +109,7 @@ def get_ours_image(response, seed, num_inference_steps=20, dpm_scheduler=True, u
         spec, bg_seed=seed, overall_prompt_override=overall_prompt_override, fg_seed_start=fg_seed_start,
         fg_blending_ratio=fg_blending_ratio,frozen_step_ratio=frozen_step_ratio, use_autocast=use_autocast,
         gligen_scheduled_sampling_beta=gligen_scheduled_sampling_beta, num_inference_steps=num_inference_steps, scheduler_key=scheduler_key,
-        so_negative_prompt=so_negative_prompt, overall_negative_prompt=overall_negative_prompt
+        so_negative_prompt=so_negative_prompt, overall_negative_prompt=overall_negative_prompt, so_batch_size=8
     )
     images = [image_np]
     if show_so_imgs:

@@ -201,7 +201,7 @@ html = f"""<h1>LLM-grounded Diffusion: Enhancing Prompt Understanding of Text-to
 <p>1. If ChatGPT doesn't generate layout, add/remove the trailing space (added by default) and/or use GPT-4.</p>
 <p>2. You can perform multi-round specification by giving ChatGPT follow-up requests (e.g., make the object boxes bigger).</p>
 <p>3. You can also try prompts in Simplified Chinese. If you want to try prompts in another language, translate the first line of last example to your language.</p>
-<p>4. The diffusion model only runs …
+<p>4. The diffusion model only runs 50 steps by default in this demo. You can make it run more/fewer steps to get higher quality images or faster generation (or tweak frozen steps/guidance steps for better guidance and coherence).</p>
 <p>5. Duplicate this space and add GPU or clone the space and run locally to skip the queue and run our model faster. (Currently we are using a T4, and you can add a A10G to make it 5x faster) {duplicate_html}</p>
 <br/>
 <p>Implementation note: In this demo, we replace the attention manipulation in our layout-guided Stable Diffusion described in our paper with GLIGEN due to much faster inference speed (<b>FlashAttention supported, no backprop needed</b> during inference). Compared to vanilla GLIGEN, we have better coherence. Other parts of text-to-image pipeline, including single object generation and SAM, remain the same. The settings and examples in the prompt are simplified in this demo.</p>

@@ -237,12 +237,12 @@ with gr.Blocks(
     with gr.Column(scale=1):
         response = gr.Textbox(lines=8, label="Paste ChatGPT response here (no original caption needed)", placeholder=layout_placeholder)
         overall_prompt_override = gr.Textbox(lines=2, label="Prompt for overall generation (optional but recommended)", placeholder="You can put your input prompt for layout generation here, helpful if your scene cannot be represented by background prompt and boxes only, e.g., with object interactions. If left empty: background prompt with [objects].", value="")
+        num_inference_steps = gr.Slider(1, 250, value=50, step=1, label="Number of denoising steps (set to 20 to trade quality for faster generation)")
         seed = gr.Slider(0, 10000, value=0, step=1, label="Seed")
         with gr.Accordion("Advanced options (play around for better generation)", open=False):
             frozen_step_ratio = gr.Slider(0, 1, value=0.4, step=0.1, label="Foreground frozen steps ratio (higher: preserve object attributes; lower: higher coherence; set to 0: (almost) equivalent to vanilla GLIGEN except details)")
             gligen_scheduled_sampling_beta = gr.Slider(0, 1, value=0.3, step=0.1, label="GLIGEN guidance steps ratio (the beta value)")
-
-            dpm_scheduler = gr.Checkbox(label="Use DPM scheduler (unchecked: DDIM scheduler, may have better coherence, recommend 50 or even more inference steps)", show_label=False, value=True)
+            dpm_scheduler = gr.Checkbox(label="Use DPM scheduler (unchecked: DDIM scheduler, may have better coherence, recommend >=50 inference steps)", show_label=False, value=True)
             use_autocast = gr.Checkbox(label="Use FP16 Mixed Precision", show_label=False, value=True)
             fg_seed_start = gr.Slider(0, 10000, value=20, step=1, label="Seed for foreground variation")
             fg_blending_ratio = gr.Slider(0, 1, value=0.1, step=0.01, label="Variations added to foreground for single object generation (0: no variation, 1: max variation)")
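The only functional change on the demo side is the hard-coded so_batch_size=8 passed to generation.run. A rough, self-contained sketch of what this buys (illustrative only, not part of the commit; it assumes one GLIGEN pass per batch, as in the new generate_single_object_with_box_batch below):

# Illustrative only: single-object GLIGEN passes for a layout with k objects,
# before (one pass per object) vs. after this commit (batched, size 8).
import math

so_batch_size = 8  # value hard-coded in the call above
for k in (2, 5, 8, 11):
    print(f"{k} objects: {math.ceil(k / so_batch_size)} batched pass(es) vs. {k} per-object passes")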
generation.py CHANGED

@@ -1,6 +1,7 @@
 version = "v3.0"
 
 import torch
+import numpy as np
 import models
 import utils
 from models import pipelines, sam

@@ -21,7 +22,6 @@ H, W = height // 8, width // 8 # size of the latent
 guidance_scale = 7.5 # Scale for classifier-free guidance
 
 # batch size that is not 1 is not supported
-so_batch_size = 1
 overall_batch_size = 1
 
 # discourage masks with confidence below

@@ -33,41 +33,70 @@ discourage_mask_below_coarse_iou = 0.25
 run_ind = None
 
 
-def …
+def generate_single_object_with_box_batch(prompts, bboxes, phrases, words, input_latents_list, input_embeddings,
                                           sam_refine_kwargs, num_inference_steps, gligen_scheduled_sampling_beta=0.3,
-                                          verbose=False, scheduler_key=None, visualize=True):
-    …
+                                          verbose=False, scheduler_key=None, visualize=True, batch_size=None):
+    # batch_size=None: does not limit the batch size (pass all input together)
+
+    # prompts and words are not used since we don't have cross-attention control in this function
+
+    input_latents = torch.cat(input_latents_list, dim=0)
+
+    # We need to "unsqueeze" to tell that we have only one box and phrase in each batch item
+    bboxes, phrases = [[item] for item in bboxes], [[item] for item in phrases]
+
+    input_len = len(bboxes)
+    assert len(bboxes) == len(phrases), f"{len(bboxes)} != {len(phrases)}"
+
+    if batch_size is None:
+        batch_size = input_len
+
+    run_times = int(np.ceil(input_len / batch_size))
+    single_object_images, single_object_pil_images_box_ann, latents_all = [], [], []
+    for batch_idx in range(run_times):
+        input_latents_batch, bboxes_batch, phrases_batch = input_latents[batch_idx * batch_size:(batch_idx + 1) * batch_size], \
+            bboxes[batch_idx * batch_size:(batch_idx + 1) * batch_size], phrases[batch_idx * batch_size:(batch_idx + 1) * batch_size]
+        input_embeddings_batch = input_embeddings[0], input_embeddings[1][batch_idx * batch_size:(batch_idx + 1) * batch_size]
+
+        _, single_object_images_batch, single_object_pil_images_box_ann_batch, latents_all_batch = pipelines.generate_gligen(
+            model_dict, input_latents_batch, input_embeddings_batch, num_inference_steps, bboxes_batch, phrases_batch, gligen_scheduled_sampling_beta=gligen_scheduled_sampling_beta,
+            guidance_scale=guidance_scale, return_saved_cross_attn=False,
+            return_box_vis=True, save_all_latents=True, batched_condition=True, scheduler_key=scheduler_key
+        )
+
+        single_object_images.append(single_object_images_batch)
+        single_object_pil_images_box_ann.append(single_object_pil_images_box_ann_batch)
+        latents_all.append(latents_all_batch)
+
+    single_object_images, single_object_pil_images_box_ann, latents_all = np.concatenate(single_object_images, axis=0), sum(single_object_pil_images_box_ann, []), torch.cat(latents_all, dim=1)
+
+    mask_selected, conf_score_selected = sam.sam_refine_boxes(sam_input_images=single_object_images, boxes=bboxes, model_dict=model_dict, verbose=verbose, **sam_refine_kwargs)
+
+    # mask_selected: List[List[Array of shape (64, 64)]]
+
+    mask_selected = np.array(mask_selected)[:, 0]
+
     mask_selected_tensor = torch.tensor(mask_selected)
 
+    latents_all = latents_all.transpose(0,1)[:,:,None,...]
+
+    return latents_all, mask_selected_tensor, single_object_pil_images_box_ann
 
 def get_masked_latents_all_list(so_prompt_phrase_word_box_list, input_latents_list, so_input_embeddings, verbose=False, **kwargs):
-    latents_all_list, mask_tensor_list …
+    latents_all_list, mask_tensor_list = [], []
 
     if not so_prompt_phrase_word_box_list:
        return latents_all_list, mask_tensor_list
 
-    …
-    for …
-        …
-        mask_tensor_list.append(mask_tensor)
-        so_img_list.append(so_img)
+    prompts, bboxes, phrases, words = [], [], [], []
+
+    for prompt, phrase, word, box in so_prompt_phrase_word_box_list:
+        prompts.append(prompt)
+        bboxes.append(box)
+        phrases.append(phrase)
+        words.append(word)
+
+    latents_all_list, mask_tensor_list, so_img_list = generate_single_object_with_box_batch(prompts, bboxes, phrases, words, input_latents_list, input_embeddings=so_input_embeddings, verbose=verbose, **kwargs)
 
     return latents_all_list, mask_tensor_list, so_img_list
 

@@ -77,7 +106,7 @@ def get_masked_latents_all_list(so_prompt_phrase_word_box_list, input_latents_li
 def run(
     spec, bg_seed = 1, overall_prompt_override="", fg_seed_start = 20, frozen_step_ratio=0.4, gligen_scheduled_sampling_beta = 0.3, num_inference_steps = 20,
     so_center_box = False, fg_blending_ratio = 0.1, scheduler_key='dpm_scheduler', so_negative_prompt = DEFAULT_SO_NEGATIVE_PROMPT, overall_negative_prompt = DEFAULT_OVERALL_NEGATIVE_PROMPT, so_horizontal_center_only = True,
-    align_with_overall_bboxes = False, horizontal_shift_only = True, use_autocast = False
+    align_with_overall_bboxes = False, horizontal_shift_only = True, use_autocast = False, so_batch_size = None
 ):
     """
     so_center_box: using centered box in single object generation

@@ -130,7 +159,7 @@ def run(
     latents_all_list, mask_tensor_list, so_img_list = get_masked_latents_all_list(
         so_prompt_phrase_word_box_list, input_latents_list,
         gligen_scheduled_sampling_beta=gligen_scheduled_sampling_beta,
-        sam_refine_kwargs=sam_refine_kwargs, so_input_embeddings=so_input_embeddings, num_inference_steps=num_inference_steps, scheduler_key=scheduler_key, verbose=verbose
+        sam_refine_kwargs=sam_refine_kwargs, so_input_embeddings=so_input_embeddings, num_inference_steps=num_inference_steps, scheduler_key=scheduler_key, verbose=verbose, batch_size=so_batch_size
     )
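The core of the new generate_single_object_with_box_batch is a plain ceil-division mini-batching loop around pipelines.generate_gligen. A minimal, self-contained sketch of that pattern, with a stub in place of the real GLIGEN call so shapes can be checked without loading any model:

# Sketch of the mini-batching pattern above; run_one_batch is a stand-in for
# pipelines.generate_gligen and is not part of the repository code.
import numpy as np
import torch

def run_in_batches(input_latents, bboxes, phrases, run_one_batch, batch_size=None):
    n = len(bboxes)
    if batch_size is None:
        batch_size = n  # batch_size=None: pass everything in one batch
    outputs = []
    for i in range(int(np.ceil(n / batch_size))):
        sl = slice(i * batch_size, (i + 1) * batch_size)
        outputs.append(run_one_batch(input_latents[sl], bboxes[sl], phrases[sl]))
    return torch.cat(outputs, dim=0)  # concatenate per-batch results back together

# Shape check: 5 items with batch_size=2 means 3 calls to the stub
latents = torch.zeros(5, 4, 64, 64)
boxes = [[(0.1, 0.1, 0.5, 0.5)]] * 5
phrases = [["a gray cat"]] * 5
out = run_in_batches(latents, boxes, phrases, lambda l, b, p: l, batch_size=2)
assert out.shape == latents.shape

The real function additionally stacks the saved per-step latents along dim=1 and appears to reshape them with latents_all.transpose(0,1)[:,:,None,...] so that each object again looks like a batch of one to the downstream composition code.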
models/models.py CHANGED

@@ -75,20 +75,22 @@ def encode_prompts(tokenizer, text_encoder, prompts, negative_prompt="", return_
         return text_embeddings
     return text_embeddings, uncond_embeddings, cond_embeddings
 
-def …
-    …
+def process_input_embeddings(input_embeddings):
+    assert isinstance(input_embeddings, (tuple, list))
+    if len(input_embeddings) == 3:
+        # input_embeddings: text_embeddings, uncond_embeddings, cond_embeddings
+        # Assume `uncond_embeddings` is full (has batch size the same as cond_embeddings)
+        _, uncond_embeddings, cond_embeddings = input_embeddings
+        assert uncond_embeddings.shape[0] == cond_embeddings.shape[0], f"{uncond_embeddings.shape[0]} != {cond_embeddings.shape[0]}"
+        return input_embeddings
+    elif len(input_embeddings) == 2:
+        # input_embeddings: uncond_embeddings, cond_embeddings
+        # uncond_embeddings may have only one item
+        uncond_embeddings, cond_embeddings = input_embeddings
+        if uncond_embeddings.shape[0] == 1:
+            uncond_embeddings = uncond_embeddings.expand(cond_embeddings.shape)
+        # We follow the convention: negative (unconditional) prompt comes first
+        text_embeddings = torch.cat((uncond_embeddings, cond_embeddings), dim=0)
+        return text_embeddings, uncond_embeddings, cond_embeddings
+    else:
+        raise ValueError(f"input_embeddings length: {len(input_embeddings)}")
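process_input_embeddings normalizes the two embedding conventions used in the repo into (text_embeddings, uncond_embeddings, cond_embeddings). A small usage sketch; the 77x768 shapes are illustrative CLIP-like values, and the absolute import path assumes the repo root is on sys.path:

# Hedged usage sketch for process_input_embeddings above.
import torch
from models.models import process_input_embeddings  # assumes repo root on sys.path

cond = torch.randn(4, 77, 768)    # embeddings for 4 single-object prompts
uncond = torch.randn(1, 77, 768)  # one shared negative-prompt embedding

text_embeddings, uncond_embeddings, cond_embeddings = process_input_embeddings((uncond, cond))
assert uncond_embeddings.shape == cond_embeddings.shape          # broadcast to batch size 4
assert text_embeddings.shape[0] == 2 * cond_embeddings.shape[0]  # unconditional half first, then conditional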
models/pipelines.py CHANGED

@@ -5,7 +5,7 @@ from PIL import Image
 import gc
 import numpy as np
 from .attention import GatedSelfAttentionDense
-from .models import torch_device
+from .models import process_input_embeddings, torch_device
 
 @torch.no_grad()
 def encode(model_dict, image, generator):

@@ -88,17 +88,56 @@ def gligen_enable_fuser(unet, enabled=True):
         if isinstance(module, GatedSelfAttentionDense):
             module.enabled = enabled
 
+def prepare_gligen_condition(bboxes, phrases, dtype, tokenizer, text_encoder, num_images_per_prompt):
+    batch_size = len(bboxes)
+
+    assert len(phrases) == len(bboxes)
+    max_objs = 30
+
+    n_objs = min(max([len(bboxes_item) for bboxes_item in bboxes]), max_objs)
+    boxes = torch.zeros((batch_size, max_objs, 4), device=torch_device, dtype=dtype)
+    phrase_embeddings = torch.zeros((batch_size, max_objs, 768), device=torch_device, dtype=dtype)
+    # masks is a 1D tensor deciding which of the enteries to be enabled
+    masks = torch.zeros((batch_size, max_objs), device=torch_device, dtype=dtype)
+
+    if n_objs > 0:
+        for idx, (bboxes_item, phrases_item) in enumerate(zip(bboxes, phrases)):
+            # the length of `bboxes_item` could be smaller than `n_objs` because n_objs takes the max of item length
+            bboxes_item = torch.tensor(bboxes_item[:n_objs])
+            boxes[idx, :bboxes_item.shape[0]] = bboxes_item
+
+            tokenizer_inputs = tokenizer(phrases_item[:n_objs], padding=True, return_tensors="pt").to(torch_device)
+            _phrase_embeddings = text_encoder(**tokenizer_inputs).pooler_output
+            phrase_embeddings[idx, :_phrase_embeddings.shape[0]] = _phrase_embeddings
+            assert bboxes_item.shape[0] == _phrase_embeddings.shape[0], f"{bboxes_item.shape[0]} != {_phrase_embeddings.shape[0]}"
+
+            masks[idx, :bboxes_item.shape[0]] = 1
+
+    # Classifier-free guidance
+    repeat_times = num_images_per_prompt * 2
+    condition_len = batch_size * repeat_times
+
+    boxes = boxes.repeat(repeat_times, 1, 1)
+    phrase_embeddings = phrase_embeddings.repeat(repeat_times, 1, 1)
+    masks = masks.repeat(repeat_times, 1)
+    masks[:condition_len // 2] = 0
+
+    # print("shapes:", boxes.shape, phrase_embeddings.shape, masks.shape)
+
+    return boxes, phrase_embeddings, masks, condition_len
+
 @torch.no_grad()
 def generate_gligen(model_dict, latents, input_embeddings, num_inference_steps, bboxes, phrases, num_images_per_prompt=1, gligen_scheduled_sampling_beta: float = 0.3, guidance_scale=7.5,
                     frozen_steps=20, frozen_mask=None,
                     return_saved_cross_attn=False, saved_cross_attn_keys=None, return_cond_ca_only=False, return_token_ca_only=None,
                     offload_cross_attn_to_cpu=False, offload_latents_to_cpu=True,
-                    return_box_vis=False, show_progress=True, save_all_latents=False, scheduler_key='dpm_scheduler'):
+                    return_box_vis=False, show_progress=True, save_all_latents=False, scheduler_key='dpm_scheduler', batched_condition=False):
     """
     The `bboxes` should be a list, rather than a list of lists (one box per phrase, we can have multiple duplicated phrases).
     """
     vae, tokenizer, text_encoder, unet, scheduler, dtype = model_dict.vae, model_dict.tokenizer, model_dict.text_encoder, model_dict.unet, model_dict[scheduler_key], model_dict.dtype
-
+
+    text_embeddings, _, cond_embeddings = process_input_embeddings(input_embeddings)
 
     if latents.dim() == 5:
         # latents_all from the input side, different from the latents_all to be saved

@@ -122,33 +161,12 @@ def generate_gligen(model_dict, latents, input_embeddings, num_inference_steps,
     if frozen_mask is not None:
         frozen_mask = frozen_mask.to(dtype=dtype).clamp(0., 1.)
 
-    batch_size = 1
-
     # 5.1 Prepare GLIGEN variables
-    …
-    _boxes = bboxes
-
-    boxes = torch.zeros(max_objs, 4, device=torch_device, dtype=dtype)
-    phrase_embeddings = torch.zeros(max_objs, 768, device=torch_device, dtype=dtype)
-    masks = torch.zeros(max_objs, device=torch_device, dtype=dtype)
-
-    if n_objs > 0:
-        boxes[:n_objs] = torch.tensor(_boxes[:n_objs])
-        tokenizer_inputs = tokenizer(phrases, padding=True, return_tensors="pt").to(torch_device)
-        _phrase_embeddings = text_encoder(**tokenizer_inputs).pooler_output
-        phrase_embeddings[:n_objs] = _phrase_embeddings[:n_objs]
-        masks[:n_objs] = 1
-
-    # Classifier-free guidance
-    repeat_batch = batch_size * num_images_per_prompt * 2
-
-    boxes = boxes.unsqueeze(0).expand(repeat_batch, -1, -1).clone()
-    phrase_embeddings = phrase_embeddings.unsqueeze(0).expand(repeat_batch, -1, -1).clone()
-    masks = masks.unsqueeze(0).expand(repeat_batch, -1).clone()
-    masks[:repeat_batch // 2] = 0
+    if not batched_condition:
+        # Add batch dimension to bboxes and phrases
+        bboxes, phrases = [bboxes], [phrases]
+
+    boxes, phrase_embeddings, masks, condition_len = prepare_gligen_condition(bboxes, phrases, dtype, tokenizer, text_encoder, num_images_per_prompt)
 
     if return_saved_cross_attn:
         saved_attns = []

@@ -215,7 +233,7 @@ def generate_gligen(model_dict, latents, input_embeddings, num_inference_steps,
     if return_saved_cross_attn:
         ret.append(saved_attns)
     if return_box_vis:
-        pil_images = [utils.draw_box(Image.fromarray(image), …
+        pil_images = [utils.draw_box(Image.fromarray(image), bboxes_item, phrases_item) for image, bboxes_item, phrases_item in zip(images, bboxes, phrases)]
         ret.append(pil_images)
     if save_all_latents:
         latents_all = torch.stack(latents_all, dim=0)
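prepare_gligen_condition pads every batch item to max_objs grounding slots and then duplicates the tensors for classifier-free guidance, zeroing the masks of the unconditional half (which comes first, matching the convention in process_input_embeddings). A shape-only sketch of that layout, with random numbers standing in for the CLIP phrase embeddings; not part of the commit:

# Shape-level sketch only; the real function encodes phrases with the CLIP text encoder.
import torch

batch_size, max_objs = 3, 30
repeat_times = 2  # num_images_per_prompt * 2 for classifier-free guidance
bboxes = [[[0.1, 0.2, 0.5, 0.6]],
          [[0.3, 0.3, 0.9, 0.9], [0.0, 0.0, 0.4, 0.4]],
          [[0.2, 0.1, 0.8, 0.7]]]

boxes = torch.zeros(batch_size, max_objs, 4)
phrase_embeddings = torch.zeros(batch_size, max_objs, 768)  # 768 = CLIP pooled embedding size
masks = torch.zeros(batch_size, max_objs)
for idx, item in enumerate(bboxes):
    boxes[idx, :len(item)] = torch.tensor(item)
    phrase_embeddings[idx, :len(item)] = torch.randn(len(item), 768)
    masks[idx, :len(item)] = 1  # only the filled slots are enabled

# Duplicate for CFG; the first copy is the unconditional branch, so its grounding is disabled.
condition_len = batch_size * repeat_times
boxes = boxes.repeat(repeat_times, 1, 1)                 # (condition_len, max_objs, 4)
phrase_embeddings = phrase_embeddings.repeat(repeat_times, 1, 1)
masks = masks.repeat(repeat_times, 1)
masks[:condition_len // 2] = 0
assert boxes.shape == (condition_len, max_objs, 4)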
models/sam.py CHANGED

@@ -2,6 +2,7 @@ import gc
 import matplotlib.pyplot as plt
 import numpy as np
 import torch
+import torch.nn.functional as F
 from models import torch_device
 from transformers import SamModel, SamProcessor
 import utils

@@ -20,10 +21,18 @@ def load_sam():
 
 # Not fully backward compatible with the previous implementation
 # Reference: lmdv2/notebooks/gen_masked_latents_multi_object_ref_ca_loss_modular.ipynb
-def sam(sam_model_dict, image, input_points=None, input_boxes=None, target_mask_shape=None):
+def sam(sam_model_dict, image, input_points=None, input_boxes=None, target_mask_shape=None, return_numpy=True):
    """target_mask_shape: (h, w)"""
    sam_model, sam_processor = sam_model_dict['sam_model'], sam_model_dict['sam_processor']
 
+    if input_boxes and isinstance(input_boxes[0], tuple):
+        # Convert tuple to list
+        input_boxes = [list(input_box) for input_box in input_boxes]
+
+    if input_boxes and input_boxes[0] and isinstance(input_boxes[0][0], tuple):
+        # Convert tuple to list
+        input_boxes = [[list(input_box) for input_box in input_boxes_item] for input_boxes_item in input_boxes]
+
     with torch.no_grad():
         with torch.autocast(torch_device):
             inputs = sam_processor(image, input_points=input_points, input_boxes=input_boxes, return_tensors="pt").to(torch_device)

@@ -31,18 +40,17 @@ def sam(sam_model_dict, image, input_points=None, input_boxes=None, target_mask_
             masks = sam_processor.image_processor.post_process_masks(
                 outputs.pred_masks.cpu().float(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()
             )
-            conf_scores = outputs.iou_scores. …
+            conf_scores = outputs.iou_scores.cpu().numpy()[0,0]
             del inputs, outputs
-
-            gc.collect()
-            if torch_device == "cuda":
-                torch.cuda.empty_cache()
-
-            masks = masks[0][0].numpy()
 
-            …
+    gc.collect()
+    torch.cuda.empty_cache()
 
+    if return_numpy:
+        masks = [F.interpolate(masks_item.type(torch.float), target_mask_shape, mode='bilinear').type(torch.bool).numpy() for masks_item in masks]
+    else:
+        masks = [F.interpolate(masks_item.type(torch.float), target_mask_shape, mode='bilinear').type(torch.bool) for masks_item in masks]
+
     return masks, conf_scores
 
 def sam_point_input(sam_model_dict, image, input_points, **kwargs):

@@ -154,26 +162,39 @@ def sam_refine_attn(sam_input_image, token_attn_np, model_dict, height, width, H
 
     return mask_selected, conf_score_selected
 
-def sam_refine_box(sam_input_image, box, …
+def sam_refine_box(sam_input_image, box, *args, **kwargs):
+    sam_input_images, boxes = [sam_input_image], [box]
+    return sam_refine_boxes(sam_input_images, boxes, *args, **kwargs)
+
+def sam_refine_boxes(sam_input_images, boxes, model_dict, height, width, H, W, discourage_mask_below_confidence, discourage_mask_below_coarse_iou, verbose):
     # (w, h)
-    input_boxes = utils.scale_proportion(box, H=height, W=width)
-    input_boxes = [input_boxes]
+    input_boxes = [[utils.scale_proportion(box, H=height, W=width) for box in boxes_item] for boxes_item in boxes]
 
-    masks, conf_scores = sam_box_input(model_dict, image= …
+    masks, conf_scores = sam_box_input(model_dict, image=sam_input_images, input_boxes=input_boxes, target_mask_shape=(H, W))
 
-    if verbose:
-        # Also the box is the input for SAM
-        plt.title("Binary mask from input box (for iou)")
-        plt.imshow(mask_binary)
-        plt.show()
-    …
+    mask_selected_batched_list, conf_score_selected_batched_list = [], []
 
+    for boxes_item, masks_item in zip(boxes, masks):
+        mask_selected_list, conf_score_selected_list = [], []
+        for box, three_masks in zip(boxes_item, masks_item):
+            mask_binary = utils.proportion_to_mask(box, H, W, return_np=True)
+            if verbose:
+                # Also the box is the input for SAM
+                plt.title("Binary mask from input box (for iou)")
+                plt.imshow(mask_binary)
+                plt.show()
+
+            coarse_ious = get_iou_with_resize(mask_binary, three_masks, masks_shape=mask_binary.shape)
+
+            mask_selected, conf_score_selected = select_mask(three_masks, conf_scores, coarse_ious=coarse_ious,
+                                                             rule="largest_over_conf",
+                                                             discourage_mask_below_confidence=discourage_mask_below_confidence,
+                                                             discourage_mask_below_coarse_iou=discourage_mask_below_coarse_iou,
+                                                             verbose=True)
+
+            mask_selected_list.append(mask_selected)
+            conf_score_selected_list.append(conf_score_selected)
+        mask_selected_batched_list.append(mask_selected_list)
+        conf_score_selected_batched_list.append(conf_score_selected_list)
+
+    return mask_selected_batched_list, conf_score_selected_batched_list