Commit · 936cd75
Parent(s): 40ca865

update code
Files changed:
- .gitignore +1 -7
- ConsistentID/app.py +2 -2
- adaface/adaface_infer.py +10 -15
- adaface/adaface_translate.py +53 -33
- adaface/adaface_wrapper.py +366 -72
- adaface/diffusers_attn_lora_capture.py +656 -0
- adaface/face_id_to_ada_prompt.py +253 -124
- adaface/subj_basis_generator.py +97 -59
- adaface/unet_teachers.py +86 -49
- adaface/util.py +21 -18
- app.py +299 -101
.gitignore
CHANGED
@@ -1,10 +1,4 @@
-models/awportrait/*
-models/awportrait
 __pycache__/*
 __pycache__
-samples-ada/*
-samples-ada
-models/ensemble/awp14-unet/*
-models/ensemble/awp14-unet
 .gradio/certificate.pem
-
+models/*
ConsistentID/app.py
CHANGED
@@ -26,8 +26,8 @@ pipe = ConsistentIDPipeline.from_pretrained(
 
 ### Load consistentID_model checkpoint
 pipe.load_ConsistentID_model(
-    consistentID_weight_path="./models/ConsistentID-v1.bin",
-    bise_net_weight_path="./models/BiSeNet_pretrained_for_ConsistentID.pth",
+    consistentID_weight_path="./models/ConsistentID/ConsistentID-v1.bin",
+    bise_net_weight_path="./models/ConsistentID/BiSeNet_pretrained_for_ConsistentID.pth",
 )
 pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
 pipe = pipe.to(device, torch.float16)
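Since the ConsistentID checkpoints now live one directory deeper, a quick existence check before loading can save a confusing stack trace. The paths below mirror the diff; the check itself is our own sketch, not code from the repo.

import os

# Paths as used by ConsistentID/app.py after this commit.
consistentID_weight_path = "./models/ConsistentID/ConsistentID-v1.bin"
bise_net_weight_path = "./models/ConsistentID/BiSeNet_pretrained_for_ConsistentID.pth"

for path in (consistentID_weight_path, bise_net_weight_path):
    if not os.path.isfile(path):
        # Hypothetical guard: download the checkpoints into models/ConsistentID/ first.
        raise FileNotFoundError(f"Expected checkpoint at {path}")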
adaface/adaface_infer.py
CHANGED
@@ -45,8 +45,7 @@ def parse_args():
                         help="Type of pipeline to use (default: txt2img)")
     parser.add_argument("--base_model_path", type=str, default=None,
                         help="Type of checkpoints to use (default: None, using the official model)")
-    parser.add_argument('--…
-                        default=['models/adaface/subjects-celebrity2024-05-16T17-22-46_zero3-ada-30000.pt'])
+    parser.add_argument('--adaface_ckpt_path', type=str, required=True)
     parser.add_argument("--adaface_encoder_types", type=str, nargs="+", default=["consistentID", "arc2face"],
                         choices=["arc2face", "consistentID"], help="Type(s) of the ID2Ada prompt encoders")
     parser.add_argument("--enabled_encoders", type=str, nargs="+", default=None,
@@ -60,23 +59,18 @@ def parse_args():
     parser.add_argument("--extra_unet_dirpaths", type=str, nargs="*",
                         default=[],
                         help="Extra paths to the checkpoints of the UNet models")
-    parser.add_argument('--…
+    parser.add_argument('--unet_weights_in_ensemble', type=float, nargs="+", default=[1],
                         help="Weights for the UNet models")
     parser.add_argument("--subject", type=str)
     parser.add_argument("--example_image_count", type=int, default=-1, help="Number of example images to use")
     parser.add_argument("--out_image_count", type=int, default=4, help="Number of images to generate")
     parser.add_argument("--prompt", type=str, default="a woman z in superman costume")
-    parser.add_argument("--…
+    parser.add_argument("--perturb_std", type=float, default=0)
     parser.add_argument("--randface", action="store_true")
     parser.add_argument("--scale", dest='guidance_scale', type=float, default=4,
                         help="Guidance scale for the diffusion model")
-    parser.add_argument("--subject_string",
-                        type=str, default="z",
-                        help="Subject placeholder string used in prompts to denote the concept.")
     parser.add_argument("--num_images_per_row", type=int, default=4,
                         help="Number of images to display in a row in the output grid image.")
-    parser.add_argument("--num_inference_steps", type=int, default=50,
-                        help="Number of inference steps")
     parser.add_argument("--device", type=str, default="cuda", help="Device to run the model on")
     parser.add_argument("--seed", type=int, default=42,
                         help="the seed (for reproducible sampling). Set to -1 to disable.")
@@ -95,16 +89,15 @@ if __name__ == "__main__":
 
     if args.pipeline not in ["text2img", "img2img"]:
         args.extra_unet_dirpaths = None
-        args.…
+        args.unet_weights_in_ensemble = None
 
     adaface = AdaFaceWrapper(args.pipeline, args.base_model_path,
-                             args.adaface_encoder_types, args.…
+                             args.adaface_encoder_types, args.adaface_ckpt_path,
                              args.adaface_encoder_cfg_scales, args.enabled_encoders,
-                             args.subject_string, args.num_inference_steps,
                              unet_types=None,
                              main_unet_filepath=args.main_unet_filepath,
                              extra_unet_dirpaths=args.extra_unet_dirpaths,
-                             …
+                             unet_weights_in_ensemble=args.unet_weights_in_ensemble, device=args.device)
 
     if not args.randface:
         image_folder = args.subject
@@ -143,7 +136,7 @@ if __name__ == "__main__":
         rand_init_id_embs = torch.randn(1, 512)
 
     init_id_embs = rand_init_id_embs if args.randface else None
-    …
+    init_noise = torch.randn(args.out_image_count, 4, 64, 64).cuda()
     # args.perturb_std: the *relative* std of the noise added to the face embeddings.
     # A noise level of 0.08 could change gender, but 0.06 is usually safe.
     # adaface_subj_embs is not used. It is generated for the purpose of updating the text encoder (within this function call).
@@ -151,5 +144,7 @@ if __name__ == "__main__":
     adaface.prepare_adaface_embeddings(image_paths, init_id_embs,
                                        perturb_at_stage='img_prompt_emb',
                                        perturb_std=args.perturb_std, update_text_encoder=True)
-    images = adaface(…
+    images = adaface(init_noise, args.prompt, None, None,
+                     'append', args.guidance_scale,
+                     args.out_image_count, verbose=True)
     save_images(images, args.num_images_per_row, subject_name, f"guide{args.guidance_scale}", args.perturb_std)
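For orientation, here is a minimal sketch of the calling convention that adaface_infer.py now uses after this commit: the checkpoint path and ensemble weights go into the wrapper constructor, and the forward call takes the initial latent noise explicitly. Only the argument structure is taken from the diff; the checkpoint path, image path, and encoder CFG scales are illustrative placeholders.

import torch
from adaface.adaface_wrapper import AdaFaceWrapper

adaface = AdaFaceWrapper(
    "text2img", None,                      # pipeline name, base model path (None -> built-in default)
    ["consistentID", "arc2face"],          # --adaface_encoder_types
    "models/adaface/adaface.pt",           # --adaface_ckpt_path (illustrative path)
    [6.0, 6.0], None,                      # encoder CFG scales (illustrative), enabled_encoders
    unet_types=None, main_unet_filepath=None, extra_unet_dirpaths=[],
    unet_weights_in_ensemble=[1], device="cuda")

# Map the subject photos to ada prompt embeddings and patch the text encoder.
adaface.prepare_adaface_embeddings(["subjects/alice/1.jpg"], None,
                                   perturb_at_stage='img_prompt_emb',
                                   perturb_std=0, update_text_encoder=True)

# The forward call now takes the initial latent noise as its first argument.
init_noise = torch.randn(4, 4, 64, 64).cuda()
images = adaface(init_noise, "a woman z in superman costume", None, None,
                 'append', 4.0, 4, verbose=True)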
adaface/adaface_translate.py
CHANGED
@@ -25,10 +25,9 @@ def seed_everything(seed):
 
 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--base_model_path", type=str, default='models/…
-                        help="Path to the UNet checkpoint (…
-    parser.add_argument('--…
-                        default=['models/adaface/subjects-celebrity2024-05-16T17-22-46_zero3-ada-30000.pt'])
+    parser.add_argument("--base_model_path", type=str, default='models/sar/sar.safetensors',
+                        help="Path to the UNet checkpoint (Default: SAR)")
+    parser.add_argument('--adaface_ckpt_path', type=str, required=True)
     parser.add_argument("--adaface_encoder_types", type=str, nargs="+", default=["consistentID", "arc2face"],
                         choices=["arc2face", "consistentID"], help="Type(s) of the ID2Ada prompt encoders")
     parser.add_argument("--enabled_encoders", type=str, nargs="+", default=None,
@@ -40,9 +39,11 @@ def parse_args():
     parser.add_argument('--extra_unet_dirpaths', type=str, nargs="*",
                         default=[],
                         help="Extra paths to the checkpoints of the UNet models")
-    parser.add_argument('--…
+    parser.add_argument('--unet_weights_in_ensemble', type=float, nargs="+", default=[1],
                         help="Weights for the UNet models")
     parser.add_argument("--in_folder", type=str, required=True, help="Path to the folder containing input images")
+    parser.add_argument("--restore_image", type=str, default=None,
+                        help="Path to the image to be restored")
     # If True, the input folder contains images of mixed subjects.
     # If False, the input folder contains multiple subfolders, each of which contains images of the same subject.
     parser.add_argument("--is_mix_subj_folder", type=str2bool, const=True, default=False, nargs="?",
@@ -52,19 +53,14 @@ def parse_args():
     parser.add_argument("--out_folder", type=str, required=True, help="Path to the folder saving output images")
     parser.add_argument("--out_count_per_input_image", type=int, default=1, help="Number of output images to generate per input image")
     parser.add_argument("--copy_masks", action="store_true", help="Copy the mask images to the output folder")
-    parser.add_argument("--…
+    parser.add_argument("--perturb_std", type=float, default=0)
     parser.add_argument("--scale", dest='guidance_scale', type=float, default=4,
                         help="Guidance scale for the diffusion model")
     parser.add_argument("--ref_img_strength", type=float, default=0.8,
                         help="Strength of the reference image in the output image.")
-    parser.add_argument("--subject_string",
-                        type=str, default="z",
-                        help="Subject placeholder string used in prompts to denote the concept.")
     parser.add_argument("--prompt", type=str, default="a person z")
     parser.add_argument("--num_images_per_row", type=int, default=4,
                         help="Number of images to display in a row in the output grid image.")
-    parser.add_argument("--num_inference_steps", type=int, default=50,
-                        help="Number of DDIM inference steps")
     parser.add_argument("--num_gpus", type=int, default=1, help="Number of GPUs to use. If num_gpus > 1, use accelerate for distributed execution.")
     parser.add_argument("--device", type=str, default="cuda", help="Device to run the model on")
     parser.add_argument("--seed", type=int, default=42,
@@ -93,15 +89,16 @@ if __name__ == "__main__":
         process_index = 0
 
     adaface = AdaFaceWrapper("img2img", args.base_model_path,
-                             args.adaface_encoder_types, args.…
+                             args.adaface_encoder_types, args.adaface_ckpt_path,
                              args.adaface_encoder_cfg_scales, args.enabled_encoders,
-                             args.subject_string, args.num_inference_steps,
                              unet_types=None,
-                             extra_unet_dirpaths=args.extra_unet_dirpaths,
+                             extra_unet_dirpaths=args.extra_unet_dirpaths,
+                             unet_weights_in_ensemble=args.unet_weights_in_ensemble,
                              device=args.device)
 
     in_folder = args.in_folder
     if os.path.isfile(in_folder):
+        args.in_folder = os.path.dirname(args.in_folder)
         subject_folders = [ os.path.dirname(in_folder) ]
         images_by_subject = [[in_folder]]
     else:
@@ -157,6 +154,24 @@ if __name__ == "__main__":
         images_by_subject = images_by_subject[process_index::args.num_gpus]
         #subject_folders, images_by_subject = distributed_state.split_between_processes(zip(subject_folders, images_by_subject))
 
+    if args.restore_image is not None:
+        in_images = []
+        for image_path in [args.restore_image]:
+            image = Image.open(image_path).convert("RGB").resize((512, 512))
+            # [512, 512, 3] -> [3, 512, 512].
+            image = np.array(image).transpose(2, 0, 1)
+            # Convert the image to a tensor of shape (1, 3, 512, 512) and move it to the GPU.
+            image = torch.tensor(image).unsqueeze(0).float().cuda()
+            in_images.append(image)
+
+        # Put all input images of the subject into a batch. This assumes max_images_per_subject is small.
+        # NOTE: For simplicity, we do not check overly large batch sizes.
+        in_images = torch.cat(in_images, dim=0)
+        # in_images: [5, 3, 512, 512].
+        # Normalize the pixel values to [0, 1].
+        in_images = in_images / 255.0
+        num_out_images = len(in_images) * args.out_count_per_input_image
+
     for (subject_folder, image_paths) in zip(subject_folders, images_by_subject):
         # If is_mix_subj_folder, then image_paths only contains 1 image, and we use the file name as the signature of the image.
         # Otherwise, we use the folder name as the signature of the images.
@@ -176,29 +191,32 @@ if __name__ == "__main__":
             os.makedirs(subject_out_folder)
         print(f"Output images will be saved to {subject_out_folder}")
 
-        …
+        if args.restore_image is None:
+            in_images = []
+            for image_path in image_paths:
+                image = Image.open(image_path).convert("RGB").resize((512, 512))
+                # [512, 512, 3] -> [3, 512, 512].
+                image = np.array(image).transpose(2, 0, 1)
+                # Convert the image to a tensor of shape (1, 3, 512, 512) and move it to the GPU.
+                image = torch.tensor(image).unsqueeze(0).float().cuda()
+                in_images.append(image)
+
+            # Put all input images of the subject into a batch. This assumes max_images_per_subject is small.
+            # NOTE: For simplicity, we do not check overly large batch sizes.
+            in_images = torch.cat(in_images, dim=0)
+            # in_images: [5, 3, 512, 512].
+            # Normalize the pixel values to [0, 1].
+            in_images = in_images / 255.0
+            num_out_images = len(in_images) * args.out_count_per_input_image
 
         with torch.no_grad():
             # args.perturb_std: the *relative* std of the noise added to the face embeddings.
             # A noise level of 0.08 could change gender, but 0.06 is usually safe.
             # The returned adaface_subj_embs are already incorporated in the text encoder, and not used explicitly.
            # NOTE: We assume out_count_per_input_image == 1, so that the output images are of the same number as the input images.
-            out_images = adaface(in_images, args.prompt, None,…
+            out_images = adaface(in_images, args.prompt, None, None,
+                                 'append', args.guidance_scale, num_out_images,
+                                 ref_img_strength=args.ref_img_strength)
 
         for img_i, img in enumerate(out_images):
             # out_images: subj_1, subj_2, ..., subj_n, subj_1, subj_2, ..., subj_n, ...
@@ -206,9 +224,11 @@ if __name__ == "__main__":
             copy_i = img_i // len(in_images)
             image_filename_stem, image_fileext = os.path.splitext(os.path.basename(image_paths[subj_i]))
             if copy_i == 0:
-                …
+                save_path = os.path.join(subject_out_folder, f"{image_filename_stem}{image_fileext}")
             else:
-                …
+                save_path = os.path.join(subject_out_folder, f"{image_filename_stem}_{copy_i}{image_fileext}")
+            img.save(save_path)
+            print(f"Saved {save_path}")
 
             if args.copy_masks:
                 mask_path = image_paths[subj_i].replace(image_fileext, "_mask.png")
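The image preprocessing added in both branches of the new restore_image logic is identical: load, resize to 512x512, transpose HWC to CHW, stack into a batch, and normalize to [0, 1]. A standalone sketch of just that step is below; the helper name load_images_as_batch is ours, not the repo's.

import numpy as np
import torch
from PIL import Image

def load_images_as_batch(image_paths, size=512, device="cuda"):
    """Load images and stack them into a float batch in [0, 1] of shape [N, 3, size, size]."""
    batch = []
    for path in image_paths:
        img = Image.open(path).convert("RGB").resize((size, size))
        # [H, W, 3] -> [3, H, W]
        arr = np.array(img).transpose(2, 0, 1)
        batch.append(torch.tensor(arr).unsqueeze(0).float())
    return torch.cat(batch, dim=0).to(device) / 255.0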
adaface/adaface_wrapper.py
CHANGED
@@ -8,22 +8,29 @@ from diffusers import (
|
|
8 |
StableDiffusion3Pipeline,
|
9 |
#FluxPipeline,
|
10 |
DDIMScheduler,
|
|
|
|
|
11 |
AutoencoderKL,
|
|
|
12 |
)
|
13 |
from diffusers.loaders.single_file_utils import convert_ldm_unet_checkpoint
|
14 |
from adaface.util import UNetEnsemble
|
15 |
from adaface.face_id_to_ada_prompt import create_id2ada_prompt_encoder
|
|
|
16 |
from safetensors.torch import load_file as safetensors_load_file
|
17 |
import re, os
|
18 |
import numpy as np
|
|
|
19 |
|
20 |
class AdaFaceWrapper(nn.Module):
|
21 |
def __init__(self, pipeline_name, base_model_path, adaface_encoder_types,
|
22 |
adaface_ckpt_paths, adaface_encoder_cfg_scales=None,
|
23 |
-
enabled_encoders=None,
|
24 |
-
subject_string='z',
|
25 |
use_840k_vae=False, use_ds_text_encoder=False,
|
26 |
-
main_unet_filepath=None, unet_types=None, extra_unet_dirpaths=None,
|
|
|
|
|
27 |
device='cuda', is_training=False):
|
28 |
'''
|
29 |
pipeline_name: "text2img", "text2imgxl", "img2img", "text2img3", "flux", or None.
|
@@ -38,15 +45,23 @@ class AdaFaceWrapper(nn.Module):
|
|
38 |
self.adaface_ckpt_paths = adaface_ckpt_paths
|
39 |
self.adaface_encoder_cfg_scales = adaface_encoder_cfg_scales
|
40 |
self.enabled_encoders = enabled_encoders
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
self.subject_string = subject_string
|
|
|
42 |
|
43 |
-
self.
|
|
|
44 |
self.use_840k_vae = use_840k_vae
|
45 |
self.use_ds_text_encoder = use_ds_text_encoder
|
46 |
self.main_unet_filepath = main_unet_filepath
|
47 |
self.unet_types = unet_types
|
48 |
self.extra_unet_dirpaths = extra_unet_dirpaths
|
49 |
-
self.
|
50 |
self.device = device
|
51 |
self.is_training = is_training
|
52 |
|
@@ -62,7 +77,14 @@ class AdaFaceWrapper(nn.Module):
|
|
62 |
self.initialize_pipeline()
|
63 |
# During inference, we never use static image suffix embeddings.
|
64 |
# So num_id_vecs is the length of the returned adaface embeddings for each encoder.
|
65 |
-
self.encoders_num_id_vecs = self.id2ada_prompt_encoder.encoders_num_id_vecs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
self.extend_tokenizer_and_text_encoder()
|
67 |
|
68 |
def to(self, device):
|
@@ -76,7 +98,8 @@ class AdaFaceWrapper(nn.Module):
|
|
76 |
self.id2ada_prompt_encoder = create_id2ada_prompt_encoder(self.adaface_encoder_types,
|
77 |
self.adaface_ckpt_paths,
|
78 |
self.adaface_encoder_cfg_scales,
|
79 |
-
self.enabled_encoders
|
|
|
80 |
|
81 |
self.id2ada_prompt_encoder.to(self.device)
|
82 |
print(f"adaface_encoder_cfg_scales: {self.adaface_encoder_cfg_scales}")
|
@@ -118,10 +141,10 @@ class AdaFaceWrapper(nn.Module):
|
|
118 |
|
119 |
if self.base_model_path is None:
|
120 |
base_model_path_dict = {
|
121 |
-
'text2img':
|
122 |
-
'text2imgxl':
|
123 |
-
'text2img3':
|
124 |
-
'flux':
|
125 |
}
|
126 |
self.base_model_path = base_model_path_dict[self.pipeline_name]
|
127 |
|
@@ -137,6 +160,20 @@ class AdaFaceWrapper(nn.Module):
|
|
137 |
safety_checker=None
|
138 |
)
|
139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
140 |
if self.main_unet_filepath is not None:
|
141 |
print(f"Replacing the UNet with the UNet from {self.main_unet_filepath}.")
|
142 |
ret = pipeline.unet.load_state_dict(self.load_unet_from_file(self.main_unet_filepath, device='cpu'))
|
@@ -147,12 +184,19 @@ class AdaFaceWrapper(nn.Module):
|
|
147 |
|
148 |
if (self.unet_types is not None and len(self.unet_types) > 0) \
|
149 |
or (self.extra_unet_dirpaths is not None and len(self.extra_unet_dirpaths) > 0):
|
150 |
-
unet_ensemble = UNetEnsemble([pipeline.unet], self.unet_types, self.extra_unet_dirpaths, self.
|
151 |
device=self.device, torch_dtype=torch.float16)
|
152 |
pipeline.unet = unet_ensemble
|
153 |
|
154 |
print(f"Loaded pipeline from {self.base_model_path}.")
|
155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
if self.use_840k_vae:
|
157 |
pipeline.vae = vae
|
158 |
print("Replaced the VAE with the 840k-step VAE.")
|
@@ -167,19 +211,56 @@ class AdaFaceWrapper(nn.Module):
|
|
167 |
pipeline.vae = None
|
168 |
print("Removed UNet and VAE from the pipeline.")
|
169 |
|
170 |
-
if self.pipeline_name not in ["text2imgxl", "text2img3", "flux"]:
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
179 |
pipeline.scheduler = noise_scheduler
|
180 |
-
# Otherwise, pipeline.scheduler == FlowMatchEulerDiscreteScheduler
|
|
|
181 |
self.pipeline = pipeline.to(self.device)
|
182 |
|
|
|
|
|
|
|
|
|
183 |
def load_unet_from_file(self, unet_path, device=None):
|
184 |
if os.path.isfile(unet_path):
|
185 |
if unet_path.endswith(".safetensors"):
|
@@ -208,7 +289,109 @@ class AdaFaceWrapper(nn.Module):
|
|
208 |
else:
|
209 |
raise ValueError(f"UNet path {unet_path} is not a file.")
|
210 |
return unet_state_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
def extend_tokenizer_and_text_encoder(self):
|
213 |
if np.sum(self.encoders_num_id_vecs) < 1:
|
214 |
raise ValueError(f"encoders_num_id_vecs has to be larger or equal to 1, but is {self.encoders_num_id_vecs}")
|
@@ -218,6 +401,7 @@ class AdaFaceWrapper(nn.Module):
|
|
218 |
# We add z_0_0, z_0_1, z_0_2, ..., z_0_15, z_1_0, z_1_1, z_1_2, z_1_3 to the tokenizer.
|
219 |
self.all_placeholder_tokens = []
|
220 |
self.placeholder_tokens_strs = []
|
|
|
221 |
for i in range(len(self.adaface_encoder_types)):
|
222 |
placeholder_tokens = []
|
223 |
for j in range(self.encoders_num_id_vecs[i]):
|
@@ -225,9 +409,11 @@ class AdaFaceWrapper(nn.Module):
|
|
225 |
placeholder_tokens_str = " ".join(placeholder_tokens)
|
226 |
|
227 |
self.all_placeholder_tokens.extend(placeholder_tokens)
|
|
|
228 |
self.placeholder_tokens_strs.append(placeholder_tokens_str)
|
229 |
|
230 |
self.all_placeholder_tokens_str = " ".join(self.placeholder_tokens_strs)
|
|
|
231 |
# all_null_placeholder_tokens_str: ", , , , ..." (20 times).
|
232 |
# It just contains the commas and spaces with the same length, but no actual tokens.
|
233 |
self.all_null_placeholder_tokens_str = " ".join([", "] * len(self.all_placeholder_tokens))
|
@@ -241,7 +427,7 @@ class AdaFaceWrapper(nn.Module):
|
|
241 |
|
242 |
print(f"Added {num_added_tokens} tokens ({self.all_placeholder_tokens_str}) to the tokenizer.")
|
243 |
|
244 |
-
# placeholder_token_ids: [49408, ...,
|
245 |
self.placeholder_token_ids = tokenizer.convert_tokens_to_ids(self.all_placeholder_tokens)
|
246 |
#print("New tokens:", self.placeholder_token_ids)
|
247 |
# Resize the token embeddings as we are adding new special tokens to the tokenizer
|
@@ -252,24 +438,49 @@ class AdaFaceWrapper(nn.Module):
|
|
252 |
|
253 |
# Extend pipeline.text_encoder with the adaface subject emeddings.
|
254 |
# subj_embs: [16, 768].
|
255 |
-
def update_text_encoder_subj_embeddings(self, subj_embs):
|
256 |
# Initialise the newly added placeholder token with the embeddings of the initializer token
|
257 |
# token_embeds: [49412, 768]
|
258 |
token_embeds = self.pipeline.text_encoder.get_input_embeddings().weight.data
|
|
|
|
|
|
|
|
|
259 |
with torch.no_grad():
|
260 |
-
|
261 |
-
|
262 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
263 |
|
264 |
def update_prompt(self, prompt, placeholder_tokens_pos='append',
|
|
|
265 |
use_null_placeholders=False):
|
266 |
if prompt is None:
|
267 |
prompt = ""
|
268 |
|
269 |
if use_null_placeholders:
|
270 |
all_placeholder_tokens_str = self.all_null_placeholder_tokens_str
|
|
|
|
|
|
|
271 |
else:
|
272 |
-
all_placeholder_tokens_str = self.
|
273 |
|
274 |
# Delete the subject_string from the prompt.
|
275 |
prompt = re.sub(r'\b(a|an|the)\s+' + self.subject_string + r'\b,?', "", prompt)
|
@@ -279,15 +490,29 @@ class AdaFaceWrapper(nn.Module):
|
|
279 |
# When we do joint training, seems both work better if they are appended to the prompt.
|
280 |
# Therefore we simply appended all placeholder_tokens_str's to the prompt.
|
281 |
# NOTE: Prepending them hurts compositional prompts.
|
282 |
-
if
|
283 |
-
|
284 |
-
|
285 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
286 |
else:
|
287 |
-
|
|
|
|
|
|
|
|
|
|
|
288 |
|
289 |
return prompt
|
290 |
|
|
|
|
|
291 |
# If face_id_embs is None, then it extracts face_id_embs from the images,
|
292 |
# then map them to ada prompt embeddings.
|
293 |
# avg_at_stage: 'id_emb', 'img_prompt_emb', or None.
|
@@ -298,27 +523,29 @@ class AdaFaceWrapper(nn.Module):
|
|
298 |
perturb_at_stage=None, # id_emb, img_prompt_emb, or None.
|
299 |
perturb_std=0, update_text_encoder=True):
|
300 |
|
301 |
-
all_adaface_subj_embs = \
|
302 |
self.id2ada_prompt_encoder.generate_adaface_embeddings(\
|
303 |
image_paths, face_id_embs=face_id_embs,
|
304 |
img_prompt_embs=None,
|
305 |
avg_at_stage=avg_at_stage,
|
306 |
perturb_at_stage=perturb_at_stage,
|
307 |
perturb_std=perturb_std,
|
308 |
-
enable_static_img_suffix_embs=
|
309 |
|
310 |
if all_adaface_subj_embs is None:
|
311 |
return None
|
312 |
|
|
|
|
|
313 |
if all_adaface_subj_embs.ndim == 4:
|
314 |
-
# [1, 1,
|
315 |
all_adaface_subj_embs = all_adaface_subj_embs.squeeze(0).squeeze(0)
|
316 |
elif all_adaface_subj_embs.ndim == 3:
|
317 |
-
# [1,
|
318 |
all_adaface_subj_embs = all_adaface_subj_embs.squeeze(0)
|
319 |
|
320 |
if update_text_encoder:
|
321 |
-
self.update_text_encoder_subj_embeddings(all_adaface_subj_embs)
|
322 |
return all_adaface_subj_embs
|
323 |
|
324 |
def diffusers_encode_prompts(self, prompt, plain_prompt, negative_prompt, device):
|
@@ -368,6 +595,7 @@ class AdaFaceWrapper(nn.Module):
|
|
368 |
else:
|
369 |
breakpoint()
|
370 |
else:
|
|
|
371 |
# prompt_embeds_, negative_prompt_embeds_: [1, 77, 768]
|
372 |
prompt_embeds_, negative_prompt_embeds_ = \
|
373 |
self.pipeline.encode_prompt(prompt, device=device,
|
@@ -378,9 +606,53 @@ class AdaFaceWrapper(nn.Module):
|
|
378 |
return prompt_embeds_, negative_prompt_embeds_, \
|
379 |
pooled_prompt_embeds_, negative_pooled_prompt_embeds_
|
380 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
381 |
def encode_prompt(self, prompt, negative_prompt=None,
|
382 |
placeholder_tokens_pos='append',
|
383 |
-
|
|
|
|
|
|
|
|
|
384 |
device=None, verbose=False):
|
385 |
if negative_prompt is None:
|
386 |
negative_prompt = self.negative_prompt
|
@@ -389,59 +661,81 @@ class AdaFaceWrapper(nn.Module):
|
|
389 |
device = self.device
|
390 |
|
391 |
plain_prompt = prompt
|
392 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
393 |
if verbose:
|
394 |
print(f"Subject prompt:\n{prompt}")
|
395 |
|
396 |
-
if do_neg_id_prompt_weight > 0:
|
397 |
-
# Use 'prepend' for the negative prompt, since it's long and we want to make sure
|
398 |
-
# the placeholder tokens are not cut off.
|
399 |
-
negative_prompt0 = negative_prompt
|
400 |
-
negative_prompt = self.update_prompt(negative_prompt0, placeholder_tokens_pos='prepend')
|
401 |
-
null_negative_prompt = self.update_prompt(negative_prompt0, placeholder_tokens_pos='prepend',
|
402 |
-
use_null_placeholders=True)
|
403 |
-
''' if verbose:
|
404 |
-
print(f"Negative prompt:\n{negative_prompt}")
|
405 |
-
print(f"Null negative prompt:\n{null_negative_prompt}")
|
406 |
-
|
407 |
-
'''
|
408 |
-
else:
|
409 |
-
null_negative_prompt = None
|
410 |
-
|
411 |
# For some unknown reason, the text_encoder is still on CPU after self.pipeline.to(self.device).
|
412 |
# So we manually move it to GPU here.
|
413 |
self.pipeline.text_encoder.to(device)
|
414 |
|
415 |
prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, negative_pooled_prompt_embeds_ = \
|
416 |
self.diffusers_encode_prompts(prompt, plain_prompt, negative_prompt, device)
|
417 |
-
|
418 |
-
if
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
424 |
return prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, negative_pooled_prompt_embeds_
|
425 |
|
426 |
# ref_img_strength is used only in the img2img pipeline.
|
427 |
-
def forward(self, noise, prompt, negative_prompt=None,
|
428 |
placeholder_tokens_pos='append',
|
429 |
-
do_neg_id_prompt_weight=0,
|
430 |
guidance_scale=6.0, out_image_count=4,
|
431 |
-
ref_img_strength=0.8, generator=None,
|
|
|
|
|
|
|
|
|
|
|
|
|
432 |
noise = noise.to(device=self.device, dtype=torch.float16)
|
|
|
|
|
433 |
|
434 |
if negative_prompt is None:
|
435 |
negative_prompt = self.negative_prompt
|
436 |
# prompt_embeds_, negative_prompt_embeds_: [1, 77, 768]
|
437 |
-
|
438 |
-
|
439 |
-
|
440 |
-
|
441 |
-
|
442 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
443 |
# Repeat the prompt embeddings for all images in the batch.
|
444 |
prompt_embeds_ = prompt_embeds_.repeat(out_image_count, 1, 1)
|
|
|
445 |
if negative_prompt_embeds_ is not None:
|
446 |
negative_prompt_embeds_ = negative_prompt_embeds_.repeat(out_image_count, 1, 1)
|
447 |
|
|
|
8 |
StableDiffusion3Pipeline,
|
9 |
#FluxPipeline,
|
10 |
DDIMScheduler,
|
11 |
+
PNDMScheduler,
|
12 |
+
DPMSolverSinglestepScheduler,
|
13 |
AutoencoderKL,
|
14 |
+
LCMScheduler,
|
15 |
)
|
16 |
from diffusers.loaders.single_file_utils import convert_ldm_unet_checkpoint
|
17 |
from adaface.util import UNetEnsemble
|
18 |
from adaface.face_id_to_ada_prompt import create_id2ada_prompt_encoder
|
19 |
+
from adaface.diffusers_attn_lora_capture import set_up_attn_processors, set_up_ffn_loras, set_lora_and_capture_flags
|
20 |
from safetensors.torch import load_file as safetensors_load_file
|
21 |
import re, os
|
22 |
import numpy as np
|
23 |
+
from peft.utils.constants import DUMMY_TARGET_MODULES
|
24 |
|
25 |
class AdaFaceWrapper(nn.Module):
|
26 |
def __init__(self, pipeline_name, base_model_path, adaface_encoder_types,
|
27 |
adaface_ckpt_paths, adaface_encoder_cfg_scales=None,
|
28 |
+
enabled_encoders=None, use_lcm=False, default_scheduler_name='ddim',
|
29 |
+
num_inference_steps=50, subject_string='z', negative_prompt=None,
|
30 |
use_840k_vae=False, use_ds_text_encoder=False,
|
31 |
+
main_unet_filepath=None, unet_types=None, extra_unet_dirpaths=None, unet_weights_in_ensemble=None,
|
32 |
+
enable_static_img_suffix_embs=None, unet_uses_attn_lora=False,
|
33 |
+
attn_lora_layer_names=['q', 'k', 'v', 'out'], shrink_cross_attn=False, q_lora_updates_query=False,
|
34 |
device='cuda', is_training=False):
|
35 |
'''
|
36 |
pipeline_name: "text2img", "text2imgxl", "img2img", "text2img3", "flux", or None.
|
|
|
45 |
self.adaface_ckpt_paths = adaface_ckpt_paths
|
46 |
self.adaface_encoder_cfg_scales = adaface_encoder_cfg_scales
|
47 |
self.enabled_encoders = enabled_encoders
|
48 |
+
# None, or a list of two bools for two encoders. If None, both are disabled.
|
49 |
+
self.enable_static_img_suffix_embs = enable_static_img_suffix_embs
|
50 |
+
self.unet_uses_attn_lora = unet_uses_attn_lora
|
51 |
+
self.attn_lora_layer_names = attn_lora_layer_names
|
52 |
+
self.q_lora_updates_query = q_lora_updates_query
|
53 |
+
self.use_lcm = use_lcm
|
54 |
self.subject_string = subject_string
|
55 |
+
self.shrink_cross_attn = shrink_cross_attn
|
56 |
|
57 |
+
self.default_scheduler_name = default_scheduler_name
|
58 |
+
self.num_inference_steps = num_inference_steps if not use_lcm else 4
|
59 |
self.use_840k_vae = use_840k_vae
|
60 |
self.use_ds_text_encoder = use_ds_text_encoder
|
61 |
self.main_unet_filepath = main_unet_filepath
|
62 |
self.unet_types = unet_types
|
63 |
self.extra_unet_dirpaths = extra_unet_dirpaths
|
64 |
+
self.unet_weights_in_ensemble = unet_weights_in_ensemble
|
65 |
self.device = device
|
66 |
self.is_training = is_training
|
67 |
|
|
|
77 |
self.initialize_pipeline()
|
78 |
# During inference, we never use static image suffix embeddings.
|
79 |
# So num_id_vecs is the length of the returned adaface embeddings for each encoder.
|
80 |
+
self.encoders_num_id_vecs = np.array(self.id2ada_prompt_encoder.encoders_num_id_vecs)
|
81 |
+
self.encoders_num_static_img_suffix_embs = np.array(self.id2ada_prompt_encoder.encoders_num_static_img_suffix_embs)
|
82 |
+
if self.enable_static_img_suffix_embs is not None:
|
83 |
+
assert len(self.enable_static_img_suffix_embs) == len(self.encoders_num_id_vecs)
|
84 |
+
self.encoders_num_static_img_suffix_embs *= np.array(self.enable_static_img_suffix_embs)
|
85 |
+
self.encoders_num_id_vecs += self.encoders_num_static_img_suffix_embs
|
86 |
+
|
87 |
+
self.img_prompt_embs = None
|
88 |
self.extend_tokenizer_and_text_encoder()
|
89 |
|
90 |
def to(self, device):
|
|
|
98 |
self.id2ada_prompt_encoder = create_id2ada_prompt_encoder(self.adaface_encoder_types,
|
99 |
self.adaface_ckpt_paths,
|
100 |
self.adaface_encoder_cfg_scales,
|
101 |
+
self.enabled_encoders,
|
102 |
+
num_static_img_suffix_embs=4)
|
103 |
|
104 |
self.id2ada_prompt_encoder.to(self.device)
|
105 |
print(f"adaface_encoder_cfg_scales: {self.adaface_encoder_cfg_scales}")
|
|
|
141 |
|
142 |
if self.base_model_path is None:
|
143 |
base_model_path_dict = {
|
144 |
+
'text2img': 'models/sd15-dste8-vae.safetensors',
|
145 |
+
'text2imgxl': 'stabilityai/stable-diffusion-xl-base-1.0',
|
146 |
+
'text2img3': 'stabilityai/stable-diffusion-3-medium-diffusers',
|
147 |
+
'flux': 'black-forest-labs/FLUX.1-schnell',
|
148 |
}
|
149 |
self.base_model_path = base_model_path_dict[self.pipeline_name]
|
150 |
|
|
|
160 |
safety_checker=None
|
161 |
)
|
162 |
|
163 |
+
if self.use_lcm:
|
164 |
+
lcm_path_dict = {
|
165 |
+
'text2img': 'latent-consistency/lcm-lora-sdv1-5',
|
166 |
+
'text2imgxl': 'latent-consistency/lcm-lora-sdxl',
|
167 |
+
}
|
168 |
+
if self.pipeline_name not in lcm_path_dict:
|
169 |
+
raise ValueError(f"Pipeline {self.pipeline_name} does not support LCM.")
|
170 |
+
|
171 |
+
lcm_path = lcm_path_dict[self.pipeline_name]
|
172 |
+
pipeline.load_lora_weights(lcm_path)
|
173 |
+
pipeline.fuse_lora()
|
174 |
+
print(f"Loaded LCM weights from {lcm_path}.")
|
175 |
+
pipeline.scheduler = LCMScheduler.from_config(pipeline.scheduler.config)
|
176 |
+
|
177 |
if self.main_unet_filepath is not None:
|
178 |
print(f"Replacing the UNet with the UNet from {self.main_unet_filepath}.")
|
179 |
ret = pipeline.unet.load_state_dict(self.load_unet_from_file(self.main_unet_filepath, device='cpu'))
|
|
|
184 |
|
185 |
if (self.unet_types is not None and len(self.unet_types) > 0) \
|
186 |
or (self.extra_unet_dirpaths is not None and len(self.extra_unet_dirpaths) > 0):
|
187 |
+
unet_ensemble = UNetEnsemble([pipeline.unet], self.unet_types, self.extra_unet_dirpaths, self.unet_weights_in_ensemble,
|
188 |
device=self.device, torch_dtype=torch.float16)
|
189 |
pipeline.unet = unet_ensemble
|
190 |
|
191 |
print(f"Loaded pipeline from {self.base_model_path}.")
|
192 |
+
if not remove_unet and (self.unet_uses_attn_lora or self.shrink_cross_attn):
|
193 |
+
unet2 = self.load_unet_lora_weights(pipeline.unet, use_attn_lora=self.unet_uses_attn_lora,
|
194 |
+
attn_lora_layer_names=self.attn_lora_layer_names,
|
195 |
+
shrink_cross_attn=self.shrink_cross_attn,
|
196 |
+
q_lora_updates_query=self.q_lora_updates_query)
|
197 |
+
|
198 |
+
pipeline.unet = unet2
|
199 |
+
|
200 |
if self.use_840k_vae:
|
201 |
pipeline.vae = vae
|
202 |
print("Replaced the VAE with the 840k-step VAE.")
|
|
|
211 |
pipeline.vae = None
|
212 |
print("Removed UNet and VAE from the pipeline.")
|
213 |
|
214 |
+
if self.pipeline_name not in ["text2imgxl", "text2img3", "flux"] and not self.use_lcm:
|
215 |
+
if self.default_scheduler_name == 'ddim':
|
216 |
+
noise_scheduler = DDIMScheduler(
|
217 |
+
num_train_timesteps=1000,
|
218 |
+
beta_start=0.00085,
|
219 |
+
beta_end=0.012,
|
220 |
+
beta_schedule="scaled_linear",
|
221 |
+
clip_sample=False,
|
222 |
+
set_alpha_to_one=False,
|
223 |
+
steps_offset=1,
|
224 |
+
timestep_spacing="leading",
|
225 |
+
rescale_betas_zero_snr=False,
|
226 |
+
)
|
227 |
+
elif self.default_scheduler_name == 'pndm':
|
228 |
+
noise_scheduler = PNDMScheduler(
|
229 |
+
num_train_timesteps=1000,
|
230 |
+
beta_start=0.00085,
|
231 |
+
beta_end=0.012,
|
232 |
+
beta_schedule="scaled_linear",
|
233 |
+
set_alpha_to_one=False,
|
234 |
+
steps_offset=1,
|
235 |
+
timestep_spacing="leading",
|
236 |
+
skip_prk_steps=True,
|
237 |
+
)
|
238 |
+
elif self.default_scheduler_name == 'dpm++':
|
239 |
+
noise_scheduler = DPMSolverSinglestepScheduler(
|
240 |
+
beta_start=0.00085,
|
241 |
+
beta_end=0.012,
|
242 |
+
beta_schedule="scaled_linear",
|
243 |
+
prediction_type="epsilon",
|
244 |
+
num_train_timesteps=1000,
|
245 |
+
trained_betas=None,
|
246 |
+
thresholding=False,
|
247 |
+
algorithm_type="dpmsolver++",
|
248 |
+
solver_type="midpoint",
|
249 |
+
lower_order_final=True,
|
250 |
+
use_karras_sigmas=True,
|
251 |
+
)
|
252 |
+
else:
|
253 |
+
breakpoint()
|
254 |
+
|
255 |
pipeline.scheduler = noise_scheduler
|
256 |
+
# Otherwise, if not use_lcm, pipeline.scheduler == FlowMatchEulerDiscreteScheduler
|
257 |
+
# if use_lcm, pipeline.scheduler == LCMScheduler
|
258 |
self.pipeline = pipeline.to(self.device)
|
259 |
|
260 |
+
def set_adaface_encoder_cfg_scales(self, adaface_encoder_cfg_scales):
|
261 |
+
self.adaface_encoder_cfg_scales = adaface_encoder_cfg_scales
|
262 |
+
self.id2ada_prompt_encoder.set_out_id_embs_cfg_scale(adaface_encoder_cfg_scales)
|
263 |
+
|
264 |
def load_unet_from_file(self, unet_path, device=None):
|
265 |
if os.path.isfile(unet_path):
|
266 |
if unet_path.endswith(".safetensors"):
|
|
|
289 |
else:
|
290 |
raise ValueError(f"UNet path {unet_path} is not a file.")
|
291 |
return unet_state_dict
|
292 |
+
|
293 |
+
# Adapted from ConsistentIDPipeline:set_ip_adapter().
|
294 |
+
def load_unet_loras(self, unet, unet_lora_modules_state_dict,
|
295 |
+
use_attn_lora=True, use_ffn_lora=False,
|
296 |
+
attn_lora_layer_names=['q', 'k', 'v', 'out'],
|
297 |
+
shrink_cross_attn=False, cross_attn_shrink_factor=0.5,
|
298 |
+
q_lora_updates_query=False):
|
299 |
+
attn_capture_procs, attn_opt_modules = \
|
300 |
+
set_up_attn_processors(unet, use_attn_lora=True, attn_lora_layer_names=attn_lora_layer_names,
|
301 |
+
lora_rank=192, lora_scale_down=8,
|
302 |
+
cross_attn_shrink_factor=cross_attn_shrink_factor,
|
303 |
+
q_lora_updates_query=q_lora_updates_query)
|
304 |
+
# up_blocks.3.resnets.[1~2].conv1, conv2, conv_shortcut. [12] matches 1 or 2.
|
305 |
+
if use_ffn_lora:
|
306 |
+
target_modules_pat = 'up_blocks.3.resnets.[12].conv[a-z0-9_]+'
|
307 |
+
else:
|
308 |
+
# A special pattern, "dummy-target-modules" tells PEFT to add loras on NONE of the layers.
|
309 |
+
# We couldn't simply skip PEFT initialization (converting unet to a PEFT model),
|
310 |
+
# otherwise the attn lora layers will cause nan quickly during a fp16 training.
|
311 |
+
target_modules_pat = DUMMY_TARGET_MODULES
|
312 |
+
|
313 |
+
unet, ffn_lora_layers, ffn_opt_modules = \
|
314 |
+
set_up_ffn_loras(unet, target_modules_pat=target_modules_pat, lora_uses_dora=True)
|
315 |
+
|
316 |
+
# self.attn_capture_procs and ffn_lora_layers will be used in set_lora_and_capture_flags().
|
317 |
+
self.attn_capture_procs = list(attn_capture_procs.values())
|
318 |
+
self.ffn_lora_layers = list(ffn_lora_layers.values())
|
319 |
+
# Combine attn_opt_modules and ffn_opt_modules into unet_lora_modules.
|
320 |
+
# unet_lora_modules is for optimization and loading/saving.
|
321 |
+
unet_lora_modules = {}
|
322 |
+
# attn_opt_modules and ffn_opt_modules have different depths of keys.
|
323 |
+
# attn_opt_modules:
|
324 |
+
# up_blocks_3_attentions_1_transformer_blocks_0_attn2_processor_std_shrink_factor,
|
325 |
+
# up_blocks_3_attentions_1_transformer_blocks_0_attn2_processor_to_q_lora_lora_A, ...
|
326 |
+
# ffn_opt_modules:
|
327 |
+
# base_model_model_up_blocks_3_resnets_1_conv1_lora_A, ...
|
328 |
+
# with the prefix 'base_model_model_'. Because ffn_opt_modules are extracted from the peft-wrapped model,
|
329 |
+
# and attn_opt_modules are extracted from the original unet model.
|
330 |
+
# To be compatible with old param keys, we append 'base_model_model_' to the keys of attn_opt_modules.
|
331 |
+
unet_lora_modules.update({ f'base_model_model_{k}': v for k, v in attn_opt_modules.items() })
|
332 |
+
unet_lora_modules.update(ffn_opt_modules)
|
333 |
+
# ParameterDict can contain both Parameter and nn.Module.
|
334 |
+
# TODO: maybe in the future, we couldn't put nn.Module in nn.ParameterDict.
|
335 |
+
self.unet_lora_modules = torch.nn.ParameterDict(unet_lora_modules)
|
336 |
+
|
337 |
+
missing, unexpected = self.unet_lora_modules.load_state_dict(unet_lora_modules_state_dict, strict=False)
|
338 |
+
if len(missing) > 0:
|
339 |
+
print(f"Missing Keys: {missing}")
|
340 |
+
if len(unexpected) > 0:
|
341 |
+
print(f"Unexpected Keys: {unexpected}")
|
342 |
+
|
343 |
+
print(f"Loaded {len(unet_lora_modules_state_dict)} LoRA weights on the UNet:\n{unet_lora_modules.keys()}")
|
344 |
+
self.outfeat_capture_blocks.append(unet.up_blocks[3])
|
345 |
+
|
346 |
+
# If shrink_cross_attn is True and use_attn_lora is False, we load all these params from ckpt,
|
347 |
+
# but since we set use_attn_lora to False, attn loras won't be used during inference nonetheless.
|
348 |
+
set_lora_and_capture_flags(unet, None, self.attn_capture_procs, self.outfeat_capture_blocks,
|
349 |
+
use_attn_lora, use_ffn_lora, 'recon_loss', capture_ca_activations=False,
|
350 |
+
shrink_cross_attn=shrink_cross_attn)
|
351 |
+
|
352 |
+
return unet
|
353 |
+
|
354 |
+
def load_unet_lora_weights(self, unet, use_attn_lora=True, attn_lora_layer_names=['q', 'k', 'v', 'out'],
|
355 |
+
shrink_cross_attn=False, q_lora_updates_query=False):
|
356 |
+
unet_lora_weight_found = False
|
357 |
+
if isinstance(self.adaface_ckpt_paths, str):
|
358 |
+
adaface_ckpt_paths = [self.adaface_ckpt_paths]
|
359 |
+
else:
|
360 |
+
adaface_ckpt_paths = self.adaface_ckpt_paths
|
361 |
+
|
362 |
+
for adaface_ckpt_path in adaface_ckpt_paths:
|
363 |
+
ckpt_dict = torch.load(adaface_ckpt_path, map_location='cpu')
|
364 |
+
if 'unet_lora_modules' in ckpt_dict:
|
365 |
+
unet_lora_modules_state_dict = ckpt_dict['unet_lora_modules']
|
366 |
+
print(f"{len(unet_lora_modules_state_dict)} LoRA weights found in {adaface_ckpt_path}.")
|
367 |
+
unet_lora_weight_found = True
|
368 |
+
break
|
369 |
+
|
370 |
+
# Since unet lora weights are not found in the adaface ckpt, we give up on loading unet attn processors.
|
371 |
+
if not unet_lora_weight_found:
|
372 |
+
print(f"LoRA weights not found in {self.adaface_ckpt_paths}.")
|
373 |
+
return unet
|
374 |
|
375 |
+
self.outfeat_capture_blocks = []
|
376 |
+
|
377 |
+
if isinstance(unet, UNetEnsemble):
|
378 |
+
for i, unet_ in enumerate(unet.unets):
|
379 |
+
unet_ = self.load_unet_loras(unet_, unet_lora_modules_state_dict,
|
380 |
+
use_attn_lora=use_attn_lora,
|
381 |
+
attn_lora_layer_names=attn_lora_layer_names,
|
382 |
+
shrink_cross_attn=shrink_cross_attn,
|
383 |
+
q_lora_updates_query=q_lora_updates_query)
|
384 |
+
unet.unets[i] = unet_
|
385 |
+
print(f"Loaded LoRA processors on UNetEnsemble of {len(unet.unets)} UNets.")
|
386 |
+
else:
|
387 |
+
unet = self.load_unet_loras(unet, unet_lora_modules_state_dict,
|
388 |
+
use_attn_lora=use_attn_lora,
|
389 |
+
attn_lora_layer_names=attn_lora_layer_names,
|
390 |
+
shrink_cross_attn=shrink_cross_attn,
|
391 |
+
q_lora_updates_query=q_lora_updates_query)
|
392 |
+
|
393 |
+
return unet
|
394 |
+
|
395 |
def extend_tokenizer_and_text_encoder(self):
|
396 |
if np.sum(self.encoders_num_id_vecs) < 1:
|
397 |
raise ValueError(f"encoders_num_id_vecs has to be larger or equal to 1, but is {self.encoders_num_id_vecs}")
|
|
|
401 |
# We add z_0_0, z_0_1, z_0_2, ..., z_0_15, z_1_0, z_1_1, z_1_2, z_1_3 to the tokenizer.
|
402 |
self.all_placeholder_tokens = []
|
403 |
self.placeholder_tokens_strs = []
|
404 |
+
self.encoder_placeholder_tokens = []
|
405 |
for i in range(len(self.adaface_encoder_types)):
|
406 |
placeholder_tokens = []
|
407 |
for j in range(self.encoders_num_id_vecs[i]):
|
|
|
409 |
placeholder_tokens_str = " ".join(placeholder_tokens)
|
410 |
|
411 |
self.all_placeholder_tokens.extend(placeholder_tokens)
|
412 |
+
self.encoder_placeholder_tokens.append(placeholder_tokens)
|
413 |
self.placeholder_tokens_strs.append(placeholder_tokens_str)
|
414 |
|
415 |
self.all_placeholder_tokens_str = " ".join(self.placeholder_tokens_strs)
|
416 |
+
self.updated_tokens_str = self.all_placeholder_tokens_str
|
417 |
# all_null_placeholder_tokens_str: ", , , , ..." (20 times).
|
418 |
# It just contains the commas and spaces with the same length, but no actual tokens.
|
419 |
self.all_null_placeholder_tokens_str = " ".join([", "] * len(self.all_placeholder_tokens))
|
|
|
427 |
|
428 |
print(f"Added {num_added_tokens} tokens ({self.all_placeholder_tokens_str}) to the tokenizer.")
|
429 |
|
430 |
+
# placeholder_token_ids: [49408, ..., 49427].
|
431 |
self.placeholder_token_ids = tokenizer.convert_tokens_to_ids(self.all_placeholder_tokens)
|
432 |
#print("New tokens:", self.placeholder_token_ids)
|
433 |
# Resize the token embeddings as we are adding new special tokens to the tokenizer
|
|
|
438 |
|
439 |
# Extend pipeline.text_encoder with the adaface subject emeddings.
|
440 |
# subj_embs: [16, 768].
|
441 |
+
def update_text_encoder_subj_embeddings(self, subj_embs, lens_subj_emb_segments):
|
442 |
# Initialise the newly added placeholder token with the embeddings of the initializer token
|
443 |
# token_embeds: [49412, 768]
|
444 |
token_embeds = self.pipeline.text_encoder.get_input_embeddings().weight.data
|
445 |
+
all_encoders_updated_tokens = []
|
446 |
+
all_encoders_updated_token_strs = []
|
447 |
+
idx = 0
|
448 |
+
|
449 |
with torch.no_grad():
|
450 |
+
# sum of lens_subj_emb_segments are probably shorter than self.placeholder_token_ids,
|
451 |
+
# when some static_img_suffix_embs are disabled.
|
452 |
+
for i, encoder_type in enumerate(self.adaface_encoder_types):
|
453 |
+
encoder_updated_tokens = []
|
454 |
+
if (self.enabled_encoders is not None) and (encoder_type not in self.enabled_encoders):
|
455 |
+
idx += lens_subj_emb_segments[i]
|
456 |
+
continue
|
457 |
+
for j in range(lens_subj_emb_segments[i]):
|
458 |
+
placeholder_token = f"{self.subject_string}_{i}_{j}"
|
459 |
+
token_id = self.pipeline.tokenizer.convert_tokens_to_ids(placeholder_token)
|
460 |
+
token_embeds[token_id] = subj_embs[idx]
|
461 |
+
encoder_updated_tokens.append(placeholder_token)
|
462 |
+
idx += 1
|
463 |
+
|
464 |
+
all_encoders_updated_tokens.extend(encoder_updated_tokens)
|
465 |
+
all_encoders_updated_token_strs.append(" ".join(encoder_updated_tokens))
|
466 |
+
|
467 |
+
self.updated_tokens_str = " ".join(all_encoders_updated_token_strs)
|
468 |
+
self.all_encoders_updated_token_strs = all_encoders_updated_token_strs
|
469 |
+
print(f"Updated {len(all_encoders_updated_tokens)} tokens ({self.updated_tokens_str}) in the text encoder.")
|
470 |
|
471 |
def update_prompt(self, prompt, placeholder_tokens_pos='append',
|
472 |
+
repeat_prompt_for_each_encoder=True,
|
473 |
use_null_placeholders=False):
|
474 |
if prompt is None:
|
475 |
prompt = ""
|
476 |
|
477 |
if use_null_placeholders:
|
478 |
all_placeholder_tokens_str = self.all_null_placeholder_tokens_str
|
479 |
+
if not re.search(r"\b(man|woman|person|child|girl|boy)\b", prompt.lower()):
|
480 |
+
all_placeholder_tokens_str = "person " + all_placeholder_tokens_str
+            repeat_prompt_for_each_encoder = False
         else:
+            all_placeholder_tokens_str = self.updated_tokens_str

         # Delete the subject_string from the prompt.
         prompt = re.sub(r'\b(a|an|the)\s+' + self.subject_string + r'\b,?', "", prompt)
@@ ... @@
         # When we do joint training, seems both work better if they are appended to the prompt.
         # Therefore we simply appended all placeholder_tokens_str's to the prompt.
         # NOTE: Prepending them hurts compositional prompts.
+        if repeat_prompt_for_each_encoder:
+            encoder_prompts = []
+            for encoder_updated_token_strs in self.all_encoders_updated_token_strs:
+                if placeholder_tokens_pos == 'prepend':
+                    encoder_prompt = encoder_updated_token_strs + " " + prompt
+                elif placeholder_tokens_pos == 'append':
+                    encoder_prompt = prompt + " " + encoder_updated_token_strs
+                else:
+                    breakpoint()
+                encoder_prompts.append(encoder_prompt)
+            prompt = ", ".join(encoder_prompts)
         else:
+            if placeholder_tokens_pos == 'prepend':
+                prompt = all_placeholder_tokens_str + " " + prompt
+            elif placeholder_tokens_pos == 'append':
+                prompt = prompt + " " + all_placeholder_tokens_str
+            else:
+                breakpoint()

         return prompt

+    # NOTE: all_adaface_subj_embs is the input to the CLIP text encoder.
+    # ** DO NOT use it as prompt_embeds in the forward() method.
     # If face_id_embs is None, then it extracts face_id_embs from the images,
     # then map them to ada prompt embeddings.
     # avg_at_stage: 'id_emb', 'img_prompt_emb', or None.
@@ ... @@
                                    perturb_at_stage=None, # id_emb, img_prompt_emb, or None.
                                    perturb_std=0, update_text_encoder=True):

+        all_adaface_subj_embs, img_prompt_embs, lens_subj_emb_segments = \
             self.id2ada_prompt_encoder.generate_adaface_embeddings(\
                 image_paths, face_id_embs=face_id_embs,
                 img_prompt_embs=None,
                 avg_at_stage=avg_at_stage,
                 perturb_at_stage=perturb_at_stage,
                 perturb_std=perturb_std,
+                enable_static_img_suffix_embs=self.enable_static_img_suffix_embs)

         if all_adaface_subj_embs is None:
             return None

+        self.img_prompt_embs = img_prompt_embs
+
         if all_adaface_subj_embs.ndim == 4:
+            # [1, 1, 20, 768] -> [20, 768]
             all_adaface_subj_embs = all_adaface_subj_embs.squeeze(0).squeeze(0)
         elif all_adaface_subj_embs.ndim == 3:
+            # [1, 20, 768] -> [20, 768]
             all_adaface_subj_embs = all_adaface_subj_embs.squeeze(0)

         if update_text_encoder:
+            self.update_text_encoder_subj_embeddings(all_adaface_subj_embs, lens_subj_emb_segments)
         return all_adaface_subj_embs

     def diffusers_encode_prompts(self, prompt, plain_prompt, negative_prompt, device):
@@ ... @@
             else:
                 breakpoint()
         else:
+            # "text2img" and "img2img" pipelines.
             # prompt_embeds_, negative_prompt_embeds_: [1, 77, 768]
             prompt_embeds_, negative_prompt_embeds_ = \
                 self.pipeline.encode_prompt(prompt, device=device,
@@ ... @@
         return prompt_embeds_, negative_prompt_embeds_, \
                pooled_prompt_embeds_, negative_pooled_prompt_embeds_

+    # alt_prompt_embed_type: 'ada-nonmix', 'img'
+    def mix_ada_embs_with_other_embs(self, prompt, prompt_embeds,
+                                     alt_prompt_embed_type, alt_prompt_emb_weights):
+        # Scan prompt and replace tokens in self.placeholder_token_ids
+        # with the corresponding image embeddings.
+        prompt_tokens = self.pipeline.tokenizer.tokenize(prompt)
+        prompt_embeds2 = prompt_embeds.clone()
+        if alt_prompt_embed_type == 'img':
+            if self.img_prompt_embs is None:
+                print("Unable to find img_prompt_embs. Either prepare_adaface_embeddings() hasn't been called, or faceless images were used.")
+                return prompt_embeds
+            # self.img_prompt_embs: [1, 20, 768]
+            repl_embeddings = self.img_prompt_embs
+        elif alt_prompt_embed_type == 'ada-nonmix':
+            repl_embeddings_, _, _, _ = self.encode_prompt(prompt, ablate_prompt_only_placeholders=True,
+                                                           verbose=True)
+            # repl_embeddings_: [1, 77, 768] -> [1, 20, 768]
+            repl_embeddings = repl_embeddings_[:, 1:len(self.all_placeholder_tokens)+1]
+        else:
+            breakpoint()
+
+        repl_tokens = {}
+        for i in range(len(prompt_tokens)):
+            if prompt_tokens[i] in self.all_placeholder_tokens:
+                encoder_idx = next((i for i, sublist in enumerate(self.encoder_placeholder_tokens) \
+                                    if prompt_tokens[i] in sublist), 0)
+                alt_prompt_emb_weight = alt_prompt_emb_weights[encoder_idx]
+                prompt_embeds2[:, i] = prompt_embeds2[:, i] * (1 - alt_prompt_emb_weight) \
+                                       + repl_embeddings[:, self.all_placeholder_tokens.index(prompt_tokens[i])] * alt_prompt_emb_weight
+                repl_tokens[prompt_tokens[i]] = 1
+
+        repl_token_count = len(repl_tokens)
+        if np.all(np.array(alt_prompt_emb_weights) == 1):
+            print(f"Replaced {repl_token_count} tokens with {alt_prompt_embed_type} embeddings.")
+        else:
+            print(f"Mixed {repl_token_count} tokens with {alt_prompt_embed_type} embeddings, weight {alt_prompt_emb_weights}.")
+
+        return prompt_embeds2
+
+
     def encode_prompt(self, prompt, negative_prompt=None,
                       placeholder_tokens_pos='append',
+                      ablate_prompt_only_placeholders=False,
+                      ablate_prompt_no_placeholders=False,
+                      ablate_prompt_embed_type='ada', # 'ada', 'ada-nonmix', 'img'
+                      nonmix_prompt_emb_weight=0,
+                      repeat_prompt_for_each_encoder=True,
                       device=None, verbose=False):
         if negative_prompt is None:
             negative_prompt = self.negative_prompt
@@ ... @@
             device = self.device

         plain_prompt = prompt
+        if ablate_prompt_only_placeholders:
+            prompt = self.updated_tokens_str
+        else:
+            prompt = self.update_prompt(prompt, placeholder_tokens_pos=placeholder_tokens_pos,
+                                        repeat_prompt_for_each_encoder=repeat_prompt_for_each_encoder,
+                                        use_null_placeholders=ablate_prompt_no_placeholders)
+
         if verbose:
             print(f"Subject prompt:\n{prompt}")

         # For some unknown reason, the text_encoder is still on CPU after self.pipeline.to(self.device).
         # So we manually move it to GPU here.
         self.pipeline.text_encoder.to(device)

         prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, negative_pooled_prompt_embeds_ = \
             self.diffusers_encode_prompts(prompt, plain_prompt, negative_prompt, device)
+
+        if ablate_prompt_embed_type != 'ada':
+            alt_prompt_embed_type = ablate_prompt_embed_type
+            alt_prompt_emb_weights = (1, 1)
+        elif nonmix_prompt_emb_weight > 0:
+            alt_prompt_embed_type = 'ada-nonmix'
+            alt_prompt_emb_weights = (nonmix_prompt_emb_weight, nonmix_prompt_emb_weight)
+        else:
+            alt_prompt_emb_weights = (0, 0)
+
+        if sum(alt_prompt_emb_weights) > 0:
+            prompt_embeds_ = self.mix_ada_embs_with_other_embs(prompt, prompt_embeds_,
+                                                               alt_prompt_embed_type, alt_prompt_emb_weights)
+
         return prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, negative_pooled_prompt_embeds_

     # ref_img_strength is used only in the img2img pipeline.
+    def forward(self, noise, prompt, prompt_embeds=None, negative_prompt=None,
                 placeholder_tokens_pos='append',
@@ ... @@
                 guidance_scale=6.0, out_image_count=4,
+                ref_img_strength=0.8, generator=None,
+                ablate_prompt_only_placeholders=False,
+                ablate_prompt_no_placeholders=False,
+                ablate_prompt_embed_type='ada', # 'ada', 'ada-nonmix', 'img'
+                nonmix_prompt_emb_weight=0,
+                repeat_prompt_for_each_encoder=True,
+                verbose=False):
         noise = noise.to(device=self.device, dtype=torch.float16)
+        if self.use_lcm:
+            guidance_scale = 0

         if negative_prompt is None:
             negative_prompt = self.negative_prompt
         # prompt_embeds_, negative_prompt_embeds_: [1, 77, 768]
+        if prompt_embeds is None:
+            prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, \
+                negative_pooled_prompt_embeds_ = \
+                    self.encode_prompt(prompt, negative_prompt,
+                                       placeholder_tokens_pos=placeholder_tokens_pos,
+                                       ablate_prompt_only_placeholders=ablate_prompt_only_placeholders,
+                                       ablate_prompt_no_placeholders=ablate_prompt_no_placeholders,
+                                       ablate_prompt_embed_type=ablate_prompt_embed_type,
+                                       nonmix_prompt_emb_weight=nonmix_prompt_emb_weight,
+                                       repeat_prompt_for_each_encoder=repeat_prompt_for_each_encoder,
+                                       device=self.device,
+                                       verbose=verbose)
+        else:
+            if len(prompt_embeds) == 2:
+                prompt_embeds_, negative_prompt_embeds_ = prompt_embeds
+                pooled_prompt_embeds_, negative_pooled_prompt_embeds_ = None, None
+            elif len(prompt_embeds) == 4:
+                prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, \
+                    negative_pooled_prompt_embeds_ = prompt_embeds
+            else:
+                breakpoint()
+
         # Repeat the prompt embeddings for all images in the batch.
         prompt_embeds_ = prompt_embeds_.repeat(out_image_count, 1, 1)
+
         if negative_prompt_embeds_ is not None:
             negative_prompt_embeds_ = negative_prompt_embeds_.repeat(out_image_count, 1, 1)

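Editor's note: the per-token mixing that `mix_ada_embs_with_other_embs` performs in the hunk above is a simple convex blend between the ada embedding at a placeholder position and a replacement embedding. A minimal standalone sketch of that step (hypothetical helper and shapes, not the committed API):

```python
import torch

def blend_placeholder_embs(prompt_embeds, repl_embeddings, placeholder_positions, weight):
    # prompt_embeds: [1, 77, 768] text-encoder embeddings of the full prompt.
    # repl_embeddings: [1, K, 768] alternative embeddings for the K placeholder tokens.
    # placeholder_positions: K indices into the 77-token sequence.
    blended = prompt_embeds.clone()
    for k, pos in enumerate(placeholder_positions):
        # weight=0 keeps the ada embedding, weight=1 swaps in the alternative embedding.
        blended[:, pos] = (1 - weight) * prompt_embeds[:, pos] + weight * repl_embeddings[:, k]
    return blended
```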
adaface/diffusers_attn_lora_capture.py
ADDED
@@ -0,0 +1,656 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Optional, Tuple, Dict, Any
+from diffusers.models.attention_processor import Attention, AttnProcessor2_0
+from diffusers.utils import logging, is_torch_version, deprecate
+from diffusers.utils.torch_utils import fourier_filter
+# UNet is a diffusers PeftAdapterMixin instance.
+from diffusers.loaders.peft import PeftAdapterMixin
+from peft import LoraConfig, get_peft_model
+import peft.tuners.lora as peft_lora
+from peft.tuners.lora.dora import DoraLinearLayer
+from einops import rearrange
+import math, re
+import numpy as np
+from peft.tuners.tuners_utils import BaseTunerLayer
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+def dummy_func(*args, **kwargs):
+    pass
+
+# Revised from RevGrad, by removing the grad negation.
+class ScaleGrad(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, input_, alpha_, debug=False):
+        ctx.save_for_backward(alpha_, debug)
+        output = input_
+        if debug:
+            print(f"input: {input_.abs().mean().item()}")
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):  # pragma: no cover
+        # saved_tensors returns a tuple of tensors.
+        alpha_, debug = ctx.saved_tensors
+        if ctx.needs_input_grad[0]:
+            grad_output2 = grad_output * alpha_
+            if debug:
+                print(f"grad_output2: {grad_output2.abs().mean().item()}")
+        else:
+            grad_output2 = None
+        return grad_output2, None, None
+
+class GradientScaler(nn.Module):
+    def __init__(self, alpha=1., debug=False, *args, **kwargs):
+        """
+        A gradient scaling layer.
+        This layer has no parameters, and simply scales the gradient in the backward pass.
+        """
+        super().__init__(*args, **kwargs)
+
+        self._alpha = torch.tensor(alpha, requires_grad=False)
+        self._debug = torch.tensor(debug, requires_grad=False)
+
+    def forward(self, input_):
+        _debug = self._debug if hasattr(self, '_debug') else False
+        return ScaleGrad.apply(input_, self._alpha.to(input_.device), _debug)
+
+def gen_gradient_scaler(alpha, debug=False):
+    if alpha == 1:
+        return nn.Identity()
+    if alpha > 0:
+        return GradientScaler(alpha, debug=debug)
+    else:
+        assert alpha == 0
+        # Don't use lambda function here, otherwise the object can't be pickled.
+        return torch.detach
+
+def split_indices_by_instance(indices, as_dict=False):
+    indices_B, indices_N = indices
+    unique_indices_B = torch.unique(indices_B)
+    if not as_dict:
+        indices_by_instance = [ (indices_B[indices_B == uib], indices_N[indices_B == uib]) for uib in unique_indices_B ]
+    else:
+        indices_by_instance = { uib.item(): indices_N[indices_B == uib] for uib in unique_indices_B }
+    return indices_by_instance
+
+# If do_sum, returned emb_attns is 3D. Otherwise 4D.
+# indices are applied on the first 2 dims of attn_mat.
+def sel_emb_attns_by_indices(attn_mat, indices, all_token_weights=None, do_sum=True, do_mean=False):
+    indices_by_instance = split_indices_by_instance(indices)
+
+    # emb_attns[0]: [1, 9, 8, 64]
+    # 8: 8 attention heads. Last dim 64: number of image tokens.
+    emb_attns = [ attn_mat[inst_indices].unsqueeze(0) for inst_indices in indices_by_instance ]
+    if all_token_weights is not None:
+        # all_token_weights: [4, 77].
+        # token_weights_by_instance[0]: [1, 9, 1, 1].
+        token_weights = [ all_token_weights[inst_indices].reshape(1, -1, 1, 1) for inst_indices in indices_by_instance ]
+    else:
+        token_weights = [ 1 ] * len(indices_by_instance)
+
+    # Apply token weights.
+    emb_attns = [ emb_attns[i] * token_weights[i] for i in range(len(indices_by_instance)) ]
+
+    # sum among K_subj_i subj embeddings -> [1, 8, 64]
+    if do_sum:
+        emb_attns = [ emb_attns[i].sum(dim=1) for i in range(len(indices_by_instance)) ]
+    elif do_mean:
+        emb_attns = [ emb_attns[i].mean(dim=1) for i in range(len(indices_by_instance)) ]
+
+    emb_attns = torch.cat(emb_attns, dim=0)
+    return emb_attns
+
+# Slow implementation equivalent to F.scaled_dot_product_attention.
+def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0,
+                                 shrink_cross_attn=False, cross_attn_shrink_factor=0.5,
+                                 is_causal=False, scale=None, enable_gqa=False) -> torch.Tensor:
+    B, L, S = query.size(0), query.size(-2), key.size(-2)
+    scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
+    # 1: head (to be broadcasted). L: query length. S: key length.
+    attn_bias = torch.zeros(B, 1, L, S, device=query.device, dtype=query.dtype)
+    if is_causal:
+        assert attn_mask is None
+        temp_mask = torch.ones(B, 1, L, S, device=query.device, dtype=torch.bool).tril(diagonal=0)
+        attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
+        attn_bias.to(query.dtype)
+
+    if attn_mask is not None:
+        if attn_mask.dtype == torch.bool:
+            attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
+        else:
+            attn_bias += attn_mask
+
+    if enable_gqa:
+        key = key.repeat_interleave(query.size(-3)//key.size(-3), -3)
+        value = value.repeat_interleave(query.size(-3)//value.size(-3), -3)
+
+    attn_weight = query @ key.transpose(-2, -1) * scale_factor
+
+    if shrink_cross_attn:
+        cross_attn_scale = cross_attn_shrink_factor
+    else:
+        cross_attn_scale = 1
+
+    # attn_bias: [1, 1, 4096, 77], the same size as a single-head attn_weight.
+    attn_weight += attn_bias
+    attn_score = attn_weight
+    attn_weight = torch.softmax(attn_weight, dim=-1)
+    # NOTE: After scaling, the "probabilities" of the subject embeddings will sum to < 1.
+    # But this is intended, as we want to scale down the impact of the subject embeddings
+    # in the computed attention output tensors.
+    attn_weight = attn_weight * cross_attn_scale
+    attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
+    output = attn_weight @ value
+    return output, attn_score, attn_weight
+
+# All layers share the same attention processor instance.
+class AttnProcessor_LoRA_Capture(nn.Module):
+    r"""
+    Revised from AttnProcessor2_0
+    """
+    # lora_proj_layers is a dict of lora_layer_name -> lora_proj_layer.
+    def __init__(self, capture_ca_activations: bool = False, enable_lora: bool = False,
+                 lora_uses_dora=True, lora_proj_layers=None,
+                 lora_rank: int = 192, lora_alpha: float = 16,
+                 cross_attn_shrink_factor: float = 0.5,
+                 q_lora_updates_query=False, attn_proc_idx=-1):
+        super().__init__()
+
+        self.global_enable_lora = enable_lora
+        self.attn_proc_idx = attn_proc_idx
+        # reset_attn_cache_and_flags() sets the local (call-specific) self.enable_lora flag.
+        # By default, shrink_cross_attn is False. Later in layers 22, 23, 24 it will be set to True.
+        self.reset_attn_cache_and_flags(capture_ca_activations, False, enable_lora)
+        self.lora_rank = lora_rank
+        self.lora_alpha = lora_alpha
+        self.lora_scale = self.lora_alpha / self.lora_rank
+        self.cross_attn_shrink_factor = cross_attn_shrink_factor
+        self.q_lora_updates_query = q_lora_updates_query
+
+        self.to_q_lora = self.to_k_lora = self.to_v_lora = self.to_out_lora = None
+        if self.global_enable_lora:
+            for lora_layer_name, lora_proj_layer in lora_proj_layers.items():
+                if lora_layer_name == 'q':
+                    self.to_q_lora = peft_lora.Linear(lora_proj_layer, 'default', r=lora_rank, lora_alpha=lora_alpha,
+                                                      use_dora=lora_uses_dora, lora_dropout=0.1)
+                elif lora_layer_name == 'k':
+                    self.to_k_lora = peft_lora.Linear(lora_proj_layer, 'default', r=lora_rank, lora_alpha=lora_alpha,
+                                                      use_dora=lora_uses_dora, lora_dropout=0.1)
+                elif lora_layer_name == 'v':
+                    self.to_v_lora = peft_lora.Linear(lora_proj_layer, 'default', r=lora_rank, lora_alpha=lora_alpha,
+                                                      use_dora=lora_uses_dora, lora_dropout=0.1)
+                elif lora_layer_name == 'out':
+                    self.to_out_lora = peft_lora.Linear(lora_proj_layer, 'default', r=lora_rank, lora_alpha=lora_alpha,
+                                                        use_dora=lora_uses_dora, lora_dropout=0.1)
+
+    # LoRA layers can be enabled/disabled dynamically.
+    def reset_attn_cache_and_flags(self, capture_ca_activations, shrink_cross_attn, enable_lora):
+        self.capture_ca_activations = capture_ca_activations
+        self.shrink_cross_attn = shrink_cross_attn
+        self.cached_activations = {}
+        # Only enable LoRA for the next call(s) if global_enable_lora is set to True.
+        self.enable_lora = enable_lora and self.global_enable_lora
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+        img_mask: Optional[torch.Tensor] = None,
+        subj_indices: Optional[Tuple[torch.IntTensor, torch.IntTensor]] = None,
+        debug: bool = False,
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+
+        # hidden_states: [1, 4096, 320]
+        residual = hidden_states
+        # attn.spatial_norm is None.
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            # Collapse the spatial dimensions to a single token dimension.
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+        # NOTE: there's a inconsistency between q lora and k, v loras.
+        # k, v loras are directly applied to key and value (currently k, v loras are never enabled),
+        # while q lora is applied to query2, and we keep the query unchanged.
+        if self.enable_lora and self.to_q_lora is not None:
+            # query2 will be used in ldm/util.py:calc_elastic_matching_loss() to get more accurate
+            # cross attention scores between the latent images of the sc and mc instances.
+            query2 = self.to_q_lora(hidden_states)
+            # If not q_lora_updates_query, only query2 will be impacted by the LoRA layer.
+            # The query, and thus the attention score and attn_out, will be the same
+            # as the original ones.
+            if self.q_lora_updates_query:
+                query = query2
+        else:
+            query2 = query
+
+        scale = 1 / math.sqrt(query.size(-1))
+
+        is_cross_attn = (encoder_hidden_states is not None)
+        if (not is_cross_attn) and (img_mask is not None):
+            # NOTE: we assume the image is square. But this will fail if the image is not square.
+            # hidden_states: [BS, 4096, 320]. img_mask: [BS, 1, 64, 64]
+            # Scale the mask to the same size as hidden_states.
+            mask_size = int(math.sqrt(hidden_states.shape[-2]))
+            img_mask = F.interpolate(img_mask, size=(mask_size, mask_size), mode='nearest')
+            if (img_mask.sum(dim=(2, 3)) == 0).any():
+                img_mask = None
+            else:
+                # img_mask: [2, 1, 64, 64] -> [2, 4096]
+                img_mask = rearrange(img_mask, 'b ... -> b (...)').contiguous()
+                # max_neg_value = -torch.finfo(hidden_states.dtype).max
+                # img_mask: [2, 4096] -> [2, 1, 1, 4096]
+                img_mask = rearrange(img_mask.bool(), 'b j -> b () () j')
+                # attn_score: [16, 4096, 4096]. img_mask will be broadcasted to [16, 4096, 4096].
+                # So some rows in dim 1 (e.g. [0, :, 4095]) of attn_score will be masked out (all elements in [0, :, 4095] is -inf).
+                # But not all elements in [0, 4095, :] is -inf. Since the softmax is done along dim 2, this is fine.
+                # attn_score.masked_fill_(~img_mask, max_neg_value)
+                # NOTE: If there's an attention mask, it will be replaced by img_mask.
+                attention_mask = img_mask
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        if self.enable_lora and self.to_k_lora is not None:
+            key = self.to_k_lora(encoder_hidden_states)
+        else:
+            key = attn.to_k(encoder_hidden_states)
+
+        if self.enable_lora and self.to_v_lora is not None:
+            value = self.to_v_lora(encoder_hidden_states)
+        else:
+            value = attn.to_v(encoder_hidden_states)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+            query2 = attn.norm_q(query2)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        query2 = query2.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        if debug and self.attn_proc_idx >= 0:
+            breakpoint()
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        if is_cross_attn and (self.capture_ca_activations or self.shrink_cross_attn):
+            hidden_states, attn_score, attn_prob = \
+                scaled_dot_product_attention(query, key, value, attn_mask=attention_mask,
+                                             dropout_p=0.0, shrink_cross_attn=self.shrink_cross_attn,
+                                             cross_attn_shrink_factor=self.cross_attn_shrink_factor)
+        else:
+            # Use the faster implementation of scaled_dot_product_attention
+            # when not capturing the activations or suppressing the subject attention.
+            hidden_states = \
+                F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False)
+            attn_prob = attn_score = None
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # linear proj
+        if self.enable_lora and self.to_out_lora is not None:
+            hidden_states = self.to_out_lora(hidden_states)
+        else:
+            hidden_states = attn.to_out[0](hidden_states)
+
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        if is_cross_attn and self.capture_ca_activations:
+            # cached q will be used in ddpm.py:calc_comp_fg_bg_preserve_loss(), in which two qs will multiply each other.
+            # So sqrt(scale) will scale the product of two qs by scale.
+            # ANCHOR[id=attention_caching]
+            # query: [2, 8, 4096, 40] -> [2, 320, 4096]
+            self.cached_activations['q'] = \
+                rearrange(query, 'b h n d -> b (h d) n').contiguous() * math.sqrt(scale)
+            self.cached_activations['q2'] = \
+                rearrange(query2, 'b h n d -> b (h d) n').contiguous() * math.sqrt(scale)
+            self.cached_activations['k'] = \
+                rearrange(key, 'b h n d -> b (h d) n').contiguous() * math.sqrt(scale)
+            self.cached_activations['v'] = \
+                rearrange(value, 'b h n d -> b (h d) n').contiguous() * math.sqrt(scale)
+            # attn_prob, attn_score: [2, 8, 4096, 77]
+            self.cached_activations['attn'] = attn_prob
+            self.cached_activations['attnscore'] = attn_score
+            # attn_out: [b, n, h * d] -> [b, h * d, n]
+            # [2, 4096, 320] -> [2, 320, 4096].
+            self.cached_activations['attn_out'] = hidden_states.permute(0, 2, 1).contiguous()
+
+        return hidden_states
+
+def CrossAttnUpBlock2D_forward_capture(
+    self,
+    hidden_states: torch.Tensor,
+    res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+    temb: Optional[torch.Tensor] = None,
+    encoder_hidden_states: Optional[torch.Tensor] = None,
+    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+    upsample_size: Optional[int] = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    encoder_attention_mask: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    if cross_attention_kwargs is not None:
+        if cross_attention_kwargs.get("scale", None) is not None:
+            logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+
+    self.cached_outfeats = {}
+    res_hidden_states_gradscale = getattr(self, "res_hidden_states_gradscale", 1)
+    capture_outfeats = getattr(self, "capture_outfeats", False)
+    layer_idx = 0
+    res_grad_scaler = gen_gradient_scaler(res_hidden_states_gradscale)
+
+    for resnet, attn in zip(self.resnets, self.attentions):
+        # pop res hidden states
+        res_hidden_states = res_hidden_states_tuple[-1]
+        res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+
+        # Scale down the magnitudes of gradients to res_hidden_states
+        # by res_hidden_states_gradscale=0.2, to match the scale of the cross-attn layer outputs.
+        res_hidden_states = res_grad_scaler(res_hidden_states)
+
+        hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+        if self.training and self.gradient_checkpointing:
+            def create_custom_forward(module, return_dict=None):
+                def custom_forward(*inputs):
+                    if return_dict is not None:
+                        return module(*inputs, return_dict=return_dict)
+                    else:
+                        return module(*inputs)
+
+                return custom_forward
+
+            ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+            hidden_states = torch.utils.checkpoint.checkpoint(
+                create_custom_forward(resnet),
+                hidden_states,
+                temb,
+                **ckpt_kwargs,
+            )
+            hidden_states = attn(
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                cross_attention_kwargs=cross_attention_kwargs,
+                attention_mask=attention_mask,
+                encoder_attention_mask=encoder_attention_mask,
+                return_dict=False,
+            )[0]
+        else:
+            # resnet: ResnetBlock2D instance.
+            #LINK diffusers.models.resnet.ResnetBlock2D
+            # up_blocks.3.resnets.2.conv_shortcut is a module within ResnetBlock2D,
+            # it's not transforming the UNet shortcut features.
+            hidden_states = resnet(hidden_states, temb)
+            hidden_states = attn(
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                cross_attention_kwargs=cross_attention_kwargs,
+                attention_mask=attention_mask,
+                encoder_attention_mask=encoder_attention_mask,
+                return_dict=False,
+            )[0]
+
+        if capture_outfeats:
+            self.cached_outfeats[layer_idx] = hidden_states
+            layer_idx += 1
+
+    if self.upsamplers is not None:
+        for upsampler in self.upsamplers:
+            hidden_states = upsampler(hidden_states, upsample_size)
+
+    return hidden_states
+
+
+# Adapted from ConsistentIDPipeline:set_ip_adapter().
+# attn_lora_layer_names: candidates are subsets of ['q', 'k', 'v', 'out'].
+def set_up_attn_processors(unet, use_attn_lora, attn_lora_layer_names=['q', 'k', 'v', 'out'],
+                           lora_rank=192, lora_scale_down=8, cross_attn_shrink_factor=0.5,
+                           q_lora_updates_query=False):
+    attn_procs = {}
+    attn_capture_procs = {}
+    unet_modules = dict(unet.named_modules())
+    attn_opt_modules = {}
+
+    attn_proc_idx = 0
+
+    for name, attn_proc in unet.attn_processors.items():
+        # Only capture the activations of the last 3 CA layers.
+        if not name.startswith("up_blocks.3"):
+            # Not the last 3 CA layers. Don't enable LoRA or capture activations.
+            # Then the layer falls back to the original attention mechanism.
+            # We still use AttnProcessor_LoRA_Capture, as it can handle img_mask.
+            attn_procs[name] = AttnProcessor_LoRA_Capture(
+                capture_ca_activations=False, enable_lora=False, attn_proc_idx=-1)
+            continue
+        # cross_attention_dim: 768.
+        cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+        if cross_attention_dim is None:
+            # Self attention. Don't enable LoRA or capture activations.
+            # We replace the default attn_proc with AttnProcessor_LoRA_Capture,
+            # so that it can incorporate img_mask into self-attention.
+            attn_procs[name] = AttnProcessor_LoRA_Capture(
+                capture_ca_activations=False, enable_lora=False, attn_proc_idx=-1)
+            continue
+
+        # block_id = 3
+        # hidden_size: 320
+        # hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+        # 'up_blocks.3.attentions.1.transformer_blocks.0.attn2.processor' ->
+        # 'up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_q'
+        lora_layer_dict = {}
+        lora_layer_dict['q'] = unet_modules[name[:-9] + "to_q"]
+        lora_layer_dict['k'] = unet_modules[name[:-9] + "to_k"]
+        lora_layer_dict['v'] = unet_modules[name[:-9] + "to_v"]
+        # to_out is a ModuleList(Linear, Dropout).
+        lora_layer_dict['out'] = unet_modules[name[:-9] + "to_out"][0]
+
+        lora_proj_layers = {}
+        # Only apply LoRA to the specified layers.
+        for lora_layer_name in attn_lora_layer_names:
+            lora_proj_layers[lora_layer_name] = lora_layer_dict[lora_layer_name]
+
+        attn_capture_proc = AttnProcessor_LoRA_Capture(
+            capture_ca_activations=True, enable_lora=use_attn_lora,
+            lora_uses_dora=True, lora_proj_layers=lora_proj_layers,
+            # LoRA up is initialized to 0. So no need to worry that the LoRA output may be too large.
+            lora_rank=lora_rank, lora_alpha=lora_rank // lora_scale_down,
+            cross_attn_shrink_factor=cross_attn_shrink_factor,
+            q_lora_updates_query=q_lora_updates_query, attn_proc_idx=attn_proc_idx)
+
+        attn_proc_idx += 1
+        # attn_procs has to use the original names.
+        attn_procs[name] = attn_capture_proc
+        # ModuleDict doesn't allow "." in the key.
+        name = name.replace(".", "_")
+        attn_capture_procs[name] = attn_capture_proc
+
+        if use_attn_lora:
+            for subname, module in attn_capture_proc.named_modules():
+                if isinstance(module, peft_lora.LoraLayer):
+                    # ModuleDict doesn't allow "." in the key.
+                    lora_path = name + "_" + subname.replace(".", "_")
+                    attn_opt_modules[lora_path + "_lora_A"] = module.lora_A
+                    attn_opt_modules[lora_path + "_lora_B"] = module.lora_B
+                    # lora_uses_dora is always True, so we don't check it here.
+                    attn_opt_modules[lora_path + "_lora_magnitude_vector"] = module.lora_magnitude_vector
+                    # We will manage attn adapters directly. By default, LoraLayer is an instance of BaseTunerLayer,
+                    # so according to the code logic in diffusers/loaders/peft.py,
+                    # they will be managed by the diffusers PeftAdapterMixin instance, through the
+                    # enable_adapters(), and set_adapter() methods.
+                    # Therefore, we disable these calls on module.
+                    # disable_adapters() is a property and changing it will cause exceptions.
+                    module.enable_adapters = dummy_func
+                    module.set_adapter = dummy_func
+
+    unet.set_attn_processor(attn_procs)
+
+    print(f"Set up {len(attn_capture_procs)} CrossAttn processors on {attn_capture_procs.keys()}.")
+    print(f"Set up {len(attn_opt_modules)} attn LoRA params: {attn_opt_modules.keys()}.")
+    return attn_capture_procs, attn_opt_modules
+
+# NOTE: cross-attn layers are included in the returned lora_modules.
+def set_up_ffn_loras(unet, target_modules_pat, lora_uses_dora=False, lora_rank=192, lora_alpha=16):
+    # target_modules_pat = 'up_blocks.3.resnets.[12].conv[a-z0-9_]+'
+    # up_blocks.3.resnets.[1~2].conv1, conv2, conv_shortcut
+    # Cannot set to conv.+ as it will match added adapter module names, including
+    # up_blocks.3.resnets.1.conv1.base_layer, up_blocks.3.resnets.1.conv1.lora_dropout
+    if target_modules_pat is not None:
+        peft_config = LoraConfig(use_dora=lora_uses_dora, inference_mode=False, r=lora_rank,
+                                 lora_alpha=lora_alpha, lora_dropout=0.1,
+                                 target_modules=target_modules_pat)
+
+        # UNet is a diffusers PeftAdapterMixin instance. Using get_peft_model on it will
+        # cause weird errors. Instead, we directly use diffusers peft adapter methods.
+        unet.add_adapter(peft_config, "recon_loss")
+        unet.add_adapter(peft_config, "unet_distill")
+        unet.add_adapter(peft_config, "comp_distill")
+        unet.enable_adapters()
+
+    # lora_layers contain both the LoRA A and B matrices, as well as the original layers.
+    # lora_layers are used to set the flag, not used for optimization.
+    # lora_modules contain only the LoRA A and B matrices, so they are used for optimization.
+    # NOTE: lora_modules contain both ffn and cross-attn lora modules.
+    ffn_lora_layers = {}
+    ffn_opt_modules = {}
+    for name, module in unet.named_modules():
+        if isinstance(module, peft_lora.LoraLayer):
+            # We don't want to include cross-attn layers in ffn_lora_layers.
+            if target_modules_pat is not None and re.search(target_modules_pat, name):
+                ffn_lora_layers[name] = module
+                # ModuleDict doesn't allow "." in the key.
+                name = name.replace(".", "_")
+                # Since ModuleDict doesn't allow "." in the key, we manually collect
+                # the LoRA matrices in each module.
+                # NOTE: We cannot put every sub-module of module into lora_modules,
+                # as base_layer is also a sub-module of module, which we shouldn't optimize.
+                # Each value in ffn_opt_modules is a ModuleDict:
+                '''
+                (Pdb) ffn_opt_modules['up_blocks_3_resnets_1_conv1_lora_A']
+                ModuleDict(
+                  (unet_distill): Conv2d(640, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
+                  (recon_loss): Conv2d(640, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
+                )
+                '''
+                ffn_opt_modules[name + "_lora_A"] = module.lora_A
+                ffn_opt_modules[name + "_lora_B"] = module.lora_B
+                if lora_uses_dora:
+                    ffn_opt_modules[name + "_lora_magnitude_vector"] = module.lora_magnitude_vector
+
+    print(f"Set up {len(ffn_lora_layers)} FFN LoRA layers: {ffn_lora_layers.keys()}.")
+    print(f"Set up {len(ffn_opt_modules)} FFN LoRA params: {ffn_opt_modules.keys()}.")
+
+    return ffn_lora_layers, ffn_opt_modules
+
+def set_lora_and_capture_flags(unet, unet_lora_modules, attn_capture_procs,
+                               outfeat_capture_blocks, res_hidden_states_gradscale_blocks,
+                               use_attn_lora, use_ffn_lora, ffn_lora_adapter_name, capture_ca_activations,
+                               shrink_cross_attn, res_hidden_states_gradscale):
+    # For attn capture procs, capture_ca_activations and use_attn_lora are set in reset_attn_cache_and_flags().
+    for attn_capture_proc in attn_capture_procs:
+        attn_capture_proc.reset_attn_cache_and_flags(capture_ca_activations, shrink_cross_attn, enable_lora=use_attn_lora)
+    # outfeat_capture_blocks only contains the last up block, up_blocks[3].
+    # It contains 3 FFN layers. We want to capture their output features.
+    for block in outfeat_capture_blocks:
+        block.capture_outfeats = capture_ca_activations
+
+    for block in res_hidden_states_gradscale_blocks:
+        block.res_hidden_states_gradscale = res_hidden_states_gradscale
+
+    if not use_ffn_lora:
+        unet.disable_adapters()
+    else:
+        # ffn_lora_adapter_name: 'recon_loss', 'unet_distill', 'comp_distill'.
+        if ffn_lora_adapter_name is not None:
+            unet.set_adapter(ffn_lora_adapter_name)
+            # NOTE: Don't forget to enable_adapters().
+            # The adapters are not enabled by default after set_adapter().
+            unet.enable_adapters()
+        else:
+            breakpoint()
+
+    # During training, disable_adapters() and set_adapter() will set all/inactive adapters with requires_grad=False,
+    # which might cause issues during DDP training.
+    # So we restore them to requires_grad=True.
+    # During test, unet_lora_modules will be passed as None, so this block will be skipped.
+    if unet_lora_modules is not None:
+        for param in unet_lora_modules.parameters():
+            param.requires_grad = True
+
+def get_captured_activations(capture_ca_activations, attn_capture_procs, outfeat_capture_blocks,
+                             captured_layer_indices=[22, 23, 24], out_dtype=torch.float32):
+    captured_activations = { k: {} for k in ('outfeat', 'attn', 'attnscore',
+                                             'q', 'q2', 'k', 'v', 'attn_out') }
+
+    if not capture_ca_activations:
+        return captured_activations
+
+    all_cached_outfeats = []
+    for block in outfeat_capture_blocks:
+        all_cached_outfeats.append(block.cached_outfeats)
+        # Clear the capture flag and cached outfeats.
+        block.cached_outfeats = {}
+        block.capture_outfeats = False
+
+    for layer_idx in captured_layer_indices:
+        # Subtract 22 to ca_layer_idx to match the layer index in up_blocks[3].cached_outfeats.
+        # 23, 24 -> 1, 2 (!! not 0, 1 !!)
+        internal_idx = layer_idx - 22
+        for k in captured_activations.keys():
+            if k == 'outfeat':
+                # Currently we only capture one block, up_blocks.3. So we hard-code the index 0.
+                captured_activations['outfeat'][layer_idx] = all_cached_outfeats[0][internal_idx].to(out_dtype)
+            else:
+                # internal_idx is the index of layers in up_blocks.3.
+                # Layers 22, 23 and 24 map to 0, 1 and 2.
+                cached_activations = attn_capture_procs[internal_idx].cached_activations
+                captured_activations[k][layer_idx] = cached_activations[k].to(out_dtype)
+
+    return captured_activations
adaface/face_id_to_ada_prompt.py
CHANGED
@@ -53,6 +53,8 @@ class FaceID2AdaPrompt(nn.Module):
|
|
53 |
self.text_to_image_prompt_encoder = None
|
54 |
self.tokenizer = None
|
55 |
self.dtype = kwargs.get('dtype', torch.float16)
|
|
|
|
|
56 |
|
57 |
# Load Img2Ada SubjectBasisGenerator.
|
58 |
self.subject_string = kwargs.get('subject_string', 'z')
|
@@ -73,12 +75,16 @@ class FaceID2AdaPrompt(nn.Module):
|
|
73 |
|
74 |
self.use_clip_embs = False
|
75 |
self.do_contrast_clip_embs_on_bg_features = False
|
|
|
|
|
|
|
|
|
76 |
# num_id_vecs is the output embeddings of the ID2ImgPrompt module.
|
77 |
# If there's no static image suffix embeddings, then num_id_vecs is also
|
78 |
# the number of ada embeddings returned by the subject basis generator.
|
79 |
# num_id_vecs will be set in each derived class.
|
80 |
self.num_static_img_suffix_embs = kwargs.get('num_static_img_suffix_embs', 0)
|
81 |
-
print(f'{self.name} Adaface uses {self.num_id_vecs} ID image embeddings
|
82 |
|
83 |
self.id_img_prompt_max_length = 77
|
84 |
self.face_id_dim = 512
|
@@ -87,36 +93,35 @@ class FaceID2AdaPrompt(nn.Module):
|
|
87 |
self.clip_embedding_dim = 1024
|
88 |
self.output_dim = 768
|
89 |
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
def load_id2img_learnable_modules(self, id2img_learnable_modules_state_dict_list):
|
94 |
-
id2img_prompt_encoder_learnable_modules = self.get_id2img_learnable_modules()
|
95 |
-
for module, state_dict in zip(id2img_prompt_encoder_learnable_modules, id2img_learnable_modules_state_dict_list):
|
96 |
-
module.load_state_dict(state_dict)
|
97 |
-
print(f'{len(id2img_prompt_encoder_learnable_modules)} ID2ImgPrompt encoder modules loaded.')
|
98 |
-
|
99 |
-
# init_subj_basis_generator() can only be called after the derived class is initialized,
|
100 |
-
# when self.num_id_vecs, self.num_static_img_suffix_embs and self.clip_embedding_dim have been set.
|
101 |
-
def init_subj_basis_generator(self):
|
102 |
self.subj_basis_generator = \
|
103 |
-
SubjBasisGenerator(
|
|
|
104 |
num_static_img_suffix_embs = self.num_static_img_suffix_embs,
|
105 |
bg_image_embedding_dim = self.clip_embedding_dim,
|
106 |
output_dim = self.output_dim,
|
107 |
placeholder_is_bg = False,
|
108 |
-
prompt2token_proj_grad_scale = 1,
|
109 |
bg_prompt_translator_has_to_out_proj=False)
|
110 |
|
111 |
def load_adaface_ckpt(self, adaface_ckpt_path):
|
112 |
-
|
|
|
|
|
|
|
113 |
string_to_subj_basis_generator_dict = ckpt["string_to_subj_basis_generator_dict"]
|
114 |
if self.subject_string not in string_to_subj_basis_generator_dict:
|
115 |
print(f"Subject '{self.subject_string}' not found in the embedding manager.")
|
116 |
breakpoint()
|
117 |
|
118 |
ckpt_subj_basis_generator = string_to_subj_basis_generator_dict[self.subject_string]
|
119 |
-
ckpt_subj_basis_generator
|
|
|
|
|
|
|
|
|
|
|
120 |
# Since we directly use the subject basis generator object from the ckpt,
|
121 |
# fixing the number of static image suffix embeddings is much simpler.
|
122 |
# Otherwise if we want to load the subject basis generator from its state_dict,
|
@@ -129,7 +134,7 @@ class FaceID2AdaPrompt(nn.Module):
|
|
129 |
ckpt_subj_basis_generator.initialize_static_img_suffix_embs(self.num_static_img_suffix_embs, img_prompt_dim=self.output_dim)
|
130 |
# Fix missing variables in old ckpt.
|
131 |
ckpt_subj_basis_generator.patch_old_subj_basis_generator_ckpt()
|
132 |
-
|
133 |
self.subj_basis_generator.extend_prompt2token_proj_attention(\
|
134 |
ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers, -1, -1, 1, perturb_std=0)
|
135 |
ret = self.subj_basis_generator.load_state_dict(ckpt_subj_basis_generator.state_dict(), strict=False)
|
@@ -155,6 +160,11 @@ class FaceID2AdaPrompt(nn.Module):
|
|
155 |
|
156 |
self.subj_basis_generator.freeze_prompt2token_proj()
|
157 |
|
|
|
|
|
|
|
|
|
|
|
158 |
@torch.no_grad()
|
159 |
def get_clip_neg_features(self, BS):
|
160 |
if self.clip_neg_features is None:
|
@@ -220,6 +230,7 @@ class FaceID2AdaPrompt(nn.Module):
|
|
220 |
image_obj, _, _ = pad_image_obj_to_square(image_obj)
|
221 |
image_np = np.array(image_obj.resize(size, Image.NEAREST))
|
222 |
face_info = self.face_app.get(cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR))
|
|
|
223 |
if len(face_info) > 0:
|
224 |
face_info = sorted(face_info, key=lambda x:(x['bbox'][2]-x['bbox'][0])*x['bbox'][3]-x['bbox'][1])[-1] # only use the maximum face
|
225 |
# id_emb: [512,]
|
@@ -487,12 +498,20 @@ class FaceID2AdaPrompt(nn.Module):
|
|
487 |
# avg_at_stage == ada_prompt_emb usually produces the worst results.
|
488 |
# avg_at_stage == id_emb is slightly better than img_prompt_emb, but sometimes img_prompt_emb is better.
|
489 |
# p_dropout and return_zero_embs_for_dropped_encoders are only used by Joint_FaceID2AdaPrompt.
|
|
|
490 |
def generate_adaface_embeddings(self, image_paths, face_id_embs=None, img_prompt_embs=None,
|
491 |
p_dropout=0,
|
492 |
return_zero_embs_for_dropped_encoders=True,
|
493 |
avg_at_stage='id_emb', # id_emb, img_prompt_emb, or None.
|
494 |
perturb_at_stage=None, # id_emb, img_prompt_emb, or None.
|
495 |
-
perturb_std=0, enable_static_img_suffix_embs=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
496 |
if (avg_at_stage is None) or avg_at_stage.lower() == 'none':
|
497 |
img_prompt_avg_at_stage = None
|
498 |
else:
|
@@ -509,7 +528,7 @@ class FaceID2AdaPrompt(nn.Module):
|
|
509 |
id_batch_size = len(image_paths)
|
510 |
else:
|
511 |
id_batch_size = 1
|
512 |
-
|
513 |
# faceid_embeds: [BS, 512] is a batch of extracted face analysis embeddings. NOT used later.
|
514 |
# NOTE: If face_id_embs, image_paths and image_objs are all None,
|
515 |
# then get_img_prompt_embs() generates random faceid_embeds/img_prompt_embs,
|
@@ -532,7 +551,7 @@ class FaceID2AdaPrompt(nn.Module):
|
|
532 |
verbose=True)
|
533 |
|
534 |
if face_image_count == 0:
|
535 |
-
return None
|
536 |
|
537 |
# No matter whether avg_at_stage is id_emb or img_prompt_emb, we average img_prompt_embs.
|
538 |
elif avg_at_stage is not None and avg_at_stage.lower() != 'none':
|
@@ -545,19 +564,27 @@ class FaceID2AdaPrompt(nn.Module):
|
|
545 |
out_id_embs_cfg_scale=self.out_id_embs_cfg_scale,
|
546 |
is_face=True,
|
547 |
enable_static_img_suffix_embs=enable_static_img_suffix_embs)
|
|
|
|
|
|
|
|
|
548 |
# During training, img_prompt_avg_at_stage is None, and BS >= 1.
|
549 |
# During inference, img_prompt_avg_at_stage is 'id_emb' or 'img_prompt_emb', and BS == 1.
|
550 |
if img_prompt_avg_at_stage is not None:
|
551 |
# adaface_subj_embs: [1, 16, 768] -> [16, 768]
|
552 |
adaface_subj_embs = adaface_subj_embs.squeeze(0)
|
553 |
|
554 |
-
return adaface_subj_embs
|
555 |
|
556 |
class Arc2Face_ID2AdaPrompt(FaceID2AdaPrompt):
|
557 |
-
|
558 |
-
|
559 |
-
|
|
|
|
|
|
|
560 |
|
|
|
561 |
super().__init__(*args, **kwargs)
|
562 |
|
563 |
self.clip_image_encoder = CLIPVisionModelWithMask.from_pretrained('openai/clip-vit-large-patch14')
|
@@ -583,7 +610,7 @@ class Arc2Face_ID2AdaPrompt(FaceID2AdaPrompt):
|
|
583 |
self.face_app = FaceAnalysis(name='antelopev2', root='models/insightface',
|
584 |
providers=['CPUExecutionProvider'])
|
585 |
self.face_app.prepare(ctx_id=0, det_size=(512, 512))
|
586 |
-
print(f'Face encoder loaded on CPU.')
|
587 |
|
588 |
self.text_to_image_prompt_encoder = CLIPTextModelWrapper.from_pretrained(
|
589 |
'models/arc2face', subfolder="encoder",
|
@@ -594,21 +621,54 @@ class Arc2Face_ID2AdaPrompt(FaceID2AdaPrompt):
|
|
594 |
if self.out_id_embs_cfg_scale == -1:
|
595 |
self.out_id_embs_cfg_scale = 1
|
596 |
#### Arc2Face pipeline specific configs ####
|
597 |
-
self.gen_neg_img_prompt
|
598 |
# bg CLIP features are used by the bg subject basis generator.
|
599 |
-
self.use_clip_embs
|
600 |
self.do_contrast_clip_embs_on_bg_features = True
|
601 |
# self.num_static_img_suffix_embs is initialized in the parent class.
|
602 |
-
self.id_img_prompt_max_length
|
603 |
-
self.clip_embedding_dim
|
604 |
|
605 |
-
self.
|
606 |
if self.adaface_ckpt_path is not None:
|
607 |
self.load_adaface_ckpt(self.adaface_ckpt_path)
|
608 |
|
609 |
-
|
610 |
-
|
|
|
|
|
|
|
|
|
611 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
612 |
# Arc2Face_ID2AdaPrompt never uses clip_features or called_for_neg_img_prompt.
|
613 |
def map_init_id_to_img_prompt_embs(self, init_id_embs,
|
614 |
clip_features=None,
|
@@ -656,16 +716,17 @@ class Arc2Face_ID2AdaPrompt(FaceID2AdaPrompt):
|
|
656 |
# [N, 22, 768] -> [N, 16, 768]
|
657 |
return prompt_embeds[:, 4:20]
|
658 |
|
659 |
-
def get_id2img_learnable_modules(self):
|
660 |
-
return [ self.text_to_image_prompt_encoder ]
|
661 |
-
|
662 |
# ConsistentID_ID2AdaPrompt is just a wrapper of ConsistentIDPipeline, so it's not an nn.Module.
|
663 |
class ConsistentID_ID2AdaPrompt(FaceID2AdaPrompt):
|
|
|
|
|
|
|
|
|
|
|
|
|
664 |
def __init__(self, pipe=None, base_model_path="models/sd15-dste8-vae.safetensors",
|
665 |
*args, **kwargs):
|
666 |
-
|
667 |
-
self.num_id_vecs = 4
|
668 |
-
|
669 |
super().__init__(*args, **kwargs)
|
670 |
if pipe is None:
|
671 |
# The base_model_path is kind of arbitrary, as the UNet and VAE in the model
|
@@ -712,13 +773,47 @@ class ConsistentID_ID2AdaPrompt(FaceID2AdaPrompt):
|
|
712 |
self.clip_embedding_dim = 1280
|
713 |
self.s_scale = 1.0
|
714 |
self.shortcut = False
|
715 |
-
|
716 |
-
self.
|
717 |
if self.adaface_ckpt_path is not None:
|
718 |
self.load_adaface_ckpt(self.adaface_ckpt_path)
|
719 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
720 |
print(f"{self.name} ada prompt encoder initialized, "
|
721 |
-
f"ID vecs: {self.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
722 |
|
723 |
def map_init_id_to_img_prompt_embs(self, init_id_embs,
|
724 |
clip_features=None,
|
@@ -757,26 +852,30 @@ class ConsistentID_ID2AdaPrompt(FaceID2AdaPrompt):
|
|
757 |
|
758 |
return global_id_embeds
|
759 |
|
760 |
-
def get_id2img_learnable_modules(self):
|
761 |
-
return [ self.image_proj_model ]
|
762 |
-
|
763 |
# A wrapper for combining multiple FaceID2AdaPrompt instances.
|
764 |
class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
765 |
def __init__(self, adaface_encoder_types, adaface_ckpt_paths,
|
766 |
out_id_embs_cfg_scales=None, enabled_encoders=None,
|
767 |
*args, **kwargs):
|
768 |
self.name = 'jointIDs'
|
|
|
769 |
assert len(adaface_encoder_types) > 0, "adaface_encoder_types should not be empty."
|
770 |
-
|
771 |
-
|
|
|
|
|
772 |
for encoder_type in adaface_encoder_types ]
|
773 |
-
self.
|
|
|
|
|
|
|
774 |
# super() sets self.is_training.
|
775 |
super().__init__(*args, **kwargs)
|
776 |
|
777 |
self.num_sub_encoders = len(adaface_encoder_types)
|
778 |
self.id2ada_prompt_encoders = nn.ModuleList()
|
779 |
self.encoders_num_static_img_suffix_embs = []
|
|
|
780 |
|
781 |
# TODO: apply adaface_encoder_cfg_scales to influence the final prompt embeddings.
|
782 |
# Now they are just placeholders.
|
@@ -786,10 +885,12 @@ class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
|
786 |
self.out_id_embs_cfg_scales = [-1] * self.num_sub_encoders
|
787 |
else:
|
788 |
# Do not normalize the weights, and just use them as is.
|
789 |
-
self.out_id_embs_cfg_scales = out_id_embs_cfg_scales
|
790 |
|
791 |
# Note we don't pass the adaface_ckpt_paths to the base class, but instead,
|
792 |
# we load them once and for all in self.load_adaface_ckpt().
|
|
|
|
|
793 |
for i, encoder_type in enumerate(adaface_encoder_types):
|
794 |
kwargs['out_id_embs_cfg_scale'] = self.out_id_embs_cfg_scales[i]
|
795 |
if encoder_type == 'arc2face':
|
@@ -798,8 +899,10 @@ class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
|
798 |
encoder = ConsistentID_ID2AdaPrompt(*args, **kwargs)
|
799 |
else:
|
800 |
breakpoint()
|
|
|
801 |
self.id2ada_prompt_encoders.append(encoder)
|
802 |
self.encoders_num_static_img_suffix_embs.append(encoder.num_static_img_suffix_embs)
|
|
|
803 |
|
804 |
self.num_static_img_suffix_embs = sum(self.encoders_num_static_img_suffix_embs)
|
805 |
# No need to set gen_neg_img_prompt, as we don't access it in this class, but rather
|
@@ -829,7 +932,7 @@ class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
|
829 |
self.load_adaface_ckpt(adaface_ckpt_paths)
|
830 |
|
831 |
print(f"{self.name} ada prompt encoder initialized with {self.num_sub_encoders} sub-encoders. "
|
832 |
-
f"ID vecs: {self.
|
833 |
|
834 |
if enabled_encoders is not None:
|
835 |
self.are_encoders_enabled = \
|
@@ -845,79 +948,79 @@ class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
|
845 |
else:
|
846 |
self.are_encoders_enabled = \
|
847 |
torch.tensor([True] * self.num_sub_encoders)
|
848 |
-
|
849 |
-
for i, encoder in enumerate(self.id2ada_prompt_encoders):
|
850 |
-
if not (self.is_training and self.are_encoders_enabled[i]):
|
851 |
-
for param in encoder.parameters():
|
852 |
-
param.requires_grad = False
|
853 |
-
else:
|
854 |
-
for param in encoder.parameters():
|
855 |
-
param.requires_grad = True
|
856 |
-
|
857 |
def load_adaface_ckpt(self, adaface_ckpt_paths):
|
858 |
-
# If only one adaface ckpt path is provided, then we assume it's the ckpt of the Joint_FaceID2AdaPrompt,
|
859 |
-
# so we dereference the list to get the actual path and load the subj_basis_generators of all adaface encoders.
|
860 |
if isinstance(adaface_ckpt_paths, (list, tuple, ListConfig)):
|
861 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
862 |
adaface_ckpt_paths = adaface_ckpt_paths[0]
|
863 |
-
|
864 |
-
if isinstance(adaface_ckpt_paths, str):
|
865 |
-
# This is only applicable to newest ckpts of Joint_FaceID2AdaPrompt, where
|
866 |
-
# the ckpt_subj_basis_generator is an nn.ModuleList of multiple subj_basis_generators.
|
867 |
-
# Therefore, no need to patch missing variables.
|
868 |
-
ckpt = torch.load(adaface_ckpt_paths, map_location='cpu')
|
869 |
-
string_to_subj_basis_generator_dict = ckpt["string_to_subj_basis_generator_dict"]
|
870 |
-
if self.subject_string not in string_to_subj_basis_generator_dict:
|
871 |
-
print(f"Subject '{self.subject_string}' not found in the embedding manager.")
|
872 |
breakpoint()
|
873 |
|
874 |
-
|
875 |
-
|
876 |
-
|
877 |
-
|
878 |
-
|
|
|
|
|
|
|
|
|
|
|
879 |
|
880 |
-
|
881 |
-
|
882 |
-
|
883 |
-
|
884 |
-
img_prompt_dim=self.output_dim)
|
885 |
-
|
886 |
-
if subj_basis_generator.prompt2token_proj_attention_multipliers \
|
887 |
-
== [1] * 12:
|
888 |
-
subj_basis_generator.extend_prompt2token_proj_attention(\
|
889 |
-
ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers, -1, -1, 1, perturb_std=0)
|
890 |
-
elif subj_basis_generator.prompt2token_proj_attention_multipliers \
|
891 |
-
!= ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers:
|
892 |
-
raise ValueError("Inconsistent prompt2token_proj_attention_multipliers.")
|
893 |
-
|
894 |
-
assert subj_basis_generator.prompt2token_proj_attention_multipliers \
|
895 |
-
== ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers, \
|
896 |
-
"Inconsistent prompt2token_proj_attention_multipliers."
|
897 |
-
subj_basis_generator.load_state_dict(ckpt_subj_basis_generator.state_dict())
|
898 |
-
|
899 |
-
# extend_prompt2token_proj_attention_multiplier is an integer >= 1.
|
900 |
-
# TODO: extend_prompt2token_proj_attention_multiplier should be a list of integers.
|
901 |
-
# If extend_prompt2token_proj_attention_multiplier > 1, then after loading state_dict,
|
902 |
-
# extend subj_basis_generator again.
|
903 |
-
if self.extend_prompt2token_proj_attention_multiplier > 1:
|
904 |
-
# During this extension, the added noise does change the extra copies of attention weights, since they are not in the ckpt.
|
905 |
-
# During training, prompt2token_proj_ext_attention_perturb_ratio == 0.1.
|
906 |
-
# During inference, prompt2token_proj_ext_attention_perturb_ratio == 0.
|
907 |
-
subj_basis_generator.extend_prompt2token_proj_attention(\
|
908 |
-
None, -1, -1, self.extend_prompt2token_proj_attention_multiplier,
|
909 |
-
perturb_std=self.prompt2token_proj_ext_attention_perturb_ratio)
|
910 |
-
|
911 |
-
subj_basis_generator.freeze_prompt2token_proj()
|
912 |
-
|
913 |
-
print(f"{adaface_ckpt_paths}: {len(self.subj_basis_generator)} subj_basis_generators loaded for {self.name}.")
|
914 |
-
|
915 |
-
elif isinstance(adaface_ckpt_paths, (list, tuple, ListConfig)):
|
916 |
-
for i, ckpt_path in enumerate(adaface_ckpt_paths):
|
917 |
-
self.id2ada_prompt_encoders[i].load_adaface_ckpt(ckpt_path)
|
918 |
-
else:
|
919 |
breakpoint()
|
920 |
|
|
|
|
|
921 |
def extract_init_id_embeds_from_images(self, *args, **kwargs):
|
922 |
total_faceless_img_count = 0
|
923 |
all_id_embs = []
|
@@ -1055,7 +1158,7 @@ class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
|
1055 |
|
1056 |
N_ID = self.encoders_num_id_vecs[i]
|
1057 |
if all_pos_prompt_embs[i] is None:
|
1058 |
-
# Both pos_prompt_embs and neg_prompt_embs have N_ID ==
|
1059 |
all_pos_prompt_embs[i] = torch.zeros((BS, N_ID, 768), dtype=torch.float16, device=device)
|
1060 |
if all_neg_prompt_embs[i] is None:
|
1061 |
all_neg_prompt_embs[i] = torch.zeros((BS, N_ID, 768), dtype=torch.float16, device=device)
|
@@ -1077,6 +1180,13 @@ class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
|
1077 |
# So its .device is the device of its parameters.
|
1078 |
device = self.id2ada_prompt_encoders[0].clip_image_encoder.device
|
1079 |
is_emb_averaged = kwargs.get('avg_at_stage', None) is not None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1080 |
BS = -1
|
1081 |
|
1082 |
if face_id_embs is not None:
|
@@ -1084,13 +1194,17 @@ class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
|
1084 |
all_face_id_embs = face_id_embs.split(self.face_id_dims, dim=1)
|
1085 |
else:
|
1086 |
all_face_id_embs = [None] * self.num_sub_encoders
|
|
|
1087 |
if img_prompt_embs is not None:
|
1088 |
BS = img_prompt_embs.shape[0] if BS == -1 else BS
|
1089 |
-
if img_prompt_embs.shape[1] != self.
|
1090 |
breakpoint()
|
1091 |
-
all_img_prompt_embs = img_prompt_embs.split(self.
|
|
|
1092 |
else:
|
1093 |
all_img_prompt_embs = [None] * self.num_sub_encoders
|
|
|
|
|
1094 |
if image_paths is not None:
|
1095 |
BS = len(image_paths) if BS == -1 else BS
|
1096 |
if BS == -1:
|
@@ -1116,23 +1230,29 @@ class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
|
1116 |
self.curr_are_encoders_enabled = are_encoders_enabled
|
1117 |
all_adaface_subj_embs = []
|
1118 |
num_available_id_vecs = 0
|
|
|
1119 |
|
1120 |
for i, id2ada_prompt_encoder in enumerate(self.id2ada_prompt_encoders):
|
1121 |
if not are_encoders_enabled[i]:
|
1122 |
adaface_subj_embs = None
|
1123 |
-
print(f"Encoder {id2ada_prompt_encoder.name} is
|
|
|
|
|
1124 |
else:
|
|
|
1125 |
# ddpm.embedding_manager.train() -> id2ada_prompt_encoder.train() -> each sub-encoder's train().
|
1126 |
# -> each sub-encoder's subj_basis_generator.train().
|
1127 |
# Therefore grad for the following call is enabled.
|
1128 |
-
adaface_subj_embs = \
|
1129 |
id2ada_prompt_encoder.generate_adaface_embeddings(image_paths,
|
1130 |
all_face_id_embs[i],
|
1131 |
all_img_prompt_embs[i],
|
1132 |
*args, **kwargs)
|
1133 |
|
1134 |
-
|
1135 |
-
|
|
|
|
|
1136 |
if adaface_subj_embs is None:
|
1137 |
if not return_zero_embs_for_dropped_encoders:
|
1138 |
continue
|
@@ -1143,12 +1263,16 @@ class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
|
1143 |
all_adaface_subj_embs.append(adaface_subj_embs)
|
1144 |
else:
|
1145 |
all_adaface_subj_embs.append(adaface_subj_embs)
|
|
|
|
|
1146 |
num_available_id_vecs += N_ID
|
1147 |
|
|
|
|
|
1148 |
# No faces are found in the images, so return None embeddings.
|
1149 |
# We don't want to return an all-zero embedding, which is useless.
|
1150 |
if num_available_id_vecs == 0:
|
1151 |
-
return None
|
1152 |
|
1153 |
# If id2ada_prompt_encoders are ["arc2face", "consistentID"], then
|
1154 |
# during inference, we average across the batch dim.
|
@@ -1158,7 +1282,12 @@ class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
|
1158 |
# all_adaface_subj_embs[0]: [BS, 4, 768]. all_adaface_subj_embs[1]: [BS, 16, 768].
|
1159 |
# all_adaface_subj_embs: [BS, 20, 768].
|
1160 |
all_adaface_subj_embs = torch.cat(all_adaface_subj_embs, dim=-2)
|
1161 |
-
|
|
|
|
|
|
|
|
|
|
|
1162 |
|
1163 |
|
1164 |
'''
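The hunks above rework how Joint_FaceID2AdaPrompt merges its sub-encoders. A minimal, self-contained sketch of the concatenation pattern they implement (the encoder names, per-encoder vector counts and the 768-dim width come from this diff; the helper itself is illustrative only):

import torch

# Per-encoder subject-embedding widths taken from the diff:
# consistentID -> 4 ID vectors, arc2face -> 16 ID vectors, both 768-dim.
encoders_num_id_vecs = {"consistentID": 4, "arc2face": 16}

def join_subj_embs(per_encoder_embs, batch_size=1, emb_dim=768):
    # Concatenate per-encoder subject embeddings along the token dim.
    # Disabled encoders (None) become zero blocks so the total token count stays
    # fixed, mirroring return_zero_embs_for_dropped_encoders.
    blocks = []
    for name, n_id in encoders_num_id_vecs.items():
        embs = per_encoder_embs.get(name)
        if embs is None:
            embs = torch.zeros(batch_size, n_id, emb_dim)
        blocks.append(embs)
    # [BS, 4, 768] + [BS, 16, 768] -> [BS, 20, 768]
    return torch.cat(blocks, dim=-2)

joint = join_subj_embs({"consistentID": torch.randn(1, 4, 768), "arc2face": None})
print(joint.shape)  # torch.Size([1, 20, 768])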
|
|
|
53 |
self.text_to_image_prompt_encoder = None
|
54 |
self.tokenizer = None
|
55 |
self.dtype = kwargs.get('dtype', torch.float16)
|
56 |
+
self.img2txt_dtype = kwargs.get('img2txt_dtype', torch.float16)
|
57 |
+
self.device = torch.device("cpu")
|
58 |
|
59 |
# Load Img2Ada SubjectBasisGenerator.
|
60 |
self.subject_string = kwargs.get('subject_string', 'z')
|
|
|
75 |
|
76 |
self.use_clip_embs = False
|
77 |
self.do_contrast_clip_embs_on_bg_features = False
|
78 |
+
# Override the default setting in derived classes.
|
79 |
+
if 'enable_static_img_suffix_embs' in kwargs:
|
80 |
+
self.default_enable_static_img_suffix_embs = kwargs['enable_static_img_suffix_embs']
|
81 |
+
|
82 |
# num_id_vecs is the output embeddings of the ID2ImgPrompt module.
|
83 |
# If there's no static image suffix embeddings, then num_id_vecs is also
|
84 |
# the number of ada embeddings returned by the subject basis generator.
|
85 |
# num_id_vecs will be set in each derived class.
|
86 |
self.num_static_img_suffix_embs = kwargs.get('num_static_img_suffix_embs', 0)
|
87 |
+
print(f'{self.name} Adaface uses {self.num_id_vecs} ID image embeddings + {self.num_static_img_suffix_embs} fixed image embeddings as input.')
|
88 |
|
89 |
self.id_img_prompt_max_length = 77
|
90 |
self.face_id_dim = 512
|
|
|
93 |
self.clip_embedding_dim = 1024
|
94 |
self.output_dim = 768
|
95 |
|
96 |
+
# init_img2txt_projection() can only be called after the derived class is initialized,
|
97 |
+
# when self.num_id_vecs0, self.num_static_img_suffix_embs and self.clip_embedding_dim have been set.
|
98 |
+
def init_img2txt_projection(self):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
self.subj_basis_generator = \
|
100 |
+
SubjBasisGenerator(dtype=self.img2txt_dtype,
|
101 |
+
num_id_vecs = self.num_id_vecs0,
|
102 |
num_static_img_suffix_embs = self.num_static_img_suffix_embs,
|
103 |
bg_image_embedding_dim = self.clip_embedding_dim,
|
104 |
output_dim = self.output_dim,
|
105 |
placeholder_is_bg = False,
|
|
|
106 |
bg_prompt_translator_has_to_out_proj=False)
|
107 |
|
108 |
def load_adaface_ckpt(self, adaface_ckpt_path):
|
109 |
+
if isinstance(adaface_ckpt_path, (list, tuple, ListConfig)):
|
110 |
+
adaface_ckpt_path = adaface_ckpt_path[0]
|
111 |
+
|
112 |
+
ckpt = torch.load(adaface_ckpt_path, map_location='cpu', weights_only=False)
|
113 |
string_to_subj_basis_generator_dict = ckpt["string_to_subj_basis_generator_dict"]
|
114 |
if self.subject_string not in string_to_subj_basis_generator_dict:
|
115 |
print(f"Subject '{self.subject_string}' not found in the embedding manager.")
|
116 |
breakpoint()
|
117 |
|
118 |
ckpt_subj_basis_generator = string_to_subj_basis_generator_dict[self.subject_string]
|
119 |
+
if isinstance(ckpt_subj_basis_generator, nn.ModuleList):
|
120 |
+
name2idx = { 'consistentID': 0, 'arc2face': 1 }
|
121 |
+
subj_basis_generator_idx = name2idx[self.name]
|
122 |
+
ckpt_subj_basis_generator = ckpt_subj_basis_generator[subj_basis_generator_idx]
|
123 |
+
|
124 |
+
ckpt_subj_basis_generator.N_ID = self.num_id_vecs0
|
125 |
# Since we directly use the subject basis generator object from the ckpt,
|
126 |
# fixing the number of static image suffix embeddings is much simpler.
|
127 |
# Otherwise if we want to load the subject basis generator from its state_dict,
|
|
|
134 |
ckpt_subj_basis_generator.initialize_static_img_suffix_embs(self.num_static_img_suffix_embs, img_prompt_dim=self.output_dim)
|
135 |
# Fix missing variables in old ckpt.
|
136 |
ckpt_subj_basis_generator.patch_old_subj_basis_generator_ckpt()
|
137 |
+
|
138 |
self.subj_basis_generator.extend_prompt2token_proj_attention(\
|
139 |
ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers, -1, -1, 1, perturb_std=0)
|
140 |
ret = self.subj_basis_generator.load_state_dict(ckpt_subj_basis_generator.state_dict(), strict=False)
|
|
|
160 |
|
161 |
self.subj_basis_generator.freeze_prompt2token_proj()
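load_adaface_ckpt above selects one subj_basis_generator out of a joint checkpoint when the stored entry is an nn.ModuleList. A small sketch of that dispatch, assuming a checkpoint entry shaped like the one in the diff (ToyGenerator is a stand-in, not the real SubjBasisGenerator):

import torch.nn as nn

class ToyGenerator(nn.Module):  # stand-in for SubjBasisGenerator
    def __init__(self, dim=768):
        super().__init__()
        self.proj = nn.Linear(dim, dim)

def pick_sub_generator(ckpt_entry, encoder_name):
    # Joint checkpoints store an nn.ModuleList; per-encoder checkpoints store a single module.
    name2idx = {"consistentID": 0, "arc2face": 1}
    if isinstance(ckpt_entry, nn.ModuleList):
        return ckpt_entry[name2idx[encoder_name]]
    return ckpt_entry

joint_entry = nn.ModuleList([ToyGenerator(), ToyGenerator()])
gen = ToyGenerator()
gen.load_state_dict(pick_sub_generator(joint_entry, "arc2face").state_dict())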
|
162 |
|
163 |
+
def set_out_id_embs_cfg_scale(self, out_id_embs_cfg_scale):
|
164 |
+
if isinstance(out_id_embs_cfg_scale, (list, tuple, ListConfig)):
|
165 |
+
out_id_embs_cfg_scale = out_id_embs_cfg_scale[0]
|
166 |
+
self.out_id_embs_cfg_scale = out_id_embs_cfg_scale
|
167 |
+
|
168 |
@torch.no_grad()
|
169 |
def get_clip_neg_features(self, BS):
|
170 |
if self.clip_neg_features is None:
|
|
|
230 |
image_obj, _, _ = pad_image_obj_to_square(image_obj)
|
231 |
image_np = np.array(image_obj.resize(size, Image.NEAREST))
|
232 |
face_info = self.face_app.get(cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR))
|
233 |
+
|
234 |
if len(face_info) > 0:
|
235 |
face_info = sorted(face_info, key=lambda x:(x['bbox'][2]-x['bbox'][0])*x['bbox'][3]-x['bbox'][1])[-1] # only use the maximum face
|
236 |
# id_emb: [512,]
|
|
|
498 |
# avg_at_stage == ada_prompt_emb usually produces the worst results.
|
499 |
# avg_at_stage == id_emb is slightly better than img_prompt_emb, but sometimes img_prompt_emb is better.
|
500 |
# p_dropout and return_zero_embs_for_dropped_encoders are only used by Joint_FaceID2AdaPrompt.
|
501 |
+
# enable_static_img_suffix_embs=None: use the default setting.
|
502 |
def generate_adaface_embeddings(self, image_paths, face_id_embs=None, img_prompt_embs=None,
|
503 |
p_dropout=0,
|
504 |
return_zero_embs_for_dropped_encoders=True,
|
505 |
avg_at_stage='id_emb', # id_emb, img_prompt_emb, or None.
|
506 |
perturb_at_stage=None, # id_emb, img_prompt_emb, or None.
|
507 |
+
perturb_std=0, enable_static_img_suffix_embs=None):
|
508 |
+
|
509 |
+
if enable_static_img_suffix_embs is None:
|
510 |
+
enable_static_img_suffix_embs = self.default_enable_static_img_suffix_embs
|
511 |
+
|
512 |
+
lens_subj_emb_segments = [ self.num_id_vecs + enable_static_img_suffix_embs \
|
513 |
+
* self.num_static_img_suffix_embs ]
|
514 |
+
|
515 |
if (avg_at_stage is None) or avg_at_stage.lower() == 'none':
|
516 |
img_prompt_avg_at_stage = None
|
517 |
else:
|
|
|
528 |
id_batch_size = len(image_paths)
|
529 |
else:
|
530 |
id_batch_size = 1
|
531 |
+
|
532 |
# faceid_embeds: [BS, 512] is a batch of extracted face analysis embeddings. NOT used later.
|
533 |
# NOTE: If face_id_embs, image_paths and image_objs are all None,
|
534 |
# then get_img_prompt_embs() generates random faceid_embeds/img_prompt_embs,
|
|
|
551 |
verbose=True)
|
552 |
|
553 |
if face_image_count == 0:
|
554 |
+
return None, None, lens_subj_emb_segments
|
555 |
|
556 |
# No matter whether avg_at_stage is id_emb or img_prompt_emb, we average img_prompt_embs.
|
557 |
elif avg_at_stage is not None and avg_at_stage.lower() != 'none':
|
|
|
564 |
out_id_embs_cfg_scale=self.out_id_embs_cfg_scale,
|
565 |
is_face=True,
|
566 |
enable_static_img_suffix_embs=enable_static_img_suffix_embs)
|
567 |
+
|
568 |
+
if self.num_id_vecs < self.num_id_vecs0:
|
569 |
+
adaface_subj_embs = adaface_subj_embs[:, :self.num_id_vecs, :]
|
570 |
+
|
571 |
# During training, img_prompt_avg_at_stage is None, and BS >= 1.
|
572 |
# During inference, img_prompt_avg_at_stage is 'id_emb' or 'img_prompt_emb', and BS == 1.
|
573 |
if img_prompt_avg_at_stage is not None:
|
574 |
# adaface_subj_embs: [1, 16, 768] -> [16, 768]
|
575 |
adaface_subj_embs = adaface_subj_embs.squeeze(0)
|
576 |
|
577 |
+
return adaface_subj_embs, img_prompt_embs, lens_subj_emb_segments
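The lens_subj_emb_segments bookkeeping above leans on Python treating a bool as 0/1. A tiny illustration with the per-encoder counts used in this file (the value 4 for num_static_img_suffix_embs is just an example):

def subj_emb_segment_len(num_id_vecs, num_static_img_suffix_embs, enable_static_img_suffix_embs):
    # bool * int: the static suffix embeddings only count when they are enabled.
    return num_id_vecs + enable_static_img_suffix_embs * num_static_img_suffix_embs

# arc2face: 16 ID vectors; consistentID: 4 ID vectors.
print(subj_emb_segment_len(16, 4, False))  # 16
print(subj_emb_segment_len(16, 4, True))   # 20
print(subj_emb_segment_len(4, 4, True))    # 8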
|
578 |
|
579 |
class Arc2Face_ID2AdaPrompt(FaceID2AdaPrompt):
|
580 |
+
name = 'arc2face'
|
581 |
+
num_id_vecs0 = 16
|
582 |
+
# first 4 are kept, the remaining 12 are averaged into another 4.
|
583 |
+
# Then concatenated to [8, 768].
|
584 |
+
num_id_vecs = 16
|
585 |
+
default_enable_static_img_suffix_embs = False
|
586 |
|
587 |
+
def __init__(self, *args, **kwargs):
|
588 |
super().__init__(*args, **kwargs)
|
589 |
|
590 |
self.clip_image_encoder = CLIPVisionModelWithMask.from_pretrained('openai/clip-vit-large-patch14')
|
|
|
610 |
self.face_app = FaceAnalysis(name='antelopev2', root='models/insightface',
|
611 |
providers=['CPUExecutionProvider'])
|
612 |
self.face_app.prepare(ctx_id=0, det_size=(512, 512))
|
613 |
+
print(f'Arc2Face Face encoder loaded on CPU.')
|
614 |
|
615 |
self.text_to_image_prompt_encoder = CLIPTextModelWrapper.from_pretrained(
|
616 |
'models/arc2face', subfolder="encoder",
|
|
|
621 |
if self.out_id_embs_cfg_scale == -1:
|
622 |
self.out_id_embs_cfg_scale = 1
|
623 |
#### Arc2Face pipeline specific configs ####
|
624 |
+
self.gen_neg_img_prompt = False
|
625 |
# bg CLIP features are used by the bg subject basis generator.
|
626 |
+
self.use_clip_embs = True
|
627 |
self.do_contrast_clip_embs_on_bg_features = True
|
628 |
# self.num_static_img_suffix_embs is initialized in the parent class.
|
629 |
+
self.id_img_prompt_max_length = 22
|
630 |
+
self.clip_embedding_dim = 1024
|
631 |
|
632 |
+
self.init_img2txt_projection()
|
633 |
if self.adaface_ckpt_path is not None:
|
634 |
self.load_adaface_ckpt(self.adaface_ckpt_path)
|
635 |
|
636 |
+
for param in self.clip_image_encoder.parameters():
|
637 |
+
param.requires_grad = False
|
638 |
+
for param in self.text_to_image_prompt_encoder.parameters():
|
639 |
+
param.requires_grad = False
|
640 |
+
for param in self.subj_basis_generator.parameters():
|
641 |
+
param.requires_grad = self.is_training
|
642 |
|
643 |
+
print(f"{self.name} ada prompt encoder initialized, "
|
644 |
+
f"ID vecs: {self.num_id_vecs0}, static suffix: {self.num_static_img_suffix_embs}.")
|
645 |
+
|
646 |
+
def _apply(self, fn):
|
647 |
+
super()._apply(fn) # Call the parent _apply to handle parameters and buffers
|
648 |
+
# A dirty hack to get the device of the model, passed from
|
649 |
+
# parent.model.to(self.root_device) => parent._apply(convert) => module._apply(fn)
|
650 |
+
test_tensor = torch.zeros(1) # Create a test tensor
|
651 |
+
transformed_tensor = fn(test_tensor) # Apply `fn()` to test it
|
652 |
+
device = transformed_tensor.device # Get the device of the transformed tensor
|
653 |
+
# No need to reload face_app on the same device.
|
654 |
+
if device == self.device:
|
655 |
+
return
|
656 |
+
|
657 |
+
if str(device) == 'cpu':
|
658 |
+
self.face_app = FaceAnalysis(name='antelopev2', root='models/insightface',
|
659 |
+
providers=['CPUExecutionProvider'])
|
660 |
+
self.face_app.prepare(ctx_id=0, det_size=(512, 512))
|
661 |
+
else:
|
662 |
+
device_id = device.index
|
663 |
+
self.face_app = FaceAnalysis(name='antelopev2', root='models/insightface',
|
664 |
+
providers=['CUDAExecutionProvider'],
|
665 |
+
provider_options=[{"device_id": str(device_id)}])
|
666 |
+
self.face_app.prepare(ctx_id=device_id, det_size=(512, 512))
|
667 |
+
|
668 |
+
self.device = device
|
669 |
+
print(f'Arc2Face Face encoder reloaded on {device}.')
|
670 |
+
return
|
671 |
+
|
672 |
# Arc2Face_ID2AdaPrompt never uses clip_features or called_for_neg_img_prompt.
|
673 |
def map_init_id_to_img_prompt_embs(self, init_id_embs,
|
674 |
clip_features=None,
|
|
|
716 |
# [N, 22, 768] -> [N, 16, 768]
|
717 |
return prompt_embeds[:, 4:20]
|
718 |
|
|
|
|
|
|
|
719 |
# ConsistentID_ID2AdaPrompt is just a wrapper of ConsistentIDPipeline, so it's not an nn.Module.
|
720 |
class ConsistentID_ID2AdaPrompt(FaceID2AdaPrompt):
|
721 |
+
name = 'consistentID'
|
722 |
+
num_id_vecs0 = 4
|
723 |
+
# No compression for ConsistentID.
|
724 |
+
num_id_vecs = 4
|
725 |
+
default_enable_static_img_suffix_embs = False
|
726 |
+
|
727 |
def __init__(self, pipe=None, base_model_path="models/sd15-dste8-vae.safetensors",
|
728 |
*args, **kwargs):
|
729 |
+
|
|
|
|
|
730 |
super().__init__(*args, **kwargs)
|
731 |
if pipe is None:
|
732 |
# The base_model_path is kind of arbitrary, as the UNet and VAE in the model
|
|
|
773 |
self.clip_embedding_dim = 1280
|
774 |
self.s_scale = 1.0
|
775 |
self.shortcut = False
|
776 |
+
|
777 |
+
self.init_img2txt_projection()
|
778 |
if self.adaface_ckpt_path is not None:
|
779 |
self.load_adaface_ckpt(self.adaface_ckpt_path)
|
780 |
|
781 |
+
for param in self.clip_image_encoder.parameters():
|
782 |
+
param.requires_grad = False
|
783 |
+
for param in self.image_proj_model.parameters():
|
784 |
+
param.requires_grad = False
|
785 |
+
for param in self.subj_basis_generator.parameters():
|
786 |
+
param.requires_grad = self.is_training
|
787 |
+
|
788 |
print(f"{self.name} ada prompt encoder initialized, "
|
789 |
+
f"ID vecs: {self.num_id_vecs0}, static suffix: {self.num_static_img_suffix_embs}.")
|
790 |
+
|
791 |
+
def _apply(self, fn):
|
792 |
+
super()._apply(fn) # Call the parent _apply to handle parameters and buffers
|
793 |
+
# A dirty hack to get the device of the model, passed from
|
794 |
+
# parent.model.to(self.root_device) => parent._apply(convert) => module._apply(fn)
|
795 |
+
test_tensor = torch.zeros(1) # Create a test tensor
|
796 |
+
transformed_tensor = fn(test_tensor) # Apply `fn()` to test it
|
797 |
+
device = transformed_tensor.device # Get the device of the transformed tensor
|
798 |
+
# No need to reload face_app on the same device.
|
799 |
+
if device == self.device:
|
800 |
+
return
|
801 |
+
|
802 |
+
if str(device) == 'cpu':
|
803 |
+
self.face_app = FaceAnalysis(name='buffalo_l', root='models/insightface',
|
804 |
+
providers=['CPUExecutionProvider'])
|
805 |
+
self.face_app.prepare(ctx_id=0, det_size=(512, 512))
|
806 |
+
else:
|
807 |
+
device_id = device.index
|
808 |
+
self.face_app = FaceAnalysis(name='buffalo_l', root='models/insightface',
|
809 |
+
providers=['CUDAExecutionProvider'],
|
810 |
+
provider_options=[{"device_id": str(device_id)}])
|
811 |
+
self.face_app.prepare(ctx_id=device_id, det_size=(512, 512))
|
812 |
+
|
813 |
+
self.device = device
|
814 |
+
self.pipe.face_app = self.face_app
|
815 |
+
print(f'ConsistentID Face encoder reloaded on {device}.')
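The _apply override above is a device-tracking trick: probe fn() with a dummy tensor to learn where .to() is sending the module, then rebuild the InsightFace session on that device. A stripped-down sketch of the same trick with a placeholder resource instead of FaceAnalysis:

import torch
import torch.nn as nn

class DeviceFollower(nn.Module):
    # Sketch only: the "resource" string stands in for a device-bound object
    # (such as an ONNX Runtime session) that .to() cannot move by itself.
    def __init__(self):
        super().__init__()
        self.dummy = nn.Linear(4, 4)
        self.device = torch.device("cpu")
        self.resource = "resource-on-cpu"

    def _apply(self, fn):
        super()._apply(fn)                      # move parameters/buffers as usual
        device = fn(torch.zeros(1)).device      # where .to()/.cuda() is sending tensors
        if device != self.device:
            self.device = device
            self.resource = f"resource-on-{device}"  # re-create the device-bound resource
        return self

m = DeviceFollower().to("cpu")
print(m.device, m.resource)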
|
816 |
+
|
817 |
|
818 |
def map_init_id_to_img_prompt_embs(self, init_id_embs,
|
819 |
clip_features=None,
|
|
|
852 |
|
853 |
return global_id_embeds
|
854 |
|
|
|
|
|
|
|
855 |
# A wrapper for combining multiple FaceID2AdaPrompt instances.
|
856 |
class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
857 |
def __init__(self, adaface_encoder_types, adaface_ckpt_paths,
|
858 |
out_id_embs_cfg_scales=None, enabled_encoders=None,
|
859 |
*args, **kwargs):
|
860 |
self.name = 'jointIDs'
|
861 |
+
name2class = { 'arc2face': Arc2Face_ID2AdaPrompt, 'consistentID': ConsistentID_ID2AdaPrompt }
|
862 |
assert len(adaface_encoder_types) > 0, "adaface_encoder_types should not be empty."
|
863 |
+
adaface_encoder_types2num_id_vecs0 = { name: name2class[name].num_id_vecs0 for name in adaface_encoder_types }
|
864 |
+
adaface_encoder_types2num_id_vecs = { name: name2class[name].num_id_vecs for name in adaface_encoder_types }
|
865 |
+
# self.num_id_vecs0 is used in the parent class. So we need to initialize it here first.
|
866 |
+
self.encoders_num_id_vecs0 = [ adaface_encoder_types2num_id_vecs0[encoder_type] \
|
867 |
for encoder_type in adaface_encoder_types ]
|
868 |
+
self.encoders_num_id_vecs = [ adaface_encoder_types2num_id_vecs[encoder_type] \
|
869 |
+
for encoder_type in adaface_encoder_types ]
|
870 |
+
self.num_id_vecs0 = sum(self.encoders_num_id_vecs0)
|
871 |
+
self.num_id_vecs = sum(self.encoders_num_id_vecs)
|
872 |
# super() sets self.is_training.
|
873 |
super().__init__(*args, **kwargs)
|
874 |
|
875 |
self.num_sub_encoders = len(adaface_encoder_types)
|
876 |
self.id2ada_prompt_encoders = nn.ModuleList()
|
877 |
self.encoders_num_static_img_suffix_embs = []
|
878 |
+
self.default_enable_static_img_suffix_embs = []
|
879 |
|
880 |
# TODO: apply adaface_encoder_cfg_scales to influence the final prompt embeddings.
|
881 |
# Now they are just placeholders.
|
|
|
885 |
self.out_id_embs_cfg_scales = [-1] * self.num_sub_encoders
|
886 |
else:
|
887 |
# Do not normalize the weights, and just use them as is.
|
888 |
+
self.out_id_embs_cfg_scales = list(out_id_embs_cfg_scales)
|
889 |
|
890 |
# Note we don't pass the adaface_ckpt_paths to the base class, but instead,
|
891 |
# we load them once and for all in self.load_adaface_ckpt().
|
892 |
+
# NOTE: during inference, num_static_img_suffix_embs is fixed to be 4 for each encoder.
|
893 |
+
# But we can still disable static_img_suffix_embs by setting enable_static_img_suffix_embs to False.
|
894 |
for i, encoder_type in enumerate(adaface_encoder_types):
|
895 |
kwargs['out_id_embs_cfg_scale'] = self.out_id_embs_cfg_scales[i]
|
896 |
if encoder_type == 'arc2face':
|
|
|
899 |
encoder = ConsistentID_ID2AdaPrompt(*args, **kwargs)
|
900 |
else:
|
901 |
breakpoint()
|
902 |
+
|
903 |
self.id2ada_prompt_encoders.append(encoder)
|
904 |
self.encoders_num_static_img_suffix_embs.append(encoder.num_static_img_suffix_embs)
|
905 |
+
self.default_enable_static_img_suffix_embs.append(encoder.default_enable_static_img_suffix_embs)
|
906 |
|
907 |
self.num_static_img_suffix_embs = sum(self.encoders_num_static_img_suffix_embs)
|
908 |
# No need to set gen_neg_img_prompt, as we don't access it in this class, but rather
|
|
|
932 |
self.load_adaface_ckpt(adaface_ckpt_paths)
|
933 |
|
934 |
print(f"{self.name} ada prompt encoder initialized with {self.num_sub_encoders} sub-encoders. "
|
935 |
+
f"ID vecs: {self.num_id_vecs0}, static suffix embs: {self.num_static_img_suffix_embs}.")
|
936 |
|
937 |
if enabled_encoders is not None:
|
938 |
self.are_encoders_enabled = \
|
|
|
948 |
else:
|
949 |
self.are_encoders_enabled = \
|
950 |
torch.tensor([True] * self.num_sub_encoders)
|
951 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
952 |
def load_adaface_ckpt(self, adaface_ckpt_paths):
|
|
|
|
|
953 |
if isinstance(adaface_ckpt_paths, (list, tuple, ListConfig)):
|
954 |
+
# If multiple adaface ckpt paths are provided, then we assume they are the
|
955 |
+
# ckpts of the sub-encoders.
|
956 |
+
if len(adaface_ckpt_paths) == self.num_sub_encoders:
|
957 |
+
for i, ckpt_path in enumerate(adaface_ckpt_paths):
|
958 |
+
self.id2ada_prompt_encoders[i].load_adaface_ckpt(ckpt_path)
|
959 |
+
return
|
960 |
+
# If only one adaface ckpt path is provided, then we assume it's the ckpt of the Joint_FaceID2AdaPrompt,
|
961 |
+
# so we dereference the list to get the actual path and load the subj_basis_generators of all adaface encoders.
|
962 |
+
elif len(adaface_ckpt_paths) == 1 and self.num_sub_encoders > 1:
|
963 |
adaface_ckpt_paths = adaface_ckpt_paths[0]
|
964 |
+
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
965 |
breakpoint()
|
966 |
|
967 |
+
adaface_ckpt_path = adaface_ckpt_paths
|
968 |
+
assert isinstance(adaface_ckpt_path, str), "adaface_ckpt_path should be a string."
|
969 |
+
# This is only applicable to newest ckpts of Joint_FaceID2AdaPrompt, where
|
970 |
+
# the ckpt_subj_basis_generator is an nn.ModuleList of multiple subj_basis_generators.
|
971 |
+
# Therefore, no need to patch missing variables.
|
972 |
+
ckpt = torch.load(adaface_ckpt_paths, map_location='cpu', weights_only=False)
|
973 |
+
string_to_subj_basis_generator_dict = ckpt["string_to_subj_basis_generator_dict"]
|
974 |
+
if self.subject_string not in string_to_subj_basis_generator_dict:
|
975 |
+
print(f"Subject '{self.subject_string}' not found in the embedding manager.")
|
976 |
+
breakpoint()
|
977 |
|
978 |
+
ckpt_subj_basis_generators = string_to_subj_basis_generator_dict[self.subject_string]
|
979 |
+
if len(ckpt_subj_basis_generators) != self.num_sub_encoders:
|
980 |
+
print(f"Number of subj_basis_generators in the ckpt ({len(ckpt_subj_basis_generators)}) "
|
981 |
+
f"doesn't match the number of adaface encoders ({self.num_sub_encoders}).")
|
|
|
|
|
|
982 |
breakpoint()
|
983 |
|
984 |
+
for i, subj_basis_generator in enumerate(self.subj_basis_generator):
|
985 |
+
ckpt_subj_basis_generator = ckpt_subj_basis_generators[i]
|
986 |
+
# Handle differences in num_static_img_suffix_embs between the current model and the ckpt.
|
987 |
+
ckpt_subj_basis_generator.initialize_static_img_suffix_embs(self.encoders_num_static_img_suffix_embs[i],
|
988 |
+
img_prompt_dim=self.output_dim)
|
989 |
+
|
990 |
+
if subj_basis_generator.prompt2token_proj_attention_multipliers \
|
991 |
+
== [1] * 12:
|
992 |
+
subj_basis_generator.extend_prompt2token_proj_attention(\
|
993 |
+
ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers, -1, -1, 1, perturb_std=0)
|
994 |
+
elif subj_basis_generator.prompt2token_proj_attention_multipliers \
|
995 |
+
!= ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers:
|
996 |
+
raise ValueError("Inconsistent prompt2token_proj_attention_multipliers.")
|
997 |
+
|
998 |
+
assert subj_basis_generator.prompt2token_proj_attention_multipliers \
|
999 |
+
== ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers, \
|
1000 |
+
"Inconsistent prompt2token_proj_attention_multipliers."
|
1001 |
+
subj_basis_generator.load_state_dict(ckpt_subj_basis_generator.state_dict())
|
1002 |
+
|
1003 |
+
# extend_prompt2token_proj_attention_multiplier is an integer >= 1.
|
1004 |
+
# TODO: extend_prompt2token_proj_attention_multiplier should be a list of integers.
|
1005 |
+
# If extend_prompt2token_proj_attention_multiplier > 1, then after loading state_dict,
|
1006 |
+
# extend subj_basis_generator again.
|
1007 |
+
if self.extend_prompt2token_proj_attention_multiplier > 1:
|
1008 |
+
# During this extension, the added noise does change the extra copies of attention weights, since they are not in the ckpt.
|
1009 |
+
# During training, prompt2token_proj_ext_attention_perturb_ratio == 0.1.
|
1010 |
+
# During inference, prompt2token_proj_ext_attention_perturb_ratio == 0.
|
1011 |
+
subj_basis_generator.extend_prompt2token_proj_attention(\
|
1012 |
+
None, -1, -1, self.extend_prompt2token_proj_attention_multiplier,
|
1013 |
+
perturb_std=self.prompt2token_proj_ext_attention_perturb_ratio)
|
1014 |
+
|
1015 |
+
subj_basis_generator.freeze_prompt2token_proj()
|
1016 |
+
|
1017 |
+
print(f"{adaface_ckpt_paths}: {len(self.subj_basis_generator)} subj_basis_generators loaded for {self.name}.")
|
1018 |
+
|
1019 |
+
def set_out_id_embs_cfg_scale(self, out_id_embs_cfg_scales):
|
1020 |
+
self.out_id_embs_cfg_scales = list(out_id_embs_cfg_scales)
|
1021 |
+
for i, out_id_embs_cfg_scale in enumerate(out_id_embs_cfg_scales):
|
1022 |
+
self.id2ada_prompt_encoders[i].set_out_id_embs_cfg_scale(out_id_embs_cfg_scale)
|
1023 |
+
|
1024 |
def extract_init_id_embeds_from_images(self, *args, **kwargs):
|
1025 |
total_faceless_img_count = 0
|
1026 |
all_id_embs = []
|
|
|
1158 |
|
1159 |
N_ID = self.encoders_num_id_vecs[i]
|
1160 |
if all_pos_prompt_embs[i] is None:
|
1161 |
+
# Both pos_prompt_embs and neg_prompt_embs have N_ID == num_id_vecs0 embeddings.
|
1162 |
all_pos_prompt_embs[i] = torch.zeros((BS, N_ID, 768), dtype=torch.float16, device=device)
|
1163 |
if all_neg_prompt_embs[i] is None:
|
1164 |
all_neg_prompt_embs[i] = torch.zeros((BS, N_ID, 768), dtype=torch.float16, device=device)
|
|
|
1180 |
# So its .device is the device of its parameters.
|
1181 |
device = self.id2ada_prompt_encoders[0].clip_image_encoder.device
|
1182 |
is_emb_averaged = kwargs.get('avg_at_stage', None) is not None
|
1183 |
+
if kwargs.get('enable_static_img_suffix_embs', None) is None:
|
1184 |
+
enable_static_img_suffix_embs = self.default_enable_static_img_suffix_embs
|
1185 |
+
else:
|
1186 |
+
enable_static_img_suffix_embs = kwargs['enable_static_img_suffix_embs']
|
1187 |
+
if isinstance(enable_static_img_suffix_embs, bool):
|
1188 |
+
enable_static_img_suffix_embs = [enable_static_img_suffix_embs] * self.num_sub_encoders
|
1189 |
+
|
1190 |
BS = -1
|
1191 |
|
1192 |
if face_id_embs is not None:
|
|
|
1194 |
all_face_id_embs = face_id_embs.split(self.face_id_dims, dim=1)
|
1195 |
else:
|
1196 |
all_face_id_embs = [None] * self.num_sub_encoders
|
1197 |
+
|
1198 |
if img_prompt_embs is not None:
|
1199 |
BS = img_prompt_embs.shape[0] if BS == -1 else BS
|
1200 |
+
if img_prompt_embs.shape[1] != self.num_id_vecs0:
|
1201 |
breakpoint()
|
1202 |
+
all_img_prompt_embs = img_prompt_embs.split(self.encoders_num_id_vecs0, dim=1)
|
1203 |
+
img_prompt_embs_provided = True
|
1204 |
else:
|
1205 |
all_img_prompt_embs = [None] * self.num_sub_encoders
|
1206 |
+
img_prompt_embs_provided = False
|
1207 |
+
|
1208 |
if image_paths is not None:
|
1209 |
BS = len(image_paths) if BS == -1 else BS
|
1210 |
if BS == -1:
|
|
|
1230 |
self.curr_are_encoders_enabled = are_encoders_enabled
|
1231 |
all_adaface_subj_embs = []
|
1232 |
num_available_id_vecs = 0
|
1233 |
+
lens_subj_emb_segments = []
|
1234 |
|
1235 |
for i, id2ada_prompt_encoder in enumerate(self.id2ada_prompt_encoders):
|
1236 |
if not are_encoders_enabled[i]:
|
1237 |
adaface_subj_embs = None
|
1238 |
+
print(f"Encoder {id2ada_prompt_encoder.name} is disabled.")
|
1239 |
+
N_ID = id2ada_prompt_encoder.num_id_vecs + enable_static_img_suffix_embs[i] \
|
1240 |
+
* id2ada_prompt_encoder.num_static_img_suffix_embs
|
1241 |
else:
|
1242 |
+
kwargs['enable_static_img_suffix_embs'] = enable_static_img_suffix_embs[i]
|
1243 |
# ddpm.embedding_manager.train() -> id2ada_prompt_encoder.train() -> each sub-encoder's train().
|
1244 |
# -> each sub-encoder's subj_basis_generator.train().
|
1245 |
# Therefore grad for the following call is enabled.
|
1246 |
+
adaface_subj_embs, img_prompt_embs, encoder_lens_subj_emb_segments = \
|
1247 |
id2ada_prompt_encoder.generate_adaface_embeddings(image_paths,
|
1248 |
all_face_id_embs[i],
|
1249 |
all_img_prompt_embs[i],
|
1250 |
*args, **kwargs)
|
1251 |
|
1252 |
+
# adaface_subj_embs: arc2face [16, 768] or consistentID [4, 768],
|
1253 |
+
# or arc2face [20, 768] or consistentID [8, 768] if enable_static_img_suffix_embs=True.
|
1254 |
+
N_ID = encoder_lens_subj_emb_segments[0]
|
1255 |
+
|
1256 |
if adaface_subj_embs is None:
|
1257 |
if not return_zero_embs_for_dropped_encoders:
|
1258 |
continue
|
|
|
1263 |
all_adaface_subj_embs.append(adaface_subj_embs)
|
1264 |
else:
|
1265 |
all_adaface_subj_embs.append(adaface_subj_embs)
|
1266 |
+
if not img_prompt_embs_provided:
|
1267 |
+
all_img_prompt_embs[i] = img_prompt_embs
|
1268 |
num_available_id_vecs += N_ID
|
1269 |
|
1270 |
+
lens_subj_emb_segments.append(N_ID)
|
1271 |
+
|
1272 |
# No faces are found in the images, so return None embeddings.
|
1273 |
# We don't want to return an all-zero embedding, which is useless.
|
1274 |
if num_available_id_vecs == 0:
|
1275 |
+
return None, [0]
|
1276 |
|
1277 |
# If id2ada_prompt_encoders are ["arc2face", "consistentID"], then
|
1278 |
# during inference, we average across the batch dim.
|
|
|
1282 |
# all_adaface_subj_embs[0]: [BS, 4, 768]. all_adaface_subj_embs[1]: [BS, 16, 768].
|
1283 |
# all_adaface_subj_embs: [BS, 20, 768].
|
1284 |
all_adaface_subj_embs = torch.cat(all_adaface_subj_embs, dim=-2)
|
1285 |
+
# Check if some of the img_prompt_embs are None.
|
1286 |
+
if None in all_img_prompt_embs:
|
1287 |
+
all_img_prompt_embs = None
|
1288 |
+
else:
|
1289 |
+
all_img_prompt_embs = torch.cat(all_img_prompt_embs, dim=-2)
|
1290 |
+
return all_adaface_subj_embs, all_img_prompt_embs, lens_subj_emb_segments
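generate_adaface_embeddings above splits a jointly stacked image-prompt tensor back into per-encoder chunks and re-joins the resulting subject embeddings. A short shape walkthrough with the vector counts from this diff:

import torch

# Widths taken from the diff: consistentID contributes 4 image-prompt vectors,
# arc2face contributes 16; a joint tensor stacks them along the token dim.
encoders_num_id_vecs0 = [4, 16]

img_prompt_embs = torch.randn(2, sum(encoders_num_id_vecs0), 768)  # [BS, 20, 768]
per_encoder = img_prompt_embs.split(encoders_num_id_vecs0, dim=1)  # [BS, 4, 768], [BS, 16, 768]
print([tuple(t.shape) for t in per_encoder])

# After each sub-encoder maps its chunk, the subject embeddings are re-joined:
rejoined = torch.cat(per_encoder, dim=-2)
assert rejoined.shape == img_prompt_embs.shape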
|
1291 |
|
1292 |
|
1293 |
'''
|
adaface/subj_basis_generator.py
CHANGED
@@ -9,7 +9,7 @@ import torch
|
|
9 |
from torch import nn
|
10 |
from einops import rearrange
|
11 |
from einops.layers.torch import Rearrange
|
12 |
-
from transformers import CLIPTokenizer, CLIPTextModel
|
13 |
|
14 |
from torch import einsum
|
15 |
from adaface.util import gen_gradient_scaler
|
@@ -57,7 +57,25 @@ class IP_MLPProjModel(nn.Module):
|
|
57 |
x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
|
58 |
x = self.norm(x)
|
59 |
return x
|
60 |
-
|
|
|
|
|
|
61 |
# group_dim: the tensor dimension that corresponds to the multiple groups.
|
62 |
class LearnedSoftAggregate(nn.Module):
|
63 |
def __init__(self, num_feat, group_dim, keepdim=False):
|
@@ -349,23 +367,26 @@ class CrossAttention(nn.Module):
|
|
349 |
else:
|
350 |
return out
|
351 |
|
|
|
352 |
class ImgPrompt2TextPrompt(nn.Module):
|
353 |
-
def __init__(self, placeholder_is_bg, num_id_vecs,
|
|
|
354 |
super().__init__()
|
355 |
self.N_ID = num_id_vecs
|
356 |
# If not placeholder_is_bg, then N_SFX will be updated in initialize_text_components().
|
357 |
self.N_SFX = 0
|
|
|
358 |
|
359 |
if not placeholder_is_bg:
|
360 |
-
self.
|
|
|
361 |
|
362 |
# prompt2token_proj: arc2face_models.py CLIPTextModelWrapper instance with **custom weights**.
|
363 |
# prompt2token_proj is with the same architecture as the original arc2face text encoder,
|
364 |
# but retrained to do inverse mapping.
|
365 |
# To be initialized in the subclass.
|
366 |
self.prompt2token_proj = None
|
367 |
-
|
368 |
-
|
369 |
def initialize_static_img_suffix_embs(self, num_static_img_suffix_embs, img_prompt_dim=768):
|
370 |
self.N_SFX = num_static_img_suffix_embs
|
371 |
# We always take the first num_static_img_suffix_embs embeddings out of static_img_suffix_embs.
|
@@ -376,11 +397,11 @@ class ImgPrompt2TextPrompt(nn.Module):
|
|
376 |
print(f"static_img_suffix_embs had been initialized to be {self.static_img_suffix_embs.shape[1]} vecs ({self.N_SFX} required). Skip initialization.")
|
377 |
elif self.static_img_suffix_embs.shape[1] < self.N_SFX:
|
378 |
print(f"static_img_suffix_embs had been initialized to be {self.static_img_suffix_embs.shape[1]} vecs (< {self.N_SFX} required). Reinitialize.")
|
379 |
-
self.static_img_suffix_embs = nn.Parameter(torch.randn(1, self.N_SFX, img_prompt_dim))
|
380 |
elif self.N_SFX > 0:
|
381 |
# self.static_img_suffix_embs.shape[1] > self.N_SFX > 0.
|
382 |
print(f"static_img_suffix_embs had been initialized to be {self.static_img_suffix_embs.shape[1]} vecs (> {self.N_SFX} required). Truncate.")
|
383 |
-
self.static_img_suffix_embs = nn.Parameter(self.static_img_suffix_embs[:, :self.N_SFX])
|
384 |
else:
|
385 |
# self.static_img_suffix_embs.shape[1] > self.N_SFX == 0.
|
386 |
print(f"static_img_suffix_embs had been initialized to be {self.static_img_suffix_embs.shape[1]} vecs (0 required). Erase.")
|
@@ -391,7 +412,7 @@ class ImgPrompt2TextPrompt(nn.Module):
|
|
391 |
# or it's initialized but has fewer than num_static_img_suffix_embs embeddings (this situation should be very rare,
|
392 |
# so we don't consider to reuse and extend a shorter static_img_suffix_embs).
|
393 |
# So we reinitialize it.
|
394 |
-
self.static_img_suffix_embs = nn.Parameter(torch.randn(1, self.N_SFX, img_prompt_dim))
|
395 |
else:
|
396 |
# If static_img_suffix_embs had been initialized, then it will be set to None, i.e., erased from the SubjBasisGenerator instance.
|
397 |
self.static_img_suffix_embs = None
|
@@ -399,9 +420,7 @@ class ImgPrompt2TextPrompt(nn.Module):
|
|
399 |
# Implement a separate initialization function, so that it can be called from SubjBasisGenerator
|
400 |
# after the SubjBasisGenerator is initialized. This can be used to fix old SubjBasisGenerator
|
401 |
# ckpts which were not subclassed from ImgPrompt2TextPrompt.
|
402 |
-
def initialize_text_components(self, max_prompt_length=77
|
403 |
-
num_static_img_suffix_embs=0, img_prompt_dim=768):
|
404 |
-
self.initialize_static_img_suffix_embs(num_static_img_suffix_embs, img_prompt_dim)
|
405 |
self.max_prompt_length = max_prompt_length
|
406 |
self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
|
407 |
# clip_text_embeddings: CLIPTextEmbeddings instance.
|
@@ -416,7 +435,7 @@ class ImgPrompt2TextPrompt(nn.Module):
|
|
416 |
# pad_embeddings is still on CPU. But should be moved to GPU automatically.
|
417 |
# Note: detach pad_embeddings from the computation graph, otherwise
|
418 |
# deepcopy() in embedding_manager.py:make_frozen_copy_of_subj_basis_generators() will fail.
|
419 |
-
self.pad_embeddings = clip_text_embeddings(pad_tokens)[0].detach()
|
420 |
|
421 |
# image prompt space -> text prompt space.
|
422 |
# return_emb_types: a list of strings, each string is among
|
@@ -439,7 +458,7 @@ class ImgPrompt2TextPrompt(nn.Module):
|
|
439 |
else:
|
440 |
breakpoint()
|
441 |
else:
|
442 |
-
# len(face_prompt_embs) == 1, this occurs when same_subject_in_batch == True, e.g. in
|
443 |
# But list_extra_words always corresponds to the actual batch size. So we only take the first element.
|
444 |
list_extra_words = list_extra_words[:1]
|
445 |
|
@@ -466,7 +485,7 @@ class ImgPrompt2TextPrompt(nn.Module):
|
|
466 |
face_prompt_embs_orig_dtype = face_prompt_embs.dtype
|
467 |
face_prompt_embs = face_prompt_embs.to(self.dtype)
|
468 |
|
469 |
-
ID_END = 4
|
470 |
PAD_BEGIN = ID_END + self.N_SFX + 2
|
471 |
|
472 |
# token_embs: [1, 77, 768]. This call is only to get the template token embeddings (the shallowest mapping).
|
@@ -545,6 +564,7 @@ class ImgPrompt2TextPrompt(nn.Module):
|
|
545 |
class SubjBasisGenerator(ImgPrompt2TextPrompt):
|
546 |
def __init__(
|
547 |
self,
|
|
|
548 |
# number of cross-attention heads of the bg prompt translator.
|
549 |
# Taken as a half of the number of heads 12 of OpenAI clip-vit-large-patch14:
|
550 |
# https://huggingface.co/openai/clip-vit-large-patch14/blob/main/config.json
|
@@ -553,22 +573,25 @@ class SubjBasisGenerator(ImgPrompt2TextPrompt):
|
|
553 |
# or number of background input identity vectors (no matter the subject is face or not).
|
554 |
# 257: 257 CLIP tokens.
|
555 |
num_nonface_in_id_vecs={ 'subj': 77, 'bg': 257 },
|
|
|
556 |
num_id_vecs=16, # num_id_vecs: subj: 16. bg: 4.
|
557 |
num_static_img_suffix_embs: int = 0, # Number of extra static learnable image embeddings appended to translated ID embeddings.
|
558 |
bg_image_embedding_dim=1024, # CLIP image hidden layer feature dimension, as per config.json above.
|
559 |
obj_embedding_dim=384, # DINO object feature dimension for objects.
|
560 |
output_dim=768, # CLIP text embedding input dimension.
|
|
|
561 |
placeholder_is_bg: bool = False, # Whether the placeholder is for the image background tokens.
|
562 |
-
prompt2token_proj_grad_scale: float = 0.4, # Gradient scale for prompt2token_proj.
|
563 |
learnable_hidden_state_weights_scheme: str = 'per-layer', # none, per-layer.
|
564 |
-
bg_prompt_translator_has_to_out_proj:
|
565 |
):
|
566 |
|
567 |
# If not placeholder_is_bg, then it calls initialize_text_components() in the superclass.
|
568 |
-
super().__init__(placeholder_is_bg=placeholder_is_bg, num_id_vecs=num_id_vecs,
|
569 |
-
num_static_img_suffix_embs=num_static_img_suffix_embs,
|
|
|
570 |
|
571 |
self.placeholder_is_bg = placeholder_is_bg
|
|
|
572 |
self.num_out_embs = self.N_ID + self.N_SFX
|
573 |
self.output_dim = output_dim
|
574 |
# num_nonface_in_id_vecs should be the number of core ID embs, 16.
|
@@ -586,14 +609,18 @@ class SubjBasisGenerator(ImgPrompt2TextPrompt):
|
|
586 |
# self.prompt2token_proj: [1, 16, 768] -> [1, 77, 768] (with paddings) or [1, 16, 768] (without paddings).
|
587 |
# If self.placeholder_is_bg: prompt2token_proj is set to None.
|
588 |
# Use an attention dropout of 0.2 to increase robustness.
|
589 |
-
|
590 |
-
self.prompt2token_proj
|
591 |
-
|
592 |
-
|
593 |
-
|
594 |
-
|
595 |
-
|
596 |
-
|
|
|
|
|
|
|
|
|
597 |
self.freeze_prompt2token_proj()
|
598 |
|
599 |
# These multipliers are relative to the original CLIPTextModel.
|
@@ -631,6 +658,9 @@ class SubjBasisGenerator(ImgPrompt2TextPrompt):
|
|
631 |
identity_to_out=identity_to_out,
|
632 |
out_has_skip=out_has_skip)
|
633 |
|
|
|
|
|
|
|
634 |
self.output_scale = output_dim ** -0.5
|
635 |
|
636 |
'''
|
@@ -686,21 +716,20 @@ class SubjBasisGenerator(ImgPrompt2TextPrompt):
|
|
686 |
hidden_state_layer_weights = self.hidden_state_layer_weights_grad_scaler(self.hidden_state_layer_weights)
|
687 |
|
688 |
# faceid2img_prompt_embs -> ada_id_embs: image prompt space -> text prompt space.
|
689 |
-
|
690 |
-
|
691 |
-
|
692 |
-
|
693 |
-
|
694 |
-
|
695 |
-
|
696 |
-
|
697 |
-
|
698 |
-
|
699 |
-
|
700 |
-
|
701 |
-
|
702 |
-
|
703 |
-
ada_id_embs = self.prompt2token_proj_grad_scaler(ada_id_embs)
|
704 |
elif raw_id_embs is not None:
|
705 |
# id_embs: [BS, 384] -> [BS, 18, 768].
|
706 |
# obj_proj_in is expected to project the DINO object features to
|
@@ -726,14 +755,15 @@ class SubjBasisGenerator(ImgPrompt2TextPrompt):
|
|
726 |
|
727 |
adaface_out_embs = id_embs_out * self.output_scale # * 0.036
|
728 |
else:
|
729 |
-
|
|
|
730 |
# If out_id_embs_cfg_scale < 1, adaface_out_embs is a mix of adaface_out_embs and pad_embeddings.
|
731 |
if out_id_embs_cfg_scale != 1:
|
732 |
-
# pad_embeddings: [77, 768] -> [16, 768] -> [1, 16, 768].
|
733 |
# NOTE: Never do cfg on static image suffix embeddings.
|
734 |
# So we take self.N_ID embeddings, instead of self.N_ID + self.N_SFX,
|
735 |
# even if enable_static_img_suffix_embs=True.
|
736 |
-
pad_embeddings = self.pad_embeddings[4:4+self.N_ID].unsqueeze(0).to(ada_id_embs.device)
|
737 |
adaface_out_embs[:, :self.N_ID] = ada_id_embs[:, :self.N_ID] * out_id_embs_cfg_scale \
|
738 |
+ pad_embeddings * (1 - out_id_embs_cfg_scale)
|
739 |
|
@@ -812,37 +842,37 @@ class SubjBasisGenerator(ImgPrompt2TextPrompt):
|
|
812 |
# Only applicable to fg basis generator.
|
813 |
if self.placeholder_is_bg:
|
814 |
return
|
815 |
-
|
816 |
-
# Then we don't have to check whether it's for subj or bg.
|
817 |
-
if self.prompt2token_proj_grad_scale == 0:
|
818 |
-
frozen_components_name = 'all'
|
819 |
-
frozen_param_set = self.prompt2token_proj.named_parameters()
|
820 |
-
else:
|
821 |
-
frozen_components_name = 'token_pos_embeddings'
|
822 |
-
frozen_param_set = self.prompt2token_proj.text_model.embeddings.named_parameters()
|
823 |
-
|
824 |
if self.prompt2token_proj is not None:
|
825 |
frozen_param_names = []
|
826 |
-
for param_name, param in
|
827 |
if param.requires_grad:
|
828 |
param.requires_grad = False
|
829 |
frozen_param_names.append(param_name)
|
830 |
# If param is already frozen, then no need to freeze it again.
|
831 |
-
print(f"{
|
832 |
#print(f"Frozen parameters:\n{frozen_param_names}")
|
833 |
|
834 |
def patch_old_subj_basis_generator_ckpt(self):
|
835 |
# Fix compatibility with the previous version.
|
836 |
if not hasattr(self, 'bg_prompt_translator_has_to_out_proj'):
|
837 |
self.bg_prompt_translator_has_to_out_proj = False
|
838 |
-
if not hasattr(self, 'num_out_embs'):
|
839 |
-
self.num_out_embs = -1
|
840 |
if hasattr(self, 'num_id_vecs') and not hasattr(self, 'N_ID'):
|
841 |
self.N_ID = self.num_id_vecs
|
|
|
|
|
|
|
842 |
if not hasattr(self, 'num_nonface_in_id_vecs') and hasattr(self, 'N_ID'):
|
843 |
self.num_nonface_in_id_vecs = self.N_ID
|
844 |
if not hasattr(self, 'dtype'):
|
845 |
-
self.dtype = torch.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
846 |
|
847 |
if self.placeholder_is_bg:
|
848 |
if not hasattr(self, 'pos_embs') or self.pos_embs is None:
|
@@ -860,6 +890,14 @@ class SubjBasisGenerator(ImgPrompt2TextPrompt):
|
|
860 |
num_static_img_suffix_embs=self.N_SFX,
|
861 |
img_prompt_dim=self.output_dim)
|
862 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
863 |
def __repr__(self):
|
864 |
type_sig = 'subj' if not self.placeholder_is_bg else 'bg'
|
865 |
|
|
|
9 |
from torch import nn
|
10 |
from einops import rearrange
|
11 |
from einops.layers.torch import Rearrange
|
12 |
+
from transformers import CLIPTokenizer, CLIPTextModel
|
13 |
|
14 |
from torch import einsum
|
15 |
from adaface.util import gen_gradient_scaler
|
|
|
57 |
x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
|
58 |
x = self.norm(x)
|
59 |
return x
|
60 |
+
|
61 |
+
class LayerwiseMLPProjWithSkip(nn.Module):
|
62 |
+
def __init__(self, id_embeddings_dim=768, num_layers=16, dim_mult=2):
|
63 |
+
super().__init__()
|
64 |
+
|
65 |
+
self.proj = nn.Sequential(
|
66 |
+
nn.Linear(id_embeddings_dim, id_embeddings_dim*dim_mult*num_layers),
|
67 |
+
Rearrange('b n (l d) -> b n l d', l=num_layers, d=id_embeddings_dim*dim_mult),
|
68 |
+
nn.GELU(),
|
69 |
+
nn.Linear(id_embeddings_dim*dim_mult, id_embeddings_dim),
|
70 |
+
)
|
71 |
+
self.norm = nn.LayerNorm(id_embeddings_dim)
|
72 |
+
|
73 |
+
def forward(self, id_embeds):
|
74 |
+
# B N D -> B N L D + B N L D -> B N L D
|
75 |
+
x = self.proj(id_embeds) + id_embeds.unsqueeze(1)
|
76 |
+
x = self.norm(x)
|
77 |
+
return x
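LayerwiseMLPProjWithSkip above follows a project-then-add-skip-then-normalize pattern. A simplified sketch of that pattern without the per-layer expansion, so the shapes stay [B, N, D] throughout (this is not the class from the diff, only the underlying idea):

import torch
import torch.nn as nn

class MLPProjWithSkip(nn.Module):
    # Residual MLP + LayerNorm: project, add the input back, then normalize.
    def __init__(self, dim=768, dim_mult=2):
        super().__init__()
        self.proj = nn.Sequential(
            nn.Linear(dim, dim * dim_mult),
            nn.GELU(),
            nn.Linear(dim * dim_mult, dim),
        )
        self.norm = nn.LayerNorm(dim)

    def forward(self, x):
        return self.norm(self.proj(x) + x)

print(MLPProjWithSkip()(torch.randn(1, 4, 768)).shape)  # torch.Size([1, 4, 768])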
|
78 |
+
|
79 |
# group_dim: the tensor dimension that corresponds to the multiple groups.
|
80 |
class LearnedSoftAggregate(nn.Module):
|
81 |
def __init__(self, num_feat, group_dim, keepdim=False):
|
|
|
367 |
else:
|
368 |
return out
|
369 |
|
370 |
+
|
371 |
class ImgPrompt2TextPrompt(nn.Module):
|
372 |
+
def __init__(self, placeholder_is_bg, num_id_vecs, num_static_img_suffix_embs,
|
373 |
+
max_prompt_length=77, img_prompt_dim=768, dtype=torch.float16):
|
374 |
super().__init__()
|
375 |
self.N_ID = num_id_vecs
|
376 |
# If not placeholder_is_bg, then N_SFX will be updated in initialize_text_components().
|
377 |
self.N_SFX = 0
|
378 |
+
self.dtype = dtype
|
379 |
|
380 |
if not placeholder_is_bg:
|
381 |
+
self.initialize_static_img_suffix_embs(num_static_img_suffix_embs, img_prompt_dim)
|
382 |
+
self.initialize_text_components(max_prompt_length)
|
383 |
|
384 |
# prompt2token_proj: arc2face_models.py CLIPTextModelWrapper instance with **custom weights**.
|
385 |
# prompt2token_proj is with the same architecture as the original arc2face text encoder,
|
386 |
# but retrained to do inverse mapping.
|
387 |
# To be initialized in the subclass.
|
388 |
self.prompt2token_proj = None
|
389 |
+
|
|
|
390 |
def initialize_static_img_suffix_embs(self, num_static_img_suffix_embs, img_prompt_dim=768):
|
391 |
self.N_SFX = num_static_img_suffix_embs
|
392 |
# We always take the first num_static_img_suffix_embs embeddings out of static_img_suffix_embs.
|
|
|
397 |
print(f"static_img_suffix_embs had been initialized to be {self.static_img_suffix_embs.shape[1]} vecs ({self.N_SFX} required). Skip initialization.")
|
398 |
elif self.static_img_suffix_embs.shape[1] < self.N_SFX:
|
399 |
print(f"static_img_suffix_embs had been initialized to be {self.static_img_suffix_embs.shape[1]} vecs (< {self.N_SFX} required). Reinitialize.")
|
400 |
+
self.static_img_suffix_embs = nn.Parameter(torch.randn(1, self.N_SFX, img_prompt_dim, dtype=self.dtype))
|
401 |
elif self.N_SFX > 0:
|
402 |
# self.static_img_suffix_embs.shape[1] > self.N_SFX > 0.
|
403 |
print(f"static_img_suffix_embs had been initialized to be {self.static_img_suffix_embs.shape[1]} vecs (> {self.N_SFX} required). Truncate.")
|
404 |
+
self.static_img_suffix_embs = nn.Parameter(self.static_img_suffix_embs[:, :self.N_SFX].to(dtype=self.dtype))
|
405 |
else:
|
406 |
# self.static_img_suffix_embs.shape[1] > self.N_SFX == 0.
|
407 |
print(f"static_img_suffix_embs had been initialized to be {self.static_img_suffix_embs.shape[1]} vecs (0 required). Erase.")
|
|
|
412 |
# or it's initialized but has fewer than num_static_img_suffix_embs embeddings (this situation should be very rare,
|
413 |
# so we don't consider to reuse and extend a shorter static_img_suffix_embs).
|
414 |
# So we reinitialize it.
|
415 |
+
self.static_img_suffix_embs = nn.Parameter(torch.randn(1, self.N_SFX, img_prompt_dim, dtype=self.dtype))
|
416 |
else:
|
417 |
# If static_img_suffix_embs had been initialized, then it will be set to None, i.e., erased from the SubjBasisGenerator instance.
|
418 |
self.static_img_suffix_embs = None
|
|
|
420 |
# Implement a separate initialization function, so that it can be called from SubjBasisGenerator
|
421 |
# after the SubjBasisGenerator is initialized. This can be used to fix old SubjBasisGenerator
|
422 |
# ckpts which were not subclassed from ImgPrompt2TextPrompt.
|
423 |
+
def initialize_text_components(self, max_prompt_length=77):
|
|
|
|
|
424 |
self.max_prompt_length = max_prompt_length
|
425 |
self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
|
426 |
# clip_text_embeddings: CLIPTextEmbeddings instance.
|
|
|
435 |
# pad_embeddings is still on CPU. But should be moved to GPU automatically.
|
436 |
# Note: detach pad_embeddings from the computation graph, otherwise
|
437 |
# deepcopy() in embedding_manager.py:make_frozen_copy_of_subj_basis_generators() will fail.
|
438 |
+
self.pad_embeddings = clip_text_embeddings(pad_tokens)[0].detach().to(self.dtype)
|
439 |
|
440 |
# image prompt space -> text prompt space.
|
441 |
# return_emb_types: a list of strings, each string is among
|
|
|
458 |
else:
|
459 |
breakpoint()
|
460 |
else:
|
461 |
+
# len(face_prompt_embs) == 1, this occurs when same_subject_in_batch == True, e.g. in do_feat_distill_on_comp_prompt.
|
462 |
# But list_extra_words always corresponds to the actual batch size. So we only take the first element.
|
463 |
list_extra_words = list_extra_words[:1]
|
464 |
|
|
|
485 |
face_prompt_embs_orig_dtype = face_prompt_embs.dtype
|
486 |
face_prompt_embs = face_prompt_embs.to(self.dtype)
|
487 |
|
488 |
+
ID_END = 4 + self.N_ID
|
489 |
PAD_BEGIN = ID_END + self.N_SFX + 2
|
490 |
|
491 |
# token_embs: [1, 77, 768]. This call is only to get the template token embeddings (the shallowest mapping).
|
|
|
564 |
class SubjBasisGenerator(ImgPrompt2TextPrompt):
|
565 |
def __init__(
|
566 |
self,
|
567 |
+
dtype=torch.float16,
|
568 |
# number of cross-attention heads of the bg prompt translator.
|
569 |
# Taken as a half of the number of heads 12 of OpenAI clip-vit-large-patch14:
|
570 |
# https://huggingface.co/openai/clip-vit-large-patch14/blob/main/config.json
|
|
|
573 |
# or number of background input identity vectors (no matter the subject is face or not).
|
574 |
# 257: 257 CLIP tokens.
|
575 |
num_nonface_in_id_vecs={ 'subj': 77, 'bg': 257 },
|
576 |
+
num_ca_layers=16,
|
577 |
num_id_vecs=16, # num_id_vecs: subj: 16. bg: 4.
|
578 |
num_static_img_suffix_embs: int = 0, # Number of extra static learnable image embeddings appended to translated ID embeddings.
|
579 |
bg_image_embedding_dim=1024, # CLIP image hidden layer feature dimension, as per config.json above.
|
580 |
obj_embedding_dim=384, # DINO object feature dimension for objects.
|
581 |
output_dim=768, # CLIP text embedding input dimension.
|
582 |
+
use_layerwise_proj: bool = False, # Whether to use layerwise projection.
|
583 |
placeholder_is_bg: bool = False, # Whether the placeholder is for the image background tokens.
|
|
|
584 |
learnable_hidden_state_weights_scheme: str = 'per-layer', # none, per-layer.
|
585 |
+
bg_prompt_translator_has_to_out_proj: bool = False, # Whether the prompt_trans_layers have a to_out projection.
|
586 |
):
|
587 |
|
588 |
# If not placeholder_is_bg, then it calls initialize_text_components() in the superclass.
|
589 |
+
super().__init__(placeholder_is_bg=placeholder_is_bg, num_id_vecs=num_id_vecs,
|
590 |
+
num_static_img_suffix_embs=num_static_img_suffix_embs,
|
591 |
+
max_prompt_length=77, img_prompt_dim=output_dim, dtype=dtype)
|
592 |
|
593 |
self.placeholder_is_bg = placeholder_is_bg
|
594 |
+
self.num_ca_layers = num_ca_layers
|
595 |
self.num_out_embs = self.N_ID + self.N_SFX
|
596 |
self.output_dim = output_dim
|
597 |
# num_nonface_in_id_vecs should be the number of core ID embs, 16.
|
|
|
609 |
# self.prompt2token_proj: [1, 16, 768] -> [1, 77, 768] (with paddings) or [1, 16, 768] (without paddings).
|
610 |
# If self.placeholder_is_bg: prompt2token_proj is set to None.
|
611 |
# Use an attention dropout of 0.2 to increase robustness.
|
612 |
+
self.prompt2token_proj = CLIPTextModelWrapper.from_pretrained('openai/clip-vit-large-patch14')
|
613 |
+
self.prompt2token_proj.to(dtype=self.dtype)
|
614 |
+
|
615 |
+
if use_layerwise_proj:
|
616 |
+
# MLPProjWithSkip: MLP with skip connection.
|
617 |
+
# [BS, 4, 768] -> [BS, 16, 4, 768]. Extra 16: 16 layers.
|
618 |
+
self.layerwise_proj = LayerwiseMLPProjWithSkip(output_dim, dim_mult=2)
|
619 |
+
else:
|
620 |
+
self.layerwise_proj = nn.Identity() #Rearrange('b n d -> b l n d', l=16)
|
621 |
+
|
622 |
+
print(f"Subj prompt2token_proj initialized.")
|
623 |
+
# Only freeze token and positional embeddings of the original CLIPTextModel.
|
624 |
self.freeze_prompt2token_proj()
|
625 |
|
626 |
# These multipliers are relative to the original CLIPTextModel.
|
|
|
658 |
identity_to_out=identity_to_out,
|
659 |
out_has_skip=out_has_skip)
|
660 |
|
661 |
+
if self.dtype == torch.float16:
|
662 |
+
self.prompt_translator = self.prompt_translator.half()
|
663 |
+
|
664 |
self.output_scale = output_dim ** -0.5
|
665 |
|
666 |
'''
|
|
|
716 |
hidden_state_layer_weights = self.hidden_state_layer_weights_grad_scaler(self.hidden_state_layer_weights)
|
717 |
|
718 |
# faceid2img_prompt_embs -> ada_id_embs: image prompt space -> text prompt space.
|
719 |
+
# inverse_img_prompt_embs() applies self.prompt2token_proj to faceid2img_prompt_embs.
|
720 |
+
# If list_extra_words is not None, then ada_id_embs: [BS, 18, 768], three leading words, the 16 identity tokens
|
721 |
+
# and (at most) two extra words in adaface_prompt_embs, without BOS and EOS.
|
722 |
+
# If list_extra_words is None, then ada_id_embs: [BS, 16, 768], the 16 identity tokens in adaface_prompt_embs.
|
723 |
+
# hidden_state_layer_weights: [[0.9163], [0.9483], [2.0762]]
|
724 |
+
# ada_id_embs: [BS, 16, 768].
|
725 |
+
# return_emb_types: a list of strings, each string is among
|
726 |
+
# ['full', 'core', 'full_pad', 'full_half_pad'].
|
727 |
+
ada_id_embs, = \
|
728 |
+
self.inverse_img_prompt_embs(faceid2img_prompt_embs,
|
729 |
+
list_extra_words=None,
|
730 |
+
return_emb_types=['core'],
|
731 |
+
hidden_state_layer_weights=hidden_state_layer_weights,
|
732 |
+
enable_static_img_suffix_embs=enable_static_img_suffix_embs)
|
|
|
733 |
elif raw_id_embs is not None:
|
734 |
# id_embs: [BS, 384] -> [BS, 18, 768].
|
735 |
# obj_proj_in is expected to project the DINO object features to
|
|
|
755 |
|
756 |
adaface_out_embs = id_embs_out * self.output_scale # * 0.036
|
757 |
else:
|
758 |
+
# [BS, 16, 768] -> [BS, layers=16, tokens=16, 768]
|
759 |
+
adaface_out_embs = self.layerwise_proj(ada_id_embs)
|
760 |
# If out_id_embs_cfg_scale < 1, adaface_out_embs is a mix of adaface_out_embs and pad_embeddings.
|
761 |
if out_id_embs_cfg_scale != 1:
|
762 |
+
# pad_embeddings: [77, 768] -> [16, 768] -> [1, 1, 16, 768].
|
763 |
# NOTE: Never do cfg on static image suffix embeddings.
|
764 |
# So we take self.N_ID embeddings, instead of self.N_ID + self.N_SFX,
|
765 |
# even if enable_static_img_suffix_embs=True.
|
766 |
+
pad_embeddings = self.pad_embeddings[4:4+self.N_ID].unsqueeze(0).unsqueeze(1).to(ada_id_embs.device)
|
767 |
adaface_out_embs[:, :self.N_ID] = ada_id_embs[:, :self.N_ID] * out_id_embs_cfg_scale \
|
768 |
+ pad_embeddings * (1 - out_id_embs_cfg_scale)
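The assignment above is a CFG-style interpolation: with out_id_embs_cfg_scale below 1, the ID embeddings are pulled toward the neutral pad embeddings, weakening the identity signal. A minimal sketch (zeros stand in for the real CLIP pad-token embeddings):

import torch

def blend_with_pad(ada_id_embs, pad_embeddings, cfg_scale):
    # Linear interpolation between the ID embeddings and the pad embeddings.
    return ada_id_embs * cfg_scale + pad_embeddings * (1 - cfg_scale)

ada_id_embs = torch.randn(1, 16, 768)
pad_embeddings = torch.zeros(1, 16, 768)  # stand-in for the CLIP pad-token embeddings
print(blend_with_pad(ada_id_embs, pad_embeddings, 0.8).shape)  # torch.Size([1, 16, 768])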
|
769 |
|
|
|
842 |
# Only applicable to fg basis generator.
|
843 |
if self.placeholder_is_bg:
|
844 |
return
|
845 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
846 |
if self.prompt2token_proj is not None:
|
847 |
frozen_param_names = []
|
848 |
+
for param_name, param in self.prompt2token_proj.text_model.embeddings.named_parameters():
|
849 |
if param.requires_grad:
|
850 |
param.requires_grad = False
|
851 |
frozen_param_names.append(param_name)
|
852 |
# If param is already frozen, then no need to freeze it again.
|
853 |
+
print(f"{len(frozen_param_names)} params of token_pos_embeddings in Subj prompt2token_proj is frozen.")
|
854 |
#print(f"Frozen parameters:\n{frozen_param_names}")
|
855 |
|
856 |
def patch_old_subj_basis_generator_ckpt(self):
|
857 |
# Fix compatability with the previous version.
|
858 |
if not hasattr(self, 'bg_prompt_translator_has_to_out_proj'):
|
859 |
self.bg_prompt_translator_has_to_out_proj = False
|
|
|
|
|
860 |
if hasattr(self, 'num_id_vecs') and not hasattr(self, 'N_ID'):
|
861 |
self.N_ID = self.num_id_vecs
|
862 |
+
# Update the number of output embeddings.
|
863 |
+
self.num_out_embs = self.N_ID + self.N_SFX
|
864 |
+
|
865 |
if not hasattr(self, 'num_nonface_in_id_vecs') and hasattr(self, 'N_ID'):
|
866 |
self.num_nonface_in_id_vecs = self.N_ID
|
867 |
if not hasattr(self, 'dtype'):
|
868 |
+
self.dtype = torch.float16
|
869 |
+
if not self.placeholder_is_bg:
|
870 |
+
self.prompt2token_proj.to(dtype=self.dtype)
|
871 |
+
else:
|
872 |
+
self.prompt_translator.half()
|
873 |
+
|
874 |
+
if not hasattr(self, 'num_ca_layers'):
|
875 |
+
self.num_ca_layers = 16
|
876 |
|
877 |
if self.placeholder_is_bg:
|
878 |
if not hasattr(self, 'pos_embs') or self.pos_embs is None:
|
|
|
890 |
num_static_img_suffix_embs=self.N_SFX,
|
891 |
img_prompt_dim=self.output_dim)
|
892 |
|
893 |
+
if not hasattr(self, 'use_layerwise_proj'):
|
894 |
+
self.use_layerwise_proj = False
|
895 |
+
if not hasattr(self, 'layerwise_proj'):
|
896 |
+
if self.use_layerwise_proj:
|
897 |
+
self.layerwise_proj = LayerwiseMLPProjWithSkip(self.output_dim, dim_mult=2)
|
898 |
+
else:
|
899 |
+
self.layerwise_proj = nn.Identity()
|
900 |
+
|
901 |
def __repr__(self):
|
902 |
type_sig = 'subj' if not self.placeholder_is_bg else 'bg'
|
903 |
|
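The subj_basis_generator.py hunks above blend the ada ID embeddings with CLIP pad embeddings whenever out_id_embs_cfg_scale != 1. Below is a minimal standalone sketch of that interpolation with dummy tensors, using the shapes mentioned in the diff comments ([BS, 16, 768]); the variable names are illustrative only, not the module's API.

import torch

BS, N_ID, D = 2, 16, 768
ada_id_embs    = torch.randn(BS, N_ID, D)   # stand-in for the generator's ID embeddings
pad_embeddings = torch.randn(N_ID, D)       # stand-in for the CLIP pad-token embeddings

out_id_embs_cfg_scale = 0.8
# Linear interpolation toward the pad embeddings, as in the hunk above:
# the ID embeddings keep weight `scale`, the pad embeddings fill in (1 - scale).
mixed = ada_id_embs * out_id_embs_cfg_scale + pad_embeddings.unsqueeze(0) * (1 - out_id_embs_cfg_scale)
print(mixed.shape)   # torch.Size([2, 16, 768])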
adaface/unet_teachers.py
CHANGED
@@ -1,6 +1,6 @@
 import torch
+from torch import nn
 import numpy as np
-import pytorch_lightning as pl
 from diffusers import UNet2DConditionModel
 from adaface.util import UNetEnsemble, create_consistentid_pipeline
 from diffusers import UNet2DConditionModel

@@ -12,9 +12,9 @@ def create_unet_teacher(teacher_type, device='cpu', **kwargs):
         teacher_type = teacher_type[0]
 
     if teacher_type == "arc2face":
+        teacher = Arc2FaceTeacher(**kwargs)
     elif teacher_type == "unet_ensemble":
+        # unet, extra_unet_dirpaths and unet_weights_in_ensemble are passed in kwargs.
         # Even if we distill from unet_ensemble, we still need to load arc2face for generating
         # arc2face embeddings.
         # The first (optional) ctor param of UNetEnsembleTeacher is an instantiated unet,

@@ -22,20 +22,24 @@ def create_unet_teacher(teacher_type, device='cpu', **kwargs):
         # However, since the __call__ method of the ddpm unet takes different formats of params,
         # for simplicity, we still use the diffusers unet.
         # unet_teacher is put on CPU first, then moved to GPU when DDPM is moved to GPU.
+        teacher = UNetEnsembleTeacher(device=device, **kwargs)
     elif teacher_type == "consistentID":
+        teacher = ConsistentIDTeacher(**kwargs)
     elif teacher_type == "simple_unet":
+        teacher = SimpleUNetTeacher(**kwargs)
     # Since we've dereferenced the list if it has only one element,
     # this holding implies the list has more than one element. Therefore it's UNetEnsembleTeacher.
     elif isinstance(teacher_type, (tuple, list, ListConfig)):
         # teacher_type is a list of teacher types. So it's UNetEnsembleTeacher.
+        teacher = UNetEnsembleTeacher(unet_types=teacher_type, device=device, **kwargs)
     else:
         raise NotImplementedError(f"Teacher type {teacher_type} not implemented.")
 
+    for param in teacher.parameters():
+        param.requires_grad = False
+    return teacher
+
+class UNetTeacher(nn.Module):
     def __init__(self, **kwargs):
         super().__init__()
         self.name = None

@@ -56,9 +60,10 @@ class UNetTeacher(pl.LightningModule):
     # to be initialized, which will unnecessarily complicate the code.
     # noise: the initial noise for the first iteration.
     # t: the initial t. We will sample additional (num_denoising_steps - 1) smaller t.
-    def forward(self, ddpm_model, x_start, noise, t, teacher_context,
-                num_denoising_steps=1,
+    # same_t_noise_across_instances: when sampling t and noise, use the same t and noise for all instances.
+    def forward(self, ddpm_model, x_start, noise, t, teacher_context, negative_context=None,
+                num_denoising_steps=1, same_t_noise_across_instances=False,
+                global_t_lb=0, global_t_ub=1000):
         assert num_denoising_steps <= 10
 
         if self.p_uses_cfg > 0:

@@ -71,27 +76,22 @@ class UNetTeacher(pl.LightningModule):
 
             if self.uses_cfg:
                 print(f"Teacher samples CFG scale {self.cfg_scale:.1f}.")
+                if negative_context is not None:
+                    negative_context = negative_context[:1].repeat(x_start.shape[0], 1, 1)
+
+                # if negative_context is None, then teacher_context is a combination of
+                # (one or multiple if unet_ensemble) pos_context and neg_context.
+                # If negative_context is not None, then teacher_context is only pos_context.
             else:
                 self.cfg_scale = 1
                 print("Teacher does not use CFG.")
 
-                for teacher_context_i in teacher_context:
-                    pos_context, neg_context = torch.chunk(teacher_context_i, 2, dim=0)
-                    if pos_context.shape[0] != x_start.shape[0]:
-                        breakpoint()
-                    teacher_pos_contexts.append(pos_context)
-                teacher_context = teacher_pos_contexts
-            else:
-                pos_context, neg_context = torch.chunk(teacher_context, 2, dim=0)
-                if pos_context.shape[0] != x_start.shape[0]:
-                    breakpoint()
-                teacher_context = pos_context
+            # If negative_context is None, then teacher_context is a combination of
+            # (one or multiple if unet_ensemble) pos_context and neg_context.
+            # Since not uses_cfg, we only need pos_context.
+            # If negative_context is not None, then teacher_context is only pos_context.
+            if negative_context is None:
+                teacher_context = self.extract_pos_context(teacher_context, x_start.shape[0])
         else:
             # p_uses_cfg = 0. Never use CFG.
             self.uses_cfg = False

@@ -102,15 +102,21 @@ class UNetTeacher(pl.LightningModule):
             # in case someday we want to switch from CFG to non-CFG during runtime.
             self.cfg_scale = 1
 
+        is_context_doubled = 2 if (self.uses_cfg and negative_context is None) else 1
         if self.name == 'unet_ensemble':
             # teacher_context is a list of teacher contexts.
             for teacher_context_i in teacher_context:
+                if teacher_context_i.shape[0] != x_start.shape[0] * is_context_doubled:
                     breakpoint()
         else:
+            if teacher_context.shape[0] != x_start.shape[0] * is_context_doubled:
                 breakpoint()
+
+        if same_t_noise_across_instances:
+            # If same_t_noise_across_instances, we use the same t and noise for all instances.
+            t = t[0].repeat(x_start.shape[0])
+            noise = noise[:1].repeat(x_start.shape[0], 1, 1, 1)
+
         # Initially, x_starts only contains the original x_start.
         x_starts = [ x_start ]
         noises = [ noise ]

@@ -125,24 +131,35 @@ class UNetTeacher(pl.LightningModule):
                 # sqrt_alphas_cumprod[t] * x_start + sqrt_one_minus_alphas_cumprod[t] * noise
                 x_noisy = ddpm_model.q_sample(x_start, t, noise)
 
-                if self.uses_cfg:
+                if self.uses_cfg and self.cfg_scale > 1 and negative_context is None:
                     x_noisy2 = x_noisy.repeat(2, 1, 1, 1)
                     t2 = t.repeat(2)
                 else:
                     x_noisy2 = x_noisy
+                    t2 = t
 
                 # If do_arc2face_distill, then pos_context is [BS=6, 21, 768].
                 noise_pred = self.unet(sample=x_noisy2, timestep=t2, encoder_hidden_states=teacher_context,
                                        return_dict=False)[0]
                 if self.uses_cfg and self.cfg_scale > 1:
+                    if negative_context is None:
+                        pos_noise_pred, neg_noise_pred = torch.chunk(noise_pred, 2, dim=0)
+                    else:
+                        # If negative_context is not None, then teacher_context is only pos_context.
+                        pos_noise_pred = noise_pred
+                        with torch.no_grad():
+                            if self.name == 'unet_ensemble':
+                                neg_noise_pred = self.unet.unets[0](sample=x_noisy, timestep=t,
+                                                                    encoder_hidden_states=negative_context, return_dict=False)[0]
+                            else:
+                                neg_noise_pred = self.unet(sample=x_noisy, timestep=t,
+                                                           encoder_hidden_states=negative_context, return_dict=False)[0]
+
                     noise_pred = pos_noise_pred * self.cfg_scale - neg_noise_pred * (self.cfg_scale - 1)
 
-                # sqrt_recip_alphas_cumprod[t] * x_t - sqrt_recipm1_alphas_cumprod[t] * noise
-                pred_x0 = ddpm_model.predict_start_from_noise(x_noisy, t, noise_pred)
                 noise_preds.append(noise_pred)
+                # sqrt_recip_alphas_cumprod[t] * x_t - sqrt_recipm1_alphas_cumprod[t] * noise
+                pred_x0 = ddpm_model.predict_start_from_noise(x_noisy, t, noise_pred)
                 # The predicted x0 is used as the x_start for the next denoising step.
                 x_starts.append(pred_x0)
 

@@ -157,20 +174,43 @@ class UNetTeacher(pl.LightningModule):
                 # of the current timestep.
                 t_lb = t * np.power(0.5, np.power(num_denoising_steps - 1, -0.3))
                 t_ub = t * np.power(0.7, np.power(num_denoising_steps - 1, -0.3))
+                t_lb = torch.clamp(t_lb, min=global_t_lb)
+                t_ub = torch.clamp(t_ub, max=global_t_ub)
                 earlier_timesteps = (t_ub - t_lb) * relative_ts + t_lb
                 earlier_timesteps = earlier_timesteps.long()
+                noise = torch.randn_like(pred_x0)
 
+                if same_t_noise_across_instances:
+                    # If same_t_noise_across_instances, we use the same earlier_timesteps and noise for all instances.
                     earlier_timesteps = earlier_timesteps[0].repeat(x_start.shape[0])
+                    noise = noise[:1].repeat(x_start.shape[0], 1, 1, 1)
 
                 # earlier_timesteps = ts[i+1] < ts[i].
                 ts.append(earlier_timesteps)
-                noise = torch.randn_like(pred_x0)
                 noises.append(noise)
 
         return noise_preds, x_starts, noises, ts
+
+    def extract_pos_context(self, teacher_context, BS):
+        # If p_uses_cfg > 0, we always pass both pos_context and neg_context to the teacher.
+        # But the neg_context is only used when self.uses_cfg is True and cfg_scale > 1.
+        # So we manually split the teacher_context into pos_context and neg_context, and only keep pos_context.
+        if self.name == 'unet_ensemble':
+            teacher_pos_contexts = []
+            # teacher_context is a list of teacher contexts.
+            for teacher_context_i in teacher_context:
+                pos_context, neg_context = torch.chunk(teacher_context_i, 2, dim=0)
+                if pos_context.shape[0] != BS:
+                    breakpoint()
+                teacher_pos_contexts.append(pos_context)
+            teacher_context = teacher_pos_contexts
+        else:
+            pos_context, neg_context = torch.chunk(teacher_context, 2, dim=0)
+            if pos_context.shape[0] != BS:
+                breakpoint()
+            teacher_context = pos_context
+
+        return teacher_context
 
 class Arc2FaceTeacher(UNetTeacher):
     def __init__(self, **kwargs):

@@ -185,11 +225,11 @@ class Arc2FaceTeacher(UNetTeacher):
         self.cfg_scale_range = [1, 1]
 
 class UNetEnsembleTeacher(UNetTeacher):
-    def __init__(self, unets, unet_types, extra_unet_dirpaths,
+    # unet_weights_in_ensemble are not model weights, but scalar weights for individual unets.
+    def __init__(self, unets, unet_types, extra_unet_dirpaths, unet_weights_in_ensemble=None, device='cuda', **kwargs):
         super().__init__(**kwargs)
         self.name = "unet_ensemble"
-        self.unet = UNetEnsemble(unets, unet_types, extra_unet_dirpaths,
+        self.unet = UNetEnsemble(unets, unet_types, extra_unet_dirpaths, unet_weights_in_ensemble, device)
 
 class ConsistentIDTeacher(UNetTeacher):
     def __init__(self, base_model_path="models/sd15-dste8-vae.safetensors", **kwargs):

@@ -199,12 +239,9 @@ class ConsistentIDTeacher(UNetTeacher):
         # In contrast to Arc2FaceTeacher or UNetEnsembleTeacher, ConsistentIDPipeline is not a torch.nn.Module.
         # We couldn't initialize the ConsistentIDPipeline to CPU first and wait it to be automatically moved to GPU.
         # Instead, we have to initialize it to GPU directly.
-        # Compatible with the UNetTeacher interface.
-        self.unet = pipe.unet
-        # Release VAE and text_encoder to save memory. UNet is still needed for denoising
+        # Release VAE and text_encoder to save memory. UNet is needed for denoising
         # (the unet is implemented in diffusers in fp16, so probably faster than the LDM unet).
+        self.unet = create_consistentid_pipeline(base_model_path, unet_only=True)
 
         # We use the default cfg_scale_range=[1.3, 2] for SimpleUNetTeacher.
         # Note p_uses_cfg=0.5 will also be passed in in kwargs.
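For reference, the classifier-free-guidance combination kept in UNetTeacher.forward() above reduces to a single line of arithmetic. The sketch below only illustrates that formula with dummy tensors; the names are placeholders, not the project's API.

import torch

cfg_scale = 2.0
pos_noise_pred = torch.randn(4, 4, 64, 64)   # dummy prediction under the positive (ID) context
neg_noise_pred = torch.randn(4, 4, 64, 64)   # dummy prediction under the negative context

# Same combination as in the diff: extrapolate away from the negative prediction.
# Equivalent to neg + cfg_scale * (pos - neg).
noise_pred = pos_noise_pred * cfg_scale - neg_noise_pred * (cfg_scale - 1)
print(noise_pred.shape)   # torch.Size([4, 4, 64, 64])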
adaface/util.py
CHANGED
@@ -57,7 +57,7 @@ def perturb_np_array(np_array, perturb_std, perturb_std_is_relative=True, std_di
     ts = perturb_tensor(ts, perturb_std, perturb_std_is_relative, std_dim=std_dim)
     return ts.numpy().astype(np_array.dtype)
 
+def calc_stats(emb_name, embeddings, mean_dim=-1):
     print("%s:" %emb_name)
     repeat_count = [1] * embeddings.ndim
     repeat_count[mean_dim] = embeddings.shape[mean_dim]

@@ -153,13 +153,14 @@ def pad_image_obj_to_square(image_obj, new_size=-1):
 
 class UNetEnsemble(nn.Module):
     # The first unet is the unet already loaded in a pipeline.
-    def __init__(self, unets, unet_types, extra_unet_dirpaths,
+    def __init__(self, unets, unet_types, extra_unet_dirpaths, unet_weights_in_ensemble=None, device='cuda', torch_dtype=torch.float16):
         super().__init__()
 
-        self.unets = nn.ModuleList()
         if unets is not None:
+            unets = [ unet.to(device) for unet in unets ]
+        else:
+            unets = []
+
         if unet_types is not None:
             for unet_type in unet_types:
                 if unet_type == "arc2face":

@@ -169,25 +170,27 @@
                     unet = create_consistentid_pipeline(unet_only=True)
                 else:
                     breakpoint()
+                unets.append(unet.to(device=device))
 
         if extra_unet_dirpaths is not None:
             for unet_path in extra_unet_dirpaths:
                 unet = UNet2DConditionModel.from_pretrained(unet_path, torch_dtype=torch_dtype)
+                unets.append(unet.to(device=device))
 
+        if unet_weights_in_ensemble is None:
+            unet_weights_in_ensemble = [1.] * len(unets)
+        elif len(unets) < len(unet_weights_in_ensemble):
+            unet_weights_in_ensemble = unet_weights_in_ensemble[:len(unets)]
+        elif len(unets) > len(unet_weights_in_ensemble):
             breakpoint()
 
-        self.unet_weights = nn.Parameter(unet_weights, requires_grad=False)
+        unet_weights_in_ensemble = torch.tensor(unet_weights_in_ensemble, dtype=torch_dtype)
+        unet_weights_in_ensemble = unet_weights_in_ensemble / unet_weights_in_ensemble.sum()
 
+        self.unets = nn.ModuleList(unets)
+        # Put the weights in a Parameter so that they will be moved to the same device as the model.
+        self.unet_weights_in_ensemble = nn.Parameter(unet_weights_in_ensemble, requires_grad=False)
+        print(f"UNetEnsemble: {len(self.unets)} UNets loaded with weights: {self.unet_weights_in_ensemble.data.cpu().numpy()}")
         # Set these fields to be compatible with diffusers.
         self.dtype = self.unets[0].dtype
         self.device = self.unets[0].device

@@ -215,8 +218,8 @@
             samples.append(sample)
 
         samples = torch.stack(samples, dim=0)
+        unet_weights_in_ensemble = self.unet_weights_in_ensemble.reshape(-1, *([1] * (samples.ndim - 1)))
+        sample = (samples * unet_weights_in_ensemble).sum(dim=0)
 
         if not return_dict:
             return (sample,)
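The UNetEnsemble change above renames unet_weights to unet_weights_in_ensemble, normalizes the scalar weights, and collapses the stacked UNet outputs with a weighted sum. A small self-contained sketch of that reduction with dummy tensors (the shapes and names here are assumptions for illustration):

import torch

# Three dummy UNet outputs, each of shape [BS, C, H, W].
samples = torch.stack([torch.randn(2, 4, 64, 64) for _ in range(3)], dim=0)

unet_weights_in_ensemble = torch.tensor([1.0, 1.0, 2.0])
unet_weights_in_ensemble = unet_weights_in_ensemble / unet_weights_in_ensemble.sum()

# Reshape to [num_unets, 1, 1, 1, 1] so the weights broadcast over the per-sample dims,
# then collapse the ensemble dimension with a weighted sum.
w = unet_weights_in_ensemble.reshape(-1, *([1] * (samples.ndim - 1)))
sample = (samples * w).sum(dim=0)
print(sample.shape)   # torch.Size([2, 4, 64, 64])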
app.py
CHANGED
@@ -5,40 +5,63 @@ from adaface.adaface_wrapper import AdaFaceWrapper
 import torch
 import numpy as np
 import random
+import os, re
+import time
 import gradio as gr
 import spaces
+
+def str2bool(v):
+    if isinstance(v, bool):
+        return v
+    if v.lower() in ("yes", "true", "t", "y", "1"):
+        return True
+    elif v.lower() in ("no", "false", "f", "n", "0"):
+        return False
+    else:
+        raise argparse.ArgumentTypeError("Boolean value expected.")
+
 import argparse
 parser = argparse.ArgumentParser()
 parser.add_argument("--adaface_encoder_types", type=str, nargs="+", default=["consistentID", "arc2face"],
                     choices=["arc2face", "consistentID"], help="Type(s) of the ID2Ada prompt encoders")
+parser.add_argument('--adaface_ckpt_path', type=str, default='models/adaface/VGGface2_HQ_masks2025-03-06T03-31-21_zero3-ada-1000.pt',
+                    help="Path to the checkpoint of the ID2Ada prompt encoders")
 # If adaface_encoder_cfg_scales is not specified, the weights will be set to 6.0 (consistentID) and 1.0 (arc2face).
+parser.add_argument('--adaface_encoder_cfg_scales', type=float, nargs="+", default=[6.0, 1.0],
                     help="Scales for the ID2Ada prompt encoders")
 parser.add_argument("--enabled_encoders", type=str, nargs="+", default=None,
                     choices=["arc2face", "consistentID"],
                     help="List of enabled encoders (among the list of adaface_encoder_types). Default: None (all enabled)")
+parser.add_argument('--model_style_type', type=str, default='photorealistic',
                     choices=["realistic", "anime", "photorealistic"], help="Type of the base model")
+parser.add_argument("--guidance_scale", type=float, default=5.0,
+                    help="The guidance scale for the diffusion model. Default: 5.0")
+parser.add_argument("--unet_uses_attn_lora", type=str2bool, nargs="?", const=True, default=False,
+                    help="Whether to use LoRA in the Diffusers UNet model")
+# --attn_lora_layer_names and --q_lora_updates_query are only effective
+# when --unet_uses_attn_lora is set to True.
+parser.add_argument("--attn_lora_layer_names", type=str, nargs="*", default=['q', 'k', 'v', 'out'],
+                    choices=['q', 'k', 'v', 'out'], help="Names of the cross-attn components to apply LoRA on")
+parser.add_argument("--q_lora_updates_query", type=str2bool, nargs="?", const=True, default=False,
+                    help="Whether the q LoRA updates the query in the Diffusers UNet model. "
+                         "If False, the q lora only updates query2.")
+parser.add_argument("--show_disable_adaface_checkbox", type=str2bool, nargs="?", const=True, default=False,
+                    help="Whether to show the checkbox for disabling AdaFace")
+parser.add_argument('--extra_save_dir', type=str, default=None, help="Directory to save the generated images")
+parser.add_argument('--test_ui_only', type=str2bool, nargs="?", const=True, default=False,
+                    help="Only test the UI layout, and skip loadding the adaface model")
 parser.add_argument('--gpu', type=int, default=None)
 parser.add_argument('--ip', type=str, default="0.0.0.0")
 args = parser.parse_args()
 
+from huggingface_hub import snapshot_download
+large_files = ["models/*", "models/**/*"]
+snapshot_download(repo_id="adaface-neurips/adaface-models", repo_type="model", allow_patterns=large_files, local_dir=".")
+
 model_style_type2base_model_path = {
     "realistic": "models/rv51/realisticVisionV51_v51VAE_dste8.safetensors",
     "anime": "models/aingdiffusion/aingdiffusion_v170_ar.safetensors",
-    "photorealistic": "models/sar/sar.safetensors" # LDM format. Needs to be converted.
+    "photorealistic": "models/sar/sar.safetensors", # LDM format. Needs to be converted.
 }
 base_model_path = model_style_type2base_model_path[args.model_style_type]

@@ -48,13 +71,20 @@ device = "cuda" if args.gpu is None else f"cuda:{args.gpu}"
 print(f"Device: {device}")
 
 global adaface
+adaface = None
+
+if not args.test_ui_only:
+    adaface = AdaFaceWrapper(pipeline_name="text2img", base_model_path=base_model_path,
+                             adaface_encoder_types=args.adaface_encoder_types,
+                             adaface_ckpt_paths=args.adaface_ckpt_path,
+                             adaface_encoder_cfg_scales=args.adaface_encoder_cfg_scales,
+                             enabled_encoders=args.enabled_encoders,
+                             unet_types=None, extra_unet_dirpaths=None, unet_weights_in_ensemble=None,
+                             unet_uses_attn_lora=args.unet_uses_attn_lora,
+                             attn_lora_layer_names=args.attn_lora_layer_names,
+                             shrink_cross_attn=False,
+                             q_lora_updates_query=args.q_lora_updates_query,
+                             device='cpu')
 
 def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
     if randomize_seed:

@@ -71,12 +101,14 @@ def remove_back_to_files():
     # Hide uploaded_files_gallery, show clear_button_column, hide files, reset init_img_selected_idx
     # Or:
     # Hide uploaded_init_img_gallery, hide init_clear_button_column, show init_img_files, reset init_img_selected_idx
-    return gr.update(visible=False), gr.update(visible=False), gr.update(value=None, visible=True)
+    return gr.update(visible=False), gr.update(visible=False), gr.update(value=None, visible=True), \
+           gr.update(value=""), gr.update(value="(none)")
 
 @spaces.GPU
-def generate_image(image_paths,
-                   num_images, prompt, negative_prompt,
+def generate_image(image_paths, image_paths2, guidance_scale, perturb_std,
+                   num_images, prompt, negative_prompt, gender, highlight_face,
+                   ablate_prompt_embed_type, nonmix_prompt_emb_weight,
+                   composition_level, seed, disable_adaface, subj_name_sig, progress=gr.Progress(track_tqdm=True)):
 
     global adaface
 

@@ -85,6 +117,9 @@ def generate_image(image_paths, guidance_scale, do_neg_id_prompt_weight, perturb
     if image_paths is None or len(image_paths) == 0:
         raise gr.Error(f"Cannot find any input face image! Please upload a face image.")
 
+    if image_paths2 is not None and len(image_paths2) > 0:
+        image_paths = image_paths + image_paths2
+
     if prompt is None:
         prompt = ""
 

@@ -100,38 +135,128 @@ def generate_image(image_paths, guidance_scale, do_neg_id_prompt_weight, perturb
     # Sometimes the pipeline is on CPU, although we've put it on CUDA (due to some offloading mechanism).
     # Therefore we set the generator to the correct device.
     generator = torch.Generator(device=device).manual_seed(seed)
+    print(f"Manual seed: {seed}.")
     # Generate two images each time for the user to select from.
     noise = torch.randn(num_images, 3, 512, 512, device=device, generator=generator)
    #print(noise.abs().sum())
     # samples: A list of PIL Image instances.
-    if "portrait" in prompt:
+    if highlight_face:
+        if "portrait" not in prompt:
+            prompt = "face portrait, " + prompt
+        else:
             prompt = prompt.replace("portrait", "face portrait")
+    if composition_level >= 2:
+        if "full body" not in prompt:
+            prompt = prompt + ", full body view"
+
+    if gender != "(none)":
+        if "portrait" in prompt:
+            prompt = prompt.replace("portrait, ", f"portrait, {gender} ")
         else:
+            prompt = gender + ", " + prompt
 
     generator = torch.Generator(device=adaface.pipeline._execution_device).manual_seed(seed)
-    samples = adaface(noise, prompt, negative_prompt,
-                      do_neg_id_prompt_weight=do_neg_id_prompt_weight,
+    samples = adaface(noise, prompt, negative_prompt=negative_prompt,
                       guidance_scale=guidance_scale,
+                      out_image_count=num_images, generator=generator,
+                      repeat_prompt_for_each_encoder=(composition_level >= 1),
+                      ablate_prompt_no_placeholders=disable_adaface,
+                      ablate_prompt_embed_type=ablate_prompt_embed_type,
+                      nonmix_prompt_emb_weight=nonmix_prompt_emb_weight,
+                      verbose=True)
+
+    session_signature = ",".join(image_paths + [prompt, str(seed)])
+    temp_folder = os.path.join("/tmp/gradio", f"{hash(session_signature)}")
+    os.makedirs(temp_folder, exist_ok=True)
+
+    saved_image_paths = []
+    if "models/adaface/" in args.adaface_ckpt_path:
+        # The model is loaded from within the project.
+        # models/adaface/VGGface2_HQ_masks2024-10-14T16-09-24_zero3-ada-3500.pt
+        matches = re.search(r"models/adaface/\w+\d{4}-(\d{2})-(\d{2})T(\d{2})-\d{2}-\d{2}_zero3-ada-(\d+).pt", args.adaface_ckpt_path)
+    else:
+        # The model is loaded from the adaprompt folder.
+        # adaface_ckpt_path = "VGGface2_HQ_masks2024-11-28T13-13-20_zero3-ada/checkpoints/embeddings_gs-2000.pt"
+        matches = re.search(r"\d{4}-(\d{2})-(\d{2})T(\d{2})-\d{2}-\d{2}_zero3-ada/checkpoints/embeddings_gs-(\d+).pt", args.adaface_ckpt_path)
+
+    # Extract the checkpoint signature as 112813-2000
+    ckpt_sig = f"{matches.group(1)}{matches.group(2)}{matches.group(3)}-{matches.group(4)}"
+
+    prompt_keywords = ['armor', 'beach', 'chef', 'dancing', 'iron man', 'jedi',
+                       'street', 'guitar', 'reading', 'running', 'superman', 'new year', 'mars']
+    keywords_reduction = { 'iron man': 'ironman', 'dancing': 'dance',
+                           'running': 'run', 'reading': 'read', 'new year': 'newyear' }
+
+    prompt_sig = None
+    for keyword in prompt_keywords:
+        if keyword in prompt.lower():
+            prompt_sig = keywords_reduction.get(keyword, keyword)
+            break
+
+    if prompt_sig is None:
+        prompt_parts = prompt.lower().split(",")
+        # Remove the view/shot parts (full body view, long shot, etc.) from the prompt.
+        prompt_parts = [ part for part in prompt_parts if not re.search(r"\W(view|shot)(\W|$)", part) ]
+        if len(prompt_parts) > 0:
+            # Use the last word of the prompt as the signature.
+            prompt_sig = prompt_parts[-1].split()[-1]
+        else:
+            prompt_sig = "person"
+
+    if len(prompt_sig) > 0:
+        prompt_sig = "-" + prompt_sig
+
+    extra_save_dir = args.extra_save_dir
+    if extra_save_dir is not None:
+        os.makedirs(extra_save_dir, exist_ok=True)
+
+    for i, sample in enumerate(samples):
+        filename = f"adaface{ckpt_sig}{prompt_sig}-{i+1}.png"
+        if len(subj_name_sig) > 0:
+            filename = f"{subj_name_sig.lower()}-{filename}"
+        filepath = os.path.join(temp_folder, filename)
+        # Save the image
+        sample.save(filepath) # Adjust to your image saving method
+        saved_image_paths.append(filepath)
+
+        if extra_save_dir is not None:
+            extra_filepath = os.path.join(extra_save_dir, filename)
+            sample.save(extra_filepath)
+            print(extra_filepath)
+
+    # Solution suggested by o1 to force the client browser to reload images
+    # when we change guidance scales only.
+    saved_image_paths = [f"{url}?t={int(time.time())}" for url in saved_image_paths]
 
+    return saved_image_paths
 
-def check_prompt_and_model_type(prompt, model_style_type):
+def check_prompt_and_model_type(prompt, model_style_type, adaface_encoder_cfg_scale1):
     global adaface
 
     model_style_type = model_style_type.lower()
-    base_model_path = model_style_type2base_model_path[model_style_type]
     # If the base model type is changed, reload the model.
-    if model_style_type != args.model_style_type:
+    if model_style_type != args.model_style_type or adaface_encoder_cfg_scale1 != args.adaface_encoder_cfg_scales[0]:
+        if model_style_type != args.model_style_type:
+            # Update base model type.
+            args.model_style_type = model_style_type
+            print(f"Switching to the base model type: {model_style_type}.")
+
+            adaface = AdaFaceWrapper(pipeline_name="text2img", base_model_path=model_style_type2base_model_path[model_style_type],
+                                     adaface_encoder_types=args.adaface_encoder_types,
+                                     adaface_ckpt_paths=args.adaface_ckpt_path,
+                                     adaface_encoder_cfg_scales=args.adaface_encoder_cfg_scales,
+                                     enabled_encoders=args.enabled_encoders,
+                                     unet_types=None, extra_unet_dirpaths=None, unet_weights_in_ensemble=None,
+                                     unet_uses_attn_lora=args.unet_uses_attn_lora,
+                                     attn_lora_layer_names=args.attn_lora_layer_names,
+                                     shrink_cross_attn=False,
+                                     q_lora_updates_query=args.q_lora_updates_query,
+                                     device='cpu')
+
+        if adaface_encoder_cfg_scale1 != args.adaface_encoder_cfg_scales[0]:
+            args.adaface_encoder_cfg_scales[0] = adaface_encoder_cfg_scale1
+            adaface.set_adaface_encoder_cfg_scales(args.adaface_encoder_cfg_scales)
+            print(f"Updating the scale for consistentID encoder to {adaface_encoder_cfg_scale1}.")
 
     if not prompt:
         raise gr.Error("Prompt cannot be blank")

@@ -145,13 +270,12 @@ description = r"""
 <b>Official demo</b> for our working paper <b>AdaFace: A Versatile Face Encoder for Zero-Shot Diffusion Model Personalization</b>.<br>
 
 ❗️**What's New**❗️
+- Support switching between three model styles: **Photorealistic**, **Realistic** and **Anime**.
 - If you just changed the model style, the first image/video generation will take extra 20~30 seconds for loading new model weight.
 
 ❗️**Tips**❗️
 1. Upload one or more images of a person. If multiple faces are detected, we use the largest one.
-3. If the face dominates the image, try increasing 'Weight of ID prompt in the negative prompt'.
+2. Check "Highlight face" to highlight fine facial features.
 4. AdaFace Text-to-Video: <a href="https://huggingface.co/spaces/adaface-neurips/adaface-animate" style="display: inline-flex; align-items: center;">
 AdaFace-Animate
 <img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-yellow" alt="Hugging Face Spaces" style="margin-left: 5px;">

@@ -162,13 +286,18 @@ description = r"""
 """
 
 css = '''
 .gradio-container {width: 95% !important}
 .custom-gallery {
-    height: 800px;
+    height: 800px !important;
     width: 100%;
     margin: 10px auto;
-    overflow-y: auto;
+    padding: 0px;
+    overflow-y: auto !important;
+}
+.tight-row {
+    gap: 0 !important; /* removes the horizontal gap between columns */
+    margin: 0 !important; /* remove any extra margin if needed */
+    padding: 0 !important; /* remove any extra padding if needed */
 }
 '''
 with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:

@@ -187,53 +316,108 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
                 file_types=["image"],
                 file_count="multiple"
             )
+            # When files are uploaded, show the images in the gallery and hide the file uploader.
+            uploaded_files_gallery = gr.Gallery(label="Subject images", visible=False, columns=3, rows=1, height=300)
             with gr.Column(visible=False) as clear_button_column:
+                remove_and_reupload = gr.ClearButton(value="Remove and upload subject images",
+                                                     components=img_files, size="sm")
+
+            with gr.Accordion("Second Subject (Optional)", open=False):
+                img_files2 = gr.File(
+                    label="Drag / Select 1 or more photos of second subject's face (optional)",
+                    file_types=["image"],
+                    file_count="multiple"
+                )
+
+                uploaded_files_gallery2 = gr.Gallery(label="2nd Subject images (optional)", visible=False, columns=3, rows=1, height=300)
+                with gr.Column(visible=False) as clear_button_column2:
+                    remove_and_reupload2 = gr.ClearButton(value="Remove and upload 2nd Subject images",
+                                                          components=img_files2, size="sm")
+
+            with gr.Row(elem_classes="tight-row"):
+                with gr.Column(scale=1, min_width=100):
+                    gender = gr.Dropdown(label="Gender", value="(none)",
+                                         info="Gender prefix. Select only when the model errs.",
+                                         container=False,
+                                         choices=[ "(none)", "person", "man", "woman", "girl", "boy" ])
+
+                with gr.Column(scale=100):
+                    prompt = gr.Dropdown(label="Prompt",
+                                         info="Try something like 'walking on the beach'. If the face is not in focus, try checking 'Highlight face'.",
+                                         value="portrait, highlighted hair, futuristic silver armor suit, confident stance, living room, smiling, head tilted, perfect smooth skin",
+                                         allow_custom_value=True,
+                                         choices=[
+                                             "portrait, highlighted hair, futuristic silver armor suit, confident stance, living room, smiling, head tilted, perfect smooth skin",
+                                             "portrait, walking on the beach, sunset, orange sky, front view",
+                                             "portrait, in a white apron and chef hat, garnishing a gourmet dish",
+                                             "portrait, waving hands, dancing pose among folks in a park",
+                                             "portrait, in iron man costume, the sky ablaze with hues of orange and purple",
+                                             "portrait, jedi wielding a lightsaber, star wars",
+                                             "portrait, night view of tokyo street, neon light",
+                                             "portrait, playing guitar on a boat, ocean waves",
+                                             "portrait, with a passion for reading, curled up with a book in a cozy nook near a window, front view",
+                                             "portrait, celebrating new year, fireworks",
+                                             "portrait, running pose in a park",
+                                             "portrait, in space suit, space helmet, walking on mars",
+                                             "portrait, in superman costume, the sky ablaze with hues of orange and purple",
+                                             "in a wheelchair",
+                                             "on a horse"
+                                         ])
 
+            highlight_face = gr.Checkbox(label="Highlight face", value=False,
+                                         info="Enhance the facial features by prepending 'face portrait' to the prompt")
+            composition_level = \
+                gr.Slider(label="Composition Level", visible=True,
+                          info="The degree of overall composition, 0~2. Challenging prompts like 'In a wheelchair' and 'on a horse' need level 2",
+                          minimum=0, maximum=2, step=1, value=0)
+
+            ablate_prompt_embed_type = gr.Dropdown(label="Ablate prompt embeddings type",
+                                                   choices=["ada", "ada-nonmix", "img"], value="ada", visible=False,
+                                                   info="Use this type of prompt embeddings for ablation study")
+
+            nonmix_prompt_emb_weight = gr.Slider(label="Weight of ada-nonmix ID embeddings",
+                                                 minimum=0.0, maximum=0.5, step=0.1, value=0,
+                                                 info="Weight of ada-nonmix ID embeddings in the prompt embeddings",
+                                                 visible=False)
+
+
+            subj_name_sig = gr.Textbox(
+                label="Nickname of Subject (optional; used to name saved images)",
+                value="",
+            )
+            subj_name_sig2 = gr.Textbox(
+                label="Nickname of 2nd Subject (optional; used to name saved images)",
+                value="",
+                visible=False,
+            )
 
             submit = gr.Button("Submit", variant="primary")
 
             negative_prompt = gr.Textbox(
                 label="Negative Prompt",
+                value="sagging face, sagging cheeks, wrinkles, flaws in the eyes, flaws in the face, lowres, "
+                      "non-HDRi, low quality, worst quality, artifacts, noise, text, watermark, glitch, "
+                      "mutated, ugly, disfigured, hands, partially rendered objects, partially rendered eyes, "
+                      "deformed eyeballs, cross-eyed, extra legs, extra arms, blurry, mutation, duplicate, "
+                      "out of frame, cropped, mutilated, bad anatomy, deformed, bad proportions, "
+                      "nude, naked, nsfw, topless, bare breasts",
             )
 
             guidance_scale = gr.Slider(
                 label="Guidance scale",
                 minimum=1.0,
+                maximum=8.0,
+                step=0.5,
                 value=args.guidance_scale,
             )
 
+            adaface_encoder_cfg_scale1 = gr.Slider(
+                label="Scale for consistentID encoder",
+                minimum=1.0,
+                maximum=12.0,
+                step=1.0,
+                value=args.adaface_encoder_cfg_scales[0],
+                visible=False,
             )
 
             model_style_type = gr.Dropdown(

@@ -256,7 +440,7 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
             num_images = gr.Slider(
                 label="Number of output images",
                 minimum=1,
+                maximum=8,
                 step=1,
                 value=4,
             )

@@ -267,27 +451,41 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
                 step=1,
                 value=0,
             )
+            randomize_seed = gr.Checkbox(label="Randomize seed", value=True,
+                                         info="Uncheck for reproducible results")
+            disable_adaface = gr.Checkbox(label="Disable AdaFace", value=False,
+                                          info="Disable AdaFace for ablation. If checked, the results are no longer personalized.",
+                                          visible=args.show_disable_adaface_checkbox)
 
         with gr.Column():
+            out_gallery = gr.Gallery(label="Generated Images", interactive=False, columns=2, rows=4, height=800,
                                      elem_classes="custom-gallery")
 
+    img_files.upload(fn=swap_to_gallery, inputs=img_files, outputs=[uploaded_files_gallery, clear_button_column, img_files])
+    img_files2.upload(fn=swap_to_gallery, inputs=img_files2, outputs=[uploaded_files_gallery2, clear_button_column2, img_files2])
+    remove_and_reupload.click(fn=remove_back_to_files, outputs=[uploaded_files_gallery, clear_button_column,
+                                                                img_files, subj_name_sig, gender])
+    remove_and_reupload2.click(fn=remove_back_to_files, outputs=[uploaded_files_gallery2, clear_button_column2,
+                                                                 img_files2, subj_name_sig2, gender])
+
+    check_prompt_and_model_type_call_dict = {
+        'fn': check_prompt_and_model_type,
+        'inputs': [prompt, model_style_type, adaface_encoder_cfg_scale1],
+        'outputs': None
+    }
+    randomize_seed_fn_call_dict = {
+        'fn': randomize_seed_fn,
+        'inputs': [seed, randomize_seed],
+        'outputs': seed
+    }
+    generate_image_call_dict = {
+        'fn': generate_image,
+        'inputs': [img_files, img_files2, guidance_scale, perturb_std, num_images, prompt,
+                   negative_prompt, gender, highlight_face, ablate_prompt_embed_type,
+                   nonmix_prompt_emb_weight, composition_level, seed, disable_adaface, subj_name_sig],
+        'outputs': [out_gallery]
+    }
+    submit.click(**check_prompt_and_model_type_call_dict).success(**randomize_seed_fn_call_dict).then(**generate_image_call_dict)
+    subj_name_sig.submit(**check_prompt_and_model_type_call_dict).success(**randomize_seed_fn_call_dict).then(**generate_image_call_dict)
 
 demo.launch(share=True, server_name=args.ip, ssl_verify=False)
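The str2bool helper added to app.py above follows the common argparse pattern for optional-value boolean flags (used here for --unet_uses_attn_lora, --q_lora_updates_query, etc.). Below is a minimal, self-contained illustration of that pattern; the --use_lora flag is made up for the example.

import argparse

def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ("yes", "true", "t", "y", "1"):
        return True
    elif v.lower() in ("no", "false", "f", "n", "0"):
        return False
    else:
        raise argparse.ArgumentTypeError("Boolean value expected.")

parser = argparse.ArgumentParser()
# nargs="?" + const=True: flag absent -> default (False); bare "--use_lora" -> True; "--use_lora false" -> False.
parser.add_argument("--use_lora", type=str2bool, nargs="?", const=True, default=False)

print(parser.parse_args([]).use_lora)                        # False
print(parser.parse_args(["--use_lora"]).use_lora)            # True
print(parser.parse_args(["--use_lora", "false"]).use_lora)   # False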