adaface-neurips committed
Commit eaf48ba · 1 Parent(s): 4b6bc2f
prepared for distillation

lib/pipline_ConsistentID.py CHANGED (+40 -46)
@@ -33,36 +33,31 @@ PipelineImageInput = Union[
 
 ### Download the pretrained model from huggingface and put it locally, then place the model in a local directory and specify the directory location.
 class ConsistentIDPipeline(StableDiffusionPipeline):
+    # to() should be only called after all modules are loaded.
+    def to(
+        self,
+        torch_device: Optional[Union[str, torch.device]] = None,
+        torch_dtype: Optional[torch.dtype] = None,
+    ):
+        super().to(torch_device, torch_dtype)
+        self.bise_net.to(torch_device, dtype=torch_dtype)
+        self.clip_encoder.to(torch_device, dtype=torch_dtype)
+        self.image_proj_model.to(torch_device, dtype=torch_dtype)
+        self.FacialEncoder.to(torch_device, dtype=torch_dtype)
+        # If the unet is not released, the ip_layers should be moved to the specified device and dtype.
+        if not isinstance(self.unet, edict):
+            self.ip_layers.to(torch_device, dtype=torch_dtype)
+        return self
 
-    def cuda(self, dtype=torch.float16, use_xformers=False):
-        self.to('cuda', dtype)
-
-        # if hasattr(self, 'image_proj_model'):
-        #     self.image_proj_model.to(self.unet.device).to(self.unet.dtype)
-
-        if use_xformers:
-            if is_xformers_available():
-                import xformers
-                from packaging import version
-
-                xformers_version = version.parse(xformers.__version__)
-                if xformers_version == version.parse("0.0.16"):
-                    logger.warn(
-                        "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
-                    )
-                self.enable_xformers_memory_efficient_attention()
-            else:
-                raise ValueError("xformers is not available. Make sure it is installed correctly")
-
     @validate_hf_hub_args
     def load_ConsistentID_model(
         self,
-        consistentID_weight_path:
-        bise_net_weight_path:
-        trigger_word_facial:
+        consistentID_weight_path: str,
+        bise_net_weight_path: str,
+        trigger_word_facial: str = '<|facial|>',
         # A CLIP ViT-H/14 model trained with the LAION-2B English subset of LAION-5B using OpenCLIP.
         # output dim: 1280.
-        image_encoder_path:
+        image_encoder_path: str = 'laion/CLIP-ViT-H-14-laion2B-s32B-b79K',
         torch_dtype = torch.float16,
         num_tokens = 4,
         lora_rank= 128,
@@ -73,9 +68,7 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
         self.num_tokens = num_tokens
         self.set_ip_adapter()
         self.image_encoder_path = image_encoder_path
-        self.clip_encoder = CLIPVisionModelWithProjection.from_pretrained(self.image_encoder_path).to(
-            self.device, dtype=self.torch_dtype
-        )
+        self.clip_encoder = CLIPVisionModelWithProjection.from_pretrained(self.image_encoder_path)
         self.clip_preprocessor = CLIPImageProcessor()
         self.id_image_processor = CLIPImageProcessor()
         self.crop_size = 512
@@ -96,20 +89,19 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
 
         bise_net = BiSeNet(n_classes = 19)
         bise_net.load_state_dict(torch.load(bise_net_weight_path, map_location="cpu"))
-        bise_net.to(self.device, dtype=self.torch_dtype)
         bise_net.eval()
         self.bise_net = bise_net
 
         # Colors for all 20 parts
         self.part_colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0],
-
-
-
-
-
-
-
-
+                            [255, 0, 85], [255, 0, 170],
+                            [0, 255, 0], [85, 255, 0], [170, 255, 0],
+                            [0, 255, 85], [0, 255, 170],
+                            [0, 0, 255], [85, 0, 255], [170, 0, 255],
+                            [0, 85, 255], [0, 170, 255],
+                            [255, 255, 0], [255, 255, 85], [255, 255, 170],
+                            [255, 0, 255], [255, 85, 255], [255, 170, 255],
+                            [0, 255, 255], [85, 255, 255], [170, 255, 255]]
 
         # image_proj_model maps 1280-dim OpenCLIP embeddings to 768-dim face prompt embeddings.
         self.image_proj_model = ProjPlusModel(
@@ -117,8 +109,8 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
             id_embeddings_dim=512,
             clip_embeddings_dim=self.clip_encoder.config.hidden_size,
             num_tokens=self.num_tokens,  # 4 - inspirsed by IPAdapter and Midjourney
-        )
-        self.FacialEncoder = FacialEncoder()
+        )
+        self.FacialEncoder = FacialEncoder()
 
         if consistentID_weight_path.endswith(".safetensors"):
             state_dict = {"id_encoder": {}, "lora_weights": {}}
@@ -136,8 +128,8 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
 
         self.FacialEncoder.load_state_dict(state_dict["FacialEncoder"], strict=True)
         self.image_proj_model.load_state_dict(state_dict["image_proj"], strict=True)
-        ip_layers = torch.nn.ModuleList(self.unet.attn_processors.values())
-        ip_layers.load_state_dict(state_dict["adapter_modules"], strict=True)
+        self.ip_layers = torch.nn.ModuleList(self.unet.attn_processors.values())
+        self.ip_layers.load_state_dict(state_dict["adapter_modules"], strict=True)
         print(f"Successfully loaded weights from checkpoint")
 
         # Add trigger word token
@@ -160,11 +152,11 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
             if cross_attention_dim is None:
                 attn_procs[name] = Consistent_AttProcessor(
                     hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=self.lora_rank,
-                )
+                )
             else:
                 attn_procs[name] = Consistent_IPAttProcessor(
                     hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0, rank=self.lora_rank, num_tokens=self.num_tokens,
-                )
+                )
 
         unet.set_attn_processor(attn_procs)
 
@@ -364,17 +356,19 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
 
         return parsed_image_parts, facial_masks, key_masked_raw_images_dict
 
-    # Release the unet
-    def release_components(self,
-        if
+    # Release the unet/vae/text_encoder to save memory.
+    def release_components(self, released_components=["unet", "vae", "text_encoder"]):
+        if "unet" in released_components:
             unet = edict()
             # Only keep the config and in_channels attributes that are used in the pipeline.
             unet.config = self.unet.config
             unet.in_channels = self.unet.in_channels
             self.unet = unet
 
-        if
+        if "vae" in released_components:
             self.vae = None
+        if "text_encoder" in released_components:
+            self.text_encoder = None
 
     # input_subj_image_obj: an Image object.
     def extract_double_id_prompt_embeds(self, prompt, negative_prompt, input_subj_image_obj, device, calc_uncond=True):
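For context, here is a minimal usage sketch of the pipeline after this commit. It is not part of the change: the base Stable Diffusion checkpoint ID and the weight paths are placeholders, and `from_pretrained()` is assumed to be inherited from `StableDiffusionPipeline`; only `load_ConsistentID_model()`, `release_components()`, and the new `to()` override come from the diff above.

```python
import torch
from lib.pipline_ConsistentID import ConsistentIDPipeline

# Assumed base model; ConsistentIDPipeline subclasses StableDiffusionPipeline,
# so from_pretrained() comes from diffusers. The model ID is a placeholder.
pipe = ConsistentIDPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
)

# Load the ConsistentID modules (FacialEncoder, image_proj_model, BiSeNet, ip_layers).
# Both weight paths below are placeholders.
pipe.load_ConsistentID_model(
    consistentID_weight_path="path/to/ConsistentID_weights.bin",
    bise_net_weight_path="path/to/bise_net.pth",
)

# Optionally drop heavy components that are not needed, e.g. when only the
# ID prompt embeddings are required for distillation.
pipe.release_components(["unet", "vae"])

# Per the comment in the diff, to() is called only after all modules are loaded;
# it also moves bise_net, clip_encoder, image_proj_model, FacialEncoder and,
# if the unet was not released, the ip_layers.
pipe.to("cuda", torch.float16)
```

The order shown (load, release, then `to()`) relies on the `isinstance(self.unet, edict)` guard in the new `to()`, which skips `ip_layers` once the unet has been replaced by a lightweight config stub.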