adaface-neurips committed
Commit eaf48ba · 1 Parent(s): 4b6bc2f

prepared for distillation

Files changed (1)
  1. lib/pipline_ConsistentID.py +40 -46
lib/pipline_ConsistentID.py CHANGED
@@ -33,36 +33,31 @@ PipelineImageInput = Union[

### Download the pretrained model from huggingface and put it locally, then place the model in a local directory and specify the directory location.
class ConsistentIDPipeline(StableDiffusionPipeline):
+    # to() should be only called after all modules are loaded.
+    def to(
+        self,
+        torch_device: Optional[Union[str, torch.device]] = None,
+        torch_dtype: Optional[torch.dtype] = None,
+    ):
+        super().to(torch_device, torch_dtype)
+        self.bise_net.to(torch_device, dtype=torch_dtype)
+        self.clip_encoder.to(torch_device, dtype=torch_dtype)
+        self.image_proj_model.to(torch_device, dtype=torch_dtype)
+        self.FacialEncoder.to(torch_device, dtype=torch_dtype)
+        # If the unet is not released, the ip_layers should be moved to the specified device and dtype.
+        if not isinstance(self.unet, edict):
+            self.ip_layers.to(torch_device, dtype=torch_dtype)
+        return self

-    def cuda(self, dtype=torch.float16, use_xformers=False):
-        self.to('cuda', dtype)
-
-        # if hasattr(self, 'image_proj_model'):
-        #     self.image_proj_model.to(self.unet.device).to(self.unet.dtype)
-
-        if use_xformers:
-            if is_xformers_available():
-                import xformers
-                from packaging import version
-
-                xformers_version = version.parse(xformers.__version__)
-                if xformers_version == version.parse("0.0.16"):
-                    logger.warn(
-                        "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
-                    )
-                self.enable_xformers_memory_efficient_attention()
-            else:
-                raise ValueError("xformers is not available. Make sure it is installed correctly")
-
    @validate_hf_hub_args
    def load_ConsistentID_model(
        self,
-        consistentID_weight_path: str,
-        bise_net_weight_path: str,
-        trigger_word_facial: str = '<|facial|>',
+        consistentID_weight_path: str,
+        bise_net_weight_path: str,
+        trigger_word_facial: str = '<|facial|>',
        # A CLIP ViT-H/14 model trained with the LAION-2B English subset of LAION-5B using OpenCLIP.
        # output dim: 1280.
-        image_encoder_path: str = 'laion/CLIP-ViT-H-14-laion2B-s32B-b79K',
+        image_encoder_path: str = 'laion/CLIP-ViT-H-14-laion2B-s32B-b79K',
        torch_dtype = torch.float16,
        num_tokens = 4,
        lora_rank= 128,
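
The new to() override moves every ConsistentID submodule (bise_net, clip_encoder, image_proj_model, FacialEncoder and, while the UNet is still present, ip_layers) together with the base pipeline, which is why it must run only after load_ConsistentID_model(). A minimal sketch of that ordering (the base model ID and weight paths below are placeholders, not part of this commit):

    import torch
    from lib.pipline_ConsistentID import ConsistentIDPipeline

    pipe = ConsistentIDPipeline.from_pretrained(
        "path/to/sd15_base",                    # placeholder SD 1.5 base model
        torch_dtype=torch.float16,
    )
    pipe.load_ConsistentID_model(
        consistentID_weight_path="path/to/ConsistentID_weights.bin",  # placeholder
        bise_net_weight_path="path/to/bisenet_face_parsing.pth",      # placeholder
    )
    # Only after all submodules exist can the whole pipeline be moved in one call.
    pipe.to("cuda", torch.float16)
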
@@ -73,9 +68,7 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
        self.num_tokens = num_tokens
        self.set_ip_adapter()
        self.image_encoder_path = image_encoder_path
-        self.clip_encoder = CLIPVisionModelWithProjection.from_pretrained(self.image_encoder_path).to(
-            self.device, dtype=self.torch_dtype
-        )
+        self.clip_encoder = CLIPVisionModelWithProjection.from_pretrained(self.image_encoder_path)
        self.clip_preprocessor = CLIPImageProcessor()
        self.id_image_processor = CLIPImageProcessor()
        self.crop_size = 512
@@ -96,20 +89,19 @@ class ConsistentIDPipeline(StableDiffusionPipeline):

        bise_net = BiSeNet(n_classes = 19)
        bise_net.load_state_dict(torch.load(bise_net_weight_path, map_location="cpu"))
-        bise_net.to(self.device, dtype=self.torch_dtype)
        bise_net.eval()
        self.bise_net = bise_net

        # Colors for all 20 parts
        self.part_colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0],
-                            [255, 0, 85], [255, 0, 170],
-                            [0, 255, 0], [85, 255, 0], [170, 255, 0],
-                            [0, 255, 85], [0, 255, 170],
-                            [0, 0, 255], [85, 0, 255], [170, 0, 255],
-                            [0, 85, 255], [0, 170, 255],
-                            [255, 255, 0], [255, 255, 85], [255, 255, 170],
-                            [255, 0, 255], [255, 85, 255], [255, 170, 255],
-                            [0, 255, 255], [85, 255, 255], [170, 255, 255]]
+                            [255, 0, 85], [255, 0, 170],
+                            [0, 255, 0], [85, 255, 0], [170, 255, 0],
+                            [0, 255, 85], [0, 255, 170],
+                            [0, 0, 255], [85, 0, 255], [170, 0, 255],
+                            [0, 85, 255], [0, 170, 255],
+                            [255, 255, 0], [255, 255, 85], [255, 255, 170],
+                            [255, 0, 255], [255, 85, 255], [255, 170, 255],
+                            [0, 255, 255], [85, 255, 255], [170, 255, 255]]

        # image_proj_model maps 1280-dim OpenCLIP embeddings to 768-dim face prompt embeddings.
        self.image_proj_model = ProjPlusModel(
@@ -117,8 +109,8 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
            id_embeddings_dim=512,
            clip_embeddings_dim=self.clip_encoder.config.hidden_size,
            num_tokens=self.num_tokens,  # 4 - inspirsed by IPAdapter and Midjourney
-        ).to(self.device, dtype=self.torch_dtype)
-        self.FacialEncoder = FacialEncoder().to(self.device, dtype=self.torch_dtype)
+        )
+        self.FacialEncoder = FacialEncoder()

        if consistentID_weight_path.endswith(".safetensors"):
            state_dict = {"id_encoder": {}, "lora_weights": {}}
@@ -136,8 +128,8 @@ class ConsistentIDPipeline(StableDiffusionPipeline):

        self.FacialEncoder.load_state_dict(state_dict["FacialEncoder"], strict=True)
        self.image_proj_model.load_state_dict(state_dict["image_proj"], strict=True)
-        ip_layers = torch.nn.ModuleList(self.unet.attn_processors.values())
-        ip_layers.load_state_dict(state_dict["adapter_modules"], strict=True)
+        self.ip_layers = torch.nn.ModuleList(self.unet.attn_processors.values())
+        self.ip_layers.load_state_dict(state_dict["adapter_modules"], strict=True)
        print(f"Successfully loaded weights from checkpoint")

        # Add trigger word token
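
The ModuleList stored in self.ip_layers wraps the same attention-processor modules that set_attn_processor() installs into the UNet, so loading "adapter_modules" here, and moving self.ip_layers inside to(), updates those processors in place rather than copying them. A small sketch of that sharing (continues the earlier sketch and assumes the UNet has not been released):

    procs = list(pipe.unet.attn_processors.values())
    assert procs[0] is pipe.ip_layers[0]            # same module objects, no copies
    pipe.ip_layers.to("cuda", dtype=torch.float16)  # also moves the processors inside the UNet
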
@@ -160,11 +152,11 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
            if cross_attention_dim is None:
                attn_procs[name] = Consistent_AttProcessor(
                    hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=self.lora_rank,
-                ).to(self.device, dtype=self.torch_dtype)
+                )
            else:
                attn_procs[name] = Consistent_IPAttProcessor(
                    hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0, rank=self.lora_rank, num_tokens=self.num_tokens,
-                ).to(self.device, dtype=self.torch_dtype)
+                )

        unet.set_attn_processor(attn_procs)

@@ -364,17 +356,19 @@ class ConsistentIDPipeline(StableDiffusionPipeline):

        return parsed_image_parts, facial_masks, key_masked_raw_images_dict

-    # Release the unet or vae to save memory.
-    def release_components(self, release_unet=False, release_vae=True):
-        if release_unet:
+    # Release the unet/vae/text_encoder to save memory.
+    def release_components(self, released_components=["unet", "vae", "text_encoder"]):
+        if "unet" in released_components:
            unet = edict()
            # Only keep the config and in_channels attributes that are used in the pipeline.
            unet.config = self.unet.config
            unet.in_channels = self.unet.in_channels
            self.unet = unet

-        if release_vae:
+        if "vae" in released_components:
            self.vae = None
+        if "text_encoder" in released_components:
+            self.text_encoder = None

    # input_subj_image_obj: an Image object.
    def extract_double_id_prompt_embeds(self, prompt, negative_prompt, input_subj_image_obj, device, calc_uncond=True):
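
release_components() now takes a list of component names instead of the previous boolean flags and releases the unet, vae, and text_encoder by default; a released unet is swapped for an edict stub that keeps only the config and in_channels attributes the rest of the pipeline reads. A short sketch of the two call styles (the scenario is hypothetical, not part of this commit):

    # Keep the unet for denoising but free the vae and text encoder:
    pipe.release_components(["vae", "text_encoder"])

    # Default: release all three ("unet", "vae", "text_encoder"); the unet becomes
    # a stub exposing only .config and .in_channels.
    pipe.release_components()
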
 