adaface-neurips committed
Commit eaf48ba · 1 Parent(s): 4b6bc2f
prepared for distillation

lib/pipline_ConsistentID.py CHANGED (+40 -46)
@@ -33,36 +33,31 @@ PipelineImageInput = Union[
 
 ### Download the pretrained model from huggingface and put it locally, then place the model in a local directory and specify the directory location.
 class ConsistentIDPipeline(StableDiffusionPipeline):
+    # to() should be only called after all modules are loaded.
+    def to(
+        self,
+        torch_device: Optional[Union[str, torch.device]] = None,
+        torch_dtype: Optional[torch.dtype] = None,
+    ):
+        super().to(torch_device, torch_dtype)
+        self.bise_net.to(torch_device, dtype=torch_dtype)
+        self.clip_encoder.to(torch_device, dtype=torch_dtype)
+        self.image_proj_model.to(torch_device, dtype=torch_dtype)
+        self.FacialEncoder.to(torch_device, dtype=torch_dtype)
+        # If the unet is not released, the ip_layers should be moved to the specified device and dtype.
+        if not isinstance(self.unet, edict):
+            self.ip_layers.to(torch_device, dtype=torch_dtype)
+        return self
 
-    def cuda(self, dtype=torch.float16, use_xformers=False):
-        self.to('cuda', dtype)
-
-        # if hasattr(self, 'image_proj_model'):
-        #     self.image_proj_model.to(self.unet.device).to(self.unet.dtype)
-
-        if use_xformers:
-            if is_xformers_available():
-                import xformers
-                from packaging import version
-
-                xformers_version = version.parse(xformers.__version__)
-                if xformers_version == version.parse("0.0.16"):
-                    logger.warn(
-                        "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
-                    )
-                self.enable_xformers_memory_efficient_attention()
-            else:
-                raise ValueError("xformers is not available. Make sure it is installed correctly")
-
     @validate_hf_hub_args
     def load_ConsistentID_model(
         self,
-        consistentID_weight_path:
-        bise_net_weight_path:
-        trigger_word_facial:
+        consistentID_weight_path: str,
+        bise_net_weight_path: str,
+        trigger_word_facial: str = '<|facial|>',
         # A CLIP ViT-H/14 model trained with the LAION-2B English subset of LAION-5B using OpenCLIP.
         # output dim: 1280.
-        image_encoder_path:
+        image_encoder_path: str = 'laion/CLIP-ViT-H-14-laion2B-s32B-b79K',
         torch_dtype = torch.float16,
         num_tokens = 4,
         lora_rank= 128,
@@ -73,9 +68,7 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
         self.num_tokens = num_tokens
         self.set_ip_adapter()
         self.image_encoder_path = image_encoder_path
-        self.clip_encoder = CLIPVisionModelWithProjection.from_pretrained(self.image_encoder_path).to(
-            self.device, dtype=self.torch_dtype
-        )
+        self.clip_encoder = CLIPVisionModelWithProjection.from_pretrained(self.image_encoder_path)
         self.clip_preprocessor = CLIPImageProcessor()
         self.id_image_processor = CLIPImageProcessor()
         self.crop_size = 512
@@ -96,20 +89,19 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
 
         bise_net = BiSeNet(n_classes = 19)
         bise_net.load_state_dict(torch.load(bise_net_weight_path, map_location="cpu"))
-        bise_net.to(self.device, dtype=self.torch_dtype)
         bise_net.eval()
         self.bise_net = bise_net
 
         # Colors for all 20 parts
         self.part_colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0],
-
-
-
-
-
-
-
-
+                            [255, 0, 85], [255, 0, 170],
+                            [0, 255, 0], [85, 255, 0], [170, 255, 0],
+                            [0, 255, 85], [0, 255, 170],
+                            [0, 0, 255], [85, 0, 255], [170, 0, 255],
+                            [0, 85, 255], [0, 170, 255],
+                            [255, 255, 0], [255, 255, 85], [255, 255, 170],
+                            [255, 0, 255], [255, 85, 255], [255, 170, 255],
+                            [0, 255, 255], [85, 255, 255], [170, 255, 255]]
 
         # image_proj_model maps 1280-dim OpenCLIP embeddings to 768-dim face prompt embeddings.
         self.image_proj_model = ProjPlusModel(
@@ -117,8 +109,8 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
             id_embeddings_dim=512,
             clip_embeddings_dim=self.clip_encoder.config.hidden_size,
             num_tokens=self.num_tokens,  # 4 - inspirsed by IPAdapter and Midjourney
-        )
-        self.FacialEncoder = FacialEncoder()
+        )
+        self.FacialEncoder = FacialEncoder()
 
         if consistentID_weight_path.endswith(".safetensors"):
             state_dict = {"id_encoder": {}, "lora_weights": {}}
@@ -136,8 +128,8 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
 
         self.FacialEncoder.load_state_dict(state_dict["FacialEncoder"], strict=True)
         self.image_proj_model.load_state_dict(state_dict["image_proj"], strict=True)
-        ip_layers = torch.nn.ModuleList(self.unet.attn_processors.values())
-        ip_layers.load_state_dict(state_dict["adapter_modules"], strict=True)
+        self.ip_layers = torch.nn.ModuleList(self.unet.attn_processors.values())
+        self.ip_layers.load_state_dict(state_dict["adapter_modules"], strict=True)
         print(f"Successfully loaded weights from checkpoint")
 
         # Add trigger word token
@@ -160,11 +152,11 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
             if cross_attention_dim is None:
                 attn_procs[name] = Consistent_AttProcessor(
                     hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=self.lora_rank,
-                )
+                )
             else:
                 attn_procs[name] = Consistent_IPAttProcessor(
                     hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0, rank=self.lora_rank, num_tokens=self.num_tokens,
-                )
+                )
 
         unet.set_attn_processor(attn_procs)
 
@@ -364,17 +356,19 @@ class ConsistentIDPipeline(StableDiffusionPipeline):
 
         return parsed_image_parts, facial_masks, key_masked_raw_images_dict
 
-    # Release the unet
-    def release_components(self,
-        if
+    # Release the unet/vae/text_encoder to save memory.
+    def release_components(self, released_components=["unet", "vae", "text_encoder"]):
+        if "unet" in released_components:
             unet = edict()
             # Only keep the config and in_channels attributes that are used in the pipeline.
             unet.config = self.unet.config
             unet.in_channels = self.unet.in_channels
             self.unet = unet
 
-        if
+        if "vae" in released_components:
             self.vae = None
+        if "text_encoder" in released_components:
+            self.text_encoder = None
 
     # input_subj_image_obj: an Image object.
     def extract_double_id_prompt_embeds(self, prompt, negative_prompt, input_subj_image_obj, device, calc_uncond=True):
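For context, here is a minimal usage sketch of the pipeline after this commit. It is not part of the change: the base Stable Diffusion checkpoint ID and the weight paths are placeholders, and `from_pretrained()` is assumed to be inherited from `StableDiffusionPipeline`; only `load_ConsistentID_model()`, `release_components()`, and the new `to()` override come from the diff above.

```python
import torch
from lib.pipline_ConsistentID import ConsistentIDPipeline

# Assumed base model; ConsistentIDPipeline subclasses StableDiffusionPipeline,
# so from_pretrained() comes from diffusers. The model ID is a placeholder.
pipe = ConsistentIDPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
)

# Load the ConsistentID modules (FacialEncoder, image_proj_model, BiSeNet, ip_layers).
# Both weight paths below are placeholders.
pipe.load_ConsistentID_model(
    consistentID_weight_path="path/to/ConsistentID_weights.bin",
    bise_net_weight_path="path/to/bise_net.pth",
)

# Optionally drop heavy components that are not needed, e.g. when only the
# ID prompt embeddings are required for distillation.
pipe.release_components(["unet", "vae"])

# Per the comment in the diff, to() is called only after all modules are loaded;
# it also moves bise_net, clip_encoder, image_proj_model, FacialEncoder and,
# if the unet was not released, the ip_layers.
pipe.to("cuda", torch.float16)
```

The order shown (load, release, then `to()`) relies on the `isinstance(self.unet, edict)` guard in the new `to()`, which skips `ip_layers` once the unet has been replaced by a lightweight config stub.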