adaface-neurips committed
Commit 7f97ed5 · 1 Parent(s): bccf74a

fix bug that prevents clip extension; extend clip from 97 to 147 tokens

Files changed (2):
  1. adaface/adaface_wrapper.py  +13 -13
  2. app.py  +1 -1
adaface/adaface_wrapper.py CHANGED
@@ -117,14 +117,6 @@ class AdaFaceWrapper(nn.Module):
         else:
             vae = None
 
-        if self.use_ds_text_encoder:
-            # The dreamshaper v7 finetuned text encoder follows the prompt slightly better than the original text encoder.
-            # https://huggingface.co/Lykon/DreamShaper/tree/main/text_encoder
-            text_encoder = CLIPTextModel.from_pretrained("models/diffusers/ds_text_encoder",
-                                                         torch_dtype=torch.float16)
-        else:
-            text_encoder = None
-
         remove_unet = False
 
         if self.pipeline_name == "img2img":
@@ -202,6 +194,13 @@ class AdaFaceWrapper(nn.Module):
 
         pipeline.unet = unet2
 
+        if self.use_ds_text_encoder:
+            # The dreamshaper v7 finetuned text encoder follows the prompt slightly better than the original text encoder.
+            # https://huggingface.co/Lykon/DreamShaper/tree/main/text_encoder
+            pipeline.text_encoder = CLIPTextModel.from_pretrained("models/diffusers/ds_text_encoder",
+                                                                  torch_dtype=torch.float16)
+            print("Replaced the text encoder with the DreamShaper text encoder.")
+
         # Extending prompt length is for SD 1.5 only.
         if (self.pipeline_name == "text2img") and (self.max_prompt_length > 77):
             # pipeline.text_encoder.text_model.embeddings.position_embedding.weight: [77, 768] -> [max_length, 768]
@@ -210,20 +209,21 @@
             # a larger max_position_embeddings, and set ignore_mismatched_sizes=True,
             # then the old position embeddings won't be loaded from the pretrained ckpt,
             # leading to degenerated performance.
+            # max_prompt_length <= 77 + 70 = 147.
+            self.max_prompt_length = min(self.max_prompt_length, 147)
+            # Number of extra tokens is at most 70.
             EL = self.max_prompt_length - 77
             # position_embedding.weight: [77, 768] -> [max_length, 768]
             new_position_embedding = extend_nn_embedding(pipeline.text_encoder.text_model.embeddings.position_embedding,
                                                          pipeline.text_encoder.text_model.embeddings.position_embedding.weight[-EL:])
             pipeline.text_encoder.text_model.embeddings.position_embedding = new_position_embedding
             pipeline.text_encoder.text_model.embeddings.position_ids = torch.arange(self.max_prompt_length).unsqueeze(0)
-
+            pipeline.text_encoder.text_model.config.max_position_embeddings = self.max_prompt_length
+            pipeline.tokenizer.model_max_length = self.max_prompt_length
+
         if self.use_840k_vae:
             pipeline.vae = vae
             print("Replaced the VAE with the 840k-step VAE.")
-
-        if self.use_ds_text_encoder:
-            pipeline.text_encoder = text_encoder
-            print("Replaced the text encoder with the DreamShaper text encoder.")
 
         if remove_unet:
             # Remove unet and vae to release RAM. Only keep tokenizer and text_encoder.
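Judging from the diff, the bug was twofold: the DreamShaper text encoder used to be swapped in after the prompt-length extension, replacing the extended encoder with a stock 77-token one, and the extension never reached config.max_position_embeddings or tokenizer.model_max_length, so prompts were still truncated at 77 tokens. The commit moves the swap ahead of the extension and propagates the new length to both places. The helper extend_nn_embedding is defined elsewhere in the repo and its body is not part of this diff; the following is a minimal sketch of what it plausibly does (an assumption, not the repository's actual implementation), given that the call site passes the old position embedding plus its last EL weight rows:

import torch
import torch.nn as nn

def extend_nn_embedding(old_embedding: nn.Embedding, extra_weight: torch.Tensor) -> nn.Embedding:
    # Hypothetical sketch: append the extra rows to the pretrained position-embedding table.
    # For SD 1.5's CLIP text encoder this turns [77, 768] into [77 + EL, 768].
    old_weight = old_embedding.weight.data
    new_weight = torch.cat([old_weight, extra_weight.to(old_weight)], dim=0)
    # Wrap the concatenated table in a fresh, trainable nn.Embedding of the larger size.
    return nn.Embedding.from_pretrained(new_weight, freeze=False)

Passing weight[-EL:] as the extra rows reuses the tail of the pretrained table, which presumably keeps the new positions initialized near the pretrained embedding statistics rather than at random; with EL capped at 70, the extended table is at most 147 x 768.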
app.py CHANGED
@@ -34,7 +34,7 @@ parser.add_argument('--num_inference_steps', type=int, default=50,
 parser.add_argument('--ablate_prompt_embed_type', type=str, default='ada',
                     choices=["ada", "arc2face", "consistentID"],
                     help="Ablate to use the image ID embs instead of Ada embs")
-parser.add_argument('--max_prompt_length', type=int, default=97,
+parser.add_argument('--max_prompt_length', type=int, default=147,
                     help="Maximum length of the prompt. If > 77, the CLIP text encoder will be extended.")
 
 parser.add_argument('--gpu', type=int, default=None)
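With AdaFaceWrapper now clamping max_prompt_length to 147, the app.py default is raised to match, so running python app.py without flags should extend the text encoder by default (passing --max_prompt_length 77 keeps the stock CLIP context length, since the extension only triggers above 77). As a standalone sanity check (a sketch assuming Hugging Face transformers' CLIPTokenizer/CLIPTextModel and the openai/clip-vit-large-patch14 checkpoint that SD 1.5's text encoder derives from; not code from this repo), the same extension can be applied by hand and a padded 147-token prompt encoded:

import torch
from torch import nn
from transformers import CLIPTextModel, CLIPTokenizer

max_prompt_length = 147
EL = max_prompt_length - 77   # 70 extra positions

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")

# Extend the position-embedding table from 77 to 147 rows, reusing its last EL rows,
# then mirror the new length in the position_ids buffer, the config, and the tokenizer.
emb = text_encoder.text_model.embeddings
old_weight = emb.position_embedding.weight.data                    # [77, 768]
new_weight = torch.cat([old_weight, old_weight[-EL:]], dim=0)      # [147, 768]
emb.position_embedding = nn.Embedding.from_pretrained(new_weight, freeze=False)
emb.position_ids = torch.arange(max_prompt_length).unsqueeze(0)
text_encoder.text_model.config.max_position_embeddings = max_prompt_length
tokenizer.model_max_length = max_prompt_length

prompt = "a portrait photo of a person, " + "highly detailed, " * 20
tokens = tokenizer(prompt, padding="max_length", truncation=True,
                   max_length=max_prompt_length, return_tensors="pt")
with torch.no_grad():
    hidden = text_encoder(tokens.input_ids).last_hidden_state
print(hidden.shape)   # expected: torch.Size([1, 147, 768])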