Commit 7f97ed5 (parent: bccf74a)

fix bug that prevents clip extension; extend clip from 97 to 147 tokens

Files changed:
- adaface/adaface_wrapper.py: +13 -13
- app.py: +1 -1
adaface/adaface_wrapper.py
CHANGED
@@ -117,14 +117,6 @@ class AdaFaceWrapper(nn.Module):
         else:
             vae = None
 
-        if self.use_ds_text_encoder:
-            # The dreamshaper v7 finetuned text encoder follows the prompt slightly better than the original text encoder.
-            # https://huggingface.co/Lykon/DreamShaper/tree/main/text_encoder
-            text_encoder = CLIPTextModel.from_pretrained("models/diffusers/ds_text_encoder",
-                                                          torch_dtype=torch.float16)
-        else:
-            text_encoder = None
-
         remove_unet = False
 
         if self.pipeline_name == "img2img":
@@ -202,6 +194,13 @@ class AdaFaceWrapper(nn.Module):
 
         pipeline.unet = unet2
 
+        if self.use_ds_text_encoder:
+            # The dreamshaper v7 finetuned text encoder follows the prompt slightly better than the original text encoder.
+            # https://huggingface.co/Lykon/DreamShaper/tree/main/text_encoder
+            pipeline.text_encoder = CLIPTextModel.from_pretrained("models/diffusers/ds_text_encoder",
+                                                                  torch_dtype=torch.float16)
+            print("Replaced the text encoder with the DreamShaper text encoder.")
+
         # Extending prompt length is for SD 1.5 only.
         if (self.pipeline_name == "text2img") and (self.max_prompt_length > 77):
             # pipeline.text_encoder.text_model.embeddings.position_embedding.weight: [77, 768] -> [max_length, 768]
@@ -210,20 +209,21 @@ class AdaFaceWrapper(nn.Module):
             # a larger max_position_embeddings, and set ignore_mismatched_sizes=True,
             # then the old position embeddings won't be loaded from the pretrained ckpt,
             # leading to degenerated performance.
+            # max_prompt_length <= 77 + 70 = 147.
+            self.max_prompt_length = min(self.max_prompt_length, 147)
+            # Number of extra tokens is at most 70.
             EL = self.max_prompt_length - 77
             # position_embedding.weight: [77, 768] -> [max_length, 768]
             new_position_embedding = extend_nn_embedding(pipeline.text_encoder.text_model.embeddings.position_embedding,
                                                          pipeline.text_encoder.text_model.embeddings.position_embedding.weight[-EL:])
             pipeline.text_encoder.text_model.embeddings.position_embedding = new_position_embedding
             pipeline.text_encoder.text_model.embeddings.position_ids = torch.arange(self.max_prompt_length).unsqueeze(0)
-
+            pipeline.text_encoder.text_model.config.max_position_embeddings = self.max_prompt_length
+            pipeline.tokenizer.model_max_length = self.max_prompt_length
+
         if self.use_840k_vae:
             pipeline.vae = vae
             print("Replaced the VAE with the 840k-step VAE.")
-
-        if self.use_ds_text_encoder:
-            pipeline.text_encoder = text_encoder
-            print("Replaced the text encoder with the DreamShaper text encoder.")
 
         if remove_unet:
             # Remove unet and vae to release RAM. Only keep tokenizer and text_encoder.
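Note: the prompt-length extension in this hunk calls extend_nn_embedding(), a helper defined elsewhere in the repo and not shown in this diff. As a rough, assumption-laden sketch (not the repo's actual implementation) of what such a helper would need to do, namely append extra rows to an existing nn.Embedding:

import torch
import torch.nn as nn

def extend_nn_embedding(old_emb: nn.Embedding, extra_rows: torch.Tensor) -> nn.Embedding:
    # Hypothetical sketch of extend_nn_embedding (the real helper lives elsewhere
    # in the repo). Appends extra_rows after the original rows, so the CLIP
    # position embedding grows from [77, 768] to [77 + EL, 768].
    new_weight = torch.cat([old_emb.weight.data, extra_rows.data], dim=0)
    new_emb = nn.Embedding(new_weight.shape[0], new_weight.shape[1],
                           dtype=new_weight.dtype, device=new_weight.device)
    new_emb.weight.data.copy_(new_weight)
    return new_emb

The diff passes position_embedding.weight[-EL:] as the extra rows, so the new positions are initialized from the tail of the pretrained 77-position table rather than from random weights.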
app.py
CHANGED
@@ -34,7 +34,7 @@ parser.add_argument('--num_inference_steps', type=int, default=50,
 parser.add_argument('--ablate_prompt_embed_type', type=str, default='ada',
                     choices=["ada", "arc2face", "consistentID"],
                     help="Ablate to use the image ID embs instead of Ada embs")
-parser.add_argument('--max_prompt_length', type=int, default=97,
+parser.add_argument('--max_prompt_length', type=int, default=147,
                     help="Maximum length of the prompt. If > 77, the CLIP text encoder will be extended.")
 
 parser.add_argument('--gpu', type=int, default=None)
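As an assumed sanity check (not part of this commit; pipeline refers to the SD 1.5 text2img pipeline configured in adaface_wrapper.py above), a prompt longer than 77 tokens should now be tokenized and encoded at the extended length of 147 rather than being truncated:

import torch

long_prompt = "a portrait photo of a person, highly detailed, " * 20  # well over 77 tokens
tokens = pipeline.tokenizer(long_prompt,
                            padding="max_length",
                            truncation=True,
                            max_length=pipeline.tokenizer.model_max_length,  # now 147
                            return_tensors="pt")
print(tokens.input_ids.shape)    # expected: torch.Size([1, 147])

with torch.no_grad():
    prompt_embeds = pipeline.text_encoder(tokens.input_ids.to(pipeline.device))[0]
print(prompt_embeds.shape)       # expected: torch.Size([1, 147, 768]) for SD 1.5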