tolgacangoz committed
Upload matryoshka.py

matryoshka.py CHANGED (+56 -68)
@@ -1,4 +1,23 @@
-#
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Based on [🪆Matryoshka Diffusion Models](https://huggingface.co/papers/2310.15111).
+# Authors: Jiatao Gu, Shuangfei Zhai, Yizhe Zhang, Josh Susskind, Navdeep Jaitly
+# Code: https://github.com/apple/ml-mdm with MIT license
+#
+# Adapted to Diffusers by [M. Tolga Cangöz](https://github.com/tolgacangoz).
+
 
 import inspect
 import math
@@ -613,14 +632,14 @@ class MatryoshkaDDIMScheduler(SchedulerMixin, ConfigMixin):
         # 4. Clip or threshold "predicted x_0"
         if self.config.thresholding:
             if len(model_output) > 1:
-                pred_original_sample = [self._threshold_sample(p_o_s) for p_o_s in pred_original_sample]
+                pred_original_sample = [self._threshold_sample(p_o_s * scale) / scale for p_o_s, scale in zip(pred_original_sample, self.scales)]
             else:
                 pred_original_sample = self._threshold_sample(pred_original_sample)
         elif self.config.clip_sample:
             if len(model_output) > 1:
                 pred_original_sample = [
-                    p_o_s.clamp(-self.config.clip_sample_range, self.config.clip_sample_range)
-                    for p_o_s in pred_original_sample
+                    (p_o_s * scale).clamp(-self.config.clip_sample_range, self.config.clip_sample_range) / scale
+                    for p_o_s, scale in zip(pred_original_sample, self.scales)
                 ]
             else:
                 pred_original_sample = pred_original_sample.clamp(
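Note on the hunk above: in nested sampling each resolution's prediction is stored divided by its entry in `scheduler.scales`, so thresholding/clipping must be applied in unit scale and then undone. A minimal standalone sketch of the clamping pattern (the helper name `clip_per_scale` is illustrative, not part of the diff):

```python
import torch

def clip_per_scale(preds, scales, clip_range=1.0):
    # Map each prediction back to unit scale, clamp there, then rescale down.
    return [
        (p * s).clamp(-clip_range, clip_range) / s
        for p, s in zip(preds, scales)
    ]

preds = [torch.randn(1, 3, 64, 64), torch.randn(1, 3, 32, 32)]
clipped = clip_per_scale(preds, scales=[2.0, 1.0])
print([round(c.abs().max().item(), 3) for c in clipped])  # bounded by clip_range / scale
```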
@@ -3707,7 +3726,7 @@ class MatryoshkaPipeline(
     FromSingleFileMixin,
 ):
     r"""
-    Pipeline for text-to-image generation using
+    Pipeline for text-to-image generation using Matryoshka Diffusion Models.
 
     This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
     implemented for all pipelines (downloading, saving, running on a particular device, etc.).
@@ -3720,21 +3739,17 @@ class MatryoshkaPipeline(
     - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
 
     Args:
-        text_encoder ([`~transformers.
-            Frozen text-encoder ([
-        tokenizer ([`~transformers.
-            A `
+        text_encoder ([`~transformers.T5EncoderModel`]):
+            Frozen text-encoder ([flan-t5-xl](https://huggingface.co/google/flan-t5-xl)).
+        tokenizer ([`~transformers.T5Tokenizer`]):
+            A `T5Tokenizer` to tokenize text.
         unet ([`MatryoshkaUNet2DConditionModel`]):
             A `MatryoshkaUNet2DConditionModel` to denoise the encoded image latents.
         scheduler ([`SchedulerMixin`]):
             A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
-            [`
-
-
-            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
-            about a model's potential harms.
-        feature_extractor ([`~transformers.CLIPImageProcessor`]):
-            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+            [`MatryoshkaDDIMScheduler`] and other schedulers with proper modifications, see an example usage in README.md.
+        feature_extractor ([`~transformers.<AnImageProcessor>`]):
+            A `AnImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
     """
 
     model_cpu_offload_seq = "text_encoder->image_encoder->unet"
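For reference, the text components named in the updated docstring can be loaded as below; the repo id mirrors the docstring's flan-t5-xl link and is not otherwise pinned by this diff:

```python
from transformers import T5EncoderModel, T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
text_encoder = T5EncoderModel.from_pretrained("google/flan-t5-xl")
```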
@@ -3755,6 +3770,18 @@ class MatryoshkaPipeline(
     ):
         super().__init__()
 
+        if nesting_level == 0:
+            unet = MatryoshkaUNet2DConditionModel.from_pretrained("tolgacangoz/matryoshka-diffusion-models",
+                                                                  subfolder="unet/nesting_level_0")
+        elif nesting_level == 1:
+            unet = NestedUNet2DConditionModel.from_pretrained("tolgacangoz/matryoshka-diffusion-models",
+                                                              subfolder="unet/nesting_level_1")
+        elif nesting_level == 2:
+            unet = NestedUNet2DConditionModel.from_pretrained("tolgacangoz/matryoshka-diffusion-models",
+                                                              subfolder="unet/nesting_level_2")
+        else:
+            raise ValueError("Currently, nesting levels 0, 1, and 2 are supported.")
+
         if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
             deprecation_message = (
                 f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
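With the dispatch moved to the top of `__init__`, the matching (Nested)UNet checkpoint is fetched before any config checks run, and an unsupported `nesting_level` now raises a `ValueError` instead of silently keeping the passed `unet`. A hedged usage sketch; loading this file as a community pipeline named "matryoshka" is an assumption based on the file name, not something this diff states:

```python
from diffusers import DiffusionPipeline

# nesting_level selects the unet/nesting_level_{0,1,2} subfolder.
pipe = DiffusionPipeline.from_pretrained(
    "tolgacangoz/matryoshka-diffusion-models",
    custom_pipeline="matryoshka",  # assumed community-pipeline name
    nesting_level=1,
)
```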
@@ -3782,10 +3809,10 @@ class MatryoshkaPipeline(
             new_config["clip_sample"] = False
             scheduler._internal_dict = FrozenDict(new_config)
 
-        is_unet_version_less_0_9_0 = hasattr(unet
-            version.parse(unet
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
         ) < version.parse("0.9.0.dev0")
-        is_unet_sample_size_less_64 = hasattr(unet
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
         if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
             deprecation_message = (
                 "The configuration file of the unet has set the default `sample_size` to smaller than"
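The repaired check reads the version from `unet.config`. `base_version` strips dev/post suffixes, so a value like "0.8.0.dev0" still gates correctly against the "0.9.0.dev0" threshold; a quick sketch:

```python
from packaging import version

cfg_version = "0.8.0.dev0"  # stand-in for unet.config._diffusers_version
is_less_0_9_0 = version.parse(
    version.parse(cfg_version).base_version  # "0.8.0"
) < version.parse("0.9.0.dev0")
print(is_less_0_9_0)  # True
```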
@@ -3803,16 +3830,6 @@ class MatryoshkaPipeline(
             new_config["sample_size"] = 64
             unet._internal_dict = FrozenDict(new_config)
 
-        if nesting_level == 0:
-            unet = MatryoshkaUNet2DConditionModel.from_pretrained("tolgacangoz/matryoshka-diffusion-models",
-                                                                  subfolder="unet/nesting_level_0")
-        elif nesting_level == 1:
-            unet = NestedUNet2DConditionModel.from_pretrained("tolgacangoz/matryoshka-diffusion-models",
-                                                              subfolder="unet/nesting_level_1")
-        elif nesting_level == 2:
-            unet = NestedUNet2DConditionModel.from_pretrained("tolgacangoz/matryoshka-diffusion-models",
-                                                              subfolder="unet/nesting_level_2")
-
         self.register_modules(
             text_encoder=text_encoder,
             tokenizer=tokenizer,
@@ -3825,38 +3842,6 @@ class MatryoshkaPipeline(
         scheduler.scales = unet.nest_ratio + [1]
         self.image_processor = VaeImageProcessor(do_resize=False)
 
-    def _encode_prompt(
-        self,
-        prompt,
-        device,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        negative_prompt=None,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        negative_prompt_embeds: Optional[torch.Tensor] = None,
-        lora_scale: Optional[float] = None,
-        **kwargs,
-    ):
-        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
-        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
-
-        prompt_embeds_tuple = self.encode_prompt(
-            prompt=prompt,
-            device=device,
-            num_images_per_prompt=num_images_per_prompt,
-            do_classifier_free_guidance=do_classifier_free_guidance,
-            negative_prompt=negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-            lora_scale=lora_scale,
-            **kwargs,
-        )
-
-        # concatenate for backwards comp
-        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
-
-        return prompt_embeds
-
     def encode_prompt(
         self,
         prompt,
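Callers still relying on the removed `_encode_prompt()` shim can reproduce its concatenated layout from `encode_prompt()` directly (negative embeddings first, as in the removed `torch.cat`). A sketch, reusing the hypothetical `pipe` from the earlier example:

```python
import torch

prompt_embeds, negative_prompt_embeds = pipe.encode_prompt(
    prompt="a photo of an astronaut riding a horse",
    device="cpu",
    num_images_per_prompt=1,
    do_classifier_free_guidance=True,
)
# The deprecated shim returned torch.cat([negative, positive]).
legacy_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
```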
@@ -3935,7 +3920,7 @@ class MatryoshkaPipeline(
                 untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
             )
             logger.warning(
-                "The following part of your input was truncated because
+                "The following part of your input was truncated because FLAN-T5-XL for this pipeline can only handle sequences up to"
                 f" {self.tokenizer.model_max_length} tokens: {removed_text}"
             )
 
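The reworded warning fires when the tokenizer drops tokens. The surrounding detection pattern (compare a truncated encoding against a `longest`-padded one) looks roughly like this standalone sketch:

```python
from transformers import T5Tokenizer

tok = T5Tokenizer.from_pretrained("google/flan-t5-xl")
text = "a very long prompt " * 200  # exceeds model_max_length (512)
text_input_ids = tok(
    text, padding="max_length", max_length=tok.model_max_length,
    truncation=True, return_tensors="pt",
).input_ids
untruncated_ids = tok(text, padding="longest", return_tensors="pt").input_ids
if untruncated_ids.shape[-1] > text_input_ids.shape[-1]:
    removed_text = tok.batch_decode(untruncated_ids[:, tok.model_max_length - 1 : -1])
    print(f"Truncated: {removed_text}")
```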
@@ -4414,8 +4399,8 @@ class MatryoshkaPipeline(
         Examples:
 
         Returns:
-            [`~
-            If `return_dict` is `True`, [`~
+            [`~MatryoshkaPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~MatryoshkaPipelineOutput`] is returned,
             otherwise a `tuple` is returned where the first element is a list with the generated images and the
             second element is a list of `bool`s indicating whether the corresponding generated image contains
             "not-safe-for-work" (nsfw) content.
@@ -4522,10 +4507,11 @@ class MatryoshkaPipeline(
             timesteps, num_inference_steps = retrieve_timesteps(
                 self.scheduler, num_inference_steps, device, timesteps, sigmas
             )
-            timesteps = timesteps[:-1]
         else:
             timesteps = self.scheduler.timesteps
 
+        timesteps = timesteps[:-1]
+
         # 5. Prepare latent variables
         num_channels_latents = self.unet.config.in_channels
         latents = self.prepare_latents(
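Moving the trim below the `if`/`else` means the final timestep is dropped on both paths, not only when custom timesteps are supplied. A tiny illustration:

```python
import torch

timesteps = torch.linspace(999, 0, 5).long()  # stand-in for scheduler output
timesteps = timesteps[:-1]                    # the last step is always dropped now
print(timesteps.tolist())                     # [999, 749, 499, 249]
```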
@@ -4637,9 +4623,11 @@ class MatryoshkaPipeline(
             image = latents
 
         if self.scheduler.scales is not None:
-
-
-
+            for i in range(len(image)):
+                image[i] = image[i] * self.scheduler.scales[i]
+                image[i] = self.image_processor.postprocess(image[i], output_type=output_type)
+        else:
+            image = self.image_processor.postprocess(image, output_type=output_type)
 
         # Offload all models
         self.maybe_free_model_hooks()
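The new branch postprocesses each nested output after undoing its scale. On dummy data the loop behaves like this sketch, where `scales` stands in for `scheduler.scales`:

```python
import torch
from diffusers.image_processor import VaeImageProcessor

processor = VaeImageProcessor(do_resize=False)
scales = [2.0, 1.0]
image = [torch.rand(1, 3, 64, 64) / 2.0, torch.rand(1, 3, 32, 32)]
for i in range(len(image)):
    image[i] = image[i] * scales[i]  # undo the per-level downscaling
    image[i] = processor.postprocess(image[i], output_type="pil")
```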