Commit 81489e7
Parent(s): 54b1758

Update pipeline.py

Changed files: pipeline.py (+60 -11)

pipeline.py CHANGED
@@ -1,3 +1,44 @@
 # Copyright 2023 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -58,7 +99,6 @@ EXAMPLE_DOC_STRING = """
         >>> import torch
         >>> from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler
         >>> from diffusers.utils import export_to_gif
-
         >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
         >>> pipe = AnimateDiffPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter)
         >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False)
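The docstring example above stops at scheduler setup. A typical continuation (a minimal sketch; the prompt and output file name are illustrative, not taken from this file) generates frames and exports them with the `export_to_gif` helper the example already imports:

```python
>>> output = pipe(prompt="masterpiece, best quality, 1girl, dancing", num_frames=16)
>>> frames = output.frames[0]
>>> export_to_gif(frames, "animation.gif")
```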
@@ -92,16 +132,13 @@ class AnimateDiffPipelineOutput(BaseOutput):
 class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
     r"""
     Pipeline for text-to-video generation.
-
     This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
     implemented for all pipelines (downloading, saving, running on a particular device, etc.).
-
     The pipeline also inherits the following loading methods:
         - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
         - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
         - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
         - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
-
     Args:
         vae ([`AutoencoderKL`]):
             Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
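The loading mixins listed in the class docstring map to one-line calls on the assembled pipeline. A minimal sketch; only the method names come from the docstring, while the repository ids and weight file names below are placeholders:

```python
# Placeholders: substitute real Hub repositories / local paths and weight files.
pipe.load_textual_inversion("sd-concepts-library/cat-toy")
pipe.load_lora_weights("some-user/some-lora-repo", weight_name="pytorch_lora_weights.safetensors")
pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
```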
@@ -170,7 +207,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
     ):
         r"""
         Encodes the prompt into text encoder hidden states.
-
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 prompt to be encoded
@@ -422,12 +458,9 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
     def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
         r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
         The suffixes after the scaling factors represent the stages where they are being applied.
-
         Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
         that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
         Args:
             s1 (`float`):
                 Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
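A minimal usage sketch for the method above. The numbers are illustrative starting points in the range the FreeU repository suggests for SD 1.5-style checkpoints, not values validated for this pipeline:

```python
# Backbone (b1, b2) and skip (s1, s2) scaling factors; tune per checkpoint.
pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.5, b2=1.6)
```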
@@ -566,6 +599,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         ip_adapter_image: Optional[PipelineImageInput] = None,
         output_type: Optional[str] = "pil",
+        output_path: Optional[str] = None,
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
         callback_steps: Optional[int] = 1,
@@ -574,7 +608,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
     ):
         r"""
         The call function to the pipeline for generation.
-
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
@@ -631,7 +664,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                 Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                 the output of the pre-final layer will be used for computing the prompt embeddings.
         Examples:
-
         Returns:
             [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`:
                 If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is
@@ -780,6 +812,23 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
         if output_type == "latent":
             return AnimateDiffPipelineOutput(frames=latents)

+        # save frames
+        if output_path is not None:
+            output_batch_size = 10 #this prevents out of memory errors with large videos
+            num_frames = latents.size(2) #latents' shape is [batch, channels, frames, height, width]
+            for start_idx in range(0, num_frames, output_batch_size):
+                end_idx = min(start_idx + output_batch_size, num_frames)
+                video_tensor = self.decode_latents(latents[:, :, start_idx:end_idx, :, :])
+                video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
+
+                for batch_idx, frame_batch in enumerate(video):
+                    for frame in frame_batch[0][0]:
+                        digit_substring = ''.join(filter(str.isdigit, output_path))
+                        frame_number = int(digit_substring) + start_idx + batch_idx
+                        new_output_path = output_path.replace(digit_substring, str(frame_number).zfill(5), 1)
+                        frame.save(new_output_path)
+            return output_path
+
         # Post-processing
         video_tensor = self.decode_latents(latents)
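The added branch decodes the latents in batches of ten frames, then saves each decoded frame to a copy of `output_path` whose digit run is replaced by a zero-padded frame number (`int(digits) + start_idx + batch_idx`), and returns `output_path` instead of the usual output object. A minimal sketch of how a caller might drive it; loading through `custom_pipeline` is an assumption about how this community pipeline is consumed, and the model ids, prompt, frame count, and directory are illustrative:

```python
import os

from diffusers import DDIMScheduler, DiffusionPipeline, MotionAdapter

# Assumption: the community pipeline is pulled in via `custom_pipeline`;
# the model ids follow the example docstring above.
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
pipe = DiffusionPipeline.from_pretrained(
    "frankjoshua/toonyou_beta6",
    motion_adapter=adapter,
    custom_pipeline="smoothieAI/pipeline_animatediff_context",
)
pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False)

os.makedirs("frames", exist_ok=True)

# The digit run in output_path ("00000") is used as the starting frame index
# when the per-frame file names are derived; frames are decoded in batches of 10.
pipe(
    prompt="masterpiece, best quality, 1girl, dancing",
    num_frames=16,
    output_path="frames/00000.png",
)
```

Note that when `output_path` is given, the call returns the path string rather than an `AnimateDiffPipelineOutput`.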
@@ -794,4 +843,4 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
         if not return_dict:
             return (video,)

-        return AnimateDiffPipelineOutput(frames=video)
+        return AnimateDiffPipelineOutput(frames=video)