Commit 81489e7
Parent(s): 54b1758

Update pipeline.py

Changed files: pipeline.py (+60 -11)

pipeline.py CHANGED
@@ -1,3 +1,44 @@
 # Copyright 2023 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -58,7 +99,6 @@ EXAMPLE_DOC_STRING = """
         >>> import torch
         >>> from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler
         >>> from diffusers.utils import export_to_gif
-
         >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
         >>> pipe = AnimateDiffPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter)
         >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False)
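The docstring example above stops at scheduler setup. A typical continuation (a minimal sketch; the prompt and output file name are illustrative, not taken from this file) generates frames and exports them with the `export_to_gif` helper the example already imports:

```python
>>> output = pipe(prompt="masterpiece, best quality, 1girl, dancing", num_frames=16)
>>> frames = output.frames[0]
>>> export_to_gif(frames, "animation.gif")
```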
@@ -92,16 +132,13 @@ class AnimateDiffPipelineOutput(BaseOutput):
 class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
     r"""
     Pipeline for text-to-video generation.
-
     This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
     implemented for all pipelines (downloading, saving, running on a particular device, etc.).
-
     The pipeline also inherits the following loading methods:
         - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
         - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
         - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
         - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
-
     Args:
         vae ([`AutoencoderKL`]):
             Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
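The loading mixins listed in the class docstring map to one-line calls on the assembled pipeline. A minimal sketch; only the method names come from the docstring, while the repository ids and weight file names below are placeholders:

```python
# Placeholders: substitute real Hub repositories / local paths and weight files.
pipe.load_textual_inversion("sd-concepts-library/cat-toy")
pipe.load_lora_weights("some-user/some-lora-repo", weight_name="pytorch_lora_weights.safetensors")
pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
```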
@@ -170,7 +207,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
     ):
         r"""
         Encodes the prompt into text encoder hidden states.
-
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 prompt to be encoded
@@ -422,12 +458,9 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
     def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
         r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
         The suffixes after the scaling factors represent the stages where they are being applied.
-
         Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
         that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
         Args:
             s1 (`float`):
                 Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
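A minimal usage sketch for the method above. The numbers are illustrative starting points in the range the FreeU repository suggests for SD 1.5-style checkpoints, not values validated for this pipeline:

```python
# Backbone (b1, b2) and skip (s1, s2) scaling factors; tune per checkpoint.
pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.5, b2=1.6)
```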
@@ -566,6 +599,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         ip_adapter_image: Optional[PipelineImageInput] = None,
         output_type: Optional[str] = "pil",
+        output_path: Optional[str] = None,
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
         callback_steps: Optional[int] = 1,
@@ -574,7 +608,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
     ):
         r"""
         The call function to the pipeline for generation.
-
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
@@ -631,7 +664,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                 Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                 the output of the pre-final layer will be used for computing the prompt embeddings.
         Examples:
-
         Returns:
             [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`:
                 If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is
@@ -780,6 +812,23 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
         if output_type == "latent":
             return AnimateDiffPipelineOutput(frames=latents)

+        # save frames
+        if output_path is not None:
+            output_batch_size = 10 #this prevents out of memory errors with large videos
+            num_frames = latents.size(2) #latents' shape is [batch, channels, frames, height, width]
+            for start_idx in range(0, num_frames, output_batch_size):
+                end_idx = min(start_idx + output_batch_size, num_frames)
+                video_tensor = self.decode_latents(latents[:, :, start_idx:end_idx, :, :])
+                video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
+
+                for batch_idx, frame_batch in enumerate(video):
+                    for frame in frame_batch[0][0]:
+                        digit_substring = ''.join(filter(str.isdigit, output_path))
+                        frame_number = int(digit_substring) + start_idx + batch_idx
+                        new_output_path = output_path.replace(digit_substring, str(frame_number).zfill(5), 1)
+                        frame.save(new_output_path)
+            return output_path
+
         # Post-processing
         video_tensor = self.decode_latents(latents)
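The added branch decodes the latents in batches of ten frames, then saves each decoded frame to a copy of `output_path` whose digit run is replaced by a zero-padded frame number (`int(digits) + start_idx + batch_idx`), and returns `output_path` instead of the usual output object. A minimal sketch of how a caller might drive it; loading through `custom_pipeline` is an assumption about how this community pipeline is consumed, and the model ids, prompt, frame count, and directory are illustrative:

```python
import os

from diffusers import DDIMScheduler, DiffusionPipeline, MotionAdapter

# Assumption: the community pipeline is pulled in via `custom_pipeline`;
# the model ids follow the example docstring above.
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
pipe = DiffusionPipeline.from_pretrained(
    "frankjoshua/toonyou_beta6",
    motion_adapter=adapter,
    custom_pipeline="smoothieAI/pipeline_animatediff_context",
)
pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False)

os.makedirs("frames", exist_ok=True)

# The digit run in output_path ("00000") is used as the starting frame index
# when the per-frame file names are derived; frames are decoded in batches of 10.
pipe(
    prompt="masterpiece, best quality, 1girl, dancing",
    num_frames=16,
    output_path="frames/00000.png",
)
```

Note that when `output_path` is given, the call returns the path string rather than an `AnimateDiffPipelineOutput`.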
@@ -794,4 +843,4 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
         if not return_dict:
             return (video,)

-        return AnimateDiffPipelineOutput(frames=video)
+        return AnimateDiffPipelineOutput(frames=video)