Lin Z committed
Commit d6d7648 · 1 Parent(s): 0dc0933

init commit

Files changed (42)
  1. .checkpoints/imagebind_huge.pth +3 -0
  2. app.py +140 -0
  3. assets/.DS_Store +0 -0
  4. assets/lion_and_gun.png +0 -0
  5. assets/lions_roaring.wav +0 -0
  6. assets/machine_gun_shooting.wav +0 -0
  7. audio_encoder.py +124 -0
  8. checkpoints/audio-cond_animation/avsync15_audio-cond_cfg/ckpts/checkpoint-37000/modules/audio_encoder/config.json +6 -0
  9. checkpoints/audio-cond_animation/avsync15_audio-cond_cfg/ckpts/checkpoint-37000/modules/audio_encoder/diffusion_pytorch_model.safetensors +3 -0
  10. checkpoints/audio-cond_animation/avsync15_audio-cond_cfg/ckpts/checkpoint-37000/modules/unet/config.json +61 -0
  11. checkpoints/audio-cond_animation/avsync15_audio-cond_cfg/ckpts/checkpoint-37000/modules/unet/diffusion_pytorch_model.safetensors +3 -0
  12. datasets/AVSync15/class_clip_text_encodings_stable-diffusion-v1-5.pt +3 -0
  13. ff_spatio_audio_temp_transformer_3d.py +374 -0
  14. ff_spatio_temp_resnet_3d.py +191 -0
  15. ff_spatio_temp_transformer_3d.py +331 -0
  16. imagebind/__init__.py +3 -0
  17. imagebind/__pycache__/__init__.cpython-310.pyc +0 -0
  18. imagebind/__pycache__/data.cpython-310.pyc +0 -0
  19. imagebind/bpe/bpe_simple_vocab_16e6.txt.gz +3 -0
  20. imagebind/data.py +343 -0
  21. imagebind/models/__init__.py +0 -0
  22. imagebind/models/__pycache__/__init__.cpython-310.pyc +0 -0
  23. imagebind/models/__pycache__/helpers.cpython-310.pyc +0 -0
  24. imagebind/models/__pycache__/imagebind_model.cpython-310.pyc +0 -0
  25. imagebind/models/__pycache__/multimodal_preprocessors.cpython-310.pyc +0 -0
  26. imagebind/models/__pycache__/transformer.cpython-310.pyc +0 -0
  27. imagebind/models/helpers.py +140 -0
  28. imagebind/models/imagebind_model.py +506 -0
  29. imagebind/models/multimodal_preprocessors.py +685 -0
  30. imagebind/models/transformer.py +280 -0
  31. pipeline.py +602 -0
  32. pretrained/openai-clip-l_null_text_encoding.pt +3 -0
  33. pretrained/stable-diffusion-v1-5/scheduler/scheduler_config.json +13 -0
  34. pretrained/stable-diffusion-v1-5/vae/config.json +29 -0
  35. pretrained/stable-diffusion-v1-5/vae/diffusion_pytorch_model.bin +3 -0
  36. pretrained/stable-diffusion-v1-5/vae/diffusion_pytorch_model.fp16.bin +3 -0
  37. pretrained/stable-diffusion-v1-5/vae/diffusion_pytorch_model.fp16.safetensors +3 -0
  38. pretrained/stable-diffusion-v1-5/vae/diffusion_pytorch_model.safetensors +3 -0
  39. requirements.txt +11 -0
  40. unet.py +839 -0
  41. unet_blocks.py +1084 -0
  42. unet_utils.py +163 -0
.checkpoints/imagebind_huge.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6f6c22bedcc90708448d5d2fbb7b2db9c73f505dc89bd0b2e09b23af1b62157
3
+ size 4803584173
app.py ADDED
@@ -0,0 +1,140 @@
1
+ import warnings
2
+ warnings.filterwarnings("ignore")
3
+
4
+ import gradio as gr
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+ from diffusers.models import AutoencoderKL
9
+ from diffusers.schedulers import PNDMScheduler
10
+ from unet import AudioUNet3DConditionModel
11
+ from audio_encoder import ImageBindSegmaskAudioEncoder
12
+ from pipeline import AudioCondAnimationPipeline, generate_videos
13
+
14
+
15
+ device = torch.device("cuda")
16
+ dtype = torch.float16
17
+
18
+
19
+ def freeze_and_make_eval(model: nn.Module):
20
+ for param in model.parameters():
21
+ param.requires_grad = False
22
+ model.eval()
23
+
24
+
25
+ def create_pipeline(device=torch.device("cuda"), dtype=torch.float32):
26
+ # 2. Prepare model
27
+ pretrained_stable_diffusion_path = "./pretrained/stable-diffusion-v1-5"
28
+
29
+ checkpoint_path = f"checkpoints/audio-cond_animation/avsync15_audio-cond_cfg/ckpts/checkpoint-37000/modules"
30
+ category_text_encoding_mapping = torch.load('datasets/AVSync15/class_clip_text_encodings_stable-diffusion-v1-5.pt', map_location="cpu")
31
+
32
+ scheduler = PNDMScheduler.from_pretrained(pretrained_stable_diffusion_path, subfolder="scheduler")
33
+ vae = AutoencoderKL.from_pretrained(pretrained_stable_diffusion_path, subfolder="vae").to(device=device, dtype=dtype)
34
+ audio_encoder = ImageBindSegmaskAudioEncoder(n_segment=12).to(device=device, dtype=dtype)
35
+ freeze_and_make_eval(audio_encoder)
36
+ unet = AudioUNet3DConditionModel.from_pretrained(checkpoint_path, subfolder="unet").to(device=device, dtype=dtype)
37
+
38
+ pipeline = AudioCondAnimationPipeline(
39
+ unet=unet,
40
+ scheduler=scheduler,
41
+ vae=vae,
42
+ audio_encoder=audio_encoder,
43
+ null_text_encodings_path="./pretrained/openai-clip-l_null_text_encoding.pt"
44
+ )
45
+ pipeline.to(torch_device=device, dtype=dtype)
46
+ pipeline.set_progress_bar_config(disable=True)
47
+
48
+ return pipeline, category_text_encoding_mapping
49
+
50
+ pipeline, category_text_encoding_mapping = create_pipeline(device, dtype)
51
+
52
+
53
+ def generate_video(image, audio, text, audio_guidance_scale, denoising_step):
54
+
55
+ category_text_encoding = category_text_encoding_mapping[text].view(1, 77, 768)
56
+
57
+ generate_videos(
58
+ pipeline,
59
+ audio_path=audio,
60
+ image_path=image,
61
+ category_text_encoding=category_text_encoding,
62
+ image_size=(256, 256),
63
+ video_fps=6,
64
+ video_num_frame=12,
65
+ text_guidance_scale=1.0,
66
+ audio_guidance_scale=audio_guidance_scale,
67
+ denoising_step=denoising_step,
68
+ seed=123,
69
+ save_path="./output_video.mp4",
70
+ device=device
71
+ )
72
+
73
+ return "./output_video.mp4"
74
+
75
+
76
+ if __name__ == "__main__":
77
+
78
+ categories = [
79
+ "baby babbling crying", "dog barking", "hammering", "striking bowling", "cap gun shooting",
80
+ "chicken crowing", "frog croaking", "lions roaring", "machine gun shooting", "playing cello",
81
+ "playing trombone", "playing trumpet", "playing violin fiddle", "sharpen knife", "toilet flushing"
82
+ ]
83
+
84
+ title = ""
85
+ description = """
86
+ <div align="center">
87
+
88
+ <h1 style="font-size: 60px;">Audio-Synchronized Visual Animation</h1>
89
+
90
+ <p style="font-size: 30px;">
91
+ <a href="https://lzhangbj.github.io/projects/asva/asva.html">Project Webpage</a>
92
+ </p>
93
+
94
+ <p style="font-size: 30px;">
95
+ <a href="https://lzhangbj.github.io/">Lin Zhang</a>,
96
+ <a href="https://scholar.google.com/citations?user=6aYncPAAAAAJ">Shentong Mo</a>,
97
+ <a href="https://yijingz02.github.io/">Yijing Zhang</a>,
98
+ <a href="https://pedro-morgado.github.io/">Pedro Morgado</a>
99
+ </p>
100
+
101
+ <p style="font-size: 30px;">
102
+ University of Wisconsin Madison,
103
+ Carnegie Mellon University
104
+ </p>
105
+
106
+ <strong style="font-size: 30px;">ECCV 2024</strong>
107
+
108
+ <strong style="font-size: 25px;">Animate your images with audio-synchronized motion! </strong>
109
+
110
+ <p style="font-size: 18px;">Notes:</p>
111
+ <p style="font-size: 18px;">(1) Only the first 2 seconds of audio is used. </p>
112
+ <p style="font-size: 18px;">(2) Increase audio guidance scale for amplified visual dynamics. </p>
113
+ <p style="font-size: 18px;">(3) Increase sampling steps for higher visual quality. </p>
114
+
115
+ </div>
116
+ """
117
+
118
+ # <p style="font-size: 20px;">Please be patient. Due to limited resources on huggingface, the generation may take up to 10mins </p>
119
+
120
+ # Gradio Interface
121
+ iface = gr.Interface(
122
+ fn=generate_video,
123
+ inputs=[
124
+ gr.Image( label="Upload Image", type="filepath", height=256),
125
+ gr.Audio(label="Upload Audio", type="filepath"),
126
+ gr.Dropdown(choices=categories, label="Select Audio Category"),
127
+ gr.Slider(minimum=1.0, maximum=12.0, step=0.1, value=4.0, label="Audio Guidance Scale"),
128
+ gr.Slider(minimum=1, maximum=50, step=1, value=20, label="Sampling steps")
129
+ ],
130
+ outputs=gr.Video(label="Generated Video", height=256),
131
+ title=title,
132
+ description=description,
133
+ examples = [
134
+ ["./assets/lion_and_gun.png", "./assets/lions_roaring.wav", "lions roaring", 4.0, 20],
135
+ ["./assets/lion_and_gun.png", "./assets/machine_gun_shooting.wav", "machine gun shooting", 4.0, 20],
136
+ ]
137
+ )
138
+
139
+ # Launch the interface
140
+ iface.launch()
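
Note: a minimal sketch of driving the generate_video helper above directly from Python, without launching the Gradio UI. It assumes the environment app.py itself assumes (this repo's assets and checkpoints on disk, plus a CUDA device, since app.py pins device = torch.device("cuda")); importing app builds the pipeline once at module load.

# Sketch only: relies on the files added in this commit and a CUDA device.
from app import generate_video

# The text label must be one of the AVSync15 categories listed in app.py,
# because it indexes category_text_encoding_mapping.
video_path = generate_video(
    image="./assets/lion_and_gun.png",
    audio="./assets/lions_roaring.wav",
    text="lions roaring",
    audio_guidance_scale=4.0,
    denoising_step=20,
)
print(video_path)  # "./output_video.mp4", the path hard-coded in app.py
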
assets/.DS_Store ADDED
Binary file (6.15 kB).
 
assets/lion_and_gun.png ADDED
assets/lions_roaring.wav ADDED
Binary file (135 kB).
 
assets/machine_gun_shooting.wav ADDED
Binary file (885 kB).
 
audio_encoder.py ADDED
@@ -0,0 +1,124 @@
1
+ import math
2
+ from dataclasses import dataclass
3
+ from typing import Optional, TypeVar, Tuple, Any
4
+
5
+ T = TypeVar('T', bound='Module')
6
+ from einops import rearrange, repeat
7
+
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn as nn
11
+
12
+ from transformers.utils import ModelOutput
13
+ from transformers.modeling_outputs import BaseModelOutputWithPooling
14
+
15
+ from diffusers.models.modeling_utils import ModelMixin
16
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
17
+
18
+ from imagebind.models import imagebind_model
19
+ from imagebind.models.imagebind_model import ModalityType
20
+
21
+
22
+ @dataclass
23
+ class ImageBindSegmaskAudioEncoderOutput(ModelOutput):
24
+ """
25
+ Args:
26
+ audio_embeds (`torch.Tensor` of shape `(batch_size, output_dim)`):
27
+ The pooled audio embeddings obtained by applying the ImageBind audio head
28
+ (and, when `normalize=True`, the logit-scaling postprocessor) to the trunk output.
29
+ audio_encodings (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_dim)`):
30
+ The per-token audio encodings from the ImageBind audio trunk, after the final layer norm.
31
+ audio_segment_masks (`torch.BoolTensor` of shape `(batch_size, n_segment, sequence_length)`):
32
+ Boolean masks indicating which audio tokens (including the leading token)
33
+ belong to each temporal segment.
34
+ """
35
+ audio_embeds: torch.Tensor = None
36
+ audio_encodings: torch.Tensor = None
37
+ audio_segment_masks: torch.BoolTensor = None
38
+
39
+ def to_tuple(self) -> Tuple[Any]:
40
+ return tuple(self[k] for k in self.keys())
41
+
42
+
43
+ class ImageBindSegmaskAudioEncoder(ModelMixin, ConfigMixin):
44
+
45
+ @register_to_config
46
+ def __init__(self,
47
+ n_segment=4,
48
+ pretrained_model_name="imagebind-huge"
49
+ ):
50
+ super().__init__()
51
+ self.n_segment = n_segment
52
+
53
+ self.pretrained_model_name = pretrained_model_name
54
+ if pretrained_model_name == "imagebind-huge":
55
+ pretrained_model = imagebind_model.imagebind_huge(pretrained=True)
56
+
57
+ self.preprocessor = pretrained_model.modality_preprocessors[ModalityType.AUDIO]
58
+ self.trunk = pretrained_model.modality_trunks[ModalityType.AUDIO]
59
+ self.head = pretrained_model.modality_heads[ModalityType.AUDIO]
60
+ self.postprocessor = pretrained_model.modality_postprocessors[ModalityType.AUDIO]
61
+ self.final_layer_norm = nn.LayerNorm(normalized_shape=768, eps=1e-6)
62
+
63
+ def _auto_split(self, n, n_chunk):
64
+ '''
65
+ automatically split n elements into n_chunk chunks of equal size ceil(n / n_chunk)
66
+ if n is not divisible by n_chunk, chunk start indices are spaced evenly, so neighbouring chunks may overlap
67
+ '''
68
+ chunk_size = int(math.ceil(n / n_chunk))
69
+ assert chunk_size >= 1, chunk_size
70
+
71
+ chunk_start_indices = np.round(np.linspace(0, n - chunk_size, n_chunk, endpoint=True)).astype(np.int32)
72
+
73
+ mask = torch.zeros(n_chunk, n).bool()
74
+ for chunk_index, chunk_start_index in enumerate(chunk_start_indices):
75
+ mask[chunk_index, chunk_start_index:chunk_start_index + chunk_size] = 1
76
+ mask = mask.contiguous()
77
+ assert mask.long().sum() == chunk_size * n_chunk, mask.long().sum()
78
+
79
+ return mask
80
+
81
+ def forward(self,
82
+ input_features: Optional[torch.Tensor],
83
+ normalize: bool = False,
84
+ return_dict: Optional[bool] = None):
85
+
86
+ n_segment = self.n_segment
87
+
88
+ # 1. reshape to imagebind input
89
+ batchsize = input_features.size(0)
90
+
91
+ # 2. patchify images and add positional embedding and
92
+ audio_inputs = self.preprocessor(input_features)
93
+ trunk_inputs = audio_inputs["trunk"] # dict of {"tokens": (b, l, d)}
94
+
95
+ # 3. get audio encoder output
96
+ audio_encodings = self.trunk(**trunk_inputs) # w/o layer norm (b, seq_len, c)
97
+ head_inputs = audio_inputs["head"]
98
+ cls_embeds = self.head(audio_encodings, **head_inputs)
99
+ # normalize and logit scaling
100
+ if normalize:
101
+ cls_embeds = self.postprocessor(cls_embeds) # (b, c)
102
+ audio_encodings = self.final_layer_norm(audio_encodings)
103
+
104
+ # 4. get segment masks
105
+ n, t = 12, 19 # hard-coded: ImageBind's audio preprocessor yields a 12 x 19 grid of patch tokens (frequency x time)
106
+ segment_mask = self._auto_split(t, n_segment).unsqueeze(1).expand(n_segment, n, t).contiguous() # (s, n, t)
107
+ segment_mask = rearrange(
108
+ segment_mask, "s n t -> s (n t)"
109
+ )
110
+ segment_mask = torch.cat([
111
+ torch.ones(n_segment, 1).bool(),
112
+ segment_mask
113
+ ], dim=1) # (s, 1+n*t)
114
+
115
+ segment_masks = repeat(segment_mask, "n s -> b n s", b=batchsize).contiguous().bool().to(self.device)
116
+
117
+ if not return_dict:
118
+ return cls_embeds, audio_encodings, segment_masks
119
+
120
+ return ImageBindSegmaskAudioEncoderOutput(
121
+ audio_embeds=cls_embeds,
122
+ audio_encodings=audio_encodings,
123
+ audio_segment_masks=segment_masks
124
+ )
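
A small, self-contained sketch of the segment-mask construction above. The splitting logic mirrors _auto_split, and n_segment=12 over t=19 matches the hard-coded ImageBind audio patch grid; in the forward pass an extra leading column is then prepended for the first (class) token.

import math
import numpy as np
import torch

def auto_split(n, n_chunk):
    # Same logic as ImageBindSegmaskAudioEncoder._auto_split above:
    # n_chunk equal-size chunks of ceil(n / n_chunk) elements with evenly spaced starts.
    chunk_size = int(math.ceil(n / n_chunk))
    starts = np.round(np.linspace(0, n - chunk_size, n_chunk, endpoint=True)).astype(np.int32)
    mask = torch.zeros(n_chunk, n).bool()
    for i, s in enumerate(starts):
        mask[i, s:s + chunk_size] = True
    return mask

t, n_segment = 19, 12            # 19 time patches split into 12 segments
mask = auto_split(t, n_segment)  # (12, 19); each row covers 2 consecutive time patches
print(mask.long())
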
checkpoints/audio-cond_animation/avsync15_audio-cond_cfg/ckpts/checkpoint-37000/modules/audio_encoder/config.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "_class_name": "ImageBindSegmaskAudioEncoder",
3
+ "_diffusers_version": "0.29.2",
4
+ "n_segment": 12,
5
+ "pretrained_model_name": "imagebind-huge"
6
+ }
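
Because ImageBindSegmaskAudioEncoder subclasses diffusers' ModelMixin/ConfigMixin, this config plus the safetensors file below should, in principle, also allow restoring the fine-tuned encoder in one call. This is a hedged sketch, not what app.py does (app.py instantiates the encoder directly on top of the pretrained ImageBind weights).

# Sketch only: assumes the checkpoint layout added in this commit.
from audio_encoder import ImageBindSegmaskAudioEncoder

checkpoint_root = "checkpoints/audio-cond_animation/avsync15_audio-cond_cfg/ckpts/checkpoint-37000/modules"
audio_encoder = ImageBindSegmaskAudioEncoder.from_pretrained(
    checkpoint_root, subfolder="audio_encoder"  # reads this config.json + diffusion_pytorch_model.safetensors
)
# Note: the constructor may still fetch the base ImageBind-Huge checkpoint before the
# fine-tuned state dict is applied.
audio_encoder.eval()
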
checkpoints/audio-cond_animation/avsync15_audio-cond_cfg/ckpts/checkpoint-37000/modules/audio_encoder/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93622a01c9bdd6bad87530617f0fdc772be958dc435b3303ed66ba938311aa4b
3
+ size 172492226
checkpoints/audio-cond_animation/avsync15_audio-cond_cfg/ckpts/checkpoint-37000/modules/unet/config.json ADDED
@@ -0,0 +1,61 @@
1
+ {
2
+ "_class_name": "AudioUNet3DConditionModel",
3
+ "_diffusers_version": "0.29.2",
4
+ "act_fn": "silu",
5
+ "addition_embed_type": null,
6
+ "addition_embed_type_num_heads": 64,
7
+ "attention_head_dim": 8,
8
+ "audio_cross_attention_dim": 768,
9
+ "block_out_channels": [
10
+ 320,
11
+ 640,
12
+ 1280,
13
+ 1280
14
+ ],
15
+ "center_input_sample": false,
16
+ "class_embed_type": null,
17
+ "class_embeddings_concat": false,
18
+ "conv_in_kernel": 3,
19
+ "conv_out_kernel": 3,
20
+ "cross_attention_dim": 768,
21
+ "cross_attention_norm": null,
22
+ "down_block_types": [
23
+ "FFSpatioAudioTempCrossAttnDownBlock3D",
24
+ "FFSpatioAudioTempCrossAttnDownBlock3D",
25
+ "FFSpatioAudioTempCrossAttnDownBlock3D",
26
+ "FFSpatioTempResDownBlock3D"
27
+ ],
28
+ "downsample_padding": 1,
29
+ "dual_cross_attention": false,
30
+ "encoder_hid_dim": null,
31
+ "flip_sin_to_cos": true,
32
+ "freq_shift": 0,
33
+ "in_channels": 4,
34
+ "layers_per_block": 2,
35
+ "mid_block_only_cross_attention": null,
36
+ "mid_block_scale_factor": 1,
37
+ "mid_block_type": "FFSpatioAudioTempCrossAttnUNetMidBlock3D",
38
+ "norm_eps": 1e-05,
39
+ "norm_num_groups": 32,
40
+ "num_class_embeds": null,
41
+ "only_cross_attention": false,
42
+ "out_channels": 4,
43
+ "projection_class_embeddings_input_dim": null,
44
+ "resnet_out_scale_factor": 1.0,
45
+ "resnet_skip_time_act": false,
46
+ "resnet_time_scale_shift": "default",
47
+ "sample_size": 64,
48
+ "time_cond_proj_dim": null,
49
+ "time_embedding_act_fn": null,
50
+ "time_embedding_dim": null,
51
+ "time_embedding_type": "positional",
52
+ "timestep_post_act": null,
53
+ "up_block_types": [
54
+ "FFSpatioTempResUpBlock3D",
55
+ "FFSpatioAudioTempCrossAttnUpBlock3D",
56
+ "FFSpatioAudioTempCrossAttnUpBlock3D",
57
+ "FFSpatioAudioTempCrossAttnUpBlock3D"
58
+ ],
59
+ "upcast_attention": false,
60
+ "use_linear_projection": false
61
+ }
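
For orientation: this config describes a 4-level 3D UNet (320/640/1280/1280 channels) in which the first three down blocks and last three up blocks use audio/text cross-attention, while the deepest down/up blocks are residual-only. A short sketch repeating how app.py above restores it:

# Sketch based on app.py; paths are the ones added in this commit.
import torch
from unet import AudioUNet3DConditionModel

checkpoint_path = "checkpoints/audio-cond_animation/avsync15_audio-cond_cfg/ckpts/checkpoint-37000/modules"
unet = AudioUNet3DConditionModel.from_pretrained(checkpoint_path, subfolder="unet").to(
    device=torch.device("cuda"), dtype=torch.float16
)
print(unet.config.down_block_types)  # three cross-attn blocks, then a residual-only block
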
checkpoints/audio-cond_animation/avsync15_audio-cond_cfg/ckpts/checkpoint-37000/modules/unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:234652f6029bd49d05d6e77e5fe6721e239bbb4ae93a60112ea53d95824da097
3
+ size 4677570888
datasets/AVSync15/class_clip_text_encodings_stable-diffusion-v1-5.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10b3e0bcf2f12ee7c0410165e2872ae76fe3a58f9d43834781cc8bd79c5cfc46
3
+ size 3553440
ff_spatio_audio_temp_transformer_3d.py ADDED
@@ -0,0 +1,374 @@
1
+ # Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Optional
5
+ from einops import rearrange
6
+
7
+ import torch
8
+ from torch import nn
9
+
10
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
11
+ from diffusers.models.modeling_utils import ModelMixin
12
+ from diffusers.utils import BaseOutput
13
+ from diffusers.utils.import_utils import is_xformers_available
14
+ from diffusers.models.attention import Attention
15
+ from diffusers.models.attention import FeedForward, AdaLayerNorm, AdaLayerNormZero
16
+ from diffusers.models.embeddings import Timesteps, TimestepEmbedding
17
+
18
+ from unet_utils import FFAttention
19
+
20
+
21
+ @dataclass
22
+ class SpatioTempTransformer3DModelOutput(BaseOutput):
23
+ sample: torch.Tensor
24
+
25
+
26
+ if is_xformers_available():
27
+ import xformers
28
+ import xformers.ops
29
+ else:
30
+ xformers = None
31
+
32
+
33
+ class FFSpatioAudioTempTransformer3DModel(ModelMixin, ConfigMixin):
34
+
35
+ @register_to_config
36
+ def __init__(
37
+ self,
38
+ num_attention_heads: int = 16,
39
+ attention_head_dim: int = 88,
40
+ in_channels: Optional[int] = None,
41
+ num_layers: int = 1,
42
+ dropout: float = 0.0,
43
+ norm_num_groups: int = 32,
44
+ cross_attention_dim: Optional[int] = None,
45
+ audio_cross_attention_dim: Optional[int] = None,
46
+ attention_bias: bool = False,
47
+ activation_fn: str = "geglu",
48
+ num_embeds_ada_norm: Optional[int] = None,
49
+ use_linear_projection: bool = False,
50
+ only_cross_attention: bool = False,
51
+ upcast_attention: bool = False,
52
+ ):
53
+ super().__init__()
54
+ self.use_linear_projection = use_linear_projection
55
+ self.num_attention_heads = num_attention_heads
56
+ self.attention_head_dim = attention_head_dim
57
+ inner_dim = num_attention_heads * attention_head_dim
58
+
59
+ # Define input layers
60
+ self.in_channels = in_channels
61
+
62
+ self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
63
+ if use_linear_projection:
64
+ self.proj_in = nn.Linear(in_channels, inner_dim)
65
+ else:
66
+ self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
67
+
68
+ # Define transformers blocks
69
+ self.transformer_blocks = nn.ModuleList(
70
+ [
71
+ BasicTransformerBlock(
72
+ inner_dim,
73
+ num_attention_heads,
74
+ attention_head_dim,
75
+ dropout=dropout,
76
+ cross_attention_dim=cross_attention_dim,
77
+ audio_cross_attention_dim=audio_cross_attention_dim,
78
+ activation_fn=activation_fn,
79
+ num_embeds_ada_norm=num_embeds_ada_norm,
80
+ attention_bias=attention_bias,
81
+ only_cross_attention=only_cross_attention,
82
+ upcast_attention=upcast_attention,
83
+ )
84
+ for d in range(num_layers)
85
+ ]
86
+ )
87
+
88
+ # 4. Define output layers
89
+ if use_linear_projection:
90
+ self.proj_out = nn.Linear(in_channels, inner_dim)
91
+ else:
92
+ self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
93
+
94
+ def forward(
95
+ self,
96
+ hidden_states,
97
+ encoder_hidden_states=None,
98
+ audio_encoder_hidden_states=None,
99
+ audio_attention_mask=None,
100
+ timestep=None,
101
+ class_labels=None,
102
+ cross_attention_kwargs=None,
103
+ return_dict: bool = True
104
+ ):
105
+ # Input
106
+ assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
107
+ video_length = hidden_states.shape[2]
108
+ hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
109
+ encoder_hidden_states = rearrange(encoder_hidden_states, 'b f n c -> (b f) n c')
110
+ audio_encoder_hidden_states = rearrange(audio_encoder_hidden_states, 'b f n c -> (b f) n c')
111
+ if audio_attention_mask is not None:
112
+ audio_attention_mask = rearrange(audio_attention_mask, 'b f n -> (b f) 1 n')
113
+
114
+ batch, channel, height, weight = hidden_states.shape
115
+ residual = hidden_states
116
+
117
+ hidden_states = self.norm(hidden_states)
118
+ if not self.use_linear_projection:
119
+ hidden_states = self.proj_in(hidden_states)
120
+ inner_dim = hidden_states.shape[1]
121
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
122
+ else:
123
+ inner_dim = hidden_states.shape[1]
124
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
125
+ hidden_states = self.proj_in(hidden_states)
126
+
127
+ # Blocks
128
+ for block in self.transformer_blocks:
129
+ hidden_states = block(
130
+ hidden_states,
131
+ encoder_hidden_states=encoder_hidden_states,
132
+ audio_encoder_hidden_states=audio_encoder_hidden_states,
133
+ audio_attention_mask=audio_attention_mask,
134
+ timestep=timestep,
135
+ video_length=video_length,
136
+ cross_attention_kwargs=cross_attention_kwargs,
137
+ class_labels=class_labels
138
+ )
139
+
140
+ # Output
141
+ if not self.use_linear_projection:
142
+ hidden_states = (
143
+ hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()
144
+ )
145
+ hidden_states = self.proj_out(hidden_states)
146
+ else:
147
+ hidden_states = self.proj_out(hidden_states)
148
+ hidden_states = (
149
+ hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()
150
+ )
151
+
152
+ output = hidden_states + residual
153
+
154
+ output = rearrange(output, "(b f) c h w -> b c f h w", f=video_length)
155
+ if not return_dict:
156
+ return (output,)
157
+
158
+ return SpatioTempTransformer3DModelOutput(sample=output)
159
+
160
+
161
+ class BasicTransformerBlock(nn.Module):
162
+ def __init__(
163
+ self,
164
+ dim: int,
165
+ num_attention_heads: int,
166
+ attention_head_dim: int,
167
+ dropout=0.0,
168
+ cross_attention_dim: Optional[int] = None,
169
+ audio_cross_attention_dim: Optional[int] = None,
170
+ activation_fn: str = "geglu",
171
+ num_embeds_ada_norm: Optional[int] = None,
172
+ attention_bias: bool = False,
173
+ only_cross_attention: bool = False,
174
+ double_self_attention: bool = False,
175
+ upcast_attention: bool = False,
176
+ norm_elementwise_affine: bool = True,
177
+ norm_type: str = "layer_norm",
178
+ final_dropout: bool = False,
179
+ ):
180
+ super().__init__()
181
+ self.only_cross_attention = only_cross_attention
182
+
183
+ self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
184
+ self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
185
+
186
+ if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
187
+ raise ValueError(
188
+ f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
189
+ f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
190
+ )
191
+
192
+ # Define 5 blocks. Each block has its own normalization layer.
193
+ # 1. SC-Cross-Attn
194
+ if self.use_ada_layer_norm:
195
+ self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
196
+ elif self.use_ada_layer_norm_zero:
197
+ self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
198
+ else:
199
+ self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
200
+ self.attn1 = FFAttention(
201
+ query_dim=dim,
202
+ heads=num_attention_heads,
203
+ dim_head=attention_head_dim,
204
+ dropout=dropout,
205
+ bias=attention_bias,
206
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
207
+ upcast_attention=upcast_attention,
208
+ )
209
+
210
+ # 2. Audio Conditioned Cross-Attn
211
+ self.norm_audio = (
212
+ AdaLayerNorm(dim, num_embeds_ada_norm)
213
+ if self.use_ada_layer_norm
214
+ else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
215
+ )
216
+ self.attn_audio = Attention(
217
+ query_dim=dim,
218
+ cross_attention_dim=audio_cross_attention_dim,
219
+ heads=num_attention_heads,
220
+ dim_head=attention_head_dim,
221
+ dropout=dropout,
222
+ bias=attention_bias,
223
+ upcast_attention=upcast_attention,
224
+ )
225
+
226
+ # 3. Cross-Attn
227
+ if cross_attention_dim is not None or double_self_attention:
228
+ # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
229
+ # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
230
+ # the second cross attention block.
231
+ self.norm2 = (
232
+ AdaLayerNorm(dim, num_embeds_ada_norm)
233
+ if self.use_ada_layer_norm
234
+ else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
235
+ )
236
+ self.attn2 = Attention(
237
+ query_dim=dim,
238
+ cross_attention_dim=cross_attention_dim if not double_self_attention else None,
239
+ heads=num_attention_heads,
240
+ dim_head=attention_head_dim,
241
+ dropout=dropout,
242
+ bias=attention_bias,
243
+ upcast_attention=upcast_attention,
244
+ ) # is self-attn if encoder_hidden_states is none
245
+ else:
246
+ self.norm2 = None
247
+ self.attn2 = None
248
+
249
+ # 4. Temp-Attn
250
+ self.pos_proj_temp = Timesteps(dim, flip_sin_to_cos=True, downscale_freq_shift=0)
251
+ self.pos_embedding_temp = TimestepEmbedding(
252
+ dim,
253
+ dim,
254
+ act_fn="silu",
255
+ post_act_fn=None,
256
+ cond_proj_dim=None,
257
+ )
258
+
259
+ self.attn_temp = Attention(
260
+ query_dim=dim,
261
+ heads=num_attention_heads,
262
+ dim_head=attention_head_dim,
263
+ dropout=dropout,
264
+ bias=attention_bias,
265
+ upcast_attention=upcast_attention,
266
+ )
267
+ nn.init.zeros_(self.attn_temp.to_out[0].weight.data)
268
+ self.norm_temp = (
269
+ AdaLayerNorm(dim, num_embeds_ada_norm)
270
+ if self.use_ada_layer_norm
271
+ else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
272
+ )
273
+
274
+ # 5. Feed-forward
275
+ self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
276
+ self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
277
+
278
+ def forward(
279
+ self,
280
+ hidden_states,
281
+ attention_mask=None,
282
+ encoder_hidden_states=None,
283
+ encoder_attention_mask=None,
284
+ audio_encoder_hidden_states=None,
285
+ audio_attention_mask=None,
286
+ timestep=None,
287
+ video_length=None,
288
+ cross_attention_kwargs=None,
289
+ class_labels=None,
290
+ ):
291
+ # Notice that normalization is always applied before the real computation in the following blocks.
292
+ # 1. Self-Attention
293
+ if self.use_ada_layer_norm:
294
+ norm_hidden_states = self.norm1(hidden_states, timestep)
295
+ elif self.use_ada_layer_norm_zero:
296
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
297
+ hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
298
+ )
299
+ else:
300
+ norm_hidden_states = self.norm1(hidden_states)
301
+
302
+ cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
303
+ attn_output = self.attn1(
304
+ norm_hidden_states,
305
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
306
+ attention_mask=attention_mask,
307
+ video_length=video_length,
308
+ **cross_attention_kwargs,
309
+ )
310
+ if self.use_ada_layer_norm_zero:
311
+ attn_output = gate_msa.unsqueeze(1) * attn_output
312
+ hidden_states = attn_output + hidden_states
313
+
314
+ # 2. Audio Cross-Attention
315
+ if self.attn_audio is not None:
316
+ norm_hidden_states = (
317
+ self.norm_audio(hidden_states, timestep) if self.use_ada_layer_norm else self.norm_audio(hidden_states)
318
+ )
319
+ attn_output = self.attn_audio(
320
+ norm_hidden_states,
321
+ encoder_hidden_states=audio_encoder_hidden_states,
322
+ attention_mask=audio_attention_mask,
323
+ **cross_attention_kwargs,
324
+ )
325
+ hidden_states = attn_output + hidden_states
326
+
327
+ # 3. Cross-Attention
328
+ if self.attn2 is not None:
329
+ norm_hidden_states = (
330
+ self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
331
+ )
332
+ # TODO (Birch-San): Here we should prepare the encoder_attention mask correctly
333
+ # prepare attention mask here
334
+
335
+ attn_output = self.attn2(
336
+ norm_hidden_states,
337
+ encoder_hidden_states=encoder_hidden_states,
338
+ attention_mask=encoder_attention_mask,
339
+ **cross_attention_kwargs,
340
+ )
341
+ hidden_states = attn_output + hidden_states
342
+
343
+ # 3. Temporal-Attention
344
+
345
+ # Add positional embedding
346
+ device = hidden_states.device
347
+ dtype = hidden_states.dtype
348
+ pos_embed = self.pos_proj_temp(torch.arange(video_length).long()).to(device=device, dtype=dtype) # (f c)
349
+ pos_embed = self.pos_embedding_temp(pos_embed).unsqueeze(0) # (1, f, c)
350
+
351
+ seq_len = hidden_states.shape[1]
352
+ hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length)
353
+ norm_hidden_states = (
354
+ self.norm_temp(hidden_states + pos_embed, timestep) if self.use_ada_layer_norm else self.norm_temp(
355
+ hidden_states + pos_embed)
356
+ )
357
+ hidden_states = self.attn_temp(norm_hidden_states) + hidden_states
358
+ hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=seq_len)
359
+
360
+ # 4. Feed-forward
361
+ norm_hidden_states = self.norm3(hidden_states)
362
+
363
+ if self.use_ada_layer_norm_zero:
364
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
365
+
366
+ ff_output = self.ff(norm_hidden_states)
367
+
368
+ if self.use_ada_layer_norm_zero:
369
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
370
+
371
+ hidden_states = ff_output + hidden_states
372
+
373
+ return hidden_states
374
+
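
The forward pass above handles the frame axis by folding it into the batch: video latents (b, c, f, h, w) become (b·f, c, h, w) for the spatial and cross-attention blocks, per-frame conditioning (b, f, n, c) becomes (b·f, n, c), and the temporal attention inside each block folds the spatial positions instead. A tiny runnable sketch of just those einops patterns (shapes are illustrative, not taken from a real checkpoint):

import torch
from einops import rearrange

b, c, f, h, w = 2, 320, 12, 8, 8
hidden_states = torch.randn(b, c, f, h, w)
text_cond = torch.randn(b, f, 77, 768)            # per-frame text encodings
audio_cond = torch.randn(b, f, 1 + 12 * 19, 768)  # per-frame audio tokens (leading token + 12x19 patches)

# Fold frames into the batch for the spatial / cross-attention blocks
x = rearrange(hidden_states, "b c f h w -> (b f) c h w")
text_cond = rearrange(text_cond, "b f n c -> (b f) n c")
audio_cond = rearrange(audio_cond, "b f n c -> (b f) n c")

# Inside a block, temporal attention folds spatial positions instead
tokens = rearrange(x, "(b f) c h w -> (b f) (h w) c", f=f)
temporal = rearrange(tokens, "(b f) d c -> (b d) f c", f=f)   # attend across the 12 frames
tokens = rearrange(temporal, "(b d) f c -> (b f) d c", d=h * w)

# Unfold back to video layout at the end of the transformer
out = rearrange(x, "(b f) c h w -> b c f h w", f=f)
print(out.shape)  # torch.Size([2, 320, 12, 8, 8])
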
ff_spatio_temp_resnet_3d.py ADDED
@@ -0,0 +1,191 @@
1
+ # Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/resnet.py
2
+ from einops import rearrange
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+ from unet_utils import FFInflatedConv3d
8
+
9
+
10
+ class FFSpatioTempResUpsample3D(nn.Module):
11
+ def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name="conv"):
12
+ super().__init__()
13
+ self.channels = channels
14
+ self.out_channels = out_channels or channels
15
+ self.use_conv = use_conv
16
+ self.use_conv_transpose = use_conv_transpose
17
+ self.name = name
18
+
19
+ conv = None
20
+ if use_conv_transpose:
21
+ raise NotImplementedError
22
+ elif use_conv:
23
+ conv = FFInflatedConv3d(self.channels, self.out_channels, 3, padding=1)
24
+
25
+ if name == "conv":
26
+ self.conv = conv
27
+ else:
28
+ self.Conv2d_0 = conv
29
+
30
+ def forward(self, hidden_states, output_size=None):
31
+ assert hidden_states.shape[1] == self.channels
32
+
33
+ if self.use_conv_transpose:
34
+ raise NotImplementedError
35
+
36
+ # Cast to float32 as the 'upsample_nearest2d_out_frame' op does not support bfloat16
37
+ dtype = hidden_states.dtype
38
+ if dtype == torch.bfloat16:
39
+ hidden_states = hidden_states.to(torch.float32)
40
+
41
+ # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
42
+ if hidden_states.shape[0] >= 64:
43
+ hidden_states = hidden_states.contiguous()
44
+
45
+ # if `output_size` is passed we force the interpolation output
46
+ # size and do not make use of `scale_factor=2`
47
+ if output_size is None:
48
+ hidden_states = F.interpolate(hidden_states, scale_factor=[1.0, 2.0, 2.0], mode="nearest")
49
+ else:
50
+ hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest")
51
+
52
+ # If the input is bfloat16, we cast back to bfloat16
53
+ if dtype == torch.bfloat16:
54
+ hidden_states = hidden_states.to(dtype)
55
+
56
+ if self.use_conv:
57
+ if self.name == "conv":
58
+ hidden_states = self.conv(hidden_states)
59
+ else:
60
+ hidden_states = self.Conv2d_0(hidden_states)
61
+
62
+ return hidden_states
63
+
64
+
65
+ class FFSpatioTempResDownsample3D(nn.Module):
66
+ def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"):
67
+ super().__init__()
68
+ self.channels = channels
69
+ self.out_channels = out_channels or channels
70
+ self.use_conv = use_conv
71
+ self.padding = padding
72
+ stride = 2
73
+ self.name = name
74
+
75
+ if use_conv:
76
+ conv = FFInflatedConv3d(self.channels, self.out_channels, 3, stride=stride, padding=padding)
77
+ else:
78
+ raise NotImplementedError
79
+
80
+ if name == "conv":
81
+ self.Conv2d_0 = conv
82
+ self.conv = conv
83
+ elif name == "Conv2d_0":
84
+ self.conv = conv
85
+ else:
86
+ self.conv = conv
87
+
88
+ def forward(self, hidden_states):
89
+ assert hidden_states.shape[1] == self.channels
90
+ if self.use_conv and self.padding == 0:
91
+ raise NotImplementedError
92
+
93
+ assert hidden_states.shape[1] == self.channels
94
+ hidden_states = self.conv(hidden_states)
95
+
96
+ return hidden_states
97
+
98
+
99
+ class FFSpatioTempResnetBlock3D(nn.Module):
100
+ def __init__(
101
+ self,
102
+ *,
103
+ in_channels,
104
+ out_channels=None,
105
+ conv_shortcut=False,
106
+ dropout=0.0,
107
+ temb_channels=512,
108
+ groups=32,
109
+ groups_out=None,
110
+ pre_norm=True,
111
+ eps=1e-6,
112
+ non_linearity="swish",
113
+ time_embedding_norm="default",
114
+ output_scale_factor=1.0,
115
+ use_in_shortcut=None
116
+ ):
117
+ super().__init__()
118
+ self.pre_norm = pre_norm
119
+ self.pre_norm = True
120
+ self.in_channels = in_channels
121
+ out_channels = in_channels if out_channels is None else out_channels
122
+ self.out_channels = out_channels
123
+ self.use_conv_shortcut = conv_shortcut
124
+ self.time_embedding_norm = time_embedding_norm
125
+ self.output_scale_factor = output_scale_factor
126
+
127
+ if groups_out is None:
128
+ groups_out = groups
129
+
130
+ self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
131
+
132
+ self.conv1 = FFInflatedConv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
133
+
134
+ if temb_channels is not None:
135
+ if self.time_embedding_norm == "default":
136
+ time_emb_proj_out_channels = out_channels
137
+ elif self.time_embedding_norm == "scale_shift":
138
+ time_emb_proj_out_channels = out_channels * 2
139
+ else:
140
+ raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ")
141
+
142
+ self.time_emb_proj = torch.nn.Linear(temb_channels, time_emb_proj_out_channels)
143
+ else:
144
+ self.time_emb_proj = None
145
+
146
+ self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
147
+ self.dropout = torch.nn.Dropout(dropout)
148
+ self.conv2 = FFInflatedConv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
149
+
150
+ if non_linearity == "swish":
151
+ self.nonlinearity = lambda x: F.silu(x)
152
+ elif non_linearity == "silu":
153
+ self.nonlinearity = nn.SiLU()
154
+
155
+ self.use_in_shortcut = self.in_channels != self.out_channels if use_in_shortcut is None else use_in_shortcut
156
+
157
+ self.conv_shortcut = None
158
+ if self.use_in_shortcut:
159
+ self.conv_shortcut = FFInflatedConv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
160
+
161
+ def forward(self, input_tensor, temb):
162
+ hidden_states = input_tensor
163
+
164
+ hidden_states = self.norm1(hidden_states)
165
+ hidden_states = self.nonlinearity(hidden_states)
166
+
167
+ hidden_states = self.conv1(hidden_states)
168
+
169
+ if temb is not None:
170
+ temb = rearrange(self.time_emb_proj(self.nonlinearity(temb)), "b f c -> b c f")[:, :, :, None, None]
171
+
172
+ if temb is not None and self.time_embedding_norm == "default":
173
+ hidden_states = hidden_states + temb
174
+
175
+ hidden_states = self.norm2(hidden_states)
176
+
177
+ if temb is not None and self.time_embedding_norm == "scale_shift":
178
+ scale, shift = torch.chunk(temb, 2, dim=1)
179
+ hidden_states = hidden_states * (1 + scale) + shift
180
+
181
+ hidden_states = self.nonlinearity(hidden_states)
182
+
183
+ hidden_states = self.dropout(hidden_states)
184
+ hidden_states = self.conv2(hidden_states)
185
+
186
+ if self.conv_shortcut is not None:
187
+ input_tensor = self.conv_shortcut(input_tensor)
188
+
189
+ output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
190
+
191
+ return output_tensor
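
One detail worth flagging in FFSpatioTempResnetBlock3D.forward above: the timestep embedding arrives per frame as (b, f, temb_channels), is projected, and is then broadcast over the spatial dims as (b, c, f, 1, 1), so every frame can carry its own timestep. A minimal sketch of that broadcasting (the plain linear layer below stands in for self.time_emb_proj and omits the nonlinearity):

import torch
import torch.nn as nn
from einops import rearrange

b, f, temb_channels, out_channels = 2, 12, 1280, 320
time_emb_proj = nn.Linear(temb_channels, out_channels)  # stand-in for self.time_emb_proj

temb = torch.randn(b, f, temb_channels)                  # one embedding per frame
temb = rearrange(time_emb_proj(temb), "b f c -> b c f")[:, :, :, None, None]

hidden_states = torch.randn(b, out_channels, f, 16, 16)  # (b, c, f, h, w) feature map
hidden_states = hidden_states + temb                     # broadcast over h and w
print(hidden_states.shape)  # torch.Size([2, 320, 12, 16, 16])
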
ff_spatio_temp_transformer_3d.py ADDED
@@ -0,0 +1,331 @@
1
+ # Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Optional
5
+ from einops import rearrange
6
+
7
+ import torch
8
+ from torch import nn
9
+
10
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
11
+ from diffusers.models.modeling_utils import ModelMixin
12
+ from diffusers.utils import BaseOutput
13
+ from diffusers.utils.import_utils import is_xformers_available
14
+ from diffusers.models.attention import Attention
15
+ from diffusers.models.attention import FeedForward, AdaLayerNorm, AdaLayerNormZero
16
+ from diffusers.models.embeddings import Timesteps, TimestepEmbedding
17
+
18
+ from unet_utils import FFAttention
19
+
20
+
21
+ @dataclass
22
+ class SpatioTempTransformer3DModelOutput(BaseOutput):
23
+ sample: torch.Tensor
24
+
25
+
26
+ if is_xformers_available():
27
+ import xformers
28
+ import xformers.ops
29
+ else:
30
+ xformers = None
31
+
32
+
33
+ class FFSpatioTempTransformer3DModel(ModelMixin, ConfigMixin):
34
+ @register_to_config
35
+ def __init__(
36
+ self,
37
+ num_attention_heads: int = 16,
38
+ attention_head_dim: int = 88,
39
+ in_channels: Optional[int] = None,
40
+ num_layers: int = 1,
41
+ dropout: float = 0.0,
42
+ norm_num_groups: int = 32,
43
+ cross_attention_dim: Optional[int] = None,
44
+ attention_bias: bool = False,
45
+ activation_fn: str = "geglu",
46
+ num_embeds_ada_norm: Optional[int] = None,
47
+ use_linear_projection: bool = False,
48
+ only_cross_attention: bool = False,
49
+ upcast_attention: bool = False,
50
+ ):
51
+ super().__init__()
52
+ self.use_linear_projection = use_linear_projection
53
+ self.num_attention_heads = num_attention_heads
54
+ self.attention_head_dim = attention_head_dim
55
+ inner_dim = num_attention_heads * attention_head_dim
56
+
57
+ # Define input layers
58
+ self.in_channels = in_channels
59
+
60
+ self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
61
+ if use_linear_projection:
62
+ self.proj_in = nn.Linear(in_channels, inner_dim)
63
+ else:
64
+ self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
65
+
66
+ # Define transformers blocks
67
+ self.transformer_blocks = nn.ModuleList(
68
+ [
69
+ BasicTransformerBlock(
70
+ inner_dim,
71
+ num_attention_heads,
72
+ attention_head_dim,
73
+ dropout=dropout,
74
+ cross_attention_dim=cross_attention_dim,
75
+ activation_fn=activation_fn,
76
+ num_embeds_ada_norm=num_embeds_ada_norm,
77
+ attention_bias=attention_bias,
78
+ only_cross_attention=only_cross_attention,
79
+ upcast_attention=upcast_attention,
80
+ )
81
+ for d in range(num_layers)
82
+ ]
83
+ )
84
+
85
+ # 4. Define output layers
86
+ if use_linear_projection:
87
+ self.proj_out = nn.Linear(in_channels, inner_dim)
88
+ else:
89
+ self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
90
+
91
+ def forward(
92
+ self,
93
+ hidden_states,
94
+ encoder_hidden_states=None,
95
+ timestep=None,
96
+ class_labels=None,
97
+ cross_attention_kwargs=None,
98
+ return_dict: bool = True):
99
+ # Input
100
+ assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
101
+ video_length = hidden_states.shape[2]
102
+ hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
103
+ encoder_hidden_states = rearrange(encoder_hidden_states, 'b f n c -> (b f) n c')
104
+
105
+ batch, channel, height, weight = hidden_states.shape
106
+ residual = hidden_states
107
+
108
+ hidden_states = self.norm(hidden_states)
109
+ if not self.use_linear_projection:
110
+ hidden_states = self.proj_in(hidden_states)
111
+ inner_dim = hidden_states.shape[1]
112
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
113
+ else:
114
+ inner_dim = hidden_states.shape[1]
115
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
116
+ hidden_states = self.proj_in(hidden_states)
117
+
118
+ # Blocks
119
+ for block in self.transformer_blocks:
120
+ hidden_states = block(
121
+ hidden_states,
122
+ encoder_hidden_states=encoder_hidden_states,
123
+ timestep=timestep,
124
+ video_length=video_length,
125
+ cross_attention_kwargs=cross_attention_kwargs,
126
+ class_labels=class_labels
127
+ )
128
+
129
+ # Output
130
+ if not self.use_linear_projection:
131
+ hidden_states = (
132
+ hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()
133
+ )
134
+ hidden_states = self.proj_out(hidden_states)
135
+ else:
136
+ hidden_states = self.proj_out(hidden_states)
137
+ hidden_states = (
138
+ hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()
139
+ )
140
+
141
+ output = hidden_states + residual
142
+
143
+ output = rearrange(output, "(b f) c h w -> b c f h w", f=video_length)
144
+ if not return_dict:
145
+ return (output,)
146
+
147
+ return SpatioTempTransformer3DModelOutput(sample=output)
148
+
149
+
150
+ class BasicTransformerBlock(nn.Module):
151
+ def __init__(
152
+ self,
153
+ dim: int,
154
+ num_attention_heads: int,
155
+ attention_head_dim: int,
156
+ dropout=0.0,
157
+ cross_attention_dim: Optional[int] = None,
158
+ activation_fn: str = "geglu",
159
+ num_embeds_ada_norm: Optional[int] = None,
160
+ attention_bias: bool = False,
161
+ only_cross_attention: bool = False,
162
+ double_self_attention: bool = False,
163
+ upcast_attention: bool = False,
164
+ norm_elementwise_affine: bool = True,
165
+ norm_type: str = "layer_norm",
166
+ final_dropout: bool = False,
167
+ ):
168
+ super().__init__()
169
+ self.only_cross_attention = only_cross_attention
170
+
171
+ self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
172
+ self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
173
+
174
+ if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
175
+ raise ValueError(
176
+ f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
177
+ f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
178
+ )
179
+
180
+ # Define 4 blocks. Each block has its own normalization layer.
181
+ # 1. FF-Attn
182
+ if self.use_ada_layer_norm:
183
+ self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
184
+ elif self.use_ada_layer_norm_zero:
185
+ self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
186
+ else:
187
+ self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
188
+ self.attn1 = FFAttention(
189
+ query_dim=dim,
190
+ heads=num_attention_heads,
191
+ dim_head=attention_head_dim,
192
+ dropout=dropout,
193
+ bias=attention_bias,
194
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
195
+ upcast_attention=upcast_attention,
196
+ )
197
+
198
+ # 2. Cross-Attn
199
+ if cross_attention_dim is not None or double_self_attention:
200
+ # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
201
+ # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
202
+ # the second cross attention block.
203
+ self.norm2 = (
204
+ AdaLayerNorm(dim, num_embeds_ada_norm)
205
+ if self.use_ada_layer_norm
206
+ else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
207
+ )
208
+ self.attn2 = Attention(
209
+ query_dim=dim,
210
+ cross_attention_dim=cross_attention_dim if not double_self_attention else None,
211
+ heads=num_attention_heads,
212
+ dim_head=attention_head_dim,
213
+ dropout=dropout,
214
+ bias=attention_bias,
215
+ upcast_attention=upcast_attention,
216
+ ) # is self-attn if encoder_hidden_states is none
217
+ else:
218
+ self.norm2 = None
219
+ self.attn2 = None
220
+
221
+ # 3. Temp-Attn
222
+
223
+ self.pos_proj_temp = Timesteps(dim, flip_sin_to_cos=True, downscale_freq_shift=0)
224
+ self.pos_embedding_temp = TimestepEmbedding(
225
+ dim,
226
+ dim,
227
+ act_fn="silu",
228
+ post_act_fn=None,
229
+ cond_proj_dim=None,
230
+ )
231
+
232
+ self.attn_temp = Attention(
233
+ query_dim=dim,
234
+ heads=num_attention_heads,
235
+ dim_head=attention_head_dim,
236
+ dropout=dropout,
237
+ bias=attention_bias,
238
+ upcast_attention=upcast_attention,
239
+ )
240
+ nn.init.zeros_(self.attn_temp.to_out[0].weight.data)
241
+ self.norm_temp = (
242
+ AdaLayerNorm(dim, num_embeds_ada_norm)
243
+ if self.use_ada_layer_norm
244
+ else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
245
+ )
246
+
247
+ # 4. Feed-forward
248
+ self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
249
+ self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
250
+
251
+ def forward(
252
+ self,
253
+ hidden_states,
254
+ attention_mask=None,
255
+ encoder_hidden_states=None,
256
+ encoder_attention_mask=None,
257
+ timestep=None,
258
+ video_length=None,
259
+ cross_attention_kwargs=None,
260
+ class_labels=None,
261
+ ):
262
+ # Notice that normalization is always applied before the real computation in the following blocks.
263
+ # 1. Self-Attention
264
+ if self.use_ada_layer_norm:
265
+ norm_hidden_states = self.norm1(hidden_states, timestep)
266
+ elif self.use_ada_layer_norm_zero:
267
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
268
+ hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
269
+ )
270
+ else:
271
+ norm_hidden_states = self.norm1(hidden_states)
272
+
273
+ cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
274
+ attn_output = self.attn1(
275
+ norm_hidden_states,
276
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
277
+ attention_mask=attention_mask,
278
+ video_length=video_length,
279
+ **cross_attention_kwargs,
280
+ )
281
+ if self.use_ada_layer_norm_zero:
282
+ attn_output = gate_msa.unsqueeze(1) * attn_output
283
+ hidden_states = attn_output + hidden_states
284
+
285
+ # 2. Cross-Attention
286
+ if self.attn2 is not None:
287
+ norm_hidden_states = (
288
+ self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
289
+ )
290
+ # TODO (Birch-San): Here we should prepare the encoder_attention mask correctly
291
+ # prepare attention mask here
292
+
293
+ attn_output = self.attn2(
294
+ norm_hidden_states,
295
+ encoder_hidden_states=encoder_hidden_states,
296
+ attention_mask=encoder_attention_mask,
297
+ **cross_attention_kwargs,
298
+ )
299
+ hidden_states = attn_output + hidden_states
300
+
301
+ # 3. Temporal-Attention
302
+
303
+ # Add positional embedding
304
+ device = hidden_states.device
305
+ dtype = hidden_states.dtype
306
+ pos_embed = self.pos_proj_temp(torch.arange(video_length).long()).to(device=device, dtype=dtype) # (f c)
307
+ pos_embed = self.pos_embedding_temp(pos_embed).unsqueeze(0) # (1, f, c)
308
+
309
+ seq_len = hidden_states.shape[1]
310
+ hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length)
311
+ norm_hidden_states = (
312
+ self.norm_temp(hidden_states + pos_embed, timestep) if self.use_ada_layer_norm else self.norm_temp(hidden_states+pos_embed)
313
+ )
314
+ hidden_states = self.attn_temp(norm_hidden_states) + hidden_states
315
+ hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=seq_len)
316
+
317
+ # 4. Feed-forward
318
+ norm_hidden_states = self.norm3(hidden_states)
319
+
320
+ if self.use_ada_layer_norm_zero:
321
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
322
+
323
+ ff_output = self.ff(norm_hidden_states)
324
+
325
+ if self.use_ada_layer_norm_zero:
326
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
327
+
328
+ hidden_states = ff_output + hidden_states
329
+
330
+ return hidden_states
331
+
imagebind/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from imagebind import data
2
+ from imagebind.models import imagebind_model
3
+ from imagebind.models.imagebind_model import ModalityType
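
For context, the package re-exported here is the stock ImageBind release, so the usual embedding workflow should apply. This is a hedged sketch: imagebind_model.py itself is not shown in this diff, and imagebind_huge(pretrained=True) downloads the ~4.8 GB checkpoint referenced at the top of this commit (.checkpoints/imagebind_huge.pth).

import torch
from imagebind import data
from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType

device = "cpu"
model = imagebind_model.imagebind_huge(pretrained=True).eval().to(device)

inputs = {
    ModalityType.AUDIO: data.load_and_transform_audio_data(
        ["assets/lions_roaring.wav"], device
    ),
}
with torch.no_grad():
    embeddings = model(inputs)
print(embeddings[ModalityType.AUDIO].shape)  # pooled audio embedding in the joint space
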
imagebind/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (335 Bytes).
 
imagebind/__pycache__/data.cpython-310.pyc ADDED
Binary file (9.37 kB).
 
imagebind/bpe/bpe_simple_vocab_16e6.txt.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
3
+ size 1356917
imagebind/data.py ADDED
@@ -0,0 +1,343 @@
1
+ #!/usr/bin/env python3
2
+ # Portions Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ # All rights reserved.
4
+
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ import logging
9
+ import math
10
+ import pkg_resources
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+ import torchaudio
15
+ from PIL import Image
16
+ from pytorchvideo import transforms as pv_transforms
17
+ from pytorchvideo.data.clip_sampling import ConstantClipsPerVideoSampler
18
+ from pytorchvideo.data.encoded_video import EncodedVideo
19
+ from torchvision import transforms
20
+ from torchvision.transforms._transforms_video import NormalizeVideo
21
+
22
+ from imagebind.models.multimodal_preprocessors import SimpleTokenizer
23
+
24
+ DEFAULT_AUDIO_FRAME_SHIFT_MS = 10 # in milliseconds
25
+
26
+
27
+ def return_bpe_path():
28
+ return pkg_resources.resource_filename(
29
+ "imagebind", "bpe/bpe_simple_vocab_16e6.txt.gz"
30
+ )
31
+
32
+
33
+ def waveform2melspec(waveform, sample_rate, num_mel_bins, target_length):
34
+ # Based on https://github.com/YuanGongND/ast/blob/d7d8b4b8e06cdaeb6c843cdb38794c1c7692234c/src/dataloader.py#L102
35
+ waveform -= waveform.mean()
36
+ fbank = torchaudio.compliance.kaldi.fbank(
37
+ waveform,
38
+ htk_compat=True,
39
+ sample_frequency=sample_rate,
40
+ use_energy=False,
41
+ window_type="hanning",
42
+ num_mel_bins=num_mel_bins,
43
+ dither=0.0,
44
+ frame_length=25,
45
+ frame_shift=DEFAULT_AUDIO_FRAME_SHIFT_MS,
46
+ )
47
+ # Convert to [mel_bins, num_frames] shape
48
+ fbank = fbank.transpose(0, 1)
49
+ # Pad to target_length
50
+ n_frames = fbank.size(1)
51
+ p = target_length - n_frames
52
+ # if p is too large (say >20%), flash a warning
53
+ if abs(p) / n_frames > 0.2:
54
+ logging.warning(
55
+ "Large gap between audio n_frames(%d) and "
56
+ "target_length (%d). Is the audio_target_length "
57
+ "setting correct?",
58
+ n_frames,
59
+ target_length,
60
+ )
61
+ # cut and pad
62
+ if p > 0:
63
+ fbank = torch.nn.functional.pad(fbank, (0, p), mode="constant", value=0)
64
+ elif p < 0:
65
+ fbank = fbank[:, 0:target_length]
66
+ # Convert to [1, mel_bins, num_frames] shape, essentially like a 1
67
+ # channel image
68
+ fbank = fbank.unsqueeze(0)
69
+ return fbank
70
+
71
+
72
+ def get_clip_timepoints(clip_sampler, duration):
73
+ # Read out all clips in this video
74
+ all_clips_timepoints = []
75
+ is_last_clip = False
76
+ end = 0.0
77
+ while not is_last_clip:
78
+ start, end, _, _, is_last_clip = clip_sampler(end, duration, annotation=None)
79
+ all_clips_timepoints.append((start, end))
80
+ return all_clips_timepoints
81
+
82
+
83
+ def load_and_transform_vision_data(image_paths, device):
84
+ if image_paths is None:
85
+ return None
86
+
87
+ image_outputs = []
88
+
89
+ data_transform = transforms.Compose(
90
+ [
91
+ transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
92
+ transforms.CenterCrop(224),
93
+ transforms.ToTensor(),
94
+ transforms.Normalize(
95
+ mean=(0.48145466, 0.4578275, 0.40821073),
96
+ std=(0.26862954, 0.26130258, 0.27577711),
97
+ ),
98
+ ]
99
+ )
100
+
101
+ for image_path in image_paths:
102
+ with open(image_path, "rb") as fopen:
103
+ image = Image.open(fopen).convert("RGB")
104
+
105
+ image = data_transform(image).to(device)
106
+ image_outputs.append(image)
107
+ return torch.stack(image_outputs, dim=0)
108
+
109
+
110
+ def load_and_transform_text(text, device):
111
+ if text is None:
112
+ return None
113
+ tokenizer = SimpleTokenizer(bpe_path=return_bpe_path())
114
+ tokens = [tokenizer(t).unsqueeze(0).to(device) for t in text]
115
+ tokens = torch.cat(tokens, dim=0)
116
+ return tokens
117
+
118
+
119
+ def load_and_transform_audio_data(
120
+ audio_paths,
121
+ device,
122
+ num_mel_bins=128,
123
+ target_length=204,
124
+ sample_rate=16000,
125
+ clip_duration=2,
126
+ clips_per_video=3,
127
+ mean=-4.268,
128
+ std=9.138,
129
+ ):
130
+ if audio_paths is None:
131
+ return None
132
+
133
+ audio_outputs = []
134
+ clip_sampler = ConstantClipsPerVideoSampler(
135
+ clip_duration=clip_duration, clips_per_video=clips_per_video
136
+ )
137
+
138
+ for audio_path in audio_paths:
139
+ waveform, sr = torchaudio.load(audio_path)
140
+ if sample_rate != sr:
141
+ waveform = torchaudio.functional.resample(
142
+ waveform, orig_freq=sr, new_freq=sample_rate
143
+ )
144
+ all_clips_timepoints = get_clip_timepoints(
145
+ clip_sampler, waveform.size(1) / sample_rate
146
+ )
147
+ all_clips = []
148
+ for clip_timepoints in all_clips_timepoints:
149
+ waveform_clip = waveform[
150
+ :,
151
+ int(clip_timepoints[0] * sample_rate) : int(
152
+ clip_timepoints[1] * sample_rate
153
+ ),
154
+ ]
155
+ waveform_melspec = waveform2melspec(
156
+ waveform_clip, sample_rate, num_mel_bins, target_length
157
+ )
158
+ all_clips.append(waveform_melspec)
159
+
160
+ normalize = transforms.Normalize(mean=mean, std=std)
161
+ all_clips = [normalize(ac).to(device) for ac in all_clips]
162
+
163
+ all_clips = torch.stack(all_clips, dim=0)
164
+ audio_outputs.append(all_clips)
165
+
166
+ return torch.stack(audio_outputs, dim=0)
167
+
168
+
169
+ def crop_boxes(boxes, x_offset, y_offset):
170
+ """
171
+ Perform crop on the bounding boxes given the offsets.
172
+ Args:
173
+ boxes (ndarray or None): bounding boxes to perform crop. The dimension
174
+ is `num boxes` x 4.
175
+ x_offset (int): cropping offset in the x axis.
176
+ y_offset (int): cropping offset in the y axis.
177
+ Returns:
178
+ cropped_boxes (ndarray or None): the cropped boxes with dimension of
179
+ `num boxes` x 4.
180
+ """
181
+ cropped_boxes = boxes.copy()
182
+ cropped_boxes[:, [0, 2]] = boxes[:, [0, 2]] - x_offset
183
+ cropped_boxes[:, [1, 3]] = boxes[:, [1, 3]] - y_offset
184
+
185
+ return cropped_boxes
186
+
187
+
188
+ def uniform_crop(images, size, spatial_idx, boxes=None, scale_size=None):
189
+ """
190
+ Perform uniform spatial sampling on the images and corresponding boxes.
191
+ Args:
192
+ images (tensor): images to perform uniform crop. The dimension is
193
+ `num frames` x `channel` x `height` x `width`.
194
+ size (int): size of height and width to crop the images.
195
+ spatial_idx (int): 0, 1, or 2 for left, center, and right crop if width
196
+ is larger than height. Or 0, 1, or 2 for top, center, and bottom
197
+ crop if height is larger than width.
198
+ boxes (ndarray or None): optional. Corresponding boxes to images.
199
+ Dimension is `num boxes` x 4.
200
+ scale_size (int): optional. If not None, resize the images to scale_size before
201
+ performing any crop.
202
+ Returns:
203
+ cropped (tensor): images with dimension of
204
+ `num frames` x `channel` x `size` x `size`.
205
+ cropped_boxes (ndarray or None): the cropped boxes with dimension of
206
+ `num boxes` x 4.
207
+ """
208
+ assert spatial_idx in [0, 1, 2]
209
+ ndim = len(images.shape)
210
+ if ndim == 3:
211
+ images = images.unsqueeze(0)
212
+ height = images.shape[2]
213
+ width = images.shape[3]
214
+
215
+ if scale_size is not None:
216
+ if width <= height:
217
+ width, height = scale_size, int(height / width * scale_size)
218
+ else:
219
+ width, height = int(width / height * scale_size), scale_size
220
+ images = torch.nn.functional.interpolate(
221
+ images,
222
+ size=(height, width),
223
+ mode="bilinear",
224
+ align_corners=False,
225
+ )
226
+
227
+ y_offset = int(math.ceil((height - size) / 2))
228
+ x_offset = int(math.ceil((width - size) / 2))
229
+
230
+ if height > width:
231
+ if spatial_idx == 0:
232
+ y_offset = 0
233
+ elif spatial_idx == 2:
234
+ y_offset = height - size
235
+ else:
236
+ if spatial_idx == 0:
237
+ x_offset = 0
238
+ elif spatial_idx == 2:
239
+ x_offset = width - size
240
+ cropped = images[:, :, y_offset : y_offset + size, x_offset : x_offset + size]
241
+ cropped_boxes = crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None
242
+ if ndim == 3:
243
+ cropped = cropped.squeeze(0)
244
+ return cropped, cropped_boxes
245
+
246
+
247
+ class SpatialCrop(nn.Module):
248
+ """
249
+ Convert the video into 3 smaller clips spatially. Must be used after the
250
+ temporal crops to get spatial crops, and should be used with
251
+ -2 in the spatial crop at the slowfast augmentation stage (so full
252
+ frames are passed in here). Will return a larger list with the
253
+ 3x spatial crops as well.
254
+ """
255
+
256
+ def __init__(self, crop_size: int = 224, num_crops: int = 3):
257
+ super().__init__()
258
+ self.crop_size = crop_size
259
+ if num_crops == 3:
260
+ self.crops_to_ext = [0, 1, 2]
261
+ self.flipped_crops_to_ext = []
262
+ elif num_crops == 1:
263
+ self.crops_to_ext = [1]
264
+ self.flipped_crops_to_ext = []
265
+ else:
266
+ raise NotImplementedError("Nothing else supported yet")
267
+
268
+ def forward(self, videos):
269
+ """
270
+ Args:
271
+ videos: A list of C, T, H, W videos.
272
+ Returns:
273
+ videos: A list with 3x the number of elements. Each video converted
274
+ to C, T, H', W' by spatial cropping.
275
+ """
276
+ assert isinstance(videos, list), "Must be a list of videos after temporal crops"
277
+ assert all([video.ndim == 4 for video in videos]), "Must be (C,T,H,W)"
278
+ res = []
279
+ for video in videos:
280
+ for spatial_idx in self.crops_to_ext:
281
+ res.append(uniform_crop(video, self.crop_size, spatial_idx)[0])
282
+ if not self.flipped_crops_to_ext:
283
+ continue
284
+ flipped_video = transforms.functional.hflip(video)
285
+ for spatial_idx in self.flipped_crops_to_ext:
286
+ res.append(uniform_crop(flipped_video, self.crop_size, spatial_idx)[0])
287
+ return res
288
+
289
+
290
+ def load_and_transform_video_data(
291
+ video_paths,
292
+ device,
293
+ clip_duration=2,
294
+ clips_per_video=5,
295
+ sample_rate=16000,
296
+ ):
297
+ if video_paths is None:
298
+ return None
299
+
300
+ video_outputs = []
301
+ video_transform = transforms.Compose(
302
+ [
303
+ pv_transforms.ShortSideScale(224),
304
+ NormalizeVideo(
305
+ mean=(0.48145466, 0.4578275, 0.40821073),
306
+ std=(0.26862954, 0.26130258, 0.27577711),
307
+ ),
308
+ ]
309
+ )
310
+
311
+ clip_sampler = ConstantClipsPerVideoSampler(
312
+ clip_duration=clip_duration, clips_per_video=clips_per_video
313
+ )
314
+ frame_sampler = pv_transforms.UniformTemporalSubsample(num_samples=clip_duration)
315
+
316
+ for video_path in video_paths:
317
+ video = EncodedVideo.from_path(
318
+ video_path,
319
+ decoder="decord",
320
+ decode_audio=False,
321
+ **{"sample_rate": sample_rate},
322
+ )
323
+
324
+ all_clips_timepoints = get_clip_timepoints(clip_sampler, video.duration)
325
+
326
+ all_video = []
327
+ for clip_timepoints in all_clips_timepoints:
328
+ # Read the clip, get frames
329
+ clip = video.get_clip(clip_timepoints[0], clip_timepoints[1])
330
+ if clip is None:
331
+ raise ValueError("No clip found")
332
+ video_clip = frame_sampler(clip["video"])
333
+ video_clip = video_clip / 255.0 # since this is float, need 0-1
334
+
335
+ all_video.append(video_clip)
336
+
337
+ all_video = [video_transform(clip) for clip in all_video]
338
+ all_video = SpatialCrop(224, num_crops=3)(all_video)
339
+
340
+ all_video = torch.stack(all_video, dim=0)
341
+ video_outputs.append(all_video)
342
+
343
+ return torch.stack(video_outputs, dim=0).to(device)
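Taken together, these loaders produce the per-modality tensors that ImageBindModel.forward consumes as a dict keyed by ModalityType. A minimal sketch of that wiring (illustrative only, not code from this commit), assuming the repository is importable as the imagebind package and using placeholder file paths:

import torch

from imagebind import data
from imagebind.models.imagebind_model import ModalityType

device = "cuda" if torch.cuda.is_available() else "cpu"

# Placeholder paths: any local image and audio file will do.
inputs = {
    ModalityType.TEXT: data.load_and_transform_text(["a lion roaring"], device),
    ModalityType.VISION: data.load_and_transform_vision_data(["example.jpg"], device),
    ModalityType.AUDIO: data.load_and_transform_audio_data(["example.wav"], device),
}
# Expected shapes: text (1, 77) token ids, vision (1, 3, 224, 224),
# audio (1, clips_per_video, 1, num_mel_bins, target_length) = (1, 3, 1, 128, 204);
# a 2 s clip at the 10 ms frame shift gives roughly 200 fbank frames, padded to 204 above.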
imagebind/models/__init__.py ADDED
File without changes
imagebind/models/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (169 Bytes)
imagebind/models/__pycache__/helpers.cpython-310.pyc ADDED
Binary file (5.14 kB)
imagebind/models/__pycache__/imagebind_model.cpython-310.pyc ADDED
Binary file (8.3 kB)
imagebind/models/__pycache__/multimodal_preprocessors.cpython-310.pyc ADDED
Binary file (19.9 kB)
imagebind/models/__pycache__/transformer.cpython-310.pyc ADDED
Binary file (8.01 kB)
imagebind/models/helpers.py ADDED
@@ -0,0 +1,140 @@
1
+ #!/usr/bin/env python3
2
+ # Portions Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ # All rights reserved.
4
+
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+
9
+ import einops
10
+ import numpy as np
11
+ import torch
12
+ import torch.nn as nn
13
+
14
+
15
+ class Normalize(nn.Module):
16
+ def __init__(self, dim: int) -> None:
17
+ super().__init__()
18
+ self.dim = dim
19
+
20
+ def forward(self, x):
21
+ return torch.nn.functional.normalize(x, dim=self.dim, p=2)
22
+
23
+
24
+ class LearnableLogitScaling(nn.Module):
25
+ def __init__(
26
+ self,
27
+ logit_scale_init: float = 1 / 0.07,
28
+ learnable: bool = True,
29
+ max_logit_scale: float = 100,
30
+ ) -> None:
31
+ super().__init__()
32
+ self.max_logit_scale = max_logit_scale
33
+ self.logit_scale_init = logit_scale_init
34
+ self.learnable = learnable
35
+ log_logit_scale = torch.ones([]) * np.log(self.logit_scale_init)
36
+ if learnable:
37
+ self.log_logit_scale = nn.Parameter(log_logit_scale)
38
+ else:
39
+ self.register_buffer("log_logit_scale", log_logit_scale)
40
+
41
+ def forward(self, x):
42
+ return torch.clip(self.log_logit_scale.exp(), max=self.max_logit_scale) * x
43
+
44
+ def extra_repr(self):
45
+ st = f"logit_scale_init={self.logit_scale_init},learnable={self.learnable}," \
46
+ f" max_logit_scale={self.max_logit_scale}"
47
+ return st
48
+
49
+
50
+ class EinOpsRearrange(nn.Module):
51
+ def __init__(self, rearrange_expr: str, **kwargs) -> None:
52
+ super().__init__()
53
+ self.rearrange_expr = rearrange_expr
54
+ self.kwargs = kwargs
55
+
56
+ def forward(self, x):
57
+ assert isinstance(x, torch.Tensor)
58
+ return einops.rearrange(x, self.rearrange_expr, **self.kwargs)
59
+
60
+
61
+ class VerboseNNModule(nn.Module):
62
+ """
63
+ Wrapper around nn.Module that prints registered buffers and parameter names.
64
+ """
65
+
66
+ @staticmethod
67
+ def get_readable_tensor_repr(name: str, tensor: torch.Tensor) -> str:
68
+ st = (
69
+ "("
70
+ + name
71
+ + "): "
72
+ + "tensor("
73
+ + str(tuple(tensor[1].shape))
74
+ + ", requires_grad="
75
+ + str(tensor[1].requires_grad)
76
+ + ")\n"
77
+ )
78
+ return st
79
+
80
+ def extra_repr(self) -> str:
81
+ named_modules = set()
82
+ for p in self.named_modules():
83
+ named_modules.update([p[0]])
84
+ named_modules = list(named_modules)
85
+
86
+ string_repr = ""
87
+ for p in self.named_parameters():
88
+ name = p[0].split(".")[0]
89
+ if name not in named_modules:
90
+ string_repr += self.get_readable_tensor_repr(name, p)
91
+
92
+ for p in self.named_buffers():
93
+ name = p[0].split(".")[0]
94
+ string_repr += self.get_readable_tensor_repr(name, p)
95
+
96
+ return string_repr
97
+
98
+
99
+ def cast_if_src_dtype(
100
+ tensor: torch.Tensor, src_dtype: torch.dtype, tgt_dtype: torch.dtype
101
+ ):
102
+ updated = False
103
+ if tensor.dtype == src_dtype:
104
+ tensor = tensor.to(dtype=tgt_dtype)
105
+ updated = True
106
+ return tensor, updated
107
+
108
+
109
+ class QuickGELU(nn.Module):
110
+ # From https://github.com/openai/CLIP/blob/d50d76daa670286dd6cacf3bcd80b5e4823fc8e1/clip/model.py#L166
111
+ def forward(self, x: torch.Tensor):
112
+ return x * torch.sigmoid(1.702 * x)
113
+
114
+
115
+ class SelectElement(nn.Module):
116
+ def __init__(self, index) -> None:
117
+ super().__init__()
118
+ self.index = index
119
+
120
+ def forward(self, x):
121
+ assert x.ndim >= 3
122
+ return x[:, self.index, ...]
123
+
124
+
125
+ class SelectEOSAndProject(nn.Module):
126
+ """
127
+ Text Pooling used in OpenCLIP
128
+ """
129
+
130
+ def __init__(self, proj: nn.Module) -> None:
131
+ super().__init__()
132
+ self.proj = proj
133
+
134
+ def forward(self, x, seq_len):
135
+ assert x.ndim == 3
136
+ # x is of shape B x L x D
137
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
138
+ x = x[torch.arange(x.shape[0]), seq_len]
139
+ x = self.proj(x)
140
+ return x
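These helpers are the building blocks of the modality heads and postprocessors defined in imagebind_model.py below. A small sketch of how Normalize and LearnableLogitScaling compose (illustrative only, not code from this commit):

import torch
import torch.nn as nn

from imagebind.models.helpers import LearnableLogitScaling, Normalize

# Mirrors the audio postprocessor built later: L2-normalize, then apply a frozen logit scale of 20.
post = nn.Sequential(
    Normalize(dim=-1),
    LearnableLogitScaling(logit_scale_init=20.0, learnable=False),
)
emb = torch.randn(4, 1024)
out = post(emb)
print(out.norm(dim=-1))  # every row has norm ~20.0 after normalization and scaling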
imagebind/models/imagebind_model.py ADDED
@@ -0,0 +1,506 @@
1
+ #!/usr/bin/env python3
2
+ # Portions Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ # All rights reserved.
4
+
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+
9
+ import os
10
+ from functools import partial
11
+ from types import SimpleNamespace
12
+
13
+ import torch
14
+ import torch.nn as nn
15
+
16
+ from imagebind.models.helpers import (EinOpsRearrange, LearnableLogitScaling, Normalize,
17
+ SelectElement, SelectEOSAndProject)
18
+ from imagebind.models.multimodal_preprocessors import (AudioPreprocessor,
19
+ IMUPreprocessor, PadIm2Video,
20
+ PatchEmbedGeneric,
21
+ RGBDTPreprocessor,
22
+ SpatioTemporalPosEmbeddingHelper,
23
+ TextPreprocessor,
24
+ ThermalPreprocessor)
25
+ from imagebind.models.transformer import MultiheadAttention, SimpleTransformer
26
+
27
+ ModalityType = SimpleNamespace(
28
+ VISION="vision",
29
+ TEXT="text",
30
+ AUDIO="audio",
31
+ THERMAL="thermal",
32
+ DEPTH="depth",
33
+ IMU="imu",
34
+ )
35
+
36
+
37
+ class ImageBindModel(nn.Module):
38
+ def __init__(
39
+ self,
40
+ video_frames=2,
41
+ kernel_size=(2, 14, 14),
42
+ audio_kernel_size=16,
43
+ audio_stride=10,
44
+ out_embed_dim=768,
45
+ vision_embed_dim=1024,
46
+ vision_num_blocks=24,
47
+ vision_num_heads=16,
48
+ audio_embed_dim=768,
49
+ audio_num_blocks=12,
50
+ audio_num_heads=12,
51
+ audio_num_mel_bins=128,
52
+ audio_target_len=204,
53
+ audio_drop_path=0.1,
54
+ text_embed_dim=768,
55
+ text_num_blocks=12,
56
+ text_num_heads=12,
57
+ depth_embed_dim=384,
58
+ depth_kernel_size=16,
59
+ depth_num_blocks=12,
60
+ depth_num_heads=8,
61
+ depth_drop_path=0.0,
62
+ thermal_embed_dim=768,
63
+ thermal_kernel_size=16,
64
+ thermal_num_blocks=12,
65
+ thermal_num_heads=12,
66
+ thermal_drop_path=0.0,
67
+ imu_embed_dim=512,
68
+ imu_kernel_size=8,
69
+ imu_num_blocks=6,
70
+ imu_num_heads=8,
71
+ imu_drop_path=0.7,
72
+ ):
73
+ super().__init__()
74
+
75
+ self.modality_preprocessors = self._create_modality_preprocessors(
76
+ video_frames,
77
+ vision_embed_dim,
78
+ kernel_size,
79
+ text_embed_dim,
80
+ audio_embed_dim,
81
+ audio_kernel_size,
82
+ audio_stride,
83
+ audio_num_mel_bins,
84
+ audio_target_len,
85
+ depth_embed_dim,
86
+ depth_kernel_size,
87
+ thermal_embed_dim,
88
+ thermal_kernel_size,
89
+ imu_embed_dim,
90
+ )
91
+
92
+ self.modality_trunks = self._create_modality_trunks(
93
+ vision_embed_dim,
94
+ vision_num_blocks,
95
+ vision_num_heads,
96
+ text_embed_dim,
97
+ text_num_blocks,
98
+ text_num_heads,
99
+ audio_embed_dim,
100
+ audio_num_blocks,
101
+ audio_num_heads,
102
+ audio_drop_path,
103
+ depth_embed_dim,
104
+ depth_num_blocks,
105
+ depth_num_heads,
106
+ depth_drop_path,
107
+ thermal_embed_dim,
108
+ thermal_num_blocks,
109
+ thermal_num_heads,
110
+ thermal_drop_path,
111
+ imu_embed_dim,
112
+ imu_num_blocks,
113
+ imu_num_heads,
114
+ imu_drop_path,
115
+ )
116
+
117
+ self.modality_heads = self._create_modality_heads(
118
+ out_embed_dim,
119
+ vision_embed_dim,
120
+ text_embed_dim,
121
+ audio_embed_dim,
122
+ depth_embed_dim,
123
+ thermal_embed_dim,
124
+ imu_embed_dim,
125
+ )
126
+
127
+ self.modality_postprocessors = self._create_modality_postprocessors(
128
+ out_embed_dim
129
+ )
130
+
131
+ def _create_modality_preprocessors(
132
+ self,
133
+ video_frames=2,
134
+ vision_embed_dim=1024,
135
+ kernel_size=(2, 14, 14),
136
+ text_embed_dim=768,
137
+ audio_embed_dim=768,
138
+ audio_kernel_size=16,
139
+ audio_stride=10,
140
+ audio_num_mel_bins=128,
141
+ audio_target_len=204,
142
+ depth_embed_dim=768,
143
+ depth_kernel_size=16,
144
+ thermal_embed_dim=768,
145
+ thermal_kernel_size=16,
146
+ imu_embed_dim=512,
147
+ ):
148
+ rgbt_stem = PatchEmbedGeneric(
149
+ proj_stem=[
150
+ PadIm2Video(pad_type="repeat", ntimes=2),
151
+ nn.Conv3d(
152
+ in_channels=3,
153
+ kernel_size=kernel_size,
154
+ out_channels=vision_embed_dim,
155
+ stride=kernel_size,
156
+ bias=False,
157
+ ),
158
+ ]
159
+ )
160
+ rgbt_preprocessor = RGBDTPreprocessor(
161
+ img_size=[3, video_frames, 224, 224],
162
+ num_cls_tokens=1,
163
+ pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True),
164
+ rgbt_stem=rgbt_stem,
165
+ depth_stem=None,
166
+ )
167
+
168
+ text_preprocessor = TextPreprocessor(
169
+ context_length=77,
170
+ vocab_size=49408,
171
+ embed_dim=text_embed_dim,
172
+ causal_masking=True,
173
+ )
174
+
175
+ audio_stem = PatchEmbedGeneric(
176
+ proj_stem=[
177
+ nn.Conv2d(
178
+ in_channels=1,
179
+ kernel_size=audio_kernel_size,
180
+ stride=audio_stride,
181
+ out_channels=audio_embed_dim,
182
+ bias=False,
183
+ ),
184
+ ],
185
+ norm_layer=nn.LayerNorm(normalized_shape=audio_embed_dim),
186
+ )
187
+ audio_preprocessor = AudioPreprocessor(
188
+ img_size=[1, audio_num_mel_bins, audio_target_len],
189
+ num_cls_tokens=1,
190
+ pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True),
191
+ audio_stem=audio_stem,
192
+ )
193
+
194
+ depth_stem = PatchEmbedGeneric(
195
+ [
196
+ nn.Conv2d(
197
+ kernel_size=depth_kernel_size,
198
+ in_channels=1,
199
+ out_channels=depth_embed_dim,
200
+ stride=depth_kernel_size,
201
+ bias=False,
202
+ ),
203
+ ],
204
+ norm_layer=nn.LayerNorm(normalized_shape=depth_embed_dim),
205
+ )
206
+
207
+ depth_preprocessor = RGBDTPreprocessor(
208
+ img_size=[1, 224, 224],
209
+ num_cls_tokens=1,
210
+ pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True),
211
+ rgbt_stem=None,
212
+ depth_stem=depth_stem,
213
+ )
214
+
215
+ thermal_stem = PatchEmbedGeneric(
216
+ [
217
+ nn.Conv2d(
218
+ kernel_size=thermal_kernel_size,
219
+ in_channels=1,
220
+ out_channels=thermal_embed_dim,
221
+ stride=thermal_kernel_size,
222
+ bias=False,
223
+ ),
224
+ ],
225
+ norm_layer=nn.LayerNorm(normalized_shape=thermal_embed_dim),
226
+ )
227
+ thermal_preprocessor = ThermalPreprocessor(
228
+ img_size=[1, 224, 224],
229
+ num_cls_tokens=1,
230
+ pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True),
231
+ thermal_stem=thermal_stem,
232
+ )
233
+
234
+ imu_stem = PatchEmbedGeneric(
235
+ [
236
+ nn.Linear(
237
+ in_features=48,
238
+ out_features=imu_embed_dim,
239
+ bias=False,
240
+ ),
241
+ ],
242
+ norm_layer=nn.LayerNorm(normalized_shape=imu_embed_dim),
243
+ )
244
+
245
+ imu_preprocessor = IMUPreprocessor(
246
+ img_size=[6, 2000],
247
+ num_cls_tokens=1,
248
+ kernel_size=8,
249
+ embed_dim=imu_embed_dim,
250
+ pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True),
251
+ imu_stem=imu_stem,
252
+ )
253
+
254
+ modality_preprocessors = {
255
+ ModalityType.VISION: rgbt_preprocessor,
256
+ ModalityType.TEXT: text_preprocessor,
257
+ ModalityType.AUDIO: audio_preprocessor,
258
+ ModalityType.DEPTH: depth_preprocessor,
259
+ ModalityType.THERMAL: thermal_preprocessor,
260
+ ModalityType.IMU: imu_preprocessor,
261
+ }
262
+
263
+ return nn.ModuleDict(modality_preprocessors)
264
+
265
+ def _create_modality_trunks(
266
+ self,
267
+ vision_embed_dim=1024,
268
+ vision_num_blocks=24,
269
+ vision_num_heads=16,
270
+ text_embed_dim=768,
271
+ text_num_blocks=12,
272
+ text_num_heads=12,
273
+ audio_embed_dim=768,
274
+ audio_num_blocks=12,
275
+ audio_num_heads=12,
276
+ audio_drop_path=0.0,
277
+ depth_embed_dim=768,
278
+ depth_num_blocks=12,
279
+ depth_num_heads=12,
280
+ depth_drop_path=0.0,
281
+ thermal_embed_dim=768,
282
+ thermal_num_blocks=12,
283
+ thermal_num_heads=12,
284
+ thermal_drop_path=0.0,
285
+ imu_embed_dim=512,
286
+ imu_num_blocks=6,
287
+ imu_num_heads=8,
288
+ imu_drop_path=0.7,
289
+ ):
290
+ def instantiate_trunk(
291
+ embed_dim, num_blocks, num_heads, pre_transformer_ln, add_bias_kv, drop_path
292
+ ):
293
+ return SimpleTransformer(
294
+ embed_dim=embed_dim,
295
+ num_blocks=num_blocks,
296
+ ffn_dropout_rate=0.0,
297
+ drop_path_rate=drop_path,
298
+ attn_target=partial(
299
+ MultiheadAttention,
300
+ embed_dim=embed_dim,
301
+ num_heads=num_heads,
302
+ bias=True,
303
+ add_bias_kv=add_bias_kv,
304
+ ),
305
+ pre_transformer_layer=nn.Sequential(
306
+ nn.LayerNorm(embed_dim, eps=1e-6)
307
+ if pre_transformer_ln
308
+ else nn.Identity(),
309
+ EinOpsRearrange("b l d -> l b d"),
310
+ ),
311
+ post_transformer_layer=EinOpsRearrange("l b d -> b l d"),
312
+ )
313
+
314
+ modality_trunks = {}
315
+ modality_trunks[ModalityType.VISION] = instantiate_trunk(
316
+ vision_embed_dim,
317
+ vision_num_blocks,
318
+ vision_num_heads,
319
+ pre_transformer_ln=True,
320
+ add_bias_kv=False,
321
+ drop_path=0.0,
322
+ )
323
+ modality_trunks[ModalityType.TEXT] = instantiate_trunk(
324
+ text_embed_dim,
325
+ text_num_blocks,
326
+ text_num_heads,
327
+ pre_transformer_ln=False,
328
+ add_bias_kv=False,
329
+ drop_path=0.0,
330
+ )
331
+ modality_trunks[ModalityType.AUDIO] = instantiate_trunk(
332
+ audio_embed_dim,
333
+ audio_num_blocks,
334
+ audio_num_heads,
335
+ pre_transformer_ln=False,
336
+ add_bias_kv=True,
337
+ drop_path=audio_drop_path,
338
+ )
339
+ modality_trunks[ModalityType.DEPTH] = instantiate_trunk(
340
+ depth_embed_dim,
341
+ depth_num_blocks,
342
+ depth_num_heads,
343
+ pre_transformer_ln=False,
344
+ add_bias_kv=True,
345
+ drop_path=depth_drop_path,
346
+ )
347
+ modality_trunks[ModalityType.THERMAL] = instantiate_trunk(
348
+ thermal_embed_dim,
349
+ thermal_num_blocks,
350
+ thermal_num_heads,
351
+ pre_transformer_ln=False,
352
+ add_bias_kv=True,
353
+ drop_path=thermal_drop_path,
354
+ )
355
+ modality_trunks[ModalityType.IMU] = instantiate_trunk(
356
+ imu_embed_dim,
357
+ imu_num_blocks,
358
+ imu_num_heads,
359
+ pre_transformer_ln=False,
360
+ add_bias_kv=True,
361
+ drop_path=imu_drop_path,
362
+ )
363
+
364
+ return nn.ModuleDict(modality_trunks)
365
+
366
+ def _create_modality_heads(
367
+ self,
368
+ out_embed_dim,
369
+ vision_embed_dim,
370
+ text_embed_dim,
371
+ audio_embed_dim,
372
+ depth_embed_dim,
373
+ thermal_embed_dim,
374
+ imu_embed_dim,
375
+ ):
376
+ modality_heads = {}
377
+
378
+ modality_heads[ModalityType.VISION] = nn.Sequential(
379
+ nn.LayerNorm(normalized_shape=vision_embed_dim, eps=1e-6),
380
+ SelectElement(index=0),
381
+ nn.Linear(vision_embed_dim, out_embed_dim, bias=False),
382
+ )
383
+
384
+ modality_heads[ModalityType.TEXT] = SelectEOSAndProject(
385
+ proj=nn.Sequential(
386
+ nn.LayerNorm(normalized_shape=text_embed_dim, eps=1e-6),
387
+ nn.Linear(text_embed_dim, out_embed_dim, bias=False),
388
+ )
389
+ )
390
+
391
+ modality_heads[ModalityType.AUDIO] = nn.Sequential(
392
+ nn.LayerNorm(normalized_shape=audio_embed_dim, eps=1e-6),
393
+ SelectElement(index=0),
394
+ nn.Linear(audio_embed_dim, out_embed_dim, bias=False),
395
+ )
396
+
397
+ modality_heads[ModalityType.DEPTH] = nn.Sequential(
398
+ nn.LayerNorm(normalized_shape=depth_embed_dim, eps=1e-6),
399
+ SelectElement(index=0),
400
+ nn.Linear(depth_embed_dim, out_embed_dim, bias=False),
401
+ )
402
+
403
+ modality_heads[ModalityType.THERMAL] = nn.Sequential(
404
+ nn.LayerNorm(normalized_shape=thermal_embed_dim, eps=1e-6),
405
+ SelectElement(index=0),
406
+ nn.Linear(thermal_embed_dim, out_embed_dim, bias=False),
407
+ )
408
+
409
+ modality_heads[ModalityType.IMU] = nn.Sequential(
410
+ nn.LayerNorm(normalized_shape=imu_embed_dim, eps=1e-6),
411
+ SelectElement(index=0),
412
+ nn.Dropout(p=0.5),
413
+ nn.Linear(imu_embed_dim, out_embed_dim, bias=False),
414
+ )
415
+
416
+ return nn.ModuleDict(modality_heads)
417
+
418
+ def _create_modality_postprocessors(self, out_embed_dim):
419
+ modality_postprocessors = {}
420
+
421
+ modality_postprocessors[ModalityType.VISION] = Normalize(dim=-1)
422
+ modality_postprocessors[ModalityType.TEXT] = nn.Sequential(
423
+ Normalize(dim=-1), LearnableLogitScaling(learnable=True)
424
+ )
425
+ modality_postprocessors[ModalityType.AUDIO] = nn.Sequential(
426
+ Normalize(dim=-1),
427
+ LearnableLogitScaling(logit_scale_init=20.0, learnable=False),
428
+ )
429
+ modality_postprocessors[ModalityType.DEPTH] = nn.Sequential(
430
+ Normalize(dim=-1),
431
+ LearnableLogitScaling(logit_scale_init=5.0, learnable=False),
432
+ )
433
+ modality_postprocessors[ModalityType.THERMAL] = nn.Sequential(
434
+ Normalize(dim=-1),
435
+ LearnableLogitScaling(logit_scale_init=10.0, learnable=False),
436
+ )
437
+ modality_postprocessors[ModalityType.IMU] = nn.Sequential(
438
+ Normalize(dim=-1),
439
+ LearnableLogitScaling(logit_scale_init=5.0, learnable=False),
440
+ )
441
+
442
+ return nn.ModuleDict(modality_postprocessors)
443
+
444
+ def forward(self, inputs):
445
+ outputs = {}
446
+ for modality_key, modality_value in inputs.items():
447
+ reduce_list = (
448
+ modality_value.ndim >= 5
449
+ ) # Audio and Video inputs consist of multiple clips
450
+ if reduce_list:
451
+ B, S = modality_value.shape[:2]
452
+ modality_value = modality_value.reshape(
453
+ B * S, *modality_value.shape[2:]
454
+ )
455
+
456
+ if modality_value is not None:
457
+ modality_value = self.modality_preprocessors[modality_key](
458
+ **{modality_key: modality_value}
459
+ )
460
+ trunk_inputs = modality_value["trunk"]
461
+ head_inputs = modality_value["head"]
462
+ modality_value = self.modality_trunks[modality_key](**trunk_inputs)
463
+ modality_value = self.modality_heads[modality_key](
464
+ modality_value, **head_inputs
465
+ )
466
+ modality_value = self.modality_postprocessors[modality_key](
467
+ modality_value
468
+ )
469
+
470
+ if reduce_list:
471
+ modality_value = modality_value.reshape(B, S, -1)
472
+ modality_value = modality_value.mean(dim=1)
473
+
474
+ outputs[modality_key] = modality_value
475
+
476
+ return outputs
477
+
478
+
479
+ def imagebind_huge(pretrained=False):
480
+ model = ImageBindModel(
481
+ vision_embed_dim=1280,
482
+ vision_num_blocks=32,
483
+ vision_num_heads=16,
484
+ text_embed_dim=1024,
485
+ text_num_blocks=24,
486
+ text_num_heads=16,
487
+ out_embed_dim=1024,
488
+ audio_drop_path=0.1,
489
+ imu_drop_path=0.7,
490
+ )
491
+
492
+ if pretrained:
493
+ if not os.path.exists(".checkpoints/imagebind_huge.pth"):
494
+ print(
495
+ "Downloading imagebind weights to .checkpoints/imagebind_huge.pth ..."
496
+ )
497
+ os.makedirs(".checkpoints", exist_ok=True)
498
+ torch.hub.download_url_to_file(
499
+ "https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth",
500
+ ".checkpoints/imagebind_huge.pth",
501
+ progress=True,
502
+ )
503
+
504
+ model.load_state_dict(torch.load(".checkpoints/imagebind_huge.pth"))
505
+
506
+ return model
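A hedged end-to-end sketch of driving this model, mirroring the upstream ImageBind usage rather than anything in this commit (the audio path below is a placeholder): every modality head projects into the shared out_embed_dim space and the postprocessors normalize and scale the embeddings, so cross-modal retrieval reduces to a dot product between forward() outputs.

import torch

from imagebind import data
from imagebind.models.imagebind_model import ModalityType, imagebind_huge

device = "cuda" if torch.cuda.is_available() else "cpu"
model = imagebind_huge(pretrained=True).eval().to(device)  # downloads the weights on first use

inputs = {
    ModalityType.TEXT: data.load_and_transform_text(["a dog barking", "a car engine"], device),
    ModalityType.AUDIO: data.load_and_transform_audio_data(["example.wav"], device),  # placeholder path
}
with torch.no_grad():
    embeddings = model(inputs)

# (1, 1024) audio embedding vs. (2, 1024) text embeddings -> (1, 2) audio-to-text scores.
scores = torch.softmax(embeddings[ModalityType.AUDIO] @ embeddings[ModalityType.TEXT].T, dim=-1)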
imagebind/models/multimodal_preprocessors.py ADDED
@@ -0,0 +1,685 @@
1
+ #!/usr/bin/env python3
2
+ # Portions Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ # All rights reserved.
4
+
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ import gzip
9
+ import html
10
+ import io
11
+ import math
12
+ from functools import lru_cache
13
+ from typing import Callable, List, Optional, Tuple
14
+
15
+ import ftfy
16
+ import numpy as np
17
+ import regex as re
18
+ import torch
19
+ import torch.nn as nn
20
+ from iopath.common.file_io import g_pathmgr
21
+ from timm.models.layers import trunc_normal_
22
+
23
+ from imagebind.models.helpers import VerboseNNModule, cast_if_src_dtype
24
+
25
+
26
+ def get_sinusoid_encoding_table(n_position, d_hid):
27
+ """Sinusoid position encoding table"""
28
+
29
+ # TODO: make it with torch instead of numpy
30
+ def get_position_angle_vec(position):
31
+ return [
32
+ position / np.power(10000, 2 * (hid_j // 2) / d_hid)
33
+ for hid_j in range(d_hid)
34
+ ]
35
+
36
+ sinusoid_table = np.array(
37
+ [get_position_angle_vec(pos_i) for pos_i in range(n_position)]
38
+ )
39
+ sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
40
+ sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
41
+
42
+ return torch.FloatTensor(sinusoid_table).unsqueeze(0)
43
+
44
+
45
+ def interpolate_pos_encoding_2d(target_spatial_size, pos_embed):
46
+ N = pos_embed.shape[1]
47
+ if N == target_spatial_size:
48
+ return pos_embed
49
+ dim = pos_embed.shape[-1]
50
+ # nn.functional.interpolate doesn't work with bfloat16 so we cast to float32
51
+ pos_embed, updated = cast_if_src_dtype(pos_embed, torch.bfloat16, torch.float32)
52
+ pos_embed = nn.functional.interpolate(
53
+ pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(
54
+ 0, 3, 1, 2
55
+ ),
56
+ scale_factor=math.sqrt(target_spatial_size / N),
57
+ mode="bicubic",
58
+ )
59
+ if updated:
60
+ pos_embed, _ = cast_if_src_dtype(pos_embed, torch.float32, torch.bfloat16)
61
+ pos_embed = pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
62
+ return pos_embed
63
+
64
+
65
+ def interpolate_pos_encoding(
66
+ npatch_per_img,
67
+ pos_embed,
68
+ patches_layout,
69
+ input_shape=None,
70
+ first_patch_idx=1,
71
+ ):
72
+ assert first_patch_idx == 0 or first_patch_idx == 1, "there is 1 CLS token or none"
73
+ N = pos_embed.shape[1] - first_patch_idx # since it's 1 if cls_token exists
74
+ if npatch_per_img == N:
75
+ return pos_embed
76
+
77
+ assert (
78
+ patches_layout[-1] == patches_layout[-2]
79
+ ), "Interpolation of pos embed not supported for non-square layouts"
80
+
81
+ class_emb = pos_embed[:, :first_patch_idx]
82
+ pos_embed = pos_embed[:, first_patch_idx:]
83
+
84
+ if input_shape is None or patches_layout[0] == 1:
85
+ # simple 2D pos embedding, no temporal component
86
+ pos_embed = interpolate_pos_encoding_2d(npatch_per_img, pos_embed)
87
+ elif patches_layout[0] > 1:
88
+ # pos embed has a temporal component
89
+ assert len(input_shape) == 4, "temporal interpolation not supported"
90
+ # we only support 2D interpolation in this case
91
+ num_frames = patches_layout[0]
92
+ num_spatial_tokens = patches_layout[1] * patches_layout[2]
93
+ pos_embed = pos_embed.view(1, num_frames, num_spatial_tokens, -1)
94
+ # interpolate embedding for zeroth frame
95
+ pos_embed = interpolate_pos_encoding_2d(
96
+ npatch_per_img, pos_embed[0, 0, ...].unsqueeze(0)
97
+ )
98
+ else:
99
+ raise ValueError("This type of interpolation isn't implemented")
100
+
101
+ return torch.cat((class_emb, pos_embed), dim=1)
102
+
103
+
104
+ def _get_pos_embedding(
105
+ npatch_per_img,
106
+ pos_embed,
107
+ patches_layout,
108
+ input_shape,
109
+ first_patch_idx=1,
110
+ ):
111
+ pos_embed = interpolate_pos_encoding(
112
+ npatch_per_img,
113
+ pos_embed,
114
+ patches_layout,
115
+ input_shape=input_shape,
116
+ first_patch_idx=first_patch_idx,
117
+ )
118
+ return pos_embed
119
+
120
+
121
+ class PatchEmbedGeneric(nn.Module):
122
+ """
123
+ PatchEmbed from Hydra
124
+ """
125
+
126
+ def __init__(self, proj_stem, norm_layer: Optional[nn.Module] = None):
127
+ super().__init__()
128
+
129
+ if len(proj_stem) > 1:
130
+ self.proj = nn.Sequential(*proj_stem)
131
+ else:
132
+ # Special case to be able to load pre-trained models that were
133
+ # trained with a standard stem
134
+ self.proj = proj_stem[0]
135
+ self.norm_layer = norm_layer
136
+
137
+ def get_patch_layout(self, img_size):
138
+ with torch.no_grad():
139
+ dummy_img = torch.zeros(
140
+ [
141
+ 1,
142
+ ]
143
+ + img_size
144
+ )
145
+ dummy_out = self.proj(dummy_img)
146
+ embed_dim = dummy_out.shape[1]
147
+ patches_layout = tuple(dummy_out.shape[2:])
148
+ num_patches = np.prod(patches_layout)
149
+ return patches_layout, num_patches, embed_dim
150
+
151
+ def forward(self, x):
152
+ x = self.proj(x)
153
+ # B C (T) H W -> B (T)HW C
154
+ x = x.flatten(2).transpose(1, 2)
155
+ if self.norm_layer is not None:
156
+ x = self.norm_layer(x)
157
+ return x
158
+
159
+
160
+ class SpatioTemporalPosEmbeddingHelper(VerboseNNModule):
161
+ def __init__(
162
+ self,
163
+ patches_layout: List,
164
+ num_patches: int,
165
+ num_cls_tokens: int,
166
+ embed_dim: int,
167
+ learnable: bool,
168
+ ) -> None:
169
+ super().__init__()
170
+ self.num_cls_tokens = num_cls_tokens
171
+ self.patches_layout = patches_layout
172
+ self.num_patches = num_patches
173
+ self.num_tokens = num_cls_tokens + num_patches
174
+ self.learnable = learnable
175
+ if self.learnable:
176
+ self.pos_embed = nn.Parameter(torch.zeros(1, self.num_tokens, embed_dim))
177
+ trunc_normal_(self.pos_embed, std=0.02)
178
+ else:
179
+ self.register_buffer(
180
+ "pos_embed", get_sinusoid_encoding_table(self.num_tokens, embed_dim)
181
+ )
182
+
183
+ def get_pos_embedding(self, vision_input, all_vision_tokens):
184
+ input_shape = vision_input.shape
185
+ pos_embed = _get_pos_embedding(
186
+ all_vision_tokens.size(1) - self.num_cls_tokens,
187
+ pos_embed=self.pos_embed,
188
+ patches_layout=self.patches_layout,
189
+ input_shape=input_shape,
190
+ first_patch_idx=self.num_cls_tokens,
191
+ )
192
+ return pos_embed
193
+
194
+
195
+ class RGBDTPreprocessor(VerboseNNModule):
196
+ def __init__(
197
+ self,
198
+ rgbt_stem: PatchEmbedGeneric,
199
+ depth_stem: Optional[PatchEmbedGeneric],
200
+ img_size: Tuple = (3, 224, 224),
201
+ num_cls_tokens: int = 1,
202
+ pos_embed_fn: Optional[Callable] = None,
203
+ use_type_embed: bool = False,
204
+ init_param_style: str = "openclip",
205
+ ) -> None:
206
+ super().__init__()
207
+ stem = rgbt_stem if rgbt_stem is not None else depth_stem
208
+ (
209
+ self.patches_layout,
210
+ self.num_patches,
211
+ self.embed_dim,
212
+ ) = stem.get_patch_layout(img_size)
213
+ self.rgbt_stem = rgbt_stem
214
+ self.depth_stem = depth_stem
215
+ self.use_pos_embed = pos_embed_fn is not None
216
+ self.use_type_embed = use_type_embed
217
+ self.num_cls_tokens = num_cls_tokens
218
+
219
+ if self.use_pos_embed:
220
+ self.pos_embedding_helper = pos_embed_fn(
221
+ patches_layout=self.patches_layout,
222
+ num_cls_tokens=num_cls_tokens,
223
+ num_patches=self.num_patches,
224
+ embed_dim=self.embed_dim,
225
+ )
226
+ if self.num_cls_tokens > 0:
227
+ self.cls_token = nn.Parameter(
228
+ torch.zeros(1, self.num_cls_tokens, self.embed_dim)
229
+ )
230
+ if self.use_type_embed:
231
+ self.type_embed = nn.Parameter(torch.zeros(1, 1, self.embed_dim))
232
+
233
+ self.init_parameters(init_param_style)
234
+
235
+ @torch.no_grad()
236
+ def init_parameters(self, init_param_style):
237
+ if init_param_style == "openclip":
238
+ # OpenCLIP style initialization
239
+ scale = self.embed_dim**-0.5
240
+ if self.use_pos_embed:
241
+ nn.init.normal_(self.pos_embedding_helper.pos_embed)
242
+ self.pos_embedding_helper.pos_embed *= scale
243
+
244
+ if self.num_cls_tokens > 0:
245
+ nn.init.normal_(self.cls_token)
246
+ self.cls_token *= scale
247
+ elif init_param_style == "vit":
248
+ self.cls_token.data.fill_(0)
249
+ else:
250
+ raise ValueError(f"Unknown init {init_param_style}")
251
+
252
+ if self.use_type_embed:
253
+ nn.init.normal_(self.type_embed)
254
+
255
+ def tokenize_input_and_cls_pos(self, input, stem, mask):
256
+ # tokens is of shape B x L x D
257
+ tokens = stem(input)
258
+ assert tokens.ndim == 3
259
+ assert tokens.shape[2] == self.embed_dim
260
+ B = tokens.shape[0]
261
+ if self.num_cls_tokens > 0:
262
+ class_tokens = self.cls_token.expand(
263
+ B, -1, -1
264
+ ) # stole class_tokens impl from Phil Wang, thanks
265
+ tokens = torch.cat((class_tokens, tokens), dim=1)
266
+ if self.use_pos_embed:
267
+ pos_embed = self.pos_embedding_helper.get_pos_embedding(input, tokens)
268
+ tokens = tokens + pos_embed
269
+ if self.use_type_embed:
270
+ tokens = tokens + self.type_embed.expand(B, -1, -1)
271
+ return tokens
272
+
273
+ def forward(self, vision=None, depth=None, patch_mask=None):
274
+ if patch_mask is not None:
275
+ raise NotImplementedError()
276
+
277
+ if vision is not None:
278
+ vision_tokens = self.tokenize_input_and_cls_pos(
279
+ vision, self.rgbt_stem, patch_mask
280
+ )
281
+
282
+ if depth is not None:
283
+ depth_tokens = self.tokenize_input_and_cls_pos(
284
+ depth, self.depth_stem, patch_mask
285
+ )
286
+
287
+ # aggregate tokens
288
+ if vision is not None and depth is not None:
289
+ final_tokens = vision_tokens + depth_tokens
290
+ else:
291
+ final_tokens = vision_tokens if vision is not None else depth_tokens
292
+ return_dict = {
293
+ "trunk": {
294
+ "tokens": final_tokens,
295
+ },
296
+ "head": {},
297
+ }
298
+ return return_dict
299
+
300
+
301
+ class AudioPreprocessor(RGBDTPreprocessor):
302
+ def __init__(self, audio_stem: PatchEmbedGeneric, **kwargs) -> None:
303
+ super().__init__(rgbt_stem=audio_stem, depth_stem=None, **kwargs)
304
+
305
+ def forward(self, audio=None):
306
+ return super().forward(vision=audio)
307
+
308
+
309
+ class ThermalPreprocessor(RGBDTPreprocessor):
310
+ def __init__(self, thermal_stem: PatchEmbedGeneric, **kwargs) -> None:
311
+ super().__init__(rgbt_stem=thermal_stem, depth_stem=None, **kwargs)
312
+
313
+ def forward(self, thermal=None):
314
+ return super().forward(vision=thermal)
315
+
316
+
317
+ def build_causal_attention_mask(context_length):
318
+ # lazily create causal attention mask, with full attention between the vision tokens
319
+ # pytorch uses additive attention mask; fill with -inf
320
+ mask = torch.empty(context_length, context_length, requires_grad=False)
321
+ mask.fill_(float("-inf"))
322
+ mask.triu_(1) # zero out the lower diagonal
323
+ return mask
324
+
325
+
326
+ class TextPreprocessor(VerboseNNModule):
327
+ def __init__(
328
+ self,
329
+ vocab_size: int,
330
+ context_length: int,
331
+ embed_dim: int,
332
+ causal_masking: bool,
333
+ supply_seq_len_to_head: bool = True,
334
+ num_cls_tokens: int = 0,
335
+ init_param_style: str = "openclip",
336
+ ) -> None:
337
+ super().__init__()
338
+ self.vocab_size = vocab_size
339
+ self.context_length = context_length
340
+ self.token_embedding = nn.Embedding(vocab_size, embed_dim)
341
+ self.pos_embed = nn.Parameter(
342
+ torch.empty(1, self.context_length + num_cls_tokens, embed_dim)
343
+ )
344
+ self.causal_masking = causal_masking
345
+ if self.causal_masking:
346
+ mask = build_causal_attention_mask(self.context_length)
347
+ # register the mask as a buffer so it can be moved to the right device
348
+ self.register_buffer("mask", mask)
349
+
350
+ self.supply_seq_len_to_head = supply_seq_len_to_head
351
+ self.num_cls_tokens = num_cls_tokens
352
+ self.embed_dim = embed_dim
353
+ if num_cls_tokens > 0:
354
+ assert self.causal_masking is False, "Masking + CLS token isn't implemented"
355
+ self.cls_token = nn.Parameter(
356
+ torch.zeros(1, self.num_cls_tokens, embed_dim)
357
+ )
358
+
359
+ self.init_parameters(init_param_style)
360
+
361
+ @torch.no_grad()
362
+ def init_parameters(self, init_param_style="openclip"):
363
+ # OpenCLIP style initialization
364
+ nn.init.normal_(self.token_embedding.weight, std=0.02)
365
+ nn.init.normal_(self.pos_embed, std=0.01)
366
+
367
+ if init_param_style == "openclip":
368
+ # OpenCLIP style initialization
369
+ scale = self.embed_dim**-0.5
370
+ if self.num_cls_tokens > 0:
371
+ nn.init.normal_(self.cls_token)
372
+ self.cls_token *= scale
373
+ elif init_param_style == "vit":
374
+ self.cls_token.data.fill_(0)
375
+ else:
376
+ raise ValueError(f"Unknown init {init_param_style}")
377
+
378
+ def forward(self, text):
379
+ # text tokens are of shape B x L x D
380
+ text_tokens = self.token_embedding(text)
381
+ # concat CLS tokens if any
382
+ if self.num_cls_tokens > 0:
383
+ B = text_tokens.shape[0]
384
+ class_tokens = self.cls_token.expand(
385
+ B, -1, -1
386
+ ) # stole class_tokens impl from Phil Wang, thanks
387
+ text_tokens = torch.cat((class_tokens, text_tokens), dim=1)
388
+ text_tokens = text_tokens + self.pos_embed
389
+ return_dict = {
390
+ "trunk": {
391
+ "tokens": text_tokens,
392
+ },
393
+ "head": {},
394
+ }
395
+ # Compute sequence length after adding CLS tokens
396
+ if self.supply_seq_len_to_head:
397
+ text_lengths = text.argmax(dim=-1)
398
+ return_dict["head"] = {
399
+ "seq_len": text_lengths,
400
+ }
401
+ if self.causal_masking:
402
+ return_dict["trunk"].update({"attn_mask": self.mask})
403
+ return return_dict
404
+
405
+
406
+ class Im2Video(nn.Module):
407
+ """Convert an image into a trivial video."""
408
+
409
+ def __init__(self, time_dim=2):
410
+ super().__init__()
411
+ self.time_dim = time_dim
412
+
413
+ def forward(self, x):
414
+ if x.ndim == 4:
415
+ # B, C, H, W -> B, C, T, H, W
416
+ return x.unsqueeze(self.time_dim)
417
+ elif x.ndim == 5:
418
+ return x
419
+ else:
420
+ raise ValueError(f"Dimension incorrect {x.shape}")
421
+
422
+
423
+ class PadIm2Video(Im2Video):
424
+ def __init__(self, ntimes, pad_type, time_dim=2):
425
+ super().__init__(time_dim=time_dim)
426
+ assert ntimes > 0
427
+ assert pad_type in ["zero", "repeat"]
428
+ self.ntimes = ntimes
429
+ self.pad_type = pad_type
430
+
431
+ def forward(self, x):
432
+ x = super().forward(x)
433
+ if x.shape[self.time_dim] == 1:
434
+ if self.pad_type == "repeat":
435
+ new_shape = [1] * len(x.shape)
436
+ new_shape[self.time_dim] = self.ntimes
437
+ x = x.repeat(new_shape)
438
+ elif self.pad_type == "zero":
439
+ padarg = [0, 0] * len(x.shape)
440
+ padarg[2 * self.time_dim + 1] = self.ntimes - x.shape[self.time_dim]
441
+ x = nn.functional.pad(x, padarg)
442
+ return x
443
+
444
+
445
+ # Modified from github.com/openai/CLIP
446
+ @lru_cache()
447
+ def bytes_to_unicode():
448
+ """
449
+ Returns a list of utf-8 bytes and a corresponding list of unicode strings.
450
+ The reversible bpe codes work on unicode strings.
451
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
452
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
453
+ This is a significant percentage of your normal, say, 32K bpe vocab.
454
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
455
+ And avoids mapping to whitespace/control characters the bpe code barfs on.
456
+ """
457
+ bs = (
458
+ list(range(ord("!"), ord("~") + 1))
459
+ + list(range(ord("¡"), ord("¬") + 1))
460
+ + list(range(ord("®"), ord("ÿ") + 1))
461
+ )
462
+ cs = bs[:]
463
+ n = 0
464
+ for b in range(2**8):
465
+ if b not in bs:
466
+ bs.append(b)
467
+ cs.append(2**8 + n)
468
+ n += 1
469
+ cs = [chr(n) for n in cs]
470
+ return dict(zip(bs, cs))
471
+
472
+
473
+ def get_pairs(word):
474
+ """Return set of symbol pairs in a word.
475
+ Word is represented as tuple of symbols (symbols being variable-length strings).
476
+ """
477
+ pairs = set()
478
+ prev_char = word[0]
479
+ for char in word[1:]:
480
+ pairs.add((prev_char, char))
481
+ prev_char = char
482
+ return pairs
483
+
484
+
485
+ def basic_clean(text):
486
+ text = ftfy.fix_text(text)
487
+ text = html.unescape(html.unescape(text))
488
+ return text.strip()
489
+
490
+
491
+ def whitespace_clean(text):
492
+ text = re.sub(r"\s+", " ", text)
493
+ text = text.strip()
494
+ return text
495
+
496
+
497
+ class SimpleTokenizer(object):
498
+ def __init__(self, bpe_path: str, context_length=77):
499
+ self.byte_encoder = bytes_to_unicode()
500
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
501
+
502
+ with g_pathmgr.open(bpe_path, "rb") as fh:
503
+ bpe_bytes = io.BytesIO(fh.read())
504
+ merges: List[str] = gzip.open(bpe_bytes).read().decode("utf-8").split("\n")
505
+ merges = merges[1 : 49152 - 256 - 2 + 1]
506
+ merges: List[Tuple[str, ...]] = [tuple(merge.split()) for merge in merges]
507
+ vocab = list(bytes_to_unicode().values())
508
+ vocab = vocab + [v + "</w>" for v in vocab]
509
+ for merge in merges:
510
+ vocab.append("".join(merge))
511
+ vocab.extend(["<|startoftext|>", "<|endoftext|>"])
512
+ self.encoder = dict(zip(vocab, range(len(vocab))))
513
+ self.decoder = {v: k for k, v in self.encoder.items()}
514
+ self.bpe_ranks = dict(zip(merges, range(len(merges))))
515
+ self.cache = {
516
+ "<|startoftext|>": "<|startoftext|>",
517
+ "<|endoftext|>": "<|endoftext|>",
518
+ }
519
+ self.pat = re.compile(
520
+ r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
521
+ re.IGNORECASE,
522
+ )
523
+ self.context_length = context_length
524
+
525
+ def bpe(self, token):
526
+ if token in self.cache:
527
+ return self.cache[token]
528
+ word = tuple(token[:-1]) + (token[-1] + "</w>",)
529
+ pairs = get_pairs(word)
530
+
531
+ if not pairs:
532
+ return token + "</w>"
533
+
534
+ while True:
535
+ bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
536
+ if bigram not in self.bpe_ranks:
537
+ break
538
+ first, second = bigram
539
+ new_word = []
540
+ i = 0
541
+ while i < len(word):
542
+ try:
543
+ j = word.index(first, i)
544
+ new_word.extend(word[i:j])
545
+ i = j
546
+ except ValueError:
547
+ new_word.extend(word[i:])
548
+ break
549
+
550
+ if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
551
+ new_word.append(first + second)
552
+ i += 2
553
+ else:
554
+ new_word.append(word[i])
555
+ i += 1
556
+ new_word = tuple(new_word)
557
+ word = new_word
558
+ if len(word) == 1:
559
+ break
560
+ else:
561
+ pairs = get_pairs(word)
562
+ word = " ".join(word)
563
+ self.cache[token] = word
564
+ return word
565
+
566
+ def encode(self, text):
567
+ bpe_tokens = []
568
+ text = whitespace_clean(basic_clean(text)).lower()
569
+ for token in re.findall(self.pat, text):
570
+ token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
571
+ bpe_tokens.extend(
572
+ self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ")
573
+ )
574
+ return bpe_tokens
575
+
576
+ def decode(self, tokens):
577
+ text = "".join([self.decoder[token] for token in tokens])
578
+ text = (
579
+ bytearray([self.byte_decoder[c] for c in text])
580
+ .decode("utf-8", errors="replace")
581
+ .replace("</w>", " ")
582
+ )
583
+ return text
584
+
585
+ def __call__(self, texts, context_length=None):
586
+ if not context_length:
587
+ context_length = self.context_length
588
+
589
+ if isinstance(texts, str):
590
+ texts = [texts]
591
+
592
+ sot_token = self.encoder["<|startoftext|>"]
593
+ eot_token = self.encoder["<|endoftext|>"]
594
+ all_tokens = [[sot_token] + self.encode(text) + [eot_token] for text in texts]
595
+ result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
596
+
597
+ for i, tokens in enumerate(all_tokens):
598
+ tokens = tokens[:context_length]
599
+ result[i, : len(tokens)] = torch.tensor(tokens)
600
+
601
+ if len(result) == 1:
602
+ return result[0]
603
+ return result
604
+
605
+
606
+ class IMUPreprocessor(VerboseNNModule):
607
+ def __init__(
608
+ self,
609
+ kernel_size: int,
610
+ imu_stem: PatchEmbedGeneric,
611
+ embed_dim: int,
612
+ img_size: Tuple = (6, 2000),
613
+ num_cls_tokens: int = 1,
614
+ pos_embed_fn: Optional[Callable] = None,
615
+ init_param_style: str = "openclip",
616
+ ) -> None:
617
+ super().__init__()
618
+ self.imu_stem = imu_stem
619
+ self.embed_dim = embed_dim
620
+ self.use_pos_embed = pos_embed_fn is not None
621
+ self.num_cls_tokens = num_cls_tokens
622
+ self.kernel_size = kernel_size
623
+ self.pos_embed = nn.Parameter(
624
+ torch.empty(1, (img_size[1] // kernel_size) + num_cls_tokens, embed_dim)
625
+ )
626
+
627
+ if self.num_cls_tokens > 0:
628
+ self.cls_token = nn.Parameter(
629
+ torch.zeros(1, self.num_cls_tokens, self.embed_dim)
630
+ )
631
+
632
+ self.init_parameters(init_param_style)
633
+
634
+ @torch.no_grad()
635
+ def init_parameters(self, init_param_style):
636
+ nn.init.normal_(self.pos_embed, std=0.01)
637
+
638
+ if init_param_style == "openclip":
639
+ # OpenCLIP style initialization
640
+ scale = self.embed_dim**-0.5
641
+
642
+ if self.num_cls_tokens > 0:
643
+ nn.init.normal_(self.cls_token)
644
+ self.cls_token *= scale
645
+ elif init_param_style == "vit":
646
+ self.cls_token.data.fill_(0)
647
+ else:
648
+ raise ValueError(f"Unknown init {init_param_style}")
649
+
650
+ def tokenize_input_and_cls_pos(self, input, stem):
651
+ # tokens is of shape B x L x D
652
+ tokens = stem.norm_layer(stem.proj(input))
653
+ assert tokens.ndim == 3
654
+ assert tokens.shape[2] == self.embed_dim
655
+ B = tokens.shape[0]
656
+ if self.num_cls_tokens > 0:
657
+ class_tokens = self.cls_token.expand(
658
+ B, -1, -1
659
+ ) # stole class_tokens impl from Phil Wang, thanks
660
+ tokens = torch.cat((class_tokens, tokens), dim=1)
661
+ if self.use_pos_embed:
662
+ tokens = tokens + self.pos_embed
663
+ return tokens
664
+
665
+ def forward(self, imu):
666
+ # Patchify
667
+ imu = imu.unfold(
668
+ -1,
669
+ self.kernel_size,
670
+ self.kernel_size,
671
+ ).permute(0, 2, 1, 3)
672
+ imu = imu.reshape(imu.size(0), imu.size(1), -1)
673
+
674
+ imu_tokens = self.tokenize_input_and_cls_pos(
675
+ imu,
676
+ self.imu_stem,
677
+ )
678
+
679
+ return_dict = {
680
+ "trunk": {
681
+ "tokens": imu_tokens,
682
+ },
683
+ "head": {},
684
+ }
685
+ return return_dict
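For orientation, a brief sketch of the text path (illustrative only, not part of this commit): SimpleTokenizer is the CLIP BPE tokenizer, so it cleans and lower-cases the input, brackets it with <|startoftext|> / <|endoftext|>, and zero-pads to context_length=77. TextPreprocessor can later recover the sequence end with argmax because <|endoftext|> has the highest id in the vocabulary.

from imagebind.data import return_bpe_path
from imagebind.models.multimodal_preprocessors import SimpleTokenizer

tokenizer = SimpleTokenizer(bpe_path=return_bpe_path())
tokens = tokenizer("A lion roaring!")   # a single string returns a 1-D LongTensor
print(tokens.shape)    # torch.Size([77])
print(tokens[:6])      # start token, BPE ids for the cleaned text, end token
batch = tokenizer(["a lion roaring", "ocean waves crashing"])  # a list returns shape (2, 77)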
imagebind/models/transformer.py ADDED
@@ -0,0 +1,280 @@
1
+ #!/usr/bin/env python3
2
+ # Portions Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ # All rights reserved.
4
+
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ # Code modified from
9
+ # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py ;
10
+ # https://github.com/facebookresearch/deit/blob/main/models.py
11
+ # and https://github.com/facebookresearch/vissl/blob/main/vissl/models/trunks/vision_transformer.py
12
+
13
+
14
+ from functools import partial
15
+ from typing import Callable, List, Optional
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.utils.checkpoint as checkpoint
20
+ from timm.models.layers import DropPath, trunc_normal_
21
+
22
+
23
+ class Attention(nn.Module):
24
+ def __init__(
25
+ self,
26
+ dim,
27
+ num_heads=8,
28
+ qkv_bias=False,
29
+ qk_scale=None,
30
+ attn_drop=0.0,
31
+ proj_drop=0.0,
32
+ ):
33
+ super().__init__()
34
+ self.num_heads = num_heads
35
+ head_dim = dim // num_heads
36
+ # NOTE scale factor was wrong in my original version,
37
+ # can set manually to be compat with prev weights
38
+ self.scale = qk_scale or head_dim**-0.5
39
+
40
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
41
+ self.attn_drop = nn.Dropout(attn_drop)
42
+ self.proj = nn.Linear(dim, dim)
43
+ self.proj_drop = nn.Dropout(proj_drop)
44
+
45
+ def forward(self, x):
46
+ B, N, C = x.shape
47
+ qkv = (
48
+ self.qkv(x)
49
+ .reshape(B, N, 3, self.num_heads, C // self.num_heads)
50
+ .permute(2, 0, 3, 1, 4)
51
+ )
52
+ q, k, v = (
53
+ qkv[0],
54
+ qkv[1],
55
+ qkv[2],
56
+ ) # make torchscript happy (cannot use tensor as tuple)
57
+
58
+ attn = (q @ k.transpose(-2, -1)) * self.scale
59
+ attn = attn.softmax(dim=-1)
60
+ attn = self.attn_drop(attn)
61
+
62
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
63
+ x = self.proj(x)
64
+ x = self.proj_drop(x)
65
+ return x
66
+
67
+
68
+ class Mlp(nn.Module):
69
+ def __init__(
70
+ self,
71
+ in_features,
72
+ hidden_features=None,
73
+ out_features=None,
74
+ act_layer=nn.GELU,
75
+ drop=0.0,
76
+ ):
77
+ super().__init__()
78
+ out_features = out_features or in_features
79
+ hidden_features = hidden_features or in_features
80
+ self.fc1 = nn.Linear(in_features, hidden_features)
81
+ self.act = act_layer()
82
+ self.fc2 = nn.Linear(hidden_features, out_features)
83
+ self.drop = nn.Dropout(drop)
84
+
85
+ def forward(self, x):
86
+ x = self.fc1(x)
87
+ x = self.act(x)
88
+ x = self.drop(x)
89
+ x = self.fc2(x)
90
+ x = self.drop(x)
91
+ return x
92
+
93
+
94
+ class MultiheadAttention(nn.MultiheadAttention):
95
+ def forward(self, x: torch.Tensor, attn_mask: torch.Tensor):
96
+ return super().forward(x, x, x, need_weights=False, attn_mask=attn_mask)[0]
97
+
98
+
99
+ class ViTAttention(Attention):
100
+ def forward(self, x: torch.Tensor, attn_mask: torch.Tensor):
101
+ assert attn_mask is None
102
+ return super().forward(x)
103
+
104
+
105
+ class BlockWithMasking(nn.Module):
106
+ def __init__(
107
+ self,
108
+ dim: int,
109
+ attn_target: Callable,
110
+ mlp_ratio: int = 4,
111
+ act_layer: Callable = nn.GELU,
112
+ norm_layer: Callable = nn.LayerNorm,
113
+ ffn_dropout_rate: float = 0.0,
114
+ drop_path: float = 0.0,
115
+ layer_scale_type: Optional[str] = None,
116
+ layer_scale_init_value: float = 1e-4,
117
+ ):
118
+ super().__init__()
119
+
120
+ assert not isinstance(
121
+ attn_target, nn.Module
122
+ ), "attn_target should be a Callable. Otherwise attn_target is shared across blocks!"
123
+ self.attn = attn_target()
124
+ if drop_path > 0.0:
125
+ self.drop_path = DropPath(drop_path)
126
+ else:
127
+ self.drop_path = nn.Identity()
128
+ self.norm_1 = norm_layer(dim)
129
+ mlp_hidden_dim = int(mlp_ratio * dim)
130
+ self.mlp = Mlp(
131
+ in_features=dim,
132
+ hidden_features=mlp_hidden_dim,
133
+ act_layer=act_layer,
134
+ drop=ffn_dropout_rate,
135
+ )
136
+ self.norm_2 = norm_layer(dim)
137
+ self.layer_scale_type = layer_scale_type
138
+ if self.layer_scale_type is not None:
139
+ assert self.layer_scale_type in [
140
+ "per_channel",
141
+ "scalar",
142
+ ], f"Found Layer scale type {self.layer_scale_type}"
143
+ if self.layer_scale_type == "per_channel":
144
+ # one gamma value per channel
145
+ gamma_shape = [1, 1, dim]
146
+ elif self.layer_scale_type == "scalar":
147
+ # single gamma value for all channels
148
+ gamma_shape = [1, 1, 1]
149
+ # two gammas: for each part of the fwd in the encoder
150
+ self.layer_scale_gamma1 = nn.Parameter(
151
+ torch.ones(size=gamma_shape) * layer_scale_init_value,
152
+ requires_grad=True,
153
+ )
154
+ self.layer_scale_gamma2 = nn.Parameter(
155
+ torch.ones(size=gamma_shape) * layer_scale_init_value,
156
+ requires_grad=True,
157
+ )
158
+
159
+ def forward(self, x: torch.Tensor, attn_mask: torch.Tensor):
160
+ if self.layer_scale_type is None:
161
+ x = x + self.drop_path(self.attn(self.norm_1(x), attn_mask))
162
+ x = x + self.drop_path(self.mlp(self.norm_2(x)))
163
+ else:
164
+ x = (
165
+ x
166
+ + self.drop_path(self.attn(self.norm_1(x), attn_mask))
167
+ * self.layer_scale_gamma1
168
+ )
169
+ x = x + self.drop_path(self.mlp(self.norm_2(x))) * self.layer_scale_gamma2
170
+ return x
171
+
172
+
173
+ _LAYER_NORM = partial(nn.LayerNorm, eps=1e-6)
174
+
175
+
176
+ class SimpleTransformer(nn.Module):
177
+ def __init__(
178
+ self,
179
+ attn_target: Callable,
180
+ embed_dim: int,
181
+ num_blocks: int,
182
+ block: Callable = BlockWithMasking,
183
+ pre_transformer_layer: Optional[Callable] = None,
184
+ post_transformer_layer: Optional[Callable] = None,
185
+ drop_path_rate: float = 0.0,
186
+ drop_path_type: str = "progressive",
187
+ norm_layer: Callable = _LAYER_NORM,
188
+ mlp_ratio: int = 4,
189
+ ffn_dropout_rate: float = 0.0,
190
+ layer_scale_type: Optional[str] = None, # from cait; possible values are None, "per_channel", "scalar"
191
+ layer_scale_init_value: float = 1e-4, # from cait; float
192
+ weight_init_style: str = "jax", # possible values jax or pytorch
193
+ ):
194
+ """
195
+ Simple Transformer with the following features
196
+ 1. Supports masked attention
197
+ 2. Supports DropPath
198
+ 3. Supports LayerScale
199
+ 4. Supports Dropout in Attention and FFN
200
+ 5. Makes few assumptions about the input except that it is a Tensor
201
+ """
202
+ super().__init__()
203
+ self.pre_transformer_layer = pre_transformer_layer
204
+ if drop_path_type == "progressive":
205
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, num_blocks)]
206
+ elif drop_path_type == "uniform":
207
+ dpr = [drop_path_rate for i in range(num_blocks)]
208
+ else:
209
+ raise ValueError(f"Unknown drop_path_type: {drop_path_type}")
210
+
211
+ self.blocks = nn.Sequential(
212
+ *[
213
+ block(
214
+ dim=embed_dim,
215
+ attn_target=attn_target,
216
+ mlp_ratio=mlp_ratio,
217
+ ffn_dropout_rate=ffn_dropout_rate,
218
+ drop_path=dpr[i],
219
+ norm_layer=norm_layer,
220
+ layer_scale_type=layer_scale_type,
221
+ layer_scale_init_value=layer_scale_init_value,
222
+ )
223
+ for i in range(num_blocks)
224
+ ]
225
+ )
226
+ self.post_transformer_layer = post_transformer_layer
227
+ self.weight_init_style = weight_init_style
228
+ self.apply(self._init_weights)
229
+
230
+ def _init_weights(self, m):
231
+ if isinstance(m, nn.Linear):
232
+ if self.weight_init_style == "jax":
233
+ # Based on MAE and official Jax ViT implementation
234
+ torch.nn.init.xavier_uniform_(m.weight)
235
+ elif self.weight_init_style == "pytorch":
236
+ # PyTorch ViT uses trunc_normal_
237
+ trunc_normal_(m.weight, std=0.02)
238
+
239
+ if m.bias is not None:
240
+ nn.init.constant_(m.bias, 0)
241
+ elif isinstance(m, (nn.LayerNorm)):
242
+ nn.init.constant_(m.bias, 0)
243
+ nn.init.constant_(m.weight, 1.0)
244
+
245
+ def forward(
246
+ self,
247
+ tokens: torch.Tensor,
248
+ attn_mask: torch.Tensor = None,
249
+ use_checkpoint: bool = False,
250
+ checkpoint_every_n: int = 1,
251
+ checkpoint_blk_ids: Optional[List[int]] = None,
252
+ ):
253
+ """
254
+ Inputs
255
+ - tokens: data of shape N x L x D (or L x N x D depending on the attention implementation)
256
+ - attn_mask: attention mask of shape L x L
257
+
258
+ Output
259
+ - x: data of shape N x L x D (or L x N x D depending on the attention implementation)
260
+ """
261
+ if self.pre_transformer_layer:
262
+ tokens = self.pre_transformer_layer(tokens)
263
+ if use_checkpoint and checkpoint_blk_ids is None:
264
+ checkpoint_blk_ids = [
265
+ blk_id
266
+ for blk_id in range(len(self.blocks))
267
+ if blk_id % checkpoint_every_n == 0
268
+ ]
269
+ if checkpoint_blk_ids:
270
+ checkpoint_blk_ids = set(checkpoint_blk_ids)
271
+ for blk_id, blk in enumerate(self.blocks):
272
+ if use_checkpoint and blk_id in checkpoint_blk_ids:
273
+ tokens = checkpoint.checkpoint(
274
+ blk, tokens, attn_mask, use_reentrant=False
275
+ )
276
+ else:
277
+ tokens = blk(tokens, attn_mask=attn_mask)
278
+ if self.post_transformer_layer:
279
+ tokens = self.post_transformer_layer(tokens)
280
+ return tokens
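The assertion in BlockWithMasking above requires `attn_target` to be a factory (a zero-argument callable) rather than a module, so every block builds its own attention weights. A minimal sketch of wiring SimpleTransformer up that way; the embedding size, head count, and other settings here are illustrative choices, not ImageBind's actual configuration:

from functools import partial

import torch

from imagebind.models.transformer import MultiheadAttention, SimpleTransformer

# attn_target is called once per block, producing an independent attention module each time.
transformer = SimpleTransformer(
    attn_target=partial(MultiheadAttention, embed_dim=256, num_heads=8, batch_first=True),
    embed_dim=256,
    num_blocks=2,
    drop_path_rate=0.1,
    layer_scale_type="per_channel",
)

tokens = torch.randn(2, 16, 256)           # (N, L, D) because batch_first=True
out = transformer(tokens, attn_mask=None)  # same shape as the input
print(out.shape)                           # torch.Size([2, 16, 256])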
pipeline.py ADDED
@@ -0,0 +1,602 @@
1
+ import torchvision.io
2
+ from einops import rearrange, repeat
3
+ import numpy as np
4
+ import inspect
5
+ from typing import List, Optional, Union, Tuple
6
+
7
+ import os
8
+ import PIL
9
+ import torch
10
+ import torchaudio
11
+ import torchvision.io
12
+ import torchvision.transforms as transforms
13
+
14
+ from transformers import ImageProcessingMixin
15
+
16
+ from diffusers.loaders import TextualInversionLoaderMixin
17
+ from diffusers.models import AutoencoderKL
18
+ from diffusers.schedulers import KarrasDiffusionSchedulers, PNDMScheduler
19
+ from diffusers.utils import logging
20
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
21
+ from diffusers.image_processor import VaeImageProcessor
22
+
23
+ from unet import AudioUNet3DConditionModel
24
+ from audio_encoder import ImageBindSegmaskAudioEncoder
25
+ from imagebind.data import waveform2melspec
26
+
27
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
28
+
29
+
30
+ def waveform_to_melspectrogram(
31
+ waveform: Union[np.ndarray, torch.Tensor],
32
+ num_mel_bins=128,
33
+ target_length=204,
34
+ sample_rate=16000,
35
+ clip_duration=2.,
36
+ mean=-4.268,
37
+ std=9.138
38
+ ):
39
+ if isinstance(waveform, np.ndarray):
40
+ waveform = torch.from_numpy(waveform)
41
+
42
+ audio_length = waveform.shape[1]
43
+ audio_target_length = int(clip_duration * sample_rate)
44
+
45
+ audio_start_idx = 0
46
+ if audio_length > audio_target_length:
47
+ audio_start_idx = (audio_length - audio_target_length) // 2
48
+ audio_end_idx = audio_start_idx + audio_target_length
49
+ waveform_clip = waveform[:, audio_start_idx:audio_end_idx]
50
+
51
+ waveform_melspec = waveform2melspec(
52
+ waveform_clip, sample_rate, num_mel_bins, target_length
53
+ ) # (1, n_mel, n_frame)
54
+
55
+ normalize = transforms.Normalize(mean=mean, std=std)
56
+
57
+ audio_clip = normalize(waveform_melspec)
58
+
59
+ return audio_clip # (1, freq, time)
60
+
61
+
62
+ class AudioMelspectrogramExtractor(ImageProcessingMixin):
63
+
64
+ def __init__(
65
+ self,
66
+ num_mel_bins=128,
67
+ target_length=204,
68
+ sample_rate=16000,
69
+ clip_duration=2,
70
+ mean=-4.268,
71
+ std=9.138
72
+ ):
73
+ super().__init__()
74
+ self.num_mel_bins = num_mel_bins
75
+ self.target_length = target_length
76
+ self.sample_rate = sample_rate
77
+ self.clip_duration = clip_duration
78
+ self.mean = mean
79
+ self.std = std
80
+
81
+ @property
82
+ def max_length_s(self) -> int:
83
+ return self.clip_duration
84
+
85
+ @property
86
+ def sampling_rate(self) -> int:
87
+ return self.sample_rate
88
+
89
+ def __call__(
90
+ self,
91
+ waveforms: Union[
92
+ np.ndarray,
93
+ torch.Tensor,
94
+ List[np.ndarray],
95
+ List[torch.Tensor]
96
+ ]
97
+ ):
98
+ if isinstance(waveforms, (np.ndarray, torch.Tensor)) and waveforms.ndim == 2:
99
+ waveforms = [waveforms, ]
100
+ features = []
101
+
102
+ for waveform in waveforms:
103
+ feature = waveform_to_melspectrogram(
104
+ waveform=waveform,
105
+ num_mel_bins=self.num_mel_bins,
106
+ target_length=self.target_length,
107
+ sample_rate=self.sample_rate,
108
+ clip_duration=self.clip_duration,
109
+ mean=self.mean,
110
+ std=self.std
111
+ )
112
+ features.append(feature)
113
+ features = torch.stack(features, dim=0)
114
+
115
+ return features # (b c n t)
116
+
117
+
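A small sketch of what the extractor above produces: a 2-second, 16 kHz mono waveform becomes one normalized mel-spectrogram of shape (1, 1, 128, 204), matching the shapes noted in the code comments. The random waveform is synthetic and only for illustration; run it from the repository root so `pipeline` resolves to this file:

import torch

from pipeline import AudioMelspectrogramExtractor

extractor = AudioMelspectrogramExtractor()  # defaults: 128 mel bins, 204 frames, 16 kHz, 2 s clips
waveform = torch.randn(1, 32000)            # (channels, samples): 2 s of 16 kHz mono audio
melspec = extractor([waveform])             # list of waveforms -> batched spectrograms (b c n t)
print(melspec.shape)                        # torch.Size([1, 1, 128, 204])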
118
+ class AudioCondAnimationPipeline(DiffusionPipeline, TextualInversionLoaderMixin):
119
+ """
120
+ Pipeline for audio-conditioned image animation: generating a short video from a still image, a driving audio clip, and a category text encoding.
121
+
122
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
123
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
124
+
125
+ Args:
126
+ audio_encoder ([`ImageBindSegmaskAudioEncoder`]):
127
+ Audio encoder to embed the conditioning audio melspectrograms.
128
+ unet ([`AudioUNet3DConditionModel`]): Conditional 3D U-Net architecture to denoise the encoded video latents.
129
+ scheduler ([`KarrasDiffusionSchedulers`]):
130
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents.
131
+ vae ([`AutoencoderKL`]):
132
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
133
+ """
134
+ unet: AudioUNet3DConditionModel
135
+ scheduler: KarrasDiffusionSchedulers
136
+ vae: AutoencoderKL
137
+ audio_encoder: ImageBindSegmaskAudioEncoder
138
+
139
+ def __init__(
140
+ self,
141
+ unet: AudioUNet3DConditionModel,
142
+ scheduler: KarrasDiffusionSchedulers,
143
+ vae: AutoencoderKL,
144
+ audio_encoder: ImageBindSegmaskAudioEncoder,
145
+ null_text_encodings_path: str = ""
146
+ ):
147
+ super().__init__()
148
+
149
+ self.register_modules(
150
+ unet=unet,
151
+ scheduler=scheduler,
152
+ vae=vae,
153
+ audio_encoder=audio_encoder
154
+ )
155
+
156
+ if null_text_encodings_path:
157
+ self.null_text_encoding = torch.load(null_text_encodings_path).view(1, 77, 768)
158
+
159
+ self.melspectrogram_shape = (128, 204)
160
+
161
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
162
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
163
+ self.audio_processor = AudioMelspectrogramExtractor()
164
+
165
+ @torch.no_grad()
166
+ def encode_text(
167
+ self,
168
+ text_encodings,
169
+ device,
170
+ dtype,
171
+ do_text_classifier_free_guidance,
172
+ do_audio_classifier_free_guidance,
173
+ ):
174
+ if isinstance(text_encodings, (List, Tuple)):
175
+ text_encodings = torch.cat(text_encodings)
176
+
177
+ text_encodings = text_encodings.to(dtype=dtype, device=device)
178
+ batch_size = len(text_encodings)
179
+
180
+ # get unconditional embeddings for classifier free guidance
181
+ if do_text_classifier_free_guidance:
182
+ if not hasattr(self, "null_text_encoding"):
183
+ uncond_token = ""
184
+
185
+ max_length = text_encodings.shape[1]
186
+ uncond_input = self.tokenizer(
187
+ uncond_token,
188
+ padding="max_length",
189
+ max_length=max_length,
190
+ truncation=True,
191
+ return_tensors="pt",
192
+ )
193
+
194
+ if hasattr(self.text_encoder.config,
195
+ "use_attention_mask") and self.text_encoder.config.use_attention_mask:
196
+ attention_mask = uncond_input.attention_mask.to(device)
197
+ else:
198
+ attention_mask = None
199
+
200
+ uncond_text_encodings = self.text_encoder(
201
+ uncond_input.input_ids.to(device),
202
+ attention_mask=attention_mask,
203
+ )
204
+ uncond_text_encodings = uncond_text_encodings[0]
205
+
206
+ else:
207
+ uncond_text_encodings = self.null_text_encoding
208
+
209
+ uncond_text_encodings = repeat(uncond_text_encodings, "1 n d -> b n d", b=batch_size).contiguous()
210
+ uncond_text_encodings = uncond_text_encodings.to(dtype=dtype, device=device)
211
+
212
+ if do_text_classifier_free_guidance and do_audio_classifier_free_guidance: # dual cfg
213
+ text_encodings = torch.cat([uncond_text_encodings, text_encodings, text_encodings])
214
+ elif do_text_classifier_free_guidance: # only text cfg
215
+ text_encodings = torch.cat([uncond_text_encodings, text_encodings])
216
+ elif do_audio_classifier_free_guidance: # only audio cfg
217
+ text_encodings = torch.cat([text_encodings, text_encodings])
218
+
219
+ return text_encodings
220
+
221
+ @torch.no_grad()
222
+ def encode_audio(
223
+ self,
224
+ audios: Union[List[np.ndarray], List[torch.Tensor]],
225
+ video_length: int = 12,
226
+ do_text_classifier_free_guidance: bool = False,
227
+ do_audio_classifier_free_guidance: bool = False,
228
+ device: torch.device = torch.device("cuda:0"),
229
+ dtype: torch.dtype = torch.float32
230
+ ):
231
+ batch_size = len(audios)
232
+ melspectrograms = self.audio_processor(audios).to(device=device, dtype=dtype) # (b c n t)
233
+
234
+ # audio_encodings: (b, n, c)
235
+ # audio_masks: (b, s, n)
236
+ _, audio_encodings, audio_masks = self.audio_encoder(
237
+ melspectrograms, normalize=False, return_dict=False
238
+ )
239
+ audio_encodings = repeat(audio_encodings, "b n c -> b f n c", f=video_length)
240
+
241
+ if do_audio_classifier_free_guidance:
242
+ null_melspectrograms = torch.zeros(1, 1, *self.melspectrogram_shape).to(device=device, dtype=dtype)
243
+ _, null_audio_encodings, null_audio_masks = self.audio_encoder(
244
+ null_melspectrograms, normalize=False, return_dict=False
245
+ )
246
+ null_audio_encodings = repeat(null_audio_encodings, "1 n c -> b f n c", b=batch_size, f=video_length)
247
+
248
+ if do_text_classifier_free_guidance and do_audio_classifier_free_guidance: # dual cfg
249
+ audio_encodings = torch.cat([null_audio_encodings, null_audio_encodings, audio_encodings])
250
+ audio_masks = torch.cat([null_audio_masks, null_audio_masks, audio_masks])
251
+ elif do_text_classifier_free_guidance: # only text cfg
252
+ audio_encodings = torch.cat([audio_encodings, audio_encodings])
253
+ audio_masks = torch.cat([audio_masks, audio_masks])
254
+ elif do_audio_classifier_free_guidance: # only audio cfg
255
+ audio_encodings = torch.cat([null_audio_encodings, audio_encodings])
256
+ audio_masks = torch.cat([null_audio_masks, audio_masks])
257
+
258
+ return audio_encodings, audio_masks
259
+
260
+ @torch.no_grad()
261
+ def encode_latents(self, image: torch.Tensor):
262
+ dtype = self.vae.dtype
263
+ image = image.to(device=self.device, dtype=dtype)
264
+ image_latents = self.vae.encode(image).latent_dist.sample() * self.vae.config.scaling_factor
265
+ return image_latents
266
+
267
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
268
+ @torch.no_grad()
269
+ def decode_latents(self, latents):
270
+ dtype = next(self.vae.parameters()).dtype
271
+ latents = latents.to(dtype=dtype)
272
+ latents = 1 / self.vae.config.scaling_factor * latents
273
+ image = self.vae.decode(latents).sample
274
+ image = (image / 2 + 0.5).clamp(0, 1).cpu().float() # ((b t) c h w)
275
+ return image
276
+
277
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
278
+ def prepare_extra_step_kwargs(self, generator, eta):
279
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
280
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
281
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
282
+ # and should be between [0, 1]
283
+
284
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
285
+ extra_step_kwargs = {}
286
+ if accepts_eta:
287
+ extra_step_kwargs["eta"] = eta
288
+
289
+ # check if the scheduler accepts generator
290
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
291
+ if accepts_generator:
292
+ extra_step_kwargs["generator"] = generator
293
+ return extra_step_kwargs
294
+
295
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
296
+ def prepare_video_latents(
297
+ self,
298
+ image_latents: torch.Tensor,
299
+ num_channels_latents: int,
300
+ video_length: int = 12,
301
+ height: int = 256,
302
+ width: int = 256,
303
+ device: torch.device = torch.device("cuda"),
304
+ dtype: torch.dtype = torch.float32,
305
+ generator: Optional[torch.Generator] = None,
306
+ ):
307
+ batch_size = len(image_latents)
308
+ shape = (
309
+ batch_size,
310
+ num_channels_latents,
311
+ video_length - 1,
312
+ height // self.vae_scale_factor,
313
+ width // self.vae_scale_factor
314
+ )
315
+
316
+ image_latents = image_latents.unsqueeze(2) # (b c 1 h w)
317
+ rand_noise = torch.randn(shape, generator=generator, device=device, dtype=dtype)
318
+ noise_latents = torch.cat([image_latents, rand_noise], dim=2)
319
+
320
+ # scale the initial noise by the standard deviation required by the scheduler
321
+ noise_latents = noise_latents * self.scheduler.init_noise_sigma
322
+
323
+ return noise_latents
324
+
325
+ @torch.no_grad()
326
+ def __call__(
327
+ self,
328
+ images: List[PIL.Image.Image],
329
+ audios: Union[List[np.ndarray], List[torch.Tensor]],
330
+ text_encodings: List[torch.Tensor],
331
+ video_length: int = 12,
332
+ height: int = 256,
333
+ width: int = 256,
334
+ num_inference_steps: int = 20,
335
+ audio_guidance_scale: float = 4.0,
336
+ text_guidance_scale: float = 1.0,
337
+ generator: Optional[torch.Generator] = None,
338
+ return_dict: bool = True
339
+ ):
340
+ # 0. Default height and width to unet
341
+ device = self.device
342
+ dtype = self.dtype
343
+
344
+ batch_size = len(images)
345
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
346
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
347
+
348
+ do_text_classifier_free_guidance = (text_guidance_scale > 1.0)
349
+ do_audio_classifier_free_guidance = (audio_guidance_scale > 1.0)
350
+
351
+ # 1. Encode text into ((k b) f n d)
352
+ text_encodings = self.encode_text(
353
+ text_encodings=text_encodings,
354
+ device=device,
355
+ dtype=dtype,
356
+ do_text_classifier_free_guidance=do_text_classifier_free_guidance,
357
+ do_audio_classifier_free_guidance=do_audio_classifier_free_guidance
358
+ ) # ((k b), n, d)
359
+ text_encodings = repeat(text_encodings, "b n d -> b t n d", t=video_length).to(device=device, dtype=dtype)
360
+
361
+ # 2. Encode audio
362
+ # audio_encodings: ((k b), n, d)
363
+ # audio_masks: ((k b), s, n)
364
+ audio_encodings, audio_masks = self.encode_audio(
365
+ audios, video_length, do_text_classifier_free_guidance, do_audio_classifier_free_guidance, device, dtype
366
+ )
367
+
368
+ # 3. Prepare image latent
369
+ image = self.image_processor.preprocess(images)
370
+ image_latents = self.encode_latents(image).to(device=device, dtype=dtype) # (b c h w)
371
+
372
+ # 4. Prepare unet noising video latents
373
+ video_latents = self.prepare_video_latents(
374
+ image_latents=image_latents,
375
+ num_channels_latents=self.unet.config.in_channels,
376
+ video_length=video_length,
377
+ height=height,
378
+ width=width,
379
+ dtype=dtype,
380
+ device=device,
381
+ generator=generator,
382
+ ) # (b c f h w)
383
+
384
+ # 5. Prepare timesteps and extra step kwargs
385
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
386
+ timesteps = self.scheduler.timesteps
387
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta=0.0)
388
+
389
+ # 6. Denoising loop
390
+ for i, t in enumerate(self.progress_bar(timesteps)):
391
+ latent_model_input = [video_latents]
392
+ if do_text_classifier_free_guidance:
393
+ latent_model_input.append(video_latents)
394
+ if do_audio_classifier_free_guidance:
395
+ latent_model_input.append(video_latents)
396
+ latent_model_input = torch.cat(latent_model_input)
397
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
398
+
399
+ # predict the noise residual
400
+ noise_pred = self.unet(
401
+ latent_model_input,
402
+ t,
403
+ encoder_hidden_states=text_encodings,
404
+ audio_encoder_hidden_states=audio_encodings,
405
+ audio_attention_mask=audio_masks
406
+ ).sample
407
+
408
+ # perform guidance
409
+ if do_text_classifier_free_guidance and do_audio_classifier_free_guidance: # dual cfg
410
+ noise_pred_uncond, noise_pred_text, noise_pred_text_audio = noise_pred.chunk(3)
411
+ noise_pred = noise_pred_uncond + \
412
+ text_guidance_scale * (noise_pred_text - noise_pred_uncond) + \
413
+ audio_guidance_scale * (noise_pred_text_audio - noise_pred_text)
414
+ elif do_text_classifier_free_guidance: # only text cfg
415
+ noise_pred_audio, noise_pred_text_audio = noise_pred.chunk(2)
416
+ noise_pred = noise_pred_audio + \
417
+ text_guidance_scale * (noise_pred_text_audio - noise_pred_audio)
418
+ elif do_audio_classifier_free_guidance: # only audio cfg
419
+ noise_pred_text, noise_pred_text_audio = noise_pred.chunk(2)
420
+ noise_pred = noise_pred_text + \
421
+ audio_guidance_scale * (noise_pred_text_audio - noise_pred_text)
422
+
423
+ # The first-frame latent always serves as the unchanged conditioning frame
424
+ video_latents[:, :, 1:, :, :] = self.scheduler.step(noise_pred[:, :, 1:, :, :], t,
425
+ video_latents[:, :, 1:, :, :],
426
+ **extra_step_kwargs).prev_sample
427
+ video_latents = video_latents.contiguous()
428
+
429
+ # 7. Post-processing
430
+ video_latents = rearrange(video_latents, "b c f h w -> (b f) c h w")
431
+ videos = self.decode_latents(video_latents).detach().cpu()
432
+ videos = rearrange(videos, "(b f) c h w -> b f c h w", f=video_length) # value range [0, 1]
433
+
434
+ if not return_dict:
435
+ return videos
436
+
437
+ return {"videos": videos}
438
+
439
+
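The dual classifier-free guidance in `__call__` above stacks [unconditional, text-only, text+audio] predictions along the batch dimension and combines them with two scales. A toy sketch of just that arithmetic on random tensors; the guidance scales and latent shape below are illustrative, not the pipeline's real values:

import torch

text_guidance_scale, audio_guidance_scale = 7.5, 4.0

# Stacked predictions for one sample: [unconditional, text-only, text+audio].
noise_pred = torch.randn(3, 4, 12, 32, 32)
noise_pred_uncond, noise_pred_text, noise_pred_text_audio = noise_pred.chunk(3)

guided = noise_pred_uncond \
    + text_guidance_scale * (noise_pred_text - noise_pred_uncond) \
    + audio_guidance_scale * (noise_pred_text_audio - noise_pred_text)
print(guided.shape)  # torch.Size([1, 4, 12, 32, 32])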
440
+ def load_and_transform_images_stable_diffusion(
441
+ images: Union[List[np.ndarray], torch.Tensor, np.ndarray],
442
+ size=512,
443
+ flip=False,
444
+ randcrop=False,
445
+ normalize=True
446
+ ):
447
+ """
448
+ @images: (List of) np.uint8 images of shape (h, w, 3)
449
+ or tensor of shape (b, c, h, w) in [0., 1.0]
450
+
451
+ """
452
+
453
+ assert isinstance(images, (List, torch.Tensor, np.ndarray)), type(images)
454
+ if isinstance(images, List):
455
+ assert isinstance(images[0], np.ndarray)
456
+ assert images[0].dtype == np.uint8
457
+ assert images[0].shape[2] == 3
458
+
459
+ # convert np images into torch float tensor
460
+ images = torch.from_numpy(
461
+ rearrange(np.stack(images, axis=0), "f h w c -> f c h w")
462
+ ).float() / 255.
463
+ elif isinstance(images, np.ndarray):
464
+ assert isinstance(images, np.ndarray)
465
+ assert images.dtype == np.uint8
466
+ assert images.shape[3] == 3
467
+
468
+ # convert np images into torch float tensor
469
+ images = torch.from_numpy(
470
+ rearrange(images, "f h w c -> f c h w")
471
+ ).float() / 255.
472
+
473
+ assert images.shape[1] == 3
474
+ assert torch.all(images <= 1.0) and torch.all(images >= 0.0)
475
+
476
+ h, w = images.shape[-2:]
477
+ if isinstance(size, int):
478
+ target_h, target_w = size, size
479
+ else:
480
+ target_h, target_w = size
481
+
482
+ # first crop the image
483
+ target_aspect_ratio = float(target_h) / target_w
484
+ curr_aspect_ratio = float(h) / w
485
+ if target_aspect_ratio >= curr_aspect_ratio: # trim w
486
+ trimmed_w = int(h / target_aspect_ratio)
487
+ images = images[:, :, :, (w - trimmed_w) // 2: (w - trimmed_w) // 2 + trimmed_w]
488
+ else: # trim h
489
+ trimmed_h = int(w * target_aspect_ratio)
490
+ images = images[:, :, (h - trimmed_h) // 2: (h - trimmed_h) // 2 + trimmed_h]
491
+
492
+ transform_list = [
493
+ transforms.Resize(
494
+ size,
495
+ interpolation=transforms.InterpolationMode.BILINEAR,
496
+ antialias=True
497
+ ),
498
+ ]
499
+
500
+ # assert not randcrop
501
+ if randcrop:
502
+ transform_list.append(transforms.RandomCrop(size))
503
+ else:
504
+ transform_list.append(transforms.CenterCrop(size))
505
+
506
+ if flip:
507
+ transform_list.append(transforms.RandomHorizontalFlip(p=1.0))
508
+
509
+ if normalize:
510
+ transform_list.append(transforms.Normalize([0.5], [0.5]))
511
+
512
+ data_transform = transforms.Compose(transform_list)
513
+
514
+ images = data_transform(images)
515
+ return images
516
+
517
+
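A quick sketch of the helper above on a synthetic uint8 frame: the non-square image is center-cropped, resized, and normalized into a single (1, 3, 256, 256) tensor in roughly [-1, 1]. The random frame is only for illustration:

import numpy as np

from pipeline import load_and_transform_images_stable_diffusion

frame = np.random.randint(0, 256, size=(480, 640, 3), dtype=np.uint8)  # synthetic H x W x 3 image
batch = load_and_transform_images_stable_diffusion([frame], size=256)
print(batch.shape)                                 # torch.Size([1, 3, 256, 256])
print(batch.min().item(), batch.max().item())      # roughly -1.0 .. 1.0 after Normalize([0.5], [0.5])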
518
+ def load_image(image_path):
519
+ image = PIL.Image.open(image_path).convert('RGB')
520
+
521
+ width, height = image.size
522
+ if width < height:
523
+ new_width = 256
524
+ new_height = int((256 / width) * height)
525
+ else:
526
+ new_height = 256
527
+ new_width = int((256 / height) * width)
528
+
529
+ # Rescale the image
530
+ image = image.resize((new_width, new_height), PIL.Image.LANCZOS)
531
+
532
+ # Crop a 256x256 square from the center
533
+ left = (new_width - 256) / 2
534
+ top = (new_height - 256) / 2
535
+ right = (new_width + 256) / 2
536
+ bottom = (new_height + 256) / 2
537
+ image = image.crop((left, top, right, bottom))
538
+
539
+ return image
540
+
541
+
542
+ def load_audio(audio_path):
543
+ audio, audio_sr = torchaudio.load(audio_path)
544
+ if audio.ndim == 1: audio = audio.unsqueeze(0)
545
+ else:
546
+ audio = audio.mean(dim=0).unsqueeze(0)
547
+ audio = torchaudio.functional.resample(audio, orig_freq=audio_sr, new_freq=16000)
548
+ audio = audio[:, :32000].contiguous().float()
549
+ if audio.shape[1] < 32000:
550
+ audio = torch.cat([audio, torch.ones(1, 32000-audio.shape[1]).float()], dim=1)
551
+
552
+ return audio.contiguous()
553
+
554
+
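Together, the two loaders above normalize arbitrary inputs to what the pipeline expects: a 256x256 RGB `PIL.Image` and a (1, 32000) mono waveform resampled to 16 kHz. A usage sketch; the file paths are placeholders, not files shipped in this commit:

from pipeline import load_audio, load_image

image = load_image("example_frame.png")   # placeholder path to any RGB image
audio = load_audio("example_sound.wav")   # placeholder path to any audio file

print(image.size)    # (256, 256)
print(audio.shape)   # torch.Size([1, 32000])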
555
+ @torch.no_grad()
556
+ def generate_videos(
557
+ pipeline,
558
+ image_path: str = '',
559
+ audio_path: str = '',
560
+ category_text_encoding: Optional[torch.Tensor] = None,
561
+ image_size: Tuple[int, int] = (256, 256),
562
+ video_fps: int = 6,
563
+ video_num_frame: int = 12,
564
+ audio_guidance_scale: float = 4.0,
565
+ denoising_step: int = 20,
566
+ text_guidance_scale: float = 1.0,
567
+ seed: int = 0,
568
+ save_path: str = "",
569
+ device: torch.device = torch.device("cuda"),
570
+ ):
571
+ image = load_image(image_path)
572
+ audio = load_audio(audio_path)
573
+
574
+ generator = torch.Generator(device=device)
575
+ generator.manual_seed(seed)
576
+ generated_video = pipeline(
577
+ images=[image],
578
+ audios=[audio],
579
+ text_encodings=[category_text_encoding],
580
+ video_length=video_num_frame,
581
+ height=image_size[0],
582
+ width=image_size[1],
583
+ num_inference_steps=denoising_step,
584
+ audio_guidance_scale=audio_guidance_scale,
585
+ text_guidance_scale=text_guidance_scale,
586
+ generator=generator,
587
+ return_dict=False
588
+ )[0] # (f c h w) in range [0, 1]
589
+ generated_video = (generated_video.permute(0, 2, 3, 1).contiguous() * 255).byte()
590
+
591
+ os.makedirs(os.path.dirname(save_path), exist_ok=True)
592
+ torchvision.io.write_video(
593
+ filename=save_path,
594
+ video_array=generated_video,
595
+ fps=video_fps,
596
+ audio_array=audio,
597
+ audio_fps=16000,
598
+ audio_codec="aac"
599
+ )
600
+
601
+ return
602
+
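A minimal sketch of driving `generate_videos` once an `AudioCondAnimationPipeline` has been assembled from the checkpoints in this repo and moved to GPU (as app.py does). The input paths and the category text-encoding file are placeholders, so this is a usage outline rather than a runnable demo on its own:

import torch

from pipeline import AudioCondAnimationPipeline, generate_videos


def run_demo(pipeline: AudioCondAnimationPipeline) -> None:
    # `pipeline` is assumed to already be built and on CUDA; all paths below are placeholders.
    category_text_encoding = torch.load("category_encoding.pt")  # placeholder (1, 77, 768) CLIP text encoding
    generate_videos(
        pipeline,
        image_path="input_frame.png",
        audio_path="input_sound.wav",
        category_text_encoding=category_text_encoding,
        image_size=(256, 256),
        video_fps=6,
        video_num_frame=12,
        audio_guidance_scale=4.0,
        text_guidance_scale=1.0,
        denoising_step=20,
        seed=0,
        save_path="outputs/result.mp4",
        device=torch.device("cuda"),
    )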
pretrained/openai-clip-l_null_text_encoding.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06170f5fa389ab44a9e12c27146a2b6569cdea6808a58ba341ce50903939da98
3
+ size 237430
pretrained/stable-diffusion-v1-5/scheduler/scheduler_config.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "_class_name": "PNDMScheduler",
3
+ "_diffusers_version": "0.6.0",
4
+ "beta_end": 0.012,
5
+ "beta_schedule": "scaled_linear",
6
+ "beta_start": 0.00085,
7
+ "num_train_timesteps": 1000,
8
+ "set_alpha_to_one": false,
9
+ "skip_prk_steps": true,
10
+ "steps_offset": 1,
11
+ "trained_betas": null,
12
+ "clip_sample": false
13
+ }
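The scheduler config above is a standard diffusers config, so it can be loaded directly; the local path matches the directory added in this commit:

from diffusers.schedulers import PNDMScheduler

scheduler = PNDMScheduler.from_pretrained("pretrained/stable-diffusion-v1-5/scheduler")
print(scheduler.config.num_train_timesteps)  # 1000
print(scheduler.config.beta_schedule)        # "scaled_linear"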
pretrained/stable-diffusion-v1-5/vae/config.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.6.0",
4
+ "act_fn": "silu",
5
+ "block_out_channels": [
6
+ 128,
7
+ 256,
8
+ 512,
9
+ 512
10
+ ],
11
+ "down_block_types": [
12
+ "DownEncoderBlock2D",
13
+ "DownEncoderBlock2D",
14
+ "DownEncoderBlock2D",
15
+ "DownEncoderBlock2D"
16
+ ],
17
+ "in_channels": 3,
18
+ "latent_channels": 4,
19
+ "layers_per_block": 2,
20
+ "norm_num_groups": 32,
21
+ "out_channels": 3,
22
+ "sample_size": 512,
23
+ "up_block_types": [
24
+ "UpDecoderBlock2D",
25
+ "UpDecoderBlock2D",
26
+ "UpDecoderBlock2D",
27
+ "UpDecoderBlock2D"
28
+ ]
29
+ }
pretrained/stable-diffusion-v1-5/vae/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b134cded8eb78b184aefb8805b6b572f36fa77b255c483665dda931fa0130c5
3
+ size 334707217
pretrained/stable-diffusion-v1-5/vae/diffusion_pytorch_model.fp16.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7643b3e40b9f128eda5fe174fea73c3ef3903562651fb344a79439709c2e503
3
+ size 167405651
pretrained/stable-diffusion-v1-5/vae/diffusion_pytorch_model.fp16.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fbcf0ebe55a0984f5a5e00d8c4521d52359af7229bb4d81890039d2aa16dd7c
3
+ size 167335342
pretrained/stable-diffusion-v1-5/vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2b5134f4dbc140d9c11f11cba3233099e00af40f262f136c691fb7d38d2194c
3
+ size 334643276
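With the four `block_out_channels` in the VAE config above, the pipeline's `vae_scale_factor` works out to 2^(4-1) = 8, so 256x256 frames are encoded into 32x32 latents. A sketch of loading the VAE from this folder and checking that:

from diffusers.models import AutoencoderKL

vae = AutoencoderKL.from_pretrained("pretrained/stable-diffusion-v1-5/vae")
vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1)
print(vae_scale_factor)         # 8
print(256 // vae_scale_factor)  # 32 -> spatial size of the video latents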
requirements.txt ADDED
@@ -0,0 +1,11 @@
1
+ accelerate==0.32.1
2
+ diffusers==0.29.2
3
+ einops==0.8.0
4
+ ftfy==6.2.0
5
+ imageio==2.34.2
6
+ iopath==0.1.10
7
+ pytorchvideo==0.1.5
8
+ timm==1.0.7
9
+ tqdm==4.66.4
10
+ transformers==4.42.4
11
+ wandb==0.17.5
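A small sanity check that the pinned core libraries are installed at the versions listed above, using the standard library's importlib.metadata; the subset checked here is just the packages the pipeline code imports directly:

from importlib.metadata import version

# Versions pinned in requirements.txt above.
assert version("diffusers") == "0.29.2"
assert version("transformers") == "4.42.4"
assert version("einops") == "0.8.0"
print("pinned versions OK")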
unet.py ADDED
@@ -0,0 +1,839 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from dataclasses import dataclass
15
+ from typing import Any, Dict, List, Optional, Tuple, Union
16
+ import os
17
+ import json
18
+ from einops import repeat
19
+
20
+ import torch
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+ import torch.utils.checkpoint
24
+
25
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
26
+ from diffusers.loaders import UNet2DConditionLoadersMixin
27
+ from diffusers.utils import BaseOutput, logging
28
+ from diffusers.models.attention_processor import AttentionProcessor, AttnProcessor
29
+ from diffusers.models.embeddings import GaussianFourierProjection, TextTimeEmbedding, TimestepEmbedding, Timesteps
30
+ from diffusers.models.modeling_utils import ModelMixin
31
+
32
+ from unet_blocks import (
33
+ all_modules,
34
+ get_down_block,
35
+ get_up_block,
36
+ get_mid_block,
37
+ )
38
+
39
+ from unet_utils import FFInflatedConv3d
40
+
41
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
42
+
43
+
44
+ @dataclass
45
+ class UNet3DConditionOutput(BaseOutput):
46
+ """
47
+ Args:
48
+ sample (`torch.FloatTensor` of shape `(batch_size, num_channels, num_frames, height, width)`):
49
+ Hidden states conditioned on `encoder_hidden_states` input. Output of last layer of model.
50
+ """
51
+
52
+ sample: torch.FloatTensor
53
+
54
+
55
+ class AudioUNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
56
+ r"""
57
+ AudioUNet3DConditionModel is a conditional 3D UNet that takes a noisy video sample, conditioning states (text and audio), and a timestep,
58
+ and returns a sample-shaped output.
59
+
60
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
61
+ implements for all the models (such as downloading or saving, etc.)
62
+
63
+ Parameters:
64
+ sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
65
+ Height and width of input/output sample.
66
+ in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
67
+ out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
68
+ center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
69
+ flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
70
+ Whether to flip the sin to cos in the time embedding.
71
+ freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
72
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
73
+ The tuple of downsample blocks to use.
74
+ mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
75
+ The mid block type. Choose from `UNetMidBlock2DCrossAttn` or `UNetMidBlock2DSimpleCrossAttn`, will skip the
76
+ mid block layer if `None`.
77
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D",)`):
78
+ The tuple of upsample blocks to use.
79
+ only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`):
80
+ Whether to include self-attention in the basic transformer blocks, see
81
+ [`~models.attention.BasicTransformerBlock`].
82
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
83
+ The tuple of output channels for each block.
84
+ layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
85
+ downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
86
+ mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
87
+ act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
88
+ norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
89
+ If `None`, it will skip the normalization and activation layers in post-processing
90
+ norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
91
+ cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
92
+ The dimension of the cross attention features.
93
+ encoder_hid_dim (`int`, *optional*, defaults to None):
94
+ If given, `encoder_hidden_states` will be projected from this dimension to `cross_attention_dim`.
95
+ attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
96
+ resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
97
+ for resnet blocks, see [`~models.resnet.ResnetBlock2D`]. Choose from `default` or `scale_shift`.
98
+ class_embed_type (`str`, *optional*, defaults to None):
99
+ The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
100
+ `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
101
+ addition_embed_type (`str`, *optional*, defaults to None):
102
+ Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
103
+ "text". "text" will use the `TextTimeEmbedding` layer.
104
+ num_class_embeds (`int`, *optional*, defaults to None):
105
+ Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
106
+ class conditioning with `class_embed_type` equal to `None`.
107
+ time_embedding_type (`str`, *optional*, default to `positional`):
108
+ The type of position embedding to use for timesteps. Choose from `positional` or `fourier`.
109
+ time_embedding_dim (`int`, *optional*, default to `None`):
110
+ An optional override for the dimension of the projected time embedding.
111
+ time_embedding_act_fn (`str`, *optional*, default to `None`):
112
+ Optional activation function to use on the time embeddings only one time before they as passed to the rest
113
+ of the unet. Choose from `silu`, `mish`, `gelu`, and `swish`.
114
+ timestep_post_act (`str, *optional*, default to `None`):
115
+ The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
116
+ time_cond_proj_dim (`int`, *optional*, default to `None`):
117
+ The dimension of `cond_proj` layer in timestep embedding.
118
+ conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer.
119
+ conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer.
120
+ projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when
121
+ using the "projection" `class_embed_type`. Required when using the "projection" `class_embed_type`.
122
+ class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
123
+ embeddings with the class embeddings.
124
+ mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`):
125
+ Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If
126
+ `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is None, the
127
+ `only_cross_attention` value will be used as the value for `mid_block_only_cross_attention`. Else, it will
128
+ default to `False`.
129
+ """
130
+
131
+ _supports_gradient_checkpointing = True
132
+
133
+ @register_to_config
134
+ def __init__(
135
+ self,
136
+ sample_size: Optional[int] = None,
137
+ in_channels: int = 4,
138
+ out_channels: int = 4,
139
+ center_input_sample: bool = False,
140
+ flip_sin_to_cos: bool = True,
141
+ freq_shift: int = 0,
142
+ down_block_types: Tuple[str] = (
143
+ "FFSpatioAudioTempCrossAttnDownBlock3D",
144
+ "FFSpatioAudioTempCrossAttnDownBlock3D",
145
+ "FFSpatioAudioTempCrossAttnDownBlock3D",
146
+ "FFSpatioTempResDownBlock3D",
147
+ ),
148
+ mid_block_type: Optional[str] = "FFSpatioAudioTempCrossAttnUNetMidBlock3D",
149
+ up_block_types: Tuple[str] = (
150
+ "FFSpatioTempResUpBlock3D",
151
+ "FFSpatioAudioTempCrossAttnUpBlock3D",
152
+ "FFSpatioAudioTempCrossAttnUpBlock3D",
153
+ "FFSpatioAudioTempCrossAttnUpBlock3D"
154
+ ),
155
+ only_cross_attention: Union[bool, Tuple[bool]] = False,
156
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
157
+ layers_per_block: Union[int, Tuple[int]] = 2,
158
+ downsample_padding: int = 1,
159
+ mid_block_scale_factor: float = 1,
160
+ act_fn: str = "silu",
161
+ norm_num_groups: Optional[int] = 32,
162
+ norm_eps: float = 1e-5,
163
+ cross_attention_dim: Union[int, Tuple[int]] = 1280,
164
+ encoder_hid_dim: Optional[int] = None,
165
+ attention_head_dim: Union[int, Tuple[int]] = 8,
166
+ dual_cross_attention: bool = False,
167
+ use_linear_projection: bool = False,
168
+ class_embed_type: Optional[str] = None,
169
+ addition_embed_type: Optional[str] = None,
170
+ num_class_embeds: Optional[int] = None,
171
+ upcast_attention: bool = False,
172
+ resnet_time_scale_shift: str = "default",
173
+ resnet_skip_time_act: bool = False,
174
+ resnet_out_scale_factor: int = 1.0,
175
+ time_embedding_type: str = "positional",
176
+ time_embedding_dim: Optional[int] = None,
177
+ time_embedding_act_fn: Optional[str] = None,
178
+ timestep_post_act: Optional[str] = None,
179
+ time_cond_proj_dim: Optional[int] = None,
180
+ conv_in_kernel: int = 3,
181
+ conv_out_kernel: int = 3,
182
+ projection_class_embeddings_input_dim: Optional[int] = None,
183
+ class_embeddings_concat: bool = False,
184
+ mid_block_only_cross_attention: Optional[bool] = None,
185
+ cross_attention_norm: Optional[str] = None,
186
+ addition_embed_type_num_heads=64,
187
+ audio_cross_attention_dim: int = 768,
188
+ ):
189
+ super().__init__()
190
+
191
+ self.sample_size = sample_size
192
+
193
+ # Check inputs
194
+ if len(down_block_types) != len(up_block_types):
195
+ raise ValueError(
196
+ f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
197
+ )
198
+
199
+ if len(block_out_channels) != len(down_block_types):
200
+ raise ValueError(
201
+ f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
202
+ )
203
+
204
+ if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
205
+ raise ValueError(
206
+ f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
207
+ )
208
+
209
+ if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
210
+ raise ValueError(
211
+ f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
212
+ )
213
+
214
+ if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
215
+ raise ValueError(
216
+ f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
217
+ )
218
+
219
+ if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
220
+ raise ValueError(
221
+ f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
222
+ )
223
+
224
+ # input
225
+ conv_in_padding = (conv_in_kernel - 1) // 2
226
+ self.conv_in = FFInflatedConv3d(
227
+ in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
228
+ )
229
+
230
+ # time
231
+ if time_embedding_type == "fourier":
232
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
233
+ if time_embed_dim % 2 != 0:
234
+ raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
235
+ self.time_proj = GaussianFourierProjection(
236
+ time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
237
+ )
238
+ timestep_input_dim = time_embed_dim
239
+ elif time_embedding_type == "positional":
240
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
241
+
242
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
243
+ timestep_input_dim = block_out_channels[0]
244
+ else:
245
+ raise ValueError(
246
+ f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
247
+ )
248
+
249
+ self.time_embedding = TimestepEmbedding(
250
+ timestep_input_dim,
251
+ time_embed_dim,
252
+ act_fn=act_fn,
253
+ post_act_fn=timestep_post_act,
254
+ cond_proj_dim=time_cond_proj_dim,
255
+ )
256
+
257
+ if encoder_hid_dim is not None:
258
+ self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
259
+ else:
260
+ self.encoder_hid_proj = None
261
+
262
+ # class embedding
263
+ if class_embed_type is None and num_class_embeds is not None:
264
+ self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
265
+ elif class_embed_type == "timestep":
266
+ self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn)
267
+ elif class_embed_type == "identity":
268
+ self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
269
+ elif class_embed_type == "projection":
270
+ if projection_class_embeddings_input_dim is None:
271
+ raise ValueError(
272
+ "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
273
+ )
274
+ # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
275
+ # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
276
+ # 2. it projects from an arbitrary input dimension.
277
+ #
278
+ # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
279
+ # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
280
+ # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
281
+ self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
282
+ elif class_embed_type == "simple_projection":
283
+ if projection_class_embeddings_input_dim is None:
284
+ raise ValueError(
285
+ "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
286
+ )
287
+ self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim)
288
+ else:
289
+ self.class_embedding = None
290
+
291
+ if addition_embed_type == "text":
292
+ if encoder_hid_dim is not None:
293
+ text_time_embedding_from_dim = encoder_hid_dim
294
+ else:
295
+ text_time_embedding_from_dim = cross_attention_dim
296
+
297
+ self.add_embedding = TextTimeEmbedding(
298
+ text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
299
+ )
300
+ elif addition_embed_type is not None:
301
+ raise ValueError(f"addition_embed_type: {addition_embed_type} must be None or 'text'.")
302
+
303
+ if time_embedding_act_fn is None:
304
+ self.time_embed_act = None
305
+ elif time_embedding_act_fn == "swish":
306
+ self.time_embed_act = lambda x: F.silu(x)
307
+ elif time_embedding_act_fn == "mish":
308
+ self.time_embed_act = nn.Mish()
309
+ elif time_embedding_act_fn == "silu":
310
+ self.time_embed_act = nn.SiLU()
311
+ elif time_embedding_act_fn == "gelu":
312
+ self.time_embed_act = nn.GELU()
313
+ else:
314
+ raise ValueError(f"Unsupported activation function: {time_embedding_act_fn}")
315
+
316
+ self.down_blocks = nn.ModuleList([])
317
+ self.up_blocks = nn.ModuleList([])
318
+
319
+ if isinstance(only_cross_attention, bool):
320
+ if mid_block_only_cross_attention is None:
321
+ mid_block_only_cross_attention = only_cross_attention
322
+
323
+ only_cross_attention = [only_cross_attention] * len(down_block_types)
324
+
325
+ if mid_block_only_cross_attention is None:
326
+ mid_block_only_cross_attention = False
327
+
328
+ if isinstance(attention_head_dim, int):
329
+ attention_head_dim = (attention_head_dim,) * len(down_block_types)
330
+
331
+ if isinstance(cross_attention_dim, int):
332
+ cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
333
+
334
+ if isinstance(layers_per_block, int):
335
+ layers_per_block = [layers_per_block] * len(down_block_types)
336
+
337
+ if class_embeddings_concat:
338
+ # The time embeddings are concatenated with the class embeddings. The dimension of the
339
+ # time embeddings passed to the down, middle, and up blocks is twice the dimension of the
340
+ # regular time embeddings
341
+ blocks_time_embed_dim = time_embed_dim * 2
342
+ else:
343
+ blocks_time_embed_dim = time_embed_dim
344
+
345
+ # down
346
+ output_channel = block_out_channels[0]
347
+ for i, down_block_type in enumerate(down_block_types):
348
+ input_channel = output_channel
349
+ output_channel = block_out_channels[i]
350
+ is_final_block = i == len(block_out_channels) - 1
351
+
352
+ down_block = get_down_block(
353
+ down_block_type,
354
+ num_layers=layers_per_block[i],
355
+ in_channels=input_channel,
356
+ out_channels=output_channel,
357
+ temb_channels=blocks_time_embed_dim,
358
+ add_downsample=not is_final_block,
359
+ resnet_eps=norm_eps,
360
+ resnet_act_fn=act_fn,
361
+ resnet_groups=norm_num_groups,
362
+ cross_attention_dim=cross_attention_dim[i],
363
+ attn_num_head_channels=attention_head_dim[i],
364
+ downsample_padding=downsample_padding,
365
+ dual_cross_attention=dual_cross_attention,
366
+ use_linear_projection=use_linear_projection,
367
+ only_cross_attention=only_cross_attention[i],
368
+ upcast_attention=upcast_attention,
369
+ resnet_time_scale_shift=resnet_time_scale_shift,
370
+ audio_cross_attention_dim=audio_cross_attention_dim
371
+ )
372
+ self.down_blocks.append(down_block)
373
+
374
+ # mid
375
+ if mid_block_type is None:
376
+ self.mid_block = None
377
+ else:
378
+ self.mid_block = get_mid_block(
379
+ mid_block_type=mid_block_type,
380
+ in_channels=block_out_channels[-1],
381
+ temb_channels=blocks_time_embed_dim,
382
+ resnet_eps=norm_eps,
383
+ resnet_act_fn=act_fn,
384
+ output_scale_factor=mid_block_scale_factor,
385
+ resnet_time_scale_shift=resnet_time_scale_shift,
386
+ cross_attention_dim=cross_attention_dim[-1],
387
+ attn_num_head_channels=attention_head_dim[-1],
388
+ resnet_groups=norm_num_groups,
389
+ dual_cross_attention=dual_cross_attention,
390
+ use_linear_projection=use_linear_projection,
391
+ upcast_attention=upcast_attention,
392
+ audio_cross_attention_dim=audio_cross_attention_dim
393
+ )
394
+
395
+ # count how many layers upsample the images
396
+ self.num_upsamplers = 0
397
+
398
+ # up
399
+ reversed_block_out_channels = list(reversed(block_out_channels))
400
+ reversed_attention_head_dim = list(reversed(attention_head_dim))
401
+ reversed_layers_per_block = list(reversed(layers_per_block))
402
+ reversed_cross_attention_dim = list(reversed(cross_attention_dim))
403
+ only_cross_attention = list(reversed(only_cross_attention))
404
+
405
+ output_channel = reversed_block_out_channels[0]
406
+ for i, up_block_type in enumerate(up_block_types):
407
+ is_final_block = i == len(block_out_channels) - 1
408
+
409
+ prev_output_channel = output_channel
410
+ output_channel = reversed_block_out_channels[i]
411
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
412
+
413
+ # add upsample block for all BUT final layer
414
+ if not is_final_block:
415
+ add_upsample = True
416
+ self.num_upsamplers += 1
417
+ else:
418
+ add_upsample = False
419
+
420
+ up_block = get_up_block(
421
+ up_block_type,
422
+ num_layers=reversed_layers_per_block[i] + 1,
423
+ in_channels=input_channel,
424
+ out_channels=output_channel,
425
+ prev_output_channel=prev_output_channel,
426
+ temb_channels=blocks_time_embed_dim,
427
+ add_upsample=add_upsample,
428
+ resnet_eps=norm_eps,
429
+ resnet_act_fn=act_fn,
430
+ resnet_groups=norm_num_groups,
431
+ cross_attention_dim=reversed_cross_attention_dim[i],
432
+ attn_num_head_channels=reversed_attention_head_dim[i],
433
+ dual_cross_attention=dual_cross_attention,
434
+ use_linear_projection=use_linear_projection,
435
+ only_cross_attention=only_cross_attention[i],
436
+ upcast_attention=upcast_attention,
437
+ resnet_time_scale_shift=resnet_time_scale_shift,
438
+ audio_cross_attention_dim=audio_cross_attention_dim
439
+ )
440
+ self.up_blocks.append(up_block)
441
+
442
+ # out
443
+ if norm_num_groups is not None:
444
+ self.conv_norm_out = nn.GroupNorm(
445
+ num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
446
+ )
447
+
448
+ if act_fn == "swish":
449
+ self.conv_act = lambda x: F.silu(x)
450
+ elif act_fn == "mish":
451
+ self.conv_act = nn.Mish()
452
+ elif act_fn == "silu":
453
+ self.conv_act = nn.SiLU()
454
+ elif act_fn == "gelu":
455
+ self.conv_act = nn.GELU()
456
+ else:
457
+ raise ValueError(f"Unsupported activation function: {act_fn}")
458
+
459
+ else:
460
+ self.conv_norm_out = None
461
+ self.conv_act = None
462
+
463
+ conv_out_padding = (conv_out_kernel - 1) // 2
464
+ self.conv_out = FFInflatedConv3d(
465
+ block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
466
+ )
467
+
468
+ @property
469
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
470
+ r"""
471
+ Returns:
472
+ `dict` of attention processors: A dictionary containing all attention processors used in the model with
473
+ indexed by its weight name.
474
+ """
475
+ # set recursively
476
+ processors = {}
477
+
478
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
479
+ if hasattr(module, "set_processor"):
480
+ processors[f"{name}.processor"] = module.processor
481
+
482
+ for sub_name, child in module.named_children():
483
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
484
+
485
+ return processors
486
+
487
+ for name, module in self.named_children():
488
+ fn_recursive_add_processors(name, module, processors)
489
+
490
+ return processors
491
+
492
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
493
+ r"""
494
+ Parameters:
495
+ `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`):
496
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
497
+ of **all** `Attention` layers.
498
+ In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainable attention processors.:
499
+
500
+ """
501
+ count = len(self.attn_processors.keys())
502
+
503
+ if isinstance(processor, dict) and len(processor) != count:
504
+ raise ValueError(
505
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
506
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
507
+ )
508
+
509
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
510
+ if hasattr(module, "set_processor"):
511
+ if not isinstance(processor, dict):
512
+ module.set_processor(processor)
513
+ else:
514
+ module.set_processor(processor.pop(f"{name}.processor"))
515
+
516
+ for sub_name, child in module.named_children():
517
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
518
+
519
+ for name, module in self.named_children():
520
+ fn_recursive_attn_processor(name, module, processor)
521
+
522
+ def set_default_attn_processor(self):
523
+ """
524
+ Disables custom attention processors and sets the default attention implementation.
525
+ """
526
+ self.set_attn_processor(AttnProcessor())
527
+
528
+ def set_attention_slice(self, slice_size):
529
+ r"""
530
+ Enable sliced attention computation.
531
+
532
+ When this option is enabled, the attention module will split the input tensor in slices, to compute attention
533
+ in several steps. This is useful to save some memory in exchange for a small speed decrease.
534
+
535
+ Args:
536
+ slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
537
+ When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
538
+ `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is
539
+ provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
540
+ must be a multiple of `slice_size`.
541
+ """
542
+ sliceable_head_dims = []
543
+
544
+ def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
545
+ if hasattr(module, "set_attention_slice"):
546
+ sliceable_head_dims.append(module.sliceable_head_dim)
547
+
548
+ for child in module.children():
549
+ fn_recursive_retrieve_sliceable_dims(child)
550
+
551
+ # retrieve number of attention layers
552
+ for module in self.children():
553
+ fn_recursive_retrieve_sliceable_dims(module)
554
+
555
+ num_sliceable_layers = len(sliceable_head_dims)
556
+
557
+ if slice_size == "auto":
558
+ # half the attention head size is usually a good trade-off between
559
+ # speed and memory
560
+ slice_size = [dim // 2 for dim in sliceable_head_dims]
561
+ elif slice_size == "max":
562
+ # make smallest slice possible
563
+ slice_size = num_sliceable_layers * [1]
564
+
565
+ slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
566
+
567
+ if len(slice_size) != len(sliceable_head_dims):
568
+ raise ValueError(
569
+ f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
570
+ f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
571
+ )
572
+
573
+ for i in range(len(slice_size)):
574
+ size = slice_size[i]
575
+ dim = sliceable_head_dims[i]
576
+ if size is not None and size > dim:
577
+ raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
578
+
579
+ # Recursively walk through all the children.
580
+ # Any children which exposes the set_attention_slice method
581
+ # gets the message
582
+ def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
583
+ if hasattr(module, "set_attention_slice"):
584
+ module.set_attention_slice(slice_size.pop())
585
+
586
+ for child in module.children():
587
+ fn_recursive_set_attention_slice(child, slice_size)
588
+
589
+ reversed_slice_size = list(reversed(slice_size))
590
+ for module in self.children():
591
+ fn_recursive_set_attention_slice(module, reversed_slice_size)
592
+
593
+ def _set_gradient_checkpointing(self, module, value=False):
594
+ if isinstance(module, tuple(all_modules)):
595
+ module.gradient_checkpointing = value
596
+
597
+ def forward(
598
+ self,
599
+ sample: torch.FloatTensor,
600
+ timestep: Union[torch.Tensor, float, int],
601
+ encoder_hidden_states: torch.Tensor,
602
+ audio_encoder_hidden_states: Optional[torch.Tensor] = None,
603
+ class_labels: Optional[torch.Tensor] = None,
604
+ timestep_cond: Optional[torch.Tensor] = None,
605
+ attention_mask: Optional[torch.Tensor] = None,
606
+ audio_attention_mask: Optional[torch.Tensor] = None,
607
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
608
+ down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
609
+ mid_block_additional_residual: Optional[torch.Tensor] = None,
610
+ return_dict: bool = True,
611
+ ) -> Union[UNet3DConditionOutput, Tuple]:
612
+ r"""
613
+ Args:
614
+ sample (`torch.FloatTensor`): (batch, channel, frame, height, width) noisy inputs tensor
615
+ timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
616
+ encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states
617
+ return_dict (`bool`, *optional*, defaults to `True`):
618
+ Whether or not to return a [`UNet3DConditionOutput`] instead of a plain tuple.
619
+ cross_attention_kwargs (`dict`, *optional*):
620
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
621
+ `self.processor` in
622
+ [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
623
+
624
+ Returns:
625
+ [`UNet3DConditionOutput`] or `tuple`:
626
+ [`UNet3DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
627
+ returning a tuple, the first element is the sample tensor.
628
+ """
629
+ assert sample.ndim == 5, sample.size()
630
+ video_length = sample.shape[2]
631
+
632
+ # By default samples have to be at least a multiple of the overall upsampling factor.
633
+ # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
634
+ # However, the upsampling interpolation output size can be forced to fit any upsampling size
635
+ # on the fly if necessary.
636
+ default_overall_up_factor = 2 ** self.num_upsamplers
637
+
638
+ # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
639
+ forward_upsample_size = False
640
+ upsample_size = None
641
+
642
+ if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
643
+ logger.info("Forward upsample size to force interpolation output size.")
644
+ forward_upsample_size = True
645
+
646
+ # prepare attention_mask
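+ # (a binary mask with 1 = keep / 0 = drop is converted into an additive bias: 0 for kept
+ # positions, -10000 for masked ones, with an extra broadcastable head dimension)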
647
+ if attention_mask is not None:
648
+ attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
649
+ attention_mask = attention_mask.unsqueeze(1)
650
+
651
+ # 0. center input if necessary
652
+ if self.config.center_input_sample:
653
+ sample = 2 * sample - 1.0
654
+
655
+ # 1. time
656
+ timesteps = timestep
657
+ if not torch.is_tensor(timesteps):
658
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
659
+ # This would be a good case for the `match` statement (Python 3.10+)
660
+ is_mps = sample.device.type == "mps"
661
+ if isinstance(timestep, float):
662
+ dtype = torch.float32 if is_mps else torch.float64
663
+ else:
664
+ dtype = torch.int32 if is_mps else torch.int64
665
+ timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
666
+ elif len(timesteps.shape) == 0:
667
+ timesteps = timesteps[None].to(sample.device)
668
+
669
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
670
+ timesteps = timesteps.expand(sample.shape[0])
671
+
672
+ t_emb = self.time_proj(timesteps)
673
+
674
+ # `Timesteps` does not contain any weights and will always return f32 tensors
675
+ # but time_embedding might actually be running in fp16. so we need to cast here.
676
+ # there might be better ways to encapsulate this.
677
+ t_emb = t_emb.to(dtype=self.dtype)
678
+
679
+ emb = self.time_embedding(t_emb, timestep_cond)
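+ # broadcast the time embedding across frames so that every frame receives its own copy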
680
+ emb = repeat(emb, "b c -> b f c", f=video_length)
681
+
682
+ if self.class_embedding is not None:
683
+ if class_labels is None:
684
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
685
+
686
+ if self.config.class_embed_type == "timestep":
687
+ class_labels = self.time_proj(class_labels)
688
+
689
+ # `Timesteps` does not contain any weights and will always return f32 tensors
690
+ # there might be better ways to encapsulate this.
691
+ class_labels = class_labels.to(dtype=sample.dtype)
692
+
693
+ class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
694
+
695
+ if self.config.class_embeddings_concat:
696
+ emb = torch.cat([emb, class_emb], dim=-1)
697
+ else:
698
+ emb = emb + class_emb
699
+
700
+ if self.config.addition_embed_type == "text":
701
+ aug_emb = self.add_embedding(encoder_hidden_states)
702
+ emb = emb + aug_emb
703
+
704
+ if self.time_embed_act is not None:
705
+ emb = self.time_embed_act(emb)
706
+
707
+ if self.encoder_hid_proj is not None:
708
+ encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
709
+
710
+ # 2. pre-process
711
+ sample = self.conv_in(sample)
712
+
713
+ # 3. down
714
+ down_block_res_samples = (sample,)
715
+ for downsample_block in self.down_blocks:
716
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
717
+ sample, res_samples = downsample_block(
718
+ hidden_states=sample,
719
+ temb=emb,
720
+ encoder_hidden_states=encoder_hidden_states,
721
+ audio_encoder_hidden_states=audio_encoder_hidden_states,
722
+ attention_mask=attention_mask,
723
+ audio_attention_mask=audio_attention_mask,
724
+ cross_attention_kwargs=cross_attention_kwargs,
725
+ )
726
+ else:
727
+ sample, res_samples = downsample_block(
728
+ hidden_states=sample, temb=emb
729
+ )
730
+
731
+ down_block_res_samples += res_samples
732
+
733
+ if down_block_additional_residuals is not None:
734
+ new_down_block_res_samples = ()
735
+
736
+ for down_block_res_sample, down_block_additional_residual in zip(
737
+ down_block_res_samples, down_block_additional_residuals
738
+ ):
739
+ down_block_res_sample = down_block_res_sample + down_block_additional_residual
740
+ new_down_block_res_samples += (down_block_res_sample,)
741
+
742
+ down_block_res_samples = new_down_block_res_samples
743
+
744
+ # 4. mid
745
+ if self.mid_block is not None:
746
+ sample = self.mid_block(
747
+ sample,
748
+ emb,
749
+ encoder_hidden_states=encoder_hidden_states,
750
+ audio_encoder_hidden_states=audio_encoder_hidden_states,
751
+ attention_mask=attention_mask,
752
+ audio_attention_mask=audio_attention_mask,
753
+ cross_attention_kwargs=cross_attention_kwargs,
754
+ )
755
+
756
+ if mid_block_additional_residual is not None:
757
+ sample = sample + mid_block_additional_residual
758
+
759
+ # 5. up
760
+ for i, upsample_block in enumerate(self.up_blocks):
761
+ is_final_block = i == len(self.up_blocks) - 1
762
+
763
+ res_samples = down_block_res_samples[-len(upsample_block.resnets):]
764
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
765
+
766
+ # if we have not reached the final block and need to forward the
767
+ # upsample size, we do it here
768
+ if not is_final_block and forward_upsample_size:
769
+ upsample_size = down_block_res_samples[-1].shape[2:]
770
+
771
+ if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
772
+ sample = upsample_block(
773
+ hidden_states=sample,
774
+ temb=emb,
775
+ res_hidden_states_tuple=res_samples,
776
+ encoder_hidden_states=encoder_hidden_states,
777
+ audio_encoder_hidden_states=audio_encoder_hidden_states,
778
+ cross_attention_kwargs=cross_attention_kwargs,
779
+ upsample_size=upsample_size,
780
+ attention_mask=attention_mask,
781
+ audio_attention_mask=audio_attention_mask,
782
+ )
783
+ else:
784
+ sample = upsample_block(
785
+ hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
786
+ )
787
+
788
+ # 6. post-process
789
+ if self.conv_norm_out:
790
+ sample = self.conv_norm_out(sample)
791
+ sample = self.conv_act(sample)
792
+ sample = self.conv_out(sample)
793
+
794
+ if not return_dict:
795
+ return (sample,)
796
+
797
+ return UNet3DConditionOutput(sample=sample)
798
+
799
+ @classmethod
800
+ def from_pretrained_2d(cls, config3d, pretrained_model_path, subfolder=None):
801
+ # 1. Build 3D config from pretrained 2D config
802
+ if subfolder is not None:
803
+ pretrained_model_path = os.path.join(pretrained_model_path, subfolder)
804
+ config2d_file = os.path.join(pretrained_model_path, 'config.json')
805
+ assert os.path.isfile(config2d_file), f"{config2d_file} does not exist"
806
+
807
+ with open(config2d_file, "r") as f:
808
+ config2d = json.load(f)
809
+ config2d["_class_name"] = cls.__name__
810
+ config2d["down_block_types"] = tuple(config3d["down_block_types"])
811
+ config2d["up_block_types"] = tuple(config3d["up_block_types"])
812
+ config2d["mid_block_type"] = config3d["mid_block_type"]
813
+ if "cross_attention_dim" in config3d: config2d["cross_attention_dim"] = config3d["cross_attention_dim"]
814
+ if "audio_cross_attention_dim" in config3d: config2d["audio_cross_attention_dim"] = config3d[
815
+ "audio_cross_attention_dim"]
816
+
817
+ # 2. Build 3D model from updated 3D config
818
+ model = cls.from_config(config2d)
819
+
820
+ # 3. Load in weights from pretrained 2D nets
821
+ from diffusers.utils import WEIGHTS_NAME
822
+ model2d_file = os.path.join(pretrained_model_path, WEIGHTS_NAME)
823
+ assert os.path.isfile(model2d_file), f"{model2d_file} does not exist"
824
+ pretrained_2d_state_dict = torch.load(model2d_file, map_location="cpu")
825
+
826
+ # Add new 3D weights into pretrained 2d state_dict, to be compatible with 3D model
827
+ for k, v in model.state_dict().items():
828
+ # all '_temp' temporal weights are initialized by pretrained 2D models
829
+ if '_temp' in k:
830
+ pretrained_2d_state_dict.update({k: v})
831
+ # add new weights into pretrained 2D state_dict
832
+ elif k not in pretrained_2d_state_dict:
833
+ pretrained_2d_state_dict.update({k: v})
834
+ # if a weight has a different shape than in the 2D checkpoint, use the 3D model's weight instead
835
+ elif pretrained_2d_state_dict[k].shape != v.shape:
836
+ pretrained_2d_state_dict.update({k: v})
837
+ model.load_state_dict(pretrained_2d_state_dict)
838
+
839
+ return model
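+ 
+ # Illustrative usage sketch (class name and checkpoint path are assumptions, not part of this file):
+ #   config3d = {"down_block_types": [...], "up_block_types": [...],
+ #               "mid_block_type": "FFSpatioAudioTempCrossAttnUNetMidBlock3D",
+ #               "audio_cross_attention_dim": 768}
+ #   unet = AudioUNet3DConditionModel.from_pretrained_2d(config3d, "path/to/stable-diffusion-v1-5", subfolder="unet")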
unet_blocks.py ADDED
@@ -0,0 +1,1084 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+ from ff_spatio_temp_resnet_3d import (
5
+ FFSpatioTempResnetBlock3D, FFSpatioTempResDownsample3D, FFSpatioTempResUpsample3D
6
+ )
7
+ from ff_spatio_temp_transformer_3d import FFSpatioTempTransformer3DModel
8
+ from ff_spatio_audio_temp_transformer_3d import FFSpatioAudioTempTransformer3DModel
9
+
10
+
11
+ def create_custom_forward(module, return_dict=None):
12
+ def custom_forward(*inputs):
13
+ if return_dict is not None:
14
+ return module(*inputs, return_dict=return_dict)
15
+ else:
16
+ return module(*inputs)
17
+
18
+ return custom_forward
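+ 
+ # note: torch.utils.checkpoint.checkpoint re-runs a plain callable on tensor inputs, so this wrapper
+ # adapts a module's forward (optionally forcing return_dict) for gradient checkpointing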
19
+
20
+
21
+ def get_down_block(
22
+ down_block_type,
23
+ num_layers,
24
+ in_channels,
25
+ out_channels,
26
+ temb_channels,
27
+ add_downsample,
28
+ resnet_eps,
29
+ resnet_act_fn,
30
+ attn_num_head_channels,
31
+ resnet_groups=None,
32
+ cross_attention_dim=None,
33
+ downsample_padding=None,
34
+ dual_cross_attention=False,
35
+ use_linear_projection=False,
36
+ only_cross_attention=False,
37
+ upcast_attention=False,
38
+ resnet_time_scale_shift="default",
39
+ audio_cross_attention_dim=None
40
+ ):
41
+ down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
42
+ if down_block_type == "FFSpatioTempResDownBlock3D":
43
+ return FFSpatioTempResDownBlock3D(
44
+ num_layers=num_layers,
45
+ in_channels=in_channels,
46
+ out_channels=out_channels,
47
+ temb_channels=temb_channels,
48
+ add_downsample=add_downsample,
49
+ resnet_eps=resnet_eps,
50
+ resnet_act_fn=resnet_act_fn,
51
+ resnet_groups=resnet_groups,
52
+ downsample_padding=downsample_padding,
53
+ resnet_time_scale_shift=resnet_time_scale_shift
54
+ )
55
+ elif down_block_type == "FFSpatioTempCrossAttnDownBlock3D":
56
+ if cross_attention_dim is None:
57
+ raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock3D")
58
+ return FFSpatioTempCrossAttnDownBlock3D(
59
+ num_layers=num_layers,
60
+ in_channels=in_channels,
61
+ out_channels=out_channels,
62
+ temb_channels=temb_channels,
63
+ add_downsample=add_downsample,
64
+ resnet_eps=resnet_eps,
65
+ resnet_act_fn=resnet_act_fn,
66
+ resnet_groups=resnet_groups,
67
+ downsample_padding=downsample_padding,
68
+ cross_attention_dim=cross_attention_dim,
69
+ attn_num_head_channels=attn_num_head_channels,
70
+ dual_cross_attention=dual_cross_attention,
71
+ use_linear_projection=use_linear_projection,
72
+ only_cross_attention=only_cross_attention,
73
+ upcast_attention=upcast_attention,
74
+ resnet_time_scale_shift=resnet_time_scale_shift
75
+ )
76
+ elif down_block_type == "FFSpatioAudioTempCrossAttnDownBlock3D":
77
+ if cross_attention_dim is None:
78
+ raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock3D")
79
+ return FFSpatioAudioTempCrossAttnDownBlock3D(
80
+ num_layers=num_layers,
81
+ in_channels=in_channels,
82
+ out_channels=out_channels,
83
+ temb_channels=temb_channels,
84
+ add_downsample=add_downsample,
85
+ resnet_eps=resnet_eps,
86
+ resnet_act_fn=resnet_act_fn,
87
+ resnet_groups=resnet_groups,
88
+ downsample_padding=downsample_padding,
89
+ cross_attention_dim=cross_attention_dim,
90
+ audio_cross_attention_dim=audio_cross_attention_dim,
91
+ attn_num_head_channels=attn_num_head_channels,
92
+ dual_cross_attention=dual_cross_attention,
93
+ use_linear_projection=use_linear_projection,
94
+ only_cross_attention=only_cross_attention,
95
+ upcast_attention=upcast_attention,
96
+ resnet_time_scale_shift=resnet_time_scale_shift
97
+ )
98
+ raise ValueError(f"{down_block_type} does not exist.")
99
+
100
+
101
+ def get_up_block(
102
+ up_block_type,
103
+ num_layers,
104
+ in_channels,
105
+ out_channels,
106
+ prev_output_channel,
107
+ temb_channels,
108
+ add_upsample,
109
+ resnet_eps,
110
+ resnet_act_fn,
111
+ attn_num_head_channels,
112
+ resnet_groups=None,
113
+ cross_attention_dim=None,
114
+ dual_cross_attention=False,
115
+ use_linear_projection=False,
116
+ only_cross_attention=False,
117
+ upcast_attention=False,
118
+ resnet_time_scale_shift="default",
119
+ audio_cross_attention_dim=None
120
+ ):
121
+ up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
122
+ if up_block_type == "FFSpatioTempResUpBlock3D":
123
+ return FFSpatioTempResUpBlock3D(
124
+ num_layers=num_layers,
125
+ in_channels=in_channels,
126
+ out_channels=out_channels,
127
+ prev_output_channel=prev_output_channel,
128
+ temb_channels=temb_channels,
129
+ add_upsample=add_upsample,
130
+ resnet_eps=resnet_eps,
131
+ resnet_act_fn=resnet_act_fn,
132
+ resnet_groups=resnet_groups,
133
+ resnet_time_scale_shift=resnet_time_scale_shift
134
+ )
135
+ elif up_block_type == "FFSpatioTempCrossAttnUpBlock3D":
136
+ if cross_attention_dim is None:
137
+ raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock3D")
138
+ return FFSpatioTempCrossAttnUpBlock3D(
139
+ num_layers=num_layers,
140
+ in_channels=in_channels,
141
+ out_channels=out_channels,
142
+ prev_output_channel=prev_output_channel,
143
+ temb_channels=temb_channels,
144
+ add_upsample=add_upsample,
145
+ resnet_eps=resnet_eps,
146
+ resnet_act_fn=resnet_act_fn,
147
+ resnet_groups=resnet_groups,
148
+ cross_attention_dim=cross_attention_dim,
149
+ attn_num_head_channels=attn_num_head_channels,
150
+ dual_cross_attention=dual_cross_attention,
151
+ use_linear_projection=use_linear_projection,
152
+ only_cross_attention=only_cross_attention,
153
+ upcast_attention=upcast_attention,
154
+ resnet_time_scale_shift=resnet_time_scale_shift
155
+ )
156
+ elif up_block_type == "FFSpatioAudioTempCrossAttnUpBlock3D":
157
+ if cross_attention_dim is None:
158
+ raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock3D")
159
+ return FFSpatioAudioTempCrossAttnUpBlock3D(
160
+ num_layers=num_layers,
161
+ in_channels=in_channels,
162
+ out_channels=out_channels,
163
+ prev_output_channel=prev_output_channel,
164
+ temb_channels=temb_channels,
165
+ add_upsample=add_upsample,
166
+ resnet_eps=resnet_eps,
167
+ resnet_act_fn=resnet_act_fn,
168
+ resnet_groups=resnet_groups,
169
+ cross_attention_dim=cross_attention_dim,
170
+ audio_cross_attention_dim=audio_cross_attention_dim,
171
+ attn_num_head_channels=attn_num_head_channels,
172
+ dual_cross_attention=dual_cross_attention,
173
+ use_linear_projection=use_linear_projection,
174
+ only_cross_attention=only_cross_attention,
175
+ upcast_attention=upcast_attention,
176
+ resnet_time_scale_shift=resnet_time_scale_shift
177
+ )
178
+ raise ValueError(f"{up_block_type} does not exist.")
179
+
180
+
181
+ def get_mid_block(
182
+ mid_block_type,
183
+ in_channels,
184
+ temb_channels,
185
+ resnet_eps,
186
+ resnet_act_fn,
187
+ output_scale_factor,
188
+ resnet_time_scale_shift,
189
+ cross_attention_dim,
190
+ attn_num_head_channels,
191
+ resnet_groups,
192
+ dual_cross_attention,
193
+ use_linear_projection,
194
+ upcast_attention,
195
+ audio_cross_attention_dim=None
196
+ ):
197
+ if mid_block_type == "FFSpatioTempCrossAttnUNetMidBlock3D":
198
+ return FFSpatioTempCrossAttnUNetMidBlock3D(
199
+ in_channels=in_channels,
200
+ temb_channels=temb_channels,
201
+ resnet_eps=resnet_eps,
202
+ resnet_act_fn=resnet_act_fn,
203
+ output_scale_factor=output_scale_factor,
204
+ resnet_time_scale_shift=resnet_time_scale_shift,
205
+ cross_attention_dim=cross_attention_dim,
206
+ attn_num_head_channels=attn_num_head_channels,
207
+ resnet_groups=resnet_groups,
208
+ dual_cross_attention=dual_cross_attention,
209
+ use_linear_projection=use_linear_projection,
210
+ upcast_attention=upcast_attention
211
+ )
212
+ elif mid_block_type == "FFSpatioAudioTempCrossAttnUNetMidBlock3D":
213
+ return FFSpatioAudioTempCrossAttnUNetMidBlock3D(
214
+ in_channels=in_channels,
215
+ temb_channels=temb_channels,
216
+ resnet_eps=resnet_eps,
217
+ resnet_act_fn=resnet_act_fn,
218
+ output_scale_factor=output_scale_factor,
219
+ resnet_time_scale_shift=resnet_time_scale_shift,
220
+ cross_attention_dim=cross_attention_dim,
221
+ audio_cross_attention_dim=audio_cross_attention_dim,
222
+ attn_num_head_channels=attn_num_head_channels,
223
+ resnet_groups=resnet_groups,
224
+ dual_cross_attention=dual_cross_attention,
225
+ use_linear_projection=use_linear_projection,
226
+ upcast_attention=upcast_attention
227
+ )
228
+ raise ValueError(f"{mid_block_type} does not exist.")
229
+
230
+
231
+ ##### Image Condition Blocks #####
232
+
233
+ class FFSpatioTempResDownBlock3D(nn.Module):
234
+ def __init__(
235
+ self,
236
+ in_channels: int,
237
+ out_channels: int,
238
+ temb_channels: int,
239
+ dropout: float = 0.0,
240
+ num_layers: int = 1,
241
+ resnet_eps: float = 1e-6,
242
+ resnet_time_scale_shift: str = "default",
243
+ resnet_act_fn: str = "swish",
244
+ resnet_groups: int = 32,
245
+ resnet_pre_norm: bool = True,
246
+ output_scale_factor=1.0,
247
+ add_downsample=True,
248
+ downsample_padding=1
249
+ ):
250
+ super().__init__()
251
+ resnets = []
252
+
253
+ for i in range(num_layers):
254
+ in_channels = in_channels if i == 0 else out_channels
255
+ resnets.append(
256
+ FFSpatioTempResnetBlock3D(
257
+ in_channels=in_channels,
258
+ out_channels=out_channels,
259
+ temb_channels=temb_channels,
260
+ eps=resnet_eps,
261
+ groups=resnet_groups,
262
+ dropout=dropout,
263
+ time_embedding_norm=resnet_time_scale_shift,
264
+ non_linearity=resnet_act_fn,
265
+ output_scale_factor=output_scale_factor,
266
+ pre_norm=resnet_pre_norm
267
+ )
268
+ )
269
+
270
+ self.resnets = nn.ModuleList(resnets)
271
+
272
+ if add_downsample:
273
+ self.downsamplers = nn.ModuleList(
274
+ [
275
+ FFSpatioTempResDownsample3D(
276
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
277
+ )
278
+ ]
279
+ )
280
+ else:
281
+ self.downsamplers = None
282
+
283
+ self.gradient_checkpointing = False
284
+
285
+ def forward(self, hidden_states, temb=None):
286
+ output_states = ()
287
+
288
+ for resnet in self.resnets:
289
+ if self.training and self.gradient_checkpointing:
290
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
291
+ else:
292
+ hidden_states = resnet(hidden_states, temb)
293
+
294
+ output_states += (hidden_states,)
295
+
296
+ if self.downsamplers is not None:
297
+ for downsampler in self.downsamplers:
298
+ hidden_states = downsampler(hidden_states)
299
+
300
+ output_states += (hidden_states,)
301
+
302
+ return hidden_states, output_states
303
+
304
+
305
+ class FFSpatioTempResUpBlock3D(nn.Module):
306
+ def __init__(
307
+ self,
308
+ in_channels: int,
309
+ prev_output_channel: int,
310
+ out_channels: int,
311
+ temb_channels: int,
312
+ dropout: float = 0.0,
313
+ num_layers: int = 1,
314
+ resnet_eps: float = 1e-6,
315
+ resnet_time_scale_shift: str = "default",
316
+ resnet_act_fn: str = "swish",
317
+ resnet_groups: int = 32,
318
+ resnet_pre_norm: bool = True,
319
+ output_scale_factor=1.0,
320
+ add_upsample=True
321
+ ):
322
+ super().__init__()
323
+ resnets = []
324
+
325
+ for i in range(num_layers):
326
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
327
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
328
+
329
+ resnets.append(
330
+ FFSpatioTempResnetBlock3D(
331
+ in_channels=resnet_in_channels + res_skip_channels,
332
+ out_channels=out_channels,
333
+ temb_channels=temb_channels,
334
+ eps=resnet_eps,
335
+ groups=resnet_groups,
336
+ dropout=dropout,
337
+ time_embedding_norm=resnet_time_scale_shift,
338
+ non_linearity=resnet_act_fn,
339
+ output_scale_factor=output_scale_factor,
340
+ pre_norm=resnet_pre_norm
341
+ )
342
+ )
343
+
344
+ self.resnets = nn.ModuleList(resnets)
345
+
346
+ if add_upsample:
347
+ self.upsamplers = nn.ModuleList(
348
+ [FFSpatioTempResUpsample3D(out_channels, use_conv=True, out_channels=out_channels)])
349
+ else:
350
+ self.upsamplers = None
351
+
352
+ self.gradient_checkpointing = False
353
+
354
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
355
+ for resnet in self.resnets:
356
+ # pop res hidden states
357
+ res_hidden_states = res_hidden_states_tuple[-1]
358
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
359
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
360
+
361
+ if self.training and self.gradient_checkpointing:
362
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
363
+ else:
364
+ hidden_states = resnet(hidden_states, temb)
365
+
366
+ if self.upsamplers is not None:
367
+ for upsampler in self.upsamplers:
368
+ hidden_states = upsampler(hidden_states, upsample_size)
369
+
370
+ return hidden_states
371
+
372
+
373
+ class FFSpatioTempCrossAttnUNetMidBlock3D(nn.Module):
374
+ def __init__(
375
+ self,
376
+ in_channels: int,
377
+ temb_channels: int,
378
+ dropout: float = 0.0,
379
+ num_layers: int = 1,
380
+ resnet_eps: float = 1e-6,
381
+ resnet_time_scale_shift: str = "default",
382
+ resnet_act_fn: str = "swish",
383
+ resnet_groups: int = 32,
384
+ resnet_pre_norm: bool = True,
385
+ attn_num_head_channels=1,
386
+ output_scale_factor=1.0,
387
+ cross_attention_dim=1280,
388
+ dual_cross_attention=False,
389
+ use_linear_projection=False,
390
+ upcast_attention=False
391
+ ):
392
+ super().__init__()
393
+
394
+ self.has_cross_attention = True
395
+ self.attn_num_head_channels = attn_num_head_channels
396
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
397
+
398
+ # there is always at least one resnet
399
+ resnets = [
400
+ FFSpatioTempResnetBlock3D(
401
+ in_channels=in_channels,
402
+ out_channels=in_channels,
403
+ temb_channels=temb_channels,
404
+ eps=resnet_eps,
405
+ groups=resnet_groups,
406
+ dropout=dropout,
407
+ time_embedding_norm=resnet_time_scale_shift,
408
+ non_linearity=resnet_act_fn,
409
+ output_scale_factor=output_scale_factor,
410
+ pre_norm=resnet_pre_norm
411
+ )
412
+ ]
413
+ attentions = []
414
+
415
+ for _ in range(num_layers):
416
+ if dual_cross_attention:
417
+ raise NotImplementedError
418
+ attentions.append(
419
+ FFSpatioTempTransformer3DModel(
420
+ attn_num_head_channels,
421
+ in_channels // attn_num_head_channels,
422
+ in_channels=in_channels,
423
+ num_layers=1,
424
+ cross_attention_dim=cross_attention_dim,
425
+ norm_num_groups=resnet_groups,
426
+ use_linear_projection=use_linear_projection,
427
+ upcast_attention=upcast_attention,
428
+ )
429
+ )
430
+ resnets.append(
431
+ FFSpatioTempResnetBlock3D(
432
+ in_channels=in_channels,
433
+ out_channels=in_channels,
434
+ temb_channels=temb_channels,
435
+ eps=resnet_eps,
436
+ groups=resnet_groups,
437
+ dropout=dropout,
438
+ time_embedding_norm=resnet_time_scale_shift,
439
+ non_linearity=resnet_act_fn,
440
+ output_scale_factor=output_scale_factor,
441
+ pre_norm=resnet_pre_norm,
442
+
443
+ )
444
+ )
445
+
446
+ self.attentions = nn.ModuleList(attentions)
447
+ self.resnets = nn.ModuleList(resnets)
448
+
449
+ self.gradient_checkpointing = False
450
+
451
+ def forward(self, hidden_states, temb=None, encoder_hidden_states=None,
452
+ cross_attention_kwargs=None):
453
+ if self.training and self.gradient_checkpointing:
454
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(self.resnets[0]), hidden_states,
455
+ temb)
456
+ else:
457
+ hidden_states = self.resnets[0](hidden_states, temb)
458
+ for attn, resnet in zip(self.attentions, self.resnets[1:]):
459
+ if self.training and self.gradient_checkpointing:
460
+ hidden_states = torch.utils.checkpoint.checkpoint(
461
+ create_custom_forward(attn, return_dict=False),
462
+ hidden_states,
463
+ encoder_hidden_states,
464
+ cross_attention_kwargs
465
+ )[0]
466
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
467
+ else:
468
+ hidden_states = attn(
469
+ hidden_states,
470
+ encoder_hidden_states=encoder_hidden_states,
471
+ cross_attention_kwargs=cross_attention_kwargs
472
+ ).sample
473
+ hidden_states = resnet(hidden_states, temb)
474
+
475
+ return hidden_states
476
+
477
+
478
+ class FFSpatioTempCrossAttnDownBlock3D(nn.Module):
479
+ def __init__(
480
+ self,
481
+ in_channels: int,
482
+ out_channels: int,
483
+ temb_channels: int,
484
+ dropout: float = 0.0,
485
+ num_layers: int = 1,
486
+ resnet_eps: float = 1e-6,
487
+ resnet_time_scale_shift: str = "default",
488
+ resnet_act_fn: str = "swish",
489
+ resnet_groups: int = 32,
490
+ resnet_pre_norm: bool = True,
491
+ attn_num_head_channels=1,
492
+ cross_attention_dim=1280,
493
+ output_scale_factor=1.0,
494
+ downsample_padding=1,
495
+ add_downsample=True,
496
+ dual_cross_attention=False,
497
+ use_linear_projection=False,
498
+ only_cross_attention=False,
499
+ upcast_attention=False,
500
+
501
+ ):
502
+ super().__init__()
503
+ resnets = []
504
+ attentions = []
505
+
506
+ self.has_cross_attention = True
507
+ self.attn_num_head_channels = attn_num_head_channels
508
+
509
+ for i in range(num_layers):
510
+ in_channels = in_channels if i == 0 else out_channels
511
+ resnets.append(
512
+ FFSpatioTempResnetBlock3D(
513
+ in_channels=in_channels,
514
+ out_channels=out_channels,
515
+ temb_channels=temb_channels,
516
+ eps=resnet_eps,
517
+ groups=resnet_groups,
518
+ dropout=dropout,
519
+ time_embedding_norm=resnet_time_scale_shift,
520
+ non_linearity=resnet_act_fn,
521
+ output_scale_factor=output_scale_factor,
522
+ pre_norm=resnet_pre_norm,
523
+
524
+ )
525
+ )
526
+ if dual_cross_attention:
527
+ raise NotImplementedError
528
+ attentions.append(
529
+ FFSpatioTempTransformer3DModel(
530
+ attn_num_head_channels,
531
+ out_channels // attn_num_head_channels,
532
+ in_channels=out_channels,
533
+ num_layers=1,
534
+ cross_attention_dim=cross_attention_dim,
535
+ norm_num_groups=resnet_groups,
536
+ use_linear_projection=use_linear_projection,
537
+ only_cross_attention=only_cross_attention,
538
+ upcast_attention=upcast_attention,
539
+ )
540
+ )
541
+ self.attentions = nn.ModuleList(attentions)
542
+ self.resnets = nn.ModuleList(resnets)
543
+
544
+ if add_downsample:
545
+ self.downsamplers = nn.ModuleList(
546
+ [
547
+ FFSpatioTempResDownsample3D(
548
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op",
549
+
550
+ )
551
+ ]
552
+ )
553
+ else:
554
+ self.downsamplers = None
555
+
556
+ self.gradient_checkpointing = False
557
+
558
+ def forward(self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None,
559
+ cross_attention_kwargs=None):
560
+ output_states = ()
561
+
562
+ for resnet, attn in zip(self.resnets, self.attentions):
563
+ if self.training and self.gradient_checkpointing:
564
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
565
+ hidden_states = torch.utils.checkpoint.checkpoint(
566
+ create_custom_forward(attn, return_dict=False),
567
+ hidden_states,
568
+ encoder_hidden_states,
569
+ cross_attention_kwargs
570
+ )[0]
571
+ else:
572
+ hidden_states = resnet(hidden_states, temb)
573
+ hidden_states = attn(
574
+ hidden_states,
575
+ encoder_hidden_states=encoder_hidden_states,
576
+ cross_attention_kwargs=cross_attention_kwargs,
577
+ ).sample
578
+
579
+ output_states += (hidden_states,)
580
+
581
+ if self.downsamplers is not None:
582
+ for downsampler in self.downsamplers:
583
+ hidden_states = downsampler(hidden_states)
584
+
585
+ output_states += (hidden_states,)
586
+
587
+ return hidden_states, output_states
588
+
589
+
590
+ class FFSpatioTempCrossAttnUpBlock3D(nn.Module):
591
+ def __init__(
592
+ self,
593
+ in_channels: int,
594
+ out_channels: int,
595
+ prev_output_channel: int,
596
+ temb_channels: int,
597
+ dropout: float = 0.0,
598
+ num_layers: int = 1,
599
+ resnet_eps: float = 1e-6,
600
+ resnet_time_scale_shift: str = "default",
601
+ resnet_act_fn: str = "swish",
602
+ resnet_groups: int = 32,
603
+ resnet_pre_norm: bool = True,
604
+ attn_num_head_channels=1,
605
+ cross_attention_dim=1280,
606
+ output_scale_factor=1.0,
607
+ add_upsample=True,
608
+ dual_cross_attention=False,
609
+ use_linear_projection=False,
610
+ only_cross_attention=False,
611
+ upcast_attention=False,
612
+
613
+ ):
614
+ super().__init__()
615
+ resnets = []
616
+ attentions = []
617
+
618
+ self.has_cross_attention = True
619
+ self.attn_num_head_channels = attn_num_head_channels
620
+
621
+ for i in range(num_layers):
622
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
623
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
624
+
625
+ resnets.append(
626
+ FFSpatioTempResnetBlock3D(
627
+ in_channels=resnet_in_channels + res_skip_channels,
628
+ out_channels=out_channels,
629
+ temb_channels=temb_channels,
630
+ eps=resnet_eps,
631
+ groups=resnet_groups,
632
+ dropout=dropout,
633
+ time_embedding_norm=resnet_time_scale_shift,
634
+ non_linearity=resnet_act_fn,
635
+ output_scale_factor=output_scale_factor,
636
+ pre_norm=resnet_pre_norm,
637
+
638
+ )
639
+ )
640
+ if dual_cross_attention:
641
+ raise NotImplementedError
642
+ attentions.append(
643
+ FFSpatioTempTransformer3DModel(
644
+ attn_num_head_channels,
645
+ out_channels // attn_num_head_channels,
646
+ in_channels=out_channels,
647
+ num_layers=1,
648
+ cross_attention_dim=cross_attention_dim,
649
+ norm_num_groups=resnet_groups,
650
+ use_linear_projection=use_linear_projection,
651
+ only_cross_attention=only_cross_attention,
652
+ upcast_attention=upcast_attention,
653
+ )
654
+ )
655
+
656
+ self.attentions = nn.ModuleList(attentions)
657
+ self.resnets = nn.ModuleList(resnets)
658
+
659
+ if add_upsample:
660
+ self.upsamplers = nn.ModuleList(
661
+ [FFSpatioTempResUpsample3D(out_channels, use_conv=True, out_channels=out_channels,
662
+ )])
663
+ else:
664
+ self.upsamplers = None
665
+
666
+ self.gradient_checkpointing = False
667
+
668
+ def forward(
669
+ self,
670
+ hidden_states,
671
+ res_hidden_states_tuple,
672
+ temb=None,
673
+ encoder_hidden_states=None,
674
+ upsample_size=None,
675
+ attention_mask=None,
676
+ cross_attention_kwargs=None
677
+ ):
678
+ for resnet, attn in zip(self.resnets, self.attentions):
679
+ # pop res hidden states
680
+ res_hidden_states = res_hidden_states_tuple[-1]
681
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
682
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
683
+
684
+ if self.training and self.gradient_checkpointing:
685
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
686
+ hidden_states = torch.utils.checkpoint.checkpoint(
687
+ create_custom_forward(attn, return_dict=False),
688
+ hidden_states,
689
+ encoder_hidden_states,
690
+ cross_attention_kwargs
691
+ )[0]
692
+ else:
693
+ hidden_states = resnet(hidden_states, temb)
694
+ hidden_states = attn(
695
+ hidden_states,
696
+ encoder_hidden_states=encoder_hidden_states,
697
+ cross_attention_kwargs=cross_attention_kwargs,
698
+ ).sample
699
+
700
+ if self.upsamplers is not None:
701
+ for upsampler in self.upsamplers:
702
+ hidden_states = upsampler(hidden_states, upsample_size)
703
+
704
+ return hidden_states
705
+
706
+
707
+ ##### Audio Condition Blocks #####
708
+
709
+ class FFSpatioAudioTempCrossAttnUNetMidBlock3D(nn.Module):
710
+ def __init__(
711
+ self,
712
+ in_channels: int,
713
+ temb_channels: int,
714
+ dropout: float = 0.0,
715
+ num_layers: int = 1,
716
+ resnet_eps: float = 1e-6,
717
+ resnet_time_scale_shift: str = "default",
718
+ resnet_act_fn: str = "swish",
719
+ resnet_groups: int = 32,
720
+ resnet_pre_norm: bool = True,
721
+ attn_num_head_channels=1,
722
+ output_scale_factor=1.0,
723
+ cross_attention_dim=1280,
724
+ audio_cross_attention_dim=768,
725
+ dual_cross_attention=False,
726
+ use_linear_projection=False,
727
+ upcast_attention=False,
728
+
729
+ ):
730
+ super().__init__()
731
+
732
+ self.has_cross_attention = True
733
+ self.attn_num_head_channels = attn_num_head_channels
734
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
735
+
736
+ # there is always at least one resnet
737
+ resnets = [
738
+ FFSpatioTempResnetBlock3D(
739
+ in_channels=in_channels,
740
+ out_channels=in_channels,
741
+ temb_channels=temb_channels,
742
+ eps=resnet_eps,
743
+ groups=resnet_groups,
744
+ dropout=dropout,
745
+ time_embedding_norm=resnet_time_scale_shift,
746
+ non_linearity=resnet_act_fn,
747
+ output_scale_factor=output_scale_factor,
748
+ pre_norm=resnet_pre_norm,
749
+
750
+ )
751
+ ]
752
+ attentions = []
753
+
754
+ for _ in range(num_layers):
755
+ if dual_cross_attention:
756
+ raise NotImplementedError
757
+ attentions.append(
758
+ FFSpatioAudioTempTransformer3DModel(
759
+ attn_num_head_channels,
760
+ in_channels // attn_num_head_channels,
761
+ in_channels=in_channels,
762
+ num_layers=1,
763
+ cross_attention_dim=cross_attention_dim,
764
+ audio_cross_attention_dim=audio_cross_attention_dim,
765
+ norm_num_groups=resnet_groups,
766
+ use_linear_projection=use_linear_projection,
767
+ upcast_attention=upcast_attention,
768
+ )
769
+ )
770
+ resnets.append(
771
+ FFSpatioTempResnetBlock3D(
772
+ in_channels=in_channels,
773
+ out_channels=in_channels,
774
+ temb_channels=temb_channels,
775
+ eps=resnet_eps,
776
+ groups=resnet_groups,
777
+ dropout=dropout,
778
+ time_embedding_norm=resnet_time_scale_shift,
779
+ non_linearity=resnet_act_fn,
780
+ output_scale_factor=output_scale_factor,
781
+ pre_norm=resnet_pre_norm,
782
+
783
+ )
784
+ )
785
+
786
+ self.attentions = nn.ModuleList(attentions)
787
+ self.resnets = nn.ModuleList(resnets)
788
+
789
+ self.gradient_checkpointing = False
790
+
791
+ def forward(self, hidden_states, temb=None,
792
+ encoder_hidden_states=None, attention_mask=None,
793
+ audio_encoder_hidden_states=None, audio_attention_mask=None,
794
+ cross_attention_kwargs=None):
795
+ assert cross_attention_kwargs is None
796
+ if self.training and self.gradient_checkpointing:
797
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(self.resnets[0]), hidden_states,
798
+ temb)
799
+ else:
800
+ hidden_states = self.resnets[0](hidden_states, temb)
801
+ for attn, resnet in zip(self.attentions, self.resnets[1:]):
802
+ if self.training and self.gradient_checkpointing:
803
+ hidden_states = torch.utils.checkpoint.checkpoint(
804
+ create_custom_forward(attn, return_dict=False),
805
+ hidden_states,
806
+ encoder_hidden_states,
807
+ audio_encoder_hidden_states,
808
+ audio_attention_mask,
809
+ )[0]
810
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
811
+ else:
812
+ hidden_states = attn(
813
+ hidden_states,
814
+ encoder_hidden_states=encoder_hidden_states,
815
+ audio_encoder_hidden_states=audio_encoder_hidden_states,
816
+ audio_attention_mask=audio_attention_mask,
817
+ cross_attention_kwargs=cross_attention_kwargs
818
+ ).sample
819
+ hidden_states = resnet(hidden_states, temb)
820
+
821
+ return hidden_states
822
+
823
+
824
+ class FFSpatioAudioTempCrossAttnDownBlock3D(nn.Module):
825
+ def __init__(
826
+ self,
827
+ in_channels: int,
828
+ out_channels: int,
829
+ temb_channels: int,
830
+ dropout: float = 0.0,
831
+ num_layers: int = 1,
832
+ resnet_eps: float = 1e-6,
833
+ resnet_time_scale_shift: str = "default",
834
+ resnet_act_fn: str = "swish",
835
+ resnet_groups: int = 32,
836
+ resnet_pre_norm: bool = True,
837
+ attn_num_head_channels=1,
838
+ cross_attention_dim=1280,
839
+ audio_cross_attention_dim=768,
840
+ output_scale_factor=1.0,
841
+ downsample_padding=1,
842
+ add_downsample=True,
843
+ dual_cross_attention=False,
844
+ use_linear_projection=False,
845
+ only_cross_attention=False,
846
+ upcast_attention=False,
847
+
848
+ ):
849
+ super().__init__()
850
+ resnets = []
851
+ attentions = []
852
+
853
+ self.has_cross_attention = True
854
+ self.attn_num_head_channels = attn_num_head_channels
855
+
856
+ for i in range(num_layers):
857
+ in_channels = in_channels if i == 0 else out_channels
858
+ resnets.append(
859
+ FFSpatioTempResnetBlock3D(
860
+ in_channels=in_channels,
861
+ out_channels=out_channels,
862
+ temb_channels=temb_channels,
863
+ eps=resnet_eps,
864
+ groups=resnet_groups,
865
+ dropout=dropout,
866
+ time_embedding_norm=resnet_time_scale_shift,
867
+ non_linearity=resnet_act_fn,
868
+ output_scale_factor=output_scale_factor,
869
+ pre_norm=resnet_pre_norm,
870
+
871
+ )
872
+ )
873
+ if dual_cross_attention:
874
+ raise NotImplementedError
875
+ attentions.append(
876
+ FFSpatioAudioTempTransformer3DModel(
877
+ attn_num_head_channels,
878
+ out_channels // attn_num_head_channels,
879
+ in_channels=out_channels,
880
+ num_layers=1,
881
+ cross_attention_dim=cross_attention_dim,
882
+ audio_cross_attention_dim=audio_cross_attention_dim,
883
+ norm_num_groups=resnet_groups,
884
+ use_linear_projection=use_linear_projection,
885
+ only_cross_attention=only_cross_attention,
886
+ upcast_attention=upcast_attention
887
+ )
888
+ )
889
+ self.attentions = nn.ModuleList(attentions)
890
+ self.resnets = nn.ModuleList(resnets)
891
+
892
+ if add_downsample:
893
+ self.downsamplers = nn.ModuleList(
894
+ [
895
+ FFSpatioTempResDownsample3D(
896
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op",
897
+
898
+ )
899
+ ]
900
+ )
901
+ else:
902
+ self.downsamplers = None
903
+
904
+ self.gradient_checkpointing = False
905
+
906
+ def forward(self, hidden_states, temb=None,
907
+ encoder_hidden_states=None, attention_mask=None,
908
+ audio_encoder_hidden_states=None, audio_attention_mask=None,
909
+ cross_attention_kwargs=None):
910
+ output_states = ()
911
+
912
+ for resnet, attn in zip(self.resnets, self.attentions):
913
+ if self.training and self.gradient_checkpointing:
914
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
915
+ hidden_states = torch.utils.checkpoint.checkpoint(
916
+ create_custom_forward(attn, return_dict=False),
917
+ hidden_states,
918
+ encoder_hidden_states,
919
+ audio_encoder_hidden_states,
920
+ audio_attention_mask
921
+ )[0]
922
+ else:
923
+ hidden_states = resnet(hidden_states, temb)
924
+ hidden_states = attn(
925
+ hidden_states,
926
+ encoder_hidden_states=encoder_hidden_states,
927
+ audio_encoder_hidden_states=audio_encoder_hidden_states,
928
+ audio_attention_mask=audio_attention_mask,
929
+ cross_attention_kwargs=cross_attention_kwargs,
930
+ ).sample
931
+
932
+ output_states += (hidden_states,)
933
+
934
+ if self.downsamplers is not None:
935
+ for downsampler in self.downsamplers:
936
+ hidden_states = downsampler(hidden_states)
937
+
938
+ output_states += (hidden_states,)
939
+
940
+ return hidden_states, output_states
941
+
942
+
943
+ class FFSpatioAudioTempCrossAttnUpBlock3D(nn.Module):
944
+ def __init__(
945
+ self,
946
+ in_channels: int,
947
+ out_channels: int,
948
+ prev_output_channel: int,
949
+ temb_channels: int,
950
+ dropout: float = 0.0,
951
+ num_layers: int = 1,
952
+ resnet_eps: float = 1e-6,
953
+ resnet_time_scale_shift: str = "default",
954
+ resnet_act_fn: str = "swish",
955
+ resnet_groups: int = 32,
956
+ resnet_pre_norm: bool = True,
957
+ attn_num_head_channels=1,
958
+ cross_attention_dim=1280,
959
+ audio_cross_attention_dim=768,
960
+ output_scale_factor=1.0,
961
+ add_upsample=True,
962
+ dual_cross_attention=False,
963
+ use_linear_projection=False,
964
+ only_cross_attention=False,
965
+ upcast_attention=False,
966
+
967
+ ):
968
+ super().__init__()
969
+ resnets = []
970
+ attentions = []
971
+
972
+ self.has_cross_attention = True
973
+ self.attn_num_head_channels = attn_num_head_channels
974
+
975
+ for i in range(num_layers):
976
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
977
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
978
+
979
+ resnets.append(
980
+ FFSpatioTempResnetBlock3D(
981
+ in_channels=resnet_in_channels + res_skip_channels,
982
+ out_channels=out_channels,
983
+ temb_channels=temb_channels,
984
+ eps=resnet_eps,
985
+ groups=resnet_groups,
986
+ dropout=dropout,
987
+ time_embedding_norm=resnet_time_scale_shift,
988
+ non_linearity=resnet_act_fn,
989
+ output_scale_factor=output_scale_factor,
990
+ pre_norm=resnet_pre_norm,
991
+
992
+ )
993
+ )
994
+ if dual_cross_attention:
995
+ raise NotImplementedError
996
+ attentions.append(
997
+ FFSpatioAudioTempTransformer3DModel(
998
+ attn_num_head_channels,
999
+ out_channels // attn_num_head_channels,
1000
+ in_channels=out_channels,
1001
+ num_layers=1,
1002
+ cross_attention_dim=cross_attention_dim,
1003
+ audio_cross_attention_dim=audio_cross_attention_dim,
1004
+ norm_num_groups=resnet_groups,
1005
+ use_linear_projection=use_linear_projection,
1006
+ only_cross_attention=only_cross_attention,
1007
+ upcast_attention=upcast_attention,
1008
+ )
1009
+ )
1010
+
1011
+ self.attentions = nn.ModuleList(attentions)
1012
+ self.resnets = nn.ModuleList(resnets)
1013
+
1014
+ if add_upsample:
1015
+ self.upsamplers = nn.ModuleList(
1016
+ [FFSpatioTempResUpsample3D(out_channels, use_conv=True, out_channels=out_channels,
1017
+ )])
1018
+ else:
1019
+ self.upsamplers = None
1020
+
1021
+ self.gradient_checkpointing = False
1022
+
1023
+ def forward(
1024
+ self,
1025
+ hidden_states,
1026
+ res_hidden_states_tuple,
1027
+ temb=None,
1028
+ encoder_hidden_states=None,
1029
+ attention_mask=None,
1030
+ audio_encoder_hidden_states=None,
1031
+ audio_attention_mask=None,
1032
+ upsample_size=None,
1033
+ cross_attention_kwargs=None
1034
+ ):
1035
+ assert cross_attention_kwargs is None
1036
+ for resnet, attn in zip(self.resnets, self.attentions):
1037
+ # pop res hidden states
1038
+ res_hidden_states = res_hidden_states_tuple[-1]
1039
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
1040
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
1041
+
1042
+ if self.training and self.gradient_checkpointing:
1043
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
1044
+ hidden_states = torch.utils.checkpoint.checkpoint(
1045
+ create_custom_forward(attn, return_dict=False),
1046
+ hidden_states,
1047
+ encoder_hidden_states,
1048
+ audio_encoder_hidden_states,
1049
+ audio_attention_mask,
1050
+ cross_attention_kwargs
1051
+ )[0]
1052
+ else:
1053
+ hidden_states = resnet(hidden_states, temb)
1054
+ hidden_states = attn(
1055
+ hidden_states,
1056
+ encoder_hidden_states=encoder_hidden_states,
1057
+ audio_encoder_hidden_states=audio_encoder_hidden_states,
1058
+ audio_attention_mask=audio_attention_mask,
1059
+ cross_attention_kwargs=cross_attention_kwargs,
1060
+ ).sample
1061
+
1062
+ if self.upsamplers is not None:
1063
+ for upsampler in self.upsamplers:
1064
+ hidden_states = upsampler(hidden_states, upsample_size)
1065
+
1066
+ return hidden_states
1067
+
1068
+
1069
+ all_modules = [
1070
+ ##### Image Condition #####
1071
+
1072
+ FFSpatioTempResDownBlock3D,
1073
+ FFSpatioTempResUpBlock3D,
1074
+
1075
+ FFSpatioTempCrossAttnUNetMidBlock3D,
1076
+ FFSpatioTempCrossAttnDownBlock3D,
1077
+ FFSpatioTempCrossAttnUpBlock3D,
1078
+
1079
+ ##### Audio Condition #####
1080
+
1081
+ FFSpatioAudioTempCrossAttnUNetMidBlock3D,
1082
+ FFSpatioAudioTempCrossAttnDownBlock3D,
1083
+ FFSpatioAudioTempCrossAttnUpBlock3D,
1084
+ ]
unet_utils.py ADDED
@@ -0,0 +1,163 @@
1
+ from typing import Optional
2
+ from einops import rearrange
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+ from diffusers.models.attention import Attention
9
+
10
+
11
+ class InflatedConv3d(nn.Conv2d):
12
+ def forward(self, x):
13
+ video_length = x.shape[2]
14
+
15
+ x = rearrange(x, "b c f h w -> (b f) c h w")
16
+ x = super().forward(x)
17
+ x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length)
18
+
19
+ return x
20
+
21
+
22
+ class FFInflatedConv3d(nn.Conv2d):
23
+ def __init__(self, in_channels, out_channels, kernel_size, **kwargs):
24
+ super().__init__(
25
+ in_channels=in_channels,
26
+ out_channels=out_channels,
27
+ kernel_size=kernel_size,
28
+ **kwargs,
29
+ )
30
+ self.conv_temp = nn.Linear(3 * out_channels, out_channels)
31
+ nn.init.zeros_(self.conv_temp.weight.data)  # initialized to zeros
32
+ nn.init.zeros_(self.conv_temp.bias.data)
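+ # zero-initialising conv_temp makes the temporal branch a no-op at the start of training, so the
+ # inflated 3D convolution initially reproduces the pretrained 2D convolution frame by frame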
33
+
34
+ def forward(self, x):
35
+ video_length = x.shape[2]
36
+
37
+ x = rearrange(x, "b c f h w -> (b f) c h w")
38
+ x = super().forward(x)
39
+
40
+ *_, h, w = x.shape
41
+ x = rearrange(x, "(b f) c h w -> (b h w) f c", f=video_length)
42
+
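+ # for every spatial location, gather features of the first frame, the previous frame and the
+ # current frame, concatenate them, and add a learned residual temporal mix via conv_temp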
43
+ head_frame_index = [0, ] * video_length
44
+ prev_frame_index = torch.clamp(
45
+ torch.arange(video_length) - 1, min=0.0
46
+ ).long()
47
+ curr_frame_index = torch.arange(video_length).long()
48
+ conv_temp_nn_input = torch.cat([
49
+ x[:, head_frame_index],
50
+ x[:, prev_frame_index],
51
+ x[:, curr_frame_index]
52
+ ], dim=2).contiguous()
53
+ x = x + self.conv_temp(conv_temp_nn_input)
54
+
55
+ x = rearrange(x, "(b h w) f c -> b c f h w", h=h, w=w)
56
+
57
+ return x
58
+
59
+
60
+ class FFAttention(Attention):
61
+ r"""
62
+ A cross attention layer.
63
+
64
+ Parameters:
65
+ query_dim (`int`): The number of channels in the query.
66
+ cross_attention_dim (`int`, *optional*):
67
+ The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
68
+ heads (`int`, *optional*, defaults to 8): The number of heads to use for multi-head attention.
69
+ dim_head (`int`, *optional*, defaults to 64): The number of channels in each head.
70
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
71
+ bias (`bool`, *optional*, defaults to False):
72
+ Set to `True` for the query, key, and value linear layers to contain a bias parameter.
73
+ """
74
+
75
+ def __init__(
76
+ self,
77
+ *args,
78
+ scale_qk: bool = True,
79
+ processor: Optional["FFAttnProcessor"] = None,
80
+ **kwargs
81
+ ):
82
+ super().__init__(*args, scale_qk=scale_qk, processor=processor, **kwargs)
83
+ # set attention processor
84
+ # We use the AttnProcessor by default when torch 2.x is used which uses
85
+ # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
86
+ # but only if it has the default `scale` argument.
87
+ if processor is None:
88
+ processor = FFAttnProcessor()
89
+ self.set_processor(processor)
90
+
91
+ def forward(self, hidden_states, video_length, encoder_hidden_states=None, attention_mask=None,
92
+ **cross_attention_kwargs):
93
+ # The `Attention` class can call different attention processors / attention functions
94
+ # here we simply pass along all tensors to the selected processor class
95
+ # For standard processors that are defined here, `**cross_attention_kwargs` is empty
96
+ return self.processor(
97
+ self,
98
+ hidden_states,
99
+ encoder_hidden_states=encoder_hidden_states,
100
+ attention_mask=attention_mask,
101
+ video_length=video_length,
102
+ **cross_attention_kwargs,
103
+ )
104
+
105
+
106
+ class FFAttnProcessor:
107
+ def __init__(self):
108
+ if not hasattr(F, "scaled_dot_product_attention"):
109
+ raise ImportError(
110
+ "FFAttnProcessor requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
111
+
112
+ def __call__(self, attn: Attention, hidden_states, video_length, encoder_hidden_states=None, attention_mask=None):
113
+ batch_size, sequence_length, _ = (
114
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
115
+ )
116
+ inner_dim = hidden_states.shape[-1]
117
+
118
+ if attention_mask is not None:
119
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
120
+ # scaled_dot_product_attention expects attention_mask shape to be
121
+ # (batch, heads, source_length, target_length)
122
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
123
+
124
+ query = attn.to_q(hidden_states)
125
+
126
+ if encoder_hidden_states is None:
127
+ encoder_hidden_states = hidden_states
128
+ elif attn.norm_cross:
129
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
130
+
131
+ key = attn.to_k(encoder_hidden_states)
132
+ value = attn.to_v(encoder_hidden_states)
133
+
134
+ # sparse causal attention: every frame attends to the keys/values of the first frame only
+ # (former_frame_index below is computed but not used in this first-frame variant)
135
+ former_frame_index = torch.arange(video_length) - 1
136
+ former_frame_index[0] = 0
137
+
138
+ key = rearrange(key, "(b f) d c -> b f d c", f=video_length)
139
+ key = key[:, [0] * video_length].contiguous()
140
+ key = rearrange(key, "b f d c -> (b f) d c")
141
+
142
+ value = rearrange(value, "(b f) d c -> b f d c", f=video_length)
143
+ value = value[:, [0] * video_length].contiguous()
144
+ value = rearrange(value, "b f d c -> (b f) d c")
145
+
146
+ head_dim = inner_dim // attn.heads
147
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
148
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
149
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
150
+
151
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
152
+ hidden_states = F.scaled_dot_product_attention(
153
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
154
+ )
155
+
156
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
157
+ hidden_states = hidden_states.to(query.dtype)
158
+
159
+ # linear proj
160
+ hidden_states = attn.to_out[0](hidden_states)
161
+ # dropout
162
+ hidden_states = attn.to_out[1](hidden_states)
163
+ return hidden_states
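+ 
+ # Illustrative usage sketch (instance values are assumptions): FFAttention works on frame-flattened
+ # (batch * frames, tokens, dim) inputs and needs the frame count explicitly; each frame attends to
+ # the keys/values of the first frame.
+ #   attn = FFAttention(query_dim=320, heads=8, dim_head=40)
+ #   out = attn(hidden_states, video_length=16)  # hidden_states: (batch * 16, tokens, 320)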