UncleWang233 committed on
Commit
08f69f6
·
1 Parent(s): a1f4877
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. README.md +4 -5
  2. app.py +331 -0
  3. data_utils/__init__.py +0 -0
  4. data_utils/__pycache__/__init__.cpython-310.pyc +0 -0
  5. data_utils/__pycache__/utils.cpython-310.pyc +0 -0
  6. data_utils/__pycache__/utils.cpython-39.pyc +0 -0
  7. data_utils/ext/synchformer/LICENSE +21 -0
  8. data_utils/ext/synchformer/__init__.py +1 -0
  9. data_utils/ext/synchformer/__pycache__/__init__.cpython-310.pyc +0 -0
  10. data_utils/ext/synchformer/__pycache__/__init__.cpython-39.pyc +0 -0
  11. data_utils/ext/synchformer/__pycache__/motionformer.cpython-310.pyc +0 -0
  12. data_utils/ext/synchformer/__pycache__/motionformer.cpython-39.pyc +0 -0
  13. data_utils/ext/synchformer/__pycache__/synchformer.cpython-310.pyc +0 -0
  14. data_utils/ext/synchformer/__pycache__/synchformer.cpython-39.pyc +0 -0
  15. data_utils/ext/synchformer/__pycache__/utils.cpython-310.pyc +0 -0
  16. data_utils/ext/synchformer/__pycache__/utils.cpython-39.pyc +0 -0
  17. data_utils/ext/synchformer/__pycache__/video_model_builder.cpython-310.pyc +0 -0
  18. data_utils/ext/synchformer/__pycache__/video_model_builder.cpython-39.pyc +0 -0
  19. data_utils/ext/synchformer/__pycache__/vit_helper.cpython-310.pyc +0 -0
  20. data_utils/ext/synchformer/__pycache__/vit_helper.cpython-39.pyc +0 -0
  21. data_utils/ext/synchformer/divided_224_16x4.yaml +84 -0
  22. data_utils/ext/synchformer/motionformer.py +400 -0
  23. data_utils/ext/synchformer/synchformer.py +55 -0
  24. data_utils/ext/synchformer/utils.py +92 -0
  25. data_utils/ext/synchformer/video_model_builder.py +277 -0
  26. data_utils/ext/synchformer/vit_helper.py +399 -0
  27. data_utils/utils.py +115 -0
  28. data_utils/v2a_utils/__init__.py +0 -0
  29. data_utils/v2a_utils/__pycache__/__init__.cpython-310.pyc +0 -0
  30. data_utils/v2a_utils/__pycache__/audio_text_dataset.cpython-310.pyc +0 -0
  31. data_utils/v2a_utils/__pycache__/audio_text_dataset.cpython-38.pyc +0 -0
  32. data_utils/v2a_utils/__pycache__/audio_text_dataset.cpython-39.pyc +0 -0
  33. data_utils/v2a_utils/__pycache__/audioset_224.cpython-39.pyc +0 -0
  34. data_utils/v2a_utils/__pycache__/audioset_video_224.cpython-39.pyc +0 -0
  35. data_utils/v2a_utils/__pycache__/feature_utils.cpython-310.pyc +0 -0
  36. data_utils/v2a_utils/__pycache__/feature_utils.cpython-39.pyc +0 -0
  37. data_utils/v2a_utils/__pycache__/feature_utils_224.cpython-310.pyc +0 -0
  38. data_utils/v2a_utils/__pycache__/feature_utils_224.cpython-39.pyc +0 -0
  39. data_utils/v2a_utils/__pycache__/feature_utils_224_audio.cpython-310.pyc +0 -0
  40. data_utils/v2a_utils/__pycache__/feature_utils_224_audio.cpython-38.pyc +0 -0
  41. data_utils/v2a_utils/__pycache__/feature_utils_224_audio.cpython-39.pyc +0 -0
  42. data_utils/v2a_utils/__pycache__/feature_utils_224_no_sync.cpython-39.pyc +0 -0
  43. data_utils/v2a_utils/__pycache__/vggsound.cpython-310.pyc +0 -0
  44. data_utils/v2a_utils/__pycache__/vggsound.cpython-39.pyc +0 -0
  45. data_utils/v2a_utils/__pycache__/vggsound_224.cpython-310.pyc +0 -0
  46. data_utils/v2a_utils/__pycache__/vggsound_224.cpython-39.pyc +0 -0
  47. data_utils/v2a_utils/__pycache__/vggsound_224_no_audio.cpython-310.pyc +0 -0
  48. data_utils/v2a_utils/__pycache__/vggsound_224_no_sync.cpython-39.pyc +0 -0
  49. data_utils/v2a_utils/__pycache__/vggsound_text.cpython-39.pyc +0 -0
  50. data_utils/v2a_utils/feature_utils_224.py +182 -0
README.md CHANGED
@@ -1,14 +1,13 @@
1
  ---
2
- title: ThinkSound
3
- emoji: 🌍
4
- colorFrom: green
5
  colorTo: gray
6
  sdk: gradio
7
  sdk_version: 5.35.0
8
  app_file: app.py
9
  pinned: false
10
- license: apache-2.0
11
- short_description: 'demo of ThinkSound '
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Test
3
+ emoji: 📚
4
+ colorFrom: gray
5
  colorTo: gray
6
  sdk: gradio
7
  sdk_version: 5.35.0
8
  app_file: app.py
9
  pinned: false
10
+ license: mit
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,331 @@
1
+ from prefigure.prefigure import get_all_args, push_wandb_config
2
+ import json
3
+ import os
4
+ os.environ["GRADIO_TEMP_DIR"] = "./.gradio_tmp"
5
+ import re
6
+ import torch
7
+ import torchaudio
8
+ # import pytorch_lightning as pl
9
+ import lightning as L
10
+ from lightning.pytorch.callbacks import Timer, ModelCheckpoint, BasePredictionWriter
11
+ from lightning.pytorch.callbacks import Callback
12
+ from lightning.pytorch.tuner import Tuner
13
+ from lightning.pytorch import seed_everything
14
+ import random
15
+ from datetime import datetime
16
+ # from think_sound.data.dataset import create_dataloader_from_config
17
+ from think_sound.data.datamodule import DataModule
18
+ from think_sound.models import create_model_from_config
19
+ from think_sound.models.utils import load_ckpt_state_dict, remove_weight_norm_from_model
20
+ from think_sound.training import create_training_wrapper_from_config, create_demo_callback_from_config
21
+ from think_sound.training.utils import copy_state_dict
22
+ from think_sound.inference.sampling import get_alphas_sigmas, sample, sample_discrete_euler
23
+ from data_utils.v2a_utils.feature_utils_224 import FeaturesUtils
24
+ from torch.utils.data import Dataset
25
+ from typing import Optional, Union
26
+ from torchvision.transforms import v2
27
+ from torio.io import StreamingMediaDecoder
28
+ from torchvision.utils import save_image
29
+ from transformers import AutoProcessor
30
+ import torch.nn.functional as F
31
+ import gradio as gr
32
+ import tempfile
33
+ import subprocess
34
+ from huggingface_hub import hf_hub_download
35
+
36
+ _CLIP_SIZE = 224
37
+ _CLIP_FPS = 8.0
38
+
39
+ _SYNC_SIZE = 224
40
+ _SYNC_FPS = 25.0
41
+
42
+ def pad_to_square(video_tensor):
43
+ if len(video_tensor.shape) != 4:
44
+ raise ValueError("Input tensor must have shape (l, c, h, w)")
45
+
46
+ l, c, h, w = video_tensor.shape
47
+ max_side = max(h, w)
48
+
49
+ pad_h = max_side - h
50
+ pad_w = max_side - w
51
+
52
+ padding = (pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2)
53
+
54
+ video_padded = F.pad(video_tensor, pad=padding, mode='constant', value=0)
55
+
56
+ return video_padded
57
+
58
+
59
+ class VGGSound(Dataset):
60
+
61
+ def __init__(
62
+ self,
63
+ sample_rate: int = 44_100,
64
+ duration_sec: float = 9.0,
65
+ audio_samples: Optional[int] = 397312,
66
+ normalize_audio: bool = False,
67
+ ):
68
+ if audio_samples is None:
69
+ self.audio_samples = int(sample_rate * duration_sec)
70
+ else:
71
+ self.audio_samples = audio_samples
72
+ effective_duration = audio_samples / sample_rate
73
+ # make sure the duration is close enough, within 15ms
74
+ assert abs(effective_duration - duration_sec) < 0.015, \
75
+ f'audio_samples {audio_samples} does not match duration_sec {duration_sec}'
76
+
77
+ self.sample_rate = sample_rate
78
+ self.duration_sec = duration_sec
79
+
80
+ self.expected_audio_length = self.audio_samples
81
+ self.clip_expected_length = int(_CLIP_FPS * self.duration_sec)
82
+ self.sync_expected_length = int(_SYNC_FPS * self.duration_sec)
83
+
84
+ self.clip_transform = v2.Compose([
85
+ v2.Lambda(pad_to_square), # pad to a square first
86
+ v2.Resize((_CLIP_SIZE, _CLIP_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
87
+ v2.ToImage(),
88
+ v2.ToDtype(torch.float32, scale=True),
89
+ ])
90
+ self.clip_processor = AutoProcessor.from_pretrained("facebook/metaclip-h14-fullcc2.5b")
91
+ self.sync_transform = v2.Compose([
92
+ v2.Resize(_SYNC_SIZE, interpolation=v2.InterpolationMode.BICUBIC),
93
+ v2.CenterCrop(_SYNC_SIZE),
94
+ v2.ToImage(),
95
+ v2.ToDtype(torch.float32, scale=True),
96
+ v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
97
+ ])
98
+
99
+ self.resampler = {}
100
+
101
+ def sample(self, video_path, label):
102
+ video_id = video_path
103
+
104
+ reader = StreamingMediaDecoder(video_path)
105
+ reader.add_basic_video_stream(
106
+ frames_per_chunk=int(_CLIP_FPS * self.duration_sec),
107
+ frame_rate=_CLIP_FPS,
108
+ format='rgb24',
109
+ )
110
+ reader.add_basic_video_stream(
111
+ frames_per_chunk=int(_SYNC_FPS * self.duration_sec),
112
+ frame_rate=_SYNC_FPS,
113
+ format='rgb24',
114
+ )
115
+
116
+ reader.fill_buffer()
117
+ data_chunk = reader.pop_chunks()
118
+
119
+ clip_chunk = data_chunk[0]
120
+ sync_chunk = data_chunk[1]
121
+
122
+ if sync_chunk is None:
123
+ raise RuntimeError(f'Sync video returned None {video_id}')
124
+
125
+ clip_chunk = clip_chunk[:self.clip_expected_length]
126
+ # import ipdb
127
+ # ipdb.set_trace()
128
+ if clip_chunk.shape[0] != self.clip_expected_length:
129
+ current_length = clip_chunk.shape[0]
130
+ padding_needed = self.clip_expected_length - current_length
131
+
132
+ # Check that no more than 3 frames of padding are needed
133
+ assert padding_needed < 4, f'Cannot pad more than 3 frames, but {padding_needed} needed'
134
+
135
+ # If assertion passes, proceed with padding
136
+ if padding_needed > 0:
137
+ last_frame = clip_chunk[-1]
138
+ print(last_frame.shape)
139
+ # Repeat the last frame to reach the expected length
140
+ padding = last_frame.repeat(padding_needed, 1, 1, 1)
141
+ clip_chunk = torch.cat((clip_chunk, padding), dim=0)
142
+ # raise RuntimeError(f'CLIP video wrong length {video_id}, '
143
+ # f'expected {self.clip_expected_length}, '
144
+ # f'got {clip_chunk.shape[0]}')
145
+
146
+ # save_image(clip_chunk[0] / 255.0,'ori.png')
147
+ clip_chunk = pad_to_square(clip_chunk)
148
+
149
+ clip_chunk = self.clip_processor(images=clip_chunk, return_tensors="pt")["pixel_values"]
150
+
151
+ sync_chunk = sync_chunk[:self.sync_expected_length]
152
+ if sync_chunk.shape[0] != self.sync_expected_length:
153
+ # pad using the last frame
154
+ current_length = sync_chunk.shape[0]
155
+ last_frame = sync_chunk[-1]
156
+ # repeat the last frame for padding
157
+ padding = last_frame.repeat(self.sync_expected_length - current_length, 1, 1, 1)
158
+ assert self.sync_expected_length - current_length < 12, f'sync can pad at most 11 frames, but {self.sync_expected_length - current_length} needed'
159
+ sync_chunk = torch.cat((sync_chunk, padding), dim=0)
160
+ # raise RuntimeError(f'Sync video wrong length {video_id}, '
161
+ # f'expected {self.sync_expected_length}, '
162
+ # f'got {sync_chunk.shape[0]}')
163
+
164
+ sync_chunk = self.sync_transform(sync_chunk)
165
+ # assert audio_chunk.shape[1] == self.expected_audio_length and clip_chunk.shape[0] == self.clip_expected_length \
166
+ # and sync_chunk.shape[0] == self.sync_expected_length, 'error processed data shape'
167
+ data = {
168
+ 'id': video_id,
169
+ 'caption': label,
170
+ # 'audio': audio_chunk,
171
+ 'clip_video': clip_chunk,
172
+ 'sync_video': sync_chunk,
173
+ }
174
+
175
+ return data
176
+
177
+ # select devices
178
+ if torch.cuda.is_available():
179
+ device = 'cuda'
180
+ extra_device = 'cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0'
181
+ else:
182
+ device = 'cpu'
183
+ extra_device = 'cpu'
184
+
185
+ vae_ckpt = hf_hub_download(repo_id="UncleWang233/occdata", filename="epoch=3-step=100000.ckpt",repo_type="dataset")
186
+ synchformer_ckpt = hf_hub_download(repo_id="UncleWang233/occdata", filename="synchformer_state_dict.pth",repo_type="dataset")
187
+ feature_extractor = FeaturesUtils(
188
+ vae_ckpt=vae_ckpt,
189
+ vae_config='think_sound/configs/model_configs/autoencoders/stable_audio_2_0_vae.json',
190
+ enable_conditions=True,
191
+ synchformer_ckpt=synchformer_ckpt
192
+ ).eval().to(extra_device)
193
+
194
+ preprocesser = VGGSound()
195
+
196
+ args = get_all_args()
197
+
198
+ seed = 10086
199
+
200
+ seed_everything(seed, workers=True)
201
+
202
+
203
+ #Get JSON config from args.model_config
204
+ with open("think_sound/configs/model_configs/vt2audio/latent_clip_224_text_sync_mmdit_flow_logit_t5_kernel_size3.json") as f:
205
+ model_config = json.load(f)
206
+
207
+ model = create_model_from_config(model_config)
208
+
209
+ ## speed up with torch.compile
210
+ if args.compile:
211
+ model = torch.compile(model)
212
+
213
+ if args.pretrained_ckpt_path:
214
+ copy_state_dict(model, load_ckpt_state_dict(args.pretrained_ckpt_path,prefix='diffusion.')) # autoencoder. diffusion.
215
+
216
+ if args.remove_pretransform_weight_norm == "pre_load":
217
+ remove_weight_norm_from_model(model.pretransform)
218
+
219
+
220
+ load_vae_state = load_ckpt_state_dict(vae_ckpt, prefix='autoencoder.')
221
+ # new_state_dict = {k.replace("autoencoder.", ""): v for k, v in load_vae_state.items() if k.startswith("autoencoder.")}
222
+ model.pretransform.load_state_dict(load_vae_state)
223
+
224
+ # Remove weight_norm from the pretransform if specified
225
+ if args.remove_pretransform_weight_norm == "post_load":
226
+ remove_weight_norm_from_model(model.pretransform)
227
+ ckpt_path = hf_hub_download(repo_id="UncleWang233/occdata", filename="epoch=10-step=68000.ckpt",repo_type="dataset")
228
+ training_wrapper = create_training_wrapper_from_config(model_config, model)
229
+ # choose map_location based on the available device when loading model weights
230
+ if device == 'cuda':
231
+ training_wrapper.load_state_dict(torch.load(ckpt_path)['state_dict'])
232
+ else:
233
+ training_wrapper.load_state_dict(torch.load(ckpt_path, map_location=torch.device('cpu'))['state_dict'])
234
+
235
+ def get_audio(video_path, caption):
236
+ # allow an empty caption
237
+ if caption is None:
238
+ caption = ''
239
+ timer = Timer(duration="00:15:00:00")
240
+ data = preprocesser.sample(video_path, caption)
241
+
242
+ preprocessed_data = {}
243
+ metaclip_global_text_features, metaclip_text_features = feature_extractor.encode_text(data['caption'])
244
+ preprocessed_data['metaclip_global_text_features'] = metaclip_global_text_features.detach().cpu().squeeze(0)
245
+ preprocessed_data['metaclip_text_features'] = metaclip_text_features.detach().cpu().squeeze(0)
246
+
247
+ t5_features = feature_extractor.encode_t5_text(data['caption'])
248
+ preprocessed_data['t5_features'] = t5_features.detach().cpu().squeeze(0)
249
+
250
+ clip_features = feature_extractor.encode_video_with_clip(data['clip_video'].unsqueeze(0).to(extra_device))
251
+ preprocessed_data['metaclip_features'] = clip_features.detach().cpu().squeeze(0)
252
+
253
+ sync_features = feature_extractor.encode_video_with_sync(data['sync_video'].unsqueeze(0).to(extra_device))
254
+ preprocessed_data['sync_features'] = sync_features.detach().cpu().squeeze(0)
255
+ preprocessed_data['video_exist'] = torch.tensor(True)
256
+
257
+ metadata = [preprocessed_data]
258
+
259
+ batch_size = 1
260
+ length = 194
261
+ with torch.amp.autocast(device):
262
+ conditioning = training_wrapper.diffusion.conditioner(metadata, training_wrapper.device)
263
+
264
+ video_exist = torch.stack([item['video_exist'] for item in metadata],dim=0)
265
+ conditioning['metaclip_features'][~video_exist] = training_wrapper.diffusion.model.model.empty_clip_feat
266
+ conditioning['sync_features'][~video_exist] = training_wrapper.diffusion.model.model.empty_sync_feat
267
+
268
+ cond_inputs = training_wrapper.diffusion.get_conditioning_inputs(conditioning)
269
+ noise = torch.randn([batch_size, training_wrapper.diffusion.io_channels, length]).to(training_wrapper.device)
270
+ with torch.amp.autocast(device):
271
+ model = training_wrapper.diffusion.model
272
+ if training_wrapper.diffusion_objective == "v":
273
+ fakes = sample(model, noise, 24, 0, **cond_inputs, cfg_scale=5, batch_cfg=True)
274
+ elif training_wrapper.diffusion_objective == "rectified_flow":
275
+ import time
276
+ start_time = time.time()
277
+ fakes = sample_discrete_euler(model, noise, 24, **cond_inputs, cfg_scale=5, batch_cfg=True)
278
+ end_time = time.time()
279
+ execution_time = end_time - start_time
280
+ print(f"Inference time: {execution_time:.2f} s")
281
+ if training_wrapper.diffusion.pretransform is not None:
282
+ fakes = training_wrapper.diffusion.pretransform.decode(fakes)
283
+
284
+ audios = fakes.to(torch.float32).div(torch.max(torch.abs(fakes))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
285
+ # save the generated audio to a temporary file
286
+ with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_audio:
287
+ torchaudio.save(tmp_audio.name, audios[0], 44100)
288
+ audio_path = tmp_audio.name
289
+ return audio_path
290
+
291
+ # synthesize a new video: mux the generated audio with the original video using ffmpeg
292
+
293
+ def synthesize_video_with_audio(video_file, caption):
294
+ # allow an empty caption
295
+ if caption is None:
296
+ caption = ''
297
+ audio_path = get_audio(video_file, caption)
298
+ with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp_video:
299
+ output_video_path = tmp_video.name
300
+ # ffmpeg command: replace the original audio track with the new audio
301
+ cmd = [
302
+ 'ffmpeg', '-y', '-i', video_file, '-i', audio_path,
303
+ '-c:v', 'copy', '-map', '0:v:0', '-map', '1:a:0',
304
+ '-shortest', output_video_path
305
+ ]
306
+ subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
307
+ return output_video_path
308
+
309
+ # Gradio interface
310
+ with gr.Blocks() as demo:
311
+ gr.Markdown("# ThinkSound\nUpload a video and an optional caption to get the video with generated audio!")
312
+ with gr.Row():
313
+ video_input = gr.Video(label="Upload video")
314
+ caption_input = gr.Textbox(label="Caption (optional)", placeholder="Can be empty", lines=1)
315
+ output_video = gr.Video(label="Output video")
316
+ btn = gr.Button("Start synthesis")
317
+ btn.click(fn=synthesize_video_with_audio, inputs=[video_input, caption_input], outputs=output_video)
318
+
319
+ gr.Examples(
320
+ examples=[
321
+ ["./examples/1_mute.mp4", "Playing Trumpet"],
322
+ ["./examples/2_mute.mp4", "Axe striking"],
323
+ ["./examples/3_mute.mp4", "Gentle Sucking Sounds From the Pacifier"],
324
+ ["./examples/4_mute.mp4", "train passing by"],
325
+ ["./examples/5_mute.mp4", "Lighting Firecrackers"]
326
+ ],
327
+ inputs=[video_input, caption_input],
328
+ )
329
+
330
+ demo.launch(share=True)
331
+
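The `pad_to_square` helper above zero-pads the shorter spatial side symmetrically before the CLIP transforms are applied. A minimal standalone sketch of that behavior (the clip size below is made up for illustration):

```python
import torch
import torch.nn.functional as F

def pad_to_square(video_tensor):
    # same logic as in app.py: zero-pad the shorter spatial side symmetrically
    if len(video_tensor.shape) != 4:
        raise ValueError("Input tensor must have shape (l, c, h, w)")
    l, c, h, w = video_tensor.shape
    max_side = max(h, w)
    pad_h, pad_w = max_side - h, max_side - w
    padding = (pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2)
    return F.pad(video_tensor, pad=padding, mode='constant', value=0)

clip = torch.ones(72, 3, 112, 224)          # 72 frames, 3 channels, 112x224 (h, w)
square = pad_to_square(clip)
assert square.shape == (72, 3, 224, 224)    # now square
assert square[:, :, :56].abs().sum() == 0   # top band is zero padding
```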
data_utils/__init__.py ADDED
File without changes
data_utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (149 Bytes). View file
 
data_utils/__pycache__/utils.cpython-310.pyc ADDED
Binary file (4.56 kB). View file
 
data_utils/__pycache__/utils.cpython-39.pyc ADDED
Binary file (4.56 kB). View file
 
data_utils/ext/synchformer/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Vladimir Iashin
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data_utils/ext/synchformer/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from data_utils.ext.synchformer.synchformer import Synchformer
data_utils/ext/synchformer/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (243 Bytes). View file
 
data_utils/ext/synchformer/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (241 Bytes). View file
 
data_utils/ext/synchformer/__pycache__/motionformer.cpython-310.pyc ADDED
Binary file (12.7 kB). View file
 
data_utils/ext/synchformer/__pycache__/motionformer.cpython-39.pyc ADDED
Binary file (12.7 kB). View file
 
data_utils/ext/synchformer/__pycache__/synchformer.cpython-310.pyc ADDED
Binary file (1.91 kB). View file
 
data_utils/ext/synchformer/__pycache__/synchformer.cpython-39.pyc ADDED
Binary file (1.9 kB). View file
 
data_utils/ext/synchformer/__pycache__/utils.cpython-310.pyc ADDED
Binary file (3.97 kB). View file
 
data_utils/ext/synchformer/__pycache__/utils.cpython-39.pyc ADDED
Binary file (3.78 kB). View file
 
data_utils/ext/synchformer/__pycache__/video_model_builder.cpython-310.pyc ADDED
Binary file (5.84 kB). View file
 
data_utils/ext/synchformer/__pycache__/video_model_builder.cpython-39.pyc ADDED
Binary file (5.8 kB). View file
 
data_utils/ext/synchformer/__pycache__/vit_helper.cpython-310.pyc ADDED
Binary file (10.6 kB). View file
 
data_utils/ext/synchformer/__pycache__/vit_helper.cpython-39.pyc ADDED
Binary file (10.6 kB). View file
 
data_utils/ext/synchformer/divided_224_16x4.yaml ADDED
@@ -0,0 +1,84 @@
1
+ TRAIN:
2
+ ENABLE: True
3
+ DATASET: Ssv2
4
+ BATCH_SIZE: 32
5
+ EVAL_PERIOD: 5
6
+ CHECKPOINT_PERIOD: 5
7
+ AUTO_RESUME: True
8
+ CHECKPOINT_EPOCH_RESET: True
9
+ CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth
10
+ DATA:
11
+ NUM_FRAMES: 16
12
+ SAMPLING_RATE: 4
13
+ TRAIN_JITTER_SCALES: [256, 320]
14
+ TRAIN_CROP_SIZE: 224
15
+ TEST_CROP_SIZE: 224
16
+ INPUT_CHANNEL_NUM: [3]
17
+ MEAN: [0.5, 0.5, 0.5]
18
+ STD: [0.5, 0.5, 0.5]
19
+ PATH_TO_DATA_DIR: /private/home/mandelapatrick/slowfast/data/ssv2
20
+ PATH_PREFIX: /datasets01/SomethingV2/092720/20bn-something-something-v2-frames
21
+ INV_UNIFORM_SAMPLE: True
22
+ RANDOM_FLIP: False
23
+ REVERSE_INPUT_CHANNEL: True
24
+ USE_RAND_AUGMENT: True
25
+ RE_PROB: 0.0
26
+ USE_REPEATED_AUG: False
27
+ USE_RANDOM_RESIZE_CROPS: False
28
+ COLORJITTER: False
29
+ GRAYSCALE: False
30
+ GAUSSIAN: False
31
+ SOLVER:
32
+ BASE_LR: 1e-4
33
+ LR_POLICY: steps_with_relative_lrs
34
+ LRS: [1, 0.1, 0.01]
35
+ STEPS: [0, 20, 30]
36
+ MAX_EPOCH: 35
37
+ MOMENTUM: 0.9
38
+ WEIGHT_DECAY: 5e-2
39
+ WARMUP_EPOCHS: 0.0
40
+ OPTIMIZING_METHOD: adamw
41
+ USE_MIXED_PRECISION: True
42
+ SMOOTHING: 0.2
43
+ SLOWFAST:
44
+ ALPHA: 8
45
+ VIT:
46
+ PATCH_SIZE: 16
47
+ PATCH_SIZE_TEMP: 2
48
+ CHANNELS: 3
49
+ EMBED_DIM: 768
50
+ DEPTH: 12
51
+ NUM_HEADS: 12
52
+ MLP_RATIO: 4
53
+ QKV_BIAS: True
54
+ VIDEO_INPUT: True
55
+ TEMPORAL_RESOLUTION: 8
56
+ USE_MLP: True
57
+ DROP: 0.0
58
+ POS_DROPOUT: 0.0
59
+ DROP_PATH: 0.2
60
+ IM_PRETRAINED: True
61
+ HEAD_DROPOUT: 0.0
62
+ HEAD_ACT: tanh
63
+ PRETRAINED_WEIGHTS: vit_1k
64
+ ATTN_LAYER: divided
65
+ MODEL:
66
+ NUM_CLASSES: 174
67
+ ARCH: slow
68
+ MODEL_NAME: VisionTransformer
69
+ LOSS_FUNC: cross_entropy
70
+ TEST:
71
+ ENABLE: True
72
+ DATASET: Ssv2
73
+ BATCH_SIZE: 64
74
+ NUM_ENSEMBLE_VIEWS: 1
75
+ NUM_SPATIAL_CROPS: 3
76
+ DATA_LOADER:
77
+ NUM_WORKERS: 4
78
+ PIN_MEMORY: True
79
+ NUM_GPUS: 8
80
+ NUM_SHARDS: 4
81
+ RNG_SEED: 0
82
+ OUTPUT_DIR: .
83
+ TENSORBOARD:
84
+ ENABLE: True
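`motionformer.py` (next file) loads this config with OmegaConf and then patches a few attention-related fields that are not present in the YAML. A minimal sketch of that pattern, assuming the YAML above is saved locally as `divided_224_16x4.yaml`:

```python
from omegaconf import OmegaConf

# load the Motionformer config shipped next to the code
cfg = OmegaConf.load('divided_224_16x4.yaml')

# a few values defined in the YAML above
assert cfg.VIT.PATCH_SIZE == 16
assert cfg.VIT.EMBED_DIM == 768
assert cfg.DATA.TRAIN_CROP_SIZE == 224

# motionformer.py patches fields that the YAML does not define, e.g.:
cfg.VIT.ATTN_DROPOUT = 0.0
cfg.VIT.POS_EMBED = 'separate'
print(OmegaConf.to_yaml(cfg.VIT))
```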
data_utils/ext/synchformer/motionformer.py ADDED
@@ -0,0 +1,400 @@
1
+ import logging
2
+ from pathlib import Path
3
+
4
+ import einops
5
+ import torch
6
+ from omegaconf import OmegaConf
7
+ from timm.layers import trunc_normal_
8
+ from torch import nn
9
+
10
+ from data_utils.ext.synchformer.utils import check_if_file_exists_else_download
11
+ from data_utils.ext.synchformer.video_model_builder import VisionTransformer
12
+
13
+ FILE2URL = {
14
+ # cfg
15
+ 'motionformer_224_16x4.yaml':
16
+ 'https://raw.githubusercontent.com/facebookresearch/Motionformer/bf43d50/configs/SSV2/motionformer_224_16x4.yaml',
17
+ 'joint_224_16x4.yaml':
18
+ 'https://raw.githubusercontent.com/facebookresearch/Motionformer/bf43d50/configs/SSV2/joint_224_16x4.yaml',
19
+ 'divided_224_16x4.yaml':
20
+ 'https://raw.githubusercontent.com/facebookresearch/Motionformer/bf43d50/configs/SSV2/divided_224_16x4.yaml',
21
+ # ckpt
22
+ 'ssv2_motionformer_224_16x4.pyth':
23
+ 'https://dl.fbaipublicfiles.com/motionformer/ssv2_motionformer_224_16x4.pyth',
24
+ 'ssv2_joint_224_16x4.pyth':
25
+ 'https://dl.fbaipublicfiles.com/motionformer/ssv2_joint_224_16x4.pyth',
26
+ 'ssv2_divided_224_16x4.pyth':
27
+ 'https://dl.fbaipublicfiles.com/motionformer/ssv2_divided_224_16x4.pyth',
28
+ }
29
+
30
+
31
+ class MotionFormer(VisionTransformer):
32
+ ''' This class serves three purposes:
33
+ 1. Renames the class to MotionFormer.
34
+ 2. Downloads the cfg from the original repo and patches it if needed.
35
+ 3. Takes care of feature extraction by redefining .forward()
36
+ - if `extract_features=True` and `factorize_space_time=False`,
37
+ the output is of shape (B, T, D) where T = 1 + (224 // 16) * (224 // 16) * 8
38
+ - if `extract_features=True` and `factorize_space_time=True`, the output is of shape (B*S, D)
39
+ and spatial and temporal transformer encoder layers are used.
40
+ - if `extract_features=True` and `factorize_space_time=True` as well as `add_global_repr=True`
41
+ the output is of shape (B, D) and spatial and temporal transformer encoder layers
42
+ are used as well as the global representation is extracted from segments (extra pos emb
43
+ is added).
44
+ '''
45
+
46
+ def __init__(
47
+ self,
48
+ extract_features: bool = False,
49
+ ckpt_path: str = None,
50
+ factorize_space_time: bool = None,
51
+ agg_space_module: str = None,
52
+ agg_time_module: str = None,
53
+ add_global_repr: bool = True,
54
+ agg_segments_module: str = None,
55
+ max_segments: int = None,
56
+ ):
57
+ self.extract_features = extract_features
58
+ self.ckpt_path = ckpt_path
59
+ self.factorize_space_time = factorize_space_time
60
+
61
+ if self.ckpt_path is not None:
62
+ check_if_file_exists_else_download(self.ckpt_path, FILE2URL)
63
+ ckpt = torch.load(self.ckpt_path, map_location='cpu')
64
+ mformer_ckpt2cfg = {
65
+ 'ssv2_motionformer_224_16x4.pyth': 'motionformer_224_16x4.yaml',
66
+ 'ssv2_joint_224_16x4.pyth': 'joint_224_16x4.yaml',
67
+ 'ssv2_divided_224_16x4.pyth': 'divided_224_16x4.yaml',
68
+ }
69
+ # init from motionformer ckpt or from our Stage I ckpt
70
+ # depending on whether the feat extractor was pre-trained on AVCLIPMoCo or not, we need to
71
+ # load the state dict differently
72
+ was_pt_on_avclip = self.ckpt_path.endswith(
73
+ '.pt') # checks if it is a stage I ckpt (FIXME: a bit generic)
74
+ if self.ckpt_path.endswith(tuple(mformer_ckpt2cfg.keys())):
75
+ cfg_fname = mformer_ckpt2cfg[Path(self.ckpt_path).name]
76
+ elif was_pt_on_avclip:
77
+ # TODO: this is a hack, we should be able to get the cfg from the ckpt (earlier ckpt didn't have it)
78
+ s1_cfg = ckpt.get('args', None) # Stage I cfg
79
+ if s1_cfg is not None:
80
+ s1_vfeat_extractor_ckpt_path = s1_cfg.model.params.vfeat_extractor.params.ckpt_path
81
+ # if the stage I ckpt was initialized from a motionformer ckpt or train from scratch
82
+ if s1_vfeat_extractor_ckpt_path is not None:
83
+ cfg_fname = mformer_ckpt2cfg[Path(s1_vfeat_extractor_ckpt_path).name]
84
+ else:
85
+ cfg_fname = 'divided_224_16x4.yaml'
86
+ else:
87
+ cfg_fname = 'divided_224_16x4.yaml'
88
+ else:
89
+ raise ValueError(f'ckpt_path {self.ckpt_path} is not supported.')
90
+ else:
91
+ was_pt_on_avclip = False
92
+ cfg_fname = 'divided_224_16x4.yaml'
93
+ # logging.info(f'No ckpt_path provided, using {cfg_fname} config.')
94
+
95
+ if cfg_fname in ['motionformer_224_16x4.yaml', 'divided_224_16x4.yaml']:
96
+ pos_emb_type = 'separate'
97
+ elif cfg_fname == 'joint_224_16x4.yaml':
98
+ pos_emb_type = 'joint'
99
+
100
+ self.mformer_cfg_path = Path(__file__).absolute().parent / cfg_fname
101
+
102
+ check_if_file_exists_else_download(self.mformer_cfg_path, FILE2URL)
103
+ mformer_cfg = OmegaConf.load(self.mformer_cfg_path)
104
+ logging.info(f'Loading MotionFormer config from {self.mformer_cfg_path.absolute()}')
105
+
106
+ # patch the cfg (from the default cfg defined in the repo `Motionformer/slowfast/config/defaults.py`)
107
+ mformer_cfg.VIT.ATTN_DROPOUT = 0.0
108
+ mformer_cfg.VIT.POS_EMBED = pos_emb_type
109
+ mformer_cfg.VIT.USE_ORIGINAL_TRAJ_ATTN_CODE = True
110
+ mformer_cfg.VIT.APPROX_ATTN_TYPE = 'none' # guessing
111
+ mformer_cfg.VIT.APPROX_ATTN_DIM = 64 # from ckpt['cfg']
112
+
113
+ # finally init VisionTransformer with the cfg
114
+ super().__init__(mformer_cfg)
115
+
116
+ # load the ckpt now if ckpt is provided and not from AVCLIPMoCo-pretrained ckpt
117
+ if (self.ckpt_path is not None) and (not was_pt_on_avclip):
118
+ _ckpt_load_status = self.load_state_dict(ckpt['model_state'], strict=False)
119
+ if len(_ckpt_load_status.missing_keys) > 0 or len(
120
+ _ckpt_load_status.unexpected_keys) > 0:
121
+ logging.warning(f'Loading exact vfeat_extractor ckpt from {self.ckpt_path} failed.' \
122
+ f'Missing keys: {_ckpt_load_status.missing_keys}, ' \
123
+ f'Unexpected keys: {_ckpt_load_status.unexpected_keys}')
124
+ else:
125
+ logging.info(f'Loading vfeat_extractor ckpt from {self.ckpt_path} succeeded.')
126
+
127
+ if self.extract_features:
128
+ assert isinstance(self.norm,
129
+ nn.LayerNorm), 'early x[:, 1:, :] may not be safe for per-tr weights'
130
+ # pre-logits are Sequential(nn.Linear(emb, emb), act) and `act` is tanh but see the logger
131
+ self.pre_logits = nn.Identity()
132
+ # we don't need the classification head (saving memory)
133
+ self.head = nn.Identity()
134
+ self.head_drop = nn.Identity()
135
+ # avoiding code duplication (used only if agg_*_module is TransformerEncoderLayer)
136
+ transf_enc_layer_kwargs = dict(
137
+ d_model=self.embed_dim,
138
+ nhead=self.num_heads,
139
+ activation=nn.GELU(),
140
+ batch_first=True,
141
+ dim_feedforward=self.mlp_ratio * self.embed_dim,
142
+ dropout=self.drop_rate,
143
+ layer_norm_eps=1e-6,
144
+ norm_first=True,
145
+ )
146
+ # define adapters if needed
147
+ if self.factorize_space_time:
148
+ if agg_space_module == 'TransformerEncoderLayer':
149
+ self.spatial_attn_agg = SpatialTransformerEncoderLayer(
150
+ **transf_enc_layer_kwargs)
151
+ elif agg_space_module == 'AveragePooling':
152
+ self.spatial_attn_agg = AveragePooling(avg_pattern='BS D t h w -> BS D t',
153
+ then_permute_pattern='BS D t -> BS t D')
154
+ if agg_time_module == 'TransformerEncoderLayer':
155
+ self.temp_attn_agg = TemporalTransformerEncoderLayer(**transf_enc_layer_kwargs)
156
+ elif agg_time_module == 'AveragePooling':
157
+ self.temp_attn_agg = AveragePooling(avg_pattern='BS t D -> BS D')
158
+ elif 'Identity' in agg_time_module:
159
+ self.temp_attn_agg = nn.Identity()
160
+ # define a global aggregation layer (aggregarate over segments)
161
+ self.add_global_repr = add_global_repr
162
+ if add_global_repr:
163
+ if agg_segments_module == 'TransformerEncoderLayer':
164
+ # we can reuse the same layer as for temporal factorization (B, dim_to_agg, D) -> (B, D)
165
+ # we need to add pos emb (PE) because previously we added the same PE for each segment
166
+ pos_max_len = max_segments if max_segments is not None else 16 # 16 = 10sec//0.64sec + 1
167
+ self.global_attn_agg = TemporalTransformerEncoderLayer(
168
+ add_pos_emb=True,
169
+ pos_emb_drop=mformer_cfg.VIT.POS_DROPOUT,
170
+ pos_max_len=pos_max_len,
171
+ **transf_enc_layer_kwargs)
172
+ elif agg_segments_module == 'AveragePooling':
173
+ self.global_attn_agg = AveragePooling(avg_pattern='B S D -> B D')
174
+
175
+ if was_pt_on_avclip:
176
+ # we need to filter out the state_dict of the AVCLIP model (has both A and V extractors)
177
+ # and keep only the state_dict of the feat extractor
178
+ ckpt_weights = dict()
179
+ for k, v in ckpt['state_dict'].items():
180
+ if k.startswith(('module.v_encoder.', 'v_encoder.')):
181
+ k = k.replace('module.', '').replace('v_encoder.', '')
182
+ ckpt_weights[k] = v
183
+ _load_status = self.load_state_dict(ckpt_weights, strict=False)
184
+ if len(_load_status.missing_keys) > 0 or len(_load_status.unexpected_keys) > 0:
185
+ logging.warning(f'Loading exact vfeat_extractor ckpt from {self.ckpt_path} failed. \n' \
186
+ f'Missing keys ({len(_load_status.missing_keys)}): ' \
187
+ f'{_load_status.missing_keys}, \n' \
188
+ f'Unexpected keys ({len(_load_status.unexpected_keys)}): ' \
189
+ f'{_load_status.unexpected_keys} \n' \
190
+ f'temp_attn_agg are expected to be missing if ckpt was pt contrastively.')
191
+ else:
192
+ logging.info(f'Loading vfeat_extractor ckpt from {self.ckpt_path} succeeded.')
193
+
194
+ # patch_embed is not used in MotionFormer, only patch_embed_3d, because cfg.VIT.PATCH_SIZE_TEMP > 1
195
+ # but it used to calculate the number of patches, so we need to set keep it
196
+ self.patch_embed.requires_grad_(False)
197
+
198
+ def forward(self, x):
199
+ '''
200
+ x is of shape (B, S, C, T, H, W) where S is the number of segments.
201
+ '''
202
+ # Batch, Segments, Channels, T=frames, Height, Width
203
+ B, S, C, T, H, W = x.shape
204
+ # Motionformer expects a tensor of shape (1, B, C, T, H, W).
205
+ # The first dimension (1) is a dummy dimension to make the input tensor and won't be used:
206
+ # see `video_model_builder.video_input`.
207
+ # x = x.unsqueeze(0) # (1, B, S, C, T, H, W)
208
+
209
+ orig_shape = (B, S, C, T, H, W)
210
+ x = x.view(B * S, C, T, H, W) # flatten batch and segments
211
+ x = self.forward_segments(x, orig_shape=orig_shape)
212
+ # unpack the segments (using rest dimensions to support different shapes e.g. (BS, D) or (BS, t, D))
213
+ x = x.view(B, S, *x.shape[1:])
214
+ # x is now of shape (B*S, D) or (B*S, t, D) if `self.temp_attn_agg` is `Identity`
215
+
216
+ return x # x is (B, S, ...)
217
+
218
+ def forward_segments(self, x, orig_shape: tuple) -> torch.Tensor:
219
+ '''x is of shape (B*S, C, T, H, W) where S is the number of segments.'''
220
+ x, x_mask = self.forward_features(x)
221
+
222
+ assert self.extract_features
223
+
224
+ # (BS, T, D) where T = 1 + (224 // 16) * (224 // 16) * 8
225
+ x = x[:,
226
+ 1:, :] # without the CLS token for efficiency (should be safe for LayerNorm and FC)
227
+ x = self.norm(x)
228
+ x = self.pre_logits(x)
229
+ if self.factorize_space_time:
230
+ x = self.restore_spatio_temp_dims(x, orig_shape) # (B*S, D, t, h, w) <- (B*S, t*h*w, D)
231
+
232
+ x = self.spatial_attn_agg(x, x_mask) # (B*S, t, D)
233
+ x = self.temp_attn_agg(
234
+ x) # (B*S, D) or (BS, t, D) if `self.temp_attn_agg` is `Identity`
235
+
236
+ return x
237
+
238
+ def restore_spatio_temp_dims(self, feats: torch.Tensor, orig_shape: tuple) -> torch.Tensor:
239
+ '''
240
+ feats are of shape (B*S, T, D) where T = 1 + (224 // 16) * (224 // 16) * 8
241
+ Our goal is to make them of shape (B*S, t, h, w, D) where h, w are the spatial dimensions.
242
+ From `self.patch_embed_3d`, it follows that we could reshape feats with:
243
+ `feats.transpose(1, 2).view(B*S, D, t, h, w)`
244
+ '''
245
+ B, S, C, T, H, W = orig_shape
246
+ D = self.embed_dim
247
+
248
+ # num patches in each dimension
249
+ t = T // self.patch_embed_3d.z_block_size
250
+ h = self.patch_embed_3d.height
251
+ w = self.patch_embed_3d.width
252
+
253
+ feats = feats.permute(0, 2, 1) # (B*S, D, T)
254
+ feats = feats.view(B * S, D, t, h, w) # (B*S, D, t, h, w)
255
+
256
+ return feats
257
+
258
+
259
+ class BaseEncoderLayer(nn.TransformerEncoderLayer):
260
+ '''
261
+ This is a wrapper around nn.TransformerEncoderLayer that adds a CLS token
262
+ to the sequence and outputs the CLS token's representation.
263
+ This base class parents both SpatialEncoderLayer and TemporalEncoderLayer for the RGB stream
264
+ and the FrequencyEncoderLayer and TemporalEncoderLayer for the audio stream stream.
265
+ We also, optionally, add a positional embedding to the input sequence which
266
+ allows to reuse it for global aggregation (of segments) for both streams.
267
+ '''
268
+
269
+ def __init__(self,
270
+ add_pos_emb: bool = False,
271
+ pos_emb_drop: float = None,
272
+ pos_max_len: int = None,
273
+ *args_transformer_enc,
274
+ **kwargs_transformer_enc):
275
+ super().__init__(*args_transformer_enc, **kwargs_transformer_enc)
276
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, self.self_attn.embed_dim))
277
+ trunc_normal_(self.cls_token, std=.02)
278
+
279
+ # add positional embedding
280
+ self.add_pos_emb = add_pos_emb
281
+ if add_pos_emb:
282
+ self.pos_max_len = 1 + pos_max_len # +1 (for CLS)
283
+ self.pos_emb = nn.Parameter(torch.zeros(1, self.pos_max_len, self.self_attn.embed_dim))
284
+ self.pos_drop = nn.Dropout(pos_emb_drop)
285
+ trunc_normal_(self.pos_emb, std=.02)
286
+
287
+ self.apply(self._init_weights)
288
+
289
+ def forward(self, x: torch.Tensor, x_mask: torch.Tensor = None):
290
+ ''' x is of shape (B, N, D); if provided x_mask is of shape (B, N)'''
291
+ batch_dim = x.shape[0]
292
+
293
+ # add CLS token
294
+ cls_tokens = self.cls_token.expand(batch_dim, -1, -1) # expanding to match batch dimension
295
+ x = torch.cat((cls_tokens, x), dim=-2) # (batch_dim, 1+seq_len, D)
296
+ if x_mask is not None:
297
+ cls_mask = torch.ones((batch_dim, 1), dtype=torch.bool,
298
+ device=x_mask.device) # 1=keep; 0=mask
299
+ x_mask_w_cls = torch.cat((cls_mask, x_mask), dim=-1) # (batch_dim, 1+seq_len)
300
+ B, N = x_mask_w_cls.shape
301
+ # torch expects (N, N) or (B*num_heads, N, N) mask (sadness ahead); torch masks
302
+ x_mask_w_cls = x_mask_w_cls.reshape(B, 1, 1, N)\
303
+ .expand(-1, self.self_attn.num_heads, N, -1)\
304
+ .reshape(B * self.self_attn.num_heads, N, N)
305
+ assert x_mask_w_cls.dtype == x_mask_w_cls.bool().dtype, 'x_mask_w_cls.dtype != bool'
306
+ x_mask_w_cls = ~x_mask_w_cls # invert mask (1=mask)
307
+ else:
308
+ x_mask_w_cls = None
309
+
310
+ # add positional embedding
311
+ if self.add_pos_emb:
312
+ seq_len = x.shape[
313
+ 1] # (don't even think about moving it before the CLS token concatenation)
314
+ assert seq_len <= self.pos_max_len, f'Seq len ({seq_len}) > pos_max_len ({self.pos_max_len})'
315
+ x = x + self.pos_emb[:, :seq_len, :]
316
+ x = self.pos_drop(x)
317
+
318
+ # apply encoder layer (calls nn.TransformerEncoderLayer.forward);
319
+ x = super().forward(src=x, src_mask=x_mask_w_cls) # (batch_dim, 1+seq_len, D)
320
+
321
+ # CLS token is expected to hold spatial information for each frame
322
+ x = x[:, 0, :] # (batch_dim, D)
323
+
324
+ return x
325
+
326
+ def _init_weights(self, m):
327
+ if isinstance(m, nn.Linear):
328
+ trunc_normal_(m.weight, std=.02)
329
+ if isinstance(m, nn.Linear) and m.bias is not None:
330
+ nn.init.constant_(m.bias, 0)
331
+ elif isinstance(m, nn.LayerNorm):
332
+ nn.init.constant_(m.bias, 0)
333
+ nn.init.constant_(m.weight, 1.0)
334
+
335
+ @torch.jit.ignore
336
+ def no_weight_decay(self):
337
+ return {'cls_token', 'pos_emb'}
338
+
339
+
340
+ class SpatialTransformerEncoderLayer(BaseEncoderLayer):
341
+ ''' Aggregates spatial dimensions by applying attention individually to each frame. '''
342
+
343
+ def __init__(self, *args, **kwargs):
344
+ super().__init__(*args, **kwargs)
345
+
346
+ def forward(self, x: torch.Tensor, x_mask: torch.Tensor = None) -> torch.Tensor:
347
+ ''' x is of shape (B*S, D, t, h, w) where S is the number of segments.
348
+ if specified x_mask (B*S, t, h, w), 0=masked, 1=kept
349
+ Returns a tensor of shape (B*S, t, D) pooling spatial information for each frame. '''
350
+ BS, D, t, h, w = x.shape
351
+
352
+ # time as a batch dimension and flatten spatial dimensions as sequence
353
+ x = einops.rearrange(x, 'BS D t h w -> (BS t) (h w) D')
354
+ # similar to mask
355
+ if x_mask is not None:
356
+ x_mask = einops.rearrange(x_mask, 'BS t h w -> (BS t) (h w)')
357
+
358
+ # apply encoder layer (BaseEncoderLayer.forward) - it will add CLS token and output its representation
359
+ x = super().forward(x=x, x_mask=x_mask) # (B*S*t, D)
360
+
361
+ # reshape back to (B*S, t, D)
362
+ x = einops.rearrange(x, '(BS t) D -> BS t D', BS=BS, t=t)
363
+
364
+ # (B*S, t, D)
365
+ return x
366
+
367
+
368
+ class TemporalTransformerEncoderLayer(BaseEncoderLayer):
369
+ ''' Aggregates temporal dimension with attention. Also used with pos emb as global aggregation
370
+ in both streams. '''
371
+
372
+ def __init__(self, *args, **kwargs):
373
+ super().__init__(*args, **kwargs)
374
+
375
+ def forward(self, x):
376
+ ''' x is of shape (B*S, t, D) where S is the number of segments.
377
+ Returns a tensor of shape (B*S, D) pooling temporal information. '''
378
+ BS, t, D = x.shape
379
+
380
+ # apply encoder layer (BaseEncoderLayer.forward) - it will add CLS token and output its representation
381
+ x = super().forward(x) # (B*S, D)
382
+
383
+ return x # (B*S, D)
384
+
385
+
386
+ class AveragePooling(nn.Module):
387
+
388
+ def __init__(self, avg_pattern: str, then_permute_pattern: str = None) -> None:
389
+ ''' patterns are e.g. "bs t d -> bs d" '''
390
+ super().__init__()
391
+ # TODO: need to register them as buffers (but fails because these are strings)
392
+ self.reduce_fn = 'mean'
393
+ self.avg_pattern = avg_pattern
394
+ self.then_permute_pattern = then_permute_pattern
395
+
396
+ def forward(self, x: torch.Tensor, x_mask: torch.Tensor = None) -> torch.Tensor:
397
+ x = einops.reduce(x, self.avg_pattern, self.reduce_fn)
398
+ if self.then_permute_pattern is not None:
399
+ x = einops.rearrange(x, self.then_permute_pattern)
400
+ return x
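To make the einops patterns used by `AveragePooling` above concrete, here is a small standalone sketch (the tensor sizes are illustrative; 768 matches the ViT embedding dim in the config):

```python
import torch
from einops import rearrange, reduce

BS, D, t, h, w = 4, 768, 8, 14, 14
x = torch.randn(BS, D, t, h, w)

# agg_space_module == 'AveragePooling':
# 'BS D t h w -> BS D t' (mean over the spatial grid), then 'BS D t -> BS t D'
x = reduce(x, 'BS D t h w -> BS D t', 'mean')
x = rearrange(x, 'BS D t -> BS t D')
assert x.shape == (BS, t, D)

# agg_time_module == 'AveragePooling': 'BS t D -> BS D' (mean over time)
x = reduce(x, 'BS t D -> BS D', 'mean')
assert x.shape == (BS, D)
```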
data_utils/ext/synchformer/synchformer.py ADDED
@@ -0,0 +1,55 @@
1
+ import logging
2
+ from typing import Any, Mapping
3
+
4
+ import torch
5
+ from torch import nn
6
+
7
+ from data_utils.ext.synchformer.motionformer import MotionFormer
8
+
9
+
10
+ class Synchformer(nn.Module):
11
+
12
+ def __init__(self):
13
+ super().__init__()
14
+
15
+ self.vfeat_extractor = MotionFormer(extract_features=True,
16
+ factorize_space_time=True,
17
+ agg_space_module='TransformerEncoderLayer',
18
+ agg_time_module='torch.nn.Identity',
19
+ add_global_repr=False)
20
+
21
+ # self.vfeat_extractor = instantiate_from_config(vfeat_extractor)
22
+ # self.afeat_extractor = instantiate_from_config(afeat_extractor)
23
+ # # bridging the s3d latent dim (1024) into what is specified in the config
24
+ # # to match e.g. the transformer dim
25
+ # self.vproj = instantiate_from_config(vproj)
26
+ # self.aproj = instantiate_from_config(aproj)
27
+ # self.transformer = instantiate_from_config(transformer)
28
+
29
+ def forward(self, vis):
30
+ B, S, Tv, C, H, W = vis.shape
31
+ vis = vis.permute(0, 1, 3, 2, 4, 5) # (B, S, C, Tv, H, W)
32
+ # feat extractors return a tuple of segment-level and global features (ignored for sync)
33
+ # (B, S, tv, D), e.g. (B, 7, 8, 768)
34
+ vis = self.vfeat_extractor(vis)
35
+ return vis
36
+
37
+ def load_state_dict(self, sd: Mapping[str, Any], strict: bool = True):
38
+ # discard all entries except vfeat_extractor
39
+ sd = {k: v for k, v in sd.items() if k.startswith('vfeat_extractor')}
40
+
41
+ return super().load_state_dict(sd, strict)
42
+
43
+
44
+ if __name__ == "__main__":
45
+ model = Synchformer().cuda().eval()
46
+ sd = torch.load('./ext_weights/synchformer_state_dict.pth', weights_only=True)
47
+ model.load_state_dict(sd)
48
+
49
+ vid = torch.randn(2, 7, 16, 3, 224, 224).cuda()
50
+ features = model(vid).detach().cpu()
51
+ print(features.shape)
52
+
53
+ # extract and save the state dict only
54
+ # sd = torch.load('./ext_weights/sync_model_audioset.pt')['model']
55
+ # torch.save(sd, './ext_weights/synchformer_state_dict.pth')
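For reference, a tensor-shape-only sketch of the layout `Synchformer.forward` expects, mirroring the example in the `__main__` block (no model is instantiated here, so it runs on CPU without weights):

```python
import torch

# input layout: (B, S, Tv, C, H, W) = (batch, segments, frames per segment, channels, height, width)
B, S, Tv, C, H, W = 2, 7, 16, 3, 224, 224
vid = torch.randn(B, S, Tv, C, H, W)

# inside forward(), the frame and channel axes are swapped before feature extraction
vis = vid.permute(0, 1, 3, 2, 4, 5)
assert vis.shape == (B, S, C, Tv, H, W)

# the visual extractor then returns segment-level features of shape (B, S, tv, D), e.g. (2, 7, 8, 768)
```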
data_utils/ext/synchformer/utils.py ADDED
@@ -0,0 +1,92 @@
1
+ from hashlib import md5
2
+ from pathlib import Path
3
+
4
+ import requests
5
+ from tqdm import tqdm
6
+
7
+ PARENT_LINK = 'https://a3s.fi/swift/v1/AUTH_a235c0f452d648828f745589cde1219a'
8
+ FNAME2LINK = {
9
+ # S3: Synchability: AudioSet (run 2)
10
+ '24-01-22T20-34-52.pt':
11
+ f'{PARENT_LINK}/sync/sync_models/24-01-22T20-34-52/24-01-22T20-34-52.pt',
12
+ 'cfg-24-01-22T20-34-52.yaml':
13
+ f'{PARENT_LINK}/sync/sync_models/24-01-22T20-34-52/cfg-24-01-22T20-34-52.yaml',
14
+ # S2: Synchformer: AudioSet (run 2)
15
+ '24-01-04T16-39-21.pt':
16
+ f'{PARENT_LINK}/sync/sync_models/24-01-04T16-39-21/24-01-04T16-39-21.pt',
17
+ 'cfg-24-01-04T16-39-21.yaml':
18
+ f'{PARENT_LINK}/sync/sync_models/24-01-04T16-39-21/cfg-24-01-04T16-39-21.yaml',
19
+ # S2: Synchformer: AudioSet (run 1)
20
+ '23-08-28T11-23-23.pt':
21
+ f'{PARENT_LINK}/sync/sync_models/23-08-28T11-23-23/23-08-28T11-23-23.pt',
22
+ 'cfg-23-08-28T11-23-23.yaml':
23
+ f'{PARENT_LINK}/sync/sync_models/23-08-28T11-23-23/cfg-23-08-28T11-23-23.yaml',
24
+ # S2: Synchformer: LRS3 (run 2)
25
+ '23-12-23T18-33-57.pt':
26
+ f'{PARENT_LINK}/sync/sync_models/23-12-23T18-33-57/23-12-23T18-33-57.pt',
27
+ 'cfg-23-12-23T18-33-57.yaml':
28
+ f'{PARENT_LINK}/sync/sync_models/23-12-23T18-33-57/cfg-23-12-23T18-33-57.yaml',
29
+ # S2: Synchformer: VGS (run 2)
30
+ '24-01-02T10-00-53.pt':
31
+ f'{PARENT_LINK}/sync/sync_models/24-01-02T10-00-53/24-01-02T10-00-53.pt',
32
+ 'cfg-24-01-02T10-00-53.yaml':
33
+ f'{PARENT_LINK}/sync/sync_models/24-01-02T10-00-53/cfg-24-01-02T10-00-53.yaml',
34
+ # SparseSync: ft VGGSound-Full
35
+ '22-09-21T21-00-52.pt':
36
+ f'{PARENT_LINK}/sync/sync_models/22-09-21T21-00-52/22-09-21T21-00-52.pt',
37
+ 'cfg-22-09-21T21-00-52.yaml':
38
+ f'{PARENT_LINK}/sync/sync_models/22-09-21T21-00-52/cfg-22-09-21T21-00-52.yaml',
39
+ # SparseSync: ft VGGSound-Sparse
40
+ '22-07-28T15-49-45.pt':
41
+ f'{PARENT_LINK}/sync/sync_models/22-07-28T15-49-45/22-07-28T15-49-45.pt',
42
+ 'cfg-22-07-28T15-49-45.yaml':
43
+ f'{PARENT_LINK}/sync/sync_models/22-07-28T15-49-45/cfg-22-07-28T15-49-45.yaml',
44
+ # SparseSync: only pt on LRS3
45
+ '22-07-13T22-25-49.pt':
46
+ f'{PARENT_LINK}/sync/sync_models/22-07-13T22-25-49/22-07-13T22-25-49.pt',
47
+ 'cfg-22-07-13T22-25-49.yaml':
48
+ f'{PARENT_LINK}/sync/sync_models/22-07-13T22-25-49/cfg-22-07-13T22-25-49.yaml',
49
+ # SparseSync: feature extractors
50
+ 'ResNetAudio-22-08-04T09-51-04.pt':
51
+ f'{PARENT_LINK}/sync/ResNetAudio-22-08-04T09-51-04.pt', # 2s
52
+ 'ResNetAudio-22-08-03T23-14-49.pt':
53
+ f'{PARENT_LINK}/sync/ResNetAudio-22-08-03T23-14-49.pt', # 3s
54
+ 'ResNetAudio-22-08-03T23-14-28.pt':
55
+ f'{PARENT_LINK}/sync/ResNetAudio-22-08-03T23-14-28.pt', # 4s
56
+ 'ResNetAudio-22-06-24T08-10-33.pt':
57
+ f'{PARENT_LINK}/sync/ResNetAudio-22-06-24T08-10-33.pt', # 5s
58
+ 'ResNetAudio-22-06-24T17-31-07.pt':
59
+ f'{PARENT_LINK}/sync/ResNetAudio-22-06-24T17-31-07.pt', # 6s
60
+ 'ResNetAudio-22-06-24T23-57-11.pt':
61
+ f'{PARENT_LINK}/sync/ResNetAudio-22-06-24T23-57-11.pt', # 7s
62
+ 'ResNetAudio-22-06-25T04-35-42.pt':
63
+ f'{PARENT_LINK}/sync/ResNetAudio-22-06-25T04-35-42.pt', # 8s
64
+ }
65
+
66
+
67
+ def check_if_file_exists_else_download(path, fname2link=FNAME2LINK, chunk_size=1024):
68
+ '''Checks if file exists, if not downloads it from the link to the path'''
69
+ path = Path(path)
70
+ if not path.exists():
71
+ path.parent.mkdir(exist_ok=True, parents=True)
72
+ link = fname2link.get(path.name, None)
73
+ if link is None:
74
+ raise ValueError(f'Cannot find the checkpoint file: {path}. '
75
+ f'Please download it manually and ensure the path exists.')
76
+ with requests.get(fname2link[path.name], stream=True) as r:
77
+ total_size = int(r.headers.get('content-length', 0))
78
+ with tqdm(total=total_size, unit='B', unit_scale=True) as pbar:
79
+ with open(path, 'wb') as f:
80
+ for data in r.iter_content(chunk_size=chunk_size):
81
+ if data:
82
+ f.write(data)
83
+ pbar.update(chunk_size)
84
+
85
+
86
+ def get_md5sum(path):
87
+ hash_md5 = md5()
88
+ with open(path, 'rb') as f:
89
+ for chunk in iter(lambda: f.read(4096 * 8), b''):
90
+ hash_md5.update(chunk)
91
+ md5sum = hash_md5.hexdigest()
92
+ return md5sum
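A minimal sketch of how these helpers are used together (the checkpoint name is one of the keys in FNAME2LINK; the local `./ext_weights/` directory is an assumption):

```python
from pathlib import Path

from data_utils.ext.synchformer.utils import (FNAME2LINK, check_if_file_exists_else_download,
                                               get_md5sum)

# pick a checkpoint listed in FNAME2LINK and a local cache path (assumed location)
ckpt_path = Path('./ext_weights/24-01-04T16-39-21.pt')

# downloads the file from its FNAME2LINK URL if it is not already on disk
check_if_file_exists_else_download(ckpt_path, FNAME2LINK)

# optional integrity check once the file is present
print(get_md5sum(ckpt_path))
```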
data_utils/ext/synchformer/video_model_builder.py ADDED
@@ -0,0 +1,277 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
3
+ # Copyright 2020 Ross Wightman
4
+ # Modified Model definition
5
+
6
+ from collections import OrderedDict
7
+ from functools import partial
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ from timm.layers import trunc_normal_
12
+
13
+ from data_utils.ext.synchformer import vit_helper
14
+
15
+
16
+ class VisionTransformer(nn.Module):
17
+ """ Vision Transformer with support for patch or hybrid CNN input stage """
18
+
19
+ def __init__(self, cfg):
20
+ super().__init__()
21
+ self.img_size = cfg.DATA.TRAIN_CROP_SIZE
22
+ self.patch_size = cfg.VIT.PATCH_SIZE
23
+ self.in_chans = cfg.VIT.CHANNELS
24
+ if cfg.TRAIN.DATASET == "Epickitchens":
25
+ self.num_classes = [97, 300]
26
+ else:
27
+ self.num_classes = cfg.MODEL.NUM_CLASSES
28
+ self.embed_dim = cfg.VIT.EMBED_DIM
29
+ self.depth = cfg.VIT.DEPTH
30
+ self.num_heads = cfg.VIT.NUM_HEADS
31
+ self.mlp_ratio = cfg.VIT.MLP_RATIO
32
+ self.qkv_bias = cfg.VIT.QKV_BIAS
33
+ self.drop_rate = cfg.VIT.DROP
34
+ self.drop_path_rate = cfg.VIT.DROP_PATH
35
+ self.head_dropout = cfg.VIT.HEAD_DROPOUT
36
+ self.video_input = cfg.VIT.VIDEO_INPUT
37
+ self.temporal_resolution = cfg.VIT.TEMPORAL_RESOLUTION
38
+ self.use_mlp = cfg.VIT.USE_MLP
39
+ self.num_features = self.embed_dim
40
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
41
+ self.attn_drop_rate = cfg.VIT.ATTN_DROPOUT
42
+ self.head_act = cfg.VIT.HEAD_ACT
43
+ self.cfg = cfg
44
+
45
+ # Patch Embedding
46
+ self.patch_embed = vit_helper.PatchEmbed(img_size=224,
47
+ patch_size=self.patch_size,
48
+ in_chans=self.in_chans,
49
+ embed_dim=self.embed_dim)
50
+
51
+ # 3D Patch Embedding
52
+ self.patch_embed_3d = vit_helper.PatchEmbed3D(img_size=self.img_size,
53
+ temporal_resolution=self.temporal_resolution,
54
+ patch_size=self.patch_size,
55
+ in_chans=self.in_chans,
56
+ embed_dim=self.embed_dim,
57
+ z_block_size=self.cfg.VIT.PATCH_SIZE_TEMP)
58
+ self.patch_embed_3d.proj.weight.data = torch.zeros_like(
59
+ self.patch_embed_3d.proj.weight.data)
60
+
61
+ # Number of patches
62
+ if self.video_input:
63
+ num_patches = self.patch_embed.num_patches * self.temporal_resolution
64
+ else:
65
+ num_patches = self.patch_embed.num_patches
66
+ self.num_patches = num_patches
67
+
68
+ # CLS token
69
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim))
70
+ trunc_normal_(self.cls_token, std=.02)
71
+
72
+ # Positional embedding
73
+ self.pos_embed = nn.Parameter(
74
+ torch.zeros(1, self.patch_embed.num_patches + 1, self.embed_dim))
75
+ self.pos_drop = nn.Dropout(p=cfg.VIT.POS_DROPOUT)
76
+ trunc_normal_(self.pos_embed, std=.02)
77
+
78
+ if self.cfg.VIT.POS_EMBED == "joint":
79
+ self.st_embed = nn.Parameter(torch.zeros(1, num_patches + 1, self.embed_dim))
80
+ trunc_normal_(self.st_embed, std=.02)
81
+ elif self.cfg.VIT.POS_EMBED == "separate":
82
+ self.temp_embed = nn.Parameter(torch.zeros(1, self.temporal_resolution, self.embed_dim))
83
+
84
+ # Layer Blocks
85
+ dpr = [x.item() for x in torch.linspace(0, self.drop_path_rate, self.depth)]
86
+ if self.cfg.VIT.ATTN_LAYER == "divided":
87
+             self.blocks = nn.ModuleList([
+                 vit_helper.DividedSpaceTimeBlock(
+                     attn_type=cfg.VIT.ATTN_LAYER,
+                     dim=self.embed_dim,
+                     num_heads=self.num_heads,
+                     mlp_ratio=self.mlp_ratio,
+                     qkv_bias=self.qkv_bias,
+                     drop=self.drop_rate,
+                     attn_drop=self.attn_drop_rate,
+                     drop_path=dpr[i],
+                     norm_layer=norm_layer,
+                 ) for i in range(self.depth)
+             ])
+         else:
+             self.blocks = nn.ModuleList([
+                 vit_helper.Block(attn_type=cfg.VIT.ATTN_LAYER,
+                                  dim=self.embed_dim,
+                                  num_heads=self.num_heads,
+                                  mlp_ratio=self.mlp_ratio,
+                                  qkv_bias=self.qkv_bias,
+                                  drop=self.drop_rate,
+                                  attn_drop=self.attn_drop_rate,
+                                  drop_path=dpr[i],
+                                  norm_layer=norm_layer,
+                                  use_original_code=self.cfg.VIT.USE_ORIGINAL_TRAJ_ATTN_CODE)
+                 for i in range(self.depth)
+             ])
+         self.norm = norm_layer(self.embed_dim)
+
+         # MLP head
+         if self.use_mlp:
+             hidden_dim = self.embed_dim
+             if self.head_act == 'tanh':
+                 # logging.info("Using TanH activation in MLP")
+                 act = nn.Tanh()
+             elif self.head_act == 'gelu':
+                 # logging.info("Using GELU activation in MLP")
+                 act = nn.GELU()
+             else:
+                 # logging.info("Using ReLU activation in MLP")
+                 act = nn.ReLU()
+             self.pre_logits = nn.Sequential(
+                 OrderedDict([
+                     ('fc', nn.Linear(self.embed_dim, hidden_dim)),
+                     ('act', act),
+                 ]))
+         else:
+             self.pre_logits = nn.Identity()
+
+         # Classifier Head
+         self.head_drop = nn.Dropout(p=self.head_dropout)
+         if isinstance(self.num_classes, (list, )) and len(self.num_classes) > 1:
+             for a, i in enumerate(range(len(self.num_classes))):
+                 setattr(self, "head%d" % a, nn.Linear(self.embed_dim, self.num_classes[i]))
+         else:
+             self.head = nn.Linear(self.embed_dim,
+                                   self.num_classes) if self.num_classes > 0 else nn.Identity()
+
+         # Initialize weights
+         self.apply(self._init_weights)
+
+     def _init_weights(self, m):
+         if isinstance(m, nn.Linear):
+             trunc_normal_(m.weight, std=.02)
+             if isinstance(m, nn.Linear) and m.bias is not None:
+                 nn.init.constant_(m.bias, 0)
+         elif isinstance(m, nn.LayerNorm):
+             nn.init.constant_(m.bias, 0)
+             nn.init.constant_(m.weight, 1.0)
+
+     @torch.jit.ignore
+     def no_weight_decay(self):
+         if self.cfg.VIT.POS_EMBED == "joint":
+             return {'pos_embed', 'cls_token', 'st_embed'}
+         else:
+             return {'pos_embed', 'cls_token', 'temp_embed'}
+
+     def get_classifier(self):
+         return self.head
+
+     def reset_classifier(self, num_classes, global_pool=''):
+         self.num_classes = num_classes
+         self.head = (nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity())
+
+     def forward_features(self, x):
+         # if self.video_input:
+         #     x = x[0]
+         B = x.shape[0]
+
+         # Tokenize input
+         # if self.cfg.VIT.PATCH_SIZE_TEMP > 1:
+         # for simplicity of mapping between content dimensions (input x) and token dims (after patching)
+         # we use the same trick as for AST (see modeling_ast.ASTModel.forward for the details):
+
+         # apply patching on input
+         x = self.patch_embed_3d(x)
+         tok_mask = None
+
+         # else:
+         #     tok_mask = None
+         #     # 2D tokenization
+         #     if self.video_input:
+         #         x = x.permute(0, 2, 1, 3, 4)
+         #         (B, T, C, H, W) = x.shape
+         #         x = x.reshape(B * T, C, H, W)
+
+         #     x = self.patch_embed(x)
+
+         #     if self.video_input:
+         #         (B2, T2, D2) = x.shape
+         #         x = x.reshape(B, T * T2, D2)
+
+         # Append CLS token
+         cls_tokens = self.cls_token.expand(B, -1, -1)
+         x = torch.cat((cls_tokens, x), dim=1)
+         # if tok_mask is not None:
+         #     # prepend 1(=keep) to the mask to account for the CLS token as well
+         #     tok_mask = torch.cat((torch.ones_like(tok_mask[:, [0]]), tok_mask), dim=1)
+
+         # Interpolate positional embeddings
+         # if self.cfg.DATA.TRAIN_CROP_SIZE != 224:
+         #     pos_embed = self.pos_embed
+         #     N = pos_embed.shape[1] - 1
+         #     npatch = int((x.size(1) - 1) / self.temporal_resolution)
+         #     class_emb = pos_embed[:, 0]
+         #     pos_embed = pos_embed[:, 1:]
+         #     dim = x.shape[-1]
+         #     pos_embed = torch.nn.functional.interpolate(
+         #         pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(0, 3, 1, 2),
+         #         scale_factor=math.sqrt(npatch / N),
+         #         mode='bicubic',
+         #     )
+         #     pos_embed = pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+         #     new_pos_embed = torch.cat((class_emb.unsqueeze(0), pos_embed), dim=1)
+         # else:
+         new_pos_embed = self.pos_embed
+         npatch = self.patch_embed.num_patches
+
+         # Add positional embeddings to input
+         if self.video_input:
+             if self.cfg.VIT.POS_EMBED == "separate":
+                 cls_embed = self.pos_embed[:, 0, :].unsqueeze(1)
+                 tile_pos_embed = new_pos_embed[:, 1:, :].repeat(1, self.temporal_resolution, 1)
+                 tile_temporal_embed = self.temp_embed.repeat_interleave(npatch, 1)
+                 total_pos_embed = tile_pos_embed + tile_temporal_embed
+                 total_pos_embed = torch.cat([cls_embed, total_pos_embed], dim=1)
+                 x = x + total_pos_embed
+             elif self.cfg.VIT.POS_EMBED == "joint":
+                 x = x + self.st_embed
+         else:
+             # image input
+             x = x + new_pos_embed
+
+         # Apply positional dropout
+         x = self.pos_drop(x)
+
+         # Encoding using transformer layers
+         for i, blk in enumerate(self.blocks):
+             x = blk(x,
+                     seq_len=npatch,
+                     num_frames=self.temporal_resolution,
+                     approx=self.cfg.VIT.APPROX_ATTN_TYPE,
+                     num_landmarks=self.cfg.VIT.APPROX_ATTN_DIM,
+                     tok_mask=tok_mask)
+
+         ### v-iashin: I moved it to the forward pass
+         # x = self.norm(x)[:, 0]
+         # x = self.pre_logits(x)
+         ###
+         return x, tok_mask
+
+     # def forward(self, x):
+     #     x = self.forward_features(x)
+     #     ### v-iashin: here. This should leave the same forward output as before
+     #     x = self.norm(x)[:, 0]
+     #     x = self.pre_logits(x)
+     #     ###
+     #     x = self.head_drop(x)
+     #     if isinstance(self.num_classes, (list, )) and len(self.num_classes) > 1:
+     #         output = []
+     #         for head in range(len(self.num_classes)):
+     #             x_out = getattr(self, "head%d" % head)(x)
+     #             if not self.training:
+     #                 x_out = torch.nn.functional.softmax(x_out, dim=-1)
+     #             output.append(x_out)
+     #         return output
+     #     else:
+     #         x = self.head(x)
+     #         if not self.training:
+     #             x = torch.nn.functional.softmax(x, dim=-1)
+     #         return x
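Since forward() is commented out in this file, downstream code is expected to call forward_features and apply the final norm and pre_logits itself. Below is a minimal, illustrative sketch of such a caller; the helper name, the input shape, and the way the model instance is obtained are assumptions for illustration only and are not part of this commit.

# Hypothetical usage sketch (not part of this commit): pool a clip into one feature
# vector the way the commented-out forward() above would have done.
import torch

def extract_clip_feature(model, clip: torch.Tensor) -> torch.Tensor:
    # clip: (B, C, T, H, W), the layout consumed by patch_embed_3d above
    tokens, tok_mask = model.forward_features(clip)  # (B, 1 + T'*N, D) tokens; mask is None here
    feat = model.norm(tokens)[:, 0]                  # LayerNorm, then take the CLS token
    return model.pre_logits(feat)                    # optional MLP head (tanh / gelu / relu)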
data_utils/ext/synchformer/vit_helper.py ADDED
@@ -0,0 +1,399 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+ # Copyright 2020 Ross Wightman
+ # Modified Model definition
+ """Video models."""
+
+ import math
+
+ import torch
+ import torch.nn as nn
+ from einops import rearrange, repeat
+ from timm.layers import to_2tuple
+ from torch import einsum
+ from torch.nn import functional as F
+
+ default_cfgs = {
+     'vit_1k':
+     'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth',
+     'vit_1k_large':
+     'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p16_224-4ee7a4dc.pth',
+ }
+
+
+ def qkv_attn(q, k, v, tok_mask: torch.Tensor = None):
+     sim = einsum('b i d, b j d -> b i j', q, k)
+     # apply masking if provided, tok_mask is (B*S*H, N): 1s - keep; sim is (B*S*H, H, N, N)
+     if tok_mask is not None:
+         BSH, N = tok_mask.shape
+         sim = sim.masked_fill(tok_mask.view(BSH, 1, N) == 0,
+                               float('-inf'))  # 1 - broadcasts across N
+     attn = sim.softmax(dim=-1)
+     out = einsum('b i j, b j d -> b i d', attn, v)
+     return out
+
+
+ class DividedAttention(nn.Module):
+
+     def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
+         super().__init__()
+         self.num_heads = num_heads
+         head_dim = dim // num_heads
+         self.scale = head_dim**-0.5
+         self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+         self.proj = nn.Linear(dim, dim)
+
+         # init to zeros
+         self.qkv.weight.data.fill_(0)
+         self.qkv.bias.data.fill_(0)
+         self.proj.weight.data.fill_(1)
+         self.proj.bias.data.fill_(0)
+
+         self.attn_drop = nn.Dropout(attn_drop)
+         self.proj_drop = nn.Dropout(proj_drop)
+
+     def forward(self, x, einops_from, einops_to, tok_mask: torch.Tensor = None, **einops_dims):
+         # num of heads variable
+         h = self.num_heads
+
+         # project x to q, k, v values
+         q, k, v = self.qkv(x).chunk(3, dim=-1)
+         q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
+         if tok_mask is not None:
+             # replicate token mask across heads (b, n) -> (b, h, n) -> (b*h, n) -- same as qkv but w/o d
+             assert len(tok_mask.shape) == 2
+             tok_mask = tok_mask.unsqueeze(1).expand(-1, h, -1).reshape(-1, tok_mask.shape[1])
+
+         # Scale q
+         q *= self.scale
+
+         # Take out cls_q, cls_k, cls_v
+         (cls_q, q_), (cls_k, k_), (cls_v, v_) = map(lambda t: (t[:, 0:1], t[:, 1:]), (q, k, v))
+         # the same for masking
+         if tok_mask is not None:
+             cls_mask, mask_ = tok_mask[:, 0:1], tok_mask[:, 1:]
+         else:
+             cls_mask, mask_ = None, None
+
+         # let CLS token attend to key / values of all patches across time and space
+         cls_out = qkv_attn(cls_q, k, v, tok_mask=tok_mask)
+
+         # rearrange across time or space
+         q_, k_, v_ = map(lambda t: rearrange(t, f'{einops_from} -> {einops_to}', **einops_dims),
+                          (q_, k_, v_))
+
+         # expand CLS token keys and values across time or space and concat
+         r = q_.shape[0] // cls_k.shape[0]
+         cls_k, cls_v = map(lambda t: repeat(t, 'b () d -> (b r) () d', r=r), (cls_k, cls_v))
+
+         k_ = torch.cat((cls_k, k_), dim=1)
+         v_ = torch.cat((cls_v, v_), dim=1)
+
+         # the same for masking (if provided)
+         if tok_mask is not None:
+             # since mask does not have the latent dim (d), we need to remove it from einops dims
+             mask_ = rearrange(mask_, f'{einops_from} -> {einops_to}'.replace(' d', ''),
+                               **einops_dims)
+             cls_mask = repeat(cls_mask, 'b () -> (b r) ()',
+                               r=r)  # expand cls_mask across time or space
+             mask_ = torch.cat((cls_mask, mask_), dim=1)
+
+         # attention
+         out = qkv_attn(q_, k_, v_, tok_mask=mask_)
+
+         # merge back time or space
+         out = rearrange(out, f'{einops_to} -> {einops_from}', **einops_dims)
+
+         # concat back the cls token
+         out = torch.cat((cls_out, out), dim=1)
+
+         # merge back the heads
+         out = rearrange(out, '(b h) n d -> b n (h d)', h=h)
+
+         ## to out
+         x = self.proj(out)
+         x = self.proj_drop(x)
+         return x
+
+
+ class DividedSpaceTimeBlock(nn.Module):
+
+     def __init__(self,
+                  dim=768,
+                  num_heads=12,
+                  attn_type='divided',
+                  mlp_ratio=4.,
+                  qkv_bias=False,
+                  drop=0.,
+                  attn_drop=0.,
+                  drop_path=0.,
+                  act_layer=nn.GELU,
+                  norm_layer=nn.LayerNorm):
+         super().__init__()
+
+         self.einops_from_space = 'b (f n) d'
+         self.einops_to_space = '(b f) n d'
+         self.einops_from_time = 'b (f n) d'
+         self.einops_to_time = '(b n) f d'
+
+         self.norm1 = norm_layer(dim)
+
+         self.attn = DividedAttention(dim,
+                                      num_heads=num_heads,
+                                      qkv_bias=qkv_bias,
+                                      attn_drop=attn_drop,
+                                      proj_drop=drop)
+
+         self.timeattn = DividedAttention(dim,
+                                          num_heads=num_heads,
+                                          qkv_bias=qkv_bias,
+                                          attn_drop=attn_drop,
+                                          proj_drop=drop)
+
+         # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+         self.drop_path = nn.Identity()
+         self.norm2 = norm_layer(dim)
+         mlp_hidden_dim = int(dim * mlp_ratio)
+         self.mlp = Mlp(in_features=dim,
+                        hidden_features=mlp_hidden_dim,
+                        act_layer=act_layer,
+                        drop=drop)
+         self.norm3 = norm_layer(dim)
+
+     def forward(self,
+                 x,
+                 seq_len=196,
+                 num_frames=8,
+                 approx='none',
+                 num_landmarks=128,
+                 tok_mask: torch.Tensor = None):
+         time_output = self.timeattn(self.norm3(x),
+                                     self.einops_from_time,
+                                     self.einops_to_time,
+                                     n=seq_len,
+                                     tok_mask=tok_mask)
+         time_residual = x + time_output
+
+         space_output = self.attn(self.norm1(time_residual),
+                                  self.einops_from_space,
+                                  self.einops_to_space,
+                                  f=num_frames,
+                                  tok_mask=tok_mask)
+         space_residual = time_residual + self.drop_path(space_output)
+
+         x = space_residual
+         x = x + self.drop_path(self.mlp(self.norm2(x)))
+         return x
+
+
+ class Mlp(nn.Module):
+
+     def __init__(self,
+                  in_features,
+                  hidden_features=None,
+                  out_features=None,
+                  act_layer=nn.GELU,
+                  drop=0.):
+         super().__init__()
+         out_features = out_features or in_features
+         hidden_features = hidden_features or in_features
+         self.fc1 = nn.Linear(in_features, hidden_features)
+         self.act = act_layer()
+         self.fc2 = nn.Linear(hidden_features, out_features)
+         self.drop = nn.Dropout(drop)
+
+     def forward(self, x):
+         x = self.fc1(x)
+         x = self.act(x)
+         x = self.drop(x)
+         x = self.fc2(x)
+         x = self.drop(x)
+         return x
+
+
+ class PatchEmbed(nn.Module):
+     """ Image to Patch Embedding
+     """
+
+     def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
+         super().__init__()
+         img_size = img_size if type(img_size) is tuple else to_2tuple(img_size)
+         patch_size = patch_size if type(patch_size) is tuple else to_2tuple(patch_size)
+         num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
+         self.img_size = img_size
+         self.patch_size = patch_size
+         self.num_patches = num_patches
+
+         self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+     def forward(self, x):
+         B, C, H, W = x.shape
+         x = self.proj(x).flatten(2).transpose(1, 2)
+         return x
+
+
+ class PatchEmbed3D(nn.Module):
+     """ Image to Patch Embedding """
+
+     def __init__(self,
+                  img_size=224,
+                  temporal_resolution=4,
+                  in_chans=3,
+                  patch_size=16,
+                  z_block_size=2,
+                  embed_dim=768,
+                  flatten=True):
+         super().__init__()
+         self.height = (img_size // patch_size)
+         self.width = (img_size // patch_size)
+         ### v-iashin: these two are incorrect
+         # self.frames = (temporal_resolution // z_block_size)
+         # self.num_patches = self.height * self.width * self.frames
+         self.z_block_size = z_block_size
+         ###
+         self.proj = nn.Conv3d(in_chans,
+                               embed_dim,
+                               kernel_size=(z_block_size, patch_size, patch_size),
+                               stride=(z_block_size, patch_size, patch_size))
+         self.flatten = flatten
+
+     def forward(self, x):
+         B, C, T, H, W = x.shape
+         x = self.proj(x)
+         if self.flatten:
+             x = x.flatten(2).transpose(1, 2)
+         return x
+
+
+ class HeadMLP(nn.Module):
+
+     def __init__(self, n_input, n_classes, n_hidden=512, p=0.1):
+         super(HeadMLP, self).__init__()
+         self.n_input = n_input
+         self.n_classes = n_classes
+         self.n_hidden = n_hidden
+         if n_hidden is None:
+             # use linear classifier
+             self.block_forward = nn.Sequential(nn.Dropout(p=p),
+                                                nn.Linear(n_input, n_classes, bias=True))
+         else:
+             # use simple MLP classifier
+             self.block_forward = nn.Sequential(nn.Dropout(p=p),
+                                                nn.Linear(n_input, n_hidden, bias=True),
+                                                nn.BatchNorm1d(n_hidden), nn.ReLU(inplace=True),
+                                                nn.Dropout(p=p),
+                                                nn.Linear(n_hidden, n_classes, bias=True))
+         print(f"Dropout-NLP: {p}")
+
+     def forward(self, x):
+         return self.block_forward(x)
+
+
+ def _conv_filter(state_dict, patch_size=16):
+     """ convert patch embedding weight from manual patchify + linear proj to conv"""
+     out_dict = {}
+     for k, v in state_dict.items():
+         if 'patch_embed.proj.weight' in k:
+             v = v.reshape((v.shape[0], 3, patch_size, patch_size))
+         out_dict[k] = v
+     return out_dict
+
+
+ def adapt_input_conv(in_chans, conv_weight, agg='sum'):
+     conv_type = conv_weight.dtype
+     conv_weight = conv_weight.float()
+     O, I, J, K = conv_weight.shape
+     if in_chans == 1:
+         if I > 3:
+             assert conv_weight.shape[1] % 3 == 0
+             # For models with space2depth stems
+             conv_weight = conv_weight.reshape(O, I // 3, 3, J, K)
+             conv_weight = conv_weight.sum(dim=2, keepdim=False)
+         else:
+             if agg == 'sum':
+                 print("Summing conv1 weights")
+                 conv_weight = conv_weight.sum(dim=1, keepdim=True)
+             else:
+                 print("Averaging conv1 weights")
+                 conv_weight = conv_weight.mean(dim=1, keepdim=True)
+     elif in_chans != 3:
+         if I != 3:
+             raise NotImplementedError('Weight format not supported by conversion.')
+         else:
+             if agg == 'sum':
+                 print("Summing conv1 weights")
+                 repeat = int(math.ceil(in_chans / 3))
+                 conv_weight = conv_weight.repeat(1, repeat, 1, 1)[:, :in_chans, :, :]
+                 conv_weight *= (3 / float(in_chans))
+             else:
+                 print("Averaging conv1 weights")
+                 conv_weight = conv_weight.mean(dim=1, keepdim=True)
+                 conv_weight = conv_weight.repeat(1, in_chans, 1, 1)
+     conv_weight = conv_weight.to(conv_type)
+     return conv_weight
+
+
+ def load_pretrained(model,
+                     cfg=None,
+                     num_classes=1000,
+                     in_chans=3,
+                     filter_fn=None,
+                     strict=True,
+                     progress=False):
+     # Load state dict
+     assert cfg.VIT.PRETRAINED_WEIGHTS in default_cfgs, \
+         f"{cfg.VIT.PRETRAINED_WEIGHTS} not in [vit_1k, vit_1k_large]"
+     state_dict = torch.hub.load_state_dict_from_url(url=default_cfgs[cfg.VIT.PRETRAINED_WEIGHTS])
+
+     if filter_fn is not None:
+         state_dict = filter_fn(state_dict)
+
+     input_convs = 'patch_embed.proj'
+     if input_convs is not None and in_chans != 3:
+         if isinstance(input_convs, str):
+             input_convs = (input_convs, )
+         for input_conv_name in input_convs:
+             weight_name = input_conv_name + '.weight'
+             try:
+                 state_dict[weight_name] = adapt_input_conv(in_chans,
+                                                            state_dict[weight_name],
+                                                            agg='avg')
+                 print(
+                     f'Converted input conv {input_conv_name} pretrained weights from 3 to {in_chans} channel(s)'
+                 )
+             except NotImplementedError as e:
+                 del state_dict[weight_name]
+                 strict = False
+                 print(
+                     f'Unable to convert pretrained {input_conv_name} weights, using random init for this layer.'
+                 )
+
+     classifier_name = 'head'
+     label_offset = cfg.get('label_offset', 0)
+     pretrain_classes = 1000
+     if num_classes != pretrain_classes:
+         # completely discard fully connected if model num_classes doesn't match pretrained weights
+         del state_dict[classifier_name + '.weight']
+         del state_dict[classifier_name + '.bias']
+         strict = False
+     elif label_offset > 0:
+         # special case for pretrained weights with an extra background class in pretrained weights
+         classifier_weight = state_dict[classifier_name + '.weight']
+         state_dict[classifier_name + '.weight'] = classifier_weight[label_offset:]
+         classifier_bias = state_dict[classifier_name + '.bias']
+         state_dict[classifier_name + '.bias'] = classifier_bias[label_offset:]
+
+     loaded_state = state_dict
+     self_state = model.state_dict()
+     all_names = set(self_state.keys())
+     saved_names = set([])
+     for name, param in loaded_state.items():
+         param = param
+         if 'module.' in name:
+             name = name.replace('module.', '')
+         if name in self_state.keys() and param.shape == self_state[name].shape:
+             saved_names.add(name)
+             self_state[name].copy_(param)
+         else:
+             print(f"didn't load: {name} of shape: {param.shape}")
+     print("Missing Keys:")
+     print(all_names - saved_names)
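To make the divided space-time attention above concrete: temporal attention reshapes tokens as 'b (f n) d' -> '(b n) f d', spatial attention as 'b (f n) d' -> '(b f) n d', and the CLS token is split off and handled separately inside DividedAttention. The toy sizes and the import path below are assumptions for illustration, not values used by this repo.

# Toy shape check for DividedSpaceTimeBlock (illustrative sizes, assumed import path).
import torch
from data_utils.ext.synchformer.vit_helper import DividedSpaceTimeBlock

block = DividedSpaceTimeBlock(dim=64, num_heads=4)
B, F, N = 2, 2, 4                       # batch, frames, patches per frame
x = torch.randn(B, 1 + F * N, 64)       # CLS token followed by F*N patch tokens
y = block(x, seq_len=N, num_frames=F)   # time attention -> space attention -> MLP
assert y.shape == x.shape               # the block is shape-preserving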
data_utils/utils.py ADDED
@@ -0,0 +1,115 @@
+ """Utility functions."""
+ import contextlib
+ import csv
+ import json
+ import os
+ import pathlib
+ import warnings
+
+ import numpy as np
+
+
+ def save_args(filename, args):
+     """Save the command-line arguments."""
+     args_dict = {}
+     for key, value in vars(args).items():
+         if isinstance(value, pathlib.Path):
+             args_dict[key] = str(value)
+         else:
+             args_dict[key] = value
+     save_json(filename, args_dict)
+
+
+ def inverse_dict(d):
+     """Return the inverse dictionary."""
+     return {v: k for k, v in d.items()}
+
+
+ def save_txt(filename, data):
+     """Save a list to a TXT file."""
+     with open(filename, "w", encoding="utf8") as f:
+         for item in data:
+             f.write(f"{item}\n")
+
+
+ def load_txt(filename):
+     """Load a TXT file as a list."""
+     with open(filename, encoding="utf8") as f:
+         return [line.strip() for line in f]
+
+
+ def save_json(filename, data):
+     """Save data as a JSON file."""
+     with open(filename, "w", encoding="utf8") as f:
+         json.dump(data, f)
+
+
+ def load_json(filename):
+     """Load data from a JSON file."""
+     with open(filename, encoding="utf8") as f:
+         return json.load(f)
+
+
+ def save_csv(filename, data, header=""):
+     """Save data as a CSV file."""
+     np.savetxt(
+         filename, data, fmt="%d", delimiter=",", header=header, comments=""
+     )
+
+
+ def load_csv(filename, skiprows=1):
+     """Load data from a CSV file."""
+     return np.loadtxt(filename, dtype=int, delimiter=",", skiprows=skiprows)
+
+
+ def load_csv_text(filename, headerless=True):
+     """Read a CSV file into a list of dictionaries or lists."""
+     with open(filename) as f:
+         if headerless:
+             return [row for row in csv.reader(f)]
+         reader = csv.DictReader(f)
+         return [
+             {field: row[field] for field in reader.fieldnames}
+             for row in reader
+         ]
+
+
+ def ignore_exceptions(func):
+     """Decorator that ignores all errors and warnings."""
+
+     def inner(*args, **kwargs):
+         with warnings.catch_warnings():
+             warnings.simplefilter("ignore")
+             try:
+                 return func(*args, **kwargs)
+             except Exception:
+                 return None
+
+     return inner
+
+
+ def suppress_outputs(func):
+     """Decorator that suppresses writing to stdout and stderr."""
+
+     def inner(*args, **kwargs):
+         devnull = open(os.devnull, "w")
+         with contextlib.redirect_stdout(devnull):
+             with contextlib.redirect_stderr(devnull):
+                 return func(*args, **kwargs)
+
+     return inner
+
+
+ def resolve_paths(func):
+     """Decorator that resolves all paths."""
+
+     def inner(*args, **kwargs):
+         parsed = func(*args, **kwargs)
+         for key in vars(parsed).keys():
+             if isinstance(getattr(parsed, key), pathlib.Path):
+                 setattr(
+                     parsed, key, getattr(parsed, key).expanduser().resolve()
+                 )
+         return parsed
+
+     return inner
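The decorators at the end of this module are meant to wrap I/O helpers; a short illustrative example of combining ignore_exceptions with load_json and save_json follows. The file name is a placeholder, not one used by the repo.

# Illustrative use of the helpers above; "example_args.json" is a placeholder name.
@ignore_exceptions
def read_args(path):
    # returns None instead of raising if the file is missing or malformed
    return load_json(path)

args = read_args("example_args.json")
if args is None:
    save_json("example_args.json", {"sample_rate": 44100})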
data_utils/v2a_utils/__init__.py ADDED
File without changes
data_utils/v2a_utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (163 Bytes)
data_utils/v2a_utils/__pycache__/audio_text_dataset.cpython-310.pyc ADDED
Binary file (4.05 kB)
data_utils/v2a_utils/__pycache__/audio_text_dataset.cpython-38.pyc ADDED
Binary file (4.06 kB)
data_utils/v2a_utils/__pycache__/audio_text_dataset.cpython-39.pyc ADDED
Binary file (4.09 kB)
data_utils/v2a_utils/__pycache__/audioset_224.cpython-39.pyc ADDED
Binary file (6.64 kB)
data_utils/v2a_utils/__pycache__/audioset_video_224.cpython-39.pyc ADDED
Binary file (5.84 kB)
data_utils/v2a_utils/__pycache__/feature_utils.cpython-310.pyc ADDED
Binary file (5.23 kB)
data_utils/v2a_utils/__pycache__/feature_utils.cpython-39.pyc ADDED
Binary file (6.59 kB)
data_utils/v2a_utils/__pycache__/feature_utils_224.cpython-310.pyc ADDED
Binary file (5.94 kB)
data_utils/v2a_utils/__pycache__/feature_utils_224.cpython-39.pyc ADDED
Binary file (5.95 kB)
data_utils/v2a_utils/__pycache__/feature_utils_224_audio.cpython-310.pyc ADDED
Binary file (4.53 kB)
data_utils/v2a_utils/__pycache__/feature_utils_224_audio.cpython-38.pyc ADDED
Binary file (4.4 kB)
data_utils/v2a_utils/__pycache__/feature_utils_224_audio.cpython-39.pyc ADDED
Binary file (4.49 kB)
data_utils/v2a_utils/__pycache__/feature_utils_224_no_sync.cpython-39.pyc ADDED
Binary file (4.75 kB)
data_utils/v2a_utils/__pycache__/vggsound.cpython-310.pyc ADDED
Binary file (4.99 kB)
data_utils/v2a_utils/__pycache__/vggsound.cpython-39.pyc ADDED
Binary file (5.18 kB)
data_utils/v2a_utils/__pycache__/vggsound_224.cpython-310.pyc ADDED
Binary file (6.56 kB)
data_utils/v2a_utils/__pycache__/vggsound_224.cpython-39.pyc ADDED
Binary file (6.5 kB)
data_utils/v2a_utils/__pycache__/vggsound_224_no_audio.cpython-310.pyc ADDED
Binary file (5.64 kB)
data_utils/v2a_utils/__pycache__/vggsound_224_no_sync.cpython-39.pyc ADDED
Binary file (5.14 kB)
data_utils/v2a_utils/__pycache__/vggsound_text.cpython-39.pyc ADDED
Binary file (2.43 kB)
data_utils/v2a_utils/feature_utils_224.py ADDED
@@ -0,0 +1,182 @@
+ from typing import Literal, Optional
+ import json
+ import open_clip
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from einops import rearrange
+ from open_clip import create_model_from_pretrained
+ from torchvision.transforms import Normalize
+ from think_sound.models.factory import create_model_from_config
+ from think_sound.models.utils import load_ckpt_state_dict
+ from think_sound.training.utils import copy_state_dict
+ from transformers import AutoModel
+ from transformers import AutoProcessor
+ from transformers import T5EncoderModel, AutoTokenizer
+ import logging
+ from data_utils.ext.synchformer import Synchformer
+
+ log = logging.getLogger()
+
+
+ def patch_clip(clip_model):
+     # a hack to make it output last hidden states
+     # https://github.com/mlfoundations/open_clip/blob/fc5a37b72d705f760ebbc7915b84729816ed471f/src/open_clip/model.py#L269
+     def new_get_text_features(self, input_ids=None, attention_mask=None, position_ids=None,
+                               output_attentions: Optional[bool] = None,
+                               output_hidden_states: Optional[bool] = None,
+                               return_dict: Optional[bool] = None):
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         text_outputs = self.text_model(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+         last_hidden_state = text_outputs[0]
+         pooled_output = text_outputs[1]
+         text_features = self.text_projection(pooled_output)
+
+         return text_features, last_hidden_state
+
+     clip_model.get_text_features = new_get_text_features.__get__(clip_model)
+     return clip_model
+
+
+ class FeaturesUtils(nn.Module):
+
+     def __init__(
+         self,
+         *,
+         vae_ckpt: Optional[str] = None,
+         vae_config: Optional[str] = None,
+         synchformer_ckpt: Optional[str] = None,
+         enable_conditions: bool = True,
+         need_vae_encoder: bool = True,
+     ):
+         super().__init__()
+
+         if enable_conditions:
+             self.clip_model = AutoModel.from_pretrained("facebook/metaclip-h14-fullcc2.5b")
+             self.clip_model = patch_clip(self.clip_model)
+             self.t5_tokenizer = AutoTokenizer.from_pretrained("google/t5-v1_1-xl")
+             self.t5_model = T5EncoderModel.from_pretrained("google/t5-v1_1-xl")
+             self.clip_processor = AutoProcessor.from_pretrained("facebook/metaclip-h14-fullcc2.5b")
+             # self.clip_preprocess = Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
+             #                                  std=[0.26862954, 0.26130258, 0.27577711])
+             self.synchformer = Synchformer()
+             self.synchformer.load_state_dict(
+                 torch.load(synchformer_ckpt, weights_only=True, map_location='cpu'))
+
+             # self.tokenizer = open_clip.get_tokenizer('ViT-H-14-378-quickgelu')  # same as 'ViT-H-14'
+         else:
+             self.clip_model = None
+             self.synchformer = None
+             self.tokenizer = None
+
+         if vae_ckpt is not None:
+             with open(vae_config) as f:
+                 vae_config = json.load(f)
+             self.vae = create_model_from_config(vae_config)
+             print(f"Loading model checkpoint from {vae_ckpt}")
+             # Load checkpoint
+             copy_state_dict(self.vae, load_ckpt_state_dict(vae_ckpt, prefix='autoencoder.'))  # ,prefix='autoencoder.'
+         else:
+             self.tod = None
+
+     def compile(self):
+         if self.clip_model is not None:
+             self.clip_model.encode_image = torch.compile(self.clip_model.encode_image)
+             self.clip_model.encode_text = torch.compile(self.clip_model.encode_text)
+         if self.synchformer is not None:
+             self.synchformer = torch.compile(self.synchformer)
+
+     def train(self, mode: bool) -> None:
+         return super().train(False)
+
+     @torch.inference_mode()
+     def encode_video_with_clip(self, x: torch.Tensor, batch_size: int = -1) -> torch.Tensor:
+         assert self.clip_model is not None, 'CLIP is not loaded'
+         # x: (B, T, C, H, W) H/W: 224
+         b, t, c, h, w = x.shape
+
+         assert c == 3 and h == 224 and w == 224
+         # x = self.clip_preprocess(x)
+         x = rearrange(x, 'b t c h w -> (b t) c h w')
+         outputs = []
+         if batch_size < 0:
+             batch_size = b * t
+         for i in range(0, b * t, batch_size):
+             outputs.append(self.clip_model.get_image_features(x[i:i + batch_size]))
+         x = torch.cat(outputs, dim=0)
+         # x = self.clip_model.encode_image(x, normalize=True)
+         x = rearrange(x, '(b t) d -> b t d', b=b)
+         return x
+
+     @torch.inference_mode()
+     def encode_video_with_sync(self, x: torch.Tensor, batch_size: int = -1) -> torch.Tensor:
+         assert self.synchformer is not None, 'Synchformer is not loaded'
+         # x: (B, T, C, H, W) H/W: 224
+         b, t, c, h, w = x.shape
+         # import ipdb
+         # ipdb.set_trace()
+         assert c == 3 and h == 224 and w == 224
+
+         # partition the video
+         segment_size = 16
+         step_size = 8
+         num_segments = (t - segment_size) // step_size + 1
+         segments = []
+         for i in range(num_segments):
+             segments.append(x[:, i * step_size:i * step_size + segment_size])
+         x = torch.stack(segments, dim=1)  # (B, S, T, C, H, W)
+
+         outputs = []
+         if batch_size < 0:
+             batch_size = b
+         x = rearrange(x, 'b s t c h w -> (b s) 1 t c h w')
+         for i in range(0, b * num_segments, batch_size):
+             outputs.append(self.synchformer(x[i:i + batch_size]))
+         x = torch.cat(outputs, dim=0)
+         x = rearrange(x, '(b s) 1 t d -> b (s t) d', b=b)
+         return x
+
+     @torch.inference_mode()
+     def encode_text(self, text: list[str]) -> torch.Tensor:
+         assert self.clip_model is not None, 'CLIP is not loaded'
+         # assert self.tokenizer is not None, 'Tokenizer is not loaded'
+         # x: (B, L)
+         tokens = self.clip_processor(text=text, truncation=True, max_length=77,
+                                      padding="max_length", return_tensors="pt").to(self.device)
+         return self.clip_model.get_text_features(**tokens)
+
+     @torch.inference_mode()
+     def encode_t5_text(self, text: list[str]) -> torch.Tensor:
+         assert self.t5_model is not None, 'T5 model is not loaded'
+         assert self.t5_tokenizer is not None, 'T5 Tokenizer is not loaded'
+         # x: (B, L)
+         inputs = self.t5_tokenizer(text,
+                                    truncation=True,
+                                    max_length=77,
+                                    padding="max_length",
+                                    return_tensors="pt").to(self.device)
+         return self.t5_model(**inputs).last_hidden_state
+
+     @torch.inference_mode()
+     def encode_audio(self, x) -> torch.Tensor:
+         x = self.vae.encode(x)
+         return x
+
+     @property
+     def device(self):
+         return next(self.parameters()).device
+
+     @property
+     def dtype(self):
+         return next(self.parameters()).dtype
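The segmentation in encode_video_with_sync requires the clip to cover at least one 16-frame window with a stride of 8, i.e. num_segments = (t - 16) // 8 + 1. A hedged driver sketch follows; the checkpoint path and frame count are placeholders, and the constructor will download the pretrained MetaCLIP and T5 weights.

# Hypothetical driver (checkpoint path and sizes are placeholders, not from this repo's configs).
import torch

if __name__ == "__main__":
    feats = FeaturesUtils(synchformer_ckpt="ckpts/synchformer_state_dict.pth",
                          enable_conditions=True).eval()
    frames = torch.randn(1, 24, 3, 224, 224)          # 24 frames -> (24 - 16) // 8 + 1 = 2 segments
    sync_feat = feats.encode_video_with_sync(frames)  # (1, num_segments * tokens_per_segment, d)
    text_feat = feats.encode_text(["a dog barking"])  # pooled CLIP text features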