lym0302 committed
Commit 1fd4e9c · 1 Parent(s): 9d9a9d8
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. README.md +90 -86
  2. app.py +138 -251
  3. demo.py +0 -135
  4. docs/images/icon.png +0 -0
  5. docs/index.html +0 -147
  6. docs/style.css +0 -78
  7. docs/style_videos.css +0 -52
  8. docs/video_gen.html +0 -254
  9. docs/video_main.html +0 -98
  10. docs/video_vgg.html +0 -452
  11. {mmaudio → pipeline}/__init__.py +0 -0
  12. pipeline/__pycache__/__init__.cpython-310.pyc +0 -0
  13. pipeline/__pycache__/__init__.cpython-38.pyc +0 -0
  14. pipeline/__pycache__/pipeline.cpython-310.pyc +0 -0
  15. pipeline/__pycache__/pipeline.cpython-38.pyc +0 -0
  16. pipeline/__pycache__/step0.cpython-310.pyc +0 -0
  17. pipeline/__pycache__/step0.cpython-38.pyc +0 -0
  18. pipeline/__pycache__/step1.cpython-310.pyc +0 -0
  19. pipeline/__pycache__/step1.cpython-38.pyc +0 -0
  20. pipeline/__pycache__/step2.cpython-310.pyc +0 -0
  21. pipeline/__pycache__/step2.cpython-38.pyc +0 -0
  22. pipeline/__pycache__/step3.cpython-310.pyc +0 -0
  23. pipeline/__pycache__/step3.cpython-38.pyc +0 -0
  24. pipeline/__pycache__/step4.cpython-310.pyc +0 -0
  25. pipeline/__pycache__/step4.cpython-38.pyc +0 -0
  26. pipeline/pipeline.py +175 -0
  27. pipeline/step0.py +39 -0
  28. pipeline/step1.py +36 -0
  29. pipeline/step2.py +52 -0
  30. pipeline/step3.py +129 -0
  31. pipeline/step4.py +31 -0
  32. pyproject.toml +0 -52
  33. requirements.txt.bak +0 -27
  34. third_party/MMAudio/.gitignore +146 -0
  35. third_party/MMAudio/LICENSE +21 -0
  36. {mmaudio/data → third_party/MMAudio/mmaudio}/__init__.py +0 -0
  37. {mmaudio/ext/bigvgan_v2 → third_party/MMAudio/mmaudio/data}/__init__.py +0 -0
  38. {mmaudio → third_party/MMAudio/mmaudio}/data/av_utils.py +30 -4
  39. third_party/MMAudio/mmaudio/data/data_setup.py +174 -0
  40. {mmaudio/ext/bigvgan_v2/alias_free_activation/cuda → third_party/MMAudio/mmaudio/data/eval}/__init__.py +0 -0
  41. third_party/MMAudio/mmaudio/data/eval/audiocaps.py +39 -0
  42. third_party/MMAudio/mmaudio/data/eval/moviegen.py +131 -0
  43. third_party/MMAudio/mmaudio/data/eval/video_dataset.py +197 -0
  44. third_party/MMAudio/mmaudio/data/extracted_audio.py +88 -0
  45. third_party/MMAudio/mmaudio/data/extracted_vgg.py +101 -0
  46. {mmaudio/model → third_party/MMAudio/mmaudio/data/extraction}/__init__.py +0 -0
  47. third_party/MMAudio/mmaudio/data/extraction/vgg_sound.py +193 -0
  48. third_party/MMAudio/mmaudio/data/extraction/wav_dataset.py +132 -0
  49. third_party/MMAudio/mmaudio/data/mm_dataset.py +45 -0
  50. third_party/MMAudio/mmaudio/data/utils.py +148 -0
README.md CHANGED
@@ -1,6 +1,5 @@
1
  ---
2
  title: DeepSound-V1
3
- emoji: 🔊
4
  colorFrom: blue
5
  colorTo: indigo
6
  sdk: gradio
@@ -9,155 +8,160 @@ pinned: false
9
  ---
10
 
11
 
12
- # [Taming Multimodal Joint Training for High-Quality Video-to-Audio Synthesis](https://hkchengrex.github.io/MMAudio)
 
13
 
14
- [Ho Kei Cheng](https://hkchengrex.github.io/), [Masato Ishii](https://scholar.google.co.jp/citations?user=RRIO1CcAAAAJ), [Akio Hayakawa](https://scholar.google.com/citations?user=sXAjHFIAAAAJ), [Takashi Shibuya](https://scholar.google.com/citations?user=XCRO260AAAAJ), [Alexander Schwing](https://www.alexander-schwing.de/), [Yuki Mitsufuji](https://www.yukimitsufuji.com/)
15
 
16
- University of Illinois Urbana-Champaign, Sony AI, and Sony Group Corporation
 
 
 
 
 
 
17
 
 
18
 
19
- [[Paper (being prepared)]](https://hkchengrex.github.io/MMAudio) [[Project Page]](https://hkchengrex.github.io/MMAudio)
20
 
 
21
 
22
- **Note: This repository is still under construction. Single-example inference should work as expected. The training code will be added. Code is subject to non-backward-compatible changes.**
23
 
24
  ## Highlight
25
 
26
- MMAudio generates synchronized audio given video and/or text inputs.
27
- Our key innovation is multimodal joint training which allows training on a wide range of audio-visual and audio-text datasets.
28
- Moreover, a synchronization module aligns the generated audio with the video frames.
29
 
30
-
31
- ## Results
32
 
33
  (All audio from our algorithm MMAudio)
34
 
35
- Videos from Sora:
36
 
37
  https://github.com/user-attachments/assets/82afd192-0cee-48a1-86ca-bd39b8c8f330
38
 
 
 
 
39
 
40
- Videos from MovieGen/Hunyuan Video/VGGSound:
41
 
42
  https://github.com/user-attachments/assets/29230d4e-21c1-4cf8-a221-c28f2af6d0ca
43
 
44
- For more results, visit https://hkchengrex.com/MMAudio/video_main.html.
 
45
 
46
  ## Installation
47
 
48
- We have only tested this on Ubuntu.
49
 
50
  ### Prerequisites
51
 
52
  We recommend using a [miniforge](https://github.com/conda-forge/miniforge) environment.
53
 
54
- - Python 3.8+
55
- - PyTorch **2.5.1+** and corresponding torchvision/torchaudio (pick your CUDA version https://pytorch.org/)
56
- - ffmpeg<7 ([this is required by torchaudio](https://pytorch.org/audio/master/installation.html#optional-dependencies), you can install it in a miniforge environment with `conda install -c conda-forge 'ffmpeg<7'`)
57
 
58
- **Clone our repository:**
59
 
60
  ```bash
61
- git clone https://github.com/hkchengrex/MMAudio.git
62
  ```
63
 
64
- **Install with pip:**
65
 
66
- ```bash
67
- cd MMAudio
68
- pip install -e .
69
  ```
 
70
 
71
- (If you encounter the File "setup.py" not found error, upgrade your pip with pip install --upgrade pip)
72
-
73
- **Pretrained models:**
74
-
75
- The models will be downloaded automatically when you run the demo script. MD5 checksums are provided in `mmaudio/utils/download_utils.py`
76
-
77
- | Model | Download link | File size |
78
- | -------- | ------- | ------- |
79
- | Flow prediction network, small 16kHz | <a href="https://databank.illinois.edu/datafiles/k6jve/download" download="mmaudio_small_16k.pth">mmaudio_small_16k.pth</a> | 601M |
80
- | Flow prediction network, small 44.1kHz | <a href="https://databank.illinois.edu/datafiles/864ya/download" download="mmaudio_small_44k.pth">mmaudio_small_44k.pth</a> | 601M |
81
- | Flow prediction network, medium 44.1kHz | <a href="https://databank.illinois.edu/datafiles/pa94t/download" download="mmaudio_medium_44k.pth">mmaudio_medium_44k.pth</a> | 2.4G |
82
- | Flow prediction network, large 44.1kHz **(recommended)** | <a href="https://databank.illinois.edu/datafiles/4jx76/download" download="mmaudio_large_44k.pth">mmaudio_large_44k.pth</a> | 3.9G |
83
- | 16kHz VAE | <a href="https://github.com/hkchengrex/MMAudio/releases/download/v0.1/v1-16.pth">v1-16.pth</a> | 655M |
84
- | 16kHz BigVGAN vocoder |<a href="https://github.com/hkchengrex/MMAudio/releases/download/v0.1/best_netG.pt">best_netG.pt</a> | 429M |
85
- | 44.1kHz VAE |<a href="https://github.com/hkchengrex/MMAudio/releases/download/v0.1/v1-44.pth">v1-44.pth</a> | 1.2G |
86
- | Synchformer visual encoder |<a href="https://github.com/hkchengrex/MMAudio/releases/download/v0.1/synchformer_state_dict.pth">synchformer_state_dict.pth</a> | 907M |
87
-
88
- The 44.1kHz vocoder will be downloaded automatically.
89
-
90
- The expected directory structure (full):
91
 
92
  ```bash
93
- MMAudio
94
- ├── ext_weights
95
- │ ├── best_netG.pt
96
- │ ├── synchformer_state_dict.pth
97
- │ ├── v1-16.pth
98
- │ └── v1-44.pth
99
- ├── weights
100
- │ ├── mmaudio_small_16k.pth
101
- │ ├── mmaudio_small_44k.pth
102
- │ ├── mmaudio_medium_44k.pth
103
- │ └── mmaudio_large_44k.pth
104
- └── ...
105
  ```
106
 
107
- The expected directory structure (minimal, for the recommended model only):
108
 
109
  ```bash
110
- MMAudio
111
- ├── ext_weights
112
- │ ├── synchformer_state_dict.pth
113
- │ └── v1-44.pth
114
- ├── weights
115
- │ └── mmaudio_large_44k.pth
116
- └── ...
117
  ```
118
 
 
 
 
 
 
 
 
119
  ## Demo
120
 
121
- By default, these scripts use the `large_44k` model.
122
- In our experiments, inference only takes around 6GB of GPU memory (in 16-bit mode) which should fit in most modern GPUs.
123
 
124
  ### Command-line interface
125
 
126
  With `demo.py`
 
127
  ```bash
128
- python demo.py --duration=8 --video=<path to video> --prompt "your prompt"
129
  ```
130
- The output (audio in `.flac` format, and video in `.mp4` format) will be saved in `./output`.
 
 
 
131
  See the file for more options.
132
  Simply omit the `--video` option for text-to-audio synthesis.
133
- The default output (and training) duration is 8 seconds. Longer/shorter durations could also work, but a large deviation from the training duration may result in a lower quality.
134
 
135
-
136
- ### Gradio interface
137
 
138
  Supports video-to-audio and text-to-audio synthesis.
 
 
139
 
140
- ```
141
  python gradio_demo.py
142
- ```
143
 
144
- ### Known limitations
145
 
146
- 1. The model sometimes generates undesired unintelligible human speech-like sounds
147
- 2. The model sometimes generates undesired background music
148
- 3. The model struggles with unfamiliar concepts, e.g., it can generate "gunfires" but not "RPG firing".
149
 
150
- We believe all of these three limitations can be addressed with more high-quality training data.
 
 
151
 
152
- ## Training
153
- Work in progress.
154
 
155
- ## Evaluation
156
- Work in progress.
157
 
158
  ## Acknowledgement
159
- Many thanks to:
160
- - [Make-An-Audio 2](https://github.com/bytedance/Make-An-Audio-2) for the 16kHz BigVGAN pretrained model
161
- - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
162
- - [Synchformer](https://github.com/v-iashin/Synchformer)
163
 
 
 
 
 
 
 
1
  ---
2
  title: DeepSound-V1
 
3
  colorFrom: blue
4
  colorTo: indigo
5
  sdk: gradio
 
8
  ---
9
 
10
 
11
+ <!-- # DeepSound-V1
12
+ Official code for DeepSound-V1 -->
13
 
 
14
 
15
+ <div align="center">
16
+ <p align="center">
17
+ <h2>DeepSound-V1</h2>
18
+ <!-- <a href="https://arxiv.org/abs/2412.15322">Paper</a> | <a href="https://hkchengrex.github.io/MMAudio">Webpage</a> | <a href="https://huggingface.co/hkchengrex/MMAudio/tree/main">Models</a> | <a href="https://huggingface.co/spaces/hkchengrex/MMAudio"> Huggingface Demo</a> | <a href="https://colab.research.google.com/drive/1TAaXCY2-kPk4xE4PwKB3EqFbSnkUuzZ8?usp=sharing">Colab Demo</a> | <a href="https://replicate.com/zsxkib/mmaudio">Replicate Demo</a> -->
19
+ <a href="https://github.com/lym0302/DeepSound-V1">Paper</a> | <a href="https://github.com/lym0302/DeepSound-V1">Webpage</a> | <a href="https://github.com/lym0302/DeepSound-V1"> Huggingface Demo</a>
20
+ </p>
21
+ </div>
22
 
23
+ ## [DeepSound-V1: Start to Think Step-by-Step in the Audio Generation from Videos](https://github.com/lym0302/DeepSound-V1)
24
 
25
+ <!-- [Ho Kei Cheng](https://hkchengrex.github.io/), [Masato Ishii](https://scholar.google.co.jp/citations?user=RRIO1CcAAAAJ), [Akio Hayakawa](https://scholar.google.com/citations?user=sXAjHFIAAAAJ), [Takashi Shibuya](https://scholar.google.com/citations?user=XCRO260AAAAJ), [Alexander Schwing](https://www.alexander-schwing.de/), [Yuki Mitsufuji](https://www.yukimitsufuji.com/) -->
26
 
27
+ <!-- University of Illinois Urbana-Champaign, Sony AI, and Sony Group Corporation -->
28
 
29
+ <!-- ICCV 2025 -->
30
 
31
  ## Highlight
32
 
33
+ DeepSound-V1 is a framework that enables audio generation from videos with initial step-by-step thinking, without requiring extra annotations, based on the internal chain-of-thought (CoT) of a multi-modal large language model (MLLM).
 
 
34
 
35
+ <!-- ## Results
 
36
 
37
  (All audio from our algorithm MMAudio)
38
 
39
+ Videos from Sora:
40
 
41
  https://github.com/user-attachments/assets/82afd192-0cee-48a1-86ca-bd39b8c8f330
42
 
43
+ Videos from Veo 2:
44
+
45
+ https://github.com/user-attachments/assets/8a11419e-fee2-46e0-9e67-dfb03c48d00e
46
 
47
+ Videos from MovieGen/Hunyuan Video/VGGSound:
48
 
49
  https://github.com/user-attachments/assets/29230d4e-21c1-4cf8-a221-c28f2af6d0ca
50
 
51
+ For more results, visit https://hkchengrex.com/MMAudio/video_main.html. -->
52
+
53
 
54
  ## Installation
55
+ ```bash
56
+ conda create -n deepsound-v1 python=3.10.16 -y
57
+ conda activate deepsound-v1
58
+ pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu121
59
+ pip install flash-attn==2.5.8 --no-build-isolation
60
+ pip install -e .
61
+ pip install -r requirements.txt
62
+ ```
63
+
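A quick sanity check for the environment created above (this snippet is not part of the repository; it only imports packages installed by the commands shown):

```python
# Environment sanity check only; not part of the DeepSound-V1 code base.
import torch
import flash_attn  # noqa: F401  # verifies the flash-attn build imports cleanly

print(torch.__version__, torch.version.cuda, torch.cuda.is_available())
```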
64
 
65
+ <!-- We have only tested this on Ubuntu.
66
 
67
  ### Prerequisites
68
 
69
  We recommend using a [miniforge](https://github.com/conda-forge/miniforge) environment.
70
 
71
+ - Python 3.9+
72
+ - PyTorch **2.5.1+** and corresponding torchvision/torchaudio (pick your CUDA version https://pytorch.org/, pip install recommended)
73
+ <!-- - ffmpeg<7 ([this is required by torchaudio](https://pytorch.org/audio/master/installation.html#optional-dependencies), you can install it in a miniforge environment with `conda install -c conda-forge 'ffmpeg<7'`) -->
74
 
75
+ <!-- **1. Install prerequisite if not yet met:**
76
 
77
  ```bash
78
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 --upgrade
79
  ```
80
 
81
+ (Or any other CUDA versions that your GPUs/driver support) -->
82
 
83
+ <!-- ```
84
+ conda install -c conda-forge 'ffmpeg<7'
 
85
  ```
86
+ (Optional, if you use miniforge and don't already have the appropriate ffmpeg) -->
87
 
88
+ <!-- **2. Clone our repository:**
89
 
90
  ```bash
91
+ git clone https://github.com/lym0302/DeepSound-V1.git
92
  ```
93
 
94
+ **3. Install with pip (install pytorch first before attempting this!):**
95
 
96
  ```bash
97
+ cd DeepSound-V1
98
+ pip install -e .
 
 
 
 
 
99
  ```
100
 
101
+ (If you encounter the File "setup.py" not found error, upgrade your pip with pip install --upgrade pip) -->
102
+
103
+
104
+ <!-- The models will be downloaded automatically when you run the demo script. MD5 checksums are provided in `mmaudio/utils/download_utils.py`.
105
+ The models are also available at https://huggingface.co/hkchengrex/MMAudio/tree/main
106
+ See [MODELS.md](docs/MODELS.md) for more details. -->
107
+
108
  ## Demo
109
 
110
+ ### Pretrained models
111
+ See [MODELS.md](docs/MODELS.md).
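For reference, a sketch of how the checkpoints could be fetched manually, mirroring the commented-out download block in the new `app.py` from this commit (repo IDs, URLs, and target directories are taken from there; MODELS.md remains the authoritative list):

```python
# Sketch only: mirrors the commented-out download block in app.py from this commit.
import os

import requests
from huggingface_hub import snapshot_download

# Chain-of-thought MLLM used by step 0 and step 2 (see the Pipeline setup in app.py).
os.makedirs("pretrained/mllm", exist_ok=True)
snapshot_download(repo_id="lym0302/VideoLLaMA2.1-7B-AV-CoT", cache_dir="pretrained/mllm")

# BS-RoFormer checkpoint and config used by step 3 (step3_mode='bs_roformer').
remove_vo_dir = "pretrained/remove_vo/checkpoints"
os.makedirs(remove_vo_dir, exist_ok=True)
urls = [
    "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/model_bs_roformer_ep_317_sdr_12.9755.ckpt",
    "https://raw.githubusercontent.com/ZFTurbo/Music-Source-Separation-Training/main/configs/viperx/model_bs_roformer_ep_317_sdr_12.9755.yaml",
]
for url in urls:
    file_path = os.path.join(remove_vo_dir, url.split("/")[-1])
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(file_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

# MMAudio weights for step 1 are expected under pretrained/v2a/mmaudio.
os.makedirs("pretrained/v2a/mmaudio", exist_ok=True)
```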
112
 
113
  ### Command-line interface
114
 
115
  With `demo.py`
116
+
117
  ```bash
118
+ python demo.py -i <video_path>
119
  ```
120
+
121
+ All training parameters are [here]().
122
+
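Beyond the CLI, the pipeline can be driven from Python. The sketch below mirrors how the new `app.py` in this commit builds and calls it; the constructor arguments, mode strings, and yielded dictionary keys are taken from `app.py`, while the input path is a placeholder and whether `demo.py` shares this interface is not shown in this diff:

```python
# Sketch mirroring app.py from this commit; checkpoints must already be in place.
import sys

sys.path.append('third_party/MMAudio')  # app.py adds this path before importing the pipeline

from pipeline.pipeline import Pipeline

pipeline = Pipeline(
    step0_model_dir='pretrained/mllm/models--lym0302--VideoLLaMA2.1-7B-AV-CoT',
    step1_mode='mmaudio_medium_44k',
    step2_model_dir='pretrained/mllm/models--lym0302--VideoLLaMA2.1-7B-AV-CoT',
    step2_mode='cot',
    step3_mode='bs_roformer',
)

# run_for_gradio is a generator: each step yields a dict with a 'log' message, and the
# final dict also carries 'temp_final_audio_path' and 'temp_final_video_path'.
for step_results in pipeline.run_for_gradio(
        video_input='input.mp4',   # placeholder path
        output_dir='output',
        mode='s4',                 # 's3' or 's4', as exposed in the Gradio UI
        postp_mode='neg',          # 'rm', 'rep', or 'neg'
        prompt='',
        negative_prompt='',
        duration=10,
        seed=42):
    print(step_results['log'])
    if step_results['log'] == 'Finish step-by-step v2a.':
        break
```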
123
+ <!-- The output (audio in `.wav` format, and video in `.mp4` format) will be saved in `./output`.
124
  See the file for more options.
125
  Simply omit the `--video` option for text-to-audio synthesis.
126
+ The default output (and training) duration is 8 seconds. Longer/shorter durations could also work, but a large deviation from the training duration may result in a lower quality. -->
127
 
128
+ <!-- ### Gradio interface
 
129
 
130
  Supports video-to-audio and text-to-audio synthesis.
131
+ You can also try experimental image-to-audio synthesis which duplicates the input image to a video for processing. This might be interesting to some but it is not something MMAudio has been trained for.
132
+ Use [port forwarding](https://unix.stackexchange.com/questions/115897/whats-ssh-port-forwarding-and-whats-the-difference-between-ssh-local-and-remot) (e.g., `ssh -L 7860:localhost:7860 server`) if necessary. The default port is `7860` which you can specify with `--port`.
133
 
134
+ ```bash
135
  python gradio_demo.py
136
+ ``` -->
137
 
 
138
 
 
 
 
139
 
140
+ ## Evaluation
141
+ Refer to [av-benchmark](https://github.com/hkchengrex/av-benchmark) for benchmarking results.
142
+ See [EVAL.md](docs/EVAL.md).
143
 
 
 
144
 
145
+ ## Citation
146
+
147
+ <!-- ```bibtex
148
+ @inproceedings{cheng2025taming,
149
+ title={Taming Multimodal Joint Training for High-Quality Video-to-Audio Synthesis},
150
+ author={Cheng, Ho Kei and Ishii, Masato and Hayakawa, Akio and Shibuya, Takashi and Schwing, Alexander and Mitsufuji, Yuki},
151
+ booktitle={CVPR},
152
+ year={2025}
153
+ }
154
+ ``` -->
155
+
156
+ ## Relevant Repositories
157
+
158
+ - [av-benchmark](https://github.com/hkchengrex/av-benchmark) for benchmarking results.
159
+
160
 
161
  ## Acknowledgement
 
 
 
 
162
 
163
+ Many thanks to:
164
+ - [VideoLLaMA2](https://github.com/DAMO-NLP-SG/VideoLLaMA2)
165
+ - [MMAudio](https://github.com/hkchengrex/MMAudio)
166
+ - [FoleyCrafter](https://github.com/open-mmlab/FoleyCrafter)
167
+ - [BS-RoFormer](https://github.com/ZFTurbo/Music-Source-Separation-Training)
app.py CHANGED
@@ -1,275 +1,162 @@
1
- import spaces
2
- import logging
3
- from datetime import datetime
4
- from pathlib import Path
5
-
6
- import gradio as gr
7
- import torch
8
- import torchaudio
9
  import os
10
 
11
- try:
12
- import mmaudio
13
- except ImportError:
14
- os.system("pip install -e .")
15
- import mmaudio
16
-
17
- from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate, load_video, make_video,
18
- setup_eval_logging)
19
- from mmaudio.model.flow_matching import FlowMatching
20
- from mmaudio.model.networks import MMAudio, get_my_mmaudio
21
- from mmaudio.model.sequence_config import SequenceConfig
22
- from mmaudio.model.utils.features_utils import FeaturesUtils
23
- import tempfile
24
-
25
- torch.backends.cuda.matmul.allow_tf32 = True
26
- torch.backends.cudnn.allow_tf32 = True
27
-
28
- log = logging.getLogger()
29
-
30
- device = 'cpu'
31
- dtype = torch.bfloat16
32
-
33
- model: ModelConfig = all_model_cfg['large_44k_v2']
34
- model.download_if_needed()
35
- output_dir = Path('./output/gradio')
36
 
37
  setup_eval_logging()
 
 
 
 
 
 
 
38
 
39
-
40
- def get_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
41
- seq_cfg = model.seq_cfg
42
-
43
- net: MMAudio = get_my_mmaudio(model.model_name).to(device, dtype).eval()
44
- net.load_weights(torch.load(model.model_path, map_location=device, weights_only=True))
45
- log.info(f'Loaded weights from {model.model_path}')
46
-
47
- feature_utils = FeaturesUtils(tod_vae_ckpt=model.vae_path,
48
- synchformer_ckpt=model.synchformer_ckpt,
49
- enable_conditions=True,
50
- mode=model.mode,
51
- bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
52
- need_vae_encoder=False)
53
- feature_utils = feature_utils.to(device, dtype).eval()
54
-
55
- return net, feature_utils, seq_cfg
56
-
57
-
58
- net, feature_utils, seq_cfg = get_model()
59
-
60
-
61
- @spaces.GPU(duration=120)
62
- @torch.inference_mode()
63
- def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
64
- cfg_strength: float, duration: float):
65
-
66
- rng = torch.Generator(device=device)
67
- if seed >= 0:
68
- rng.manual_seed(seed)
69
- else:
70
- rng.seed()
71
- fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
72
-
73
- video_info = load_video(video, duration)
74
- clip_frames = video_info.clip_frames
75
- sync_frames = video_info.sync_frames
76
- duration = video_info.duration_sec
77
- clip_frames = clip_frames.unsqueeze(0)
78
- sync_frames = sync_frames.unsqueeze(0)
79
- seq_cfg.duration = duration
80
- net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
81
-
82
- audios = generate(clip_frames,
83
- sync_frames, [prompt],
84
- negative_text=[negative_prompt],
85
- feature_utils=feature_utils,
86
- net=net,
87
- fm=fm,
88
- rng=rng,
89
- cfg_strength=cfg_strength)
90
- audio = audios.float().cpu()[0]
91
-
92
- # current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
93
- video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
94
- # output_dir.mkdir(exist_ok=True, parents=True)
95
- # video_save_path = output_dir / f'{current_time_string}.mp4'
96
- make_video(video_info, video_save_path, audio, sampling_rate=seq_cfg.sampling_rate)
97
- log.info(f'Saved video to {video_save_path}')
98
- return video_save_path
99
-
100
-
101
- @spaces.GPU(duration=120)
102
- @torch.inference_mode()
103
- def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int, cfg_strength: float,
104
- duration: float):
105
-
106
- rng = torch.Generator(device=device)
107
- if seed >= 0:
108
- rng.manual_seed(seed)
109
- else:
110
- rng.seed()
111
- fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
112
-
113
- clip_frames = sync_frames = None
114
- seq_cfg.duration = duration
115
- net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
116
-
117
- audios = generate(clip_frames,
118
- sync_frames, [prompt],
119
- negative_text=[negative_prompt],
120
- feature_utils=feature_utils,
121
- net=net,
122
- fm=fm,
123
- rng=rng,
124
- cfg_strength=cfg_strength)
125
- audio = audios.float().cpu()[0]
126
-
127
- audio_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.flac').name
128
- torchaudio.save(audio_save_path, audio, seq_cfg.sampling_rate)
129
- log.info(f'Saved audio to {audio_save_path}')
130
- return audio_save_path
131
 
132
 
133
  video_to_audio_tab = gr.Interface(
134
  fn=video_to_audio,
 
135
  description="""
136
- Project page: <a href="https://hkchengrex.com/MMAudio/">https://hkchengrex.com/MMAudio/</a><br>
137
- Code: <a href="https://github.com/hkchengrex/MMAudio">https://github.com/hkchengrex/MMAudio</a><br>
138
 
139
  NOTE: It takes longer to process high-resolution videos (>384 px on the shorter side).
140
  Doing so does not improve results.
141
 
142
- The model has been trained on 8-second videos. Using much longer or shorter videos will degrade performance. Around 5s~12s should be fine.
 
143
  """,
144
  inputs=[
145
  gr.Video(),
146
  gr.Text(label='Prompt'),
147
- gr.Text(label='Negative prompt', value='music'),
148
- gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
149
- gr.Number(label='Num steps', value=25, precision=0, minimum=1),
150
- gr.Number(label='Guidance Strength', value=4.5, minimum=1),
151
- gr.Number(label='Duration (sec)', value=8, minimum=1),
152
- ],
153
- outputs='playable_video',
154
- cache_examples=False,
155
- title='MMAudio — Video-to-Audio Synthesis',
156
- examples=[
157
- [
158
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_beach.mp4',
159
- 'waves, seagulls',
160
- '',
161
- 0,
162
- 25,
163
- 4.5,
164
- 10,
165
- ],
166
- [
167
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_serpent.mp4',
168
- '',
169
- 'music',
170
- 0,
171
- 25,
172
- 4.5,
173
- 10,
174
- ],
175
- [
176
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_seahorse.mp4',
177
- 'bubbles',
178
- '',
179
- 0,
180
- 25,
181
- 4.5,
182
- 10,
183
- ],
184
- [
185
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_india.mp4',
186
- 'Indian holy music',
187
- '',
188
- 0,
189
- 25,
190
- 4.5,
191
- 10,
192
- ],
193
- [
194
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_galloping.mp4',
195
- 'galloping',
196
- '',
197
- 0,
198
- 25,
199
- 4.5,
200
- 10,
201
- ],
202
- [
203
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_kraken.mp4',
204
- 'waves, storm',
205
- '',
206
- 0,
207
- 25,
208
- 4.5,
209
- 10,
210
- ],
211
- [
212
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_nyc.mp4',
213
- '',
214
- '',
215
- 0,
216
- 25,
217
- 4.5,
218
- 10,
219
- ],
220
- [
221
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/mochi_storm.mp4',
222
- 'storm',
223
- '',
224
- 0,
225
- 25,
226
- 4.5,
227
- 10,
228
- ],
229
- [
230
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_spring.mp4',
231
- '',
232
- '',
233
- 0,
234
- 25,
235
- 4.5,
236
- 10,
237
- ],
238
- [
239
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_typing.mp4',
240
- 'typing',
241
- '',
242
- 0,
243
- 25,
244
- 4.5,
245
- 10,
246
- ],
247
- [
248
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_wake_up.mp4',
249
- '',
250
- '',
251
- 0,
252
- 25,
253
- 4.5,
254
- 10,
255
- ],
256
- ])
257
 
258
- text_to_audio_tab = gr.Interface(
259
- fn=text_to_audio,
260
- inputs=[
261
- gr.Text(label='Prompt'),
262
- gr.Text(label='Negative prompt'),
263
- gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
264
- gr.Number(label='Num steps', value=25, precision=0, minimum=1),
265
- gr.Number(label='Guidance Strength', value=4.5, minimum=1),
266
- gr.Number(label='Duration (sec)', value=8, minimum=1),
267
  ],
268
- outputs='audio',
269
  cache_examples=False,
270
- title='MMAudioText-to-Audio Synthesis',
271
  )
272
 
 
273
  if __name__ == "__main__":
274
- gr.TabbedInterface([video_to_audio_tab, text_to_audio_tab],
275
- ['Video-to-Audio', 'Text-to-Audio']).launch(allowed_paths=[output_dir])
1
  import os
2
+ import sys
3
+ import time
4
+ import gradio as gr
5
+ import subprocess
6
+ from pathlib import Path
7
+ import requests
8
+ from moviepy.editor import AudioFileClip, VideoFileClip
9
+
10
+ project_root = os.path.dirname(os.path.abspath(__file__))
11
+ mmaudio_path = os.path.join(project_root, 'third_party', 'MMAudio')
12
+ sys.path.append(mmaudio_path)
13
+
14
+ from pipeline.pipeline import Pipeline
15
+ from third_party.MMAudio.mmaudio.eval_utils import setup_eval_logging
16
+
17
+ # # download model
18
+ # os.makedirs("pretrained/mllm", exist_ok=True)
19
+ # from huggingface_hub import snapshot_download
20
+ # repo_local_path = snapshot_download(repo_id="lym0302/VideoLLaMA2.1-7B-AV-CoT", cache_dir='pretrained/mllm')
21
+
22
+ # remove_vo_model_dir = "pretrained/remove_vo/checkpoints"
23
+ # os.makedirs(remove_vo_model_dir, exist_ok=True)
24
+ # urls = ["https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/model_bs_roformer_ep_317_sdr_12.9755.ckpt",
25
+ # "https://raw.githubusercontent.com/ZFTurbo/Music-Source-Separation-Training/main/configs/viperx/model_bs_roformer_ep_317_sdr_12.9755.yaml"]
26
+ # for url in urls:
27
+ # file_name = url.split("/")[-1] # Extract file name from URL
28
+ # file_path = os.path.join(remove_vo_model_dir, file_name)
29
+ # response = requests.get(url, stream=True)
30
+ # if response.status_code == 200:
31
+ # with open(file_path, "wb") as f:
32
+ # for chunk in response.iter_content(chunk_size=8192): # Use a chunk size of 8 KB
33
+ # f.write(chunk)
34
+ # print(f"File downloaded successfully and saved to {file_path}")
35
+ # else:
36
+ # print(f"Failed to download the file. Status code: {response.status_code}")
37
+
38
+ # os.makedirs("pretrained/v2a/mmaudio", exist_ok=True)
39
 
40
 
41
  setup_eval_logging()
42
+ pipeline = Pipeline(
43
+ step0_model_dir='pretrained/mllm/models--lym0302--VideoLLaMA2.1-7B-AV-CoT',
44
+ step1_mode='mmaudio_medium_44k',
45
+ step2_model_dir='pretrained/mllm/models--lym0302--VideoLLaMA2.1-7B-AV-CoT',
46
+ step2_mode='cot',
47
+ step3_mode='bs_roformer',
48
+ )
49
 
50
+ output_dir = "output_gradio"
51
+ os.makedirs(output_dir, exist_ok=True)
52
+ skip_final_video = False
53
+ def video_to_audio(
54
+ video_input: gr.Video,
55
+ prompt: str='',
56
+ negative_prompt: str='',
57
+ mode: str='s4',
58
+ postp_mode: str='neg',
59
+ duration: float=10,
60
+ seed: int=42,):
61
+
62
+ log_messages = [] # store the log messages
63
+ def log_info(msg):
64
+ log_messages.append(msg)
65
+ return "\n".join(log_messages) # return the full log history each time
66
+
67
+ if not video_input:
68
+ yield None, log_info("Error: No video input provided.")
69
+ return
70
+
71
+ yield None, log_info("Generate high-quality audio from video step-by-step...") # initial log message
72
+
73
+ st_infer = time.time()
74
+ video_input = str(video_input)
75
+
76
+ for step_results in pipeline.run_for_gradio(
77
+ video_input=video_input,
78
+ output_dir=output_dir,
79
+ mode=mode,
80
+ postp_mode=postp_mode,
81
+ prompt=prompt,
82
+ negative_prompt=negative_prompt,
83
+ duration=duration,
84
+ seed=seed
85
+ ):
86
+ if step_results['log'] == 'Finish step-by-step v2a.':
87
+ break
88
+ else:
89
+ yield None, log_info(step_results['log'])
90
+
91
+
92
+ temp_final_audio_path = step_results["temp_final_audio_path"]
93
+ temp_final_video_path = step_results["temp_final_video_path"]
94
+
95
+ video_name_stem = Path(video_input).stem
96
+ final_audio_path = str(Path(output_dir) / f'{video_name_stem}.wav')
97
+ final_video_path = str(Path(output_dir) / f'{video_name_stem}.mp4')
98
+
99
+ if temp_final_audio_path is not None:
100
+ subprocess.run(['cp', str(temp_final_audio_path), final_audio_path], check=True)
101
+ step_results["final_audio_path"] = final_audio_path
102
+
103
+ if skip_final_video:
104
+ step_results["final_video_path"] = None
105
+ else:
106
+ if temp_final_video_path is not None:
107
+ subprocess.run(['cp', str(temp_final_video_path), final_video_path], check=True)
108
+ else:
109
+ audio = AudioFileClip(final_audio_path)
110
+ video = VideoFileClip(video_input)
111
+ duration = min(audio.duration, video.duration)
112
+ audio = audio.subclip(0, duration)
113
+ video.audio = audio
114
+ video = video.subclip(0, duration)
115
+ video.write_videofile(final_video_path)
116
+ step_results["final_video_path"] = final_video_path
117
+
118
+ et_infer = time.time()
119
+ print(f"Inference time: {et_infer - st_infer:.2f} s.")
120
+ print("step_results: ", step_results)
121
+
122
+ yield (final_video_path if os.path.exists(final_video_path) else None), log_info(step_results['log'])
123
 
124
 
125
  video_to_audio_tab = gr.Interface(
126
  fn=video_to_audio,
127
+ # Project page: <a href="https://hkchengrex.com/MMAudio/">https://hkchengrex.com/MMAudio/</a><br>
128
  description="""
129
+ Code: <a href="https://github.com/lym0302/DeepSound-V1">https://github.com/lym0302/DeepSound-V1</a><br>
 
130
 
131
  NOTE: It takes longer to process high-resolution videos (>384 px on the shorter side).
132
  Doing so does not improve results.
133
 
134
+ This is a step-by-step v2a process and may take a long time.
135
+ If Post Processing is set to 'rm', the generated video may be None.
136
  """,
137
  inputs=[
138
  gr.Video(),
139
  gr.Text(label='Prompt'),
140
+ gr.Text(label='Negative prompt', value=''),
141
+ gr.Radio(["s3", "s4"], label="Mode", value="s4"),
142
+ gr.Radio(["rm", "rep", "neg"], label="Post Processing", value="neg"),
143
+ gr.Number(label='Duration (sec)', value=10, minimum=1),
144
+ gr.Number(label='Seed (42: random)', value=42, precision=0, minimum=-1),
145
 
146
  ],
147
+ outputs=[gr.Video(label="Generated Video"), gr.Text(label="Logs"),],
148
  cache_examples=False,
149
+ title='DeepSound-V1 Video-to-Audio Synthesis',
150
  )
151
 
152
+
153
  if __name__ == "__main__":
154
+ gr.TabbedInterface([video_to_audio_tab],
155
+ ['Video-to-Audio']).launch(allowed_paths=[output_dir])
156
+
157
+
158
+ # if __name__ == "__main__":
159
+ # port = 8000
160
+ # gr.TabbedInterface([video_to_audio_tab, ],
161
+ # ['Video-to-Audio', ]).launch(
162
+ # server_port=port, allowed_paths=[output_dir])
demo.py DELETED
@@ -1,135 +0,0 @@
1
- import logging
2
- from argparse import ArgumentParser
3
- from pathlib import Path
4
-
5
- import torch
6
- import torchaudio
7
-
8
- from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate, load_video, make_video,
9
- setup_eval_logging)
10
- from mmaudio.model.flow_matching import FlowMatching
11
- from mmaudio.model.networks import MMAudio, get_my_mmaudio
12
- from mmaudio.model.utils.features_utils import FeaturesUtils
13
-
14
- torch.backends.cuda.matmul.allow_tf32 = True
15
- torch.backends.cudnn.allow_tf32 = True
16
-
17
- log = logging.getLogger()
18
-
19
-
20
- @torch.inference_mode()
21
- def main():
22
- setup_eval_logging()
23
-
24
- parser = ArgumentParser()
25
- parser.add_argument('--variant',
26
- type=str,
27
- default='large_44k_v2',
28
- help='small_16k, small_44k, medium_44k, large_44k, large_44k_v2')
29
- parser.add_argument('--video', type=Path, help='Path to the video file')
30
- parser.add_argument('--prompt', type=str, help='Input prompt', default='')
31
- parser.add_argument('--negative_prompt', type=str, help='Negative prompt', default='')
32
- parser.add_argument('--duration', type=float, default=8.0)
33
- parser.add_argument('--cfg_strength', type=float, default=4.5)
34
- parser.add_argument('--num_steps', type=int, default=25)
35
-
36
- parser.add_argument('--mask_away_clip', action='store_true')
37
-
38
- parser.add_argument('--output', type=Path, help='Output directory', default='./output')
39
- parser.add_argument('--seed', type=int, help='Random seed', default=42)
40
- parser.add_argument('--skip_video_composite', action='store_true')
41
- parser.add_argument('--full_precision', action='store_true')
42
-
43
- args = parser.parse_args()
44
-
45
- if args.variant not in all_model_cfg:
46
- raise ValueError(f'Unknown model variant: {args.variant}')
47
- model: ModelConfig = all_model_cfg[args.variant]
48
- model.download_if_needed()
49
- seq_cfg = model.seq_cfg
50
-
51
- if args.video:
52
- video_path: Path = Path(args.video).expanduser()
53
- else:
54
- video_path = None
55
- prompt: str = args.prompt
56
- negative_prompt: str = args.negative_prompt
57
- output_dir: str = args.output.expanduser()
58
- seed: int = args.seed
59
- num_steps: int = args.num_steps
60
- duration: float = args.duration
61
- cfg_strength: float = args.cfg_strength
62
- skip_video_composite: bool = args.skip_video_composite
63
- mask_away_clip: bool = args.mask_away_clip
64
-
65
- device = 'cuda'
66
- dtype = torch.float32 if args.full_precision else torch.bfloat16
67
-
68
- output_dir.mkdir(parents=True, exist_ok=True)
69
-
70
- # load a pretrained model
71
- net: MMAudio = get_my_mmaudio(model.model_name).to(device, dtype).eval()
72
- net.load_weights(torch.load(model.model_path, map_location=device, weights_only=True))
73
- log.info(f'Loaded weights from {model.model_path}')
74
-
75
- # misc setup
76
- rng = torch.Generator(device=device)
77
- rng.manual_seed(seed)
78
- fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
79
-
80
- feature_utils = FeaturesUtils(tod_vae_ckpt=model.vae_path,
81
- synchformer_ckpt=model.synchformer_ckpt,
82
- enable_conditions=True,
83
- mode=model.mode,
84
- bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
85
- need_vae_encoder=False)
86
- feature_utils = feature_utils.to(device, dtype).eval()
87
-
88
- if video_path is not None:
89
- log.info(f'Using video {video_path}')
90
- video_info = load_video(video_path, duration)
91
- clip_frames = video_info.clip_frames
92
- sync_frames = video_info.sync_frames
93
- duration = video_info.duration_sec
94
- if mask_away_clip:
95
- clip_frames = None
96
- else:
97
- clip_frames = clip_frames.unsqueeze(0)
98
- sync_frames = sync_frames.unsqueeze(0)
99
- else:
100
- log.info('No video provided -- text-to-audio mode')
101
- clip_frames = sync_frames = None
102
-
103
- seq_cfg.duration = duration
104
- net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
105
-
106
- log.info(f'Prompt: {prompt}')
107
- log.info(f'Negative prompt: {negative_prompt}')
108
-
109
- audios = generate(clip_frames,
110
- sync_frames, [prompt],
111
- negative_text=[negative_prompt],
112
- feature_utils=feature_utils,
113
- net=net,
114
- fm=fm,
115
- rng=rng,
116
- cfg_strength=cfg_strength)
117
- audio = audios.float().cpu()[0]
118
- if video_path is not None:
119
- save_path = output_dir / f'{video_path.stem}.flac'
120
- else:
121
- safe_filename = prompt.replace(' ', '_').replace('/', '_').replace('.', '')
122
- save_path = output_dir / f'{safe_filename}.flac'
123
- torchaudio.save(save_path, audio, seq_cfg.sampling_rate)
124
-
125
- log.info(f'Audio saved to {save_path}')
126
- if video_path is not None and not skip_video_composite:
127
- video_save_path = output_dir / f'{video_path.stem}.mp4'
128
- make_video(video_info, video_save_path, audio, sampling_rate=seq_cfg.sampling_rate)
129
- log.info(f'Video saved to {output_dir / video_save_path}')
130
-
131
- log.info('Memory usage: %.2f GB', torch.cuda.max_memory_allocated() / (2**30))
132
-
133
-
134
- if __name__ == '__main__':
135
- main()
docs/images/icon.png DELETED
Binary file (163 Bytes)
 
docs/index.html DELETED
@@ -1,147 +0,0 @@
1
- <!DOCTYPE html>
2
- <html lang="en">
3
- <head>
4
- <!-- Google tag (gtag.js) -->
5
- <script async src="https://www.googletagmanager.com/gtag/js?id=G-0JKBJ3WRJZ"></script>
6
- <script>
7
- window.dataLayer = window.dataLayer || [];
8
- function gtag(){dataLayer.push(arguments);}
9
- gtag('js', new Date());
10
- gtag('config', 'G-0JKBJ3WRJZ');
11
- </script>
12
-
13
- <link rel="preconnect" href="https://fonts.googleapis.com">
14
- <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
15
- <link href="https://fonts.googleapis.com/css2?family=Source+Sans+3&display=swap" rel="stylesheet">
16
- <meta charset="UTF-8">
17
- <title>MMAudio</title>
18
-
19
- <link rel="icon" type="image/png" href="images/icon.png">
20
-
21
- <meta name="viewport" content="width=device-width, initial-scale=1">
22
- <!-- CSS only -->
23
- <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet"
24
- integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
25
- <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
26
-
27
- <link rel="stylesheet" href="style.css">
28
- </head>
29
- <body>
30
-
31
- <body>
32
- <br><br><br><br>
33
- <div class="container">
34
- <div class="row text-center" style="font-size:38px">
35
- <div class="col strong">
36
- Taming Multimodal Joint Training for High-Quality <br>Video-to-Audio Synthesis
37
- </div>
38
- </div>
39
-
40
- <br>
41
- <div class="row text-center" style="font-size:28px">
42
- <div class="col">
43
- arXiv 2024
44
- </div>
45
- </div>
46
- <br>
47
-
48
- <div class="h-100 row text-center heavy justify-content-md-center" style="font-size:22px;">
49
- <div class="col-sm-auto px-lg-2">
50
- <a href="https://hkchengrex.github.io/">Ho Kei Cheng<sup>1</sup></a>
51
- </div>
52
- <div class="col-sm-auto px-lg-2">
53
- <nobr><a href="https://scholar.google.co.jp/citations?user=RRIO1CcAAAAJ">Masato Ishii<sup>2</sup></a></nobr>
54
- </div>
55
- <div class="col-sm-auto px-lg-2">
56
- <nobr><a href="https://scholar.google.com/citations?user=sXAjHFIAAAAJ">Akio Hayakawa<sup>2</sup></a></nobr>
57
- </div>
58
- <div class="col-sm-auto px-lg-2">
59
- <nobr><a href="https://scholar.google.com/citations?user=XCRO260AAAAJ">Takashi Shibuya<sup>2</sup></a></nobr>
60
- </div>
61
- <div class="col-sm-auto px-lg-2">
62
- <nobr><a href="https://www.alexander-schwing.de/">Alexander Schwing<sup>1</sup></a></nobr>
63
- </div>
64
- <div class="col-sm-auto px-lg-2" >
65
- <nobr><a href="https://www.yukimitsufuji.com/">Yuki Mitsufuji<sup>2,3</sup></a></nobr>
66
- </div>
67
- </div>
68
-
69
- <div class="h-100 row text-center heavy justify-content-md-center" style="font-size:22px;">
70
- <div class="col-sm-auto px-lg-2">
71
- <sup>1</sup>University of Illinois Urbana-Champaign
72
- </div>
73
- <div class="col-sm-auto px-lg-2">
74
- <sup>2</sup>Sony AI
75
- </div>
76
- <div class="col-sm-auto px-lg-2">
77
- <sup>3</sup>Sony Group Corporation
78
- </div>
79
- </div>
80
-
81
- <br>
82
-
83
- <br>
84
-
85
- <div class="h-100 row text-center justify-content-md-center" style="font-size:20px;">
86
- <!-- <div class="col-sm-2">
87
- <a href="https://arxiv.org/abs/2310.12982">[arXiv]</a>
88
- </div> -->
89
- <div class="col-sm-3">
90
- <a href="">[Paper (being prepared)]</a>
91
- </div>
92
- <div class="col-sm-3">
93
- <a href="https://github.com/hkchengrex/MMAudio">[Code]</a>
94
- </div>
95
- <!-- <div class="col-sm-2">
96
- <a
97
- href="https://colab.research.google.com/drive/1yo43XTbjxuWA7XgCUO9qxAi7wBI6HzvP?usp=sharing">[Colab]</a>
98
- </div> -->
99
- </div>
100
-
101
- <br>
102
-
103
- <hr>
104
-
105
- <div class="row" style="font-size:32px">
106
- <div class="col strong">
107
- TL;DR
108
- </div>
109
- </div>
110
- <br>
111
- <div class="row">
112
- <div class="col">
113
- <p class="light" style="text-align: left;">
114
- MMAudio generates synchronized audio given video and/or text inputs.
115
- </p>
116
- </div>
117
- </div>
118
-
119
- <br>
120
- <hr>
121
- <br>
122
-
123
- <div class="row" style="font-size:32px">
124
- <div class="col strong">
125
- Demo
126
- </div>
127
- </div>
128
- <br>
129
- <div class="row" style="font-size:48px">
130
- <div class="col strong text-center">
131
- <a href="video_main.html" style="text-decoration: underline;">&lt;More results&gt;</a>
132
- </div>
133
- </div>
134
- <br>
135
- <div class="video-container" style="text-align: center;">
136
- <iframe src="https://youtube.com/embed/YElewUT2M4M"></iframe>
137
- </div>
138
-
139
- <br>
140
-
141
- <br><br>
142
- <br><br>
143
-
144
- </div>
145
-
146
- </body>
147
- </html>
docs/style.css DELETED
@@ -1,78 +0,0 @@
1
- body {
2
- font-family: 'Source Sans 3', sans-serif;
3
- font-size: 18px;
4
- margin-left: auto;
5
- margin-right: auto;
6
- font-weight: 400;
7
- height: 100%;
8
- max-width: 1000px;
9
- }
10
-
11
- table {
12
- width: 100%;
13
- border-collapse: collapse;
14
- }
15
- th, td {
16
- border: 1px solid #ddd;
17
- padding: 8px;
18
- text-align: center;
19
- }
20
- th {
21
- background-color: #f2f2f2;
22
- }
23
- video {
24
- width: 100%;
25
- height: auto;
26
- }
27
- p {
28
- font-size: 28px;
29
- }
30
- h2 {
31
- font-size: 36px;
32
- }
33
-
34
- .strong {
35
- font-weight: 700;
36
- }
37
-
38
- .light {
39
- font-weight: 100;
40
- }
41
-
42
- .heavy {
43
- font-weight: 900;
44
- }
45
-
46
- .column {
47
- float: left;
48
- }
49
-
50
- a:link,
51
- a:visited {
52
- color: #05538f;
53
- text-decoration: none;
54
- }
55
-
56
- a:hover {
57
- color: #63cbdd;
58
- }
59
-
60
- hr {
61
- border: 0;
62
- height: 1px;
63
- background-image: linear-gradient(to right, rgba(0, 0, 0, 0), rgba(0, 0, 0, 0.75), rgba(0, 0, 0, 0));
64
- }
65
-
66
- .video-container {
67
- position: relative;
68
- padding-bottom: 56.25%; /* 16:9 */
69
- height: 0;
70
- }
71
-
72
- .video-container iframe {
73
- position: absolute;
74
- top: 0;
75
- left: 0;
76
- width: 100%;
77
- height: 100%;
78
- }
docs/style_videos.css DELETED
@@ -1,52 +0,0 @@
1
- body {
2
- font-family: 'Source Sans 3', sans-serif;
3
- font-size: 1.5vh;
4
- font-weight: 400;
5
- }
6
-
7
- table {
8
- width: 100%;
9
- border-collapse: collapse;
10
- }
11
- th, td {
12
- border: 1px solid #ddd;
13
- padding: 8px;
14
- text-align: center;
15
- }
16
- th {
17
- background-color: #f2f2f2;
18
- }
19
- video {
20
- width: 100%;
21
- height: auto;
22
- }
23
- p {
24
- font-size: 1.5vh;
25
- font-weight: bold;
26
- }
27
- h2 {
28
- font-size: 2vh;
29
- font-weight: bold;
30
- }
31
-
32
- .video-container {
33
- position: relative;
34
- padding-bottom: 56.25%; /* 16:9 */
35
- height: 0;
36
- }
37
-
38
- .video-container iframe {
39
- position: absolute;
40
- top: 0;
41
- left: 0;
42
- width: 100%;
43
- height: 100%;
44
- }
45
-
46
- .video-header {
47
- background-color: #f2f2f2;
48
- text-align: center;
49
- font-size: 1.5vh;
50
- font-weight: bold;
51
- padding: 8px;
52
- }
docs/video_gen.html DELETED
@@ -1,254 +0,0 @@
1
- <!DOCTYPE html>
2
- <html lang="en">
3
- <head>
4
- <!-- Google tag (gtag.js) -->
5
- <script async src="https://www.googletagmanager.com/gtag/js?id=G-0JKBJ3WRJZ"></script>
6
- <script>
7
- window.dataLayer = window.dataLayer || [];
8
- function gtag(){dataLayer.push(arguments);}
9
- gtag('js', new Date());
10
- gtag('config', 'G-0JKBJ3WRJZ');
11
- </script>
12
-
13
- <link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
14
- <meta charset="UTF-8">
15
- <title>MMAudio</title>
16
-
17
- <link rel="icon" type="image/png" href="images/icon.png">
18
-
19
- <meta name="viewport" content="width=device-width, initial-scale=1">
20
- <!-- CSS only -->
21
- <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet"
22
- integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
23
- <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.7.1/jquery.min.js"></script>
24
-
25
- <link rel="stylesheet" href="style_videos.css">
26
- </head>
27
- <body>
28
-
29
- <div id="moviegen_all">
30
- <h2 id="moviegen" style="text-align: center;">Comparisons with Movie Gen Audio on Videos Generated by MovieGen</h2>
31
- <p id="moviegen1" style="overflow: hidden;">
32
- Example 1: Ice cracking with sharp snapping sound, and metal tool scraping against the ice surface.
33
- <span style="float: right;"><a href="#index">Back to index</a></span>
34
- </p>
35
-
36
- <div class="row g-1">
37
- <div class="col-sm-6">
38
- <div class="video-header">Movie Gen Audio</div>
39
- <div class="video-container">
40
- <iframe src="https://youtube.com/embed/d7Lb0ihtGcE"></iframe>
41
- </div>
42
- </div>
43
- <div class="col-sm-6">
44
- <div class="video-header">Ours</div>
45
- <div class="video-container">
46
- <iframe src="https://youtube.com/embed/F4JoJ2r2m8U"></iframe>
47
- </div>
48
- </div>
49
- </div>
50
- <br>
51
-
52
- <!-- <p id="moviegen2">Example 2: Rhythmic splashing and lapping of water. <span style="float:right;"><a href="#index">Back to index</a></span> </p>
53
-
54
- <table>
55
- <thead>
56
- <tr>
57
- <th>Movie Gen Audio</th>
58
- <th>Ours</th>
59
- </tr>
60
- </thead>
61
- <tbody>
62
- <tr>
63
- <td width="50%">
64
- <div class="video-container">
65
- <iframe src="https://youtube.com/embed/5gQNPK99CIk"></iframe>
66
- </div>
67
- </td>
68
- <td width="50%">
69
- <div class="video-container">
70
- <iframe src="https://youtube.com/embed/AbwnTzG-BpA"></iframe>
71
- </div>
72
- </td>
73
- </tr>
74
- </tbody>
75
- </table> -->
76
-
77
- <p id="moviegen2" style="overflow: hidden;">
78
- Example 2: Rhythmic splashing and lapping of water.
79
- <span style="float:right;"><a href="#index">Back to index</a></span>
80
- </p>
81
- <div class="row g-1">
82
- <div class="col-sm-6">
83
- <div class="video-header">Movie Gen Audio</div>
84
- <div class="video-container">
85
- <iframe src="https://youtube.com/embed/5gQNPK99CIk"></iframe>
86
- </div>
87
- </div>
88
- <div class="col-sm-6">
89
- <div class="video-header">Ours</div>
90
- <div class="video-container">
91
- <iframe src="https://youtube.com/embed/AbwnTzG-BpA"></iframe>
92
- </div>
93
- </div>
94
- </div>
95
- <br>
96
-
97
- <p id="moviegen3" style="overflow: hidden;">
98
- Example 3: Shovel scrapes against dry earth.
99
- <span style="float:right;"><a href="#index">Back to index</a></span>
100
- </p>
101
- <div class="row g-1">
102
- <div class="col-sm-6">
103
- <div class="video-header">Movie Gen Audio</div>
104
- <div class="video-container">
105
- <iframe src="https://youtube.com/embed/PUKGyEve7XQ"></iframe>
106
- </div>
107
- </div>
108
- <div class="col-sm-6">
109
- <div class="video-header">Ours</div>
110
- <div class="video-container">
111
- <iframe src="https://youtube.com/embed/CNn7i8VNkdc"></iframe>
112
- </div>
113
- </div>
114
- </div>
115
- <br>
116
-
117
-
118
- <p id="moviegen4" style="overflow: hidden;">
119
- (Failure case) Example 4: Creamy sound of mashed potatoes being scooped.
120
- <span style="float:right;"><a href="#index">Back to index</a></span>
121
- </p>
122
- <div class="row g-1">
123
- <div class="col-sm-6">
124
- <div class="video-header">Movie Gen Audio</div>
125
- <div class="video-container">
126
- <iframe src="https://youtube.com/embed/PJv1zxR9JjQ"></iframe>
127
- </div>
128
- </div>
129
- <div class="col-sm-6">
130
- <div class="video-header">Ours</div>
131
- <div class="video-container">
132
- <iframe src="https://youtube.com/embed/c3-LJ1lNsPQ"></iframe>
133
- </div>
134
- </div>
135
- </div>
136
- <br>
137
-
138
- </div>
139
-
140
- <div id="hunyuan_sora_all">
141
-
142
- <h2 id="hunyuan" style="text-align: center;">Results on Videos Generated by Hunyuan</h2>
143
- <p style="overflow: hidden;">
144
- <span style="float:right;"><a href="#index">Back to index</a></span>
145
- </p>
146
- <div class="row g-1">
147
- <div class="col-sm-6">
148
- <div class="video-header">Typing</div>
149
- <div class="video-container">
150
- <iframe src="https://youtube.com/embed/8ln_9hhH_nk"></iframe>
151
- </div>
152
- </div>
153
- <div class="col-sm-6">
154
- <div class="video-header">Water is rushing down a stream and pouring</div>
155
- <div class="video-container">
156
- <iframe src="https://youtube.com/embed/5df1FZFQj30"></iframe>
157
- </div>
158
- </div>
159
- </div>
160
- <div class="row g-1">
161
- <div class="col-sm-6">
162
- <div class="video-header">Waves on beach</div>
163
- <div class="video-container">
164
- <iframe src="https://youtube.com/embed/7wQ9D5WgpFc"></iframe>
165
- </div>
166
- </div>
167
- <div class="col-sm-6">
168
- <div class="video-header">Water droplet</div>
169
- <div class="video-container">
170
- <iframe src="https://youtube.com/embed/q7M2nsalGjM"></iframe>
171
- </div>
172
- </div>
173
- </div>
174
- <br>
175
-
176
- <h2 id="sora" style="text-align: center;">Results on Videos Generated by Sora</h2>
177
- <p style="overflow: hidden;">
178
- <span style="float:right;"><a href="#index">Back to index</a></span>
179
- </p>
180
- <div class="row g-1">
181
- <div class="col-sm-6">
182
- <div class="video-header">Ships riding waves</div>
183
- <div class="video-container">
184
- <iframe src="https://youtube.com/embed/JbgQzHHytk8"></iframe>
185
- </div>
186
- </div>
187
- <div class="col-sm-6">
188
- <div class="video-header">Train (no text prompt given)</div>
189
- <div class="video-container">
190
- <iframe src="https://youtube.com/embed/xOW7zrjpWC8"></iframe>
191
- </div>
192
- </div>
193
- </div>
194
- <div class="row g-1">
195
- <div class="col-sm-6">
196
- <div class="video-header">Seashore (no text prompt given)</div>
197
- <div class="video-container">
198
- <iframe src="https://youtube.com/embed/fIuw5Y8ZZ9E"></iframe>
199
- </div>
200
- </div>
201
- <div class="col-sm-6">
202
- <div class="video-header">Surfing (failure: unprompted music)</div>
203
- <div class="video-container">
204
- <iframe src="https://youtube.com/embed/UcSTk-v0M_s"></iframe>
205
- </div>
206
- </div>
207
- </div>
208
- <br>
209
-
210
- <div id="mochi_ltx_all">
211
- <h2 id="mochi" style="text-align: center;">Results on Videos Generated by Mochi 1</h2>
212
- <p style="overflow: hidden;">
213
- <span style="float:right;"><a href="#index">Back to index</a></span>
214
- </p>
215
- <div class="row g-1">
216
- <div class="col-sm-6">
217
- <div class="video-header">Magical fire and lightning (no text prompt given)</div>
218
- <div class="video-container">
219
- <iframe src="https://youtube.com/embed/tTlRZaSMNwY"></iframe>
220
- </div>
221
- </div>
222
- <div class="col-sm-6">
223
- <div class="video-header">Storm (no text prompt given)</div>
224
- <div class="video-container">
225
- <iframe src="https://youtube.com/embed/4hrZTMJUy3w"></iframe>
226
- </div>
227
- </div>
228
- </div>
229
- <br>
230
-
231
- <h2 id="ltx" style="text-align: center;">Results on Videos Generated by LTX-Video</h2>
232
- <p style="overflow: hidden;">
233
- <span style="float:right;"><a href="#index">Back to index</a></span>
234
- </p>
235
- <div class="row g-1">
236
- <div class="col-sm-6">
237
- <div class="video-header">Firewood burning and cracking</div>
238
- <div class="video-container">
239
- <iframe src="https://youtube.com/embed/P7_DDpgev0g"></iframe>
240
- </div>
241
- </div>
242
- <div class="col-sm-6">
243
- <div class="video-header">Waterfall, water splashing</div>
244
- <div class="video-container">
245
- <iframe src="https://youtube.com/embed/4MvjceYnIO0"></iframe>
246
- </div>
247
- </div>
248
- </div>
249
- <br>
250
-
251
- </div>
252
-
253
- </body>
254
- </html>
docs/video_main.html DELETED
@@ -1,98 +0,0 @@
1
- <!DOCTYPE html>
2
- <html lang="en">
3
- <head>
4
- <!-- Google tag (gtag.js) -->
5
- <script async src="https://www.googletagmanager.com/gtag/js?id=G-0JKBJ3WRJZ"></script>
6
- <script>
7
- window.dataLayer = window.dataLayer || [];
8
- function gtag(){dataLayer.push(arguments);}
9
- gtag('js', new Date());
10
- gtag('config', 'G-0JKBJ3WRJZ');
11
- </script>
12
-
13
- <link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
14
- <meta charset="UTF-8">
15
- <title>MMAudio</title>
16
-
17
- <link rel="icon" type="image/png" href="images/icon.png">
18
-
19
- <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no">
20
- <!-- CSS only -->
21
- <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet"
22
- integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
23
- <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.7.1/jquery.min.js"></script>
24
-
25
- <link rel="stylesheet" href="style_videos.css">
26
-
27
- <script type="text/javascript">
28
- $(document).ready(function(){
29
- $("#content").load("video_gen.html #moviegen_all");
30
- $("#load_moveigen").click(function(){
31
- $("#content").load("video_gen.html #moviegen_all");
32
- });
33
- $("#load_hunyuan_sora").click(function(){
34
- $("#content").load("video_gen.html #hunyuan_sora_all");
35
- });
36
- $("#load_mochi_ltx").click(function(){
37
- $("#content").load("video_gen.html #mochi_ltx_all");
38
- });
39
- $("#load_vgg1").click(function(){
40
- $("#content").load("video_vgg.html #vgg1");
41
- });
42
- $("#load_vgg2").click(function(){
43
- $("#content").load("video_vgg.html #vgg2");
44
- });
45
- $("#load_vgg3").click(function(){
46
- $("#content").load("video_vgg.html #vgg3");
47
- });
48
- $("#load_vgg4").click(function(){
49
- $("#content").load("video_vgg.html #vgg4");
50
- });
51
- $("#load_vgg5").click(function(){
52
- $("#content").load("video_vgg.html #vgg5");
53
- });
54
- $("#load_vgg6").click(function(){
55
- $("#content").load("video_vgg.html #vgg6");
56
- });
57
- $("#load_vgg_extra").click(function(){
58
- $("#content").load("video_vgg.html #vgg_extra");
59
- });
60
- });
61
- </script>
62
- </head>
63
- <body>
64
- <h1 id="index" style="text-align: center;">Index</h1>
65
- <p><b>(Click on the links to load the corresponding videos)</b> <span style="float:right;"><a href="index.html">Back to project page</a></span></p>
66
-
67
- <ol>
68
- <li>
69
- <a href="#" id="load_moveigen">Comparisons with Movie Gen Audio on Videos Generated by MovieGen</a>
70
- </li>
71
- <li>
72
- <a href="#" id="load_hunyuan_sora">Results on Videos Generated by Hunyuan and Sora</a>
73
- </li>
74
- <li>
75
- <a href="#" id="load_mochi_ltx">Results on Videos Generated by Mochi 1 and LTX-Video</a>
76
- </li>
77
- <li>
78
- On VGGSound
79
- <ol>
80
- <li><a id='load_vgg1' href="#">Example 1: Wolf howling</a></li>
81
- <li><a id='load_vgg2' href="#">Example 2: Striking a golf ball</a></li>
82
- <li><a id='load_vgg3' href="#">Example 3: Hitting a drum</a></li>
83
- <li><a id='load_vgg4' href="#">Example 4: Dog barking</a></li>
84
- <li><a id='load_vgg5' href="#">Example 5: Playing a string instrument</a></li>
85
- <li><a id='load_vgg6' href="#">Example 6: A group of people playing tambourines</a></li>
86
- <li><a id='load_vgg_extra' href="#">Extra results & failure cases</a></li>
87
- </ol>
88
- </li>
89
- </ol>
90
-
91
- <div id="content" class="container-fluid">
92
-
93
- </div>
94
- <br>
95
- <br>
96
-
97
- </body>
98
- </html>
 
 
 
 
docs/video_vgg.html DELETED
@@ -1,452 +0,0 @@
1
- <!DOCTYPE html>
2
- <html lang="en">
3
- <head>
4
- <!-- Google tag (gtag.js) -->
5
- <script async src="https://www.googletagmanager.com/gtag/js?id=G-0JKBJ3WRJZ"></script>
6
- <script>
7
- window.dataLayer = window.dataLayer || [];
8
- function gtag(){dataLayer.push(arguments);}
9
- gtag('js', new Date());
10
- gtag('config', 'G-0JKBJ3WRJZ');
11
- </script>
12
-
13
- <link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
14
- <meta charset="UTF-8">
15
- <title>MMAudio</title>
16
-
17
- <meta name="viewport" content="width=device-width, initial-scale=1">
18
- <!-- CSS only -->
19
- <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet"
20
- integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
21
- <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
22
-
23
- <link rel="stylesheet" href="style_videos.css">
24
- </head>
25
- <body>
26
-
27
- <div id="vgg1">
28
- <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
29
- <p style="overflow: hidden;">
30
- Example 1: Wolf howling.
31
- <span style="float:right;"><a href="#index">Back to index</a></span>
32
- </p>
33
- <div class="row g-1">
34
- <div class="col-sm-3">
35
- <div class="video-header">Ground-truth</div>
36
- <div class="video-container">
37
- <iframe src="https://youtube.com/embed/9J_V74gqMUA"></iframe>
38
- </div>
39
- </div>
40
- <div class="col-sm-3">
41
- <div class="video-header">Ours</div>
42
- <div class="video-container">
43
- <iframe src="https://youtube.com/embed/P6O8IpjErPc"></iframe>
44
- </div>
45
- </div>
46
- <div class="col-sm-3">
47
- <div class="video-header">V2A-Mapper</div>
48
- <div class="video-container">
49
- <iframe src="https://youtube.com/embed/w-5eyqepvTk"></iframe>
50
- </div>
51
- </div>
52
- <div class="col-sm-3">
53
- <div class="video-header">FoleyCrafter</div>
54
- <div class="video-container">
55
- <iframe src="https://youtube.com/embed/VOLfoZlRkzo"></iframe>
56
- </div>
57
- </div>
58
- </div>
59
- <div class="row g-1">
60
- <div class="col-sm-3">
61
- <div class="video-header">Frieren</div>
62
- <div class="video-container">
63
- <iframe src="https://youtube.com/embed/49owKyA5Pa8"></iframe>
64
- </div>
65
- </div>
66
- <div class="col-sm-3">
67
- <div class="video-header">VATT</div>
68
- <div class="video-container">
69
- <iframe src="https://youtube.com/embed/QVtrFgbeGDM"></iframe>
70
- </div>
71
- </div>
72
- <div class="col-sm-3">
73
- <div class="video-header">V-AURA</div>
74
- <div class="video-container">
75
- <iframe src="https://youtube.com/embed/8r0uEfSNjvI"></iframe>
76
- </div>
77
- </div>
78
- <div class="col-sm-3">
79
- <div class="video-header">Seeing and Hearing</div>
80
- <div class="video-container">
81
- <iframe src="https://youtube.com/embed/bn-sLg2qulk"></iframe>
82
- </div>
83
- </div>
84
- </div>
85
- </div>
86
-
87
- <div id="vgg2">
88
- <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
89
- <p style="overflow: hidden;">
90
- Example 2: Striking a golf ball.
91
- <span style="float:right;"><a href="#index">Back to index</a></span>
92
- </p>
93
-
94
- <div class="row g-1">
95
- <div class="col-sm-3">
96
- <div class="video-header">Ground-truth</div>
97
- <div class="video-container">
98
- <iframe src="https://youtube.com/embed/1hwSu42kkho"></iframe>
99
- </div>
100
- </div>
101
- <div class="col-sm-3">
102
- <div class="video-header">Ours</div>
103
- <div class="video-container">
104
- <iframe src="https://youtube.com/embed/kZibDoDCNxI"></iframe>
105
- </div>
106
- </div>
107
- <div class="col-sm-3">
108
- <div class="video-header">V2A-Mapper</div>
109
- <div class="video-container">
110
- <iframe src="https://youtube.com/embed/jgKfLBLhh7Y"></iframe>
111
- </div>
112
- </div>
113
- <div class="col-sm-3">
114
- <div class="video-header">FoleyCrafter</div>
115
- <div class="video-container">
116
- <iframe src="https://youtube.com/embed/Lfsx8mOPcJo"></iframe>
117
- </div>
118
- </div>
119
- </div>
120
- <div class="row g-1">
121
- <div class="col-sm-3">
122
- <div class="video-header">Frieren</div>
123
- <div class="video-container">
124
- <iframe src="https://youtube.com/embed/tz-LpbB0MBc"></iframe>
125
- </div>
126
- </div>
127
- <div class="col-sm-3">
128
- <div class="video-header">VATT</div>
129
- <div class="video-container">
130
- <iframe src="https://youtube.com/embed/RTDUHMi08n4"></iframe>
131
- </div>
132
- </div>
133
- <div class="col-sm-3">
134
- <div class="video-header">V-AURA</div>
135
- <div class="video-container">
136
- <iframe src="https://youtube.com/embed/N-3TDOsPnZQ"></iframe>
137
- </div>
138
- </div>
139
- <div class="col-sm-3">
140
- <div class="video-header">Seeing and Hearing</div>
141
- <div class="video-container">
142
- <iframe src="https://youtube.com/embed/QnsHnLn4gB0"></iframe>
143
- </div>
144
- </div>
145
- </div>
146
- </div>
147
-
148
- <div id="vgg3">
149
- <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
150
- <p style="overflow: hidden;">
151
- Example 3: Hitting a drum.
152
- <span style="float:right;"><a href="#index">Back to index</a></span>
153
- </p>
154
-
155
- <div class="row g-1">
156
- <div class="col-sm-3">
157
- <div class="video-header">Ground-truth</div>
158
- <div class="video-container">
159
- <iframe src="https://youtube.com/embed/0oeIwq77w0Q"></iframe>
160
- </div>
161
- </div>
162
- <div class="col-sm-3">
163
- <div class="video-header">Ours</div>
164
- <div class="video-container">
165
- <iframe src="https://youtube.com/embed/-UtPV9ohuIM"></iframe>
166
- </div>
167
- </div>
168
- <div class="col-sm-3">
169
- <div class="video-header">V2A-Mapper</div>
170
- <div class="video-container">
171
- <iframe src="https://youtube.com/embed/9yivkgN-zwc"></iframe>
172
- </div>
173
- </div>
174
- <div class="col-sm-3">
175
- <div class="video-header">FoleyCrafter</div>
176
- <div class="video-container">
177
- <iframe src="https://youtube.com/embed/kkCsXPOlBvY"></iframe>
178
- </div>
179
- </div>
180
- </div>
181
- <div class="row g-1">
182
- <div class="col-sm-3">
183
- <div class="video-header">Frieren</div>
184
- <div class="video-container">
185
- <iframe src="https://youtube.com/embed/MbNKsVsuvig"></iframe>
186
- </div>
187
- </div>
188
- <div class="col-sm-3">
189
- <div class="video-header">VATT</div>
190
- <div class="video-container">
191
- <iframe src="https://youtube.com/embed/2yYviBjrpBw"></iframe>
192
- </div>
193
- </div>
194
- <div class="col-sm-3">
195
- <div class="video-header">V-AURA</div>
196
- <div class="video-container">
197
- <iframe src="https://youtube.com/embed/9yivkgN-zwc"></iframe>
198
- </div>
199
- </div>
200
- <div class="col-sm-3">
201
- <div class="video-header">Seeing and Hearing</div>
202
- <div class="video-container">
203
- <iframe src="https://youtube.com/embed/6dnyQt4Fuhs"></iframe>
204
- </div>
205
- </div>
206
- </div>
207
- </div>
208
- </div>
209
-
210
- <div id="vgg4">
211
- <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
212
- <p style="overflow: hidden;">
213
- Example 4: Dog barking.
214
- <span style="float:right;"><a href="#index">Back to index</a></span>
215
- </p>
216
-
217
- <div class="row g-1">
218
- <div class="col-sm-3">
219
- <div class="video-header">Ground-truth</div>
220
- <div class="video-container">
221
- <iframe src="https://youtube.com/embed/ckaqvTyMYAw"></iframe>
222
- </div>
223
- </div>
224
- <div class="col-sm-3">
225
- <div class="video-header">Ours</div>
226
- <div class="video-container">
227
- <iframe src="https://youtube.com/embed/_aRndFZzZ-I"></iframe>
228
- </div>
229
- </div>
230
- <div class="col-sm-3">
231
- <div class="video-header">V2A-Mapper</div>
232
- <div class="video-container">
233
- <iframe src="https://youtube.com/embed/mNCISP3LBl0"></iframe>
234
- </div>
235
- </div>
236
- <div class="col-sm-3">
237
- <div class="video-header">FoleyCrafter</div>
238
- <div class="video-container">
239
- <iframe src="https://youtube.com/embed/phZBQ3L7foE"></iframe>
240
- </div>
241
- </div>
242
- </div>
243
- <div class="row g-1">
244
- <div class="col-sm-3">
245
- <div class="video-header">Frieren</div>
246
- <div class="video-container">
247
- <iframe src="https://youtube.com/embed/Sb5Mg1-ORao"></iframe>
248
- </div>
249
- </div>
250
- <div class="col-sm-3">
251
- <div class="video-header">VATT</div>
252
- <div class="video-container">
253
- <iframe src="https://youtube.com/embed/eHmAGOmtDDg"></iframe>
254
- </div>
255
- </div>
256
- <div class="col-sm-3">
257
- <div class="video-header">V-AURA</div>
258
- <div class="video-container">
259
- <iframe src="https://youtube.com/embed/NEGa3krBrm0"></iframe>
260
- </div>
261
- </div>
262
- <div class="col-sm-3">
263
- <div class="video-header">Seeing and Hearing</div>
264
- <div class="video-container">
265
- <iframe src="https://youtube.com/embed/aO0EAXlwE7A"></iframe>
266
- </div>
267
- </div>
268
- </div>
269
- </div>
270
-
271
- <div id="vgg5">
272
- <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
273
- <p style="overflow: hidden;">
274
- Example 5: Playing a string instrument.
275
- <span style="float:right;"><a href="#index">Back to index</a></span>
276
- </p>
277
-
278
- <div class="row g-1">
279
- <div class="col-sm-3">
280
- <div class="video-header">Ground-truth</div>
281
- <div class="video-container">
282
- <iframe src="https://youtube.com/embed/KP1QhWauIOc"></iframe>
283
- </div>
284
- </div>
285
- <div class="col-sm-3">
286
- <div class="video-header">Ours</div>
287
- <div class="video-container">
288
- <iframe src="https://youtube.com/embed/ovaJhWSquYE"></iframe>
289
- </div>
290
- </div>
291
- <div class="col-sm-3">
292
- <div class="video-header">V2A-Mapper</div>
293
- <div class="video-container">
294
- <iframe src="https://youtube.com/embed/N723FS9lcy8"></iframe>
295
- </div>
296
- </div>
297
- <div class="col-sm-3">
298
- <div class="video-header">FoleyCrafter</div>
299
- <div class="video-container">
300
- <iframe src="https://youtube.com/embed/t0N4ZAAXo58"></iframe>
301
- </div>
302
- </div>
303
- </div>
304
- <div class="row g-1">
305
- <div class="col-sm-3">
306
- <div class="video-header">Frieren</div>
307
- <div class="video-container">
308
- <iframe src="https://youtube.com/embed/8YSRs03QNNA"></iframe>
309
- </div>
310
- </div>
311
- <div class="col-sm-3">
312
- <div class="video-header">VATT</div>
313
- <div class="video-container">
314
- <iframe src="https://youtube.com/embed/vOpMz55J1kY"></iframe>
315
- </div>
316
- </div>
317
- <div class="col-sm-3">
318
- <div class="video-header">V-AURA</div>
319
- <div class="video-container">
320
- <iframe src="https://youtube.com/embed/9JHC75vr9h0"></iframe>
321
- </div>
322
- </div>
323
- <div class="col-sm-3">
324
- <div class="video-header">Seeing and Hearing</div>
325
- <div class="video-container">
326
- <iframe src="https://youtube.com/embed/9w0JckNzXmY"></iframe>
327
- </div>
328
- </div>
329
- </div>
330
- </div>
331
-
332
- <div id="vgg6">
333
- <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
334
- <p style="overflow: hidden;">
335
- Example 6: A group of people playing tambourines.
336
- <span style="float:right;"><a href="#index">Back to index</a></span>
337
- </p>
338
-
339
- <div class="row g-1">
340
- <div class="col-sm-3">
341
- <div class="video-header">Ground-truth</div>
342
- <div class="video-container">
343
- <iframe src="https://youtube.com/embed/mx6JLxzUkRc"></iframe>
344
- </div>
345
- </div>
346
- <div class="col-sm-3">
347
- <div class="video-header">Ours</div>
348
- <div class="video-container">
349
- <iframe src="https://youtube.com/embed/oLirHhP9Su8"></iframe>
350
- </div>
351
- </div>
352
- <div class="col-sm-3">
353
- <div class="video-header">V2A-Mapper</div>
354
- <div class="video-container">
355
- <iframe src="https://youtube.com/embed/HkLkHMqptv0"></iframe>
356
- </div>
357
- </div>
358
- <div class="col-sm-3">
359
- <div class="video-header">FoleyCrafter</div>
360
- <div class="video-container">
361
- <iframe src="https://youtube.com/embed/rpHiiODjmNU"></iframe>
362
- </div>
363
- </div>
364
- </div>
365
- <div class="row g-1">
366
- <div class="col-sm-3">
367
- <div class="video-header">Frieren</div>
368
- <div class="video-container">
369
- <iframe src="https://youtube.com/embed/1mVD3fJ0LpM"></iframe>
370
- </div>
371
- </div>
372
- <div class="col-sm-3">
373
- <div class="video-header">VATT</div>
374
- <div class="video-container">
375
- <iframe src="https://youtube.com/embed/yjVFnJiEJlw"></iframe>
376
- </div>
377
- </div>
378
- <div class="col-sm-3">
379
- <div class="video-header">V-AURA</div>
380
- <div class="video-container">
381
- <iframe src="https://youtube.com/embed/neVeMSWtRkU"></iframe>
382
- </div>
383
- </div>
384
- <div class="col-sm-3">
385
- <div class="video-header">Seeing and Hearing</div>
386
- <div class="video-container">
387
- <iframe src="https://youtube.com/embed/EUE7YwyVWz8"></iframe>
388
- </div>
389
- </div>
390
- </div>
391
- </div>
392
-
393
- <div id="vgg_extra">
394
- <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
395
- <p style="overflow: hidden;">
396
- <span style="float:right;"><a href="#index">Back to index</a></span>
397
- </p>
398
-
399
- <div class="row g-1">
400
- <div class="col-sm-3">
401
- <div class="video-header">Moving train</div>
402
- <div class="video-container">
403
- <iframe src="https://youtube.com/embed/Ta6H45rBzJc"></iframe>
404
- </div>
405
- </div>
406
- <div class="col-sm-3">
407
- <div class="video-header">Water splashing</div>
408
- <div class="video-container">
409
- <iframe src="https://youtube.com/embed/hl6AtgHXpb4"></iframe>
410
- </div>
411
- </div>
412
- <div class="col-sm-3">
413
- <div class="video-header">Skateboarding</div>
414
- <div class="video-container">
415
- <iframe src="https://youtube.com/embed/n4sCNi_9buI"></iframe>
416
- </div>
417
- </div>
418
- <div class="col-sm-3">
419
- <div class="video-header">Synchronized clapping</div>
420
- <div class="video-container">
421
- <iframe src="https://youtube.com/embed/oxexfpLn7FE"></iframe>
422
- </div>
423
- </div>
424
- </div>
425
-
426
- <br><br>
427
-
428
- <div id="extra-failure">
429
- <h2 style="text-align: center;">Failure cases</h2>
430
- <p style="overflow: hidden;">
431
- <span style="float:right;"><a href="#index">Back to index</a></span>
432
- </p>
433
-
434
- <div class="row g-1">
435
- <div class="col-sm-6">
436
- <div class="video-header">Human speech</div>
437
- <div class="video-container">
438
- <iframe src="https://youtube.com/embed/nx0CyrDu70Y"></iframe>
439
- </div>
440
- </div>
441
- <div class="col-sm-6">
442
- <div class="video-header">Unfamiliar vision input</div>
443
- <div class="video-container">
444
- <iframe src="https://youtube.com/embed/hfnAqmK3X7w"></iframe>
445
- </div>
446
- </div>
447
- </div>
448
- </div>
449
- </div>
450
-
451
- </body>
452
- </html>
 
 
 
 
{mmaudio → pipeline}/__init__.py RENAMED
File without changes
pipeline/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (178 Bytes).
pipeline/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (166 Bytes).
pipeline/__pycache__/pipeline.cpython-310.pyc ADDED
Binary file (4.62 kB).
pipeline/__pycache__/pipeline.cpython-38.pyc ADDED
Binary file (2.66 kB).
pipeline/__pycache__/step0.cpython-310.pyc ADDED
Binary file (1.48 kB).
pipeline/__pycache__/step0.cpython-38.pyc ADDED
Binary file (1.35 kB).
pipeline/__pycache__/step1.cpython-310.pyc ADDED
Binary file (1.39 kB).
pipeline/__pycache__/step1.cpython-38.pyc ADDED
Binary file (1.3 kB).
pipeline/__pycache__/step2.cpython-310.pyc ADDED
Binary file (1.71 kB).
pipeline/__pycache__/step2.cpython-38.pyc ADDED
Binary file (1.61 kB).
pipeline/__pycache__/step3.cpython-310.pyc ADDED
Binary file (3.62 kB).
pipeline/__pycache__/step3.cpython-38.pyc ADDED
Binary file (3.42 kB).
pipeline/__pycache__/step4.cpython-310.pyc ADDED
Binary file (1.16 kB).
pipeline/__pycache__/step4.cpython-38.pyc ADDED
Binary file (1.08 kB).
pipeline/pipeline.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
1
+ # coding=utf-8
2
+
3
+ from .step0 import Step0
4
+ from .step1 import Step1
5
+ from .step2 import Step2
6
+ from .step3 import Step3
7
+ from .step4 import Step4
8
+ import logging
9
+ import re
10
+ import os
11
+
12
+ class Pipeline:
13
+ def __init__(self, step0_model_dir, step1_mode, step2_model_dir, step2_mode, step3_mode):
14
+ self.step0 = Step0(step0_model_dir)
15
+ self.step1 = Step1(step1_mode)
16
+ self.step2 = Step2(step2_model_dir, step2_mode)
17
+ self.step3 = Step3(model_type=step3_mode)
18
+ self.step4 = Step4()
19
+ self.step_processors = [self.step1, self.step2, self.step3, self.step4]
20
+ self.log = logging.getLogger(self.__class__.__name__)
21
+ self.log.setLevel(logging.INFO)
22
+
23
+
24
+ def run(self, video_input, output_dir, mode='s4', postp_mode='rep', prompt='', negative_prompt='', duration=10, seed=42):
25
+ step0_resp = self.step0.run(video_input)
26
+ step0_resp_list = re.findall(r'(Step\d:.*?)(?=Step\d:|$)', step0_resp, re.DOTALL)
27
+ step_infos = [step_info.strip().split("\n")[0] for step_info in step0_resp_list]
28
+ step3_temp_dir = os.path.join(output_dir, "remove_vo")
29
+
30
+ step_results = {"temp_final_audio_path": None, "temp_final_video_path": None}
31
+ for step_info in step_infos:
32
+ self.log.info(f"Start to {step_info}")
33
+ if step_info == 'Step1: Generate audio from video.':
34
+ step1_audio_path, step1_video_path = self.step1.run(video_input, output_dir, prompt, negative_prompt, duration=duration, seed=seed)
35
+ step_results["step1_audio_path"] = step1_audio_path
36
+ step_results["step1_video_path"] = step1_video_path
37
+
38
+ elif step_info == 'Step2: Given a video and its generated audio, determine whether the audio contains voice-over.':
39
+ is_vo = self.step2.run(str(step_results["step1_video_path"]))
40
+ step_results["is_vo"] = is_vo
41
+ if not step_results["is_vo"]: # not voice-over
42
+ step_results["temp_final_audio_path"] = step_results["step1_audio_path"]
43
+ step_results["temp_final_video_path"] = step_results["step1_video_path"]
44
+ return step_results
45
+
46
+ elif step_info == 'Step3: Remove voice-over from audio.':
47
+ step3_audio_path = self.step3.run(input_audio_path=step_results["step1_audio_path"],
48
+ temp_store_dir=step3_temp_dir,
49
+ output_dir=output_dir)
50
+ step_results["step3_audio_path"] = step3_audio_path
51
+ if mode == 's3':
52
+ step_results["temp_final_audio_path"] = step_results["step3_audio_path"]
53
+ return step_results
54
+
55
+ elif step_info == 'Step4: Determine whether the audio is silent.':
56
+ is_silent = self.step4.run(step_results["step3_audio_path"])
57
+ step_results["is_silent"] = is_silent
58
+
59
+ else:
60
+ self.log.error(f"Step-by-Step Error !!!!!!!!!")
61
+ return step_results
62
+
63
+ if not step_results["is_silent"]: # not silent
64
+ step_results["temp_final_audio_path"] = step_results["step3_audio_path"]
65
+ else:
66
+ self.log.info(f"Start to post process, use mode: {postp_mode}")
67
+ if postp_mode == "rm":
68
+ step_results["temp_final_audio_path"] = None
69
+ elif postp_mode == "rep":
70
+ step_results["temp_final_audio_path"] = step_results["step1_audio_path"]
71
+ step_results["temp_final_video_path"] = step_results["step1_video_path"]
72
+ elif postp_mode == "neg":
73
+ neg_audio_path, neg_video_path = self.step1.run(video_input, output_dir, prompt, negative_prompt='human voice', duration=duration, seed=seed, is_postp=True)
74
+ step_results["temp_final_audio_path"] = neg_audio_path
75
+ step_results["temp_final_video_path"] = neg_video_path
76
+ else:
77
+ self.log.error(f"Error postp_mode: {postp_mode}")
78
+
79
+ self.log.info(f"After post-processing, audio is {step_results['temp_final_audio_path']} and video is {step_results['temp_final_video_path']}")
80
+ self.log.info(f"Finish Post-Process successfully.\n")
81
+
82
+ return step_results
83
+
84
+
85
+
86
+ def run_for_gradio(self, video_input, output_dir, mode='s4', postp_mode='rep', prompt='', negative_prompt='', duration=10, seed=42):
87
+ step_results = {"temp_final_audio_path": None,
88
+ "temp_final_video_path": None,
89
+ 'log': ''}
90
+
91
+ step0_resp = self.step0.run(video_input)
92
+ step0_resp_list = re.findall(r'(Step\d:.*?)(?=Step\d:|$)', step0_resp, re.DOTALL)
93
+ step_infos = [step_info.strip().split("\n")[0] for step_info in step0_resp_list]
94
+ step3_temp_dir = os.path.join(output_dir, "remove_vo")
95
+
96
+
97
+ for step_info in step_infos:
98
+ self.log.info(f"Start to {step_info}")
99
+ step_results['log'] = f"Start to {step_info}"
100
+ yield step_results
101
+
102
+ if step_info == 'Step1: Generate audio from video.':
103
+ step1_audio_path, step1_video_path = self.step1.run(video_input, output_dir, prompt, negative_prompt, duration=duration, seed=seed)
104
+ step_results["step1_audio_path"] = step1_audio_path
105
+ step_results["step1_video_path"] = step1_video_path
106
+ step_results['log'] = "Step1 completed."
107
+ yield step_results
108
+
109
+ elif step_info == 'Step2: Given a video and its generated audio, determine whether the audio contains voice-over.':
110
+ is_vo = self.step2.run(str(step_results["step1_video_path"]))
111
+ step_results["is_vo"] = is_vo
112
+ step_results['log'] = f"Step2 completed. Contain voice-over? {'Yes' if is_vo else 'No'}"
113
+ yield step_results
114
+ if not step_results["is_vo"]: # not voice-over
115
+ step_results["temp_final_audio_path"] = step_results["step1_audio_path"]
116
+ step_results["temp_final_video_path"] = step_results["step1_video_path"]
117
+ step_results['log'] = "Finish step-by-step v2a."
118
+ yield step_results
119
+
120
+ elif step_info == 'Step3: Remove voice-over from audio.':
121
+ step3_audio_path = self.step3.run(input_audio_path=step_results["step1_audio_path"],
122
+ temp_store_dir=step3_temp_dir,
123
+ output_dir=output_dir)
124
+ step_results["step3_audio_path"] = step3_audio_path
125
+ step_results['log'] = f"Step3 completed."
126
+ yield step_results
127
+ if mode == 's3':
128
+ step_results["temp_final_audio_path"] = step_results["step3_audio_path"]
129
+ step_results['log'] = "Finish step-by-step v2a."
130
+ yield step_results
131
+
132
+ elif step_info == 'Step4: Determine whether the audio is silent.':
133
+ is_silent = self.step4.run(step_results["step3_audio_path"])
134
+ step_results["is_silent"] = is_silent
135
+ step_results['log'] = f"Step4 completed. Silent? {'Yes' if is_silent else 'No'}"
136
+ yield step_results
137
+
138
+ else:
139
+ self.log.error(f"Step-by-Step Error !!!!!!!!!")
140
+ step_results['log'] = f"Step-by-Step Error !!!!!!!!!"
141
+ yield step_results
142
+ step_results['log'] = "Finish step-by-step v2a."
143
+ yield step_results
144
+
145
+ if not step_results["is_silent"]: # not silent
146
+ step_results["temp_final_audio_path"] = step_results["step3_audio_path"]
147
+ step_results['log'] = "Finish step-by-step v2a."
148
+ yield step_results
149
+
150
+ else:
151
+ step_results['log'] = f"Post-processing with mode: {postp_mode}"
152
+ yield step_results
153
+ self.log.info(f"Start to post process, use mode: {postp_mode}")
154
+
155
+ if postp_mode == "rm":
156
+ step_results["temp_final_audio_path"] = None
157
+ elif postp_mode == "rep":
158
+ step_results["temp_final_audio_path"] = step_results["step1_audio_path"]
159
+ step_results["temp_final_video_path"] = step_results["step1_video_path"]
160
+ elif postp_mode == "neg":
161
+ neg_audio_path, neg_video_path = self.step1.run(video_input, output_dir, prompt, negative_prompt='human voice', duration=duration, seed=seed, is_postp=True)
162
+ step_results["temp_final_audio_path"] = neg_audio_path
163
+ step_results["temp_final_video_path"] = neg_video_path
164
+ else:
165
+ self.log.error(f"Error postp_mode: {postp_mode}")
166
+
167
+ self.log.info(f"After post-processing, audio is {step_results['temp_final_audio_path']} and video is {step_results['temp_final_video_path']}")
168
+ self.log.info(f"Finish Post-Process successfully.\n")
169
+ step_results['log'] = f"Post-processing completed."
170
+ yield step_results
171
+
172
+
173
+ step_results['log'] = "Finish step-by-step v2a."
174
+ yield step_results
175
+
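The new `Pipeline` class above is the core of this commit: Step0 plans the chain-of-thought, Step1 runs video-to-audio generation, Step2 checks the result for voice-over, Step3 separates the voice-over out, and Step4 verifies that what remains is not silent. The sketch below shows how it might be driven end to end; the checkpoint directories and the `mmaudio_large_44k` variant string are placeholders assumed for illustration, not paths shipped with this commit.

```python
# Minimal usage sketch of the new Pipeline (hypothetical checkpoint paths).
from pipeline.pipeline import Pipeline

pipe = Pipeline(
    step0_model_dir="pretrained/videollama2-cot",  # assumed path to the CoT planner
    step1_mode="mmaudio_large_44k",                # any "mmaudio_*" variant or "foleycrafter"
    step2_model_dir="pretrained/videollama2-av",   # assumed path to the voice-over judge
    step2_mode="cot",
    step3_mode="bs_roformer",
)

results = pipe.run(
    video_input="examples/demo.mp4",
    output_dir="outputs/demo",
    mode="s4",          # run through Step4; "s3" stops after voice-over removal
    postp_mode="rep",   # "rm" drops the audio, "rep" keeps Step1's output, "neg" reruns Step1 with a negative prompt
    prompt="",
    negative_prompt="",
    duration=10,
    seed=42,
)
print(results["temp_final_audio_path"], results["temp_final_video_path"])
```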
pipeline/step0.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
1
+ # coding=utf-8
2
+ # CoT generate step-by-step
3
+
4
+ from third_party.VideoLLaMA2.videollama2 import model_init, mm_infer
5
+ import logging
6
+
7
+ class Step0:
8
+ def __init__(self, model_path, modal_type='v'):
9
+ self.log = logging.getLogger(self.__class__.__name__)
10
+ self.log.setLevel(logging.INFO)
11
+
12
+ self.model, self.processor, self.tokenizer = model_init(model_path)
13
+ self.modal_type=modal_type
14
+ if modal_type == "a":
15
+ self.model.model.vision_tower = None
16
+ elif modal_type == "v":
17
+ self.model.model.audio_tower = None
18
+ elif modal_type == "av":
19
+ pass
20
+ else:
21
+ raise NotImplementedError
22
+ self.modal = 'audio' if modal_type == "a" else "video"
23
+ self.question = f"Generate high-quality audio from video step-by-step."
24
+ self.preprocess = self.processor[self.modal]
25
+
26
+ def run(self, video_path):
27
+ self.log.info("######################################################################################################")
28
+ self.log.info("Generate high-quality audio from video step-by-step...")
29
+ audio_video_tensor = self.preprocess(video_path, va=False)
30
+ output = mm_infer(
31
+ audio_video_tensor,
32
+ self.question,
33
+ model=self.model,
34
+ tokenizer=self.tokenizer,
35
+ modal=self.modal,
36
+ do_sample=False,
37
+ )
38
+
39
+ return output
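`Step0` returns a free-form chain-of-thought string, and `Pipeline` slices it into per-step headers with the regex shown above. A small self-contained illustration of that parsing follows; the sample response text is invented for the example, since the pipeline only keys on the `StepN:` headers.

```python
import re

# Hypothetical Step0 output; the real model's wording may differ.
resp = (
    "Step1: Generate audio from video.\nThe scene shows a dog barking.\n"
    "Step2: Given a video and its generated audio, determine whether the audio contains voice-over.\n"
    "Step3: Remove voice-over from audio.\n"
    "Step4: Determine whether the audio is silent.\n"
)

# Same regex as Pipeline.run(): split the response into StepN blocks.
blocks = re.findall(r'(Step\d:.*?)(?=Step\d:|$)', resp, re.DOTALL)
# Keep only the header line of each block.
step_infos = [b.strip().split("\n")[0] for b in blocks]
print(step_infos)
# ['Step1: Generate audio from video.', 'Step2: ...', 'Step3: ...', 'Step4: ...']
```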
pipeline/step1.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
1
+ # coding=utf-8
2
+ # V2A
3
+ import logging
4
+
5
+
6
+ class Step1:
7
+ def __init__(self, step1_mode):
8
+ self.log = logging.getLogger(self.__class__.__name__)
9
+ self.log.setLevel(logging.INFO)
10
+
11
+ if step1_mode.startswith('mmaudio'):
12
+ from v2a_models.v2a_mmaudio import V2A_MMAudio
13
+ variant = step1_mode.replace("mmaudio_", "")
14
+ self.v2a_model = V2A_MMAudio(variant)
15
+ elif step1_mode == "foleycrafter":
16
+ from v2a_models.v2a_foleycrafter import V2A_FoleyCrafter
17
+ self.v2a_model = V2A_FoleyCrafter()
18
+ else:
19
+ self.log.error(f"Error step1_mode: {step1_mode}")
20
+
21
+
22
+
23
+ def run(self, video_path, output_dir, prompt='', negative_prompt='', duration=10, seed=42, is_postp=False,):
24
+ # self.log.info("Step1: Generate audio from video.")
25
+ step1_audio_path, step1_video_path = self.v2a_model.generate_audio(
26
+ video_path=video_path,
27
+ output_dir=output_dir,
28
+ prompt=prompt,
29
+ negative_prompt=negative_prompt,
30
+ duration=duration,
31
+ seed=seed,
32
+ is_postp=is_postp)
33
+
34
+ self.log.info(f"The audio generated by Step1 is in {step1_audio_path}, and the video is in {step1_video_path}")
35
+ self.log.info("Finish Step1 successfully.\n")
36
+ return step1_audio_path, step1_video_path
pipeline/step2.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
1
+ # coding=utf-8
2
+ # judge voice-over
3
+
4
+ from third_party.VideoLLaMA2.videollama2 import model_init, mm_infer
5
+ import logging
6
+
7
+ class Step2:
8
+ def __init__(self, model_path, step2_mode, modal_type="av"):
9
+ self.log = logging.getLogger(self.__class__.__name__)
10
+ self.log.setLevel(logging.INFO)
11
+
12
+ self.model, self.processor, self.tokenizer = model_init(model_path)
13
+ self.modal_type=modal_type
14
+ if modal_type == "a":
15
+ self.model.model.vision_tower = None
16
+ elif modal_type == "v":
17
+ self.model.model.audio_tower = None
18
+ elif modal_type == "av":
19
+ pass
20
+ else:
21
+ raise NotImplementedError
22
+ self.modal = 'audio' if modal_type == "a" else "video"
23
+
24
+ self.question = f"Given a video and its corresponding audio, determine whether the audio contains voice-over? Options: A. Yes, B. No. Choose A or B."
25
+ self.preprocess = self.processor[self.modal]
26
+
27
+ self.step2_mode = step2_mode
28
+
29
+ def run(self, video_audio_path):
30
+ # self.log.info("Step2: Given a video and its generated audio, determine whether the audio contains voice-over.")
31
+ audio_video_tensor = self.preprocess(video_audio_path, va=True)
32
+ output = mm_infer(
33
+ audio_video_tensor,
34
+ self.question,
35
+ model=self.model,
36
+ tokenizer=self.tokenizer,
37
+ modal=self.modal,
38
+ do_sample=False,
39
+ )
40
+ # print("oooooooooooooooooooooo: ", output)
41
+
42
+ if self.step2_mode == "cot":
43
+ output = output.split("<CONCLUSION>")[-1][1]
44
+ print("1111111111111111111111111: ", output)
45
+ output = (output == "A")
46
+
47
+ if output:
48
+ self.log.info(f"The video generated by Step1 ({video_audio_path}) contains voice-over.")
49
+ else:
50
+ self.log.info(f"The video generated by Step1 ({video_audio_path}) does not contain voice-over.")
51
+ self.log.info("Finish Step2 successfully.\n")
52
+ return output
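When `step2_mode == 'cot'`, the judge model is expected to end its answer with a `<CONCLUSION>` tag, and the single character at index 1 after the tag is taken as the chosen option. A small illustration of that slicing is below; the response string is made up, and the real model's formatting may differ.

```python
# Illustrative CoT response; Step2 keeps only the letter right after "<CONCLUSION>".
resp = "The audio contains narration unrelated to on-screen events. <CONCLUSION> A </CONCLUSION>"

tail = resp.split("<CONCLUSION>")[-1]   # " A </CONCLUSION>"
answer = tail[1]                        # "A"  (index 0 is the leading space)
contains_voice_over = (answer == "A")
print(contains_voice_over)              # True
```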
pipeline/step3.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
1
+ # coding=utf-8
2
+ # Remove voice-over
3
+ import logging
4
+ import argparse
5
+ import subprocess
6
+ import librosa
7
+ import os
8
+ import torch
9
+ import soundfile as sf
10
+ import numpy as np
11
+
12
+
13
+ # Using the embedded version of Python can also correctly import the utils module.
14
+ # current_dir = os.path.dirname(os.path.abspath(__file__))
15
+ # sys.path.append(current_dir)
16
+
17
+ from third_party.MusicSourceSeparationTraining.utils import demix, load_config, normalize_audio, denormalize_audio, draw_spectrogram
18
+ from third_party.MusicSourceSeparationTraining.utils import prefer_target_instrument, apply_tta, load_start_checkpoint
19
+ from third_party.MusicSourceSeparationTraining.models.bs_roformer import BSRoformer
20
+ import warnings
21
+
22
+ warnings.filterwarnings("ignore")
23
+
24
+ model_base_dir = "pretrained/remove_vo/checkpoints"
25
+ MODEL_PATHS = {"bs_roformer": [f"{model_base_dir}/model_bs_roformer_ep_317_sdr_12.9755.ckpt", f"{model_base_dir}/model_bs_roformer_ep_317_sdr_12.9755.yaml"]}
26
+
27
+
28
+ class Step3:
29
+ def __init__(self, model_type="bs_roformer"):
30
+ model_path, config_path = MODEL_PATHS[model_type]
31
+
32
+ self.log = logging.getLogger(self.__class__.__name__)
33
+ self.log.setLevel(logging.INFO)
34
+ self.device = 'cpu'
35
+ if torch.cuda.is_available():
36
+ self.device = 'cuda'
37
+ elif torch.backends.mps.is_available():
38
+ self.device = 'mps'
39
+ else:
40
+ self.log.warning('CUDA/MPS are not available, running on CPU')
41
+
42
+ self.model_type = model_type
43
+
44
+ # self.model, self.config = get_model_from_config(model_type, config_path)
45
+ self.config = load_config(model_type, config_path)
46
+ self.model = BSRoformer(**dict(self.config.model))
47
+ args = argparse.Namespace()
48
+ args.start_check_point = model_path
49
+ args.model_type = model_type
50
+ args.lora_checkpoint = ''
51
+ load_start_checkpoint(args, self.model, type_='inference')
52
+ self.model = self.model.to(self.device)
53
+ self.sample_rate = getattr(self.config.audio, 'sample_rate', 44100)
54
+
55
+
56
+ def run(self,
57
+ input_audio_path,
58
+ temp_store_dir, # for remove result dir
59
+ output_dir, # for final dir
60
+ disable_detailed_pbar: bool=False,
61
+ use_tta: bool= False,
62
+ extract_instrumental: bool=True,
63
+ codec="wav",
64
+ subtype="FLOAT",
65
+ draw_spectro=0,
66
+ ):
67
+
68
+ # self.log.info("Step3: Remove voice-over from audio.")
69
+
70
+ os.makedirs(output_dir, exist_ok=True)
71
+
72
+ if disable_detailed_pbar:
73
+ detailed_pbar = False
74
+ else:
75
+ detailed_pbar = True
76
+
77
+ instruments = prefer_target_instrument(self.config)[:]
78
+
79
+ mix, sr = librosa.load(input_audio_path, sr=self.sample_rate, mono=False)
80
+ # If mono audio we must adjust it depending on model
81
+ if len(mix.shape) == 1:
82
+ mix = np.expand_dims(mix, axis=0)
83
+ if 'num_channels' in self.config.audio:
84
+ if self.config.audio['num_channels'] == 2:
85
+ print(f'Convert mono track to stereo...')
86
+ mix = np.concatenate([mix, mix], axis=0)
87
+
88
+ mix_orig = mix.copy()
89
+ if 'normalize' in self.config.inference:
90
+ if self.config.inference['normalize'] is True:
91
+ mix, norm_params = normalize_audio(mix)
92
+
93
+ waveforms_orig = demix(self.config, self.model, mix, self.device, model_type=self.model_type, pbar=detailed_pbar)
94
+ if use_tta:
95
+ waveforms_orig = apply_tta(self.config, self.model, mix, waveforms_orig, self.device, self.model_type)
96
+
97
+ if extract_instrumental:
98
+ instr = 'vocals' if 'vocals' in instruments else instruments[0]
99
+ waveforms_orig['instrumental'] = mix_orig - waveforms_orig[instr]
100
+ if 'instrumental' not in instruments:
101
+ instruments.append('instrumental')
102
+
103
+ file_name = os.path.splitext(os.path.basename(input_audio_path))[0].replace(".step1", "")
104
+ temp_output_dir = os.path.join(temp_store_dir, file_name)
105
+ os.makedirs(temp_output_dir, exist_ok=True)
106
+
107
+ for instr in instruments:
108
+ estimates = waveforms_orig[instr]
109
+ if 'normalize' in self.config.inference:
110
+ if self.config.inference['normalize'] is True:
111
+ estimates = denormalize_audio(estimates, norm_params)
112
+
113
+ output_path = os.path.join(temp_output_dir, f"{instr}.{codec}")
114
+ sf.write(output_path, estimates.T, sr, subtype=subtype)
115
+ if draw_spectro > 0:
116
+ output_img_path = os.path.join(temp_output_dir, f"{instr}.jpg")
117
+ draw_spectrogram(estimates.T, sr, draw_spectro, output_img_path)
118
+
119
+
120
+ instrumental_file = os.path.join(temp_output_dir, 'instrumental.wav')
121
+ step3_audio_path = f"{output_dir}/{file_name}.step3.wav"
122
+ subprocess.run(['cp', instrumental_file, step3_audio_path])
123
+
124
+ self.log.info(f"The voice-over has been removed, and the audio is saved in {step3_audio_path}")
125
+ self.log.info("Finish Step3 successfully.\n")
126
+ return step3_audio_path
127
+
128
+
129
+
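Standalone, `Step3` wraps the BS-RoFormer separator from `MusicSourceSeparationTraining`: it separates the vocal stem, reconstructs the instrumental as `mix - vocals`, and copies `instrumental.wav` next to the other outputs as `<name>.step3.wav`. A usage sketch under the assumption that the checkpoint listed in `MODEL_PATHS` has been downloaded; the file paths are placeholders.

```python
# Sketch only: requires the bs_roformer checkpoint under pretrained/remove_vo/checkpoints/.
from pipeline.step3 import Step3

remover = Step3(model_type="bs_roformer")
clean_path = remover.run(
    input_audio_path="outputs/demo/demo.step1.wav",  # hypothetical Step1 output
    temp_store_dir="outputs/demo/remove_vo",         # per-stem files land here
    output_dir="outputs/demo",                       # final *.step3.wav is copied here
)
print(clean_path)  # e.g. outputs/demo/demo.step3.wav
```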
pipeline/step4.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
1
+ # coding=utf-8
2
+ # Silence detection
3
+ import logging
4
+ import librosa
5
+ import numpy as np
6
+
7
+
8
+ class Step4:
9
+ def __init__(self):
10
+ self.log = logging.getLogger(self.__class__.__name__)
11
+ self.log.setLevel(logging.INFO)
12
+
13
+
14
+ def run(self,
15
+ audio_path,
16
+ silence_thresh=-50,
17
+ duration_thresh=0.9):
18
+ # self.log.info("Step4: Determine whether the audio is silent.")
19
+ y, sr = librosa.load(audio_path, sr=None)
20
+ energy = librosa.feature.rms(y=y)[0]
21
+ energy_db = librosa.amplitude_to_db(energy)
22
+ silent_ratio = np.sum(energy_db < silence_thresh) / len(energy_db)
23
+ is_silent = silent_ratio > duration_thresh
24
+
25
+ if is_silent:
26
+ self.log.info(f"The audio after removing the voiceover ({audio_path}) is silent.")
27
+ else:
28
+ self.log.info(f"The audio after removing the voiceover ({audio_path}) is not silent.")
29
+ self.log.info("Finish Step4 successfully.\n")
30
+
31
+ return is_silent
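The silence test in `Step4` is frame-based rather than a single global level: it computes per-frame RMS energy, converts it to dB, and flags the clip as silent when more than `duration_thresh` (90%) of the frames fall below `silence_thresh` (-50 dB). A small self-contained check of the same rule on synthetic audio:

```python
import numpy as np
import librosa

sr = 16000
# Two seconds of near-silence followed by a short 440 Hz burst:
# the vast majority of frames stay below -50 dB.
y = np.concatenate([
    1e-4 * np.random.randn(2 * sr),
    0.5 * np.sin(2 * np.pi * 440 * np.arange(int(0.05 * sr)) / sr),
]).astype(np.float32)

energy_db = librosa.amplitude_to_db(librosa.feature.rms(y=y)[0])
silent_ratio = np.mean(energy_db < -50)
print(silent_ratio, silent_ratio > 0.9)  # mostly-silent clip -> flagged as silent
```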
pyproject.toml DELETED
@@ -1,52 +0,0 @@
1
- [build-system]
2
- requires = ["hatchling"]
3
- build-backend = "hatchling.build"
4
-
5
- [tool.hatch.metadata]
6
- allow-direct-references = true
7
-
8
- [tool.yapf]
9
- based_on_style = "pep8"
10
- indent_width = 4
11
- column_limit = 100
12
-
13
- [project]
14
- name = "mmaudio"
15
- version = "1.0.0"
16
- authors = [{ name = "Rex Cheng", email = "[email protected]" }]
17
- description = ""
18
- readme = "README.md"
19
- requires-python = ">=3.9"
20
- classifiers = [
21
- "Programming Language :: Python :: 3",
22
- "Operating System :: OS Independent",
23
- ]
24
- dependencies = [
25
- 'torch >= 2.5.1',
26
- 'python-dotenv',
27
- 'cython',
28
- 'gitpython >= 3.1',
29
- 'tensorboard >= 2.11',
30
- 'numpy >= 1.21, <2.1',
31
- 'Pillow >= 9.5',
32
- 'opencv-python >= 4.8',
33
- 'scipy >= 1.7',
34
- 'tqdm >= 4.66.1',
35
- 'gradio >= 3.34',
36
- 'einops >= 0.6',
37
- 'hydra-core >= 1.3.2',
38
- 'requests',
39
- 'torchdiffeq',
40
- 'librosa >= 0.8.1',
41
- 'nitrous-ema',
42
- 'safetensors',
43
- 'auraloss',
44
- 'hydra_colorlog',
45
- 'tensordict',
46
- 'colorlog',
47
- 'open_clip_torch',
48
- 'soundfile',
49
- ]
50
-
51
- [tool.hatch.build.targets.wheel]
52
- packages = ["mmaudio"]
 
 
 
 
requirements.txt.bak DELETED
@@ -1,27 +0,0 @@
1
- torch == 2.4.0
2
- torchvision
3
- torchaudio
4
- python-dotenv
5
- cython
6
- gitpython >= 3.1
7
- tensorboard >= 2.11
8
- numpy >= 1.21, <2.1
9
- Pillow >= 9.5
10
- opencv-python >= 4.8
11
- scipy >= 1.7
12
- tqdm >= 4.66.1
13
- gradio >= 3.34
14
- einops >= 0.6
15
- hydra-core >= 1.3.2
16
- requests
17
- torchdiffeq
18
- librosa >= 0.8.1
19
- nitrous-ema
20
- safetensors
21
- auraloss
22
- hydra_colorlog
23
- tensordict
24
- colorlog
25
- open_clip_torch
26
- soundfile
27
- av
 
 
 
 
third_party/MMAudio/.gitignore ADDED
@@ -0,0 +1,146 @@
 
 
 
 
1
+ run_*.sh
2
+ log/
3
+ saves
4
+ saves/
5
+ weights/
6
+ weights
7
+ output/
8
+ output
9
+ pretrained/
10
+ workspace
11
+ workspace/
12
+ ext_weights/
13
+ ext_weights
14
+ .checkpoints/
15
+ .vscode/
16
+ training/example_output/
17
+
18
+ # Byte-compiled / optimized / DLL files
19
+ __pycache__/
20
+ *.py[cod]
21
+ *$py.class
22
+
23
+ # C extensions
24
+ *.so
25
+
26
+ # Distribution / packaging
27
+ .Python
28
+ build/
29
+ develop-eggs/
30
+ dist/
31
+ downloads/
32
+ eggs/
33
+ .eggs/
34
+ lib/
35
+ lib64/
36
+ parts/
37
+ sdist/
38
+ var/
39
+ wheels/
40
+ pip-wheel-metadata/
41
+ share/python-wheels/
42
+ *.egg-info/
43
+ .installed.cfg
44
+ *.egg
45
+ MANIFEST
46
+
47
+ # PyInstaller
48
+ # Usually these files are written by a python script from a template
49
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
50
+ *.manifest
51
+ *.spec
52
+
53
+ # Installer logs
54
+ pip-log.txt
55
+ pip-delete-this-directory.txt
56
+
57
+ # Unit test / coverage reports
58
+ htmlcov/
59
+ .tox/
60
+ .nox/
61
+ .coverage
62
+ .coverage.*
63
+ .cache
64
+ nosetests.xml
65
+ coverage.xml
66
+ *.cover
67
+ *.py,cover
68
+ .hypothesis/
69
+ .pytest_cache/
70
+
71
+ # Translations
72
+ *.mo
73
+ *.pot
74
+
75
+ # Django stuff:
76
+ *.log
77
+ local_settings.py
78
+ db.sqlite3
79
+ db.sqlite3-journal
80
+
81
+ # Flask stuff:
82
+ instance/
83
+ .webassets-cache
84
+
85
+ # Scrapy stuff:
86
+ .scrapy
87
+
88
+ # Sphinx documentation
89
+ docs/_build/
90
+
91
+ # PyBuilder
92
+ target/
93
+
94
+ # Jupyter Notebook
95
+ .ipynb_checkpoints
96
+
97
+ # IPython
98
+ profile_default/
99
+ ipython_config.py
100
+
101
+ # pyenv
102
+ .python-version
103
+
104
+ # pipenv
105
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
106
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
107
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
108
+ # install all needed dependencies.
109
+ #Pipfile.lock
110
+
111
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
112
+ __pypackages__/
113
+
114
+ # Celery stuff
115
+ celerybeat-schedule
116
+ celerybeat.pid
117
+
118
+ # SageMath parsed files
119
+ *.sage.py
120
+
121
+ # Environments
122
+ .env
123
+ .venv
124
+ env/
125
+ venv/
126
+ ENV/
127
+ env.bak/
128
+ venv.bak/
129
+
130
+ # Spyder project settings
131
+ .spyderproject
132
+ .spyproject
133
+
134
+ # Rope project settings
135
+ .ropeproject
136
+
137
+ # mkdocs documentation
138
+ /site
139
+
140
+ # mypy
141
+ .mypy_cache/
142
+ .dmypy.json
143
+ dmypy.json
144
+
145
+ # Pyre type checker
146
+ .pyre/
third_party/MMAudio/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Sony Research Inc.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
{mmaudio/data → third_party/MMAudio/mmaudio}/__init__.py RENAMED
File without changes
{mmaudio/ext/bigvgan_v2 → third_party/MMAudio/mmaudio/data}/__init__.py RENAMED
File without changes
{mmaudio → third_party/MMAudio/mmaudio}/data/av_utils.py RENAMED
@@ -1,7 +1,7 @@
1
  from dataclasses import dataclass
2
  from fractions import Fraction
3
  from pathlib import Path
4
- from typing import Optional
5
 
6
  import av
7
  import numpy as np
@@ -15,7 +15,7 @@ class VideoInfo:
15
  fps: Fraction
16
  clip_frames: torch.Tensor
17
  sync_frames: torch.Tensor
18
- all_frames: Optional[list[np.ndarray]]
19
 
20
  @property
21
  def height(self):
@@ -25,9 +25,35 @@ class VideoInfo:
25
  def width(self):
26
  return self.all_frames[0].shape[1]
27
 
 
 
 
 
 
 
 
 
 
 
28
 
29
- def read_frames(video_path: Path, list_of_fps: list[float], start_sec: float, end_sec: float,
30
- need_all_frames: bool) -> tuple[list[np.ndarray], list[np.ndarray], Fraction]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  output_frames = [[] for _ in list_of_fps]
32
  next_frame_time_for_each_fps = [0.0 for _ in list_of_fps]
33
  time_delta_for_each_fps = [1 / fps for fps in list_of_fps]
 
1
  from dataclasses import dataclass
2
  from fractions import Fraction
3
  from pathlib import Path
4
+ from typing import Optional, List, Tuple
5
 
6
  import av
7
  import numpy as np
 
15
  fps: Fraction
16
  clip_frames: torch.Tensor
17
  sync_frames: torch.Tensor
18
+ all_frames: Optional[List[np.ndarray]]
19
 
20
  @property
21
  def height(self):
 
25
  def width(self):
26
  return self.all_frames[0].shape[1]
27
 
28
+ @classmethod
29
+ def from_image_info(cls, image_info: 'ImageInfo', duration_sec: float,
30
+ fps: Fraction) -> 'VideoInfo':
31
+ num_frames = int(duration_sec * fps)
32
+ all_frames = [image_info.original_frame] * num_frames
33
+ return cls(duration_sec=duration_sec,
34
+ fps=fps,
35
+ clip_frames=image_info.clip_frames,
36
+ sync_frames=image_info.sync_frames,
37
+ all_frames=all_frames)
38
 
39
+
40
+ @dataclass
41
+ class ImageInfo:
42
+ clip_frames: torch.Tensor
43
+ sync_frames: torch.Tensor
44
+ original_frame: Optional[np.ndarray]
45
+
46
+ @property
47
+ def height(self):
48
+ return self.original_frame.shape[0]
49
+
50
+ @property
51
+ def width(self):
52
+ return self.original_frame.shape[1]
53
+
54
+
55
+ def read_frames(video_path: Path, list_of_fps: List[float], start_sec: float, end_sec: float,
56
+ need_all_frames: bool) -> Tuple[List[np.ndarray], List[np.ndarray], Fraction]:
57
  output_frames = [[] for _ in list_of_fps]
58
  next_frame_time_for_each_fps = [0.0 for _ in list_of_fps]
59
  time_delta_for_each_fps = [1 / fps for fps in list_of_fps]
third_party/MMAudio/mmaudio/data/data_setup.py ADDED
@@ -0,0 +1,174 @@
 
 
 
1
+ import logging
2
+ import random
3
+
4
+ import numpy as np
5
+ import torch
6
+ from omegaconf import DictConfig
7
+ from torch.utils.data import DataLoader, Dataset
8
+ from torch.utils.data.dataloader import default_collate
9
+ from torch.utils.data.distributed import DistributedSampler
10
+
11
+ from mmaudio.data.eval.audiocaps import AudioCapsData
12
+ from mmaudio.data.eval.video_dataset import MovieGen, VGGSound
13
+ from mmaudio.data.extracted_audio import ExtractedAudio
14
+ from mmaudio.data.extracted_vgg import ExtractedVGG
15
+ from mmaudio.data.mm_dataset import MultiModalDataset
16
+ from mmaudio.utils.dist_utils import local_rank
17
+
18
+ log = logging.getLogger()
19
+
20
+
21
+ # Re-seed randomness every time we start a worker
22
+ def worker_init_fn(worker_id: int):
23
+ worker_seed = torch.initial_seed() % (2**31) + worker_id + local_rank * 1000
24
+ np.random.seed(worker_seed)
25
+ random.seed(worker_seed)
26
+ log.debug(f'Worker {worker_id} re-seeded with seed {worker_seed} in rank {local_rank}')
27
+
28
+
29
+ def load_vgg_data(cfg: DictConfig, data_cfg: DictConfig) -> Dataset:
30
+ dataset = ExtractedVGG(tsv_path=data_cfg.tsv,
31
+ data_dim=cfg.data_dim,
32
+ premade_mmap_dir=data_cfg.memmap_dir)
33
+
34
+ return dataset
35
+
36
+
37
+ def load_audio_data(cfg: DictConfig, data_cfg: DictConfig) -> Dataset:
38
+ dataset = ExtractedAudio(tsv_path=data_cfg.tsv,
39
+ data_dim=cfg.data_dim,
40
+ premade_mmap_dir=data_cfg.memmap_dir)
41
+
42
+ return dataset
43
+
44
+
45
+ def setup_training_datasets(cfg: DictConfig) -> tuple[Dataset, DistributedSampler, DataLoader]:
46
+ if cfg.mini_train:
47
+ vgg = load_vgg_data(cfg, cfg.data.ExtractedVGG_val)
48
+ audiocaps = load_audio_data(cfg, cfg.data.AudioCaps)
49
+ dataset = MultiModalDataset([vgg], [audiocaps])
50
+ if cfg.example_train:
51
+ video = load_vgg_data(cfg, cfg.data.Example_video)
52
+ audio = load_audio_data(cfg, cfg.data.Example_audio)
53
+ dataset = MultiModalDataset([video], [audio])
54
+ else:
55
+ # load the largest one first
56
+ freesound = load_audio_data(cfg, cfg.data.FreeSound)
57
+ vgg = load_vgg_data(cfg, cfg.data.ExtractedVGG)
58
+ audiocaps = load_audio_data(cfg, cfg.data.AudioCaps)
59
+ audioset_sl = load_audio_data(cfg, cfg.data.AudioSetSL)
60
+ bbcsound = load_audio_data(cfg, cfg.data.BBCSound)
61
+ clotho = load_audio_data(cfg, cfg.data.Clotho)
62
+ dataset = MultiModalDataset([vgg] * cfg.vgg_oversample_rate,
63
+ [audiocaps, audioset_sl, bbcsound, freesound, clotho])
64
+
65
+ batch_size = cfg.batch_size
66
+ num_workers = cfg.num_workers
67
+ pin_memory = cfg.pin_memory
68
+ sampler, loader = construct_loader(dataset,
69
+ batch_size,
70
+ num_workers,
71
+ shuffle=True,
72
+ drop_last=True,
73
+ pin_memory=pin_memory)
74
+
75
+ return dataset, sampler, loader
76
+
77
+
78
+ def setup_test_datasets(cfg):
79
+ dataset = load_vgg_data(cfg, cfg.data.ExtractedVGG_test)
80
+
81
+ batch_size = cfg.batch_size
82
+ num_workers = cfg.num_workers
83
+ pin_memory = cfg.pin_memory
84
+ sampler, loader = construct_loader(dataset,
85
+ batch_size,
86
+ num_workers,
87
+ shuffle=False,
88
+ drop_last=False,
89
+ pin_memory=pin_memory)
90
+
91
+ return dataset, sampler, loader
92
+
93
+
94
+ def setup_val_datasets(cfg: DictConfig) -> tuple[Dataset, DataLoader, DataLoader]:
95
+ if cfg.example_train:
96
+ dataset = load_vgg_data(cfg, cfg.data.Example_video)
97
+ else:
98
+ dataset = load_vgg_data(cfg, cfg.data.ExtractedVGG_val)
99
+
100
+ val_batch_size = cfg.batch_size
101
+ val_eval_batch_size = cfg.eval_batch_size
102
+ num_workers = cfg.num_workers
103
+ pin_memory = cfg.pin_memory
104
+ _, val_loader = construct_loader(dataset,
105
+ val_batch_size,
106
+ num_workers,
107
+ shuffle=False,
108
+ drop_last=False,
109
+ pin_memory=pin_memory)
110
+ _, eval_loader = construct_loader(dataset,
111
+ val_eval_batch_size,
112
+ num_workers,
113
+ shuffle=False,
114
+ drop_last=False,
115
+ pin_memory=pin_memory)
116
+
117
+ return dataset, val_loader, eval_loader
118
+
119
+
120
+ def setup_eval_dataset(dataset_name: str, cfg: DictConfig) -> tuple[Dataset, DataLoader]:
121
+ if dataset_name.startswith('audiocaps_full'):
122
+ dataset = AudioCapsData(cfg.eval_data.AudioCaps_full.audio_path,
123
+ cfg.eval_data.AudioCaps_full.csv_path)
124
+ elif dataset_name.startswith('audiocaps'):
125
+ dataset = AudioCapsData(cfg.eval_data.AudioCaps.audio_path,
126
+ cfg.eval_data.AudioCaps.csv_path)
127
+ elif dataset_name.startswith('moviegen'):
128
+ dataset = MovieGen(cfg.eval_data.MovieGen.video_path,
129
+ cfg.eval_data.MovieGen.jsonl_path,
130
+ duration_sec=cfg.duration_s)
131
+ elif dataset_name.startswith('vggsound'):
132
+ dataset = VGGSound(cfg.eval_data.VGGSound.video_path,
133
+ cfg.eval_data.VGGSound.csv_path,
134
+ duration_sec=cfg.duration_s)
135
+ else:
136
+ raise ValueError(f'Invalid dataset name: {dataset_name}')
137
+
138
+ batch_size = cfg.batch_size
139
+ num_workers = cfg.num_workers
140
+ pin_memory = cfg.pin_memory
141
+ _, loader = construct_loader(dataset,
142
+ batch_size,
143
+ num_workers,
144
+ shuffle=False,
145
+ drop_last=False,
146
+ pin_memory=pin_memory,
147
+ error_avoidance=True)
148
+ return dataset, loader
149
+
150
+
151
+ def error_avoidance_collate(batch):
152
+ batch = list(filter(lambda x: x is not None, batch))
153
+ return default_collate(batch)
154
+
155
+
156
+ def construct_loader(dataset: Dataset,
157
+ batch_size: int,
158
+ num_workers: int,
159
+ *,
160
+ shuffle: bool = True,
161
+ drop_last: bool = True,
162
+ pin_memory: bool = False,
163
+ error_avoidance: bool = False) -> tuple[DistributedSampler, DataLoader]:
164
+ train_sampler = DistributedSampler(dataset, rank=local_rank, shuffle=shuffle)
165
+ train_loader = DataLoader(dataset,
166
+ batch_size,
167
+ sampler=train_sampler,
168
+ num_workers=num_workers,
169
+ worker_init_fn=worker_init_fn,
170
+ drop_last=drop_last,
171
+ persistent_workers=num_workers > 0,
172
+ pin_memory=pin_memory,
173
+ collate_fn=error_avoidance_collate if error_avoidance else None)
174
+ return train_sampler, train_loader
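One detail worth noting in `data_setup.py`: when `error_avoidance=True`, the loader swaps in `error_avoidance_collate`, which drops any sample the dataset returned as `None` (however the dataset chooses to signal a failed item) instead of crashing the whole batch. A tiny illustration of that collate behaviour, independent of the distributed setup:

```python
import torch
from torch.utils.data.dataloader import default_collate

def error_avoidance_collate(batch):
    # Same idea as in data_setup.py: drop failed (None) samples before collating.
    batch = list(filter(lambda x: x is not None, batch))
    return default_collate(batch)

batch = [{'x': torch.zeros(2)}, None, {'x': torch.ones(2)}]
out = error_avoidance_collate(batch)
print(out['x'].shape)  # torch.Size([2, 2]) -- the None sample was skipped
```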
{mmaudio/ext/bigvgan_v2/alias_free_activation/cuda → third_party/MMAudio/mmaudio/data/eval}/__init__.py RENAMED
File without changes
third_party/MMAudio/mmaudio/data/eval/audiocaps.py ADDED
@@ -0,0 +1,39 @@
 
 
 
1
+ import logging
2
+ import os
3
+ from collections import defaultdict
4
+ from pathlib import Path
5
+ from typing import Union
6
+
7
+ import pandas as pd
8
+ import torch
9
+ from torch.utils.data.dataset import Dataset
10
+
11
+ log = logging.getLogger()
12
+
13
+
14
+ class AudioCapsData(Dataset):
15
+
16
+ def __init__(self, audio_path: Union[str, Path], csv_path: Union[str, Path]):
17
+ df = pd.read_csv(csv_path).to_dict(orient='records')
18
+
19
+ audio_files = sorted(os.listdir(audio_path))
20
+ audio_files = set(
21
+ [Path(f).stem for f in audio_files if f.endswith('.wav') or f.endswith('.flac')])
22
+
23
+ self.data = []
24
+ for row in df:
25
+ self.data.append({
26
+ 'name': row['name'],
27
+ 'caption': row['caption'],
28
+ })
29
+
30
+ self.audio_path = Path(audio_path)
31
+ self.csv_path = Path(csv_path)
32
+
33
+ log.info(f'Found {len(self.data)} matching audio files in {self.audio_path}')
34
+
35
+ def __getitem__(self, idx: int) -> torch.Tensor:
36
+ return self.data[idx]
37
+
38
+ def __len__(self):
39
+ return len(self.data)
third_party/MMAudio/mmaudio/data/eval/moviegen.py ADDED
@@ -0,0 +1,131 @@
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ from pathlib import Path
5
+ from typing import Union
6
+
7
+ import torch
8
+ from torch.utils.data.dataset import Dataset
9
+ from torchvision.transforms import v2
10
+ from torio.io import StreamingMediaDecoder
11
+
12
+ from mmaudio.utils.dist_utils import local_rank
13
+
14
+ log = logging.getLogger()
15
+
16
+ _CLIP_SIZE = 384
17
+ _CLIP_FPS = 8.0
18
+
19
+ _SYNC_SIZE = 224
20
+ _SYNC_FPS = 25.0
21
+
22
+
23
+ class MovieGenData(Dataset):
24
+
25
+ def __init__(
26
+ self,
27
+ video_root: Union[str, Path],
28
+ sync_root: Union[str, Path],
29
+ jsonl_root: Union[str, Path],
30
+ *,
31
+ duration_sec: float = 10.0,
32
+ read_clip: bool = True,
33
+ ):
34
+ self.video_root = Path(video_root)
35
+ self.sync_root = Path(sync_root)
36
+ self.jsonl_root = Path(jsonl_root)
37
+ self.read_clip = read_clip
38
+
39
+ videos = sorted(os.listdir(self.video_root))
40
+ videos = [v[:-4] for v in videos] # remove extensions
41
+ self.captions = {}
42
+
43
+ for v in videos:
44
+ with open(self.jsonl_root / (v + '.jsonl')) as f:
45
+ data = json.load(f)
46
+ self.captions[v] = data['audio_prompt']
47
+
48
+ if local_rank == 0:
49
+ log.info(f'{len(videos)} videos found in {video_root}')
50
+
51
+ self.duration_sec = duration_sec
52
+
53
+ self.clip_expected_length = int(_CLIP_FPS * self.duration_sec)
54
+ self.sync_expected_length = int(_SYNC_FPS * self.duration_sec)
55
+
56
+ self.clip_augment = v2.Compose([
57
+ v2.Resize((_CLIP_SIZE, _CLIP_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
58
+ v2.ToImage(),
59
+ v2.ToDtype(torch.float32, scale=True),
60
+ ])
61
+
62
+ self.sync_augment = v2.Compose([
63
+ v2.Resize((_SYNC_SIZE, _SYNC_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
64
+ v2.CenterCrop(_SYNC_SIZE),
65
+ v2.ToImage(),
66
+ v2.ToDtype(torch.float32, scale=True),
67
+ v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
68
+ ])
69
+
70
+ self.videos = videos
71
+
72
+ def sample(self, idx: int) -> dict[str, torch.Tensor]:
73
+ video_id = self.videos[idx]
74
+ caption = self.captions[video_id]
75
+
76
+ reader = StreamingMediaDecoder(self.video_root / (video_id + '.mp4'))
77
+ reader.add_basic_video_stream(
78
+ frames_per_chunk=int(_CLIP_FPS * self.duration_sec),
79
+ frame_rate=_CLIP_FPS,
80
+ format='rgb24',
81
+ )
82
+ reader.add_basic_video_stream(
83
+ frames_per_chunk=int(_SYNC_FPS * self.duration_sec),
84
+ frame_rate=_SYNC_FPS,
85
+ format='rgb24',
86
+ )
87
+
88
+ reader.fill_buffer()
89
+ data_chunk = reader.pop_chunks()
90
+
91
+ clip_chunk = data_chunk[0]
92
+ sync_chunk = data_chunk[1]
93
+ if clip_chunk is None:
94
+ raise RuntimeError(f'CLIP video returned None {video_id}')
95
+ if clip_chunk.shape[0] < self.clip_expected_length:
96
+ raise RuntimeError(f'CLIP video too short {video_id}')
97
+
98
+ if sync_chunk is None:
99
+ raise RuntimeError(f'Sync video returned None {video_id}')
100
+ if sync_chunk.shape[0] < self.sync_expected_length:
101
+ raise RuntimeError(f'Sync video too short {video_id}')
102
+
103
+ # truncate the video
104
+ clip_chunk = clip_chunk[:self.clip_expected_length]
105
+ if clip_chunk.shape[0] != self.clip_expected_length:
106
+ raise RuntimeError(f'CLIP video wrong length {video_id}, '
107
+ f'expected {self.clip_expected_length}, '
108
+ f'got {clip_chunk.shape[0]}')
109
+ clip_chunk = self.clip_augment(clip_chunk)
110
+
111
+ sync_chunk = sync_chunk[:self.sync_expected_length]
112
+ if sync_chunk.shape[0] != self.sync_expected_length:
113
+ raise RuntimeError(f'Sync video wrong length {video_id}, '
114
+ f'expected {self.sync_expected_length}, '
115
+ f'got {sync_chunk.shape[0]}')
116
+ sync_chunk = self.sync_augment(sync_chunk)
117
+
118
+ data = {
119
+ 'name': video_id,
120
+ 'caption': caption,
121
+ 'clip_video': clip_chunk,
122
+ 'sync_video': sync_chunk,
123
+ }
124
+
125
+ return data
126
+
127
+ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
128
+ return self.sample(idx)
129
+
130
+ def __len__(self):
131
+ return len(self.captions)
third_party/MMAudio/mmaudio/data/eval/video_dataset.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ from pathlib import Path
5
+ from typing import Union
6
+
7
+ import pandas as pd
8
+ import torch
9
+ from torch.utils.data.dataset import Dataset
10
+ from torchvision.transforms import v2
11
+ from torio.io import StreamingMediaDecoder
12
+
13
+ from mmaudio.utils.dist_utils import local_rank
14
+
15
+ log = logging.getLogger()
16
+
17
+ _CLIP_SIZE = 384
18
+ _CLIP_FPS = 8.0
19
+
20
+ _SYNC_SIZE = 224
21
+ _SYNC_FPS = 25.0
22
+
23
+
24
+ class VideoDataset(Dataset):
25
+
26
+ def __init__(
27
+ self,
28
+ video_root: Union[str, Path],
29
+ *,
30
+ duration_sec: float = 8.0,
31
+ ):
32
+ self.video_root = Path(video_root)
33
+
34
+ self.duration_sec = duration_sec
35
+
36
+ self.clip_expected_length = int(_CLIP_FPS * self.duration_sec)
37
+ self.sync_expected_length = int(_SYNC_FPS * self.duration_sec)
38
+
39
+ self.clip_transform = v2.Compose([
40
+ v2.Resize((_CLIP_SIZE, _CLIP_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
41
+ v2.ToImage(),
42
+ v2.ToDtype(torch.float32, scale=True),
43
+ ])
44
+
45
+ self.sync_transform = v2.Compose([
46
+ v2.Resize(_SYNC_SIZE, interpolation=v2.InterpolationMode.BICUBIC),
47
+ v2.CenterCrop(_SYNC_SIZE),
48
+ v2.ToImage(),
49
+ v2.ToDtype(torch.float32, scale=True),
50
+ v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
51
+ ])
52
+
53
+ # to be implemented by subclasses
54
+ self.captions = {}
55
+ self.videos = sorted(list(self.captions.keys()))
56
+
57
+ def sample(self, idx: int) -> dict[str, torch.Tensor]:
58
+ video_id = self.videos[idx]
59
+ caption = self.captions[video_id]
60
+
61
+ reader = StreamingMediaDecoder(self.video_root / (video_id + '.mp4'))
62
+ reader.add_basic_video_stream(
63
+ frames_per_chunk=int(_CLIP_FPS * self.duration_sec),
64
+ frame_rate=_CLIP_FPS,
65
+ format='rgb24',
66
+ )
67
+ reader.add_basic_video_stream(
68
+ frames_per_chunk=int(_SYNC_FPS * self.duration_sec),
69
+ frame_rate=_SYNC_FPS,
70
+ format='rgb24',
71
+ )
72
+
73
+ reader.fill_buffer()
74
+ data_chunk = reader.pop_chunks()
75
+
76
+ clip_chunk = data_chunk[0]
77
+ sync_chunk = data_chunk[1]
78
+ if clip_chunk is None:
79
+ raise RuntimeError(f'CLIP video returned None {video_id}')
80
+ if clip_chunk.shape[0] < self.clip_expected_length:
81
+ raise RuntimeError(
82
+ f'CLIP video too short {video_id}, expected {self.clip_expected_length}, got {clip_chunk.shape[0]}'
83
+ )
84
+
85
+ if sync_chunk is None:
86
+ raise RuntimeError(f'Sync video returned None {video_id}')
87
+ if sync_chunk.shape[0] < self.sync_expected_length:
88
+ raise RuntimeError(
89
+ f'Sync video too short {video_id}, expected {self.sync_expected_length}, got {sync_chunk.shape[0]}'
90
+ )
91
+
92
+ # truncate the video
93
+ clip_chunk = clip_chunk[:self.clip_expected_length]
94
+ if clip_chunk.shape[0] != self.clip_expected_length:
95
+ raise RuntimeError(f'CLIP video wrong length {video_id}, '
96
+ f'expected {self.clip_expected_length}, '
97
+ f'got {clip_chunk.shape[0]}')
98
+ clip_chunk = self.clip_transform(clip_chunk)
99
+
100
+ sync_chunk = sync_chunk[:self.sync_expected_length]
101
+ if sync_chunk.shape[0] != self.sync_expected_length:
102
+ raise RuntimeError(f'Sync video wrong length {video_id}, '
103
+ f'expected {self.sync_expected_length}, '
104
+ f'got {sync_chunk.shape[0]}')
105
+ sync_chunk = self.sync_transform(sync_chunk)
106
+
107
+ data = {
108
+ 'name': video_id,
109
+ 'caption': caption,
110
+ 'clip_video': clip_chunk,
111
+ 'sync_video': sync_chunk,
112
+ }
113
+
114
+ return data
115
+
116
+ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
117
+ try:
118
+ return self.sample(idx)
119
+ except Exception as e:
120
+ log.error(f'Error loading video {self.videos[idx]}: {e}')
121
+ return None
122
+
123
+ def __len__(self):
124
+ return len(self.captions)
125
+
126
+
127
+ class VGGSound(VideoDataset):
128
+
129
+ def __init__(
130
+ self,
131
+ video_root: Union[str, Path],
132
+ csv_path: Union[str, Path],
133
+ *,
134
+ duration_sec: float = 8.0,
135
+ ):
136
+ super().__init__(video_root, duration_sec=duration_sec)
137
+ self.video_root = Path(video_root)
138
+ self.csv_path = Path(csv_path)
139
+
140
+ videos = sorted(os.listdir(self.video_root))
141
+ if local_rank == 0:
142
+ log.info(f'{len(videos)} videos found in {video_root}')
143
+ self.captions = {}
144
+
145
+ df = pd.read_csv(csv_path, header=None, names=['id', 'sec', 'caption',
146
+ 'split']).to_dict(orient='records')
147
+
148
+ videos_no_found = []
149
+ for row in df:
150
+ if row['split'] == 'test':
151
+ start_sec = int(row['sec'])
152
+ video_id = str(row['id'])
153
+ # this is how our videos are named
154
+ video_name = f'{video_id}_{start_sec:06d}'
155
+ if video_name + '.mp4' not in videos:
156
+ videos_no_found.append(video_name)
157
+ continue
158
+
159
+ self.captions[video_name] = row['caption']
160
+
161
+ if local_rank == 0:
162
+ log.info(f'{len(videos)} videos found in {video_root}')
163
+ log.info(f'{len(self.captions)} useable videos found')
164
+ if videos_no_found:
165
+ log.info(f'{len(videos_no_found)} found in {csv_path} but not in {video_root}')
166
+ log.info(
167
+ 'A small amount is expected, as not all videos are still available on YouTube')
168
+
169
+ self.videos = sorted(list(self.captions.keys()))
170
+
171
+
172
+ class MovieGen(VideoDataset):
173
+
174
+ def __init__(
175
+ self,
176
+ video_root: Union[str, Path],
177
+ jsonl_root: Union[str, Path],
178
+ *,
179
+ duration_sec: float = 10.0,
180
+ ):
181
+ super().__init__(video_root, duration_sec=duration_sec)
182
+ self.video_root = Path(video_root)
183
+ self.jsonl_root = Path(jsonl_root)
184
+
185
+ videos = sorted(os.listdir(self.video_root))
186
+ videos = [v[:-4] for v in videos] # remove extensions
187
+ self.captions = {}
188
+
189
+ for v in videos:
190
+ with open(self.jsonl_root / (v + '.jsonl')) as f:
191
+ data = json.load(f)
192
+ self.captions[v] = data['audio_prompt']
193
+
194
+ if local_rank == 0:
195
+ log.info(f'{len(videos)} videos found in {video_root}')
196
+
197
+ self.videos = videos
third_party/MMAudio/mmaudio/data/extracted_audio.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from pathlib import Path
3
+ from typing import Union
4
+
5
+ import pandas as pd
6
+ import torch
7
+ from tensordict import TensorDict
8
+ from torch.utils.data.dataset import Dataset
9
+
10
+ from mmaudio.utils.dist_utils import local_rank
11
+
12
+ log = logging.getLogger()
13
+
14
+
15
+ class ExtractedAudio(Dataset):
16
+
17
+ def __init__(
18
+ self,
19
+ tsv_path: Union[str, Path],
20
+ *,
21
+ premade_mmap_dir: Union[str, Path],
22
+ data_dim: dict[str, int],
23
+ ):
24
+ super().__init__()
25
+
26
+ self.data_dim = data_dim
27
+ self.df_list = pd.read_csv(tsv_path, sep='\t').to_dict('records')
28
+ self.ids = [str(d['id']) for d in self.df_list]
29
+
30
+ log.info(f'Loading precomputed mmap from {premade_mmap_dir}')
31
+ # load precomputed memory mapped tensors
32
+ premade_mmap_dir = Path(premade_mmap_dir)
33
+ td = TensorDict.load_memmap(premade_mmap_dir)
34
+ log.info(f'Loaded precomputed mmap from {premade_mmap_dir}')
35
+ self.mean = td['mean']
36
+ self.std = td['std']
37
+ self.text_features = td['text_features']
38
+
39
+ log.info(f'Loaded {len(self)} samples from {premade_mmap_dir}.')
40
+ log.info(f'Loaded mean: {self.mean.shape}.')
41
+ log.info(f'Loaded std: {self.std.shape}.')
42
+ log.info(f'Loaded text features: {self.text_features.shape}.')
43
+
44
+ assert self.mean.shape[1] == self.data_dim['latent_seq_len'], \
45
+ f'{self.mean.shape[1]} != {self.data_dim["latent_seq_len"]}'
46
+ assert self.std.shape[1] == self.data_dim['latent_seq_len'], \
47
+ f'{self.std.shape[1]} != {self.data_dim["latent_seq_len"]}'
48
+
49
+ assert self.text_features.shape[1] == self.data_dim['text_seq_len'], \
50
+ f'{self.text_features.shape[1]} != {self.data_dim["text_seq_len"]}'
51
+ assert self.text_features.shape[-1] == self.data_dim['text_dim'], \
52
+ f'{self.text_features.shape[-1]} != {self.data_dim["text_dim"]}'
53
+
54
+ self.fake_clip_features = torch.zeros(self.data_dim['clip_seq_len'],
55
+ self.data_dim['clip_dim'])
56
+ self.fake_sync_features = torch.zeros(self.data_dim['sync_seq_len'],
57
+ self.data_dim['sync_dim'])
58
+ self.video_exist = torch.tensor(0, dtype=torch.bool)
59
+ self.text_exist = torch.tensor(1, dtype=torch.bool)
60
+
61
+ def compute_latent_stats(self) -> tuple[torch.Tensor, torch.Tensor]:
62
+ latents = self.mean
63
+ return latents.mean(dim=(0, 1)), latents.std(dim=(0, 1))
64
+
65
+ def get_memory_mapped_tensor(self) -> TensorDict:
66
+ td = TensorDict({
67
+ 'mean': self.mean,
68
+ 'std': self.std,
69
+ 'text_features': self.text_features,
70
+ })
71
+ return td
72
+
73
+ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
74
+ data = {
75
+ 'id': str(self.df_list[idx]['id']),
76
+ 'a_mean': self.mean[idx],
77
+ 'a_std': self.std[idx],
78
+ 'clip_features': self.fake_clip_features,
79
+ 'sync_features': self.fake_sync_features,
80
+ 'text_features': self.text_features[idx],
81
+ 'caption': self.df_list[idx]['caption'],
82
+ 'video_exist': self.video_exist,
83
+ 'text_exist': self.text_exist,
84
+ }
85
+ return data
86
+
87
+ def __len__(self):
88
+ return len(self.ids)
third_party/MMAudio/mmaudio/data/extracted_vgg.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from pathlib import Path
3
+ from typing import Union
4
+
5
+ import pandas as pd
6
+ import torch
7
+ from tensordict import TensorDict
8
+ from torch.utils.data.dataset import Dataset
9
+
10
+ from mmaudio.utils.dist_utils import local_rank
11
+
12
+ log = logging.getLogger()
13
+
14
+
15
+ class ExtractedVGG(Dataset):
16
+
17
+ def __init__(
18
+ self,
19
+ tsv_path: Union[str, Path],
20
+ *,
21
+ premade_mmap_dir: Union[str, Path],
22
+ data_dim: dict[str, int],
23
+ ):
24
+ super().__init__()
25
+
26
+ self.data_dim = data_dim
27
+ self.df_list = pd.read_csv(tsv_path, sep='\t').to_dict('records')
28
+ self.ids = [d['id'] for d in self.df_list]
29
+
30
+ log.info(f'Loading precomputed mmap from {premade_mmap_dir}')
31
+ # load precomputed memory mapped tensors
32
+ premade_mmap_dir = Path(premade_mmap_dir)
33
+ td = TensorDict.load_memmap(premade_mmap_dir)
34
+ log.info(f'Loaded precomputed mmap from {premade_mmap_dir}')
35
+ self.mean = td['mean']
36
+ self.std = td['std']
37
+ self.clip_features = td['clip_features']
38
+ self.sync_features = td['sync_features']
39
+ self.text_features = td['text_features']
40
+
41
+ if local_rank == 0:
42
+ log.info(f'Loaded {len(self)} samples.')
43
+ log.info(f'Loaded mean: {self.mean.shape}.')
44
+ log.info(f'Loaded std: {self.std.shape}.')
45
+ log.info(f'Loaded clip_features: {self.clip_features.shape}.')
46
+ log.info(f'Loaded sync_features: {self.sync_features.shape}.')
47
+ log.info(f'Loaded text_features: {self.text_features.shape}.')
48
+
49
+ assert self.mean.shape[1] == self.data_dim['latent_seq_len'], \
50
+ f'{self.mean.shape[1]} != {self.data_dim["latent_seq_len"]}'
51
+ assert self.std.shape[1] == self.data_dim['latent_seq_len'], \
52
+ f'{self.std.shape[1]} != {self.data_dim["latent_seq_len"]}'
53
+
54
+ assert self.clip_features.shape[1] == self.data_dim['clip_seq_len'], \
55
+ f'{self.clip_features.shape[1]} != {self.data_dim["clip_seq_len"]}'
56
+ assert self.sync_features.shape[1] == self.data_dim['sync_seq_len'], \
57
+ f'{self.sync_features.shape[1]} != {self.data_dim["sync_seq_len"]}'
58
+ assert self.text_features.shape[1] == self.data_dim['text_seq_len'], \
59
+ f'{self.text_features.shape[1]} != {self.data_dim["text_seq_len"]}'
60
+
61
+ assert self.clip_features.shape[-1] == self.data_dim['clip_dim'], \
62
+ f'{self.clip_features.shape[-1]} != {self.data_dim["clip_dim"]}'
63
+ assert self.sync_features.shape[-1] == self.data_dim['sync_dim'], \
64
+ f'{self.sync_features.shape[-1]} != {self.data_dim["sync_dim"]}'
65
+ assert self.text_features.shape[-1] == self.data_dim['text_dim'], \
66
+ f'{self.text_features.shape[-1]} != {self.data_dim["text_dim"]}'
67
+
68
+ self.video_exist = torch.tensor(1, dtype=torch.bool)
69
+ self.text_exist = torch.tensor(1, dtype=torch.bool)
70
+
71
+ def compute_latent_stats(self) -> tuple[torch.Tensor, torch.Tensor]:
72
+ latents = self.mean
73
+ return latents.mean(dim=(0, 1)), latents.std(dim=(0, 1))
74
+
75
+ def get_memory_mapped_tensor(self) -> TensorDict:
76
+ td = TensorDict({
77
+ 'mean': self.mean,
78
+ 'std': self.std,
79
+ 'clip_features': self.clip_features,
80
+ 'sync_features': self.sync_features,
81
+ 'text_features': self.text_features,
82
+ })
83
+ return td
84
+
85
+ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
86
+ data = {
87
+ 'id': self.df_list[idx]['id'],
88
+ 'a_mean': self.mean[idx],
89
+ 'a_std': self.std[idx],
90
+ 'clip_features': self.clip_features[idx],
91
+ 'sync_features': self.sync_features[idx],
92
+ 'text_features': self.text_features[idx],
93
+ 'caption': self.df_list[idx]['label'],
94
+ 'video_exist': self.video_exist,
95
+ 'text_exist': self.text_exist,
96
+ }
97
+
98
+ return data
99
+
100
+ def __len__(self):
101
+ return len(self.ids)
{mmaudio/model → third_party/MMAudio/mmaudio/data/extraction}/__init__.py RENAMED
File without changes
third_party/MMAudio/mmaudio/data/extraction/vgg_sound.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from pathlib import Path
4
+ from typing import Optional, Union
5
+
6
+ import pandas as pd
7
+ import torch
8
+ import torchaudio
9
+ from torch.utils.data.dataset import Dataset
10
+ from torchvision.transforms import v2
11
+ from torio.io import StreamingMediaDecoder
12
+
13
+ from mmaudio.utils.dist_utils import local_rank
14
+
15
+ log = logging.getLogger()
16
+
17
+ _CLIP_SIZE = 384
18
+ _CLIP_FPS = 8.0
19
+
20
+ _SYNC_SIZE = 224
21
+ _SYNC_FPS = 25.0
22
+
23
+
24
+ class VGGSound(Dataset):
25
+
26
+ def __init__(
27
+ self,
28
+ root: Union[str, Path],
29
+ *,
30
+ tsv_path: Union[str, Path] = 'sets/vgg3-train.tsv',
31
+ sample_rate: int = 16_000,
32
+ duration_sec: float = 8.0,
33
+ audio_samples: Optional[int] = None,
34
+ normalize_audio: bool = False,
35
+ ):
36
+ self.root = Path(root)
37
+ self.normalize_audio = normalize_audio
38
+ if audio_samples is None:
39
+ self.audio_samples = int(sample_rate * duration_sec)
40
+ else:
41
+ self.audio_samples = audio_samples
42
+ effective_duration = audio_samples / sample_rate
43
+ # make sure the duration is close enough, within 15ms
44
+ assert abs(effective_duration - duration_sec) < 0.015, \
45
+ f'audio_samples {audio_samples} does not match duration_sec {duration_sec}'
46
+
47
+ videos = sorted(os.listdir(self.root))
48
+ videos = set([Path(v).stem for v in videos]) # remove extensions
49
+ self.labels = {}
50
+ self.videos = []
51
+ missing_videos = []
52
+
53
+ # read the tsv for subset information
54
+ df_list = pd.read_csv(tsv_path, sep='\t', dtype={'id': str}).to_dict('records')
55
+ for record in df_list:
56
+ id = record['id']
57
+ label = record['label']
58
+ if id in videos:
59
+ self.labels[id] = label
60
+ self.videos.append(id)
61
+ else:
62
+ missing_videos.append(id)
63
+
64
+ if local_rank == 0:
65
+ log.info(f'{len(videos)} videos found in {root}')
66
+ log.info(f'{len(self.videos)} videos found in {tsv_path}')
67
+ log.info(f'{len(missing_videos)} videos missing in {root}')
68
+
69
+ self.sample_rate = sample_rate
70
+ self.duration_sec = duration_sec
71
+
72
+ self.expected_audio_length = audio_samples
73
+ self.clip_expected_length = int(_CLIP_FPS * self.duration_sec)
74
+ self.sync_expected_length = int(_SYNC_FPS * self.duration_sec)
75
+
76
+ self.clip_transform = v2.Compose([
77
+ v2.Resize((_CLIP_SIZE, _CLIP_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
78
+ v2.ToImage(),
79
+ v2.ToDtype(torch.float32, scale=True),
80
+ ])
81
+
82
+ self.sync_transform = v2.Compose([
83
+ v2.Resize(_SYNC_SIZE, interpolation=v2.InterpolationMode.BICUBIC),
84
+ v2.CenterCrop(_SYNC_SIZE),
85
+ v2.ToImage(),
86
+ v2.ToDtype(torch.float32, scale=True),
87
+ v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
88
+ ])
89
+
90
+ self.resampler = {}
91
+
92
+ def sample(self, idx: int) -> dict[str, torch.Tensor]:
93
+ video_id = self.videos[idx]
94
+ label = self.labels[video_id]
95
+
96
+ reader = StreamingMediaDecoder(self.root / (video_id + '.mp4'))
97
+ reader.add_basic_video_stream(
98
+ frames_per_chunk=int(_CLIP_FPS * self.duration_sec),
99
+ frame_rate=_CLIP_FPS,
100
+ format='rgb24',
101
+ )
102
+ reader.add_basic_video_stream(
103
+ frames_per_chunk=int(_SYNC_FPS * self.duration_sec),
104
+ frame_rate=_SYNC_FPS,
105
+ format='rgb24',
106
+ )
107
+ reader.add_basic_audio_stream(frames_per_chunk=2**30, )
108
+
109
+ reader.fill_buffer()
110
+ data_chunk = reader.pop_chunks()
111
+
112
+ clip_chunk = data_chunk[0]
113
+ sync_chunk = data_chunk[1]
114
+ audio_chunk = data_chunk[2]
115
+
116
+ if clip_chunk is None:
117
+ raise RuntimeError(f'CLIP video returned None {video_id}')
118
+ if clip_chunk.shape[0] < self.clip_expected_length:
119
+ raise RuntimeError(
120
+ f'CLIP video too short {video_id}, expected {self.clip_expected_length}, got {clip_chunk.shape[0]}'
121
+ )
122
+
123
+ if sync_chunk is None:
124
+ raise RuntimeError(f'Sync video returned None {video_id}')
125
+ if sync_chunk.shape[0] < self.sync_expected_length:
126
+ raise RuntimeError(
127
+ f'Sync video too short {video_id}, expected {self.sync_expected_length}, got {sync_chunk.shape[0]}'
128
+ )
129
+
130
+ # process audio
131
+ sample_rate = int(reader.get_out_stream_info(2).sample_rate)
132
+ audio_chunk = audio_chunk.transpose(0, 1)
133
+ audio_chunk = audio_chunk.mean(dim=0) # mono
134
+ if self.normalize_audio:
135
+ abs_max = audio_chunk.abs().max()
136
+ audio_chunk = audio_chunk / abs_max * 0.95
137
+ if abs_max <= 1e-6:
138
+ raise RuntimeError(f'Audio is silent {video_id}')
139
+
140
+ # resample
141
+ if sample_rate == self.sample_rate:
142
+ audio_chunk = audio_chunk
143
+ else:
144
+ if sample_rate not in self.resampler:
145
+ # https://pytorch.org/audio/stable/tutorials/audio_resampling_tutorial.html#kaiser-best
146
+ self.resampler[sample_rate] = torchaudio.transforms.Resample(
147
+ sample_rate,
148
+ self.sample_rate,
149
+ lowpass_filter_width=64,
150
+ rolloff=0.9475937167399596,
151
+ resampling_method='sinc_interp_kaiser',
152
+ beta=14.769656459379492,
153
+ )
154
+ audio_chunk = self.resampler[sample_rate](audio_chunk)
155
+
156
+ if audio_chunk.shape[0] < self.expected_audio_length:
157
+ raise RuntimeError(f'Audio too short {video_id}')
158
+ audio_chunk = audio_chunk[:self.expected_audio_length]
159
+
160
+ # truncate the video
161
+ clip_chunk = clip_chunk[:self.clip_expected_length]
162
+ if clip_chunk.shape[0] != self.clip_expected_length:
163
+ raise RuntimeError(f'CLIP video wrong length {video_id}, '
164
+ f'expected {self.clip_expected_length}, '
165
+ f'got {clip_chunk.shape[0]}')
166
+ clip_chunk = self.clip_transform(clip_chunk)
167
+
168
+ sync_chunk = sync_chunk[:self.sync_expected_length]
169
+ if sync_chunk.shape[0] != self.sync_expected_length:
170
+ raise RuntimeError(f'Sync video wrong length {video_id}, '
171
+ f'expected {self.sync_expected_length}, '
172
+ f'got {sync_chunk.shape[0]}')
173
+ sync_chunk = self.sync_transform(sync_chunk)
174
+
175
+ data = {
176
+ 'id': video_id,
177
+ 'caption': label,
178
+ 'audio': audio_chunk,
179
+ 'clip_video': clip_chunk,
180
+ 'sync_video': sync_chunk,
181
+ }
182
+
183
+ return data
184
+
185
+ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
186
+ try:
187
+ return self.sample(idx)
188
+ except Exception as e:
189
+ log.error(f'Error loading video {self.videos[idx]}: {e}')
190
+ return None
191
+
192
+ def __len__(self):
193
+ return len(self.labels)
third_party/MMAudio/mmaudio/data/extraction/wav_dataset.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from pathlib import Path
4
+ from typing import Union
5
+
6
+ import open_clip
7
+ import pandas as pd
8
+ import torch
9
+ import torchaudio
10
+ from torch.utils.data.dataset import Dataset
11
+
12
+ log = logging.getLogger()
13
+
14
+
15
+ class WavTextClipsDataset(Dataset):
16
+
17
+ def __init__(
18
+ self,
19
+ root: Union[str, Path],
20
+ *,
21
+ captions_tsv: Union[str, Path],
22
+ clips_tsv: Union[str, Path],
23
+ sample_rate: int,
24
+ num_samples: int,
25
+ normalize_audio: bool = False,
26
+ reject_silent: bool = False,
27
+ tokenizer_id: str = 'ViT-H-14-378-quickgelu',
28
+ ):
29
+ self.root = Path(root)
30
+ self.sample_rate = sample_rate
31
+ self.num_samples = num_samples
32
+ self.normalize_audio = normalize_audio
33
+ self.reject_silent = reject_silent
34
+ self.tokenizer = open_clip.get_tokenizer(tokenizer_id)
35
+
36
+ audios = sorted(os.listdir(self.root))
37
+ audios = set([
38
+ Path(audio).stem for audio in audios
39
+ if audio.endswith('.wav') or audio.endswith('.flac')
40
+ ])
41
+ self.captions = {}
42
+
43
+ # read the caption tsv
44
+ df_list = pd.read_csv(captions_tsv, sep='\t', dtype={'id': str}).to_dict('records')
45
+ for record in df_list:
46
+ id = record['id']
47
+ caption = record['caption']
48
+ self.captions[id] = caption
49
+
50
+ # read the clip tsv
51
+ df_list = pd.read_csv(clips_tsv, sep='\t', dtype={
52
+ 'id': str,
53
+ 'name': str
54
+ }).to_dict('records')
55
+ self.clips = []
56
+ for record in df_list:
57
+ record['id'] = record['id']
58
+ record['name'] = record['name']
59
+ id = record['id']
60
+ name = record['name']
61
+ if name not in self.captions:
62
+ log.warning(f'Audio {name} not found in {captions_tsv}')
63
+ continue
64
+ record['caption'] = self.captions[name]
65
+ self.clips.append(record)
66
+
67
+ log.info(f'Found {len(self.clips)} audio files in {self.root}')
68
+
69
+ self.resampler = {}
70
+
71
+ def __getitem__(self, idx: int) -> torch.Tensor:
72
+ try:
73
+ clip = self.clips[idx]
74
+ audio_name = clip['name']
75
+ audio_id = clip['id']
76
+ caption = clip['caption']
77
+ start_sample = clip['start_sample']
78
+ end_sample = clip['end_sample']
79
+
80
+ audio_path = self.root / f'{audio_name}.flac'
81
+ if not audio_path.exists():
82
+ audio_path = self.root / f'{audio_name}.wav'
83
+ assert audio_path.exists()
84
+
85
+ audio_chunk, sample_rate = torchaudio.load(audio_path)
86
+ audio_chunk = audio_chunk.mean(dim=0) # mono
87
+ abs_max = audio_chunk.abs().max()
88
+ if self.normalize_audio:
89
+ audio_chunk = audio_chunk / abs_max * 0.95
90
+
91
+ if self.reject_silent and abs_max < 1e-6:
92
+ log.warning(f'Rejecting silent audio')
93
+ return None
94
+
95
+ audio_chunk = audio_chunk[start_sample:end_sample]
96
+
97
+ # resample
98
+ if sample_rate == self.sample_rate:
99
+ audio_chunk = audio_chunk
100
+ else:
101
+ if sample_rate not in self.resampler:
102
+ # https://pytorch.org/audio/stable/tutorials/audio_resampling_tutorial.html#kaiser-best
103
+ self.resampler[sample_rate] = torchaudio.transforms.Resample(
104
+ sample_rate,
105
+ self.sample_rate,
106
+ lowpass_filter_width=64,
107
+ rolloff=0.9475937167399596,
108
+ resampling_method='sinc_interp_kaiser',
109
+ beta=14.769656459379492,
110
+ )
111
+ audio_chunk = self.resampler[sample_rate](audio_chunk)
112
+
113
+ if audio_chunk.shape[0] < self.num_samples:
114
+ raise ValueError('Audio is too short')
115
+ audio_chunk = audio_chunk[:self.num_samples]
116
+
117
+ tokens = self.tokenizer([caption])[0]
118
+
119
+ output = {
120
+ 'waveform': audio_chunk,
121
+ 'id': audio_id,
122
+ 'caption': caption,
123
+ 'tokens': tokens,
124
+ }
125
+
126
+ return output
127
+ except Exception as e:
128
+ log.error(f'Error reading {audio_path}: {e}')
129
+ return None
130
+
131
+ def __len__(self):
132
+ return len(self.clips)
third_party/MMAudio/mmaudio/data/mm_dataset.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import bisect
2
+
3
+ import torch
4
+ from torch.utils.data.dataset import Dataset
5
+
6
+
7
+ # modified from https://pytorch.org/docs/stable/_modules/torch/utils/data/dataset.html#ConcatDataset
8
+ class MultiModalDataset(Dataset):
9
+ datasets: list[Dataset]
10
+ cumulative_sizes: list[int]
11
+
12
+ @staticmethod
13
+ def cumsum(sequence):
14
+ r, s = [], 0
15
+ for e in sequence:
16
+ l = len(e)
17
+ r.append(l + s)
18
+ s += l
19
+ return r
20
+
21
+ def __init__(self, video_datasets: list[Dataset], audio_datasets: list[Dataset]):
22
+ super().__init__()
23
+ self.video_datasets = list(video_datasets)
24
+ self.audio_datasets = list(audio_datasets)
25
+ self.datasets = self.video_datasets + self.audio_datasets
26
+
27
+ self.cumulative_sizes = self.cumsum(self.datasets)
28
+
29
+ def __len__(self):
30
+ return self.cumulative_sizes[-1]
31
+
32
+ def __getitem__(self, idx):
33
+ if idx < 0:
34
+ if -idx > len(self):
35
+ raise ValueError("absolute value of index should not exceed dataset length")
36
+ idx = len(self) + idx
37
+ dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
38
+ if dataset_idx == 0:
39
+ sample_idx = idx
40
+ else:
41
+ sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
42
+ return self.datasets[dataset_idx][sample_idx]
43
+
44
+ def compute_latent_stats(self) -> tuple[torch.Tensor, torch.Tensor]:
45
+ return self.video_datasets[0].compute_latent_stats()
third_party/MMAudio/mmaudio/data/utils.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import random
4
+ import tempfile
5
+ from pathlib import Path
6
+ from typing import Any, Optional, Union
7
+
8
+ import torch
9
+ import torch.distributed as dist
10
+ from tensordict import MemoryMappedTensor
11
+ from torch.utils.data import DataLoader
12
+ from torch.utils.data.dataset import Dataset
13
+ from tqdm import tqdm
14
+
15
+ from mmaudio.utils.dist_utils import local_rank, world_size
16
+
17
+ scratch_path = Path(os.environ['SLURM_SCRATCH'] if 'SLURM_SCRATCH' in os.environ else '/dev/shm')
18
+ shm_path = Path('/dev/shm')
19
+
20
+ log = logging.getLogger()
21
+
22
+
23
+ def reseed(seed):
24
+ random.seed(seed)
25
+ torch.manual_seed(seed)
26
+
27
+
28
+ def local_scatter_torch(obj: Optional[Any]):
29
+ if world_size == 1:
30
+ # Just one worker. Do nothing.
31
+ return obj
32
+
33
+ array = [obj] * world_size
34
+ target_array = [None]
35
+ if local_rank == 0:
36
+ dist.scatter_object_list(target_array, scatter_object_input_list=array, src=0)
37
+ else:
38
+ dist.scatter_object_list(target_array, scatter_object_input_list=None, src=0)
39
+ return target_array[0]
40
+
41
+
42
+ class ShardDataset(Dataset):
43
+
44
+ def __init__(self, root):
45
+ self.root = root
46
+ self.shards = sorted(os.listdir(root))
47
+
48
+ def __len__(self):
49
+ return len(self.shards)
50
+
51
+ def __getitem__(self, idx):
52
+ return torch.load(os.path.join(self.root, self.shards[idx]), weights_only=True)
53
+
54
+
55
+ def get_tmp_dir(in_memory: bool) -> Path:
56
+ return shm_path if in_memory else scratch_path
57
+
58
+
59
+ def load_shards_and_share(data_path: Union[str, Path], ids: list[int],
60
+ in_memory: bool) -> MemoryMappedTensor:
61
+ if local_rank == 0:
62
+ with tempfile.NamedTemporaryFile(prefix='shared-tensor-', dir=get_tmp_dir(in_memory)) as f:
63
+ log.info(f'Loading shards from {data_path} into {f.name}...')
64
+ data = load_shards(data_path, ids=ids, tmp_file_path=f.name)
65
+ data = share_tensor_to_all(data)
66
+ torch.distributed.barrier()
67
+ f.close() # why does the context manager not close the file for me?
68
+ else:
69
+ log.info('Waiting for the data to be shared with me...')
70
+ data = share_tensor_to_all(None)
71
+ torch.distributed.barrier()
72
+
73
+ return data
74
+
75
+
76
+ def load_shards(
77
+ data_path: Union[str, Path],
78
+ ids: list[int],
79
+ *,
80
+ tmp_file_path: str,
81
+ ) -> Union[torch.Tensor, dict[str, torch.Tensor]]:
82
+
83
+ id_set = set(ids)
84
+ shards = sorted(os.listdir(data_path))
85
+ log.info(f'Found {len(shards)} shards in {data_path}.')
86
+ first_shard = torch.load(os.path.join(data_path, shards[0]), weights_only=True)
87
+
88
+ log.info(f'Rank {local_rank} created file {tmp_file_path}')
89
+ first_item = next(iter(first_shard.values()))
90
+ log.info(f'First item shape: {first_item.shape}')
91
+ mm_tensor = MemoryMappedTensor.empty(shape=(len(ids), *first_item.shape),
92
+ dtype=torch.float32,
93
+ filename=tmp_file_path,
94
+ existsok=True)
95
+ total_count = 0
96
+ used_index = set()
97
+ id_indexing = {i: idx for idx, i in enumerate(ids)}
98
+ # faster with no workers; otherwise we need to set_sharing_strategy('file_system')
99
+ loader = DataLoader(ShardDataset(data_path), batch_size=1, num_workers=0)
100
+ for data in tqdm(loader, desc='Loading shards'):
101
+ for i, v in data.items():
102
+ if i not in id_set:
103
+ continue
104
+
105
+ # tensor_index = ids.index(i)
106
+ tensor_index = id_indexing[i]
107
+ if tensor_index in used_index:
108
+ raise ValueError(f'Duplicate id {i} found in {data_path}.')
109
+ used_index.add(tensor_index)
110
+ mm_tensor[tensor_index] = v
111
+ total_count += 1
112
+
113
+ assert total_count == len(ids), f'Expected {len(ids)} tensors, got {total_count}.'
114
+ log.info(f'Loaded {total_count} tensors from {data_path}.')
115
+
116
+ return mm_tensor
117
+
118
+
119
+ def share_tensor_to_all(x: Optional[MemoryMappedTensor]) -> MemoryMappedTensor:
120
+ """
121
+ x: the tensor to be shared; None if local_rank != 0
122
+ return: the shared tensor
123
+ """
124
+
125
+ # there is no need to share your stuff with anyone if you are alone; must be in memory
126
+ if world_size == 1:
127
+ return x
128
+
129
+ if local_rank == 0:
130
+ assert x is not None, 'x must not be None if local_rank == 0'
131
+ else:
132
+ assert x is None, 'x must be None if local_rank != 0'
133
+
134
+ if local_rank == 0:
135
+ filename = x.filename
136
+ meta_information = (filename, x.shape, x.dtype)
137
+ else:
138
+ meta_information = None
139
+
140
+ filename, data_shape, data_type = local_scatter_torch(meta_information)
141
+ if local_rank == 0:
142
+ data = x
143
+ else:
144
+ data = MemoryMappedTensor.from_filename(filename=filename,
145
+ dtype=data_type,
146
+ shape=data_shape)
147
+
148
+ return data