Spaces: Running
lym0302 committed
Commit 1fd4e9c · Parent(s): 9d9a9d8
our
This view is limited to 50 files because it contains too many changes. See raw diff.
- README.md +90 -86
- app.py +138 -251
- demo.py +0 -135
- docs/images/icon.png +0 -0
- docs/index.html +0 -147
- docs/style.css +0 -78
- docs/style_videos.css +0 -52
- docs/video_gen.html +0 -254
- docs/video_main.html +0 -98
- docs/video_vgg.html +0 -452
- {mmaudio → pipeline}/__init__.py +0 -0
- pipeline/__pycache__/__init__.cpython-310.pyc +0 -0
- pipeline/__pycache__/__init__.cpython-38.pyc +0 -0
- pipeline/__pycache__/pipeline.cpython-310.pyc +0 -0
- pipeline/__pycache__/pipeline.cpython-38.pyc +0 -0
- pipeline/__pycache__/step0.cpython-310.pyc +0 -0
- pipeline/__pycache__/step0.cpython-38.pyc +0 -0
- pipeline/__pycache__/step1.cpython-310.pyc +0 -0
- pipeline/__pycache__/step1.cpython-38.pyc +0 -0
- pipeline/__pycache__/step2.cpython-310.pyc +0 -0
- pipeline/__pycache__/step2.cpython-38.pyc +0 -0
- pipeline/__pycache__/step3.cpython-310.pyc +0 -0
- pipeline/__pycache__/step3.cpython-38.pyc +0 -0
- pipeline/__pycache__/step4.cpython-310.pyc +0 -0
- pipeline/__pycache__/step4.cpython-38.pyc +0 -0
- pipeline/pipeline.py +175 -0
- pipeline/step0.py +39 -0
- pipeline/step1.py +36 -0
- pipeline/step2.py +52 -0
- pipeline/step3.py +129 -0
- pipeline/step4.py +31 -0
- pyproject.toml +0 -52
- requirements.txt.bak +0 -27
- third_party/MMAudio/.gitignore +146 -0
- third_party/MMAudio/LICENSE +21 -0
- {mmaudio/data → third_party/MMAudio/mmaudio}/__init__.py +0 -0
- {mmaudio/ext/bigvgan_v2 → third_party/MMAudio/mmaudio/data}/__init__.py +0 -0
- {mmaudio → third_party/MMAudio/mmaudio}/data/av_utils.py +30 -4
- third_party/MMAudio/mmaudio/data/data_setup.py +174 -0
- {mmaudio/ext/bigvgan_v2/alias_free_activation/cuda → third_party/MMAudio/mmaudio/data/eval}/__init__.py +0 -0
- third_party/MMAudio/mmaudio/data/eval/audiocaps.py +39 -0
- third_party/MMAudio/mmaudio/data/eval/moviegen.py +131 -0
- third_party/MMAudio/mmaudio/data/eval/video_dataset.py +197 -0
- third_party/MMAudio/mmaudio/data/extracted_audio.py +88 -0
- third_party/MMAudio/mmaudio/data/extracted_vgg.py +101 -0
- {mmaudio/model → third_party/MMAudio/mmaudio/data/extraction}/__init__.py +0 -0
- third_party/MMAudio/mmaudio/data/extraction/vgg_sound.py +193 -0
- third_party/MMAudio/mmaudio/data/extraction/wav_dataset.py +132 -0
- third_party/MMAudio/mmaudio/data/mm_dataset.py +45 -0
- third_party/MMAudio/mmaudio/data/utils.py +148 -0
README.md
CHANGED
@@ -1,6 +1,5 @@
The front matter keeps `title: DeepSound-V1`, `colorFrom: blue`, `colorTo: indigo`, `sdk: gradio`, and `pinned: false`; the `emoji: 🔊` line is removed.

@@ -9,155 +8,160 @@ pinned: false
Removed (old MMAudio README content):
- The MMAudio heading, badges, and author list: Ho Kei Cheng, Masato Ishii, Akio Hayakawa, Takashi Shibuya, Alexander Schwing, Yuki Mitsufuji.
- Highlight: "Our key innovation is multimodal joint training which allows training on a wide range of audio-visual and audio-text datasets. Moreover, a synchronization module aligns the generated audio with the video frames."
- The Results section ("Videos from Sora:", "Videos from MovieGen/Hunyuan Video/VGGSound:", "For more results, visit https://hkchengrex.com/MMAudio/video_main.html.").
- Installation notes ("We have only tested this on Ubuntu.", Python, PyTorch 2.5.1+ with matching torchvision/torchaudio, `ffmpeg<7` via `conda install -c conda-forge 'ffmpeg<7'`) and the accompanying install commands.
- The pretrained-models section: the MD5 note pointing to `mmaudio/utils/download_utils.py`, the download table (mmaudio_small_16k.pth 601M, mmaudio_small_44k.pth 601M, mmaudio_medium_44k.pth 2.4G, mmaudio_large_44k.pth 3.9G, v1-16.pth 655M, best_netG.pt 429M, v1-44.pth 1.2G, synchformer_state_dict.pth 907M), the note that the 44.1kHz vocoder downloads automatically, and the expected `ext_weights/` and `weights/` directory trees.
- Demo details (`python demo.py`, the 8-second default duration note), the "### Gradio interface" section (`python gradio_demo.py`), the "### Known limitations" list (unintelligible human speech-like sounds, unwanted background music, unfamiliar concepts such as "RPG firing"), the "## Training" section ("Work in progress."), and the old acknowledgements (Make-An-Audio 2 for the 16kHz BigVGAN pretrained model, BigVGAN, Synchformer).

Added (new DeepSound-V1 README content):

<!-- # DeepSound-V1
Official code for DeepSound-V1 -->

<div align="center">
<p align="center">
<h2>DeepSound-V1</h2>
<a href="https://github.com/lym0302/DeepSound-V1">Paper</a> | <a href="https://github.com/lym0302/DeepSound-V1">Webpage</a> | <a href="https://github.com/lym0302/DeepSound-V1">Huggingface Demo</a>
</p>
</div>

## [DeepSound-V1: Start to Think Step-by-Step in the Audio Generation from Videos](https://github.com/lym0302/DeepSound-V1)

(The old author list, affiliations, and an "ICCV 2025" note are kept only as HTML comments.)

## Highlight
DeepSound-V1 is a framework that enables audio generation from videos with initial step-by-step thinking, without extra annotations, based on the internal chain-of-thought (CoT) of a multimodal large language model (MLLM).

(The old MMAudio Results section, with an added "Videos from Veo 2:" entry, is kept only as an HTML comment.)

## Installation
```bash
conda create -n deepsound-v1 python=3.10.16 -y
conda activate deepsound-v1
pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu120
pip install flash-attn==2.5.8 --no-build-isolation
pip install -e .
pip install -r reqirments.txt
```
(The old MMAudio prerequisites, clone/install steps, and model-download notes are kept only as HTML comments.)

## Demo

### Pretrained models
See [MODELS.md](docs/MODELS.md).

### Command-line interface
With `demo.py`
```bash
python demo.py -i <video_path>
```
All training parameters are [here]().

(The old output-format notes and the "### Gradio interface" section are kept only as HTML comments.)

## Evaluation
Refer to [av-benchmark](https://github.com/hkchengrex/av-benchmark) for benchmarking results.
See [EVAL.md](docs/EVAL.md).

## Citation
(A BibTeX entry for "Taming Multimodal Joint Training for High-Quality Video-to-Audio Synthesis", CVPR 2025, is kept only as an HTML comment.)

## Relevant Repositories
- [av-benchmark](https://github.com/hkchengrex/av-benchmark) for benchmarking results.

## Acknowledgement
Many thanks to:
- [VideoLLaMA2](https://github.com/DAMO-NLP-SG/VideoLLaMA2)
- [MMAudio](https://github.com/hkchengrex/MMAudio)
- [FoleyCrafter](https://github.com/open-mmlab/FoleyCrafter)
- [BS-RoFormer](https://github.com/ZFTurbo/Music-Source-Separation-Training)
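The new installation block above pins Python 3.10.16, a cu120 PyTorch wheel, and flash-attn 2.5.8. Below is a minimal post-install sanity check, sketched here and not part of the commit; the module name `flash_attn` is the conventional import for that wheel and is an assumption rather than something this diff shows.

```python
# Minimal sanity check for the "deepsound-v1" environment created above (not from the commit).
import torch

print("torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("device:", torch.cuda.get_device_name(0))

try:
    import flash_attn  # installed via `pip install flash-attn==2.5.8`; import name assumed
    print("flash-attn:", flash_attn.__version__)
except ImportError:
    print("flash-attn not importable; re-run the install step.")
```

Running this inside the activated environment should report a CUDA-capable device before moving on to the demo.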
app.py
CHANGED
@@ -1,275 +1,162 @@
Removed (old MMAudio Gradio app): the `spaces`/logging/datetime/torchaudio imports, the try/except `pip install -e .` fallback for importing `mmaudio`, the MMAudio setup (TF32 flags, `device = 'cpu'`, bfloat16, the `large_44k_v2` config with `download_if_needed()`, `output_dir = Path('./output/gradio')`), the old `video_to_audio` body that called `generate(...)` and saved a temporary `.flac`, the MMAudio-branded `video_to_audio_tab` (Seed/Num steps/Guidance Strength/Duration inputs, `outputs='playable_video'`, eleven example clips from https://huggingface.co/hkchengrex/MMAudio), the `text_to_audio_tab`, and the old two-tab launch.

Added (new app.py):

```python
import os
import sys
import time
import gradio as gr
import subprocess
from pathlib import Path
import requests
from moviepy.editor import AudioFileClip, VideoFileClip

project_root = os.path.dirname(os.path.abspath(__file__))
mmaudio_path = os.path.join(project_root, 'third_party', 'MMAudio')
sys.path.append(mmaudio_path)

from pipeline.pipeline import Pipeline
from third_party.MMAudio.mmaudio.eval_utils import setup_eval_logging

# # download model
# os.makedirs("pretrained/mllm", exist_ok=True)
# from huggingface_hub import snapshot_download
# repo_local_path = snapshot_download(repo_id="lym0302/VideoLLaMA2.1-7B-AV-CoT", cache_dir='pretrained/mllm')

# remove_vo_model_dir = "pretrained/remove_vo/checkpoints"
# os.makedirs(remove_vo_model_dir, exist_ok=True)
# urls = ["https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/model_bs_roformer_ep_317_sdr_12.9755.ckpt",
#         "https://raw.githubusercontent.com/ZFTurbo/Music-Source-Separation-Training/main/configs/viperx/model_bs_roformer_ep_317_sdr_12.9755.yaml"]
# for url in urls:
#     file_name = url.split("/")[-1]  # Extract file name from URL
#     file_path = os.path.join(remove_vo_model_dir, file_name)
#     response = requests.get(url, stream=True)
#     if response.status_code == 200:
#         with open(file_path, "wb") as f:
#             for chunk in response.iter_content(chunk_size=8192):  # Use a chunk size of 8 KB
#                 f.write(chunk)
#         print(f"File downloaded successfully and saved to {file_path}")
#     else:
#         print(f"Failed to download the file. Status code: {response.status_code}")

# os.makedirs("pretrained/v2a/mmaudio", exist_ok=True)

setup_eval_logging()
pipeline = Pipeline(
    step0_model_dir='pretrained/mllm/models--lym0302--VideoLLaMA2.1-7B-AV-CoT',
    step1_mode='mmaudio_medium_44k',
    step2_model_dir='pretrained/mllm/models--lym0302--VideoLLaMA2.1-7B-AV-CoT',
    step2_mode='cot',
    step3_mode='bs_roformer',
)

output_dir = "output_gradio"
os.makedirs(output_dir, exist_ok=True)
skip_final_video = False


def video_to_audio(video_input: gr.Video,
                   prompt: str = '',
                   negative_prompt: str = '',
                   mode: str = 's4',
                   postp_mode: str = 'neg',
                   duration: float = 10,
                   seed: int = 42):

    log_messages = []  # accumulated log history

    def log_info(msg):
        log_messages.append(msg)
        return "\n".join(log_messages)  # return the full log history every time

    if not video_input:
        yield None, log_info("Error: No video input provided.")
        return

    yield None, log_info("Generate high-quality audio from video step-by-step...")  # initial log entry

    st_infer = time.time()
    video_input = str(video_input)

    for step_results in pipeline.run_for_gradio(video_input=video_input,
                                                output_dir=output_dir,
                                                mode=mode,
                                                postp_mode=postp_mode,
                                                prompt=prompt,
                                                negative_prompt=negative_prompt,
                                                duration=duration,
                                                seed=seed):
        if step_results['log'] == 'Finish step-by-step v2a.':
            break
        else:
            yield None, log_info(step_results['log'])

    temp_final_audio_path = step_results["temp_final_audio_path"]
    temp_final_video_path = step_results["temp_final_video_path"]

    video_name_stem = Path(video_input).stem
    final_audio_path = str(Path(output_dir) / f'{video_name_stem}.wav')
    final_video_path = str(Path(output_dir) / f'{video_name_stem}.mp4')

    if temp_final_audio_path is not None:
        subprocess.run(['cp', str(temp_final_audio_path), final_audio_path], check=True)
        step_results["final_audio_path"] = final_audio_path

    if skip_final_video:
        step_results["final_video_path"] = None
    else:
        if temp_final_video_path is not None:
            subprocess.run(['cp', str(temp_final_video_path), final_video_path], check=True)
        else:
            # composite the generated audio onto the input video with moviepy
            audio = AudioFileClip(final_audio_path)
            video = VideoFileClip(video_input)
            duration = min(audio.duration, video.duration)
            audio = audio.subclip(0, duration)
            video.audio = audio
            video = video.subclip(0, duration)
            video.write_videofile(final_video_path)
        step_results["final_video_path"] = final_video_path

    et_infer = time.time()
    print(f"Inference time: {et_infer - st_infer:.2f} s.")
    print("step_results: ", step_results)

    yield (final_video_path if os.path.exists(final_video_path) else None), log_info(step_results['log'])


video_to_audio_tab = gr.Interface(
    fn=video_to_audio,
    # Project page: <a href="https://hkchengrex.com/MMAudio/">https://hkchengrex.com/MMAudio/</a><br>
    description="""
    Code: <a href="https://github.com/lym0302/DeepSound-V1">https://github.com/lym0302/DeepSound-V1</a><br>

    NOTE: It takes longer to process high-resolution videos (>384 px on the shorter side).
    Doing so does not improve results.

    This is a step-by-step v2a process and may take a long time.
    If Post Processing is set to 'rm', the generated video may be None.
    """,
    inputs=[
        gr.Video(),
        gr.Text(label='Prompt'),
        gr.Text(label='Negative prompt', value=''),
        gr.Radio(["s3", "s4"], label="Mode", value="s4"),
        gr.Radio(["rm", "rep", "neg"], label="Post Processing", value="neg"),
        gr.Number(label='Duration (sec)', value=10, minimum=1),
        gr.Number(label='Seed (42: random)', value=42, precision=0, minimum=-1),
    ],
    outputs=[gr.Video(label="Generated Video"), gr.Text(label="Logs")],
    cache_examples=False,
    title='DeepSound-V1 — Video-to-Audio Synthesis',
)

if __name__ == "__main__":
    gr.TabbedInterface([video_to_audio_tab],
                       ['Video-to-Audio']).launch(allowed_paths=[output_dir])


# if __name__ == "__main__":
#     port = 8000
#     gr.TabbedInterface([video_to_audio_tab, ],
#                        ['Video-to-Audio', ]).launch(
#         server_port=port, allowed_paths=[output_dir])
```
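The new `app.py` above drives the pipeline through a Gradio UI. Below is a minimal headless sketch of the same flow, not part of the commit; it assumes the checkpoint paths used by `app.py` are already in place, that `pipeline.run_for_gradio` yields dictionaries with the same `log`, `temp_final_audio_path`, and `temp_final_video_path` keys it yields there, and that `my_video.mp4` is a placeholder input.

```python
# Headless sketch of the step-by-step v2a flow from app.py (assumptions noted in the lead-in).
import os
import sys

project_root = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.join(project_root, 'third_party', 'MMAudio'))

from pipeline.pipeline import Pipeline

pipeline = Pipeline(
    step0_model_dir='pretrained/mllm/models--lym0302--VideoLLaMA2.1-7B-AV-CoT',
    step1_mode='mmaudio_medium_44k',
    step2_model_dir='pretrained/mllm/models--lym0302--VideoLLaMA2.1-7B-AV-CoT',
    step2_mode='cot',
    step3_mode='bs_roformer',
)

last = None
for step_results in pipeline.run_for_gradio(video_input='my_video.mp4',   # placeholder path
                                            output_dir='output_headless',
                                            mode='s4',
                                            postp_mode='neg',
                                            prompt='',
                                            negative_prompt='',
                                            duration=10,
                                            seed=42):
    print(step_results['log'])  # per-step progress, as app.py streams to the Logs box
    last = step_results

print('audio:', last['temp_final_audio_path'])
print('video:', last['temp_final_video_path'])
```

This mirrors how `app.py` consumes the generator: intermediate dictionaries carry per-step logs, and the final one carries the temporary audio/video paths to copy or composite.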
demo.py
DELETED
@@ -1,135 +0,0 @@
```python
import logging
from argparse import ArgumentParser
from pathlib import Path

import torch
import torchaudio

from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate, load_video, make_video,
                                setup_eval_logging)
from mmaudio.model.flow_matching import FlowMatching
from mmaudio.model.networks import MMAudio, get_my_mmaudio
from mmaudio.model.utils.features_utils import FeaturesUtils

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

log = logging.getLogger()


@torch.inference_mode()
def main():
    setup_eval_logging()

    parser = ArgumentParser()
    parser.add_argument('--variant',
                        type=str,
                        default='large_44k_v2',
                        help='small_16k, small_44k, medium_44k, large_44k, large_44k_v2')
    parser.add_argument('--video', type=Path, help='Path to the video file')
    parser.add_argument('--prompt', type=str, help='Input prompt', default='')
    parser.add_argument('--negative_prompt', type=str, help='Negative prompt', default='')
    parser.add_argument('--duration', type=float, default=8.0)
    parser.add_argument('--cfg_strength', type=float, default=4.5)
    parser.add_argument('--num_steps', type=int, default=25)

    parser.add_argument('--mask_away_clip', action='store_true')

    parser.add_argument('--output', type=Path, help='Output directory', default='./output')
    parser.add_argument('--seed', type=int, help='Random seed', default=42)
    parser.add_argument('--skip_video_composite', action='store_true')
    parser.add_argument('--full_precision', action='store_true')

    args = parser.parse_args()

    if args.variant not in all_model_cfg:
        raise ValueError(f'Unknown model variant: {args.variant}')
    model: ModelConfig = all_model_cfg[args.variant]
    model.download_if_needed()
    seq_cfg = model.seq_cfg

    if args.video:
        video_path: Path = Path(args.video).expanduser()
    else:
        video_path = None
    prompt: str = args.prompt
    negative_prompt: str = args.negative_prompt
    output_dir: str = args.output.expanduser()
    seed: int = args.seed
    num_steps: int = args.num_steps
    duration: float = args.duration
    cfg_strength: float = args.cfg_strength
    skip_video_composite: bool = args.skip_video_composite
    mask_away_clip: bool = args.mask_away_clip

    device = 'cuda'
    dtype = torch.float32 if args.full_precision else torch.bfloat16

    output_dir.mkdir(parents=True, exist_ok=True)

    # load a pretrained model
    net: MMAudio = get_my_mmaudio(model.model_name).to(device, dtype).eval()
    net.load_weights(torch.load(model.model_path, map_location=device, weights_only=True))
    log.info(f'Loaded weights from {model.model_path}')

    # misc setup
    rng = torch.Generator(device=device)
    rng.manual_seed(seed)
    fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)

    feature_utils = FeaturesUtils(tod_vae_ckpt=model.vae_path,
                                  synchformer_ckpt=model.synchformer_ckpt,
                                  enable_conditions=True,
                                  mode=model.mode,
                                  bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
                                  need_vae_encoder=False)
    feature_utils = feature_utils.to(device, dtype).eval()

    if video_path is not None:
        log.info(f'Using video {video_path}')
        video_info = load_video(video_path, duration)
        clip_frames = video_info.clip_frames
        sync_frames = video_info.sync_frames
        duration = video_info.duration_sec
        if mask_away_clip:
            clip_frames = None
        else:
            clip_frames = clip_frames.unsqueeze(0)
        sync_frames = sync_frames.unsqueeze(0)
    else:
        log.info('No video provided -- text-to-audio mode')
        clip_frames = sync_frames = None

    seq_cfg.duration = duration
    net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)

    log.info(f'Prompt: {prompt}')
    log.info(f'Negative prompt: {negative_prompt}')

    audios = generate(clip_frames,
                      sync_frames, [prompt],
                      negative_text=[negative_prompt],
                      feature_utils=feature_utils,
                      net=net,
                      fm=fm,
                      rng=rng,
                      cfg_strength=cfg_strength)
    audio = audios.float().cpu()[0]
    if video_path is not None:
        save_path = output_dir / f'{video_path.stem}.flac'
    else:
        safe_filename = prompt.replace(' ', '_').replace('/', '_').replace('.', '')
        save_path = output_dir / f'{safe_filename}.flac'
    torchaudio.save(save_path, audio, seq_cfg.sampling_rate)

    log.info(f'Audio saved to {save_path}')
    if video_path is not None and not skip_video_composite:
        video_save_path = output_dir / f'{video_path.stem}.mp4'
        make_video(video_info, video_save_path, audio, sampling_rate=seq_cfg.sampling_rate)
        log.info(f'Video saved to {output_dir / video_save_path}')

    log.info('Memory usage: %.2f GB', torch.cuda.max_memory_allocated() / (2**30))


if __name__ == '__main__':
    main()
```
docs/images/icon.png
DELETED
Binary file (163 Bytes)
docs/index.html
DELETED
@@ -1,147 +0,0 @@
Removed: the landing page of the old MMAudio project site. It carried the paper title "Taming Multimodal Joint Training for High-Quality Video-to-Audio Synthesis" (arXiv 2024), the author list with affiliations (University of Illinois Urbana-Champaign, Sony AI, Sony Group Corporation), links to the paper (placeholder) and code, a TL;DR ("MMAudio generates synchronized audio given video and/or text inputs."), a link to video_main.html, an embedded YouTube demo, and Google Analytics/Bootstrap boilerplate.
docs/style.css
DELETED
@@ -1,78 +0,0 @@
Removed: the stylesheet for the old MMAudio project page (Source Sans 3 typography, table/video/link/heading styles, gradient horizontal rules, and 16:9 responsive `.video-container` iframe rules).
docs/style_videos.css
DELETED
@@ -1,52 +0,0 @@
Removed: the stylesheet for the old MMAudio video-result pages (viewport-relative typography, table and video styles, 16:9 `.video-container` iframe rules, and `.video-header` styling).
docs/video_gen.html
DELETED
@@ -1,254 +0,0 @@
Removed: the video-results page of the old MMAudio project site. It held side-by-side YouTube embeds comparing Movie Gen Audio with MMAudio on MovieGen videos (ice cracking, rhythmic water splashing, a shovel scraping dry earth, and a mashed-potatoes failure case), plus results on videos generated by Hunyuan (typing, rushing water, waves on a beach, a water droplet), Sora (ships riding waves, a train, a seashore, a surfing failure case with unprompted music), Mochi 1 (magical fire and lightning, a storm), and LTX-Video (firewood burning, a waterfall).
docs/video_main.html DELETED
@@ -1,98 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<!-- Google tag (gtag.js) -->
<script async src="https://www.googletagmanager.com/gtag/js?id=G-0JKBJ3WRJZ"></script>
<script>
window.dataLayer = window.dataLayer || [];
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'G-0JKBJ3WRJZ');
</script>

<link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
<meta charset="UTF-8">
<title>MMAudio</title>

<link rel="icon" type="image/png" href="images/icon.png">

<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no">
<!-- CSS only -->
<link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet"
integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.7.1/jquery.min.js"></script>

<link rel="stylesheet" href="style_videos.css">

<script type="text/javascript">
$(document).ready(function(){
$("#content").load("video_gen.html #moviegen_all");
$("#load_moveigen").click(function(){
$("#content").load("video_gen.html #moviegen_all");
});
$("#load_hunyuan_sora").click(function(){
$("#content").load("video_gen.html #hunyuan_sora_all");
});
$("#load_mochi_ltx").click(function(){
$("#content").load("video_gen.html #mochi_ltx_all");
});
$("#load_vgg1").click(function(){
$("#content").load("video_vgg.html #vgg1");
});
$("#load_vgg2").click(function(){
$("#content").load("video_vgg.html #vgg2");
});
$("#load_vgg3").click(function(){
$("#content").load("video_vgg.html #vgg3");
});
$("#load_vgg4").click(function(){
$("#content").load("video_vgg.html #vgg4");
});
$("#load_vgg5").click(function(){
$("#content").load("video_vgg.html #vgg5");
});
$("#load_vgg6").click(function(){
$("#content").load("video_vgg.html #vgg6");
});
$("#load_vgg_extra").click(function(){
$("#content").load("video_vgg.html #vgg_extra");
});
});
</script>
</head>
<body>
<h1 id="index" style="text-align: center;">Index</h1>
<p><b>(Click on the links to load the corresponding videos)</b> <span style="float:right;"><a href="index.html">Back to project page</a></span></p>

<ol>
<li>
<a href="#" id="load_moveigen">Comparisons with Movie Gen Audio on Videos Generated by MovieGen</a>
</li>
<li>
<a href="#" id="load_hunyuan_sora">Results on Videos Generated by Hunyuan and Sora</a>
</li>
<li>
<a href="#" id="load_mochi_ltx">Results on Videos Generated by Mochi 1 and LTX-Video</a>
</li>
<li>
On VGGSound
<ol>
<li><a id='load_vgg1' href="#">Example 1: Wolf howling</a></li>
<li><a id='load_vgg2' href="#">Example 2: Striking a golf ball</a></li>
<li><a id='load_vgg3' href="#">Example 3: Hitting a drum</a></li>
<li><a id='load_vgg4' href="#">Example 4: Dog barking</a></li>
<li><a id='load_vgg5' href="#">Example 5: Playing a string instrument</a></li>
<li><a id='load_vgg6' href="#">Example 6: A group of people playing tambourines</a></li>
<li><a id='load_vgg_extra' href="#">Extra results & failure cases</a></li>
</ol>
</li>
</ol>

<div id="content" class="container-fluid">

</div>
<br>
<br>

</body>
</html>
docs/video_vgg.html DELETED
@@ -1,452 +0,0 @@
|
|
1 |
-
<!DOCTYPE html>
|
2 |
-
<html lang="en">
|
3 |
-
<head>
|
4 |
-
<!-- Google tag (gtag.js) -->
|
5 |
-
<script async src="https://www.googletagmanager.com/gtag/js?id=G-0JKBJ3WRJZ"></script>
|
6 |
-
<script>
|
7 |
-
window.dataLayer = window.dataLayer || [];
|
8 |
-
function gtag(){dataLayer.push(arguments);}
|
9 |
-
gtag('js', new Date());
|
10 |
-
gtag('config', 'G-0JKBJ3WRJZ');
|
11 |
-
</script>
|
12 |
-
|
13 |
-
<link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
|
14 |
-
<meta charset="UTF-8">
|
15 |
-
<title>MMAudio</title>
|
16 |
-
|
17 |
-
<meta name="viewport" content="width=device-width, initial-scale=1">
|
18 |
-
<!-- CSS only -->
|
19 |
-
<link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet"
|
20 |
-
integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
|
21 |
-
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
|
22 |
-
|
23 |
-
<link rel="stylesheet" href="style_videos.css">
|
24 |
-
</head>
|
25 |
-
<body>
|
26 |
-
|
27 |
-
<div id="vgg1">
|
28 |
-
<h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
|
29 |
-
<p style="overflow: hidden;">
|
30 |
-
Example 1: Wolf howling.
|
31 |
-
<span style="float:right;"><a href="#index">Back to index</a></span>
|
32 |
-
</p>
|
33 |
-
<div class="row g-1">
|
34 |
-
<div class="col-sm-3">
|
35 |
-
<div class="video-header">Ground-truth</div>
|
36 |
-
<div class="video-container">
|
37 |
-
<iframe src="https://youtube.com/embed/9J_V74gqMUA"></iframe>
|
38 |
-
</div>
|
39 |
-
</div>
|
40 |
-
<div class="col-sm-3">
|
41 |
-
<div class="video-header">Ours</div>
|
42 |
-
<div class="video-container">
|
43 |
-
<iframe src="https://youtube.com/embed/P6O8IpjErPc"></iframe>
|
44 |
-
</div>
|
45 |
-
</div>
|
46 |
-
<div class="col-sm-3">
|
47 |
-
<div class="video-header">V2A-Mapper</div>
|
48 |
-
<div class="video-container">
|
49 |
-
<iframe src="https://youtube.com/embed/w-5eyqepvTk"></iframe>
|
50 |
-
</div>
|
51 |
-
</div>
|
52 |
-
<div class="col-sm-3">
|
53 |
-
<div class="video-header">FoleyCrafter</div>
|
54 |
-
<div class="video-container">
|
55 |
-
<iframe src="https://youtube.com/embed/VOLfoZlRkzo"></iframe>
|
56 |
-
</div>
|
57 |
-
</div>
|
58 |
-
</div>
|
59 |
-
<div class="row g-1">
|
60 |
-
<div class="col-sm-3">
|
61 |
-
<div class="video-header">Frieren</div>
|
62 |
-
<div class="video-container">
|
63 |
-
<iframe src="https://youtube.com/embed/49owKyA5Pa8"></iframe>
|
64 |
-
</div>
|
65 |
-
</div>
|
66 |
-
<div class="col-sm-3">
|
67 |
-
<div class="video-header">VATT</div>
|
68 |
-
<div class="video-container">
|
69 |
-
<iframe src="https://youtube.com/embed/QVtrFgbeGDM"></iframe>
|
70 |
-
</div>
|
71 |
-
</div>
|
72 |
-
<div class="col-sm-3">
|
73 |
-
<div class="video-header">V-AURA</div>
|
74 |
-
<div class="video-container">
|
75 |
-
<iframe src="https://youtube.com/embed/8r0uEfSNjvI"></iframe>
|
76 |
-
</div>
|
77 |
-
</div>
|
78 |
-
<div class="col-sm-3">
|
79 |
-
<div class="video-header">Seeing and Hearing</div>
|
80 |
-
<div class="video-container">
|
81 |
-
<iframe src="https://youtube.com/embed/bn-sLg2qulk"></iframe>
|
82 |
-
</div>
|
83 |
-
</div>
|
84 |
-
</div>
|
85 |
-
</div>
|
86 |
-
|
87 |
-
<div id="vgg2">
|
88 |
-
<h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
|
89 |
-
<p style="overflow: hidden;">
|
90 |
-
Example 2: Striking a golf ball.
|
91 |
-
<span style="float:right;"><a href="#index">Back to index</a></span>
|
92 |
-
</p>
|
93 |
-
|
94 |
-
<div class="row g-1">
|
95 |
-
<div class="col-sm-3">
|
96 |
-
<div class="video-header">Ground-truth</div>
|
97 |
-
<div class="video-container">
|
98 |
-
<iframe src="https://youtube.com/embed/1hwSu42kkho"></iframe>
|
99 |
-
</div>
|
100 |
-
</div>
|
101 |
-
<div class="col-sm-3">
|
102 |
-
<div class="video-header">Ours</div>
|
103 |
-
<div class="video-container">
|
104 |
-
<iframe src="https://youtube.com/embed/kZibDoDCNxI"></iframe>
|
105 |
-
</div>
|
106 |
-
</div>
|
107 |
-
<div class="col-sm-3">
|
108 |
-
<div class="video-header">V2A-Mapper</div>
|
109 |
-
<div class="video-container">
|
110 |
-
<iframe src="https://youtube.com/embed/jgKfLBLhh7Y"></iframe>
|
111 |
-
</div>
|
112 |
-
</div>
|
113 |
-
<div class="col-sm-3">
|
114 |
-
<div class="video-header">FoleyCrafter</div>
|
115 |
-
<div class="video-container">
|
116 |
-
<iframe src="https://youtube.com/embed/Lfsx8mOPcJo"></iframe>
|
117 |
-
</div>
|
118 |
-
</div>
|
119 |
-
</div>
|
120 |
-
<div class="row g-1">
|
121 |
-
<div class="col-sm-3">
|
122 |
-
<div class="video-header">Frieren</div>
|
123 |
-
<div class="video-container">
|
124 |
-
<iframe src="https://youtube.com/embed/tz-LpbB0MBc"></iframe>
|
125 |
-
</div>
|
126 |
-
</div>
|
127 |
-
<div class="col-sm-3">
|
128 |
-
<div class="video-header">VATT</div>
|
129 |
-
<div class="video-container">
|
130 |
-
<iframe src="https://youtube.com/embed/RTDUHMi08n4"></iframe>
|
131 |
-
</div>
|
132 |
-
</div>
|
133 |
-
<div class="col-sm-3">
|
134 |
-
<div class="video-header">V-AURA</div>
|
135 |
-
<div class="video-container">
|
136 |
-
<iframe src="https://youtube.com/embed/N-3TDOsPnZQ"></iframe>
|
137 |
-
</div>
|
138 |
-
</div>
|
139 |
-
<div class="col-sm-3">
|
140 |
-
<div class="video-header">Seeing and Hearing</div>
|
141 |
-
<div class="video-container">
|
142 |
-
<iframe src="https://youtube.com/embed/QnsHnLn4gB0"></iframe>
|
143 |
-
</div>
|
144 |
-
</div>
|
145 |
-
</div>
|
146 |
-
</div>
|
147 |
-
|
148 |
-
<div id="vgg3">
|
149 |
-
<h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
|
150 |
-
<p style="overflow: hidden;">
|
151 |
-
Example 3: Hitting a drum.
|
152 |
-
<span style="float:right;"><a href="#index">Back to index</a></span>
|
153 |
-
</p>
|
154 |
-
|
155 |
-
<div class="row g-1">
|
156 |
-
<div class="col-sm-3">
|
157 |
-
<div class="video-header">Ground-truth</div>
|
158 |
-
<div class="video-container">
|
159 |
-
<iframe src="https://youtube.com/embed/0oeIwq77w0Q"></iframe>
|
160 |
-
</div>
|
161 |
-
</div>
|
162 |
-
<div class="col-sm-3">
|
163 |
-
<div class="video-header">Ours</div>
|
164 |
-
<div class="video-container">
|
165 |
-
<iframe src="https://youtube.com/embed/-UtPV9ohuIM"></iframe>
|
166 |
-
</div>
|
167 |
-
</div>
|
168 |
-
<div class="col-sm-3">
|
169 |
-
<div class="video-header">V2A-Mapper</div>
|
170 |
-
<div class="video-container">
|
171 |
-
<iframe src="https://youtube.com/embed/9yivkgN-zwc"></iframe>
|
172 |
-
</div>
|
173 |
-
</div>
|
174 |
-
<div class="col-sm-3">
|
175 |
-
<div class="video-header">FoleyCrafter</div>
|
176 |
-
<div class="video-container">
|
177 |
-
<iframe src="https://youtube.com/embed/kkCsXPOlBvY"></iframe>
|
178 |
-
</div>
|
179 |
-
</div>
|
180 |
-
</div>
|
181 |
-
<div class="row g-1">
|
182 |
-
<div class="col-sm-3">
|
183 |
-
<div class="video-header">Frieren</div>
|
184 |
-
<div class="video-container">
|
185 |
-
<iframe src="https://youtube.com/embed/MbNKsVsuvig"></iframe>
|
186 |
-
</div>
|
187 |
-
</div>
|
188 |
-
<div class="col-sm-3">
|
189 |
-
<div class="video-header">VATT</div>
|
190 |
-
<div class="video-container">
|
191 |
-
<iframe src="https://youtube.com/embed/2yYviBjrpBw"></iframe>
|
192 |
-
</div>
|
193 |
-
</div>
|
194 |
-
<div class="col-sm-3">
|
195 |
-
<div class="video-header">V-AURA</div>
|
196 |
-
<div class="video-container">
|
197 |
-
<iframe src="https://youtube.com/embed/9yivkgN-zwc"></iframe>
|
198 |
-
</div>
|
199 |
-
</div>
|
200 |
-
<div class="col-sm-3">
|
201 |
-
<div class="video-header">Seeing and Hearing</div>
|
202 |
-
<div class="video-container">
|
203 |
-
<iframe src="https://youtube.com/embed/6dnyQt4Fuhs"></iframe>
|
204 |
-
</div>
|
205 |
-
</div>
|
206 |
-
</div>
|
207 |
-
</div>
|
208 |
-
</div>
|
209 |
-
|
210 |
-
<div id="vgg4">
|
211 |
-
<h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
|
212 |
-
<p style="overflow: hidden;">
|
213 |
-
Example 4: Dog barking.
|
214 |
-
<span style="float:right;"><a href="#index">Back to index</a></span>
|
215 |
-
</p>
|
216 |
-
|
217 |
-
<div class="row g-1">
|
218 |
-
<div class="col-sm-3">
|
219 |
-
<div class="video-header">Ground-truth</div>
|
220 |
-
<div class="video-container">
|
221 |
-
<iframe src="https://youtube.com/embed/ckaqvTyMYAw"></iframe>
|
222 |
-
</div>
|
223 |
-
</div>
|
224 |
-
<div class="col-sm-3">
|
225 |
-
<div class="video-header">Ours</div>
|
226 |
-
<div class="video-container">
|
227 |
-
<iframe src="https://youtube.com/embed/_aRndFZzZ-I"></iframe>
|
228 |
-
</div>
|
229 |
-
</div>
|
230 |
-
<div class="col-sm-3">
|
231 |
-
<div class="video-header">V2A-Mapper</div>
|
232 |
-
<div class="video-container">
|
233 |
-
<iframe src="https://youtube.com/embed/mNCISP3LBl0"></iframe>
|
234 |
-
</div>
|
235 |
-
</div>
|
236 |
-
<div class="col-sm-3">
|
237 |
-
<div class="video-header">FoleyCrafter</div>
|
238 |
-
<div class="video-container">
|
239 |
-
<iframe src="https://youtube.com/embed/phZBQ3L7foE"></iframe>
|
240 |
-
</div>
|
241 |
-
</div>
|
242 |
-
</div>
|
243 |
-
<div class="row g-1">
|
244 |
-
<div class="col-sm-3">
|
245 |
-
<div class="video-header">Frieren</div>
|
246 |
-
<div class="video-container">
|
247 |
-
<iframe src="https://youtube.com/embed/Sb5Mg1-ORao"></iframe>
|
248 |
-
</div>
|
249 |
-
</div>
|
250 |
-
<div class="col-sm-3">
|
251 |
-
<div class="video-header">VATT</div>
|
252 |
-
<div class="video-container">
|
253 |
-
<iframe src="https://youtube.com/embed/eHmAGOmtDDg"></iframe>
|
254 |
-
</div>
|
255 |
-
</div>
|
256 |
-
<div class="col-sm-3">
|
257 |
-
<div class="video-header">V-AURA</div>
|
258 |
-
<div class="video-container">
|
259 |
-
<iframe src="https://youtube.com/embed/NEGa3krBrm0"></iframe>
|
260 |
-
</div>
|
261 |
-
</div>
|
262 |
-
<div class="col-sm-3">
|
263 |
-
<div class="video-header">Seeing and Hearing</div>
|
264 |
-
<div class="video-container">
|
265 |
-
<iframe src="https://youtube.com/embed/aO0EAXlwE7A"></iframe>
|
266 |
-
</div>
|
267 |
-
</div>
|
268 |
-
</div>
|
269 |
-
</div>
|
270 |
-
|
271 |
-
<div id="vgg5">
|
272 |
-
<h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
|
273 |
-
<p style="overflow: hidden;">
|
274 |
-
Example 5: Playing a string instrument.
|
275 |
-
<span style="float:right;"><a href="#index">Back to index</a></span>
|
276 |
-
</p>
|
277 |
-
|
278 |
-
<div class="row g-1">
|
279 |
-
<div class="col-sm-3">
|
280 |
-
<div class="video-header">Ground-truth</div>
|
281 |
-
<div class="video-container">
|
282 |
-
<iframe src="https://youtube.com/embed/KP1QhWauIOc"></iframe>
|
283 |
-
</div>
|
284 |
-
</div>
|
285 |
-
<div class="col-sm-3">
|
286 |
-
<div class="video-header">Ours</div>
|
287 |
-
<div class="video-container">
|
288 |
-
<iframe src="https://youtube.com/embed/ovaJhWSquYE"></iframe>
|
289 |
-
</div>
|
290 |
-
</div>
|
291 |
-
<div class="col-sm-3">
|
292 |
-
<div class="video-header">V2A-Mapper</div>
|
293 |
-
<div class="video-container">
|
294 |
-
<iframe src="https://youtube.com/embed/N723FS9lcy8"></iframe>
|
295 |
-
</div>
|
296 |
-
</div>
|
297 |
-
<div class="col-sm-3">
|
298 |
-
<div class="video-header">FoleyCrafter</div>
|
299 |
-
<div class="video-container">
|
300 |
-
<iframe src="https://youtube.com/embed/t0N4ZAAXo58"></iframe>
|
301 |
-
</div>
|
302 |
-
</div>
|
303 |
-
</div>
|
304 |
-
<div class="row g-1">
|
305 |
-
<div class="col-sm-3">
|
306 |
-
<div class="video-header">Frieren</div>
|
307 |
-
<div class="video-container">
|
308 |
-
<iframe src="https://youtube.com/embed/8YSRs03QNNA"></iframe>
|
309 |
-
</div>
|
310 |
-
</div>
|
311 |
-
<div class="col-sm-3">
|
312 |
-
<div class="video-header">VATT</div>
|
313 |
-
<div class="video-container">
|
314 |
-
<iframe src="https://youtube.com/embed/vOpMz55J1kY"></iframe>
|
315 |
-
</div>
|
316 |
-
</div>
|
317 |
-
<div class="col-sm-3">
|
318 |
-
<div class="video-header">V-AURA</div>
|
319 |
-
<div class="video-container">
|
320 |
-
<iframe src="https://youtube.com/embed/9JHC75vr9h0"></iframe>
|
321 |
-
</div>
|
322 |
-
</div>
|
323 |
-
<div class="col-sm-3">
|
324 |
-
<div class="video-header">Seeing and Hearing</div>
|
325 |
-
<div class="video-container">
|
326 |
-
<iframe src="https://youtube.com/embed/9w0JckNzXmY"></iframe>
|
327 |
-
</div>
|
328 |
-
</div>
|
329 |
-
</div>
|
330 |
-
</div>
|
331 |
-
|
332 |
-
<div id="vgg6">
|
333 |
-
<h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
|
334 |
-
<p style="overflow: hidden;">
|
335 |
-
Example 6: A group of people playing tambourines.
|
336 |
-
<span style="float:right;"><a href="#index">Back to index</a></span>
|
337 |
-
</p>
|
338 |
-
|
339 |
-
<div class="row g-1">
|
340 |
-
<div class="col-sm-3">
|
341 |
-
<div class="video-header">Ground-truth</div>
|
342 |
-
<div class="video-container">
|
343 |
-
<iframe src="https://youtube.com/embed/mx6JLxzUkRc"></iframe>
|
344 |
-
</div>
|
345 |
-
</div>
|
346 |
-
<div class="col-sm-3">
|
347 |
-
<div class="video-header">Ours</div>
|
348 |
-
<div class="video-container">
|
349 |
-
<iframe src="https://youtube.com/embed/oLirHhP9Su8"></iframe>
|
350 |
-
</div>
|
351 |
-
</div>
|
352 |
-
<div class="col-sm-3">
|
353 |
-
<div class="video-header">V2A-Mapper</div>
|
354 |
-
<div class="video-container">
|
355 |
-
<iframe src="https://youtube.com/embed/HkLkHMqptv0"></iframe>
|
356 |
-
</div>
|
357 |
-
</div>
|
358 |
-
<div class="col-sm-3">
|
359 |
-
<div class="video-header">FoleyCrafter</div>
|
360 |
-
<div class="video-container">
|
361 |
-
<iframe src="https://youtube.com/embed/rpHiiODjmNU"></iframe>
|
362 |
-
</div>
|
363 |
-
</div>
|
364 |
-
</div>
|
365 |
-
<div class="row g-1">
|
366 |
-
<div class="col-sm-3">
|
367 |
-
<div class="video-header">Frieren</div>
|
368 |
-
<div class="video-container">
|
369 |
-
<iframe src="https://youtube.com/embed/1mVD3fJ0LpM"></iframe>
|
370 |
-
</div>
|
371 |
-
</div>
|
372 |
-
<div class="col-sm-3">
|
373 |
-
<div class="video-header">VATT</div>
|
374 |
-
<div class="video-container">
|
375 |
-
<iframe src="https://youtube.com/embed/yjVFnJiEJlw"></iframe>
|
376 |
-
</div>
|
377 |
-
</div>
|
378 |
-
<div class="col-sm-3">
|
379 |
-
<div class="video-header">V-AURA</div>
|
380 |
-
<div class="video-container">
|
381 |
-
<iframe src="https://youtube.com/embed/neVeMSWtRkU"></iframe>
|
382 |
-
</div>
|
383 |
-
</div>
|
384 |
-
<div class="col-sm-3">
|
385 |
-
<div class="video-header">Seeing and Hearing</div>
|
386 |
-
<div class="video-container">
|
387 |
-
<iframe src="https://youtube.com/embed/EUE7YwyVWz8"></iframe>
|
388 |
-
</div>
|
389 |
-
</div>
|
390 |
-
</div>
|
391 |
-
</div>
|
392 |
-
|
393 |
-
<div id="vgg_extra">
|
394 |
-
<h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
|
395 |
-
<p style="overflow: hidden;">
|
396 |
-
<span style="float:right;"><a href="#index">Back to index</a></span>
|
397 |
-
</p>
|
398 |
-
|
399 |
-
<div class="row g-1">
|
400 |
-
<div class="col-sm-3">
|
401 |
-
<div class="video-header">Moving train</div>
|
402 |
-
<div class="video-container">
|
403 |
-
<iframe src="https://youtube.com/embed/Ta6H45rBzJc"></iframe>
|
404 |
-
</div>
|
405 |
-
</div>
|
406 |
-
<div class="col-sm-3">
|
407 |
-
<div class="video-header">Water splashing</div>
|
408 |
-
<div class="video-container">
|
409 |
-
<iframe src="https://youtube.com/embed/hl6AtgHXpb4"></iframe>
|
410 |
-
</div>
|
411 |
-
</div>
|
412 |
-
<div class="col-sm-3">
|
413 |
-
<div class="video-header">Skateboarding</div>
|
414 |
-
<div class="video-container">
|
415 |
-
<iframe src="https://youtube.com/embed/n4sCNi_9buI"></iframe>
|
416 |
-
</div>
|
417 |
-
</div>
|
418 |
-
<div class="col-sm-3">
|
419 |
-
<div class="video-header">Synchronized clapping</div>
|
420 |
-
<div class="video-container">
|
421 |
-
<iframe src="https://youtube.com/embed/oxexfpLn7FE"></iframe>
|
422 |
-
</div>
|
423 |
-
</div>
|
424 |
-
</div>
|
425 |
-
|
426 |
-
<br><br>
|
427 |
-
|
428 |
-
<div id="extra-failure">
|
429 |
-
<h2 style="text-align: center;">Failure cases</h2>
|
430 |
-
<p style="overflow: hidden;">
|
431 |
-
<span style="float:right;"><a href="#index">Back to index</a></span>
|
432 |
-
</p>
|
433 |
-
|
434 |
-
<div class="row g-1">
|
435 |
-
<div class="col-sm-6">
|
436 |
-
<div class="video-header">Human speech</div>
|
437 |
-
<div class="video-container">
|
438 |
-
<iframe src="https://youtube.com/embed/nx0CyrDu70Y"></iframe>
|
439 |
-
</div>
|
440 |
-
</div>
|
441 |
-
<div class="col-sm-6">
|
442 |
-
<div class="video-header">Unfamiliar vision input</div>
|
443 |
-
<div class="video-container">
|
444 |
-
<iframe src="https://youtube.com/embed/hfnAqmK3X7w"></iframe>
|
445 |
-
</div>
|
446 |
-
</div>
|
447 |
-
</div>
|
448 |
-
</div>
|
449 |
-
</div>
|
450 |
-
|
451 |
-
</body>
|
452 |
-
</html>
|
{mmaudio → pipeline}/__init__.py RENAMED (file without changes)

pipeline/__pycache__/__init__.cpython-310.pyc ADDED (binary file, 178 Bytes)
pipeline/__pycache__/__init__.cpython-38.pyc ADDED (binary file, 166 Bytes)
pipeline/__pycache__/pipeline.cpython-310.pyc ADDED (binary file, 4.62 kB)
pipeline/__pycache__/pipeline.cpython-38.pyc ADDED (binary file, 2.66 kB)
pipeline/__pycache__/step0.cpython-310.pyc ADDED (binary file, 1.48 kB)
pipeline/__pycache__/step0.cpython-38.pyc ADDED (binary file, 1.35 kB)
pipeline/__pycache__/step1.cpython-310.pyc ADDED (binary file, 1.39 kB)
pipeline/__pycache__/step1.cpython-38.pyc ADDED (binary file, 1.3 kB)
pipeline/__pycache__/step2.cpython-310.pyc ADDED (binary file, 1.71 kB)
pipeline/__pycache__/step2.cpython-38.pyc ADDED (binary file, 1.61 kB)
pipeline/__pycache__/step3.cpython-310.pyc ADDED (binary file, 3.62 kB)
pipeline/__pycache__/step3.cpython-38.pyc ADDED (binary file, 3.42 kB)
pipeline/__pycache__/step4.cpython-310.pyc ADDED (binary file, 1.16 kB)
pipeline/__pycache__/step4.cpython-38.pyc ADDED (binary file, 1.08 kB)
pipeline/pipeline.py ADDED
@@ -0,0 +1,175 @@
# coding=utf-8

from .step0 import Step0
from .step1 import Step1
from .step2 import Step2
from .step3 import Step3
from .step4 import Step4
import logging
import re
import os

class Pipeline:
    def __init__(self, step0_model_dir, step1_mode, step2_model_dir, step2_mode, step3_mode):
        self.step0 = Step0(step0_model_dir)
        self.step1 = Step1(step1_mode)
        self.step2 = Step2(step2_model_dir, step2_mode)
        self.step3 = Step3(model_type=step3_mode)
        self.step4 = Step4()
        self.step_processors = [self.step1, self.step2, self.step3, self.step4]
        self.log = logging.getLogger(self.__class__.__name__)
        self.log.setLevel(logging.INFO)

    def run(self, video_input, output_dir, mode='s4', postp_mode='rep', prompt='', negative_prompt='', duration=10, seed=42):
        step0_resp = self.step0.run(video_input)
        step0_resp_list = re.findall(r'(Step\d:.*?)(?=Step\d:|$)', step0_resp, re.DOTALL)
        step_infos = [step_info.strip().split("\n")[0] for step_info in step0_resp_list]
        step3_temp_dir = os.path.join(output_dir, "remove_vo")

        step_results = {"temp_final_audio_path": None, "temp_final_video_path": None}
        for step_info in step_infos:
            self.log.info(f"Start to {step_info}")
            if step_info == 'Step1: Generate audio from video.':
                step1_audio_path, step1_video_path = self.step1.run(video_input, output_dir, prompt, negative_prompt, duration=duration, seed=seed)
                step_results["step1_audio_path"] = step1_audio_path
                step_results["step1_video_path"] = step1_video_path

            elif step_info == 'Step2: Given a video and its generated audio, determine whether the audio contains voice-over.':
                is_vo = self.step2.run(str(step_results["step1_video_path"]))
                step_results["is_vo"] = is_vo
                if not step_results["is_vo"]:  # not voice-over
                    step_results["temp_final_audio_path"] = step_results["step1_audio_path"]
                    step_results["temp_final_video_path"] = step_results["step1_video_path"]
                    return step_results

            elif step_info == 'Step3: Remove voice-over from audio.':
                step3_audio_path = self.step3.run(input_audio_path=step_results["step1_audio_path"],
                                                  temp_store_dir=step3_temp_dir,
                                                  output_dir=output_dir)
                step_results["step3_audio_path"] = step3_audio_path
                if mode == 's3':
                    step_results["temp_final_audio_path"] = step_results["step3_audio_path"]
                    return step_results

            elif step_info == 'Step4: Determine whether the audio is silent.':
                is_silent = self.step4.run(step_results["step3_audio_path"])
                step_results["is_silent"] = is_silent

            else:
                self.log.error(f"Step-by-Step Error !!!!!!!!!")
                return step_results

        if not step_results["is_silent"]:  # not silent
            step_results["temp_final_audio_path"] = step_results["step3_audio_path"]
        else:
            self.log.info(f"Start to post process, use mode: {postp_mode}")
            if postp_mode == "rm":
                step_results["temp_final_audio_path"] = None
            elif postp_mode == "rep":
                step_results["temp_final_audio_path"] = step_results["step1_audio_path"]
                step_results["temp_final_video_path"] = step_results["step1_video_path"]
            elif postp_mode == "neg":
                neg_audio_path, neg_video_path = self.step1.run(video_input, output_dir, prompt, negative_prompt='human voice', duration=duration, seed=seed, is_postp=True)
                step_results["temp_final_audio_path"] = neg_audio_path
                step_results["temp_final_video_path"] = neg_video_path
            else:
                self.log.error(f"Error postp_mode: {postp_mode}")

            self.log.info(f"After post-processing, audio is {step_results['temp_final_audio_path']} and video is {step_results['temp_final_video_path']}")
            self.log.info(f"Finish Post-Process successfully.\n")

        return step_results

    def run_for_gradio(self, video_input, output_dir, mode='s4', postp_mode='rep', prompt='', negative_prompt='', duration=10, seed=42):
        step_results = {"temp_final_audio_path": None,
                        "temp_final_video_path": None,
                        'log': ''}

        step0_resp = self.step0.run(video_input)
        step0_resp_list = re.findall(r'(Step\d:.*?)(?=Step\d:|$)', step0_resp, re.DOTALL)
        step_infos = [step_info.strip().split("\n")[0] for step_info in step0_resp_list]
        step3_temp_dir = os.path.join(output_dir, "remove_vo")

        for step_info in step_infos:
            self.log.info(f"Start to {step_info}")
            step_results['log'] = f"Start to {step_info}"
            yield step_results

            if step_info == 'Step1: Generate audio from video.':
                step1_audio_path, step1_video_path = self.step1.run(video_input, output_dir, prompt, negative_prompt, duration=duration, seed=seed)
                step_results["step1_audio_path"] = step1_audio_path
                step_results["step1_video_path"] = step1_video_path
                step_results['log'] = "Step1 completed."
                yield step_results

            elif step_info == 'Step2: Given a video and its generated audio, determine whether the audio contains voice-over.':
                is_vo = self.step2.run(str(step_results["step1_video_path"]))
                step_results["is_vo"] = is_vo
                step_results['log'] = f"Step2 completed. Contain voice-over? {'Yes' if is_vo else 'No'}"
                yield step_results
                if not step_results["is_vo"]:  # not voice-over
                    step_results["temp_final_audio_path"] = step_results["step1_audio_path"]
                    step_results["temp_final_video_path"] = step_results["step1_video_path"]
                    step_results['log'] = "Finish step-by-step v2a."
                    yield step_results

            elif step_info == 'Step3: Remove voice-over from audio.':
                step3_audio_path = self.step3.run(input_audio_path=step_results["step1_audio_path"],
                                                  temp_store_dir=step3_temp_dir,
                                                  output_dir=output_dir)
                step_results["step3_audio_path"] = step3_audio_path
                step_results['log'] = f"Step3 completed."
                yield step_results
                if mode == 's3':
                    step_results["temp_final_audio_path"] = step_results["step3_audio_path"]
                    step_results['log'] = "Finish step-by-step v2a."
                    yield step_results

            elif step_info == 'Step4: Determine whether the audio is silent.':
                is_silent = self.step4.run(step_results["step3_audio_path"])
                step_results["is_silent"] = is_silent
                step_results['log'] = f"Step4 completed. Silent? {'Yes' if is_silent else 'No'}"
                yield step_results

            else:
                self.log.error(f"Step-by-Step Error !!!!!!!!!")
                step_results['log'] = f"Step-by-Step Error !!!!!!!!!"
                yield step_results
                step_results['log'] = "Finish step-by-step v2a."
                yield step_results

        if not step_results["is_silent"]:  # not silent
            step_results["temp_final_audio_path"] = step_results["step3_audio_path"]
            step_results['log'] = "Finish step-by-step v2a."
            yield step_results

        else:
            step_results['log'] = f"Post-processing with mode: {postp_mode}"
            yield step_results
            self.log.info(f"Start to post process, use mode: {postp_mode}")

            if postp_mode == "rm":
                step_results["temp_final_audio_path"] = None
            elif postp_mode == "rep":
                step_results["temp_final_audio_path"] = step_results["step1_audio_path"]
                step_results["temp_final_video_path"] = step_results["step1_video_path"]
            elif postp_mode == "neg":
                neg_audio_path, neg_video_path = self.step1.run(video_input, output_dir, prompt, negative_prompt='human voice', duration=duration, seed=seed, is_postp=True)
                step_results["temp_final_audio_path"] = neg_audio_path
                step_results["temp_final_video_path"] = neg_video_path
            else:
                self.log.error(f"Error postp_mode: {postp_mode}")

            self.log.info(f"After post-processing, audio is {step_results['temp_final_audio_path']} and video is {step_results['temp_final_video_path']}")
            self.log.info(f"Finish Post-Process successfully.\n")
            step_results['log'] = f"Post-processing completed."
            yield step_results

        step_results['log'] = "Finish step-by-step v2a."
        yield step_results
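For orientation, a minimal driver sketch for the new Pipeline class is shown below. It only chains the constructor and run() arguments visible in this file; the checkpoint directories, the "mmaudio_<variant>" string, and the input video path are illustrative placeholders rather than values fixed by this commit.

```python
# Hypothetical usage sketch; paths and the step1 variant string are placeholders.
from pipeline.pipeline import Pipeline

pipe = Pipeline(
    step0_model_dir="pretrained/step0_videollama2",   # placeholder VideoLLaMA2 checkpoint dir
    step1_mode="mmaudio_large_44k",                   # placeholder "mmaudio_<variant>" string
    step2_model_dir="pretrained/step2_videollama2",   # placeholder VideoLLaMA2 checkpoint dir
    step2_mode="cot",
    step3_mode="bs_roformer",
)

results = pipe.run(
    video_input="examples/input.mp4",  # placeholder video
    output_dir="output/demo",
    mode="s4",          # run the full four-step plan
    postp_mode="rep",   # fall back to the Step1 audio if the Step3 output is silent
    duration=10,
    seed=42,
)
print(results["temp_final_audio_path"], results["temp_final_video_path"])
```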
pipeline/step0.py ADDED
@@ -0,0 +1,39 @@
# coding=utf-8
# CoT generate step-by-step

from third_party.VideoLLaMA2.videollama2 import model_init, mm_infer
import logging

class Step0:
    def __init__(self, model_path, modal_type='v'):
        self.log = logging.getLogger(self.__class__.__name__)
        self.log.setLevel(logging.INFO)

        self.model, self.processor, self.tokenizer = model_init(model_path)
        self.modal_type = modal_type
        if modal_type == "a":
            self.model.model.vision_tower = None
        elif modal_type == "v":
            self.model.model.audio_tower = None
        elif modal_type == "av":
            pass
        else:
            raise NotImplementedError
        self.modal = 'audio' if modal_type == "a" else "video"
        self.question = f"Generate high-quality audio from video step-by-step."
        self.preprocess = self.processor[self.modal]

    def run(self, video_path):
        self.log.info("######################################################################################################")
        self.log.info("Generate high-quality audio from video step-by-step...")
        audio_video_tensor = self.preprocess(video_path, va=False)
        output = mm_infer(
            audio_video_tensor,
            self.question,
            model=self.model,
            tokenizer=self.tokenizer,
            modal=self.modal,
            do_sample=False,
        )

        return output
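The Step0 response is consumed by Pipeline.run through a regular expression. A small self-contained sketch of that parsing, using a hypothetical response string in the expected format, is shown below.

```python
import re

# Hypothetical Step0 response; the four headers are the exact strings Pipeline.run dispatches on.
step0_resp = (
    "Step1: Generate audio from video.\n"
    "Step2: Given a video and its generated audio, determine whether the audio contains voice-over.\n"
    "Step3: Remove voice-over from audio.\n"
    "Step4: Determine whether the audio is silent.\n"
)

# Same parsing as in pipeline.py: split on "StepN:" headers, keep the first line of each chunk.
chunks = re.findall(r'(Step\d:.*?)(?=Step\d:|$)', step0_resp, re.DOTALL)
step_infos = [chunk.strip().split("\n")[0] for chunk in chunks]
print(step_infos)  # ['Step1: ...', 'Step2: ...', 'Step3: ...', 'Step4: ...']
```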
pipeline/step1.py ADDED
@@ -0,0 +1,36 @@
# coding=utf-8
# V2A
import logging


class Step1:
    def __init__(self, step1_mode):
        self.log = logging.getLogger(self.__class__.__name__)
        self.log.setLevel(logging.INFO)

        if step1_mode.startswith('mmaudio'):
            from v2a_models.v2a_mmaudio import V2A_MMAudio
            variant = step1_mode.replace("mmaudio_", "")
            self.v2a_model = V2A_MMAudio(variant)
        elif step1_mode == "foleycrafter":
            from v2a_models.v2a_foleycrafter import V2A_FoleyCrafter
            self.v2a_model = V2A_FoleyCrafter()
        else:
            self.log.error(f"Error step1_mode: {step1_mode}")

    def run(self, video_path, output_dir, prompt='', negative_prompt='', duration=10, seed=42, is_postp=False):
        # self.log.info("Step1: Generate audio from video.")
        step1_audio_path, step1_video_path = self.v2a_model.generate_audio(
            video_path=video_path,
            output_dir=output_dir,
            prompt=prompt,
            negative_prompt=negative_prompt,
            duration=duration,
            seed=seed,
            is_postp=is_postp)

        self.log.info(f"The audio generated by Step1 is in {step1_audio_path}, and the video is in {step1_video_path}")
        self.log.info("Finish Step1 successfully.\n")
        return step1_audio_path, step1_video_path
pipeline/step2.py ADDED
@@ -0,0 +1,52 @@
# coding=utf-8
# judge voice-over

from third_party.VideoLLaMA2.videollama2 import model_init, mm_infer
import logging

class Step2:
    def __init__(self, model_path, step2_mode, modal_type="av"):
        self.log = logging.getLogger(self.__class__.__name__)
        self.log.setLevel(logging.INFO)

        self.model, self.processor, self.tokenizer = model_init(model_path)
        self.modal_type = modal_type
        if modal_type == "a":
            self.model.model.vision_tower = None
        elif modal_type == "v":
            self.model.model.audio_tower = None
        elif modal_type == "av":
            pass
        else:
            raise NotImplementedError
        self.modal = 'audio' if modal_type == "a" else "video"

        self.question = f"Given a video and its corresponding audio, determine whether the audio contains voice-over? Options: A. Yes, B. No. Choose A or B."
        self.preprocess = self.processor[self.modal]

        self.step2_mode = step2_mode

    def run(self, video_audio_path):
        # self.log.info("Step2: Given a video and its generated audio, determine whether the audio contains voice-over.")
        audio_video_tensor = self.preprocess(video_audio_path, va=True)
        output = mm_infer(
            audio_video_tensor,
            self.question,
            model=self.model,
            tokenizer=self.tokenizer,
            modal=self.modal,
            do_sample=False,
        )
        # print("oooooooooooooooooooooo: ", output)

        if self.step2_mode == "cot":
            output = output.split("<CONCLUSION>")[-1][1]
            print("1111111111111111111111111: ", output)
        output = (output == "A")

        if output:
            self.log.info(f"The video generated by Step1 ({video_audio_path}) contains voice-over.")
        else:
            self.log.info(f"The video generated by Step1 ({video_audio_path}) does not contain voice-over.")
        self.log.info("Finish Step2 successfully.\n")
        return output
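A minimal usage sketch for the voice-over check follows. The checkpoint directory and the name of the muxed Step1 video file are placeholders; the only assumption taken from this file is that run() returns True when the model answers "A" (voice-over present).

```python
# Hypothetical usage sketch; the checkpoint directory and file name are placeholders.
from pipeline.step2 import Step2

step2 = Step2(model_path="pretrained/step2_videollama2", step2_mode="cot")
# The input is the Step1 output video that already has the generated audio muxed in.
has_voice_over = step2.run("output/demo/input.step1.mp4")
print("voice-over detected:", has_voice_over)
```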
pipeline/step3.py ADDED
@@ -0,0 +1,129 @@
# coding=utf-8
# Remove voice-over
import logging
import argparse
import subprocess
import librosa
import os
import torch
import soundfile as sf
import numpy as np


# Using the embedded version of Python can also correctly import the utils module.
# current_dir = os.path.dirname(os.path.abspath(__file__))
# sys.path.append(current_dir)

from third_party.MusicSourceSeparationTraining.utils import demix, load_config, normalize_audio, denormalize_audio, draw_spectrogram
from third_party.MusicSourceSeparationTraining.utils import prefer_target_instrument, apply_tta, load_start_checkpoint
from third_party.MusicSourceSeparationTraining.models.bs_roformer import BSRoformer
import warnings

warnings.filterwarnings("ignore")

model_base_dir = "pretrained/remove_vo/checkpoints"
MODEL_PATHS = {"bs_roformer": [f"{model_base_dir}/model_bs_roformer_ep_317_sdr_12.9755.ckpt", f"{model_base_dir}/model_bs_roformer_ep_317_sdr_12.9755.yaml"]}


class Step3:
    def __init__(self, model_type="bs_roformer"):
        model_path, config_path = MODEL_PATHS[model_type]

        self.log = logging.getLogger(self.__class__.__name__)
        self.log.setLevel(logging.INFO)
        self.device = 'cpu'
        if torch.cuda.is_available():
            self.device = 'cuda'
        elif torch.backends.mps.is_available():
            self.device = 'mps'
        else:
            self.log.warning('CUDA/MPS are not available, running on CPU')

        self.model_type = model_type

        # self.model, self.config = get_model_from_config(model_type, config_path)
        self.config = load_config(model_type, config_path)
        self.model = BSRoformer(**dict(self.config.model))
        args = argparse.Namespace()
        args.start_check_point = model_path
        args.model_type = model_type
        args.lora_checkpoint = ''
        load_start_checkpoint(args, self.model, type_='inference')
        self.model = self.model.to(self.device)
        self.sample_rate = getattr(self.config.audio, 'sample_rate', 44100)

    def run(self,
            input_audio_path,
            temp_store_dir,  # for remove result dir
            output_dir,  # for final dir
            disable_detailed_pbar: bool = False,
            use_tta: bool = False,
            extract_instrumental: bool = True,
            codec="wav",
            subtype="FLOAT",
            draw_spectro=0,
            ):

        # self.log.info("Step3: Remove voice-over from audio.")

        os.makedirs(output_dir, exist_ok=True)

        if disable_detailed_pbar:
            detailed_pbar = False
        else:
            detailed_pbar = True

        instruments = prefer_target_instrument(self.config)[:]

        mix, sr = librosa.load(input_audio_path, sr=self.sample_rate, mono=False)
        # If mono audio we must adjust it depending on model
        if len(mix.shape) == 1:
            mix = np.expand_dims(mix, axis=0)
            if 'num_channels' in self.config.audio:
                if self.config.audio['num_channels'] == 2:
                    print(f'Convert mono track to stereo...')
                    mix = np.concatenate([mix, mix], axis=0)

        mix_orig = mix.copy()
        if 'normalize' in self.config.inference:
            if self.config.inference['normalize'] is True:
                mix, norm_params = normalize_audio(mix)

        waveforms_orig = demix(self.config, self.model, mix, self.device, model_type=self.model_type, pbar=detailed_pbar)
        if use_tta:
            waveforms_orig = apply_tta(self.config, self.model, mix, waveforms_orig, self.device, self.model_type)

        if extract_instrumental:
            instr = 'vocals' if 'vocals' in instruments else instruments[0]
            waveforms_orig['instrumental'] = mix_orig - waveforms_orig[instr]
            if 'instrumental' not in instruments:
                instruments.append('instrumental')

        file_name = os.path.splitext(os.path.basename(input_audio_path))[0].replace(".step1", "")
        temp_output_dir = os.path.join(temp_store_dir, file_name)
        os.makedirs(temp_output_dir, exist_ok=True)

        for instr in instruments:
            estimates = waveforms_orig[instr]
            if 'normalize' in self.config.inference:
                if self.config.inference['normalize'] is True:
                    estimates = denormalize_audio(estimates, norm_params)

            output_path = os.path.join(temp_output_dir, f"{instr}.{codec}")
            sf.write(output_path, estimates.T, sr, subtype=subtype)
            if draw_spectro > 0:
                output_img_path = os.path.join(temp_output_dir, f"{instr}.jpg")
                draw_spectrogram(estimates.T, sr, draw_spectro, output_img_path)

        instrumental_file = os.path.join(temp_output_dir, 'instrumental.wav')
        step3_audio_path = f"{output_dir}/{file_name}.step3.wav"
        subprocess.run(['cp', instrumental_file, step3_audio_path])

        self.log.info(f"The voice-over has been removed, and the audio is saved in {step3_audio_path}")
        self.log.info("Finish Step3 successfully.\n")
        return step3_audio_path
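A usage sketch for the voice-over removal step is given below. It assumes the BS-RoFormer checkpoint listed in MODEL_PATHS has been downloaded to pretrained/remove_vo/checkpoints; the input and output paths are placeholders.

```python
# Hypothetical usage sketch; input/output paths are placeholders.
from pipeline.step3 import Step3

step3 = Step3(model_type="bs_roformer")
clean_audio = step3.run(
    input_audio_path="output/demo/input.step1.wav",  # placeholder Step1 audio
    temp_store_dir="output/demo/remove_vo",          # per-file separation stems are written here
    output_dir="output/demo",                        # final *.step3.wav is copied here
)
print(clean_audio)  # e.g. output/demo/input.step3.wav
```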
pipeline/step4.py ADDED
@@ -0,0 +1,31 @@
# coding=utf-8
# Silence detection
import logging
import librosa
import numpy as np


class Step4:
    def __init__(self):
        self.log = logging.getLogger(self.__class__.__name__)
        self.log.setLevel(logging.INFO)

    def run(self,
            audio_path,
            silence_thresh=-50,
            duration_thresh=0.9):
        # self.log.info("Step4: Determine whether the audio is silent.")
        y, sr = librosa.load(audio_path, sr=None)
        energy = librosa.feature.rms(y=y)[0]
        energy_db = librosa.amplitude_to_db(energy)
        silent_ratio = np.sum(energy_db < silence_thresh) / len(energy_db)
        is_silent = silent_ratio > duration_thresh

        if is_silent:
            self.log.info(f"The audio after removing the voiceover ({audio_path}) is silent.")
        else:
            self.log.info(f"The audio after removing the voiceover ({audio_path}) is not silent.")
        self.log.info("Finish Step4 successfully.\n")

        return is_silent
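The silence check is a simple frame-level RMS heuristic: the clip is flagged silent when more than duration_thresh of its RMS frames fall below silence_thresh dB. A self-contained sketch of the same heuristic on a synthetic signal, using only librosa and numpy:

```python
# Minimal sketch of the Step4 heuristic on synthetic audio (no files needed).
import numpy as np
import librosa

sr = 16000
y = np.zeros(sr * 2, dtype=np.float32)  # two seconds of digital silence

energy_db = librosa.amplitude_to_db(librosa.feature.rms(y=y)[0])
silent_ratio = np.sum(energy_db < -50) / len(energy_db)
print(silent_ratio > 0.9)  # True: every frame is below -50 dB
```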
pyproject.toml DELETED
@@ -1,52 +0,0 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.metadata]
allow-direct-references = true

[tool.yapf]
based_on_style = "pep8"
indent_width = 4
column_limit = 100

[project]
name = "mmaudio"
version = "1.0.0"
authors = [{ name = "Rex Cheng", email = "[email protected]" }]
description = ""
readme = "README.md"
requires-python = ">=3.9"
classifiers = [
    "Programming Language :: Python :: 3",
    "Operating System :: OS Independent",
]
dependencies = [
    'torch >= 2.5.1',
    'python-dotenv',
    'cython',
    'gitpython >= 3.1',
    'tensorboard >= 2.11',
    'numpy >= 1.21, <2.1',
    'Pillow >= 9.5',
    'opencv-python >= 4.8',
    'scipy >= 1.7',
    'tqdm >= 4.66.1',
    'gradio >= 3.34',
    'einops >= 0.6',
    'hydra-core >= 1.3.2',
    'requests',
    'torchdiffeq',
    'librosa >= 0.8.1',
    'nitrous-ema',
    'safetensors',
    'auraloss',
    'hydra_colorlog',
    'tensordict',
    'colorlog',
    'open_clip_torch',
    'soundfile',
]

[tool.hatch.build.targets.wheel]
packages = ["mmaudio"]
requirements.txt.bak DELETED
@@ -1,27 +0,0 @@
torch == 2.4.0
torchvision
torchaudio
python-dotenv
cython
gitpython >= 3.1
tensorboard >= 2.11
numpy >= 1.21, <2.1
Pillow >= 9.5
opencv-python >= 4.8
scipy >= 1.7
tqdm >= 4.66.1
gradio >= 3.34
einops >= 0.6
hydra-core >= 1.3.2
requests
torchdiffeq
librosa >= 0.8.1
nitrous-ema
safetensors
auraloss
hydra_colorlog
tensordict
colorlog
open_clip_torch
soundfile
av
third_party/MMAudio/.gitignore ADDED
@@ -0,0 +1,146 @@
run_*.sh
log/
saves
saves/
weights/
weights
output/
output
pretrained/
workspace
workspace/
ext_weights/
ext_weights
.checkpoints/
.vscode/
training/example_output/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
third_party/MMAudio/LICENSE ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Sony Research Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
{mmaudio/data → third_party/MMAudio/mmaudio}/__init__.py RENAMED (file without changes)
{mmaudio/ext/bigvgan_v2 → third_party/MMAudio/mmaudio/data}/__init__.py RENAMED (file without changes)
{mmaudio → third_party/MMAudio/mmaudio}/data/av_utils.py
RENAMED
@@ -1,7 +1,7 @@
 from dataclasses import dataclass
 from fractions import Fraction
 from pathlib import Path
-from typing import Optional
+from typing import Optional, List, Tuple

 import av
 import numpy as np
@@ -15,7 +15,7 @@ class VideoInfo:
     fps: Fraction
     clip_frames: torch.Tensor
     sync_frames: torch.Tensor
-    all_frames: Optional[
+    all_frames: Optional[List[np.ndarray]]

     @property
     def height(self):
@@ -25,9 +25,35 @@ class VideoInfo:
     def width(self):
         return self.all_frames[0].shape[1]

+    @classmethod
+    def from_image_info(cls, image_info: 'ImageInfo', duration_sec: float,
+                        fps: Fraction) -> 'VideoInfo':
+        num_frames = int(duration_sec * fps)
+        all_frames = [image_info.original_frame] * num_frames
+        return cls(duration_sec=duration_sec,
+                   fps=fps,
+                   clip_frames=image_info.clip_frames,
+                   sync_frames=image_info.sync_frames,
+                   all_frames=all_frames)
+
+
+@dataclass
+class ImageInfo:
+    clip_frames: torch.Tensor
+    sync_frames: torch.Tensor
+    original_frame: Optional[np.ndarray]
+
+    @property
+    def height(self):
+        return self.original_frame.shape[0]
+
+    @property
+    def width(self):
+        return self.original_frame.shape[1]
+
+
+def read_frames(video_path: Path, list_of_fps: List[float], start_sec: float, end_sec: float,
+                need_all_frames: bool) -> Tuple[List[np.ndarray], List[np.ndarray], Fraction]:
     output_frames = [[] for _ in list_of_fps]
     next_frame_time_for_each_fps = [0.0 for _ in list_of_fps]
     time_delta_for_each_fps = [1 / fps for fps in list_of_fps]
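The new `ImageInfo` dataclass and `VideoInfo.from_image_info` appear intended to let a single still image stand in for a video clip by repeating its frame. A minimal usage sketch follows; the frame size, feature shapes, fps and import path are assumptions for illustration, not values prescribed by the repo.

```python
# Hypothetical sketch of the added ImageInfo / VideoInfo.from_image_info path.
from fractions import Fraction

import numpy as np
import torch

from mmaudio.data.av_utils import ImageInfo, VideoInfo  # import path assumed

frame = np.zeros((480, 640, 3), dtype=np.uint8)               # one still RGB frame
image = ImageInfo(clip_frames=torch.zeros(64, 3, 384, 384),   # placeholder features
                  sync_frames=torch.zeros(200, 3, 224, 224),
                  original_frame=frame)

# Repeat the still frame so it behaves like an 8-second, 25-fps video.
video = VideoInfo.from_image_info(image, duration_sec=8.0, fps=Fraction(25))
print(video.height, video.width, len(video.all_frames))       # 480 640 200
```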
third_party/MMAudio/mmaudio/data/data_setup.py
ADDED
@@ -0,0 +1,174 @@
import logging
import random

import numpy as np
import torch
from omegaconf import DictConfig
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.dataloader import default_collate
from torch.utils.data.distributed import DistributedSampler

from mmaudio.data.eval.audiocaps import AudioCapsData
from mmaudio.data.eval.video_dataset import MovieGen, VGGSound
from mmaudio.data.extracted_audio import ExtractedAudio
from mmaudio.data.extracted_vgg import ExtractedVGG
from mmaudio.data.mm_dataset import MultiModalDataset
from mmaudio.utils.dist_utils import local_rank

log = logging.getLogger()


# Re-seed randomness every time we start a worker
def worker_init_fn(worker_id: int):
    worker_seed = torch.initial_seed() % (2**31) + worker_id + local_rank * 1000
    np.random.seed(worker_seed)
    random.seed(worker_seed)
    log.debug(f'Worker {worker_id} re-seeded with seed {worker_seed} in rank {local_rank}')


def load_vgg_data(cfg: DictConfig, data_cfg: DictConfig) -> Dataset:
    dataset = ExtractedVGG(tsv_path=data_cfg.tsv,
                           data_dim=cfg.data_dim,
                           premade_mmap_dir=data_cfg.memmap_dir)

    return dataset


def load_audio_data(cfg: DictConfig, data_cfg: DictConfig) -> Dataset:
    dataset = ExtractedAudio(tsv_path=data_cfg.tsv,
                             data_dim=cfg.data_dim,
                             premade_mmap_dir=data_cfg.memmap_dir)

    return dataset


def setup_training_datasets(cfg: DictConfig) -> tuple[Dataset, DistributedSampler, DataLoader]:
    if cfg.mini_train:
        vgg = load_vgg_data(cfg, cfg.data.ExtractedVGG_val)
        audiocaps = load_audio_data(cfg, cfg.data.AudioCaps)
        dataset = MultiModalDataset([vgg], [audiocaps])
    if cfg.example_train:
        video = load_vgg_data(cfg, cfg.data.Example_video)
        audio = load_audio_data(cfg, cfg.data.Example_audio)
        dataset = MultiModalDataset([video], [audio])
    else:
        # load the largest one first
        freesound = load_audio_data(cfg, cfg.data.FreeSound)
        vgg = load_vgg_data(cfg, cfg.data.ExtractedVGG)
        audiocaps = load_audio_data(cfg, cfg.data.AudioCaps)
        audioset_sl = load_audio_data(cfg, cfg.data.AudioSetSL)
        bbcsound = load_audio_data(cfg, cfg.data.BBCSound)
        clotho = load_audio_data(cfg, cfg.data.Clotho)
        dataset = MultiModalDataset([vgg] * cfg.vgg_oversample_rate,
                                    [audiocaps, audioset_sl, bbcsound, freesound, clotho])

    batch_size = cfg.batch_size
    num_workers = cfg.num_workers
    pin_memory = cfg.pin_memory
    sampler, loader = construct_loader(dataset,
                                       batch_size,
                                       num_workers,
                                       shuffle=True,
                                       drop_last=True,
                                       pin_memory=pin_memory)

    return dataset, sampler, loader


def setup_test_datasets(cfg):
    dataset = load_vgg_data(cfg, cfg.data.ExtractedVGG_test)

    batch_size = cfg.batch_size
    num_workers = cfg.num_workers
    pin_memory = cfg.pin_memory
    sampler, loader = construct_loader(dataset,
                                       batch_size,
                                       num_workers,
                                       shuffle=False,
                                       drop_last=False,
                                       pin_memory=pin_memory)

    return dataset, sampler, loader


def setup_val_datasets(cfg: DictConfig) -> tuple[Dataset, DataLoader, DataLoader]:
    if cfg.example_train:
        dataset = load_vgg_data(cfg, cfg.data.Example_video)
    else:
        dataset = load_vgg_data(cfg, cfg.data.ExtractedVGG_val)

    val_batch_size = cfg.batch_size
    val_eval_batch_size = cfg.eval_batch_size
    num_workers = cfg.num_workers
    pin_memory = cfg.pin_memory
    _, val_loader = construct_loader(dataset,
                                     val_batch_size,
                                     num_workers,
                                     shuffle=False,
                                     drop_last=False,
                                     pin_memory=pin_memory)
    _, eval_loader = construct_loader(dataset,
                                      val_eval_batch_size,
                                      num_workers,
                                      shuffle=False,
                                      drop_last=False,
                                      pin_memory=pin_memory)

    return dataset, val_loader, eval_loader


def setup_eval_dataset(dataset_name: str, cfg: DictConfig) -> tuple[Dataset, DataLoader]:
    if dataset_name.startswith('audiocaps_full'):
        dataset = AudioCapsData(cfg.eval_data.AudioCaps_full.audio_path,
                                cfg.eval_data.AudioCaps_full.csv_path)
    elif dataset_name.startswith('audiocaps'):
        dataset = AudioCapsData(cfg.eval_data.AudioCaps.audio_path,
                                cfg.eval_data.AudioCaps.csv_path)
    elif dataset_name.startswith('moviegen'):
        dataset = MovieGen(cfg.eval_data.MovieGen.video_path,
                           cfg.eval_data.MovieGen.jsonl_path,
                           duration_sec=cfg.duration_s)
    elif dataset_name.startswith('vggsound'):
        dataset = VGGSound(cfg.eval_data.VGGSound.video_path,
                           cfg.eval_data.VGGSound.csv_path,
                           duration_sec=cfg.duration_s)
    else:
        raise ValueError(f'Invalid dataset name: {dataset_name}')

    batch_size = cfg.batch_size
    num_workers = cfg.num_workers
    pin_memory = cfg.pin_memory
    _, loader = construct_loader(dataset,
                                 batch_size,
                                 num_workers,
                                 shuffle=False,
                                 drop_last=False,
                                 pin_memory=pin_memory,
                                 error_avoidance=True)
    return dataset, loader


def error_avoidance_collate(batch):
    batch = list(filter(lambda x: x is not None, batch))
    return default_collate(batch)


def construct_loader(dataset: Dataset,
                     batch_size: int,
                     num_workers: int,
                     *,
                     shuffle: bool = True,
                     drop_last: bool = True,
                     pin_memory: bool = False,
                     error_avoidance: bool = False) -> tuple[DistributedSampler, DataLoader]:
    train_sampler = DistributedSampler(dataset, rank=local_rank, shuffle=shuffle)
    train_loader = DataLoader(dataset,
                              batch_size,
                              sampler=train_sampler,
                              num_workers=num_workers,
                              worker_init_fn=worker_init_fn,
                              drop_last=drop_last,
                              persistent_workers=num_workers > 0,
                              pin_memory=pin_memory,
                              collate_fn=error_avoidance_collate if error_avoidance else None)
    return train_sampler, train_loader
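The `error_avoidance` path works because the video datasets below return `None` when a clip fails to decode, and `error_avoidance_collate` silently drops those samples before batching. `construct_loader` itself needs an initialized `torch.distributed` process group for its `DistributedSampler`, so the sketch below only exercises the collate behaviour with a toy dataset; the dataset and sizes are assumptions for illustration.

```python
# Sketch (not part of the diff): how the error-avoiding collate drops failed samples.
import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import default_collate


class ToyDataset(Dataset):
    def __len__(self):
        return 4

    def __getitem__(self, idx):
        # Mimic the video datasets: return None when a sample fails to decode.
        if idx == 2:
            return None
        return {'x': torch.full((3,), float(idx))}


def error_avoidance_collate(batch):
    # Same idea as in data_setup.py: filter out failed samples before collating.
    batch = list(filter(lambda x: x is not None, batch))
    return default_collate(batch)


ds = ToyDataset()
batch = error_avoidance_collate([ds[i] for i in range(4)])
print(batch['x'].shape)  # torch.Size([3, 3]) -- the failed sample was dropped
```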
{mmaudio/ext/bigvgan_v2/alias_free_activation/cuda → third_party/MMAudio/mmaudio/data/eval}/__init__.py
RENAMED
File without changes
third_party/MMAudio/mmaudio/data/eval/audiocaps.py
ADDED
@@ -0,0 +1,39 @@
import logging
import os
from collections import defaultdict
from pathlib import Path
from typing import Union

import pandas as pd
import torch
from torch.utils.data.dataset import Dataset

log = logging.getLogger()


class AudioCapsData(Dataset):

    def __init__(self, audio_path: Union[str, Path], csv_path: Union[str, Path]):
        df = pd.read_csv(csv_path).to_dict(orient='records')

        audio_files = sorted(os.listdir(audio_path))
        audio_files = set(
            [Path(f).stem for f in audio_files if f.endswith('.wav') or f.endswith('.flac')])

        self.data = []
        for row in df:
            self.data.append({
                'name': row['name'],
                'caption': row['caption'],
            })

        self.audio_path = Path(audio_path)
        self.csv_path = Path(csv_path)

        log.info(f'Found {len(self.data)} matching audio files in {self.audio_path}')

    def __getitem__(self, idx: int) -> torch.Tensor:
        return self.data[idx]

    def __len__(self):
        return len(self.data)
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import logging
|
3 |
+
import os
|
4 |
+
from pathlib import Path
|
5 |
+
from typing import Union
|
6 |
+
|
7 |
+
import torch
|
8 |
+
from torch.utils.data.dataset import Dataset
|
9 |
+
from torchvision.transforms import v2
|
10 |
+
from torio.io import StreamingMediaDecoder
|
11 |
+
|
12 |
+
from mmaudio.utils.dist_utils import local_rank
|
13 |
+
|
14 |
+
log = logging.getLogger()
|
15 |
+
|
16 |
+
_CLIP_SIZE = 384
|
17 |
+
_CLIP_FPS = 8.0
|
18 |
+
|
19 |
+
_SYNC_SIZE = 224
|
20 |
+
_SYNC_FPS = 25.0
|
21 |
+
|
22 |
+
|
23 |
+
class MovieGenData(Dataset):
|
24 |
+
|
25 |
+
def __init__(
|
26 |
+
self,
|
27 |
+
video_root: Union[str, Path],
|
28 |
+
sync_root: Union[str, Path],
|
29 |
+
jsonl_root: Union[str, Path],
|
30 |
+
*,
|
31 |
+
duration_sec: float = 10.0,
|
32 |
+
read_clip: bool = True,
|
33 |
+
):
|
34 |
+
self.video_root = Path(video_root)
|
35 |
+
self.sync_root = Path(sync_root)
|
36 |
+
self.jsonl_root = Path(jsonl_root)
|
37 |
+
self.read_clip = read_clip
|
38 |
+
|
39 |
+
videos = sorted(os.listdir(self.video_root))
|
40 |
+
videos = [v[:-4] for v in videos] # remove extensions
|
41 |
+
self.captions = {}
|
42 |
+
|
43 |
+
for v in videos:
|
44 |
+
with open(self.jsonl_root / (v + '.jsonl')) as f:
|
45 |
+
data = json.load(f)
|
46 |
+
self.captions[v] = data['audio_prompt']
|
47 |
+
|
48 |
+
if local_rank == 0:
|
49 |
+
log.info(f'{len(videos)} videos found in {video_root}')
|
50 |
+
|
51 |
+
self.duration_sec = duration_sec
|
52 |
+
|
53 |
+
self.clip_expected_length = int(_CLIP_FPS * self.duration_sec)
|
54 |
+
self.sync_expected_length = int(_SYNC_FPS * self.duration_sec)
|
55 |
+
|
56 |
+
self.clip_augment = v2.Compose([
|
57 |
+
v2.Resize((_CLIP_SIZE, _CLIP_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
|
58 |
+
v2.ToImage(),
|
59 |
+
v2.ToDtype(torch.float32, scale=True),
|
60 |
+
])
|
61 |
+
|
62 |
+
self.sync_augment = v2.Compose([
|
63 |
+
v2.Resize((_SYNC_SIZE, _SYNC_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
|
64 |
+
v2.CenterCrop(_SYNC_SIZE),
|
65 |
+
v2.ToImage(),
|
66 |
+
v2.ToDtype(torch.float32, scale=True),
|
67 |
+
v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
|
68 |
+
])
|
69 |
+
|
70 |
+
self.videos = videos
|
71 |
+
|
72 |
+
def sample(self, idx: int) -> dict[str, torch.Tensor]:
|
73 |
+
video_id = self.videos[idx]
|
74 |
+
caption = self.captions[video_id]
|
75 |
+
|
76 |
+
reader = StreamingMediaDecoder(self.video_root / (video_id + '.mp4'))
|
77 |
+
reader.add_basic_video_stream(
|
78 |
+
frames_per_chunk=int(_CLIP_FPS * self.duration_sec),
|
79 |
+
frame_rate=_CLIP_FPS,
|
80 |
+
format='rgb24',
|
81 |
+
)
|
82 |
+
reader.add_basic_video_stream(
|
83 |
+
frames_per_chunk=int(_SYNC_FPS * self.duration_sec),
|
84 |
+
frame_rate=_SYNC_FPS,
|
85 |
+
format='rgb24',
|
86 |
+
)
|
87 |
+
|
88 |
+
reader.fill_buffer()
|
89 |
+
data_chunk = reader.pop_chunks()
|
90 |
+
|
91 |
+
clip_chunk = data_chunk[0]
|
92 |
+
sync_chunk = data_chunk[1]
|
93 |
+
if clip_chunk is None:
|
94 |
+
raise RuntimeError(f'CLIP video returned None {video_id}')
|
95 |
+
if clip_chunk.shape[0] < self.clip_expected_length:
|
96 |
+
raise RuntimeError(f'CLIP video too short {video_id}')
|
97 |
+
|
98 |
+
if sync_chunk is None:
|
99 |
+
raise RuntimeError(f'Sync video returned None {video_id}')
|
100 |
+
if sync_chunk.shape[0] < self.sync_expected_length:
|
101 |
+
raise RuntimeError(f'Sync video too short {video_id}')
|
102 |
+
|
103 |
+
# truncate the video
|
104 |
+
clip_chunk = clip_chunk[:self.clip_expected_length]
|
105 |
+
if clip_chunk.shape[0] != self.clip_expected_length:
|
106 |
+
raise RuntimeError(f'CLIP video wrong length {video_id}, '
|
107 |
+
f'expected {self.clip_expected_length}, '
|
108 |
+
f'got {clip_chunk.shape[0]}')
|
109 |
+
clip_chunk = self.clip_augment(clip_chunk)
|
110 |
+
|
111 |
+
sync_chunk = sync_chunk[:self.sync_expected_length]
|
112 |
+
if sync_chunk.shape[0] != self.sync_expected_length:
|
113 |
+
raise RuntimeError(f'Sync video wrong length {video_id}, '
|
114 |
+
f'expected {self.sync_expected_length}, '
|
115 |
+
f'got {sync_chunk.shape[0]}')
|
116 |
+
sync_chunk = self.sync_augment(sync_chunk)
|
117 |
+
|
118 |
+
data = {
|
119 |
+
'name': video_id,
|
120 |
+
'caption': caption,
|
121 |
+
'clip_video': clip_chunk,
|
122 |
+
'sync_video': sync_chunk,
|
123 |
+
}
|
124 |
+
|
125 |
+
return data
|
126 |
+
|
127 |
+
def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
|
128 |
+
return self.sample(idx)
|
129 |
+
|
130 |
+
def __len__(self):
|
131 |
+
return len(self.captions)
|
third_party/MMAudio/mmaudio/data/eval/video_dataset.py
ADDED
@@ -0,0 +1,197 @@
import json
import logging
import os
from pathlib import Path
from typing import Union

import pandas as pd
import torch
from torch.utils.data.dataset import Dataset
from torchvision.transforms import v2
from torio.io import StreamingMediaDecoder

from mmaudio.utils.dist_utils import local_rank

log = logging.getLogger()

_CLIP_SIZE = 384
_CLIP_FPS = 8.0

_SYNC_SIZE = 224
_SYNC_FPS = 25.0


class VideoDataset(Dataset):

    def __init__(
        self,
        video_root: Union[str, Path],
        *,
        duration_sec: float = 8.0,
    ):
        self.video_root = Path(video_root)

        self.duration_sec = duration_sec

        self.clip_expected_length = int(_CLIP_FPS * self.duration_sec)
        self.sync_expected_length = int(_SYNC_FPS * self.duration_sec)

        self.clip_transform = v2.Compose([
            v2.Resize((_CLIP_SIZE, _CLIP_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
            v2.ToImage(),
            v2.ToDtype(torch.float32, scale=True),
        ])

        self.sync_transform = v2.Compose([
            v2.Resize(_SYNC_SIZE, interpolation=v2.InterpolationMode.BICUBIC),
            v2.CenterCrop(_SYNC_SIZE),
            v2.ToImage(),
            v2.ToDtype(torch.float32, scale=True),
            v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
        ])

        # to be implemented by subclasses
        self.captions = {}
        self.videos = sorted(list(self.captions.keys()))

    def sample(self, idx: int) -> dict[str, torch.Tensor]:
        video_id = self.videos[idx]
        caption = self.captions[video_id]

        reader = StreamingMediaDecoder(self.video_root / (video_id + '.mp4'))
        reader.add_basic_video_stream(
            frames_per_chunk=int(_CLIP_FPS * self.duration_sec),
            frame_rate=_CLIP_FPS,
            format='rgb24',
        )
        reader.add_basic_video_stream(
            frames_per_chunk=int(_SYNC_FPS * self.duration_sec),
            frame_rate=_SYNC_FPS,
            format='rgb24',
        )

        reader.fill_buffer()
        data_chunk = reader.pop_chunks()

        clip_chunk = data_chunk[0]
        sync_chunk = data_chunk[1]
        if clip_chunk is None:
            raise RuntimeError(f'CLIP video returned None {video_id}')
        if clip_chunk.shape[0] < self.clip_expected_length:
            raise RuntimeError(
                f'CLIP video too short {video_id}, expected {self.clip_expected_length}, got {clip_chunk.shape[0]}'
            )

        if sync_chunk is None:
            raise RuntimeError(f'Sync video returned None {video_id}')
        if sync_chunk.shape[0] < self.sync_expected_length:
            raise RuntimeError(
                f'Sync video too short {video_id}, expected {self.sync_expected_length}, got {sync_chunk.shape[0]}'
            )

        # truncate the video
        clip_chunk = clip_chunk[:self.clip_expected_length]
        if clip_chunk.shape[0] != self.clip_expected_length:
            raise RuntimeError(f'CLIP video wrong length {video_id}, '
                               f'expected {self.clip_expected_length}, '
                               f'got {clip_chunk.shape[0]}')
        clip_chunk = self.clip_transform(clip_chunk)

        sync_chunk = sync_chunk[:self.sync_expected_length]
        if sync_chunk.shape[0] != self.sync_expected_length:
            raise RuntimeError(f'Sync video wrong length {video_id}, '
                               f'expected {self.sync_expected_length}, '
                               f'got {sync_chunk.shape[0]}')
        sync_chunk = self.sync_transform(sync_chunk)

        data = {
            'name': video_id,
            'caption': caption,
            'clip_video': clip_chunk,
            'sync_video': sync_chunk,
        }

        return data

    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
        try:
            return self.sample(idx)
        except Exception as e:
            log.error(f'Error loading video {self.videos[idx]}: {e}')
            return None

    def __len__(self):
        return len(self.captions)


class VGGSound(VideoDataset):

    def __init__(
        self,
        video_root: Union[str, Path],
        csv_path: Union[str, Path],
        *,
        duration_sec: float = 8.0,
    ):
        super().__init__(video_root, duration_sec=duration_sec)
        self.video_root = Path(video_root)
        self.csv_path = Path(csv_path)

        videos = sorted(os.listdir(self.video_root))
        if local_rank == 0:
            log.info(f'{len(videos)} videos found in {video_root}')
        self.captions = {}

        df = pd.read_csv(csv_path, header=None, names=['id', 'sec', 'caption',
                                                       'split']).to_dict(orient='records')

        videos_no_found = []
        for row in df:
            if row['split'] == 'test':
                start_sec = int(row['sec'])
                video_id = str(row['id'])
                # this is how our videos are named
                video_name = f'{video_id}_{start_sec:06d}'
                if video_name + '.mp4' not in videos:
                    videos_no_found.append(video_name)
                    continue

                self.captions[video_name] = row['caption']

        if local_rank == 0:
            log.info(f'{len(videos)} videos found in {video_root}')
            log.info(f'{len(self.captions)} useable videos found')
            if videos_no_found:
                log.info(f'{len(videos_no_found)} found in {csv_path} but not in {video_root}')
                log.info(
                    'A small amount is expected, as not all videos are still available on YouTube')

        self.videos = sorted(list(self.captions.keys()))


class MovieGen(VideoDataset):

    def __init__(
        self,
        video_root: Union[str, Path],
        jsonl_root: Union[str, Path],
        *,
        duration_sec: float = 10.0,
    ):
        super().__init__(video_root, duration_sec=duration_sec)
        self.video_root = Path(video_root)
        self.jsonl_root = Path(jsonl_root)

        videos = sorted(os.listdir(self.video_root))
        videos = [v[:-4] for v in videos]  # remove extensions
        self.captions = {}

        for v in videos:
            with open(self.jsonl_root / (v + '.jsonl')) as f:
                data = json.load(f)
                self.captions[v] = data['audio_prompt']

        if local_rank == 0:
            log.info(f'{len(videos)} videos found in {video_root}')

        self.videos = videos
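With the module constants above, each decoded sample carries two video tensors: one resized for the CLIP branch at 8 fps and one normalized for the synchronization branch at 25 fps. A small back-of-envelope check of the expected shapes at the default 8-second duration (illustration only, nothing here is configuration from the repo):

```python
# Expected per-sample video tensor shapes from VideoDataset.sample() at 8 s duration.
_CLIP_FPS, _CLIP_SIZE = 8.0, 384
_SYNC_FPS, _SYNC_SIZE = 25.0, 224
duration_sec = 8.0

clip_frames = int(_CLIP_FPS * duration_sec)   # 64 frames for the CLIP branch
sync_frames = int(_SYNC_FPS * duration_sec)   # 200 frames for the sync branch

print((clip_frames, 3, _CLIP_SIZE, _CLIP_SIZE))   # (64, 3, 384, 384)
print((sync_frames, 3, _SYNC_SIZE, _SYNC_SIZE))   # (200, 3, 224, 224)
```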
third_party/MMAudio/mmaudio/data/extracted_audio.py
ADDED
@@ -0,0 +1,88 @@
import logging
from pathlib import Path
from typing import Union

import pandas as pd
import torch
from tensordict import TensorDict
from torch.utils.data.dataset import Dataset

from mmaudio.utils.dist_utils import local_rank

log = logging.getLogger()


class ExtractedAudio(Dataset):

    def __init__(
        self,
        tsv_path: Union[str, Path],
        *,
        premade_mmap_dir: Union[str, Path],
        data_dim: dict[str, int],
    ):
        super().__init__()

        self.data_dim = data_dim
        self.df_list = pd.read_csv(tsv_path, sep='\t').to_dict('records')
        self.ids = [str(d['id']) for d in self.df_list]

        log.info(f'Loading precomputed mmap from {premade_mmap_dir}')
        # load precomputed memory mapped tensors
        premade_mmap_dir = Path(premade_mmap_dir)
        td = TensorDict.load_memmap(premade_mmap_dir)
        log.info(f'Loaded precomputed mmap from {premade_mmap_dir}')
        self.mean = td['mean']
        self.std = td['std']
        self.text_features = td['text_features']

        log.info(f'Loaded {len(self)} samples from {premade_mmap_dir}.')
        log.info(f'Loaded mean: {self.mean.shape}.')
        log.info(f'Loaded std: {self.std.shape}.')
        log.info(f'Loaded text features: {self.text_features.shape}.')

        assert self.mean.shape[1] == self.data_dim['latent_seq_len'], \
            f'{self.mean.shape[1]} != {self.data_dim["latent_seq_len"]}'
        assert self.std.shape[1] == self.data_dim['latent_seq_len'], \
            f'{self.std.shape[1]} != {self.data_dim["latent_seq_len"]}'

        assert self.text_features.shape[1] == self.data_dim['text_seq_len'], \
            f'{self.text_features.shape[1]} != {self.data_dim["text_seq_len"]}'
        assert self.text_features.shape[-1] == self.data_dim['text_dim'], \
            f'{self.text_features.shape[-1]} != {self.data_dim["text_dim"]}'

        self.fake_clip_features = torch.zeros(self.data_dim['clip_seq_len'],
                                              self.data_dim['clip_dim'])
        self.fake_sync_features = torch.zeros(self.data_dim['sync_seq_len'],
                                              self.data_dim['sync_dim'])
        self.video_exist = torch.tensor(0, dtype=torch.bool)
        self.text_exist = torch.tensor(1, dtype=torch.bool)

    def compute_latent_stats(self) -> tuple[torch.Tensor, torch.Tensor]:
        latents = self.mean
        return latents.mean(dim=(0, 1)), latents.std(dim=(0, 1))

    def get_memory_mapped_tensor(self) -> TensorDict:
        td = TensorDict({
            'mean': self.mean,
            'std': self.std,
            'text_features': self.text_features,
        })
        return td

    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
        data = {
            'id': str(self.df_list[idx]['id']),
            'a_mean': self.mean[idx],
            'a_std': self.std[idx],
            'clip_features': self.fake_clip_features,
            'sync_features': self.fake_sync_features,
            'text_features': self.text_features[idx],
            'caption': self.df_list[idx]['caption'],
            'video_exist': self.video_exist,
            'text_exist': self.text_exist,
        }
        return data

    def __len__(self):
        return len(self.ids)
third_party/MMAudio/mmaudio/data/extracted_vgg.py
ADDED
@@ -0,0 +1,101 @@
import logging
from pathlib import Path
from typing import Union

import pandas as pd
import torch
from tensordict import TensorDict
from torch.utils.data.dataset import Dataset

from mmaudio.utils.dist_utils import local_rank

log = logging.getLogger()


class ExtractedVGG(Dataset):

    def __init__(
        self,
        tsv_path: Union[str, Path],
        *,
        premade_mmap_dir: Union[str, Path],
        data_dim: dict[str, int],
    ):
        super().__init__()

        self.data_dim = data_dim
        self.df_list = pd.read_csv(tsv_path, sep='\t').to_dict('records')
        self.ids = [d['id'] for d in self.df_list]

        log.info(f'Loading precomputed mmap from {premade_mmap_dir}')
        # load precomputed memory mapped tensors
        premade_mmap_dir = Path(premade_mmap_dir)
        td = TensorDict.load_memmap(premade_mmap_dir)
        log.info(f'Loaded precomputed mmap from {premade_mmap_dir}')
        self.mean = td['mean']
        self.std = td['std']
        self.clip_features = td['clip_features']
        self.sync_features = td['sync_features']
        self.text_features = td['text_features']

        if local_rank == 0:
            log.info(f'Loaded {len(self)} samples.')
            log.info(f'Loaded mean: {self.mean.shape}.')
            log.info(f'Loaded std: {self.std.shape}.')
            log.info(f'Loaded clip_features: {self.clip_features.shape}.')
            log.info(f'Loaded sync_features: {self.sync_features.shape}.')
            log.info(f'Loaded text_features: {self.text_features.shape}.')

        assert self.mean.shape[1] == self.data_dim['latent_seq_len'], \
            f'{self.mean.shape[1]} != {self.data_dim["latent_seq_len"]}'
        assert self.std.shape[1] == self.data_dim['latent_seq_len'], \
            f'{self.std.shape[1]} != {self.data_dim["latent_seq_len"]}'

        assert self.clip_features.shape[1] == self.data_dim['clip_seq_len'], \
            f'{self.clip_features.shape[1]} != {self.data_dim["clip_seq_len"]}'
        assert self.sync_features.shape[1] == self.data_dim['sync_seq_len'], \
            f'{self.sync_features.shape[1]} != {self.data_dim["sync_seq_len"]}'
        assert self.text_features.shape[1] == self.data_dim['text_seq_len'], \
            f'{self.text_features.shape[1]} != {self.data_dim["text_seq_len"]}'

        assert self.clip_features.shape[-1] == self.data_dim['clip_dim'], \
            f'{self.clip_features.shape[-1]} != {self.data_dim["clip_dim"]}'
        assert self.sync_features.shape[-1] == self.data_dim['sync_dim'], \
            f'{self.sync_features.shape[-1]} != {self.data_dim["sync_dim"]}'
        assert self.text_features.shape[-1] == self.data_dim['text_dim'], \
            f'{self.text_features.shape[-1]} != {self.data_dim["text_dim"]}'

        self.video_exist = torch.tensor(1, dtype=torch.bool)
        self.text_exist = torch.tensor(1, dtype=torch.bool)

    def compute_latent_stats(self) -> tuple[torch.Tensor, torch.Tensor]:
        latents = self.mean
        return latents.mean(dim=(0, 1)), latents.std(dim=(0, 1))

    def get_memory_mapped_tensor(self) -> TensorDict:
        td = TensorDict({
            'mean': self.mean,
            'std': self.std,
            'clip_features': self.clip_features,
            'sync_features': self.sync_features,
            'text_features': self.text_features,
        })
        return td

    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
        data = {
            'id': self.df_list[idx]['id'],
            'a_mean': self.mean[idx],
            'a_std': self.std[idx],
            'clip_features': self.clip_features[idx],
            'sync_features': self.sync_features[idx],
            'text_features': self.text_features[idx],
            'caption': self.df_list[idx]['label'],
            'video_exist': self.video_exist,
            'text_exist': self.text_exist,
        }

        return data

    def __len__(self):
        return len(self.ids)
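Both `ExtractedVGG` and `ExtractedAudio` validate the memory-mapped tensors against a `data_dim` dictionary supplied by the training config. A hypothetical sketch of that dictionary's shape is below; the key names come from the asserts above, but the numeric values and the paths in the commented call are placeholders, not the values used by any actual DeepSound-V1 or MMAudio configuration.

```python
# Hypothetical data_dim dictionary; numbers are illustrative placeholders only.
data_dim = {
    'latent_seq_len': 250,
    'clip_seq_len': 64,
    'clip_dim': 1024,
    'sync_seq_len': 192,
    'sync_dim': 768,
    'text_seq_len': 77,
    'text_dim': 1024,
}

# dataset = ExtractedVGG(tsv_path='sets/vgg3-train.tsv',        # path assumed
#                        premade_mmap_dir='memmap/vgg3-train',  # path assumed
#                        data_dim=data_dim)
```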
{mmaudio/model → third_party/MMAudio/mmaudio/data/extraction}/__init__.py
RENAMED
File without changes
third_party/MMAudio/mmaudio/data/extraction/vgg_sound.py
ADDED
@@ -0,0 +1,193 @@
import logging
import os
from pathlib import Path
from typing import Optional, Union

import pandas as pd
import torch
import torchaudio
from torch.utils.data.dataset import Dataset
from torchvision.transforms import v2
from torio.io import StreamingMediaDecoder

from mmaudio.utils.dist_utils import local_rank

log = logging.getLogger()

_CLIP_SIZE = 384
_CLIP_FPS = 8.0

_SYNC_SIZE = 224
_SYNC_FPS = 25.0


class VGGSound(Dataset):

    def __init__(
        self,
        root: Union[str, Path],
        *,
        tsv_path: Union[str, Path] = 'sets/vgg3-train.tsv',
        sample_rate: int = 16_000,
        duration_sec: float = 8.0,
        audio_samples: Optional[int] = None,
        normalize_audio: bool = False,
    ):
        self.root = Path(root)
        self.normalize_audio = normalize_audio
        if audio_samples is None:
            self.audio_samples = int(sample_rate * duration_sec)
        else:
            self.audio_samples = audio_samples
            effective_duration = audio_samples / sample_rate
            # make sure the duration is close enough, within 15ms
            assert abs(effective_duration - duration_sec) < 0.015, \
                f'audio_samples {audio_samples} does not match duration_sec {duration_sec}'

        videos = sorted(os.listdir(self.root))
        videos = set([Path(v).stem for v in videos])  # remove extensions
        self.labels = {}
        self.videos = []
        missing_videos = []

        # read the tsv for subset information
        df_list = pd.read_csv(tsv_path, sep='\t', dtype={'id': str}).to_dict('records')
        for record in df_list:
            id = record['id']
            label = record['label']
            if id in videos:
                self.labels[id] = label
                self.videos.append(id)
            else:
                missing_videos.append(id)

        if local_rank == 0:
            log.info(f'{len(videos)} videos found in {root}')
            log.info(f'{len(self.videos)} videos found in {tsv_path}')
            log.info(f'{len(missing_videos)} videos missing in {root}')

        self.sample_rate = sample_rate
        self.duration_sec = duration_sec

        self.expected_audio_length = audio_samples
        self.clip_expected_length = int(_CLIP_FPS * self.duration_sec)
        self.sync_expected_length = int(_SYNC_FPS * self.duration_sec)

        self.clip_transform = v2.Compose([
            v2.Resize((_CLIP_SIZE, _CLIP_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
            v2.ToImage(),
            v2.ToDtype(torch.float32, scale=True),
        ])

        self.sync_transform = v2.Compose([
            v2.Resize(_SYNC_SIZE, interpolation=v2.InterpolationMode.BICUBIC),
            v2.CenterCrop(_SYNC_SIZE),
            v2.ToImage(),
            v2.ToDtype(torch.float32, scale=True),
            v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
        ])

        self.resampler = {}

    def sample(self, idx: int) -> dict[str, torch.Tensor]:
        video_id = self.videos[idx]
        label = self.labels[video_id]

        reader = StreamingMediaDecoder(self.root / (video_id + '.mp4'))
        reader.add_basic_video_stream(
            frames_per_chunk=int(_CLIP_FPS * self.duration_sec),
            frame_rate=_CLIP_FPS,
            format='rgb24',
        )
        reader.add_basic_video_stream(
            frames_per_chunk=int(_SYNC_FPS * self.duration_sec),
            frame_rate=_SYNC_FPS,
            format='rgb24',
        )
        reader.add_basic_audio_stream(frames_per_chunk=2**30, )

        reader.fill_buffer()
        data_chunk = reader.pop_chunks()

        clip_chunk = data_chunk[0]
        sync_chunk = data_chunk[1]
        audio_chunk = data_chunk[2]

        if clip_chunk is None:
            raise RuntimeError(f'CLIP video returned None {video_id}')
        if clip_chunk.shape[0] < self.clip_expected_length:
            raise RuntimeError(
                f'CLIP video too short {video_id}, expected {self.clip_expected_length}, got {clip_chunk.shape[0]}'
            )

        if sync_chunk is None:
            raise RuntimeError(f'Sync video returned None {video_id}')
        if sync_chunk.shape[0] < self.sync_expected_length:
            raise RuntimeError(
                f'Sync video too short {video_id}, expected {self.sync_expected_length}, got {sync_chunk.shape[0]}'
            )

        # process audio
        sample_rate = int(reader.get_out_stream_info(2).sample_rate)
        audio_chunk = audio_chunk.transpose(0, 1)
        audio_chunk = audio_chunk.mean(dim=0)  # mono
        if self.normalize_audio:
            abs_max = audio_chunk.abs().max()
            audio_chunk = audio_chunk / abs_max * 0.95
            if abs_max <= 1e-6:
                raise RuntimeError(f'Audio is silent {video_id}')

        # resample
        if sample_rate == self.sample_rate:
            audio_chunk = audio_chunk
        else:
            if sample_rate not in self.resampler:
                # https://pytorch.org/audio/stable/tutorials/audio_resampling_tutorial.html#kaiser-best
                self.resampler[sample_rate] = torchaudio.transforms.Resample(
                    sample_rate,
                    self.sample_rate,
                    lowpass_filter_width=64,
                    rolloff=0.9475937167399596,
                    resampling_method='sinc_interp_kaiser',
                    beta=14.769656459379492,
                )
            audio_chunk = self.resampler[sample_rate](audio_chunk)

        if audio_chunk.shape[0] < self.expected_audio_length:
            raise RuntimeError(f'Audio too short {video_id}')
        audio_chunk = audio_chunk[:self.expected_audio_length]

        # truncate the video
        clip_chunk = clip_chunk[:self.clip_expected_length]
        if clip_chunk.shape[0] != self.clip_expected_length:
            raise RuntimeError(f'CLIP video wrong length {video_id}, '
                               f'expected {self.clip_expected_length}, '
                               f'got {clip_chunk.shape[0]}')
        clip_chunk = self.clip_transform(clip_chunk)

        sync_chunk = sync_chunk[:self.sync_expected_length]
        if sync_chunk.shape[0] != self.sync_expected_length:
            raise RuntimeError(f'Sync video wrong length {video_id}, '
                               f'expected {self.sync_expected_length}, '
                               f'got {sync_chunk.shape[0]}')
        sync_chunk = self.sync_transform(sync_chunk)

        data = {
            'id': video_id,
            'caption': label,
            'audio': audio_chunk,
            'clip_video': clip_chunk,
            'sync_video': sync_chunk,
        }

        return data

    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
        try:
            return self.sample(idx)
        except Exception as e:
            log.error(f'Error loading video {self.videos[idx]}: {e}')
            return None

    def __len__(self):
        return len(self.labels)
third_party/MMAudio/mmaudio/data/extraction/wav_dataset.py
ADDED
@@ -0,0 +1,132 @@
import logging
import os
from pathlib import Path
from typing import Union

import open_clip
import pandas as pd
import torch
import torchaudio
from torch.utils.data.dataset import Dataset

log = logging.getLogger()


class WavTextClipsDataset(Dataset):

    def __init__(
        self,
        root: Union[str, Path],
        *,
        captions_tsv: Union[str, Path],
        clips_tsv: Union[str, Path],
        sample_rate: int,
        num_samples: int,
        normalize_audio: bool = False,
        reject_silent: bool = False,
        tokenizer_id: str = 'ViT-H-14-378-quickgelu',
    ):
        self.root = Path(root)
        self.sample_rate = sample_rate
        self.num_samples = num_samples
        self.normalize_audio = normalize_audio
        self.reject_silent = reject_silent
        self.tokenizer = open_clip.get_tokenizer(tokenizer_id)

        audios = sorted(os.listdir(self.root))
        audios = set([
            Path(audio).stem for audio in audios
            if audio.endswith('.wav') or audio.endswith('.flac')
        ])
        self.captions = {}

        # read the caption tsv
        df_list = pd.read_csv(captions_tsv, sep='\t', dtype={'id': str}).to_dict('records')
        for record in df_list:
            id = record['id']
            caption = record['caption']
            self.captions[id] = caption

        # read the clip tsv
        df_list = pd.read_csv(clips_tsv, sep='\t', dtype={
            'id': str,
            'name': str
        }).to_dict('records')
        self.clips = []
        for record in df_list:
            record['id'] = record['id']
            record['name'] = record['name']
            id = record['id']
            name = record['name']
            if name not in self.captions:
                log.warning(f'Audio {name} not found in {captions_tsv}')
                continue
            record['caption'] = self.captions[name]
            self.clips.append(record)

        log.info(f'Found {len(self.clips)} audio files in {self.root}')

        self.resampler = {}

    def __getitem__(self, idx: int) -> torch.Tensor:
        try:
            clip = self.clips[idx]
            audio_name = clip['name']
            audio_id = clip['id']
            caption = clip['caption']
            start_sample = clip['start_sample']
            end_sample = clip['end_sample']

            audio_path = self.root / f'{audio_name}.flac'
            if not audio_path.exists():
                audio_path = self.root / f'{audio_name}.wav'
                assert audio_path.exists()

            audio_chunk, sample_rate = torchaudio.load(audio_path)
            audio_chunk = audio_chunk.mean(dim=0)  # mono
            abs_max = audio_chunk.abs().max()
            if self.normalize_audio:
                audio_chunk = audio_chunk / abs_max * 0.95

            if self.reject_silent and abs_max < 1e-6:
                log.warning(f'Rejecting silent audio')
                return None

            audio_chunk = audio_chunk[start_sample:end_sample]

            # resample
            if sample_rate == self.sample_rate:
                audio_chunk = audio_chunk
            else:
                if sample_rate not in self.resampler:
                    # https://pytorch.org/audio/stable/tutorials/audio_resampling_tutorial.html#kaiser-best
                    self.resampler[sample_rate] = torchaudio.transforms.Resample(
                        sample_rate,
                        self.sample_rate,
                        lowpass_filter_width=64,
                        rolloff=0.9475937167399596,
                        resampling_method='sinc_interp_kaiser',
                        beta=14.769656459379492,
                    )
                audio_chunk = self.resampler[sample_rate](audio_chunk)

            if audio_chunk.shape[0] < self.num_samples:
                raise ValueError('Audio is too short')
            audio_chunk = audio_chunk[:self.num_samples]

            tokens = self.tokenizer([caption])[0]

            output = {
                'waveform': audio_chunk,
                'id': audio_id,
                'caption': caption,
                'tokens': tokens,
            }

            return output
        except Exception as e:
            log.error(f'Error reading {audio_path}: {e}')
            return None

    def __len__(self):
        return len(self.clips)
third_party/MMAudio/mmaudio/data/mm_dataset.py
ADDED
@@ -0,0 +1,45 @@
import bisect

import torch
from torch.utils.data.dataset import Dataset


# modified from https://pytorch.org/docs/stable/_modules/torch/utils/data/dataset.html#ConcatDataset
class MultiModalDataset(Dataset):
    datasets: list[Dataset]
    cumulative_sizes: list[int]

    @staticmethod
    def cumsum(sequence):
        r, s = [], 0
        for e in sequence:
            l = len(e)
            r.append(l + s)
            s += l
        return r

    def __init__(self, video_datasets: list[Dataset], audio_datasets: list[Dataset]):
        super().__init__()
        self.video_datasets = list(video_datasets)
        self.audio_datasets = list(audio_datasets)
        self.datasets = self.video_datasets + self.audio_datasets

        self.cumulative_sizes = self.cumsum(self.datasets)

    def __len__(self):
        return self.cumulative_sizes[-1]

    def __getitem__(self, idx):
        if idx < 0:
            if -idx > len(self):
                raise ValueError("absolute value of index should not exceed dataset length")
            idx = len(self) + idx
        dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
        if dataset_idx == 0:
            sample_idx = idx
        else:
            sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
        return self.datasets[dataset_idx][sample_idx]

    def compute_latent_stats(self) -> tuple[torch.Tensor, torch.Tensor]:
        return self.video_datasets[0].compute_latent_stats()
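`MultiModalDataset` concatenates the video datasets and the audio datasets into one indexable dataset and routes a global index to the right member via the cumulative sizes, exactly like `ConcatDataset`. A toy sketch of the index routing (the plain lists below just stand in for the real datasets):

```python
# Toy illustration of the cumulative-size index routing used above.
import bisect

sizes = [3, 5, 2]            # lengths of three concatenated datasets
cumulative, s = [], 0
for length in sizes:
    s += length
    cumulative.append(s)     # -> [3, 8, 10]

idx = 6                      # a global sample index
dataset_idx = bisect.bisect_right(cumulative, idx)                       # -> 1
sample_idx = idx - (cumulative[dataset_idx - 1] if dataset_idx else 0)   # -> 3
print(dataset_idx, sample_idx)  # sample 3 of the second dataset
```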
third_party/MMAudio/mmaudio/data/utils.py
ADDED
@@ -0,0 +1,148 @@
import logging
import os
import random
import tempfile
from pathlib import Path
from typing import Any, Optional, Union

import torch
import torch.distributed as dist
from tensordict import MemoryMappedTensor
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset
from tqdm import tqdm

from mmaudio.utils.dist_utils import local_rank, world_size

scratch_path = Path(os.environ['SLURM_SCRATCH'] if 'SLURM_SCRATCH' in os.environ else '/dev/shm')
shm_path = Path('/dev/shm')

log = logging.getLogger()


def reseed(seed):
    random.seed(seed)
    torch.manual_seed(seed)


def local_scatter_torch(obj: Optional[Any]):
    if world_size == 1:
        # Just one worker. Do nothing.
        return obj

    array = [obj] * world_size
    target_array = [None]
    if local_rank == 0:
        dist.scatter_object_list(target_array, scatter_object_input_list=array, src=0)
    else:
        dist.scatter_object_list(target_array, scatter_object_input_list=None, src=0)
    return target_array[0]


class ShardDataset(Dataset):

    def __init__(self, root):
        self.root = root
        self.shards = sorted(os.listdir(root))

    def __len__(self):
        return len(self.shards)

    def __getitem__(self, idx):
        return torch.load(os.path.join(self.root, self.shards[idx]), weights_only=True)


def get_tmp_dir(in_memory: bool) -> Path:
    return shm_path if in_memory else scratch_path


def load_shards_and_share(data_path: Union[str, Path], ids: list[int],
                          in_memory: bool) -> MemoryMappedTensor:
    if local_rank == 0:
        with tempfile.NamedTemporaryFile(prefix='shared-tensor-', dir=get_tmp_dir(in_memory)) as f:
            log.info(f'Loading shards from {data_path} into {f.name}...')
            data = load_shards(data_path, ids=ids, tmp_file_path=f.name)
            data = share_tensor_to_all(data)
            torch.distributed.barrier()
            f.close()  # why does the context manager not close the file for me?
    else:
        log.info('Waiting for the data to be shared with me...')
        data = share_tensor_to_all(None)
        torch.distributed.barrier()

    return data


def load_shards(
    data_path: Union[str, Path],
    ids: list[int],
    *,
    tmp_file_path: str,
) -> Union[torch.Tensor, dict[str, torch.Tensor]]:

    id_set = set(ids)
    shards = sorted(os.listdir(data_path))
    log.info(f'Found {len(shards)} shards in {data_path}.')
    first_shard = torch.load(os.path.join(data_path, shards[0]), weights_only=True)

    log.info(f'Rank {local_rank} created file {tmp_file_path}')
    first_item = next(iter(first_shard.values()))
    log.info(f'First item shape: {first_item.shape}')
    mm_tensor = MemoryMappedTensor.empty(shape=(len(ids), *first_item.shape),
                                         dtype=torch.float32,
                                         filename=tmp_file_path,
                                         existsok=True)
    total_count = 0
    used_index = set()
    id_indexing = {i: idx for idx, i in enumerate(ids)}
    # faster with no workers; otherwise we need to set_sharing_strategy('file_system')
    loader = DataLoader(ShardDataset(data_path), batch_size=1, num_workers=0)
    for data in tqdm(loader, desc='Loading shards'):
        for i, v in data.items():
            if i not in id_set:
                continue

            # tensor_index = ids.index(i)
            tensor_index = id_indexing[i]
            if tensor_index in used_index:
                raise ValueError(f'Duplicate id {i} found in {data_path}.')
            used_index.add(tensor_index)
            mm_tensor[tensor_index] = v
            total_count += 1

    assert total_count == len(ids), f'Expected {len(ids)} tensors, got {total_count}.'
    log.info(f'Loaded {total_count} tensors from {data_path}.')

    return mm_tensor


def share_tensor_to_all(x: Optional[MemoryMappedTensor]) -> MemoryMappedTensor:
    """
    x: the tensor to be shared; None if local_rank != 0
    return: the shared tensor
    """

    # there is no need to share your stuff with anyone if you are alone; must be in memory
    if world_size == 1:
        return x

    if local_rank == 0:
        assert x is not None, 'x must not be None if local_rank == 0'
    else:
        assert x is None, 'x must be None if local_rank != 0'

    if local_rank == 0:
        filename = x.filename
        meta_information = (filename, x.shape, x.dtype)
    else:
        meta_information = None

    filename, data_shape, data_type = local_scatter_torch(meta_information)
    if local_rank == 0:
        data = x
    else:
        data = MemoryMappedTensor.from_filename(filename=filename,
                                                dtype=data_type,
                                                shape=data_shape)

    return data
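`load_shards` merges per-shard dictionaries of `{id: tensor}` into one memory-mapped tensor ordered by the `ids` list, and `load_shards_and_share` then broadcasts that file's location to the other ranks. A single-process sketch of the merging step follows; the shard directory, ids, tensor sizes and output filename are made up for illustration, and the real pipeline runs this under `torch.distributed`.

```python
# Hypothetical single-process sketch of the shard-merging path in load_shards.
from pathlib import Path

import torch

from mmaudio.data.utils import load_shards  # import path assumed from this layout

shard_dir = Path('tmp_shards')              # placeholder directory
shard_dir.mkdir(exist_ok=True)
torch.save({0: torch.randn(8), 1: torch.randn(8)}, shard_dir / 'shard_0.pth')
torch.save({2: torch.randn(8), 3: torch.randn(8)}, shard_dir / 'shard_1.pth')

# Rows of the memory-mapped output follow the order of ids=[0, 1, 2, 3].
mm = load_shards(shard_dir, ids=[0, 1, 2, 3], tmp_file_path='merged.mmap')
print(mm.shape)  # torch.Size([4, 8])
```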