soiz1 commited on
Commit
20d6bb2
·
verified ·
1 Parent(s): ffe15d6

Upload 123 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +9 -0
  2. .gitignore +23 -0
  3. EVAL.md +121 -0
  4. LICENSE +674 -0
  5. README-JA.md +210 -0
  6. app.py +362 -0
  7. app_svc.py +437 -0
  8. app_vc.py +390 -0
  9. assets/real-time-demo.webm +3 -0
  10. baselines/cosyvoice.py +24 -0
  11. baselines/dnsmos/dnsmos_computor.py +130 -0
  12. baselines/dnsmos/model_v8.onnx +3 -0
  13. baselines/dnsmos/sig_bak_ovr.onnx +3 -0
  14. baselines/openvoice.py +29 -0
  15. campplus_cn_common.bin +3 -0
  16. conda-nix-vc-py310.yaml +25 -0
  17. configs/config.json +1 -0
  18. configs/hifigan.yml +25 -0
  19. configs/presets/config_dit_mel_seed_uvit_whisper_base_f0_44k.yml +98 -0
  20. configs/presets/config_dit_mel_seed_uvit_whisper_small_wavenet.yml +91 -0
  21. configs/presets/config_dit_mel_seed_uvit_xlsr_tiny.yml +82 -0
  22. dac/__init__.py +16 -0
  23. dac/__main__.py +36 -0
  24. dac/model/__init__.py +4 -0
  25. dac/model/base.py +294 -0
  26. dac/model/dac.py +400 -0
  27. dac/model/discriminator.py +228 -0
  28. dac/model/encodec.py +320 -0
  29. dac/nn/__init__.py +3 -0
  30. dac/nn/layers.py +33 -0
  31. dac/nn/loss.py +368 -0
  32. dac/nn/quantize.py +339 -0
  33. dac/utils/__init__.py +123 -0
  34. dac/utils/decode.py +95 -0
  35. dac/utils/encode.py +94 -0
  36. data/ft_dataset.py +133 -0
  37. eval.py +550 -0
  38. examples/reference/azuma_0.wav +0 -0
  39. examples/reference/dingzhen_0.wav +3 -0
  40. examples/reference/s1p1.wav +0 -0
  41. examples/reference/s1p2.wav +0 -0
  42. examples/reference/s2p1.wav +0 -0
  43. examples/reference/s2p2.wav +0 -0
  44. examples/reference/s3p1.wav +0 -0
  45. examples/reference/s3p2.wav +3 -0
  46. examples/reference/s4p1.wav +0 -0
  47. examples/reference/s4p2.wav +0 -0
  48. examples/reference/teio_0.wav +0 -0
  49. examples/reference/trump_0.wav +3 -0
  50. examples/source/TECHNOPOLIS - 2085 [vocals]_[cut_14sec].wav +3 -0
.gitattributes CHANGED
@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/real-time-demo.webm filter=lfs diff=lfs merge=lfs -text
37
+ examples/reference/dingzhen_0.wav filter=lfs diff=lfs merge=lfs -text
38
+ examples/reference/s3p2.wav filter=lfs diff=lfs merge=lfs -text
39
+ examples/reference/trump_0.wav filter=lfs diff=lfs merge=lfs -text
40
+ examples/source/jay_0.wav filter=lfs diff=lfs merge=lfs -text
41
+ examples/source/source_s3.wav filter=lfs diff=lfs merge=lfs -text
42
+ examples/source/source_s4.wav filter=lfs diff=lfs merge=lfs -text
43
+ examples/source/TECHNOPOLIS[[:space:]]-[[:space:]]2085[[:space:]]\[vocals\]_\[cut_14sec\].wav filter=lfs diff=lfs merge=lfs -text
44
+ examples/source/Wiz[[:space:]]Khalifa,Charlie[[:space:]]Puth[[:space:]]-[[:space:]]See[[:space:]]You[[:space:]]Again[[:space:]]\[vocals\]_\[cut_28sec\].wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # general things to ignore
2
+ .DS_Store
3
+ build/
4
+ build_contrib/
5
+ dist/
6
+ .cache/
7
+ *.egg-info/
8
+ *.egg
9
+ *.py[cod]
10
+ __pycache__/
11
+ *.so
12
+ *~
13
+
14
+ # IDE
15
+ .vscode/
16
+
17
+ # misc
18
+ checkpoints/
19
+ test_waves/
20
+ reconstructed/
21
+ .python-version
22
+ ruff.log
23
+ /configs/inuse/
EVAL.md ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### Zero-shot voice conversion🎙🔁
2
+ We have performed a series of objective evaluations on our Seed-VC's voice conversion capabilities.
3
+ For ease of reproduction, source audios are 100 random utterances from LibriTTS-test-clean, and reference audios are 12 randomly picked in-the-wild voices with unique characteristics. <br>
4
+
5
+ Source audios can be found under `./examples/libritts-test-clean` <br>
6
+ Reference audios can be found under `./examples/reference` <br>
7
+
8
+ We evaluate the conversion results in terms of speaker embedding cosine similarity (SECS), word error rate (WER) and character error rate (CER) and compared
9
+ our results with two strong open sourced baselines, namely [OpenVoice](https://github.com/myshell-ai/OpenVoice) and [CosyVoice](https://github.com/FunAudioLLM/CosyVoice).
10
+ Results in the table below shows that our Seed-VC model significantly outperforms the baseline models in both intelligibility and speaker similarity.<br>
11
+
12
+ | Models\Metrics | SECS↑ | WER↓ | CER↓ | SIG↑ | BAK↑ | OVRL↑ |
13
+ |----------------|------------|-----------|----------|----------|----------|----------|
14
+ | Ground Truth | 1.0000 | 8.02 | 1.57 | ~ | ~ | ~ |
15
+ | OpenVoice | 0.7547 | 15.46 | 4.73 | **3.56** | **4.02** | **3.27** |
16
+ | CosyVoice | 0.8440 | 18.98 | 7.29 | 3.51 | **4.02** | 3.21 |
17
+ | Seed-VC(Ours) | **0.8676** | **11.99** | **2.92** | 3.42 | 3.97 | 3.11 |
18
+
19
+ We have also compared with non-zero-shot voice conversion models for several speakers (based on model availability):
20
+
21
+ | Characters | Models\Metrics | SECS↑ | WER↓ | CER↓ | SIG↑ | BAK↑ | OVRL↑ |
22
+ |---------------------|----------------|------------|-----------|----------|----------|----------|----------|
23
+ | ~ | Ground Truth | 1.0000 | 6.43 | 1.00 | ~ | ~ | ~ |
24
+ | Tokai Teio | So-VITS-4.0 | 0.8637 | 21.46 | 9.63 | 3.06 | 3.66 | 2.68 |
25
+ | | Seed-VC(Ours) | **0.8899** | **15.32** | **4.66** | **3.12** | **3.71** | **2.72** |
26
+ | Milky Green | So-VITS-4.0 | 0.6850 | 48.43 | 32.50 | 3.34 | 3.51 | 2.82 |
27
+ | | Seed-VC(Ours) | **0.8072** | **7.26** | **1.32** | **3.48** | **4.07** | **3.20** |
28
+ | Matikane Tannhuaser | So-VITS-4.0 | 0.8594 | 16.25 | 8.64 | **3.25** | 3.71 | 2.84 |
29
+ | | Seed-VC(Ours) | **0.8768** | **12.62** | **5.86** | 3.18 | **3.83** | **2.85** |
30
+
31
+ Results show that, despite not being trained on the target speakers, Seed-VC is able to achieve significantly better results than the non-zero-shot models.
32
+ However, this may vary a lot depending on the SoVITS model quality. PR or Issue is welcomed if you find this comparison unfair or inaccurate.
33
+ (Tokai Teio model from [zomehwh/sovits-tannhauser](https://huggingface.co/spaces/zomehwh/sovits-tannhauser))
34
+ (Matikane Tannhuaser model from [zomehwh/sovits-tannhauser](https://huggingface.co/spaces/zomehwh/sovits-tannhauser))
35
+ (Milky Green model from [sparanoid/milky-green-sovits-4](https://huggingface.co/spaces/sparanoid/milky-green-sovits-4))
36
+
37
+ *English ASR result computed by [facebook/hubert-large-ls960-ft](https://huggingface.co/facebook/hubert-large-ls960-ft) model*
38
+ *Speaker embedding computed by [resemblyzer](https://github.com/resemble-ai/Resemblyzer) model* <br>
39
+
40
+ You can reproduce the evaluation by running `eval.py` script.
41
+ ```bash
42
+ python eval.py
43
+ --source ./examples/libritts-test-clean
44
+ --target ./examples/reference
45
+ --output ./examples/eval/converted
46
+ --diffusion-steps 25
47
+ --length-adjust 1.0
48
+ --inference-cfg-rate 0.7
49
+ --xvector-extractor "resemblyzer"
50
+ --baseline "" # fill in openvoice or cosyvoice to compute baseline result
51
+ --max-samples 100 # max source utterances to go through
52
+ ```
53
+ Before that, make sure you have openvoice and cosyvoice repo correctly installed on `../OpenVoice/` and `../CosyVoice/` if you would like to run baseline evaluation.
54
+
55
+ ### Zero-shot singing voice conversion🎤🎶
56
+
57
+ Additional singing voice conversion evaluation is done on [M4Singer](https://github.com/M4Singer/M4Singer) dataset, with 4 target speakers whose audio data is available [here](https://huggingface.co/datasets/XzJosh/audiodataset).
58
+ Speaker similariy is calculated by averaging the cosine similarities between conversion result and all available samples in respective character dataset.
59
+ For each character, one random utterance is chosen as the prompt for zero-shot inference. For comparison, we trained respective [RVCv2-f0-48k](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI) model for each character as baseline.
60
+ 100 random utterances for each singer type are used as source audio.
61
+
62
+ | Models\Metrics | F0CORR↑ | F0RMSE↓ | SECS↑ | CER↓ | SIG↑ | BAK↑ | OVRL↑ |
63
+ |----------------|---------|---------|------------|-----------|----------|----------|----------|
64
+ | RVCv2 | 0.9404 | 30.43 | 0.7264 | 28.46 | **3.41** | **4.05** | **3.12** |
65
+ | Seed-VC(Ours) | 0.9375 | 33.35 | **0.7405** | **19.70** | 3.39 | 3.96 | 3.06 |
66
+
67
+ <details>
68
+ <summary>Click to expand detailed evaluation results</summary>
69
+
70
+ | Source Singer Type | Characters | Models\Metrics | F0CORR↑ | F0RMSE↓ | SECS↑ | CER↓ | SIG↑ | BAK↑ | OVRL↑ |
71
+ |--------------------|--------------------|----------------|---------|---------|------------|-----------|------|------|----------|
72
+ | Alto (Female) | ~ | Ground Truth | 1.0000 | 0.00 | ~ | 8.16 | ~ | ~ | ~ |
73
+ | | Azuma (Female) | RVCv2 | 0.9617 | 33.03 | **0.7352** | 24.70 | 3.36 | 4.07 | 3.07 |
74
+ | | | Seed-VC(Ours) | 0.9658 | 31.64 | 0.7341 | **15.23** | 3.37 | 4.02 | 3.07 |
75
+ | | Diana (Female) | RVCv2 | 0.9626 | 32.56 | 0.7212 | 19.67 | 3.45 | 4.08 | **3.17** |
76
+ | | | Seed-VC(Ours) | 0.9648 | 31.94 | **0.7457** | **16.81** | 3.49 | 3.99 | 3.15 |
77
+ | | Ding Zhen (Male) | RVCv2 | 0.9013 | 26.72 | 0.7221 | 18.53 | 3.37 | 4.03 | 3.06 |
78
+ | | | Seed-VC(Ours) | 0.9356 | 21.87 | **0.7513** | **15.63** | 3.44 | 3.94 | **3.09** |
79
+ | | Kobe Bryant (Male) | RVCv2 | 0.9215 | 23.90 | 0.7495 | 37.23 | 3.49 | 4.06 | **3.21** |
80
+ | | | Seed-VC(Ours) | 0.9248 | 23.40 | **0.7602** | **26.98** | 3.43 | 4.02 | 3.13 |
81
+ | Bass (Male) | ~ | Ground Truth | 1.0000 | 0.00 | ~ | 8.62 | ~ | ~ | ~ |
82
+ | | Azuma | RVCv2 | 0.9288 | 32.62 | **0.7148** | 24.88 | 3.45 | 4.10 | **3.18** |
83
+ | | | Seed-VC(Ours) | 0.9383 | 31.57 | 0.6960 | **10.31** | 3.45 | 4.03 | 3.15 |
84
+ | | Diana | RVCv2 | 0.9403 | 30.00 | 0.7010 | 14.54 | 3.53 | 4.15 | **3.27** |
85
+ | | | Seed-VC(Ours) | 0.9428 | 30.06 | **0.7299** | **9.66** | 3.53 | 4.11 | 3.25 |
86
+ | | Ding Zhen | RVCv2 | 0.9061 | 19.53 | 0.6922 | 25.99 | 3.36 | 4.09 | **3.08** |
87
+ | | | Seed-VC(Ours) | 0.9169 | 18.15 | **0.7260** | **14.13** | 3.38 | 3.98 | 3.07 |
88
+ | | Kobe Bryant | RVCv2 | 0.9302 | 16.37 | 0.7717 | 41.04 | 3.51 | 4.13 | **3.25** |
89
+ | | | Seed-VC(Ours) | 0.9176 | 17.93 | **0.7798** | **24.23** | 3.42 | 4.08 | 3.17 |
90
+ | Soprano (Female) | ~ | Ground Truth | 1.0000 | 0.00 | ~ | 27.92 | ~ | ~ | ~ |
91
+ | | Azuma | RVCv2 | 0.9742 | 47.80 | 0.7104 | 38.70 | 3.14 | 3.85 | **2.83** |
92
+ | | | Seed-VC(Ours) | 0.9521 | 64.00 | **0.7177** | **33.10** | 3.15 | 3.86 | 2.81 |
93
+ | | Diana | RVCv2 | 0.9754 | 46.59 | **0.7319** | 32.36 | 3.14 | 3.85 | **2.83** |
94
+ | | | Seed-VC(Ours) | 0.9573 | 59.70 | 0.7317 | **30.57** | 3.11 | 3.78 | 2.74 |
95
+ | | Ding Zhen | RVCv2 | 0.9543 | 31.45 | 0.6792 | 40.80 | 3.41 | 4.08 | **3.14** |
96
+ | | | Seed-VC(Ours) | 0.9486 | 33.37 | **0.6979** | **34.45** | 3.41 | 3.97 | 3.10 |
97
+ | | Kobe Bryant | RVCv2 | 0.9691 | 25.50 | 0.6276 | 61.59 | 3.43 | 4.04 | **3.15** |
98
+ | | | Seed-VC(Ours) | 0.9496 | 32.76 | **0.6683** | **39.82** | 3.32 | 3.98 | 3.04 |
99
+ | Tenor (Male) | ~ | Ground Truth | 1.0000 | 0.00 | ~ | 5.94 | ~ | ~ | ~ |
100
+ | | Azuma | RVCv2 | 0.9333 | 42.09 | **0.7832** | 16.66 | 3.46 | 4.07 | **3.18** |
101
+ | | | Seed-VC(Ours) | 0.9162 | 48.06 | 0.7697 | **8.48** | 3.38 | 3.89 | 3.01 |
102
+ | | Diana | RVCv2 | 0.9467 | 36.65 | 0.7729 | 15.28 | 3.53 | 4.08 | **3.24** |
103
+ | | | Seed-VC(Ours) | 0.9360 | 41.49 | **0.7920** | **8.55** | 3.49 | 3.93 | 3.13 |
104
+ | | Ding Zhen | RVCv2 | 0.9197 | 22.82 | 0.7591 | 12.92 | 3.40 | 4.02 | **3.09** |
105
+ | | | Seed-VC(Ours) | 0.9247 | 22.77 | **0.7721** | **13.95** | 3.45 | 3.82 | 3.05 |
106
+ | | Kobe Bryant | RVCv2 | 0.9415 | 19.33 | 0.7507 | 30.52 | 3.48 | 4.02 | **3.19** |
107
+ | | | Seed-VC(Ours) | 0.9082 | 24.86 | **0.7764** | **13.35** | 3.39 | 3.93 | 3.07 |
108
+ </details>
109
+
110
+
111
+ Despite Seed-VC is not trained on the target speakers, and only one random utterance is used as prompt, it still constantly outperforms speaker-specific RVCv2 models
112
+ in terms of speaker similarity (SECS) and intelligibility (CER), which demonstrates the superior voice cloning capability and robustness of Seed-VC.
113
+
114
+ However, it is observed that Seed-VC's audio quality (DNSMOS) is slightly lower than RVCv2. We take this drawback seriously and
115
+ will give high priority to improve the audio quality in the future.
116
+ PR or issue is welcomed if you find this comparison unfair or inaccurate.
117
+
118
+ *Chinese ASR result computed by [SenseVoiceSmall](https://github.com/FunAudioLLM/SenseVoice)*
119
+ *Speaker embedding computed by [resemblyzer](https://github.com/resemble-ai/Resemblyzer) model*
120
+ *We set +12 semitones pitch shift for male-to-female conversion and -12 semitones for female-to-male converison, otherwise 0 pitch shift*
121
+
LICENSE ADDED
@@ -0,0 +1,674 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ GNU GENERAL PUBLIC LICENSE
2
+ Version 3, 29 June 2007
3
+
4
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5
+ Everyone is permitted to copy and distribute verbatim copies
6
+ of this license document, but changing it is not allowed.
7
+
8
+ Preamble
9
+
10
+ The GNU General Public License is a free, copyleft license for
11
+ software and other kinds of works.
12
+
13
+ The licenses for most software and other practical works are designed
14
+ to take away your freedom to share and change the works. By contrast,
15
+ the GNU General Public License is intended to guarantee your freedom to
16
+ share and change all versions of a program--to make sure it remains free
17
+ software for all its users. We, the Free Software Foundation, use the
18
+ GNU General Public License for most of our software; it applies also to
19
+ any other work released this way by its authors. You can apply it to
20
+ your programs, too.
21
+
22
+ When we speak of free software, we are referring to freedom, not
23
+ price. Our General Public Licenses are designed to make sure that you
24
+ have the freedom to distribute copies of free software (and charge for
25
+ them if you wish), that you receive source code or can get it if you
26
+ want it, that you can change the software or use pieces of it in new
27
+ free programs, and that you know you can do these things.
28
+
29
+ To protect your rights, we need to prevent others from denying you
30
+ these rights or asking you to surrender the rights. Therefore, you have
31
+ certain responsibilities if you distribute copies of the software, or if
32
+ you modify it: responsibilities to respect the freedom of others.
33
+
34
+ For example, if you distribute copies of such a program, whether
35
+ gratis or for a fee, you must pass on to the recipients the same
36
+ freedoms that you received. You must make sure that they, too, receive
37
+ or can get the source code. And you must show them these terms so they
38
+ know their rights.
39
+
40
+ Developers that use the GNU GPL protect your rights with two steps:
41
+ (1) assert copyright on the software, and (2) offer you this License
42
+ giving you legal permission to copy, distribute and/or modify it.
43
+
44
+ For the developers' and authors' protection, the GPL clearly explains
45
+ that there is no warranty for this free software. For both users' and
46
+ authors' sake, the GPL requires that modified versions be marked as
47
+ changed, so that their problems will not be attributed erroneously to
48
+ authors of previous versions.
49
+
50
+ Some devices are designed to deny users access to install or run
51
+ modified versions of the software inside them, although the manufacturer
52
+ can do so. This is fundamentally incompatible with the aim of
53
+ protecting users' freedom to change the software. The systematic
54
+ pattern of such abuse occurs in the area of products for individuals to
55
+ use, which is precisely where it is most unacceptable. Therefore, we
56
+ have designed this version of the GPL to prohibit the practice for those
57
+ products. If such problems arise substantially in other domains, we
58
+ stand ready to extend this provision to those domains in future versions
59
+ of the GPL, as needed to protect the freedom of users.
60
+
61
+ Finally, every program is threatened constantly by software patents.
62
+ States should not allow patents to restrict development and use of
63
+ software on general-purpose computers, but in those that do, we wish to
64
+ avoid the special danger that patents applied to a free program could
65
+ make it effectively proprietary. To prevent this, the GPL assures that
66
+ patents cannot be used to render the program non-free.
67
+
68
+ The precise terms and conditions for copying, distribution and
69
+ modification follow.
70
+
71
+ TERMS AND CONDITIONS
72
+
73
+ 0. Definitions.
74
+
75
+ "This License" refers to version 3 of the GNU General Public License.
76
+
77
+ "Copyright" also means copyright-like laws that apply to other kinds of
78
+ works, such as semiconductor masks.
79
+
80
+ "The Program" refers to any copyrightable work licensed under this
81
+ License. Each licensee is addressed as "you". "Licensees" and
82
+ "recipients" may be individuals or organizations.
83
+
84
+ To "modify" a work means to copy from or adapt all or part of the work
85
+ in a fashion requiring copyright permission, other than the making of an
86
+ exact copy. The resulting work is called a "modified version" of the
87
+ earlier work or a work "based on" the earlier work.
88
+
89
+ A "covered work" means either the unmodified Program or a work based
90
+ on the Program.
91
+
92
+ To "propagate" a work means to do anything with it that, without
93
+ permission, would make you directly or secondarily liable for
94
+ infringement under applicable copyright law, except executing it on a
95
+ computer or modifying a private copy. Propagation includes copying,
96
+ distribution (with or without modification), making available to the
97
+ public, and in some countries other activities as well.
98
+
99
+ To "convey" a work means any kind of propagation that enables other
100
+ parties to make or receive copies. Mere interaction with a user through
101
+ a computer network, with no transfer of a copy, is not conveying.
102
+
103
+ An interactive user interface displays "Appropriate Legal Notices"
104
+ to the extent that it includes a convenient and prominently visible
105
+ feature that (1) displays an appropriate copyright notice, and (2)
106
+ tells the user that there is no warranty for the work (except to the
107
+ extent that warranties are provided), that licensees may convey the
108
+ work under this License, and how to view a copy of this License. If
109
+ the interface presents a list of user commands or options, such as a
110
+ menu, a prominent item in the list meets this criterion.
111
+
112
+ 1. Source Code.
113
+
114
+ The "source code" for a work means the preferred form of the work
115
+ for making modifications to it. "Object code" means any non-source
116
+ form of a work.
117
+
118
+ A "Standard Interface" means an interface that either is an official
119
+ standard defined by a recognized standards body, or, in the case of
120
+ interfaces specified for a particular programming language, one that
121
+ is widely used among developers working in that language.
122
+
123
+ The "System Libraries" of an executable work include anything, other
124
+ than the work as a whole, that (a) is included in the normal form of
125
+ packaging a Major Component, but which is not part of that Major
126
+ Component, and (b) serves only to enable use of the work with that
127
+ Major Component, or to implement a Standard Interface for which an
128
+ implementation is available to the public in source code form. A
129
+ "Major Component", in this context, means a major essential component
130
+ (kernel, window system, and so on) of the specific operating system
131
+ (if any) on which the executable work runs, or a compiler used to
132
+ produce the work, or an object code interpreter used to run it.
133
+
134
+ The "Corresponding Source" for a work in object code form means all
135
+ the source code needed to generate, install, and (for an executable
136
+ work) run the object code and to modify the work, including scripts to
137
+ control those activities. However, it does not include the work's
138
+ System Libraries, or general-purpose tools or generally available free
139
+ programs which are used unmodified in performing those activities but
140
+ which are not part of the work. For example, Corresponding Source
141
+ includes interface definition files associated with source files for
142
+ the work, and the source code for shared libraries and dynamically
143
+ linked subprograms that the work is specifically designed to require,
144
+ such as by intimate data communication or control flow between those
145
+ subprograms and other parts of the work.
146
+
147
+ The Corresponding Source need not include anything that users
148
+ can regenerate automatically from other parts of the Corresponding
149
+ Source.
150
+
151
+ The Corresponding Source for a work in source code form is that
152
+ same work.
153
+
154
+ 2. Basic Permissions.
155
+
156
+ All rights granted under this License are granted for the term of
157
+ copyright on the Program, and are irrevocable provided the stated
158
+ conditions are met. This License explicitly affirms your unlimited
159
+ permission to run the unmodified Program. The output from running a
160
+ covered work is covered by this License only if the output, given its
161
+ content, constitutes a covered work. This License acknowledges your
162
+ rights of fair use or other equivalent, as provided by copyright law.
163
+
164
+ You may make, run and propagate covered works that you do not
165
+ convey, without conditions so long as your license otherwise remains
166
+ in force. You may convey covered works to others for the sole purpose
167
+ of having them make modifications exclusively for you, or provide you
168
+ with facilities for running those works, provided that you comply with
169
+ the terms of this License in conveying all material for which you do
170
+ not control copyright. Those thus making or running the covered works
171
+ for you must do so exclusively on your behalf, under your direction
172
+ and control, on terms that prohibit them from making any copies of
173
+ your copyrighted material outside their relationship with you.
174
+
175
+ Conveying under any other circumstances is permitted solely under
176
+ the conditions stated below. Sublicensing is not allowed; section 10
177
+ makes it unnecessary.
178
+
179
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180
+
181
+ No covered work shall be deemed part of an effective technological
182
+ measure under any applicable law fulfilling obligations under article
183
+ 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184
+ similar laws prohibiting or restricting circumvention of such
185
+ measures.
186
+
187
+ When you convey a covered work, you waive any legal power to forbid
188
+ circumvention of technological measures to the extent such circumvention
189
+ is effected by exercising rights under this License with respect to
190
+ the covered work, and you disclaim any intention to limit operation or
191
+ modification of the work as a means of enforcing, against the work's
192
+ users, your or third parties' legal rights to forbid circumvention of
193
+ technological measures.
194
+
195
+ 4. Conveying Verbatim Copies.
196
+
197
+ You may convey verbatim copies of the Program's source code as you
198
+ receive it, in any medium, provided that you conspicuously and
199
+ appropriately publish on each copy an appropriate copyright notice;
200
+ keep intact all notices stating that this License and any
201
+ non-permissive terms added in accord with section 7 apply to the code;
202
+ keep intact all notices of the absence of any warranty; and give all
203
+ recipients a copy of this License along with the Program.
204
+
205
+ You may charge any price or no price for each copy that you convey,
206
+ and you may offer support or warranty protection for a fee.
207
+
208
+ 5. Conveying Modified Source Versions.
209
+
210
+ You may convey a work based on the Program, or the modifications to
211
+ produce it from the Program, in the form of source code under the
212
+ terms of section 4, provided that you also meet all of these conditions:
213
+
214
+ a) The work must carry prominent notices stating that you modified
215
+ it, and giving a relevant date.
216
+
217
+ b) The work must carry prominent notices stating that it is
218
+ released under this License and any conditions added under section
219
+ 7. This requirement modifies the requirement in section 4 to
220
+ "keep intact all notices".
221
+
222
+ c) You must license the entire work, as a whole, under this
223
+ License to anyone who comes into possession of a copy. This
224
+ License will therefore apply, along with any applicable section 7
225
+ additional terms, to the whole of the work, and all its parts,
226
+ regardless of how they are packaged. This License gives no
227
+ permission to license the work in any other way, but it does not
228
+ invalidate such permission if you have separately received it.
229
+
230
+ d) If the work has interactive user interfaces, each must display
231
+ Appropriate Legal Notices; however, if the Program has interactive
232
+ interfaces that do not display Appropriate Legal Notices, your
233
+ work need not make them do so.
234
+
235
+ A compilation of a covered work with other separate and independent
236
+ works, which are not by their nature extensions of the covered work,
237
+ and which are not combined with it such as to form a larger program,
238
+ in or on a volume of a storage or distribution medium, is called an
239
+ "aggregate" if the compilation and its resulting copyright are not
240
+ used to limit the access or legal rights of the compilation's users
241
+ beyond what the individual works permit. Inclusion of a covered work
242
+ in an aggregate does not cause this License to apply to the other
243
+ parts of the aggregate.
244
+
245
+ 6. Conveying Non-Source Forms.
246
+
247
+ You may convey a covered work in object code form under the terms
248
+ of sections 4 and 5, provided that you also convey the
249
+ machine-readable Corresponding Source under the terms of this License,
250
+ in one of these ways:
251
+
252
+ a) Convey the object code in, or embodied in, a physical product
253
+ (including a physical distribution medium), accompanied by the
254
+ Corresponding Source fixed on a durable physical medium
255
+ customarily used for software interchange.
256
+
257
+ b) Convey the object code in, or embodied in, a physical product
258
+ (including a physical distribution medium), accompanied by a
259
+ written offer, valid for at least three years and valid for as
260
+ long as you offer spare parts or customer support for that product
261
+ model, to give anyone who possesses the object code either (1) a
262
+ copy of the Corresponding Source for all the software in the
263
+ product that is covered by this License, on a durable physical
264
+ medium customarily used for software interchange, for a price no
265
+ more than your reasonable cost of physically performing this
266
+ conveying of source, or (2) access to copy the
267
+ Corresponding Source from a network server at no charge.
268
+
269
+ c) Convey individual copies of the object code with a copy of the
270
+ written offer to provide the Corresponding Source. This
271
+ alternative is allowed only occasionally and noncommercially, and
272
+ only if you received the object code with such an offer, in accord
273
+ with subsection 6b.
274
+
275
+ d) Convey the object code by offering access from a designated
276
+ place (gratis or for a charge), and offer equivalent access to the
277
+ Corresponding Source in the same way through the same place at no
278
+ further charge. You need not require recipients to copy the
279
+ Corresponding Source along with the object code. If the place to
280
+ copy the object code is a network server, the Corresponding Source
281
+ may be on a different server (operated by you or a third party)
282
+ that supports equivalent copying facilities, provided you maintain
283
+ clear directions next to the object code saying where to find the
284
+ Corresponding Source. Regardless of what server hosts the
285
+ Corresponding Source, you remain obligated to ensure that it is
286
+ available for as long as needed to satisfy these requirements.
287
+
288
+ e) Convey the object code using peer-to-peer transmission, provided
289
+ you inform other peers where the object code and Corresponding
290
+ Source of the work are being offered to the general public at no
291
+ charge under subsection 6d.
292
+
293
+ A separable portion of the object code, whose source code is excluded
294
+ from the Corresponding Source as a System Library, need not be
295
+ included in conveying the object code work.
296
+
297
+ A "User Product" is either (1) a "consumer product", which means any
298
+ tangible personal property which is normally used for personal, family,
299
+ or household purposes, or (2) anything designed or sold for incorporation
300
+ into a dwelling. In determining whether a product is a consumer product,
301
+ doubtful cases shall be resolved in favor of coverage. For a particular
302
+ product received by a particular user, "normally used" refers to a
303
+ typical or common use of that class of product, regardless of the status
304
+ of the particular user or of the way in which the particular user
305
+ actually uses, or expects or is expected to use, the product. A product
306
+ is a consumer product regardless of whether the product has substantial
307
+ commercial, industrial or non-consumer uses, unless such uses represent
308
+ the only significant mode of use of the product.
309
+
310
+ "Installation Information" for a User Product means any methods,
311
+ procedures, authorization keys, or other information required to install
312
+ and execute modified versions of a covered work in that User Product from
313
+ a modified version of its Corresponding Source. The information must
314
+ suffice to ensure that the continued functioning of the modified object
315
+ code is in no case prevented or interfered with solely because
316
+ modification has been made.
317
+
318
+ If you convey an object code work under this section in, or with, or
319
+ specifically for use in, a User Product, and the conveying occurs as
320
+ part of a transaction in which the right of possession and use of the
321
+ User Product is transferred to the recipient in perpetuity or for a
322
+ fixed term (regardless of how the transaction is characterized), the
323
+ Corresponding Source conveyed under this section must be accompanied
324
+ by the Installation Information. But this requirement does not apply
325
+ if neither you nor any third party retains the ability to install
326
+ modified object code on the User Product (for example, the work has
327
+ been installed in ROM).
328
+
329
+ The requirement to provide Installation Information does not include a
330
+ requirement to continue to provide support service, warranty, or updates
331
+ for a work that has been modified or installed by the recipient, or for
332
+ the User Product in which it has been modified or installed. Access to a
333
+ network may be denied when the modification itself materially and
334
+ adversely affects the operation of the network or violates the rules and
335
+ protocols for communication across the network.
336
+
337
+ Corresponding Source conveyed, and Installation Information provided,
338
+ in accord with this section must be in a format that is publicly
339
+ documented (and with an implementation available to the public in
340
+ source code form), and must require no special password or key for
341
+ unpacking, reading or copying.
342
+
343
+ 7. Additional Terms.
344
+
345
+ "Additional permissions" are terms that supplement the terms of this
346
+ License by making exceptions from one or more of its conditions.
347
+ Additional permissions that are applicable to the entire Program shall
348
+ be treated as though they were included in this License, to the extent
349
+ that they are valid under applicable law. If additional permissions
350
+ apply only to part of the Program, that part may be used separately
351
+ under those permissions, but the entire Program remains governed by
352
+ this License without regard to the additional permissions.
353
+
354
+ When you convey a copy of a covered work, you may at your option
355
+ remove any additional permissions from that copy, or from any part of
356
+ it. (Additional permissions may be written to require their own
357
+ removal in certain cases when you modify the work.) You may place
358
+ additional permissions on material, added by you to a covered work,
359
+ for which you have or can give appropriate copyright permission.
360
+
361
+ Notwithstanding any other provision of this License, for material you
362
+ add to a covered work, you may (if authorized by the copyright holders of
363
+ that material) supplement the terms of this License with terms:
364
+
365
+ a) Disclaiming warranty or limiting liability differently from the
366
+ terms of sections 15 and 16 of this License; or
367
+
368
+ b) Requiring preservation of specified reasonable legal notices or
369
+ author attributions in that material or in the Appropriate Legal
370
+ Notices displayed by works containing it; or
371
+
372
+ c) Prohibiting misrepresentation of the origin of that material, or
373
+ requiring that modified versions of such material be marked in
374
+ reasonable ways as different from the original version; or
375
+
376
+ d) Limiting the use for publicity purposes of names of licensors or
377
+ authors of the material; or
378
+
379
+ e) Declining to grant rights under trademark law for use of some
380
+ trade names, trademarks, or service marks; or
381
+
382
+ f) Requiring indemnification of licensors and authors of that
383
+ material by anyone who conveys the material (or modified versions of
384
+ it) with contractual assumptions of liability to the recipient, for
385
+ any liability that these contractual assumptions directly impose on
386
+ those licensors and authors.
387
+
388
+ All other non-permissive additional terms are considered "further
389
+ restrictions" within the meaning of section 10. If the Program as you
390
+ received it, or any part of it, contains a notice stating that it is
391
+ governed by this License along with a term that is a further
392
+ restriction, you may remove that term. If a license document contains
393
+ a further restriction but permits relicensing or conveying under this
394
+ License, you may add to a covered work material governed by the terms
395
+ of that license document, provided that the further restriction does
396
+ not survive such relicensing or conveying.
397
+
398
+ If you add terms to a covered work in accord with this section, you
399
+ must place, in the relevant source files, a statement of the
400
+ additional terms that apply to those files, or a notice indicating
401
+ where to find the applicable terms.
402
+
403
+ Additional terms, permissive or non-permissive, may be stated in the
404
+ form of a separately written license, or stated as exceptions;
405
+ the above requirements apply either way.
406
+
407
+ 8. Termination.
408
+
409
+ You may not propagate or modify a covered work except as expressly
410
+ provided under this License. Any attempt otherwise to propagate or
411
+ modify it is void, and will automatically terminate your rights under
412
+ this License (including any patent licenses granted under the third
413
+ paragraph of section 11).
414
+
415
+ However, if you cease all violation of this License, then your
416
+ license from a particular copyright holder is reinstated (a)
417
+ provisionally, unless and until the copyright holder explicitly and
418
+ finally terminates your license, and (b) permanently, if the copyright
419
+ holder fails to notify you of the violation by some reasonable means
420
+ prior to 60 days after the cessation.
421
+
422
+ Moreover, your license from a particular copyright holder is
423
+ reinstated permanently if the copyright holder notifies you of the
424
+ violation by some reasonable means, this is the first time you have
425
+ received notice of violation of this License (for any work) from that
426
+ copyright holder, and you cure the violation prior to 30 days after
427
+ your receipt of the notice.
428
+
429
+ Termination of your rights under this section does not terminate the
430
+ licenses of parties who have received copies or rights from you under
431
+ this License. If your rights have been terminated and not permanently
432
+ reinstated, you do not qualify to receive new licenses for the same
433
+ material under section 10.
434
+
435
+ 9. Acceptance Not Required for Having Copies.
436
+
437
+ You are not required to accept this License in order to receive or
438
+ run a copy of the Program. Ancillary propagation of a covered work
439
+ occurring solely as a consequence of using peer-to-peer transmission
440
+ to receive a copy likewise does not require acceptance. However,
441
+ nothing other than this License grants you permission to propagate or
442
+ modify any covered work. These actions infringe copyright if you do
443
+ not accept this License. Therefore, by modifying or propagating a
444
+ covered work, you indicate your acceptance of this License to do so.
445
+
446
+ 10. Automatic Licensing of Downstream Recipients.
447
+
448
+ Each time you convey a covered work, the recipient automatically
449
+ receives a license from the original licensors, to run, modify and
450
+ propagate that work, subject to this License. You are not responsible
451
+ for enforcing compliance by third parties with this License.
452
+
453
+ An "entity transaction" is a transaction transferring control of an
454
+ organization, or substantially all assets of one, or subdividing an
455
+ organization, or merging organizations. If propagation of a covered
456
+ work results from an entity transaction, each party to that
457
+ transaction who receives a copy of the work also receives whatever
458
+ licenses to the work the party's predecessor in interest had or could
459
+ give under the previous paragraph, plus a right to possession of the
460
+ Corresponding Source of the work from the predecessor in interest, if
461
+ the predecessor has it or can get it with reasonable efforts.
462
+
463
+ You may not impose any further restrictions on the exercise of the
464
+ rights granted or affirmed under this License. For example, you may
465
+ not impose a license fee, royalty, or other charge for exercise of
466
+ rights granted under this License, and you may not initiate litigation
467
+ (including a cross-claim or counterclaim in a lawsuit) alleging that
468
+ any patent claim is infringed by making, using, selling, offering for
469
+ sale, or importing the Program or any portion of it.
470
+
471
+ 11. Patents.
472
+
473
+ A "contributor" is a copyright holder who authorizes use under this
474
+ License of the Program or a work on which the Program is based. The
475
+ work thus licensed is called the contributor's "contributor version".
476
+
477
+ A contributor's "essential patent claims" are all patent claims
478
+ owned or controlled by the contributor, whether already acquired or
479
+ hereafter acquired, that would be infringed by some manner, permitted
480
+ by this License, of making, using, or selling its contributor version,
481
+ but do not include claims that would be infringed only as a
482
+ consequence of further modification of the contributor version. For
483
+ purposes of this definition, "control" includes the right to grant
484
+ patent sublicenses in a manner consistent with the requirements of
485
+ this License.
486
+
487
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
488
+ patent license under the contributor's essential patent claims, to
489
+ make, use, sell, offer for sale, import and otherwise run, modify and
490
+ propagate the contents of its contributor version.
491
+
492
+ In the following three paragraphs, a "patent license" is any express
493
+ agreement or commitment, however denominated, not to enforce a patent
494
+ (such as an express permission to practice a patent or covenant not to
495
+ sue for patent infringement). To "grant" such a patent license to a
496
+ party means to make such an agreement or commitment not to enforce a
497
+ patent against the party.
498
+
499
+ If you convey a covered work, knowingly relying on a patent license,
500
+ and the Corresponding Source of the work is not available for anyone
501
+ to copy, free of charge and under the terms of this License, through a
502
+ publicly available network server or other readily accessible means,
503
+ then you must either (1) cause the Corresponding Source to be so
504
+ available, or (2) arrange to deprive yourself of the benefit of the
505
+ patent license for this particular work, or (3) arrange, in a manner
506
+ consistent with the requirements of this License, to extend the patent
507
+ license to downstream recipients. "Knowingly relying" means you have
508
+ actual knowledge that, but for the patent license, your conveying the
509
+ covered work in a country, or your recipient's use of the covered work
510
+ in a country, would infringe one or more identifiable patents in that
511
+ country that you have reason to believe are valid.
512
+
513
+ If, pursuant to or in connection with a single transaction or
514
+ arrangement, you convey, or propagate by procuring conveyance of, a
515
+ covered work, and grant a patent license to some of the parties
516
+ receiving the covered work authorizing them to use, propagate, modify
517
+ or convey a specific copy of the covered work, then the patent license
518
+ you grant is automatically extended to all recipients of the covered
519
+ work and works based on it.
520
+
521
+ A patent license is "discriminatory" if it does not include within
522
+ the scope of its coverage, prohibits the exercise of, or is
523
+ conditioned on the non-exercise of one or more of the rights that are
524
+ specifically granted under this License. You may not convey a covered
525
+ work if you are a party to an arrangement with a third party that is
526
+ in the business of distributing software, under which you make payment
527
+ to the third party based on the extent of your activity of conveying
528
+ the work, and under which the third party grants, to any of the
529
+ parties who would receive the covered work from you, a discriminatory
530
+ patent license (a) in connection with copies of the covered work
531
+ conveyed by you (or copies made from those copies), or (b) primarily
532
+ for and in connection with specific products or compilations that
533
+ contain the covered work, unless you entered into that arrangement,
534
+ or that patent license was granted, prior to 28 March 2007.
535
+
536
+ Nothing in this License shall be construed as excluding or limiting
537
+ any implied license or other defenses to infringement that may
538
+ otherwise be available to you under applicable patent law.
539
+
540
+ 12. No Surrender of Others' Freedom.
541
+
542
+ If conditions are imposed on you (whether by court order, agreement or
543
+ otherwise) that contradict the conditions of this License, they do not
544
+ excuse you from the conditions of this License. If you cannot convey a
545
+ covered work so as to satisfy simultaneously your obligations under this
546
+ License and any other pertinent obligations, then as a consequence you may
547
+ not convey it at all. For example, if you agree to terms that obligate you
548
+ to collect a royalty for further conveying from those to whom you convey
549
+ the Program, the only way you could satisfy both those terms and this
550
+ License would be to refrain entirely from conveying the Program.
551
+
552
+ 13. Use with the GNU Affero General Public License.
553
+
554
+ Notwithstanding any other provision of this License, you have
555
+ permission to link or combine any covered work with a work licensed
556
+ under version 3 of the GNU Affero General Public License into a single
557
+ combined work, and to convey the resulting work. The terms of this
558
+ License will continue to apply to the part which is the covered work,
559
+ but the special requirements of the GNU Affero General Public License,
560
+ section 13, concerning interaction through a network will apply to the
561
+ combination as such.
562
+
563
+ 14. Revised Versions of this License.
564
+
565
+ The Free Software Foundation may publish revised and/or new versions of
566
+ the GNU General Public License from time to time. Such new versions will
567
+ be similar in spirit to the present version, but may differ in detail to
568
+ address new problems or concerns.
569
+
570
+ Each version is given a distinguishing version number. If the
571
+ Program specifies that a certain numbered version of the GNU General
572
+ Public License "or any later version" applies to it, you have the
573
+ option of following the terms and conditions either of that numbered
574
+ version or of any later version published by the Free Software
575
+ Foundation. If the Program does not specify a version number of the
576
+ GNU General Public License, you may choose any version ever published
577
+ by the Free Software Foundation.
578
+
579
+ If the Program specifies that a proxy can decide which future
580
+ versions of the GNU General Public License can be used, that proxy's
581
+ public statement of acceptance of a version permanently authorizes you
582
+ to choose that version for the Program.
583
+
584
+ Later license versions may give you additional or different
585
+ permissions. However, no additional obligations are imposed on any
586
+ author or copyright holder as a result of your choosing to follow a
587
+ later version.
588
+
589
+ 15. Disclaimer of Warranty.
590
+
591
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592
+ APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593
+ HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594
+ OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596
+ PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597
+ IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598
+ ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599
+
600
+ 16. Limitation of Liability.
601
+
602
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603
+ WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604
+ THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605
+ GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606
+ USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607
+ DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608
+ PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609
+ EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610
+ SUCH DAMAGES.
611
+
612
+ 17. Interpretation of Sections 15 and 16.
613
+
614
+ If the disclaimer of warranty and limitation of liability provided
615
+ above cannot be given local legal effect according to their terms,
616
+ reviewing courts shall apply local law that most closely approximates
617
+ an absolute waiver of all civil liability in connection with the
618
+ Program, unless a warranty or assumption of liability accompanies a
619
+ copy of the Program in return for a fee.
620
+
621
+ END OF TERMS AND CONDITIONS
622
+
623
+ How to Apply These Terms to Your New Programs
624
+
625
+ If you develop a new program, and you want it to be of the greatest
626
+ possible use to the public, the best way to achieve this is to make it
627
+ free software which everyone can redistribute and change under these terms.
628
+
629
+ To do so, attach the following notices to the program. It is safest
630
+ to attach them to the start of each source file to most effectively
631
+ state the exclusion of warranty; and each file should have at least
632
+ the "copyright" line and a pointer to where the full notice is found.
633
+
634
+ <one line to give the program's name and a brief idea of what it does.>
635
+ Copyright (C) <year> <name of author>
636
+
637
+ This program is free software: you can redistribute it and/or modify
638
+ it under the terms of the GNU General Public License as published by
639
+ the Free Software Foundation, either version 3 of the License, or
640
+ (at your option) any later version.
641
+
642
+ This program is distributed in the hope that it will be useful,
643
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
644
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
645
+ GNU General Public License for more details.
646
+
647
+ You should have received a copy of the GNU General Public License
648
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
649
+
650
+ Also add information on how to contact you by electronic and paper mail.
651
+
652
+ If the program does terminal interaction, make it output a short
653
+ notice like this when it starts in an interactive mode:
654
+
655
+ <program> Copyright (C) <year> <name of author>
656
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657
+ This is free software, and you are welcome to redistribute it
658
+ under certain conditions; type `show c' for details.
659
+
660
+ The hypothetical commands `show w' and `show c' should show the appropriate
661
+ parts of the General Public License. Of course, your program's commands
662
+ might be different; for a GUI interface, you would use an "about box".
663
+
664
+ You should also get your employer (if you work as a programmer) or school,
665
+ if any, to sign a "copyright disclaimer" for the program, if necessary.
666
+ For more information on this, and how to apply and follow the GNU GPL, see
667
+ <https://www.gnu.org/licenses/>.
668
+
669
+ The GNU General Public License does not permit incorporating your program
670
+ into proprietary programs. If your program is a subroutine library, you
671
+ may consider it more useful to permit linking proprietary applications with
672
+ the library. If this is what you want to do, use the GNU Lesser General
673
+ Public License instead of this License. But first, please read
674
+ <https://www.gnu.org/licenses/why-not-lgpl.html>.
README-JA.md ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Seed-VC
2
+ [![Hugging Face](https://img.shields.io/badge/🤗%20Hugging%20Face-Demo-blue)](https://huggingface.co/spaces/Plachta/Seed-VC) [![arXiv](https://img.shields.io/badge/arXiv-2411.09943-<COLOR>.svg)](https://arxiv.org/abs/2411.09943)
3
+
4
+ *[English](README.md) | [简体中文](README-ZH.md) | 日本語*
5
+
6
+ [real-time-demo.webm](https://github.com/user-attachments/assets/86325c5e-f7f6-4a04-8695-97275a5d046c)
7
+
8
+ *(注意:この文書は機械翻訳によって生成されたものです。正確性を確保するよう努めていますが、不明確な点がございましたら英語版をご参照ください。翻訳の改善案がございましたら、PRを歓迎いたします。)*
9
+
10
+ 現在リリースされているモデルは、*ゼロショット音声変換* 🔊、*ゼロショットリアルタイム音声変換* 🗣️、*ゼロショット歌声変換* 🎶 に対応しています。トレーニングなしで、1〜30秒の参照音声からボイスクローニングが可能です。
11
+
12
+ カスタムデータでの追加ファインチューニングをサポートしており、特定の話者/話者群に対するパフォーマンスを向上させることができます。データ要件は極めて少なく(**話者あたり最低1発話**)、トレーニング速度も非常に速い(**最低100ステップ、T4で2分**)です!
13
+
14
+ **リアルタイム音声変換**に対応しており、アルゴリズムの遅延は約300ms、デバイス側の遅延は約100msで、オンライン会議、ゲーム、ライブ配信に適しています。
15
+
16
+ デモや以前の音声変換モデルとの比較については、[デモページ](https://plachtaa.github.io/seed-vc/)🌐と[評価](EVAL.md)📊をご覧ください。
17
+
18
+ モデルの品質向上と機能追加を継続的に行っています。
19
+
20
+ ## 評価📊
21
+ 客観的評価結果と他のベースラインとの比較については[EVAL.md](EVAL.md)をご覧ください。
22
+
23
+ ## インストール📥
24
+ Windows または Linux で Python 3.10 を推奨します。
25
+ ```bash
26
+ pip install -r requirements.txt
27
+ ```
28
+
29
+ ## 使用方法🛠️
30
+ 目的に応じて3つのモデルをリリースしています:
31
+
32
+ | バージョン | 名称 | 目的 | サンプリングレート | コンテンツエンコーダ | ボコーダ | 隠れ次元 | レイヤー数 | パラメータ数 | 備考 |
33
+ |---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------|---------------|-----------------|---------|------------|----------|--------|--------------------------------------------------------|
34
+ | v1.0 | seed-uvit-tat-xlsr-tiny ([🤗](https://huggingface.co/Plachta/Seed-VC/blob/main/DiT_uvit_tat_xlsr_ema.pth)[📄](configs/presets/config_dit_mel_seed_uvit_xlsr_tiny.yml)) | 音声変換 (VC) | 22050 | XLSR-large | HIFT | 384 | 9 | 25M | リアルタイム音声変換に適しています |
35
+ | v1.0 | seed-uvit-whisper-small-wavenet ([🤗](https://huggingface.co/Plachta/Seed-VC/blob/main/DiT_seed_v2_uvit_whisper_small_wavenet_bigvgan_pruned.pth)[📄](configs/presets/config_dit_mel_seed_uvit_whisper_small_wavenet.yml)) | 音声変換 (VC) | 22050 | Whisper-small | BigVGAN | 512 | 13 | 98M | オフライン音声変換に適しています |
36
+ | v1.0 | seed-uvit-whisper-base ([🤗](https://huggingface.co/Plachta/Seed-VC/blob/main/DiT_seed_v2_uvit_whisper_base_f0_44k_bigvgan_pruned_ft_ema.pth)[📄](configs/presets/config_dit_mel_seed_uvit_whisper_base_f0_44k.yml)) | 歌声変換 (SVC) | 44100 | Whisper-small | BigVGAN | 768 | 17 | 200M | 強力なゼロショットパフォーマンス、歌声変換 |
37
+
38
+ 最新のモデルリリースのチェックポイントは、最初の推論実行時に自動的にダウンロードされます。
39
+ ネットワークの理由でhuggingfaceにアクセスできない場合は、すべてのコマンドの前に `HF_ENDPOINT=https://hf-mirror.com` を追加してミラーを使用してください。
40
+
41
+ コマンドライン推論:
42
+ ```bash
43
+ python inference.py --source <source-wav>
44
+ --target <referene-wav>
45
+ --output <output-dir>
46
+ --diffusion-steps 25 # 歌声変換には30〜50を推奨
47
+ --length-adjust 1.0
48
+ --inference-cfg-rate 0.7
49
+ --f0-condition False # 歌声変換の場合はTrueに設定
50
+ --auto-f0-adjust False # ソースピッチをターゲットピ���チレベルに自動調整する場合はTrue、通常は歌声変換では使用しない
51
+ --semi-tone-shift 0 # 歌声変換のピッチシフト(半音単位)
52
+ --checkpoint <path-to-checkpoint>
53
+ --config <path-to-config>
54
+ --fp16 True
55
+ ```
56
+ 各パラメータの説明:
57
+ - `source` は変換したい音声ファイルのパス
58
+ - `target` は参照音声ファイルのパス
59
+ - `output` は出力ディレクトリのパス
60
+ - `diffusion-steps` は拡散ステップ数、デフォルトは25、最高品質には30-50、最速推論には4-10を使用
61
+ - `length-adjust` は長さ調整係数、デフォルトは1.0、<1.0で音声短縮、>1.0で音声伸長
62
+ - `inference-cfg-rate` は出力に微妙な違いをもたらす、デフォルトは0.7
63
+ - `f0-condition` はソース音声のピッチを出力に条件付けするフラグ、デフォルトはFalse、歌声変換の場合はTrue
64
+ - `auto-f0-adjust` はソースピッチをターゲットピッチレベルに自動調整するフラグ、デフォルトはFalse、通常は歌声変換では使用しない
65
+ - `semi-tone-shift` は歌声変換のピッチシフト(半音単位)、デフォルトは0
66
+ - `checkpoint` は独自のモデルをトレーニングまたはファインチューニングした場合のモデルチェックポイントへのパス、空白の場合はhuggingfaceからデフォルトモデルを自動ダウンロード(`f0-condition`が`False`の場合は`seed-uvit-whisper-small-wavenet`、それ以外は`seed-uvit-whisper-base`)
67
+ - `config` は独自のモデルをトレーニングまたはファインチューニングした場合のモデル設定へのパス、空白の場合はhuggingfaceからデフォルト設定を自動ダウンロード
68
+ - `fp16` はfloat16推論を使用するフラグ、デフォルトはTrue
69
+
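+ 以下は歌声変換の実行例です(リポジトリ付属のサンプル音声を使用し、出力先 `./output` は説明用の仮のパスです):
+ ```bash
+ python inference.py \
+     --source "examples/source/TECHNOPOLIS - 2085 [vocals]_[cut_14sec].wav" \
+     --target "examples/reference/trump_0.wav" \
+     --output ./output \
+     --diffusion-steps 50 \
+     --f0-condition True \
+     --auto-f0-adjust False \
+     --semi-tone-shift -12
+ ```
+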
70
+ 音声変換Web UI:
71
+ ```bash
72
+ python app_vc.py --checkpoint <path-to-checkpoint> --config <path-to-config> --fp16 True
73
+ ```
74
+ - `checkpoint` は独自のモデルをトレーニングまたはファインチューニングした場合のモデルチェックポイントへのパス、空白の場合はhuggingfaceからデフォルトモデルを自動ダウンロード(`seed-uvit-whisper-small-wavenet`)
75
+ - `config` は独自のモデルをトレーニングまたはファインチューニングした場合のモデル設定へのパス、空白の場合はhuggingfaceからデフォルト設定を自動ダウンロード
76
+
77
+ ブラウザで`http://localhost:7860/`にアクセスしてWebインターフェースを使用できます。
78
+
79
+ 歌声変換Web UI:
80
+ ```bash
81
+ python app_svc.py --checkpoint <path-to-checkpoint> --config <path-to-config> --fp16 True
82
+ ```
83
+ - `checkpoint` は独自のモデルをトレーニングまたはファインチューニングした場合のモデルチェックポイントへのパス、空白の場合はhuggingfaceからデフォルトモデルを自動ダウンロード(`seed-uvit-whisper-base`)
84
+ - `config` は独自のモデルをトレーニングまたはファインチューニングした場合のモデル設定へのパス、空白の場合はhuggingfaceからデフォルト設定を自動ダウンロード
85
+
86
+ 統合Web UI:
87
+ ```bash
88
+ python app.py
89
+ ```
90
+ これはゼロショット推論用の事前学習済みモデルのみを読み込みます。カスタムチェックポイントを使用する場合は、上記の`app_vc.py`または`app_svc.py`を実行してください。
91
+
92
+ リアルタイム音声変換GUI:
93
+ ```bash
94
+ python real-time-gui.py --checkpoint <path-to-checkpoint> --config <path-to-config>
95
+ ```
96
+ - `checkpoint` は独自のモデルをトレーニングまたはファインチューニングした場合のモデルチェックポイントへのパス、空白の場合はhuggingfaceからデフォルトモデルを自動ダウンロード(`seed-uvit-tat-xlsr-tiny`)
97
+ - `config` は独自のモデルをトレーニングまたはファインチューニングした場合のモデル設定へのパス、空白の場合はhuggingfaceからデフォルト設定を自動ダウンロード
98
+
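+ 引数を省略した最小の起動例です(デフォルトの `seed-uvit-tat-xlsr-tiny` が自動ダウンロードされます):
+ ```bash
+ python real-time-gui.py
+ ```
+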
99
+ 重要:リアルタイム音声変換にはGPUの使用を強く推奨します。
100
+ NVIDIA RTX 3060ノートパソコンGPUでいくつかのパフォーマンステストを行い、結果と推奨パラメータ設定を以下に示します:
101
+
102
+ | モデル構成 | 拡散ステップ | 推論CFGレート | 最大プロンプト長 | ブロック時間 (秒) | クロスフェード長 (秒) | 追加コンテキスト (左) (秒) | 追加コンテキスト (右) (秒) | レイテンシ (ミリ秒) | チャンクあたりの推論時間 (ミリ秒) |
103
+ |---------------------------------|-----------------|--------------------|-------------------|----------------|----------------------|--------------------------|---------------------------|--------------|-------------------------------|
104
+ | seed-uvit-xlsr-tiny | 10 | 0.7 | 3.0 | 0.18 | 0.04 | 2.5 | 0.02 | 430 | 150 |
105
+
106
+ GUIでパラメータを自身のデバイスのパフォーマンスに合わせて調整できます。推論時間がブロック時間より短ければ、音声変換ストリームは正常に動作するはずです。
107
+ 他のGPU集約型タスク(ゲーム、動画視聴など)を実行している場合、推論速度が低下する可能性があることに注意してください。
108
+
109
+ [VB-CABLE](https://vb-audio.com/Cable/)を使用して、GUI出力ストリームを仮想マイクにルーティングすることができます。
110
+
111
+ *(GUIとオーディオチャンキングのロジックは[RVC](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)から修正されています。素晴らしい実装に感謝します!)*
112
+
113
+ ## トレーニング🏋️
114
+ カスタムデータでのファインチューニングにより、より正確に声をクローニングすることができます。特定の話者に対する話者類似性が大幅に向上しますが、WERが若干上昇する可能性があります。
115
+ 以下のColabチュートリアルで手順を確認できます:[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1R1BJTqMsTXZzYAVx3j1BiemFXog9pbQG?usp=sharing)
116
+
117
+ 1. 独自のデータセットを準備します。以下の条件を満たす必要があります:
118
+ - ファイル構造は問いません
119
+ - 各音声ファイルは1〜30秒の範囲である必要があり、それ以外は無視されます
120
+ - すべての音声ファイルは以下のいずれかの形式である必要があります:`.wav` `.flac` `.mp3` `.m4a` `.opus` `.ogg`
121
+ - 話者ラベルは必須ではありませんが、各話者に少なくとも1つの発話があることを確認してください
122
+ - もちろん、データが多いほどモデルのパフォーマンスは向上します
123
+ - トレーニングデータはできるだけクリーンである必要があり、BGMやノイズは望ましくありません
124
+
125
+ 2. ファインチューニング用に`configs/presets/`からモデル設定ファイルを選択するか、ゼロからトレーニングするための独自の設定を作成します。
126
+ - ファインチューニングの場合は、以下のいずれかを選択します:
127
+ - `./configs/presets/config_dit_mel_seed_uvit_xlsr_tiny.yml` リアルタイム音声変換用
128
+ - `./configs/presets/config_dit_mel_seed_uvit_whisper_small_wavenet.yml` オフライン音声変換用
129
+ - `./configs/presets/config_dit_mel_seed_uvit_whisper_base_f0_44k.yml` 歌声変換用
130
+
131
+ 3. 以下のコマンドでトレーニングを開始します:
132
+ ```bash
133
+ python train.py
134
+ --config <path-to-config>
135
+ --dataset-dir <path-to-data>
136
+ --run-name <run-name>
137
+ --batch-size 2
138
+ --max-steps 1000
139
+ --max-epochs 1000
140
+ --save-every 500
141
+ --num-workers 0
142
+ ```
143
+ 各パラメータの説明:
144
+ - `config` はモデル設定へのパス、ファインチューニング用に上記のいずれかを選択するか、ゼロからトレーニングする場合は独自の設定を作成
145
+ - `dataset-dir` はデータセットディレクトリへのパス、すべての音声ファイルを含むフォルダである必要があります
146
+ - `run-name` は実行名で、モデルチェックポイントとログの保存に使用されます
147
+ - `batch-size` はトレーニング用のバッチサイズで、GPUメモリに応じて選択します
148
+ - `max-steps` は最大トレーニングステップ数で、データセットサイズとトレーニング時間に応じて選択します
149
+ - `max-epochs` は最大エポック数で、データセットサイズとトレーニング時間に応じて選択します
150
+ - `save-every` はモデルチェックポイントを保存するステップ間隔
151
+ - `num-workers` はデータ読み込みのワーカー数、Windowsの場合は0に設定
152
+
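+ 例えば、リアルタイム用の tiny モデルをファインチューニングする場合は次のようになります(`./my_dataset` と `my-speaker` は説明用の仮の名前です):
+ ```bash
+ python train.py \
+     --config ./configs/presets/config_dit_mel_seed_uvit_xlsr_tiny.yml \
+     --dataset-dir ./my_dataset \
+     --run-name my-speaker \
+     --batch-size 2 \
+     --max-steps 1000 \
+     --max-epochs 1000 \
+     --save-every 500 \
+     --num-workers 0
+ ```
+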
153
+ 4. トレーニングが予期せず停止した場合、同じコマンドを再度実行することで、最後のチェックポイントから再開できます(最新のチェックポイントを見つけられるように、`run-name`と`config`引数が同じであることを確認してください)。
154
+
155
+ 5. トレーニング後、チェックポイントと設定ファイルのパスを指定することで、トレーニングしたモデルを推論に使用できます。
156
+ - これらは`./runs/<run-name>/`の下にあり、チェックポイントは`ft_model.pth`という名前で、設定ファイルはトレーニング設定ファイルと同じ名前です。
157
+ - 推論時には、ゼロショット使用時と同様に、使用したい話者の参照音声ファイルを指定する必要があります。
158
+
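+ 例えば、上記のトレーニング例(実行名 `my-speaker` は仮の名前)で得たチェックポイントを音声変換Web UIで使う場合は次のようになります:
+ ```bash
+ python app_vc.py \
+     --checkpoint ./runs/my-speaker/ft_model.pth \
+     --config ./runs/my-speaker/config_dit_mel_seed_uvit_xlsr_tiny.yml \
+     --fp16 True
+ ```
+ リアルタイム変換で使う場合は、同じ `--checkpoint` と `--config` を `real-time-gui.py` に渡します。
+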
159
+ ## TODO📝
160
+ - [x] コードのリリース
161
+ - [x] 事前学習済みモデルのリリース:[![Hugging Face](https://img.shields.io/badge/🤗%20Hugging%20Face-SeedVC-blue)](https://huggingface.co/Plachta/Seed-VC)
162
+ - [x] Huggingfaceスペースデモ:[![Hugging Face](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-blue)](https://huggingface.co/spaces/Plachta/Seed-VC)
163
+ - [x] HTMLデモページ:[Demo](https://plachtaa.github.io/seed-vc/)
164
+ - [x] ストリーミング推論
165
+ - [x] ストリーミング推論のレイテンシー削減
166
+ - [x] リアルタイム音声変換のデモ動画
167
+ - [x] 歌声変換
168
+ - [x] ソース音声のノイズ耐性
169
+ - [ ] アーキテクチャの潜在的な改善
170
+ - [x] U-ViTスタイルのスキップ接続
171
+ - [x] OpenAI Whisperへの入力変更
172
+ - [x] Time as Token
173
+ - [x] カスタムデータでのトレーニングコード
174
+ - [x] フューショット/ワンショット話者ファインチューニング
175
+ - [x] 歌声デコーディング用にNVIDIAのBigVGANに変更
176
+ - [x] 歌声変換用のWhisperバージョンモデル
177
+ - [x] 歌声変換のRVC/SoVITSとの客観的評価と比較
178
+ - [x] 音声品質の向上
179
+ - [ ] より良い歌声変換のためのNSFボコーダ
180
+ - [x] 非発話時のリアルタイム音声変換アーティファクトの修正(VADモデルの追加により対応)
181
+ - [x] ファインチューニング例のColabノートブック
182
+ - [ ] 今後追加予定
183
+
184
+ ## 更新履歴🗒️
185
+ - 2024-11-26:
186
+ - リアルタイム音声変換用に最適化されたv1.0 tinyバージョンの事前学習済みモデルを更新
187
+ - ワンショット/フューショットの単一/複数話者ファインチューニングをサポート
188
+ - webUIおよびリアルタイムGUIでカスタムチェックポイントの使用をサポート
189
+ - 2024-11-19:
190
+ - arXiv論文公開
191
+ - 2024-10-28:
192
+ - より良い音声品質のファインチューニングされた44k歌声変換モデルを更新
193
+ - 2024-10-27:
194
+ - リアルタイム音声変換GUIを追加
195
+ - 2024-10-25:
196
+ - 歌声変換のRVCv2との包括的な評価結果と比較を追加
197
+ - 2024-10-24:
198
+ - 音声コンテンツ入力としてOpenAI Whisperを使用した44kHz歌声変換モデルを更新
199
+ - 2024-10-07:
200
+ - 音声コンテンツエンコーダをOpenAI Whisperに変更したv0.3事前学習済みモデルを更新
201
+ - v0.3事前学習済みモデルの客観的評価結果を追加
202
+ - 2024-09-22:
203
+ - NVIDIAのBigVGANを使用する歌声変換モデルを更新し、高音域の歌声を大幅に改善
204
+ - Web UIで長い音声ファイルのチャンキングとストリーミング出力をサポート
205
+ - 2024-09-18:
206
+ - 歌声変換用のf0条件付きモデルを更新
207
+ - 2024-09-14:
208
+ - 同じ品質を達成するためのサイズ縮小と拡散ステップ数の削減、およびプロソディ保持の制御能力を追加したv0.2事前学習済みモデルを更新
209
+ - コマンドライン推論スクリプトを追加
210
+ - インストールと使用方法の説明を追加
app.py ADDED
@@ -0,0 +1,362 @@
1
+ import gradio as gr
2
+ import torch
3
+ import torchaudio
4
+ import librosa
5
+ from modules.commons import build_model, load_checkpoint, recursive_munch
6
+ import yaml
7
+ from hf_utils import load_custom_model_from_hf
8
+ import numpy as np
9
+ from pydub import AudioSegment
10
+
11
+ # Load model and configuration
12
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
13
+
14
+ dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
15
+ "DiT_seed_v2_uvit_whisper_small_wavenet_bigvgan_pruned.pth",
16
+ "config_dit_mel_seed_uvit_whisper_small_wavenet.yml")
17
+ config = yaml.safe_load(open(dit_config_path, 'r'))
18
+ model_params = recursive_munch(config['model_params'])
19
+ model = build_model(model_params, stage='DiT')
20
+ hop_length = config['preprocess_params']['spect_params']['hop_length']
21
+ sr = config['preprocess_params']['sr']
22
+
23
+ # Load checkpoints
24
+ model, _, _, _ = load_checkpoint(model, None, dit_checkpoint_path,
25
+ load_only_params=True, ignore_modules=[], is_distributed=False)
26
+ for key in model:
27
+ model[key].eval()
28
+ model[key].to(device)
29
+ model.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)
30
+
31
+ # Load additional modules
32
+ from modules.campplus.DTDNN import CAMPPlus
33
+
34
+ campplus_ckpt_path = load_custom_model_from_hf("funasr/campplus", "campplus_cn_common.bin", config_filename=None)
35
+ campplus_model = CAMPPlus(feat_dim=80, embedding_size=192)
36
+ campplus_model.load_state_dict(torch.load(campplus_ckpt_path, map_location="cpu"))
37
+ campplus_model.eval()
38
+ campplus_model.to(device)
39
+
40
+ from modules.bigvgan import bigvgan
41
+
42
+ bigvgan_model = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_v2_22khz_80band_256x', use_cuda_kernel=False)
43
+
44
+ # remove weight norm in the model and set to eval mode
45
+ bigvgan_model.remove_weight_norm()
46
+ bigvgan_model = bigvgan_model.eval().to(device)
47
+
48
+ # whisper
49
+ from transformers import AutoFeatureExtractor, WhisperModel
50
+
51
+ whisper_name = model_params.speech_tokenizer.whisper_name if hasattr(model_params.speech_tokenizer,
52
+ 'whisper_name') else "openai/whisper-small"
53
+ whisper_model = WhisperModel.from_pretrained(whisper_name, torch_dtype=torch.float16).to(device)
54
+ del whisper_model.decoder
55
+ whisper_feature_extractor = AutoFeatureExtractor.from_pretrained(whisper_name)
56
+
57
+ # Generate mel spectrograms
58
+ mel_fn_args = {
59
+ "n_fft": config['preprocess_params']['spect_params']['n_fft'],
60
+ "win_size": config['preprocess_params']['spect_params']['win_length'],
61
+ "hop_size": config['preprocess_params']['spect_params']['hop_length'],
62
+ "num_mels": config['preprocess_params']['spect_params']['n_mels'],
63
+ "sampling_rate": sr,
64
+ "fmin": 0,
65
+ "fmax": None,
66
+ "center": False
67
+ }
68
+ from modules.audio import mel_spectrogram
69
+
70
+ to_mel = lambda x: mel_spectrogram(x, **mel_fn_args)
71
+
72
+ # f0 conditioned model
73
+ dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
74
+ "DiT_seed_v2_uvit_whisper_base_f0_44k_bigvgan_pruned_ft_ema.pth",
75
+ "config_dit_mel_seed_uvit_whisper_base_f0_44k.yml")
76
+
77
+ config = yaml.safe_load(open(dit_config_path, 'r'))
78
+ model_params = recursive_munch(config['model_params'])
79
+ model_f0 = build_model(model_params, stage='DiT')
80
+ hop_length = config['preprocess_params']['spect_params']['hop_length']
81
+ sr = config['preprocess_params']['sr']
82
+
83
+ # Load checkpoints
84
+ model_f0, _, _, _ = load_checkpoint(model_f0, None, dit_checkpoint_path,
85
+ load_only_params=True, ignore_modules=[], is_distributed=False)
86
+ for key in model_f0:
87
+ model_f0[key].eval()
88
+ model_f0[key].to(device)
89
+ model_f0.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)
90
+
91
+ # f0 extractor
92
+ from modules.rmvpe import RMVPE
93
+
94
+ model_path = load_custom_model_from_hf("lj1995/VoiceConversionWebUI", "rmvpe.pt", None)
95
+ rmvpe = RMVPE(model_path, is_half=False, device=device)
96
+
97
+ mel_fn_args_f0 = {
98
+ "n_fft": config['preprocess_params']['spect_params']['n_fft'],
99
+ "win_size": config['preprocess_params']['spect_params']['win_length'],
100
+ "hop_size": config['preprocess_params']['spect_params']['hop_length'],
101
+ "num_mels": config['preprocess_params']['spect_params']['n_mels'],
102
+ "sampling_rate": sr,
103
+ "fmin": 0,
104
+ "fmax": None,
105
+ "center": False
106
+ }
107
+ to_mel_f0 = lambda x: mel_spectrogram(x, **mel_fn_args_f0)
108
+ bigvgan_44k_model = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_v2_44khz_128band_512x', use_cuda_kernel=False)
109
+
110
+ # remove weight norm in the model and set to eval mode
111
+ bigvgan_44k_model.remove_weight_norm()
112
+ bigvgan_44k_model = bigvgan_44k_model.eval().to(device)
113
+
114
+ def adjust_f0_semitones(f0_sequence, n_semitones):
115
+ factor = 2 ** (n_semitones / 12)
116
+ return f0_sequence * factor
117
+
118
+ def crossfade(chunk1, chunk2, overlap):
119
+ fade_out = np.cos(np.linspace(0, np.pi / 2, overlap)) ** 2
120
+ fade_in = np.cos(np.linspace(np.pi / 2, 0, overlap)) ** 2
121
+ if len(chunk2) < overlap:
122
+ chunk2[:overlap] = chunk2[:overlap] * fade_in[:len(chunk2)] + (chunk1[-overlap:] * fade_out)[:len(chunk2)]
123
+ else:
124
+ chunk2[:overlap] = chunk2[:overlap] * fade_in + chunk1[-overlap:] * fade_out
125
+ return chunk2
126
+
127
+ # streaming and chunk processing related params
128
+ overlap_frame_len = 16
129
+ bitrate = "320k"
130
+
131
+ @torch.no_grad()
132
+ @torch.inference_mode()
133
+ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_cfg_rate, f0_condition, auto_f0_adjust, pitch_shift):
134
+ inference_module = model if not f0_condition else model_f0
135
+ mel_fn = to_mel if not f0_condition else to_mel_f0
136
+ bigvgan_fn = bigvgan_model if not f0_condition else bigvgan_44k_model
137
+ sr = 22050 if not f0_condition else 44100
138
+ hop_length = 256 if not f0_condition else 512
139
+ max_context_window = sr // hop_length * 30
140
+ overlap_wave_len = overlap_frame_len * hop_length
141
+ # Load audio
142
+ source_audio = librosa.load(source, sr=sr)[0]
143
+ ref_audio = librosa.load(target, sr=sr)[0]
144
+
145
+ # Process audio
146
+ source_audio = torch.tensor(source_audio).unsqueeze(0).float().to(device)
147
+ ref_audio = torch.tensor(ref_audio[:sr * 25]).unsqueeze(0).float().to(device)
148
+
149
+ # Resample
150
+ ref_waves_16k = torchaudio.functional.resample(ref_audio, sr, 16000)
151
+ converted_waves_16k = torchaudio.functional.resample(source_audio, sr, 16000)
152
+ # if source audio less than 30 seconds, whisper can handle in one forward
153
+ if converted_waves_16k.size(-1) <= 16000 * 30:
154
+ alt_inputs = whisper_feature_extractor([converted_waves_16k.squeeze(0).cpu().numpy()],
155
+ return_tensors="pt",
156
+ return_attention_mask=True,
157
+ sampling_rate=16000)
158
+ alt_input_features = whisper_model._mask_input_features(
159
+ alt_inputs.input_features, attention_mask=alt_inputs.attention_mask).to(device)
160
+ alt_outputs = whisper_model.encoder(
161
+ alt_input_features.to(whisper_model.encoder.dtype),
162
+ head_mask=None,
163
+ output_attentions=False,
164
+ output_hidden_states=False,
165
+ return_dict=True,
166
+ )
167
+ S_alt = alt_outputs.last_hidden_state.to(torch.float32)
168
+ S_alt = S_alt[:, :converted_waves_16k.size(-1) // 320 + 1]
169
+ else:
170
+ overlapping_time = 5 # 5 seconds
171
+ S_alt_list = []
172
+ buffer = None
173
+ traversed_time = 0
174
+ while traversed_time < converted_waves_16k.size(-1):
175
+ if buffer is None: # first chunk
176
+ chunk = converted_waves_16k[:, traversed_time:traversed_time + 16000 * 30]
177
+ else:
178
+ chunk = torch.cat([buffer, converted_waves_16k[:, traversed_time:traversed_time + 16000 * (30 - overlapping_time)]], dim=-1)
179
+ alt_inputs = whisper_feature_extractor([chunk.squeeze(0).cpu().numpy()],
180
+ return_tensors="pt",
181
+ return_attention_mask=True,
182
+ sampling_rate=16000)
183
+ alt_input_features = whisper_model._mask_input_features(
184
+ alt_inputs.input_features, attention_mask=alt_inputs.attention_mask).to(device)
185
+ alt_outputs = whisper_model.encoder(
186
+ alt_input_features.to(whisper_model.encoder.dtype),
187
+ head_mask=None,
188
+ output_attentions=False,
189
+ output_hidden_states=False,
190
+ return_dict=True,
191
+ )
192
+ S_alt = alt_outputs.last_hidden_state.to(torch.float32)
193
+ S_alt = S_alt[:, :chunk.size(-1) // 320 + 1]
194
+ if traversed_time == 0:
195
+ S_alt_list.append(S_alt)
196
+ else:
197
+ S_alt_list.append(S_alt[:, 50 * overlapping_time:])
198
+ buffer = chunk[:, -16000 * overlapping_time:]
199
+ traversed_time += 30 * 16000 if traversed_time == 0 else chunk.size(-1) - 16000 * overlapping_time
200
+ S_alt = torch.cat(S_alt_list, dim=1)
201
+
202
+ ori_waves_16k = torchaudio.functional.resample(ref_audio, sr, 16000)
203
+ ori_inputs = whisper_feature_extractor([ori_waves_16k.squeeze(0).cpu().numpy()],
204
+ return_tensors="pt",
205
+ return_attention_mask=True)
206
+ ori_input_features = whisper_model._mask_input_features(
207
+ ori_inputs.input_features, attention_mask=ori_inputs.attention_mask).to(device)
208
+ with torch.no_grad():
209
+ ori_outputs = whisper_model.encoder(
210
+ ori_input_features.to(whisper_model.encoder.dtype),
211
+ head_mask=None,
212
+ output_attentions=False,
213
+ output_hidden_states=False,
214
+ return_dict=True,
215
+ )
216
+ S_ori = ori_outputs.last_hidden_state.to(torch.float32)
217
+ S_ori = S_ori[:, :ori_waves_16k.size(-1) // 320 + 1]
218
+
219
+ mel = mel_fn(source_audio.to(device).float())
220
+ mel2 = mel_fn(ref_audio.to(device).float())
221
+
222
+ target_lengths = torch.LongTensor([int(mel.size(2) * length_adjust)]).to(mel.device)
223
+ target2_lengths = torch.LongTensor([mel2.size(2)]).to(mel2.device)
224
+
225
+ feat2 = torchaudio.compliance.kaldi.fbank(ref_waves_16k,
226
+ num_mel_bins=80,
227
+ dither=0,
228
+ sample_frequency=16000)
229
+ feat2 = feat2 - feat2.mean(dim=0, keepdim=True)
230
+ style2 = campplus_model(feat2.unsqueeze(0))
231
+
232
+ if f0_condition:
233
+ F0_ori = rmvpe.infer_from_audio(ref_waves_16k[0], thred=0.03)
234
+ F0_alt = rmvpe.infer_from_audio(converted_waves_16k[0], thred=0.03)
235
+
236
+ F0_ori = torch.from_numpy(F0_ori).to(device)[None]
237
+ F0_alt = torch.from_numpy(F0_alt).to(device)[None]
238
+
239
+ voiced_F0_ori = F0_ori[F0_ori > 1]
240
+ voiced_F0_alt = F0_alt[F0_alt > 1]
241
+
242
+ log_f0_alt = torch.log(F0_alt + 1e-5)
243
+ voiced_log_f0_ori = torch.log(voiced_F0_ori + 1e-5)
244
+ voiced_log_f0_alt = torch.log(voiced_F0_alt + 1e-5)
245
+ median_log_f0_ori = torch.median(voiced_log_f0_ori)
246
+ median_log_f0_alt = torch.median(voiced_log_f0_alt)
247
+
248
+ # shift alt log f0 level to ori log f0 level
249
+ shifted_log_f0_alt = log_f0_alt.clone()
250
+ if auto_f0_adjust:
251
+ shifted_log_f0_alt[F0_alt > 1] = log_f0_alt[F0_alt > 1] - median_log_f0_alt + median_log_f0_ori
252
+ shifted_f0_alt = torch.exp(shifted_log_f0_alt)
253
+ if pitch_shift != 0:
254
+ shifted_f0_alt[F0_alt > 1] = adjust_f0_semitones(shifted_f0_alt[F0_alt > 1], pitch_shift)
255
+ else:
256
+ F0_ori = None
257
+ F0_alt = None
258
+ shifted_f0_alt = None
259
+
260
+ # Length regulation
261
+ cond, _, codes, commitment_loss, codebook_loss = inference_module.length_regulator(S_alt, ylens=target_lengths, n_quantizers=3, f0=shifted_f0_alt)
262
+ prompt_condition, _, codes, commitment_loss, codebook_loss = inference_module.length_regulator(S_ori, ylens=target2_lengths, n_quantizers=3, f0=F0_ori)
263
+
264
+ max_source_window = max_context_window - mel2.size(2)
265
+ # split source condition (cond) into chunks
266
+ processed_frames = 0
267
+ generated_wave_chunks = []
268
+ # generate chunk by chunk and stream the output
269
+ while processed_frames < cond.size(1):
270
+ chunk_cond = cond[:, processed_frames:processed_frames + max_source_window]
271
+ is_last_chunk = processed_frames + max_source_window >= cond.size(1)
272
+ cat_condition = torch.cat([prompt_condition, chunk_cond], dim=1)
273
+ with torch.autocast(device_type=device.type, dtype=torch.float16):
274
+ # Voice Conversion
275
+ vc_target = inference_module.cfm.inference(cat_condition,
276
+ torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
277
+ mel2, style2, None, diffusion_steps,
278
+ inference_cfg_rate=inference_cfg_rate)
279
+ vc_target = vc_target[:, :, mel2.size(-1):]
280
+ vc_wave = bigvgan_fn(vc_target.float())[0]
281
+ if processed_frames == 0:
282
+ if is_last_chunk:
283
+ output_wave = vc_wave[0].cpu().numpy()
284
+ generated_wave_chunks.append(output_wave)
285
+ output_wave = (output_wave * 32768.0).astype(np.int16)
286
+ mp3_bytes = AudioSegment(
287
+ output_wave.tobytes(), frame_rate=sr,
288
+ sample_width=output_wave.dtype.itemsize, channels=1
289
+ ).export(format="mp3", bitrate=bitrate).read()
290
+ yield mp3_bytes, (sr, np.concatenate(generated_wave_chunks))
291
+ break
292
+ output_wave = vc_wave[0, :-overlap_wave_len].cpu().numpy()
293
+ generated_wave_chunks.append(output_wave)
294
+ previous_chunk = vc_wave[0, -overlap_wave_len:]
295
+ processed_frames += vc_target.size(2) - overlap_frame_len
296
+ output_wave = (output_wave * 32768.0).astype(np.int16)
297
+ mp3_bytes = AudioSegment(
298
+ output_wave.tobytes(), frame_rate=sr,
299
+ sample_width=output_wave.dtype.itemsize, channels=1
300
+ ).export(format="mp3", bitrate=bitrate).read()
301
+ yield mp3_bytes, None
302
+ elif is_last_chunk:
303
+ output_wave = crossfade(previous_chunk.cpu().numpy(), vc_wave[0].cpu().numpy(), overlap_wave_len)
304
+ generated_wave_chunks.append(output_wave)
305
+ processed_frames += vc_target.size(2) - overlap_frame_len
306
+ output_wave = (output_wave * 32768.0).astype(np.int16)
307
+ mp3_bytes = AudioSegment(
308
+ output_wave.tobytes(), frame_rate=sr,
309
+ sample_width=output_wave.dtype.itemsize, channels=1
310
+ ).export(format="mp3", bitrate=bitrate).read()
311
+ yield mp3_bytes, (sr, np.concatenate(generated_wave_chunks))
312
+ break
313
+ else:
314
+ output_wave = crossfade(previous_chunk.cpu().numpy(), vc_wave[0, :-overlap_wave_len].cpu().numpy(), overlap_wave_len)
315
+ generated_wave_chunks.append(output_wave)
316
+ previous_chunk = vc_wave[0, -overlap_wave_len:]
317
+ processed_frames += vc_target.size(2) - overlap_frame_len
318
+ output_wave = (output_wave * 32768.0).astype(np.int16)
319
+ mp3_bytes = AudioSegment(
320
+ output_wave.tobytes(), frame_rate=sr,
321
+ sample_width=output_wave.dtype.itemsize, channels=1
322
+ ).export(format="mp3", bitrate=bitrate).read()
323
+ yield mp3_bytes, None
324
+
325
+
326
+ if __name__ == "__main__":
327
+ description = ("Zero-shot voice conversion with in-context learning. For local deployment please check [GitHub repository](https://github.com/Plachtaa/seed-vc) "
328
+ "for details and updates.<br>Note that any reference audio will be forcefully clipped to 25s if beyond this length.<br> "
329
+ "If total duration of source and reference audio exceeds 30s, source audio will be processed in chunks.<br> "
330
+ "无需训练的 zero-shot 语音/歌声转换模型,若需本地部署查看[GitHub页面](https://github.com/Plachtaa/seed-vc)<br>"
331
+ "请注意,参考音频若超过 25 秒,则会被自动裁剪至此长度。<br>若源音频和参考音频的总时长超过 30 秒,源音频将被分段处理。")
332
+ inputs = [
333
+ gr.Audio(type="filepath", label="Source Audio / 源音频"),
334
+ gr.Audio(type="filepath", label="Reference Audio / 参考音频"),
335
+ gr.Slider(minimum=1, maximum=200, value=10, step=1, label="Diffusion Steps / 扩散步数", info="10 by default, 50~100 for best quality / 默认为 10,50~100 为最佳质量"),
336
+ gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Length Adjust / 长度调整", info="<1.0 for speed-up speech, >1.0 for slow-down speech / <1.0 加速语速,>1.0 减慢语速"),
337
+ gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="Inference CFG Rate", info="has subtle influence / 有微小影响"),
338
+ gr.Checkbox(label="Use F0 conditioned model / 启用F0输入", value=False, info="Must set to true for singing voice conversion / 歌声转换时必须勾选"),
339
+ gr.Checkbox(label="Auto F0 adjust / 自动F0调整", value=True,
340
+ info="Roughly adjust F0 to match target voice. Only works when F0 conditioned model is used. / 粗略调整 F0 以匹配目标音色,仅在勾选 '启用F0输入' 时生效"),
341
+ gr.Slider(label='Pitch shift / 音调变换', minimum=-24, maximum=24, step=1, value=0, info="Pitch shift in semitones, only works when F0 conditioned model is used / 半音数的音高变换,仅在勾选 '启用F0输入' 时生效"),
342
+ ]
343
+
344
+ examples = [["examples/source/yae_0.wav", "examples/reference/dingzhen_0.wav", 25, 1.0, 0.7, False, True, 0],
345
+ ["examples/source/jay_0.wav", "examples/reference/azuma_0.wav", 25, 1.0, 0.7, True, True, 0],
346
+ ["examples/source/Wiz Khalifa,Charlie Puth - See You Again [vocals]_[cut_28sec].wav",
347
+ "examples/reference/teio_0.wav", 100, 1.0, 0.7, True, False, 0],
348
+ ["examples/source/TECHNOPOLIS - 2085 [vocals]_[cut_14sec].wav",
349
+ "examples/reference/trump_0.wav", 50, 1.0, 0.7, True, False, -12],
350
+ ]
351
+
352
+ outputs = [gr.Audio(label="Stream Output Audio / 流式输出", streaming=True, format='mp3'),
353
+ gr.Audio(label="Full Output Audio / 完整输出", streaming=False, format='wav')]
354
+
355
+ gr.Interface(fn=voice_conversion,
356
+ description=description,
357
+ inputs=inputs,
358
+ outputs=outputs,
359
+ title="Seed Voice Conversion",
360
+ examples=examples,
361
+ cache_examples=False,
362
+ ).launch()
app_svc.py ADDED
@@ -0,0 +1,437 @@
1
+ import os
2
+ os.environ['HF_HUB_CACHE'] = './checkpoints/hf_cache'
3
+ import gradio as gr
4
+ import torch
5
+ import torchaudio
6
+ import librosa
7
+ from modules.commons import build_model, load_checkpoint, recursive_munch, str2bool
8
+ import yaml
9
+ from hf_utils import load_custom_model_from_hf
10
+ import numpy as np
11
+ from pydub import AudioSegment
12
+ import argparse
13
+ # Load model and configuration
14
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
15
+
16
+ fp16 = False
17
+ def load_models(args):
18
+ global sr, hop_length, fp16
19
+ fp16 = args.fp16
20
+ print(f"Using device: {device}")
21
+ print(f"Using fp16: {fp16}")
22
+ # f0 conditioned model
23
+ if args.checkpoint_path is None or args.checkpoint_path == "":
24
+ dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
25
+ "DiT_seed_v2_uvit_whisper_base_f0_44k_bigvgan_pruned_ft_ema_v2.pth",
26
+ "config_dit_mel_seed_uvit_whisper_base_f0_44k.yml")
27
+ else:
28
+ print(f"Using custom checkpoint: {args.checkpoint_path}")
29
+ dit_checkpoint_path = args.checkpoint_path
30
+ dit_config_path = args.config_path
31
+ config = yaml.safe_load(open(dit_config_path, "r"))
32
+ model_params = recursive_munch(config["model_params"])
33
+ model_params.dit_type = 'DiT'
34
+ model = build_model(model_params, stage="DiT")
35
+ hop_length = config["preprocess_params"]["spect_params"]["hop_length"]
36
+ sr = config["preprocess_params"]["sr"]
37
+
38
+ # Load checkpoints
39
+ model, _, _, _ = load_checkpoint(
40
+ model,
41
+ None,
42
+ dit_checkpoint_path,
43
+ load_only_params=True,
44
+ ignore_modules=[],
45
+ is_distributed=False,
46
+ )
47
+ for key in model:
48
+ model[key].eval()
49
+ model[key].to(device)
50
+ model.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)
51
+
52
+ # Load additional modules
53
+ from modules.campplus.DTDNN import CAMPPlus
54
+
55
+ campplus_ckpt_path = load_custom_model_from_hf(
56
+ "funasr/campplus", "campplus_cn_common.bin", config_filename=None
57
+ )
58
+ campplus_model = CAMPPlus(feat_dim=80, embedding_size=192)
59
+ campplus_model.load_state_dict(torch.load(campplus_ckpt_path, map_location="cpu"))
60
+ campplus_model.eval()
61
+ campplus_model.to(device)
62
+
63
+ vocoder_type = model_params.vocoder.type
64
+
65
+ if vocoder_type == 'bigvgan':
66
+ from modules.bigvgan import bigvgan
67
+ bigvgan_name = model_params.vocoder.name
68
+ bigvgan_model = bigvgan.BigVGAN.from_pretrained(bigvgan_name, use_cuda_kernel=False)
69
+ # remove weight norm in the model and set to eval mode
70
+ bigvgan_model.remove_weight_norm()
71
+ bigvgan_model = bigvgan_model.eval().to(device)
72
+ vocoder_fn = bigvgan_model
73
+ elif vocoder_type == 'hifigan':
74
+ from modules.hifigan.generator import HiFTGenerator
75
+ from modules.hifigan.f0_predictor import ConvRNNF0Predictor
76
+ hift_config = yaml.safe_load(open('configs/hifigan.yml', 'r'))
77
+ hift_gen = HiFTGenerator(**hift_config['hift'], f0_predictor=ConvRNNF0Predictor(**hift_config['f0_predictor']))
78
+ hift_path = load_custom_model_from_hf("FunAudioLLM/CosyVoice-300M", 'hift.pt', None)
79
+ hift_gen.load_state_dict(torch.load(hift_path, map_location='cpu'))
80
+ hift_gen.eval()
81
+ hift_gen.to(device)
82
+ vocoder_fn = hift_gen
83
+ elif vocoder_type == "vocos":
84
+ vocos_config = yaml.safe_load(open(model_params.vocoder.vocos.config, 'r'))
85
+ vocos_path = model_params.vocoder.vocos.path
86
+ vocos_model_params = recursive_munch(vocos_config['model_params'])
87
+ vocos = build_model(vocos_model_params, stage='mel_vocos')
88
+ vocos_checkpoint_path = vocos_path
89
+ vocos, _, _, _ = load_checkpoint(vocos, None, vocos_checkpoint_path,
90
+ load_only_params=True, ignore_modules=[], is_distributed=False)
91
+ _ = [vocos[key].eval().to(device) for key in vocos]
92
+ _ = [vocos[key].to(device) for key in vocos]
93
+ total_params = sum(sum(p.numel() for p in vocos[key].parameters() if p.requires_grad) for key in vocos.keys())
94
+ print(f"Vocoder model total parameters: {total_params / 1_000_000:.2f}M")
95
+ vocoder_fn = vocos.decoder
96
+ else:
97
+ raise ValueError(f"Unknown vocoder type: {vocoder_type}")
98
+
99
+ speech_tokenizer_type = model_params.speech_tokenizer.type
100
+ if speech_tokenizer_type == 'whisper':
101
+ # whisper
102
+ from transformers import AutoFeatureExtractor, WhisperModel
103
+ whisper_name = model_params.speech_tokenizer.name
104
+ whisper_model = WhisperModel.from_pretrained(whisper_name, torch_dtype=torch.float16).to(device)
105
+ del whisper_model.decoder
106
+ whisper_feature_extractor = AutoFeatureExtractor.from_pretrained(whisper_name)
107
+
108
+ def semantic_fn(waves_16k):
109
+ ori_inputs = whisper_feature_extractor([waves_16k.squeeze(0).cpu().numpy()],
110
+ return_tensors="pt",
111
+ return_attention_mask=True)
112
+ ori_input_features = whisper_model._mask_input_features(
113
+ ori_inputs.input_features, attention_mask=ori_inputs.attention_mask).to(device)
114
+ with torch.no_grad():
115
+ ori_outputs = whisper_model.encoder(
116
+ ori_input_features.to(whisper_model.encoder.dtype),
117
+ head_mask=None,
118
+ output_attentions=False,
119
+ output_hidden_states=False,
120
+ return_dict=True,
121
+ )
122
+ S_ori = ori_outputs.last_hidden_state.to(torch.float32)
123
+ S_ori = S_ori[:, :waves_16k.size(-1) // 320 + 1]
124
+ return S_ori
125
+ elif speech_tokenizer_type == 'cnhubert':
126
+ from transformers import (
127
+ Wav2Vec2FeatureExtractor,
128
+ HubertModel,
129
+ )
130
+ hubert_model_name = config['model_params']['speech_tokenizer']['name']
131
+ hubert_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(hubert_model_name)
132
+ hubert_model = HubertModel.from_pretrained(hubert_model_name)
133
+ hubert_model = hubert_model.to(device)
134
+ hubert_model = hubert_model.eval()
135
+ hubert_model = hubert_model.half()
136
+
137
+ def semantic_fn(waves_16k):
138
+ ori_waves_16k_input_list = [
139
+ waves_16k[bib].cpu().numpy()
140
+ for bib in range(len(waves_16k))
141
+ ]
142
+ ori_inputs = hubert_feature_extractor(ori_waves_16k_input_list,
143
+ return_tensors="pt",
144
+ return_attention_mask=True,
145
+ padding=True,
146
+ sampling_rate=16000).to(device)
147
+ with torch.no_grad():
148
+ ori_outputs = hubert_model(
149
+ ori_inputs.input_values.half(),
150
+ )
151
+ S_ori = ori_outputs.last_hidden_state.float()
152
+ return S_ori
153
+ elif speech_tokenizer_type == 'xlsr':
154
+ from transformers import (
155
+ Wav2Vec2FeatureExtractor,
156
+ Wav2Vec2Model,
157
+ )
158
+ model_name = config['model_params']['speech_tokenizer']['name']
159
+ output_layer = config['model_params']['speech_tokenizer']['output_layer']
160
+ wav2vec_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
161
+ wav2vec_model = Wav2Vec2Model.from_pretrained(model_name)
162
+ wav2vec_model.encoder.layers = wav2vec_model.encoder.layers[:output_layer]
163
+ wav2vec_model = wav2vec_model.to(device)
164
+ wav2vec_model = wav2vec_model.eval()
165
+ wav2vec_model = wav2vec_model.half()
166
+
167
+ def semantic_fn(waves_16k):
168
+ ori_waves_16k_input_list = [
169
+ waves_16k[bib].cpu().numpy()
170
+ for bib in range(len(waves_16k))
171
+ ]
172
+ ori_inputs = wav2vec_feature_extractor(ori_waves_16k_input_list,
173
+ return_tensors="pt",
174
+ return_attention_mask=True,
175
+ padding=True,
176
+ sampling_rate=16000).to(device)
177
+ with torch.no_grad():
178
+ ori_outputs = wav2vec_model(
179
+ ori_inputs.input_values.half(),
180
+ )
181
+ S_ori = ori_outputs.last_hidden_state.float()
182
+ return S_ori
183
+ else:
184
+ raise ValueError(f"Unknown speech tokenizer type: {speech_tokenizer_type}")
185
+ # Generate mel spectrograms
186
+ mel_fn_args = {
187
+ "n_fft": config['preprocess_params']['spect_params']['n_fft'],
188
+ "win_size": config['preprocess_params']['spect_params']['win_length'],
189
+ "hop_size": config['preprocess_params']['spect_params']['hop_length'],
190
+ "num_mels": config['preprocess_params']['spect_params']['n_mels'],
191
+ "sampling_rate": sr,
192
+ "fmin": config['preprocess_params']['spect_params'].get('fmin', 0),
193
+ "fmax": None if config['preprocess_params']['spect_params'].get('fmax', "None") == "None" else 8000,
194
+ "center": False
195
+ }
196
+ from modules.audio import mel_spectrogram
197
+
198
+ to_mel = lambda x: mel_spectrogram(x, **mel_fn_args)
199
+ # f0 extractor
200
+ from modules.rmvpe import RMVPE
201
+
202
+ model_path = load_custom_model_from_hf("lj1995/VoiceConversionWebUI", "rmvpe.pt", None)
203
+ rmvpe = RMVPE(model_path, is_half=False, device=device)
204
+ f0_fn = rmvpe.infer_from_audio
205
+
206
+ return (
207
+ model,
208
+ semantic_fn,
209
+ vocoder_fn,
210
+ campplus_model,
211
+ to_mel,
212
+ mel_fn_args,
213
+ f0_fn,
214
+ )
215
+
216
+ def adjust_f0_semitones(f0_sequence, n_semitones):
217
+ factor = 2 ** (n_semitones / 12)
218
+ return f0_sequence * factor
219
+
220
+ def crossfade(chunk1, chunk2, overlap):
221
+ fade_out = np.cos(np.linspace(0, np.pi / 2, overlap)) ** 2
222
+ fade_in = np.cos(np.linspace(np.pi / 2, 0, overlap)) ** 2
223
+ chunk2[:overlap] = chunk2[:overlap] * fade_in + chunk1[-overlap:] * fade_out
224
+ return chunk2
225
+
226
+ # streaming and chunk processing related params
227
+ # max_context_window = sr // hop_length * 30
228
+ # overlap_frame_len = 16
229
+ # overlap_wave_len = overlap_frame_len * hop_length
230
+ bitrate = "320k"
231
+
232
+ model_f0, semantic_fn, vocoder_fn, campplus_model, to_mel_f0, mel_fn_args = None, None, None, None, None, None
233
+ f0_fn = None
234
+ overlap_wave_len = None
235
+ max_context_window = None
236
+ sr = None
237
+ hop_length = None
238
+ overlap_frame_len = 16
239
+
240
+ @torch.no_grad()
241
+ @torch.inference_mode()
242
+ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_cfg_rate, auto_f0_adjust, pitch_shift):
243
+ inference_module = model_f0
244
+ mel_fn = to_mel_f0
245
+ # Load audio
246
+ source_audio = librosa.load(source, sr=sr)[0]
247
+ ref_audio = librosa.load(target, sr=sr)[0]
248
+
249
+ # Process audio
250
+ source_audio = torch.tensor(source_audio).unsqueeze(0).float().to(device)
251
+ ref_audio = torch.tensor(ref_audio[:sr * 25]).unsqueeze(0).float().to(device)
252
+
253
+ # Resample
254
+ ref_waves_16k = torchaudio.functional.resample(ref_audio, sr, 16000)
255
+ converted_waves_16k = torchaudio.functional.resample(source_audio, sr, 16000)
256
+ # if source audio less than 30 seconds, whisper can handle in one forward
257
+ if converted_waves_16k.size(-1) <= 16000 * 30:
258
+ S_alt = semantic_fn(converted_waves_16k)
259
+ else:
260
+ overlapping_time = 5 # 5 seconds
261
+ S_alt_list = []
262
+ buffer = None
263
+ traversed_time = 0
264
+ while traversed_time < converted_waves_16k.size(-1):
265
+ if buffer is None: # first chunk
266
+ chunk = converted_waves_16k[:, traversed_time:traversed_time + 16000 * 30]
267
+ else:
268
+ chunk = torch.cat([buffer, converted_waves_16k[:, traversed_time:traversed_time + 16000 * (30 - overlapping_time)]], dim=-1)
269
+ S_alt = semantic_fn(chunk)
270
+ if traversed_time == 0:
271
+ S_alt_list.append(S_alt)
272
+ else:
273
+ S_alt_list.append(S_alt[:, 50 * overlapping_time:])
274
+ buffer = chunk[:, -16000 * overlapping_time:]
275
+ traversed_time += 30 * 16000 if traversed_time == 0 else chunk.size(-1) - 16000 * overlapping_time
276
+ S_alt = torch.cat(S_alt_list, dim=1)
277
+
278
+ ori_waves_16k = torchaudio.functional.resample(ref_audio, sr, 16000)
279
+ S_ori = semantic_fn(ori_waves_16k)
280
+
281
+ mel = mel_fn(source_audio.to(device).float())
282
+ mel2 = mel_fn(ref_audio.to(device).float())
283
+
284
+ target_lengths = torch.LongTensor([int(mel.size(2) * length_adjust)]).to(mel.device)
285
+ target2_lengths = torch.LongTensor([mel2.size(2)]).to(mel2.device)
286
+
287
+ feat2 = torchaudio.compliance.kaldi.fbank(ref_waves_16k,
288
+ num_mel_bins=80,
289
+ dither=0,
290
+ sample_frequency=16000)
291
+ feat2 = feat2 - feat2.mean(dim=0, keepdim=True)
292
+ style2 = campplus_model(feat2.unsqueeze(0))
293
+
294
+ F0_ori = f0_fn(ref_waves_16k[0], thred=0.03)
295
+ F0_alt = f0_fn(converted_waves_16k[0], thred=0.03)
296
+
297
+ F0_ori = torch.from_numpy(F0_ori).to(device)[None]
298
+ F0_alt = torch.from_numpy(F0_alt).to(device)[None]
299
+
300
+ voiced_F0_ori = F0_ori[F0_ori > 1]
301
+ voiced_F0_alt = F0_alt[F0_alt > 1]
302
+
303
+ log_f0_alt = torch.log(F0_alt + 1e-5)
304
+ voiced_log_f0_ori = torch.log(voiced_F0_ori + 1e-5)
305
+ voiced_log_f0_alt = torch.log(voiced_F0_alt + 1e-5)
306
+ median_log_f0_ori = torch.median(voiced_log_f0_ori)
307
+ median_log_f0_alt = torch.median(voiced_log_f0_alt)
308
+
309
+ # shift alt log f0 level to ori log f0 level
310
+ shifted_log_f0_alt = log_f0_alt.clone()
311
+ if auto_f0_adjust:
312
+ shifted_log_f0_alt[F0_alt > 1] = log_f0_alt[F0_alt > 1] - median_log_f0_alt + median_log_f0_ori
313
+ shifted_f0_alt = torch.exp(shifted_log_f0_alt)
314
+ if pitch_shift != 0:
315
+ shifted_f0_alt[F0_alt > 1] = adjust_f0_semitones(shifted_f0_alt[F0_alt > 1], pitch_shift)
316
+
317
+ # Length regulation
318
+ cond, _, codes, commitment_loss, codebook_loss = inference_module.length_regulator(S_alt, ylens=target_lengths, n_quantizers=3, f0=shifted_f0_alt)
319
+ prompt_condition, _, codes, commitment_loss, codebook_loss = inference_module.length_regulator(S_ori, ylens=target2_lengths, n_quantizers=3, f0=F0_ori)
320
+ interpolated_shifted_f0_alt = torch.nn.functional.interpolate(shifted_f0_alt.unsqueeze(1), size=cond.size(1),
321
+ mode='nearest').squeeze(1)
322
+ max_source_window = max_context_window - mel2.size(2)
323
+ # split source condition (cond) into chunks
324
+ processed_frames = 0
325
+ generated_wave_chunks = []
326
+ # generate chunk by chunk and stream the output
327
+ while processed_frames < cond.size(1):
328
+ chunk_cond = cond[:, processed_frames:processed_frames + max_source_window]
329
+ chunk_f0 = interpolated_shifted_f0_alt[:, processed_frames:processed_frames + max_source_window]
330
+ is_last_chunk = processed_frames + max_source_window >= cond.size(1)
331
+ cat_condition = torch.cat([prompt_condition, chunk_cond], dim=1)
332
+ with torch.autocast(device_type=device.type, dtype=torch.float16 if fp16 else torch.float32):
333
+ # Voice Conversion
334
+ vc_target = inference_module.cfm.inference(cat_condition,
335
+ torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
336
+ mel2, style2, None, diffusion_steps,
337
+ inference_cfg_rate=inference_cfg_rate)
338
+ vc_target = vc_target[:, :, mel2.size(-1):]
339
+ vc_wave = vocoder_fn(vc_target.float()).squeeze().cpu()
340
+ if vc_wave.ndim == 1:
341
+ vc_wave = vc_wave.unsqueeze(0)
342
+ if processed_frames == 0:
343
+ if is_last_chunk:
344
+ output_wave = vc_wave[0].cpu().numpy()
345
+ generated_wave_chunks.append(output_wave)
346
+ output_wave = (output_wave * 32768.0).astype(np.int16)
347
+ mp3_bytes = AudioSegment(
348
+ output_wave.tobytes(), frame_rate=sr,
349
+ sample_width=output_wave.dtype.itemsize, channels=1
350
+ ).export(format="mp3", bitrate=bitrate).read()
351
+ yield mp3_bytes, (sr, np.concatenate(generated_wave_chunks))
352
+ break
353
+ output_wave = vc_wave[0, :-overlap_wave_len].cpu().numpy()
354
+ generated_wave_chunks.append(output_wave)
355
+ previous_chunk = vc_wave[0, -overlap_wave_len:]
356
+ processed_frames += vc_target.size(2) - overlap_frame_len
357
+ output_wave = (output_wave * 32768.0).astype(np.int16)
358
+ mp3_bytes = AudioSegment(
359
+ output_wave.tobytes(), frame_rate=sr,
360
+ sample_width=output_wave.dtype.itemsize, channels=1
361
+ ).export(format="mp3", bitrate=bitrate).read()
362
+ yield mp3_bytes, None
363
+ elif is_last_chunk:
364
+ output_wave = crossfade(previous_chunk.cpu().numpy(), vc_wave[0].cpu().numpy(), overlap_wave_len)
365
+ generated_wave_chunks.append(output_wave)
366
+ processed_frames += vc_target.size(2) - overlap_frame_len
367
+ output_wave = (output_wave * 32768.0).astype(np.int16)
368
+ mp3_bytes = AudioSegment(
369
+ output_wave.tobytes(), frame_rate=sr,
370
+ sample_width=output_wave.dtype.itemsize, channels=1
371
+ ).export(format="mp3", bitrate=bitrate).read()
372
+ yield mp3_bytes, (sr, np.concatenate(generated_wave_chunks))
373
+ break
374
+ else:
375
+ output_wave = crossfade(previous_chunk.cpu().numpy(), vc_wave[0, :-overlap_wave_len].cpu().numpy(), overlap_wave_len)
376
+ generated_wave_chunks.append(output_wave)
377
+ previous_chunk = vc_wave[0, -overlap_wave_len:]
378
+ processed_frames += vc_target.size(2) - overlap_frame_len
379
+ output_wave = (output_wave * 32768.0).astype(np.int16)
380
+ mp3_bytes = AudioSegment(
381
+ output_wave.tobytes(), frame_rate=sr,
382
+ sample_width=output_wave.dtype.itemsize, channels=1
383
+ ).export(format="mp3", bitrate=bitrate).read()
384
+ yield mp3_bytes, None
385
+
386
+
387
+ def main(args):
388
+ global model_f0, semantic_fn, vocoder_fn, campplus_model, to_mel_f0, mel_fn_args, f0_fn
389
+ global overlap_wave_len, max_context_window, sr, hop_length
390
+ model_f0, semantic_fn, vocoder_fn, campplus_model, to_mel_f0, mel_fn_args, f0_fn = load_models(args)
391
+ # streaming and chunk processing related params
392
+ max_context_window = sr // hop_length * 30
393
+ overlap_wave_len = overlap_frame_len * hop_length
394
+ description = ("Zero-shot voice conversion with in-context learning. For local deployment please check [GitHub repository](https://github.com/Plachtaa/seed-vc) "
395
+ "for details and updates.<br>Note that any reference audio will be forcefully clipped to 25s if beyond this length.<br> "
396
+ "If total duration of source and reference audio exceeds 30s, source audio will be processed in chunks.<br> "
397
+ "无需训练的 zero-shot 语音/歌声转换模型,若需本地部署查看[GitHub页面](https://github.com/Plachtaa/seed-vc)<br>"
398
+ "请注意,参考音频若超过 25 秒,则会被自动裁剪至此长度。<br>若源音频和参考音频的总时长超过 30 秒,源音频将被分段处理。")
399
+ inputs = [
400
+ gr.Audio(type="filepath", label="Source Audio / 源音频"),
401
+ gr.Audio(type="filepath", label="Reference Audio / 参考音频"),
402
+ gr.Slider(minimum=1, maximum=200, value=10, step=1, label="Diffusion Steps / 扩散步数", info="10 by default, 50~100 for best quality / 默认为 10,50~100 为最佳质量"),
403
+ gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Length Adjust / 长度调整", info="<1.0 for speed-up speech, >1.0 for slow-down speech / <1.0 加速语速,>1.0 减慢语速"),
404
+ gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="Inference CFG Rate", info="has subtle influence / 有微小影响"),
405
+ gr.Checkbox(label="Auto F0 adjust / 自动F0调整", value=True,
406
+ info="Roughly adjust F0 to match target voice. Only works when F0 conditioned model is used. / 粗略调整 F0 以匹配目标音色,仅在勾选 '启用F0输入' 时生效"),
407
+ gr.Slider(label='Pitch shift / 音调变换', minimum=-24, maximum=24, step=1, value=0, info="Pitch shift in semitones, only works when F0 conditioned model is used / 半音数的音高变换,仅在勾选 '启用F0输入' 时生效"),
408
+ ]
409
+
410
+ examples = [["examples/source/yae_0.wav", "examples/reference/dingzhen_0.wav", 25, 1.0, 0.7, True, 0],
411
+ ["examples/source/jay_0.wav", "examples/reference/azuma_0.wav", 25, 1.0, 0.7, True, 0],
412
+ ["examples/source/Wiz Khalifa,Charlie Puth - See You Again [vocals]_[cut_28sec].wav",
413
+ "examples/reference/teio_0.wav", 50, 1.0, 0.7, False, 0],
414
+ ["examples/source/TECHNOPOLIS - 2085 [vocals]_[cut_14sec].wav",
415
+ "examples/reference/trump_0.wav", 50, 1.0, 0.7, False, -12],
416
+ ]
417
+
418
+ outputs = [gr.Audio(label="Stream Output Audio / 流式输出", streaming=True, format='mp3'),
419
+ gr.Audio(label="Full Output Audio / 完整输出", streaming=False, format='wav')]
420
+
421
+ gr.Interface(fn=voice_conversion,
422
+ description=description,
423
+ inputs=inputs,
424
+ outputs=outputs,
425
+ title="Seed Voice Conversion",
426
+ examples=examples,
427
+ cache_examples=False,
428
+ ).launch(share=args.share,)
429
+
430
+ if __name__ == "__main__":
431
+ parser = argparse.ArgumentParser()
432
+ parser.add_argument("--checkpoint-path", type=str, help="Path to the checkpoint file", default=None)
433
+ parser.add_argument("--config-path", type=str, help="Path to the config file", default=None)
434
+ parser.add_argument("--share", type=str2bool, nargs="?", const=True, default=False, help="Whether to share the app")
435
+ parser.add_argument("--fp16", type=str2bool, nargs="?", const=True, help="Whether to use fp16", default=True)
436
+ args = parser.parse_args()
437
+ main(args)
app_vc.py ADDED
@@ -0,0 +1,390 @@
1
+ import os
2
+ os.environ['HF_HUB_CACHE'] = './checkpoints/hf_cache'
3
+ import gradio as gr
4
+ import torch
5
+ import torchaudio
6
+ import librosa
7
+ from modules.commons import build_model, load_checkpoint, recursive_munch, str2bool
8
+ import yaml
9
+ from hf_utils import load_custom_model_from_hf
10
+ import numpy as np
11
+ from pydub import AudioSegment
12
+ import argparse
13
+
14
+ # Load model and configuration
15
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
16
+ fp16 = False
17
+ def load_models(args):
18
+ global sr, hop_length, fp16
19
+ fp16 = args.fp16
20
+ print(f"Using device: {device}")
21
+ print(f"Using fp16: {fp16}")
22
+ if args.checkpoint_path is None or args.checkpoint_path == "":
23
+ dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
24
+ "DiT_seed_v2_uvit_whisper_small_wavenet_bigvgan_pruned.pth",
25
+ "config_dit_mel_seed_uvit_whisper_small_wavenet.yml")
26
+ else:
27
+ dit_checkpoint_path = args.checkpoint_path
28
+ dit_config_path = args.config_path
29
+ config = yaml.safe_load(open(dit_config_path, "r"))
30
+ model_params = recursive_munch(config["model_params"])
31
+ model_params.dit_type = 'DiT'
32
+ model = build_model(model_params, stage="DiT")
33
+ hop_length = config["preprocess_params"]["spect_params"]["hop_length"]
34
+ sr = config["preprocess_params"]["sr"]
35
+
36
+ # Load checkpoints
37
+ model, _, _, _ = load_checkpoint(
38
+ model,
39
+ None,
40
+ dit_checkpoint_path,
41
+ load_only_params=True,
42
+ ignore_modules=[],
43
+ is_distributed=False,
44
+ )
45
+ for key in model:
46
+ model[key].eval()
47
+ model[key].to(device)
48
+ model.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)
49
+
50
+ # Load additional modules
51
+ from modules.campplus.DTDNN import CAMPPlus
52
+
53
+ campplus_ckpt_path = load_custom_model_from_hf(
54
+ "funasr/campplus", "campplus_cn_common.bin", config_filename=None
55
+ )
56
+ campplus_model = CAMPPlus(feat_dim=80, embedding_size=192)
57
+ campplus_model.load_state_dict(torch.load(campplus_ckpt_path, map_location="cpu"))
58
+ campplus_model.eval()
59
+ campplus_model.to(device)
60
+
61
+ vocoder_type = model_params.vocoder.type
62
+
63
+ if vocoder_type == 'bigvgan':
64
+ from modules.bigvgan import bigvgan
65
+ bigvgan_name = model_params.vocoder.name
66
+ bigvgan_model = bigvgan.BigVGAN.from_pretrained(bigvgan_name, use_cuda_kernel=False)
67
+ # remove weight norm in the model and set to eval mode
68
+ bigvgan_model.remove_weight_norm()
69
+ bigvgan_model = bigvgan_model.eval().to(device)
70
+ vocoder_fn = bigvgan_model
71
+ elif vocoder_type == 'hifigan':
72
+ from modules.hifigan.generator import HiFTGenerator
73
+ from modules.hifigan.f0_predictor import ConvRNNF0Predictor
74
+ hift_config = yaml.safe_load(open('configs/hifigan.yml', 'r'))
75
+ hift_gen = HiFTGenerator(**hift_config['hift'], f0_predictor=ConvRNNF0Predictor(**hift_config['f0_predictor']))
76
+ hift_path = load_custom_model_from_hf("FunAudioLLM/CosyVoice-300M", 'hift.pt', None)
77
+ hift_gen.load_state_dict(torch.load(hift_path, map_location='cpu'))
78
+ hift_gen.eval()
79
+ hift_gen.to(device)
80
+ vocoder_fn = hift_gen
81
+ elif vocoder_type == "vocos":
82
+ vocos_config = yaml.safe_load(open(model_params.vocoder.vocos.config, 'r'))
83
+ vocos_path = model_params.vocoder.vocos.path
84
+ vocos_model_params = recursive_munch(vocos_config['model_params'])
85
+ vocos = build_model(vocos_model_params, stage='mel_vocos')
86
+ vocos_checkpoint_path = vocos_path
87
+ vocos, _, _, _ = load_checkpoint(vocos, None, vocos_checkpoint_path,
88
+ load_only_params=True, ignore_modules=[], is_distributed=False)
89
+ _ = [vocos[key].eval().to(device) for key in vocos]
90
+ _ = [vocos[key].to(device) for key in vocos]
91
+ total_params = sum(sum(p.numel() for p in vocos[key].parameters() if p.requires_grad) for key in vocos.keys())
92
+ print(f"Vocoder model total parameters: {total_params / 1_000_000:.2f}M")
93
+ vocoder_fn = vocos.decoder
94
+ else:
95
+ raise ValueError(f"Unknown vocoder type: {vocoder_type}")
96
+
97
+ speech_tokenizer_type = model_params.speech_tokenizer.type
98
+ if speech_tokenizer_type == 'whisper':
99
+ # whisper
100
+ from transformers import AutoFeatureExtractor, WhisperModel
101
+ whisper_name = model_params.speech_tokenizer.name
102
+ whisper_model = WhisperModel.from_pretrained(whisper_name, torch_dtype=torch.float16).to(device)
103
+ del whisper_model.decoder
104
+ whisper_feature_extractor = AutoFeatureExtractor.from_pretrained(whisper_name)
105
+
106
+ def semantic_fn(waves_16k):
107
+ ori_inputs = whisper_feature_extractor([waves_16k.squeeze(0).cpu().numpy()],
108
+ return_tensors="pt",
109
+ return_attention_mask=True)
110
+ ori_input_features = whisper_model._mask_input_features(
111
+ ori_inputs.input_features, attention_mask=ori_inputs.attention_mask).to(device)
112
+ with torch.no_grad():
113
+ ori_outputs = whisper_model.encoder(
114
+ ori_input_features.to(whisper_model.encoder.dtype),
115
+ head_mask=None,
116
+ output_attentions=False,
117
+ output_hidden_states=False,
118
+ return_dict=True,
119
+ )
120
+ S_ori = ori_outputs.last_hidden_state.to(torch.float32)
121
+ S_ori = S_ori[:, :waves_16k.size(-1) // 320 + 1]
122
+ return S_ori
123
+ elif speech_tokenizer_type == 'cnhubert':
124
+ from transformers import (
125
+ Wav2Vec2FeatureExtractor,
126
+ HubertModel,
127
+ )
128
+ hubert_model_name = config['model_params']['speech_tokenizer']['name']
129
+ hubert_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(hubert_model_name)
130
+ hubert_model = HubertModel.from_pretrained(hubert_model_name)
131
+ hubert_model = hubert_model.to(device)
132
+ hubert_model = hubert_model.eval()
133
+ hubert_model = hubert_model.half()
134
+
135
+ def semantic_fn(waves_16k):
136
+ ori_waves_16k_input_list = [
137
+ waves_16k[bib].cpu().numpy()
138
+ for bib in range(len(waves_16k))
139
+ ]
140
+ ori_inputs = hubert_feature_extractor(ori_waves_16k_input_list,
141
+ return_tensors="pt",
142
+ return_attention_mask=True,
143
+ padding=True,
144
+ sampling_rate=16000).to(device)
145
+ with torch.no_grad():
146
+ ori_outputs = hubert_model(
147
+ ori_inputs.input_values.half(),
148
+ )
149
+ S_ori = ori_outputs.last_hidden_state.float()
150
+ return S_ori
151
+ elif speech_tokenizer_type == 'xlsr':
152
+ from transformers import (
153
+ Wav2Vec2FeatureExtractor,
154
+ Wav2Vec2Model,
155
+ )
156
+ model_name = config['model_params']['speech_tokenizer']['name']
157
+ output_layer = config['model_params']['speech_tokenizer']['output_layer']
158
+ wav2vec_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
159
+ wav2vec_model = Wav2Vec2Model.from_pretrained(model_name)
160
+ wav2vec_model.encoder.layers = wav2vec_model.encoder.layers[:output_layer]
161
+ wav2vec_model = wav2vec_model.to(device)
162
+ wav2vec_model = wav2vec_model.eval()
163
+ wav2vec_model = wav2vec_model.half()
164
+
165
+ def semantic_fn(waves_16k):
166
+ ori_waves_16k_input_list = [
167
+ waves_16k[bib].cpu().numpy()
168
+ for bib in range(len(waves_16k))
169
+ ]
170
+ ori_inputs = wav2vec_feature_extractor(ori_waves_16k_input_list,
171
+ return_tensors="pt",
172
+ return_attention_mask=True,
173
+ padding=True,
174
+ sampling_rate=16000).to(device)
175
+ with torch.no_grad():
176
+ ori_outputs = wav2vec_model(
177
+ ori_inputs.input_values.half(),
178
+ )
179
+ S_ori = ori_outputs.last_hidden_state.float()
180
+ return S_ori
181
+ else:
182
+ raise ValueError(f"Unknown speech tokenizer type: {speech_tokenizer_type}")
183
+ # Generate mel spectrograms
184
+ mel_fn_args = {
185
+ "n_fft": config['preprocess_params']['spect_params']['n_fft'],
186
+ "win_size": config['preprocess_params']['spect_params']['win_length'],
187
+ "hop_size": config['preprocess_params']['spect_params']['hop_length'],
188
+ "num_mels": config['preprocess_params']['spect_params']['n_mels'],
189
+ "sampling_rate": sr,
190
+ "fmin": config['preprocess_params']['spect_params'].get('fmin', 0),
191
+ "fmax": None if config['preprocess_params']['spect_params'].get('fmax', "None") == "None" else 8000,
192
+ "center": False
193
+ }
194
+ from modules.audio import mel_spectrogram
195
+
196
+ to_mel = lambda x: mel_spectrogram(x, **mel_fn_args)
197
+
198
+ return (
199
+ model,
200
+ semantic_fn,
201
+ vocoder_fn,
202
+ campplus_model,
203
+ to_mel,
204
+ mel_fn_args,
205
+ )
206
+ def crossfade(chunk1, chunk2, overlap):
207
+ fade_out = np.cos(np.linspace(0, np.pi / 2, overlap)) ** 2
208
+ fade_in = np.cos(np.linspace(np.pi / 2, 0, overlap)) ** 2
209
+ chunk2[:overlap] = chunk2[:overlap] * fade_in + chunk1[-overlap:] * fade_out
210
+ return chunk2
211
+
212
+ bitrate = "320k"
213
+
214
+ model, semantic_fn, vocoder_fn, campplus_model, to_mel, mel_fn_args = None, None, None, None, None, None
215
+ overlap_wave_len = None
216
+ max_context_window = None
217
+ sr = None
218
+ hop_length = None
219
+ overlap_frame_len = 16
220
+ @torch.no_grad()
221
+ @torch.inference_mode()
222
+ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_cfg_rate):
223
+ inference_module = model
224
+ mel_fn = to_mel
225
+ # Load audio
226
+ source_audio = librosa.load(source, sr=sr)[0]
227
+ ref_audio = librosa.load(target, sr=sr)[0]
228
+
229
+ # Process audio
230
+ source_audio = torch.tensor(source_audio).unsqueeze(0).float().to(device)
231
+ ref_audio = torch.tensor(ref_audio[:sr * 25]).unsqueeze(0).float().to(device)
232
+
233
+ # Resample
234
+ ref_waves_16k = torchaudio.functional.resample(ref_audio, sr, 16000)
235
+ converted_waves_16k = torchaudio.functional.resample(source_audio, sr, 16000)
236
+ # if source audio less than 30 seconds, whisper can handle in one forward
237
+ if converted_waves_16k.size(-1) <= 16000 * 30:
238
+ S_alt = semantic_fn(converted_waves_16k)
239
+ else:
240
+ overlapping_time = 5 # 5 seconds
241
+ S_alt_list = []
242
+ buffer = None
243
+ traversed_time = 0
244
+ while traversed_time < converted_waves_16k.size(-1):
245
+ if buffer is None: # first chunk
246
+ chunk = converted_waves_16k[:, traversed_time:traversed_time + 16000 * 30]
247
+ else:
248
+ chunk = torch.cat([buffer, converted_waves_16k[:, traversed_time:traversed_time + 16000 * (30 - overlapping_time)]], dim=-1)
249
+ S_alt = semantic_fn(chunk)
250
+ if traversed_time == 0:
251
+ S_alt_list.append(S_alt)
252
+ else:
253
+ S_alt_list.append(S_alt[:, 50 * overlapping_time:])
254
+ buffer = chunk[:, -16000 * overlapping_time:]
255
+ traversed_time += 30 * 16000 if traversed_time == 0 else chunk.size(-1) - 16000 * overlapping_time
256
+ S_alt = torch.cat(S_alt_list, dim=1)
257
+
258
+ ori_waves_16k = torchaudio.functional.resample(ref_audio, sr, 16000)
259
+ S_ori = semantic_fn(ori_waves_16k)
260
+
261
+ mel = mel_fn(source_audio.to(device).float())
262
+ mel2 = mel_fn(ref_audio.to(device).float())
263
+
264
+ target_lengths = torch.LongTensor([int(mel.size(2) * length_adjust)]).to(mel.device)
265
+ target2_lengths = torch.LongTensor([mel2.size(2)]).to(mel2.device)
266
+
267
+ feat2 = torchaudio.compliance.kaldi.fbank(ref_waves_16k,
268
+ num_mel_bins=80,
269
+ dither=0,
270
+ sample_frequency=16000)
271
+ feat2 = feat2 - feat2.mean(dim=0, keepdim=True)
272
+ style2 = campplus_model(feat2.unsqueeze(0))
273
+
274
+ F0_ori = None
275
+ F0_alt = None
276
+ shifted_f0_alt = None
277
+
278
+ # Length regulation
279
+ cond, _, codes, commitment_loss, codebook_loss = inference_module.length_regulator(S_alt, ylens=target_lengths, n_quantizers=3, f0=shifted_f0_alt)
280
+ prompt_condition, _, codes, commitment_loss, codebook_loss = inference_module.length_regulator(S_ori, ylens=target2_lengths, n_quantizers=3, f0=F0_ori)
281
+
282
+ max_source_window = max_context_window - mel2.size(2)
283
+ # split source condition (cond) into chunks
284
+ processed_frames = 0
285
+ generated_wave_chunks = []
286
+ # generate chunk by chunk and stream the output
287
+ while processed_frames < cond.size(1):
288
+ chunk_cond = cond[:, processed_frames:processed_frames + max_source_window]
289
+ is_last_chunk = processed_frames + max_source_window >= cond.size(1)
290
+ cat_condition = torch.cat([prompt_condition, chunk_cond], dim=1)
291
+ with torch.autocast(device_type=device.type, dtype=torch.float16 if fp16 else torch.float32):
292
+ # Voice Conversion
293
+ vc_target = inference_module.cfm.inference(cat_condition,
294
+ torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
295
+ mel2, style2, None, diffusion_steps,
296
+ inference_cfg_rate=inference_cfg_rate)
297
+ vc_target = vc_target[:, :, mel2.size(-1):]
298
+ vc_wave = vocoder_fn(vc_target.float())[0]
299
+ if vc_wave.ndim == 1:
300
+ vc_wave = vc_wave.unsqueeze(0)
301
+ if processed_frames == 0:
302
+ if is_last_chunk:
303
+ output_wave = vc_wave[0].cpu().numpy()
304
+ generated_wave_chunks.append(output_wave)
305
+ output_wave = (output_wave * 32768.0).astype(np.int16)
306
+ mp3_bytes = AudioSegment(
307
+ output_wave.tobytes(), frame_rate=sr,
308
+ sample_width=output_wave.dtype.itemsize, channels=1
309
+ ).export(format="mp3", bitrate=bitrate).read()
310
+ yield mp3_bytes, (sr, np.concatenate(generated_wave_chunks))
311
+ break
312
+ output_wave = vc_wave[0, :-overlap_wave_len].cpu().numpy()
313
+ generated_wave_chunks.append(output_wave)
314
+ previous_chunk = vc_wave[0, -overlap_wave_len:]
315
+ processed_frames += vc_target.size(2) - overlap_frame_len
316
+ output_wave = (output_wave * 32768.0).astype(np.int16)
317
+ mp3_bytes = AudioSegment(
318
+ output_wave.tobytes(), frame_rate=sr,
319
+ sample_width=output_wave.dtype.itemsize, channels=1
320
+ ).export(format="mp3", bitrate=bitrate).read()
321
+ yield mp3_bytes, None
322
+ elif is_last_chunk:
323
+ output_wave = crossfade(previous_chunk.cpu().numpy(), vc_wave[0].cpu().numpy(), overlap_wave_len)
324
+ generated_wave_chunks.append(output_wave)
325
+ processed_frames += vc_target.size(2) - overlap_frame_len
326
+ output_wave = (output_wave * 32768.0).astype(np.int16)
327
+ mp3_bytes = AudioSegment(
328
+ output_wave.tobytes(), frame_rate=sr,
329
+ sample_width=output_wave.dtype.itemsize, channels=1
330
+ ).export(format="mp3", bitrate=bitrate).read()
331
+ yield mp3_bytes, (sr, np.concatenate(generated_wave_chunks))
332
+ break
333
+ else:
334
+ output_wave = crossfade(previous_chunk.cpu().numpy(), vc_wave[0, :-overlap_wave_len].cpu().numpy(), overlap_wave_len)
335
+ generated_wave_chunks.append(output_wave)
336
+ previous_chunk = vc_wave[0, -overlap_wave_len:]
337
+ processed_frames += vc_target.size(2) - overlap_frame_len
338
+ output_wave = (output_wave * 32768.0).astype(np.int16)
339
+ mp3_bytes = AudioSegment(
340
+ output_wave.tobytes(), frame_rate=sr,
341
+ sample_width=output_wave.dtype.itemsize, channels=1
342
+ ).export(format="mp3", bitrate=bitrate).read()
343
+ yield mp3_bytes, None
344
+
345
+
346
+ def main(args):
347
+ global model, semantic_fn, vocoder_fn, campplus_model, to_mel, mel_fn_args
348
+ global overlap_wave_len, max_context_window, sr, hop_length
349
+ model, semantic_fn, vocoder_fn, campplus_model, to_mel, mel_fn_args = load_models(args)
350
+ sr = mel_fn_args["sampling_rate"]
+ hop_length = mel_fn_args["hop_size"]
+ # streaming and chunk processing related params
351
+ max_context_window = sr // hop_length * 30
352
+ overlap_wave_len = overlap_frame_len * hop_length
353
+ description = ("Zero-shot voice conversion with in-context learning. For local deployment please check [GitHub repository](https://github.com/Plachtaa/seed-vc) "
354
+ "for details and updates.<br>Note that any reference audio will be forcefully clipped to 25s if beyond this length.<br> "
355
+ "If total duration of source and reference audio exceeds 30s, source audio will be processed in chunks.<br> "
356
+ "无需训练的 zero-shot 语音/歌声转换模型,若需本地部署查看[GitHub页面](https://github.com/Plachtaa/seed-vc)<br>"
357
+ "请注意,参考音频若超过 25 秒,则会被自动裁剪至此长度。<br>若源音频和参考音频的总时长超过 30 秒,源音频将被分段处理。")
358
+ inputs = [
359
+ gr.Audio(type="filepath", label="Source Audio / 源音频"),
360
+ gr.Audio(type="filepath", label="Reference Audio / 参考音频"),
361
+ gr.Slider(minimum=1, maximum=200, value=10, step=1, label="Diffusion Steps / 扩散步数", info="10 by default, 50~100 for best quality / 默认为 10,50~100 为最佳质量"),
362
+ gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Length Adjust / 长度调整", info="<1.0 for speed-up speech, >1.0 for slow-down speech / <1.0 加速语速,>1.0 减慢语速"),
363
+ gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="Inference CFG Rate", info="has subtle influence / 有微小影响"),
364
+ ]
365
+
366
+ examples = [["examples/source/yae_0.wav", "examples/reference/dingzhen_0.wav", 25, 1.0, 0.7],
367
+ ["examples/source/jay_0.wav", "examples/reference/azuma_0.wav", 25, 1.0, 0.7, True, True, 0],
368
+ ]
369
+
370
+ outputs = [gr.Audio(label="Stream Output Audio / 流式输出", streaming=True, format='mp3'),
371
+ gr.Audio(label="Full Output Audio / 完整输出", streaming=False, format='wav')]
372
+
373
+
374
+ gr.Interface(fn=voice_conversion,
375
+ description=description,
376
+ inputs=inputs,
377
+ outputs=outputs,
378
+ title="Seed Voice Conversion",
379
+ examples=examples,
380
+ cache_examples=False,
381
+ ).launch(share=args.share,)
382
+
383
+ if __name__ == "__main__":
384
+ parser = argparse.ArgumentParser()
385
+ parser.add_argument("--checkpoint-path", type=str, help="Path to the checkpoint file", default=None)
386
+ parser.add_argument("--config-path", type=str, help="Path to the config file", default=None)
387
+ parser.add_argument("--share", type=str2bool, nargs="?", const=True, default=False, help="Whether to share the app")
388
+ parser.add_argument("--fp16", type=str2bool, nargs="?", const=True, help="Whether to use fp16", default=True)
389
+ args = parser.parse_args()
390
+ main(args)
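A note on the chunk stitching above: crossfade() blends consecutive streamed chunks with complementary raised-cosine windows whose weights sum to one at every overlap sample, so a steady signal passes through unchanged. A minimal standalone sketch of that property (the helper is copied from the code above; the toy arrays and overlap value are illustrative, not part of the upload):

import numpy as np

def crossfade(chunk1, chunk2, overlap):
    # cos^2 fade-out on the tail of chunk1, complementary fade-in on the head of chunk2
    fade_out = np.cos(np.linspace(0, np.pi / 2, overlap)) ** 2
    fade_in = np.cos(np.linspace(np.pi / 2, 0, overlap)) ** 2
    chunk2[:overlap] = chunk2[:overlap] * fade_in + chunk1[-overlap:] * fade_out
    return chunk2

overlap = 8
prev_tail = np.ones(16)   # stands in for the tail of the previous generated chunk
next_head = np.ones(16)   # stands in for the head of the next generated chunk
out = crossfade(prev_tail, next_head, overlap)
# fade_in + fade_out == 1 sample-wise, so a constant signal stays constant across the seam
assert np.allclose(out[:overlap], 1.0)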
assets/real-time-demo.webm ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1006b20504bf6d83625bea24803765a60dd122261b46ac6cd1bdefecfad697cf
3
+ size 3311794
baselines/cosyvoice.py ADDED
@@ -0,0 +1,24 @@
1
+ import os
2
+ import torch
3
+ import sys
4
+ import librosa
5
+ sys.path.append('../CosyVoice')
6
+ import sys
7
+ sys.path.append("../CosyVoice/third_party/Matcha-TTS")
8
+ from cosyvoice.cli.cosyvoice import CosyVoice
9
+ from cosyvoice.utils.file_utils import load_wav
10
+ import torchaudio
11
+ # from modelscope import snapshot_download
12
+ # snapshot_download('iic/CosyVoice-300M-25Hz', local_dir='pretrained_models/CosyVoice-300M-25Hz')
13
+ cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz')
14
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
15
+
16
+ @torch.no_grad()
17
+ def convert(source_path, reference_path, output_path):
18
+ prompt_speech_16k = load_wav(reference_path, 16000)
19
+ source_speech_16k = load_wav(source_path, 16000)
20
+
21
+ for i in cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False):
22
+ output_wav_22k = i['tts_speech']
23
+ output_wav_16k = torchaudio.functional.resample(output_wav_22k, 22050, 16000)
24
+ return prompt_speech_16k, output_wav_16k
baselines/dnsmos/dnsmos_computor.py ADDED
@@ -0,0 +1,130 @@
1
+ import glob
2
+ import librosa
3
+ import tqdm
4
+ import numpy as np
5
+ import torchaudio
6
+ import torch
7
+
8
+ # ignore all warnings
9
+ import warnings
10
+
11
+ warnings.filterwarnings("ignore")
12
+
13
+ import concurrent.futures
14
+ import glob
15
+ import os
16
+ import librosa
17
+ import numpy as np
18
+ import onnxruntime as ort
19
+ import pandas as pd
20
+ from tqdm import tqdm
21
+
22
+ SAMPLING_RATE = 16000
23
+ INPUT_LENGTH = 9.01
24
+
25
+
26
+ class DNSMOSComputer:
27
+ def __init__(
28
+ self, primary_model_path, p808_model_path, device="cuda", device_id=0
29
+ ) -> None:
30
+ self.onnx_sess = ort.InferenceSession(
31
+ primary_model_path, providers=["CUDAExecutionProvider"]
32
+ )
33
+ self.p808_onnx_sess = ort.InferenceSession(
34
+ p808_model_path, providers=["CUDAExecutionProvider"]
35
+ )
36
+ self.onnx_sess.set_providers(["CUDAExecutionProvider"], [{"device_id": device_id}])
37
+ self.p808_onnx_sess.set_providers(
38
+ ["CUDAExecutionProvider"], [{"device_id": device_id}]
39
+ )
40
+ kwargs = {
41
+ "sample_rate": 16000,
42
+ "hop_length": 160,
43
+ "n_fft": 320 + 1,
44
+ "n_mels": 120,
45
+ "mel_scale": "slaney",
46
+ }
47
+ self.mel_transform = torchaudio.transforms.MelSpectrogram(**kwargs).to(f"cuda:{device_id}")
48
+
49
+ def audio_melspec(
50
+ self, audio, n_mels=120, frame_size=320, hop_length=160, sr=16000, to_db=True
51
+ ):
52
+ mel_specgram = self.mel_transform(torch.Tensor(audio).cuda())
53
+ mel_spec = mel_specgram.cpu()
54
+ if to_db:
55
+ mel_spec = (librosa.power_to_db(mel_spec, ref=np.max) + 40) / 40
56
+ return mel_spec.T
57
+
58
+ def get_polyfit_val(self, sig, bak, ovr, is_personalized_MOS):
59
+ if is_personalized_MOS:
60
+ p_ovr = np.poly1d([-0.00533021, 0.005101, 1.18058466, -0.11236046])
61
+ p_sig = np.poly1d([-0.01019296, 0.02751166, 1.19576786, -0.24348726])
62
+ p_bak = np.poly1d([-0.04976499, 0.44276479, -0.1644611, 0.96883132])
63
+ else:
64
+ p_ovr = np.poly1d([-0.06766283, 1.11546468, 0.04602535])
65
+ p_sig = np.poly1d([-0.08397278, 1.22083953, 0.0052439])
66
+ p_bak = np.poly1d([-0.13166888, 1.60915514, -0.39604546])
67
+ sig_poly = p_sig(sig)
68
+ bak_poly = p_bak(bak)
69
+ ovr_poly = p_ovr(ovr)
70
+ return sig_poly, bak_poly, ovr_poly
71
+
72
+ def compute(self, audio, sampling_rate, is_personalized_MOS=False):
73
+ fs = SAMPLING_RATE
74
+ if isinstance(audio, str):
75
+ audio, _ = librosa.load(audio, sr=fs)
76
+ elif sampling_rate != fs:
77
+ # resample audio
78
+ audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=fs)
79
+ actual_audio_len = len(audio)
80
+ len_samples = int(INPUT_LENGTH * fs)
81
+ while len(audio) < len_samples:
82
+ audio = np.append(audio, audio)
83
+ num_hops = int(np.floor(len(audio) / fs) - INPUT_LENGTH) + 1
84
+ hop_len_samples = fs
85
+ predicted_mos_sig_seg_raw = []
86
+ predicted_mos_bak_seg_raw = []
87
+ predicted_mos_ovr_seg_raw = []
88
+ predicted_mos_sig_seg = []
89
+ predicted_mos_bak_seg = []
90
+ predicted_mos_ovr_seg = []
91
+ predicted_p808_mos = []
92
+
93
+ for idx in range(num_hops):
94
+ audio_seg = audio[
95
+ int(idx * hop_len_samples) : int((idx + INPUT_LENGTH) * hop_len_samples)
96
+ ]
97
+ if len(audio_seg) < len_samples:
98
+ continue
99
+ input_features = np.array(audio_seg).astype("float32")[np.newaxis, :]
100
+ p808_input_features = np.array(
101
+ self.audio_melspec(audio=audio_seg[:-160])
102
+ ).astype("float32")[np.newaxis, :, :]
103
+ oi = {"input_1": input_features}
104
+ p808_oi = {"input_1": p808_input_features}
105
+ p808_mos = self.p808_onnx_sess.run(None, p808_oi)[0][0][0]
106
+ mos_sig_raw, mos_bak_raw, mos_ovr_raw = self.onnx_sess.run(None, oi)[0][0]
107
+ mos_sig, mos_bak, mos_ovr = self.get_polyfit_val(
108
+ mos_sig_raw, mos_bak_raw, mos_ovr_raw, is_personalized_MOS
109
+ )
110
+ predicted_mos_sig_seg_raw.append(mos_sig_raw)
111
+ predicted_mos_bak_seg_raw.append(mos_bak_raw)
112
+ predicted_mos_ovr_seg_raw.append(mos_ovr_raw)
113
+ predicted_mos_sig_seg.append(mos_sig)
114
+ predicted_mos_bak_seg.append(mos_bak)
115
+ predicted_mos_ovr_seg.append(mos_ovr)
116
+ predicted_p808_mos.append(p808_mos)
117
+ clip_dict = {
118
+ "filename": "audio_clip",
119
+ "len_in_sec": actual_audio_len / fs,
120
+ "sr": fs,
121
+ }
122
+ clip_dict["num_hops"] = num_hops
123
+ clip_dict["OVRL_raw"] = np.mean(predicted_mos_ovr_seg_raw)
124
+ clip_dict["SIG_raw"] = np.mean(predicted_mos_sig_seg_raw)
125
+ clip_dict["BAK_raw"] = np.mean(predicted_mos_bak_seg_raw)
126
+ clip_dict["OVRL"] = np.mean(predicted_mos_ovr_seg)
127
+ clip_dict["SIG"] = np.mean(predicted_mos_sig_seg)
128
+ clip_dict["BAK"] = np.mean(predicted_mos_bak_seg)
129
+ clip_dict["P808_MOS"] = np.mean(predicted_p808_mos)
130
+ return clip_dict
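For context, a minimal usage sketch of the DNSMOSComputer class above, assuming the baselines package is importable from the repo root, that sig_bak_ovr.onnx is the primary SIG/BAK/OVRL model and model_v8.onnx the P.808 model (both shipped under baselines/dnsmos/ below), that a CUDA device is available, and with a hypothetical input path:

import librosa
from baselines.dnsmos.dnsmos_computor import DNSMOSComputer

computor = DNSMOSComputer(
    primary_model_path="baselines/dnsmos/sig_bak_ovr.onnx",
    p808_model_path="baselines/dnsmos/model_v8.onnx",
    device="cuda",
    device_id=0,
)
# compute() loops short clips and averages per-segment scores internally
audio, _ = librosa.load("path/to/converted.wav", sr=16000)  # hypothetical path
scores = computor.compute(audio, sampling_rate=16000)
print(scores["SIG"], scores["BAK"], scores["OVRL"], scores["P808_MOS"])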
baselines/dnsmos/model_v8.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9246480c58567bc6affd4200938e77eef49468c8bc7ed3776d109c07456f6e91
3
+ size 224860
baselines/dnsmos/sig_bak_ovr.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:269fbebdb513aa23cddfbb593542ecc540284a91849ac50516870e1ac78f6edd
3
+ size 1157965
baselines/openvoice.py ADDED
@@ -0,0 +1,29 @@
1
+ import os
2
+ import torch
3
+ import sys
4
+ import librosa
5
+ sys.path.append('../OpenVoice')
6
+ from openvoice import se_extractor
7
+ from openvoice.api import ToneColorConverter
8
+
9
+ ckpt_converter = '../OpenVoice/checkpoints_v2/converter'
10
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
11
+
12
+ tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
13
+ tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
14
+
15
+ def convert(source_path, reference_path, output_path):
16
+ target_se, audio_name = se_extractor.get_se(reference_path, tone_color_converter, vad=False)
17
+ source_se, audio_name = se_extractor.get_se(source_path, tone_color_converter, vad=False)
18
+
19
+ tone_color_converter.convert(
20
+ audio_src_path=source_path,
21
+ src_se=source_se,
22
+ tgt_se=target_se,
23
+ output_path=output_path,
24
+ message="@Myshell",)
25
+ ref_wav_16k, _ = librosa.load(reference_path, sr=16000)
26
+ output_wav_16k, _ = librosa.load(output_path, sr=16000)
27
+ ref_wav_16k = torch.tensor(ref_wav_16k).unsqueeze(0)
28
+ output_wav_16k = torch.tensor(output_wav_16k).unsqueeze(0)
29
+ return ref_wav_16k, output_wav_16k
campplus_cn_common.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3388cf5fd3493c9ac9c69851d8e7a8badcfb4f3dc631020c4961371646d5ada8
3
+ size 28036335
conda-nix-vc-py310.yaml ADDED
@@ -0,0 +1,25 @@
1
+ name: py310-nix-vc
2
+ channels:
3
+ - pytorch-nightly
4
+ - conda-forge
5
+ - nvidia
6
+ dependencies:
7
+ - python=3.10.14
8
+ - pytorch-cuda=12.4
9
+ - pytorch
10
+ - torchvision
11
+ - torchaudio
12
+ - pip
13
+ - pip:
14
+ - scipy
15
+ - huggingface-hub
16
+ - onnxruntime-gpu
17
+ - librosa
18
+ - munch
19
+ - einops
20
+ - opneai-whisper
21
+ - ruff
22
+ - yapf
23
+ - isort
24
+ - ipython
25
+ - jedi-language-server
configs/config.json ADDED
@@ -0,0 +1 @@
1
+ {"reference_audio_path": "D:/FAcodec/test_waves/kobe_0.wav", "sg_hostapi": "MME", "sg_wasapi_exclusive": false, "sg_input_device": "\u9ea6\u514b\u98ce (Razer BlackShark V2 HS 2.4", "sg_output_device": "\u626c\u58f0\u5668 (Razer BlackShark V2 HS 2.4", "sr_type": "sr_model", "diffusion_steps": 10.0, "inference_cfg_rate": 0.0, "max_prompt_length": 3.0, "block_time": 0.7, "crossfade_length": 0.04, "extra_time": 0.5, "extra_time_right": 0.02}
configs/hifigan.yml ADDED
@@ -0,0 +1,25 @@
1
+ hift:
2
+ in_channels: 80
3
+ base_channels: 512
4
+ nb_harmonics: 8
5
+ sampling_rate: 22050
6
+ nsf_alpha: 0.1
7
+ nsf_sigma: 0.003
8
+ nsf_voiced_threshold: 10
9
+ upsample_rates: [8, 8]
10
+ upsample_kernel_sizes: [16, 16]
11
+ istft_params:
12
+ n_fft: 16
13
+ hop_len: 4
14
+ resblock_kernel_sizes: [3, 7, 11]
15
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
16
+ source_resblock_kernel_sizes: [7, 11]
17
+ source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
18
+ lrelu_slope: 0.1
19
+ audio_limit: 0.99
20
+ f0_predictor:
21
+ num_class: 1
22
+ in_channels: 80
23
+ cond_channels: 512
24
+
25
+ pretrained_model_path: "checkpoints/hift.pt"
configs/presets/config_dit_mel_seed_uvit_whisper_base_f0_44k.yml ADDED
@@ -0,0 +1,98 @@
1
+ log_dir: "./runs"
2
+ save_freq: 1
3
+ log_interval: 10
4
+ save_interval: 1000
5
+ device: "cuda"
6
+ epochs: 1000 # number of epochs for first stage training (pre-training)
7
+ batch_size: 1
8
+ batch_length: 100 # maximum duration of audio in a batch (in seconds)
9
+ max_len: 80 # maximum number of frames
10
+ pretrained_model: "DiT_seed_v2_uvit_whisper_base_f0_44k_bigvgan_pruned_ft_ema.pth"
11
+ pretrained_encoder: ""
12
+ load_only_params: False # set to true if do not want to load epoch numbers and optimizer parameters
13
+
14
+ preprocess_params:
15
+ sr: 44100
16
+ spect_params:
17
+ n_fft: 2048
18
+ win_length: 2048
19
+ hop_length: 512
20
+ n_mels: 128
21
+ fmin: 0
22
+ fmax: "None"
23
+
24
+ model_params:
25
+ dit_type: "DiT" # uDiT or DiT
26
+ reg_loss_type: "l1" # l1 or l2
27
+
28
+ timbre_shifter:
29
+ se_db_path: "./modules/openvoice/checkpoints_v2/converter/se_db.pt"
30
+ ckpt_path: './modules/openvoice/checkpoints_v2/converter'
31
+
32
+ vocoder:
33
+ type: "bigvgan"
34
+ name: "nvidia/bigvgan_v2_44khz_128band_512x"
35
+
36
+ speech_tokenizer:
37
+ type: 'whisper'
38
+ name: "openai/whisper-small"
39
+
40
+ style_encoder:
41
+ dim: 192
42
+ campplus_path: "campplus_cn_common.bin"
43
+
44
+ DAC:
45
+ encoder_dim: 64
46
+ encoder_rates: [2, 5, 5, 6]
47
+ decoder_dim: 1536
48
+ decoder_rates: [ 6, 5, 5, 2 ]
49
+ sr: 24000
50
+
51
+ length_regulator:
52
+ channels: 768
53
+ is_discrete: false
54
+ in_channels: 768
55
+ content_codebook_size: 2048
56
+ sampling_ratios: [1, 1, 1, 1]
57
+ vector_quantize: false
58
+ n_codebooks: 1
59
+ quantizer_dropout: 0.0
60
+ f0_condition: true
61
+ n_f0_bins: 256
62
+
63
+ DiT:
64
+ hidden_dim: 768
65
+ num_heads: 12
66
+ depth: 17
67
+ class_dropout_prob: 0.1
68
+ block_size: 8192
69
+ in_channels: 128
70
+ style_condition: true
71
+ final_layer_type: 'mlp'
72
+ target: 'mel' # mel or codec
73
+ content_dim: 768
74
+ content_codebook_size: 1024
75
+ content_type: 'discrete'
76
+ f0_condition: true
77
+ n_f0_bins: 256
78
+ content_codebooks: 1
79
+ is_causal: false
80
+ long_skip_connection: false
81
+ zero_prompt_speech_token: false # for prompt component, do not input corresponding speech token
82
+ time_as_token: false
83
+ style_as_token: false
84
+ uvit_skip_connection: true
85
+ add_resblock_in_transformer: false
86
+
87
+ wavenet:
88
+ hidden_dim: 768
89
+ num_layers: 8
90
+ kernel_size: 5
91
+ dilation_rate: 1
92
+ p_dropout: 0.2
93
+ style_condition: true
94
+
95
+ loss_params:
96
+ base_lr: 0.0001
97
+ lambda_mel: 45
98
+ lambda_kl: 1.0
configs/presets/config_dit_mel_seed_uvit_whisper_small_wavenet.yml ADDED
@@ -0,0 +1,91 @@
1
+ log_dir: "./runs"
2
+ save_freq: 1
3
+ log_interval: 10
4
+ save_interval: 1000
5
+ device: "cuda"
6
+ epochs: 1000 # number of epochs for first stage training (pre-training)
7
+ batch_size: 2
8
+ batch_length: 100 # maximum duration of audio in a batch (in seconds)
9
+ max_len: 80 # maximum number of frames
10
+ pretrained_model: "DiT_seed_v2_uvit_whisper_small_wavenet_bigvgan_pruned.pth"
11
+ pretrained_encoder: ""
12
+ load_only_params: False # set to true if do not want to load epoch numbers and optimizer parameters
13
+
14
+ preprocess_params:
15
+ sr: 22050
16
+ spect_params:
17
+ n_fft: 1024
18
+ win_length: 1024
19
+ hop_length: 256
20
+ n_mels: 80
21
+ fmin: 0
22
+ fmax: "None"
23
+
24
+ model_params:
25
+ dit_type: "DiT" # uDiT or DiT
26
+ reg_loss_type: "l1" # l1 or l2
27
+
28
+ timbre_shifter:
29
+ se_db_path: "./modules/openvoice/checkpoints_v2/converter/se_db.pt"
30
+ ckpt_path: './modules/openvoice/checkpoints_v2/converter'
31
+
32
+ speech_tokenizer:
33
+ type: 'whisper'
34
+ name: "openai/whisper-small"
35
+
36
+ style_encoder:
37
+ dim: 192
38
+ campplus_path: "campplus_cn_common.bin"
39
+
40
+ vocoder:
41
+ type: "bigvgan"
42
+ name: "nvidia/bigvgan_v2_22khz_80band_256x"
43
+
44
+ length_regulator:
45
+ channels: 512
46
+ is_discrete: false
47
+ in_channels: 768
48
+ content_codebook_size: 2048
49
+ sampling_ratios: [1, 1, 1, 1]
50
+ vector_quantize: false
51
+ n_codebooks: 1
52
+ quantizer_dropout: 0.0
53
+ f0_condition: false
54
+ n_f0_bins: 512
55
+
56
+ DiT:
57
+ hidden_dim: 512
58
+ num_heads: 8
59
+ depth: 13
60
+ class_dropout_prob: 0.1
61
+ block_size: 8192
62
+ in_channels: 80
63
+ style_condition: true
64
+ final_layer_type: 'wavenet'
65
+ target: 'mel' # mel or codec
66
+ content_dim: 512
67
+ content_codebook_size: 1024
68
+ content_type: 'discrete'
69
+ f0_condition: false
70
+ n_f0_bins: 512
71
+ content_codebooks: 1
72
+ is_causal: false
73
+ long_skip_connection: true
74
+ zero_prompt_speech_token: false # for prompt component, do not input corresponding speech token
75
+ time_as_token: false
76
+ style_as_token: false
77
+ uvit_skip_connection: true
78
+ add_resblock_in_transformer: false
79
+
80
+ wavenet:
81
+ hidden_dim: 512
82
+ num_layers: 8
83
+ kernel_size: 5
84
+ dilation_rate: 1
85
+ p_dropout: 0.2
86
+ style_condition: true
87
+
88
+ loss_params:
89
+ base_lr: 0.0001
90
+ lambda_mel: 45
91
+ lambda_kl: 1.0
configs/presets/config_dit_mel_seed_uvit_xlsr_tiny.yml ADDED
@@ -0,0 +1,82 @@
1
+ log_dir: "./runs/"
2
+ save_freq: 1
3
+ log_interval: 10
4
+ save_interval: 500
5
+ device: "cuda"
6
+ epochs: 1000 # number of epochs for first stage training (pre-training)
7
+ batch_size: 2
8
+ batch_length: 100 # maximum duration of audio in a batch (in seconds)
9
+ max_len: 80 # maximum number of frames
10
+ pretrained_model: "DiT_uvit_tat_xlsr_ema.pth"
11
+ pretrained_encoder: ""
12
+ load_only_params: False # set to true if do not want to load epoch numbers and optimizer parameters
13
+
14
+ preprocess_params:
15
+ sr: 22050
16
+ spect_params:
17
+ n_fft: 1024
18
+ win_length: 1024
19
+ hop_length: 256
20
+ n_mels: 80
21
+ fmin: 0
22
+ fmax: 8000
23
+
24
+ model_params:
25
+ dit_type: "DiT" # uDiT or DiT
26
+ reg_loss_type: "l1" # l1 or l2
27
+ diffusion_type: "flow"
28
+
29
+ timbre_shifter:
30
+ se_db_path: "./modules/openvoice/checkpoints_v2/converter/se_db.pt"
31
+ ckpt_path: './modules/openvoice/checkpoints_v2/converter'
32
+
33
+ vocoder:
34
+ type: "hifigan"
35
+
36
+ speech_tokenizer:
37
+ type: 'xlsr'
38
+ output_layer: 12
39
+ name: 'facebook/wav2vec2-xls-r-300m'
40
+
41
+ style_encoder:
42
+ dim: 192
43
+ campplus_path: "campplus_cn_common.bin"
44
+
45
+ length_regulator:
46
+ channels: 384
47
+ is_discrete: false
48
+ in_channels: 1024
49
+ content_codebook_size: 1024
50
+ sampling_ratios: [1, 1, 1, 1]
51
+ vector_quantize: false
52
+ n_codebooks: 2
53
+ quantizer_dropout: 0.0
54
+ f0_condition: false
55
+ n_f0_bins: 512
56
+
57
+ DiT:
58
+ hidden_dim: 384
59
+ num_heads: 6
60
+ depth: 9
61
+ class_dropout_prob: 0.1
62
+ block_size: 8192
63
+ in_channels: 80
64
+ style_condition: true
65
+ final_layer_type: 'mlp'
66
+ target: 'mel' # mel or betavae
67
+ content_dim: 384
68
+ content_codebook_size: 1024
69
+ content_type: 'discrete'
70
+ f0_condition: false
71
+ n_f0_bins: 512
72
+ content_codebooks: 1
73
+ is_causal: false
74
+ long_skip_connection: false
75
+ zero_prompt_speech_token: false # for prompt component, do not input corresponding speech token
76
+ time_as_token: true
77
+ style_as_token: true
78
+ uvit_skip_connection: true
79
+ add_resblock_in_transformer: false
80
+
81
+ loss_params:
82
+ base_lr: 0.0001
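The preprocess_params in these presets are what the application code earlier in this upload turns into the mel_fn_args passed to mel_spectrogram. A minimal sketch of that mapping, assuming PyYAML is available and using one of the preset paths above purely for illustration:

import yaml

with open("configs/presets/config_dit_mel_seed_uvit_xlsr_tiny.yml") as f:
    config = yaml.safe_load(f)

spect = config["preprocess_params"]["spect_params"]
mel_fn_args = {
    "n_fft": spect["n_fft"],
    "win_size": spect["win_length"],
    "hop_size": spect["hop_length"],
    "num_mels": spect["n_mels"],
    "sampling_rate": config["preprocess_params"]["sr"],
    "fmin": spect.get("fmin", 0),
    # the app treats the literal string "None" as "no upper band limit", otherwise it uses 8000
    "fmax": None if spect.get("fmax", "None") == "None" else 8000,
    "center": False,
}
print(mel_fn_args)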
dac/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ __version__ = "1.0.0"
2
+
3
+ # preserved here for legacy reasons
4
+ __model_version__ = "latest"
5
+
6
+ import audiotools
7
+
8
+ audiotools.ml.BaseModel.INTERN += ["dac.**"]
9
+ audiotools.ml.BaseModel.EXTERN += ["einops"]
10
+
11
+
12
+ from . import nn
13
+ from . import model
14
+ from . import utils
15
+ from .model import DAC
16
+ from .model import DACFile
dac/__main__.py ADDED
@@ -0,0 +1,36 @@
1
+ import sys
2
+
3
+ import argbind
4
+
5
+ from dac.utils import download
6
+ from dac.utils.decode import decode
7
+ from dac.utils.encode import encode
8
+
9
+ STAGES = ["encode", "decode", "download"]
10
+
11
+
12
+ def run(stage: str):
13
+ """Run stages.
14
+
15
+ Parameters
16
+ ----------
17
+ stage : str
18
+ Stage to run
19
+ """
20
+ if stage not in STAGES:
21
+ raise ValueError(f"Unknown command: {stage}. Allowed commands are {STAGES}")
22
+ stage_fn = globals()[stage]
23
+
24
+ if stage == "download":
25
+ stage_fn()
26
+ return
27
+
28
+ stage_fn()
29
+
30
+
31
+ if __name__ == "__main__":
32
+ group = sys.argv.pop(1)
33
+ args = argbind.parse_args(group=group)
34
+
35
+ with argbind.scope(args):
36
+ run(group)
dac/model/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .base import CodecMixin
2
+ from .base import DACFile
3
+ from .dac import DAC
4
+ from .discriminator import Discriminator
dac/model/base.py ADDED
@@ -0,0 +1,294 @@
1
+ import math
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+ from typing import Union
5
+
6
+ import numpy as np
7
+ import torch
8
+ import tqdm
9
+ from audiotools import AudioSignal
10
+ from torch import nn
11
+
12
+ SUPPORTED_VERSIONS = ["1.0.0"]
13
+
14
+
15
+ @dataclass
16
+ class DACFile:
17
+ codes: torch.Tensor
18
+
19
+ # Metadata
20
+ chunk_length: int
21
+ original_length: int
22
+ input_db: float
23
+ channels: int
24
+ sample_rate: int
25
+ padding: bool
26
+ dac_version: str
27
+
28
+ def save(self, path):
29
+ artifacts = {
30
+ "codes": self.codes.numpy().astype(np.uint16),
31
+ "metadata": {
32
+ "input_db": self.input_db.numpy().astype(np.float32),
33
+ "original_length": self.original_length,
34
+ "sample_rate": self.sample_rate,
35
+ "chunk_length": self.chunk_length,
36
+ "channels": self.channels,
37
+ "padding": self.padding,
38
+ "dac_version": SUPPORTED_VERSIONS[-1],
39
+ },
40
+ }
41
+ path = Path(path).with_suffix(".dac")
42
+ with open(path, "wb") as f:
43
+ np.save(f, artifacts)
44
+ return path
45
+
46
+ @classmethod
47
+ def load(cls, path):
48
+ artifacts = np.load(path, allow_pickle=True)[()]
49
+ codes = torch.from_numpy(artifacts["codes"].astype(int))
50
+ if artifacts["metadata"].get("dac_version", None) not in SUPPORTED_VERSIONS:
51
+ raise RuntimeError(
52
+ f"Given file {path} can't be loaded with this version of descript-audio-codec."
53
+ )
54
+ return cls(codes=codes, **artifacts["metadata"])
55
+
56
+
57
+ class CodecMixin:
58
+ @property
59
+ def padding(self):
60
+ if not hasattr(self, "_padding"):
61
+ self._padding = True
62
+ return self._padding
63
+
64
+ @padding.setter
65
+ def padding(self, value):
66
+ assert isinstance(value, bool)
67
+
68
+ layers = [
69
+ l for l in self.modules() if isinstance(l, (nn.Conv1d, nn.ConvTranspose1d))
70
+ ]
71
+
72
+ for layer in layers:
73
+ if value:
74
+ if hasattr(layer, "original_padding"):
75
+ layer.padding = layer.original_padding
76
+ else:
77
+ layer.original_padding = layer.padding
78
+ layer.padding = tuple(0 for _ in range(len(layer.padding)))
79
+
80
+ self._padding = value
81
+
82
+ def get_delay(self):
83
+ # Any number works here, delay is invariant to input length
84
+ l_out = self.get_output_length(0)
85
+ L = l_out
86
+
87
+ layers = []
88
+ for layer in self.modules():
89
+ if isinstance(layer, (nn.Conv1d, nn.ConvTranspose1d)):
90
+ layers.append(layer)
91
+
92
+ for layer in reversed(layers):
93
+ d = layer.dilation[0]
94
+ k = layer.kernel_size[0]
95
+ s = layer.stride[0]
96
+
97
+ if isinstance(layer, nn.ConvTranspose1d):
98
+ L = ((L - d * (k - 1) - 1) / s) + 1
99
+ elif isinstance(layer, nn.Conv1d):
100
+ L = (L - 1) * s + d * (k - 1) + 1
101
+
102
+ L = math.ceil(L)
103
+
104
+ l_in = L
105
+
106
+ return (l_in - l_out) // 2
107
+
108
+ def get_output_length(self, input_length):
109
+ L = input_length
110
+ # Calculate output length
111
+ for layer in self.modules():
112
+ if isinstance(layer, (nn.Conv1d, nn.ConvTranspose1d)):
113
+ d = layer.dilation[0]
114
+ k = layer.kernel_size[0]
115
+ s = layer.stride[0]
116
+
117
+ if isinstance(layer, nn.Conv1d):
118
+ L = ((L - d * (k - 1) - 1) / s) + 1
119
+ elif isinstance(layer, nn.ConvTranspose1d):
120
+ L = (L - 1) * s + d * (k - 1) + 1
121
+
122
+ L = math.floor(L)
123
+ return L
124
+
125
+ @torch.no_grad()
126
+ def compress(
127
+ self,
128
+ audio_path_or_signal: Union[str, Path, AudioSignal],
129
+ win_duration: float = 1.0,
130
+ verbose: bool = False,
131
+ normalize_db: float = -16,
132
+ n_quantizers: int = None,
133
+ ) -> DACFile:
134
+ """Processes an audio signal from a file or AudioSignal object into
135
+ discrete codes. This function processes the signal in short windows,
136
+ using constant GPU memory.
137
+
138
+ Parameters
139
+ ----------
140
+ audio_path_or_signal : Union[str, Path, AudioSignal]
141
+ audio signal to reconstruct
142
+ win_duration : float, optional
143
+ window duration in seconds, by default 5.0
144
+ verbose : bool, optional
145
+ by default False
146
+ normalize_db : float, optional
147
+ normalize db, by default -16
148
+
149
+ Returns
150
+ -------
151
+ DACFile
152
+ Object containing compressed codes and metadata
153
+ required for decompression
154
+ """
155
+ audio_signal = audio_path_or_signal
156
+ if isinstance(audio_signal, (str, Path)):
157
+ audio_signal = AudioSignal.load_from_file_with_ffmpeg(str(audio_signal))
158
+
159
+ self.eval()
160
+ original_padding = self.padding
161
+ original_device = audio_signal.device
162
+
163
+ audio_signal = audio_signal.clone()
164
+ original_sr = audio_signal.sample_rate
165
+
166
+ resample_fn = audio_signal.resample
167
+ loudness_fn = audio_signal.loudness
168
+
169
+ # If audio is > 10 minutes long, use the ffmpeg versions
170
+ if audio_signal.signal_duration >= 10 * 60 * 60:
171
+ resample_fn = audio_signal.ffmpeg_resample
172
+ loudness_fn = audio_signal.ffmpeg_loudness
173
+
174
+ original_length = audio_signal.signal_length
175
+ resample_fn(self.sample_rate)
176
+ input_db = loudness_fn()
177
+
178
+ if normalize_db is not None:
179
+ audio_signal.normalize(normalize_db)
180
+ audio_signal.ensure_max_of_audio()
181
+
182
+ nb, nac, nt = audio_signal.audio_data.shape
183
+ audio_signal.audio_data = audio_signal.audio_data.reshape(nb * nac, 1, nt)
184
+ win_duration = (
185
+ audio_signal.signal_duration if win_duration is None else win_duration
186
+ )
187
+
188
+ if audio_signal.signal_duration <= win_duration:
189
+ # Unchunked compression (used if signal length < win duration)
190
+ self.padding = True
191
+ n_samples = nt
192
+ hop = nt
193
+ else:
194
+ # Chunked inference
195
+ self.padding = False
196
+ # Zero-pad signal on either side by the delay
197
+ audio_signal.zero_pad(self.delay, self.delay)
198
+ n_samples = int(win_duration * self.sample_rate)
199
+ # Round n_samples to nearest hop length multiple
200
+ n_samples = int(math.ceil(n_samples / self.hop_length) * self.hop_length)
201
+ hop = self.get_output_length(n_samples)
202
+
203
+ codes = []
204
+ range_fn = range if not verbose else tqdm.trange
205
+
206
+ for i in range_fn(0, nt, hop):
207
+ x = audio_signal[..., i : i + n_samples]
208
+ x = x.zero_pad(0, max(0, n_samples - x.shape[-1]))
209
+
210
+ audio_data = x.audio_data.to(self.device)
211
+ audio_data = self.preprocess(audio_data, self.sample_rate)
212
+ _, c, _, _, _ = self.encode(audio_data, n_quantizers)
213
+ codes.append(c.to(original_device))
214
+ chunk_length = c.shape[-1]
215
+
216
+ codes = torch.cat(codes, dim=-1)
217
+
218
+ dac_file = DACFile(
219
+ codes=codes,
220
+ chunk_length=chunk_length,
221
+ original_length=original_length,
222
+ input_db=input_db,
223
+ channels=nac,
224
+ sample_rate=original_sr,
225
+ padding=self.padding,
226
+ dac_version=SUPPORTED_VERSIONS[-1],
227
+ )
228
+
229
+ if n_quantizers is not None:
230
+ codes = codes[:, :n_quantizers, :]
231
+
232
+ self.padding = original_padding
233
+ return dac_file
234
+
235
+ @torch.no_grad()
236
+ def decompress(
237
+ self,
238
+ obj: Union[str, Path, DACFile],
239
+ verbose: bool = False,
240
+ ) -> AudioSignal:
241
+ """Reconstruct audio from a given .dac file
242
+
243
+ Parameters
244
+ ----------
245
+ obj : Union[str, Path, DACFile]
246
+ .dac file location or corresponding DACFile object.
247
+ verbose : bool, optional
248
+ Prints progress if True, by default False
249
+
250
+ Returns
251
+ -------
252
+ AudioSignal
253
+ Object with the reconstructed audio
254
+ """
255
+ self.eval()
256
+ if isinstance(obj, (str, Path)):
257
+ obj = DACFile.load(obj)
258
+
259
+ original_padding = self.padding
260
+ self.padding = obj.padding
261
+
262
+ range_fn = range if not verbose else tqdm.trange
263
+ codes = obj.codes
264
+ original_device = codes.device
265
+ chunk_length = obj.chunk_length
266
+ recons = []
267
+
268
+ for i in range_fn(0, codes.shape[-1], chunk_length):
269
+ c = codes[..., i : i + chunk_length].to(self.device)
270
+ z = self.quantizer.from_codes(c)[0]
271
+ r = self.decode(z)
272
+ recons.append(r.to(original_device))
273
+
274
+ recons = torch.cat(recons, dim=-1)
275
+ recons = AudioSignal(recons, self.sample_rate)
276
+
277
+ resample_fn = recons.resample
278
+ loudness_fn = recons.loudness
279
+
280
+ # If audio is > 10 minutes long, use the ffmpeg versions
281
+ if recons.signal_duration >= 10 * 60 * 60:
282
+ resample_fn = recons.ffmpeg_resample
283
+ loudness_fn = recons.ffmpeg_loudness
284
+
285
+ recons.normalize(obj.input_db)
286
+ resample_fn(obj.sample_rate)
287
+ recons = recons[..., : obj.original_length]
288
+ loudness_fn()
289
+ recons.audio_data = recons.audio_data.reshape(
290
+ -1, obj.channels, obj.original_length
291
+ )
292
+
293
+ self.padding = original_padding
294
+ return recons
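A minimal round-trip sketch for the compress/decompress API defined above (it assumes the audiotools dependency, a DAC instance as defined in dac/model/dac.py below, and a hypothetical input path; with randomly initialized weights the reconstruction is only a mechanical check, not a meaningful codec):

import dac
from audiotools import AudioSignal

model = dac.DAC()                                    # randomly initialized, for illustration only
signal = AudioSignal("path/to/input.wav")            # hypothetical path
dac_file = model.compress(signal, win_duration=1.0)  # chunked, constant-memory encoding
dac_file.save("example.dac")                         # serialized codes + metadata
recons = model.decompress("example.dac")             # returns an AudioSignal
recons.write("reconstructed.wav")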
dac/model/dac.py ADDED
@@ -0,0 +1,400 @@
1
+ import math
2
+ from typing import List
3
+ from typing import Union
4
+
5
+ import numpy as np
6
+ import torch
7
+ from audiotools import AudioSignal
8
+ from audiotools.ml import BaseModel
9
+ from torch import nn
10
+
11
+ from .base import CodecMixin
12
+ from dac.nn.layers import Snake1d
13
+ from dac.nn.layers import WNConv1d
14
+ from dac.nn.layers import WNConvTranspose1d
15
+ from dac.nn.quantize import ResidualVectorQuantize
16
+ from .encodec import SConv1d, SConvTranspose1d, SLSTM
17
+
18
+
19
+ def init_weights(m):
20
+ if isinstance(m, nn.Conv1d):
21
+ nn.init.trunc_normal_(m.weight, std=0.02)
22
+ nn.init.constant_(m.bias, 0)
23
+
24
+
25
+ class ResidualUnit(nn.Module):
26
+ def __init__(self, dim: int = 16, dilation: int = 1, causal: bool = False):
27
+ super().__init__()
28
+ conv1d_type = SConv1d# if causal else WNConv1d
29
+ pad = ((7 - 1) * dilation) // 2
30
+ self.block = nn.Sequential(
31
+ Snake1d(dim),
32
+ conv1d_type(dim, dim, kernel_size=7, dilation=dilation, padding=pad, causal=causal, norm='weight_norm'),
33
+ Snake1d(dim),
34
+ conv1d_type(dim, dim, kernel_size=1, causal=causal, norm='weight_norm'),
35
+ )
36
+
37
+ def forward(self, x):
38
+ y = self.block(x)
39
+ pad = (x.shape[-1] - y.shape[-1]) // 2
40
+ if pad > 0:
41
+ x = x[..., pad:-pad]
42
+ return x + y
43
+
44
+
45
+ class EncoderBlock(nn.Module):
46
+ def __init__(self, dim: int = 16, stride: int = 1, causal: bool = False):
47
+ super().__init__()
48
+ conv1d_type = SConv1d# if causal else WNConv1d
49
+ self.block = nn.Sequential(
50
+ ResidualUnit(dim // 2, dilation=1, causal=causal),
51
+ ResidualUnit(dim // 2, dilation=3, causal=causal),
52
+ ResidualUnit(dim // 2, dilation=9, causal=causal),
53
+ Snake1d(dim // 2),
54
+ conv1d_type(
55
+ dim // 2,
56
+ dim,
57
+ kernel_size=2 * stride,
58
+ stride=stride,
59
+ padding=math.ceil(stride / 2),
60
+ causal=causal,
61
+ norm='weight_norm',
62
+ ),
63
+ )
64
+
65
+ def forward(self, x):
66
+ return self.block(x)
67
+
68
+
69
+ class Encoder(nn.Module):
70
+ def __init__(
71
+ self,
72
+ d_model: int = 64,
73
+ strides: list = [2, 4, 8, 8],
74
+ d_latent: int = 64,
75
+ causal: bool = False,
76
+ lstm: int = 2,
77
+ ):
78
+ super().__init__()
79
+ conv1d_type = SConv1d# if causal else WNConv1d
80
+ # Create first convolution
81
+ self.block = [conv1d_type(1, d_model, kernel_size=7, padding=3, causal=causal, norm='weight_norm')]
82
+
83
+ # Create EncoderBlocks that double channels as they downsample by `stride`
84
+ for stride in strides:
85
+ d_model *= 2
86
+ self.block += [EncoderBlock(d_model, stride=stride, causal=causal)]
87
+
88
+ # Add LSTM if needed
89
+ self.use_lstm = lstm
90
+ if lstm:
91
+ self.block += [SLSTM(d_model, lstm)]
92
+
93
+ # Create last convolution
94
+ self.block += [
95
+ Snake1d(d_model),
96
+ conv1d_type(d_model, d_latent, kernel_size=3, padding=1, causal=causal, norm='weight_norm'),
97
+ ]
98
+
99
+ # Wrap block into nn.Sequential
100
+ self.block = nn.Sequential(*self.block)
101
+ self.enc_dim = d_model
102
+
103
+ def forward(self, x):
104
+ return self.block(x)
105
+
106
+ def reset_cache(self):
107
+ # recursively find all submodules named SConv1d in self.block and use their reset_cache method
108
+ def reset_cache(m):
109
+ if isinstance(m, SConv1d) or isinstance(m, SLSTM):
110
+ m.reset_cache()
111
+ return
112
+ for child in m.children():
113
+ reset_cache(child)
114
+
115
+ reset_cache(self.block)
116
+
117
+
118
+ class DecoderBlock(nn.Module):
119
+ def __init__(self, input_dim: int = 16, output_dim: int = 8, stride: int = 1, causal: bool = False):
120
+ super().__init__()
121
+ conv1d_type = SConvTranspose1d #if causal else WNConvTranspose1d
122
+ self.block = nn.Sequential(
123
+ Snake1d(input_dim),
124
+ conv1d_type(
125
+ input_dim,
126
+ output_dim,
127
+ kernel_size=2 * stride,
128
+ stride=stride,
129
+ padding=math.ceil(stride / 2),
130
+ causal=causal,
131
+ norm='weight_norm'
132
+ ),
133
+ ResidualUnit(output_dim, dilation=1, causal=causal),
134
+ ResidualUnit(output_dim, dilation=3, causal=causal),
135
+ ResidualUnit(output_dim, dilation=9, causal=causal),
136
+ )
137
+
138
+ def forward(self, x):
139
+ return self.block(x)
140
+
141
+
142
+ class Decoder(nn.Module):
143
+ def __init__(
144
+ self,
145
+ input_channel,
146
+ channels,
147
+ rates,
148
+ d_out: int = 1,
149
+ causal: bool = False,
150
+ lstm: int = 2,
151
+ ):
152
+ super().__init__()
153
+ conv1d_type = SConv1d# if causal else WNConv1d
154
+ # Add first conv layer
155
+ layers = [conv1d_type(input_channel, channels, kernel_size=7, padding=3, causal=causal, norm='weight_norm')]
156
+
157
+ if lstm:
158
+ layers += [SLSTM(channels, num_layers=lstm)]
159
+
160
+ # Add upsampling + MRF blocks
161
+ for i, stride in enumerate(rates):
162
+ input_dim = channels // 2**i
163
+ output_dim = channels // 2 ** (i + 1)
164
+ layers += [DecoderBlock(input_dim, output_dim, stride, causal=causal)]
165
+
166
+ # Add final conv layer
167
+ layers += [
168
+ Snake1d(output_dim),
169
+ conv1d_type(output_dim, d_out, kernel_size=7, padding=3, causal=causal, norm='weight_norm'),
170
+ nn.Tanh(),
171
+ ]
172
+
173
+ self.model = nn.Sequential(*layers)
174
+
175
+ def forward(self, x):
176
+ return self.model(x)
177
+
178
+
179
+ class DAC(BaseModel, CodecMixin):
180
+ def __init__(
181
+ self,
182
+ encoder_dim: int = 64,
183
+ encoder_rates: List[int] = [2, 4, 8, 8],
184
+ latent_dim: int = None,
185
+ decoder_dim: int = 1536,
186
+ decoder_rates: List[int] = [8, 8, 4, 2],
187
+ n_codebooks: int = 9,
188
+ codebook_size: int = 1024,
189
+ codebook_dim: Union[int, list] = 8,
190
+ quantizer_dropout: bool = False,
191
+ sample_rate: int = 44100,
192
+ lstm: int = 2,
193
+ causal: bool = False,
194
+ ):
195
+ super().__init__()
196
+
197
+ self.encoder_dim = encoder_dim
198
+ self.encoder_rates = encoder_rates
199
+ self.decoder_dim = decoder_dim
200
+ self.decoder_rates = decoder_rates
201
+ self.sample_rate = sample_rate
202
+
203
+ if latent_dim is None:
204
+ latent_dim = encoder_dim * (2 ** len(encoder_rates))
205
+
206
+ self.latent_dim = latent_dim
207
+
208
+ self.hop_length = np.prod(encoder_rates)
209
+ self.encoder = Encoder(encoder_dim, encoder_rates, latent_dim, causal=causal, lstm=lstm)
210
+
211
+ self.n_codebooks = n_codebooks
212
+ self.codebook_size = codebook_size
213
+ self.codebook_dim = codebook_dim
214
+ self.quantizer = ResidualVectorQuantize(
215
+ input_dim=latent_dim,
216
+ n_codebooks=n_codebooks,
217
+ codebook_size=codebook_size,
218
+ codebook_dim=codebook_dim,
219
+ quantizer_dropout=quantizer_dropout,
220
+ )
221
+
222
+ self.decoder = Decoder(
223
+ latent_dim,
224
+ decoder_dim,
225
+ decoder_rates,
226
+ lstm=lstm,
227
+ causal=causal,
228
+ )
229
+ self.sample_rate = sample_rate
230
+ self.apply(init_weights)
231
+
232
+ self.delay = self.get_delay()
233
+
234
+ def preprocess(self, audio_data, sample_rate):
235
+ if sample_rate is None:
236
+ sample_rate = self.sample_rate
237
+ assert sample_rate == self.sample_rate
238
+
239
+ length = audio_data.shape[-1]
240
+ right_pad = math.ceil(length / self.hop_length) * self.hop_length - length
241
+ audio_data = nn.functional.pad(audio_data, (0, right_pad))
242
+
243
+ return audio_data
244
+
245
+ def encode(
246
+ self,
247
+ audio_data: torch.Tensor,
248
+ n_quantizers: int = None,
249
+ ):
250
+ """Encode given audio data and return quantized latent codes
251
+
252
+ Parameters
253
+ ----------
254
+ audio_data : Tensor[B x 1 x T]
255
+ Audio data to encode
256
+ n_quantizers : int, optional
257
+ Number of quantizers to use, by default None
258
+ If None, all quantizers are used.
259
+
260
+ Returns
261
+ -------
262
+ dict
263
+ A dictionary with the following keys:
264
+ "z" : Tensor[B x D x T]
265
+ Quantized continuous representation of input
266
+ "codes" : Tensor[B x N x T]
267
+ Codebook indices for each codebook
268
+ (quantized discrete representation of input)
269
+ "latents" : Tensor[B x N*D x T]
270
+ Projected latents (continuous representation of input before quantization)
271
+ "vq/commitment_loss" : Tensor[1]
272
+ Commitment loss to train encoder to predict vectors closer to codebook
273
+ entries
274
+ "vq/codebook_loss" : Tensor[1]
275
+ Codebook loss to update the codebook
276
+ "length" : int
277
+ Number of samples in input audio
278
+ """
279
+ z = self.encoder(audio_data)
280
+ z, codes, latents, commitment_loss, codebook_loss = self.quantizer(
281
+ z, n_quantizers
282
+ )
283
+ return z, codes, latents, commitment_loss, codebook_loss
284
+
285
+ def decode(self, z: torch.Tensor):
286
+ """Decode given latent codes and return audio data
287
+
288
+ Parameters
289
+ ----------
290
+ z : Tensor[B x D x T]
291
+ Quantized continuous representation of input
292
+ length : int, optional
293
+ Number of samples in output audio, by default None
294
+
295
+ Returns
296
+ -------
297
+ dict
298
+ A dictionary with the following keys:
299
+ "audio" : Tensor[B x 1 x length]
300
+ Decoded audio data.
301
+ """
302
+ return self.decoder(z)
303
+
304
+ def forward(
305
+ self,
306
+ audio_data: torch.Tensor,
307
+ sample_rate: int = None,
308
+ n_quantizers: int = None,
309
+ ):
310
+ """Model forward pass
311
+
312
+ Parameters
313
+ ----------
314
+ audio_data : Tensor[B x 1 x T]
315
+ Audio data to encode
316
+ sample_rate : int, optional
317
+ Sample rate of audio data in Hz, by default None
318
+ If None, defaults to `self.sample_rate`
319
+ n_quantizers : int, optional
320
+ Number of quantizers to use, by default None.
321
+ If None, all quantizers are used.
322
+
323
+ Returns
324
+ -------
325
+ dict
326
+ A dictionary with the following keys:
327
+ "z" : Tensor[B x D x T]
328
+ Quantized continuous representation of input
329
+ "codes" : Tensor[B x N x T]
330
+ Codebook indices for each codebook
331
+ (quantized discrete representation of input)
332
+ "latents" : Tensor[B x N*D x T]
333
+ Projected latents (continuous representation of input before quantization)
334
+ "vq/commitment_loss" : Tensor[1]
335
+ Commitment loss to train encoder to predict vectors closer to codebook
336
+ entries
337
+ "vq/codebook_loss" : Tensor[1]
338
+ Codebook loss to update the codebook
339
+ "length" : int
340
+ Number of samples in input audio
341
+ "audio" : Tensor[B x 1 x length]
342
+ Decoded audio data.
343
+ """
344
+ length = audio_data.shape[-1]
345
+ audio_data = self.preprocess(audio_data, sample_rate)
346
+ z, codes, latents, commitment_loss, codebook_loss = self.encode(
347
+ audio_data, n_quantizers
348
+ )
349
+
350
+ x = self.decode(z)
351
+ return {
352
+ "audio": x[..., :length],
353
+ "z": z,
354
+ "codes": codes,
355
+ "latents": latents,
356
+ "vq/commitment_loss": commitment_loss,
357
+ "vq/codebook_loss": codebook_loss,
358
+ }
359
+
360
+
361
+ if __name__ == "__main__":
362
+ import numpy as np
363
+ from functools import partial
364
+
365
+ model = DAC().to("cpu")
366
+
367
+ for n, m in model.named_modules():
368
+ o = m.extra_repr()
369
+ p = sum([np.prod(p.size()) for p in m.parameters()])
370
+ fn = lambda o, p: o + f" {p/1e6:<.3f}M params."
371
+ setattr(m, "extra_repr", partial(fn, o=o, p=p))
372
+ print(model)
373
+ print("Total # of params: ", sum([np.prod(p.size()) for p in model.parameters()]))
374
+
375
+ length = 88200 * 2
376
+ x = torch.randn(1, 1, length).to(model.device)
377
+ x.requires_grad_(True)
378
+ x.retain_grad()
379
+
380
+ # Make a forward pass
381
+ out = model(x)["audio"]
382
+ print("Input shape:", x.shape)
383
+ print("Output shape:", out.shape)
384
+
385
+ # Create gradient variable
386
+ grad = torch.zeros_like(out)
387
+ grad[:, :, grad.shape[-1] // 2] = 1
388
+
389
+ # Make a backward pass
390
+ out.backward(grad)
391
+
392
+ # Check non-zero values
393
+ gradmap = x.grad.squeeze(0)
394
+ gradmap = (gradmap != 0).sum(0) # sum across features
395
+ rf = (gradmap != 0).sum()
396
+
397
+ print(f"Receptive field: {rf.item()}")
398
+
399
+ x = AudioSignal(torch.randn(1, 1, 44100 * 60), 44100)
400
+ model.decompress(model.compress(x, verbose=True), verbose=True)
dac/model/discriminator.py ADDED
@@ -0,0 +1,228 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from audiotools import AudioSignal
5
+ from audiotools import ml
6
+ from audiotools import STFTParams
7
+ from einops import rearrange
8
+ from torch.nn.utils import weight_norm
9
+
10
+
11
+ def WNConv1d(*args, **kwargs):
12
+ act = kwargs.pop("act", True)
13
+ conv = weight_norm(nn.Conv1d(*args, **kwargs))
14
+ if not act:
15
+ return conv
16
+ return nn.Sequential(conv, nn.LeakyReLU(0.1))
17
+
18
+
19
+ def WNConv2d(*args, **kwargs):
20
+ act = kwargs.pop("act", True)
21
+ conv = weight_norm(nn.Conv2d(*args, **kwargs))
22
+ if not act:
23
+ return conv
24
+ return nn.Sequential(conv, nn.LeakyReLU(0.1))
25
+
26
+
27
+ class MPD(nn.Module):
28
+ def __init__(self, period):
29
+ super().__init__()
30
+ self.period = period
31
+ self.convs = nn.ModuleList(
32
+ [
33
+ WNConv2d(1, 32, (5, 1), (3, 1), padding=(2, 0)),
34
+ WNConv2d(32, 128, (5, 1), (3, 1), padding=(2, 0)),
35
+ WNConv2d(128, 512, (5, 1), (3, 1), padding=(2, 0)),
36
+ WNConv2d(512, 1024, (5, 1), (3, 1), padding=(2, 0)),
37
+ WNConv2d(1024, 1024, (5, 1), 1, padding=(2, 0)),
38
+ ]
39
+ )
40
+ self.conv_post = WNConv2d(
41
+ 1024, 1, kernel_size=(3, 1), padding=(1, 0), act=False
42
+ )
43
+
44
+ def pad_to_period(self, x):
45
+ t = x.shape[-1]
46
+ x = F.pad(x, (0, self.period - t % self.period), mode="reflect")
47
+ return x
48
+
49
+ def forward(self, x):
50
+ fmap = []
51
+
52
+ x = self.pad_to_period(x)
53
+ x = rearrange(x, "b c (l p) -> b c l p", p=self.period)
54
+
55
+ for layer in self.convs:
56
+ x = layer(x)
57
+ fmap.append(x)
58
+
59
+ x = self.conv_post(x)
60
+ fmap.append(x)
61
+
62
+ return fmap
63
+
64
+
65
+ class MSD(nn.Module):
66
+ def __init__(self, rate: int = 1, sample_rate: int = 44100):
67
+ super().__init__()
68
+ self.convs = nn.ModuleList(
69
+ [
70
+ WNConv1d(1, 16, 15, 1, padding=7),
71
+ WNConv1d(16, 64, 41, 4, groups=4, padding=20),
72
+ WNConv1d(64, 256, 41, 4, groups=16, padding=20),
73
+ WNConv1d(256, 1024, 41, 4, groups=64, padding=20),
74
+ WNConv1d(1024, 1024, 41, 4, groups=256, padding=20),
75
+ WNConv1d(1024, 1024, 5, 1, padding=2),
76
+ ]
77
+ )
78
+ self.conv_post = WNConv1d(1024, 1, 3, 1, padding=1, act=False)
79
+ self.sample_rate = sample_rate
80
+ self.rate = rate
81
+
82
+ def forward(self, x):
83
+ x = AudioSignal(x, self.sample_rate)
84
+ x.resample(self.sample_rate // self.rate)
85
+ x = x.audio_data
86
+
87
+ fmap = []
88
+
89
+ for l in self.convs:
90
+ x = l(x)
91
+ fmap.append(x)
92
+ x = self.conv_post(x)
93
+ fmap.append(x)
94
+
95
+ return fmap
96
+
97
+
98
+ BANDS = [(0.0, 0.1), (0.1, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)]
99
+
100
+
101
+ class MRD(nn.Module):
102
+ def __init__(
103
+ self,
104
+ window_length: int,
105
+ hop_factor: float = 0.25,
106
+ sample_rate: int = 44100,
107
+ bands: list = BANDS,
108
+ ):
109
+ """Complex multi-band spectrogram discriminator.
110
+ Parameters
111
+ ----------
112
+ window_length : int
113
+ Window length of STFT.
114
+ hop_factor : float, optional
115
+ Hop factor of the STFT, by default 0.25 (hop length = ``hop_factor * window_length``).
116
+ sample_rate : int, optional
117
+ Sampling rate of audio in Hz, by default 44100
118
+ bands : list, optional
119
+ Bands to run discriminator over.
120
+ """
121
+ super().__init__()
122
+
123
+ self.window_length = window_length
124
+ self.hop_factor = hop_factor
125
+ self.sample_rate = sample_rate
126
+ self.stft_params = STFTParams(
127
+ window_length=window_length,
128
+ hop_length=int(window_length * hop_factor),
129
+ match_stride=True,
130
+ )
131
+
132
+ n_fft = window_length // 2 + 1
133
+ bands = [(int(b[0] * n_fft), int(b[1] * n_fft)) for b in bands]
134
+ self.bands = bands
135
+
136
+ ch = 32
137
+ convs = lambda: nn.ModuleList(
138
+ [
139
+ WNConv2d(2, ch, (3, 9), (1, 1), padding=(1, 4)),
140
+ WNConv2d(ch, ch, (3, 9), (1, 2), padding=(1, 4)),
141
+ WNConv2d(ch, ch, (3, 9), (1, 2), padding=(1, 4)),
142
+ WNConv2d(ch, ch, (3, 9), (1, 2), padding=(1, 4)),
143
+ WNConv2d(ch, ch, (3, 3), (1, 1), padding=(1, 1)),
144
+ ]
145
+ )
146
+ self.band_convs = nn.ModuleList([convs() for _ in range(len(self.bands))])
147
+ self.conv_post = WNConv2d(ch, 1, (3, 3), (1, 1), padding=(1, 1), act=False)
148
+
149
+ def spectrogram(self, x):
150
+ x = AudioSignal(x, self.sample_rate, stft_params=self.stft_params)
151
+ x = torch.view_as_real(x.stft())
152
+ x = rearrange(x, "b 1 f t c -> (b 1) c t f")
153
+ # Split into bands
154
+ x_bands = [x[..., b[0] : b[1]] for b in self.bands]
155
+ return x_bands
156
+
157
+ def forward(self, x):
158
+ x_bands = self.spectrogram(x)
159
+ fmap = []
160
+
161
+ x = []
162
+ for band, stack in zip(x_bands, self.band_convs):
163
+ for layer in stack:
164
+ band = layer(band)
165
+ fmap.append(band)
166
+ x.append(band)
167
+
168
+ x = torch.cat(x, dim=-1)
169
+ x = self.conv_post(x)
170
+ fmap.append(x)
171
+
172
+ return fmap
173
+
174
+
175
+ class Discriminator(nn.Module):
176
+ def __init__(
177
+ self,
178
+ rates: list = [],
179
+ periods: list = [2, 3, 5, 7, 11],
180
+ fft_sizes: list = [2048, 1024, 512],
181
+ sample_rate: int = 44100,
182
+ bands: list = BANDS,
183
+ ):
184
+ """Discriminator that combines multiple discriminators.
185
+
186
+ Parameters
187
+ ----------
188
+ rates : list, optional
189
+ sampling rates (in Hz) to run MSD at, by default []
190
+ If empty, MSD is not used.
191
+ periods : list, optional
192
+ periods (of samples) to run MPD at, by default [2, 3, 5, 7, 11]
193
+ fft_sizes : list, optional
194
+ Window sizes of the FFT to run MRD at, by default [2048, 1024, 512]
195
+ sample_rate : int, optional
196
+ Sampling rate of audio in Hz, by default 44100
197
+ bands : list, optional
198
+ Bands to run MRD at, by default `BANDS`
199
+ """
200
+ super().__init__()
201
+ discs = []
202
+ discs += [MPD(p) for p in periods]
203
+ discs += [MSD(r, sample_rate=sample_rate) for r in rates]
204
+ discs += [MRD(f, sample_rate=sample_rate, bands=bands) for f in fft_sizes]
205
+ self.discriminators = nn.ModuleList(discs)
206
+
207
+ def preprocess(self, y):
208
+ # Remove DC offset
209
+ y = y - y.mean(dim=-1, keepdims=True)
210
+ # Peak normalize the volume of input audio
211
+ y = 0.8 * y / (y.abs().max(dim=-1, keepdim=True)[0] + 1e-9)
212
+ return y
213
+
214
+ def forward(self, x):
215
+ x = self.preprocess(x)
216
+ fmaps = [d(x) for d in self.discriminators]
217
+ return fmaps
218
+
219
+
220
+ if __name__ == "__main__":
221
+ disc = Discriminator()
222
+ x = torch.zeros(1, 1, 44100)
223
+ results = disc(x)
224
+ for i, result in enumerate(results):
225
+ print(f"disc{i}")
226
+ for i, r in enumerate(result):
227
+ print(r.shape, r.mean(), r.min(), r.max())
228
+ print()
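The `__main__` check above only prints tensor statistics. Below is a minimal sketch (not part of this commit, assuming `audiotools` and `einops` are installed) of how the nested discriminator output is typically consumed for a least-squares GAN objective; it mirrors `GANLoss.discriminator_loss` added later in this diff (`dac/nn/loss.py`).

```python
import torch
from dac.model.discriminator import Discriminator

disc = Discriminator()               # MPD periods + MRD FFT sizes; MSD disabled (rates=[])
fake = torch.randn(1, 1, 44100)      # one second of "generated" audio at 44.1 kHz
real = torch.randn(1, 1, 44100)

d_fake = disc(fake)                  # one list per sub-discriminator
d_real = disc(real)                  # each list holds feature maps; last entry is the logit map

# Least-squares discriminator loss over the final logit maps.
loss_d = sum(
    torch.mean(f[-1] ** 2) + torch.mean((1 - r[-1]) ** 2)
    for f, r in zip(d_fake, d_real)
)
print(len(d_fake), float(loss_d))
```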
dac/model/encodec.py ADDED
@@ -0,0 +1,320 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Convolutional layers wrappers and utilities."""
8
+
9
+ import math
10
+ import typing as tp
11
+ import warnings
12
+
13
+ import torch
14
+ from torch import nn
15
+ from torch.nn import functional as F
16
+ from torch.nn.utils import spectral_norm, weight_norm
17
+
18
+ import typing as tp
19
+
20
+ import einops
21
+
22
+
23
+ class ConvLayerNorm(nn.LayerNorm):
24
+ """
25
+ Convolution-friendly LayerNorm that moves channels to last dimensions
26
+ before running the normalization and moves them back to original position right after.
27
+ """
28
+ def __init__(self, normalized_shape: tp.Union[int, tp.List[int], torch.Size], **kwargs):
29
+ super().__init__(normalized_shape, **kwargs)
30
+
31
+ def forward(self, x):
32
+ x = einops.rearrange(x, 'b ... t -> b t ...')
33
+ x = super().forward(x)
34
+ x = einops.rearrange(x, 'b t ... -> b ... t')
35
+ return x
36
+
37
+
38
+ CONV_NORMALIZATIONS = frozenset(['none', 'weight_norm', 'spectral_norm',
39
+ 'time_layer_norm', 'layer_norm', 'time_group_norm'])
40
+
41
+
42
+ def apply_parametrization_norm(module: nn.Module, norm: str = 'none') -> nn.Module:
43
+ assert norm in CONV_NORMALIZATIONS
44
+ if norm == 'weight_norm':
45
+ return weight_norm(module)
46
+ elif norm == 'spectral_norm':
47
+ return spectral_norm(module)
48
+ else:
49
+ # We already check was in CONV_NORMALIZATION, so any other choice
50
+ # doesn't need reparametrization.
51
+ return module
52
+
53
+
54
+ def get_norm_module(module: nn.Module, causal: bool = False, norm: str = 'none', **norm_kwargs) -> nn.Module:
55
+ """Return the proper normalization module. If causal is True, this will ensure the returned
56
+ module is causal, or return an error if the normalization doesn't support causal evaluation.
57
+ """
58
+ assert norm in CONV_NORMALIZATIONS
59
+ if norm == 'layer_norm':
60
+ assert isinstance(module, nn.modules.conv._ConvNd)
61
+ return ConvLayerNorm(module.out_channels, **norm_kwargs)
62
+ elif norm == 'time_group_norm':
63
+ if causal:
64
+ raise ValueError("GroupNorm doesn't support causal evaluation.")
65
+ assert isinstance(module, nn.modules.conv._ConvNd)
66
+ return nn.GroupNorm(1, module.out_channels, **norm_kwargs)
67
+ else:
68
+ return nn.Identity()
69
+
70
+
71
+ def get_extra_padding_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int,
72
+ padding_total: int = 0) -> int:
73
+ """See `pad_for_conv1d`.
74
+ """
75
+ length = x.shape[-1]
76
+ n_frames = (length - kernel_size + padding_total) / stride + 1
77
+ ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
78
+ return ideal_length - length
79
+
80
+
81
+ def pad_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0):
82
+ """Pad for a convolution to make sure that the last window is full.
83
+ Extra padding is added at the end. This is required to ensure that we can rebuild
84
+ an output of the same length, as otherwise, even with padding, some time steps
85
+ might get removed.
86
+ For instance, with total padding = 4, kernel size = 4, stride = 2:
87
+ 0 0 1 2 3 4 5 0 0 # (0s are padding)
88
+ 1 2 3 # (output frames of a convolution, last 0 is never used)
89
+ 0 0 1 2 3 4 5 0 # (output of tr. conv., but pos. 5 is going to get removed as padding)
90
+ 1 2 3 4 # once you removed padding, we are missing one time step !
91
+ """
92
+ extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)
93
+ return F.pad(x, (0, extra_padding))
94
+
95
+
96
+ def pad1d(x: torch.Tensor, paddings: tp.Tuple[int, int], mode: str = 'zero', value: float = 0.):
97
+ """Tiny wrapper around F.pad, just to allow for reflect padding on small input.
98
+ If this is the case, we insert extra 0 padding to the right before the reflection happen.
99
+ """
100
+ length = x.shape[-1]
101
+ padding_left, padding_right = paddings
102
+ assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
103
+ if mode == 'reflect':
104
+ max_pad = max(padding_left, padding_right)
105
+ extra_pad = 0
106
+ if length <= max_pad:
107
+ extra_pad = max_pad - length + 1
108
+ x = F.pad(x, (0, extra_pad))
109
+ padded = F.pad(x, paddings, mode, value)
110
+ end = padded.shape[-1] - extra_pad
111
+ return padded[..., :end]
112
+ else:
113
+ return F.pad(x, paddings, mode, value)
114
+
115
+
116
+ def unpad1d(x: torch.Tensor, paddings: tp.Tuple[int, int]):
117
+ """Remove padding from x, handling properly zero padding. Only for 1d!"""
118
+ padding_left, padding_right = paddings
119
+ assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
120
+ assert (padding_left + padding_right) <= x.shape[-1]
121
+ end = x.shape[-1] - padding_right
122
+ return x[..., padding_left: end]
123
+
124
+
125
+ class NormConv1d(nn.Module):
126
+ """Wrapper around Conv1d and normalization applied to this conv
127
+ to provide a uniform interface across normalization approaches.
128
+ """
129
+ def __init__(self, *args, causal: bool = False, norm: str = 'none',
130
+ norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
131
+ super().__init__()
132
+ self.conv = apply_parametrization_norm(nn.Conv1d(*args, **kwargs), norm)
133
+ self.norm = get_norm_module(self.conv, causal, norm, **norm_kwargs)
134
+ self.norm_type = norm
135
+
136
+ def forward(self, x):
137
+ x = self.conv(x)
138
+ x = self.norm(x)
139
+ return x
140
+
141
+
142
+ class NormConv2d(nn.Module):
143
+ """Wrapper around Conv2d and normalization applied to this conv
144
+ to provide a uniform interface across normalization approaches.
145
+ """
146
+ def __init__(self, *args, norm: str = 'none',
147
+ norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
148
+ super().__init__()
149
+ self.conv = apply_parametrization_norm(nn.Conv2d(*args, **kwargs), norm)
150
+ self.norm = get_norm_module(self.conv, causal=False, norm=norm, **norm_kwargs)
151
+ self.norm_type = norm
152
+
153
+ def forward(self, x):
154
+ x = self.conv(x)
155
+ x = self.norm(x)
156
+ return x
157
+
158
+
159
+ class NormConvTranspose1d(nn.Module):
160
+ """Wrapper around ConvTranspose1d and normalization applied to this conv
161
+ to provide a uniform interface across normalization approaches.
162
+ """
163
+ def __init__(self, *args, causal: bool = False, norm: str = 'none',
164
+ norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
165
+ super().__init__()
166
+ self.convtr = apply_parametrization_norm(nn.ConvTranspose1d(*args, **kwargs), norm)
167
+ self.norm = get_norm_module(self.convtr, causal, norm, **norm_kwargs)
168
+ self.norm_type = norm
169
+
170
+ def forward(self, x):
171
+ x = self.convtr(x)
172
+ x = self.norm(x)
173
+ return x
174
+
175
+
176
+ class NormConvTranspose2d(nn.Module):
177
+ """Wrapper around ConvTranspose2d and normalization applied to this conv
178
+ to provide a uniform interface across normalization approaches.
179
+ """
180
+ def __init__(self, *args, norm: str = 'none',
181
+ norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
182
+ super().__init__()
183
+ self.convtr = apply_parametrization_norm(nn.ConvTranspose2d(*args, **kwargs), norm)
184
+ self.norm = get_norm_module(self.convtr, causal=False, norm=norm, **norm_kwargs)
185
+
186
+ def forward(self, x):
187
+ x = self.convtr(x)
188
+ x = self.norm(x)
189
+ return x
190
+
191
+
192
+ class SConv1d(nn.Module):
193
+ """Conv1d with some builtin handling of asymmetric or causal padding
194
+ and normalization.
195
+ """
196
+ def __init__(self, in_channels: int, out_channels: int,
197
+ kernel_size: int, stride: int = 1, dilation: int = 1,
198
+ groups: int = 1, bias: bool = True, causal: bool = False,
199
+ norm: str = 'none', norm_kwargs: tp.Dict[str, tp.Any] = {},
200
+ pad_mode: str = 'reflect', **kwargs):
201
+ super().__init__()
202
+ # warn user on unusual setup between dilation and stride
203
+ if stride > 1 and dilation > 1:
204
+ warnings.warn('SConv1d has been initialized with stride > 1 and dilation > 1'
205
+ f' (kernel_size={kernel_size} stride={stride}, dilation={dilation}).')
206
+ self.conv = NormConv1d(in_channels, out_channels, kernel_size, stride,
207
+ dilation=dilation, groups=groups, bias=bias, causal=causal,
208
+ norm=norm, norm_kwargs=norm_kwargs)
209
+ self.causal = causal
210
+ self.pad_mode = pad_mode
211
+
212
+ self.cache_enabled = False
213
+
214
+ def reset_cache(self):
215
+ """Reset the cache when starting a new stream."""
216
+ self.cache = None
217
+ self.cache_enabled = True
218
+
219
+ def forward(self, x):
220
+ B, C, T = x.shape
221
+ kernel_size = self.conv.conv.kernel_size[0]
222
+ stride = self.conv.conv.stride[0]
223
+ dilation = self.conv.conv.dilation[0]
224
+ kernel_size = (kernel_size - 1) * dilation + 1 # effective kernel size with dilations
225
+ padding_total = kernel_size - stride
226
+ extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)
227
+
228
+ if self.causal:
229
+ # Left padding for causal
230
+ if self.cache_enabled and self.cache is not None:
231
+ # Concatenate the cache (previous inputs) with the new input for streaming
232
+ x = torch.cat([self.cache, x], dim=2)
233
+ else:
234
+ x = pad1d(x, (padding_total, extra_padding), mode=self.pad_mode)
235
+ else:
236
+ # Asymmetric padding required for odd strides
237
+ padding_right = padding_total // 2
238
+ padding_left = padding_total - padding_right
239
+ x = pad1d(x, (padding_left, padding_right + extra_padding), mode=self.pad_mode)
240
+
241
+ # Store the most recent input frames for future cache use
242
+ if self.cache_enabled:
243
+ if self.cache is None:
244
+ # Initialize cache with zeros (at the start of streaming)
245
+ self.cache = torch.zeros(B, C, kernel_size - 1, device=x.device)
246
+ # Update the cache by storing the latest input frames
247
+ if kernel_size > 1:
248
+ self.cache = x[:, :, -kernel_size + 1:].detach() # Only store the necessary frames
249
+
250
+ return self.conv(x)
251
+
252
+
253
+
254
+ class SConvTranspose1d(nn.Module):
255
+ """ConvTranspose1d with some builtin handling of asymmetric or causal padding
256
+ and normalization.
257
+ """
258
+ def __init__(self, in_channels: int, out_channels: int,
259
+ kernel_size: int, stride: int = 1, causal: bool = False,
260
+ norm: str = 'none', trim_right_ratio: float = 1.,
261
+ norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
262
+ super().__init__()
263
+ self.convtr = NormConvTranspose1d(in_channels, out_channels, kernel_size, stride,
264
+ causal=causal, norm=norm, norm_kwargs=norm_kwargs)
265
+ self.causal = causal
266
+ self.trim_right_ratio = trim_right_ratio
267
+ assert self.causal or self.trim_right_ratio == 1., \
268
+ "`trim_right_ratio` != 1.0 only makes sense for causal convolutions"
269
+ assert self.trim_right_ratio >= 0. and self.trim_right_ratio <= 1.
270
+
271
+ def forward(self, x):
272
+ kernel_size = self.convtr.convtr.kernel_size[0]
273
+ stride = self.convtr.convtr.stride[0]
274
+ padding_total = kernel_size - stride
275
+
276
+ y = self.convtr(x)
277
+
278
+ # We will only trim fixed padding. Extra padding from `pad_for_conv1d` would be
279
+ # removed at the very end, when keeping only the right length for the output,
280
+ # as removing it here would require also passing the length at the matching layer
281
+ # in the encoder.
282
+ if self.causal:
283
+ # Trim the padding on the right according to the specified ratio
284
+ # if trim_right_ratio = 1.0, trim everything from right
285
+ padding_right = math.ceil(padding_total * self.trim_right_ratio)
286
+ padding_left = padding_total - padding_right
287
+ y = unpad1d(y, (padding_left, padding_right))
288
+ else:
289
+ # Asymmetric padding required for odd strides
290
+ padding_right = padding_total // 2
291
+ padding_left = padding_total - padding_right
292
+ y = unpad1d(y, (padding_left, padding_right))
293
+ return y
294
+
295
+ class SLSTM(nn.Module):
296
+ """
297
+ LSTM without worrying about the hidden state, nor the layout of the data.
298
+ Expects input as convolutional layout.
299
+ """
300
+ def __init__(self, dimension: int, num_layers: int = 2, skip: bool = True):
301
+ super().__init__()
302
+ self.skip = skip
303
+ self.lstm = nn.LSTM(dimension, dimension, num_layers)
304
+ self.hidden = None
305
+ self.cache_enabled = False
306
+
307
+ def forward(self, x):
308
+ x = x.permute(2, 0, 1)
309
+ if self.training or not self.cache_enabled:
310
+ y, _ = self.lstm(x)
311
+ else:
312
+ y, self.hidden = self.lstm(x, self.hidden)
313
+ if self.skip:
314
+ y = y + x
315
+ y = y.permute(1, 2, 0)
316
+ return y
317
+
318
+ def reset_cache(self):
319
+ self.hidden = None
320
+ self.cache_enabled = True
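The `reset_cache` hooks above exist for streaming inference. A small sketch (not part of this commit) of the intended usage: run a causal `SConv1d` chunk by chunk and obtain the same frames as a single full pass.

```python
import torch
from dac.model.encodec import SConv1d

conv = SConv1d(1, 8, kernel_size=7, causal=True).eval()
signal = torch.randn(1, 1, 4 * 320)

with torch.no_grad():
    full = conv(signal)                        # one-shot pass (cache disabled)

    conv.reset_cache()                         # enable streaming and clear the cache
    chunks = [conv(c) for c in signal.split(320, dim=-1)]
    streamed = torch.cat(chunks, dim=-1)

print(full.shape, streamed.shape)              # both (1, 8, 1280) for stride 1
print(torch.allclose(full, streamed, atol=1e-6))
```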
dac/nn/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from . import layers
2
+ from . import loss
3
+ from . import quantize
dac/nn/layers.py ADDED
@@ -0,0 +1,33 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from einops import rearrange
6
+ from torch.nn.utils import weight_norm
7
+
8
+
9
+ def WNConv1d(*args, **kwargs):
10
+ return weight_norm(nn.Conv1d(*args, **kwargs))
11
+
12
+
13
+ def WNConvTranspose1d(*args, **kwargs):
14
+ return weight_norm(nn.ConvTranspose1d(*args, **kwargs))
15
+
16
+
17
+ # Scripting this brings model speed up 1.4x
18
+ @torch.jit.script
19
+ def snake(x, alpha):
20
+ shape = x.shape
21
+ x = x.reshape(shape[0], shape[1], -1)
22
+ x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2)
23
+ x = x.reshape(shape)
24
+ return x
25
+
26
+
27
+ class Snake1d(nn.Module):
28
+ def __init__(self, channels):
29
+ super().__init__()
30
+ self.alpha = nn.Parameter(torch.ones(1, channels, 1))
31
+
32
+ def forward(self, x):
33
+ return snake(x, self.alpha)
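For reference, a tiny shape check of the snake activation above (a sketch, not part of this commit): it is a pointwise, shape-preserving nonlinearity with one learnable `alpha` per channel.

```python
import torch
from dac.nn.layers import Snake1d

act = Snake1d(channels=16)
x = torch.randn(2, 16, 100)

print(act(x).shape)       # torch.Size([2, 16, 100])
print(act.alpha.shape)    # torch.Size([1, 16, 1])
```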
dac/nn/loss.py ADDED
@@ -0,0 +1,368 @@
1
+ import typing
2
+ from typing import List
3
+
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from audiotools import AudioSignal
7
+ from audiotools import STFTParams
8
+ from torch import nn
9
+
10
+
11
+ class L1Loss(nn.L1Loss):
12
+ """L1 Loss between AudioSignals. Defaults
13
+ to comparing ``audio_data``, but any
14
+ attribute of an AudioSignal can be used.
15
+
16
+ Parameters
17
+ ----------
18
+ attribute : str, optional
19
+ Attribute of signal to compare, defaults to ``audio_data``.
20
+ weight : float, optional
21
+ Weight of this loss, defaults to 1.0.
22
+
23
+ Implementation copied from: https://github.com/descriptinc/lyrebird-audiotools/blob/961786aa1a9d628cca0c0486e5885a457fe70c1a/audiotools/metrics/distance.py
24
+ """
25
+
26
+ def __init__(self, attribute: str = "audio_data", weight: float = 1.0, **kwargs):
27
+ self.attribute = attribute
28
+ self.weight = weight
29
+ super().__init__(**kwargs)
30
+
31
+ def forward(self, x: AudioSignal, y: AudioSignal):
32
+ """
33
+ Parameters
34
+ ----------
35
+ x : AudioSignal
36
+ Estimate AudioSignal
37
+ y : AudioSignal
38
+ Reference AudioSignal
39
+
40
+ Returns
41
+ -------
42
+ torch.Tensor
43
+ L1 loss between AudioSignal attributes.
44
+ """
45
+ if isinstance(x, AudioSignal):
46
+ x = getattr(x, self.attribute)
47
+ y = getattr(y, self.attribute)
48
+ return super().forward(x, y)
49
+
50
+
51
+ class SISDRLoss(nn.Module):
52
+ """
53
+ Computes the Scale-Invariant Source-to-Distortion Ratio between a batch
54
+ of estimated and reference audio signals or aligned features.
55
+
56
+ Parameters
57
+ ----------
58
+ scaling : int, optional
59
+ Whether to use scale-invariant (True) or
60
+ signal-to-noise ratio (False), by default True
61
+ reduction : str, optional
62
+ How to reduce across the batch (either 'mean',
63
+ 'sum', or none).], by default ' mean'
64
+ zero_mean : int, optional
65
+ Zero mean the references and estimates before
66
+ computing the loss, by default True
67
+ clip_min : int, optional
68
+ The minimum possible loss value. Helps network
69
+ to not focus on making already good examples better, by default None
70
+ weight : float, optional
71
+ Weight of this loss, defaults to 1.0.
72
+
73
+ Implementation copied from: https://github.com/descriptinc/lyrebird-audiotools/blob/961786aa1a9d628cca0c0486e5885a457fe70c1a/audiotools/metrics/distance.py
74
+ """
75
+
76
+ def __init__(
77
+ self,
78
+ scaling: int = True,
79
+ reduction: str = "mean",
80
+ zero_mean: int = True,
81
+ clip_min: int = None,
82
+ weight: float = 1.0,
83
+ ):
84
+ self.scaling = scaling
85
+ self.reduction = reduction
86
+ self.zero_mean = zero_mean
87
+ self.clip_min = clip_min
88
+ self.weight = weight
89
+ super().__init__()
90
+
91
+ def forward(self, x: AudioSignal, y: AudioSignal):
92
+ eps = 1e-8
93
+ # nb, nc, nt
94
+ if isinstance(x, AudioSignal):
95
+ references = x.audio_data
96
+ estimates = y.audio_data
97
+ else:
98
+ references = x
99
+ estimates = y
100
+
101
+ nb = references.shape[0]
102
+ references = references.reshape(nb, 1, -1).permute(0, 2, 1)
103
+ estimates = estimates.reshape(nb, 1, -1).permute(0, 2, 1)
104
+
105
+ # samples now on axis 1
106
+ if self.zero_mean:
107
+ mean_reference = references.mean(dim=1, keepdim=True)
108
+ mean_estimate = estimates.mean(dim=1, keepdim=True)
109
+ else:
110
+ mean_reference = 0
111
+ mean_estimate = 0
112
+
113
+ _references = references - mean_reference
114
+ _estimates = estimates - mean_estimate
115
+
116
+ references_projection = (_references**2).sum(dim=-2) + eps
117
+ references_on_estimates = (_estimates * _references).sum(dim=-2) + eps
118
+
119
+ scale = (
120
+ (references_on_estimates / references_projection).unsqueeze(1)
121
+ if self.scaling
122
+ else 1
123
+ )
124
+
125
+ e_true = scale * _references
126
+ e_res = _estimates - e_true
127
+
128
+ signal = (e_true**2).sum(dim=1)
129
+ noise = (e_res**2).sum(dim=1)
130
+ sdr = -10 * torch.log10(signal / noise + eps)
131
+
132
+ if self.clip_min is not None:
133
+ sdr = torch.clamp(sdr, min=self.clip_min)
134
+
135
+ if self.reduction == "mean":
136
+ sdr = sdr.mean()
137
+ elif self.reduction == "sum":
138
+ sdr = sdr.sum()
139
+ return sdr
140
+
141
+
142
+ class MultiScaleSTFTLoss(nn.Module):
143
+ """Computes the multi-scale STFT loss from [1].
144
+
145
+ Parameters
146
+ ----------
147
+ window_lengths : List[int], optional
148
+ Length of each window of each STFT, by default [2048, 512]
149
+ loss_fn : typing.Callable, optional
150
+ How to compare each loss, by default nn.L1Loss()
151
+ clamp_eps : float, optional
152
+ Clamp on the log magnitude, below, by default 1e-5
153
+ mag_weight : float, optional
154
+ Weight of raw magnitude portion of loss, by default 1.0
155
+ log_weight : float, optional
156
+ Weight of log magnitude portion of loss, by default 1.0
157
+ pow : float, optional
158
+ Power to raise magnitude to before taking log, by default 2.0
159
+ weight : float, optional
160
+ Weight of this loss, by default 1.0
161
+ match_stride : bool, optional
162
+ Whether to match the stride of convolutional layers, by default False
163
+
164
+ References
165
+ ----------
166
+
167
+ 1. Engel, Jesse, Chenjie Gu, and Adam Roberts.
168
+ "DDSP: Differentiable Digital Signal Processing."
169
+ International Conference on Learning Representations. 2019.
170
+
171
+ Implementation copied from: https://github.com/descriptinc/lyrebird-audiotools/blob/961786aa1a9d628cca0c0486e5885a457fe70c1a/audiotools/metrics/spectral.py
172
+ """
173
+
174
+ def __init__(
175
+ self,
176
+ window_lengths: List[int] = [2048, 512],
177
+ loss_fn: typing.Callable = nn.L1Loss(),
178
+ clamp_eps: float = 1e-5,
179
+ mag_weight: float = 1.0,
180
+ log_weight: float = 1.0,
181
+ pow: float = 2.0,
182
+ weight: float = 1.0,
183
+ match_stride: bool = False,
184
+ window_type: str = None,
185
+ ):
186
+ super().__init__()
187
+ self.stft_params = [
188
+ STFTParams(
189
+ window_length=w,
190
+ hop_length=w // 4,
191
+ match_stride=match_stride,
192
+ window_type=window_type,
193
+ )
194
+ for w in window_lengths
195
+ ]
196
+ self.loss_fn = loss_fn
197
+ self.log_weight = log_weight
198
+ self.mag_weight = mag_weight
199
+ self.clamp_eps = clamp_eps
200
+ self.weight = weight
201
+ self.pow = pow
202
+
203
+ def forward(self, x: AudioSignal, y: AudioSignal):
204
+ """Computes multi-scale STFT between an estimate and a reference
205
+ signal.
206
+
207
+ Parameters
208
+ ----------
209
+ x : AudioSignal
210
+ Estimate signal
211
+ y : AudioSignal
212
+ Reference signal
213
+
214
+ Returns
215
+ -------
216
+ torch.Tensor
217
+ Multi-scale STFT loss.
218
+ """
219
+ loss = 0.0
220
+ for s in self.stft_params:
221
+ x.stft(s.window_length, s.hop_length, s.window_type)
222
+ y.stft(s.window_length, s.hop_length, s.window_type)
223
+ loss += self.log_weight * self.loss_fn(
224
+ x.magnitude.clamp(self.clamp_eps).pow(self.pow).log10(),
225
+ y.magnitude.clamp(self.clamp_eps).pow(self.pow).log10(),
226
+ )
227
+ loss += self.mag_weight * self.loss_fn(x.magnitude, y.magnitude)
228
+ return loss
229
+
230
+
231
+ class MelSpectrogramLoss(nn.Module):
232
+ """Compute distance between mel spectrograms. Can be used
233
+ in a multi-scale way.
234
+
235
+ Parameters
236
+ ----------
237
+ n_mels : List[int]
238
+ Number of mels per STFT, by default [150, 80],
239
+ window_lengths : List[int], optional
240
+ Length of each window of each STFT, by default [2048, 512]
241
+ loss_fn : typing.Callable, optional
242
+ How to compare each loss, by default nn.L1Loss()
243
+ clamp_eps : float, optional
244
+ Clamp on the log magnitude, below, by default 1e-5
245
+ mag_weight : float, optional
246
+ Weight of raw magnitude portion of loss, by default 1.0
247
+ log_weight : float, optional
248
+ Weight of log magnitude portion of loss, by default 1.0
249
+ pow : float, optional
250
+ Power to raise magnitude to before taking log, by default 2.0
251
+ weight : float, optional
252
+ Weight of this loss, by default 1.0
253
+ match_stride : bool, optional
254
+ Whether to match the stride of convolutional layers, by default False
255
+
256
+ Implementation copied from: https://github.com/descriptinc/lyrebird-audiotools/blob/961786aa1a9d628cca0c0486e5885a457fe70c1a/audiotools/metrics/spectral.py
257
+ """
258
+
259
+ def __init__(
260
+ self,
261
+ n_mels: List[int] = [150, 80],
262
+ window_lengths: List[int] = [2048, 512],
263
+ loss_fn: typing.Callable = nn.L1Loss(),
264
+ clamp_eps: float = 1e-5,
265
+ mag_weight: float = 1.0,
266
+ log_weight: float = 1.0,
267
+ pow: float = 2.0,
268
+ weight: float = 1.0,
269
+ match_stride: bool = False,
270
+ mel_fmin: List[float] = [0.0, 0.0],
271
+ mel_fmax: List[float] = [None, None],
272
+ window_type: str = None,
273
+ ):
274
+ super().__init__()
275
+ self.stft_params = [
276
+ STFTParams(
277
+ window_length=w,
278
+ hop_length=w // 4,
279
+ match_stride=match_stride,
280
+ window_type=window_type,
281
+ )
282
+ for w in window_lengths
283
+ ]
284
+ self.n_mels = n_mels
285
+ self.loss_fn = loss_fn
286
+ self.clamp_eps = clamp_eps
287
+ self.log_weight = log_weight
288
+ self.mag_weight = mag_weight
289
+ self.weight = weight
290
+ self.mel_fmin = mel_fmin
291
+ self.mel_fmax = mel_fmax
292
+ self.pow = pow
293
+
294
+ def forward(self, x: AudioSignal, y: AudioSignal):
295
+ """Computes mel loss between an estimate and a reference
296
+ signal.
297
+
298
+ Parameters
299
+ ----------
300
+ x : AudioSignal
301
+ Estimate signal
302
+ y : AudioSignal
303
+ Reference signal
304
+
305
+ Returns
306
+ -------
307
+ torch.Tensor
308
+ Mel loss.
309
+ """
310
+ loss = 0.0
311
+ for n_mels, fmin, fmax, s in zip(
312
+ self.n_mels, self.mel_fmin, self.mel_fmax, self.stft_params
313
+ ):
314
+ kwargs = {
315
+ "window_length": s.window_length,
316
+ "hop_length": s.hop_length,
317
+ "window_type": s.window_type,
318
+ }
319
+ x_mels = x.mel_spectrogram(n_mels, mel_fmin=fmin, mel_fmax=fmax, **kwargs)
320
+ y_mels = y.mel_spectrogram(n_mels, mel_fmin=fmin, mel_fmax=fmax, **kwargs)
321
+
322
+ loss += self.log_weight * self.loss_fn(
323
+ x_mels.clamp(self.clamp_eps).pow(self.pow).log10(),
324
+ y_mels.clamp(self.clamp_eps).pow(self.pow).log10(),
325
+ )
326
+ loss += self.mag_weight * self.loss_fn(x_mels, y_mels)
327
+ return loss
328
+
329
+
330
+ class GANLoss(nn.Module):
331
+ """
332
+ Computes a discriminator loss, given a discriminator on
333
+ generated waveforms/spectrograms compared to ground truth
334
+ waveforms/spectrograms. Computes the loss for both the
335
+ discriminator and the generator in separate functions.
336
+ """
337
+
338
+ def __init__(self, discriminator):
339
+ super().__init__()
340
+ self.discriminator = discriminator
341
+
342
+ def forward(self, fake, real):
343
+ d_fake = self.discriminator(fake.audio_data)
344
+ d_real = self.discriminator(real.audio_data)
345
+ return d_fake, d_real
346
+
347
+ def discriminator_loss(self, fake, real):
348
+ d_fake, d_real = self.forward(fake.clone().detach(), real)
349
+
350
+ loss_d = 0
351
+ for x_fake, x_real in zip(d_fake, d_real):
352
+ loss_d += torch.mean(x_fake[-1] ** 2)
353
+ loss_d += torch.mean((1 - x_real[-1]) ** 2)
354
+ return loss_d
355
+
356
+ def generator_loss(self, fake, real):
357
+ d_fake, d_real = self.forward(fake, real)
358
+
359
+ loss_g = 0
360
+ for x_fake in d_fake:
361
+ loss_g += torch.mean((1 - x_fake[-1]) ** 2)
362
+
363
+ loss_feature = 0
364
+
365
+ for i in range(len(d_fake)):
366
+ for j in range(len(d_fake[i]) - 1):
367
+ loss_feature += F.l1_loss(d_fake[i][j], d_real[i][j].detach())
368
+ return loss_g, loss_feature
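A sketch of how these losses are typically combined into a codec reconstruction objective (illustrative only; the relative weights below are assumptions, not values taken from this repo):

```python
import torch
from audiotools import AudioSignal
from dac.nn.loss import L1Loss, MultiScaleSTFTLoss, MelSpectrogramLoss

recons = AudioSignal(torch.randn(1, 1, 44100), 44100)   # "generated" audio
target = AudioSignal(torch.randn(1, 1, 44100), 44100)   # ground truth

waveform_loss = L1Loss()
stft_loss = MultiScaleSTFTLoss()
mel_loss = MelSpectrogramLoss()

loss = (
    waveform_loss(recons, target)
    + stft_loss(recons, target)
    + 15.0 * mel_loss(recons, target)   # illustrative weighting only
)
print(float(loss))
```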
dac/nn/quantize.py ADDED
@@ -0,0 +1,339 @@
1
+ from typing import Union
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from einops import rearrange
8
+ from torch.nn.utils import weight_norm
9
+
10
+ from dac.nn.layers import WNConv1d
11
+
12
+ class VectorQuantizeLegacy(nn.Module):
13
+ """
14
+ Implementation of VQ similar to Karpathy's repo:
15
+ https://github.com/karpathy/deep-vector-quantization
16
+ removed in-out projection
17
+ """
18
+
19
+ def __init__(self, input_dim: int, codebook_size: int):
20
+ super().__init__()
21
+ self.codebook_size = codebook_size
22
+ self.codebook = nn.Embedding(codebook_size, input_dim)
23
+
24
+ def forward(self, z, z_mask=None):
25
+ """Quantized the input tensor using a fixed codebook and returns
26
+ the corresponding codebook vectors
27
+
28
+ Parameters
29
+ ----------
30
+ z : Tensor[B x D x T]
31
+
32
+ Returns
33
+ -------
34
+ Tensor[B x D x T]
35
+ Quantized continuous representation of input
36
+ Tensor[1]
37
+ Commitment loss to train encoder to predict vectors closer to codebook
38
+ entries
39
+ Tensor[1]
40
+ Codebook loss to update the codebook
41
+ Tensor[B x T]
42
+ Codebook indices (quantized discrete representation of input)
43
+ Tensor[B x D x T]
44
+ Projected latents (continuous representation of input before quantization)
45
+ """
46
+
47
+ z_e = z
48
+ z_q, indices = self.decode_latents(z)
49
+
50
+ if z_mask is not None:
51
+ commitment_loss = (F.mse_loss(z_e, z_q.detach(), reduction="none").mean(1) * z_mask).sum() / z_mask.sum()
52
+ codebook_loss = (F.mse_loss(z_q, z_e.detach(), reduction="none").mean(1) * z_mask).sum() / z_mask.sum()
53
+ else:
54
+ commitment_loss = F.mse_loss(z_e, z_q.detach())
55
+ codebook_loss = F.mse_loss(z_q, z_e.detach())
56
+ z_q = (
57
+ z_e + (z_q - z_e).detach()
58
+ ) # noop in forward pass, straight-through gradient estimator in backward pass
59
+
60
+ return z_q, indices, z_e, commitment_loss, codebook_loss
61
+
62
+ def embed_code(self, embed_id):
63
+ return F.embedding(embed_id, self.codebook.weight)
64
+
65
+ def decode_code(self, embed_id):
66
+ return self.embed_code(embed_id).transpose(1, 2)
67
+
68
+ def decode_latents(self, latents):
69
+ encodings = rearrange(latents, "b d t -> (b t) d")
70
+ codebook = self.codebook.weight # codebook: (N x D)
71
+
72
+ # L2 normalize encodings and codebook (ViT-VQGAN)
73
+ encodings = F.normalize(encodings)
74
+ codebook = F.normalize(codebook)
75
+
76
+ # Compute euclidean distance with codebook
77
+ dist = (
78
+ encodings.pow(2).sum(1, keepdim=True)
79
+ - 2 * encodings @ codebook.t()
80
+ + codebook.pow(2).sum(1, keepdim=True).t()
81
+ )
82
+ indices = rearrange((-dist).max(1)[1], "(b t) -> b t", b=latents.size(0))
83
+ z_q = self.decode_code(indices)
84
+ return z_q, indices
85
+
86
+ class VectorQuantize(nn.Module):
87
+ """
88
+ Implementation of VQ similar to Karpathy's repo:
89
+ https://github.com/karpathy/deep-vector-quantization
90
+ Additionally uses following tricks from Improved VQGAN
91
+ (https://arxiv.org/pdf/2110.04627.pdf):
92
+ 1. Factorized codes: Perform nearest neighbor lookup in low-dimensional space
93
+ for improved codebook usage
94
+ 2. l2-normalized codes: Converts euclidean distance to cosine similarity which
95
+ improves training stability
96
+ """
97
+
98
+ def __init__(self, input_dim: int, codebook_size: int, codebook_dim: int):
99
+ super().__init__()
100
+ self.codebook_size = codebook_size
101
+ self.codebook_dim = codebook_dim
102
+
103
+ self.in_proj = WNConv1d(input_dim, codebook_dim, kernel_size=1)
104
+ self.out_proj = WNConv1d(codebook_dim, input_dim, kernel_size=1)
105
+ self.codebook = nn.Embedding(codebook_size, codebook_dim)
106
+
107
+ def forward(self, z, z_mask=None):
108
+ """Quantized the input tensor using a fixed codebook and returns
109
+ the corresponding codebook vectors
110
+
111
+ Parameters
112
+ ----------
113
+ z : Tensor[B x D x T]
114
+
115
+ Returns
116
+ -------
117
+ Tensor[B x D x T]
118
+ Quantized continuous representation of input
119
+ Tensor[1]
120
+ Commitment loss to train encoder to predict vectors closer to codebook
121
+ entries
122
+ Tensor[1]
123
+ Codebook loss to update the codebook
124
+ Tensor[B x T]
125
+ Codebook indices (quantized discrete representation of input)
126
+ Tensor[B x D x T]
127
+ Projected latents (continuous representation of input before quantization)
128
+ """
129
+
130
+ # Factorized codes (ViT-VQGAN) Project input into low-dimensional space
131
+ z_e = self.in_proj(z) # z_e : (B x D x T)
132
+ z_q, indices = self.decode_latents(z_e)
133
+
134
+ if z_mask is not None:
135
+ commitment_loss = (F.mse_loss(z_e, z_q.detach(), reduction="none").mean(1) * z_mask).sum() / z_mask.sum()
136
+ codebook_loss = (F.mse_loss(z_q, z_e.detach(), reduction="none").mean(1) * z_mask).sum() / z_mask.sum()
137
+ else:
138
+ commitment_loss = F.mse_loss(z_e, z_q.detach())
139
+ codebook_loss = F.mse_loss(z_q, z_e.detach())
140
+
141
+ z_q = (
142
+ z_e + (z_q - z_e).detach()
143
+ ) # noop in forward pass, straight-through gradient estimator in backward pass
144
+
145
+ z_q = self.out_proj(z_q)
146
+
147
+ return z_q, commitment_loss, codebook_loss, indices, z_e
148
+
149
+ def embed_code(self, embed_id):
150
+ return F.embedding(embed_id, self.codebook.weight)
151
+
152
+ def decode_code(self, embed_id):
153
+ return self.embed_code(embed_id).transpose(1, 2)
154
+
155
+ def decode_latents(self, latents):
156
+ encodings = rearrange(latents, "b d t -> (b t) d")
157
+ codebook = self.codebook.weight # codebook: (N x D)
158
+
159
+ # L2 normalize encodings and codebook (ViT-VQGAN)
160
+ encodings = F.normalize(encodings)
161
+ codebook = F.normalize(codebook)
162
+
163
+ # Compute euclidean distance with codebook
164
+ dist = (
165
+ encodings.pow(2).sum(1, keepdim=True)
166
+ - 2 * encodings @ codebook.t()
167
+ + codebook.pow(2).sum(1, keepdim=True).t()
168
+ )
169
+ indices = rearrange((-dist).max(1)[1], "(b t) -> b t", b=latents.size(0))
170
+ z_q = self.decode_code(indices)
171
+ return z_q, indices
172
+
173
+
174
+ class ResidualVectorQuantize(nn.Module):
175
+ """
176
+ Introduced in SoundStream: An end2end neural audio codec
177
+ https://arxiv.org/abs/2107.03312
178
+ """
179
+
180
+ def __init__(
181
+ self,
182
+ input_dim: int = 512,
183
+ n_codebooks: int = 9,
184
+ codebook_size: int = 1024,
185
+ codebook_dim: Union[int, list] = 8,
186
+ quantizer_dropout: float = 0.0,
187
+ ):
188
+ super().__init__()
189
+ if isinstance(codebook_dim, int):
190
+ codebook_dim = [codebook_dim for _ in range(n_codebooks)]
191
+
192
+ self.n_codebooks = n_codebooks
193
+ self.codebook_dim = codebook_dim
194
+ self.codebook_size = codebook_size
195
+
196
+ self.quantizers = nn.ModuleList(
197
+ [
198
+ VectorQuantize(input_dim, codebook_size, codebook_dim[i])
199
+ for i in range(n_codebooks)
200
+ ]
201
+ )
202
+ self.quantizer_dropout = quantizer_dropout
203
+
204
+ def forward(self, z, n_quantizers: int = None):
205
+ """Quantized the input tensor using a fixed set of `n` codebooks and returns
206
+ the corresponding codebook vectors
207
+ Parameters
208
+ ----------
209
+ z : Tensor[B x D x T]
210
+ n_quantizers : int, optional
211
+ No. of quantizers to use
212
+ (n_quantizers < self.n_codebooks ex: for quantizer dropout)
213
+ Note: if `self.quantizer_dropout` is True, this argument is ignored
214
+ when in training mode, and a random number of quantizers is used.
215
+ Returns
216
+ -------
217
+ dict
218
+ A dictionary with the following keys:
219
+
220
+ "z" : Tensor[B x D x T]
221
+ Quantized continuous representation of input
222
+ "codes" : Tensor[B x N x T]
223
+ Codebook indices for each codebook
224
+ (quantized discrete representation of input)
225
+ "latents" : Tensor[B x N*D x T]
226
+ Projected latents (continuous representation of input before quantization)
227
+ "vq/commitment_loss" : Tensor[1]
228
+ Commitment loss to train encoder to predict vectors closer to codebook
229
+ entries
230
+ "vq/codebook_loss" : Tensor[1]
231
+ Codebook loss to update the codebook
232
+ """
233
+ z_q = 0
234
+ residual = z
235
+ commitment_loss = 0
236
+ codebook_loss = 0
237
+
238
+ codebook_indices = []
239
+ latents = []
240
+
241
+ if n_quantizers is None:
242
+ n_quantizers = self.n_codebooks
243
+ if self.training:
244
+ n_quantizers = torch.ones((z.shape[0],)) * self.n_codebooks + 1
245
+ dropout = torch.randint(1, self.n_codebooks + 1, (z.shape[0],))
246
+ n_dropout = int(z.shape[0] * self.quantizer_dropout)
247
+ n_quantizers[:n_dropout] = dropout[:n_dropout]
248
+ n_quantizers = n_quantizers.to(z.device)
249
+
250
+ for i, quantizer in enumerate(self.quantizers):
251
+ if self.training is False and i >= n_quantizers:
252
+ break
253
+
254
+ z_q_i, commitment_loss_i, codebook_loss_i, indices_i, z_e_i = quantizer(
255
+ residual
256
+ )
257
+
258
+ # Create mask to apply quantizer dropout
259
+ mask = (
260
+ torch.full((z.shape[0],), fill_value=i, device=z.device) < n_quantizers
261
+ )
262
+ z_q = z_q + z_q_i * mask[:, None, None]
263
+ residual = residual - z_q_i
264
+
265
+ # Sum losses
266
+ commitment_loss += (commitment_loss_i * mask).mean()
267
+ codebook_loss += (codebook_loss_i * mask).mean()
268
+
269
+ codebook_indices.append(indices_i)
270
+ latents.append(z_e_i)
271
+
272
+ codes = torch.stack(codebook_indices, dim=1)
273
+ latents = torch.cat(latents, dim=1)
274
+
275
+ return z_q, codes, latents, commitment_loss, codebook_loss
276
+
277
+ def from_codes(self, codes: torch.Tensor):
278
+ """Given the quantized codes, reconstruct the continuous representation
279
+ Parameters
280
+ ----------
281
+ codes : Tensor[B x N x T]
282
+ Quantized discrete representation of input
283
+ Returns
284
+ -------
285
+ Tensor[B x D x T]
286
+ Quantized continuous representation of input
287
+ """
288
+ z_q = 0.0
289
+ z_p = []
290
+ n_codebooks = codes.shape[1]
291
+ for i in range(n_codebooks):
292
+ z_p_i = self.quantizers[i].decode_code(codes[:, i, :])
293
+ z_p.append(z_p_i)
294
+
295
+ z_q_i = self.quantizers[i].out_proj(z_p_i)
296
+ z_q = z_q + z_q_i
297
+ return z_q, torch.cat(z_p, dim=1), codes
298
+
299
+ def from_latents(self, latents: torch.Tensor):
300
+ """Given the unquantized latents, reconstruct the
301
+ continuous representation after quantization.
302
+
303
+ Parameters
304
+ ----------
305
+ latents : Tensor[B x N x T]
306
+ Continuous representation of input after projection
307
+
308
+ Returns
309
+ -------
310
+ Tensor[B x D x T]
311
+ Quantized representation of full-projected space
312
+ Tensor[B x D x T]
313
+ Quantized representation of latent space
314
+ """
315
+ z_q = 0
316
+ z_p = []
317
+ codes = []
318
+ dims = np.cumsum([0] + [q.codebook_dim for q in self.quantizers])
319
+
320
+ n_codebooks = np.where(dims <= latents.shape[1])[0].max(axis=0, keepdims=True)[
321
+ 0
322
+ ]
323
+ for i in range(n_codebooks):
324
+ j, k = dims[i], dims[i + 1]
325
+ z_p_i, codes_i = self.quantizers[i].decode_latents(latents[:, j:k, :])
326
+ z_p.append(z_p_i)
327
+ codes.append(codes_i)
328
+
329
+ z_q_i = self.quantizers[i].out_proj(z_p_i)
330
+ z_q = z_q + z_q_i
331
+
332
+ return z_q, torch.cat(z_p, dim=1), torch.stack(codes, dim=1)
333
+
334
+
335
+ if __name__ == "__main__":
336
+ rvq = ResidualVectorQuantize(quantizer_dropout=True)
337
+ x = torch.randn(16, 512, 80)
338
+ z_q, codes, latents, commitment_loss, codebook_loss = rvq(x)
+ print(latents.shape)
dac/utils/__init__.py ADDED
@@ -0,0 +1,123 @@
1
+ from pathlib import Path
2
+
3
+ import argbind
4
+ from audiotools import ml
5
+
6
+ import dac
7
+
8
+ DAC = dac.model.DAC
9
+ Accelerator = ml.Accelerator
10
+
11
+ __MODEL_LATEST_TAGS__ = {
12
+ ("44khz", "8kbps"): "0.0.1",
13
+ ("24khz", "8kbps"): "0.0.4",
14
+ ("16khz", "8kbps"): "0.0.5",
15
+ ("44khz", "16kbps"): "1.0.0",
16
+ }
17
+
18
+ __MODEL_URLS__ = {
19
+ (
20
+ "44khz",
21
+ "0.0.1",
22
+ "8kbps",
23
+ ): "https://github.com/descriptinc/descript-audio-codec/releases/download/0.0.1/weights.pth",
24
+ (
25
+ "24khz",
26
+ "0.0.4",
27
+ "8kbps",
28
+ ): "https://github.com/descriptinc/descript-audio-codec/releases/download/0.0.4/weights_24khz.pth",
29
+ (
30
+ "16khz",
31
+ "0.0.5",
32
+ "8kbps",
33
+ ): "https://github.com/descriptinc/descript-audio-codec/releases/download/0.0.5/weights_16khz.pth",
34
+ (
35
+ "44khz",
36
+ "1.0.0",
37
+ "16kbps",
38
+ ): "https://github.com/descriptinc/descript-audio-codec/releases/download/1.0.0/weights_44khz_16kbps.pth",
39
+ }
40
+
41
+
42
+ @argbind.bind(group="download", positional=True, without_prefix=True)
43
+ def download(
44
+ model_type: str = "44khz", model_bitrate: str = "8kbps", tag: str = "latest"
45
+ ):
46
+ """
47
+ Function that downloads the weights file from URL if a local cache is not found.
48
+
49
+ Parameters
50
+ ----------
51
+ model_type : str
52
+ The type of model to download. Must be one of "44khz", "24khz", or "16khz". Defaults to "44khz".
53
+ model_bitrate: str
54
+ Bitrate of the model. Must be one of "8kbps", or "16kbps". Defaults to "8kbps".
55
+ Only 44khz model supports 16kbps.
56
+ tag : str
57
+ The tag of the model to download. Defaults to "latest".
58
+
59
+ Returns
60
+ -------
61
+ Path
62
+ Directory path required to load model via audiotools.
63
+ """
64
+ model_type = model_type.lower()
65
+ tag = tag.lower()
66
+
67
+ assert model_type in [
68
+ "44khz",
69
+ "24khz",
70
+ "16khz",
71
+ ], "model_type must be one of '44khz', '24khz', or '16khz'"
72
+
73
+ assert model_bitrate in [
74
+ "8kbps",
75
+ "16kbps",
76
+ ], "model_bitrate must be one of '8kbps', or '16kbps'"
77
+
78
+ if tag == "latest":
79
+ tag = __MODEL_LATEST_TAGS__[(model_type, model_bitrate)]
80
+
81
+ download_link = __MODEL_URLS__.get((model_type, tag, model_bitrate), None)
82
+
83
+ if download_link is None:
84
+ raise ValueError(
85
+ f"Could not find model with tag {tag} and model type {model_type}"
86
+ )
87
+
88
+ local_path = (
89
+ Path.home()
90
+ / ".cache"
91
+ / "descript"
92
+ / "dac"
93
+ / f"weights_{model_type}_{model_bitrate}_{tag}.pth"
94
+ )
95
+ if not local_path.exists():
96
+ local_path.parent.mkdir(parents=True, exist_ok=True)
97
+
98
+ # Download the model
99
+ import requests
100
+
101
+ response = requests.get(download_link)
102
+
103
+ if response.status_code != 200:
104
+ raise ValueError(
105
+ f"Could not download model. Received response code {response.status_code}"
106
+ )
107
+ local_path.write_bytes(response.content)
108
+
109
+ return local_path
110
+
111
+
112
+ def load_model(
113
+ model_type: str = "44khz",
114
+ model_bitrate: str = "8kbps",
115
+ tag: str = "latest",
116
+ load_path: str = None,
117
+ ):
118
+ if not load_path:
119
+ load_path = download(
120
+ model_type=model_type, model_bitrate=model_bitrate, tag=tag
121
+ )
122
+ generator = DAC.load(load_path)
123
+ return generator
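A short end-to-end sketch of the helper above (illustrative; `input.wav` is a placeholder path, and the first call downloads the pretrained weights into `~/.cache/descript/dac/`):

```python
import torch
from audiotools import AudioSignal
from dac.utils import load_model

device = "cuda" if torch.cuda.is_available() else "cpu"
model = load_model(model_type="44khz", model_bitrate="8kbps", tag="latest")
model.to(device).eval()

signal = AudioSignal("input.wav")        # placeholder input file
artifact = model.compress(signal)        # DACFile holding codes + metadata
recons = model.decompress(artifact)      # AudioSignal reconstruction
recons.write("reconstruction.wav")
```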
dac/utils/decode.py ADDED
@@ -0,0 +1,95 @@
1
+ import warnings
2
+ from pathlib import Path
3
+
4
+ import argbind
5
+ import numpy as np
6
+ import torch
7
+ from audiotools import AudioSignal
8
+ from tqdm import tqdm
9
+
10
+ from dac import DACFile
11
+ from dac.utils import load_model
12
+
13
+ warnings.filterwarnings("ignore", category=UserWarning)
14
+
15
+
16
+ @argbind.bind(group="decode", positional=True, without_prefix=True)
17
+ @torch.inference_mode()
18
+ @torch.no_grad()
19
+ def decode(
20
+ input: str,
21
+ output: str = "",
22
+ weights_path: str = "",
23
+ model_tag: str = "latest",
24
+ model_bitrate: str = "8kbps",
25
+ device: str = "cuda",
26
+ model_type: str = "44khz",
27
+ verbose: bool = False,
28
+ ):
29
+ """Decode audio from codes.
30
+
31
+ Parameters
32
+ ----------
33
+ input : str
34
+ Path to input directory or file
35
+ output : str, optional
36
+ Path to output directory, by default "".
37
+ If `input` is a directory, the directory sub-tree relative to `input` is re-created in `output`.
38
+ weights_path : str, optional
39
+ Path to weights file, by default "". If not specified, the weights file will be downloaded from the internet using the
40
+ model_tag and model_type.
41
+ model_tag : str, optional
42
+ Tag of the model to use, by default "latest". Ignored if `weights_path` is specified.
43
+ model_bitrate: str
44
+ Bitrate of the model. Must be one of "8kbps", or "16kbps". Defaults to "8kbps".
45
+ device : str, optional
46
+ Device to use, by default "cuda". If "cpu", the model will be loaded on the CPU.
47
+ model_type : str, optional
48
+ The type of model to use. Must be one of "44khz", "24khz", or "16khz". Defaults to "44khz". Ignored if `weights_path` is specified.
49
+ """
50
+ generator = load_model(
51
+ model_type=model_type,
52
+ model_bitrate=model_bitrate,
53
+ tag=model_tag,
54
+ load_path=weights_path,
55
+ )
56
+ generator.to(device)
57
+ generator.eval()
58
+
59
+ # Find all .dac files in input directory
60
+ _input = Path(input)
61
+ input_files = list(_input.glob("**/*.dac"))
62
+
63
+ # If input is a .dac file, add it to the list
64
+ if _input.suffix == ".dac":
65
+ input_files.append(_input)
66
+
67
+ # Create output directory
68
+ output = Path(output)
69
+ output.mkdir(parents=True, exist_ok=True)
70
+
71
+ for i in tqdm(range(len(input_files)), desc=f"Decoding files"):
72
+ # Load file
73
+ artifact = DACFile.load(input_files[i])
74
+
75
+ # Reconstruct audio from codes
76
+ recons = generator.decompress(artifact, verbose=verbose)
77
+
78
+ # Compute output path
79
+ relative_path = input_files[i].relative_to(input)
80
+ output_dir = output / relative_path.parent
81
+ if not relative_path.name:
82
+ output_dir = output
83
+ relative_path = input_files[i]
84
+ output_name = relative_path.with_suffix(".wav").name
85
+ output_path = output_dir / output_name
86
+ output_path.parent.mkdir(parents=True, exist_ok=True)
87
+
88
+ # Write to file
89
+ recons.write(output_path)
90
+
91
+
92
+ if __name__ == "__main__":
93
+ args = argbind.parse_args()
94
+ with argbind.scope(args):
95
+ decode()
dac/utils/encode.py ADDED
@@ -0,0 +1,94 @@
1
+ import math
2
+ import warnings
3
+ from pathlib import Path
4
+
5
+ import argbind
6
+ import numpy as np
7
+ import torch
8
+ from audiotools import AudioSignal
9
+ from audiotools.core import util
10
+ from tqdm import tqdm
11
+
12
+ from dac.utils import load_model
13
+
14
+ warnings.filterwarnings("ignore", category=UserWarning)
15
+
16
+
17
+ @argbind.bind(group="encode", positional=True, without_prefix=True)
18
+ @torch.inference_mode()
19
+ @torch.no_grad()
20
+ def encode(
21
+ input: str,
22
+ output: str = "",
23
+ weights_path: str = "",
24
+ model_tag: str = "latest",
25
+ model_bitrate: str = "8kbps",
26
+ n_quantizers: int = None,
27
+ device: str = "cuda",
28
+ model_type: str = "44khz",
29
+ win_duration: float = 5.0,
30
+ verbose: bool = False,
31
+ ):
32
+ """Encode audio files in input path to .dac format.
33
+
34
+ Parameters
35
+ ----------
36
+ input : str
37
+ Path to input audio file or directory
38
+ output : str, optional
39
+ Path to output directory, by default "". If `input` is a directory, the directory sub-tree relative to `input` is re-created in `output`.
40
+ weights_path : str, optional
41
+ Path to weights file, by default "". If not specified, the weights file will be downloaded from the internet using the
42
+ model_tag and model_type.
43
+ model_tag : str, optional
44
+ Tag of the model to use, by default "latest". Ignored if `weights_path` is specified.
45
+ model_bitrate: str
46
+ Bitrate of the model. Must be one of "8kbps", or "16kbps". Defaults to "8kbps".
47
+ n_quantizers : int, optional
48
+ Number of quantizers to use, by default None. If not specified, all the quantizers will be used and the model will compress at maximum bitrate.
49
+ device : str, optional
50
+ Device to use, by default "cuda"
51
+ model_type : str, optional
52
+ The type of model to use. Must be one of "44khz", "24khz", or "16khz". Defaults to "44khz". Ignored if `weights_path` is specified.
53
+ """
54
+ generator = load_model(
55
+ model_type=model_type,
56
+ model_bitrate=model_bitrate,
57
+ tag=model_tag,
58
+ load_path=weights_path,
59
+ )
60
+ generator.to(device)
61
+ generator.eval()
62
+ kwargs = {"n_quantizers": n_quantizers}
63
+
64
+ # Find all audio files in input path
65
+ input = Path(input)
66
+ audio_files = util.find_audio(input)
67
+
68
+ output = Path(output)
69
+ output.mkdir(parents=True, exist_ok=True)
70
+
71
+ for i in tqdm(range(len(audio_files)), desc="Encoding files"):
72
+ # Load file
73
+ signal = AudioSignal(audio_files[i])
74
+
75
+ # Encode audio to .dac format
76
+ artifact = generator.compress(signal, win_duration, verbose=verbose, **kwargs)
77
+
78
+ # Compute output path
79
+ relative_path = audio_files[i].relative_to(input)
80
+ output_dir = output / relative_path.parent
81
+ if not relative_path.name:
82
+ output_dir = output
83
+ relative_path = audio_files[i]
84
+ output_name = relative_path.with_suffix(".dac").name
85
+ output_path = output_dir / output_name
86
+ output_path.parent.mkdir(parents=True, exist_ok=True)
87
+
88
+ artifact.save(output_path)
89
+
90
+
91
+ if __name__ == "__main__":
92
+ args = argbind.parse_args()
93
+ with argbind.scope(args):
94
+ encode()
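Both scripts above are argbind CLIs, but they can also be driven directly from Python; a sketch with placeholder paths (none of these directories ship with the repo):

```python
from dac.utils.encode import encode
from dac.utils.decode import decode

# Compress every audio file under ./audio_in into .dac artifacts in ./codes ...
encode(input="./audio_in", output="./codes", model_type="44khz", device="cpu")

# ... then reconstruct waveforms from those artifacts into ./audio_out.
decode(input="./codes", output="./audio_out", model_type="44khz", device="cpu")
```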
data/ft_dataset.py ADDED
@@ -0,0 +1,133 @@
1
+ import torch
2
+ import librosa
3
+ import numpy as np
4
+ import random
5
+ import os
6
+ from torch.utils.data import DataLoader
7
+ from modules.audio import mel_spectrogram
8
+
9
+
10
+ duration_setting = {
11
+ "min": 1.0,
12
+ "max": 30.0,
13
+ }
14
+ # assume single speaker
15
+ class FT_Dataset(torch.utils.data.Dataset):
16
+ def __init__(self,
17
+ data_path,
18
+ spect_params,
19
+ sr=22050,
20
+ batch_size=1,
21
+ ):
22
+ self.data_path = data_path
23
+ # recursively find all files in data_path
24
+ self.data = []
25
+ for root, _, files in os.walk(data_path):
26
+ for file in files:
27
+ if (file.endswith(".wav") or
28
+ file.endswith(".mp3") or
29
+ file.endswith(".flac") or
30
+ file.endswith(".ogg") or
31
+ file.endswith(".m4a") or
32
+ file.endswith(".opus")):
33
+ self.data.append(os.path.join(root, file))
34
+
35
+ mel_fn_args = {
36
+ "n_fft": spect_params['n_fft'],
37
+ "win_size": spect_params['win_length'],
38
+ "hop_size": spect_params['hop_length'],
39
+ "num_mels": spect_params['n_mels'],
40
+ "sampling_rate": sr,
41
+ "fmin": spect_params['fmin'],
42
+ "fmax": None if spect_params['fmax'] == "None" else spect_params['fmax'],
43
+ "center": False
44
+ }
45
+ self.to_mel = lambda x: mel_spectrogram(x, **mel_fn_args)
46
+ self.sr = sr
47
+
48
+ assert len(self.data) != 0
49
+ # if dataset length is less than batch size, repeat the dataset
50
+ while len(self.data) < batch_size:
51
+ self.data += self.data
52
+
53
+ def __len__(self):
54
+ return len(self.data)
55
+
56
+ def __getitem__(self, idx):
57
+ idx = idx % len(self.data)
58
+ wav_path = self.data[idx]
59
+ try:
60
+ speech, orig_sr = librosa.load(wav_path, sr=self.sr)
61
+ except Exception as e:
62
+ print(f"Failed to load wav file with error {e}")
63
+ return self.__getitem__(random.randint(0, len(self)))
64
+ if len(speech) < self.sr * duration_setting["min"] or len(speech) > self.sr * duration_setting["max"]:
65
+ print(f"Audio {wav_path} is too short or too long, skipping")
66
+ return self.__getitem__(random.randint(0, len(self)))
67
+ return_dict = {
68
+ 'audio': speech,
69
+ 'sr': orig_sr
70
+ }
71
+ wave, orig_sr = return_dict['audio'], return_dict['sr']
72
+ if orig_sr != self.sr:
73
+ wave = librosa.resample(wave, orig_sr=orig_sr, target_sr=self.sr)
74
+ wave = torch.from_numpy(wave).float()
75
+ mel = self.to_mel(wave.unsqueeze(0)).squeeze(0)
76
+
77
+ return wave, mel
78
+
79
+ def build_ft_dataloader(data_path, spect_params, sr, batch_size=1, num_workers=0):
80
+ dataset = FT_Dataset(data_path, spect_params, sr, batch_size)
81
+ dataloader = torch.utils.data.DataLoader(
82
+ dataset,
83
+ batch_size=batch_size,
84
+ shuffle=True,
85
+ num_workers=num_workers,
86
+ collate_fn=collate,
87
+ )
88
+ return dataloader
89
+
90
+ def collate(batch):
91
+ batch_size = len(batch)
92
+
93
+ # sort by mel length
94
+ lengths = [b[1].shape[1] for b in batch]
95
+ batch_indexes = np.argsort(lengths)[::-1]
96
+ batch = [batch[bid] for bid in batch_indexes]
97
+
98
+ nmels = batch[0][1].size(0)
99
+ max_mel_length = max([b[1].shape[1] for b in batch])
100
+ max_wave_length = max([b[0].size(0) for b in batch])
101
+
102
+ mels = torch.zeros((batch_size, nmels, max_mel_length)).float() - 10
103
+ waves = torch.zeros((batch_size, max_wave_length)).float()
104
+
105
+ mel_lengths = torch.zeros(batch_size).long()
106
+ wave_lengths = torch.zeros(batch_size).long()
107
+
108
+ for bid, (wave, mel) in enumerate(batch):
109
+ mel_size = mel.size(1)
110
+ mels[bid, :, :mel_size] = mel
111
+ waves[bid, : wave.size(0)] = wave
112
+ mel_lengths[bid] = mel_size
113
+ wave_lengths[bid] = wave.size(0)
114
+
115
+ return waves, mels, wave_lengths, mel_lengths
116
+
117
+ if __name__ == "__main__":
118
+ data_path = "./example/reference"
119
+ sr = 22050
120
+ spect_params = {
121
+ "n_fft": 1024,
122
+ "win_length": 1024,
123
+ "hop_length": 256,
124
+ "n_mels": 80,
125
+ "fmin": 0,
126
+ "fmax": 8000,
127
+ }
128
+ dataloader = build_ft_dataloader(data_path, spect_params, sr, batch_size=2, num_workers=0)
129
+ for idx, batch in enumerate(dataloader):
130
+ wave, mel, wave_lengths, mel_lengths = batch
131
+ print(wave.shape, mel.shape)
132
+ if idx == 10:
133
+ break
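A sketch of consuming a collated batch outside the `__main__` above (illustrative; it assumes a few 1-30 s audio files live under `./examples/reference`). Mel frames beyond `mel_lengths` are padding (filled with -10), so downstream losses should be masked accordingly.

```python
import torch
from data.ft_dataset import build_ft_dataloader

spect_params = {"n_fft": 1024, "win_length": 1024, "hop_length": 256,
                "n_mels": 80, "fmin": 0, "fmax": 8000}
loader = build_ft_dataloader("./examples/reference", spect_params, sr=22050, batch_size=2)

waves, mels, wave_lengths, mel_lengths = next(iter(loader))
mel_mask = torch.arange(mels.size(-1))[None, :] < mel_lengths[:, None]   # (B, T) validity mask
print(mels.shape, mel_lengths.tolist(), mel_mask.sum(dim=1).tolist())
```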
eval.py ADDED
@@ -0,0 +1,550 @@
1
+ import shutil
2
+ import warnings
3
+ import argparse
4
+ import torch
5
+ import os
6
+ import os.path as osp
7
+ import yaml
8
+
9
+ warnings.simplefilter("ignore")
10
+
11
+ # load packages
12
+ import random
13
+
14
+ from tqdm import tqdm
15
+ from modules.commons import *
16
+ import time
17
+
18
+ import torchaudio
19
+ import librosa
20
+ import torchaudio.compliance.kaldi as kaldi
21
+
22
+ from hf_utils import load_custom_model_from_hf
23
+ from resemblyzer import preprocess_wav, VoiceEncoder
24
+
25
+ # Load model and configuration
26
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
27
+
28
+ from transformers import Wav2Vec2FeatureExtractor, WavLMForXVector
29
+ from transformers import Wav2Vec2Processor, HubertForCTC
30
+
31
+ import jiwer
32
+ import string
33
+
34
+ from baselines.dnsmos.dnsmos_computor import DNSMOSComputer
35
+
36
+ def calc_mos(computor, audio, orin_sr):
37
+ # only 16k audio is supported
38
+ target_sr = 16000
39
+ if orin_sr != 16000:
40
+ audio = librosa.resample(
41
+ audio, orig_sr=orin_sr, target_sr=target_sr, res_type="kaiser_fast"
42
+ )
43
+ result = computor.compute(audio, target_sr, False)
44
+ sig, bak, ovr = result["SIG"], result["BAK"], result["OVRL"]
45
+
46
+ if ovr == 0:
47
+ print("calculate dns mos failed")
48
+ return sig, bak, ovr
49
+
50
+ mos_computer = DNSMOSComputer(
51
+ "baselines/dnsmos/sig_bak_ovr.onnx",
52
+ "baselines/dnsmos/model_v8.onnx",
53
+ device="cuda",
54
+ device_id=0,
55
+ )
56
+
57
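+ # Load the Seed-VC DiT checkpoint together with its companion modules: the CAMP+ speaker
+ # embedding model, the vocoder selected in the config (bigvgan / hifigan / vocos), and the
+ # semantic feature extractor (whisper / cnhubert / xlsr), plus the mel-spectrogram function.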
+ def load_models(args):
58
+ dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
59
+ "DiT_seed_v2_uvit_whisper_small_wavenet_bigvgan_pruned.pth",
60
+ "config_dit_mel_seed_uvit_whisper_small_wavenet.yml")
61
+ config = yaml.safe_load(open(dit_config_path, "r"))
62
+ model_params = recursive_munch(config["model_params"])
63
+ model = build_model(model_params, stage="DiT")
64
+ hop_length = config["preprocess_params"]["spect_params"]["hop_length"]
65
+ sr = config["preprocess_params"]["sr"]
66
+
67
+ # Load checkpoints
68
+ model, _, _, _ = load_checkpoint(
69
+ model,
70
+ None,
71
+ dit_checkpoint_path,
72
+ load_only_params=True,
73
+ ignore_modules=[],
74
+ is_distributed=False,
75
+ )
76
+ for key in model:
77
+ model[key].eval()
78
+ model[key].to(device)
79
+ model.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)
80
+
81
+ # Load additional modules
82
+ from modules.campplus.DTDNN import CAMPPlus
83
+
84
+ campplus_ckpt_path = load_custom_model_from_hf(
85
+ "funasr/campplus", "campplus_cn_common.bin", config_filename=None
86
+ )
87
+ campplus_model = CAMPPlus(feat_dim=80, embedding_size=192)
88
+ campplus_model.load_state_dict(torch.load(campplus_ckpt_path, map_location="cpu"))
89
+ campplus_model.eval()
90
+ campplus_model.to(device)
91
+
92
+ vocoder_type = model_params.vocoder.type
93
+
94
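+ # Select the vocoder declared in the config; each branch exposes a mel-to-waveform callable as vocoder_fn.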
+ if vocoder_type == 'bigvgan':
95
+ from modules.bigvgan import bigvgan
96
+ bigvgan_name = model_params.vocoder.name
97
+ bigvgan_model = bigvgan.BigVGAN.from_pretrained(bigvgan_name, use_cuda_kernel=False)
98
+ # remove weight norm in the model and set to eval mode
99
+ bigvgan_model.remove_weight_norm()
100
+ bigvgan_model = bigvgan_model.eval().to(device)
101
+ vocoder_fn = bigvgan_model
102
+ elif vocoder_type == 'hifigan':
103
+ from modules.hifigan.generator import HiFTGenerator
104
+ from modules.hifigan.f0_predictor import ConvRNNF0Predictor
105
+ hift_config = yaml.safe_load(open('configs/hifigan.yml', 'r'))
106
+ hift_gen = HiFTGenerator(**hift_config['hift'], f0_predictor=ConvRNNF0Predictor(**hift_config['f0_predictor']))
107
+ hift_gen.load_state_dict(torch.load(hift_config['pretrained_model_path'], map_location='cpu'))
108
+ hift_gen.eval()
109
+ hift_gen.to(device)
110
+ vocoder_fn = hift_gen
111
+ elif vocoder_type == "vocos":
112
+ vocos_config = yaml.safe_load(open(model_params.vocoder.vocos.config, 'r'))
113
+ vocos_path = model_params.vocoder.vocos.path
114
+ vocos_model_params = recursive_munch(vocos_config['model_params'])
115
+ vocos = build_model(vocos_model_params, stage='mel_vocos')
116
+ vocos_checkpoint_path = vocos_path
117
+ vocos, _, _, _ = load_checkpoint(vocos, None, vocos_checkpoint_path,
118
+ load_only_params=True, ignore_modules=[], is_distributed=False)
119
+ _ = [vocos[key].eval().to(device) for key in vocos]
120
+ _ = [vocos[key].to(device) for key in vocos]
121
+ total_params = sum(sum(p.numel() for p in vocos[key].parameters() if p.requires_grad) for key in vocos.keys())
122
+ print(f"Vocoder model total parameters: {total_params / 1_000_000:.2f}M")
123
+ vocoder_fn = vocos.decoder
124
+ else:
125
+ raise ValueError(f"Unsupported vocoder type: {vocoder_type}")
126
+
127
+ speech_tokenizer_type = model_params.speech_tokenizer.type
128
+ if speech_tokenizer_type == 'whisper':
129
+ # whisper
130
+ from transformers import AutoFeatureExtractor, WhisperModel
131
+ whisper_name = model_params.speech_tokenizer.name
132
+ whisper_model = WhisperModel.from_pretrained(whisper_name, torch_dtype=torch.float16).to(device)
133
+ del whisper_model.decoder
134
+ whisper_feature_extractor = AutoFeatureExtractor.from_pretrained(whisper_name)
135
+
136
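+ # Semantic features from the Whisper encoder only (the decoder was deleted above);
+ # the output is truncated to one frame per 320 input samples at 16 kHz.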
+ def semantic_fn(waves_16k):
137
+ ori_inputs = whisper_feature_extractor([waves_16k.squeeze(0).cpu().numpy()],
138
+ return_tensors="pt",
139
+ return_attention_mask=True)
140
+ ori_input_features = whisper_model._mask_input_features(
141
+ ori_inputs.input_features, attention_mask=ori_inputs.attention_mask).to(device)
142
+ with torch.no_grad():
143
+ ori_outputs = whisper_model.encoder(
144
+ ori_input_features.to(whisper_model.encoder.dtype),
145
+ head_mask=None,
146
+ output_attentions=False,
147
+ output_hidden_states=False,
148
+ return_dict=True,
149
+ )
150
+ S_ori = ori_outputs.last_hidden_state.to(torch.float32)
151
+ S_ori = S_ori[:, :waves_16k.size(-1) // 320 + 1]
152
+ return S_ori
153
+ elif speech_tokenizer_type == 'cnhubert':
154
+ from transformers import (
155
+ Wav2Vec2FeatureExtractor,
156
+ HubertModel,
157
+ )
158
+ hubert_model_name = config['model_params']['speech_tokenizer']['name']
159
+ hubert_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(hubert_model_name)
160
+ hubert_model = HubertModel.from_pretrained(hubert_model_name)
161
+ hubert_model = hubert_model.to(device)
162
+ hubert_model = hubert_model.eval()
163
+ hubert_model = hubert_model.half()
164
+
165
+ def semantic_fn(waves_16k):
166
+ ori_waves_16k_input_list = [
167
+ waves_16k[bib].cpu().numpy()
168
+ for bib in range(len(waves_16k))
169
+ ]
170
+ ori_inputs = hubert_feature_extractor(ori_waves_16k_input_list,
171
+ return_tensors="pt",
172
+ return_attention_mask=True,
173
+ padding=True,
174
+ sampling_rate=16000).to(device)
175
+ with torch.no_grad():
176
+ ori_outputs = hubert_model(
177
+ ori_inputs.input_values.half(),
178
+ )
179
+ S_ori = ori_outputs.last_hidden_state.float()
180
+ return S_ori
181
+ elif speech_tokenizer_type == 'xlsr':
182
+ from transformers import (
183
+ Wav2Vec2FeatureExtractor,
184
+ Wav2Vec2Model,
185
+ )
186
+ model_name = config['model_params']['speech_tokenizer']['name']
187
+ output_layer = config['model_params']['speech_tokenizer']['output_layer']
188
+ wav2vec_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
189
+ wav2vec_model = Wav2Vec2Model.from_pretrained(model_name)
190
+ wav2vec_model.encoder.layers = wav2vec_model.encoder.layers[:output_layer]
191
+ wav2vec_model = wav2vec_model.to(device)
192
+ wav2vec_model = wav2vec_model.eval()
193
+ wav2vec_model = wav2vec_model.half()
194
+
195
+ def semantic_fn(waves_16k):
196
+ ori_waves_16k_input_list = [
197
+ waves_16k[bib].cpu().numpy()
198
+ for bib in range(len(waves_16k))
199
+ ]
200
+ ori_inputs = wav2vec_feature_extractor(ori_waves_16k_input_list,
201
+ return_tensors="pt",
202
+ return_attention_mask=True,
203
+ padding=True,
204
+ sampling_rate=16000).to(device)
205
+ with torch.no_grad():
206
+ ori_outputs = wav2vec_model(
207
+ ori_inputs.input_values.half(),
208
+ )
209
+ S_ori = ori_outputs.last_hidden_state.float()
210
+ return S_ori
211
+ else:
212
+ raise ValueError(f"Unsupported speech tokenizer type: {model_params.speech_tokenizer.type}")
213
+ # Generate mel spectrograms
214
+ mel_fn_args = {
215
+ "n_fft": config['preprocess_params']['spect_params']['n_fft'],
216
+ "win_size": config['preprocess_params']['spect_params']['win_length'],
217
+ "hop_size": config['preprocess_params']['spect_params']['hop_length'],
218
+ "num_mels": config['preprocess_params']['spect_params']['n_mels'],
219
+ "sampling_rate": sr,
220
+ "fmin": config['preprocess_params'].get('fmin', 0),
221
+ "fmax": None if config['preprocess_params']['spect_params'].get('fmax', "None") == "None" else 8000,
222
+ "center": False
223
+ }
224
+ from modules.audio import mel_spectrogram
225
+
226
+ to_mel = lambda x: mel_spectrogram(x, **mel_fn_args)
227
+
228
+ return (
229
+ model,
230
+ semantic_fn,
231
+ vocoder_fn,
232
+ campplus_model,
233
+ to_mel,
234
+ mel_fn_args,
235
+ )
236
+
237
+
238
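+ # Evaluation loop: for each (source, target) pair, run Seed-VC conversion (or a baseline),
+ # then score speaker similarity (WavLM x-vector / Resemblyzer), WER/CER of the converted and
+ # source audio against the source transcript using HuBERT-CTC ASR, and DNSMOS quality.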
+ @torch.no_grad()
239
+ def main(args):
240
+ # init xvector models
241
+ if args.xvector_extractor == "wavlm":
242
+ wavlm_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
243
+ "microsoft/wavlm-base-plus-sv"
244
+ )
245
+ wavlm_model = WavLMForXVector.from_pretrained(
246
+ "microsoft/wavlm-base-plus-sv"
247
+ ).to(device)
248
+ elif args.xvector_extractor == "resemblyzer":
249
+ resemblyzer_encoder = VoiceEncoder()
250
+ elif args.xvector_extractor == 'wavlm-large':
251
+ import sys
252
+ sys.path.append("../UniSpeech/downstreams/speaker_verification")
253
+ from verification import init_model
254
+ wavlm_model = init_model("wavlm_large", "D:/wavlm_large_finetune.pth")
255
+ wavlm_model.cuda()
256
+ wavlm_model.eval()
257
+ else:
258
+ raise ValueError(f"Unknown xvector extractor: {args.xvector_extractor}")
259
+
260
+ # init asr model
261
+ asr_processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
262
+ asr_model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft").to(device)
263
+
264
+ (
265
+ model,
266
+ semantic_fn,
267
+ vocoder_fn,
268
+ campplus_model,
269
+ to_mel,
270
+ mel_fn_args,
271
+ ) = load_models(args)
272
+ sr = mel_fn_args["sampling_rate"]
273
+
274
+ source_dir = args.source
275
+ target_dir = args.target
276
+ diffusion_steps = args.diffusion_steps
277
+ length_adjust = args.length_adjust
278
+ inference_cfg_rate = args.inference_cfg_rate
279
+ baseline = args.baseline
280
+ max_samples = args.max_samples
281
+ try:
282
+ source_audio_list = open(osp.join(source_dir, "index.tsv"), "r").readlines()
283
+ except FileNotFoundError:
284
+ source_audio_list = os.listdir(source_dir)
285
+ source_audio_list = [f for f in source_audio_list if f.endswith(".wav")]
286
+ target_audio_list = os.listdir(target_dir)
287
+
288
+ conversion_result_dir = args.output
289
+ if baseline:
290
+ conversion_result_dir = os.path.join(conversion_result_dir, baseline)
291
+ os.makedirs(conversion_result_dir, exist_ok=True)
292
+
293
+ similarity_list = []
294
+ gt_wer_list = []
295
+ gt_cer_list = []
296
+ vc_wer_list = []
297
+ vc_cer_list = []
298
+ dnsmos_list = []
299
+ for source_i, source_line in enumerate(tqdm(source_audio_list)):
300
+ if source_i >= max_samples:
301
+ break
302
+ source_index, source_transcript = source_line.strip().split("\t")
303
+ source_path = osp.join(source_dir, f"{source_index}.wav")
304
+ for target_i, target_name in enumerate(target_audio_list):
305
+ target_path = osp.join(target_dir, target_name)
306
+ print(f"Processing {source_path} -> {target_path}")
307
+
308
+ if os.path.exists(osp.join(conversion_result_dir, source_index, f"{target_name}")):
309
+ # already converted, load the converted file
310
+ vc_wave_16k, _ = librosa.load(
311
+ osp.join(conversion_result_dir, source_index, f"{target_name}"), sr=16000
312
+ )
313
+ vc_wave_16k = torch.tensor(vc_wave_16k).unsqueeze(0)
314
+ ref_waves_16k, _ = librosa.load(target_path, sr=16000)
315
+ ref_waves_16k = torch.tensor(ref_waves_16k).unsqueeze(0)
316
+ else:
317
+ if baseline == "openvoice":
318
+ from baselines.openvoice import convert as openvoice_convert
319
+ ref_waves_16k, vc_wave_16k = openvoice_convert(source_path, target_path, "temp.wav")
320
+ elif baseline == "cosyvoice":
321
+ from baselines.cosyvoice import convert as cosyvoice_convert
322
+ ref_waves_16k, vc_wave_16k = cosyvoice_convert(source_path, target_path, "temp.wav")
323
+ else:
324
+ ref_waves_16k, vc_wave = convert(
325
+ source_path,
326
+ target_path,
327
+ model,
328
+ semantic_fn,
329
+ vocoder_fn,
330
+ campplus_model,
331
+ to_mel,
332
+ mel_fn_args,
333
+ sr,
334
+ length_adjust,
335
+ diffusion_steps,
336
+ inference_cfg_rate,
337
+ remove_prompt=args.remove_prompt,
338
+ )
339
+ vc_wave_16k = torchaudio.functional.resample(vc_wave, sr, 16000)
340
+ os.makedirs(osp.join(conversion_result_dir, source_index), exist_ok=True)
341
+ torchaudio.save(
342
+ osp.join(conversion_result_dir, source_index, f"{target_name}"),
343
+ vc_wave_16k.cpu(),
344
+ 16000,
345
+ )
346
+ if args.xvector_extractor == "wavlm":
347
+ ref_inputs = wavlm_feature_extractor(
348
+ ref_waves_16k.squeeze(0).cpu(), padding=True, return_tensors="pt"
349
+ ).to(device)
350
+ ref_embeddings = wavlm_model(**ref_inputs).embeddings
351
+ ref_embeddings = torch.nn.functional.normalize(ref_embeddings, dim=-1).cpu()
352
+
353
+ vc_inputs = wavlm_feature_extractor(
354
+ vc_wave_16k.squeeze(0).cpu(), padding=True, return_tensors="pt"
355
+ ).to(device)
356
+ vc_embeddings = wavlm_model(**vc_inputs).embeddings
357
+ vc_embeddings = torch.nn.functional.normalize(vc_embeddings, dim=-1).cpu()
358
+
359
+ similarity = torch.nn.functional.cosine_similarity(
360
+ ref_embeddings, vc_embeddings, dim=-1
361
+ )
362
+ elif args.xvector_extractor == "resemblyzer":
363
+ ref_wav_resemblyzer = preprocess_wav(target_path)
364
+ vc_wav_resemblyzer = preprocess_wav(
365
+ osp.join(conversion_result_dir, source_index, f"{target_name}")
366
+ )
367
+ ref_embed = resemblyzer_encoder.embed_utterance(ref_wav_resemblyzer)
368
+ vc_embed = resemblyzer_encoder.embed_utterance(vc_wav_resemblyzer)
369
+ similarity = np.inner(ref_embed, vc_embed)
370
+ elif args.xvector_extractor == 'wavlm-large':
371
+ ref_embed = wavlm_model(ref_waves_16k.to(device)).cpu()
372
+ vc_embed = wavlm_model(vc_wave_16k.to(device)).cpu()
373
+ similarity = torch.nn.functional.cosine_similarity(ref_embed, vc_embed, dim=-1)
374
+ else:
375
+ raise ValueError(f"Unknown xvector extractor: {args.xvector_extractor}")
376
+ print(f"Similarity: {similarity}")
377
+ similarity_list.append(similarity)
378
+
379
+ # perform asr
380
+ vc_asr_inputs = asr_processor(
381
+ vc_wave_16k.squeeze(0).cpu(), return_tensors="pt", padding=True
382
+ ).to(device)
383
+ vc_asr_logits = asr_model(**vc_asr_inputs).logits
384
+ predicted_ids = torch.argmax(vc_asr_logits, dim=-1)
385
+ vc_transcription = asr_processor.decode(predicted_ids[0])
386
+
387
+ # perform asr on source 16k
388
+ source_wav_16k = librosa.load(source_path, sr=16000)[0]
389
+ source_asr_inputs = asr_processor(
390
+ source_wav_16k, return_tensors="pt", padding=True
391
+ ).to(device)
392
+ source_asr_logits = asr_model(**source_asr_inputs).logits
393
+ source_predicted_ids = torch.argmax(source_asr_logits, dim=-1)
394
+ source_transcription = asr_processor.decode(source_predicted_ids[0])
395
+
396
+ # convert transcriptions to all lower to calculate WER and CER
397
+ source_transcript = source_transcript.lower()
398
+ # remove punctuations in source_transcript
399
+ source_transcript = source_transcript.translate(str.maketrans("", "", string.punctuation))
400
+ source_transcription = source_transcription.lower()
401
+ vc_transcription = vc_transcription.lower()
402
+
403
+ # calculate WER and CER
404
+ gt_wer = jiwer.wer(source_transcript, source_transcription)
405
+ gt_cer = jiwer.cer(source_transcript, source_transcription)
406
+ vc_wer = jiwer.wer(source_transcript, vc_transcription)
407
+ vc_cer = jiwer.cer(source_transcript, vc_transcription)
408
+
409
+ print(f"GT WER: {gt_wer}, CER: {gt_cer}")
410
+ print(f"VC WER: {vc_wer}, CER: {vc_cer}")
411
+ gt_wer_list.append(gt_wer)
412
+ gt_cer_list.append(gt_cer)
413
+ vc_wer_list.append(vc_wer)
414
+ vc_cer_list.append(vc_cer)
415
+
416
+ # calculate dnsmos
417
+ sig, bak, ovr = calc_mos(mos_computer, vc_wave_16k.squeeze(0).cpu().numpy(), 16000)
418
+ dnsmos_list.append((sig, bak, ovr))
419
+
420
+ print(f"Average GT WER: {sum(gt_wer_list) / len(gt_wer_list)}")
421
+ print(f"Average GT CER: {sum(gt_cer_list) / len(gt_cer_list)}")
422
+ print(f"Average VC WER: {sum(vc_wer_list) / len(vc_wer_list)}")
423
+ print(f"Average VC CER: {sum(vc_cer_list) / len(vc_cer_list)}")
424
+ print(f"Average similarity: {sum(similarity_list) / len(similarity_list)}")
425
+
426
+ print(f"Average DNS MOS SIG: {sum([x[0] for x in dnsmos_list]) / len(dnsmos_list)}")
427
+ print(f"Average DNS MOS BAK: {sum([x[1] for x in dnsmos_list]) / len(dnsmos_list)}")
428
+ print(f"Average DNS MOS OVR: {sum([x[2] for x in dnsmos_list]) / len(dnsmos_list)}")
429
+
430
+ # save wer and cer result into this directory as a txt
431
+ with open(osp.join(conversion_result_dir, source_index, "result.txt"), 'w') as f:
432
+ f.write(f"GT WER: {sum(gt_wer_list[-len(target_audio_list):]) / len(target_audio_list)}\n")
433
+ f.write(f"GT CER: {sum(gt_cer_list[-len(target_audio_list):]) / len(target_audio_list)}\n")
434
+ f.write(f"VC WER: {sum(vc_wer_list[-len(target_audio_list):]) / len(target_audio_list)}\n")
435
+ f.write(f"VC CER: {sum(vc_cer_list[-len(target_audio_list):]) / len(target_audio_list)}\n")
436
+ f.write(f"Average similarity: {sum(similarity_list[-len(target_audio_list):]) / len(target_audio_list)}\n")
437
+
438
+ print(f"Average WER: {sum(gt_wer_list) / len(gt_wer_list)}")
439
+ print(f"Average CER: {sum(gt_cer_list) / len(gt_cer_list)}")
440
+ print(f"Average WER: {sum(vc_wer_list) / len(vc_wer_list)}")
441
+ print(f"Average CER: {sum(vc_cer_list) / len(vc_cer_list)}")
442
+ print(f"Average similarity: {sum(similarity_list) / len(similarity_list)}")
443
+ # save similarity list
444
+ with open(osp.join(conversion_result_dir, f"{args.xvector_extractor}_similarity.tsv"), "w") as f:
445
+ f.write("\n".join([str(s) for s in similarity_list]))
446
+ # save wer and cer result into this directory as a txt
447
+ with open(osp.join(conversion_result_dir, "result.txt"), 'w') as f:
448
+ f.write(f"GT WER: {sum(gt_wer_list) / len(gt_wer_list)}\n")
449
+ f.write(f"GT CER: {sum(gt_cer_list) / len(gt_cer_list)}\n")
450
+ f.write(f"VC WER: {sum(vc_wer_list) / len(vc_wer_list)}\n")
451
+ f.write(f"VC CER: {sum(vc_cer_list) / len(vc_cer_list)}\n")
452
+
453
+ print(f"Average DNS MOS SIG: {sum([x[0] for x in dnsmos_list]) / len(dnsmos_list)}")
454
+ print(f"Average DNS MOS BAK: {sum([x[1] for x in dnsmos_list]) / len(dnsmos_list)}")
455
+ print(f"Average DNS MOS OVR: {sum([x[2] for x in dnsmos_list]) / len(dnsmos_list)}")
456
+
457
+
458
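+ # Single-pair conversion: extract semantic features for source and reference, length-regulate
+ # them, run CFM inference conditioned on the reference mel and CAMP+ style embedding, and
+ # vocode the predicted mel back to a waveform.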
+ def convert(
459
+ source_path,
460
+ target_path,
461
+ model,
462
+ semantic_fn,
463
+ vocoder_fn,
464
+ campplus_model,
465
+ to_mel,
466
+ mel_fn_args,
467
+ sr,
468
+ length_adjust,
469
+ diffusion_steps,
470
+ inference_cfg_rate,
471
+ remove_prompt=False,
472
+ ):
473
+ source_audio = librosa.load(source_path, sr=sr)[0]
474
+ ref_audio = librosa.load(target_path, sr=sr)[0]
475
+ # decoded_wav = encodec_model.decoder(encodec_latent)
476
+ # torchaudio.save("test.wav", decoded_wav.cpu().squeeze(0), 24000)
477
+ # crop only the first 30 seconds
478
+ source_audio = torch.tensor(source_audio).unsqueeze(0).float().to(device)
479
+ ref_audio = torch.tensor(ref_audio).unsqueeze(0).float().to(device)
480
+
481
+ if source_audio.size(1) + ref_audio.size(1) > 30 * sr:
482
+ print(f"reference audio clipped from {ref_audio.size(1)/sr} seconds to {30 * sr - source_audio.size(1)} seconds")
483
+ ref_audio = ref_audio[:, :30 * sr - source_audio.size(1)]
484
+
485
+
486
+ source_waves_16k = torchaudio.functional.resample(source_audio, sr, 16000)
487
+ ref_waves_16k = torchaudio.functional.resample(ref_audio, sr, 16000)
488
+
489
+ S_alt = semantic_fn(source_waves_16k)
490
+ S_ori = semantic_fn(ref_waves_16k)
491
+
492
+ mel = to_mel(source_audio.to(device).float())
493
+ mel2 = to_mel(ref_audio.to(device).float())
494
+
495
+ target_lengths = torch.LongTensor([int(mel.size(2) * length_adjust)]).to(mel.device)
496
+ target2_lengths = torch.LongTensor([mel2.size(2)]).to(mel2.device)
497
+
498
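+ # CAMP+ expects 80-dim Kaldi fbank features, mean-normalized per utterance, and returns a
+ # 192-dim speaker style embedding.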
+ feat2 = torchaudio.compliance.kaldi.fbank(
499
+ ref_waves_16k, num_mel_bins=80, dither=0, sample_frequency=16000
500
+ )
501
+ feat2 = feat2 - feat2.mean(dim=0, keepdim=True)
502
+ style2 = campplus_model(feat2.unsqueeze(0))
503
+ # Length regulation
504
+ cond = model.length_regulator(
505
+ S_alt, ylens=target_lengths, n_quantizers=3, f0=None
506
+ )[0]
507
+ prompt_condition = model.length_regulator(
508
+ S_ori, ylens=target2_lengths, n_quantizers=3, f0=None
509
+ )[0]
510
+ if remove_prompt:
511
+ cat_condition = cond
512
+ mel2 = torch.zeros([mel2.size(0), mel2.size(1), 0]).to(mel2.device)
513
+ else:
514
+ cat_condition = torch.cat([prompt_condition, cond], dim=1)
515
+
516
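+ # CFM inference: the reference mel (mel2) acts as the prompt prefix; the matching prefix is
+ # stripped from the output right after this call.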
+ vc_target = model.cfm.inference(
517
+ cat_condition,
518
+ torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
519
+ mel2,
520
+ style2,
521
+ None,
522
+ diffusion_steps,
523
+ inference_cfg_rate=inference_cfg_rate,
524
+ )
525
+ vc_target = vc_target[:, :, mel2.size(-1) :]
526
+
527
+ # Convert to waveform
528
+ vc_wave = vocoder_fn(vc_target).squeeze(1)
529
+
530
+ return ref_waves_16k, vc_wave
531
+
532
+
533
+ if __name__ == "__main__":
534
+ parser = argparse.ArgumentParser()
535
+ parser.add_argument(
536
+ "--source", type=str, default="./examples/libritts-test-clean/"
537
+ )
538
+ parser.add_argument("--target", type=str, default="./examples/reference/")
539
+ parser.add_argument("--output", type=str, default="./examples/eval/converted/")
540
+ parser.add_argument("--diffusion-steps", type=int, default=30)
541
+ parser.add_argument("--length-adjust", type=float, default=1.0)
542
+ parser.add_argument("--inference-cfg-rate", type=float, default=0.7)
543
+ parser.add_argument(
544
+ "--xvector-extractor", type=str, default="wavlm-large"
545
+ ) # wavlm, wavlm-large, or resemblyzer
546
+ parser.add_argument("--baseline", type=str, default="") # use "" for Seed-VC
547
+ parser.add_argument("--max-samples", type=int, default=20)
548
+ parser.add_argument("--remove-prompt", type=bool, default=False)
549
+ args = parser.parse_args()
550
+ main(args)
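+
+ # Example invocation (illustrative only; the paths below are the argparse defaults, adjust to your data):
+ # python eval.py --source ./examples/libritts-test-clean/ --target ./examples/reference/ \
+ #     --output ./examples/eval/converted/ --diffusion-steps 30 --xvector-extractor resemblyzer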
examples/reference/azuma_0.wav ADDED
Binary file (629 kB). View file
 
examples/reference/dingzhen_0.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3db260824d11f56cdf2fccf2b84ad83c95a732ddfa2f8cb8a20b68ca06ea9ff8
3
+ size 1088420
examples/reference/s1p1.wav ADDED
Binary file (701 kB). View file
 
examples/reference/s1p2.wav ADDED
Binary file (526 kB). View file
 
examples/reference/s2p1.wav ADDED
Binary file (665 kB). View file
 
examples/reference/s2p2.wav ADDED
Binary file (564 kB). View file
 
examples/reference/s3p1.wav ADDED
Binary file (557 kB). View file
 
examples/reference/s3p2.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d28df338203ad8b3c7485474fac41d9ee2891cf27bc0c0239e3249e6c0efadb
3
+ size 1140390
examples/reference/s4p1.wav ADDED
Binary file (619 kB). View file
 
examples/reference/s4p2.wav ADDED
Binary file (651 kB). View file
 
examples/reference/teio_0.wav ADDED
Binary file (366 kB). View file
 
examples/reference/trump_0.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:716becc9daf00351dfe324398edea9e8378f9453408b27612d92b6721f80ddbc
3
+ size 1379484
examples/source/TECHNOPOLIS - 2085 [vocals]_[cut_14sec].wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87087ca5260ce96659b01a647edb30bb08527ed7d0c074fb5ae1e8338cc733e5
3
+ size 2796016