Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- GPT_SoVITS/BigVGAN/alias_free_activation/cuda/__init__.py +0 -0
- GPT_SoVITS/BigVGAN/alias_free_activation/cuda/activation1d.py +69 -0
- GPT_SoVITS/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
- GPT_SoVITS/BigVGAN/alias_free_activation/cuda/anti_alias_activation_cuda.cu +246 -0
- GPT_SoVITS/BigVGAN/alias_free_activation/cuda/compat.h +29 -0
- GPT_SoVITS/BigVGAN/alias_free_activation/cuda/load.py +82 -0
- GPT_SoVITS/BigVGAN/alias_free_activation/cuda/type_shim.h +92 -0
- GPT_SoVITS/BigVGAN/alias_free_activation/torch/__init__.py +6 -0
- GPT_SoVITS/BigVGAN/alias_free_activation/torch/act.py +30 -0
- GPT_SoVITS/BigVGAN/alias_free_activation/torch/filter.py +99 -0
- GPT_SoVITS/BigVGAN/alias_free_activation/torch/resample.py +48 -0
- GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_1 +21 -0
- GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_2 +21 -0
- GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_3 +201 -0
- GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_4 +29 -0
- GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_5 +16 -0
- GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_6 +21 -0
- GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_7 +21 -0
- GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_8 +21 -0
- GPT_SoVITS/BigVGAN/tests/test_activation.py +62 -0
- GPT_SoVITS/BigVGAN/tests/test_activation_snake_beta.py +62 -0
- GPT_SoVITS/BigVGAN/tests/test_cuda_vs_torch_model.py +215 -0
- GPT_SoVITS/configs/.gitignore +1 -0
- GPT_SoVITS/configs/s1.yaml +31 -0
- GPT_SoVITS/configs/s1big.yaml +31 -0
- GPT_SoVITS/configs/s1big2.yaml +31 -0
- GPT_SoVITS/configs/s1longer-v2.yaml +31 -0
- GPT_SoVITS/configs/s1longer.yaml +31 -0
- GPT_SoVITS/configs/s1mq.yaml +77 -0
- GPT_SoVITS/configs/s2.json +91 -0
- GPT_SoVITS/configs/train.yaml +32 -0
- GPT_SoVITS/configs/tts_infer.yaml +32 -0
- GPT_SoVITS/f5_tts/model/__init__.py +13 -0
- GPT_SoVITS/f5_tts/model/backbones/README.md +20 -0
- GPT_SoVITS/f5_tts/model/backbones/dit.py +180 -0
- GPT_SoVITS/f5_tts/model/backbones/mmdit.py +146 -0
- GPT_SoVITS/f5_tts/model/backbones/unett.py +219 -0
- GPT_SoVITS/f5_tts/model/modules.py +666 -0
- GPT_SoVITS/pretrained_models/.gitignore +2 -0
- GPT_SoVITS/pretrained_models/chinese-hubert-base/config.json +72 -0
- GPT_SoVITS/pretrained_models/chinese-hubert-base/preprocessor_config.json +9 -0
- GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/config.json +34 -0
- GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/tokenizer.json +0 -0
- GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x/config.json +63 -0
- GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt +3 -0
- GPT_SoVITS/pretrained_models/s2D488k.pth +3 -0
- GPT_SoVITS/text/.gitignore +3 -0
- GPT_SoVITS/text/__init__.py +28 -0
- GPT_SoVITS/text/cantonese.py +222 -0
- GPT_SoVITS/text/chinese.py +208 -0
GPT_SoVITS/BigVGAN/alias_free_activation/cuda/__init__.py
ADDED
File without changes
|
GPT_SoVITS/BigVGAN/alias_free_activation/cuda/activation1d.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 NVIDIA CORPORATION.
|
2 |
+
# Licensed under the MIT license.
|
3 |
+
|
4 |
+
import torch
|
5 |
+
import torch.nn as nn
|
6 |
+
from alias_free_activation.torch.resample import UpSample1d, DownSample1d
|
7 |
+
|
8 |
+
# load fused CUDA kernel: this enables importing anti_alias_activation_cuda
|
9 |
+
from alias_free_activation.cuda import load
|
10 |
+
|
11 |
+
anti_alias_activation_cuda = load.load()
|
12 |
+
|
13 |
+
|
14 |
+
class FusedAntiAliasActivation(torch.autograd.Function):
|
15 |
+
"""
|
16 |
+
Assumes filter size 12, replication padding on upsampling/downsampling, and logscale alpha/beta parameters as inputs.
|
17 |
+
The hyperparameters are hard-coded in the kernel to maximize speed.
|
18 |
+
NOTE: The fused kenrel is incorrect for Activation1d with different hyperparameters.
|
19 |
+
"""
|
20 |
+
|
21 |
+
@staticmethod
|
22 |
+
def forward(ctx, inputs, up_ftr, down_ftr, alpha, beta):
|
23 |
+
activation_results = anti_alias_activation_cuda.forward(inputs, up_ftr, down_ftr, alpha, beta)
|
24 |
+
|
25 |
+
return activation_results
|
26 |
+
|
27 |
+
@staticmethod
|
28 |
+
def backward(ctx, output_grads):
|
29 |
+
raise NotImplementedError
|
30 |
+
return output_grads, None, None
|
31 |
+
|
32 |
+
|
33 |
+
class Activation1d(nn.Module):
|
34 |
+
def __init__(
|
35 |
+
self,
|
36 |
+
activation,
|
37 |
+
up_ratio: int = 2,
|
38 |
+
down_ratio: int = 2,
|
39 |
+
up_kernel_size: int = 12,
|
40 |
+
down_kernel_size: int = 12,
|
41 |
+
fused: bool = True,
|
42 |
+
):
|
43 |
+
super().__init__()
|
44 |
+
self.up_ratio = up_ratio
|
45 |
+
self.down_ratio = down_ratio
|
46 |
+
self.act = activation
|
47 |
+
self.upsample = UpSample1d(up_ratio, up_kernel_size)
|
48 |
+
self.downsample = DownSample1d(down_ratio, down_kernel_size)
|
49 |
+
|
50 |
+
self.fused = fused # Whether to use fused CUDA kernel or not
|
51 |
+
|
52 |
+
def forward(self, x):
|
53 |
+
if not self.fused:
|
54 |
+
x = self.upsample(x)
|
55 |
+
x = self.act(x)
|
56 |
+
x = self.downsample(x)
|
57 |
+
return x
|
58 |
+
else:
|
59 |
+
if self.act.__class__.__name__ == "Snake":
|
60 |
+
beta = self.act.alpha.data # Snake uses same params for alpha and beta
|
61 |
+
else:
|
62 |
+
beta = self.act.beta.data # Snakebeta uses different params for alpha and beta
|
63 |
+
alpha = self.act.alpha.data
|
64 |
+
if not self.act.alpha_logscale: # Exp baked into cuda kernel, cancel it out with a log
|
65 |
+
alpha = torch.log(alpha)
|
66 |
+
beta = torch.log(beta)
|
67 |
+
|
68 |
+
x = FusedAntiAliasActivation.apply(x, self.upsample.filter, self.downsample.lowpass.filter, alpha, beta)
|
69 |
+
return x
|
GPT_SoVITS/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/* coding=utf-8
|
2 |
+
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
3 |
+
*
|
4 |
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
* you may not use this file except in compliance with the License.
|
6 |
+
* You may obtain a copy of the License at
|
7 |
+
*
|
8 |
+
* http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
*
|
10 |
+
* Unless required by applicable law or agreed to in writing, software
|
11 |
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
* See the License for the specific language governing permissions and
|
14 |
+
* limitations under the License.
|
15 |
+
*/
|
16 |
+
|
17 |
+
#include <torch/extension.h>
|
18 |
+
|
19 |
+
extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta);
|
20 |
+
|
21 |
+
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
|
22 |
+
m.def("forward", &fwd_cuda, "Anti-Alias Activation forward (CUDA)");
|
23 |
+
}
|
GPT_SoVITS/BigVGAN/alias_free_activation/cuda/anti_alias_activation_cuda.cu
ADDED
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/* coding=utf-8
|
2 |
+
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
3 |
+
*
|
4 |
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
* you may not use this file except in compliance with the License.
|
6 |
+
* You may obtain a copy of the License at
|
7 |
+
*
|
8 |
+
* http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
*
|
10 |
+
* Unless required by applicable law or agreed to in writing, software
|
11 |
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
* See the License for the specific language governing permissions and
|
14 |
+
* limitations under the License.
|
15 |
+
*/
|
16 |
+
|
17 |
+
#include <ATen/ATen.h>
|
18 |
+
#include <cuda.h>
|
19 |
+
#include <cuda_runtime.h>
|
20 |
+
#include <cuda_fp16.h>
|
21 |
+
#include <cuda_profiler_api.h>
|
22 |
+
#include <ATen/cuda/CUDAContext.h>
|
23 |
+
#include <torch/extension.h>
|
24 |
+
#include "type_shim.h"
|
25 |
+
#include <assert.h>
|
26 |
+
#include <cfloat>
|
27 |
+
#include <limits>
|
28 |
+
#include <stdint.h>
|
29 |
+
#include <c10/macros/Macros.h>
|
30 |
+
|
31 |
+
namespace
|
32 |
+
{
|
33 |
+
// Hard-coded hyperparameters
|
34 |
+
// WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and
|
35 |
+
constexpr int ELEMENTS_PER_LDG_STG = 1; //(WARP_ITERATIONS < 4) ? 1 : 4;
|
36 |
+
constexpr int BUFFER_SIZE = 32;
|
37 |
+
constexpr int FILTER_SIZE = 12;
|
38 |
+
constexpr int HALF_FILTER_SIZE = 6;
|
39 |
+
constexpr int UPSAMPLE_REPLICATION_PAD = 5; // 5 on each side, matching torch impl
|
40 |
+
constexpr int DOWNSAMPLE_REPLICATION_PAD_LEFT = 5; // matching torch impl
|
41 |
+
constexpr int DOWNSAMPLE_REPLICATION_PAD_RIGHT = 6; // matching torch impl
|
42 |
+
|
43 |
+
template <typename input_t, typename output_t, typename acc_t>
|
44 |
+
__global__ void anti_alias_activation_forward(
|
45 |
+
output_t *dst,
|
46 |
+
const input_t *src,
|
47 |
+
const input_t *up_ftr,
|
48 |
+
const input_t *down_ftr,
|
49 |
+
const input_t *alpha,
|
50 |
+
const input_t *beta,
|
51 |
+
int batch_size,
|
52 |
+
int channels,
|
53 |
+
int seq_len)
|
54 |
+
{
|
55 |
+
// Up and downsample filters
|
56 |
+
input_t up_filter[FILTER_SIZE];
|
57 |
+
input_t down_filter[FILTER_SIZE];
|
58 |
+
|
59 |
+
// Load data from global memory including extra indices reserved for replication paddings
|
60 |
+
input_t elements[2 * FILTER_SIZE + 2 * BUFFER_SIZE + 2 * UPSAMPLE_REPLICATION_PAD] = {0};
|
61 |
+
input_t intermediates[2 * FILTER_SIZE + 2 * BUFFER_SIZE + DOWNSAMPLE_REPLICATION_PAD_LEFT + DOWNSAMPLE_REPLICATION_PAD_RIGHT] = {0};
|
62 |
+
|
63 |
+
// Output stores downsampled output before writing to dst
|
64 |
+
output_t output[BUFFER_SIZE];
|
65 |
+
|
66 |
+
// blockDim/threadIdx = (128, 1, 1)
|
67 |
+
// gridDim/blockIdx = (seq_blocks, channels, batches)
|
68 |
+
int block_offset = (blockIdx.x * 128 * BUFFER_SIZE + seq_len * (blockIdx.y + gridDim.y * blockIdx.z));
|
69 |
+
int local_offset = threadIdx.x * BUFFER_SIZE;
|
70 |
+
int seq_offset = blockIdx.x * 128 * BUFFER_SIZE + local_offset;
|
71 |
+
|
72 |
+
// intermediate have double the seq_len
|
73 |
+
int intermediate_local_offset = threadIdx.x * BUFFER_SIZE * 2;
|
74 |
+
int intermediate_seq_offset = blockIdx.x * 128 * BUFFER_SIZE * 2 + intermediate_local_offset;
|
75 |
+
|
76 |
+
// Get values needed for replication padding before moving pointer
|
77 |
+
const input_t *right_most_pntr = src + (seq_len * (blockIdx.y + gridDim.y * blockIdx.z));
|
78 |
+
input_t seq_left_most_value = right_most_pntr[0];
|
79 |
+
input_t seq_right_most_value = right_most_pntr[seq_len - 1];
|
80 |
+
|
81 |
+
// Move src and dst pointers
|
82 |
+
src += block_offset + local_offset;
|
83 |
+
dst += block_offset + local_offset;
|
84 |
+
|
85 |
+
// Alpha and beta values for snake activatons. Applies exp by default
|
86 |
+
alpha = alpha + blockIdx.y;
|
87 |
+
input_t alpha_val = expf(alpha[0]);
|
88 |
+
beta = beta + blockIdx.y;
|
89 |
+
input_t beta_val = expf(beta[0]);
|
90 |
+
|
91 |
+
#pragma unroll
|
92 |
+
for (int it = 0; it < FILTER_SIZE; it += 1)
|
93 |
+
{
|
94 |
+
up_filter[it] = up_ftr[it];
|
95 |
+
down_filter[it] = down_ftr[it];
|
96 |
+
}
|
97 |
+
|
98 |
+
// Apply replication padding for upsampling, matching torch impl
|
99 |
+
#pragma unroll
|
100 |
+
for (int it = -HALF_FILTER_SIZE; it < BUFFER_SIZE + HALF_FILTER_SIZE; it += 1)
|
101 |
+
{
|
102 |
+
int element_index = seq_offset + it; // index for element
|
103 |
+
if ((element_index < 0) && (element_index >= -UPSAMPLE_REPLICATION_PAD))
|
104 |
+
{
|
105 |
+
elements[2 * (HALF_FILTER_SIZE + it)] = 2 * seq_left_most_value;
|
106 |
+
}
|
107 |
+
if ((element_index >= seq_len) && (element_index < seq_len + UPSAMPLE_REPLICATION_PAD))
|
108 |
+
{
|
109 |
+
elements[2 * (HALF_FILTER_SIZE + it)] = 2 * seq_right_most_value;
|
110 |
+
}
|
111 |
+
if ((element_index >= 0) && (element_index < seq_len))
|
112 |
+
{
|
113 |
+
elements[2 * (HALF_FILTER_SIZE + it)] = 2 * src[it];
|
114 |
+
}
|
115 |
+
}
|
116 |
+
|
117 |
+
// Apply upsampling strided convolution and write to intermediates. It reserves DOWNSAMPLE_REPLICATION_PAD_LEFT for replication padding of the downsampilng conv later
|
118 |
+
#pragma unroll
|
119 |
+
for (int it = 0; it < (2 * BUFFER_SIZE + 2 * FILTER_SIZE); it += 1)
|
120 |
+
{
|
121 |
+
input_t acc = 0.0;
|
122 |
+
int element_index = intermediate_seq_offset + it; // index for intermediate
|
123 |
+
#pragma unroll
|
124 |
+
for (int f_idx = 0; f_idx < FILTER_SIZE; f_idx += 1)
|
125 |
+
{
|
126 |
+
if ((element_index + f_idx) >= 0)
|
127 |
+
{
|
128 |
+
acc += up_filter[f_idx] * elements[it + f_idx];
|
129 |
+
}
|
130 |
+
}
|
131 |
+
intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] = acc;
|
132 |
+
}
|
133 |
+
|
134 |
+
// Apply activation function. It reserves DOWNSAMPLE_REPLICATION_PAD_LEFT and DOWNSAMPLE_REPLICATION_PAD_RIGHT for replication padding of the downsampilng conv later
|
135 |
+
double no_div_by_zero = 0.000000001;
|
136 |
+
#pragma unroll
|
137 |
+
for (int it = 0; it < 2 * BUFFER_SIZE + 2 * FILTER_SIZE; it += 1)
|
138 |
+
{
|
139 |
+
intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] += (1.0 / (beta_val + no_div_by_zero)) * sinf(intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] * alpha_val) * sinf(intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] * alpha_val);
|
140 |
+
}
|
141 |
+
|
142 |
+
// Apply replication padding before downsampling conv from intermediates
|
143 |
+
#pragma unroll
|
144 |
+
for (int it = 0; it < DOWNSAMPLE_REPLICATION_PAD_LEFT; it += 1)
|
145 |
+
{
|
146 |
+
intermediates[it] = intermediates[DOWNSAMPLE_REPLICATION_PAD_LEFT];
|
147 |
+
}
|
148 |
+
#pragma unroll
|
149 |
+
for (int it = DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE; it < DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE + DOWNSAMPLE_REPLICATION_PAD_RIGHT; it += 1)
|
150 |
+
{
|
151 |
+
intermediates[it] = intermediates[DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE - 1];
|
152 |
+
}
|
153 |
+
|
154 |
+
// Apply downsample strided convolution (assuming stride=2) from intermediates
|
155 |
+
#pragma unroll
|
156 |
+
for (int it = 0; it < BUFFER_SIZE; it += 1)
|
157 |
+
{
|
158 |
+
input_t acc = 0.0;
|
159 |
+
#pragma unroll
|
160 |
+
for (int f_idx = 0; f_idx < FILTER_SIZE; f_idx += 1)
|
161 |
+
{
|
162 |
+
// Add constant DOWNSAMPLE_REPLICATION_PAD_RIGHT to match torch implementation
|
163 |
+
acc += down_filter[f_idx] * intermediates[it * 2 + f_idx + DOWNSAMPLE_REPLICATION_PAD_RIGHT];
|
164 |
+
}
|
165 |
+
output[it] = acc;
|
166 |
+
}
|
167 |
+
|
168 |
+
// Write output to dst
|
169 |
+
#pragma unroll
|
170 |
+
for (int it = 0; it < BUFFER_SIZE; it += ELEMENTS_PER_LDG_STG)
|
171 |
+
{
|
172 |
+
int element_index = seq_offset + it;
|
173 |
+
if (element_index < seq_len)
|
174 |
+
{
|
175 |
+
dst[it] = output[it];
|
176 |
+
}
|
177 |
+
}
|
178 |
+
|
179 |
+
}
|
180 |
+
|
181 |
+
template <typename input_t, typename output_t, typename acc_t>
|
182 |
+
void dispatch_anti_alias_activation_forward(
|
183 |
+
output_t *dst,
|
184 |
+
const input_t *src,
|
185 |
+
const input_t *up_ftr,
|
186 |
+
const input_t *down_ftr,
|
187 |
+
const input_t *alpha,
|
188 |
+
const input_t *beta,
|
189 |
+
int batch_size,
|
190 |
+
int channels,
|
191 |
+
int seq_len)
|
192 |
+
{
|
193 |
+
if (seq_len == 0)
|
194 |
+
{
|
195 |
+
return;
|
196 |
+
}
|
197 |
+
else
|
198 |
+
{
|
199 |
+
// Use 128 threads per block to maximimize gpu utilization
|
200 |
+
constexpr int threads_per_block = 128;
|
201 |
+
constexpr int seq_len_per_block = 4096;
|
202 |
+
int blocks_per_seq_len = (seq_len + seq_len_per_block - 1) / seq_len_per_block;
|
203 |
+
dim3 blocks(blocks_per_seq_len, channels, batch_size);
|
204 |
+
dim3 threads(threads_per_block, 1, 1);
|
205 |
+
|
206 |
+
anti_alias_activation_forward<input_t, output_t, acc_t>
|
207 |
+
<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, up_ftr, down_ftr, alpha, beta, batch_size, channels, seq_len);
|
208 |
+
}
|
209 |
+
}
|
210 |
+
}
|
211 |
+
|
212 |
+
extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta)
|
213 |
+
{
|
214 |
+
// Input is a 3d tensor with dimensions [batches, channels, seq_len]
|
215 |
+
const int batches = input.size(0);
|
216 |
+
const int channels = input.size(1);
|
217 |
+
const int seq_len = input.size(2);
|
218 |
+
|
219 |
+
// Output
|
220 |
+
auto act_options = input.options().requires_grad(false);
|
221 |
+
|
222 |
+
torch::Tensor anti_alias_activation_results =
|
223 |
+
torch::empty({batches, channels, seq_len}, act_options);
|
224 |
+
|
225 |
+
void *input_ptr = static_cast<void *>(input.data_ptr());
|
226 |
+
void *up_filter_ptr = static_cast<void *>(up_filter.data_ptr());
|
227 |
+
void *down_filter_ptr = static_cast<void *>(down_filter.data_ptr());
|
228 |
+
void *alpha_ptr = static_cast<void *>(alpha.data_ptr());
|
229 |
+
void *beta_ptr = static_cast<void *>(beta.data_ptr());
|
230 |
+
void *anti_alias_activation_results_ptr = static_cast<void *>(anti_alias_activation_results.data_ptr());
|
231 |
+
|
232 |
+
DISPATCH_FLOAT_HALF_AND_BFLOAT(
|
233 |
+
input.scalar_type(),
|
234 |
+
"dispatch anti alias activation_forward",
|
235 |
+
dispatch_anti_alias_activation_forward<scalar_t, scalar_t, float>(
|
236 |
+
reinterpret_cast<scalar_t *>(anti_alias_activation_results_ptr),
|
237 |
+
reinterpret_cast<const scalar_t *>(input_ptr),
|
238 |
+
reinterpret_cast<const scalar_t *>(up_filter_ptr),
|
239 |
+
reinterpret_cast<const scalar_t *>(down_filter_ptr),
|
240 |
+
reinterpret_cast<const scalar_t *>(alpha_ptr),
|
241 |
+
reinterpret_cast<const scalar_t *>(beta_ptr),
|
242 |
+
batches,
|
243 |
+
channels,
|
244 |
+
seq_len););
|
245 |
+
return anti_alias_activation_results;
|
246 |
+
}
|
GPT_SoVITS/BigVGAN/alias_free_activation/cuda/compat.h
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/* coding=utf-8
|
2 |
+
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
3 |
+
*
|
4 |
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
* you may not use this file except in compliance with the License.
|
6 |
+
* You may obtain a copy of the License at
|
7 |
+
*
|
8 |
+
* http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
*
|
10 |
+
* Unless required by applicable law or agreed to in writing, software
|
11 |
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
* See the License for the specific language governing permissions and
|
14 |
+
* limitations under the License.
|
15 |
+
*/
|
16 |
+
|
17 |
+
/*This code is copied fron NVIDIA apex:
|
18 |
+
* https://github.com/NVIDIA/apex
|
19 |
+
* with minor changes. */
|
20 |
+
|
21 |
+
#ifndef TORCH_CHECK
|
22 |
+
#define TORCH_CHECK AT_CHECK
|
23 |
+
#endif
|
24 |
+
|
25 |
+
#ifdef VERSION_GE_1_3
|
26 |
+
#define DATA_PTR data_ptr
|
27 |
+
#else
|
28 |
+
#define DATA_PTR data
|
29 |
+
#endif
|
GPT_SoVITS/BigVGAN/alias_free_activation/cuda/load.py
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 NVIDIA CORPORATION.
|
2 |
+
# Licensed under the MIT license.
|
3 |
+
|
4 |
+
import os
|
5 |
+
import pathlib
|
6 |
+
import subprocess
|
7 |
+
|
8 |
+
from torch.utils import cpp_extension
|
9 |
+
|
10 |
+
"""
|
11 |
+
Setting this param to a list has a problem of generating different compilation commands (with diferent order of architectures) and leading to recompilation of fused kernels.
|
12 |
+
Set it to empty stringo avoid recompilation and assign arch flags explicity in extra_cuda_cflags below
|
13 |
+
"""
|
14 |
+
os.environ["TORCH_CUDA_ARCH_LIST"] = ""
|
15 |
+
|
16 |
+
|
17 |
+
def load():
|
18 |
+
# Check if cuda 11 is installed for compute capability 8.0
|
19 |
+
cc_flag = []
|
20 |
+
_, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
|
21 |
+
if int(bare_metal_major) >= 11:
|
22 |
+
cc_flag.append("-gencode")
|
23 |
+
cc_flag.append("arch=compute_80,code=sm_80")
|
24 |
+
|
25 |
+
# Build path
|
26 |
+
srcpath = pathlib.Path(__file__).parent.absolute()
|
27 |
+
buildpath = srcpath / "build"
|
28 |
+
_create_build_dir(buildpath)
|
29 |
+
|
30 |
+
# Helper function to build the kernels.
|
31 |
+
def _cpp_extention_load_helper(name, sources, extra_cuda_flags):
|
32 |
+
return cpp_extension.load(
|
33 |
+
name=name,
|
34 |
+
sources=sources,
|
35 |
+
build_directory=buildpath,
|
36 |
+
extra_cflags=[
|
37 |
+
"-O3",
|
38 |
+
],
|
39 |
+
extra_cuda_cflags=[
|
40 |
+
"-O3",
|
41 |
+
"-gencode",
|
42 |
+
"arch=compute_70,code=sm_70",
|
43 |
+
"--use_fast_math",
|
44 |
+
]
|
45 |
+
+ extra_cuda_flags
|
46 |
+
+ cc_flag,
|
47 |
+
verbose=True,
|
48 |
+
)
|
49 |
+
|
50 |
+
extra_cuda_flags = [
|
51 |
+
"-U__CUDA_NO_HALF_OPERATORS__",
|
52 |
+
"-U__CUDA_NO_HALF_CONVERSIONS__",
|
53 |
+
"--expt-relaxed-constexpr",
|
54 |
+
"--expt-extended-lambda",
|
55 |
+
]
|
56 |
+
|
57 |
+
sources = [
|
58 |
+
srcpath / "anti_alias_activation.cpp",
|
59 |
+
srcpath / "anti_alias_activation_cuda.cu",
|
60 |
+
]
|
61 |
+
anti_alias_activation_cuda = _cpp_extention_load_helper("anti_alias_activation_cuda", sources, extra_cuda_flags)
|
62 |
+
|
63 |
+
return anti_alias_activation_cuda
|
64 |
+
|
65 |
+
|
66 |
+
def _get_cuda_bare_metal_version(cuda_dir):
|
67 |
+
raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
|
68 |
+
output = raw_output.split()
|
69 |
+
release_idx = output.index("release") + 1
|
70 |
+
release = output[release_idx].split(".")
|
71 |
+
bare_metal_major = release[0]
|
72 |
+
bare_metal_minor = release[1][0]
|
73 |
+
|
74 |
+
return raw_output, bare_metal_major, bare_metal_minor
|
75 |
+
|
76 |
+
|
77 |
+
def _create_build_dir(buildpath):
|
78 |
+
try:
|
79 |
+
os.mkdir(buildpath)
|
80 |
+
except OSError:
|
81 |
+
if not os.path.isdir(buildpath):
|
82 |
+
print(f"Creation of the build directory {buildpath} failed")
|
GPT_SoVITS/BigVGAN/alias_free_activation/cuda/type_shim.h
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/* coding=utf-8
|
2 |
+
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
3 |
+
*
|
4 |
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
* you may not use this file except in compliance with the License.
|
6 |
+
* You may obtain a copy of the License at
|
7 |
+
*
|
8 |
+
* http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
*
|
10 |
+
* Unless required by applicable law or agreed to in writing, software
|
11 |
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
* See the License for the specific language governing permissions and
|
14 |
+
* limitations under the License.
|
15 |
+
*/
|
16 |
+
|
17 |
+
#include <ATen/ATen.h>
|
18 |
+
#include "compat.h"
|
19 |
+
|
20 |
+
#define DISPATCH_FLOAT_HALF_AND_BFLOAT(TYPE, NAME, ...) \
|
21 |
+
switch (TYPE) \
|
22 |
+
{ \
|
23 |
+
case at::ScalarType::Float: \
|
24 |
+
{ \
|
25 |
+
using scalar_t = float; \
|
26 |
+
__VA_ARGS__; \
|
27 |
+
break; \
|
28 |
+
} \
|
29 |
+
case at::ScalarType::Half: \
|
30 |
+
{ \
|
31 |
+
using scalar_t = at::Half; \
|
32 |
+
__VA_ARGS__; \
|
33 |
+
break; \
|
34 |
+
} \
|
35 |
+
case at::ScalarType::BFloat16: \
|
36 |
+
{ \
|
37 |
+
using scalar_t = at::BFloat16; \
|
38 |
+
__VA_ARGS__; \
|
39 |
+
break; \
|
40 |
+
} \
|
41 |
+
default: \
|
42 |
+
AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
|
43 |
+
}
|
44 |
+
|
45 |
+
#define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
|
46 |
+
switch (TYPEIN) \
|
47 |
+
{ \
|
48 |
+
case at::ScalarType::Float: \
|
49 |
+
{ \
|
50 |
+
using scalar_t_in = float; \
|
51 |
+
switch (TYPEOUT) \
|
52 |
+
{ \
|
53 |
+
case at::ScalarType::Float: \
|
54 |
+
{ \
|
55 |
+
using scalar_t_out = float; \
|
56 |
+
__VA_ARGS__; \
|
57 |
+
break; \
|
58 |
+
} \
|
59 |
+
case at::ScalarType::Half: \
|
60 |
+
{ \
|
61 |
+
using scalar_t_out = at::Half; \
|
62 |
+
__VA_ARGS__; \
|
63 |
+
break; \
|
64 |
+
} \
|
65 |
+
case at::ScalarType::BFloat16: \
|
66 |
+
{ \
|
67 |
+
using scalar_t_out = at::BFloat16; \
|
68 |
+
__VA_ARGS__; \
|
69 |
+
break; \
|
70 |
+
} \
|
71 |
+
default: \
|
72 |
+
AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
|
73 |
+
} \
|
74 |
+
break; \
|
75 |
+
} \
|
76 |
+
case at::ScalarType::Half: \
|
77 |
+
{ \
|
78 |
+
using scalar_t_in = at::Half; \
|
79 |
+
using scalar_t_out = at::Half; \
|
80 |
+
__VA_ARGS__; \
|
81 |
+
break; \
|
82 |
+
} \
|
83 |
+
case at::ScalarType::BFloat16: \
|
84 |
+
{ \
|
85 |
+
using scalar_t_in = at::BFloat16; \
|
86 |
+
using scalar_t_out = at::BFloat16; \
|
87 |
+
__VA_ARGS__; \
|
88 |
+
break; \
|
89 |
+
} \
|
90 |
+
default: \
|
91 |
+
AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \
|
92 |
+
}
|
GPT_SoVITS/BigVGAN/alias_free_activation/torch/__init__.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
|
2 |
+
# LICENSE is in incl_licenses directory.
|
3 |
+
|
4 |
+
from .filter import *
|
5 |
+
from .resample import *
|
6 |
+
from .act import *
|
GPT_SoVITS/BigVGAN/alias_free_activation/torch/act.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
|
2 |
+
# LICENSE is in incl_licenses directory.
|
3 |
+
|
4 |
+
import torch.nn as nn
|
5 |
+
from .resample import UpSample1d, DownSample1d
|
6 |
+
|
7 |
+
|
8 |
+
class Activation1d(nn.Module):
|
9 |
+
def __init__(
|
10 |
+
self,
|
11 |
+
activation,
|
12 |
+
up_ratio: int = 2,
|
13 |
+
down_ratio: int = 2,
|
14 |
+
up_kernel_size: int = 12,
|
15 |
+
down_kernel_size: int = 12,
|
16 |
+
):
|
17 |
+
super().__init__()
|
18 |
+
self.up_ratio = up_ratio
|
19 |
+
self.down_ratio = down_ratio
|
20 |
+
self.act = activation
|
21 |
+
self.upsample = UpSample1d(up_ratio, up_kernel_size)
|
22 |
+
self.downsample = DownSample1d(down_ratio, down_kernel_size)
|
23 |
+
|
24 |
+
# x: [B,C,T]
|
25 |
+
def forward(self, x):
|
26 |
+
x = self.upsample(x)
|
27 |
+
x = self.act(x)
|
28 |
+
x = self.downsample(x)
|
29 |
+
|
30 |
+
return x
|
GPT_SoVITS/BigVGAN/alias_free_activation/torch/filter.py
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
|
2 |
+
# LICENSE is in incl_licenses directory.
|
3 |
+
|
4 |
+
import torch
|
5 |
+
import torch.nn as nn
|
6 |
+
import torch.nn.functional as F
|
7 |
+
import math
|
8 |
+
|
9 |
+
if "sinc" in dir(torch):
|
10 |
+
sinc = torch.sinc
|
11 |
+
else:
|
12 |
+
# This code is adopted from adefossez's julius.core.sinc under the MIT License
|
13 |
+
# https://adefossez.github.io/julius/julius/core.html
|
14 |
+
# LICENSE is in incl_licenses directory.
|
15 |
+
def sinc(x: torch.Tensor):
|
16 |
+
"""
|
17 |
+
Implementation of sinc, i.e. sin(pi * x) / (pi * x)
|
18 |
+
__Warning__: Different to julius.sinc, the input is multiplied by `pi`!
|
19 |
+
"""
|
20 |
+
return torch.where(
|
21 |
+
x == 0,
|
22 |
+
torch.tensor(1.0, device=x.device, dtype=x.dtype),
|
23 |
+
torch.sin(math.pi * x) / math.pi / x,
|
24 |
+
)
|
25 |
+
|
26 |
+
|
27 |
+
# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License
|
28 |
+
# https://adefossez.github.io/julius/julius/lowpass.html
|
29 |
+
# LICENSE is in incl_licenses directory.
|
30 |
+
def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size]
|
31 |
+
even = kernel_size % 2 == 0
|
32 |
+
half_size = kernel_size // 2
|
33 |
+
|
34 |
+
# For kaiser window
|
35 |
+
delta_f = 4 * half_width
|
36 |
+
A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
|
37 |
+
if A > 50.0:
|
38 |
+
beta = 0.1102 * (A - 8.7)
|
39 |
+
elif A >= 21.0:
|
40 |
+
beta = 0.5842 * (A - 21) ** 0.4 + 0.07886 * (A - 21.0)
|
41 |
+
else:
|
42 |
+
beta = 0.0
|
43 |
+
window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)
|
44 |
+
|
45 |
+
# ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
|
46 |
+
if even:
|
47 |
+
time = torch.arange(-half_size, half_size) + 0.5
|
48 |
+
else:
|
49 |
+
time = torch.arange(kernel_size) - half_size
|
50 |
+
if cutoff == 0:
|
51 |
+
filter_ = torch.zeros_like(time)
|
52 |
+
else:
|
53 |
+
filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)
|
54 |
+
"""
|
55 |
+
Normalize filter to have sum = 1, otherwise we will have a small leakage of the constant component in the input signal.
|
56 |
+
"""
|
57 |
+
filter_ /= filter_.sum()
|
58 |
+
filter = filter_.view(1, 1, kernel_size)
|
59 |
+
|
60 |
+
return filter
|
61 |
+
|
62 |
+
|
63 |
+
class LowPassFilter1d(nn.Module):
|
64 |
+
def __init__(
|
65 |
+
self,
|
66 |
+
cutoff=0.5,
|
67 |
+
half_width=0.6,
|
68 |
+
stride: int = 1,
|
69 |
+
padding: bool = True,
|
70 |
+
padding_mode: str = "replicate",
|
71 |
+
kernel_size: int = 12,
|
72 |
+
):
|
73 |
+
"""
|
74 |
+
kernel_size should be even number for stylegan3 setup, in this implementation, odd number is also possible.
|
75 |
+
"""
|
76 |
+
super().__init__()
|
77 |
+
if cutoff < -0.0:
|
78 |
+
raise ValueError("Minimum cutoff must be larger than zero.")
|
79 |
+
if cutoff > 0.5:
|
80 |
+
raise ValueError("A cutoff above 0.5 does not make sense.")
|
81 |
+
self.kernel_size = kernel_size
|
82 |
+
self.even = kernel_size % 2 == 0
|
83 |
+
self.pad_left = kernel_size // 2 - int(self.even)
|
84 |
+
self.pad_right = kernel_size // 2
|
85 |
+
self.stride = stride
|
86 |
+
self.padding = padding
|
87 |
+
self.padding_mode = padding_mode
|
88 |
+
filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
|
89 |
+
self.register_buffer("filter", filter)
|
90 |
+
|
91 |
+
# Input [B, C, T]
|
92 |
+
def forward(self, x):
|
93 |
+
_, C, _ = x.shape
|
94 |
+
|
95 |
+
if self.padding:
|
96 |
+
x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)
|
97 |
+
out = F.conv1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
|
98 |
+
|
99 |
+
return out
|
GPT_SoVITS/BigVGAN/alias_free_activation/torch/resample.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
|
2 |
+
# LICENSE is in incl_licenses directory.
|
3 |
+
|
4 |
+
import torch.nn as nn
|
5 |
+
from torch.nn import functional as F
|
6 |
+
from .filter import LowPassFilter1d
|
7 |
+
from .filter import kaiser_sinc_filter1d
|
8 |
+
|
9 |
+
|
10 |
+
class UpSample1d(nn.Module):
|
11 |
+
def __init__(self, ratio=2, kernel_size=None):
|
12 |
+
super().__init__()
|
13 |
+
self.ratio = ratio
|
14 |
+
self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
|
15 |
+
self.stride = ratio
|
16 |
+
self.pad = self.kernel_size // ratio - 1
|
17 |
+
self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
|
18 |
+
self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
|
19 |
+
filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=self.kernel_size)
|
20 |
+
self.register_buffer("filter", filter)
|
21 |
+
|
22 |
+
# x: [B, C, T]
|
23 |
+
def forward(self, x):
|
24 |
+
_, C, _ = x.shape
|
25 |
+
|
26 |
+
x = F.pad(x, (self.pad, self.pad), mode="replicate")
|
27 |
+
x = self.ratio * F.conv_transpose1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
|
28 |
+
x = x[..., self.pad_left : -self.pad_right]
|
29 |
+
|
30 |
+
return x
|
31 |
+
|
32 |
+
|
33 |
+
class DownSample1d(nn.Module):
|
34 |
+
def __init__(self, ratio=2, kernel_size=None):
|
35 |
+
super().__init__()
|
36 |
+
self.ratio = ratio
|
37 |
+
self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
|
38 |
+
self.lowpass = LowPassFilter1d(
|
39 |
+
cutoff=0.5 / ratio,
|
40 |
+
half_width=0.6 / ratio,
|
41 |
+
stride=ratio,
|
42 |
+
kernel_size=self.kernel_size,
|
43 |
+
)
|
44 |
+
|
45 |
+
def forward(self, x):
|
46 |
+
xx = self.lowpass(x)
|
47 |
+
|
48 |
+
return xx
|
GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_1
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2020 Jungil Kong
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_2
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2020 Edward Dixon
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_3
ADDED
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Apache License
|
2 |
+
Version 2.0, January 2004
|
3 |
+
http://www.apache.org/licenses/
|
4 |
+
|
5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
6 |
+
|
7 |
+
1. Definitions.
|
8 |
+
|
9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
11 |
+
|
12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
13 |
+
the copyright owner that is granting the License.
|
14 |
+
|
15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
16 |
+
other entities that control, are controlled by, or are under common
|
17 |
+
control with that entity. For the purposes of this definition,
|
18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
19 |
+
direction or management of such entity, whether by contract or
|
20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
22 |
+
|
23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
24 |
+
exercising permissions granted by this License.
|
25 |
+
|
26 |
+
"Source" form shall mean the preferred form for making modifications,
|
27 |
+
including but not limited to software source code, documentation
|
28 |
+
source, and configuration files.
|
29 |
+
|
30 |
+
"Object" form shall mean any form resulting from mechanical
|
31 |
+
transformation or translation of a Source form, including but
|
32 |
+
not limited to compiled object code, generated documentation,
|
33 |
+
and conversions to other media types.
|
34 |
+
|
35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
36 |
+
Object form, made available under the License, as indicated by a
|
37 |
+
copyright notice that is included in or attached to the work
|
38 |
+
(an example is provided in the Appendix below).
|
39 |
+
|
40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
41 |
+
form, that is based on (or derived from) the Work and for which the
|
42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
44 |
+
of this License, Derivative Works shall not include works that remain
|
45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
46 |
+
the Work and Derivative Works thereof.
|
47 |
+
|
48 |
+
"Contribution" shall mean any work of authorship, including
|
49 |
+
the original version of the Work and any modifications or additions
|
50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
54 |
+
means any form of electronic, verbal, or written communication sent
|
55 |
+
to the Licensor or its representatives, including but not limited to
|
56 |
+
communication on electronic mailing lists, source code control systems,
|
57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
59 |
+
excluding communication that is conspicuously marked or otherwise
|
60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
61 |
+
|
62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
64 |
+
subsequently incorporated within the Work.
|
65 |
+
|
66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
71 |
+
Work and such Derivative Works in Source or Object form.
|
72 |
+
|
73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
76 |
+
(except as stated in this section) patent license to make, have made,
|
77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
78 |
+
where such license applies only to those patent claims licensable
|
79 |
+
by such Contributor that are necessarily infringed by their
|
80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
82 |
+
institute patent litigation against any entity (including a
|
83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
84 |
+
or a Contribution incorporated within the Work constitutes direct
|
85 |
+
or contributory patent infringement, then any patent licenses
|
86 |
+
granted to You under this License for that Work shall terminate
|
87 |
+
as of the date such litigation is filed.
|
88 |
+
|
89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
90 |
+
Work or Derivative Works thereof in any medium, with or without
|
91 |
+
modifications, and in Source or Object form, provided that You
|
92 |
+
meet the following conditions:
|
93 |
+
|
94 |
+
(a) You must give any other recipients of the Work or
|
95 |
+
Derivative Works a copy of this License; and
|
96 |
+
|
97 |
+
(b) You must cause any modified files to carry prominent notices
|
98 |
+
stating that You changed the files; and
|
99 |
+
|
100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
101 |
+
that You distribute, all copyright, patent, trademark, and
|
102 |
+
attribution notices from the Source form of the Work,
|
103 |
+
excluding those notices that do not pertain to any part of
|
104 |
+
the Derivative Works; and
|
105 |
+
|
106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
107 |
+
distribution, then any Derivative Works that You distribute must
|
108 |
+
include a readable copy of the attribution notices contained
|
109 |
+
within such NOTICE file, excluding those notices that do not
|
110 |
+
pertain to any part of the Derivative Works, in at least one
|
111 |
+
of the following places: within a NOTICE text file distributed
|
112 |
+
as part of the Derivative Works; within the Source form or
|
113 |
+
documentation, if provided along with the Derivative Works; or,
|
114 |
+
within a display generated by the Derivative Works, if and
|
115 |
+
wherever such third-party notices normally appear. The contents
|
116 |
+
of the NOTICE file are for informational purposes only and
|
117 |
+
do not modify the License. You may add Your own attribution
|
118 |
+
notices within Derivative Works that You distribute, alongside
|
119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
120 |
+
that such additional attribution notices cannot be construed
|
121 |
+
as modifying the License.
|
122 |
+
|
123 |
+
You may add Your own copyright statement to Your modifications and
|
124 |
+
may provide additional or different license terms and conditions
|
125 |
+
for use, reproduction, or distribution of Your modifications, or
|
126 |
+
for any such Derivative Works as a whole, provided Your use,
|
127 |
+
reproduction, and distribution of the Work otherwise complies with
|
128 |
+
the conditions stated in this License.
|
129 |
+
|
130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
132 |
+
by You to the Licensor shall be under the terms and conditions of
|
133 |
+
this License, without any additional terms or conditions.
|
134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
135 |
+
the terms of any separate license agreement you may have executed
|
136 |
+
with Licensor regarding such Contributions.
|
137 |
+
|
138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
140 |
+
except as required for reasonable and customary use in describing the
|
141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
142 |
+
|
143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
144 |
+
agreed to in writing, Licensor provides the Work (and each
|
145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
147 |
+
implied, including, without limitation, any warranties or conditions
|
148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
150 |
+
appropriateness of using or redistributing the Work and assume any
|
151 |
+
risks associated with Your exercise of permissions under this License.
|
152 |
+
|
153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
154 |
+
whether in tort (including negligence), contract, or otherwise,
|
155 |
+
unless required by applicable law (such as deliberate and grossly
|
156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
157 |
+
liable to You for damages, including any direct, indirect, special,
|
158 |
+
incidental, or consequential damages of any character arising as a
|
159 |
+
result of this License or out of the use or inability to use the
|
160 |
+
Work (including but not limited to damages for loss of goodwill,
|
161 |
+
work stoppage, computer failure or malfunction, or any and all
|
162 |
+
other commercial damages or losses), even if such Contributor
|
163 |
+
has been advised of the possibility of such damages.
|
164 |
+
|
165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
168 |
+
or other liability obligations and/or rights consistent with this
|
169 |
+
License. However, in accepting such obligations, You may act only
|
170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
171 |
+
of any other Contributor, and only if You agree to indemnify,
|
172 |
+
defend, and hold each Contributor harmless for any liability
|
173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
174 |
+
of your accepting any such warranty or additional liability.
|
175 |
+
|
176 |
+
END OF TERMS AND CONDITIONS
|
177 |
+
|
178 |
+
APPENDIX: How to apply the Apache License to your work.
|
179 |
+
|
180 |
+
To apply the Apache License to your work, attach the following
|
181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
182 |
+
replaced with your own identifying information. (Don't include
|
183 |
+
the brackets!) The text should be enclosed in the appropriate
|
184 |
+
comment syntax for the file format. We also recommend that a
|
185 |
+
file or class name and description of purpose be included on the
|
186 |
+
same "printed page" as the copyright notice for easier
|
187 |
+
identification within third-party archives.
|
188 |
+
|
189 |
+
Copyright [yyyy] [name of copyright owner]
|
190 |
+
|
191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
192 |
+
you may not use this file except in compliance with the License.
|
193 |
+
You may obtain a copy of the License at
|
194 |
+
|
195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
196 |
+
|
197 |
+
Unless required by applicable law or agreed to in writing, software
|
198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
200 |
+
See the License for the specific language governing permissions and
|
201 |
+
limitations under the License.
|
GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_4
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
BSD 3-Clause License
|
2 |
+
|
3 |
+
Copyright (c) 2019, Seungwon Park 박승원
|
4 |
+
All rights reserved.
|
5 |
+
|
6 |
+
Redistribution and use in source and binary forms, with or without
|
7 |
+
modification, are permitted provided that the following conditions are met:
|
8 |
+
|
9 |
+
1. Redistributions of source code must retain the above copyright notice, this
|
10 |
+
list of conditions and the following disclaimer.
|
11 |
+
|
12 |
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
13 |
+
this list of conditions and the following disclaimer in the documentation
|
14 |
+
and/or other materials provided with the distribution.
|
15 |
+
|
16 |
+
3. Neither the name of the copyright holder nor the names of its
|
17 |
+
contributors may be used to endorse or promote products derived from
|
18 |
+
this software without specific prior written permission.
|
19 |
+
|
20 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
21 |
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
22 |
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
23 |
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
24 |
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
25 |
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
26 |
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
27 |
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
28 |
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
29 |
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_5
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Copyright 2020 Alexandre Défossez
|
2 |
+
|
3 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
|
4 |
+
associated documentation files (the "Software"), to deal in the Software without restriction,
|
5 |
+
including without limitation the rights to use, copy, modify, merge, publish, distribute,
|
6 |
+
sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
|
7 |
+
furnished to do so, subject to the following conditions:
|
8 |
+
|
9 |
+
The above copyright notice and this permission notice shall be included in all copies or
|
10 |
+
substantial portions of the Software.
|
11 |
+
|
12 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
|
13 |
+
NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
14 |
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
|
15 |
+
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
16 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_6
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2023-present, Descript
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_7
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 Charactr Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_8
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 Amphion

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
GPT_SoVITS/BigVGAN/tests/test_activation.py
ADDED
@@ -0,0 +1,62 @@
# Copyright (c) 2024 NVIDIA CORPORATION.
# Licensed under the MIT license.

import os
import sys

# to import modules from parent_dir
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(parent_dir)

import torch
from alias_free_activation.cuda import activation1d
from activations import Snake


def test_load_fused_kernels():
    try:
        print("[Success] load_fused_kernels")
    except ImportError as e:
        print("[Fail] load_fused_kernels")
        raise e


def test_anti_alias_activation():
    data = torch.rand((10, 10, 200), device="cuda")

    # Check activations.Snake cuda vs. torch
    fused_anti_alias_activation = activation1d.Activation1d(activation=Snake(10), fused=True).cuda()
    fused_activation_output = fused_anti_alias_activation(data)

    torch_anti_alias_activation = activation1d.Activation1d(activation=Snake(10), fused=False).cuda()
    torch_activation_output = torch_anti_alias_activation(data)

    test_result = (fused_activation_output - torch_activation_output).abs()

    while test_result.dim() != 1:
        test_result = test_result.mean(dim=-1)

    diff = test_result.mean(dim=-1)

    if diff <= 1e-3:
        print(
            f"\n[Success] test_fused_anti_alias_activation"
            f"\n > mean_difference={diff}"
            f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}"
            f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}"
        )
    else:
        print(
            f"\n[Fail] test_fused_anti_alias_activation"
            f"\n > mean_difference={diff}, "
            f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}, "
            f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}"
        )


if __name__ == "__main__":
    from alias_free_activation.cuda import load

    load.load()
    test_load_fused_kernels()
    test_anti_alias_activation()
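Note: the pass/fail check above reduces the element-wise difference between the fused CUDA kernel and the plain torch path to one scalar before comparing it against the 1e-3 tolerance. A minimal, CPU-only sketch of that reduction (the random tensors here are stand-ins for the two implementations, not outputs of the real kernels):

import torch

# Shapes follow the test above: (batch, channels, time) = (10, 10, 200).
fused = torch.rand(10, 10, 200)
plain = fused + 1e-4 * torch.randn_like(fused)  # stand-in for the second implementation

diff = (fused - plain).abs()
while diff.dim() != 1:           # collapse trailing dims one at a time, as the test does
    diff = diff.mean(dim=-1)
mean_difference = diff.mean(dim=-1)  # single scalar compared against the tolerance
print(float(mean_difference) <= 1e-3)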
GPT_SoVITS/BigVGAN/tests/test_activation_snake_beta.py
ADDED
@@ -0,0 +1,62 @@
# Copyright (c) 2024 NVIDIA CORPORATION.
# Licensed under the MIT license.

import os
import sys

# to import modules from parent_dir
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(parent_dir)

import torch
from alias_free_activation.cuda import activation1d
from activations import SnakeBeta


def test_load_fused_kernels():
    try:
        print("[Success] load_fused_kernels")
    except ImportError as e:
        print("[Fail] load_fused_kernels")
        raise e


def test_anti_alias_activation():
    data = torch.rand((10, 10, 200), device="cuda")

    # Check activations, Snake CUDA vs. Torch
    fused_anti_alias_activation = activation1d.Activation1d(activation=SnakeBeta(10), fused=True).cuda()
    fused_activation_output = fused_anti_alias_activation(data)

    torch_anti_alias_activation = activation1d.Activation1d(activation=SnakeBeta(10), fused=False).cuda()
    torch_activation_output = torch_anti_alias_activation(data)

    test_result = (fused_activation_output - torch_activation_output).abs()

    while test_result.dim() != 1:
        test_result = test_result.mean(dim=-1)

    diff = test_result.mean(dim=-1)

    if diff <= 1e-3:
        print(
            f"\n[Success] test_fused_anti_alias_activation"
            f"\n > mean_difference={diff}"
            f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}"
            f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}"
        )
    else:
        print(
            f"\n[Fail] test_fused_anti_alias_activation"
            f"\n > mean_difference={diff}, "
            f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}, "
            f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}"
        )


if __name__ == "__main__":
    from alias_free_activation.cuda import load

    load.load()
    test_load_fused_kernels()
    test_anti_alias_activation()
GPT_SoVITS/BigVGAN/tests/test_cuda_vs_torch_model.py
ADDED
@@ -0,0 +1,215 @@
# Copyright (c) 2024 NVIDIA CORPORATION.
# Licensed under the MIT license.

import os
import sys

# to import modules from parent_dir
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(parent_dir)

import torch
import json
from env import AttrDict
from bigvgan import BigVGAN
from time import time
from tqdm import tqdm
from meldataset import mel_spectrogram, MAX_WAV_VALUE
from scipy.io.wavfile import write
import numpy as np

import argparse

torch.backends.cudnn.benchmark = True

# For easier debugging
torch.set_printoptions(linewidth=200, threshold=10_000)


def generate_soundwave(duration=5.0, sr=24000):
    t = np.linspace(0, duration, int(sr * duration), False, dtype=np.float32)

    modulation = np.sin(2 * np.pi * t / duration)

    min_freq = 220
    max_freq = 1760
    frequencies = min_freq + (max_freq - min_freq) * (modulation + 1) / 2
    soundwave = np.sin(2 * np.pi * frequencies * t)

    soundwave = soundwave / np.max(np.abs(soundwave)) * 0.95

    return soundwave, sr


def get_mel(x, h):
    return mel_spectrogram(x, h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax)


def load_checkpoint(filepath, device):
    assert os.path.isfile(filepath)
    print(f"Loading '{filepath}'")
    checkpoint_dict = torch.load(filepath, map_location=device)
    print("Complete.")
    return checkpoint_dict


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Test script to check CUDA kernel correctness.")
    parser.add_argument(
        "--checkpoint_file",
        type=str,
        required=True,
        help="Path to the checkpoint file. Assumes config.json exists in the directory.",
    )

    args = parser.parse_args()

    config_file = os.path.join(os.path.split(args.checkpoint_file)[0], "config.json")
    with open(config_file) as f:
        config = f.read()
    json_config = json.loads(config)
    h = AttrDict({**json_config})

    print("loading plain Pytorch BigVGAN")
    generator_original = BigVGAN(h).to("cuda")
    print("loading CUDA kernel BigVGAN with auto-build")
    generator_cuda_kernel = BigVGAN(h, use_cuda_kernel=True).to("cuda")

    state_dict_g = load_checkpoint(args.checkpoint_file, "cuda")
    generator_original.load_state_dict(state_dict_g["generator"])
    generator_cuda_kernel.load_state_dict(state_dict_g["generator"])

    generator_original.remove_weight_norm()
    generator_original.eval()
    generator_cuda_kernel.remove_weight_norm()
    generator_cuda_kernel.eval()

    # define number of samples and length of mel frame to benchmark
    num_sample = 10
    num_mel_frame = 16384

    # CUDA kernel correctness check
    diff = 0.0
    for i in tqdm(range(num_sample)):
        # Random mel
        data = torch.rand((1, h.num_mels, num_mel_frame), device="cuda")

        with torch.inference_mode():
            audio_original = generator_original(data)

        with torch.inference_mode():
            audio_cuda_kernel = generator_cuda_kernel(data)

        # Both outputs should be (almost) the same
        test_result = (audio_original - audio_cuda_kernel).abs()
        diff += test_result.mean(dim=-1).item()

    diff /= num_sample
    if diff <= 2e-3:  # We can expect a small difference (~1e-3) which does not affect perceptual quality
        print(
            f"\n[Success] test CUDA fused vs. plain torch BigVGAN inference"
            f"\n > mean_difference={diff}"
            f"\n > fused_values={audio_cuda_kernel[-1][-1][-30:].tolist()}"
            f"\n > torch_values={audio_original[-1][-1][-30:].tolist()}"
        )
    else:
        print(
            f"\n[Fail] test CUDA fused vs. plain torch BigVGAN inference"
            f"\n > mean_difference={diff}"
            f"\n > fused_values={audio_cuda_kernel[-1][-1][-30:].tolist()}, "
            f"\n > torch_values={audio_original[-1][-1][-30:].tolist()}"
        )

    del data, audio_original, audio_cuda_kernel

    # Variables for tracking total time and VRAM usage
    toc_total_original = 0
    toc_total_cuda_kernel = 0
    vram_used_original_total = 0
    vram_used_cuda_kernel_total = 0
    audio_length_total = 0

    # Measure Original inference in isolation
    for i in tqdm(range(num_sample)):
        torch.cuda.reset_peak_memory_stats(device="cuda")
        data = torch.rand((1, h.num_mels, num_mel_frame), device="cuda")
        torch.cuda.synchronize()
        tic = time()
        with torch.inference_mode():
            audio_original = generator_original(data)
        torch.cuda.synchronize()
        toc = time() - tic
        toc_total_original += toc

        vram_used_original_total += torch.cuda.max_memory_allocated(device="cuda")

        del data, audio_original
        torch.cuda.empty_cache()

    # Measure CUDA kernel inference in isolation
    for i in tqdm(range(num_sample)):
        torch.cuda.reset_peak_memory_stats(device="cuda")
        data = torch.rand((1, h.num_mels, num_mel_frame), device="cuda")
        torch.cuda.synchronize()
        tic = time()
        with torch.inference_mode():
            audio_cuda_kernel = generator_cuda_kernel(data)
        torch.cuda.synchronize()
        toc = time() - tic
        toc_total_cuda_kernel += toc

        audio_length_total += audio_cuda_kernel.shape[-1]

        vram_used_cuda_kernel_total += torch.cuda.max_memory_allocated(device="cuda")

        del data, audio_cuda_kernel
        torch.cuda.empty_cache()

    # Calculate metrics
    audio_second = audio_length_total / h.sampling_rate
    khz_original = audio_length_total / toc_total_original / 1000
    khz_cuda_kernel = audio_length_total / toc_total_cuda_kernel / 1000
    vram_used_original_gb = vram_used_original_total / num_sample / (1024**3)
    vram_used_cuda_kernel_gb = vram_used_cuda_kernel_total / num_sample / (1024**3)

    # Print results
    print(
        f"Original BigVGAN: took {toc_total_original:.2f} seconds to generate {audio_second:.2f} seconds of audio, {khz_original:.1f}kHz, {audio_second / toc_total_original:.1f} faster than realtime, VRAM used {vram_used_original_gb:.1f} GB"
    )
    print(
        f"CUDA kernel BigVGAN: took {toc_total_cuda_kernel:.2f} seconds to generate {audio_second:.2f} seconds of audio, {khz_cuda_kernel:.1f}kHz, {audio_second / toc_total_cuda_kernel:.1f} faster than realtime, VRAM used {vram_used_cuda_kernel_gb:.1f} GB"
    )
    print(f"speedup of CUDA kernel: {khz_cuda_kernel / khz_original}")
    print(f"VRAM saving of CUDA kernel: {vram_used_original_gb / vram_used_cuda_kernel_gb}")

    # Use artificial sine waves for inference test
    audio_real, sr = generate_soundwave(duration=5.0, sr=h.sampling_rate)
    audio_real = torch.tensor(audio_real).to("cuda")
    # Compute mel spectrogram from the ground truth audio
    x = get_mel(audio_real.unsqueeze(0), h)

    with torch.inference_mode():
        y_g_hat_original = generator_original(x)
        y_g_hat_cuda_kernel = generator_cuda_kernel(x)

    audio_real = audio_real.squeeze()
    audio_real = audio_real * MAX_WAV_VALUE
    audio_real = audio_real.cpu().numpy().astype("int16")

    audio_original = y_g_hat_original.squeeze()
    audio_original = audio_original * MAX_WAV_VALUE
    audio_original = audio_original.cpu().numpy().astype("int16")

    audio_cuda_kernel = y_g_hat_cuda_kernel.squeeze()
    audio_cuda_kernel = audio_cuda_kernel * MAX_WAV_VALUE
    audio_cuda_kernel = audio_cuda_kernel.cpu().numpy().astype("int16")

    os.makedirs("tmp", exist_ok=True)
    output_file_real = os.path.join("tmp", "audio_real.wav")
    output_file_original = os.path.join("tmp", "audio_generated_original.wav")
    output_file_cuda_kernel = os.path.join("tmp", "audio_generated_cuda_kernel.wav")
    write(output_file_real, h.sampling_rate, audio_real)
    write(output_file_original, h.sampling_rate, audio_original)
    write(output_file_cuda_kernel, h.sampling_rate, audio_cuda_kernel)
    print("Example generated audios of original vs. fused CUDA kernel written to tmp!")
    print("Done")
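Note: the benchmark above reports throughput as "kHz" (samples generated per millisecond of wall-clock time) and a realtime factor. A small sketch of that arithmetic with purely illustrative numbers (the real values come from the timed loops in the script; only sampling_rate comes from config.json):

# Illustrative numbers only; the real values come from the timed loops above.
sampling_rate = 24000            # h.sampling_rate from config.json (assumed 24 kHz here)
audio_length_total = 4_194_304   # total generated samples over num_sample runs
toc_total = 8.0                  # total wall-clock seconds spent inside the generator

audio_second = audio_length_total / sampling_rate   # seconds of audio produced
khz = audio_length_total / toc_total / 1000         # samples per ms, the "kHz" in the printout
realtime_factor = audio_second / toc_total          # x-times faster than realtime
print(f"{khz:.1f}kHz, {realtime_factor:.1f}x realtime")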
GPT_SoVITS/configs/.gitignore
ADDED
@@ -0,0 +1 @@
*.yaml
GPT_SoVITS/configs/s1.yaml
ADDED
@@ -0,0 +1,31 @@
train:
  seed: 1234
  epochs: 300
  batch_size: 8
  gradient_accumulation: 4
  save_every_n_epoch: 1
  precision: 16
  gradient_clip: 1.0
optimizer:
  lr: 0.01
  lr_init: 0.00001
  lr_end: 0.0001
  warmup_steps: 2000
  decay_steps: 40000
data:
  max_eval_sample: 8
  max_sec: 54
  num_workers: 1
  pad_val: 1024 # same with EOS in model
model:
  vocab_size: 1025
  phoneme_vocab_size: 512
  embedding_dim: 512
  hidden_dim: 512
  head: 16
  linear_units: 2048
  n_layer: 12
  dropout: 0
  EOS: 1024
inference:
  top_k: 5
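Note: how these stage-1 configs are consumed is defined by the training code elsewhere in the repository; as a minimal sketch of reading one programmatically (assuming PyYAML is available), the effective batch size per optimizer step is batch_size times gradient_accumulation, i.e. 8 * 4 = 32 for the file above:

import yaml  # PyYAML, assumed available

with open("GPT_SoVITS/configs/s1.yaml") as f:
    cfg = yaml.safe_load(f)

train = cfg["train"]
effective_batch = train["batch_size"] * train["gradient_accumulation"]  # 8 * 4 = 32
print(effective_batch, cfg["optimizer"]["warmup_steps"], cfg["optimizer"]["decay_steps"])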
GPT_SoVITS/configs/s1big.yaml
ADDED
@@ -0,0 +1,31 @@
train:
  seed: 1234
  epochs: 300
  batch_size: 8
  gradient_accumulation: 4
  save_every_n_epoch: 1
  precision: 16-mixed
  gradient_clip: 1.0
optimizer:
  lr: 0.01
  lr_init: 0.00001
  lr_end: 0.0001
  warmup_steps: 2000
  decay_steps: 40000
data:
  max_eval_sample: 8
  max_sec: 54
  num_workers: 1
  pad_val: 1024 # same with EOS in model
model:
  vocab_size: 1025
  phoneme_vocab_size: 512
  embedding_dim: 1024
  hidden_dim: 1024
  head: 16
  linear_units: 2048
  n_layer: 16
  dropout: 0
  EOS: 1024
inference:
  top_k: 5
GPT_SoVITS/configs/s1big2.yaml
ADDED
@@ -0,0 +1,31 @@
train:
  seed: 1234
  epochs: 300
  batch_size: 12
  gradient_accumulation: 4
  save_every_n_epoch: 1
  precision: 16-mixed
  gradient_clip: 1.0
optimizer:
  lr: 0.01
  lr_init: 0.00001
  lr_end: 0.0001
  warmup_steps: 2000
  decay_steps: 40000
data:
  max_eval_sample: 8
  max_sec: 54
  num_workers: 1
  pad_val: 1024 # same with EOS in model
model:
  vocab_size: 1025
  phoneme_vocab_size: 512
  embedding_dim: 1024
  hidden_dim: 1024
  head: 16
  linear_units: 2048
  n_layer: 6
  dropout: 0
  EOS: 1024
inference:
  top_k: 5
GPT_SoVITS/configs/s1longer-v2.yaml
ADDED
@@ -0,0 +1,31 @@
train:
  seed: 1234
  epochs: 20
  batch_size: 8
  save_every_n_epoch: 1
  precision: 16-mixed
  gradient_clip: 1.0
optimizer:
  lr: 0.01
  lr_init: 0.00001
  lr_end: 0.0001
  warmup_steps: 2000
  decay_steps: 40000
data:
  max_eval_sample: 8
  max_sec: 54
  num_workers: 4
  pad_val: 1024 # same with EOS in model
model:
  vocab_size: 1025
  phoneme_vocab_size: 732
  embedding_dim: 512
  hidden_dim: 512
  head: 16
  linear_units: 2048
  n_layer: 24
  dropout: 0
  EOS: 1024
  random_bert: 0
inference:
  top_k: 15
GPT_SoVITS/configs/s1longer.yaml
ADDED
@@ -0,0 +1,31 @@
train:
  seed: 1234
  epochs: 20
  batch_size: 8
  save_every_n_epoch: 1
  precision: 16-mixed
  gradient_clip: 1.0
optimizer:
  lr: 0.01
  lr_init: 0.00001
  lr_end: 0.0001
  warmup_steps: 2000
  decay_steps: 40000
data:
  max_eval_sample: 8
  max_sec: 54
  num_workers: 4
  pad_val: 1024 # same with EOS in model
model:
  vocab_size: 1025
  phoneme_vocab_size: 512
  embedding_dim: 512
  hidden_dim: 512
  head: 16
  linear_units: 2048
  n_layer: 24
  dropout: 0
  EOS: 1024
  random_bert: 0
inference:
  top_k: 5
GPT_SoVITS/configs/s1mq.yaml
ADDED
@@ -0,0 +1,77 @@
train:
  seed: 1234
  epochs: 100
  batch_size: 6
  gradient_accumulation: 4
  save_every_n_epoch: 1
  precision: 32
  gradient_clip: 1.0
optimizer:
  lr: 0.01
  lr_init: 0.00001
  lr_end: 0.0001
  warmup_steps: 2000
  decay_steps: 40000
data:
  max_eval_sample: 8
  max_sec: 40
  num_workers: 1
  pad_val: 1024 # same with EOS in model
model:
  saving_path: "ckpt/"
  resume_checkpoint: null
  vocoder_config_path: "quantizer/new_ckpt/config.json"
  vocoder_ckpt_path: "quantizer/new_ckpt/g_00600000"
  datadir: "/home/liweiche/GigaSpeech/wavs"
  metapath: "/home/liweiche/GigaSpeech/train2.json"
  val_metapath: "/home/liweiche/GigaSpeech/dev2.json"
  sampledir: "logs/"
  pretrained_path: null
  lr: 0.0001
  batch_size: 200.0
  train_bucket_size: 8192
  training_step: 800000
  optim_flat_percent: 0.0
  warmup_step: 50
  adam_beta1: 0.9
  adam_beta2: 0.98
  ffd_size: 3072
  hidden_size: 768
  enc_nlayers: 6
  dec_nlayers: 6
  nheads: 12
  ar_layer: 4
  ar_ffd_size: 1024
  ar_hidden_size: 256
  ar_nheads: 4
  aligner_softmax_temp: 1.0
  layer_norm_eps: 0.00001
  speaker_embed_dropout: 0.05
  label_smoothing: 0.0
  val_check_interval: 5000
  check_val_every_n_epoch: 1
  precision: "fp16"
  nworkers: 16
  distributed: true
  accelerator: "ddp"
  version: null
  accumulate_grad_batches: 1
  use_repetition_token: true
  use_repetition_gating: false
  repetition_penalty: 1.0
  sampling_temperature: 1.0
  top_k: -1
  min_top_k: 3
  top_p: 0.8
  sample_num: 4
  length_penalty_max_length: 15000
  length_penalty_max_prob: 0.95
  max_input_length: 2048
  max_output_length: 2000
  sample_rate: 16000
  n_codes: 1024
  n_cluster_groups: 1
  phone_context_window: 4
  phoneset_size: 1000
inference:
  top_k: 5
GPT_SoVITS/configs/s2.json
ADDED
@@ -0,0 +1,91 @@
{
  "train": {
    "log_interval": 100,
    "eval_interval": 500,
    "seed": 1234,
    "epochs": 100,
    "learning_rate": 0.0001,
    "betas": [
      0.8,
      0.99
    ],
    "eps": 1e-09,
    "batch_size": 32,
    "fp16_run": true,
    "lr_decay": 0.999875,
    "segment_size": 20480,
    "init_lr_ratio": 1,
    "warmup_epochs": 0,
    "c_mel": 45,
    "c_kl": 1.0,
    "text_low_lr_rate": 0.4,
    "grad_ckpt": false
  },
  "data": {
    "max_wav_value": 32768.0,
    "sampling_rate": 32000,
    "filter_length": 2048,
    "hop_length": 640,
    "win_length": 2048,
    "n_mel_channels": 128,
    "mel_fmin": 0.0,
    "mel_fmax": null,
    "add_blank": true,
    "n_speakers": 300,
    "cleaned_text": true
  },
  "model": {
    "inter_channels": 192,
    "hidden_channels": 192,
    "filter_channels": 768,
    "n_heads": 2,
    "n_layers": 6,
    "kernel_size": 3,
    "p_dropout": 0.1,
    "resblock": "1",
    "resblock_kernel_sizes": [
      3,
      7,
      11
    ],
    "resblock_dilation_sizes": [
      [
        1,
        3,
        5
      ],
      [
        1,
        3,
        5
      ],
      [
        1,
        3,
        5
      ]
    ],
    "upsample_rates": [
      10,
      8,
      2,
      2,
      2
    ],
    "upsample_initial_channel": 512,
    "upsample_kernel_sizes": [
      16,
      16,
      8,
      2,
      2
    ],
    "n_layers_q": 3,
    "use_spectral_norm": false,
    "gin_channels": 512,
    "semantic_frame_rate": "25hz",
    "freeze_quantizer": true
  },
  "s2_ckpt_dir": "logs/s2/big2k1",
  "content_module": "cnhubert"
}
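Note: the data block above fixes the spectrogram geometry of the stage-2 model. A short sketch of the implied frame timing, using only values that appear in the file (sampling_rate 32000, hop_length 640):

import json

with open("GPT_SoVITS/configs/s2.json") as f:
    h = json.load(f)

sr = h["data"]["sampling_rate"]   # 32000
hop = h["data"]["hop_length"]     # 640
print(sr / hop)                   # 50.0 spectrogram frames per second
print(1000 * hop / sr)            # 20.0 ms of audio per hop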
GPT_SoVITS/configs/train.yaml
ADDED
@@ -0,0 +1,32 @@
gpu:
  n_card: 1
  n_process_per_card: 2
io:
  text_path: D:\RVC1006\GPT-SoVITS\GPT_SoVITS
  save_every_n_epoch: 1
  precision: 16-mixed
  gradient_clip: 1.0
optimizer:
  lr: 0.01
  lr_init: 0.00001
  lr_end: 0.0001
  warmup_steps: 2000
  decay_steps: 40000
data:
  max_eval_sample: 8
  max_sec: 54
  num_workers: 1
  pad_val: 1024 # same with EOS in model
model:
  vocab_size: 1025
  phoneme_vocab_size: 512
  embedding_dim: 512
  hidden_dim: 512
  head: 16
  linear_units: 2048
  n_layer: 24
  dropout: 0
  EOS: 1024
  random_bert: 0
inference:
  top_k: 5
GPT_SoVITS/configs/tts_infer.yaml
ADDED
@@ -0,0 +1,32 @@
custom:
  bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
  cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
  device: cuda
  is_half: true
  t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
  version: v2
  vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
v1:
  bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
  cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
  device: cpu
  is_half: false
  t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
  version: v1
  vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth
v2:
  bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
  cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
  device: cpu
  is_half: false
  t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
  version: v2
  vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
v3:
  bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
  cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
  device: cpu
  is_half: false
  t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt
  version: v3
  vits_weights_path: GPT_SoVITS/pretrained_models/s2Gv3.pth
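Note: the file above groups one block of paths per model version plus a "custom" block. A minimal sketch of picking one block (assuming PyYAML; how the TTS pipeline itself reads this file is defined elsewhere in the repository):

import yaml  # PyYAML, assumed available

with open("GPT_SoVITS/configs/tts_infer.yaml") as f:
    tts_cfg = yaml.safe_load(f)

# "custom" mirrors one of the versioned presets; fall back to "v2" if it is absent.
preset = tts_cfg.get("custom", tts_cfg["v2"])
print(preset["device"], preset["is_half"], preset["t2s_weights_path"])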
GPT_SoVITS/f5_tts/model/__init__.py
ADDED
@@ -0,0 +1,13 @@
# from f5_tts.model.cfm import CFM
#
# from f5_tts.model.backbones.unett import UNetT
from GPT_SoVITS.f5_tts.model.backbones.dit import DiT
# from f5_tts.model.backbones.dit import DiTNoCond
# from f5_tts.model.backbones.dit import DiTNoCondNoT
# from f5_tts.model.backbones.mmdit import MMDiT

# from f5_tts.model.trainer import Trainer


# __all__ = ["CFM", "UNetT", "DiT", "MMDiT", "Trainer"]
# __all__ = ["CFM", "UNetT", "DiTNoCond","DiT", "MMDiT"]
GPT_SoVITS/f5_tts/model/backbones/README.md
ADDED
@@ -0,0 +1,20 @@
## Backbones quick introduction


### unett.py
- flat unet transformer
- structure same as in e2-tts & voicebox paper except using rotary pos emb
- update: allow possible abs pos emb & convnextv2 blocks for embedded text before concat

### dit.py
- adaln-zero dit
- embedded timestep as condition
- concatted noised_input + masked_cond + embedded_text, linear proj in
- possible abs pos emb & convnextv2 blocks for embedded text before concat
- possible long skip connection (first layer to last layer)

### mmdit.py
- sd3 structure
- timestep as condition
- left stream: text embedded and applied an abs pos emb
- right stream: masked_cond & noised_input concatted and with same conv pos emb as unett
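Note: the options listed for dit.py map directly onto the DiT constructor defined in the next file. A minimal instantiation sketch, assuming the repository and x_transformers are importable; the hyperparameter values here are illustrative, not the ones GPT-SoVITS ships with:

from GPT_SoVITS.f5_tts.model.backbones.dit import DiT

# Illustrative sizes only: conv_layers enables the convnextv2 text blocks,
# long_skip_connection adds the first-to-last layer skip described above.
model = DiT(
    dim=512,
    depth=8,
    heads=8,
    dim_head=64,
    mel_dim=100,
    text_dim=256,
    conv_layers=4,
    long_skip_connection=True,
)
print(sum(p.numel() for p in model.parameters()))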
GPT_SoVITS/f5_tts/model/backbones/dit.py
ADDED
@@ -0,0 +1,180 @@
"""
ein notation:
b - batch
n - sequence
nt - text sequence
nw - raw wave length
d - dimension
"""

from __future__ import annotations

import torch
from torch import nn
from torch.utils.checkpoint import checkpoint

from x_transformers.x_transformers import RotaryEmbedding

from GPT_SoVITS.f5_tts.model.modules import (
    TimestepEmbedding,
    ConvNeXtV2Block,
    ConvPositionEmbedding,
    DiTBlock,
    AdaLayerNormZero_Final,
    precompute_freqs_cis,
    get_pos_embed_indices,
)

from module.commons import sequence_mask


class TextEmbedding(nn.Module):
    def __init__(self, text_dim, conv_layers=0, conv_mult=2):
        super().__init__()
        if conv_layers > 0:
            self.extra_modeling = True
            self.precompute_max_pos = 4096  # ~44s of 24khz audio
            self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
            self.text_blocks = nn.Sequential(
                *[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
            )
        else:
            self.extra_modeling = False

    def forward(self, text: int["b nt"], seq_len, drop_text=False):  # noqa: F722
        batch, text_len = text.shape[0], text.shape[1]

        if drop_text:  # cfg for text
            text = torch.zeros_like(text)

        # possible extra modeling
        if self.extra_modeling:
            # sinus pos emb
            batch_start = torch.zeros((batch,), dtype=torch.long)
            pos_idx = get_pos_embed_indices(batch_start, seq_len, max_pos=self.precompute_max_pos)
            text_pos_embed = self.freqs_cis[pos_idx]

            # print(23333333,text.shape,text_pos_embed.shape)#torch.Size([7, 465, 256]) torch.Size([7, 465, 256])

            text = text + text_pos_embed

            # convnextv2 blocks
            text = self.text_blocks(text)

        return text


# noised input audio and context mixing embedding


class InputEmbedding(nn.Module):
    def __init__(self, mel_dim, text_dim, out_dim):
        super().__init__()
        self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
        self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)

    def forward(self, x: float["b n d"], cond: float["b n d"], text_embed: float["b n d"], drop_audio_cond=False):  # noqa: F722
        if drop_audio_cond:  # cfg for cond audio
            cond = torch.zeros_like(cond)

        x = self.proj(torch.cat((x, cond, text_embed), dim=-1))
        x = self.conv_pos_embed(x) + x
        return x


# Transformer backbone using DiT blocks


class DiT(nn.Module):
    def __init__(
        self,
        *,
        dim,
        depth=8,
        heads=8,
        dim_head=64,
        dropout=0.1,
        ff_mult=4,
        mel_dim=100,
        text_dim=None,
        conv_layers=0,
        long_skip_connection=False,
    ):
        super().__init__()

        self.time_embed = TimestepEmbedding(dim)
        self.d_embed = TimestepEmbedding(dim)
        if text_dim is None:
            text_dim = mel_dim
        self.text_embed = TextEmbedding(text_dim, conv_layers=conv_layers)
        self.input_embed = InputEmbedding(mel_dim, text_dim, dim)

        self.rotary_embed = RotaryEmbedding(dim_head)

        self.dim = dim
        self.depth = depth

        self.transformer_blocks = nn.ModuleList(
            [DiTBlock(dim=dim, heads=heads, dim_head=dim_head, ff_mult=ff_mult, dropout=dropout) for _ in range(depth)]
        )
        self.long_skip_connection = nn.Linear(dim * 2, dim, bias=False) if long_skip_connection else None

        self.norm_out = AdaLayerNormZero_Final(dim)  # final modulation
        self.proj_out = nn.Linear(dim, mel_dim)

    def ckpt_wrapper(self, module):
        # https://github.com/chuanyangjin/fast-DiT/blob/main/models.py
        def ckpt_forward(*inputs):
            outputs = module(*inputs)
            return outputs

        return ckpt_forward

    def forward(  # x, prompt_x, x_lens, t, style, cond
        self,  # d is channel, n is T
        x0: float["b n d"],  # noised input audio  # noqa: F722
        cond0: float["b n d"],  # masked cond audio  # noqa: F722
        x_lens,
        time: float["b"] | float[""],  # time step  # noqa: F821 F722
        dt_base_bootstrap,
        text0,  # : int["b nt"]  # noqa: F722  # condition feature
        use_grad_ckpt=False,  # bool
        # unused below
        drop_audio_cond=False,  # cfg for cond audio
        drop_text=False,  # cfg for text
        # mask: bool["b n"] | None = None,  # noqa: F722
    ):
        x = x0.transpose(2, 1)
        cond = cond0.transpose(2, 1)
        text = text0.transpose(2, 1)
        mask = sequence_mask(x_lens, max_length=x.size(1)).to(x.device)

        batch, seq_len = x.shape[0], x.shape[1]
        if time.ndim == 0:
            time = time.repeat(batch)

        # t: conditioning time, c: context (text + masked cond audio), x: noised input audio
        t = self.time_embed(time)
        dt = self.d_embed(dt_base_bootstrap)
        t += dt
        text_embed = self.text_embed(text, seq_len, drop_text=drop_text)  # need to change
        x = self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond)

        rope = self.rotary_embed.forward_from_seq_len(seq_len)

        if self.long_skip_connection is not None:
            residual = x

        for block in self.transformer_blocks:
            if use_grad_ckpt:
                x = checkpoint(self.ckpt_wrapper(block), x, t, mask, rope, use_reentrant=False)
            else:
                x = block(x, t, mask=mask, rope=rope)

        if self.long_skip_connection is not None:
            x = self.long_skip_connection(torch.cat((x, residual), dim=-1))

        x = self.norm_out(x, t)
        output = self.proj_out(x)

        return output
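Note: DiT.ckpt_wrapper above exists only to feed each transformer block through torch.utils.checkpoint so activations are recomputed in the backward pass instead of stored. A self-contained sketch of the same pattern with a toy block (the real DiTBlock also takes t, mask and rope):

import torch
from torch import nn
from torch.utils.checkpoint import checkpoint

# Toy stand-in for one transformer block.
block = nn.Sequential(nn.Linear(64, 64), nn.GELU(), nn.Linear(64, 64))

def ckpt_wrapper(module):
    # Same shape as DiT.ckpt_wrapper: re-run the module during backward
    # instead of keeping its intermediate activations in memory.
    def ckpt_forward(*inputs):
        return module(*inputs)
    return ckpt_forward

x = torch.randn(2, 16, 64, requires_grad=True)
y = checkpoint(ckpt_wrapper(block), x, use_reentrant=False)
y.sum().backward()  # activations inside `block` are recomputed here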
GPT_SoVITS/f5_tts/model/backbones/mmdit.py
ADDED
@@ -0,0 +1,146 @@
"""
ein notation:
b - batch
n - sequence
nt - text sequence
nw - raw wave length
d - dimension
"""

from __future__ import annotations

import torch
from torch import nn

from x_transformers.x_transformers import RotaryEmbedding

from f5_tts.model.modules import (
    TimestepEmbedding,
    ConvPositionEmbedding,
    MMDiTBlock,
    AdaLayerNormZero_Final,
    precompute_freqs_cis,
    get_pos_embed_indices,
)


# text embedding


class TextEmbedding(nn.Module):
    def __init__(self, out_dim, text_num_embeds):
        super().__init__()
        self.text_embed = nn.Embedding(text_num_embeds + 1, out_dim)  # will use 0 as filler token

        self.precompute_max_pos = 1024
        self.register_buffer("freqs_cis", precompute_freqs_cis(out_dim, self.precompute_max_pos), persistent=False)

    def forward(self, text: int["b nt"], drop_text=False) -> int["b nt d"]:  # noqa: F722
        text = text + 1
        if drop_text:
            text = torch.zeros_like(text)
        text = self.text_embed(text)

        # sinus pos emb
        batch_start = torch.zeros((text.shape[0],), dtype=torch.long)
        batch_text_len = text.shape[1]
        pos_idx = get_pos_embed_indices(batch_start, batch_text_len, max_pos=self.precompute_max_pos)
        text_pos_embed = self.freqs_cis[pos_idx]

        text = text + text_pos_embed

        return text


# noised input & masked cond audio embedding


class AudioEmbedding(nn.Module):
    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.linear = nn.Linear(2 * in_dim, out_dim)
        self.conv_pos_embed = ConvPositionEmbedding(out_dim)

    def forward(self, x: float["b n d"], cond: float["b n d"], drop_audio_cond=False):  # noqa: F722
        if drop_audio_cond:
            cond = torch.zeros_like(cond)
        x = torch.cat((x, cond), dim=-1)
        x = self.linear(x)
        x = self.conv_pos_embed(x) + x
        return x


# Transformer backbone using MM-DiT blocks


class MMDiT(nn.Module):
    def __init__(
        self,
        *,
        dim,
        depth=8,
        heads=8,
        dim_head=64,
        dropout=0.1,
        ff_mult=4,
        text_num_embeds=256,
        mel_dim=100,
    ):
        super().__init__()

        self.time_embed = TimestepEmbedding(dim)
        self.text_embed = TextEmbedding(dim, text_num_embeds)
        self.audio_embed = AudioEmbedding(mel_dim, dim)

        self.rotary_embed = RotaryEmbedding(dim_head)

        self.dim = dim
        self.depth = depth

        self.transformer_blocks = nn.ModuleList(
            [
                MMDiTBlock(
                    dim=dim,
                    heads=heads,
                    dim_head=dim_head,
                    dropout=dropout,
                    ff_mult=ff_mult,
                    context_pre_only=i == depth - 1,
                )
                for i in range(depth)
            ]
        )
        self.norm_out = AdaLayerNormZero_Final(dim)  # final modulation
        self.proj_out = nn.Linear(dim, mel_dim)

    def forward(
        self,
        x: float["b n d"],  # noised input audio  # noqa: F722
        cond: float["b n d"],  # masked cond audio  # noqa: F722
        text: int["b nt"],  # text  # noqa: F722
        time: float["b"] | float[""],  # time step  # noqa: F821 F722
        drop_audio_cond,  # cfg for cond audio
        drop_text,  # cfg for text
        mask: bool["b n"] | None = None,  # noqa: F722
    ):
        batch = x.shape[0]
        if time.ndim == 0:
            time = time.repeat(batch)

        # t: conditioning (time), c: context (text + masked cond audio), x: noised input audio
        t = self.time_embed(time)
        c = self.text_embed(text, drop_text=drop_text)
        x = self.audio_embed(x, cond, drop_audio_cond=drop_audio_cond)

        seq_len = x.shape[1]
        text_len = text.shape[1]
        rope_audio = self.rotary_embed.forward_from_seq_len(seq_len)
        rope_text = self.rotary_embed.forward_from_seq_len(text_len)

        for block in self.transformer_blocks:
            c, x = block(x, c, t, mask=mask, rope=rope_audio, c_rope=rope_text)

        x = self.norm_out(x, t)
        output = self.proj_out(x)

        return output
GPT_SoVITS/f5_tts/model/backbones/unett.py
ADDED
@@ -0,0 +1,219 @@
"""
ein notation:
b - batch
n - sequence
nt - text sequence
nw - raw wave length
d - dimension
"""

from __future__ import annotations
from typing import Literal

import torch
from torch import nn
import torch.nn.functional as F

from x_transformers import RMSNorm
from x_transformers.x_transformers import RotaryEmbedding

from f5_tts.model.modules import (
    TimestepEmbedding,
    ConvNeXtV2Block,
    ConvPositionEmbedding,
    Attention,
    AttnProcessor,
    FeedForward,
    precompute_freqs_cis,
    get_pos_embed_indices,
)


# Text embedding


class TextEmbedding(nn.Module):
    def __init__(self, text_num_embeds, text_dim, conv_layers=0, conv_mult=2):
        super().__init__()
        self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim)  # use 0 as filler token

        if conv_layers > 0:
            self.extra_modeling = True
            self.precompute_max_pos = 4096  # ~44s of 24khz audio
            self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
            self.text_blocks = nn.Sequential(
                *[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
            )
        else:
            self.extra_modeling = False

    def forward(self, text: int["b nt"], seq_len, drop_text=False):  # noqa: F722
        text = text + 1  # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
        text = text[:, :seq_len]  # curtail if character tokens are more than the mel spec tokens
        batch, text_len = text.shape[0], text.shape[1]
        text = F.pad(text, (0, seq_len - text_len), value=0)

        if drop_text:  # cfg for text
            text = torch.zeros_like(text)

        text = self.text_embed(text)  # b n -> b n d

        # possible extra modeling
        if self.extra_modeling:
            # sinus pos emb
            batch_start = torch.zeros((batch,), dtype=torch.long)
            pos_idx = get_pos_embed_indices(batch_start, seq_len, max_pos=self.precompute_max_pos)
            text_pos_embed = self.freqs_cis[pos_idx]
            text = text + text_pos_embed

            # convnextv2 blocks
            text = self.text_blocks(text)

        return text


# noised input audio and context mixing embedding


class InputEmbedding(nn.Module):
    def __init__(self, mel_dim, text_dim, out_dim):
        super().__init__()
        self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
        self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)

    def forward(self, x: float["b n d"], cond: float["b n d"], text_embed: float["b n d"], drop_audio_cond=False):  # noqa: F722
        if drop_audio_cond:  # cfg for cond audio
            cond = torch.zeros_like(cond)

        x = self.proj(torch.cat((x, cond, text_embed), dim=-1))
        x = self.conv_pos_embed(x) + x
        return x


# Flat UNet Transformer backbone


class UNetT(nn.Module):
    def __init__(
        self,
        *,
        dim,
        depth=8,
        heads=8,
        dim_head=64,
        dropout=0.1,
        ff_mult=4,
        mel_dim=100,
        text_num_embeds=256,
        text_dim=None,
        conv_layers=0,
        skip_connect_type: Literal["add", "concat", "none"] = "concat",
    ):
        super().__init__()
        assert depth % 2 == 0, "UNet-Transformer's depth should be even."

        self.time_embed = TimestepEmbedding(dim)
        if text_dim is None:
            text_dim = mel_dim
        self.text_embed = TextEmbedding(text_num_embeds, text_dim, conv_layers=conv_layers)
        self.input_embed = InputEmbedding(mel_dim, text_dim, dim)

        self.rotary_embed = RotaryEmbedding(dim_head)

        # transformer layers & skip connections

        self.dim = dim
        self.skip_connect_type = skip_connect_type
        needs_skip_proj = skip_connect_type == "concat"

        self.depth = depth
        self.layers = nn.ModuleList([])

        for idx in range(depth):
            is_later_half = idx >= (depth // 2)

            attn_norm = RMSNorm(dim)
            attn = Attention(
                processor=AttnProcessor(),
                dim=dim,
                heads=heads,
                dim_head=dim_head,
                dropout=dropout,
            )

            ff_norm = RMSNorm(dim)
            ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")

            skip_proj = nn.Linear(dim * 2, dim, bias=False) if needs_skip_proj and is_later_half else None

            self.layers.append(
                nn.ModuleList(
                    [
                        skip_proj,
                        attn_norm,
                        attn,
                        ff_norm,
                        ff,
                    ]
                )
            )

        self.norm_out = RMSNorm(dim)
        self.proj_out = nn.Linear(dim, mel_dim)

    def forward(
        self,
        x: float["b n d"],  # noised input audio  # noqa: F722
        cond: float["b n d"],  # masked cond audio  # noqa: F722
        text: int["b nt"],  # text  # noqa: F722
        time: float["b"] | float[""],  # time step  # noqa: F821 F722
        drop_audio_cond,  # cfg for cond audio
        drop_text,  # cfg for text
        mask: bool["b n"] | None = None,  # noqa: F722
    ):
        batch, seq_len = x.shape[0], x.shape[1]
        if time.ndim == 0:
            time = time.repeat(batch)

        # t: conditioning time, c: context (text + masked cond audio), x: noised input audio
        t = self.time_embed(time)
        text_embed = self.text_embed(text, seq_len, drop_text=drop_text)
        x = self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond)

        # postfix time t to input x, [b n d] -> [b n+1 d]
        x = torch.cat([t.unsqueeze(1), x], dim=1)  # pack t to x
        if mask is not None:
            mask = F.pad(mask, (1, 0), value=1)

        rope = self.rotary_embed.forward_from_seq_len(seq_len + 1)

        # flat unet transformer
        skip_connect_type = self.skip_connect_type
        skips = []
        for idx, (maybe_skip_proj, attn_norm, attn, ff_norm, ff) in enumerate(self.layers):
            layer = idx + 1

            # skip connection logic
            is_first_half = layer <= (self.depth // 2)
            is_later_half = not is_first_half

            if is_first_half:
                skips.append(x)

            if is_later_half:
                skip = skips.pop()
                if skip_connect_type == "concat":
                    x = torch.cat((x, skip), dim=-1)
                    x = maybe_skip_proj(x)
                elif skip_connect_type == "add":
                    x = x + skip

            # attention and feedforward blocks
            x = attn(attn_norm(x), rope=rope, mask=mask) + x
            x = ff(ff_norm(x)) + x

        assert len(skips) == 0

        x = self.norm_out(x)[:, 1:, :]  # unpack t from x

        return self.proj_out(x)
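Note: UNetT's "concat" skip connection pairs each later-half layer with its mirrored first-half layer and projects the concatenation back to dim, while "add" simply sums them. A tiny self-contained sketch of both variants (shapes are illustrative):

import torch
from torch import nn

dim = 64
skip_proj = nn.Linear(dim * 2, dim, bias=False)  # same shape as UNetT's "concat" projection

x_first_half = torch.randn(2, 10, dim)   # activation pushed onto `skips` in the first half
x_later_half = torch.randn(2, 10, dim)   # activation in the mirrored later-half layer

merged = skip_proj(torch.cat((x_later_half, x_first_half), dim=-1))  # "concat" variant
added = x_later_half + x_first_half                                   # "add" variant
print(merged.shape, added.shape)  # both stay (2, 10, 64)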
GPT_SoVITS/f5_tts/model/modules.py
ADDED
@@ -0,0 +1,666 @@
"""
ein notation:
b - batch
n - sequence
nt - text sequence
nw - raw wave length
d - dimension
"""

from __future__ import annotations

import math
from typing import Optional

import torch
import torch.nn.functional as F
import torchaudio
from librosa.filters import mel as librosa_mel_fn
from torch import nn
from x_transformers.x_transformers import apply_rotary_pos_emb


# raw wav to mel spec


mel_basis_cache = {}
hann_window_cache = {}


def get_bigvgan_mel_spectrogram(
    waveform,
    n_fft=1024,
    n_mel_channels=100,
    target_sample_rate=24000,
    hop_length=256,
    win_length=1024,
    fmin=0,
    fmax=None,
    center=False,
):  # Copy from https://github.com/NVIDIA/BigVGAN/tree/main
    device = waveform.device
    key = f"{n_fft}_{n_mel_channels}_{target_sample_rate}_{hop_length}_{win_length}_{fmin}_{fmax}_{device}"

    if key not in mel_basis_cache:
        mel = librosa_mel_fn(sr=target_sample_rate, n_fft=n_fft, n_mels=n_mel_channels, fmin=fmin, fmax=fmax)
        mel_basis_cache[key] = torch.from_numpy(mel).float().to(device)  # TODO: why they need .float()?
        hann_window_cache[key] = torch.hann_window(win_length).to(device)

    mel_basis = mel_basis_cache[key]
    hann_window = hann_window_cache[key]

    padding = (n_fft - hop_length) // 2
    waveform = torch.nn.functional.pad(waveform.unsqueeze(1), (padding, padding), mode="reflect").squeeze(1)

    spec = torch.stft(
        waveform,
        n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=hann_window,
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=True,
    )
    spec = torch.sqrt(torch.view_as_real(spec).pow(2).sum(-1) + 1e-9)

    mel_spec = torch.matmul(mel_basis, spec)
    mel_spec = torch.log(torch.clamp(mel_spec, min=1e-5))

    return mel_spec


def get_vocos_mel_spectrogram(
    waveform,
    n_fft=1024,
    n_mel_channels=100,
    target_sample_rate=24000,
    hop_length=256,
    win_length=1024,
):
    mel_stft = torchaudio.transforms.MelSpectrogram(
        sample_rate=target_sample_rate,
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
        n_mels=n_mel_channels,
        power=1,
        center=True,
        normalized=False,
        norm=None,
    ).to(waveform.device)
    if len(waveform.shape) == 3:
        waveform = waveform.squeeze(1)  # 'b 1 nw -> b nw'

    assert len(waveform.shape) == 2

    mel = mel_stft(waveform)
    mel = mel.clamp(min=1e-5).log()
    return mel


class MelSpec(nn.Module):
    def __init__(
        self,
        n_fft=1024,
        hop_length=256,
        win_length=1024,
        n_mel_channels=100,
        target_sample_rate=24_000,
        mel_spec_type="vocos",
    ):
        super().__init__()
        assert mel_spec_type in ["vocos", "bigvgan"], print("We only support two extract mel backend: vocos or bigvgan")

        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.n_mel_channels = n_mel_channels
        self.target_sample_rate = target_sample_rate

        if mel_spec_type == "vocos":
            self.extractor = get_vocos_mel_spectrogram
        elif mel_spec_type == "bigvgan":
            self.extractor = get_bigvgan_mel_spectrogram

        self.register_buffer("dummy", torch.tensor(0), persistent=False)

    def forward(self, wav):
        if self.dummy.device != wav.device:
            self.to(wav.device)

        mel = self.extractor(
            waveform=wav,
            n_fft=self.n_fft,
            n_mel_channels=self.n_mel_channels,
            target_sample_rate=self.target_sample_rate,
            hop_length=self.hop_length,
            win_length=self.win_length,
        )

        return mel


# sinusoidal position embedding


class SinusPositionEmbedding(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x, scale=1000):
        device = x.device
        half_dim = self.dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb)
        emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
        return emb


# convolutional position embedding


class ConvPositionEmbedding(nn.Module):
    def __init__(self, dim, kernel_size=31, groups=16):
        super().__init__()
        assert kernel_size % 2 != 0
        self.conv1d = nn.Sequential(
            nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=kernel_size // 2),
|
173 |
+
nn.Mish(),
|
174 |
+
nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=kernel_size // 2),
|
175 |
+
nn.Mish(),
|
176 |
+
)
|
177 |
+
|
178 |
+
def forward(self, x: float["b n d"], mask: bool["b n"] | None = None): # noqa: F722
|
179 |
+
if mask is not None:
|
180 |
+
mask = mask[..., None]
|
181 |
+
x = x.masked_fill(~mask, 0.0)
|
182 |
+
|
183 |
+
x = x.permute(0, 2, 1)
|
184 |
+
x = self.conv1d(x)
|
185 |
+
out = x.permute(0, 2, 1)
|
186 |
+
|
187 |
+
if mask is not None:
|
188 |
+
out = out.masked_fill(~mask, 0.0)
|
189 |
+
|
190 |
+
return out
|
191 |
+
|
192 |
+
|
193 |
+
# rotary positional embedding related
|
194 |
+
|
195 |
+
|
196 |
+
def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, theta_rescale_factor=1.0):
|
197 |
+
# proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
|
198 |
+
# has some connection to NTK literature
|
199 |
+
# https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
|
200 |
+
# https://github.com/lucidrains/rotary-embedding-torch/blob/main/rotary_embedding_torch/rotary_embedding_torch.py
|
201 |
+
theta *= theta_rescale_factor ** (dim / (dim - 2))
|
202 |
+
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
|
203 |
+
t = torch.arange(end, device=freqs.device) # type: ignore
|
204 |
+
freqs = torch.outer(t, freqs).float() # type: ignore
|
205 |
+
freqs_cos = torch.cos(freqs) # real part
|
206 |
+
freqs_sin = torch.sin(freqs) # imaginary part
|
207 |
+
return torch.cat([freqs_cos, freqs_sin], dim=-1)
|
208 |
+
|
209 |
+
|
210 |
+
def get_pos_embed_indices(start, length, max_pos, scale=1.0):
|
211 |
+
# length = length if isinstance(length, int) else length.max()
|
212 |
+
scale = scale * torch.ones_like(start, dtype=torch.float32) # in case scale is a scalar
|
213 |
+
pos = (
|
214 |
+
start.unsqueeze(1)
|
215 |
+
+ (torch.arange(length, device=start.device, dtype=torch.float32).unsqueeze(0) * scale.unsqueeze(1)).long()
|
216 |
+
)
|
217 |
+
# avoid extra long error.
|
218 |
+
pos = torch.where(pos < max_pos, pos, max_pos - 1)
|
219 |
+
return pos
|
220 |
+
|
221 |
+
|
222 |
+
# Global Response Normalization layer (Instance Normalization ?)
|
223 |
+
|
224 |
+
|
225 |
+
class GRN(nn.Module):
|
226 |
+
def __init__(self, dim):
|
227 |
+
super().__init__()
|
228 |
+
self.gamma = nn.Parameter(torch.zeros(1, 1, dim))
|
229 |
+
self.beta = nn.Parameter(torch.zeros(1, 1, dim))
|
230 |
+
|
231 |
+
def forward(self, x):
|
232 |
+
Gx = torch.norm(x, p=2, dim=1, keepdim=True)
|
233 |
+
Nx = Gx / (Gx.mean(dim=-1, keepdim=True) + 1e-6)
|
234 |
+
return self.gamma * (x * Nx) + self.beta + x
|
235 |
+
|
236 |
+
|
237 |
+
# ConvNeXt-V2 Block https://github.com/facebookresearch/ConvNeXt-V2/blob/main/models/convnextv2.py
|
238 |
+
# ref: https://github.com/bfs18/e2_tts/blob/main/rfwave/modules.py#L108
|
239 |
+
|
240 |
+
|
241 |
+
class ConvNeXtV2Block(nn.Module):
|
242 |
+
def __init__(
|
243 |
+
self,
|
244 |
+
dim: int,
|
245 |
+
intermediate_dim: int,
|
246 |
+
dilation: int = 1,
|
247 |
+
):
|
248 |
+
super().__init__()
|
249 |
+
padding = (dilation * (7 - 1)) // 2
|
250 |
+
self.dwconv = nn.Conv1d(
|
251 |
+
dim, dim, kernel_size=7, padding=padding, groups=dim, dilation=dilation
|
252 |
+
) # depthwise conv
|
253 |
+
self.norm = nn.LayerNorm(dim, eps=1e-6)
|
254 |
+
self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers
|
255 |
+
self.act = nn.GELU()
|
256 |
+
self.grn = GRN(intermediate_dim)
|
257 |
+
self.pwconv2 = nn.Linear(intermediate_dim, dim)
|
258 |
+
|
259 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
260 |
+
residual = x
|
261 |
+
x = x.transpose(1, 2) # b n d -> b d n
|
262 |
+
x = self.dwconv(x)
|
263 |
+
x = x.transpose(1, 2) # b d n -> b n d
|
264 |
+
x = self.norm(x)
|
265 |
+
x = self.pwconv1(x)
|
266 |
+
x = self.act(x)
|
267 |
+
x = self.grn(x)
|
268 |
+
x = self.pwconv2(x)
|
269 |
+
return residual + x
|
270 |
+
|
271 |
+
|
272 |
+
# AdaLayerNormZero
|
273 |
+
# return with modulated x for attn input, and params for later mlp modulation
|
274 |
+
|
275 |
+
|
276 |
+
class AdaLayerNormZero(nn.Module):
|
277 |
+
def __init__(self, dim):
|
278 |
+
super().__init__()
|
279 |
+
|
280 |
+
self.silu = nn.SiLU()
|
281 |
+
self.linear = nn.Linear(dim, dim * 6)
|
282 |
+
|
283 |
+
self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
284 |
+
|
285 |
+
def forward(self, x, emb=None):
|
286 |
+
emb = self.linear(self.silu(emb))
|
287 |
+
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = torch.chunk(emb, 6, dim=1)
|
288 |
+
|
289 |
+
x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
|
290 |
+
return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
|
291 |
+
|
292 |
+
|
293 |
+
# AdaLayerNormZero for final layer
|
294 |
+
# return only with modulated x for attn input, cuz no more mlp modulation
|
295 |
+
|
296 |
+
|
297 |
+
class AdaLayerNormZero_Final(nn.Module):
|
298 |
+
def __init__(self, dim):
|
299 |
+
super().__init__()
|
300 |
+
|
301 |
+
self.silu = nn.SiLU()
|
302 |
+
self.linear = nn.Linear(dim, dim * 2)
|
303 |
+
|
304 |
+
self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
305 |
+
|
306 |
+
def forward(self, x, emb):
|
307 |
+
emb = self.linear(self.silu(emb))
|
308 |
+
scale, shift = torch.chunk(emb, 2, dim=1)
|
309 |
+
|
310 |
+
x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
|
311 |
+
return x
|
312 |
+
|
313 |
+
|
314 |
+
# FeedForward
|
315 |
+
|
316 |
+
|
317 |
+
class FeedForward(nn.Module):
|
318 |
+
def __init__(self, dim, dim_out=None, mult=4, dropout=0.0, approximate: str = "none"):
|
319 |
+
super().__init__()
|
320 |
+
inner_dim = int(dim * mult)
|
321 |
+
dim_out = dim_out if dim_out is not None else dim
|
322 |
+
|
323 |
+
activation = nn.GELU(approximate=approximate)
|
324 |
+
project_in = nn.Sequential(nn.Linear(dim, inner_dim), activation)
|
325 |
+
self.ff = nn.Sequential(project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out))
|
326 |
+
|
327 |
+
def forward(self, x):
|
328 |
+
return self.ff(x)
|
329 |
+
|
330 |
+
|
331 |
+
# Attention with possible joint part
|
332 |
+
# modified from diffusers/src/diffusers/models/attention_processor.py
|
333 |
+
|
334 |
+
|
335 |
+
class Attention(nn.Module):
|
336 |
+
def __init__(
|
337 |
+
self,
|
338 |
+
processor: JointAttnProcessor | AttnProcessor,
|
339 |
+
dim: int,
|
340 |
+
heads: int = 8,
|
341 |
+
dim_head: int = 64,
|
342 |
+
dropout: float = 0.0,
|
343 |
+
context_dim: Optional[int] = None, # if not None -> joint attention
|
344 |
+
context_pre_only=None,
|
345 |
+
):
|
346 |
+
super().__init__()
|
347 |
+
|
348 |
+
if not hasattr(F, "scaled_dot_product_attention"):
|
349 |
+
raise ImportError("Attention equires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
|
350 |
+
|
351 |
+
self.processor = processor
|
352 |
+
|
353 |
+
self.dim = dim
|
354 |
+
self.heads = heads
|
355 |
+
self.inner_dim = dim_head * heads
|
356 |
+
self.dropout = dropout
|
357 |
+
|
358 |
+
self.context_dim = context_dim
|
359 |
+
self.context_pre_only = context_pre_only
|
360 |
+
|
361 |
+
self.to_q = nn.Linear(dim, self.inner_dim)
|
362 |
+
self.to_k = nn.Linear(dim, self.inner_dim)
|
363 |
+
self.to_v = nn.Linear(dim, self.inner_dim)
|
364 |
+
|
365 |
+
if self.context_dim is not None:
|
366 |
+
self.to_k_c = nn.Linear(context_dim, self.inner_dim)
|
367 |
+
self.to_v_c = nn.Linear(context_dim, self.inner_dim)
|
368 |
+
if self.context_pre_only is not None:
|
369 |
+
self.to_q_c = nn.Linear(context_dim, self.inner_dim)
|
370 |
+
|
371 |
+
self.to_out = nn.ModuleList([])
|
372 |
+
self.to_out.append(nn.Linear(self.inner_dim, dim))
|
373 |
+
self.to_out.append(nn.Dropout(dropout))
|
374 |
+
|
375 |
+
if self.context_pre_only is not None and not self.context_pre_only:
|
376 |
+
self.to_out_c = nn.Linear(self.inner_dim, dim)
|
377 |
+
|
378 |
+
def forward(
|
379 |
+
self,
|
380 |
+
x: float["b n d"], # noised input x # noqa: F722
|
381 |
+
c: float["b n d"] = None, # context c # noqa: F722
|
382 |
+
mask: bool["b n"] | None = None, # noqa: F722
|
383 |
+
rope=None, # rotary position embedding for x
|
384 |
+
c_rope=None, # rotary position embedding for c
|
385 |
+
) -> torch.Tensor:
|
386 |
+
if c is not None:
|
387 |
+
return self.processor(self, x, c=c, mask=mask, rope=rope, c_rope=c_rope)
|
388 |
+
else:
|
389 |
+
return self.processor(self, x, mask=mask, rope=rope)
|
390 |
+
|
391 |
+
|
392 |
+
# Attention processor
|
393 |
+
|
394 |
+
|
395 |
+
# from torch.nn.attention import SDPBackend
|
396 |
+
# torch.backends.cuda.enable_flash_sdp(True)
|
397 |
+
class AttnProcessor:
|
398 |
+
def __init__(self):
|
399 |
+
pass
|
400 |
+
|
401 |
+
def __call__(
|
402 |
+
self,
|
403 |
+
attn: Attention,
|
404 |
+
x: float["b n d"], # noised input x # noqa: F722
|
405 |
+
mask: bool["b n"] | None = None, # noqa: F722
|
406 |
+
rope=None, # rotary position embedding
|
407 |
+
) -> torch.FloatTensor:
|
408 |
+
batch_size = x.shape[0]
|
409 |
+
|
410 |
+
# `sample` projections.
|
411 |
+
query = attn.to_q(x)
|
412 |
+
key = attn.to_k(x)
|
413 |
+
value = attn.to_v(x)
|
414 |
+
|
415 |
+
# apply rotary position embedding
|
416 |
+
if rope is not None:
|
417 |
+
freqs, xpos_scale = rope
|
418 |
+
q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
|
419 |
+
|
420 |
+
query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
|
421 |
+
key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
|
422 |
+
|
423 |
+
# attention
|
424 |
+
inner_dim = key.shape[-1]
|
425 |
+
head_dim = inner_dim // attn.heads
|
426 |
+
query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
427 |
+
key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
428 |
+
value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
429 |
+
|
430 |
+
# mask. e.g. inference got a batch with different target durations, mask out the padding
|
431 |
+
if mask is not None:
|
432 |
+
attn_mask = mask
|
433 |
+
attn_mask = attn_mask.unsqueeze(1).unsqueeze(1) # 'b n -> b 1 1 n'
|
434 |
+
# print(3433333333,attn_mask.shape)
|
435 |
+
attn_mask = attn_mask.expand(batch_size, attn.heads, query.shape[-2], key.shape[-2])
|
436 |
+
else:
|
437 |
+
attn_mask = None
|
438 |
+
# with torch.nn.attention.sdpa_kernel(backends=[SDPBackend.EFFICIENT_ATTENTION]):
|
439 |
+
# with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=True):
|
440 |
+
# with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=False):
|
441 |
+
# print(torch.backends.cuda.flash_sdp_enabled())
|
442 |
+
# print(torch.backends.cuda.mem_efficient_sdp_enabled())
|
443 |
+
# print(torch.backends.cuda.math_sdp_enabled())
|
444 |
+
x = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
|
445 |
+
x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
|
446 |
+
x = x.to(query.dtype)
|
447 |
+
|
448 |
+
# linear proj
|
449 |
+
x = attn.to_out[0](x)
|
450 |
+
# dropout
|
451 |
+
x = attn.to_out[1](x)
|
452 |
+
|
453 |
+
if mask is not None:
|
454 |
+
mask = mask.unsqueeze(-1)
|
455 |
+
x = x.masked_fill(~mask, 0.0)
|
456 |
+
|
457 |
+
return x
|
458 |
+
|
459 |
+
|
460 |
+
# Joint Attention processor for MM-DiT
|
461 |
+
# modified from diffusers/src/diffusers/models/attention_processor.py
|
462 |
+
|
463 |
+
|
464 |
+
class JointAttnProcessor:
|
465 |
+
def __init__(self):
|
466 |
+
pass
|
467 |
+
|
468 |
+
def __call__(
|
469 |
+
self,
|
470 |
+
attn: Attention,
|
471 |
+
x: float["b n d"], # noised input x # noqa: F722
|
472 |
+
c: float["b nt d"] = None, # context c, here text # noqa: F722
|
473 |
+
mask: bool["b n"] | None = None, # noqa: F722
|
474 |
+
rope=None, # rotary position embedding for x
|
475 |
+
c_rope=None, # rotary position embedding for c
|
476 |
+
) -> torch.FloatTensor:
|
477 |
+
residual = x
|
478 |
+
|
479 |
+
batch_size = c.shape[0]
|
480 |
+
|
481 |
+
# `sample` projections.
|
482 |
+
query = attn.to_q(x)
|
483 |
+
key = attn.to_k(x)
|
484 |
+
value = attn.to_v(x)
|
485 |
+
|
486 |
+
# `context` projections.
|
487 |
+
c_query = attn.to_q_c(c)
|
488 |
+
c_key = attn.to_k_c(c)
|
489 |
+
c_value = attn.to_v_c(c)
|
490 |
+
|
491 |
+
# apply rope for context and noised input independently
|
492 |
+
if rope is not None:
|
493 |
+
freqs, xpos_scale = rope
|
494 |
+
q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
|
495 |
+
query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
|
496 |
+
key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
|
497 |
+
if c_rope is not None:
|
498 |
+
freqs, xpos_scale = c_rope
|
499 |
+
q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
|
500 |
+
c_query = apply_rotary_pos_emb(c_query, freqs, q_xpos_scale)
|
501 |
+
c_key = apply_rotary_pos_emb(c_key, freqs, k_xpos_scale)
|
502 |
+
|
503 |
+
# attention
|
504 |
+
query = torch.cat([query, c_query], dim=1)
|
505 |
+
key = torch.cat([key, c_key], dim=1)
|
506 |
+
value = torch.cat([value, c_value], dim=1)
|
507 |
+
|
508 |
+
inner_dim = key.shape[-1]
|
509 |
+
head_dim = inner_dim // attn.heads
|
510 |
+
query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
511 |
+
key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
512 |
+
value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
513 |
+
|
514 |
+
# mask. e.g. inference got a batch with different target durations, mask out the padding
|
515 |
+
if mask is not None:
|
516 |
+
attn_mask = F.pad(mask, (0, c.shape[1]), value=True) # no mask for c (text)
|
517 |
+
attn_mask = attn_mask.unsqueeze(1).unsqueeze(1) # 'b n -> b 1 1 n'
|
518 |
+
attn_mask = attn_mask.expand(batch_size, attn.heads, query.shape[-2], key.shape[-2])
|
519 |
+
else:
|
520 |
+
attn_mask = None
|
521 |
+
|
522 |
+
x = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
|
523 |
+
x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
|
524 |
+
x = x.to(query.dtype)
|
525 |
+
|
526 |
+
# Split the attention outputs.
|
527 |
+
x, c = (
|
528 |
+
x[:, : residual.shape[1]],
|
529 |
+
x[:, residual.shape[1] :],
|
530 |
+
)
|
531 |
+
|
532 |
+
# linear proj
|
533 |
+
x = attn.to_out[0](x)
|
534 |
+
# dropout
|
535 |
+
x = attn.to_out[1](x)
|
536 |
+
if not attn.context_pre_only:
|
537 |
+
c = attn.to_out_c(c)
|
538 |
+
|
539 |
+
if mask is not None:
|
540 |
+
mask = mask.unsqueeze(-1)
|
541 |
+
x = x.masked_fill(~mask, 0.0)
|
542 |
+
# c = c.masked_fill(~mask, 0.) # no mask for c (text)
|
543 |
+
|
544 |
+
return x, c
|
545 |
+
|
546 |
+
|
547 |
+
# DiT Block
|
548 |
+
|
549 |
+
|
550 |
+
class DiTBlock(nn.Module):
|
551 |
+
def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1):
|
552 |
+
super().__init__()
|
553 |
+
|
554 |
+
self.attn_norm = AdaLayerNormZero(dim)
|
555 |
+
self.attn = Attention(
|
556 |
+
processor=AttnProcessor(),
|
557 |
+
dim=dim,
|
558 |
+
heads=heads,
|
559 |
+
dim_head=dim_head,
|
560 |
+
dropout=dropout,
|
561 |
+
)
|
562 |
+
|
563 |
+
self.ff_norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
564 |
+
self.ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
|
565 |
+
|
566 |
+
def forward(self, x, t, mask=None, rope=None): # x: noised input, t: time embedding
|
567 |
+
# pre-norm & modulation for attention input
|
568 |
+
norm, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.attn_norm(x, emb=t)
|
569 |
+
|
570 |
+
# attention
|
571 |
+
attn_output = self.attn(x=norm, mask=mask, rope=rope)
|
572 |
+
|
573 |
+
# process attention output for input x
|
574 |
+
x = x + gate_msa.unsqueeze(1) * attn_output
|
575 |
+
|
576 |
+
norm = self.ff_norm(x) * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
|
577 |
+
ff_output = self.ff(norm)
|
578 |
+
x = x + gate_mlp.unsqueeze(1) * ff_output
|
579 |
+
|
580 |
+
return x
|
581 |
+
|
582 |
+
|
583 |
+
# MMDiT Block https://arxiv.org/abs/2403.03206
|
584 |
+
|
585 |
+
|
586 |
+
class MMDiTBlock(nn.Module):
|
587 |
+
r"""
|
588 |
+
modified from diffusers/src/diffusers/models/attention.py
|
589 |
+
|
590 |
+
notes.
|
591 |
+
_c: context related. text, cond, etc. (left part in sd3 fig2.b)
|
592 |
+
_x: noised input related. (right part)
|
593 |
+
context_pre_only: last layer only do prenorm + modulation cuz no more ffn
|
594 |
+
"""
|
595 |
+
|
596 |
+
def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1, context_pre_only=False):
|
597 |
+
super().__init__()
|
598 |
+
|
599 |
+
self.context_pre_only = context_pre_only
|
600 |
+
|
601 |
+
self.attn_norm_c = AdaLayerNormZero_Final(dim) if context_pre_only else AdaLayerNormZero(dim)
|
602 |
+
self.attn_norm_x = AdaLayerNormZero(dim)
|
603 |
+
self.attn = Attention(
|
604 |
+
processor=JointAttnProcessor(),
|
605 |
+
dim=dim,
|
606 |
+
heads=heads,
|
607 |
+
dim_head=dim_head,
|
608 |
+
dropout=dropout,
|
609 |
+
context_dim=dim,
|
610 |
+
context_pre_only=context_pre_only,
|
611 |
+
)
|
612 |
+
|
613 |
+
if not context_pre_only:
|
614 |
+
self.ff_norm_c = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
615 |
+
self.ff_c = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
|
616 |
+
else:
|
617 |
+
self.ff_norm_c = None
|
618 |
+
self.ff_c = None
|
619 |
+
self.ff_norm_x = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
620 |
+
self.ff_x = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
|
621 |
+
|
622 |
+
def forward(self, x, c, t, mask=None, rope=None, c_rope=None): # x: noised input, c: context, t: time embedding
|
623 |
+
# pre-norm & modulation for attention input
|
624 |
+
if self.context_pre_only:
|
625 |
+
norm_c = self.attn_norm_c(c, t)
|
626 |
+
else:
|
627 |
+
norm_c, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.attn_norm_c(c, emb=t)
|
628 |
+
norm_x, x_gate_msa, x_shift_mlp, x_scale_mlp, x_gate_mlp = self.attn_norm_x(x, emb=t)
|
629 |
+
|
630 |
+
# attention
|
631 |
+
x_attn_output, c_attn_output = self.attn(x=norm_x, c=norm_c, mask=mask, rope=rope, c_rope=c_rope)
|
632 |
+
|
633 |
+
# process attention output for context c
|
634 |
+
if self.context_pre_only:
|
635 |
+
c = None
|
636 |
+
else: # if not last layer
|
637 |
+
c = c + c_gate_msa.unsqueeze(1) * c_attn_output
|
638 |
+
|
639 |
+
norm_c = self.ff_norm_c(c) * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
|
640 |
+
c_ff_output = self.ff_c(norm_c)
|
641 |
+
c = c + c_gate_mlp.unsqueeze(1) * c_ff_output
|
642 |
+
|
643 |
+
# process attention output for input x
|
644 |
+
x = x + x_gate_msa.unsqueeze(1) * x_attn_output
|
645 |
+
|
646 |
+
norm_x = self.ff_norm_x(x) * (1 + x_scale_mlp[:, None]) + x_shift_mlp[:, None]
|
647 |
+
x_ff_output = self.ff_x(norm_x)
|
648 |
+
x = x + x_gate_mlp.unsqueeze(1) * x_ff_output
|
649 |
+
|
650 |
+
return c, x
|
651 |
+
|
652 |
+
|
653 |
+
# time step conditioning embedding
|
654 |
+
|
655 |
+
|
656 |
+
class TimestepEmbedding(nn.Module):
|
657 |
+
def __init__(self, dim, freq_embed_dim=256):
|
658 |
+
super().__init__()
|
659 |
+
self.time_embed = SinusPositionEmbedding(freq_embed_dim)
|
660 |
+
self.time_mlp = nn.Sequential(nn.Linear(freq_embed_dim, dim), nn.SiLU(), nn.Linear(dim, dim))
|
661 |
+
|
662 |
+
def forward(self, timestep: float["b"]): # noqa: F821
|
663 |
+
time_hidden = self.time_embed(timestep)
|
664 |
+
time_hidden = time_hidden.to(timestep.dtype)
|
665 |
+
time = self.time_mlp(time_hidden) # b d
|
666 |
+
return time
|
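
Editor's note (illustrative sketch, not part of the uploaded files): the MelSpec module above can be exercised roughly as below; the random waveform and batch size are placeholders, and the 24 kHz rate / 100 mel bins follow the defaults in the file.

    import torch

    # illustrative only: a random 1-second batch at the module's default 24 kHz rate
    mel_extractor = MelSpec(mel_spec_type="vocos")  # or "bigvgan" for the BigVGAN-style mel
    wav = torch.randn(2, 24000)                     # (batch, num_samples)
    mel = mel_extractor(wav)                        # -> (batch, 100, num_frames)
    print(mel.shape)
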
GPT_SoVITS/pretrained_models/.gitignore ADDED
@@ -0,0 +1,2 @@
*
!.gitignore
GPT_SoVITS/pretrained_models/chinese-hubert-base/config.json ADDED
@@ -0,0 +1,72 @@
{
  "_name_or_path": "/data/docker/liujing04/gpt-vits/chinese-hubert-base",
  "activation_dropout": 0.1,
  "apply_spec_augment": true,
  "architectures": [
    "HubertModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": false,
  "do_stable_layer_norm": false,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_norm": "group",
  "feat_proj_dropout": 0.0,
  "feat_proj_layer_norm": true,
  "final_dropout": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.1,
  "mask_feature_length": 10,
  "mask_feature_min_masks": 0,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_masks": 2,
  "mask_time_prob": 0.05,
  "model_type": "hubert",
  "num_attention_heads": 12,
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "torch_dtype": "float16",
  "transformers_version": "4.30.2",
  "use_weighted_layer_sum": false,
  "vocab_size": 32
}
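
Editor's note (illustrative sketch, not part of the uploaded files): this config describes a 12-layer, 768-dim HuBERT encoder; assuming the transformers package is installed, the architecture it defines can be instantiated as below (random weights, since the pretrained tensors ship separately in this folder).

    from transformers import HubertConfig, HubertModel

    # illustrative only: build the architecture described by config.json above
    config = HubertConfig.from_pretrained("GPT_SoVITS/pretrained_models/chinese-hubert-base")
    model = HubertModel(config)
    print(config.hidden_size, config.num_hidden_layers)  # 768 12
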
GPT_SoVITS/pretrained_models/chinese-hubert-base/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
{
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}
GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/config.json ADDED
@@ -0,0 +1,34 @@
{
  "_name_or_path": "/data/docker/liujing04/bert-vits2/Bert-VITS2-master20231106/bert/chinese-roberta-wwm-ext-large",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "directionality": "bidi",
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "torch_dtype": "float16",
  "transformers_version": "4.30.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}
GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff.
GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x/config.json ADDED
@@ -0,0 +1,63 @@
{
    "resblock": "1",
    "num_gpus": 0,
    "batch_size": 32,
    "learning_rate": 0.0001,
    "adam_b1": 0.8,
    "adam_b2": 0.99,
    "lr_decay": 0.9999996,
    "seed": 1234,

    "upsample_rates": [4,4,2,2,2,2],
    "upsample_kernel_sizes": [8,8,4,4,4,4],
    "upsample_initial_channel": 1536,
    "resblock_kernel_sizes": [3,7,11],
    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],

    "use_tanh_at_final": false,
    "use_bias_at_final": false,

    "activation": "snakebeta",
    "snake_logscale": true,

    "use_cqtd_instead_of_mrd": true,
    "cqtd_filters": 128,
    "cqtd_max_filters": 1024,
    "cqtd_filters_scale": 1,
    "cqtd_dilations": [1, 2, 4],
    "cqtd_hop_lengths": [512, 256, 256],
    "cqtd_n_octaves": [9, 9, 9],
    "cqtd_bins_per_octaves": [24, 36, 48],

    "mpd_reshapes": [2, 3, 5, 7, 11],
    "use_spectral_norm": false,
    "discriminator_channel_mult": 1,

    "use_multiscale_melloss": true,
    "lambda_melloss": 15,

    "clip_grad_norm": 500,

    "segment_size": 65536,
    "num_mels": 100,
    "num_freq": 1025,
    "n_fft": 1024,
    "hop_size": 256,
    "win_size": 1024,

    "sampling_rate": 24000,

    "fmin": 0,
    "fmax": null,
    "fmax_for_loss": null,

    "normalize_volume": true,

    "num_workers": 4,

    "dist_config": {
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:54321",
        "world_size": 1
    }
}
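
Editor's note (illustrative sketch, not part of the uploaded files): the mel-analysis fields of this BigVGAN config (num_mels=100, n_fft=1024, hop_size=256, win_size=1024, sampling_rate=24000, fmin=0, fmax=null) line up with the defaults of get_bigvgan_mel_spectrogram in GPT_SoVITS/f5_tts/model/modules.py above; passing them explicitly would look roughly like this.

    import torch

    # illustrative only: use the config values rather than relying on the function defaults
    wav = torch.randn(1, 24000)
    mel = get_bigvgan_mel_spectrogram(
        wav,
        n_fft=1024,                # "n_fft"
        n_mel_channels=100,        # "num_mels"
        target_sample_rate=24000,  # "sampling_rate"
        hop_length=256,            # "hop_size"
        win_length=1024,           # "win_size"
        fmin=0,
        fmax=None,                 # "fmax": null
    )
    print(mel.shape)               # (1, 100, num_frames)
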
GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b1c1e17e9c99547a89388f72048cd6e1b41b5a18b170e86a46dfde0324d63eb1
size 155093966
GPT_SoVITS/pretrained_models/s2D488k.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fc579c1db3c1e21b721001cf99d7a584214280df19b002e200b630a34fa06eb8
size 93533667
GPT_SoVITS/text/.gitignore ADDED
@@ -0,0 +1,3 @@
G2PWModel
__pycache__
*.zip
GPT_SoVITS/text/__init__.py ADDED
@@ -0,0 +1,28 @@
import os
# if os.environ.get("version","v1")=="v1":
#   from text.symbols import symbols
# else:
#   from text.symbols2 import symbols

from text import symbols as symbols_v1
from text import symbols2 as symbols_v2

_symbol_to_id_v1 = {s: i for i, s in enumerate(symbols_v1.symbols)}
_symbol_to_id_v2 = {s: i for i, s in enumerate(symbols_v2.symbols)}


def cleaned_text_to_sequence(cleaned_text, version=None):
    """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
    Args:
      text: string to convert to a sequence
    Returns:
      List of integers corresponding to the symbols in the text
    """
    if version is None:
        version = os.environ.get("version", "v2")
    if version == "v1":
        phones = [_symbol_to_id_v1[symbol] for symbol in cleaned_text]
    else:
        phones = [_symbol_to_id_v2[symbol] for symbol in cleaned_text]

    return phones
GPT_SoVITS/text/cantonese.py ADDED
@@ -0,0 +1,222 @@
# reference: https://huggingface.co/spaces/Naozumi0512/Bert-VITS2-Cantonese-Yue/blob/main/text/chinese.py

import re
import cn2an
import ToJyutping

from text.symbols import punctuation
from text.zh_normalization.text_normlization import TextNormalizer

normalizer = lambda x: cn2an.transform(x, "an2cn")

INITIALS = [
    "aa",
    "aai",
    "aak",
    "aap",
    "aat",
    "aau",
    "ai",
    "au",
    "ap",
    "at",
    "ak",
    "a",
    "p",
    "b",
    "e",
    "ts",
    "t",
    "dz",
    "d",
    "kw",
    "k",
    "gw",
    "g",
    "f",
    "h",
    "l",
    "m",
    "ng",
    "n",
    "s",
    "y",
    "w",
    "c",
    "z",
    "j",
    "ong",
    "on",
    "ou",
    "oi",
    "ok",
    "o",
    "uk",
    "ung",
]
INITIALS += ["sp", "spl", "spn", "sil"]


rep_map = {
    ":": ",",
    ";": ",",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": "…",
    "$": ".",
    "“": "'",
    "”": "'",
    '"': "'",
    "‘": "'",
    "’": "'",
    "(": "'",
    ")": "'",
    "(": "'",
    ")": "'",
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    "—": "-",
    "~": "-",
    "~": "-",
    "「": "'",
    "」": "'",
}


def replace_punctuation(text):
    # text = text.replace("嗯", "恩").replace("呣", "母")
    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))

    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)

    replaced_text = re.sub(r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text)

    return replaced_text


def text_normalize(text):
    tx = TextNormalizer()
    sentences = tx.normalize(text)
    dest_text = ""
    for sentence in sentences:
        dest_text += replace_punctuation(sentence)
    return dest_text


punctuation_set = set(punctuation)


def jyuping_to_initials_finals_tones(jyuping_syllables):
    initials_finals = []
    tones = []
    word2ph = []

    for syllable in jyuping_syllables:
        if syllable in punctuation:
            initials_finals.append(syllable)
            tones.append(0)
            word2ph.append(1)  # Add 1 for punctuation
        elif syllable == "_":
            initials_finals.append(syllable)
            tones.append(0)
            word2ph.append(1)  # Add 1 for underscore
        else:
            try:
                tone = int(syllable[-1])
                syllable_without_tone = syllable[:-1]
            except ValueError:
                tone = 0
                syllable_without_tone = syllable

            for initial in INITIALS:
                if syllable_without_tone.startswith(initial):
                    if syllable_without_tone.startswith("nga"):
                        initials_finals.extend(
                            [
                                syllable_without_tone[:2],
                                syllable_without_tone[2:] or syllable_without_tone[-1],
                            ]
                        )
                        # tones.extend([tone, tone])
                        tones.extend([-1, tone])
                        word2ph.append(2)
                    else:
                        final = syllable_without_tone[len(initial) :] or initial[-1]
                        initials_finals.extend([initial, final])
                        # tones.extend([tone, tone])
                        tones.extend([-1, tone])
                        word2ph.append(2)
                    break
    assert len(initials_finals) == len(tones)

    ###魔改为辅音+带音调的元音
    phones = []
    for a, b in zip(initials_finals, tones):
        if b not in [-1, 0]:  ###防止粤语和普通话重合开头加Y,如果是标点,不加。
            todo = "%s%s" % (a, b)
        else:
            todo = a
        if todo not in punctuation_set:
            todo = "Y%s" % todo
        phones.append(todo)

    # return initials_finals, tones, word2ph
    return phones, word2ph


def get_jyutping(text):
    jyutping_array = []
    punct_pattern = re.compile(r"^[{}]+$".format(re.escape("".join(punctuation))))

    syllables = ToJyutping.get_jyutping_list(text)

    for word, syllable in syllables:
        if punct_pattern.match(word):
            puncts = re.split(r"([{}])".format(re.escape("".join(punctuation))), word)
            for punct in puncts:
                if len(punct) > 0:
                    jyutping_array.append(punct)
        else:
            # match multple jyutping eg: liu4 ge3, or single jyutping eg: liu4
            if not re.search(r"^([a-z]+[1-6]+[ ]?)+$", syllable):
                raise ValueError(f"Failed to convert {word} to jyutping: {syllable}")
            jyutping_array.append(syllable)

    return jyutping_array


def get_bert_feature(text, word2ph):
    from text import chinese_bert

    return chinese_bert.get_bert_feature(text, word2ph)


def g2p(text):
    # word2ph = []
    jyuping = get_jyutping(text)
    # print(jyuping)
    # phones, tones, word2ph = jyuping_to_initials_finals_tones(jyuping)
    phones, word2ph = jyuping_to_initials_finals_tones(jyuping)
    # phones = ["_"] + phones + ["_"]
    # tones = [0] + tones + [0]
    # word2ph = [1] + word2ph + [1]
    return phones, word2ph


if __name__ == "__main__":
    # text = "啊!但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏"
    text = "佢個鋤頭太短啦。"
    text = text_normalize(text)
    # phones, tones, word2ph = g2p(text)
    phones, word2ph = g2p(text)
    # print(phones, tones, word2ph)
    print(phones, word2ph)
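
Editor's note (illustrative sketch, not part of the uploaded files): the phones returned by this module's g2p can be mapped to IDs with cleaned_text_to_sequence from GPT_SoVITS/text/__init__.py above. The sentence reuses the file's own __main__ example; the sketch assumes the rest of the text package (symbols2, zh_normalization, ToJyutping) is importable and that the v2 symbol table covers the resulting phones.

    from text import cantonese
    from text import cleaned_text_to_sequence

    # normalize, convert to phones, then look the phones up in the v2 symbol table
    norm_text = cantonese.text_normalize("佢個鋤頭太短啦。")
    phones, word2ph = cantonese.g2p(norm_text)
    ids = cleaned_text_to_sequence(phones, version="v2")
    print(phones, ids)
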
GPT_SoVITS/text/chinese.py ADDED
@@ -0,0 +1,208 @@
import os
import re

import cn2an
from pypinyin import lazy_pinyin, Style

from text.symbols import punctuation
from text.tone_sandhi import ToneSandhi
from text.zh_normalization.text_normlization import TextNormalizer

normalizer = lambda x: cn2an.transform(x, "an2cn")

current_file_path = os.path.dirname(__file__)
pinyin_to_symbol_map = {
    line.split("\t")[0]: line.strip().split("\t")[1]
    for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
}

import jieba_fast
import logging

jieba_fast.setLogLevel(logging.CRITICAL)
import jieba_fast.posseg as psg


rep_map = {
    ":": ",",
    ";": ",",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": "…",
    "$": ".",
    "/": ",",
    "—": "-",
    "~": "…",
    "~": "…",
}

tone_modifier = ToneSandhi()


def replace_punctuation(text):
    text = text.replace("嗯", "恩").replace("呣", "母")
    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))

    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)

    replaced_text = re.sub(r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text)

    return replaced_text


def replace_punctuation_with_en(text):
    text = text.replace("嗯", "恩").replace("呣", "母")
    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))

    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)

    replaced_text = re.sub(r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text)

    return replaced_text


def replace_consecutive_punctuation(text):
    punctuations = "".join(re.escape(p) for p in punctuation)
    pattern = f"([{punctuations}])([{punctuations}])+"
    result = re.sub(pattern, r"\1", text)
    return result


def g2p(text):
    pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
    sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
    phones, word2ph = _g2p(sentences)
    return phones, word2ph


def _get_initials_finals(word):
    initials = []
    finals = []
    orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
    orig_finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
    for c, v in zip(orig_initials, orig_finals):
        initials.append(c)
        finals.append(v)
    return initials, finals


def _g2p(segments):
    phones_list = []
    word2ph = []
    for seg in segments:
        pinyins = []
        # Replace all English words in the sentence
        seg = re.sub("[a-zA-Z]+", "", seg)
        seg_cut = psg.lcut(seg)
        initials = []
        finals = []
        seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
        for word, pos in seg_cut:
            if pos == "eng":
                continue
            sub_initials, sub_finals = _get_initials_finals(word)
            sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
            initials.append(sub_initials)
            finals.append(sub_finals)

            # assert len(sub_initials) == len(sub_finals) == len(word)
        initials = sum(initials, [])
        finals = sum(finals, [])
        #
        for c, v in zip(initials, finals):
            raw_pinyin = c + v
            # NOTE: post process for pypinyin outputs
            # we discriminate i, ii and iii
            if c == v:
                assert c in punctuation
                phone = [c]
                word2ph.append(1)
            else:
                v_without_tone = v[:-1]
                tone = v[-1]

                pinyin = c + v_without_tone
                assert tone in "12345"

                if c:
                    # 多音节
                    v_rep_map = {
                        "uei": "ui",
                        "iou": "iu",
                        "uen": "un",
                    }
                    if v_without_tone in v_rep_map.keys():
                        pinyin = c + v_rep_map[v_without_tone]
                else:
                    # 单音节
                    pinyin_rep_map = {
                        "ing": "ying",
                        "i": "yi",
                        "in": "yin",
                        "u": "wu",
                    }
                    if pinyin in pinyin_rep_map.keys():
                        pinyin = pinyin_rep_map[pinyin]
                    else:
                        single_rep_map = {
                            "v": "yu",
                            "e": "e",
                            "i": "y",
                            "u": "w",
                        }
                        if pinyin[0] in single_rep_map.keys():
                            pinyin = single_rep_map[pinyin[0]] + pinyin[1:]

                assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
                new_c, new_v = pinyin_to_symbol_map[pinyin].split(" ")
                new_v = new_v + tone
                phone = [new_c, new_v]
                word2ph.append(len(phone))

            phones_list += phone
    return phones_list, word2ph


def text_normalize(text):
    # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
    tx = TextNormalizer()
    sentences = tx.normalize(text)
    dest_text = ""
    for sentence in sentences:
        dest_text += replace_punctuation(sentence)

    # 避免重复标点引起的参考泄露
    dest_text = replace_consecutive_punctuation(dest_text)
    return dest_text


# 不排除英文的文本格式化
def mix_text_normalize(text):
    # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
    tx = TextNormalizer()
    sentences = tx.normalize(text)
    dest_text = ""
    for sentence in sentences:
        dest_text += replace_punctuation_with_en(sentence)

    # 避免重复标点引起的参考泄露
    dest_text = replace_consecutive_punctuation(dest_text)
    return dest_text


if __name__ == "__main__":
    text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏"
    text = "呣呣呣~就是…大人的鼹鼠党吧?"
    text = "你好"
    text = text_normalize(text)
    print(g2p(text))


# # 示例用法
# text = "这是一个示例文本:,你好!这是一个测试..."
# print(g2p_paddle(text))  # 输出: 这是一个示例文本你好这是一个测试