kevinwang676 committed
Commit 0282784 · verified · 1 parent: 559ee5e

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full list.
Files changed (50)
  1. GPT_SoVITS/BigVGAN/alias_free_activation/cuda/__init__.py +0 -0
  2. GPT_SoVITS/BigVGAN/alias_free_activation/cuda/activation1d.py +69 -0
  3. GPT_SoVITS/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
  4. GPT_SoVITS/BigVGAN/alias_free_activation/cuda/anti_alias_activation_cuda.cu +246 -0
  5. GPT_SoVITS/BigVGAN/alias_free_activation/cuda/compat.h +29 -0
  6. GPT_SoVITS/BigVGAN/alias_free_activation/cuda/load.py +82 -0
  7. GPT_SoVITS/BigVGAN/alias_free_activation/cuda/type_shim.h +92 -0
  8. GPT_SoVITS/BigVGAN/alias_free_activation/torch/__init__.py +6 -0
  9. GPT_SoVITS/BigVGAN/alias_free_activation/torch/act.py +30 -0
  10. GPT_SoVITS/BigVGAN/alias_free_activation/torch/filter.py +99 -0
  11. GPT_SoVITS/BigVGAN/alias_free_activation/torch/resample.py +48 -0
  12. GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_1 +21 -0
  13. GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_2 +21 -0
  14. GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_3 +201 -0
  15. GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_4 +29 -0
  16. GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_5 +16 -0
  17. GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_6 +21 -0
  18. GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_7 +21 -0
  19. GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_8 +21 -0
  20. GPT_SoVITS/BigVGAN/tests/test_activation.py +62 -0
  21. GPT_SoVITS/BigVGAN/tests/test_activation_snake_beta.py +62 -0
  22. GPT_SoVITS/BigVGAN/tests/test_cuda_vs_torch_model.py +215 -0
  23. GPT_SoVITS/configs/.gitignore +1 -0
  24. GPT_SoVITS/configs/s1.yaml +31 -0
  25. GPT_SoVITS/configs/s1big.yaml +31 -0
  26. GPT_SoVITS/configs/s1big2.yaml +31 -0
  27. GPT_SoVITS/configs/s1longer-v2.yaml +31 -0
  28. GPT_SoVITS/configs/s1longer.yaml +31 -0
  29. GPT_SoVITS/configs/s1mq.yaml +77 -0
  30. GPT_SoVITS/configs/s2.json +91 -0
  31. GPT_SoVITS/configs/train.yaml +32 -0
  32. GPT_SoVITS/configs/tts_infer.yaml +32 -0
  33. GPT_SoVITS/f5_tts/model/__init__.py +13 -0
  34. GPT_SoVITS/f5_tts/model/backbones/README.md +20 -0
  35. GPT_SoVITS/f5_tts/model/backbones/dit.py +180 -0
  36. GPT_SoVITS/f5_tts/model/backbones/mmdit.py +146 -0
  37. GPT_SoVITS/f5_tts/model/backbones/unett.py +219 -0
  38. GPT_SoVITS/f5_tts/model/modules.py +666 -0
  39. GPT_SoVITS/pretrained_models/.gitignore +2 -0
  40. GPT_SoVITS/pretrained_models/chinese-hubert-base/config.json +72 -0
  41. GPT_SoVITS/pretrained_models/chinese-hubert-base/preprocessor_config.json +9 -0
  42. GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/config.json +34 -0
  43. GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/tokenizer.json +0 -0
  44. GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x/config.json +63 -0
  45. GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt +3 -0
  46. GPT_SoVITS/pretrained_models/s2D488k.pth +3 -0
  47. GPT_SoVITS/text/.gitignore +3 -0
  48. GPT_SoVITS/text/__init__.py +28 -0
  49. GPT_SoVITS/text/cantonese.py +222 -0
  50. GPT_SoVITS/text/chinese.py +208 -0
GPT_SoVITS/BigVGAN/alias_free_activation/cuda/__init__.py ADDED
File without changes
GPT_SoVITS/BigVGAN/alias_free_activation/cuda/activation1d.py ADDED
@@ -0,0 +1,69 @@
+# Copyright (c) 2024 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+
+import torch
+import torch.nn as nn
+from alias_free_activation.torch.resample import UpSample1d, DownSample1d
+
+# load fused CUDA kernel: this enables importing anti_alias_activation_cuda
+from alias_free_activation.cuda import load
+
+anti_alias_activation_cuda = load.load()
+
+
+class FusedAntiAliasActivation(torch.autograd.Function):
+    """
+    Assumes filter size 12, replication padding on upsampling/downsampling, and log-scale alpha/beta parameters as inputs.
+    The hyperparameters are hard-coded in the kernel to maximize speed.
+    NOTE: The fused kernel is incorrect for Activation1d with different hyperparameters.
+    """
+
+    @staticmethod
+    def forward(ctx, inputs, up_ftr, down_ftr, alpha, beta):
+        activation_results = anti_alias_activation_cuda.forward(inputs, up_ftr, down_ftr, alpha, beta)
+
+        return activation_results
+
+    @staticmethod
+    def backward(ctx, output_grads):
+        raise NotImplementedError
+        return output_grads, None, None
+
+
+class Activation1d(nn.Module):
+    def __init__(
+        self,
+        activation,
+        up_ratio: int = 2,
+        down_ratio: int = 2,
+        up_kernel_size: int = 12,
+        down_kernel_size: int = 12,
+        fused: bool = True,
+    ):
+        super().__init__()
+        self.up_ratio = up_ratio
+        self.down_ratio = down_ratio
+        self.act = activation
+        self.upsample = UpSample1d(up_ratio, up_kernel_size)
+        self.downsample = DownSample1d(down_ratio, down_kernel_size)
+
+        self.fused = fused  # Whether to use fused CUDA kernel or not
+
+    def forward(self, x):
+        if not self.fused:
+            x = self.upsample(x)
+            x = self.act(x)
+            x = self.downsample(x)
+            return x
+        else:
+            if self.act.__class__.__name__ == "Snake":
+                beta = self.act.alpha.data  # Snake uses the same params for alpha and beta
+            else:
+                beta = self.act.beta.data  # SnakeBeta uses different params for alpha and beta
+            alpha = self.act.alpha.data
+            if not self.act.alpha_logscale:  # Exp is baked into the CUDA kernel, cancel it out with a log
+                alpha = torch.log(alpha)
+                beta = torch.log(beta)
+
+            x = FusedAntiAliasActivation.apply(x, self.upsample.filter, self.downsample.lowpass.filter, alpha, beta)
+            return x
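A minimal usage sketch for the module above (hypothetical snippet; it assumes BigVGAN's activations.Snake class and a CUDA device are available). With the default filter size of 12, the fused path is expected to track the unfused torch path closely:

import torch
from activations import Snake
from alias_free_activation.cuda.activation1d import Activation1d

x = torch.randn(1, 10, 200, device="cuda")  # [B, C, T]
act_fused = Activation1d(activation=Snake(10), fused=True).cuda()
act_torch = Activation1d(activation=Snake(10), fused=False).cuda()
print((act_fused(x) - act_torch(x)).abs().mean())  # expected on the order of 1e-3 or smaller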
GPT_SoVITS/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp ADDED
@@ -0,0 +1,23 @@
+/* coding=utf-8
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <torch/extension.h>
+
+extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta);
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    m.def("forward", &fwd_cuda, "Anti-Alias Activation forward (CUDA)");
+}
GPT_SoVITS/BigVGAN/alias_free_activation/cuda/anti_alias_activation_cuda.cu ADDED
@@ -0,0 +1,246 @@
+/* coding=utf-8
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cuda_profiler_api.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/extension.h>
+#include "type_shim.h"
+#include <assert.h>
+#include <cfloat>
+#include <limits>
+#include <stdint.h>
+#include <c10/macros/Macros.h>
+
+namespace
+{
+    // Hard-coded hyperparameters
+    // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and
+    constexpr int ELEMENTS_PER_LDG_STG = 1; //(WARP_ITERATIONS < 4) ? 1 : 4;
+    constexpr int BUFFER_SIZE = 32;
+    constexpr int FILTER_SIZE = 12;
+    constexpr int HALF_FILTER_SIZE = 6;
+    constexpr int UPSAMPLE_REPLICATION_PAD = 5;         // 5 on each side, matching torch impl
+    constexpr int DOWNSAMPLE_REPLICATION_PAD_LEFT = 5;  // matching torch impl
+    constexpr int DOWNSAMPLE_REPLICATION_PAD_RIGHT = 6; // matching torch impl
+
+    template <typename input_t, typename output_t, typename acc_t>
+    __global__ void anti_alias_activation_forward(
+        output_t *dst,
+        const input_t *src,
+        const input_t *up_ftr,
+        const input_t *down_ftr,
+        const input_t *alpha,
+        const input_t *beta,
+        int batch_size,
+        int channels,
+        int seq_len)
+    {
+        // Up and downsample filters
+        input_t up_filter[FILTER_SIZE];
+        input_t down_filter[FILTER_SIZE];
+
+        // Load data from global memory including extra indices reserved for replication paddings
+        input_t elements[2 * FILTER_SIZE + 2 * BUFFER_SIZE + 2 * UPSAMPLE_REPLICATION_PAD] = {0};
+        input_t intermediates[2 * FILTER_SIZE + 2 * BUFFER_SIZE + DOWNSAMPLE_REPLICATION_PAD_LEFT + DOWNSAMPLE_REPLICATION_PAD_RIGHT] = {0};
+
+        // Output stores downsampled output before writing to dst
+        output_t output[BUFFER_SIZE];
+
+        // blockDim/threadIdx = (128, 1, 1)
+        // gridDim/blockIdx = (seq_blocks, channels, batches)
+        int block_offset = (blockIdx.x * 128 * BUFFER_SIZE + seq_len * (blockIdx.y + gridDim.y * blockIdx.z));
+        int local_offset = threadIdx.x * BUFFER_SIZE;
+        int seq_offset = blockIdx.x * 128 * BUFFER_SIZE + local_offset;
+
+        // intermediates have double the seq_len
+        int intermediate_local_offset = threadIdx.x * BUFFER_SIZE * 2;
+        int intermediate_seq_offset = blockIdx.x * 128 * BUFFER_SIZE * 2 + intermediate_local_offset;
+
+        // Get values needed for replication padding before moving pointer
+        const input_t *right_most_pntr = src + (seq_len * (blockIdx.y + gridDim.y * blockIdx.z));
+        input_t seq_left_most_value = right_most_pntr[0];
+        input_t seq_right_most_value = right_most_pntr[seq_len - 1];
+
+        // Move src and dst pointers
+        src += block_offset + local_offset;
+        dst += block_offset + local_offset;
+
+        // Alpha and beta values for snake activations. Applies exp by default
+        alpha = alpha + blockIdx.y;
+        input_t alpha_val = expf(alpha[0]);
+        beta = beta + blockIdx.y;
+        input_t beta_val = expf(beta[0]);
+
+        #pragma unroll
+        for (int it = 0; it < FILTER_SIZE; it += 1)
+        {
+            up_filter[it] = up_ftr[it];
+            down_filter[it] = down_ftr[it];
+        }
+
+        // Apply replication padding for upsampling, matching torch impl
+        #pragma unroll
+        for (int it = -HALF_FILTER_SIZE; it < BUFFER_SIZE + HALF_FILTER_SIZE; it += 1)
+        {
+            int element_index = seq_offset + it; // index for element
+            if ((element_index < 0) && (element_index >= -UPSAMPLE_REPLICATION_PAD))
+            {
+                elements[2 * (HALF_FILTER_SIZE + it)] = 2 * seq_left_most_value;
+            }
+            if ((element_index >= seq_len) && (element_index < seq_len + UPSAMPLE_REPLICATION_PAD))
+            {
+                elements[2 * (HALF_FILTER_SIZE + it)] = 2 * seq_right_most_value;
+            }
+            if ((element_index >= 0) && (element_index < seq_len))
+            {
+                elements[2 * (HALF_FILTER_SIZE + it)] = 2 * src[it];
+            }
+        }
+
+        // Apply upsampling strided convolution and write to intermediates. It reserves DOWNSAMPLE_REPLICATION_PAD_LEFT for replication padding of the downsampling conv later
+        #pragma unroll
+        for (int it = 0; it < (2 * BUFFER_SIZE + 2 * FILTER_SIZE); it += 1)
+        {
+            input_t acc = 0.0;
+            int element_index = intermediate_seq_offset + it; // index for intermediate
+            #pragma unroll
+            for (int f_idx = 0; f_idx < FILTER_SIZE; f_idx += 1)
+            {
+                if ((element_index + f_idx) >= 0)
+                {
+                    acc += up_filter[f_idx] * elements[it + f_idx];
+                }
+            }
+            intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] = acc;
+        }
+
+        // Apply activation function. It reserves DOWNSAMPLE_REPLICATION_PAD_LEFT and DOWNSAMPLE_REPLICATION_PAD_RIGHT for replication padding of the downsampling conv later
+        double no_div_by_zero = 0.000000001;
+        #pragma unroll
+        for (int it = 0; it < 2 * BUFFER_SIZE + 2 * FILTER_SIZE; it += 1)
+        {
+            intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] += (1.0 / (beta_val + no_div_by_zero)) * sinf(intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] * alpha_val) * sinf(intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] * alpha_val);
+        }
+
+        // Apply replication padding before downsampling conv from intermediates
+        #pragma unroll
+        for (int it = 0; it < DOWNSAMPLE_REPLICATION_PAD_LEFT; it += 1)
+        {
+            intermediates[it] = intermediates[DOWNSAMPLE_REPLICATION_PAD_LEFT];
+        }
+        #pragma unroll
+        for (int it = DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE; it < DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE + DOWNSAMPLE_REPLICATION_PAD_RIGHT; it += 1)
+        {
+            intermediates[it] = intermediates[DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE - 1];
+        }
+
+        // Apply downsample strided convolution (assuming stride=2) from intermediates
+        #pragma unroll
+        for (int it = 0; it < BUFFER_SIZE; it += 1)
+        {
+            input_t acc = 0.0;
+            #pragma unroll
+            for (int f_idx = 0; f_idx < FILTER_SIZE; f_idx += 1)
+            {
+                // Add constant DOWNSAMPLE_REPLICATION_PAD_RIGHT to match torch implementation
+                acc += down_filter[f_idx] * intermediates[it * 2 + f_idx + DOWNSAMPLE_REPLICATION_PAD_RIGHT];
+            }
+            output[it] = acc;
+        }
+
+        // Write output to dst
+        #pragma unroll
+        for (int it = 0; it < BUFFER_SIZE; it += ELEMENTS_PER_LDG_STG)
+        {
+            int element_index = seq_offset + it;
+            if (element_index < seq_len)
+            {
+                dst[it] = output[it];
+            }
+        }
+    }
+
+    template <typename input_t, typename output_t, typename acc_t>
+    void dispatch_anti_alias_activation_forward(
+        output_t *dst,
+        const input_t *src,
+        const input_t *up_ftr,
+        const input_t *down_ftr,
+        const input_t *alpha,
+        const input_t *beta,
+        int batch_size,
+        int channels,
+        int seq_len)
+    {
+        if (seq_len == 0)
+        {
+            return;
+        }
+        else
+        {
+            // Use 128 threads per block to maximize gpu utilization
+            constexpr int threads_per_block = 128;
+            constexpr int seq_len_per_block = 4096;
+            int blocks_per_seq_len = (seq_len + seq_len_per_block - 1) / seq_len_per_block;
+            dim3 blocks(blocks_per_seq_len, channels, batch_size);
+            dim3 threads(threads_per_block, 1, 1);
+
+            anti_alias_activation_forward<input_t, output_t, acc_t>
+                <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, up_ftr, down_ftr, alpha, beta, batch_size, channels, seq_len);
+        }
+    }
+}
+
+extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta)
+{
+    // Input is a 3d tensor with dimensions [batches, channels, seq_len]
+    const int batches = input.size(0);
+    const int channels = input.size(1);
+    const int seq_len = input.size(2);
+
+    // Output
+    auto act_options = input.options().requires_grad(false);
+
+    torch::Tensor anti_alias_activation_results =
+        torch::empty({batches, channels, seq_len}, act_options);
+
+    void *input_ptr = static_cast<void *>(input.data_ptr());
+    void *up_filter_ptr = static_cast<void *>(up_filter.data_ptr());
+    void *down_filter_ptr = static_cast<void *>(down_filter.data_ptr());
+    void *alpha_ptr = static_cast<void *>(alpha.data_ptr());
+    void *beta_ptr = static_cast<void *>(beta.data_ptr());
+    void *anti_alias_activation_results_ptr = static_cast<void *>(anti_alias_activation_results.data_ptr());
+
+    DISPATCH_FLOAT_HALF_AND_BFLOAT(
+        input.scalar_type(),
+        "dispatch anti alias activation_forward",
+        dispatch_anti_alias_activation_forward<scalar_t, scalar_t, float>(
+            reinterpret_cast<scalar_t *>(anti_alias_activation_results_ptr),
+            reinterpret_cast<const scalar_t *>(input_ptr),
+            reinterpret_cast<const scalar_t *>(up_filter_ptr),
+            reinterpret_cast<const scalar_t *>(down_filter_ptr),
+            reinterpret_cast<const scalar_t *>(alpha_ptr),
+            reinterpret_cast<const scalar_t *>(beta_ptr),
+            batches,
+            channels,
+            seq_len););
+    return anti_alias_activation_results;
+}
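For reference, a hedged sketch of how the extension built from this file is driven from Python; shapes are inferred from fwd_cuda and the call site in activation1d.py above (the filter tensors here stand in for the [1, 1, 12] kaiser_sinc_filter1d buffers, and alpha/beta for the per-channel log-scale snake parameters):

import torch
from alias_free_activation.cuda import load

anti_alias_activation_cuda = load.load()       # JIT-builds the .cpp/.cu sources on first use
x = torch.randn(2, 8, 4096, device="cuda")     # [B, C, T]
up_ftr = torch.randn(1, 1, 12, device="cuda")  # upsampling filter (12 taps)
down_ftr = torch.randn(1, 1, 12, device="cuda")
alpha = torch.zeros(8, device="cuda")          # per-channel log-scale parameters (exp is applied in the kernel)
beta = torch.zeros(8, device="cuda")
y = anti_alias_activation_cuda.forward(x, up_ftr, down_ftr, alpha, beta)  # same shape as x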
GPT_SoVITS/BigVGAN/alias_free_activation/cuda/compat.h ADDED
@@ -0,0 +1,29 @@
+/* coding=utf-8
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This code is copied from NVIDIA apex:
+ *     https://github.com/NVIDIA/apex
+ * with minor changes. */
+
+#ifndef TORCH_CHECK
+#define TORCH_CHECK AT_CHECK
+#endif
+
+#ifdef VERSION_GE_1_3
+#define DATA_PTR data_ptr
+#else
+#define DATA_PTR data
+#endif
GPT_SoVITS/BigVGAN/alias_free_activation/cuda/load.py ADDED
@@ -0,0 +1,82 @@
+# Copyright (c) 2024 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+
+import os
+import pathlib
+import subprocess
+
+from torch.utils import cpp_extension
+
+"""
+Setting this param to a list has a problem of generating different compilation commands (with a different order of architectures) and leading to recompilation of fused kernels.
+Set it to an empty string to avoid recompilation and assign arch flags explicitly in extra_cuda_cflags below.
+"""
+os.environ["TORCH_CUDA_ARCH_LIST"] = ""
+
+
+def load():
+    # Check if cuda 11 is installed for compute capability 8.0
+    cc_flag = []
+    _, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
+    if int(bare_metal_major) >= 11:
+        cc_flag.append("-gencode")
+        cc_flag.append("arch=compute_80,code=sm_80")
+
+    # Build path
+    srcpath = pathlib.Path(__file__).parent.absolute()
+    buildpath = srcpath / "build"
+    _create_build_dir(buildpath)
+
+    # Helper function to build the kernels.
+    def _cpp_extention_load_helper(name, sources, extra_cuda_flags):
+        return cpp_extension.load(
+            name=name,
+            sources=sources,
+            build_directory=buildpath,
+            extra_cflags=[
+                "-O3",
+            ],
+            extra_cuda_cflags=[
+                "-O3",
+                "-gencode",
+                "arch=compute_70,code=sm_70",
+                "--use_fast_math",
+            ]
+            + extra_cuda_flags
+            + cc_flag,
+            verbose=True,
+        )
+
+    extra_cuda_flags = [
+        "-U__CUDA_NO_HALF_OPERATORS__",
+        "-U__CUDA_NO_HALF_CONVERSIONS__",
+        "--expt-relaxed-constexpr",
+        "--expt-extended-lambda",
+    ]
+
+    sources = [
+        srcpath / "anti_alias_activation.cpp",
+        srcpath / "anti_alias_activation_cuda.cu",
+    ]
+    anti_alias_activation_cuda = _cpp_extention_load_helper("anti_alias_activation_cuda", sources, extra_cuda_flags)
+
+    return anti_alias_activation_cuda
+
+
+def _get_cuda_bare_metal_version(cuda_dir):
+    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
+    output = raw_output.split()
+    release_idx = output.index("release") + 1
+    release = output[release_idx].split(".")
+    bare_metal_major = release[0]
+    bare_metal_minor = release[1][0]
+
+    return raw_output, bare_metal_major, bare_metal_minor
+
+
+def _create_build_dir(buildpath):
+    try:
+        os.mkdir(buildpath)
+    except OSError:
+        if not os.path.isdir(buildpath):
+            print(f"Creation of the build directory {buildpath} failed")
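A short usage note (sketch): the loader JIT-compiles the two source files into alias_free_activation/cuda/build/ via torch.utils.cpp_extension, so the first call can take several minutes and requires nvcc from CUDA_HOME to be available.

from alias_free_activation.cuda import load

anti_alias_activation_cuda = load.load()  # returns the compiled module; later calls reuse the cached build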
GPT_SoVITS/BigVGAN/alias_free_activation/cuda/type_shim.h ADDED
@@ -0,0 +1,92 @@
1
+ /* coding=utf-8
2
+ * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #include <ATen/ATen.h>
18
+ #include "compat.h"
19
+
20
+ #define DISPATCH_FLOAT_HALF_AND_BFLOAT(TYPE, NAME, ...) \
21
+ switch (TYPE) \
22
+ { \
23
+ case at::ScalarType::Float: \
24
+ { \
25
+ using scalar_t = float; \
26
+ __VA_ARGS__; \
27
+ break; \
28
+ } \
29
+ case at::ScalarType::Half: \
30
+ { \
31
+ using scalar_t = at::Half; \
32
+ __VA_ARGS__; \
33
+ break; \
34
+ } \
35
+ case at::ScalarType::BFloat16: \
36
+ { \
37
+ using scalar_t = at::BFloat16; \
38
+ __VA_ARGS__; \
39
+ break; \
40
+ } \
41
+ default: \
42
+ AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
43
+ }
44
+
45
+ #define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
46
+ switch (TYPEIN) \
47
+ { \
48
+ case at::ScalarType::Float: \
49
+ { \
50
+ using scalar_t_in = float; \
51
+ switch (TYPEOUT) \
52
+ { \
53
+ case at::ScalarType::Float: \
54
+ { \
55
+ using scalar_t_out = float; \
56
+ __VA_ARGS__; \
57
+ break; \
58
+ } \
59
+ case at::ScalarType::Half: \
60
+ { \
61
+ using scalar_t_out = at::Half; \
62
+ __VA_ARGS__; \
63
+ break; \
64
+ } \
65
+ case at::ScalarType::BFloat16: \
66
+ { \
67
+ using scalar_t_out = at::BFloat16; \
68
+ __VA_ARGS__; \
69
+ break; \
70
+ } \
71
+ default: \
72
+ AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
73
+ } \
74
+ break; \
75
+ } \
76
+ case at::ScalarType::Half: \
77
+ { \
78
+ using scalar_t_in = at::Half; \
79
+ using scalar_t_out = at::Half; \
80
+ __VA_ARGS__; \
81
+ break; \
82
+ } \
83
+ case at::ScalarType::BFloat16: \
84
+ { \
85
+ using scalar_t_in = at::BFloat16; \
86
+ using scalar_t_out = at::BFloat16; \
87
+ __VA_ARGS__; \
88
+ break; \
89
+ } \
90
+ default: \
91
+ AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \
92
+ }
GPT_SoVITS/BigVGAN/alias_free_activation/torch/__init__.py ADDED
@@ -0,0 +1,6 @@
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+#   LICENSE is in incl_licenses directory.
+
+from .filter import *
+from .resample import *
+from .act import *
GPT_SoVITS/BigVGAN/alias_free_activation/torch/act.py ADDED
@@ -0,0 +1,30 @@
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+#   LICENSE is in incl_licenses directory.
+
+import torch.nn as nn
+from .resample import UpSample1d, DownSample1d
+
+
+class Activation1d(nn.Module):
+    def __init__(
+        self,
+        activation,
+        up_ratio: int = 2,
+        down_ratio: int = 2,
+        up_kernel_size: int = 12,
+        down_kernel_size: int = 12,
+    ):
+        super().__init__()
+        self.up_ratio = up_ratio
+        self.down_ratio = down_ratio
+        self.act = activation
+        self.upsample = UpSample1d(up_ratio, up_kernel_size)
+        self.downsample = DownSample1d(down_ratio, down_kernel_size)
+
+    # x: [B, C, T]
+    def forward(self, x):
+        x = self.upsample(x)
+        x = self.act(x)
+        x = self.downsample(x)
+
+        return x
GPT_SoVITS/BigVGAN/alias_free_activation/torch/filter.py ADDED
@@ -0,0 +1,99 @@
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+#   LICENSE is in incl_licenses directory.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+
+if "sinc" in dir(torch):
+    sinc = torch.sinc
+else:
+    # This code is adapted from adefossez's julius.core.sinc under the MIT License
+    # https://adefossez.github.io/julius/julius/core.html
+    #   LICENSE is in incl_licenses directory.
+    def sinc(x: torch.Tensor):
+        """
+        Implementation of sinc, i.e. sin(pi * x) / (pi * x)
+        __Warning__: Different from julius.sinc, the input is multiplied by `pi`!
+        """
+        return torch.where(
+            x == 0,
+            torch.tensor(1.0, device=x.device, dtype=x.dtype),
+            torch.sin(math.pi * x) / math.pi / x,
+        )
+
+
+# This code is adapted from adefossez's julius.lowpass.LowPassFilters under the MIT License
+# https://adefossez.github.io/julius/julius/lowpass.html
+#   LICENSE is in incl_licenses directory.
+def kaiser_sinc_filter1d(cutoff, half_width, kernel_size):  # return filter [1, 1, kernel_size]
+    even = kernel_size % 2 == 0
+    half_size = kernel_size // 2
+
+    # For kaiser window
+    delta_f = 4 * half_width
+    A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
+    if A > 50.0:
+        beta = 0.1102 * (A - 8.7)
+    elif A >= 21.0:
+        beta = 0.5842 * (A - 21) ** 0.4 + 0.07886 * (A - 21.0)
+    else:
+        beta = 0.0
+    window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)
+
+    # ratio = 0.5 / cutoff -> 2 * cutoff = 1 / ratio
+    if even:
+        time = torch.arange(-half_size, half_size) + 0.5
+    else:
+        time = torch.arange(kernel_size) - half_size
+    if cutoff == 0:
+        filter_ = torch.zeros_like(time)
+    else:
+        filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)
+        """
+        Normalize filter to have sum = 1, otherwise we will have a small leakage of the constant component in the input signal.
+        """
+        filter_ /= filter_.sum()
+        filter = filter_.view(1, 1, kernel_size)
+
+    return filter
+
+
+class LowPassFilter1d(nn.Module):
+    def __init__(
+        self,
+        cutoff=0.5,
+        half_width=0.6,
+        stride: int = 1,
+        padding: bool = True,
+        padding_mode: str = "replicate",
+        kernel_size: int = 12,
+    ):
+        """
+        kernel_size should be an even number for the StyleGAN3 setup; odd numbers are also supported in this implementation.
+        """
+        super().__init__()
+        if cutoff < -0.0:
+            raise ValueError("Minimum cutoff must be larger than zero.")
+        if cutoff > 0.5:
+            raise ValueError("A cutoff above 0.5 does not make sense.")
+        self.kernel_size = kernel_size
+        self.even = kernel_size % 2 == 0
+        self.pad_left = kernel_size // 2 - int(self.even)
+        self.pad_right = kernel_size // 2
+        self.stride = stride
+        self.padding = padding
+        self.padding_mode = padding_mode
+        filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
+        self.register_buffer("filter", filter)
+
+    # Input [B, C, T]
+    def forward(self, x):
+        _, C, _ = x.shape
+
+        if self.padding:
+            x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)
+        out = F.conv1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
+
+        return out
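A small sketch of how these pieces fit together (parameter values chosen for illustration only): kaiser_sinc_filter1d returns a [1, 1, kernel_size] filter whose taps sum to 1, and LowPassFilter1d applies it per channel with the given stride.

import torch

filt = kaiser_sinc_filter1d(cutoff=0.25, half_width=0.3, kernel_size=12)  # shape [1, 1, 12], taps sum to 1
lpf = LowPassFilter1d(cutoff=0.25, half_width=0.3, stride=2, kernel_size=12)
y = lpf(torch.randn(1, 4, 64))  # [1, 4, 32]: replication-padded, then strided grouped conv1d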
GPT_SoVITS/BigVGAN/alias_free_activation/torch/resample.py ADDED
@@ -0,0 +1,48 @@
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+#   LICENSE is in incl_licenses directory.
+
+import torch.nn as nn
+from torch.nn import functional as F
+from .filter import LowPassFilter1d
+from .filter import kaiser_sinc_filter1d
+
+
+class UpSample1d(nn.Module):
+    def __init__(self, ratio=2, kernel_size=None):
+        super().__init__()
+        self.ratio = ratio
+        self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+        self.stride = ratio
+        self.pad = self.kernel_size // ratio - 1
+        self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
+        self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
+        filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=self.kernel_size)
+        self.register_buffer("filter", filter)
+
+    # x: [B, C, T]
+    def forward(self, x):
+        _, C, _ = x.shape
+
+        x = F.pad(x, (self.pad, self.pad), mode="replicate")
+        x = self.ratio * F.conv_transpose1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
+        x = x[..., self.pad_left : -self.pad_right]
+
+        return x
+
+
+class DownSample1d(nn.Module):
+    def __init__(self, ratio=2, kernel_size=None):
+        super().__init__()
+        self.ratio = ratio
+        self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+        self.lowpass = LowPassFilter1d(
+            cutoff=0.5 / ratio,
+            half_width=0.6 / ratio,
+            stride=ratio,
+            kernel_size=self.kernel_size,
+        )
+
+    def forward(self, x):
+        xx = self.lowpass(x)
+
+        return xx
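An illustrative round trip (a sketch, not part of the upstream code): UpSample1d doubles the time axis at ratio=2 and DownSample1d halves it again, so chaining them preserves the input length.

import torch

up = UpSample1d(ratio=2)    # kernel_size defaults to int(6 * ratio // 2) * 2 = 12
down = DownSample1d(ratio=2)
x = torch.randn(1, 4, 64)   # [B, C, T]
y = down(up(x))             # up(x) is [1, 4, 128]; y is back to [1, 4, 64]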
GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_1 ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2020 Jungil Kong
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_2 ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2020 Edward Dixon
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_3 ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_4 ADDED
@@ -0,0 +1,29 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2019, Seungwon Park 박승원
4
+ All rights reserved.
5
+
6
+ Redistribution and use in source and binary forms, with or without
7
+ modification, are permitted provided that the following conditions are met:
8
+
9
+ 1. Redistributions of source code must retain the above copyright notice, this
10
+ list of conditions and the following disclaimer.
11
+
12
+ 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ this list of conditions and the following disclaimer in the documentation
14
+ and/or other materials provided with the distribution.
15
+
16
+ 3. Neither the name of the copyright holder nor the names of its
17
+ contributors may be used to endorse or promote products derived from
18
+ this software without specific prior written permission.
19
+
20
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_5 ADDED
@@ -0,0 +1,16 @@
1
+ Copyright 2020 Alexandre Défossez
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
4
+ associated documentation files (the "Software"), to deal in the Software without restriction,
5
+ including without limitation the rights to use, copy, modify, merge, publish, distribute,
6
+ sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
7
+ furnished to do so, subject to the following conditions:
8
+
9
+ The above copyright notice and this permission notice shall be included in all copies or
10
+ substantial portions of the Software.
11
+
12
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
13
+ NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
14
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
15
+ DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_6 ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023-present, Descript
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_7 ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Charactr Inc.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_8 ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Amphion
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
GPT_SoVITS/BigVGAN/tests/test_activation.py ADDED
@@ -0,0 +1,62 @@
+# Copyright (c) 2024 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+
+import os
+import sys
+
+# to import modules from parent_dir
+parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+sys.path.append(parent_dir)
+
+import torch
+from alias_free_activation.cuda import activation1d
+from activations import Snake
+
+
+def test_load_fused_kernels():
+    try:
+        print("[Success] load_fused_kernels")
+    except ImportError as e:
+        print("[Fail] load_fused_kernels")
+        raise e
+
+
+def test_anti_alias_activation():
+    data = torch.rand((10, 10, 200), device="cuda")
+
+    # Check activations.Snake cuda vs. torch
+    fused_anti_alias_activation = activation1d.Activation1d(activation=Snake(10), fused=True).cuda()
+    fused_activation_output = fused_anti_alias_activation(data)
+
+    torch_anti_alias_activation = activation1d.Activation1d(activation=Snake(10), fused=False).cuda()
+    torch_activation_output = torch_anti_alias_activation(data)
+
+    test_result = (fused_activation_output - torch_activation_output).abs()
+
+    while test_result.dim() != 1:
+        test_result = test_result.mean(dim=-1)
+
+    diff = test_result.mean(dim=-1)
+
+    if diff <= 1e-3:
+        print(
+            f"\n[Success] test_fused_anti_alias_activation"
+            f"\n > mean_difference={diff}"
+            f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}"
+            f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}"
+        )
+    else:
+        print(
+            f"\n[Fail] test_fused_anti_alias_activation"
+            f"\n > mean_difference={diff}, "
+            f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}, "
+            f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}"
+        )
+
+
+if __name__ == "__main__":
+    from alias_free_activation.cuda import load
+
+    load.load()
+    test_load_fused_kernels()
+    test_anti_alias_activation()
GPT_SoVITS/BigVGAN/tests/test_activation_snake_beta.py ADDED
@@ -0,0 +1,62 @@
+# Copyright (c) 2024 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+
+import os
+import sys
+
+# to import modules from parent_dir
+parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+sys.path.append(parent_dir)
+
+import torch
+from alias_free_activation.cuda import activation1d
+from activations import SnakeBeta
+
+
+def test_load_fused_kernels():
+    try:
+        print("[Success] load_fused_kernels")
+    except ImportError as e:
+        print("[Fail] load_fused_kernels")
+        raise e
+
+
+def test_anti_alias_activation():
+    data = torch.rand((10, 10, 200), device="cuda")
+
+    # Check activations, SnakeBeta CUDA vs. Torch
+    fused_anti_alias_activation = activation1d.Activation1d(activation=SnakeBeta(10), fused=True).cuda()
+    fused_activation_output = fused_anti_alias_activation(data)
+
+    torch_anti_alias_activation = activation1d.Activation1d(activation=SnakeBeta(10), fused=False).cuda()
+    torch_activation_output = torch_anti_alias_activation(data)
+
+    test_result = (fused_activation_output - torch_activation_output).abs()
+
+    while test_result.dim() != 1:
+        test_result = test_result.mean(dim=-1)
+
+    diff = test_result.mean(dim=-1)
+
+    if diff <= 1e-3:
+        print(
+            f"\n[Success] test_fused_anti_alias_activation"
+            f"\n > mean_difference={diff}"
+            f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}"
+            f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}"
+        )
+    else:
+        print(
+            f"\n[Fail] test_fused_anti_alias_activation"
+            f"\n > mean_difference={diff}, "
+            f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}, "
+            f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}"
+        )
+
+
+if __name__ == "__main__":
+    from alias_free_activation.cuda import load
+
+    load.load()
+    test_load_fused_kernels()
+    test_anti_alias_activation()
GPT_SoVITS/BigVGAN/tests/test_cuda_vs_torch_model.py ADDED
@@ -0,0 +1,215 @@
1
+ # Copyright (c) 2024 NVIDIA CORPORATION.
2
+ # Licensed under the MIT license.
3
+
4
+ import os
5
+ import sys
6
+
7
+ # to import modules from parent_dir
8
+ parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
9
+ sys.path.append(parent_dir)
10
+
11
+ import torch
12
+ import json
13
+ from env import AttrDict
14
+ from bigvgan import BigVGAN
15
+ from time import time
16
+ from tqdm import tqdm
17
+ from meldataset import mel_spectrogram, MAX_WAV_VALUE
18
+ from scipy.io.wavfile import write
19
+ import numpy as np
20
+
21
+ import argparse
22
+
23
+ torch.backends.cudnn.benchmark = True
24
+
25
+ # For easier debugging
26
+ torch.set_printoptions(linewidth=200, threshold=10_000)
27
+
28
+
29
+ def generate_soundwave(duration=5.0, sr=24000):
30
+ t = np.linspace(0, duration, int(sr * duration), False, dtype=np.float32)
31
+
32
+ modulation = np.sin(2 * np.pi * t / duration)
33
+
34
+ min_freq = 220
35
+ max_freq = 1760
36
+ frequencies = min_freq + (max_freq - min_freq) * (modulation + 1) / 2
37
+ soundwave = np.sin(2 * np.pi * frequencies * t)
38
+
39
+ soundwave = soundwave / np.max(np.abs(soundwave)) * 0.95
40
+
41
+ return soundwave, sr
42
+
43
+
44
+ def get_mel(x, h):
45
+ return mel_spectrogram(x, h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax)
46
+
47
+
48
+ def load_checkpoint(filepath, device):
49
+ assert os.path.isfile(filepath)
50
+ print(f"Loading '{filepath}'")
51
+ checkpoint_dict = torch.load(filepath, map_location=device)
52
+ print("Complete.")
53
+ return checkpoint_dict
54
+
55
+
56
+ if __name__ == "__main__":
57
+ parser = argparse.ArgumentParser(description="Test script to check CUDA kernel correctness.")
58
+ parser.add_argument(
59
+ "--checkpoint_file",
60
+ type=str,
61
+ required=True,
62
+ help="Path to the checkpoint file. Assumes config.json exists in the directory.",
63
+ )
64
+
65
+ args = parser.parse_args()
66
+
67
+ config_file = os.path.join(os.path.split(args.checkpoint_file)[0], "config.json")
68
+ with open(config_file) as f:
69
+ config = f.read()
70
+ json_config = json.loads(config)
71
+ h = AttrDict({**json_config})
72
+
73
+ print("loading plain Pytorch BigVGAN")
74
+ generator_original = BigVGAN(h).to("cuda")
75
+ print("loading CUDA kernel BigVGAN with auto-build")
76
+ generator_cuda_kernel = BigVGAN(h, use_cuda_kernel=True).to("cuda")
77
+
78
+ state_dict_g = load_checkpoint(args.checkpoint_file, "cuda")
79
+ generator_original.load_state_dict(state_dict_g["generator"])
80
+ generator_cuda_kernel.load_state_dict(state_dict_g["generator"])
81
+
82
+ generator_original.remove_weight_norm()
83
+ generator_original.eval()
84
+ generator_cuda_kernel.remove_weight_norm()
85
+ generator_cuda_kernel.eval()
86
+
87
+ # define number of samples and length of mel frame to benchmark
88
+ num_sample = 10
89
+ num_mel_frame = 16384
90
+
91
+ # CUDA kernel correctness check
92
+ diff = 0.0
93
+ for i in tqdm(range(num_sample)):
94
+ # Random mel
95
+ data = torch.rand((1, h.num_mels, num_mel_frame), device="cuda")
96
+
97
+ with torch.inference_mode():
98
+ audio_original = generator_original(data)
99
+
100
+ with torch.inference_mode():
101
+ audio_cuda_kernel = generator_cuda_kernel(data)
102
+
103
+ # Both outputs should be (almost) the same
104
+ test_result = (audio_original - audio_cuda_kernel).abs()
105
+ diff += test_result.mean(dim=-1).item()
106
+
107
+ diff /= num_sample
108
+ if diff <= 2e-3: # We can expect a small difference (~1e-3) which does not affect perceptual quality
109
+ print(
110
+ f"\n[Success] test CUDA fused vs. plain torch BigVGAN inference"
111
+ f"\n > mean_difference={diff}"
112
+ f"\n > fused_values={audio_cuda_kernel[-1][-1][-30:].tolist()}"
113
+ f"\n > torch_values={audio_original[-1][-1][-30:].tolist()}"
114
+ )
115
+ else:
116
+ print(
117
+ f"\n[Fail] test CUDA fused vs. plain torch BigVGAN inference"
118
+ f"\n > mean_difference={diff}"
119
+ f"\n > fused_values={audio_cuda_kernel[-1][-1][-30:].tolist()}, "
120
+ f"\n > torch_values={audio_original[-1][-1][-30:].tolist()}"
121
+ )
122
+
123
+ del data, audio_original, audio_cuda_kernel
124
+
125
+ # Variables for tracking total time and VRAM usage
126
+ toc_total_original = 0
127
+ toc_total_cuda_kernel = 0
128
+ vram_used_original_total = 0
129
+ vram_used_cuda_kernel_total = 0
130
+ audio_length_total = 0
131
+
132
+ # Measure Original inference in isolation
133
+ for i in tqdm(range(num_sample)):
134
+ torch.cuda.reset_peak_memory_stats(device="cuda")
135
+ data = torch.rand((1, h.num_mels, num_mel_frame), device="cuda")
136
+ torch.cuda.synchronize()
137
+ tic = time()
138
+ with torch.inference_mode():
139
+ audio_original = generator_original(data)
140
+ torch.cuda.synchronize()
141
+ toc = time() - tic
142
+ toc_total_original += toc
143
+
144
+ vram_used_original_total += torch.cuda.max_memory_allocated(device="cuda")
145
+
146
+ del data, audio_original
147
+ torch.cuda.empty_cache()
148
+
149
+ # Measure CUDA kernel inference in isolation
150
+ for i in tqdm(range(num_sample)):
151
+ torch.cuda.reset_peak_memory_stats(device="cuda")
152
+ data = torch.rand((1, h.num_mels, num_mel_frame), device="cuda")
153
+ torch.cuda.synchronize()
154
+ tic = time()
155
+ with torch.inference_mode():
156
+ audio_cuda_kernel = generator_cuda_kernel(data)
157
+ torch.cuda.synchronize()
158
+ toc = time() - tic
159
+ toc_total_cuda_kernel += toc
160
+
161
+ audio_length_total += audio_cuda_kernel.shape[-1]
162
+
163
+ vram_used_cuda_kernel_total += torch.cuda.max_memory_allocated(device="cuda")
164
+
165
+ del data, audio_cuda_kernel
166
+ torch.cuda.empty_cache()
167
+
168
+ # Calculate metrics
169
+ audio_second = audio_length_total / h.sampling_rate
170
+ khz_original = audio_length_total / toc_total_original / 1000
171
+ khz_cuda_kernel = audio_length_total / toc_total_cuda_kernel / 1000
172
+ vram_used_original_gb = vram_used_original_total / num_sample / (1024**3)
173
+ vram_used_cuda_kernel_gb = vram_used_cuda_kernel_total / num_sample / (1024**3)
174
+
175
+ # Print results
176
+ print(
177
+ f"Original BigVGAN: took {toc_total_original:.2f} seconds to generate {audio_second:.2f} seconds of audio, {khz_original:.1f}kHz, {audio_second / toc_total_original:.1f} faster than realtime, VRAM used {vram_used_original_gb:.1f} GB"
178
+ )
179
+ print(
180
+ f"CUDA kernel BigVGAN: took {toc_total_cuda_kernel:.2f} seconds to generate {audio_second:.2f} seconds of audio, {khz_cuda_kernel:.1f}kHz, {audio_second / toc_total_cuda_kernel:.1f} faster than realtime, VRAM used {vram_used_cuda_kernel_gb:.1f} GB"
181
+ )
182
+ print(f"speedup of CUDA kernel: {khz_cuda_kernel / khz_original}")
183
+ print(f"VRAM saving of CUDA kernel: {vram_used_original_gb / vram_used_cuda_kernel_gb}")
184
+
185
+ # Use artificial sine waves for inference test
186
+ audio_real, sr = generate_soundwave(duration=5.0, sr=h.sampling_rate)
187
+ audio_real = torch.tensor(audio_real).to("cuda")
188
+ # Compute mel spectrogram from the ground truth audio
189
+ x = get_mel(audio_real.unsqueeze(0), h)
190
+
191
+ with torch.inference_mode():
192
+ y_g_hat_original = generator_original(x)
193
+ y_g_hat_cuda_kernel = generator_cuda_kernel(x)
194
+
195
+ audio_real = audio_real.squeeze()
196
+ audio_real = audio_real * MAX_WAV_VALUE
197
+ audio_real = audio_real.cpu().numpy().astype("int16")
198
+
199
+ audio_original = y_g_hat_original.squeeze()
200
+ audio_original = audio_original * MAX_WAV_VALUE
201
+ audio_original = audio_original.cpu().numpy().astype("int16")
202
+
203
+ audio_cuda_kernel = y_g_hat_cuda_kernel.squeeze()
204
+ audio_cuda_kernel = audio_cuda_kernel * MAX_WAV_VALUE
205
+ audio_cuda_kernel = audio_cuda_kernel.cpu().numpy().astype("int16")
206
+
207
+ os.makedirs("tmp", exist_ok=True)
208
+ output_file_real = os.path.join("tmp", "audio_real.wav")
209
+ output_file_original = os.path.join("tmp", "audio_generated_original.wav")
210
+ output_file_cuda_kernel = os.path.join("tmp", "audio_generated_cuda_kernel.wav")
211
+ write(output_file_real, h.sampling_rate, audio_real)
212
+ write(output_file_original, h.sampling_rate, audio_original)
213
+ write(output_file_cuda_kernel, h.sampling_rate, audio_cuda_kernel)
214
+ print("Example generated audios of original vs. fused CUDA kernel written to tmp!")
215
+ print("Done")
GPT_SoVITS/configs/.gitignore ADDED
@@ -0,0 +1 @@
1
+ *.yaml
GPT_SoVITS/configs/s1.yaml ADDED
@@ -0,0 +1,31 @@
1
+ train:
2
+ seed: 1234
3
+ epochs: 300
4
+ batch_size: 8
5
+ gradient_accumulation: 4
6
+ save_every_n_epoch: 1
7
+ precision: 16
8
+ gradient_clip: 1.0
9
+ optimizer:
10
+ lr: 0.01
11
+ lr_init: 0.00001
12
+ lr_end: 0.0001
13
+ warmup_steps: 2000
14
+ decay_steps: 40000
15
+ data:
16
+ max_eval_sample: 8
17
+ max_sec: 54
18
+ num_workers: 1
19
+ pad_val: 1024 # same as EOS in model
20
+ model:
21
+ vocab_size: 1025
22
+ phoneme_vocab_size: 512
23
+ embedding_dim: 512
24
+ hidden_dim: 512
25
+ head: 16
26
+ linear_units: 2048
27
+ n_layer: 12
28
+ dropout: 0
29
+ EOS: 1024
30
+ inference:
31
+ top_k: 5
GPT_SoVITS/configs/s1big.yaml ADDED
@@ -0,0 +1,31 @@
1
+ train:
2
+ seed: 1234
3
+ epochs: 300
4
+ batch_size: 8
5
+ gradient_accumulation: 4
6
+ save_every_n_epoch: 1
7
+ precision: 16-mixed
8
+ gradient_clip: 1.0
9
+ optimizer:
10
+ lr: 0.01
11
+ lr_init: 0.00001
12
+ lr_end: 0.0001
13
+ warmup_steps: 2000
14
+ decay_steps: 40000
15
+ data:
16
+ max_eval_sample: 8
17
+ max_sec: 54
18
+ num_workers: 1
19
+ pad_val: 1024 # same as EOS in model
20
+ model:
21
+ vocab_size: 1025
22
+ phoneme_vocab_size: 512
23
+ embedding_dim: 1024
24
+ hidden_dim: 1024
25
+ head: 16
26
+ linear_units: 2048
27
+ n_layer: 16
28
+ dropout: 0
29
+ EOS: 1024
30
+ inference:
31
+ top_k: 5
GPT_SoVITS/configs/s1big2.yaml ADDED
@@ -0,0 +1,31 @@
1
+ train:
2
+ seed: 1234
3
+ epochs: 300
4
+ batch_size: 12
5
+ gradient_accumulation: 4
6
+ save_every_n_epoch: 1
7
+ precision: 16-mixed
8
+ gradient_clip: 1.0
9
+ optimizer:
10
+ lr: 0.01
11
+ lr_init: 0.00001
12
+ lr_end: 0.0001
13
+ warmup_steps: 2000
14
+ decay_steps: 40000
15
+ data:
16
+ max_eval_sample: 8
17
+ max_sec: 54
18
+ num_workers: 1
19
+ pad_val: 1024 # same as EOS in model
20
+ model:
21
+ vocab_size: 1025
22
+ phoneme_vocab_size: 512
23
+ embedding_dim: 1024
24
+ hidden_dim: 1024
25
+ head: 16
26
+ linear_units: 2048
27
+ n_layer: 6
28
+ dropout: 0
29
+ EOS: 1024
30
+ inference:
31
+ top_k: 5
GPT_SoVITS/configs/s1longer-v2.yaml ADDED
@@ -0,0 +1,31 @@
1
+ train:
2
+ seed: 1234
3
+ epochs: 20
4
+ batch_size: 8
5
+ save_every_n_epoch: 1
6
+ precision: 16-mixed
7
+ gradient_clip: 1.0
8
+ optimizer:
9
+ lr: 0.01
10
+ lr_init: 0.00001
11
+ lr_end: 0.0001
12
+ warmup_steps: 2000
13
+ decay_steps: 40000
14
+ data:
15
+ max_eval_sample: 8
16
+ max_sec: 54
17
+ num_workers: 4
18
+ pad_val: 1024 # same as EOS in model
19
+ model:
20
+ vocab_size: 1025
21
+ phoneme_vocab_size: 732
22
+ embedding_dim: 512
23
+ hidden_dim: 512
24
+ head: 16
25
+ linear_units: 2048
26
+ n_layer: 24
27
+ dropout: 0
28
+ EOS: 1024
29
+ random_bert: 0
30
+ inference:
31
+ top_k: 15
GPT_SoVITS/configs/s1longer.yaml ADDED
@@ -0,0 +1,31 @@
1
+ train:
2
+ seed: 1234
3
+ epochs: 20
4
+ batch_size: 8
5
+ save_every_n_epoch: 1
6
+ precision: 16-mixed
7
+ gradient_clip: 1.0
8
+ optimizer:
9
+ lr: 0.01
10
+ lr_init: 0.00001
11
+ lr_end: 0.0001
12
+ warmup_steps: 2000
13
+ decay_steps: 40000
14
+ data:
15
+ max_eval_sample: 8
16
+ max_sec: 54
17
+ num_workers: 4
18
+ pad_val: 1024 # same as EOS in model
19
+ model:
20
+ vocab_size: 1025
21
+ phoneme_vocab_size: 512
22
+ embedding_dim: 512
23
+ hidden_dim: 512
24
+ head: 16
25
+ linear_units: 2048
26
+ n_layer: 24
27
+ dropout: 0
28
+ EOS: 1024
29
+ random_bert: 0
30
+ inference:
31
+ top_k: 5
GPT_SoVITS/configs/s1mq.yaml ADDED
@@ -0,0 +1,77 @@
1
+ train:
2
+ seed: 1234
3
+ epochs: 100
4
+ batch_size: 6
5
+ gradient_accumulation: 4
6
+ save_every_n_epoch: 1
7
+ precision: 32
8
+ gradient_clip: 1.0
9
+ optimizer:
10
+ lr: 0.01
11
+ lr_init: 0.00001
12
+ lr_end: 0.0001
13
+ warmup_steps: 2000
14
+ decay_steps: 40000
15
+ data:
16
+ max_eval_sample: 8
17
+ max_sec: 40
18
+ num_workers: 1
19
+ pad_val: 1024 # same as EOS in model
20
+ model:
21
+ saving_path: "ckpt/"
22
+ resume_checkpoint: null
23
+ vocoder_config_path: "quantizer/new_ckpt/config.json"
24
+ vocoder_ckpt_path: "quantizer/new_ckpt/g_00600000"
25
+ datadir: "/home/liweiche/GigaSpeech/wavs"
26
+ metapath: "/home/liweiche/GigaSpeech/train2.json"
27
+ val_metapath: "/home/liweiche/GigaSpeech/dev2.json"
28
+ sampledir: "logs/"
29
+ pretrained_path: null
30
+ lr: 0.0001
31
+ batch_size: 200.0
32
+ train_bucket_size: 8192
33
+ training_step: 800000
34
+ optim_flat_percent: 0.0
35
+ warmup_step: 50
36
+ adam_beta1: 0.9
37
+ adam_beta2: 0.98
38
+ ffd_size: 3072
39
+ hidden_size: 768
40
+ enc_nlayers: 6
41
+ dec_nlayers: 6
42
+ nheads: 12
43
+ ar_layer: 4
44
+ ar_ffd_size: 1024
45
+ ar_hidden_size: 256
46
+ ar_nheads: 4
47
+ aligner_softmax_temp: 1.0
48
+ layer_norm_eps: 0.00001
49
+ speaker_embed_dropout: 0.05
50
+ label_smoothing: 0.0
51
+ val_check_interval: 5000
52
+ check_val_every_n_epoch: 1
53
+ precision: "fp16"
54
+ nworkers: 16
55
+ distributed: true
56
+ accelerator: "ddp"
57
+ version: null
58
+ accumulate_grad_batches: 1
59
+ use_repetition_token: true
60
+ use_repetition_gating: false
61
+ repetition_penalty: 1.0
62
+ sampling_temperature: 1.0
63
+ top_k: -1
64
+ min_top_k: 3
65
+ top_p: 0.8
66
+ sample_num: 4
67
+ length_penalty_max_length: 15000
68
+ length_penalty_max_prob: 0.95
69
+ max_input_length: 2048
70
+ max_output_length: 2000
71
+ sample_rate: 16000
72
+ n_codes: 1024
73
+ n_cluster_groups: 1
74
+ phone_context_window: 4
75
+ phoneset_size: 1000
76
+ inference:
77
+ top_k: 5
GPT_SoVITS/configs/s2.json ADDED
@@ -0,0 +1,91 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 100,
4
+ "eval_interval": 500,
5
+ "seed": 1234,
6
+ "epochs": 100,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 32,
14
+ "fp16_run": true,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 20480,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "text_low_lr_rate": 0.4,
22
+ "grad_ckpt": false
23
+ },
24
+ "data": {
25
+ "max_wav_value": 32768.0,
26
+ "sampling_rate": 32000,
27
+ "filter_length": 2048,
28
+ "hop_length": 640,
29
+ "win_length": 2048,
30
+ "n_mel_channels": 128,
31
+ "mel_fmin": 0.0,
32
+ "mel_fmax": null,
33
+ "add_blank": true,
34
+ "n_speakers": 300,
35
+ "cleaned_text": true
36
+ },
37
+ "model": {
38
+ "inter_channels": 192,
39
+ "hidden_channels": 192,
40
+ "filter_channels": 768,
41
+ "n_heads": 2,
42
+ "n_layers": 6,
43
+ "kernel_size": 3,
44
+ "p_dropout": 0.1,
45
+ "resblock": "1",
46
+ "resblock_kernel_sizes": [
47
+ 3,
48
+ 7,
49
+ 11
50
+ ],
51
+ "resblock_dilation_sizes": [
52
+ [
53
+ 1,
54
+ 3,
55
+ 5
56
+ ],
57
+ [
58
+ 1,
59
+ 3,
60
+ 5
61
+ ],
62
+ [
63
+ 1,
64
+ 3,
65
+ 5
66
+ ]
67
+ ],
68
+ "upsample_rates": [
69
+ 10,
70
+ 8,
71
+ 2,
72
+ 2,
73
+ 2
74
+ ],
75
+ "upsample_initial_channel": 512,
76
+ "upsample_kernel_sizes": [
77
+ 16,
78
+ 16,
79
+ 8,
80
+ 2,
81
+ 2
82
+ ],
83
+ "n_layers_q": 3,
84
+ "use_spectral_norm": false,
85
+ "gin_channels": 512,
86
+ "semantic_frame_rate": "25hz",
87
+ "freeze_quantizer": true
88
+ },
89
+ "s2_ckpt_dir": "logs/s2/big2k1",
90
+ "content_module": "cnhubert"
91
+ }
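The values in s2.json above hang together arithmetically: the hop length implies a 50 fps mel frame rate, each training segment covers 32 frames, and the product of the generator's upsample_rates equals the hop length. The short sketch below is an illustration only (not part of this commit) and just checks those relationships after loading the file.

```python
# Illustrative sanity check over s2.json (not part of the repository).
import json

with open("GPT_SoVITS/configs/s2.json") as f:
    h = json.load(f)

sr = h["data"]["sampling_rate"]      # 32000
hop = h["data"]["hop_length"]        # 640
seg = h["train"]["segment_size"]     # 20480

print(sr / hop)                      # 50.0 -> 50 spectrogram frames per second
print(seg / hop)                     # 32.0 -> each training segment is 32 frames (0.64 s)

rates = h["model"]["upsample_rates"]  # [10, 8, 2, 2, 2]
prod = 1
for r in rates:
    prod *= r
print(prod == hop)                   # True -> the vocoder upsamples one frame back to 640 samples
```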
GPT_SoVITS/configs/train.yaml ADDED
@@ -0,0 +1,32 @@
1
+ gpu:
2
+ n_card: 1
3
+ n_process_per_card: 2
4
+ io:
5
+ text_path: D:\RVC1006\GPT-SoVITS\GPT_SoVITS
6
+ save_every_n_epoch: 1
7
+ precision: 16-mixed
8
+ gradient_clip: 1.0
9
+ optimizer:
10
+ lr: 0.01
11
+ lr_init: 0.00001
12
+ lr_end: 0.0001
13
+ warmup_steps: 2000
14
+ decay_steps: 40000
15
+ data:
16
+ max_eval_sample: 8
17
+ max_sec: 54
18
+ num_workers: 1
19
+ pad_val: 1024 # same with EOS in model
20
+ model:
21
+ vocab_size: 1025
22
+ phoneme_vocab_size: 512
23
+ embedding_dim: 512
24
+ hidden_dim: 512
25
+ head: 16
26
+ linear_units: 2048
27
+ n_layer: 24
28
+ dropout: 0
29
+ EOS: 1024
30
+ random_bert: 0
31
+ inference:
32
+ top_k: 5
GPT_SoVITS/configs/tts_infer.yaml ADDED
@@ -0,0 +1,32 @@
1
+ custom:
2
+ bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
3
+ cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
4
+ device: cuda
5
+ is_half: true
6
+ t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
7
+ version: v2
8
+ vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
9
+ v1:
10
+ bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
11
+ cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
12
+ device: cpu
13
+ is_half: false
14
+ t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
15
+ version: v1
16
+ vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth
17
+ v2:
18
+ bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
19
+ cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
20
+ device: cpu
21
+ is_half: false
22
+ t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
23
+ version: v2
24
+ vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
25
+ v3:
26
+ bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
27
+ cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
28
+ device: cpu
29
+ is_half: false
30
+ t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt
31
+ version: v3
32
+ vits_weights_path: GPT_SoVITS/pretrained_models/s2Gv3.pth
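tts_infer.yaml groups inference settings into named presets (custom plus one block per model version), each carrying a device and precision choice and the paths to the BERT, HuBERT, t2s (s1) and VITS (s2) weights. The sketch below is only an illustration of that layout using PyYAML; it is not the loader GPT-SoVITS itself uses.

```python
# Illustrative only: read one preset block from tts_infer.yaml with PyYAML.
import yaml

with open("GPT_SoVITS/configs/tts_infer.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

preset = cfg["v2"]                       # one of: custom, v1, v2, v3
print(preset["device"], preset["is_half"])
print(preset["t2s_weights_path"])        # t2s (stage-1) checkpoint path
print(preset["vits_weights_path"])       # VITS (stage-2) generator checkpoint path
```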
GPT_SoVITS/f5_tts/model/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ # from f5_tts.model.cfm import CFM
2
+ #
3
+ # from f5_tts.model.backbones.unett import UNetT
4
+ from GPT_SoVITS.f5_tts.model.backbones.dit import DiT
5
+ # from f5_tts.model.backbones.dit import DiTNoCond
6
+ # from f5_tts.model.backbones.dit import DiTNoCondNoT
7
+ # from f5_tts.model.backbones.mmdit import MMDiT
8
+
9
+ # from f5_tts.model.trainer import Trainer
10
+
11
+
12
+ # __all__ = ["CFM", "UNetT", "DiT", "MMDiT", "Trainer"]
13
+ # __all__ = ["CFM", "UNetT", "DiTNoCond","DiT", "MMDiT"]
GPT_SoVITS/f5_tts/model/backbones/README.md ADDED
@@ -0,0 +1,20 @@
1
+ ## Backbones quick introduction
2
+
3
+
4
+ ### unett.py
5
+ - flat unet transformer
6
+ - structure same as in e2-tts & voicebox paper except using rotary pos emb
7
+ - update: allow possible abs pos emb & convnextv2 blocks for embedded text before concat
8
+
9
+ ### dit.py
10
+ - adaln-zero dit
11
+ - embedded timestep as condition
12
+ - concatted noised_input + masked_cond + embedded_text, linear proj in
13
+ - possible abs pos emb & convnextv2 blocks for embedded text before concat
14
+ - possible long skip connection (first layer to last layer)
15
+
16
+ ### mmdit.py
17
+ - sd3 structure
18
+ - timestep as condition
19
+ - left stream: text embedded and applied an abs pos emb
20
+ - right stream: masked_cond & noised_input concatted and with same conv pos emb as unett
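To make the interfaces described above concrete, here is a minimal, hypothetical usage sketch of the DiT backbone defined in dit.py below. The dimensions and hyperparameters are illustrative assumptions, not the configuration GPT-SoVITS ships; the call mirrors DiT.forward as added in this commit (channels-first mel and condition features, per-item lengths, a time step, and the dt_base_bootstrap term).

```python
# Hypothetical sketch (illustrative sizes, not the shipped configuration).
import torch
from GPT_SoVITS.f5_tts.model.backbones.dit import DiT

model = DiT(dim=512, depth=8, heads=8, dim_head=64,
            mel_dim=100, text_dim=256, conv_layers=4)

b, n = 2, 300
x0 = torch.randn(b, 100, n)        # noised mel, channels-first (b, mel_dim, n)
cond0 = torch.randn(b, 100, n)     # masked conditioning mel, same layout
text0 = torch.randn(b, 256, n)     # condition feature, (b, text_dim, n)
x_lens = torch.tensor([300, 250])  # valid frame counts per batch item
t = torch.rand(b)                  # diffusion time step
dt = torch.rand(b)                 # dt_base_bootstrap, embedded and added to the time embedding

out = model(x0, cond0, x_lens, t, dt, text0)  # -> (b, n, mel_dim)
```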
GPT_SoVITS/f5_tts/model/backbones/dit.py ADDED
@@ -0,0 +1,180 @@
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import torch
13
+ from torch import nn
14
+ from torch.utils.checkpoint import checkpoint
15
+
16
+ from x_transformers.x_transformers import RotaryEmbedding
17
+
18
+ from GPT_SoVITS.f5_tts.model.modules import (
19
+ TimestepEmbedding,
20
+ ConvNeXtV2Block,
21
+ ConvPositionEmbedding,
22
+ DiTBlock,
23
+ AdaLayerNormZero_Final,
24
+ precompute_freqs_cis,
25
+ get_pos_embed_indices,
26
+ )
27
+
28
+ from module.commons import sequence_mask
29
+
30
+
31
+ class TextEmbedding(nn.Module):
32
+ def __init__(self, text_dim, conv_layers=0, conv_mult=2):
33
+ super().__init__()
34
+ if conv_layers > 0:
35
+ self.extra_modeling = True
36
+ self.precompute_max_pos = 4096 # ~44s of 24khz audio
37
+ self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
38
+ self.text_blocks = nn.Sequential(
39
+ *[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
40
+ )
41
+ else:
42
+ self.extra_modeling = False
43
+
44
+ def forward(self, text: int["b nt"], seq_len, drop_text=False): # noqa: F722
45
+ batch, text_len = text.shape[0], text.shape[1]
46
+
47
+ if drop_text: # cfg for text
48
+ text = torch.zeros_like(text)
49
+
50
+ # possible extra modeling
51
+ if self.extra_modeling:
52
+ # sinus pos emb
53
+ batch_start = torch.zeros((batch,), dtype=torch.long)
54
+ pos_idx = get_pos_embed_indices(batch_start, seq_len, max_pos=self.precompute_max_pos)
55
+ text_pos_embed = self.freqs_cis[pos_idx]
56
+
57
+ # print(23333333,text.shape,text_pos_embed.shape)#torch.Size([7, 465, 256]) torch.Size([7, 465, 256])
58
+
59
+ text = text + text_pos_embed
60
+
61
+ # convnextv2 blocks
62
+ text = self.text_blocks(text)
63
+
64
+ return text
65
+
66
+
67
+ # noised input audio and context mixing embedding
68
+
69
+
70
+ class InputEmbedding(nn.Module):
71
+ def __init__(self, mel_dim, text_dim, out_dim):
72
+ super().__init__()
73
+ self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
74
+ self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)
75
+
76
+ def forward(self, x: float["b n d"], cond: float["b n d"], text_embed: float["b n d"], drop_audio_cond=False): # noqa: F722
77
+ if drop_audio_cond: # cfg for cond audio
78
+ cond = torch.zeros_like(cond)
79
+
80
+ x = self.proj(torch.cat((x, cond, text_embed), dim=-1))
81
+ x = self.conv_pos_embed(x) + x
82
+ return x
83
+
84
+
85
+ # Transformer backbone using DiT blocks
86
+
87
+
88
+ class DiT(nn.Module):
89
+ def __init__(
90
+ self,
91
+ *,
92
+ dim,
93
+ depth=8,
94
+ heads=8,
95
+ dim_head=64,
96
+ dropout=0.1,
97
+ ff_mult=4,
98
+ mel_dim=100,
99
+ text_dim=None,
100
+ conv_layers=0,
101
+ long_skip_connection=False,
102
+ ):
103
+ super().__init__()
104
+
105
+ self.time_embed = TimestepEmbedding(dim)
106
+ self.d_embed = TimestepEmbedding(dim)
107
+ if text_dim is None:
108
+ text_dim = mel_dim
109
+ self.text_embed = TextEmbedding(text_dim, conv_layers=conv_layers)
110
+ self.input_embed = InputEmbedding(mel_dim, text_dim, dim)
111
+
112
+ self.rotary_embed = RotaryEmbedding(dim_head)
113
+
114
+ self.dim = dim
115
+ self.depth = depth
116
+
117
+ self.transformer_blocks = nn.ModuleList(
118
+ [DiTBlock(dim=dim, heads=heads, dim_head=dim_head, ff_mult=ff_mult, dropout=dropout) for _ in range(depth)]
119
+ )
120
+ self.long_skip_connection = nn.Linear(dim * 2, dim, bias=False) if long_skip_connection else None
121
+
122
+ self.norm_out = AdaLayerNormZero_Final(dim) # final modulation
123
+ self.proj_out = nn.Linear(dim, mel_dim)
124
+
125
+ def ckpt_wrapper(self, module):
126
+ # https://github.com/chuanyangjin/fast-DiT/blob/main/models.py
127
+ def ckpt_forward(*inputs):
128
+ outputs = module(*inputs)
129
+ return outputs
130
+
131
+ return ckpt_forward
132
+
133
+ def forward( # x, prompt_x, x_lens, t, style,cond
134
+ self, # d is channel, n is T
135
+ x0: float["b n d"], # noised input audio # noqa: F722
136
+ cond0: float["b n d"], # masked cond audio # noqa: F722
137
+ x_lens,
138
+ time: float["b"] | float[""], # time step # noqa: F821 F722
139
+ dt_base_bootstrap,
140
+ text0, # : int["b nt"] # noqa: F722 # condition feature
141
+ use_grad_ckpt=False, # bool
142
+ ### not used
143
+ drop_audio_cond=False, # cfg for cond audio
144
+ drop_text=False, # cfg for text
145
+ # mask: bool["b n"] | None = None, # noqa: F722
146
+ ):
147
+ x = x0.transpose(2, 1)
148
+ cond = cond0.transpose(2, 1)
149
+ text = text0.transpose(2, 1)
150
+ mask = sequence_mask(x_lens, max_length=x.size(1)).to(x.device)
151
+
152
+ batch, seq_len = x.shape[0], x.shape[1]
153
+ if time.ndim == 0:
154
+ time = time.repeat(batch)
155
+
156
+ # t: conditioning time, c: context (text + masked cond audio), x: noised input audio
157
+ t = self.time_embed(time)
158
+ dt = self.d_embed(dt_base_bootstrap)
159
+ t += dt
160
+ text_embed = self.text_embed(text, seq_len, drop_text=drop_text) ###need to change
161
+ x = self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond)
162
+
163
+ rope = self.rotary_embed.forward_from_seq_len(seq_len)
164
+
165
+ if self.long_skip_connection is not None:
166
+ residual = x
167
+
168
+ for block in self.transformer_blocks:
169
+ if use_grad_ckpt:
170
+ x = checkpoint(self.ckpt_wrapper(block), x, t, mask, rope, use_reentrant=False)
171
+ else:
172
+ x = block(x, t, mask=mask, rope=rope)
173
+
174
+ if self.long_skip_connection is not None:
175
+ x = self.long_skip_connection(torch.cat((x, residual), dim=-1))
176
+
177
+ x = self.norm_out(x, t)
178
+ output = self.proj_out(x)
179
+
180
+ return output
GPT_SoVITS/f5_tts/model/backbones/mmdit.py ADDED
@@ -0,0 +1,146 @@
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import torch
13
+ from torch import nn
14
+
15
+ from x_transformers.x_transformers import RotaryEmbedding
16
+
17
+ from f5_tts.model.modules import (
18
+ TimestepEmbedding,
19
+ ConvPositionEmbedding,
20
+ MMDiTBlock,
21
+ AdaLayerNormZero_Final,
22
+ precompute_freqs_cis,
23
+ get_pos_embed_indices,
24
+ )
25
+
26
+
27
+ # text embedding
28
+
29
+
30
+ class TextEmbedding(nn.Module):
31
+ def __init__(self, out_dim, text_num_embeds):
32
+ super().__init__()
33
+ self.text_embed = nn.Embedding(text_num_embeds + 1, out_dim) # will use 0 as filler token
34
+
35
+ self.precompute_max_pos = 1024
36
+ self.register_buffer("freqs_cis", precompute_freqs_cis(out_dim, self.precompute_max_pos), persistent=False)
37
+
38
+ def forward(self, text: int["b nt"], drop_text=False) -> int["b nt d"]: # noqa: F722
39
+ text = text + 1
40
+ if drop_text:
41
+ text = torch.zeros_like(text)
42
+ text = self.text_embed(text)
43
+
44
+ # sinus pos emb
45
+ batch_start = torch.zeros((text.shape[0],), dtype=torch.long)
46
+ batch_text_len = text.shape[1]
47
+ pos_idx = get_pos_embed_indices(batch_start, batch_text_len, max_pos=self.precompute_max_pos)
48
+ text_pos_embed = self.freqs_cis[pos_idx]
49
+
50
+ text = text + text_pos_embed
51
+
52
+ return text
53
+
54
+
55
+ # noised input & masked cond audio embedding
56
+
57
+
58
+ class AudioEmbedding(nn.Module):
59
+ def __init__(self, in_dim, out_dim):
60
+ super().__init__()
61
+ self.linear = nn.Linear(2 * in_dim, out_dim)
62
+ self.conv_pos_embed = ConvPositionEmbedding(out_dim)
63
+
64
+ def forward(self, x: float["b n d"], cond: float["b n d"], drop_audio_cond=False): # noqa: F722
65
+ if drop_audio_cond:
66
+ cond = torch.zeros_like(cond)
67
+ x = torch.cat((x, cond), dim=-1)
68
+ x = self.linear(x)
69
+ x = self.conv_pos_embed(x) + x
70
+ return x
71
+
72
+
73
+ # Transformer backbone using MM-DiT blocks
74
+
75
+
76
+ class MMDiT(nn.Module):
77
+ def __init__(
78
+ self,
79
+ *,
80
+ dim,
81
+ depth=8,
82
+ heads=8,
83
+ dim_head=64,
84
+ dropout=0.1,
85
+ ff_mult=4,
86
+ text_num_embeds=256,
87
+ mel_dim=100,
88
+ ):
89
+ super().__init__()
90
+
91
+ self.time_embed = TimestepEmbedding(dim)
92
+ self.text_embed = TextEmbedding(dim, text_num_embeds)
93
+ self.audio_embed = AudioEmbedding(mel_dim, dim)
94
+
95
+ self.rotary_embed = RotaryEmbedding(dim_head)
96
+
97
+ self.dim = dim
98
+ self.depth = depth
99
+
100
+ self.transformer_blocks = nn.ModuleList(
101
+ [
102
+ MMDiTBlock(
103
+ dim=dim,
104
+ heads=heads,
105
+ dim_head=dim_head,
106
+ dropout=dropout,
107
+ ff_mult=ff_mult,
108
+ context_pre_only=i == depth - 1,
109
+ )
110
+ for i in range(depth)
111
+ ]
112
+ )
113
+ self.norm_out = AdaLayerNormZero_Final(dim) # final modulation
114
+ self.proj_out = nn.Linear(dim, mel_dim)
115
+
116
+ def forward(
117
+ self,
118
+ x: float["b n d"], # nosied input audio # noqa: F722
119
+ cond: float["b n d"], # masked cond audio # noqa: F722
120
+ text: int["b nt"], # text # noqa: F722
121
+ time: float["b"] | float[""], # time step # noqa: F821 F722
122
+ drop_audio_cond, # cfg for cond audio
123
+ drop_text, # cfg for text
124
+ mask: bool["b n"] | None = None, # noqa: F722
125
+ ):
126
+ batch = x.shape[0]
127
+ if time.ndim == 0:
128
+ time = time.repeat(batch)
129
+
130
+ # t: conditioning (time), c: context (text + masked cond audio), x: noised input audio
131
+ t = self.time_embed(time)
132
+ c = self.text_embed(text, drop_text=drop_text)
133
+ x = self.audio_embed(x, cond, drop_audio_cond=drop_audio_cond)
134
+
135
+ seq_len = x.shape[1]
136
+ text_len = text.shape[1]
137
+ rope_audio = self.rotary_embed.forward_from_seq_len(seq_len)
138
+ rope_text = self.rotary_embed.forward_from_seq_len(text_len)
139
+
140
+ for block in self.transformer_blocks:
141
+ c, x = block(x, c, t, mask=mask, rope=rope_audio, c_rope=rope_text)
142
+
143
+ x = self.norm_out(x, t)
144
+ output = self.proj_out(x)
145
+
146
+ return output
GPT_SoVITS/f5_tts/model/backbones/unett.py ADDED
@@ -0,0 +1,219 @@
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+ from typing import Literal
12
+
13
+ import torch
14
+ from torch import nn
15
+ import torch.nn.functional as F
16
+
17
+ from x_transformers import RMSNorm
18
+ from x_transformers.x_transformers import RotaryEmbedding
19
+
20
+ from f5_tts.model.modules import (
21
+ TimestepEmbedding,
22
+ ConvNeXtV2Block,
23
+ ConvPositionEmbedding,
24
+ Attention,
25
+ AttnProcessor,
26
+ FeedForward,
27
+ precompute_freqs_cis,
28
+ get_pos_embed_indices,
29
+ )
30
+
31
+
32
+ # Text embedding
33
+
34
+
35
+ class TextEmbedding(nn.Module):
36
+ def __init__(self, text_num_embeds, text_dim, conv_layers=0, conv_mult=2):
37
+ super().__init__()
38
+ self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim) # use 0 as filler token
39
+
40
+ if conv_layers > 0:
41
+ self.extra_modeling = True
42
+ self.precompute_max_pos = 4096 # ~44s of 24khz audio
43
+ self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
44
+ self.text_blocks = nn.Sequential(
45
+ *[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
46
+ )
47
+ else:
48
+ self.extra_modeling = False
49
+
50
+ def forward(self, text: int["b nt"], seq_len, drop_text=False): # noqa: F722
51
+ text = text + 1 # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
52
+ text = text[:, :seq_len] # curtail if character tokens are more than the mel spec tokens
53
+ batch, text_len = text.shape[0], text.shape[1]
54
+ text = F.pad(text, (0, seq_len - text_len), value=0)
55
+
56
+ if drop_text: # cfg for text
57
+ text = torch.zeros_like(text)
58
+
59
+ text = self.text_embed(text) # b n -> b n d
60
+
61
+ # possible extra modeling
62
+ if self.extra_modeling:
63
+ # sinus pos emb
64
+ batch_start = torch.zeros((batch,), dtype=torch.long)
65
+ pos_idx = get_pos_embed_indices(batch_start, seq_len, max_pos=self.precompute_max_pos)
66
+ text_pos_embed = self.freqs_cis[pos_idx]
67
+ text = text + text_pos_embed
68
+
69
+ # convnextv2 blocks
70
+ text = self.text_blocks(text)
71
+
72
+ return text
73
+
74
+
75
+ # noised input audio and context mixing embedding
76
+
77
+
78
+ class InputEmbedding(nn.Module):
79
+ def __init__(self, mel_dim, text_dim, out_dim):
80
+ super().__init__()
81
+ self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
82
+ self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)
83
+
84
+ def forward(self, x: float["b n d"], cond: float["b n d"], text_embed: float["b n d"], drop_audio_cond=False): # noqa: F722
85
+ if drop_audio_cond: # cfg for cond audio
86
+ cond = torch.zeros_like(cond)
87
+
88
+ x = self.proj(torch.cat((x, cond, text_embed), dim=-1))
89
+ x = self.conv_pos_embed(x) + x
90
+ return x
91
+
92
+
93
+ # Flat UNet Transformer backbone
94
+
95
+
96
+ class UNetT(nn.Module):
97
+ def __init__(
98
+ self,
99
+ *,
100
+ dim,
101
+ depth=8,
102
+ heads=8,
103
+ dim_head=64,
104
+ dropout=0.1,
105
+ ff_mult=4,
106
+ mel_dim=100,
107
+ text_num_embeds=256,
108
+ text_dim=None,
109
+ conv_layers=0,
110
+ skip_connect_type: Literal["add", "concat", "none"] = "concat",
111
+ ):
112
+ super().__init__()
113
+ assert depth % 2 == 0, "UNet-Transformer's depth should be even."
114
+
115
+ self.time_embed = TimestepEmbedding(dim)
116
+ if text_dim is None:
117
+ text_dim = mel_dim
118
+ self.text_embed = TextEmbedding(text_num_embeds, text_dim, conv_layers=conv_layers)
119
+ self.input_embed = InputEmbedding(mel_dim, text_dim, dim)
120
+
121
+ self.rotary_embed = RotaryEmbedding(dim_head)
122
+
123
+ # transformer layers & skip connections
124
+
125
+ self.dim = dim
126
+ self.skip_connect_type = skip_connect_type
127
+ needs_skip_proj = skip_connect_type == "concat"
128
+
129
+ self.depth = depth
130
+ self.layers = nn.ModuleList([])
131
+
132
+ for idx in range(depth):
133
+ is_later_half = idx >= (depth // 2)
134
+
135
+ attn_norm = RMSNorm(dim)
136
+ attn = Attention(
137
+ processor=AttnProcessor(),
138
+ dim=dim,
139
+ heads=heads,
140
+ dim_head=dim_head,
141
+ dropout=dropout,
142
+ )
143
+
144
+ ff_norm = RMSNorm(dim)
145
+ ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
146
+
147
+ skip_proj = nn.Linear(dim * 2, dim, bias=False) if needs_skip_proj and is_later_half else None
148
+
149
+ self.layers.append(
150
+ nn.ModuleList(
151
+ [
152
+ skip_proj,
153
+ attn_norm,
154
+ attn,
155
+ ff_norm,
156
+ ff,
157
+ ]
158
+ )
159
+ )
160
+
161
+ self.norm_out = RMSNorm(dim)
162
+ self.proj_out = nn.Linear(dim, mel_dim)
163
+
164
+ def forward(
165
+ self,
166
+ x: float["b n d"], # noised input audio # noqa: F722
167
+ cond: float["b n d"], # masked cond audio # noqa: F722
168
+ text: int["b nt"], # text # noqa: F722
169
+ time: float["b"] | float[""], # time step # noqa: F821 F722
170
+ drop_audio_cond, # cfg for cond audio
171
+ drop_text, # cfg for text
172
+ mask: bool["b n"] | None = None, # noqa: F722
173
+ ):
174
+ batch, seq_len = x.shape[0], x.shape[1]
175
+ if time.ndim == 0:
176
+ time = time.repeat(batch)
177
+
178
+ # t: conditioning time, c: context (text + masked cond audio), x: noised input audio
179
+ t = self.time_embed(time)
180
+ text_embed = self.text_embed(text, seq_len, drop_text=drop_text)
181
+ x = self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond)
182
+
183
+ # postfix time t to input x, [b n d] -> [b n+1 d]
184
+ x = torch.cat([t.unsqueeze(1), x], dim=1) # pack t to x
185
+ if mask is not None:
186
+ mask = F.pad(mask, (1, 0), value=1)
187
+
188
+ rope = self.rotary_embed.forward_from_seq_len(seq_len + 1)
189
+
190
+ # flat unet transformer
191
+ skip_connect_type = self.skip_connect_type
192
+ skips = []
193
+ for idx, (maybe_skip_proj, attn_norm, attn, ff_norm, ff) in enumerate(self.layers):
194
+ layer = idx + 1
195
+
196
+ # skip connection logic
197
+ is_first_half = layer <= (self.depth // 2)
198
+ is_later_half = not is_first_half
199
+
200
+ if is_first_half:
201
+ skips.append(x)
202
+
203
+ if is_later_half:
204
+ skip = skips.pop()
205
+ if skip_connect_type == "concat":
206
+ x = torch.cat((x, skip), dim=-1)
207
+ x = maybe_skip_proj(x)
208
+ elif skip_connect_type == "add":
209
+ x = x + skip
210
+
211
+ # attention and feedforward blocks
212
+ x = attn(attn_norm(x), rope=rope, mask=mask) + x
213
+ x = ff(ff_norm(x)) + x
214
+
215
+ assert len(skips) == 0
216
+
217
+ x = self.norm_out(x)[:, 1:, :] # unpack t from x
218
+
219
+ return self.proj_out(x)
GPT_SoVITS/f5_tts/model/modules.py ADDED
@@ -0,0 +1,666 @@
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import math
13
+ from typing import Optional
14
+
15
+ import torch
16
+ import torch.nn.functional as F
17
+ import torchaudio
18
+ from librosa.filters import mel as librosa_mel_fn
19
+ from torch import nn
20
+ from x_transformers.x_transformers import apply_rotary_pos_emb
21
+
22
+
23
+ # raw wav to mel spec
24
+
25
+
26
+ mel_basis_cache = {}
27
+ hann_window_cache = {}
28
+
29
+
30
+ def get_bigvgan_mel_spectrogram(
31
+ waveform,
32
+ n_fft=1024,
33
+ n_mel_channels=100,
34
+ target_sample_rate=24000,
35
+ hop_length=256,
36
+ win_length=1024,
37
+ fmin=0,
38
+ fmax=None,
39
+ center=False,
40
+ ): # Copy from https://github.com/NVIDIA/BigVGAN/tree/main
41
+ device = waveform.device
42
+ key = f"{n_fft}_{n_mel_channels}_{target_sample_rate}_{hop_length}_{win_length}_{fmin}_{fmax}_{device}"
43
+
44
+ if key not in mel_basis_cache:
45
+ mel = librosa_mel_fn(sr=target_sample_rate, n_fft=n_fft, n_mels=n_mel_channels, fmin=fmin, fmax=fmax)
46
+ mel_basis_cache[key] = torch.from_numpy(mel).float().to(device) # TODO: why do they need .float()?
47
+ hann_window_cache[key] = torch.hann_window(win_length).to(device)
48
+
49
+ mel_basis = mel_basis_cache[key]
50
+ hann_window = hann_window_cache[key]
51
+
52
+ padding = (n_fft - hop_length) // 2
53
+ waveform = torch.nn.functional.pad(waveform.unsqueeze(1), (padding, padding), mode="reflect").squeeze(1)
54
+
55
+ spec = torch.stft(
56
+ waveform,
57
+ n_fft,
58
+ hop_length=hop_length,
59
+ win_length=win_length,
60
+ window=hann_window,
61
+ center=center,
62
+ pad_mode="reflect",
63
+ normalized=False,
64
+ onesided=True,
65
+ return_complex=True,
66
+ )
67
+ spec = torch.sqrt(torch.view_as_real(spec).pow(2).sum(-1) + 1e-9)
68
+
69
+ mel_spec = torch.matmul(mel_basis, spec)
70
+ mel_spec = torch.log(torch.clamp(mel_spec, min=1e-5))
71
+
72
+ return mel_spec
73
+
74
+
75
+ def get_vocos_mel_spectrogram(
76
+ waveform,
77
+ n_fft=1024,
78
+ n_mel_channels=100,
79
+ target_sample_rate=24000,
80
+ hop_length=256,
81
+ win_length=1024,
82
+ ):
83
+ mel_stft = torchaudio.transforms.MelSpectrogram(
84
+ sample_rate=target_sample_rate,
85
+ n_fft=n_fft,
86
+ win_length=win_length,
87
+ hop_length=hop_length,
88
+ n_mels=n_mel_channels,
89
+ power=1,
90
+ center=True,
91
+ normalized=False,
92
+ norm=None,
93
+ ).to(waveform.device)
94
+ if len(waveform.shape) == 3:
95
+ waveform = waveform.squeeze(1) # 'b 1 nw -> b nw'
96
+
97
+ assert len(waveform.shape) == 2
98
+
99
+ mel = mel_stft(waveform)
100
+ mel = mel.clamp(min=1e-5).log()
101
+ return mel
102
+
103
+
104
+ class MelSpec(nn.Module):
105
+ def __init__(
106
+ self,
107
+ n_fft=1024,
108
+ hop_length=256,
109
+ win_length=1024,
110
+ n_mel_channels=100,
111
+ target_sample_rate=24_000,
112
+ mel_spec_type="vocos",
113
+ ):
114
+ super().__init__()
115
+ assert mel_spec_type in ["vocos", "bigvgan"], "We only support two mel extraction backends: vocos or bigvgan"
116
+
117
+ self.n_fft = n_fft
118
+ self.hop_length = hop_length
119
+ self.win_length = win_length
120
+ self.n_mel_channels = n_mel_channels
121
+ self.target_sample_rate = target_sample_rate
122
+
123
+ if mel_spec_type == "vocos":
124
+ self.extractor = get_vocos_mel_spectrogram
125
+ elif mel_spec_type == "bigvgan":
126
+ self.extractor = get_bigvgan_mel_spectrogram
127
+
128
+ self.register_buffer("dummy", torch.tensor(0), persistent=False)
129
+
130
+ def forward(self, wav):
131
+ if self.dummy.device != wav.device:
132
+ self.to(wav.device)
133
+
134
+ mel = self.extractor(
135
+ waveform=wav,
136
+ n_fft=self.n_fft,
137
+ n_mel_channels=self.n_mel_channels,
138
+ target_sample_rate=self.target_sample_rate,
139
+ hop_length=self.hop_length,
140
+ win_length=self.win_length,
141
+ )
142
+
143
+ return mel
144
+
145
+
146
+ # sinusoidal position embedding
147
+
148
+
149
+ class SinusPositionEmbedding(nn.Module):
150
+ def __init__(self, dim):
151
+ super().__init__()
152
+ self.dim = dim
153
+
154
+ def forward(self, x, scale=1000):
155
+ device = x.device
156
+ half_dim = self.dim // 2
157
+ emb = math.log(10000) / (half_dim - 1)
158
+ emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb)
159
+ emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
160
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
161
+ return emb
162
+
163
+
164
+ # convolutional position embedding
165
+
166
+
167
+ class ConvPositionEmbedding(nn.Module):
168
+ def __init__(self, dim, kernel_size=31, groups=16):
169
+ super().__init__()
170
+ assert kernel_size % 2 != 0
171
+ self.conv1d = nn.Sequential(
172
+ nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=kernel_size // 2),
173
+ nn.Mish(),
174
+ nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=kernel_size // 2),
175
+ nn.Mish(),
176
+ )
177
+
178
+ def forward(self, x: float["b n d"], mask: bool["b n"] | None = None): # noqa: F722
179
+ if mask is not None:
180
+ mask = mask[..., None]
181
+ x = x.masked_fill(~mask, 0.0)
182
+
183
+ x = x.permute(0, 2, 1)
184
+ x = self.conv1d(x)
185
+ out = x.permute(0, 2, 1)
186
+
187
+ if mask is not None:
188
+ out = out.masked_fill(~mask, 0.0)
189
+
190
+ return out
191
+
192
+
193
+ # rotary positional embedding related
194
+
195
+
196
+ def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, theta_rescale_factor=1.0):
197
+ # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
198
+ # has some connection to NTK literature
199
+ # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
200
+ # https://github.com/lucidrains/rotary-embedding-torch/blob/main/rotary_embedding_torch/rotary_embedding_torch.py
201
+ theta *= theta_rescale_factor ** (dim / (dim - 2))
202
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
203
+ t = torch.arange(end, device=freqs.device) # type: ignore
204
+ freqs = torch.outer(t, freqs).float() # type: ignore
205
+ freqs_cos = torch.cos(freqs) # real part
206
+ freqs_sin = torch.sin(freqs) # imaginary part
207
+ return torch.cat([freqs_cos, freqs_sin], dim=-1)
208
+
209
+
210
+ def get_pos_embed_indices(start, length, max_pos, scale=1.0):
211
+ # length = length if isinstance(length, int) else length.max()
212
+ scale = scale * torch.ones_like(start, dtype=torch.float32) # in case scale is a scalar
213
+ pos = (
214
+ start.unsqueeze(1)
215
+ + (torch.arange(length, device=start.device, dtype=torch.float32).unsqueeze(0) * scale.unsqueeze(1)).long()
216
+ )
217
+ # avoid extra long error.
218
+ pos = torch.where(pos < max_pos, pos, max_pos - 1)
219
+ return pos
220
+
221
+
222
+ # Global Response Normalization layer (Instance Normalization ?)
223
+
224
+
225
+ class GRN(nn.Module):
226
+ def __init__(self, dim):
227
+ super().__init__()
228
+ self.gamma = nn.Parameter(torch.zeros(1, 1, dim))
229
+ self.beta = nn.Parameter(torch.zeros(1, 1, dim))
230
+
231
+ def forward(self, x):
232
+ Gx = torch.norm(x, p=2, dim=1, keepdim=True)
233
+ Nx = Gx / (Gx.mean(dim=-1, keepdim=True) + 1e-6)
234
+ return self.gamma * (x * Nx) + self.beta + x
235
+
236
+
237
+ # ConvNeXt-V2 Block https://github.com/facebookresearch/ConvNeXt-V2/blob/main/models/convnextv2.py
238
+ # ref: https://github.com/bfs18/e2_tts/blob/main/rfwave/modules.py#L108
239
+
240
+
241
+ class ConvNeXtV2Block(nn.Module):
242
+ def __init__(
243
+ self,
244
+ dim: int,
245
+ intermediate_dim: int,
246
+ dilation: int = 1,
247
+ ):
248
+ super().__init__()
249
+ padding = (dilation * (7 - 1)) // 2
250
+ self.dwconv = nn.Conv1d(
251
+ dim, dim, kernel_size=7, padding=padding, groups=dim, dilation=dilation
252
+ ) # depthwise conv
253
+ self.norm = nn.LayerNorm(dim, eps=1e-6)
254
+ self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers
255
+ self.act = nn.GELU()
256
+ self.grn = GRN(intermediate_dim)
257
+ self.pwconv2 = nn.Linear(intermediate_dim, dim)
258
+
259
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
260
+ residual = x
261
+ x = x.transpose(1, 2) # b n d -> b d n
262
+ x = self.dwconv(x)
263
+ x = x.transpose(1, 2) # b d n -> b n d
264
+ x = self.norm(x)
265
+ x = self.pwconv1(x)
266
+ x = self.act(x)
267
+ x = self.grn(x)
268
+ x = self.pwconv2(x)
269
+ return residual + x
270
+
271
+
272
+ # AdaLayerNormZero
273
+ # return with modulated x for attn input, and params for later mlp modulation
274
+
275
+
276
+ class AdaLayerNormZero(nn.Module):
277
+ def __init__(self, dim):
278
+ super().__init__()
279
+
280
+ self.silu = nn.SiLU()
281
+ self.linear = nn.Linear(dim, dim * 6)
282
+
283
+ self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
284
+
285
+ def forward(self, x, emb=None):
286
+ emb = self.linear(self.silu(emb))
287
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = torch.chunk(emb, 6, dim=1)
288
+
289
+ x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
290
+ return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
291
+
292
+
293
+ # AdaLayerNormZero for final layer
294
+ # return only with modulated x for attn input, cuz no more mlp modulation
295
+
296
+
297
+ class AdaLayerNormZero_Final(nn.Module):
298
+ def __init__(self, dim):
299
+ super().__init__()
300
+
301
+ self.silu = nn.SiLU()
302
+ self.linear = nn.Linear(dim, dim * 2)
303
+
304
+ self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
305
+
306
+ def forward(self, x, emb):
307
+ emb = self.linear(self.silu(emb))
308
+ scale, shift = torch.chunk(emb, 2, dim=1)
309
+
310
+ x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
311
+ return x
312
+
313
+
314
+ # FeedForward
315
+
316
+
317
+ class FeedForward(nn.Module):
318
+ def __init__(self, dim, dim_out=None, mult=4, dropout=0.0, approximate: str = "none"):
319
+ super().__init__()
320
+ inner_dim = int(dim * mult)
321
+ dim_out = dim_out if dim_out is not None else dim
322
+
323
+ activation = nn.GELU(approximate=approximate)
324
+ project_in = nn.Sequential(nn.Linear(dim, inner_dim), activation)
325
+ self.ff = nn.Sequential(project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out))
326
+
327
+ def forward(self, x):
328
+ return self.ff(x)
329
+
330
+
331
+ # Attention with possible joint part
332
+ # modified from diffusers/src/diffusers/models/attention_processor.py
333
+
334
+
335
+ class Attention(nn.Module):
336
+ def __init__(
337
+ self,
338
+ processor: JointAttnProcessor | AttnProcessor,
339
+ dim: int,
340
+ heads: int = 8,
341
+ dim_head: int = 64,
342
+ dropout: float = 0.0,
343
+ context_dim: Optional[int] = None, # if not None -> joint attention
344
+ context_pre_only=None,
345
+ ):
346
+ super().__init__()
347
+
348
+ if not hasattr(F, "scaled_dot_product_attention"):
349
+ raise ImportError("Attention requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0.")
350
+
351
+ self.processor = processor
352
+
353
+ self.dim = dim
354
+ self.heads = heads
355
+ self.inner_dim = dim_head * heads
356
+ self.dropout = dropout
357
+
358
+ self.context_dim = context_dim
359
+ self.context_pre_only = context_pre_only
360
+
361
+ self.to_q = nn.Linear(dim, self.inner_dim)
362
+ self.to_k = nn.Linear(dim, self.inner_dim)
363
+ self.to_v = nn.Linear(dim, self.inner_dim)
364
+
365
+ if self.context_dim is not None:
366
+ self.to_k_c = nn.Linear(context_dim, self.inner_dim)
367
+ self.to_v_c = nn.Linear(context_dim, self.inner_dim)
368
+ if self.context_pre_only is not None:
369
+ self.to_q_c = nn.Linear(context_dim, self.inner_dim)
370
+
371
+ self.to_out = nn.ModuleList([])
372
+ self.to_out.append(nn.Linear(self.inner_dim, dim))
373
+ self.to_out.append(nn.Dropout(dropout))
374
+
375
+ if self.context_pre_only is not None and not self.context_pre_only:
376
+ self.to_out_c = nn.Linear(self.inner_dim, dim)
377
+
378
+ def forward(
379
+ self,
380
+ x: float["b n d"], # noised input x # noqa: F722
381
+ c: float["b n d"] = None, # context c # noqa: F722
382
+ mask: bool["b n"] | None = None, # noqa: F722
383
+ rope=None, # rotary position embedding for x
384
+ c_rope=None, # rotary position embedding for c
385
+ ) -> torch.Tensor:
386
+ if c is not None:
387
+ return self.processor(self, x, c=c, mask=mask, rope=rope, c_rope=c_rope)
388
+ else:
389
+ return self.processor(self, x, mask=mask, rope=rope)
390
+
391
+
392
+ # Attention processor
393
+
394
+
395
+ # from torch.nn.attention import SDPBackend
396
+ # torch.backends.cuda.enable_flash_sdp(True)
397
+ class AttnProcessor:
398
+ def __init__(self):
399
+ pass
400
+
401
+ def __call__(
402
+ self,
403
+ attn: Attention,
404
+ x: float["b n d"], # noised input x # noqa: F722
405
+ mask: bool["b n"] | None = None, # noqa: F722
406
+ rope=None, # rotary position embedding
407
+ ) -> torch.FloatTensor:
408
+ batch_size = x.shape[0]
409
+
410
+ # `sample` projections.
411
+ query = attn.to_q(x)
412
+ key = attn.to_k(x)
413
+ value = attn.to_v(x)
414
+
415
+ # apply rotary position embedding
416
+ if rope is not None:
417
+ freqs, xpos_scale = rope
418
+ q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
419
+
420
+ query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
421
+ key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
422
+
423
+ # attention
424
+ inner_dim = key.shape[-1]
425
+ head_dim = inner_dim // attn.heads
426
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
427
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
428
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
429
+
430
+ # mask. e.g. inference got a batch with different target durations, mask out the padding
431
+ if mask is not None:
432
+ attn_mask = mask
433
+ attn_mask = attn_mask.unsqueeze(1).unsqueeze(1) # 'b n -> b 1 1 n'
434
+ # print(3433333333,attn_mask.shape)
435
+ attn_mask = attn_mask.expand(batch_size, attn.heads, query.shape[-2], key.shape[-2])
436
+ else:
437
+ attn_mask = None
438
+ # with torch.nn.attention.sdpa_kernel(backends=[SDPBackend.EFFICIENT_ATTENTION]):
439
+ # with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=True):
440
+ # with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=False):
441
+ # print(torch.backends.cuda.flash_sdp_enabled())
442
+ # print(torch.backends.cuda.mem_efficient_sdp_enabled())
443
+ # print(torch.backends.cuda.math_sdp_enabled())
444
+ x = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
445
+ x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
446
+ x = x.to(query.dtype)
447
+
448
+ # linear proj
449
+ x = attn.to_out[0](x)
450
+ # dropout
451
+ x = attn.to_out[1](x)
452
+
453
+ if mask is not None:
454
+ mask = mask.unsqueeze(-1)
455
+ x = x.masked_fill(~mask, 0.0)
456
+
457
+ return x
458
+
459
+
460
+ # Joint Attention processor for MM-DiT
461
+ # modified from diffusers/src/diffusers/models/attention_processor.py
462
+
463
+
464
+ class JointAttnProcessor:
465
+ def __init__(self):
466
+ pass
467
+
468
+ def __call__(
469
+ self,
470
+ attn: Attention,
471
+ x: float["b n d"], # noised input x # noqa: F722
472
+ c: float["b nt d"] = None, # context c, here text # noqa: F722
473
+ mask: bool["b n"] | None = None, # noqa: F722
474
+ rope=None, # rotary position embedding for x
475
+ c_rope=None, # rotary position embedding for c
476
+ ) -> torch.FloatTensor:
477
+ residual = x
478
+
479
+ batch_size = c.shape[0]
480
+
481
+ # `sample` projections.
482
+ query = attn.to_q(x)
483
+ key = attn.to_k(x)
484
+ value = attn.to_v(x)
485
+
486
+ # `context` projections.
487
+ c_query = attn.to_q_c(c)
488
+ c_key = attn.to_k_c(c)
489
+ c_value = attn.to_v_c(c)
490
+
491
+ # apply rope for context and noised input independently
492
+ if rope is not None:
493
+ freqs, xpos_scale = rope
494
+ q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
495
+ query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
496
+ key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
497
+ if c_rope is not None:
498
+ freqs, xpos_scale = c_rope
499
+ q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
500
+ c_query = apply_rotary_pos_emb(c_query, freqs, q_xpos_scale)
501
+ c_key = apply_rotary_pos_emb(c_key, freqs, k_xpos_scale)
502
+
503
+ # attention
504
+ query = torch.cat([query, c_query], dim=1)
505
+ key = torch.cat([key, c_key], dim=1)
506
+ value = torch.cat([value, c_value], dim=1)
507
+
508
+ inner_dim = key.shape[-1]
509
+ head_dim = inner_dim // attn.heads
510
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
511
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
512
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
513
+
514
+ # mask. e.g. inference got a batch with different target durations, mask out the padding
515
+ if mask is not None:
516
+ attn_mask = F.pad(mask, (0, c.shape[1]), value=True) # no mask for c (text)
517
+ attn_mask = attn_mask.unsqueeze(1).unsqueeze(1) # 'b n -> b 1 1 n'
518
+ attn_mask = attn_mask.expand(batch_size, attn.heads, query.shape[-2], key.shape[-2])
519
+ else:
520
+ attn_mask = None
521
+
522
+ x = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
523
+ x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
524
+ x = x.to(query.dtype)
525
+
526
+ # Split the attention outputs.
527
+ x, c = (
528
+ x[:, : residual.shape[1]],
529
+ x[:, residual.shape[1] :],
530
+ )
531
+
532
+ # linear proj
533
+ x = attn.to_out[0](x)
534
+ # dropout
535
+ x = attn.to_out[1](x)
536
+ if not attn.context_pre_only:
537
+ c = attn.to_out_c(c)
538
+
539
+ if mask is not None:
540
+ mask = mask.unsqueeze(-1)
541
+ x = x.masked_fill(~mask, 0.0)
542
+ # c = c.masked_fill(~mask, 0.) # no mask for c (text)
543
+
544
+ return x, c
545
+
546
+
547
+ # DiT Block
548
+
549
+
550
+ class DiTBlock(nn.Module):
551
+ def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1):
552
+ super().__init__()
553
+
554
+ self.attn_norm = AdaLayerNormZero(dim)
555
+ self.attn = Attention(
556
+ processor=AttnProcessor(),
557
+ dim=dim,
558
+ heads=heads,
559
+ dim_head=dim_head,
560
+ dropout=dropout,
561
+ )
562
+
563
+ self.ff_norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
564
+ self.ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
565
+
566
+ def forward(self, x, t, mask=None, rope=None): # x: noised input, t: time embedding
567
+ # pre-norm & modulation for attention input
568
+ norm, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.attn_norm(x, emb=t)
569
+
570
+ # attention
571
+ attn_output = self.attn(x=norm, mask=mask, rope=rope)
572
+
573
+ # process attention output for input x
574
+ x = x + gate_msa.unsqueeze(1) * attn_output
575
+
576
+ norm = self.ff_norm(x) * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
577
+ ff_output = self.ff(norm)
578
+ x = x + gate_mlp.unsqueeze(1) * ff_output
579
+
580
+ return x
581
+
582
+
583
+ # MMDiT Block https://arxiv.org/abs/2403.03206
584
+
585
+
586
+ class MMDiTBlock(nn.Module):
587
+ r"""
588
+ modified from diffusers/src/diffusers/models/attention.py
589
+
590
+ Notes:
591
+ _c: context-related (text, cond, etc.; the left branch in SD3 Fig. 2b)
592
+ _x: noised-input-related (the right branch)
593
+ context_pre_only: the last layer only does pre-norm + modulation, since there is no further FFN
594
+ """
595
+
596
+ def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1, context_pre_only=False):
597
+ super().__init__()
598
+
599
+ self.context_pre_only = context_pre_only
600
+
601
+ self.attn_norm_c = AdaLayerNormZero_Final(dim) if context_pre_only else AdaLayerNormZero(dim)
602
+ self.attn_norm_x = AdaLayerNormZero(dim)
603
+ self.attn = Attention(
604
+ processor=JointAttnProcessor(),
605
+ dim=dim,
606
+ heads=heads,
607
+ dim_head=dim_head,
608
+ dropout=dropout,
609
+ context_dim=dim,
610
+ context_pre_only=context_pre_only,
611
+ )
612
+
613
+ if not context_pre_only:
614
+ self.ff_norm_c = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
615
+ self.ff_c = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
616
+ else:
617
+ self.ff_norm_c = None
618
+ self.ff_c = None
619
+ self.ff_norm_x = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
620
+ self.ff_x = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
621
+
622
+ def forward(self, x, c, t, mask=None, rope=None, c_rope=None): # x: noised input, c: context, t: time embedding
623
+ # pre-norm & modulation for attention input
624
+ if self.context_pre_only:
625
+ norm_c = self.attn_norm_c(c, t)
626
+ else:
627
+ norm_c, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.attn_norm_c(c, emb=t)
628
+ norm_x, x_gate_msa, x_shift_mlp, x_scale_mlp, x_gate_mlp = self.attn_norm_x(x, emb=t)
629
+
630
+ # attention
631
+ x_attn_output, c_attn_output = self.attn(x=norm_x, c=norm_c, mask=mask, rope=rope, c_rope=c_rope)
632
+
633
+ # process attention output for context c
634
+ if self.context_pre_only:
635
+ c = None
636
+ else: # if not last layer
637
+ c = c + c_gate_msa.unsqueeze(1) * c_attn_output
638
+
639
+ norm_c = self.ff_norm_c(c) * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
640
+ c_ff_output = self.ff_c(norm_c)
641
+ c = c + c_gate_mlp.unsqueeze(1) * c_ff_output
642
+
643
+ # process attention output for input x
644
+ x = x + x_gate_msa.unsqueeze(1) * x_attn_output
645
+
646
+ norm_x = self.ff_norm_x(x) * (1 + x_scale_mlp[:, None]) + x_shift_mlp[:, None]
647
+ x_ff_output = self.ff_x(norm_x)
648
+ x = x + x_gate_mlp.unsqueeze(1) * x_ff_output
649
+
650
+ return c, x
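A minimal shape sketch of how MMDiTBlock is driven (sizes are illustrative and rotary embeddings are omitted; this is not the upstream training code):

import torch

dim = 512
block = MMDiTBlock(dim=dim, heads=8, dim_head=64)   # default context_pre_only=False
x = torch.randn(2, 100, dim)    # noised input stream
c = torch.randn(2, 32, dim)     # context (text) stream
t = torch.randn(2, dim)         # time conditioning, e.g. from TimestepEmbedding below
c_out, x_out = block(x, c, t)   # both streams keep their original shapes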
651
+
652
+
653
+ # time step conditioning embedding
654
+
655
+
656
+ class TimestepEmbedding(nn.Module):
657
+ def __init__(self, dim, freq_embed_dim=256):
658
+ super().__init__()
659
+ self.time_embed = SinusPositionEmbedding(freq_embed_dim)
660
+ self.time_mlp = nn.Sequential(nn.Linear(freq_embed_dim, dim), nn.SiLU(), nn.Linear(dim, dim))
661
+
662
+ def forward(self, timestep: float["b"]): # noqa: F821
663
+ time_hidden = self.time_embed(timestep)
664
+ time_hidden = time_hidden.to(timestep.dtype)
665
+ time = self.time_mlp(time_hidden) # b d
666
+ return time
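Similarly, a hedged sketch wiring TimestepEmbedding into the single-stream DiTBlock (hyperparameters are placeholders, not the shipped configs):

import torch

dim = 512
t_embed = TimestepEmbedding(dim)
block = DiTBlock(dim=dim, heads=8, dim_head=64)

x = torch.randn(2, 100, dim)                  # noised input: (batch, seq, dim)
t = t_embed(torch.rand(2))                    # (batch, dim) conditioning vector
mask = torch.ones(2, 100, dtype=torch.bool)   # all positions valid
y = block(x, t, mask=mask)                    # output keeps the shape of x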
GPT_SoVITS/pretrained_models/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ *
2
+ !.gitignore
GPT_SoVITS/pretrained_models/chinese-hubert-base/config.json ADDED
@@ -0,0 +1,72 @@
1
+ {
2
+ "_name_or_path": "/data/docker/liujing04/gpt-vits/chinese-hubert-base",
3
+ "activation_dropout": 0.1,
4
+ "apply_spec_augment": true,
5
+ "architectures": [
6
+ "HubertModel"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "bos_token_id": 1,
10
+ "classifier_proj_size": 256,
11
+ "conv_bias": false,
12
+ "conv_dim": [
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512,
19
+ 512
20
+ ],
21
+ "conv_kernel": [
22
+ 10,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 3,
27
+ 2,
28
+ 2
29
+ ],
30
+ "conv_stride": [
31
+ 5,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2,
37
+ 2
38
+ ],
39
+ "ctc_loss_reduction": "sum",
40
+ "ctc_zero_infinity": false,
41
+ "do_stable_layer_norm": false,
42
+ "eos_token_id": 2,
43
+ "feat_extract_activation": "gelu",
44
+ "feat_extract_norm": "group",
45
+ "feat_proj_dropout": 0.0,
46
+ "feat_proj_layer_norm": true,
47
+ "final_dropout": 0.1,
48
+ "hidden_act": "gelu",
49
+ "hidden_dropout": 0.1,
50
+ "hidden_size": 768,
51
+ "initializer_range": 0.02,
52
+ "intermediate_size": 3072,
53
+ "layer_norm_eps": 1e-05,
54
+ "layerdrop": 0.1,
55
+ "mask_feature_length": 10,
56
+ "mask_feature_min_masks": 0,
57
+ "mask_feature_prob": 0.0,
58
+ "mask_time_length": 10,
59
+ "mask_time_min_masks": 2,
60
+ "mask_time_prob": 0.05,
61
+ "model_type": "hubert",
62
+ "num_attention_heads": 12,
63
+ "num_conv_pos_embedding_groups": 16,
64
+ "num_conv_pos_embeddings": 128,
65
+ "num_feat_extract_layers": 7,
66
+ "num_hidden_layers": 12,
67
+ "pad_token_id": 0,
68
+ "torch_dtype": "float16",
69
+ "transformers_version": "4.30.2",
70
+ "use_weighted_layer_sum": false,
71
+ "vocab_size": 32
72
+ }
GPT_SoVITS/pretrained_models/chinese-hubert-base/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0,
7
+ "return_attention_mask": false,
8
+ "sampling_rate": 16000
9
+ }
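Together, these two JSON files are what transformers needs to construct the SSL front end; a hedged loading sketch (the weight file itself is not part of this upload, so the path is only illustrative):

import torch
from transformers import HubertModel, Wav2Vec2FeatureExtractor

model_dir = "GPT_SoVITS/pretrained_models/chinese-hubert-base"  # assumed local path with weights present
extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_dir)
model = HubertModel.from_pretrained(model_dir).eval()

wav16k = torch.randn(16000)  # 1 s of dummy audio at the 16 kHz rate declared above
inputs = extractor(wav16k.numpy(), sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    feats = model(inputs.input_values).last_hidden_state  # (1, frames, 768)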
GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/config.json ADDED
@@ -0,0 +1,34 @@
1
+ {
2
+ "_name_or_path": "/data/docker/liujing04/bert-vits2/Bert-VITS2-master20231106/bert/chinese-roberta-wwm-ext-large",
3
+ "architectures": [
4
+ "BertForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "directionality": "bidi",
10
+ "eos_token_id": 2,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 1024,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 4096,
16
+ "layer_norm_eps": 1e-12,
17
+ "max_position_embeddings": 512,
18
+ "model_type": "bert",
19
+ "num_attention_heads": 16,
20
+ "num_hidden_layers": 24,
21
+ "output_past": true,
22
+ "pad_token_id": 0,
23
+ "pooler_fc_size": 768,
24
+ "pooler_num_attention_heads": 12,
25
+ "pooler_num_fc_layers": 3,
26
+ "pooler_size_per_head": 128,
27
+ "pooler_type": "first_token_transform",
28
+ "position_embedding_type": "absolute",
29
+ "torch_dtype": "float16",
30
+ "transformers_version": "4.30.2",
31
+ "type_vocab_size": 2,
32
+ "use_cache": true,
33
+ "vocab_size": 21128
34
+ }
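A comparable sketch for the BERT text encoder; only the config and tokenizer ship in this folder, so the weights are assumed to be fetched separately (the repository's own feature extraction lives in text/chinese_bert.py, which is referenced later in this diff):

import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

bert_dir = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"  # assumed local path
tokenizer = AutoTokenizer.from_pretrained(bert_dir)
bert = AutoModelForMaskedLM.from_pretrained(bert_dir).eval()

with torch.no_grad():
    inputs = tokenizer("你好", return_tensors="pt")
    out = bert(**inputs, output_hidden_states=True)
    feats = out.hidden_states[-2][0]  # per-token features, hidden_size 1024 per the config above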
GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x/config.json ADDED
@@ -0,0 +1,63 @@
1
+ {
2
+ "resblock": "1",
3
+ "num_gpus": 0,
4
+ "batch_size": 32,
5
+ "learning_rate": 0.0001,
6
+ "adam_b1": 0.8,
7
+ "adam_b2": 0.99,
8
+ "lr_decay": 0.9999996,
9
+ "seed": 1234,
10
+
11
+ "upsample_rates": [4,4,2,2,2,2],
12
+ "upsample_kernel_sizes": [8,8,4,4,4,4],
13
+ "upsample_initial_channel": 1536,
14
+ "resblock_kernel_sizes": [3,7,11],
15
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
16
+
17
+ "use_tanh_at_final": false,
18
+ "use_bias_at_final": false,
19
+
20
+ "activation": "snakebeta",
21
+ "snake_logscale": true,
22
+
23
+ "use_cqtd_instead_of_mrd": true,
24
+ "cqtd_filters": 128,
25
+ "cqtd_max_filters": 1024,
26
+ "cqtd_filters_scale": 1,
27
+ "cqtd_dilations": [1, 2, 4],
28
+ "cqtd_hop_lengths": [512, 256, 256],
29
+ "cqtd_n_octaves": [9, 9, 9],
30
+ "cqtd_bins_per_octaves": [24, 36, 48],
31
+
32
+ "mpd_reshapes": [2, 3, 5, 7, 11],
33
+ "use_spectral_norm": false,
34
+ "discriminator_channel_mult": 1,
35
+
36
+ "use_multiscale_melloss": true,
37
+ "lambda_melloss": 15,
38
+
39
+ "clip_grad_norm": 500,
40
+
41
+ "segment_size": 65536,
42
+ "num_mels": 100,
43
+ "num_freq": 1025,
44
+ "n_fft": 1024,
45
+ "hop_size": 256,
46
+ "win_size": 1024,
47
+
48
+ "sampling_rate": 24000,
49
+
50
+ "fmin": 0,
51
+ "fmax": null,
52
+ "fmax_for_loss": null,
53
+
54
+ "normalize_volume": true,
55
+
56
+ "num_workers": 4,
57
+
58
+ "dist_config": {
59
+ "dist_backend": "nccl",
60
+ "dist_url": "tcp://localhost:54321",
61
+ "world_size": 1
62
+ }
63
+ }
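A quick arithmetic check on the figures above (informational only): the upsample factors multiply out to the hop size, which is where the "256x" in the folder name comes from.

upsample_rates = [4, 4, 2, 2, 2, 2]
prod = 1
for r in upsample_rates:
    prod *= r
assert prod == 256    # equals hop_size, hence "..._256x"
print(24000 / 256)    # mel frame rate: 93.75 frames per second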
GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1c1e17e9c99547a89388f72048cd6e1b41b5a18b170e86a46dfde0324d63eb1
3
+ size 155093966
GPT_SoVITS/pretrained_models/s2D488k.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc579c1db3c1e21b721001cf99d7a584214280df19b002e200b630a34fa06eb8
3
+ size 93533667
GPT_SoVITS/text/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ G2PWModel
2
+ __pycache__
3
+ *.zip
GPT_SoVITS/text/__init__.py ADDED
@@ -0,0 +1,28 @@
1
+ import os
2
+ # if os.environ.get("version","v1")=="v1":
3
+ # from text.symbols import symbols
4
+ # else:
5
+ # from text.symbols2 import symbols
6
+
7
+ from text import symbols as symbols_v1
8
+ from text import symbols2 as symbols_v2
9
+
10
+ _symbol_to_id_v1 = {s: i for i, s in enumerate(symbols_v1.symbols)}
11
+ _symbol_to_id_v2 = {s: i for i, s in enumerate(symbols_v2.symbols)}
12
+
13
+
14
+ def cleaned_text_to_sequence(cleaned_text, version=None):
15
+ """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
16
+ Args:
17
+ text: string to convert to a sequence
18
+ Returns:
19
+ List of integers corresponding to the symbols in the text
20
+ """
21
+ if version is None:
22
+ version = os.environ.get("version", "v2")
23
+ if version == "v1":
24
+ phones = [_symbol_to_id_v1[symbol] for symbol in cleaned_text]
25
+ else:
26
+ phones = [_symbol_to_id_v2[symbol] for symbol in cleaned_text]
27
+
28
+ return phones
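A hedged end-to-end sketch of the intended call pattern (it assumes the cleaner modules added below, plus their data files such as opencpop-strict.txt and the zh_normalization package, are importable):

import os
from text import cleaned_text_to_sequence
from text.chinese import g2p, text_normalize

os.environ["version"] = "v2"                 # selects symbols2 when no version argument is passed
phones, word2ph = g2p(text_normalize("你好"))
ids = cleaned_text_to_sequence(phones)       # list of integer symbol IDs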
GPT_SoVITS/text/cantonese.py ADDED
@@ -0,0 +1,222 @@
1
+ # reference: https://huggingface.co/spaces/Naozumi0512/Bert-VITS2-Cantonese-Yue/blob/main/text/chinese.py
2
+
3
+ import re
4
+ import cn2an
5
+ import ToJyutping
6
+
7
+ from text.symbols import punctuation
8
+ from text.zh_normalization.text_normlization import TextNormalizer
9
+
10
+ normalizer = lambda x: cn2an.transform(x, "an2cn")
11
+
12
+ INITIALS = [
13
+ "aa",
14
+ "aai",
15
+ "aak",
16
+ "aap",
17
+ "aat",
18
+ "aau",
19
+ "ai",
20
+ "au",
21
+ "ap",
22
+ "at",
23
+ "ak",
24
+ "a",
25
+ "p",
26
+ "b",
27
+ "e",
28
+ "ts",
29
+ "t",
30
+ "dz",
31
+ "d",
32
+ "kw",
33
+ "k",
34
+ "gw",
35
+ "g",
36
+ "f",
37
+ "h",
38
+ "l",
39
+ "m",
40
+ "ng",
41
+ "n",
42
+ "s",
43
+ "y",
44
+ "w",
45
+ "c",
46
+ "z",
47
+ "j",
48
+ "ong",
49
+ "on",
50
+ "ou",
51
+ "oi",
52
+ "ok",
53
+ "o",
54
+ "uk",
55
+ "ung",
56
+ ]
57
+ INITIALS += ["sp", "spl", "spn", "sil"]
58
+
59
+
60
+ rep_map = {
61
+ ":": ",",
62
+ ";": ",",
63
+ ",": ",",
64
+ "。": ".",
65
+ "!": "!",
66
+ "?": "?",
67
+ "\n": ".",
68
+ "·": ",",
69
+ "、": ",",
70
+ "...": "…",
71
+ "$": ".",
72
+ "“": "'",
73
+ "”": "'",
74
+ '"': "'",
75
+ "‘": "'",
76
+ "’": "'",
77
+ "(": "'",
78
+ ")": "'",
79
+ "(": "'",
80
+ ")": "'",
81
+ "《": "'",
82
+ "》": "'",
83
+ "【": "'",
84
+ "】": "'",
85
+ "[": "'",
86
+ "]": "'",
87
+ "—": "-",
88
+ "~": "-",
89
+ "~": "-",
90
+ "「": "'",
91
+ "」": "'",
92
+ }
93
+
94
+
95
+ def replace_punctuation(text):
96
+ # text = text.replace("嗯", "恩").replace("呣", "母")
97
+ pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
98
+
99
+ replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
100
+
101
+ replaced_text = re.sub(r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text)
102
+
103
+ return replaced_text
104
+
105
+
106
+ def text_normalize(text):
107
+ tx = TextNormalizer()
108
+ sentences = tx.normalize(text)
109
+ dest_text = ""
110
+ for sentence in sentences:
111
+ dest_text += replace_punctuation(sentence)
112
+ return dest_text
113
+
114
+
115
+ punctuation_set = set(punctuation)
116
+
117
+
118
+ def jyuping_to_initials_finals_tones(jyuping_syllables):
119
+ initials_finals = []
120
+ tones = []
121
+ word2ph = []
122
+
123
+ for syllable in jyuping_syllables:
124
+ if syllable in punctuation:
125
+ initials_finals.append(syllable)
126
+ tones.append(0)
127
+ word2ph.append(1) # Add 1 for punctuation
128
+ elif syllable == "_":
129
+ initials_finals.append(syllable)
130
+ tones.append(0)
131
+ word2ph.append(1) # Add 1 for underscore
132
+ else:
133
+ try:
134
+ tone = int(syllable[-1])
135
+ syllable_without_tone = syllable[:-1]
136
+ except ValueError:
137
+ tone = 0
138
+ syllable_without_tone = syllable
139
+
140
+ for initial in INITIALS:
141
+ if syllable_without_tone.startswith(initial):
142
+ if syllable_without_tone.startswith("nga"):
143
+ initials_finals.extend(
144
+ [
145
+ syllable_without_tone[:2],
146
+ syllable_without_tone[2:] or syllable_without_tone[-1],
147
+ ]
148
+ )
149
+ # tones.extend([tone, tone])
150
+ tones.extend([-1, tone])
151
+ word2ph.append(2)
152
+ else:
153
+ final = syllable_without_tone[len(initial) :] or initial[-1]
154
+ initials_finals.extend([initial, final])
155
+ # tones.extend([tone, tone])
156
+ tones.extend([-1, tone])
157
+ word2ph.append(2)
158
+ break
159
+ assert len(initials_finals) == len(tones)
160
+
161
+ ### custom scheme: initial consonant + tone-carrying final
162
+ phones = []
163
+ for a, b in zip(initials_finals, tones):
164
+ if b not in [-1, 0]: ### prefix Y so Cantonese phones do not collide with Mandarin ones; punctuation is left unprefixed
165
+ todo = "%s%s" % (a, b)
166
+ else:
167
+ todo = a
168
+ if todo not in punctuation_set:
169
+ todo = "Y%s" % todo
170
+ phones.append(todo)
171
+
172
+ # return initials_finals, tones, word2ph
173
+ return phones, word2ph
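A small worked example of the scheme the comments above describe, traced by hand from the code rather than executed:

phones, word2ph = jyuping_to_initials_finals_tones(["nei5", "hou2"])
# phones  -> ['Yn', 'Yei5', 'Yh', 'You2']  (every phone gets a leading Y; the tone rides on the final)
# word2ph -> [2, 2]                        (each syllable contributes two phones)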
174
+
175
+
176
+ def get_jyutping(text):
177
+ jyutping_array = []
178
+ punct_pattern = re.compile(r"^[{}]+$".format(re.escape("".join(punctuation))))
179
+
180
+ syllables = ToJyutping.get_jyutping_list(text)
181
+
182
+ for word, syllable in syllables:
183
+ if punct_pattern.match(word):
184
+ puncts = re.split(r"([{}])".format(re.escape("".join(punctuation))), word)
185
+ for punct in puncts:
186
+ if len(punct) > 0:
187
+ jyutping_array.append(punct)
188
+ else:
189
+ # match multiple jyutping, e.g. "liu4 ge3", or a single jyutping, e.g. "liu4"
190
+ if not re.search(r"^([a-z]+[1-6]+[ ]?)+$", syllable):
191
+ raise ValueError(f"Failed to convert {word} to jyutping: {syllable}")
192
+ jyutping_array.append(syllable)
193
+
194
+ return jyutping_array
195
+
196
+
197
+ def get_bert_feature(text, word2ph):
198
+ from text import chinese_bert
199
+
200
+ return chinese_bert.get_bert_feature(text, word2ph)
201
+
202
+
203
+ def g2p(text):
204
+ # word2ph = []
205
+ jyuping = get_jyutping(text)
206
+ # print(jyuping)
207
+ # phones, tones, word2ph = jyuping_to_initials_finals_tones(jyuping)
208
+ phones, word2ph = jyuping_to_initials_finals_tones(jyuping)
209
+ # phones = ["_"] + phones + ["_"]
210
+ # tones = [0] + tones + [0]
211
+ # word2ph = [1] + word2ph + [1]
212
+ return phones, word2ph
213
+
214
+
215
+ if __name__ == "__main__":
216
+ # text = "啊!但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏"
217
+ text = "佢個鋤頭太短啦。"
218
+ text = text_normalize(text)
219
+ # phones, tones, word2ph = g2p(text)
220
+ phones, word2ph = g2p(text)
221
+ # print(phones, tones, word2ph)
222
+ print(phones, word2ph)
GPT_SoVITS/text/chinese.py ADDED
@@ -0,0 +1,208 @@
1
+ import os
2
+ import re
3
+
4
+ import cn2an
5
+ from pypinyin import lazy_pinyin, Style
6
+
7
+ from text.symbols import punctuation
8
+ from text.tone_sandhi import ToneSandhi
9
+ from text.zh_normalization.text_normlization import TextNormalizer
10
+
11
+ normalizer = lambda x: cn2an.transform(x, "an2cn")
12
+
13
+ current_file_path = os.path.dirname(__file__)
14
+ pinyin_to_symbol_map = {
15
+ line.split("\t")[0]: line.strip().split("\t")[1]
16
+ for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
17
+ }
18
+
19
+ import jieba_fast
20
+ import logging
21
+
22
+ jieba_fast.setLogLevel(logging.CRITICAL)
23
+ import jieba_fast.posseg as psg
24
+
25
+
26
+ rep_map = {
27
+ ":": ",",
28
+ ";": ",",
29
+ ",": ",",
30
+ "。": ".",
31
+ "!": "!",
32
+ "?": "?",
33
+ "\n": ".",
34
+ "·": ",",
35
+ "、": ",",
36
+ "...": "…",
37
+ "$": ".",
38
+ "/": ",",
39
+ "—": "-",
40
+ "~": "…",
41
+ "~": "…",
42
+ }
43
+
44
+ tone_modifier = ToneSandhi()
45
+
46
+
47
+ def replace_punctuation(text):
48
+ text = text.replace("嗯", "恩").replace("呣", "母")
49
+ pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
50
+
51
+ replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
52
+
53
+ replaced_text = re.sub(r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text)
54
+
55
+ return replaced_text
56
+
57
+
58
+ def replace_punctuation_with_en(text):
59
+ text = text.replace("嗯", "恩").replace("呣", "母")
60
+ pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
61
+
62
+ replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
63
+
64
+ replaced_text = re.sub(r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text)
65
+
66
+ return replaced_text
67
+
68
+
69
+ def replace_consecutive_punctuation(text):
70
+ punctuations = "".join(re.escape(p) for p in punctuation)
71
+ pattern = f"([{punctuations}])([{punctuations}])+"
72
+ result = re.sub(pattern, r"\1", text)
73
+ return result
74
+
75
+
76
+ def g2p(text):
77
+ pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
78
+ sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
79
+ phones, word2ph = _g2p(sentences)
80
+ return phones, word2ph
81
+
82
+
83
+ def _get_initials_finals(word):
84
+ initials = []
85
+ finals = []
86
+ orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
87
+ orig_finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
88
+ for c, v in zip(orig_initials, orig_finals):
89
+ initials.append(c)
90
+ finals.append(v)
91
+ return initials, finals
92
+
93
+
94
+ def _g2p(segments):
95
+ phones_list = []
96
+ word2ph = []
97
+ for seg in segments:
98
+ pinyins = []
99
+ # Replace all English words in the sentence
100
+ seg = re.sub("[a-zA-Z]+", "", seg)
101
+ seg_cut = psg.lcut(seg)
102
+ initials = []
103
+ finals = []
104
+ seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
105
+ for word, pos in seg_cut:
106
+ if pos == "eng":
107
+ continue
108
+ sub_initials, sub_finals = _get_initials_finals(word)
109
+ sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
110
+ initials.append(sub_initials)
111
+ finals.append(sub_finals)
112
+
113
+ # assert len(sub_initials) == len(sub_finals) == len(word)
114
+ initials = sum(initials, [])
115
+ finals = sum(finals, [])
116
+ #
117
+ for c, v in zip(initials, finals):
118
+ raw_pinyin = c + v
119
+ # NOTE: post process for pypinyin outputs
120
+ # we discriminate i, ii and iii
121
+ if c == v:
122
+ assert c in punctuation
123
+ phone = [c]
124
+ word2ph.append(1)
125
+ else:
126
+ v_without_tone = v[:-1]
127
+ tone = v[-1]
128
+
129
+ pinyin = c + v_without_tone
130
+ assert tone in "12345"
131
+
132
+ if c:
133
+ # syllable with an initial
134
+ v_rep_map = {
135
+ "uei": "ui",
136
+ "iou": "iu",
137
+ "uen": "un",
138
+ }
139
+ if v_without_tone in v_rep_map.keys():
140
+ pinyin = c + v_rep_map[v_without_tone]
141
+ else:
142
+ # syllable without an initial
143
+ pinyin_rep_map = {
144
+ "ing": "ying",
145
+ "i": "yi",
146
+ "in": "yin",
147
+ "u": "wu",
148
+ }
149
+ if pinyin in pinyin_rep_map.keys():
150
+ pinyin = pinyin_rep_map[pinyin]
151
+ else:
152
+ single_rep_map = {
153
+ "v": "yu",
154
+ "e": "e",
155
+ "i": "y",
156
+ "u": "w",
157
+ }
158
+ if pinyin[0] in single_rep_map.keys():
159
+ pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
160
+
161
+ assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
162
+ new_c, new_v = pinyin_to_symbol_map[pinyin].split(" ")
163
+ new_v = new_v + tone
164
+ phone = [new_c, new_v]
165
+ word2ph.append(len(phone))
166
+
167
+ phones_list += phone
168
+ return phones_list, word2ph
169
+
170
+
171
+ def text_normalize(text):
172
+ # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
173
+ tx = TextNormalizer()
174
+ sentences = tx.normalize(text)
175
+ dest_text = ""
176
+ for sentence in sentences:
177
+ dest_text += replace_punctuation(sentence)
178
+
179
+ # avoid reference leakage caused by repeated punctuation
180
+ dest_text = replace_consecutive_punctuation(dest_text)
181
+ return dest_text
182
+
183
+
184
+ # text normalization that does not strip English characters
185
+ def mix_text_normalize(text):
186
+ # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
187
+ tx = TextNormalizer()
188
+ sentences = tx.normalize(text)
189
+ dest_text = ""
190
+ for sentence in sentences:
191
+ dest_text += replace_punctuation_with_en(sentence)
192
+
193
+ # avoid reference leakage caused by repeated punctuation
194
+ dest_text = replace_consecutive_punctuation(dest_text)
195
+ return dest_text
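The only difference from text_normalize above is the character class in the final regex, so Latin letters survive; a hedged illustration (the exact numeric expansion depends on TextNormalizer and is not shown):

s = "测试ABC,123。"
text_normalize(s)      # Latin letters are stripped by the Chinese-only filter
mix_text_normalize(s)  # Latin letters are kept (A-Za-z is allowed in the regex)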
196
+
197
+
198
+ if __name__ == "__main__":
199
+ text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏"
200
+ text = "呣呣呣~就是…大人的鼹鼠党吧?"
201
+ text = "你好"
202
+ text = text_normalize(text)
203
+ print(g2p(text))
204
+
205
+
206
+ # # example usage
207
+ # text = "这是一个示例文本:,你好!这是一个测试..."
208
+ # print(g2p_paddle(text)) # output: 这是一个示例文本你好这是一个测试