Spaces:
Runtime error
Runtime error
XiaoHei Studio
commited on
Commit
•
c82bb46
1
Parent(s):
07e9a8a
Upload 18 files
Browse files- CppDataProcess/F0Preprocess.cpp +153 -0
- CppDataProcess/F0Preprocess.hpp +36 -0
- CppDataProcess/Slicer.hpp +82 -0
- CppDataProcess/Wav.cpp +151 -0
- CppDataProcess/Wav.hpp +99 -0
- CppDataProcess/readme.md +8 -0
- cluster/__init__.py +29 -0
- cluster/__pycache__/__init__.cpython-38.pyc +0 -0
- cluster/__pycache__/kmeans.cpython-38.pyc +0 -0
- cluster/km_train.py +80 -0
- cluster/kmeans.py +204 -0
- cluster/train_cluster.py +85 -0
- configs/config.json +94 -0
- configs/diffusion.yaml +48 -0
- configs_template/config_template.json +77 -0
- configs_template/config_tiny_template.json +77 -0
- configs_template/diffusion_template.yaml +51 -0
- dataset_raw/wav_structure.txt +20 -0
CppDataProcess/F0Preprocess.cpp
ADDED
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include "F0Preprocess.hpp"
|
2 |
+
|
3 |
+
|
4 |
+
void F0PreProcess::compute_f0(const double* audio, int64_t len)
|
5 |
+
{
|
6 |
+
DioOption Doption;
|
7 |
+
InitializeDioOption(&Doption);
|
8 |
+
Doption.f0_ceil = 800;
|
9 |
+
Doption.frame_period = 1000.0 * hop / fs;
|
10 |
+
f0Len = GetSamplesForDIO(fs, (int)len, Doption.frame_period);
|
11 |
+
const auto tp = new double[f0Len];
|
12 |
+
const auto tmpf0 = new double[f0Len];
|
13 |
+
rf0 = new double[f0Len];
|
14 |
+
Dio(audio, (int)len, fs, &Doption, tp, tmpf0);
|
15 |
+
StoneMask(audio, (int)len, fs, tp, tmpf0, (int)f0Len, rf0);
|
16 |
+
delete[] tmpf0;
|
17 |
+
delete[] tp;
|
18 |
+
}
|
19 |
+
|
20 |
+
// Build the sequence {start, start+step, start+2*step, ...} for values < end,
// with every element divided by `div` (numpy.arange(start, end, step) / div).
// Fixed: a non-positive `step` previously looped forever; it now yields an
// empty vector.
std::vector<double> arange(double start,double end,double step = 1.0,double div = 1.0)
{
    std::vector<double> output;
    if (step <= 0.0)
        return output;
    for (double v = start; v < end; v += step)
        output.push_back(v / div);
    return output;
}
|
30 |
+
|
31 |
+
// Linearly resample the rf0 contour from its current length (f0Len) to `len`
// frames so it lines up with another feature sequence (e.g. HuBERT frames).
// NaNs produced by interpolation are replaced with 0. rf0 is reallocated and
// f0Len becomes `len`.
void F0PreProcess::InterPf0(int64_t len)
{
    // xi holds `len` query positions spaced f0Len/len apart over [0, f0Len).
    const auto xi = arange(0.0, (double)f0Len * (double)len, (double)f0Len, (double)len);
    // One element of slack kept from the original allocation.
    const auto tmp = new double[xi.size() + 1];
    // interp1 is WORLD's matlab-style linear interpolation:
    // x-axis = 0..f0Len-1, y = rf0, queried at xi.
    interp1(arange(0, (double)f0Len).data(), rf0, static_cast<int>(f0Len), xi.data(), (int)xi.size(), tmp);
    for (size_t i = 0; i < xi.size(); i++)
        if (isnan(tmp[i]))
            tmp[i] = 0.0; // unvoiced/out-of-range frames become 0
    delete[] rf0;
    rf0 = nullptr;
    rf0 = tmp; // hand ownership of the resampled buffer to the member
    f0Len = (int64_t)xi.size();
}
|
44 |
+
|
45 |
+
long long* F0PreProcess::f0Log()
|
46 |
+
{
|
47 |
+
const auto tmp = new long long[f0Len];
|
48 |
+
const auto f0_mel = new double[f0Len];
|
49 |
+
for (long long i = 0; i < f0Len; i++)
|
50 |
+
{
|
51 |
+
f0_mel[i] = 1127 * log(1.0 + rf0[i] / 700.0);
|
52 |
+
if (f0_mel[i] > 0.0)
|
53 |
+
f0_mel[i] = (f0_mel[i] - f0_mel_min) * (f0_bin - 2.0) / (f0_mel_max - f0_mel_min) + 1.0;
|
54 |
+
if (f0_mel[i] < 1.0)
|
55 |
+
f0_mel[i] = 1;
|
56 |
+
if (f0_mel[i] > f0_bin - 1)
|
57 |
+
f0_mel[i] = f0_bin - 1;
|
58 |
+
tmp[i] = (long long)round(f0_mel[i]);
|
59 |
+
}
|
60 |
+
delete[] f0_mel;
|
61 |
+
delete[] rf0;
|
62 |
+
rf0 = nullptr;
|
63 |
+
return tmp;
|
64 |
+
}
|
65 |
+
|
66 |
+
std::vector<long long> F0PreProcess::GetF0AndOtherInput(const double* audio, int64_t audioLen, int64_t hubLen, int64_t tran)
|
67 |
+
{
|
68 |
+
compute_f0(audio, audioLen);
|
69 |
+
for (int64_t i = 0; i < f0Len; ++i)
|
70 |
+
{
|
71 |
+
rf0[i] = rf0[i] * pow(2.0, static_cast<double>(tran) / 12.0);
|
72 |
+
if (rf0[i] < 0.001)
|
73 |
+
rf0[i] = NAN;
|
74 |
+
}
|
75 |
+
InterPf0(hubLen);
|
76 |
+
const auto O0f = f0Log();
|
77 |
+
std::vector<long long> Of0(O0f, O0f + f0Len);
|
78 |
+
delete[] O0f;
|
79 |
+
return Of0;
|
80 |
+
}
|
81 |
+
|
82 |
+
// Map each spectrogram frame to a 1-based HuBERT-unit index by spreading
// `hubertLen` units evenly over `specLen` frames.
// Returns specLen + 1 entries (frame indices 0..specLen inclusive); frames
// never reached keep 0.
std::vector<long long> getAligments(size_t specLen, size_t hubertLen)
{
    std::vector<long long> mel2ph(specLen + 1, 0);
    const double framesPerUnit = static_cast<double>(specLen) / static_cast<double>(hubertLen);
    size_t cursor = 0;
    for (size_t unit = 0; unit < hubertLen; ++unit)
    {
        // Last frame (inclusive) covered by this unit. Keep the exact
        // floating-point expression of the original so rounding is identical.
        const auto last = static_cast<size_t>(round(static_cast<double>(unit) * framesPerUnit + framesPerUnit));
        while (cursor <= last)
            mel2ph[cursor++] = static_cast<long long>(unit) + 1;
    }
    return mel2ph;
}
|
98 |
+
|
99 |
+
std::vector<float> F0PreProcess::GetF0AndOtherInputF0(const double* audio, int64_t audioLen, int64_t tran)
|
100 |
+
{
|
101 |
+
compute_f0(audio, audioLen);
|
102 |
+
for (int64_t i = 0; i < f0Len; ++i)
|
103 |
+
{
|
104 |
+
rf0[i] = log2(rf0[i] * pow(2.0, static_cast<double>(tran) / 12.0));
|
105 |
+
if (rf0[i] < 0.001)
|
106 |
+
rf0[i] = NAN;
|
107 |
+
}
|
108 |
+
const int64_t specLen = audioLen / hop;
|
109 |
+
InterPf0(specLen);
|
110 |
+
|
111 |
+
std::vector<float> Of0(specLen, 0.0);
|
112 |
+
|
113 |
+
double last_value = 0.0;
|
114 |
+
for (int64_t i = 0; i < specLen; ++i)
|
115 |
+
{
|
116 |
+
if (rf0[i] <= 0.0)
|
117 |
+
{
|
118 |
+
int64_t j = i + 1;
|
119 |
+
for (; j < specLen; ++j)
|
120 |
+
{
|
121 |
+
if (rf0[j] > 0.0)
|
122 |
+
break;
|
123 |
+
}
|
124 |
+
if (j < specLen - 1)
|
125 |
+
{
|
126 |
+
if (last_value > 0.0)
|
127 |
+
{
|
128 |
+
const auto step = (rf0[j] - rf0[i - 1]) / double(j - i);
|
129 |
+
for (int64_t k = i; k < j; ++k)
|
130 |
+
Of0[k] = float(rf0[i - 1] + step * double(k - i + 1));
|
131 |
+
}
|
132 |
+
else
|
133 |
+
for (int64_t k = i; k < j; ++k)
|
134 |
+
Of0[k] = float(rf0[j]);
|
135 |
+
i = j;
|
136 |
+
}
|
137 |
+
else
|
138 |
+
{
|
139 |
+
for (int64_t k = i; k < specLen; ++k)
|
140 |
+
Of0[k] = float(last_value);
|
141 |
+
i = specLen;
|
142 |
+
}
|
143 |
+
}
|
144 |
+
else
|
145 |
+
{
|
146 |
+
Of0[i] = float(rf0[i - 1]);
|
147 |
+
last_value = rf0[i];
|
148 |
+
}
|
149 |
+
}
|
150 |
+
delete[] rf0;
|
151 |
+
rf0 = nullptr;
|
152 |
+
return Of0;
|
153 |
+
}
|
CppDataProcess/F0Preprocess.hpp
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include "world/dio.h"
|
2 |
+
#include "world/stonemask.h"
|
3 |
+
#include "world/matlabfunctions.h"
|
4 |
+
#include <string>
|
5 |
+
#include <vector>
|
6 |
+
|
7 |
+
//Cpp F0 Preprocess
|
8 |
+
|
9 |
+
// Computes and post-processes fundamental-frequency (F0) features with the
// WORLD vocoder (DIO + StoneMask). Owns an intermediate heap F0 buffer (rf0).
class F0PreProcess
{
public:
	int fs;    // sample rate in Hz
	short hop; // hop size in samples
	const int f0_bin = 256;          // number of quantization bins
	const double f0_max = 1100.0;    // Hz
	const double f0_min = 50.0;      // Hz
	const double f0_mel_min = 1127.0 * log(1.0 + f0_min / 700.0);
	const double f0_mel_max = 1127.0 * log(1.0 + f0_max / 700.0);
	F0PreProcess(int sr = 16000, short h = 160) :fs(sr), hop(h) {}
	~F0PreProcess()
	{
		delete[] rf0;
		rf0 = nullptr;
	}
	// The destructor frees rf0, so the implicitly-generated copy operations
	// would double-free it (Rule of Three). Copying is disabled.
	F0PreProcess(const F0PreProcess&) = delete;
	F0PreProcess& operator=(const F0PreProcess&) = delete;
	// Run DIO + StoneMask over `audio`; fills rf0 / f0Len.
	void compute_f0(const double* audio, int64_t len);
	// Linearly resample rf0 to `len` frames.
	void InterPf0(int64_t len);
	// Quantize rf0 to mel bins; caller delete[]s the result; consumes rf0.
	long long* f0Log();
	int64_t getLen()const { return f0Len; }
	std::vector<long long> GetF0AndOtherInput(const double* audio, int64_t audioLen, int64_t hubLen, int64_t tran);
	std::vector<float> GetF0AndOtherInputF0(const double* audio, int64_t audioLen, int64_t tran);
private:
	double* rf0 = nullptr; // heap F0 buffer produced by compute_f0
	int64_t f0Len = 0;     // number of entries in rf0
};
|
35 |
+
|
36 |
+
std::vector<long long> getAligments(size_t specLen, size_t hubertLen);
|
CppDataProcess/Slicer.hpp
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include <cmath>
#include <string>
#include <vector>
#include "Wav.hpp"
4 |
+
|
5 |
+
// Result of SliceWav: byte offsets (into the PCM payload) of each slice
// boundary, plus one voiced/unvoiced tag per slice.
struct SliceResult
{
	std::vector<unsigned long long> SliceOffset;
	std::vector<bool> SliceTag;
	// Fixed: the constructor was named `cutResult`, which is not a valid
	// constructor name inside `SliceResult` and fails to compile (apparently
	// a leftover from renaming the struct).
	SliceResult(std::vector<unsigned long long>&& O, std::vector<bool>&& T) :SliceOffset(O), SliceTag(T) {}
};
|
11 |
+
|
12 |
+
// Mean of |sample| over the inclusive range [start, end], computed as a
// running average.
// Fixed: (1) `inline` added — this function is defined in a header, so every
// translation unit including Slicer.hpp previously emitted a duplicate
// symbol (ODR violation); (2) the first sample seeded the mean without
// taking its absolute value, skewing the result when it was negative.
inline double getAvg(const short* start, const short* end)
{
	const auto size = end - start + 1;
	auto avg = std::fabs((double)(*start));
	for (auto i = 1; i < size; i++)
	{
		avg = avg + (std::fabs((double)start[i]) - avg) / (double)(i + 1ull);
	}
	return avg;
}
|
22 |
+
|
23 |
+
// Split a 16-bit PCM wav into slices at low-volume points.
// threshold:  mean |amplitude| below which a frame counts as silent;
// minLen:     minimum slice length in seconds;
// frame_len / frame_shift: analysis window length and hop.
// Returns byte offsets of slice boundaries plus a voiced/unvoiced tag per
// slice.
// NOTE(review): `ptr` is a char* advanced by frame_shift *bytes* but is
// analysed through short* casts where `frame_len` is used as a *sample*
// count — the units appear mixed; verify against callers.
inline SliceResult SliceWav(Wav& input, double threshold, unsigned long minLen, unsigned short frame_len, unsigned short frame_shift)
{
	const auto header = input.getHeader();
	// Too short to slice at all: return the whole file as one voiced slice.
	if (header.Subchunk2Size < minLen * header.bytesPerSec)
		return { {0,header.Subchunk2Size},{true} };
	auto ptr = input.getData();
	std::vector<unsigned long long> output;
	std::vector<bool> tag;
	// Number of analysis hops, leaving two windows of margin at the end.
	auto n = (header.Subchunk2Size / frame_shift) - 2 * (frame_len / frame_shift);
	unsigned long nn = 0;   // bytes consumed since the last cut point
	bool cutTag = true;     // whether the previous frame was above threshold
	output.emplace_back(0);
	while (n--)
	{
		//if (nn > minLen * header.bytesPerSec)
		if (cutTag)
		{
			const auto vol = abs(getAvg((short*)ptr, (short*)ptr + frame_len));
			if (vol < threshold)
			{
				cutTag = false;
				// Loud -> quiet transition: cut here if the slice is long enough.
				if (nn > minLen * header.bytesPerSec)
				{
					nn = 0;
					// Cut in the middle of the window.
					output.emplace_back((ptr - input.getData()) + (frame_len / 2));
				}
			}
			else
			{
				cutTag = true;
			}
		}
		else
		{
			const auto vol = abs(getAvg((short*)ptr, (short*)ptr + frame_len));
			if (vol < threshold)
			{
				cutTag = false;
			}
			else
			{
				cutTag = true;
				// Quiet -> loud transition: also a cut point candidate.
				if (nn > minLen * header.bytesPerSec)
				{
					nn = 0;
					output.emplace_back((ptr - input.getData()) + (frame_len / 2));
				}
			}
		}
		nn += frame_shift;
		ptr += frame_shift;
	}
	// Final boundary = end of the PCM payload.
	output.push_back(header.Subchunk2Size);
	// Tag each slice voiced/unvoiced by its mean absolute amplitude.
	for (size_t i = 1; i < output.size(); i++)
	{
		tag.push_back(abs(getAvg((short*)(input.getData() + output[i - 1]), (short*)(input.getData() + output[i]))) > threshold);
	}
	return { std::move(output),std::move(tag) };
}
|
82 |
+
|
CppDataProcess/Wav.cpp
ADDED
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include "Wav.hpp"
|
2 |
+
|
3 |
+
// Load a 16-bit PCM wav file from a wide-character path.
// NOTE(review): _wfreopen_s reopens *stderr* as the input file — apparently a
// trick to get a FILE* from a wide path; stderr is unusable afterwards.
// Verify callers tolerate this.
Wav::Wav(const wchar_t* Path) :header(WAV_HEADER()) {
	char buf[1024];
	FILE* stream;
	_wfreopen_s(&stream, Path, L"rb", stderr);
	if (stream == nullptr) {
		throw (std::exception("File not exists"));
	}
	// Read the first HEAD_LENGTH bytes and scan them for the RIFF / fmt /
	// data chunk markers. NOTE(review): fread results are unchecked, so a
	// truncated file yields a partially-filled buffer.
	fread(buf, 1, HEAD_LENGTH, stream);
	int pos = 0;
	while (pos < HEAD_LENGTH) {
		if ((buf[pos] == 'R') && (buf[pos + 1] == 'I') && (buf[pos + 2] == 'F') && (buf[pos + 3] == 'F')) {
			pos += 4;
			break;
		}
		++pos;
	}
	if (pos >= HEAD_LENGTH)
		throw (std::exception("Don't order fried rice (annoyed)"));
	header.ChunkSize = *(int*)&buf[pos];
	pos += 8; // skip ChunkSize (4 bytes) + "WAVE" tag (4 bytes)
	while (pos < HEAD_LENGTH) {
		if ((buf[pos] == 'f') && (buf[pos + 1] == 'm') && (buf[pos + 2] == 't')) {
			pos += 4;
			break;
		}
		++pos;
	}
	if (pos >= HEAD_LENGTH)
		throw (std::exception("Don't order fried rice (annoyed)"));
	// Parse the fmt chunk fields in file order.
	header.Subchunk1Size = *(int*)&buf[pos];
	pos += 4;
	header.AudioFormat = *(short*)&buf[pos];
	pos += 2;
	header.NumOfChan = *(short*)&buf[pos];
	pos += 2;
	header.SamplesPerSec = *(int*)&buf[pos];
	pos += 4;
	header.bytesPerSec = *(int*)&buf[pos];
	pos += 4;
	header.blockAlign = *(short*)&buf[pos];
	pos += 2;
	header.bitsPerSample = *(short*)&buf[pos];
	pos += 2;
	while (pos < HEAD_LENGTH) {
		if ((buf[pos] == 'd') && (buf[pos + 1] == 'a') && (buf[pos + 2] == 't') && (buf[pos + 3] == 'a')) {
			pos += 4;
			break;
		}
		++pos;
	}
	if (pos >= HEAD_LENGTH)
		throw (std::exception("Don't order fried rice (annoyed)"));
	header.Subchunk2Size = *(int*)&buf[pos];
	pos += 4;
	StartPos = pos; // byte offset where the PCM payload begins
	Data = new char[header.Subchunk2Size + 1];
	fseek(stream, StartPos, SEEK_SET);
	fread(Data, 1, header.Subchunk2Size, stream);
	if (stream != nullptr) {
		fclose(stream);
	}
	SData = reinterpret_cast<int16_t*>(Data);
	dataSize = header.Subchunk2Size / 2; // sample count assuming 16-bit PCM
}
|
67 |
+
|
68 |
+
Wav::Wav(const Wav& input) :header(WAV_HEADER()) {
|
69 |
+
Data = new char[(input.header.Subchunk2Size + 1)];
|
70 |
+
if (Data == nullptr) { throw std::exception("OOM"); }
|
71 |
+
memcpy(header.RIFF, input.header.RIFF, 4);
|
72 |
+
memcpy(header.fmt, input.header.fmt, 4);
|
73 |
+
memcpy(header.WAVE, input.header.WAVE, 4);
|
74 |
+
memcpy(header.Subchunk2ID, input.header.Subchunk2ID, 4);
|
75 |
+
header.ChunkSize = input.header.ChunkSize;
|
76 |
+
header.Subchunk1Size = input.header.Subchunk1Size;
|
77 |
+
header.AudioFormat = input.header.AudioFormat;
|
78 |
+
header.NumOfChan = input.header.NumOfChan;
|
79 |
+
header.SamplesPerSec = input.header.SamplesPerSec;
|
80 |
+
header.bytesPerSec = input.header.bytesPerSec;
|
81 |
+
header.blockAlign = input.header.blockAlign;
|
82 |
+
header.bitsPerSample = input.header.bitsPerSample;
|
83 |
+
header.Subchunk2Size = input.header.Subchunk2Size;
|
84 |
+
StartPos = input.StartPos;
|
85 |
+
memcpy(Data, input.Data, input.header.Subchunk2Size);
|
86 |
+
SData = reinterpret_cast<int16_t*>(Data);
|
87 |
+
dataSize = header.Subchunk2Size / 2;
|
88 |
+
}
|
89 |
+
|
90 |
+
// Move constructor: steals the PCM buffer and copies the header fields.
// The moved-from object keeps its header but its Data is nulled so its
// destructor will not free the stolen buffer.
Wav::Wav(Wav&& input) noexcept
{
	Data = input.Data;
	input.Data = nullptr;
	// Whole-struct assignment replaces the original's field-by-field copies;
	// WAV_HEADER is a plain aggregate so the effect is identical.
	header = input.header;
	StartPos = input.StartPos;
	SData = reinterpret_cast<int16_t*>(Data);
	dataSize = header.Subchunk2Size / 2;
}
|
111 |
+
|
112 |
+
// Move assignment: releases the current buffer and steals the source's.
// Fixed: a self-move (w = std::move(w)) previously freed Data via destory()
// and then "stole" the already-freed pointer — a use-after-free; self-moves
// are now a no-op.
Wav& Wav::operator=(Wav&& input) noexcept
{
	if (this == &input)
		return *this;
	destory();
	Data = input.Data;
	input.Data = nullptr; // moved-from object must not free the buffer
	// WAV_HEADER is a plain aggregate; whole-struct assignment copies every
	// field the original copied one by one.
	header = input.header;
	StartPos = input.StartPos;
	SData = reinterpret_cast<int16_t*>(Data);
	dataSize = header.Subchunk2Size / 2;
	return *this;
}
|
135 |
+
|
136 |
+
// Append `input`'s PCM payload to this wav. Both must be uncompressed PCM
// with matching sample rate and channel count; otherwise *this is returned
// unchanged.
Wav& Wav::cat(const Wav& input)
{
	if (header.AudioFormat != 1) return *this; // only uncompressed PCM
	// Fixed: the original compared this->SamplesPerSec against
	// input.bitsPerSample, so mismatched sample rates were not rejected
	// (and matching ones could be rejected spuriously).
	if (header.SamplesPerSec != input.header.SamplesPerSec || header.NumOfChan != input.header.NumOfChan) return *this;
	char* buffer = new char[(int64_t)header.Subchunk2Size + (int64_t)input.header.Subchunk2Size + 1];
	if (buffer == nullptr)return *this;
	memcpy(buffer, Data, header.Subchunk2Size);
	memcpy(buffer + header.Subchunk2Size, input.Data, input.header.Subchunk2Size);
	header.ChunkSize += input.header.Subchunk2Size;
	header.Subchunk2Size += input.header.Subchunk2Size;
	delete[] Data;
	Data = buffer;
	SData = reinterpret_cast<int16_t*>(Data);
	dataSize = header.Subchunk2Size / 2;
	return *this;
}
|
CppDataProcess/Wav.hpp
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Minimal in-memory 16-bit PCM WAV container: a 44-byte RIFF header plus an
// owned raw sample buffer. (Original garbled GBK comments translated to
// English.)
class Wav {
public:

	// Canonical 44-byte RIFF/WAVE header layout.
	struct WAV_HEADER {
		char RIFF[4] = { 'R','I','F','F' }; // RIFF chunk id
		unsigned long ChunkSize; // total file size - 8
		char WAVE[4] = { 'W','A','V','E' }; // WAVE form type
		char fmt[4] = { 'f','m','t',' ' }; // fmt chunk id
		unsigned long Subchunk1Size; // fmt chunk size (16 for PCM)
		unsigned short AudioFormat; // 1 = uncompressed PCM
		unsigned short NumOfChan; // number of channels
		unsigned long SamplesPerSec; // sample rate (Hz)
		unsigned long bytesPerSec; // byte rate
		unsigned short blockAlign; // bytes per sample frame
		unsigned short bitsPerSample; // bit depth
		char Subchunk2ID[4] = { 'd','a','t','a' }; // data chunk id
		unsigned long Subchunk2Size; // PCM payload size in bytes
		WAV_HEADER(unsigned long cs = 36, unsigned long sc1s = 16, unsigned short af = 1, unsigned short nc = 1, unsigned long sr = 22050, unsigned long bps = 44100, unsigned short ba = 2, unsigned short bips = 16, unsigned long sc2s = 0) :ChunkSize(cs), Subchunk1Size(sc1s), AudioFormat(af), NumOfChan(nc), SamplesPerSec(sr), bytesPerSec(bps), blockAlign(ba), bitsPerSample(bips), Subchunk2Size(sc2s) {}
	};
	using iterator = int16_t*;
	// Construct an empty wav with the given header fields
	// (defaults: 16-bit mono 22.05 kHz PCM, no data).
	Wav(unsigned long cs = 36, unsigned long sc1s = 16, unsigned short af = 1, unsigned short nc = 1, unsigned long sr = 22050, unsigned long bps = 44100, unsigned short ba = 2, unsigned short bips = 16, unsigned long sc2s = 0) :header({
		cs,
		sc1s,
		af,
		nc,
		sr,
		bps,
		ba,
		bips,
		sc2s
		}), Data(nullptr), StartPos(44) {
		dataSize = 0;
		SData = nullptr;
	}
	// Construct a 16-bit mono wav at sample rate `sr` from `length` bytes of
	// raw PCM in `data` (copied into an owned buffer).
	Wav(unsigned long sr, unsigned long length, const void* data) :header({
		36,
		16,
		1,
		1,
		sr,
		sr * 2,
		2,
		16,
		length
		}), Data(new char[length + 1]), StartPos(44)
	{
		header.ChunkSize = 36 + length;
		memcpy(Data, data, length);
		SData = reinterpret_cast<int16_t*>(Data);
		dataSize = length / 2;
	}
	Wav(const wchar_t* Path); // load from file (defined in Wav.cpp)
	Wav(const Wav& input); // deep copy
	Wav(Wav&& input) noexcept; // steals the buffer
	Wav& operator=(const Wav& input) = delete;
	Wav& operator=(Wav&& input) noexcept;
	~Wav() { destory(); }
	// Append another wav's PCM data (formats must be compatible).
	Wav& cat(const Wav& input);
	bool isEmpty() const { return this->header.Subchunk2Size == 0; }
	const char* getData() const { return Data; }
	char* getData() { return Data; }
	WAV_HEADER getHeader() const { return header; }
	WAV_HEADER& Header() { return header; }
	// NOTE(review): const member that frees Data without nulling it — the
	// destructor relies on being the last caller; double-destroy would be UB.
	void destory() const { delete[] Data; }
	// Replace the payload with `length` bytes of `indata` at sample rate `sr`.
	// NOTE(review): recomputes bytesPerSec assuming 16-bit mono; SData and
	// dataSize are NOT updated here — verify callers. TODO confirm intended.
	void changeData(const void* indata,long length,int sr)
	{
		delete[] Data;
		Data = new char[length];
		memcpy(Data, indata, length);
		header.ChunkSize = 36 + length;
		header.Subchunk2Size = length;
		header.SamplesPerSec = sr;
		header.bytesPerSec = 2 * sr;
	}
	// Sample access; out-of-range indices clamp to the last sample.
	int16_t& operator[](const size_t index) const
	{
		if (index < dataSize)
			return *(SData + index);
		return *(SData + dataSize - 1);
	}
	iterator begin() const
	{
		return reinterpret_cast<int16_t*>(Data);
	}
	iterator end() const
	{
		return reinterpret_cast<int16_t*>(Data + header.Subchunk2Size);
	}
	// Number of 16-bit samples in the payload.
	int64_t getDataLen()const
	{
		return static_cast<int64_t>(dataSize);
	}
private:
	WAV_HEADER header;
	char* Data; // owned raw PCM payload
	int16_t* SData; // Data viewed as 16-bit samples
	size_t dataSize; // payload size in 16-bit samples
	int StartPos; // byte offset of the payload in the source file
};
|
CppDataProcess/readme.md
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## F0Preprocess
|
2 |
+
请前往 https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder 下载PyWorld的源代码并编译出静态库并链接到你的项目之中,然后调用此头文件
|
3 |
+
|
4 |
+
## Slicer
|
5 |
+
一个简单的切片机
|
6 |
+
|
7 |
+
---
|
8 |
+
~~上面的东西是直接从MoeSS的代码里面抽出来的,可以作为预置预处理的替代品()~~
|
cluster/__init__.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from sklearn.cluster import KMeans
|
3 |
+
|
4 |
+
|
5 |
+
def get_cluster_model(ckpt_path):
    """Load per-speaker KMeans models from a checkpoint saved by km_train.

    The checkpoint maps speaker name -> a dict holding the minimal sklearn
    KMeans state: "n_features_in_", "_n_threads", "cluster_centers_".

    Returns:
        dict: speaker name -> sklearn KMeans instance ready for predict().
    """
    # map_location="cpu" so checkpoints saved on a GPU machine still load on
    # CPU-only hosts (the stored state is numpy anyway).
    checkpoint = torch.load(ckpt_path, map_location="cpu")
    kmeans_dict = {}
    for spk, ckpt in checkpoint.items():
        # NOTE(review): KMeans' first argument is n_clusters, but it is fed
        # n_features_in_ here (upstream quirk). predict() still works because
        # cluster_centers_ is overwritten below, which is all it uses.
        km = KMeans(ckpt["n_features_in_"])
        km.__dict__["n_features_in_"] = ckpt["n_features_in_"]
        km.__dict__["_n_threads"] = ckpt["_n_threads"]
        km.__dict__["cluster_centers_"] = ckpt["cluster_centers_"]
        kmeans_dict[spk] = km
    return kmeans_dict
|
15 |
+
|
16 |
+
def get_cluster_result(model, x, speaker):
    """Assign each frame of x to a cluster id using `speaker`'s KMeans model.

    x: np.array [t, 256]
    return cluster class result
    """
    km = model[speaker]
    return km.predict(x)
|
22 |
+
|
23 |
+
def get_cluster_center_result(model, x,speaker):
    """Quantize x to its nearest cluster centers for `speaker`.

    x: np.array [t, 256]
    """
    km = model[speaker]
    labels = km.predict(x)
    return km.cluster_centers_[labels]
|
27 |
+
|
28 |
+
def get_center(model, x,speaker):
    """Look up cluster center(s) by index x for the given speaker."""
    km = model[speaker]
    return km.cluster_centers_[x]
|
cluster/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (1.09 kB). View file
|
|
cluster/__pycache__/kmeans.cpython-38.pyc
ADDED
Binary file (6.95 kB). View file
|
|
cluster/km_train.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time,pdb
|
2 |
+
import tqdm
|
3 |
+
from time import time as ttime
|
4 |
+
import os
|
5 |
+
from pathlib import Path
|
6 |
+
import logging
|
7 |
+
import argparse
|
8 |
+
from cluster.kmeans import KMeansGPU
|
9 |
+
import torch
|
10 |
+
import numpy as np
|
11 |
+
from sklearn.cluster import KMeans,MiniBatchKMeans
|
12 |
+
|
13 |
+
logging.basicConfig(level=logging.INFO)
|
14 |
+
logger = logging.getLogger(__name__)
|
15 |
+
from time import time as ttime
|
16 |
+
import pynvml,torch
|
17 |
+
|
18 |
+
def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False,use_gpu=False):
    """Fit a KMeans model over all *.soft.pt feature files in `in_dir`.

    Args:
        in_dir: pathlib.Path of a speaker's feature directory.
        n_clusters: number of clusters to fit.
        use_minibatch: use sklearn MiniBatchKMeans (CPU path only).
        verbose: forwarded to the underlying KMeans implementation.
        use_gpu: use the torch-based KMeansGPU instead of sklearn.

    Returns:
        dict with the minimal state consumed by cluster.get_cluster_model:
        "n_features_in_", "_n_threads", "cluster_centers_".
    """
    logger.info(f"Loading features from {in_dir}")
    features = []
    nums = 0
    for path in tqdm.tqdm(in_dir.glob("*.soft.pt")):
        features.append(torch.load(path, map_location="cpu").squeeze(0).numpy().T)
        # Fixed: `nums` was never incremented, so the summary always said 0.
        nums += 1
    features = np.concatenate(features, axis=0)
    print(nums, features.nbytes / 1024**2, "MB , shape:", features.shape, features.dtype)
    features = features.astype(np.float32)
    logger.info(f"Clustering features of shape: {features.shape}")
    t = time.time()
    if use_gpu is False:
        if use_minibatch:
            kmeans = MiniBatchKMeans(n_clusters=n_clusters, verbose=verbose, batch_size=4096, max_iter=80).fit(features)
        else:
            kmeans = KMeans(n_clusters=n_clusters, verbose=verbose).fit(features)
    else:
        kmeans = KMeansGPU(n_clusters=n_clusters, mode='euclidean', verbose=2 if verbose else 0, max_iter=500, tol=1e-2)
        features = torch.from_numpy(features)
        labels = kmeans.fit_predict(features)

    print(time.time() - t, "s")

    x = {
        # Fixed: the GPU branch previously reported features.shape[0] (the
        # sample count); n_features_in_ is the feature dimensionality, i.e.
        # shape[1], matching what sklearn stores on the CPU path.
        "n_features_in_": kmeans.n_features_in_ if use_gpu is False else features.shape[1],
        "_n_threads": kmeans._n_threads if use_gpu is False else 4,
        "cluster_centers_": kmeans.cluster_centers_ if use_gpu is False else kmeans.centroids.cpu().numpy(),
    }
    print("end")

    return x
|
52 |
+
|
53 |
+
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=Path, default="./dataset/44k",
                        help='path of training data directory')
    parser.add_argument('--output', type=Path, default="logs/44k",
                        help='path of model output directory')

    args = parser.parse_args()

    checkpoint_dir = args.output
    dataset = args.dataset
    # Number of clusters per speaker (fixed; not exposed on the CLI).
    n_clusters = 1000

    # Train one KMeans model per speaker subdirectory and bundle them into a
    # single checkpoint keyed by speaker name.
    ckpt = {}
    for spk in os.listdir(dataset):
        if os.path.isdir(dataset/spk):
            print(f"train kmeans for {spk}...")
            in_dir = dataset/spk
            x = train_cluster(in_dir, n_clusters,use_minibatch=False,verbose=False,use_gpu=True)
            ckpt[spk] = x

    checkpoint_path = checkpoint_dir / f"kmeans_{n_clusters}.pt"
    checkpoint_path.parent.mkdir(exist_ok=True, parents=True)
    torch.save(
        ckpt,
        checkpoint_path,
    )
|
80 |
+
|
cluster/kmeans.py
ADDED
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from time import time
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
import pynvml
|
5 |
+
import torch
|
6 |
+
from torch.nn.functional import normalize
|
7 |
+
|
8 |
+
|
9 |
+
# device=torch.device("cuda:0")
|
10 |
+
def _kpp(data: torch.Tensor, k: int, sample_size: int = -1):
|
11 |
+
""" Picks k points in the data based on the kmeans++ method.
|
12 |
+
|
13 |
+
Parameters
|
14 |
+
----------
|
15 |
+
data : torch.Tensor
|
16 |
+
Expect a rank 1 or 2 array. Rank 1 is assumed to describe 1-D
|
17 |
+
data, rank 2 multidimensional data, in which case one
|
18 |
+
row is one observation.
|
19 |
+
k : int
|
20 |
+
Number of samples to generate.
|
21 |
+
sample_size : int
|
22 |
+
sample data to avoid memory overflow during calculation
|
23 |
+
|
24 |
+
Returns
|
25 |
+
-------
|
26 |
+
init : ndarray
|
27 |
+
A 'k' by 'N' containing the initial centroids.
|
28 |
+
|
29 |
+
References
|
30 |
+
----------
|
31 |
+
.. [1] D. Arthur and S. Vassilvitskii, "k-means++: the advantages of
|
32 |
+
careful seeding", Proceedings of the Eighteenth Annual ACM-SIAM Symposium
|
33 |
+
on Discrete Algorithms, 2007.
|
34 |
+
.. [2] scipy/cluster/vq.py: _kpp
|
35 |
+
"""
|
36 |
+
batch_size=data.shape[0]
|
37 |
+
if batch_size>sample_size:
|
38 |
+
data = data[torch.randint(0, batch_size,[sample_size], device=data.device)]
|
39 |
+
dims = data.shape[1] if len(data.shape) > 1 else 1
|
40 |
+
init = torch.zeros((k, dims)).to(data.device)
|
41 |
+
r = torch.distributions.uniform.Uniform(0, 1)
|
42 |
+
for i in range(k):
|
43 |
+
if i == 0:
|
44 |
+
init[i, :] = data[torch.randint(data.shape[0], [1])]
|
45 |
+
else:
|
46 |
+
D2 = torch.cdist(init[:i, :][None, :], data[None, :], p=2)[0].amin(dim=0)
|
47 |
+
probs = D2 / torch.sum(D2)
|
48 |
+
cumprobs = torch.cumsum(probs, dim=0)
|
49 |
+
init[i, :] = data[torch.searchsorted(cumprobs, r.sample([1]).to(data.device))]
|
50 |
+
return init
|
51 |
+
class KMeansGPU:
|
52 |
+
'''
|
53 |
+
Kmeans clustering algorithm implemented with PyTorch
|
54 |
+
|
55 |
+
Parameters:
|
56 |
+
n_clusters: int,
|
57 |
+
Number of clusters
|
58 |
+
|
59 |
+
max_iter: int, default: 100
|
60 |
+
Maximum number of iterations
|
61 |
+
|
62 |
+
tol: float, default: 0.0001
|
63 |
+
Tolerance
|
64 |
+
|
65 |
+
verbose: int, default: 0
|
66 |
+
Verbosity
|
67 |
+
|
68 |
+
mode: {'euclidean', 'cosine'}, default: 'euclidean'
|
69 |
+
Type of distance measure
|
70 |
+
|
71 |
+
init_method: {'random', 'point', '++'}
|
72 |
+
Type of initialization
|
73 |
+
|
74 |
+
minibatch: {None, int}, default: None
|
75 |
+
Batch size of MinibatchKmeans algorithm
|
76 |
+
if None perform full KMeans algorithm
|
77 |
+
|
78 |
+
Attributes:
|
79 |
+
centroids: torch.Tensor, shape: [n_clusters, n_features]
|
80 |
+
cluster centroids
|
81 |
+
'''
|
82 |
+
    def __init__(self, n_clusters, max_iter=200, tol=1e-4, verbose=0, mode="euclidean",device=torch.device("cuda:0")):
        # NOTE(review): the default device is hard-coded to cuda:0, and the
        # constructor queries NVML — this class requires a CUDA device and
        # the pynvml package to even instantiate.
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.verbose = verbose
        self.mode = mode
        self.device=device
        # Size the minibatch from the GPU's current free memory. The 33e6
        # constant is an empirical per-cluster memory budget — TODO confirm.
        pynvml.nvmlInit()
        gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(device.index)
        info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
        self.minibatch=int(33e6/self.n_clusters*info.free/ 1024 / 1024 / 1024)
        print("free_mem/GB:",info.free/ 1024 / 1024 / 1024,"minibatch:",self.minibatch)
|
94 |
+
|
95 |
+
@staticmethod
|
96 |
+
def cos_sim(a, b):
|
97 |
+
"""
|
98 |
+
Compute cosine similarity of 2 sets of vectors
|
99 |
+
|
100 |
+
Parameters:
|
101 |
+
a: torch.Tensor, shape: [m, n_features]
|
102 |
+
|
103 |
+
b: torch.Tensor, shape: [n, n_features]
|
104 |
+
"""
|
105 |
+
return normalize(a, dim=-1) @ normalize(b, dim=-1).transpose(-2, -1)
|
106 |
+
|
107 |
+
@staticmethod
|
108 |
+
def euc_sim(a, b):
|
109 |
+
"""
|
110 |
+
Compute euclidean similarity of 2 sets of vectors
|
111 |
+
Parameters:
|
112 |
+
a: torch.Tensor, shape: [m, n_features]
|
113 |
+
b: torch.Tensor, shape: [n, n_features]
|
114 |
+
"""
|
115 |
+
return 2 * a @ b.transpose(-2, -1) -(a**2).sum(dim=1)[..., :, None] - (b**2).sum(dim=1)[..., None, :]
|
116 |
+
|
117 |
+
def max_sim(self, a, b):
|
118 |
+
"""
|
119 |
+
Compute maximum similarity (or minimum distance) of each vector
|
120 |
+
in a with all of the vectors in b
|
121 |
+
Parameters:
|
122 |
+
a: torch.Tensor, shape: [m, n_features]
|
123 |
+
b: torch.Tensor, shape: [n, n_features]
|
124 |
+
"""
|
125 |
+
if self.mode == 'cosine':
|
126 |
+
sim_func = self.cos_sim
|
127 |
+
elif self.mode == 'euclidean':
|
128 |
+
sim_func = self.euc_sim
|
129 |
+
sim = sim_func(a, b)
|
130 |
+
max_sim_v, max_sim_i = sim.max(dim=-1)
|
131 |
+
return max_sim_v, max_sim_i
|
132 |
+
|
133 |
+
def fit_predict(self, X):
|
134 |
+
"""
|
135 |
+
Combination of fit() and predict() methods.
|
136 |
+
This is faster than calling fit() and predict() seperately.
|
137 |
+
Parameters:
|
138 |
+
X: torch.Tensor, shape: [n_samples, n_features]
|
139 |
+
centroids: {torch.Tensor, None}, default: None
|
140 |
+
if given, centroids will be initialized with given tensor
|
141 |
+
if None, centroids will be randomly chosen from X
|
142 |
+
Return:
|
143 |
+
labels: torch.Tensor, shape: [n_samples]
|
144 |
+
|
145 |
+
mini_=33kk/k*remain
|
146 |
+
mini=min(mini_,fea_shape)
|
147 |
+
offset=log2(k/1000)*1.5
|
148 |
+
kpp_all=min(mini_*10/offset,fea_shape)
|
149 |
+
kpp_sample=min(mini_/12/offset,fea_shape)
|
150 |
+
"""
|
151 |
+
assert isinstance(X, torch.Tensor), "input must be torch.Tensor"
|
152 |
+
assert X.dtype in [torch.half, torch.float, torch.double], "input must be floating point"
|
153 |
+
assert X.ndim == 2, "input must be a 2d tensor with shape: [n_samples, n_features] "
|
154 |
+
# print("verbose:%s"%self.verbose)
|
155 |
+
|
156 |
+
offset = np.power(1.5,np.log(self.n_clusters / 1000))/np.log(2)
|
157 |
+
with torch.no_grad():
|
158 |
+
batch_size= X.shape[0]
|
159 |
+
# print(self.minibatch, int(self.minibatch * 10 / offset), batch_size)
|
160 |
+
start_time = time()
|
161 |
+
if (self.minibatch*10//offset< batch_size):
|
162 |
+
x = X[torch.randint(0, batch_size,[int(self.minibatch*10/offset)])].to(self.device)
|
163 |
+
else:
|
164 |
+
x = X.to(self.device)
|
165 |
+
# print(x.device)
|
166 |
+
self.centroids = _kpp(x, self.n_clusters, min(int(self.minibatch/12/offset),batch_size))
|
167 |
+
del x
|
168 |
+
torch.cuda.empty_cache()
|
169 |
+
# self.centroids = self.centroids.to(self.device)
|
170 |
+
num_points_in_clusters = torch.ones(self.n_clusters, device=self.device, dtype=X.dtype)#全1
|
171 |
+
closest = None#[3098036]#int64
|
172 |
+
if(self.minibatch>=batch_size//2 and self.minibatch<batch_size):
|
173 |
+
X = X[torch.randint(0, batch_size,[self.minibatch])].to(self.device)
|
174 |
+
elif(self.minibatch>=batch_size):
|
175 |
+
X=X.to(self.device)
|
176 |
+
for i in range(self.max_iter):
|
177 |
+
iter_time = time()
|
178 |
+
if self.minibatch<batch_size//2:#可用minibatch数太小,每次都得从内存倒腾到显存
|
179 |
+
x = X[torch.randint(0, batch_size, [self.minibatch])].to(self.device)
|
180 |
+
else:#否则直接全部缓存
|
181 |
+
x = X
|
182 |
+
|
183 |
+
closest = self.max_sim(a=x, b=self.centroids)[1].to(torch.int16)#[3098036]#int64#0~999
|
184 |
+
matched_clusters, counts = closest.unique(return_counts=True)#int64#1k
|
185 |
+
expanded_closest = closest[None].expand(self.n_clusters, -1)#[1000, 3098036]#int16#0~999
|
186 |
+
mask = (expanded_closest==torch.arange(self.n_clusters, device=self.device)[:, None]).to(X.dtype)#==后者是int64*1000
|
187 |
+
c_grad = mask @ x / mask.sum(-1)[..., :, None]
|
188 |
+
c_grad[c_grad!=c_grad] = 0 # remove NaNs
|
189 |
+
error = (c_grad - self.centroids).pow(2).sum()
|
190 |
+
if self.minibatch is not None:
|
191 |
+
lr = 1/num_points_in_clusters[:,None] * 0.9 + 0.1
|
192 |
+
else:
|
193 |
+
lr = 1
|
194 |
+
matched_clusters=matched_clusters.long()
|
195 |
+
num_points_in_clusters[matched_clusters] += counts#IndexError: tensors used as indices must be long, byte or bool tensors
|
196 |
+
self.centroids = self.centroids * (1-lr) + c_grad * lr
|
197 |
+
if self.verbose >= 2:
|
198 |
+
print('iter:', i, 'error:', error.item(), 'time spent:', round(time()-iter_time, 4))
|
199 |
+
if error <= self.tol:
|
200 |
+
break
|
201 |
+
|
202 |
+
if self.verbose >= 1:
|
203 |
+
print(f'used {i+1} iterations ({round(time()-start_time, 4)}s) to cluster {batch_size} items into {self.n_clusters} clusters')
|
204 |
+
return closest
|
cluster/train_cluster.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import logging
|
3 |
+
import os
|
4 |
+
import time
|
5 |
+
from pathlib import Path
|
6 |
+
|
7 |
+
import numpy as np
|
8 |
+
import torch
|
9 |
+
import tqdm
|
10 |
+
from kmeans import KMeansGPU
|
11 |
+
from sklearn.cluster import KMeans, MiniBatchKMeans
|
12 |
+
|
13 |
+
logging.basicConfig(level=logging.INFO)
|
14 |
+
logger = logging.getLogger(__name__)
|
15 |
+
|
16 |
+
def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False, use_gpu=False):
    """Fit a k-means model on the HuBERT/content-vec features found in in_dir.

    Loads every ``*.soft.pt`` tensor in ``in_dir``, concatenates them into a
    single [n_frames, n_features] float32 matrix and clusters it, either with
    scikit-learn on CPU or with KMeansGPU on GPU.

    Parameters:
      in_dir: pathlib.Path — directory containing ``*.soft.pt`` feature files
      n_clusters: int — number of k-means clusters
      use_minibatch: bool — use sklearn MiniBatchKMeans instead of full KMeans (CPU only)
      verbose: bool — clustering verbosity
      use_gpu: bool — use the torch KMeansGPU implementation

    Returns:
      dict with sklearn-compatible keys (``n_features_in_``, ``_n_threads``,
      ``cluster_centers_``), or None if ``in_dir`` is an .ipynb_checkpoints dir.
    """
    if str(in_dir).endswith(".ipynb_checkpoints"):
        logger.info(f"Ignore {in_dir}")
        # FIX: previously this branch only logged and then fell through,
        # attempting to cluster the checkpoint directory anyway (which would
        # crash on np.concatenate of an empty list). Actually skip it.
        return None

    logger.info(f"Loading features from {in_dir}")
    features = []
    nums = 0
    for path in tqdm.tqdm(in_dir.glob("*.soft.pt")):
        # each file holds a [1, n_features, n_frames] tensor; flatten to [n_frames, n_features]
        features.append(torch.load(path,map_location="cpu").squeeze(0).numpy().T)
        nums += 1  # FIX: counter was never incremented and always printed 0
        # print(features[-1].shape)
    features = np.concatenate(features, axis=0)
    print(nums, features.nbytes/ 1024**2, "MB , shape:",features.shape, features.dtype)
    features = features.astype(np.float32)
    logger.info(f"Clustering features of shape: {features.shape}")
    t = time.time()
    if(use_gpu is False):
        if use_minibatch:
            kmeans = MiniBatchKMeans(n_clusters=n_clusters,verbose=verbose, batch_size=4096, max_iter=80).fit(features)
        else:
            kmeans = KMeans(n_clusters=n_clusters,verbose=verbose).fit(features)
    else:
        kmeans = KMeansGPU(n_clusters=n_clusters, mode='euclidean', verbose=2 if verbose else 0,max_iter=500,tol=1e-2)
        features=torch.from_numpy(features)
        kmeans.fit_predict(features)

    print(time.time()-t, "s")

    # Package the fitted model as a plain dict so it can be torch.save'd and
    # later re-hydrated into a sklearn KMeans for inference.
    x = {
        "n_features_in_": kmeans.n_features_in_ if use_gpu is False else features.shape[1],
        "_n_threads": kmeans._n_threads if use_gpu is False else 4,
        "cluster_centers_": kmeans.cluster_centers_ if use_gpu is False else kmeans.centroids.cpu().numpy(),
    }
    print("end")

    return x
53 |
+
|
54 |
+
if __name__ == "__main__":
    # CLI entry point: fit one k-means model per speaker sub-directory of the
    # dataset and bundle all of them into a single checkpoint file.
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=Path, default="./dataset/44k",
                        help='path of training data directory')
    parser.add_argument('--output', type=Path, default="logs/44k",
                        help='path of model output directory')
    parser.add_argument('--gpu',action='store_true', default=False ,
                        help='to use GPU')

    args = parser.parse_args()

    checkpoint_dir = args.output
    dataset = args.dataset
    use_gpu = args.gpu
    n_clusters = 10000

    ckpt = {}
    for spk in os.listdir(dataset):
        in_dir = dataset/spk
        if not os.path.isdir(in_dir):
            continue  # skip stray files at the dataset root
        print(f"train kmeans for {spk}...")
        ckpt[spk] = train_cluster(in_dir, n_clusters,use_minibatch=False,verbose=False,use_gpu=use_gpu)

    # one file holds the models for every speaker
    checkpoint_path = checkpoint_dir / f"kmeans_{n_clusters}.pt"
    checkpoint_path.parent.mkdir(exist_ok=True, parents=True)
    torch.save(
        ckpt,
        checkpoint_path,
    )
85 |
+
|
configs/config.json
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"log_interval": 20,
|
4 |
+
"eval_interval": 20,
|
5 |
+
"seed": 1234,
|
6 |
+
"epochs": 10000,
|
7 |
+
"learning_rate": 0.0001,
|
8 |
+
"betas": [
|
9 |
+
0.8,
|
10 |
+
0.99
|
11 |
+
],
|
12 |
+
"eps": 1e-09,
|
13 |
+
"batch_size": 6,
|
14 |
+
"fp16_run": false,
|
15 |
+
"lr_decay": 0.999875,
|
16 |
+
"segment_size": 10240,
|
17 |
+
"init_lr_ratio": 1,
|
18 |
+
"warmup_epochs": 0,
|
19 |
+
"c_mel": 45,
|
20 |
+
"c_kl": 1.0,
|
21 |
+
"use_sr": true,
|
22 |
+
"max_speclen": 512,
|
23 |
+
"port": "8001",
|
24 |
+
"keep_ckpts": 3
|
25 |
+
},
|
26 |
+
"data": {
|
27 |
+
"training_files": "filelists/train.txt",
|
28 |
+
"validation_files": "filelists/val.txt",
|
29 |
+
"max_wav_value": 32768.0,
|
30 |
+
"sampling_rate": 44100,
|
31 |
+
"filter_length": 2048,
|
32 |
+
"hop_length": 512,
|
33 |
+
"win_length": 2048,
|
34 |
+
"n_mel_channels": 80,
|
35 |
+
"mel_fmin": 0.0,
|
36 |
+
"mel_fmax": 22050
|
37 |
+
},
|
38 |
+
"model": {
|
39 |
+
"inter_channels": 192,
|
40 |
+
"hidden_channels": 192,
|
41 |
+
"filter_channels": 768,
|
42 |
+
"n_heads": 2,
|
43 |
+
"n_layers": 6,
|
44 |
+
"kernel_size": 3,
|
45 |
+
"p_dropout": 0.1,
|
46 |
+
"resblock": "1",
|
47 |
+
"resblock_kernel_sizes": [
|
48 |
+
3,
|
49 |
+
7,
|
50 |
+
11
|
51 |
+
],
|
52 |
+
"resblock_dilation_sizes": [
|
53 |
+
[
|
54 |
+
1,
|
55 |
+
3,
|
56 |
+
5
|
57 |
+
],
|
58 |
+
[
|
59 |
+
1,
|
60 |
+
3,
|
61 |
+
5
|
62 |
+
],
|
63 |
+
[
|
64 |
+
1,
|
65 |
+
3,
|
66 |
+
5
|
67 |
+
]
|
68 |
+
],
|
69 |
+
"upsample_rates": [
|
70 |
+
8,
|
71 |
+
8,
|
72 |
+
2,
|
73 |
+
2,
|
74 |
+
2
|
75 |
+
],
|
76 |
+
"upsample_initial_channel": 512,
|
77 |
+
"upsample_kernel_sizes": [
|
78 |
+
16,
|
79 |
+
16,
|
80 |
+
4,
|
81 |
+
4,
|
82 |
+
4
|
83 |
+
],
|
84 |
+
"n_layers_q": 3,
|
85 |
+
"use_spectral_norm": false,
|
86 |
+
"gin_channels": 256,
|
87 |
+
"ssl_dim": 256,
|
88 |
+
"n_speakers": 200,
|
89 |
+
"speech_encoder": "vec256l9"
|
90 |
+
},
|
91 |
+
"spk": {
|
92 |
+
"Shengshuyan": 0
|
93 |
+
}
|
94 |
+
}
|
configs/diffusion.yaml
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
data:
|
2 |
+
sampling_rate: 44100
|
3 |
+
block_size: 512 # Equal to hop_length
|
4 |
+
duration: 2 # Audio duration during training, must be less than the duration of the shortest audio clip
|
5 |
+
encoder: 'vec768l12' # 'hubertsoft', 'vec256l9', 'vec768l12'
|
6 |
+
cnhubertsoft_gate: 10
|
7 |
+
encoder_sample_rate: 16000
|
8 |
+
encoder_hop_size: 320
|
9 |
+
encoder_out_channels: 768 # 256 if using 'hubertsoft'
|
10 |
+
training_files: "filelists/train.txt"
|
11 |
+
validation_files: "filelists/val.txt"
|
12 |
+
extensions: # List of extension included in the data collection
|
13 |
+
- wav
|
14 |
+
model:
|
15 |
+
type: 'Diffusion'
|
16 |
+
n_layers: 20
|
17 |
+
n_chans: 512
|
18 |
+
n_hidden: 256
|
19 |
+
use_pitch_aug: true
|
20 |
+
n_spk: 1 # max number of different speakers
|
21 |
+
device: cuda
|
22 |
+
vocoder:
|
23 |
+
type: 'nsf-hifigan'
|
24 |
+
ckpt: 'pretrain/nsf_hifigan/model'
|
25 |
+
infer:
|
26 |
+
speedup: 10
|
27 |
+
method: 'dpm-solver' # 'pndm' or 'dpm-solver'
|
28 |
+
env:
|
29 |
+
expdir: logs/44k/diffusion
|
30 |
+
gpu_id: 0
|
31 |
+
train:
|
32 |
+
num_workers: 2 # If your cpu and gpu are both very strong, set to 0 may be faster!
|
33 |
+
amp_dtype: fp32 # fp32, fp16 or bf16 (fp16 or bf16 may be faster if it is supported by your gpu)
|
34 |
+
batch_size: 48
|
35 |
+
cache_all_data: true # Save Internal-Memory or Graphics-Memory if it is false, but may be slow
|
36 |
+
cache_device: 'cpu' # Set to 'cuda' to cache the data into the Graphics-Memory, fastest speed for strong gpu
|
37 |
+
cache_fp16: true
|
38 |
+
epochs: 100000
|
39 |
+
interval_log: 10
|
40 |
+
interval_val: 2000
|
41 |
+
interval_force_save: 10000
|
42 |
+
lr: 0.0002
|
43 |
+
decay_step: 100000
|
44 |
+
gamma: 0.5
|
45 |
+
weight_decay: 0
|
46 |
+
save_opt: false
|
47 |
+
spk:
|
48 |
+
'nyaru': 0
|
configs_template/config_template.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"log_interval": 200,
|
4 |
+
"eval_interval": 800,
|
5 |
+
"seed": 1234,
|
6 |
+
"epochs": 10000,
|
7 |
+
"learning_rate": 0.0001,
|
8 |
+
"betas": [
|
9 |
+
0.8,
|
10 |
+
0.99
|
11 |
+
],
|
12 |
+
"eps": 1e-09,
|
13 |
+
"batch_size": 6,
|
14 |
+
"fp16_run": false,
|
15 |
+
"half_type": "fp16",
|
16 |
+
"lr_decay": 0.999875,
|
17 |
+
"segment_size": 10240,
|
18 |
+
"init_lr_ratio": 1,
|
19 |
+
"warmup_epochs": 0,
|
20 |
+
"c_mel": 45,
|
21 |
+
"c_kl": 1.0,
|
22 |
+
"use_sr": true,
|
23 |
+
"max_speclen": 512,
|
24 |
+
"port": "8001",
|
25 |
+
"keep_ckpts": 3,
|
26 |
+
"all_in_mem": false,
|
27 |
+
"vol_aug":false
|
28 |
+
},
|
29 |
+
"data": {
|
30 |
+
"training_files": "filelists/train.txt",
|
31 |
+
"validation_files": "filelists/val.txt",
|
32 |
+
"max_wav_value": 32768.0,
|
33 |
+
"sampling_rate": 44100,
|
34 |
+
"filter_length": 2048,
|
35 |
+
"hop_length": 512,
|
36 |
+
"win_length": 2048,
|
37 |
+
"n_mel_channels": 80,
|
38 |
+
"mel_fmin": 0.0,
|
39 |
+
"mel_fmax": 22050,
|
40 |
+
"unit_interpolate_mode":"nearest"
|
41 |
+
},
|
42 |
+
"model": {
|
43 |
+
"inter_channels": 192,
|
44 |
+
"hidden_channels": 192,
|
45 |
+
"filter_channels": 768,
|
46 |
+
"n_heads": 2,
|
47 |
+
"n_layers": 6,
|
48 |
+
"kernel_size": 3,
|
49 |
+
"p_dropout": 0.1,
|
50 |
+
"resblock": "1",
|
51 |
+
"resblock_kernel_sizes": [3,7,11],
|
52 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
53 |
+
"upsample_rates": [ 8, 8, 2, 2, 2],
|
54 |
+
"upsample_initial_channel": 512,
|
55 |
+
"upsample_kernel_sizes": [16,16, 4, 4, 4],
|
56 |
+
"n_layers_q": 3,
|
57 |
+
"n_flow_layer": 4,
|
58 |
+
"use_spectral_norm": false,
|
59 |
+
"gin_channels": 768,
|
60 |
+
"ssl_dim": 768,
|
61 |
+
"n_speakers": 200,
|
62 |
+
"vocoder_name":"nsf-hifigan",
|
63 |
+
"speech_encoder":"vec768l12",
|
64 |
+
"speaker_embedding":false,
|
65 |
+
"vol_embedding":false,
|
66 |
+
"use_depthwise_conv":false,
|
67 |
+
"flow_share_parameter": false,
|
68 |
+
"use_automatic_f0_prediction": true
|
69 |
+
},
|
70 |
+
"spk": {
|
71 |
+
"nyaru": 0,
|
72 |
+
"huiyu": 1,
|
73 |
+
"nen": 2,
|
74 |
+
"paimon": 3,
|
75 |
+
"yunhao": 4
|
76 |
+
}
|
77 |
+
}
|
configs_template/config_tiny_template.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"log_interval": 200,
|
4 |
+
"eval_interval": 800,
|
5 |
+
"seed": 1234,
|
6 |
+
"epochs": 10000,
|
7 |
+
"learning_rate": 0.0001,
|
8 |
+
"betas": [
|
9 |
+
0.8,
|
10 |
+
0.99
|
11 |
+
],
|
12 |
+
"eps": 1e-09,
|
13 |
+
"batch_size": 6,
|
14 |
+
"fp16_run": false,
|
15 |
+
"half_type": "fp16",
|
16 |
+
"lr_decay": 0.999875,
|
17 |
+
"segment_size": 10240,
|
18 |
+
"init_lr_ratio": 1,
|
19 |
+
"warmup_epochs": 0,
|
20 |
+
"c_mel": 45,
|
21 |
+
"c_kl": 1.0,
|
22 |
+
"use_sr": true,
|
23 |
+
"max_speclen": 512,
|
24 |
+
"port": "8001",
|
25 |
+
"keep_ckpts": 3,
|
26 |
+
"all_in_mem": false,
|
27 |
+
"vol_aug":false
|
28 |
+
},
|
29 |
+
"data": {
|
30 |
+
"training_files": "filelists/train.txt",
|
31 |
+
"validation_files": "filelists/val.txt",
|
32 |
+
"max_wav_value": 32768.0,
|
33 |
+
"sampling_rate": 44100,
|
34 |
+
"filter_length": 2048,
|
35 |
+
"hop_length": 512,
|
36 |
+
"win_length": 2048,
|
37 |
+
"n_mel_channels": 80,
|
38 |
+
"mel_fmin": 0.0,
|
39 |
+
"mel_fmax": 22050,
|
40 |
+
"unit_interpolate_mode":"nearest"
|
41 |
+
},
|
42 |
+
"model": {
|
43 |
+
"inter_channels": 192,
|
44 |
+
"hidden_channels": 192,
|
45 |
+
"filter_channels": 512,
|
46 |
+
"n_heads": 2,
|
47 |
+
"n_layers": 6,
|
48 |
+
"kernel_size": 3,
|
49 |
+
"p_dropout": 0.1,
|
50 |
+
"resblock": "1",
|
51 |
+
"resblock_kernel_sizes": [3,7,11],
|
52 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
53 |
+
"upsample_rates": [ 8, 8, 2, 2, 2],
|
54 |
+
"upsample_initial_channel": 400,
|
55 |
+
"upsample_kernel_sizes": [16,16, 4, 4, 4],
|
56 |
+
"n_layers_q": 3,
|
57 |
+
"n_flow_layer": 4,
|
58 |
+
"use_spectral_norm": false,
|
59 |
+
"gin_channels": 768,
|
60 |
+
"ssl_dim": 768,
|
61 |
+
"n_speakers": 200,
|
62 |
+
"vocoder_name":"nsf-hifigan",
|
63 |
+
"speech_encoder":"vec768l12",
|
64 |
+
"speaker_embedding":false,
|
65 |
+
"vol_embedding":false,
|
66 |
+
"use_depthwise_conv":true,
|
67 |
+
"flow_share_parameter": true,
|
68 |
+
"use_automatic_f0_prediction": true
|
69 |
+
},
|
70 |
+
"spk": {
|
71 |
+
"nyaru": 0,
|
72 |
+
"huiyu": 1,
|
73 |
+
"nen": 2,
|
74 |
+
"paimon": 3,
|
75 |
+
"yunhao": 4
|
76 |
+
}
|
77 |
+
}
|
configs_template/diffusion_template.yaml
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
data:
|
2 |
+
sampling_rate: 44100
|
3 |
+
block_size: 512 # Equal to hop_length
|
4 |
+
duration: 2 # Audio duration during training, must be less than the duration of the shortest audio clip
|
5 |
+
encoder: 'vec768l12' # 'hubertsoft', 'vec256l9', 'vec768l12'
|
6 |
+
cnhubertsoft_gate: 10
|
7 |
+
encoder_sample_rate: 16000
|
8 |
+
encoder_hop_size: 320
|
9 |
+
encoder_out_channels: 768 # 256 if using 'hubertsoft'
|
10 |
+
training_files: "filelists/train.txt"
|
11 |
+
validation_files: "filelists/val.txt"
|
12 |
+
extensions: # List of extension included in the data collection
|
13 |
+
- wav
|
14 |
+
unit_interpolate_mode: "nearest"
|
15 |
+
model:
|
16 |
+
type: 'Diffusion'
|
17 |
+
n_layers: 20
|
18 |
+
n_chans: 512
|
19 |
+
n_hidden: 256
|
20 |
+
use_pitch_aug: true
|
21 |
+
timesteps : 1000
|
22 |
+
k_step_max: 0 # must <= timesteps, If it is 0, train all
|
23 |
+
n_spk: 1 # max number of different speakers
|
24 |
+
device: cuda
|
25 |
+
vocoder:
|
26 |
+
type: 'nsf-hifigan'
|
27 |
+
ckpt: 'pretrain/nsf_hifigan/model'
|
28 |
+
infer:
|
29 |
+
speedup: 10
|
30 |
+
method: 'dpm-solver++' # 'pndm' or 'dpm-solver' or 'ddim' or 'unipc' or 'dpm-solver++'
|
31 |
+
env:
|
32 |
+
expdir: logs/44k/diffusion
|
33 |
+
gpu_id: 0
|
34 |
+
train:
|
35 |
+
num_workers: 4 # If your cpu and gpu are both very strong, set to 0 may be faster!
|
36 |
+
amp_dtype: fp32 # fp32, fp16 or bf16 (fp16 or bf16 may be faster if it is supported by your gpu)
|
37 |
+
batch_size: 48
|
38 |
+
cache_all_data: true # Save Internal-Memory or Graphics-Memory if it is false, but may be slow
|
39 |
+
cache_device: 'cpu' # Set to 'cuda' to cache the data into the Graphics-Memory, fastest speed for strong gpu
|
40 |
+
cache_fp16: true
|
41 |
+
epochs: 100000
|
42 |
+
interval_log: 10
|
43 |
+
interval_val: 2000
|
44 |
+
interval_force_save: 5000
|
45 |
+
lr: 0.0001
|
46 |
+
decay_step: 100000
|
47 |
+
gamma: 0.5
|
48 |
+
weight_decay: 0
|
49 |
+
save_opt: false
|
50 |
+
spk:
|
51 |
+
'nyaru': 0
|
dataset_raw/wav_structure.txt
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
数据集准备
|
2 |
+
|
3 |
+
raw
|
4 |
+
├───speaker0
|
5 |
+
│ ├───xxx1-xxx1.wav
|
6 |
+
│ ├───...
|
7 |
+
│ └───Lxx-0xx8.wav
|
8 |
+
└───speaker1
|
9 |
+
├───xx2-0xxx2.wav
|
10 |
+
├───...
|
11 |
+
└───xxx7-xxx007.wav
|
12 |
+
|
13 |
+
此外还需要编辑config.json
|
14 |
+
|
15 |
+
"n_speakers": 10
|
16 |
+
|
17 |
+
"spk":{
|
18 |
+
"speaker0": 0,
|
19 |
+
"speaker1": 1,
|
20 |
+
}
|