Katock committed
Commit 533346a · 1 Parent(s): cf90f08

Update models.py
app.py CHANGED
@@ -1,16 +1,15 @@
1
- import os
2
  import io
 
 
 
3
  import gradio as gr
 
4
  import librosa
5
  import numpy as np
6
- import utils
7
- from inference.infer_tool import Svc
8
- import logging
9
  import soundfile
10
- import asyncio
11
- import argparse
12
- import edge_tts
13
- import gradio.processing_utils as gr_processing_utils
14
 
15
  logging.getLogger('numba').setLevel(logging.WARNING)
16
  logging.getLogger('markdown_it').setLevel(logging.WARNING)
@@ -62,7 +61,6 @@ if __name__ == '__main__':
62
  parser.add_argument('--api', action="store_true", default=False)
63
  parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
64
  args = parser.parse_args()
65
- hubert_model = utils.get_hubert_model().to(args.device)
66
  models = []
67
  voices = []
68
  for f in os.listdir("models"):
 
1
+ import argparse
2
  import io
3
+ import logging
4
+ import os
5
+
6
  import gradio as gr
7
+ import gradio.processing_utils as gr_processing_utils
8
  import librosa
9
  import numpy as np
 
 
 
10
  import soundfile
11
+
12
+ from inference.infer_tool import Svc
 
 
13
 
14
  logging.getLogger('numba').setLevel(logging.WARNING)
15
  logging.getLogger('markdown_it').setLevel(logging.WARNING)
 
61
  parser.add_argument('--api', action="store_true", default=False)
62
  parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
63
  args = parser.parse_args()
 
64
  models = []
65
  voices = []
66
  for f in os.listdir("models"):
hubert/hubert_model.py DELETED
@@ -1,222 +0,0 @@
1
- import copy
2
- import random
3
- from typing import Optional, Tuple
4
-
5
- import torch
6
- import torch.nn as nn
7
- import torch.nn.functional as t_func
8
- from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
9
-
10
-
11
- class Hubert(nn.Module):
12
- def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
13
- super().__init__()
14
- self._mask = mask
15
- self.feature_extractor = FeatureExtractor()
16
- self.feature_projection = FeatureProjection()
17
- self.positional_embedding = PositionalConvEmbedding()
18
- self.norm = nn.LayerNorm(768)
19
- self.dropout = nn.Dropout(0.1)
20
- self.encoder = TransformerEncoder(
21
- nn.TransformerEncoderLayer(
22
- 768, 12, 3072, activation="gelu", batch_first=True
23
- ),
24
- 12,
25
- )
26
- self.proj = nn.Linear(768, 256)
27
-
28
- self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
29
- self.label_embedding = nn.Embedding(num_label_embeddings, 256)
30
-
31
- def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
32
- mask = None
33
- if self.training and self._mask:
34
- mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
35
- x[mask] = self.masked_spec_embed.to(x.dtype)
36
- return x, mask
37
-
38
- def encode(
39
- self, x: torch.Tensor, layer: Optional[int] = None
40
- ) -> Tuple[torch.Tensor, torch.Tensor]:
41
- x = self.feature_extractor(x)
42
- x = self.feature_projection(x.transpose(1, 2))
43
- x, mask = self.mask(x)
44
- x = x + self.positional_embedding(x)
45
- x = self.dropout(self.norm(x))
46
- x = self.encoder(x, output_layer=layer)
47
- return x, mask
48
-
49
- def logits(self, x: torch.Tensor) -> torch.Tensor:
50
- logits = torch.cosine_similarity(
51
- x.unsqueeze(2),
52
- self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
53
- dim=-1,
54
- )
55
- return logits / 0.1
56
-
57
- def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
58
- x, mask = self.encode(x)
59
- x = self.proj(x)
60
- logits = self.logits(x)
61
- return logits, mask
62
-
63
-
64
- class HubertSoft(Hubert):
65
- def __init__(self):
66
- super().__init__()
67
-
68
- @torch.inference_mode()
69
- def units(self, wav: torch.Tensor) -> torch.Tensor:
70
- wav = t_func.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
71
- x, _ = self.encode(wav)
72
- return self.proj(x)
73
-
74
-
75
- class FeatureExtractor(nn.Module):
76
- def __init__(self):
77
- super().__init__()
78
- self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
79
- self.norm0 = nn.GroupNorm(512, 512)
80
- self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
81
- self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
82
- self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
83
- self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
84
- self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
85
- self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
86
-
87
- def forward(self, x: torch.Tensor) -> torch.Tensor:
88
- x = t_func.gelu(self.norm0(self.conv0(x)))
89
- x = t_func.gelu(self.conv1(x))
90
- x = t_func.gelu(self.conv2(x))
91
- x = t_func.gelu(self.conv3(x))
92
- x = t_func.gelu(self.conv4(x))
93
- x = t_func.gelu(self.conv5(x))
94
- x = t_func.gelu(self.conv6(x))
95
- return x
96
-
97
-
98
- class FeatureProjection(nn.Module):
99
- def __init__(self):
100
- super().__init__()
101
- self.norm = nn.LayerNorm(512)
102
- self.projection = nn.Linear(512, 768)
103
- self.dropout = nn.Dropout(0.1)
104
-
105
- def forward(self, x: torch.Tensor) -> torch.Tensor:
106
- x = self.norm(x)
107
- x = self.projection(x)
108
- x = self.dropout(x)
109
- return x
110
-
111
-
112
- class PositionalConvEmbedding(nn.Module):
113
- def __init__(self):
114
- super().__init__()
115
- self.conv = nn.Conv1d(
116
- 768,
117
- 768,
118
- kernel_size=128,
119
- padding=128 // 2,
120
- groups=16,
121
- )
122
- self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
123
-
124
- def forward(self, x: torch.Tensor) -> torch.Tensor:
125
- x = self.conv(x.transpose(1, 2))
126
- x = t_func.gelu(x[:, :, :-1])
127
- return x.transpose(1, 2)
128
-
129
-
130
- class TransformerEncoder(nn.Module):
131
- def __init__(
132
- self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
133
- ) -> None:
134
- super(TransformerEncoder, self).__init__()
135
- self.layers = nn.ModuleList(
136
- [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
137
- )
138
- self.num_layers = num_layers
139
-
140
- def forward(
141
- self,
142
- src: torch.Tensor,
143
- mask: torch.Tensor = None,
144
- src_key_padding_mask: torch.Tensor = None,
145
- output_layer: Optional[int] = None,
146
- ) -> torch.Tensor:
147
- output = src
148
- for layer in self.layers[:output_layer]:
149
- output = layer(
150
- output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
151
- )
152
- return output
153
-
154
-
155
- def _compute_mask(
156
- shape: Tuple[int, int],
157
- mask_prob: float,
158
- mask_length: int,
159
- device: torch.device,
160
- min_masks: int = 0,
161
- ) -> torch.Tensor:
162
- batch_size, sequence_length = shape
163
-
164
- if mask_length < 1:
165
- raise ValueError("`mask_length` has to be bigger than 0.")
166
-
167
- if mask_length > sequence_length:
168
- raise ValueError(
169
- f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
170
- )
171
-
172
- # compute number of masked spans in batch
173
- num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
174
- num_masked_spans = max(num_masked_spans, min_masks)
175
-
176
- # make sure num masked indices <= sequence_length
177
- if num_masked_spans * mask_length > sequence_length:
178
- num_masked_spans = sequence_length // mask_length
179
-
180
- # SpecAugment mask to fill
181
- mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)
182
-
183
- # uniform distribution to sample from, make sure that offset samples are < sequence_length
184
- uniform_dist = torch.ones(
185
- (batch_size, sequence_length - (mask_length - 1)), device=device
186
- )
187
-
188
- # get random indices to mask
189
- mask_indices = torch.multinomial(uniform_dist, num_masked_spans)
190
-
191
- # expand masked indices to masked spans
192
- mask_indices = (
193
- mask_indices.unsqueeze(dim=-1)
194
- .expand((batch_size, num_masked_spans, mask_length))
195
- .reshape(batch_size, num_masked_spans * mask_length)
196
- )
197
- offsets = (
198
- torch.arange(mask_length, device=device)[None, None, :]
199
- .expand((batch_size, num_masked_spans, mask_length))
200
- .reshape(batch_size, num_masked_spans * mask_length)
201
- )
202
- mask_idxs = mask_indices + offsets
203
-
204
- # scatter indices to mask
205
- mask = mask.scatter(1, mask_idxs, True)
206
-
207
- return mask
208
-
209
-
210
- def hubert_soft(
211
- path: str,
212
- ) -> HubertSoft:
213
- r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
214
- Args:
215
- path (str): path of a pretrained model
216
- """
217
- hubert = HubertSoft()
218
- checkpoint = torch.load(path)
219
- consume_prefix_in_state_dict_if_present(checkpoint, "module.")
220
- hubert.load_state_dict(checkpoint)
221
- hubert.eval()
222
- return hubert

hubert/hubert_model_onnx.py DELETED
@@ -1,217 +0,0 @@
1
- import copy
2
- import random
3
- from typing import Optional, Tuple
4
-
5
- import torch
6
- import torch.nn as nn
7
- import torch.nn.functional as t_func
8
- from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
9
-
10
-
11
- class Hubert(nn.Module):
12
- def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
13
- super().__init__()
14
- self._mask = mask
15
- self.feature_extractor = FeatureExtractor()
16
- self.feature_projection = FeatureProjection()
17
- self.positional_embedding = PositionalConvEmbedding()
18
- self.norm = nn.LayerNorm(768)
19
- self.dropout = nn.Dropout(0.1)
20
- self.encoder = TransformerEncoder(
21
- nn.TransformerEncoderLayer(
22
- 768, 12, 3072, activation="gelu", batch_first=True
23
- ),
24
- 12,
25
- )
26
- self.proj = nn.Linear(768, 256)
27
-
28
- self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
29
- self.label_embedding = nn.Embedding(num_label_embeddings, 256)
30
-
31
- def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
32
- mask = None
33
- if self.training and self._mask:
34
- mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
35
- x[mask] = self.masked_spec_embed.to(x.dtype)
36
- return x, mask
37
-
38
- def encode(
39
- self, x: torch.Tensor, layer: Optional[int] = None
40
- ) -> Tuple[torch.Tensor, torch.Tensor]:
41
- x = self.feature_extractor(x)
42
- x = self.feature_projection(x.transpose(1, 2))
43
- x, mask = self.mask(x)
44
- x = x + self.positional_embedding(x)
45
- x = self.dropout(self.norm(x))
46
- x = self.encoder(x, output_layer=layer)
47
- return x, mask
48
-
49
- def logits(self, x: torch.Tensor) -> torch.Tensor:
50
- logits = torch.cosine_similarity(
51
- x.unsqueeze(2),
52
- self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
53
- dim=-1,
54
- )
55
- return logits / 0.1
56
-
57
-
58
- class HubertSoft(Hubert):
59
- def __init__(self):
60
- super().__init__()
61
-
62
- def units(self, wav: torch.Tensor) -> torch.Tensor:
63
- wav = t_func.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
64
- x, _ = self.encode(wav)
65
- return self.proj(x)
66
-
67
- def forward(self, x):
68
- return self.units(x)
69
-
70
- class FeatureExtractor(nn.Module):
71
- def __init__(self):
72
- super().__init__()
73
- self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
74
- self.norm0 = nn.GroupNorm(512, 512)
75
- self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
76
- self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
77
- self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
78
- self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
79
- self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
80
- self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
81
-
82
- def forward(self, x: torch.Tensor) -> torch.Tensor:
83
- x = t_func.gelu(self.norm0(self.conv0(x)))
84
- x = t_func.gelu(self.conv1(x))
85
- x = t_func.gelu(self.conv2(x))
86
- x = t_func.gelu(self.conv3(x))
87
- x = t_func.gelu(self.conv4(x))
88
- x = t_func.gelu(self.conv5(x))
89
- x = t_func.gelu(self.conv6(x))
90
- return x
91
-
92
-
93
- class FeatureProjection(nn.Module):
94
- def __init__(self):
95
- super().__init__()
96
- self.norm = nn.LayerNorm(512)
97
- self.projection = nn.Linear(512, 768)
98
- self.dropout = nn.Dropout(0.1)
99
-
100
- def forward(self, x: torch.Tensor) -> torch.Tensor:
101
- x = self.norm(x)
102
- x = self.projection(x)
103
- x = self.dropout(x)
104
- return x
105
-
106
-
107
- class PositionalConvEmbedding(nn.Module):
108
- def __init__(self):
109
- super().__init__()
110
- self.conv = nn.Conv1d(
111
- 768,
112
- 768,
113
- kernel_size=128,
114
- padding=128 // 2,
115
- groups=16,
116
- )
117
- self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
118
-
119
- def forward(self, x: torch.Tensor) -> torch.Tensor:
120
- x = self.conv(x.transpose(1, 2))
121
- x = t_func.gelu(x[:, :, :-1])
122
- return x.transpose(1, 2)
123
-
124
-
125
- class TransformerEncoder(nn.Module):
126
- def __init__(
127
- self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
128
- ) -> None:
129
- super(TransformerEncoder, self).__init__()
130
- self.layers = nn.ModuleList(
131
- [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
132
- )
133
- self.num_layers = num_layers
134
-
135
- def forward(
136
- self,
137
- src: torch.Tensor,
138
- mask: torch.Tensor = None,
139
- src_key_padding_mask: torch.Tensor = None,
140
- output_layer: Optional[int] = None,
141
- ) -> torch.Tensor:
142
- output = src
143
- for layer in self.layers[:output_layer]:
144
- output = layer(
145
- output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
146
- )
147
- return output
148
-
149
-
150
- def _compute_mask(
151
- shape: Tuple[int, int],
152
- mask_prob: float,
153
- mask_length: int,
154
- device: torch.device,
155
- min_masks: int = 0,
156
- ) -> torch.Tensor:
157
- batch_size, sequence_length = shape
158
-
159
- if mask_length < 1:
160
- raise ValueError("`mask_length` has to be bigger than 0.")
161
-
162
- if mask_length > sequence_length:
163
- raise ValueError(
164
- f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
165
- )
166
-
167
- # compute number of masked spans in batch
168
- num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
169
- num_masked_spans = max(num_masked_spans, min_masks)
170
-
171
- # make sure num masked indices <= sequence_length
172
- if num_masked_spans * mask_length > sequence_length:
173
- num_masked_spans = sequence_length // mask_length
174
-
175
- # SpecAugment mask to fill
176
- mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)
177
-
178
- # uniform distribution to sample from, make sure that offset samples are < sequence_length
179
- uniform_dist = torch.ones(
180
- (batch_size, sequence_length - (mask_length - 1)), device=device
181
- )
182
-
183
- # get random indices to mask
184
- mask_indices = torch.multinomial(uniform_dist, num_masked_spans)
185
-
186
- # expand masked indices to masked spans
187
- mask_indices = (
188
- mask_indices.unsqueeze(dim=-1)
189
- .expand((batch_size, num_masked_spans, mask_length))
190
- .reshape(batch_size, num_masked_spans * mask_length)
191
- )
192
- offsets = (
193
- torch.arange(mask_length, device=device)[None, None, :]
194
- .expand((batch_size, num_masked_spans, mask_length))
195
- .reshape(batch_size, num_masked_spans * mask_length)
196
- )
197
- mask_idxs = mask_indices + offsets
198
-
199
- # scatter indices to mask
200
- mask = mask.scatter(1, mask_idxs, True)
201
-
202
- return mask
203
-
204
-
205
- def hubert_soft(
206
- path: str,
207
- ) -> HubertSoft:
208
- r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
209
- Args:
210
- path (str): path of a pretrained model
211
- """
212
- hubert = HubertSoft()
213
- checkpoint = torch.load(path)
214
- consume_prefix_in_state_dict_if_present(checkpoint, "module.")
215
- hubert.load_state_dict(checkpoint)
216
- hubert.eval()
217
- return hubert

inference/infer_tool.py CHANGED
@@ -82,16 +82,19 @@ def get_end_file(dir_path, end):
82
  def get_md5(content):
83
  return hashlib.new("md5", content).hexdigest()
84
 
 
85
  def fill_a_to_b(a, b):
86
  if len(a) < len(b):
87
  for _ in range(0, len(b) - len(a)):
88
  a.append(a[0])
89
 
 
90
  def mkdir(paths: list):
91
  for path in paths:
92
  if not os.path.exists(path):
93
  os.mkdir(path)
94
 
 
95
  def pad_array(arr, target_length):
96
  current_length = arr.shape[0]
97
  if current_length >= target_length:
@@ -102,15 +105,17 @@ def pad_array(arr, target_length):
102
  pad_right = pad_width - pad_left
103
  padded_arr = np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))
104
  return padded_arr
105
-
 
106
  def split_list_by_n(list_collection, n, pre=0):
107
  for i in range(0, len(list_collection), n):
108
- yield list_collection[i-pre if i-pre>=0 else i: i + n]
109
 
110
 
111
  class F0FilterException(Exception):
112
  pass
113
 
 
114
  class Svc(object):
115
  def __init__(self, net_g_path, config_path,
116
  device=None,
@@ -140,14 +145,14 @@ class Svc(object):
140
 
141
  if os.path.exists(cluster_model_path):
142
  if self.feature_retrieval:
143
- with open(cluster_model_path,"rb") as f:
144
  self.cluster_model = pickle.load(f)
145
  self.big_npy = None
146
  self.now_spk_id = -1
147
  else:
148
  self.cluster_model = cluster.get_cluster_model(cluster_model_path)
149
  else:
150
- self.feature_retrieval=False
151
 
152
  def load_model(self, spk_mix_enable=False):
153
  # get model configuration
@@ -163,10 +168,12 @@ class Svc(object):
163
  if spk_mix_enable:
164
  self.net_g_ms.EnableCharacterMix(len(self.spk2id), self.dev)
165
 
166
- def get_unit_f0(self, wav, tran, cluster_infer_ratio, speaker, f0_filter ,f0_predictor,cr_threshold=0.05):
 
 
 
 
167
 
168
- f0_predictor_object = utils.get_f0_predictor(f0_predictor,hop_length=self.hop_size,sampling_rate=self.target_sample,device=self.dev,threshold=cr_threshold)
169
-
170
  f0, uv = f0_predictor_object.compute_f0_uv(wav)
171
  if f0_filter and sum(f0) == 0:
172
  raise F0FilterException("No voice detected")
@@ -179,10 +186,11 @@ class Svc(object):
179
 
180
  wav16k = librosa.resample(wav, orig_sr=self.target_sample, target_sr=16000)
181
  wav16k = torch.from_numpy(wav16k).to(self.dev)
182
- c = utils.get_hubert_content(self.hubert_model, wav_16k_tensor=wav16k)
 
183
  c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
184
 
185
- if cluster_infer_ratio !=0:
186
  if self.feature_retrieval:
187
  speaker_id = self.spk2id.get(speaker)
188
  if speaker_id is None:
@@ -191,17 +199,17 @@ class Svc(object):
191
  if len(self.spk2id.__dict__) >= speaker:
192
  speaker_id = speaker
193
  feature_index = self.cluster_model[speaker_id]
194
- feat_np = c.transpose(0,1).cpu().numpy()
195
  if self.big_npy is None or self.now_spk_id != speaker_id:
196
- self.big_npy = feature_index.reconstruct_n(0, feature_index.ntotal)
197
- self.now_spk_id = speaker_id
198
  print("starting feature retrieval...")
199
  score, ix = feature_index.search(feat_np, k=8)
200
  weight = np.square(1 / score)
201
  weight /= weight.sum(axis=1, keepdims=True)
202
  npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
203
  c = cluster_infer_ratio * npy + (1 - cluster_infer_ratio) * feat_np
204
- c = torch.FloatTensor(c).to(self.dev).transpose(0,1)
205
  print("end feature retrieval...")
206
  else:
207
  cluster_c = cluster.get_cluster_center_result(self.cluster_model, c.cpu().numpy().T, speaker).T
@@ -217,41 +225,35 @@ class Svc(object):
217
  noice_scale=0.4,
218
  f0_filter=False,
219
  f0_predictor='pm',
220
- enhancer_adaptive_key = 0,
221
- cr_threshold = 0.05,
222
- k_step = 100,
223
- frame = 0,
224
- spk_mix = False,
225
- second_encoding = False,
226
- loudness_envelope_adjustment = 1
227
  ):
228
  wav, sr = librosa.load(raw_path, sr=self.target_sample)
229
- if spk_mix:
230
- c, f0, uv = self.get_unit_f0(wav, tran, 0, None, f0_filter,f0_predictor,cr_threshold=cr_threshold)
231
- n_frames = f0.size(1)
232
- sid = speaker[:, frame:frame+n_frames].transpose(0,1)
233
- else:
234
- speaker_id = self.spk2id.get(speaker)
235
- if not speaker_id and type(speaker) is int:
236
- if len(self.spk2id.__dict__) >= speaker:
237
- speaker_id = speaker
238
- if speaker_id is None:
239
- raise RuntimeError("The name you entered is not in the speaker list!")
240
- sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
241
- c, f0, uv = self.get_unit_f0(wav, tran, cluster_infer_ratio, speaker, f0_filter,f0_predictor,cr_threshold=cr_threshold)
242
- n_frames = f0.size(1)
243
  if "half" in self.net_g_path and torch.cuda.is_available():
244
  c = c.half()
245
  with torch.no_grad():
246
  start = time.time()
247
  vol = None
248
- vol = self.volume_extractor.extract(torch.FloatTensor(wav).to(self.dev)[None,:])[None,:].to(self.dev) if self.vol_embedding else None
249
- audio,f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale,vol=vol)
250
- audio = audio[0,0].data.float()
251
- audio_mel = self.vocoder.extract(audio[None,:],self.target_sample) if self.shallow_diffusion else None
252
-
253
- if loudness_envelope_adjustment != 1:
254
- audio = utils.change_rms(wav,self.target_sample,audio,self.target_sample,loudness_envelope_adjustment)
255
  use_time = time.time() - start
256
  print("vits use time:{}".format(use_time))
257
  return audio, audio.shape[-1], n_frames
@@ -264,7 +266,7 @@ class Svc(object):
264
  # unload model
265
  self.net_g_ms = self.net_g_ms.to("cpu")
266
  del self.net_g_ms
267
- if hasattr(self,"enhancer"):
268
  self.enhancer.enhancer = self.enhancer.enhancer.to("cpu")
269
  del self.enhancer.enhancer
270
  del self.enhancer
@@ -281,14 +283,14 @@ class Svc(object):
281
  pad_seconds=0.5,
282
  clip_seconds=0,
283
  lg_num=0,
284
- lgr_num =0.75,
285
  f0_predictor='pm',
286
- enhancer_adaptive_key = 0,
287
- cr_threshold = 0.05,
288
- k_step = 100,
289
- use_spk_mix = False,
290
- second_encoding = False,
291
- loudness_envelope_adjustment = 1
292
  ):
293
  if use_spk_mix:
294
  if len(self.spk2id) == 1:
@@ -297,12 +299,12 @@ class Svc(object):
297
  wav_path = Path(raw_audio_path).with_suffix('.wav')
298
  chunks = slicer.cut(wav_path, db_thresh=slice_db)
299
  audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
300
- per_size = int(clip_seconds*audio_sr)
301
- lg_size = int(lg_num*audio_sr)
302
- lg_size_r = int(lg_size*lgr_num)
303
- lg_size_c_l = (lg_size-lg_size_r)//2
304
- lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
305
- lg = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0
306
 
307
  if use_spk_mix:
308
  assert len(self.spk2id) == len(spk)
@@ -313,10 +315,10 @@ class Svc(object):
313
  audio_length += aud_length // self.hop_size
314
  continue
315
  if per_size != 0:
316
- datas = split_list_by_n(data, per_size,lg_size)
317
  else:
318
  datas = [data]
319
- for k,dat in enumerate(datas):
320
  pad_len = int(audio_sr * pad_seconds)
321
  per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample))
322
  a_length = per_length + 2 * pad_len
@@ -326,14 +328,14 @@ class Svc(object):
326
  for i in range(len(spk)):
327
  last_end = None
328
  for mix in spk[i]:
329
- if mix[3]<0. or mix[2]<0.:
330
  raise RuntimeError("mix value must higer Than zero!")
331
  begin = int(audio_length * mix[0])
332
  end = int(audio_length * mix[1])
333
  length = end - begin
334
- if length<=0:
335
  raise RuntimeError("begin Must lower Than end!")
336
- step = (mix[3] - mix[2])/length
337
  if last_end is not None:
338
  if last_end != begin:
339
  raise RuntimeError("[i]EndTime Must Equal [i+1]BeginTime!")
@@ -341,20 +343,20 @@ class Svc(object):
341
  if step == 0.:
342
  spk_mix_data = torch.zeros(length).to(self.dev) + mix[2]
343
  else:
344
- spk_mix_data = torch.arange(mix[2],mix[3],step).to(self.dev)
345
- if(len(spk_mix_data)<length):
346
  num_pad = length - len(spk_mix_data)
347
  spk_mix_data = torch.nn.functional.pad(spk_mix_data, [0, num_pad], mode="reflect").to(self.dev)
348
  spk_mix_tensor[i][begin:end] = spk_mix_data[:length]
349
 
350
- spk_mix_ten = torch.sum(spk_mix_tensor,dim=0).unsqueeze(0).to(self.dev)
351
  # spk_mix_tensor[0][spk_mix_ten<0.001] = 1.0
352
  for i, x in enumerate(spk_mix_ten[0]):
353
  if x == 0.0:
354
  spk_mix_ten[0][i] = 1.0
355
- spk_mix_tensor[:,i] = 1.0 / len(spk)
356
  spk_mix_tensor = spk_mix_tensor / spk_mix_ten
357
- if not ((torch.sum(spk_mix_tensor,dim=0) - 1.)<0.0001).all():
358
  raise RuntimeError("sum(spk_mix_tensor) not equal 1")
359
  spk = spk_mix_tensor
360
 
@@ -371,12 +373,12 @@ class Svc(object):
371
  global_frame += length // self.hop_size
372
  continue
373
  if per_size != 0:
374
- datas = split_list_by_n(data, per_size,lg_size)
375
  else:
376
  datas = [data]
377
- for k,dat in enumerate(datas):
378
- per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds!=0 else length
379
- if clip_seconds!=0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
380
  # padd
381
  pad_len = int(audio_sr * pad_seconds)
382
  dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
@@ -384,33 +386,34 @@ class Svc(object):
384
  soundfile.write(raw_path, dat, audio_sr, format="wav")
385
  raw_path.seek(0)
386
  out_audio, out_sr, out_frame = self.infer(spk, tran, raw_path,
387
- cluster_infer_ratio=cluster_infer_ratio,
388
- auto_predict_f0=auto_predict_f0,
389
- noice_scale=noice_scale,
390
- f0_predictor = f0_predictor,
391
- enhancer_adaptive_key = enhancer_adaptive_key,
392
- cr_threshold = cr_threshold,
393
- k_step = k_step,
394
- frame = global_frame,
395
- spk_mix = use_spk_mix,
396
- second_encoding = second_encoding,
397
- loudness_envelope_adjustment = loudness_envelope_adjustment
398
- )
399
  global_frame += out_frame
400
  _audio = out_audio.cpu().numpy()
401
  pad_len = int(self.target_sample * pad_seconds)
402
  _audio = _audio[pad_len:-pad_len]
403
  _audio = pad_array(_audio, per_length)
404
- if lg_size!=0 and k!=0:
405
- lg1 = audio[-(lg_size_r+lg_size_c_r):-lg_size_c_r] if lgr_num != 1 else audio[-lg_size:]
406
- lg2 = _audio[lg_size_c_l:lg_size_c_l+lg_size_r] if lgr_num != 1 else _audio[0:lg_size]
407
- lg_pre = lg1*(1-lg)+lg2*lg
408
- audio = audio[0:-(lg_size_r+lg_size_c_r)] if lgr_num != 1 else audio[0:-lg_size]
409
  audio.extend(lg_pre)
410
- _audio = _audio[lg_size_c_l+lg_size_r:] if lgr_num != 1 else _audio[lg_size:]
411
  audio.extend(list(_audio))
412
  return np.array(audio)
413
 
 
414
  class RealTimeVC:
415
  def __init__(self):
416
  self.last_chunk = None
@@ -438,7 +441,7 @@ class RealTimeVC:
438
  auto_predict_f0=auto_predict_f0,
439
  noice_scale=noice_scale,
440
  f0_filter=f0_filter)
441
-
442
  audio = audio.cpu().numpy()
443
  self.last_chunk = audio[-self.pre_len:]
444
  self.last_o = audio
@@ -459,4 +462,3 @@ class RealTimeVC:
459
  self.last_chunk = audio[-self.pre_len:]
460
  self.last_o = audio
461
  return ret[self.chunk_len:2 * self.chunk_len]
462
-
 
82
  def get_md5(content):
83
  return hashlib.new("md5", content).hexdigest()
84
 
85
+
86
  def fill_a_to_b(a, b):
87
  if len(a) < len(b):
88
  for _ in range(0, len(b) - len(a)):
89
  a.append(a[0])
90
 
91
+
92
  def mkdir(paths: list):
93
  for path in paths:
94
  if not os.path.exists(path):
95
  os.mkdir(path)
96
 
97
+
98
  def pad_array(arr, target_length):
99
  current_length = arr.shape[0]
100
  if current_length >= target_length:
 
105
  pad_right = pad_width - pad_left
106
  padded_arr = np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))
107
  return padded_arr
108
+
109
+
110
  def split_list_by_n(list_collection, n, pre=0):
111
  for i in range(0, len(list_collection), n):
112
+ yield list_collection[i - pre if i - pre >= 0 else i: i + n]
113
 
114
 
115
  class F0FilterException(Exception):
116
  pass
117
 
118
+
119
  class Svc(object):
120
  def __init__(self, net_g_path, config_path,
121
  device=None,
 
145
 
146
  if os.path.exists(cluster_model_path):
147
  if self.feature_retrieval:
148
+ with open(cluster_model_path, "rb") as f:
149
  self.cluster_model = pickle.load(f)
150
  self.big_npy = None
151
  self.now_spk_id = -1
152
  else:
153
  self.cluster_model = cluster.get_cluster_model(cluster_model_path)
154
  else:
155
+ self.feature_retrieval = False
156
 
157
  def load_model(self, spk_mix_enable=False):
158
  # get model configuration
 
168
  if spk_mix_enable:
169
  self.net_g_ms.EnableCharacterMix(len(self.spk2id), self.dev)
170
 
171
+ def get_unit_f0(self, wav, tran, cluster_infer_ratio, speaker, f0_filter, f0_predictor, cr_threshold=0.05):
172
+
173
+ f0_predictor_object = utils.get_f0_predictor(f0_predictor, hop_length=self.hop_size,
174
+ sampling_rate=self.target_sample, device=self.dev,
175
+ threshold=cr_threshold)
176
 
 
 
177
  f0, uv = f0_predictor_object.compute_f0_uv(wav)
178
  if f0_filter and sum(f0) == 0:
179
  raise F0FilterException("No voice detected")
 
186
 
187
  wav16k = librosa.resample(wav, orig_sr=self.target_sample, target_sr=16000)
188
  wav16k = torch.from_numpy(wav16k).to(self.dev)
189
+ c = self.hubert_model.encoder(wav16k)
190
+ # c = utils.get_hubert_content(self.hubert_model, wav_16k_tensor=wav16k)
191
  c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
192
 
193
+ if cluster_infer_ratio != 0:
194
  if self.feature_retrieval:
195
  speaker_id = self.spk2id.get(speaker)
196
  if speaker_id is None:
 
199
  if len(self.spk2id.__dict__) >= speaker:
200
  speaker_id = speaker
201
  feature_index = self.cluster_model[speaker_id]
202
+ feat_np = c.transpose(0, 1).cpu().numpy()
203
  if self.big_npy is None or self.now_spk_id != speaker_id:
204
+ self.big_npy = feature_index.reconstruct_n(0, feature_index.ntotal)
205
+ self.now_spk_id = speaker_id
206
  print("starting feature retrieval...")
207
  score, ix = feature_index.search(feat_np, k=8)
208
  weight = np.square(1 / score)
209
  weight /= weight.sum(axis=1, keepdims=True)
210
  npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
211
  c = cluster_infer_ratio * npy + (1 - cluster_infer_ratio) * feat_np
212
+ c = torch.FloatTensor(c).to(self.dev).transpose(0, 1)
213
  print("end feature retrieval...")
214
  else:
215
  cluster_c = cluster.get_cluster_center_result(self.cluster_model, c.cpu().numpy().T, speaker).T
 
225
  noice_scale=0.4,
226
  f0_filter=False,
227
  f0_predictor='pm',
228
+ enhancer_adaptive_key=0,
229
+ cr_threshold=0.05,
230
+ k_step=100,
231
+ frame=0,
232
+ spk_mix=False,
233
+ second_encoding=False,
234
+ loudness_envelope_adjustment=1
235
  ):
236
  wav, sr = librosa.load(raw_path, sr=self.target_sample)
237
+ speaker_id = self.spk2id.get(speaker)
238
+ if not speaker_id and type(speaker) is int:
239
+ if len(self.spk2id.__dict__) >= speaker:
240
+ speaker_id = speaker
241
+ if speaker_id is None:
242
+ raise RuntimeError("The name you entered is not in the speaker list!")
243
+ sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
244
+ c, f0, uv = self.get_unit_f0(wav, tran, cluster_infer_ratio, speaker, f0_filter, f0_predictor,
245
+ cr_threshold=cr_threshold)
246
+ n_frames = f0.size(1)
 
 
 
 
247
  if "half" in self.net_g_path and torch.cuda.is_available():
248
  c = c.half()
249
  with torch.no_grad():
250
  start = time.time()
251
  vol = None
252
+ vol = self.volume_extractor.extract(torch.FloatTensor(wav).to(self.dev)[None, :])[None, :].to(
253
+ self.dev) if self.vol_embedding else None
254
+ audio, f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale,
255
+ vol=vol)
256
+ audio = audio[0, 0].data.float()
 
 
257
  use_time = time.time() - start
258
  print("vits use time:{}".format(use_time))
259
  return audio, audio.shape[-1], n_frames
 
266
  # unload model
267
  self.net_g_ms = self.net_g_ms.to("cpu")
268
  del self.net_g_ms
269
+ if hasattr(self, "enhancer"):
270
  self.enhancer.enhancer = self.enhancer.enhancer.to("cpu")
271
  del self.enhancer.enhancer
272
  del self.enhancer
 
283
  pad_seconds=0.5,
284
  clip_seconds=0,
285
  lg_num=0,
286
+ lgr_num=0.75,
287
  f0_predictor='pm',
288
+ enhancer_adaptive_key=0,
289
+ cr_threshold=0.05,
290
+ k_step=100,
291
+ use_spk_mix=False,
292
+ second_encoding=False,
293
+ loudness_envelope_adjustment=1
294
  ):
295
  if use_spk_mix:
296
  if len(self.spk2id) == 1:
 
299
  wav_path = Path(raw_audio_path).with_suffix('.wav')
300
  chunks = slicer.cut(wav_path, db_thresh=slice_db)
301
  audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
302
+ per_size = int(clip_seconds * audio_sr)
303
+ lg_size = int(lg_num * audio_sr)
304
+ lg_size_r = int(lg_size * lgr_num)
305
+ lg_size_c_l = (lg_size - lg_size_r) // 2
306
+ lg_size_c_r = lg_size - lg_size_r - lg_size_c_l
307
+ lg = np.linspace(0, 1, lg_size_r) if lg_size != 0 else 0
308
 
309
  if use_spk_mix:
310
  assert len(self.spk2id) == len(spk)
 
315
  audio_length += aud_length // self.hop_size
316
  continue
317
  if per_size != 0:
318
+ datas = split_list_by_n(data, per_size, lg_size)
319
  else:
320
  datas = [data]
321
+ for k, dat in enumerate(datas):
322
  pad_len = int(audio_sr * pad_seconds)
323
  per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample))
324
  a_length = per_length + 2 * pad_len
 
328
  for i in range(len(spk)):
329
  last_end = None
330
  for mix in spk[i]:
331
+ if mix[3] < 0. or mix[2] < 0.:
332
  raise RuntimeError("mix value must higer Than zero!")
333
  begin = int(audio_length * mix[0])
334
  end = int(audio_length * mix[1])
335
  length = end - begin
336
+ if length <= 0:
337
  raise RuntimeError("begin Must lower Than end!")
338
+ step = (mix[3] - mix[2]) / length
339
  if last_end is not None:
340
  if last_end != begin:
341
  raise RuntimeError("[i]EndTime Must Equal [i+1]BeginTime!")
 
343
  if step == 0.:
344
  spk_mix_data = torch.zeros(length).to(self.dev) + mix[2]
345
  else:
346
+ spk_mix_data = torch.arange(mix[2], mix[3], step).to(self.dev)
347
+ if (len(spk_mix_data) < length):
348
  num_pad = length - len(spk_mix_data)
349
  spk_mix_data = torch.nn.functional.pad(spk_mix_data, [0, num_pad], mode="reflect").to(self.dev)
350
  spk_mix_tensor[i][begin:end] = spk_mix_data[:length]
351
 
352
+ spk_mix_ten = torch.sum(spk_mix_tensor, dim=0).unsqueeze(0).to(self.dev)
353
  # spk_mix_tensor[0][spk_mix_ten<0.001] = 1.0
354
  for i, x in enumerate(spk_mix_ten[0]):
355
  if x == 0.0:
356
  spk_mix_ten[0][i] = 1.0
357
+ spk_mix_tensor[:, i] = 1.0 / len(spk)
358
  spk_mix_tensor = spk_mix_tensor / spk_mix_ten
359
+ if not ((torch.sum(spk_mix_tensor, dim=0) - 1.) < 0.0001).all():
360
  raise RuntimeError("sum(spk_mix_tensor) not equal 1")
361
  spk = spk_mix_tensor
362
 
 
373
  global_frame += length // self.hop_size
374
  continue
375
  if per_size != 0:
376
+ datas = split_list_by_n(data, per_size, lg_size)
377
  else:
378
  datas = [data]
379
+ for k, dat in enumerate(datas):
380
+ per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds != 0 else length
381
+ if clip_seconds != 0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
382
  # padd
383
  pad_len = int(audio_sr * pad_seconds)
384
  dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
 
386
  soundfile.write(raw_path, dat, audio_sr, format="wav")
387
  raw_path.seek(0)
388
  out_audio, out_sr, out_frame = self.infer(spk, tran, raw_path,
389
+ cluster_infer_ratio=cluster_infer_ratio,
390
+ auto_predict_f0=auto_predict_f0,
391
+ noice_scale=noice_scale,
392
+ f0_predictor=f0_predictor,
393
+ enhancer_adaptive_key=enhancer_adaptive_key,
394
+ cr_threshold=cr_threshold,
395
+ k_step=k_step,
396
+ frame=global_frame,
397
+ spk_mix=use_spk_mix,
398
+ second_encoding=second_encoding,
399
+ loudness_envelope_adjustment=loudness_envelope_adjustment
400
+ )
401
  global_frame += out_frame
402
  _audio = out_audio.cpu().numpy()
403
  pad_len = int(self.target_sample * pad_seconds)
404
  _audio = _audio[pad_len:-pad_len]
405
  _audio = pad_array(_audio, per_length)
406
+ if lg_size != 0 and k != 0:
407
+ lg1 = audio[-(lg_size_r + lg_size_c_r):-lg_size_c_r] if lgr_num != 1 else audio[-lg_size:]
408
+ lg2 = _audio[lg_size_c_l:lg_size_c_l + lg_size_r] if lgr_num != 1 else _audio[0:lg_size]
409
+ lg_pre = lg1 * (1 - lg) + lg2 * lg
410
+ audio = audio[0:-(lg_size_r + lg_size_c_r)] if lgr_num != 1 else audio[0:-lg_size]
411
  audio.extend(lg_pre)
412
+ _audio = _audio[lg_size_c_l + lg_size_r:] if lgr_num != 1 else _audio[lg_size:]
413
  audio.extend(list(_audio))
414
  return np.array(audio)
415
 
416
+
417
  class RealTimeVC:
418
  def __init__(self):
419
  self.last_chunk = None
 
441
  auto_predict_f0=auto_predict_f0,
442
  noice_scale=noice_scale,
443
  f0_filter=f0_filter)
444
+
445
  audio = audio.cpu().numpy()
446
  self.last_chunk = audio[-self.pre_len:]
447
  self.last_o = audio
 
462
  self.last_chunk = audio[-self.pre_len:]
463
  self.last_o = audio
464
  return ret[self.chunk_len:2 * self.chunk_len]
 
{hubert → inference/inference}/__init__.py RENAMED
File without changes
inference/inference/chunks_temp.json ADDED
@@ -0,0 +1 @@
1
+ {"info": "temp_dict"}
inference/inference/infer_tool.py ADDED
@@ -0,0 +1,533 @@
1
+ import hashlib
2
+ import io
3
+ import json
4
+ import logging
5
+ import os
6
+ import time
7
+ from pathlib import Path
8
+ from inference import slicer
9
+ import gc
10
+
11
+ import librosa
12
+ import numpy as np
13
+ # import onnxruntime
14
+ import soundfile
15
+ import torch
16
+ import torchaudio
17
+
18
+ import cluster
19
+ import utils
20
+ from models import SynthesizerTrn
21
+ import pickle
22
+
23
+ from diffusion.unit2mel import load_model_vocoder
24
+ import yaml
25
+
26
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
27
+
28
+
29
+ def read_temp(file_name):
30
+ if not os.path.exists(file_name):
31
+ with open(file_name, "w") as f:
32
+ f.write(json.dumps({"info": "temp_dict"}))
33
+ return {}
34
+ else:
35
+ try:
36
+ with open(file_name, "r") as f:
37
+ data = f.read()
38
+ data_dict = json.loads(data)
39
+ if os.path.getsize(file_name) > 50 * 1024 * 1024:
40
+ f_name = file_name.replace("\\", "/").split("/")[-1]
41
+ print(f"clean {f_name}")
42
+ for wav_hash in list(data_dict.keys()):
43
+ if int(time.time()) - int(data_dict[wav_hash]["time"]) > 14 * 24 * 3600:
44
+ del data_dict[wav_hash]
45
+ except Exception as e:
46
+ print(e)
47
+ print(f"{file_name} error,auto rebuild file")
48
+ data_dict = {"info": "temp_dict"}
49
+ return data_dict
50
+
51
+
52
+ def write_temp(file_name, data):
53
+ with open(file_name, "w") as f:
54
+ f.write(json.dumps(data))
55
+
56
+
57
+ def timeit(func):
58
+ def run(*args, **kwargs):
59
+ t = time.time()
60
+ res = func(*args, **kwargs)
61
+ print('executing \'%s\' costed %.3fs' % (func.__name__, time.time() - t))
62
+ return res
63
+
64
+ return run
65
+
66
+
67
+ def format_wav(audio_path):
68
+ if Path(audio_path).suffix == '.wav':
69
+ return
70
+ raw_audio, raw_sample_rate = librosa.load(audio_path, mono=True, sr=None)
71
+ soundfile.write(Path(audio_path).with_suffix(".wav"), raw_audio, raw_sample_rate)
72
+
73
+
74
+ def get_end_file(dir_path, end):
75
+ file_lists = []
76
+ for root, dirs, files in os.walk(dir_path):
77
+ files = [f for f in files if f[0] != '.']
78
+ dirs[:] = [d for d in dirs if d[0] != '.']
79
+ for f_file in files:
80
+ if f_file.endswith(end):
81
+ file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
82
+ return file_lists
83
+
84
+
85
+ def get_md5(content):
86
+ return hashlib.new("md5", content).hexdigest()
87
+
88
+ def fill_a_to_b(a, b):
89
+ if len(a) < len(b):
90
+ for _ in range(0, len(b) - len(a)):
91
+ a.append(a[0])
92
+
93
+ def mkdir(paths: list):
94
+ for path in paths:
95
+ if not os.path.exists(path):
96
+ os.mkdir(path)
97
+
98
+ def pad_array(arr, target_length):
99
+ current_length = arr.shape[0]
100
+ if current_length >= target_length:
101
+ return arr
102
+ else:
103
+ pad_width = target_length - current_length
104
+ pad_left = pad_width // 2
105
+ pad_right = pad_width - pad_left
106
+ padded_arr = np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))
107
+ return padded_arr
108
+
109
+ def split_list_by_n(list_collection, n, pre=0):
110
+ for i in range(0, len(list_collection), n):
111
+ yield list_collection[i-pre if i-pre>=0 else i: i + n]
112
+
113
+
114
+ class F0FilterException(Exception):
115
+ pass
116
+
117
+ class Svc(object):
118
+ def __init__(self, net_g_path, config_path,
119
+ device=None,
120
+ cluster_model_path="logs/44k/kmeans_10000.pt",
121
+ nsf_hifigan_enhance = False,
122
+ diffusion_model_path="logs/44k/diffusion/model_0.pt",
123
+ diffusion_config_path="configs/diffusion.yaml",
124
+ shallow_diffusion = False,
125
+ only_diffusion = False,
126
+ spk_mix_enable = False,
127
+ feature_retrieval = False
128
+ ):
129
+ self.net_g_path = net_g_path
130
+ self.only_diffusion = only_diffusion
131
+ self.shallow_diffusion = shallow_diffusion
132
+ self.feature_retrieval = feature_retrieval
133
+ if device is None:
134
+ self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
135
+ else:
136
+ self.dev = torch.device(device)
137
+ self.net_g_ms = None
138
+ if not self.only_diffusion:
139
+ self.hps_ms = utils.get_hparams_from_file(config_path)
140
+ self.target_sample = self.hps_ms.data.sampling_rate
141
+ self.hop_size = self.hps_ms.data.hop_length
142
+ self.spk2id = self.hps_ms.spk
143
+ try:
144
+ self.vol_embedding = self.hps_ms.model.vol_embedding
145
+ except Exception as e:
146
+ self.vol_embedding = False
147
+ try:
148
+ self.speech_encoder = self.hps_ms.model.speech_encoder
149
+ except Exception as e:
150
+ self.speech_encoder = 'vec768l12'
151
+
152
+ self.nsf_hifigan_enhance = nsf_hifigan_enhance
153
+ if self.shallow_diffusion or self.only_diffusion:
154
+ if os.path.exists(diffusion_model_path) and os.path.exists(diffusion_model_path):
155
+ self.diffusion_model,self.vocoder,self.diffusion_args = load_model_vocoder(diffusion_model_path,self.dev,config_path=diffusion_config_path)
156
+ if self.only_diffusion:
157
+ self.target_sample = self.diffusion_args.data.sampling_rate
158
+ self.hop_size = self.diffusion_args.data.block_size
159
+ self.spk2id = self.diffusion_args.spk
160
+ self.speech_encoder = self.diffusion_args.data.encoder
161
+ if spk_mix_enable:
162
+ self.diffusion_model.init_spkmix(len(self.spk2id))
163
+ else:
164
+ print("No diffusion model or config found. Shallow diffusion mode will False")
165
+ self.shallow_diffusion = self.only_diffusion = False
166
+
167
+ # load hubert and model
168
+ if not self.only_diffusion:
169
+ self.load_model(spk_mix_enable)
170
+ self.hubert_model = utils.get_speech_encoder(self.speech_encoder,device=self.dev)
171
+ self.volume_extractor = utils.Volume_Extractor(self.hop_size)
172
+ else:
173
+ self.hubert_model = utils.get_speech_encoder(self.diffusion_args.data.encoder,device=self.dev)
174
+ self.volume_extractor = utils.Volume_Extractor(self.diffusion_args.data.block_size)
175
+
176
+ if os.path.exists(cluster_model_path):
177
+ if self.feature_retrieval:
178
+ with open(cluster_model_path,"rb") as f:
179
+ self.cluster_model = pickle.load(f)
180
+ self.big_npy = None
181
+ self.now_spk_id = -1
182
+ else:
183
+ self.cluster_model = cluster.get_cluster_model(cluster_model_path)
184
+ else:
185
+ self.feature_retrieval=False
186
+
187
+ if self.shallow_diffusion : self.nsf_hifigan_enhance = False
188
+ if self.nsf_hifigan_enhance:
189
+ from modules.enhancer import Enhancer
190
+ self.enhancer = Enhancer('nsf-hifigan', 'pretrain/nsf_hifigan/model',device=self.dev)
191
+
192
+ def load_model(self, spk_mix_enable=False):
193
+ # get model configuration
194
+ self.net_g_ms = SynthesizerTrn(
195
+ self.hps_ms.data.filter_length // 2 + 1,
196
+ self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
197
+ **self.hps_ms.model)
198
+ _ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None)
199
+ if "half" in self.net_g_path and torch.cuda.is_available():
200
+ _ = self.net_g_ms.half().eval().to(self.dev)
201
+ else:
202
+ _ = self.net_g_ms.eval().to(self.dev)
203
+ if spk_mix_enable:
204
+ self.net_g_ms.EnableCharacterMix(len(self.spk2id), self.dev)
205
+
206
+ def get_unit_f0(self, wav, tran, cluster_infer_ratio, speaker, f0_filter ,f0_predictor,cr_threshold=0.05):
207
+
208
+ f0_predictor_object = utils.get_f0_predictor(f0_predictor,hop_length=self.hop_size,sampling_rate=self.target_sample,device=self.dev,threshold=cr_threshold)
209
+
210
+ f0, uv = f0_predictor_object.compute_f0_uv(wav)
211
+ if f0_filter and sum(f0) == 0:
212
+ raise F0FilterException("No voice detected")
213
+ f0 = torch.FloatTensor(f0).to(self.dev)
214
+ uv = torch.FloatTensor(uv).to(self.dev)
215
+
216
+ f0 = f0 * 2 ** (tran / 12)
217
+ f0 = f0.unsqueeze(0)
218
+ uv = uv.unsqueeze(0)
219
+
220
+ wav16k = librosa.resample(wav, orig_sr=self.target_sample, target_sr=16000)
221
+ wav16k = torch.from_numpy(wav16k).to(self.dev)
222
+ c = self.hubert_model.encoder(wav16k)
223
+ c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
224
+
225
+ if cluster_infer_ratio !=0:
226
+ if self.feature_retrieval:
227
+ speaker_id = self.spk2id.get(speaker)
228
+ if speaker_id is None:
229
+ raise RuntimeError("The name you entered is not in the speaker list!")
230
+ if not speaker_id and type(speaker) is int:
231
+ if len(self.spk2id.__dict__) >= speaker:
232
+ speaker_id = speaker
233
+ feature_index = self.cluster_model[speaker_id]
234
+ feat_np = c.transpose(0,1).cpu().numpy()
235
+ if self.big_npy is None or self.now_spk_id != speaker_id:
236
+ self.big_npy = feature_index.reconstruct_n(0, feature_index.ntotal)
237
+ self.now_spk_id = speaker_id
238
+ print("starting feature retrieval...")
239
+ score, ix = feature_index.search(feat_np, k=8)
240
+ weight = np.square(1 / score)
241
+ weight /= weight.sum(axis=1, keepdims=True)
242
+ npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
243
+ c = cluster_infer_ratio * npy + (1 - cluster_infer_ratio) * feat_np
244
+ c = torch.FloatTensor(c).to(self.dev).transpose(0,1)
245
+ print("end feature retrieval...")
246
+ else:
247
+ cluster_c = cluster.get_cluster_center_result(self.cluster_model, c.cpu().numpy().T, speaker).T
248
+ cluster_c = torch.FloatTensor(cluster_c).to(self.dev)
249
+ c = cluster_infer_ratio * cluster_c + (1 - cluster_infer_ratio) * c
250
+
251
+ c = c.unsqueeze(0)
252
+ return c, f0, uv
253
+
254
+ def infer(self, speaker, tran, raw_path,
255
+ cluster_infer_ratio=0,
256
+ auto_predict_f0=False,
257
+ noice_scale=0.4,
258
+ f0_filter=False,
259
+ f0_predictor='pm',
260
+ enhancer_adaptive_key = 0,
261
+ cr_threshold = 0.05,
262
+ k_step = 100,
263
+ frame = 0,
264
+ spk_mix = False,
265
+ second_encoding = False,
266
+ loudness_envelope_adjustment = 1
267
+ ):
268
+ wav, sr = librosa.load(raw_path, sr=self.target_sample)
269
+ if spk_mix:
270
+ c, f0, uv = self.get_unit_f0(wav, tran, 0, None, f0_filter,f0_predictor,cr_threshold=cr_threshold)
271
+ n_frames = f0.size(1)
272
+ sid = speaker[:, frame:frame+n_frames].transpose(0,1)
273
+ else:
274
+ speaker_id = self.spk2id.get(speaker)
275
+ if not speaker_id and type(speaker) is int:
276
+ if len(self.spk2id.__dict__) >= speaker:
277
+ speaker_id = speaker
278
+ if speaker_id is None:
279
+ raise RuntimeError("The name you entered is not in the speaker list!")
280
+ sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
281
+ c, f0, uv = self.get_unit_f0(wav, tran, cluster_infer_ratio, speaker, f0_filter,f0_predictor,cr_threshold=cr_threshold)
282
+ n_frames = f0.size(1)
283
+ if "half" in self.net_g_path and torch.cuda.is_available():
284
+ c = c.half()
285
+ with torch.no_grad():
286
+ start = time.time()
287
+ vol = None
288
+ if not self.only_diffusion:
289
+ vol = self.volume_extractor.extract(torch.FloatTensor(wav).to(self.dev)[None,:])[None,:].to(self.dev) if self.vol_embedding else None
290
+ audio,f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale,vol=vol)
291
+ audio = audio[0,0].data.float()
292
+ audio_mel = self.vocoder.extract(audio[None,:],self.target_sample) if self.shallow_diffusion else None
293
+ else:
294
+ audio = torch.FloatTensor(wav).to(self.dev)
295
+ audio_mel = None
296
+ if self.only_diffusion or self.shallow_diffusion:
297
+ vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev) if vol==None else vol[:,:,None]
298
+ if self.shallow_diffusion and second_encoding:
299
+ audio16k = librosa.resample(audio.detach().cpu().numpy(), orig_sr=self.target_sample, target_sr=16000)
300
+ audio16k = torch.from_numpy(audio16k).to(self.dev)
301
+ c = self.hubert_model.encoder(audio16k)
302
+ c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
303
+ f0 = f0[:,:,None]
304
+ c = c.transpose(-1,-2)
305
+ audio_mel = self.diffusion_model(
306
+ c,
307
+ f0,
308
+ vol,
309
+ spk_id = sid,
310
+ spk_mix_dict = None,
311
+ gt_spec=audio_mel,
312
+ infer=True,
313
+ infer_speedup=self.diffusion_args.infer.speedup,
314
+ method=self.diffusion_args.infer.method,
315
+ k_step=k_step)
316
+ audio = self.vocoder.infer(audio_mel, f0).squeeze()
317
+ if self.nsf_hifigan_enhance:
318
+ audio, _ = self.enhancer.enhance(
319
+ audio[None,:],
320
+ self.target_sample,
321
+ f0[:,:,None],
322
+ self.hps_ms.data.hop_length,
323
+ adaptive_key = enhancer_adaptive_key)
324
+ if loudness_envelope_adjustment != 1:
325
+ audio = utils.change_rms(wav,self.target_sample,audio,self.target_sample,loudness_envelope_adjustment)
326
+ use_time = time.time() - start
327
+ print("vits use time:{}".format(use_time))
328
+ return audio, audio.shape[-1], n_frames
329
+
330
+ def clear_empty(self):
331
+ # clean up vram
332
+ torch.cuda.empty_cache()
333
+
334
+ def unload_model(self):
335
+ # unload model
336
+ self.net_g_ms = self.net_g_ms.to("cpu")
337
+ del self.net_g_ms
338
+ if hasattr(self,"enhancer"):
339
+ self.enhancer.enhancer = self.enhancer.enhancer.to("cpu")
340
+ del self.enhancer.enhancer
341
+ del self.enhancer
342
+ gc.collect()
343
+
344
+ def slice_inference(self,
345
+ raw_audio_path,
346
+ spk,
347
+ tran,
348
+ slice_db,
349
+ cluster_infer_ratio,
350
+ auto_predict_f0,
351
+ noice_scale,
352
+ pad_seconds=0.5,
353
+ clip_seconds=0,
354
+ lg_num=0,
355
+ lgr_num =0.75,
356
+ f0_predictor='pm',
357
+ enhancer_adaptive_key = 0,
358
+ cr_threshold = 0.05,
359
+ k_step = 100,
360
+ use_spk_mix = False,
361
+ second_encoding = False,
362
+ loudness_envelope_adjustment = 1
363
+ ):
364
+ if use_spk_mix:
365
+ if len(self.spk2id) == 1:
366
+ spk = self.spk2id.keys()[0]
367
+ use_spk_mix = False
368
+ wav_path = Path(raw_audio_path).with_suffix('.wav')
369
+ chunks = slicer.cut(wav_path, db_thresh=slice_db)
370
+ audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
371
+ per_size = int(clip_seconds*audio_sr)
372
+ lg_size = int(lg_num*audio_sr)
373
+ lg_size_r = int(lg_size*lgr_num)
374
+ lg_size_c_l = (lg_size-lg_size_r)//2
375
+ lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
376
+ lg = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0
377
+
378
+ if use_spk_mix:
379
+ assert len(self.spk2id) == len(spk)
380
+            audio_length = 0
+            for (slice_tag, data) in audio_data:
+                aud_length = int(np.ceil(len(data) / audio_sr * self.target_sample))
+                if slice_tag:
+                    audio_length += aud_length // self.hop_size
+                    continue
+                if per_size != 0:
+                    datas = split_list_by_n(data, per_size, lg_size)
+                else:
+                    datas = [data]
+                for k, dat in enumerate(datas):
+                    pad_len = int(audio_sr * pad_seconds)
+                    per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample))
+                    a_length = per_length + 2 * pad_len
+                    audio_length += a_length // self.hop_size
+            audio_length += len(audio_data)
+            spk_mix_tensor = torch.zeros(size=(len(spk), audio_length)).to(self.dev)
+            for i in range(len(spk)):
+                last_end = None
+                for mix in spk[i]:
+                    if mix[3] < 0. or mix[2] < 0.:
+                        raise RuntimeError("mix value must be greater than zero!")
+                    begin = int(audio_length * mix[0])
+                    end = int(audio_length * mix[1])
+                    length = end - begin
+                    if length <= 0:
+                        raise RuntimeError("begin must be lower than end!")
+                    step = (mix[3] - mix[2]) / length
+                    if last_end is not None:
+                        if last_end != begin:
+                            raise RuntimeError("[i] end time must equal [i+1] begin time!")
+                    last_end = end
+                    if step == 0.:
+                        spk_mix_data = torch.zeros(length).to(self.dev) + mix[2]
+                    else:
+                        spk_mix_data = torch.arange(mix[2], mix[3], step).to(self.dev)
+                    if len(spk_mix_data) < length:
+                        num_pad = length - len(spk_mix_data)
+                        spk_mix_data = torch.nn.functional.pad(spk_mix_data, [0, num_pad], mode="reflect").to(self.dev)
+                    spk_mix_tensor[i][begin:end] = spk_mix_data[:length]
+
+            spk_mix_ten = torch.sum(spk_mix_tensor, dim=0).unsqueeze(0).to(self.dev)
+            # spk_mix_tensor[0][spk_mix_ten<0.001] = 1.0
+            for i, x in enumerate(spk_mix_ten[0]):
+                if x == 0.0:
+                    spk_mix_ten[0][i] = 1.0
+                    spk_mix_tensor[:, i] = 1.0 / len(spk)
+            spk_mix_tensor = spk_mix_tensor / spk_mix_ten
+            if not ((torch.sum(spk_mix_tensor, dim=0) - 1.) < 0.0001).all():
+                raise RuntimeError("sum(spk_mix_tensor) does not equal 1")
+            spk = spk_mix_tensor
+
+        global_frame = 0
+        audio = []
+        for (slice_tag, data) in audio_data:
+            print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
+            # pad
+            length = int(np.ceil(len(data) / audio_sr * self.target_sample))
+            if slice_tag:
+                print('skip empty segment')
+                _audio = np.zeros(length)
+                audio.extend(list(pad_array(_audio, length)))
+                global_frame += length // self.hop_size
+                continue
+            if per_size != 0:
+                datas = split_list_by_n(data, per_size, lg_size)
+            else:
+                datas = [data]
+            for k, dat in enumerate(datas):
+                per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds != 0 else length
+                if clip_seconds != 0:
+                    print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
+                # pad
+                pad_len = int(audio_sr * pad_seconds)
+                dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
+                raw_path = io.BytesIO()
+                soundfile.write(raw_path, dat, audio_sr, format="wav")
+                raw_path.seek(0)
+                out_audio, out_sr, out_frame = self.infer(
+                    spk, tran, raw_path,
+                    cluster_infer_ratio=cluster_infer_ratio,
+                    auto_predict_f0=auto_predict_f0,
+                    noice_scale=noice_scale,
+                    f0_predictor=f0_predictor,
+                    enhancer_adaptive_key=enhancer_adaptive_key,
+                    cr_threshold=cr_threshold,
+                    k_step=k_step,
+                    frame=global_frame,
+                    spk_mix=use_spk_mix,
+                    second_encoding=second_encoding,
+                    loudness_envelope_adjustment=loudness_envelope_adjustment
+                )
+                global_frame += out_frame
+                _audio = out_audio.cpu().numpy()
+                pad_len = int(self.target_sample * pad_seconds)
+                _audio = _audio[pad_len:-pad_len]
+                _audio = pad_array(_audio, per_length)
+                if lg_size != 0 and k != 0:
+                    lg1 = audio[-(lg_size_r + lg_size_c_r):-lg_size_c_r] if lgr_num != 1 else audio[-lg_size:]
+                    lg2 = _audio[lg_size_c_l:lg_size_c_l + lg_size_r] if lgr_num != 1 else _audio[0:lg_size]
+                    lg_pre = lg1 * (1 - lg) + lg2 * lg
+                    audio = audio[0:-(lg_size_r + lg_size_c_r)] if lgr_num != 1 else audio[0:-lg_size]
+                    audio.extend(lg_pre)
+                    _audio = _audio[lg_size_c_l + lg_size_r:] if lgr_num != 1 else _audio[lg_size:]
+                audio.extend(list(_audio))
+        return np.array(audio)
+
+
+ class RealTimeVC:
+     def __init__(self):
+         self.last_chunk = None
+         self.last_o = None
+         self.chunk_len = 16000  # chunk length
+         self.pre_len = 3840  # cross-fade length, a multiple of 640
+
+     # Input and output are 1-dimensional numpy waveform arrays
+
+     def process(self, svc_model, speaker_id, f_pitch_change, input_wav_path,
+                 cluster_infer_ratio=0,
+                 auto_predict_f0=False,
+                 noice_scale=0.4,
+                 f0_filter=False):
+
+         import maad
+         audio, sr = torchaudio.load(input_wav_path)
+         audio = audio.cpu().numpy()[0]
+         temp_wav = io.BytesIO()
+         if self.last_chunk is None:
+             input_wav_path.seek(0)
+
+             audio, sr = svc_model.infer(
+                 speaker_id, f_pitch_change, input_wav_path,
+                 cluster_infer_ratio=cluster_infer_ratio,
+                 auto_predict_f0=auto_predict_f0,
+                 noice_scale=noice_scale,
+                 f0_filter=f0_filter)
+
+             audio = audio.cpu().numpy()
+             self.last_chunk = audio[-self.pre_len:]
+             self.last_o = audio
+             return audio[-self.chunk_len:]
+         else:
+             audio = np.concatenate([self.last_chunk, audio])
+             soundfile.write(temp_wav, audio, sr, format="wav")
+             temp_wav.seek(0)
+
+             audio, sr = svc_model.infer(
+                 speaker_id, f_pitch_change, temp_wav,
+                 cluster_infer_ratio=cluster_infer_ratio,
+                 auto_predict_f0=auto_predict_f0,
+                 noice_scale=noice_scale,
+                 f0_filter=f0_filter)
+
+             audio = audio.cpu().numpy()
+             ret = maad.util.crossfade(self.last_o, audio, self.pre_len)
+             self.last_chunk = audio[-self.pre_len:]
+             self.last_o = audio
+             return ret[self.chunk_len:2 * self.chunk_len]
+
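
Note: a minimal offline sketch of driving RealTimeVC.process above, feeding successive chunk_len-sample windows of a file and concatenating the results. The helper name fake_stream, the librosa-based loading, and the already-initialised svc_model (an Svc instance) are illustrative assumptions, not part of this commit.

import io

import librosa
import numpy as np
import soundfile

def fake_stream(svc_model, wav_path, speaker_id, pitch_shift=0):
    # Illustrative driver: svc_model is assumed to be an initialised Svc instance.
    rt = RealTimeVC()
    audio, sr = librosa.load(wav_path, sr=None, mono=True)
    out_chunks = []
    for start in range(0, len(audio), rt.chunk_len):
        buf = io.BytesIO()
        soundfile.write(buf, audio[start:start + rt.chunk_len], sr, format="wav")
        buf.seek(0)  # process() expects a seekable, torchaudio-loadable object
        out_chunks.append(rt.process(svc_model, speaker_id, pitch_shift, buf))
    return np.concatenate(out_chunks)
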
inference/inference/infer_tool_grad.py ADDED
@@ -0,0 +1,160 @@
+ import hashlib
+ import json
+ import logging
+ import os
+ import time
+ from pathlib import Path
+ import io
+ import librosa
+ import maad
+ import numpy as np
+ from inference import slicer
+ import parselmouth
+ import soundfile
+ import torch
+ import torchaudio
+
+ from hubert import hubert_model
+ import utils
+ from models import SynthesizerTrn
+ logging.getLogger('numba').setLevel(logging.WARNING)
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
+
+ def resize2d_f0(x, target_len):
+     source = np.array(x)
+     source[source < 0.001] = np.nan
+     target = np.interp(
+         np.arange(0, len(source) * target_len, len(source)) / target_len,
+         np.arange(0, len(source)),
+         source)
+     res = np.nan_to_num(target)
+     return res
+
+ def get_f0(x, p_len, f0_up_key=0):
+
+     time_step = 160 / 16000 * 1000
+     f0_min = 50
+     f0_max = 1100
+     f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+     f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
+     f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
+         time_step=time_step / 1000, voicing_threshold=0.6,
+         pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
+
+     pad_size = (p_len - len(f0) + 1) // 2
+     if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+         f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant')
+
+     f0 *= pow(2, f0_up_key / 12)
+     f0_mel = 1127 * np.log(1 + f0 / 700)
+     f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
+     f0_mel[f0_mel <= 1] = 1
+     f0_mel[f0_mel > 255] = 255
+     f0_coarse = np.rint(f0_mel).astype(int)
+     return f0_coarse, f0
+
+ def clean_pitch(input_pitch):
+     num_nan = np.sum(input_pitch == 1)
+     if num_nan / len(input_pitch) > 0.9:
+         input_pitch[input_pitch != 1] = 1
+     return input_pitch
+
+
+ def plt_pitch(input_pitch):
+     input_pitch = input_pitch.astype(float)
+     input_pitch[input_pitch == 1] = np.nan
+     return input_pitch
+
+
+ def f0_to_pitch(ff):
+     f0_pitch = 69 + 12 * np.log2(ff / 440)
+     return f0_pitch
+
+
+ def fill_a_to_b(a, b):
+     if len(a) < len(b):
+         for _ in range(0, len(b) - len(a)):
+             a.append(a[0])
+
+
+ def mkdir(paths: list):
+     for path in paths:
+         if not os.path.exists(path):
+             os.mkdir(path)
+
+
+ class VitsSvc(object):
+     def __init__(self):
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.SVCVITS = None
+         self.hps = None
+         self.speakers = None
+         self.hubert_soft = utils.get_hubert_model()
+
+     def set_device(self, device):
+         self.device = torch.device(device)
+         self.hubert_soft.to(self.device)
+         if self.SVCVITS is not None:
+             self.SVCVITS.to(self.device)
+
+     def loadCheckpoint(self, path):
+         self.hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
+         self.SVCVITS = SynthesizerTrn(
+             self.hps.data.filter_length // 2 + 1,
+             self.hps.train.segment_size // self.hps.data.hop_length,
+             **self.hps.model)
+         _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", self.SVCVITS, None)
+         _ = self.SVCVITS.eval().to(self.device)
+         self.speakers = self.hps.spk
+
+     def get_units(self, source, sr):
+         source = source.unsqueeze(0).to(self.device)
+         with torch.inference_mode():
+             units = self.hubert_soft.units(source)
+             return units
+
+
+     def get_unit_pitch(self, in_path, tran):
+         source, sr = torchaudio.load(in_path)
+         source = torchaudio.functional.resample(source, sr, 16000)
+         if len(source.shape) == 2 and source.shape[1] >= 2:
+             source = torch.mean(source, dim=0).unsqueeze(0)
+         soft = self.get_units(source, sr).squeeze(0).cpu().numpy()
+         f0_coarse, f0 = get_f0(source.cpu().numpy()[0], soft.shape[0] * 2, tran)
+         return soft, f0
+
+     def infer(self, speaker_id, tran, raw_path):
+         speaker_id = self.speakers[speaker_id]
+         sid = torch.LongTensor([int(speaker_id)]).to(self.device).unsqueeze(0)
+         soft, pitch = self.get_unit_pitch(raw_path, tran)
+         f0 = torch.FloatTensor(clean_pitch(pitch)).unsqueeze(0).to(self.device)
+         stn_tst = torch.FloatTensor(soft)
+         with torch.no_grad():
+             x_tst = stn_tst.unsqueeze(0).to(self.device)
+             x_tst = torch.repeat_interleave(x_tst, repeats=2, dim=1).transpose(1, 2)
+             audio,_ = self.SVCVITS.infer(x_tst, f0=f0, g=sid)[0,0].data.float()
+         return audio, audio.shape[-1]
+
+     def inference(self, srcaudio, chara, tran, slice_db):
+         sampling_rate, audio = srcaudio
+         audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
+         if len(audio.shape) > 1:
+             audio = librosa.to_mono(audio.transpose(1, 0))
+         if sampling_rate != 16000:
+             audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
+         soundfile.write("tmpwav.wav", audio, 16000, format="wav")
+         chunks = slicer.cut("tmpwav.wav", db_thresh=slice_db)
+         audio_data, audio_sr = slicer.chunks2audio("tmpwav.wav", chunks)
+         audio = []
+         for (slice_tag, data) in audio_data:
+             length = int(np.ceil(len(data) / audio_sr * self.hps.data.sampling_rate))
+             raw_path = io.BytesIO()
+             soundfile.write(raw_path, data, audio_sr, format="wav")
+             raw_path.seek(0)
+             if slice_tag:
+                 _audio = np.zeros(length)
+             else:
+                 out_audio, out_sr = self.infer(chara, tran, raw_path)
+                 _audio = out_audio.cpu().numpy()
+             audio.extend(list(_audio))
+         audio = (np.array(audio) * 32768.0).astype('int16')
+         return (self.hps.data.sampling_rate, audio)
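
Note: a short sketch of how the VitsSvc wrapper above might be exposed through Gradio; inference() already accepts the (sampling_rate, ndarray) tuple that gr.Audio(type="numpy") produces. The import path, the checkpoint name "G_latest", and the component layout are assumptions for illustration only.

import gradio as gr

from inference.infer_tool_grad import VitsSvc  # import path is an assumption

svc = VitsSvc()
svc.loadCheckpoint("G_latest")  # expects checkpoints/G_latest/{config.json,model.pth}

def convert(audio, speaker, transpose, slice_db):
    # audio arrives as (sampling_rate, np.ndarray) from gr.Audio(type="numpy")
    return svc.inference(audio, speaker, int(transpose), float(slice_db))

demo = gr.Interface(
    fn=convert,
    inputs=[gr.Audio(type="numpy"), gr.Textbox(value="speaker0"),
            gr.Number(value=0), gr.Number(value=-40)],
    outputs=gr.Audio(type="numpy"),
)

if __name__ == "__main__":
    demo.launch()
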
inference/inference/slicer.py ADDED
@@ -0,0 +1,142 @@
+ import librosa
+ import torch
+ import torchaudio
+
+
+ class Slicer:
+     def __init__(self,
+                  sr: int,
+                  threshold: float = -40.,
+                  min_length: int = 5000,
+                  min_interval: int = 300,
+                  hop_size: int = 20,
+                  max_sil_kept: int = 5000):
+         if not min_length >= min_interval >= hop_size:
+             raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
+         if not max_sil_kept >= hop_size:
+             raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
+         min_interval = sr * min_interval / 1000
+         self.threshold = 10 ** (threshold / 20.)
+         self.hop_size = round(sr * hop_size / 1000)
+         self.win_size = min(round(min_interval), 4 * self.hop_size)
+         self.min_length = round(sr * min_length / 1000 / self.hop_size)
+         self.min_interval = round(min_interval / self.hop_size)
+         self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
+
+     def _apply_slice(self, waveform, begin, end):
+         if len(waveform.shape) > 1:
+             return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)]
+         else:
+             return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)]
+
+     # @timeit
+     def slice(self, waveform):
+         if len(waveform.shape) > 1:
+             samples = librosa.to_mono(waveform)
+         else:
+             samples = waveform
+         if samples.shape[0] <= self.min_length:
+             return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
+         rms_list = librosa.feature.rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
+         sil_tags = []
+         silence_start = None
+         clip_start = 0
+         for i, rms in enumerate(rms_list):
+             # Keep looping while frame is silent.
+             if rms < self.threshold:
+                 # Record start of silent frames.
+                 if silence_start is None:
+                     silence_start = i
+                 continue
+             # Keep looping while frame is not silent and silence start has not been recorded.
+             if silence_start is None:
+                 continue
+             # Clear recorded silence start if interval is not enough or clip is too short
+             is_leading_silence = silence_start == 0 and i > self.max_sil_kept
+             need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
+             if not is_leading_silence and not need_slice_middle:
+                 silence_start = None
+                 continue
+             # Need slicing. Record the range of silent frames to be removed.
+             if i - silence_start <= self.max_sil_kept:
+                 pos = rms_list[silence_start: i + 1].argmin() + silence_start
+                 if silence_start == 0:
+                     sil_tags.append((0, pos))
+                 else:
+                     sil_tags.append((pos, pos))
+                 clip_start = pos
+             elif i - silence_start <= self.max_sil_kept * 2:
+                 pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin()
+                 pos += i - self.max_sil_kept
+                 pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
+                 pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
+                 if silence_start == 0:
+                     sil_tags.append((0, pos_r))
+                     clip_start = pos_r
+                 else:
+                     sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
+                     clip_start = max(pos_r, pos)
+             else:
+                 pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
+                 pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
+                 if silence_start == 0:
+                     sil_tags.append((0, pos_r))
+                 else:
+                     sil_tags.append((pos_l, pos_r))
+                 clip_start = pos_r
+             silence_start = None
+         # Deal with trailing silence.
+         total_frames = rms_list.shape[0]
+         if silence_start is not None and total_frames - silence_start >= self.min_interval:
+             silence_end = min(total_frames, silence_start + self.max_sil_kept)
+             pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start
+             sil_tags.append((pos, total_frames + 1))
+         # Apply and return slices.
+         if len(sil_tags) == 0:
+             return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
+         else:
+             chunks = []
+             # The first silent section does not start at frame 0, so keep the leading voiced segment.
+             if sil_tags[0][0]:
+                 chunks.append(
+                     {"slice": False, "split_time": f"0,{min(waveform.shape[0], sil_tags[0][0] * self.hop_size)}"})
+             for i in range(0, len(sil_tags)):
+                 # Mark the voiced segment between silences (skipped for the first tag).
+                 if i:
+                     chunks.append({"slice": False,
+                                    "split_time": f"{sil_tags[i - 1][1] * self.hop_size},{min(waveform.shape[0], sil_tags[i][0] * self.hop_size)}"})
+                 # Mark every silent segment.
+                 chunks.append({"slice": True,
+                                "split_time": f"{sil_tags[i][0] * self.hop_size},{min(waveform.shape[0], sil_tags[i][1] * self.hop_size)}"})
+             # The last silent section does not reach the end, so keep the trailing voiced segment.
+             if sil_tags[-1][1] * self.hop_size < len(waveform):
+                 chunks.append({"slice": False, "split_time": f"{sil_tags[-1][1] * self.hop_size},{len(waveform)}"})
+             chunk_dict = {}
+             for i in range(len(chunks)):
+                 chunk_dict[str(i)] = chunks[i]
+             return chunk_dict
+
+
+ def cut(audio_path, db_thresh=-30, min_len=5000):
+     audio, sr = librosa.load(audio_path, sr=None)
+     slicer = Slicer(
+         sr=sr,
+         threshold=db_thresh,
+         min_length=min_len
+     )
+     chunks = slicer.slice(audio)
+     return chunks
+
+
+ def chunks2audio(audio_path, chunks):
+     chunks = dict(chunks)
+     audio, sr = torchaudio.load(audio_path)
+     if len(audio.shape) == 2 and audio.shape[1] >= 2:
+         audio = torch.mean(audio, dim=0).unsqueeze(0)
+     audio = audio.cpu().numpy()[0]
+     result = []
+     for k, v in chunks.items():
+         tag = v["split_time"].split(",")
+         if tag[0] != tag[1]:
+             result.append((v["slice"], audio[int(tag[0]):int(tag[1])]))
+     return result, sr
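
Note: for reference, a minimal sketch of the cut()/chunks2audio() flow used by infer_tool_grad.inference() above; the input path and threshold below are placeholders.

chunks = cut("input.wav", db_thresh=-40)          # {"0": {"slice": bool, "split_time": "begin,end"}, ...}
segments, sr = chunks2audio("input.wav", chunks)  # [(is_silence, 1-D float waveform), ...]
voiced = [seg for is_silence, seg in segments if not is_silence]
print(f"{len(segments)} segments at {sr} Hz, {sum(len(s) for s in voiced) / sr:.2f}s voiced")
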
{hubert → pretrain}/checkpoint_best_legacy_500.pt RENAMED
File without changes
pretrain/meta.py ADDED
@@ -0,0 +1,31 @@
+ def download_dict():
+     return {
+         "vec768l12": {
+             "url": "https://ibm.ent.box.com/shared/static/z1wgl1stco8ffooyatzdwsqn2psd9lrr",
+             "output": "./pretrain/checkpoint_best_legacy_500.pt"
+         },
+         "vec256l9": {
+             "url": "https://ibm.ent.box.com/shared/static/z1wgl1stco8ffooyatzdwsqn2psd9lrr",
+             "output": "./pretrain/checkpoint_best_legacy_500.pt"
+         },
+         "hubertsoft": {
+             "url": "https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt",
+             "output": "./pretrain/hubert-soft-0d54a1f4.pt"
+         },
+         "whisper-ppg": {
+             "url": "https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt",
+             "output": "./pretrain/medium.pt"
+         }
+     }
+
+
+ def get_speech_encoder(config_path="configs/config.json"):
+     import json
+
+     with open(config_path, "r") as f:
+         data = f.read()
+     config = json.loads(data)
+     speech_encoder = config["model"]["speech_encoder"]
+     model_dict = download_dict()
+
+     return model_dict[speech_encoder]["url"], model_dict[speech_encoder]["output"]
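
Note: a minimal sketch of how get_speech_encoder() might be used to fetch the matching pretrained encoder before first run. Downloading with requests is an assumption; the repository may fetch the file some other way.

import os

import requests

from pretrain.meta import get_speech_encoder

url, output = get_speech_encoder("configs/config.json")
if not os.path.exists(output):
    os.makedirs(os.path.dirname(output), exist_ok=True)
    with requests.get(url, stream=True, timeout=60) as resp:
        resp.raise_for_status()
        with open(output, "wb") as f:
            for block in resp.iter_content(chunk_size=1 << 20):
                f.write(block)
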
pretrain/nsf_hifigan/config.json ADDED
@@ -0,0 +1,38 @@
+ {
+     "resblock": "1",
+     "num_gpus": 4,
+     "batch_size": 10,
+     "learning_rate": 0.0002,
+     "adam_b1": 0.8,
+     "adam_b2": 0.99,
+     "lr_decay": 0.999,
+     "seed": 1234,
+
+     "upsample_rates": [8, 8, 2, 2, 2],
+     "upsample_kernel_sizes": [16, 16, 4, 4, 4],
+     "upsample_initial_channel": 512,
+     "resblock_kernel_sizes": [3, 7, 11],
+     "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+     "discriminator_periods": [3, 5, 7, 11, 17, 23, 37],
+
+     "segment_size": 16384,
+     "num_mels": 128,
+     "num_freq": 1025,
+     "n_fft": 2048,
+     "hop_size": 512,
+     "win_size": 2048,
+
+     "sampling_rate": 44100,
+
+     "fmin": 40,
+     "fmax": 16000,
+     "fmax_for_loss": null,
+
+     "num_workers": 16,
+
+     "dist_config": {
+         "dist_backend": "nccl",
+         "dist_url": "tcp://localhost:54321",
+         "world_size": 1
+     }
+ }
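
Note: the vocoder settings above imply a frame rate of 44100 / 512 ≈ 86.1 frames per second, and the upsample_rates multiply out to 8*8*2*2*2 = 512, matching hop_size. A small sanity check, assuming the file sits at pretrain/nsf_hifigan/config.json as added here:

import json
from math import prod

with open("pretrain/nsf_hifigan/config.json") as f:
    cfg = json.load(f)

assert prod(cfg["upsample_rates"]) == cfg["hop_size"], "upsample rates must multiply to hop_size"
print(f"frame rate: {cfg['sampling_rate'] / cfg['hop_size']:.2f} fps, "
      f"mel bins: {cfg['num_mels']}, fmax: {cfg['fmax']} Hz")
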