Update app.py
app.py
CHANGED
@@ -9,7 +9,7 @@ import numpy as np
 from pydub import AudioSegment
 import spaces

-#
+# Load model and config
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

 dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
@@ -22,7 +22,7 @@ model = build_model(model_params, stage='DiT')
 hop_length = config['preprocess_params']['spect_params']['hop_length']
 sr = config['preprocess_params']['sr']

-#
+# Load checkpoint
 model, _, _, _ = load_checkpoint(model, None, dit_checkpoint_path,
 load_only_params=True, ignore_modules=[], is_distributed=False)
 for key in model:
@@ -30,7 +30,7 @@ for key in model:
 model[key].to(device)
 model.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)

-#
+# Load additional modules
 from modules.campplus.DTDNN import CAMPPlus

 campplus_ckpt_path = load_custom_model_from_hf("funasr/campplus", "campplus_cn_common.bin", config_filename=None)
@@ -55,7 +55,7 @@ from modules.bigvgan import bigvgan

 bigvgan_model = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_v2_22khz_80band_256x', use_cuda_kernel=False)

-#
+# Remove weight norm from the model and set it to eval mode
 bigvgan_model.remove_weight_norm()
 bigvgan_model = bigvgan_model.eval().to(device)

@@ -79,7 +79,7 @@ elif speech_tokenizer_type == 'facodec':
 _ = [codec_encoder[key].eval() for key in codec_encoder]
 _ = [codec_encoder[key].to(device) for key in codec_encoder]

-#
+# Generate mel spectrograms
 mel_fn_args = {
 "n_fft": config['preprocess_params']['spect_params']['n_fft'],
 "win_size": config['preprocess_params']['spect_params']['win_length'],
@@ -105,7 +105,7 @@ from modules.audio import mel_spectrogram
 to_mel = lambda x: mel_spectrogram(x, **mel_fn_args)
 to_mel_f0 = lambda x: mel_spectrogram(x, **mel_fn_args_f0)

-# f0
+# f0-conditioned model
 dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
 "DiT_seed_v2_uvit_facodec_small_wavenet_f0_bigvgan_pruned.pth",
 "config_dit_mel_seed_facodec_small_wavenet_f0.yml")
@@ -116,7 +116,7 @@ model_f0 = build_model(model_params, stage='DiT')
 hop_length = config['preprocess_params']['spect_params']['hop_length']
 sr = config['preprocess_params']['sr']

-#
+# Load checkpoint
 model_f0, _, _, _ = load_checkpoint(model_f0, None, dit_checkpoint_path,
 load_only_params=True, ignore_modules=[], is_distributed=False)
 for key in model_f0:
@@ -124,7 +124,7 @@ for key in model_f0:
 model_f0[key].to(device)
 model_f0.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)

-# f0
+# f0 extractor
 from modules.rmvpe import RMVPE

 model_path = load_custom_model_from_hf("lj1995/VoiceConversionWebUI", "rmvpe.pt", None)
@@ -140,7 +140,7 @@ def crossfade(chunk1, chunk2, overlap):
 chunk2[:overlap] = chunk2[:overlap] * fade_in + chunk1[-overlap:] * fade_out
 return chunk2

-#
+# Streaming and chunk-processing parameters
 max_context_window = sr // hop_length * 30
 overlap_frame_len = 64
 overlap_wave_len = overlap_frame_len * hop_length
@@ -152,19 +152,19 @@ bitrate = "320k"
 def voice_conversion(source, target, diffusion_steps, length_adjust, inference_cfg_rate, n_quantizers, f0_condition, auto_f0_adjust, pitch_shift):
 inference_module = model if not f0_condition else model_f0
 mel_fn = to_mel if not f0_condition else to_mel_f0
-#
+# Load audio
 source_audio = librosa.load(source, sr=sr)[0]
 ref_audio = librosa.load(target, sr=sr)[0]

-#
+# Process audio
 source_audio = torch.tensor(source_audio).unsqueeze(0).float().to(device)
 ref_audio = torch.tensor(ref_audio[:sr * 25]).unsqueeze(0).float().to(device)

-#
+# Resample
 source_waves_16k = torchaudio.functional.resample(source_audio, sr, 16000)
 ref_waves_16k = torchaudio.functional.resample(ref_audio, sr, 16000)

-#
+# Extract features
 if speech_tokenizer_type == 'cosyvoice':
 S_alt = cosyvoice_frontend.extract_speech_token(source_waves_16k)[0]
 S_ori = cosyvoice_frontend.extract_speech_token(ref_waves_16k)[0]
@@ -189,7 +189,7 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_cfg_rate, n_quantizers, f0_condition, auto_f0_adjust, pitch_shift):
 S_alt_chunks.append(S_alt)
 S_alt = torch.cat(S_alt_chunks, dim=-1)

-# S_ori
+# S_ori should be extracted in the same way
 waves_24k = torchaudio.functional.resample(ref_audio, sr, 24000)
 waves_input = waves_24k.unsqueeze(1)
 z = codec_encoder.encoder(waves_input)
@@ -235,7 +235,7 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_cfg_rate, n_quantizers, f0_condition, auto_f0_adjust, pitch_shift):
 # mean_log_f0_ori = torch.mean(voiced_log_f0_ori)
 # mean_log_f0_alt = torch.mean(voiced_log_f0_alt)

-#
+# Shift the alt log f0 level to the ori log f0 level
 shifted_log_f0_alt = log_f0_alt.clone()
 if auto_f0_adjust:
 shifted_log_f0_alt[F0_alt > 1] = log_f0_alt[F0_alt > 1] - median_log_f0_alt + median_log_f0_ori
@@ -247,20 +247,20 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_cfg_rate, n_quantizers, f0_condition, auto_f0_adjust, pitch_shift):
 F0_alt = None
 shifted_f0_alt = None

-#
+# Length regulation
 cond = inference_module.length_regulator(S_alt, ylens=target_lengths, n_quantizers=int(n_quantizers), f0=shifted_f0_alt)[0]
 prompt_condition = inference_module.length_regulator(S_ori, ylens=target2_lengths, n_quantizers=int(n_quantizers), f0=F0_ori)[0]

 max_source_window = max_context_window - mel2.size(2)
-#
+# Split the source condition (cond) into chunks
 processed_frames = 0
 generated_wave_chunks = []
-#
+# Generate chunk by chunk and stream the output
 while processed_frames < cond.size(1):
 chunk_cond = cond[:, processed_frames:processed_frames + max_source_window]
 is_last_chunk = processed_frames + max_source_window >= cond.size(1)
 cat_condition = torch.cat([prompt_condition, chunk_cond], dim=1)
-#
+# Voice conversion
 vc_target = inference_module.cfm.inference(cat_condition,
 torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
 mel2, style2, None, diffusion_steps,
@@ -316,19 +316,19 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_cfg_rate, n_quantizers, f0_condition, auto_f0_adjust, pitch_shift):


 if __name__ == "__main__":
-description = ("Please upload reference audio within 25 seconds, and under 30 seconds at most."
+description = ("Please upload reference audio within 25 seconds, and under 30 seconds at most.")

 inputs = [
-gr.Audio(type="filepath", label="
+gr.Audio(type="filepath", label="Upload source audio"),
-gr.Audio(type="filepath", label="
+gr.Audio(type="filepath", label="Upload reference voice"),
-gr.Slider(minimum=1, maximum=200, value=10, step=1, label="
+gr.Slider(minimum=1, maximum=200, value=10, step=1, label="Diffusion steps", info="10 by default; 50~100 for best quality"),
-gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="
+gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Length adjust", info="<1.0 for faster speech, >1.0 for slower speech"),
-gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="
+gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="Inference CFG rate", info="has a subtle effect"),
-gr.Slider(minimum=1, maximum=3, step=1, value=3, label="
+gr.Slider(minimum=1, maximum=3, step=1, value=3, label="Number of FAcodec quantizers", info="the fewer quantizers used, the more of the source audio's prosody is preserved"),
-gr.Checkbox(label="
+gr.Checkbox(label="Use F0-conditioned model", value=False, info="Must be checked for singing voice conversion"),
-gr.Checkbox(label="
+gr.Checkbox(label="Auto F0 adjust", value=True,
-info="
+info="Roughly adjusts F0 to match the target voice. Only works when the F0-conditioned model is used"),
-gr.Slider(label='
+gr.Slider(label='Pitch shift', minimum=-24, maximum=24, step=1, value=0, info="Pitch shift in semitones; only works when the F0-conditioned model is used"),
 ]

 examples = [["examples/source/yae_0.wav", "examples/reference/dingzhen_0.wav", 25, 1.0, 0.7, 1, False, True, 0],
@@ -339,14 +339,14 @@ if __name__ == "__main__":
 "examples/reference/trump_0.wav", 50, 1.0, 0.7, 1, True, False, -12],
 ]

-outputs = [gr.Audio(label="
+outputs = [gr.Audio(label="Streaming output audio", streaming=True, format='mp3'),
-gr.Audio(label="
+gr.Audio(label="Full output audio", streaming=False, format='wav')]

 gr.Interface(fn=voice_conversion,
 description=description,
 inputs=inputs,
 outputs=outputs,
-title="Seed
+title="Seed Voice Conversion",
 examples=examples,
 cache_examples=False,
-).launch()
+).launch()
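For context on the streaming loop touched by this diff: the crossfade helper and overlap_wave_len are used to stitch consecutively generated chunks into one continuous waveform. Below is a minimal, self-contained sketch of that stitching pattern, assuming linear fade ramps and a hypothetical stitch helper of my own; the app itself yields each chunk incrementally rather than collecting them in a list.

import numpy as np

def crossfade(chunk1: np.ndarray, chunk2: np.ndarray, overlap: int) -> np.ndarray:
    # Linear ramps are an assumption here; the app defines fade_in/fade_out elsewhere.
    fade_out = np.linspace(1.0, 0.0, overlap)
    fade_in = np.linspace(0.0, 1.0, overlap)
    chunk2 = chunk2.copy()
    # Blend the tail of the previous chunk into the head of the current one.
    chunk2[:overlap] = chunk2[:overlap] * fade_in + chunk1[-overlap:] * fade_out
    return chunk2

def stitch(chunks: list[np.ndarray], overlap: int) -> np.ndarray:
    # Hypothetical helper: keep the first chunk minus its overlapping tail, then
    # append each crossfaded successor, trimming the tail of every chunk except the last.
    out = [chunks[0][:-overlap]]
    for prev, cur in zip(chunks, chunks[1:]):
        faded = crossfade(prev, cur, overlap)
        out.append(faded if cur is chunks[-1] else faded[:-overlap])
    return np.concatenate(out)

if __name__ == "__main__":
    # Toy usage: two chunks of a sine wave sharing a 64-sample overlap
    # (cf. overlap_wave_len = overlap_frame_len * hop_length in the app).
    t = np.arange(1024) / 16000
    wave = np.sin(2 * np.pi * 220 * t)
    a, b = wave[:576], wave[512:]
    print(stitch([a, b], overlap=64).shape)  # -> (1024,)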