Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -316,20 +316,22 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
|
|
316 |
|
317 |
|
318 |
if __name__ == "__main__":
|
319 |
-
description = ("Zero-shot voice conversion with in-context learning. Check out our [GitHub repository](https://github.com/Plachtaa/seed-vc) "
|
320 |
"for details and updates.<br>Note that any reference audio will be forcefully clipped to 25s if beyond this length.<br> "
|
321 |
-
"If total duration of source and reference audio exceeds 30s, source audio will be processed in chunks.")
|
|
|
|
|
322 |
inputs = [
|
323 |
-
gr.Audio(type="filepath", label="Source Audio"),
|
324 |
-
gr.Audio(type="filepath", label="Reference Audio"),
|
325 |
-
gr.Slider(minimum=1, maximum=200, value=10, step=1, label="Diffusion Steps", info="10 by default, 50~100 for best quality"),
|
326 |
-
gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Length Adjust", info="<1.0 for speed-up speech, >1.0 for slow-down speech"),
|
327 |
-
gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="Inference CFG Rate", info="has subtle influence"),
|
328 |
-
gr.Slider(minimum=1, maximum=3, step=1, value=3, label="N Quantizers", info="the less quantizer used, the less prosody of source audio is preserved"),
|
329 |
-
gr.Checkbox(label="Use F0 conditioned model", value=False, info="Must set to true for singing voice conversion"),
|
330 |
-
gr.Checkbox(label="Auto F0 adjust", value=True,
|
331 |
-
info="Roughly adjust F0 to match target voice. Only works when F0 conditioned model is used."),
|
332 |
-
gr.Slider(label='Pitch shift', minimum=-24, maximum=24, step=1, value=0, info="Pitch shift in semitones, only works when F0 conditioned model is used"),
|
333 |
]
|
334 |
|
335 |
examples = [["examples/source/yae_0.wav", "examples/reference/dingzhen_0.wav", 25, 1.0, 0.7, 1, False, True, 0],
|
@@ -340,8 +342,8 @@ if __name__ == "__main__":
|
|
340 |
"examples/reference/trump_0.wav", 50, 1.0, 0.7, 3, True, False, -12],
|
341 |
]
|
342 |
|
343 |
-
outputs = [gr.Audio(label="Stream Output Audio", streaming=True, format='mp3'),
|
344 |
-
gr.Audio(label="Full Output Audio", streaming=False, format='wav')]
|
345 |
|
346 |
gr.Interface(fn=voice_conversion,
|
347 |
description=description,
|
@@ -350,4 +352,4 @@ if __name__ == "__main__":
|
|
350 |
title="Seed Voice Conversion",
|
351 |
examples=examples,
|
352 |
cache_examples=False,
|
353 |
-
).launch()
|
|
|
316 |
|
317 |
|
318 |
if __name__ == "__main__":
|
319 |
+
description = ("Zero-shot voice conversion with in-context learning. For local deployment please check [GitHub repository](https://github.com/Plachtaa/seed-vc) "
|
320 |
"for details and updates.<br>Note that any reference audio will be forcefully clipped to 25s if beyond this length.<br> "
|
321 |
+
"If total duration of source and reference audio exceeds 30s, source audio will be processed in chunks.<br> "
|
322 |
+
"无需训练的 zero-shot 语音/歌声转换模型,若需本地部署查看[GitHub页面](https://github.com/Plachtaa/seed-vc)<br>"
|
323 |
+
"请注意,参考音频若超过 25 秒,则会被自动裁剪至此长度。<br>若源音频和参考音频的总时长超过 30 秒,源音频将被分段处理。")
|
324 |
inputs = [
|
325 |
+
gr.Audio(type="filepath", label="Source Audio / 源音频"),
|
326 |
+
gr.Audio(type="filepath", label="Reference Audio / 参考音频"),
|
327 |
+
gr.Slider(minimum=1, maximum=200, value=10, step=1, label="Diffusion Steps / 扩散步数", info="10 by default, 50~100 for best quality / 默认为 10,50~100 为最佳质量"),
|
328 |
+
gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Length Adjust / 长度调整", info="<1.0 for speed-up speech, >1.0 for slow-down speech / <1.0 加速语速,>1.0 减慢语速"),
|
329 |
+
gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="Inference CFG Rate", info="has subtle influence / 有微小影响"),
|
330 |
+
gr.Slider(minimum=1, maximum=3, step=1, value=3, label="N FAcodec Quantizers / FAcodec码本数量", info="the less FAcodec quantizer used, the less prosody of source audio is preserved / 使用的FAcodec码本越少,源音频的韵律保留越少"),
|
331 |
+
gr.Checkbox(label="Use F0 conditioned model / 启用F0输入", value=False, info="Must set to true for singing voice conversion / 歌声转换时必须勾选"),
|
332 |
+
gr.Checkbox(label="Auto F0 adjust / 自动F0调整", value=True,
|
333 |
+
info="Roughly adjust F0 to match target voice. Only works when F0 conditioned model is used. / 粗略调整 F0 以匹配目标音色,仅在勾选 '启用F0输入' 时生效"),
|
334 |
+
gr.Slider(label='Pitch shift / 音调变换', minimum=-24, maximum=24, step=1, value=0, info="Pitch shift in semitones, only works when F0 conditioned model is used / 半音数的音高变换,仅在勾选 '启用F0输入' 时生效"),
|
335 |
]
|
336 |
|
337 |
examples = [["examples/source/yae_0.wav", "examples/reference/dingzhen_0.wav", 25, 1.0, 0.7, 1, False, True, 0],
|
|
|
342 |
"examples/reference/trump_0.wav", 50, 1.0, 0.7, 3, True, False, -12],
|
343 |
]
|
344 |
|
345 |
+
outputs = [gr.Audio(label="Stream Output Audio / 流式输出", streaming=True, format='mp3'),
|
346 |
+
gr.Audio(label="Full Output Audio / 完整输出", streaming=False, format='wav')]
|
347 |
|
348 |
gr.Interface(fn=voice_conversion,
|
349 |
description=description,
|
|
|
352 |
title="Seed Voice Conversion",
|
353 |
examples=examples,
|
354 |
cache_examples=False,
|
355 |
+
).launch()
|