soiz1 committed
Commit dce4caf · verified · 1 Parent(s): a812692

Update app.py

Files changed (1):
  1. app.py +44 -38
app.py CHANGED
@@ -1,3 +1,4 @@
+import os
 import spaces
 import gradio as gr
 import torch
@@ -335,41 +336,46 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
     ).export(format="mp3", bitrate=bitrate).read()
     yield mp3_bytes, None
 
-
-if __name__ == "__main__":
-    description = ("State-of-the-Art zero-shot voice conversion/singing voice conversion. For local deployment please check [GitHub repository](https://github.com/Plachtaa/seed-vc) "
-                   "for details and updates.<br>Note that any reference audio will be forcefully clipped to 25s if beyond this length.<br> "
-                   "If total duration of source and reference audio exceeds 30s, source audio will be processed in chunks.<br> "
-                   "无需训练的 zero-shot 语音/歌声转换模型,若需本地部署查看[GitHub页面](https://github.com/Plachtaa/seed-vc)<br>"
-                   "请注意,参考音频若超过 25 秒,则会被自动裁剪至此长度。<br>若源音频和参考音频的总时长超过 30 秒,源音频将被分段处理。")
-    inputs = [
-        gr.Audio(type="filepath", label="Source Audio / 源音频"),
-        gr.Audio(type="filepath", label="Reference Audio / 参考音频"),
-        gr.Slider(minimum=1, maximum=200, value=25, step=1, label="Diffusion Steps / 扩散步数", info="25 by default, 50~100 for best quality / 默认为 25,50~100 为最佳质量"),
-        gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Length Adjust / 长度调整", info="<1.0 for speed-up speech, >1.0 for slow-down speech / <1.0 加速语速,>1.0 减慢语速"),
-        gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="Inference CFG Rate", info="has subtle influence / 有微小影响"),
-        gr.Checkbox(label="Use F0 conditioned model / 启用F0输入", value=False, info="Must set to true for singing voice conversion / 歌声转换时必须勾选"),
-        gr.Checkbox(label="Auto F0 adjust / 自动F0调整", value=True,
-                    info="Roughly adjust F0 to match target voice. Only works when F0 conditioned model is used. / 粗略调整 F0 以匹配目标音色,仅在勾选 '启用F0输入' 时生效"),
-        gr.Slider(label='Pitch shift / 音调变换', minimum=-24, maximum=24, step=1, value=0, info="Pitch shift in semitones, only works when F0 conditioned model is used / 半音数的音高变换,仅在勾选 '启用F0输入' 时生效"),
-    ]
-
-    examples = [["examples/source/yae_0.wav", "examples/reference/dingzhen_0.wav", 25, 1.0, 0.7, False, True, 0],
-                ["examples/source/jay_0.wav", "examples/reference/azuma_0.wav", 25, 1.0, 0.7, False, True, 0],
-                ["examples/source/Wiz Khalifa,Charlie Puth - See You Again [vocals]_[cut_28sec].wav",
-                 "examples/reference/kobe_0.wav", 50, 1.0, 0.7, True, False, -6],
-                ["examples/source/TECHNOPOLIS - 2085 [vocals]_[cut_14sec].wav",
-                 "examples/reference/trump_0.wav", 50, 1.0, 0.7, True, False, -12],
-                ]
-
-    outputs = [gr.Audio(label="Stream Output Audio / 流式输出", streaming=True, format='mp3'),
-               gr.Audio(label="Full Output Audio / 完整输出", streaming=False, format='wav')]
-
-    gr.Interface(fn=voice_conversion,
-                 description=description,
-                 inputs=inputs,
-                 outputs=outputs,
-                 title="Seed Voice Conversion",
-                 examples=examples,
-                 cache_examples=False,
-                 ).launch()
+default_dir = "/default"
+reference_files = [
+    ("ずんだもん", "zundamon"),
+    ("四国めたん", "sikokumetan"),
+    ("春日部つむぎ", "kasukabetsumugi"),
+    ("雨晴はう", "ameharehau"),
+    ("波音リツ", "namineritsu"),
+]
+
+def proxy_voice_conversion(source, selected_reference, diffusion_steps, length_adjust, inference_cfg_rate, f0_condition, auto_f0_adjust, pitch_shift):
+    reference_path = os.path.join(default_dir, f"{selected_reference}.mp3")
+    return voice_conversion(source, reference_path, diffusion_steps, length_adjust, inference_cfg_rate, f0_condition, auto_f0_adjust, pitch_shift)
+
+gallery_items = [[os.path.join(default_dir, f"{filename}.png"), name, filename] for name, filename in reference_files]
+
+description = ("Zero-shot音声変換モデル(学習不要)。ローカルでの利用方法は[GitHubリポジトリ](https://github.com/Plachtaa/seed-vc)をご覧ください。"
+               "参考音声が25秒を超える場合、自動的に25秒にクリップされます。"
+               "また、元音声と参考音声の合計時間が30秒を超える場合、元音声は分割処理されます。")
+
+inputs = [
+    gr.Audio(type="filepath", label="元音声"),
+    gr.Gallery(label="参照音声を選択", value=gallery_items, columns=5, interactive=True),
+    gr.Slider(minimum=1, maximum=200, value=10, step=1, label="拡散ステップ数", info="デフォルトは10、50~100が最適な品質"),
+    gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="長さ調整", info="1.0未満で速度を上げ、1.0以上で速度を遅くします"),
+    gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="推論CFG率", info="わずかな影響があります"),
+    gr.Checkbox(label="F0条件付きモデルを使用", value=False, info="歌声変換には必須です"),
+    gr.Checkbox(label="F0自動調整", value=True, info="F0をおおよそ調整して目標音声に合わせます。F0条件付きモデル使用時にのみ有効です"),
+    gr.Slider(label='音程変換', minimum=-24, maximum=24, step=1, value=0, info="半音単位の音程変換。F0条件付きモデル使用時にのみ有効です"),
+]
+
+outputs = [
+    gr.Audio(label="ストリーム出力音声", streaming=True, format='mp3'),
+    gr.Audio(label="完全出力音声", streaming=False, format='wav')
+]
+
+gr.Interface(
+    fn=proxy_voice_conversion,
+    description=description,
+    inputs=inputs,
+    outputs=outputs,
+    title="Seed Voice Conversion with Reference Gallery",
+    cache_examples=False,
+).launch()
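
The new UI keys the conversion on whichever gallery thumbnail the user picks: proxy_voice_conversion maps the chosen entry's filename to an .mp3 under default_dir and forwards everything else to voice_conversion unchanged. For reference only, and not part of this commit, here is a minimal sketch (assuming Gradio 4.x and that /default holds the matching .png/.mp3 pairs, as the diff implies) of one common way to capture a single Gallery selection via its select event and gr.SelectData and map it back to a reference_files entry:

import os
import gradio as gr

default_dir = "/default"   # same constant the commit introduces (assumed layout)
reference_files = [        # same (display name, filename) pairs as in the diff
    ("ずんだもん", "zundamon"),
    ("四国めたん", "sikokumetan"),
]

with gr.Blocks() as demo:
    # Thumbnails are (image, caption) pairs built from the reference list.
    gallery = gr.Gallery(
        value=[(os.path.join(default_dir, f"{f}.png"), name) for name, f in reference_files],
        label="参照音声を選択",
        columns=5,
    )
    selected = gr.Textbox(label="selected reference filename")

    def on_select(evt: gr.SelectData):
        # evt.index is the position of the clicked thumbnail; the matching
        # filename is what proxy_voice_conversion would turn into a path.
        return reference_files[evt.index][1]

    gallery.select(on_select, inputs=None, outputs=selected)

demo.launch()

This is an illustration of the selection-to-filename mapping only; the committed app wires the Gallery directly into gr.Interface rather than through a select event.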