ArkanDash committed on
Commit
142eebc
·
1 Parent(s): 8735d68

feat: added v2 support

Browse files
Files changed (7) hide show
  1. README.md +1 -1
  2. app.py +373 -77
  3. config.py +18 -12
  4. infer_pack/models.py +177 -35
  5. infer_pack/models_onnx.py +76 -18
  6. requirements.txt +21 -41
  7. vc_infer_pipeline.py +130 -21
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🎤
4
  colorFrom: red
5
  colorTo: purple
6
  sdk: gradio
7
- sdk_version: 3.32.0
8
  app_file: app.py
9
  pinned: true
10
  license: mit
 
4
  colorFrom: red
5
  colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 3.34.0
8
  app_file: app.py
9
  pinned: true
10
  license: mit
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import os
2
  import glob
3
  import json
4
- import argparse
5
  import traceback
6
  import logging
7
  import gradio as gr
@@ -10,37 +9,48 @@ import librosa
10
  import torch
11
  import asyncio
12
  import edge_tts
 
 
 
 
 
 
13
  from datetime import datetime
14
  from fairseq import checkpoint_utils
15
- from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono
 
 
 
 
 
16
  from vc_infer_pipeline import VC
17
  from config import Config
18
  config = Config()
19
  logging.getLogger("numba").setLevel(logging.WARNING)
20
- limitation = os.getenv("SYSTEM") == "spaces" # limit audio length in huggingface spaces
21
 
22
  def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index):
23
  def vc_fn(
24
- input_audio,
 
 
 
25
  f0_up_key,
 
26
  f0_method,
27
  index_rate,
28
- tts_mode,
29
- tts_text,
30
- tts_voice
 
31
  ):
32
  try:
33
- if tts_mode:
34
- if len(tts_text) > 100 and limitation:
35
- return "Text is too long", None
36
- if tts_text is None or tts_voice is None:
37
- return "You need to enter text and select a voice", None
38
- asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
39
- audio, sr = librosa.load("tts.mp3", sr=16000, mono=True)
40
- else:
41
- if input_audio is None:
42
  return "You need to upload an audio", None
43
- sampling_rate, audio = input_audio
44
  duration = audio.shape[0] / sampling_rate
45
  if duration > 20 and limitation:
46
  return "Please upload an audio file that is less than 20 seconds. If you need to generate a longer audio file, please use Colab.", None
@@ -49,31 +59,102 @@ def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index):
49
  audio = librosa.to_mono(audio.transpose(1, 0))
50
  if sampling_rate != 16000:
51
  audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
 
 
 
 
 
 
 
52
  times = [0, 0, 0]
53
  f0_up_key = int(f0_up_key)
54
  audio_opt = vc.pipeline(
55
  hubert_model,
56
  net_g,
57
- 0,
58
  audio,
 
59
  times,
60
  f0_up_key,
61
  f0_method,
62
  file_index,
63
  index_rate,
64
  if_f0,
 
 
 
 
 
 
65
  f0_file=None,
66
  )
67
- print(
68
- f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
69
- )
70
- return (tgt_sr, audio_opt)
71
  except:
72
  info = traceback.format_exc()
73
  print(info)
74
  return info, (None, None)
75
  return vc_fn
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  def load_hubert():
78
  global hubert_model
79
  models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
@@ -88,11 +169,107 @@ def load_hubert():
88
  hubert_model = hubert_model.float()
89
  hubert_model.eval()
90
 
91
- def change_to_tts_mode(tts_mode):
92
- if tts_mode:
93
- return gr.Audio.update(visible=False), gr.Textbox.update(visible=True), gr.Dropdown.update(visible=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  else:
95
- return gr.Audio.update(visible=True), gr.Textbox.update(visible=False), gr.Dropdown.update(visible=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
  if __name__ == '__main__':
98
  load_hubert()
@@ -121,10 +298,19 @@ if __name__ == '__main__':
121
  tgt_sr = cpt["config"][-1]
122
  cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
123
  if_f0 = cpt.get("f0", 1)
124
- if if_f0 == 1:
125
- net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
126
- else:
127
- net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
 
 
 
 
 
 
 
 
 
128
  del net_g.enc_q
129
  print(net_g.load_state_dict(cpt["weight"], strict=False))
130
  net_g.eval().to(config.device)
@@ -134,18 +320,13 @@ if __name__ == '__main__':
134
  net_g = net_g.float()
135
  vc = VC(tgt_sr, config)
136
  print(f"Model loaded: {model_name}")
137
- models.append((model_name, model_title, model_author, model_cover, create_vc_fn(tgt_sr, net_g, vc, if_f0, model_index)))
138
  categories.append([category_title, category_folder, description, models])
139
  with gr.Blocks() as app:
140
  gr.Markdown(
141
- "# <center> RVC Genshin Impact\n"
142
- "## <center> The input audio should be clean and pure voice without background music.\n"
143
- "## <center> [Recommended to use google colab to use all genshin model & feature](https://colab.research.google.com/drive/110kiMZTdP6Ri1lY9-NbQf17GVPPhHyeT?usp=sharing)\n"
144
- "### <center> I limit the number of models to 15 due to an error caused by exceeding the available memory. (16 GB limit)\n"
145
- "### <center> This project was inspired by [zomehwh](https://huggingface.co/spaces/zomehwh/rvc-models) and [ardha27](https://huggingface.co/spaces/ardha27/rvc-models)\n"
146
- "[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/110kiMZTdP6Ri1lY9-NbQf17GVPPhHyeT?usp=sharing)\n\n"
147
- "[![Original RVC Repo](https://badgen.net/badge/icon/github?icon=github&label=Original%20Repo)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)"
148
- "[![RVC Inference Repo](https://badgen.net/badge/icon/github?icon=github&label)](https://github.com/ArkanDash/rvc-inference)"
149
  )
150
  for (folder_title, folder, description, models) in categories:
151
  with gr.TabItem(folder_title):
@@ -154,44 +335,159 @@ if __name__ == '__main__':
154
  with gr.Tabs():
155
  if not models:
156
  gr.Markdown("# <center> No Model Loaded.")
157
- gr.Markdown("## <center> Please added the model or fix your model path.")
158
  continue
159
- with gr.Tabs():
160
- for (name, title, author, cover, vc_fn) in models:
161
- with gr.TabItem(name):
162
- with gr.Row():
163
- gr.Markdown(
164
- '<div align="center">'
165
- f'<div>{title}</div>\n'+
166
- (f'<div>Model author: {author}</div>' if author else "")+
167
- (f'<img style="width:auto;height:300px;" src="file/{cover}">' if cover else "")+
168
- '</div>'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  )
170
- with gr.Row():
171
- with gr.Column():
172
- vc_input = gr.Audio(label="Input audio"+' (less than 20 seconds)' if limitation else '')
173
- vc_transpose = gr.Number(label="Transpose", value=0, info='Type "12" to change from male to female voice. Type "-12" to change female to male voice')
174
- vc_f0method = gr.Radio(
175
- label="Pitch extraction algorithm",
176
- choices=["pm", "harvest"],
177
- value="pm",
178
- interactive=True,
179
- info="PM is fast but Harvest is better for low frequencies. (Default: PM)"
180
- )
181
- vc_index_ratio = gr.Slider(
182
- minimum=0,
183
- maximum=1,
184
- label="Retrieval feature ratio",
185
- value=0.6,
186
- interactive=True,
187
- info="(Default: 0.6)"
188
- )
189
- tts_mode = gr.Checkbox(label="tts (use edge-tts as input)", value=False)
190
- tts_text = gr.Textbox(visible=False,label="TTS text (100 words limitation)" if limitation else "TTS text")
191
- tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female")
192
- vc_submit = gr.Button("Generate", variant="primary")
193
- with gr.Column():
194
- vc_output = gr.Audio(label="Output Audio")
195
- vc_submit.click(vc_fn, [vc_input, vc_transpose, vc_f0method, vc_index_ratio, tts_mode, tts_text, tts_voice], [vc_output])
196
- tts_mode.change(change_to_tts_mode, [tts_mode], [vc_input, tts_text, tts_voice])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  app.queue(concurrency_count=1, max_size=20, api_open=config.api).launch(share=config.colab)
 
1
  import os
2
  import glob
3
  import json
 
4
  import traceback
5
  import logging
6
  import gradio as gr
 
9
  import torch
10
  import asyncio
11
  import edge_tts
12
+ import yt_dlp
13
+ import ffmpeg
14
+ import subprocess
15
+ import sys
16
+ import io
17
+ import wave
18
  from datetime import datetime
19
  from fairseq import checkpoint_utils
20
+ from infer_pack.models import (
21
+ SynthesizerTrnMs256NSFsid,
22
+ SynthesizerTrnMs256NSFsid_nono,
23
+ SynthesizerTrnMs768NSFsid,
24
+ SynthesizerTrnMs768NSFsid_nono,
25
+ )
26
  from vc_infer_pipeline import VC
27
  from config import Config
28
  config = Config()
29
  logging.getLogger("numba").setLevel(logging.WARNING)
30
+ limitation = os.getenv("SYSTEM") == "spaces"
31
 
32
  def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index):
33
  def vc_fn(
34
+ vc_input,
35
+ vc_upload,
36
+ tts_text,
37
+ tts_voice,
38
  f0_up_key,
39
+ vc_transform,
40
  f0_method,
41
  index_rate,
42
+ filter_radius,
43
+ resample_sr,
44
+ rms_mix_rate,
45
+ protect,
46
  ):
47
  try:
48
+ if vc_audio_mode == "Input path" or "Youtube" and vc_input != "":
49
+ audio, sr = librosa.load(vc_input, sr=16000, mono=True)
50
+ elif vc_audio_mode == "Upload audio":
51
+ if vc_upload is None:
 
 
 
 
 
52
  return "You need to upload an audio", None
53
+ sampling_rate, audio = vc_upload
54
  duration = audio.shape[0] / sampling_rate
55
  if duration > 20 and limitation:
56
  return "Please upload an audio file that is less than 20 seconds. If you need to generate a longer audio file, please use Colab.", None
 
59
  audio = librosa.to_mono(audio.transpose(1, 0))
60
  if sampling_rate != 16000:
61
  audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
62
+ elif vc_audio_mode == "TTS Audio":
63
+ if len(tts_text) > 100 and limitation:
64
+ return "Text is too long", None
65
+ if tts_text is None or tts_voice is None:
66
+ return "You need to enter text and select a voice", None
67
+ asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
68
+ audio, sr = librosa.load("tts.mp3", sr=16000, mono=True)
69
  times = [0, 0, 0]
70
  f0_up_key = int(f0_up_key)
71
  audio_opt = vc.pipeline(
72
  hubert_model,
73
  net_g,
74
+ vc_transform,
75
  audio,
76
+ vc_input,
77
  times,
78
  f0_up_key,
79
  f0_method,
80
  file_index,
81
  index_rate,
82
  if_f0,
83
+ filter_radius,
84
+ tgt_sr,
85
+ resample_sr,
86
+ rms_mix_rate,
87
+ version,
88
+ protect,
89
  f0_file=None,
90
  )
91
+ info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
92
+ print(info)
93
+ return info, (tgt_sr, audio_opt)
 
94
  except:
95
  info = traceback.format_exc()
96
  print(info)
97
  return info, (None, None)
98
  return vc_fn
99
 
100
def cut_vocal_and_inst(url, audio_provider, split_model):
    """Download the audio behind ``url`` and split it into vocal/instrumental stems.

    The audio is fetched with ``yt_dlp`` (converted to WAV by its
    ``FFmpegExtractAudio`` post-processor) and then passed to the ``demucs``
    command-line tool in two-stem (vocals) mode.

    Args:
        url: Link of the media to download. Must not be an empty string.
        audio_provider: Name of the download provider. Only ``"Youtube"`` is
            implemented.
        split_model: ``"htdemucs"`` runs demucs with its default model; any
            other value runs it with ``-n mdx_extra_q``.

    Returns:
        A 4-tuple of file paths:
        ``(vocals, no_vocals, downloaded_audio, vocals)``.

    Raises:
        gr.Error: If ``url`` is empty or ``audio_provider`` is not supported.
    """
    if url == "":
        raise gr.Error("URL Required!")
    if audio_provider != "Youtube":
        # The Spotify (spotdl) path was disabled upstream because it did not
        # work. Previously this case fell through and crashed with an
        # UnboundLocalError on ``audio_path``; fail with a clear message.
        raise gr.Error("Only the Youtube provider is supported!")

    # makedirs(exist_ok=True) avoids the check-then-create race of
    # ``os.path.exists`` followed by ``os.mkdir``.
    os.makedirs("dl_audio", exist_ok=True)

    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
        }],
        "outtmpl": 'dl_audio/youtube_audio',
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    # NOTE(review): presumes the post-processor appends ".wav" to the output
    # template -- confirm against the yt_dlp version in use.
    audio_path = "dl_audio/youtube_audio.wav"

    # Build the argument list directly instead of splitting a formatted
    # string, so the command does not depend on whitespace in its parts.
    command = ["demucs", "--two-stems=vocals"]
    if split_model == "htdemucs":
        stem_dir = "output/htdemucs/youtube_audio"
    else:
        command += ["-n", "mdx_extra_q"]
        stem_dir = "output/mdx_extra_q/youtube_audio"
    command += [audio_path, "-o", "output"]

    result = subprocess.run(command, stdout=subprocess.PIPE)
    print(result.stdout.decode())

    # NOTE(review): these are the locations demucs is expected to write to
    # for the chosen model; the files are not checked for existence here.
    vocal_path = f"{stem_dir}/vocals.wav"
    inst_path = f"{stem_dir}/no_vocals.wav"
    return vocal_path, inst_path, audio_path, vocal_path
138
+
139
def combine_vocal_and_inst(audio_data, audio_volume, split_model):
    """Write the converted vocal to disk and mix it with the instrumental stem.

    The vocal is written as a mono, 16-bit WAV file and then mixed with the
    ``no_vocals.wav`` stem of the selected splitter model by invoking the
    ``ffmpeg`` command-line tool.

    Args:
        audio_data: Pair ``(sample_rate, samples)``. ``samples`` must provide
            ``tobytes()``; its bytes are written as-is as 2-byte frames.
        audio_volume: Gain applied to the vocal track, in dB.
        split_model: ``"htdemucs"`` selects the htdemucs instrumental stem;
            any other value selects the mdx_extra_q one.

    Returns:
        Path of the combined MP3 file (``output/result/combine.mp3``).
    """
    # makedirs also creates the parent "output" directory; a plain os.mkdir
    # raised FileNotFoundError when "output" did not exist yet.
    os.makedirs("output/result", exist_ok=True)
    vocal_path = "output/result/output.wav"
    output_path = "output/result/combine.mp3"
    if split_model == "htdemucs":
        inst_path = "output/htdemucs/youtube_audio/no_vocals.wav"
    else:
        inst_path = "output/mdx_extra_q/youtube_audio/no_vocals.wav"

    sample_rate, samples = audio_data[0], audio_data[1]
    with wave.open(vocal_path, "w") as wave_file:
        wave_file.setnchannels(1)
        wave_file.setsampwidth(2)
        wave_file.setframerate(sample_rate)
        wave_file.writeframes(samples.tobytes())

    # Input 0 is the instrumental, input 1 the vocal: apply the gain to the
    # vocal, then mix both for the duration of the longer input.
    mix_filter = (
        f"[1:a]volume={audio_volume}dB[v];"
        "[0:a][v]amix=inputs=2:duration=longest"
    )
    command = [
        "ffmpeg", "-y",
        "-i", inst_path,
        "-i", vocal_path,
        "-filter_complex", mix_filter,
        "-b:a", "320k",
        "-c:a", "libmp3lame",
        output_path,
    ]
    result = subprocess.run(command, stdout=subprocess.PIPE)
    print(result.stdout.decode())
    return output_path
157
+
158
  def load_hubert():
159
  global hubert_model
160
  models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
 
169
  hubert_model = hubert_model.float()
170
  hubert_model.eval()
171
 
172
def change_audio_mode(vc_audio_mode):
    """Return visibility updates for the input widgets of the selected mode.

    Args:
        vc_audio_mode: Selected input mode. Recognised values are
            ``"Input path"``, ``"Upload audio"``, ``"Youtube"`` and
            ``"TTS Audio"``; any other value is treated like
            ``"Upload audio"``.

    Returns:
        A 14-tuple of Gradio update objects, in this order: input-path
        textbox, upload audio, then ten Youtube-group components (dropdown,
        textbox, dropdown, button, three audios, slider, audio, button),
        then the TTS textbox and TTS dropdown.
    """
    path_visible = vc_audio_mode == "Input path"
    youtube_visible = vc_audio_mode == "Youtube"
    tts_visible = vc_audio_mode == "TTS Audio"
    # The upload widget is shown for "Upload audio" and also serves as the
    # fallback for any unrecognised mode.
    upload_visible = not (path_visible or youtube_visible or tts_visible)

    return (
        # Input & Upload
        gr.Textbox.update(visible=path_visible),
        gr.Audio.update(visible=upload_visible),
        # Youtube
        gr.Dropdown.update(visible=youtube_visible),
        gr.Textbox.update(visible=youtube_visible),
        gr.Dropdown.update(visible=youtube_visible),
        gr.Button.update(visible=youtube_visible),
        gr.Audio.update(visible=youtube_visible),
        gr.Audio.update(visible=youtube_visible),
        gr.Audio.update(visible=youtube_visible),
        gr.Slider.update(visible=youtube_visible),
        gr.Audio.update(visible=youtube_visible),
        gr.Button.update(visible=youtube_visible),
        # TTS
        gr.Textbox.update(visible=tts_visible),
        gr.Dropdown.update(visible=tts_visible),
    )
273
 
274
  if __name__ == '__main__':
275
  load_hubert()
 
298
  tgt_sr = cpt["config"][-1]
299
  cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
300
  if_f0 = cpt.get("f0", 1)
301
+ version = cpt.get("version", "v1")
302
+ if version == "v1":
303
+ if if_f0 == 1:
304
+ net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
305
+ else:
306
+ net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
307
+ nodel_version = "V1"
308
+ elif version == "v2":
309
+ if if_f0 == 1:
310
+ net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
311
+ else:
312
+ net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
313
+ nodel_version = "V2"
314
  del net_g.enc_q
315
  print(net_g.load_state_dict(cpt["weight"], strict=False))
316
  net_g.eval().to(config.device)
 
320
  net_g = net_g.float()
321
  vc = VC(tgt_sr, config)
322
  print(f"Model loaded: {model_name}")
323
+ models.append((model_name, model_title, model_author, model_cover, nodel_version, create_vc_fn(tgt_sr, net_g, vc, if_f0, model_index)))
324
  categories.append([category_title, category_folder, description, models])
325
  with gr.Blocks() as app:
326
  gr.Markdown(
327
+ "# <center> RVC Genshin Impact Inference\n"
328
+ "#### From [Retrieval-based-Voice-Conversion](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)\n"
329
+ "[![Original Repo](https://badgen.net/badge/icon/github?icon=github&label=Original%20Repo)](https://github.com/ArkanDash/Multi-Model-RVC-Inference)"
 
 
 
 
 
330
  )
331
  for (folder_title, folder, description, models) in categories:
332
  with gr.TabItem(folder_title):
 
335
  with gr.Tabs():
336
  if not models:
337
  gr.Markdown("# <center> No Model Loaded.")
338
+ gr.Markdown("## <center> Please add model or fix your model path.")
339
  continue
340
+ for (name, title, author, cover, model_version, vc_fn) in models:
341
+ with gr.TabItem(name):
342
+ with gr.Row():
343
+ gr.Markdown(
344
+ '<div align="center">'
345
+ f'<div>{title}</div>\n'+
346
+ f'<div>RVC {model_version} Model</div>\n'+
347
+ (f'<div>Model author: {author}</div>' if author else "")+
348
+ (f'<img style="width:auto;height:300px;" src="file/{cover}">' if cover else "")+
349
+ '</div>'
350
+ )
351
+ with gr.Row():
352
+ with gr.Column():
353
+ vc_audio_mode = gr.Dropdown(label="Input voice", choices=["Upload audio", "TTS Audio"], allow_custom_value=False, value="Upload audio")
354
+ # Input and Upload
355
+ vc_input = gr.Textbox(label="Input audio path", visible=False)
356
+ vc_upload = gr.Audio(label="Upload audio file", visible=True, interactive=True)
357
+ # Youtube
358
+ vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, visible=False, value="Youtube", info="Select provider (Default: Youtube)")
359
+ vc_link = gr.Textbox(label="Youtube URL", visible=False, info="Example: https://www.youtube.com/watch?v=Nc0sB1Bmf-A", placeholder="https://www.youtube.com/watch?v=...")
360
+ vc_split_model = gr.Dropdown(label="Splitter Model", choices=["htdemucs", "mdx_extra_q"], allow_custom_value=False, visible=False, value="htdemucs", info="Select the splitter model (Default: htdemucs)")
361
+ vc_split = gr.Button("Split Audio", variant="primary", visible=False)
362
+ vc_vocal_preview = gr.Audio(label="Vocal Preview", visible=False)
363
+ vc_inst_preview = gr.Audio(label="Instrumental Preview", visible=False)
364
+ vc_audio_preview = gr.Audio(label="Audio Preview", visible=False)
365
+ # TTS
366
+ tts_text = gr.Textbox(visible=False, label="TTS text", info="Text to speech input")
367
+ tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female")
368
+ with gr.Column():
369
+ spk_item = gr.Slider(
370
+ minimum=0,
371
+ maximum=2333,
372
+ step=1,
373
+ label="Speaker ID",
374
+ info="(Default: 0)",
375
+ value=0,
376
+ interactive=True,
377
+ )
378
+ vc_transform0 = gr.Number(label="Transpose", value=0, info='Type "12" to change from male to female voice. Type "-12" to change female to male voice')
379
+ f0method0 = gr.Radio(
380
+ label="Pitch extraction algorithm",
381
+ info="PM is fast, Harvest is good but extremely slow (Default: PM)",
382
+ choices=["pm", "harvest"],
383
+ value="pm",
384
+ interactive=True,
385
+ )
386
+ index_rate1 = gr.Slider(
387
+ minimum=0,
388
+ maximum=1,
389
+ label="Retrieval feature ratio",
390
+ info="(Default: 0.6)",
391
+ value=0.6,
392
+ interactive=True,
393
+ )
394
+ filter_radius0 = gr.Slider(
395
+ minimum=0,
396
+ maximum=7,
397
+ label="Apply Median Filtering",
398
+ info="The value represents the filter radius and can reduce breathiness.",
399
+ value=3,
400
+ step=1,
401
+ interactive=True,
402
+ )
403
+ resample_sr0 = gr.Slider(
404
+ minimum=0,
405
+ maximum=48000,
406
+ label="Resample the output audio",
407
+ info="Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling",
408
+ value=0,
409
+ step=1,
410
+ interactive=True,
411
+ )
412
+ rms_mix_rate0 = gr.Slider(
413
+ minimum=0,
414
+ maximum=1,
415
+ label="Volume Envelope",
416
+ info="Use the volume envelope of the input to replace or mix with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is used",
417
+ value=1,
418
+ interactive=True,
419
+ )
420
+ protect0 = gr.Slider(
421
+ minimum=0,
422
+ maximum=0.5,
423
+ label="Voice Protection",
424
+ info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy",
425
+ value=0.35,
426
+ step=0.01,
427
+ interactive=True,
428
+ )
429
+ with gr.Column():
430
+ vc_log = gr.Textbox(label="Output Information", interactive=False)
431
+ vc_output = gr.Audio(label="Output Audio", interactive=False)
432
+ vc_convert = gr.Button("Convert", variant="primary")
433
+ vc_volume = gr.Slider(
434
+ minimum=0,
435
+ maximum=10,
436
+ label="Vocal volume",
437
+ value=4,
438
+ interactive=True,
439
+ step=1,
440
+ info="Adjust vocal volume (Default: 4}",
441
+ visible=False
442
  )
443
+ vc_combined_output = gr.Audio(label="Output Combined Audio", visible=False)
444
+ vc_combine = gr.Button("Combine",variant="primary", visible=False)
445
+ vc_convert.click(
446
+ fn=vc_fn,
447
+ inputs=[
448
+ vc_input,
449
+ vc_upload,
450
+ tts_text,
451
+ tts_voice,
452
+ spk_item,
453
+ vc_transform0,
454
+ f0method0,
455
+ index_rate1,
456
+ filter_radius0,
457
+ resample_sr0,
458
+ rms_mix_rate0,
459
+ protect0,
460
+ ],
461
+ outputs=[vc_log ,vc_output]
462
+ )
463
+ vc_split.click(
464
+ fn=cut_vocal_and_inst,
465
+ inputs=[vc_link, vc_download_audio, vc_split_model],
466
+ outputs=[vc_vocal_preview, vc_inst_preview, vc_audio_preview]
467
+ )
468
+ vc_combine.click(
469
+ fn=combine_vocal_and_inst,
470
+ inputs=[vc_output, vc_volume, vc_split_model],
471
+ outputs=[vc_combined_output]
472
+ )
473
+ vc_audio_mode.change(
474
+ fn=change_audio_mode,
475
+ inputs=[vc_audio_mode],
476
+ outputs=[
477
+ vc_input,
478
+ vc_upload,
479
+ vc_download_audio,
480
+ vc_link,
481
+ vc_split_model,
482
+ vc_split,
483
+ vc_vocal_preview,
484
+ vc_inst_preview,
485
+ vc_audio_preview,
486
+ vc_volume,
487
+ vc_combined_output,
488
+ vc_combine,
489
+ tts_text,
490
+ tts_voice
491
+ ]
492
+ )
493
  app.queue(concurrency_count=1, max_size=20, api_open=config.api).launch(share=config.colab)
config.py CHANGED
@@ -3,6 +3,18 @@ import torch
3
  from multiprocessing import cpu_count
4
 
5
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  class Config:
7
  def __init__(self):
8
  self.device = "cuda:0"
@@ -36,7 +48,7 @@ class Config:
36
  action="store_true",
37
  help="Do not open in browser automatically",
38
  )
39
- parser.add_argument('--api', action="store_true", default=False)
40
  cmd_opts = parser.parse_args()
41
 
42
  cmd_opts.port = cmd_opts.port if 0 <= cmd_opts.port <= 65535 else 7865
@@ -47,7 +59,7 @@ class Config:
47
  cmd_opts.colab,
48
  cmd_opts.noparallel,
49
  cmd_opts.noautoopen,
50
- cmd_opts.api,
51
  )
52
 
53
  def device_config(self) -> tuple:
@@ -63,15 +75,7 @@ class Config:
63
  ):
64
  print("16系/10系显卡和P40强制单精度")
65
  self.is_half = False
66
- for config_file in ["32k.json", "40k.json", "48k.json"]:
67
- with open(f"configs/{config_file}", "r") as f:
68
- strr = f.read().replace("true", "false")
69
- with open(f"configs/{config_file}", "w") as f:
70
- f.write(strr)
71
- with open("trainset_preprocess_pipeline_print.py", "r") as f:
72
- strr = f.read().replace("3.7", "3.0")
73
- with open("trainset_preprocess_pipeline_print.py", "w") as f:
74
- f.write(strr)
75
  else:
76
  self.gpu_name = None
77
  self.gpu_mem = int(
@@ -90,10 +94,12 @@ class Config:
90
  print("没有发现支持的N卡, 使用MPS进行推理")
91
  self.device = "mps"
92
  self.is_half = False
 
93
  else:
94
  print("没有发现支持的N卡, 使用CPU进行推理")
95
  self.device = "cpu"
96
  self.is_half = False
 
97
 
98
  if self.n_cpu == 0:
99
  self.n_cpu = cpu_count()
@@ -117,4 +123,4 @@ class Config:
117
  x_center = 30
118
  x_max = 32
119
 
120
- return x_pad, x_query, x_center, x_max
 
3
  from multiprocessing import cpu_count
4
 
5
 
6
def config_file_change_fp32():
    """Rewrite project files in place for single-precision (fp32) use.

    Replaces every ``"true"`` with ``"false"`` in the three JSON files under
    ``configs/`` and every ``"3.7"`` with ``"3.0"`` in
    ``trainset_preprocess_pipeline_print.py``. Paths are relative to the
    current working directory; a missing file raises ``FileNotFoundError``
    from ``open``.
    """

    def _substitute(path, old, new):
        # Read the whole file first, then reopen for writing, so the file is
        # only truncated after its content has been read successfully.
        with open(path, "r") as src:
            patched = src.read().replace(old, new)
        with open(path, "w") as dst:
            dst.write(patched)

    for rate in ("32k", "40k", "48k"):
        _substitute(f"configs/{rate}.json", "true", "false")
    _substitute("trainset_preprocess_pipeline_print.py", "3.7", "3.0")
16
+
17
+
18
  class Config:
19
  def __init__(self):
20
  self.device = "cuda:0"
 
48
  action="store_true",
49
  help="Do not open in browser automatically",
50
  )
51
+ parser.add_argument("--api", action="store_true", help="Launch with api")
52
  cmd_opts = parser.parse_args()
53
 
54
  cmd_opts.port = cmd_opts.port if 0 <= cmd_opts.port <= 65535 else 7865
 
59
  cmd_opts.colab,
60
  cmd_opts.noparallel,
61
  cmd_opts.noautoopen,
62
+ cmd_opts.api
63
  )
64
 
65
  def device_config(self) -> tuple:
 
75
  ):
76
  print("16系/10系显卡和P40强制单精度")
77
  self.is_half = False
78
+ config_file_change_fp32()
 
 
 
 
 
 
 
 
79
  else:
80
  self.gpu_name = None
81
  self.gpu_mem = int(
 
94
  print("没有发现支持的N卡, 使用MPS进行推理")
95
  self.device = "mps"
96
  self.is_half = False
97
+ config_file_change_fp32()
98
  else:
99
  print("没有发现支持的N卡, 使用CPU进行推理")
100
  self.device = "cpu"
101
  self.is_half = False
102
+ config_file_change_fp32()
103
 
104
  if self.n_cpu == 0:
105
  self.n_cpu = cpu_count()
 
123
  x_center = 30
124
  x_max = 32
125
 
126
+ return x_pad, x_query, x_center, x_max
infer_pack/models.py CHANGED
@@ -61,7 +61,7 @@ class TextEncoder256(nn.Module):
61
  return m, logs, x_mask
62
 
63
 
64
- class TextEncoder256Sim(nn.Module):
65
  def __init__(
66
  self,
67
  out_channels,
@@ -81,14 +81,14 @@ class TextEncoder256Sim(nn.Module):
81
  self.n_layers = n_layers
82
  self.kernel_size = kernel_size
83
  self.p_dropout = p_dropout
84
- self.emb_phone = nn.Linear(256, hidden_channels)
85
  self.lrelu = nn.LeakyReLU(0.1, inplace=True)
86
  if f0 == True:
87
  self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
88
  self.encoder = attentions.Encoder(
89
  hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
90
  )
91
- self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
92
 
93
  def forward(self, phone, pitch, lengths):
94
  if pitch == None:
@@ -102,8 +102,10 @@ class TextEncoder256Sim(nn.Module):
102
  x.dtype
103
  )
104
  x = self.encoder(x * x_mask, x_mask)
105
- x = self.proj(x) * x_mask
106
- return x, x_mask
 
 
107
 
108
 
109
  class ResidualCouplingBlock(nn.Module):
@@ -638,6 +640,117 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
638
  return o, x_mask, (z, z_p, m_p, logs_p)
639
 
640
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
641
  class SynthesizerTrnMs256NSFsid_nono(nn.Module):
642
  def __init__(
643
  self,
@@ -740,11 +853,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
740
  return o, x_mask, (z, z_p, m_p, logs_p)
741
 
742
 
743
- class SynthesizerTrnMs256NSFsid_sim(nn.Module):
744
- """
745
- Synthesizer for Training
746
- """
747
-
748
  def __init__(
749
  self,
750
  spec_channels,
@@ -763,9 +872,8 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
763
  upsample_initial_channel,
764
  upsample_kernel_sizes,
765
  spk_embed_dim,
766
- # hop_length,
767
- gin_channels=0,
768
- use_sdp=True,
769
  **kwargs
770
  ):
771
  super().__init__()
@@ -787,7 +895,7 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
787
  self.gin_channels = gin_channels
788
  # self.hop_length = hop_length#
789
  self.spk_embed_dim = spk_embed_dim
790
- self.enc_p = TextEncoder256Sim(
791
  inter_channels,
792
  hidden_channels,
793
  filter_channels,
@@ -795,8 +903,9 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
795
  n_layers,
796
  kernel_size,
797
  p_dropout,
 
798
  )
799
- self.dec = GeneratorNSF(
800
  inter_channels,
801
  resblock,
802
  resblock_kernel_sizes,
@@ -805,9 +914,16 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
805
  upsample_initial_channel,
806
  upsample_kernel_sizes,
807
  gin_channels=gin_channels,
808
- is_half=kwargs["is_half"],
809
  )
810
-
 
 
 
 
 
 
 
 
811
  self.flow = ResidualCouplingBlock(
812
  inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
813
  )
@@ -819,28 +935,24 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
819
  self.flow.remove_weight_norm()
820
  self.enc_q.remove_weight_norm()
821
 
822
- def forward(
823
- self, phone, phone_lengths, pitch, pitchf, y_lengths, ds
824
- ): # y是spec不需要了现在
825
  g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的
826
- x, x_mask = self.enc_p(phone, pitch, phone_lengths)
827
- x = self.flow(x, x_mask, g=g, reverse=True)
 
828
  z_slice, ids_slice = commons.rand_slice_segments(
829
- x, y_lengths, self.segment_size
830
  )
 
 
831
 
832
- pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
833
- o = self.dec(z_slice, pitchf, g=g)
834
- return o, ids_slice
835
-
836
- def infer(
837
- self, phone, phone_lengths, pitch, pitchf, ds, max_len=None
838
- ): # y是spec不需要了现在
839
- g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的
840
- x, x_mask = self.enc_p(phone, pitch, phone_lengths)
841
- x = self.flow(x, x_mask, g=g, reverse=True)
842
- o = self.dec((x * x_mask)[:, :, :max_len], pitchf, g=g)
843
- return o, o
844
 
845
 
846
  class MultiPeriodDiscriminator(torch.nn.Module):
@@ -873,6 +985,36 @@ class MultiPeriodDiscriminator(torch.nn.Module):
873
  return y_d_rs, y_d_gs, fmap_rs, fmap_gs
874
 
875
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
876
  class DiscriminatorS(torch.nn.Module):
877
  def __init__(self, use_spectral_norm=False):
878
  super(DiscriminatorS, self).__init__()
 
61
  return m, logs, x_mask
62
 
63
 
64
+ class TextEncoder768(nn.Module):
65
  def __init__(
66
  self,
67
  out_channels,
 
81
  self.n_layers = n_layers
82
  self.kernel_size = kernel_size
83
  self.p_dropout = p_dropout
84
+ self.emb_phone = nn.Linear(768, hidden_channels)
85
  self.lrelu = nn.LeakyReLU(0.1, inplace=True)
86
  if f0 == True:
87
  self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
88
  self.encoder = attentions.Encoder(
89
  hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
90
  )
91
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
92
 
93
  def forward(self, phone, pitch, lengths):
94
  if pitch == None:
 
102
  x.dtype
103
  )
104
  x = self.encoder(x * x_mask, x_mask)
105
+ stats = self.proj(x) * x_mask
106
+
107
+ m, logs = torch.split(stats, self.out_channels, dim=1)
108
+ return m, logs, x_mask
109
 
110
 
111
  class ResidualCouplingBlock(nn.Module):
 
640
  return o, x_mask, (z, z_p, m_p, logs_p)
641
 
642
 
643
class SynthesizerTrnMs768NSFsid(nn.Module):
    """Pitch-conditioned synthesizer built around ``TextEncoder768``.

    The model wires together a prior encoder (``enc_p``), an NSF decoder
    (``dec``), a posterior encoder (``enc_q``), a residual-coupling flow
    (``flow``) and a speaker embedding table (``emb_g``).

    Args:
        spec_channels: Channel count passed to the posterior encoder.
        segment_size: Length of the random slice decoded during training.
        inter_channels, hidden_channels, filter_channels, n_heads, n_layers,
            kernel_size, p_dropout: Forwarded to ``TextEncoder768`` (and, for
            the first two, to the other sub-modules).
        resblock, resblock_kernel_sizes, resblock_dilation_sizes,
            upsample_rates, upsample_initial_channel, upsample_kernel_sizes:
            Forwarded to ``GeneratorNSF``.
        spk_embed_dim: Number of rows in the speaker embedding table.
        gin_channels: Width of the speaker embedding / conditioning signal.
        sr: Sample rate for the decoder. A string is treated as a key of the
            module-level ``sr2sr`` table and replaced by the mapped value.
        **kwargs: Must contain ``is_half``, which is forwarded to
            ``GeneratorNSF``.

    Raises:
        KeyError: If ``is_half`` is missing from ``kwargs`` or a string
            ``sr`` is not a key of ``sr2sr``.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr,
        **kwargs
    ):
        super().__init__()
        # Checkpoint configs may carry the sample rate as a string key.
        if isinstance(sr, str):
            sr = sr2sr[sr]
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim
        # NOTE: sub-module attribute names and construction order are kept
        # as-is; they determine state-dict keys and RNG consumption.
        self.enc_p = TextEncoder768(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        self.dec = GeneratorNSF(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            sr=sr,
            is_half=kwargs["is_half"],
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        """Strip weight normalisation from the decoder, flow and posterior encoder."""
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(
        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
    ):
        """Training pass: decode a random ``segment_size`` slice of the posterior latent.

        ``ds`` holds speaker ids (the original comment gives its shape as
        [bs, 1] - TODO confirm against the caller).

        Returns:
            ``(o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q))``.
        """
        # Speaker embedding with a trailing singleton axis; the original
        # comment describes that axis as time, broadcast downstream.
        g = self.emb_g(ds).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        # Cut the continuous pitch track with the same slice indices as z.
        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
        o = self.dec(z_slice, pitchf, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
        """Inference pass: sample from the prior, invert the flow and decode.

        Returns:
            ``(o, x_mask, (z, z_p, m_p, logs_p))``.
        """
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        # Prior sample; the noise term is scaled by the fixed factor 0.66666.
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        # max_len=None keeps the full length.
        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
752
+
753
+
754
  class SynthesizerTrnMs256NSFsid_nono(nn.Module):
755
  def __init__(
756
  self,
 
853
  return o, x_mask, (z, z_p, m_p, logs_p)
854
 
855
 
856
+ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
 
 
 
 
857
  def __init__(
858
  self,
859
  spec_channels,
 
872
  upsample_initial_channel,
873
  upsample_kernel_sizes,
874
  spk_embed_dim,
875
+ gin_channels,
876
+ sr=None,
 
877
  **kwargs
878
  ):
879
  super().__init__()
 
895
  self.gin_channels = gin_channels
896
  # self.hop_length = hop_length#
897
  self.spk_embed_dim = spk_embed_dim
898
+ self.enc_p = TextEncoder768(
899
  inter_channels,
900
  hidden_channels,
901
  filter_channels,
 
903
  n_layers,
904
  kernel_size,
905
  p_dropout,
906
+ f0=False,
907
  )
908
+ self.dec = Generator(
909
  inter_channels,
910
  resblock,
911
  resblock_kernel_sizes,
 
914
  upsample_initial_channel,
915
  upsample_kernel_sizes,
916
  gin_channels=gin_channels,
 
917
  )
918
+ self.enc_q = PosteriorEncoder(
919
+ spec_channels,
920
+ inter_channels,
921
+ hidden_channels,
922
+ 5,
923
+ 1,
924
+ 16,
925
+ gin_channels=gin_channels,
926
+ )
927
  self.flow = ResidualCouplingBlock(
928
  inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
929
  )
 
935
  self.flow.remove_weight_norm()
936
  self.enc_q.remove_weight_norm()
937
 
938
+ def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1]
 
 
939
  g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的
940
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
941
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
942
+ z_p = self.flow(z, y_mask, g=g)
943
  z_slice, ids_slice = commons.rand_slice_segments(
944
+ z, y_lengths, self.segment_size
945
  )
946
+ o = self.dec(z_slice, g=g)
947
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
948
 
949
+ def infer(self, phone, phone_lengths, sid, max_len=None):
950
+ g = self.emb_g(sid).unsqueeze(-1)
951
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
952
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
953
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
954
+ o = self.dec((z * x_mask)[:, :, :max_len], g=g)
955
+ return o, x_mask, (z, z_p, m_p, logs_p)
 
 
 
 
 
956
 
957
 
958
  class MultiPeriodDiscriminator(torch.nn.Module):
 
985
  return y_d_rs, y_d_gs, fmap_rs, fmap_gs
986
 
987
 
988
class MultiPeriodDiscriminatorV2(torch.nn.Module):
    """Discriminator ensemble: one ``DiscriminatorS`` plus one ``DiscriminatorP`` per period.

    The period list is ``[2, 3, 5, 7, 11, 17, 23, 37]``.
    """

    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminatorV2, self).__init__()
        periods = [2, 3, 5, 7, 11, 17, 23, 37]

        # The scale discriminator is created first, then the period ones in order.
        members = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        for period in periods:
            members.append(DiscriminatorP(period, use_spectral_norm=use_spectral_norm))
        self.discriminators = nn.ModuleList(members)

    def forward(self, y, y_hat):
        """Run every sub-discriminator on ``y`` and then on ``y_hat``.

        Returns:
            Four lists, each with one entry per sub-discriminator: outputs
            for ``y``, outputs for ``y_hat``, feature maps for ``y``, and
            feature maps for ``y_hat``.
        """
        real_outs, fake_outs = [], []
        real_fmaps, fake_fmaps = [], []
        for disc in self.discriminators:
            real_out, real_fmap = disc(y)
            fake_out, fake_fmap = disc(y_hat)
            real_outs.append(real_out)
            fake_outs.append(fake_out)
            real_fmaps.append(real_fmap)
            fake_fmaps.append(fake_fmap)

        return real_outs, fake_outs, real_fmaps, fake_fmaps
1016
+
1017
+
1018
  class DiscriminatorS(torch.nn.Module):
1019
  def __init__(self, use_spectral_norm=False):
1020
  super(DiscriminatorS, self).__init__()
infer_pack/models_onnx.py CHANGED
@@ -61,7 +61,7 @@ class TextEncoder256(nn.Module):
61
  return m, logs, x_mask
62
 
63
 
64
- class TextEncoder256Sim(nn.Module):
65
  def __init__(
66
  self,
67
  out_channels,
@@ -81,14 +81,14 @@ class TextEncoder256Sim(nn.Module):
81
  self.n_layers = n_layers
82
  self.kernel_size = kernel_size
83
  self.p_dropout = p_dropout
84
- self.emb_phone = nn.Linear(256, hidden_channels)
85
  self.lrelu = nn.LeakyReLU(0.1, inplace=True)
86
  if f0 == True:
87
  self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
88
  self.encoder = attentions.Encoder(
89
  hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
90
  )
91
- self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
92
 
93
  def forward(self, phone, pitch, lengths):
94
  if pitch == None:
@@ -102,8 +102,10 @@ class TextEncoder256Sim(nn.Module):
102
  x.dtype
103
  )
104
  x = self.encoder(x * x_mask, x_mask)
105
- x = self.proj(x) * x_mask
106
- return x, x_mask
 
 
107
 
108
 
109
  class ResidualCouplingBlock(nn.Module):
@@ -527,7 +529,7 @@ sr2sr = {
527
  }
528
 
529
 
530
- class SynthesizerTrnMs256NSFsidO(nn.Module):
531
  def __init__(
532
  self,
533
  spec_channels,
@@ -571,15 +573,26 @@ class SynthesizerTrnMs256NSFsidO(nn.Module):
571
  self.gin_channels = gin_channels
572
  # self.hop_length = hop_length#
573
  self.spk_embed_dim = spk_embed_dim
574
- self.enc_p = TextEncoder256(
575
- inter_channels,
576
- hidden_channels,
577
- filter_channels,
578
- n_heads,
579
- n_layers,
580
- kernel_size,
581
- p_dropout,
582
- )
 
 
 
 
 
 
 
 
 
 
 
583
  self.dec = GeneratorNSF(
584
  inter_channels,
585
  resblock,
@@ -605,6 +618,7 @@ class SynthesizerTrnMs256NSFsidO(nn.Module):
605
  inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
606
  )
607
  self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
 
608
  print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
609
 
610
  def remove_weight_norm(self):
@@ -612,10 +626,24 @@ class SynthesizerTrnMs256NSFsidO(nn.Module):
612
  self.flow.remove_weight_norm()
613
  self.enc_q.remove_weight_norm()
614
 
615
- def forward(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
616
- g = self.emb_g(sid).unsqueeze(-1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
617
  m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
618
- z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
619
  z = self.flow(z_p, x_mask, g=g, reverse=True)
620
  o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
621
  return o
@@ -651,6 +679,36 @@ class MultiPeriodDiscriminator(torch.nn.Module):
651
  return y_d_rs, y_d_gs, fmap_rs, fmap_gs
652
 
653
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
654
  class DiscriminatorS(torch.nn.Module):
655
  def __init__(self, use_spectral_norm=False):
656
  super(DiscriminatorS, self).__init__()
 
61
  return m, logs, x_mask
62
 
63
 
64
+ class TextEncoder768(nn.Module):
65
  def __init__(
66
  self,
67
  out_channels,
 
81
  self.n_layers = n_layers
82
  self.kernel_size = kernel_size
83
  self.p_dropout = p_dropout
84
+ self.emb_phone = nn.Linear(768, hidden_channels)
85
  self.lrelu = nn.LeakyReLU(0.1, inplace=True)
86
  if f0 == True:
87
  self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
88
  self.encoder = attentions.Encoder(
89
  hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
90
  )
91
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
92
 
93
  def forward(self, phone, pitch, lengths):
94
  if pitch == None:
 
102
  x.dtype
103
  )
104
  x = self.encoder(x * x_mask, x_mask)
105
+ stats = self.proj(x) * x_mask
106
+
107
+ m, logs = torch.split(stats, self.out_channels, dim=1)
108
+ return m, logs, x_mask
109
 
110
 
111
  class ResidualCouplingBlock(nn.Module):
 
529
  }
530
 
531
 
532
+ class SynthesizerTrnMsNSFsidM(nn.Module):
533
  def __init__(
534
  self,
535
  spec_channels,
 
573
  self.gin_channels = gin_channels
574
  # self.hop_length = hop_length#
575
  self.spk_embed_dim = spk_embed_dim
576
+ if self.gin_channels == 256:
577
+ self.enc_p = TextEncoder256(
578
+ inter_channels,
579
+ hidden_channels,
580
+ filter_channels,
581
+ n_heads,
582
+ n_layers,
583
+ kernel_size,
584
+ p_dropout,
585
+ )
586
+ else:
587
+ self.enc_p = TextEncoder768(
588
+ inter_channels,
589
+ hidden_channels,
590
+ filter_channels,
591
+ n_heads,
592
+ n_layers,
593
+ kernel_size,
594
+ p_dropout,
595
+ )
596
  self.dec = GeneratorNSF(
597
  inter_channels,
598
  resblock,
 
618
  inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
619
  )
620
  self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
621
+ self.speaker_map = None
622
  print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
623
 
624
  def remove_weight_norm(self):
 
626
  self.flow.remove_weight_norm()
627
  self.enc_q.remove_weight_norm()
628
 
629
def construct_spkmixmap(self, n_speaker):
    """Build ``self.speaker_map`` from the speaker embedding table.

    Looks up ``self.emb_g`` for ids ``0 .. n_speaker - 1`` and stores the
    rows in a tensor of shape ``(1, n_speaker, 1, 1, self.gin_channels)``.
    """
    # Bind the attribute to the buffer up front so it tracks the fill.
    self.speaker_map = table = torch.zeros((n_speaker, 1, 1, self.gin_channels))
    for spk_id in range(n_speaker):
        table[spk_id] = self.emb_g(torch.LongTensor([[spk_id]]))
    # Add a leading singleton axis for broadcasting against mixing weights.
    self.speaker_map = table.unsqueeze(0)
634
+
635
def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None):
    """Export-oriented forward pass with an explicit noise input.

    When ``self.speaker_map`` is set, ``g`` is treated as per-speaker
    weights and the conditioning vector is the weighted sum of the rows of
    ``self.speaker_map``. Otherwise ``g`` is looked up in ``self.emb_g``.
    ``rnd`` replaces the random noise term of the prior sample, so the
    caller controls it.

    Returns:
        The output of ``self.dec``.
    """
    # Shapes below assume speaker_map is [1, S, 1, 1, H] as built by
    # construct_spkmixmap, and g is [N, S] - TODO confirm against the exporter.
    if self.speaker_map is not None:  # weighted speaker mix
        g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1))  # [N, S, 1, 1, 1]
        g = g * self.speaker_map  # [N, S, 1, 1, H]
        g = torch.sum(g, dim=1)  # [N, 1, 1, H]
        g = g.transpose(0, -1).transpose(0, -2).squeeze(0)  # [1, H, N]
    else:
        # Single speaker id: embed it and swap the last two axes.
        g = g.unsqueeze(0)
        g = self.emb_g(g).transpose(1, 2)

    m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
    # Prior sample using the caller-supplied noise tensor.
    z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
    z = self.flow(z_p, x_mask, g=g, reverse=True)
    # max_len=None keeps the full length.
    o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
    return o
 
679
  return y_d_rs, y_d_gs, fmap_rs, fmap_gs
680
 
681
 
682
class MultiPeriodDiscriminatorV2(torch.nn.Module):
    """Discriminator ensemble: one ``DiscriminatorS`` plus one ``DiscriminatorP`` per period.

    The period list is ``[2, 3, 5, 7, 11, 17, 23, 37]``.
    """

    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminatorV2, self).__init__()
        periods = [2, 3, 5, 7, 11, 17, 23, 37]

        # The scale discriminator is created first, then the period ones in order.
        members = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        for period in periods:
            members.append(DiscriminatorP(period, use_spectral_norm=use_spectral_norm))
        self.discriminators = nn.ModuleList(members)

    def forward(self, y, y_hat):
        """Run every sub-discriminator on ``y`` and then on ``y_hat``.

        Returns:
            Four lists, each with one entry per sub-discriminator: outputs
            for ``y``, outputs for ``y_hat``, feature maps for ``y``, and
            feature maps for ``y_hat``.
        """
        real_outs, fake_outs = [], []
        real_fmaps, fake_fmaps = [], []
        for disc in self.discriminators:
            real_out, real_fmap = disc(y)
            fake_out, fake_fmap = disc(y_hat)
            real_outs.append(real_out)
            fake_outs.append(fake_out)
            real_fmaps.append(real_fmap)
            fake_fmaps.append(fake_fmap)

        return real_outs, fake_outs, real_fmaps, fake_fmaps
710
+
711
+
712
  class DiscriminatorS(torch.nn.Module):
713
  def __init__(self, use_spectral_norm=False):
714
  super(DiscriminatorS, self).__init__()
requirements.txt CHANGED
@@ -1,46 +1,26 @@
1
- numba==0.56.4
2
- numpy==1.23.5
3
- scipy==1.9.3
4
- librosa==0.9.2
5
- llvmlite==0.39.0
6
  fairseq==0.12.2
7
- faiss-cpu==1.7.0; sys_platform == "darwin"
8
- faiss-cpu==1.7.2; sys_platform != "darwin"
9
  gradio
10
- Cython
11
- future>=0.18.3
12
- pydub>=0.25.1
13
- soundfile>=0.12.1
14
- ffmpeg-python>=0.2.0
15
- tensorboardX
16
- functorch>=2.0.0
17
- Jinja2>=3.1.2
18
- json5>=0.9.11
19
- Markdown
20
- matplotlib>=3.7.1
21
- matplotlib-inline>=0.1.6
22
- praat-parselmouth>=0.4.3
23
- Pillow>=9.1.1
24
  pyworld>=0.3.2
25
- resampy>=0.4.2
26
- scikit-learn>=1.2.2
27
- starlette>=0.26.1
28
  tensorboard
29
- tensorboard-data-server
30
- tensorboard-plugin-wit
31
- torchgen>=0.0.1
32
- tqdm>=4.65.0
33
- tornado>=6.2
34
- Werkzeug>=2.2.3
35
- uc-micro-py>=1.0.1
36
- sympy>=1.11.1
37
- tabulate>=0.9.0
38
- PyYAML>=6.0
39
- pyasn1>=0.4.8
40
- pyasn1-modules>=0.2.8
41
- fsspec>=2023.3.0
42
- absl-py>=1.4.0
43
- audioread
44
- uvicorn>=0.21.1
45
- colorama>=0.4.6
46
  edge-tts
 
 
 
1
+ setuptools
2
+ wheel
3
+ httpx==0.23.0
 
 
4
  fairseq==0.12.2
 
 
5
  gradio
6
+ ffmpeg
7
+ praat-parselmouth
8
+ pyworld
9
+ numpy==1.23.5
10
+ numba==0.56.4
11
+ librosa==0.9.2
12
+ faiss-cpu==1.7.3
13
+ faiss-gpu
14
+ scipy==1.9.3
 
 
 
 
 
15
  pyworld>=0.3.2
 
 
 
16
  tensorboard
17
+ tensorboardX
18
+ onnxruntime
19
+ pyngrok==4.1.12
20
+ soundfile>=0.12.1
21
+ tqdm>=4.63.1
22
+ torchcrepe
23
+ asyncio
 
 
 
 
 
 
 
 
 
 
24
  edge-tts
25
+ demucs
26
+ yt_dlp
vc_infer_pipeline.py CHANGED
@@ -2,11 +2,50 @@ import numpy as np, parselmouth, torch, pdb
2
  from time import time as ttime
3
  import torch.nn.functional as F
4
  import scipy.signal as signal
5
- import pyworld, os, traceback, faiss
6
  from scipy import signal
 
7
 
8
  bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  class VC(object):
12
  def __init__(self, tgt_sr, config):
@@ -27,7 +66,17 @@ class VC(object):
27
  self.t_max = self.sr * self.x_max # 免查询时长阈值
28
  self.device = config.device
29
 
30
- def get_f0(self, x, p_len, f0_up_key, f0_method, inp_f0=None):
 
 
 
 
 
 
 
 
 
 
31
  time_step = self.window / self.sr * 1000
32
  f0_min = 50
33
  f0_max = 1100
@@ -50,15 +99,31 @@ class VC(object):
50
  f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
51
  )
52
  elif f0_method == "harvest":
53
- f0, t = pyworld.harvest(
54
- x.astype(np.double),
55
- fs=self.sr,
56
- f0_ceil=f0_max,
57
- f0_floor=f0_min,
58
- frame_period=10,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  )
60
- f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
61
- f0 = signal.medfilt(f0, 3)
 
 
62
  f0 *= pow(2, f0_up_key / 12)
63
  # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
64
  tf0 = self.sr // self.window # 每秒f0点数
@@ -96,6 +161,8 @@ class VC(object):
96
  index,
97
  big_npy,
98
  index_rate,
 
 
99
  ): # ,file_index,file_big_npy
100
  feats = torch.from_numpy(audio0)
101
  if self.is_half:
@@ -111,13 +178,14 @@ class VC(object):
111
  inputs = {
112
  "source": feats.to(self.device),
113
  "padding_mask": padding_mask,
114
- "output_layer": 9, # layer 9
115
  }
116
  t0 = ttime()
117
  with torch.no_grad():
118
  logits = model.extract_features(**inputs)
119
- feats = model.final_proj(logits[0])
120
-
 
121
  if (
122
  isinstance(index, type(None)) == False
123
  and isinstance(big_npy, type(None)) == False
@@ -143,6 +211,8 @@ class VC(object):
143
  )
144
 
145
  feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
 
 
146
  t1 = ttime()
147
  p_len = audio0.shape[0] // self.window
148
  if feats.shape[1] < p_len:
@@ -150,23 +220,26 @@ class VC(object):
150
  if pitch != None and pitchf != None:
151
  pitch = pitch[:, :p_len]
152
  pitchf = pitchf[:, :p_len]
 
 
 
 
 
 
 
 
153
  p_len = torch.tensor([p_len], device=self.device).long()
154
  with torch.no_grad():
155
  if pitch != None and pitchf != None:
156
  audio1 = (
157
- (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768)
158
  .data.cpu()
159
  .float()
160
  .numpy()
161
- .astype(np.int16)
162
  )
163
  else:
164
  audio1 = (
165
- (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768)
166
- .data.cpu()
167
- .float()
168
- .numpy()
169
- .astype(np.int16)
170
  )
171
  del feats, p_len, padding_mask
172
  if torch.cuda.is_available():
@@ -182,6 +255,7 @@ class VC(object):
182
  net_g,
183
  sid,
184
  audio,
 
185
  times,
186
  f0_up_key,
187
  f0_method,
@@ -189,6 +263,12 @@ class VC(object):
189
  # file_big_npy,
190
  index_rate,
191
  if_f0,
 
 
 
 
 
 
192
  f0_file=None,
193
  ):
194
  if (
@@ -243,9 +323,19 @@ class VC(object):
243
  sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
244
  pitch, pitchf = None, None
245
  if if_f0 == 1:
246
- pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, inp_f0)
 
 
 
 
 
 
 
 
247
  pitch = pitch[:p_len]
248
  pitchf = pitchf[:p_len]
 
 
249
  pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
250
  pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
251
  t2 = ttime()
@@ -265,6 +355,8 @@ class VC(object):
265
  index,
266
  big_npy,
267
  index_rate,
 
 
268
  )[self.t_pad_tgt : -self.t_pad_tgt]
269
  )
270
  else:
@@ -280,6 +372,8 @@ class VC(object):
280
  index,
281
  big_npy,
282
  index_rate,
 
 
283
  )[self.t_pad_tgt : -self.t_pad_tgt]
284
  )
285
  s = t
@@ -296,6 +390,8 @@ class VC(object):
296
  index,
297
  big_npy,
298
  index_rate,
 
 
299
  )[self.t_pad_tgt : -self.t_pad_tgt]
300
  )
301
  else:
@@ -311,9 +407,22 @@ class VC(object):
311
  index,
312
  big_npy,
313
  index_rate,
 
 
314
  )[self.t_pad_tgt : -self.t_pad_tgt]
315
  )
316
  audio_opt = np.concatenate(audio_opt)
 
 
 
 
 
 
 
 
 
 
 
317
  del pitch, pitchf, sid
318
  if torch.cuda.is_available():
319
  torch.cuda.empty_cache()
 
2
  from time import time as ttime
3
  import torch.nn.functional as F
4
  import scipy.signal as signal
5
+ import pyworld, os, traceback, faiss, librosa, torchcrepe
6
  from scipy import signal
7
+ from functools import lru_cache
8
 
9
  bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
10
 
11
+ input_audio_path2wav = {}
12
+
13
+
14
@lru_cache(maxsize=128)
def _harvest_f0_for_digest(
    input_audio_path, audio_digest, fs, f0max, f0min, frame_period
):
    """Run harvest + stonemask for the audio registered under ``input_audio_path``.

    ``audio_digest`` is not used in the computation. It is part of the cache
    key so that new audio stored under a reused path is recomputed.
    """
    audio = input_audio_path2wav[input_audio_path]
    f0, t = pyworld.harvest(
        audio,
        fs=fs,
        f0_ceil=f0max,
        f0_floor=f0min,
        frame_period=frame_period,
    )
    return pyworld.stonemask(audio, f0, t, fs)


def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
    """Return the harvest f0 track for a registered waveform, with caching.

    The waveform is read from the module-level ``input_audio_path2wav`` dict.
    The cache key includes a digest of the waveform bytes in addition to the
    path, so a path that is reused for different audio does not return a
    stale result.

    Args:
        input_audio_path: Key into ``input_audio_path2wav``.
        fs: Sample rate passed to pyworld.
        f0max: Passed to harvest as ``f0_ceil``.
        f0min: Passed to harvest as ``f0_floor``.
        frame_period: Passed to harvest as ``frame_period``.

    Returns:
        A fresh copy of the f0 array. Callers scale f0 in place, so handing
        out the cached object itself would corrupt later cache hits.

    Raises:
        KeyError: If ``input_audio_path`` has not been registered.
    """
    import hashlib  # local import keeps this block self-contained

    audio = input_audio_path2wav[input_audio_path]
    audio_digest = hashlib.blake2b(audio.tobytes(), digest_size=16).hexdigest()
    f0 = _harvest_f0_for_digest(
        input_audio_path, audio_digest, fs, f0max, f0min, frame_period
    )
    return f0.copy()


# Keep the lru_cache management hooks the decorated original exposed.
cache_harvest_f0.cache_clear = _harvest_f0_for_digest.cache_clear
cache_harvest_f0.cache_info = _harvest_f0_for_digest.cache_info
26
+
27
+
28
def change_rms(data1, sr1, data2, sr2, rate):
    """Rescale ``data2`` using the RMS envelopes of ``data1`` and ``data2``.

    ``data1`` is the input audio and ``data2`` the output audio; ``rate`` is
    the share of ``data2``'s own envelope that is kept. Each sample of
    ``data2`` is multiplied by ``rms1 ** (1 - rate) * rms2 ** (rate - 1)``.

    ``data2`` is modified in place and also returned.
    """
    n_out = data2.shape[0]

    def _envelope(samples, sample_rate):
        # One RMS point per half second, stretched linearly to data2's length.
        half_second = sample_rate // 2
        frames = librosa.feature.rms(
            y=samples, frame_length=half_second * 2, hop_length=half_second
        )
        stretched = F.interpolate(
            torch.from_numpy(frames).unsqueeze(0), size=n_out, mode="linear"
        )
        return stretched.squeeze()

    env_in = _envelope(data1, sr1)
    env_out = _envelope(data2, sr2)
    # Floor the output envelope at 1e-6 before raising it to a negative power.
    env_out = torch.max(env_out, torch.zeros_like(env_out) + 1e-6)
    gain = torch.pow(env_in, torch.tensor(1 - rate)) * torch.pow(
        env_out, torch.tensor(rate - 1)
    )
    data2 *= gain.numpy()
    return data2
48
+
49
 
50
  class VC(object):
51
  def __init__(self, tgt_sr, config):
 
66
  self.t_max = self.sr * self.x_max # 免查询时长阈值
67
  self.device = config.device
68
 
69
+ def get_f0(
70
+ self,
71
+ input_audio_path,
72
+ x,
73
+ p_len,
74
+ f0_up_key,
75
+ f0_method,
76
+ filter_radius,
77
+ inp_f0=None,
78
+ ):
79
+ global input_audio_path2wav
80
  time_step = self.window / self.sr * 1000
81
  f0_min = 50
82
  f0_max = 1100
 
99
  f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
100
  )
101
  elif f0_method == "harvest":
102
+ input_audio_path2wav[input_audio_path] = x.astype(np.double)
103
+ f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
104
+ if filter_radius > 2:
105
+ f0 = signal.medfilt(f0, 3)
106
+ elif f0_method == "crepe":
107
+ model = "full"
108
+ # Pick a batch size that doesn't cause memory errors on your gpu
109
+ batch_size = 512
110
+ # Compute pitch using first gpu
111
+ audio = torch.tensor(np.copy(x))[None].float()
112
+ f0, pd = torchcrepe.predict(
113
+ audio,
114
+ self.sr,
115
+ self.window,
116
+ f0_min,
117
+ f0_max,
118
+ model,
119
+ batch_size=batch_size,
120
+ device=self.device,
121
+ return_periodicity=True,
122
  )
123
+ pd = torchcrepe.filter.median(pd, 3)
124
+ f0 = torchcrepe.filter.mean(f0, 3)
125
+ f0[pd < 0.1] = 0
126
+ f0 = f0[0].cpu().numpy()
127
  f0 *= pow(2, f0_up_key / 12)
128
  # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
129
  tf0 = self.sr // self.window # 每秒f0点数
 
161
  index,
162
  big_npy,
163
  index_rate,
164
+ version,
165
+ protect
166
  ): # ,file_index,file_big_npy
167
  feats = torch.from_numpy(audio0)
168
  if self.is_half:
 
178
  inputs = {
179
  "source": feats.to(self.device),
180
  "padding_mask": padding_mask,
181
+ "output_layer": 9 if version == "v1" else 12,
182
  }
183
  t0 = ttime()
184
  with torch.no_grad():
185
  logits = model.extract_features(**inputs)
186
+ feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
187
+ if(protect<0.5):
188
+ feats0=feats.clone()
189
  if (
190
  isinstance(index, type(None)) == False
191
  and isinstance(big_npy, type(None)) == False
 
211
  )
212
 
213
  feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
214
+ if(protect<0.5):
215
+ feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
216
  t1 = ttime()
217
  p_len = audio0.shape[0] // self.window
218
  if feats.shape[1] < p_len:
 
220
  if pitch != None and pitchf != None:
221
  pitch = pitch[:, :p_len]
222
  pitchf = pitchf[:, :p_len]
223
+
224
+ if(protect<0.5):
225
+ pitchff = pitchf.clone()
226
+ pitchff[pitchf > 0] = 1
227
+ pitchff[pitchf < 1] = protect
228
+ pitchff = pitchff.unsqueeze(-1)
229
+ feats = feats * pitchff + feats0 * (1 - pitchff)
230
+ feats=feats.to(feats0.dtype)
231
  p_len = torch.tensor([p_len], device=self.device).long()
232
  with torch.no_grad():
233
  if pitch != None and pitchf != None:
234
  audio1 = (
235
+ (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
236
  .data.cpu()
237
  .float()
238
  .numpy()
 
239
  )
240
  else:
241
  audio1 = (
242
+ (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
 
 
 
 
243
  )
244
  del feats, p_len, padding_mask
245
  if torch.cuda.is_available():
 
255
  net_g,
256
  sid,
257
  audio,
258
+ input_audio_path,
259
  times,
260
  f0_up_key,
261
  f0_method,
 
263
  # file_big_npy,
264
  index_rate,
265
  if_f0,
266
+ filter_radius,
267
+ tgt_sr,
268
+ resample_sr,
269
+ rms_mix_rate,
270
+ version,
271
+ protect,
272
  f0_file=None,
273
  ):
274
  if (
 
323
  sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
324
  pitch, pitchf = None, None
325
  if if_f0 == 1:
326
+ pitch, pitchf = self.get_f0(
327
+ input_audio_path,
328
+ audio_pad,
329
+ p_len,
330
+ f0_up_key,
331
+ f0_method,
332
+ filter_radius,
333
+ inp_f0,
334
+ )
335
  pitch = pitch[:p_len]
336
  pitchf = pitchf[:p_len]
337
+ if self.device == "mps":
338
+ pitchf = pitchf.astype(np.float32)
339
  pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
340
  pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
341
  t2 = ttime()
 
355
  index,
356
  big_npy,
357
  index_rate,
358
+ version,
359
+ protect
360
  )[self.t_pad_tgt : -self.t_pad_tgt]
361
  )
362
  else:
 
372
  index,
373
  big_npy,
374
  index_rate,
375
+ version,
376
+ protect
377
  )[self.t_pad_tgt : -self.t_pad_tgt]
378
  )
379
  s = t
 
390
  index,
391
  big_npy,
392
  index_rate,
393
+ version,
394
+ protect
395
  )[self.t_pad_tgt : -self.t_pad_tgt]
396
  )
397
  else:
 
407
  index,
408
  big_npy,
409
  index_rate,
410
+ version,
411
+ protect
412
  )[self.t_pad_tgt : -self.t_pad_tgt]
413
  )
414
  audio_opt = np.concatenate(audio_opt)
415
+ if rms_mix_rate != 1:
416
+ audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
417
+ if resample_sr >= 16000 and tgt_sr != resample_sr:
418
+ audio_opt = librosa.resample(
419
+ audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
420
+ )
421
+ audio_max = np.abs(audio_opt).max() / 0.99
422
+ max_int16 = 32768
423
+ if audio_max > 1:
424
+ max_int16 /= audio_max
425
+ audio_opt = (audio_opt * max_int16).astype(np.int16)
426
  del pitch, pitchf, sid
427
  if torch.cuda.is_available():
428
  torch.cuda.empty_cache()