Blane187 committed on
Commit 83bc3f7 · verified · 1 Parent(s): cdb0da8

Delete gui.py

Files changed (1)
  1. gui.py +0 -1114
gui.py DELETED
@@ -1,1114 +0,0 @@
1
- import os
2
- import sys
3
- from dotenv import load_dotenv
4
- import shutil
5
-
6
- load_dotenv()
7
- load_dotenv("sha256.env")
8
-
9
- os.environ["OMP_NUM_THREADS"] = "4"
10
- if sys.platform == "darwin":
11
- os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
12
-
13
- now_dir = os.getcwd()
14
- sys.path.append(now_dir)
15
- import multiprocessing
16
-
17
- flag_vc = False
18
-
19
-
20
- def printt(strr, *args):
21
- if len(args) == 0:
22
- print(strr)
23
- else:
24
- print(strr % args)
25
-
26
-
27
- def phase_vocoder(a, b, fade_out, fade_in):
28
- window = torch.sqrt(fade_out * fade_in)
29
- fa = torch.fft.rfft(a * window)
30
- fb = torch.fft.rfft(b * window)
31
- absab = torch.abs(fa) + torch.abs(fb)
32
- n = a.shape[0]
33
- if n % 2 == 0:
34
- absab[1:-1] *= 2
35
- else:
36
- absab[1:] *= 2
37
- phia = torch.angle(fa)
38
- phib = torch.angle(fb)
39
- deltaphase = phib - phia
40
- deltaphase = deltaphase - 2 * np.pi * torch.floor(deltaphase / 2 / np.pi + 0.5)
41
- w = 2 * np.pi * torch.arange(n // 2 + 1).to(a) + deltaphase
42
- t = torch.arange(n).unsqueeze(-1).to(a) / n
43
- result = (
44
- a * (fade_out**2)
45
- + b * (fade_in**2)
46
- + torch.sum(absab * torch.cos(w * t + phia), -1) * window / n
47
- )
48
- return result
49
-
50
-
51
- class Harvest(multiprocessing.Process):
52
- def __init__(self, inp_q, opt_q):
53
- multiprocessing.Process.__init__(self)
54
- self.inp_q = inp_q
55
- self.opt_q = opt_q
56
-
57
- def run(self):
58
- import numpy as np
59
- import pyworld
60
-
61
- while 1:
62
- idx, x, res_f0, n_cpu, ts = self.inp_q.get()
63
- f0, t = pyworld.harvest(
64
- x.astype(np.double),
65
- fs=16000,
66
- f0_ceil=1100,
67
- f0_floor=50,
68
- frame_period=10,
69
- )
70
- res_f0[idx] = f0
71
- if len(res_f0.keys()) >= n_cpu:
72
- self.opt_q.put(ts)
73
-
74
-
75
- if __name__ == "__main__":
76
- import json
77
- import multiprocessing
78
- import re
79
- import time
80
- from multiprocessing import Queue, cpu_count
81
-
82
- import librosa
83
- from infer.modules.gui import TorchGate
84
- import numpy as np
85
- import FreeSimpleGUI as sg
86
- import sounddevice as sd
87
- import torch
88
- import torch.nn.functional as F
89
- import torchaudio.transforms as tat
90
-
91
- import infer.lib.rtrvc as rtrvc
92
- from i18n.i18n import I18nAuto
93
- from configs import Config
94
-
95
- i18n = I18nAuto()
96
-
97
- # device = rvc_for_realtime.config.device
98
- # device = torch.device(
99
- # "cuda"
100
- # if torch.cuda.is_available()
101
- # else ("mps" if torch.backends.mps.is_available() else "cpu")
102
- # )
103
- current_dir = os.getcwd()
104
- inp_q = Queue()
105
- opt_q = Queue()
106
- n_cpu = min(cpu_count(), 8)
107
- for _ in range(n_cpu):
108
- p = Harvest(inp_q, opt_q)
109
- p.daemon = True
110
- p.start()
111
-
112
- class GUIConfig:
113
- def __init__(self) -> None:
114
- self.pth_path: str = ""
115
- self.index_path: str = ""
116
- self.pitch: int = 0
117
- self.formant: float = 0.0
118
- self.sr_type: str = "sr_model"
119
- self.block_time: float = 0.25 # s
120
- self.threhold: int = -60
121
- self.crossfade_time: float = 0.05
122
- self.extra_time: float = 2.5
123
- self.I_noise_reduce: bool = False
124
- self.O_noise_reduce: bool = False
125
- self.use_pv: bool = False
126
- self.rms_mix_rate: float = 0.0
127
- self.index_rate: float = 0.0
128
- self.n_cpu: int = min(n_cpu, 4)
129
- self.f0method: str = "fcpe"
130
- self.sg_hostapi: str = ""
131
- self.wasapi_exclusive: bool = False
132
- self.sg_input_device: str = ""
133
- self.sg_output_device: str = ""
134
-
135
- class GUI:
136
- def __init__(self) -> None:
137
- self.gui_config = GUIConfig()
138
- self.config = Config()
139
- self.function = "vc"
140
- self.delay_time = 0
141
- self.hostapis = None
142
- self.input_devices = None
143
- self.output_devices = None
144
- self.input_devices_indices = None
145
- self.output_devices_indices = None
146
- self.stream = None
147
- self.update_devices()
148
- self.launcher()
149
-
150
- def check_assets(self):
151
- global now_dir
152
- from infer.lib.rvcmd import check_all_assets, download_all_assets
153
-
154
- tmp = os.path.join(now_dir, "TEMP")
155
- shutil.rmtree(tmp, ignore_errors=True)
156
- os.makedirs(tmp, exist_ok=True)
157
- if not check_all_assets(update=self.config.update):
158
- if self.config.update:
159
- download_all_assets(tmpdir=tmp)
160
- if not check_all_assets(update=self.config.update):
161
- printt("counld not satisfy all assets needed.")
162
- exit(1)
163
-
164
- def load(self):
165
- try:
166
- if not os.path.exists("configs/inuse/config.json"):
167
- shutil.copy("configs/config.json", "configs/inuse/config.json")
168
- with open("configs/inuse/config.json", "r") as j:
169
- data = json.load(j)
170
- data["sr_model"] = data["sr_type"] == "sr_model"
171
- data["sr_device"] = data["sr_type"] == "sr_device"
172
- data["pm"] = data["f0method"] == "pm"
173
- data["dio"] = data["f0method"] == "dio"
174
- data["harvest"] = data["f0method"] == "harvest"
175
- data["crepe"] = data["f0method"] == "crepe"
176
- data["rmvpe"] = data["f0method"] == "rmvpe"
177
- data["fcpe"] = data["f0method"] == "fcpe"
178
- if data["sg_hostapi"] in self.hostapis:
179
- self.update_devices(hostapi_name=data["sg_hostapi"])
180
- if (
181
- data["sg_input_device"] not in self.input_devices
182
- or data["sg_output_device"] not in self.output_devices
183
- ):
184
- self.update_devices()
185
- data["sg_hostapi"] = self.hostapis[0]
186
- data["sg_input_device"] = self.input_devices[
187
- self.input_devices_indices.index(sd.default.device[0])
188
- ]
189
- data["sg_output_device"] = self.output_devices[
190
- self.output_devices_indices.index(sd.default.device[1])
191
- ]
192
- else:
193
- data["sg_hostapi"] = self.hostapis[0]
194
- data["sg_input_device"] = self.input_devices[
195
- self.input_devices_indices.index(sd.default.device[0])
196
- ]
197
- data["sg_output_device"] = self.output_devices[
198
- self.output_devices_indices.index(sd.default.device[1])
199
- ]
200
- except:
201
- with open("configs/inuse/config.json", "w") as j:
202
- data = {
203
- "pth_path": "",
204
- "index_path": "",
205
- "sg_hostapi": self.hostapis[0],
206
- "sg_wasapi_exclusive": False,
207
- "sg_input_device": self.input_devices[
208
- self.input_devices_indices.index(sd.default.device[0])
209
- ],
210
- "sg_output_device": self.output_devices[
211
- self.output_devices_indices.index(sd.default.device[1])
212
- ],
213
- "sr_type": "sr_model",
214
- "threhold": -60,
215
- "pitch": 0,
216
- "formant": 0.0,
217
- "index_rate": 0,
218
- "rms_mix_rate": 0,
219
- "block_time": 0.25,
220
- "crossfade_length": 0.05,
221
- "extra_time": 2.5,
222
- "n_cpu": 4,
223
- "f0method": "rmvpe",
224
- "use_jit": False,
225
- "use_pv": False,
226
- }
227
- data["sr_model"] = data["sr_type"] == "sr_model"
228
- data["sr_device"] = data["sr_type"] == "sr_device"
229
- data["pm"] = data["f0method"] == "pm"
230
- data["dio"] = data["f0method"] == "dio"
231
- data["harvest"] = data["f0method"] == "harvest"
232
- data["crepe"] = data["f0method"] == "crepe"
233
- data["rmvpe"] = data["f0method"] == "rmvpe"
234
- data["fcpe"] = data["f0method"] == "fcpe"
235
- return data
236
-
237
- def launcher(self):
238
- data = self.load()
239
- self.config.use_jit = False # data.get("use_jit", self.config.use_jit)
240
- sg.theme("LightBlue3")
241
- layout = [
242
- [
243
- sg.Frame(
244
- title=i18n("Load model"),
245
- layout=[
246
- [
247
- sg.Input(
248
- default_text=data.get("pth_path", ""),
249
- key="pth_path",
250
- ),
251
- sg.FileBrowse(
252
- i18n("Select the .pth file"),
253
- initial_folder=os.path.join(
254
- os.getcwd(), "assets/weights"
255
- ),
256
- file_types=[("Model File", "*.pth")],
257
- ),
258
- ],
259
- [
260
- sg.Input(
261
- default_text=data.get("index_path", ""),
262
- key="index_path",
263
- ),
264
- sg.FileBrowse(
265
- i18n("Select the .index file"),
266
- initial_folder=os.path.join(os.getcwd(), "logs"),
267
- file_types=[("Index File", "*.index")],
268
- ),
269
- ],
270
- ],
271
- )
272
- ],
273
- [
274
- sg.Frame(
275
- layout=[
276
- [
277
- sg.Text(i18n("Device type")),
278
- sg.Combo(
279
- self.hostapis,
280
- key="sg_hostapi",
281
- default_value=data.get("sg_hostapi", ""),
282
- enable_events=True,
283
- size=(20, 1),
284
- ),
285
- sg.Checkbox(
286
- i18n("Takeover WASAPI device"),
287
- key="sg_wasapi_exclusive",
288
- default=data.get("sg_wasapi_exclusive", False),
289
- enable_events=True,
290
- ),
291
- ],
292
- [
293
- sg.Text(i18n("Input device")),
294
- sg.Combo(
295
- self.input_devices,
296
- key="sg_input_device",
297
- default_value=data.get("sg_input_device", ""),
298
- enable_events=True,
299
- size=(45, 1),
300
- ),
301
- ],
302
- [
303
- sg.Text(i18n("Output device")),
304
- sg.Combo(
305
- self.output_devices,
306
- key="sg_output_device",
307
- default_value=data.get("sg_output_device", ""),
308
- enable_events=True,
309
- size=(45, 1),
310
- ),
311
- ],
312
- [
313
- sg.Button(
314
- i18n("Reload device list"), key="reload_devices"
315
- ),
316
- sg.Radio(
317
- i18n("Choose sample rate of the model"),
318
- "sr_type",
319
- key="sr_model",
320
- default=data.get("sr_model", True),
321
- enable_events=True,
322
- ),
323
- sg.Radio(
324
- i18n("Choose sample rate of the device"),
325
- "sr_type",
326
- key="sr_device",
327
- default=data.get("sr_device", False),
328
- enable_events=True,
329
- ),
330
- sg.Text(i18n("Sampling rate")),
331
- sg.Text("", key="sr_stream"),
332
- ],
333
- ],
334
- title=i18n("Audio device"),
335
- )
336
- ],
337
- [
338
- sg.Frame(
339
- layout=[
340
- [
341
- sg.Text(i18n("Response threshold")),
342
- sg.Slider(
343
- range=(-60, 0),
344
- key="threhold",
345
- resolution=1,
346
- orientation="h",
347
- default_value=data.get("threhold", -60),
348
- enable_events=True,
349
- ),
350
- ],
351
- [
352
- sg.Text(i18n("Pitch settings")),
353
- sg.Slider(
354
- range=(-24, 24),
355
- key="pitch",
356
- resolution=1,
357
- orientation="h",
358
- default_value=data.get("pitch", 0),
359
- enable_events=True,
360
- ),
361
- ],
362
- [
363
- sg.Text(i18n("Formant offset")),
364
- sg.Slider(
365
- range=(-5, 5),
366
- key="formant",
367
- resolution=0.01,
368
- orientation="h",
369
- default_value=data.get("formant", 0.0),
370
- enable_events=True,
371
- ),
372
- ],
373
- [
374
- sg.Text(i18n("Feature searching ratio")),
375
- sg.Slider(
376
- range=(0.0, 1.0),
377
- key="index_rate",
378
- resolution=0.01,
379
- orientation="h",
380
- default_value=data.get("index_rate", 0),
381
- enable_events=True,
382
- ),
383
- ],
384
- [
385
- sg.Text(i18n("Loudness factor")),
386
- sg.Slider(
387
- range=(0.0, 1.0),
388
- key="rms_mix_rate",
389
- resolution=0.01,
390
- orientation="h",
391
- default_value=data.get("rms_mix_rate", 0),
392
- enable_events=True,
393
- ),
394
- ],
395
- [
396
- sg.Text(i18n("Pitch detection algorithm")),
397
- sg.Radio(
398
- "pm",
399
- "f0method",
400
- key="pm",
401
- default=data.get("pm", False),
402
- enable_events=True,
403
- ),
404
- sg.Radio(
405
- "dio",
406
- "f0method",
407
- key="dio",
408
- default=data.get("dio", False),
409
- enable_events=True,
410
- ),
411
- sg.Radio(
412
- "harvest",
413
- "f0method",
414
- key="harvest",
415
- default=data.get("harvest", False),
416
- enable_events=True,
417
- ),
418
- sg.Radio(
419
- "crepe",
420
- "f0method",
421
- key="crepe",
422
- default=data.get("crepe", False),
423
- enable_events=True,
424
- ),
425
- sg.Radio(
426
- "rmvpe",
427
- "f0method",
428
- key="rmvpe",
429
- default=data.get("rmvpe", False),
430
- enable_events=True,
431
- ),
432
- sg.Radio(
433
- "fcpe",
434
- "f0method",
435
- key="fcpe",
436
- default=data.get("fcpe", True),
437
- enable_events=True,
438
- ),
439
- ],
440
- ],
441
- title=i18n("General settings"),
442
- ),
443
- sg.Frame(
444
- layout=[
445
- [
446
- sg.Text(i18n("Sample length")),
447
- sg.Slider(
448
- range=(0.02, 1.5),
449
- key="block_time",
450
- resolution=0.01,
451
- orientation="h",
452
- default_value=data.get("block_time", 0.25),
453
- enable_events=True,
454
- ),
455
- ],
456
- # [
457
- # sg.Text("设备延迟"),
458
- # sg.Slider(
459
- # range=(0, 1),
460
- # key="device_latency",
461
- # resolution=0.001,
462
- # orientation="h",
463
- # default_value=data.get("device_latency", 0.1),
464
- # enable_events=True,
465
- # ),
466
- # ],
467
- [
468
- sg.Text(
469
- i18n(
470
- "Number of CPU processes used for harvest pitch algorithm"
471
- )
472
- ),
473
- sg.Slider(
474
- range=(1, n_cpu),
475
- key="n_cpu",
476
- resolution=1,
477
- orientation="h",
478
- default_value=data.get(
479
- "n_cpu", min(self.gui_config.n_cpu, n_cpu)
480
- ),
481
- enable_events=True,
482
- ),
483
- ],
484
- [
485
- sg.Text(i18n("Fade length")),
486
- sg.Slider(
487
- range=(0.01, 0.15),
488
- key="crossfade_length",
489
- resolution=0.01,
490
- orientation="h",
491
- default_value=data.get("crossfade_length", 0.05),
492
- enable_events=True,
493
- ),
494
- ],
495
- [
496
- sg.Text(i18n("Extra inference time")),
497
- sg.Slider(
498
- range=(0.05, 5.00),
499
- key="extra_time",
500
- resolution=0.01,
501
- orientation="h",
502
- default_value=data.get("extra_time", 2.5),
503
- enable_events=True,
504
- ),
505
- ],
506
- [
507
- sg.Checkbox(
508
- i18n("Input noise reduction"),
509
- key="I_noise_reduce",
510
- enable_events=True,
511
- ),
512
- sg.Checkbox(
513
- i18n("Output noise reduction"),
514
- key="O_noise_reduce",
515
- enable_events=True,
516
- ),
517
- sg.Checkbox(
518
- i18n("Enable phase vocoder"),
519
- key="use_pv",
520
- default=data.get("use_pv", False),
521
- enable_events=True,
522
- ),
523
- # sg.Checkbox(
524
- # "JIT加速",
525
- # default=self.config.use_jit,
526
- # key="use_jit",
527
- # enable_events=False,
528
- # ),
529
- ],
530
- # [sg.Text("注:首次使用JIT加速时,会出现卡顿,\n 并伴随一些噪音,但这是正常现象!")],
531
- ],
532
- title=i18n("Performance settings"),
533
- ),
534
- ],
535
- [
536
- sg.Button(i18n("Start audio conversion"), key="start_vc"),
537
- sg.Button(i18n("Stop audio conversion"), key="stop_vc"),
538
- sg.Radio(
539
- i18n("Input voice monitor"),
540
- "function",
541
- key="im",
542
- default=False,
543
- enable_events=True,
544
- ),
545
- sg.Radio(
546
- i18n("Output converted voice"),
547
- "function",
548
- key="vc",
549
- default=True,
550
- enable_events=True,
551
- ),
552
- sg.Text(i18n("Algorithmic delays (ms)")),
553
- sg.Text("0", key="delay_time"),
554
- sg.Text(i18n("Inference time (ms)")),
555
- sg.Text("0", key="infer_time"),
556
- ],
557
- ]
558
- self.window = sg.Window("RVC - GUI", layout=layout, finalize=True)
559
- self.event_handler()
560
-
561
- def event_handler(self):
562
- global flag_vc
563
- while True:
564
- event, values = self.window.read()
565
- if event == sg.WINDOW_CLOSED:
566
- self.stop_stream()
567
- exit()
568
- if event == "reload_devices" or event == "sg_hostapi":
569
- self.gui_config.sg_hostapi = values["sg_hostapi"]
570
- self.update_devices(hostapi_name=values["sg_hostapi"])
571
- if self.gui_config.sg_hostapi not in self.hostapis:
572
- self.gui_config.sg_hostapi = self.hostapis[0]
573
- self.window["sg_hostapi"].Update(values=self.hostapis)
574
- self.window["sg_hostapi"].Update(value=self.gui_config.sg_hostapi)
575
- if (
576
- self.gui_config.sg_input_device not in self.input_devices
577
- and len(self.input_devices) > 0
578
- ):
579
- self.gui_config.sg_input_device = self.input_devices[0]
580
- self.window["sg_input_device"].Update(values=self.input_devices)
581
- self.window["sg_input_device"].Update(
582
- value=self.gui_config.sg_input_device
583
- )
584
- if self.gui_config.sg_output_device not in self.output_devices:
585
- self.gui_config.sg_output_device = self.output_devices[0]
586
- self.window["sg_output_device"].Update(values=self.output_devices)
587
- self.window["sg_output_device"].Update(
588
- value=self.gui_config.sg_output_device
589
- )
590
- if event == "start_vc" and not flag_vc:
591
- if self.set_values(values) == True:
592
- printt("cuda_is_available: %s", torch.cuda.is_available())
593
- self.start_vc()
594
- settings = {
595
- "pth_path": values["pth_path"],
596
- "index_path": values["index_path"],
597
- "sg_hostapi": values["sg_hostapi"],
598
- "sg_wasapi_exclusive": values["sg_wasapi_exclusive"],
599
- "sg_input_device": values["sg_input_device"],
600
- "sg_output_device": values["sg_output_device"],
601
- "sr_type": ["sr_model", "sr_device"][
602
- [
603
- values["sr_model"],
604
- values["sr_device"],
605
- ].index(True)
606
- ],
607
- "threhold": values["threhold"],
608
- "pitch": values["pitch"],
609
- "formant": values["formant"],
610
- "rms_mix_rate": values["rms_mix_rate"],
611
- "index_rate": values["index_rate"],
612
- # "device_latency": values["device_latency"],
613
- "block_time": values["block_time"],
614
- "crossfade_length": values["crossfade_length"],
615
- "extra_time": values["extra_time"],
616
- "n_cpu": values["n_cpu"],
617
- # "use_jit": values["use_jit"],
618
- "use_jit": False,
619
- "use_pv": values["use_pv"],
620
- "f0method": [
621
- "pm",
622
- "dio",
623
- "harvest",
624
- "crepe",
625
- "rmvpe",
626
- "fcpe",
627
- ][
628
- [
629
- values["pm"],
630
- values["dio"],
631
- values["harvest"],
632
- values["crepe"],
633
- values["rmvpe"],
634
- values["fcpe"],
635
- ].index(True)
636
- ],
637
- }
638
- with open("configs/inuse/config.json", "w") as j:
639
- json.dump(settings, j)
640
- if self.stream is not None:
641
- self.delay_time = (
642
- self.stream.latency[-1]
643
- + values["block_time"]
644
- + values["crossfade_length"]
645
- + 0.01
646
- )
647
- if values["I_noise_reduce"]:
648
- self.delay_time += min(values["crossfade_length"], 0.04)
649
- self.window["sr_stream"].update(self.gui_config.samplerate)
650
- self.window["delay_time"].update(
651
- int(np.round(self.delay_time * 1000))
652
- )
653
- # Parameter hot update
654
- if event == "threhold":
655
- self.gui_config.threhold = values["threhold"]
656
- elif event == "pitch":
657
- self.gui_config.pitch = values["pitch"]
658
- if hasattr(self, "rvc"):
659
- self.rvc.set_key(values["pitch"])
660
- elif event == "formant":
661
- self.gui_config.formant = values["formant"]
662
- if hasattr(self, "rvc"):
663
- self.rvc.set_formant(values["formant"])
664
- elif event == "index_rate":
665
- self.gui_config.index_rate = values["index_rate"]
666
- if hasattr(self, "rvc"):
667
- self.rvc.set_index_rate(values["index_rate"])
668
- elif event == "rms_mix_rate":
669
- self.gui_config.rms_mix_rate = values["rms_mix_rate"]
670
- elif event in ["pm", "dio", "harvest", "crepe", "rmvpe", "fcpe"]:
671
- self.gui_config.f0method = event
672
- elif event == "I_noise_reduce":
673
- self.gui_config.I_noise_reduce = values["I_noise_reduce"]
674
- if self.stream is not None:
675
- self.delay_time += (
676
- 1 if values["I_noise_reduce"] else -1
677
- ) * min(values["crossfade_length"], 0.04)
678
- self.window["delay_time"].update(
679
- int(np.round(self.delay_time * 1000))
680
- )
681
- elif event == "O_noise_reduce":
682
- self.gui_config.O_noise_reduce = values["O_noise_reduce"]
683
- elif event == "use_pv":
684
- self.gui_config.use_pv = values["use_pv"]
685
- elif event in ["vc", "im"]:
686
- self.function = event
687
- elif event == "stop_vc" or event != "start_vc":
688
- # Other parameters do not support hot update
689
- self.stop_stream()
690
-
691
- def set_values(self, values):
692
- if len(values["pth_path"].strip()) == 0:
693
- sg.popup(i18n("Please choose the .pth file"))
694
- return False
695
- if len(values["index_path"].strip()) == 0:
696
- sg.popup(i18n("Please choose the .index file"))
697
- return False
698
- pattern = re.compile("[^\x00-\x7F]+")
699
- if pattern.findall(values["pth_path"]):
700
- sg.popup(i18n("pth path cannot contain unicode characters"))
701
- return False
702
- if pattern.findall(values["index_path"]):
703
- sg.popup(i18n("index path cannot contain unicode characters"))
704
- return False
705
- self.set_devices(values["sg_input_device"], values["sg_output_device"])
706
- self.config.use_jit = False # values["use_jit"]
707
- # self.device_latency = values["device_latency"]
708
- self.gui_config.sg_hostapi = values["sg_hostapi"]
709
- self.gui_config.sg_wasapi_exclusive = values["sg_wasapi_exclusive"]
710
- self.gui_config.sg_input_device = values["sg_input_device"]
711
- self.gui_config.sg_output_device = values["sg_output_device"]
712
- self.gui_config.pth_path = values["pth_path"]
713
- self.gui_config.index_path = values["index_path"]
714
- self.gui_config.sr_type = ["sr_model", "sr_device"][
715
- [
716
- values["sr_model"],
717
- values["sr_device"],
718
- ].index(True)
719
- ]
720
- self.gui_config.threhold = values["threhold"]
721
- self.gui_config.pitch = values["pitch"]
722
- self.gui_config.formant = values["formant"]
723
- self.gui_config.block_time = values["block_time"]
724
- self.gui_config.crossfade_time = values["crossfade_length"]
725
- self.gui_config.extra_time = values["extra_time"]
726
- self.gui_config.I_noise_reduce = values["I_noise_reduce"]
727
- self.gui_config.O_noise_reduce = values["O_noise_reduce"]
728
- self.gui_config.use_pv = values["use_pv"]
729
- self.gui_config.rms_mix_rate = values["rms_mix_rate"]
730
- self.gui_config.index_rate = values["index_rate"]
731
- self.gui_config.n_cpu = values["n_cpu"]
732
- self.gui_config.f0method = [
733
- "pm",
734
- "dio",
735
- "harvest",
736
- "crepe",
737
- "rmvpe",
738
- "fcpe",
739
- ][
740
- [
741
- values["pm"],
742
- values["dio"],
743
- values["harvest"],
744
- values["crepe"],
745
- values["rmvpe"],
746
- values["fcpe"],
747
- ].index(True)
748
- ]
749
- return True
750
-
751
- def start_vc(self):
752
- torch.cuda.empty_cache()
753
- self.rvc = rtrvc.RVC(
754
- self.gui_config.pitch,
755
- self.gui_config.formant,
756
- self.gui_config.pth_path,
757
- self.gui_config.index_path,
758
- self.gui_config.index_rate,
759
- self.gui_config.n_cpu,
760
- self.config.device,
761
- self.config.use_jit,
762
- self.config.is_half,
763
- self.config.dml,
764
- )
765
- self.gui_config.samplerate = (
766
- self.rvc.tgt_sr
767
- if self.gui_config.sr_type == "sr_model"
768
- else self.get_device_samplerate()
769
- )
770
- self.gui_config.channels = self.get_device_channels()
771
- self.zc = self.gui_config.samplerate // 100
772
- self.block_frame = (
773
- int(
774
- np.round(
775
- self.gui_config.block_time
776
- * self.gui_config.samplerate
777
- / self.zc
778
- )
779
- )
780
- * self.zc
781
- )
782
- self.block_frame_16k = 160 * self.block_frame // self.zc
783
- self.crossfade_frame = (
784
- int(
785
- np.round(
786
- self.gui_config.crossfade_time
787
- * self.gui_config.samplerate
788
- / self.zc
789
- )
790
- )
791
- * self.zc
792
- )
793
- self.sola_buffer_frame = min(self.crossfade_frame, 4 * self.zc)
794
- self.sola_search_frame = self.zc
795
- self.extra_frame = (
796
- int(
797
- np.round(
798
- self.gui_config.extra_time
799
- * self.gui_config.samplerate
800
- / self.zc
801
- )
802
- )
803
- * self.zc
804
- )
805
- self.input_wav: torch.Tensor = torch.zeros(
806
- self.extra_frame
807
- + self.crossfade_frame
808
- + self.sola_search_frame
809
- + self.block_frame,
810
- device=self.config.device,
811
- dtype=torch.float32,
812
- )
813
- self.input_wav_denoise: torch.Tensor = self.input_wav.clone()
814
- self.input_wav_res: torch.Tensor = torch.zeros(
815
- 160 * self.input_wav.shape[0] // self.zc,
816
- device=self.config.device,
817
- dtype=torch.float32,
818
- )
819
- self.rms_buffer: np.ndarray = np.zeros(4 * self.zc, dtype="float32")
820
- self.sola_buffer: torch.Tensor = torch.zeros(
821
- self.sola_buffer_frame, device=self.config.device, dtype=torch.float32
822
- )
823
- self.nr_buffer: torch.Tensor = self.sola_buffer.clone()
824
- self.output_buffer: torch.Tensor = self.input_wav.clone()
825
- self.skip_head = self.extra_frame // self.zc
826
- self.return_length = (
827
- self.block_frame + self.sola_buffer_frame + self.sola_search_frame
828
- ) // self.zc
829
- self.fade_in_window: torch.Tensor = (
830
- torch.sin(
831
- 0.5
832
- * np.pi
833
- * torch.linspace(
834
- 0.0,
835
- 1.0,
836
- steps=self.sola_buffer_frame,
837
- device=self.config.device,
838
- dtype=torch.float32,
839
- )
840
- )
841
- ** 2
842
- )
843
- self.fade_out_window: torch.Tensor = 1 - self.fade_in_window
844
- self.resampler = tat.Resample(
845
- orig_freq=self.gui_config.samplerate,
846
- new_freq=16000,
847
- dtype=torch.float32,
848
- ).to(self.config.device)
849
- if self.rvc.tgt_sr != self.gui_config.samplerate:
850
- self.resampler2 = tat.Resample(
851
- orig_freq=self.rvc.tgt_sr,
852
- new_freq=self.gui_config.samplerate,
853
- dtype=torch.float32,
854
- ).to(self.config.device)
855
- else:
856
- self.resampler2 = None
857
- self.tg = TorchGate(
858
- sr=self.gui_config.samplerate, n_fft=4 * self.zc, prop_decrease=0.9
859
- ).to(self.config.device)
860
- self.start_stream()
861
-
862
- def start_stream(self):
863
- global flag_vc
864
- if not flag_vc:
865
- flag_vc = True
866
- if (
867
- "WASAPI" in self.gui_config.sg_hostapi
868
- and self.gui_config.sg_wasapi_exclusive
869
- ):
870
- extra_settings = sd.WasapiSettings(exclusive=True)
871
- else:
872
- extra_settings = None
873
- self.stream = sd.Stream(
874
- callback=self.audio_callback,
875
- blocksize=self.block_frame,
876
- samplerate=self.gui_config.samplerate,
877
- channels=self.gui_config.channels,
878
- dtype="float32",
879
- extra_settings=extra_settings,
880
- )
881
- self.stream.start()
882
-
883
- def stop_stream(self):
884
- global flag_vc
885
- if flag_vc:
886
- flag_vc = False
887
- if self.stream is not None:
888
- self.stream.abort()
889
- self.stream.close()
890
- self.stream = None
891
-
892
- def audio_callback(
893
- self, indata: np.ndarray, outdata: np.ndarray, frames, times, status
894
- ):
895
- """
896
- Audio processing
897
- """
898
- global flag_vc
899
- start_time = time.perf_counter()
900
- indata = librosa.to_mono(indata.T)
901
- if self.gui_config.threhold > -60:
902
- indata = np.append(self.rms_buffer, indata)
903
- rms = librosa.feature.rms(
904
- y=indata, frame_length=4 * self.zc, hop_length=self.zc
905
- )[:, 2:]
906
- self.rms_buffer[:] = indata[-4 * self.zc :]
907
- indata = indata[2 * self.zc - self.zc // 2 :]
908
- db_threhold = (
909
- librosa.amplitude_to_db(rms, ref=1.0)[0] < self.gui_config.threhold
910
- )
911
- for i in range(db_threhold.shape[0]):
912
- if db_threhold[i]:
913
- indata[i * self.zc : (i + 1) * self.zc] = 0
914
- indata = indata[self.zc // 2 :]
915
- self.input_wav[: -self.block_frame] = self.input_wav[
916
- self.block_frame :
917
- ].clone()
918
- self.input_wav[-indata.shape[0] :] = torch.from_numpy(indata).to(
919
- self.config.device
920
- )
921
- self.input_wav_res[: -self.block_frame_16k] = self.input_wav_res[
922
- self.block_frame_16k :
923
- ].clone()
924
- # input noise reduction and resampling
925
- if self.gui_config.I_noise_reduce:
926
- self.input_wav_denoise[: -self.block_frame] = self.input_wav_denoise[
927
- self.block_frame :
928
- ].clone()
929
- input_wav = self.input_wav[-self.sola_buffer_frame - self.block_frame :]
930
- input_wav = self.tg(
931
- input_wav.unsqueeze(0), self.input_wav.unsqueeze(0)
932
- ).squeeze(0)
933
- input_wav[: self.sola_buffer_frame] *= self.fade_in_window
934
- input_wav[: self.sola_buffer_frame] += (
935
- self.nr_buffer * self.fade_out_window
936
- )
937
- self.input_wav_denoise[-self.block_frame :] = input_wav[
938
- : self.block_frame
939
- ]
940
- self.nr_buffer[:] = input_wav[self.block_frame :]
941
- self.input_wav_res[-self.block_frame_16k - 160 :] = self.resampler(
942
- self.input_wav_denoise[-self.block_frame - 2 * self.zc :]
943
- )[160:]
944
- else:
945
- self.input_wav_res[-160 * (indata.shape[0] // self.zc + 1) :] = (
946
- self.resampler(self.input_wav[-indata.shape[0] - 2 * self.zc :])[
947
- 160:
948
- ]
949
- )
950
- # infer
951
- if self.function == "vc":
952
- infer_wav = self.rvc.infer(
953
- self.input_wav_res,
954
- self.block_frame_16k,
955
- self.skip_head,
956
- self.return_length,
957
- self.gui_config.f0method,
958
- )
959
- if self.resampler2 is not None:
960
- infer_wav = self.resampler2(infer_wav)
961
- elif self.gui_config.I_noise_reduce:
962
- infer_wav = self.input_wav_denoise[self.extra_frame :].clone()
963
- else:
964
- infer_wav = self.input_wav[self.extra_frame :].clone()
965
- # output noise reduction
966
- if self.gui_config.O_noise_reduce and self.function == "vc":
967
- self.output_buffer[: -self.block_frame] = self.output_buffer[
968
- self.block_frame :
969
- ].clone()
970
- self.output_buffer[-self.block_frame :] = infer_wav[-self.block_frame :]
971
- infer_wav = self.tg(
972
- infer_wav.unsqueeze(0), self.output_buffer.unsqueeze(0)
973
- ).squeeze(0)
974
- # volume envelop mixing
975
- if self.gui_config.rms_mix_rate < 1 and self.function == "vc":
976
- if self.gui_config.I_noise_reduce:
977
- input_wav = self.input_wav_denoise[self.extra_frame :]
978
- else:
979
- input_wav = self.input_wav[self.extra_frame :]
980
- rms1 = librosa.feature.rms(
981
- y=input_wav[: infer_wav.shape[0]].cpu().numpy(),
982
- frame_length=4 * self.zc,
983
- hop_length=self.zc,
984
- )
985
- rms1 = torch.from_numpy(rms1).to(self.config.device)
986
- rms1 = F.interpolate(
987
- rms1.unsqueeze(0),
988
- size=infer_wav.shape[0] + 1,
989
- mode="linear",
990
- align_corners=True,
991
- )[0, 0, :-1]
992
- rms2 = librosa.feature.rms(
993
- y=infer_wav[:].cpu().numpy(),
994
- frame_length=4 * self.zc,
995
- hop_length=self.zc,
996
- )
997
- rms2 = torch.from_numpy(rms2).to(self.config.device)
998
- rms2 = F.interpolate(
999
- rms2.unsqueeze(0),
1000
- size=infer_wav.shape[0] + 1,
1001
- mode="linear",
1002
- align_corners=True,
1003
- )[0, 0, :-1]
1004
- rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-3)
1005
- infer_wav *= torch.pow(
1006
- rms1 / rms2, torch.tensor(1 - self.gui_config.rms_mix_rate)
1007
- )
1008
- # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC
1009
- conv_input = infer_wav[
1010
- None, None, : self.sola_buffer_frame + self.sola_search_frame
1011
- ]
1012
- cor_nom = F.conv1d(conv_input, self.sola_buffer[None, None, :])
1013
- cor_den = torch.sqrt(
1014
- F.conv1d(
1015
- conv_input**2,
1016
- torch.ones(1, 1, self.sola_buffer_frame, device=self.config.device),
1017
- )
1018
- + 1e-8
1019
- )
1020
- if sys.platform == "darwin":
1021
- _, sola_offset = torch.max(cor_nom[0, 0] / cor_den[0, 0])
1022
- sola_offset = sola_offset.item()
1023
- else:
1024
- sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0])
1025
- # printt("sola_offset = %d", int(sola_offset))
1026
- infer_wav = infer_wav[sola_offset:]
1027
- if "privateuseone" in str(self.config.device) or not self.gui_config.use_pv:
1028
- infer_wav[: self.sola_buffer_frame] *= self.fade_in_window
1029
- infer_wav[: self.sola_buffer_frame] += (
1030
- self.sola_buffer * self.fade_out_window
1031
- )
1032
- else:
1033
- infer_wav[: self.sola_buffer_frame] = phase_vocoder(
1034
- self.sola_buffer,
1035
- infer_wav[: self.sola_buffer_frame],
1036
- self.fade_out_window,
1037
- self.fade_in_window,
1038
- )
1039
- self.sola_buffer[:] = infer_wav[
1040
- self.block_frame : self.block_frame + self.sola_buffer_frame
1041
- ]
1042
- outdata[:] = (
1043
- infer_wav[: self.block_frame]
1044
- .repeat(self.gui_config.channels, 1)
1045
- .t()
1046
- .cpu()
1047
- .numpy()
1048
- )
1049
- total_time = time.perf_counter() - start_time
1050
- if flag_vc:
1051
- self.window["infer_time"].update(int(total_time * 1000))
1052
- # printt("Infer time: %.2f", total_time)
1053
-
1054
- def update_devices(self, hostapi_name=None):
1055
- """获取设备列表"""
1056
- global flag_vc
1057
- flag_vc = False
1058
- sd._terminate()
1059
- sd._initialize()
1060
- devices = sd.query_devices()
1061
- hostapis = sd.query_hostapis()
1062
- for hostapi in hostapis:
1063
- for device_idx in hostapi["devices"]:
1064
- devices[device_idx]["hostapi_name"] = hostapi["name"]
1065
- self.hostapis = [hostapi["name"] for hostapi in hostapis]
1066
- if hostapi_name not in self.hostapis:
1067
- hostapi_name = self.hostapis[0]
1068
- self.input_devices = [
1069
- d["name"]
1070
- for d in devices
1071
- if d["max_input_channels"] > 0 and d["hostapi_name"] == hostapi_name
1072
- ]
1073
- self.output_devices = [
1074
- d["name"]
1075
- for d in devices
1076
- if d["max_output_channels"] > 0 and d["hostapi_name"] == hostapi_name
1077
- ]
1078
- self.input_devices_indices = [
1079
- d["index"] if "index" in d else d["name"]
1080
- for d in devices
1081
- if d["max_input_channels"] > 0 and d["hostapi_name"] == hostapi_name
1082
- ]
1083
- self.output_devices_indices = [
1084
- d["index"] if "index" in d else d["name"]
1085
- for d in devices
1086
- if d["max_output_channels"] > 0 and d["hostapi_name"] == hostapi_name
1087
- ]
1088
-
1089
- def set_devices(self, input_device, output_device):
1090
- """设置输出设备"""
1091
- sd.default.device[0] = self.input_devices_indices[
1092
- self.input_devices.index(input_device)
1093
- ]
1094
- sd.default.device[1] = self.output_devices_indices[
1095
- self.output_devices.index(output_device)
1096
- ]
1097
- printt("Input device: %s:%s", str(sd.default.device[0]), input_device)
1098
- printt("Output device: %s:%s", str(sd.default.device[1]), output_device)
1099
-
1100
- def get_device_samplerate(self):
1101
- return int(
1102
- sd.query_devices(device=sd.default.device[0])["default_samplerate"]
1103
- )
1104
-
1105
- def get_device_channels(self):
1106
- max_input_channels = sd.query_devices(device=sd.default.device[0])[
1107
- "max_input_channels"
1108
- ]
1109
- max_output_channels = sd.query_devices(device=sd.default.device[1])[
1110
- "max_output_channels"
1111
- ]
1112
- return min(max_input_channels, max_output_channels, 2)
1113
-
1114
- gui = GUI()