Higobeatz committed on
Commit
1e95c1f
·
1 Parent(s): bbf1909

openvoice_plugin

Browse files
Files changed (46) hide show
  1. dreamvoice/.ipynb_checkpoints/__init__-checkpoint.py +2 -0
  2. dreamvoice/.ipynb_checkpoints/api-checkpoint.py +295 -0
  3. dreamvoice/.ipynb_checkpoints/dreamvc-checkpoint.yaml +27 -0
  4. dreamvoice/.ipynb_checkpoints/openvoice_utils-checkpoint.py +48 -0
  5. dreamvoice/.ipynb_checkpoints/plugin-checkpoint.py +128 -0
  6. dreamvoice/.ipynb_checkpoints/plugin-checkpoint.yaml +8 -0
  7. dreamvoice/__init__.py +2 -1
  8. dreamvoice/__pycache__/__init__.cpython-310.pyc +0 -0
  9. dreamvoice/__pycache__/api.cpython-310.pyc +0 -0
  10. dreamvoice/__pycache__/openvoice_utils.cpython-310.pyc +0 -0
  11. dreamvoice/__pycache__/plugin.cpython-310.pyc +0 -0
  12. dreamvoice/dreamvc.yaml +2 -1
  13. dreamvoice/openvoice_utils.py +48 -0
  14. dreamvoice/plugin.py +128 -0
  15. dreamvoice/plugin.yaml +8 -0
  16. dreamvoice/plugin_ckpts/openvoice_v2.pt +3 -0
  17. dreamvoice/src/.ipynb_checkpoints/plugin_wrapper-checkpoint.py +76 -0
  18. dreamvoice/src/.ipynb_checkpoints/vc_wrapper-checkpoint.py +144 -0
  19. dreamvoice/src/__pycache__/plugin_wrapper.cpython-310.pyc +0 -0
  20. dreamvoice/src/__pycache__/vc_wrapper.cpython-310.pyc +0 -0
  21. dreamvoice/src/configs/.ipynb_checkpoints/plugin_cross-checkpoint.yaml +39 -0
  22. dreamvoice/src/configs/.ipynb_checkpoints/plugin_cross_openvoice-checkpoint.yaml +39 -0
  23. dreamvoice/src/configs/plugin_cross_openvoice.yaml +39 -0
  24. dreamvoice/src/feats/__pycache__/contentvec_hf.cpython-310.pyc +0 -0
  25. dreamvoice/src/model/__pycache__/model.cpython-310.pyc +0 -0
  26. dreamvoice/src/model/__pycache__/model_cross.cpython-310.pyc +0 -0
  27. dreamvoice/src/model/__pycache__/p2e_cross.cpython-310.pyc +0 -0
  28. dreamvoice/src/modules/BigVGAN/__pycache__/env.cpython-310.pyc +0 -0
  29. dreamvoice/src/modules/BigVGAN/__pycache__/inference.cpython-310.pyc +0 -0
  30. dreamvoice/src/modules/BigVGAN/__pycache__/models.cpython-310.pyc +0 -0
  31. dreamvoice/src/modules/BigVGAN/__pycache__/utils.cpython-310.pyc +0 -0
  32. dreamvoice/src/modules/BigVGAN/activations/__pycache__/activations.cpython-310.pyc +0 -0
  33. dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-310.pyc +0 -0
  34. dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/act.cpython-310.pyc +0 -0
  35. dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/filter.cpython-310.pyc +0 -0
  36. dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/resample.cpython-310.pyc +0 -0
  37. dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/__init__.cpython-310.pyc +0 -0
  38. dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/audio.cpython-310.pyc +0 -0
  39. dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/inference.cpython-310.pyc +0 -0
  40. dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/model.cpython-310.pyc +0 -0
  41. dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_data.cpython-310.pyc +0 -0
  42. dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_model.cpython-310.pyc +0 -0
  43. dreamvoice/src/train_plugin.py +0 -0
  44. dreamvoice/src/train_vc.py +0 -0
  45. dreamvoice/src/utils/__pycache__/__init__.cpython-310.pyc +0 -0
  46. dreamvoice/src/utils/__pycache__/utils.cpython-310.pyc +0 -0
dreamvoice/.ipynb_checkpoints/__init__-checkpoint.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .api import DreamVoice
2
+ from .plugin import DreamVoice_Plugin
dreamvoice/.ipynb_checkpoints/api-checkpoint.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ import yaml
4
+ import torch
5
+ import librosa
6
+ import numpy as np
7
+ import soundfile as sf
8
+ from pathlib import Path
9
+ from transformers import T5Tokenizer, T5EncoderModel
10
+ from tqdm import tqdm
11
+ from .src.vc_wrapper import ReDiffVC, DreamVC
12
+ from .src.plugin_wrapper import DreamVG
13
+ from .src.modules.speaker_encoder.encoder import inference as spk_encoder
14
+ from .src.modules.BigVGAN.inference import load_model as load_vocoder
15
+ from .src.feats.contentvec_hf import get_content_model, get_content
16
+
17
+
18
class DreamVoice:
    """Voice conversion front end driven by a text prompt or a reference speaker.

    Two modes are supported (selected with ``mode``):

    * ``'plugin'``  - ``DreamVG`` turns the text prompt into a speaker embedding,
      which ``ReDiffVC`` then uses to convert the content audio.  This mode also
      supports ``simplevc`` (conversion from a reference recording or a cached
      embedding).
    * ``'end2end'`` - ``DreamVC`` is conditioned directly on the encoded prompt.

    The generated features are turned into a waveform by the vocoder loaded
    from ``config['vocoder_path']``.
    """

    def __init__(self, config='dreamvc.yaml', mode='plugin', device='cuda', chunk_size=16):
        """Load the configuration, fetch missing checkpoints and build all models.

        Args:
            config: File name of the YAML config, resolved relative to this module.
            mode: ``'plugin'`` or ``'end2end'``.
            device: Torch device string the models are moved to.
            chunk_size: Chunk length used when converting long inputs; it is
                multiplied by 50 to obtain the chunk length in content frames.

        Raises:
            NotImplementedError: If ``mode`` is not one of the supported values.
        """
        # Fail fast: reject an unknown mode before any download / model loading.
        if mode not in ('plugin', 'end2end'):
            raise NotImplementedError("Select mode from 'plugin' and 'end2end'")

        script_dir = Path(__file__).resolve().parent
        with open(script_dir / config, 'r') as fp:
            self.config = yaml.safe_load(fp)
        self.script_dir = script_dir

        # Ensure all checkpoints are downloaded
        self._ensure_checkpoints_exist()

        self.device = device
        self.sr = self.config['sample_rate']

        # Vocoder
        vocoder_path = script_dir / self.config['vocoder_path']
        self.hifigan, _ = load_vocoder(vocoder_path, device)
        self.hifigan.eval()

        # Content model
        self.content_model = get_content_model().to(device)

        # Tokenizer and text encoder
        lm_path = self.config['lm_path']
        self.tokenizer = T5Tokenizer.from_pretrained(lm_path)
        self.text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval()

        self.mode = mode
        if mode == 'plugin':
            self._init_plugin_mode()
        else:
            self._init_end2end_mode()

        # Chunk length in content frames.
        # NOTE(review): the original comment said "chunk inputs to 10s clips";
        # presumably the content model emits 50 frames per second, in which case
        # the default of 16 corresponds to 16 s - TODO confirm.
        self.chunk_size = chunk_size * 50

    def _ensure_checkpoints_exist(self):
        """Download every configured checkpoint that is not on disk yet.

        Entries without a configured local path or without a URL are skipped.
        """
        checkpoints = [
            ('vocoder_path', self.config.get('vocoder_url')),
            ('vocoder_config_path', self.config.get('vocoder_config_url')),
            ('speaker_path', self.config.get('speaker_url')),
            ('dreamvc.ckpt_path', self.config.get('dreamvc', {}).get('ckpt_url')),
            ('rediffvc.ckpt_path', self.config.get('rediffvc', {}).get('ckpt_url')),
            ('dreamvg.ckpt_path', self.config.get('dreamvg', {}).get('ckpt_url')),
        ]

        for path_key, url in checkpoints:
            local_path = self._get_local_path(path_key)
            if local_path is None or not url:
                continue
            if not local_path.exists():
                print(f"Downloading {path_key} from {url}")
                self._download_file(url, local_path)

    def _get_local_path(self, path_key):
        """Resolve a dotted config key (e.g. ``'dreamvc.ckpt_path'``) to a path.

        Returns:
            ``script_dir / <configured value>``, or ``None`` when the key is
            absent or does not hold a string (previously this raised a
            ``TypeError`` from ``Path / dict``).
        """
        node = self.config
        for key in path_key.split('.'):
            if not isinstance(node, dict) or key not in node:
                return None
            node = node[key]
        if not isinstance(node, str):
            return None
        return self.script_dir / node

    def _download_file(self, url, local_path):
        """Stream ``url`` to ``local_path``, retrying once with a user-supplied token.

        If the anonymous request fails, the user is prompted for a Hugging Face
        API key and the request is retried with a bearer token.

        Raises:
            requests.exceptions.RequestException: If the retry fails as well.
        """
        try:
            response = requests.get(url, stream=True)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error encountered: {e}")

            # Development mode: prompt user for Hugging Face API key
            user_input = input("Private checkpoint, please request authorization and enter your Hugging Face API key.")
            self.hf_key = user_input if user_input else None
            headers = {'Authorization': f'Bearer {self.hf_key}'} if self.hf_key else {}

            try:
                response = requests.get(url, stream=True, headers=headers)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Error encountered in dev mode: {e}")
                # Propagate instead of continuing with ``response = None``,
                # which used to crash below with an AttributeError.
                raise

        local_path.parent.mkdir(parents=True, exist_ok=True)

        # Write to a sibling ".part" file and rename at the end, so that an
        # interrupted download is never mistaken for a complete checkpoint by
        # the ``exists()`` check in ``_ensure_checkpoints_exist``.
        partial_path = local_path.with_name(local_path.name + '.part')
        total_size = int(response.headers.get('content-length', 0))
        block_size = 8192
        try:
            with open(partial_path, 'wb') as f, \
                    tqdm(total=total_size, unit='iB', unit_scale=True) as progress:
                for chunk in response.iter_content(chunk_size=block_size):
                    progress.update(len(chunk))
                    f.write(chunk)
        finally:
            response.close()
        partial_path.replace(local_path)

    def _init_plugin_mode(self):
        """Build ReDiffVC, DreamVG and the speaker encoder for plugin mode."""
        self.dreamvc = ReDiffVC(
            config_path=self.script_dir / self.config['rediffvc']['config_path'],
            ckpt_path=self.script_dir / self.config['rediffvc']['ckpt_path'],
            device=self.device
        )

        self.dreamvg = DreamVG(
            config_path=self.script_dir / self.config['dreamvg']['config_path'],
            ckpt_path=self.script_dir / self.config['dreamvg']['ckpt_path'],
            device=self.device
        )

        # The speaker encoder is a module-level singleton; keep a handle on it.
        spk_encoder.load_model(self.script_dir / self.config['speaker_path'], self.device)
        self.spk_encoder = spk_encoder
        self.spk_embed_cache = None

    def _init_end2end_mode(self):
        """Build the text-conditioned DreamVC model for end2end mode."""
        self.dreamvc = DreamVC(
            config_path=self.script_dir / self.config['dreamvc']['config_path'],
            ckpt_path=self.script_dir / self.config['dreamvc']['ckpt_path'],
            device=self.device
        )

    def _load_content(self, audio_path):
        """Load audio at 16 kHz and return its content features.

        The waveform is zero-padded up to the next multiple of ``16 * 160``
        samples before it is passed to the content model.
        """
        content_audio, _ = librosa.load(audio_path, sr=16000)
        frame = 16 * 160
        target_length = ((len(content_audio) + frame - 1) // frame) * frame
        if len(content_audio) < target_length:
            content_audio = np.pad(content_audio, (0, target_length - len(content_audio)), mode='constant')
        content_audio = torch.tensor(content_audio).unsqueeze(0).to(self.device)
        return get_content(self.content_model, content_audio)

    def _iter_content_chunks(self, content_clip):
        """Yield consecutive slices of ``content_clip`` along dim 1, ``chunk_size`` long."""
        total = content_clip.shape[1]
        for start in range(0, total, self.chunk_size):
            yield content_clip[:, start:start + self.chunk_size, :]

    def _run_chunked(self, content_clip, infer_chunk):
        """Apply ``infer_chunk`` to every content chunk and concatenate on the last dim."""
        outputs = [infer_chunk(chunk) for chunk in self._iter_content_chunks(content_clip)]
        return torch.cat(outputs, dim=-1)

    def _vocode(self, gen_audio):
        """Run the vocoder and return the result as a NumPy array with the two leading dims squeezed."""
        waveform = self.hifigan(gen_audio.squeeze(1))
        return waveform.cpu().numpy().squeeze(0).squeeze(0)

    def load_spk_embed(self, emb_path):
        """Load a saved speaker embedding into the cache (mapped to ``self.device``)."""
        self.spk_embed_cache = torch.load(emb_path, map_location=self.device)

    def save_spk_embed(self, emb_path):
        """Save the cached speaker embedding (moved to CPU) to ``emb_path``."""
        assert self.spk_embed_cache is not None, "no speaker embedding cached yet"
        torch.save(self.spk_embed_cache.cpu(), emb_path)

    def save_audio(self, output_path, audio, sr):
        """Write ``audio`` to ``output_path`` with sample rate ``sr``."""
        sf.write(output_path, audio, samplerate=sr)

    @torch.no_grad()
    def genvc(self, content_audio, prompt,
              prompt_guidance_scale=3, prompt_guidance_rescale=0.0,
              prompt_ddim_steps=100, prompt_eta=1, prompt_random_seed=None,
              vc_guidance_scale=3, vc_guidance_rescale=0.0,
              vc_ddim_steps=50, vc_eta=1, vc_random_seed=None,
              ):
        """Convert ``content_audio`` to a voice described by the text ``prompt``.

        In plugin mode the ``prompt_*`` arguments control speaker-embedding
        generation and the ``vc_*`` arguments control conversion; the generated
        embedding is stored in ``self.spk_embed_cache``.  In end2end mode only
        the ``prompt_*`` arguments are used.

        Returns:
            ``(audio, sample_rate)`` where ``audio`` is a NumPy array.
        """
        content_clip = self._load_content(content_audio)

        text_batch = self.tokenizer(prompt, max_length=32,
                                    padding='max_length', truncation=True, return_tensors="pt")
        text_ids = text_batch.input_ids.to(self.device)
        text_mask = text_batch.attention_mask.to(self.device)
        text = self.text_encoder(input_ids=text_ids, attention_mask=text_mask)[0]

        if self.mode == 'plugin':
            spk_embed = self.dreamvg.inference([text, text_mask],
                                               guidance_scale=prompt_guidance_scale,
                                               guidance_rescale=prompt_guidance_rescale,
                                               ddim_steps=prompt_ddim_steps, eta=prompt_eta,
                                               random_seed=prompt_random_seed)

            gen_audio = self._run_chunked(
                content_clip,
                lambda chunk: self.dreamvc.inference(
                    spk_embed, chunk, None,
                    guidance_scale=vc_guidance_scale,
                    guidance_rescale=vc_guidance_rescale,
                    ddim_steps=vc_ddim_steps,
                    eta=vc_eta,
                    random_seed=vc_random_seed))

            self.spk_embed_cache = spk_embed

        elif self.mode == 'end2end':
            # Each call receives only its own chunk.  The previous code passed
            # the full ``content_clip`` on every iteration, so the whole
            # utterance was regenerated once per chunk and then concatenated.
            gen_audio = self._run_chunked(
                content_clip,
                lambda chunk: self.dreamvc.inference(
                    [text, text_mask], chunk,
                    guidance_scale=prompt_guidance_scale,
                    guidance_rescale=prompt_guidance_rescale,
                    ddim_steps=prompt_ddim_steps,
                    eta=prompt_eta, random_seed=prompt_random_seed))

        else:
            raise NotImplementedError("Select mode from 'plugin' and 'end2end'")

        return self._vocode(gen_audio), self.sr

    @torch.no_grad()
    def simplevc(self, content_audio, speaker_audio=None, use_spk_cache=False,
                 vc_guidance_scale=3, vc_guidance_rescale=0.0,
                 vc_ddim_steps=50, vc_eta=1, vc_random_seed=None,
                 ):
        """Convert ``content_audio`` to a reference or cached speaker (plugin mode only).

        Args:
            content_audio: Path of the audio whose content is kept.
            speaker_audio: Optional path of a reference recording; when given,
                its embedding is computed and also stored in the cache.
            use_spk_cache: Use ``self.spk_embed_cache`` when no reference is given.

        Returns:
            ``(audio, sample_rate)`` where ``audio`` is a NumPy array.

        Raises:
            NotImplementedError: If neither a reference nor the cache is selected.
        """
        assert self.mode == 'plugin', "simplevc is only available in 'plugin' mode"
        if speaker_audio is not None:
            speaker_audio, _ = librosa.load(speaker_audio, sr=16000)
            speaker_audio = torch.tensor(speaker_audio).unsqueeze(0).to(self.device)
            spk_embed = self.spk_encoder.embed_utterance_batch(speaker_audio)
            self.spk_embed_cache = spk_embed
        elif use_spk_cache:
            assert self.spk_embed_cache is not None, "no speaker embedding cached yet"
            spk_embed = self.spk_embed_cache
        else:
            raise NotImplementedError("Provide speaker_audio or set use_spk_cache=True")

        content_clip = self._load_content(content_audio)

        gen_audio = self._run_chunked(
            content_clip,
            lambda chunk: self.dreamvc.inference(
                spk_embed, chunk, None,
                guidance_scale=vc_guidance_scale,
                guidance_rescale=vc_guidance_rescale,
                ddim_steps=vc_ddim_steps,
                eta=vc_eta,
                random_seed=vc_random_seed))

        return self._vocode(gen_audio), self.sr
287
+
288
+
289
if __name__ == '__main__':
    # Manual smoke test: generate a prompted conversion of test.wav in plugin
    # mode and write the result to debug.wav.
    dreamvoice = DreamVoice(config='dreamvc.yaml', mode='plugin', device='cuda')
    content_audio = 'test.wav'
    prompt = 'young female voice, sounds young and cute'
    # Pass the declared variable rather than repeating the file name literal.
    gen_audio, sr = dreamvoice.genvc(content_audio, prompt)
    dreamvoice.save_audio('debug.wav', gen_audio, sr)
dreamvoice/.ipynb_checkpoints/dreamvc-checkpoint.yaml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: 1.1
2
+
3
+ sample_rate: 24000
4
+ vocoder_path: 'ckpts/bigvgan_24k/g_01000000.pt'
5
+ vocoder_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/bigvgan_24k/g_01000000.pt'
6
+ vocoder_config_path: 'ckpts/bigvgan_24k/config.json'
7
+ vocoder_config_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/bigvgan_24k/config.json'
8
+
9
+ speaker_path: 'ckpts/spk_encoder/pretrained.pt'
10
+ speaker_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/spk_encoder/pretrained.pt'
11
+ lm_path: 'google/flan-t5-base'
12
+
13
+ dreamvc:
14
+ config_path: 'src/configs/diffvc_cross.yaml'
15
+ ckpt_path: 'ckpts/dreamvc_cross.pt'
16
+ ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/dreamvc_cross.pt'
17
+
18
+ rediffvc:
19
+ config_path: 'src/configs/diffvc_base.yaml'
20
+ ckpt_path: 'ckpts/dreamvc_base.pt'
21
+ ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/dreamvc_base.pt'
22
+
23
+ dreamvg:
24
+ config_path: 'src/configs/plugin_cross.yaml'
25
+ ckpt_path: 'ckpts/dreamvc_plugin.pt'
26
+ ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/dreamvc_plugin.pt'
27
+
dreamvoice/.ipynb_checkpoints/openvoice_utils-checkpoint.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import librosa
4
+ from tqdm import tqdm
5
+ from openvoice.mel_processing import spectrogram_torch
6
+ from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
7
+
8
+
9
@torch.no_grad()
def se_extractor(audio_path, vc):
    """Return the mean reference-encoder embedding over the speech segments of a file.

    Speech segments are obtained with ``get_vad_segments`` (method "silero");
    if none are found, the whole recording is used as a single segment.  Each
    segment is converted to a spectrogram with ``vc.hps`` settings and encoded
    by ``vc.model.ref_enc``; the embeddings are averaged and returned on CPU.

    Args:
        audio_path: Path of the audio file to analyse.
        vc: Object exposing ``hps``, ``device`` and ``model`` attributes.
    """
    # vad
    SAMPLE_RATE = 16000
    audio_vad = get_audio_tensor(audio_path)
    vad_result = get_vad_segments(
        audio_vad,
        output_sample=True,
        min_speech_duration=0.1,
        min_silence_duration=1,
        method="silero",
    )
    # Segment boundaries are divided by SAMPLE_RATE to obtain seconds
    # (presumably they are reported in samples because output_sample=True).
    segments = [
        (float(seg["start"]) / SAMPLE_RATE, float(seg["end"]) / SAMPLE_RATE)
        for seg in vad_result
    ]
    if not segments:
        segments = [(0, len(audio_vad) / SAMPLE_RATE)]
    print(segments)

    # spk
    hps, device, model = vc.hps, vc.device, vc.model
    rate = hps.data.sampling_rate

    waveform, _ = librosa.load(audio_path, sr=rate)
    waveform = torch.tensor(waveform).float().to(device)

    embeddings = []
    for start_s, end_s in segments:
        clip = waveform[int(rate * start_s):int(rate * end_s)]
        clip = clip.to(device).unsqueeze(0)
        spec = spectrogram_torch(
            clip, hps.data.filter_length,
            rate, hps.data.hop_length, hps.data.win_length,
            center=False,
        ).to(device)
        embedding = model.ref_enc(spec.transpose(1, 2)).unsqueeze(-1)
        embeddings.append(embedding.detach())

    return torch.stack(embeddings).mean(0).cpu()
dreamvoice/.ipynb_checkpoints/plugin-checkpoint.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ import yaml
4
+ import torch
5
+ import librosa
6
+ import numpy as np
7
+ import soundfile as sf
8
+ from pathlib import Path
9
+ from transformers import T5Tokenizer, T5EncoderModel
10
+ from tqdm import tqdm
11
+ from .src.plugin_wrapper import DreamVG
12
+
13
+
14
class DreamVoice_Plugin:
    """Generate speaker embeddings from text prompts with ``DreamVG``.

    The prompt is encoded with the T5 tokenizer/encoder named by
    ``config['lm_path']`` and passed to the ``DreamVG`` model configured under
    ``config['dreamvg']``.
    """

    def __init__(self, config='plugin.yaml', device='cuda'):
        """Load the config, fetch the checkpoint if missing and build the models.

        Args:
            config: File name of the YAML config, resolved relative to this module.
            device: Torch device string the models are moved to.
        """
        script_dir = Path(__file__).resolve().parent
        with open(script_dir / config, 'r') as fp:
            self.config = yaml.safe_load(fp)
        self.script_dir = script_dir

        # Ensure all checkpoints are downloaded
        self._ensure_checkpoints_exist()

        self.device = device

        # Tokenizer and text encoder
        lm_path = self.config['lm_path']
        self.tokenizer = T5Tokenizer.from_pretrained(lm_path)
        self.text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval()

        self.dreamvg = DreamVG(
            config_path=self.script_dir / self.config['dreamvg']['config_path'],
            ckpt_path=self.script_dir / self.config['dreamvg']['ckpt_path'],
            device=self.device
        )

    def _ensure_checkpoints_exist(self):
        """Download the DreamVG checkpoint if it is configured and not on disk."""
        checkpoints = [
            ('dreamvg.ckpt_path', self.config.get('dreamvg', {}).get('ckpt_url')),
        ]

        for path_key, url in checkpoints:
            local_path = self._get_local_path(path_key)
            if local_path is None or not url:
                continue
            if not local_path.exists():
                print(f"Downloading {path_key} from {url}")
                self._download_file(url, local_path)

    def _get_local_path(self, path_key):
        """Resolve a dotted config key (e.g. ``'dreamvg.ckpt_path'``) to a path.

        Returns:
            ``script_dir / <configured value>``, or ``None`` when the key is
            absent or does not hold a string (previously this raised a
            ``TypeError`` from ``Path / dict``).
        """
        node = self.config
        for key in path_key.split('.'):
            if not isinstance(node, dict) or key not in node:
                return None
            node = node[key]
        if not isinstance(node, str):
            return None
        return self.script_dir / node

    def _download_file(self, url, local_path):
        """Stream ``url`` to ``local_path``, retrying once with a user-supplied token.

        If the anonymous request fails, the user is prompted for a Hugging Face
        API key and the request is retried with a bearer token.

        Raises:
            requests.exceptions.RequestException: If the retry fails as well.
        """
        try:
            response = requests.get(url, stream=True)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error encountered: {e}")

            # Development mode: prompt user for Hugging Face API key
            user_input = input("Private checkpoint, please request authorization and enter your Hugging Face API key.")
            self.hf_key = user_input if user_input else None
            headers = {'Authorization': f'Bearer {self.hf_key}'} if self.hf_key else {}

            try:
                response = requests.get(url, stream=True, headers=headers)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Error encountered in dev mode: {e}")
                # Propagate instead of continuing with ``response = None``,
                # which used to crash below with an AttributeError.
                raise

        local_path.parent.mkdir(parents=True, exist_ok=True)

        # Write to a sibling ".part" file and rename at the end, so that an
        # interrupted download is never mistaken for a complete checkpoint by
        # the ``exists()`` check in ``_ensure_checkpoints_exist``.
        partial_path = local_path.with_name(local_path.name + '.part')
        total_size = int(response.headers.get('content-length', 0))
        block_size = 8192
        try:
            with open(partial_path, 'wb') as f, \
                    tqdm(total=total_size, unit='iB', unit_scale=True) as progress:
                for chunk in response.iter_content(chunk_size=block_size):
                    progress.update(len(chunk))
                    f.write(chunk)
        finally:
            response.close()
        partial_path.replace(local_path)

    def _init_plugin_mode(self):
        """Rebuild DreamVG and load the speaker encoder.

        NOTE(review): nothing in this class calls this method, and it reads
        ``config['speaker_path']``; confirm the plugin config defines that key
        before relying on it.
        """
        # ``spk_encoder`` was referenced without ever being imported in this
        # module (NameError on call); import it where it is needed.
        from .src.modules.speaker_encoder.encoder import inference as spk_encoder

        self.dreamvg = DreamVG(
            config_path=self.script_dir / self.config['dreamvg']['config_path'],
            ckpt_path=self.script_dir / self.config['dreamvg']['ckpt_path'],
            device=self.device
        )

        # Load speaker encoder
        spk_encoder.load_model(self.script_dir / self.config['speaker_path'], self.device)
        self.spk_encoder = spk_encoder
        self.spk_embed_cache = None

    @torch.no_grad()
    def gen_spk(self, prompt,
                prompt_guidance_scale=3, prompt_guidance_rescale=0.0,
                prompt_ddim_steps=100, prompt_eta=1, prompt_random_seed=None,):
        """Generate a speaker embedding from a text ``prompt``.

        The prompt is tokenized to a fixed length of 32 tokens (padded or
        truncated), encoded by the text encoder and passed to
        ``DreamVG.inference`` together with the sampling arguments.

        Returns:
            Whatever ``self.dreamvg.inference`` returns for the encoded prompt.
        """
        text_batch = self.tokenizer(prompt, max_length=32,
                                    padding='max_length', truncation=True, return_tensors="pt")
        text_ids = text_batch.input_ids.to(self.device)
        text_mask = text_batch.attention_mask.to(self.device)
        text = self.text_encoder(input_ids=text_ids, attention_mask=text_mask)[0]

        return self.dreamvg.inference([text, text_mask],
                                      guidance_scale=prompt_guidance_scale,
                                      guidance_rescale=prompt_guidance_rescale,
                                      ddim_steps=prompt_ddim_steps, eta=prompt_eta,
                                      random_seed=prompt_random_seed)
dreamvoice/.ipynb_checkpoints/plugin-checkpoint.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ version: 1.1
2
+
3
+ lm_path: 'google/flan-t5-base'
4
+
5
+ dreamvg:
6
+ config_path: 'src/configs/plugin_cross_openvoice.yaml'
7
+ ckpt_path: 'plugin_ckpts/openvoice_v2.pt'
8
+ ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/plugin_ckpts/openvoice_v2.pt'
dreamvoice/__init__.py CHANGED
@@ -1 +1,2 @@
1
- from .api import DreamVoice
 
 
1
+ from .api import DreamVoice
2
+ from .plugin import DreamVoice_Plugin
dreamvoice/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (244 Bytes). View file
 
dreamvoice/__pycache__/api.cpython-310.pyc ADDED
Binary file (8.04 kB). View file
 
dreamvoice/__pycache__/openvoice_utils.cpython-310.pyc ADDED
Binary file (1.65 kB). View file
 
dreamvoice/__pycache__/plugin.cpython-310.pyc ADDED
Binary file (4.01 kB). View file
 
dreamvoice/dreamvc.yaml CHANGED
@@ -1,4 +1,4 @@
1
- version: 1.0
2
 
3
  sample_rate: 24000
4
  vocoder_path: 'ckpts/bigvgan_24k/g_01000000.pt'
@@ -24,3 +24,4 @@ dreamvg:
24
  config_path: 'src/configs/plugin_cross.yaml'
25
  ckpt_path: 'ckpts/dreamvc_plugin.pt'
26
  ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/dreamvc_plugin.pt'
 
 
1
+ version: 1.1
2
 
3
  sample_rate: 24000
4
  vocoder_path: 'ckpts/bigvgan_24k/g_01000000.pt'
 
24
  config_path: 'src/configs/plugin_cross.yaml'
25
  ckpt_path: 'ckpts/dreamvc_plugin.pt'
26
  ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/dreamvc_plugin.pt'
27
+
dreamvoice/openvoice_utils.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import librosa
4
+ from tqdm import tqdm
5
+ from openvoice.mel_processing import spectrogram_torch
6
+ from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
7
+
8
+
9
@torch.no_grad()
def se_extractor(audio_path, vc):
    """Return the mean reference-encoder embedding over the speech segments of a file.

    Speech segments are obtained with ``get_vad_segments`` (method "silero");
    if none are found, the whole recording is used as a single segment.  Each
    segment is converted to a spectrogram with ``vc.hps`` settings and encoded
    by ``vc.model.ref_enc``; the embeddings are averaged and returned on CPU.

    Args:
        audio_path: Path of the audio file to analyse.
        vc: Object exposing ``hps``, ``device`` and ``model`` attributes.
    """
    # vad
    SAMPLE_RATE = 16000
    audio_vad = get_audio_tensor(audio_path)
    vad_result = get_vad_segments(
        audio_vad,
        output_sample=True,
        min_speech_duration=0.1,
        min_silence_duration=1,
        method="silero",
    )
    # Segment boundaries are divided by SAMPLE_RATE to obtain seconds
    # (presumably they are reported in samples because output_sample=True).
    segments = [
        (float(seg["start"]) / SAMPLE_RATE, float(seg["end"]) / SAMPLE_RATE)
        for seg in vad_result
    ]
    if not segments:
        segments = [(0, len(audio_vad) / SAMPLE_RATE)]
    print(segments)

    # spk
    hps, device, model = vc.hps, vc.device, vc.model
    rate = hps.data.sampling_rate

    waveform, _ = librosa.load(audio_path, sr=rate)
    waveform = torch.tensor(waveform).float().to(device)

    embeddings = []
    for start_s, end_s in segments:
        clip = waveform[int(rate * start_s):int(rate * end_s)]
        clip = clip.to(device).unsqueeze(0)
        spec = spectrogram_torch(
            clip, hps.data.filter_length,
            rate, hps.data.hop_length, hps.data.win_length,
            center=False,
        ).to(device)
        embedding = model.ref_enc(spec.transpose(1, 2)).unsqueeze(-1)
        embeddings.append(embedding.detach())

    return torch.stack(embeddings).mean(0).cpu()
dreamvoice/plugin.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ import yaml
4
+ import torch
5
+ import librosa
6
+ import numpy as np
7
+ import soundfile as sf
8
+ from pathlib import Path
9
+ from transformers import T5Tokenizer, T5EncoderModel
10
+ from tqdm import tqdm
11
+ from .src.plugin_wrapper import DreamVG
12
+
13
+
14
class DreamVoice_Plugin:
    """Generate speaker embeddings from text prompts with ``DreamVG``.

    The prompt is encoded with the T5 tokenizer/encoder named by
    ``config['lm_path']`` and passed to the ``DreamVG`` model configured under
    ``config['dreamvg']``.
    """

    def __init__(self, config='plugin.yaml', device='cuda'):
        """Load the config, fetch the checkpoint if missing and build the models.

        Args:
            config: File name of the YAML config, resolved relative to this module.
            device: Torch device string the models are moved to.
        """
        script_dir = Path(__file__).resolve().parent
        with open(script_dir / config, 'r') as fp:
            self.config = yaml.safe_load(fp)
        self.script_dir = script_dir

        # Ensure all checkpoints are downloaded
        self._ensure_checkpoints_exist()

        self.device = device

        # Tokenizer and text encoder
        lm_path = self.config['lm_path']
        self.tokenizer = T5Tokenizer.from_pretrained(lm_path)
        self.text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval()

        self.dreamvg = DreamVG(
            config_path=self.script_dir / self.config['dreamvg']['config_path'],
            ckpt_path=self.script_dir / self.config['dreamvg']['ckpt_path'],
            device=self.device
        )

    def _ensure_checkpoints_exist(self):
        """Download the DreamVG checkpoint if it is configured and not on disk."""
        checkpoints = [
            ('dreamvg.ckpt_path', self.config.get('dreamvg', {}).get('ckpt_url')),
        ]

        for path_key, url in checkpoints:
            local_path = self._get_local_path(path_key)
            if local_path is None or not url:
                continue
            if not local_path.exists():
                print(f"Downloading {path_key} from {url}")
                self._download_file(url, local_path)

    def _get_local_path(self, path_key):
        """Resolve a dotted config key (e.g. ``'dreamvg.ckpt_path'``) to a path.

        Returns:
            ``script_dir / <configured value>``, or ``None`` when the key is
            absent or does not hold a string (previously this raised a
            ``TypeError`` from ``Path / dict``).
        """
        node = self.config
        for key in path_key.split('.'):
            if not isinstance(node, dict) or key not in node:
                return None
            node = node[key]
        if not isinstance(node, str):
            return None
        return self.script_dir / node

    def _download_file(self, url, local_path):
        """Stream ``url`` to ``local_path``, retrying once with a user-supplied token.

        If the anonymous request fails, the user is prompted for a Hugging Face
        API key and the request is retried with a bearer token.

        Raises:
            requests.exceptions.RequestException: If the retry fails as well.
        """
        try:
            response = requests.get(url, stream=True)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error encountered: {e}")

            # Development mode: prompt user for Hugging Face API key
            user_input = input("Private checkpoint, please request authorization and enter your Hugging Face API key.")
            self.hf_key = user_input if user_input else None
            headers = {'Authorization': f'Bearer {self.hf_key}'} if self.hf_key else {}

            try:
                response = requests.get(url, stream=True, headers=headers)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Error encountered in dev mode: {e}")
                # Propagate instead of continuing with ``response = None``,
                # which used to crash below with an AttributeError.
                raise

        local_path.parent.mkdir(parents=True, exist_ok=True)

        # Write to a sibling ".part" file and rename at the end, so that an
        # interrupted download is never mistaken for a complete checkpoint by
        # the ``exists()`` check in ``_ensure_checkpoints_exist``.
        partial_path = local_path.with_name(local_path.name + '.part')
        total_size = int(response.headers.get('content-length', 0))
        block_size = 8192
        try:
            with open(partial_path, 'wb') as f, \
                    tqdm(total=total_size, unit='iB', unit_scale=True) as progress:
                for chunk in response.iter_content(chunk_size=block_size):
                    progress.update(len(chunk))
                    f.write(chunk)
        finally:
            response.close()
        partial_path.replace(local_path)

    def _init_plugin_mode(self):
        """Rebuild DreamVG and load the speaker encoder.

        NOTE(review): nothing in this class calls this method, and it reads
        ``config['speaker_path']``; confirm the plugin config defines that key
        before relying on it.
        """
        # ``spk_encoder`` was referenced without ever being imported in this
        # module (NameError on call); import it where it is needed.
        from .src.modules.speaker_encoder.encoder import inference as spk_encoder

        self.dreamvg = DreamVG(
            config_path=self.script_dir / self.config['dreamvg']['config_path'],
            ckpt_path=self.script_dir / self.config['dreamvg']['ckpt_path'],
            device=self.device
        )

        # Load speaker encoder
        spk_encoder.load_model(self.script_dir / self.config['speaker_path'], self.device)
        self.spk_encoder = spk_encoder
        self.spk_embed_cache = None

    @torch.no_grad()
    def gen_spk(self, prompt,
                prompt_guidance_scale=3, prompt_guidance_rescale=0.0,
                prompt_ddim_steps=100, prompt_eta=1, prompt_random_seed=None,):
        """Generate a speaker embedding from a text ``prompt``.

        The prompt is tokenized to a fixed length of 32 tokens (padded or
        truncated), encoded by the text encoder and passed to
        ``DreamVG.inference`` together with the sampling arguments.

        Returns:
            Whatever ``self.dreamvg.inference`` returns for the encoded prompt.
        """
        text_batch = self.tokenizer(prompt, max_length=32,
                                    padding='max_length', truncation=True, return_tensors="pt")
        text_ids = text_batch.input_ids.to(self.device)
        text_mask = text_batch.attention_mask.to(self.device)
        text = self.text_encoder(input_ids=text_ids, attention_mask=text_mask)[0]

        return self.dreamvg.inference([text, text_mask],
                                      guidance_scale=prompt_guidance_scale,
                                      guidance_rescale=prompt_guidance_rescale,
                                      ddim_steps=prompt_ddim_steps, eta=prompt_eta,
                                      random_seed=prompt_random_seed)
dreamvoice/plugin.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ version: 1.1
2
+
3
+ lm_path: 'google/flan-t5-base'
4
+
5
+ dreamvg:
6
+ config_path: 'src/configs/plugin_cross_openvoice.yaml'
7
+ ckpt_path: 'plugin_ckpts/openvoice_v2.pt'
8
+ ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/plugin_ckpts/openvoice_v2.pt'
dreamvoice/plugin_ckpts/openvoice_v2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08eae6c5b5da6438451589f3657b3bfeedc3b80bed52948f9845b7d70e989ab1
3
+ size 104892189
dreamvoice/src/.ipynb_checkpoints/plugin_wrapper-checkpoint.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import torch
3
+ from diffusers import DDIMScheduler
4
+ from .model.p2e_cross import P2E_Cross
5
+ from .utils import scale_shift, scale_shift_re, rescale_noise_cfg
6
+
7
+
8
class DreamVG(object):
    """Text-conditioned DDIM sampler around a ``P2E_Cross`` model.

    The model and scheduler hyper-parameters come from a YAML config; the
    weights come from the ``'model'`` entry of a checkpoint file.
    """

    def __init__(self,
                 config_path='configs/plugin_cross.yaml',
                 ckpt_path='../ckpts/dreamvc_plugin.pt',
                 device='cpu'):
        """Load the config, build the model and scheduler, restore the weights.

        Args:
            config_path: YAML file providing the ``model`` and ``scheduler`` sections.
            ckpt_path: Checkpoint whose ``'model'`` entry is a state dict.
            device: Device the model is moved to and on which noise is sampled.
        """
        with open(config_path, 'r') as fp:
            config = yaml.safe_load(fp)

        self.device = device
        self.model = P2E_Cross(config['model']).to(device)
        # map_location remaps the stored tensors onto the requested device, so
        # a checkpoint saved from CUDA tensors still loads on a CPU-only host.
        checkpoint = torch.load(ckpt_path, map_location=device)
        self.model.load_state_dict(checkpoint['model'])
        self.model.eval()

        # Only num_train_steps / beta_start / beta_end are read from the
        # config; the remaining scheduler options are fixed here.
        noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'],
                                        beta_start=config['scheduler']['beta_start'],
                                        beta_end=config['scheduler']['beta_end'],
                                        rescale_betas_zero_snr=True,
                                        timestep_spacing="trailing",
                                        clip_sample=False,
                                        prediction_type='v_prediction')
        self.noise_scheduler = noise_scheduler
        self.scale = config['scheduler']['scale']
        self.shift = config['scheduler']['shift']
        # Width of the generated vector: sampling uses shape (1, in_channels).
        self.spk_shape = config['model']['unet']['in_channels']

    @torch.no_grad()
    def inference(self, text,
                  guidance_scale=5, guidance_rescale=0.7,
                  ddim_steps=50, eta=1, random_seed=2023,
                  ):
        """Sample one output vector conditioned on an encoded text prompt.

        Args:
            text: Pair ``(text, text_mask)`` passed through to the model.
            guidance_scale: Weight of the guided combination
                ``uncond + guidance_scale * (cond - uncond)``; a falsy value
                (e.g. 0) skips guidance and runs one model call per step.
            guidance_rescale: When > 0, the guided output is post-processed by
                ``rescale_noise_cfg`` with this factor.
            ddim_steps: Number of scheduler timesteps.
            eta: Forwarded to the scheduler's ``step``.
            random_seed: Seed for the noise generator; ``None`` reseeds the
                generator non-deterministically.

        Returns:
            ``scale_shift_re(latents, 1/self.scale, self.shift)`` applied to
            the final latents of shape ``(1, self.spk_shape)``.
        """
        text, text_mask = text
        self.model.eval()

        gen_shape = (1, self.spk_shape)

        if random_seed is not None:
            generator = torch.Generator(device=self.device).manual_seed(random_seed)
        else:
            generator = torch.Generator(device=self.device)
            generator.seed()

        self.noise_scheduler.set_timesteps(ddim_steps)

        # init noise
        noise = torch.randn(gen_shape, generator=generator, device=self.device)
        latents = noise

        for t in self.noise_scheduler.timesteps:
            latents = self.noise_scheduler.scale_model_input(latents, t)

            if guidance_scale:
                output_text = self.model(latents, t, text, text_mask, train_cfg=False)
                # presumably cfg_prob=1.0 always drops the text condition,
                # giving the unconditional branch -- confirm in P2E_Cross
                output_uncond = self.model(latents, t, text, text_mask, train_cfg=True, cfg_prob=1.0)

                output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
                if guidance_rescale > 0.0:
                    output_pred = rescale_noise_cfg(output_pred, output_text,
                                                    guidance_rescale=guidance_rescale)
            else:
                output_pred = self.model(latents, t, text, text_mask, train_cfg=False)

            latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents,
                                                eta=eta, generator=generator).prev_sample

        # Undo the training-time normalisation using the configured scale/shift.
        pred = scale_shift_re(latents, 1/self.scale, self.shift)
        return pred
dreamvoice/src/.ipynb_checkpoints/vc_wrapper-checkpoint.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import torch
3
+ from diffusers import DDIMScheduler
4
+ from .model.model import DiffVC
5
+ from .model.model_cross import DiffVC_Cross
6
+ from .utils import scale_shift, scale_shift_re, rescale_noise_cfg
7
+
8
+
9
class ReDiffVC(object):
    """DDIM sampler around a ``DiffVC`` model conditioned on a speaker embedding.

    The model and scheduler hyper-parameters come from a YAML config; the
    weights come from the ``'model'`` entry of a checkpoint file.
    """

    def __init__(self,
                 config_path='configs/diffvc_base.yaml',
                 ckpt_path='../ckpts/dreamvc_base.pt',
                 device='cpu'):
        """Load the config, build the model and scheduler, restore the weights.

        Args:
            config_path: YAML file providing the ``model`` and ``scheduler`` sections.
            ckpt_path: Checkpoint whose ``'model'`` entry is a state dict.
            device: Device the model is moved to and on which noise is sampled.
        """
        with open(config_path, 'r') as fp:
            config = yaml.safe_load(fp)

        self.device = device
        self.model = DiffVC(config['model']).to(device)
        # map_location remaps the stored tensors onto the requested device, so
        # a checkpoint saved from CUDA tensors still loads on a CPU-only host.
        checkpoint = torch.load(ckpt_path, map_location=device)
        self.model.load_state_dict(checkpoint['model'])
        self.model.eval()

        # Only num_train_steps / beta_start / beta_end are read from the
        # config; the remaining scheduler options are fixed here.
        noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'],
                                        beta_start=config['scheduler']['beta_start'],
                                        beta_end=config['scheduler']['beta_end'],
                                        rescale_betas_zero_snr=True,
                                        timestep_spacing="trailing",
                                        clip_sample=False,
                                        prediction_type='v_prediction')
        self.noise_scheduler = noise_scheduler
        self.scale = config['scheduler']['scale']
        self.shift = config['scheduler']['shift']
        # First entry of the UNet sample size; used as dim 2 of the noise shape.
        self.melshape = config['model']['unet']['sample_size'][0]

    @torch.no_grad()
    def inference(self,
                  spk_embed, content_clip, f0_clip=None,
                  guidance_scale=3, guidance_rescale=0.7,
                  ddim_steps=50, eta=1, random_seed=2023):
        """Run the DDIM sampling loop for one content clip.

        Args:
            spk_embed: Speaker conditioning passed through to the model.
            content_clip: Content features; ``content_clip.shape[-2]`` sets the
                last dimension of the generated tensor.
            f0_clip: Optional pitch conditioning passed through to the model.
            guidance_scale: Weight of the guided combination
                ``uncond + guidance_scale * (cond - uncond)``; a falsy value
                (e.g. 0) skips guidance and runs one model call per step.
            guidance_rescale: When > 0, the guided output is post-processed by
                ``rescale_noise_cfg`` with this factor.
            ddim_steps: Number of scheduler timesteps.
            eta: Forwarded to the scheduler's ``step``.
            random_seed: Seed for the noise generator; ``None`` reseeds the
                generator non-deterministically.

        Returns:
            ``scale_shift_re`` applied to the final latents of shape
            ``(1, 1, self.melshape, content_clip.shape[-2])``.
        """
        self.model.eval()
        if random_seed is not None:
            generator = torch.Generator(device=self.device).manual_seed(random_seed)
        else:
            generator = torch.Generator(device=self.device)
            generator.seed()

        self.noise_scheduler.set_timesteps(ddim_steps)

        # init noise
        gen_shape = (1, 1, self.melshape, content_clip.shape[-2])
        noise = torch.randn(gen_shape, generator=generator, device=self.device)
        latents = noise

        for t in self.noise_scheduler.timesteps:
            latents = self.noise_scheduler.scale_model_input(latents, t)

            if guidance_scale:
                output_text = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=False)
                # presumably speaker_cfg=1.0 always drops the speaker condition
                # while pitch_cfg=0.0 keeps the pitch -- confirm in DiffVC
                output_uncond = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=True,
                                           speaker_cfg=1.0, pitch_cfg=0.0)

                output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
                if guidance_rescale > 0.0:
                    output_pred = rescale_noise_cfg(output_pred, output_text,
                                                    guidance_rescale=guidance_rescale)
            else:
                output_pred = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=False)

            latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents,
                                                eta=eta, generator=generator).prev_sample

        # Undo the training-time normalisation using the configured scale/shift.
        pred = scale_shift_re(latents, scale=1/self.scale, shift=self.shift)
        return pred
75
+
76
+
77
class DreamVC(object):
    """DDIM sampler around a ``DiffVC_Cross`` model conditioned on encoded text.

    The model and scheduler hyper-parameters come from a YAML config; the
    weights come from the ``'model'`` entry of a checkpoint file.
    """

    def __init__(self,
                 config_path='configs/diffvc_cross.yaml',
                 ckpt_path='../ckpts/dreamvc_cross.pt',
                 device='cpu'):
        """Load the config, build the model and scheduler, restore the weights.

        Args:
            config_path: YAML file providing the ``model`` and ``scheduler`` sections.
            ckpt_path: Checkpoint whose ``'model'`` entry is a state dict.
            device: Device the model is moved to and on which noise is sampled.
        """
        with open(config_path, 'r') as fp:
            config = yaml.safe_load(fp)

        self.device = device
        self.model = DiffVC_Cross(config['model']).to(device)
        # map_location remaps the stored tensors onto the requested device, so
        # a checkpoint saved from CUDA tensors still loads on a CPU-only host.
        checkpoint = torch.load(ckpt_path, map_location=device)
        self.model.load_state_dict(checkpoint['model'])
        self.model.eval()

        # Only num_train_steps / beta_start / beta_end are read from the
        # config; the remaining scheduler options are fixed here.
        noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'],
                                        beta_start=config['scheduler']['beta_start'],
                                        beta_end=config['scheduler']['beta_end'],
                                        rescale_betas_zero_snr=True,
                                        timestep_spacing="trailing",
                                        clip_sample=False,
                                        prediction_type='v_prediction')
        self.noise_scheduler = noise_scheduler
        self.scale = config['scheduler']['scale']
        self.shift = config['scheduler']['shift']
        # First entry of the UNet sample size; used as dim 2 of the noise shape.
        self.melshape = config['model']['unet']['sample_size'][0]

    @torch.no_grad()
    def inference(self,
                  text, content_clip, f0_clip=None,
                  guidance_scale=3, guidance_rescale=0.7,
                  ddim_steps=50, eta=1, random_seed=2023):
        """Run the DDIM sampling loop for one content clip and text prompt.

        Args:
            text: Pair ``(text, text_mask)`` passed through to the model.
            content_clip: Content features; ``content_clip.shape[-2]`` sets the
                last dimension of the generated tensor.
            f0_clip: Optional pitch conditioning passed through to the model.
            guidance_scale: Weight of the guided combination
                ``uncond + guidance_scale * (cond - uncond)``; a falsy value
                (e.g. 0) skips guidance and runs one model call per step.
            guidance_rescale: When > 0, the guided output is post-processed by
                ``rescale_noise_cfg`` with this factor.
            ddim_steps: Number of scheduler timesteps.
            eta: Forwarded to the scheduler's ``step``.
            random_seed: Seed for the noise generator; ``None`` reseeds the
                generator non-deterministically.

        Returns:
            ``scale_shift_re`` applied to the final latents of shape
            ``(1, 1, self.melshape, content_clip.shape[-2])``.
        """
        text, text_mask = text
        self.model.eval()
        if random_seed is not None:
            generator = torch.Generator(device=self.device).manual_seed(random_seed)
        else:
            generator = torch.Generator(device=self.device)
            generator.seed()

        self.noise_scheduler.set_timesteps(ddim_steps)

        # init noise
        gen_shape = (1, 1, self.melshape, content_clip.shape[-2])
        noise = torch.randn(gen_shape, generator=generator, device=self.device)
        latents = noise

        for t in self.noise_scheduler.timesteps:
            latents = self.noise_scheduler.scale_model_input(latents, t)

            if guidance_scale:
                output_text = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=False)
                # presumably speaker_cfg=1.0 always drops the text/speaker
                # condition while pitch_cfg=0.0 keeps the pitch -- confirm in
                # DiffVC_Cross
                output_uncond = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=True,
                                           speaker_cfg=1.0, pitch_cfg=0.0)

                output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
                if guidance_rescale > 0.0:
                    output_pred = rescale_noise_cfg(output_pred, output_text,
                                                    guidance_rescale=guidance_rescale)
            else:
                output_pred = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=False)

            latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents,
                                                eta=eta, generator=generator).prev_sample

        # Undo the training-time normalisation using the configured scale/shift.
        pred = scale_shift_re(latents, scale=1/self.scale, shift=self.shift)
        return pred
144
+
dreamvoice/src/__pycache__/plugin_wrapper.cpython-310.pyc ADDED
Binary file (2.4 kB). View file
 
dreamvoice/src/__pycache__/vc_wrapper.cpython-310.pyc ADDED
Binary file (3.49 kB). View file
 
dreamvoice/src/configs/.ipynb_checkpoints/plugin_cross-checkpoint.yaml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: 1.0
2
+
3
+ system: "cross"
4
+
5
+ model:
6
+ cls_embedding:
7
+ content_dim: 768
8
+ content_hidden: 256
9
+
10
+ unet:
11
+ sample_size: [1, 1]
12
+ in_channels: 256
13
+ out_channels: 256
14
+ layers_per_block: 2
15
+ block_out_channels: [256]
16
+ down_block_types:
17
+ [
18
+ "CrossAttnDownBlock2D",
19
+ ]
20
+ up_block_types:
21
+ [
22
+ "CrossAttnUpBlock2D",
23
+ ]
24
+ attention_head_dim: 32
25
+ cross_attention_dim: 768
26
+
27
+ scheduler:
28
+ num_train_steps: 1000
29
+ beta_schedule: 'linear'
30
+ beta_start: 0.0001
31
+ beta_end: 0.02
32
+ num_infer_steps: 50
33
+ rescale_betas_zero_snr: true
34
+ timestep_spacing: "trailing"
35
+ clip_sample: false
36
+ prediction_type: 'v_prediction'
37
+ scale: 0.05
38
+ shift: -0.035
39
+
dreamvoice/src/configs/.ipynb_checkpoints/plugin_cross_openvoice-checkpoint.yaml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: 1.0
2
+
3
+ system: "cross"
4
+
5
+ model:
6
+ cls_embedding:
7
+ content_dim: 768
8
+ content_hidden: 256
9
+
10
+ unet:
11
+ sample_size: [1, 1]
12
+ in_channels: 256
13
+ out_channels: 256
14
+ layers_per_block: 2
15
+ block_out_channels: [256]
16
+ down_block_types:
17
+ [
18
+ "CrossAttnDownBlock2D",
19
+ ]
20
+ up_block_types:
21
+ [
22
+ "CrossAttnUpBlock2D",
23
+ ]
24
+ attention_head_dim: 32
25
+ cross_attention_dim: 768
26
+
27
+ scheduler:
28
+ num_train_steps: 1000
29
+ beta_schedule: 'linear'
30
+ beta_start: 0.0001
31
+ beta_end: 0.02
32
+ num_infer_steps: 50
33
+ rescale_betas_zero_snr: true
34
+ timestep_spacing: "trailing"
35
+ clip_sample: false
36
+ prediction_type: 'v_prediction'
37
+ scale: 1.0
38
+ shift: 0.0
39
+
dreamvoice/src/configs/plugin_cross_openvoice.yaml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: 1.0
2
+
3
+ system: "cross"
4
+
5
+ model:
6
+ cls_embedding:
7
+ content_dim: 768
8
+ content_hidden: 256
9
+
10
+ unet:
11
+ sample_size: [1, 1]
12
+ in_channels: 256
13
+ out_channels: 256
14
+ layers_per_block: 2
15
+ block_out_channels: [256]
16
+ down_block_types:
17
+ [
18
+ "CrossAttnDownBlock2D",
19
+ ]
20
+ up_block_types:
21
+ [
22
+ "CrossAttnUpBlock2D",
23
+ ]
24
+ attention_head_dim: 32
25
+ cross_attention_dim: 768
26
+
27
+ scheduler:
28
+ num_train_steps: 1000
29
+ beta_schedule: 'linear'
30
+ beta_start: 0.0001
31
+ beta_end: 0.02
32
+ num_infer_steps: 50
33
+ rescale_betas_zero_snr: true
34
+ timestep_spacing: "trailing"
35
+ clip_sample: false
36
+ prediction_type: 'v_prediction'
37
+ scale: 1.0
38
+ shift: 0.0
39
+
dreamvoice/src/feats/__pycache__/contentvec_hf.cpython-310.pyc CHANGED
Binary files a/dreamvoice/src/feats/__pycache__/contentvec_hf.cpython-310.pyc and b/dreamvoice/src/feats/__pycache__/contentvec_hf.cpython-310.pyc differ
 
dreamvoice/src/model/__pycache__/model.cpython-310.pyc CHANGED
Binary files a/dreamvoice/src/model/__pycache__/model.cpython-310.pyc and b/dreamvoice/src/model/__pycache__/model.cpython-310.pyc differ
 
dreamvoice/src/model/__pycache__/model_cross.cpython-310.pyc CHANGED
Binary files a/dreamvoice/src/model/__pycache__/model_cross.cpython-310.pyc and b/dreamvoice/src/model/__pycache__/model_cross.cpython-310.pyc differ
 
dreamvoice/src/model/__pycache__/p2e_cross.cpython-310.pyc CHANGED
Binary files a/dreamvoice/src/model/__pycache__/p2e_cross.cpython-310.pyc and b/dreamvoice/src/model/__pycache__/p2e_cross.cpython-310.pyc differ
 
dreamvoice/src/modules/BigVGAN/__pycache__/env.cpython-310.pyc CHANGED
Binary files a/dreamvoice/src/modules/BigVGAN/__pycache__/env.cpython-310.pyc and b/dreamvoice/src/modules/BigVGAN/__pycache__/env.cpython-310.pyc differ
 
dreamvoice/src/modules/BigVGAN/__pycache__/inference.cpython-310.pyc CHANGED
Binary files a/dreamvoice/src/modules/BigVGAN/__pycache__/inference.cpython-310.pyc and b/dreamvoice/src/modules/BigVGAN/__pycache__/inference.cpython-310.pyc differ
 
dreamvoice/src/modules/BigVGAN/__pycache__/models.cpython-310.pyc CHANGED
Binary files a/dreamvoice/src/modules/BigVGAN/__pycache__/models.cpython-310.pyc and b/dreamvoice/src/modules/BigVGAN/__pycache__/models.cpython-310.pyc differ
 
dreamvoice/src/modules/BigVGAN/__pycache__/utils.cpython-310.pyc CHANGED
Binary files a/dreamvoice/src/modules/BigVGAN/__pycache__/utils.cpython-310.pyc and b/dreamvoice/src/modules/BigVGAN/__pycache__/utils.cpython-310.pyc differ
 
dreamvoice/src/modules/BigVGAN/activations/__pycache__/activations.cpython-310.pyc CHANGED
Binary files a/dreamvoice/src/modules/BigVGAN/activations/__pycache__/activations.cpython-310.pyc and b/dreamvoice/src/modules/BigVGAN/activations/__pycache__/activations.cpython-310.pyc differ
 
dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-310.pyc and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-310.pyc differ
 
dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/act.cpython-310.pyc CHANGED
Binary files a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/act.cpython-310.pyc and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/act.cpython-310.pyc differ
 
dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/filter.cpython-310.pyc CHANGED
Binary files a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/filter.cpython-310.pyc and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/filter.cpython-310.pyc differ
 
dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/resample.cpython-310.pyc CHANGED
Binary files a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/resample.cpython-310.pyc and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/resample.cpython-310.pyc differ
 
dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/__init__.cpython-310.pyc and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/__init__.cpython-310.pyc differ
 
dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/audio.cpython-310.pyc CHANGED
Binary files a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/audio.cpython-310.pyc and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/audio.cpython-310.pyc differ
 
dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/inference.cpython-310.pyc CHANGED
Binary files a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/inference.cpython-310.pyc and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/inference.cpython-310.pyc differ
 
dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/model.cpython-310.pyc CHANGED
Binary files a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/model.cpython-310.pyc and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/model.cpython-310.pyc differ
 
dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_data.cpython-310.pyc CHANGED
Binary files a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_data.cpython-310.pyc and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_data.cpython-310.pyc differ
 
dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_model.cpython-310.pyc CHANGED
Binary files a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_model.cpython-310.pyc and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_model.cpython-310.pyc differ
 
dreamvoice/src/train_plugin.py DELETED
File without changes
dreamvoice/src/train_vc.py DELETED
File without changes
dreamvoice/src/utils/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/dreamvoice/src/utils/__pycache__/__init__.cpython-310.pyc and b/dreamvoice/src/utils/__pycache__/__init__.cpython-310.pyc differ
 
dreamvoice/src/utils/__pycache__/utils.cpython-310.pyc CHANGED
Binary files a/dreamvoice/src/utils/__pycache__/utils.cpython-310.pyc and b/dreamvoice/src/utils/__pycache__/utils.cpython-310.pyc differ