Higobeatz committed
Commit d60669e · 2 parents: d2ffd7a, 8a57937

Merge branch 'main' of https://huggingface.co/myshell-ai/DreamVoice into main

Files changed (42)
  1. dreamvoice/.ipynb_checkpoints/__init__-checkpoint.py +0 -2
  2. dreamvoice/.ipynb_checkpoints/api-checkpoint.py +0 -295
  3. dreamvoice/.ipynb_checkpoints/dreamvc-checkpoint.yaml +0 -27
  4. dreamvoice/.ipynb_checkpoints/openvoice_utils-checkpoint.py +0 -48
  5. dreamvoice/.ipynb_checkpoints/plugin-checkpoint.py +0 -128
  6. dreamvoice/.ipynb_checkpoints/plugin-checkpoint.yaml +0 -8
  7. dreamvoice/__pycache__/__init__.cpython-310.pyc +0 -0
  8. dreamvoice/__pycache__/api.cpython-310.pyc +0 -0
  9. dreamvoice/__pycache__/openvoice_utils.cpython-310.pyc +0 -0
  10. dreamvoice/__pycache__/plugin.cpython-310.pyc +0 -0
  11. dreamvoice/src/.ipynb_checkpoints/plugin_wrapper-checkpoint.py +0 -76
  12. dreamvoice/src/.ipynb_checkpoints/vc_wrapper-checkpoint.py +0 -144
  13. dreamvoice/src/__pycache__/plugin_wrapper.cpython-310.pyc +0 -0
  14. dreamvoice/src/__pycache__/vc_wrapper.cpython-310.pyc +0 -0
  15. dreamvoice/src/configs/.ipynb_checkpoints/plugin_cross-checkpoint.yaml +0 -39
  16. dreamvoice/src/configs/.ipynb_checkpoints/plugin_cross_openvoice-checkpoint.yaml +0 -39
  17. dreamvoice/src/feats/.ipynb_checkpoints/contentvec-checkpoint.py +0 -42
  18. dreamvoice/src/feats/.ipynb_checkpoints/contentvec_hf-checkpoint.py +0 -40
  19. dreamvoice/src/feats/.ipynb_checkpoints/hubert_model-checkpoint.py +0 -24
  20. dreamvoice/src/feats/.ipynb_checkpoints/test-checkpoint.py +0 -22
  21. dreamvoice/src/feats/__pycache__/contentvec.cpython-310.pyc +0 -0
  22. dreamvoice/src/feats/__pycache__/contentvec.cpython-311.pyc +0 -0
  23. dreamvoice/src/feats/__pycache__/contentvec_hf.cpython-310.pyc +0 -0
  24. dreamvoice/src/feats/__pycache__/contentvec_hf.cpython-311.pyc +0 -0
  25. dreamvoice/src/feats/__pycache__/hubert_model.cpython-311.pyc +0 -0
  26. dreamvoice/src/model/.ipynb_checkpoints/model-checkpoint.py +0 -98
  27. dreamvoice/src/model/.ipynb_checkpoints/model_cross-checkpoint.py +0 -116
  28. dreamvoice/src/model/.ipynb_checkpoints/p2e_cross-checkpoint.py +0 -80
  29. dreamvoice/src/model/__pycache__/model.cpython-310.pyc +0 -0
  30. dreamvoice/src/model/__pycache__/model.cpython-311.pyc +0 -0
  31. dreamvoice/src/model/__pycache__/model_cross.cpython-310.pyc +0 -0
  32. dreamvoice/src/model/__pycache__/model_cross.cpython-311.pyc +0 -0
  33. dreamvoice/src/model/__pycache__/model_cross.cpython-39.pyc +0 -0
  34. dreamvoice/src/model/__pycache__/p2e_cross.cpython-310.pyc +0 -0
  35. dreamvoice/src/model/__pycache__/p2e_cross.cpython-311.pyc +0 -0
  36. dreamvoice/src/modules/.ipynb_checkpoints/mel-checkpoint.py +0 -37
  37. dreamvoice/src/utils/.ipynb_checkpoints/__init__-checkpoint.py +0 -1
  38. dreamvoice/src/utils/.ipynb_checkpoints/utils-checkpoint.py +0 -76
  39. dreamvoice/src/utils/__pycache__/__init__.cpython-310.pyc +0 -0
  40. dreamvoice/src/utils/__pycache__/__init__.cpython-311.pyc +0 -0
  41. dreamvoice/src/utils/__pycache__/utils.cpython-310.pyc +0 -0
  42. dreamvoice/src/utils/__pycache__/utils.cpython-311.pyc +0 -0
dreamvoice/.ipynb_checkpoints/__init__-checkpoint.py DELETED
@@ -1,2 +0,0 @@
- from .api import DreamVoice
- from .plugin import DreamVoice_Plugin

dreamvoice/.ipynb_checkpoints/api-checkpoint.py DELETED
@@ -1,295 +0,0 @@
- import os
- import requests
- import yaml
- import torch
- import librosa
- import numpy as np
- import soundfile as sf
- from pathlib import Path
- from transformers import T5Tokenizer, T5EncoderModel
- from tqdm import tqdm
- from .src.vc_wrapper import ReDiffVC, DreamVC
- from .src.plugin_wrapper import DreamVG
- from .src.modules.speaker_encoder.encoder import inference as spk_encoder
- from .src.modules.BigVGAN.inference import load_model as load_vocoder
- from .src.feats.contentvec_hf import get_content_model, get_content
-
-
- class DreamVoice:
-     def __init__(self, config='dreamvc.yaml', mode='plugin', device='cuda', chunk_size=16):
-         # Initial setup
-         script_dir = Path(__file__).resolve().parent
-         config_path = script_dir / config
-
-         # Load configuration file
-         with open(config_path, 'r') as fp:
-             self.config = yaml.safe_load(fp)
-
-         self.script_dir = script_dir
-
-         # Ensure all checkpoints are downloaded
-         self._ensure_checkpoints_exist()
-
-         # Initialize attributes
-         self.device = device
-         self.sr = self.config['sample_rate']
-
-         # Load vocoder
-         vocoder_path = script_dir / self.config['vocoder_path']
-         self.hifigan, _ = load_vocoder(vocoder_path, device)
-         self.hifigan.eval()
-
-         # Load content model
-         self.content_model = get_content_model().to(device)
-
-         # Load tokenizer and text encoder
-         lm_path = self.config['lm_path']
-         self.tokenizer = T5Tokenizer.from_pretrained(lm_path)
-         self.text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval()
-
-         # Set mode
-         self.mode = mode
-         if mode == 'plugin':
-             self._init_plugin_mode()
-         elif mode == 'end2end':
-             self._init_end2end_mode()
-         else:
-             raise NotImplementedError("Select mode from 'plugin' and 'end2end'")
-
-         # chunk inputs to 10s clips
-         self.chunk_size = chunk_size * 50
-
-     def _ensure_checkpoints_exist(self):
-         checkpoints = [
-             ('vocoder_path', self.config.get('vocoder_url')),
-             ('vocoder_config_path', self.config.get('vocoder_config_url')),
-             ('speaker_path', self.config.get('speaker_url')),
-             ('dreamvc.ckpt_path', self.config.get('dreamvc', {}).get('ckpt_url')),
-             ('rediffvc.ckpt_path', self.config.get('rediffvc', {}).get('ckpt_url')),
-             ('dreamvg.ckpt_path', self.config.get('dreamvg', {}).get('ckpt_url'))
-         ]
-
-         for path_key, url in checkpoints:
-             local_path = self._get_local_path(path_key)
-             if not local_path.exists() and url:
-                 print(f"Downloading {path_key} from {url}")
-                 self._download_file(url, local_path)
-
-     def _get_local_path(self, path_key):
-         keys = path_key.split('.')
-         local_path = self.config
-         for key in keys:
-             local_path = local_path.get(key, {})
-         return self.script_dir / local_path
-
-     def _download_file(self, url, local_path):
-         try:
-             # Attempt to send a GET request to the URL
-             response = requests.get(url, stream=True)
-             response.raise_for_status()  # Ensure we raise an exception for HTTP errors
-         except requests.exceptions.RequestException as e:
-             # Log the error for debugging purposes
-             print(f"Error encountered: {e}")
-
-             # Development mode: prompt user for Hugging Face API key
-             user_input = input("Private checkpoint, please request authorization and enter your Hugging Face API key.")
-             self.hf_key = user_input if user_input else None
-
-             # Set headers if an API key is provided
-             headers = {'Authorization': f'Bearer {self.hf_key}'} if self.hf_key else {}
-
-             try:
-                 # Attempt to send a GET request with headers in development mode
-                 response = requests.get(url, stream=True, headers=headers)
-                 response.raise_for_status()  # Ensure we raise an exception for HTTP errors
-             except requests.exceptions.RequestException as e:
-                 # Log the error for debugging purposes
-                 print(f"Error encountered in dev mode: {e}")
-                 response = None  # Handle response accordingly in your code
-
-         local_path.parent.mkdir(parents=True, exist_ok=True)
-
-         total_size = int(response.headers.get('content-length', 0))
-         block_size = 8192
-         t = tqdm(total=total_size, unit='iB', unit_scale=True)
-
-         with open(local_path, 'wb') as f:
-             for chunk in response.iter_content(chunk_size=block_size):
-                 t.update(len(chunk))
-                 f.write(chunk)
-         t.close()
-
-     def _init_plugin_mode(self):
-         # Initialize ReDiffVC
-         self.dreamvc = ReDiffVC(
-             config_path=self.script_dir / self.config['rediffvc']['config_path'],
-             ckpt_path=self.script_dir / self.config['rediffvc']['ckpt_path'],
-             device=self.device
-         )
-
-         # Initialize DreamVG
-         self.dreamvg = DreamVG(
-             config_path=self.script_dir / self.config['dreamvg']['config_path'],
-             ckpt_path=self.script_dir / self.config['dreamvg']['ckpt_path'],
-             device=self.device
-         )
-
-         # Load speaker encoder
-         spk_encoder.load_model(self.script_dir / self.config['speaker_path'], self.device)
-         self.spk_encoder = spk_encoder
-         self.spk_embed_cache = None
-
-     def _init_end2end_mode(self):
-         # Initialize DreamVC
-         self.dreamvc = DreamVC(
-             config_path=self.script_dir / self.config['dreamvc']['config_path'],
-             ckpt_path=self.script_dir / self.config['dreamvc']['ckpt_path'],
-             device=self.device
-         )
-
-     def _load_content(self, audio_path):
-         content_audio, _ = librosa.load(audio_path, sr=16000)
-         # Calculate the required length to make it a multiple of 16*160
-         target_length = ((len(content_audio) + 16*160 - 1) // (16*160)) * (16*160)
-         # Pad with zeros if necessary
-         if len(content_audio) < target_length:
-             content_audio = np.pad(content_audio, (0, target_length - len(content_audio)), mode='constant')
-         content_audio = torch.tensor(content_audio).unsqueeze(0).to(self.device)
-         content_clip = get_content(self.content_model, content_audio)
-         return content_clip
-
-     def load_spk_embed(self, emb_path):
-         self.spk_embed_cache = torch.load(emb_path, map_location=self.device)
-
-     def save_spk_embed(self, emb_path):
-         assert self.spk_embed_cache is not None
-         torch.save(self.spk_embed_cache.cpu(), emb_path)
-
-     def save_audio(self, output_path, audio, sr):
-         sf.write(output_path, audio, samplerate=sr)
-
-     @torch.no_grad()
-     def genvc(self, content_audio, prompt,
-               prompt_guidance_scale=3, prompt_guidance_rescale=0.0,
-               prompt_ddim_steps=100, prompt_eta=1, prompt_random_seed=None,
-               vc_guidance_scale=3, vc_guidance_rescale=0.0,
-               vc_ddim_steps=50, vc_eta=1, vc_random_seed=None,
-               ):
-
-         content_clip = self._load_content(content_audio)
-
-         text_batch = self.tokenizer(prompt, max_length=32,
-                                     padding='max_length', truncation=True, return_tensors="pt")
-         text, text_mask = text_batch.input_ids.to(self.device), \
-             text_batch.attention_mask.to(self.device)
-         text = self.text_encoder(input_ids=text, attention_mask=text_mask)[0]
-
-         if self.mode == 'plugin':
-             spk_embed = self.dreamvg.inference([text, text_mask],
-                                                guidance_scale=prompt_guidance_scale,
-                                                guidance_rescale=prompt_guidance_rescale,
-                                                ddim_steps=prompt_ddim_steps, eta=prompt_eta,
-                                                random_seed=prompt_random_seed)
-
-             B, L, D = content_clip.shape
-             gen_audio_chunks = []
-             num_chunks = (L + self.chunk_size - 1) // self.chunk_size
-             for i in range(num_chunks):
-                 start_idx = i * self.chunk_size
-                 end_idx = min((i + 1) * self.chunk_size, L)
-                 content_clip_chunk = content_clip[:, start_idx:end_idx, :]
-
-                 gen_audio_chunk = self.dreamvc.inference(
-                     spk_embed, content_clip_chunk, None,
-                     guidance_scale=vc_guidance_scale,
-                     guidance_rescale=vc_guidance_rescale,
-                     ddim_steps=vc_ddim_steps,
-                     eta=vc_eta,
-                     random_seed=vc_random_seed)
-
-                 gen_audio_chunks.append(gen_audio_chunk)
-
-             gen_audio = torch.cat(gen_audio_chunks, dim=-1)
-
-             self.spk_embed_cache = spk_embed
-
-         elif self.mode == 'end2end':
-             B, L, D = content_clip.shape
-             gen_audio_chunks = []
-             num_chunks = (L + self.chunk_size - 1) // self.chunk_size
-
-             for i in range(num_chunks):
-                 start_idx = i * self.chunk_size
-                 end_idx = min((i + 1) * self.chunk_size, L)
-                 content_clip_chunk = content_clip[:, start_idx:end_idx, :]
-
-                 gen_audio_chunk = self.dreamvc.inference([text, text_mask], content_clip_chunk,
-                                                          guidance_scale=prompt_guidance_scale,
-                                                          guidance_rescale=prompt_guidance_rescale,
-                                                          ddim_steps=prompt_ddim_steps,
-                                                          eta=prompt_eta, random_seed=prompt_random_seed)
-                 gen_audio_chunks.append(gen_audio_chunk)
-
-             gen_audio = torch.cat(gen_audio_chunks, dim=-1)
-
-         else:
-             raise NotImplementedError("Select mode from 'plugin' and 'end2end'")
-
-         gen_audio = self.hifigan(gen_audio.squeeze(1))
-         gen_audio = gen_audio.cpu().numpy().squeeze(0).squeeze(0)
-
-         return gen_audio, self.sr
-
-     @torch.no_grad()
-     def simplevc(self, content_audio, speaker_audio=None, use_spk_cache=False,
-                  vc_guidance_scale=3, vc_guidance_rescale=0.0,
-                  vc_ddim_steps=50, vc_eta=1, vc_random_seed=None,
-                  ):
-
-         assert self.mode == 'plugin'
-         if speaker_audio is not None:
-             speaker_audio, _ = librosa.load(speaker_audio, sr=16000)
-             speaker_audio = torch.tensor(speaker_audio).unsqueeze(0).to(self.device)
-             spk_embed = spk_encoder.embed_utterance_batch(speaker_audio)
-             self.spk_embed_cache = spk_embed
-         elif use_spk_cache:
-             assert self.spk_embed_cache is not None
-             spk_embed = self.spk_embed_cache
-         else:
-             raise NotImplementedError
-
-         content_clip = self._load_content(content_audio)
-
-         B, L, D = content_clip.shape
-         gen_audio_chunks = []
-         num_chunks = (L + self.chunk_size - 1) // self.chunk_size
-         for i in range(num_chunks):
-             start_idx = i * self.chunk_size
-             end_idx = min((i + 1) * self.chunk_size, L)
-             content_clip_chunk = content_clip[:, start_idx:end_idx, :]
-
-             gen_audio_chunk = self.dreamvc.inference(
-                 spk_embed, content_clip_chunk, None,
-                 guidance_scale=vc_guidance_scale,
-                 guidance_rescale=vc_guidance_rescale,
-                 ddim_steps=vc_ddim_steps,
-                 eta=vc_eta,
-                 random_seed=vc_random_seed)
-
-             gen_audio_chunks.append(gen_audio_chunk)
-
-         gen_audio = torch.cat(gen_audio_chunks, dim=-1)
-
-         gen_audio = self.hifigan(gen_audio.squeeze(1))
-         gen_audio = gen_audio.cpu().numpy().squeeze(0).squeeze(0)
-
-         return gen_audio, self.sr
-
-
- if __name__ == '__main__':
-     dreamvoice = DreamVoice(config='dreamvc.yaml', mode='plugin', device='cuda')
-     content_audio = 'test.wav'
-     speaker_audio = 'speaker.wav'
-     prompt = 'young female voice, sounds young and cute'
-     gen_audio, sr = dreamvoice.genvc(content_audio, prompt)
-     dreamvoice.save_audio('debug.wav', gen_audio, sr)

dreamvoice/.ipynb_checkpoints/dreamvc-checkpoint.yaml DELETED
@@ -1,27 +0,0 @@
- version: 1.1
-
- sample_rate: 24000
- vocoder_path: 'ckpts/bigvgan_24k/g_01000000.pt'
- vocoder_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/bigvgan_24k/g_01000000.pt'
- vocoder_config_path: 'ckpts/bigvgan_24k/config.json'
- vocoder_config_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/bigvgan_24k/config.json'
-
- speaker_path: 'ckpts/spk_encoder/pretrained.pt'
- speaker_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/spk_encoder/pretrained.pt'
- lm_path: 'google/flan-t5-base'
-
- dreamvc:
-   config_path: 'src/configs/diffvc_cross.yaml'
-   ckpt_path: 'ckpts/dreamvc_cross.pt'
-   ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/dreamvc_cross.pt'
-
- rediffvc:
-   config_path: 'src/configs/diffvc_base.yaml'
-   ckpt_path: 'ckpts/dreamvc_base.pt'
-   ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/dreamvc_base.pt'
-
- dreamvg:
-   config_path: 'src/configs/plugin_cross.yaml'
-   ckpt_path: 'ckpts/dreamvc_plugin.pt'
-   ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/dreamvc_plugin.pt'
-

dreamvoice/.ipynb_checkpoints/openvoice_utils-checkpoint.py DELETED
@@ -1,48 +0,0 @@
- import os
- import torch
- import librosa
- from tqdm import tqdm
- from openvoice.mel_processing import spectrogram_torch
- from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
-
-
- @torch.no_grad()
- def se_extractor(audio_path, vc):
-     # vad
-     SAMPLE_RATE = 16000
-     audio_vad = get_audio_tensor(audio_path)
-     segments = get_vad_segments(
-         audio_vad,
-         output_sample=True,
-         min_speech_duration=0.1,
-         min_silence_duration=1,
-         method="silero",
-     )
-     segments = [(seg["start"], seg["end"]) for seg in segments]
-     segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s, e in segments]
-
-     if len(segments) == 0:
-         segments = [(0, len(audio_vad) / SAMPLE_RATE)]
-         print(segments)
-
-     # spk
-     hps = vc.hps
-     device = vc.device
-     model = vc.model
-     gs = []
-
-     audio, sr = librosa.load(audio_path, sr=hps.data.sampling_rate)
-     audio = torch.tensor(audio).float().to(device)
-
-     for s, e in segments:
-         y = audio[int(hps.data.sampling_rate * s):int(hps.data.sampling_rate * e)]
-         y = y.to(device)
-         y = y.unsqueeze(0)
-         y = spectrogram_torch(y, hps.data.filter_length,
-                               hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
-                               center=False).to(device)
-         g = model.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
-         gs.append(g.detach())
-
-     gs = torch.stack(gs).mean(0)
-     return gs.cpu()

dreamvoice/.ipynb_checkpoints/plugin-checkpoint.py DELETED
@@ -1,128 +0,0 @@
- import os
- import requests
- import yaml
- import torch
- import librosa
- import numpy as np
- import soundfile as sf
- from pathlib import Path
- from transformers import T5Tokenizer, T5EncoderModel
- from tqdm import tqdm
- from .src.plugin_wrapper import DreamVG
-
-
- class DreamVoice_Plugin:
-     def __init__(self, config='plugin.yaml', device='cuda'):
-         # Initial setup
-         script_dir = Path(__file__).resolve().parent
-         config_path = script_dir / config
-
-         # Load configuration file
-         with open(config_path, 'r') as fp:
-             self.config = yaml.safe_load(fp)
-
-         self.script_dir = script_dir
-
-         # Ensure all checkpoints are downloaded
-         self._ensure_checkpoints_exist()
-
-         # Initialize attributes
-         self.device = device
-
-         # Load tokenizer and text encoder
-         lm_path = self.config['lm_path']
-         self.tokenizer = T5Tokenizer.from_pretrained(lm_path)
-         self.text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval()
-
-         self.dreamvg = DreamVG(
-             config_path=self.script_dir / self.config['dreamvg']['config_path'],
-             ckpt_path=self.script_dir / self.config['dreamvg']['ckpt_path'],
-             device=self.device
-         )
-
-     def _ensure_checkpoints_exist(self):
-         checkpoints = [
-             ('dreamvg.ckpt_path', self.config.get('dreamvg', {}).get('ckpt_url'))
-         ]
-
-         for path_key, url in checkpoints:
-             local_path = self._get_local_path(path_key)
-             if not local_path.exists() and url:
-                 print(f"Downloading {path_key} from {url}")
-                 self._download_file(url, local_path)
-
-     def _get_local_path(self, path_key):
-         keys = path_key.split('.')
-         local_path = self.config
-         for key in keys:
-             local_path = local_path.get(key, {})
-         return self.script_dir / local_path
-
-     def _download_file(self, url, local_path):
-         try:
-             # Attempt to send a GET request to the URL
-             response = requests.get(url, stream=True)
-             response.raise_for_status()  # Ensure we raise an exception for HTTP errors
-         except requests.exceptions.RequestException as e:
-             # Log the error for debugging purposes
-             print(f"Error encountered: {e}")
-
-             # Development mode: prompt user for Hugging Face API key
-             user_input = input("Private checkpoint, please request authorization and enter your Hugging Face API key.")
-             self.hf_key = user_input if user_input else None
-
-             # Set headers if an API key is provided
-             headers = {'Authorization': f'Bearer {self.hf_key}'} if self.hf_key else {}
-
-             try:
-                 # Attempt to send a GET request with headers in development mode
-                 response = requests.get(url, stream=True, headers=headers)
-                 response.raise_for_status()  # Ensure we raise an exception for HTTP errors
-             except requests.exceptions.RequestException as e:
-                 # Log the error for debugging purposes
-                 print(f"Error encountered in dev mode: {e}")
-                 response = None  # Handle response accordingly in your code
-
-         local_path.parent.mkdir(parents=True, exist_ok=True)
-
-         total_size = int(response.headers.get('content-length', 0))
-         block_size = 8192
-         t = tqdm(total=total_size, unit='iB', unit_scale=True)
-
-         with open(local_path, 'wb') as f:
-             for chunk in response.iter_content(chunk_size=block_size):
-                 t.update(len(chunk))
-                 f.write(chunk)
-         t.close()
-
-     def _init_plugin_mode(self):
-         # Initialize DreamVG
-         self.dreamvg = DreamVG(
-             config_path=self.script_dir / self.config['dreamvg']['config_path'],
-             ckpt_path=self.script_dir / self.config['dreamvg']['ckpt_path'],
-             device=self.device
-         )
-
-         # Load speaker encoder
-         spk_encoder.load_model(self.script_dir / self.config['speaker_path'], self.device)
-         self.spk_encoder = spk_encoder
-         self.spk_embed_cache = None
-
-
-     @torch.no_grad()
-     def gen_spk(self, prompt,
-                 prompt_guidance_scale=3, prompt_guidance_rescale=0.0,
-                 prompt_ddim_steps=100, prompt_eta=1, prompt_random_seed=None,):
-
-         text_batch = self.tokenizer(prompt, max_length=32,
-                                     padding='max_length', truncation=True, return_tensors="pt")
-         text, text_mask = text_batch.input_ids.to(self.device), \
-             text_batch.attention_mask.to(self.device)
-         text = self.text_encoder(input_ids=text, attention_mask=text_mask)[0]
-
-         spk_embed = self.dreamvg.inference([text, text_mask],
-                                            guidance_scale=prompt_guidance_scale,
-                                            guidance_rescale=prompt_guidance_rescale,
-                                            ddim_steps=prompt_ddim_steps, eta=prompt_eta,
-                                            random_seed=prompt_random_seed)
-         return spk_embed

dreamvoice/.ipynb_checkpoints/plugin-checkpoint.yaml DELETED
@@ -1,8 +0,0 @@
- version: 1.1
-
- lm_path: 'google/flan-t5-base'
-
- dreamvg:
-   config_path: 'src/configs/plugin_cross_openvoice.yaml'
-   ckpt_path: 'plugin_ckpts/openvoice_v2.pt'
-   ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/plugin_ckpts/openvoice_v2.pt'

dreamvoice/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (244 Bytes)
 
dreamvoice/__pycache__/api.cpython-310.pyc DELETED
Binary file (8.04 kB)
 
dreamvoice/__pycache__/openvoice_utils.cpython-310.pyc DELETED
Binary file (1.65 kB)
 
dreamvoice/__pycache__/plugin.cpython-310.pyc DELETED
Binary file (4.01 kB)
 
dreamvoice/src/.ipynb_checkpoints/plugin_wrapper-checkpoint.py DELETED
@@ -1,76 +0,0 @@
- import yaml
- import torch
- from diffusers import DDIMScheduler
- from .model.p2e_cross import P2E_Cross
- from .utils import scale_shift, scale_shift_re, rescale_noise_cfg
-
-
- class DreamVG(object):
-     def __init__(self,
-                  config_path='configs/plugin_cross.yaml',
-                  ckpt_path='../ckpts/dreamvc_plugin.pt',
-                  device='cpu'):
-
-         with open(config_path, 'r') as fp:
-             config = yaml.safe_load(fp)
-
-         self.device = device
-         self.model = P2E_Cross(config['model']).to(device)
-         self.model.load_state_dict(torch.load(ckpt_path)['model'])
-         self.model.eval()
-
-         noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'],
-                                         beta_start=config['scheduler']['beta_start'],
-                                         beta_end=config['scheduler']['beta_end'],
-                                         rescale_betas_zero_snr=True,
-                                         timestep_spacing="trailing",
-                                         clip_sample=False,
-                                         prediction_type='v_prediction')
-         self.noise_scheduler = noise_scheduler
-         self.scale = config['scheduler']['scale']
-         self.shift = config['scheduler']['shift']
-         self.spk_shape = config['model']['unet']['in_channels']
-
-     @torch.no_grad()
-     def inference(self, text,
-                   guidance_scale=5, guidance_rescale=0.7,
-                   ddim_steps=50, eta=1, random_seed=2023,
-                   ):
-         text, text_mask = text
-         self.model.eval()
-
-         gen_shape = (1, self.spk_shape)
-
-         if random_seed is not None:
-             generator = torch.Generator(device=self.device).manual_seed(random_seed)
-         else:
-             generator = torch.Generator(device=self.device)
-             generator.seed()
-
-         self.noise_scheduler.set_timesteps(ddim_steps)
-
-         # init noise
-         noise = torch.randn(gen_shape, generator=generator, device=self.device)
-         latents = noise
-
-         for t in self.noise_scheduler.timesteps:
-             latents = self.noise_scheduler.scale_model_input(latents, t)
-
-             if guidance_scale:
-                 output_text = self.model(latents, t, text, text_mask, train_cfg=False)
-                 output_uncond = self.model(latents, t, text, text_mask, train_cfg=True, cfg_prob=1.0)
-
-                 output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
-                 if guidance_rescale > 0.0:
-                     output_pred = rescale_noise_cfg(output_pred, output_text,
-                                                     guidance_rescale=guidance_rescale)
-             else:
-                 output_pred = self.model(latents, t, text, text_mask, train_cfg=False)
-
-             latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents,
-                                                 eta=eta, generator=generator).prev_sample
-
-         # pred = reverse_minmax_norm_diff(latents, vmin=0.0, vmax=0.5)
-         pred = scale_shift_re(latents, 1/self.scale, self.shift)
-         # pred = torch.clip(pred, min=0.0, max=0.5)
-         return pred

dreamvoice/src/.ipynb_checkpoints/vc_wrapper-checkpoint.py DELETED
@@ -1,144 +0,0 @@
- import yaml
- import torch
- from diffusers import DDIMScheduler
- from .model.model import DiffVC
- from .model.model_cross import DiffVC_Cross
- from .utils import scale_shift, scale_shift_re, rescale_noise_cfg
-
-
- class ReDiffVC(object):
-     def __init__(self,
-                  config_path='configs/diffvc_base.yaml',
-                  ckpt_path='../ckpts/dreamvc_base.pt',
-                  device='cpu'):
-
-         with open(config_path, 'r') as fp:
-             config = yaml.safe_load(fp)
-
-         self.device = device
-         self.model = DiffVC(config['model']).to(device)
-         self.model.load_state_dict(torch.load(ckpt_path)['model'])
-         self.model.eval()
-
-         noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'],
-                                         beta_start=config['scheduler']['beta_start'],
-                                         beta_end=config['scheduler']['beta_end'],
-                                         rescale_betas_zero_snr=True,
-                                         timestep_spacing="trailing",
-                                         clip_sample=False,
-                                         prediction_type='v_prediction')
-         self.noise_scheduler = noise_scheduler
-         self.scale = config['scheduler']['scale']
-         self.shift = config['scheduler']['shift']
-         self.melshape = config['model']['unet']['sample_size'][0]
-
-     @torch.no_grad()
-     def inference(self,
-                   spk_embed, content_clip, f0_clip=None,
-                   guidance_scale=3, guidance_rescale=0.7,
-                   ddim_steps=50, eta=1, random_seed=2023):
-
-         self.model.eval()
-         if random_seed is not None:
-             generator = torch.Generator(device=self.device).manual_seed(random_seed)
-         else:
-             generator = torch.Generator(device=self.device)
-             generator.seed()
-
-         self.noise_scheduler.set_timesteps(ddim_steps)
-
-         # init noise
-         gen_shape = (1, 1, self.melshape, content_clip.shape[-2])
-         noise = torch.randn(gen_shape, generator=generator, device=self.device)
-         latents = noise
-
-         for t in self.noise_scheduler.timesteps:
-             latents = self.noise_scheduler.scale_model_input(latents, t)
-
-             if guidance_scale:
-                 output_text = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=False)
-                 output_uncond = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=True,
-                                            speaker_cfg=1.0, pitch_cfg=0.0)
-
-                 output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
-                 if guidance_rescale > 0.0:
-                     output_pred = rescale_noise_cfg(output_pred, output_text,
-                                                     guidance_rescale=guidance_rescale)
-             else:
-                 output_pred = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=False)
-
-             latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents,
-                                                 eta=eta, generator=generator).prev_sample
-
-         pred = scale_shift_re(latents, scale=1/self.scale, shift=self.shift)
-         return pred
-
-
- class DreamVC(object):
-     def __init__(self,
-                  config_path='configs/diffvc_cross.yaml',
-                  ckpt_path='../ckpts/dreamvc_cross.pt',
-                  device='cpu'):
-
-         with open(config_path, 'r') as fp:
-             config = yaml.safe_load(fp)
-
-         self.device = device
-         self.model = DiffVC_Cross(config['model']).to(device)
-         self.model.load_state_dict(torch.load(ckpt_path)['model'])
-         self.model.eval()
-
-         noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'],
-                                         beta_start=config['scheduler']['beta_start'],
-                                         beta_end=config['scheduler']['beta_end'],
-                                         rescale_betas_zero_snr=True,
-                                         timestep_spacing="trailing",
-                                         clip_sample=False,
-                                         prediction_type='v_prediction')
-         self.noise_scheduler = noise_scheduler
-         self.scale = config['scheduler']['scale']
-         self.shift = config['scheduler']['shift']
-         self.melshape = config['model']['unet']['sample_size'][0]
-
-     @torch.no_grad()
-     def inference(self,
-                   text, content_clip, f0_clip=None,
-                   guidance_scale=3, guidance_rescale=0.7,
-                   ddim_steps=50, eta=1, random_seed=2023):
-
-         text, text_mask = text
-         self.model.eval()
-         if random_seed is not None:
-             generator = torch.Generator(device=self.device).manual_seed(random_seed)
-         else:
-             generator = torch.Generator(device=self.device)
-             generator.seed()
-
-         self.noise_scheduler.set_timesteps(ddim_steps)
-
-         # init noise
-         gen_shape = (1, 1, self.melshape, content_clip.shape[-2])
-         noise = torch.randn(gen_shape, generator=generator, device=self.device)
-         latents = noise
-
-         for t in self.noise_scheduler.timesteps:
-             latents = self.noise_scheduler.scale_model_input(latents, t)
-
-             if guidance_scale:
-                 output_text = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=False)
-                 output_uncond = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=True,
-                                            speaker_cfg=1.0, pitch_cfg=0.0)
-
-                 output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
-                 if guidance_rescale > 0.0:
-                     output_pred = rescale_noise_cfg(output_pred, output_text,
-                                                     guidance_rescale=guidance_rescale)
-             else:
-                 output_pred = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=False)
-
-             latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents,
-                                                 eta=eta, generator=generator).prev_sample
-
-         pred = scale_shift_re(latents, scale=1/self.scale, shift=self.shift)
-         return pred
-

dreamvoice/src/__pycache__/plugin_wrapper.cpython-310.pyc DELETED
Binary file (2.4 kB)
 
dreamvoice/src/__pycache__/vc_wrapper.cpython-310.pyc DELETED
Binary file (3.49 kB)
 
dreamvoice/src/configs/.ipynb_checkpoints/plugin_cross-checkpoint.yaml DELETED
@@ -1,39 +0,0 @@
- version: 1.0
-
- system: "cross"
-
- model:
-   cls_embedding:
-     content_dim: 768
-     content_hidden: 256
-
-   unet:
-     sample_size: [1, 1]
-     in_channels: 256
-     out_channels: 256
-     layers_per_block: 2
-     block_out_channels: [256]
-     down_block_types:
-       [
-         "CrossAttnDownBlock2D",
-       ]
-     up_block_types:
-       [
-         "CrossAttnUpBlock2D",
-       ]
-     attention_head_dim: 32
-     cross_attention_dim: 768
-
- scheduler:
-   num_train_steps: 1000
-   beta_schedule: 'linear'
-   beta_start: 0.0001
-   beta_end: 0.02
-   num_infer_steps: 50
-   rescale_betas_zero_snr: true
-   timestep_spacing: "trailing"
-   clip_sample: false
-   prediction_type: 'v_prediction'
-   scale: 0.05
-   shift: -0.035
-

dreamvoice/src/configs/.ipynb_checkpoints/plugin_cross_openvoice-checkpoint.yaml DELETED
@@ -1,39 +0,0 @@
- version: 1.0
-
- system: "cross"
-
- model:
-   cls_embedding:
-     content_dim: 768
-     content_hidden: 256
-
-   unet:
-     sample_size: [1, 1]
-     in_channels: 256
-     out_channels: 256
-     layers_per_block: 2
-     block_out_channels: [256]
-     down_block_types:
-       [
-         "CrossAttnDownBlock2D",
-       ]
-     up_block_types:
-       [
-         "CrossAttnUpBlock2D",
-       ]
-     attention_head_dim: 32
-     cross_attention_dim: 768
-
- scheduler:
-   num_train_steps: 1000
-   beta_schedule: 'linear'
-   beta_start: 0.0001
-   beta_end: 0.02
-   num_infer_steps: 50
-   rescale_betas_zero_snr: true
-   timestep_spacing: "trailing"
-   clip_sample: false
-   prediction_type: 'v_prediction'
-   scale: 1.0
-   shift: 0.0
-

dreamvoice/src/feats/.ipynb_checkpoints/contentvec-checkpoint.py DELETED
@@ -1,42 +0,0 @@
- import torch
- import librosa
- from fairseq import checkpoint_utils
- import torch.nn.functional as F
-
-
- def get_model(vec_path):
-     print("load model(s) from {}".format(vec_path))
-     models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
-         [vec_path],
-         suffix="",
-     )
-     model = models[0]
-     model.eval()
-     return model
-
-
- @torch.no_grad()
- def get_content(hmodel, wav_16k_tensor, device='cuda', layer=12):
-     # print(layer)
-     wav_16k_tensor = wav_16k_tensor.to(device)
-     # so that the output shape will be len(audio//320)
-     wav_16k_tensor = F.pad(wav_16k_tensor, ((400 - 320) // 2, (400 - 320) // 2))
-     feats = wav_16k_tensor
-     padding_mask = torch.BoolTensor(feats.shape).fill_(False)
-     inputs = {
-         "source": feats.to(wav_16k_tensor.device),
-         "padding_mask": padding_mask.to(wav_16k_tensor.device),
-         "output_layer": layer
-     }
-     logits = hmodel.extract_features(**inputs)[0]
-     # feats = hmodel.final_proj(logits[0])
-     return logits
-
-
- if __name__ == '__main__':
-     audio, sr = librosa.load('test.wav', sr=16000)
-     audio = audio[:100*320]
-     model = get_model('../../ckpts/checkpoint_best_legacy_500.pt')
-     model = model.cuda()
-     content = get_content(model, torch.tensor([audio]))
-     print(content)

dreamvoice/src/feats/.ipynb_checkpoints/contentvec_hf-checkpoint.py DELETED
@@ -1,40 +0,0 @@
- from transformers import HubertModel
- import torch.nn as nn
- import torch
- import torch.nn.functional as F
- import librosa
-
-
- class HubertModelWithFinalProj(HubertModel):
-     def __init__(self, config):
-         super().__init__(config)
-
-         # The final projection layer is only used for backward compatibility.
-         # Following https://github.com/auspicious3000/contentvec/issues/6
-         # Removing this layer is necessary to achieve the desired outcome.
-         self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)
-
-
- def get_content_model(config='lengyue233/content-vec-best'):
-     model = HubertModelWithFinalProj.from_pretrained(config)
-     model.eval()
-     return model
-
-
- @torch.no_grad()
- def get_content(model, wav_16k_tensor, device='cuda'):
-     wav_16k_tensor = wav_16k_tensor.to(device)
-     # so that the output shape will be len(audio//320)
-     wav_16k_tensor = F.pad(wav_16k_tensor, ((400 - 320) // 2, (400 - 320) // 2))
-     logits = model(wav_16k_tensor)['last_hidden_state']
-     return logits
-
-
- if __name__ == '__main__':
-     model = get_content_model().cuda()
-     audio, sr = librosa.load('test.wav', sr=16000)
-     audio = audio[:100*320]
-     audio = torch.tensor([audio])
-     content = get_content(model, audio, 'cuda')
-     print(content)

dreamvoice/src/feats/.ipynb_checkpoints/hubert_model-checkpoint.py DELETED
@@ -1,24 +0,0 @@
- import torch, torchaudio
- from .hubert.hubert import HubertSoft
- from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
- import librosa
-
-
- def get_soft_model(model_path):
-     hubert = HubertSoft()
-     # Load checkpoint (either hubert_soft or hubert_discrete)
-     # hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True)
-     checkpoint = torch.load(model_path)
-     consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.")
-     hubert.load_state_dict(checkpoint["hubert"])
-     hubert.eval()
-     return hubert
-
-
- @torch.no_grad()
- def get_hubert_soft_content(hmodel, wav_16k_tensor, device='cuda'):
-     wav_16k_tensor = wav_16k_tensor.to(device).unsqueeze(1)
-     # print(wav_16k_tensor.shape)
-     units = hmodel.units(wav_16k_tensor)
-     # print(units.shape)
-     return units.cpu()

dreamvoice/src/feats/.ipynb_checkpoints/test-checkpoint.py DELETED
@@ -1,22 +0,0 @@
- import torch, torchaudio
- from hubert.hubert import HubertSoft
- from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
- import librosa
-
-
- def get_soft_model(model_path):
-     hubert = HubertSoft()
-     # Load checkpoint (either hubert_soft or hubert_discrete)
-     # hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True)
-     checkpoint = torch.load(model_path)
-     consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.")
-     hubert.load_state_dict(checkpoint["hubert"])
-     hubert.eval()
-     return hubert
-
-
- @torch.no_grad()
- def get_hubert_soft_content(hmodel, wav_16k_tensor, device='cuda'):
-     wav_16k_tensor = wav_16k_tensor.to(device)
-     units = hmodel.units(wav_16k_tensor)
-     return units.cpu()

dreamvoice/src/feats/__pycache__/contentvec.cpython-310.pyc DELETED
Binary file (1.29 kB)
 
dreamvoice/src/feats/__pycache__/contentvec.cpython-311.pyc DELETED
Binary file (2.23 kB)
 
dreamvoice/src/feats/__pycache__/contentvec_hf.cpython-310.pyc DELETED
Binary file (1.45 kB)
 
dreamvoice/src/feats/__pycache__/contentvec_hf.cpython-311.pyc DELETED
Binary file (2.41 kB)
 
dreamvoice/src/feats/__pycache__/hubert_model.cpython-311.pyc DELETED
Binary file (1.44 kB)
 
dreamvoice/src/model/.ipynb_checkpoints/model-checkpoint.py DELETED
@@ -1,98 +0,0 @@
- import torch
- import torch.nn as nn
- from diffusers import UNet2DModel, UNet2DConditionModel
- import yaml
- from einops import repeat, rearrange
-
- from typing import Any
- from torch import Tensor
-
-
- def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor:
-     if proba == 1:
-         return torch.ones(shape, device=device, dtype=torch.bool)
-     elif proba == 0:
-         return torch.zeros(shape, device=device, dtype=torch.bool)
-     else:
-         return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool)
-
-
- class DiffVC(nn.Module):
-     def __init__(self, config):
-         super().__init__()
-         self.config = config
-         self.unet = UNet2DModel(**self.config['unet'])
-         self.unet.set_use_memory_efficient_attention_xformers(True)
-         self.speaker_embedding = nn.Sequential(
-             nn.Linear(self.config['cls_embedding']['speaker_dim'], self.config['cls_embedding']['feature_dim']),
-             nn.SiLU(),
-             nn.Linear(self.config['cls_embedding']['feature_dim'], self.config['cls_embedding']['feature_dim']))
-         self.uncond = nn.Parameter(torch.randn(self.config['cls_embedding']['speaker_dim']) /
-                                    self.config['cls_embedding']['speaker_dim'] ** 0.5)
-         self.content_embedding = nn.Sequential(
-             nn.Linear(self.config['cls_embedding']['content_dim'], self.config['cls_embedding']['content_hidden']),
-             nn.SiLU(),
-             nn.Linear(self.config['cls_embedding']['content_hidden'], self.config['cls_embedding']['content_hidden']))
-
-         if self.config['cls_embedding']['use_pitch']:
-             self.pitch_control = True
-             self.pitch_embedding = nn.Sequential(
-                 nn.Linear(self.config['cls_embedding']['pitch_dim'], self.config['cls_embedding']['pitch_hidden']),
-                 nn.SiLU(),
-                 nn.Linear(self.config['cls_embedding']['pitch_hidden'],
-                           self.config['cls_embedding']['pitch_hidden']))
-             self.pitch_uncond = nn.Parameter(torch.randn(self.config['cls_embedding']['pitch_hidden']) /
-                                              self.config['cls_embedding']['pitch_hidden'] ** 0.5)
-         else:
-             print('no pitch module')
-             self.pitch_control = False
-
-     def forward(self, target, t, content, speaker, pitch,
-                 train_cfg=False, speaker_cfg=0.0, pitch_cfg=0.0):
-         B, C, M, L = target.shape
-         content = self.content_embedding(content)
-         content = repeat(content, "b t c-> b c m t", m=M)
-         target = target.to(content.dtype)
-         x = torch.cat([target, content], dim=1)
-
-         if self.pitch_control:
-             if pitch is not None:
-                 pitch = self.pitch_embedding(pitch)
-             else:
-                 pitch = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype)
-
-         if train_cfg:
-             uncond = repeat(self.uncond, "c-> b c", b=B).to(target.dtype)
-             batch_mask = rand_bool(shape=(B, 1), proba=speaker_cfg, device=target.device)
-             speaker = torch.where(batch_mask, uncond, speaker)
-
-             if self.pitch_control:
-                 batch_mask = rand_bool(shape=(B, 1, 1), proba=pitch_cfg, device=target.device)
-                 pitch_uncond = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype)
-                 pitch = torch.where(batch_mask, pitch_uncond, pitch)
-
-         speaker = self.speaker_embedding(speaker)
-
-         if self.pitch_control:
-             pitch = repeat(pitch, "b t c-> b c m t", m=M)
-             x = torch.cat([x, pitch], dim=1)
-
-         output = self.unet(sample=x, timestep=t, class_labels=speaker)['sample']
-
-         return output
-
-
- if __name__ == "__main__":
-     with open('diffvc_base_pitch.yaml', 'r') as fp:
-         config = yaml.safe_load(fp)
-     device = 'cuda'
-
-     model = DiffVC(config['diffwrap']).to(device)
-
-     x = torch.rand((2, 1, 100, 256)).to(device)
-     y = torch.rand((2, 256, 768)).to(device)
-     p = torch.rand(2, 256, 1).to(device)
-     t = torch.randint(0, 1000, (2,)).long().to(device)
-     spk = torch.rand(2, 256).to(device)
-
-     output = model(x, t, y, spk, pitch=p, train_cfg=True, speaker_cfg=0.25, pitch_cfg=0.5)

dreamvoice/src/model/.ipynb_checkpoints/model_cross-checkpoint.py DELETED
@@ -1,116 +0,0 @@
- import torch
- import torch.nn as nn
- from diffusers import UNet2DModel, UNet2DConditionModel
- import yaml
- from einops import repeat, rearrange
-
- from typing import Any
- from torch import Tensor
-
-
- def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor:
-     if proba == 1:
-         return torch.ones(shape, device=device, dtype=torch.bool)
-     elif proba == 0:
-         return torch.zeros(shape, device=device, dtype=torch.bool)
-     else:
-         return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool)
-
-
- class FixedEmbedding(nn.Module):
-     def __init__(self, features=128):
-         super().__init__()
-         self.embedding = nn.Embedding(1, features)
-
-     def forward(self, y):
-         B, L, C, device = y.shape[0], y.shape[-2], y.shape[-1], y.device
-         embed = self.embedding(torch.zeros(B, device=device).long())
-         fixed_embedding = repeat(embed, "b c -> b l c", l=L)
-         return fixed_embedding
-
-
- class DiffVC_Cross(nn.Module):
-     def __init__(self, config):
-         super().__init__()
-         self.config = config
-         self.unet = UNet2DConditionModel(**self.config['unet'])
-         self.unet.set_use_memory_efficient_attention_xformers(True)
-         self.cfg_embedding = FixedEmbedding(self.config['unet']['cross_attention_dim'])
-
-         self.context_embedding = nn.Sequential(
-             nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim']),
-             nn.SiLU(),
-             nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim']))
-
-         self.content_embedding = nn.Sequential(
-             nn.Linear(self.config['cls_embedding']['content_dim'], self.config['cls_embedding']['content_hidden']),
-             nn.SiLU(),
-             nn.Linear(self.config['cls_embedding']['content_hidden'], self.config['cls_embedding']['content_hidden']))
-
-         if self.config['cls_embedding']['use_pitch']:
-             self.pitch_control = True
-             self.pitch_embedding = nn.Sequential(
-                 nn.Linear(self.config['cls_embedding']['pitch_dim'], self.config['cls_embedding']['pitch_hidden']),
-                 nn.SiLU(),
-                 nn.Linear(self.config['cls_embedding']['pitch_hidden'],
-                           self.config['cls_embedding']['pitch_hidden']))
-
-             self.pitch_uncond = nn.Parameter(torch.randn(self.config['cls_embedding']['pitch_hidden']) /
-                                              self.config['cls_embedding']['pitch_hidden'] ** 0.5)
-         else:
-             print('no pitch module')
-             self.pitch_control = False
-
-     def forward(self, target, t, content, prompt, prompt_mask=None, pitch=None,
-                 train_cfg=False, speaker_cfg=0.0, pitch_cfg=0.0):
-         B, C, M, L = target.shape
-         content = self.content_embedding(content)
-         content = repeat(content, "b t c-> b c m t", m=M)
-         target = target.to(content.dtype)
-         x = torch.cat([target, content], dim=1)
-
-         if self.pitch_control:
-             if pitch is not None:
-                 pitch = self.pitch_embedding(pitch)
-             else:
-                 pitch = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype)
-
-         if train_cfg:
-             # Randomly mask embedding
-             batch_mask = rand_bool(shape=(B, 1, 1), proba=speaker_cfg, device=target.device)
-             fixed_embedding = self.cfg_embedding(prompt).to(target.dtype)
-             prompt = torch.where(batch_mask, fixed_embedding, prompt)
-
-             if self.pitch_control:
-                 batch_mask = rand_bool(shape=(B, 1, 1), proba=pitch_cfg, device=target.device)
-                 pitch_uncond = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype)
-                 pitch = torch.where(batch_mask, pitch_uncond, pitch)
-
-         prompt = self.context_embedding(prompt)
-
-         if self.pitch_control:
-             pitch = repeat(pitch, "b t c-> b c m t", m=M)
-             x = torch.cat([x, pitch], dim=1)
-
-         output = self.unet(sample=x, timestep=t,
-                            encoder_hidden_states=prompt,
-                            encoder_attention_mask=prompt_mask)['sample']
-
-         return output
-
-
- if __name__ == "__main__":
-     with open('diffvc_cross_pitch.yaml', 'r') as fp:
-         config = yaml.safe_load(fp)
-     device = 'cuda'
-
-     model = DiffVC_Cross(config['diffwrap']).to(device)
-
-     x = torch.rand((2, 1, 100, 256)).to(device)
-     y = torch.rand((2, 256, 768)).to(device)
-     t = torch.randint(0, 1000, (2,)).long().to(device)
-     prompt = torch.rand(2, 64, 768).to(device)
-     prompt_mask = torch.ones(2, 64).to(device)
-     p = torch.rand(2, 256, 1).to(device)
-
-     output = model(x, t, y, prompt, prompt_mask, p, train_cfg=True, speaker_cfg=0.25, pitch_cfg=0.5)

dreamvoice/src/model/.ipynb_checkpoints/p2e_cross-checkpoint.py DELETED
@@ -1,80 +0,0 @@
- import torch
- import torch.nn as nn
- from diffusers import UNet2DModel, UNet2DConditionModel
- import yaml
- from einops import repeat, rearrange
-
- from typing import Any
- from torch import Tensor
-
-
- def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor:
-     if proba == 1:
-         return torch.ones(shape, device=device, dtype=torch.bool)
-     elif proba == 0:
-         return torch.zeros(shape, device=device, dtype=torch.bool)
-     else:
-         return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool)
-
-
- class FixedEmbedding(nn.Module):
-     def __init__(self, features=128):
-         super().__init__()
-         self.embedding = nn.Embedding(1, features)
-
-     def forward(self, y):
-         B, L, C, device = y.shape[0], y.shape[-2], y.shape[-1], y.device
-         embed = self.embedding(torch.zeros(B, device=device).long())
-         fixed_embedding = repeat(embed, "b c -> b l c", l=L)
-         return fixed_embedding
-
-
- class P2E_Cross(nn.Module):
-     def __init__(self, config):
-         super().__init__()
-         self.config = config
-         self.unet = UNet2DConditionModel(**self.config['unet'])
-         self.unet.set_use_memory_efficient_attention_xformers(True)
-         self.cfg_embedding = FixedEmbedding(self.config['unet']['cross_attention_dim'])
-
-         self.context_embedding = nn.Sequential(
-             nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim']),
-             nn.SiLU(),
-             nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim']))
-
-     def forward(self, target, t, prompt, prompt_mask=None,
-                 train_cfg=False, cfg_prob=0.0):
-         B, C = target.shape
-         target = target.unsqueeze(-1).unsqueeze(-1)
-
-         if train_cfg:
-             if cfg_prob > 0.0:
-                 # Randomly mask embedding
-                 batch_mask = rand_bool(shape=(B, 1, 1), proba=cfg_prob, device=target.device)
-                 fixed_embedding = self.cfg_embedding(prompt).to(target.dtype)
-                 prompt = torch.where(batch_mask, fixed_embedding, prompt)
-
-         prompt = self.context_embedding(prompt)
-         # fix the bug where prompt copies its dtype from target in diffusers
-         target = target.to(prompt.dtype)
-
-         output = self.unet(sample=target, timestep=t,
-                            encoder_hidden_states=prompt,
-                            encoder_attention_mask=prompt_mask)['sample']
-
-         return output.squeeze(-1).squeeze(-1)
-
-
- if __name__ == "__main__":
-     with open('p2e_cross.yaml', 'r') as fp:
-         config = yaml.safe_load(fp)
-     device = 'cuda'
-
-     model = P2E_Cross(config['diffwrap']).to(device)
-
-     x = torch.rand((2, 256)).to(device)
-     t = torch.randint(0, 1000, (2,)).long().to(device)
-     prompt = torch.rand(2, 64, 768).to(device)
-     prompt_mask = torch.ones(2, 64).to(device)
-
-     output = model(x, t, prompt, prompt_mask, train_cfg=True, cfg_prob=0.25)

dreamvoice/src/model/__pycache__/model.cpython-310.pyc DELETED
Binary file (3.2 kB)
 
dreamvoice/src/model/__pycache__/model.cpython-311.pyc DELETED
Binary file (7.44 kB)
 
dreamvoice/src/model/__pycache__/model_cross.cpython-310.pyc DELETED
Binary file (3.96 kB)
 
dreamvoice/src/model/__pycache__/model_cross.cpython-311.pyc DELETED
Binary file (8.76 kB)
 
dreamvoice/src/model/__pycache__/model_cross.cpython-39.pyc DELETED
Binary file (3.94 kB)
 
dreamvoice/src/model/__pycache__/p2e_cross.cpython-310.pyc DELETED
Binary file (3.06 kB)
 
dreamvoice/src/model/__pycache__/p2e_cross.cpython-311.pyc DELETED
Binary file (6.24 kB)
 
dreamvoice/src/modules/.ipynb_checkpoints/mel-checkpoint.py DELETED
@@ -1,37 +0,0 @@
- import torch
- import torch.nn.functional as F
- import torchaudio
- import torchaudio.transforms as transforms
-
-
- class LogMelSpectrogram(torch.nn.Module):
-     def __init__(self, sr=24000, frame_length=1920, hop_length=480, n_mel=128, f_min=0, f_max=12000,):
-         super().__init__()
-         self.frame_length = frame_length
-         self.hop_length = hop_length
-         self.mel = transforms.MelSpectrogram(
-             sample_rate=sr,
-             n_fft=frame_length,
-             win_length=frame_length,
-             hop_length=hop_length,
-             center=False,
-             power=1.0,
-             norm="slaney",
-             n_mels=n_mel,
-             mel_scale="slaney",
-             f_min=f_min,
-             f_max=f_max
-         )
-
-     @torch.no_grad()
-     def forward(self, x, target_length=None):
-         x = F.pad(x, ((self.frame_length - self.hop_length) // 2,
-                       (self.frame_length - self.hop_length) // 2), "reflect")
-         mel = self.mel(x)
-
-         target_length = mel.shape[-1] if target_length is None else target_length
-         logmel = torch.zeros(mel.shape[0], mel.shape[1], target_length).to(mel.device)
-         logmel[:, :, :mel.shape[2]] = mel
-
-         logmel = torch.log(torch.clamp(logmel, min=1e-5))
-         return logmel

dreamvoice/src/utils/.ipynb_checkpoints/__init__-checkpoint.py DELETED
@@ -1 +0,0 @@
- from .utils import *

dreamvoice/src/utils/.ipynb_checkpoints/utils-checkpoint.py DELETED
@@ -1,76 +0,0 @@
- import numpy as np
- import matplotlib.pyplot as plt
- from scipy.io import wavfile
- import torch
-
-
- def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
-     """
-     Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
-     Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
-     """
-     std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
-     std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
-     # rescale the results from guidance (fixes overexposure)
-     noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
-     # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
-     noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
-     return noise_cfg
-
-
- def scale_shift(x, scale, shift):
-     return (x + shift) * scale
-
-
- def scale_shift_re(x, scale, shift):
-     return (x / scale) - shift
-
-
- def align_seq(source, target_length, mapping_method='hard'):
-     source_len = source.shape[1]
-     if mapping_method == 'hard':
-         mapping_idx = np.round(np.arange(target_length) * source_len / target_length)
-         output = source[:, mapping_idx]
-     else:
-         # TBD
-         raise NotImplementedError
-
-     return output
-
-
- def save_plot(tensor, savepath):
-     tensor = tensor.squeeze().cpu()
-     plt.style.use('default')
-     fig, ax = plt.subplots(figsize=(12, 3))
-     im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation='none')
-     plt.colorbar(im, ax=ax)
-     plt.tight_layout()
-     fig.canvas.draw()
-     plt.savefig(savepath)
-     plt.close()
-
-
- def save_audio(file_path, sampling_rate, audio):
-     audio = np.clip(audio.cpu().squeeze().numpy(), -0.999, 0.999)
-     wavfile.write(file_path, sampling_rate, (audio * 32767).astype("int16"))
-
-
- def minmax_norm_diff(tensor: torch.Tensor, vmax: float = 2.5, vmin: float = -12) -> torch.Tensor:
-     tensor = torch.clip(tensor, vmin, vmax)
-     tensor = 2 * (tensor - vmin) / (vmax - vmin) - 1
-     return tensor
-
-
- def reverse_minmax_norm_diff(tensor: torch.Tensor, vmax: float = 2.5, vmin: float = -12) -> torch.Tensor:
-     tensor = torch.clip(tensor, -1.0, 1.0)
-     tensor = (tensor + 1) / 2
-     tensor = tensor * (vmax - vmin) + vmin
-     return tensor
-
-
- if __name__ == "__main__":
-
-     a = torch.rand(2, 10)
-     target_len = 15
-
-     b = align_seq(a, target_len)

dreamvoice/src/utils/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (190 Bytes)
 
dreamvoice/src/utils/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (206 Bytes)
 
dreamvoice/src/utils/__pycache__/utils.cpython-310.pyc DELETED
Binary file (2.71 kB)
 
dreamvoice/src/utils/__pycache__/utils.cpython-311.pyc DELETED
Binary file (4.57 kB)