Higobeatz committed
Commit e27f927 · verified · 1 Parent(s): d9dbcea

Delete dreamvoice/src/.ipynb_checkpoints

dreamvoice/src/.ipynb_checkpoints/extract_features-checkpoint.py DELETED
@@ -1,103 +0,0 @@
- import os
- import torch
- import librosa
- import numpy as np
- import soundfile as sf
- import pandas as pd
- # from feats.hubert_model import get_soft_model, get_hubert_soft_content
- from feats.contentvec_hf import get_content_model, get_content
- # from modules.speaker_encoder.encoder import inference as spk_encoder
- # from pathlib import Path
- from tqdm import tqdm
- from multiprocessing import Process
- import pyworld as pw
-
-
- def resample_save(infolder, audio_path, model,
-                   audio_sr=24000, content_sr=16000, min_length=1.92,
-                   content_resolution=50,
-                   save_path='features'):
-     if os.path.isfile(save_path + '/' + 'audio_24k/' + audio_path) is False:
-         audio, sr = librosa.load(infolder + audio_path, sr=content_sr)
-         final_length = audio.shape[-1] // (content_sr / content_resolution) * (content_sr / content_resolution)
-         # final_length = final_length / content_sr
-
-         length = max(round(min_length*content_sr), round(final_length))
-         assert length % 10 == 0
-         audio = audio[:length]
-         audio_save = np.zeros(length, dtype=audio.dtype)
-         audio_save[:audio.shape[-1]] = audio[:audio.shape[-1]]
-
-         # content = get_hubert_soft_content(model, torch.tensor(audio_save).unsqueeze(0))
-         content = get_content(model, torch.tensor(audio_save).unsqueeze(0))
-         content = content.cpu()
-         os.makedirs(os.path.dirname(save_path + '/' + 'content/' + audio_path), exist_ok=True)
-         torch.save(content, save_path + '/' + 'content/' + audio_path + '.pt')
-         # print(audio_save.shape)
-         # print(content.shape)
-         os.makedirs(os.path.dirname(save_path + '/' + 'audio_16k/' + audio_path), exist_ok=True)
-         sf.write(save_path + '/' + 'audio_16k/' + audio_path, audio_save, int(sr))
-         # print(save_path + '/' + 'audio_16k/' + audio_path)
-
-         audio, sr = librosa.load(infolder + audio_path, sr=audio_sr)
-         length = max(round(min_length*audio_sr), round(final_length/content_sr*audio_sr))
-         assert length % 10 == 0
-         audio = audio[:length]
-         audio_save = np.zeros(length, dtype=audio.dtype)
-         audio_save[:audio.shape[-1]] = audio[:audio.shape[-1]]
-         # print(audio_save.shape)
-         os.makedirs(os.path.dirname(save_path + '/' + 'audio_24k/' + audio_path), exist_ok=True)
-         sf.write(save_path + '/' + 'audio_24k/' + audio_path, audio_save, int(sr))
-
-
- def extract_f0(in_folder, audio_path, save_path):
-     audio, sr = librosa.load(in_folder + audio_path, sr=None)
-     assert sr == 16000
-     if os.path.isfile(save_path + '/' + 'f0/' + audio_path + '.pt') is False:
-         # wav = audio
-         # wav = np.pad(wav, int((1024-320)/2), mode='reflect')
-         # f0_, _, _ = librosa.pyin(wav, frame_length=1024, hop_length=320, center=False, sr=sr,
-         #                          fmin=librosa.note_to_hz('C2'),
-         #                          fmax=librosa.note_to_hz('C6'))
-
-         _f0, t = pw.dio(audio.astype(np.float64), sr, frame_period=320 / sr * 1000)
-         f0 = pw.stonemask(audio.astype(np.float64), _f0, t, sr)[:-1]
-
-         f0 = np.nan_to_num(f0)
-         os.makedirs(os.path.dirname(save_path + '/' + 'f0/' + audio_path), exist_ok=True)
-         # print(save_path + '/' + 'f0/' + audio_path + '.pt')
-         torch.save(torch.tensor(f0), save_path + '/' + 'f0/' + audio_path + '.pt')
-
-
- def chunks(arr, m):
-     result = [[] for i in range(m)]
-     for i in range(len(arr)):
-         result[i % m].append(arr[i])
-     return result
-
-
- def extract_f0_main(in_folder, audio_paths, save_path):
-     for audio_path in tqdm(audio_paths):
-         extract_f0(in_folder, audio_path, save_path)
-
-
- if __name__ == '__main__':
-     df = pd.read_csv('../test_data/vc_meta.csv')
-     # model = get_soft_model('../pre_ckpts/hubert_soft.pt').to('cuda')
-     model = get_content_model().to('cuda')
-     # spk_encoder.load_model(Path('ckpts/spk_encoder/pretrained.pt'), device="cuda")
-     for i in tqdm(range(len(df))):
-         row = df.iloc[i]
-         in_path = row['path']
-         resample_save('../test_data/', in_path, model, save_path='../features/')
-
-     in_folder = '../features/audio_16k/'
-     audio_files = list(df['path'])
-     save_path = '../features/'
-     cores = 6
-
-     subsets = chunks(audio_files, cores)
-
-     for subset in subsets:
-         t = Process(target=extract_f0_main, args=(in_folder, subset, save_path))
-         t.start()
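
For reference, the deleted checkpoint script wrote one feature set per row of vc_meta.csv: ContentVec features under content/, zero-padded waveforms under audio_16k/ and audio_24k/, and DIO/StoneMask pitch tracks under f0/. A minimal sketch of reading those outputs back, assuming the script's default paths; the file name is a hypothetical metadata entry:

import torch
import soundfile as sf

path = 'p225/p225_001.wav'  # hypothetical entry from ../test_data/vc_meta.csv
feat_dir = '../features/'

content = torch.load(feat_dir + 'content/' + path + '.pt')  # saved by resample_save
f0 = torch.load(feat_dir + 'f0/' + path + '.pt')            # pw.dio + pw.stonemask track
audio_16k, sr16 = sf.read(feat_dir + 'audio_16k/' + path)   # padded 16 kHz clip
audio_24k, sr24 = sf.read(feat_dir + 'audio_24k/' + path)   # padded 24 kHz clip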
dreamvoice/src/.ipynb_checkpoints/plugin_wrapper-checkpoint.py DELETED
@@ -1,76 +0,0 @@
- import yaml
- import torch
- from diffusers import DDIMScheduler
- from .model.p2e_cross import P2E_Cross
- from .utils import scale_shift, scale_shift_re, rescale_noise_cfg
-
-
- class DreamVG(object):
-     def __init__(self,
-                  config_path='configs/plugin_cross.yaml',
-                  ckpt_path='../ckpts/dreamvc_plugin.pt',
-                  device='cpu'):
-
-         with open(config_path, 'r') as fp:
-             config = yaml.safe_load(fp)
-
-         self.device = device
-         self.model = P2E_Cross(config['model']).to(device)
-         self.model.load_state_dict(torch.load(ckpt_path)['model'])
-         self.model.eval()
-
-         noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'],
-                                         beta_start=config['scheduler']['beta_start'],
-                                         beta_end=config['scheduler']['beta_end'],
-                                         rescale_betas_zero_snr=True,
-                                         timestep_spacing="trailing",
-                                         clip_sample=False,
-                                         prediction_type='v_prediction')
-         self.noise_scheduler = noise_scheduler
-         self.scale = config['scheduler']['scale']
-         self.shift = config['scheduler']['shift']
-         self.spk_shape = config['model']['unet']['in_channels']
-
-     @torch.no_grad()
-     def inference(self, text,
-                   guidance_scale=5, guidance_rescale=0.7,
-                   ddim_steps=50, eta=1, random_seed=2023,
-                   ):
-         text, text_mask = text
-         self.model.eval()
-
-         gen_shape = (1, self.spk_shape)
-
-         if random_seed is not None:
-             generator = torch.Generator(device=self.device).manual_seed(random_seed)
-         else:
-             generator = torch.Generator(device=self.device)
-             generator.seed()
-
-         self.noise_scheduler.set_timesteps(ddim_steps)
-
-         # init noise
-         noise = torch.randn(gen_shape, generator=generator, device=self.device)
-         latents = noise
-
-         for t in self.noise_scheduler.timesteps:
-             latents = self.noise_scheduler.scale_model_input(latents, t)
-
-             if guidance_scale:
-                 output_text = self.model(latents, t, text, text_mask, train_cfg=False)
-                 output_uncond = self.model(latents, t, text, text_mask, train_cfg=True, cfg_prob=1.0)
-
-                 output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
-                 if guidance_rescale > 0.0:
-                     output_pred = rescale_noise_cfg(output_pred, output_text,
-                                                     guidance_rescale=guidance_rescale)
-             else:
-                 output_pred = self.model(latents, t, text, text_mask, train_cfg=False)
-
-             latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents,
-                                                 eta=eta, generator=generator).prev_sample
-
-         # pred = reverse_minmax_norm_diff(latents, vmin=0.0, vmax=0.5)
-         pred = scale_shift_re(latents, 1/self.scale, self.shift)
-         # pred = torch.clip(pred, min=0.0, max=0.5)
-         return pred
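
For reference, the deleted DreamVG wrapper turns a text-prompt embedding into a speaker embedding via DDIM sampling with classifier-free guidance. A minimal usage sketch; the (text, text_mask) pair is assumed to come from whatever text encoder the project pairs with this plugin, and the tensor shapes below are illustrative only:

import torch

dreamvg = DreamVG(config_path='configs/plugin_cross.yaml',
                  ckpt_path='../ckpts/dreamvc_plugin.pt',
                  device='cuda')

# Hypothetical encoder output: (batch, tokens, dim) embeddings plus a token mask.
text_emb = torch.randn(1, 32, 768, device='cuda')
text_mask = torch.ones(1, 32, dtype=torch.long, device='cuda')

spk_embed = dreamvg.inference((text_emb, text_mask),
                              guidance_scale=5, guidance_rescale=0.7,
                              ddim_steps=50, eta=1, random_seed=2023)
print(spk_embed.shape)  # (1, spk_shape), where spk_shape comes from the model config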
dreamvoice/src/.ipynb_checkpoints/train_plugin-checkpoint.py DELETED
File without changes
dreamvoice/src/.ipynb_checkpoints/train_vc-checkpoint.py DELETED
File without changes
dreamvoice/src/.ipynb_checkpoints/vc_wrapper-checkpoint.py DELETED
@@ -1,144 +0,0 @@
- import yaml
- import torch
- from diffusers import DDIMScheduler
- from .model.model import DiffVC
- from .model.model_cross import DiffVC_Cross
- from .utils import scale_shift, scale_shift_re, rescale_noise_cfg
-
-
- class ReDiffVC(object):
-     def __init__(self,
-                  config_path='configs/diffvc_base.yaml',
-                  ckpt_path='../ckpts/dreamvc_base.pt',
-                  device='cpu'):
-
-         with open(config_path, 'r') as fp:
-             config = yaml.safe_load(fp)
-
-         self.device = device
-         self.model = DiffVC(config['model']).to(device)
-         self.model.load_state_dict(torch.load(ckpt_path)['model'])
-         self.model.eval()
-
-         noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'],
-                                         beta_start=config['scheduler']['beta_start'],
-                                         beta_end=config['scheduler']['beta_end'],
-                                         rescale_betas_zero_snr=True,
-                                         timestep_spacing="trailing",
-                                         clip_sample=False,
-                                         prediction_type='v_prediction')
-         self.noise_scheduler = noise_scheduler
-         self.scale = config['scheduler']['scale']
-         self.shift = config['scheduler']['shift']
-         self.melshape = config['model']['unet']['sample_size'][0]
-
-     @torch.no_grad()
-     def inference(self,
-                   spk_embed, content_clip, f0_clip=None,
-                   guidance_scale=3, guidance_rescale=0.7,
-                   ddim_steps=50, eta=1, random_seed=2023):
-
-         self.model.eval()
-         if random_seed is not None:
-             generator = torch.Generator(device=self.device).manual_seed(random_seed)
-         else:
-             generator = torch.Generator(device=self.device)
-             generator.seed()
-
-         self.noise_scheduler.set_timesteps(ddim_steps)
-
-         # init noise
-         gen_shape = (1, 1, self.melshape, content_clip.shape[-2])
-         noise = torch.randn(gen_shape, generator=generator, device=self.device)
-         latents = noise
-
-         for t in self.noise_scheduler.timesteps:
-             latents = self.noise_scheduler.scale_model_input(latents, t)
-
-             if guidance_scale:
-                 output_text = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=False)
-                 output_uncond = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=True,
-                                            speaker_cfg=1.0, pitch_cfg=0.0)
-
-                 output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
-                 if guidance_rescale > 0.0:
-                     output_pred = rescale_noise_cfg(output_pred, output_text,
-                                                     guidance_rescale=guidance_rescale)
-             else:
-                 output_pred = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=False)
-
-             latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents,
-                                                 eta=eta, generator=generator).prev_sample
-
-         pred = scale_shift_re(latents, scale=1/self.scale, shift=self.shift)
-         return pred
-
-
- class DreamVC(object):
-     def __init__(self,
-                  config_path='configs/diffvc_cross.yaml',
-                  ckpt_path='../ckpts/dreamvc_cross.pt',
-                  device='cpu'):
-
-         with open(config_path, 'r') as fp:
-             config = yaml.safe_load(fp)
-
-         self.device = device
-         self.model = DiffVC_Cross(config['model']).to(device)
-         self.model.load_state_dict(torch.load(ckpt_path)['model'])
-         self.model.eval()
-
-         noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'],
-                                         beta_start=config['scheduler']['beta_start'],
-                                         beta_end=config['scheduler']['beta_end'],
-                                         rescale_betas_zero_snr=True,
-                                         timestep_spacing="trailing",
-                                         clip_sample=False,
-                                         prediction_type='v_prediction')
-         self.noise_scheduler = noise_scheduler
-         self.scale = config['scheduler']['scale']
-         self.shift = config['scheduler']['shift']
-         self.melshape = config['model']['unet']['sample_size'][0]
-
-     @torch.no_grad()
-     def inference(self,
-                   text, content_clip, f0_clip=None,
-                   guidance_scale=3, guidance_rescale=0.7,
-                   ddim_steps=50, eta=1, random_seed=2023):
-
-         text, text_mask = text
-         self.model.eval()
-         if random_seed is not None:
-             generator = torch.Generator(device=self.device).manual_seed(random_seed)
-         else:
-             generator = torch.Generator(device=self.device)
-             generator.seed()
-
-         self.noise_scheduler.set_timesteps(ddim_steps)
-
-         # init noise
-         gen_shape = (1, 1, self.melshape, content_clip.shape[-2])
-         noise = torch.randn(gen_shape, generator=generator, device=self.device)
-         latents = noise
-
-         for t in self.noise_scheduler.timesteps:
-             latents = self.noise_scheduler.scale_model_input(latents, t)
-
-             if guidance_scale:
-                 output_text = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=False)
-                 output_uncond = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=True,
-                                            speaker_cfg=1.0, pitch_cfg=0.0)
-
-                 output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
-                 if guidance_rescale > 0.0:
-                     output_pred = rescale_noise_cfg(output_pred, output_text,
-                                                     guidance_rescale=guidance_rescale)
-             else:
-                 output_pred = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=False)
-
-             latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents,
-                                                 eta=eta, generator=generator).prev_sample
-
-         pred = scale_shift_re(latents, scale=1/self.scale, shift=self.shift)
-         return pred
-
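
For reference, the two deleted wrappers share the same DDIM sampling loop and differ only in conditioning: ReDiffVC takes a precomputed speaker embedding, while DreamVC takes a text-prompt embedding pair. A minimal usage sketch for ReDiffVC with hypothetical tensor shapes; the loop above only relies on content_clip.shape[-2] as the time axis, so the exact feature dimensions are assumptions:

import torch

vc = ReDiffVC(config_path='configs/diffvc_base.yaml',
              ckpt_path='../ckpts/dreamvc_base.pt',
              device='cuda')

# Hypothetical inputs: speaker embedding plus a content clip (time axis second to last).
spk_embed = torch.randn(1, 256, device='cuda')
content_clip = torch.randn(1, 100, 768, device='cuda')

mel = vc.inference(spk_embed, content_clip, f0_clip=None,
                   guidance_scale=3, guidance_rescale=0.7,
                   ddim_steps=50, eta=1, random_seed=2023)
# mel has shape (1, 1, melshape, 100), un-normalized via scale_shift_re.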