Delete dreamvoice/src/.ipynb_checkpoints
- dreamvoice/src/.ipynb_checkpoints/extract_features-checkpoint.py +0 -103
- dreamvoice/src/.ipynb_checkpoints/plugin_wrapper-checkpoint.py +0 -76
- dreamvoice/src/.ipynb_checkpoints/train_plugin-checkpoint.py +0 -0
- dreamvoice/src/.ipynb_checkpoints/train_vc-checkpoint.py +0 -0
- dreamvoice/src/.ipynb_checkpoints/vc_wrapper-checkpoint.py +0 -144
dreamvoice/src/.ipynb_checkpoints/extract_features-checkpoint.py
DELETED
@@ -1,103 +0,0 @@

```python
import os
import torch
import librosa
import numpy as np
import soundfile as sf
import pandas as pd
# from feats.hubert_model import get_soft_model, get_hubert_soft_content
from feats.contentvec_hf import get_content_model, get_content
# from modules.speaker_encoder.encoder import inference as spk_encoder
# from pathlib import Path
from tqdm import tqdm
from multiprocessing import Process
import pyworld as pw


def resample_save(infolder, audio_path, model,
                  audio_sr=24000, content_sr=16000, min_length=1.92,
                  content_resolution=50,
                  save_path='features'):
    if os.path.isfile(save_path + '/' + 'audio_24k/' + audio_path) is False:
        audio, sr = librosa.load(infolder + audio_path, sr=content_sr)
        final_length = audio.shape[-1] // (content_sr / content_resolution) * (content_sr / content_resolution)
        # final_length = final_length / content_sr

        length = max(round(min_length * content_sr), round(final_length))
        assert length % 10 == 0
        audio = audio[:length]
        audio_save = np.zeros(length, dtype=audio.dtype)
        audio_save[:audio.shape[-1]] = audio[:audio.shape[-1]]

        # content = get_hubert_soft_content(model, torch.tensor(audio_save).unsqueeze(0))
        content = get_content(model, torch.tensor(audio_save).unsqueeze(0))
        content = content.cpu()
        os.makedirs(os.path.dirname(save_path + '/' + 'content/' + audio_path), exist_ok=True)
        torch.save(content, save_path + '/' + 'content/' + audio_path + '.pt')
        # print(audio_save.shape)
        # print(content.shape)
        os.makedirs(os.path.dirname(save_path + '/' + 'audio_16k/' + audio_path), exist_ok=True)
        sf.write(save_path + '/' + 'audio_16k/' + audio_path, audio_save, int(sr))
        # print(save_path + '/' + 'audio_16k/' + audio_path)

        audio, sr = librosa.load(infolder + audio_path, sr=audio_sr)
        length = max(round(min_length * audio_sr), round(final_length / content_sr * audio_sr))
        assert length % 10 == 0
        audio = audio[:length]
        audio_save = np.zeros(length, dtype=audio.dtype)
        audio_save[:audio.shape[-1]] = audio[:audio.shape[-1]]
        # print(audio_save.shape)
        os.makedirs(os.path.dirname(save_path + '/' + 'audio_24k/' + audio_path), exist_ok=True)
        sf.write(save_path + '/' + 'audio_24k/' + audio_path, audio_save, int(sr))


def extract_f0(in_folder, audio_path, save_path):
    audio, sr = librosa.load(in_folder + audio_path, sr=None)
    assert sr == 16000
    if os.path.isfile(save_path + '/' + 'f0/' + audio_path + '.pt') is False:
        # wav = audio
        # wav = np.pad(wav, int((1024-320)/2), mode='reflect')
        # f0_, _, _ = librosa.pyin(wav, frame_length=1024, hop_length=320, center=False, sr=sr,
        #                          fmin=librosa.note_to_hz('C2'),
        #                          fmax=librosa.note_to_hz('C6'))

        _f0, t = pw.dio(audio.astype(np.float64), sr, frame_period=320 / sr * 1000)
        f0 = pw.stonemask(audio.astype(np.float64), _f0, t, sr)[:-1]

        f0 = np.nan_to_num(f0)
        os.makedirs(os.path.dirname(save_path + '/' + 'f0/' + audio_path), exist_ok=True)
        # print(save_path + '/' + 'f0/' + audio_path + '.pt')
        torch.save(torch.tensor(f0), save_path + '/' + 'f0/' + audio_path + '.pt')


def chunks(arr, m):
    result = [[] for i in range(m)]
    for i in range(len(arr)):
        result[i % m].append(arr[i])
    return result


def extract_f0_main(in_folder, audio_paths, save_path):
    for audio_path in tqdm(audio_paths):
        extract_f0(in_folder, audio_path, save_path)


if __name__ == '__main__':
    df = pd.read_csv('../test_data/vc_meta.csv')
    # model = get_soft_model('../pre_ckpts/hubert_soft.pt').to('cuda')
    model = get_content_model().to('cuda')
    # # spk_encoder.load_model(Path('ckpts/spk_encoder/pretrained.pt'), device="cuda")
    for i in tqdm(range(len(df))):
        row = df.iloc[i]
        in_path = row['path']
        resample_save('../test_data/', in_path, model, save_path='../features/')

    in_folder = '../features/audio_16k/'
    audio_files = list(df['path'])
    save_path = '../features/'
    cores = 6

    subsets = chunks(audio_files, cores)

    for subset in subsets:
        t = Process(target=extract_f0_main, args=(in_folder, subset, save_path))
        t.start()
```
dreamvoice/src/.ipynb_checkpoints/plugin_wrapper-checkpoint.py
DELETED
@@ -1,76 +0,0 @@

```python
import yaml
import torch
from diffusers import DDIMScheduler
from .model.p2e_cross import P2E_Cross
from .utils import scale_shift, scale_shift_re, rescale_noise_cfg


class DreamVG(object):
    def __init__(self,
                 config_path='configs/plugin_cross.yaml',
                 ckpt_path='../ckpts/dreamvc_plugin.pt',
                 device='cpu'):

        with open(config_path, 'r') as fp:
            config = yaml.safe_load(fp)

        self.device = device
        self.model = P2E_Cross(config['model']).to(device)
        self.model.load_state_dict(torch.load(ckpt_path)['model'])
        self.model.eval()

        noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'],
                                        beta_start=config['scheduler']['beta_start'],
                                        beta_end=config['scheduler']['beta_end'],
                                        rescale_betas_zero_snr=True,
                                        timestep_spacing="trailing",
                                        clip_sample=False,
                                        prediction_type='v_prediction')
        self.noise_scheduler = noise_scheduler
        self.scale = config['scheduler']['scale']
        self.shift = config['scheduler']['shift']
        self.spk_shape = config['model']['unet']['in_channels']

    @torch.no_grad()
    def inference(self, text,
                  guidance_scale=5, guidance_rescale=0.7,
                  ddim_steps=50, eta=1, random_seed=2023,
                  ):
        text, text_mask = text
        self.model.eval()

        gen_shape = (1, self.spk_shape)

        if random_seed is not None:
            generator = torch.Generator(device=self.device).manual_seed(random_seed)
        else:
            generator = torch.Generator(device=self.device)
            generator.seed()

        self.noise_scheduler.set_timesteps(ddim_steps)

        # init noise
        noise = torch.randn(gen_shape, generator=generator, device=self.device)
        latents = noise

        for t in self.noise_scheduler.timesteps:
            latents = self.noise_scheduler.scale_model_input(latents, t)

            if guidance_scale:
                output_text = self.model(latents, t, text, text_mask, train_cfg=False)
                output_uncond = self.model(latents, t, text, text_mask, train_cfg=True, cfg_prob=1.0)

                output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
                if guidance_rescale > 0.0:
                    output_pred = rescale_noise_cfg(output_pred, output_text,
                                                    guidance_rescale=guidance_rescale)
            else:
                output_pred = self.model(latents, t, text, text_mask, train_cfg=False)

            latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents,
                                                eta=eta, generator=generator).prev_sample

        # pred = reverse_minmax_norm_diff(latents, vmin=0.0, vmax=0.5)
        pred = scale_shift_re(latents, 1 / self.scale, self.shift)
        # pred = torch.clip(pred, min=0.0, max=0.5)
        return pred
```
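The sampling loop above applies classifier-free guidance and then a guidance-rescale correction in the style of "Common Diffusion Noise Schedules and Sample Steps Are Flawed" (Lin et al.). A minimal sketch of what the imported `rescale_noise_cfg` utility presumably computes, assuming the standard diffusers formulation:

```python
import torch

def rescale_noise_cfg(noise_cfg: torch.Tensor,
                      noise_pred_text: torch.Tensor,
                      guidance_rescale: float = 0.0) -> torch.Tensor:
    # Match the std of the guided output to the std of the text-conditional
    # prediction, then blend; this counteracts over-saturation at high
    # guidance scales.
    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
    rescaled = noise_cfg * (std_text / std_cfg)
    return guidance_rescale * rescaled + (1 - guidance_rescale) * noise_cfg
```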
dreamvoice/src/.ipynb_checkpoints/train_plugin-checkpoint.py
DELETED
File without changes
dreamvoice/src/.ipynb_checkpoints/train_vc-checkpoint.py
DELETED
File without changes
dreamvoice/src/.ipynb_checkpoints/vc_wrapper-checkpoint.py
DELETED
@@ -1,144 +0,0 @@

```python
import yaml
import torch
from diffusers import DDIMScheduler
from .model.model import DiffVC
from .model.model_cross import DiffVC_Cross
from .utils import scale_shift, scale_shift_re, rescale_noise_cfg


class ReDiffVC(object):
    def __init__(self,
                 config_path='configs/diffvc_base.yaml',
                 ckpt_path='../ckpts/dreamvc_base.pt',
                 device='cpu'):

        with open(config_path, 'r') as fp:
            config = yaml.safe_load(fp)

        self.device = device
        self.model = DiffVC(config['model']).to(device)
        self.model.load_state_dict(torch.load(ckpt_path)['model'])
        self.model.eval()

        noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'],
                                        beta_start=config['scheduler']['beta_start'],
                                        beta_end=config['scheduler']['beta_end'],
                                        rescale_betas_zero_snr=True,
                                        timestep_spacing="trailing",
                                        clip_sample=False,
                                        prediction_type='v_prediction')
        self.noise_scheduler = noise_scheduler
        self.scale = config['scheduler']['scale']
        self.shift = config['scheduler']['shift']
        self.melshape = config['model']['unet']['sample_size'][0]

    @torch.no_grad()
    def inference(self,
                  spk_embed, content_clip, f0_clip=None,
                  guidance_scale=3, guidance_rescale=0.7,
                  ddim_steps=50, eta=1, random_seed=2023):

        self.model.eval()
        if random_seed is not None:
            generator = torch.Generator(device=self.device).manual_seed(random_seed)
        else:
            generator = torch.Generator(device=self.device)
            generator.seed()

        self.noise_scheduler.set_timesteps(ddim_steps)

        # init noise
        gen_shape = (1, 1, self.melshape, content_clip.shape[-2])
        noise = torch.randn(gen_shape, generator=generator, device=self.device)
        latents = noise

        for t in self.noise_scheduler.timesteps:
            latents = self.noise_scheduler.scale_model_input(latents, t)

            if guidance_scale:
                output_text = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=False)
                output_uncond = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=True,
                                           speaker_cfg=1.0, pitch_cfg=0.0)

                output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
                if guidance_rescale > 0.0:
                    output_pred = rescale_noise_cfg(output_pred, output_text,
                                                    guidance_rescale=guidance_rescale)
            else:
                output_pred = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=False)

            latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents,
                                                eta=eta, generator=generator).prev_sample

        pred = scale_shift_re(latents, scale=1 / self.scale, shift=self.shift)
        return pred


class DreamVC(object):
    def __init__(self,
                 config_path='configs/diffvc_cross.yaml',
                 ckpt_path='../ckpts/dreamvc_cross.pt',
                 device='cpu'):

        with open(config_path, 'r') as fp:
            config = yaml.safe_load(fp)

        self.device = device
        self.model = DiffVC_Cross(config['model']).to(device)
        self.model.load_state_dict(torch.load(ckpt_path)['model'])
        self.model.eval()

        noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'],
                                        beta_start=config['scheduler']['beta_start'],
                                        beta_end=config['scheduler']['beta_end'],
                                        rescale_betas_zero_snr=True,
                                        timestep_spacing="trailing",
                                        clip_sample=False,
                                        prediction_type='v_prediction')
        self.noise_scheduler = noise_scheduler
        self.scale = config['scheduler']['scale']
        self.shift = config['scheduler']['shift']
        self.melshape = config['model']['unet']['sample_size'][0]

    @torch.no_grad()
    def inference(self,
                  text, content_clip, f0_clip=None,
                  guidance_scale=3, guidance_rescale=0.7,
                  ddim_steps=50, eta=1, random_seed=2023):

        text, text_mask = text
        self.model.eval()
        if random_seed is not None:
            generator = torch.Generator(device=self.device).manual_seed(random_seed)
        else:
            generator = torch.Generator(device=self.device)
            generator.seed()

        self.noise_scheduler.set_timesteps(ddim_steps)

        # init noise
        gen_shape = (1, 1, self.melshape, content_clip.shape[-2])
        noise = torch.randn(gen_shape, generator=generator, device=self.device)
        latents = noise

        for t in self.noise_scheduler.timesteps:
            latents = self.noise_scheduler.scale_model_input(latents, t)

            if guidance_scale:
                output_text = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=False)
                output_uncond = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=True,
                                           speaker_cfg=1.0, pitch_cfg=0.0)

                output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
                if guidance_rescale > 0.0:
                    output_pred = rescale_noise_cfg(output_pred, output_text,
                                                    guidance_rescale=guidance_rescale)
            else:
                output_pred = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=False)

            latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents,
                                                eta=eta, generator=generator).prev_sample

        pred = scale_shift_re(latents, scale=1 / self.scale, shift=self.shift)
        return pred
```
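For orientation, the call pattern these wrappers appear to expect is: a prompt embedding plus mask for `text`, a content-feature clip whose second-to-last axis is the frame axis, and an optional F0 clip; `inference` then returns a mel spectrogram for a vocoder. A hypothetical sketch, where the import path, tensor shapes, and feature dimensions are assumptions rather than confirmed repo API:

```python
import torch
# from dreamvoice.src.vc_wrapper import DreamVC  # assumed import path

# dreamvc = DreamVC(config_path='configs/diffvc_cross.yaml',
#                   ckpt_path='../ckpts/dreamvc_cross.pt', device='cuda')

# Stand-in inputs: a prompt embedding with its padding mask, and a content
# clip of shape (1, frames, content_dim); all dimensions are illustrative.
text = torch.randn(1, 32, 768)
text_mask = torch.ones(1, 32, dtype=torch.long)
content_clip = torch.randn(1, 100, 768)

# gen_shape in inference becomes (1, 1, n_mels, content_clip.shape[-2]),
# so the number of generated mel frames follows the content clip.
# pred_mel = dreamvc.inference((text, text_mask), content_clip,
#                              guidance_scale=3, ddim_steps=50)
```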