None1145 committed on
Commit
f5d069b
·
verified ·
1 Parent(s): b37d2bb

Delete preprocess_hubert_f0.py

Browse files
Files changed (1) hide show
  1. preprocess_hubert_f0.py +0 -172
preprocess_hubert_f0.py DELETED
@@ -1,172 +0,0 @@
1
- import argparse
2
- import logging
3
- import os
4
- import random
5
- from concurrent.futures import ProcessPoolExecutor
6
- from glob import glob
7
- from random import shuffle
8
-
9
- import librosa
10
- import numpy as np
11
- import torch
12
- import torch.multiprocessing as mp
13
- from loguru import logger
14
- from tqdm import tqdm
15
-
16
- import diffusion.logger.utils as du
17
- import utils
18
- from diffusion.vocoder import Vocoder
19
- from modules.mel_processing import spectrogram_torch
20
-
21
# Raise the level of the numba and matplotlib loggers so that only
# WARNING and above get through.
logging.getLogger("numba").setLevel(logging.WARNING)
logging.getLogger("matplotlib").setLevel(logging.WARNING)

# Both configuration files are read once, at import time, from paths
# relative to the current working directory.
hps = utils.get_hparams_from_file("configs/config.json")
dconfig = du.load_config("configs/diffusion.yaml")

# Values shared by every function in this script.
sampling_rate, hop_length = hps.data.sampling_rate, hps.data.hop_length
speech_encoder = hps["model"]["speech_encoder"]
29
-
30
-
31
def process_one(filename, hmodel, f0p, device, diff=False, mel_extractor=None):
    """Extract and cache the per-file features for one audio file.

    Every feature is written next to the input file and is skipped when its
    output file already exists, so the function can be re-run on a partially
    processed dataset.

    Files written:

    * ``<filename>.soft.pt``: output of ``hmodel.encoder`` on the audio
      resampled to 16 kHz.
    * ``<filename>.f0.npy``: the ``(f0, uv)`` pair returned by the selected
      F0 predictor's ``compute_f0_uv``.
    * ``filename`` with ``.wav`` replaced by ``.spec.pt``: spectrogram from
      ``spectrogram_torch``.
    * ``<filename>.vol.npy``: volume, only when ``diff`` is true or
      ``hps.model.vol_embedding`` is set.
    * ``<filename>.mel.npy``, ``<filename>.aug_mel.npy``,
      ``<filename>.aug_vol.npy``: only when ``diff`` is true. The two mel
      files additionally require ``mel_extractor``.

    Args:
        filename: Path of the audio file to process.
        hmodel: Speech encoder; only its ``encoder(wav16k)`` method is used.
        f0p: F0 predictor name, forwarded to ``utils.get_f0_predictor``.
        device: Device the 16 kHz audio (and the audio for the plain mel) is
            moved to before being handed to the models.
        diff: Whether to also produce the diffusion-model features.
        mel_extractor: Object exposing ``extract(audio, sampling_rate,
            keyshift=...)``. When ``None`` the mel outputs are skipped.

    Raises:
        ValueError: If the sample rate returned by ``librosa.load`` differs
            from ``hps.data.sampling_rate`` while the spectrogram is built.
    """
    wav, sr = librosa.load(filename, sr=sampling_rate)
    audio_norm = torch.FloatTensor(wav).unsqueeze(0)

    soft_path = filename + ".soft.pt"
    if not os.path.exists(soft_path):
        wav16k = librosa.resample(wav, orig_sr=sampling_rate, target_sr=16000)
        wav16k = torch.from_numpy(wav16k).to(device)
        c = hmodel.encoder(wav16k)
        torch.save(c.cpu(), soft_path)

    f0_path = filename + ".f0.npy"
    if not os.path.exists(f0_path):
        f0_predictor = utils.get_f0_predictor(
            f0p,
            sampling_rate=sampling_rate,
            hop_length=hop_length,
            device=None,
            threshold=0.05,
        )
        f0, uv = f0_predictor.compute_f0_uv(wav)
        np.save(f0_path, np.asanyarray((f0, uv), dtype=object))

    spec_path = filename.replace(".wav", ".spec.pt")
    if not os.path.exists(spec_path):
        # The waveform comes straight from librosa.load above; no division by
        # hps.data.max_wav_value is applied before the spectrogram is taken.
        if sr != hps.data.sampling_rate:
            raise ValueError(
                "{} SR doesn't match target {} SR".format(
                    sr, hps.data.sampling_rate
                )
            )

        spec = spectrogram_torch(
            audio_norm,
            hps.data.filter_length,
            hps.data.sampling_rate,
            hps.data.hop_length,
            hps.data.win_length,
            center=False,
        )
        spec = torch.squeeze(spec, 0)
        torch.save(spec, spec_path)

    if diff or hps.model.vol_embedding:
        volume_path = filename + ".vol.npy"
        # Created even when the volume file exists: the augmentation step
        # below reuses the same extractor.
        volume_extractor = utils.Volume_Extractor(hop_length)
        if not os.path.exists(volume_path):
            volume = volume_extractor.extract(audio_norm)
            np.save(volume_path, volume.to('cpu').numpy())

    if diff:
        mel_path = filename + ".mel.npy"
        if mel_extractor is not None and not os.path.exists(mel_path):
            mel_t = mel_extractor.extract(audio_norm.to(device), sampling_rate)
            mel = mel_t.squeeze().to('cpu').numpy()
            np.save(mel_path, mel)

        aug_mel_path = filename + ".aug_mel.npy"
        aug_vol_path = filename + ".aug_vol.npy"
        # Without a mel extractor there is nothing to store for the augmented
        # mel; previously this case hit an unassigned variable.
        need_aug_mel = mel_extractor is not None and not os.path.exists(aug_mel_path)
        need_aug_vol = not os.path.exists(aug_vol_path)

        # Only draw the random augmentation and run the extractors when at
        # least one of the two outputs is actually missing.
        if need_aug_mel or need_aug_vol:
            max_amp = float(torch.max(torch.abs(audio_norm))) + 1e-5
            # Cap the gain so the scaled peak stays at or below 1.0, and never
            # boost by more than one decade.
            max_shift = min(1, np.log10(1 / max_amp))
            log10_vol_shift = random.uniform(-1, max_shift)
            keyshift = random.uniform(-5, 5)
            gain = 10 ** log10_vol_shift
            # NOTE(review): both outputs use the same random gain; if only one
            # of the two files is missing, the regenerated one will not match
            # the gain of the file already on disk. Confirm whether consumers
            # rely on the pair being consistent.
            if need_aug_mel:
                aug_mel_t = mel_extractor.extract(
                    audio_norm * gain, sampling_rate, keyshift=keyshift
                )
                aug_mel = aug_mel_t.squeeze().to('cpu').numpy()
                np.save(aug_mel_path, np.asanyarray((aug_mel, keyshift), dtype=object))
            if need_aug_vol:
                aug_vol = volume_extractor.extract(audio_norm * gain)
                np.save(aug_vol_path, aug_vol.to('cpu').numpy())
104
-
105
-
106
def process_batch(file_chunk, f0p, diff=False, mel_extractor=None, device="cpu"):
    """Worker entry point: load a speech encoder and process one chunk of files.

    The worker's rank is taken from the current process identity (0 when the
    identity is empty). When CUDA is available the ``device`` argument is
    replaced by ``cuda:<rank % device_count>``; otherwise ``device`` is used
    as given.

    Args:
        file_chunk: Iterable of audio file paths handled by this worker.
        f0p: F0 predictor name, forwarded to ``process_one``.
        diff: Whether to also produce the diffusion-model features.
        mel_extractor: Mel extractor forwarded to ``process_one`` (may be
            ``None``).
        device: Device used when CUDA is not available.
    """
    logger.info("Loading speech encoder for content...")

    worker_identity = mp.current_process()._identity
    rank = worker_identity[0] if worker_identity else 0

    if torch.cuda.is_available():
        # Spread workers round-robin over the visible GPUs.
        gpu_id = rank % torch.cuda.device_count()
        device = torch.device(f"cuda:{gpu_id}")
    logger.info(f"Rank {rank} uses device {device}")

    hmodel = utils.get_speech_encoder(speech_encoder, device=device)
    logger.info(f"Loaded speech encoder for rank {rank}")

    # One progress bar per worker, placed on the row matching its rank.
    for path in tqdm(file_chunk, position=rank):
        process_one(path, hmodel, f0p, device, diff, mel_extractor)
118
-
119
def parallel_process(filenames, num_processes, f0p, diff, mel_extractor, device):
    """Split ``filenames`` into ``num_processes`` contiguous chunks and process them in parallel.

    One ``process_batch`` job is submitted per chunk to a
    ``ProcessPoolExecutor``; the function then waits for every job in
    submission order, so an exception raised inside a worker is re-raised
    here.

    Args:
        filenames: Sequence of audio file paths.
        num_processes: Number of worker processes and of chunks.
        f0p: F0 predictor name, forwarded to each worker.
        diff: Whether to also produce the diffusion-model features.
        mel_extractor: Mel extractor forwarded to each worker (may be ``None``).
        device: Device forwarded to each worker.
    """
    with ProcessPoolExecutor(max_workers=num_processes) as executor:
        total = len(filenames)
        # Chunk k covers [k * total / n, (k + 1) * total / n), truncated to int.
        bounds = [
            (int(k * total / num_processes), int((k + 1) * total / num_processes))
            for k in range(num_processes)
        ]
        futures = [
            executor.submit(
                process_batch, filenames[lo:hi], f0p, diff, mel_extractor, device=device
            )
            for lo, hi in bounds
        ]
        for future in tqdm(futures, position=0):
            future.result()
129
-
130
if __name__ == "__main__":
    # Command-line entry point: parse options, optionally load the mel
    # extractor, collect the input files and hand them to the worker pool.
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--device', type=str, default=None)
    parser.add_argument(
        "--in_dir", type=str, default="dataset/44k", help="path to input dir"
    )
    parser.add_argument(
        '--use_diff', action='store_true', help='Whether to use the diffusion model'
    )
    parser.add_argument(
        '--f0_predictor',
        type=str,
        default="rmvpe",
        # The help text used to advertise "pm" as the default, which did not
        # match the actual default above.
        help='Select F0 predictor, can select crepe,pm,dio,harvest,rmvpe,fcpe|default: rmvpe(note: crepe is original F0 using mean filter)',
    )
    parser.add_argument(
        '--num_processes', type=int, default=1, help='You are advised to set the number of processes to the same as the number of CPU cores'
    )
    args = parser.parse_args()
    f0p = args.f0_predictor
    device = args.device
    if device is None:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print(speech_encoder)
    logger.info("Using device: " + str(device))
    logger.info("Using SpeechEncoder: " + speech_encoder)
    logger.info("Using extractor: " + f0p)
    logger.info("Using diff Mode: " + str(args.use_diff))

    if args.use_diff:
        print("use_diff")
        print("Loading Mel Extractor...")
        mel_extractor = Vocoder(dconfig.vocoder.type, dconfig.vocoder.ckpt, device=device)
        print("Loaded Mel Extractor.")
    else:
        mel_extractor = None

    # Input layout is <in_dir>/<subdirectory>/<file>.wav.
    filenames = glob(f"{args.in_dir}/*/*.wav", recursive=True)
    shuffle(filenames)
    mp.set_start_method("spawn", force=True)

    num_processes = args.num_processes
    if num_processes == 0:
        # 0 means "use every core". os.cpu_count() may return None when the
        # count cannot be determined, so fall back to a single process.
        num_processes = os.cpu_count() or 1

    parallel_process(filenames, num_processes, f0p, args.use_diff, mel_extractor, device)